Diffstat (limited to 'llvm/lib/Target')
246 files changed, 7178 insertions, 1735 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td index f092c039b58e..b332e9dcb176 100644 --- a/llvm/lib/Target/AArch64/AArch64.td +++ b/llvm/lib/Target/AArch64/AArch64.td @@ -650,6 +650,7 @@ include "AArch64SchedA64FX.td" include "AArch64SchedThunderX3T110.td" include "AArch64SchedTSV110.td" include "AArch64SchedAmpere1.td" +include "AArch64SchedNeoverseN2.td" def TuneA35 : SubtargetFeature<"a35", "ARMProcFamily", "CortexA35", "Cortex-A35 ARM processors">; @@ -1137,7 +1138,7 @@ def : ProcessorModel<"cortex-a78", CortexA57Model, ProcessorFeatures.A78, [TuneA78]>; def : ProcessorModel<"cortex-a78c", CortexA57Model, ProcessorFeatures.A78C, [TuneA78C]>; -def : ProcessorModel<"cortex-a710", CortexA57Model, ProcessorFeatures.A710, +def : ProcessorModel<"cortex-a710", NeoverseN2Model, ProcessorFeatures.A710, [TuneA710]>; def : ProcessorModel<"cortex-r82", CortexA55Model, ProcessorFeatures.R82, [TuneR82]>; @@ -1145,17 +1146,17 @@ def : ProcessorModel<"cortex-x1", CortexA57Model, ProcessorFeatures.X1, [TuneX1]>; def : ProcessorModel<"cortex-x1c", CortexA57Model, ProcessorFeatures.X1C, [TuneX1]>; -def : ProcessorModel<"cortex-x2", CortexA57Model, ProcessorFeatures.X2, +def : ProcessorModel<"cortex-x2", NeoverseN2Model, ProcessorFeatures.X2, [TuneX2]>; def : ProcessorModel<"neoverse-e1", CortexA53Model, ProcessorFeatures.NeoverseE1, [TuneNeoverseE1]>; def : ProcessorModel<"neoverse-n1", CortexA57Model, ProcessorFeatures.NeoverseN1, [TuneNeoverseN1]>; -def : ProcessorModel<"neoverse-n2", CortexA57Model, +def : ProcessorModel<"neoverse-n2", NeoverseN2Model, ProcessorFeatures.NeoverseN2, [TuneNeoverseN2]>; -def : ProcessorModel<"neoverse-512tvb", CortexA57Model, +def : ProcessorModel<"neoverse-512tvb", NeoverseN2Model, ProcessorFeatures.Neoverse512TVB, [TuneNeoverse512TVB]>; -def : ProcessorModel<"neoverse-v1", CortexA57Model, +def : ProcessorModel<"neoverse-v1", NeoverseN2Model, ProcessorFeatures.NeoverseV1, [TuneNeoverseV1]>; def : ProcessorModel<"exynos-m3", ExynosM3Model, ProcessorFeatures.ExynosM3, [TuneExynosM3]>; diff --git a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp index ef4860979dd3..c568f73471e1 100644 --- a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp +++ b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp @@ -1173,6 +1173,8 @@ void AArch64AsmPrinter::emitFMov0(const MachineInstr &MI) { #include "AArch64GenMCPseudoLowering.inc" void AArch64AsmPrinter::emitInstruction(const MachineInstr *MI) { + AArch64_MC::verifyInstructionPredicates(MI->getOpcode(), STI->getFeatureBits()); + // Do any auto-generated pseudo lowerings. if (emitPseudoExpansionLowering(*OutStreamer, MI)) return; diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index abfe2d507111..447ad10ddf22 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -237,6 +237,39 @@ static bool isMergePassthruOpcode(unsigned Opc) { } } +// Returns true if inactive lanes are known to be zeroed by construction. +static bool isZeroingInactiveLanes(SDValue Op) { + switch (Op.getOpcode()) { + default: + // We guarantee i1 splat_vectors to zero the other lanes by + // implementing it with ptrue and possibly a punpklo for nxv1i1. 
+ if (ISD::isConstantSplatVectorAllOnes(Op.getNode())) + return true; + return false; + case AArch64ISD::PTRUE: + case AArch64ISD::SETCC_MERGE_ZERO: + return true; + case ISD::INTRINSIC_WO_CHAIN: + switch (Op.getConstantOperandVal(0)) { + default: + return false; + case Intrinsic::aarch64_sve_ptrue: + case Intrinsic::aarch64_sve_pnext: + case Intrinsic::aarch64_sve_cmpeq_wide: + case Intrinsic::aarch64_sve_cmpne_wide: + case Intrinsic::aarch64_sve_cmpge_wide: + case Intrinsic::aarch64_sve_cmpgt_wide: + case Intrinsic::aarch64_sve_cmplt_wide: + case Intrinsic::aarch64_sve_cmple_wide: + case Intrinsic::aarch64_sve_cmphs_wide: + case Intrinsic::aarch64_sve_cmphi_wide: + case Intrinsic::aarch64_sve_cmplo_wide: + case Intrinsic::aarch64_sve_cmpls_wide: + return true; + } + } +} + AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, const AArch64Subtarget &STI) : TargetLowering(TM), Subtarget(&STI) { @@ -1082,6 +1115,16 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); } + // FIXME: Move lowering for more nodes here if those are common between + // SVE and SME. + if (Subtarget->hasSVE() || Subtarget->hasSME()) { + for (auto VT : + {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) { + setOperationAction(ISD::SPLAT_VECTOR, VT, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); + } + } + if (Subtarget->hasSVE()) { for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64}) { setOperationAction(ISD::BITREVERSE, VT, Custom); @@ -1162,14 +1205,12 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); setOperationAction(ISD::SELECT, VT, Custom); setOperationAction(ISD::SETCC, VT, Custom); - setOperationAction(ISD::SPLAT_VECTOR, VT, Custom); setOperationAction(ISD::TRUNCATE, VT, Custom); setOperationAction(ISD::VECREDUCE_AND, VT, Custom); setOperationAction(ISD::VECREDUCE_OR, VT, Custom); setOperationAction(ISD::VECREDUCE_XOR, VT, Custom); setOperationAction(ISD::SELECT_CC, VT, Expand); - setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); @@ -2429,6 +2470,23 @@ AArch64TargetLowering::EmitZero(MachineInstr &MI, MachineBasicBlock *BB) const { return BB; } +MachineBasicBlock * +AArch64TargetLowering::EmitAddVectorToTile(unsigned Opc, unsigned BaseReg, + MachineInstr &MI, + MachineBasicBlock *BB) const { + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc)); + + MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define); + MIB.addReg(BaseReg + MI.getOperand(0).getImm()); + MIB.add(MI.getOperand(1)); // pn + MIB.add(MI.getOperand(2)); // pm + MIB.add(MI.getOperand(3)); // zn + + MI.eraseFromParent(); // The pseudo is gone now. 
+ return BB; +} + MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter( MachineInstr &MI, MachineBasicBlock *BB) const { switch (MI.getOpcode()) { @@ -2561,6 +2619,14 @@ MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter( BB); case AArch64::ZERO_M_PSEUDO: return EmitZero(MI, BB); + case AArch64::ADDHA_MPPZ_PSEUDO_S: + return EmitAddVectorToTile(AArch64::ADDHA_MPPZ_S, AArch64::ZAS0, MI, BB); + case AArch64::ADDVA_MPPZ_PSEUDO_S: + return EmitAddVectorToTile(AArch64::ADDVA_MPPZ_S, AArch64::ZAS0, MI, BB); + case AArch64::ADDHA_MPPZ_PSEUDO_D: + return EmitAddVectorToTile(AArch64::ADDHA_MPPZ_D, AArch64::ZAD0, MI, BB); + case AArch64::ADDVA_MPPZ_PSEUDO_D: + return EmitAddVectorToTile(AArch64::ADDVA_MPPZ_D, AArch64::ZAD0, MI, BB); } } @@ -4329,55 +4395,49 @@ SDValue AArch64TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const { static inline SDValue getPTrue(SelectionDAG &DAG, SDLoc DL, EVT VT, int Pattern) { + if (VT == MVT::nxv1i1 && Pattern == AArch64SVEPredPattern::all) + return DAG.getConstant(1, DL, MVT::nxv1i1); return DAG.getNode(AArch64ISD::PTRUE, DL, VT, DAG.getTargetConstant(Pattern, DL, MVT::i32)); } -static SDValue lowerConvertToSVBool(SDValue Op, SelectionDAG &DAG) { +// Returns a safe bitcast between two scalable vector predicates, where +// any newly created lanes from a widening bitcast are defined as zero. +static SDValue getSVEPredicateBitCast(EVT VT, SDValue Op, SelectionDAG &DAG) { SDLoc DL(Op); - EVT OutVT = Op.getValueType(); - SDValue InOp = Op.getOperand(1); - EVT InVT = InOp.getValueType(); + EVT InVT = Op.getValueType(); + + assert(InVT.getVectorElementType() == MVT::i1 && + VT.getVectorElementType() == MVT::i1 && + "Expected a predicate-to-predicate bitcast"); + assert(VT.isScalableVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) && + InVT.isScalableVector() && + DAG.getTargetLoweringInfo().isTypeLegal(InVT) && + "Only expect to cast between legal scalable predicate types!"); // Return the operand if the cast isn't changing type, - // i.e. <n x 16 x i1> -> <n x 16 x i1> - if (InVT == OutVT) - return InOp; + // e.g. <n x 16 x i1> -> <n x 16 x i1> + if (InVT == VT) + return Op; - SDValue Reinterpret = - DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, OutVT, InOp); + SDValue Reinterpret = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Op); - // If the argument converted to an svbool is a ptrue or a comparison, the - // lanes introduced by the widening are zero by construction. - switch (InOp.getOpcode()) { - case AArch64ISD::SETCC_MERGE_ZERO: + // We only have to zero the lanes if new lanes are being defined, e.g. when + // casting from <vscale x 2 x i1> to <vscale x 16 x i1>. If this is not the + // case (e.g. when casting from <vscale x 16 x i1> -> <vscale x 2 x i1>) then + // we can return here. 
+ if (InVT.bitsGT(VT)) return Reinterpret; - case ISD::INTRINSIC_WO_CHAIN: - switch (InOp.getConstantOperandVal(0)) { - case Intrinsic::aarch64_sve_ptrue: - case Intrinsic::aarch64_sve_cmpeq_wide: - case Intrinsic::aarch64_sve_cmpne_wide: - case Intrinsic::aarch64_sve_cmpge_wide: - case Intrinsic::aarch64_sve_cmpgt_wide: - case Intrinsic::aarch64_sve_cmplt_wide: - case Intrinsic::aarch64_sve_cmple_wide: - case Intrinsic::aarch64_sve_cmphs_wide: - case Intrinsic::aarch64_sve_cmphi_wide: - case Intrinsic::aarch64_sve_cmplo_wide: - case Intrinsic::aarch64_sve_cmpls_wide: - return Reinterpret; - } - } - // Splat vectors of one will generate ptrue instructions - if (ISD::isConstantSplatVectorAllOnes(InOp.getNode())) + // Check if the other lanes are already known to be zeroed by + // construction. + if (isZeroingInactiveLanes(Op)) return Reinterpret; - // Otherwise, zero the newly introduced lanes. - SDValue Mask = getPTrue(DAG, DL, InVT, AArch64SVEPredPattern::all); - SDValue MaskReinterpret = - DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, OutVT, Mask); - return DAG.getNode(ISD::AND, DL, OutVT, Reinterpret, MaskReinterpret); + // Zero the newly introduced lanes. + SDValue Mask = DAG.getConstant(1, DL, InVT); + Mask = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Mask); + return DAG.getNode(ISD::AND, DL, VT, Reinterpret, Mask); } SDValue AArch64TargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, @@ -4546,10 +4606,9 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::aarch64_sve_dupq_lane: return LowerDUPQLane(Op, DAG); case Intrinsic::aarch64_sve_convert_from_svbool: - return DAG.getNode(AArch64ISD::REINTERPRET_CAST, dl, Op.getValueType(), - Op.getOperand(1)); + return getSVEPredicateBitCast(Op.getValueType(), Op.getOperand(1), DAG); case Intrinsic::aarch64_sve_convert_to_svbool: - return lowerConvertToSVBool(Op, DAG); + return getSVEPredicateBitCast(MVT::nxv16i1, Op.getOperand(1), DAG); case Intrinsic::aarch64_sve_fneg: return DAG.getNode(AArch64ISD::FNEG_MERGE_PASSTHRU, dl, Op.getValueType(), Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); @@ -6393,9 +6452,8 @@ static bool checkZExtBool(SDValue Arg, const SelectionDAG &DAG) { if (SizeInBits < 8) return false; - APInt LowBits(SizeInBits, 0xFF); APInt RequredZero(SizeInBits, 0xFE); - KnownBits Bits = DAG.computeKnownBits(Arg, LowBits, 4); + KnownBits Bits = DAG.computeKnownBits(Arg, 4); bool ZExtBool = (Bits.Zero & RequredZero) == RequredZero; return ZExtBool; } @@ -14814,16 +14872,6 @@ static SDValue performANDCombine(SDNode *N, if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) return SDValue(); - // Although NEON has no EORV instruction, when only the least significant bit - // is required the operation is synonymous with ADDV. - if (LHS.getOpcode() == ISD::VECREDUCE_XOR && isOneConstant(RHS) && - LHS.getOperand(0).getValueType().isFixedLengthVector() && - LHS.hasOneUse()) { - SDLoc DL(N); - SDValue ADDV = DAG.getNode(ISD::VECREDUCE_ADD, DL, VT, LHS.getOperand(0)); - return DAG.getNode(ISD::AND, DL, VT, ADDV, RHS); - } - if (VT.isScalableVector()) return performSVEAndCombine(N, DCI); @@ -16126,12 +16174,24 @@ static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op, assert(Op.getValueType().isScalableVector() && TLI.isTypeLegal(Op.getValueType()) && "Expected legal scalable vector type!"); + assert(Op.getValueType() == Pg.getValueType() && + "Expected same type for PTEST operands"); // Ensure target specific opcodes are using legal type. 
EVT OutVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); SDValue TVal = DAG.getConstant(1, DL, OutVT); SDValue FVal = DAG.getConstant(0, DL, OutVT); + // Ensure operands have type nxv16i1. + if (Op.getValueType() != MVT::nxv16i1) { + if ((Cond == AArch64CC::ANY_ACTIVE || Cond == AArch64CC::NONE_ACTIVE) && + isZeroingInactiveLanes(Op)) + Pg = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv16i1, Pg); + else + Pg = getSVEPredicateBitCast(MVT::nxv16i1, Pg, DAG); + Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv16i1, Op); + } + // Set condition code (CC) flags. SDValue Test = DAG.getNode(AArch64ISD::PTEST, DL, MVT::Other, Pg, Op); @@ -18026,6 +18086,54 @@ static SDValue performCSELCombine(SDNode *N, return performCONDCombine(N, DCI, DAG, 2, 3); } +// Try to re-use an already extended operand of a vector SetCC feeding a +// extended select. Doing so avoids requiring another full extension of the +// SET_CC result when lowering the select. +static SDValue tryToWidenSetCCOperands(SDNode *Op, SelectionDAG &DAG) { + EVT Op0MVT = Op->getOperand(0).getValueType(); + if (!Op0MVT.isVector() || Op->use_empty()) + return SDValue(); + + // Make sure that all uses of Op are VSELECTs with result matching types where + // the result type has a larger element type than the SetCC operand. + SDNode *FirstUse = *Op->use_begin(); + if (FirstUse->getOpcode() != ISD::VSELECT) + return SDValue(); + EVT UseMVT = FirstUse->getValueType(0); + if (UseMVT.getScalarSizeInBits() <= Op0MVT.getScalarSizeInBits()) + return SDValue(); + if (any_of(Op->uses(), [&UseMVT](const SDNode *N) { + return N->getOpcode() != ISD::VSELECT || N->getValueType(0) != UseMVT; + })) + return SDValue(); + + APInt V; + if (!ISD::isConstantSplatVector(Op->getOperand(1).getNode(), V)) + return SDValue(); + + SDLoc DL(Op); + SDValue Op0ExtV; + SDValue Op1ExtV; + ISD::CondCode CC = cast<CondCodeSDNode>(Op->getOperand(2))->get(); + // Check if the first operand of the SET_CC is already extended. If it is, + // split the SET_CC and re-use the extended version of the operand. 
+ SDNode *Op0SExt = DAG.getNodeIfExists(ISD::SIGN_EXTEND, DAG.getVTList(UseMVT), + Op->getOperand(0)); + SDNode *Op0ZExt = DAG.getNodeIfExists(ISD::ZERO_EXTEND, DAG.getVTList(UseMVT), + Op->getOperand(0)); + if (Op0SExt && (isSignedIntSetCC(CC) || isIntEqualitySetCC(CC))) { + Op0ExtV = SDValue(Op0SExt, 0); + Op1ExtV = DAG.getNode(ISD::SIGN_EXTEND, DL, UseMVT, Op->getOperand(1)); + } else if (Op0ZExt && (isUnsignedIntSetCC(CC) || isIntEqualitySetCC(CC))) { + Op0ExtV = SDValue(Op0ZExt, 0); + Op1ExtV = DAG.getNode(ISD::ZERO_EXTEND, DL, UseMVT, Op->getOperand(1)); + } else + return SDValue(); + + return DAG.getNode(ISD::SETCC, DL, UseMVT.changeVectorElementType(MVT::i1), + Op0ExtV, Op1ExtV, Op->getOperand(2)); +} + static SDValue performSETCCCombine(SDNode *N, SelectionDAG &DAG) { assert(N->getOpcode() == ISD::SETCC && "Unexpected opcode!"); SDValue LHS = N->getOperand(0); @@ -18034,6 +18142,9 @@ static SDValue performSETCCCombine(SDNode *N, SelectionDAG &DAG) { SDLoc DL(N); EVT VT = N->getValueType(0); + if (SDValue V = tryToWidenSetCCOperands(N, DAG)) + return V; + // setcc (csel 0, 1, cond, X), 1, ne ==> csel 0, 1, !cond, X if (Cond == ISD::SETNE && isOneConstant(RHS) && LHS->getOpcode() == AArch64ISD::CSEL && @@ -21045,7 +21156,7 @@ SDValue AArch64TargetLowering::LowerPredReductionToSVE(SDValue ReduceOp, default: return SDValue(); case ISD::VECREDUCE_OR: - if (isAllActivePredicate(DAG, Pg)) + if (isAllActivePredicate(DAG, Pg) && OpVT == MVT::nxv16i1) // The predicate can be 'Op' because // vecreduce_or(Op & <all true>) <=> vecreduce_or(Op). return getPTest(DAG, VT, Op, Op, AArch64CC::ANY_ACTIVE); @@ -21058,6 +21169,11 @@ SDValue AArch64TargetLowering::LowerPredReductionToSVE(SDValue ReduceOp, case ISD::VECREDUCE_XOR: { SDValue ID = DAG.getTargetConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64); + if (OpVT == MVT::nxv1i1) { + // Emulate a CNTP on .Q using .D and a different governing predicate. 
+ Pg = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv2i1, Pg); + Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv2i1, Op); + } SDValue Cntp = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64, ID, Pg, Op); return DAG.getAnyExtOrTrunc(Cntp, DL, VT); @@ -21464,22 +21580,17 @@ SDValue AArch64TargetLowering::getSVESafeBitCast(EVT VT, SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); EVT InVT = Op.getValueType(); - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - (void)TLI; - assert(VT.isScalableVector() && TLI.isTypeLegal(VT) && - InVT.isScalableVector() && TLI.isTypeLegal(InVT) && + assert(VT.isScalableVector() && isTypeLegal(VT) && + InVT.isScalableVector() && isTypeLegal(InVT) && "Only expect to cast between legal scalable vector types!"); - assert((VT.getVectorElementType() == MVT::i1) == - (InVT.getVectorElementType() == MVT::i1) && - "Cannot cast between data and predicate scalable vector types!"); + assert(VT.getVectorElementType() != MVT::i1 && + InVT.getVectorElementType() != MVT::i1 && + "For predicate bitcasts, use getSVEPredicateBitCast"); if (InVT == VT) return Op; - if (VT.getVectorElementType() == MVT::i1) - return DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Op); - EVT PackedVT = getPackedSVEVectorVT(VT.getVectorElementType()); EVT PackedInVT = getPackedSVEVectorVT(InVT.getVectorElementType()); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 06ea918ea32e..e02b5e56fd2e 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -571,6 +571,9 @@ public: MachineInstr &MI, MachineBasicBlock *BB) const; MachineBasicBlock *EmitZero(MachineInstr &MI, MachineBasicBlock *BB) const; + MachineBasicBlock *EmitAddVectorToTile(unsigned Opc, unsigned BaseReg, + MachineInstr &MI, + MachineBasicBlock *BB) const; MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, @@ -1148,6 +1151,7 @@ private: // These can make "bitcasting" a multiphase process. REINTERPRET_CAST is used // to transition between unpacked and packed types of the same element type, // with BITCAST used otherwise. + // This function does not handle predicate bitcasts. SDValue getSVESafeBitCast(EVT VT, SDValue Op, SelectionDAG &DAG) const; bool isConstantUnsignedBitfieldExtractLegal(unsigned Opc, LLT Ty1, diff --git a/llvm/lib/Target/AArch64/AArch64InstrAtomics.td b/llvm/lib/Target/AArch64/AArch64InstrAtomics.td index c477a44b13b2..6839e73796a6 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrAtomics.td +++ b/llvm/lib/Target/AArch64/AArch64InstrAtomics.td @@ -29,21 +29,21 @@ def : Pat<(atomic_fence (timm), (timm)), (DMB (i32 0xb))>; // An atomic load operation that does not need either acquire or release // semantics. -class relaxed_load<PatFrag base> +class relaxed_load<PatFrags base> : PatFrag<(ops node:$ptr), (base node:$ptr)> { let IsAtomic = 1; let IsAtomicOrderingAcquireOrStronger = 0; } // A atomic load operation that actually needs acquire semantics. -class acquiring_load<PatFrag base> +class acquiring_load<PatFrags base> : PatFrag<(ops node:$ptr), (base node:$ptr)> { let IsAtomic = 1; let IsAtomicOrderingAcquire = 1; } // An atomic load operation that needs sequential consistency. 
-class seq_cst_load<PatFrag base> +class seq_cst_load<PatFrags base> : PatFrag<(ops node:$ptr), (base node:$ptr)> { let IsAtomic = 1; let IsAtomicOrderingSequentiallyConsistent = 1; @@ -63,34 +63,34 @@ let Predicates = [HasLDAPR] in { } // 8-bit loads -def : Pat<(seq_cst_load<atomic_load_8> GPR64sp:$ptr), (LDARB GPR64sp:$ptr)>; -def : Pat<(acquiring_load<atomic_load_8> GPR64sp:$ptr), (LDARB GPR64sp:$ptr)>; -def : Pat<(relaxed_load<atomic_load_8> (ro_Windexed8 GPR64sp:$Rn, GPR32:$Rm, +def : Pat<(seq_cst_load<atomic_load_az_8> GPR64sp:$ptr), (LDARB GPR64sp:$ptr)>; +def : Pat<(acquiring_load<atomic_load_az_8> GPR64sp:$ptr), (LDARB GPR64sp:$ptr)>; +def : Pat<(relaxed_load<atomic_load_az_8> (ro_Windexed8 GPR64sp:$Rn, GPR32:$Rm, ro_Wextend8:$offset)), (LDRBBroW GPR64sp:$Rn, GPR32:$Rm, ro_Wextend8:$offset)>; -def : Pat<(relaxed_load<atomic_load_8> (ro_Xindexed8 GPR64sp:$Rn, GPR64:$Rm, - ro_Xextend8:$offset)), +def : Pat<(relaxed_load<atomic_load_az_8> (ro_Xindexed8 GPR64sp:$Rn, GPR64:$Rm, + ro_Xextend8:$offset)), (LDRBBroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend8:$offset)>; -def : Pat<(relaxed_load<atomic_load_8> (am_indexed8 GPR64sp:$Rn, - uimm12s1:$offset)), +def : Pat<(relaxed_load<atomic_load_az_8> (am_indexed8 GPR64sp:$Rn, + uimm12s1:$offset)), (LDRBBui GPR64sp:$Rn, uimm12s1:$offset)>; -def : Pat<(relaxed_load<atomic_load_8> +def : Pat<(relaxed_load<atomic_load_az_8> (am_unscaled8 GPR64sp:$Rn, simm9:$offset)), (LDURBBi GPR64sp:$Rn, simm9:$offset)>; // 16-bit loads -def : Pat<(seq_cst_load<atomic_load_16> GPR64sp:$ptr), (LDARH GPR64sp:$ptr)>; -def : Pat<(acquiring_load<atomic_load_16> GPR64sp:$ptr), (LDARH GPR64sp:$ptr)>; -def : Pat<(relaxed_load<atomic_load_16> (ro_Windexed16 GPR64sp:$Rn, GPR32:$Rm, +def : Pat<(seq_cst_load<atomic_load_az_16> GPR64sp:$ptr), (LDARH GPR64sp:$ptr)>; +def : Pat<(acquiring_load<atomic_load_az_16> GPR64sp:$ptr), (LDARH GPR64sp:$ptr)>; +def : Pat<(relaxed_load<atomic_load_az_16> (ro_Windexed16 GPR64sp:$Rn, GPR32:$Rm, ro_Wextend16:$extend)), (LDRHHroW GPR64sp:$Rn, GPR32:$Rm, ro_Wextend16:$extend)>; -def : Pat<(relaxed_load<atomic_load_16> (ro_Xindexed16 GPR64sp:$Rn, GPR64:$Rm, - ro_Xextend16:$extend)), +def : Pat<(relaxed_load<atomic_load_az_16> (ro_Xindexed16 GPR64sp:$Rn, GPR64:$Rm, + ro_Xextend16:$extend)), (LDRHHroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend16:$extend)>; -def : Pat<(relaxed_load<atomic_load_16> (am_indexed16 GPR64sp:$Rn, - uimm12s2:$offset)), +def : Pat<(relaxed_load<atomic_load_az_16> (am_indexed16 GPR64sp:$Rn, + uimm12s2:$offset)), (LDRHHui GPR64sp:$Rn, uimm12s2:$offset)>; -def : Pat<(relaxed_load<atomic_load_16> +def : Pat<(relaxed_load<atomic_load_az_16> (am_unscaled16 GPR64sp:$Rn, simm9:$offset)), (LDURHHi GPR64sp:$Rn, simm9:$offset)>; diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td index 78bc1b8c6f02..02fa36a1df4b 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -1505,7 +1505,7 @@ class CRmSystemI<Operand crmtype, bits<3> opc, string asm, class SystemNoOperands<bits<3> op2, string asm, list<dag> pattern = []> : SimpleSystemI<0, (ins), asm, "", pattern>, - Sched<[]> { + Sched<[WriteHint]> { bits<4> CRm; let CRm = 0b0011; let Inst{31-12} = 0b11010101000000110010; diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 3802a45ad6c1..d444223e4494 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -4356,10 +4356,12 @@ defm 
FCVTZU : SIMDTwoVectorFPToInt<1, 1, 0b11011, "fcvtzu", any_fp_to_uint>; // AArch64's FCVT instructions saturate when out of range. multiclass SIMDTwoVectorFPToIntSatPats<SDNode to_int_sat, string INST> { + let Predicates = [HasFullFP16] in { def : Pat<(v4i16 (to_int_sat v4f16:$Rn, i16)), (!cast<Instruction>(INST # v4f16) v4f16:$Rn)>; def : Pat<(v8i16 (to_int_sat v8f16:$Rn, i16)), (!cast<Instruction>(INST # v8f16) v8f16:$Rn)>; + } def : Pat<(v2i32 (to_int_sat v2f32:$Rn, i32)), (!cast<Instruction>(INST # v2f32) v2f32:$Rn)>; def : Pat<(v4i32 (to_int_sat v4f32:$Rn, i32)), diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index 68ff1b78e84b..c66f9cfd9c22 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -778,7 +778,7 @@ let Predicates = [HasSVEorSME] in { defm BRKB_PPmP : sve_int_break_m<0b101, "brkb", int_aarch64_sve_brkb>; defm BRKBS_PPzP : sve_int_break_z<0b110, "brkbs", null_frag>; - def PTEST_PP : sve_int_ptest<0b010000, "ptest">; + def PTEST_PP : sve_int_ptest<0b010000, "ptest", AArch64ptest>; defm PFALSE : sve_int_pfalse<0b000000, "pfalse">; defm PFIRST : sve_int_pfirst<0b00000, "pfirst", int_aarch64_sve_pfirst>; defm PNEXT : sve_int_pnext<0b00110, "pnext", int_aarch64_sve_pnext>; @@ -1531,6 +1531,14 @@ let Predicates = [HasSVEorSME] in { def : Pat<(nxv8i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 8))), (PUNPKHI_PP PPR:$Ps)>; + def : Pat<(nxv1i1 (extract_subvector (nxv4i1 PPR:$Ps), (i64 0))), + (PUNPKLO_PP (PUNPKLO_PP PPR:$Ps))>; + def : Pat<(nxv1i1 (extract_subvector (nxv4i1 PPR:$Ps), (i64 1))), + (PUNPKHI_PP (PUNPKLO_PP PPR:$Ps))>; + def : Pat<(nxv1i1 (extract_subvector (nxv4i1 PPR:$Ps), (i64 2))), + (PUNPKLO_PP (PUNPKHI_PP PPR:$Ps))>; + def : Pat<(nxv1i1 (extract_subvector (nxv4i1 PPR:$Ps), (i64 3))), + (PUNPKHI_PP (PUNPKHI_PP PPR:$Ps))>; def : Pat<(nxv2i1 (extract_subvector (nxv8i1 PPR:$Ps), (i64 0))), (PUNPKLO_PP (PUNPKLO_PP PPR:$Ps))>; def : Pat<(nxv2i1 (extract_subvector (nxv8i1 PPR:$Ps), (i64 2))), @@ -1539,7 +1547,6 @@ let Predicates = [HasSVEorSME] in { (PUNPKLO_PP (PUNPKHI_PP PPR:$Ps))>; def : Pat<(nxv2i1 (extract_subvector (nxv8i1 PPR:$Ps), (i64 6))), (PUNPKHI_PP (PUNPKHI_PP PPR:$Ps))>; - def : Pat<(nxv4i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 0))), (PUNPKLO_PP (PUNPKLO_PP PPR:$Ps))>; def : Pat<(nxv4i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 4))), @@ -1549,6 +1556,23 @@ let Predicates = [HasSVEorSME] in { def : Pat<(nxv4i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 12))), (PUNPKHI_PP (PUNPKHI_PP PPR:$Ps))>; + + def : Pat<(nxv1i1 (extract_subvector (nxv8i1 PPR:$Ps), (i64 0))), + (PUNPKLO_PP (PUNPKLO_PP (PUNPKLO_PP PPR:$Ps)))>; + def : Pat<(nxv1i1 (extract_subvector (nxv8i1 PPR:$Ps), (i64 1))), + (PUNPKHI_PP (PUNPKLO_PP (PUNPKLO_PP PPR:$Ps)))>; + def : Pat<(nxv1i1 (extract_subvector (nxv8i1 PPR:$Ps), (i64 2))), + (PUNPKLO_PP (PUNPKHI_PP (PUNPKLO_PP PPR:$Ps)))>; + def : Pat<(nxv1i1 (extract_subvector (nxv8i1 PPR:$Ps), (i64 3))), + (PUNPKHI_PP (PUNPKHI_PP (PUNPKLO_PP PPR:$Ps)))>; + def : Pat<(nxv1i1 (extract_subvector (nxv8i1 PPR:$Ps), (i64 4))), + (PUNPKLO_PP (PUNPKLO_PP (PUNPKHI_PP PPR:$Ps)))>; + def : Pat<(nxv1i1 (extract_subvector (nxv8i1 PPR:$Ps), (i64 5))), + (PUNPKHI_PP (PUNPKLO_PP (PUNPKHI_PP PPR:$Ps)))>; + def : Pat<(nxv1i1 (extract_subvector (nxv8i1 PPR:$Ps), (i64 6))), + (PUNPKLO_PP (PUNPKHI_PP (PUNPKHI_PP PPR:$Ps)))>; + def : Pat<(nxv1i1 (extract_subvector (nxv8i1 PPR:$Ps), (i64 7))), + (PUNPKHI_PP (PUNPKHI_PP (PUNPKHI_PP PPR:$Ps)))>; 
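The nxv1i1 extract_subvector patterns above, and the nxv16i1 table that follows, are all instances of a single rule: each PUNPKLO_PP/PUNPKHI_PP halves the predicate, and the bits of the element index, most-significant bit first, pick the low or high half from the innermost unpack outwards. A minimal standalone sketch of that derivation (illustrative only; punpkChain and its textual output are hypothetical helpers, not LLVM APIs):

#include <cstdio>
#include <string>

// Build the unpack chain that extracts the nxv1i1 sub-predicate at
// element Idx from a source predicate with SrcLanes i1 lanes.
std::string punpkChain(unsigned Idx, unsigned SrcLanes) {
  std::string Expr = "Ps";
  for (unsigned Half = SrcLanes / 2; Half >= 1; Half /= 2) {
    bool Hi = (Idx & Half) != 0;   // current most-significant index bit
    Expr = (Hi ? "PUNPKHI_PP(" : "PUNPKLO_PP(") + Expr + ")";
    Idx &= Half - 1;               // drop that bit, recurse into the half
  }
  return Expr;
}

int main() {
  // Matches the (i64 13) pattern in the nxv16i1 table below:
  // PUNPKHI_PP(PUNPKLO_PP(PUNPKHI_PP(PUNPKHI_PP(Ps))))
  std::printf("%s\n", punpkChain(13, 16).c_str());
}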
def : Pat<(nxv2i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 0))), (PUNPKLO_PP (PUNPKLO_PP (PUNPKLO_PP PPR:$Ps)))>; def : Pat<(nxv2i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 2))), @@ -1566,6 +1590,39 @@ let Predicates = [HasSVEorSME] in { def : Pat<(nxv2i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 14))), (PUNPKHI_PP (PUNPKHI_PP (PUNPKHI_PP PPR:$Ps)))>; + def : Pat<(nxv1i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 0))), + (PUNPKLO_PP (PUNPKLO_PP (PUNPKLO_PP (PUNPKLO_PP PPR:$Ps))))>; + def : Pat<(nxv1i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 1))), + (PUNPKHI_PP (PUNPKLO_PP (PUNPKLO_PP (PUNPKLO_PP PPR:$Ps))))>; + def : Pat<(nxv1i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 2))), + (PUNPKLO_PP (PUNPKHI_PP (PUNPKLO_PP (PUNPKLO_PP PPR:$Ps))))>; + def : Pat<(nxv1i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 3))), + (PUNPKHI_PP (PUNPKHI_PP (PUNPKLO_PP (PUNPKLO_PP PPR:$Ps))))>; + def : Pat<(nxv1i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 4))), + (PUNPKLO_PP (PUNPKLO_PP (PUNPKHI_PP (PUNPKLO_PP PPR:$Ps))))>; + def : Pat<(nxv1i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 5))), + (PUNPKHI_PP (PUNPKLO_PP (PUNPKHI_PP (PUNPKLO_PP PPR:$Ps))))>; + def : Pat<(nxv1i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 6))), + (PUNPKLO_PP (PUNPKHI_PP (PUNPKHI_PP (PUNPKLO_PP PPR:$Ps))))>; + def : Pat<(nxv1i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 7))), + (PUNPKHI_PP (PUNPKHI_PP (PUNPKHI_PP (PUNPKLO_PP PPR:$Ps))))>; + def : Pat<(nxv1i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 8))), + (PUNPKLO_PP (PUNPKLO_PP (PUNPKLO_PP (PUNPKHI_PP PPR:$Ps))))>; + def : Pat<(nxv1i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 9))), + (PUNPKHI_PP (PUNPKLO_PP (PUNPKLO_PP (PUNPKHI_PP PPR:$Ps))))>; + def : Pat<(nxv1i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 10))), + (PUNPKLO_PP (PUNPKHI_PP (PUNPKLO_PP (PUNPKHI_PP PPR:$Ps))))>; + def : Pat<(nxv1i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 11))), + (PUNPKHI_PP (PUNPKHI_PP (PUNPKLO_PP (PUNPKHI_PP PPR:$Ps))))>; + def : Pat<(nxv1i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 12))), + (PUNPKLO_PP (PUNPKLO_PP (PUNPKHI_PP (PUNPKHI_PP PPR:$Ps))))>; + def : Pat<(nxv1i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 13))), + (PUNPKHI_PP (PUNPKLO_PP (PUNPKHI_PP (PUNPKHI_PP PPR:$Ps))))>; + def : Pat<(nxv1i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 14))), + (PUNPKLO_PP (PUNPKHI_PP (PUNPKHI_PP (PUNPKHI_PP PPR:$Ps))))>; + def : Pat<(nxv1i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 15))), + (PUNPKHI_PP (PUNPKHI_PP (PUNPKHI_PP (PUNPKHI_PP PPR:$Ps))))>; + // Extract subvectors from FP SVE vectors def : Pat<(nxv2f16 (extract_subvector (nxv4f16 ZPR:$Zs), (i64 0))), (UUNPKLO_ZZ_D ZPR:$Zs)>; @@ -2074,15 +2131,6 @@ let Predicates = [HasSVEorSME] in { def STR_ZZZZXI : Pseudo<(outs), (ins ZZZZ_b:$Zs, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>; } - def : Pat<(AArch64ptest (nxv16i1 PPR:$pg), (nxv16i1 PPR:$src)), - (PTEST_PP PPR:$pg, PPR:$src)>; - def : Pat<(AArch64ptest (nxv8i1 PPR:$pg), (nxv8i1 PPR:$src)), - (PTEST_PP PPR:$pg, PPR:$src)>; - def : Pat<(AArch64ptest (nxv4i1 PPR:$pg), (nxv4i1 PPR:$src)), - (PTEST_PP PPR:$pg, PPR:$src)>; - def : Pat<(AArch64ptest (nxv2i1 PPR:$pg), (nxv2i1 PPR:$src)), - (PTEST_PP PPR:$pg, PPR:$src)>; - let AddedComplexity = 1 in { class LD1RPat<ValueType vt, SDPatternOperator operator, Instruction load, Instruction ptrue, ValueType index_vt, ComplexPattern CP, Operand immtype> : @@ -2347,6 +2395,9 @@ let Predicates = [HasSVEorSME] in { (AND_PPzPP (PTRUE_S 31), PPR:$Ps1, PPR:$Ps2)>; def : Pat<(nxv2i1 (and PPR:$Ps1, PPR:$Ps2)), (AND_PPzPP (PTRUE_D 31), PPR:$Ps1, PPR:$Ps2)>; 
+ // Emulate .Q operation using a PTRUE_D when the other lanes don't matter. + def : Pat<(nxv1i1 (and PPR:$Ps1, PPR:$Ps2)), + (AND_PPzPP (PTRUE_D 31), PPR:$Ps1, PPR:$Ps2)>; // Add more complex addressing modes here as required multiclass pred_load<ValueType Ty, ValueType PredTy, SDPatternOperator Load, diff --git a/llvm/lib/Target/AArch64/AArch64SchedA53.td b/llvm/lib/Target/AArch64/AArch64SchedA53.td index d18a05fda191..e378b043d37e 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedA53.td +++ b/llvm/lib/Target/AArch64/AArch64SchedA53.td @@ -28,7 +28,8 @@ def CortexA53Model : SchedMachineModel { list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F, PAUnsupported.F, - SMEUnsupported.F); + SMEUnsupported.F, + [HasMTE]); } diff --git a/llvm/lib/Target/AArch64/AArch64SchedA55.td b/llvm/lib/Target/AArch64/AArch64SchedA55.td index c6b112d0d2f1..141cc6b79c8b 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedA55.td +++ b/llvm/lib/Target/AArch64/AArch64SchedA55.td @@ -29,7 +29,7 @@ def CortexA55Model : SchedMachineModel { let PostRAScheduler = 1; // Enable PostRA scheduler pass. let CompleteModel = 0; // Covers instructions applicable to Cortex-A55. - list<Predicate> UnsupportedFeatures = [HasSVE]; + list<Predicate> UnsupportedFeatures = [HasSVE, HasMTE]; // FIXME: Remove when all errors have been fixed. let FullInstRWOverlapCheck = 0; diff --git a/llvm/lib/Target/AArch64/AArch64SchedA57.td b/llvm/lib/Target/AArch64/AArch64SchedA57.td index a860aa907fd1..8ce229374000 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedA57.td +++ b/llvm/lib/Target/AArch64/AArch64SchedA57.td @@ -33,7 +33,8 @@ def CortexA57Model : SchedMachineModel { list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F, PAUnsupported.F, - SMEUnsupported.F); + SMEUnsupported.F, + [HasMTE]); } //===----------------------------------------------------------------------===// @@ -459,9 +460,9 @@ def : InstRW<[A57Write_5cyc_2V], (instregex "^(FACGE|FACGT|FCMEQ|FCMGE|FCMGT|FCM // ASIMD FP convert, long and narrow def : InstRW<[A57Write_8cyc_3V], (instregex "^FCVT(L|N|XN)v")>; // ASIMD FP convert, other, D-form -def : InstRW<[A57Write_5cyc_1V], (instregex "^[FVSU]CVT([AMNPZ][SU])?(_Int)?(v2f32|v1i32|v2i32|v1i64)")>; +def : InstRW<[A57Write_5cyc_1V], (instregex "^[FSU]CVT([AMNPZ][SU])?(_Int)?(v2f32|v1i32|v2i32|v1i64)")>; // ASIMD FP convert, other, Q-form -def : InstRW<[A57Write_5cyc_2V], (instregex "^[FVSU]CVT([AMNPZ][SU])?(_Int)?(v4f32|v2f64|v4i32|v2i64)")>; +def : InstRW<[A57Write_5cyc_2V], (instregex "^[FSU]CVT([AMNPZ][SU])?(_Int)?(v4f32|v2f64|v4i32|v2i64)")>; // ASIMD FP divide, D-form, F32 def : InstRW<[A57Write_17cyc_1W], (instregex "FDIVv2f32")>; diff --git a/llvm/lib/Target/AArch64/AArch64SchedA64FX.td b/llvm/lib/Target/AArch64/AArch64SchedA64FX.td index 6b053f1969b4..4c65b6727d93 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedA64FX.td +++ b/llvm/lib/Target/AArch64/AArch64SchedA64FX.td @@ -18,11 +18,11 @@ def A64FXModel : SchedMachineModel { // Determined via a mix of micro-arch details and experimentation. let LoopMicroOpBufferSize = 128; let PostRAScheduler = 1; // Using PostRA sched. 
- let CompleteModel = 1; + let CompleteModel = 0; list<Predicate> UnsupportedFeatures = [HasSVE2, HasSVE2AES, HasSVE2SM4, HasSVE2SHA3, HasSVE2BitPerm, HasPAuth, - HasSVE2orSME]; + HasSVE2orSME, HasMTE, HasMatMulInt8, HasBF16]; let FullInstRWOverlapCheck = 0; } diff --git a/llvm/lib/Target/AArch64/AArch64SchedAmpere1.td b/llvm/lib/Target/AArch64/AArch64SchedAmpere1.td index 32f7299fbf87..b8d5a70d7ec6 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedAmpere1.td +++ b/llvm/lib/Target/AArch64/AArch64SchedAmpere1.td @@ -25,7 +25,9 @@ def Ampere1Model : SchedMachineModel { let CompleteModel = 1; list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F, - SMEUnsupported.F); + SMEUnsupported.F, + PAUnsupported.F, + [HasMTE]); } let SchedModel = Ampere1Model in { diff --git a/llvm/lib/Target/AArch64/AArch64SchedCyclone.td b/llvm/lib/Target/AArch64/AArch64SchedCyclone.td index 9fbb46919427..e2d916954060 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedCyclone.td +++ b/llvm/lib/Target/AArch64/AArch64SchedCyclone.td @@ -20,7 +20,8 @@ def CycloneModel : SchedMachineModel { list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F, PAUnsupported.F, - SMEUnsupported.F); + SMEUnsupported.F, + [HasMTE]); } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AArch64/AArch64SchedExynosM3.td b/llvm/lib/Target/AArch64/AArch64SchedExynosM3.td index d66efb82fccc..f2863f5a8e3b 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedExynosM3.td +++ b/llvm/lib/Target/AArch64/AArch64SchedExynosM3.td @@ -26,7 +26,8 @@ def ExynosM3Model : SchedMachineModel { list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F, PAUnsupported.F, - SMEUnsupported.F); + SMEUnsupported.F, + [HasMTE]); } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AArch64/AArch64SchedExynosM4.td b/llvm/lib/Target/AArch64/AArch64SchedExynosM4.td index 94e70793e855..ab1e680f9e99 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedExynosM4.td +++ b/llvm/lib/Target/AArch64/AArch64SchedExynosM4.td @@ -26,7 +26,8 @@ def ExynosM4Model : SchedMachineModel { list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F, PAUnsupported.F, - SMEUnsupported.F); + SMEUnsupported.F, + [HasMTE]); } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AArch64/AArch64SchedExynosM5.td b/llvm/lib/Target/AArch64/AArch64SchedExynosM5.td index 1db5f5322a64..ae0b2b3eaeb6 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedExynosM5.td +++ b/llvm/lib/Target/AArch64/AArch64SchedExynosM5.td @@ -26,7 +26,8 @@ def ExynosM5Model : SchedMachineModel { list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F, PAUnsupported.F, - SMEUnsupported.F); + SMEUnsupported.F, + [HasMTE]); } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AArch64/AArch64SchedFalkor.td b/llvm/lib/Target/AArch64/AArch64SchedFalkor.td index 7c9b0afdd169..a765cd1cdfe3 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedFalkor.td +++ b/llvm/lib/Target/AArch64/AArch64SchedFalkor.td @@ -25,7 +25,8 @@ def FalkorModel : SchedMachineModel { list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F, PAUnsupported.F, - SMEUnsupported.F); + SMEUnsupported.F, + [HasMTE]); // FIXME: Remove when all errors have been fixed. 
let FullInstRWOverlapCheck = 0; } diff --git a/llvm/lib/Target/AArch64/AArch64SchedKryo.td b/llvm/lib/Target/AArch64/AArch64SchedKryo.td index cc568a2f2f17..3551066ee7c3 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedKryo.td +++ b/llvm/lib/Target/AArch64/AArch64SchedKryo.td @@ -29,7 +29,8 @@ def KryoModel : SchedMachineModel { list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F, PAUnsupported.F, - SMEUnsupported.F); + SMEUnsupported.F, + [HasMTE]); // FIXME: Remove when all errors have been fixed. let FullInstRWOverlapCheck = 0; } diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td new file mode 100644 index 000000000000..eb5b971d66e5 --- /dev/null +++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td @@ -0,0 +1,2279 @@ +//=- AArch64SchedNeoverseN2.td - NeoverseN2 Scheduling Defs --*- tablegen -*-=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the scheduling model for the Arm Neoverse N2 processors. +// +//===----------------------------------------------------------------------===// + +def NeoverseN2Model : SchedMachineModel { + let IssueWidth = 10; // Micro-ops dispatched at a time. + let MicroOpBufferSize = 160; // Entries in micro-op re-order buffer. + let LoadLatency = 4; // Optimistic load latency. + let MispredictPenalty = 10; // Extra cycles for mispredicted branch. + let LoopMicroOpBufferSize = 16; // NOTE: Copied from Cortex-A57. + let CompleteModel = 1; + + list<Predicate> UnsupportedFeatures = SMEUnsupported.F; +} + +//===----------------------------------------------------------------------===// +// Define each kind of processor resource and number available on Neoverse N2. +// Instructions are first fetched and then decoded into internal macro-ops +// (MOPs). From there, the MOPs proceed through register renaming and dispatch +// stages. A MOP can be split into two micro-ops further down the pipeline +// after the decode stage. Once dispatched, micro-ops wait for their operands +// and issue out-of-order to one of thirteen issue pipelines. Each issue +// pipeline can accept one micro-op per cycle. + +let SchedModel = NeoverseN2Model in { + +// Define the (13) issue ports. +def N2UnitB : ProcResource<2>; // Branch 0/1 +def N2UnitS : ProcResource<2>; // Integer single Cycle 0/1 +def N2UnitM0 : ProcResource<1>; // Integer multicycle 0 +def N2UnitM1 : ProcResource<1>; // Integer multicycle 1 +def N2UnitL01 : ProcResource<2>; // Load/Store 0/1 +def N2UnitL2 : ProcResource<1>; // Load 2 +def N2UnitD : ProcResource<2>; // Store data 0/1 +def N2UnitV0 : ProcResource<1>; // FP/ASIMD 0 +def N2UnitV1 : ProcResource<1>; // FP/ASIMD 1 + +def N2UnitV : ProcResGroup<[N2UnitV0, N2UnitV1]>; // FP/ASIMD 0/1 +def N2UnitM : ProcResGroup<[N2UnitM0, N2UnitM1]>; // Integer single/multicycle 0/1 +def N2UnitL : ProcResGroup<[N2UnitL01, N2UnitL2]>; // Load/Store 0/1 and Load 2 +def N2UnitI : ProcResGroup<[N2UnitS, N2UnitM0, N2UnitM1]>; // Integer single cycle 0/1 and single/multicycle 0/1 + +// Define commonly used read types. + +// No forwarding is provided for these types. 
+def : ReadAdvance<ReadI, 0>; +def : ReadAdvance<ReadISReg, 0>; +def : ReadAdvance<ReadIEReg, 0>; +def : ReadAdvance<ReadIM, 0>; +def : ReadAdvance<ReadIMA, 0>; +def : ReadAdvance<ReadID, 0>; +def : ReadAdvance<ReadExtrHi, 0>; +def : ReadAdvance<ReadAdrBase, 0>; +def : ReadAdvance<ReadST, 0>; +def : ReadAdvance<ReadVLD, 0>; + +def : WriteRes<WriteAtomic, []> { let Unsupported = 1; } +def : WriteRes<WriteBarrier, []> { let Latency = 1; } +def : WriteRes<WriteHint, []> { let Latency = 1; } +def : WriteRes<WriteLDHi, []> { let Latency = 4; } + +//===----------------------------------------------------------------------===// +// Define customized scheduler read/write types specific to the Neoverse N2. + +//===----------------------------------------------------------------------===// +// Define generic 1 micro-op types + +def N2Write_1cyc_1B : SchedWriteRes<[N2UnitB]> { let Latency = 1; } +def N2Write_1cyc_1I : SchedWriteRes<[N2UnitI]> { let Latency = 1; } +def N2Write_1cyc_1M : SchedWriteRes<[N2UnitM]> { let Latency = 1; } +def N2Write_1cyc_1M0 : SchedWriteRes<[N2UnitM0]> { let Latency = 1; } +def N2Write_1cyc_1L01 : SchedWriteRes<[N2UnitL01]> { let Latency = 1; } +def N2Write_2cyc_1M : SchedWriteRes<[N2UnitM]> { let Latency = 2; } +def N2Write_3cyc_1M : SchedWriteRes<[N2UnitM]> { let Latency = 3; } +def N2Write_2cyc_1M0 : SchedWriteRes<[N2UnitM0]> { let Latency = 2; + let ResourceCycles = [2]; } +def N2Write_3cyc_1M0 : SchedWriteRes<[N2UnitM0]> { let Latency = 3; + let ResourceCycles = [3]; } +def N2Write_5cyc_1M0 : SchedWriteRes<[N2UnitM0]> { let Latency = 5; + let ResourceCycles = [5]; } +def N2Write_12cyc_1M0 : SchedWriteRes<[N2UnitM0]> { let Latency = 12; + let ResourceCycles = [12]; } +def N2Write_20cyc_1M0 : SchedWriteRes<[N2UnitM0]> { let Latency = 20; + let ResourceCycles = [20]; } +def N2Write_4cyc_1L : SchedWriteRes<[N2UnitL]> { let Latency = 4; } +def N2Write_6cyc_1L : SchedWriteRes<[N2UnitL]> { let Latency = 6; } +def N2Write_2cyc_1V : SchedWriteRes<[N2UnitV]> { let Latency = 2; } +def N2Write_3cyc_1V : SchedWriteRes<[N2UnitV]> { let Latency = 3; } +def N2Write_4cyc_1V : SchedWriteRes<[N2UnitV]> { let Latency = 4; } +def N2Write_5cyc_1V : SchedWriteRes<[N2UnitV]> { let Latency = 5; } +def N2Write_12cyc_1V : SchedWriteRes<[N2UnitV]> { let Latency = 12; } +def N2Write_2cyc_1V0 : SchedWriteRes<[N2UnitV0]> { let Latency = 2; } +def N2Write_3cyc_1V0 : SchedWriteRes<[N2UnitV0]> { let Latency = 3; } +def N2Write_4cyc_1V0 : SchedWriteRes<[N2UnitV0]> { let Latency = 4; } +def N2Write_7cyc_1V0 : SchedWriteRes<[N2UnitV0]> { let Latency = 7; + let ResourceCycles = [7]; } +def N2Write_9cyc_1V0 : SchedWriteRes<[N2UnitV0]> { let Latency = 9; } +def N2Write_10cyc_1V0 : SchedWriteRes<[N2UnitV0]> { let Latency = 10; } +def N2Write_12cyc_1V0 : SchedWriteRes<[N2UnitV0]> { let Latency = 12; } +def N2Write_13cyc_1V0 : SchedWriteRes<[N2UnitV0]> { let Latency = 13; } +def N2Write_15cyc_1V0 : SchedWriteRes<[N2UnitV0]> { let Latency = 15; } +def N2Write_16cyc_1V0 : SchedWriteRes<[N2UnitV0]> { let Latency = 16; } +def N2Write_20cyc_1V0 : SchedWriteRes<[N2UnitV0]> { let Latency = 20; } +def N2Write_2cyc_1V1 : SchedWriteRes<[N2UnitV1]> { let Latency = 2; } +def N2Write_3cyc_1V1 : SchedWriteRes<[N2UnitV1]> { let Latency = 3; } +def N2Write_4cyc_1V1 : SchedWriteRes<[N2UnitV1]> { let Latency = 4; } +def N2Write_6cyc_1V1 : SchedWriteRes<[N2UnitV1]> { let Latency = 6; } +def N2Write_10cyc_1V1 : SchedWriteRes<[N2UnitV1]> { let Latency = 10; } +def N2Write_6cyc_1L01 : SchedWriteRes<[N2UnitL01]> { let Latency = 6; } 
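Two knobs in the defs above are easy to conflate: Latency is when a dependent instruction can read the result, while ResourceCycles is how long the issue port stays busy, and so bounds throughput rather than latency. A rough mental model in C++ (the struct and cyclesFor are purely illustrative, not LLVM types; the two constants mirror defs above):

struct WriteRes {
  unsigned Latency;        // cycles until the result is available
  unsigned ResourceCycles; // cycles the issue port stays occupied
};

// N2Write_20cyc_1M0 (used below for the 64-bit divide, WriteID64) holds M0
// for all 20 cycles, so even independent divides serialize; N2Write_3cyc_1M
// (used below for multiply high) occupies its port for one cycle, so
// independent ops can issue every cycle.
constexpr WriteRes Div64{20, 20};   // N2Write_20cyc_1M0
constexpr WriteRes MulHigh{3, 1};   // N2Write_3cyc_1M

// Cycles to complete N independent ops on one port.
constexpr unsigned cyclesFor(unsigned N, WriteRes W) {
  return (N - 1) * W.ResourceCycles + W.Latency;
}
static_assert(cyclesFor(4, Div64) == 80, "non-pipelined: divides serialize");
static_assert(cyclesFor(4, MulHigh) == 6, "pipelined: issue one per cycle");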
+ +//===----------------------------------------------------------------------===// +// Define generic 2 micro-op types + +def N2Write_1cyc_1B_1S : SchedWriteRes<[N2UnitB, N2UnitS]> { + let Latency = 1; + let NumMicroOps = 2; +} + +def N2Write_6cyc_1M0_1B : SchedWriteRes<[N2UnitM0, N2UnitB]> { + let Latency = 6; + let NumMicroOps = 2; +} + +def N2Write_9cyc_1M0_1L : SchedWriteRes<[N2UnitM0, N2UnitL]> { + let Latency = 9; + let NumMicroOps = 2; +} + +def N2Write_3cyc_1I_1M : SchedWriteRes<[N2UnitI, N2UnitM]> { + let Latency = 3; + let NumMicroOps = 2; +} + +def N2Write_4cyc_1I_1L : SchedWriteRes<[N2UnitI, N2UnitL]> { + let Latency = 4; + let NumMicroOps = 2; +} + +def N2Write_5cyc_1I_1L : SchedWriteRes<[N2UnitI, N2UnitL]> { + let Latency = 5; + let NumMicroOps = 2; +} + +def N2Write_6cyc_1I_1L : SchedWriteRes<[N2UnitI, N2UnitL]> { + let Latency = 6; + let NumMicroOps = 2; +} + +def N2Write_7cyc_1I_1L : SchedWriteRes<[N2UnitI, N2UnitL]> { + let Latency = 7; + let NumMicroOps = 2; +} + +def N2Write_1cyc_1L01_1D : SchedWriteRes<[N2UnitL01, N2UnitD]> { + let Latency = 1; + let NumMicroOps = 2; +} + +def N2Write_5cyc_1M0_1V : SchedWriteRes<[N2UnitM0, N2UnitV]> { + let Latency = 5; + let NumMicroOps = 2; +} + +def N2Write_2cyc_1L01_1V : SchedWriteRes<[N2UnitL01, N2UnitV]> { + let Latency = 2; + let NumMicroOps = 2; +} + +def N2Write_4cyc_1V1_1V : SchedWriteRes<[N2UnitV1, N2UnitV]> { + let Latency = 4; + let NumMicroOps = 2; +} + +def N2Write_4cyc_2V0 : SchedWriteRes<[N2UnitV0, N2UnitV0]> { + let Latency = 4; + let NumMicroOps = 2; +} + +def N2Write_10cyc_2V0 : SchedWriteRes<[N2UnitV0, N2UnitV0]> { + let Latency = 10; + let NumMicroOps = 2; + let ResourceCycles = [5, 5]; +} + +def N2Write_13cyc_2V0 : SchedWriteRes<[N2UnitV0, N2UnitV0]> { + let Latency = 13; + let NumMicroOps = 2; + let ResourceCycles = [6, 7]; +} + +def N2Write_15cyc_2V0 : SchedWriteRes<[N2UnitV0, N2UnitV0]> { + let Latency = 15; + let NumMicroOps = 2; + let ResourceCycles = [7, 8]; +} + +def N2Write_16cyc_2V0 : SchedWriteRes<[N2UnitV0, N2UnitV0]> { + let Latency = 16; + let NumMicroOps = 2; + let ResourceCycles = [8, 8]; +} + +def N2Write_4cyc_2V : SchedWriteRes<[N2UnitV, N2UnitV]> { + let Latency = 4; + let NumMicroOps = 2; +} + +def N2Write_6cyc_2V : SchedWriteRes<[N2UnitV, N2UnitV]> { + let Latency = 6; + let NumMicroOps = 2; +} + +def N2Write_6cyc_2L : SchedWriteRes<[N2UnitL, N2UnitL]> { + let Latency = 6; + let NumMicroOps = 2; +} + +def N2Write_8cyc_1L_1V : SchedWriteRes<[N2UnitL, N2UnitV]> { + let Latency = 8; + let NumMicroOps = 2; +} + +def N2Write_4cyc_1L01_1V : SchedWriteRes<[N2UnitL01, N2UnitV]> { + let Latency = 4; + let NumMicroOps = 2; +} + +def N2Write_3cyc_1M0_1M : SchedWriteRes<[N2UnitM0, N2UnitM]> { + let Latency = 3; + let NumMicroOps = 2; +} + +def N2Write_2cyc_1M0_1M : SchedWriteRes<[N2UnitM0, N2UnitM]> { + let Latency = 2; + let NumMicroOps = 2; +} + +def N2Write_6cyc_2V1 : SchedWriteRes<[N2UnitV1, N2UnitV1]> { + let Latency = 6; + let NumMicroOps = 2; +} + +def N2Write_4cyc_1V0_1M : SchedWriteRes<[N2UnitV0, N2UnitM]> { + let Latency = 4; + let NumMicroOps = 2; +} + +def N2Write_5cyc_2V0 : SchedWriteRes<[N2UnitV0, N2UnitV0]> { + let Latency = 5; + let NumMicroOps = 2; +} + +def N2Write_5cyc_1V1_1M0 : SchedWriteRes<[N2UnitV1, N2UnitM0]> { + let Latency = 5; + let NumMicroOps = 2; +} + +def N2Write_7cyc_1M0_1V0 : SchedWriteRes<[N2UnitM0, N2UnitV0]> { + let Latency = 7; + let NumMicroOps = 2; +} + +def N2Write_2cyc_1V0_1M : SchedWriteRes<[N2UnitV0, N2UnitM]> { + let Latency = 2; + let NumMicroOps = 2; +} + 
+def N2Write_6cyc_1V_1V1 : SchedWriteRes<[N2UnitV, N2UnitV1]> { + let Latency = 6; + let NumMicroOps = 2; +} + +def N2Write_6cyc_1L_1M : SchedWriteRes<[N2UnitL, N2UnitM]> { + let Latency = 6; + let NumMicroOps = 2; +} + +def N2Write_6cyc_1L_1S : SchedWriteRes<[N2UnitL, N2UnitS]> { + let Latency = 6; + let NumMicroOps = 2; +} + +def N2Write_9cyc_1L_1V : SchedWriteRes<[N2UnitL, N2UnitV]> { + let Latency = 9; + let NumMicroOps = 2; +} + +def N2Write_4cyc_2V1 : SchedWriteRes<[N2UnitV1, N2UnitV1]> { + let Latency = 4; + let NumMicroOps = 2; +} + +//===----------------------------------------------------------------------===// +// Define generic 3 micro-op types + +def N2Write_1cyc_1L01_1D_1I : SchedWriteRes<[N2UnitL01, N2UnitD, N2UnitI]> { + let Latency = 1; + let NumMicroOps = 3; +} + +def N2Write_2cyc_1L01_1V_1I : SchedWriteRes<[N2UnitL01, N2UnitV, N2UnitI]> { + let Latency = 2; + let NumMicroOps = 3; +} + +def N2Write_2cyc_1L01_2V : SchedWriteRes<[N2UnitL01, N2UnitV, N2UnitV]> { + let Latency = 2; + let NumMicroOps = 3; +} + +def N2Write_7cyc_1M_1M0_1V : SchedWriteRes<[N2UnitM, N2UnitM0, N2UnitV]> { + let Latency = 7; + let NumMicroOps = 3; +} + +def N2Write_8cyc_1M0_1V1_1V : SchedWriteRes<[N2UnitM0, N2UnitV1, N2UnitV]> { + let Latency = 8; + let NumMicroOps = 3; +} + +def N2Write_10cyc_1V_1L_1S : SchedWriteRes<[N2UnitV, N2UnitL, N2UnitL]> { + let Latency = 10; + let NumMicroOps = 3; +} + +def N2Write_2cyc_1L01_1S_1V : SchedWriteRes<[N2UnitL01, N2UnitS, N2UnitV]> { + let Latency = 2; + let NumMicroOps = 3; +} + +def N2Write_4cyc_1L01_1S_1V : SchedWriteRes<[N2UnitL01, N2UnitS, N2UnitV]> { + let Latency = 4; + let NumMicroOps = 3; +} + +def N2Write_6cyc_3L : SchedWriteRes<[N2UnitL, N2UnitL, N2UnitL]> { + let Latency = 6; + let NumMicroOps = 3; +} + +def N2Write_8cyc_1L_2V : SchedWriteRes<[N2UnitL, N2UnitV, N2UnitV]> { + let Latency = 8; + let NumMicroOps = 3; +} + +//===----------------------------------------------------------------------===// +// Define generic 4 micro-op types + +def N2Write_2cyc_1L01_2V_1I : SchedWriteRes<[N2UnitL01, N2UnitV, N2UnitV, + N2UnitI]> { + let Latency = 2; + let NumMicroOps = 4; +} + +def N2Write_6cyc_4V0 : SchedWriteRes<[N2UnitV0, N2UnitV0, N2UnitV0, N2UnitV0]> { + let Latency = 6; + let NumMicroOps = 4; +} + +def N2Write_4cyc_4V : SchedWriteRes<[N2UnitV, N2UnitV, N2UnitV, N2UnitV]> { + let Latency = 4; + let NumMicroOps = 4; +} + +def N2Write_6cyc_4V : SchedWriteRes<[N2UnitV, N2UnitV, N2UnitV, N2UnitV]> { + let Latency = 6; + let NumMicroOps = 4; +} + +def N2Write_8cyc_2L_2V : SchedWriteRes<[N2UnitL, N2UnitL, N2UnitV, N2UnitV]> { + let Latency = 8; + let NumMicroOps = 4; +} + +def N2Write_9cyc_2L_2V : SchedWriteRes<[N2UnitL, N2UnitL, N2UnitV, N2UnitV]> { + let Latency = 9; + let NumMicroOps = 4; +} + +def N2Write_2cyc_2L01_2V : SchedWriteRes<[N2UnitL01, N2UnitL01, N2UnitV, + N2UnitV]> { + let Latency = 2; + let NumMicroOps = 4; +} + +def N2Write_4cyc_2L01_2V : SchedWriteRes<[N2UnitL01, N2UnitL01, N2UnitV, + N2UnitV]> { + let Latency = 4; + let NumMicroOps = 4; +} + +def N2Write_5cyc_2L01_2V : SchedWriteRes<[N2UnitL01, N2UnitL01, N2UnitV, + N2UnitV]> { + let Latency = 5; + let NumMicroOps = 4; +} + +def N2Write_8cyc_2M0_2V0 : SchedWriteRes<[N2UnitM0, N2UnitM0, N2UnitV0, + N2UnitV0]> { + let Latency = 8; + let NumMicroOps = 4; +} + +def N2Write_11cyc_2V_2V1 : SchedWriteRes<[N2UnitV, N2UnitV, N2UnitV1, + N2UnitV1]> { + let Latency = 11; + let NumMicroOps = 4; +} + +def N2Write_9cyc_2V_2V1 : SchedWriteRes<[N2UnitV, N2UnitV, N2UnitV1, + N2UnitV1]> { + let Latency = 
9; + let NumMicroOps = 4; +} + +def N2Write_8cyc_2V_2V1 : SchedWriteRes<[N2UnitV, N2UnitV, N2UnitV1, + N2UnitV1]> { + let Latency = 8; + let NumMicroOps = 4; +} + +def N2Write_10cyc_2L_2V1 : SchedWriteRes<[N2UnitV, N2UnitV, N2UnitV1, + N2UnitV1]> { + let Latency = 10; + let NumMicroOps = 4; +} + +def N2Write_10cyc_2L_2V : SchedWriteRes<[N2UnitL, N2UnitL, N2UnitV, N2UnitV]> { + let Latency = 10; + let NumMicroOps = 4; +} + +def N2Write_4cyc_2M0_2M : SchedWriteRes<[N2UnitM0, N2UnitM0, N2UnitM, + N2UnitM]> { + let Latency = 4; + let NumMicroOps = 4; +} + +def N2Write_6cyc_2I_2L : SchedWriteRes<[N2UnitI, N2UnitI, N2UnitL, N2UnitL]> { + let Latency = 6; + let NumMicroOps = 4; +} + +def N2Write_7cyc_4L : SchedWriteRes<[N2UnitL, N2UnitL, N2UnitL, N2UnitL]> { + let Latency = 7; + let NumMicroOps = 4; +} + +//===----------------------------------------------------------------------===// +// Define generic 5 micro-op types + +def N2Write_2cyc_1L01_2V_2I : SchedWriteRes<[N2UnitL01, N2UnitV, N2UnitV, + N2UnitI, N2UnitI]> { + let Latency = 2; + let NumMicroOps = 5; +} + +def N2Write_8cyc_2L_3V : SchedWriteRes<[N2UnitL, N2UnitL, N2UnitV, N2UnitV, + N2UnitV]> { + let Latency = 8; + let NumMicroOps = 5; +} + +//===----------------------------------------------------------------------===// +// Define generic 6 micro-op types + +def N2Write_8cyc_3L_3V : SchedWriteRes<[N2UnitL, N2UnitL, N2UnitL, + N2UnitV, N2UnitV, N2UnitV]> { + let Latency = 8; + let NumMicroOps = 6; +} + +def N2Write_2cyc_3L01_3V : SchedWriteRes<[N2UnitL01, N2UnitL01, N2UnitL01, + N2UnitV, N2UnitV, N2UnitV]> { + let Latency = 2; + let NumMicroOps = 6; +} + +def N2Write_6cyc_3L01_3V : SchedWriteRes<[N2UnitL01, N2UnitL01, N2UnitL01, + N2UnitV, N2UnitV, N2UnitV]> { + let Latency = 6; + let NumMicroOps = 6; +} + +def N2Write_4cyc_3L01_3V : SchedWriteRes<[N2UnitL01, N2UnitL01, N2UnitL01, + N2UnitV, N2UnitV, N2UnitV]> { + let Latency = 4; + let NumMicroOps = 6; +} + +def N2Write_10cyc_2L_2V_2S : SchedWriteRes<[N2UnitL, N2UnitL, N2UnitV, N2UnitV, + N2UnitS, N2UnitS]> { + let Latency = 10; + let NumMicroOps = 6; +} + +//===----------------------------------------------------------------------===// +// Define generic 7 micro-op types + +def N2Write_8cyc_3L_4V : SchedWriteRes<[N2UnitL, N2UnitL, N2UnitL, + N2UnitV, N2UnitV, N2UnitV, N2UnitV]> { + let Latency = 8; + let NumMicroOps = 7; +} + +//===----------------------------------------------------------------------===// +// Define generic 8 micro-op types + +def N2Write_6cyc_8V : SchedWriteRes<[N2UnitV, N2UnitV, N2UnitV, N2UnitV, + N2UnitV, N2UnitV, N2UnitV, N2UnitV]> { + let Latency = 6; + let NumMicroOps = 8; +} + +def N2Write_2cyc_4L01_4V : SchedWriteRes<[N2UnitL01, N2UnitL01, N2UnitL01, + N2UnitL01, N2UnitV, N2UnitV, N2UnitV, + N2UnitV]> { + let Latency = 2; + let NumMicroOps = 8; +} + +def N2Write_5cyc_4L01_4V : SchedWriteRes<[N2UnitL01, N2UnitL01, N2UnitL01, + N2UnitL01, N2UnitV, N2UnitV, N2UnitV, + N2UnitV]> { + let Latency = 5; + let NumMicroOps = 8; +} + +def N2Write_8cyc_4L_4V : SchedWriteRes<[N2UnitL, N2UnitL, N2UnitL, N2UnitL, + N2UnitV, N2UnitV, N2UnitV, N2UnitV]> { + let Latency = 8; + let NumMicroOps = 8; +} + +def N2Write_9cyc_4L_4V : SchedWriteRes<[N2UnitL, N2UnitL, N2UnitL, N2UnitL, + N2UnitV, N2UnitV, N2UnitV, N2UnitV]> { + let Latency = 9; + let NumMicroOps = 8; +} + +//===----------------------------------------------------------------------===// +// Define generic 10 micro-op types + +def N2Write_7cyc_5L01_5V : SchedWriteRes<[N2UnitL01, N2UnitL01, N2UnitL01, + N2UnitL01, 
N2UnitL01, N2UnitV, + N2UnitV, N2UnitV, N2UnitV, N2UnitV]> { + let Latency = 7; + let NumMicroOps = 10; +} + +//===----------------------------------------------------------------------===// +// Define generic 12 micro-op types + +def N2Write_7cyc_6L01_6V : SchedWriteRes<[N2UnitL01, N2UnitL01, N2UnitL01, + N2UnitL01, N2UnitL01, N2UnitL01, + N2UnitV, N2UnitV, N2UnitV, N2UnitV, + N2UnitV, N2UnitV]> { + let Latency = 7; + let NumMicroOps = 12; +} + +//===----------------------------------------------------------------------===// +// Define generic 15 micro-op types + +def N2Write_7cyc_5L01_5S_5V : SchedWriteRes<[N2UnitL01, N2UnitL01, N2UnitL01, + N2UnitL01, N2UnitL01, N2UnitS, + N2UnitS, N2UnitS, N2UnitS, + N2UnitS, N2UnitV, N2UnitV, + N2UnitV, N2UnitV, N2UnitV]> { + let Latency = 7; + let NumMicroOps = 15; +} + +//===----------------------------------------------------------------------===// +// Define generic 18 micro-op types + +def N2Write_11cyc_9L01_9V : SchedWriteRes<[N2UnitL01, N2UnitL01, N2UnitL01, + N2UnitL01, N2UnitL01, N2UnitL01, + N2UnitL01, N2UnitL01, N2UnitL01, + N2UnitV, N2UnitV, N2UnitV, + N2UnitV, N2UnitV, N2UnitV, + N2UnitV, N2UnitV, N2UnitV]> { + let Latency = 11; + let NumMicroOps = 18; +} + +//===----------------------------------------------------------------------===// +// Define generic 27 micro-op types + +def N2Write_11cyc_9L01_9S_9V : SchedWriteRes<[N2UnitL01, N2UnitL01, N2UnitL01, + N2UnitL01, N2UnitL01, N2UnitL01, + N2UnitL01, N2UnitL01, N2UnitL01, + N2UnitS, N2UnitS, N2UnitS, + N2UnitS, N2UnitS, N2UnitS, + N2UnitS, N2UnitS, N2UnitS, + N2UnitV, N2UnitV, N2UnitV, + N2UnitV, N2UnitV, N2UnitV, + N2UnitV, N2UnitV, N2UnitV]> { + let Latency = 11; + let NumMicroOps = 27; +} + +// Miscellaneous +// ----------------------------------------------------------------------------- + +def : InstRW<[WriteI], (instrs COPY)>; + +// Branch Instructions +// ----------------------------------------------------------------------------- + +// Branch, immed +// Compare and branch +def : SchedAlias<WriteBr, N2Write_1cyc_1B>; + +// Branch, register +def : SchedAlias<WriteBrReg, N2Write_1cyc_1B>; + +// Branch and link, immed +// Branch and link, register +def : InstRW<[N2Write_1cyc_1B_1S], (instrs BL, BLR)>; + +// Arithmetic and Logical Instructions +// ----------------------------------------------------------------------------- + +// ALU, basic +// ALU, basic, flagset +def : SchedAlias<WriteI, N2Write_1cyc_1I>; + +// ALU, extend and shift +def : SchedAlias<WriteISReg, N2Write_2cyc_1M>; +def : SchedAlias<WriteIEReg, N2Write_2cyc_1M>; + +// Arithmetic, immediate to logical address tag +def : InstRW<[N2Write_2cyc_1M], (instrs ADDG, SUBG)>; + +// Convert floating-point condition flags +// Flag manipulation instructions +def : WriteRes<WriteSys, []> { let Latency = 1; } + +// Insert Random Tags +def : InstRW<[N2Write_2cyc_1M], (instrs IRG, IRGstack)>; + +// Insert Tag Mask +// Subtract Pointer +// Subtract Pointer, flagset +def : InstRW<[N2Write_1cyc_1I], (instrs GMI, SUBP, SUBPS)>; + +// Move and shift instructions +// ----------------------------------------------------------------------------- + +def : SchedAlias<WriteImm, N2Write_1cyc_1I>; + +// Divide and Multiply Instructions +// ----------------------------------------------------------------------------- + +// SDIV, UDIV +def : SchedAlias<WriteID32, N2Write_12cyc_1M0>; +def : SchedAlias<WriteID64, N2Write_20cyc_1M0>; + +def : WriteRes<WriteIM32, [N2UnitM]> { let Latency = 2; } +def : WriteRes<WriteIM64, [N2UnitM]> { let Latency = 2; } 
+ +// Multiply high +def : InstRW<[N2Write_3cyc_1M], (instrs SMULHrr, UMULHrr)>; + +// Pointer Authentication Instructions (v8.3 PAC) +// ----------------------------------------------------------------------------- + +// Authenticate data address +// Authenticate instruction address +// Compute pointer authentication code for data address +// Compute pointer authentication code, using generic key +// Compute pointer authentication code for instruction address +def : InstRW<[N2Write_5cyc_1M0], (instregex "^AUT", "^PAC")>; + +// Branch and link, register, with pointer authentication +// Branch, register, with pointer authentication +// Branch, return, with pointer authentication +def : InstRW<[N2Write_6cyc_1M0_1B], (instrs BLRAA, BLRAAZ, BLRAB, BLRABZ, BRAA, + BRAAZ, BRAB, BRABZ, RETAA, RETAB, + ERETAA, ERETAB)>; + + +// Load register, with pointer authentication +def : InstRW<[N2Write_9cyc_1M0_1L], (instregex "^LDRA[AB](indexed|writeback)")>; + +// Strip pointer authentication code +def : InstRW<[N2Write_2cyc_1M0], (instrs XPACD, XPACI, XPACLRI)>; + +// Miscellaneous data-processing instructions +// ----------------------------------------------------------------------------- + +// Bitfield extract, one reg +// Bitfield extract, two regs +// NOTE: We don't model the difference between EXTR where both operands are the +// same (one reg). +def : SchedAlias<WriteExtr, N2Write_3cyc_1I_1M>; +def : InstRW<[N2Write_3cyc_1I_1M], (instrs EXTRWrri, EXTRXrri)>; + +// Bitfield move, basic +def : SchedAlias<WriteIS, N2Write_1cyc_1I>; + +// Bitfield move, insert +def : InstRW<[N2Write_2cyc_1M], (instregex "^BFM[WX]ri$")>; + +// Load instructions +// ----------------------------------------------------------------------------- + +def : SchedAlias<WriteLD, N2Write_4cyc_1L>; +def : SchedAlias<WriteLDIdx, N2Write_4cyc_1I_1L>; + +// Load pair, signed immed offset, signed words +def : InstRW<[N2Write_5cyc_1M0, WriteLDHi], (instrs LDPSWi)>; +// Load pair, immed post-index or immed pre-index, signed words +def : InstRW<[N2Write_5cyc_1M0, WriteLDHi, WriteAdr], + (instregex "^LDPSW(post|pre)$")>; + +// Store instructions +// ----------------------------------------------------------------------------- + +def : SchedAlias<WriteST, N2Write_1cyc_1L01_1D>; +def : SchedAlias<WriteSTIdx, N2Write_1cyc_1L01_1D_1I>; +def : SchedAlias<WriteSTP, N2Write_1cyc_1L01_1D>; +def : SchedAlias<WriteAdr, N2Write_1cyc_1I>; // copied from A57. 
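The SchedAlias entries above give each generic store write class a default Neoverse N2 cost; InstRW entries elsewhere in the file override those defaults per opcode. Going by the naming convention of the generic types, the WriteSTIdx default N2Write_1cyc_1L01_1D_1I would sit among the 3 micro-op types earlier in the file with roughly the shape below, i.e. a register-indexed store spends an address-generation micro-op on L01, a store-data micro-op on D and an ALU micro-op on I. This is an inferred sketch of that earlier definition, not a line taken from the patch:

def N2Write_1cyc_1L01_1D_1I : SchedWriteRes<[N2UnitL01, N2UnitD, N2UnitI]> {
  let Latency = 1;      // store micro-ops complete quickly; latency is nominal
  let NumMicroOps = 3;  // L01 (address) + D (data) + I (index ALU), inferred
}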
+ +// Tag load instructions +// ----------------------------------------------------------------------------- + +// Load allocation tag +// Load multiple allocation tags +def : InstRW<[N2Write_4cyc_1L], (instrs LDG, LDGM)>; + +// Tag store instructions +// ----------------------------------------------------------------------------- + +// Store allocation tags to one or two granules, post-index +// Store allocation tags to one or two granules, pre-index +// Store allocation tag to one or two granules, zeroing, post-index +// Store Allocation Tag to one or two granules, zeroing, pre-index +// Store allocation tag and reg pair to memory, post-Index +// Store allocation tag and reg pair to memory, pre-Index +def : InstRW<[N2Write_1cyc_1L01_1D_1I], (instrs STGPreIndex, STGPostIndex, + ST2GPreIndex, ST2GPostIndex, + STZGPreIndex, STZGPostIndex, + STZ2GPreIndex, STZ2GPostIndex, + STGPpre, STGPpost)>; + +// Store allocation tags to one or two granules, signed offset +// Store allocation tag to two granules, zeroing, signed offset +// Store allocation tag and reg pair to memory, signed offset +// Store multiple allocation tags +def : InstRW<[N2Write_1cyc_1L01_1D], (instrs STGOffset, ST2GOffset, STZGOffset, + STZ2GOffset, STGPi, STGM, STZGM)>; + +// FP data processing instructions +// ----------------------------------------------------------------------------- + +// FP absolute value +// FP arithmetic +// FP min/max +// FP negate +// FP select +def : SchedAlias<WriteF, N2Write_2cyc_1V>; + +// FP compare +def : SchedAlias<WriteFCmp, N2Write_2cyc_1V0>; + +// FP divide, square root +def : SchedAlias<WriteFDiv, N2Write_7cyc_1V0>; + +// FP divide, H-form +def : InstRW<[N2Write_7cyc_1V0], (instrs FDIVHrr)>; +// FP divide, S-form +def : InstRW<[N2Write_10cyc_1V0], (instrs FDIVSrr)>; +// FP divide, D-form +def : InstRW<[N2Write_15cyc_1V0], (instrs FDIVDrr)>; + +// FP square root, H-form +def : InstRW<[N2Write_7cyc_1V0], (instrs FSQRTHr)>; +// FP square root, S-form +def : InstRW<[N2Write_9cyc_1V0], (instrs FSQRTSr)>; +// FP square root, D-form +def : InstRW<[N2Write_16cyc_1V0], (instrs FSQRTDr)>; + +// FP multiply +def : WriteRes<WriteFMul, [N2UnitV]> { let Latency = 3; } + +// FP multiply accumulate +def : InstRW<[N2Write_4cyc_1V], (instregex "^FN?M(ADD|SUB)[HSD]rrr$")>; + +// FP round to integral +def : InstRW<[N2Write_3cyc_1V0], (instregex "^FRINT[AIMNPXZ][HSD]r$", + "^FRINT(32|64)[XZ][SD]r$")>; + +// FP miscellaneous instructions +// ----------------------------------------------------------------------------- + +// FP convert, from gen to vec reg +def : InstRW<[N2Write_3cyc_1M0], (instregex "^[SU]CVTF[SU][WX][HSD]ri$")>; + +// FP convert, from vec to gen reg +def : InstRW<[N2Write_3cyc_1V], (instregex "^FCVT[AMNPZ][SU][SU][WX][HSD]r$")>; + +// FP convert, Javascript from vec to gen reg +// FP convert, from vec to vec reg +def : SchedAlias<WriteFCvt, N2Write_3cyc_1V0>; + +// FP move, immed +// FP move, register +def : SchedAlias<WriteFImm, N2Write_2cyc_1V>; + +// FP transfer, from gen to low half of vec reg +def : InstRW<[N2Write_3cyc_1M0], (instrs FMOVWHr, FMOVXHr, FMOVWSr, FMOVXDr, + FMOVHWr, FMOVHXr, FMOVSWr, FMOVDXr)>; + +// FP transfer, from gen to high half of vec reg +def : InstRW<[N2Write_5cyc_1M0_1V], (instrs FMOVXDHighr)>; + +// FP transfer, from vec to gen reg +def : SchedAlias<WriteFCopy, N2Write_2cyc_1V>; + +// FP load instructions +// ----------------------------------------------------------------------------- + +// Load vector reg, literal, S/D/Q forms +// Load vector reg, unscaled 
immed +def : InstRW<[N2Write_6cyc_1L], (instregex "^LDR[SDQ]l$", + "^LDUR[BHSDQ]i$")>; + +// Load vector reg, immed post-index +def : InstRW<[N2Write_6cyc_1I_1L, WriteI], (instregex "^LDR[BHSDQ]post$")>; +// Load vector reg, immed pre-index +def : InstRW<[N2Write_6cyc_1I_1L, WriteAdr], (instregex "^LDR[BHSDQ]pre$")>; + +// Load vector reg, unsigned immed +def : InstRW<[N2Write_6cyc_1L], (instregex "^LDR[BHSDQ]ui$")>; + +// Load vector reg, register offset, basic +// Load vector reg, register offset, scale, S/D-form +// Load vector reg, register offset, extend +// Load vector reg, register offset, extend, scale, S/D-form +def : InstRW<[N2Write_6cyc_1L, ReadAdrBase], (instregex "^LDR[BSD]ro[WX]$")>; + +// Load vector reg, register offset, scale, H/Q-form +// Load vector reg, register offset, extend, scale, H/Q-form +def : InstRW<[N2Write_7cyc_1I_1L, ReadAdrBase], (instregex "^LDR[HQ]ro[WX]$")>; + +// Load vector pair, immed offset, S/D-form +def : InstRW<[N2Write_6cyc_1L, WriteLDHi], (instregex "^LDN?P[SD]i$")>; + +// Load vector pair, immed offset, Q-form +def : InstRW<[N2Write_6cyc_2L, WriteLDHi], (instrs LDPQi, LDNPQi)>; + +// Load vector pair, immed post-index, S/D-form +// Load vector pair, immed pre-index, S/D-form +def : InstRW<[N2Write_6cyc_1I_1L, WriteLDHi, WriteAdr], + (instregex "^LDP[SD](pre|post)$")>; + +// Load vector pair, immed post-index, Q-form +// Load vector pair, immed pre-index, Q-form +def : InstRW<[N2Write_6cyc_2I_2L, WriteLDHi, WriteAdr], (instrs LDPQpost, + LDPQpre)>; + +// FP store instructions +// ----------------------------------------------------------------------------- + +// Store vector reg, unscaled immed, B/H/S/D-form +// Store vector reg, unscaled immed, Q-form +def : InstRW<[N2Write_2cyc_1L01_1V], (instregex "^STUR[BHSDQ]i$")>; + +// Store vector reg, immed post-index, B/H/S/D-form +// Store vector reg, immed post-index, Q-form +// Store vector reg, immed pre-index, B/H/S/D-form +// Store vector reg, immed pre-index, Q-form +def : InstRW<[WriteAdr, N2Write_2cyc_1L01_1V_1I, ReadAdrBase], + (instregex "^STR[BHSDQ](pre|post)$")>; + +// Store vector reg, unsigned immed, B/H/S/D-form +// Store vector reg, unsigned immed, Q-form +def : InstRW<[N2Write_2cyc_1L01_1V], (instregex "^STR[BHSDQ]ui$")>; + +// Store vector reg, register offset, basic, B/H/S/D-form +// Store vector reg, register offset, basic, Q-form +// Store vector reg, register offset, scale, S/D-form +// Store vector reg, register offset, extend, B/H/S/D-form +// Store vector reg, register offset, extend, Q-form +// Store vector reg, register offset, extend, scale, S/D-form +def : InstRW<[N2Write_2cyc_1L01_1V, ReadAdrBase], + (instregex "^STR[BSD]ro[WX]$")>; + +// Store vector reg, register offset, scale, H-form +// Store vector reg, register offset, scale, Q-form +// Store vector reg, register offset, extend, scale, H-form +// Store vector reg, register offset, extend, scale, Q-form +def : InstRW<[N2Write_2cyc_1L01_1V, ReadAdrBase], + (instregex "^STR[HQ]ro[WX]$")>; + +// Store vector pair, immed offset, S-form +// Store vector pair, immed offset, D-form +def : InstRW<[N2Write_2cyc_1L01_1V], (instregex "^STN?P[SD]i$")>; + +// Store vector pair, immed offset, Q-form +def : InstRW<[N2Write_2cyc_1L01_2V], (instrs STPQi, STNPQi)>; + +// Store vector pair, immed post-index, S-form +// Store vector pair, immed post-index, D-form +// Store vector pair, immed pre-index, S-form +// Store vector pair, immed pre-index, D-form +def : InstRW<[WriteAdr, N2Write_2cyc_1L01_1V_1I], + (instregex 
"^STP[SD](pre|post)$")>; + +// Store vector pair, immed post-index, Q-form +def : InstRW<[N2Write_2cyc_1L01_2V_1I], (instrs STPQpost)>; + +// Store vector pair, immed pre-index, Q-form +def : InstRW<[N2Write_2cyc_1L01_2V_2I], (instrs STPQpre)>; + +// ASIMD integer instructions +// ----------------------------------------------------------------------------- + +// ASIMD absolute diff +// ASIMD absolute diff long +// ASIMD arith, basic +// ASIMD arith, complex +// ASIMD arith, pair-wise +// ASIMD compare +// ASIMD logical +// ASIMD max/min, basic and pair-wise +def : SchedAlias<WriteVd, N2Write_2cyc_1V>; +def : SchedAlias<WriteVq, N2Write_2cyc_1V>; + +// ASIMD absolute diff accum +// ASIMD absolute diff accum long +def : InstRW<[N2Write_4cyc_1V1], + (instregex "^SABAv", "^UABAv", "^SABALv", "^UABALv")>; + +// ASIMD arith, reduce, 4H/4S +def : InstRW<[N2Write_2cyc_1V1], (instregex "^(ADDV|[SU]ADDLV)v4(i16|i32)v$")>; + +// ASIMD arith, reduce, 8B/8H +def : InstRW<[N2Write_4cyc_1V1_1V], + (instregex "^(ADDV|[SU]ADDLV)v8(i8|i16)v$")>; + +// ASIMD arith, reduce, 16B +def : InstRW<[N2Write_4cyc_1V1], (instrs ADDVv16i8v, SADDLVv16i8v, + UADDLVv16i8v)>; + +// ASIMD dot product +// ASIMD dot product using signed and unsigned integers +def : InstRW<[N2Write_3cyc_1V], + (instregex "^([SU]|SU|US)DOT(lane)?(v8|v16)i8$")>; + +// ASIMD matrix multiply-accumulate +def : InstRW<[N2Write_3cyc_1V], (instrs SMMLA, UMMLA, USMMLA)>; + +// ASIMD max/min, reduce, 4H/4S +def : InstRW<[N2Write_2cyc_1V1], (instregex "^[SU](MAX|MIN)Vv4i16v$", + "^[SU](MAX|MIN)Vv4i32v$")>; + +// ASIMD max/min, reduce, 8B/8H +def : InstRW<[N2Write_4cyc_1V1_1V], (instregex "^[SU](MAX|MIN)Vv8i8v$", + "^[SU](MAX|MIN)Vv8i16v$")>; + +// ASIMD max/min, reduce, 16B +def : InstRW<[N2Write_4cyc_2V1], (instregex "[SU](MAX|MIN)Vv16i8v$")>; + +// ASIMD multiply +def : InstRW<[N2Write_4cyc_1V0], (instregex "^MULv", "^SQ(R)?DMULHv")>; + +// ASIMD multiply accumulate +def : InstRW<[N2Write_4cyc_1V0], (instregex "^MLAv", "^MLSv")>; + +// ASIMD multiply accumulate high +def : InstRW<[N2Write_4cyc_1V0], (instregex "^SQRDMLAHv", "^SQRDMLSHv")>; + +// ASIMD multiply accumulate long +def : InstRW<[N2Write_4cyc_1V0], (instregex "^[SU]MLALv", "^[SU]MLSLv")>; + +// ASIMD multiply accumulate saturating long +def : InstRW<[N2Write_4cyc_1V0], (instregex "^SQDMLALv", "^SQDMLSLv")>; + +// ASIMD multiply/multiply long (8x8) polynomial, D-form +// ASIMD multiply/multiply long (8x8) polynomial, Q-form +def : InstRW<[N2Write_3cyc_1V0], (instregex "^PMULL?(v8i8|v16i8)$")>; + +// ASIMD multiply long +def : InstRW<[N2Write_3cyc_1V], (instregex "^[SU]MULLv", "^SQDMULLv")>; + +// ASIMD pairwise add and accumulate long +def : InstRW<[N2Write_4cyc_1V1], (instregex "^[SU]ADALPv")>; + +// ASIMD shift accumulate +def : InstRW<[N2Write_4cyc_1V1], (instregex "^[SU]SRAv", "^[SU]RSRAv")>; + +// ASIMD shift by immed, basic +def : InstRW<[N2Write_2cyc_1V1], (instregex "^SHLv", "^SHLLv", "^SHRNv", + "^SSHLLv", "^SSHRv", "^USHLLv", + "^USHRv")>; + +// ASIMD shift by immed and insert, basic +def : InstRW<[N2Write_2cyc_1V1], (instregex "^SLIv", "^SRIv")>; + +// ASIMD shift by immed, complex +def : InstRW<[N2Write_4cyc_1V1], + (instregex "^RSHRNv", "^SQRSHRNv", "^SQRSHRUNv", + "^(SQSHLU?|UQSHL)[bhsd]$", + "^(SQSHLU?|UQSHL)(v8i8|v16i8|v4i16|v8i16|v2i32|v4i32|v2i64)_shift$", + "^SQSHRNv", "^SQSHRUNv", "^SRSHRv", "^UQRSHRNv", + "^UQSHRNv", "^URSHRv")>; + +// ASIMD shift by register, basic +def : InstRW<[N2Write_2cyc_1V1], (instregex "^[SU]SHLv")>; + +// ASIMD shift by register, complex +def : 
InstRW<[N2Write_4cyc_1V1],
+ (instregex "^[SU]RSHLv", "^[SU]QRSHLv",
+ "^[SU]QSHL(v1i8|v1i16|v1i32|v1i64|v8i8|v16i8|v4i16|v8i16|v2i32|v4i32|v2i64)$")>;
+
+// ASIMD floating-point instructions
+// -----------------------------------------------------------------------------
+
+// ASIMD FP absolute value/difference
+// ASIMD FP arith, normal
+// ASIMD FP compare
+// ASIMD FP complex add
+// ASIMD FP max/min, normal
+// ASIMD FP max/min, pairwise
+// ASIMD FP negate
+// Handled by SchedAlias<WriteV[dq], ...>
+
+// ASIMD FP complex multiply add
+def : InstRW<[N2Write_4cyc_1V], (instregex "^FCMLAv")>;
+
+// ASIMD FP convert, long (F16 to F32)
+def : InstRW<[N2Write_4cyc_2V0], (instregex "^FCVTL(v4|v8)i16")>;
+
+// ASIMD FP convert, long (F32 to F64)
+def : InstRW<[N2Write_3cyc_1V0], (instregex "^FCVTL(v2|v4)i32")>;
+
+// ASIMD FP convert, narrow (F32 to F16)
+def : InstRW<[N2Write_4cyc_2V0], (instregex "^FCVTN(v4|v8)i16")>;
+
+// ASIMD FP convert, narrow (F64 to F32)
+def : InstRW<[N2Write_3cyc_1V0], (instregex "^FCVTN(v2|v4)i32",
+ "^FCVTXN(v2|v4)f32")>;
+
+// ASIMD FP convert, other, D-form F32 and Q-form F64
+def : InstRW<[N2Write_3cyc_1V0], (instregex "^[FSU]CVT[AMNPZ][SU]v2f(32|64)$",
+ "^[SU]CVTFv2f(32|64)$")>;
+
+// ASIMD FP convert, other, D-form F16 and Q-form F32
+def : InstRW<[N2Write_4cyc_2V0], (instregex "^[FSU]CVT[AMNPZ][SU]v4f(16|32)$",
+ "^[SU]CVTFv4f(16|32)$")>;
+
+// ASIMD FP convert, other, Q-form F16
+def : InstRW<[N2Write_6cyc_4V0], (instregex "^[FSU]CVT[AMNPZ][SU]v8f16$",
+ "^[SU]CVTFv8f16$")>;
+
+// ASIMD FP divide, D-form, F16
+def : InstRW<[N2Write_7cyc_1V0], (instrs FDIVv4f16)>;
+
+// ASIMD FP divide, D-form, F32
+def : InstRW<[N2Write_10cyc_2V0], (instrs FDIVv2f32)>;
+
+// ASIMD FP divide, Q-form, F16
+def : InstRW<[N2Write_13cyc_2V0], (instrs FDIVv8f16)>;
+
+// ASIMD FP divide, Q-form, F32
+def : InstRW<[N2Write_10cyc_2V0], (instrs FDIVv4f32)>;
+
+// ASIMD FP divide, Q-form, F64
+def : InstRW<[N2Write_15cyc_2V0], (instrs FDIVv2f64)>;
+
+// ASIMD FP max/min, reduce, F32 and D-form F16
+def : InstRW<[N2Write_4cyc_1V], (instregex "^(FMAX|FMIN)(NM)?Vv4(i16|i32)v$")>;
+
+// ASIMD FP max/min, reduce, Q-form F16
+def : InstRW<[N2Write_6cyc_2V], (instregex "^(FMAX|FMIN)(NM)?Vv8i16v$")>;
+
+// ASIMD FP multiply
+def : InstRW<[N2Write_3cyc_1V], (instregex "^FMULv", "^FMULXv")>;
+
+// ASIMD FP multiply accumulate
+def : InstRW<[N2Write_4cyc_1V], (instregex "^FMLAv", "^FMLSv")>;
+
+// ASIMD FP multiply accumulate long
+def : InstRW<[N2Write_5cyc_1V], (instregex "^FMLALv", "^FMLSLv")>;
+
+// ASIMD FP round, D-form F32 and Q-form F64
+def : InstRW<[N2Write_3cyc_1V0],
+ (instregex "^FRINT[AIMNPXZ]v2f(32|64)$",
+ "^FRINT(32|64)[XZ]v2f(32|64)$")>;
+
+// ASIMD FP round, D-form F16 and Q-form F32
+def : InstRW<[N2Write_4cyc_2V0],
+ (instregex "^FRINT[AIMNPXZ]v4f(16|32)$",
+ "^FRINT(32|64)[XZ]v4f32$")>;
+
+// ASIMD FP round, Q-form F16
+def : InstRW<[N2Write_6cyc_4V0], (instregex "^FRINT[AIMNPXZ]v8f16$")>;
+
+// ASIMD FP square root, D-form, F16
+def : InstRW<[N2Write_7cyc_1V0], (instrs FSQRTv4f16)>;
+
+// ASIMD FP square root, D-form, F32
+def : InstRW<[N2Write_10cyc_2V0], (instrs FSQRTv2f32)>;
+
+// ASIMD FP square root, Q-form, F16
+def : InstRW<[N2Write_13cyc_2V0], (instrs FSQRTv8f16)>;
+
+// ASIMD FP square root, Q-form, F32
+def : InstRW<[N2Write_10cyc_2V0], (instrs FSQRTv4f32)>;
+
+// ASIMD FP square root, Q-form, F64
+def : InstRW<[N2Write_16cyc_2V0], (instrs FSQRTv2f64)>;
+
+// ASIMD BFloat16 (BF16) instructions
+// 
----------------------------------------------------------------------------- + +// ASIMD convert, F32 to BF16 +def : InstRW<[N2Write_4cyc_1V0], (instrs BFCVTN, BFCVTN2)>; + +// ASIMD dot product +def : InstRW<[N2Write_4cyc_1V], (instrs BFDOTv4bf16, BFDOTv8bf16)>; + +// ASIMD matrix multiply accumulate +def : InstRW<[N2Write_5cyc_1V], (instrs BFMMLA)>; + +// ASIMD multiply accumulate long +def : InstRW<[N2Write_4cyc_1V], (instrs BFMLALB, BFMLALBIdx, BFMLALT, + BFMLALTIdx)>; + +// Scalar convert, F32 to BF16 +def : InstRW<[N2Write_3cyc_1V0], (instrs BFCVT)>; + +// ASIMD miscellaneous instructions +// ----------------------------------------------------------------------------- + +// ASIMD bit reverse +// ASIMD bitwise insert +// ASIMD count +// ASIMD duplicate, element +// ASIMD extract +// ASIMD extract narrow +// ASIMD insert, element to element +// ASIMD move, FP immed +// ASIMD move, integer immed +// ASIMD reverse +// ASIMD table lookup, 1 or 2 table regs +// ASIMD table lookup extension, 1 table reg +// ASIMD transfer, element to gen reg +// ASIMD transpose +// ASIMD unzip/zip +// Handled by SchedAlias<WriteV[dq], ...> + +// ASIMD duplicate, gen reg +def : InstRW<[N2Write_3cyc_1M0], (instregex "^DUPv.+gpr")>; + +// ASIMD extract narrow, saturating +def : InstRW<[N2Write_4cyc_1V1], (instregex "^[SU]QXTNv", "^SQXTUNv")>; + +// ASIMD reciprocal and square root estimate, D-form U32 +def : InstRW<[N2Write_3cyc_1V0], (instrs URECPEv2i32, URSQRTEv2i32)>; + +// ASIMD reciprocal and square root estimate, Q-form U32 +def : InstRW<[N2Write_4cyc_2V0], (instrs URECPEv4i32, URSQRTEv4i32)>; + +// ASIMD reciprocal and square root estimate, D-form F32 and scalar forms +def : InstRW<[N2Write_3cyc_1V0], (instrs FRECPEv1f16, FRECPEv1i32, + FRECPEv1i64, FRECPEv2f32, + FRSQRTEv1f16, FRSQRTEv1i32, + FRSQRTEv1i64, FRSQRTEv2f32)>; + +// ASIMD reciprocal and square root estimate, D-form F16 and Q-form F32 +def : InstRW<[N2Write_4cyc_2V0], (instrs FRECPEv4f16, FRECPEv4f32, + FRSQRTEv4f16, FRSQRTEv4f32)>; + +// ASIMD reciprocal and square root estimate, Q-form F16 +def : InstRW<[N2Write_6cyc_4V0], (instrs FRECPEv8f16, FRSQRTEv8f16)>; + +// ASIMD reciprocal exponent +def : InstRW<[N2Write_3cyc_1V0], (instregex "^FRECPXv")>; + +// ASIMD reciprocal step +def : InstRW<[N2Write_4cyc_1V], (instregex "^FRECPSv", "^FRSQRTSv")>; + +// ASIMD table lookup, 3 table regs +def : InstRW<[N2Write_4cyc_2V], (instrs TBLv8i8Three, TBLv16i8Three)>; + +// ASIMD table lookup, 4 table regs +def : InstRW<[N2Write_4cyc_4V], (instrs TBLv8i8Four, TBLv16i8Four)>; + +// ASIMD table lookup extension, 2 table reg +def : InstRW<[N2Write_4cyc_2V], (instrs TBXv8i8Two, TBXv16i8Two)>; + +// ASIMD table lookup extension, 3 table reg +def : InstRW<[N2Write_6cyc_4V], (instrs TBXv8i8Three, TBXv16i8Three)>; + +// ASIMD table lookup extension, 4 table reg +def : InstRW<[N2Write_6cyc_8V], (instrs TBXv8i8Four, TBXv16i8Four)>; + +// ASIMD transfer, gen reg to element +def : InstRW<[N2Write_5cyc_1M0_1V], (instregex "^INSv")>; + +// ASIMD load instructions +// ----------------------------------------------------------------------------- + +// ASIMD load, 1 element, multiple, 1 reg, D-form +def : InstRW<[N2Write_6cyc_1L], (instregex "^LD1Onev(8b|4h|2s|1d)$")>; +def : InstRW<[N2Write_6cyc_1L, WriteAdr], + (instregex "^LD1Onev(8b|4h|2s|1d)_POST$")>; + +// ASIMD load, 1 element, multiple, 1 reg, Q-form +def : InstRW<[N2Write_6cyc_1L], (instregex "^LD1Onev(16b|8h|4s|2d)$")>; +def : InstRW<[N2Write_6cyc_1L, WriteAdr], + (instregex "^LD1Onev(16b|8h|4s|2d)_POST$")>; 
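Throughout the ASIMD load section each addressing variant appears twice: the base opcode, and a _POST (post-indexed) twin that appends WriteAdr for the base-register writeback. Unrolled for a single opcode pair, the regex entries just above are equivalent to the following; it is spelled as a comment because the regexes already cover these opcodes:

// def : InstRW<[N2Write_6cyc_1L], (instrs LD1Onev8b)>;
// def : InstRW<[N2Write_6cyc_1L, WriteAdr], (instrs LD1Onev8b_POST)>;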
+ +// ASIMD load, 1 element, multiple, 2 reg, D-form +def : InstRW<[N2Write_6cyc_2L], (instregex "^LD1Twov(8b|4h|2s|1d)$")>; +def : InstRW<[N2Write_6cyc_2L, WriteAdr], + (instregex "^LD1Twov(8b|4h|2s|1d)_POST$")>; + +// ASIMD load, 1 element, multiple, 2 reg, Q-form +def : InstRW<[N2Write_6cyc_2L], (instregex "^LD1Twov(16b|8h|4s|2d)$")>; +def : InstRW<[N2Write_6cyc_2L, WriteAdr], + (instregex "^LD1Twov(16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 1 element, multiple, 3 reg, D-form +def : InstRW<[N2Write_6cyc_3L], (instregex "^LD1Threev(8b|4h|2s|1d)$")>; +def : InstRW<[N2Write_6cyc_3L, WriteAdr], + (instregex "^LD1Threev(8b|4h|2s|1d)_POST$")>; + +// ASIMD load, 1 element, multiple, 3 reg, Q-form +def : InstRW<[N2Write_6cyc_3L], (instregex "^LD1Threev(16b|8h|4s|2d)$")>; +def : InstRW<[N2Write_6cyc_3L, WriteAdr], + (instregex "^LD1Threev(16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 1 element, multiple, 4 reg, D-form +def : InstRW<[N2Write_7cyc_4L], (instregex "^LD1Fourv(8b|4h|2s|1d)$")>; +def : InstRW<[N2Write_7cyc_4L, WriteAdr], + (instregex "^LD1Fourv(8b|4h|2s|1d)_POST$")>; + +// ASIMD load, 1 element, multiple, 4 reg, Q-form +def : InstRW<[N2Write_7cyc_4L], (instregex "^LD1Fourv(16b|8h|4s|2d)$")>; +def : InstRW<[N2Write_7cyc_4L, WriteAdr], + (instregex "^LD1Fourv(16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 1 element, one lane, B/H/S +// ASIMD load, 1 element, one lane, D +def : InstRW<[N2Write_8cyc_1L_1V], (instregex "LD1i(8|16|32|64)$")>; +def : InstRW<[N2Write_8cyc_1L_1V, WriteAdr], (instregex "LD1i(8|16|32|64)_POST$")>; + +// ASIMD load, 1 element, all lanes, D-form, B/H/S +// ASIMD load, 1 element, all lanes, D-form, D +def : InstRW<[N2Write_8cyc_1L_1V], (instregex "LD1Rv(8b|4h|2s|1d)$")>; +def : InstRW<[N2Write_8cyc_1L_1V, WriteAdr], (instregex "LD1Rv(8b|4h|2s|1d)_POST$")>; + +// ASIMD load, 1 element, all lanes, Q-form +def : InstRW<[N2Write_8cyc_1L_1V], (instregex "LD1Rv(16b|8h|4s|2d)$")>; +def : InstRW<[N2Write_8cyc_1L_1V, WriteAdr], (instregex "LD1Rv(16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 2 element, multiple, D-form, B/H/S +def : InstRW<[N2Write_8cyc_1L_2V], (instregex "LD2Twov(8b|4h|2s)$")>; +def : InstRW<[N2Write_8cyc_1L_2V, WriteAdr], (instregex "LD2Twov(8b|4h|2s)_POST$")>; + +// ASIMD load, 2 element, multiple, Q-form, B/H/S +// ASIMD load, 2 element, multiple, Q-form, D +def : InstRW<[N2Write_8cyc_2L_2V], (instregex "LD2Twov(16b|8h|4s|2d)$")>; +def : InstRW<[N2Write_8cyc_2L_2V, WriteAdr], (instregex "LD2Twov(16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 2 element, one lane, B/H +// ASIMD load, 2 element, one lane, S +// ASIMD load, 2 element, one lane, D +def : InstRW<[N2Write_8cyc_1L_2V], (instregex "LD2i(8|16|32|64)$")>; +def : InstRW<[N2Write_8cyc_1L_2V, WriteAdr], (instregex "LD2i(8|16|32|64)_POST$")>; + +// ASIMD load, 2 element, all lanes, D-form, B/H/S +// ASIMD load, 2 element, all lanes, D-form, D +def : InstRW<[N2Write_8cyc_1L_2V], (instregex "LD2Rv(8b|4h|2s|1d)$")>; +def : InstRW<[N2Write_8cyc_1L_2V, WriteAdr], (instregex "LD2Rv(8b|4h|2s|1d)_POST$")>; + +// ASIMD load, 2 element, all lanes, Q-form +def : InstRW<[N2Write_8cyc_1L_2V], (instregex "LD2Rv(16b|8h|4s|2d)$")>; +def : InstRW<[N2Write_8cyc_1L_2V, WriteAdr], (instregex "LD2Rv(16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 3 element, multiple, D-form, B/H/S +def : InstRW<[N2Write_8cyc_2L_3V], (instregex "LD3Threev(8b|4h|2s)$")>; +def : InstRW<[N2Write_8cyc_2L_3V, WriteAdr], (instregex "LD3Threev(8b|4h|2s)_POST$")>; + +// ASIMD load, 3 element, multiple, Q-form, B/H/S +def : InstRW<[N2Write_8cyc_3L_3V], (instregex 
"LD3Threev(16b|8h|4s)$")>; +def : InstRW<[N2Write_8cyc_3L_3V, WriteAdr], (instregex "LD3Threev(16b|8h|4s)_POST$")>; + +// ASIMD load, 3 element, multiple, Q-form, D +def : InstRW<[N2Write_8cyc_3L_3V], (instregex "LD3Threev(2d)$")>; +def : InstRW<[N2Write_8cyc_3L_3V, WriteAdr], (instregex "LD3Threev(2d)_POST$")>; + +// ASIMD load, 3 element, one lane, B/H +// ASIMD load, 3 element, one lane, S +// ASIMD load, 3 element, one lane, D +def : InstRW<[N2Write_8cyc_2L_3V], (instregex "LD3i(8|16|32|64)$")>; +def : InstRW<[N2Write_8cyc_2L_3V, WriteAdr], (instregex "LD3i(8|16|32|64)_POST$")>; + +// ASIMD load, 3 element, all lanes, D-form, B/H/S +// ASIMD load, 3 element, all lanes, D-form, D +def : InstRW<[N2Write_8cyc_2L_3V], (instregex "LD3Rv(8b|4h|2s|1d)$")>; +def : InstRW<[N2Write_8cyc_2L_3V, WriteAdr], (instregex "LD3Rv(8b|4h|2s|1d)_POST$")>; + +// ASIMD load, 3 element, all lanes, Q-form, B/H/S +// ASIMD load, 3 element, all lanes, Q-form, D +def : InstRW<[N2Write_8cyc_3L_3V], (instregex "LD3Rv(16b|8h|4s|2d)$")>; +def : InstRW<[N2Write_8cyc_3L_3V, WriteAdr], (instregex "LD3Rv(16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 4 element, multiple, D-form, B/H/S +def : InstRW<[N2Write_8cyc_3L_4V], (instregex "LD4Fourv(8b|4h|2s)$")>; +def : InstRW<[N2Write_8cyc_3L_4V, WriteAdr], (instregex "LD4Fourv(8b|4h|2s)_POST$")>; + +// ASIMD load, 4 element, multiple, Q-form, B/H/S +// ASIMD load, 4 element, multiple, Q-form, D +def : InstRW<[N2Write_9cyc_4L_4V], (instregex "LD4Fourv(16b|8h|4s|2d)$")>; +def : InstRW<[N2Write_9cyc_4L_4V, WriteAdr], (instregex "LD4Fourv(16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 4 element, one lane, B/H +// ASIMD load, 4 element, one lane, S +// ASIMD load, 4 element, one lane, D +def : InstRW<[N2Write_8cyc_3L_4V], (instregex "LD4i(8|16|32|64)$")>; +def : InstRW<[N2Write_8cyc_3L_4V, WriteAdr], (instregex "LD4i(8|16|32|64)_POST$")>; + +// ASIMD load, 4 element, all lanes, D-form, B/H/S +// ASIMD load, 4 element, all lanes, D-form, D +def : InstRW<[N2Write_8cyc_3L_4V], (instregex "LD4Rv(8b|4h|2s|1d)$")>; +def : InstRW<[N2Write_8cyc_3L_4V, WriteAdr], (instregex "LD4Rv(8b|4h|2s|1d)_POST$")>; + +// ASIMD load, 4 element, all lanes, Q-form, B/H/S +// ASIMD load, 4 element, all lanes, Q-form, D +def : InstRW<[N2Write_8cyc_4L_4V], (instregex "LD4Rv(16b|8h|4s|2d)$")>; +def : InstRW<[N2Write_8cyc_4L_4V, WriteAdr], (instregex "LD4Rv(16b|8h|4s|2d)_POST$")>; + +// ASIMD store instructions +// ----------------------------------------------------------------------------- + +// ASIMD store, 1 element, multiple, 1 reg, D-form +def : InstRW<[N2Write_2cyc_1L01_1V], (instregex "ST1Onev(8b|4h|2s|1d)$")>; +def : InstRW<[N2Write_2cyc_1L01_1V, WriteAdr], (instregex "ST1Onev(8b|4h|2s|1d)_POST$")>; + +// ASIMD store, 1 element, multiple, 1 reg, Q-form +def : InstRW<[N2Write_2cyc_1L01_1V], (instregex "ST1Onev(16b|8h|4s|2d)$")>; +def : InstRW<[N2Write_2cyc_1L01_1V, WriteAdr], (instregex "ST1Onev(16b|8h|4s|2d)_POST$")>; + +// ASIMD store, 1 element, multiple, 2 reg, D-form +def : InstRW<[N2Write_2cyc_1L01_1V], (instregex "ST1Twov(8b|4h|2s|1d)$")>; +def : InstRW<[N2Write_2cyc_1L01_1V, WriteAdr], (instregex "ST1Twov(8b|4h|2s|1d)_POST$")>; + +// ASIMD store, 1 element, multiple, 2 reg, Q-form +def : InstRW<[N2Write_2cyc_2L01_2V], (instregex "ST1Twov(16b|8h|4s|2d)$")>; +def : InstRW<[N2Write_2cyc_2L01_2V, WriteAdr], (instregex "ST1Twov(16b|8h|4s|2d)_POST$")>; + +// ASIMD store, 1 element, multiple, 3 reg, D-form +def : InstRW<[N2Write_2cyc_2L01_2V], (instregex "ST1Threev(8b|4h|2s|1d)$")>; +def : 
InstRW<[N2Write_2cyc_2L01_2V, WriteAdr], (instregex "ST1Threev(8b|4h|2s|1d)_POST$")>; + +// ASIMD store, 1 element, multiple, 3 reg, Q-form +def : InstRW<[N2Write_2cyc_3L01_3V], (instregex "ST1Threev(16b|8h|4s|2d)$")>; +def : InstRW<[N2Write_2cyc_3L01_3V, WriteAdr], (instregex "ST1Threev(16b|8h|4s|2d)_POST$")>; + +// ASIMD store, 1 element, multiple, 4 reg, D-form +def : InstRW<[N2Write_2cyc_2L01_2V], (instregex "ST1Fourv(8b|4h|2s|1d)$")>; +def : InstRW<[N2Write_2cyc_2L01_2V, WriteAdr], (instregex "ST1Fourv(8b|4h|2s|1d)_POST$")>; + +// ASIMD store, 1 element, multiple, 4 reg, Q-form +def : InstRW<[N2Write_2cyc_4L01_4V], (instregex "ST1Fourv(16b|8h|4s|2d)$")>; +def : InstRW<[N2Write_2cyc_4L01_4V, WriteAdr], (instregex "ST1Fourv(16b|8h|4s|2d)_POST$")>; + +// ASIMD store, 1 element, one lane, B/H/S +// ASIMD store, 1 element, one lane, D +def : InstRW<[N2Write_4cyc_1L01_1V], (instregex "ST1i(8|16|32|64)$")>; +def : InstRW<[N2Write_4cyc_1L01_1V, WriteAdr], (instregex "ST1i(8|16|32|64)_POST$")>; + +// ASIMD store, 2 element, multiple, D-form, B/H/S +def : InstRW<[N2Write_4cyc_1L01_1V], (instregex "ST2Twov(8b|4h|2s)$")>; +def : InstRW<[N2Write_4cyc_1L01_1V, WriteAdr], (instregex "ST2Twov(8b|4h|2s)_POST$")>; + +// ASIMD store, 2 element, multiple, Q-form, B/H/S +// ASIMD store, 2 element, multiple, Q-form, D +def : InstRW<[N2Write_4cyc_2L01_2V], (instregex "ST2Twov(16b|8h|4s|2d)$")>; +def : InstRW<[N2Write_4cyc_2L01_2V, WriteAdr], (instregex "ST2Twov(16b|8h|4s|2d)_POST$")>; + +// ASIMD store, 2 element, one lane, B/H/S +// ASIMD store, 2 element, one lane, D +def : InstRW<[N2Write_4cyc_1L01_1V], (instregex "ST2i(8|16|32|64)$")>; +def : InstRW<[N2Write_4cyc_1L01_1V, WriteAdr], (instregex "ST2i(8|16|32|64)_POST$")>; + +// ASIMD store, 3 element, multiple, D-form, B/H/S +def : InstRW<[N2Write_5cyc_2L01_2V], (instregex "ST3Threev(8b|4h|2s)$")>; +def : InstRW<[N2Write_5cyc_2L01_2V, WriteAdr], (instregex "ST3Threev(8b|4h|2s)_POST$")>; + +// ASIMD store, 3 element, multiple, Q-form, B/H/S +// ASIMD store, 3 element, multiple, Q-form, D +def : InstRW<[N2Write_6cyc_3L01_3V], (instregex "ST3Threev(16b|8h|4s|2d)$")>; +def : InstRW<[N2Write_6cyc_3L01_3V, WriteAdr], (instregex "ST3Threev(16b|8h|4s|2d)_POST$")>; + +// ASIMD store, 3 element, one lane, B/H +// ASIMD store, 3 element, one lane, S +// ASIMD store, 3 element, one lane, D +def : InstRW<[N2Write_6cyc_3L01_3V], (instregex "ST3i(8|16|32|64)$")>; +def : InstRW<[N2Write_6cyc_3L01_3V, WriteAdr], (instregex "ST3i(8|16|32|64)_POST$")>; + +// ASIMD store, 4 element, multiple, D-form, B/H/S +def : InstRW<[N2Write_6cyc_3L01_3V], (instregex "ST4Fourv(8b|4h|2s)$")>; +def : InstRW<[N2Write_6cyc_3L01_3V, WriteAdr], (instregex "ST4Fourv(8b|4h|2s)_POST$")>; + +// ASIMD store, 4 element, multiple, Q-form, B/H/S +def : InstRW<[N2Write_7cyc_6L01_6V], (instregex "ST4Fourv(16b|8h|4s)$")>; +def : InstRW<[N2Write_7cyc_6L01_6V, WriteAdr], (instregex "ST4Fourv(16b|8h|4s)_POST$")>; + +// ASIMD store, 4 element, multiple, Q-form, D +def : InstRW<[N2Write_5cyc_4L01_4V], (instregex "ST4Fourv(2d)$")>; +def : InstRW<[N2Write_5cyc_4L01_4V, WriteAdr], (instregex "ST4Fourv(2d)_POST$")>; + +// ASIMD store, 4 element, one lane, B/H/S +def : InstRW<[N2Write_6cyc_3L01_3V], (instregex "ST4i(8|16|32)$")>; +def : InstRW<[N2Write_6cyc_3L01_3V, WriteAdr], (instregex "ST4i(8|16|32)_POST$")>; + +// ASIMD store, 4 element, one lane, D +def : InstRW<[N2Write_4cyc_3L01_3V], (instregex "ST4i(64)$")>; +def : InstRW<[N2Write_4cyc_3L01_3V, WriteAdr], (instregex "ST4i(64)_POST$")>; + +// Cryptography 
extensions +// ----------------------------------------------------------------------------- + +// Crypto AES ops +def : InstRW<[N2Write_2cyc_1V], (instregex "^AES[DE]rr$", "^AESI?MCrr")>; + +// Crypto polynomial (64x64) multiply long +def : InstRW<[N2Write_2cyc_1V0], (instrs PMULLv1i64, PMULLv2i64)>; + +// Crypto SHA1 hash acceleration op +// Crypto SHA1 schedule acceleration ops +def : InstRW<[N2Write_2cyc_1V0], (instregex "^SHA1(H|SU0|SU1)")>; + +// Crypto SHA1 hash acceleration ops +// Crypto SHA256 hash acceleration ops +def : InstRW<[N2Write_4cyc_1V0], (instregex "^SHA1[CMP]", "^SHA256H2?")>; + +// Crypto SHA256 schedule acceleration ops +def : InstRW<[N2Write_2cyc_1V0], (instregex "^SHA256SU[01]")>; + +// Crypto SHA512 hash acceleration ops +def : InstRW<[N2Write_2cyc_1V0], (instregex "^SHA512(H|H2|SU0|SU1)")>; + +// Crypto SHA3 ops +def : InstRW<[N2Write_2cyc_1V0], (instrs BCAX, EOR3, RAX1, XAR)>; + +// Crypto SM3 ops +def : InstRW<[N2Write_2cyc_1V0], (instregex "^SM3PARTW[12]$", "^SM3SS1$", + "^SM3TT[12][AB]$")>; + +// Crypto SM4 ops +def : InstRW<[N2Write_4cyc_1V0], (instrs SM4E, SM4ENCKEY)>; + +// CRC +// ----------------------------------------------------------------------------- + +def : InstRW<[N2Write_2cyc_1M0], (instregex "^CRC32")>; + +// SVE Predicate instructions +// ----------------------------------------------------------------------------- + +// Loop control, based on predicate +def : InstRW<[N2Write_2cyc_1M], (instrs BRKA_PPmP, BRKA_PPzP, + BRKB_PPmP, BRKB_PPzP)>; + +// Loop control, based on predicate and flag setting +def : InstRW<[N2Write_3cyc_1M], (instrs BRKAS_PPzP, BRKBS_PPzP)>; + +// Loop control, propagating +def : InstRW<[N2Write_2cyc_1M0], (instrs BRKN_PPzP, BRKPA_PPzPP, BRKPB_PPzPP)>; + +// Loop control, propagating and flag setting +def : InstRW<[N2Write_3cyc_1M0_1M], (instrs BRKNS_PPzP, BRKPAS_PPzPP, + BRKPBS_PPzPP)>; + +// Loop control, based on GPR +def : InstRW<[N2Write_3cyc_1M], + (instregex "^WHILE(GE|GT|HI|HS|LE|LO|LS|LT)_P(WW|XX)_[BHSD]$")>; + +def : InstRW<[N2Write_3cyc_1M], (instregex "^WHILE(RW|WR)_PXX_[BHSD]$")>; + +// Loop terminate +def : InstRW<[N2Write_1cyc_1M], (instregex "^CTERM(EQ|NE)_(WW|XX)$")>; + +// Predicate counting scalar +def : InstRW<[N2Write_2cyc_1M], (instrs ADDPL_XXI, ADDVL_XXI, RDVLI_XI)>; +def : InstRW<[N2Write_2cyc_1M], + (instregex "^(CNT|DEC|INC|SQDEC|SQINC|UQDEC|UQINC)[BHWD]_XPiI$", + "^SQ(DEC|INC)[BHWD]_XPiWdI$", + "^(UQDEC|UQINC)[BHWD]_WPiI$")>; + +// Predicate counting scalar, active predicate +def : InstRW<[N2Write_2cyc_1M], + (instregex "^CNTP_XPP_[BHSD]$", + "^(DEC|INC|SQDEC|SQINC|UQDEC|UQINC)P_XP_[BHSD]$", + "^(UQDEC|UQINC)P_WP_[BHSD]$", + "^(SQDEC|SQINC|UQDEC|UQINC)P_XPWd_[BHSD]$")>; + +// Predicate counting vector, active predicate +def : InstRW<[N2Write_7cyc_1M_1M0_1V], + (instregex "^(DEC|INC|SQDEC|SQINC|UQDEC|UQINC)P_ZP_[HSD]$")>; + +// Predicate logical +def : InstRW<[N2Write_1cyc_1M0], + (instregex "^(AND|BIC|EOR|NAND|NOR|ORN|ORR)_PPzPP$")>; + +// Predicate logical, flag setting +def : InstRW<[N2Write_2cyc_1M0_1M], + (instregex "^(ANDS|BICS|EORS|NANDS|NORS|ORNS|ORRS)_PPzPP$")>; + +// Predicate reverse +def : InstRW<[N2Write_2cyc_1M], (instregex "^REV_PP_[BHSD]$")>; + +// Predicate select +def : InstRW<[N2Write_1cyc_1M0], (instrs SEL_PPPP)>; + +// Predicate set +def : InstRW<[N2Write_2cyc_1M], (instregex "^PFALSE$", "^PTRUE_[BHSD]$")>; + +// Predicate set/initialize, set flags +def : InstRW<[N2Write_3cyc_1M], (instregex "^PTRUES_[BHSD]$")>; + +// Predicate find first/next +def : InstRW<[N2Write_3cyc_1M], 
(instregex "^PFIRST_B$", "^PNEXT_[BHSD]$")>; + +// Predicate test +def : InstRW<[N2Write_1cyc_1M], (instrs PTEST_PP)>; + +// Predicate transpose +def : InstRW<[N2Write_2cyc_1M], (instregex "^TRN[12]_PPP_[BHSDQ]$")>; + +// Predicate unpack and widen +def : InstRW<[N2Write_2cyc_1M], (instrs PUNPKHI_PP, PUNPKLO_PP)>; + +// Predicate zip/unzip +def : InstRW<[N2Write_2cyc_1M], (instregex "^(ZIP|UZP)[12]_PPP_[BHSDQ]$")>; + +// SVE integer instructions +// ----------------------------------------------------------------------------- + +// Arithmetic, absolute diff +def : InstRW<[N2Write_2cyc_1V], (instregex "^[SU]ABD_ZPmZ_[BHSD]$")>; + +// Arithmetic, absolute diff accum +def : InstRW<[N2Write_4cyc_1V1], (instregex "^[SU]ABA_ZZZ_[BHSD]$")>; + +// Arithmetic, absolute diff accum long +def : InstRW<[N2Write_4cyc_1V1], (instregex "^[SU]ABAL[TB]_ZZZ_[HSD]$")>; + +// Arithmetic, absolute diff long +def : InstRW<[N2Write_2cyc_1V], (instregex "^[SU]ABDL[TB]_ZZZ_[HSD]$")>; + +// Arithmetic, basic +def : InstRW<[N2Write_2cyc_1V], + (instregex "^(ABS|ADD|CNOT|NEG|SUB|SUBR)_ZPmZ_[BHSD]$", + "^(ADD|SUB)_ZZZ_[BHSD]$", + "^(ADD|SUB|SUBR)_ZI_[BHSD]$", + "^ADR_[SU]XTW_ZZZ_D_[0123]$", + "^ADR_LSL_ZZZ_[SD]_[0123]$", + "^[SU](ADD|SUB)[LW][BT]_ZZZ_[HSD]$", + "^SADDLBT_ZZZ_[HSD]$", + "^[SU]H(ADD|SUB|SUBR)_ZPmZ_[BHSD]$", + "^SSUBL(BT|TB)_ZZZ_[HSD]$")>; + +// Arithmetic, complex +def : InstRW<[N2Write_2cyc_1V], + (instregex "^R?(ADD|SUB)HN[BT]_ZZZ_[BHS]$", + "^SQ(ABS|ADD|NEG|SUB|SUBR)_ZPmZ_[BHSD]$", + "^[SU]Q(ADD|SUB)_ZZZ_[BHSD]$", + "^[SU]Q(ADD|SUB)_ZI_[BHSD]$", + "^(SRH|SUQ|UQ|USQ|URH)ADD_ZPmZ_[BHSD]$", + "^(UQSUB|UQSUBR)_ZPmZ_[BHSD]$")>; + +// Arithmetic, large integer +def : InstRW<[N2Write_2cyc_1V], (instregex "^(AD|SB)CL[BT]_ZZZ_[SD]$")>; + +// Arithmetic, pairwise add +def : InstRW<[N2Write_2cyc_1V], (instregex "^ADDP_ZPmZ_[BHSD]$")>; + +// Arithmetic, pairwise add and accum long +def : InstRW<[N2Write_4cyc_1V1], (instregex "^[SU]ADALP_ZPmZ_[HSD]$")>; + +// Arithmetic, shift +def : InstRW<[N2Write_2cyc_1V1], + (instregex "^(ASR|LSL|LSR)_WIDE_ZPmZ_[BHS]$", + "^(ASR|LSL|LSR)_WIDE_ZZZ_[BHS]$", + "^(ASR|LSL|LSR)_ZPmI_[BHSD]$", + "^(ASR|LSL|LSR)_ZPmZ_[BHSD]$", + "^(ASR|LSL|LSR)_ZZI_[BHSD]$", + "^(ASRR|LSLR|LSRR)_ZPmZ_[BHSD]$")>; + +// Arithmetic, shift and accumulate +def : InstRW<[N2Write_4cyc_1V1], + (instregex "^(SRSRA|SSRA|URSRA|USRA)_ZZI_[BHSD]$")>; + +// Arithmetic, shift by immediate +// Arithmetic, shift by immediate and insert +def : InstRW<[N2Write_2cyc_1V1], + (instregex "^(SHRNB|SHRNT|SSHLLB|SSHLLT|USHLLB|USHLLT|SLI|SRI)_ZZI_[BHSD]$")>; + +// Arithmetic, shift complex +def : InstRW<[N2Write_4cyc_1V1], + (instregex "^(SQ)?RSHRU?N[BT]_ZZI_[BHS]$", + "^(SQRSHL|SQRSHLR|SQSHL|SQSHLR|UQRSHL|UQRSHLR|UQSHL|UQSHLR)_ZPmZ_[BHSD]$", + "^(SQSHL|SQSHLU|UQSHL)_ZPmI_[BHSD]$", + "^SQSHRU?N[BT]_ZZI_[BHS]$", + "^UQR?SHRN[BT]_ZZI_[BHS]$")>; + +// Arithmetic, shift right for divide +def : InstRW<[N2Write_4cyc_1V1], (instregex "^ASRD_ZPmI_[BHSD]$")>; + +// Arithmetic, shift rounding +def : InstRW<[N2Write_4cyc_1V1], + (instregex "^(SRSHL|SRSHLR|URSHL|URSHLR)_ZPmZ_[BHSD]$", + "^[SU]RSHR_ZPmI_[BHSD]$")>; + +// Bit manipulation +def : InstRW<[N2Write_6cyc_2V1], + (instregex "^(BDEP|BEXT|BGRP)_ZZZ_[BHSD]$")>; + +// Bitwise select +def : InstRW<[N2Write_2cyc_1V], (instregex "^(BSL|BSL1N|BSL2N|NBSL)_ZZZZ$")>; + +// Count/reverse bits +def : InstRW<[N2Write_2cyc_1V], (instregex "^(CLS|CLZ|CNT|RBIT)_ZPmZ_[BHSD]$")>; + +// Broadcast logical bitmask immediate to vector +def : InstRW<[N2Write_2cyc_1V], (instrs DUPM_ZI)>; + +// Compare 
and set flags +def : InstRW<[N2Write_4cyc_1V0_1M], + (instregex "^CMP(EQ|GE|GT|HI|HS|LE|LO|LS|LT|NE)_PPzZ[IZ]_[BHSD]$", + "^CMP(EQ|GE|GT|HI|HS|LE|LO|LS|LT|NE)_WIDE_PPzZZ_[BHS]$")>; + +// Complex add +def : InstRW<[N2Write_2cyc_1V], (instregex "^(SQ)?CADD_ZZI_[BHSD]$")>; + +// Complex dot product 8-bit element +def : InstRW<[N2Write_3cyc_1V], (instrs CDOT_ZZZ_S, CDOT_ZZZI_S)>; + +// Complex dot product 16-bit element +def : InstRW<[N2Write_4cyc_1V0], (instrs CDOT_ZZZ_D, CDOT_ZZZI_D)>; + +// Complex multiply-add B, H, S element size +def : InstRW<[N2Write_4cyc_1V0], (instregex "^CMLA_ZZZ_[BHS]$", + "^CMLA_ZZZI_[HS]$")>; + +// Complex multiply-add D element size +def : InstRW<[N2Write_5cyc_2V0], (instrs CMLA_ZZZ_D)>; + +// Conditional extract operations, scalar form +def : InstRW<[N2Write_8cyc_1M0_1V1_1V], (instregex "^CLAST[AB]_RPZ_[BHSD]$")>; + +// Conditional extract operations, SIMD&FP scalar and vector forms +def : InstRW<[N2Write_3cyc_1V1], (instregex "^CLAST[AB]_[VZ]PZ_[BHSD]$", + "^COMPACT_ZPZ_[SD]$", + "^SPLICE_ZPZZ?_[BHSD]$")>; + +// Convert to floating point, 64b to float or convert to double +def : InstRW<[N2Write_3cyc_1V0], (instregex "^[SU]CVTF_ZPmZ_Dto[SD]$")>; + +// Convert to floating point, 64b to half +def : InstRW<[N2Write_3cyc_1V0], (instregex "^[SU]CVTF_ZPmZ_DtoH$")>; + +// Convert to floating point, 32b to single or half +def : InstRW<[N2Write_4cyc_2V0], (instregex "^[SU]CVTF_ZPmZ_Sto[HS]$")>; + +// Convert to floating point, 32b to double +def : InstRW<[N2Write_3cyc_1V0], (instregex "^[SU]CVTF_ZPmZ_StoD$")>; + +// Convert to floating point, 16b to half +def : InstRW<[N2Write_6cyc_4V0], (instregex "^[SU]CVTF_ZPmZ_HtoH$")>; + +// Copy, scalar +def : InstRW<[N2Write_5cyc_1M0_1V], (instregex "^CPY_ZPmR_[BHSD]$")>; + +// Copy, scalar SIMD&FP or imm +def : InstRW<[N2Write_2cyc_1V], (instregex "^CPY_ZPm[IV]_[BHSD]$", + "^CPY_ZPzI_[BHSD]$")>; + +// Divides, 32 bit +def : InstRW<[N2Write_12cyc_1V0], (instregex "^[SU]DIVR?_ZPmZ_S$")>; + +// Divides, 64 bit +def : InstRW<[N2Write_20cyc_1V0], (instregex "^[SU]DIVR?_ZPmZ_D$")>; + +// Dot product, 8 bit +def : InstRW<[N2Write_3cyc_1V], (instregex "^[SU]DOT_ZZZI?_S$")>; + +// Dot product, 8 bit, using signed and unsigned integers +def : InstRW<[N2Write_3cyc_1V], (instrs SUDOT_ZZZI, USDOT_ZZZI, USDOT_ZZZ)>; + +// Dot product, 16 bit +def : InstRW<[N2Write_4cyc_1V0], (instregex "^[SU]DOT_ZZZI?_D$")>; + +// Duplicate, immediate and indexed form +def : InstRW<[N2Write_2cyc_1V], (instregex "^DUP_ZI_[BHSD]$", + "^DUP_ZZI_[BHSDQ]$")>; + +// Duplicate, scalar form +def : InstRW<[N2Write_3cyc_1M0], (instregex "^DUP_ZR_[BHSD]$")>; + +// Extend, sign or zero +def : InstRW<[N2Write_2cyc_1V1], (instregex "^[SU]XTB_ZPmZ_[HSD]$", + "^[SU]XTH_ZPmZ_[SD]$", + "^[SU]XTW_ZPmZ_[D]$")>; + +// Extract +def : InstRW<[N2Write_2cyc_1V], (instrs EXT_ZZI, EXT_ZZI_B)>; + +// Extract narrow saturating +def : InstRW<[N2Write_4cyc_1V1], (instregex "^[SU]QXTN[BT]_ZZ_[BHS]$", + "^SQXTUN[BT]_ZZ_[BHS]$")>; + +// Extract/insert operation, SIMD and FP scalar form +def : InstRW<[N2Write_3cyc_1V1], (instregex "^LAST[AB]_VPZ_[BHSD]$", + "^INSR_ZV_[BHSD]$")>; + +// Extract/insert operation, scalar +def : InstRW<[N2Write_5cyc_1V1_1M0], (instregex "^LAST[AB]_RPZ_[BHSD]$", + "^INSR_ZR_[BHSD]$")>; + +// Histogram operations +def : InstRW<[N2Write_2cyc_1V], (instregex "^HISTCNT_ZPzZZ_[SD]$", + "^HISTSEG_ZZZ$")>; + +// Horizontal operations, B, H, S form, immediate operands only +def : InstRW<[N2Write_4cyc_1V0], (instregex "^INDEX_II_[BHS]$")>; + +// Horizontal operations, B, H, S 
form, scalar, immediate operands / scalar
+// operands only / immediate, scalar operands
+def : InstRW<[N2Write_7cyc_1M0_1V0], (instregex "^INDEX_(IR|RI|RR)_[BHS]$")>;
+
+// Horizontal operations, D form, immediate operands only
+def : InstRW<[N2Write_5cyc_2V0], (instrs INDEX_II_D)>;
+
+// Horizontal operations, D form, scalar, immediate operands / scalar operands
+// only / immediate, scalar operands
+def : InstRW<[N2Write_8cyc_2M0_2V0], (instregex "^INDEX_(IR|RI|RR)_D$")>;
+
+// Logical
+def : InstRW<[N2Write_2cyc_1V],
+ (instregex "^(AND|EOR|ORR)_ZI$",
+ "^(AND|BIC|EOR|EOR(BT|TB)?|ORR)_ZZZ$",
+ "^EOR(BT|TB)_ZZZ_[BHSD]$",
+ "^(AND|BIC|EOR|NOT|ORR)_ZPmZ_[BHSD]$")>;
+
+// Max/min, basic and pairwise
+def : InstRW<[N2Write_2cyc_1V], (instregex "^[SU](MAX|MIN)_ZI_[BHSD]$",
+ "^[SU](MAX|MIN)P?_ZPmZ_[BHSD]$")>;
+
+// Matching operations
+def : InstRW<[N2Write_2cyc_1V0_1M], (instregex "^N?MATCH_PPzZZ_[BH]$")>;
+
+// Matrix multiply-accumulate
+def : InstRW<[N2Write_3cyc_1V], (instrs SMMLA_ZZZ, UMMLA_ZZZ, USMMLA_ZZZ)>;
+
+// Move prefix
+def : InstRW<[N2Write_2cyc_1V], (instregex "^MOVPRFX_ZP[mz]Z_[BHSD]$",
+ "^MOVPRFX_ZZ$")>;
+
+// Multiply, B, H, S element size
+def : InstRW<[N2Write_4cyc_1V0], (instregex "^MUL_(ZI|ZPmZ|ZZZI|ZZZ)_[BHS]$",
+ "^[SU]MULH_(ZPmZ|ZZZ)_[BHS]$")>;
+
+// Multiply, D element size
+def : InstRW<[N2Write_5cyc_2V0], (instregex "^MUL_(ZI|ZPmZ|ZZZI|ZZZ)_D$",
+ "^[SU]MULH_(ZPmZ|ZZZ)_D$")>;
+
+// Multiply long
+def : InstRW<[N2Write_4cyc_1V0], (instregex "^[SU]MULL[BT]_ZZZI_[SD]$",
+ "^[SU]MULL[BT]_ZZZ_[HSD]$")>;
+
+// Multiply accumulate, B, H, S element size
+def : InstRW<[N2Write_4cyc_1V0], (instregex "^ML[AS]_ZZZI_[BHS]$",
+ "^(ML[AS]|MAD|MSB)_ZPmZZ_[BHS]$")>;
+
+// Multiply accumulate, D element size
+def : InstRW<[N2Write_5cyc_2V0], (instregex "^ML[AS]_ZZZI_D$",
+ "^(ML[AS]|MAD|MSB)_ZPmZZ_D$")>;
+
+// Multiply accumulate long
+def : InstRW<[N2Write_4cyc_1V0], (instregex "^[SU]ML[AS]L[BT]_ZZZ_[HSD]$",
+ "^[SU]ML[AS]L[BT]_ZZZI_[SD]$")>;
+
+// Multiply accumulate saturating doubling long regular
+def : InstRW<[N2Write_4cyc_1V0], (instregex "^SQDML[AS](LB|LT|LBT)_ZZZ_[HSD]$",
+ "^SQDML[AS](LB|LT)_ZZZI_[SD]$")>;
+
+// Multiply saturating doubling high, B, H, S element size
+def : InstRW<[N2Write_4cyc_1V0], (instregex "^SQDMULH_ZZZ_[BHS]$",
+ "^SQDMULH_ZZZI_[HS]$")>;
+
+// Multiply saturating doubling high, D element size
+def : InstRW<[N2Write_5cyc_2V0], (instrs SQDMULH_ZZZ_D, SQDMULH_ZZZI_D)>;
+
+// Multiply saturating doubling long
+def : InstRW<[N2Write_4cyc_1V0], (instregex "^SQDMULL[BT]_ZZZ_[HSD]$",
+ "^SQDMULL[BT]_ZZZI_[SD]$")>;
+
+// Multiply saturating rounding doubling regular/complex accumulate, B, H, S
+// element size
+def : InstRW<[N2Write_4cyc_1V0], (instregex "^SQRDML[AS]H_ZZZ_[BHS]$",
+ "^SQRDCMLAH_ZZZ_[BHS]$",
+ "^SQRDML[AS]H_ZZZI_[HS]$",
+ "^SQRDCMLAH_ZZZI_[HS]$")>;
+
+// Multiply saturating rounding doubling regular/complex accumulate, D element
+// size
+def : InstRW<[N2Write_5cyc_2V0], (instregex "^SQRDML[AS]H_ZZZI?_D$",
+ "^SQRDCMLAH_ZZZ_D$")>;
+
+// Multiply saturating rounding doubling regular/complex, B, H, S element size
+def : InstRW<[N2Write_4cyc_1V0], (instregex "^SQRDMULH_ZZZ_[BHS]$",
+ "^SQRDMULH_ZZZI_[HS]$")>;
+
+// Multiply saturating rounding doubling regular/complex, D element size
+def : InstRW<[N2Write_5cyc_2V0], (instregex "^SQRDMULH_ZZZI?_D$")>;
+
+// Multiply/multiply long, (8x8) polynomial
+def : InstRW<[N2Write_2cyc_1V0], (instregex "^PMUL_ZZZ_B$",
+ "^PMULL[BT]_ZZZ_[HDQ]$")>;
+
+// Predicate counting vector
+def : 
InstRW<[N2Write_2cyc_1V0], + (instregex "^(DEC|INC|SQDEC|SQINC|UQDEC|UQINC)[HWD]_ZPiI$")>; + +// Reciprocal estimate +def : InstRW<[N2Write_4cyc_2V0], (instrs URECPE_ZPmZ_S, URSQRTE_ZPmZ_S)>; + +// Reduction, arithmetic, B form +def : InstRW<[N2Write_11cyc_2V_2V1], (instregex "^[SU](ADD|MAX|MIN)V_VPZ_B")>; + +// Reduction, arithmetic, H form +def : InstRW<[N2Write_9cyc_2V_2V1], (instregex "^[SU](ADD|MAX|MIN)V_VPZ_H")>; + +// Reduction, arithmetic, S form +def : InstRW<[N2Write_8cyc_2V_2V1], (instregex "^[SU](ADD|MAX|MIN)V_VPZ_S")>; + +// Reduction, arithmetic, D form +def : InstRW<[N2Write_8cyc_2V_2V1], (instregex "^[SU](ADD|MAX|MIN)V_VPZ_D")>; + +// Reduction, logical +def : InstRW<[N2Write_6cyc_1V_1V1], (instregex "^(ANDV|EORV|ORV)_VPZ_[BHSD]$")>; + +// Reverse, vector +def : InstRW<[N2Write_2cyc_1V], (instregex "^REV_ZZ_[BHSD]$", + "^REVB_ZPmZ_[HSD]$", + "^REVH_ZPmZ_[SD]$", + "^REVW_ZPmZ_D$")>; + +// Select, vector form +def : InstRW<[N2Write_2cyc_1V], (instregex "^SEL_ZPZZ_[BHSD]$")>; + +// Table lookup +def : InstRW<[N2Write_2cyc_1V], (instregex "^TBL_ZZZZ?_[BHSD]$")>; + +// Table lookup extension +def : InstRW<[N2Write_2cyc_1V], (instregex "^TBX_ZZZ_[BHSD]$")>; + +// Transpose, vector form +def : InstRW<[N2Write_2cyc_1V], (instregex "^TRN[12]_ZZZ_[BHSDQ]$")>; + +// Unpack and extend +def : InstRW<[N2Write_2cyc_1V], (instregex "^[SU]UNPK(HI|LO)_ZZ_[HSD]$")>; + +// Zip/unzip +def : InstRW<[N2Write_2cyc_1V], (instregex "^(UZP|ZIP)[12]_ZZZ_[BHSDQ]$")>; + +// SVE floating-point instructions +// ----------------------------------------------------------------------------- + +// Floating point absolute value/difference +def : InstRW<[N2Write_2cyc_1V], (instregex "^FAB[SD]_ZPmZ_[HSD]$")>; + +// Floating point arithmetic +def : InstRW<[N2Write_2cyc_1V], (instregex "^F(ADD|SUB)_(ZPm[IZ]|ZZZ)_[HSD]$", + "^FADDP_ZPmZZ_[HSD]$", + "^FNEG_ZPmZ_[HSD]$", + "^FSUBR_ZPm[IZ]_[HSD]$")>; + +// Floating point associative add, F16 +def : InstRW<[N2Write_10cyc_1V1], (instrs FADDA_VPZ_H)>; + +// Floating point associative add, F32 +def : InstRW<[N2Write_6cyc_1V1], (instrs FADDA_VPZ_S)>; + +// Floating point associative add, F64 +def : InstRW<[N2Write_4cyc_1V], (instrs FADDA_VPZ_D)>; + +// Floating point compare +def : InstRW<[N2Write_2cyc_1V0], (instregex "^FACG[ET]_PPzZZ_[HSD]$", + "^FCM(EQ|GE|GT|NE)_PPzZ[0Z]_[HSD]$", + "^FCM(LE|LT)_PPzZ0_[HSD]$", + "^FCMUO_PPzZZ_[HSD]$")>; + +// Floating point complex add +def : InstRW<[N2Write_3cyc_1V], (instregex "^FCADD_ZPmZ_[HSD]$")>; + +// Floating point complex multiply add +def : InstRW<[N2Write_5cyc_1V], (instregex "^FCMLA_ZPmZZ_[HSD]$", + "^FCMLA_ZZZI_[HS]$")>; + +// Floating point convert, long or narrow (F16 to F32 or F32 to F16) +def : InstRW<[N2Write_4cyc_2V0], (instregex "^FCVT_ZPmZ_(HtoS|StoH)$", + "^FCVTLT_ZPmZ_HtoS$", + "^FCVTNT_ZPmZ_StoH$")>; + +// Floating point convert, long or narrow (F16 to F64, F32 to F64, F64 to F32 +// or F64 to F16) +def : InstRW<[N2Write_3cyc_1V0], (instregex "^FCVT_ZPmZ_(HtoD|StoD|DtoS|DtoH)$", + "^FCVTLT_ZPmZ_StoD$", + "^FCVTNT_ZPmZ_DtoS$")>; + +// Floating point convert, round to odd +def : InstRW<[N2Write_3cyc_1V0], (instrs FCVTX_ZPmZ_DtoS, FCVTXNT_ZPmZ_DtoS)>; + +// Floating point base2 log, F16 +def : InstRW<[N2Write_6cyc_4V0], (instrs FLOGB_ZPmZ_H)>; + +// Floating point base2 log, F32 +def : InstRW<[N2Write_4cyc_2V0], (instrs FLOGB_ZPmZ_S)>; + +// Floating point base2 log, F64 +def : InstRW<[N2Write_3cyc_1V0], (instrs FLOGB_ZPmZ_D)>; + +// Floating point convert to integer, F16 +def : InstRW<[N2Write_6cyc_4V0], (instregex 
"^FCVTZ[SU]_ZPmZ_HtoH$")>; + +// Floating point convert to integer, F32 +def : InstRW<[N2Write_4cyc_2V0], (instregex "^FCVTZ[SU]_ZPmZ_(HtoS|StoS)$")>; + +// Floating point convert to integer, F64 +def : InstRW<[N2Write_3cyc_1V0], + (instregex "^FCVTZ[SU]_ZPmZ_(HtoD|StoD|DtoS|DtoD)$")>; + +// Floating point copy +def : InstRW<[N2Write_2cyc_1V], (instregex "^FCPY_ZPmI_[HSD]$", + "^FDUP_ZI_[HSD]$")>; + +// Floating point divide, F16 +def : InstRW<[N2Write_13cyc_1V0], (instregex "^FDIVR?_ZPmZ_H$")>; + +// Floating point divide, F32 +def : InstRW<[N2Write_10cyc_1V0], (instregex "^FDIVR?_ZPmZ_S$")>; + +// Floating point divide, F64 +def : InstRW<[N2Write_15cyc_1V0], (instregex "^FDIVR?_ZPmZ_D$")>; + +// Floating point min/max pairwise +def : InstRW<[N2Write_2cyc_1V], (instregex "^F(MAX|MIN)(NM)?P_ZPmZZ_[HSD]$")>; + +// Floating point min/max +def : InstRW<[N2Write_2cyc_1V], (instregex "^F(MAX|MIN)(NM)?_ZPm[IZ]_[HSD]$")>; + +// Floating point multiply +def : InstRW<[N2Write_3cyc_1V], (instregex "^(FSCALE|FMULX)_ZPmZ_[HSD]$", + "^FMUL_(ZPm[IZ]|ZZZI?)_[HSD]$")>; + +// Floating point multiply accumulate +def : InstRW<[N2Write_4cyc_1V], + (instregex "^FML[AS]_(ZPmZZ|ZZZI)_[HSD]$", + "^(FMAD|FNMAD|FNML[AS]|FN?MSB)_ZPmZZ_[HSD]$")>; + +// Floating point multiply add/sub accumulate long +def : InstRW<[N2Write_4cyc_1V], (instregex "^FML[AS]L[BT]_ZZZI?_SHH$")>; + +// Floating point reciprocal estimate, F16 +def : InstRW<[N2Write_6cyc_4V0], (instrs FRECPE_ZZ_H, FRECPX_ZPmZ_H, + FRSQRTE_ZZ_H)>; + +// Floating point reciprocal estimate, F32 +def : InstRW<[N2Write_4cyc_2V0], (instrs FRECPE_ZZ_S, FRECPX_ZPmZ_S, + FRSQRTE_ZZ_S)>; + +// Floating point reciprocal estimate, F64 +def : InstRW<[N2Write_3cyc_1V0], (instrs FRECPE_ZZ_D, FRECPX_ZPmZ_D, + FRSQRTE_ZZ_D)>; + +// Floating point reciprocal step +def : InstRW<[N2Write_4cyc_1V0], (instregex "^F(RECPS|RSQRTS)_ZZZ_[HSD]$")>; + +// Floating point reduction, F16 +def : InstRW<[N2Write_6cyc_2V], + (instregex "^(FADDV|FMAXNMV|FMAXV|FMINNMV|FMINV)_VPZ_H$")>; + +// Floating point reduction, F32 +def : InstRW<[N2Write_4cyc_1V], + (instregex "^(FADDV|FMAXNMV|FMAXV|FMINNMV|FMINV)_VPZ_S$")>; + +// Floating point reduction, F64 +def : InstRW<[N2Write_2cyc_1V], + (instregex "^(FADDV|FMAXNMV|FMAXV|FMINNMV|FMINV)_VPZ_D$")>; + +// Floating point round to integral, F16 +def : InstRW<[N2Write_6cyc_4V0], (instregex "^FRINT[AIMNPXZ]_ZPmZ_H$")>; + +// Floating point round to integral, F32 +def : InstRW<[N2Write_4cyc_2V0], (instregex "^FRINT[AIMNPXZ]_ZPmZ_S$")>; + +// Floating point round to integral, F64 +def : InstRW<[N2Write_3cyc_1V0], (instregex "^FRINT[AIMNPXZ]_ZPmZ_D$")>; + +// Floating point square root, F16 +def : InstRW<[N2Write_13cyc_1V0], (instrs FSQRT_ZPmZ_H)>; + +// Floating point square root, F32 +def : InstRW<[N2Write_10cyc_1V0], (instrs FSQRT_ZPmZ_S)>; + +// Floating point square root, F64 +def : InstRW<[N2Write_16cyc_1V0], (instrs FSQRT_ZPmZ_D)>; + +// Floating point trigonometric exponentiation +def : InstRW<[N2Write_3cyc_1V1], (instregex "^FEXPA_ZZ_[HSD]$")>; + +// Floating point trigonometric multiply add +def : InstRW<[N2Write_4cyc_1V], (instregex "^FTMAD_ZZI_[HSD]$")>; + +// Floating point trigonometric, miscellaneous +def : InstRW<[N2Write_3cyc_1V], (instregex "^FTS(MUL|SEL)_ZZZ_[HSD]$")>; + +// SVE BFloat16 (BF16) instructions +// ----------------------------------------------------------------------------- + +// Convert, F32 to BF16 +def : InstRW<[N2Write_3cyc_1V0], (instrs BFCVT_ZPmZ, BFCVTNT_ZPmZ)>; + +// Dot product +def : InstRW<[N2Write_4cyc_1V], (instrs 
BFDOT_ZZI, BFDOT_ZZZ)>; + +// Matrix multiply accumulate +def : InstRW<[N2Write_5cyc_1V], (instrs BFMMLA_ZZZ)>; + +// Multiply accumulate long +def : InstRW<[N2Write_4cyc_1V], (instregex "^BFMLAL[BT]_ZZ[ZI]$")>; + +// SVE Load instructions +// ----------------------------------------------------------------------------- + +// Load vector +def : InstRW<[N2Write_6cyc_1L], (instrs LDR_ZXI)>; + +// Load predicate +def : InstRW<[N2Write_6cyc_1L_1M], (instrs LDR_PXI)>; + +// Contiguous load, scalar + imm +def : InstRW<[N2Write_6cyc_1L], (instregex "^LD1[BHWD]_IMM_REAL$", + "^LD1S?B_[HSD]_IMM_REAL$", + "^LD1S?H_[SD]_IMM_REAL$", + "^LD1S?W_D_IMM_REAL$" )>; +// Contiguous load, scalar + scalar +def : InstRW<[N2Write_6cyc_1L01], (instregex "^LD1[BHWD]$", + "^LD1S?B_[HSD]$", + "^LD1S?H_[SD]$", + "^LD1S?W_D$" )>; + +// Contiguous load broadcast, scalar + imm +def : InstRW<[N2Write_6cyc_1L], (instregex "^LD1R[BHWD]_IMM$", + "^LD1RSW_IMM$", + "^LD1RS?B_[HSD]_IMM$", + "^LD1RS?H_[SD]_IMM$", + "^LD1RS?W_D_IMM$", + "^LD1RQ_[BHWD]_IMM$")>; + +// Contiguous load broadcast, scalar + scalar +def : InstRW<[N2Write_6cyc_1L], (instregex "^LD1RQ_[BHWD]$")>; + +// Non temporal load, scalar + imm +def : InstRW<[N2Write_6cyc_1L], (instregex "^LDNT1[BHWD]_ZRI$")>; + +// Non temporal load, scalar + scalar +def : InstRW<[N2Write_6cyc_1L_1S], (instregex "^LDNT1[BHWD]_ZRR$")>; + +// Non temporal gather load, vector + scalar 32-bit element size +def : InstRW<[N2Write_9cyc_1L_1V], (instregex "^LDNT1[BHW]_ZZR_S_REAL$", + "^LDNT1S[BH]_ZZR_S_REAL$")>; + +// Non temporal gather load, vector + scalar 64-bit element size +def : InstRW<[N2Write_10cyc_2L_2V1], (instregex "^LDNT1S?[BHW]_ZZR_D_REAL$")>; +def : InstRW<[N2Write_10cyc_2L_2V1], (instrs LDNT1D_ZZR_D_REAL)>; + +// Contiguous first faulting load, scalar + scalar +def : InstRW<[N2Write_6cyc_1L_1S], (instregex "^LDFF1[BHWD]_REAL$", + "^LDFF1S?B_[HSD]_REAL$", + "^LDFF1S?H_[SD]_REAL$", + "^LDFF1S?W_D_REAL$")>; + +// Contiguous non faulting load, scalar + imm +def : InstRW<[N2Write_6cyc_1L], (instregex "^LDNF1[BHWD]_IMM_REAL$", + "^LDNF1S?B_[HSD]_IMM_REAL$", + "^LDNF1S?H_[SD]_IMM_REAL$", + "^LDNF1S?W_D_IMM_REAL$")>; + +// Contiguous Load two structures to two vectors, scalar + imm +def : InstRW<[N2Write_8cyc_1L_1V], (instregex "^LD2[BHWD]_IMM$")>; + +// Contiguous Load two structures to two vectors, scalar + scalar +def : InstRW<[N2Write_9cyc_1L_1V], (instregex "^LD2[BHWD]$")>; + +// Contiguous Load three structures to three vectors, scalar + imm +def : InstRW<[N2Write_9cyc_1L_1V], (instregex "^LD3[BHWD]_IMM$")>; + +// Contiguous Load three structures to three vectors, scalar + scalar +def : InstRW<[N2Write_10cyc_1V_1L_1S], (instregex "^LD3[BHWD]$")>; + +// Contiguous Load four structures to four vectors, scalar + imm +def : InstRW<[N2Write_9cyc_2L_2V], (instregex "^LD4[BHWD]_IMM$")>; + +// Contiguous Load four structures to four vectors, scalar + scalar +def : InstRW<[N2Write_10cyc_2L_2V_2S], (instregex "^LD4[BHWD]$")>; + +// Gather load, vector + imm, 32-bit element size +def : InstRW<[N2Write_9cyc_1L_1V], (instregex "^GLD(FF)?1S?[BH]_S_IMM_REAL$", + "^GLD(FF)?1W_IMM_REAL$")>; + +// Gather load, vector + imm, 64-bit element size +def : InstRW<[N2Write_9cyc_2L_2V], (instregex "^GLD(FF)?1S?[BHW]_D_IMM_REAL$", + "^GLD(FF)?1D_IMM_REAL$")>; + +// Gather load, 64-bit element size +def : InstRW<[N2Write_9cyc_2L_2V], + (instregex "^GLD(FF)?1S?[BHW]_D_[SU]XTW_(SCALED_)?REAL$", + "^GLD(FF)?1S?[BHW]_D_(SCALED_)?REAL$", + "^GLD(FF)?1D_[SU]XTW_(SCALED_)?REAL$", + 
"^GLD(FF)?1D_(SCALED_)?REAL$")>; + +// Gather load, 32-bit scaled offset +def : InstRW<[N2Write_10cyc_2L_2V], + (instregex "^GLD(FF)?1S?[HW]_S_[SU]XTW_SCALED_REAL$", + "^GLD(FF)?1W_[SU]XTW_SCALED_REAL")>; + +// Gather load, 32-bit unpacked unscaled offset +def : InstRW<[N2Write_9cyc_1L_1V], (instregex "^GLD(FF)?1S?[BH]_S_[SU]XTW_REAL$", + "^GLD(FF)?1W_[SU]XTW_REAL$")>; + +// SVE Store instructions +// ----------------------------------------------------------------------------- + +// Store from predicate reg +def : InstRW<[N2Write_1cyc_1L01], (instrs STR_PXI)>; + +// Store from vector reg +def : InstRW<[N2Write_2cyc_1L01_1V], (instrs STR_ZXI)>; + +// Contiguous store, scalar + imm +def : InstRW<[N2Write_2cyc_1L01_1V], (instregex "^ST1[BHWD]_IMM$", + "^ST1B_[HSD]_IMM$", + "^ST1H_[SD]_IMM$", + "^ST1W_D_IMM$")>; + +// Contiguous store, scalar + scalar +def : InstRW<[N2Write_2cyc_1L01_1S_1V], (instregex "^ST1H(_[SD])?$")>; +def : InstRW<[N2Write_2cyc_1L01_1V], (instregex "^ST1[BWD]$", + "^ST1B_[HSD]$", + "^ST1W_D$")>; + +// Contiguous store two structures from two vectors, scalar + imm +def : InstRW<[N2Write_4cyc_1L01_1V], (instregex "^ST2[BHWD]_IMM$")>; + +// Contiguous store two structures from two vectors, scalar + scalar +def : InstRW<[N2Write_4cyc_1L01_1S_1V], (instrs ST2H)>; + +// Contiguous store two structures from two vectors, scalar + scalar +def : InstRW<[N2Write_4cyc_1L01_1V], (instregex "^ST2[BWD]$")>; + +// Contiguous store three structures from three vectors, scalar + imm +def : InstRW<[N2Write_7cyc_5L01_5V], (instregex "^ST3[BHWD]_IMM$")>; + +// Contiguous store three structures from three vectors, scalar + scalar +def : InstRW<[N2Write_7cyc_5L01_5S_5V], (instrs ST3H)>; + +// Contiguous store three structures from three vectors, scalar + scalar +def : InstRW<[N2Write_7cyc_5L01_5S_5V], (instregex "^ST3[BWD]$")>; + +// Contiguous store four structures from four vectors, scalar + imm +def : InstRW<[N2Write_11cyc_9L01_9V], (instregex "^ST4[BHWD]_IMM$")>; + +// Contiguous store four structures from four vectors, scalar + scalar +def : InstRW<[N2Write_11cyc_9L01_9S_9V], (instrs ST4H)>; + +// Contiguous store four structures from four vectors, scalar + scalar +def : InstRW<[N2Write_11cyc_9L01_9S_9V], (instregex "^ST4[BWD]$")>; + +// Non temporal store, scalar + imm +def : InstRW<[N2Write_2cyc_1L01_1V], (instregex "^STNT1[BHWD]_ZRI$")>; + +// Non temporal store, scalar + scalar +def : InstRW<[N2Write_2cyc_1L01_1S_1V], (instrs STNT1H_ZRR)>; +def : InstRW<[N2Write_2cyc_1L01_1V], (instregex "^STNT1[BWD]_ZRR$")>; + +// Scatter non temporal store, vector + scalar 32-bit element size +def : InstRW<[N2Write_4cyc_2L01_2V], (instregex "^STNT1[BHW]_ZZR_S")>; + +// Scatter non temporal store, vector + scalar 64-bit element size +def : InstRW<[N2Write_2cyc_1L01_1V], (instregex "^STNT1[BHWD]_ZZR_D")>; + +// Scatter store vector + imm 32-bit element size +def : InstRW<[N2Write_4cyc_2L01_2V], (instregex "^SST1[BH]_S_IMM$", + "^SST1W_IMM$")>; + +// Scatter store vector + imm 64-bit element size +def : InstRW<[N2Write_2cyc_1L01_1V], (instregex "^SST1[BHW]_D_IMM$", + "^SST1D_IMM$")>; + +// Scatter store, 32-bit scaled offset +def : InstRW<[N2Write_4cyc_2L01_2V], + (instregex "^SST1(H_S|W)_[SU]XTW_SCALED$")>; + +// Scatter store, 32-bit unpacked unscaled offset +def : InstRW<[N2Write_2cyc_1L01_1V], (instregex "^SST1[BHW]_D_[SU]XTW$", + "^SST1D_[SU]XTW$")>; + +// Scatter store, 32-bit unpacked scaled offset +def : InstRW<[N2Write_2cyc_1L01_1V], (instregex "^SST1[HW]_D_[SU]XTW_SCALED$", + 
"^SST1D_[SU]XTW_SCALED$")>; + +// Scatter store, 32-bit unscaled offset +def : InstRW<[N2Write_4cyc_2L01_2V], (instregex "^SST1[BH]_S_[SU]XTW$", + "^SST1W_[SU]XTW$")>; + +// Scatter store, 64-bit scaled offset +def : InstRW<[N2Write_2cyc_1L01_1V], (instregex "^SST1[HW]_D_SCALED$", + "^SST1D_SCALED$")>; + +// Scatter store, 64-bit unscaled offset +def : InstRW<[N2Write_2cyc_1L01_1V], (instregex "^SST1[BHW]_D$", + "^SST1D$")>; + +// SVE Miscellaneous instructions +// ----------------------------------------------------------------------------- + +// Read first fault register, unpredicated +def : InstRW<[N2Write_2cyc_1M0], (instrs RDFFR_P_REAL)>; + +// Read first fault register, predicated +def : InstRW<[N2Write_3cyc_1M0_1M], (instrs RDFFR_PPz_REAL)>; + +// Read first fault register and set flags +def : InstRW<[N2Write_4cyc_2M0_2M], (instrs RDFFRS_PPz)>; + +// Set first fault register +// Write to first fault register +def : InstRW<[N2Write_2cyc_1M0], (instrs SETFFR, WRFFR)>; + +// Prefetch +def : InstRW<[N2Write_4cyc_1L], (instregex "^PRF[BHWD]")>; + +// SVE Cryptographic instructions +// ----------------------------------------------------------------------------- + +// Crypto AES ops +def : InstRW<[N2Write_2cyc_1V], (instregex "^AES[DE]_ZZZ_B$", + "^AESI?MC_ZZ_B$")>; + +// Crypto SHA3 ops +def : InstRW<[N2Write_2cyc_1V0], (instregex "^(BCAX|EOR3)_ZZZZ$", + "^RAX1_ZZZ_D$", + "^XAR_ZZZI_[BHSD]$")>; + +// Crypto SM4 ops +def : InstRW<[N2Write_4cyc_1V0], (instregex "^SM4E(KEY)?_ZZZ_S$")>; + +} diff --git a/llvm/lib/Target/AArch64/AArch64SchedTSV110.td b/llvm/lib/Target/AArch64/AArch64SchedTSV110.td index 6ecfc97a4273..9c1bf3231a55 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedTSV110.td +++ b/llvm/lib/Target/AArch64/AArch64SchedTSV110.td @@ -26,7 +26,8 @@ def TSV110Model : SchedMachineModel { list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F, PAUnsupported.F, - SMEUnsupported.F); + SMEUnsupported.F, + [HasMTE]); } // Define each kind of processor resource and number available on the TSV110, diff --git a/llvm/lib/Target/AArch64/AArch64SchedThunderX.td b/llvm/lib/Target/AArch64/AArch64SchedThunderX.td index ff34c0ce9a0c..8b380ae0e8f3 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedThunderX.td +++ b/llvm/lib/Target/AArch64/AArch64SchedThunderX.td @@ -27,7 +27,8 @@ def ThunderXT8XModel : SchedMachineModel { list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F, PAUnsupported.F, - SMEUnsupported.F); + SMEUnsupported.F, + [HasMTE]); // FIXME: Remove when all errors have been fixed. let FullInstRWOverlapCheck = 0; } diff --git a/llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td b/llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td index ffa0a5e7d91a..cdafa33da054 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td +++ b/llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td @@ -27,7 +27,8 @@ def ThunderX2T99Model : SchedMachineModel { list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F, PAUnsupported.F, - SMEUnsupported.F); + SMEUnsupported.F, + [HasMTE]); // FIXME: Remove when all errors have been fixed. 
let FullInstRWOverlapCheck = 0; } diff --git a/llvm/lib/Target/AArch64/AArch64SchedThunderX3T110.td b/llvm/lib/Target/AArch64/AArch64SchedThunderX3T110.td index 46a1c217f984..5b1e9b5bcf23 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedThunderX3T110.td +++ b/llvm/lib/Target/AArch64/AArch64SchedThunderX3T110.td @@ -25,7 +25,8 @@ def ThunderX3T110Model : SchedMachineModel { let CompleteModel = 1; list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F, - PAUnsupported.F); + PAUnsupported.F, + [HasMTE]); // FIXME: Remove when all errors have been fixed. let FullInstRWOverlapCheck = 0; } diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 41c7a8c5042f..274a025e82a0 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -796,6 +796,50 @@ static Optional<Instruction *> instCombineSVELast(InstCombiner &IC, return IC.replaceInstUsesWith(II, Extract); } +static Optional<Instruction *> instCombineSVECondLast(InstCombiner &IC, + IntrinsicInst &II) { + // The SIMD&FP variant of CLAST[AB] is significantly faster than the scalar + // integer variant across a variety of micro-architectures. Replace scalar + // integer CLAST[AB] intrinsic with optimal SIMD&FP variant. A simple + // bitcast-to-fp + clast[ab] + bitcast-to-int will cost a cycle or two more + // depending on the micro-architecture, but has been observed as generally + // being faster, particularly when the CLAST[AB] op is a loop-carried + // dependency. + IRBuilder<> Builder(II.getContext()); + Builder.SetInsertPoint(&II); + Value *Pg = II.getArgOperand(0); + Value *Fallback = II.getArgOperand(1); + Value *Vec = II.getArgOperand(2); + Type *Ty = II.getType(); + + if (!Ty->isIntegerTy()) + return None; + + Type *FPTy; + switch (cast<IntegerType>(Ty)->getBitWidth()) { + default: + return None; + case 16: + FPTy = Builder.getHalfTy(); + break; + case 32: + FPTy = Builder.getFloatTy(); + break; + case 64: + FPTy = Builder.getDoubleTy(); + break; + } + + Value *FPFallBack = Builder.CreateBitCast(Fallback, FPTy); + auto *FPVTy = VectorType::get( + FPTy, cast<VectorType>(Vec->getType())->getElementCount()); + Value *FPVec = Builder.CreateBitCast(Vec, FPVTy); + auto *FPII = Builder.CreateIntrinsic(II.getIntrinsicID(), {FPVec->getType()}, + {Pg, FPFallBack, FPVec}); + Value *FPIItoInt = Builder.CreateBitCast(FPII, II.getType()); + return IC.replaceInstUsesWith(II, FPIItoInt); +} + static Optional<Instruction *> instCombineRDFFR(InstCombiner &IC, IntrinsicInst &II) { LLVMContext &Ctx = II.getContext(); @@ -1294,6 +1338,9 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC, case Intrinsic::aarch64_sve_lasta: case Intrinsic::aarch64_sve_lastb: return instCombineSVELast(IC, II); + case Intrinsic::aarch64_sve_clasta_n: + case Intrinsic::aarch64_sve_clastb_n: + return instCombineSVECondLast(IC, II); case Intrinsic::aarch64_sve_cntd: return instCombineSVECntElts(IC, II, 2); case Intrinsic::aarch64_sve_cntw: diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index d0aacb457a39..59ec91843266 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -334,8 +334,10 @@ public: return 2; } - bool emitGetActiveLaneMask() const { - return ST->hasSVE(); + PredicationStyle emitGetActiveLaneMask() const { + if (ST->hasSVE()) + return 
PredicationStyle::DataAndControlFlow; + return PredicationStyle::None; } bool supportsScalableVectors() const { return ST->hasSVE(); } diff --git a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp index 89e1d85a6085..aaef363e9b8d 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp @@ -21,6 +21,7 @@ #include "llvm/Analysis/ObjCARCUtil.h" #include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/CallingConvLower.h" +#include "llvm/CodeGen/FunctionLoweringInfo.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" #include "llvm/CodeGen/GlobalISel/Utils.h" #include "llvm/CodeGen/LowLevelType.h" @@ -354,7 +355,9 @@ bool AArch64CallLowering::lowerReturn(MachineIRBuilder &MIRBuilder, "Return value without a vreg"); bool Success = true; - if (!VRegs.empty()) { + if (!FLI.CanLowerReturn) { + insertSRetStores(MIRBuilder, Val->getType(), VRegs, FLI.DemoteRegister); + } else if (!VRegs.empty()) { MachineFunction &MF = MIRBuilder.getMF(); const Function &F = MF.getFunction(); const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>(); @@ -464,6 +467,18 @@ bool AArch64CallLowering::lowerReturn(MachineIRBuilder &MIRBuilder, return Success; } +bool AArch64CallLowering::canLowerReturn(MachineFunction &MF, + CallingConv::ID CallConv, + SmallVectorImpl<BaseArgInfo> &Outs, + bool IsVarArg) const { + SmallVector<CCValAssign, 16> ArgLocs; + const auto &TLI = *getTLI<AArch64TargetLowering>(); + CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, + MF.getFunction().getContext()); + + return checkReturn(CCInfo, Outs, TLI.CCAssignFnForReturn(CallConv)); +} + /// Helper function to compute forwarded registers for musttail calls. Computes /// the forwarded registers, sets MBB liveness, and emits COPY instructions that /// can be used to save + restore registers later. @@ -533,6 +548,12 @@ bool AArch64CallLowering::lowerFormalArguments( SmallVector<ArgInfo, 8> SplitArgs; SmallVector<std::pair<Register, Register>> BoolArgs; + + // Insert the hidden sret parameter if the return value won't fit in the + // return registers. + if (!FLI.CanLowerReturn) + insertSRetIncomingArgument(F, SplitArgs, FLI.DemoteRegister, MRI, DL); + unsigned i = 0; for (auto &Arg : F.args()) { if (DL.getTypeStoreSize(Arg.getType()).isZero()) @@ -1194,7 +1215,7 @@ bool AArch64CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, // Finally we can copy the returned value back into its virtual-register. In // symmetry with the arguments, the physical register must be an // implicit-define of the call instruction. 
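  // With the sret-demotion support added in this change, the copy-back below
  // is now guarded by Info.CanLowerReturn: when the return value does not
  // fit in the return registers, nothing is copied out of physical registers
  // here; the callee writes through the hidden sret pointer instead, and
  // insertSRetLoads further down reloads the value from the demoted stack
  // slot once the call sequence is complete.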
- if (!Info.OrigRet.Ty->isVoidTy()) { + if (Info.CanLowerReturn && !Info.OrigRet.Ty->isVoidTy()) { CCAssignFn *RetAssignFn = TLI.CCAssignFnForReturn(Info.CallConv); CallReturnHandler Handler(MIRBuilder, MRI, MIB); bool UsingReturnedArg = @@ -1226,6 +1247,10 @@ bool AArch64CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, .addImm(Assigner.StackOffset) .addImm(CalleePopBytes); + if (!Info.CanLowerReturn) { + insertSRetLoads(MIRBuilder, Info.OrigRet.Ty, Info.OrigRet.Regs, + Info.DemoteRegister, Info.DemoteStackIndex); + } return true; } diff --git a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.h b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.h index aafb1d19640a..cbdf77f69a63 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.h +++ b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.h @@ -35,6 +35,10 @@ public: ArrayRef<Register> VRegs, FunctionLoweringInfo &FLI, Register SwiftErrorVReg) const override; + bool canLowerReturn(MachineFunction &MF, CallingConv::ID CallConv, + SmallVectorImpl<BaseArgInfo> &Outs, + bool IsVarArg) const override; + bool fallBackToDAGISel(const MachineFunction &MF) const override; bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F, diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp index 9a65687735fe..eb8d0552173d 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp @@ -1710,11 +1710,6 @@ bool AArch64InstructionSelector::selectCompareBranch( MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) { Register CondReg = I.getOperand(0).getReg(); MachineInstr *CCMI = MRI.getVRegDef(CondReg); - if (CCMI->getOpcode() == TargetOpcode::G_TRUNC) { - CondReg = CCMI->getOperand(1).getReg(); - CCMI = MRI.getVRegDef(CondReg); - } - // Try to select the G_BRCOND using whatever is feeding the condition if // possible. unsigned CCMIOpc = CCMI->getOpcode(); @@ -3346,12 +3341,6 @@ bool AArch64InstructionSelector::select(MachineInstr &I) { case TargetOpcode::G_SELECT: { auto &Sel = cast<GSelect>(I); - if (MRI.getType(Sel.getCondReg()) != LLT::scalar(1)) { - LLVM_DEBUG(dbgs() << "G_SELECT cond has type: " << Ty - << ", expected: " << LLT::scalar(1) << '\n'); - return false; - } - const Register CondReg = Sel.getCondReg(); const Register TReg = Sel.getTrueReg(); const Register FReg = Sel.getFalseReg(); @@ -4777,12 +4766,6 @@ static bool canEmitConjunction(Register Val, bool &CanNegate, bool &MustBeFirst, return false; MachineInstr *ValDef = MRI.getVRegDef(Val); unsigned Opcode = ValDef->getOpcode(); - if (Opcode == TargetOpcode::G_TRUNC) { - // Look through a trunc. - Val = ValDef->getOperand(1).getReg(); - ValDef = MRI.getVRegDef(Val); - Opcode = ValDef->getOpcode(); - } if (isa<GAnyCmp>(ValDef)) { CanNegate = true; MustBeFirst = false; @@ -4870,12 +4853,6 @@ MachineInstr *AArch64InstructionSelector::emitConjunctionRec( auto &MRI = *MIB.getMRI(); MachineInstr *ValDef = MRI.getVRegDef(Val); unsigned Opcode = ValDef->getOpcode(); - if (Opcode == TargetOpcode::G_TRUNC) { - // Look through a trunc. - Val = ValDef->getOperand(1).getReg(); - ValDef = MRI.getVRegDef(Val); - Opcode = ValDef->getOpcode(); - } if (auto *Cmp = dyn_cast<GAnyCmp>(ValDef)) { Register LHS = Cmp->getLHSReg(); Register RHS = Cmp->getRHSReg(); @@ -5026,31 +5003,17 @@ bool AArch64InstructionSelector::tryOptSelect(GSelect &I) { // First, check if the condition is defined by a compare. 
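  // (The loop this patch deletes below walked through COPY and G_TRUNC to
  // find the compare. With s1 no longer a legal type, select conditions stay
  // s32 and compares feed G_SELECT directly, so only the single def of the
  // condition register needs inspecting, together with the one-use check.)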
MachineInstr *CondDef = MRI.getVRegDef(I.getOperand(1).getReg()); - while (CondDef) { - // We can only fold if all of the defs have one use. - Register CondDefReg = CondDef->getOperand(0).getReg(); - if (!MRI.hasOneNonDBGUse(CondDefReg)) { - // Unless it's another select. - for (const MachineInstr &UI : MRI.use_nodbg_instructions(CondDefReg)) { - if (CondDef == &UI) - continue; - if (UI.getOpcode() != TargetOpcode::G_SELECT) - return false; - } - } - - // We can skip over G_TRUNC since the condition is 1-bit. - // Truncating/extending can have no impact on the value. - unsigned Opc = CondDef->getOpcode(); - if (Opc != TargetOpcode::COPY && Opc != TargetOpcode::G_TRUNC) - break; - - // Can't see past copies from physregs. - if (Opc == TargetOpcode::COPY && - Register::isPhysicalRegister(CondDef->getOperand(1).getReg())) - return false; - CondDef = MRI.getVRegDef(CondDef->getOperand(1).getReg()); + // We can only fold if all of the defs have one use. + Register CondDefReg = CondDef->getOperand(0).getReg(); + if (!MRI.hasOneNonDBGUse(CondDefReg)) { + // Unless it's another select. + for (const MachineInstr &UI : MRI.use_nodbg_instructions(CondDefReg)) { + if (CondDef == &UI) + continue; + if (UI.getOpcode() != TargetOpcode::G_SELECT) + return false; + } } // Is the condition defined by a compare? diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index 74ec9373ce9e..d3617b87a851 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -42,7 +42,6 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) : ST(&ST) { using namespace TargetOpcode; const LLT p0 = LLT::pointer(0, 64); - const LLT s1 = LLT::scalar(1); const LLT s8 = LLT::scalar(8); const LLT s16 = LLT::scalar(16); const LLT s32 = LLT::scalar(32); @@ -80,7 +79,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) const LLT &MinFPScalar = HasFP16 ? 
s16 : s32; getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE}) - .legalFor({p0, s1, s8, s16, s32, s64}) + .legalFor({p0, s8, s16, s32, s64}) .legalFor(PackedVectorAllTypeList) .widenScalarToNextPow2(0) .clampScalar(0, s8, s64) @@ -198,8 +197,9 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) getActionDefinitionsBuilder( {G_SADDE, G_SSUBE, G_UADDE, G_USUBE, G_SADDO, G_SSUBO, G_UADDO, G_USUBO}) - .legalFor({{s32, s1}, {s64, s1}}) + .legalFor({{s32, s32}, {s64, s32}}) .clampScalar(0, s32, s64) + .clampScalar(1, s32, s64) .widenScalarToNextPow2(0); getActionDefinitionsBuilder({G_FADD, G_FSUB, G_FMUL, G_FDIV, G_FNEG}) @@ -241,7 +241,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) getActionDefinitionsBuilder(G_INSERT) .legalIf(all(typeInSet(0, {s32, s64, p0}), - typeInSet(1, {s1, s8, s16, s32}), smallerThan(1, 0))) + typeInSet(1, {s8, s16, s32}), smallerThan(1, 0))) .widenScalarToNextPow2(0) .clampScalar(0, s32, s64) .widenScalarToNextPow2(1) @@ -260,8 +260,15 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) .maxScalarIf(typeInSet(1, {s64, p0}), 0, s32) .maxScalarIf(typeInSet(1, {s128}), 0, s64); - getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) - .lowerIf(atomicOrderingAtLeastOrStrongerThan(0, AtomicOrdering::Unordered)) + + for (unsigned Op : {G_SEXTLOAD, G_ZEXTLOAD}) { + auto &Actions = getActionDefinitionsBuilder(Op); + + if (Op == G_SEXTLOAD) + Actions.lowerIf(atomicOrderingAtLeastOrStrongerThan(0, AtomicOrdering::Unordered)); + + // Atomics have zero extending behavior. + Actions .legalForTypesWithMemDesc({{s32, p0, s8, 8}, {s32, p0, s16, 8}, {s32, p0, s32, 8}, @@ -278,6 +285,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) .unsupportedIfMemSizeNotPow2() // Lower anything left over into G_*EXT and G_LOAD .lower(); + } auto IsPtrVecPred = [=](const LegalityQuery &Query) { const LLT &ValTy = Query.Types[0]; @@ -425,10 +433,6 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) const LLT &SrcTy = Query.Types[1]; - // Special case for s1. - if (SrcTy == s1) - return true; - // Make sure we fit in a register otherwise. Don't bother checking that // the source type is below 128 bits. We shouldn't be allowing anything // through which is wider than the destination in the first place. 
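The hunk that follows pins boolean-consuming operations to s32 conditions now that s1 is no longer treated as a first-class legal type. A minimal, hypothetical sketch of the rule-builder idiom involved (illustrative only, not code from this patch; the Example* name is invented):

// ExampleLegalizerInfo.cpp - sketch of the s32-condition legalization idiom.
#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
#include "llvm/CodeGen/TargetOpcodes.h"

namespace {
struct ExampleLegalizerInfo : public llvm::LegalizerInfo {
  ExampleLegalizerInfo() {
    using namespace llvm;
    const LLT s32 = LLT::scalar(32);
    const LLT s64 = LLT::scalar(64);
    const LLT p0 = LLT::pointer(0, 64);

    // Conditions are pinned to s32; selection never sees an s1 value.
    getActionDefinitionsBuilder(TargetOpcode::G_BRCOND)
        .legalFor({s32})
        .clampScalar(0, s32, s32);

    getActionDefinitionsBuilder(TargetOpcode::G_SELECT)
        .legalFor({{s32, s32}, {s64, s32}, {p0, s32}}) // {value, condition}
        .clampScalar(0, s32, s64)  // type index 0: the selected value
        .clampScalar(1, s32, s32); // type index 1: the condition

    getLegacyLegalizerInfo().computeTables();
  }
};
} // end anonymous namespace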
@@ -481,13 +485,16 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) .widenScalarToNextPow2(0); // Control-flow - getActionDefinitionsBuilder(G_BRCOND).legalFor({s1, s8, s16, s32}); + getActionDefinitionsBuilder(G_BRCOND) + .legalFor({s32}) + .clampScalar(0, s32, s32); getActionDefinitionsBuilder(G_BRINDIRECT).legalFor({p0}); getActionDefinitionsBuilder(G_SELECT) - .legalFor({{s32, s1}, {s64, s1}, {p0, s1}}) + .legalFor({{s32, s32}, {s64, s32}, {p0, s32}}) .widenScalarToNextPow2(0) .clampScalar(0, s32, s64) + .clampScalar(1, s32, s32) .minScalarEltSameAsIf(all(isVector(0), isVector(1)), 1, 0) .lowerIf(isVector(0)); @@ -500,7 +507,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) getActionDefinitionsBuilder(G_GLOBAL_VALUE).legalFor({p0}); getActionDefinitionsBuilder(G_PTRTOINT) - .legalForCartesianProduct({s1, s8, s16, s32, s64}, {p0}) + .legalForCartesianProduct({s8, s16, s32, s64}, {p0}) .legalFor({{v2s64, v2p0}}) .maxScalar(0, s64) .widenScalarToNextPow2(0, /*Min*/ 8); @@ -517,7 +524,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) // FIXME: This is wrong since G_BITCAST is not allowed to change the // number of bits but it's what the previous code described and fixing // it breaks tests. - .legalForCartesianProduct({s1, s8, s16, s32, s64, s128, v16s8, v8s8, v4s8, + .legalForCartesianProduct({s8, s16, s32, s64, s128, v16s8, v8s8, v4s8, v8s16, v4s16, v2s16, v4s32, v2s32, v2s64, v2p0}); diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp index 2901e5c0fe4d..bd0a497fa441 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp @@ -43,11 +43,9 @@ namespace { class AArch64MCCodeEmitter : public MCCodeEmitter { MCContext &Ctx; - const MCInstrInfo &MCII; public: - AArch64MCCodeEmitter(const MCInstrInfo &mcii, MCContext &ctx) - : Ctx(ctx), MCII(mcii) {} + AArch64MCCodeEmitter(const MCInstrInfo &, MCContext &ctx) : Ctx(ctx) {} AArch64MCCodeEmitter(const AArch64MCCodeEmitter &) = delete; void operator=(const AArch64MCCodeEmitter &) = delete; ~AArch64MCCodeEmitter() override = default; @@ -193,12 +191,6 @@ public: uint32_t encodeMatrixIndexGPR32(const MCInst &MI, unsigned OpIdx, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const; - -private: - FeatureBitset computeAvailableFeatures(const FeatureBitset &FB) const; - void - verifyInstructionPredicates(const MCInst &MI, - const FeatureBitset &AvailableFeatures) const; }; } // end anonymous namespace @@ -618,9 +610,6 @@ unsigned AArch64MCCodeEmitter::fixMOVZ(const MCInst &MI, unsigned EncodedValue, void AArch64MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const { - verifyInstructionPredicates(MI, - computeAvailableFeatures(STI.getFeatureBits())); - if (MI.getOpcode() == AArch64::TLSDESCCALL) { // This is a directive which applies an R_AARCH64_TLSDESC_CALL to the // following (BLR) instruction. 
It doesn't emit any code itself so it @@ -674,7 +663,6 @@ unsigned AArch64MCCodeEmitter::fixOneOperandFPComparison( return EncodedValue; } -#define ENABLE_INSTR_PREDICATE_VERIFIER #include "AArch64GenMCCodeEmitter.inc" MCCodeEmitter *llvm::createAArch64MCCodeEmitter(const MCInstrInfo &MCII, diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp index 34e3b2cf58e4..f129bfe11e4d 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp @@ -34,6 +34,7 @@ using namespace llvm; #define GET_INSTRINFO_MC_DESC #define GET_INSTRINFO_MC_HELPERS +#define ENABLE_INSTR_PREDICATE_VERIFIER #include "AArch64GenInstrInfo.inc" #define GET_SUBTARGETINFO_MC_DESC diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h index 049c49796dc6..7d1de3e53c0c 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h @@ -33,6 +33,7 @@ class MCSubtargetInfo; class MCTargetOptions; class MCTargetStreamer; class Target; +class FeatureBitset; MCCodeEmitter *createAArch64MCCodeEmitter(const MCInstrInfo &MCII, MCContext &Ctx); diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td index 2744e81f99f1..cb36aa26e839 100644 --- a/llvm/lib/Target/AArch64/SMEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td @@ -227,6 +227,40 @@ class sme_add_vector_to_tile_u64<bit V, string mnemonic> let Inst{2-0} = ZAda; } +class sme_add_vector_to_tile_pseudo<ZPRRegOp zpr_ty> + : Pseudo<(outs), + (ins i64imm:$tile, PPR3bAny:$Pn, PPR3bAny:$Pm, zpr_ty:$Zn), []>, + Sched<[]> { + // Translated to the actual instructions in AArch64ISelLowering.cpp + let usesCustomInserter = 1; +} + +def ADDHA_MPPZ_PSEUDO_S : sme_add_vector_to_tile_pseudo<ZPR32>; +def ADDVA_MPPZ_PSEUDO_S : sme_add_vector_to_tile_pseudo<ZPR32>; + +def : Pat<(int_aarch64_sme_addha + imm0_3:$tile, (nxv4i1 PPR3bAny:$pn), (nxv4i1 PPR3bAny:$pm), + (nxv4i32 ZPR32:$zn)), + (ADDHA_MPPZ_PSEUDO_S imm0_3:$tile, $pn, $pm, $zn)>; +def : Pat<(int_aarch64_sme_addva + imm0_3:$tile, (nxv4i1 PPR3bAny:$pn), (nxv4i1 PPR3bAny:$pm), + (nxv4i32 ZPR32:$zn)), + (ADDVA_MPPZ_PSEUDO_S imm0_3:$tile, $pn, $pm, $zn)>; + +let Predicates = [HasSMEI64] in { +def ADDHA_MPPZ_PSEUDO_D : sme_add_vector_to_tile_pseudo<ZPR64>; +def ADDVA_MPPZ_PSEUDO_D : sme_add_vector_to_tile_pseudo<ZPR64>; + +def : Pat<(int_aarch64_sme_addha + imm0_7:$tile, (nxv2i1 PPR3bAny:$pn), (nxv2i1 PPR3bAny:$pm), + (nxv2i64 ZPR64:$zn)), + (ADDHA_MPPZ_PSEUDO_D imm0_7:$tile, $pn, $pm, $zn)>; +def : Pat<(int_aarch64_sme_addva + imm0_7:$tile, (nxv2i1 PPR3bAny:$pn), (nxv2i1 PPR3bAny:$pm), + (nxv2i64 ZPR64:$zn)), + (ADDVA_MPPZ_PSEUDO_D imm0_7:$tile, $pn, $pm, $zn)>; +} + //===----------------------------------------------------------------------===// // SME Contiguous Loads //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td index 3631536a32b9..7cdd4c4af95e 100644 --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -650,11 +650,11 @@ multiclass sve_int_pfalse<bits<6> opc, string asm> { def : Pat<(nxv1i1 immAllZerosV), (!cast<Instruction>(NAME))>; } -class sve_int_ptest<bits<6> opc, string asm> +class 
sve_int_ptest<bits<6> opc, string asm, SDPatternOperator op> : I<(outs), (ins PPRAny:$Pg, PPR8:$Pn), asm, "\t$Pg, $Pn", "", - []>, Sched<[]> { + [(op (nxv16i1 PPRAny:$Pg), (nxv16i1 PPR8:$Pn))]>, Sched<[]> { bits<4> Pg; bits<4> Pn; let Inst{31-24} = 0b00100101; @@ -1691,6 +1691,9 @@ multiclass sve_int_pred_log<bits<4> opc, string asm, SDPatternOperator op, !cast<Instruction>(NAME), PTRUE_S>; def : SVE_2_Op_AllActive_Pat<nxv2i1, op_nopred, nxv2i1, nxv2i1, !cast<Instruction>(NAME), PTRUE_D>; + // Emulate .Q operation using a PTRUE_D when the other lanes don't matter. + def : SVE_2_Op_AllActive_Pat<nxv1i1, op_nopred, nxv1i1, nxv1i1, + !cast<Instruction>(NAME), PTRUE_D>; } // An instance of sve_int_pred_log_and but uses op_nopred's first operand as the @@ -1706,6 +1709,9 @@ multiclass sve_int_pred_log_v2<bits<4> opc, string asm, SDPatternOperator op, (!cast<Instruction>(NAME) $Op1, $Op1, $Op2)>; def : Pat<(nxv2i1 (op_nopred nxv2i1:$Op1, nxv2i1:$Op2)), (!cast<Instruction>(NAME) $Op1, $Op1, $Op2)>; + // Emulate .Q operation using a PTRUE_D when the other lanes don't matter. + def : Pat<(nxv1i1 (op_nopred nxv1i1:$Op1, nxv1i1:$Op2)), + (!cast<Instruction>(NAME) $Op1, $Op1, $Op2)>; } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h index 71303611265c..cf8891cff1b3 100644 --- a/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h +++ b/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h @@ -343,7 +343,8 @@ struct SysAlias { : Name(N), Encoding(E), FeaturesRequired(F) {} bool haveFeatures(FeatureBitset ActiveFeatures) const { - return (FeaturesRequired & ActiveFeatures) == FeaturesRequired; + return ActiveFeatures[llvm::AArch64::FeatureAll] || + (FeaturesRequired & ActiveFeatures) == FeaturesRequired; } FeatureBitset getRequiredFeatures() const { return FeaturesRequired; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index c4680cbedadf..91dc611fb265 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -317,6 +317,9 @@ extern char &SIFormMemoryClausesID; void initializeSIPostRABundlerPass(PassRegistry&); extern char &SIPostRABundlerID; +void initializeGCNCreateVOPDPass(PassRegistry &); +extern char &GCNCreateVOPDID; + void initializeAMDGPUUnifyDivergentExitNodesPass(PassRegistry&); extern char &AMDGPUUnifyDivergentExitNodesID; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp index 94d7844e8a32..a8108b1d637b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -626,13 +626,13 @@ bool AMDGPUCodeGenPrepare::foldBinOpIntoSelect(BinaryOperator &BO) const { Constant *FoldedT = SelOpNo ? ConstantFoldBinaryOpOperands(BO.getOpcode(), CBO, CT, *DL) : ConstantFoldBinaryOpOperands(BO.getOpcode(), CT, CBO, *DL); - if (isa<ConstantExpr>(FoldedT)) + if (!FoldedT || isa<ConstantExpr>(FoldedT)) return false; Constant *FoldedF = SelOpNo ? 
ConstantFoldBinaryOpOperands(BO.getOpcode(), CBO, CF, *DL) : ConstantFoldBinaryOpOperands(BO.getOpcode(), CF, CBO, *DL); - if (isa<ConstantExpr>(FoldedF)) + if (!FoldedF || isa<ConstantExpr>(FoldedF)) return false; IRBuilder<> Builder(&BO); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index b00df27f5fd3..589992c7a7ec 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -1883,20 +1883,24 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr, return true; } +// Match an immediate (if Imm is true) or an SGPR (if Imm is false) +// offset. If Imm32Only is true, match only 32-bit immediate offsets +// available on CI. bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode, - SDValue &Offset, bool &Imm) const { + SDValue &Offset, bool Imm, + bool Imm32Only) const { ConstantSDNode *C = dyn_cast<ConstantSDNode>(ByteOffsetNode); if (!C) { + if (Imm) + return false; if (ByteOffsetNode.getValueType().isScalarInteger() && ByteOffsetNode.getValueType().getSizeInBits() == 32) { Offset = ByteOffsetNode; - Imm = false; return true; } if (ByteOffsetNode.getOpcode() == ISD::ZERO_EXTEND) { if (ByteOffsetNode.getOperand(0).getValueType().getSizeInBits() == 32) { Offset = ByteOffsetNode.getOperand(0); - Imm = false; return true; } } @@ -1908,9 +1912,8 @@ bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode, int64_t ByteOffset = C->getSExtValue(); Optional<int64_t> EncodedOffset = AMDGPU::getSMRDEncodedOffset(*Subtarget, ByteOffset, false); - if (EncodedOffset) { + if (EncodedOffset && Imm && !Imm32Only) { Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32); - Imm = true; return true; } @@ -1919,7 +1922,7 @@ bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode, return false; EncodedOffset = AMDGPU::getSMRDEncodedLiteralOffset32(*Subtarget, ByteOffset); - if (EncodedOffset) { + if (EncodedOffset && Imm32Only) { Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32); return true; } @@ -1927,11 +1930,14 @@ bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode, if (!isUInt<32>(ByteOffset) && !isInt<32>(ByteOffset)) return false; - SDValue C32Bit = CurDAG->getTargetConstant(ByteOffset, SL, MVT::i32); - Offset = SDValue( - CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, C32Bit), 0); + if (!Imm) { + SDValue C32Bit = CurDAG->getTargetConstant(ByteOffset, SL, MVT::i32); + Offset = SDValue( + CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, C32Bit), 0); + return true; + } - return true; + return false; } SDValue AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr) const { @@ -1959,8 +1965,12 @@ SDValue AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr) const { Ops), 0); } +// Match a base and an immediate (if Imm is true) or an SGPR +// (if Imm is false) offset. If Imm32Only is true, match only 32-bit +// immediate offsets available on CI. 
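// With this refactor the caller states up front which operand form it wants,
// instead of being told afterwards through a bool &Imm out-parameter:
// SelectSMRDImm passes Imm=true, SelectSMRDImm32 passes Imm=true with
// Imm32Only=true, and SelectSMRDSgpr passes Imm=false, so each complex
// pattern matches exactly the encoding it is able to emit.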
bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase, - SDValue &Offset, bool &Imm) const { + SDValue &Offset, bool Imm, + bool Imm32Only) const { SDLoc SL(Addr); // A 32-bit (address + offset) should not cause unsigned 32-bit integer @@ -1977,41 +1987,34 @@ bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase, assert(N0 && N1 && isa<ConstantSDNode>(N1)); } if (N0 && N1) { - if (SelectSMRDOffset(N1, Offset, Imm)) { + if (SelectSMRDOffset(N1, Offset, Imm, Imm32Only)) { SBase = Expand32BitAddress(N0); return true; } } + return false; } + if (!Imm) + return false; SBase = Expand32BitAddress(Addr); Offset = CurDAG->getTargetConstant(0, SL, MVT::i32); - Imm = true; return true; } bool AMDGPUDAGToDAGISel::SelectSMRDImm(SDValue Addr, SDValue &SBase, SDValue &Offset) const { - bool Imm = false; - return SelectSMRD(Addr, SBase, Offset, Imm) && Imm; + return SelectSMRD(Addr, SBase, Offset, /* Imm */ true); } bool AMDGPUDAGToDAGISel::SelectSMRDImm32(SDValue Addr, SDValue &SBase, SDValue &Offset) const { - assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS); - - bool Imm = false; - if (!SelectSMRD(Addr, SBase, Offset, Imm)) - return false; - - return !Imm && isa<ConstantSDNode>(Offset); + return SelectSMRD(Addr, SBase, Offset, /* Imm */ true, /* Imm32Only */ true); } bool AMDGPUDAGToDAGISel::SelectSMRDSgpr(SDValue Addr, SDValue &SBase, SDValue &Offset) const { - bool Imm = false; - return SelectSMRD(Addr, SBase, Offset, Imm) && !Imm && - !isa<ConstantSDNode>(Offset); + return SelectSMRD(Addr, SBase, Offset, /* Imm */ false); } bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm(SDValue Addr, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h index 862be9dc5568..7894b8eb5b67 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h @@ -193,11 +193,11 @@ private: bool SelectScratchSVAddr(SDNode *N, SDValue Addr, SDValue &VAddr, SDValue &SAddr, SDValue &Offset) const; - bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue &Offset, - bool &Imm) const; + bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue &Offset, bool Imm, + bool Imm32Only) const; SDValue Expand32BitAddress(SDValue Addr) const; - bool SelectSMRD(SDValue Addr, SDValue &SBase, SDValue &Offset, - bool &Imm) const; + bool SelectSMRD(SDValue Addr, SDValue &SBase, SDValue &Offset, bool Imm, + bool Imm32Only = false) const; bool SelectSMRDImm(SDValue Addr, SDValue &SBase, SDValue &Offset) const; bool SelectSMRDImm32(SDValue Addr, SDValue &SBase, SDValue &Offset) const; bool SelectSMRDSgpr(SDValue Addr, SDValue &SBase, SDValue &Offset) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index ef7929012597..bf520a560404 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -4803,6 +4803,8 @@ AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { case AtomicRMWInst::Nand: case AtomicRMWInst::FAdd: case AtomicRMWInst::FSub: + case AtomicRMWInst::FMax: + case AtomicRMWInst::FMin: return AtomicExpansionKind::CmpXChg; default: return AtomicExpansionKind::None; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 3f242fdb6d8e..70fae9d784a2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -1180,7 +1180,7 @@ bool 
AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const { getIConstantVRegValWithLookThrough(I.getOperand(2).getReg(), *MRI); if (Arg) { - const int64_t Value = Arg.getValue().Value.getSExtValue(); + const int64_t Value = Arg.value().Value.getSExtValue(); if (Value == 0) { unsigned Opcode = Is64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32; BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0); @@ -3235,7 +3235,7 @@ static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg) { // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0) const MachineInstr *Def = getDefIgnoringCopies(Reg, MRI); if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES) - return false; + return Register(); if (mi_match(Def->getOperand(2).getReg(), MRI, m_ZeroInt())) { return Def->getOperand(1).getReg(); @@ -3851,27 +3851,36 @@ AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const { getAddrModeInfo(*MI, *MRI, AddrInfo); // FIXME: We should shrink the GEP if the offset is known to be <= 32-bits, - // then we can select all ptr + 32-bit offsets not just immediate offsets. - if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1) + // then we can select all ptr + 32-bit offsets. + if (AddrInfo.empty()) return None; const GEPInfo &GEPInfo = AddrInfo[0]; + Register PtrReg = GEPInfo.SgprParts[0]; + // SGPR offset is unsigned. - if (!GEPInfo.Imm || GEPInfo.Imm < 0 || !isUInt<32>(GEPInfo.Imm)) - return None; + if (AddrInfo[0].SgprParts.size() == 1 && isUInt<32>(GEPInfo.Imm) && + GEPInfo.Imm != 0) { + // If we make it this far we have a load with an 32-bit immediate offset. + // It is OK to select this using a sgpr offset, because we have already + // failed trying to select this load into one of the _IMM variants since + // the _IMM Patterns are considered before the _SGPR patterns. + Register OffsetReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); + BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), OffsetReg) + .addImm(GEPInfo.Imm); + return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); }, + [=](MachineInstrBuilder &MIB) { MIB.addReg(OffsetReg); }}}; + } - // If we make it this far we have a load with an 32-bit immediate offset. - // It is OK to select this using a sgpr offset, because we have already - // failed trying to select this load into one of the _IMM variants since - // the _IMM Patterns are considered before the _SGPR patterns. 
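  // The rewritten body handles two address shapes: a single SGPR base plus a
  // non-zero offset fitting in 32 bits (materialized into an SGPR with
  // S_MOV_B32, as before), and, newly, a base plus a zero-extended 32-bit
  // SGPR offset recognized by matchZeroExtendFromS32, whose failure path was
  // fixed above to return the invalid Register() rather than 'false'.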
- Register PtrReg = GEPInfo.SgprParts[0]; - Register OffsetReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); - BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), OffsetReg) - .addImm(GEPInfo.Imm); - return {{ - [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); }, - [=](MachineInstrBuilder &MIB) { MIB.addReg(OffsetReg); } - }}; + if (AddrInfo[0].SgprParts.size() == 2 && GEPInfo.Imm == 0) { + if (Register OffsetReg = + matchZeroExtendFromS32(*MRI, GEPInfo.SgprParts[1])) { + return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); }, + [=](MachineInstrBuilder &MIB) { MIB.addReg(OffsetReg); }}}; + } + } + + return None; } std::pair<Register, int> @@ -4231,7 +4240,7 @@ AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const { }, [=](MachineInstrBuilder &MIB) { // vaddr if (FI) - MIB.addFrameIndex(FI.getValue()); + MIB.addFrameIndex(FI.value()); else MIB.addReg(VAddr); }, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td index 31012915457b..26e6b9a10688 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -542,63 +542,37 @@ def atomic_store_64_#as : PatFrag<(ops node:$ptr, node:$val), } } // End foreach as -// TODO: Add GISelPredicateCode for the ret and noret PatFrags once -// GlobalISelEmitter allows pattern matches where src and dst def count -// mismatch. - -multiclass ret_noret_op { - let PredicateCode = [{ return !(SDValue(N, 0).use_empty()); }], - GISelPredicateCode = [{ return true; }] in { - def "_ret" : PatFrag<(ops node:$ptr, node:$data), - (!cast<SDPatternOperator>(NAME) node:$ptr, node:$data)>; - } - - let PredicateCode = [{ return (SDValue(N, 0).use_empty()); }], - GISelPredicateCode = [{ return false; }] in { - def "_noret" : PatFrag<(ops node:$ptr, node:$data), - (!cast<SDPatternOperator>(NAME) node:$ptr, node:$data)>; - } +multiclass noret_op { + let HasNoUse = true in + def "_noret" : PatFrag<(ops node:$ptr, node:$data), + (!cast<SDPatternOperator>(NAME) node:$ptr, node:$data)>; } -defm int_amdgcn_flat_atomic_fadd : ret_noret_op; -defm int_amdgcn_flat_atomic_fadd_v2bf16 : ret_noret_op; -defm int_amdgcn_flat_atomic_fmin : ret_noret_op; -defm int_amdgcn_flat_atomic_fmax : ret_noret_op; -defm int_amdgcn_global_atomic_fadd : ret_noret_op; -defm int_amdgcn_global_atomic_fadd_v2bf16 : ret_noret_op; -defm int_amdgcn_global_atomic_fmin : ret_noret_op; -defm int_amdgcn_global_atomic_fmax : ret_noret_op; -defm int_amdgcn_ds_fadd_v2bf16 : ret_noret_op; - -multiclass ret_noret_binary_atomic_op<SDNode atomic_op, bit IsInt = 1> { - let PredicateCode = [{ return (SDValue(N, 0).use_empty()); }], - GISelPredicateCode = [{ return false; }] in { - defm "_noret" : binary_atomic_op<atomic_op, IsInt>; - } +defm int_amdgcn_flat_atomic_fadd : noret_op; +defm int_amdgcn_flat_atomic_fadd_v2bf16 : noret_op; +defm int_amdgcn_flat_atomic_fmin : noret_op; +defm int_amdgcn_flat_atomic_fmax : noret_op; +defm int_amdgcn_global_atomic_fadd : noret_op; +defm int_amdgcn_global_atomic_fadd_v2bf16 : noret_op; +defm int_amdgcn_global_atomic_fmin : noret_op; +defm int_amdgcn_global_atomic_fmax : noret_op; +defm int_amdgcn_ds_fadd_v2bf16 : noret_op; - let PredicateCode = [{ return !(SDValue(N, 0).use_empty()); }], - GISelPredicateCode = [{ return true; }] in { - defm "_ret" : binary_atomic_op<atomic_op, IsInt>; - } +multiclass noret_binary_atomic_op<SDNode atomic_op, bit IsInt = 1> { + let HasNoUse = true in + defm "_noret" : binary_atomic_op<atomic_op, 
IsInt>; }

-multiclass ret_noret_ternary_atomic_op<SDNode atomic_op> {
-  let PredicateCode = [{ return (SDValue(N, 0).use_empty()); }],
-      GISelPredicateCode = [{ return false; }] in {
-    defm "_noret" : ternary_atomic_op<atomic_op>;
-  }
-
-  let PredicateCode = [{ return !(SDValue(N, 0).use_empty()); }],
-      GISelPredicateCode = [{ return true; }] in {
-    defm "_ret" : ternary_atomic_op<atomic_op>;
-  }
+multiclass noret_ternary_atomic_op<SDNode atomic_op> {
+  let HasNoUse = true in
+  defm "_noret" : ternary_atomic_op<atomic_op>;
 }

 multiclass binary_atomic_op_all_as<SDNode atomic_op, bit IsInt = 1> {
   foreach as = [ "global", "flat", "constant", "local", "private", "region" ] in {
     let AddressSpaces = !cast<AddressSpaceList>("LoadAddress_"#as).AddrSpaces in {
       defm "_"#as : binary_atomic_op<atomic_op, IsInt>;
-      defm "_"#as : ret_noret_binary_atomic_op<atomic_op, IsInt>;
+      defm "_"#as : noret_binary_atomic_op<atomic_op, IsInt>;
     }
   }
 }
@@ -640,13 +614,15 @@ def store_align16_local: PatFrag<(ops node:$val, node:$ptr),

 let AddressSpaces = StoreAddress_local.AddrSpaces in {
 defm atomic_cmp_swap_local : ternary_atomic_op<atomic_cmp_swap>;
-defm atomic_cmp_swap_local : ret_noret_ternary_atomic_op<atomic_cmp_swap>;
-defm atomic_cmp_swap_local_m0 : ret_noret_ternary_atomic_op<atomic_cmp_swap_glue>;
+defm atomic_cmp_swap_local : noret_ternary_atomic_op<atomic_cmp_swap>;
+defm atomic_cmp_swap_local_m0 : noret_ternary_atomic_op<atomic_cmp_swap_glue>;
+defm atomic_cmp_swap_local_m0 : ternary_atomic_op<atomic_cmp_swap_glue>;
 }

 let AddressSpaces = StoreAddress_region.AddrSpaces in {
-defm atomic_cmp_swap_region : ret_noret_ternary_atomic_op<atomic_cmp_swap>;
-defm atomic_cmp_swap_region_m0 : ret_noret_ternary_atomic_op<atomic_cmp_swap_glue>;
+defm atomic_cmp_swap_region : noret_ternary_atomic_op<atomic_cmp_swap>;
+defm atomic_cmp_swap_region_m0 : noret_ternary_atomic_op<atomic_cmp_swap_glue>;
+defm atomic_cmp_swap_region_m0 : ternary_atomic_op<atomic_cmp_swap_glue>;
 }

 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
index ed6ddbf426fd..38e04dedd9fc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
@@ -171,6 +171,10 @@ const MCExpr *AMDGPUAsmPrinter::lowerConstant(const Constant *CV) {
 }

 void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) {
+  // FIXME: Enable feature predicate checks once all the tests pass.
+ // AMDGPU_MC::verifyInstructionPredicates(MI->getOpcode(), + // getSubtargetInfo().getFeatureBits()); + if (emitPseudoExpansionLowering(*OutStreamer, MI)) return; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h index 1b513c456307..745734aac2b4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h @@ -131,8 +131,8 @@ public: bool IsAOneAddressSpace = isOneAddressSpace(A); bool IsBOneAddressSpace = isOneAddressSpace(B); - return AIO.getValue() >= BIO.getValue() && - (IsAOneAddressSpace == IsBOneAddressSpace || !IsAOneAddressSpace); + return AIO.value() >= BIO.value() && + (IsAOneAddressSpace == IsBOneAddressSpace || !IsAOneAddressSpace); } }; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index 77816a783630..6bd906439ee8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -40,9 +40,9 @@ using namespace llvm; #include "AMDGPUGenSubtargetInfo.inc" #undef AMDGPUSubtarget -static cl::opt<bool> DisablePowerSched( - "amdgpu-disable-power-sched", - cl::desc("Disable scheduling to minimize mAI power bursts"), +static cl::opt<bool> EnablePowerSched( + "amdgpu-enable-power-sched", + cl::desc("Enable scheduling to minimize mAI power bursts"), cl::init(false)); static cl::opt<bool> EnableVGPRIndexMode( @@ -916,7 +916,7 @@ struct FillMFMAShadowMutation : ScheduleDAGMutation { void apply(ScheduleDAGInstrs *DAGInstrs) override { const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>(); - if (!ST.hasMAIInsts() || DisablePowerSched) + if (!ST.hasMAIInsts()) return; DAG = static_cast<ScheduleDAGMI*>(DAGInstrs); const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel(); @@ -966,7 +966,8 @@ void GCNSubtarget::getPostRAMutations( std::unique_ptr<ScheduleDAGMutation> GCNSubtarget::createFillMFMAShadowMutation(const TargetInstrInfo *TII) const { - return std::make_unique<FillMFMAShadowMutation>(&InstrInfo); + return EnablePowerSched ? std::make_unique<FillMFMAShadowMutation>(&InstrInfo) + : nullptr; } const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 1c6b9d35695a..971e44723758 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -22,11 +22,13 @@ #include "AMDGPUTargetTransformInfo.h" #include "GCNIterativeScheduler.h" #include "GCNSchedStrategy.h" +#include "GCNVOPDUtils.h" #include "R600.h" #include "R600TargetMachine.h" #include "SIMachineFunctionInfo.h" #include "SIMachineScheduler.h" #include "TargetInfo/AMDGPUTargetInfo.h" +#include "Utils/AMDGPUBaseInfo.h" #include "llvm/Analysis/CGSCCPassManager.h" #include "llvm/CodeGen/GlobalISel/CSEInfo.h" #include "llvm/CodeGen/GlobalISel/IRTranslator.h" @@ -278,6 +280,12 @@ static cl::opt<bool> cl::desc("Enable s_delay_alu insertion"), cl::init(true), cl::Hidden); +// Enable GFX11+ VOPD +static cl::opt<bool> + EnableVOPD("amdgpu-enable-vopd", + cl::desc("Enable VOPD, dual issue of VALU in wave32"), + cl::init(true), cl::Hidden); + // Option is used in lit tests to prevent deadcoding of patterns inspected. 
static cl::opt<bool> EnableDCEInRA("amdgpu-dce-in-ra", @@ -383,6 +391,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeSIPreAllocateWWMRegsPass(*PR); initializeSIFormMemoryClausesPass(*PR); initializeSIPostRABundlerPass(*PR); + initializeGCNCreateVOPDPass(*PR); initializeAMDGPUUnifyDivergentExitNodesPass(*PR); initializeAMDGPUAAWrapperPassPass(*PR); initializeAMDGPUExternalAAWrapperPass(*PR); @@ -920,6 +929,8 @@ public: DAG->addMutation(ST.createFillMFMAShadowMutation(DAG->TII)); DAG->addMutation(createIGroupLPDAGMutation()); DAG->addMutation(createSchedBarrierDAGMutation()); + if (isPassEnabled(EnableVOPD, CodeGenOpt::Less)) + DAG->addMutation(createVOPDPairingMutation()); return DAG; } @@ -1399,6 +1410,8 @@ void GCNPassConfig::addPreSched2() { } void GCNPassConfig::addPreEmitPass() { + if (isPassEnabled(EnableVOPD, CodeGenOpt::Less)) + addPass(&GCNCreateVOPDID); addPass(createSIMemoryLegalizerPass()); addPass(createSIInsertWaitcntsPass()); diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td index a087323e5de7..04dd3e938a15 100644 --- a/llvm/lib/Target/AMDGPU/BUFInstructions.td +++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td @@ -1412,10 +1412,12 @@ defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_short, i32, "BUFFER_STORE_SHORT">; multiclass BufferAtomicPat<string OpPrefix, ValueType vt, string Inst, bit isIntr = 0> { foreach RtnMode = ["ret", "noret"] in { - defvar Op = !cast<SDPatternOperator>(OpPrefix # "_" # RtnMode + defvar Op = !cast<SDPatternOperator>(OpPrefix + # !if(!eq(RtnMode, "ret"), "", "_noret") # !if(isIntr, "", "_" # vt.Size)); defvar InstSuffix = !if(!eq(RtnMode, "ret"), "_RTN", ""); + let AddedComplexity = !if(!eq(RtnMode, "ret"), 0, 1) in { def : GCNPat< (vt (Op (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset), vt:$vdata_in)), (!cast<MUBUF_Pseudo>(Inst # "_OFFSET" # InstSuffix) getVregSrcForVT<vt>.ret:$vdata_in, @@ -1428,6 +1430,7 @@ multiclass BufferAtomicPat<string OpPrefix, ValueType vt, string Inst, bit isInt (!cast<MUBUF_Pseudo>(Inst # "_ADDR64" # InstSuffix) getVregSrcForVT<vt>.ret:$vdata_in, VReg_64:$vaddr, SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset) >; + } // end let AddedComplexity } // end foreach RtnMode } @@ -1439,10 +1442,12 @@ multiclass BufferAtomicIntrPat<string OpPrefix, ValueType vt, string Inst> { multiclass BufferAtomicCmpSwapPat<ValueType vt, ValueType data_vt, string Inst> { foreach RtnMode = ["ret", "noret"] in { - defvar Op = !cast<SDPatternOperator>("AMDGPUatomic_cmp_swap_global_" # RtnMode + defvar Op = !cast<SDPatternOperator>("AMDGPUatomic_cmp_swap_global" + # !if(!eq(RtnMode, "ret"), "", "_noret") # "_" # vt.Size); defvar InstSuffix = !if(!eq(RtnMode, "ret"), "_RTN", ""); + let AddedComplexity = !if(!eq(RtnMode, "ret"), 0, 1) in { defvar OffsetResDag = (!cast<MUBUF_Pseudo>(Inst # "_OFFSET" # InstSuffix) getVregSrcForVT<data_vt>.ret:$vdata_in, SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset); @@ -1465,6 +1470,7 @@ multiclass BufferAtomicCmpSwapPat<ValueType vt, ValueType data_vt, string Inst> !if(!eq(vt, i32), sub0, sub0_sub1)), Addr64ResDag) >; + } // end let AddedComplexity } // end foreach RtnMode } @@ -1495,13 +1501,14 @@ multiclass SIBufferAtomicPat<string OpPrefix, ValueType vt, string Inst, list<string> RtnModes = ["ret", "noret"]> { foreach RtnMode = RtnModes in { - defvar Op = !cast<SDPatternOperator>(!if(!eq(RtnMode, "none"), - OpPrefix, OpPrefix # "_" # RtnMode)); - defvar InstSuffix = !if(!or(!eq(RtnMode, "none"), !eq(RtnMode, 
"ret")), - "_RTN", ""); - defvar CachePolicy = !if(!or(!eq(RtnMode, "none"), !eq(RtnMode, "ret")), + defvar Op = !cast<SDPatternOperator>(OpPrefix + # !if(!eq(RtnMode, "ret"), "", "_noret")); + + defvar InstSuffix = !if(!eq(RtnMode, "ret"), "_RTN", ""); + defvar CachePolicy = !if(!eq(RtnMode, "ret"), (set_glc $cachepolicy), (timm:$cachepolicy)); + let AddedComplexity = !if(!eq(RtnMode, "ret"), 0, 1) in { def : GCNPat< (vt (Op vt:$vdata_in, v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset, timm:$cachepolicy, 0)), @@ -1534,6 +1541,7 @@ multiclass SIBufferAtomicPat<string OpPrefix, ValueType vt, string Inst, (REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1), SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), CachePolicy) >; + } // end let AddedComplexity } // end foreach RtnMode } @@ -1551,7 +1559,7 @@ defm : SIBufferAtomicPat<"SIbuffer_atomic_or", i32, "BUFFER_ATOMIC_OR">; defm : SIBufferAtomicPat<"SIbuffer_atomic_xor", i32, "BUFFER_ATOMIC_XOR">; defm : SIBufferAtomicPat<"SIbuffer_atomic_inc", i32, "BUFFER_ATOMIC_INC">; defm : SIBufferAtomicPat<"SIbuffer_atomic_dec", i32, "BUFFER_ATOMIC_DEC">; -defm : SIBufferAtomicPat<"SIbuffer_atomic_csub", i32, "BUFFER_ATOMIC_CSUB", ["none"]>; +defm : SIBufferAtomicPat<"SIbuffer_atomic_csub", i32, "BUFFER_ATOMIC_CSUB", ["ret"]>; defm : SIBufferAtomicPat<"SIbuffer_atomic_swap", i64, "BUFFER_ATOMIC_SWAP_X2">; defm : SIBufferAtomicPat<"SIbuffer_atomic_add", i64, "BUFFER_ATOMIC_ADD_X2">; defm : SIBufferAtomicPat<"SIbuffer_atomic_sub", i64, "BUFFER_ATOMIC_SUB_X2">; @@ -1643,7 +1651,8 @@ let SubtargetPredicate = isGFX90APlus in { foreach RtnMode = ["ret", "noret"] in { -defvar Op = !cast<SDPatternOperator>(SIbuffer_atomic_cmpswap # "_" # RtnMode); +defvar Op = !cast<SDPatternOperator>(SIbuffer_atomic_cmpswap + # !if(!eq(RtnMode, "ret"), "", "_noret")); defvar InstSuffix = !if(!eq(RtnMode, "ret"), "_RTN", ""); defvar CachePolicy = !if(!eq(RtnMode, "ret"), (set_glc $cachepolicy), (timm:$cachepolicy)); diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td index 27b723875aa4..d8387bf6f1ae 100644 --- a/llvm/lib/Target/AMDGPU/DSInstructions.td +++ b/llvm/lib/Target/AMDGPU/DSInstructions.td @@ -950,10 +950,11 @@ defm : DSWritePat_mc <DS_WRITE_B128, vt, "store_align_less_than_4_local">; } // End AddedComplexity = 100 -class DSAtomicRetPat<DS_Pseudo inst, ValueType vt, PatFrag frag, bit gds=0> : GCNPat < - (frag (DS1Addr1Offset i32:$ptr, i16:$offset), vt:$value), - (inst $ptr, getVregSrcForVT<vt>.ret:$value, offset:$offset, (i1 gds)) ->; +class DSAtomicRetPat<DS_Pseudo inst, ValueType vt, PatFrag frag, int complexity = 0, + bit gds=0> : GCNPat <(frag (DS1Addr1Offset i32:$ptr, i16:$offset), vt:$value), + (inst $ptr, getVregSrcForVT<vt>.ret:$value, offset:$offset, (i1 gds))> { + let AddedComplexity = complexity; +} multiclass DSAtomicRetPat_mc<DS_Pseudo inst, ValueType vt, string frag> { let OtherPredicates = [LDSRequiresM0Init] in { @@ -965,75 +966,88 @@ multiclass DSAtomicRetPat_mc<DS_Pseudo inst, ValueType vt, string frag> { !cast<PatFrag>(frag#"_local_"#vt.Size)>; } - def : DSAtomicRetPat<inst, vt, !cast<PatFrag>(frag#"_region_m0_"#vt.Size), 1>; + def : DSAtomicRetPat<inst, vt, !cast<PatFrag>(frag#"_region_m0_"#vt.Size), + /* complexity */ 0, /* gds */ 1>; } multiclass DSAtomicRetNoRetPat_mc<DS_Pseudo inst, DS_Pseudo noRetInst, ValueType vt, string frag> { let OtherPredicates = [LDSRequiresM0Init] in { def : DSAtomicRetPat<inst, vt, - !cast<PatFrag>(frag#"_local_m0_ret_"#vt.Size)>; + 
!cast<PatFrag>(frag#"_local_m0_"#vt.Size)>; def : DSAtomicRetPat<noRetInst, vt, - !cast<PatFrag>(frag#"_local_m0_noret_"#vt.Size)>; + !cast<PatFrag>(frag#"_local_m0_noret_"#vt.Size), /* complexity */ 1>; } let OtherPredicates = [NotLDSRequiresM0Init] in { def : DSAtomicRetPat<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt, - !cast<PatFrag>(frag#"_local_ret_"#vt.Size)>; + !cast<PatFrag>(frag#"_local_"#vt.Size)>; def : DSAtomicRetPat<!cast<DS_Pseudo>(!cast<string>(noRetInst)#"_gfx9"), vt, - !cast<PatFrag>(frag#"_local_noret_"#vt.Size)>; + !cast<PatFrag>(frag#"_local_noret_"#vt.Size), /* complexity */ 1>; } def : DSAtomicRetPat<inst, vt, - !cast<PatFrag>(frag#"_region_m0_ret_"#vt.Size), 1>; + !cast<PatFrag>(frag#"_region_m0_"#vt.Size), + /* complexity */ 0, /* gds */ 1>; def : DSAtomicRetPat<noRetInst, vt, - !cast<PatFrag>(frag#"_region_m0_noret_"#vt.Size), 1>; + !cast<PatFrag>(frag#"_region_m0_noret_"#vt.Size), + /* complexity */ 1, /* gds */ 1>; } let SubtargetPredicate = isGFX6GFX7GFX8GFX9GFX10 in { // Caution, the order of src and cmp is the *opposite* of the BUFFER_ATOMIC_CMPSWAP opcode. -class DSAtomicCmpXChgSwapped<DS_Pseudo inst, ValueType vt, PatFrag frag, bit gds=0> : GCNPat < +class DSAtomicCmpXChgSwapped<DS_Pseudo inst, ValueType vt, PatFrag frag, + int complexity = 0, bit gds=0> : GCNPat< (frag (DS1Addr1Offset i32:$ptr, i16:$offset), vt:$cmp, vt:$swap), - (inst $ptr, getVregSrcForVT<vt>.ret:$cmp, getVregSrcForVT<vt>.ret:$swap, offset:$offset, (i1 gds)) ->; + (inst $ptr, getVregSrcForVT<vt>.ret:$cmp, getVregSrcForVT<vt>.ret:$swap, offset:$offset, (i1 gds))> { + let AddedComplexity = complexity; +} multiclass DSAtomicCmpXChgSwapped_mc<DS_Pseudo inst, DS_Pseudo noRetInst, ValueType vt, string frag> { let OtherPredicates = [LDSRequiresM0Init] in { - def : DSAtomicCmpXChgSwapped<inst, vt, !cast<PatFrag>(frag#"_local_m0_ret_"#vt.Size)>; - def : DSAtomicCmpXChgSwapped<noRetInst, vt, !cast<PatFrag>(frag#"_local_m0_noret_"#vt.Size)>; + def : DSAtomicCmpXChgSwapped<inst, vt, !cast<PatFrag>(frag#"_local_m0_"#vt.Size)>; + def : DSAtomicCmpXChgSwapped<noRetInst, vt, !cast<PatFrag>(frag#"_local_m0_noret_"#vt.Size), + /* complexity */ 1>; } let OtherPredicates = [NotLDSRequiresM0Init] in { def : DSAtomicCmpXChgSwapped<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt, - !cast<PatFrag>(frag#"_local_ret_"#vt.Size)>; + !cast<PatFrag>(frag#"_local_"#vt.Size)>; def : DSAtomicCmpXChgSwapped<!cast<DS_Pseudo>(!cast<string>(noRetInst)#"_gfx9"), vt, - !cast<PatFrag>(frag#"_local_noret_"#vt.Size)>; + !cast<PatFrag>(frag#"_local_noret_"#vt.Size), + /* complexity */ 1>; } - def : DSAtomicCmpXChgSwapped<inst, vt, !cast<PatFrag>(frag#"_region_m0_ret_"#vt.Size), 1>; - def : DSAtomicCmpXChgSwapped<noRetInst, vt, !cast<PatFrag>(frag#"_region_m0_noret_"#vt.Size), 1>; + def : DSAtomicCmpXChgSwapped<inst, vt, !cast<PatFrag>(frag#"_region_m0_"#vt.Size), + /* complexity */ 0, /* gds */ 1>; + def : DSAtomicCmpXChgSwapped<noRetInst, vt, !cast<PatFrag>(frag#"_region_m0_noret_"#vt.Size), + /* complexity */ 1, /* gds */ 1>; } } // End SubtargetPredicate = isGFX6GFX7GFX8GFX9GFX10 let SubtargetPredicate = isGFX11Plus in { // The order of src and cmp agrees with the BUFFER_ATOMIC_CMPSWAP opcode. 
-class DSAtomicCmpXChg<DS_Pseudo inst, ValueType vt, PatFrag frag, bit gds=0> : GCNPat < +class DSAtomicCmpXChg<DS_Pseudo inst, ValueType vt, PatFrag frag, + int complexity = 0, bit gds=0> : GCNPat< (frag (DS1Addr1Offset i32:$ptr, i16:$offset), vt:$cmp, vt:$swap), - (inst $ptr, getVregSrcForVT<vt>.ret:$swap, getVregSrcForVT<vt>.ret:$cmp, offset:$offset, (i1 gds)) ->; + (inst $ptr, getVregSrcForVT<vt>.ret:$swap, getVregSrcForVT<vt>.ret:$cmp, offset:$offset, (i1 gds))> { + let AddedComplexity = complexity; +} multiclass DSAtomicCmpXChg_mc<DS_Pseudo inst, DS_Pseudo noRetInst, ValueType vt, string frag> { def : DSAtomicCmpXChg<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt, - !cast<PatFrag>(frag#"_local_ret_"#vt.Size)>; + !cast<PatFrag>(frag#"_local_"#vt.Size)>; def : DSAtomicCmpXChg<!cast<DS_Pseudo>(!cast<string>(noRetInst)#"_gfx9"), vt, - !cast<PatFrag>(frag#"_local_noret_"#vt.Size)>; + !cast<PatFrag>(frag#"_local_noret_"#vt.Size), /* complexity */ 1>; - def : DSAtomicCmpXChg<inst, vt, !cast<PatFrag>(frag#"_region_m0_ret_"#vt.Size), 1>; - def : DSAtomicCmpXChg<noRetInst, vt, !cast<PatFrag>(frag#"_region_m0_noret_"#vt.Size), 1>; + def : DSAtomicCmpXChg<inst, vt, !cast<PatFrag>(frag#"_region_m0_"#vt.Size), + /* complexity */ 0, /* gds */ 1>; + def : DSAtomicCmpXChg<noRetInst, vt, !cast<PatFrag>(frag#"_region_m0_noret_"#vt.Size), + /* complexity */ 1, /* gds */ 1>; } } // End SubtargetPredicate = isGFX11Plus @@ -1090,17 +1104,20 @@ defm : DSAtomicCmpXChg_mc<DS_CMPSTORE_RTN_B64, DS_CMPSTORE_B64, i64, "atomic_cmp } // End SubtargetPredicate = isGFX11Plus let SubtargetPredicate = isGFX90APlus in { -def : DSAtomicRetPat<DS_ADD_RTN_F64, f64, atomic_load_fadd_local_ret_64>; +def : DSAtomicRetPat<DS_ADD_RTN_F64, f64, atomic_load_fadd_local_64>; +let AddedComplexity = 1 in def : DSAtomicRetPat<DS_ADD_F64, f64, atomic_load_fadd_local_noret_64>; } let SubtargetPredicate = isGFX940Plus in { -def : DSAtomicRetPat<DS_PK_ADD_RTN_F16, v2f16, atomic_load_fadd_v2f16_local_ret_32>; +def : DSAtomicRetPat<DS_PK_ADD_RTN_F16, v2f16, atomic_load_fadd_v2f16_local_32>; +let AddedComplexity = 1 in def : DSAtomicRetPat<DS_PK_ADD_F16, v2f16, atomic_load_fadd_v2f16_local_noret_32>; def : GCNPat < - (v2i16 (int_amdgcn_ds_fadd_v2bf16_ret i32:$ptr, v2i16:$src)), + (v2i16 (int_amdgcn_ds_fadd_v2bf16 i32:$ptr, v2i16:$src)), (DS_PK_ADD_RTN_BF16 VGPR_32:$ptr, VGPR_32:$src, 0, 0) >; +let AddedComplexity = 1 in def : GCNPat < (v2i16 (int_amdgcn_ds_fadd_v2bf16_noret i32:$ptr, v2i16:$src)), (DS_PK_ADD_BF16 VGPR_32:$ptr, VGPR_32:$src, 0, 0) diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index cb2822818549..c634e15945ad 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -1015,31 +1015,35 @@ class FlatAtomicPatNoRtn <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt multiclass FlatAtomicPat <string inst, string node, ValueType vt, ValueType data_vt = vt> { - defvar rtnNode = !cast<PatFrags>(node#"_ret_"#vt.Size); + defvar rtnNode = !cast<PatFrags>(node#"_"#vt.Size); defvar noRtnNode = !cast<PatFrags>(node#"_noret_"#vt.Size); def : GCNPat <(vt (rtnNode (FlatOffset i64:$vaddr, i16:$offset), data_vt:$data)), (!cast<FLAT_Pseudo>(inst#"_RTN") VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)>; + let AddedComplexity = 1 in def : GCNPat <(vt (noRtnNode (FlatOffset i64:$vaddr, i16:$offset), data_vt:$data)), (!cast<FLAT_Pseudo>(inst) VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)>; } multiclass 
FlatSignedAtomicPat <string inst, string node, ValueType vt, - ValueType data_vt = vt, bit isIntr = 0> { - defvar rtnNode = !cast<PatFrags>(node # "_ret" # !if(isIntr, "", "_" # vt.Size)); + ValueType data_vt = vt, int complexity = 0, + bit isIntr = 0> { + defvar rtnNode = !cast<SDPatternOperator>(node # !if(isIntr, "", "_" # vt.Size)); defvar noRtnNode = !cast<PatFrags>(node # "_noret" # !if(isIntr, "", "_" # vt.Size)); + let AddedComplexity = complexity in def : GCNPat <(vt (rtnNode (GlobalOffset i64:$vaddr, i16:$offset), data_vt:$data)), (!cast<FLAT_Pseudo>(inst#"_RTN") VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)>; + let AddedComplexity = !add(complexity, 1) in def : GCNPat <(vt (noRtnNode (GlobalOffset i64:$vaddr, i16:$offset), data_vt:$data)), (!cast<FLAT_Pseudo>(inst) VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)>; } multiclass FlatSignedAtomicIntrPat <string inst, string node, ValueType vt, ValueType data_vt = vt> { - defm : FlatSignedAtomicPat<inst, node, vt, data_vt, /* isIntr */ 1>; + defm : FlatSignedAtomicPat<inst, node, vt, data_vt, /* complexity */ 0, /* isIntr */ 1>; } class FlatSignedAtomicPatNoRtn <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < @@ -1260,17 +1264,16 @@ multiclass GlobalFLATAtomicPatsRtn<string nortn_inst_name, SDPatternOperator nod multiclass GlobalFLATAtomicPats<string inst, string node, ValueType vt, ValueType data_vt = vt, bit isIntr = 0> { - defvar rtnNode = !cast<PatFrags>(node # "_ret" # !if(isIntr, "", "_" # vt.Size)); + defvar rtnNode = !cast<SDPatternOperator>(node # !if(isIntr, "", "_" # vt.Size)); defvar noRtnNode = !cast<PatFrags>(node # "_noret" # !if(isIntr, "", "_" # vt.Size)); - let AddedComplexity = 10 in { - defm : FlatSignedAtomicPat <inst, node, vt, data_vt, isIntr>; - } + defm : FlatSignedAtomicPat <inst, node, vt, data_vt, /* complexity */ 10, isIntr>; - let AddedComplexity = 11 in { - def : GlobalAtomicSaddrPat<!cast<FLAT_Pseudo>(inst#"_SADDR"), noRtnNode, vt, data_vt>; - def : GlobalAtomicSaddrPat<!cast<FLAT_Pseudo>(inst#"_SADDR_RTN"), rtnNode, vt, data_vt>; - } + let AddedComplexity = 13 in + def : GlobalAtomicSaddrPat<!cast<FLAT_Pseudo>(inst#"_SADDR"), noRtnNode, vt, data_vt>; + + let AddedComplexity = 12 in + def : GlobalAtomicSaddrPat<!cast<FLAT_Pseudo>(inst#"_SADDR_RTN"), rtnNode, vt, data_vt>; } multiclass GlobalFLATAtomicIntrPats<string inst, string node, ValueType vt, diff --git a/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp b/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp new file mode 100644 index 000000000000..83dc3bebf4d3 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp @@ -0,0 +1,175 @@ +//===- GCNCreateVOPD.cpp - Create VOPD Instructions ----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// Combine VALU pairs into VOPD instructions +/// Only works on wave32 +/// Has register requirements; we reject creating a VOPD if the requirements are +/// not met.
+/// shouldCombineVOPD mutator in postRA machine scheduler puts candidate +/// instructions for VOPD back-to-back +/// +// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "GCNSubtarget.h" +#include "GCNVOPDUtils.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "SIInstrInfo.h" +#include "Utils/AMDGPUBaseInfo.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/Debug.h" +#include <utility> + +#define DEBUG_TYPE "gcn-create-vopd" +STATISTIC(NumVOPDCreated, "Number of VOPD Insts Created."); + +using namespace llvm; + +namespace { + +class GCNCreateVOPD : public MachineFunctionPass { +private: +public: + static char ID; + const GCNSubtarget *ST = nullptr; + + GCNCreateVOPD() : MachineFunctionPass(ID) {} + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + StringRef getPassName() const override { + return "GCN Create VOPD Instructions"; + } + + bool doReplace(const SIInstrInfo *SII, + std::pair<MachineInstr *, MachineInstr *> &Pair) { + auto *FirstMI = Pair.first; + auto *SecondMI = Pair.second; + unsigned Opc1 = FirstMI->getOpcode(); + unsigned Opc2 = SecondMI->getOpcode(); + int NewOpcode = AMDGPU::getVOPDFull(AMDGPU::getVOPDOpcode(Opc1), + AMDGPU::getVOPDOpcode(Opc2)); + assert(NewOpcode != -1 && + "Should have previously determined this as a possible VOPD\n"); + + auto VOPDInst = BuildMI(*FirstMI->getParent(), FirstMI, + FirstMI->getDebugLoc(), SII->get(NewOpcode)) + .setMIFlags(FirstMI->getFlags() | SecondMI->getFlags()); + VOPDInst.add(FirstMI->getOperand(0)) + .add(SecondMI->getOperand(0)) + .add(FirstMI->getOperand(1)); + + switch (Opc1) { + case AMDGPU::V_MOV_B32_e32: + break; + case AMDGPU::V_FMAMK_F32: + case AMDGPU::V_FMAAK_F32: + VOPDInst.add(FirstMI->getOperand(2)); + VOPDInst.add(FirstMI->getOperand(3)); + break; + default: + VOPDInst.add(FirstMI->getOperand(2)); + break; + } + + VOPDInst.add(SecondMI->getOperand(1)); + + switch (Opc2) { + case AMDGPU::V_MOV_B32_e32: + break; + case AMDGPU::V_FMAMK_F32: + case AMDGPU::V_FMAAK_F32: + VOPDInst.add(SecondMI->getOperand(2)); + VOPDInst.add(SecondMI->getOperand(3)); + break; + default: + VOPDInst.add(SecondMI->getOperand(2)); + break; + } + + VOPDInst.copyImplicitOps(*FirstMI); + VOPDInst.copyImplicitOps(*SecondMI); + + LLVM_DEBUG(dbgs() << "VOPD Fused: " << *VOPDInst << " from\tX: " + << *Pair.first << "\tY: " << *Pair.second << "\n"); + FirstMI->eraseFromParent(); + SecondMI->eraseFromParent(); + ++NumVOPDCreated; + return true; + } + + bool runOnMachineFunction(MachineFunction &MF) override { + if (skipFunction(MF.getFunction())) + return false; + ST = &MF.getSubtarget<GCNSubtarget>(); + if (!AMDGPU::hasVOPD(*ST) || !ST->isWave32()) + return false; + LLVM_DEBUG(dbgs() << "CreateVOPD Pass:\n"); + + const SIInstrInfo *SII = ST->getInstrInfo(); + bool Changed = false; + + SmallVector<std::pair<MachineInstr *, MachineInstr *>> ReplaceCandidates; + + for (auto &MBB : MF) { + auto MII = MBB.begin(), E = MBB.end(); + while (MII != E) { + auto *FirstMI = &*MII; + MII = next_nodbg(MII, MBB.end()); + if (MII == MBB.end()) + break; + if (FirstMI->isDebugInstr()) + continue; + auto *SecondMI = &*MII; + unsigned Opc = FirstMI->getOpcode(); + unsigned 
Opc2 = SecondMI->getOpcode(); + llvm::AMDGPU::CanBeVOPD FirstCanBeVOPD = AMDGPU::getCanBeVOPD(Opc); + llvm::AMDGPU::CanBeVOPD SecondCanBeVOPD = AMDGPU::getCanBeVOPD(Opc2); + std::pair<MachineInstr *, MachineInstr *> Pair; + + if (FirstCanBeVOPD.X && SecondCanBeVOPD.Y) + Pair = {FirstMI, SecondMI}; + else if (FirstCanBeVOPD.Y && SecondCanBeVOPD.X) + Pair = {SecondMI, FirstMI}; + else + continue; + // checkVOPDRegConstraints cares about program order, but doReplace + // cares about X-Y order in the constituted VOPD + if (llvm::checkVOPDRegConstraints(*SII, *FirstMI, *SecondMI)) { + ReplaceCandidates.push_back(Pair); + ++MII; + } + } + } + for (auto &Pair : ReplaceCandidates) { + Changed |= doReplace(SII, Pair); + } + + return Changed; + } +}; + +} // namespace + +char GCNCreateVOPD::ID = 0; + +char &llvm::GCNCreateVOPDID = GCNCreateVOPD::ID; + +INITIALIZE_PASS(GCNCreateVOPD, DEBUG_TYPE, "GCN Create VOPD Instructions", + false, false) diff --git a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp index 1cd880eaa48e..5d254518c67a 100644 --- a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp +++ b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp @@ -143,13 +143,20 @@ bool GCNDPPCombine::isShrinkable(MachineInstr &MI) const { } int GCNDPPCombine::getDPPOp(unsigned Op, bool IsShrinkable) const { - auto DPP32 = AMDGPU::getDPPOp32(Op); + int DPP32 = AMDGPU::getDPPOp32(Op); if (IsShrinkable) { assert(DPP32 == -1); - auto E32 = AMDGPU::getVOPe32(Op); + int E32 = AMDGPU::getVOPe32(Op); DPP32 = (E32 == -1) ? -1 : AMDGPU::getDPPOp32(E32); } - return (DPP32 == -1 || TII->pseudoToMCOpcode(DPP32) == -1) ? -1 : DPP32; + if (DPP32 != -1 && TII->pseudoToMCOpcode(DPP32) != -1) + return DPP32; + int DPP64 = -1; + if (ST->hasVOP3DPP()) + DPP64 = AMDGPU::getDPPOp64(Op); + if (DPP64 != -1 && TII->pseudoToMCOpcode(DPP64) != -1) + return DPP64; + return -1; } // tracks the register operand definition and returns: @@ -188,6 +195,7 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI, MovMI.getOpcode() == AMDGPU::V_MOV_B64_dpp || MovMI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO); + bool HasVOP3DPP = ST->hasVOP3DPP(); auto OrigOp = OrigMI.getOpcode(); auto DPPOp = getDPPOp(OrigOp, IsShrinkable); if (DPPOp == -1) { @@ -201,10 +209,18 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI, bool Fail = false; do { - auto *Dst = TII->getNamedOperand(OrigMI, AMDGPU::OpName::vdst); - assert(Dst); - DPPInst.add(*Dst); - int NumOperands = 1; + int NumOperands = 0; + if (auto *Dst = TII->getNamedOperand(OrigMI, AMDGPU::OpName::vdst)) { + DPPInst.add(*Dst); + ++NumOperands; + } + if (auto *SDst = TII->getNamedOperand(OrigMI, AMDGPU::OpName::sdst)) { + if (TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, SDst)) { + DPPInst.add(*SDst); + ++NumOperands; + } + // If we shrunk a 64bit vop3b to 32bits, just ignore the sdst + } const int OldIdx = AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::old); if (OldIdx != -1) { @@ -230,7 +246,8 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI, AMDGPU::OpName::src0_modifiers)) { assert(NumOperands == AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::src0_modifiers)); - assert(0LL == (Mod0->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG))); + assert(HasVOP3DPP || + (0LL == (Mod0->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG)))); DPPInst.addImm(Mod0->getImm()); ++NumOperands; } else if (AMDGPU::getNamedOperandIdx(DPPOp, @@ -253,7 +270,8 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI, 
AMDGPU::OpName::src1_modifiers)) { assert(NumOperands == AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::src1_modifiers)); - assert(0LL == (Mod1->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG))); + assert(HasVOP3DPP || + (0LL == (Mod1->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG)))); DPPInst.addImm(Mod1->getImm()); ++NumOperands; } else if (AMDGPU::getNamedOperandIdx(DPPOp, @@ -261,7 +279,8 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI, DPPInst.addImm(0); ++NumOperands; } - if (auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1)) { + auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1); + if (Src1) { if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src1)) { LLVM_DEBUG(dbgs() << " failed: src1 is illegal\n"); Fail = true; @@ -270,8 +289,17 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI, DPPInst.add(*Src1); ++NumOperands; } - - if (auto *Src2 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src2)) { + if (auto *Mod2 = + TII->getNamedOperand(OrigMI, AMDGPU::OpName::src2_modifiers)) { + assert(NumOperands == + AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::src2_modifiers)); + assert(HasVOP3DPP || + (0LL == (Mod2->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG)))); + DPPInst.addImm(Mod2->getImm()); + ++NumOperands; + } + auto *Src2 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src2); + if (Src2) { if (!TII->getNamedOperand(*DPPInst.getInstr(), AMDGPU::OpName::src2) || !TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src2)) { LLVM_DEBUG(dbgs() << " failed: src2 is illegal\n"); @@ -279,8 +307,62 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI, break; } DPPInst.add(*Src2); + ++NumOperands; + } + if (HasVOP3DPP) { + auto *ClampOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::clamp); + if (ClampOpr && + AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::clamp) != -1) { + DPPInst.addImm(ClampOpr->getImm()); + } + auto *VdstInOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::vdst_in); + if (VdstInOpr && + AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::vdst_in) != -1) { + DPPInst.add(*VdstInOpr); + } + auto *OmodOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::omod); + if (OmodOpr && + AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::omod) != -1) { + DPPInst.addImm(OmodOpr->getImm()); + } + // Validate OP_SEL has to be set to all 0 and OP_SEL_HI has to be set to + // all 1. 
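On the constant in the op_sel_hi check that follows: op_sel_hi carries one bit per source operand, and, as the in-code comment notes, a VOP3P instruction has three sources, so the packed-default encoding has all three bits set. A small sketch (names are illustrative only):

// Three op_sel_hi bits, one per VOP3P source, all set by default:
constexpr unsigned NumVOP3PSrcs = 3;
constexpr long long DefaultOpSelHi = (1 << NumVOP3PSrcs) - 1; // 0b111 == 7

The validation code resumes below.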
+ if (auto *OpSelOpr = + TII->getNamedOperand(OrigMI, AMDGPU::OpName::op_sel)) { + auto OpSel = OpSelOpr->getImm(); + if (OpSel != 0) { + LLVM_DEBUG(dbgs() << " failed: op_sel must be zero\n"); + Fail = true; + break; + } + if (AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::op_sel) != -1) + DPPInst.addImm(OpSel); + } + if (auto *OpSelHiOpr = + TII->getNamedOperand(OrigMI, AMDGPU::OpName::op_sel_hi)) { + auto OpSelHi = OpSelHiOpr->getImm(); + // Only vop3p has op_sel_hi, and all vop3p have 3 operands, so check + // the bitmask for 3 op_sel_hi bits set + assert(Src2 && "Expected vop3p with 3 operands"); + if (OpSelHi != 7) { + LLVM_DEBUG(dbgs() << " failed: op_sel_hi must be all set to one\n"); + Fail = true; + break; + } + if (AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::op_sel_hi) != -1) + DPPInst.addImm(OpSelHi); + } + auto *NegOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::neg_lo); + if (NegOpr && + AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::neg_lo) != -1) { + DPPInst.addImm(NegOpr->getImm()); + } + auto *NegHiOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::neg_hi); + if (NegHiOpr && + AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::neg_hi) != -1) { + DPPInst.addImm(NegHiOpr->getImm()); + } } - DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::dpp_ctrl)); DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask)); DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::bank_mask)); @@ -531,8 +613,16 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const { } bool IsShrinkable = isShrinkable(OrigMI); - if (!(IsShrinkable || TII->isVOP1(OrigOp) || TII->isVOP2(OrigOp))) { - LLVM_DEBUG(dbgs() << " failed: not VOP1/2/3\n"); + if (!(IsShrinkable || + ((TII->isVOP3P(OrigOp) || TII->isVOPC(OrigOp) || + TII->isVOP3(OrigOp)) && + ST->hasVOP3DPP()) || + TII->isVOP1(OrigOp) || TII->isVOP2(OrigOp))) { + LLVM_DEBUG(dbgs() << " failed: not VOP1/2/3/3P/C\n"); + break; + } + if (OrigMI.modifiesRegister(AMDGPU::EXEC, ST->getRegisterInfo())) { + LLVM_DEBUG(dbgs() << " failed: can't combine v_cmpx\n"); break; } @@ -543,9 +633,12 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const { break; } + auto *Src2 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src2); assert(Src0 && "Src1 without Src0?"); - if (Src1 && Src1->isIdenticalTo(*Src0)) { - assert(Src1->isReg()); + if ((Use == Src0 && ((Src1 && Src1->isIdenticalTo(*Src0)) || + (Src2 && Src2->isIdenticalTo(*Src0)))) || + (Use == Src1 && (Src1->isIdenticalTo(*Src0) || + (Src2 && Src2->isIdenticalTo(*Src1))))) { LLVM_DEBUG( dbgs() << " " << OrigMI diff --git a/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp b/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp new file mode 100644 index 000000000000..a5008e39d91a --- /dev/null +++ b/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp @@ -0,0 +1,212 @@ +//===- GCNVOPDUtils.cpp - GCN VOPD Utils ------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file This file contains the AMDGPU DAG scheduling +/// mutation to pair VOPD instructions back to back. 
It also contains +// subroutines useful in the creation of VOPD instructions +// +//===----------------------------------------------------------------------===// + +#include "GCNVOPDUtils.h" +#include "AMDGPUSubtarget.h" +#include "GCNSubtarget.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "SIInstrInfo.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MacroFusion.h" +#include "llvm/CodeGen/ScheduleDAG.h" +#include "llvm/CodeGen/ScheduleDAGMutation.h" +#include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/MC/MCInst.h" + +using namespace llvm; + +#define DEBUG_TYPE "gcn-vopd-utils" + +bool llvm::checkVOPDRegConstraints(const SIInstrInfo &TII, + const MachineInstr &FirstMI, + const MachineInstr &SecondMI) { + const MachineFunction *MF = FirstMI.getMF(); + const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); + const SIRegisterInfo *TRI = dyn_cast<SIRegisterInfo>(ST.getRegisterInfo()); + const MachineRegisterInfo &MRI = MF->getRegInfo(); + const unsigned NumVGPRBanks = 4; + // Literals also count against scalar bus limit + SmallVector<const MachineOperand *> UniqueLiterals; + auto addLiteral = [&](const MachineOperand &Op) { + for (auto &Literal : UniqueLiterals) { + if (Literal->isIdenticalTo(Op)) + return; + } + UniqueLiterals.push_back(&Op); + }; + SmallVector<Register> UniqueScalarRegs; + assert([&]() -> bool { + for (auto MII = MachineBasicBlock::const_iterator(&FirstMI); + MII != FirstMI.getParent()->instr_end(); ++MII) { + if (&*MII == &SecondMI) + return true; + } + return false; + }() && "Expected FirstMI to precede SecondMI"); + // Cannot pair dependent instructions + for (const auto &Use : SecondMI.uses()) + if (Use.isReg() && FirstMI.modifiesRegister(Use.getReg())) + return false; + + struct ComponentInfo { + ComponentInfo(const MachineInstr &MI) : MI(MI) {} + Register Dst, Reg0, Reg1, Reg2; + const MachineInstr &MI; + }; + ComponentInfo CInfo[] = {ComponentInfo(FirstMI), ComponentInfo(SecondMI)}; + + for (ComponentInfo &Comp : CInfo) { + switch (Comp.MI.getOpcode()) { + case AMDGPU::V_FMAMK_F32: + // cannot inline the fixed literal in fmamk + addLiteral(Comp.MI.getOperand(2)); + Comp.Reg2 = Comp.MI.getOperand(3).getReg(); + break; + case AMDGPU::V_FMAAK_F32: + // cannot inline the fixed literal in fmaak + addLiteral(Comp.MI.getOperand(3)); + Comp.Reg1 = Comp.MI.getOperand(2).getReg(); + break; + case AMDGPU::V_FMAC_F32_e32: + case AMDGPU::V_DOT2_F32_F16: + case AMDGPU::V_DOT2_F32_BF16: + Comp.Reg1 = Comp.MI.getOperand(2).getReg(); + Comp.Reg2 = Comp.MI.getOperand(0).getReg(); + break; + case AMDGPU::V_CNDMASK_B32_e32: + UniqueScalarRegs.push_back(AMDGPU::VCC_LO); + Comp.Reg1 = Comp.MI.getOperand(2).getReg(); + break; + case AMDGPU::V_MOV_B32_e32: + break; + default: + Comp.Reg1 = Comp.MI.getOperand(2).getReg(); + break; + } + + Comp.Dst = Comp.MI.getOperand(0).getReg(); + + const MachineOperand &Op0 = Comp.MI.getOperand(1); + if (Op0.isReg()) { + if (!TRI->isVectorRegister(MRI, Op0.getReg())) { + if (!is_contained(UniqueScalarRegs, Op0.getReg())) + UniqueScalarRegs.push_back(Op0.getReg()); + } else + Comp.Reg0 = Op0.getReg(); + } else { + if (!TII.isInlineConstant(Comp.MI, 1)) + addLiteral(Op0); + } + } + + if (UniqueLiterals.size() > 1) + return false; + if ((UniqueLiterals.size() + UniqueScalarRegs.size()) > 2) + return false; + + // check 
port 0 + if (CInfo[0].Reg0 && CInfo[1].Reg0 && + CInfo[0].Reg0 % NumVGPRBanks == CInfo[1].Reg0 % NumVGPRBanks) + return false; + // check port 1 + if (CInfo[0].Reg1 && CInfo[1].Reg1 && + CInfo[0].Reg1 % NumVGPRBanks == CInfo[1].Reg1 % NumVGPRBanks) + return false; + // check port 2 + if (CInfo[0].Reg2 && CInfo[1].Reg2 && + !((CInfo[0].Reg2 ^ CInfo[1].Reg2) & 0x1)) + return false; + if (!((CInfo[0].Dst ^ CInfo[1].Dst) & 0x1)) + return false; + + LLVM_DEBUG(dbgs() << "VOPD Reg Constraints Passed\n\tX: " << FirstMI + << "\n\tY: " << SecondMI << "\n"); + return true; +} + +/// Check if the instr pair, FirstMI and SecondMI, should be scheduled +/// together. Given SecondMI, when FirstMI is unspecified, then check if +/// SecondMI may be part of a fused pair at all. +static bool shouldScheduleVOPDAdjacent(const TargetInstrInfo &TII, + const TargetSubtargetInfo &TSI, + const MachineInstr *FirstMI, + const MachineInstr &SecondMI) { + const SIInstrInfo &STII = static_cast<const SIInstrInfo &>(TII); + unsigned Opc2 = SecondMI.getOpcode(); + auto SecondCanBeVOPD = AMDGPU::getCanBeVOPD(Opc2); + + // One instruction case + if (!FirstMI) + return SecondCanBeVOPD.Y; + + unsigned Opc = FirstMI->getOpcode(); + auto FirstCanBeVOPD = AMDGPU::getCanBeVOPD(Opc); + + if (!((FirstCanBeVOPD.X && SecondCanBeVOPD.Y) || + (FirstCanBeVOPD.Y && SecondCanBeVOPD.X))) + return false; + + return checkVOPDRegConstraints(STII, *FirstMI, SecondMI); +} + +/// Adapts design from MacroFusion +/// Puts valid candidate instructions back-to-back so they can easily +/// be turned into VOPD instructions +/// Greedily pairs instruction candidates. O(n^2) algorithm. +struct VOPDPairingMutation : ScheduleDAGMutation { + ShouldSchedulePredTy shouldScheduleAdjacent; // NOLINT: function pointer + + VOPDPairingMutation( + ShouldSchedulePredTy shouldScheduleAdjacent) // NOLINT: function pointer + : shouldScheduleAdjacent(shouldScheduleAdjacent) {} + + void apply(ScheduleDAGInstrs *DAG) override { + const TargetInstrInfo &TII = *DAG->TII; + const GCNSubtarget &ST = DAG->MF.getSubtarget<GCNSubtarget>(); + if (!AMDGPU::hasVOPD(ST) || !ST.isWave32()) { + LLVM_DEBUG(dbgs() << "Target does not support VOPDPairingMutation\n"); + return; + } + + std::vector<SUnit>::iterator ISUI, JSUI; + for (ISUI = DAG->SUnits.begin(); ISUI != DAG->SUnits.end(); ++ISUI) { + const MachineInstr *IMI = ISUI->getInstr(); + if (!shouldScheduleAdjacent(TII, ST, nullptr, *IMI)) + continue; + if (!hasLessThanNumFused(*ISUI, 2)) + continue; + + for (JSUI = ISUI + 1; JSUI != DAG->SUnits.end(); ++JSUI) { + if (JSUI->isBoundaryNode()) + continue; + const MachineInstr *JMI = JSUI->getInstr(); + if (!hasLessThanNumFused(*JSUI, 2) || + !shouldScheduleAdjacent(TII, ST, IMI, *JMI)) + continue; + if (fuseInstructionPair(*DAG, *ISUI, *JSUI)) + break; + } + } + LLVM_DEBUG(dbgs() << "Completed VOPDPairingMutation\n"); + } +}; + +std::unique_ptr<ScheduleDAGMutation> llvm::createVOPDPairingMutation() { + return std::make_unique<VOPDPairingMutation>(shouldScheduleVOPDAdjacent); +} diff --git a/llvm/lib/Target/AMDGPU/GCNVOPDUtils.h b/llvm/lib/Target/AMDGPU/GCNVOPDUtils.h new file mode 100644 index 000000000000..22361b9a1a07 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/GCNVOPDUtils.h @@ -0,0 +1,32 @@ +//===- GCNVOPDUtils.h - GCN VOPD Utils ------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file This file contains the AMDGPU DAG scheduling +/// mutation to pair VOPD instructions back to back. It also contains +// subroutines useful in the creation of VOPD instructions +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_VOPDUTILS_H +#define LLVM_LIB_TARGET_AMDGPU_VOPDUTILS_H + +#include "llvm/CodeGen/MachineScheduler.h" + +namespace llvm { + +class SIInstrInfo; + +bool checkVOPDRegConstraints(const SIInstrInfo &TII, + const MachineInstr &FirstMI, + const MachineInstr &SecondMI); + +std::unique_ptr<ScheduleDAGMutation> createVOPDPairingMutation(); + +} // namespace llvm + +#endif // LLVM_LIB_TARGET_AMDGPU_VOPDUTILS_H diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h index 02c213f90f89..228963ff2a20 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h @@ -62,12 +62,6 @@ public: virtual void getAVOperandEncoding(const MCInst &MI, unsigned OpNo, APInt &Op, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const = 0; - -protected: - FeatureBitset computeAvailableFeatures(const FeatureBitset &FB) const; - void - verifyInstructionPredicates(const MCInst &MI, - const FeatureBitset &AvailableFeatures) const; }; } // End namespace llvm diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp index 11fe3f9ef058..fba4b1a3db66 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp @@ -36,6 +36,7 @@ using namespace llvm; #define GET_INSTRINFO_MC_DESC +#define ENABLE_INSTR_PREDICATE_VERIFIER #include "AMDGPUGenInstrInfo.inc" #define GET_SUBTARGETINFO_MC_DESC diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h index 060d4b660632..c2e2563c3989 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h @@ -50,6 +50,7 @@ createAMDGPUELFObjectWriter(bool Is64Bit, uint8_t OSABI, #define GET_INSTRINFO_ENUM #define GET_INSTRINFO_OPERAND_ENUM +#define GET_INSTRINFO_MC_HELPER_DECLS #include "AMDGPUGenInstrInfo.inc" #define GET_SUBTARGETINFO_ENUM diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp index 78eb304fe84f..3d926e52c368 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp @@ -58,11 +58,6 @@ private: uint64_t getBinaryCodeForInstr(const MCInst &MI, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const; - FeatureBitset computeAvailableFeatures(const FeatureBitset &FB) const; - void - verifyInstructionPredicates(const MCInst &MI, - const FeatureBitset &AvailableFeatures) const; - }; } // end anonymous namespace @@ -90,11 +85,8 @@ MCCodeEmitter *llvm::createR600MCCodeEmitter(const MCInstrInfo &MCII, } void R600MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const { - verifyInstructionPredicates(MI, - 
computeAvailableFeatures(STI.getFeatureBits())); - + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { const MCInstrDesc &Desc = MCII.get(MI.getOpcode()); if (MI.getOpcode() == R600::RETURN || MI.getOpcode() == R600::FETCH_CLAUSE || @@ -187,5 +179,4 @@ uint64_t R600MCCodeEmitter::getMachineOpValue(const MCInst &MI, return MO.getImm(); } -#define ENABLE_INSTR_PREDICATE_VERIFIER #include "R600GenMCCodeEmitter.inc" diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.cpp index 269209a12175..b9ff195e0ddc 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.cpp @@ -13,10 +13,12 @@ #include "R600MCTargetDesc.h" #include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/SubtargetFeature.h" using namespace llvm; #define GET_INSTRINFO_MC_DESC +#define ENABLE_INSTR_PREDICATE_VERIFIER #include "R600GenInstrInfo.inc" MCInstrInfo *llvm::createR600MCInstrInfo() { diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.h index 605ae851378d..b4ce748532f8 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.h +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.h @@ -35,6 +35,7 @@ MCInstrInfo *createR600MCInstrInfo(); #define GET_INSTRINFO_ENUM #define GET_INSTRINFO_OPERAND_ENUM #define GET_INSTRINFO_SCHED_ENUM +#define GET_INSTRINFO_MC_HELPER_DECLS #include "R600GenInstrInfo.inc" #define GET_SUBTARGETINFO_ENUM diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp index 5e67fb5ec876..e093d78b2cc6 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp @@ -310,11 +310,8 @@ uint64_t SIMCCodeEmitter::getImplicitOpSelHiEncoding(int Opcode) const { } void SIMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const { - verifyInstructionPredicates(MI, - computeAvailableFeatures(STI.getFeatureBits())); - + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { int Opcode = MI.getOpcode(); APInt Encoding, Scratch; getBinaryCodeForInstr(MI, Fixups, Encoding, Scratch, STI); @@ -574,5 +571,4 @@ void SIMCCodeEmitter::getMachineOpValueCommon( llvm_unreachable("Encoding of this operand type is not supported yet."); } -#define ENABLE_INSTR_PREDICATE_VERIFIER #include "AMDGPUGenMCCodeEmitter.inc" diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp index bf52f7830ad7..5199a37a0519 100644 --- a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp @@ -1623,7 +1623,7 @@ static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry, NewBldVec); } -SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector, SDValue Swz[4], +SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector, SDValue Swz[], SelectionDAG &DAG, const SDLoc &DL) const { // Old -> New swizzle values diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.h b/llvm/lib/Target/AMDGPU/R600ISelLowering.h index 1e75a0432ec3..e7706fa0ef5c 100644 --- a/llvm/lib/Target/AMDGPU/R600ISelLowering.h +++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.h @@ -74,8 +74,8 @@ private: void lowerImplicitParameter(MachineInstr *MI, MachineBasicBlock &BB, MachineRegisterInfo & MRI, unsigned 
dword_offset) const; - SDValue OptimizeSwizzle(SDValue BuildVector, SDValue Swz[], SelectionDAG &DAG, - const SDLoc &DL) const; + SDValue OptimizeSwizzle(SDValue BuildVector, SDValue Swz[], + SelectionDAG &DAG, const SDLoc &DL) const; SDValue vectorToVerticalVector(SelectionDAG &DAG, SDValue Vector) const; SDValue lowerFrameIndex(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/AMDGPU/R600MCInstLower.cpp b/llvm/lib/Target/AMDGPU/R600MCInstLower.cpp index 8f7807a2b472..f81f5122bbc9 100644 --- a/llvm/lib/Target/AMDGPU/R600MCInstLower.cpp +++ b/llvm/lib/Target/AMDGPU/R600MCInstLower.cpp @@ -13,6 +13,7 @@ // #include "AMDGPUMCInstLower.h" +#include "MCTargetDesc/R600MCTargetDesc.h" #include "R600AsmPrinter.h" #include "R600Subtarget.h" #include "llvm/CodeGen/MachineOperand.h" @@ -42,6 +43,9 @@ void R600MCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const { } void R600AsmPrinter::emitInstruction(const MachineInstr *MI) { + R600_MC::verifyInstructionPredicates(MI->getOpcode(), + getSubtargetInfo().getFeatureBits()); + const R600Subtarget &STI = MF->getSubtarget<R600Subtarget>(); R600MCInstLower MCInstLowering(OutContext, STI, *this); diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 094d5cd58673..d16da2a8b86b 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -352,7 +352,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, // TODO: Generalize to more vector types. setOperationAction({ISD::EXTRACT_VECTOR_ELT, ISD::INSERT_VECTOR_ELT}, {MVT::v2i16, MVT::v2f16, MVT::v2i8, MVT::v4i8, MVT::v8i8, - MVT::v4i16, MVT::v4f16, MVT::v16i16, MVT::v16f16}, + MVT::v4i16, MVT::v4f16}, Custom); // Deal with vec3 vector operations when widened to vec4. diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 814a7c446889..799d34e32d27 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -3335,15 +3335,18 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, (ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() || !RI.isSGPRReg(MBB.getParent()->getRegInfo(), Src0->getReg()))) { MachineInstr *DefMI; - const auto killDef = [&DefMI, &MBB, this]() -> void { + const auto killDef = [&]() -> void { const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); // The only user is the instruction which will be killed. - if (!MRI.hasOneNonDBGUse(DefMI->getOperand(0).getReg())) + Register DefReg = DefMI->getOperand(0).getReg(); + if (!MRI.hasOneNonDBGUse(DefReg)) return; // We cannot just remove the DefMI here, calling pass will crash. DefMI->setDesc(get(AMDGPU::IMPLICIT_DEF)); for (unsigned I = DefMI->getNumOperands() - 1; I != 0; --I) DefMI->removeOperand(I); + if (LV) + LV->getVarInfo(DefReg).AliveBlocks.clear(); }; int64_t Imm; @@ -3982,6 +3985,14 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0); int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1); int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2); + int Src3Idx = -1; + if (Src0Idx == -1) { + // VOPD V_DUAL_* instructions use different operand names. 
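The fallback added here maps the VOPD operand names into the same four index slots. Because one paired component can lack a second source (a dual mov, for example), a -1 may now appear before a valid index, which is why the constant-bus loop later in this hunk switches from break to continue. Reduced to a sketch:

// With VOPD, -1 no longer terminates the list of source indices.
for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx, Src3Idx}) {
  if (OpIdx == -1)
    continue; // unused slot; a later slot may still hold a valid operand
  // ... constant-bus checks on MI.getOperand(OpIdx) ...
}

The X/Y operand lookups follow.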
+ Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0X); + Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1X); + Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0Y); + Src3Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1Y); + } // Make sure the number of operands is correct. const MCInstrDesc &Desc = get(Opcode); @@ -4255,9 +4266,9 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, // Only look at the true operands. Only a real operand can use the constant // bus, and we don't want to check pseudo-operands like the source modifier // flags. - for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx}) { + for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx, Src3Idx}) { if (OpIdx == -1) - break; + continue; const MachineOperand &MO = MI.getOperand(OpIdx); if (usesConstantBus(MRI, MO, MI.getDesc().OpInfo[OpIdx])) { if (MO.isReg()) { diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index 311f9f68e675..1b411eb83eb3 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -1242,6 +1242,9 @@ namespace AMDGPU { int getDPPOp32(uint16_t Opcode); LLVM_READONLY + int getDPPOp64(uint16_t Opcode); + + LLVM_READONLY int getBasicFromSDWAOp(uint16_t Opcode); LLVM_READONLY diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 29ee9f12b12d..23afd6556bc9 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -193,43 +193,32 @@ def SIbuffer_atomic_fadd : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FADD">; def SIbuffer_atomic_fmin : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FMIN">; def SIbuffer_atomic_fmax : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FMAX">; -multiclass SDBufferAtomicRetNoRet { - def "_ret" : PatFrag< - (ops node:$vdata_in, node:$rsrc, node:$vindex, node:$voffset, node:$soffset, - node:$offset, node:$cachepolicy, node:$idxen), - (!cast<SDNode>(NAME) node:$vdata_in, node:$rsrc, node:$vindex, - node:$voffset, node:$soffset, node:$offset, node:$cachepolicy, - node:$idxen)> { - let PredicateCode = [{ return !(SDValue(N, 0).use_empty()); }]; - let GISelPredicateCode = [{ return true; }]; - } - +multiclass SDBufferAtomicNoRet { def "_noret" : PatFrag< (ops node:$vdata_in, node:$rsrc, node:$vindex, node:$voffset, node:$soffset, node:$offset, node:$cachepolicy, node:$idxen), (!cast<SDNode>(NAME) node:$vdata_in, node:$rsrc, node:$vindex, node:$voffset, node:$soffset, node:$offset, node:$cachepolicy, node:$idxen)> { - let PredicateCode = [{ return SDValue(N, 0).use_empty(); }]; - let GISelPredicateCode = [{ return false; }]; + let HasNoUse = true; } } -defm SIbuffer_atomic_swap : SDBufferAtomicRetNoRet; -defm SIbuffer_atomic_add : SDBufferAtomicRetNoRet; -defm SIbuffer_atomic_sub : SDBufferAtomicRetNoRet; -defm SIbuffer_atomic_smin : SDBufferAtomicRetNoRet; -defm SIbuffer_atomic_umin : SDBufferAtomicRetNoRet; -defm SIbuffer_atomic_smax : SDBufferAtomicRetNoRet; -defm SIbuffer_atomic_umax : SDBufferAtomicRetNoRet; -defm SIbuffer_atomic_and : SDBufferAtomicRetNoRet; -defm SIbuffer_atomic_or : SDBufferAtomicRetNoRet; -defm SIbuffer_atomic_xor : SDBufferAtomicRetNoRet; -defm SIbuffer_atomic_inc : SDBufferAtomicRetNoRet; -defm SIbuffer_atomic_dec : SDBufferAtomicRetNoRet; -defm SIbuffer_atomic_fadd : SDBufferAtomicRetNoRet; -defm SIbuffer_atomic_fmin : SDBufferAtomicRetNoRet; -defm SIbuffer_atomic_fmax : SDBufferAtomicRetNoRet; +defm SIbuffer_atomic_swap : SDBufferAtomicNoRet; +defm 
SIbuffer_atomic_add : SDBufferAtomicNoRet; +defm SIbuffer_atomic_sub : SDBufferAtomicNoRet; +defm SIbuffer_atomic_smin : SDBufferAtomicNoRet; +defm SIbuffer_atomic_umin : SDBufferAtomicNoRet; +defm SIbuffer_atomic_smax : SDBufferAtomicNoRet; +defm SIbuffer_atomic_umax : SDBufferAtomicNoRet; +defm SIbuffer_atomic_and : SDBufferAtomicNoRet; +defm SIbuffer_atomic_or : SDBufferAtomicNoRet; +defm SIbuffer_atomic_xor : SDBufferAtomicNoRet; +defm SIbuffer_atomic_inc : SDBufferAtomicNoRet; +defm SIbuffer_atomic_dec : SDBufferAtomicNoRet; +defm SIbuffer_atomic_fadd : SDBufferAtomicNoRet; +defm SIbuffer_atomic_fmin : SDBufferAtomicNoRet; +defm SIbuffer_atomic_fmax : SDBufferAtomicNoRet; def SIbuffer_atomic_cmpswap : SDNode <"AMDGPUISD::BUFFER_ATOMIC_CMPSWAP", SDTypeProfile<1, 9, @@ -246,24 +235,13 @@ def SIbuffer_atomic_cmpswap : SDNode <"AMDGPUISD::BUFFER_ATOMIC_CMPSWAP", [SDNPMemOperand, SDNPHasChain, SDNPMayLoad, SDNPMayStore] >; -def SIbuffer_atomic_cmpswap_ret : PatFrag< - (ops node:$src, node:$cmp, node:$rsrc, node:$vindex, node:$voffset, - node:$soffset, node:$offset, node:$cachepolicy, node:$idxen), - (SIbuffer_atomic_cmpswap node:$src, node:$cmp, node:$rsrc, node:$vindex, - node:$voffset, node:$soffset, node:$offset, node:$cachepolicy, - node:$idxen)> { - let PredicateCode = [{ return !(SDValue(N, 0).use_empty()); }]; - let GISelPredicateCode = [{ return true; }]; -} - def SIbuffer_atomic_cmpswap_noret : PatFrag< (ops node:$src, node:$cmp, node:$rsrc, node:$vindex, node:$voffset, node:$soffset, node:$offset, node:$cachepolicy, node:$idxen), (SIbuffer_atomic_cmpswap node:$src, node:$cmp, node:$rsrc, node:$vindex, node:$voffset, node:$soffset, node:$offset, node:$cachepolicy, node:$idxen)> { - let PredicateCode = [{ return SDValue(N, 0).use_empty(); }]; - let GISelPredicateCode = [{ return false; }]; + let HasNoUse = true; } class SDGlobalAtomicNoRtn<string opcode, ValueType ty> : SDNode <opcode, @@ -774,13 +752,13 @@ multiclass SIAtomicM0Glue2 <string op_name, bit is_amdgpu = 0, let AddressSpaces = StoreAddress_local.AddrSpaces in { defm _local_m0 : binary_atomic_op <!cast<SDNode>(NAME#"_glue"), IsInt>; - defm _local_m0 : ret_noret_binary_atomic_op <!cast<SDNode>(NAME#"_glue"), + defm _local_m0 : noret_binary_atomic_op <!cast<SDNode>(NAME#"_glue"), IsInt>; } let AddressSpaces = StoreAddress_region.AddrSpaces in { defm _region_m0 : binary_atomic_op <!cast<SDNode>(NAME#"_glue"), IsInt>; - defm _region_m0 : ret_noret_binary_atomic_op <!cast<SDNode>(NAME#"_glue"), + defm _region_m0 : noret_binary_atomic_op <!cast<SDNode>(NAME#"_glue"), IsInt>; } } @@ -2194,21 +2172,21 @@ class getAsmVOP3DPPBase <int NumSrcArgs, bit HasDst, bit HasClamp, "$sdst", "$vdst"), ""); // use $sdst for VOPC - string isrc0 = !if(!eq(NumSrcArgs, 1), "$src0", "$src0,"); - string isrc1 = !if(!eq(NumSrcArgs, 1), "", - !if(!eq(NumSrcArgs, 2), " $src1", - " $src1,")); - string isrc2 = !if(!eq(NumSrcArgs, 3), " $src2", ""); - - string fsrc0 = !if(!eq(NumSrcArgs, 1), "$src0_modifiers", "$src0_modifiers,"); - string fsrc1 = !if(!eq(NumSrcArgs, 1), "", - !if(!eq(NumSrcArgs, 2), " $src1_modifiers", - " $src1_modifiers,")); - string fsrc2 = !if(!eq(NumSrcArgs, 3), " $src2_modifiers", ""); - - string src0 = !if(Src0HasMods, fsrc0, isrc0); - string src1 = !if(Src1HasMods, fsrc1, isrc1); - string src2 = !if(Src2HasMods, fsrc2, isrc2); + string src0nomods = !if(!eq(NumSrcArgs, 1), "$src0", "$src0,"); + string src1nomods = !if(!eq(NumSrcArgs, 1), "", + !if(!eq(NumSrcArgs, 2), " $src1", + " $src1,")); + string src2nomods = 
!if(!eq(NumSrcArgs, 3), " $src2", ""); + + string src0mods = !if(!eq(NumSrcArgs, 1), "$src0_modifiers", "$src0_modifiers,"); + string src1mods = !if(!eq(NumSrcArgs, 1), "", + !if(!eq(NumSrcArgs, 2), " $src1_modifiers", + " $src1_modifiers,")); + string src2mods = !if(!eq(NumSrcArgs, 3), " $src2_modifiers", ""); + + string src0 = !if(Src0HasMods, src0mods, src0nomods); + string src1 = !if(Src1HasMods, src1mods, src1nomods); + string src2 = !if(Src2HasMods, src2mods, src2nomods); string opsel = !if(HasOpSel, "$op_sel", ""); string 3PMods = !if(IsVOP3P, !if(HasOpSel, "$op_sel_hi", "") @@ -2559,8 +2537,8 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableF32SrcMods = 0, // the asm operand name via this HasModifiers flag field string AsmDPP8 = getAsmDPP8<HasDst, NumSrcArgs, 0 /*HasModifiers*/, DstVT>.ret; field string AsmVOP3DPPBase = getAsmVOP3DPPBase<NumSrcArgs, HasDst, HasClamp, - HasOpSel, HasOMod, IsVOP3P, HasModifiers, HasSrc0FloatMods, HasSrc1FloatMods, - HasSrc2FloatMods, DstVT >.ret; + HasOpSel, HasOMod, IsVOP3P, HasModifiers, HasModifiers, HasModifiers, + HasModifiers, DstVT>.ret; field string AsmVOP3DPP = getAsmVOP3DPP<AsmVOP3DPPBase>.ret; field string AsmVOP3DPP16 = getAsmVOP3DPP16<AsmVOP3DPPBase>.ret; field string AsmVOP3DPP8 = getAsmVOP3DPP8<AsmVOP3DPPBase>.ret; @@ -2800,6 +2778,14 @@ def getDPPOp32 : InstrMapping { let ValueCols = [["DPP"]]; } +def getDPPOp64 : InstrMapping { + let FilterClass = "VOP"; + let RowFields = ["OpName"]; + let ColFields = ["AsmVariantName"]; + let KeyCol = ["VOP3"]; + let ValueCols = [["VOP3_DPP"]]; +} + // Maps a commuted opcode to its original version def getCommuteOrig : InstrMapping { let FilterClass = "Commutable_REV"; @@ -2961,6 +2947,27 @@ def getVCMPXOpFromVCMP : InstrMapping { let ValueCols = [["1"]]; } +def VOPDComponentTable : GenericTable { + let FilterClass = "VOPD_Component"; + let CppTypeName = "VOPDComponentInfo"; + let Fields = ["BaseVOP", "VOPDOp", "CanBeVOPDX"]; + let PrimaryKey = ["BaseVOP"]; + let PrimaryKeyName = "getVOPDComponentHelper"; +} + +def VOPDPairs : GenericTable { + let FilterClass = "VOPD_Base"; + let CppTypeName = "VOPDInfo"; + let Fields = ["Opcode", "OpX", "OpY"]; + let PrimaryKey = ["Opcode"]; + let PrimaryKeyName = "getVOPDOpcodeHelper"; +} + +def getVOPDInfoFromComponentOpcodes : SearchIndex { + let Table = VOPDPairs; + let Key = ["OpX", "OpY"]; +} + include "SIInstructions.td" include "DSInstructions.td" diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 829669157893..ce8c03bb8d64 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -1449,6 +1449,14 @@ def : BitConvert <v8i32, v16f16, VReg_256>; def : BitConvert <v8i32, v16i16, VReg_256>; def : BitConvert <v8f32, v16f16, VReg_256>; def : BitConvert <v8f32, v16i16, VReg_256>; +def : BitConvert <v16f16, v4i64, VReg_256>; +def : BitConvert <v16i16, v4i64, VReg_256>; +def : BitConvert <v16f16, v4f64, VReg_256>; +def : BitConvert <v16i16, v4f64, VReg_256>; +def : BitConvert <v4i64, v16f16, VReg_256>; +def : BitConvert <v4i64, v16i16, VReg_256>; +def : BitConvert <v4f64, v16f16, VReg_256>; +def : BitConvert <v4f64, v16i16, VReg_256>; // 512-bit bitcast def : BitConvert <v16i32, v16f32, VReg_512>; @@ -3012,6 +3020,35 @@ multiclass Int16Med3Pat<Instruction med3Inst, def : FPMed3Pat<f32, V_MED3_F32_e64>; +class +IntMinMaxPat<Instruction minmaxInst, SDPatternOperator min_or_max, + SDPatternOperator max_or_min_oneuse> : AMDGPUPat < + (DivergentBinFrag<min_or_max>
(max_or_min_oneuse i32:$src0, i32:$src1), + i32:$src2), + (minmaxInst VSrc_b32:$src0, VSrc_b32:$src1, VSrc_b32:$src2) +>; + +class +FPMinMaxPat<Instruction minmaxInst, ValueType vt, SDPatternOperator min_or_max, + SDPatternOperator max_or_min_oneuse> : GCNPat < + (min_or_max (max_or_min_oneuse (VOP3Mods vt:$src0, i32:$src0_mods), + (VOP3Mods vt:$src1, i32:$src1_mods)), + (vt (VOP3Mods vt:$src2, i32:$src2_mods))), + (minmaxInst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2, + DSTCLAMP.NONE, DSTOMOD.NONE) +>; + +let OtherPredicates = [isGFX11Plus] in { +def : IntMinMaxPat<V_MAXMIN_I32_e64, smin, smax_oneuse>; +def : IntMinMaxPat<V_MINMAX_I32_e64, smax, smin_oneuse>; +def : IntMinMaxPat<V_MAXMIN_U32_e64, umin, umax_oneuse>; +def : IntMinMaxPat<V_MINMAX_U32_e64, umax, umin_oneuse>; +def : FPMinMaxPat<V_MINMAX_F32_e64, f32, fmaxnum_like, fminnum_like_oneuse>; +def : FPMinMaxPat<V_MAXMIN_F32_e64, f32, fminnum_like, fmaxnum_like_oneuse>; +def : FPMinMaxPat<V_MINMAX_F16_e64, f16, fmaxnum_like, fminnum_like_oneuse>; +def : FPMinMaxPat<V_MAXMIN_F16_e64, f16, fminnum_like, fmaxnum_like_oneuse>; +} + let OtherPredicates = [isGFX9Plus] in { def : FP16Med3Pat<f16, V_MED3_F16_e64>; defm : Int16Med3Pat<V_MED3_I16_e64, smin, smax, smax_oneuse, smin_oneuse>; diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp index 607383ab8cde..67077a2eaa6b 100644 --- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp @@ -148,6 +148,7 @@ public: } void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addUsedIfAvailable<LiveIntervals>(); // Should preserve the same set that TwoAddressInstructions does. AU.addPreserved<MachineDominatorTree>(); AU.addPreserved<SlotIndexes>(); diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp index dd881ec42d53..786b6b61cb23 100644 --- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp @@ -72,7 +72,7 @@ INITIALIZE_PASS_END(SILowerSGPRSpills, DEBUG_TYPE, char &llvm::SILowerSGPRSpillsID = SILowerSGPRSpills::ID; -/// Insert restore code for the callee-saved registers used in the function. +/// Insert spill code for the callee-saved registers used in the function. 
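The GFX11 patterns above fuse a single-use min or max feeding the opposite operation into one VALU instruction. Scalar semantics of the integer variants, as the pattern operands imply (a sketch only; the actual matching happens on the SelectionDAG, and the inner node must have a single use):

#include <algorithm>
#include <cstdint>

// V_MAXMIN_I32: inner max, outer min. V_MINMAX_I32 is the mirror image;
// the unsigned and f32/f16 variants follow the same shape.
static int32_t maxmin_i32(int32_t s0, int32_t s1, int32_t s2) {
  return std::min(std::max(s0, s1), s2);
}
static int32_t minmax_i32(int32_t s0, int32_t s1, int32_t s2) {
  return std::max(std::min(s0, s1), s2);
}

The insertCSRSaves definition that the corrected comment above documents continues below.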
static void insertCSRSaves(MachineBasicBlock &SaveBlock, ArrayRef<CalleeSavedInfo> CSI, LiveIntervals *LIS) { diff --git a/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp b/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp index e426e938b856..ff5587fbb0ca 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp @@ -1883,7 +1883,13 @@ void SIScheduleDAGMI::schedule() LLVM_DEBUG(dbgs() << "Preparing Scheduling\n"); buildDAGWithRegPressure(); + postprocessDAG(); + LLVM_DEBUG(dump()); + if (PrintDAGs) + dump(); + if (ViewMISchedDAGs) + viewGraph(); topologicalSort(); findRootsAndBiasEdges(TopRoots, BotRoots); diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index 8a66213931ff..6b93769949bc 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -2329,13 +2329,13 @@ bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) { continue; if (const auto &MOI = MOA.getLoadInfo(MI)) - Changed |= expandLoad(MOI.getValue(), MI); + Changed |= expandLoad(MOI.value(), MI); else if (const auto &MOI = MOA.getStoreInfo(MI)) - Changed |= expandStore(MOI.getValue(), MI); + Changed |= expandStore(MOI.value(), MI); else if (const auto &MOI = MOA.getAtomicFenceInfo(MI)) - Changed |= expandAtomicFence(MOI.getValue(), MI); + Changed |= expandAtomicFence(MOI.value(), MI); else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI)) - Changed |= expandAtomicCmpxchgOrRmw(MOI.getValue(), MI); + Changed |= expandAtomicCmpxchgOrRmw(MOI.value(), MI); } } diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp index 5215397d5936..66bc46aaefea 100644 --- a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp +++ b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp @@ -9,6 +9,7 @@ #include "AMDGPU.h" #include "GCNSubtarget.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "SIRegisterInfo.h" #include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/InitializePasses.h" @@ -20,10 +21,40 @@ using namespace llvm; namespace { class SIOptimizeExecMasking : public MachineFunctionPass { + MachineFunction *MF = nullptr; + const GCNSubtarget *ST = nullptr; + const SIRegisterInfo *TRI = nullptr; + const SIInstrInfo *TII = nullptr; + const MachineRegisterInfo *MRI = nullptr; + + Register isCopyFromExec(const MachineInstr &MI) const; + Register isCopyToExec(const MachineInstr &MI) const; + bool removeTerminatorBit(MachineInstr &MI) const; + MachineBasicBlock::reverse_iterator + fixTerminators(MachineBasicBlock &MBB) const; + MachineBasicBlock::reverse_iterator + findExecCopy(MachineBasicBlock &MBB, MachineBasicBlock::reverse_iterator I, + unsigned CopyToExec) const; + + bool isRegisterInUseBetween(MachineInstr &Stop, MachineInstr &Start, + MCRegister Reg, bool UseLiveOuts = false, + bool IgnoreStart = false) const; + bool isRegisterInUseAfter(MachineInstr &Stop, MCRegister Reg) const; + MachineInstr *findInstrBackwards(MachineInstr &Origin, + std::function<bool(MachineInstr *)> Pred, + ArrayRef<MCRegister> NonModifiableRegs, + unsigned MaxInstructions = 20) const; + MachineInstr *findPossibleVCMPVCMPXOptimization(MachineInstr &SaveExec, + MCRegister Exec) const; + bool optimizeExecSequence() const; + bool optimizeVCmpxAndSaveexecSequence() const; + bool optimizeSingleVCMPSaveExecSequence(MachineInstr &SaveExecInstr, + MachineInstr &VCmp, + MCRegister Exec) const; + public: 
static char ID; -public: SIOptimizeExecMasking() : MachineFunctionPass(ID) { initializeSIOptimizeExecMaskingPass(*PassRegistry::getPassRegistry()); } @@ -53,7 +84,7 @@ char SIOptimizeExecMasking::ID = 0; char &llvm::SIOptimizeExecMaskingID = SIOptimizeExecMasking::ID; /// If \p MI is a copy from exec, return the register copied to. -static Register isCopyFromExec(const MachineInstr &MI, const GCNSubtarget &ST) { +Register SIOptimizeExecMasking::isCopyFromExec(const MachineInstr &MI) const { switch (MI.getOpcode()) { case AMDGPU::COPY: case AMDGPU::S_MOV_B64: @@ -61,8 +92,7 @@ static Register isCopyFromExec(const MachineInstr &MI, const GCNSubtarget &ST) { case AMDGPU::S_MOV_B32: case AMDGPU::S_MOV_B32_term: { const MachineOperand &Src = MI.getOperand(1); - if (Src.isReg() && - Src.getReg() == (ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC)) + if (Src.isReg() && Src.getReg() == TRI->getExec()) return MI.getOperand(0).getReg(); } } @@ -71,14 +101,13 @@ static Register isCopyFromExec(const MachineInstr &MI, const GCNSubtarget &ST) { } /// If \p MI is a copy to exec, return the register copied from. -static Register isCopyToExec(const MachineInstr &MI, const GCNSubtarget &ST) { +Register SIOptimizeExecMasking::isCopyToExec(const MachineInstr &MI) const { switch (MI.getOpcode()) { case AMDGPU::COPY: case AMDGPU::S_MOV_B64: case AMDGPU::S_MOV_B32: { const MachineOperand &Dst = MI.getOperand(0); - if (Dst.isReg() && - Dst.getReg() == (ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC) && + if (Dst.isReg() && Dst.getReg() == TRI->getExec() && MI.getOperand(1).isReg()) return MI.getOperand(1).getReg(); break; @@ -173,64 +202,64 @@ static unsigned getSaveExecOp(unsigned Opc) { // These are only terminators to get correct spill code placement during // register allocation, so turn them back into normal instructions. -static bool removeTerminatorBit(const SIInstrInfo &TII, MachineInstr &MI) { +bool SIOptimizeExecMasking::removeTerminatorBit(MachineInstr &MI) const { switch (MI.getOpcode()) { case AMDGPU::S_MOV_B32_term: { bool RegSrc = MI.getOperand(1).isReg(); - MI.setDesc(TII.get(RegSrc ? AMDGPU::COPY : AMDGPU::S_MOV_B32)); + MI.setDesc(TII->get(RegSrc ? AMDGPU::COPY : AMDGPU::S_MOV_B32)); return true; } case AMDGPU::S_MOV_B64_term: { bool RegSrc = MI.getOperand(1).isReg(); - MI.setDesc(TII.get(RegSrc ? AMDGPU::COPY : AMDGPU::S_MOV_B64)); + MI.setDesc(TII->get(RegSrc ? AMDGPU::COPY : AMDGPU::S_MOV_B64)); return true; } case AMDGPU::S_XOR_B64_term: { // This is only a terminator to get the correct spill code placement during // register allocation. - MI.setDesc(TII.get(AMDGPU::S_XOR_B64)); + MI.setDesc(TII->get(AMDGPU::S_XOR_B64)); return true; } case AMDGPU::S_XOR_B32_term: { // This is only a terminator to get the correct spill code placement during // register allocation. - MI.setDesc(TII.get(AMDGPU::S_XOR_B32)); + MI.setDesc(TII->get(AMDGPU::S_XOR_B32)); return true; } case AMDGPU::S_OR_B64_term: { // This is only a terminator to get the correct spill code placement during // register allocation. - MI.setDesc(TII.get(AMDGPU::S_OR_B64)); + MI.setDesc(TII->get(AMDGPU::S_OR_B64)); return true; } case AMDGPU::S_OR_B32_term: { // This is only a terminator to get the correct spill code placement during // register allocation. - MI.setDesc(TII.get(AMDGPU::S_OR_B32)); + MI.setDesc(TII->get(AMDGPU::S_OR_B32)); return true; } case AMDGPU::S_ANDN2_B64_term: { // This is only a terminator to get the correct spill code placement during // register allocation. 
- MI.setDesc(TII.get(AMDGPU::S_ANDN2_B64)); + MI.setDesc(TII->get(AMDGPU::S_ANDN2_B64)); return true; } case AMDGPU::S_ANDN2_B32_term: { // This is only a terminator to get the correct spill code placement during // register allocation. - MI.setDesc(TII.get(AMDGPU::S_ANDN2_B32)); + MI.setDesc(TII->get(AMDGPU::S_ANDN2_B32)); return true; } case AMDGPU::S_AND_B64_term: { // This is only a terminator to get the correct spill code placement during // register allocation. - MI.setDesc(TII.get(AMDGPU::S_AND_B64)); + MI.setDesc(TII->get(AMDGPU::S_AND_B64)); return true; } case AMDGPU::S_AND_B32_term: { // This is only a terminator to get the correct spill code placement during // register allocation. - MI.setDesc(TII.get(AMDGPU::S_AND_B32)); + MI.setDesc(TII->get(AMDGPU::S_AND_B32)); return true; } default: @@ -241,9 +270,8 @@ static bool removeTerminatorBit(const SIInstrInfo &TII, MachineInstr &MI) { // Turn all pseudoterminators in the block into their equivalent non-terminator // instructions. Returns the reverse iterator to the first non-terminator // instruction in the block. -static MachineBasicBlock::reverse_iterator fixTerminators( - const SIInstrInfo &TII, - MachineBasicBlock &MBB) { +MachineBasicBlock::reverse_iterator +SIOptimizeExecMasking::fixTerminators(MachineBasicBlock &MBB) const { MachineBasicBlock::reverse_iterator I = MBB.rbegin(), E = MBB.rend(); bool Seen = false; @@ -252,7 +280,7 @@ static MachineBasicBlock::reverse_iterator fixTerminators( if (!I->isTerminator()) return Seen ? FirstNonTerm : I; - if (removeTerminatorBit(TII, *I)) { + if (removeTerminatorBit(*I)) { if (!Seen) { FirstNonTerm = I; Seen = true; @@ -263,17 +291,15 @@ static MachineBasicBlock::reverse_iterator fixTerminators( return FirstNonTerm; } -static MachineBasicBlock::reverse_iterator findExecCopy( - const SIInstrInfo &TII, - const GCNSubtarget &ST, - MachineBasicBlock &MBB, - MachineBasicBlock::reverse_iterator I, - unsigned CopyToExec) { +MachineBasicBlock::reverse_iterator +SIOptimizeExecMasking::findExecCopy(MachineBasicBlock &MBB, + MachineBasicBlock::reverse_iterator I, + unsigned CopyToExec) const { const unsigned InstLimit = 25; auto E = MBB.rend(); for (unsigned N = 0; N <= InstLimit && I != E; ++I, ++N) { - Register CopyFromExec = isCopyFromExec(*I, ST); + Register CopyFromExec = isCopyFromExec(*I); if (CopyFromExec.isValid()) return I; } @@ -298,11 +324,9 @@ static bool isLiveOut(const MachineBasicBlock &MBB, unsigned Reg) { // an arbitrary condition based on the current MachineInstr, for instance an // target instruction. Breaks prematurely by returning nullptr if one of the // registers given in NonModifiableRegs is modified by the current instruction. 
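findInstrBackwards, documented above, caps its walk so the search stays cheap (MaxInstructions defaults to 20). The loop reduced to a sketch; the real helper also skips debug instructions and returns nullptr when a register in NonModifiableRegs is clobbered:

// Walk at most MaxInstructions backwards from Origin; return the first
// instruction satisfying Pred, or nullptr if the window is exhausted.
auto A = Origin.getReverseIterator();
auto E = Origin.getParent()->rend();
unsigned Steps = 0;
for (++A; Steps < MaxInstructions && A != E; ++A, ++Steps)
  if (Pred(&*A))
    return &*A;
return nullptr;

Its conversion from a static helper to a member function follows.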
-static MachineInstr * -findInstrBackwards(MachineInstr &Origin, - std::function<bool(MachineInstr *)> Pred, - ArrayRef<MCRegister> NonModifiableRegs, - const SIRegisterInfo *TRI, unsigned MaxInstructions = 20) { +MachineInstr *SIOptimizeExecMasking::findInstrBackwards( + MachineInstr &Origin, std::function<bool(MachineInstr *)> Pred, + ArrayRef<MCRegister> NonModifiableRegs, unsigned MaxInstructions) const { MachineBasicBlock::reverse_iterator A = Origin.getReverseIterator(), E = Origin.getParent()->rend(); unsigned CurrentIteration = 0; @@ -310,7 +334,7 @@ findInstrBackwards(MachineInstr &Origin, for (++A; CurrentIteration < MaxInstructions && A != E; ++A) { if (A->isDebugInstr()) continue; - + if (Pred(&*A)) return &*A; @@ -318,209 +342,64 @@ findInstrBackwards(MachineInstr &Origin, if (A->modifiesRegister(Reg, TRI)) return nullptr; } - + ++CurrentIteration; } return nullptr; } - // Determine if a register Reg is not re-defined and still in use // in the range (Stop..Start]. // It does so by backwards calculating liveness from the end of the BB until // either Stop or the beginning of the BB is reached. // After liveness is calculated, we can determine if Reg is still in use and not // defined inbetween the instructions. -static bool isRegisterInUseBetween(MachineInstr &Stop, MachineInstr &Start, - MCRegister Reg, const SIRegisterInfo *TRI, - MachineRegisterInfo &MRI, - bool useLiveOuts = false, - bool ignoreStart = false) { +bool SIOptimizeExecMasking::isRegisterInUseBetween(MachineInstr &Stop, + MachineInstr &Start, + MCRegister Reg, + bool UseLiveOuts, + bool IgnoreStart) const { LivePhysRegs LR(*TRI); - if (useLiveOuts) + if (UseLiveOuts) LR.addLiveOuts(*Stop.getParent()); MachineBasicBlock::reverse_iterator A(Start); MachineBasicBlock::reverse_iterator E(Stop); - if (ignoreStart) + if (IgnoreStart) ++A; for (; A != Stop.getParent()->rend() && A != Stop; ++A) { LR.stepBackward(*A); } - return !LR.available(MRI, Reg); + return !LR.available(*MRI, Reg); } // Determine if a register Reg is not re-defined and still in use // in the range (Stop..BB.end]. -static bool isRegisterInUseAfter(MachineInstr &Stop, MCRegister Reg, - const SIRegisterInfo *TRI, - MachineRegisterInfo &MRI) { - return isRegisterInUseBetween(Stop, *Stop.getParent()->rbegin(), Reg, TRI, - MRI, true); +bool SIOptimizeExecMasking::isRegisterInUseAfter(MachineInstr &Stop, + MCRegister Reg) const { + return isRegisterInUseBetween(Stop, *Stop.getParent()->rbegin(), Reg, true); } -// Tries to find a possibility to optimize a v_cmp ..., s_and_saveexec sequence -// by looking at an instance of a s_and_saveexec instruction. Returns a pointer -// to the v_cmp instruction if it is safe to replace the sequence (see the -// conditions in the function body). This is after register allocation, so some -// checks on operand dependencies need to be considered. -static MachineInstr *findPossibleVCMPVCMPXOptimization( - MachineInstr &SaveExec, MCRegister Exec, const SIRegisterInfo *TRI, - const SIInstrInfo *TII, MachineRegisterInfo &MRI) { - - MachineInstr *VCmp = nullptr; - - Register SaveExecDest = SaveExec.getOperand(0).getReg(); - if (!TRI->isSGPRReg(MRI, SaveExecDest)) - return nullptr; - - MachineOperand *SaveExecSrc0 = - TII->getNamedOperand(SaveExec, AMDGPU::OpName::src0); - if (!SaveExecSrc0->isReg()) - return nullptr; - - // Try to find the last v_cmp instruction that defs the saveexec input - // operand without any write to Exec or the saveexec input operand inbetween. 
- VCmp = findInstrBackwards( - SaveExec, - [&](MachineInstr *Check) { - return AMDGPU::getVCMPXOpFromVCMP(Check->getOpcode()) != -1 && - Check->modifiesRegister(SaveExecSrc0->getReg(), TRI); - }, - {Exec, SaveExecSrc0->getReg()}, TRI); - - if (!VCmp) - return nullptr; - - MachineOperand *VCmpDest = TII->getNamedOperand(*VCmp, AMDGPU::OpName::sdst); - assert(VCmpDest && "Should have an sdst operand!"); - - // Check if any of the v_cmp source operands is written by the saveexec. - MachineOperand *Src0 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src0); - if (Src0->isReg() && TRI->isSGPRReg(MRI, Src0->getReg()) && - SaveExec.modifiesRegister(Src0->getReg(), TRI)) - return nullptr; - - MachineOperand *Src1 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src1); - if (Src1->isReg() && TRI->isSGPRReg(MRI, Src1->getReg()) && - SaveExec.modifiesRegister(Src1->getReg(), TRI)) - return nullptr; - - // Don't do the transformation if the destination operand is included in - // it's MBB Live-outs, meaning it's used in any of it's successors, leading - // to incorrect code if the v_cmp and therefore the def of - // the dest operand is removed. - if (isLiveOut(*VCmp->getParent(), VCmpDest->getReg())) - return nullptr; - - // If the v_cmp target is in use between v_cmp and s_and_saveexec or after the - // s_and_saveexec, skip the optimization. - if (isRegisterInUseBetween(*VCmp, SaveExec, VCmpDest->getReg(), TRI, MRI, - false, true) || - isRegisterInUseAfter(SaveExec, VCmpDest->getReg(), TRI, MRI)) - return nullptr; - - // Try to determine if there is a write to any of the VCmp - // operands between the saveexec and the vcmp. - // If yes, additional VGPR spilling might need to be inserted. In this case, - // it's not worth replacing the instruction sequence. - SmallVector<MCRegister, 2> NonDefRegs; - if (Src0->isReg()) - NonDefRegs.push_back(Src0->getReg()); - - if (Src1->isReg()) - NonDefRegs.push_back(Src1->getReg()); - - if (!findInstrBackwards( - SaveExec, [&](MachineInstr *Check) { return Check == VCmp; }, - NonDefRegs, TRI)) - return nullptr; - - return VCmp; -} - -// Inserts the optimized s_mov_b32 / v_cmpx sequence based on the -// operands extracted from a v_cmp ..., s_and_saveexec pattern. -static bool optimizeVCMPSaveExecSequence(MachineInstr &SaveExecInstr, - MachineInstr &VCmp, MCRegister Exec, - const SIInstrInfo *TII, - const SIRegisterInfo *TRI, - MachineRegisterInfo &MRI) { - const int NewOpcode = AMDGPU::getVCMPXOpFromVCMP(VCmp.getOpcode()); - - if (NewOpcode == -1) - return false; - - MachineOperand *Src0 = TII->getNamedOperand(VCmp, AMDGPU::OpName::src0); - MachineOperand *Src1 = TII->getNamedOperand(VCmp, AMDGPU::OpName::src1); - - Register MoveDest = SaveExecInstr.getOperand(0).getReg(); - - MachineBasicBlock::instr_iterator InsertPosIt = SaveExecInstr.getIterator(); - if (!SaveExecInstr.uses().empty()) { - bool isSGPR32 = TRI->getRegSizeInBits(MoveDest, MRI) == 32; - unsigned MovOpcode = isSGPR32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; - BuildMI(*SaveExecInstr.getParent(), InsertPosIt, - SaveExecInstr.getDebugLoc(), TII->get(MovOpcode), MoveDest) - .addReg(Exec); - } - - // Omit dst as V_CMPX is implicitly writing to EXEC. - // Add dummy src and clamp modifiers, if needed. 
- auto Builder = BuildMI(*VCmp.getParent(), std::next(InsertPosIt), - VCmp.getDebugLoc(), TII->get(NewOpcode)); - - auto TryAddImmediateValueFromNamedOperand = - [&](unsigned OperandName) -> void { - if (auto *Mod = TII->getNamedOperand(VCmp, OperandName)) - Builder.addImm(Mod->getImm()); - }; - - TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::src0_modifiers); - Builder.add(*Src0); - - TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::src1_modifiers); - Builder.add(*Src1); - - TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::clamp); - - // The kill flags may no longer be correct. - if (Src0->isReg()) - MRI.clearKillFlags(Src0->getReg()); - if (Src1->isReg()) - MRI.clearKillFlags(Src1->getReg()); - - return true; -} - -bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) { - if (skipFunction(MF.getFunction())) - return false; - - const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); - const SIRegisterInfo *TRI = ST.getRegisterInfo(); - const SIInstrInfo *TII = ST.getInstrInfo(); - MachineRegisterInfo *MRI = &MF.getRegInfo(); - MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; - - // Optimize sequences emitted for control flow lowering. They are originally - // emitted as the separate operations because spill code may need to be - // inserted for the saved copy of exec. - // - // x = copy exec - // z = s_<op>_b64 x, y - // exec = copy z - // => - // x = s_<op>_saveexec_b64 y - // +// Optimize sequences emitted for control flow lowering. They are originally +// emitted as separate operations because spill code may need to be +// inserted for the saved copy of exec. +// +// x = copy exec +// z = s_<op>_b64 x, y +// exec = copy z +// => +// x = s_<op>_saveexec_b64 y +// +bool SIOptimizeExecMasking::optimizeExecSequence() const { + MCRegister Exec = TRI->getExec(); bool Changed = false; - for (MachineBasicBlock &MBB : MF) { + for (MachineBasicBlock &MBB : *MF) { + MachineBasicBlock::reverse_iterator I = fixTerminators(MBB); - MachineBasicBlock::reverse_iterator I = fixTerminators(*TII, MBB); MachineBasicBlock::reverse_iterator E = MBB.rend(); if (I == E) continue; @@ -532,7 +411,7 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) { unsigned SearchCount = 0; const unsigned SearchLimit = 5; while (I != E && SearchCount++ < SearchLimit) { - CopyToExec = isCopyToExec(*I, ST); + CopyToExec = isCopyToExec(*I); if (CopyToExec) break; ++I; } @@ -542,8 +421,8 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) { continue; // Scan backwards to find the def. 
- auto CopyToExecInst = &*I; - auto CopyFromExecInst = findExecCopy(*TII, ST, MBB, I, CopyToExec); + auto *CopyToExecInst = &*I; + auto CopyFromExecInst = findExecCopy(MBB, I, CopyToExec); if (CopyFromExecInst == E) { auto PrepareExecInst = std::next(I); if (PrepareExecInst == E) @@ -574,8 +453,9 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) { MachineInstr *SaveExecInst = nullptr; SmallVector<MachineInstr *, 4> OtherUseInsts; - for (MachineBasicBlock::iterator J - = std::next(CopyFromExecInst->getIterator()), JE = I->getIterator(); + for (MachineBasicBlock::iterator + J = std::next(CopyFromExecInst->getIterator()), + JE = I->getIterator(); J != JE; ++J) { if (SaveExecInst && J->readsRegister(Exec, TRI)) { LLVM_DEBUG(dbgs() << "exec read prevents saveexec: " << *J << '\n'); @@ -655,58 +535,210 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) { BuildMI(MBB, InsPt, DL, TII->get(getSaveExecOp(SaveExecInst->getOpcode())), CopyFromExec) - .addReg(OtherOp->getReg()); + .addReg(OtherOp->getReg()); SaveExecInst->eraseFromParent(); CopyToExecInst->eraseFromParent(); for (MachineInstr *OtherInst : OtherUseInsts) { - OtherInst->substituteRegister(CopyToExec, Exec, - AMDGPU::NoSubRegister, *TRI); + OtherInst->substituteRegister(CopyToExec, Exec, AMDGPU::NoSubRegister, + *TRI); } Changed = true; } - // After all s_op_saveexec instructions are inserted, - // replace (on GFX10.3 and later) - // v_cmp_* SGPR, IMM, VGPR - // s_and_saveexec_b32 EXEC_SGPR_DEST, SGPR - // with - // s_mov_b32 EXEC_SGPR_DEST, exec_lo - // v_cmpx_* IMM, VGPR - // to reduce pipeline stalls. - if (ST.hasGFX10_3Insts()) { - DenseMap<MachineInstr *, MachineInstr *> SaveExecVCmpMapping; - const unsigned AndSaveExecOpcode = - ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64; - - for (MachineBasicBlock &MBB : MF) { - for (MachineInstr &MI : MBB) { - // Record relevant v_cmp / s_and_saveexec instruction pairs for - // replacement. - if (MI.getOpcode() != AndSaveExecOpcode) - continue; + return Changed; +} - if (MachineInstr *VCmp = - findPossibleVCMPVCMPXOptimization(MI, Exec, TRI, TII, *MRI)) - SaveExecVCmpMapping[&MI] = VCmp; - } +// Tries to find a possibility to optimize a v_cmp ..., s_and_saveexec sequence +// by looking at an instance of an s_and_saveexec instruction. Returns a pointer +// to the v_cmp instruction if it is safe to replace the sequence (see the +// conditions in the function body). This is after register allocation, so some +// checks on operand dependencies need to be considered. +MachineInstr *SIOptimizeExecMasking::findPossibleVCMPVCMPXOptimization( + MachineInstr &SaveExec, MCRegister Exec) const { + + MachineInstr *VCmp = nullptr; + + Register SaveExecDest = SaveExec.getOperand(0).getReg(); + if (!TRI->isSGPRReg(*MRI, SaveExecDest)) + return nullptr; + + MachineOperand *SaveExecSrc0 = + TII->getNamedOperand(SaveExec, AMDGPU::OpName::src0); + if (!SaveExecSrc0->isReg()) + return nullptr; + + // Try to find the last v_cmp instruction that defs the saveexec input + // operand without any write to Exec or the saveexec input operand in between. 
+ VCmp = findInstrBackwards( + SaveExec, + [&](MachineInstr *Check) { + return AMDGPU::getVCMPXOpFromVCMP(Check->getOpcode()) != -1 && + Check->modifiesRegister(SaveExecSrc0->getReg(), TRI); + }, + {Exec, SaveExecSrc0->getReg()}); + + if (!VCmp) + return nullptr; + + MachineOperand *VCmpDest = TII->getNamedOperand(*VCmp, AMDGPU::OpName::sdst); + assert(VCmpDest && "Should have an sdst operand!"); + + // Check if any of the v_cmp source operands is written by the saveexec. + MachineOperand *Src0 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src0); + if (Src0->isReg() && TRI->isSGPRReg(*MRI, Src0->getReg()) && + SaveExec.modifiesRegister(Src0->getReg(), TRI)) + return nullptr; + + MachineOperand *Src1 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src1); + if (Src1->isReg() && TRI->isSGPRReg(*MRI, Src1->getReg()) && + SaveExec.modifiesRegister(Src1->getReg(), TRI)) + return nullptr; + + // Don't do the transformation if the destination operand is included in + // its MBB Live-outs, meaning it's used in any of its successors, leading + // to incorrect code if the v_cmp and therefore the def of + // the dest operand is removed. + if (isLiveOut(*VCmp->getParent(), VCmpDest->getReg())) + return nullptr; + + // If the v_cmp target is in use between v_cmp and s_and_saveexec or after the + // s_and_saveexec, skip the optimization. + if (isRegisterInUseBetween(*VCmp, SaveExec, VCmpDest->getReg(), false, + true) || + isRegisterInUseAfter(SaveExec, VCmpDest->getReg())) + return nullptr; + + // Try to determine if there is a write to any of the VCmp + // operands between the saveexec and the vcmp. + // If yes, additional VGPR spilling might need to be inserted. In this case, + // it's not worth replacing the instruction sequence. + SmallVector<MCRegister, 2> NonDefRegs; + if (Src0->isReg()) + NonDefRegs.push_back(Src0->getReg()); + + if (Src1->isReg()) + NonDefRegs.push_back(Src1->getReg()); + + if (!findInstrBackwards( + SaveExec, [&](MachineInstr *Check) { return Check == VCmp; }, + NonDefRegs)) + return nullptr; + + return VCmp; +} + +// Inserts the optimized s_mov_b32 / v_cmpx sequence based on the +// operands extracted from a v_cmp ..., s_and_saveexec pattern. +bool SIOptimizeExecMasking::optimizeSingleVCMPSaveExecSequence( + MachineInstr &SaveExecInstr, MachineInstr &VCmp, MCRegister Exec) const { + const int NewOpcode = AMDGPU::getVCMPXOpFromVCMP(VCmp.getOpcode()); + + if (NewOpcode == -1) + return false; + + MachineOperand *Src0 = TII->getNamedOperand(VCmp, AMDGPU::OpName::src0); + MachineOperand *Src1 = TII->getNamedOperand(VCmp, AMDGPU::OpName::src1); + + Register MoveDest = SaveExecInstr.getOperand(0).getReg(); + + MachineBasicBlock::instr_iterator InsertPosIt = SaveExecInstr.getIterator(); + if (!SaveExecInstr.uses().empty()) { + bool IsSGPR32 = TRI->getRegSizeInBits(MoveDest, *MRI) == 32; + unsigned MovOpcode = IsSGPR32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; + BuildMI(*SaveExecInstr.getParent(), InsertPosIt, + SaveExecInstr.getDebugLoc(), TII->get(MovOpcode), MoveDest) + .addReg(Exec); + } + + // Omit dst as V_CMPX is implicitly writing to EXEC. + // Add dummy src and clamp modifiers, if needed. 
+ auto Builder = BuildMI(*VCmp.getParent(), std::next(InsertPosIt), + VCmp.getDebugLoc(), TII->get(NewOpcode)); + + auto TryAddImmediateValueFromNamedOperand = + [&](unsigned OperandName) -> void { + if (auto *Mod = TII->getNamedOperand(VCmp, OperandName)) + Builder.addImm(Mod->getImm()); + }; + + TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::src0_modifiers); + Builder.add(*Src0); + + TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::src1_modifiers); + Builder.add(*Src1); + + TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::clamp); + + // The kill flags may no longer be correct. + if (Src0->isReg()) + MRI->clearKillFlags(Src0->getReg()); + if (Src1->isReg()) + MRI->clearKillFlags(Src1->getReg()); + + return true; +} + +// After all s_op_saveexec instructions are inserted, +// replace (on GFX10.3 and later) +// v_cmp_* SGPR, IMM, VGPR +// s_and_saveexec_b32 EXEC_SGPR_DEST, SGPR +// with +// s_mov_b32 EXEC_SGPR_DEST, exec_lo +// v_cmpx_* IMM, VGPR +// to reduce pipeline stalls. +bool SIOptimizeExecMasking::optimizeVCmpxAndSaveexecSequence() const { + if (!ST->hasGFX10_3Insts()) + return false; + + bool Changed = false; + + DenseMap<MachineInstr *, MachineInstr *> SaveExecVCmpMapping; + MCRegister Exec = TRI->getExec(); + const unsigned AndSaveExecOpcode = + ST->isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64; + + for (MachineBasicBlock &MBB : *MF) { + for (MachineInstr &MI : MBB) { + // Record relevant v_cmp / s_and_saveexec instruction pairs for + // replacement. + if (MI.getOpcode() != AndSaveExecOpcode) + continue; + + if (MachineInstr *VCmp = findPossibleVCMPVCMPXOptimization(MI, Exec)) + SaveExecVCmpMapping[&MI] = VCmp; } + } - for (const auto &Entry : SaveExecVCmpMapping) { - MachineInstr *SaveExecInstr = Entry.getFirst(); - MachineInstr *VCmpInstr = Entry.getSecond(); + for (const auto &Entry : SaveExecVCmpMapping) { + MachineInstr *SaveExecInstr = Entry.getFirst(); + MachineInstr *VCmpInstr = Entry.getSecond(); - if (optimizeVCMPSaveExecSequence(*SaveExecInstr, *VCmpInstr, Exec, TII, - TRI, *MRI)) { - SaveExecInstr->eraseFromParent(); - VCmpInstr->eraseFromParent(); + if (optimizeSingleVCMPSaveExecSequence(*SaveExecInstr, *VCmpInstr, Exec)) { + SaveExecInstr->eraseFromParent(); + VCmpInstr->eraseFromParent(); - Changed = true; - } + Changed = true; } } return Changed; } + +bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) { + if (skipFunction(MF.getFunction())) + return false; + + this->MF = &MF; + ST = &MF.getSubtarget<GCNSubtarget>(); + TRI = ST->getRegisterInfo(); + TII = ST->getInstrInfo(); + MRI = &MF.getRegInfo(); + + bool Changed = optimizeExecSequence(); + Changed |= optimizeVCmpxAndSaveexecSequence(); + + return Changed; +} diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp index e5e65a8dbbf1..57dbad468de8 100644 --- a/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp +++ b/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp @@ -159,6 +159,9 @@ bool SIOptimizeExecMaskingPreRA::optimizeVcndVcmpPair(MachineBasicBlock &MBB) { return false; Register SelReg = Op1->getReg(); + if (SelReg.isPhysical()) + return false; + auto *Sel = TRI->findReachingDef(SelReg, Op1->getSubReg(), *Cmp, *MRI, LIS); if (!Sel || Sel->getOpcode() != AMDGPU::V_CNDMASK_B32_e64) return false; @@ -264,13 +267,11 @@ bool SIOptimizeExecMaskingPreRA::optimizeVcndVcmpPair(MachineBasicBlock &MBB) { // Try to remove v_cndmask_b32. 
if (SelLI) { - bool CanRemoveSel = SelLI->Query(CmpIdx.getRegSlot()).isKill(); - if (!CanRemoveSel) { - // Try to shrink the live interval and check for dead def instead. - LIS->shrinkToUses(SelLI, nullptr); - CanRemoveSel = SelLI->Query(SelIdx.getRegSlot()).isDeadDef(); - } - if (CanRemoveSel) { + // Kill status must be checked before shrinking the live range. + bool IsKill = SelLI->Query(CmpIdx.getRegSlot()).isKill(); + LIS->shrinkToUses(SelLI); + bool IsDead = SelLI->Query(SelIdx.getRegSlot()).isDeadDef(); + if (MRI->use_nodbg_empty(SelReg) && (IsKill || IsDead)) { LLVM_DEBUG(dbgs() << "Erasing: " << *Sel << '\n'); LIS->removeVRegDefAt(*SelLI, SelIdx.getRegSlot()); diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index ad1455ed20fd..b32d5bb04d5b 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -2933,6 +2933,10 @@ MCRegister SIRegisterInfo::getVCC() const { return isWave32 ? AMDGPU::VCC_LO : AMDGPU::VCC; } +MCRegister SIRegisterInfo::getExec() const { + return isWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC; +} + const TargetRegisterClass *SIRegisterInfo::getVGPR64Class() const { // VGPR tuples have an alignment requirement on gfx90a variants. return ST.needsAlignedVGPRs() ? &AMDGPU::VReg_64_Align2RegClass diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h index 9bfbc253410b..6024158be181 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h @@ -344,6 +344,8 @@ public: MCRegister getVCC() const; + MCRegister getExec() const; + const TargetRegisterClass *getRegClass(unsigned RCID) const; // Find reaching register definition diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index e4ab72f1095b..2f334e211181 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -277,6 +277,18 @@ struct VOPC64DPPInfo { uint16_t Opcode; }; +struct VOPDComponentInfo { + uint16_t BaseVOP; + uint16_t VOPDOp; + bool CanBeVOPDX; +}; + +struct VOPDInfo { + uint16_t Opcode; + uint16_t OpX; + uint16_t OpY; +}; + #define GET_MTBUFInfoTable_DECL #define GET_MTBUFInfoTable_IMPL #define GET_MUBUFInfoTable_DECL @@ -293,6 +305,10 @@ struct VOPC64DPPInfo { #define GET_VOPC64DPPTable_IMPL #define GET_VOPC64DPP8Table_DECL #define GET_VOPC64DPP8Table_IMPL +#define GET_VOPDComponentTable_DECL +#define GET_VOPDComponentTable_IMPL +#define GET_VOPDPairs_DECL +#define GET_VOPDPairs_IMPL #define GET_WMMAOpcode2AddrMappingTable_DECL #define GET_WMMAOpcode2AddrMappingTable_IMPL #define GET_WMMAOpcode3AddrMappingTable_DECL @@ -398,6 +414,19 @@ bool getMAIIsGFX940XDL(unsigned Opc) { return Info ? Info->is_gfx940_xdl : false; } +CanBeVOPD getCanBeVOPD(unsigned Opc) { + const VOPDComponentInfo *Info = getVOPDComponentHelper(Opc); + if (Info) + return {Info->CanBeVOPDX, 1}; + else + return {0, 0}; +} + +unsigned getVOPDOpcode(unsigned Opc) { + const VOPDComponentInfo *Info = getVOPDComponentHelper(Opc); + return Info ? Info->VOPDOp : ~0u; +} + unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc) { const WMMAOpcodeMappingInfo *Info = getWMMAMappingInfoFrom2AddrOpcode(Opc); return Info ? 
Info->Opcode3Addr : ~0u; @@ -415,6 +444,11 @@ int getMCOpcode(uint16_t Opcode, unsigned Gen) { return getMCOpcodeGen(Opcode, static_cast<Subtarget>(Gen)); } +int getVOPDFull(unsigned OpX, unsigned OpY) { + const VOPDInfo *Info = getVOPDInfoFromComponentOpcodes(OpX, OpY); + return Info ? Info->Opcode : -1; +} + namespace IsaInfo { AMDGPUTargetID::AMDGPUTargetID(const MCSubtargetInfo &STI) diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index dffeec10a14a..51cf1678207c 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -470,6 +470,14 @@ bool getMAIIsDGEMM(unsigned Opc); LLVM_READONLY bool getMAIIsGFX940XDL(unsigned Opc); +struct CanBeVOPD { + bool X; + bool Y; +}; + +LLVM_READONLY +CanBeVOPD getCanBeVOPD(unsigned Opc); + LLVM_READONLY const GcnBufferFormatInfo *getGcnBufferFormatInfo(uint8_t BitsPerComp, uint8_t NumComponents, @@ -483,6 +491,12 @@ LLVM_READONLY int getMCOpcode(uint16_t Opcode, unsigned Gen); LLVM_READONLY +unsigned getVOPDOpcode(unsigned Opc); + +LLVM_READONLY +int getVOPDFull(unsigned OpX, unsigned OpY); + +LLVM_READONLY unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc); LLVM_READONLY diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td index 1485a1e63129..b24857edb59a 100644 --- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -495,9 +495,9 @@ def VOP2b_I32_I1_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1], 0, /*EnableClamp=* bank_mask:$bank_mask, bound_ctrl:$bound_ctrl); let InsDPP16 = !con(InsDPP, (ins FI:$fi)); let InsDPP8 = (ins DstRCDPP:$old, - Src0DPP:$src0, - Src1DPP:$src1, - dpp8:$dpp8, FI:$fi); + Src0DPP:$src0, + Src1DPP:$src1, + dpp8:$dpp8, FI:$fi); let HasExt = 1; let HasExtDPP = 1; diff --git a/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/llvm/lib/Target/AMDGPU/VOPCInstructions.td index eb6c54a45263..33d3441e94c2 100644 --- a/llvm/lib/Target/AMDGPU/VOPCInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPCInstructions.td @@ -1108,7 +1108,6 @@ class VOPC64_DPP_Base<bits<10> op, string OpName, VOPProfile P> // Inst{87-84} ignored by hw let Inst{91-88} = bank_mask; let Inst{95-92} = row_mask; - } class VOPC64_DPP16<bits<10> op, VOP_DPP_Pseudo ps, string opName = ps.OpName> @@ -1148,7 +1147,6 @@ class VOPC64_DPP8_Base<bits<10> op, string OpName, VOPProfile P> let Inst{40-32} = fi; let Inst{71-64} = !if(P.HasSrc0, src0{7-0}, 0); let Inst{95-72} = dpp8{23-0}; - } class VOPC64_DPP8<bits<10> op, VOP_Pseudo ps, string opName = ps.OpName> diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td index 8cd3d2fe2c47..187485ffa3ae 100644 --- a/llvm/lib/Target/AMDGPU/VOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -1215,7 +1215,9 @@ class VOP3_Profile_Base<VOPProfile P, VOP3Features Features = VOP3_REGULAR> : VO let IsMAI = !if(Features.IsMAI, 1, P.IsMAI); let IsPacked = !if(Features.IsPacked, 1, P.IsPacked); - let HasModifiers = !if(Features.IsMAI, 0, !or(Features.IsPacked, P.HasModifiers)); + let HasModifiers = + !if (Features.IsMAI, 0, + !or(Features.IsPacked, Features.HasOpSel, P.HasModifiers)); } class VOP3_Profile<VOPProfile P, VOP3Features Features = VOP3_REGULAR> : VOP3_Profile_Base<P, Features> { @@ -1414,7 +1416,7 @@ multiclass VOP3_Realtriple_with_name_gfx11<bits<10> op, string opName, VOP3_Real_dpp8_with_name_gfx11<op, opName, asmName>; multiclass VOP3Only_Realtriple_with_name_gfx11<bits<10> op, 
string opName, - string asmName> : + string asmName> : VOP3_Realtriple_with_name_gfx11<op, opName, asmName, 1>; multiclass VOP3be_Realtriple_gfx11< diff --git a/llvm/lib/Target/ARC/ARCAsmPrinter.cpp b/llvm/lib/Target/ARC/ARCAsmPrinter.cpp index 0390c01eecb1..cee2fc7d2bf0 100644 --- a/llvm/lib/Target/ARC/ARCAsmPrinter.cpp +++ b/llvm/lib/Target/ARC/ARCAsmPrinter.cpp @@ -49,6 +49,9 @@ public: } // end anonymous namespace void ARCAsmPrinter::emitInstruction(const MachineInstr *MI) { + ARC_MC::verifyInstructionPredicates(MI->getOpcode(), + getSubtargetInfo().getFeatureBits()); + SmallString<128> Str; raw_svector_ostream O(Str); diff --git a/llvm/lib/Target/ARC/MCTargetDesc/ARCMCTargetDesc.cpp b/llvm/lib/Target/ARC/MCTargetDesc/ARCMCTargetDesc.cpp index d4f74fa77fc4..36b00af2c0b4 100644 --- a/llvm/lib/Target/ARC/MCTargetDesc/ARCMCTargetDesc.cpp +++ b/llvm/lib/Target/ARC/MCTargetDesc/ARCMCTargetDesc.cpp @@ -26,6 +26,7 @@ using namespace llvm; #define GET_INSTRINFO_MC_DESC +#define ENABLE_INSTR_PREDICATE_VERIFIER #include "ARCGenInstrInfo.inc" #define GET_SUBTARGETINFO_MC_DESC diff --git a/llvm/lib/Target/ARC/MCTargetDesc/ARCMCTargetDesc.h b/llvm/lib/Target/ARC/MCTargetDesc/ARCMCTargetDesc.h index ab06ce46d99f..5f83b48b36af 100644 --- a/llvm/lib/Target/ARC/MCTargetDesc/ARCMCTargetDesc.h +++ b/llvm/lib/Target/ARC/MCTargetDesc/ARCMCTargetDesc.h @@ -28,6 +28,7 @@ class Target; // Defines symbolic names for the ARC instructions. #define GET_INSTRINFO_ENUM +#define GET_INSTRINFO_MC_HELPER_DECLS #include "ARCGenInstrInfo.inc" #define GET_SUBTARGETINFO_ENUM diff --git a/llvm/lib/Target/ARM/ARM.td b/llvm/lib/Target/ARM/ARM.td index 48559a89a30a..73970b9c74c5 100644 --- a/llvm/lib/Target/ARM/ARM.td +++ b/llvm/lib/Target/ARM/ARM.td @@ -378,13 +378,13 @@ def FeaturePref32BitThumb : SubtargetFeature<"32bit", "Prefers32BitThumb", "true def FeaturePrefLoopAlign32 : SubtargetFeature<"loop-align", "PrefLoopLogAlignment","2", "Prefer 32-bit alignment for loops">; -def FeatureMVEVectorCostFactor1 : SubtargetFeature<"mve1beat", "MVEVectorCostFactor", "1", +def FeatureMVEVectorCostFactor1 : SubtargetFeature<"mve1beat", "MVEVectorCostFactor", "4", "Model MVE instructions as a 1 beat per tick architecture">; def FeatureMVEVectorCostFactor2 : SubtargetFeature<"mve2beat", "MVEVectorCostFactor", "2", "Model MVE instructions as a 2 beats per tick architecture">; -def FeatureMVEVectorCostFactor4 : SubtargetFeature<"mve4beat", "MVEVectorCostFactor", "4", +def FeatureMVEVectorCostFactor4 : SubtargetFeature<"mve4beat", "MVEVectorCostFactor", "1", "Model MVE instructions as a 4 beats per tick architecture">; /// Some instructions update CPSR partially, which can add false dependency for @@ -1450,6 +1450,13 @@ def : ProcessorModel<"cortex-m55", CortexM4Model, [ARMv81mMainline, HasMVEFloatOps, FeatureFixCMSE_CVE_2021_35465]>; +def : ProcessorModel<"cortex-m85", CortexM7Model, [ARMv81mMainline, + FeatureDSP, + FeatureFPARMv8_D16, + FeaturePACBTI, + FeatureUseMISched, + HasMVEFloatOps]>; + def : ProcNoItin<"cortex-a32", [ARMv8a, FeatureHWDivThumb, FeatureHWDivARM, diff --git a/llvm/lib/Target/ARM/ARMAsmPrinter.cpp b/llvm/lib/Target/ARM/ARMAsmPrinter.cpp index 4aa28bc5d28d..57cbd7a3b2b8 100644 --- a/llvm/lib/Target/ARM/ARMAsmPrinter.cpp +++ b/llvm/lib/Target/ARM/ARMAsmPrinter.cpp @@ -1337,6 +1337,10 @@ void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) { #include "ARMGenMCPseudoLowering.inc" void ARMAsmPrinter::emitInstruction(const MachineInstr *MI) { + // FIXME: Enable feature predicate checks once all the 
tests pass. + // ARM_MC::verifyInstructionPredicates(MI->getOpcode(), + // getSubtargetInfo().getFeatureBits()); + const DataLayout &DL = getDataLayout(); MCTargetStreamer &TS = *OutStreamer->getTargetStreamer(); ARMTargetStreamer &ATS = static_cast<ARMTargetStreamer &>(TS); diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 85e32c08c74c..e6be93e6480a 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -450,6 +450,14 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) { setOperationAction(ISD::TRUNCATE, VT, Custom); setOperationAction(ISD::VSELECT, VT, Expand); setOperationAction(ISD::SELECT, VT, Expand); + setOperationAction(ISD::SELECT_CC, VT, Expand); + + if (!HasMVEFP) { + setOperationAction(ISD::SINT_TO_FP, VT, Expand); + setOperationAction(ISD::UINT_TO_FP, VT, Expand); + setOperationAction(ISD::FP_TO_SINT, VT, Expand); + setOperationAction(ISD::FP_TO_UINT, VT, Expand); + } } setOperationAction(ISD::SETCC, MVT::v2i1, Expand); setOperationAction(ISD::TRUNCATE, MVT::v2i1, Expand); @@ -13350,14 +13358,14 @@ static SDValue TryDistrubutionADDVecReduce(SDNode *N, SelectionDAG &DAG) { // to make better use of vaddva style instructions. if (VT == MVT::i32 && N1.getOpcode() == ISD::ADD && !IsVecReduce(N0) && IsVecReduce(N1.getOperand(0)) && IsVecReduce(N1.getOperand(1)) && - !isa<ConstantSDNode>(N0)) { + !isa<ConstantSDNode>(N0) && N1->hasOneUse()) { SDValue Add0 = DAG.getNode(ISD::ADD, dl, VT, N0, N1.getOperand(0)); return DAG.getNode(ISD::ADD, dl, VT, Add0, N1.getOperand(1)); } // And turn add(add(A, reduce(B)), add(C, reduce(D))) -> // add(add(add(A, C), reduce(B)), reduce(D)) if (VT == MVT::i32 && N0.getOpcode() == ISD::ADD && - N1.getOpcode() == ISD::ADD) { + N1.getOpcode() == ISD::ADD && N0->hasOneUse() && N1->hasOneUse()) { unsigned N0RedOp = 0; if (!IsVecReduce(N0.getOperand(N0RedOp))) { N0RedOp = 1; @@ -13424,7 +13432,7 @@ static SDValue TryDistrubutionADDVecReduce(SDNode *N, SelectionDAG &DAG) { }; SDValue X; - if (N0.getOpcode() == ISD::ADD) { + if (N0.getOpcode() == ISD::ADD && N0->hasOneUse()) { if (IsVecReduce(N0.getOperand(0)) && IsVecReduce(N0.getOperand(1))) { int IsBefore = IsKnownOrderedLoad(N0.getOperand(0).getOperand(0), N0.getOperand(1).getOperand(0)); diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp index 3a9946ee810b..ba1d806c8d81 100644 --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -2247,15 +2247,15 @@ bool ARMTTIImpl::preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, return canTailPredicateLoop(L, LI, SE, DL, LAI); } -bool ARMTTIImpl::emitGetActiveLaneMask() const { +PredicationStyle ARMTTIImpl::emitGetActiveLaneMask() const { if (!ST->hasMVEIntegerOps() || !EnableTailPredication) - return false; + return PredicationStyle::None; // Intrinsic @llvm.get.active.lane.mask is supported. // It is used in the MVETailPredication pass, which requires the number of // elements processed by this vector loop to set up the tail-predicated // loop. 
- return true; + return PredicationStyle::Data; } void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h index d7a2bdb3db15..dcf82e703a7f 100644 --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h @@ -298,7 +298,7 @@ public: TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE); - bool emitGetActiveLaneMask() const; + PredicationStyle emitGetActiveLaneMask() const; void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP); diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp index 3f1379f135d1..9f85d72cc810 100644 --- a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp +++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp @@ -133,6 +133,7 @@ static bool getARMLoadDeprecationInfo(MCInst &MI, const MCSubtargetInfo &STI, } #define GET_INSTRINFO_MC_DESC +#define ENABLE_INSTR_PREDICATE_VERIFIER #include "ARMGenInstrInfo.inc" #define GET_SUBTARGETINFO_MC_DESC diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h index e0c992f4fae2..3066d9ba6783 100644 --- a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h +++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h @@ -139,6 +139,7 @@ bool isCDECoproc(size_t Coproc, const MCSubtargetInfo &STI); // Defines symbolic names for the ARM instructions. // #define GET_INSTRINFO_ENUM +#define GET_INSTRINFO_MC_HELPER_DECLS #include "ARMGenInstrInfo.inc" #define GET_SUBTARGETINFO_ENUM diff --git a/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp b/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp index 30785340ef12..296801094fbe 100644 --- a/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp +++ b/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp @@ -351,13 +351,13 @@ Optional<int64_t> MVEGatherScatterLowering::getIfConst(const Value *V) { if (!Op0 || !Op1) return Optional<int64_t>{}; if (I->getOpcode() == Instruction::Add) - return Optional<int64_t>{Op0.getValue() + Op1.getValue()}; + return Optional<int64_t>{Op0.value() + Op1.value()}; if (I->getOpcode() == Instruction::Mul) - return Optional<int64_t>{Op0.getValue() * Op1.getValue()}; + return Optional<int64_t>{Op0.value() * Op1.value()}; if (I->getOpcode() == Instruction::Shl) - return Optional<int64_t>{Op0.getValue() << Op1.getValue()}; + return Optional<int64_t>{Op0.value() << Op1.value()}; if (I->getOpcode() == Instruction::Or) - return Optional<int64_t>{Op0.getValue() | Op1.getValue()}; + return Optional<int64_t>{Op0.value() | Op1.value()}; } return Optional<int64_t>{}; } diff --git a/llvm/lib/Target/AVR/AVRAsmPrinter.cpp b/llvm/lib/Target/AVR/AVRAsmPrinter.cpp index 0001e520b1fb..70fc90bf9eb5 100644 --- a/llvm/lib/Target/AVR/AVRAsmPrinter.cpp +++ b/llvm/lib/Target/AVR/AVRAsmPrinter.cpp @@ -180,6 +180,10 @@ bool AVRAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, } void AVRAsmPrinter::emitInstruction(const MachineInstr *MI) { + // FIXME: Enable feature predicate checks once all the tests pass. 
+ // AVR_MC::verifyInstructionPredicates(MI->getOpcode(), + // getSubtargetInfo().getFeatureBits()); + AVRMCInstLower MCInstLowering(OutContext, *this); MCInst I; diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.cpp b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.cpp index cdfe4a21105d..ba370261e284 100644 --- a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.cpp +++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.cpp @@ -27,6 +27,7 @@ #include "llvm/MC/TargetRegistry.h" #define GET_INSTRINFO_MC_DESC +#define ENABLE_INSTR_PREDICATE_VERIFIER #include "AVRGenInstrInfo.inc" #define GET_SUBTARGETINFO_MC_DESC diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.h b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.h index aaf236d82016..e83d674f87cc 100644 --- a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.h +++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.h @@ -49,6 +49,7 @@ std::unique_ptr<MCObjectTargetWriter> createAVRELFObjectWriter(uint8_t OSABI); #include "AVRGenRegisterInfo.inc" #define GET_INSTRINFO_ENUM +#define GET_INSTRINFO_MC_HELPER_DECLS #include "AVRGenInstrInfo.inc" #define GET_SUBTARGETINFO_ENUM diff --git a/llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp b/llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp index 349cdd92ae62..9aad9375d913 100644 --- a/llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp +++ b/llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp @@ -149,6 +149,13 @@ private: // The base call is not an input of any other preserve_* // intrinsics. std::map<CallInst *, CallInfo> BaseAICalls; + // A map to hold <AnonRecord, TypeDef> relationships. + std::map<DICompositeType *, DIDerivedType *> AnonRecords; + + void CheckAnonRecordType(DIDerivedType *ParentTy, DIType *Ty); + void CheckCompositeType(DIDerivedType *ParentTy, DICompositeType *CTy); + void CheckDerivedType(DIDerivedType *ParentTy, DIDerivedType *DTy); + void ResetMetadata(struct CallInfo &CInfo); bool doTransformation(Function &F); @@ -221,10 +228,80 @@ bool BPFAbstractMemberAccess::run(Function &F) { if (M->debug_compile_units().empty()) return false; + // For each argument/return/local_variable type, trace the type + // pattern like '[derived_type]* [composite_type]' to check + // and remember (anon record -> typedef) relations where the + // anon record is defined as + // typedef [const/volatile/restrict]* [anon record] + DISubprogram *SP = F.getSubprogram(); + if (SP && SP->isDefinition()) { + for (DIType *Ty: SP->getType()->getTypeArray()) + CheckAnonRecordType(nullptr, Ty); + for (const DINode *DN : SP->getRetainedNodes()) { + if (const auto *DV = dyn_cast<DILocalVariable>(DN)) + CheckAnonRecordType(nullptr, DV->getType()); + } + } + DL = &M->getDataLayout(); return doTransformation(F); } +void BPFAbstractMemberAccess::ResetMetadata(struct CallInfo &CInfo) { + if (auto Ty = dyn_cast<DICompositeType>(CInfo.Metadata)) { + if (AnonRecords.find(Ty) != AnonRecords.end()) { + if (AnonRecords[Ty] != nullptr) + CInfo.Metadata = AnonRecords[Ty]; + } + } +} + +void BPFAbstractMemberAccess::CheckCompositeType(DIDerivedType *ParentTy, + DICompositeType *CTy) { + if (!CTy->getName().empty() || !ParentTy || + ParentTy->getTag() != dwarf::DW_TAG_typedef) + return; + + if (AnonRecords.find(CTy) == AnonRecords.end()) { + AnonRecords[CTy] = ParentTy; + return; + } + + // Two or more typedefs may point to the same anon record. + // If this is the case, set the typedef DIType to be nullptr + // to indicate the duplication case. 
+ DIDerivedType *CurrTy = AnonRecords[CTy]; + if (CurrTy == ParentTy) + return; + AnonRecords[CTy] = nullptr; +} + +void BPFAbstractMemberAccess::CheckDerivedType(DIDerivedType *ParentTy, + DIDerivedType *DTy) { + DIType *BaseType = DTy->getBaseType(); + if (!BaseType) + return; + + unsigned Tag = DTy->getTag(); + if (Tag == dwarf::DW_TAG_pointer_type) + CheckAnonRecordType(nullptr, BaseType); + else if (Tag == dwarf::DW_TAG_typedef) + CheckAnonRecordType(DTy, BaseType); + else + CheckAnonRecordType(ParentTy, BaseType); +} + +void BPFAbstractMemberAccess::CheckAnonRecordType(DIDerivedType *ParentTy, + DIType *Ty) { + if (!Ty) + return; + + if (auto *CTy = dyn_cast<DICompositeType>(Ty)) + return CheckCompositeType(ParentTy, CTy); + else if (auto *DTy = dyn_cast<DIDerivedType>(Ty)) + return CheckDerivedType(ParentTy, DTy); +} + static bool SkipDIDerivedTag(unsigned Tag, bool skipTypedef) { if (Tag != dwarf::DW_TAG_typedef && Tag != dwarf::DW_TAG_const_type && Tag != dwarf::DW_TAG_volatile_type && @@ -298,6 +375,7 @@ bool BPFAbstractMemberAccess::IsPreserveDIAccessIndexCall(const CallInst *Call, CInfo.Metadata = Call->getMetadata(LLVMContext::MD_preserve_access_index); if (!CInfo.Metadata) report_fatal_error("Missing metadata for llvm.preserve.union.access.index intrinsic"); + ResetMetadata(CInfo); CInfo.AccessIndex = getConstant(Call->getArgOperand(1)); CInfo.Base = Call->getArgOperand(0); return true; @@ -307,6 +385,7 @@ bool BPFAbstractMemberAccess::IsPreserveDIAccessIndexCall(const CallInst *Call, CInfo.Metadata = Call->getMetadata(LLVMContext::MD_preserve_access_index); if (!CInfo.Metadata) report_fatal_error("Missing metadata for llvm.preserve.struct.access.index intrinsic"); + ResetMetadata(CInfo); CInfo.AccessIndex = getConstant(Call->getArgOperand(2)); CInfo.Base = Call->getArgOperand(0); CInfo.RecordAlignment = DL->getABITypeAlign(getBaseElementType(Call)); diff --git a/llvm/lib/Target/BPF/BPFAsmPrinter.cpp b/llvm/lib/Target/BPF/BPFAsmPrinter.cpp index d6145f53c170..c8849bd50464 100644 --- a/llvm/lib/Target/BPF/BPFAsmPrinter.cpp +++ b/llvm/lib/Target/BPF/BPFAsmPrinter.cpp @@ -138,6 +138,9 @@ bool BPFAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, } void BPFAsmPrinter::emitInstruction(const MachineInstr *MI) { + BPF_MC::verifyInstructionPredicates(MI->getOpcode(), + getSubtargetInfo().getFeatureBits()); + MCInst TmpInst; if (!BTF || !BTF->InstLower(MI, TmpInst)) { diff --git a/llvm/lib/Target/BPF/BTF.h b/llvm/lib/Target/BPF/BTF.h index 4540054aaf34..89852be4a8c8 100644 --- a/llvm/lib/Target/BPF/BTF.h +++ b/llvm/lib/Target/BPF/BTF.h @@ -48,6 +48,8 @@ #ifndef LLVM_LIB_TARGET_BPF_BTF_H #define LLVM_LIB_TARGET_BPF_BTF_H +#include <cstdint> + namespace llvm { namespace BTF { diff --git a/llvm/lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp b/llvm/lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp index a98d001097bc..cb321906db03 100644 --- a/llvm/lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp +++ b/llvm/lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp @@ -31,14 +31,13 @@ using namespace llvm; namespace { class BPFMCCodeEmitter : public MCCodeEmitter { - const MCInstrInfo &MCII; const MCRegisterInfo &MRI; bool IsLittleEndian; public: - BPFMCCodeEmitter(const MCInstrInfo &mcii, const MCRegisterInfo &mri, + BPFMCCodeEmitter(const MCInstrInfo &, const MCRegisterInfo &mri, bool IsLittleEndian) - : MCII(mcii), MRI(mri), IsLittleEndian(IsLittleEndian) {} + : MRI(mri), IsLittleEndian(IsLittleEndian) { } BPFMCCodeEmitter(const BPFMCCodeEmitter &) = delete; void operator=(const 
BPFMCCodeEmitter &) = delete; ~BPFMCCodeEmitter() override = default; @@ -62,12 +61,6 @@ public: void encodeInstruction(const MCInst &MI, raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const override; - -private: - FeatureBitset computeAvailableFeatures(const FeatureBitset &FB) const; - void - verifyInstructionPredicates(const MCInst &MI, - const FeatureBitset &AvailableFeatures) const; }; } // end anonymous namespace @@ -117,9 +110,6 @@ static uint8_t SwapBits(uint8_t Val) void BPFMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const { - verifyInstructionPredicates(MI, - computeAvailableFeatures(STI.getFeatureBits())); - unsigned Opcode = MI.getOpcode(); support::endian::Writer OSE(OS, IsLittleEndian ? support::little : support::big); @@ -174,5 +164,4 @@ uint64_t BPFMCCodeEmitter::getMemoryOpValue(const MCInst &MI, unsigned Op, return Encoding; } -#define ENABLE_INSTR_PREDICATE_VERIFIER #include "BPFGenMCCodeEmitter.inc" diff --git a/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp b/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp index 5a1e251cd29c..77db5f99225e 100644 --- a/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp +++ b/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp @@ -22,6 +22,7 @@ #include "llvm/Support/Host.h" #define GET_INSTRINFO_MC_DESC +#define ENABLE_INSTR_PREDICATE_VERIFIER #include "BPFGenInstrInfo.inc" #define GET_SUBTARGETINFO_MC_DESC diff --git a/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.h b/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.h index fc190504581c..ea30e714a5b7 100644 --- a/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.h +++ b/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.h @@ -54,6 +54,7 @@ std::unique_ptr<MCObjectTargetWriter> createBPFELFObjectWriter(uint8_t OSABI); // Defines symbolic names for the BPF instructions. // #define GET_INSTRINFO_ENUM +#define GET_INSTRINFO_MC_HELPER_DECLS #include "BPFGenInstrInfo.inc" #define GET_SUBTARGETINFO_ENUM diff --git a/llvm/lib/Target/CSKY/CSKYAsmPrinter.cpp b/llvm/lib/Target/CSKY/CSKYAsmPrinter.cpp index 0236b22ad379..ea5b4555757e 100644 --- a/llvm/lib/Target/CSKY/CSKYAsmPrinter.cpp +++ b/llvm/lib/Target/CSKY/CSKYAsmPrinter.cpp @@ -141,6 +141,9 @@ void CSKYAsmPrinter::emitEndOfAsmFile(Module &M) { } void CSKYAsmPrinter::emitInstruction(const MachineInstr *MI) { + CSKY_MC::verifyInstructionPredicates(MI->getOpcode(), + getSubtargetInfo().getFeatureBits()); + // Do any auto-generated pseudo lowerings. if (emitPseudoExpansionLowering(*OutStreamer, MI)) return; diff --git a/llvm/lib/Target/CSKY/CSKYInstrInfo.td b/llvm/lib/Target/CSKY/CSKYInstrInfo.td index 300ecceae906..8d3835b22bb0 100644 --- a/llvm/lib/Target/CSKY/CSKYInstrInfo.td +++ b/llvm/lib/Target/CSKY/CSKYInstrInfo.td @@ -153,7 +153,7 @@ def CSKYSymbol : AsmOperandClass { let ParserMethod = "parseCSKYSymbol"; } -def br_symbol : Operand<iPTR> { +def br_symbol : Operand<OtherVT> { let EncoderMethod = "getBranchSymbolOpValue<CSKY::fixup_csky_pcrel_imm16_scale2>"; let ParserMatchClass = CSKYSymbol; diff --git a/llvm/lib/Target/CSKY/CSKYInstrInfo16Instr.td b/llvm/lib/Target/CSKY/CSKYInstrInfo16Instr.td index 3be1ca8b7998..2d7fb85e89fa 100644 --- a/llvm/lib/Target/CSKY/CSKYInstrInfo16Instr.td +++ b/llvm/lib/Target/CSKY/CSKYInstrInfo16Instr.td @@ -24,7 +24,7 @@ def CSKY_NIR : SDNode<"CSKYISD::NIR", SDTNone, // Operand and SDNode transformation definitions. 
//===----------------------------------------------------------------------===// -def br_symbol_16bit : Operand<iPTR> { +def br_symbol_16bit : Operand<OtherVT> { let EncoderMethod = "getBranchSymbolOpValue<CSKY::fixup_csky_pcrel_imm10_scale2>"; let ParserMatchClass = CSKYSymbol; diff --git a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCTargetDesc.cpp b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCTargetDesc.cpp index 1a69dc8acde0..64f01cd1c9fa 100644 --- a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCTargetDesc.cpp +++ b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCTargetDesc.cpp @@ -26,6 +26,7 @@ #include "llvm/MC/TargetRegistry.h" #define GET_INSTRINFO_MC_DESC +#define ENABLE_INSTR_PREDICATE_VERIFIER #include "CSKYGenInstrInfo.inc" #define GET_REGINFO_MC_DESC diff --git a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCTargetDesc.h b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCTargetDesc.h index 4b8c45e95b74..1137b4d6e9b1 100644 --- a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCTargetDesc.h +++ b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCTargetDesc.h @@ -41,6 +41,7 @@ MCCodeEmitter *createCSKYMCCodeEmitter(const MCInstrInfo &MCII, MCContext &Ctx); #include "CSKYGenRegisterInfo.inc" #define GET_INSTRINFO_ENUM +#define GET_INSTRINFO_MC_HELPER_DECLS #include "CSKYGenInstrInfo.inc" #define GET_SUBTARGETINFO_ENUM diff --git a/llvm/lib/Target/DirectX/DXIL.td b/llvm/lib/Target/DirectX/DXIL.td index 4d6e1a9d3166..709279889653 100644 --- a/llvm/lib/Target/DirectX/DXIL.td +++ b/llvm/lib/Target/DirectX/DXIL.td @@ -116,7 +116,7 @@ def ThreadId :dxil_op< "ThreadId", 93, ThreadIdClass, ComputeID, "reads the thr dxil_param<1, "i32", "opcode", "DXIL opcode">, dxil_param<2, "i32", "component", "component to read (x,y,z)"> ]>, - dxil_map_intrinsic<int_dxil_thread_id>; + dxil_map_intrinsic<int_dx_thread_id>; def GroupId :dxil_op< "GroupId", 94, GroupIdClass, ComputeID, "reads the group ID (SV_GroupID)", "i32;", "rn", [ @@ -124,7 +124,7 @@ def GroupId :dxil_op< "GroupId", 94, GroupIdClass, ComputeID, "reads the group dxil_param<1, "i32", "opcode", "DXIL opcode">, dxil_param<2, "i32", "component", "component to read"> ]>, - dxil_map_intrinsic<int_dxil_group_id>; + dxil_map_intrinsic<int_dx_group_id>; def ThreadIdInGroup :dxil_op< "ThreadIdInGroup", 95, ThreadIdInGroupClass, ComputeID, "reads the thread ID within the group (SV_GroupThreadID)", "i32;", "rn", @@ -133,7 +133,7 @@ def ThreadIdInGroup :dxil_op< "ThreadIdInGroup", 95, ThreadIdInGroupClass, Comp dxil_param<1, "i32", "opcode", "DXIL opcode">, dxil_param<2, "i32", "component", "component to read (x,y,z)"> ]>, - dxil_map_intrinsic<int_dxil_thread_id_in_group>; + dxil_map_intrinsic<int_dx_thread_id_in_group>; def FlattenedThreadIdInGroup :dxil_op< "FlattenedThreadIdInGroup", 96, FlattenedThreadIdInGroupClass, ComputeID, "provides a flattened index for a given thread within a given group (SV_GroupIndex)", "i32;", "rn", @@ -141,4 +141,4 @@ def FlattenedThreadIdInGroup :dxil_op< "FlattenedThreadIdInGroup", 96, Flattene dxil_param<0, "i32", "", "result">, dxil_param<1, "i32", "opcode", "DXIL opcode"> ]>, - dxil_map_intrinsic<int_dxil_flattened_thread_id_in_group>; + dxil_map_intrinsic<int_dx_flattened_thread_id_in_group>; diff --git a/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp b/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp index 494a71e51a89..3e09270a66d0 100644 --- a/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp +++ b/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp @@ -595,6 +595,10 @@ unsigned 
DXILBitcodeWriter::getEncodedRMWOperation(AtomicRMWInst::BinOp Op) { return bitc::RMW_FADD; case AtomicRMWInst::FSub: return bitc::RMW_FSUB; + case AtomicRMWInst::FMax: + return bitc::RMW_FMAX; + case AtomicRMWInst::FMin: + return bitc::RMW_FMIN; } } diff --git a/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp b/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp index 48d339234e9e..1064296b0991 100644 --- a/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp +++ b/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp @@ -743,6 +743,9 @@ void HexagonAsmPrinter::HexagonProcessInstruction(MCInst &Inst, /// Print out a single Hexagon MI to the current output stream. void HexagonAsmPrinter::emitInstruction(const MachineInstr *MI) { + Hexagon_MC::verifyInstructionPredicates(MI->getOpcode(), + getSubtargetInfo().getFeatureBits()); + MCInst MCB; MCB.setOpcode(Hexagon::BUNDLE); MCB.addOperand(MCOperand::createImm(0)); diff --git a/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp b/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp index 0b4a95bc9ce5..01501109f3b1 100644 --- a/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp +++ b/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp @@ -1024,7 +1024,7 @@ void HexagonFrameLowering::insertCFIInstructions(MachineFunction &MF) const { for (auto &B : MF) { auto At = findCFILocation(B); if (At) - insertCFIInstructionsAt(B, At.getValue()); + insertCFIInstructionsAt(B, At.value()); } } diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp index ed2856eb1fe9..9c235776c160 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp @@ -376,11 +376,9 @@ void HexagonMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, State.Bundle = &MI; State.Index = 0; size_t Last = HexagonMCInstrInfo::bundleSize(HMB) - 1; - FeatureBitset Features = computeAvailableFeatures(STI.getFeatureBits()); for (auto &I : HexagonMCInstrInfo::bundleInstructions(HMB)) { MCInst &HMI = const_cast<MCInst &>(*I.getInst()); - verifyInstructionPredicates(HMI, Features); EncodeSingleInstruction(HMI, OS, Fixups, STI, parseBits(Last, HMB, HMI)); State.Extended = HexagonMCInstrInfo::isImmext(HMI); @@ -793,5 +791,4 @@ MCCodeEmitter *llvm::createHexagonMCCodeEmitter(MCInstrInfo const &MII, return new HexagonMCCodeEmitter(MII, MCT); } -#define ENABLE_INSTR_PREDICATE_VERIFIER #include "HexagonGenMCCodeEmitter.inc" diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.h b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.h index 9e86dc8e4989..151964bf818b 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.h +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.h @@ -81,11 +81,6 @@ private: // Return parse bits for instruction `MCI' inside bundle `MCB' uint32_t parseBits(size_t Last, MCInst const &MCB, MCInst const &MCI) const; - - FeatureBitset computeAvailableFeatures(const FeatureBitset &FB) const; - void - verifyInstructionPredicates(const MCInst &MI, - const FeatureBitset &AvailableFeatures) const; }; } // end namespace llvm diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp index d068baf05998..f2d1173cd503 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp @@ -46,6 +46,7 @@ using namespace llvm; #define 
GET_INSTRINFO_MC_DESC +#define ENABLE_INSTR_PREDICATE_VERIFIER #include "HexagonGenInstrInfo.inc" #define GET_SUBTARGETINFO_MC_DESC diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h index d717e710f3c0..3932077c08f1 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h @@ -110,6 +110,7 @@ unsigned HexagonConvertUnits(unsigned ItinUnits, unsigned *Lanes); // #define GET_INSTRINFO_ENUM #define GET_INSTRINFO_SCHED_ENUM +#define GET_INSTRINFO_MC_HELPER_DECLS #include "HexagonGenInstrInfo.inc" #define GET_SUBTARGETINFO_ENUM diff --git a/llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp b/llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp index d715ba901a2b..33e7068622f1 100644 --- a/llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp +++ b/llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp @@ -705,14 +705,14 @@ LanaiAsmParser::parseRegister(bool RestoreOnFailure) { RegNum = MatchRegisterName(Lexer.getTok().getIdentifier()); if (RegNum == 0) { if (PercentTok && RestoreOnFailure) - Lexer.UnLex(PercentTok.getValue()); + Lexer.UnLex(PercentTok.value()); return nullptr; } Parser.Lex(); // Eat identifier token return LanaiOperand::createReg(RegNum, Start, End); } if (PercentTok && RestoreOnFailure) - Lexer.UnLex(PercentTok.getValue()); + Lexer.UnLex(PercentTok.value()); return nullptr; } diff --git a/llvm/lib/Target/Lanai/LanaiAsmPrinter.cpp b/llvm/lib/Target/Lanai/LanaiAsmPrinter.cpp index c0b7fd3fdd5d..d142fd3a414f 100644 --- a/llvm/lib/Target/Lanai/LanaiAsmPrinter.cpp +++ b/llvm/lib/Target/Lanai/LanaiAsmPrinter.cpp @@ -195,6 +195,9 @@ void LanaiAsmPrinter::customEmitInstruction(const MachineInstr *MI) { } void LanaiAsmPrinter::emitInstruction(const MachineInstr *MI) { + Lanai_MC::verifyInstructionPredicates(MI->getOpcode(), + getSubtargetInfo().getFeatureBits()); + MachineBasicBlock::const_instr_iterator I = MI->getIterator(); MachineBasicBlock::const_instr_iterator E = MI->getParent()->instr_end(); diff --git a/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp b/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp index eb6bf8d3836c..c43450869832 100644 --- a/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp +++ b/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp @@ -28,6 +28,7 @@ #include <string> #define GET_INSTRINFO_MC_DESC +#define ENABLE_INSTR_PREDICATE_VERIFIER #include "LanaiGenInstrInfo.inc" #define GET_SUBTARGETINFO_MC_DESC diff --git a/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.h b/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.h index e8da1bc88142..93fe1a4609d8 100644 --- a/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.h +++ b/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.h @@ -43,6 +43,7 @@ std::unique_ptr<MCObjectTargetWriter> createLanaiELFObjectWriter(uint8_t OSABI); // Defines symbolic names for the Lanai instructions. 
#define GET_INSTRINFO_ENUM +#define GET_INSTRINFO_MC_HELPER_DECLS #include "LanaiGenInstrInfo.inc" #define GET_SUBTARGETINFO_ENUM diff --git a/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.cpp b/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.cpp index dd61bb2df077..1467d1757ff0 100644 --- a/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.cpp @@ -27,6 +27,9 @@ using namespace llvm; #include "LoongArchGenMCPseudoLowering.inc" void LoongArchAsmPrinter::emitInstruction(const MachineInstr *MI) { + LoongArch_MC::verifyInstructionPredicates( + MI->getOpcode(), getSubtargetInfo().getFeatureBits()); + // Do any auto-generated pseudo lowerings. if (emitPseudoExpansionLowering(*OutStreamer, MI)) return; diff --git a/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.h b/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.h index 7e5aa49f227c..b51c19188051 100644 --- a/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.h +++ b/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.h @@ -39,6 +39,10 @@ public: // tblgen'erated function. bool emitPseudoExpansionLowering(MCStreamer &OutStreamer, const MachineInstr *MI); + // Wrapper needed for tblgenned pseudo lowering. + bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const { + return lowerLoongArchMachineOperandToMCOperand(MO, MCOp, *this); + } }; } // end namespace llvm diff --git a/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td index 5b117d40e0a9..20448492a558 100644 --- a/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td @@ -11,6 +11,22 @@ //===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// +// LoongArch specific DAG Nodes. +//===----------------------------------------------------------------------===// + +def SDT_LoongArchMOVGR2FR_W_LA64 + : SDTypeProfile<1, 1, [SDTCisVT<0, f32>, SDTCisVT<1, i64>]>; +def SDT_LoongArchMOVFR2GR_S_LA64 + : SDTypeProfile<1, 1, [SDTCisVT<0, i64>, SDTCisVT<1, f32>]>; +def SDT_LoongArchFTINT : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisFP<1>]>; + +def loongarch_movgr2fr_w_la64 + : SDNode<"LoongArchISD::MOVGR2FR_W_LA64", SDT_LoongArchMOVGR2FR_W_LA64>; +def loongarch_movfr2gr_s_la64 + : SDNode<"LoongArchISD::MOVFR2GR_S_LA64", SDT_LoongArchMOVFR2GR_S_LA64>; +def loongarch_ftint : SDNode<"LoongArchISD::FTINT", SDT_LoongArchFTINT>; + +//===----------------------------------------------------------------------===// // Instructions //===----------------------------------------------------------------------===// @@ -149,6 +165,7 @@ def : PatFPSetcc<SETULT, FCMP_CULT_S, FPR32>; def : PatFPSetcc<SETULE, FCMP_CULE_S, FPR32>; def : PatFPSetcc<SETUNE, FCMP_CUNE_S, FPR32>; def : PatFPSetcc<SETUO, FCMP_CUN_S, FPR32>; +def : PatFPSetcc<SETLT, FCMP_CLT_S, FPR32>; // TODO: Match signaling comparison strict_fsetccs with FCMP_S*_S instructions. 
@@ -174,4 +191,39 @@ def : PatFPSelectcc<SETULE, FCMP_CULE_S, FSEL_S, FPR32>; def : PatFPSelectcc<SETUNE, FCMP_CUNE_S, FSEL_S, FPR32>; def : PatFPSelectcc<SETUO, FCMP_CUN_S, FSEL_S, FPR32>; +/// Loads + +defm : LdPat<load, FLD_S, f32>; + +/// Stores + +defm : StPat<store, FST_S, FPR32, f32>; + +/// Floating point constants + +def : Pat<(f32 fpimm0), (MOVGR2FR_W R0)>; +def : Pat<(f32 fpimm0neg), (FNEG_S (MOVGR2FR_W R0))>; +def : Pat<(f32 fpimm1), (FFINT_S_W (MOVGR2FR_W (ADDI_W R0, 1)))>; + +// FP Conversion +def : Pat<(loongarch_ftint FPR32:$src), (FTINTRZ_W_S FPR32:$src)>; } // Predicates = [HasBasicF] + +let Predicates = [HasBasicF, IsLA64] in { +// GPR -> FPR +def : Pat<(loongarch_movgr2fr_w_la64 GPR:$src), (MOVGR2FR_W GPR:$src)>; +// FPR -> GPR +def : Pat<(loongarch_movfr2gr_s_la64 FPR32:$src), + (MOVFR2GR_S FPR32:$src)>; +// int -> f32 +def : Pat<(f32 (sint_to_fp GPR:$src)), (FFINT_S_W (MOVGR2FR_W GPR:$src))>; +} // Predicates = [HasBasicF, IsLA64] + +let Predicates = [HasBasicF, IsLA32] in { +// GPR -> FPR +def : Pat<(bitconvert (i32 GPR:$src)), (MOVGR2FR_W GPR:$src)>; +// FPR -> GPR +def : Pat<(i32 (bitconvert FPR32:$src)), (MOVFR2GR_S FPR32:$src)>; +// int -> f32 +def : Pat<(f32 (sint_to_fp (i32 GPR:$src))), (FFINT_S_W (MOVGR2FR_W GPR:$src))>; +} // Predicates = [HasBasicF, IsLA32] diff --git a/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td index 07fa61f4c361..bb50cec9f4c0 100644 --- a/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td @@ -131,6 +131,11 @@ def MOVGR2FR_D : FP_MOV<0b0000000100010100101010, "movgr2fr.d", FPR64, GPR>; def MOVFR2GR_D : FP_MOV<0b0000000100010100101110, "movfr2gr.d", GPR, FPR64>; } // Predicates = [HasBasicD, IsLA64] +// Instructions only available on LA32 +let Predicates = [HasBasicD, IsLA32], isCodeGenOnly = 1 in { +def MOVGR2FR_W_64 : FP_MOV<0b0000000100010100101001, "movgr2fr.w", FPR64, GPR>; +} // Predicates = [HasBasicD, IsLA32], isCodeGenOnly = 1 + //===----------------------------------------------------------------------===// // Pseudo-instructions and codegen patterns //===----------------------------------------------------------------------===// @@ -164,6 +169,7 @@ def : PatFPSetcc<SETULT, FCMP_CULT_D, FPR64>; def : PatFPSetcc<SETULE, FCMP_CULE_D, FPR64>; def : PatFPSetcc<SETUNE, FCMP_CUNE_D, FPR64>; def : PatFPSetcc<SETUO, FCMP_CUN_D, FPR64>; +def : PatFPSetcc<SETLT, FCMP_CLT_D, FPR64>; // TODO: Match signaling comparison strict_fsetccs with FCMP_S*_D instructions. 
@@ -185,4 +191,52 @@ def : PatFPSelectcc<SETULE, FCMP_CULE_D, FSEL_D, FPR64>; def : PatFPSelectcc<SETUNE, FCMP_CUNE_D, FSEL_D, FPR64>; def : PatFPSelectcc<SETUO, FCMP_CUN_D, FSEL_D, FPR64>; +/// Loads + +defm : LdPat<load, FLD_D, f64>; + +/// Stores + +defm : StPat<store, FST_D, FPR64, f64>; + +/// FP conversion operations + +def : Pat<(loongarch_ftint FPR64:$src), (FTINTRZ_W_D FPR64:$src)>; +def : Pat<(f64 (loongarch_ftint FPR64:$src)), (FTINTRZ_L_D FPR64:$src)>; +def : Pat<(f64 (loongarch_ftint FPR32:$src)), (FTINTRZ_L_S FPR32:$src)>; + +// f64 -> f32 +def : Pat<(f32 (fpround FPR64:$src)), (FCVT_S_D FPR64:$src)>; +// f32 -> f64 +def : Pat<(f64 (fpextend FPR32:$src)), (FCVT_D_S FPR32:$src)>; } // Predicates = [HasBasicD] + +/// Floating point constants + +let Predicates = [HasBasicD, IsLA64] in { +def : Pat<(f64 fpimm0), (MOVGR2FR_D R0)>; +def : Pat<(f64 fpimm0neg), (FNEG_D (MOVGR2FR_D R0))>; +def : Pat<(f64 fpimm1), (FFINT_D_L (MOVGR2FR_D (ADDI_D R0, 1)))>; + +// Convert int to FP +def : Pat<(f64 (sint_to_fp (i64 (sexti32 (i64 GPR:$src))))), + (FFINT_D_W (MOVGR2FR_W GPR:$src))>; +def : Pat<(f64 (sint_to_fp GPR:$src)), (FFINT_D_L (MOVGR2FR_D GPR:$src))>; + +def : Pat<(f64 (uint_to_fp (i64 (zexti32 (i64 GPR:$src))))), + (FFINT_D_W (MOVGR2FR_W GPR:$src))>; + +def : Pat<(bitconvert GPR:$src), (MOVGR2FR_D GPR:$src)>; + +// Convert FP to int +def : Pat<(bitconvert FPR64:$src), (MOVFR2GR_D FPR64:$src)>; } // Predicates = [HasBasicD, IsLA64] + +let Predicates = [HasBasicD, IsLA32] in { +def : Pat<(f64 fpimm0), (MOVGR2FRH_W (MOVGR2FR_W_64 R0), R0)>; +def : Pat<(f64 fpimm0neg), (FNEG_D (MOVGR2FRH_W (MOVGR2FR_W_64 R0), R0))>; +def : Pat<(f64 fpimm1), (FCVT_D_S (FFINT_S_W (MOVGR2FR_W (ADDI_W R0, 1))))>; + +// Convert int to FP +def : Pat<(f64 (sint_to_fp (i32 GPR:$src))), (FFINT_D_W (MOVGR2FR_W GPR:$src))>; +} // Predicates = [HasBasicD, IsLA32] diff --git a/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp index 7182d55ca3cf..0d9ec9e2eaaa 100644 --- a/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp @@ -11,7 +11,9 @@ //===----------------------------------------------------------------------===// #include "LoongArchFrameLowering.h" +#include "LoongArchMachineFunctionInfo.h" #include "LoongArchSubtarget.h" +#include "MCTargetDesc/LoongArchBaseInfo.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -44,12 +46,178 @@ bool LoongArchFrameLowering::hasBP(const MachineFunction &MF) const { return MFI.hasVarSizedObjects() && TRI->hasStackRealignment(MF); } +void LoongArchFrameLowering::adjustReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + const DebugLoc &DL, Register DestReg, + Register SrcReg, int64_t Val, + MachineInstr::MIFlag Flag) const { + const LoongArchInstrInfo *TII = STI.getInstrInfo(); + bool IsLA64 = STI.is64Bit(); + + if (DestReg == SrcReg && Val == 0) + return; + + if (isInt<12>(Val)) { + // addi.w/d $DstReg, $SrcReg, Val + BuildMI(MBB, MBBI, DL, + TII->get(IsLA64 ? LoongArch::ADDI_D : LoongArch::ADDI_W), DestReg) + .addReg(SrcReg) + .addImm(Val) + .setMIFlag(Flag); + return; + } + + report_fatal_error("adjustReg cannot yet handle adjustments >12 bits"); +} + +// Determine the size of the frame and maximum call frame size.
+void LoongArchFrameLowering::determineFrameLayout(MachineFunction &MF) const { + MachineFrameInfo &MFI = MF.getFrameInfo(); + + // Get the number of bytes to allocate from the FrameInfo. + uint64_t FrameSize = MFI.getStackSize(); + + // Make sure the frame is aligned. + FrameSize = alignTo(FrameSize, getStackAlign()); + + // Update frame info. + MFI.setStackSize(FrameSize); +} + void LoongArchFrameLowering::emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const { - // TODO: Implement this when we have function calls + MachineFrameInfo &MFI = MF.getFrameInfo(); + const LoongArchRegisterInfo *RI = STI.getRegisterInfo(); + const LoongArchInstrInfo *TII = STI.getInstrInfo(); + MachineBasicBlock::iterator MBBI = MBB.begin(); + + Register SPReg = LoongArch::R3; + Register FPReg = LoongArch::R22; + + // Debug location must be unknown since the first debug location is used + // to determine the end of the prologue. + DebugLoc DL; + + // Determine the correct frame layout + determineFrameLayout(MF); + + // First, compute final stack size. + uint64_t StackSize = MFI.getStackSize(); + + // Early exit if there is no need to allocate space in the stack. + if (StackSize == 0 && !MFI.adjustsStack()) + return; + + // Adjust stack. + adjustReg(MBB, MBBI, DL, SPReg, SPReg, -StackSize, MachineInstr::FrameSetup); + // Emit ".cfi_def_cfa_offset StackSize". + unsigned CFIIndex = + MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, StackSize)); + BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex) + .setMIFlag(MachineInstr::FrameSetup); + + const auto &CSI = MFI.getCalleeSavedInfo(); + + // The frame pointer is callee-saved, and code has been generated for us to + // save it to the stack. We need to skip over the storing of callee-saved + // registers as the frame pointer must be modified after it has been saved + // to the stack, not before. + std::advance(MBBI, CSI.size()); + + // Iterate over list of callee-saved registers and emit .cfi_offset + // directives. + for (const auto &Entry : CSI) { + int64_t Offset = MFI.getObjectOffset(Entry.getFrameIdx()); + unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset( + nullptr, RI->getDwarfRegNum(Entry.getReg(), true), Offset)); + BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex) + .setMIFlag(MachineInstr::FrameSetup); + } + + // Generate new FP. + if (hasFP(MF)) { + adjustReg(MBB, MBBI, DL, FPReg, SPReg, StackSize, MachineInstr::FrameSetup); + + // Emit ".cfi_def_cfa $fp, 0" + unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::cfiDefCfa( + nullptr, RI->getDwarfRegNum(FPReg, true), 0)); + BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex) + .setMIFlag(MachineInstr::FrameSetup); + } } void LoongArchFrameLowering::emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const { - // TODO: Implement this when we have function calls + const LoongArchRegisterInfo *RI = STI.getRegisterInfo(); + MachineFrameInfo &MFI = MF.getFrameInfo(); + Register SPReg = LoongArch::R3; + + MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator(); + DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc(); + + const auto &CSI = MFI.getCalleeSavedInfo(); + // Skip to before the restores of callee-saved registers. + auto LastFrameDestroy = MBBI; + if (!CSI.empty()) + LastFrameDestroy = std::prev(MBBI, CSI.size()); + + // Get the number of bytes from FrameInfo. 
+ uint64_t StackSize = MFI.getStackSize(); + + // Restore the stack pointer. + if (RI->hasStackRealignment(MF) || MFI.hasVarSizedObjects()) { + assert(hasFP(MF) && "frame pointer should not have been eliminated"); + adjustReg(MBB, LastFrameDestroy, DL, SPReg, LoongArch::R22, -StackSize, + MachineInstr::FrameDestroy); + } + + // Deallocate stack + adjustReg(MBB, MBBI, DL, SPReg, SPReg, StackSize, MachineInstr::FrameDestroy); +} + +void LoongArchFrameLowering::determineCalleeSaves(MachineFunction &MF, + BitVector &SavedRegs, + RegScavenger *RS) const { + TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS); + // Unconditionally spill RA and FP only if the function uses a frame + // pointer. + if (hasFP(MF)) { + SavedRegs.set(LoongArch::R1); + SavedRegs.set(LoongArch::R22); + } + // Mark BP as used if function has dedicated base pointer. + if (hasBP(MF)) + SavedRegs.set(LoongArchABI::getBPReg()); +} + +StackOffset LoongArchFrameLowering::getFrameIndexReference( + const MachineFunction &MF, int FI, Register &FrameReg) const { + const MachineFrameInfo &MFI = MF.getFrameInfo(); + const TargetRegisterInfo *RI = MF.getSubtarget().getRegisterInfo(); + + // Callee-saved registers should be referenced relative to the stack + // pointer (positive offset), otherwise use the frame pointer (negative + // offset). + const auto &CSI = MFI.getCalleeSavedInfo(); + int MinCSFI = 0; + int MaxCSFI = -1; + StackOffset Offset = + StackOffset::getFixed(MFI.getObjectOffset(FI) - getOffsetOfLocalArea() + + MFI.getOffsetAdjustment()); + + if (CSI.size()) { + MinCSFI = CSI[0].getFrameIdx(); + MaxCSFI = CSI[CSI.size() - 1].getFrameIdx(); + } + + FrameReg = RI->getFrameRegister(MF); + if ((FI >= MinCSFI && FI <= MaxCSFI) || !hasFP(MF)) { + FrameReg = LoongArch::R3; + Offset += StackOffset::getFixed(MFI.getStackSize()); + } + + return Offset; } diff --git a/llvm/lib/Target/LoongArch/LoongArchFrameLowering.h b/llvm/lib/Target/LoongArch/LoongArchFrameLowering.h index 25c53efc10f1..014b666de711 100644 --- a/llvm/lib/Target/LoongArch/LoongArchFrameLowering.h +++ b/llvm/lib/Target/LoongArch/LoongArchFrameLowering.h @@ -31,8 +31,26 @@ public: void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override; void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override; + void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs, + RegScavenger *RS) const override; + + MachineBasicBlock::iterator + eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI) const override { + return MBB.erase(MI); + } + + StackOffset getFrameIndexReference(const MachineFunction &MF, int FI, + Register &FrameReg) const override; + bool hasFP(const MachineFunction &MF) const override; bool hasBP(const MachineFunction &MF) const; + +private: + void determineFrameLayout(MachineFunction &MF) const; + void adjustReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + const DebugLoc &DL, Register DestReg, Register SrcReg, + int64_t Val, MachineInstr::MIFlag Flag) const; }; } // namespace llvm #endif // LLVM_LIB_TARGET_LOONGARCH_LOONGARCHFRAMELOWERING_H diff --git a/llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.cpp b/llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.cpp index cc9ea0255d98..bb40ff817574 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.cpp @@ -33,13 +33,14 @@ void LoongArchDAGToDAGISel::Select(SDNode *Node) { unsigned Opcode = Node->getOpcode(); MVT GRLenVT = 
Subtarget->getGRLenVT(); SDLoc DL(Node); + MVT VT = Node->getSimpleValueType(0); switch (Opcode) { default: break; case ISD::Constant: { int64_t Imm = cast<ConstantSDNode>(Node)->getSExtValue(); - if (Imm == 0 && Node->getSimpleValueType(0) == GRLenVT) { + if (Imm == 0 && VT == GRLenVT) { SDValue New = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL, LoongArch::R0, GRLenVT); ReplaceNode(Node, New.getNode()); @@ -60,6 +61,15 @@ void LoongArchDAGToDAGISel::Select(SDNode *Node) { ReplaceNode(Node, Result); return; } + case ISD::FrameIndex: { + SDValue Imm = CurDAG->getTargetConstant(0, DL, GRLenVT); + int FI = cast<FrameIndexSDNode>(Node)->getIndex(); + SDValue TFI = CurDAG->getTargetFrameIndex(FI, VT); + unsigned ADDIOp = + Subtarget->is64Bit() ? LoongArch::ADDI_D : LoongArch::ADDI_W; + ReplaceNode(Node, CurDAG->getMachineNode(ADDIOp, DL, VT, TFI, Imm)); + return; + } // TODO: Add selection nodes needed later. } @@ -67,6 +77,17 @@ void LoongArchDAGToDAGISel::Select(SDNode *Node) { SelectCode(Node); } +bool LoongArchDAGToDAGISel::SelectBaseAddr(SDValue Addr, SDValue &Base) { + // If this is FrameIndex, select it directly. Otherwise just let it get + // selected to a register independently. + if (auto *FIN = dyn_cast<FrameIndexSDNode>(Addr)) + Base = + CurDAG->getTargetFrameIndex(FIN->getIndex(), Subtarget->getGRLenVT()); + else + Base = Addr; + return true; +} + bool LoongArchDAGToDAGISel::selectShiftMask(SDValue N, unsigned ShiftWidth, SDValue &ShAmt) { // Shift instructions on LoongArch only read the lower 5 or 6 bits of the @@ -125,6 +146,39 @@ bool LoongArchDAGToDAGISel::selectShiftMask(SDValue N, unsigned ShiftWidth, return true; } +bool LoongArchDAGToDAGISel::selectSExti32(SDValue N, SDValue &Val) { + if (N.getOpcode() == ISD::SIGN_EXTEND_INREG && + cast<VTSDNode>(N.getOperand(1))->getVT() == MVT::i32) { + Val = N.getOperand(0); + return true; + } + MVT VT = N.getSimpleValueType(); + if (CurDAG->ComputeNumSignBits(N) > (VT.getSizeInBits() - 32)) { + Val = N; + return true; + } + + return false; +} + +bool LoongArchDAGToDAGISel::selectZExti32(SDValue N, SDValue &Val) { + if (N.getOpcode() == ISD::AND) { + auto *C = dyn_cast<ConstantSDNode>(N.getOperand(1)); + if (C && C->getZExtValue() == UINT64_C(0xFFFFFFFF)) { + Val = N.getOperand(0); + return true; + } + } + MVT VT = N.getSimpleValueType(); + APInt Mask = APInt::getHighBitsSet(VT.getSizeInBits(), 32); + if (CurDAG->MaskedValueIsZero(N, Mask)) { + Val = N; + return true; + } + + return false; +} + // This pass converts a legalized DAG into a LoongArch-specific DAG, ready // for instruction scheduling. FunctionPass *llvm::createLoongArchISelDag(LoongArchTargetMachine &TM) { diff --git a/llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.h b/llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.h index f477129d933c..7ad329a64424 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.h +++ b/llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.h @@ -38,6 +38,8 @@ public: void Select(SDNode *Node) override; + bool SelectBaseAddr(SDValue Addr, SDValue &Base); + bool selectShiftMask(SDValue N, unsigned ShiftWidth, SDValue &ShAmt); bool selectShiftMaskGRLen(SDValue N, SDValue &ShAmt) { return selectShiftMask(N, Subtarget->getGRLen(), ShAmt); @@ -46,6 +48,9 @@ public: return selectShiftMask(N, 32, ShAmt); } + bool selectSExti32(SDValue N, SDValue &Val); + bool selectZExti32(SDValue N, SDValue &Val); + // Include the pieces autogenerated from the target description. 
#include "LoongArchGenDAGISel.inc" }; diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index d5a469216859..4acf90bd9788 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -17,14 +17,21 @@ #include "LoongArchRegisterInfo.h" #include "LoongArchSubtarget.h" #include "LoongArchTargetMachine.h" +#include "MCTargetDesc/LoongArchMCTargetDesc.h" #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/ISDOpcodes.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/KnownBits.h" using namespace llvm; #define DEBUG_TYPE "loongarch-isel-lowering" +static cl::opt<bool> ZeroDivCheck( + "loongarch-check-zero-division", cl::Hidden, + cl::desc("Trap on integer division by zero."), + cl::init(false)); + LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, const LoongArchSubtarget &STI) : TargetLowering(TM), Subtarget(STI) { @@ -37,15 +44,25 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, if (Subtarget.hasBasicD()) addRegisterClass(MVT::f64, &LoongArch::FPR64RegClass); + setLoadExtAction({ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, GRLenVT, + MVT::i1, Promote); + // TODO: add necessary setOperationAction calls later. setOperationAction(ISD::SHL_PARTS, GRLenVT, Custom); setOperationAction(ISD::SRA_PARTS, GRLenVT, Custom); setOperationAction(ISD::SRL_PARTS, GRLenVT, Custom); + setOperationAction(ISD::FP_TO_SINT, GRLenVT, Custom); + + setOperationAction({ISD::GlobalAddress, ISD::ConstantPool}, GRLenVT, Custom); if (Subtarget.is64Bit()) { setOperationAction(ISD::SHL, MVT::i32, Custom); setOperationAction(ISD::SRA, MVT::i32, Custom); setOperationAction(ISD::SRL, MVT::i32, Custom); + setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom); + setOperationAction(ISD::BITCAST, MVT::i32, Custom); + if (Subtarget.hasBasicF() && !Subtarget.hasBasicD()) + setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom); } static const ISD::CondCode FPCCToExpand[] = {ISD::SETOGT, ISD::SETOGE, @@ -58,10 +75,19 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, if (Subtarget.hasBasicD()) { setCondCodeAction(FPCCToExpand, MVT::f64, Expand); setOperationAction(ISD::SELECT_CC, MVT::f64, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand); } + setOperationAction(ISD::BR_CC, GRLenVT, Expand); setOperationAction(ISD::SELECT_CC, GRLenVT, Expand); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); + setOperationAction({ISD::SMUL_LOHI, ISD::UMUL_LOHI}, GRLenVT, Expand); + if (!Subtarget.is64Bit()) + setLibcallName(RTLIB::MUL_I128, nullptr); + + setOperationAction(ISD::FP_TO_UINT, GRLenVT, Custom); + setOperationAction(ISD::UINT_TO_FP, GRLenVT, Custom); // Compute derived properties from the register classes. computeRegisterProperties(STI.getRegisterInfo()); @@ -70,11 +96,14 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, setBooleanContents(ZeroOrOneBooleanContent); + setMaxAtomicSizeInBitsSupported(Subtarget.getGRLen()); + // Function alignments. 
const Align FunctionAlignment(4); setMinFunctionAlignment(FunctionAlignment); setTargetDAGCombine(ISD::AND); + setTargetDAGCombine(ISD::OR); setTargetDAGCombine(ISD::SRL); } @@ -83,6 +112,8 @@ SDValue LoongArchTargetLowering::LowerOperation(SDValue Op, switch (Op.getOpcode()) { default: report_fatal_error("unimplemented operand"); + case ISD::GlobalAddress: + return lowerGlobalAddress(Op, DAG); case ISD::SHL_PARTS: return lowerShiftLeftParts(Op, DAG); case ISD::SRA_PARTS: @@ -96,7 +127,105 @@ SDValue LoongArchTargetLowering::LowerOperation(SDValue Op, assert(Op.getOperand(1).getValueType() == MVT::i32 && Subtarget.is64Bit() && "Unexpected custom legalisation"); return SDValue(); + case ISD::ConstantPool: + return lowerConstantPool(Op, DAG); + case ISD::FP_TO_SINT: + return lowerFP_TO_SINT(Op, DAG); + case ISD::BITCAST: + return lowerBITCAST(Op, DAG); + case ISD::FP_TO_UINT: + return SDValue(); + case ISD::UINT_TO_FP: + return lowerUINT_TO_FP(Op, DAG); + } +} + +SDValue LoongArchTargetLowering::lowerUINT_TO_FP(SDValue Op, + SelectionDAG &DAG) const { + + SDLoc DL(Op); + auto &TLI = DAG.getTargetLoweringInfo(); + SDValue Tmp1, Tmp2; + SDValue Op1 = Op.getOperand(0); + if (Op1->getOpcode() == ISD::AssertZext || + Op1->getOpcode() == ISD::AssertSext) + return Op; + SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Op.getOperand(0)); + SDValue Res = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f64, Trunc); + SDNode *N = Res.getNode(); + TLI.expandUINT_TO_FP(N, Tmp1, Tmp2, DAG); + return Tmp1; +} + +SDValue LoongArchTargetLowering::lowerBITCAST(SDValue Op, + SelectionDAG &DAG) const { + + SDLoc DL(Op); + SDValue Op0 = Op.getOperand(0); + + if (Op.getValueType() == MVT::f32 && Op0.getValueType() == MVT::i32 && + Subtarget.is64Bit() && Subtarget.hasBasicF()) { + SDValue NewOp0 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Op0); + return DAG.getNode(LoongArchISD::MOVGR2FR_W_LA64, DL, MVT::f32, NewOp0); } + return Op; +} + +SDValue LoongArchTargetLowering::lowerFP_TO_SINT(SDValue Op, + SelectionDAG &DAG) const { + + SDLoc DL(Op); + + if (Op.getValueSizeInBits() > 32 && Subtarget.hasBasicF() && + !Subtarget.hasBasicD()) { + SDValue Dst = + DAG.getNode(LoongArchISD::FTINT, DL, MVT::f32, Op.getOperand(0)); + return DAG.getNode(LoongArchISD::MOVFR2GR_S_LA64, DL, MVT::i64, Dst); + } + + EVT FPTy = EVT::getFloatingPointVT(Op.getValueSizeInBits()); + SDValue Trunc = DAG.getNode(LoongArchISD::FTINT, DL, FPTy, Op.getOperand(0)); + return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Trunc); +} + +SDValue LoongArchTargetLowering::lowerConstantPool(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + EVT Ty = Op.getValueType(); + ConstantPoolSDNode *N = cast<ConstantPoolSDNode>(Op); + + // FIXME: Only support PC-relative addressing to access the symbol. + // Target flags will be added later. + if (!isPositionIndependent()) { + SDValue ConstantN = DAG.getTargetConstantPool( + N->getConstVal(), Ty, N->getAlign(), N->getOffset()); + SDValue AddrHi(DAG.getMachineNode(LoongArch::PCALAU12I, DL, Ty, ConstantN), + 0); + SDValue Addr(DAG.getMachineNode(Subtarget.is64Bit() ? LoongArch::ADDI_D + : LoongArch::ADDI_W, + DL, Ty, AddrHi, ConstantN), + 0); + return Addr; + } + report_fatal_error("Unable to lower ConstantPool"); +} + +SDValue LoongArchTargetLowering::lowerGlobalAddress(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + EVT Ty = getPointerTy(DAG.getDataLayout()); + const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); + unsigned ADDIOp = Subtarget.is64Bit() ? 
LoongArch::ADDI_D : LoongArch::ADDI_W; + + // FIXME: Only support PC-relative addressing to access the symbol. + // TODO: Add target flags. + if (!isPositionIndependent()) { + SDValue GA = DAG.getTargetGlobalAddress(GV, DL, Ty); + SDValue AddrHi(DAG.getMachineNode(LoongArch::PCALAU12I, DL, Ty, GA), 0); + SDValue Addr(DAG.getMachineNode(ADDIOp, DL, Ty, AddrHi, GA), 0); + return Addr; + } + report_fatal_error("Unable to lowerGlobalAddress"); } SDValue LoongArchTargetLowering::lowerShiftLeftParts(SDValue Op, @@ -238,6 +367,36 @@ void LoongArchTargetLowering::ReplaceNodeResults( break; } break; + case ISD::FP_TO_SINT: { + assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() && + "Unexpected custom legalisation"); + SDValue Src = N->getOperand(0); + EVT VT = EVT::getFloatingPointVT(N->getValueSizeInBits(0)); + SDValue Dst = DAG.getNode(LoongArchISD::FTINT, DL, VT, Src); + Results.push_back(DAG.getNode(ISD::BITCAST, DL, N->getValueType(0), Dst)); + break; + } + case ISD::BITCAST: { + EVT VT = N->getValueType(0); + SDValue Src = N->getOperand(0); + EVT SrcVT = Src.getValueType(); + if (VT == MVT::i32 && SrcVT == MVT::f32 && Subtarget.is64Bit() && + Subtarget.hasBasicF()) { + SDValue Dst = + DAG.getNode(LoongArchISD::MOVFR2GR_S_LA64, DL, MVT::i64, Src); + Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Dst)); + } + break; + } + case ISD::FP_TO_UINT: { + assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() && + "Unexpected custom legalisation"); + auto &TLI = DAG.getTargetLoweringInfo(); + SDValue Tmp1, Tmp2; + TLI.expandFP_TO_UINT(N, Tmp1, Tmp2, DAG); + Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Tmp1)); + break; + } } } @@ -345,6 +504,224 @@ static SDValue performSRLCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } +static SDValue performORCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const LoongArchSubtarget &Subtarget) { + MVT GRLenVT = Subtarget.getGRLenVT(); + EVT ValTy = N->getValueType(0); + SDValue N0 = N->getOperand(0), N1 = N->getOperand(1); + ConstantSDNode *CN0, *CN1; + SDLoc DL(N); + unsigned ValBits = ValTy.getSizeInBits(); + unsigned MaskIdx0, MaskLen0, MaskIdx1, MaskLen1; + unsigned Shamt; + bool SwapAndRetried = false; + + if (DCI.isBeforeLegalizeOps()) + return SDValue(); + + if (ValBits != 32 && ValBits != 64) + return SDValue(); + +Retry: + // 1st pattern to match BSTRINS: + // R = or (and X, mask0), (and (shl Y, lsb), mask1) + // where mask1 = (2**size - 1) << lsb, mask0 = ~mask1 + // => + // R = BSTRINS X, Y, msb, lsb (where msb = lsb + size - 1) + if (N0.getOpcode() == ISD::AND && + (CN0 = dyn_cast<ConstantSDNode>(N0.getOperand(1))) && + isShiftedMask_64(~CN0->getSExtValue(), MaskIdx0, MaskLen0) && + N1.getOpcode() == ISD::AND && N1.getOperand(0).getOpcode() == ISD::SHL && + (CN1 = dyn_cast<ConstantSDNode>(N1.getOperand(1))) && + isShiftedMask_64(CN1->getZExtValue(), MaskIdx1, MaskLen1) && + MaskIdx0 == MaskIdx1 && MaskLen0 == MaskLen1 && + (CN1 = dyn_cast<ConstantSDNode>(N1.getOperand(0).getOperand(1))) && + (Shamt = CN1->getZExtValue()) == MaskIdx0 && + (MaskIdx0 + MaskLen0 <= ValBits)) { + LLVM_DEBUG(dbgs() << "Perform OR combine: match pattern 1\n"); + return DAG.getNode(LoongArchISD::BSTRINS, DL, ValTy, N0.getOperand(0), + N1.getOperand(0).getOperand(0), + DAG.getConstant((MaskIdx0 + MaskLen0 - 1), DL, GRLenVT), + DAG.getConstant(MaskIdx0, DL, GRLenVT)); + } + + // 2nd pattern to match BSTRINS: + // R = or (and X, mask0), (shl (and Y, mask1), lsb) + // where mask1 = (2**size - 1), mask0 
= ~(mask1 << lsb) + // => + // R = BSTRINS X, Y, msb, lsb (where msb = lsb + size - 1) + if (N0.getOpcode() == ISD::AND && + (CN0 = dyn_cast<ConstantSDNode>(N0.getOperand(1))) && + isShiftedMask_64(~CN0->getSExtValue(), MaskIdx0, MaskLen0) && + N1.getOpcode() == ISD::SHL && N1.getOperand(0).getOpcode() == ISD::AND && + (CN1 = dyn_cast<ConstantSDNode>(N1.getOperand(1))) && + (Shamt = CN1->getZExtValue()) == MaskIdx0 && + (CN1 = dyn_cast<ConstantSDNode>(N1.getOperand(0).getOperand(1))) && + isShiftedMask_64(CN1->getZExtValue(), MaskIdx1, MaskLen1) && + MaskLen0 == MaskLen1 && MaskIdx1 == 0 && + (MaskIdx0 + MaskLen0 <= ValBits)) { + LLVM_DEBUG(dbgs() << "Perform OR combine: match pattern 2\n"); + return DAG.getNode(LoongArchISD::BSTRINS, DL, ValTy, N0.getOperand(0), + N1.getOperand(0).getOperand(0), + DAG.getConstant((MaskIdx0 + MaskLen0 - 1), DL, GRLenVT), + DAG.getConstant(MaskIdx0, DL, GRLenVT)); + } + + // 3rd pattern to match BSTRINS: + // R = or (and X, mask0), (and Y, mask1) + // where ~mask0 = (2**size - 1) << lsb, mask0 & mask1 = 0 + // => + // R = BSTRINS X, (shr (and Y, mask1), lsb), msb, lsb + // where msb = lsb + size - 1 + if (N0.getOpcode() == ISD::AND && N1.getOpcode() == ISD::AND && + (CN0 = dyn_cast<ConstantSDNode>(N0.getOperand(1))) && + isShiftedMask_64(~CN0->getSExtValue(), MaskIdx0, MaskLen0) && + (MaskIdx0 + MaskLen0 <= 64) && + (CN1 = dyn_cast<ConstantSDNode>(N1->getOperand(1))) && + (CN1->getSExtValue() & CN0->getSExtValue()) == 0) { + LLVM_DEBUG(dbgs() << "Perform OR combine: match pattern 3\n"); + return DAG.getNode(LoongArchISD::BSTRINS, DL, ValTy, N0.getOperand(0), + DAG.getNode(ISD::SRL, DL, N1->getValueType(0), N1, + DAG.getConstant(MaskIdx0, DL, GRLenVT)), + DAG.getConstant(ValBits == 32 + ? (MaskIdx0 + (MaskLen0 & 31) - 1) + : (MaskIdx0 + MaskLen0 - 1), + DL, GRLenVT), + DAG.getConstant(MaskIdx0, DL, GRLenVT)); + } + + // 4th pattern to match BSTRINS: + // R = or (and X, mask), (shl Y, shamt) + // where mask = (2**shamt - 1) + // => + // R = BSTRINS X, Y, ValBits - 1, shamt + // where ValBits = 32 or 64 + if (N0.getOpcode() == ISD::AND && N1.getOpcode() == ISD::SHL && + (CN0 = dyn_cast<ConstantSDNode>(N0.getOperand(1))) && + isShiftedMask_64(CN0->getZExtValue(), MaskIdx0, MaskLen0) && + MaskIdx0 == 0 && (CN1 = dyn_cast<ConstantSDNode>(N1.getOperand(1))) && + (Shamt = CN1->getZExtValue()) == MaskLen0 && + (MaskIdx0 + MaskLen0 <= ValBits)) { + LLVM_DEBUG(dbgs() << "Perform OR combine: match pattern 4\n"); + return DAG.getNode(LoongArchISD::BSTRINS, DL, ValTy, N0.getOperand(0), + N1.getOperand(0), + DAG.getConstant((ValBits - 1), DL, GRLenVT), + DAG.getConstant(Shamt, DL, GRLenVT)); + } + + // 5th pattern to match BSTRINS: + // R = or (and X, mask), const + // where ~mask = (2**size - 1) << lsb, mask & const = 0 + // => + // R = BSTRINS X, (const >> lsb), msb, lsb + // where msb = lsb + size - 1 + if (N0.getOpcode() == ISD::AND && + (CN0 = dyn_cast<ConstantSDNode>(N0.getOperand(1))) && + isShiftedMask_64(~CN0->getSExtValue(), MaskIdx0, MaskLen0) && + (CN1 = dyn_cast<ConstantSDNode>(N1)) && + (CN1->getSExtValue() & CN0->getSExtValue()) == 0) { + LLVM_DEBUG(dbgs() << "Perform OR combine: match pattern 5\n"); + return DAG.getNode( + LoongArchISD::BSTRINS, DL, ValTy, N0.getOperand(0), + DAG.getConstant(CN1->getSExtValue() >> MaskIdx0, DL, ValTy), + DAG.getConstant((MaskIdx0 + MaskLen0 - 1), DL, GRLenVT), + DAG.getConstant(MaskIdx0, DL, GRLenVT)); + } + + // 6th pattern. 
+ // a = b | ((c & mask) << shamt), where all positions in b to be overwritten + // by the incoming bits are known to be zero. + // => + // a = BSTRINS b, c, shamt + MaskLen - 1, shamt + // + // Note that the 1st pattern is a special situation of the 6th, i.e. the 6th + // pattern is more common than the 1st. So we put the 1st before the 6th in + // order to match as many nodes as possible. + ConstantSDNode *CNMask, *CNShamt; + unsigned MaskIdx, MaskLen; + if (N1.getOpcode() == ISD::SHL && N1.getOperand(0).getOpcode() == ISD::AND && + (CNMask = dyn_cast<ConstantSDNode>(N1.getOperand(0).getOperand(1))) && + isShiftedMask_64(CNMask->getZExtValue(), MaskIdx, MaskLen) && + MaskIdx == 0 && (CNShamt = dyn_cast<ConstantSDNode>(N1.getOperand(1))) && + CNShamt->getZExtValue() + MaskLen <= ValBits) { + Shamt = CNShamt->getZExtValue(); + APInt ShMask(ValBits, CNMask->getZExtValue() << Shamt); + if (ShMask.isSubsetOf(DAG.computeKnownBits(N0).Zero)) { + LLVM_DEBUG(dbgs() << "Perform OR combine: match pattern 6\n"); + return DAG.getNode(LoongArchISD::BSTRINS, DL, ValTy, N0, + N1.getOperand(0).getOperand(0), + DAG.getConstant(Shamt + MaskLen - 1, DL, GRLenVT), + DAG.getConstant(Shamt, DL, GRLenVT)); + } + } + + // 7th pattern. + // a = b | ((c << shamt) & shifted_mask), where all positions in b to be + // overwritten by the incoming bits are known to be zero. + // => + // a = BSTRINS b, c, MaskIdx + MaskLen - 1, MaskIdx + // + // Similarly, the 7th pattern is more common than the 2nd. So we put the 2nd + // before the 7th in order to match as many nodes as possible. + if (N1.getOpcode() == ISD::AND && + (CNMask = dyn_cast<ConstantSDNode>(N1.getOperand(1))) && + isShiftedMask_64(CNMask->getZExtValue(), MaskIdx, MaskLen) && + N1.getOperand(0).getOpcode() == ISD::SHL && + (CNShamt = dyn_cast<ConstantSDNode>(N1.getOperand(0).getOperand(1))) && + CNShamt->getZExtValue() == MaskIdx) { + APInt ShMask(ValBits, CNMask->getZExtValue()); + if (ShMask.isSubsetOf(DAG.computeKnownBits(N0).Zero)) { + LLVM_DEBUG(dbgs() << "Perform OR combine: match pattern 7\n"); + return DAG.getNode(LoongArchISD::BSTRINS, DL, ValTy, N0, + N1.getOperand(0).getOperand(0), + DAG.getConstant(MaskIdx + MaskLen - 1, DL, GRLenVT), + DAG.getConstant(MaskIdx, DL, GRLenVT)); + } + } + + // (or a, b) and (or b, a) are equivalent, so swap the operands and retry. + if (!SwapAndRetried) { + std::swap(N0, N1); + SwapAndRetried = true; + goto Retry; + } + + SwapAndRetried = false; +Retry2: + // 8th pattern. + // a = b | (c & shifted_mask), where all positions in b to be overwritten by + // the incoming bits are known to be zero. + // => + // a = BSTRINS b, c >> MaskIdx, MaskIdx + MaskLen - 1, MaskIdx + // + // Similarly, the 8th pattern is more common than the 4th and 5th patterns. So + // we put it here in order to match as many nodes as possible or generate less + // instructions. + if (N1.getOpcode() == ISD::AND && + (CNMask = dyn_cast<ConstantSDNode>(N1.getOperand(1))) && + isShiftedMask_64(CNMask->getZExtValue(), MaskIdx, MaskLen)) { + APInt ShMask(ValBits, CNMask->getZExtValue()); + if (ShMask.isSubsetOf(DAG.computeKnownBits(N0).Zero)) { + LLVM_DEBUG(dbgs() << "Perform OR combine: match pattern 8\n"); + return DAG.getNode(LoongArchISD::BSTRINS, DL, ValTy, N0, + DAG.getNode(ISD::SRL, DL, N1->getValueType(0), + N1->getOperand(0), + DAG.getConstant(MaskIdx, DL, GRLenVT)), + DAG.getConstant(MaskIdx + MaskLen - 1, DL, GRLenVT), + DAG.getConstant(MaskIdx, DL, GRLenVT)); + } + } + // Swap N0/N1 and retry. 
+ if (!SwapAndRetried) { + std::swap(N0, N1); + SwapAndRetried = true; + goto Retry2; + } + + return SDValue(); +} + SDValue LoongArchTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -353,12 +730,62 @@ SDValue LoongArchTargetLowering::PerformDAGCombine(SDNode *N, break; case ISD::AND: return performANDCombine(N, DAG, DCI, Subtarget); + case ISD::OR: + return performORCombine(N, DAG, DCI, Subtarget); case ISD::SRL: return performSRLCombine(N, DAG, DCI, Subtarget); } return SDValue(); } +static MachineBasicBlock *insertDivByZeroTrap(MachineInstr &MI, + MachineBasicBlock &MBB, + const TargetInstrInfo &TII) { + if (!ZeroDivCheck) + return &MBB; + + // Build instructions: + // div(or mod) $dst, $dividend, $divisor + // bnez $divisor, 8 + // break 7 + // fallthrough + MachineOperand &Divisor = MI.getOperand(2); + auto FallThrough = std::next(MI.getIterator()); + + BuildMI(MBB, FallThrough, MI.getDebugLoc(), TII.get(LoongArch::BNEZ)) + .addReg(Divisor.getReg(), getKillRegState(Divisor.isKill())) + .addImm(8); + + // See linux header file arch/loongarch/include/uapi/asm/break.h for the + // definition of BRK_DIVZERO. + BuildMI(MBB, FallThrough, MI.getDebugLoc(), TII.get(LoongArch::BREAK)) + .addImm(7/*BRK_DIVZERO*/); + + // Clear Divisor's kill flag. + Divisor.setIsKill(false); + + return &MBB; +} + +MachineBasicBlock *LoongArchTargetLowering::EmitInstrWithCustomInserter( + MachineInstr &MI, MachineBasicBlock *BB) const { + + switch (MI.getOpcode()) { + default: + llvm_unreachable("Unexpected instr type to insert"); + case LoongArch::DIV_W: + case LoongArch::DIV_WU: + case LoongArch::MOD_W: + case LoongArch::MOD_WU: + case LoongArch::DIV_D: + case LoongArch::DIV_DU: + case LoongArch::MOD_D: + case LoongArch::MOD_DU: + return insertDivByZeroTrap(MI, *BB, *Subtarget.getInstrInfo()); + break; + } +} + const char *LoongArchTargetLowering::getTargetNodeName(unsigned Opcode) const { switch ((LoongArchISD::NodeType)Opcode) { case LoongArchISD::FIRST_NUMBER: @@ -369,11 +796,16 @@ const char *LoongArchTargetLowering::getTargetNodeName(unsigned Opcode) const { return "LoongArchISD::" #node; // TODO: Add more target-dependent nodes later. + NODE_NAME_CASE(CALL) NODE_NAME_CASE(RET) NODE_NAME_CASE(SLL_W) NODE_NAME_CASE(SRA_W) NODE_NAME_CASE(SRL_W) + NODE_NAME_CASE(BSTRINS) NODE_NAME_CASE(BSTRPICK) + NODE_NAME_CASE(MOVGR2FR_W_LA64) + NODE_NAME_CASE(MOVFR2GR_S_LA64) + NODE_NAME_CASE(FTINT) } #undef NODE_NAME_CASE return nullptr; @@ -483,6 +915,132 @@ SDValue LoongArchTargetLowering::LowerFormalArguments( return Chain; } +// Lower a call to a callseq_start + CALL + callseq_end chain, and add input +// and output parameter nodes. +SDValue +LoongArchTargetLowering::LowerCall(CallLoweringInfo &CLI, + SmallVectorImpl<SDValue> &InVals) const { + SelectionDAG &DAG = CLI.DAG; + SDLoc &DL = CLI.DL; + SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs; + SmallVectorImpl<SDValue> &OutVals = CLI.OutVals; + SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins; + SDValue Chain = CLI.Chain; + SDValue Callee = CLI.Callee; + CallingConv::ID CallConv = CLI.CallConv; + bool IsVarArg = CLI.IsVarArg; + EVT PtrVT = getPointerTy(DAG.getDataLayout()); + CLI.IsTailCall = false; + + if (IsVarArg) + report_fatal_error("LowerCall with varargs not implemented"); + + MachineFunction &MF = DAG.getMachineFunction(); + + // Analyze the operands of the call, assigning locations to each operand. 
+ SmallVector<CCValAssign> ArgLocs; + CCState ArgCCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext()); + + analyzeOutputArgs(ArgCCInfo, Outs, CC_LoongArch); + + // Get a count of how many bytes are to be pushed on the stack. + unsigned NumBytes = ArgCCInfo.getNextStackOffset(); + + for (auto &Arg : Outs) { + if (!Arg.Flags.isByVal()) + continue; + report_fatal_error("Passing arguments byval not implemented"); + } + + Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, CLI.DL); + + // Copy argument values to their designated locations. + SmallVector<std::pair<Register, SDValue>> RegsToPass; + for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { + CCValAssign &VA = ArgLocs[i]; + SDValue ArgValue = OutVals[i]; + + // Promote the value if needed. + // For now, only handle fully promoted arguments. + if (VA.getLocInfo() != CCValAssign::Full) + report_fatal_error("Unknown loc info"); + + if (VA.isRegLoc()) { + // Queue up the argument copies and emit them at the end. + RegsToPass.push_back(std::make_pair(VA.getLocReg(), ArgValue)); + } else { + report_fatal_error("Passing arguments via the stack not implemented"); + } + } + + SDValue Glue; + + // Build a sequence of copy-to-reg nodes, chained and glued together. + for (auto &Reg : RegsToPass) { + Chain = DAG.getCopyToReg(Chain, DL, Reg.first, Reg.second, Glue); + Glue = Chain.getValue(1); + } + + // If the callee is a GlobalAddress/ExternalSymbol node, turn it into a + // TargetGlobalAddress/TargetExternalSymbol node so that legalize won't + // split it and then direct call can be matched by PseudoCALL. + // FIXME: Add target flags for relocation. + if (GlobalAddressSDNode *S = dyn_cast<GlobalAddressSDNode>(Callee)) + Callee = DAG.getTargetGlobalAddress(S->getGlobal(), DL, PtrVT); + else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) + Callee = DAG.getTargetExternalSymbol(S->getSymbol(), PtrVT); + + // The first call operand is the chain and the second is the target address. + SmallVector<SDValue> Ops; + Ops.push_back(Chain); + Ops.push_back(Callee); + + // Add argument registers to the end of the list so that they are + // known live into the call. + for (auto &Reg : RegsToPass) + Ops.push_back(DAG.getRegister(Reg.first, Reg.second.getValueType())); + + // Add a register mask operand representing the call-preserved registers. + const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo(); + const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv); + assert(Mask && "Missing call preserved mask for calling convention"); + Ops.push_back(DAG.getRegisterMask(Mask)); + + // Glue the call to the argument copies, if any. + if (Glue.getNode()) + Ops.push_back(Glue); + + // Emit the call. + SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); + + Chain = DAG.getNode(LoongArchISD::CALL, DL, NodeTys, Ops); + DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge); + Glue = Chain.getValue(1); + + // Mark the end of the call, which is glued to the call itself. + Chain = DAG.getCALLSEQ_END(Chain, DAG.getConstant(NumBytes, DL, PtrVT, true), + DAG.getConstant(0, DL, PtrVT, true), Glue, DL); + Glue = Chain.getValue(1); + + // Assign locations to each value returned by this call. + SmallVector<CCValAssign> RVLocs; + CCState RetCCInfo(CallConv, IsVarArg, MF, RVLocs, *DAG.getContext()); + analyzeInputArgs(RetCCInfo, Ins, CC_LoongArch); + + // Copy all of the result registers out of their specified physreg. + for (auto &VA : RVLocs) { + // Copy the value out. 
+ SDValue RetValue = + DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), Glue); + Chain = RetValue.getValue(1); + Glue = RetValue.getValue(2); + + InVals.push_back(Chain.getValue(0)); + } + + return Chain; +} + bool LoongArchTargetLowering::CanLowerReturn( CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const { @@ -529,3 +1087,14 @@ SDValue LoongArchTargetLowering::LowerReturn( return DAG.getNode(LoongArchISD::RET, DL, MVT::Other, RetOps); } + +bool LoongArchTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT, + bool ForCodeSize) const { + assert((VT == MVT::f32 || VT == MVT::f64) && "Unexpected VT"); + + if (VT == MVT::f32 && !Subtarget.hasBasicF()) + return false; + if (VT == MVT::f64 && !Subtarget.hasBasicD()) + return false; + return (Imm.isZero() || Imm.isExactlyValue(+1.0)); +} diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h index c852577a3744..279550482675 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h @@ -27,6 +27,7 @@ enum NodeType : unsigned { FIRST_NUMBER = ISD::BUILTIN_OP_END, // TODO: add more LoongArchISDs + CALL, RET, // 32-bit shifts, directly matching the semantics of the named LoongArch // instructions. @@ -34,6 +35,13 @@ enum NodeType : unsigned { SRA_W, SRL_W, + // FPR<->GPR transfer operations + MOVGR2FR_W_LA64, + MOVFR2GR_S_LA64, + + FTINT, + + BSTRINS, BSTRPICK, }; @@ -72,6 +80,8 @@ public: const SmallVectorImpl<ISD::OutputArg> &Outs, const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override; + SDValue LowerCall(TargetLowering::CallLoweringInfo &CLI, + SmallVectorImpl<SDValue> &InVals) const override; private: /// Target-specific function used to lower LoongArch calling conventions. @@ -86,8 +96,24 @@ private: const SmallVectorImpl<ISD::OutputArg> &Outs, LoongArchCCAssignFn Fn) const; + SDValue lowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const; SDValue lowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const; SDValue lowerShiftRightParts(SDValue Op, SelectionDAG &DAG, bool IsSRA) const; + + MachineBasicBlock * + EmitInstrWithCustomInserter(MachineInstr &MI, + MachineBasicBlock *BB) const override; + SDValue lowerConstantPool(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerBITCAST(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; + + bool isFPImmLegal(const APFloat &Imm, EVT VT, + bool ForCodeSize) const override; + + bool shouldInsertFencesForAtomic(const Instruction *I) const override { + return isa<LoadInst>(I) || isa<StoreInst>(I); + } }; } // end namespace llvm diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp index 146ef53befd5..bcbd4b28f3c7 100644 --- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp @@ -12,6 +12,7 @@ #include "LoongArchInstrInfo.h" #include "LoongArch.h" +#include "LoongArchMachineFunctionInfo.h" using namespace llvm; @@ -19,8 +20,8 @@ using namespace llvm; #include "LoongArchGenInstrInfo.inc" LoongArchInstrInfo::LoongArchInstrInfo(LoongArchSubtarget &STI) - // FIXME: add CFSetup and CFDestroy Inst when we implement function call. 
- : LoongArchGenInstrInfo() {} + : LoongArchGenInstrInfo(LoongArch::ADJCALLSTACKDOWN, + LoongArch::ADJCALLSTACKUP) {} void LoongArchInstrInfo::copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, @@ -47,3 +48,68 @@ void LoongArchInstrInfo::copyPhysReg(MachineBasicBlock &MBB, BuildMI(MBB, MBBI, DL, get(Opc), DstReg) .addReg(SrcReg, getKillRegState(KillSrc)); } + +void LoongArchInstrInfo::storeRegToStackSlot( + MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register SrcReg, + bool IsKill, int FI, const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const { + DebugLoc DL; + if (I != MBB.end()) + DL = I->getDebugLoc(); + MachineFunction *MF = MBB.getParent(); + MachineFrameInfo &MFI = MF->getFrameInfo(); + + unsigned Opcode; + if (LoongArch::GPRRegClass.hasSubClassEq(RC)) + Opcode = TRI->getRegSizeInBits(LoongArch::GPRRegClass) == 32 + ? LoongArch::ST_W + : LoongArch::ST_D; + else if (LoongArch::FPR32RegClass.hasSubClassEq(RC)) + Opcode = LoongArch::FST_S; + else if (LoongArch::FPR64RegClass.hasSubClassEq(RC)) + Opcode = LoongArch::FST_D; + else + llvm_unreachable("Can't store this register to stack slot"); + + MachineMemOperand *MMO = MF->getMachineMemOperand( + MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOStore, + MFI.getObjectSize(FI), MFI.getObjectAlign(FI)); + + BuildMI(MBB, I, DL, get(Opcode)) + .addReg(SrcReg, getKillRegState(IsKill)) + .addFrameIndex(FI) + .addImm(0) + .addMemOperand(MMO); +} + +void LoongArchInstrInfo::loadRegFromStackSlot( + MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register DstReg, + int FI, const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const { + DebugLoc DL; + if (I != MBB.end()) + DL = I->getDebugLoc(); + MachineFunction *MF = MBB.getParent(); + MachineFrameInfo &MFI = MF->getFrameInfo(); + + unsigned Opcode; + if (LoongArch::GPRRegClass.hasSubClassEq(RC)) + Opcode = TRI->getRegSizeInBits(LoongArch::GPRRegClass) == 32 + ? 
LoongArch::LD_W + : LoongArch::LD_D; + else if (LoongArch::FPR32RegClass.hasSubClassEq(RC)) + Opcode = LoongArch::FLD_S; + else if (LoongArch::FPR64RegClass.hasSubClassEq(RC)) + Opcode = LoongArch::FLD_D; + else + llvm_unreachable("Can't load this register from stack slot"); + + MachineMemOperand *MMO = MF->getMachineMemOperand( + MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOLoad, + MFI.getObjectSize(FI), MFI.getObjectAlign(FI)); + + BuildMI(MBB, I, DL, get(Opcode), DstReg) + .addFrameIndex(FI) + .addImm(0) + .addMemOperand(MMO); +} diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h index f31943b85a51..0a8c86a5e0c2 100644 --- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h +++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h @@ -30,6 +30,16 @@ public: void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, MCRegister DstReg, MCRegister SrcReg, bool KillSrc) const override; + + void storeRegToStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, Register SrcReg, + bool IsKill, int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const override; + void loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, Register DstReg, + int FrameIndex, const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const override; }; } // end namespace llvm diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td index 6b8ee9e43f94..d07d086bd7da 100644 --- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td @@ -14,22 +14,45 @@ // LoongArch specific DAG Nodes. //===----------------------------------------------------------------------===// +// Target-independent type requirements, but with target-specific formats. +def SDT_CallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>, + SDTCisVT<1, i32>]>; +def SDT_CallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i32>, + SDTCisVT<1, i32>]>; + // Target-dependent type requirements. +def SDT_LoongArchCall : SDTypeProfile<0, -1, [SDTCisVT<0, GRLenVT>]>; def SDT_LoongArchIntBinOpW : SDTypeProfile<1, 2, [ SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisVT<0, i64> ]>; +def SDT_LoongArchBStrIns: SDTypeProfile<1, 4, [ + SDTCisInt<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisInt<3>, + SDTCisSameAs<3, 4> +]>; + def SDT_LoongArchBStrPick: SDTypeProfile<1, 3, [ SDTCisInt<0>, SDTCisSameAs<0, 1>, SDTCisInt<2>, SDTCisSameAs<2, 3> ]>; // TODO: Add LoongArch specific DAG Nodes +// Target-independent nodes, but with target-specific formats. +def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_CallSeqStart, + [SDNPHasChain, SDNPOutGlue]>; +def callseq_end : SDNode<"ISD::CALLSEQ_END", SDT_CallSeqEnd, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; + // Target-dependent nodes. 
+def loongarch_call : SDNode<"LoongArchISD::CALL", SDT_LoongArchCall, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, + SDNPVariadic]>; def loongarch_ret : SDNode<"LoongArchISD::RET", SDTNone, [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; def loongarch_sll_w : SDNode<"LoongArchISD::SLL_W", SDT_LoongArchIntBinOpW>; def loongarch_sra_w : SDNode<"LoongArchISD::SRA_W", SDT_LoongArchIntBinOpW>; def loongarch_srl_w : SDNode<"LoongArchISD::SRL_W", SDT_LoongArchIntBinOpW>; +def loongarch_bstrins + : SDNode<"LoongArchISD::BSTRINS", SDT_LoongArchBStrIns>; def loongarch_bstrpick : SDNode<"LoongArchISD::BSTRPICK", SDT_LoongArchBStrPick>; @@ -106,7 +129,14 @@ def simm16 : Operand<GRLenVT> { let DecoderMethod = "decodeSImmOperand<16>"; } -def simm16_lsl2 : Operand<GRLenVT> { +def simm16_lsl2 : Operand<GRLenVT>, + ImmLeaf<GRLenVT, [{return isInt<16>(Imm>>2);}]> { + let ParserMatchClass = SImmAsmOperand<16, "lsl2">; + let EncoderMethod = "getImmOpValueAsr2"; + let DecoderMethod = "decodeSImmOperand<16, 2>"; +} + +def simm16_lsl2_br : Operand<OtherVT> { let ParserMatchClass = SImmAsmOperand<16, "lsl2">; let EncoderMethod = "getImmOpValueAsr2"; let DecoderMethod = "decodeSImmOperand<16, 2>"; @@ -117,13 +147,13 @@ def simm20 : Operand<GRLenVT> { let DecoderMethod = "decodeSImmOperand<20>"; } -def simm21_lsl2 : Operand<GRLenVT> { +def simm21_lsl2 : Operand<OtherVT> { let ParserMatchClass = SImmAsmOperand<21, "lsl2">; let EncoderMethod = "getImmOpValueAsr2"; let DecoderMethod = "decodeSImmOperand<21, 2>"; } -def simm26_lsl2 : Operand<GRLenVT> { +def simm26_lsl2 : Operand<OtherVT> { let ParserMatchClass = SImmAsmOperand<26, "lsl2">; let EncoderMethod = "getImmOpValueAsr2"; let DecoderMethod = "decodeSImmOperand<26, 2>"; @@ -141,6 +171,24 @@ def NegImm : SDNodeXForm<imm, [{ N->getValueType(0)); }]>; +// FP immediate patterns. +def fpimm0 : PatLeaf<(fpimm), [{return N->isExactlyValue(+0.0);}]>; +def fpimm0neg : PatLeaf<(fpimm), [{return N->isExactlyValue(-0.0);}]>; +def fpimm1 : PatLeaf<(fpimm), [{return N->isExactlyValue(+1.0);}]>; + +def CallSymbol: AsmOperandClass { + let Name = "CallSymbol"; + let RenderMethod = "addImmOperands"; + let PredicateMethod = "isImm"; +} + +// A bare symbol used in call only. 
+def call_symbol : Operand<iPTR> { + let ParserMatchClass = CallSymbol; +} + +def BaseAddr : ComplexPattern<iPTR, 1, "SelectBaseAddr">; + //===----------------------------------------------------------------------===// // Instruction Formats //===----------------------------------------------------------------------===// @@ -185,7 +233,7 @@ class RDTIME_2R<bits<22> op, string opstr> : Fmt2R<op, (outs GPR:$rd, GPR:$rj), (ins), opstr, "$rd, $rj">; class BrCC_2RI16<bits<6> op, string opstr> - : Fmt2RI16<op, (outs), (ins GPR:$rj, GPR:$rd, simm16_lsl2:$imm16), opstr, + : Fmt2RI16<op, (outs), (ins GPR:$rj, GPR:$rd, simm16_lsl2_br:$imm16), opstr, "$rj, $rd, $imm16"> { let isBranch = 1; let isTerminator = 1; @@ -274,10 +322,12 @@ def XORI : ALU_2RI12<0b0000001111, "xori", uimm12>; def MUL_W : ALU_3R<0b00000000000111000, "mul.w">; def MULH_W : ALU_3R<0b00000000000111001, "mulh.w">; def MULH_WU : ALU_3R<0b00000000000111010, "mulh.wu">; +let usesCustomInserter = true in { def DIV_W : ALU_3R<0b00000000001000000, "div.w">; def MOD_W : ALU_3R<0b00000000001000001, "mod.w">; def DIV_WU : ALU_3R<0b00000000001000010, "div.wu">; def MOD_WU : ALU_3R<0b00000000001000011, "mod.wu">; +} // usesCustomInserter = true // Bit-shift Instructions def SLL_W : ALU_3R<0b00000000000101110, "sll.w">; @@ -379,10 +429,12 @@ def MULH_D : ALU_3R<0b00000000000111100, "mulh.d">; def MULH_DU : ALU_3R<0b00000000000111101, "mulh.du">; def MULW_D_W : ALU_3R<0b00000000000111110, "mulw.d.w">; def MULW_D_WU : ALU_3R<0b00000000000111111, "mulw.d.wu">; +let usesCustomInserter = true in { def DIV_D : ALU_3R<0b00000000001000100, "div.d">; def MOD_D : ALU_3R<0b00000000001000101, "mod.d">; def DIV_DU : ALU_3R<0b00000000001000110, "div.du">; def MOD_DU : ALU_3R<0b00000000001000111, "mod.du">; +} // usesCustomInserter = true // Bit-shift Instructions for 64-bits def SLL_D : ALU_3R<0b00000000000110001, "sll.d">; @@ -545,6 +597,9 @@ def shiftMaskGRLen : ComplexPattern<GRLenVT, 1, "selectShiftMaskGRLen", [], [], 0>; def shiftMask32 : ComplexPattern<i64, 1, "selectShiftMask32", [], [], 0>; +def sexti32 : ComplexPattern<i64, 1, "selectSExti32">; +def zexti32 : ComplexPattern<i64, 1, "selectZExti32">; + class shiftop<SDPatternOperator operator> : PatFrag<(ops node:$val, node:$count), (operator node:$val, (GRLenVT (shiftMaskGRLen node:$count)))>; @@ -556,6 +611,13 @@ let Predicates = [IsLA32] in { def : PatGprGpr<add, ADD_W>; def : PatGprImm<add, ADDI_W, simm12>; def : PatGprGpr<sub, SUB_W>; +def : PatGprGpr<sdiv, DIV_W>; +def : PatGprGpr<udiv, DIV_WU>; +def : PatGprGpr<srem, MOD_W>; +def : PatGprGpr<urem, MOD_WU>; +def : PatGprGpr<mul, MUL_W>; +def : PatGprGpr<mulhs, MULH_W>; +def : PatGprGpr<mulhu, MULH_WU>; } // Predicates = [IsLA32] let Predicates = [IsLA64] in { @@ -565,6 +627,24 @@ def : PatGprImm<add, ADDI_D, simm12>; def : PatGprImm_32<add, ADDI_W, simm12>; def : PatGprGpr<sub, SUB_D>; def : PatGprGpr_32<sub, SUB_W>; +def : PatGprGpr<sdiv, DIV_D>; +def : PatGprGpr<udiv, DIV_DU>; +def : PatGprGpr<srem, MOD_D>; +def : PatGprGpr<urem, MOD_DU>; +// TODO: Select "_W[U]" instructions for i32xi32 if only lower 32 bits of the +// product are used. +def : PatGprGpr<mul, MUL_D>; +def : PatGprGpr<mulhs, MULH_D>; +def : PatGprGpr<mulhu, MULH_DU>; +// Select MULW_D_W for calculating the full 64 bits product of i32xi32 signed +// multiplication. +def : Pat<(i64 (mul (sext_inreg GPR:$rj, i32), (sext_inreg GPR:$rk, i32))), + (MULW_D_W GPR:$rj, GPR:$rk)>; +// Select MULW_D_WU for calculating the full 64 bits product of i32xi32 +// unsigned multiplication. 
+def : Pat<(i64 (mul (loongarch_bstrpick GPR:$rj, (i64 31), (i64 0)), + (loongarch_bstrpick GPR:$rk, (i64 31), (i64 0)))), + (MULW_D_WU GPR:$rj, GPR:$rk)>; } // Predicates = [IsLA64] def : PatGprGpr<and, AND>; @@ -649,19 +729,143 @@ def : Pat<(select GPR:$cond, GPR:$t, GPR:$f), /// Branches and jumps +class BccPat<PatFrag CondOp, LAInst Inst> + : Pat<(brcond (GRLenVT (CondOp GPR:$rj, GPR:$rd)), bb:$imm16), + (Inst GPR:$rj, GPR:$rd, bb:$imm16)>; + +def : BccPat<seteq, BEQ>; +def : BccPat<setne, BNE>; +def : BccPat<setlt, BLT>; +def : BccPat<setge, BGE>; +def : BccPat<setult, BLTU>; +def : BccPat<setuge, BGEU>; + +class BccSwapPat<PatFrag CondOp, LAInst InstBcc> + : Pat<(brcond (GRLenVT (CondOp GPR:$rd, GPR:$rj)), bb:$imm16), + (InstBcc GPR:$rj, GPR:$rd, bb:$imm16)>; + +// Condition codes that don't have matching LoongArch branch instructions, but +// are trivially supported by swapping the two input operands. +def : BccSwapPat<setgt, BLT>; +def : BccSwapPat<setle, BGE>; +def : BccSwapPat<setugt, BLTU>; +def : BccSwapPat<setule, BGEU>; + +// An extra pattern is needed for a brcond without a setcc (i.e. where the +// condition was calculated elsewhere). +def : Pat<(brcond GPR:$rj, bb:$imm21), (BNEZ GPR:$rj, bb:$imm21)>; + +let isBarrier = 1, isBranch = 1, isTerminator = 1 in +def PseudoBR : Pseudo<(outs), (ins simm26_lsl2:$imm26), [(br bb:$imm26)]>, + PseudoInstExpansion<(B simm26_lsl2:$imm26)>; + +let isBarrier = 1, isBranch = 1, isIndirectBranch = 1, isTerminator = 1 in +def PseudoBRIND : Pseudo<(outs), (ins GPR:$rj, simm16_lsl2:$imm16), []>, + PseudoInstExpansion<(JIRL R0, GPR:$rj, simm16_lsl2:$imm16)>; + +def : Pat<(brind GPR:$rj), (PseudoBRIND GPR:$rj, 0)>; +def : Pat<(brind (add GPR:$rj, simm16_lsl2:$imm16)), + (PseudoBRIND GPR:$rj, simm16_lsl2:$imm16)>; + +let isCall = 1, Defs = [R1] in +def PseudoCALL : Pseudo<(outs), (ins call_symbol:$func), []> { + let AsmString = "bl\t$func"; +} + +def : Pat<(loongarch_call tglobaladdr:$func), (PseudoCALL tglobaladdr:$func)>; +def : Pat<(loongarch_call texternalsym:$func), (PseudoCALL texternalsym:$func)>; + +let isCall = 1, Defs = [R1] in +def PseudoCALLIndirect : Pseudo<(outs), (ins GPR:$rj), + [(loongarch_call GPR:$rj)]>, + PseudoInstExpansion<(JIRL R1, GPR:$rj, 0)>; + let isBarrier = 1, isReturn = 1, isTerminator = 1 in def PseudoRET : Pseudo<(outs), (ins), [(loongarch_ret)]>, PseudoInstExpansion<(JIRL R0, R1, 0)>; -/// BSTRPICK +/// BSTRINS and BSTRPICK -let Predicates = [IsLA32] in +let Predicates = [IsLA32] in { +def : Pat<(loongarch_bstrins GPR:$rd, GPR:$rj, uimm5:$msbd, uimm5:$lsbd), + (BSTRINS_W GPR:$rd, GPR:$rj, uimm5:$msbd, uimm5:$lsbd)>; def : Pat<(loongarch_bstrpick GPR:$rj, uimm5:$msbd, uimm5:$lsbd), (BSTRPICK_W GPR:$rj, uimm5:$msbd, uimm5:$lsbd)>; +} // Predicates = [IsLA32] -let Predicates = [IsLA64] in +let Predicates = [IsLA64] in { +def : Pat<(loongarch_bstrins GPR:$rd, GPR:$rj, uimm6:$msbd, uimm6:$lsbd), + (BSTRINS_D GPR:$rd, GPR:$rj, uimm6:$msbd, uimm6:$lsbd)>; def : Pat<(loongarch_bstrpick GPR:$rj, uimm6:$msbd, uimm6:$lsbd), (BSTRPICK_D GPR:$rj, uimm6:$msbd, uimm6:$lsbd)>; +} // Predicates = [IsLA64] + +/// Loads + +multiclass LdPat<PatFrag LoadOp, LAInst Inst, ValueType vt = GRLenVT> { + def : Pat<(vt (LoadOp BaseAddr:$rj)), (Inst BaseAddr:$rj, 0)>; + def : Pat<(vt (LoadOp (add BaseAddr:$rj, simm12:$imm12))), + (Inst BaseAddr:$rj, simm12:$imm12)>; +} + +defm : LdPat<sextloadi8, LD_B>; +defm : LdPat<extloadi8, LD_B>; +defm : LdPat<sextloadi16, LD_H>; +defm : LdPat<extloadi16, LD_H>; +defm : LdPat<load, LD_W>, 
Requires<[IsLA32]>; +defm : LdPat<zextloadi8, LD_BU>; +defm : LdPat<zextloadi16, LD_HU>; +let Predicates = [IsLA64] in { +defm : LdPat<sextloadi32, LD_W, i64>; +defm : LdPat<extloadi32, LD_W, i64>; +defm : LdPat<zextloadi32, LD_WU, i64>; +defm : LdPat<load, LD_D, i64>; +} // Predicates = [IsLA64] + +/// Stores + +multiclass StPat<PatFrag StoreOp, LAInst Inst, RegisterClass StTy, + ValueType vt> { + def : Pat<(StoreOp (vt StTy:$rd), BaseAddr:$rj), + (Inst StTy:$rd, BaseAddr:$rj, 0)>; + def : Pat<(StoreOp (vt StTy:$rd), (add BaseAddr:$rj, simm12:$imm12)), + (Inst StTy:$rd, BaseAddr:$rj, simm12:$imm12)>; +} + +defm : StPat<truncstorei8, ST_B, GPR, GRLenVT>; +defm : StPat<truncstorei16, ST_H, GPR, GRLenVT>; +defm : StPat<store, ST_W, GPR, i32>, Requires<[IsLA32]>; +let Predicates = [IsLA64] in { +defm : StPat<truncstorei32, ST_W, GPR, i64>; +defm : StPat<store, ST_D, GPR, i64>; +} // Predicates = [IsLA64] + +/// Atomic loads and stores + +def : Pat<(atomic_fence timm, timm), (DBAR 0)>; + +defm : LdPat<atomic_load_8, LD_B>; +defm : LdPat<atomic_load_16, LD_H>; +defm : LdPat<atomic_load_32, LD_W>; + +defm : StPat<atomic_store_8, ST_B, GPR, GRLenVT>; +defm : StPat<atomic_store_16, ST_H, GPR, GRLenVT>; +defm : StPat<atomic_store_32, ST_W, GPR, i32>, Requires<[IsLA32]>; +let Predicates = [IsLA64] in { +defm : LdPat<atomic_load_64, LD_D>; +defm : StPat<atomic_store_32, ST_W, GPR, i64>; +defm : StPat<atomic_store_64, ST_D, GPR, i64>; +} // Predicates = [IsLA64] + +/// Other pseudo-instructions + +// Pessimistically assume the stack pointer will be clobbered +let Defs = [R3], Uses = [R3] in { +def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2), + [(callseq_start timm:$amt1, timm:$amt2)]>; +def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2), + [(callseq_end timm:$amt1, timm:$amt2)]>; +} // Defs = [R3], Uses = [R3] //===----------------------------------------------------------------------===// // Assembler Pseudo Instructions diff --git a/llvm/lib/Target/LoongArch/LoongArchMCInstLower.cpp b/llvm/lib/Target/LoongArch/LoongArchMCInstLower.cpp index 7416c93b4d05..488c66f47863 100644 --- a/llvm/lib/Target/LoongArch/LoongArchMCInstLower.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchMCInstLower.cpp @@ -22,6 +22,22 @@ using namespace llvm; +static MCOperand lowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym, + const AsmPrinter &AP) { + MCContext &Ctx = AP.OutContext; + + // TODO: Processing target flags. 
+ + const MCExpr *ME = + MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, Ctx); + + if (!MO.isJTI() && !MO.isMBB() && MO.getOffset()) + ME = MCBinaryExpr::createAdd( + ME, MCConstantExpr::create(MO.getOffset(), Ctx), Ctx); + + return MCOperand::createExpr(ME); +} + bool llvm::lowerLoongArchMachineOperandToMCOperand(const MachineOperand &MO, MCOperand &MCOp, const AsmPrinter &AP) { @@ -41,12 +57,21 @@ bool llvm::lowerLoongArchMachineOperandToMCOperand(const MachineOperand &MO, case MachineOperand::MO_Immediate: MCOp = MCOperand::createImm(MO.getImm()); break; - // TODO: lower special operands - case MachineOperand::MO_MachineBasicBlock: + case MachineOperand::MO_ConstantPoolIndex: + MCOp = lowerSymbolOperand(MO, AP.GetCPISymbol(MO.getIndex()), AP); + break; case MachineOperand::MO_GlobalAddress: - case MachineOperand::MO_BlockAddress: + MCOp = lowerSymbolOperand(MO, AP.getSymbolPreferLocal(*MO.getGlobal()), AP); + break; + case MachineOperand::MO_MachineBasicBlock: + MCOp = lowerSymbolOperand(MO, MO.getMBB()->getSymbol(), AP); + break; case MachineOperand::MO_ExternalSymbol: - case MachineOperand::MO_ConstantPoolIndex: + MCOp = lowerSymbolOperand( + MO, AP.GetExternalSymbolSymbol(MO.getSymbolName()), AP); + break; + // TODO: lower special operands + case MachineOperand::MO_BlockAddress: case MachineOperand::MO_JumpTableIndex: break; } diff --git a/llvm/lib/Target/LoongArch/LoongArchRegisterInfo.cpp b/llvm/lib/Target/LoongArch/LoongArchRegisterInfo.cpp index b9bae8e56304..05902ebb7ba6 100644 --- a/llvm/lib/Target/LoongArch/LoongArchRegisterInfo.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchRegisterInfo.cpp @@ -110,6 +110,28 @@ void LoongArchRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, unsigned FIOperandNum, RegScavenger *RS) const { + // TODO: this implementation is a temporary placeholder which does just + // enough to allow other aspects of code generation to be tested. + assert(SPAdj == 0 && "Unexpected non-zero SPAdj value"); - // TODO: Implement this when we have function calls + + MachineInstr &MI = *II; + MachineFunction &MF = *MI.getParent()->getParent(); + const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); + DebugLoc DL = MI.getDebugLoc(); + + int FrameIndex = MI.getOperand(FIOperandNum).getIndex(); + Register FrameReg; + StackOffset Offset = + TFI->getFrameIndexReference(MF, FrameIndex, FrameReg) + + StackOffset::getFixed(MI.getOperand(FIOperandNum + 1).getImm()); + + // Offsets must be encodable with a 12-bit immediate field. 
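+  // i.e. the fixed offset must lie in [-2048, 2047]; a frame object at, say,
+  // offset 4096 currently hits the fatal error below.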
+  if (!isInt<12>(Offset.getFixed())) {
+    report_fatal_error("Frame offsets outside of the signed 12-bit range are "
+                       "not supported currently");
+  }
+
+  MI.getOperand(FIOperandNum).ChangeToRegister(FrameReg, false);
+  MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset.getFixed());
 }
diff --git a/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp b/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp
index 3a1a46a9e624..468c4f43cb90 100644
--- a/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp
@@ -102,6 +102,7 @@ public:
     return getTM<LoongArchTargetMachine>();
   }
 
+  void addIRPasses() override;
   bool addInstSelector() override;
 };
 } // namespace
@@ -111,6 +112,12 @@
 LoongArchTargetMachine::createPassConfig(PassManagerBase &PM) {
   return new LoongArchPassConfig(*this, PM);
 }
 
+void LoongArchPassConfig::addIRPasses() {
+  addPass(createAtomicExpandPass());
+
+  TargetPassConfig::addIRPasses();
+}
+
 bool LoongArchPassConfig::addInstSelector() {
   addPass(createLoongArchISelDag(getLoongArchTargetMachine()));
 
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.cpp
index c733c194e6a2..e50761ab1e27 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.cpp
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.cpp
@@ -25,6 +25,7 @@
 #include "llvm/Support/Compiler.h"
 
 #define GET_INSTRINFO_MC_DESC
+#define ENABLE_INSTR_PREDICATE_VERIFIER
 #include "LoongArchGenInstrInfo.inc"
 
 #define GET_REGINFO_MC_DESC
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.h b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.h
index e576b9a49cd6..a606ccdbc47c 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.h
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.h
@@ -46,6 +46,7 @@ createLoongArchELFObjectWriter(uint8_t OSABI, bool Is64Bit);
 
 // Defines symbolic names for LoongArch instructions.
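+// Besides the opcode enum, GET_INSTRINFO_MC_HELPER_DECLS also declares the
+// generated predicate-verification helper, roughly (signature inferred from
+// the call sites elsewhere in this patch):
+//   namespace LoongArch_MC {
+//   void verifyInstructionPredicates(unsigned Opcode,
+//                                    const FeatureBitset &Features);
+//   } // namespace LoongArch_MC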
#define GET_INSTRINFO_ENUM +#define GET_INSTRINFO_MC_HELPER_DECLS #include "LoongArchGenInstrInfo.inc" #define GET_SUBTARGETINFO_ENUM diff --git a/llvm/lib/Target/M68k/M68kAsmPrinter.cpp b/llvm/lib/Target/M68k/M68kAsmPrinter.cpp index 3bcce9e3ba3b..4933d40f3388 100644 --- a/llvm/lib/Target/M68k/M68kAsmPrinter.cpp +++ b/llvm/lib/Target/M68k/M68kAsmPrinter.cpp @@ -77,6 +77,9 @@ bool M68kAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, } void M68kAsmPrinter::emitInstruction(const MachineInstr *MI) { + M68k_MC::verifyInstructionPredicates(MI->getOpcode(), + getSubtargetInfo().getFeatureBits()); + switch (MI->getOpcode()) { default: { if (MI->isPseudo()) { diff --git a/llvm/lib/Target/M68k/MCTargetDesc/M68kMCTargetDesc.cpp b/llvm/lib/Target/M68k/MCTargetDesc/M68kMCTargetDesc.cpp index 2606e22410fc..e6290d4cbec5 100644 --- a/llvm/lib/Target/M68k/MCTargetDesc/M68kMCTargetDesc.cpp +++ b/llvm/lib/Target/M68k/MCTargetDesc/M68kMCTargetDesc.cpp @@ -31,6 +31,7 @@ using namespace llvm; #define GET_INSTRINFO_MC_DESC +#define ENABLE_INSTR_PREDICATE_VERIFIER #include "M68kGenInstrInfo.inc" #define GET_SUBTARGETINFO_MC_DESC diff --git a/llvm/lib/Target/M68k/MCTargetDesc/M68kMCTargetDesc.h b/llvm/lib/Target/M68k/MCTargetDesc/M68kMCTargetDesc.h index 0dc601ad876b..2a1cc678016a 100644 --- a/llvm/lib/Target/M68k/MCTargetDesc/M68kMCTargetDesc.h +++ b/llvm/lib/Target/M68k/MCTargetDesc/M68kMCTargetDesc.h @@ -52,6 +52,7 @@ std::unique_ptr<MCObjectTargetWriter> createM68kELFObjectWriter(uint8_t OSABI); // Defines symbolic names for the M68k instructions. #define GET_INSTRINFO_ENUM +#define GET_INSTRINFO_MC_HELPER_DECLS #include "M68kGenInstrInfo.inc" #define GET_SUBTARGETINFO_ENUM diff --git a/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp b/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp index 3f006056955d..13a880de68b5 100644 --- a/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp +++ b/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp @@ -22,6 +22,7 @@ using namespace llvm; #define GET_INSTRINFO_MC_DESC +#define ENABLE_INSTR_PREDICATE_VERIFIER #include "MSP430GenInstrInfo.inc" #define GET_SUBTARGETINFO_MC_DESC diff --git a/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.h b/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.h index 24b0b3298592..e596c3f1ce46 100644 --- a/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.h +++ b/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.h @@ -53,6 +53,7 @@ createMSP430ELFObjectWriter(uint8_t OSABI); // Defines symbolic names for the MSP430 instructions. 
 #define GET_INSTRINFO_ENUM
+#define GET_INSTRINFO_MC_HELPER_DECLS
 #include "MSP430GenInstrInfo.inc"
 
 #define GET_SUBTARGETINFO_ENUM
diff --git a/llvm/lib/Target/MSP430/MSP430AsmPrinter.cpp b/llvm/lib/Target/MSP430/MSP430AsmPrinter.cpp
index 85c59d5b14b5..9cd2cbe89e46 100644
--- a/llvm/lib/Target/MSP430/MSP430AsmPrinter.cpp
+++ b/llvm/lib/Target/MSP430/MSP430AsmPrinter.cpp
@@ -149,6 +149,9 @@ bool MSP430AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
 //===----------------------------------------------------------------------===//
 void MSP430AsmPrinter::emitInstruction(const MachineInstr *MI) {
+  MSP430_MC::verifyInstructionPredicates(MI->getOpcode(),
+                                         getSubtargetInfo().getFeatureBits());
+
   MSP430MCInstLower MCInstLowering(OutContext, *this);
 
   MCInst TmpInst;
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp
index 6fc8fcb482cd..40c807082fdc 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp
@@ -36,6 +36,7 @@ using namespace llvm;
 #define GET_INSTRINFO_MC_DESC
+#define ENABLE_INSTR_PREDICATE_VERIFIER
 #include "MipsGenInstrInfo.inc"
 
 #define GET_SUBTARGETINFO_MC_DESC
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h
index 8531177ee924..d51f3b9abcfd 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h
@@ -55,6 +55,7 @@ StringRef selectMipsCPU(const Triple &TT, StringRef CPU);
 // Defines symbolic names for the Mips instructions.
 #define GET_INSTRINFO_ENUM
+#define GET_INSTRINFO_MC_HELPER_DECLS
 #include "MipsGenInstrInfo.inc"
 
 #define GET_SUBTARGETINFO_ENUM
diff --git a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
index 9330a791a7cc..fcaf450cc511 100644
--- a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
+++ b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
@@ -181,6 +181,10 @@ static void emitDirectiveRelocJalr(const MachineInstr &MI,
 }
 
 void MipsAsmPrinter::emitInstruction(const MachineInstr *MI) {
+  // FIXME: Enable feature predicate checks once all the tests pass.
+  // Mips_MC::verifyInstructionPredicates(MI->getOpcode(),
+  //                                      getSubtargetInfo().getFeatureBits());
+
   MipsTargetStreamer &TS = getTargetStreamer();
   unsigned Opc = MI->getOpcode();
   TS.forbidModuleDirective();
diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp
index 856d03f0b210..0ba29fb48b05 100644
--- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp
+++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp
@@ -23,6 +23,7 @@ using namespace llvm;
 #define GET_INSTRINFO_MC_DESC
+#define ENABLE_INSTR_PREDICATE_VERIFIER
 #include "NVPTXGenInstrInfo.inc"
 
 #define GET_SUBTARGETINFO_MC_DESC
diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.h b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.h
index b394566edd0d..78f4e6745502 100644
--- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.h
+++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.h
@@ -21,6 +21,7 @@
 // Defines symbolic names for the PTX instructions.
#define GET_INSTRINFO_ENUM +#define GET_INSTRINFO_MC_HELPER_DECLS #include "NVPTXGenInstrInfo.inc" #define GET_SUBTARGETINFO_ENUM diff --git a/llvm/lib/Target/NVPTX/NVPTX.h b/llvm/lib/Target/NVPTX/NVPTX.h index 41e9f375e536..8c92766faecb 100644 --- a/llvm/lib/Target/NVPTX/NVPTX.h +++ b/llvm/lib/Target/NVPTX/NVPTX.h @@ -183,6 +183,7 @@ enum CmpMode { // Defines symbolic names for the NVPTX instructions. #define GET_INSTRINFO_ENUM +#define GET_INSTRINFO_MC_HELPER_DECLS #include "NVPTXGenInstrInfo.inc" #endif diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp index b1d842122060..9977d8ba0300 100644 --- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp @@ -139,6 +139,9 @@ VisitGlobalVariableForEmission(const GlobalVariable *GV, } void NVPTXAsmPrinter::emitInstruction(const MachineInstr *MI) { + NVPTX_MC::verifyInstructionPredicates(MI->getOpcode(), + getSubtargetInfo().getFeatureBits()); + MCInst Inst; lowerToMCInst(MI, Inst); EmitToStreamer(*OutStreamer, Inst); diff --git a/llvm/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp b/llvm/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp index 2201eb19c80f..b4f7a64f144a 100644 --- a/llvm/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp @@ -270,10 +270,6 @@ Value *GenericToNVVM::remapConstantExpr(Module *M, Function *F, ConstantExpr *C, // ShuffleVector return Builder.CreateShuffleVector(NewOperands[0], NewOperands[1], NewOperands[2]); - case Instruction::InsertValue: - // InsertValueConstantExpr - return Builder.CreateInsertValue(NewOperands[0], NewOperands[1], - C->getIndices()); case Instruction::GetElementPtr: // GetElementPtrConstantExpr return Builder.CreateGEP(cast<GEPOperator>(C)->getSourceElementType(), diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index 746f652bfa36..6ad016dfa0a7 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -1861,7 +1861,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, InFlag = Ret.getValue(2); if (ProxyRegTruncates[i]) { - Ret = DAG.getNode(ISD::TRUNCATE, dl, ProxyRegTruncates[i].getValue(), Ret); + Ret = DAG.getNode(ISD::TRUNCATE, dl, ProxyRegTruncates[i].value(), Ret); } InVals.push_back(Ret); diff --git a/llvm/lib/Target/NVPTX/NVPTXImageOptimizer.cpp b/llvm/lib/Target/NVPTX/NVPTXImageOptimizer.cpp index eeedce2d99cb..202134ed7035 100644 --- a/llvm/lib/Target/NVPTX/NVPTXImageOptimizer.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXImageOptimizer.cpp @@ -35,6 +35,8 @@ public: bool runOnFunction(Function &F) override; + StringRef getPassName() const override { return "NVPTX Image Optimizer"; } + private: bool replaceIsTypePSampler(Instruction &I); bool replaceIsTypePSurface(Instruction &I); diff --git a/llvm/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp b/llvm/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp index 16fbe1a65562..7929bd2e0df0 100644 --- a/llvm/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp @@ -36,6 +36,8 @@ public: bool runOnMachineFunction(MachineFunction &MF) override; + StringRef getPassName() const override { return "NVPTX Prolog Epilog Pass"; } + private: void calculateFrameObjectOffsets(MachineFunction &Fn); }; diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp b/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp index 2d6d72777db2..4e41515b997d 100644 --- a/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp +++ 
b/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp @@ -18,7 +18,6 @@ #include "llvm/IR/InstIterator.h" #include "llvm/IR/Module.h" #include "llvm/IR/Operator.h" -#include "llvm/Support/ManagedStatic.h" #include "llvm/Support/Mutex.h" #include <algorithm> #include <cstring> @@ -32,19 +31,27 @@ namespace llvm { namespace { typedef std::map<std::string, std::vector<unsigned> > key_val_pair_t; typedef std::map<const GlobalValue *, key_val_pair_t> global_val_annot_t; -typedef std::map<const Module *, global_val_annot_t> per_module_annot_t; -} // anonymous namespace -static ManagedStatic<per_module_annot_t> annotationCache; -static sys::Mutex Lock; +struct AnnotationCache { + sys::Mutex Lock; + std::map<const Module *, global_val_annot_t> Cache; +}; + +AnnotationCache &getAnnotationCache() { + static AnnotationCache AC; + return AC; +} +} // anonymous namespace void clearAnnotationCache(const Module *Mod) { - std::lock_guard<sys::Mutex> Guard(Lock); - annotationCache->erase(Mod); + auto &AC = getAnnotationCache(); + std::lock_guard<sys::Mutex> Guard(AC.Lock); + AC.Cache.erase(Mod); } static void cacheAnnotationFromMD(const MDNode *md, key_val_pair_t &retval) { - std::lock_guard<sys::Mutex> Guard(Lock); + auto &AC = getAnnotationCache(); + std::lock_guard<sys::Mutex> Guard(AC.Lock); assert(md && "Invalid mdnode for annotation"); assert((md->getNumOperands() % 2) == 1 && "Invalid number of operands"); // start index = 1, to skip the global variable key @@ -70,7 +77,8 @@ static void cacheAnnotationFromMD(const MDNode *md, key_val_pair_t &retval) { } static void cacheAnnotationFromMD(const Module *m, const GlobalValue *gv) { - std::lock_guard<sys::Mutex> Guard(Lock); + auto &AC = getAnnotationCache(); + std::lock_guard<sys::Mutex> Guard(AC.Lock); NamedMDNode *NMD = m->getNamedMetadata("nvvm.annotations"); if (!NMD) return; @@ -93,40 +101,42 @@ static void cacheAnnotationFromMD(const Module *m, const GlobalValue *gv) { if (tmp.empty()) // no annotations for this gv return; - if ((*annotationCache).find(m) != (*annotationCache).end()) - (*annotationCache)[m][gv] = std::move(tmp); + if (AC.Cache.find(m) != AC.Cache.end()) + AC.Cache[m][gv] = std::move(tmp); else { global_val_annot_t tmp1; tmp1[gv] = std::move(tmp); - (*annotationCache)[m] = std::move(tmp1); + AC.Cache[m] = std::move(tmp1); } } bool findOneNVVMAnnotation(const GlobalValue *gv, const std::string &prop, unsigned &retval) { - std::lock_guard<sys::Mutex> Guard(Lock); + auto &AC = getAnnotationCache(); + std::lock_guard<sys::Mutex> Guard(AC.Lock); const Module *m = gv->getParent(); - if ((*annotationCache).find(m) == (*annotationCache).end()) + if (AC.Cache.find(m) == AC.Cache.end()) cacheAnnotationFromMD(m, gv); - else if ((*annotationCache)[m].find(gv) == (*annotationCache)[m].end()) + else if (AC.Cache[m].find(gv) == AC.Cache[m].end()) cacheAnnotationFromMD(m, gv); - if ((*annotationCache)[m][gv].find(prop) == (*annotationCache)[m][gv].end()) + if (AC.Cache[m][gv].find(prop) == AC.Cache[m][gv].end()) return false; - retval = (*annotationCache)[m][gv][prop][0]; + retval = AC.Cache[m][gv][prop][0]; return true; } bool findAllNVVMAnnotation(const GlobalValue *gv, const std::string &prop, std::vector<unsigned> &retval) { - std::lock_guard<sys::Mutex> Guard(Lock); + auto &AC = getAnnotationCache(); + std::lock_guard<sys::Mutex> Guard(AC.Lock); const Module *m = gv->getParent(); - if ((*annotationCache).find(m) == (*annotationCache).end()) + if (AC.Cache.find(m) == AC.Cache.end()) cacheAnnotationFromMD(m, gv); - else if ((*annotationCache)[m].find(gv) 
== (*annotationCache)[m].end()) + else if (AC.Cache[m].find(gv) == AC.Cache[m].end()) cacheAnnotationFromMD(m, gv); - if ((*annotationCache)[m][gv].find(prop) == (*annotationCache)[m][gv].end()) + if (AC.Cache[m][gv].find(prop) == AC.Cache[m][gv].end()) return false; - retval = (*annotationCache)[m][gv][prop]; + retval = AC.Cache[m][gv][prop]; return true; } diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp index 46bbc44e1681..fa9e69f2e607 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp @@ -449,12 +449,9 @@ getMachineOpValue(const MCInst &MI, const MCOperand &MO, return MO.getImm(); } -void PPCMCCodeEmitter::encodeInstruction( - const MCInst &MI, raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const { - verifyInstructionPredicates(MI, - computeAvailableFeatures(STI.getFeatureBits())); - +void PPCMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { uint64_t Bits = getBinaryCodeForInstr(MI, Fixups, STI); // Output the constant in big/little endian byte order. @@ -492,5 +489,4 @@ bool PPCMCCodeEmitter::isPrefixedInstruction(const MCInst &MI) const { return InstrInfo->isPrefixed(Opcode); } -#define ENABLE_INSTR_PREDICATE_VERIFIER #include "PPCGenMCCodeEmitter.inc" diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.h b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.h index 39b2f1211f29..c4d4d35a6665 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.h +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.h @@ -121,12 +121,6 @@ public: // Is this instruction a prefixed instruction. bool isPrefixedInstruction(const MCInst &MI) const; - -private: - FeatureBitset computeAvailableFeatures(const FeatureBitset &FB) const; - void - verifyInstructionPredicates(const MCInst &MI, - const FeatureBitset &AvailableFeatures) const; }; } // namespace llvm diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp index a651362f703b..1008dc63d064 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp @@ -48,6 +48,7 @@ using namespace llvm; #define GET_INSTRINFO_MC_DESC +#define ENABLE_INSTR_PREDICATE_VERIFIER #include "PPCGenInstrInfo.inc" #define GET_SUBTARGETINFO_MC_DESC diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h index acb860e16518..3ca6f394f60b 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h @@ -118,6 +118,7 @@ static inline bool isRunOfOnes64(uint64_t Val, unsigned &MB, unsigned &ME) { // #define GET_INSTRINFO_ENUM #define GET_INSTRINFO_SCHED_ENUM +#define GET_INSTRINFO_MC_HELPER_DECLS #include "PPCGenInstrInfo.inc" #define GET_SUBTARGETINFO_ENUM diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp index 22f35c8fa8d3..58a75baf8081 100644 --- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp +++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp @@ -230,6 +230,9 @@ private: void emitGlobalVariableHelper(const GlobalVariable *); + // Get the offset of an alias based on its AliaseeObject. 
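+  // Illustrative: for "@b3 = alias i8, getelementptr (i8, ptr @buf, i64 3)"
+  // this returns 3, while a direct alias of @buf returns 0.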
+ uint64_t getAliasOffset(const Constant *C); + public: PPCAIXAsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer) : PPCAsmPrinter(TM, std::move(Streamer)) { @@ -656,6 +659,9 @@ static MCSymbol *getMCSymbolForTOCPseudoMO(const MachineOperand &MO, /// the current output stream. /// void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) { + PPC_MC::verifyInstructionPredicates(MI->getOpcode(), + getSubtargetInfo().getFeatureBits()); + MCInst TmpInst; const bool IsPPC64 = Subtarget->isPPC64(); const bool IsAIX = Subtarget->isAIXABI(); @@ -2352,6 +2358,24 @@ static bool isSpecialLLVMGlobalArrayForStaticInit(const GlobalVariable *GV) { .Default(false); } +uint64_t PPCAIXAsmPrinter::getAliasOffset(const Constant *C) { + if (auto *GA = dyn_cast<GlobalAlias>(C)) + return getAliasOffset(GA->getAliasee()); + if (auto *CE = dyn_cast<ConstantExpr>(C)) { + const MCExpr *LowC = lowerConstant(CE); + const MCBinaryExpr *CBE = dyn_cast<MCBinaryExpr>(LowC); + if (!CBE) + return 0; + if (CBE->getOpcode() != MCBinaryExpr::Add) + report_fatal_error("Only adding an offset is supported now."); + auto *RHS = dyn_cast<MCConstantExpr>(CBE->getRHS()); + if (!RHS) + report_fatal_error("Unable to get the offset of alias."); + return RHS->getValue(); + } + return 0; +} + void PPCAIXAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) { // Special LLVM global arrays have been handled at the initialization. if (isSpecialLLVMGlobalArrayToSkip(GV) || isSpecialLLVMGlobalArrayForStaticInit(GV)) @@ -2422,20 +2446,34 @@ void PPCAIXAsmPrinter::emitGlobalVariableHelper(const GlobalVariable *GV) { } MCSymbol *EmittedInitSym = GVSym; + + // Emit linkage for the global variable and its aliases. emitLinkage(GV, EmittedInitSym); + for (const GlobalAlias *GA : GOAliasMap[GV]) + emitLinkage(GA, getSymbol(GA)); + emitAlignment(getGVAlignment(GV, DL), GV); // When -fdata-sections is enabled, every GlobalVariable will // be put into its own csect; therefore, label is not necessary here. - if (!TM.getDataSections() || GV->hasSection()) { + if (!TM.getDataSections() || GV->hasSection()) OutStreamer->emitLabel(EmittedInitSym); + + // No alias to emit. + if (!GOAliasMap[GV].size()) { + emitGlobalConstant(GV->getParent()->getDataLayout(), GV->getInitializer()); + return; } - // Emit aliasing label for global variable. - for (const GlobalAlias *Alias : GOAliasMap[GV]) - OutStreamer->emitLabel(getSymbol(Alias)); + // Aliases with the same offset should be aligned. Record the list of aliases + // associated with the offset. + AliasMapTy AliasList; + for (const GlobalAlias *GA : GOAliasMap[GV]) + AliasList[getAliasOffset(GA->getAliasee())].push_back(GA); - emitGlobalConstant(GV->getParent()->getDataLayout(), GV->getInitializer()); + // Emit alias label and element value for global variable. 
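+  // Illustrative: for @gv = { i32, i32 } with @x aliasing offset 0 and @y
+  // aliasing offset 4, AliasList is {0 -> [@x], 4 -> [@y]}, so each alias
+  // label can be emitted right before the element at its offset.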
+ emitGlobalConstant(GV->getParent()->getDataLayout(), GV->getInitializer(), + &AliasList); } void PPCAIXAsmPrinter::emitFunctionDescriptor() { diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index 5b9d1e66b04e..3c461a627d61 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -392,8 +392,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, // MASS transformation for LLVM intrinsics with replicating fast-math flag // to be consistent to PPCGenScalarMASSEntries pass - if (TM.getOptLevel() == CodeGenOpt::Aggressive && - TM.Options.PPCGenScalarMASSEntries) { + if (TM.getOptLevel() == CodeGenOpt::Aggressive) { setOperationAction(ISD::FSIN , MVT::f64, Custom); setOperationAction(ISD::FCOS , MVT::f64, Custom); setOperationAction(ISD::FPOW , MVT::f64, Custom); @@ -17886,13 +17885,17 @@ bool PPCTargetLowering::isLowringToMASSSafe(SDValue Op) const { return Op.getNode()->getFlags().hasApproximateFuncs(); } +bool PPCTargetLowering::isScalarMASSConversionEnabled() const { + return getTargetMachine().Options.PPCGenScalarMASSEntries; +} + SDValue PPCTargetLowering::lowerLibCallBase(const char *LibCallDoubleName, const char *LibCallFloatName, const char *LibCallDoubleNameFinite, const char *LibCallFloatNameFinite, SDValue Op, SelectionDAG &DAG) const { - if (!isLowringToMASSSafe(Op)) + if (!isScalarMASSConversionEnabled() || !isLowringToMASSSafe(Op)) return SDValue(); if (!isLowringToMASSFiniteSafe(Op)) diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h index f92a117fe27f..4a08cc42fa9d 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.h +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h @@ -1293,6 +1293,7 @@ namespace llvm { SelectionDAG &DAG) const; bool isLowringToMASSFiniteSafe(SDValue Op) const; bool isLowringToMASSSafe(SDValue Op) const; + bool isScalarMASSConversionEnabled() const; SDValue lowerLibCallBase(const char *LibCallDoubleName, const char *LibCallFloatName, const char *LibCallDoubleNameFinite, diff --git a/llvm/lib/Target/PowerPC/PPCTLSDynamicCall.cpp b/llvm/lib/Target/PowerPC/PPCTLSDynamicCall.cpp index fbd487fbcfd5..59e8f3ff84a4 100644 --- a/llvm/lib/Target/PowerPC/PPCTLSDynamicCall.cpp +++ b/llvm/lib/Target/PowerPC/PPCTLSDynamicCall.cpp @@ -43,7 +43,6 @@ namespace { } const PPCInstrInfo *TII; - LiveIntervals *LIS; protected: bool processBlock(MachineBasicBlock &MBB) { @@ -83,11 +82,8 @@ protected: Register InReg = PPC::NoRegister; Register GPR3 = Is64Bit ? PPC::X3 : PPC::R3; Register GPR4 = Is64Bit ? PPC::X4 : PPC::R4; - SmallVector<Register, 3> OrigRegs = {OutReg, GPR3}; - if (!IsPCREL) { + if (!IsPCREL) InReg = MI.getOperand(1).getReg(); - OrigRegs.push_back(InReg); - } DebugLoc DL = MI.getDebugLoc(); unsigned Opc1, Opc2; @@ -139,11 +135,6 @@ protected: BuildMI(MBB, I, DL, TII->get(PPC::ADJCALLSTACKDOWN)).addImm(0) .addImm(0); - // The ADDItls* instruction is the first instruction in the - // repair range. - MachineBasicBlock::iterator First = I; - --First; - if (IsAIX) { // The variable offset and region handle are copied in r4 and r3. The // copies are followed by GETtlsADDR32AIX/GETtlsADDR64AIX. @@ -177,16 +168,10 @@ protected: BuildMI(MBB, I, DL, TII->get(TargetOpcode::COPY), OutReg) .addReg(GPR3); - // The COPY is the last instruction in the repair range. - MachineBasicBlock::iterator Last = I; - --Last; - // Move past the original instruction and remove it. 
++I; MI.removeFromParent(); - // Repair the live intervals. - LIS->repairIntervalsInRange(&MBB, First, Last, OrigRegs); Changed = true; } @@ -204,7 +189,6 @@ public: bool runOnMachineFunction(MachineFunction &MF) override { TII = MF.getSubtarget<PPCSubtarget>().getInstrInfo(); - LIS = &getAnalysis<LiveIntervals>(); bool Changed = false; @@ -217,9 +201,7 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<LiveIntervals>(); - AU.addPreserved<LiveIntervals>(); AU.addRequired<SlotIndexes>(); - AU.addPreserved<SlotIndexes>(); MachineFunctionPass::getAnalysisUsage(AU); } }; diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp index 7c062387fecd..a335b2d23394 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp @@ -84,12 +84,6 @@ public: unsigned getVMaskReg(const MCInst &MI, unsigned OpNo, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const; - -private: - FeatureBitset computeAvailableFeatures(const FeatureBitset &FB) const; - void - verifyInstructionPredicates(const MCInst &MI, - const FeatureBitset &AvailableFeatures) const; }; } // end anonymous namespace @@ -188,9 +182,6 @@ void RISCVMCCodeEmitter::expandAddTPRel(const MCInst &MI, raw_ostream &OS, void RISCVMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const { - verifyInstructionPredicates(MI, - computeAvailableFeatures(STI.getFeatureBits())); - const MCInstrDesc &Desc = MCII.get(MI.getOpcode()); // Get byte count of instruction. unsigned Size = Desc.getSize(); @@ -403,5 +394,4 @@ unsigned RISCVMCCodeEmitter::getVMaskReg(const MCInst &MI, unsigned OpNo, } } -#define ENABLE_INSTR_PREDICATE_VERIFIER #include "RISCVGenMCCodeEmitter.inc" diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp index 917d93479f18..c63e0c8e737d 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp @@ -33,6 +33,7 @@ #include "llvm/Support/ErrorHandling.h" #define GET_INSTRINFO_MC_DESC +#define ENABLE_INSTR_PREDICATE_VERIFIER #include "RISCVGenInstrInfo.inc" #define GET_REGINFO_MC_DESC diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.h index 276fc9efb6c0..d157257d976c 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.h +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.h @@ -45,6 +45,7 @@ std::unique_ptr<MCObjectTargetWriter> createRISCVELFObjectWriter(uint8_t OSABI, // Defines symbolic names for RISC-V instructions. #define GET_INSTRINFO_ENUM +#define GET_INSTRINFO_MC_HELPER_DECLS #include "RISCVGenInstrInfo.inc" #define GET_SUBTARGETINFO_ENUM diff --git a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp index 5b2a247ebda0..edd39f6547ed 100644 --- a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp +++ b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp @@ -91,6 +91,9 @@ void RISCVAsmPrinter::EmitToStreamer(MCStreamer &S, const MCInst &Inst) { #include "RISCVGenMCPseudoLowering.inc" void RISCVAsmPrinter::emitInstruction(const MachineInstr *MI) { + RISCV_MC::verifyInstructionPredicates(MI->getOpcode(), + getSubtargetInfo().getFeatureBits()); + // Do any auto-generated pseudo lowerings. 
if (emitPseudoExpansionLowering(*OutStreamer, MI)) return; diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp index 57d8ba6f0161..a7286b2963c2 100644 --- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp @@ -899,7 +899,8 @@ void RISCVFrameLowering::determineCalleeSaves(MachineFunction &MF, } std::pair<int64_t, Align> -RISCVFrameLowering::assignRVVStackObjectOffsets(MachineFrameInfo &MFI) const { +RISCVFrameLowering::assignRVVStackObjectOffsets(MachineFunction &MF) const { + MachineFrameInfo &MFI = MF.getFrameInfo(); // Create a buffer of RVV objects to allocate. SmallVector<int, 8> ObjectsToAllocate; for (int I = 0, E = MFI.getObjectIndexEnd(); I != E; ++I) { @@ -912,10 +913,18 @@ RISCVFrameLowering::assignRVVStackObjectOffsets(MachineFrameInfo &MFI) const { ObjectsToAllocate.push_back(I); } - // Allocate all RVV locals and spills - int64_t Offset = 0; // The minimum alignment is 16 bytes. Align RVVStackAlign(16); + const auto &ST = MF.getSubtarget<RISCVSubtarget>(); + + if (!ST.hasVInstructions()) { + assert(ObjectsToAllocate.empty() && + "Can't allocate scalable-vector objects without V instructions"); + return std::make_pair(0, RVVStackAlign); + } + + // Allocate all RVV locals and spills + int64_t Offset = 0; for (int FI : ObjectsToAllocate) { // ObjectSize in bytes. int64_t ObjectSize = MFI.getObjectSize(FI); @@ -997,7 +1006,7 @@ void RISCVFrameLowering::processFunctionBeforeFrameFinalized( int64_t RVVStackSize; Align RVVStackAlign; - std::tie(RVVStackSize, RVVStackAlign) = assignRVVStackObjectOffsets(MFI); + std::tie(RVVStackSize, RVVStackAlign) = assignRVVStackObjectOffsets(MF); RVFI->setRVVStackSize(RVVStackSize); RVFI->setRVVStackAlign(RVVStackAlign); diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.h b/llvm/lib/Target/RISCV/RISCVFrameLowering.h index 466cd059b749..a5cf68a6ea94 100644 --- a/llvm/lib/Target/RISCV/RISCVFrameLowering.h +++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.h @@ -84,7 +84,7 @@ private: MachineBasicBlock::iterator MBBI, const DebugLoc &DL, int64_t Amount, MachineInstr::MIFlag Flag) const; std::pair<int64_t, Align> - assignRVVStackObjectOffsets(MachineFrameInfo &MFI) const; + assignRVVStackObjectOffsets(MachineFunction &MF) const; }; -} +} // namespace llvm #endif diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index cfaafc7b53d2..5b823af1e9b8 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -43,92 +43,95 @@ namespace RISCV { } // namespace llvm void RISCVDAGToDAGISel::PreprocessISelDAG() { - for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(), - E = CurDAG->allnodes_end(); - I != E;) { - SDNode *N = &*I++; // Preincrement iterator to avoid invalidation issues. - - // Convert integer SPLAT_VECTOR to VMV_V_X_VL and floating-point - // SPLAT_VECTOR to VFMV_V_F_VL to reduce isel burden. - if (N->getOpcode() == ISD::SPLAT_VECTOR) { + SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end(); + + bool MadeChange = false; + while (Position != CurDAG->allnodes_begin()) { + SDNode *N = &*--Position; + if (N->use_empty()) + continue; + + SDValue Result; + switch (N->getOpcode()) { + case ISD::SPLAT_VECTOR: { + // Convert integer SPLAT_VECTOR to VMV_V_X_VL and floating-point + // SPLAT_VECTOR to VFMV_V_F_VL to reduce isel burden. MVT VT = N->getSimpleValueType(0); unsigned Opc = VT.isInteger() ? 
RISCVISD::VMV_V_X_VL : RISCVISD::VFMV_V_F_VL; SDLoc DL(N); SDValue VL = CurDAG->getRegister(RISCV::X0, Subtarget->getXLenVT()); - SDValue Result = CurDAG->getNode(Opc, DL, VT, CurDAG->getUNDEF(VT), - N->getOperand(0), VL); + Result = CurDAG->getNode(Opc, DL, VT, CurDAG->getUNDEF(VT), + N->getOperand(0), VL); + break; + } + case RISCVISD::SPLAT_VECTOR_SPLIT_I64_VL: { + // Lower SPLAT_VECTOR_SPLIT_I64 to two scalar stores and a stride 0 vector + // load. Done after lowering and combining so that we have a chance to + // optimize this to VMV_V_X_VL when the upper bits aren't needed. + assert(N->getNumOperands() == 4 && "Unexpected number of operands"); + MVT VT = N->getSimpleValueType(0); + SDValue Passthru = N->getOperand(0); + SDValue Lo = N->getOperand(1); + SDValue Hi = N->getOperand(2); + SDValue VL = N->getOperand(3); + assert(VT.getVectorElementType() == MVT::i64 && VT.isScalableVector() && + Lo.getValueType() == MVT::i32 && Hi.getValueType() == MVT::i32 && + "Unexpected VTs!"); + MachineFunction &MF = CurDAG->getMachineFunction(); + RISCVMachineFunctionInfo *FuncInfo = + MF.getInfo<RISCVMachineFunctionInfo>(); + SDLoc DL(N); - --I; - CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Result); - ++I; - CurDAG->DeleteNode(N); - continue; + // We use the same frame index we use for moving two i32s into 64-bit FPR. + // This is an analogous operation. + int FI = FuncInfo->getMoveF64FrameIndex(MF); + MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, FI); + const TargetLowering &TLI = CurDAG->getTargetLoweringInfo(); + SDValue StackSlot = + CurDAG->getFrameIndex(FI, TLI.getPointerTy(CurDAG->getDataLayout())); + + SDValue Chain = CurDAG->getEntryNode(); + Lo = CurDAG->getStore(Chain, DL, Lo, StackSlot, MPI, Align(8)); + + SDValue OffsetSlot = + CurDAG->getMemBasePlusOffset(StackSlot, TypeSize::Fixed(4), DL); + Hi = CurDAG->getStore(Chain, DL, Hi, OffsetSlot, MPI.getWithOffset(4), + Align(8)); + + Chain = CurDAG->getNode(ISD::TokenFactor, DL, MVT::Other, Lo, Hi); + + SDVTList VTs = CurDAG->getVTList({VT, MVT::Other}); + SDValue IntID = + CurDAG->getTargetConstant(Intrinsic::riscv_vlse, DL, MVT::i64); + SDValue Ops[] = {Chain, + IntID, + Passthru, + StackSlot, + CurDAG->getRegister(RISCV::X0, MVT::i64), + VL}; + + Result = CurDAG->getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, VTs, Ops, + MVT::i64, MPI, Align(8), + MachineMemOperand::MOLoad); + break; + } } - // Lower SPLAT_VECTOR_SPLIT_I64 to two scalar stores and a stride 0 vector - // load. Done after lowering and combining so that we have a chance to - // optimize this to VMV_V_X_VL when the upper bits aren't needed. - if (N->getOpcode() != RISCVISD::SPLAT_VECTOR_SPLIT_I64_VL) - continue; + if (Result) { + LLVM_DEBUG(dbgs() << "RISCV DAG preprocessing replacing:\nOld: "); + LLVM_DEBUG(N->dump(CurDAG)); + LLVM_DEBUG(dbgs() << "\nNew: "); + LLVM_DEBUG(Result->dump(CurDAG)); + LLVM_DEBUG(dbgs() << "\n"); - assert(N->getNumOperands() == 4 && "Unexpected number of operands"); - MVT VT = N->getSimpleValueType(0); - SDValue Passthru = N->getOperand(0); - SDValue Lo = N->getOperand(1); - SDValue Hi = N->getOperand(2); - SDValue VL = N->getOperand(3); - assert(VT.getVectorElementType() == MVT::i64 && VT.isScalableVector() && - Lo.getValueType() == MVT::i32 && Hi.getValueType() == MVT::i32 && - "Unexpected VTs!"); - MachineFunction &MF = CurDAG->getMachineFunction(); - RISCVMachineFunctionInfo *FuncInfo = MF.getInfo<RISCVMachineFunctionInfo>(); - SDLoc DL(N); - - // We use the same frame index we use for moving two i32s into 64-bit FPR. 
- // This is an analogous operation. - int FI = FuncInfo->getMoveF64FrameIndex(MF); - MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, FI); - const TargetLowering &TLI = CurDAG->getTargetLoweringInfo(); - SDValue StackSlot = - CurDAG->getFrameIndex(FI, TLI.getPointerTy(CurDAG->getDataLayout())); - - SDValue Chain = CurDAG->getEntryNode(); - Lo = CurDAG->getStore(Chain, DL, Lo, StackSlot, MPI, Align(8)); - - SDValue OffsetSlot = - CurDAG->getMemBasePlusOffset(StackSlot, TypeSize::Fixed(4), DL); - Hi = CurDAG->getStore(Chain, DL, Hi, OffsetSlot, MPI.getWithOffset(4), - Align(8)); - - Chain = CurDAG->getNode(ISD::TokenFactor, DL, MVT::Other, Lo, Hi); - - SDVTList VTs = CurDAG->getVTList({VT, MVT::Other}); - SDValue IntID = - CurDAG->getTargetConstant(Intrinsic::riscv_vlse, DL, MVT::i64); - SDValue Ops[] = {Chain, - IntID, - Passthru, - StackSlot, - CurDAG->getRegister(RISCV::X0, MVT::i64), - VL}; - - SDValue Result = CurDAG->getMemIntrinsicNode( - ISD::INTRINSIC_W_CHAIN, DL, VTs, Ops, MVT::i64, MPI, Align(8), - MachineMemOperand::MOLoad); - - // We're about to replace all uses of the SPLAT_VECTOR_SPLIT_I64 with the - // vlse we created. This will cause general havok on the dag because - // anything below the conversion could be folded into other existing nodes. - // To avoid invalidating 'I', back it up to the convert node. - --I; - CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Result); - - // Now that we did that, the node is dead. Increment the iterator to the - // next node to process, then delete N. - ++I; - CurDAG->DeleteNode(N); + CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Result); + MadeChange = true; + } } + + if (MadeChange) + CurDAG->RemoveDeadNodes(); } void RISCVDAGToDAGISel::PostprocessISelDAG() { @@ -143,7 +146,6 @@ void RISCVDAGToDAGISel::PostprocessISelDAG() { continue; MadeChange |= doPeepholeSExtW(N); - MadeChange |= doPeepholeLoadStoreADDI(N); MadeChange |= doPeepholeMaskedRVV(N); } @@ -153,40 +155,6 @@ void RISCVDAGToDAGISel::PostprocessISelDAG() { CurDAG->RemoveDeadNodes(); } -// Returns true if N is a MachineSDNode that has a reg and simm12 memory -// operand. The indices of the base pointer and offset are returned in BaseOpIdx -// and OffsetOpIdx. -static bool hasMemOffset(SDNode *N, unsigned &BaseOpIdx, - unsigned &OffsetOpIdx) { - switch (N->getMachineOpcode()) { - case RISCV::LB: - case RISCV::LH: - case RISCV::LW: - case RISCV::LBU: - case RISCV::LHU: - case RISCV::LWU: - case RISCV::LD: - case RISCV::FLH: - case RISCV::FLW: - case RISCV::FLD: - BaseOpIdx = 0; - OffsetOpIdx = 1; - return true; - case RISCV::SB: - case RISCV::SH: - case RISCV::SW: - case RISCV::SD: - case RISCV::FSH: - case RISCV::FSW: - case RISCV::FSD: - BaseOpIdx = 1; - OffsetOpIdx = 2; - return true; - } - - return false; -} - static SDNode *selectImmSeq(SelectionDAG *CurDAG, const SDLoc &DL, const MVT VT, RISCVMatInt::InstSeq &Seq) { SDNode *Result = nullptr; @@ -285,9 +253,7 @@ void RISCVDAGToDAGISel::addVectorLoadStoreOperands( SDValue Chain = Node->getOperand(0); SDValue Glue; - SDValue Base; - SelectBaseAddr(Node->getOperand(CurOp++), Base); - Operands.push_back(Base); // Base pointer. + Operands.push_back(Node->getOperand(CurOp++)); // Base pointer. if (IsStridedOrIndexed) { Operands.push_back(Node->getOperand(CurOp++)); // Index. 
@@ -651,83 +617,6 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { ReplaceNode(Node, selectImm(CurDAG, DL, VT, Imm, *Subtarget)); return; } - case ISD::ADD: { - // Try to select ADD + immediate used as memory addresses to - // (ADDI (ADD X, Imm-Lo12), Lo12) if it will allow the ADDI to be removed by - // doPeepholeLoadStoreADDI. - - // LHS should be an immediate. - auto *N1C = dyn_cast<ConstantSDNode>(Node->getOperand(1)); - if (!N1C) - break; - - int64_t Offset = N1C->getSExtValue(); - int64_t Lo12 = SignExtend64<12>(Offset); - - // Don't do this if the lower 12 bits are 0 or we could use ADDI directly. - if (Lo12 == 0 || isInt<12>(Offset)) - break; - - // Don't do this if we can use a pair of ADDIs. - if (isInt<12>(Offset / 2) && isInt<12>(Offset - Offset / 2)) - break; - - RISCVMatInt::InstSeq Seq = - RISCVMatInt::generateInstSeq(Offset, Subtarget->getFeatureBits()); - - Offset -= Lo12; - // Restore sign bits for RV32. - if (!Subtarget->is64Bit()) - Offset = SignExtend64<32>(Offset); - - // We can fold if the last operation is an ADDI or its an ADDIW that could - // be treated as an ADDI. - if (Seq.back().Opc != RISCV::ADDI && - !(Seq.back().Opc == RISCV::ADDIW && isInt<32>(Offset))) - break; - assert(Seq.back().Imm == Lo12 && "Expected immediate to match Lo12"); - // Drop the last operation. - Seq.pop_back(); - assert(!Seq.empty() && "Expected more instructions in sequence"); - - bool AllPointerUses = true; - for (auto UI = Node->use_begin(), UE = Node->use_end(); UI != UE; ++UI) { - SDNode *User = *UI; - - // Is this user a memory instruction that uses a register and immediate - // that has this ADD as its pointer. - unsigned BaseOpIdx, OffsetOpIdx; - if (!User->isMachineOpcode() || - !hasMemOffset(User, BaseOpIdx, OffsetOpIdx) || - UI.getOperandNo() != BaseOpIdx) { - AllPointerUses = false; - break; - } - - // If the memory instruction already has an offset, make sure the combined - // offset is foldable. - int64_t MemOffs = - cast<ConstantSDNode>(User->getOperand(OffsetOpIdx))->getSExtValue(); - MemOffs += Lo12; - if (!isInt<12>(MemOffs)) { - AllPointerUses = false; - break; - } - } - - if (!AllPointerUses) - break; - - // Emit (ADDI (ADD X, Hi), Lo) - SDNode *Imm = selectImmSeq(CurDAG, DL, VT, Seq); - SDNode *ADD = CurDAG->getMachineNode(RISCV::ADD, DL, VT, - Node->getOperand(0), SDValue(Imm, 0)); - SDNode *ADDI = - CurDAG->getMachineNode(RISCV::ADDI, DL, VT, SDValue(ADD, 0), - CurDAG->getTargetConstant(Lo12, DL, VT)); - ReplaceNode(Node, ADDI); - return; - } case ISD::SHL: { auto *N1C = dyn_cast<ConstantSDNode>(Node->getOperand(1)); if (!N1C) @@ -856,10 +745,9 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { auto *C = dyn_cast<ConstantSDNode>(N0.getOperand(1)); if (!C) break; - uint64_t C2 = C->getZExtValue(); + unsigned C2 = C->getZExtValue(); unsigned XLen = Subtarget->getXLen(); - if (!C2 || C2 >= XLen) - break; + assert((C2 > 0 && C2 < XLen) && "Unexpected shift amount!"); uint64_t C1 = N1C->getZExtValue(); @@ -885,10 +773,10 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { // Turn (and (srl x, c2) c1) -> (srli (slli x, c3-c2), c3) if c1 is a mask // with c3 leading zeros. if (!LeftShift && isMask_64(C1)) { - uint64_t C3 = XLen - (64 - countLeadingZeros(C1)); - if (C2 < C3) { + unsigned Leading = XLen - (64 - countLeadingZeros(C1)); + if (C2 < Leading) { // If the number of leading zeros is C2+32 this can be SRLIW. 
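 // (e.g. C2=2, C1=0x3fffffff: C1 has 34 leading zeros = C2+32, so
 // (and (srl x, 2), 0x3fffffff) becomes (srliw x, 2).)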
- if (C2 + 32 == C3) { + if (C2 + 32 == Leading) { SDNode *SRLIW = CurDAG->getMachineNode( RISCV::SRLIW, DL, VT, X, CurDAG->getTargetConstant(C2, DL, VT)); ReplaceNode(Node, SRLIW); @@ -900,7 +788,7 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { // // This pattern occurs when (i32 (srl (sra 31), c3 - 32)) is type // legalized and goes through DAG combine. - if (C2 >= 32 && (C3 - C2) == 1 && N0.hasOneUse() && + if (C2 >= 32 && (Leading - C2) == 1 && N0.hasOneUse() && X.getOpcode() == ISD::SIGN_EXTEND_INREG && cast<VTSDNode>(X.getOperand(1))->getVT() == MVT::i32) { SDNode *SRAIW = @@ -908,25 +796,25 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { CurDAG->getTargetConstant(31, DL, VT)); SDNode *SRLIW = CurDAG->getMachineNode( RISCV::SRLIW, DL, VT, SDValue(SRAIW, 0), - CurDAG->getTargetConstant(C3 - 32, DL, VT)); + CurDAG->getTargetConstant(Leading - 32, DL, VT)); ReplaceNode(Node, SRLIW); return; } // (srli (slli x, c3-c2), c3). // Skip if we could use (zext.w (sraiw X, C2)). - bool Skip = Subtarget->hasStdExtZba() && C3 == 32 && + bool Skip = Subtarget->hasStdExtZba() && Leading == 32 && X.getOpcode() == ISD::SIGN_EXTEND_INREG && cast<VTSDNode>(X.getOperand(1))->getVT() == MVT::i32; // Also Skip if we can use bexti. - Skip |= Subtarget->hasStdExtZbs() && C3 == XLen - 1; + Skip |= Subtarget->hasStdExtZbs() && Leading == XLen - 1; if (OneUseOrZExtW && !Skip) { SDNode *SLLI = CurDAG->getMachineNode( RISCV::SLLI, DL, VT, X, - CurDAG->getTargetConstant(C3 - C2, DL, VT)); - SDNode *SRLI = - CurDAG->getMachineNode(RISCV::SRLI, DL, VT, SDValue(SLLI, 0), - CurDAG->getTargetConstant(C3, DL, VT)); + CurDAG->getTargetConstant(Leading - C2, DL, VT)); + SDNode *SRLI = CurDAG->getMachineNode( + RISCV::SRLI, DL, VT, SDValue(SLLI, 0), + CurDAG->getTargetConstant(Leading, DL, VT)); ReplaceNode(Node, SRLI); return; } @@ -936,12 +824,12 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { // Turn (and (shl x, c2), c1) -> (srli (slli c2+c3), c3) if c1 is a mask // shifted by c2 bits with c3 leading zeros. if (LeftShift && isShiftedMask_64(C1)) { - uint64_t C3 = XLen - (64 - countLeadingZeros(C1)); + unsigned Leading = XLen - (64 - countLeadingZeros(C1)); - if (C2 + C3 < XLen && - C1 == (maskTrailingOnes<uint64_t>(XLen - (C2 + C3)) << C2)) { + if (C2 + Leading < XLen && + C1 == (maskTrailingOnes<uint64_t>(XLen - (C2 + Leading)) << C2)) { // Use slli.uw when possible. - if ((XLen - (C2 + C3)) == 32 && Subtarget->hasStdExtZba()) { + if ((XLen - (C2 + Leading)) == 32 && Subtarget->hasStdExtZba()) { SDNode *SLLI_UW = CurDAG->getMachineNode( RISCV::SLLI_UW, DL, VT, X, CurDAG->getTargetConstant(C2, DL, VT)); ReplaceNode(Node, SLLI_UW); @@ -952,10 +840,10 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { if (OneUseOrZExtW && !IsCANDI) { SDNode *SLLI = CurDAG->getMachineNode( RISCV::SLLI, DL, VT, X, - CurDAG->getTargetConstant(C2 + C3, DL, VT)); - SDNode *SRLI = - CurDAG->getMachineNode(RISCV::SRLI, DL, VT, SDValue(SLLI, 0), - CurDAG->getTargetConstant(C3, DL, VT)); + CurDAG->getTargetConstant(C2 + Leading, DL, VT)); + SDNode *SRLI = CurDAG->getMachineNode( + RISCV::SRLI, DL, VT, SDValue(SLLI, 0), + CurDAG->getTargetConstant(Leading, DL, VT)); ReplaceNode(Node, SRLI); return; } @@ -965,9 +853,9 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { // Turn (and (shr x, c2), c1) -> (slli (srli x, c2+c3), c3) if c1 is a // shifted mask with c2 leading zeros and c3 trailing zeros. 
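 // (Worked example, XLen=64: c2=8, c1=0x00ffffffffffff00 has 8 leading and 8
 // trailing zeros, so (and (srl x, 8), c1) becomes (slli (srli x, 16), 8).)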
if (!LeftShift && isShiftedMask_64(C1)) { - uint64_t Leading = XLen - (64 - countLeadingZeros(C1)); - uint64_t C3 = countTrailingZeros(C1); - if (Leading == C2 && C2 + C3 < XLen && OneUseOrZExtW && !IsCANDI) { + unsigned Leading = XLen - (64 - countLeadingZeros(C1)); + unsigned Trailing = countTrailingZeros(C1); + if (Leading == C2 && C2 + Trailing < XLen && OneUseOrZExtW && !IsCANDI) { unsigned SrliOpc = RISCV::SRLI; // If the input is zexti32 we should use SRLIW. if (X.getOpcode() == ISD::AND && isa<ConstantSDNode>(X.getOperand(1)) && @@ -976,22 +864,23 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { X = X.getOperand(0); } SDNode *SRLI = CurDAG->getMachineNode( - SrliOpc, DL, VT, X, CurDAG->getTargetConstant(C2 + C3, DL, VT)); + SrliOpc, DL, VT, X, + CurDAG->getTargetConstant(C2 + Trailing, DL, VT)); SDNode *SLLI = CurDAG->getMachineNode(RISCV::SLLI, DL, VT, SDValue(SRLI, 0), - CurDAG->getTargetConstant(C3, DL, VT)); + CurDAG->getTargetConstant(Trailing, DL, VT)); ReplaceNode(Node, SLLI); return; } // If the leading zero count is C2+32, we can use SRLIW instead of SRLI. - if (Leading > 32 && (Leading - 32) == C2 && C2 + C3 < 32 && + if (Leading > 32 && (Leading - 32) == C2 && C2 + Trailing < 32 && OneUseOrZExtW && !IsCANDI) { - SDNode *SRLIW = - CurDAG->getMachineNode(RISCV::SRLIW, DL, VT, X, - CurDAG->getTargetConstant(C2 + C3, DL, VT)); + SDNode *SRLIW = CurDAG->getMachineNode( + RISCV::SRLIW, DL, VT, X, + CurDAG->getTargetConstant(C2 + Trailing, DL, VT)); SDNode *SLLI = CurDAG->getMachineNode(RISCV::SLLI, DL, VT, SDValue(SRLIW, 0), - CurDAG->getTargetConstant(C3, DL, VT)); + CurDAG->getTargetConstant(Trailing, DL, VT)); ReplaceNode(Node, SLLI); return; } @@ -1000,25 +889,26 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { // Turn (and (shl x, c2), c1) -> (slli (srli x, c3-c2), c3) if c1 is a // shifted mask with no leading zeros and c3 trailing zeros. if (LeftShift && isShiftedMask_64(C1)) { - uint64_t Leading = XLen - (64 - countLeadingZeros(C1)); - uint64_t C3 = countTrailingZeros(C1); - if (Leading == 0 && C2 < C3 && OneUseOrZExtW && !IsCANDI) { + unsigned Leading = XLen - (64 - countLeadingZeros(C1)); + unsigned Trailing = countTrailingZeros(C1); + if (Leading == 0 && C2 < Trailing && OneUseOrZExtW && !IsCANDI) { SDNode *SRLI = CurDAG->getMachineNode( - RISCV::SRLI, DL, VT, X, CurDAG->getTargetConstant(C3 - C2, DL, VT)); + RISCV::SRLI, DL, VT, X, + CurDAG->getTargetConstant(Trailing - C2, DL, VT)); SDNode *SLLI = CurDAG->getMachineNode(RISCV::SLLI, DL, VT, SDValue(SRLI, 0), - CurDAG->getTargetConstant(C3, DL, VT)); + CurDAG->getTargetConstant(Trailing, DL, VT)); ReplaceNode(Node, SLLI); return; } // If we have (32-C2) leading zeros, we can use SRLIW instead of SRLI. - if (C2 < C3 && Leading + C2 == 32 && OneUseOrZExtW && !IsCANDI) { - SDNode *SRLIW = - CurDAG->getMachineNode(RISCV::SRLIW, DL, VT, X, - CurDAG->getTargetConstant(C3 - C2, DL, VT)); + if (C2 < Trailing && Leading + C2 == 32 && OneUseOrZExtW && !IsCANDI) { + SDNode *SRLIW = CurDAG->getMachineNode( + RISCV::SRLIW, DL, VT, X, + CurDAG->getTargetConstant(Trailing - C2, DL, VT)); SDNode *SLLI = CurDAG->getMachineNode(RISCV::SLLI, DL, VT, SDValue(SRLIW, 0), - CurDAG->getTargetConstant(C3, DL, VT)); + CurDAG->getTargetConstant(Trailing, DL, VT)); ReplaceNode(Node, SLLI); return; } @@ -1885,13 +1775,74 @@ bool RISCVDAGToDAGISel::SelectFrameAddrRegImm(SDValue Addr, SDValue &Base, return false; } -bool RISCVDAGToDAGISel::SelectBaseAddr(SDValue Addr, SDValue &Base) { - // If this is FrameIndex, select it directly. 
Otherwise just let it get - // selected to a register independently. - if (auto *FIN = dyn_cast<FrameIndexSDNode>(Addr)) - Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), Subtarget->getXLenVT()); - else - Base = Addr; +// Fold constant addresses. +static bool selectConstantAddr(SelectionDAG *CurDAG, const SDLoc &DL, + const MVT VT, const RISCVSubtarget *Subtarget, + SDValue Addr, SDValue &Base, SDValue &Offset) { + if (!isa<ConstantSDNode>(Addr)) + return false; + + int64_t CVal = cast<ConstantSDNode>(Addr)->getSExtValue(); + + // If the constant is a simm12, we can fold the whole constant and use X0 as + // the base. If the constant can be materialized with LUI+simm12, use LUI as + // the base. We can't use generateInstSeq because it favors LUI+ADDIW. + int64_t Lo12 = SignExtend64<12>(CVal); + int64_t Hi = (uint64_t)CVal - (uint64_t)Lo12; + if (!Subtarget->is64Bit() || isInt<32>(Hi)) { + if (Hi) { + int64_t Hi20 = (Hi >> 12) & 0xfffff; + Base = SDValue( + CurDAG->getMachineNode(RISCV::LUI, DL, VT, + CurDAG->getTargetConstant(Hi20, DL, VT)), + 0); + } else { + Base = CurDAG->getRegister(RISCV::X0, VT); + } + Offset = CurDAG->getTargetConstant(Lo12, DL, VT); + return true; + } + + // Ask how constant materialization would handle this constant. + RISCVMatInt::InstSeq Seq = + RISCVMatInt::generateInstSeq(CVal, Subtarget->getFeatureBits()); + + // If the last instruction would be an ADDI, we can fold its immediate and + // emit the rest of the sequence as the base. + if (Seq.back().Opc != RISCV::ADDI) + return false; + Lo12 = Seq.back().Imm; + + // Drop the last instruction. + Seq.pop_back(); + assert(!Seq.empty() && "Expected more instructions in sequence"); + + Base = SDValue(selectImmSeq(CurDAG, DL, VT, Seq), 0); + Offset = CurDAG->getTargetConstant(Lo12, DL, VT); + return true; +} + +// Is this ADD instruction only used as the base pointer of scalar loads and +// stores? +static bool isWorthFoldingAdd(SDValue Add) { + for (auto Use : Add->uses()) { + if (Use->getOpcode() != ISD::LOAD && Use->getOpcode() != ISD::STORE && + Use->getOpcode() != ISD::ATOMIC_LOAD && + Use->getOpcode() != ISD::ATOMIC_STORE) + return false; + EVT VT = cast<MemSDNode>(Use)->getMemoryVT(); + if (!VT.isScalarInteger() && VT != MVT::f16 && VT != MVT::f32 && + VT != MVT::f64) + return false; + // Don't allow stores of the value. It must be used as the address. + if (Use->getOpcode() == ISD::STORE && + cast<StoreSDNode>(Use)->getValue() == Add) + return false; + if (Use->getOpcode() == ISD::ATOMIC_STORE && + cast<AtomicSDNode>(Use)->getVal() == Add) + return false; + } + return true; } @@ -1947,9 +1898,10 @@ bool RISCVDAGToDAGISel::SelectAddrRegImm(SDValue Addr, SDValue &Base, int64_t CVal = cast<ConstantSDNode>(Addr.getOperand(1))->getSExtValue(); assert(!isInt<12>(CVal) && "simm12 not already handled?"); + // Handle immediates in the range [-4096,-2049] or [2048, 4094]. We can use + // an ADDI for part of the offset and fold the rest into the load/store. + // This mirrors the AddiPair PatFrag in RISCVInstrInfo.td. if (isInt<12>(CVal / 2) && isInt<12>(CVal - CVal / 2)) { - // We can use an ADDI for part of the offset and fold the rest into the - // load/store. This mirrors the AddiPair PatFrag in RISCVInstrInfo.td. int64_t Adj = CVal < 0 ? 
-2048 : 2047;
       Base = SDValue(
           CurDAG->getMachineNode(RISCV::ADDI, DL, VT, Addr.getOperand(0),
@@ -1958,8 +1910,27 @@ bool RISCVDAGToDAGISel::SelectAddrRegImm(SDValue Addr, SDValue &Base,
       Offset = CurDAG->getTargetConstant(CVal - Adj, DL, VT);
       return true;
     }
+
+    // For larger immediates, we might be able to save one instruction from
+    // constant materialization by folding the Lo12 bits of the immediate into
+    // the address. We should only do this if the ADD is only used by loads
+    // and stores that can fold the lo12 bits. Otherwise, the ADD will get
+    // selected separately with the full materialized immediate, creating
+    // extra instructions.
+    if (isWorthFoldingAdd(Addr) &&
+        selectConstantAddr(CurDAG, DL, VT, Subtarget, Addr.getOperand(1), Base,
+                           Offset)) {
+      // Insert an ADD instruction with the materialized Hi52 bits.
+      Base = SDValue(
+          CurDAG->getMachineNode(RISCV::ADD, DL, VT, Addr.getOperand(0), Base),
+          0);
+      return true;
+    }
   }
 
+  if (selectConstantAddr(CurDAG, DL, VT, Subtarget, Addr, Base, Offset))
+    return true;
+
   Base = Addr;
   Offset = CurDAG->getTargetConstant(0, DL, VT);
   return true;
@@ -2044,6 +2015,101 @@ bool RISCVDAGToDAGISel::selectZExti32(SDValue N, SDValue &Val) {
   return false;
 }
 
+/// Look for various patterns that can be done with a SHL that can be folded
+/// into a SHXADD. \p ShAmt contains 1, 2, or 3 and is set based on which
+/// SHXADD we are trying to match.
+bool RISCVDAGToDAGISel::selectSHXADDOp(SDValue N, unsigned ShAmt,
+                                       SDValue &Val) {
+  if (N.getOpcode() == ISD::AND && isa<ConstantSDNode>(N.getOperand(1))) {
+    SDValue N0 = N.getOperand(0);
+
+    bool LeftShift = N0.getOpcode() == ISD::SHL;
+    if ((LeftShift || N0.getOpcode() == ISD::SRL) &&
+        isa<ConstantSDNode>(N0.getOperand(1))) {
+      uint64_t Mask = N.getConstantOperandVal(1);
+      unsigned C2 = N0.getConstantOperandVal(1);
+
+      unsigned XLen = Subtarget->getXLen();
+      if (LeftShift)
+        Mask &= maskTrailingZeros<uint64_t>(C2);
+      else
+        Mask &= maskTrailingOnes<uint64_t>(XLen - C2);
+
+      // Look for (and (shl y, c2), c1) where c1 is a shifted mask with no
+      // leading zeros and c3 trailing zeros. We can use an SRLI by c3-c2
+      // followed by a SHXADD with c3 for the X amount.
+      if (isShiftedMask_64(Mask)) {
+        unsigned Leading = XLen - (64 - countLeadingZeros(Mask));
+        unsigned Trailing = countTrailingZeros(Mask);
+        if (LeftShift && Leading == 0 && C2 < Trailing && Trailing == ShAmt) {
+          SDLoc DL(N);
+          EVT VT = N.getValueType();
+          Val = SDValue(CurDAG->getMachineNode(
+                            RISCV::SRLI, DL, VT, N0.getOperand(0),
+                            CurDAG->getTargetConstant(Trailing - C2, DL, VT)),
+                        0);
+          return true;
+        }
+        // Look for (and (shr y, c2), c1) where c1 is a shifted mask with c2
+        // leading zeros and c3 trailing zeros. We can use an SRLI by c2+c3
+        // followed by a SHXADD using c3 for the X amount.
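+        // (Worked example for sh3add, ShAmt=3: (and (srl y, 8),
+        // 0x00fffffffffffff8) has Leading=8==c2 and Trailing=3==ShAmt, so we
+        // emit (srli y, 11) and let SH3ADD supply the remaining shift by 3.)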
+ if (!LeftShift && Leading == C2 && Trailing == ShAmt) { + SDLoc DL(N); + EVT VT = N.getValueType(); + Val = SDValue( + CurDAG->getMachineNode( + RISCV::SRLI, DL, VT, N0.getOperand(0), + CurDAG->getTargetConstant(Leading + Trailing, DL, VT)), + 0); + return true; + } + } + } + } + + bool LeftShift = N.getOpcode() == ISD::SHL; + if ((LeftShift || N.getOpcode() == ISD::SRL) && + isa<ConstantSDNode>(N.getOperand(1))) { + SDValue N0 = N.getOperand(0); + if (N0.getOpcode() == ISD::AND && N0.hasOneUse() && + isa<ConstantSDNode>(N0.getOperand(1))) { + uint64_t Mask = N0.getConstantOperandVal(1); + if (isShiftedMask_64(Mask)) { + unsigned C1 = N.getConstantOperandVal(1); + unsigned XLen = Subtarget->getXLen(); + unsigned Leading = XLen - (64 - countLeadingZeros(Mask)); + unsigned Trailing = countTrailingZeros(Mask); + // Look for (shl (and X, Mask), C1) where Mask has 32 leading zeros and + // C3 trailing zeros. If C1+C3==ShAmt we can use SRLIW+SHXADD. + if (LeftShift && Leading == 32 && Trailing > 0 && + (Trailing + C1) == ShAmt) { + SDLoc DL(N); + EVT VT = N.getValueType(); + Val = SDValue(CurDAG->getMachineNode( + RISCV::SRLIW, DL, VT, N0.getOperand(0), + CurDAG->getTargetConstant(Trailing, DL, VT)), + 0); + return true; + } + // Look for (srl (and X, Mask), C1) where Mask has 32 leading zeros and + // C3 trailing zeros. If C3-C1==ShAmt we can use SRLIW+SHXADD. + if (!LeftShift && Leading == 32 && Trailing > C1 && + (Trailing - C1) == ShAmt) { + SDLoc DL(N); + EVT VT = N.getValueType(); + Val = SDValue(CurDAG->getMachineNode( + RISCV::SRLIW, DL, VT, N0.getOperand(0), + CurDAG->getTargetConstant(Trailing, DL, VT)), + 0); + return true; + } + } + } + } + + return false; +} + // Return true if all users of this SDNode* only consume the lower \p Bits. // This can be used to form W instructions for add/sub/mul/shl even when the // root isn't a sext_inreg. This can allow the ADDW/SUBW/MULW/SLLIW to CSE if @@ -2271,102 +2337,6 @@ bool RISCVDAGToDAGISel::selectRVVSimm5(SDValue N, unsigned Width, return false; } -// Merge an ADDI into the offset of a load/store instruction where possible. -// (load (addi base, off1), off2) -> (load base, off1+off2) -// (store val, (addi base, off1), off2) -> (store val, base, off1+off2) -// (load (add base, (addi src, off1)), off2) -// -> (load (add base, src), off1+off2) -// (store val, (add base, (addi src, off1)), off2) -// -> (store val, (add base, src), off1+off2) -// This is possible when off1+off2 fits a 12-bit immediate. -bool RISCVDAGToDAGISel::doPeepholeLoadStoreADDI(SDNode *N) { - unsigned OffsetOpIdx, BaseOpIdx; - if (!hasMemOffset(N, BaseOpIdx, OffsetOpIdx)) - return false; - - if (!isa<ConstantSDNode>(N->getOperand(OffsetOpIdx))) - return false; - - SDValue Base = N->getOperand(BaseOpIdx); - - if (!Base.isMachineOpcode()) - return false; - - if (Base.getMachineOpcode() == RISCV::ADDI) { - // If the base is an ADDI, we can merge it in to the load/store. - } else if (Base.getMachineOpcode() == RISCV::ADDIW && - isa<ConstantSDNode>(Base.getOperand(1)) && - Base.getOperand(0).isMachineOpcode() && - Base.getOperand(0).getMachineOpcode() == RISCV::LUI && - isa<ConstantSDNode>(Base.getOperand(0).getOperand(0))) { - // ADDIW can be merged if it's part of LUI+ADDIW constant materialization - // and LUI+ADDI would have produced the same result. This is true for all - // simm32 values except 0x7ffff800-0x7fffffff. 
- int64_t Offset = - SignExtend64<32>(Base.getOperand(0).getConstantOperandVal(0) << 12); - Offset += cast<ConstantSDNode>(Base.getOperand(1))->getSExtValue(); - if (!isInt<32>(Offset)) - return false; - } else - return false; - - SDValue ImmOperand = Base.getOperand(1); - uint64_t Offset2 = N->getConstantOperandVal(OffsetOpIdx); - - if (auto *Const = dyn_cast<ConstantSDNode>(ImmOperand)) { - int64_t Offset1 = Const->getSExtValue(); - int64_t CombinedOffset = Offset1 + Offset2; - if (!isInt<12>(CombinedOffset)) - return false; - ImmOperand = CurDAG->getTargetConstant(CombinedOffset, SDLoc(ImmOperand), - ImmOperand.getValueType()); - } else if (auto *GA = dyn_cast<GlobalAddressSDNode>(ImmOperand)) { - // If the off1 in (addi base, off1) is a global variable's address (its - // low part, really), then we can rely on the alignment of that variable - // to provide a margin of safety before off1 can overflow the 12 bits. - // Check if off2 falls within that margin; if so off1+off2 can't overflow. - const DataLayout &DL = CurDAG->getDataLayout(); - Align Alignment = commonAlignment(GA->getGlobal()->getPointerAlignment(DL), - GA->getOffset()); - if (Offset2 != 0 && Alignment <= Offset2) - return false; - int64_t Offset1 = GA->getOffset(); - int64_t CombinedOffset = Offset1 + Offset2; - ImmOperand = CurDAG->getTargetGlobalAddress( - GA->getGlobal(), SDLoc(ImmOperand), ImmOperand.getValueType(), - CombinedOffset, GA->getTargetFlags()); - } else if (auto *CP = dyn_cast<ConstantPoolSDNode>(ImmOperand)) { - // Ditto. - Align Alignment = commonAlignment(CP->getAlign(), CP->getOffset()); - if (Offset2 != 0 && Alignment <= Offset2) - return false; - int64_t Offset1 = CP->getOffset(); - int64_t CombinedOffset = Offset1 + Offset2; - ImmOperand = CurDAG->getTargetConstantPool( - CP->getConstVal(), ImmOperand.getValueType(), CP->getAlign(), - CombinedOffset, CP->getTargetFlags()); - } else { - return false; - } - - LLVM_DEBUG(dbgs() << "Folding add-immediate into mem-op:\nBase: "); - LLVM_DEBUG(Base->dump(CurDAG)); - LLVM_DEBUG(dbgs() << "\nN: "); - LLVM_DEBUG(N->dump(CurDAG)); - LLVM_DEBUG(dbgs() << "\n"); - - // Modify the offset operand of the load/store. - if (BaseOpIdx == 0) { // Load - N = CurDAG->UpdateNodeOperands(N, Base.getOperand(0), ImmOperand, - N->getOperand(2)); - } else { // Store - N = CurDAG->UpdateNodeOperands(N, N->getOperand(0), Base.getOperand(0), - ImmOperand, N->getOperand(3)); - } - - return true; -} - // Try to remove sext.w if the input is a W instruction or can be made into // a W instruction cheaply. 
bool RISCVDAGToDAGISel::doPeepholeSExtW(SDNode *N) { diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h index b50927cfcca5..ef46204c00ac 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h @@ -47,7 +47,6 @@ public: bool SelectAddrFrameIndex(SDValue Addr, SDValue &Base, SDValue &Offset); bool SelectFrameAddrRegImm(SDValue Addr, SDValue &Base, SDValue &Offset); - bool SelectBaseAddr(SDValue Addr, SDValue &Base); bool SelectAddrRegImm(SDValue Addr, SDValue &Base, SDValue &Offset); bool selectShiftMask(SDValue N, unsigned ShiftWidth, SDValue &ShAmt); @@ -61,6 +60,17 @@ public: bool selectSExti32(SDValue N, SDValue &Val); bool selectZExti32(SDValue N, SDValue &Val); + bool selectSHXADDOp(SDValue N, unsigned ShAmt, SDValue &Val); + bool selectSH1ADDOp(SDValue N, SDValue &Val) { + return selectSHXADDOp(N, 1, Val); + } + bool selectSH2ADDOp(SDValue N, SDValue &Val) { + return selectSHXADDOp(N, 2, Val); + } + bool selectSH3ADDOp(SDValue N, SDValue &Val) { + return selectSHXADDOp(N, 3, Val); + } + bool hasAllNBitUsers(SDNode *Node, unsigned Bits) const; bool hasAllHUsers(SDNode *Node) const { return hasAllNBitUsers(Node, 16); } bool hasAllWUsers(SDNode *Node) const { return hasAllNBitUsers(Node, 32); } @@ -118,7 +128,6 @@ public: #include "RISCVGenDAGISel.inc" private: - bool doPeepholeLoadStoreADDI(SDNode *Node); bool doPeepholeSExtW(SDNode *Node); bool doPeepholeMaskedRVV(SDNode *Node); }; diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index ff645dea4e7a..658865703079 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -526,6 +526,10 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, {ISD::VP_FPTOSI, ISD::VP_FPTOUI, ISD::VP_TRUNCATE, ISD::VP_SETCC}, VT, Custom); setOperationAction(ISD::VECTOR_REVERSE, VT, Custom); + + setOperationPromotedToType( + ISD::VECTOR_SPLICE, VT, + MVT::getVectorVT(MVT::i8, VT.getVectorElementCount())); } for (MVT VT : IntVecVTs) { @@ -1157,6 +1161,37 @@ bool RISCVTargetLowering::hasBitTest(SDValue X, SDValue Y) const { return C && C->getAPIntValue().ule(10); } +bool RISCVTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, + Type *Ty) const { + assert(Ty->isIntegerTy()); + + unsigned BitSize = Ty->getIntegerBitWidth(); + if (BitSize > Subtarget.getXLen()) + return false; + + // Fast path, assume 32-bit immediates are cheap. + int64_t Val = Imm.getSExtValue(); + if (isInt<32>(Val)) + return true; + + // A constant pool entry may be more aligned than the load we're trying to + // replace. If we don't support unaligned scalar mem, prefer the constant + // pool. + // TODO: Can the caller pass down the alignment? + if (!Subtarget.enableUnalignedScalarMem()) + return true; + + // Prefer to keep the load if it would require many instructions. + // This uses the same threshold we use for constant pools but doesn't + // check useConstantPoolForLargeInts. + // TODO: Should we keep the load only when we're definitely going to emit a + // constant pool?
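  // Worked example (editorial, with hypothetical values): on RV64 a constant
  // such as 0x123456789abcdef0 fails the isInt<32> fast path and needs a
  // LUI/ADDI(W) pair plus several SLLI+ADDI steps to build in registers, so
  // its sequence length typically exceeds getMaxBuildIntsCost() and the load
  // is kept; 0x80000000 builds in two instructions (li 1; slli 31) and is
  // converted to an immediate.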
+ + RISCVMatInt::InstSeq Seq = + RISCVMatInt::generateInstSeq(Val, Subtarget.getFeatureBits()); + return Seq.size() <= Subtarget.getMaxBuildIntsCost(); +} + bool RISCVTargetLowering:: shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd( SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, @@ -1659,7 +1694,7 @@ static SDValue convertFromScalableVector(EVT VT, SDValue V, SelectionDAG &DAG, /// Return the type of the mask type suitable for masking the provided /// vector type. This is simply an i1 element type vector of the same /// (possibly scalable) length. -static MVT getMaskTypeFor(EVT VecVT) { +static MVT getMaskTypeFor(MVT VecVT) { assert(VecVT.isVector()); ElementCount EC = VecVT.getVectorElementCount(); return MVT::getVectorVT(MVT::i1, EC); @@ -5748,8 +5783,7 @@ SDValue RISCVTargetLowering::lowerVECTOR_SPLICE(SDValue Op, DAG.getNode(RISCVISD::VSLIDEDOWN_VL, DL, VecVT, DAG.getUNDEF(VecVT), V1, DownOffset, TrueMask, UpOffset); return DAG.getNode(RISCVISD::VSLIDEUP_VL, DL, VecVT, SlideDown, V2, UpOffset, - TrueMask, - DAG.getTargetConstant(RISCV::VLMaxSentinel, DL, XLenVT)); + TrueMask, DAG.getRegister(RISCV::X0, XLenVT)); } SDValue @@ -8530,12 +8564,6 @@ static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc) { return Opcode; } -// Combine (sra (shl X, 32), 32 - C) -> (shl (sext_inreg X, i32), C) -// FIXME: Should this be a generic combine? There's a similar combine on X86. -// -// Also try these folds where an add or sub is in the middle. -// (sra (add (shl X, 32), C1), 32 - C) -> (shl (sext_inreg (add X, C1), C) -// (sra (sub C1, (shl X, 32)), 32 - C) -> (shl (sext_inreg (sub C1, X), C) static SDValue performSRACombine(SDNode *N, SelectionDAG &DAG, const RISCVSubtarget &Subtarget) { assert(N->getOpcode() == ISD::SRA && "Unexpected opcode"); @@ -8543,12 +8571,40 @@ static SDValue performSRACombine(SDNode *N, SelectionDAG &DAG, if (N->getValueType(0) != MVT::i64 || !Subtarget.is64Bit()) return SDValue(); - auto *ShAmtC = dyn_cast<ConstantSDNode>(N->getOperand(1)); - if (!ShAmtC || ShAmtC->getZExtValue() > 32) + if (!isa<ConstantSDNode>(N->getOperand(1))) + return SDValue(); + uint64_t ShAmt = N->getConstantOperandVal(1); + if (ShAmt > 32) return SDValue(); SDValue N0 = N->getOperand(0); + // Combine (sra (sext_inreg (shl X, C1), i32), C2) -> + // (sra (shl X, C1+32), C2+32) so it gets selected as SLLI+SRAI instead of + // SLLIW+SRAIW. SLLI+SRAI have compressed forms. + if (ShAmt < 32 && + N0.getOpcode() == ISD::SIGN_EXTEND_INREG && N0.hasOneUse() && + cast<VTSDNode>(N0.getOperand(1))->getVT() == MVT::i32 && + N0.getOperand(0).getOpcode() == ISD::SHL && N0.getOperand(0).hasOneUse() && + isa<ConstantSDNode>(N0.getOperand(0).getOperand(1))) { + uint64_t LShAmt = N0.getOperand(0).getConstantOperandVal(1); + if (LShAmt < 32) { + SDLoc ShlDL(N0.getOperand(0)); + SDValue Shl = DAG.getNode(ISD::SHL, ShlDL, MVT::i64, + N0.getOperand(0).getOperand(0), + DAG.getConstant(LShAmt + 32, ShlDL, MVT::i64)); + SDLoc DL(N); + return DAG.getNode(ISD::SRA, DL, MVT::i64, Shl, + DAG.getConstant(ShAmt + 32, DL, MVT::i64)); + } + } + + // Combine (sra (shl X, 32), 32 - C) -> (shl (sext_inreg X, i32), C) + // FIXME: Should this be a generic combine? There's a similar combine on X86. + // + // Also try these folds where an add or sub is in the middle. 
+ // (sra (add (shl X, 32), C1), 32 - C) -> (shl (sext_inreg (add X, C1)), C) + // (sra (sub C1, (shl X, 32)), 32 - C) -> (shl (sext_inreg (sub C1, X)), C) SDValue Shl; ConstantSDNode *AddC = nullptr; @@ -8594,12 +8650,12 @@ static SDValue performSRACombine(SDNode *N, SelectionDAG &DAG, SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i64, In, DAG.getValueType(MVT::i32)); - if (ShAmtC->getZExtValue() == 32) + if (ShAmt == 32) return SExt; return DAG.getNode( ISD::SHL, DL, MVT::i64, SExt, - DAG.getConstant(32 - ShAmtC->getZExtValue(), DL, MVT::i64)); + DAG.getConstant(32 - ShAmt, DL, MVT::i64)); } SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, @@ -9152,10 +9208,10 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, // FIXME: Support FP. if (Val.getOpcode() == RISCVISD::VMV_X_S) { SDValue Src = Val.getOperand(0); - EVT VecVT = Src.getValueType(); + MVT VecVT = Src.getSimpleValueType(); EVT MemVT = Store->getMemoryVT(); // The memory VT and the element type must match. - if (VecVT.getVectorElementType() == MemVT) { + if (MemVT == VecVT.getVectorElementType()) { SDLoc DL(N); MVT MaskVT = getMaskTypeFor(VecVT); return DAG.getStoreVP( @@ -9864,7 +9920,7 @@ EmitLoweredCascadedSelect(MachineInstr &First, MachineInstr &Second, Register FLHS = First.getOperand(1).getReg(); Register FRHS = First.getOperand(2).getReg(); // Insert appropriate branch. - BuildMI(ThisMBB, DL, TII.getBrCond(FirstCC)) + BuildMI(FirstMBB, DL, TII.getBrCond(FirstCC)) .addReg(FLHS) .addReg(FRHS) .addMBB(SinkMBB); @@ -9876,7 +9932,7 @@ EmitLoweredCascadedSelect(MachineInstr &First, MachineInstr &Second, auto SecondCC = static_cast<RISCVCC::CondCode>(Second.getOperand(3).getImm()); // Insert appropriate branch. - BuildMI(FirstMBB, DL, TII.getBrCond(SecondCC)) + BuildMI(ThisMBB, DL, TII.getBrCond(SecondCC)) .addReg(SLHS) .addReg(SRHS) .addMBB(SinkMBB); @@ -9884,9 +9940,9 @@ EmitLoweredCascadedSelect(MachineInstr &First, MachineInstr &Second, Register DestReg = Second.getOperand(0).getReg(); Register Op2Reg4 = Second.getOperand(4).getReg(); BuildMI(*SinkMBB, SinkMBB->begin(), DL, TII.get(RISCV::PHI), DestReg) - .addReg(Op1Reg4) - .addMBB(ThisMBB) .addReg(Op2Reg4) + .addMBB(ThisMBB) + .addReg(Op1Reg4) + .addMBB(FirstMBB) .addReg(Op1Reg5) .addMBB(SecondMBB); @@ -12096,6 +12152,17 @@ const MCExpr *RISCVTargetLowering::LowerCustomJumpTableEntry( return MCSymbolRefExpr::create(MBB->getSymbol(), Ctx); } +bool RISCVTargetLowering::isVScaleKnownToBeAPowerOfTwo() const { + // We define vscale to be VLEN/RVVBitsPerBlock. VLEN is always a power + // of two >= 64, and RVVBitsPerBlock is 64. Thus, vscale must be + // a power of two as well. + // FIXME: This doesn't work for zve32, but that's already broken + // elsewhere for the same reason.
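  // Worked arithmetic (editorial example): VLEN == 128 gives
  // vscale = 128/64 = 2, VLEN == 256 gives 4, and the minimum VLEN == 64
  // gives 1; every legal value is a power of two.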
+ assert(Subtarget.getRealMinVLen() >= 64 && "zve32* unsupported"); + assert(RISCV::RVVBitsPerBlock == 64 && "RVVBitsPerBlock changed, audit needed"); + return true; +} + bool RISCVTargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const { VT = VT.getScalarType(); diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index eb013d4b6682..5e15176de59c 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -520,9 +520,7 @@ public: SmallVectorImpl<SDValue> &InVals) const override; bool shouldConvertConstantLoadToIntImm(const APInt &Imm, - Type *Ty) const override { - return true; - } + Type *Ty) const override; bool mayBeEmittedAsTailCall(const CallInst *CI) const override; bool shouldConsiderGEPOffsetSplit() const override { return true; } @@ -599,6 +597,8 @@ public: unsigned uid, MCContext &Ctx) const override; + bool isVScaleKnownToBeAPowerOfTwo() const override; + private: /// RISCVCCAssignFn - This target-specific function extends the default /// CCValAssign with additional information used to lower RISC-V calling diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td index ee4c026af8f4..06a90438838e 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td @@ -384,7 +384,6 @@ def uimm6gt32 : ImmLeaf<XLenVT, [{ // Necessary because a frameindex can't be matched directly in a pattern. def FrameAddrRegImm : ComplexPattern<iPTR, 2, "SelectFrameAddrRegImm", [frameindex, or, add]>; -def BaseAddr : ComplexPattern<iPTR, 1, "SelectBaseAddr">; def AddrRegImm : ComplexPattern<iPTR, 2, "SelectAddrRegImm">; // Return the negation of an immediate value. diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoV.td b/llvm/lib/Target/RISCV/RISCVInstrInfoV.td index f8bc241039f8..1ad634344c09 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoV.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoV.td @@ -115,6 +115,35 @@ class VSXSched<int n, string o> : class VLFSched<int n> : Sched <[!cast<SchedReadWrite>("WriteVLDFF" # n), ReadVLDX, ReadVMask]>; +// Unit-Stride Segment Loads and Stores +class VLSEGSched<int nf, int eew> : Sched<[ + !cast<SchedReadWrite>("WriteVLSEG" #nf #"e" #eew), ReadVLDX, ReadVMask]>; +class VSSEGSched<int nf, int eew> : Sched<[ + !cast<SchedReadWrite>("WriteVSSEG" #nf #"e" #eew), + !cast<SchedReadWrite>("ReadVSTE" #eew #"V"), ReadVSTX, ReadVMask]>; +class VLSEGFFSched<int nf, int eew> : Sched<[ + !cast<SchedReadWrite>("WriteVLSEGFF" #nf #"e" #eew), ReadVLDX, ReadVMask]>; +// Strided Segment Loads and Stores +class VLSSEGSched<int nf, int eew> : Sched<[ + !cast<SchedReadWrite>("WriteVLSSEG" #nf #"e" #eew), ReadVLDX, ReadVLDSX, + ReadVMask]>; +class VSSSEGSched<int nf, int eew> : Sched<[ + !cast<SchedReadWrite>("WriteVSSSEG" #nf #"e" #eew), + !cast<SchedReadWrite>("ReadVSTS" #eew #"V"), ReadVSTX, ReadVSTSX, ReadVMask]>; +// Indexed Segment Loads and Stores +class VLUXSEGSched<int nf, int eew> : Sched<[ + !cast<SchedReadWrite>("WriteVLUXSEG" #nf #"e" #eew), ReadVLDX, ReadVLDUXV, + ReadVMask]>; +class VLOXSEGSched<int nf, int eew> : Sched<[ + !cast<SchedReadWrite>("WriteVLOXSEG" #nf #"e" #eew), ReadVLDX, ReadVLDOXV, + ReadVMask]>; +class VSUXSEGSched<int nf, int eew> : Sched<[ + !cast<SchedReadWrite>("WriteVSUXSEG" #nf #"e" #eew), + !cast<SchedReadWrite>("ReadVSTUX" #eew), ReadVSTX, ReadVSTUXV, ReadVMask]>; +class VSOXSEGSched<int nf, int eew> : Sched<[ + !cast<SchedReadWrite>("WriteVSOXSEG" #nf #"e" 
#eew), + !cast<SchedReadWrite>("ReadVSTOX" #eew), ReadVSTX, ReadVSTOXV, ReadVMask]>; + //===----------------------------------------------------------------------===// // Instruction class templates //===----------------------------------------------------------------------===// @@ -1476,14 +1505,9 @@ defm VCOMPRESS_V : VCPR_MV_Mask<"vcompress", 0b010111>; let hasSideEffects = 0, mayLoad = 0, mayStore = 0, RVVConstraint = NoConstraint in { -def VMV1R_V : RVInstV<0b100111, 0, OPIVI, (outs VR:$vd), (ins VR:$vs2), - "vmv1r.v", "$vd, $vs2">, VMVRSched<1> { - let Uses = []; - let vm = 1; -} // A future extension may relax the vector register alignment restrictions. -foreach n = [2, 4, 8] in { - defvar vrc = !cast<VReg>("VRM"#n); +foreach n = [1, 2, 4, 8] in { + defvar vrc = !cast<VReg>(!if(!eq(n, 1), "VR", "VRM"#n)); def VMV#n#R_V : RVInstV<0b100111, !add(n, -1), OPIVI, (outs vrc:$vd), (ins vrc:$vs2), "vmv" # n # "r.v", "$vd, $vs2">, VMVRSched<n> { @@ -1500,31 +1524,35 @@ let Predicates = [HasVInstructions] in { defvar w = !cast<RISCVWidth>("LSWidth"#eew); def VLSEG#nf#E#eew#_V : - VUnitStrideSegmentLoad<!add(nf, -1), w, "vlseg"#nf#"e"#eew#".v">; + VUnitStrideSegmentLoad<!add(nf, -1), w, "vlseg"#nf#"e"#eew#".v">, + VLSEGSched<nf, eew>; def VLSEG#nf#E#eew#FF_V : - VUnitStrideSegmentLoadFF<!add(nf, -1), w, "vlseg"#nf#"e"#eew#"ff.v">; + VUnitStrideSegmentLoadFF<!add(nf, -1), w, "vlseg"#nf#"e"#eew#"ff.v">, + VLSEGFFSched<nf, eew>; def VSSEG#nf#E#eew#_V : - VUnitStrideSegmentStore<!add(nf, -1), w, "vsseg"#nf#"e"#eew#".v">; - + VUnitStrideSegmentStore<!add(nf, -1), w, "vsseg"#nf#"e"#eew#".v">, + VSSEGSched<nf, eew>; // Vector Strided Instructions def VLSSEG#nf#E#eew#_V : - VStridedSegmentLoad<!add(nf, -1), w, "vlsseg"#nf#"e"#eew#".v">; + VStridedSegmentLoad<!add(nf, -1), w, "vlsseg"#nf#"e"#eew#".v">, + VLSSEGSched<nf, eew>; def VSSSEG#nf#E#eew#_V : - VStridedSegmentStore<!add(nf, -1), w, "vssseg"#nf#"e"#eew#".v">; + VStridedSegmentStore<!add(nf, -1), w, "vssseg"#nf#"e"#eew#".v">, + VSSSEGSched<nf, eew>; // Vector Indexed Instructions def VLUXSEG#nf#EI#eew#_V : VIndexedSegmentLoad<!add(nf, -1), MOPLDIndexedUnord, w, - "vluxseg"#nf#"ei"#eew#".v">; + "vluxseg"#nf#"ei"#eew#".v">, VLUXSEGSched<nf, eew>; def VLOXSEG#nf#EI#eew#_V : VIndexedSegmentLoad<!add(nf, -1), MOPLDIndexedOrder, w, - "vloxseg"#nf#"ei"#eew#".v">; + "vloxseg"#nf#"ei"#eew#".v">, VLOXSEGSched<nf, eew>; def VSUXSEG#nf#EI#eew#_V : VIndexedSegmentStore<!add(nf, -1), MOPSTIndexedUnord, w, - "vsuxseg"#nf#"ei"#eew#".v">; + "vsuxseg"#nf#"ei"#eew#".v">, VSUXSEGSched<nf, eew>; def VSOXSEG#nf#EI#eew#_V : VIndexedSegmentStore<!add(nf, -1), MOPSTIndexedOrder, w, - "vsoxseg"#nf#"ei"#eew#".v">; + "vsoxseg"#nf#"ei"#eew#".v">, VSOXSEGSched<nf, eew>; } } } // Predicates = [HasVInstructions] @@ -1533,17 +1561,22 @@ let Predicates = [HasVInstructionsI64] in { foreach nf=2-8 in { // Vector Unit-strided Segment Instructions def VLSEG#nf#E64_V : - VUnitStrideSegmentLoad<!add(nf, -1), LSWidth64, "vlseg"#nf#"e64.v">; + VUnitStrideSegmentLoad<!add(nf, -1), LSWidth64, "vlseg"#nf#"e64.v">, + VLSEGSched<nf, 64>; def VLSEG#nf#E64FF_V : - VUnitStrideSegmentLoadFF<!add(nf, -1), LSWidth64, "vlseg"#nf#"e64ff.v">; + VUnitStrideSegmentLoadFF<!add(nf, -1), LSWidth64, "vlseg"#nf#"e64ff.v">, + VLSEGFFSched<nf, 64>; def VSSEG#nf#E64_V : - VUnitStrideSegmentStore<!add(nf, -1), LSWidth64, "vsseg"#nf#"e64.v">; + VUnitStrideSegmentStore<!add(nf, -1), LSWidth64, "vsseg"#nf#"e64.v">, + VSSEGSched<nf, 64>; // Vector Strided Segment Instructions def VLSSEG#nf#E64_V : - 
VStridedSegmentLoad<!add(nf, -1), LSWidth64, "vlsseg"#nf#"e64.v">; + VStridedSegmentLoad<!add(nf, -1), LSWidth64, "vlsseg"#nf#"e64.v">, + VLSSEGSched<nf, 64>; def VSSSEG#nf#E64_V : - VStridedSegmentStore<!add(nf, -1), LSWidth64, "vssseg"#nf#"e64.v">; + VStridedSegmentStore<!add(nf, -1), LSWidth64, "vssseg"#nf#"e64.v">, + VSSSEGSched<nf, 64>; } } // Predicates = [HasVInstructionsI64] let Predicates = [HasVInstructionsI64, IsRV64] in { @@ -1551,16 +1584,16 @@ let Predicates = [HasVInstructionsI64, IsRV64] in { // Vector Indexed Segment Instructions def VLUXSEG#nf#EI64_V : VIndexedSegmentLoad<!add(nf, -1), MOPLDIndexedUnord, LSWidth64, - "vluxseg"#nf#"ei64.v">; + "vluxseg"#nf#"ei64.v">, VLUXSEGSched<nf, 64>; def VLOXSEG#nf#EI64_V : VIndexedSegmentLoad<!add(nf, -1), MOPLDIndexedOrder, LSWidth64, - "vloxseg"#nf#"ei64.v">; + "vloxseg"#nf#"ei64.v">, VLOXSEGSched<nf, 64>; def VSUXSEG#nf#EI64_V : VIndexedSegmentStore<!add(nf, -1), MOPSTIndexedUnord, LSWidth64, - "vsuxseg"#nf#"ei64.v">; + "vsuxseg"#nf#"ei64.v">, VSUXSEGSched<nf, 64>; def VSOXSEG#nf#EI64_V : VIndexedSegmentStore<!add(nf, -1), MOPSTIndexedOrder, LSWidth64, - "vsoxseg"#nf#"ei64.v">; + "vsoxseg"#nf#"ei64.v">, VSOXSEGSched<nf, 64>; } } // Predicates = [HasVInstructionsI64, IsRV64] diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td index 06d4c4d0a9e6..b7b25643e397 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td @@ -34,11 +34,11 @@ multiclass VPatUSLoadStoreSDNode<ValueType type, defvar load_instr = !cast<Instruction>("PseudoVLE"#sew#"_V_"#vlmul.MX); defvar store_instr = !cast<Instruction>("PseudoVSE"#sew#"_V_"#vlmul.MX); // Load - def : Pat<(type (load BaseAddr:$rs1)), - (load_instr BaseAddr:$rs1, avl, log2sew)>; + def : Pat<(type (load GPR:$rs1)), + (load_instr GPR:$rs1, avl, log2sew)>; // Store - def : Pat<(store type:$rs2, BaseAddr:$rs1), - (store_instr reg_class:$rs2, BaseAddr:$rs1, avl, log2sew)>; + def : Pat<(store type:$rs2, GPR:$rs1), + (store_instr reg_class:$rs2, GPR:$rs1, avl, log2sew)>; } multiclass VPatUSLoadStoreWholeVRSDNode<ValueType type, @@ -53,11 +53,11 @@ multiclass VPatUSLoadStoreWholeVRSDNode<ValueType type, !cast<Instruction>("VS"#!substr(vlmul.MX, 1)#"R_V"); // Load - def : Pat<(type (load BaseAddr:$rs1)), - (load_instr BaseAddr:$rs1)>; + def : Pat<(type (load GPR:$rs1)), + (load_instr GPR:$rs1)>; // Store - def : Pat<(store type:$rs2, BaseAddr:$rs1), - (store_instr reg_class:$rs2, BaseAddr:$rs1)>; + def : Pat<(store type:$rs2, GPR:$rs1), + (store_instr reg_class:$rs2, GPR:$rs1)>; } multiclass VPatUSLoadStoreMaskSDNode<MTypeInfo m> @@ -65,11 +65,11 @@ multiclass VPatUSLoadStoreMaskSDNode<MTypeInfo m> defvar load_instr = !cast<Instruction>("PseudoVLM_V_"#m.BX); defvar store_instr = !cast<Instruction>("PseudoVSM_V_"#m.BX); // Load - def : Pat<(m.Mask (load BaseAddr:$rs1)), - (load_instr BaseAddr:$rs1, m.AVL, m.Log2SEW)>; + def : Pat<(m.Mask (load GPR:$rs1)), + (load_instr GPR:$rs1, m.AVL, m.Log2SEW)>; // Store - def : Pat<(store m.Mask:$rs2, BaseAddr:$rs1), - (store_instr VR:$rs2, BaseAddr:$rs1, m.AVL, m.Log2SEW)>; + def : Pat<(store m.Mask:$rs2, GPR:$rs1), + (store_instr VR:$rs2, GPR:$rs1, m.AVL, m.Log2SEW)>; } class VPatBinarySDNode_VV<SDNode vop, @@ -1038,10 +1038,14 @@ let Predicates = [HasVInstructionsAnyF] in foreach vti = AllFloatVectors in { // Fold store of vmv.f.s to a vse with VL=1. 
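  // Illustrative effect of the patterns below (editorial example, not part
  // of the patch): a scalar extract such as "vfmv.f.s fa0, v8" followed by
  // "fsw fa0, 0(a0)" becomes a single "vse32.v v8, (a0)" executed with VL=1,
  // so element 0 is stored directly and never round-trips through an FPR.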
defvar store_instr = !cast<Instruction>("PseudoVSE"#vti.SEW#"_V_"#vti.LMul.MX); - def : Pat<(store (vti.Scalar (int_riscv_vfmv_f_s (vti.Vector vti.RegClass:$rs2))), BaseAddr:$rs1), - (store_instr vti.RegClass:$rs2, BaseAddr:$rs1, 1, vti.Log2SEW)>; - def : Pat<(store (extractelt (vti.Vector vti.RegClass:$rs2), 0), BaseAddr:$rs1), - (store_instr vti.RegClass:$rs2, BaseAddr:$rs1, 1, vti.Log2SEW)>; + + let AddedComplexity = 2 in { + // Add complexity to increase the priority of this pattern being matched. + def : Pat<(store (vti.Scalar (int_riscv_vfmv_f_s (vti.Vector vti.RegClass:$rs2))), GPR:$rs1), + (store_instr vti.RegClass:$rs2, GPR:$rs1, 1, vti.Log2SEW)>; + def : Pat<(store (extractelt (vti.Vector vti.RegClass:$rs2), 0), GPR:$rs1), + (store_instr vti.RegClass:$rs2, GPR:$rs1, 1, vti.Log2SEW)>; + } defvar vmv_f_s_inst = !cast<Instruction>(!strconcat("PseudoVFMV_", vti.ScalarSuffix, diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td index 081f61617d59..49306bb0f4e2 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td @@ -76,13 +76,13 @@ def riscv_urem_vl : SDNode<"RISCVISD::UREM_VL", SDT_RISCVIntBinOp_VL>; def riscv_shl_vl : SDNode<"RISCVISD::SHL_VL", SDT_RISCVIntBinOp_VL>; def riscv_sra_vl : SDNode<"RISCVISD::SRA_VL", SDT_RISCVIntBinOp_VL>; def riscv_srl_vl : SDNode<"RISCVISD::SRL_VL", SDT_RISCVIntBinOp_VL>; -def riscv_smin_vl : SDNode<"RISCVISD::SMIN_VL", SDT_RISCVIntBinOp_VL>; -def riscv_smax_vl : SDNode<"RISCVISD::SMAX_VL", SDT_RISCVIntBinOp_VL>; -def riscv_umin_vl : SDNode<"RISCVISD::UMIN_VL", SDT_RISCVIntBinOp_VL>; -def riscv_umax_vl : SDNode<"RISCVISD::UMAX_VL", SDT_RISCVIntBinOp_VL>; +def riscv_smin_vl : SDNode<"RISCVISD::SMIN_VL", SDT_RISCVIntBinOp_VL, [SDNPCommutative]>; +def riscv_smax_vl : SDNode<"RISCVISD::SMAX_VL", SDT_RISCVIntBinOp_VL, [SDNPCommutative]>; +def riscv_umin_vl : SDNode<"RISCVISD::UMIN_VL", SDT_RISCVIntBinOp_VL, [SDNPCommutative]>; +def riscv_umax_vl : SDNode<"RISCVISD::UMAX_VL", SDT_RISCVIntBinOp_VL, [SDNPCommutative]>; -def riscv_saddsat_vl : SDNode<"RISCVISD::SADDSAT_VL", SDT_RISCVIntBinOp_VL>; -def riscv_uaddsat_vl : SDNode<"RISCVISD::UADDSAT_VL", SDT_RISCVIntBinOp_VL>; +def riscv_saddsat_vl : SDNode<"RISCVISD::SADDSAT_VL", SDT_RISCVIntBinOp_VL, [SDNPCommutative]>; +def riscv_uaddsat_vl : SDNode<"RISCVISD::UADDSAT_VL", SDT_RISCVIntBinOp_VL, [SDNPCommutative]>; def riscv_ssubsat_vl : SDNode<"RISCVISD::SSUBSAT_VL", SDT_RISCVIntBinOp_VL>; def riscv_usubsat_vl : SDNode<"RISCVISD::USUBSAT_VL", SDT_RISCVIntBinOp_VL>; @@ -94,8 +94,8 @@ def riscv_fneg_vl : SDNode<"RISCVISD::FNEG_VL", SDT_RISCVFPUnOp_VL>; def riscv_fabs_vl : SDNode<"RISCVISD::FABS_VL", SDT_RISCVFPUnOp_VL>; def riscv_fsqrt_vl : SDNode<"RISCVISD::FSQRT_VL", SDT_RISCVFPUnOp_VL>; def riscv_fcopysign_vl : SDNode<"RISCVISD::FCOPYSIGN_VL", SDT_RISCVFPBinOp_VL>; -def riscv_fminnum_vl : SDNode<"RISCVISD::FMINNUM_VL", SDT_RISCVFPBinOp_VL>; -def riscv_fmaxnum_vl : SDNode<"RISCVISD::FMAXNUM_VL", SDT_RISCVFPBinOp_VL>; +def riscv_fminnum_vl : SDNode<"RISCVISD::FMINNUM_VL", SDT_RISCVFPBinOp_VL, [SDNPCommutative]>; +def riscv_fmaxnum_vl : SDNode<"RISCVISD::FMAXNUM_VL", SDT_RISCVFPBinOp_VL, [SDNPCommutative]>; def SDT_RISCVVecFMA_VL : SDTypeProfile<1, 5, [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td index 9532d1dd3dd2..02ae4f88d56a 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td +++ 
b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td @@ -83,13 +83,13 @@ def shfl_uimm : Operand<XLenVT>, ImmLeaf<XLenVT, [{ def BCLRXForm : SDNodeXForm<imm, [{ // Find the lowest 0. - return CurDAG->getTargetConstant(N->getAPIntValue().countTrailingOnes(), + return CurDAG->getTargetConstant(countTrailingOnes(N->getZExtValue()), SDLoc(N), N->getValueType(0)); }]>; def BSETINVXForm : SDNodeXForm<imm, [{ // Find the lowest 1. - return CurDAG->getTargetConstant(N->getAPIntValue().countTrailingZeros(), + return CurDAG->getTargetConstant(countTrailingZeros(N->getZExtValue()), SDLoc(N), N->getValueType(0)); }]>; @@ -239,6 +239,10 @@ def non_imm12 : PatLeaf<(XLenVT GPR:$a), [{ return !C || !isInt<12>(C->getSExtValue()); }]>; +def sh1add_op : ComplexPattern<XLenVT, 1, "selectSH1ADDOp", [], [], 6>; +def sh2add_op : ComplexPattern<XLenVT, 1, "selectSH2ADDOp", [], [], 6>; +def sh3add_op : ComplexPattern<XLenVT, 1, "selectSH3ADDOp", [], [], 6>; + //===----------------------------------------------------------------------===// // Instruction class templates //===----------------------------------------------------------------------===// @@ -1095,6 +1099,14 @@ def : Pat<(add (shl GPR:$rs1, (XLenVT 2)), non_imm12:$rs2), def : Pat<(add (shl GPR:$rs1, (XLenVT 3)), non_imm12:$rs2), (SH3ADD GPR:$rs1, GPR:$rs2)>; +// More complex cases use a ComplexPattern. +def : Pat<(add sh1add_op:$rs1, non_imm12:$rs2), + (SH1ADD sh1add_op:$rs1, GPR:$rs2)>; +def : Pat<(add sh2add_op:$rs1, non_imm12:$rs2), + (SH2ADD sh2add_op:$rs1, GPR:$rs2)>; +def : Pat<(add sh3add_op:$rs1, non_imm12:$rs2), + (SH3ADD sh3add_op:$rs1, GPR:$rs2)>; + def : Pat<(add (mul_oneuse GPR:$rs1, (XLenVT 6)), GPR:$rs2), (SH1ADD (SH1ADD GPR:$rs1, GPR:$rs1), GPR:$rs2)>; def : Pat<(add (mul_oneuse GPR:$rs1, (XLenVT 10)), GPR:$rs2), @@ -1190,18 +1202,6 @@ def : Pat<(i64 (add (and GPR:$rs1, 0x3FFFFFFFC), non_imm12:$rs2)), (SH2ADD_UW (SRLI GPR:$rs1, 2), GPR:$rs2)>; def : Pat<(i64 (add (and GPR:$rs1, 0x7FFFFFFF8), non_imm12:$rs2)), (SH3ADD_UW (SRLI GPR:$rs1, 3), GPR:$rs2)>; - -// Use SRLIW to shift out the LSBs and zero the upper 32-bits. Use SHXADD to -// shift zeros into the LSBs the addition shl amount. -def : Pat<(i64 (add (shl (binop_oneuse<and> GPR:$rs1, 0xFFFFFFFE), (i64 1)), - non_imm12:$rs2)), - (SH2ADD (SRLIW GPR:$rs1, 1), GPR:$rs2)>; -def : Pat<(i64 (add (shl (binop_oneuse<and> GPR:$rs1, 0xFFFFFFFE), (i64 2)), - non_imm12:$rs2)), - (SH3ADD (SRLIW GPR:$rs1, 1), GPR:$rs2)>; -def : Pat<(i64 (add (shl (binop_oneuse<and> GPR:$rs1, 0xFFFFFFFC), (i64 1)), - non_imm12:$rs2)), - (SH3ADD (SRLIW GPR:$rs1, 2), GPR:$rs2)>; } // Predicates = [HasStdExtZba, IsRV64] let Predicates = [HasStdExtZbcOrZbkc] in { diff --git a/llvm/lib/Target/RISCV/RISCVMakeCompressible.cpp b/llvm/lib/Target/RISCV/RISCVMakeCompressible.cpp index 1fc424411c12..dad0aa476471 100644 --- a/llvm/lib/Target/RISCV/RISCVMakeCompressible.cpp +++ b/llvm/lib/Target/RISCV/RISCVMakeCompressible.cpp @@ -293,8 +293,16 @@ static void updateOperands(MachineInstr &MI, RegImmPair OldRegImm, assert((isCompressibleLoad(MI) || isCompressibleStore(MI)) && "Unsupported instruction for this optimization."); + int SkipN = 0; + + // Skip the first (value) operand to a store instruction (except if the store + // offset is zero) in order to avoid an incorrect transformation. + // e.g. 
do not turn sd a0, 808(a0) into addi a2, a0, 768; sd a2, 40(a2), which + // would also rewrite the stored value; the correct result keeps a0 as the + // stored source: addi a2, a0, 768; sd a0, 40(a2) + if (isCompressibleStore(MI) && OldRegImm.Imm != 0) + SkipN = 1; + // Update registers - for (MachineOperand &MO : MI.operands()) + for (MachineOperand &MO : drop_begin(MI.operands(), SkipN)) if (MO.isReg() && MO.getReg() == OldRegImm.Reg) { // Do not update operands that define the old register. // diff --git a/llvm/lib/Target/RISCV/RISCVScheduleV.td b/llvm/lib/Target/RISCV/RISCVScheduleV.td index 43af1802d706..bafcf47b82e4 100644 --- a/llvm/lib/Target/RISCV/RISCVScheduleV.td +++ b/llvm/lib/Target/RISCV/RISCVScheduleV.td @@ -53,6 +53,20 @@ def WriteVLDFF8 : SchedWrite; def WriteVLDFF16 : SchedWrite; def WriteVLDFF32 : SchedWrite; def WriteVLDFF64 : SchedWrite; +// 7.8. Vector Segment Instructions +foreach nf=2-8 in { + foreach eew = [8, 16, 32, 64] in { + def WriteVLSEG # nf # e # eew : SchedWrite; + def WriteVSSEG # nf # e # eew : SchedWrite; + def WriteVLSEGFF # nf # e # eew : SchedWrite; + def WriteVLSSEG # nf # e # eew : SchedWrite; + def WriteVSSSEG # nf # e # eew : SchedWrite; + def WriteVLUXSEG # nf # e # eew : SchedWrite; + def WriteVLOXSEG # nf # e # eew : SchedWrite; + def WriteVSUXSEG # nf # e # eew : SchedWrite; + def WriteVSOXSEG # nf # e # eew : SchedWrite; + } +} // 7.9. Vector Whole Register Instructions def WriteVLD1R8 : SchedWrite; def WriteVLD1R16 : SchedWrite; @@ -538,6 +552,20 @@ def : WriteRes<WriteVST1R, []>; def : WriteRes<WriteVST2R, []>; def : WriteRes<WriteVST4R, []>; def : WriteRes<WriteVST8R, []>; +// Vector Segment Loads and Stores +foreach nf=2-8 in { + foreach eew = [8, 16, 32, 64] in { + def : WriteRes <!cast<SchedWrite>("WriteVLSEG" # nf # "e" # eew), []>; + def : WriteRes <!cast<SchedWrite>("WriteVLSEGFF" # nf # "e" # eew), []>; + def : WriteRes <!cast<SchedWrite>("WriteVSSEG" # nf # "e" # eew), []>; + def : WriteRes <!cast<SchedWrite>("WriteVLSSEG" # nf # "e" # eew), []>; + def : WriteRes <!cast<SchedWrite>("WriteVSSSEG" # nf # "e" # eew), []>; + def : WriteRes <!cast<SchedWrite>("WriteVLUXSEG" # nf # "e" # eew), []>; + def : WriteRes <!cast<SchedWrite>("WriteVLOXSEG" # nf # "e" # eew), []>; + def : WriteRes <!cast<SchedWrite>("WriteVSUXSEG" # nf # "e" # eew), []>; + def : WriteRes <!cast<SchedWrite>("WriteVSOXSEG" # nf # "e" # eew), []>; + } +} // 12. Vector Integer Arithmetic Instructions def : WriteRes<WriteVIALUV, []>; diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h index 7caf0fedb2ca..96c46fb7554f 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h @@ -57,6 +57,10 @@ public: bool shouldExpandReduction(const IntrinsicInst *II) const; bool supportsScalableVectors() const { return ST->hasVInstructions(); } + PredicationStyle emitGetActiveLaneMask() const { + return ST->hasVInstructions() ?
PredicationStyle::Data + : PredicationStyle::None; + } Optional<unsigned> getMaxVScale() const; Optional<unsigned> getVScaleForTuning() const; diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVMCCodeEmitter.cpp b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVMCCodeEmitter.cpp index d953bc590473..f726f42c9bcb 100644 --- a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVMCCodeEmitter.cpp +++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVMCCodeEmitter.cpp @@ -46,12 +46,6 @@ public: void encodeInstruction(const MCInst &MI, raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const override; - -private: - FeatureBitset computeAvailableFeatures(const FeatureBitset &FB) const; - void - verifyInstructionPredicates(const MCInst &MI, - const FeatureBitset &AvailableFeatures) const; }; } // end anonymous namespace @@ -110,9 +104,6 @@ static void emitUntypedInstrOperands(const MCInst &MI, EndianWriter &OSE) { void SPIRVMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const { - auto Features = computeAvailableFeatures(STI.getFeatureBits()); - verifyInstructionPredicates(MI, Features); - EndianWriter OSE(OS, support::little); // Encode the first 32 SPIR-V bytes with the number of args and the opcode. @@ -128,5 +119,4 @@ void SPIRVMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, emitUntypedInstrOperands(MI, OSE); } -#define ENABLE_INSTR_PREDICATE_VERIFIER #include "SPIRVGenMCCodeEmitter.inc" diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVMCTargetDesc.cpp b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVMCTargetDesc.cpp index 6b8b4a73af92..62ce15550ae7 100644 --- a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVMCTargetDesc.cpp +++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVMCTargetDesc.cpp @@ -22,6 +22,7 @@ #include "llvm/MC/TargetRegistry.h" #define GET_INSTRINFO_MC_DESC +#define ENABLE_INSTR_PREDICATE_VERIFIER #include "SPIRVGenInstrInfo.inc" #define GET_SUBTARGETINFO_MC_DESC diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVMCTargetDesc.h b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVMCTargetDesc.h index 4009fa96aa68..abc8df34be0a 100644 --- a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVMCTargetDesc.h +++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVMCTargetDesc.h @@ -44,6 +44,7 @@ std::unique_ptr<MCObjectTargetWriter> createSPIRVObjectTargetWriter(); // Defines symbolic names for the SPIR-V instructions. 
#define GET_INSTRINFO_ENUM +#define GET_INSTRINFO_MC_HELPER_DECLS #include "SPIRVGenInstrInfo.inc" #define GET_SUBTARGETINFO_ENUM diff --git a/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp b/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp index 0de232651377..605bf949187f 100644 --- a/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp @@ -215,6 +215,9 @@ void SPIRVAsmPrinter::outputInstruction(const MachineInstr *MI) { } void SPIRVAsmPrinter::emitInstruction(const MachineInstr *MI) { + SPIRV_MC::verifyInstructionPredicates(MI->getOpcode(), + getSubtargetInfo().getFeatureBits()); + if (!MAI->getSkipEmission(MI)) outputInstruction(MI); diff --git a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp index df07a126eeea..5b6b82aebf30 100644 --- a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp @@ -68,6 +68,7 @@ bool SPIRVCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, ArrayRef<ArrayRef<Register>> VRegs, FunctionLoweringInfo &FLI) const { assert(GR && "Must initialize the SPIRV type registry before lowering args."); + GR->setCurrentFunc(MIRBuilder.getMF()); // Assign types and names to all args, and store their types for later. SmallVector<Register, 4> ArgTypeVRegs; @@ -114,6 +115,8 @@ bool SPIRVCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, auto MRI = MIRBuilder.getMRI(); Register FuncVReg = MRI->createGenericVirtualRegister(LLT::scalar(32)); MRI->setRegClass(FuncVReg, &SPIRV::IDRegClass); + if (F.isDeclaration()) + GR->add(&F, &MIRBuilder.getMF(), FuncVReg); auto *FTy = F.getFunctionType(); auto FuncTy = GR->assignTypeToVReg(FTy, FuncVReg, MIRBuilder); @@ -136,6 +139,8 @@ bool SPIRVCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, MIRBuilder.buildInstr(SPIRV::OpFunctionParameter) .addDef(VRegs[i][0]) .addUse(ArgTypeVRegs[i]); + if (F.isDeclaration()) + GR->add(F.getArg(i), &MIRBuilder.getMF(), VRegs[i][0]); } // Name the function. if (F.hasName()) @@ -165,6 +170,7 @@ bool SPIRVCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, if (Info.OrigRet.Regs.size() > 1) return false; + GR->setCurrentFunc(MIRBuilder.getMF()); Register ResVReg = Info.OrigRet.Regs.empty() ? Register(0) : Info.OrigRet.Regs[0]; // Emit a regular OpFunctionCall. If it's an externally declared function, diff --git a/llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.cpp b/llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.cpp new file mode 100644 index 000000000000..57cd4bafd351 --- /dev/null +++ b/llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.cpp @@ -0,0 +1,95 @@ +//===-- SPIRVDuplicatesTracker.cpp - SPIR-V Duplicates Tracker --*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// General infrastructure for keeping track of the values that according to +// the SPIR-V binary layout should be global to the whole module. 
+// +//===----------------------------------------------------------------------===// + +#include "SPIRVDuplicatesTracker.h" + +using namespace llvm; + +template <typename T> +void SPIRVGeneralDuplicatesTracker::prebuildReg2Entry( + SPIRVDuplicatesTracker<T> &DT, SPIRVReg2EntryTy &Reg2Entry) { + for (auto &TPair : DT.getAllUses()) { + for (auto &RegPair : TPair.second) { + const MachineFunction *MF = RegPair.first; + Register R = RegPair.second; + MachineInstr *MI = MF->getRegInfo().getUniqueVRegDef(R); + if (!MI) + continue; + Reg2Entry[&MI->getOperand(0)] = &TPair.second; + } + } +} + +void SPIRVGeneralDuplicatesTracker::buildDepsGraph( + std::vector<SPIRV::DTSortableEntry *> &Graph, + MachineModuleInfo *MMI = nullptr) { + SPIRVReg2EntryTy Reg2Entry; + prebuildReg2Entry(TT, Reg2Entry); + prebuildReg2Entry(CT, Reg2Entry); + prebuildReg2Entry(GT, Reg2Entry); + prebuildReg2Entry(FT, Reg2Entry); + prebuildReg2Entry(AT, Reg2Entry); + + for (auto &Op2E : Reg2Entry) { + SPIRV::DTSortableEntry *E = Op2E.second; + Graph.push_back(E); + for (auto &U : *E) { + const MachineRegisterInfo &MRI = U.first->getRegInfo(); + MachineInstr *MI = MRI.getUniqueVRegDef(U.second); + if (!MI) + continue; + assert(MI && MI->getParent() && "No MachineInstr created yet"); + for (auto i = MI->getNumDefs(); i < MI->getNumOperands(); i++) { + MachineOperand &Op = MI->getOperand(i); + if (!Op.isReg()) + continue; + MachineOperand *RegOp = &MRI.getVRegDef(Op.getReg())->getOperand(0); + assert((MI->getOpcode() == SPIRV::OpVariable && i == 3) || + Reg2Entry.count(RegOp)); + if (Reg2Entry.count(RegOp)) + E->addDep(Reg2Entry[RegOp]); + } + + if (E->getIsFunc()) { + MachineInstr *Next = MI->getNextNode(); + if (Next && (Next->getOpcode() == SPIRV::OpFunction || + Next->getOpcode() == SPIRV::OpFunctionParameter)) { + E->addDep(Reg2Entry[&Next->getOperand(0)]); + } + } + } + } + + if (MMI) { + const Module *M = MMI->getModule(); + for (auto F = M->begin(), E = M->end(); F != E; ++F) { + const MachineFunction *MF = MMI->getMachineFunction(*F); + if (!MF) + continue; + for (const MachineBasicBlock &MBB : *MF) { + for (const MachineInstr &CMI : MBB) { + MachineInstr &MI = const_cast<MachineInstr &>(CMI); + MI.dump(); + if (MI.getNumExplicitDefs() > 0 && + Reg2Entry.count(&MI.getOperand(0))) { + dbgs() << "\t["; + for (SPIRV::DTSortableEntry *D : + Reg2Entry.lookup(&MI.getOperand(0))->getDeps()) + dbgs() << Register::virtReg2Index(D->lookup(MF)) << ", "; + dbgs() << "]\n"; + } + } + } + } + } +}
\ No newline at end of file diff --git a/llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.h b/llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.h new file mode 100644 index 000000000000..58ae1f86ce42 --- /dev/null +++ b/llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.h @@ -0,0 +1,174 @@ +//===-- SPIRVDuplicatesTracker.h - SPIR-V Duplicates Tracker ----*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// General infrastructure for keeping track of the values that according to +// the SPIR-V binary layout should be global to the whole module. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_SPIRV_SPIRVDUPLICATESTRACKER_H +#define LLVM_LIB_TARGET_SPIRV_SPIRVDUPLICATESTRACKER_H + +#include "MCTargetDesc/SPIRVBaseInfo.h" +#include "MCTargetDesc/SPIRVMCTargetDesc.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/MapVector.h" +#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" +#include "llvm/CodeGen/MachineModuleInfo.h" + +#include <type_traits> + +namespace llvm { +namespace SPIRV { +// NOTE: using MapVector instead of DenseMap because it helps getting +// everything ordered in a stable manner for a price of extra (NumKeys)*PtrSize +// memory and expensive removals which do not happen anyway. +class DTSortableEntry : public MapVector<const MachineFunction *, Register> { + SmallVector<DTSortableEntry *, 2> Deps; + + struct FlagsTy { + unsigned IsFunc : 1; + unsigned IsGV : 1; + // NOTE: bit-field default init is a C++20 feature. + FlagsTy() : IsFunc(0), IsGV(0) {} + }; + FlagsTy Flags; + +public: + // Common hoisting utility doesn't support function, because their hoisting + // require hoisting of params as well. + bool getIsFunc() const { return Flags.IsFunc; } + bool getIsGV() const { return Flags.IsGV; } + void setIsFunc(bool V) { Flags.IsFunc = V; } + void setIsGV(bool V) { Flags.IsGV = V; } + + const SmallVector<DTSortableEntry *, 2> &getDeps() const { return Deps; } + void addDep(DTSortableEntry *E) { Deps.push_back(E); } +}; +} // namespace SPIRV + +template <typename KeyTy> class SPIRVDuplicatesTrackerBase { +public: + // NOTE: using MapVector instead of DenseMap helps getting everything ordered + // in a stable manner for a price of extra (NumKeys)*PtrSize memory and + // expensive removals which don't happen anyway. 
+ using StorageTy = MapVector<KeyTy, SPIRV::DTSortableEntry>; + +private: + StorageTy Storage; + +public: + void add(KeyTy V, const MachineFunction *MF, Register R) { + if (find(V, MF).isValid()) + return; + + Storage[V][MF] = R; + if (std::is_same<Function, + typename std::remove_const< + typename std::remove_pointer<KeyTy>::type>::type>() || + std::is_same<Argument, + typename std::remove_const< + typename std::remove_pointer<KeyTy>::type>::type>()) + Storage[V].setIsFunc(true); + if (std::is_same<GlobalVariable, + typename std::remove_const< + typename std::remove_pointer<KeyTy>::type>::type>()) + Storage[V].setIsGV(true); + } + + Register find(KeyTy V, const MachineFunction *MF) const { + auto iter = Storage.find(V); + if (iter != Storage.end()) { + auto Map = iter->second; + auto iter2 = Map.find(MF); + if (iter2 != Map.end()) + return iter2->second; + } + return Register(); + } + + const StorageTy &getAllUses() const { return Storage; } + +private: + StorageTy &getAllUses() { return Storage; } + + // The friend class needs to have access to the internal storage + // to be able to build dependency graph, can't declare only one + // function a 'friend' due to the incomplete declaration at this point + // and mutual dependency problems. + friend class SPIRVGeneralDuplicatesTracker; +}; + +template <typename T> +class SPIRVDuplicatesTracker : public SPIRVDuplicatesTrackerBase<const T *> {}; + +class SPIRVGeneralDuplicatesTracker { + SPIRVDuplicatesTracker<Type> TT; + SPIRVDuplicatesTracker<Constant> CT; + SPIRVDuplicatesTracker<GlobalVariable> GT; + SPIRVDuplicatesTracker<Function> FT; + SPIRVDuplicatesTracker<Argument> AT; + + // NOTE: using MOs instead of regs to get rid of MF dependency to be able + // to use flat data structure. + // NOTE: replacing DenseMap with MapVector doesn't affect overall correctness + // but makes LITs more stable, should prefer DenseMap still due to + // significant perf difference. + using SPIRVReg2EntryTy = + MapVector<MachineOperand *, SPIRV::DTSortableEntry *>; + + template <typename T> + void prebuildReg2Entry(SPIRVDuplicatesTracker<T> &DT, + SPIRVReg2EntryTy &Reg2Entry); + +public: + void buildDepsGraph(std::vector<SPIRV::DTSortableEntry *> &Graph, + MachineModuleInfo *MMI); + + void add(const Type *T, const MachineFunction *MF, Register R) { + TT.add(T, MF, R); + } + + void add(const Constant *C, const MachineFunction *MF, Register R) { + CT.add(C, MF, R); + } + + void add(const GlobalVariable *GV, const MachineFunction *MF, Register R) { + GT.add(GV, MF, R); + } + + void add(const Function *F, const MachineFunction *MF, Register R) { + FT.add(F, MF, R); + } + + void add(const Argument *Arg, const MachineFunction *MF, Register R) { + AT.add(Arg, MF, R); + } + + Register find(const Type *T, const MachineFunction *MF) { + return TT.find(const_cast<Type *>(T), MF); + } + + Register find(const Constant *C, const MachineFunction *MF) { + return CT.find(const_cast<Constant *>(C), MF); + } + + Register find(const GlobalVariable *GV, const MachineFunction *MF) { + return GT.find(const_cast<GlobalVariable *>(GV), MF); + } + + Register find(const Function *F, const MachineFunction *MF) { + return FT.find(const_cast<Function *>(F), MF); + } + + Register find(const Argument *Arg, const MachineFunction *MF) { + return AT.find(const_cast<Argument *>(Arg), MF); + } +}; +} // namespace llvm +#endif
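A usage sketch may help here (editorial addition; Ty is a hypothetical const Type *, MF a const MachineFunction *, and R1/R2 hypothetical Registers; only the member functions declared above are assumed):

    SPIRVGeneralDuplicatesTracker DT;
    DT.add(Ty, MF, R1);            // first sighting of Ty in MF: records R1
    DT.add(Ty, MF, R2);            // no-op: Ty already has a register in MF
    Register R = DT.find(Ty, MF);  // yields R1; an invalid Register() if absent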
\ No newline at end of file diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp index 02a6905a1abc..5f890c003cbc 100644 --- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp @@ -101,7 +101,6 @@ Register SPIRVGlobalRegistry::buildConstantInt(uint64_t Val, SPIRVType *SpvType, bool EmitIR) { auto &MF = MIRBuilder.getMF(); - Register Res; const IntegerType *LLVMIntTy; if (SpvType) LLVMIntTy = cast<IntegerType>(getTypeForSPIRVType(SpvType)); @@ -110,15 +109,18 @@ Register SPIRVGlobalRegistry::buildConstantInt(uint64_t Val, // Find a constant in DT or build a new one. const auto ConstInt = ConstantInt::get(const_cast<IntegerType *>(LLVMIntTy), Val); - unsigned BitWidth = SpvType ? getScalarOrVectorBitWidth(SpvType) : 32; - Res = MF.getRegInfo().createGenericVirtualRegister(LLT::scalar(BitWidth)); - assignTypeToVReg(LLVMIntTy, Res, MIRBuilder); - if (EmitIR) - MIRBuilder.buildConstant(Res, *ConstInt); - else - MIRBuilder.buildInstr(SPIRV::OpConstantI) - .addDef(Res) - .addImm(ConstInt->getSExtValue()); + Register Res = DT.find(ConstInt, &MF); + if (!Res.isValid()) { + unsigned BitWidth = SpvType ? getScalarOrVectorBitWidth(SpvType) : 32; + Res = MF.getRegInfo().createGenericVirtualRegister(LLT::scalar(BitWidth)); + assignTypeToVReg(LLVMIntTy, Res, MIRBuilder); + if (EmitIR) + MIRBuilder.buildConstant(Res, *ConstInt); + else + MIRBuilder.buildInstr(SPIRV::OpConstantI) + .addDef(Res) + .addImm(ConstInt->getSExtValue()); + } return Res; } @@ -126,7 +128,6 @@ Register SPIRVGlobalRegistry::buildConstantFP(APFloat Val, MachineIRBuilder &MIRBuilder, SPIRVType *SpvType) { auto &MF = MIRBuilder.getMF(); - Register Res; const Type *LLVMFPTy; if (SpvType) { LLVMFPTy = getTypeForSPIRVType(SpvType); @@ -136,10 +137,13 @@ Register SPIRVGlobalRegistry::buildConstantFP(APFloat Val, } // Find a constant in DT or build a new one. const auto ConstFP = ConstantFP::get(LLVMFPTy->getContext(), Val); - unsigned BitWidth = SpvType ? getScalarOrVectorBitWidth(SpvType) : 32; - Res = MF.getRegInfo().createGenericVirtualRegister(LLT::scalar(BitWidth)); - assignTypeToVReg(LLVMFPTy, Res, MIRBuilder); - MIRBuilder.buildFConstant(Res, *ConstFP); + Register Res = DT.find(ConstFP, &MF); + if (!Res.isValid()) { + unsigned BitWidth = SpvType ? getScalarOrVectorBitWidth(SpvType) : 32; + Res = MF.getRegInfo().createGenericVirtualRegister(LLT::scalar(BitWidth)); + assignTypeToVReg(LLVMFPTy, Res, MIRBuilder); + MIRBuilder.buildFConstant(Res, *ConstFP); + } return Res; } @@ -184,6 +188,7 @@ Register SPIRVGlobalRegistry::buildGlobalVariable( *Subtarget.getRegBankInfo()); } Reg = MIB->getOperand(0).getReg(); + DT.add(GVar, &MIRBuilder.getMF(), Reg); // Set to Reg the same type as ResVReg has. 
auto MRI = MIRBuilder.getMRI(); @@ -318,10 +323,11 @@ SPIRVType *SPIRVGlobalRegistry::getSPIRVTypeForVReg(Register VReg) const { SPIRVType *SPIRVGlobalRegistry::getOrCreateSPIRVType( const Type *Type, MachineIRBuilder &MIRBuilder, SPIRV::AccessQualifier AccessQual, bool EmitIR) { + Register Reg = DT.find(Type, &MIRBuilder.getMF()); + if (Reg.isValid()) + return getSPIRVTypeForVReg(Reg); SPIRVType *SpirvType = createSPIRVType(Type, MIRBuilder, AccessQual, EmitIR); - VRegToTypeMap[&MIRBuilder.getMF()][getSPIRVTypeID(SpirvType)] = SpirvType; - SPIRVToLLVMType[SpirvType] = Type; - return SpirvType; + return restOfCreateSPIRVType(Type, SpirvType); } bool SPIRVGlobalRegistry::isScalarOfType(Register VReg, @@ -387,17 +393,21 @@ SPIRVGlobalRegistry::getOrCreateSPIRVIntegerType(unsigned BitWidth, MIRBuilder); } -SPIRVType *SPIRVGlobalRegistry::restOfCreateSPIRVType(Type *LLVMTy, - MachineInstrBuilder MIB) { - SPIRVType *SpirvType = MIB; +SPIRVType *SPIRVGlobalRegistry::restOfCreateSPIRVType(const Type *LLVMTy, + SPIRVType *SpirvType) { + assert(CurMF == SpirvType->getMF()); VRegToTypeMap[CurMF][getSPIRVTypeID(SpirvType)] = SpirvType; SPIRVToLLVMType[SpirvType] = LLVMTy; + DT.add(LLVMTy, CurMF, getSPIRVTypeID(SpirvType)); return SpirvType; } SPIRVType *SPIRVGlobalRegistry::getOrCreateSPIRVIntegerType( unsigned BitWidth, MachineInstr &I, const SPIRVInstrInfo &TII) { Type *LLVMTy = IntegerType::get(CurMF->getFunction().getContext(), BitWidth); + Register Reg = DT.find(LLVMTy, CurMF); + if (Reg.isValid()) + return getSPIRVTypeForVReg(Reg); MachineBasicBlock &BB = *I.getParent(); auto MIB = BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpTypeInt)) .addDef(createTypeVReg(CurMF->getRegInfo())) diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h index 952ab4c13e29..13dcc20a3e0a 100644 --- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h +++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h @@ -17,6 +17,7 @@ #define LLVM_LIB_TARGET_SPIRV_SPIRVTYPEMANAGER_H #include "MCTargetDesc/SPIRVBaseInfo.h" +#include "SPIRVDuplicatesTracker.h" #include "SPIRVInstrInfo.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" @@ -30,7 +31,10 @@ class SPIRVGlobalRegistry { // where Reg = OpType... // while VRegToTypeMap tracks SPIR-V type assigned to other regs (i.e. 
not // type-declaring ones) - DenseMap<MachineFunction *, DenseMap<Register, SPIRVType *>> VRegToTypeMap; + DenseMap<const MachineFunction *, DenseMap<Register, SPIRVType *>> + VRegToTypeMap; + + SPIRVGeneralDuplicatesTracker DT; DenseMap<SPIRVType *, const Type *> SPIRVToLLVMType; @@ -48,6 +52,39 @@ public: MachineFunction *CurMF; + void add(const Constant *C, MachineFunction *MF, Register R) { + DT.add(C, MF, R); + } + + void add(const GlobalVariable *GV, MachineFunction *MF, Register R) { + DT.add(GV, MF, R); + } + + void add(const Function *F, MachineFunction *MF, Register R) { + DT.add(F, MF, R); + } + + void add(const Argument *Arg, MachineFunction *MF, Register R) { + DT.add(Arg, MF, R); + } + + Register find(const Constant *C, MachineFunction *MF) { + return DT.find(C, MF); + } + + Register find(const GlobalVariable *GV, MachineFunction *MF) { + return DT.find(GV, MF); + } + + Register find(const Function *F, MachineFunction *MF) { + return DT.find(F, MF); + } + + void buildDepsGraph(std::vector<SPIRV::DTSortableEntry *> &Graph, + MachineModuleInfo *MMI = nullptr) { + DT.buildDepsGraph(Graph, MMI); + } + // Get or create a SPIR-V type corresponding the given LLVM IR type, // and map it to the given VReg by creating an ASSIGN_TYPE instruction. SPIRVType *assignTypeToVReg( @@ -136,7 +173,7 @@ private: SPIRVType *getOpTypeFunction(SPIRVType *RetType, const SmallVectorImpl<SPIRVType *> &ArgTypes, MachineIRBuilder &MIRBuilder); - SPIRVType *restOfCreateSPIRVType(Type *LLVMTy, MachineInstrBuilder MIB); + SPIRVType *restOfCreateSPIRVType(const Type *LLVMTy, SPIRVType *SpirvType); public: Register buildConstantInt(uint64_t Val, MachineIRBuilder &MIRBuilder, diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp index 9294a60506a8..90b921a06f21 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp @@ -807,23 +807,29 @@ void SPIRVInstructionSelector::renderImm32(MachineInstrBuilder &MIB, Register SPIRVInstructionSelector::buildI32Constant(uint32_t Val, MachineInstr &I, const SPIRVType *ResType) const { + Type *LLVMTy = IntegerType::get(GR.CurMF->getFunction().getContext(), 32); const SPIRVType *SpvI32Ty = ResType ? ResType : GR.getOrCreateSPIRVIntegerType(32, I, TII); - Register NewReg; - NewReg = MRI->createGenericVirtualRegister(LLT::scalar(32)); - MachineInstr *MI; - MachineBasicBlock &BB = *I.getParent(); - if (Val == 0) { - MI = BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpConstantNull)) - .addDef(NewReg) - .addUse(GR.getSPIRVTypeID(SpvI32Ty)); - } else { - MI = BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpConstantI)) - .addDef(NewReg) - .addUse(GR.getSPIRVTypeID(SpvI32Ty)) - .addImm(APInt(32, Val).getZExtValue()); + // Find a constant in DT or build a new one. 
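  // Editorial note on the effect of the lookup below: two requests for the
  // same i32 value from the same function now share one vreg, so a single
  // OpConstantI (or OpConstantNull for zero) is emitted instead of one copy
  // per use site.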
+ auto ConstInt = ConstantInt::get(LLVMTy, Val); + Register NewReg = GR.find(ConstInt, GR.CurMF); + if (!NewReg.isValid()) { + NewReg = MRI->createGenericVirtualRegister(LLT::scalar(32)); + GR.add(ConstInt, GR.CurMF, NewReg); + MachineInstr *MI; + MachineBasicBlock &BB = *I.getParent(); + if (Val == 0) { + MI = BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpConstantNull)) + .addDef(NewReg) + .addUse(GR.getSPIRVTypeID(SpvI32Ty)); + } else { + MI = BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpConstantI)) + .addDef(NewReg) + .addUse(GR.getSPIRVTypeID(SpvI32Ty)) + .addImm(APInt(32, Val).getZExtValue()); + } + constrainSelectedInstRegOperands(*MI, TII, TRI, RBI); + } - constrainSelectedInstRegOperands(*MI, TII, TRI, RBI); return NewReg; } diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp index fa78dd7942c6..a39df5234935 100644 --- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp @@ -28,6 +28,11 @@ using namespace llvm; #define DEBUG_TYPE "spirv-module-analysis" +static cl::opt<bool> + SPVDumpDeps("spv-dump-deps", + cl::desc("Dump MIR with SPIR-V dependencies info"), + cl::Optional, cl::init(false)); + char llvm::SPIRVModuleAnalysis::ID = 0; namespace llvm { @@ -113,6 +118,83 @@ static bool findSameInstrInMS(const MachineInstr &A, return false; } +// Collect the MI that defines the register in the given machine function. +static void collectDefInstr(Register Reg, const MachineFunction *MF, + SPIRV::ModuleAnalysisInfo *MAI, + SPIRV::ModuleSectionType MSType, + bool DoInsert = true) { + assert(MAI->hasRegisterAlias(MF, Reg) && "Cannot find register alias"); + MachineInstr *MI = MF->getRegInfo().getUniqueVRegDef(Reg); + assert(MI && "There should be an instruction that defines the register"); + MAI->setSkipEmission(MI); + if (DoInsert) + MAI->MS[MSType].push_back(MI); +} + +void SPIRVModuleAnalysis::collectGlobalEntities( + const std::vector<SPIRV::DTSortableEntry *> &DepsGraph, + SPIRV::ModuleSectionType MSType, + std::function<bool(const SPIRV::DTSortableEntry *)> Pred, + bool UsePreOrder) { + DenseSet<const SPIRV::DTSortableEntry *> Visited; + for (const auto *E : DepsGraph) { + std::function<void(const SPIRV::DTSortableEntry *)> RecHoistUtil; + // NOTE: we prefer the recursive approach over an iterative one because + // we don't expect dependency chains long enough to cause a stack overflow. + RecHoistUtil = [MSType, UsePreOrder, &Visited, &Pred, + &RecHoistUtil](const SPIRV::DTSortableEntry *E) { + if (Visited.count(E) || !Pred(E)) + return; + Visited.insert(E); + + // Traversing the deps graph in post-order lets us avoid a separate + // register-alias preprocessing step, but pre-order is required to + // process function declarations and their arguments correctly.
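collectGlobalEntities (continued below) hoists each deduplicated entity onto a single module-global register via this depth-first walk. Visiting dependencies first (post-order) ensures a type or constant is emitted only after everything it references, while the pre-order variant emits a function before its dependents, matching the OpFunction/OpFunctionParameter ordering. A toy model of the two visit orders, assuming a hypothetical Node type in place of SPIRV::DTSortableEntry:

    #include <iostream>
    #include <set>
    #include <vector>

    struct Node { // hypothetical stand-in for SPIRV::DTSortableEntry
      int Id;
      std::vector<const Node *> Deps;
    };

    void visit(const Node *N, std::set<const Node *> &Visited, bool UsePreOrder) {
      if (!Visited.insert(N).second)
        return;
      if (UsePreOrder) // functions: emit the entity before its dependencies
        std::cout << N->Id << ' ';
      for (const Node *D : N->Deps)
        visit(D, Visited, UsePreOrder);
      if (!UsePreOrder) // types/constants: dependencies first, entity last
        std::cout << N->Id << ' ';
    }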
+ if (!UsePreOrder) + for (auto *S : E->getDeps()) + RecHoistUtil(S); + + Register GlobalReg = Register::index2VirtReg(MAI.getNextID()); + bool IsFirst = true; + for (auto &U : *E) { + const MachineFunction *MF = U.first; + Register Reg = U.second; + MAI.setRegisterAlias(MF, Reg, GlobalReg); + if (!MF->getRegInfo().getUniqueVRegDef(Reg)) + continue; + collectDefInstr(Reg, MF, &MAI, MSType, IsFirst); + IsFirst = false; + if (E->getIsGV()) + MAI.GlobalVarList.push_back(MF->getRegInfo().getUniqueVRegDef(Reg)); + } + + if (UsePreOrder) + for (auto *S : E->getDeps()) + RecHoistUtil(S); + }; + RecHoistUtil(E); + } +} + +// This function initializes the global register alias table for types, +// consts, global vars and func decls, and collects these instructions for +// output at module level. It also collects explicit OpExtension/OpCapability +// instructions. +void SPIRVModuleAnalysis::processDefInstrs(const Module &M) { + std::vector<SPIRV::DTSortableEntry *> DepsGraph; + + GR->buildDepsGraph(DepsGraph, SPVDumpDeps ? MMI : nullptr); + + collectGlobalEntities( + DepsGraph, SPIRV::MB_TypeConstVars, + [](const SPIRV::DTSortableEntry *E) { return !E->getIsFunc(); }, false); + + collectGlobalEntities( + DepsGraph, SPIRV::MB_ExtFuncDecls, + [](const SPIRV::DTSortableEntry *E) { return E->getIsFunc(); }, true); +} + // Look for IDs declared with Import linkage, and map the imported name string // to the register defining that variable (which will usually be the result of // an OpFunction). This lets us call externally imported functions using @@ -146,10 +228,9 @@ void SPIRVModuleAnalysis::collectFuncNames(MachineInstr &MI, // numbering has already occurred by this point. We can directly compare reg // arguments when detecting duplicates. static void collectOtherInstr(MachineInstr &MI, SPIRV::ModuleAnalysisInfo &MAI, - SPIRV::ModuleSectionType MSType, - bool IsConstOrType = false) { + SPIRV::ModuleSectionType MSType) { MAI.setSkipEmission(&MI); - if (findSameInstrInMS(MI, MSType, MAI, IsConstOrType, IsConstOrType ? 1 : 0)) + if (findSameInstrInMS(MI, MSType, MAI, false)) return; // Found a duplicate, so don't add it. // No duplicates, so add it. MAI.MS[MSType].push_back(&MI); @@ -163,18 +244,11 @@ void SPIRVModuleAnalysis::processOtherInstrs(const Module &M) { continue; MachineFunction *MF = MMI->getMachineFunction(*F); assert(MF); - unsigned FCounter = 0; for (MachineBasicBlock &MBB : *MF) for (MachineInstr &MI : MBB) { - if (MI.getOpcode() == SPIRV::OpFunction) - FCounter++; if (MAI.getSkipEmission(&MI)) continue; const unsigned OpCode = MI.getOpcode(); - const bool IsFuncOrParm = - OpCode == SPIRV::OpFunction || OpCode == SPIRV::OpFunctionParameter; - const bool IsConstOrType = - TII->isConstantInstr(MI) || TII->isTypeDeclInstr(MI); if (OpCode == SPIRV::OpName || OpCode == SPIRV::OpMemberName) { collectOtherInstr(MI, MAI, SPIRV::MB_DebugNames); } else if (OpCode == SPIRV::OpEntryPoint) { @@ -182,12 +256,6 @@ void SPIRVModuleAnalysis::processOtherInstrs(const Module &M) { } else if (TII->isDecorationInstr(MI)) { collectOtherInstr(MI, MAI, SPIRV::MB_Annotations); collectFuncNames(MI, *F); - } else if (IsConstOrType || (FCounter > 1 && IsFuncOrParm)) { - // Now OpSpecConstant*s are not in DT, - // but they need to be collected anyway. - enum SPIRV::ModuleSectionType Type = - IsFuncOrParm ?
SPIRV::MB_ExtFuncDecls : SPIRV::MB_TypeConstVars; - collectOtherInstr(MI, MAI, Type, IsConstOrType); } else if (OpCode == SPIRV::OpFunction) { collectFuncNames(MI, *F); } @@ -239,6 +307,7 @@ bool SPIRVModuleAnalysis::runOnModule(Module &M) { // TODO: Process type/const/global var/func decl instructions, number their // destination registers from 0 to N, collect Extensions and Capabilities. + processDefInstrs(M); // Number rest of registers from N+1 onwards. numberRegistersGlobally(M); diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h index 1bef13d458c1..585868909d28 100644 --- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h +++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h @@ -15,6 +15,7 @@ #define LLVM_LIB_TARGET_SPIRV_SPIRVMODULEANALYSIS_H #include "MCTargetDesc/SPIRVBaseInfo.h" +#include "SPIRVDuplicatesTracker.h" #include "SPIRVSubtarget.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallVector.h" @@ -123,6 +124,11 @@ public: private: void setBaseInfo(const Module &M); template <typename T> void collectTypesConstsVars(); + void collectGlobalEntities( + const std::vector<SPIRV::DTSortableEntry *> &DepsGraph, + SPIRV::ModuleSectionType MSType, + std::function<bool(const SPIRV::DTSortableEntry *)> Pred, + bool UsePreOrder); void processDefInstrs(const Module &M); void collectFuncNames(MachineInstr &MI, const Function &F); void processOtherInstrs(const Module &M); diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp index d75d41b35838..ee460002fc58 100644 --- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp +++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp @@ -44,12 +44,11 @@ STATISTIC(MCNumEmitted, "Number of MC instructions emitted"); namespace { class SparcMCCodeEmitter : public MCCodeEmitter { - const MCInstrInfo &MCII; MCContext &Ctx; public: - SparcMCCodeEmitter(const MCInstrInfo &mcii, MCContext &ctx) - : MCII(mcii), Ctx(ctx) {} + SparcMCCodeEmitter(const MCInstrInfo &, MCContext &ctx) + : Ctx(ctx) {} SparcMCCodeEmitter(const SparcMCCodeEmitter &) = delete; SparcMCCodeEmitter &operator=(const SparcMCCodeEmitter &) = delete; ~SparcMCCodeEmitter() override = default; @@ -84,12 +83,6 @@ public: unsigned getBranchOnRegTargetOpValue(const MCInst &MI, unsigned OpNo, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const; - -private: - FeatureBitset computeAvailableFeatures(const FeatureBitset &FB) const; - void - verifyInstructionPredicates(const MCInst &MI, - const FeatureBitset &AvailableFeatures) const; }; } // end anonymous namespace @@ -97,9 +90,6 @@ private: void SparcMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const { - verifyInstructionPredicates(MI, - computeAvailableFeatures(STI.getFeatureBits())); - unsigned Bits = getBinaryCodeForInstr(MI, Fixups, STI); support::endian::write(OS, Bits, Ctx.getAsmInfo()->isLittleEndian() ? 
support::little @@ -253,7 +243,6 @@ getBranchOnRegTargetOpValue(const MCInst &MI, unsigned OpNo, return 0; } -#define ENABLE_INSTR_PREDICATE_VERIFIER #include "SparcGenMCCodeEmitter.inc" MCCodeEmitter *llvm::createSparcMCCodeEmitter(const MCInstrInfo &MCII, diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp index 49b75b7e0bd1..b11c786e7856 100644 --- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp +++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp @@ -24,6 +24,7 @@ using namespace llvm; #define GET_INSTRINFO_MC_DESC +#define ENABLE_INSTR_PREDICATE_VERIFIER #include "SparcGenInstrInfo.inc" #define GET_SUBTARGETINFO_MC_DESC diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h index 7ef043d9df40..8e6a9ebdb2dd 100644 --- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h +++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h @@ -46,6 +46,7 @@ std::unique_ptr<MCObjectTargetWriter> createSparcELFObjectWriter(bool Is64Bit, // Defines symbolic names for the Sparc instructions. // #define GET_INSTRINFO_ENUM +#define GET_INSTRINFO_MC_HELPER_DECLS #include "SparcGenInstrInfo.inc" #define GET_SUBTARGETINFO_ENUM diff --git a/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp b/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp index f6f9c0a1de81..c8961d507c72 100644 --- a/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp +++ b/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp @@ -250,6 +250,8 @@ void SparcAsmPrinter::LowerGETPCXAndEmitMCInsts(const MachineInstr *MI, } void SparcAsmPrinter::emitInstruction(const MachineInstr *MI) { + Sparc_MC::verifyInstructionPredicates(MI->getOpcode(), + getSubtargetInfo().getFeatureBits()); switch (MI->getOpcode()) { default: break; diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp index 242f566da2c9..1a71ff28424f 100644 --- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp +++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp @@ -150,23 +150,13 @@ private: return getPCRelEncoding(MI, OpNum, Fixups, SystemZ::FK_390_PC24DBL, 3, false); } - -private: - FeatureBitset computeAvailableFeatures(const FeatureBitset &FB) const; - void - verifyInstructionPredicates(const MCInst &MI, - const FeatureBitset &AvailableFeatures) const; }; } // end anonymous namespace -void SystemZMCCodeEmitter:: -encodeInstruction(const MCInst &MI, raw_ostream &OS, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const { - verifyInstructionPredicates(MI, - computeAvailableFeatures(STI.getFeatureBits())); - +void SystemZMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { MemOpsEmitted = 0; uint64_t Bits = getBinaryCodeForInstr(MI, Fixups, STI); unsigned Size = MCII.get(MI.getOpcode()).getSize(); @@ -329,7 +319,6 @@ SystemZMCCodeEmitter::getPCRelEncoding(const MCInst &MI, unsigned OpNum, return 0; } -#define ENABLE_INSTR_PREDICATE_VERIFIER #include "SystemZGenMCCodeEmitter.inc" MCCodeEmitter *llvm::createSystemZMCCodeEmitter(const MCInstrInfo &MCII, diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp index 03141ecf551d..08886507fdb7 100644 --- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp +++ 
b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp @@ -23,6 +23,7 @@ using namespace llvm; #define GET_INSTRINFO_MC_DESC +#define ENABLE_INSTR_PREDICATE_VERIFIER #include "SystemZGenInstrInfo.inc" #define GET_SUBTARGETINFO_MC_DESC diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h index db4485423416..f2bfc9ac48e5 100644 --- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h +++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h @@ -95,6 +95,7 @@ std::unique_ptr<MCObjectTargetWriter> createSystemZObjectWriter(uint8_t OSABI); // Defines symbolic names for the SystemZ instructions. #define GET_INSTRINFO_ENUM +#define GET_INSTRINFO_MC_HELPER_DECLS #include "SystemZGenInstrInfo.inc" #define GET_SUBTARGETINFO_ENUM diff --git a/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp b/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp index 6fb080607f51..1d55bf9a5804 100644 --- a/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp +++ b/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp @@ -143,6 +143,9 @@ void SystemZAsmPrinter::emitCallInformation(CallType CT) { } void SystemZAsmPrinter::emitInstruction(const MachineInstr *MI) { + SystemZ_MC::verifyInstructionPredicates(MI->getOpcode(), + getSubtargetInfo().getFeatureBits()); + SystemZMCInstLower Lower(MF->getContext(), *this); MCInst LoweredMI; switch (MI->getOpcode()) { diff --git a/llvm/lib/Target/SystemZ/SystemZCallingConv.td b/llvm/lib/Target/SystemZ/SystemZCallingConv.td index a7ea5e1e4bf8..fdd82a01f211 100644 --- a/llvm/lib/Target/SystemZ/SystemZCallingConv.td +++ b/llvm/lib/Target/SystemZ/SystemZCallingConv.td @@ -162,11 +162,7 @@ def CSR_SystemZ_NoRegs : CalleeSavedRegs<(add)>; //===----------------------------------------------------------------------===// // z/OS XPLINK64 callee-saved registers //===----------------------------------------------------------------------===// -// %R7D is volatile by the spec, but it must be saved in the prologue by -// any non-leaf function and restored in the epilogue for use by the -// return instruction so it functions exactly like a callee-saved register. -def CSR_SystemZ_XPLINK64 : CalleeSavedRegs<(add (sequence "R%dD", 7, 15), - (sequence "R%dD", 4, 4), +def CSR_SystemZ_XPLINK64 : CalleeSavedRegs<(add (sequence "R%dD", 8, 15), (sequence "F%dD", 15, 8))>; def CSR_SystemZ_XPLINK64_Vector : CalleeSavedRegs<(add CSR_SystemZ_XPLINK64, diff --git a/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp b/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp index 43bc7426cfa8..975eb8862e82 100644 --- a/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp +++ b/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp @@ -918,72 +918,74 @@ bool SystemZXPLINKFrameLowering::assignCalleeSavedSpillSlots( SystemZMachineFunctionInfo *MFI = MF.getInfo<SystemZMachineFunctionInfo>(); const SystemZSubtarget &Subtarget = MF.getSubtarget<SystemZSubtarget>(); auto &Regs = Subtarget.getSpecialRegisters<SystemZXPLINK64Registers>(); + auto &GRRegClass = SystemZ::GR64BitRegClass; + + // For non-leaf functions: + // - the address of callee (entry point) register R6 must be saved + CSI.push_back(CalleeSavedInfo(Regs.getAddressOfCalleeRegister())); + CSI.back().setRestored(false); + + // The return address register R7 must be saved and restored. + CSI.push_back(CalleeSavedInfo(Regs.getReturnFunctionAddressRegister())); + + // If the function needs a frame pointer, or if the backchain pointer should + // be stored, then save the stack pointer register R4. 
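The rewrite of assignCalleeSavedSpillSlots that follows replaces the old two-pass ProcessCSI lambda with a single scan tracking three bounds at once: the lowest-offset GPR that is spilled, the lowest-offset GPR that is also restored (R6 above is spilled but marked not-restored), and the highest-offset GPR overall. A distilled version of that scan over hypothetical (register, offset, restored) records:

    #include <climits>
    #include <vector>

    struct SavedReg { unsigned Reg; int Offset; bool Restored; };
    struct Bounds { unsigned LowSpill = 0, LowRestore = 0, High = 0; };

    Bounds scanSpillArea(const std::vector<SavedReg> &CSI) {
      Bounds B;
      int LowSpillOff = INT_MAX, LowRestoreOff = INT_MAX, HighOff = -1;
      for (const SavedReg &CS : CSI) {
        if (CS.Offset < 0)
          continue; // no slot in the fixed register save area
        if (CS.Offset < LowSpillOff) { LowSpillOff = CS.Offset; B.LowSpill = CS.Reg; }
        if (CS.Restored && CS.Offset < LowRestoreOff) {
          LowRestoreOff = CS.Offset;
          B.LowRestore = CS.Reg; // epilogue reloads start here, skipping R6
        }
        if (CS.Offset > HighOff) { HighOff = CS.Offset; B.High = CS.Reg; }
      }
      return B;
    }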
+ if (hasFP(MF) || MF.getFunction().hasFnAttribute("backchain")) + CSI.push_back(CalleeSavedInfo(Regs.getStackPointerRegister())); // Scan the call-saved GPRs and find the bounds of the register spill area. - unsigned LowGPR = 0; - int LowOffset = INT32_MAX; - unsigned HighGPR = LowGPR; + Register LowRestoreGPR = 0; + int LowRestoreOffset = INT32_MAX; + Register LowSpillGPR = 0; + int LowSpillOffset = INT32_MAX; + Register HighGPR = 0; int HighOffset = -1; - unsigned RegSP = Regs.getStackPointerRegister(); - auto &GRRegClass = SystemZ::GR64BitRegClass; - const unsigned RegSize = 8; + for (auto &CS : CSI) { + Register Reg = CS.getReg(); + int Offset = RegSpillOffsets[Reg]; + if (Offset >= 0) { + if (GRRegClass.contains(Reg)) { + if (LowSpillOffset > Offset) { + LowSpillOffset = Offset; + LowSpillGPR = Reg; + } + if (CS.isRestored() && LowRestoreOffset > Offset) { + LowRestoreOffset = Offset; + LowRestoreGPR = Reg; + } - auto ProcessCSI = [&](std::vector<CalleeSavedInfo> &CSIList) { - for (auto &CS : CSIList) { - Register Reg = CS.getReg(); - int Offset = RegSpillOffsets[Reg]; - if (Offset >= 0) { - if (GRRegClass.contains(Reg)) { - if (LowOffset > Offset) { - LowOffset = Offset; - LowGPR = Reg; - } - - if (Offset > HighOffset) { - HighOffset = Offset; - HighGPR = Reg; - } + if (Offset > HighOffset) { + HighOffset = Offset; + HighGPR = Reg; } + // Non-volatile GPRs are saved in the dedicated register save area at + // the bottom of the stack and are not truly part of the "normal" stack + // frame. Mark the frame index as NoAlloc to indicate it as such. + unsigned RegSize = 8; int FrameIdx = MFFrame.CreateFixedSpillStackObject(RegSize, Offset); CS.setFrameIdx(FrameIdx); - } else - CS.setFrameIdx(INT32_MAX); + MFFrame.setStackID(FrameIdx, TargetStackID::NoAlloc); + } + } else { + Register Reg = CS.getReg(); + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); + Align Alignment = TRI->getSpillAlign(*RC); + unsigned Size = TRI->getSpillSize(*RC); + Alignment = std::min(Alignment, getStackAlign()); + int FrameIdx = MFFrame.CreateStackObject(Size, Alignment, true); + CS.setFrameIdx(FrameIdx); } - }; - - std::vector<CalleeSavedInfo> Spills; - - // For non-leaf functions: - // - the address of callee (entry point) register R6 must be saved - Spills.push_back(CalleeSavedInfo(Regs.getAddressOfCalleeRegister())); - - // If the function needs a frame pointer, or if the backchain pointer should - // be stored, then save the stack pointer register R4. - if (hasFP(MF) || MF.getFunction().hasFnAttribute("backchain")) - Spills.push_back(CalleeSavedInfo(RegSP)); + } // Save the range of call-saved registers, for use by the // prologue/epilogue inserters. - ProcessCSI(CSI); - MFI->setRestoreGPRRegs(LowGPR, HighGPR, LowOffset); + if (LowRestoreGPR) + MFI->setRestoreGPRRegs(LowRestoreGPR, HighGPR, LowRestoreOffset); // Save the range of call-saved registers, for use by the epilogue inserter. - ProcessCSI(Spills); - MFI->setSpillGPRRegs(LowGPR, HighGPR, LowOffset); - - // Create spill slots for the remaining registers. 
- for (auto &CS : CSI) { - if (CS.getFrameIdx() != INT32_MAX) - continue; - Register Reg = CS.getReg(); - const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); - Align Alignment = TRI->getSpillAlign(*RC); - unsigned Size = TRI->getSpillSize(*RC); - Alignment = std::min(Alignment, getStackAlign()); - int FrameIdx = MFFrame.CreateStackObject(Size, Alignment, true); - CS.setFrameIdx(FrameIdx); - } + assert(LowSpillGPR && "Expected registers to spill"); + MFI->setSpillGPRRegs(LowSpillGPR, HighGPR, LowSpillOffset); return true; } @@ -1001,13 +1003,6 @@ void SystemZXPLINKFrameLowering::determineCalleeSaves(MachineFunction &MF, // frame pointer will be clobbered. if (HasFP) SavedRegs.set(Regs.getFramePointerRegister()); - - // If the function is not an XPLeaf function, we need to save the - // return address register. We also always use that register for - // the return instruction, so it needs to be restored in the - // epilogue even though that register is considered to be volatile. - // #TODO: Implement leaf detection. - SavedRegs.set(Regs.getReturnFunctionAddressRegister()); } bool SystemZXPLINKFrameLowering::spillCalleeSavedRegisters( diff --git a/llvm/lib/Target/TargetLoweringObjectFile.cpp b/llvm/lib/Target/TargetLoweringObjectFile.cpp index 8f633adbb9ef..29cc2840310d 100644 --- a/llvm/lib/Target/TargetLoweringObjectFile.cpp +++ b/llvm/lib/Target/TargetLoweringObjectFile.cpp @@ -240,6 +240,13 @@ SectionKind TargetLoweringObjectFile::getKindForGlobal(const GlobalObject *GO, return SectionKind::getBSS(); } + // Global variables with '!exclude' should get the exclude section kind if + // they have an explicit section and no other metadata. + if (GVar->hasSection()) + if (MDNode *MD = GVar->getMetadata(LLVMContext::MD_exclude)) + if (!MD->getNumOperands()) + return SectionKind::getExclude(); + // If the global is marked constant, we can put it into a mergable section, // a mergable string section, or general .data if it contains relocations. 
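An aside on the '!exclude' hunk above before the constant-mergeability logic continues: the new check fires only for a global that has an explicit section and carries an empty '!exclude' metadata node. A hedged sketch of producing such a global through the C++ API (the module, context, and names here are assumptions, not from the patch):

    // Assumes an existing Module &M and LLVMContext &Ctx; "my_dump" is made up.
    GlobalVariable *GV = M.getGlobalVariable("my_dump");
    GV->setSection(".dump");                        // explicit section required
    GV->setMetadata(LLVMContext::MD_exclude, MDNode::get(Ctx, {}));
    // getKindForGlobal() should now classify GV as SectionKind::getExclude().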
if (GVar->isConstant()) { diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEMCCodeEmitter.cpp b/llvm/lib/Target/VE/MCTargetDesc/VEMCCodeEmitter.cpp index 3eb246f73679..45facd34f84e 100644 --- a/llvm/lib/Target/VE/MCTargetDesc/VEMCCodeEmitter.cpp +++ b/llvm/lib/Target/VE/MCTargetDesc/VEMCCodeEmitter.cpp @@ -39,12 +39,11 @@ STATISTIC(MCNumEmitted, "Number of MC instructions emitted"); namespace { class VEMCCodeEmitter : public MCCodeEmitter { - const MCInstrInfo &MCII; MCContext &Ctx; public: - VEMCCodeEmitter(const MCInstrInfo &mcii, MCContext &ctx) - : MCII(mcii), Ctx(ctx) {} + VEMCCodeEmitter(const MCInstrInfo &, MCContext &ctx) + : Ctx(ctx) {} VEMCCodeEmitter(const VEMCCodeEmitter &) = delete; VEMCCodeEmitter &operator=(const VEMCCodeEmitter &) = delete; ~VEMCCodeEmitter() override = default; @@ -74,12 +73,6 @@ public: uint64_t getRDOpValue(const MCInst &MI, unsigned OpNo, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const; - -private: - FeatureBitset computeAvailableFeatures(const FeatureBitset &FB) const; - void - verifyInstructionPredicates(const MCInst &MI, - const FeatureBitset &AvailableFeatures) const; }; } // end anonymous namespace @@ -87,9 +80,6 @@ private: void VEMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const { - verifyInstructionPredicates(MI, - computeAvailableFeatures(STI.getFeatureBits())); - uint64_t Bits = getBinaryCodeForInstr(MI, Fixups, STI); support::endian::write<uint64_t>(OS, Bits, support::little); @@ -155,7 +145,6 @@ uint64_t VEMCCodeEmitter::getRDOpValue(const MCInst &MI, unsigned OpNo, return 0; } -#define ENABLE_INSTR_PREDICATE_VERIFIER #include "VEGenMCCodeEmitter.inc" MCCodeEmitter *llvm::createVEMCCodeEmitter(const MCInstrInfo &MCII, diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.cpp b/llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.cpp index f4fbf763e59c..5a562d77f941 100644 --- a/llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.cpp +++ b/llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.cpp @@ -24,6 +24,7 @@ using namespace llvm; #define GET_INSTRINFO_MC_DESC +#define ENABLE_INSTR_PREDICATE_VERIFIER #include "VEGenInstrInfo.inc" #define GET_SUBTARGETINFO_MC_DESC diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.h b/llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.h index d8f9d0634c24..935a0bfc0c4c 100644 --- a/llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.h +++ b/llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.h @@ -44,6 +44,7 @@ std::unique_ptr<MCObjectTargetWriter> createVEELFObjectWriter(uint8_t OSABI); // Defines symbolic names for the VE instructions. 
// #define GET_INSTRINFO_ENUM +#define GET_INSTRINFO_MC_HELPER_DECLS #include "VEGenInstrInfo.inc" #define GET_SUBTARGETINFO_ENUM diff --git a/llvm/lib/Target/VE/VEAsmPrinter.cpp b/llvm/lib/Target/VE/VEAsmPrinter.cpp index af69d04a17ca..5553087d6f47 100644 --- a/llvm/lib/Target/VE/VEAsmPrinter.cpp +++ b/llvm/lib/Target/VE/VEAsmPrinter.cpp @@ -325,6 +325,8 @@ void VEAsmPrinter::lowerGETTLSAddrAndEmitMCInsts(const MachineInstr *MI, } void VEAsmPrinter::emitInstruction(const MachineInstr *MI) { + VE_MC::verifyInstructionPredicates(MI->getOpcode(), + getSubtargetInfo().getFeatureBits()); switch (MI->getOpcode()) { default: diff --git a/llvm/lib/Target/VE/VEInstrInfo.td b/llvm/lib/Target/VE/VEInstrInfo.td index 85285749b4fa..e54453b31354 100644 --- a/llvm/lib/Target/VE/VEInstrInfo.td +++ b/llvm/lib/Target/VE/VEInstrInfo.td @@ -325,22 +325,22 @@ def VEMEMziiAsmOperand : AsmOperandClass { // ASX format uses single assembly instruction format. def MEMrri : Operand<iPTR> { let PrintMethod = "printMemASXOperand"; - let MIOperandInfo = (ops ptr_rc, ptr_rc, i32imm); + let MIOperandInfo = (ops ptr_rc, ptr_rc, i64imm); let ParserMatchClass = VEMEMrriAsmOperand; } def MEMrii : Operand<iPTR> { let PrintMethod = "printMemASXOperand"; - let MIOperandInfo = (ops ptr_rc, i32imm, i32imm); + let MIOperandInfo = (ops ptr_rc, i32imm, i64imm); let ParserMatchClass = VEMEMriiAsmOperand; } def MEMzri : Operand<iPTR> { let PrintMethod = "printMemASXOperand"; - let MIOperandInfo = (ops i32imm /* = 0 */, ptr_rc, i32imm); + let MIOperandInfo = (ops i32imm /* = 0 */, ptr_rc, i64imm); let ParserMatchClass = VEMEMzriAsmOperand; } def MEMzii : Operand<iPTR> { let PrintMethod = "printMemASXOperand"; - let MIOperandInfo = (ops i32imm /* = 0 */, i32imm, i32imm); + let MIOperandInfo = (ops i32imm /* = 0 */, i32imm, i64imm); let ParserMatchClass = VEMEMziiAsmOperand; } diff --git a/llvm/lib/Target/VE/VERegisterInfo.cpp b/llvm/lib/Target/VE/VERegisterInfo.cpp index d175ad26c742..f334af128162 100644 --- a/llvm/lib/Target/VE/VERegisterInfo.cpp +++ b/llvm/lib/Target/VE/VERegisterInfo.cpp @@ -27,6 +27,8 @@ using namespace llvm; +#define DEBUG_TYPE "ve-register-info" + #define GET_REGINFO_TARGET_DESC #include "VEGenRegisterInfo.inc" @@ -133,66 +135,179 @@ static unsigned offsetToDisp(MachineInstr &MI) { return OffDisp; } -static void replaceFI(MachineFunction &MF, MachineBasicBlock::iterator II, - MachineInstr &MI, const DebugLoc &dl, - unsigned FIOperandNum, int Offset, Register FrameReg) { - // Replace frame index with a frame pointer reference directly. - // VE has 32 bit offset field, so no need to expand a target instruction. - // Directly encode it. +class EliminateFrameIndex { + const TargetInstrInfo &TII; + const TargetRegisterInfo &TRI; + const DebugLoc &DL; + MachineBasicBlock &MBB; + MachineBasicBlock::iterator II; + Register clobber; + + // Some helper functions for the ease of instruction building. 
+ MachineFunction &getFunc() const { return *MBB.getParent(); } + inline MCRegister getSubReg(MCRegister Reg, unsigned Idx) const { + return TRI.getSubReg(Reg, Idx); + } + inline const MCInstrDesc &get(unsigned Opcode) const { + return TII.get(Opcode); + } + inline MachineInstrBuilder build(const MCInstrDesc &MCID, Register DestReg) { + return BuildMI(MBB, II, DL, MCID, DestReg); + } + inline MachineInstrBuilder build(unsigned InstOpc, Register DestReg) { + return build(get(InstOpc), DestReg); + } + inline MachineInstrBuilder build(const MCInstrDesc &MCID) { + return BuildMI(MBB, II, DL, MCID); + } + inline MachineInstrBuilder build(unsigned InstOpc) { + return build(get(InstOpc)); + } + + // Calculate the address of a frame index from a frame register and a + // given offset if the offset doesn't fit in the immediate field. Use a + // clobber register to hold the calculated address. + void prepareReplaceFI(MachineInstr &MI, Register &FrameReg, int64_t &Offset, + int64_t Bytes = 0); + // Replace the frame index in \p MI with a frame register and a given offset + // if it fits in the immediate field. Otherwise, use the pre-calculated + // address in a clobber register. + void replaceFI(MachineInstr &MI, Register FrameReg, int64_t Offset, + int FIOperandNum); + + // Expand and eliminate Frame Index of pseudo STQrii and LDQrii. + void processSTQ(MachineInstr &MI, Register FrameReg, int64_t Offset, + int FIOperandNum); + void processLDQ(MachineInstr &MI, Register FrameReg, int64_t Offset, + int FIOperandNum); + +public: + EliminateFrameIndex(const TargetInstrInfo &TII, const TargetRegisterInfo &TRI, + const DebugLoc &DL, MachineBasicBlock &MBB, + MachineBasicBlock::iterator II) + : TII(TII), TRI(TRI), DL(DL), MBB(MBB), II(II), clobber(VE::SX13) {} + + // Expand and eliminate Frame Index from MI + void processMI(MachineInstr &MI, Register FrameReg, int64_t Offset, + int FIOperandNum); +}; + +// Prepare the frame index if the offset doesn't fit in the immediate field. +// Use the clobber register to hold the calculated address. +void EliminateFrameIndex::prepareReplaceFI(MachineInstr &MI, Register &FrameReg, + int64_t &Offset, int64_t Bytes) { + if (isInt<32>(Offset) && isInt<32>(Offset + Bytes)) { + // If the offset is small enough to fit in the immediate field, directly + // encode it. So, nothing to prepare here. + return; + } + + // If the offset doesn't fit, emit the following code sequence. This + // clobbers SX13, which we always know is available here. + // lea %clobber, Offset@lo + // and %clobber, %clobber, (32)0 + // lea.sl %clobber, Offset@hi(FrameReg, %clobber) + build(VE::LEAzii, clobber).addImm(0).addImm(0).addImm(Lo_32(Offset)); + build(VE::ANDrm, clobber).addReg(clobber).addImm(M0(32)); + build(VE::LEASLrri, clobber) + .addReg(clobber) + .addReg(FrameReg) + .addImm(Hi_32(Offset)); + + // Use the clobber register as the frame register with a zero offset. + FrameReg = clobber; + Offset = 0; +} + +// Replace the frame index in \p MI with a proper byte and framereg offset. +void EliminateFrameIndex::replaceFI(MachineInstr &MI, Register FrameReg, + int64_t Offset, int FIOperandNum) { + assert(isInt<32>(Offset)); + + // The offset must be small enough to fit in the immediate field after + // the call to prepareReplaceFI, so we can encode it directly.
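The lea / and / lea.sl triple emitted above rebuilds a 64-bit offset from two 32-bit halves: lea materializes the sign-extended low half, the and with (32)0 re-zeroes the upper 32 bits, and lea.sl adds the high half shifted left by 32 on top of the frame register. The arithmetic can be sanity-checked in plain C++ (a sketch of the effect, not VE code):

    #include <cassert>
    #include <cstdint>

    // Models the combined effect of the three VE instructions.
    uint64_t materialize(uint64_t FrameReg, int64_t Offset) {
      uint64_t Lo = static_cast<uint32_t>(Offset);        // lea + and: zext lo
      uint64_t Hi = static_cast<uint64_t>(Offset) >> 32;  // Offset@hi
      return FrameReg + (Hi << 32) + Lo; // lea.sl: (hi << 32) + base + lo
    }

    int main() {
      assert(materialize(0x1000, 0x123456789ABLL) == 0x1000 + 0x123456789ABLL);
      assert(materialize(0x1000, -8) == 0x1000 - 8); // wraps correctly, too
      return 0;
    }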
MI.getOperand(FIOperandNum).ChangeToRegister(FrameReg, false); MI.getOperand(FIOperandNum + offsetToDisp(MI)).ChangeToImmediate(Offset); } +void EliminateFrameIndex::processSTQ(MachineInstr &MI, Register FrameReg, + int64_t Offset, int FIOperandNum) { + assert(MI.getOpcode() == VE::STQrii); + LLVM_DEBUG(dbgs() << "processSTQ: "; MI.dump()); + + prepareReplaceFI(MI, FrameReg, Offset, 8); + + Register SrcReg = MI.getOperand(3).getReg(); + Register SrcHiReg = getSubReg(SrcReg, VE::sub_even); + Register SrcLoReg = getSubReg(SrcReg, VE::sub_odd); + // VE stores HiReg to 8(addr) and LoReg to 0(addr) + MachineInstr *StMI = + build(VE::STrii).addReg(FrameReg).addImm(0).addImm(0).addReg(SrcLoReg); + replaceFI(*StMI, FrameReg, Offset, 0); + // Mutate to 'hi' store. + MI.setDesc(get(VE::STrii)); + MI.getOperand(3).setReg(SrcHiReg); + Offset += 8; + replaceFI(MI, FrameReg, Offset, FIOperandNum); +} + +void EliminateFrameIndex::processLDQ(MachineInstr &MI, Register FrameReg, + int64_t Offset, int FIOperandNum) { + assert(MI.getOpcode() == VE::LDQrii); + LLVM_DEBUG(dbgs() << "processLDQ: "; MI.dump()); + + prepareReplaceFI(MI, FrameReg, Offset, 8); + + Register DestReg = MI.getOperand(0).getReg(); + Register DestHiReg = getSubReg(DestReg, VE::sub_even); + Register DestLoReg = getSubReg(DestReg, VE::sub_odd); + // VE loads HiReg from 8(addr) and LoReg from 0(addr) + MachineInstr *StMI = + build(VE::LDrii, DestLoReg).addReg(FrameReg).addImm(0).addImm(0); + replaceFI(*StMI, FrameReg, Offset, 1); + MI.setDesc(get(VE::LDrii)); + MI.getOperand(0).setReg(DestHiReg); + Offset += 8; + replaceFI(MI, FrameReg, Offset, FIOperandNum); +} + +void EliminateFrameIndex::processMI(MachineInstr &MI, Register FrameReg, + int64_t Offset, int FIOperandNum) { + switch (MI.getOpcode()) { + case VE::STQrii: + processSTQ(MI, FrameReg, Offset, FIOperandNum); + return; + case VE::LDQrii: + processLDQ(MI, FrameReg, Offset, FIOperandNum); + return; + } + prepareReplaceFI(MI, FrameReg, Offset); + replaceFI(MI, FrameReg, Offset, FIOperandNum); +} + void VERegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, unsigned FIOperandNum, RegScavenger *RS) const { assert(SPAdj == 0 && "Unexpected"); MachineInstr &MI = *II; - DebugLoc dl = MI.getDebugLoc(); int FrameIndex = MI.getOperand(FIOperandNum).getIndex(); + MachineFunction &MF = *MI.getParent()->getParent(); - const VEFrameLowering *TFI = getFrameLowering(MF); + const VESubtarget &Subtarget = MF.getSubtarget<VESubtarget>(); + const VEFrameLowering &TFI = *getFrameLowering(MF); + const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); + const VERegisterInfo &TRI = *Subtarget.getRegisterInfo(); + DebugLoc DL = MI.getDebugLoc(); + EliminateFrameIndex EFI(TII, TRI, DL, *MI.getParent(), II); + // Retrieve FrameReg and byte offset for stack slot. 
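processSTQ and processLDQ above split one 128-bit quad access into two 64-bit accesses against the even/odd subregister pair, calling prepareReplaceFI with Bytes = 8 so that both Offset and Offset + 8 are known to fit in the immediate field. A simplified memory-level model of the store half:

    #include <cstdint>
    #include <cstring>

    // Simplified model of processSTQ: the quad register is an even/odd pair of
    // 64-bit registers, and VE keeps the hi (even) half at addr+8 and the lo
    // (odd) half at addr+0, so one STQ becomes two 64-bit stores.
    void storeQuad(uint8_t *Addr, uint64_t HiEven, uint64_t LoOdd) {
      std::memcpy(Addr + 0, &LoOdd, 8);  // the newly built STrii at offset 0
      std::memcpy(Addr + 8, &HiEven, 8); // the mutated original MI at offset 8
    }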
Register FrameReg; - int Offset; - Offset = TFI->getFrameIndexReference(MF, FrameIndex, FrameReg).getFixed(); - + int64_t Offset = + TFI.getFrameIndexReference(MF, FrameIndex, FrameReg).getFixed(); Offset += MI.getOperand(FIOperandNum + offsetToDisp(MI)).getImm(); - if (MI.getOpcode() == VE::STQrii) { - const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); - Register SrcReg = MI.getOperand(3).getReg(); - Register SrcHiReg = getSubReg(SrcReg, VE::sub_even); - Register SrcLoReg = getSubReg(SrcReg, VE::sub_odd); - // VE stores HiReg to 8(addr) and LoReg to 0(addr) - MachineInstr *StMI = BuildMI(*MI.getParent(), II, dl, TII.get(VE::STrii)) - .addReg(FrameReg) - .addImm(0) - .addImm(0) - .addReg(SrcLoReg); - replaceFI(MF, II, *StMI, dl, 0, Offset, FrameReg); - MI.setDesc(TII.get(VE::STrii)); - MI.getOperand(3).setReg(SrcHiReg); - Offset += 8; - } else if (MI.getOpcode() == VE::LDQrii) { - const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); - Register DestReg = MI.getOperand(0).getReg(); - Register DestHiReg = getSubReg(DestReg, VE::sub_even); - Register DestLoReg = getSubReg(DestReg, VE::sub_odd); - // VE loads HiReg from 8(addr) and LoReg from 0(addr) - MachineInstr *StMI = - BuildMI(*MI.getParent(), II, dl, TII.get(VE::LDrii), DestLoReg) - .addReg(FrameReg) - .addImm(0) - .addImm(0); - replaceFI(MF, II, *StMI, dl, 1, Offset, FrameReg); - MI.setDesc(TII.get(VE::LDrii)); - MI.getOperand(0).setReg(DestHiReg); - Offset += 8; - } - - replaceFI(MF, II, MI, dl, FIOperandNum, Offset, FrameReg); + EFI.processMI(MI, FrameReg, Offset, FIOperandNum); } Register VERegisterInfo::getFrameRegister(const MachineFunction &MF) const { diff --git a/llvm/lib/Target/VE/VVPISelLowering.cpp b/llvm/lib/Target/VE/VVPISelLowering.cpp index 330eef4c7c2b..f88f298bc603 100644 --- a/llvm/lib/Target/VE/VVPISelLowering.cpp +++ b/llvm/lib/Target/VE/VVPISelLowering.cpp @@ -41,7 +41,7 @@ SDValue VETargetLowering::lowerToVVP(SDValue Op, SelectionDAG &DAG) const { auto VVPOpcodeOpt = getVVPOpcode(Opcode); if (!VVPOpcodeOpt) return SDValue(); - unsigned VVPOpcode = VVPOpcodeOpt.getValue(); + unsigned VVPOpcode = VVPOpcodeOpt.value(); const bool FromVP = ISD::isVPOpcode(Opcode); // The representative and legalized vector type of this operation. diff --git a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.cpp b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.cpp index ec72c1de0503..d31715e367ec 100644 --- a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.cpp +++ b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.cpp @@ -87,15 +87,14 @@ bool WebAssemblyAsmTypeCheck::popType(SMLoc ErrorLoc, if (Stack.empty()) { return typeError(ErrorLoc, EVT ? 
StringRef("empty stack while popping ") + - WebAssembly::typeToString(EVT.getValue()) + WebAssembly::typeToString(EVT.value()) : StringRef("empty stack while popping value")); } auto PVT = Stack.pop_back_val(); - if (EVT && EVT.getValue() != PVT) { + if (EVT && EVT.value() != PVT) { return typeError( ErrorLoc, StringRef("popped ") + WebAssembly::typeToString(PVT) + - ", expected " + - WebAssembly::typeToString(EVT.getValue())); + ", expected " + WebAssembly::typeToString(EVT.value())); } return false; } diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp index f52545a65dbb..97dbc35c991b 100644 --- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp +++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp @@ -26,6 +26,7 @@ using namespace llvm; #define DEBUG_TYPE "wasm-mc-target-desc" #define GET_INSTRINFO_MC_DESC +#define ENABLE_INSTR_PREDICATE_VERIFIER #include "WebAssemblyGenInstrInfo.inc" #define GET_SUBTARGETINFO_MC_DESC diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h index 75d5d0675990..b5b12200505b 100644 --- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h +++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h @@ -124,6 +124,7 @@ enum TOF { // Defines symbolic names for the WebAssembly instructions. // #define GET_INSTRINFO_ENUM +#define GET_INSTRINFO_MC_HELPER_DECLS #include "WebAssemblyGenInstrInfo.inc" namespace llvm { diff --git a/llvm/lib/Target/WebAssembly/TargetInfo/WebAssemblyTargetInfo.cpp b/llvm/lib/Target/WebAssembly/TargetInfo/WebAssemblyTargetInfo.cpp index e3daf6bfa72e..ef2c77ade8cc 100644 --- a/llvm/lib/Target/WebAssembly/TargetInfo/WebAssemblyTargetInfo.cpp +++ b/llvm/lib/Target/WebAssembly/TargetInfo/WebAssemblyTargetInfo.cpp @@ -37,4 +37,5 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeWebAssemblyTargetInfo() { // which have to be in a shared location between CodeGen and MC. #define GET_INSTRMAP_INFO 1 #define GET_INSTRINFO_ENUM 1 +#define GET_INSTRINFO_MC_HELPER_DECLS #include "WebAssemblyGenInstrInfo.inc" diff --git a/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.cpp b/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.cpp index 0f1655718481..f380b2582c65 100644 --- a/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.cpp +++ b/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.cpp @@ -13,6 +13,7 @@ #include "WebAssemblyTypeUtilities.h" #include "llvm/ADT/StringSwitch.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" // Get register classes enum. #define GET_REGINFO_ENUM @@ -168,6 +169,11 @@ wasm::ValType WebAssembly::regClassToValType(unsigned RC) { } } +wasm::ValType WebAssembly::regClassToValType(const TargetRegisterClass *RC) { + assert(RC != nullptr); + return regClassToValType(RC->getID()); +} + void WebAssembly::wasmSymbolSetType(MCSymbolWasm *Sym, const Type *GlobalVT, const SmallVector<MVT, 1> &VTs) { assert(!Sym->getType()); @@ -175,33 +181,28 @@ void WebAssembly::wasmSymbolSetType(MCSymbolWasm *Sym, const Type *GlobalVT, // Tables are represented as Arrays in LLVM IR therefore // they reach this point as aggregate Array types with an element type // that is a reference type. 
- wasm::ValType Type; + wasm::ValType ValTy; bool IsTable = false; if (GlobalVT->isArrayTy() && WebAssembly::isRefType(GlobalVT->getArrayElementType())) { - MVT VT; IsTable = true; - switch (GlobalVT->getArrayElementType()->getPointerAddressSpace()) { - case WebAssembly::WasmAddressSpace::WASM_ADDRESS_SPACE_FUNCREF: - VT = MVT::funcref; - break; - case WebAssembly::WasmAddressSpace::WASM_ADDRESS_SPACE_EXTERNREF: - VT = MVT::externref; - break; - default: - report_fatal_error("unhandled address space type"); - } - Type = WebAssembly::toValType(VT); + const Type *ElTy = GlobalVT->getArrayElementType(); + if (WebAssembly::isExternrefType(ElTy)) + ValTy = wasm::ValType::EXTERNREF; + else if (WebAssembly::isFuncrefType(ElTy)) + ValTy = wasm::ValType::FUNCREF; + else + report_fatal_error("unhandled reference type"); } else if (VTs.size() == 1) { - Type = WebAssembly::toValType(VTs[0]); + ValTy = WebAssembly::toValType(VTs[0]); } else report_fatal_error("Aggregate globals not yet implemented"); if (IsTable) { Sym->setType(wasm::WASM_SYMBOL_TYPE_TABLE); - Sym->setTableType(Type); + Sym->setTableType(ValTy); } else { Sym->setType(wasm::WASM_SYMBOL_TYPE_GLOBAL); - Sym->setGlobalType(wasm::WasmGlobalType{uint8_t(Type), /*Mutable=*/true}); + Sym->setGlobalType(wasm::WasmGlobalType{uint8_t(ValTy), /*Mutable=*/true}); } } diff --git a/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.h b/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.h index 8fc67d37925c..86211700c70a 100644 --- a/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.h +++ b/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.h @@ -22,6 +22,9 @@ #include "llvm/Support/MachineValueType.h" namespace llvm { + +class TargetRegisterClass; + namespace WebAssembly { /// Used as immediate MachineOperands for block signatures @@ -108,9 +111,12 @@ std::string signatureToString(const wasm::WasmSignature *Sig); // Convert a MVT into its corresponding wasm ValType. wasm::ValType toValType(MVT Type); -// Convert a register class to a wasm ValType. +// Convert a register class ID to a wasm ValType. wasm::ValType regClassToValType(unsigned RC); +// Convert a register class to a wasm ValType. +wasm::ValType regClassToValType(const TargetRegisterClass *RC); + /// Sets a Wasm Symbol Type. 
void wasmSymbolSetType(MCSymbolWasm *Sym, const Type *GlobalVT, const SmallVector<MVT, 1> &VTs); diff --git a/llvm/lib/Target/WebAssembly/Utils/WebAssemblyUtilities.cpp b/llvm/lib/Target/WebAssembly/Utils/WebAssemblyUtilities.cpp index b87c884c9e4a..277bbee83a6f 100644 --- a/llvm/lib/Target/WebAssembly/Utils/WebAssemblyUtilities.cpp +++ b/llvm/lib/Target/WebAssembly/Utils/WebAssemblyUtilities.cpp @@ -179,3 +179,25 @@ MachineInstr *WebAssembly::findCatch(MachineBasicBlock *EHPad) { return &*Pos; return nullptr; } + +unsigned WebAssembly::getCopyOpcodeForRegClass(const TargetRegisterClass *RC) { + assert(RC != nullptr); + switch (RC->getID()) { + case WebAssembly::I32RegClassID: + return WebAssembly::COPY_I32; + case WebAssembly::I64RegClassID: + return WebAssembly::COPY_I64; + case WebAssembly::F32RegClassID: + return WebAssembly::COPY_F32; + case WebAssembly::F64RegClassID: + return WebAssembly::COPY_F64; + case WebAssembly::V128RegClassID: + return WebAssembly::COPY_V128; + case WebAssembly::FUNCREFRegClassID: + return WebAssembly::COPY_FUNCREF; + case WebAssembly::EXTERNREFRegClassID: + return WebAssembly::COPY_EXTERNREF; + default: + llvm_unreachable("Unexpected register class"); + } +} diff --git a/llvm/lib/Target/WebAssembly/Utils/WebAssemblyUtilities.h b/llvm/lib/Target/WebAssembly/Utils/WebAssemblyUtilities.h index cdfc758db7ac..d0639208fda9 100644 --- a/llvm/lib/Target/WebAssembly/Utils/WebAssemblyUtilities.h +++ b/llvm/lib/Target/WebAssembly/Utils/WebAssemblyUtilities.h @@ -24,6 +24,7 @@ class MachineInstr; class MachineOperand; class MCContext; class MCSymbolWasm; +class TargetRegisterClass; class WebAssemblyFunctionInfo; class WebAssemblySubtarget; @@ -65,6 +66,9 @@ getOrCreateFuncrefCallTableSymbol(MCContext &Ctx, /// instruction found or the catch is in an invalid location. MachineInstr *findCatch(MachineBasicBlock *EHPad); +/// Returns the appropriate copy opcode for the given register class. +unsigned getCopyOpcodeForRegClass(const TargetRegisterClass *RC); + } // end namespace WebAssembly } // end namespace llvm diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp index 57d51634e849..bcb6cf1b4e1d 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp @@ -597,6 +597,8 @@ void WebAssemblyAsmPrinter::emitFunctionBodyStart() { void WebAssemblyAsmPrinter::emitInstruction(const MachineInstr *MI) { LLVM_DEBUG(dbgs() << "EmitInstruction: " << *MI << '\n'); + WebAssembly_MC::verifyInstructionPredicates(MI->getOpcode(), + Subtarget->getFeatureBits()); switch (MI->getOpcode()) { case WebAssembly::ARGUMENT_i32: diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp index 02e873a0f9a6..d2eb4b29e9fd 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp @@ -781,25 +781,6 @@ void WebAssemblyCFGStackify::removeUnnecessaryInstrs(MachineFunction &MF) { } } -// Get the appropriate copy opcode for the given register class. 
-static unsigned getCopyOpcode(const TargetRegisterClass *RC) { - if (RC == &WebAssembly::I32RegClass) - return WebAssembly::COPY_I32; - if (RC == &WebAssembly::I64RegClass) - return WebAssembly::COPY_I64; - if (RC == &WebAssembly::F32RegClass) - return WebAssembly::COPY_F32; - if (RC == &WebAssembly::F64RegClass) - return WebAssembly::COPY_F64; - if (RC == &WebAssembly::V128RegClass) - return WebAssembly::COPY_V128; - if (RC == &WebAssembly::FUNCREFRegClass) - return WebAssembly::COPY_FUNCREF; - if (RC == &WebAssembly::EXTERNREFRegClass) - return WebAssembly::COPY_EXTERNREF; - llvm_unreachable("Unexpected register class"); -} - // When MBB is split into MBB and Split, we should unstackify defs in MBB that // have their uses in Split. static void unstackifyVRegsUsedInSplitBB(MachineBasicBlock &MBB, @@ -851,7 +832,8 @@ static void unstackifyVRegsUsedInSplitBB(MachineBasicBlock &MBB, if (!MFI.isVRegStackified(TeeReg)) { // Now we are not using TEE anymore, so unstackify DefReg too MFI.unstackifyVReg(DefReg); - unsigned CopyOpc = getCopyOpcode(MRI.getRegClass(DefReg)); + unsigned CopyOpc = + WebAssembly::getCopyOpcodeForRegClass(MRI.getRegClass(DefReg)); BuildMI(MBB, &MI, MI.getDebugLoc(), TII.get(CopyOpc), TeeReg) .addReg(DefReg); BuildMI(MBB, &MI, MI.getDebugLoc(), TII.get(CopyOpc), Reg).addReg(DefReg); diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp index 5484c0db7775..9316826e3d92 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp @@ -66,23 +66,7 @@ void WebAssemblyInstrInfo::copyPhysReg(MachineBasicBlock &MBB, ? MRI.getRegClass(DestReg) : MRI.getTargetRegisterInfo()->getMinimalPhysRegClass(DestReg); - unsigned CopyOpcode; - if (RC == &WebAssembly::I32RegClass) - CopyOpcode = WebAssembly::COPY_I32; - else if (RC == &WebAssembly::I64RegClass) - CopyOpcode = WebAssembly::COPY_I64; - else if (RC == &WebAssembly::F32RegClass) - CopyOpcode = WebAssembly::COPY_F32; - else if (RC == &WebAssembly::F64RegClass) - CopyOpcode = WebAssembly::COPY_F64; - else if (RC == &WebAssembly::V128RegClass) - CopyOpcode = WebAssembly::COPY_V128; - else if (RC == &WebAssembly::FUNCREFRegClass) - CopyOpcode = WebAssembly::COPY_FUNCREF; - else if (RC == &WebAssembly::EXTERNREFRegClass) - CopyOpcode = WebAssembly::COPY_EXTERNREF; - else - llvm_unreachable("Unexpected register class"); + unsigned CopyOpcode = WebAssembly::getCopyOpcodeForRegClass(RC); BuildMI(MBB, I, DL, get(CopyOpcode), DestReg) .addReg(SrcReg, KillSrc ? 
RegState::Kill : 0); diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp index 2db4bd822349..7a1a769c6b16 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp @@ -553,7 +553,7 @@ Value *WebAssemblyLowerEmscriptenEHSjLj::wrapInvoke(CallBase *CI) { std::tie(SizeArg, NEltArg) = FnAttrs.getAllocSizeArgs(); SizeArg += 1; if (NEltArg) - NEltArg = NEltArg.getValue() + 1; + NEltArg = NEltArg.value() + 1; FnAttrs.addAllocSizeAttr(SizeArg, NEltArg); } // In case the callee has 'noreturn' attribute, We need to remove it, because diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp index 2e6027a5605c..e8b3542df12f 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp @@ -154,25 +154,6 @@ MCOperand WebAssemblyMCInstLower::lowerTypeIndexOperand( return MCOperand::createExpr(Expr); } -// Return the WebAssembly type associated with the given register class. -static wasm::ValType getType(const TargetRegisterClass *RC) { - if (RC == &WebAssembly::I32RegClass) - return wasm::ValType::I32; - if (RC == &WebAssembly::I64RegClass) - return wasm::ValType::I64; - if (RC == &WebAssembly::F32RegClass) - return wasm::ValType::F32; - if (RC == &WebAssembly::F64RegClass) - return wasm::ValType::F64; - if (RC == &WebAssembly::V128RegClass) - return wasm::ValType::V128; - if (RC == &WebAssembly::EXTERNREFRegClass) - return wasm::ValType::EXTERNREF; - if (RC == &WebAssembly::FUNCREFRegClass) - return wasm::ValType::FUNCREF; - llvm_unreachable("Unexpected register class"); -} - static void getFunctionReturns(const MachineInstr *MI, SmallVectorImpl<wasm::ValType> &Returns) { const Function &F = MI->getMF()->getFunction(); @@ -221,10 +202,12 @@ void WebAssemblyMCInstLower::lower(const MachineInstr *MI, const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); for (const MachineOperand &MO : MI->defs()) - Returns.push_back(getType(MRI.getRegClass(MO.getReg()))); + Returns.push_back( + WebAssembly::regClassToValType(MRI.getRegClass(MO.getReg()))); for (const MachineOperand &MO : MI->explicit_uses()) if (MO.isReg()) - Params.push_back(getType(MRI.getRegClass(MO.getReg()))); + Params.push_back( + WebAssembly::regClassToValType(MRI.getRegClass(MO.getReg()))); // call_indirect instructions have a callee operand at the end which // doesn't count as a param. 
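The same register-class-to-COPY_* switch used to be duplicated in WebAssemblyCFGStackify, WebAssemblyInstrInfo, and the peephole pass below; all three now call the shared WebAssembly::getCopyOpcodeForRegClass helper. The typical call shape, sketched from this patch's call sites (MRI, MBB, InsertPt, TII, and Reg are assumed to come from the host pass):

    const TargetRegisterClass *RC = MRI.getRegClass(Reg);
    unsigned CopyOpc = WebAssembly::getCopyOpcodeForRegClass(RC);
    Register NewReg = MRI.createVirtualRegister(RC);
    BuildMI(MBB, InsertPt, InsertPt->getDebugLoc(), TII.get(CopyOpc), NewReg)
        .addReg(Reg);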
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyPeephole.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyPeephole.cpp index ba1c4b7233f2..5fcee7af9bde 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyPeephole.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyPeephole.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "MCTargetDesc/WebAssemblyMCTargetDesc.h" +#include "Utils/WebAssemblyUtilities.h" #include "WebAssembly.h" #include "WebAssemblyMachineFunctionInfo.h" #include "WebAssemblySubtarget.h" @@ -95,31 +96,7 @@ static bool maybeRewriteToFallthrough(MachineInstr &MI, MachineBasicBlock &MBB, if (!MFI.isVRegStackified(Reg)) { unsigned CopyLocalOpc; const TargetRegisterClass *RegClass = MRI.getRegClass(Reg); - switch (RegClass->getID()) { - case WebAssembly::I32RegClassID: - CopyLocalOpc = WebAssembly::COPY_I32; - break; - case WebAssembly::I64RegClassID: - CopyLocalOpc = WebAssembly::COPY_I64; - break; - case WebAssembly::F32RegClassID: - CopyLocalOpc = WebAssembly::COPY_F32; - break; - case WebAssembly::F64RegClassID: - CopyLocalOpc = WebAssembly::COPY_F64; - break; - case WebAssembly::V128RegClassID: - CopyLocalOpc = WebAssembly::COPY_V128; - break; - case WebAssembly::FUNCREFRegClassID: - CopyLocalOpc = WebAssembly::COPY_FUNCREF; - break; - case WebAssembly::EXTERNREFRegClassID: - CopyLocalOpc = WebAssembly::COPY_EXTERNREF; - break; - default: - llvm_unreachable("Unexpected register class for return operand"); - } + CopyLocalOpc = WebAssembly::getCopyOpcodeForRegClass(RegClass); Register NewReg = MRI.createVirtualRegister(RegClass); BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(CopyLocalOpc), NewReg) .addReg(Reg); diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp index 388c0f9110b7..0b3e534315d5 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp @@ -21,7 +21,6 @@ #include "WebAssemblyRuntimeLibcallSignatures.h" #include "WebAssemblySubtarget.h" #include "llvm/CodeGen/RuntimeLibcalls.h" -#include "llvm/Support/ManagedStatic.h" using namespace llvm; @@ -482,10 +481,13 @@ struct RuntimeLibcallSignatureTable { } }; -ManagedStatic<RuntimeLibcallSignatureTable> RuntimeLibcallSignatures; +RuntimeLibcallSignatureTable &getRuntimeLibcallSignatures() { + static RuntimeLibcallSignatureTable RuntimeLibcallSignatures; + return RuntimeLibcallSignatures; +} // Maps libcall names to their RTLIB::Libcall number. Builds the map in a -// constructor for use with ManagedStatic +// constructor for use with a static variable struct StaticLibcallNameMap { StringMap<RTLIB::Libcall> Map; StaticLibcallNameMap() { @@ -496,7 +498,8 @@ struct StaticLibcallNameMap { }; for (const auto &NameLibcall : NameLibcalls) { if (NameLibcall.first != nullptr && - RuntimeLibcallSignatures->Table[NameLibcall.second] != unsupported) { + getRuntimeLibcallSignatures().Table[NameLibcall.second] != + unsupported) { assert(Map.find(NameLibcall.first) == Map.end() && "duplicate libcall names in name map"); Map[NameLibcall.first] = NameLibcall.second; @@ -523,7 +526,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget, wasm::ValType PtrTy = Subtarget.hasAddr64() ? 
wasm::ValType::I64 : wasm::ValType::I32; - auto &Table = RuntimeLibcallSignatures->Table; + auto &Table = getRuntimeLibcallSignatures().Table; switch (Table[LC]) { case func: break; @@ -885,14 +888,14 @@ } } -static ManagedStatic<StaticLibcallNameMap> LibcallNameMap; // TODO: If the RTLIB::Libcall-taking flavor of GetSignature remains unused // other than here, just roll its logic into this version. void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget, StringRef Name, SmallVectorImpl<wasm::ValType> &Rets, SmallVectorImpl<wasm::ValType> &Params) { - auto &Map = LibcallNameMap->Map; + static StaticLibcallNameMap LibcallNameMap; + auto &Map = LibcallNameMap.Map; auto Val = Map.find(Name); #ifndef NDEBUG if (Val == Map.end()) { diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp index a903c5f455a2..da90befb2320 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp @@ -622,7 +622,7 @@ static bool printFMAComments(const MCInst *MI, raw_ostream &OS, OS << '-'; OS << '(' << Mul1Name << " * " << Mul2Name << ") " << AccStr << ' ' - << AccName; + << AccName << '\n'; return true; } diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86InstrRelaxTables.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86InstrRelaxTables.cpp index 901082ce6cf3..640efd468135 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86InstrRelaxTables.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86InstrRelaxTables.cpp @@ -13,6 +13,7 @@ #include "X86InstrRelaxTables.h" #include "X86InstrInfo.h" #include "llvm/ADT/STLExtras.h" +#include <atomic> using namespace llvm; @@ -119,7 +120,7 @@ const X86InstrRelaxTableEntry *llvm::lookupRelaxTable(unsigned ShortOp) { namespace { // This class stores the short form tables. It is instantiated as a -// ManagedStatic to lazily init the short form table. +// function scope static variable to lazily init the short form table. struct X86ShortFormTable { // Stores relaxation table entries sorted by relaxed form opcode. SmallVector<X86InstrRelaxTableEntry, 0> Table; @@ -137,10 +138,9 @@ struct X86ShortFormTable { }; } // namespace -static ManagedStatic<X86ShortFormTable> ShortTable; - const X86InstrRelaxTableEntry *llvm::lookupShortTable(unsigned RelaxOp) { - auto &Table = ShortTable->Table; + static X86ShortFormTable ShortTable; + auto &Table = ShortTable.Table; auto I = llvm::lower_bound(Table, RelaxOp); if (I != Table.end() && I->KeyOp == RelaxOp) return &*I; diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp index 49660883ad83..4c962de16530 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp @@ -37,6 +37,7 @@ using namespace llvm; #define GET_INSTRINFO_MC_DESC #define GET_INSTRINFO_MC_HELPERS +#define ENABLE_INSTR_PREDICATE_VERIFIER #include "X86GenInstrInfo.inc" #define GET_SUBTARGETINFO_MC_DESC diff --git a/llvm/lib/Target/X86/X86.h b/llvm/lib/Target/X86/X86.h index 7344900f2e31..0ac916527495 100644 --- a/llvm/lib/Target/X86/X86.h +++ b/llvm/lib/Target/X86/X86.h @@ -132,6 +132,9 @@ FunctionPass *createX86EvexToVexInsts(); /// This pass creates the thunks for the retpoline feature. FunctionPass *createX86IndirectThunksPass(); +/// This pass replaces ret instructions with jmp's to __x86_return thunk.
+FunctionPass *createX86ReturnThunksPass(); + /// This pass ensures instructions featuring a memory operand /// have distinctive <LineNumber, Discriminator> (with respect to each other) FunctionPass *createX86DiscriminateMemOpsPass(); @@ -185,6 +188,7 @@ void initializeX86LowerAMXTypeLegacyPassPass(PassRegistry &); void initializeX86PreAMXConfigPassPass(PassRegistry &); void initializeX86LowerTileCopyPass(PassRegistry &); void initializeX86LowerAMXIntrinsicsLegacyPassPass(PassRegistry &); +void initializeX86ReturnThunksPass(PassRegistry &); namespace X86AS { enum : unsigned { diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td index a5c6b40c493c..a859176220c7 100644 --- a/llvm/lib/Target/X86/X86.td +++ b/llvm/lib/Target/X86/X86.td @@ -266,6 +266,8 @@ def FeatureWBNOINVD : SubtargetFeature<"wbnoinvd", "HasWBNOINVD", "true", "Write Back No Invalidate">; def FeatureRDPID : SubtargetFeature<"rdpid", "HasRDPID", "true", "Support RDPID instructions">; +def FeatureRDPRU : SubtargetFeature<"rdpru", "HasRDPRU", "true", + "Support RDPRU instructions">; def FeatureWAITPKG : SubtargetFeature<"waitpkg", "HasWAITPKG", "true", "Wait and pause enhancements">; def FeatureENQCMD : SubtargetFeature<"enqcmd", "HasENQCMD", "true", @@ -1238,6 +1240,7 @@ def ProcessorFeatures { TuningInsertVZEROUPPER]; list<SubtargetFeature> ZN2AdditionalFeatures = [FeatureCLWB, FeatureRDPID, + FeatureRDPRU, FeatureWBNOINVD]; list<SubtargetFeature> ZN2Tuning = ZNTuning; list<SubtargetFeature> ZN2Features = diff --git a/llvm/lib/Target/X86/X86EvexToVex.cpp b/llvm/lib/Target/X86/X86EvexToVex.cpp index c7a013a0b17a..cff95d17c14c 100644 --- a/llvm/lib/Target/X86/X86EvexToVex.cpp +++ b/llvm/lib/Target/X86/X86EvexToVex.cpp @@ -31,6 +31,7 @@ #include "llvm/CodeGen/MachineOperand.h" #include "llvm/MC/MCInstrDesc.h" #include "llvm/Pass.h" +#include <atomic> #include <cassert> #include <cstdint> diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 61c1fd25031d..12af6087cb47 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -594,7 +594,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // Half type will be promoted by default.
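For the f16 nodes marked Promote in the block that follows, legalization widens to f32, performs the operation there, and truncates back; FCOPYSIGN alone moves to Expand, which lets the legalizer use integer sign-bit manipulation instead of a promoted floating-point round-trip. In scalar terms, promotion of an f16 add looks roughly like this (a sketch only; _Float16 stands in for MVT::f16 and requires a compiler that supports it):

    _Float16 promoted_fadd(_Float16 A, _Float16 B) {
      float WideA = static_cast<float>(A); // fpext: f16 -> f32
      float WideB = static_cast<float>(B);
      return static_cast<_Float16>(WideA + WideB); // fadd in f32, fptrunc back
    }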
     setOperationAction(ISD::FABS, MVT::f16, Promote);
     setOperationAction(ISD::FNEG, MVT::f16, Promote);
-    setOperationAction(ISD::FCOPYSIGN, MVT::f16, Promote);
+    setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand);
     setOperationAction(ISD::FADD, MVT::f16, Promote);
     setOperationAction(ISD::FSUB, MVT::f16, Promote);
     setOperationAction(ISD::FMUL, MVT::f16, Promote);
@@ -629,6 +629,34 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::FP_ROUND, MVT::f16, LibCall);
     setOperationAction(ISD::FP_EXTEND, MVT::f32, LibCall);
     setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom);
+
+    setOperationAction(ISD::STRICT_FADD, MVT::f16, Promote);
+    setOperationAction(ISD::STRICT_FSUB, MVT::f16, Promote);
+    setOperationAction(ISD::STRICT_FMUL, MVT::f16, Promote);
+    setOperationAction(ISD::STRICT_FDIV, MVT::f16, Promote);
+    setOperationAction(ISD::STRICT_FMA, MVT::f16, Promote);
+    setOperationAction(ISD::STRICT_FMINNUM, MVT::f16, Promote);
+    setOperationAction(ISD::STRICT_FMAXNUM, MVT::f16, Promote);
+    setOperationAction(ISD::STRICT_FMINIMUM, MVT::f16, Promote);
+    setOperationAction(ISD::STRICT_FMAXIMUM, MVT::f16, Promote);
+    setOperationAction(ISD::STRICT_FSQRT, MVT::f16, Promote);
+    setOperationAction(ISD::STRICT_FPOW, MVT::f16, Promote);
+    setOperationAction(ISD::STRICT_FLOG, MVT::f16, Promote);
+    setOperationAction(ISD::STRICT_FLOG2, MVT::f16, Promote);
+    setOperationAction(ISD::STRICT_FLOG10, MVT::f16, Promote);
+    setOperationAction(ISD::STRICT_FEXP, MVT::f16, Promote);
+    setOperationAction(ISD::STRICT_FEXP2, MVT::f16, Promote);
+    setOperationAction(ISD::STRICT_FCEIL, MVT::f16, Promote);
+    setOperationAction(ISD::STRICT_FFLOOR, MVT::f16, Promote);
+    setOperationAction(ISD::STRICT_FNEARBYINT, MVT::f16, Promote);
+    setOperationAction(ISD::STRICT_FRINT, MVT::f16, Promote);
+    setOperationAction(ISD::STRICT_FSETCC, MVT::f16, Promote);
+    setOperationAction(ISD::STRICT_FSETCCS, MVT::f16, Promote);
+    setOperationAction(ISD::STRICT_FROUND, MVT::f16, Promote);
+    setOperationAction(ISD::STRICT_FROUNDEVEN, MVT::f16, Promote);
+    setOperationAction(ISD::STRICT_FTRUNC, MVT::f16, Promote);
+    setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, LibCall);
+    setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, LibCall);
     setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Custom);
 
     setLibcallName(RTLIB::FPROUND_F32_F16, "__truncsfhf2");
@@ -2817,6 +2845,21 @@ Value *X86TargetLowering::getIRStackGuard(IRBuilderBase &IRB) const {
       AddressSpace = X86AS::FS;
     else if (GuardReg == "gs")
       AddressSpace = X86AS::GS;
+
+    // Use the symbol guard if the user specifies one.
+    StringRef GuardSymb = M->getStackProtectorGuardSymbol();
+    if (!GuardSymb.empty()) {
+      GlobalVariable *GV = M->getGlobalVariable(GuardSymb);
+      if (!GV) {
+        Type *Ty = Subtarget.is64Bit() ? Type::getInt64Ty(M->getContext())
                                        : Type::getInt32Ty(M->getContext());
+        GV = new GlobalVariable(*M, Ty, false, GlobalValue::ExternalLinkage,
+                                nullptr, GuardSymb, nullptr,
+                                GlobalValue::NotThreadLocal, AddressSpace);
+      }
+      return GV;
+    }
+
     return SegmentOffset(IRB, Offset, AddressSpace);
   }
 }
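For context on the new getIRStackGuard branch: the guard symbol name comes from
module metadata, which frontends expose via a stack-protector-guard-symbol
option. A minimal IR sketch, under assumptions: the flag key matches the
Module::getStackProtectorGuardSymbol() accessor used above, and the symbol name
here is made up:

    ; Ask the backend to load the canary from a named global instead of a
    ; segment-register offset such as %fs:0x28.
    !llvm.module.flags = !{!0}
    !0 = !{i32 1, !"stack-protector-guard-symbol", !"__my_stack_chk_guard"}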
@@ -11757,15 +11800,17 @@ static bool isShuffleEquivalent(ArrayRef<int> Mask, ArrayRef<int> ExpectedMask,
 /// value in ExpectedMask is always accepted. Otherwise the indices must match.
 ///
 /// SM_SentinelZero is accepted as a valid negative index but must match in
-/// both.
+/// both, or via a known-bits test.
 static bool isTargetShuffleEquivalent(MVT VT, ArrayRef<int> Mask,
                                       ArrayRef<int> ExpectedMask,
+                                      const SelectionDAG &DAG,
                                       SDValue V1 = SDValue(),
                                       SDValue V2 = SDValue()) {
   int Size = Mask.size();
   if (Size != (int)ExpectedMask.size())
     return false;
-  assert(isUndefOrZeroOrInRange(ExpectedMask, 0, 2 * Size) &&
+  assert(llvm::all_of(ExpectedMask,
+                      [Size](int M) { return isInRange(M, 0, 2 * Size); }) &&
          "Illegal target shuffle mask");
 
   // Check for out-of-range target shuffle mask indices.
@@ -11778,12 +11823,28 @@ static bool isTargetShuffleEquivalent(MVT VT, ArrayRef<int> Mask,
   if (V2 && V2.getValueSizeInBits() != VT.getSizeInBits())
     V2 = SDValue();
 
+  APInt ZeroV1 = APInt::getNullValue(Size);
+  APInt ZeroV2 = APInt::getNullValue(Size);
+
   for (int i = 0; i < Size; ++i) {
     int MaskIdx = Mask[i];
     int ExpectedIdx = ExpectedMask[i];
     if (MaskIdx == SM_SentinelUndef || MaskIdx == ExpectedIdx)
       continue;
-    if (0 <= MaskIdx && 0 <= ExpectedIdx) {
+    if (MaskIdx == SM_SentinelZero) {
+      // If we need this expected index to be a zero element, then update the
+      // relevant zero mask and perform the known-bits test at the end to
+      // minimize repeated computation.
+      SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
+      if (ExpectedV &&
+          Size == (int)ExpectedV.getValueType().getVectorNumElements()) {
+        int BitIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
+        APInt &ZeroMask = ExpectedIdx < Size ? ZeroV1 : ZeroV2;
+        ZeroMask.setBit(BitIdx);
+        continue;
+      }
+    }
+    if (MaskIdx >= 0) {
       SDValue MaskV = MaskIdx < Size ? V1 : V2;
       SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
       MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
@@ -11791,15 +11852,16 @@ static bool isTargetShuffleEquivalent(MVT VT, ArrayRef<int> Mask,
       if (IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
         continue;
     }
-    // TODO - handle SM_Sentinel equivalences.
     return false;
   }
-  return true;
+  return (ZeroV1.isNullValue() || DAG.MaskedVectorIsZero(V1, ZeroV1)) &&
+         (ZeroV2.isNullValue() || DAG.MaskedVectorIsZero(V2, ZeroV2));
 }
 
 // Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
 // instructions.
-static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT) {
+static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT,
+                                  const SelectionDAG &DAG) {
   if (VT != MVT::v8i32 && VT != MVT::v8f32)
     return false;
 
@@ -11809,12 +11871,13 @@ static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT) {
   SmallVector<int, 8> Unpckhwd;
   createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
                           /* Unary = */ false);
-  bool IsUnpackwdMask = (isTargetShuffleEquivalent(VT, Mask, Unpcklwd) ||
-                         isTargetShuffleEquivalent(VT, Mask, Unpckhwd));
+  bool IsUnpackwdMask = (isTargetShuffleEquivalent(VT, Mask, Unpcklwd, DAG) ||
+                         isTargetShuffleEquivalent(VT, Mask, Unpckhwd, DAG));
   return IsUnpackwdMask;
 }
 
-static bool is128BitUnpackShuffleMask(ArrayRef<int> Mask) {
+static bool is128BitUnpackShuffleMask(ArrayRef<int> Mask,
+                                      const SelectionDAG &DAG) {
   // Create 128-bit vector type based on mask size.
   MVT EltVT = MVT::getIntegerVT(128 / Mask.size());
   MVT VT = MVT::getVectorVT(EltVT, Mask.size());
@@ -11827,8 +11890,8 @@ static bool is128BitUnpackShuffleMask(ArrayRef<int> Mask) {
   for (unsigned i = 0; i != 4; ++i) {
     SmallVector<int, 16> UnpackMask;
     createUnpackShuffleMask(VT, UnpackMask, (i >> 1) % 2, i % 2);
-    if (isTargetShuffleEquivalent(VT, Mask, UnpackMask) ||
-        isTargetShuffleEquivalent(VT, CommutedMask, UnpackMask))
+    if (isTargetShuffleEquivalent(VT, Mask, UnpackMask, DAG) ||
+        isTargetShuffleEquivalent(VT, CommutedMask, UnpackMask, DAG))
       return true;
   }
   return false;
@@ -12021,7 +12084,7 @@ static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
   // Attempt to match the target mask against the unpack lo/hi mask patterns.
   SmallVector<int, 64> Unpckl, Unpckh;
   createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
-  if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, V1,
+  if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, DAG, V1,
                                 (IsUnary ? V1 : V2))) {
     UnpackOpcode = X86ISD::UNPCKL;
     V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
@@ -12030,7 +12093,7 @@ static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
   }
 
   createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
-  if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, V1,
+  if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, DAG, V1,
                                 (IsUnary ? V1 : V2))) {
     UnpackOpcode = X86ISD::UNPCKH;
     V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
@@ -12069,14 +12132,14 @@ static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
   // If a binary shuffle, commute and try again.
   if (!IsUnary) {
     ShuffleVectorSDNode::commuteMask(Unpckl);
-    if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl)) {
+    if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, DAG)) {
      UnpackOpcode = X86ISD::UNPCKL;
      std::swap(V1, V2);
      return true;
    }
 
    ShuffleVectorSDNode::commuteMask(Unpckh);
-    if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh)) {
+    if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, DAG)) {
      UnpackOpcode = X86ISD::UNPCKH;
      std::swap(V1, V2);
      return true;
@@ -12464,14 +12527,14 @@ static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2,
     // Try binary shuffle.
     SmallVector<int, 32> BinaryMask;
     createPackShuffleMask(VT, BinaryMask, false, NumStages);
-    if (isTargetShuffleEquivalent(VT, TargetMask, BinaryMask, V1, V2))
+    if (isTargetShuffleEquivalent(VT, TargetMask, BinaryMask, DAG, V1, V2))
       if (MatchPACK(V1, V2, PackVT))
         return true;
 
     // Try unary shuffle.
     SmallVector<int, 32> UnaryMask;
     createPackShuffleMask(VT, UnaryMask, true, NumStages);
-    if (isTargetShuffleEquivalent(VT, TargetMask, UnaryMask, V1))
+    if (isTargetShuffleEquivalent(VT, TargetMask, UnaryMask, DAG, V1))
       if (MatchPACK(V1, V1, PackVT))
         return true;
   }
@@ -14283,7 +14346,7 @@ static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0,
   // and a simple narrow shuffle. Prefer extract+unpack(h/l)ps to vpermps
   // because that avoids a constant load from memory.
   if (NumElts == 4 &&
-      (isSingleSHUFPSMask(NewMask) || is128BitUnpackShuffleMask(NewMask)))
+      (isSingleSHUFPSMask(NewMask) || is128BitUnpackShuffleMask(NewMask, DAG)))
     return SDValue();
 
   // Extend the shuffle mask with undef elements.
@@ -17230,7 +17293,7 @@ static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1,
       if (Subtarget.hasAVX2()) {
         // extract128 + vunpckhps/vshufps is better than vblend + vpermps.
         if (EltWidth == 32 && NumLowerHalves && HalfVT.is128BitVector() &&
-            !is128BitUnpackShuffleMask(HalfMask) &&
+            !is128BitUnpackShuffleMask(HalfMask, DAG) &&
             (!isSingleSHUFPSMask(HalfMask) ||
              Subtarget.hasFastVariableCrossLaneShuffle()))
           return SDValue();
@@ -17892,7 +17955,7 @@ static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
   // For non-AVX512, if the Mask is of 16-bit elements in lane then try to
   // split, since after splitting we get more efficient code using vpunpcklwd
   // and vpunpckhwd instrs than vblend.
-  if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32))
+  if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32, DAG))
     return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Subtarget,
                                       DAG);
 
@@ -17930,7 +17993,7 @@ static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
   // For non-AVX512, if the Mask is of 16-bit elements in lane then try to
   // split, since after splitting we get more efficient code than vblend by
   // using vpunpcklwd and vpunpckhwd instrs.
-  if (isUnpackWdShuffleMask(Mask, MVT::v8i32) && !V2.isUndef() &&
+  if (isUnpackWdShuffleMask(Mask, MVT::v8i32, DAG) && !V2.isUndef() &&
       !Subtarget.hasAVX512())
     return lowerShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, Subtarget,
                                       DAG);
@@ -27887,11 +27950,14 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
   }
   // Read Performance Monitoring Counters.
   case RDPMC:
+  // Read Processor Register.
+  case RDPRU:
   // Get Extended Control Register.
   case XGETBV: {
     SmallVector<SDValue, 2> Results;
 
     // RDPMC uses ECX to select the index of the performance counter to read.
+    // RDPRU uses ECX to select the processor register to read.
     // XGETBV uses ECX to select the index of the XCR register to return.
     // The result is stored into registers EDX:EAX.
     expandIntrinsicWChainHelper(Op.getNode(), dl, DAG, IntrData->Opc0, X86::ECX,
@@ -29902,14 +29968,12 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
       SDValue Amt01 = DAG.getBitcast(MVT::v8i16, Amt);
       SDValue Amt23 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
                                            {4, 5, 6, 7, -1, -1, -1, -1});
-      Amt0 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
-                                  {0, 1, 1, 1, -1, -1, -1, -1});
-      Amt1 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
-                                  {2, 3, 3, 3, -1, -1, -1, -1});
-      Amt2 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt23, Amt23,
-                                  {0, 1, 1, 1, -1, -1, -1, -1});
-      Amt3 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt23, Amt23,
-                                  {2, 3, 3, 3, -1, -1, -1, -1});
+      SDValue Msk02 = getV4X86ShuffleImm8ForMask({0, 1, 1, 1}, dl, DAG);
+      SDValue Msk13 = getV4X86ShuffleImm8ForMask({2, 3, 3, 3}, dl, DAG);
+      Amt0 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt01, Msk02);
+      Amt1 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt01, Msk13);
+      Amt2 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt23, Msk02);
+      Amt3 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt23, Msk13);
     }
   }
 
@@ -30797,6 +30861,8 @@ X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
   case AtomicRMWInst::UMin:
   case AtomicRMWInst::FAdd:
   case AtomicRMWInst::FSub:
+  case AtomicRMWInst::FMax:
+  case AtomicRMWInst::FMin:
     // These always require a non-trivial set of data operations on x86. We must
     // use a cmpxchg loop.
     return AtomicExpansionKind::CmpXChg;
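With FMax/FMin added to this switch, floating-point atomic max/min now lower on
x86 through the generic cmpxchg-loop expansion rather than being unsupported. A
minimal IR sketch of the input this enables:

    define float @atomic_fmax(ptr %p, float %x) {
      ; AtomicExpand rewrites this into a load + max + cmpxchg retry loop,
      ; per the AtomicExpansionKind::CmpXChg choice above.
      %old = atomicrmw fmax ptr %p, float %x seq_cst
      ret float %old
    }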
@@ -32894,6 +32960,10 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
     expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPMC, X86::ECX, Subtarget,
                                 Results);
     return;
+  case Intrinsic::x86_rdpru:
+    expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPRU, X86::ECX, Subtarget,
+                                Results);
+    return;
   case Intrinsic::x86_xgetbv:
     expandIntrinsicWChainHelper(N, dl, DAG, X86::XGETBV, X86::ECX, Subtarget,
                                 Results);
@@ -36985,8 +37055,9 @@ static SDValue narrowLoadToVZLoad(LoadSDNode *LN, MVT MemVT, MVT VT,
 // TODO: Investigate sharing more of this with shuffle lowering.
 static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
                               bool AllowFloatDomain, bool AllowIntDomain,
-                              SDValue V1, const X86Subtarget &Subtarget,
-                              unsigned &Shuffle, MVT &SrcVT, MVT &DstVT) {
+                              SDValue V1, const SelectionDAG &DAG,
+                              const X86Subtarget &Subtarget, unsigned &Shuffle,
+                              MVT &SrcVT, MVT &DstVT) {
   unsigned NumMaskElts = Mask.size();
   unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
 
@@ -37057,17 +37128,17 @@ static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
   // instructions are no slower than UNPCKLPD but have the option to
   // fold the input operand into even an unaligned memory load.
   if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
-    if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, V1)) {
+    if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, DAG, V1)) {
       Shuffle = X86ISD::MOVDDUP;
       SrcVT = DstVT = MVT::v2f64;
       return true;
     }
-    if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, V1)) {
+    if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, DAG, V1)) {
       Shuffle = X86ISD::MOVSLDUP;
       SrcVT = DstVT = MVT::v4f32;
       return true;
     }
-    if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3}, V1)) {
+    if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3}, DAG, V1)) {
       Shuffle = X86ISD::MOVSHDUP;
       SrcVT = DstVT = MVT::v4f32;
       return true;
@@ -37076,17 +37147,19 @@ static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
 
   if (MaskVT.is256BitVector() && AllowFloatDomain) {
     assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
-    if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, V1)) {
+    if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, DAG, V1)) {
       Shuffle = X86ISD::MOVDDUP;
       SrcVT = DstVT = MVT::v4f64;
       return true;
     }
-    if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1)) {
+    if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, DAG,
+                                  V1)) {
       Shuffle = X86ISD::MOVSLDUP;
       SrcVT = DstVT = MVT::v8f32;
       return true;
     }
-    if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3, 5, 5, 7, 7}, V1)) {
+    if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3, 5, 5, 7, 7}, DAG,
+                                  V1)) {
       Shuffle = X86ISD::MOVSHDUP;
       SrcVT = DstVT = MVT::v8f32;
       return true;
@@ -37096,21 +37169,22 @@ static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
   if (MaskVT.is512BitVector() && AllowFloatDomain) {
     assert(Subtarget.hasAVX512() &&
            "AVX512 required for 512-bit vector shuffles");
-    if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1)) {
+    if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, DAG,
+                                  V1)) {
       Shuffle = X86ISD::MOVDDUP;
       SrcVT = DstVT = MVT::v8f64;
       return true;
     }
     if (isTargetShuffleEquivalent(
             MaskVT, Mask,
-            {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}, V1)) {
+            {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}, DAG, V1)) {
       Shuffle = X86ISD::MOVSLDUP;
       SrcVT = DstVT = MVT::v16f32;
       return true;
     }
     if (isTargetShuffleEquivalent(
             MaskVT,
             Mask,
-            {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}, V1)) {
+            {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}, DAG, V1)) {
       Shuffle = X86ISD::MOVSHDUP;
       SrcVT = DstVT = MVT::v16f32;
       return true;
@@ -37126,6 +37200,7 @@ static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
 static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef<int> Mask,
                                      const APInt &Zeroable,
                                      bool AllowFloatDomain, bool AllowIntDomain,
+                                     const SelectionDAG &DAG,
                                      const X86Subtarget &Subtarget,
                                      unsigned &Shuffle, MVT &ShuffleVT,
                                      unsigned &PermuteImm) {
@@ -37269,33 +37344,36 @@ static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
   unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
 
   if (MaskVT.is128BitVector()) {
-    if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}) && AllowFloatDomain) {
+    if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, DAG) &&
+        AllowFloatDomain) {
       V2 = V1;
       V1 = (SM_SentinelUndef == Mask[0] ? DAG.getUNDEF(MVT::v4f32) : V1);
       Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKL : X86ISD::MOVLHPS;
       SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
       return true;
     }
-    if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1}) && AllowFloatDomain) {
+    if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1}, DAG) &&
+        AllowFloatDomain) {
       V2 = V1;
       Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKH : X86ISD::MOVHLPS;
       SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
       return true;
     }
-    if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 3}) &&
+    if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 3}, DAG) &&
         Subtarget.hasSSE2() && (AllowFloatDomain || !Subtarget.hasSSE41())) {
       std::swap(V1, V2);
       Shuffle = X86ISD::MOVSD;
       SrcVT = DstVT = MVT::v2f64;
       return true;
     }
-    if (isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}) &&
+    if (isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}, DAG) &&
        (AllowFloatDomain || !Subtarget.hasSSE41())) {
      Shuffle = X86ISD::MOVSS;
      SrcVT = DstVT = MVT::v4f32;
      return true;
    }
-    if (isTargetShuffleEquivalent(MaskVT, Mask, {8, 1, 2, 3, 4, 5, 6, 7}) &&
+    if (isTargetShuffleEquivalent(MaskVT, Mask, {8, 1, 2, 3, 4, 5, 6, 7},
+                                  DAG) &&
         Subtarget.hasFP16()) {
       Shuffle = X86ISD::MOVSH;
       SrcVT = DstVT = MVT::v8f16;
@@ -37678,7 +37756,8 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
       scaleShuffleElements(Mask, NumElts, ScaledMask)) {
     for (unsigned i = 0; i != NumElts; ++i)
       IdentityMask.push_back(i);
-    if (isTargetShuffleEquivalent(RootVT, ScaledMask, IdentityMask, V1, V2))
+    if (isTargetShuffleEquivalent(RootVT, ScaledMask, IdentityMask, DAG, V1,
+                                  V2))
       return CanonicalizeShuffleInput(RootVT, V1);
   }
 }
@@ -37902,7 +37981,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
   }
 
   if (matchUnaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, V1,
-                        Subtarget, Shuffle, ShuffleSrcVT, ShuffleVT) &&
+                        DAG, Subtarget, Shuffle, ShuffleSrcVT, ShuffleVT) &&
       (!IsMaskedShuffle ||
        (NumRootElts == ShuffleVT.getVectorNumElements()))) {
     if (Depth == 0 && Root.getOpcode() == Shuffle)
@@ -37913,7 +37992,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
   }
 
   if (matchUnaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
-                               AllowIntDomain, Subtarget, Shuffle, ShuffleVT,
+                               AllowIntDomain, DAG, Subtarget, Shuffle, ShuffleVT,
                                PermuteImm) &&
       (!IsMaskedShuffle ||
       (NumRootElts == ShuffleVT.getVectorNumElements()))) {
@@ -37931,7 +38010,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
 
   // TODO: Handle other insertions here as well?
   if (!UnaryShuffle && AllowFloatDomain && RootSizeInBits == 128 &&
       Subtarget.hasSSE41() &&
-      !isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3})) {
+      !isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}, DAG)) {
     if (MaskEltSizeInBits == 32) {
       SDValue SrcV1 = V1, SrcV2 = V2;
       if (matchShuffleAsInsertPS(SrcV1, SrcV2, PermuteImm, Zeroable, Mask,
@@ -37947,12 +38026,12 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
       }
     }
     if (MaskEltSizeInBits == 64 &&
-        isTargetShuffleEquivalent(MaskVT, Mask, {0, 2}) &&
+        isTargetShuffleEquivalent(MaskVT, Mask, {0, 2}, DAG) &&
         V2.getOpcode() == ISD::SCALAR_TO_VECTOR &&
         V2.getScalarValueSizeInBits() <= 32) {
       if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS)
         return SDValue(); // Nothing to do!
-      PermuteImm = (/*DstIdx*/2 << 4) | (/*SrcIdx*/0 << 0);
+      PermuteImm = (/*DstIdx*/ 2 << 4) | (/*SrcIdx*/ 0 << 0);
       Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
                         CanonicalizeShuffleInput(MVT::v4f32, V1),
                         CanonicalizeShuffleInput(MVT::v4f32, V2),
@@ -51654,9 +51733,13 @@ static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
   // Use XOR (plus OR) and PTEST after SSE4.1 for 128/256-bit operands.
   // Use PCMPNEQ (plus OR) and KORTEST for 512-bit operands.
   // Otherwise use PCMPEQ (plus AND) and mask testing.
-  if ((OpSize == 128 && Subtarget.hasSSE2()) ||
-      (OpSize == 256 && Subtarget.hasAVX()) ||
-      (OpSize == 512 && Subtarget.useAVX512Regs())) {
+  bool NoImplicitFloatOps =
+      DAG.getMachineFunction().getFunction().hasFnAttribute(
+          Attribute::NoImplicitFloat);
+  if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
+      ((OpSize == 128 && Subtarget.hasSSE2()) ||
+       (OpSize == 256 && Subtarget.hasAVX()) ||
+       (OpSize == 512 && Subtarget.useAVX512Regs()))) {
     bool HasPT = Subtarget.hasSSE41();
 
     // PTEST and MOVMSK are slow on Knights Landing and Knights Mill and widened
diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td
index a55b95960aa6..6124755ca539 100644
--- a/llvm/lib/Target/X86/X86InstrCompiler.td
+++ b/llvm/lib/Target/X86/X86InstrCompiler.td
@@ -1532,44 +1532,6 @@ def : Pat<(xor GR32:$src1, -2147483648),
 }
 
 //===----------------------------------------------------------------------===//
-// Pattern match SUB as XOR
-//===----------------------------------------------------------------------===//
-
-// An immediate in the LHS of a subtract can't be encoded in the instruction.
-// If there is no possibility of a borrow we can use an XOR instead of a SUB
-// to enable the immediate to be folded.
-// TODO: Move this to a DAG combine?
-
-def sub_is_xor : PatFrag<(ops node:$lhs, node:$rhs), (sub node:$lhs, node:$rhs),[{
-  if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N->getOperand(0))) {
-    KnownBits Known = CurDAG->computeKnownBits(N->getOperand(1));
-
-    // If all possible ones in the RHS are set in the LHS then there can't be
-    // a borrow and we can use xor.
-    return (~Known.Zero).isSubsetOf(CN->getAPIntValue());
-  }
-
-  return false;
-}]>;
-
-let AddedComplexity = 5 in {
-def : Pat<(sub_is_xor imm:$src2, GR8:$src1),
-          (XOR8ri GR8:$src1, imm:$src2)>;
-def : Pat<(sub_is_xor i16immSExt8:$src2, GR16:$src1),
-          (XOR16ri8 GR16:$src1, i16immSExt8:$src2)>;
-def : Pat<(sub_is_xor imm:$src2, GR16:$src1),
-          (XOR16ri GR16:$src1, imm:$src2)>;
-def : Pat<(sub_is_xor i32immSExt8:$src2, GR32:$src1),
-          (XOR32ri8 GR32:$src1, i32immSExt8:$src2)>;
-def : Pat<(sub_is_xor imm:$src2, GR32:$src1),
-          (XOR32ri GR32:$src1, imm:$src2)>;
-def : Pat<(sub_is_xor i64immSExt8:$src2, GR64:$src1),
-          (XOR64ri8 GR64:$src1, i64immSExt8:$src2)>;
-def : Pat<(sub_is_xor i64immSExt32:$src2, GR64:$src1),
-          (XOR64ri32 GR64:$src1, i64immSExt32:$src2)>;
-}
-
-//===----------------------------------------------------------------------===//
 // Some peepholes
 //===----------------------------------------------------------------------===//
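For readers tracking the sub_is_xor patterns removed above: the guard encodes a
no-borrow condition. When the immediate LHS has a one in every bit position the
RHS could possibly set, no bit position borrows, so imm - x equals imm ^ x bit
for bit. A small self-check with hypothetical values:

    #include <cassert>
    #include <cstdint>

    int main() {
      // Suppose known-bits analysis proves x only uses the low 8 bits
      // (Known.Zero covers bits 8 and up). 0xFF then covers every bit x
      // can set, so the subtraction never borrows and SUB equals XOR.
      for (uint32_t x = 0; x <= 0xFFu; ++x)
        assert(0xFFu - x == (0xFFu ^ x));
      return 0;
    }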
diff --git a/llvm/lib/Target/X86/X86InstrFMA3Info.cpp b/llvm/lib/Target/X86/X86InstrFMA3Info.cpp
index 52b2a62316cd..c4317be664fd 100644
--- a/llvm/lib/Target/X86/X86InstrFMA3Info.cpp
+++ b/llvm/lib/Target/X86/X86InstrFMA3Info.cpp
@@ -13,8 +13,8 @@
 #include "X86InstrFMA3Info.h"
 #include "X86InstrInfo.h"
-#include "llvm/Support/ManagedStatic.h"
 #include "llvm/Support/Threading.h"
+#include <atomic>
 #include <cassert>
 #include <cstdint>
 
diff --git a/llvm/lib/Target/X86/X86InstrFoldTables.cpp b/llvm/lib/Target/X86/X86InstrFoldTables.cpp
index 27220a8d4d99..8aeb169929f2 100644
--- a/llvm/lib/Target/X86/X86InstrFoldTables.cpp
+++ b/llvm/lib/Target/X86/X86InstrFoldTables.cpp
@@ -13,6 +13,7 @@
 #include "X86InstrFoldTables.h"
 #include "X86InstrInfo.h"
 #include "llvm/ADT/STLExtras.h"
+#include <atomic>
 #include <vector>
 
 using namespace llvm;
@@ -6102,7 +6103,7 @@ llvm::lookupFoldTable(unsigned RegOp, unsigned OpNum) {
 
 namespace {
 // This class stores the memory unfolding tables. It is instantiated as a
-// ManagedStatic to lazily init the unfolding table.
+// function-scope static variable to lazily init the unfolding table.
 struct X86MemUnfoldTable {
   // Stores memory unfolding table entries sorted by opcode.
   std::vector<X86MemoryFoldTableEntry> Table;
@@ -6159,11 +6160,10 @@ struct X86MemUnfoldTable {
 };
 }
 
-static ManagedStatic<X86MemUnfoldTable> MemUnfoldTable;
-
 const X86MemoryFoldTableEntry *
 llvm::lookupUnfoldTable(unsigned MemOp) {
-  auto &Table = MemUnfoldTable->Table;
+  static X86MemUnfoldTable MemUnfoldTable;
+  auto &Table = MemUnfoldTable.Table;
   auto I = llvm::lower_bound(Table, MemOp);
   if (I != Table.end() && I->KeyOp == MemOp)
     return &*I;
diff --git a/llvm/lib/Target/X86/X86InstrInfo.td b/llvm/lib/Target/X86/X86InstrInfo.td
index 7f6ef3479d40..4a9a281d5b99 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.td
+++ b/llvm/lib/Target/X86/X86InstrInfo.td
@@ -978,6 +978,7 @@ def HasCLFLUSHOPT : Predicate<"Subtarget->hasCLFLUSHOPT()">;
 def HasCLWB      : Predicate<"Subtarget->hasCLWB()">;
 def HasWBNOINVD  : Predicate<"Subtarget->hasWBNOINVD()">;
 def HasRDPID     : Predicate<"Subtarget->hasRDPID()">;
+def HasRDPRU     : Predicate<"Subtarget->hasRDPRU()">;
 def HasWAITPKG   : Predicate<"Subtarget->hasWAITPKG()">;
 def HasINVPCID   : Predicate<"Subtarget->hasINVPCID()">;
 def HasCX8       : Predicate<"Subtarget->hasCX8()">;
diff --git a/llvm/lib/Target/X86/X86InstrSystem.td b/llvm/lib/Target/X86/X86InstrSystem.td
index 3a653a56e534..b1ca87279007 100644
--- a/llvm/lib/Target/X86/X86InstrSystem.td
+++ b/llvm/lib/Target/X86/X86InstrSystem.td
@@ -735,6 +735,15 @@ def PTWRITE64r : RI<0xAE, MRM4r, (outs), (ins GR64:$dst),
 } // SchedRW
 
 //===----------------------------------------------------------------------===//
+// RDPRU - Read Processor Register instruction.
+
+let SchedRW = [WriteSystem] in {
+let Uses = [ECX], Defs = [EAX, EDX] in
+  def RDPRU : I<0x01, MRM_FD, (outs), (ins), "rdpru", []>, PS,
+              Requires<[HasRDPRU]>;
+}
+
+//===----------------------------------------------------------------------===//
 // Platform Configuration instruction
 
 // From ISA docs:
diff --git a/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
index 3c8be95b43e3..6112c0b7d6c3 100644
--- a/llvm/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
@@ -37,7 +37,7 @@ enum IntrinsicType : uint16_t {
   TRUNCATE_TO_REG, CVTPS2PH_MASK, CVTPD2DQ_MASK, CVTQQ2PS_MASK,
   TRUNCATE_TO_MEM_VI8, TRUNCATE_TO_MEM_VI16, TRUNCATE_TO_MEM_VI32,
   FIXUPIMM, FIXUPIMM_MASKZ, GATHER_AVX2,
-  ROUNDP, ROUNDS
+  ROUNDP, ROUNDS, RDPRU
 };
 
 struct IntrinsicData {
@@ -309,6 +309,7 @@ static const IntrinsicData IntrinsicsWithChain[] = {
   X86_INTRINSIC_DATA(avx512_scattersiv8_sf, SCATTER, 0, 0),
   X86_INTRINSIC_DATA(avx512_scattersiv8_si, SCATTER, 0, 0),
   X86_INTRINSIC_DATA(rdpmc, RDPMC, X86::RDPMC, 0),
+  X86_INTRINSIC_DATA(rdpru, RDPRU, X86::RDPRU, 0),
   X86_INTRINSIC_DATA(rdrand_16, RDRAND, X86ISD::RDRAND, 0),
   X86_INTRINSIC_DATA(rdrand_32, RDRAND, X86ISD::RDRAND, 0),
   X86_INTRINSIC_DATA(rdrand_64, RDRAND, X86ISD::RDRAND, 0),
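Putting the RDPRU plumbing together: user code reaches the new instruction
through a clang builtin; ECX selects the register, and the 64-bit result is
assembled from EDX:EAX, as the LowerINTRINSIC_W_CHAIN comment above notes. A
hedged sketch (the builtin spelling and the MPERF/APERF indices follow AMD's
documentation of RDPRU; treat both as assumptions). Build with -mrdpru:

    #include <cstdint>

    uint64_t read_mperf() {
      return __builtin_ia32_rdpru(0); // ECX = 0: MPERF (per AMD docs)
    }

    uint64_t read_aperf() {
      return __builtin_ia32_rdpru(1); // ECX = 1: APERF (per AMD docs)
    }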
diff --git a/llvm/lib/Target/X86/X86MCInstLower.cpp b/llvm/lib/Target/X86/X86MCInstLower.cpp
index b107de692365..3fbdb18a0793 100644
--- a/llvm/lib/Target/X86/X86MCInstLower.cpp
+++ b/llvm/lib/Target/X86/X86MCInstLower.cpp
@@ -2413,6 +2413,10 @@ static void addConstantComments(const MachineInstr *MI,
 }
 
 void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
+  // FIXME: Enable feature predicate checks once all the tests pass.
+  // X86_MC::verifyInstructionPredicates(MI->getOpcode(),
+  //                                     Subtarget->getFeatureBits());
+
   X86MCInstLower MCInstLowering(*MF, *this);
   const X86RegisterInfo *RI =
       MF->getSubtarget<X86Subtarget>().getRegisterInfo();
diff --git a/llvm/lib/Target/X86/X86PartialReduction.cpp b/llvm/lib/Target/X86/X86PartialReduction.cpp
index 7761f7323358..c760a32e2579 100644
--- a/llvm/lib/Target/X86/X86PartialReduction.cpp
+++ b/llvm/lib/Target/X86/X86PartialReduction.cpp
@@ -439,8 +439,8 @@ static void collectLeaves(Value *Root, SmallVectorImpl<Instruction *> &Leaves) {
   while (!Worklist.empty()) {
     Value *V = Worklist.pop_back_val();
-      if (!Visited.insert(V).second)
-        continue;
+    if (!Visited.insert(V).second)
+      continue;
 
     if (auto *PN = dyn_cast<PHINode>(V)) {
       // PHI node should have single use unless it is the root node, then it
@@ -466,7 +466,7 @@ static void collectLeaves(Value *Root, SmallVectorImpl<Instruction *> &Leaves) {
       // gets us back to this node.
       if (BO->hasNUses(BO == Root ? 3 : 2)) {
         PHINode *PN = nullptr;
-        for (auto *U : Root->users())
+        for (auto *U : BO->users())
           if (auto *P = dyn_cast<PHINode>(U))
             if (!Visited.count(P))
               PN = P;
diff --git a/llvm/lib/Target/X86/X86ReturnThunks.cpp b/llvm/lib/Target/X86/X86ReturnThunks.cpp
new file mode 100644
index 000000000000..4b203229ba83
--- /dev/null
+++ b/llvm/lib/Target/X86/X86ReturnThunks.cpp
@@ -0,0 +1,92 @@
+//===-- X86ReturnThunks.cpp - Replace rets with thunks or inline thunks --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// Pass that replaces ret instructions with a jmp to __x86_return_thunk.
+///
+/// This corresponds to -mfunction-return=thunk-extern or
+/// __attribute__((function_return("thunk-extern"))).
+///
+/// This pass is a minimal implementation necessary to help mitigate
+/// RetBleed for the Linux kernel.
+///
+/// Should support for thunk or thunk-inline be necessary in the future, this
+/// pass should be combined with x86-retpoline-thunks, which already has
+/// machinery to emit thunks. Until then, YAGNI.
+///
+/// This pass is very similar to x86-lvi-ret.
+///
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define PASS_KEY "x86-return-thunks"
+#define DEBUG_TYPE PASS_KEY
+
+struct X86ReturnThunks final : public MachineFunctionPass {
+  static char ID;
+  X86ReturnThunks() : MachineFunctionPass(ID) {}
+  StringRef getPassName() const override { return "X86 Return Thunks"; }
+  bool runOnMachineFunction(MachineFunction &MF) override;
+};
+
+char X86ReturnThunks::ID = 0;
+
+bool X86ReturnThunks::runOnMachineFunction(MachineFunction &MF) {
+  LLVM_DEBUG(dbgs() << getPassName() << "\n");
+
+  bool Modified = false;
+
+  if (!MF.getFunction().hasFnAttribute(llvm::Attribute::FnRetThunkExtern))
+    return Modified;
+
+  StringRef ThunkName = "__x86_return_thunk";
+  if (MF.getFunction().getName() == ThunkName)
+    return Modified;
+
+  const auto &ST = MF.getSubtarget<X86Subtarget>();
+  const bool Is64Bit = ST.getTargetTriple().getArch() == Triple::x86_64;
+  const unsigned RetOpc = Is64Bit ? X86::RET64 : X86::RET32;
+  SmallVector<MachineInstr *, 16> Rets;
+
+  for (MachineBasicBlock &MBB : MF)
+    for (MachineInstr &Term : MBB.terminators())
+      if (Term.getOpcode() == RetOpc)
+        Rets.push_back(&Term);
+
+  const MCInstrDesc &JMP = ST.getInstrInfo()->get(X86::TAILJMPd);
+
+  for (MachineInstr *Ret : Rets) {
+    BuildMI(Ret->getParent(), Ret->getDebugLoc(), JMP)
+        .addExternalSymbol(ThunkName.data());
+    Ret->eraseFromParent();
+    Modified = true;
+  }
+
+  return Modified;
+}
+
+INITIALIZE_PASS(X86ReturnThunks, PASS_KEY, "X86 Return Thunks", false, false)
+
+FunctionPass *llvm::createX86ReturnThunksPass() {
+  return new X86ReturnThunks();
+}
diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp
index 4249788e3540..f4e25e4194db 100644
--- a/llvm/lib/Target/X86/X86TargetMachine.cpp
+++ b/llvm/lib/Target/X86/X86TargetMachine.cpp
@@ -100,6 +100,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeX86Target() {
   initializeX86OptimizeLEAPassPass(PR);
   initializeX86PartialReductionPass(PR);
   initializePseudoProbeInserterPass(PR);
+  initializeX86ReturnThunksPass(PR);
 }
 
 static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
@@ -575,6 +576,7 @@ void X86PassConfig::addPreEmitPass2() {
   // hand inspection of the codegen output.
   addPass(createX86SpeculativeExecutionSideEffectSuppression());
   addPass(createX86IndirectThunksPass());
+  addPass(createX86ReturnThunksPass());
 
   // Insert extra int3 instructions after trailing call instructions to avoid
   // issues in the unwinder.
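The observable effect of the new pass, sketched on a trivial function
(assembly abbreviated and illustrative):

    # Before: a function compiled normally returns with ret.
    foo:
            movl    $1, %eax
            retq

    # After: under -mfunction-return=thunk-extern (fn_ret_thunk_extern in
    # IR), each ret becomes a tail jump to a thunk that the kernel provides
    # at link time.
    foo:
            movl    $1, %eax
            jmp     __x86_return_thunk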
diff --git a/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp b/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp
index c286b747a271..a782ff436dc0 100644
--- a/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp
+++ b/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp
@@ -29,6 +29,7 @@ using namespace llvm;
 
 #define GET_INSTRINFO_MC_DESC
+#define ENABLE_INSTR_PREDICATE_VERIFIER
 #include "XCoreGenInstrInfo.inc"
 
 #define GET_SUBTARGETINFO_MC_DESC
diff --git a/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.h b/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.h
index 096b22415a22..ec4418333859 100644
--- a/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.h
+++ b/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.h
@@ -22,6 +22,7 @@
 // Defines symbolic names for the XCore instructions.
 //
 #define GET_INSTRINFO_ENUM
+#define GET_INSTRINFO_MC_HELPER_DECLS
 #include "XCoreGenInstrInfo.inc"
 
 #define GET_SUBTARGETINFO_ENUM
diff --git a/llvm/lib/Target/XCore/XCoreAsmPrinter.cpp b/llvm/lib/Target/XCore/XCoreAsmPrinter.cpp
index 8fea61d125d2..691fdf16bc0f 100644
--- a/llvm/lib/Target/XCore/XCoreAsmPrinter.cpp
+++ b/llvm/lib/Target/XCore/XCoreAsmPrinter.cpp
@@ -256,6 +256,9 @@ bool XCoreAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
 }
 
 void XCoreAsmPrinter::emitInstruction(const MachineInstr *MI) {
+  XCore_MC::verifyInstructionPredicates(MI->getOpcode(),
+                                        getSubtargetInfo().getFeatureBits());
+
   SmallString<128> Str;
   raw_svector_ostream O(Str);
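Finally, on the verifyInstructionPredicates calls this patch threads through
the AsmPrinters: with ENABLE_INSTR_PREDICATE_VERIFIER defined, TableGen emits a
per-target checker into <Target>GenInstrInfo.inc. The sketch below only
illustrates the shape; the real generated table layout and names differ, and
the stand-in table here is hypothetical:

    #include "llvm/MC/SubtargetFeature.h"
    #include <cassert>

    namespace Example_MC {
    using llvm::FeatureBitset;

    // Stand-in for the TableGen-emitted per-opcode required-feature table,
    // derived from each instruction's Requires<> list in the .td files.
    static const FeatureBitset RequiredFeatures[] = {
        /* opcode 0 */ FeatureBitset(),
        // ... one entry per opcode ...
    };

    void verifyInstructionPredicates(unsigned Opcode,
                                     const FeatureBitset &AvailableFeatures) {
    #ifndef NDEBUG
      // Any required feature missing from the subtarget is a bug in the
      // caller: it is trying to emit an instruction it cannot encode.
      FeatureBitset Missing =
          (AvailableFeatures & RequiredFeatures[Opcode]) ^
          RequiredFeatures[Opcode];
      assert(Missing.none() &&
             "Attempting to emit an instruction the subtarget cannot encode");
    #endif
    }
    } // namespace Example_MC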