Diffstat (limited to 'llvm/lib/Target')
88 files changed, 1930 insertions, 868 deletions
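Among other changes, the AArch64 part of this diff adds code generation support for the MOPS (memory operations) instructions, including a new llvm.aarch64.mops.memset.tag intrinsic that is lowered on both the SelectionDAG path (LowerINTRINSIC_W_CHAIN / EmitMOPS) and the GlobalISel path. As an illustrative sketch only, not part of the patch, and with the IR signature inferred from the lowering code in the diff, a use of the tagged-set intrinsic would look roughly like:

  ; Sets and tags %size bytes at %dst with %val, returning the updated
  ; destination pointer; the size writeback of the underlying SETG* sequence
  ; is not exposed by the intrinsic.
  declare i8* @llvm.aarch64.mops.memset.tag(i8*, i8, i64)

  define i8* @memset_tagged(i8* %dst, i8 %val, i64 %size) {
    %r = call i8* @llvm.aarch64.mops.memset.tag(i8* %dst, i8 %val, i64 %size)
    ret i8* %r
  }

With +mops and +mte this is selected to the MOPSMemorySetTaggingPseudo introduced below, which the AsmPrinter expands to the setgp/setgm/setge instruction triple.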
diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td index b87468d5c8de..9a04b28a8b8f 100644 --- a/llvm/lib/Target/AArch64/AArch64.td +++ b/llvm/lib/Target/AArch64/AArch64.td @@ -972,6 +972,10 @@ def ProcessorFeatures { list<SubtargetFeature> X1 = [HasV8_2aOps, FeatureCrypto, FeatureFPARMv8, FeatureNEON, FeatureRCPC, FeaturePerfMon, FeatureSPE, FeatureFullFP16, FeatureDotProd]; + list<SubtargetFeature> X1C = [HasV8_2aOps, FeatureCrypto, FeatureFPARMv8, + FeatureNEON, FeatureRCPC, FeaturePerfMon, + FeatureSPE, FeatureFullFP16, FeatureDotProd, + FeaturePAuth]; list<SubtargetFeature> X2 = [HasV9_0aOps, FeatureNEON, FeaturePerfMon, FeatureMatMulInt8, FeatureBF16, FeatureAM, FeatureMTE, FeatureETE, FeatureSVE2BitPerm, @@ -1086,6 +1090,8 @@ def : ProcessorModel<"cortex-r82", CortexA55Model, ProcessorFeatures.R82, [TuneR82]>; def : ProcessorModel<"cortex-x1", CortexA57Model, ProcessorFeatures.X1, [TuneX1]>; +def : ProcessorModel<"cortex-x1c", CortexA57Model, ProcessorFeatures.X1C, + [TuneX1]>; def : ProcessorModel<"cortex-x2", CortexA57Model, ProcessorFeatures.X2, [TuneX2]>; def : ProcessorModel<"neoverse-e1", CortexA53Model, diff --git a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp index 85a9c04a3fef..b54a0eaba7d1 100644 --- a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp +++ b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp @@ -95,6 +95,8 @@ public: void LowerJumpTableDest(MCStreamer &OutStreamer, const MachineInstr &MI); + void LowerMOPS(MCStreamer &OutStreamer, const MachineInstr &MI); + void LowerSTACKMAP(MCStreamer &OutStreamer, StackMaps &SM, const MachineInstr &MI); void LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM, @@ -936,6 +938,43 @@ void AArch64AsmPrinter::LowerJumpTableDest(llvm::MCStreamer &OutStreamer, .addImm(Size == 4 ? 
0 : 2)); } +void AArch64AsmPrinter::LowerMOPS(llvm::MCStreamer &OutStreamer, + const llvm::MachineInstr &MI) { + unsigned Opcode = MI.getOpcode(); + assert(STI->hasMOPS()); + assert(STI->hasMTE() || Opcode != AArch64::MOPSMemorySetTaggingPseudo); + + const auto Ops = [Opcode]() -> std::array<unsigned, 3> { + if (Opcode == AArch64::MOPSMemoryCopyPseudo) + return {AArch64::CPYFP, AArch64::CPYFM, AArch64::CPYFE}; + if (Opcode == AArch64::MOPSMemoryMovePseudo) + return {AArch64::CPYP, AArch64::CPYM, AArch64::CPYE}; + if (Opcode == AArch64::MOPSMemorySetPseudo) + return {AArch64::SETP, AArch64::SETM, AArch64::SETE}; + if (Opcode == AArch64::MOPSMemorySetTaggingPseudo) + return {AArch64::SETGP, AArch64::SETGM, AArch64::MOPSSETGE}; + llvm_unreachable("Unhandled memory operation pseudo"); + }(); + const bool IsSet = Opcode == AArch64::MOPSMemorySetPseudo || + Opcode == AArch64::MOPSMemorySetTaggingPseudo; + + for (auto Op : Ops) { + int i = 0; + auto MCIB = MCInstBuilder(Op); + // Destination registers + MCIB.addReg(MI.getOperand(i++).getReg()); + MCIB.addReg(MI.getOperand(i++).getReg()); + if (!IsSet) + MCIB.addReg(MI.getOperand(i++).getReg()); + // Input registers + MCIB.addReg(MI.getOperand(i++).getReg()); + MCIB.addReg(MI.getOperand(i++).getReg()); + MCIB.addReg(MI.getOperand(i++).getReg()); + + EmitToStreamer(OutStreamer, MCIB); + } +} + void AArch64AsmPrinter::LowerSTACKMAP(MCStreamer &OutStreamer, StackMaps &SM, const MachineInstr &MI) { unsigned NumNOPBytes = StackMapOpers(&MI).getNumPatchBytes(); @@ -1363,6 +1402,13 @@ void AArch64AsmPrinter::emitInstruction(const MachineInstr *MI) { emitFMov0(*MI); return; + case AArch64::MOPSMemoryCopyPseudo: + case AArch64::MOPSMemoryMovePseudo: + case AArch64::MOPSMemorySetPseudo: + case AArch64::MOPSMemorySetTaggingPseudo: + LowerMOPS(*OutStreamer, *MI); + return; + case TargetOpcode::STACKMAP: return LowerSTACKMAP(*OutStreamer, SM, *MI); diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp index 109b739528bf..b0f739cc26e6 100644 --- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -709,20 +709,24 @@ bool AArch64ExpandPseudo::expandSVESpillFill(MachineBasicBlock &MBB, bool AArch64ExpandPseudo::expandCALL_RVMARKER( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) { - // Expand CALL_RVMARKER pseudo to a branch, followed by the special `mov x29, - // x29` marker. Mark the sequence as bundle, to avoid passes moving other code - // in between. + // Expand CALL_RVMARKER pseudo to: + // - a branch to the call target, followed by + // - the special `mov x29, x29` marker, and + // - another branch, to the runtime function + // Mark the sequence as bundle, to avoid passes moving other code in between. MachineInstr &MI = *MBBI; MachineInstr *OriginalCall; - MachineOperand &CallTarget = MI.getOperand(0); + MachineOperand &RVTarget = MI.getOperand(0); + MachineOperand &CallTarget = MI.getOperand(1); assert((CallTarget.isGlobal() || CallTarget.isReg()) && "invalid operand for regular call"); + assert(RVTarget.isGlobal() && "invalid operand for attached call"); unsigned Opc = CallTarget.isGlobal() ? AArch64::BL : AArch64::BLR; OriginalCall = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc)).getInstr(); OriginalCall->addOperand(CallTarget); - unsigned RegMaskStartIdx = 1; + unsigned RegMaskStartIdx = 2; // Skip register arguments. Those are added during ISel, but are not // needed for the concrete branch. 
while (!MI.getOperand(RegMaskStartIdx).isRegMask()) { @@ -736,17 +740,22 @@ bool AArch64ExpandPseudo::expandCALL_RVMARKER( llvm::drop_begin(MI.operands(), RegMaskStartIdx)) OriginalCall->addOperand(MO); - auto *Marker = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ORRXrs)) + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ORRXrs)) .addReg(AArch64::FP, RegState::Define) .addReg(AArch64::XZR) .addReg(AArch64::FP) - .addImm(0) + .addImm(0); + + auto *RVCall = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::BL)) + .add(RVTarget) .getInstr(); + if (MI.shouldUpdateCallSiteInfo()) - MBB.getParent()->moveCallSiteInfo(&MI, Marker); + MBB.getParent()->moveCallSiteInfo(&MI, OriginalCall); + MI.eraseFromParent(); finalizeBundle(MBB, OriginalCall->getIterator(), - std::next(Marker->getIterator())); + std::next(RVCall->getIterator())); return true; } diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index a26bbc77f248..c539c8617d99 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -29,6 +29,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Triple.h" #include "llvm/ADT/Twine.h" +#include "llvm/Analysis/MemoryLocation.h" #include "llvm/Analysis/ObjCARCUtil.h" #include "llvm/Analysis/VectorUtils.h" #include "llvm/CodeGen/Analysis.h" @@ -938,19 +939,20 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, // In case of strict alignment, avoid an excessive number of byte wide stores. MaxStoresPerMemsetOptSize = 8; - MaxStoresPerMemset = Subtarget->requiresStrictAlign() - ? MaxStoresPerMemsetOptSize : 32; + MaxStoresPerMemset = + Subtarget->requiresStrictAlign() ? MaxStoresPerMemsetOptSize : 32; MaxGluedStoresPerMemcpy = 4; MaxStoresPerMemcpyOptSize = 4; - MaxStoresPerMemcpy = Subtarget->requiresStrictAlign() - ? MaxStoresPerMemcpyOptSize : 16; + MaxStoresPerMemcpy = + Subtarget->requiresStrictAlign() ? MaxStoresPerMemcpyOptSize : 16; - MaxStoresPerMemmoveOptSize = MaxStoresPerMemmove = 4; + MaxStoresPerMemmoveOptSize = 4; + MaxStoresPerMemmove = 4; MaxLoadsPerMemcmpOptSize = 4; - MaxLoadsPerMemcmp = Subtarget->requiresStrictAlign() - ? MaxLoadsPerMemcmpOptSize : 8; + MaxLoadsPerMemcmp = + Subtarget->requiresStrictAlign() ? 
MaxLoadsPerMemcmpOptSize : 8; setStackPointerRegisterToSaveRestore(AArch64::SP); @@ -1426,6 +1428,11 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv16i1, MVT::nxv16i8); } + if (Subtarget->hasMOPS() && Subtarget->hasMTE()) { + // Only required for llvm.aarch64.mops.memset.tag + setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom); + } + PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive(); } @@ -2201,7 +2208,6 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { MAKE_CASE(AArch64ISD::INSR) MAKE_CASE(AArch64ISD::PTEST) MAKE_CASE(AArch64ISD::PTRUE) - MAKE_CASE(AArch64ISD::PFALSE) MAKE_CASE(AArch64ISD::LD1_MERGE_ZERO) MAKE_CASE(AArch64ISD::LD1S_MERGE_ZERO) MAKE_CASE(AArch64ISD::LDNF1_MERGE_ZERO) @@ -2268,6 +2274,10 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { MAKE_CASE(AArch64ISD::UADDLP) MAKE_CASE(AArch64ISD::CALL_RVMARKER) MAKE_CASE(AArch64ISD::ASSERT_ZEXT_BOOL) + MAKE_CASE(AArch64ISD::MOPS_MEMSET) + MAKE_CASE(AArch64ISD::MOPS_MEMSET_TAGGING) + MAKE_CASE(AArch64ISD::MOPS_MEMCOPY) + MAKE_CASE(AArch64ISD::MOPS_MEMMOVE) } #undef MAKE_CASE return nullptr; @@ -3746,6 +3756,10 @@ SDValue AArch64TargetLowering::LowerBITCAST(SDValue Op, if (OpVT != MVT::f16 && OpVT != MVT::bf16) return SDValue(); + // Bitcasts between f16 and bf16 are legal. + if (ArgVT == MVT::f16 || ArgVT == MVT::bf16) + return Op; + assert(ArgVT == MVT::i16); SDLoc DL(Op); @@ -4056,6 +4070,39 @@ static SDValue lowerConvertToSVBool(SDValue Op, SelectionDAG &DAG) { return DAG.getNode(ISD::AND, DL, OutVT, Reinterpret, MaskReinterpret); } +SDValue AArch64TargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, + SelectionDAG &DAG) const { + unsigned IntNo = Op.getConstantOperandVal(1); + switch (IntNo) { + default: + return SDValue(); // Don't custom lower most intrinsics. + case Intrinsic::aarch64_mops_memset_tag: { + auto Node = cast<MemIntrinsicSDNode>(Op.getNode()); + SDLoc DL(Op); + SDValue Chain = Node->getChain(); + SDValue Dst = Op.getOperand(2); + SDValue Val = Op.getOperand(3); + Val = DAG.getAnyExtOrTrunc(Val, DL, MVT::i64); + SDValue Size = Op.getOperand(4); + auto Alignment = Node->getMemOperand()->getAlign(); + bool IsVol = Node->isVolatile(); + auto DstPtrInfo = Node->getPointerInfo(); + + const auto &SDI = + static_cast<const AArch64SelectionDAGInfo &>(DAG.getSelectionDAGInfo()); + SDValue MS = + SDI.EmitMOPS(AArch64ISD::MOPS_MEMSET_TAGGING, DAG, DL, Chain, Dst, Val, + Size, Alignment, IsVol, DstPtrInfo, MachinePointerInfo{}); + + // MOPS_MEMSET_TAGGING has 3 results (DstWb, SizeWb, Chain) whereas the + // intrinsic has 2. So hide SizeWb using MERGE_VALUES. Otherwise + // LowerOperationWrapper will complain that the number of results has + // changed. 
+ return DAG.getMergeValues({MS.getValue(0), MS.getValue(2)}, DL); + } + } +} + SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); @@ -5123,6 +5170,8 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op, case ISD::MULHU: return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHU_PRED, /*OverrideNEON=*/true); + case ISD::INTRINSIC_W_CHAIN: + return LowerINTRINSIC_W_CHAIN(Op, DAG); case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); case ISD::ATOMIC_STORE: @@ -6475,12 +6524,18 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, unsigned CallOpc = AArch64ISD::CALL; // Calls with operand bundle "clang.arc.attachedcall" are special. They should - // be expanded to the call, directly followed by a special marker sequence. - // Use the CALL_RVMARKER to do that. + // be expanded to the call, directly followed by a special marker sequence and + // a call to an ObjC library function. Use CALL_RVMARKER to do that. if (CLI.CB && objcarc::hasAttachedCallOpBundle(CLI.CB)) { assert(!IsTailCall && "tail calls cannot be marked with clang.arc.attachedcall"); CallOpc = AArch64ISD::CALL_RVMARKER; + + // Add a target global address for the retainRV/claimRV runtime function + // just before the call target. + Function *ARCFn = *objcarc::getAttachedARCFunction(CLI.CB); + auto GA = DAG.getTargetGlobalAddress(ARCFn, DL, PtrVT); + Ops.insert(Ops.begin() + 1, GA); } // Returns a chain and a flag for retval copy to use. @@ -9985,8 +10040,9 @@ SDValue AArch64TargetLowering::LowerSPLAT_VECTOR(SDValue Op, // The only legal i1 vectors are SVE vectors, so we can use SVE-specific // lowering code. if (auto *ConstVal = dyn_cast<ConstantSDNode>(SplatVal)) { + // We can hande the zero case during isel. if (ConstVal->isZero()) - return DAG.getNode(AArch64ISD::PFALSE, dl, VT); + return Op; if (ConstVal->isOne()) return getPTrue(DAG, dl, VT, AArch64SVEPredPattern::all); } @@ -11869,6 +11925,19 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MONonTemporal; return true; } + case Intrinsic::aarch64_mops_memset_tag: { + Value *Dst = I.getArgOperand(0); + Value *Val = I.getArgOperand(1); + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.memVT = MVT::getVT(Val->getType()); + Info.ptrVal = Dst; + Info.offset = 0; + Info.align = I.getParamAlign(0).valueOrOne(); + Info.flags = MachineMemOperand::MOStore; + // The size of the memory being operated on is unknown at this point + Info.size = MemoryLocation::UnknownSize; + return true; + } default: break; } @@ -15092,7 +15161,7 @@ static bool isAllInactivePredicate(SDValue N) { while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST) N = N.getOperand(0); - return N.getOpcode() == AArch64ISD::PFALSE; + return ISD::isConstantSplatVectorAllZeros(N.getNode()); } static bool isAllActivePredicate(SelectionDAG &DAG, SDValue N) { @@ -15393,6 +15462,52 @@ static SDValue performIntrinsicCombine(SDNode *N, return SDValue(); } +static bool isCheapToExtend(const SDValue &N) { + unsigned OC = N->getOpcode(); + return OC == ISD::LOAD || OC == ISD::MLOAD || + ISD::isConstantSplatVectorAllZeros(N.getNode()); +} + +static SDValue +performSignExtendSetCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, + SelectionDAG &DAG) { + // If we have (sext (setcc A B)) and A and B are cheap to extend, + // we can move the sext into the arguments and have the same result. 
For + // example, if A and B are both loads, we can make those extending loads and + // avoid an extra instruction. This pattern appears often in VLS code + // generation where the inputs to the setcc have a different size to the + // instruction that wants to use the result of the setcc. + assert(N->getOpcode() == ISD::SIGN_EXTEND && + N->getOperand(0)->getOpcode() == ISD::SETCC); + const SDValue SetCC = N->getOperand(0); + + const SDValue CCOp0 = SetCC.getOperand(0); + const SDValue CCOp1 = SetCC.getOperand(1); + if (!CCOp0->getValueType(0).isInteger() || + !CCOp1->getValueType(0).isInteger()) + return SDValue(); + + ISD::CondCode Code = + cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get(); + + ISD::NodeType ExtType = + isSignedIntSetCC(Code) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; + + if (isCheapToExtend(SetCC.getOperand(0)) && + isCheapToExtend(SetCC.getOperand(1))) { + const SDValue Ext1 = + DAG.getNode(ExtType, SDLoc(N), N->getValueType(0), CCOp0); + const SDValue Ext2 = + DAG.getNode(ExtType, SDLoc(N), N->getValueType(0), CCOp1); + + return DAG.getSetCC( + SDLoc(SetCC), N->getValueType(0), Ext1, Ext2, + cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get()); + } + + return SDValue(); +} + static SDValue performExtendCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { @@ -15411,6 +15526,12 @@ static SDValue performExtendCombine(SDNode *N, return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), NewABD); } + + if (N->getValueType(0).isFixedLengthVector() && + N->getOpcode() == ISD::SIGN_EXTEND && + N->getOperand(0)->getOpcode() == ISD::SETCC) + return performSignExtendSetCCCombine(N, DCI, DAG); + return SDValue(); } diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index ca6c70297c0b..2138c0ffe70a 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -323,7 +323,6 @@ enum NodeType : unsigned { INSR, PTEST, PTRUE, - PFALSE, BITREVERSE_MERGE_PASSTHRU, BSWAP_MERGE_PASSTHRU, @@ -453,6 +452,12 @@ enum NodeType : unsigned { LDP, STP, STNP, + + // Memory Operations + MOPS_MEMSET, + MOPS_MEMSET_TAGGING, + MOPS_MEMCOPY, + MOPS_MEMMOVE, }; } // end namespace AArch64ISD @@ -890,6 +895,7 @@ private: SDValue LowerMLOAD(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; bool isEligibleForTailCallOptimization( diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index 93c17133c845..a9191924129c 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -93,9 +93,18 @@ unsigned AArch64InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { // before the assembly printer. unsigned NumBytes = 0; const MCInstrDesc &Desc = MI.getDesc(); + + // Size should be preferably set in + // llvm/lib/Target/AArch64/AArch64InstrInfo.td (default case). + // Specific cases handle instructions of variable sizes switch (Desc.getOpcode()) { default: - // Anything not explicitly designated otherwise is a normal 4-byte insn. + if (Desc.getSize()) + return Desc.getSize(); + + // Anything not explicitly designated otherwise (i.e. pseudo-instructions + // with fixed constant size but not specified in .td file) is a normal + // 4-byte insn. 
NumBytes = 4; break; case TargetOpcode::STACKMAP: @@ -115,29 +124,9 @@ unsigned AArch64InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { if (NumBytes == 0) NumBytes = 4; break; - case AArch64::TLSDESC_CALLSEQ: - // This gets lowered to an instruction sequence which takes 16 bytes - NumBytes = 16; - break; - case AArch64::SpeculationBarrierISBDSBEndBB: - // This gets lowered to 2 4-byte instructions. - NumBytes = 8; - break; - case AArch64::SpeculationBarrierSBEndBB: - // This gets lowered to 1 4-byte instructions. - NumBytes = 4; - break; - case AArch64::JumpTableDest32: - case AArch64::JumpTableDest16: - case AArch64::JumpTableDest8: - NumBytes = 12; - break; case AArch64::SPACE: NumBytes = MI.getOperand(1).getImm(); break; - case AArch64::StoreSwiftAsyncContext: - NumBytes = 20; - break; case TargetOpcode::BUNDLE: NumBytes = getInstBundleLength(MI); break; diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index c8a697c8b82f..83bf89ff97c5 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -780,6 +780,7 @@ def : Pat<(AArch64LOADgot texternalsym:$addr), def : Pat<(AArch64LOADgot tconstpool:$addr), (LOADgot tconstpool:$addr)>; +// In general these get lowered into a sequence of three 4-byte instructions. // 32-bit jump table destination is actually only 2 instructions since we can // use the table itself as a PC-relative base. But optimization occurs after // branch relaxation so be pessimistic. @@ -815,8 +816,12 @@ let hasSideEffects = 1, isCodeGenOnly = 1 in { // SpeculationBarrierEndBB must only be used after an unconditional control // flow, i.e. after a terminator for which isBarrier is True. let hasSideEffects = 1, isCodeGenOnly = 1, isTerminator = 1, isBarrier = 1 in { + // This gets lowered to a pair of 4-byte instructions. + let Size = 8 in def SpeculationBarrierISBDSBEndBB : Pseudo<(outs), (ins), []>, Sched<[]>; + // This gets lowered to a 4-byte instruction. + let Size = 4 in def SpeculationBarrierSBEndBB : Pseudo<(outs), (ins), []>, Sched<[]>; } @@ -2324,8 +2329,8 @@ def : Pat<(AArch64call GPR64noip:$Rn), (BLRNoIP GPR64noip:$Rn)>, Requires<[SLSBLRMitigation]>; -def : Pat<(AArch64call_rvmarker GPR64:$Rn), - (BLR_RVMARKER GPR64:$Rn)>, +def : Pat<(AArch64call_rvmarker (i64 tglobaladdr:$rvfunc), GPR64:$Rn), + (BLR_RVMARKER tglobaladdr:$rvfunc, GPR64:$Rn)>, Requires<[NoSLSBLRMitigation]>; let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in { @@ -2356,7 +2361,8 @@ def EMITBKEY : Pseudo<(outs), (ins), []>, Sched<[]> {} // FIXME: maybe the scratch register used shouldn't be fixed to X1? // FIXME: can "hasSideEffects be dropped? 
-let isCall = 1, Defs = [LR, X0, X1], hasSideEffects = 1, +// This gets lowered to an instruction sequence which takes 16 bytes +let isCall = 1, Defs = [LR, X0, X1], hasSideEffects = 1, Size = 16, isCodeGenOnly = 1 in def TLSDESC_CALLSEQ : Pseudo<(outs), (ins i64imm:$sym), @@ -7546,6 +7552,9 @@ def : Pat<(i64 (bitconvert (f64 FPR64:$Xn))), def : Pat<(i64 (bitconvert (v1f64 V64:$Vn))), (COPY_TO_REGCLASS V64:$Vn, GPR64)>; +def : Pat<(f16 (bitconvert (bf16 FPR16:$src))), (f16 FPR16:$src)>; +def : Pat<(bf16 (bitconvert (f16 FPR16:$src))), (bf16 FPR16:$src)>; + let Predicates = [IsLE] in { def : Pat<(v1i64 (bitconvert (v2i32 FPR64:$src))), (v1i64 FPR64:$src)>; def : Pat<(v1i64 (bitconvert (v4i16 FPR64:$src))), (v1i64 FPR64:$src)>; @@ -8330,26 +8339,67 @@ let Predicates = [HasLS64] in { } let Predicates = [HasMOPS] in { - defm CPYFP : MOPSMemoryCopyInsns<0b00, "cpyfp">; - defm CPYFM : MOPSMemoryCopyInsns<0b01, "cpyfm">; - defm CPYFE : MOPSMemoryCopyInsns<0b10, "cpyfe">; + let Defs = [NZCV] in { + defm CPYFP : MOPSMemoryCopyInsns<0b00, "cpyfp">; + + defm CPYP : MOPSMemoryMoveInsns<0b00, "cpyp">; + + defm SETP : MOPSMemorySetInsns<0b00, "setp">; + } + let Uses = [NZCV] in { + defm CPYFM : MOPSMemoryCopyInsns<0b01, "cpyfm">; + defm CPYFE : MOPSMemoryCopyInsns<0b10, "cpyfe">; - defm CPYP : MOPSMemoryMoveInsns<0b00, "cpyp">; - defm CPYM : MOPSMemoryMoveInsns<0b01, "cpym">; - defm CPYE : MOPSMemoryMoveInsns<0b10, "cpye">; + defm CPYM : MOPSMemoryMoveInsns<0b01, "cpym">; + defm CPYE : MOPSMemoryMoveInsns<0b10, "cpye">; - defm SETP : MOPSMemorySetInsns<0b00, "setp">; - defm SETM : MOPSMemorySetInsns<0b01, "setm">; - defm SETE : MOPSMemorySetInsns<0b10, "sete">; + defm SETM : MOPSMemorySetInsns<0b01, "setm">; + defm SETE : MOPSMemorySetInsns<0b10, "sete">; + } } let Predicates = [HasMOPS, HasMTE] in { - defm SETGP : MOPSMemorySetTaggingInsns<0b00, "setgp">; - defm SETGM : MOPSMemorySetTaggingInsns<0b01, "setgm">; - // Can't use SETGE because it's a reserved name in TargetSelectionDAG.td - defm MOPSSETGE : MOPSMemorySetTaggingInsns<0b10, "setge">; + let Defs = [NZCV] in { + defm SETGP : MOPSMemorySetTaggingInsns<0b00, "setgp">; + } + let Uses = [NZCV] in { + defm SETGM : MOPSMemorySetTaggingInsns<0b01, "setgm">; + // Can't use SETGE because it's a reserved name in TargetSelectionDAG.td + defm MOPSSETGE : MOPSMemorySetTaggingInsns<0b10, "setge">; + } +} + +// MOPS Node operands: 0: Dst, 1: Src or Value, 2: Size, 3: Chain +// MOPS Node results: 0: Dst writeback, 1: Size writeback, 2: Chain +def SDT_AArch64mops : SDTypeProfile<2, 3, [ SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2> ]>; +def AArch64mops_memset : SDNode<"AArch64ISD::MOPS_MEMSET", SDT_AArch64mops>; +def AArch64mops_memset_tagging : SDNode<"AArch64ISD::MOPS_MEMSET_TAGGING", SDT_AArch64mops>; +def AArch64mops_memcopy : SDNode<"AArch64ISD::MOPS_MEMCOPY", SDT_AArch64mops>; +def AArch64mops_memmove : SDNode<"AArch64ISD::MOPS_MEMMOVE", SDT_AArch64mops>; + +// MOPS operations always contain three 4-byte instructions +let Predicates = [HasMOPS], Defs = [NZCV], Size = 12, mayStore = 1 in { + let mayLoad = 1 in { + def MOPSMemoryCopyPseudo : Pseudo<(outs GPR64common:$Rd_wb, GPR64common:$Rs_wb, GPR64:$Rn_wb), + (ins GPR64common:$Rd, GPR64common:$Rs, GPR64:$Rn), + [], "$Rd = $Rd_wb,$Rs = $Rs_wb,$Rn = $Rn_wb">, Sched<[]>; + def MOPSMemoryMovePseudo : Pseudo<(outs GPR64common:$Rd_wb, GPR64common:$Rs_wb, GPR64:$Rn_wb), + (ins GPR64common:$Rd, GPR64common:$Rs, GPR64:$Rn), + [], "$Rd = $Rd_wb,$Rs = $Rs_wb,$Rn = $Rn_wb">, Sched<[]>; + } + let mayLoad = 0 in { + def 
MOPSMemorySetPseudo : Pseudo<(outs GPR64common:$Rd_wb, GPR64:$Rn_wb), + (ins GPR64common:$Rd, GPR64:$Rn, GPR64:$Rm), + [], "$Rd = $Rd_wb,$Rn = $Rn_wb">, Sched<[]>; + } +} +let Predicates = [HasMOPS, HasMTE], Defs = [NZCV], Size = 12, mayLoad = 0, mayStore = 1 in { + def MOPSMemorySetTaggingPseudo : Pseudo<(outs GPR64common:$Rd_wb, GPR64:$Rn_wb), + (ins GPR64common:$Rd, GPR64:$Rn, GPR64:$Rm), + [], "$Rd = $Rd_wb,$Rn = $Rn_wb">, Sched<[]>; } -let Defs = [X16, X17], mayStore = 1, isCodeGenOnly = 1 in +// This gets lowered into an instruction sequence of 20 bytes +let Defs = [X16, X17], mayStore = 1, isCodeGenOnly = 1, Size = 20 in def StoreSwiftAsyncContext : Pseudo<(outs), (ins GPR64:$ctx, GPR64sp:$base, simm9:$offset), []>, Sched<[]>; diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index 73a680465f6f..1d162610de9c 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -292,7 +292,13 @@ def SDT_AArch64Arith_Unpred : SDTypeProfile<1, 2, [ SDTCisSameAs<0,1>, SDTCisSameAs<1,2> ]>; -def AArch64bic : SDNode<"AArch64ISD::BIC", SDT_AArch64Arith_Unpred>; +def AArch64bic_node : SDNode<"AArch64ISD::BIC", SDT_AArch64Arith_Unpred>; + +def AArch64bic : PatFrags<(ops node:$op1, node:$op2), + [(and node:$op1, (xor node:$op2, (AArch64dup (i32 -1)))), + (and node:$op1, (xor node:$op2, (AArch64dup (i64 -1)))), + (and node:$op1, (xor node:$op2, (SVEAllActive))), + (AArch64bic_node node:$op1, node:$op2)]>; let Predicates = [HasSVE] in { defm RDFFR_PPz : sve_int_rdffr_pred<0b0, "rdffr", int_aarch64_sve_rdffr_z>; @@ -734,14 +740,14 @@ let Predicates = [HasSVEorStreamingSVE] in { defm PFIRST : sve_int_pfirst<0b00000, "pfirst", int_aarch64_sve_pfirst>; defm PNEXT : sve_int_pnext<0b00110, "pnext", int_aarch64_sve_pnext>; - defm AND_PPzPP : sve_int_pred_log_and<0b0000, "and", int_aarch64_sve_and_z>; - defm BIC_PPzPP : sve_int_pred_log<0b0001, "bic", int_aarch64_sve_bic_z>; + defm AND_PPzPP : sve_int_pred_log_v2<0b0000, "and", int_aarch64_sve_and_z, and>; + defm BIC_PPzPP : sve_int_pred_log_v2<0b0001, "bic", int_aarch64_sve_bic_z, AArch64bic>; defm EOR_PPzPP : sve_int_pred_log<0b0010, "eor", int_aarch64_sve_eor_z, xor>; - defm SEL_PPPP : sve_int_pred_log<0b0011, "sel", vselect>; + defm SEL_PPPP : sve_int_pred_log_v2<0b0011, "sel", vselect, or>; defm ANDS_PPzPP : sve_int_pred_log<0b0100, "ands", null_frag>; defm BICS_PPzPP : sve_int_pred_log<0b0101, "bics", null_frag>; defm EORS_PPzPP : sve_int_pred_log<0b0110, "eors", null_frag>; - defm ORR_PPzPP : sve_int_pred_log<0b1000, "orr", int_aarch64_sve_orr_z, or>; + defm ORR_PPzPP : sve_int_pred_log<0b1000, "orr", int_aarch64_sve_orr_z>; defm ORN_PPzPP : sve_int_pred_log<0b1001, "orn", int_aarch64_sve_orn_z>; defm NOR_PPzPP : sve_int_pred_log<0b1010, "nor", int_aarch64_sve_nor_z>; defm NAND_PPzPP : sve_int_pred_log<0b1011, "nand", int_aarch64_sve_nand_z>; diff --git a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp index d2d84b2a3f6d..893269c1a7ef 100644 --- a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp @@ -15,15 +15,95 @@ using namespace llvm; #define DEBUG_TYPE "aarch64-selectiondag-info" +SDValue AArch64SelectionDAGInfo::EmitMOPS(AArch64ISD::NodeType SDOpcode, + SelectionDAG &DAG, const SDLoc &DL, + SDValue Chain, SDValue Dst, + SDValue SrcOrValue, SDValue Size, + Align Alignment, bool isVolatile, + MachinePointerInfo DstPtrInfo, 
+ MachinePointerInfo SrcPtrInfo) const { + + // Get the constant size of the copy/set. + uint64_t ConstSize = 0; + if (auto *C = dyn_cast<ConstantSDNode>(Size)) + ConstSize = C->getZExtValue(); + + const bool IsSet = SDOpcode == AArch64ISD::MOPS_MEMSET || + SDOpcode == AArch64ISD::MOPS_MEMSET_TAGGING; + + const auto MachineOpcode = [&]() { + switch (SDOpcode) { + case AArch64ISD::MOPS_MEMSET: + return AArch64::MOPSMemorySetPseudo; + case AArch64ISD::MOPS_MEMSET_TAGGING: + return AArch64::MOPSMemorySetTaggingPseudo; + case AArch64ISD::MOPS_MEMCOPY: + return AArch64::MOPSMemoryCopyPseudo; + case AArch64ISD::MOPS_MEMMOVE: + return AArch64::MOPSMemoryMovePseudo; + default: + llvm_unreachable("Unhandled MOPS ISD Opcode"); + } + }(); + + MachineMemOperand::Flags Flags = MachineMemOperand::MOStore; + if (isVolatile) + Flags |= MachineMemOperand::MOVolatile; + if (!IsSet) + Flags |= MachineMemOperand::MOLoad; + + MachineFunction &MF = DAG.getMachineFunction(); + + auto *DstOp = + MF.getMachineMemOperand(DstPtrInfo, Flags, ConstSize, Alignment); + auto *SrcOp = + MF.getMachineMemOperand(SrcPtrInfo, Flags, ConstSize, Alignment); + + if (IsSet) { + // Extend value to i64 if required + if (SrcOrValue.getValueType() != MVT::i64) + SrcOrValue = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, SrcOrValue); + SDValue Ops[] = {Dst, Size, SrcOrValue, Chain}; + const EVT ResultTys[] = {MVT::i64, MVT::i64, MVT::Other}; + MachineSDNode *Node = DAG.getMachineNode(MachineOpcode, DL, ResultTys, Ops); + DAG.setNodeMemRefs(Node, {DstOp}); + return SDValue(Node, 2); + } else { + SDValue Ops[] = {Dst, SrcOrValue, Size, Chain}; + const EVT ResultTys[] = {MVT::i64, MVT::i64, MVT::i64, MVT::Other}; + MachineSDNode *Node = DAG.getMachineNode(MachineOpcode, DL, ResultTys, Ops); + DAG.setNodeMemRefs(Node, {DstOp, SrcOp}); + return SDValue(Node, 3); + } +} + +SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemcpy( + SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, SDValue Dst, SDValue Src, + SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline, + MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const { + const AArch64Subtarget &STI = + DAG.getMachineFunction().getSubtarget<AArch64Subtarget>(); + if (STI.hasMOPS()) + return EmitMOPS(AArch64ISD::MOPS_MEMCOPY, DAG, DL, Chain, Dst, Src, Size, + Alignment, isVolatile, DstPtrInfo, SrcPtrInfo); + return SDValue(); +} + SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemset( SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVolatile, MachinePointerInfo DstPtrInfo) const { + const AArch64Subtarget &STI = + DAG.getMachineFunction().getSubtarget<AArch64Subtarget>(); + + if (STI.hasMOPS()) { + return EmitMOPS(AArch64ISD::MOPS_MEMSET, DAG, dl, Chain, Dst, Src, Size, + Alignment, isVolatile, DstPtrInfo, MachinePointerInfo{}); + } + // Check to see if there is a specialized entry-point for memory zeroing. ConstantSDNode *V = dyn_cast<ConstantSDNode>(Src); ConstantSDNode *SizeValue = dyn_cast<ConstantSDNode>(Size); - const AArch64Subtarget &STI = - DAG.getMachineFunction().getSubtarget<AArch64Subtarget>(); const char *bzeroName = (V && V->isZero()) ? 
DAG.getTargetLoweringInfo().getLibcallName(RTLIB::BZERO) @@ -55,6 +135,19 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemset( return SDValue(); } +SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemmove( + SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src, + SDValue Size, Align Alignment, bool isVolatile, + MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const { + const AArch64Subtarget &STI = + DAG.getMachineFunction().getSubtarget<AArch64Subtarget>(); + if (STI.hasMOPS()) { + return EmitMOPS(AArch64ISD::MOPS_MEMMOVE, DAG, dl, Chain, Dst, Src, Size, + Alignment, isVolatile, DstPtrInfo, SrcPtrInfo); + } + return SDValue(); +} + static const int kSetTagLoopThreshold = 176; static SDValue EmitUnrolledSetTag(SelectionDAG &DAG, const SDLoc &dl, diff --git a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h index 7d53bd456975..47fe3bf7dcf5 100644 --- a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h +++ b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h @@ -19,11 +19,30 @@ namespace llvm { class AArch64SelectionDAGInfo : public SelectionDAGTargetInfo { public: + SDValue EmitMOPS(AArch64ISD::NodeType SDOpcode, SelectionDAG &DAG, + const SDLoc &DL, SDValue Chain, SDValue Dst, + SDValue SrcOrValue, SDValue Size, Align Alignment, + bool isVolatile, MachinePointerInfo DstPtrInfo, + MachinePointerInfo SrcPtrInfo) const; + + SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, const SDLoc &dl, + SDValue Chain, SDValue Dst, SDValue Src, + SDValue Size, Align Alignment, + bool isVolatile, bool AlwaysInline, + MachinePointerInfo DstPtrInfo, + MachinePointerInfo SrcPtrInfo) const override; SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVolatile, MachinePointerInfo DstPtrInfo) const override; + SDValue + EmitTargetCodeForMemmove(SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, + SDValue Dst, SDValue Src, SDValue Size, + Align Alignment, bool isVolatile, + MachinePointerInfo DstPtrInfo, + MachinePointerInfo SrcPtrInfo) const override; + SDValue EmitTargetCodeForSetTag(SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Op1, SDValue Op2, MachinePointerInfo DstPtrInfo, diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp index a4f4b8582182..8a7e20237271 100644 --- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp +++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp @@ -99,6 +99,7 @@ void AArch64Subtarget::initializeProperties() { case CortexA78C: case CortexR82: case CortexX1: + case CortexX1C: PrefFunctionLogAlignment = 4; break; case CortexA510: diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h index 3e3c0f6aba15..7b2bbad30f85 100644 --- a/llvm/lib/Target/AArch64/AArch64Subtarget.h +++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h @@ -63,6 +63,7 @@ public: CortexA710, CortexR82, CortexX1, + CortexX1C, CortexX2, ExynosM3, Falkor, @@ -217,7 +218,6 @@ protected: bool HasETE = false; bool HasTRBE = false; bool HasBRBE = false; - bool HasPAUTH = false; bool HasSPE_EEF = false; // HasZeroCycleRegMove - Has zero-cycle register mov instructions. 
@@ -510,7 +510,6 @@ public: bool hasRandGen() const { return HasRandGen; } bool hasMTE() const { return HasMTE; } bool hasTME() const { return HasTME; } - bool hasPAUTH() const { return HasPAUTH; } // Arm SVE2 extensions bool hasSVE2AES() const { return HasSVE2AES; } bool hasSVE2SM4() const { return HasSVE2SM4; } diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index a4d666a0a3c2..b2ffdf949d8b 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -1886,14 +1886,21 @@ InstructionCost AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, m_Value()))) VecPred = CurrentPred; } - // Check if we have a compare/select chain that can be lowered using CMxx & - // BFI pair. - if (CmpInst::isIntPredicate(VecPred)) { - static const auto ValidMinMaxTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16, - MVT::v8i16, MVT::v2i32, MVT::v4i32, - MVT::v2i64}; + // Check if we have a compare/select chain that can be lowered using + // a (F)CMxx & BFI pair. + if (CmpInst::isIntPredicate(VecPred) || VecPred == CmpInst::FCMP_OLE || + VecPred == CmpInst::FCMP_OLT || VecPred == CmpInst::FCMP_OGT || + VecPred == CmpInst::FCMP_OGE || VecPred == CmpInst::FCMP_OEQ || + VecPred == CmpInst::FCMP_UNE) { + static const auto ValidMinMaxTys = { + MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32, + MVT::v4i32, MVT::v2i64, MVT::v2f32, MVT::v4f32, MVT::v2f64}; + static const auto ValidFP16MinMaxTys = {MVT::v4f16, MVT::v8f16}; + auto LT = TLI->getTypeLegalizationCost(DL, ValTy); - if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; })) + if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }) || + (ST->hasFullFP16() && + any_of(ValidFP16MinMaxTys, [&LT](MVT M) { return M == LT.second; }))) return LT.first; } diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp index 1f546ad50d57..703e356f016d 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp @@ -192,6 +192,7 @@ private: bool selectBrJT(MachineInstr &I, MachineRegisterInfo &MRI); bool selectTLSGlobalValue(MachineInstr &I, MachineRegisterInfo &MRI); bool selectReduction(MachineInstr &I, MachineRegisterInfo &MRI); + bool selectMOPS(MachineInstr &I, MachineRegisterInfo &MRI); bool selectUSMovFromExtend(MachineInstr &I, MachineRegisterInfo &MRI); unsigned emitConstantPoolEntry(const Constant *CPVal, @@ -3424,6 +3425,12 @@ bool AArch64InstructionSelector::select(MachineInstr &I) { case TargetOpcode::G_VECREDUCE_FADD: case TargetOpcode::G_VECREDUCE_ADD: return selectReduction(I, MRI); + case TargetOpcode::G_MEMCPY: + case TargetOpcode::G_MEMCPY_INLINE: + case TargetOpcode::G_MEMMOVE: + case TargetOpcode::G_MEMSET: + assert(STI.hasMOPS() && "Shouldn't get here without +mops feature"); + return selectMOPS(I, MRI); } return false; @@ -3481,6 +3488,64 @@ bool AArch64InstructionSelector::selectReduction(MachineInstr &I, return false; } +bool AArch64InstructionSelector::selectMOPS(MachineInstr &GI, + MachineRegisterInfo &MRI) { + unsigned Mopcode; + switch (GI.getOpcode()) { + case TargetOpcode::G_MEMCPY: + case TargetOpcode::G_MEMCPY_INLINE: + Mopcode = AArch64::MOPSMemoryCopyPseudo; + break; + case TargetOpcode::G_MEMMOVE: + Mopcode = AArch64::MOPSMemoryMovePseudo; + break; + case TargetOpcode::G_MEMSET: + // For tagged memset see
llvm.aarch64.mops.memset.tag + Mopcode = AArch64::MOPSMemorySetPseudo; + break; + } + + auto &DstPtr = GI.getOperand(0); + auto &SrcOrVal = GI.getOperand(1); + auto &Size = GI.getOperand(2); + + // Create copies of the registers that can be clobbered. + const Register DstPtrCopy = MRI.cloneVirtualRegister(DstPtr.getReg()); + const Register SrcValCopy = MRI.cloneVirtualRegister(SrcOrVal.getReg()); + const Register SizeCopy = MRI.cloneVirtualRegister(Size.getReg()); + + const bool IsSet = Mopcode == AArch64::MOPSMemorySetPseudo; + const auto &SrcValRegClass = + IsSet ? AArch64::GPR64RegClass : AArch64::GPR64commonRegClass; + + // Constrain to specific registers + RBI.constrainGenericRegister(DstPtrCopy, AArch64::GPR64commonRegClass, MRI); + RBI.constrainGenericRegister(SrcValCopy, SrcValRegClass, MRI); + RBI.constrainGenericRegister(SizeCopy, AArch64::GPR64RegClass, MRI); + + MIB.buildCopy(DstPtrCopy, DstPtr); + MIB.buildCopy(SrcValCopy, SrcOrVal); + MIB.buildCopy(SizeCopy, Size); + + // New instruction uses the copied registers because it must update them. + // The defs are not used since they don't exist in G_MEM*. They are still + // tied. + // Note: order of operands is different from G_MEMSET, G_MEMCPY, G_MEMMOVE + Register DefDstPtr = MRI.createVirtualRegister(&AArch64::GPR64commonRegClass); + Register DefSize = MRI.createVirtualRegister(&AArch64::GPR64RegClass); + if (IsSet) { + MIB.buildInstr(Mopcode, {DefDstPtr, DefSize}, + {DstPtrCopy, SizeCopy, SrcValCopy}); + } else { + Register DefSrcPtr = MRI.createVirtualRegister(&SrcValRegClass); + MIB.buildInstr(Mopcode, {DefDstPtr, DefSrcPtr, DefSize}, + {DstPtrCopy, SrcValCopy, SizeCopy}); + } + + GI.eraseFromParent(); + return true; +} + bool AArch64InstructionSelector::selectBrJT(MachineInstr &I, MachineRegisterInfo &MRI) { assert(I.getOpcode() == TargetOpcode::G_BRJT && "Expected G_BRJT"); @@ -5375,6 +5440,36 @@ bool AArch64InstructionSelector::selectIntrinsicWithSideEffects( constrainSelectedInstRegOperands(*Store, TII, TRI, RBI); break; } + case Intrinsic::aarch64_mops_memset_tag: { + // Transform + // %dst:gpr(p0) = \ + // G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aarch64.mops.memset.tag), + // \ %dst:gpr(p0), %val:gpr(s64), %n:gpr(s64) + // where %dst is updated, into + // %Rd:GPR64common, %Rn:GPR64) = \ + // MOPSMemorySetTaggingPseudo \ + // %Rd:GPR64common, %Rn:GPR64, %Rm:GPR64 + // where Rd and Rn are tied. + // It is expected that %val has been extended to s64 in legalization. + // Note that the order of the size/value operands are swapped. + + Register DstDef = I.getOperand(0).getReg(); + // I.getOperand(1) is the intrinsic function + Register DstUse = I.getOperand(2).getReg(); + Register ValUse = I.getOperand(3).getReg(); + Register SizeUse = I.getOperand(4).getReg(); + + // MOPSMemorySetTaggingPseudo has two defs; the intrinsic call has only one. + // Therefore an additional virtual register is requried for the updated size + // operand. This value is not accessible via the semantics of the intrinsic. 
+ Register SizeDef = MRI.createGenericVirtualRegister(LLT::scalar(64)); + + auto Memset = MIB.buildInstr(AArch64::MOPSMemorySetTaggingPseudo, + {DstDef, SizeDef}, {DstUse, SizeUse, ValUse}); + Memset.cloneMemRefs(I); + constrainSelectedInstRegOperands(*Memset, TII, TRI, RBI); + break; + } } I.eraseFromParent(); diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index e8894e7933d6..e9df7e001d38 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -699,8 +699,28 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) getActionDefinitionsBuilder(G_DYN_STACKALLOC).lower(); - getActionDefinitionsBuilder({G_BZERO, G_MEMCPY, G_MEMMOVE, G_MEMSET}) - .libcall(); + if (ST.hasMOPS()) { + // G_BZERO is not supported. Currently it is only emitted by + // PreLegalizerCombiner for G_MEMSET with zero constant. + getActionDefinitionsBuilder(G_BZERO).unsupported(); + + getActionDefinitionsBuilder(G_MEMSET) + .legalForCartesianProduct({p0}, {s64}, {s64}) + .customForCartesianProduct({p0}, {s8}, {s64}) + .immIdx(0); // Inform verifier imm idx 0 is handled. + + getActionDefinitionsBuilder({G_MEMCPY, G_MEMMOVE}) + .legalForCartesianProduct({p0}, {p0}, {s64}) + .immIdx(0); // Inform verifier imm idx 0 is handled. + + // G_MEMCPY_INLINE does not have a tailcall immediate + getActionDefinitionsBuilder(G_MEMCPY_INLINE) + .legalForCartesianProduct({p0}, {p0}, {s64}); + + } else { + getActionDefinitionsBuilder({G_BZERO, G_MEMCPY, G_MEMMOVE, G_MEMSET}) + .libcall(); + } // FIXME: Legal types are only legal with NEON. getActionDefinitionsBuilder(G_ABS) @@ -832,6 +852,11 @@ bool AArch64LegalizerInfo::legalizeCustom(LegalizerHelper &Helper, return legalizeAtomicCmpxchg128(MI, MRI, Helper); case TargetOpcode::G_CTTZ: return legalizeCTTZ(MI, Helper); + case TargetOpcode::G_BZERO: + case TargetOpcode::G_MEMCPY: + case TargetOpcode::G_MEMMOVE: + case TargetOpcode::G_MEMSET: + return legalizeMemOps(MI, Helper); } llvm_unreachable("expected switch to return"); @@ -989,6 +1014,15 @@ bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, MI.eraseFromParent(); return true; } + case Intrinsic::aarch64_mops_memset_tag: { + assert(MI.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS); + // Zext the value to 64 bit + MachineIRBuilder MIB(MI); + auto &Value = MI.getOperand(3); + Register ZExtValueReg = MIB.buildAnyExt(LLT::scalar(64), Value).getReg(0); + Value.setReg(ZExtValueReg); + return true; + } } return true; @@ -1359,3 +1393,20 @@ bool AArch64LegalizerInfo::legalizeCTTZ(MachineInstr &MI, MI.eraseFromParent(); return true; } + +bool AArch64LegalizerInfo::legalizeMemOps(MachineInstr &MI, + LegalizerHelper &Helper) const { + MachineIRBuilder &MIRBuilder = Helper.MIRBuilder; + + // Tagged version MOPSMemorySetTagged is legalised in legalizeIntrinsic + if (MI.getOpcode() == TargetOpcode::G_MEMSET) { + // Zext the value operand to 64 bit + auto &Value = MI.getOperand(1); + Register ZExtValueReg = + MIRBuilder.buildAnyExt(LLT::scalar(64), Value).getReg(0); + Value.setReg(ZExtValueReg); + return true; + } + + return false; +} diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h index e2c46f4b4c1f..973f96ff4775 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h @@ -56,6 +56,7 @@ private: bool 
legalizeAtomicCmpxchg128(MachineInstr &MI, MachineRegisterInfo &MRI, LegalizerHelper &Helper) const; bool legalizeCTTZ(MachineInstr &MI, LegalizerHelper &Helper) const; + bool legalizeMemOps(MachineInstr &MI, LegalizerHelper &Helper) const; const AArch64Subtarget *ST; }; } // End llvm namespace. diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td index 574b22124957..9d4bdbe5d053 100644 --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -334,8 +334,6 @@ multiclass sve_int_ptrue<bits<3> opc, string asm, SDPatternOperator op> { def SDT_AArch64PTrue : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVT<1, i32>]>; def AArch64ptrue : SDNode<"AArch64ISD::PTRUE", SDT_AArch64PTrue>; -def SDT_AArch64PFalse : SDTypeProfile<1, 0, [SDTCisVec<0>, SDTCVecEltisVT<0, i1>]>; -def AArch64pfalse : SDNode<"AArch64ISD::PFALSE", SDT_AArch64PFalse>; let Predicates = [HasSVEorStreamingSVE] in { defm PTRUE : sve_int_ptrue<0b000, "ptrue", AArch64ptrue>; @@ -614,10 +612,10 @@ class sve_int_pfalse<bits<6> opc, string asm> multiclass sve_int_pfalse<bits<6> opc, string asm> { def NAME : sve_int_pfalse<opc, asm>; - def : Pat<(nxv16i1 (AArch64pfalse)), (!cast<Instruction>(NAME))>; - def : Pat<(nxv8i1 (AArch64pfalse)), (!cast<Instruction>(NAME))>; - def : Pat<(nxv4i1 (AArch64pfalse)), (!cast<Instruction>(NAME))>; - def : Pat<(nxv2i1 (AArch64pfalse)), (!cast<Instruction>(NAME))>; + def : Pat<(nxv16i1 (splat_vector (i32 0))), (!cast<Instruction>(NAME))>; + def : Pat<(nxv8i1 (splat_vector (i32 0))), (!cast<Instruction>(NAME))>; + def : Pat<(nxv4i1 (splat_vector (i32 0))), (!cast<Instruction>(NAME))>; + def : Pat<(nxv2i1 (splat_vector (i32 0))), (!cast<Instruction>(NAME))>; } class sve_int_ptest<bits<6> opc, string asm> @@ -773,7 +771,7 @@ multiclass sve_int_count_r_x64<bits<5> opc, string asm, def : Pat<(i64 (op GPR64:$Rn, (nxv2i1 PPRAny:$Pg))), (!cast<Instruction>(NAME # _D) PPRAny:$Pg, $Rn)>; - // Combine cntp with combine_op + // combine_op(x, cntp(all_active, p)) ==> inst p, x def : Pat<(i64 (combine_op GPR64:$Rn, (int_aarch64_sve_cntp_oneuse (nxv16i1 (SVEAllActive)), (nxv16i1 PPRAny:$pred)))), (!cast<Instruction>(NAME # _B) PPRAny:$pred, $Rn)>; def : Pat<(i64 (combine_op GPR64:$Rn, (int_aarch64_sve_cntp_oneuse (nxv8i1 (SVEAllActive)), (nxv8i1 PPRAny:$pred)))), @@ -782,6 +780,16 @@ multiclass sve_int_count_r_x64<bits<5> opc, string asm, (!cast<Instruction>(NAME # _S) PPRAny:$pred, $Rn)>; def : Pat<(i64 (combine_op GPR64:$Rn, (int_aarch64_sve_cntp_oneuse (nxv2i1 (SVEAllActive)), (nxv2i1 PPRAny:$pred)))), (!cast<Instruction>(NAME # _D) PPRAny:$pred, $Rn)>; + + // combine_op(x, cntp(p, p)) ==> inst p, x + def : Pat<(i64 (combine_op GPR64:$Rn, (int_aarch64_sve_cntp_oneuse (nxv16i1 PPRAny:$pred), (nxv16i1 PPRAny:$pred)))), + (!cast<Instruction>(NAME # _B) PPRAny:$pred, $Rn)>; + def : Pat<(i64 (combine_op GPR64:$Rn, (int_aarch64_sve_cntp_oneuse (nxv8i1 PPRAny:$pred), (nxv8i1 PPRAny:$pred)))), + (!cast<Instruction>(NAME # _H) PPRAny:$pred, $Rn)>; + def : Pat<(i64 (combine_op GPR64:$Rn, (int_aarch64_sve_cntp_oneuse (nxv4i1 PPRAny:$pred), (nxv4i1 PPRAny:$pred)))), + (!cast<Instruction>(NAME # _S) PPRAny:$pred, $Rn)>; + def : Pat<(i64 (combine_op GPR64:$Rn, (int_aarch64_sve_cntp_oneuse (nxv2i1 PPRAny:$pred), (nxv2i1 PPRAny:$pred)))), + (!cast<Instruction>(NAME # _D) PPRAny:$pred, $Rn)>; } class sve_int_count_v<bits<2> sz8_64, bits<5> opc, string asm, @@ -1633,15 +1641,18 @@ multiclass sve_int_pred_log<bits<4> opc, string asm, 
SDPatternOperator op, !cast<Instruction>(NAME), PTRUE_D>; } -multiclass sve_int_pred_log_and<bits<4> opc, string asm, SDPatternOperator op> : +// An instance of sve_int_pred_log_and but uses op_nopred's first operand as the +// general predicate. +multiclass sve_int_pred_log_v2<bits<4> opc, string asm, SDPatternOperator op, + SDPatternOperator op_nopred> : sve_int_pred_log<opc, asm, op> { - def : Pat<(nxv16i1 (and nxv16i1:$Op1, nxv16i1:$Op2)), + def : Pat<(nxv16i1 (op_nopred nxv16i1:$Op1, nxv16i1:$Op2)), (!cast<Instruction>(NAME) $Op1, $Op1, $Op2)>; - def : Pat<(nxv8i1 (and nxv8i1:$Op1, nxv8i1:$Op2)), + def : Pat<(nxv8i1 (op_nopred nxv8i1:$Op1, nxv8i1:$Op2)), (!cast<Instruction>(NAME) $Op1, $Op1, $Op2)>; - def : Pat<(nxv4i1 (and nxv4i1:$Op1, nxv4i1:$Op2)), + def : Pat<(nxv4i1 (op_nopred nxv4i1:$Op1, nxv4i1:$Op2)), (!cast<Instruction>(NAME) $Op1, $Op1, $Op2)>; - def : Pat<(nxv2i1 (and nxv2i1:$Op1, nxv2i1:$Op2)), + def : Pat<(nxv2i1 (op_nopred nxv2i1:$Op1, nxv2i1:$Op2)), (!cast<Instruction>(NAME) $Op1, $Op1, $Op2)>; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index 958e8c9e5bc5..11cc1a01d248 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -11,6 +11,7 @@ #define LLVM_LIB_TARGET_AMDGPU_AMDGPU_H #include "llvm/IR/PassManager.h" +#include "llvm/Pass.h" #include "llvm/Support/CodeGen.h" namespace llvm { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp index 7d6845b287bc..bebf032b5535 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp @@ -14,9 +14,12 @@ #include "AMDGPU.h" #include "Utils/AMDGPUBaseInfo.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/LegacyDivergenceAnalysis.h" #include "llvm/Analysis/MemorySSA.h" #include "llvm/IR/InstVisitor.h" +#include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/InitializePasses.h" #define DEBUG_TYPE "amdgpu-annotate-uniform" @@ -29,6 +32,7 @@ class AMDGPUAnnotateUniformValues : public FunctionPass, public InstVisitor<AMDGPUAnnotateUniformValues> { LegacyDivergenceAnalysis *DA; MemorySSA *MSSA; + AliasAnalysis *AA; DenseMap<Value*, GetElementPtrInst*> noClobberClones; bool isEntryFunc; @@ -44,6 +48,7 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<LegacyDivergenceAnalysis>(); AU.addRequired<MemorySSAWrapperPass>(); + AU.addRequired<AAResultsWrapperPass>(); AU.setPreservesAll(); } @@ -58,6 +63,7 @@ INITIALIZE_PASS_BEGIN(AMDGPUAnnotateUniformValues, DEBUG_TYPE, "Add AMDGPU uniform metadata", false, false) INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis) INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_END(AMDGPUAnnotateUniformValues, DEBUG_TYPE, "Add AMDGPU uniform metadata", false, false) @@ -70,9 +76,79 @@ static void setNoClobberMetadata(Instruction *I) { I->setMetadata("amdgpu.noclobber", MDNode::get(I->getContext(), {})); } -bool AMDGPUAnnotateUniformValues::isClobberedInFunction(LoadInst * Load) { - const MemoryAccess *MA = MSSA->getWalker()->getClobberingMemoryAccess(Load); - return !MSSA->isLiveOnEntryDef(MA); +bool AMDGPUAnnotateUniformValues::isClobberedInFunction(LoadInst *Load) { + MemorySSAWalker *Walker = MSSA->getWalker(); + SmallVector<MemoryAccess *> WorkList{Walker->getClobberingMemoryAccess(Load)}; + SmallSet<MemoryAccess *, 8> Visited; + MemoryLocation 
Loc(MemoryLocation::get(Load)); + + const auto isReallyAClobber = [this, Load](MemoryDef *Def) -> bool { + Instruction *DefInst = Def->getMemoryInst(); + LLVM_DEBUG(dbgs() << " Def: " << *DefInst << '\n'); + + if (isa<FenceInst>(DefInst)) + return false; + + if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(DefInst)) { + switch (II->getIntrinsicID()) { + case Intrinsic::amdgcn_s_barrier: + case Intrinsic::amdgcn_wave_barrier: + return false; + default: + break; + } + } + + // Ignore atomics not aliasing with the original load, any atomic is a + // universal MemoryDef from MSSA's point of view too, just like a fence. + const auto checkNoAlias = [this, Load](auto I) -> bool { + return I && AA->isNoAlias(I->getPointerOperand(), + Load->getPointerOperand()); + }; + + if (checkNoAlias(dyn_cast<AtomicCmpXchgInst>(DefInst)) || + checkNoAlias(dyn_cast<AtomicRMWInst>(DefInst))) + return false; + + return true; + }; + + LLVM_DEBUG(dbgs() << "Checking clobbering of: " << *Load << '\n'); + + // Start with a nearest dominating clobbering access, it will be either + // live on entry (nothing to do, load is not clobbered), MemoryDef, or + // MemoryPhi if several MemoryDefs can define this memory state. In that + // case add all Defs to WorkList and continue going up and checking all + // the definitions of this memory location until the root. When all the + // defs are exhausted and came to the entry state we have no clobber. + // Along the scan ignore barriers and fences which are considered clobbers + // by the MemorySSA, but not really writing anything into the memory. + while (!WorkList.empty()) { + MemoryAccess *MA = WorkList.pop_back_val(); + if (!Visited.insert(MA).second) + continue; + + if (MSSA->isLiveOnEntryDef(MA)) + continue; + + if (MemoryDef *Def = dyn_cast<MemoryDef>(MA)) { + if (isReallyAClobber(Def)) { + LLVM_DEBUG(dbgs() << " -> load is clobbered\n"); + return true; + } + + WorkList.push_back( + Walker->getClobberingMemoryAccess(Def->getDefiningAccess(), Loc)); + continue; + } + + const MemoryPhi *Phi = cast<MemoryPhi>(MA); + for (auto &Use : Phi->incoming_values()) + WorkList.push_back(cast<MemoryAccess>(&Use)); + } + + LLVM_DEBUG(dbgs() << " -> no clobber\n"); + return false; } void AMDGPUAnnotateUniformValues::visitBranchInst(BranchInst &I) { @@ -84,9 +160,6 @@ void AMDGPUAnnotateUniformValues::visitLoadInst(LoadInst &I) { Value *Ptr = I.getPointerOperand(); if (!DA->isUniform(Ptr)) return; - auto isGlobalLoad = [&](LoadInst &Load)->bool { - return Load.getPointerAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS; - }; // We're tracking up to the Function boundaries, and cannot go beyond because // of FunctionPass restrictions. We can ensure that is memory not clobbered // for memory operations that are live in to entry points only. 
@@ -99,7 +172,7 @@ void AMDGPUAnnotateUniformValues::visitLoadInst(LoadInst &I) { } bool NotClobbered = false; - bool GlobalLoad = isGlobalLoad(I); + bool GlobalLoad = I.getPointerAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS; if (PtrI) NotClobbered = GlobalLoad && !isClobberedInFunction(&I); else if (isa<Argument>(Ptr) || isa<GlobalValue>(Ptr)) { @@ -139,6 +212,7 @@ bool AMDGPUAnnotateUniformValues::runOnFunction(Function &F) { DA = &getAnalysis<LegacyDivergenceAnalysis>(); MSSA = &getAnalysis<MemorySSAWrapperPass>().getMSSA(); + AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); isEntryFunc = AMDGPU::isEntryFunctionCC(F.getCallingConv()); visit(F); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index bb2e723f4ab0..6e2984f2a04f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -88,6 +88,8 @@ AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM, HSAMetadataStream.reset(new HSAMD::MetadataStreamerV2()); } else if (isHsaAbiVersion3(getGlobalSTI())) { HSAMetadataStream.reset(new HSAMD::MetadataStreamerV3()); + } else if (isHsaAbiVersion5(getGlobalSTI())) { + HSAMetadataStream.reset(new HSAMD::MetadataStreamerV5()); } else { HSAMetadataStream.reset(new HSAMD::MetadataStreamerV4()); } @@ -118,7 +120,7 @@ void AMDGPUAsmPrinter::emitStartOfAsmFile(Module &M) { TM.getTargetTriple().getOS() != Triple::AMDPAL) return; - if (isHsaAbiVersion3Or4(getGlobalSTI())) + if (isHsaAbiVersion3AndAbove(getGlobalSTI())) getTargetStreamer()->EmitDirectiveAMDGCNTarget(); if (TM.getTargetTriple().getOS() == Triple::AMDHSA) @@ -127,7 +129,7 @@ void AMDGPUAsmPrinter::emitStartOfAsmFile(Module &M) { if (TM.getTargetTriple().getOS() == Triple::AMDPAL) getTargetStreamer()->getPALMetadata()->readFromIR(M); - if (isHsaAbiVersion3Or4(getGlobalSTI())) + if (isHsaAbiVersion3AndAbove(getGlobalSTI())) return; // HSA emits NT_AMD_HSA_CODE_OBJECT_VERSION for code objects v2. 
@@ -259,7 +261,7 @@ void AMDGPUAsmPrinter::emitFunctionBodyEnd() { void AMDGPUAsmPrinter::emitFunctionEntryLabel() { if (TM.getTargetTriple().getOS() == Triple::AMDHSA && - isHsaAbiVersion3Or4(getGlobalSTI())) { + isHsaAbiVersion3AndAbove(getGlobalSTI())) { AsmPrinter::emitFunctionEntryLabel(); return; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp index 3ac7c45b3275..f5018e3a19ac 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp @@ -672,15 +672,15 @@ void MetadataStreamerV3::emitKernelAttrs(const Function &Func, Kern[".kind"] = Kern.getDocument()->getNode("fini"); } -void MetadataStreamerV3::emitKernelArgs(const Function &Func, - const GCNSubtarget &ST, +void MetadataStreamerV3::emitKernelArgs(const MachineFunction &MF, msgpack::MapDocNode Kern) { + auto &Func = MF.getFunction(); unsigned Offset = 0; auto Args = HSAMetadataDoc->getArrayNode(); for (auto &Arg : Func.args()) emitKernelArg(Arg, Offset, Args); - emitHiddenKernelArgs(Func, ST, Offset, Args); + emitHiddenKernelArgs(MF, Offset, Args); Kern[".args"] = Args; } @@ -789,10 +789,12 @@ void MetadataStreamerV3::emitKernelArg( Args.push_back(Arg); } -void MetadataStreamerV3::emitHiddenKernelArgs(const Function &Func, - const GCNSubtarget &ST, +void MetadataStreamerV3::emitHiddenKernelArgs(const MachineFunction &MF, unsigned &Offset, msgpack::ArrayDocNode Args) { + auto &Func = MF.getFunction(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); + unsigned HiddenArgNumBytes = ST.getImplicitArgNumBytes(Func); if (!HiddenArgNumBytes) return; @@ -910,7 +912,6 @@ void MetadataStreamerV3::emitKernel(const MachineFunction &MF, const SIProgramInfo &ProgramInfo) { auto &Func = MF.getFunction(); auto Kern = getHSAKernelProps(MF, ProgramInfo); - const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); assert(Func.getCallingConv() == CallingConv::AMDGPU_KERNEL || Func.getCallingConv() == CallingConv::SPIR_KERNEL); @@ -924,7 +925,7 @@ void MetadataStreamerV3::emitKernel(const MachineFunction &MF, (Twine(Func.getName()) + Twine(".kd")).str(), /*Copy=*/true); emitKernelLanguage(Func, Kern); emitKernelAttrs(Func, Kern); - emitKernelArgs(Func, ST, Kern); + emitKernelArgs(MF, Kern); } Kernels.push_back(Kern); @@ -954,6 +955,97 @@ void MetadataStreamerV4::begin(const Module &Mod, getRootMetadata("amdhsa.kernels") = HSAMetadataDoc->getArrayNode(); } +//===----------------------------------------------------------------------===// +// HSAMetadataStreamerV5 +//===----------------------------------------------------------------------===// + +void MetadataStreamerV5::emitVersion() { + auto Version = HSAMetadataDoc->getArrayNode(); + Version.push_back(Version.getDocument()->getNode(VersionMajorV5)); + Version.push_back(Version.getDocument()->getNode(VersionMinorV5)); + getRootMetadata("amdhsa.version") = Version; +} + +void MetadataStreamerV5::emitHiddenKernelArgs(const MachineFunction &MF, + unsigned &Offset, + msgpack::ArrayDocNode Args) { + auto &Func = MF.getFunction(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); + const Module *M = Func.getParent(); + auto &DL = M->getDataLayout(); + + auto Int64Ty = Type::getInt64Ty(Func.getContext()); + auto Int32Ty = Type::getInt32Ty(Func.getContext()); + auto Int16Ty = Type::getInt16Ty(Func.getContext()); + + emitKernelArg(DL, Int32Ty, Align(4), "hidden_block_count_x", Offset, Args); + emitKernelArg(DL, Int32Ty, Align(4), "hidden_block_count_y", 
Offset, Args); + emitKernelArg(DL, Int32Ty, Align(4), "hidden_block_count_z", Offset, Args); + + emitKernelArg(DL, Int16Ty, Align(2), "hidden_group_size_x", Offset, Args); + emitKernelArg(DL, Int16Ty, Align(2), "hidden_group_size_y", Offset, Args); + emitKernelArg(DL, Int16Ty, Align(2), "hidden_group_size_z", Offset, Args); + + emitKernelArg(DL, Int16Ty, Align(2), "hidden_remainder_x", Offset, Args); + emitKernelArg(DL, Int16Ty, Align(2), "hidden_remainder_y", Offset, Args); + emitKernelArg(DL, Int16Ty, Align(2), "hidden_remainder_z", Offset, Args); + + // Reserved for hidden_tool_correlation_id. + Offset += 8; + + Offset += 8; // Reserved. + + emitKernelArg(DL, Int64Ty, Align(8), "hidden_global_offset_x", Offset, Args); + emitKernelArg(DL, Int64Ty, Align(8), "hidden_global_offset_y", Offset, Args); + emitKernelArg(DL, Int64Ty, Align(8), "hidden_global_offset_z", Offset, Args); + + emitKernelArg(DL, Int16Ty, Align(2), "hidden_grid_dims", Offset, Args); + + Offset += 6; // Reserved. + auto Int8PtrTy = + Type::getInt8PtrTy(Func.getContext(), AMDGPUAS::GLOBAL_ADDRESS); + + if (M->getNamedMetadata("llvm.printf.fmts")) { + emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_printf_buffer", Offset, + Args); + } else + Offset += 8; // Skipped. + + if (M->getModuleFlag("amdgpu_hostcall")) { + emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_hostcall_buffer", Offset, + Args); + } else + Offset += 8; // Skipped. + + emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_multigrid_sync_arg", Offset, + Args); + + // Ignore temporarily until it is implemented. + // emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_heap_v1", Offset, Args); + Offset += 8; + + if (Func.hasFnAttribute("calls-enqueue-kernel")) { + emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_default_queue", Offset, + Args); + emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_completion_action", Offset, + Args); + } else + Offset += 16; // Skipped. + + Offset += 72; // Reserved. + + // hidden_private_base and hidden_shared_base are only used by GFX8. + if (ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) { + emitKernelArg(DL, Int32Ty, Align(4), "hidden_private_base", Offset, Args); + emitKernelArg(DL, Int32Ty, Align(4), "hidden_shared_base", Offset, Args); + } else + Offset += 8; // Skipped. + + const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); + if (MFI.hasQueuePtr()) + emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_queue_ptr", Offset, Args); +} + } // end namespace HSAMD } // end namespace AMDGPU } // end namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h index 54ed0afbba6d..bcf7fc449094 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h @@ -53,6 +53,11 @@ public: virtual void emitKernel(const MachineFunction &MF, const SIProgramInfo &ProgramInfo) = 0; + +protected: + virtual void emitVersion() = 0; + virtual void emitHiddenKernelArgs(const MachineFunction &MF, unsigned &Offset, + msgpack::ArrayDocNode Args) = 0; }; // TODO: Rename MetadataStreamerV3 -> MetadataStreamerMsgPackV3. 
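For readers tracing the new V5 hidden-argument block above: each emitKernelArg call appears to align the running Offset, record the argument at that offset, and then advance Offset by the argument's store size, while the bare `Offset += N;` statements skip reserved or not-yet-implemented slots. A minimal standalone model of that bookkeeping, under that assumption (helper names here are hypothetical, not part of the patch):

#include <cassert>
#include <cstdint>
#include <string>
#include <vector>

struct HiddenArg {
  std::string Name;
  uint64_t Offset;
  uint64_t Size;
};

// Round Value up to the next multiple of Align (mirrors llvm::alignTo).
static uint64_t alignTo(uint64_t Value, uint64_t Align) {
  return (Value + Align - 1) / Align * Align;
}

// Assumed behaviour of emitKernelArg(DL, Ty, Align, Name, Offset, Args):
// align the running offset, record the argument, then bump the offset.
static void recordArg(std::vector<HiddenArg> &Args, const char *Name,
                      uint64_t Size, uint64_t Align, uint64_t &Offset) {
  Offset = alignTo(Offset, Align);
  Args.push_back({Name, Offset, Size});
  Offset += Size;
}

int main() {
  std::vector<HiddenArg> Args;
  uint64_t Offset = 0;
  recordArg(Args, "hidden_block_count_x", 4, 4, Offset); // lands at offset 0
  recordArg(Args, "hidden_block_count_y", 4, 4, Offset); // offset 4
  recordArg(Args, "hidden_block_count_z", 4, 4, Offset); // offset 8
  recordArg(Args, "hidden_group_size_x", 2, 2, Offset);  // offset 12
  assert(Offset == 14);
  Offset += 8; // a reserved slot: skipped without emitting metadata
  assert(Offset == 22);
}

Under that model the three i64 hidden_global_offset_* fields always land on 8-byte boundaries regardless of how many 2-byte fields precede them, which is why the streamer can interleave the reserved gaps freely.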
@@ -79,7 +84,7 @@ protected: msgpack::MapDocNode getHSAKernelProps(const MachineFunction &MF, const SIProgramInfo &ProgramInfo) const; - void emitVersion(); + void emitVersion() override; void emitPrintf(const Module &Mod); @@ -87,8 +92,7 @@ protected: void emitKernelAttrs(const Function &Func, msgpack::MapDocNode Kern); - void emitKernelArgs(const Function &Func, const GCNSubtarget &ST, - msgpack::MapDocNode Kern); + void emitKernelArgs(const MachineFunction &MF, msgpack::MapDocNode Kern); void emitKernelArg(const Argument &Arg, unsigned &Offset, msgpack::ArrayDocNode Args); @@ -100,8 +104,8 @@ protected: StringRef BaseTypeName = "", StringRef AccQual = "", StringRef TypeQual = ""); - void emitHiddenKernelArgs(const Function &Func, const GCNSubtarget &ST, - unsigned &Offset, msgpack::ArrayDocNode Args); + void emitHiddenKernelArgs(const MachineFunction &MF, unsigned &Offset, + msgpack::ArrayDocNode Args) override; msgpack::DocNode &getRootMetadata(StringRef Key) { return HSAMetadataDoc->getRoot().getMap(/*Convert=*/true)[Key]; @@ -127,9 +131,9 @@ public: }; // TODO: Rename MetadataStreamerV4 -> MetadataStreamerMsgPackV4. -class MetadataStreamerV4 final : public MetadataStreamerV3 { - void emitVersion(); - +class MetadataStreamerV4 : public MetadataStreamerV3 { +protected: + void emitVersion() override; void emitTargetID(const IsaInfo::AMDGPUTargetID &TargetID); public: @@ -140,6 +144,18 @@ public: const IsaInfo::AMDGPUTargetID &TargetID) override; }; +// TODO: Rename MetadataStreamerV5 -> MetadataStreamerMsgPackV5. +class MetadataStreamerV5 final : public MetadataStreamerV4 { +protected: + void emitVersion() override; + void emitHiddenKernelArgs(const MachineFunction &MF, unsigned &Offset, + msgpack::ArrayDocNode Args) override; + +public: + MetadataStreamerV5() = default; + ~MetadataStreamerV5() = default; +}; + // TODO: Rename MetadataStreamerV2 -> MetadataStreamerYamlV2. class MetadataStreamerV2 final : public MetadataStreamer { private: @@ -167,8 +183,6 @@ private: const MachineFunction &MF, const SIProgramInfo &ProgramInfo) const; - void emitVersion(); - void emitPrintf(const Module &Mod); void emitKernelLanguage(const Function &Func); @@ -191,6 +205,13 @@ private: return HSAMetadata; } +protected: + void emitVersion() override; + void emitHiddenKernelArgs(const MachineFunction &MF, unsigned &Offset, + msgpack::ArrayDocNode Args) override { + llvm_unreachable("Dummy override should not be invoked!"); + } + public: MetadataStreamerV2() = default; ~MetadataStreamerV2() = default; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 04c6f67ed339..645d05aa9238 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -4778,6 +4778,7 @@ bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, case ELF::ELFABIVERSION_AMDGPU_HSA_V3: return legalizeTrapHsaQueuePtr(MI, MRI, B); case ELF::ELFABIVERSION_AMDGPU_HSA_V4: + case ELF::ELFABIVERSION_AMDGPU_HSA_V5: return ST.supportsGetDoorbellID() ? 
legalizeTrapHsa(MI, MRI, B) : legalizeTrapHsaQueuePtr(MI, MRI, B); diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp index c28427758ac7..bbbadfdfd444 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp @@ -16,8 +16,9 @@ #include "GCNSubtarget.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/Loads.h" -#include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/InitializePasses.h" #include "llvm/Target/TargetMachine.h" diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index 2d8126a49327..99b7ffb33884 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -13,15 +13,16 @@ #include "AMDGPU.h" #include "GCNSubtarget.h" +#include "Utils/AMDGPUBaseInfo.h" #include "llvm/Analysis/CaptureTracking.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/IR/IntrinsicsR600.h" #include "llvm/Pass.h" #include "llvm/Target/TargetMachine.h" -#include "Utils/AMDGPUBaseInfo.h" #define DEBUG_TYPE "amdgpu-promote-alloca" diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index c1c88d9a7462..ffe626513d47 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -1129,7 +1129,8 @@ class KernelScopeInfo { if (i >= SgprIndexUnusedMin) { SgprIndexUnusedMin = ++i; if (Ctx) { - MCSymbol * const Sym = Ctx->getOrCreateSymbol(Twine(".kernel.sgpr_count")); + MCSymbol* const Sym = + Ctx->getOrCreateSymbol(Twine(".kernel.sgpr_count")); Sym->setVariableValue(MCConstantExpr::create(SgprIndexUnusedMin, *Ctx)); } } @@ -1139,7 +1140,8 @@ class KernelScopeInfo { if (i >= VgprIndexUnusedMin) { VgprIndexUnusedMin = ++i; if (Ctx) { - MCSymbol * const Sym = Ctx->getOrCreateSymbol(Twine(".kernel.vgpr_count")); + MCSymbol* const Sym = + Ctx->getOrCreateSymbol(Twine(".kernel.vgpr_count")); Sym->setVariableValue(MCConstantExpr::create(VgprIndexUnusedMin, *Ctx)); } } @@ -1296,7 +1298,7 @@ public: // AsmParser::parseDirectiveSet() cannot be specialized for specific target. 
AMDGPU::IsaVersion ISA = AMDGPU::getIsaVersion(getSTI().getCPU()); MCContext &Ctx = getContext(); - if (ISA.Major >= 6 && isHsaAbiVersion3Or4(&getSTI())) { + if (ISA.Major >= 6 && isHsaAbiVersion3AndAbove(&getSTI())) { MCSymbol *Sym = Ctx.getOrCreateSymbol(Twine(".amdgcn.gfx_generation_number")); Sym->setVariableValue(MCConstantExpr::create(ISA.Major, Ctx)); @@ -1313,7 +1315,7 @@ public: Sym = Ctx.getOrCreateSymbol(Twine(".option.machine_version_stepping")); Sym->setVariableValue(MCConstantExpr::create(ISA.Stepping, Ctx)); } - if (ISA.Major >= 6 && isHsaAbiVersion3Or4(&getSTI())) { + if (ISA.Major >= 6 && isHsaAbiVersion3AndAbove(&getSTI())) { initializeGprCountSymbol(IS_VGPR); initializeGprCountSymbol(IS_SGPR); } else @@ -2747,7 +2749,7 @@ AMDGPUAsmParser::parseRegister(bool RestoreOnFailure) { if (!ParseAMDGPURegister(RegKind, Reg, RegNum, RegWidth)) { return nullptr; } - if (isHsaAbiVersion3Or4(&getSTI())) { + if (isHsaAbiVersion3AndAbove(&getSTI())) { if (!updateGprCountSymbols(RegKind, RegNum, RegWidth)) return nullptr; } else @@ -5099,7 +5101,7 @@ bool AMDGPUAsmParser::ParseDirectiveHSAMetadata() { const char *AssemblerDirectiveBegin; const char *AssemblerDirectiveEnd; std::tie(AssemblerDirectiveBegin, AssemblerDirectiveEnd) = - isHsaAbiVersion3Or4(&getSTI()) + isHsaAbiVersion3AndAbove(&getSTI()) ? std::make_tuple(HSAMD::V3::AssemblerDirectiveBegin, HSAMD::V3::AssemblerDirectiveEnd) : std::make_tuple(HSAMD::AssemblerDirectiveBegin, @@ -5116,7 +5118,7 @@ bool AMDGPUAsmParser::ParseDirectiveHSAMetadata() { HSAMetadataString)) return true; - if (isHsaAbiVersion3Or4(&getSTI())) { + if (isHsaAbiVersion3AndAbove(&getSTI())) { if (!getTargetStreamer().EmitHSAMetadataV3(HSAMetadataString)) return Error(getLoc(), "invalid HSA metadata"); } else { @@ -5266,7 +5268,7 @@ bool AMDGPUAsmParser::ParseDirectiveAMDGPULDS() { bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) { StringRef IDVal = DirectiveID.getString(); - if (isHsaAbiVersion3Or4(&getSTI())) { + if (isHsaAbiVersion3AndAbove(&getSTI())) { if (IDVal == ".amdhsa_kernel") return ParseDirectiveAMDHSAKernel(); @@ -7440,7 +7442,7 @@ void AMDGPUAsmParser::onBeginOfFile() { if (!getTargetStreamer().getTargetID()) getTargetStreamer().initializeTargetID(getSTI(), getSTI().getFeatureString()); - if (isHsaAbiVersion3Or4(&getSTI())) + if (isHsaAbiVersion3AndAbove(&getSTI())) getTargetStreamer().EmitDirectiveAMDGCNTarget(); } diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp index 9578bdb0bad0..7aa5f1abf65b 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp @@ -396,6 +396,7 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor( break; case ELF::ELFABIVERSION_AMDGPU_HSA_V3: case ELF::ELFABIVERSION_AMDGPU_HSA_V4: + case ELF::ELFABIVERSION_AMDGPU_HSA_V5: if (getTargetID()->isXnackSupported()) OS << "\t\t.amdhsa_reserve_xnack_mask " << getTargetID()->isXnackOnOrAny() << '\n'; break; @@ -578,6 +579,7 @@ unsigned AMDGPUTargetELFStreamer::getEFlagsAMDHSA() { case ELF::ELFABIVERSION_AMDGPU_HSA_V3: return getEFlagsV3(); case ELF::ELFABIVERSION_AMDGPU_HSA_V4: + case ELF::ELFABIVERSION_AMDGPU_HSA_V5: return getEFlagsV4(); } } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 561866b5a398..e2f4a0896bc3 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -5423,6 
+5423,7 @@ SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const { case ELF::ELFABIVERSION_AMDGPU_HSA_V3: return lowerTrapHsaQueuePtr(Op, DAG); case ELF::ELFABIVERSION_AMDGPU_HSA_V4: + case ELF::ELFABIVERSION_AMDGPU_HSA_V5: return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(Op, DAG) : lowerTrapHsaQueuePtr(Op, DAG); } diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp index c18637bdbc43..44bdbe37dec0 100644 --- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -938,12 +938,6 @@ bool SILoadStoreOptimizer::checkAndPrepareMerge( // 2. It is safe to move MBBI down past the instruction that I will // be merged into. - if (MBBI->hasUnmodeledSideEffects()) { - // We can't re-order this instruction with respect to other memory - // operations, so we fail both conditions mentioned above. - return false; - } - if (MBBI->mayLoadOrStore() && (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) || !canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA))) { @@ -1977,10 +1971,10 @@ SILoadStoreOptimizer::collectMergeableInsts( if (promoteConstantOffsetToImm(MI, Visited, AnchorList)) Modified = true; - // Don't combine if volatile. We also won't be able to merge across this, so - // break the search. We can look after this barrier for separate merges. - if (MI.hasOrderedMemoryRef()) { - LLVM_DEBUG(dbgs() << "Breaking search on memory fence: " << MI); + // Treat volatile accesses, ordered accesses and unmodeled side effects as + // barriers. We can look after this barrier for separate merges. + if (MI.hasOrderedMemoryRef() || MI.hasUnmodeledSideEffects()) { + LLVM_DEBUG(dbgs() << "Breaking search on barrier: " << MI); // Search will resume after this instruction in a separate merge list. ++BlockI; diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 1e96266eb06c..683be871ff82 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -99,6 +99,8 @@ Optional<uint8_t> getHsaAbiVersion(const MCSubtargetInfo *STI) { return ELF::ELFABIVERSION_AMDGPU_HSA_V3; case 4: return ELF::ELFABIVERSION_AMDGPU_HSA_V4; + case 5: + return ELF::ELFABIVERSION_AMDGPU_HSA_V5; default: report_fatal_error(Twine("Unsupported AMDHSA Code Object Version ") + Twine(AmdhsaCodeObjectVersion)); @@ -123,8 +125,15 @@ bool isHsaAbiVersion4(const MCSubtargetInfo *STI) { return false; } -bool isHsaAbiVersion3Or4(const MCSubtargetInfo *STI) { - return isHsaAbiVersion3(STI) || isHsaAbiVersion4(STI); +bool isHsaAbiVersion5(const MCSubtargetInfo *STI) { + if (Optional<uint8_t> HsaAbiVer = getHsaAbiVersion(STI)) + return *HsaAbiVer == ELF::ELFABIVERSION_AMDGPU_HSA_V5; + return false; +} + +bool isHsaAbiVersion3AndAbove(const MCSubtargetInfo *STI) { + return isHsaAbiVersion3(STI) || isHsaAbiVersion4(STI) || + isHsaAbiVersion5(STI); } #define GET_MIMGBaseOpcodesTable_IMPL @@ -495,6 +504,7 @@ std::string AMDGPUTargetID::toString() const { Features += "+sram-ecc"; break; case ELF::ELFABIVERSION_AMDGPU_HSA_V4: + case ELF::ELFABIVERSION_AMDGPU_HSA_V5: // sramecc. 
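A brief aside on the renamed predicate defined in the AMDGPUBaseInfo.cpp hunk above: assuming the ELF ABI-version constants for HSA are numerically ascending (V2 < V3 < V4 < V5), isHsaAbiVersion3AndAbove is equivalent to a single greater-or-equal comparison on the value returned by getHsaAbiVersion. The sketch below only illustrates that equivalence; it is not the formulation used in the patch, which ORs the three per-version predicates.

// Equivalent formulation, assuming ELFABIVERSION_AMDGPU_HSA_V3/V4/V5 are
// ascending values and getHsaAbiVersion only ever returns one of the known
// HSA ABI versions (as in the switch added above).
bool isHsaAbiVersion3AndAbove(const MCSubtargetInfo *STI) {
  if (Optional<uint8_t> HsaAbiVer = getHsaAbiVersion(STI))
    return *HsaAbiVer >= ELF::ELFABIVERSION_AMDGPU_HSA_V3;
  return false;
}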
if (getSramEccSetting() == TargetIDSetting::Off) Features += ":sramecc-"; diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 89f928eb8b92..4516b511f3c8 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -47,9 +47,12 @@ bool isHsaAbiVersion3(const MCSubtargetInfo *STI); /// \returns True if HSA OS ABI Version identification is 4, /// false otherwise. bool isHsaAbiVersion4(const MCSubtargetInfo *STI); +/// \returns True if HSA OS ABI Version identification is 5, +/// false otherwise. +bool isHsaAbiVersion5(const MCSubtargetInfo *STI); /// \returns True if HSA OS ABI Version identification is 3 or 4, /// false otherwise. -bool isHsaAbiVersion3Or4(const MCSubtargetInfo *STI); +bool isHsaAbiVersion3AndAbove(const MCSubtargetInfo *STI); struct GcnBufferFormatInfo { unsigned Format; diff --git a/llvm/lib/Target/ARM/ARM.td b/llvm/lib/Target/ARM/ARM.td index 4efbdbb2abc8..27edf69b4abf 100644 --- a/llvm/lib/Target/ARM/ARM.td +++ b/llvm/lib/Target/ARM/ARM.td @@ -656,6 +656,8 @@ def ProcA710 : SubtargetFeature<"cortex-a710", "ARMProcFamily", "CortexA710", "Cortex-A710 ARM processors", []>; def ProcX1 : SubtargetFeature<"cortex-x1", "ARMProcFamily", "CortexX1", "Cortex-X1 ARM processors", []>; +def ProcX1C : SubtargetFeature<"cortex-x1c", "ARMProcFamily", "CortexX1C", + "Cortex-X1C ARM processors", []>; def ProcV1 : SubtargetFeature<"neoverse-v1", "ARMProcFamily", "NeoverseV1", "Neoverse-V1 ARM processors", []>; @@ -1443,6 +1445,14 @@ def : ProcNoItin<"cortex-x1", [ARMv82a, ProcX1, FeatureFullFP16, FeatureDotProd]>; +def : ProcNoItin<"cortex-x1c", [ARMv82a, ProcX1C, + FeatureHWDivThumb, + FeatureHWDivARM, + FeatureCrypto, + FeatureCRC, + FeatureFullFP16, + FeatureDotProd]>; + def : ProcNoItin<"neoverse-v1", [ARMv84a, FeatureHWDivThumb, FeatureHWDivARM, diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp index cde715880376..5b0bae4d9274 100644 --- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -752,23 +752,17 @@ unsigned ARMBaseInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo(); const MCInstrDesc &MCID = MI.getDesc(); - if (MCID.getSize()) - return MCID.getSize(); switch (MI.getOpcode()) { default: - // pseudo-instruction sizes are zero. - return 0; + // Return the size specified in .td file. If there's none, return 0, as we + // can't define a default size (Thumb1 instructions are 2 bytes, Thumb2 + // instructions are 2-4 bytes, and ARM instructions are 4 bytes), in + // contrast to AArch64 instructions which have a default size of 4 bytes for + // example. + return MCID.getSize(); case TargetOpcode::BUNDLE: return getInstBundleLength(MI); - case ARM::MOVi16_ga_pcrel: - case ARM::MOVTi16_ga_pcrel: - case ARM::t2MOVi16_ga_pcrel: - case ARM::t2MOVTi16_ga_pcrel: - return 4; - case ARM::MOVi32imm: - case ARM::t2MOVi32imm: - return 8; case ARM::CONSTPOOL_ENTRY: case ARM::JUMPTABLE_INSTS: case ARM::JUMPTABLE_ADDRS: @@ -777,19 +771,6 @@ unsigned ARMBaseInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { // If this machine instr is a constant pool entry, its size is recorded as // operand #2. 
return MI.getOperand(2).getImm(); - case ARM::Int_eh_sjlj_longjmp: - return 16; - case ARM::tInt_eh_sjlj_longjmp: - return 10; - case ARM::tInt_WIN_eh_sjlj_longjmp: - return 12; - case ARM::Int_eh_sjlj_setjmp: - case ARM::Int_eh_sjlj_setjmp_nofp: - return 20; - case ARM::tInt_eh_sjlj_setjmp: - case ARM::t2Int_eh_sjlj_setjmp: - case ARM::t2Int_eh_sjlj_setjmp_nofp: - return 12; case ARM::SPACE: return MI.getOperand(1).getImm(); case ARM::INLINEASM: @@ -800,14 +781,6 @@ unsigned ARMBaseInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { Size = alignTo(Size, 4); return Size; } - case ARM::SpeculationBarrierISBDSBEndBB: - case ARM::t2SpeculationBarrierISBDSBEndBB: - // This gets lowered to 2 4-byte instructions. - return 8; - case ARM::SpeculationBarrierSBEndBB: - case ARM::t2SpeculationBarrierSBEndBB: - // This gets lowered to 1 4-byte instructions. - return 4; } } diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index fe4e6b24367a..1b41427a1cab 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -14527,7 +14527,7 @@ static SDValue PerformXORCombine(SDNode *N, SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); const TargetLowering *TLI = Subtarget->getTargetLowering(); - if (TLI->isConstTrueVal(N1.getNode()) && + if (TLI->isConstTrueVal(N1) && (N0->getOpcode() == ARMISD::VCMP || N0->getOpcode() == ARMISD::VCMPZ)) { if (CanInvertMVEVCMP(N0)) { SDLoc DL(N0); diff --git a/llvm/lib/Target/ARM/ARMInstrInfo.td b/llvm/lib/Target/ARM/ARMInstrInfo.td index 1c1db473f866..32a3911d3369 100644 --- a/llvm/lib/Target/ARM/ARMInstrInfo.td +++ b/llvm/lib/Target/ARM/ARMInstrInfo.td @@ -3657,6 +3657,8 @@ def : InstAlias<"mov${p} $Rd, $imm", (MOVi16 GPR:$Rd, imm0_65535_expr:$imm, pred:$p), 0>, Requires<[IsARM, HasV6T2]>; +// This gets lowered to a single 4-byte instructions +let Size = 4 in def MOVi16_ga_pcrel : PseudoInst<(outs GPR:$Rd), (ins i32imm:$addr, pclabel:$id), IIC_iMOVi, []>, Sched<[WriteALU]>; @@ -3680,6 +3682,8 @@ def MOVTi16 : AI1<0b1010, (outs GPRnopc:$Rd), let DecoderMethod = "DecodeArmMOVTWInstruction"; } +// This gets lowered to a single 4-byte instructions +let Size = 4 in def MOVTi16_ga_pcrel : PseudoInst<(outs GPR:$Rd), (ins GPR:$src, i32imm:$addr, pclabel:$id), IIC_iMOVi, []>, Sched<[WriteALU]>; @@ -5895,27 +5899,30 @@ def : ARMPat<(ARMthread_pointer), (MRC 15, 0, 13, 0, 3)>, // // These are pseudo-instructions and are lowered to individual MC-insts, so // no encoding information is necessary. 
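The per-pseudo `let Size = N in` annotations added to the .td files below replace the hard-coded byte counts deleted from ARMBaseInstrInfo::getInstSizeInBytes above. Accurate sizes matter because ARM passes that lay out code (constant-island placement, branch relaxation and similar) sum getInstSizeInBytes over whole blocks, so a pseudo that silently reported 0 bytes would under-estimate branch distances. A hypothetical helper sketching that use, not code from the patch:

#include "ARMBaseInstrInfo.h"                 // backend-internal header
#include "llvm/CodeGen/MachineBasicBlock.h"

// Sum of the byte sizes of all instructions in a block. With this patch the
// per-instruction size for pseudos comes from the `Size` field in the .td
// definitions via MCInstrDesc::getSize(), instead of a hand-maintained switch.
static unsigned blockSizeInBytes(const llvm::MachineBasicBlock &MBB,
                                 const llvm::ARMBaseInstrInfo &TII) {
  unsigned Bytes = 0;
  for (const llvm::MachineInstr &MI : MBB)
    Bytes += TII.getInstSizeInBytes(MI);
  return Bytes;
}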
+// This gets lowered to an instruction sequence of 20 bytes let Defs = [ R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, LR, CPSR, Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7, Q8, Q9, Q10, Q11, Q12, Q13, Q14, Q15 ], - hasSideEffects = 1, isBarrier = 1, usesCustomInserter = 1 in { + hasSideEffects = 1, isBarrier = 1, usesCustomInserter = 1, Size = 20 in { def Int_eh_sjlj_setjmp : PseudoInst<(outs), (ins GPR:$src, GPR:$val), NoItinerary, [(set R0, (ARMeh_sjlj_setjmp GPR:$src, GPR:$val))]>, Requires<[IsARM, HasVFP2]>; } +// This gets lowered to an instruction sequence of 20 bytes let Defs = [ R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, LR, CPSR ], - hasSideEffects = 1, isBarrier = 1, usesCustomInserter = 1 in { + hasSideEffects = 1, isBarrier = 1, usesCustomInserter = 1, Size = 20 in { def Int_eh_sjlj_setjmp_nofp : PseudoInst<(outs), (ins GPR:$src, GPR:$val), NoItinerary, [(set R0, (ARMeh_sjlj_setjmp GPR:$src, GPR:$val))]>, Requires<[IsARM, NoVFP]>; } +// This gets lowered to an instruction sequence of 16 bytes // FIXME: Non-IOS version(s) -let isBarrier = 1, hasSideEffects = 1, isTerminator = 1, +let isBarrier = 1, hasSideEffects = 1, isTerminator = 1, Size = 16, Defs = [ R7, LR, SP ] in { def Int_eh_sjlj_longjmp : PseudoInst<(outs), (ins GPR:$src, GPR:$scratch), NoItinerary, @@ -5958,7 +5965,7 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [SP] in // This is a single pseudo instruction, the benefit is that it can be remat'd // as a single unit instead of having to handle reg inputs. // FIXME: Remove this when we can do generalized remat. -let isReMaterializable = 1, isMoveImm = 1 in +let isReMaterializable = 1, isMoveImm = 1, Size = 8 in def MOVi32imm : PseudoInst<(outs GPR:$dst), (ins i32imm:$src), IIC_iMOVix2, [(set GPR:$dst, (arm_i32imm:$src))]>, Requires<[IsARM]>; @@ -6419,8 +6426,12 @@ def SPACE : PseudoInst<(outs GPR:$Rd), (ins i32imm:$size, GPR:$Rn), // SpeculationBarrierEndBB must only be used after an unconditional control // flow, i.e. after a terminator for which isBarrier is True. let hasSideEffects = 1, isCodeGenOnly = 1, isTerminator = 1, isBarrier = 1 in { + // This gets lowered to a pair of 4-byte instructions + let Size = 8 in def SpeculationBarrierISBDSBEndBB : PseudoInst<(outs), (ins), NoItinerary, []>, Sched<[]>; + // This gets lowered to a single 4-byte instructions + let Size = 4 in def SpeculationBarrierSBEndBB : PseudoInst<(outs), (ins), NoItinerary, []>, Sched<[]>; } diff --git a/llvm/lib/Target/ARM/ARMInstrThumb.td b/llvm/lib/Target/ARM/ARMInstrThumb.td index f09ad8167600..71527ae1ab11 100644 --- a/llvm/lib/Target/ARM/ARMInstrThumb.td +++ b/llvm/lib/Target/ARM/ARMInstrThumb.td @@ -1537,25 +1537,28 @@ def tTPsoft : tPseudoInst<(outs), (ins), 4, IIC_Br, // Defs. By doing so, we also cause the prologue/epilogue code to actively // preserve all of the callee-saved registers, which is exactly what we want. // $val is a scratch register for our use. 
+// This gets lowered to an instruction sequence of 12 bytes let Defs = [ R0, R1, R2, R3, R4, R5, R6, R7, R12, CPSR ], - hasSideEffects = 1, isBarrier = 1, isCodeGenOnly = 1, + hasSideEffects = 1, isBarrier = 1, isCodeGenOnly = 1, Size = 12, usesCustomInserter = 1 in def tInt_eh_sjlj_setjmp : ThumbXI<(outs),(ins tGPR:$src, tGPR:$val), AddrModeNone, 0, NoItinerary, "","", [(set R0, (ARMeh_sjlj_setjmp tGPR:$src, tGPR:$val))]>; +// This gets lowered to an instruction sequence of 10 bytes // FIXME: Non-IOS version(s) let isBarrier = 1, hasSideEffects = 1, isTerminator = 1, isCodeGenOnly = 1, - Defs = [ R7, LR, SP ] in + Size = 10, Defs = [ R7, LR, SP ] in def tInt_eh_sjlj_longjmp : XI<(outs), (ins tGPR:$src, tGPR:$scratch), AddrModeNone, 0, IndexModeNone, Pseudo, NoItinerary, "", "", [(ARMeh_sjlj_longjmp tGPR:$src, tGPR:$scratch)]>, Requires<[IsThumb,IsNotWindows]>; +// This gets lowered to an instruction sequence of 12 bytes // (Windows is Thumb2-only) let isBarrier = 1, hasSideEffects = 1, isTerminator = 1, isCodeGenOnly = 1, - Defs = [ R11, LR, SP ] in + Size = 12, Defs = [ R11, LR, SP ] in def tInt_WIN_eh_sjlj_longjmp : XI<(outs), (ins GPR:$src, GPR:$scratch), AddrModeNone, 0, IndexModeNone, Pseudo, NoItinerary, "", "", [(ARMeh_sjlj_longjmp GPR:$src, GPR:$scratch)]>, diff --git a/llvm/lib/Target/ARM/ARMInstrThumb2.td b/llvm/lib/Target/ARM/ARMInstrThumb2.td index 6e8e61ca2b8e..f80b9a5053f7 100644 --- a/llvm/lib/Target/ARM/ARMInstrThumb2.td +++ b/llvm/lib/Target/ARM/ARMInstrThumb2.td @@ -2194,6 +2194,8 @@ def : InstAlias<"mov${p} $Rd, $imm", (t2MOVi16 rGPR:$Rd, imm256_65535_expr:$imm, pred:$p), 0>, Requires<[IsThumb, HasV8MBaseline]>, Sched<[WriteALU]>; +// This gets lowered to a single 4-byte instructions +let Size = 4 in def t2MOVi16_ga_pcrel : PseudoInst<(outs rGPR:$Rd), (ins i32imm:$addr, pclabel:$id), IIC_iMOVi, []>, Sched<[WriteALU]>; @@ -2223,6 +2225,8 @@ def t2MOVTi16 : T2I<(outs rGPR:$Rd), let DecoderMethod = "DecodeT2MOVTWInstruction"; } +// This gets lowered to a single 4-byte instructions +let Size = 4 in def t2MOVTi16_ga_pcrel : PseudoInst<(outs rGPR:$Rd), (ins rGPR:$src, i32imm:$addr, pclabel:$id), IIC_iMOVi, []>, Sched<[WriteALU]>, Requires<[IsThumb, HasV8MBaseline]>; @@ -3814,10 +3818,11 @@ def : T2Pat<(stlex_2 (and GPR:$Rt, 0xffff), addr_offset_none:$addr), // doing so, we also cause the prologue/epilogue code to actively preserve // all of the callee-saved registers, which is exactly what we want. // $val is a scratch register for our use. +// This gets lowered to an instruction sequence of 12 bytes let Defs = [ R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, LR, CPSR, Q0, Q1, Q2, Q3, Q8, Q9, Q10, Q11, Q12, Q13, Q14, Q15], - hasSideEffects = 1, isBarrier = 1, isCodeGenOnly = 1, + hasSideEffects = 1, isBarrier = 1, isCodeGenOnly = 1, Size = 12, usesCustomInserter = 1 in { def t2Int_eh_sjlj_setjmp : Thumb2XI<(outs), (ins tGPR:$src, tGPR:$val), AddrModeNone, 0, NoItinerary, "", "", @@ -3825,9 +3830,10 @@ let Defs = Requires<[IsThumb2, HasVFP2]>; } +// This gets lowered to an instruction sequence of 12 bytes let Defs = [ R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, LR, CPSR ], - hasSideEffects = 1, isBarrier = 1, isCodeGenOnly = 1, + hasSideEffects = 1, isBarrier = 1, isCodeGenOnly = 1, Size = 12, usesCustomInserter = 1 in { def t2Int_eh_sjlj_setjmp_nofp : Thumb2XI<(outs), (ins tGPR:$src, tGPR:$val), AddrModeNone, 0, NoItinerary, "", "", @@ -4224,7 +4230,7 @@ def : t2InstAlias<"hvc\t$imm16", (t2HVC imm0_65535:$imm16)>; // 32-bit immediate using movw + movt. 
// This is a single pseudo instruction to make it re-materializable. // FIXME: Remove this when we can do generalized remat. -let isReMaterializable = 1, isMoveImm = 1 in +let isReMaterializable = 1, isMoveImm = 1, Size = 8 in def t2MOVi32imm : PseudoInst<(outs rGPR:$dst), (ins i32imm:$src), IIC_iMOVix2, [(set rGPR:$dst, (i32 imm:$src))]>, Requires<[IsThumb, UseMovt]>; @@ -5006,8 +5012,12 @@ def : InstAlias<"dfb${p}", (t2DSB 0xc, pred:$p), 1>, Requires<[HasDFB]>; // SpeculationBarrierEndBB must only be used after an unconditional control // flow, i.e. after a terminator for which isBarrier is True. let hasSideEffects = 1, isCodeGenOnly = 1, isTerminator = 1, isBarrier = 1 in { + // This gets lowered to a pair of 4-byte instructions + let Size = 8 in def t2SpeculationBarrierISBDSBEndBB : PseudoInst<(outs), (ins), NoItinerary, []>, Sched<[]>; + // This gets lowered to a single 4-byte instructions + let Size = 4 in def t2SpeculationBarrierSBEndBB : PseudoInst<(outs), (ins), NoItinerary, []>, Sched<[]>; } diff --git a/llvm/lib/Target/ARM/ARMSubtarget.cpp b/llvm/lib/Target/ARM/ARMSubtarget.cpp index 2dd25234dc50..32160b109343 100644 --- a/llvm/lib/Target/ARM/ARMSubtarget.cpp +++ b/llvm/lib/Target/ARM/ARMSubtarget.cpp @@ -304,6 +304,7 @@ void ARMSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { case CortexM7: case CortexR52: case CortexX1: + case CortexX1C: break; case Exynos: LdStMultipleTiming = SingleIssuePlusExtras; diff --git a/llvm/lib/Target/ARM/ARMSubtarget.h b/llvm/lib/Target/ARM/ARMSubtarget.h index 1c2b7ee6ba35..7cbdc014299f 100644 --- a/llvm/lib/Target/ARM/ARMSubtarget.h +++ b/llvm/lib/Target/ARM/ARMSubtarget.h @@ -77,6 +77,7 @@ protected: CortexR52, CortexR7, CortexX1, + CortexX1C, Exynos, Krait, Kryo, diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp index e0750a9945d2..d9d563ead260 100644 --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -2109,9 +2109,6 @@ static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE, } Type *T = I.getType(); - if (T->isPointerTy()) - T = T->getPointerElementType(); - if (T->getScalarSizeInBits() > 32) { LLVM_DEBUG(dbgs() << "Unsupported Type: "; T->dump()); return false; diff --git a/llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp b/llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp index ea6a7498e27f..311e43d77210 100644 --- a/llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp +++ b/llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp @@ -313,12 +313,18 @@ bool HexagonOptAddrMode::isSafeToExtLR(NodeAddr<StmtNode *> SN, return false; } + // If the register is undefined (for example if it's a reserved register), + // it may still be possible to extend the range, but it's safer to be + // conservative and just punt. + if (LRExtRegRD == 0) + return false; + MachineInstr *UseMI = NodeAddr<StmtNode *>(IA).Addr->getCode(); NodeAddr<DefNode *> LRExtRegDN = DFG->addr<DefNode *>(LRExtRegRD); // Reaching Def to LRExtReg can't be a phi. 
if ((LRExtRegDN.Addr->getFlags() & NodeAttrs::PhiRef) && MI->getParent() != UseMI->getParent()) - return false; + return false; } return true; } diff --git a/llvm/lib/Target/M68k/GISel/M68kLegalizerInfo.cpp b/llvm/lib/Target/M68k/GISel/M68kLegalizerInfo.cpp index 860c0ce29326..79e9ad4dd1d2 100644 --- a/llvm/lib/Target/M68k/GISel/M68kLegalizerInfo.cpp +++ b/llvm/lib/Target/M68k/GISel/M68kLegalizerInfo.cpp @@ -21,13 +21,32 @@ using namespace llvm; M68kLegalizerInfo::M68kLegalizerInfo(const M68kSubtarget &ST) { using namespace TargetOpcode; - const LLT S32 = LLT::scalar(32); - const LLT P0 = LLT::pointer(0, 32); - getActionDefinitionsBuilder(G_LOAD).legalFor({S32}); - getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({P0}); - getActionDefinitionsBuilder(G_ADD).legalFor({S32}); - getActionDefinitionsBuilder(G_SUB).legalFor({S32}); - getActionDefinitionsBuilder(G_MUL).legalFor({S32}); - getActionDefinitionsBuilder(G_UDIV).legalFor({S32}); + const LLT s8 = LLT::scalar(8); + const LLT s16 = LLT::scalar(16); + const LLT s32 = LLT::scalar(32); + const LLT p0 = LLT::pointer(0, 32); + + getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL, G_UDIV, G_AND}) + .legalFor({s8, s16, s32}) + .clampScalar(0, s8, s32) + .widenScalarToNextPow2(0, 8); + + getActionDefinitionsBuilder(G_CONSTANT) + .legalFor({s32, p0}) + .clampScalar(0, s32, s32); + + getActionDefinitionsBuilder({G_FRAME_INDEX, G_GLOBAL_VALUE}).legalFor({p0}); + + getActionDefinitionsBuilder({G_STORE, G_LOAD}) + .legalForTypesWithMemDesc({{s32, p0, s32, 4}, + {s32, p0, s16, 4}, + {s32, p0, s8, 4}, + {s16, p0, s16, 2}, + {s8, p0, s8, 1}, + {p0, p0, s32, 4}}) + .clampScalar(0, s8, s32); + + getActionDefinitionsBuilder(G_PTR_ADD).legalFor({{p0, s32}}); + getLegacyLegalizerInfo().computeTables(); } diff --git a/llvm/lib/Target/M68k/M68kInstrBits.td b/llvm/lib/Target/M68k/M68kInstrBits.td index d610bce5c277..0d1278102378 100644 --- a/llvm/lib/Target/M68k/M68kInstrBits.td +++ b/llvm/lib/Target/M68k/M68kInstrBits.td @@ -79,6 +79,10 @@ def BTST32di : MxBTST_RI<MxType32d>; // Memory BTST limited to 8 bits only def BTST8jd : MxBTST_MR<MxType8d, MxType8.JOp, MxType8.JPat, MxEncEAj_0, MxExtEmpty>; +def BTST8od : MxBTST_MR<MxType8d, MxType8.OOp, MxType8.OPat, + MxEncEAo_0, MxExtEmpty>; +def BTST8ed : MxBTST_MR<MxType8d, MxType8.EOp, MxType8.EPat, + MxEncEAe_0, MxExtEmpty>; def BTST8pd : MxBTST_MR<MxType8d, MxType8.POp, MxType8.PPat, MxEncEAp_0, MxExtI16_0>; def BTST8fd : MxBTST_MR<MxType8d, MxType8.FOp, MxType8.FPat, @@ -90,6 +94,10 @@ def BTST8kd : MxBTST_MR<MxType8d, MxType8.KOp, MxType8.KPat, def BTST8ji : MxBTST_MI<MxType8d, MxType8.JOp, MxType8.JPat, MxEncEAj_0, MxExtEmpty>; +def BTST8oi : MxBTST_MI<MxType8d, MxType8.OOp, MxType8.OPat, + MxEncEAo_0, MxExtEmpty>; +def BTST8ei : MxBTST_MI<MxType8d, MxType8.EOp, MxType8.EPat, + MxEncEAe_0, MxExtEmpty>; def BTST8pi : MxBTST_MI<MxType8d, MxType8.POp, MxType8.PPat, MxEncEAp_0, MxExtI16_0>; def BTST8fi : MxBTST_MI<MxType8d, MxType8.FOp, MxType8.FPat, diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index eac237bb27bb..7b5248906b56 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -574,7 +574,6 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, for (const auto &Op : {ISD::FMINIMUM, ISD::FMAXIMUM}) { setFP16OperationAction(Op, MVT::f16, GetMinMaxAction(Expand), Expand); setOperationAction(Op, MVT::f32, GetMinMaxAction(Expand)); - setOperationAction(Op, MVT::f64, 
GetMinMaxAction(Expand)); setFP16OperationAction(Op, MVT::v2f16, GetMinMaxAction(Expand), Expand); } diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index 22e200e77831..22084cddc092 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -896,6 +896,7 @@ defm FMUL : F3_fma_component<"mul", fmul>; defm FMIN : F3<"min", fminnum>; defm FMAX : F3<"max", fmaxnum>; +// Note: min.NaN.f64 and max.NaN.f64 do not actually exist. defm FMINNAN : F3<"min.NaN", fminimum>; defm FMAXNAN : F3<"max.NaN", fmaximum>; diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index 25cc34badda0..cbeae0ab03b8 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -1252,7 +1252,6 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Legal); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Legal); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Legal); - setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Legal); } else { setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom); @@ -9093,22 +9092,30 @@ bool llvm::checkConvertToNonDenormSingle(APFloat &ArgAPFloat) { static bool isValidSplatLoad(const PPCSubtarget &Subtarget, const SDValue &Op, unsigned &Opcode) { - const SDNode *InputNode = Op.getOperand(0).getNode(); - if (!InputNode || !ISD::isUNINDEXEDLoad(InputNode)) - return false; - - if (!Subtarget.hasVSX()) + LoadSDNode *InputNode = dyn_cast<LoadSDNode>(Op.getOperand(0)); + if (!InputNode || !Subtarget.hasVSX() || !ISD::isUNINDEXEDLoad(InputNode)) return false; EVT Ty = Op->getValueType(0); - if (Ty == MVT::v2f64 || Ty == MVT::v4f32 || Ty == MVT::v4i32 || - Ty == MVT::v8i16 || Ty == MVT::v16i8) + // For v2f64, v4f32 and v4i32 types, we require the load to be non-extending + // as we cannot handle extending loads for these types. + if ((Ty == MVT::v2f64 || Ty == MVT::v4f32 || Ty == MVT::v4i32) && + ISD::isNON_EXTLoad(InputNode)) + return true; + + EVT MemVT = InputNode->getMemoryVT(); + // For v8i16 and v16i8 types, extending loads can be handled as long as the + // memory VT is the same vector element VT type. + // The loads feeding into the v8i16 and v16i8 types will be extending because + // scalar i8/i16 are not legal types. + if ((Ty == MVT::v8i16 || Ty == MVT::v16i8) && ISD::isEXTLoad(InputNode) && + (MemVT == Ty.getVectorElementType())) return true; if (Ty == MVT::v2i64) { // Check the extend type, when the input type is i32, and the output vector // type is v2i64. - if (cast<LoadSDNode>(Op.getOperand(0))->getMemoryVT() == MVT::i32) { + if (MemVT == MVT::i32) { if (ISD::isZEXTLoad(InputNode)) Opcode = PPCISD::ZEXT_LD_SPLAT; if (ISD::isSEXTLoad(InputNode)) @@ -10755,6 +10762,26 @@ SDValue PPCTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, if (VT == MVT::v2f64 && C) return Op; + if (Subtarget.hasP9Vector()) { + // A f32 load feeding into a v4f32 insert_vector_elt is handled in this way + // because on P10, it allows this specific insert_vector_elt load pattern to + // utilize the refactored load and store infrastructure in order to exploit + // prefixed loads. 
+ // On targets with inexpensive direct moves (Power9 and up), a + // (insert_vector_elt v4f32:$vec, (f32 load)) is always better as an integer + // load since a single precision load will involve conversion to double + // precision on the load followed by another conversion to single precision. + if ((VT == MVT::v4f32) && (V2.getValueType() == MVT::f32) && + (isa<LoadSDNode>(V2))) { + SDValue BitcastVector = DAG.getBitcast(MVT::v4i32, V1); + SDValue BitcastLoad = DAG.getBitcast(MVT::i32, V2); + SDValue InsVecElt = + DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v4i32, BitcastVector, + BitcastLoad, Op.getOperand(2)); + return DAG.getBitcast(MVT::v4f32, InsVecElt); + } + } + if (Subtarget.isISA3_1()) { if ((VT == MVT::v2i64 || VT == MVT::v2f64) && !Subtarget.isPPC64()) return SDValue(); diff --git a/llvm/lib/Target/PowerPC/PPCInstrPrefix.td b/llvm/lib/Target/PowerPC/PPCInstrPrefix.td index fe354208533b..ff43426dd1ef 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrPrefix.td +++ b/llvm/lib/Target/PowerPC/PPCInstrPrefix.td @@ -2816,32 +2816,20 @@ let Predicates = [IsISA3_1, HasVSX, IsLittleEndian] in { def : Pat<(v4f32 (insertelt v4f32:$vDi, f32:$rA, i64:$rB)), (VINSWVRX $vDi, InsertEltShift.Sub32Left2, (XSCVDPSPN $rA))>; - def : Pat<(v4f32 (insertelt v4f32:$vDi, (f32 (load iaddr:$rA)), i64:$rB)), - (VINSWRX $vDi, InsertEltShift.Sub32Left2, (LWZ memri:$rA))>; - def : Pat<(v4f32 (insertelt v4f32:$vDi, (f32 (load iaddrX34:$rA)), i64:$rB)), - (VINSWRX $vDi, InsertEltShift.Sub32Left2, (PLWZ memri34:$rA))>; - def : Pat<(v4f32 (insertelt v4f32:$vDi, (f32 (load xaddr:$rA)), i64:$rB)), - (VINSWRX $vDi, InsertEltShift.Sub32Left2, (LWZX memrr:$rA))>; def : Pat<(v2f64 (insertelt v2f64:$vDi, f64:$A, i64:$rB)), (VINSDRX $vDi, InsertEltShift.Left3, Bitcast.DblToLong)>; - def : Pat<(v2f64 (insertelt v2f64:$vDi, (f64 (load iaddrX4:$rA)), i64:$rB)), + def : Pat<(v2f64 (insertelt v2f64:$vDi, (f64 (load DSForm:$rA)), i64:$rB)), (VINSDRX $vDi, InsertEltShift.Left3, (LD memrix:$rA))>; - def : Pat<(v2f64 (insertelt v2f64:$vDi, (f64 (load iaddrX34:$rA)), i64:$rB)), + def : Pat<(v2f64 (insertelt v2f64:$vDi, (f64 (load PDForm:$rA)), i64:$rB)), (VINSDRX $vDi, InsertEltShift.Left3, (PLD memri34:$rA))>; - def : Pat<(v2f64 (insertelt v2f64:$vDi, (f64 (load xaddrX4:$rA)), i64:$rB)), + def : Pat<(v2f64 (insertelt v2f64:$vDi, (f64 (load XForm:$rA)), i64:$rB)), (VINSDRX $vDi, InsertEltShift.Left3, (LDX memrr:$rA))>; let AddedComplexity = 400 in { // Immediate vector insert element foreach Idx = [0, 1, 2, 3] in { def : Pat<(v4i32 (insertelt v4i32:$vDi, i32:$rA, Idx)), (VINSW $vDi, !mul(!sub(3, Idx), 4), $rA)>; - def : Pat<(v4f32 (insertelt v4f32:$vDi, (f32 (load iaddr:$rA)), Idx)), - (VINSW $vDi, !mul(!sub(3, Idx), 4), (LWZ memri:$rA))>; - def : Pat<(v4f32 (insertelt v4f32:$vDi, (f32 (load iaddrX34:$rA)), Idx)), - (VINSW $vDi, !mul(!sub(3, Idx), 4), (PLWZ memri34:$rA))>; - def : Pat<(v4f32 (insertelt v4f32:$vDi, (f32 (load xaddr:$rA)), Idx)), - (VINSW $vDi, !mul(!sub(3, Idx), 4), (LWZX memrr:$rA))>; } foreach i = [0, 1] in def : Pat<(v2i64 (insertelt v2i64:$vDi, i64:$rA, (i64 i))), @@ -2860,12 +2848,6 @@ let Predicates = [IsISA3_1, HasVSX, IsBigEndian, IsPPC32] in { def : Pat<(v4f32 (insertelt v4f32:$vDi, f32:$rA, i32:$rB)), (VINSWVLX $vDi, InsertEltShift.Left2, (XSCVDPSPN $rA))>; - def : Pat<(v4f32 (insertelt v4f32:$vDi, (f32 (load iaddr:$rA)), i32:$rB)), - (VINSWLX v4f32:$vDi, InsertEltShift.Left2, (LWZ memri:$rA))>; - def : Pat<(v4f32 (insertelt v4f32:$vDi, (f32 (load iaddrX34:$rA)), i32:$rB)), - (VINSWLX v4f32:$vDi, 
InsertEltShift.Left2, (PLWZ memri34:$rA))>; - def: Pat<(v4f32(insertelt v4f32 : $vDi, (f32(load xaddr : $rA)), i32 : $rB)), - (VINSWLX v4f32 : $vDi, InsertEltShift.Left2, (LWZX memrr : $rA))>; } let Predicates = [IsISA3_1, HasVSX, IsBigEndian, IsPPC64] in { @@ -2881,20 +2863,14 @@ let Predicates = [IsISA3_1, HasVSX, IsBigEndian, IsPPC64] in { def : Pat<(v4f32 (insertelt v4f32:$vDi, f32:$rA, i64:$rB)), (VINSWVLX $vDi, InsertEltShift.Sub32Left2, (XSCVDPSPN $rA))>; - def : Pat<(v4f32 (insertelt v4f32:$vDi, (f32 (load iaddr:$rA)), i64:$rB)), - (VINSWLX $vDi, InsertEltShift.Sub32Left2, (LWZ memri:$rA))>; - def : Pat<(v4f32 (insertelt v4f32:$vDi, (f32 (load iaddrX34:$rA)), i64:$rB)), - (VINSWLX $vDi, InsertEltShift.Sub32Left2, (PLWZ memri34:$rA))>; - def : Pat<(v4f32 (insertelt v4f32:$vDi, (f32 (load xaddr:$rA)), i64:$rB)), - (VINSWLX $vDi, InsertEltShift.Sub32Left2, (LWZX memrr:$rA))>; def : Pat<(v2f64 (insertelt v2f64:$vDi, f64:$A, i64:$rB)), (VINSDLX $vDi, InsertEltShift.Left3, Bitcast.DblToLong)>; - def : Pat<(v2f64 (insertelt v2f64:$vDi, (f64 (load iaddrX4:$rA)), i64:$rB)), + def : Pat<(v2f64 (insertelt v2f64:$vDi, (f64 (load DSForm:$rA)), i64:$rB)), (VINSDLX $vDi, InsertEltShift.Left3, (LD memrix:$rA))>; - def : Pat<(v2f64 (insertelt v2f64:$vDi, (f64 (load iaddrX34:$rA)), i64:$rB)), + def : Pat<(v2f64 (insertelt v2f64:$vDi, (f64 (load PDForm:$rA)), i64:$rB)), (VINSDLX $vDi, InsertEltShift.Left3, (PLD memri34:$rA))>; - def : Pat<(v2f64 (insertelt v2f64:$vDi, (f64 (load xaddrX4:$rA)), i64:$rB)), + def : Pat<(v2f64 (insertelt v2f64:$vDi, (f64 (load XForm:$rA)), i64:$rB)), (VINSDLX $vDi, InsertEltShift.Left3, (LDX memrr:$rA))>; } @@ -2904,15 +2880,6 @@ let AddedComplexity = 400, Predicates = [IsISA3_1, HasVSX, IsBigEndian] in { foreach Idx = [0, 1, 2, 3] in { def : Pat<(v4i32 (insertelt v4i32:$vDi, i32:$rA, (Ty Idx))), (VINSW $vDi, !mul(Idx, 4), $rA)>; - def : Pat<(v4f32 (insertelt v4f32:$vDi, (f32 (load iaddr:$rA)), - (Ty Idx))), - (VINSW $vDi, !mul(Idx, 4), (LWZ memri:$rA))>; - def : Pat<(v4f32 (insertelt v4f32:$vDi, (f32 (load iaddrX34:$rA)), - (Ty Idx))), - (VINSW $vDi, !mul(Idx, 4), (PLWZ memri34:$rA))>; - def : Pat<(v4f32 (insertelt v4f32:$vDi, (f32 (load xaddr:$rA)), - (Ty Idx))), - (VINSW $vDi, !mul(Idx, 4), (LWZX memrr:$rA))>; } } diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp index a2ea34fe11c7..01f36e6dcdd2 100644 --- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp +++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp @@ -2266,8 +2266,8 @@ void RISCVAsmParser::emitLoadImm(MCRegister DestReg, int64_t Value, if (Inst.Opc == RISCV::LUI) { emitToStreamer( Out, MCInstBuilder(RISCV::LUI).addReg(DestReg).addImm(Inst.Imm)); - } else if (Inst.Opc == RISCV::ADDUW) { - emitToStreamer(Out, MCInstBuilder(RISCV::ADDUW) + } else if (Inst.Opc == RISCV::ADD_UW) { + emitToStreamer(Out, MCInstBuilder(RISCV::ADD_UW) .addReg(DestReg) .addReg(SrcReg) .addReg(RISCV::X0)); diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp index 14d0191a505f..1078403a3fd2 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp @@ -197,9 +197,9 @@ void RISCVMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, // Get byte count of instruction. 
unsigned Size = Desc.getSize(); - // RISCVInstrInfo::getInstSizeInBytes hard-codes the number of expanded - // instructions for each pseudo, and must be updated when adding new pseudos - // or changing existing ones. + // RISCVInstrInfo::getInstSizeInBytes expects that the total size of the + // expanded instructions for each pseudo is correct in the Size field of the + // tablegen definition for the pseudo. if (MI.getOpcode() == RISCV::PseudoCALLReg || MI.getOpcode() == RISCV::PseudoCALL || MI.getOpcode() == RISCV::PseudoTAIL || diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.cpp index 18858209aa9b..e935179e5f9b 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.cpp @@ -31,7 +31,7 @@ static int getInstSeqCost(RISCVMatInt::InstSeq &Res, bool HasRVC) { case RISCV::LUI: Compressed = isInt<6>(Instr.Imm); break; - case RISCV::ADDUW: + case RISCV::ADD_UW: Compressed = false; break; } @@ -123,10 +123,11 @@ static void generateInstSeqImpl(int64_t Val, } } - // Try to use SLLIUW for Hi52 when it is uint32 but not int32. + // Try to use SLLI_UW for Hi52 when it is uint32 but not int32. if (isUInt<32>((uint64_t)Hi52) && !isInt<32>((uint64_t)Hi52) && ActiveFeatures[RISCV::FeatureStdExtZba]) { - // Use LUI+ADDI or LUI to compose, then clear the upper 32 bits with SLLIUW. + // Use LUI+ADDI or LUI to compose, then clear the upper 32 bits with + // SLLI_UW. Hi52 = ((uint64_t)Hi52) | (0xffffffffull << 32); Unsigned = true; } @@ -134,7 +135,7 @@ static void generateInstSeqImpl(int64_t Val, generateInstSeqImpl(Hi52, ActiveFeatures, Res); if (Unsigned) - Res.push_back(RISCVMatInt::Inst(RISCV::SLLIUW, ShiftAmount)); + Res.push_back(RISCVMatInt::Inst(RISCV::SLLI_UW, ShiftAmount)); else Res.push_back(RISCVMatInt::Inst(RISCV::SLLI, ShiftAmount)); if (Lo12) @@ -210,7 +211,7 @@ InstSeq generateInstSeq(int64_t Val, const FeatureBitset &ActiveFeatures) { uint64_t LeadingOnesVal = Val | maskLeadingOnes<uint64_t>(LeadingZeros); TmpSeq.clear(); generateInstSeqImpl(LeadingOnesVal, ActiveFeatures, TmpSeq); - TmpSeq.push_back(RISCVMatInt::Inst(RISCV::ADDUW, 0)); + TmpSeq.push_back(RISCVMatInt::Inst(RISCV::ADD_UW, 0)); // Keep the new sequence if it is an improvement. 
if (TmpSeq.size() < Res.size()) { diff --git a/llvm/lib/Target/RISCV/RISCV.td b/llvm/lib/Target/RISCV/RISCV.td index 5b0f27c5e937..e32a8fb010de 100644 --- a/llvm/lib/Target/RISCV/RISCV.td +++ b/llvm/lib/Target/RISCV/RISCV.td @@ -52,11 +52,17 @@ def HasStdExtZfhmin : Predicate<"Subtarget->hasStdExtZfhmin()">, def FeatureStdExtZfh : SubtargetFeature<"zfh", "HasStdExtZfh", "true", "'Zfh' (Half-Precision Floating-Point)", - [FeatureStdExtZfhmin, FeatureStdExtF]>; + [FeatureStdExtF]>; def HasStdExtZfh : Predicate<"Subtarget->hasStdExtZfh()">, AssemblerPredicate<(all_of FeatureStdExtZfh), "'Zfh' (Half-Precision Floating-Point)">; +def HasStdExtZfhOrZfhmin + : Predicate<"Subtarget->hasStdExtZfh() || Subtarget->hasStdExtZfhmin()">, + AssemblerPredicate<(any_of FeatureStdExtZfh, FeatureStdExtZfhmin), + "'Zfh' (Half-Precision Floating-Point) or " + "'Zfhmin' (Half-Precision Floating-Point Minimal)">; + def FeatureStdExtC : SubtargetFeature<"c", "HasStdExtC", "true", "'C' (Compressed Instructions)">; diff --git a/llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp b/llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp index 26ce16486bd9..40ee7ca6bc1c 100644 --- a/llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp +++ b/llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp @@ -86,9 +86,9 @@ bool RISCVExpandAtomicPseudo::expandMBB(MachineBasicBlock &MBB) { bool RISCVExpandAtomicPseudo::expandMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, MachineBasicBlock::iterator &NextMBBI) { - // RISCVInstrInfo::getInstSizeInBytes hard-codes the number of expanded - // instructions for each pseudo, and must be updated when adding new pseudos - // or changing existing ones. + // RISCVInstrInfo::getInstSizeInBytes expects that the total size of the + // expanded instructions for each pseudo is correct in the Size field of the + // tablegen definition for the pseudo. switch (MBBI->getOpcode()) { case RISCV::PseudoAtomicLoadNand32: return expandAtomicBinOp(MBB, MBBI, AtomicRMWInst::Nand, false, 32, diff --git a/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp b/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp index 80340ee81509..0c5c13db7112 100644 --- a/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp +++ b/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp @@ -92,9 +92,9 @@ bool RISCVExpandPseudo::expandMBB(MachineBasicBlock &MBB) { bool RISCVExpandPseudo::expandMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, MachineBasicBlock::iterator &NextMBBI) { - // RISCVInstrInfo::getInstSizeInBytes hard-codes the number of expanded - // instructions for each pseudo, and must be updated when adding new pseudos - // or changing existing ones. + // RISCVInstrInfo::getInstSizeInBytes expects that the total size of the + // expanded instructions for each pseudo is correct in the Size field of the + // tablegen definition for the pseudo. 
switch (MBBI->getOpcode()) { case RISCV::PseudoLLA: return expandLoadLocalAddress(MBB, MBBI, NextMBBI); diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index 5870502d74d5..6f77428ae721 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -166,8 +166,8 @@ static SDNode *selectImm(SelectionDAG *CurDAG, const SDLoc &DL, const MVT VT, SDValue SDImm = CurDAG->getTargetConstant(Inst.Imm, DL, XLenVT); if (Inst.Opc == RISCV::LUI) Result = CurDAG->getMachineNode(RISCV::LUI, DL, XLenVT, SDImm); - else if (Inst.Opc == RISCV::ADDUW) - Result = CurDAG->getMachineNode(RISCV::ADDUW, DL, XLenVT, SrcReg, + else if (Inst.Opc == RISCV::ADD_UW) + Result = CurDAG->getMachineNode(RISCV::ADD_UW, DL, XLenVT, SrcReg, CurDAG->getRegister(RISCV::X0, XLenVT)); else if (Inst.Opc == RISCV::SH1ADD || Inst.Opc == RISCV::SH2ADD || Inst.Opc == RISCV::SH3ADD) @@ -775,10 +775,10 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { C1 == (maskTrailingOnes<uint64_t>(XLen - (C2 + C3)) << C2)) { // Use slli.uw when possible. if ((XLen - (C2 + C3)) == 32 && Subtarget->hasStdExtZba()) { - SDNode *SLLIUW = - CurDAG->getMachineNode(RISCV::SLLIUW, DL, XLenVT, X, + SDNode *SLLI_UW = + CurDAG->getMachineNode(RISCV::SLLI_UW, DL, XLenVT, X, CurDAG->getTargetConstant(C2, DL, XLenVT)); - ReplaceNode(Node, SLLIUW); + ReplaceNode(Node, SLLI_UW); return; } @@ -1811,7 +1811,7 @@ bool RISCVDAGToDAGISel::hasAllNBitUsers(SDNode *Node, unsigned Bits) const { case RISCV::CLZW: case RISCV::CTZW: case RISCV::CPOPW: - case RISCV::SLLIUW: + case RISCV::SLLI_UW: case RISCV::FCVT_H_W: case RISCV::FCVT_H_WU: case RISCV::FCVT_S_W: @@ -1830,20 +1830,20 @@ bool RISCVDAGToDAGISel::hasAllNBitUsers(SDNode *Node, unsigned Bits) const { if (Bits < (64 - countLeadingZeros(User->getConstantOperandVal(1)))) return false; break; - case RISCV::SEXTB: + case RISCV::SEXT_B: if (Bits < 8) return false; break; - case RISCV::SEXTH: - case RISCV::ZEXTH_RV32: - case RISCV::ZEXTH_RV64: + case RISCV::SEXT_H: + case RISCV::ZEXT_H_RV32: + case RISCV::ZEXT_H_RV64: if (Bits < 16) return false; break; - case RISCV::ADDUW: - case RISCV::SH1ADDUW: - case RISCV::SH2ADDUW: - case RISCV::SH3ADDUW: + case RISCV::ADD_UW: + case RISCV::SH1ADD_UW: + case RISCV::SH2ADD_UW: + case RISCV::SH3ADD_UW: // The first operand to add.uw/shXadd.uw is implicitly zero extended from // 32 bits. if (UI.getOperandNo() != 0 || Bits < 32) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 5cc3aa35d4d2..97d24c8e9c0b 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -282,6 +282,9 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, (Subtarget.hasStdExtZbb() || Subtarget.hasStdExtZbkb()) ? Legal : Expand); + // Zbkb can use rev8+brev8 to implement bitreverse. + setOperationAction(ISD::BITREVERSE, XLenVT, + Subtarget.hasStdExtZbkb() ? 
Custom : Expand); } if (Subtarget.hasStdExtZbb()) { @@ -1082,6 +1085,9 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::SHL); setTargetDAGCombine(ISD::STORE); } + + setLibcallName(RTLIB::FPEXT_F16_F32, "__extendhfsf2"); + setLibcallName(RTLIB::FPROUND_F32_F16, "__truncsfhf2"); } EVT RISCVTargetLowering::getSetCCResultType(const DataLayout &DL, @@ -1115,17 +1121,15 @@ bool RISCVTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, case Intrinsic::riscv_masked_atomicrmw_min_i32: case Intrinsic::riscv_masked_atomicrmw_umax_i32: case Intrinsic::riscv_masked_atomicrmw_umin_i32: - case Intrinsic::riscv_masked_cmpxchg_i32: { - PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType()); + case Intrinsic::riscv_masked_cmpxchg_i32: Info.opc = ISD::INTRINSIC_W_CHAIN; - Info.memVT = MVT::getVT(PtrTy->getPointerElementType()); + Info.memVT = MVT::i32; Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Info.align = Align(4); Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore | MachineMemOperand::MOVolatile; return true; - } case Intrinsic::riscv_masked_strided_load: Info.opc = ISD::INTRINSIC_W_CHAIN; Info.ptrVal = I.getArgOperand(1); @@ -2952,17 +2956,26 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, return LowerINTRINSIC_VOID(Op, DAG); case ISD::BSWAP: case ISD::BITREVERSE: { - // Convert BSWAP/BITREVERSE to GREVI to enable GREVI combinining. - assert(Subtarget.hasStdExtZbp() && "Unexpected custom legalisation"); MVT VT = Op.getSimpleValueType(); SDLoc DL(Op); - // Start with the maximum immediate value which is the bitwidth - 1. - unsigned Imm = VT.getSizeInBits() - 1; - // If this is BSWAP rather than BITREVERSE, clear the lower 3 bits. - if (Op.getOpcode() == ISD::BSWAP) - Imm &= ~0x7U; - return DAG.getNode(RISCVISD::GREV, DL, VT, Op.getOperand(0), - DAG.getConstant(Imm, DL, VT)); + if (Subtarget.hasStdExtZbp()) { + // Convert BSWAP/BITREVERSE to GREVI to enable GREVI combinining. + // Start with the maximum immediate value which is the bitwidth - 1. + unsigned Imm = VT.getSizeInBits() - 1; + // If this is BSWAP rather than BITREVERSE, clear the lower 3 bits. + if (Op.getOpcode() == ISD::BSWAP) + Imm &= ~0x7U; + return DAG.getNode(RISCVISD::GREV, DL, VT, Op.getOperand(0), + DAG.getConstant(Imm, DL, VT)); + } + assert(Subtarget.hasStdExtZbkb() && "Unexpected custom legalization"); + assert(Op.getOpcode() == ISD::BITREVERSE && "Unexpected opcode"); + // Expand bitreverse to a bswap(rev8) followed by brev8. + SDValue BSwap = DAG.getNode(ISD::BSWAP, DL, VT, Op.getOperand(0)); + // We use the Zbp grevi encoding for rev.b/brev8 which will be recognized + // as brev8 by an isel pattern. + return DAG.getNode(RISCVISD::GREV, DL, VT, BSwap, + DAG.getConstant(7, DL, VT)); } case ISD::FSHL: case ISD::FSHR: { @@ -3063,6 +3076,8 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, // minimum size. e.g. <vscale x 2 x i32>. VLENB is in bytes so we calculate // vscale as VLENB / 8. static_assert(RISCV::RVVBitsPerBlock == 64, "Unexpected bits per block!"); + if (Subtarget.getMinVLen() < RISCV::RVVBitsPerBlock) + report_fatal_error("Support for VLEN==32 is incomplete."); if (isa<ConstantSDNode>(Op.getOperand(0))) { // We assume VLENB is a multiple of 8. We manually choose the best shift // here because SimplifyDemandedBits isn't always able to simplify it. 
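It is worth spelling out the identity the Zbkb lowering above relies on: reversing all bits of a word is the same as byte-swapping it (BSWAP / rev8) and then reversing the bits inside each byte (brev8, emitted here as GREVI with immediate 7). A small self-contained check of that identity in plain C++ (illustrative only, not patch code):

#include <cassert>
#include <cstdint>

static uint32_t rev8(uint32_t X) {        // byte swap, as BSWAP / rev8
  return (X >> 24) | ((X >> 8) & 0xff00u) | ((X << 8) & 0xff0000u) | (X << 24);
}

static uint32_t brev8(uint32_t X) {       // reverse bits within each byte
  uint32_t R = 0;
  for (unsigned I = 0; I < 32; ++I)
    if (X & (1u << I))
      R |= 1u << ((I & ~7u) | (7 - (I & 7)));
  return R;
}

static uint32_t bitreverse(uint32_t X) {  // full 32-bit bit reversal
  uint32_t R = 0;
  for (unsigned I = 0; I < 32; ++I)
    if (X & (1u << I))
      R |= 1u << (31 - I);
  return R;
}

int main() {
  const uint32_t Tests[] = {0x00000001u, 0x12345678u, 0xdeadbeefu};
  for (uint32_t X : Tests)
    assert(bitreverse(X) == brev8(rev8(X)));
}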
@@ -4288,8 +4303,47 @@ SDValue RISCVTargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op, MVT XLenVT = Subtarget.getXLenVT(); if (VecVT.getVectorElementType() == MVT::i1) { - // FIXME: For now we just promote to an i8 vector and extract from that, - // but this is probably not optimal. + if (VecVT.isFixedLengthVector()) { + unsigned NumElts = VecVT.getVectorNumElements(); + if (NumElts >= 8) { + MVT WideEltVT; + unsigned WidenVecLen; + SDValue ExtractElementIdx; + SDValue ExtractBitIdx; + unsigned MaxEEW = Subtarget.getMaxELENForFixedLengthVectors(); + MVT LargestEltVT = MVT::getIntegerVT( + std::min(MaxEEW, unsigned(XLenVT.getSizeInBits()))); + if (NumElts <= LargestEltVT.getSizeInBits()) { + assert(isPowerOf2_32(NumElts) && + "the number of elements should be power of 2"); + WideEltVT = MVT::getIntegerVT(NumElts); + WidenVecLen = 1; + ExtractElementIdx = DAG.getConstant(0, DL, XLenVT); + ExtractBitIdx = Idx; + } else { + WideEltVT = LargestEltVT; + WidenVecLen = NumElts / WideEltVT.getSizeInBits(); + // extract element index = index / element width + ExtractElementIdx = DAG.getNode( + ISD::SRL, DL, XLenVT, Idx, + DAG.getConstant(Log2_64(WideEltVT.getSizeInBits()), DL, XLenVT)); + // mask bit index = index % element width + ExtractBitIdx = DAG.getNode( + ISD::AND, DL, XLenVT, Idx, + DAG.getConstant(WideEltVT.getSizeInBits() - 1, DL, XLenVT)); + } + MVT WideVT = MVT::getVectorVT(WideEltVT, WidenVecLen); + Vec = DAG.getNode(ISD::BITCAST, DL, WideVT, Vec); + SDValue ExtractElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, XLenVT, + Vec, ExtractElementIdx); + // Extract the bit from GPR. + SDValue ShiftRight = + DAG.getNode(ISD::SRL, DL, XLenVT, ExtractElt, ExtractBitIdx); + return DAG.getNode(ISD::AND, DL, XLenVT, ShiftRight, + DAG.getConstant(1, DL, XLenVT)); + } + } + // Otherwise, promote to an i8 vector and extract from that. MVT WideVT = MVT::getVectorVT(MVT::i8, VecVT.getVectorElementCount()); Vec = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, Vec); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Vec, Idx); @@ -4411,15 +4465,30 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getRegister(RISCV::X4, PtrVT); } case Intrinsic::riscv_orc_b: - // Lower to the GORCI encoding for orc.b. - return DAG.getNode(RISCVISD::GORC, DL, XLenVT, Op.getOperand(1), + case Intrinsic::riscv_brev8: { + // Lower to the GORCI encoding for orc.b or the GREVI encoding for brev8. + unsigned Opc = + IntNo == Intrinsic::riscv_brev8 ? RISCVISD::GREV : RISCVISD::GORC; + return DAG.getNode(Opc, DL, XLenVT, Op.getOperand(1), DAG.getConstant(7, DL, XLenVT)); + } case Intrinsic::riscv_grev: case Intrinsic::riscv_gorc: { unsigned Opc = IntNo == Intrinsic::riscv_grev ? RISCVISD::GREV : RISCVISD::GORC; return DAG.getNode(Opc, DL, XLenVT, Op.getOperand(1), Op.getOperand(2)); } + case Intrinsic::riscv_zip: + case Intrinsic::riscv_unzip: { + // Lower to the SHFLI encoding for zip or the UNSHFLI encoding for unzip. + // For i32 the immdiate is 15. For i64 the immediate is 31. + unsigned Opc = + IntNo == Intrinsic::riscv_zip ? 
RISCVISD::SHFL : RISCVISD::UNSHFL; + unsigned BitWidth = Op.getValueSizeInBits(); + assert(isPowerOf2_32(BitWidth) && BitWidth >= 2 && "Unexpected bit width"); + return DAG.getNode(Opc, DL, XLenVT, Op.getOperand(1), + DAG.getConstant((BitWidth / 2) - 1, DL, XLenVT)); + } case Intrinsic::riscv_shfl: case Intrinsic::riscv_unshfl: { unsigned Opc = @@ -5829,14 +5898,17 @@ SDValue RISCVTargetLowering::lowerMaskedGather(SDValue Op, } } - if (XLenVT == MVT::i32 && IndexVT.getVectorElementType().bitsGT(XLenVT)) { - IndexVT = IndexVT.changeVectorElementType(XLenVT); - Index = DAG.getNode(ISD::TRUNCATE, DL, IndexVT, Index); - } - if (!VL) VL = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget).second; + if (XLenVT == MVT::i32 && IndexVT.getVectorElementType().bitsGT(XLenVT)) { + IndexVT = IndexVT.changeVectorElementType(XLenVT); + SDValue TrueMask = DAG.getNode(RISCVISD::VMSET_VL, DL, Mask.getValueType(), + VL); + Index = DAG.getNode(RISCVISD::TRUNCATE_VECTOR_VL, DL, IndexVT, Index, + TrueMask, VL); + } + unsigned IntID = IsUnmasked ? Intrinsic::riscv_vluxei : Intrinsic::riscv_vluxei_mask; SmallVector<SDValue, 8> Ops{Chain, DAG.getTargetConstant(IntID, DL, XLenVT)}; @@ -5937,14 +6009,17 @@ SDValue RISCVTargetLowering::lowerMaskedScatter(SDValue Op, } } - if (XLenVT == MVT::i32 && IndexVT.getVectorElementType().bitsGT(XLenVT)) { - IndexVT = IndexVT.changeVectorElementType(XLenVT); - Index = DAG.getNode(ISD::TRUNCATE, DL, IndexVT, Index); - } - if (!VL) VL = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget).second; + if (XLenVT == MVT::i32 && IndexVT.getVectorElementType().bitsGT(XLenVT)) { + IndexVT = IndexVT.changeVectorElementType(XLenVT); + SDValue TrueMask = DAG.getNode(RISCVISD::VMSET_VL, DL, Mask.getValueType(), + VL); + Index = DAG.getNode(RISCVISD::TRUNCATE_VECTOR_VL, DL, IndexVT, Index, + TrueMask, VL); + } + unsigned IntID = IsUnmasked ? Intrinsic::riscv_vsoxei : Intrinsic::riscv_vsoxei_mask; SmallVector<SDValue, 8> Ops{Chain, DAG.getTargetConstant(IntID, DL, XLenVT)}; @@ -6568,7 +6643,11 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N, DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(2)); unsigned Opc = IntNo == Intrinsic::riscv_shfl ? RISCVISD::SHFLW : RISCVISD::UNSHFLW; - if (isa<ConstantSDNode>(N->getOperand(2))) { + // There is no (UN)SHFLIW. If the control word is a constant, we can use + // (UN)SHFLI with bit 4 of the control word cleared. The upper 32 bit half + // will be shuffled the same way as the lower 32 bit half, but the two + // halves won't cross. + if (isa<ConstantSDNode>(NewOp2)) { NewOp2 = DAG.getNode(ISD::AND, DL, MVT::i64, NewOp2, DAG.getConstant(0xf, DL, MVT::i64)); Opc = @@ -7284,8 +7363,8 @@ static SDValue performANY_EXTENDCombine(SDNode *N, return SDValue(N, 0); } -// Try to form VWMUL or VWMULU. -// FIXME: Support VWMULSU. +// Try to form VWMUL, VWMULU or VWMULSU. +// TODO: Support VWMULSU.vx with a sign extend Op and a splat of scalar Op. 
static SDValue combineMUL_VLToVWMUL_VL(SDNode *N, SelectionDAG &DAG, bool Commute) { assert(N->getOpcode() == RISCVISD::MUL_VL && "Unexpected opcode"); @@ -7296,6 +7375,7 @@ static SDValue combineMUL_VLToVWMUL_VL(SDNode *N, SelectionDAG &DAG, bool IsSignExt = Op0.getOpcode() == RISCVISD::VSEXT_VL; bool IsZeroExt = Op0.getOpcode() == RISCVISD::VZEXT_VL; + bool IsVWMULSU = IsSignExt && Op1.getOpcode() == RISCVISD::VZEXT_VL; if ((!IsSignExt && !IsZeroExt) || !Op0.hasOneUse()) return SDValue(); @@ -7316,7 +7396,7 @@ static SDValue combineMUL_VLToVWMUL_VL(SDNode *N, SelectionDAG &DAG, SDLoc DL(N); // See if the other operand is the same opcode. - if (Op0.getOpcode() == Op1.getOpcode()) { + if (IsVWMULSU || Op0.getOpcode() == Op1.getOpcode()) { if (!Op1.hasOneUse()) return SDValue(); @@ -7366,7 +7446,9 @@ static SDValue combineMUL_VLToVWMUL_VL(SDNode *N, SelectionDAG &DAG, if (Op1.getValueType() != NarrowVT) Op1 = DAG.getNode(ExtOpc, DL, NarrowVT, Op1, Mask, VL); - unsigned WMulOpc = IsSignExt ? RISCVISD::VWMUL_VL : RISCVISD::VWMULU_VL; + unsigned WMulOpc = RISCVISD::VWMULSU_VL; + if (!IsVWMULSU) + WMulOpc = IsSignExt ? RISCVISD::VWMUL_VL : RISCVISD::VWMULU_VL; return DAG.getNode(WMulOpc, DL, VT, Op0, Op1, Mask, VL); } @@ -8194,12 +8276,17 @@ void RISCVTargetLowering::computeKnownBitsForTargetNode(const SDValue Op, } break; } - case RISCVISD::READ_VLENB: - // We assume VLENB is at least 16 bytes. - Known.Zero.setLowBits(4); + case RISCVISD::READ_VLENB: { + // If we know the minimum VLen from Zvl extensions, we can use that to + // determine the trailing zeros of VLENB. + // FIXME: Limit to 128 bit vectors until we have more testing. + unsigned MinVLenB = std::min(128U, Subtarget.getMinVLen()) / 8; + if (MinVLenB > 0) + Known.Zero.setLowBits(Log2_32(MinVLenB)); // We assume VLENB is no more than 65536 / 8 bytes. Known.Zero.setBitsFrom(14); break; + } case ISD::INTRINSIC_W_CHAIN: case ISD::INTRINSIC_WO_CHAIN: { unsigned IntNo = @@ -8230,9 +8317,11 @@ unsigned RISCVTargetLowering::ComputeNumSignBitsForTargetNode( default: break; case RISCVISD::SELECT_CC: { - unsigned Tmp = DAG.ComputeNumSignBits(Op.getOperand(3), DemandedElts, Depth + 1); + unsigned Tmp = + DAG.ComputeNumSignBits(Op.getOperand(3), DemandedElts, Depth + 1); if (Tmp == 1) return 1; // Early out. - unsigned Tmp2 = DAG.ComputeNumSignBits(Op.getOperand(4), DemandedElts, Depth + 1); + unsigned Tmp2 = + DAG.ComputeNumSignBits(Op.getOperand(4), DemandedElts, Depth + 1); return std::min(Tmp, Tmp2); } case RISCVISD::SLLW: @@ -8275,15 +8364,18 @@ unsigned RISCVTargetLowering::ComputeNumSignBitsForTargetNode( } break; } - case RISCVISD::VMV_X_S: + case RISCVISD::VMV_X_S: { // The number of sign bits of the scalar result is computed by obtaining the // element type of the input vector operand, subtracting its width from the // XLEN, and then adding one (sign bit within the element type). If the // element type is wider than XLen, the least-significant XLEN bits are // taken. 
- if (Op.getOperand(0).getScalarValueSizeInBits() > Subtarget.getXLen()) - return 1; - return Subtarget.getXLen() - Op.getOperand(0).getScalarValueSizeInBits() + 1; + unsigned XLen = Subtarget.getXLen(); + unsigned EltBits = Op.getOperand(0).getScalarValueSizeInBits(); + if (EltBits <= XLen) + return XLen - EltBits + 1; + break; + } } return 1; @@ -10129,6 +10221,7 @@ const char *RISCVTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(FP_ROUND_VL) NODE_NAME_CASE(VWMUL_VL) NODE_NAME_CASE(VWMULU_VL) + NODE_NAME_CASE(VWMULSU_VL) NODE_NAME_CASE(VWADDU_VL) NODE_NAME_CASE(SETCC_VL) NODE_NAME_CASE(VSELECT_VL) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index 58b7ec89f875..840a821870a7 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -245,6 +245,7 @@ enum NodeType : unsigned { // Widening instructions VWMUL_VL, VWMULU_VL, + VWMULSU_VL, VWADDU_VL, // Vector compare producing a mask. Fourth operand is input mask. Fifth diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp index d39e0805a79c..649eb57b325b 100644 --- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp +++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp @@ -999,6 +999,12 @@ bool RISCVInsertVSETVLI::needVSETVLIPHI(const VSETVLIInfo &Require, void RISCVInsertVSETVLI::emitVSETVLIs(MachineBasicBlock &MBB) { VSETVLIInfo CurInfo; + // BBLocalInfo tracks the VL/VTYPE state the same way BBInfo.Change was + // calculated in computeIncomingVLVTYPE. We need this to apply + // canSkipVSETVLIForLoadStore the same way computeIncomingVLVTYPE did. We + // can't include predecessor information in that decision to avoid disagreeing + // with the global analysis. + VSETVLIInfo BBLocalInfo; // Only be set if current VSETVLIInfo is from an explicit VSET(I)VLI. MachineInstr *PrevVSETVLIMI = nullptr; @@ -1014,6 +1020,7 @@ void RISCVInsertVSETVLI::emitVSETVLIs(MachineBasicBlock &MBB) { MI.getOperand(3).setIsDead(false); MI.getOperand(4).setIsDead(false); CurInfo = getInfoForVSETVLI(MI); + BBLocalInfo = getInfoForVSETVLI(MI); PrevVSETVLIMI = &MI; continue; } @@ -1043,12 +1050,22 @@ void RISCVInsertVSETVLI::emitVSETVLIs(MachineBasicBlock &MBB) { // use the predecessor information. assert(BlockInfo[MBB.getNumber()].Pred.isValid() && "Expected a valid predecessor state."); - if (needVSETVLI(NewInfo, BlockInfo[MBB.getNumber()].Pred) && + // Don't use predecessor information if there was an earlier instruction + // in this block that allowed a vsetvli to be skipped for load/store. + if (!(BBLocalInfo.isValid() && + canSkipVSETVLIForLoadStore(MI, NewInfo, BBLocalInfo)) && + needVSETVLI(NewInfo, BlockInfo[MBB.getNumber()].Pred) && needVSETVLIPHI(NewInfo, MBB)) { insertVSETVLI(MBB, MI, NewInfo, BlockInfo[MBB.getNumber()].Pred); CurInfo = NewInfo; + BBLocalInfo = NewInfo; } + + // We must update BBLocalInfo for every vector instruction. + if (!BBLocalInfo.isValid()) + BBLocalInfo = NewInfo; } else { + assert(BBLocalInfo.isValid()); // If this instruction isn't compatible with the previous VL/VTYPE // we need to insert a VSETVLI. 
// If this is a unit-stride or strided load/store, we may be able to use @@ -1084,6 +1101,7 @@ void RISCVInsertVSETVLI::emitVSETVLIs(MachineBasicBlock &MBB) { if (NeedInsertVSETVLI) insertVSETVLI(MBB, MI, NewInfo, CurInfo); CurInfo = NewInfo; + BBLocalInfo = NewInfo; } } PrevVSETVLIMI = nullptr; @@ -1094,6 +1112,7 @@ void RISCVInsertVSETVLI::emitVSETVLIs(MachineBasicBlock &MBB) { if (MI.isCall() || MI.isInlineAsm() || MI.modifiesRegister(RISCV::VL) || MI.modifiesRegister(RISCV::VTYPE)) { CurInfo = VSETVLIInfo::getUnknown(); + BBLocalInfo = VSETVLIInfo::getUnknown(); PrevVSETVLIMI = nullptr; } } diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp index 7baed2793e4e..55f4a19b79eb 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -654,8 +654,8 @@ void RISCVInstrInfo::movImm(MachineBasicBlock &MBB, BuildMI(MBB, MBBI, DL, get(RISCV::LUI), Result) .addImm(Inst.Imm) .setMIFlag(Flag); - } else if (Inst.Opc == RISCV::ADDUW) { - BuildMI(MBB, MBBI, DL, get(RISCV::ADDUW), Result) + } else if (Inst.Opc == RISCV::ADD_UW) { + BuildMI(MBB, MBBI, DL, get(RISCV::ADD_UW), Result) .addReg(SrcReg, RegState::Kill) .addReg(RISCV::X0) .setMIFlag(Flag); @@ -965,93 +965,29 @@ bool RISCVInstrInfo::isBranchOffsetInRange(unsigned BranchOp, } unsigned RISCVInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { + if (MI.isMetaInstruction()) + return 0; + unsigned Opcode = MI.getOpcode(); - switch (Opcode) { - default: { - if (MI.getParent() && MI.getParent()->getParent()) { - const auto MF = MI.getMF(); - const auto &TM = static_cast<const RISCVTargetMachine &>(MF->getTarget()); - const MCRegisterInfo &MRI = *TM.getMCRegisterInfo(); - const MCSubtargetInfo &STI = *TM.getMCSubtargetInfo(); - const RISCVSubtarget &ST = MF->getSubtarget<RISCVSubtarget>(); - if (isCompressibleInst(MI, &ST, MRI, STI)) - return 2; - } - return get(Opcode).getSize(); - } - case TargetOpcode::EH_LABEL: - case TargetOpcode::IMPLICIT_DEF: - case TargetOpcode::KILL: - case TargetOpcode::DBG_VALUE: - return 0; - // These values are determined based on RISCVExpandAtomicPseudoInsts, - // RISCVExpandPseudoInsts and RISCVMCCodeEmitter, depending on where the - // pseudos are expanded. 
- case RISCV::PseudoCALLReg: - case RISCV::PseudoCALL: - case RISCV::PseudoJump: - case RISCV::PseudoTAIL: - case RISCV::PseudoLLA: - case RISCV::PseudoLA: - case RISCV::PseudoLA_TLS_IE: - case RISCV::PseudoLA_TLS_GD: - return 8; - case RISCV::PseudoAtomicLoadNand32: - case RISCV::PseudoAtomicLoadNand64: - return 20; - case RISCV::PseudoMaskedAtomicSwap32: - case RISCV::PseudoMaskedAtomicLoadAdd32: - case RISCV::PseudoMaskedAtomicLoadSub32: - return 28; - case RISCV::PseudoMaskedAtomicLoadNand32: - return 32; - case RISCV::PseudoMaskedAtomicLoadMax32: - case RISCV::PseudoMaskedAtomicLoadMin32: - return 44; - case RISCV::PseudoMaskedAtomicLoadUMax32: - case RISCV::PseudoMaskedAtomicLoadUMin32: - return 36; - case RISCV::PseudoCmpXchg32: - case RISCV::PseudoCmpXchg64: - return 16; - case RISCV::PseudoMaskedCmpXchg32: - return 32; - case TargetOpcode::INLINEASM: - case TargetOpcode::INLINEASM_BR: { + if (Opcode == TargetOpcode::INLINEASM || + Opcode == TargetOpcode::INLINEASM_BR) { const MachineFunction &MF = *MI.getParent()->getParent(); const auto &TM = static_cast<const RISCVTargetMachine &>(MF.getTarget()); return getInlineAsmLength(MI.getOperand(0).getSymbolName(), *TM.getMCAsmInfo()); } - case RISCV::PseudoVSPILL2_M1: - case RISCV::PseudoVSPILL2_M2: - case RISCV::PseudoVSPILL2_M4: - case RISCV::PseudoVSPILL3_M1: - case RISCV::PseudoVSPILL3_M2: - case RISCV::PseudoVSPILL4_M1: - case RISCV::PseudoVSPILL4_M2: - case RISCV::PseudoVSPILL5_M1: - case RISCV::PseudoVSPILL6_M1: - case RISCV::PseudoVSPILL7_M1: - case RISCV::PseudoVSPILL8_M1: - case RISCV::PseudoVRELOAD2_M1: - case RISCV::PseudoVRELOAD2_M2: - case RISCV::PseudoVRELOAD2_M4: - case RISCV::PseudoVRELOAD3_M1: - case RISCV::PseudoVRELOAD3_M2: - case RISCV::PseudoVRELOAD4_M1: - case RISCV::PseudoVRELOAD4_M2: - case RISCV::PseudoVRELOAD5_M1: - case RISCV::PseudoVRELOAD6_M1: - case RISCV::PseudoVRELOAD7_M1: - case RISCV::PseudoVRELOAD8_M1: { - // The values are determined based on expandVSPILL and expandVRELOAD that - // expand the pseudos depending on NF. - unsigned NF = isRVVSpillForZvlsseg(Opcode)->first; - return 4 * (2 * NF - 1); - } + + if (MI.getParent() && MI.getParent()->getParent()) { + const auto MF = MI.getMF(); + const auto &TM = static_cast<const RISCVTargetMachine &>(MF->getTarget()); + const MCRegisterInfo &MRI = *TM.getMCRegisterInfo(); + const MCSubtargetInfo &STI = *TM.getMCSubtargetInfo(); + const RISCVSubtarget &ST = MF->getSubtarget<RISCVSubtarget>(); + if (isCompressibleInst(MI, &ST, MRI, STI)) + return 2; } + return get(Opcode).getSize(); } bool RISCVInstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const { diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td index 64cd89cda06a..ee6a74b7f14f 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td @@ -1183,7 +1183,7 @@ def : Pat<(brind (add GPRJALR:$rs1, simm12:$imm12)), // destination. // Define AsmString to print "call" when compile with -S flag. // Define isCodeGenOnly = 0 to support parsing assembly "call" instruction. -let isCall = 1, isBarrier = 1, isCodeGenOnly = 0, hasSideEffects = 0, +let isCall = 1, isBarrier = 1, isCodeGenOnly = 0, Size = 8, hasSideEffects = 0, mayStore = 0, mayLoad = 0 in def PseudoCALLReg : Pseudo<(outs GPR:$rd), (ins call_symbol:$func), []> { let AsmString = "call\t$rd, $func"; @@ -1195,7 +1195,7 @@ def PseudoCALLReg : Pseudo<(outs GPR:$rd), (ins call_symbol:$func), []> { // if the offset fits in a signed 21-bit immediate. 
// Define AsmString to print "call" when compile with -S flag. // Define isCodeGenOnly = 0 to support parsing assembly "call" instruction. -let isCall = 1, Defs = [X1], isCodeGenOnly = 0 in +let isCall = 1, Defs = [X1], isCodeGenOnly = 0, Size = 8 in def PseudoCALL : Pseudo<(outs), (ins call_symbol:$func), []> { let AsmString = "call\t$func"; } @@ -1220,7 +1220,7 @@ def PseudoRET : Pseudo<(outs), (ins), [(riscv_ret_flag)]>, // expand to auipc and jalr while encoding. // Define AsmString to print "tail" when compile with -S flag. let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [X2], - isCodeGenOnly = 0 in + Size = 8, isCodeGenOnly = 0 in def PseudoTAIL : Pseudo<(outs), (ins call_symbol:$dst), []> { let AsmString = "tail\t$dst"; } @@ -1235,28 +1235,28 @@ def : Pat<(riscv_tail (iPTR tglobaladdr:$dst)), def : Pat<(riscv_tail (iPTR texternalsym:$dst)), (PseudoTAIL texternalsym:$dst)>; -let isCall = 0, isBarrier = 1, isBranch = 1, isTerminator = 1, +let isCall = 0, isBarrier = 1, isBranch = 1, isTerminator = 1, Size = 8, isCodeGenOnly = 0, hasSideEffects = 0, mayStore = 0, mayLoad = 0 in def PseudoJump : Pseudo<(outs GPR:$rd), (ins pseudo_jump_symbol:$target), []> { let AsmString = "jump\t$target, $rd"; } -let hasSideEffects = 0, mayLoad = 0, mayStore = 0, isCodeGenOnly = 0, +let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Size = 8, isCodeGenOnly = 0, isAsmParserOnly = 1 in def PseudoLLA : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [], "lla", "$dst, $src">; -let hasSideEffects = 0, mayLoad = 1, mayStore = 0, isCodeGenOnly = 0, +let hasSideEffects = 0, mayLoad = 1, mayStore = 0, Size = 8, isCodeGenOnly = 0, isAsmParserOnly = 1 in def PseudoLA : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [], "la", "$dst, $src">; -let hasSideEffects = 0, mayLoad = 1, mayStore = 0, isCodeGenOnly = 0, +let hasSideEffects = 0, mayLoad = 1, mayStore = 0, Size = 8, isCodeGenOnly = 0, isAsmParserOnly = 1 in def PseudoLA_TLS_IE : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [], "la.tls.ie", "$dst, $src">; -let hasSideEffects = 0, mayLoad = 1, mayStore = 0, isCodeGenOnly = 0, +let hasSideEffects = 0, mayLoad = 1, mayStore = 0, Size = 8, isCodeGenOnly = 0, isAsmParserOnly = 1 in def PseudoLA_TLS_GD : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [], "la.tls.gd", "$dst, $src">; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoA.td b/llvm/lib/Target/RISCV/RISCVInstrInfoA.td index ee10c3a54b2f..7d23dafb0346 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoA.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoA.td @@ -188,6 +188,7 @@ class PseudoAMO : Pseudo<(outs GPR:$res, GPR:$scratch), let hasSideEffects = 0; } +let Size = 20 in def PseudoAtomicLoadNand32 : PseudoAMO; // Ordering constants must be kept in sync with the AtomicOrdering enum in // AtomicOrdering.h. 
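The Size values added to these pseudos let the rewritten getInstSizeInBytes above fall through to the generic get(Opcode).getSize() path instead of the old hand-maintained switch. The accounting is 4 bytes per instruction in the eventual expansion; the sequences sketched in the comments below are the usual ones and are assumptions for illustration only (the authoritative expansions live in RISCVMCCodeEmitter and RISCVExpandAtomicPseudoInsts). A small compile-time check of the arithmetic:

// Assumed, uncompressed expansions:
//   call/tail/jump/lla/la/la.tls.*  -> an AUIPC-based pair, e.g.
//                                      "auipc ra, %pcrel_hi(f); jalr ra, %pcrel_lo(f)(ra)"
//   PseudoAtomicLoadNand32          -> LR/SC loop: lr.w, and, xori -1, sc.w, bnez
constexpr unsigned InstBytes = 4;
constexpr unsigned CallLikeInsts = 2;
constexpr unsigned AtomicNandInsts = 5;
static_assert(CallLikeInsts * InstBytes == 8, "matches the Size = 8 annotations");
static_assert(AtomicNandInsts * InstBytes == 20, "matches Size = 20 on PseudoAtomicLoadNand32");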
@@ -242,27 +243,35 @@ class PseudoMaskedAMOMinMaxPat<Intrinsic intrin, Pseudo AMOInst> (AMOInst GPR:$addr, GPR:$incr, GPR:$mask, GPR:$shiftamt, timm:$ordering)>; +let Size = 28 in def PseudoMaskedAtomicSwap32 : PseudoMaskedAMO; def : PseudoMaskedAMOPat<int_riscv_masked_atomicrmw_xchg_i32, PseudoMaskedAtomicSwap32>; +let Size = 28 in def PseudoMaskedAtomicLoadAdd32 : PseudoMaskedAMO; def : PseudoMaskedAMOPat<int_riscv_masked_atomicrmw_add_i32, PseudoMaskedAtomicLoadAdd32>; +let Size = 28 in def PseudoMaskedAtomicLoadSub32 : PseudoMaskedAMO; def : PseudoMaskedAMOPat<int_riscv_masked_atomicrmw_sub_i32, PseudoMaskedAtomicLoadSub32>; +let Size = 32 in def PseudoMaskedAtomicLoadNand32 : PseudoMaskedAMO; def : PseudoMaskedAMOPat<int_riscv_masked_atomicrmw_nand_i32, PseudoMaskedAtomicLoadNand32>; +let Size = 44 in def PseudoMaskedAtomicLoadMax32 : PseudoMaskedAMOMinMax; def : PseudoMaskedAMOMinMaxPat<int_riscv_masked_atomicrmw_max_i32, PseudoMaskedAtomicLoadMax32>; +let Size = 44 in def PseudoMaskedAtomicLoadMin32 : PseudoMaskedAMOMinMax; def : PseudoMaskedAMOMinMaxPat<int_riscv_masked_atomicrmw_min_i32, PseudoMaskedAtomicLoadMin32>; +let Size = 36 in def PseudoMaskedAtomicLoadUMax32 : PseudoMaskedAMOUMinUMax; def : PseudoMaskedAMOPat<int_riscv_masked_atomicrmw_umax_i32, PseudoMaskedAtomicLoadUMax32>; +let Size = 36 in def PseudoMaskedAtomicLoadUMin32 : PseudoMaskedAMOUMinUMax; def : PseudoMaskedAMOPat<int_riscv_masked_atomicrmw_umin_i32, PseudoMaskedAtomicLoadUMin32>; @@ -276,6 +285,7 @@ class PseudoCmpXchg let mayLoad = 1; let mayStore = 1; let hasSideEffects = 0; + let Size = 16; } // Ordering constants must be kept in sync with the AtomicOrdering enum in @@ -304,6 +314,7 @@ def PseudoMaskedCmpXchg32 let mayLoad = 1; let mayStore = 1; let hasSideEffects = 0; + let Size = 32; } def : Pat<(int_riscv_masked_cmpxchg_i32 @@ -347,6 +358,7 @@ def : Pat<(i64 (atomic_load_sub_64_seq_cst GPR:$addr, GPR:$incr)), /// 64-bit pseudo AMOs +let Size = 20 in def PseudoAtomicLoadNand64 : PseudoAMO; // Ordering constants must be kept in sync with the AtomicOrdering enum in // AtomicOrdering.h. 
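The remaining atomic pseudos in RISCVInstrInfoA.td follow the same rule: Size is 4 bytes times the number of instructions RISCVExpandAtomicPseudoInsts emits for the LR/SC loop. For the plain compare-and-swap the assumed loop (illustrative register names) is four instructions; the masked forms are larger because of the extra mask and shift handling inside the loop, and the counts below are simply the byte values above divided by 4:

// Assumed PseudoCmpXchg32/64 expansion (4 insts * 4 bytes = 16):
//   loop: lr.w  dest, (addr)
//         bne   dest, cmpval, done
//         sc.w  scratch, newval, (addr)
//         bnez  scratch, loop
//   done:
constexpr unsigned InstBytes = 4;
static_assert(16 / InstBytes == 4, "PseudoCmpXchg: 4-instruction loop");
static_assert(28 / InstBytes == 7 &&  // masked swap/add/sub
              32 / InstBytes == 8 &&  // masked nand, masked cmpxchg
              36 / InstBytes == 9 &&  // masked umax/umin
              44 / InstBytes == 11,   // masked max/min
              "masked AMO loop lengths implied by the Size values");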
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td index 4e7e251bc412..9087ed50f9fc 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td @@ -3836,7 +3836,7 @@ multiclass VPatConversionVF_WF <string intrinsic, string instruction> { } multiclass VPatCompare_VI<string intrinsic, string inst, - ImmLeaf ImmType = simm5_plus1> { + ImmLeaf ImmType> { foreach vti = AllIntegerVectors in { defvar Intr = !cast<Intrinsic>(intrinsic); defvar Pseudo = !cast<Instruction>(inst#"_VI_"#vti.LMul.MX); @@ -3899,11 +3899,13 @@ let hasSideEffects = 0, mayLoad = 1, mayStore = 0, isCodeGenOnly = 1 in { foreach lmul = MxList in { foreach nf = NFSet<lmul>.L in { defvar vreg = SegRegClass<lmul, nf>.RC; - let hasSideEffects = 0, mayLoad = 0, mayStore = 1, isCodeGenOnly = 1 in { + let hasSideEffects = 0, mayLoad = 0, mayStore = 1, isCodeGenOnly = 1, + Size = !mul(4, !sub(!mul(nf, 2), 1)) in { def "PseudoVSPILL" # nf # "_" # lmul.MX : Pseudo<(outs), (ins vreg:$rs1, GPR:$rs2, GPR:$vlenb), []>; } - let hasSideEffects = 0, mayLoad = 1, mayStore = 0, isCodeGenOnly = 1 in { + let hasSideEffects = 0, mayLoad = 1, mayStore = 0, isCodeGenOnly = 1, + Size = !mul(4, !sub(!mul(nf, 2), 1)) in { def "PseudoVRELOAD" # nf # "_" # lmul.MX : Pseudo<(outs vreg:$rs1), (ins GPR:$rs2, GPR:$vlenb), []>; } @@ -4657,13 +4659,15 @@ defm : VPatBinarySwappedM_VV<"int_riscv_vmsgt", "PseudoVMSLT", AllIntegerVectors defm : VPatBinarySwappedM_VV<"int_riscv_vmsgeu", "PseudoVMSLEU", AllIntegerVectors>; defm : VPatBinarySwappedM_VV<"int_riscv_vmsge", "PseudoVMSLE", AllIntegerVectors>; -// Match vmslt(u).vx intrinsics to vmsle(u).vi if the scalar is -15 to 16. This -// avoids the user needing to know that there is no vmslt(u).vi instruction. -// Similar for vmsge(u).vx intrinsics using vmslt(u).vi. -defm : VPatCompare_VI<"int_riscv_vmslt", "PseudoVMSLE">; +// Match vmslt(u).vx intrinsics to vmsle(u).vi if the scalar is -15 to 16 and +// non-zero. Zero can be .vx with x0. This avoids the user needing to know that +// there is no vmslt(u).vi instruction. Similar for vmsge(u).vx intrinsics +// using vmslt(u).vi. +defm : VPatCompare_VI<"int_riscv_vmslt", "PseudoVMSLE", simm5_plus1_nonzero>; defm : VPatCompare_VI<"int_riscv_vmsltu", "PseudoVMSLEU", simm5_plus1_nonzero>; -defm : VPatCompare_VI<"int_riscv_vmsge", "PseudoVMSGT">; +// We need to handle 0 for vmsge.vi using vmslt.vi because there is no vmsge.vx. 
+defm : VPatCompare_VI<"int_riscv_vmsge", "PseudoVMSGT", simm5_plus1>; defm : VPatCompare_VI<"int_riscv_vmsgeu", "PseudoVMSGTU", simm5_plus1_nonzero>; //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td index e452a84a9a6f..2b920d29ab81 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td @@ -539,7 +539,7 @@ defm : VPatIntegerSetCCSDNode_VV_VX_VI<SETNE, "PseudoVMSNE">; defm : VPatIntegerSetCCSDNode_VV_VX<SETLT, "PseudoVMSLT">; defm : VPatIntegerSetCCSDNode_VV_VX<SETULT, "PseudoVMSLTU">; defm : VPatIntegerSetCCSDNode_VIPlus1<SETLT, "PseudoVMSLE", - SplatPat_simm5_plus1>; + SplatPat_simm5_plus1_nonzero>; defm : VPatIntegerSetCCSDNode_VIPlus1<SETULT, "PseudoVMSLEU", SplatPat_simm5_plus1_nonzero>; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td index 964f0fa54512..e71c498fd5f4 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td @@ -228,6 +228,7 @@ def SDT_RISCVVWBinOp_VL : SDTypeProfile<1, 4, [SDTCisVec<0>, SDTCisVT<4, XLenVT>]>; def riscv_vwmul_vl : SDNode<"RISCVISD::VWMUL_VL", SDT_RISCVVWBinOp_VL, [SDNPCommutative]>; def riscv_vwmulu_vl : SDNode<"RISCVISD::VWMULU_VL", SDT_RISCVVWBinOp_VL, [SDNPCommutative]>; +def riscv_vwmulsu_vl : SDNode<"RISCVISD::VWMULSU_VL", SDT_RISCVVWBinOp_VL>; def riscv_vwaddu_vl : SDNode<"RISCVISD::VWADDU_VL", SDT_RISCVVWBinOp_VL, [SDNPCommutative]>; def SDTRVVVecReduce : SDTypeProfile<1, 5, [ @@ -832,7 +833,7 @@ foreach vti = AllIntegerVectors in { defm : VPatIntegerSetCCVL_VI_Swappable<vti, "PseudoVMSGTU", SETUGT, SETULT>; defm : VPatIntegerSetCCVL_VIPlus1<vti, "PseudoVMSLE", SETLT, - SplatPat_simm5_plus1>; + SplatPat_simm5_plus1_nonzero>; defm : VPatIntegerSetCCVL_VIPlus1<vti, "PseudoVMSLEU", SETULT, SplatPat_simm5_plus1_nonzero>; defm : VPatIntegerSetCCVL_VIPlus1<vti, "PseudoVMSGT", SETGE, @@ -861,6 +862,7 @@ defm : VPatBinaryVL_VV_VX<riscv_srem_vl, "PseudoVREM">; // 12.12. 
Vector Widening Integer Multiply Instructions defm : VPatBinaryWVL_VV_VX<riscv_vwmul_vl, "PseudoVWMUL">; defm : VPatBinaryWVL_VV_VX<riscv_vwmulu_vl, "PseudoVWMULU">; +defm : VPatBinaryWVL_VV_VX<riscv_vwmulsu_vl, "PseudoVWMULSU">; // 12.13 Vector Single-Width Integer Multiply-Add Instructions foreach vti = AllIntegerVectors in { diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td index db3f5851879a..07884d35f63c 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td @@ -337,13 +337,39 @@ def SH3ADD : ALU_rr<0b0010000, 0b110, "sh3add">, Sched<[WriteSHXADD, ReadSHXADD, ReadSHXADD]>; } // Predicates = [HasStdExtZba] +let Predicates = [HasStdExtZba, IsRV64] in { +def SLLI_UW : RVBShift_ri<0b00001, 0b001, OPC_OP_IMM_32, "slli.uw">, + Sched<[WriteShiftImm32, ReadShiftImm32]>; +def ADD_UW : ALUW_rr<0b0000100, 0b000, "add.uw">, + Sched<[WriteIALU32, ReadIALU32, ReadIALU32]>; +def SH1ADD_UW : ALUW_rr<0b0010000, 0b010, "sh1add.uw">, + Sched<[WriteSHXADD32, ReadSHXADD32, ReadSHXADD32]>; +def SH2ADD_UW : ALUW_rr<0b0010000, 0b100, "sh2add.uw">, + Sched<[WriteSHXADD32, ReadSHXADD32, ReadSHXADD32]>; +def SH3ADD_UW : ALUW_rr<0b0010000, 0b110, "sh3add.uw">, + Sched<[WriteSHXADD32, ReadSHXADD32, ReadSHXADD32]>; +} // Predicates = [HasStdExtZbb, IsRV64] + let Predicates = [HasStdExtZbbOrZbpOrZbkb] in { def ROL : ALU_rr<0b0110000, 0b001, "rol">, Sched<[WriteRotateReg, ReadRotateReg, ReadRotateReg]>; def ROR : ALU_rr<0b0110000, 0b101, "ror">, Sched<[WriteRotateReg, ReadRotateReg, ReadRotateReg]>; + +def RORI : RVBShift_ri<0b01100, 0b101, OPC_OP_IMM, "rori">, + Sched<[WriteRotateImm, ReadRotateImm]>; } // Predicates = [HasStdExtZbbOrZbpOrZbkb] +let Predicates = [HasStdExtZbbOrZbpOrZbkb, IsRV64] in { +def ROLW : ALUW_rr<0b0110000, 0b001, "rolw">, + Sched<[WriteRotateReg32, ReadRotateReg32, ReadRotateReg32]>; +def RORW : ALUW_rr<0b0110000, 0b101, "rorw">, + Sched<[WriteRotateReg32, ReadRotateReg32, ReadRotateReg32]>; + +def RORIW : RVBShiftW_ri<0b0110000, 0b101, OPC_OP_IMM_32, "roriw">, + Sched<[WriteRotateImm32, ReadRotateImm32]>; +} // Predicates = [HasStdExtZbbOrZbp, IsRV64] + let Predicates = [HasStdExtZbs] in { def BCLR : ALU_rr<0b0100100, 0b001, "bclr">, Sched<[WriteSingleBit, ReadSingleBit, ReadSingleBit]>; @@ -353,27 +379,7 @@ def BINV : ALU_rr<0b0110100, 0b001, "binv">, Sched<[WriteSingleBit, ReadSingleBit, ReadSingleBit]>; def BEXT : ALU_rr<0b0100100, 0b101, "bext">, Sched<[WriteSingleBit, ReadSingleBit, ReadSingleBit]>; -} // Predicates = [HasStdExtZbs] - -let Predicates = [HasStdExtZbp] in { -def GORC : ALU_rr<0b0010100, 0b101, "gorc">, Sched<[]>; -def GREV : ALU_rr<0b0110100, 0b101, "grev">, Sched<[]>; -} // Predicates = [HasStdExtZbp] -let Predicates = [HasStdExtZbpOrZbkx] in { -def XPERMN : ALU_rr<0b0010100, 0b010, "xperm4">, Sched<[]>; -def XPERMB : ALU_rr<0b0010100, 0b100, "xperm8">, Sched<[]>; -} // Predicates = [HasStdExtZbpOrZbkx] - -let Predicates = [HasStdExtZbp] in { -def XPERMH : ALU_rr<0b0010100, 0b110, "xperm.h">, Sched<[]>; -} // Predicates = [HasStdExtZbp] - -let Predicates = [HasStdExtZbbOrZbpOrZbkb] in -def RORI : RVBShift_ri<0b01100, 0b101, OPC_OP_IMM, "rori">, - Sched<[WriteRotateImm, ReadRotateImm]>; - -let Predicates = [HasStdExtZbs] in { def BCLRI : RVBShift_ri<0b01001, 0b001, OPC_OP_IMM, "bclri">, Sched<[WriteSingleBitImm, ReadSingleBitImm]>; def BSETI : RVBShift_ri<0b00101, 0b001, OPC_OP_IMM, "bseti">, @@ -385,10 +391,42 @@ def BEXTI : RVBShift_ri<0b01001, 0b101, OPC_OP_IMM, "bexti">, 
} // Predicates = [HasStdExtZbs] let Predicates = [HasStdExtZbp] in { +def GORC : ALU_rr<0b0010100, 0b101, "gorc">, Sched<[]>; +def GREV : ALU_rr<0b0110100, 0b101, "grev">, Sched<[]>; + def GREVI : RVBShift_ri<0b01101, 0b101, OPC_OP_IMM, "grevi">, Sched<[]>; def GORCI : RVBShift_ri<0b00101, 0b101, OPC_OP_IMM, "gorci">, Sched<[]>; + +def SHFL : ALU_rr<0b0000100, 0b001, "shfl">, Sched<[]>; +def UNSHFL : ALU_rr<0b0000100, 0b101, "unshfl">, Sched<[]>; + +def SHFLI : RVBShfl_ri<0b0000100, 0b001, OPC_OP_IMM, "shfli">, Sched<[]>; +def UNSHFLI : RVBShfl_ri<0b0000100, 0b101, OPC_OP_IMM, "unshfli">, Sched<[]>; + +def XPERM_H : ALU_rr<0b0010100, 0b110, "xperm.h">, Sched<[]>; } // Predicates = [HasStdExtZbp] +let Predicates = [HasStdExtZbp, IsRV64] in { +def GORCW : ALUW_rr<0b0010100, 0b101, "gorcw">, Sched<[]>; +def GREVW : ALUW_rr<0b0110100, 0b101, "grevw">, Sched<[]>; + +def GORCIW : RVBShiftW_ri<0b0010100, 0b101, OPC_OP_IMM_32, "gorciw">, Sched<[]>; +def GREVIW : RVBShiftW_ri<0b0110100, 0b101, OPC_OP_IMM_32, "greviw">, Sched<[]>; + +def SHFLW : ALUW_rr<0b0000100, 0b001, "shflw">, Sched<[]>; +def UNSHFLW : ALUW_rr<0b0000100, 0b101, "unshflw">, Sched<[]>; + +def XPERM_W : ALU_rr<0b0010100, 0b000, "xperm.w">, Sched<[]>; +} // Predicates = [HasStdExtZbp, IsRV64] + +// These instructions were named xperm.n and xperm.b in the last version of +// the draft bit manipulation specification they were included in. However, we +// use the mnemonics given to them in the ratified Zbkx extension. +let Predicates = [HasStdExtZbpOrZbkx] in { +def XPERM4 : ALU_rr<0b0010100, 0b010, "xperm4">, Sched<[]>; +def XPERM8 : ALU_rr<0b0010100, 0b100, "xperm8">, Sched<[]>; +} // Predicates = [HasStdExtZbpOrZbkx] + let Predicates = [HasStdExtZbt] in { def CMIX : RVBTernaryR<0b11, 0b001, OPC_OP, "cmix", "$rd, $rs2, $rs1, $rs3">, Sched<[]>; @@ -402,6 +440,15 @@ def FSRI : RVBTernaryImm6<0b101, OPC_OP_IMM, "fsri", "$rd, $rs1, $rs3, $shamt">, Sched<[]>; } // Predicates = [HasStdExtZbt] +let Predicates = [HasStdExtZbt, IsRV64] in { +def FSLW : RVBTernaryR<0b10, 0b001, OPC_OP_32, + "fslw", "$rd, $rs1, $rs3, $rs2">, Sched<[]>; +def FSRW : RVBTernaryR<0b10, 0b101, OPC_OP_32, "fsrw", + "$rd, $rs1, $rs3, $rs2">, Sched<[]>; +def FSRIW : RVBTernaryImm5<0b10, 0b101, OPC_OP_IMM_32, + "fsriw", "$rd, $rs1, $rs3, $shamt">, Sched<[]>; +} // Predicates = [HasStdExtZbt, IsRV64] + let Predicates = [HasStdExtZbb] in { def CLZ : RVBUnary<0b0110000, 0b00000, 0b001, OPC_OP_IMM, "clz">, Sched<[WriteCLZ, ReadCLZ]>; @@ -411,42 +458,45 @@ def CPOP : RVBUnary<0b0110000, 0b00010, 0b001, OPC_OP_IMM, "cpop">, Sched<[WriteCPOP, ReadCPOP]>; } // Predicates = [HasStdExtZbb] -let Predicates = [HasStdExtZbm, IsRV64] in -def BMATFLIP : RVBUnary<0b0110000, 0b00011, 0b001, OPC_OP_IMM, "bmatflip">, - Sched<[]>; +let Predicates = [HasStdExtZbb, IsRV64] in { +def CLZW : RVBUnary<0b0110000, 0b00000, 0b001, OPC_OP_IMM_32, "clzw">, + Sched<[WriteCLZ32, ReadCLZ32]>; +def CTZW : RVBUnary<0b0110000, 0b00001, 0b001, OPC_OP_IMM_32, "ctzw">, + Sched<[WriteCTZ32, ReadCTZ32]>; +def CPOPW : RVBUnary<0b0110000, 0b00010, 0b001, OPC_OP_IMM_32, "cpopw">, + Sched<[WriteCPOP32, ReadCPOP32]>; +} // Predicates = [HasStdExtZbb, IsRV64] let Predicates = [HasStdExtZbb] in { -def SEXTB : RVBUnary<0b0110000, 0b00100, 0b001, OPC_OP_IMM, "sext.b">, - Sched<[WriteIALU, ReadIALU]>; -def SEXTH : RVBUnary<0b0110000, 0b00101, 0b001, OPC_OP_IMM, "sext.h">, - Sched<[WriteIALU, ReadIALU]>; +def SEXT_B : RVBUnary<0b0110000, 0b00100, 0b001, OPC_OP_IMM, "sext.b">, + Sched<[WriteIALU, ReadIALU]>; +def SEXT_H 
: RVBUnary<0b0110000, 0b00101, 0b001, OPC_OP_IMM, "sext.h">, + Sched<[WriteIALU, ReadIALU]>; } // Predicates = [HasStdExtZbb] let Predicates = [HasStdExtZbr] in { -def CRC32B : RVBUnary<0b0110000, 0b10000, 0b001, OPC_OP_IMM, "crc32.b">, - Sched<[]>; -def CRC32H : RVBUnary<0b0110000, 0b10001, 0b001, OPC_OP_IMM, "crc32.h">, - Sched<[]>; -def CRC32W : RVBUnary<0b0110000, 0b10010, 0b001, OPC_OP_IMM, "crc32.w">, - Sched<[]>; -} // Predicates = [HasStdExtZbr] - -let Predicates = [HasStdExtZbr, IsRV64] in -def CRC32D : RVBUnary<0b0110000, 0b10011, 0b001, OPC_OP_IMM, "crc32.d">, +def CRC32_B : RVBUnary<0b0110000, 0b10000, 0b001, OPC_OP_IMM, "crc32.b">, Sched<[]>; - -let Predicates = [HasStdExtZbr] in { -def CRC32CB : RVBUnary<0b0110000, 0b11000, 0b001, OPC_OP_IMM, "crc32c.b">, +def CRC32_H : RVBUnary<0b0110000, 0b10001, 0b001, OPC_OP_IMM, "crc32.h">, Sched<[]>; -def CRC32CH : RVBUnary<0b0110000, 0b11001, 0b001, OPC_OP_IMM, "crc32c.h">, - Sched<[]>; -def CRC32CW : RVBUnary<0b0110000, 0b11010, 0b001, OPC_OP_IMM, "crc32c.w">, +def CRC32_W : RVBUnary<0b0110000, 0b10010, 0b001, OPC_OP_IMM, "crc32.w">, Sched<[]>; + +def CRC32C_B : RVBUnary<0b0110000, 0b11000, 0b001, OPC_OP_IMM, "crc32c.b">, + Sched<[]>; +def CRC32C_H : RVBUnary<0b0110000, 0b11001, 0b001, OPC_OP_IMM, "crc32c.h">, + Sched<[]>; +def CRC32C_W : RVBUnary<0b0110000, 0b11010, 0b001, OPC_OP_IMM, "crc32c.w">, + Sched<[]>; } // Predicates = [HasStdExtZbr] -let Predicates = [HasStdExtZbr, IsRV64] in -def CRC32CD : RVBUnary<0b0110000, 0b11011, 0b001, OPC_OP_IMM, "crc32c.d">, - Sched<[]>; +let Predicates = [HasStdExtZbr, IsRV64] in { +def CRC32_D : RVBUnary<0b0110000, 0b10011, 0b001, OPC_OP_IMM, "crc32.d">, + Sched<[]>; + +def CRC32C_D : RVBUnary<0b0110000, 0b11011, 0b001, OPC_OP_IMM, "crc32c.d">, + Sched<[]>; +} // Predicates = [HasStdExtZbr, IsRV64] let Predicates = [HasStdExtZbc] in { def CLMULR : ALU_rr<0b0000101, 0b010, "clmulr">, @@ -472,8 +522,6 @@ def MAXU : ALU_rr<0b0000101, 0b111, "maxu">, } // Predicates = [HasStdExtZbb] let Predicates = [HasStdExtZbp] in { -def SHFL : ALU_rr<0b0000100, 0b001, "shfl">, Sched<[]>; -def UNSHFL : ALU_rr<0b0000100, 0b101, "unshfl">, Sched<[]>; } // Predicates = [HasStdExtZbp] let Predicates = [HasStdExtZbe] in { @@ -483,15 +531,31 @@ def BDECOMPRESS : ALU_rr<0b0100100, 0b110, "bdecompress">, Sched<[]>; def BCOMPRESS : ALU_rr<0b0000100, 0b110, "bcompress">, Sched<[]>; } // Predicates = [HasStdExtZbe] +let Predicates = [HasStdExtZbe, IsRV64] in { +// NOTE: These mnemonics are from the 0.94 spec. There is a name conflict with +// bextw in the 0.93 spec. 
+def BDECOMPRESSW : ALUW_rr<0b0100100, 0b110, "bdecompressw">, Sched<[]>; +def BCOMPRESSW : ALUW_rr<0b0000100, 0b110, "bcompressw">, Sched<[]>; +} // Predicates = [HasStdExtZbe, IsRV64] + let Predicates = [HasStdExtZbpOrZbkb] in { def PACK : ALU_rr<0b0000100, 0b100, "pack">, Sched<[]>; def PACKH : ALU_rr<0b0000100, 0b111, "packh">, Sched<[]>; } // Predicates = [HasStdExtZbpOrZbkb] +let Predicates = [HasStdExtZbpOrZbkb, IsRV64] in +def PACKW : ALUW_rr<0b0000100, 0b100, "packw">, Sched<[]>; + let Predicates = [HasStdExtZbp] in def PACKU : ALU_rr<0b0100100, 0b100, "packu">, Sched<[]>; +let Predicates = [HasStdExtZbp, IsRV64] in +def PACKUW : ALUW_rr<0b0100100, 0b100, "packuw">, Sched<[]>; + let Predicates = [HasStdExtZbm, IsRV64] in { +def BMATFLIP : RVBUnary<0b0110000, 0b00011, 0b001, OPC_OP_IMM, "bmatflip">, + Sched<[]>; + def BMATOR : ALU_rr<0b0000100, 0b011, "bmator">, Sched<[]>; def BMATXOR : ALU_rr<0b0100100, 0b011, "bmatxor">, Sched<[]>; } // Predicates = [HasStdExtZbm, IsRV64] @@ -500,105 +564,18 @@ let Predicates = [HasStdExtZbf] in def BFP : ALU_rr<0b0100100, 0b111, "bfp">, Sched<[WriteBFP, ReadBFP, ReadBFP]>; -let Predicates = [HasStdExtZbp] in { -def SHFLI : RVBShfl_ri<0b0000100, 0b001, OPC_OP_IMM, "shfli">, Sched<[]>; -def UNSHFLI : RVBShfl_ri<0b0000100, 0b101, OPC_OP_IMM, "unshfli">, Sched<[]>; -} // Predicates = [HasStdExtZbp] - -let Predicates = [HasStdExtZba, IsRV64] in { -def SLLIUW : RVBShift_ri<0b00001, 0b001, OPC_OP_IMM_32, "slli.uw">, - Sched<[WriteShiftImm32, ReadShiftImm32]>; -def ADDUW : ALUW_rr<0b0000100, 0b000, "add.uw">, - Sched<[WriteIALU32, ReadIALU32, ReadIALU32]>; -def SH1ADDUW : ALUW_rr<0b0010000, 0b010, "sh1add.uw">, - Sched<[WriteSHXADD32, ReadSHXADD32, ReadSHXADD32]>; -def SH2ADDUW : ALUW_rr<0b0010000, 0b100, "sh2add.uw">, - Sched<[WriteSHXADD32, ReadSHXADD32, ReadSHXADD32]>; -def SH3ADDUW : ALUW_rr<0b0010000, 0b110, "sh3add.uw">, - Sched<[WriteSHXADD32, ReadSHXADD32, ReadSHXADD32]>; -} // Predicates = [HasStdExtZbb, IsRV64] - -let Predicates = [HasStdExtZbbOrZbpOrZbkb, IsRV64] in { -def ROLW : ALUW_rr<0b0110000, 0b001, "rolw">, - Sched<[WriteRotateReg32, ReadRotateReg32, ReadRotateReg32]>; -def RORW : ALUW_rr<0b0110000, 0b101, "rorw">, - Sched<[WriteRotateReg32, ReadRotateReg32, ReadRotateReg32]>; -} // Predicates = [HasStdExtZbbOrZbp, IsRV64] - -let Predicates = [HasStdExtZbp, IsRV64] in { -def GORCW : ALUW_rr<0b0010100, 0b101, "gorcw">, Sched<[]>; -def GREVW : ALUW_rr<0b0110100, 0b101, "grevw">, Sched<[]>; -} // Predicates = [HasStdExtZbp, IsRV64] - -let Predicates = [HasStdExtZbp, IsRV64] in { -def XPERMW : ALU_rr<0b0010100, 0b000, "xperm.w">, Sched<[]>; -} // Predicates = [HasStdExtZbp, IsRV64] - -let Predicates = [HasStdExtZbbOrZbpOrZbkb, IsRV64] in -def RORIW : RVBShiftW_ri<0b0110000, 0b101, OPC_OP_IMM_32, "roriw">, - Sched<[WriteRotateImm32, ReadRotateImm32]>; - -let Predicates = [HasStdExtZbp, IsRV64] in { -def GORCIW : RVBShiftW_ri<0b0010100, 0b101, OPC_OP_IMM_32, "gorciw">, Sched<[]>; -def GREVIW : RVBShiftW_ri<0b0110100, 0b101, OPC_OP_IMM_32, "greviw">, Sched<[]>; -} // Predicates = [HasStdExtZbp, IsRV64] - -let Predicates = [HasStdExtZbt, IsRV64] in { -def FSLW : RVBTernaryR<0b10, 0b001, OPC_OP_32, - "fslw", "$rd, $rs1, $rs3, $rs2">, Sched<[]>; -def FSRW : RVBTernaryR<0b10, 0b101, OPC_OP_32, "fsrw", - "$rd, $rs1, $rs3, $rs2">, Sched<[]>; -def FSRIW : RVBTernaryImm5<0b10, 0b101, OPC_OP_IMM_32, - "fsriw", "$rd, $rs1, $rs3, $shamt">, Sched<[]>; -} // Predicates = [HasStdExtZbt, IsRV64] - -let Predicates = [HasStdExtZbb, IsRV64] in { -def CLZW : 
RVBUnary<0b0110000, 0b00000, 0b001, OPC_OP_IMM_32, "clzw">, - Sched<[WriteCLZ32, ReadCLZ32]>; -def CTZW : RVBUnary<0b0110000, 0b00001, 0b001, OPC_OP_IMM_32, "ctzw">, - Sched<[WriteCTZ32, ReadCTZ32]>; -def CPOPW : RVBUnary<0b0110000, 0b00010, 0b001, OPC_OP_IMM_32, "cpopw">, - Sched<[WriteCPOP32, ReadCPOP32]>; -} // Predicates = [HasStdExtZbb, IsRV64] - -let Predicates = [HasStdExtZbp, IsRV64] in { -def SHFLW : ALUW_rr<0b0000100, 0b001, "shflw">, Sched<[]>; -def UNSHFLW : ALUW_rr<0b0000100, 0b101, "unshflw">, Sched<[]>; -} // Predicates = [HasStdExtZbp, IsRV64] - -let Predicates = [HasStdExtZbe, IsRV64] in { -// NOTE: These mnemonics are from the 0.94 spec. There is a name conflict with -// bextw in the 0.93 spec. -def BDECOMPRESSW : ALUW_rr<0b0100100, 0b110, "bdecompressw">, Sched<[]>; -def BCOMPRESSW : ALUW_rr<0b0000100, 0b110, "bcompressw">, Sched<[]>; -} // Predicates = [HasStdExtZbe, IsRV64] - -let Predicates = [HasStdExtZbpOrZbkb, IsRV64] in -def PACKW : ALUW_rr<0b0000100, 0b100, "packw">, Sched<[]>; - -let Predicates = [HasStdExtZbp, IsRV64] in -def PACKUW : ALUW_rr<0b0100100, 0b100, "packuw">, Sched<[]>; - let Predicates = [HasStdExtZbf, IsRV64] in def BFPW : ALUW_rr<0b0100100, 0b111, "bfpw">, Sched<[WriteBFP32, ReadBFP32, ReadBFP32]>; let Predicates = [HasStdExtZbbOrZbp, IsRV32] in { -let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in -def ZEXTH_RV32 : RVInstR<0b0000100, 0b100, OPC_OP, (outs GPR:$rd), - (ins GPR:$rs1), "zext.h", "$rd, $rs1">, - Sched<[WriteIALU, ReadIALU]> { - let rs2 = 0b00000; -} +def ZEXT_H_RV32 : RVBUnary<0b0000100, 0b00000, 0b100, OPC_OP, "zext.h">, + Sched<[WriteIALU, ReadIALU]>; } // Predicates = [HasStdExtZbbOrZbp, IsRV32] let Predicates = [HasStdExtZbbOrZbp, IsRV64] in { -let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in -def ZEXTH_RV64 : RVInstR<0b0000100, 0b100, OPC_OP_32, (outs GPR:$rd), - (ins GPR:$rs1), "zext.h", "$rd, $rs1">, - Sched<[WriteIALU, ReadIALU]> { - let rs2 = 0b00000; -} +def ZEXT_H_RV64 : RVBUnary<0b0000100, 0b00000, 0b100, OPC_OP_32, "zext.h">, + Sched<[WriteIALU, ReadIALU]>; } // Predicates = [HasStdExtZbbOrZbp, IsRV64] // We treat rev8 and orc.b as standalone instructions even though they use a @@ -619,8 +596,8 @@ def REV8_RV64 : RVBUnary<0b0110101, 0b11000, 0b101, OPC_OP_IMM, "rev8">, } // Predicates = [HasStdExtZbbOrZbpOrZbkb, IsRV64] let Predicates = [HasStdExtZbbOrZbp] in { -def ORCB : RVBUnary<0b0010100, 0b00111, 0b101, OPC_OP_IMM, "orc.b">, - Sched<[WriteORCB, ReadORCB]>; +def ORC_B : RVBUnary<0b0010100, 0b00111, 0b101, OPC_OP_IMM, "orc.b">, + Sched<[WriteORCB, ReadORCB]>; } // Predicates = [HasStdExtZbbOrZbp] let Predicates = [HasStdExtZbpOrZbkb] in @@ -637,7 +614,7 @@ def UNZIP_RV32 : RVBUnary<0b0000100, 0b01111, 0b101, OPC_OP_IMM, "unzip">; //===----------------------------------------------------------------------===// let Predicates = [HasStdExtZba, IsRV64] in { -def : InstAlias<"zext.w $rd, $rs", (ADDUW GPR:$rd, GPR:$rs, X0)>; +def : InstAlias<"zext.w $rd, $rs", (ADD_UW GPR:$rd, GPR:$rs, X0)>; } let Predicates = [HasStdExtZbp] in { @@ -775,8 +752,10 @@ def : InstAlias<"gorcw $rd, $rs1, $shamt", // Zbp is unratified and that it would likely adopt the already ratified Zbkx names. // Thus current Zbp instructions are defined as aliases for Zbkx instructions. 
let Predicates = [HasStdExtZbp] in { - def : InstAlias<"xperm.b $rd, $rs1, $rs2", (XPERMB GPR:$rd, GPR:$rs1, GPR:$rs2)>; - def : InstAlias<"xperm.n $rd, $rs1, $rs2", (XPERMN GPR:$rd, GPR:$rs1, GPR:$rs2)>; + def : InstAlias<"xperm.b $rd, $rs1, $rs2", + (XPERM8 GPR:$rd, GPR:$rs1, GPR:$rs2)>; + def : InstAlias<"xperm.n $rd, $rs1, $rs2", + (XPERM4 GPR:$rd, GPR:$rs1, GPR:$rs2)>; } // Predicates = [HasStdExtZbp] let Predicates = [HasStdExtZbs] in { @@ -803,8 +782,22 @@ def : Pat<(xor GPR:$rs1, (not GPR:$rs2)), (XNOR GPR:$rs1, GPR:$rs2)>; let Predicates = [HasStdExtZbbOrZbpOrZbkb] in { def : PatGprGpr<rotl, ROL>; def : PatGprGpr<rotr, ROR>; + +def : PatGprImm<rotr, RORI, uimmlog2xlen>; +// There's no encoding for roli in the the 'B' extension as it can be +// implemented with rori by negating the immediate. +def : Pat<(rotl GPR:$rs1, uimmlog2xlen:$shamt), + (RORI GPR:$rs1, (ImmSubFromXLen uimmlog2xlen:$shamt))>; } // Predicates = [HasStdExtZbbOrZbpOrZbkb] +let Predicates = [HasStdExtZbbOrZbpOrZbkb, IsRV64] in { +def : PatGprGpr<riscv_rolw, ROLW>; +def : PatGprGpr<riscv_rorw, RORW>; +def : PatGprImm<riscv_rorw, RORIW, uimm5>; +def : Pat<(riscv_rolw GPR:$rs1, uimm5:$rs2), + (RORIW GPR:$rs1, (ImmSubFrom32 uimm5:$rs2))>; +} // Predicates = [HasStdExtZbbOrZbpOrZbkb, IsRV64] + let Predicates = [HasStdExtZbs] in { def : Pat<(and (not (shiftop<shl> 1, GPR:$rs2)), GPR:$rs1), (BCLR GPR:$rs1, GPR:$rs2)>; @@ -852,48 +845,62 @@ def : Pat<(and GPR:$r, BCLRIANDIMask:$i), (BCLRITwoBitsMaskHigh BCLRIANDIMask:$i))>; } -// There's no encoding for roli in the the 'B' extension as it can be -// implemented with rori by negating the immediate. -let Predicates = [HasStdExtZbbOrZbpOrZbkb] in { -def : PatGprImm<rotr, RORI, uimmlog2xlen>; -def : Pat<(rotl GPR:$rs1, uimmlog2xlen:$shamt), - (RORI GPR:$rs1, (ImmSubFromXLen uimmlog2xlen:$shamt))>; - +let Predicates = [HasStdExtZbbOrZbp] in { // We treat orc.b as a separate instruction, so match it directly. We also // lower the Zbb orc.b intrinsic to this. -def : Pat<(riscv_gorc GPR:$rs1, 7), (ORCB GPR:$rs1)>; +def : Pat<(riscv_gorc GPR:$rs1, 7), (ORC_B GPR:$rs1)>; +} + +let Predicates = [HasStdExtZbpOrZbkb] in { +// We treat brev8 as a separate instruction, so match it directly. We also +// use this for brev8 when lowering bitreverse with Zbkb. +def : Pat<(riscv_grev GPR:$rs1, 7), (BREV8 GPR:$rs1)>; + +// We treat zip and unzip as separate instructions, so match it directly. +def : Pat<(i32 (riscv_shfl GPR:$rs1, 15)), (ZIP_RV32 GPR:$rs1)>; +def : Pat<(i32 (riscv_unshfl GPR:$rs1, 15)), (UNZIP_RV32 GPR:$rs1)>; } let Predicates = [HasStdExtZbp] in { def : PatGprGpr<riscv_grev, GREV>; def : PatGprGpr<riscv_gorc, GORC>; +def : PatGprImm<riscv_grev, GREVI, uimmlog2xlen>; +def : PatGprImm<riscv_gorc, GORCI, uimmlog2xlen>; + def : PatGprGpr<riscv_shfl, SHFL>; def : PatGprGpr<riscv_unshfl, UNSHFL>; -def : PatGprGpr<int_riscv_xperm_n, XPERMN>; -def : PatGprGpr<int_riscv_xperm_b, XPERMB>; -def : PatGprGpr<int_riscv_xperm_h, XPERMH>; def : PatGprImm<riscv_shfl, SHFLI, shfl_uimm>; def : PatGprImm<riscv_unshfl, UNSHFLI, shfl_uimm>; -def : PatGprImm<riscv_grev, GREVI, uimmlog2xlen>; -def : PatGprImm<riscv_gorc, GORCI, uimmlog2xlen>; -// We treat brev8 as a separate instruction, so match it directly. 
-def : Pat<(riscv_grev GPR:$rs1, 7), (BREV8 GPR:$rs1)>; +def : PatGprGpr<int_riscv_xperm_n, XPERM4>; +def : PatGprGpr<int_riscv_xperm_b, XPERM8>; +def : PatGprGpr<int_riscv_xperm_h, XPERM_H>; } // Predicates = [HasStdExtZbp] +let Predicates = [HasStdExtZbp, IsRV64] in { +def : PatGprGpr<riscv_grevw, GREVW>; +def : PatGprGpr<riscv_gorcw, GORCW>; +def : PatGprImm<riscv_grevw, GREVIW, uimm5>; +def : PatGprImm<riscv_gorcw, GORCIW, uimm5>; + +// FIXME: Move to DAG combine. +def : Pat<(riscv_rorw (riscv_grevw GPR:$rs1, 24), 16), (GREVIW GPR:$rs1, 8)>; +def : Pat<(riscv_rolw (riscv_grevw GPR:$rs1, 24), 16), (GREVIW GPR:$rs1, 8)>; + +def : PatGprGpr<riscv_shflw, SHFLW>; +def : PatGprGpr<riscv_unshflw, UNSHFLW>; +} // Predicates = [HasStdExtZbp, IsRV64] + let Predicates = [HasStdExtZbp, IsRV64] in -def : PatGprGpr<int_riscv_xperm_w, XPERMW>; +def : PatGprGpr<int_riscv_xperm_w, XPERM_W>; let Predicates = [HasStdExtZbp, IsRV32] in { +// FIXME : Move to DAG combine. def : Pat<(i32 (rotr (riscv_grev GPR:$rs1, 24), (i32 16))), (GREVI GPR:$rs1, 8)>; def : Pat<(i32 (rotl (riscv_grev GPR:$rs1, 24), (i32 16))), (GREVI GPR:$rs1, 8)>; // We treat rev8 as a separate instruction, so match it directly. def : Pat<(i32 (riscv_grev GPR:$rs1, 24)), (REV8_RV32 GPR:$rs1)>; - -// We treat zip and unzip as separate instructions, so match it directly. -def : Pat<(i32 (riscv_shfl GPR:$rs1, 15)), (ZIP_RV32 GPR:$rs1)>; -def : Pat<(i32 (riscv_unshfl GPR:$rs1, 15)), (UNZIP_RV32 GPR:$rs1)>; } // Predicates = [HasStdExtZbp, IsRV32] let Predicates = [HasStdExtZbp, IsRV64] in { @@ -942,15 +949,34 @@ def : Pat<(riscv_fsl GPR:$rs3, GPR:$rs1, uimmlog2xlen:$shamt), (FSRI GPR:$rs1, GPR:$rs3, (ImmSubFromXLen uimmlog2xlen:$shamt))>; } // Predicates = [HasStdExtZbt] +let Predicates = [HasStdExtZbt, IsRV64] in { +def : Pat<(riscv_fslw GPR:$rs1, GPR:$rs3, GPR:$rs2), + (FSLW GPR:$rs1, GPR:$rs2, GPR:$rs3)>; +def : Pat<(riscv_fsrw GPR:$rs1, GPR:$rs3, GPR:$rs2), + (FSRW GPR:$rs1, GPR:$rs2, GPR:$rs3)>; +def : Pat<(riscv_fsrw GPR:$rs1, GPR:$rs3, uimm5:$shamt), + (FSRIW GPR:$rs1, GPR:$rs3, uimm5:$shamt)>; +// We can use FSRIW for FSLW by immediate if we subtract the immediate from +// 32 and swap the operands. 
+def : Pat<(riscv_fslw GPR:$rs3, GPR:$rs1, uimm5:$shamt), + (FSRIW GPR:$rs1, GPR:$rs3, (ImmSubFrom32 uimm5:$shamt))>; +} // Predicates = [HasStdExtZbt, IsRV64] + let Predicates = [HasStdExtZbb] in { def : PatGpr<ctlz, CLZ>; def : PatGpr<cttz, CTZ>; def : PatGpr<ctpop, CPOP>; } // Predicates = [HasStdExtZbb] +let Predicates = [HasStdExtZbb, IsRV64] in { +def : PatGpr<riscv_clzw, CLZW>; +def : PatGpr<riscv_ctzw, CTZW>; +def : Pat<(i64 (ctpop (i64 (zexti32 (i64 GPR:$rs1))))), (CPOPW GPR:$rs1)>; +} // Predicates = [HasStdExtZbb, IsRV64] + let Predicates = [HasStdExtZbb] in { -def : Pat<(sext_inreg GPR:$rs1, i8), (SEXTB GPR:$rs1)>; -def : Pat<(sext_inreg GPR:$rs1, i16), (SEXTH GPR:$rs1)>; +def : Pat<(sext_inreg GPR:$rs1, i8), (SEXT_B GPR:$rs1)>; +def : Pat<(sext_inreg GPR:$rs1, i16), (SEXT_H GPR:$rs1)>; } let Predicates = [HasStdExtZbb] in { @@ -968,35 +994,49 @@ let Predicates = [HasStdExtZbbOrZbkb, IsRV64] in { def : Pat<(i64 (bswap GPR:$rs1)), (REV8_RV64 GPR:$rs1)>; } // Predicates = [HasStdExtZbbOrZbkb, IsRV64] +let Predicates = [HasStdExtZbpOrZbkb] in { +def : Pat<(or (and (shl GPR:$rs2, (XLenVT 8)), 0xFFFF), + (and GPR:$rs1, 0x00FF)), + (PACKH GPR:$rs1, GPR:$rs2)>; +def : Pat<(or (shl (and GPR:$rs2, 0x00FF), (XLenVT 8)), + (and GPR:$rs1, 0x00FF)), + (PACKH GPR:$rs1, GPR:$rs2)>; +} // Predicates = [HasStdExtZbpOrZbkb] + let Predicates = [HasStdExtZbpOrZbkb, IsRV32] in def : Pat<(i32 (or (and GPR:$rs1, 0x0000FFFF), (shl GPR:$rs2, (i32 16)))), (PACK GPR:$rs1, GPR:$rs2)>; +let Predicates = [HasStdExtZbpOrZbkb, IsRV64] in { +def : Pat<(i64 (or (and GPR:$rs1, 0x00000000FFFFFFFF), (shl GPR:$rs2, (i64 32)))), + (PACK GPR:$rs1, GPR:$rs2)>; + +def : Pat<(i64 (sext_inreg (or (shl GPR:$rs2, (i64 16)), + (and GPR:$rs1, 0x000000000000FFFF)), + i32)), + (PACKW GPR:$rs1, GPR:$rs2)>; +def : Pat<(i64 (or (sext_inreg (shl GPR:$rs2, (i64 16)), i32), + (and GPR:$rs1, 0x000000000000FFFF))), + (PACKW GPR:$rs1, GPR:$rs2)>; +} + let Predicates = [HasStdExtZbp, IsRV32] in def : Pat<(i32 (or (and GPR:$rs2, 0xFFFF0000), (srl GPR:$rs1, (i32 16)))), (PACKU GPR:$rs1, GPR:$rs2)>; -let Predicates = [HasStdExtZbpOrZbkb, IsRV64] in -def : Pat<(i64 (or (and GPR:$rs1, 0x00000000FFFFFFFF), (shl GPR:$rs2, (i64 32)))), - (PACK GPR:$rs1, GPR:$rs2)>; - -let Predicates = [HasStdExtZbp, IsRV64] in +let Predicates = [HasStdExtZbp, IsRV64] in { def : Pat<(i64 (or (and GPR:$rs2, 0xFFFFFFFF00000000), (srl GPR:$rs1, (i64 32)))), (PACKU GPR:$rs1, GPR:$rs2)>; -let Predicates = [HasStdExtZbpOrZbkb] in { -def : Pat<(or (and (shl GPR:$rs2, (XLenVT 8)), 0xFFFF), - (and GPR:$rs1, 0x00FF)), - (PACKH GPR:$rs1, GPR:$rs2)>; -def : Pat<(or (shl (and GPR:$rs2, 0x00FF), (XLenVT 8)), - (and GPR:$rs1, 0x00FF)), - (PACKH GPR:$rs1, GPR:$rs2)>; -} // Predicates = [HasStdExtZbpOrZbkb] +def : Pat<(i64 (or (and (assertsexti32 GPR:$rs2), 0xFFFFFFFFFFFF0000), + (srl (and GPR:$rs1, 0xFFFFFFFF), (i64 16)))), + (PACKUW GPR:$rs1, GPR:$rs2)>; +} let Predicates = [HasStdExtZbbOrZbp, IsRV32] in -def : Pat<(i32 (and GPR:$rs, 0xFFFF)), (ZEXTH_RV32 GPR:$rs)>; +def : Pat<(i32 (and GPR:$rs, 0xFFFF)), (ZEXT_H_RV32 GPR:$rs)>; let Predicates = [HasStdExtZbbOrZbp, IsRV64] in -def : Pat<(i64 (and GPR:$rs, 0xFFFF)), (ZEXTH_RV64 GPR:$rs)>; +def : Pat<(i64 (and GPR:$rs, 0xFFFF)), (ZEXT_H_RV64 GPR:$rs)>; // Pattern to exclude simm12 immediates from matching. 
def non_imm12 : PatLeaf<(XLenVT GPR:$a), [{ @@ -1074,80 +1114,26 @@ def : Pat<(mul_const_oneuse GPR:$r, (XLenVT 81)), let Predicates = [HasStdExtZba, IsRV64] in { def : Pat<(i64 (shl (and GPR:$rs1, 0xFFFFFFFF), uimm5:$shamt)), - (SLLIUW GPR:$rs1, uimm5:$shamt)>; + (SLLI_UW GPR:$rs1, uimm5:$shamt)>; def : Pat<(i64 (add (and GPR:$rs1, 0xFFFFFFFF), non_imm12:$rs2)), - (ADDUW GPR:$rs1, GPR:$rs2)>; -def : Pat<(i64 (and GPR:$rs, 0xFFFFFFFF)), (ADDUW GPR:$rs, X0)>; + (ADD_UW GPR:$rs1, GPR:$rs2)>; +def : Pat<(i64 (and GPR:$rs, 0xFFFFFFFF)), (ADD_UW GPR:$rs, X0)>; def : Pat<(i64 (add (shl (and GPR:$rs1, 0xFFFFFFFF), (i64 1)), non_imm12:$rs2)), - (SH1ADDUW GPR:$rs1, GPR:$rs2)>; + (SH1ADD_UW GPR:$rs1, GPR:$rs2)>; def : Pat<(i64 (add (shl (and GPR:$rs1, 0xFFFFFFFF), (i64 2)), non_imm12:$rs2)), - (SH2ADDUW GPR:$rs1, GPR:$rs2)>; + (SH2ADD_UW GPR:$rs1, GPR:$rs2)>; def : Pat<(i64 (add (shl (and GPR:$rs1, 0xFFFFFFFF), (i64 3)), non_imm12:$rs2)), - (SH3ADDUW GPR:$rs1, GPR:$rs2)>; + (SH3ADD_UW GPR:$rs1, GPR:$rs2)>; def : Pat<(i64 (add (and (shl GPR:$rs1, (i64 1)), 0x1FFFFFFFF), non_imm12:$rs2)), - (SH1ADDUW GPR:$rs1, GPR:$rs2)>; + (SH1ADD_UW GPR:$rs1, GPR:$rs2)>; def : Pat<(i64 (add (and (shl GPR:$rs1, (i64 2)), 0x3FFFFFFFF), non_imm12:$rs2)), - (SH2ADDUW GPR:$rs1, GPR:$rs2)>; + (SH2ADD_UW GPR:$rs1, GPR:$rs2)>; def : Pat<(i64 (add (and (shl GPR:$rs1, (i64 3)), 0x7FFFFFFFF), non_imm12:$rs2)), - (SH3ADDUW GPR:$rs1, GPR:$rs2)>; + (SH3ADD_UW GPR:$rs1, GPR:$rs2)>; } // Predicates = [HasStdExtZba, IsRV64] -let Predicates = [HasStdExtZbbOrZbpOrZbkb, IsRV64] in { -def : PatGprGpr<riscv_rolw, ROLW>; -def : PatGprGpr<riscv_rorw, RORW>; -def : PatGprImm<riscv_rorw, RORIW, uimm5>; -def : Pat<(riscv_rolw GPR:$rs1, uimm5:$rs2), - (RORIW GPR:$rs1, (ImmSubFrom32 uimm5:$rs2))>; -} // Predicates = [HasStdExtZbbOrZbpOrZbkb, IsRV64] - -let Predicates = [HasStdExtZbp, IsRV64] in { -def : Pat<(riscv_rorw (riscv_grevw GPR:$rs1, 24), 16), (GREVIW GPR:$rs1, 8)>; -def : Pat<(riscv_rolw (riscv_grevw GPR:$rs1, 24), 16), (GREVIW GPR:$rs1, 8)>; -def : PatGprGpr<riscv_grevw, GREVW>; -def : PatGprGpr<riscv_gorcw, GORCW>; -def : PatGprGpr<riscv_shflw, SHFLW>; -def : PatGprGpr<riscv_unshflw, UNSHFLW>; -def : PatGprImm<riscv_grevw, GREVIW, uimm5>; -def : PatGprImm<riscv_gorcw, GORCIW, uimm5>; -} // Predicates = [HasStdExtZbp, IsRV64] - -let Predicates = [HasStdExtZbt, IsRV64] in { -def : Pat<(riscv_fslw GPR:$rs1, GPR:$rs3, GPR:$rs2), - (FSLW GPR:$rs1, GPR:$rs2, GPR:$rs3)>; -def : Pat<(riscv_fsrw GPR:$rs1, GPR:$rs3, GPR:$rs2), - (FSRW GPR:$rs1, GPR:$rs2, GPR:$rs3)>; -def : Pat<(riscv_fsrw GPR:$rs1, GPR:$rs3, uimm5:$shamt), - (FSRIW GPR:$rs1, GPR:$rs3, uimm5:$shamt)>; -// We can use FSRIW for FSLW by immediate if we subtract the immediate from -// 32 and swap the operands. 
-def : Pat<(riscv_fslw GPR:$rs3, GPR:$rs1, uimm5:$shamt), - (FSRIW GPR:$rs1, GPR:$rs3, (ImmSubFrom32 uimm5:$shamt))>; -} // Predicates = [HasStdExtZbt, IsRV64] - -let Predicates = [HasStdExtZbb, IsRV64] in { -def : PatGpr<riscv_clzw, CLZW>; -def : PatGpr<riscv_ctzw, CTZW>; -def : Pat<(i64 (ctpop (i64 (zexti32 (i64 GPR:$rs1))))), (CPOPW GPR:$rs1)>; -} // Predicates = [HasStdExtZbb, IsRV64] - -let Predicates = [HasStdExtZbpOrZbkb, IsRV64] in { -def : Pat<(i64 (sext_inreg (or (shl GPR:$rs2, (i64 16)), - (and GPR:$rs1, 0x000000000000FFFF)), - i32)), - (PACKW GPR:$rs1, GPR:$rs2)>; -def : Pat<(i64 (or (sext_inreg (shl GPR:$rs2, (i64 16)), i32), - (and GPR:$rs1, 0x000000000000FFFF))), - (PACKW GPR:$rs1, GPR:$rs2)>; -} - -let Predicates = [HasStdExtZbp, IsRV64] in -def : Pat<(i64 (or (and (assertsexti32 GPR:$rs2), 0xFFFFFFFFFFFF0000), - (srl (and GPR:$rs1, 0xFFFFFFFF), (i64 16)))), - (PACKUW GPR:$rs1, GPR:$rs2)>; - - let Predicates = [HasStdExtZbcOrZbkc] in { def : PatGprGpr<int_riscv_clmul, CLMUL>; def : PatGprGpr<int_riscv_clmulh, CLMULH>; @@ -1167,17 +1153,17 @@ def : PatGprGpr<riscv_bdecompressw, BDECOMPRESSW>; } // Predicates = [HasStdExtZbe, IsRV64] let Predicates = [HasStdExtZbr] in { -def : PatGpr<int_riscv_crc32_b, CRC32B>; -def : PatGpr<int_riscv_crc32_h, CRC32H>; -def : PatGpr<int_riscv_crc32_w, CRC32W>; -def : PatGpr<int_riscv_crc32c_b, CRC32CB>; -def : PatGpr<int_riscv_crc32c_h, CRC32CH>; -def : PatGpr<int_riscv_crc32c_w, CRC32CW>; +def : PatGpr<int_riscv_crc32_b, CRC32_B>; +def : PatGpr<int_riscv_crc32_h, CRC32_H>; +def : PatGpr<int_riscv_crc32_w, CRC32_W>; +def : PatGpr<int_riscv_crc32c_b, CRC32C_B>; +def : PatGpr<int_riscv_crc32c_h, CRC32C_H>; +def : PatGpr<int_riscv_crc32c_w, CRC32C_W>; } // Predicates = [HasStdExtZbr] let Predicates = [HasStdExtZbr, IsRV64] in { -def : PatGpr<int_riscv_crc32_d, CRC32D>; -def : PatGpr<int_riscv_crc32c_d, CRC32CD>; +def : PatGpr<int_riscv_crc32_d, CRC32_D>; +def : PatGpr<int_riscv_crc32c_d, CRC32C_D>; } // Predicates = [HasStdExtZbr, IsRV64] let Predicates = [HasStdExtZbf] in @@ -1186,16 +1172,7 @@ def : PatGprGpr<riscv_bfp, BFP>; let Predicates = [HasStdExtZbf, IsRV64] in def : PatGprGpr<riscv_bfpw, BFPW>; -let Predicates = [HasStdExtZbkb] in { -def : PatGpr<int_riscv_brev8, BREV8>; -} // Predicates = [HasStdExtZbkb] - -let Predicates = [HasStdExtZbkb, IsRV32] in { -def : PatGpr<int_riscv_zip, ZIP_RV32>; -def : PatGpr<int_riscv_unzip, UNZIP_RV32>; -} // Predicates = [HasStdExtZbkb, IsRV32] - let Predicates = [HasStdExtZbkx] in { -def : PatGprGpr<int_riscv_xperm4, XPERMN>; -def : PatGprGpr<int_riscv_xperm8, XPERMB>; +def : PatGprGpr<int_riscv_xperm4, XPERM4>; +def : PatGprGpr<int_riscv_xperm8, XPERM8>; } diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td index dfd0c74ee26c..a2753c132354 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td @@ -29,14 +29,14 @@ def riscv_fmv_x_anyexth // Instructions //===----------------------------------------------------------------------===// -let Predicates = [HasStdExtZfhmin] in { +let Predicates = [HasStdExtZfhOrZfhmin] in { def FLH : FPLoad_r<0b001, "flh", FPR16, WriteFLD16>; // Operands for stores are in the order srcreg, base, offset rather than // reflecting the order these fields are specified in the instruction // encoding. 
def FSH : FPStore_r<0b001, "fsh", FPR16, WriteFST16>; -} // Predicates = [HasStdExtZfhmin] +} // Predicates = [HasStdExtZfhOrZfhmin] let Predicates = [HasStdExtZfh] in { let SchedRW = [WriteFMA16, ReadFMA16, ReadFMA16, ReadFMA16] in { @@ -98,7 +98,7 @@ def FCVT_H_WU : FPUnaryOp_r_frm<0b1101010, 0b00001, FPR16, GPR, "fcvt.h.wu">, def : FPUnaryOpDynFrmAlias<FCVT_H_WU, "fcvt.h.wu", FPR16, GPR>; } // Predicates = [HasStdExtZfh] -let Predicates = [HasStdExtZfhmin] in { +let Predicates = [HasStdExtZfhOrZfhmin] in { def FCVT_H_S : FPUnaryOp_r_frm<0b0100010, 0b00000, FPR16, FPR32, "fcvt.h.s">, Sched<[WriteFCvtF32ToF16, ReadFCvtF32ToF16]>; def : FPUnaryOpDynFrmAlias<FCVT_H_S, "fcvt.h.s", FPR16, FPR32>; @@ -113,7 +113,7 @@ def FMV_X_H : FPUnaryOp_r<0b1110010, 0b00000, 0b000, GPR, FPR16, "fmv.x.h">, let mayRaiseFPException = 0 in def FMV_H_X : FPUnaryOp_r<0b1111010, 0b00000, 0b000, FPR16, GPR, "fmv.h.x">, Sched<[WriteFMovI16ToF16, ReadFMovI16ToF16]>; -} // Predicates = [HasStdExtZfhmin] +} // Predicates = [HasStdExtZfhOrZfhmin] let Predicates = [HasStdExtZfh] in { @@ -146,23 +146,23 @@ def FCVT_H_LU : FPUnaryOp_r_frm<0b1101010, 0b00011, FPR16, GPR, "fcvt.h.lu">, def : FPUnaryOpDynFrmAlias<FCVT_H_LU, "fcvt.h.lu", FPR16, GPR>; } // Predicates = [HasStdExtZfh, IsRV64] -let Predicates = [HasStdExtZfhmin, HasStdExtD] in { +let Predicates = [HasStdExtZfhOrZfhmin, HasStdExtD] in { def FCVT_H_D : FPUnaryOp_r_frm<0b0100010, 0b00001, FPR16, FPR64, "fcvt.h.d">, Sched<[WriteFCvtF64ToF16, ReadFCvtF64ToF16]>; def : FPUnaryOpDynFrmAlias<FCVT_H_D, "fcvt.h.d", FPR16, FPR64>; def FCVT_D_H : FPUnaryOp_r<0b0100001, 0b00010, 0b000, FPR64, FPR16, "fcvt.d.h">, Sched<[WriteFCvtF16ToF64, ReadFCvtF16ToF64]>; -} // Predicates = [HasStdExtZfhmin, HasStdExtD] +} // Predicates = [HasStdExtZfhOrZfhmin, HasStdExtD] //===----------------------------------------------------------------------===// // Assembler Pseudo Instructions (User-Level ISA, Version 2.2, Chapter 20) //===----------------------------------------------------------------------===// -let Predicates = [HasStdExtZfhmin] in { +let Predicates = [HasStdExtZfhOrZfhmin] in { def : InstAlias<"flh $rd, (${rs1})", (FLH FPR16:$rd, GPR:$rs1, 0), 0>; def : InstAlias<"fsh $rs2, (${rs1})", (FSH FPR16:$rs2, GPR:$rs1, 0), 0>; -} // Predicates = [HasStdExtZfhmin] +} // Predicates = [HasStdExtZfhOrZfhmin] let Predicates = [HasStdExtZfh] in { def : InstAlias<"fmv.h $rd, $rs", (FSGNJ_H FPR16:$rd, FPR16:$rs, FPR16:$rs)>; @@ -177,14 +177,14 @@ def : InstAlias<"fge.h $rd, $rs, $rt", (FLE_H GPR:$rd, FPR16:$rt, FPR16:$rs), 0>; } // Predicates = [HasStdExtZfh] -let Predicates = [HasStdExtZfhmin] in { +let Predicates = [HasStdExtZfhOrZfhmin] in { def PseudoFLH : PseudoFloatLoad<"flh", FPR16>; def PseudoFSH : PseudoStore<"fsh", FPR16>; let usesCustomInserter = 1 in { def PseudoQuietFLE_H : PseudoQuietFCMP<FPR16>; def PseudoQuietFLT_H : PseudoQuietFCMP<FPR16>; } -} // Predicates = [HasStdExtZfhmin] +} // Predicates = [HasStdExtZfhOrZfhmin] //===----------------------------------------------------------------------===// // Pseudo-instructions and codegen patterns @@ -281,7 +281,7 @@ def : PatSetCC<FPR16, any_fsetccs, SETOLE, FLE_H>; def Select_FPR16_Using_CC_GPR : SelectCC_rrirr<FPR16, GPR>; } // Predicates = [HasStdExtZfh] -let Predicates = [HasStdExtZfhmin] in { +let Predicates = [HasStdExtZfhOrZfhmin] in { /// Loads defm : LdPat<load, FLH, f16>; @@ -299,7 +299,7 @@ def : Pat<(any_fpextend FPR16:$rs1), (FCVT_S_H FPR16:$rs1)>; // Moves (no conversion) def : Pat<(riscv_fmv_h_x GPR:$src), (FMV_H_X 
GPR:$src)>; def : Pat<(riscv_fmv_x_anyexth FPR16:$src), (FMV_X_H FPR16:$src)>; -} // Predicates = [HasStdExtZfhmin] +} // Predicates = [HasStdExtZfhOrZfhmin] let Predicates = [HasStdExtZfh, IsRV32] in { // half->[u]int. Round-to-zero must be used. @@ -351,7 +351,7 @@ def : Pat<(any_sint_to_fp (i64 GPR:$rs1)), (FCVT_H_L $rs1, 0b111)>; def : Pat<(any_uint_to_fp (i64 GPR:$rs1)), (FCVT_H_LU $rs1, 0b111)>; } // Predicates = [HasStdExtZfh, IsRV64] -let Predicates = [HasStdExtZfhmin, HasStdExtD] in { +let Predicates = [HasStdExtZfhOrZfhmin, HasStdExtD] in { /// Float conversion operations // f64 -> f16, f16 -> f64 def : Pat<(any_fpround FPR64:$rs1), (FCVT_H_D FPR64:$rs1, 0b111)>; @@ -361,4 +361,4 @@ def : Pat<(any_fpextend FPR16:$rs1), (FCVT_D_H FPR16:$rs1)>; def : Pat<(fcopysign FPR16:$rs1, FPR64:$rs2), (FSGNJ_H $rs1, (FCVT_H_D $rs2, 0b111))>; def : Pat<(fcopysign FPR64:$rs1, FPR16:$rs2), (FSGNJ_D $rs1, (FCVT_D_H $rs2))>; -} // Predicates = [HasStdExtZfhmin, HasStdExtD] +} // Predicates = [HasStdExtZfhOrZfhmin, HasStdExtD] diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZk.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZk.td index 4a41cddedc71..e4e07f4789a6 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZk.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZk.td @@ -1,4 +1,4 @@ -//===- RISCVInstrInfoZk.td - RISC-V Scalar Crypto instructions - tablegen -*===// +//===- RISCVInstrInfoZk.td - RISC-V 'Zk' instructions ------*- tablegen -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/llvm/lib/Target/RISCV/RISCVSExtWRemoval.cpp b/llvm/lib/Target/RISCV/RISCVSExtWRemoval.cpp index 12ec52925798..715d92b036e3 100644 --- a/llvm/lib/Target/RISCV/RISCVSExtWRemoval.cpp +++ b/llvm/lib/Target/RISCV/RISCVSExtWRemoval.cpp @@ -99,9 +99,9 @@ static bool isSignExtendingOpW(const MachineInstr &MI) { case RISCV::SLTI: case RISCV::SLTU: case RISCV::SLTIU: - case RISCV::SEXTB: - case RISCV::SEXTH: - case RISCV::ZEXTH_RV64: + case RISCV::SEXT_B: + case RISCV::SEXT_H: + case RISCV::ZEXT_H_RV64: return true; // shifting right sufficiently makes the value 32-bit sign-extended case RISCV::SRAI: diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.h b/llvm/lib/Target/RISCV/RISCVSubtarget.h index 044dda0a1ccc..34c6e8e684ac 100644 --- a/llvm/lib/Target/RISCV/RISCVSubtarget.h +++ b/llvm/lib/Target/RISCV/RISCVSubtarget.h @@ -195,6 +195,7 @@ public: return 0; } + unsigned getMinVLen() const { return ZvlLen; } RISCVABI::ABI getTargetABI() const { return TargetABI; } bool isRegisterReservedByUser(Register i) const { assert(i < RISCV::NUM_TARGET_REGS && "Register out of range"); diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp b/llvm/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp index e950f9582f09..4d69040a4508 100644 --- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp +++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp @@ -8,6 +8,7 @@ #include "MCTargetDesc/SparcFixupKinds.h" #include "MCTargetDesc/SparcMCTargetDesc.h" +#include "llvm/ADT/StringSwitch.h" #include "llvm/MC/MCAsmBackend.h" #include "llvm/MC/MCELFObjectWriter.h" #include "llvm/MC/MCExpr.h" @@ -131,6 +132,23 @@ namespace { return Sparc::NumTargetFixupKinds; } + Optional<MCFixupKind> getFixupKind(StringRef Name) const override { + unsigned Type; + Type = llvm::StringSwitch<unsigned>(Name) +#define ELF_RELOC(X, Y) .Case(#X, Y) +#include "llvm/BinaryFormat/ELFRelocs/Sparc.def" +#undef ELF_RELOC + .Case("BFD_RELOC_NONE", 
ELF::R_SPARC_NONE) + .Case("BFD_RELOC_8", ELF::R_SPARC_8) + .Case("BFD_RELOC_16", ELF::R_SPARC_16) + .Case("BFD_RELOC_32", ELF::R_SPARC_32) + .Case("BFD_RELOC_64", ELF::R_SPARC_64) + .Default(-1u); + if (Type == -1u) + return None; + return static_cast<MCFixupKind>(FirstLiteralRelocationKind + Type); + } + const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override { const static MCFixupKindInfo InfosBE[Sparc::NumTargetFixupKinds] = { // name offset bits flags @@ -216,6 +234,11 @@ namespace { { "fixup_sparc_tls_le_lox10", 0, 0, 0 } }; + // Fixup kinds from .reloc directive are like R_SPARC_NONE. They do + // not require any extra processing. + if (Kind >= FirstLiteralRelocationKind) + return MCAsmBackend::getFixupKindInfo(FK_NONE); + if (Kind < FirstTargetFixupKind) return MCAsmBackend::getFixupKindInfo(Kind); @@ -229,6 +252,8 @@ namespace { bool shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup, const MCValue &Target) override { + if (Fixup.getKind() >= FirstLiteralRelocationKind) + return true; switch ((Sparc::Fixups)Fixup.getKind()) { default: return false; @@ -299,6 +324,8 @@ namespace { uint64_t Value, bool IsResolved, const MCSubtargetInfo *STI) const override { + if (Fixup.getKind() >= FirstLiteralRelocationKind) + return; Value = adjustFixupValue(Fixup.getKind(), Value); if (!Value) return; // Doesn't change encoding. diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp b/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp index bc508b45c3bd..02261dc5c4cd 100644 --- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp +++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp @@ -42,6 +42,9 @@ unsigned SparcELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target, const MCFixup &Fixup, bool IsPCRel) const { + MCFixupKind Kind = Fixup.getKind(); + if (Kind >= FirstLiteralRelocationKind) + return Kind - FirstLiteralRelocationKind; if (const SparcMCExpr *SExpr = dyn_cast<SparcMCExpr>(Fixup.getValue())) { if (SExpr->getKind() == SparcMCExpr::VK_Sparc_R_DISP32) @@ -68,6 +71,7 @@ unsigned SparcELFObjectWriter::getRelocType(MCContext &Ctx, switch(Fixup.getTargetKind()) { default: llvm_unreachable("Unimplemented fixup -> relocation"); + case FK_NONE: return ELF::R_SPARC_NONE; case FK_Data_1: return ELF::R_SPARC_8; case FK_Data_2: return ((Fixup.getOffset() % 2) ? ELF::R_SPARC_UA16 diff --git a/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp b/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp index ccc7d0737f53..610627e7e3f0 100644 --- a/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp +++ b/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp @@ -80,6 +80,88 @@ MachineBasicBlock::iterator SystemZFrameLowering::eliminateCallFramePseudoInstr( } } +namespace { +struct SZFrameSortingObj { + bool IsValid = false; // True if we care about this Object. + uint32_t ObjectIndex = 0; // Index of Object into MFI list. + uint64_t ObjectSize = 0; // Size of Object in bytes. + uint32_t D12Count = 0; // 12-bit displacement only. + uint32_t DPairCount = 0; // 12 or 20 bit displacement. +}; +typedef std::vector<SZFrameSortingObj> SZFrameObjVec; +} // namespace + +// TODO: Move to base class. 
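The SystemZ orderFrameObjects implementation that follows sorts frame objects by how densely they are used with short displacements. A minimal standalone sketch of the comparator trick it relies on, comparing Count/Size densities by cross-multiplying so no floating-point division is needed (names and values here are illustrative, not taken from the patch):

#include <algorithm>
#include <cstdint>
#include <vector>

struct Obj {
  uint64_t Size;  // object size in bytes
  uint32_t Count; // uses that need a short (12-bit) displacement
};

int main() {
  std::vector<Obj> Objs = {{16, 1}, {8, 3}, {4, 1}};
  // A.Count/A.Size < B.Count/B.Size  <=>  A.Count*B.Size < B.Count*A.Size,
  // so lower-density objects sort first and higher-density objects end up
  // later in the allocation order, as the comment further down in the hunk
  // explains.
  std::stable_sort(Objs.begin(), Objs.end(), [](const Obj &A, const Obj &B) {
    return uint64_t(A.Count) * B.Size < uint64_t(B.Count) * A.Size;
  });
  return 0;
}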
+void SystemZELFFrameLowering::orderFrameObjects( + const MachineFunction &MF, SmallVectorImpl<int> &ObjectsToAllocate) const { + const MachineFrameInfo &MFI = MF.getFrameInfo(); + const SystemZInstrInfo *TII = + static_cast<const SystemZInstrInfo *>(MF.getSubtarget().getInstrInfo()); + + // Make a vector of sorting objects to track all MFI objects and mark those + // to be sorted as valid. + if (ObjectsToAllocate.size() <= 1) + return; + SZFrameObjVec SortingObjects(MFI.getObjectIndexEnd()); + for (auto &Obj : ObjectsToAllocate) { + SortingObjects[Obj].IsValid = true; + SortingObjects[Obj].ObjectIndex = Obj; + SortingObjects[Obj].ObjectSize = MFI.getObjectSize(Obj); + } + + // Examine uses for each object and record short (12-bit) and "pair" + // displacement types. + for (auto &MBB : MF) + for (auto &MI : MBB) { + if (MI.isDebugInstr()) + continue; + for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) { + const MachineOperand &MO = MI.getOperand(I); + if (!MO.isFI()) + continue; + int Index = MO.getIndex(); + if (Index >= 0 && Index < MFI.getObjectIndexEnd() && + SortingObjects[Index].IsValid) { + if (TII->hasDisplacementPairInsn(MI.getOpcode())) + SortingObjects[Index].DPairCount++; + else if (!(MI.getDesc().TSFlags & SystemZII::Has20BitOffset)) + SortingObjects[Index].D12Count++; + } + } + } + + // Sort all objects for short/paired displacements, which should be + // sufficient as it seems like all frame objects typically are within the + // long displacement range. Sorting works by computing the "density" as + // Count / ObjectSize. The comparisons of two such fractions are refactored + // by multiplying both sides with A.ObjectSize * B.ObjectSize, in order to + // eliminate the (fp) divisions. A higher density object needs to go after + // in the list in order for it to end up lower on the stack. + auto CmpD12 = [](const SZFrameSortingObj &A, const SZFrameSortingObj &B) { + // Put all invalid and variable sized objects at the end. + if (!A.IsValid || !B.IsValid) + return A.IsValid; + if (!A.ObjectSize || !B.ObjectSize) + return A.ObjectSize > 0; + uint64_t ADensityCmp = A.D12Count * B.ObjectSize; + uint64_t BDensityCmp = B.D12Count * A.ObjectSize; + if (ADensityCmp != BDensityCmp) + return ADensityCmp < BDensityCmp; + return A.DPairCount * B.ObjectSize < B.DPairCount * A.ObjectSize; + }; + std::stable_sort(SortingObjects.begin(), SortingObjects.end(), CmpD12); + + // Now modify the original list to represent the final order that + // we want. + unsigned Idx = 0; + for (auto &Obj : SortingObjects) { + // All invalid items are sorted at the end, so it's safe to stop. + if (!Obj.IsValid) + break; + ObjectsToAllocate[Idx++] = Obj.ObjectIndex; + } +} + bool SystemZFrameLowering::hasReservedCallFrame( const MachineFunction &MF) const { // The ELF ABI requires us to allocate 160 bytes of stack space for the diff --git a/llvm/lib/Target/SystemZ/SystemZFrameLowering.h b/llvm/lib/Target/SystemZ/SystemZFrameLowering.h index 3a1af888d8f9..2b3d7efed53b 100644 --- a/llvm/lib/Target/SystemZ/SystemZFrameLowering.h +++ b/llvm/lib/Target/SystemZ/SystemZFrameLowering.h @@ -77,6 +77,9 @@ public: bool hasFP(const MachineFunction &MF) const override; StackOffset getFrameIndexReference(const MachineFunction &MF, int FI, Register &FrameReg) const override; + void + orderFrameObjects(const MachineFunction &MF, + SmallVectorImpl<int> &ObjectsToAllocate) const override; // Return the byte offset from the incoming stack pointer of Reg's // ABI-defined save slot. Return 0 if no slot is defined for Reg. 
Adjust diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.h b/llvm/lib/Target/SystemZ/SystemZISelLowering.h index a8ddb8c62d18..de446f33f5f1 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelLowering.h +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.h @@ -443,6 +443,11 @@ public: EVT VT) const override; bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize) const override; + bool ShouldShrinkFPConstant(EVT VT) const override { + // Do not shrink 64-bit FP constpool entries since LDEB is slower than + // LD, and having the full constant in memory enables reg/mem opcodes. + return VT != MVT::f64; + } bool hasInlineStackProbe(MachineFunction &MF) const override; bool isLegalICmpImmediate(int64_t Imm) const override; bool isLegalAddImmediate(int64_t Imm) const override; diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp index 6db9bf3056b7..4b6aa60f5d55 100644 --- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp +++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp @@ -1652,6 +1652,13 @@ unsigned SystemZInstrInfo::getOpcodeForOffset(unsigned Opcode, return 0; } +bool SystemZInstrInfo::hasDisplacementPairInsn(unsigned Opcode) const { + const MCInstrDesc &MCID = get(Opcode); + if (MCID.TSFlags & SystemZII::Has20BitOffset) + return SystemZ::getDisp12Opcode(Opcode) >= 0; + return SystemZ::getDisp20Opcode(Opcode) >= 0; +} + unsigned SystemZInstrInfo::getLoadAndTest(unsigned Opcode) const { switch (Opcode) { case SystemZ::L: return SystemZ::LT; diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.h b/llvm/lib/Target/SystemZ/SystemZInstrInfo.h index 396f56c7f59c..9e5b2729a707 100644 --- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.h +++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.h @@ -312,6 +312,9 @@ public: // exists. unsigned getOpcodeForOffset(unsigned Opcode, int64_t Offset) const; + // Return true if Opcode has a mapping in 12 <-> 20 bit displacements. + bool hasDisplacementPairInsn(unsigned Opcode) const; + // If Opcode is a load instruction that has a LOAD AND TEST form, // return the opcode for the testing form, otherwise return 0. unsigned getLoadAndTest(unsigned Opcode) const; diff --git a/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.cpp b/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.cpp index 0412e524f800..0f1655718481 100644 --- a/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.cpp +++ b/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.cpp @@ -167,3 +167,41 @@ wasm::ValType WebAssembly::regClassToValType(unsigned RC) { llvm_unreachable("unexpected type"); } } + +void WebAssembly::wasmSymbolSetType(MCSymbolWasm *Sym, const Type *GlobalVT, + const SmallVector<MVT, 1> &VTs) { + assert(!Sym->getType()); + + // Tables are represented as Arrays in LLVM IR therefore + // they reach this point as aggregate Array types with an element type + // that is a reference type. 
+ wasm::ValType Type; + bool IsTable = false; + if (GlobalVT->isArrayTy() && + WebAssembly::isRefType(GlobalVT->getArrayElementType())) { + MVT VT; + IsTable = true; + switch (GlobalVT->getArrayElementType()->getPointerAddressSpace()) { + case WebAssembly::WasmAddressSpace::WASM_ADDRESS_SPACE_FUNCREF: + VT = MVT::funcref; + break; + case WebAssembly::WasmAddressSpace::WASM_ADDRESS_SPACE_EXTERNREF: + VT = MVT::externref; + break; + default: + report_fatal_error("unhandled address space type"); + } + Type = WebAssembly::toValType(VT); + } else if (VTs.size() == 1) { + Type = WebAssembly::toValType(VTs[0]); + } else + report_fatal_error("Aggregate globals not yet implemented"); + + if (IsTable) { + Sym->setType(wasm::WASM_SYMBOL_TYPE_TABLE); + Sym->setTableType(Type); + } else { + Sym->setType(wasm::WASM_SYMBOL_TYPE_GLOBAL); + Sym->setGlobalType(wasm::WasmGlobalType{uint8_t(Type), /*Mutable=*/true}); + } +} diff --git a/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.h b/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.h index 042d51c7d6cb..cdb95d48398d 100644 --- a/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.h +++ b/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.h @@ -17,6 +17,8 @@ #include "llvm/ADT/Optional.h" #include "llvm/BinaryFormat/Wasm.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/MC/MCSymbolWasm.h" #include "llvm/Support/MachineValueType.h" namespace llvm { @@ -41,6 +43,43 @@ enum class BlockType : unsigned { Multivalue = 0xffff, }; +enum WasmAddressSpace : unsigned { + // Default address space, for pointers to linear memory (stack, heap, data). + WASM_ADDRESS_SPACE_DEFAULT = 0, + // A non-integral address space for pointers to named objects outside of + // linear memory: WebAssembly globals or WebAssembly locals. Loads and stores + // to these pointers are lowered to global.get / global.set or local.get / + // local.set, as appropriate. + WASM_ADDRESS_SPACE_VAR = 1, + // A non-integral address space for externref values + WASM_ADDRESS_SPACE_EXTERNREF = 10, + // A non-integral address space for funcref values + WASM_ADDRESS_SPACE_FUNCREF = 20, +}; + +inline bool isDefaultAddressSpace(unsigned AS) { + return AS == WASM_ADDRESS_SPACE_DEFAULT; +} +inline bool isWasmVarAddressSpace(unsigned AS) { + return AS == WASM_ADDRESS_SPACE_VAR; +} +inline bool isValidAddressSpace(unsigned AS) { + return isDefaultAddressSpace(AS) || isWasmVarAddressSpace(AS); +} +inline bool isFuncrefType(const Type *Ty) { + return isa<PointerType>(Ty) && + Ty->getPointerAddressSpace() == + WasmAddressSpace::WASM_ADDRESS_SPACE_FUNCREF; +} +inline bool isExternrefType(const Type *Ty) { + return isa<PointerType>(Ty) && + Ty->getPointerAddressSpace() == + WasmAddressSpace::WASM_ADDRESS_SPACE_EXTERNREF; +} +inline bool isRefType(const Type *Ty) { + return isFuncrefType(Ty) || isExternrefType(Ty); +} + // Convert StringRef to ValType / HealType / BlockType Optional<wasm::ValType> parseType(StringRef Type); @@ -68,6 +107,10 @@ wasm::ValType toValType(MVT Type); // Convert a register class to a wasm ValType. wasm::ValType regClassToValType(unsigned RC); +/// Sets a Wasm Symbol Type. 
+void wasmSymbolSetType(MCSymbolWasm *Sym, const Type *GlobalVT, + const SmallVector<MVT, 1> &VTs); + } // end namespace WebAssembly } // end namespace llvm diff --git a/llvm/lib/Target/WebAssembly/Utils/WebAssemblyUtilities.h b/llvm/lib/Target/WebAssembly/Utils/WebAssemblyUtilities.h index 57e40f6cd8d7..cdfc758db7ac 100644 --- a/llvm/lib/Target/WebAssembly/Utils/WebAssemblyUtilities.h +++ b/llvm/lib/Target/WebAssembly/Utils/WebAssemblyUtilities.h @@ -15,7 +15,6 @@ #ifndef LLVM_LIB_TARGET_WEBASSEMBLY_UTILS_WEBASSEMBLYUTILITIES_H #define LLVM_LIB_TARGET_WEBASSEMBLY_UTILS_WEBASSEMBLYUTILITIES_H -#include "llvm/IR/DerivedTypes.h" #include "llvm/Support/CommandLine.h" namespace llvm { @@ -30,43 +29,6 @@ class WebAssemblySubtarget; namespace WebAssembly { -enum WasmAddressSpace : unsigned { - // Default address space, for pointers to linear memory (stack, heap, data). - WASM_ADDRESS_SPACE_DEFAULT = 0, - // A non-integral address space for pointers to named objects outside of - // linear memory: WebAssembly globals or WebAssembly locals. Loads and stores - // to these pointers are lowered to global.get / global.set or local.get / - // local.set, as appropriate. - WASM_ADDRESS_SPACE_VAR = 1, - // A non-integral address space for externref values - WASM_ADDRESS_SPACE_EXTERNREF = 10, - // A non-integral address space for funcref values - WASM_ADDRESS_SPACE_FUNCREF = 20, -}; - -inline bool isDefaultAddressSpace(unsigned AS) { - return AS == WASM_ADDRESS_SPACE_DEFAULT; -} -inline bool isWasmVarAddressSpace(unsigned AS) { - return AS == WASM_ADDRESS_SPACE_VAR; -} -inline bool isValidAddressSpace(unsigned AS) { - return isDefaultAddressSpace(AS) || isWasmVarAddressSpace(AS); -} -inline bool isFuncrefType(const Type *Ty) { - return isa<PointerType>(Ty) && - Ty->getPointerAddressSpace() == - WasmAddressSpace::WASM_ADDRESS_SPACE_FUNCREF; -} -inline bool isExternrefType(const Type *Ty) { - return isa<PointerType>(Ty) && - Ty->getPointerAddressSpace() == - WasmAddressSpace::WASM_ADDRESS_SPACE_EXTERNREF; -} -inline bool isRefType(const Type *Ty) { - return isFuncrefType(Ty) || isExternrefType(Ty); -} - bool isChild(const MachineInstr &MI, const WebAssemblyFunctionInfo &MFI); bool mayThrow(const MachineInstr &MI); diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp index e3af6b2662ef..bf326e5106be 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp @@ -181,17 +181,11 @@ void WebAssemblyAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) { if (!Sym->getType()) { const WebAssemblyTargetLowering &TLI = *Subtarget->getTargetLowering(); - SmallVector<EVT, 1> VTs; - ComputeValueVTs(TLI, GV->getParent()->getDataLayout(), GV->getValueType(), - VTs); - if (VTs.size() != 1 || - TLI.getNumRegisters(GV->getParent()->getContext(), VTs[0]) != 1) - report_fatal_error("Aggregate globals not yet implemented"); - MVT VT = TLI.getRegisterType(GV->getParent()->getContext(), VTs[0]); - bool Mutable = true; - wasm::ValType Type = WebAssembly::toValType(VT); - Sym->setType(wasm::WASM_SYMBOL_TYPE_GLOBAL); - Sym->setGlobalType(wasm::WasmGlobalType{uint8_t(Type), Mutable}); + SmallVector<MVT, 1> VTs; + Type *GlobalVT = GV->getValueType(); + computeLegalValueVTs(TLI, GV->getParent()->getContext(), + GV->getParent()->getDataLayout(), GlobalVT, VTs); + WebAssembly::wasmSymbolSetType(Sym, GlobalVT, VTs); } // If the GlobalVariable refers to a table, we handle it here instead of diff --git 
a/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp index 406edef8ff3f..8ddd414b043a 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp @@ -16,6 +16,7 @@ //===----------------------------------------------------------------------===// #include "MCTargetDesc/WebAssemblyMCTargetDesc.h" +#include "Utils/WebAssemblyTypeUtilities.h" #include "Utils/WebAssemblyUtilities.h" #include "WebAssembly.h" #include "WebAssemblyMachineFunctionInfo.h" diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp index c45f7d7176b5..01baa3d9389d 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp @@ -19,7 +19,7 @@ #include "WebAssemblyFrameLowering.h" #include "MCTargetDesc/WebAssemblyMCTargetDesc.h" -#include "Utils/WebAssemblyUtilities.h" +#include "Utils/WebAssemblyTypeUtilities.h" #include "WebAssembly.h" #include "WebAssemblyInstrInfo.h" #include "WebAssemblyMachineFunctionInfo.h" diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp index fe656753889f..b6c43be03aba 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp @@ -560,6 +560,9 @@ Value *WebAssemblyLowerEmscriptenEHSjLj::wrapInvoke(CallBase *CI) { NEltArg = NEltArg.getValue() + 1; FnAttrs.addAllocSizeAttr(SizeArg, NEltArg); } + // In case the callee has 'noreturn' attribute, We need to remove it, because + // we expect invoke wrappers to return. + FnAttrs.removeAttribute(Attribute::NoReturn); // Reconstruct the AttributesList based on the vector we constructed. AttributeList NewCallAL = AttributeList::get( @@ -630,9 +633,9 @@ static bool canLongjmp(const Value *Callee) { // Exception-catching related functions // - // We intentionally excluded __cxa_end_catch here even though it surely cannot - // longjmp, in order to maintain the unwind relationship from all existing - // catchpads (and calls within them) to catch.dispatch.longjmp. + // We intentionally treat __cxa_end_catch longjmpable in Wasm SjLj even though + // it surely cannot longjmp, in order to maintain the unwind relationship from + // all existing catchpads (and calls within them) to catch.dispatch.longjmp. // // In Wasm EH + Wasm SjLj, we // 1. Make all catchswitch and cleanuppad that unwind to caller unwind to @@ -663,6 +666,8 @@ static bool canLongjmp(const Value *Callee) { // // The comment block in findWasmUnwindDestinations() in // SelectionDAGBuilder.cpp is addressing a similar problem. + if (CalleeName == "__cxa_end_catch") + return WebAssembly::WasmEnableSjLj; if (CalleeName == "__cxa_begin_catch" || CalleeName == "__cxa_allocate_exception" || CalleeName == "__cxa_throw" || CalleeName == "__clang_call_terminate") @@ -869,15 +874,17 @@ static void nullifySetjmp(Function *F) { Function *SetjmpF = M.getFunction("setjmp"); SmallVector<Instruction *, 1> ToErase; - for (User *U : SetjmpF->users()) { - auto *CI = dyn_cast<CallInst>(U); - // FIXME 'invoke' to setjmp can happen when we use Wasm EH + Wasm SjLj, but - // we don't support two being used together yet. 
- if (!CI) - report_fatal_error("Wasm EH + Wasm SjLj is not fully supported yet"); - BasicBlock *BB = CI->getParent(); + for (User *U : make_early_inc_range(SetjmpF->users())) { + auto *CB = cast<CallBase>(U); + BasicBlock *BB = CB->getParent(); if (BB->getParent() != F) // in other function continue; + CallInst *CI = nullptr; + // setjmp cannot throw. So if it is an invoke, lower it to a call + if (auto *II = dyn_cast<InvokeInst>(CB)) + CI = llvm::changeToCall(II); + else + CI = cast<CallInst>(CB); ToErase.push_back(CI); CI->replaceAllUsesWith(IRB.getInt32(0)); } @@ -1313,10 +1320,13 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runSjLjOnFunction(Function &F) { SmallVector<PHINode *, 4> SetjmpRetPHIs; Function *SetjmpF = M.getFunction("setjmp"); for (auto *U : make_early_inc_range(SetjmpF->users())) { - auto *CB = dyn_cast<CallBase>(U); + auto *CB = cast<CallBase>(U); BasicBlock *BB = CB->getParent(); if (BB->getParent() != &F) // in other function continue; + if (CB->getOperandBundle(LLVMContext::OB_funclet)) + report_fatal_error( + "setjmp within a catch clause is not supported in Wasm EH"); CallInst *CI = nullptr; // setjmp cannot throw. So if it is an invoke, lower it to a call @@ -1815,10 +1825,10 @@ void WebAssemblyLowerEmscriptenEHSjLj::handleLongjmpableCallsForWasmSjLj( BasicBlock *UnwindDest = nullptr; if (auto Bundle = CI->getOperandBundle(LLVMContext::OB_funclet)) { Instruction *FromPad = cast<Instruction>(Bundle->Inputs[0]); - while (!UnwindDest && FromPad) { + while (!UnwindDest) { if (auto *CPI = dyn_cast<CatchPadInst>(FromPad)) { UnwindDest = CPI->getCatchSwitch()->getUnwindDest(); - FromPad = nullptr; // stop searching + break; } else if (auto *CPI = dyn_cast<CleanupPadInst>(FromPad)) { // getCleanupRetUnwindDest() can return nullptr when // 1. This cleanuppad's matching cleanupret uwninds to caller @@ -1826,7 +1836,10 @@ void WebAssemblyLowerEmscriptenEHSjLj::handleLongjmpableCallsForWasmSjLj( // unreachable. // In case of 2, we need to traverse the parent pad chain. UnwindDest = getCleanupRetUnwindDest(CPI); - FromPad = cast<Instruction>(CPI->getParentPad()); + Value *ParentPad = CPI->getParentPad(); + if (isa<ConstantTokenNone>(ParentPad)) + break; + FromPad = cast<Instruction>(ParentPad); } } } diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyLowerRefTypesIntPtrConv.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyLowerRefTypesIntPtrConv.cpp index 8ff916c28c4e..6fd87f10150d 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyLowerRefTypesIntPtrConv.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyLowerRefTypesIntPtrConv.cpp @@ -14,7 +14,7 @@ /// //===----------------------------------------------------------------------===// -#include "Utils/WebAssemblyUtilities.h" +#include "Utils/WebAssemblyTypeUtilities.h" #include "WebAssembly.h" #include "WebAssemblySubtarget.h" #include "llvm/IR/InstIterator.h" diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp index 09bccef17ab0..2e6027a5605c 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp @@ -59,39 +59,7 @@ WebAssemblyMCInstLower::GetGlobalAddressSymbol(const MachineOperand &MO) const { SmallVector<MVT, 1> VTs; computeLegalValueVTs(CurrentFunc, TM, GlobalVT, VTs); - // Tables are represented as Arrays in LLVM IR therefore - // they reach this point as aggregate Array types with an element type - // that is a reference type. 
- wasm::ValType Type; - bool IsTable = false; - if (GlobalVT->isArrayTy() && - WebAssembly::isRefType(GlobalVT->getArrayElementType())) { - MVT VT; - IsTable = true; - switch (GlobalVT->getArrayElementType()->getPointerAddressSpace()) { - case WebAssembly::WasmAddressSpace::WASM_ADDRESS_SPACE_FUNCREF: - VT = MVT::funcref; - break; - case WebAssembly::WasmAddressSpace::WASM_ADDRESS_SPACE_EXTERNREF: - VT = MVT::externref; - break; - default: - report_fatal_error("unhandled address space type"); - } - Type = WebAssembly::toValType(VT); - } else if (VTs.size() == 1) { - Type = WebAssembly::toValType(VTs[0]); - } else - report_fatal_error("Aggregate globals not yet implemented"); - - if (IsTable) { - WasmSym->setType(wasm::WASM_SYMBOL_TYPE_TABLE); - WasmSym->setTableType(Type); - } else { - WasmSym->setType(wasm::WASM_SYMBOL_TYPE_GLOBAL); - WasmSym->setGlobalType( - wasm::WasmGlobalType{uint8_t(Type), /*Mutable=*/true}); - } + WebAssembly::wasmSymbolSetType(WasmSym, GlobalVT, VTs); } return WasmSym; } diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp index 00b11321fdb2..ea80e96d50de 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp @@ -30,22 +30,28 @@ void WebAssemblyFunctionInfo::initWARegs(MachineRegisterInfo &MRI) { WARegs.resize(MRI.getNumVirtRegs(), Reg); } -void llvm::computeLegalValueVTs(const Function &F, const TargetMachine &TM, +void llvm::computeLegalValueVTs(const WebAssemblyTargetLowering &TLI, + LLVMContext &Ctx, const DataLayout &DL, Type *Ty, SmallVectorImpl<MVT> &ValueVTs) { - const DataLayout &DL(F.getParent()->getDataLayout()); - const WebAssemblyTargetLowering &TLI = - *TM.getSubtarget<WebAssemblySubtarget>(F).getTargetLowering(); SmallVector<EVT, 4> VTs; ComputeValueVTs(TLI, DL, Ty, VTs); for (EVT VT : VTs) { - unsigned NumRegs = TLI.getNumRegisters(F.getContext(), VT); - MVT RegisterVT = TLI.getRegisterType(F.getContext(), VT); + unsigned NumRegs = TLI.getNumRegisters(Ctx, VT); + MVT RegisterVT = TLI.getRegisterType(Ctx, VT); for (unsigned I = 0; I != NumRegs; ++I) ValueVTs.push_back(RegisterVT); } } +void llvm::computeLegalValueVTs(const Function &F, const TargetMachine &TM, + Type *Ty, SmallVectorImpl<MVT> &ValueVTs) { + const DataLayout &DL(F.getParent()->getDataLayout()); + const WebAssemblyTargetLowering &TLI = + *TM.getSubtarget<WebAssemblySubtarget>(F).getTargetLowering(); + computeLegalValueVTs(TLI, F.getContext(), DL, Ty, ValueVTs); +} + void llvm::computeSignatureVTs(const FunctionType *Ty, const Function *TargetFunc, const Function &ContextFunc, diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h b/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h index 3fa2d0c8a2f2..413d0d1dc554 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h +++ b/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h @@ -166,6 +166,10 @@ public: void setWasmEHFuncInfo(WasmEHFuncInfo *Info) { WasmEHInfo = Info; } }; +void computeLegalValueVTs(const WebAssemblyTargetLowering &TLI, + LLVMContext &Ctx, const DataLayout &DL, Type *Ty, + SmallVectorImpl<MVT> &ValueVTs); + void computeLegalValueVTs(const Function &F, const TargetMachine &TM, Type *Ty, SmallVectorImpl<MVT> &ValueVTs); diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index aff72452af6c..90753b5b4d33 100644 --- 
a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -805,8 +805,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FMAXNUM, MVT::f80, Expand); // Some FP actions are always expanded for vector types. - for (auto VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32, - MVT::v2f64, MVT::v4f64, MVT::v8f64 }) { + for (auto VT : { MVT::v8f16, MVT::v16f16, MVT::v32f16, + MVT::v4f32, MVT::v8f32, MVT::v16f32, + MVT::v2f64, MVT::v4f64, MVT::v8f64 }) { setOperationAction(ISD::FSIN, VT, Expand); setOperationAction(ISD::FSINCOS, VT, Expand); setOperationAction(ISD::FCOS, VT, Expand); @@ -1094,13 +1095,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, if (VT == MVT::v2i64) continue; setOperationAction(ISD::ROTL, VT, Custom); setOperationAction(ISD::ROTR, VT, Custom); + setOperationAction(ISD::FSHL, VT, Custom); + setOperationAction(ISD::FSHR, VT, Custom); } - setOperationAction(ISD::FSHL, MVT::v16i8, Custom); - setOperationAction(ISD::FSHR, MVT::v16i8, Custom); - setOperationAction(ISD::FSHL, MVT::v4i32, Custom); - setOperationAction(ISD::FSHR, MVT::v4i32, Custom); - setOperationAction(ISD::STRICT_FSQRT, MVT::v2f64, Legal); setOperationAction(ISD::STRICT_FADD, MVT::v2f64, Legal); setOperationAction(ISD::STRICT_FSUB, MVT::v2f64, Legal); @@ -1958,6 +1956,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // AVX512_FP16 scalar operations setGroup(MVT::f16); addRegisterClass(MVT::f16, &X86::FR16XRegClass); + setOperationAction(ISD::FREM, MVT::f16, Promote); + setOperationAction(ISD::STRICT_FREM, MVT::f16, Promote); setOperationAction(ISD::SELECT_CC, MVT::f16, Expand); setOperationAction(ISD::BR_CC, MVT::f16, Expand); setOperationAction(ISD::SETCC, MVT::f16, Custom); @@ -12571,6 +12571,8 @@ static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, if (ForceV2Zero) V2 = getZeroVector(VT, Subtarget, DAG, DL); + unsigned NumElts = VT.getVectorNumElements(); + switch (VT.SimpleTy) { case MVT::v4i64: case MVT::v8i32: @@ -12629,8 +12631,7 @@ static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, return Masked; if (Subtarget.hasBWI() && Subtarget.hasVLX()) { - MVT IntegerType = - MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8)); + MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8)); SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType); return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG); } @@ -12699,8 +12700,7 @@ static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, // Otherwise load an immediate into a GPR, cast to k-register, and use a // masked move. - MVT IntegerType = - MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8)); + MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8)); SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType); return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG); } @@ -29843,7 +29843,8 @@ static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget, {Op0, Op1, Amt}, DAG, Subtarget); } assert((VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8 || - VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) && + VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v8i32 || + VT == MVT::v16i32) && "Unexpected funnel shift type!"); // fshl(x,y,z) -> unpack(y,x) << (z & (bw-1))) >> bw. 
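The context line above ends with the identity the X86 funnel-shift lowering relies on: fshl(x,y,z) is the high half of the double-width value x:y shifted left by z modulo the bit width. A small self-contained check of that identity for 16-bit elements (illustrative only, not LLVM code):

#include <cassert>
#include <cstdint>

// Reference semantics of fshl for 16-bit values.
static uint16_t FshlRef(uint16_t X, uint16_t Y, unsigned Z) {
  Z &= 15;
  return Z ? uint16_t((X << Z) | (Y >> (16 - Z))) : X;
}

// "unpack(y,x) << (z & (bw-1)) >> bw": widen to X:Y, shift, keep the top half.
static uint16_t FshlViaUnpack(uint16_t X, uint16_t Y, unsigned Z) {
  uint32_t Wide = (uint32_t(X) << 16) | Y;
  return uint16_t((Wide << (Z & 15)) >> 16);
}

int main() {
  for (unsigned Z = 0; Z < 32; ++Z)
    assert(FshlRef(0xBEEF, 0x1234, Z) == FshlViaUnpack(0xBEEF, 0x1234, Z));
  return 0;
}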
@@ -29855,6 +29856,10 @@ static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget, SDValue AmtMod = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask); bool IsCst = ISD::isBuildVectorOfConstantSDNodes(AmtMod.getNode()); + // Constant vXi16 funnel shifts can be efficiently handled by default. + if (IsCst && EltSizeInBits == 16) + return SDValue(); + unsigned ShiftOpc = IsFSHR ? ISD::SRL : ISD::SHL; unsigned NumElts = VT.getVectorNumElements(); MVT ExtSVT = MVT::getIntegerVT(2 * EltSizeInBits); @@ -29874,6 +29879,10 @@ static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget, // Attempt to fold scalar shift as unpack(y,x) << zext(splat(z)) if (supportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, ShiftOpc)) { if (SDValue ScalarAmt = DAG.getSplatValue(AmtMod)) { + // Uniform vXi16 funnel shifts can be efficiently handled by default. + if (EltSizeInBits == 16) + return SDValue(); + SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, Op1, Op0)); SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, Op1, Op0)); ScalarAmt = DAG.getZExtOrTrunc(ScalarAmt, DL, MVT::i32); @@ -29912,7 +29921,7 @@ static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget, } // Attempt to fold per-element (ExtVT) shift as unpack(y,x) << zext(z) - if ((IsCst && !IsFSHR && EltSizeInBits == 8) || + if (((IsCst || !Subtarget.hasAVX512()) && !IsFSHR && EltSizeInBits <= 16) || supportedVectorVarShift(ExtVT, Subtarget, ShiftOpc)) { SDValue Z = DAG.getConstant(0, DL, VT); SDValue RLo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, Op1, Op0)); @@ -36477,9 +36486,8 @@ static SDValue narrowLoadToVZLoad(LoadSDNode *LN, MVT MemVT, MVT VT, // TODO: Investigate sharing more of this with shuffle lowering. static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask, bool AllowFloatDomain, bool AllowIntDomain, - SDValue &V1, const SDLoc &DL, SelectionDAG &DAG, - const X86Subtarget &Subtarget, unsigned &Shuffle, - MVT &SrcVT, MVT &DstVT) { + SDValue V1, const X86Subtarget &Subtarget, + unsigned &Shuffle, MVT &SrcVT, MVT &DstVT) { unsigned NumMaskElts = Mask.size(); unsigned MaskEltSize = MaskVT.getScalarSizeInBits(); @@ -36522,9 +36530,6 @@ static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask, MVT::getIntegerVT(MaskEltSize); SrcVT = MVT::getVectorVT(ScalarTy, SrcSize / MaskEltSize); - if (SrcVT.getSizeInBits() != MaskVT.getSizeInBits()) - V1 = extractSubVector(V1, 0, DAG, DL, SrcSize); - Shuffle = unsigned(MatchAny ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND); if (SrcVT.getVectorNumElements() != NumDstElts) Shuffle = getOpcode_EXTEND_VECTOR_INREG(Shuffle); @@ -37102,6 +37107,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, assert((Inputs.size() == 1 || Inputs.size() == 2) && "Unexpected number of shuffle inputs!"); + SDLoc DL(Root); MVT RootVT = Root.getSimpleValueType(); unsigned RootSizeInBits = RootVT.getSizeInBits(); unsigned NumRootElts = RootVT.getVectorNumElements(); @@ -37109,6 +37115,8 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, // Canonicalize shuffle input op to the requested type. // TODO: Support cases where Op is smaller than VT. 
auto CanonicalizeShuffleInput = [&](MVT VT, SDValue Op) { + if (VT.getSizeInBits() < Op.getValueSizeInBits()) + Op = extractSubVector(Op, 0, DAG, DL, VT.getSizeInBits()); return DAG.getBitcast(VT, Op); }; @@ -37124,7 +37132,6 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, assert(VT1.getSizeInBits() == RootSizeInBits && VT2.getSizeInBits() == RootSizeInBits && "Vector size mismatch"); - SDLoc DL(Root); SDValue Res; unsigned NumBaseMaskElts = BaseMask.size(); @@ -37393,15 +37400,13 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, } } - SDValue NewV1 = V1; // Save operand in case early exit happens. - if (matchUnaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1, - DL, DAG, Subtarget, Shuffle, ShuffleSrcVT, - ShuffleVT) && + if (matchUnaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, V1, + Subtarget, Shuffle, ShuffleSrcVT, ShuffleVT) && (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) { if (Depth == 0 && Root.getOpcode() == Shuffle) return SDValue(); // Nothing to do! - Res = CanonicalizeShuffleInput(ShuffleSrcVT, NewV1); + Res = CanonicalizeShuffleInput(ShuffleSrcVT, V1); Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res); return DAG.getBitcast(RootVT, Res); } @@ -40903,6 +40908,28 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode( Known.One.setHighBits(ShAmt); return false; } + case X86ISD::BLENDV: { + SDValue Sel = Op.getOperand(0); + SDValue LHS = Op.getOperand(1); + SDValue RHS = Op.getOperand(2); + + APInt SignMask = APInt::getSignMask(BitWidth); + SDValue NewSel = SimplifyMultipleUseDemandedBits( + Sel, SignMask, OriginalDemandedElts, TLO.DAG, Depth + 1); + SDValue NewLHS = SimplifyMultipleUseDemandedBits( + LHS, OriginalDemandedBits, OriginalDemandedElts, TLO.DAG, Depth + 1); + SDValue NewRHS = SimplifyMultipleUseDemandedBits( + RHS, OriginalDemandedBits, OriginalDemandedElts, TLO.DAG, Depth + 1); + + if (NewSel || NewLHS || NewRHS) { + NewSel = NewSel ? NewSel : Sel; + NewLHS = NewLHS ? NewLHS : LHS; + NewRHS = NewRHS ? NewRHS : RHS; + return TLO.CombineTo(Op, TLO.DAG.getNode(X86ISD::BLENDV, SDLoc(Op), VT, + NewSel, NewLHS, NewRHS)); + } + break; + } case X86ISD::PEXTRB: case X86ISD::PEXTRW: { SDValue Vec = Op.getOperand(0); @@ -41043,6 +41070,13 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode( if (OriginalDemandedBits.countTrailingZeros() >= NumElts) return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT)); + // See if we only demand bits from the lower 128-bit vector. + if (SrcVT.is256BitVector() && + OriginalDemandedBits.getActiveBits() <= (NumElts / 2)) { + SDValue NewSrc = extract128BitVector(Src, 0, TLO.DAG, SDLoc(Src)); + return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc)); + } + // Only demand the vector elements of the sign bits we need. APInt KnownUndef, KnownZero; APInt DemandedElts = OriginalDemandedBits.zextOrTrunc(NumElts); @@ -42238,19 +42272,14 @@ static SDValue combinePredicateReduction(SDNode *Extract, SelectionDAG &DAG, EVT MovmskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts); Movmsk = DAG.getBitcast(MovmskVT, Match); } else { - // For all_of(setcc(x,y,eq)) - // - avoid vXi64 comparisons without PCMPEQQ (SSE41+), use PCMPEQD. - // - avoid vXi16 comparisons, use PMOVMSKB(PCMPEQB()). + // For all_of(setcc(x,y,eq)) - use PMOVMSKB(PCMPEQB()). 
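  // (Illustrative note, not part of the patch: this is valid because two
  // lanes of any element width compare equal exactly when all of their bytes
  // compare equal, so the original SETEQ can be redone on the same bits as a
  // byte-wise SETEQ and the all_of reduction checked with PMOVMSKB against an
  // all-ones mask.)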
if (BinOp == ISD::AND && Match.getOpcode() == ISD::SETCC && cast<CondCodeSDNode>(Match.getOperand(2))->get() == ISD::CondCode::SETEQ) { - SDValue Vec = Match.getOperand(0); - EVT VecSVT = Vec.getValueType().getScalarType(); - if ((VecSVT == MVT::i16 && !Subtarget.hasBWI()) || - (VecSVT == MVT::i64 && !Subtarget.hasSSE41())) { - NumElts *= 2; - VecSVT = VecSVT.getHalfSizedIntegerVT(*DAG.getContext()); - EVT CmpVT = EVT::getVectorVT(*DAG.getContext(), VecSVT, NumElts); + EVT VecSVT = Match.getOperand(0).getValueType().getScalarType(); + if (VecSVT != MVT::i8) { + NumElts *= VecSVT.getSizeInBits() / 8; + EVT CmpVT = EVT::getVectorVT(*DAG.getContext(), MVT::i8, NumElts); MatchVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts); Match = DAG.getSetCC( DL, MatchVT, DAG.getBitcast(CmpVT, Match.getOperand(0)), @@ -43079,6 +43108,38 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG, } } + // If this extract is from a loaded vector value and will be used as an + // integer, that requires a potentially expensive XMM -> GPR transfer. + // Additionally, if we can convert to a scalar integer load, that will likely + // be folded into a subsequent integer op. + // Note: Unlike the related fold for this in DAGCombiner, this is not limited + // to a single-use of the loaded vector. For the reasons above, we + // expect this to be profitable even if it creates an extra load. + bool LikelyUsedAsVector = any_of(N->uses(), [](SDNode *Use) { + return Use->getOpcode() == ISD::STORE || + Use->getOpcode() == ISD::INSERT_VECTOR_ELT || + Use->getOpcode() == ISD::SCALAR_TO_VECTOR; + }); + auto *LoadVec = dyn_cast<LoadSDNode>(InputVector); + if (LoadVec && CIdx && ISD::isNormalLoad(LoadVec) && VT.isInteger() && + SrcVT.getVectorElementType() == VT && DCI.isAfterLegalizeDAG() && + !LikelyUsedAsVector) { + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + SDValue NewPtr = + TLI.getVectorElementPointer(DAG, LoadVec->getBasePtr(), SrcVT, EltIdx); + unsigned PtrOff = VT.getSizeInBits() * CIdx->getZExtValue() / 8; + MachinePointerInfo MPI = LoadVec->getPointerInfo().getWithOffset(PtrOff); + Align Alignment = commonAlignment(LoadVec->getAlign(), PtrOff); + SDValue Load = + DAG.getLoad(VT, dl, LoadVec->getChain(), NewPtr, MPI, Alignment, + LoadVec->getMemOperand()->getFlags(), LoadVec->getAAInfo()); + SDValue Chain = Load.getValue(1); + SDValue From[] = {SDValue(N, 0), SDValue(LoadVec, 1)}; + SDValue To[] = {Load, Chain}; + DAG.ReplaceAllUsesOfValuesWith(From, To, 2); + return SDValue(N, 0); + } + return SDValue(); } @@ -44467,8 +44528,8 @@ static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC, unsigned NumEltBits = VecVT.getScalarSizeInBits(); bool IsAnyOf = CmpOpcode == X86ISD::CMP && CmpVal.isZero(); - bool IsAllOf = CmpOpcode == X86ISD::SUB && NumElts <= CmpBits && - CmpVal.isMask(NumElts); + bool IsAllOf = (CmpOpcode == X86ISD::SUB || CmpOpcode == X86ISD::CMP) && + NumElts <= CmpBits && CmpVal.isMask(NumElts); if (!IsAnyOf && !IsAllOf) return SDValue(); @@ -44500,14 +44561,16 @@ static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC, // MOVMSK(CONCAT(X,Y)) != 0 -> MOVMSK(OR(X,Y)). // MOVMSK(CONCAT(X,Y)) == -1 -> MOVMSK(AND(X,Y)). // MOVMSK(CONCAT(X,Y)) != -1 -> MOVMSK(AND(X,Y)). 
- if (VecVT.is256BitVector()) { + if (VecVT.is256BitVector() && NumElts <= CmpBits) { SmallVector<SDValue> Ops; if (collectConcatOps(peekThroughBitcasts(Vec).getNode(), Ops) && Ops.size() == 2) { SDLoc DL(EFLAGS); - EVT SubVT = Ops[0].getValueType(); + EVT SubVT = Ops[0].getValueType().changeTypeToInteger(); APInt CmpMask = APInt::getLowBitsSet(32, IsAnyOf ? 0 : NumElts / 2); - SDValue V = DAG.getNode(IsAnyOf ? ISD::OR : ISD::AND, DL, SubVT, Ops); + SDValue V = DAG.getNode(IsAnyOf ? ISD::OR : ISD::AND, DL, SubVT, + DAG.getBitcast(SubVT, Ops[0]), + DAG.getBitcast(SubVT, Ops[1])); V = DAG.getBitcast(VecVT.getHalfNumVectorElementsVT(), V); return DAG.getNode(X86ISD::CMP, DL, MVT::i32, DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V), @@ -44522,26 +44585,29 @@ static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC, if (IsAllOf && Subtarget.hasSSE41()) { MVT TestVT = VecVT.is128BitVector() ? MVT::v2i64 : MVT::v4i64; SDValue BC = peekThroughBitcasts(Vec); - if (BC.getOpcode() == X86ISD::PCMPEQ) { - SDValue V = DAG.getNode(ISD::SUB, SDLoc(BC), BC.getValueType(), - BC.getOperand(0), BC.getOperand(1)); - V = DAG.getBitcast(TestVT, V); - return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V); - } - // Check for 256-bit split vector cases. - if (BC.getOpcode() == ISD::AND && - BC.getOperand(0).getOpcode() == X86ISD::PCMPEQ && - BC.getOperand(1).getOpcode() == X86ISD::PCMPEQ) { - SDValue LHS = BC.getOperand(0); - SDValue RHS = BC.getOperand(1); - LHS = DAG.getNode(ISD::SUB, SDLoc(LHS), LHS.getValueType(), - LHS.getOperand(0), LHS.getOperand(1)); - RHS = DAG.getNode(ISD::SUB, SDLoc(RHS), RHS.getValueType(), - RHS.getOperand(0), RHS.getOperand(1)); - LHS = DAG.getBitcast(TestVT, LHS); - RHS = DAG.getBitcast(TestVT, RHS); - SDValue V = DAG.getNode(ISD::OR, SDLoc(EFLAGS), TestVT, LHS, RHS); - return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V); + // Ensure MOVMSK was testing every signbit of BC. + if (BC.getValueType().getVectorNumElements() <= NumElts) { + if (BC.getOpcode() == X86ISD::PCMPEQ) { + SDValue V = DAG.getNode(ISD::SUB, SDLoc(BC), BC.getValueType(), + BC.getOperand(0), BC.getOperand(1)); + V = DAG.getBitcast(TestVT, V); + return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V); + } + // Check for 256-bit split vector cases. 
+ if (BC.getOpcode() == ISD::AND && + BC.getOperand(0).getOpcode() == X86ISD::PCMPEQ && + BC.getOperand(1).getOpcode() == X86ISD::PCMPEQ) { + SDValue LHS = BC.getOperand(0); + SDValue RHS = BC.getOperand(1); + LHS = DAG.getNode(ISD::SUB, SDLoc(LHS), LHS.getValueType(), + LHS.getOperand(0), LHS.getOperand(1)); + RHS = DAG.getNode(ISD::SUB, SDLoc(RHS), RHS.getValueType(), + RHS.getOperand(0), RHS.getOperand(1)); + LHS = DAG.getBitcast(TestVT, LHS); + RHS = DAG.getBitcast(TestVT, RHS); + SDValue V = DAG.getNode(ISD::OR, SDLoc(EFLAGS), TestVT, LHS, RHS); + return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V); + } } } @@ -44575,7 +44641,8 @@ static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC, if (SDValue Src = getSplitVectorSrc(VecOp0, VecOp1, true)) { SDLoc DL(EFLAGS); SDValue Result = peekThroughBitcasts(Src); - if (IsAllOf && Result.getOpcode() == X86ISD::PCMPEQ) { + if (IsAllOf && Result.getOpcode() == X86ISD::PCMPEQ && + Result.getValueType().getVectorNumElements() <= NumElts) { SDValue V = DAG.getNode(ISD::SUB, DL, Result.getValueType(), Result.getOperand(0), Result.getOperand(1)); V = DAG.getBitcast(MVT::v4i64, V); @@ -46840,14 +46907,18 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, if (!getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts, EltBits)) return false; + APInt DemandedBits = APInt::getZero(EltSizeInBits); APInt DemandedElts = APInt::getZero(NumElts); for (int I = 0; I != NumElts; ++I) - if (!EltBits[I].isZero()) + if (!EltBits[I].isZero()) { + DemandedBits |= EltBits[I]; DemandedElts.setBit(I); + } APInt KnownUndef, KnownZero; return TLI.SimplifyDemandedVectorElts(OtherOp, DemandedElts, KnownUndef, - KnownZero, DCI); + KnownZero, DCI) || + TLI.SimplifyDemandedBits(OtherOp, DemandedBits, DemandedElts, DCI); }; if (SimplifyUndemandedElts(N0, N1) || SimplifyUndemandedElts(N1, N0)) { if (N->getOpcode() != ISD::DELETED_NODE) @@ -49031,8 +49102,13 @@ static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG, return SDValue(); // SSSE3's pshufb results in less instructions in the cases below. - if (Subtarget.hasSSSE3() && NumElems == 8 && InSVT != MVT::i64) - return SDValue(); + if (Subtarget.hasSSSE3() && NumElems == 8) { + if (InSVT == MVT::i16) + return SDValue(); + if (InSVT == MVT::i32 && + (OutSVT == MVT::i8 || !Subtarget.hasSSE41() || Subtarget.hasInt256())) + return SDValue(); + } SDLoc DL(N); // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS @@ -51110,6 +51186,30 @@ static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG, DAG.getConstant(NotMask, DL, VT)); } + // Fold movmsk(icmp_eq(and(x,c1),0)) -> movmsk(not(shl(x,c2))) + // iff pow2splat(c1). + if (Src.getOpcode() == X86ISD::PCMPEQ && + Src.getOperand(0).getOpcode() == ISD::AND && + ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode())) { + SDValue LHS = Src.getOperand(0).getOperand(0); + SDValue RHS = Src.getOperand(0).getOperand(1); + KnownBits KnownRHS = DAG.computeKnownBits(RHS); + if (KnownRHS.isConstant() && KnownRHS.getConstant().isPowerOf2()) { + SDLoc DL(N); + MVT ShiftVT = SrcVT; + if (ShiftVT.getScalarType() == MVT::i8) { + // vXi8 shifts - we only care about the signbit so can use PSLLW. 
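          // (Illustrative note, not part of the patch: the shift amount for
          // an i8 power-of-two mask is at most 7, so with the bytes viewed as
          // i16 lanes no bit can cross from the low byte into the high byte's
          // sign position; every byte's sign bit is still produced from that
          // same byte, which is all MOVMSK reads.)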
+ ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2); + LHS = DAG.getBitcast(ShiftVT, LHS); + } + unsigned ShiftAmt = KnownRHS.getConstant().countLeadingZeros(); + LHS = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, ShiftVT, LHS, + ShiftAmt, DAG); + LHS = DAG.getNOT(DL, DAG.getBitcast(SrcVT, LHS), SrcVT); + return DAG.getNode(X86ISD::MOVMSK, DL, VT, LHS); + } + } + // Simplify the inputs. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); APInt DemandedMask(APInt::getAllOnes(NumBits)); diff --git a/llvm/lib/Target/X86/X86LowerAMXType.cpp b/llvm/lib/Target/X86/X86LowerAMXType.cpp index 7368b64efd9a..6206d8efb3d0 100644 --- a/llvm/lib/Target/X86/X86LowerAMXType.cpp +++ b/llvm/lib/Target/X86/X86LowerAMXType.cpp @@ -61,6 +61,8 @@ #include "llvm/Transforms/Utils/AssumeBundleBuilder.h" #include "llvm/Transforms/Utils/Local.h" +#include <map> + using namespace llvm; using namespace PatternMatch; |
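The combineMOVMSK hunk above rewrites movmsk(pcmpeq(and(x,c1),0)) as movmsk(not(shl(x,c2))) when c1 is a splat of a power of two, with c2 the number of leading zeros of c1. A tiny standalone check of the per-element equivalence for i8 lanes (illustrative only; everything here is local to the sketch):

#include <cassert>
#include <cstdint>

int main() {
  // PCMPEQB(AND(X,C),0) sets a byte's sign bit exactly when (X & C) == 0;
  // the same sign bit falls out of NOT(X << clz8(C)), and MOVMSK reads only
  // that sign bit, so the two forms agree for every power-of-two mask C.
  for (unsigned K = 0; K < 8; ++K) {
    uint8_t C = uint8_t(1u << K);
    unsigned Clz = 7 - K; // countLeadingZeros of an 8-bit power of two
    for (unsigned X = 0; X < 256; ++X) {
      bool CmpEqSign = (uint8_t(X) & C) == 0;
      bool ShiftSign = (uint8_t(~(unsigned(uint8_t(X)) << Clz)) & 0x80) != 0;
      assert(CmpEqSign == ShiftSign);
    }
  }
  return 0;
}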