Diffstat (limited to 'lib/Target/AArch64')
68 files changed, 5599 insertions, 1802 deletions
diff --git a/lib/Target/AArch64/AArch64.h b/lib/Target/AArch64/AArch64.h
index 6965403a25ab..ac765ebcddc0 100644
--- a/lib/Target/AArch64/AArch64.h
+++ b/lib/Target/AArch64/AArch64.h
@@ -55,8 +55,9 @@ FunctionPass *createAArch64CollectLOHPass();
 InstructionSelector *
 createAArch64InstructionSelector(const AArch64TargetMachine &,
                                  AArch64Subtarget &, AArch64RegisterBankInfo &);
-FunctionPass *createAArch64PreLegalizeCombiner();
-FunctionPass *createAArch64StackTaggingPass();
+FunctionPass *createAArch64PreLegalizeCombiner(bool IsOptNone);
+FunctionPass *createAArch64StackTaggingPass(bool MergeInit);
+FunctionPass *createAArch64StackTaggingPreRAPass();
 void initializeAArch64A53Fix835769Pass(PassRegistry&);
 void initializeAArch64A57FPLoadBalancingPass(PassRegistry&);
@@ -80,6 +81,7 @@ void initializeFalkorHWPFFixPass(PassRegistry&);
 void initializeFalkorMarkStridedAccessesLegacyPass(PassRegistry&);
 void initializeLDTLSCleanupPass(PassRegistry&);
 void initializeAArch64StackTaggingPass(PassRegistry&);
+void initializeAArch64StackTaggingPreRAPass(PassRegistry&);
 } // end namespace llvm
 #endif
diff --git a/lib/Target/AArch64/AArch64.td b/lib/Target/AArch64/AArch64.td
index e39c6995e367..5b4c9e2149da 100644
--- a/lib/Target/AArch64/AArch64.td
+++ b/lib/Target/AArch64/AArch64.td
@@ -115,11 +115,12 @@ def FeatureSVE2SM4 : SubtargetFeature<"sve2-sm4", "HasSVE2SM4", "true",
 def FeatureSVE2SHA3 : SubtargetFeature<"sve2-sha3", "HasSVE2SHA3", "true",
   "Enable SHA3 SVE2 instructions", [FeatureSVE2, FeatureSHA3]>;
-def FeatureSVE2BitPerm : SubtargetFeature<"bitperm", "HasSVE2BitPerm", "true",
+def FeatureSVE2BitPerm : SubtargetFeature<"sve2-bitperm", "HasSVE2BitPerm", "true",
   "Enable bit permutation SVE2 instructions", [FeatureSVE2]>;
 def FeatureZCRegMove : SubtargetFeature<"zcm", "HasZeroCycleRegMove", "true",
   "Has zero-cycle register moves">;
+
 def FeatureZCZeroingGP : SubtargetFeature<"zcz-gp", "HasZeroCycleZeroingGP", "true",
   "Has zero-cycle zeroing instructions for generic registers">;
@@ -284,6 +285,10 @@ def FeatureSEL2 : SubtargetFeature<
   "sel2", "HasSEL2", "true",
   "Enable v8.4-A Secure Exception Level 2 extension">;
+def FeaturePMU : SubtargetFeature<
+  "pmu", "HasPMU", "true",
+  "Enable v8.4-A PMU extension">;
+
 def FeatureTLB_RMI : SubtargetFeature<
   "tlb-rmi", "HasTLB_RMI", "true",
   "Enable v8.4-A TLB Range and Maintenance Instructions">;
@@ -345,6 +350,21 @@ def FeatureRandGen : SubtargetFeature<"rand", "HasRandGen",
 def FeatureMTE : SubtargetFeature<"mte", "HasMTE",
     "true", "Enable Memory Tagging Extension" >;
+def FeatureTRBE : SubtargetFeature<"trbe", "HasTRBE",
+    "true", "Enable Trace Buffer Extension">;
+
+def FeatureETE : SubtargetFeature<"ete", "HasETE",
+    "true", "Enable Embedded Trace Extension",
+    [FeatureTRBE]>;
+
+def FeatureTME : SubtargetFeature<"tme", "HasTME",
+    "true", "Enable Transactional Memory Extension" >;
+
+def FeatureTaggedGlobals : SubtargetFeature<"tagged-globals",
+    "AllowTaggedGlobals",
+    "true", "Use an instruction sequence for taking the address of a global "
+    "that allows a memory tag in the upper address bits">;
+
 //===----------------------------------------------------------------------===//
 // Architectures.
 //
@@ -354,7 +374,7 @@ def HasV8_1aOps : SubtargetFeature<"v8.1a", "HasV8_1aOps", "true",
   FeaturePAN, FeatureLOR, FeatureVH]>;
 def HasV8_2aOps : SubtargetFeature<"v8.2a", "HasV8_2aOps", "true",
-  "Support ARM v8.2a instructions", [HasV8_1aOps, FeaturePsUAO,
+  "Support ARM v8.2a instructions", [HasV8_1aOps, FeaturePsUAO,
   FeaturePAN_RWV, FeatureRAS, FeatureCCPP]>;
 def HasV8_3aOps : SubtargetFeature<"v8.3a", "HasV8_3aOps", "true",
@@ -364,7 +384,7 @@ def HasV8_3aOps : SubtargetFeature<"v8.3a", "HasV8_3aOps", "true",
 def HasV8_4aOps : SubtargetFeature<"v8.4a", "HasV8_4aOps", "true",
   "Support ARM v8.4a instructions", [HasV8_3aOps, FeatureDotProd,
   FeatureNV, FeatureRASv8_4, FeatureMPAM, FeatureDIT,
-  FeatureTRACEV8_4, FeatureAM, FeatureSEL2, FeatureTLB_RMI,
+  FeatureTRACEV8_4, FeatureAM, FeatureSEL2, FeaturePMU, FeatureTLB_RMI,
   FeatureFMI, FeatureRCPC_IMMO]>;
 def HasV8_5aOps : SubtargetFeature<
@@ -390,6 +410,7 @@ include "AArch64Schedule.td"
 include "AArch64InstrInfo.td"
 include "AArch64SchedPredicates.td"
 include "AArch64SchedPredExynos.td"
+include "AArch64Combine.td"
 def AArch64InstrInfo : InstrInfo;
@@ -484,6 +505,19 @@ def ProcA57 : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57",
   FeaturePredictableSelectIsExpensive
   ]>;
+def ProcA65 : SubtargetFeature<"a65", "ARMProcFamily", "CortexA65",
+                               "Cortex-A65 ARM processors", [
+  HasV8_2aOps,
+  FeatureCrypto,
+  FeatureDotProd,
+  FeatureFPARMv8,
+  FeatureFullFP16,
+  FeatureNEON,
+  FeatureRAS,
+  FeatureRCPC,
+  FeatureSSBS,
+  ]>;
+
 def ProcA72 : SubtargetFeature<"a72", "ARMProcFamily", "CortexA72",
                                "Cortex-A72 ARM processors", [
   FeatureCRC,
@@ -641,6 +675,33 @@ def ProcFalkor : SubtargetFeature<"falkor", "ARMProcFamily", "Falkor",
   FeatureSlowSTRQro
   ]>;
+def ProcNeoverseE1 : SubtargetFeature<"neoversee1", "ARMProcFamily",
+                                      "NeoverseE1",
+                                      "Neoverse E1 ARM processors", [
+  HasV8_2aOps,
+  FeatureCrypto,
+  FeatureDotProd,
+  FeatureFPARMv8,
+  FeatureFullFP16,
+  FeatureNEON,
+  FeatureRCPC,
+  FeatureSSBS,
+  ]>;
+
+def ProcNeoverseN1 : SubtargetFeature<"neoversen1", "ARMProcFamily",
+                                      "NeoverseN1",
+                                      "Neoverse N1 ARM processors", [
+  HasV8_2aOps,
+  FeatureCrypto,
+  FeatureDotProd,
+  FeatureFPARMv8,
+  FeatureFullFP16,
+  FeatureNEON,
+  FeatureRCPC,
+  FeatureSPE,
+  FeatureSSBS,
+  ]>;
+
 def ProcSaphira : SubtargetFeature<"saphira", "ARMProcFamily", "Saphira",
                                    "Qualcomm Saphira processors", [
   FeatureCrypto,
@@ -732,19 +793,28 @@ def : ProcessorModel<"generic", NoSchedModel, [
   FeatureFuseAES,
   FeatureNEON,
   FeaturePerfMon,
-  FeaturePostRAScheduler
+  FeaturePostRAScheduler,
+// ETE and TRBE are future architecture extensions. We temporarily enable them
+// by default for users targeting generic AArch64, until it is decided in which
+// armv8.x-a architecture revision they will end up. The extensions do not
+// affect code generated by the compiler and can be used only by explicitly
+// mentioning the new system register names in assembly.
+  FeatureETE
   ]>;
-// FIXME: Cortex-A35 and Cortex-A55 are currently modeled as a Cortex-A53.
 def : ProcessorModel<"cortex-a35", CortexA53Model, [ProcA35]>;
 def : ProcessorModel<"cortex-a53", CortexA53Model, [ProcA53]>;
 def : ProcessorModel<"cortex-a55", CortexA53Model, [ProcA55]>;
 def : ProcessorModel<"cortex-a57", CortexA57Model, [ProcA57]>;
+def : ProcessorModel<"cortex-a65", CortexA53Model, [ProcA65]>;
+def : ProcessorModel<"cortex-a65ae", CortexA53Model, [ProcA65]>;
 def : ProcessorModel<"cortex-a72", CortexA57Model, [ProcA72]>;
 def : ProcessorModel<"cortex-a73", CortexA57Model, [ProcA73]>;
 def : ProcessorModel<"cortex-a75", CortexA57Model, [ProcA75]>;
 def : ProcessorModel<"cortex-a76", CortexA57Model, [ProcA76]>;
 def : ProcessorModel<"cortex-a76ae", CortexA57Model, [ProcA76]>;
+def : ProcessorModel<"neoverse-e1", CortexA53Model, [ProcNeoverseE1]>;
+def : ProcessorModel<"neoverse-n1", CortexA57Model, [ProcNeoverseN1]>;
 def : ProcessorModel<"cyclone", CycloneModel, [ProcCyclone]>;
 def : ProcessorModel<"exynos-m1", ExynosM1Model, [ProcExynosM1]>;
 def : ProcessorModel<"exynos-m2", ExynosM1Model, [ProcExynosM2]>;
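The TableGen records above follow the standard SubtargetFeature pattern: each feature string ("trbe", "ete", "tme", ...) becomes one bit in the subtarget's feature vector, and a record's implied-features list (FeatureETE pulling in FeatureTRBE) is enabled transitively when the user passes, for example, -mattr=+ete. A minimal standalone C++ sketch of that mechanism, using a simplified hand-written table rather than the generated AArch64 one (the names here are illustrative, not LLVM's generated code):

    #include <bitset>
    #include <cstddef>
    #include <iostream>
    #include <vector>

    // Simplified model of what SubtargetFeature records expand to: each
    // "+name" in an -mattr string sets one bit, and implied features
    // (FeatureETE -> [FeatureTRBE]) are enabled transitively.
    enum Feature : std::size_t { TRBE, ETE, TME, TaggedGlobals, NumFeatures };

    struct FeatureInfo {
      const char *Name;
      std::vector<Feature> Implies;
    };

    static const FeatureInfo Features[NumFeatures] = {
        {"trbe", {}},
        {"ete", {TRBE}}, // FeatureETE lists FeatureTRBE as an implied feature
        {"tme", {}},
        {"tagged-globals", {}},
    };

    static void enable(std::bitset<NumFeatures> &Bits, Feature F) {
      Bits.set(F);
      for (Feature Dep : Features[F].Implies)
        enable(Bits, Dep); // transitive closure over implied features
    }

    int main() {
      std::bitset<NumFeatures> Bits;
      enable(Bits, ETE); // e.g. -mattr=+ete
      std::cout << "trbe=" << Bits.test(TRBE) << " ete=" << Bits.test(ETE) << '\n';
    }

In this model, enabling ETE sets the TRBE bit as well, which is why the generic ProcessorModel above only needs to list FeatureETE.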
diff --git a/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp b/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp
index 92c8c4955d50..13d389cec7a0 100644
--- a/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp
+++ b/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp
@@ -552,7 +552,7 @@ bool AArch64A57FPLoadBalancing::colorChain(Chain *G, Color C,
   std::vector<unsigned> ToErase;
   for (auto &U : I.operands()) {
     if (U.isReg() && U.isUse() && Substs.find(U.getReg()) != Substs.end()) {
-      unsigned OrigReg = U.getReg();
+      Register OrigReg = U.getReg();
       U.setReg(Substs[OrigReg]);
       if (U.isKill())
         // Don't erase straight away, because there may be other operands
@@ -611,12 +611,12 @@ void AArch64A57FPLoadBalancing::scanInstruction(
     // Create a new chain. Multiplies don't require forwarding so can go on any
     // unit.
-    unsigned DestReg = MI->getOperand(0).getReg();
+    Register DestReg = MI->getOperand(0).getReg();
     LLVM_DEBUG(dbgs() << "New chain started for register "
                       << printReg(DestReg, TRI) << " at " << *MI);
-    auto G = llvm::make_unique<Chain>(MI, Idx, getColor(DestReg));
+    auto G = std::make_unique<Chain>(MI, Idx, getColor(DestReg));
     ActiveChains[DestReg] = G.get();
     AllChains.push_back(std::move(G));
@@ -624,8 +624,8 @@ void AArch64A57FPLoadBalancing::scanInstruction(
     // It is beneficial to keep MLAs on the same functional unit as their
     // accumulator operand.
-    unsigned DestReg  = MI->getOperand(0).getReg();
-    unsigned AccumReg = MI->getOperand(3).getReg();
+    Register DestReg  = MI->getOperand(0).getReg();
+    Register AccumReg = MI->getOperand(3).getReg();
     maybeKillChain(MI->getOperand(1), Idx, ActiveChains);
     maybeKillChain(MI->getOperand(2), Idx, ActiveChains);
@@ -661,7 +661,7 @@ void AArch64A57FPLoadBalancing::scanInstruction(
     LLVM_DEBUG(dbgs() << "Creating new chain for dest register "
                       << printReg(DestReg, TRI) << "\n");
-    auto G = llvm::make_unique<Chain>(MI, Idx, getColor(DestReg));
+    auto G = std::make_unique<Chain>(MI, Idx, getColor(DestReg));
     ActiveChains[DestReg] = G.get();
     AllChains.push_back(std::move(G));
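The unsigned-to-Register conversions in this file (and throughout the files below) are mechanical: llvm::Register wraps the raw register number and carries the virtual/physical predicates that previously lived on TargetRegisterInfo, while still converting implicitly to unsigned so existing call sites keep compiling. A standalone mimic of the idea, as an assumption-level sketch (the real class lives in llvm/include/llvm/CodeGen/Register.h); the llvm::make_unique to std::make_unique change is the parallel cleanup from LLVM's move to C++14:

    #include <cassert>

    // Sketch: in LLVM's numbering, virtual register numbers have the top bit
    // set, so the virtual/physical test is a bit check on the wrapped value.
    class Register {
      unsigned Reg;

    public:
      constexpr Register(unsigned R = 0) : Reg(R) {}
      static constexpr bool isVirtualRegister(unsigned R) {
        return (R & 0x80000000u) != 0; // was TargetRegisterInfo::isVirtualRegister
      }
      bool isVirtual() const { return isVirtualRegister(Reg); }
      // Implicit conversion keeps unsigned-based call sites compiling unchanged.
      constexpr operator unsigned() const { return Reg; }
    };

    int main() {
      Register Phys(42), Virt(0x80000000u | 7);
      assert(!Phys.isVirtual() && Virt.isVirtual());
      return 0;
    }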
diff --git a/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp b/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp
index 89404463e1f0..981b366c14b1 100644
--- a/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp
+++ b/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp
@@ -105,14 +105,14 @@ static bool isGPR64(unsigned Reg, unsigned SubReg,
                     const MachineRegisterInfo *MRI) {
   if (SubReg)
     return false;
-  if (TargetRegisterInfo::isVirtualRegister(Reg))
+  if (Register::isVirtualRegister(Reg))
     return MRI->getRegClass(Reg)->hasSuperClassEq(&AArch64::GPR64RegClass);
   return AArch64::GPR64RegClass.contains(Reg);
 }
 static bool isFPR64(unsigned Reg, unsigned SubReg,
                     const MachineRegisterInfo *MRI) {
-  if (TargetRegisterInfo::isVirtualRegister(Reg))
+  if (Register::isVirtualRegister(Reg))
     return (MRI->getRegClass(Reg)->hasSuperClassEq(&AArch64::FPR64RegClass) &&
             SubReg == 0) ||
            (MRI->getRegClass(Reg)->hasSuperClassEq(&AArch64::FPR128RegClass) &&
@@ -201,8 +201,8 @@ bool AArch64AdvSIMDScalar::isProfitableToTransform(
   unsigned NumNewCopies = 3;
   unsigned NumRemovableCopies = 0;
-  unsigned OrigSrc0 = MI.getOperand(1).getReg();
-  unsigned OrigSrc1 = MI.getOperand(2).getReg();
+  Register OrigSrc0 = MI.getOperand(1).getReg();
+  Register OrigSrc1 = MI.getOperand(2).getReg();
   unsigned SubReg0;
   unsigned SubReg1;
   if (!MRI->def_empty(OrigSrc0)) {
@@ -236,7 +236,7 @@ bool AArch64AdvSIMDScalar::isProfitableToTransform(
   // any of the uses is a transformable instruction, it's likely the transforms
   // will chain, enabling us to save a copy there, too. This is an aggressive
   // heuristic that approximates the graph based cost analysis described above.
-  unsigned Dst = MI.getOperand(0).getReg();
+  Register Dst = MI.getOperand(0).getReg();
   bool AllUsesAreCopies = true;
   for (MachineRegisterInfo::use_instr_nodbg_iterator
            Use = MRI->use_instr_nodbg_begin(Dst),
@@ -293,8 +293,8 @@ void AArch64AdvSIMDScalar::transformInstruction(MachineInstr &MI) {
   assert(OldOpc != NewOpc && "transform an instruction to itself?!");
   // Check if we need a copy for the source registers.
-  unsigned OrigSrc0 = MI.getOperand(1).getReg();
-  unsigned OrigSrc1 = MI.getOperand(2).getReg();
+  Register OrigSrc0 = MI.getOperand(1).getReg();
+  Register OrigSrc1 = MI.getOperand(2).getReg();
   unsigned Src0 = 0, SubReg0;
   unsigned Src1 = 0, SubReg1;
   bool KillSrc0 = false, KillSrc1 = false;
@@ -354,7 +354,7 @@ void AArch64AdvSIMDScalar::transformInstruction(MachineInstr &MI) {
   // Create a vreg for the destination.
   // FIXME: No need to do this if the ultimate user expects an FPR64.
   // Check for that and avoid the copy if possible.
-  unsigned Dst = MRI->createVirtualRegister(&AArch64::FPR64RegClass);
+  Register Dst = MRI->createVirtualRegister(&AArch64::FPR64RegClass);
   // For now, all of the new instructions have the same simple three-register
   // form, so no need to special case based on what instruction we're
diff --git a/lib/Target/AArch64/AArch64AsmPrinter.cpp b/lib/Target/AArch64/AArch64AsmPrinter.cpp
index 094fbd999523..7ea7915c2ca6 100644
--- a/lib/Target/AArch64/AArch64AsmPrinter.cpp
+++ b/lib/Target/AArch64/AArch64AsmPrinter.cpp
@@ -99,7 +99,8 @@ public:
   void LowerPATCHABLE_FUNCTION_EXIT(const MachineInstr &MI);
   void LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI);
-  std::map<std::pair<unsigned, uint32_t>, MCSymbol *> HwasanMemaccessSymbols;
+  typedef std::tuple<unsigned, bool, uint32_t> HwasanMemaccessTuple;
+  std::map<HwasanMemaccessTuple, MCSymbol *> HwasanMemaccessSymbols;
   void LowerHWASAN_CHECK_MEMACCESS(const MachineInstr &MI);
   void EmitHwasanMemaccessSymbols(Module &M);
@@ -150,7 +151,7 @@ private:
   void printOperand(const MachineInstr *MI, unsigned OpNum, raw_ostream &O);
   bool printAsmMRegister(const MachineOperand &MO, char Mode, raw_ostream &O);
   bool printAsmRegInClass(const MachineOperand &MO,
-                          const TargetRegisterClass *RC, bool isVector,
+                          const TargetRegisterClass *RC, unsigned AltName,
                           raw_ostream &O);
   bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
@@ -236,9 +237,12 @@ void AArch64AsmPrinter::EmitSled(const MachineInstr &MI, SledKind Kind)
 }
 void AArch64AsmPrinter::LowerHWASAN_CHECK_MEMACCESS(const MachineInstr &MI) {
-  unsigned Reg = MI.getOperand(0).getReg();
+  Register Reg = MI.getOperand(0).getReg();
+  bool IsShort =
+      MI.getOpcode() == AArch64::HWASAN_CHECK_MEMACCESS_SHORTGRANULES;
   uint32_t AccessInfo = MI.getOperand(1).getImm();
-  MCSymbol *&Sym = HwasanMemaccessSymbols[{Reg, AccessInfo}];
+  MCSymbol *&Sym =
+      HwasanMemaccessSymbols[HwasanMemaccessTuple(Reg, IsShort, AccessInfo)];
   if (!Sym) {
     // FIXME: Make this work on non-ELF.
     if (!TM.getTargetTriple().isOSBinFormatELF())
@@ -246,6 +250,8 @@ void AArch64AsmPrinter::LowerHWASAN_CHECK_MEMACCESS(const MachineInstr &MI) {
     std::string SymName = "__hwasan_check_x" + utostr(Reg - AArch64::X0) +
                           "_" + utostr(AccessInfo);
+    if (IsShort)
+      SymName += "_short";
     Sym = OutContext.getOrCreateSymbol(SymName);
   }
@@ -263,15 +269,22 @@ void AArch64AsmPrinter::EmitHwasanMemaccessSymbols(Module &M) {
   std::unique_ptr<MCSubtargetInfo> STI(
       TM.getTarget().createMCSubtargetInfo(TT.str(), "", ""));
-  MCSymbol *HwasanTagMismatchSym =
+  MCSymbol *HwasanTagMismatchV1Sym =
       OutContext.getOrCreateSymbol("__hwasan_tag_mismatch");
+  MCSymbol *HwasanTagMismatchV2Sym =
+      OutContext.getOrCreateSymbol("__hwasan_tag_mismatch_v2");
-  const MCSymbolRefExpr *HwasanTagMismatchRef =
-      MCSymbolRefExpr::create(HwasanTagMismatchSym, OutContext);
+  const MCSymbolRefExpr *HwasanTagMismatchV1Ref =
+      MCSymbolRefExpr::create(HwasanTagMismatchV1Sym, OutContext);
+  const MCSymbolRefExpr *HwasanTagMismatchV2Ref =
+      MCSymbolRefExpr::create(HwasanTagMismatchV2Sym, OutContext);
   for (auto &P : HwasanMemaccessSymbols) {
-    unsigned Reg = P.first.first;
-    uint32_t AccessInfo = P.first.second;
+    unsigned Reg = std::get<0>(P.first);
+    bool IsShort = std::get<1>(P.first);
+    uint32_t AccessInfo = std::get<2>(P.first);
+    const MCSymbolRefExpr *HwasanTagMismatchRef =
+        IsShort ? HwasanTagMismatchV2Ref : HwasanTagMismatchV1Ref;
     MCSymbol *Sym = P.second;
     OutStreamer->SwitchSection(OutContext.getELFSection(
@@ -304,82 +317,86 @@ void AArch64AsmPrinter::EmitHwasanMemaccessSymbols(Module &M) {
             .addReg(Reg)
             .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSR, 56)),
         *STI);
-    MCSymbol *HandlePartialSym = OutContext.createTempSymbol();
+    MCSymbol *HandleMismatchOrPartialSym = OutContext.createTempSymbol();
     OutStreamer->EmitInstruction(
         MCInstBuilder(AArch64::Bcc)
            .addImm(AArch64CC::NE)
-            .addExpr(MCSymbolRefExpr::create(HandlePartialSym, OutContext)),
+            .addExpr(MCSymbolRefExpr::create(HandleMismatchOrPartialSym,
+                                             OutContext)),
         *STI);
     MCSymbol *ReturnSym = OutContext.createTempSymbol();
     OutStreamer->EmitLabel(ReturnSym);
     OutStreamer->EmitInstruction(
         MCInstBuilder(AArch64::RET).addReg(AArch64::LR), *STI);
+    OutStreamer->EmitLabel(HandleMismatchOrPartialSym);
-    OutStreamer->EmitLabel(HandlePartialSym);
-    OutStreamer->EmitInstruction(MCInstBuilder(AArch64::SUBSWri)
-                                     .addReg(AArch64::WZR)
-                                     .addReg(AArch64::W16)
-                                     .addImm(15)
-                                     .addImm(0),
-                                 *STI);
-    MCSymbol *HandleMismatchSym = OutContext.createTempSymbol();
-    OutStreamer->EmitInstruction(
-        MCInstBuilder(AArch64::Bcc)
-            .addImm(AArch64CC::HI)
-            .addExpr(MCSymbolRefExpr::create(HandleMismatchSym, OutContext)),
-        *STI);
-
-    OutStreamer->EmitInstruction(
-        MCInstBuilder(AArch64::ANDXri)
-            .addReg(AArch64::X17)
-            .addReg(Reg)
-            .addImm(AArch64_AM::encodeLogicalImmediate(0xf, 64)),
-        *STI);
-    unsigned Size = 1 << (AccessInfo & 0xf);
-    if (Size != 1)
-      OutStreamer->EmitInstruction(MCInstBuilder(AArch64::ADDXri)
-                                       .addReg(AArch64::X17)
-                                       .addReg(AArch64::X17)
-                                       .addImm(Size - 1)
+    if (IsShort) {
+      OutStreamer->EmitInstruction(MCInstBuilder(AArch64::SUBSWri)
+                                       .addReg(AArch64::WZR)
+                                       .addReg(AArch64::W16)
+                                       .addImm(15)
                                        .addImm(0),
                                    *STI);
-    OutStreamer->EmitInstruction(MCInstBuilder(AArch64::SUBSWrs)
-                                     .addReg(AArch64::WZR)
-                                     .addReg(AArch64::W16)
-                                     .addReg(AArch64::W17)
-                                     .addImm(0),
-                                 *STI);
-    OutStreamer->EmitInstruction(
-        MCInstBuilder(AArch64::Bcc)
-            .addImm(AArch64CC::LS)
-            .addExpr(MCSymbolRefExpr::create(HandleMismatchSym, OutContext)),
-        *STI);
-
-    OutStreamer->EmitInstruction(
-        MCInstBuilder(AArch64::ORRXri)
-            .addReg(AArch64::X16)
-            .addReg(Reg)
-            .addImm(AArch64_AM::encodeLogicalImmediate(0xf, 64)),
-        *STI);
-    OutStreamer->EmitInstruction(MCInstBuilder(AArch64::LDRBBui)
-                                     .addReg(AArch64::W16)
-                                     .addReg(AArch64::X16)
-                                     .addImm(0),
-                                 *STI);
-    OutStreamer->EmitInstruction(
-        MCInstBuilder(AArch64::SUBSXrs)
-            .addReg(AArch64::XZR)
-            .addReg(AArch64::X16)
-            .addReg(Reg)
-            .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSR, 56)),
-        *STI);
-    OutStreamer->EmitInstruction(
-        MCInstBuilder(AArch64::Bcc)
-            .addImm(AArch64CC::EQ)
-            .addExpr(MCSymbolRefExpr::create(ReturnSym, OutContext)),
-        *STI);
+      MCSymbol *HandleMismatchSym = OutContext.createTempSymbol();
+      OutStreamer->EmitInstruction(
+          MCInstBuilder(AArch64::Bcc)
+              .addImm(AArch64CC::HI)
+              .addExpr(MCSymbolRefExpr::create(HandleMismatchSym, OutContext)),
+          *STI);
+
+      OutStreamer->EmitInstruction(
+          MCInstBuilder(AArch64::ANDXri)
+              .addReg(AArch64::X17)
+              .addReg(Reg)
+              .addImm(AArch64_AM::encodeLogicalImmediate(0xf, 64)),
+          *STI);
+      unsigned Size = 1 << (AccessInfo & 0xf);
+      if (Size != 1)
+        OutStreamer->EmitInstruction(MCInstBuilder(AArch64::ADDXri)
+                                         .addReg(AArch64::X17)
+                                         .addReg(AArch64::X17)
+                                         .addImm(Size - 1)
+                                         .addImm(0),
+                                     *STI);
+      OutStreamer->EmitInstruction(MCInstBuilder(AArch64::SUBSWrs)
+                                       .addReg(AArch64::WZR)
+                                       .addReg(AArch64::W16)
+                                       .addReg(AArch64::W17)
+                                       .addImm(0),
+                                   *STI);
+      OutStreamer->EmitInstruction(
+          MCInstBuilder(AArch64::Bcc)
+              .addImm(AArch64CC::LS)
+              .addExpr(MCSymbolRefExpr::create(HandleMismatchSym, OutContext)),
+          *STI);
+
+      OutStreamer->EmitInstruction(
+          MCInstBuilder(AArch64::ORRXri)
+              .addReg(AArch64::X16)
+              .addReg(Reg)
+              .addImm(AArch64_AM::encodeLogicalImmediate(0xf, 64)),
+          *STI);
+      OutStreamer->EmitInstruction(MCInstBuilder(AArch64::LDRBBui)
+                                       .addReg(AArch64::W16)
+                                       .addReg(AArch64::X16)
+                                       .addImm(0),
+                                   *STI);
+      OutStreamer->EmitInstruction(
+          MCInstBuilder(AArch64::SUBSXrs)
+              .addReg(AArch64::XZR)
+              .addReg(AArch64::X16)
+              .addReg(Reg)
+              .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSR, 56)),
+          *STI);
+      OutStreamer->EmitInstruction(
+          MCInstBuilder(AArch64::Bcc)
+              .addImm(AArch64CC::EQ)
+              .addExpr(MCSymbolRefExpr::create(ReturnSym, OutContext)),
+          *STI);
+
+      OutStreamer->EmitLabel(HandleMismatchSym);
+    }
-    OutStreamer->EmitLabel(HandleMismatchSym);
     OutStreamer->EmitInstruction(MCInstBuilder(AArch64::STPXpre)
                                      .addReg(AArch64::SP)
                                      .addReg(AArch64::X0)
@@ -414,16 +431,16 @@ void AArch64AsmPrinter::EmitHwasanMemaccessSymbols(Module &M) {
         MCInstBuilder(AArch64::ADRP)
             .addReg(AArch64::X16)
             .addExpr(AArch64MCExpr::create(
-                HwasanTagMismatchRef,
-                AArch64MCExpr::VariantKind::VK_GOT_PAGE, OutContext)),
+                HwasanTagMismatchRef, AArch64MCExpr::VariantKind::VK_GOT_PAGE,
+                OutContext)),
         *STI);
     OutStreamer->EmitInstruction(
         MCInstBuilder(AArch64::LDRXui)
             .addReg(AArch64::X16)
            .addReg(AArch64::X16)
             .addExpr(AArch64MCExpr::create(
-                HwasanTagMismatchRef,
-                AArch64MCExpr::VariantKind::VK_GOT_LO12, OutContext)),
+                HwasanTagMismatchRef, AArch64MCExpr::VariantKind::VK_GOT_LO12,
+                OutContext)),
         *STI);
     OutStreamer->EmitInstruction(
         MCInstBuilder(AArch64::BR).addReg(AArch64::X16), *STI);
@@ -485,15 +502,14 @@ void AArch64AsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNum,
   default:
     llvm_unreachable("<unknown operand type>");
   case MachineOperand::MO_Register: {
-    unsigned Reg = MO.getReg();
-    assert(TargetRegisterInfo::isPhysicalRegister(Reg));
+    Register Reg = MO.getReg();
+    assert(Register::isPhysicalRegister(Reg));
     assert(!MO.getSubReg() && "Subregs should be eliminated!");
     O << AArch64InstPrinter::getRegisterName(Reg);
     break;
   }
   case MachineOperand::MO_Immediate: {
-    int64_t Imm = MO.getImm();
-    O << '#' << Imm;
+    O << MO.getImm();
     break;
   }
   case MachineOperand::MO_GlobalAddress: {
@@ -510,7 +526,7 @@ void AArch64AsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNum,
 bool AArch64AsmPrinter::printAsmMRegister(const MachineOperand &MO, char Mode,
                                           raw_ostream &O) {
-  unsigned Reg = MO.getReg();
+  Register Reg = MO.getReg();
   switch (Mode) {
   default:
     return true; // Unknown mode.
@@ -531,14 +547,13 @@ bool AArch64AsmPrinter::printAsmMRegister(const MachineOperand &MO, char Mode,
 // printing.
 bool AArch64AsmPrinter::printAsmRegInClass(const MachineOperand &MO,
                                            const TargetRegisterClass *RC,
-                                           bool isVector, raw_ostream &O) {
+                                           unsigned AltName, raw_ostream &O) {
   assert(MO.isReg() && "Should only get here with a register!");
   const TargetRegisterInfo *RI = STI->getRegisterInfo();
-  unsigned Reg = MO.getReg();
+  Register Reg = MO.getReg();
   unsigned RegToPrint = RC->getRegister(RI->getEncodingValue(Reg));
   assert(RI->regsOverlap(RegToPrint, Reg));
-  O << AArch64InstPrinter::getRegisterName(
-      RegToPrint, isVector ? AArch64::vreg : AArch64::NoRegAltName);
+  O << AArch64InstPrinter::getRegisterName(RegToPrint, AltName);
   return false;
 }
@@ -574,6 +589,7 @@ bool AArch64AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
     case 's': // Print S register.
     case 'd': // Print D register.
     case 'q': // Print Q register.
+    case 'z': // Print Z register.
       if (MO.isReg()) {
         const TargetRegisterClass *RC;
         switch (ExtraCode[0]) {
@@ -592,10 +608,13 @@ bool AArch64AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
         case 'q':
           RC = &AArch64::FPR128RegClass;
          break;
+        case 'z':
+          RC = &AArch64::ZPRRegClass;
+          break;
         default:
          return true;
         }
-        return printAsmRegInClass(MO, RC, false /* vector */, O);
+        return printAsmRegInClass(MO, RC, AArch64::NoRegAltName, O);
       }
       printOperand(MI, OpNum, O);
       return false;
@@ -605,16 +624,26 @@ bool AArch64AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
   // According to ARM, we should emit x and v registers unless we have a
   // modifier.
   if (MO.isReg()) {
-    unsigned Reg = MO.getReg();
+    Register Reg = MO.getReg();
     // If this is a w or x register, print an x register.
     if (AArch64::GPR32allRegClass.contains(Reg) ||
         AArch64::GPR64allRegClass.contains(Reg))
       return printAsmMRegister(MO, 'x', O);
+    unsigned AltName = AArch64::NoRegAltName;
+    const TargetRegisterClass *RegClass;
+    if (AArch64::ZPRRegClass.contains(Reg)) {
+      RegClass = &AArch64::ZPRRegClass;
+    } else if (AArch64::PPRRegClass.contains(Reg)) {
+      RegClass = &AArch64::PPRRegClass;
+    } else {
+      RegClass = &AArch64::FPR128RegClass;
+      AltName = AArch64::vreg;
+    }
+
     // If this is a b, h, s, d, or q register, print it as a v register.
-    return printAsmRegInClass(MO, &AArch64::FPR128RegClass, true /* vector */,
-                              O);
+    return printAsmRegInClass(MO, RegClass, AltName, O);
   }
   printOperand(MI, OpNum, O);
@@ -682,7 +711,7 @@ void AArch64AsmPrinter::EmitJumpTableInfo() {
     if (JTBBs.empty()) continue;
     unsigned Size = AFI->getJumpTableEntrySize(JTI);
-    EmitAlignment(Log2_32(Size));
+    EmitAlignment(Align(Size));
     OutStreamer->EmitLabel(GetJTISymbol(JTI));
     for (auto *JTBB : JTBBs)
@@ -725,12 +754,12 @@ void AArch64AsmPrinter::emitJumpTableEntry(const MachineJumpTableInfo *MJTI,
 ///     add xDest, xDest, xScratch, lsl #2
 void AArch64AsmPrinter::LowerJumpTableDestSmall(llvm::MCStreamer &OutStreamer,
                                                 const llvm::MachineInstr &MI) {
-  unsigned DestReg = MI.getOperand(0).getReg();
-  unsigned ScratchReg = MI.getOperand(1).getReg();
-  unsigned ScratchRegW =
+  Register DestReg = MI.getOperand(0).getReg();
+  Register ScratchReg = MI.getOperand(1).getReg();
+  Register ScratchRegW =
      STI->getRegisterInfo()->getSubReg(ScratchReg, AArch64::sub_32);
-  unsigned TableReg = MI.getOperand(2).getReg();
-  unsigned EntryReg = MI.getOperand(3).getReg();
+  Register TableReg = MI.getOperand(2).getReg();
+  Register EntryReg = MI.getOperand(3).getReg();
   int JTIdx = MI.getOperand(4).getIndex();
   bool IsByteEntry = MI.getOpcode() == AArch64::JumpTableDest8;
@@ -800,7 +829,7 @@ void AArch64AsmPrinter::LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM,
   if (CallTarget) {
     assert((CallTarget & 0xFFFFFFFFFFFF) == CallTarget &&
            "High 16 bits of call target should be zero.");
-    unsigned ScratchReg = MI.getOperand(Opers.getNextScratchIdx()).getReg();
+    Register ScratchReg = MI.getOperand(Opers.getNextScratchIdx()).getReg();
     EncodedBytes = 16;
     // Materialize the jump address:
     EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::MOVZXi)
@@ -830,7 +859,7 @@ void AArch64AsmPrinter::LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM,
 }
 void AArch64AsmPrinter::EmitFMov0(const MachineInstr &MI) {
-  unsigned DestReg = MI.getOperand(0).getReg();
+  Register DestReg = MI.getOperand(0).getReg();
   if (STI->hasZeroCycleZeroingFP() && !STI->hasZeroCycleZeroingFPWorkaround()) {
     // Convert H/S/D register to corresponding Q register
     if (AArch64::H0 <= DestReg && DestReg <= AArch64::H31)
@@ -894,32 +923,32 @@ void AArch64AsmPrinter::EmitInstruction(const MachineInstr *MI) {
  default:
     break;
   case AArch64::MOVMCSym: {
-    unsigned DestReg = MI->getOperand(0).getReg();
-    const MachineOperand &MO_Sym = MI->getOperand(1);
-    MachineOperand Hi_MOSym(MO_Sym), Lo_MOSym(MO_Sym);
-    MCOperand Hi_MCSym, Lo_MCSym;
-
-    Hi_MOSym.setTargetFlags(AArch64II::MO_G1 | AArch64II::MO_S);
-    Lo_MOSym.setTargetFlags(AArch64II::MO_G0 | AArch64II::MO_NC);
-
-    MCInstLowering.lowerOperand(Hi_MOSym, Hi_MCSym);
-    MCInstLowering.lowerOperand(Lo_MOSym, Lo_MCSym);
-
-    MCInst MovZ;
-    MovZ.setOpcode(AArch64::MOVZXi);
-    MovZ.addOperand(MCOperand::createReg(DestReg));
-    MovZ.addOperand(Hi_MCSym);
-    MovZ.addOperand(MCOperand::createImm(16));
-    EmitToStreamer(*OutStreamer, MovZ);
-
-    MCInst MovK;
-    MovK.setOpcode(AArch64::MOVKXi);
-    MovK.addOperand(MCOperand::createReg(DestReg));
-    MovK.addOperand(MCOperand::createReg(DestReg));
-    MovK.addOperand(Lo_MCSym);
-    MovK.addOperand(MCOperand::createImm(0));
-    EmitToStreamer(*OutStreamer, MovK);
-    return;
+    Register DestReg = MI->getOperand(0).getReg();
+    const MachineOperand &MO_Sym = MI->getOperand(1);
+    MachineOperand Hi_MOSym(MO_Sym), Lo_MOSym(MO_Sym);
+    MCOperand Hi_MCSym, Lo_MCSym;
+
+    Hi_MOSym.setTargetFlags(AArch64II::MO_G1 | AArch64II::MO_S);
+    Lo_MOSym.setTargetFlags(AArch64II::MO_G0 | AArch64II::MO_NC);
+
+    MCInstLowering.lowerOperand(Hi_MOSym, Hi_MCSym);
+    MCInstLowering.lowerOperand(Lo_MOSym, Lo_MCSym);
+
+    MCInst MovZ;
+    MovZ.setOpcode(AArch64::MOVZXi);
+    MovZ.addOperand(MCOperand::createReg(DestReg));
+    MovZ.addOperand(Hi_MCSym);
+    MovZ.addOperand(MCOperand::createImm(16));
+    EmitToStreamer(*OutStreamer, MovZ);
+
+    MCInst MovK;
+    MovK.setOpcode(AArch64::MOVKXi);
+    MovK.addOperand(MCOperand::createReg(DestReg));
+    MovK.addOperand(MCOperand::createReg(DestReg));
+    MovK.addOperand(Lo_MCSym);
+    MovK.addOperand(MCOperand::createImm(0));
+    EmitToStreamer(*OutStreamer, MovK);
+    return;
   }
   case AArch64::MOVIv2d_ns:
     // If the target has <rdar://problem/16473581>, lower this
@@ -1084,6 +1113,7 @@ void AArch64AsmPrinter::EmitInstruction(const MachineInstr *MI) {
     return;
   case AArch64::HWASAN_CHECK_MEMACCESS:
+  case AArch64::HWASAN_CHECK_MEMACCESS_SHORTGRANULES:
     LowerHWASAN_CHECK_MEMACCESS(*MI);
     return;
@@ -1193,4 +1223,6 @@ extern "C" void LLVMInitializeAArch64AsmPrinter() {
   RegisterAsmPrinter<AArch64AsmPrinter> X(getTheAArch64leTarget());
   RegisterAsmPrinter<AArch64AsmPrinter> Y(getTheAArch64beTarget());
   RegisterAsmPrinter<AArch64AsmPrinter> Z(getTheARM64Target());
+  RegisterAsmPrinter<AArch64AsmPrinter> W(getTheARM64_32Target());
+  RegisterAsmPrinter<AArch64AsmPrinter> V(getTheAArch64_32Target());
 }
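The new _short outlined symbols above implement HWASAN's short-granule checking, and the emitted instruction sequence is easier to follow restated as pseudo-C. The sketch below is a simplified standalone rendering (the shadow-byte load and the slow-path call are elided, and the function name is illustrative only, not an LLVM or compiler-rt API):

    #include <cstdint>

    // One shadow byte covers a 16-byte granule; the pointer tag sits in bits
    // 56-63; AccessInfo's low nibble encodes log2(access size). MemTag is the
    // shadow byte already loaded for Ptr's granule.
    bool needsMismatchHandler(uint64_t Ptr, uint8_t MemTag, uint32_t AccessInfo,
                              bool ShortGranules) {
      uint8_t PtrTag = Ptr >> 56;
      if (PtrTag == MemTag)
        return false; // fast path: tags match, no report
      if (!ShortGranules || MemTag > 15)
        return true; // plain mismatch: branch to __hwasan_tag_mismatch(_v2)
      // Short granule: MemTag holds the number of valid bytes in the granule.
      uint64_t Size = 1ULL << (AccessInfo & 0xf);
      uint64_t LastByte = (Ptr & 0xf) + Size - 1;
      if (MemTag <= LastByte)
        return true; // the access runs past the granule's valid bytes
      // The granule's real tag is stored in its last byte; loading through the
      // tagged pointer works on AArch64 thanks to top-byte-ignore.
      uint8_t RealTag = *reinterpret_cast<const uint8_t *>(Ptr | 0xf);
      return PtrTag != RealTag;
    }

This mirrors the assembly: the HWASAN_CHECK_MEMACCESS form treats any tag difference as a mismatch and jumps to __hwasan_tag_mismatch, while the _SHORTGRANULES form runs the extra in-bounds and last-byte checks and reports through __hwasan_tag_mismatch_v2.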
diff --git a/lib/Target/AArch64/AArch64CallLowering.cpp b/lib/Target/AArch64/AArch64CallLowering.cpp
index 59757769c89a..ed93d02aa615 100644
--- a/lib/Target/AArch64/AArch64CallLowering.cpp
+++ b/lib/Target/AArch64/AArch64CallLowering.cpp
@@ -99,7 +99,7 @@ struct IncomingArgHandler : public CallLowering::ValueHandler {
   /// (it's an implicit-def of the BL).
   virtual void markPhysRegUsed(unsigned PhysReg) = 0;
-  bool isArgumentHandler() const override { return true; }
+  bool isIncomingArgumentHandler() const override { return true; }
   uint64_t StackUsed;
 };
@@ -110,6 +110,7 @@ struct FormalArgHandler : public IncomingArgHandler {
       : IncomingArgHandler(MIRBuilder, MRI, AssignFn) {}
   void markPhysRegUsed(unsigned PhysReg) override {
+    MIRBuilder.getMRI()->addLiveIn(PhysReg);
     MIRBuilder.getMBB().addLiveIn(PhysReg);
   }
 };
@@ -129,14 +130,29 @@ struct CallReturnHandler : public IncomingArgHandler {
 struct OutgoingArgHandler : public CallLowering::ValueHandler {
   OutgoingArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
                      MachineInstrBuilder MIB, CCAssignFn *AssignFn,
-                     CCAssignFn *AssignFnVarArg)
+                     CCAssignFn *AssignFnVarArg, bool IsTailCall = false,
+                     int FPDiff = 0)
       : ValueHandler(MIRBuilder, MRI, AssignFn), MIB(MIB),
-        AssignFnVarArg(AssignFnVarArg), StackSize(0) {}
+        AssignFnVarArg(AssignFnVarArg), IsTailCall(IsTailCall), FPDiff(FPDiff),
+        StackSize(0) {}
+
+  bool isIncomingArgumentHandler() const override { return false; }
   Register getStackAddress(uint64_t Size, int64_t Offset,
                            MachinePointerInfo &MPO) override {
+    MachineFunction &MF = MIRBuilder.getMF();
     LLT p0 = LLT::pointer(0, 64);
     LLT s64 = LLT::scalar(64);
+
+    if (IsTailCall) {
+      Offset += FPDiff;
+      int FI = MF.getFrameInfo().CreateFixedObject(Size, Offset, true);
+      Register FIReg = MRI.createGenericVirtualRegister(p0);
+      MIRBuilder.buildFrameIndex(FIReg, FI);
+      MPO = MachinePointerInfo::getFixedStack(MF, FI);
+      return FIReg;
+    }
+
     Register SPReg = MRI.createGenericVirtualRegister(p0);
     MIRBuilder.buildCopy(SPReg, Register(AArch64::SP));
@@ -146,7 +162,7 @@ struct OutgoingArgHandler : public CallLowering::ValueHandler {
     Register AddrReg = MRI.createGenericVirtualRegister(p0);
     MIRBuilder.buildGEP(AddrReg, SPReg, OffsetReg);
-    MPO = MachinePointerInfo::getStack(MIRBuilder.getMF(), Offset);
+    MPO = MachinePointerInfo::getStack(MF, Offset);
     return AddrReg;
   }
@@ -173,12 +189,13 @@ struct OutgoingArgHandler : public CallLowering::ValueHandler {
   bool assignArg(unsigned ValNo, MVT ValVT, MVT LocVT,
                  CCValAssign::LocInfo LocInfo,
                  const CallLowering::ArgInfo &Info,
+                 ISD::ArgFlagsTy Flags,
                  CCState &State) override {
     bool Res;
     if (Info.IsFixed)
-      Res = AssignFn(ValNo, ValVT, LocVT, LocInfo, Info.Flags, State);
+      Res = AssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State);
     else
-      Res = AssignFnVarArg(ValNo, ValVT, LocVT, LocInfo, Info.Flags, State);
+      Res = AssignFnVarArg(ValNo, ValVT, LocVT, LocInfo, Flags, State);
     StackSize = State.getNextStackOffset();
     return Res;
@@ -186,10 +203,19 @@ struct OutgoingArgHandler : public CallLowering::ValueHandler {
   MachineInstrBuilder MIB;
   CCAssignFn *AssignFnVarArg;
+  bool IsTailCall;
+
+  /// For tail calls, the byte offset of the call's argument area from the
+  /// callee's. Unused elsewhere.
+  int FPDiff;
   uint64_t StackSize;
 };
 } // namespace
+static bool doesCalleeRestoreStack(CallingConv::ID CallConv, bool TailCallOpt) {
+  return CallConv == CallingConv::Fast && TailCallOpt;
+}
+
 void AArch64CallLowering::splitToValueTypes(
     const ArgInfo &OrigArg, SmallVectorImpl<ArgInfo> &SplitArgs,
     const DataLayout &DL, MachineRegisterInfo &MRI, CallingConv::ID CallConv) const {
@@ -207,7 +233,7 @@ void AArch64CallLowering::splitToValueTypes(
     // No splitting to do, but we want to replace the original type (e.g. [1 x
     // double] -> double).
     SplitArgs.emplace_back(OrigArg.Regs[0], SplitVTs[0].getTypeForEVT(Ctx),
-                           OrigArg.Flags, OrigArg.IsFixed);
+                           OrigArg.Flags[0], OrigArg.IsFixed);
     return;
   }
@@ -218,13 +244,13 @@ void AArch64CallLowering::splitToValueTypes(
       OrigArg.Ty, CallConv, false);
   for (unsigned i = 0, e = SplitVTs.size(); i < e; ++i) {
     Type *SplitTy = SplitVTs[i].getTypeForEVT(Ctx);
-    SplitArgs.emplace_back(OrigArg.Regs[i], SplitTy, OrigArg.Flags,
+    SplitArgs.emplace_back(OrigArg.Regs[i], SplitTy, OrigArg.Flags[0],
                            OrigArg.IsFixed);
     if (NeedsRegBlock)
-      SplitArgs.back().Flags.setInConsecutiveRegs();
+      SplitArgs.back().Flags[0].setInConsecutiveRegs();
   }
-  SplitArgs.back().Flags.setInConsecutiveRegsLast();
+  SplitArgs.back().Flags[0].setInConsecutiveRegsLast();
 }
 bool AArch64CallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
@@ -344,6 +370,49 @@ bool AArch64CallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
   return Success;
 }
+/// Helper function to compute forwarded registers for musttail calls. Computes
+/// the forwarded registers, sets MBB liveness, and emits COPY instructions that
+/// can be used to save + restore registers later.
+static void handleMustTailForwardedRegisters(MachineIRBuilder &MIRBuilder,
+                                             CCAssignFn *AssignFn) {
+  MachineBasicBlock &MBB = MIRBuilder.getMBB();
+  MachineFunction &MF = MIRBuilder.getMF();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+
+  if (!MFI.hasMustTailInVarArgFunc())
+    return;
+
+  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
+  const Function &F = MF.getFunction();
+  assert(F.isVarArg() && "Expected F to be vararg?");
+
+  // Compute the set of forwarded registers. The rest are scratch.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(F.getCallingConv(), /*IsVarArg=*/true, MF, ArgLocs,
+                 F.getContext());
+  SmallVector<MVT, 2> RegParmTypes;
+  RegParmTypes.push_back(MVT::i64);
+  RegParmTypes.push_back(MVT::f128);
+
+  // Later on, we can use this vector to restore the registers if necessary.
+  SmallVectorImpl<ForwardedRegister> &Forwards =
+      FuncInfo->getForwardedMustTailRegParms();
+  CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, AssignFn);
+
+  // Conservatively forward X8, since it might be used for an aggregate
+  // return.
+  if (!CCInfo.isAllocated(AArch64::X8)) {
+    unsigned X8VReg = MF.addLiveIn(AArch64::X8, &AArch64::GPR64RegClass);
+    Forwards.push_back(ForwardedRegister(X8VReg, AArch64::X8, MVT::i64));
+  }
+
+  // Add the forwards to the MachineBasicBlock and MachineFunction.
+  for (const auto &F : Forwards) {
+    MBB.addLiveIn(F.PReg);
+    MIRBuilder.buildCopy(Register(F.VReg), Register(F.PReg));
+  }
+}
+
 bool AArch64CallLowering::lowerFormalArguments(
     MachineIRBuilder &MIRBuilder, const Function &F,
     ArrayRef<ArrayRef<Register>> VRegs) const {
@@ -376,64 +445,530 @@ bool AArch64CallLowering::lowerFormalArguments(
   if (!handleAssignments(MIRBuilder, SplitArgs, Handler))
     return false;
+  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
+  uint64_t StackOffset = Handler.StackUsed;
   if (F.isVarArg()) {
-    if (!MF.getSubtarget<AArch64Subtarget>().isTargetDarwin()) {
-      // FIXME: we need to reimplement saveVarArgsRegisters from
+    auto &Subtarget = MF.getSubtarget<AArch64Subtarget>();
+    if (!Subtarget.isTargetDarwin()) {
+      // FIXME: we need to reimplement saveVarArgsRegisters from
       // AArch64ISelLowering.
       return false;
    }
-    // We currently pass all varargs at 8-byte alignment.
-    uint64_t StackOffset = alignTo(Handler.StackUsed, 8);
+    // We currently pass all varargs at 8-byte alignment, or 4 in ILP32.
+    StackOffset = alignTo(Handler.StackUsed, Subtarget.isTargetILP32() ? 4 : 8);
     auto &MFI = MIRBuilder.getMF().getFrameInfo();
-    AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
     FuncInfo->setVarArgsStackIndex(MFI.CreateFixedObject(4, StackOffset, true));
   }
+  if (doesCalleeRestoreStack(F.getCallingConv(),
+                             MF.getTarget().Options.GuaranteedTailCallOpt)) {
+    // We have a non-standard ABI, so why not make full use of the stack that
+    // we're going to pop? It must be aligned to 16 B in any case.
+    StackOffset = alignTo(StackOffset, 16);
+
+    // If we're expected to restore the stack (e.g. fastcc), then we'll be
+    // adding a multiple of 16.
+    FuncInfo->setArgumentStackToRestore(StackOffset);
+
+    // Our own callers will guarantee that the space is free by giving an
+    // aligned value to CALLSEQ_START.
+  }
+
+  // When we tail call, we need to check if the callee's arguments
+  // will fit on the caller's stack. So, whenever we lower formal arguments,
+  // we should keep track of this information, since we might lower a tail call
+  // in this function later.
+  FuncInfo->setBytesInStackArgArea(StackOffset);
+
   auto &Subtarget = MF.getSubtarget<AArch64Subtarget>();
   if (Subtarget.hasCustomCallingConv())
     Subtarget.getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF);
+  handleMustTailForwardedRegisters(MIRBuilder, AssignFn);
+
   // Move back to the end of the basic block.
   MIRBuilder.setMBB(MBB);
   return true;
 }
+/// Return true if the calling convention is one that we can guarantee TCO for.
+static bool canGuaranteeTCO(CallingConv::ID CC) {
+  return CC == CallingConv::Fast;
+}
+
+/// Return true if we might ever do TCO for calls with this calling convention.
+static bool mayTailCallThisCC(CallingConv::ID CC) {
+  switch (CC) {
+  case CallingConv::C:
+  case CallingConv::PreserveMost:
+  case CallingConv::Swift:
+    return true;
+  default:
+    return canGuaranteeTCO(CC);
+  }
+}
+
+/// Returns a pair containing the fixed CCAssignFn and the vararg CCAssignFn
+/// for CC.
+static std::pair<CCAssignFn *, CCAssignFn *>
+getAssignFnsForCC(CallingConv::ID CC, const AArch64TargetLowering &TLI) {
+  return {TLI.CCAssignFnForCall(CC, false), TLI.CCAssignFnForCall(CC, true)};
+}
+
+bool AArch64CallLowering::doCallerAndCalleePassArgsTheSameWay(
+    CallLoweringInfo &Info, MachineFunction &MF,
+    SmallVectorImpl<ArgInfo> &InArgs) const {
+  const Function &CallerF = MF.getFunction();
+  CallingConv::ID CalleeCC = Info.CallConv;
+  CallingConv::ID CallerCC = CallerF.getCallingConv();
+
+  // If the calling conventions match, then everything must be the same.
+  if (CalleeCC == CallerCC)
+    return true;
+
+  // Check if the caller and callee will handle arguments in the same way.
+  const AArch64TargetLowering &TLI = *getTLI<AArch64TargetLowering>();
+  CCAssignFn *CalleeAssignFnFixed;
+  CCAssignFn *CalleeAssignFnVarArg;
+  std::tie(CalleeAssignFnFixed, CalleeAssignFnVarArg) =
+      getAssignFnsForCC(CalleeCC, TLI);
+
+  CCAssignFn *CallerAssignFnFixed;
+  CCAssignFn *CallerAssignFnVarArg;
+  std::tie(CallerAssignFnFixed, CallerAssignFnVarArg) =
+      getAssignFnsForCC(CallerCC, TLI);
+
+  if (!resultsCompatible(Info, MF, InArgs, *CalleeAssignFnFixed,
+                         *CalleeAssignFnVarArg, *CallerAssignFnFixed,
+                         *CallerAssignFnVarArg))
+    return false;
+
+  // Make sure that the caller and callee preserve all of the same registers.
+  auto TRI = MF.getSubtarget<AArch64Subtarget>().getRegisterInfo();
+  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
+  const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
+  if (MF.getSubtarget<AArch64Subtarget>().hasCustomCallingConv()) {
+    TRI->UpdateCustomCallPreservedMask(MF, &CallerPreserved);
+    TRI->UpdateCustomCallPreservedMask(MF, &CalleePreserved);
+  }
+
+  return TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved);
+}
+
+bool AArch64CallLowering::areCalleeOutgoingArgsTailCallable(
+    CallLoweringInfo &Info, MachineFunction &MF,
+    SmallVectorImpl<ArgInfo> &OutArgs) const {
+  // If there are no outgoing arguments, then we are done.
+  if (OutArgs.empty())
+    return true;
+
+  const Function &CallerF = MF.getFunction();
+  CallingConv::ID CalleeCC = Info.CallConv;
+  CallingConv::ID CallerCC = CallerF.getCallingConv();
+  const AArch64TargetLowering &TLI = *getTLI<AArch64TargetLowering>();
+
+  CCAssignFn *AssignFnFixed;
+  CCAssignFn *AssignFnVarArg;
+  std::tie(AssignFnFixed, AssignFnVarArg) = getAssignFnsForCC(CalleeCC, TLI);
+
+  // We have outgoing arguments. Make sure that we can tail call with them.
+  SmallVector<CCValAssign, 16> OutLocs;
+  CCState OutInfo(CalleeCC, false, MF, OutLocs, CallerF.getContext());
+
+  if (!analyzeArgInfo(OutInfo, OutArgs, *AssignFnFixed, *AssignFnVarArg)) {
+    LLVM_DEBUG(dbgs() << "... Could not analyze call operands.\n");
+    return false;
+  }
+
+  // Make sure that they can fit on the caller's stack.
+  const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
+  if (OutInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea()) {
+    LLVM_DEBUG(dbgs() << "... Cannot fit call operands on caller's stack.\n");
+    return false;
+  }
+
+  // Verify that the parameters in callee-saved registers match.
+  // TODO: Port this over to CallLowering as general code once swiftself is
+  // supported.
+  auto TRI = MF.getSubtarget<AArch64Subtarget>().getRegisterInfo();
+  const uint32_t *CallerPreservedMask = TRI->getCallPreservedMask(MF, CallerCC);
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+
+  for (unsigned i = 0; i < OutLocs.size(); ++i) {
+    auto &ArgLoc = OutLocs[i];
+    // If it's not a register, it's fine.
+    if (!ArgLoc.isRegLoc()) {
+      if (Info.IsVarArg) {
+        // Be conservative and disallow variadic memory operands to match
+        // SDAG's behaviour.
+        // FIXME: If the caller's calling convention is C, then we can
+        // potentially use its argument area. However, for cases like fastcc,
+        // we can't do anything.
+        LLVM_DEBUG(
+            dbgs()
+            << "... Cannot tail call vararg function with stack arguments\n");
+        return false;
+      }
+      continue;
+    }
+
+    Register Reg = ArgLoc.getLocReg();
+
+    // Only look at callee-saved registers.
+    if (MachineOperand::clobbersPhysReg(CallerPreservedMask, Reg))
+      continue;
+
+    LLVM_DEBUG(
+        dbgs()
+        << "... Call has an argument passed in a callee-saved register.\n");
+
+    // Check if it was copied from.
+    ArgInfo &OutInfo = OutArgs[i];
+
+    if (OutInfo.Regs.size() > 1) {
+      LLVM_DEBUG(
+          dbgs() << "... Cannot handle arguments in multiple registers.\n");
+      return false;
+    }
+
+    // Check if we copy the register, walking through copies from virtual
+    // registers. Note that getDefIgnoringCopies does not ignore copies from
+    // physical registers.
+    MachineInstr *RegDef = getDefIgnoringCopies(OutInfo.Regs[0], MRI);
+    if (!RegDef || RegDef->getOpcode() != TargetOpcode::COPY) {
+      LLVM_DEBUG(
+          dbgs()
+          << "... Parameter was not copied into a VReg, cannot tail call.\n");
+      return false;
+    }
+
+    // Got a copy. Verify that it's the same as the register we want.
+    Register CopyRHS = RegDef->getOperand(1).getReg();
+    if (CopyRHS != Reg) {
+      LLVM_DEBUG(dbgs() << "... Callee-saved register was not copied into "
+                           "VReg, cannot tail call.\n");
+      return false;
+    }
+  }
+
+  return true;
+}
+
+bool AArch64CallLowering::isEligibleForTailCallOptimization(
+    MachineIRBuilder &MIRBuilder, CallLoweringInfo &Info,
+    SmallVectorImpl<ArgInfo> &InArgs,
+    SmallVectorImpl<ArgInfo> &OutArgs) const {
+
+  // Must pass all target-independent checks in order to tail call optimize.
+  if (!Info.IsTailCall)
+    return false;
+
+  CallingConv::ID CalleeCC = Info.CallConv;
+  MachineFunction &MF = MIRBuilder.getMF();
+  const Function &CallerF = MF.getFunction();
+
+  LLVM_DEBUG(dbgs() << "Attempting to lower call as tail call\n");
+
+  if (Info.SwiftErrorVReg) {
+    // TODO: We should handle this.
+    // Note that this is also handled by the check for no outgoing arguments.
+    // Proactively disabling this though, because the swifterror handling in
+    // lowerCall inserts a COPY *after* the location of the call.
+    LLVM_DEBUG(dbgs() << "... Cannot handle tail calls with swifterror yet.\n");
+    return false;
+  }
+
+  if (!mayTailCallThisCC(CalleeCC)) {
+    LLVM_DEBUG(dbgs() << "... Calling convention cannot be tail called.\n");
+    return false;
+  }
+
+  // Byval parameters hand the function a pointer directly into the stack area
+  // we want to reuse during a tail call. Working around this *is* possible (see
+  // X86).
+  //
+  // FIXME: In AArch64ISelLowering, this isn't worked around. Can/should we try
+  // it?
+  //
+  // On Windows, "inreg" attributes signify non-aggregate indirect returns.
+  // In this case, it is necessary to save/restore X0 in the callee. Tail
+  // call opt interferes with this. So we disable tail call opt when the
+  // caller has an argument with "inreg" attribute.
+  //
+  // FIXME: Check whether the callee also has an "inreg" argument.
+  //
+  // When the caller has a swifterror argument, we don't want to tail call
+  // because we would have to move into the swifterror register before the
+  // tail call.
+  if (any_of(CallerF.args(), [](const Argument &A) {
+        return A.hasByValAttr() || A.hasInRegAttr() || A.hasSwiftErrorAttr();
+      })) {
+    LLVM_DEBUG(dbgs() << "... Cannot tail call from callers with byval, "
+                         "inreg, or swifterror arguments\n");
+    return false;
+  }
+
+  // Externally-defined functions with weak linkage should not be
+  // tail-called on AArch64 when the OS does not support dynamic
+  // pre-emption of symbols, as the AAELF spec requires normal calls
+  // to undefined weak functions to be replaced with a NOP or jump to the
+  // next instruction. The behaviour of branch instructions in this
+  // situation (as used for tail calls) is implementation-defined, so we
+  // cannot rely on the linker replacing the tail call with a return.
+  if (Info.Callee.isGlobal()) {
+    const GlobalValue *GV = Info.Callee.getGlobal();
+    const Triple &TT = MF.getTarget().getTargetTriple();
+    if (GV->hasExternalWeakLinkage() &&
+        (!TT.isOSWindows() || TT.isOSBinFormatELF() ||
+         TT.isOSBinFormatMachO())) {
+      LLVM_DEBUG(dbgs() << "... Cannot tail call externally-defined function "
+                           "with weak linkage for this OS.\n");
+      return false;
+    }
+  }
+
+  // If we have -tailcallopt, then we're done.
+  if (MF.getTarget().Options.GuaranteedTailCallOpt)
+    return canGuaranteeTCO(CalleeCC) && CalleeCC == CallerF.getCallingConv();
+
+  // We don't have -tailcallopt, so we're allowed to change the ABI (sibcall).
+  // Try to find cases where we can do that.
+
+  // I want anyone implementing a new calling convention to think long and hard
+  // about this assert.
+  assert((!Info.IsVarArg || CalleeCC == CallingConv::C) &&
+         "Unexpected variadic calling convention");
+
+  // Verify that the incoming and outgoing arguments from the callee are
+  // safe to tail call.
+  if (!doCallerAndCalleePassArgsTheSameWay(Info, MF, InArgs)) {
+    LLVM_DEBUG(
+        dbgs()
+        << "... Caller and callee have incompatible calling conventions.\n");
+    return false;
+  }
+
+  if (!areCalleeOutgoingArgsTailCallable(Info, MF, OutArgs))
+    return false;
+
+  LLVM_DEBUG(
+      dbgs() << "... Call is eligible for tail call optimization.\n");
+  return true;
+}
+
+static unsigned getCallOpcode(const Function &CallerF, bool IsIndirect,
+                              bool IsTailCall) {
+  if (!IsTailCall)
+    return IsIndirect ? AArch64::BLR : AArch64::BL;
+
+  if (!IsIndirect)
+    return AArch64::TCRETURNdi;
+
+  // When BTI is enabled, we need to use TCRETURNriBTI to make sure that we use
+  // x16 or x17.
+  if (CallerF.hasFnAttribute("branch-target-enforcement"))
+    return AArch64::TCRETURNriBTI;
+
+  return AArch64::TCRETURNri;
+}
+
+bool AArch64CallLowering::lowerTailCall(
+    MachineIRBuilder &MIRBuilder, CallLoweringInfo &Info,
+    SmallVectorImpl<ArgInfo> &OutArgs) const {
+  MachineFunction &MF = MIRBuilder.getMF();
+  const Function &F = MF.getFunction();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  const AArch64TargetLowering &TLI = *getTLI<AArch64TargetLowering>();
+  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
+
+  // True when we're tail calling, but without -tailcallopt.
+  bool IsSibCall = !MF.getTarget().Options.GuaranteedTailCallOpt;
+
+  // TODO: Right now, regbankselect doesn't know how to handle the rtcGPR64
+  // register class. Until we can do that, we should fall back here.
+  if (F.hasFnAttribute("branch-target-enforcement")) {
+    LLVM_DEBUG(
+        dbgs() << "Cannot lower indirect tail calls with BTI enabled yet.\n");
+    return false;
+  }
+
+  // Find out which ABI gets to decide where things go.
+  CallingConv::ID CalleeCC = Info.CallConv;
+  CCAssignFn *AssignFnFixed;
+  CCAssignFn *AssignFnVarArg;
+  std::tie(AssignFnFixed, AssignFnVarArg) = getAssignFnsForCC(CalleeCC, TLI);
+
+  MachineInstrBuilder CallSeqStart;
+  if (!IsSibCall)
+    CallSeqStart = MIRBuilder.buildInstr(AArch64::ADJCALLSTACKDOWN);
+
+  unsigned Opc = getCallOpcode(F, Info.Callee.isReg(), true);
+  auto MIB = MIRBuilder.buildInstrNoInsert(Opc);
+  MIB.add(Info.Callee);
+
+  // Byte offset for the tail call. When we are sibcalling, this will always
+  // be 0.
+  MIB.addImm(0);
+
+  // Tell the call which registers are clobbered.
+  auto TRI = MF.getSubtarget<AArch64Subtarget>().getRegisterInfo();
+  const uint32_t *Mask = TRI->getCallPreservedMask(MF, F.getCallingConv());
+  if (MF.getSubtarget<AArch64Subtarget>().hasCustomCallingConv())
+    TRI->UpdateCustomCallPreservedMask(MF, &Mask);
+  MIB.addRegMask(Mask);
+
+  if (TRI->isAnyArgRegReserved(MF))
+    TRI->emitReservedArgRegCallError(MF);
+
+  // FPDiff is the byte offset of the call's argument area from the callee's.
+  // Stores to callee stack arguments will be placed in FixedStackSlots offset
+  // by this amount for a tail call. In a sibling call it must be 0 because the
+  // caller will deallocate the entire stack and the callee still expects its
+  // arguments to begin at SP+0.
+  int FPDiff = 0;
+
+  // This will be 0 for sibcalls, potentially nonzero for tail calls produced
+  // by -tailcallopt. For sibcalls, the memory operands for the call are
+  // already available in the caller's incoming argument space.
+  unsigned NumBytes = 0;
+  if (!IsSibCall) {
+    // We aren't sibcalling, so we need to compute FPDiff. We need to do this
+    // before handling assignments, because FPDiff must be known for memory
+    // arguments.
+    unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
+    SmallVector<CCValAssign, 16> OutLocs;
+    CCState OutInfo(CalleeCC, false, MF, OutLocs, F.getContext());
+    analyzeArgInfo(OutInfo, OutArgs, *AssignFnFixed, *AssignFnVarArg);
+
+    // The callee will pop the argument stack as a tail call. Thus, we must
+    // keep it 16-byte aligned.
+    NumBytes = alignTo(OutInfo.getNextStackOffset(), 16);
+
+    // FPDiff will be negative if this tail call requires more space than we
+    // would automatically have in our incoming argument space. Positive if we
+    // actually shrink the stack.
+    FPDiff = NumReusableBytes - NumBytes;
+
+    // The stack pointer must be 16-byte aligned at all times it's used for a
+    // memory operation, which in practice means at *all* times and in
+    // particular across call boundaries. Therefore our own arguments started
+    // at a 16-byte aligned SP and the delta applied for the tail call should
+    // satisfy the same constraint.
+    assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
+  }
+
+  const auto &Forwards = FuncInfo->getForwardedMustTailRegParms();
+
+  // Do the actual argument marshalling.
+  SmallVector<unsigned, 8> PhysRegs;
+  OutgoingArgHandler Handler(MIRBuilder, MRI, MIB, AssignFnFixed,
+                             AssignFnVarArg, true, FPDiff);
+  if (!handleAssignments(MIRBuilder, OutArgs, Handler))
+    return false;
+
+  if (Info.IsVarArg && Info.IsMustTailCall) {
+    // Now we know what's being passed to the function. Add uses to the call
+    // for the forwarded registers that we *aren't* passing as parameters.
+    // This will preserve the copies we build earlier.
+    for (const auto &F : Forwards) {
+      Register ForwardedReg = F.PReg;
+      // If the register is already passed, or aliases a register which is
+      // already being passed, then skip it.
+      if (any_of(MIB->uses(), [&ForwardedReg, &TRI](const MachineOperand &Use) {
+            if (!Use.isReg())
+              return false;
+            return TRI->regsOverlap(Use.getReg(), ForwardedReg);
+          }))
+        continue;
+
+      // We aren't passing it already, so we should add it to the call.
+      MIRBuilder.buildCopy(ForwardedReg, Register(F.VReg));
+      MIB.addReg(ForwardedReg, RegState::Implicit);
+    }
+  }
+
+  // If we have -tailcallopt, we need to adjust the stack. We'll do the call
+  // sequence start and end here.
+  if (!IsSibCall) {
+    MIB->getOperand(1).setImm(FPDiff);
+    CallSeqStart.addImm(NumBytes).addImm(0);
+    // End the call sequence *before* emitting the call. Normally, we would
+    // tidy the frame up after the call. However, here, we've laid out the
+    // parameters so that when SP is reset, they will be in the correct
+    // location.
+    MIRBuilder.buildInstr(AArch64::ADJCALLSTACKUP).addImm(NumBytes).addImm(0);
+  }
+
+  // Now we can add the actual call instruction to the correct basic block.
+  MIRBuilder.insertInstr(MIB);
+
+  // If Callee is a reg, since it is used by a target specific instruction,
+  // it must have a register class matching the constraint of that instruction.
+  if (Info.Callee.isReg())
+    MIB->getOperand(0).setReg(constrainOperandRegClass(
+        MF, *TRI, MRI, *MF.getSubtarget().getInstrInfo(),
+        *MF.getSubtarget().getRegBankInfo(), *MIB, MIB->getDesc(), Info.Callee,
+        0));
+
+  MF.getFrameInfo().setHasTailCall();
+  Info.LoweredTailCall = true;
+  return true;
+}
+
 bool AArch64CallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
-                                    CallingConv::ID CallConv,
-                                    const MachineOperand &Callee,
-                                    const ArgInfo &OrigRet,
-                                    ArrayRef<ArgInfo> OrigArgs,
-                                    Register SwiftErrorVReg) const {
+                                    CallLoweringInfo &Info) const {
   MachineFunction &MF = MIRBuilder.getMF();
   const Function &F = MF.getFunction();
   MachineRegisterInfo &MRI = MF.getRegInfo();
   auto &DL = F.getParent()->getDataLayout();
+  const AArch64TargetLowering &TLI = *getTLI<AArch64TargetLowering>();
-  SmallVector<ArgInfo, 8> SplitArgs;
-  for (auto &OrigArg : OrigArgs) {
-    splitToValueTypes(OrigArg, SplitArgs, DL, MRI, CallConv);
+  SmallVector<ArgInfo, 8> OutArgs;
+  for (auto &OrigArg : Info.OrigArgs) {
+    splitToValueTypes(OrigArg, OutArgs, DL, MRI, Info.CallConv);
     // AAPCS requires that we zero-extend i1 to 8 bits by the caller.
     if (OrigArg.Ty->isIntegerTy(1))
-      SplitArgs.back().Flags.setZExt();
+      OutArgs.back().Flags[0].setZExt();
+  }
+
+  SmallVector<ArgInfo, 8> InArgs;
+  if (!Info.OrigRet.Ty->isVoidTy())
+    splitToValueTypes(Info.OrigRet, InArgs, DL, MRI, F.getCallingConv());
+
+  // If we can lower as a tail call, do that instead.
+  bool CanTailCallOpt =
+      isEligibleForTailCallOptimization(MIRBuilder, Info, InArgs, OutArgs);
+
+  // We must emit a tail call if we have musttail.
+  if (Info.IsMustTailCall && !CanTailCallOpt) {
+    // There are types of incoming/outgoing arguments we can't handle yet, so
+    // it doesn't make sense to actually die here like in ISelLowering. Instead,
+    // fall back to SelectionDAG and let it try to handle this.
+    LLVM_DEBUG(dbgs() << "Failed to lower musttail call as tail call\n");
+    return false;
   }
+  if (CanTailCallOpt)
+    return lowerTailCall(MIRBuilder, Info, OutArgs);
+
   // Find out which ABI gets to decide where things go.
-  const AArch64TargetLowering &TLI = *getTLI<AArch64TargetLowering>();
-  CCAssignFn *AssignFnFixed =
-      TLI.CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
-  CCAssignFn *AssignFnVarArg =
-      TLI.CCAssignFnForCall(CallConv, /*IsVarArg=*/true);
+  CCAssignFn *AssignFnFixed;
+  CCAssignFn *AssignFnVarArg;
+  std::tie(AssignFnFixed, AssignFnVarArg) =
+      getAssignFnsForCC(Info.CallConv, TLI);
-  auto CallSeqStart = MIRBuilder.buildInstr(AArch64::ADJCALLSTACKDOWN);
+  MachineInstrBuilder CallSeqStart;
+  CallSeqStart = MIRBuilder.buildInstr(AArch64::ADJCALLSTACKDOWN);
   // Create a temporarily-floating call instruction so we can add the implicit
   // uses of arg registers.
-  auto MIB = MIRBuilder.buildInstrNoInsert(Callee.isReg() ? AArch64::BLR
-                                                          : AArch64::BL);
-  MIB.add(Callee);
+  unsigned Opc = getCallOpcode(F, Info.Callee.isReg(), false);
+
+  auto MIB = MIRBuilder.buildInstrNoInsert(Opc);
+  MIB.add(Info.Callee);
   // Tell the call which registers are clobbered.
   auto TRI = MF.getSubtarget<AArch64Subtarget>().getRegisterInfo();
@@ -448,8 +983,8 @@ bool AArch64CallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
   // Do the actual argument marshalling.
   SmallVector<unsigned, 8> PhysRegs;
   OutgoingArgHandler Handler(MIRBuilder, MRI, MIB, AssignFnFixed,
-                             AssignFnVarArg);
-  if (!handleAssignments(MIRBuilder, SplitArgs, Handler))
+                             AssignFnVarArg, false);
+  if (!handleAssignments(MIRBuilder, OutArgs, Handler))
     return false;
   // Now we can add the actual call instruction to the correct basic block.
@@ -458,34 +993,37 @@ bool AArch64CallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
   // If Callee is a reg, since it is used by a target specific
   // instruction, it must have a register class matching the
   // constraint of that instruction.
-  if (Callee.isReg())
+  if (Info.Callee.isReg())
     MIB->getOperand(0).setReg(constrainOperandRegClass(
         MF, *TRI, MRI, *MF.getSubtarget().getInstrInfo(),
-        *MF.getSubtarget().getRegBankInfo(), *MIB, MIB->getDesc(), Callee, 0));
+        *MF.getSubtarget().getRegBankInfo(), *MIB, MIB->getDesc(), Info.Callee,
+        0));
   // Finally we can copy the returned value back into its virtual-register. In
   // symmetry with the arguments, the physical register must be an
   // implicit-define of the call instruction.
-  CCAssignFn *RetAssignFn = TLI.CCAssignFnForReturn(F.getCallingConv());
-  if (!OrigRet.Ty->isVoidTy()) {
-    SplitArgs.clear();
-
-    splitToValueTypes(OrigRet, SplitArgs, DL, MRI, F.getCallingConv());
-
+  if (!Info.OrigRet.Ty->isVoidTy()) {
+    CCAssignFn *RetAssignFn = TLI.CCAssignFnForReturn(F.getCallingConv());
     CallReturnHandler Handler(MIRBuilder, MRI, MIB, RetAssignFn);
-    if (!handleAssignments(MIRBuilder, SplitArgs, Handler))
+    if (!handleAssignments(MIRBuilder, InArgs, Handler))
      return false;
   }
-  if (SwiftErrorVReg) {
+  if (Info.SwiftErrorVReg) {
     MIB.addDef(AArch64::X21, RegState::Implicit);
-    MIRBuilder.buildCopy(SwiftErrorVReg, Register(AArch64::X21));
+    MIRBuilder.buildCopy(Info.SwiftErrorVReg, Register(AArch64::X21));
   }
+  uint64_t CalleePopBytes =
+      doesCalleeRestoreStack(Info.CallConv,
+                             MF.getTarget().Options.GuaranteedTailCallOpt)
+          ? alignTo(Handler.StackSize, 16)
+          : 0;
+
   CallSeqStart.addImm(Handler.StackSize).addImm(0);
   MIRBuilder.buildInstr(AArch64::ADJCALLSTACKUP)
       .addImm(Handler.StackSize)
-      .addImm(0);
+      .addImm(CalleePopBytes);
   return true;
 }
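A condensed model of the FPDiff arithmetic used by lowerTailCall above, restated as a standalone function (this sketch assumes the caller's incoming argument area size is already 16-byte aligned, as lowerFormalArguments arranges under -tailcallopt):

    #include <cassert>

    unsigned alignTo16(unsigned N) { return (N + 15u) & ~15u; }

    // BytesInStackArgArea: stack bytes our own caller gave us for arguments.
    // NextStackOffset: stack bytes the callee's arguments need.
    int computeFPDiff(unsigned BytesInStackArgArea, unsigned NextStackOffset) {
      // The callee pops its argument stack as a tail call, so keep it
      // 16-byte aligned.
      unsigned NumBytes = alignTo16(NextStackOffset);
      // Negative when the tail call needs more stack than our incoming area;
      // positive when we actually shrink the stack.
      int FPDiff = (int)BytesInStackArgArea - (int)NumBytes;
      assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
      return FPDiff;
    }

Outgoing stack arguments are then written to fixed stack objects offset by FPDiff (see OutgoingArgHandler::getStackAddress), so they land exactly where the callee expects them once SP is reset.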
+ bool + isEligibleForTailCallOptimization(MachineIRBuilder &MIRBuilder, + CallLoweringInfo &Info, + SmallVectorImpl<ArgInfo> &InArgs, + SmallVectorImpl<ArgInfo> &OutArgs) const; bool supportSwiftError() const override { return true; } @@ -64,6 +63,18 @@ private: SmallVectorImpl<ArgInfo> &SplitArgs, const DataLayout &DL, MachineRegisterInfo &MRI, CallingConv::ID CallConv) const; + + bool lowerTailCall(MachineIRBuilder &MIRBuilder, CallLoweringInfo &Info, + SmallVectorImpl<ArgInfo> &OutArgs) const; + + bool + doCallerAndCalleePassArgsTheSameWay(CallLoweringInfo &Info, + MachineFunction &MF, + SmallVectorImpl<ArgInfo> &InArgs) const; + + bool + areCalleeOutgoingArgsTailCallable(CallLoweringInfo &Info, MachineFunction &MF, + SmallVectorImpl<ArgInfo> &OutArgs) const; }; } // end namespace llvm diff --git a/lib/Target/AArch64/AArch64CallingConvention.cpp b/lib/Target/AArch64/AArch64CallingConvention.cpp index 02538a187611..a0695cef615f 100644 --- a/lib/Target/AArch64/AArch64CallingConvention.cpp +++ b/lib/Target/AArch64/AArch64CallingConvention.cpp @@ -40,12 +40,14 @@ static bool finishStackBlock(SmallVectorImpl<CCValAssign> &PendingMembers, MVT LocVT, ISD::ArgFlagsTy &ArgFlags, CCState &State, unsigned SlotAlign) { unsigned Size = LocVT.getSizeInBits() / 8; - unsigned StackAlign = + const Align StackAlign = State.getMachineFunction().getDataLayout().getStackAlignment(); - unsigned Align = std::min(ArgFlags.getOrigAlign(), StackAlign); + const Align OrigAlign(ArgFlags.getOrigAlign()); + const Align Align = std::min(OrigAlign, StackAlign); for (auto &It : PendingMembers) { - It.convertToMem(State.AllocateStack(Size, std::max(Align, SlotAlign))); + It.convertToMem(State.AllocateStack( + Size, std::max((unsigned)Align.value(), SlotAlign))); State.addLoc(It); SlotAlign = 1; } @@ -79,10 +81,14 @@ static bool CC_AArch64_Custom_Stack_Block( static bool CC_AArch64_Custom_Block(unsigned &ValNo, MVT &ValVT, MVT &LocVT, CCValAssign::LocInfo &LocInfo, ISD::ArgFlagsTy &ArgFlags, CCState &State) { + const AArch64Subtarget &Subtarget = static_cast<const AArch64Subtarget &>( + State.getMachineFunction().getSubtarget()); + bool IsDarwinILP32 = Subtarget.isTargetILP32() && Subtarget.isTargetMachO(); + // Try to allocate a contiguous block of registers, each of the correct // size to hold one member. ArrayRef<MCPhysReg> RegList; - if (LocVT.SimpleTy == MVT::i64) + if (LocVT.SimpleTy == MVT::i64 || (IsDarwinILP32 && LocVT.SimpleTy == MVT::i32)) RegList = XRegList; else if (LocVT.SimpleTy == MVT::f16) RegList = HRegList; @@ -107,8 +113,12 @@ static bool CC_AArch64_Custom_Block(unsigned &ValNo, MVT &ValVT, MVT &LocVT, if (!ArgFlags.isInConsecutiveRegsLast()) return true; - unsigned RegResult = State.AllocateRegBlock(RegList, PendingMembers.size()); - if (RegResult) { + // [N x i32] arguments get packed into x-registers on Darwin's arm64_32 + // because that's how the armv7k Clang front-end emits small structs. + unsigned EltsPerReg = (IsDarwinILP32 && LocVT.SimpleTy == MVT::i32) ? 
2 : 1; + unsigned RegResult = State.AllocateRegBlock( + RegList, alignTo(PendingMembers.size(), EltsPerReg) / EltsPerReg); + if (RegResult && EltsPerReg == 1) { for (auto &It : PendingMembers) { It.convertToReg(RegResult); State.addLoc(It); @@ -116,14 +126,26 @@ static bool CC_AArch64_Custom_Block(unsigned &ValNo, MVT &ValVT, MVT &LocVT, } PendingMembers.clear(); return true; + } else if (RegResult) { + assert(EltsPerReg == 2 && "unexpected ABI"); + bool UseHigh = false; + CCValAssign::LocInfo Info; + for (auto &It : PendingMembers) { + Info = UseHigh ? CCValAssign::AExtUpper : CCValAssign::ZExt; + State.addLoc(CCValAssign::getReg(It.getValNo(), MVT::i32, RegResult, + MVT::i64, Info)); + UseHigh = !UseHigh; + if (!UseHigh) + ++RegResult; + } + PendingMembers.clear(); + return true; } // Mark all regs in the class as unavailable for (auto Reg : RegList) State.AllocateReg(Reg); - const AArch64Subtarget &Subtarget = static_cast<const AArch64Subtarget &>( - State.getMachineFunction().getSubtarget()); unsigned SlotAlign = Subtarget.isTargetDarwin() ? 1 : 8; return finishStackBlock(PendingMembers, LocVT, ArgFlags, State, SlotAlign); diff --git a/lib/Target/AArch64/AArch64CallingConvention.h b/lib/Target/AArch64/AArch64CallingConvention.h index 13cc0c583fd2..5a55d090d7c8 100644 --- a/lib/Target/AArch64/AArch64CallingConvention.h +++ b/lib/Target/AArch64/AArch64CallingConvention.h @@ -25,6 +25,9 @@ bool CC_AArch64_DarwinPCS_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, bool CC_AArch64_DarwinPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State); +bool CC_AArch64_DarwinPCS_ILP32_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, + CCValAssign::LocInfo LocInfo, + ISD::ArgFlagsTy ArgFlags, CCState &State); bool CC_AArch64_Win64_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State); diff --git a/lib/Target/AArch64/AArch64CallingConvention.td b/lib/Target/AArch64/AArch64CallingConvention.td index d969a9e1ab3a..bccbbd4591ed 100644 --- a/lib/Target/AArch64/AArch64CallingConvention.td +++ b/lib/Target/AArch64/AArch64CallingConvention.td @@ -17,6 +17,10 @@ class CCIfAlign<string Align, CCAction A> : class CCIfBigEndian<CCAction A> : CCIf<"State.getMachineFunction().getDataLayout().isBigEndian()", A>; +class CCIfILP32<CCAction A> : + CCIf<"State.getMachineFunction().getDataLayout().getPointerSize() == 4", A>; + + //===----------------------------------------------------------------------===// // ARM AAPCS64 Calling Convention //===----------------------------------------------------------------------===// @@ -70,6 +74,18 @@ def CC_AArch64_AAPCS : CallingConv<[ CCIfConsecutiveRegs<CCCustom<"CC_AArch64_Custom_Block">>, + CCIfType<[nxv16i8, nxv8i16, nxv4i32, nxv2i64, nxv2f16, nxv4f16, nxv8f16, + nxv1f32, nxv2f32, nxv4f32, nxv1f64, nxv2f64], + CCAssignToReg<[Z0, Z1, Z2, Z3, Z4, Z5, Z6, Z7]>>, + CCIfType<[nxv16i8, nxv8i16, nxv4i32, nxv2i64, nxv2f16, nxv4f16, nxv8f16, + nxv1f32, nxv2f32, nxv4f32, nxv1f64, nxv2f64], + CCPassIndirect<i64>>, + + CCIfType<[nxv2i1, nxv4i1, nxv8i1, nxv16i1], + CCAssignToReg<[P0, P1, P2, P3]>>, + CCIfType<[nxv2i1, nxv4i1, nxv8i1, nxv16i1], + CCPassIndirect<i64>>, + // Handle i1, i8, i16, i32, i64, f32, f64 and v2f64 by passing in registers, // up to eight each of GPR and FPR. 
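(Aside on the CC_AArch64_Custom_Block change above: the ZExt/AExtUpper alternation packs two i32 block members into each 64-bit x-register, even-index member in the low half, odd-index member in the high half. A standalone model of the resulting register image, using hypothetical names rather than LLVM API:

#include <cstdint>
#include <vector>

// arm64_32 [N x i32] packing: even-index members go zero-extended into the
// low half of an x-register, odd-index members into the high half, so the
// block occupies ceil(N / 2) registers.
std::vector<uint64_t> packI32Block(const std::vector<uint32_t> &Members) {
  std::vector<uint64_t> Regs((Members.size() + 1) / 2, 0);
  for (size_t I = 0; I < Members.size(); ++I) {
    uint64_t V = Members[I];
    Regs[I / 2] |= (I % 2) ? (V << 32) : V;
  }
  return Regs;
}

For a three-member block {a, b, c} this yields x-reg0 = (b << 32) | a and x-reg1 = zext(c), matching the layout the armv7k Clang front-end expects for small structs.)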
  CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
@@ -111,6 +127,7 @@ def RetCC_AArch64_AAPCS : CallingConv<[
   CCIfType<[v2f32], CCBitConvertToType<v2i32>>,
   CCIfType<[v2f64, v4f32], CCBitConvertToType<v2i64>>,
 
+  CCIfConsecutiveRegs<CCCustom<"CC_AArch64_Custom_Block">>,
   CCIfSwiftError<CCIfType<[i64], CCAssignToRegWithShadow<[X21], [W21]>>>,
 
   // Big endian vectors must be passed as if they were 1-element vectors so that
@@ -135,7 +152,14 @@ def RetCC_AArch64_AAPCS : CallingConv<[
       CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
                               [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
   CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16],
-      CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>
+      CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
+
+  CCIfType<[nxv16i8, nxv8i16, nxv4i32, nxv2i64, nxv2f16, nxv4f16, nxv8f16,
+            nxv1f32, nxv2f32, nxv4f32, nxv1f64, nxv2f64],
+           CCAssignToReg<[Z0, Z1, Z2, Z3, Z4, Z5, Z6, Z7]>>,
+
+  CCIfType<[nxv2i1, nxv4i1, nxv8i1, nxv16i1],
+           CCAssignToReg<[P0, P1, P2, P3]>>
 ]>;
 
 // Vararg functions on windows pass floats in integer registers
@@ -202,6 +226,12 @@ def CC_AArch64_DarwinPCS : CallingConv<[
   CCIf<"ValVT == MVT::i1 || ValVT == MVT::i8", CCAssignToStack<1, 1>>,
   CCIf<"ValVT == MVT::i16 || ValVT == MVT::f16", CCAssignToStack<2, 2>>,
   CCIfType<[i32, f32], CCAssignToStack<4, 4>>,
+
+  // Re-demote pointers to 32 bits so we don't end up storing 64-bit
+  // values and clobbering neighbouring stack locations. Not very pretty.
+  CCIfPtr<CCIfILP32<CCTruncToType<i32>>>,
+  CCIfPtr<CCIfILP32<CCAssignToStack<4, 4>>>,
+
   CCIfType<[i64, f64, v1f64, v2f32, v1i64, v2i32, v4i16, v8i8, v4f16],
            CCAssignToStack<8, 8>>,
   CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16],
@@ -229,6 +259,29 @@ def CC_AArch64_DarwinPCS_VarArg : CallingConv<[
            CCAssignToStack<16, 16>>
 ]>;
 
+// In the ILP32 world, the minimum stack slot size is 4 bytes. Otherwise the
+// same as the normal Darwin VarArgs handling.
+let Entry = 1 in
+def CC_AArch64_DarwinPCS_ILP32_VarArg : CallingConv<[
+  CCIfType<[v2f32], CCBitConvertToType<v2i32>>,
+  CCIfType<[v2f64, v4f32, f128], CCBitConvertToType<v2i64>>,
+
+  // Handle all scalar types as either i32 or f32.
+  CCIfType<[i8, i16], CCPromoteToType<i32>>,
+  CCIfType<[f16], CCPromoteToType<f32>>,
+
+  // Everything is on the stack.
+  // i128 is split to two i64s, and its stack alignment is 16 bytes.
+  CCIfPtr<CCIfILP32<CCTruncToType<i32>>>,
+  CCIfType<[i32, f32], CCAssignToStack<4, 4>>,
+  CCIfType<[i64], CCIfSplit<CCAssignToStack<8, 16>>>,
+  CCIfType<[i64, f64, v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16],
+           CCAssignToStack<8, 8>>,
+  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16],
+           CCAssignToStack<16, 16>>
+]>;
+
+
 // The WebKit_JS calling convention only passes the first argument (the callee)
 // in register and the remaining arguments on stack. We allow 32-bit stack slots,
 // so that WebKit can write partial values in the stack and define the other
@@ -298,6 +351,12 @@ def CC_AArch64_GHC : CallingConv<[
   CCIfType<[i64], CCAssignToReg<[X19, X20, X21, X22, X23, X24, X25, X26, X27, X28]>>
 ]>;
 
+// The order of the callee-saves in this file is important, because the
+// FrameLowering code will use this order to determine the layout of the
+// callee-save area in the stack frame. As can be observed below, Darwin
+// requires the frame-record (LR, FP) to be at the top of the callee-save
+// area, whereas for other platforms they are at the bottom.
+
 // FIXME: LR is only callee-saved in the sense that *we* preserve it and are
 // presumably a callee to someone.
External functions may not do so, but this // is currently safe since BL has LR as an implicit-def and what happens after a @@ -306,7 +365,13 @@ def CC_AArch64_GHC : CallingConv<[ // It would be better to model its preservation semantics properly (create a // vreg on entry, use it in RET & tail call generation; make that vreg def if we // end up saving LR as part of a call frame). Watch this space... -def CSR_AArch64_AAPCS : CalleeSavedRegs<(add LR, FP, X19, X20, X21, X22, +def CSR_AArch64_AAPCS : CalleeSavedRegs<(add X19, X20, X21, X22, X23, X24, + X25, X26, X27, X28, LR, FP, + D8, D9, D10, D11, + D12, D13, D14, D15)>; + +// Darwin puts the frame-record at the top of the callee-save area. +def CSR_Darwin_AArch64_AAPCS : CalleeSavedRegs<(add LR, FP, X19, X20, X21, X22, X23, X24, X25, X26, X27, X28, D8, D9, D10, D11, D12, D13, D14, D15)>; @@ -314,17 +379,24 @@ def CSR_AArch64_AAPCS : CalleeSavedRegs<(add LR, FP, X19, X20, X21, X22, // Win64 has unwinding codes for an (FP,LR) pair, save_fplr and save_fplr_x. // We put FP before LR, so that frame lowering logic generates (FP,LR) pairs, // and not (LR,FP) pairs. -def CSR_Win_AArch64_AAPCS : CalleeSavedRegs<(add FP, LR, X19, X20, X21, X22, - X23, X24, X25, X26, X27, X28, +def CSR_Win_AArch64_AAPCS : CalleeSavedRegs<(add X19, X20, X21, X22, X23, X24, + X25, X26, X27, X28, FP, LR, D8, D9, D10, D11, D12, D13, D14, D15)>; // AArch64 PCS for vector functions (VPCS) // must (additionally) preserve full Q8-Q23 registers -def CSR_AArch64_AAVPCS : CalleeSavedRegs<(add LR, FP, X19, X20, X21, X22, - X23, X24, X25, X26, X27, X28, +def CSR_AArch64_AAVPCS : CalleeSavedRegs<(add X19, X20, X21, X22, X23, X24, + X25, X26, X27, X28, LR, FP, (sequence "Q%u", 8, 23))>; +// Functions taking SVE arguments or returning an SVE type +// must (additionally) preserve full Z8-Z23 and predicate registers P4-P15 +def CSR_AArch64_SVE_AAPCS : CalleeSavedRegs<(add X19, X20, X21, X22, X23, X24, + X25, X26, X27, X28, LR, FP, + (sequence "Z%u", 8, 23), + (sequence "P%u", 4, 15))>; + // Constructors and destructors return 'this' in the iOS 64-bit C++ ABI; since // 'this' and the pointer return value are both passed in X0 in these cases, // this can be partially modelled by treating X0 as a callee-saved register; @@ -336,7 +408,7 @@ def CSR_AArch64_AAVPCS : CalleeSavedRegs<(add LR, FP, X19, X20, X21, X22, def CSR_AArch64_AAPCS_ThisReturn : CalleeSavedRegs<(add CSR_AArch64_AAPCS, X0)>; def CSR_AArch64_AAPCS_SwiftError - : CalleeSavedRegs<(sub CSR_AArch64_AAPCS, X21)>; + : CalleeSavedRegs<(sub CSR_Darwin_AArch64_AAPCS, X21)>; // The function used by Darwin to obtain the address of a thread-local variable // guarantees more than a normal AAPCS function. x16 and x17 are used on the @@ -352,7 +424,7 @@ def CSR_AArch64_TLS_Darwin // fast path calls a function that follows CSR_AArch64_TLS_Darwin, // CSR_AArch64_CXX_TLS_Darwin should be a subset of CSR_AArch64_TLS_Darwin. 
 def CSR_AArch64_CXX_TLS_Darwin
-    : CalleeSavedRegs<(add CSR_AArch64_AAPCS,
+    : CalleeSavedRegs<(add CSR_Darwin_AArch64_AAPCS,
                            (sub (sequence "X%u", 1, 28), X15, X16, X17, X18),
                            (sequence "D%u", 0, 31))>;
 
diff --git a/lib/Target/AArch64/AArch64CollectLOH.cpp b/lib/Target/AArch64/AArch64CollectLOH.cpp
index 9f324b433209..35e6fef24363 100644
--- a/lib/Target/AArch64/AArch64CollectLOH.cpp
+++ b/lib/Target/AArch64/AArch64CollectLOH.cpp
@@ -103,6 +103,7 @@
 #include "llvm/ADT/BitVector.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
@@ -181,6 +182,7 @@ static bool canDefBePartOfLOH(const MachineInstr &MI) {
   case AArch64::ADDXri:
     return canAddBePartOfLOH(MI);
   case AArch64::LDRXui:
+  case AArch64::LDRWui:
     // Check immediate to see if the immediate is an address.
     switch (MI.getOperand(2).getType()) {
     default:
@@ -312,7 +314,8 @@ static void handleUse(const MachineInstr &MI, const MachineOperand &MO,
     Info.Type = MCLOH_AdrpAdd;
     Info.IsCandidate = true;
     Info.MI0 = &MI;
-  } else if (MI.getOpcode() == AArch64::LDRXui &&
+  } else if ((MI.getOpcode() == AArch64::LDRXui ||
+              MI.getOpcode() == AArch64::LDRWui) &&
              MI.getOperand(2).getTargetFlags() & AArch64II::MO_GOT) {
     Info.Type = MCLOH_AdrpLdrGot;
     Info.IsCandidate = true;
@@ -357,7 +360,9 @@ static bool handleMiddleInst(const MachineInstr &MI, LOHInfo &DefInfo,
       return true;
     }
   } else {
-    assert(MI.getOpcode() == AArch64::LDRXui && "Expect LDRXui");
+    assert((MI.getOpcode() == AArch64::LDRXui ||
+            MI.getOpcode() == AArch64::LDRWui) &&
+           "Expect LDRXui or LDRWui");
     assert((MI.getOperand(2).getTargetFlags() & AArch64II::MO_GOT) &&
            "Expected GOT relocation");
     if (OpInfo.Type == MCLOH_AdrpAddStr && OpInfo.MI1 == nullptr) {
@@ -474,13 +479,23 @@ static void handleNormalInst(const MachineInstr &MI, LOHInfo *LOHInfos) {
     handleClobber(LOHInfos[Idx]);
   }
   // Handle uses.
+
+  SmallSet<int, 4> UsesSeen;
   for (const MachineOperand &MO : MI.uses()) {
     if (!MO.isReg() || !MO.readsReg())
       continue;
     int Idx = mapRegToGPRIndex(MO.getReg());
     if (Idx < 0)
       continue;
-    handleUse(MI, MO, LOHInfos[Idx]);
+
+    // Multiple uses of the same register within a single instruction don't
+    // count as MultiUser or block the optimization. This is especially
+    // important on arm64_32, where any memory operation is likely to be an
+    // explicit use of xN and an implicit use of wN (the base address
+    // register).
+    if (!UsesSeen.count(Idx)) {
+      handleUse(MI, MO, LOHInfos[Idx]);
+      UsesSeen.insert(Idx);
+    }
   }
 }
 
@@ -512,6 +527,7 @@ bool AArch64CollectLOH::runOnMachineFunction(MachineFunction &MF) {
     switch (Opcode) {
     case AArch64::ADDXri:
     case AArch64::LDRXui:
+    case AArch64::LDRWui:
       if (canDefBePartOfLOH(MI)) {
         const MachineOperand &Def = MI.getOperand(0);
         const MachineOperand &Op = MI.getOperand(1);
diff --git a/lib/Target/AArch64/AArch64Combine.td b/lib/Target/AArch64/AArch64Combine.td
new file mode 100644
index 000000000000..bb99f2516ecf
--- /dev/null
+++ b/lib/Target/AArch64/AArch64Combine.td
@@ -0,0 +1,18 @@
+//=- AArch64Combine.td - Define AArch64 Combine Rules --------*- tablegen -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + +include "llvm/Target/GlobalISel/Combine.td" + +def AArch64PreLegalizerCombinerHelper: GICombinerHelper< + "AArch64GenPreLegalizerCombinerHelper", [all_combines, + elide_br_by_inverting_cond]> { + let DisableRuleOption = "aarch64prelegalizercombiner-disable-rule"; +} diff --git a/lib/Target/AArch64/AArch64CondBrTuning.cpp b/lib/Target/AArch64/AArch64CondBrTuning.cpp index 453132e09669..25e23e4623de 100644 --- a/lib/Target/AArch64/AArch64CondBrTuning.cpp +++ b/lib/Target/AArch64/AArch64CondBrTuning.cpp @@ -78,7 +78,7 @@ void AArch64CondBrTuning::getAnalysisUsage(AnalysisUsage &AU) const { } MachineInstr *AArch64CondBrTuning::getOperandDef(const MachineOperand &MO) { - if (!TargetRegisterInfo::isVirtualRegister(MO.getReg())) + if (!Register::isVirtualRegister(MO.getReg())) return nullptr; return MRI->getUniqueVRegDef(MO.getReg()); } @@ -98,7 +98,7 @@ MachineInstr *AArch64CondBrTuning::convertToFlagSetting(MachineInstr &MI, } bool Is64Bit; unsigned NewOpc = TII->convertToFlagSettingOpc(MI.getOpcode(), Is64Bit); - unsigned NewDestReg = MI.getOperand(0).getReg(); + Register NewDestReg = MI.getOperand(0).getReg(); if (MRI->hasOneNonDBGUse(MI.getOperand(0).getReg())) NewDestReg = Is64Bit ? AArch64::XZR : AArch64::WZR; diff --git a/lib/Target/AArch64/AArch64ConditionalCompares.cpp b/lib/Target/AArch64/AArch64ConditionalCompares.cpp index 2cfbcc592d6a..43ae9f8ec47f 100644 --- a/lib/Target/AArch64/AArch64ConditionalCompares.cpp +++ b/lib/Target/AArch64/AArch64ConditionalCompares.cpp @@ -220,7 +220,7 @@ bool SSACCmpConv::trivialTailPHIs() { // PHI operands come in (VReg, MBB) pairs. for (unsigned oi = 1, oe = I.getNumOperands(); oi != oe; oi += 2) { MachineBasicBlock *MBB = I.getOperand(oi + 1).getMBB(); - unsigned Reg = I.getOperand(oi).getReg(); + Register Reg = I.getOperand(oi).getReg(); if (MBB == Head) { assert((!HeadReg || HeadReg == Reg) && "Inconsistent PHI operands"); HeadReg = Reg; @@ -259,7 +259,7 @@ bool SSACCmpConv::isDeadDef(unsigned DstReg) { // Writes to the zero register are dead. if (DstReg == AArch64::WZR || DstReg == AArch64::XZR) return true; - if (!TargetRegisterInfo::isVirtualRegister(DstReg)) + if (!Register::isVirtualRegister(DstReg)) return false; // A virtual register def without any uses will be marked dead later, and // eventually replaced by the zero register. @@ -631,7 +631,7 @@ void SSACCmpConv::convert(SmallVectorImpl<MachineBasicBlock *> &RemovedBlocks) { } const MCInstrDesc &MCID = TII->get(Opc); // Create a dummy virtual register for the SUBS def. - unsigned DestReg = + Register DestReg = MRI->createVirtualRegister(TII->getRegClass(MCID, 0, TRI, *MF)); // Insert a SUBS Rn, #0 instruction instead of the cbz / cbnz. BuildMI(*Head, Head->end(), TermDL, MCID) diff --git a/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp b/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp index a43077cb88ec..bc3808df1dbc 100644 --- a/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp +++ b/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp @@ -145,8 +145,8 @@ void AArch64DeadRegisterDefinitions::processMachineBasicBlock( continue; // We should not have any relevant physreg defs that are replacable by // zero before register allocation. So we just check for dead vreg defs. 
- unsigned Reg = MO.getReg(); - if (!TargetRegisterInfo::isVirtualRegister(Reg) || + Register Reg = MO.getReg(); + if (!Register::isVirtualRegister(Reg) || (!MO.isDead() && !MRI->use_nodbg_empty(Reg))) continue; assert(!MO.isImplicit() && "Unexpected implicit def!"); diff --git a/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp index 210c10eb1842..082e17e44d04 100644 --- a/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ b/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -109,7 +109,7 @@ bool AArch64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned BitSize) { MachineInstr &MI = *MBBI; - unsigned DstReg = MI.getOperand(0).getReg(); + Register DstReg = MI.getOperand(0).getReg(); uint64_t Imm = MI.getOperand(1).getImm(); if (DstReg == AArch64::XZR || DstReg == AArch64::WZR) { @@ -150,7 +150,7 @@ bool AArch64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB, } break; case AArch64::MOVKWi: case AArch64::MOVKXi: { - unsigned DstReg = MI.getOperand(0).getReg(); + Register DstReg = MI.getOperand(0).getReg(); bool DstIsDead = MI.getOperand(0).isDead(); MIBS.push_back(BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(I->Opcode)) .addReg(DstReg, @@ -174,14 +174,14 @@ bool AArch64ExpandPseudo::expandCMP_SWAP( MachineInstr &MI = *MBBI; DebugLoc DL = MI.getDebugLoc(); const MachineOperand &Dest = MI.getOperand(0); - unsigned StatusReg = MI.getOperand(1).getReg(); + Register StatusReg = MI.getOperand(1).getReg(); bool StatusDead = MI.getOperand(1).isDead(); // Duplicating undef operands into 2 instructions does not guarantee the same // value on both; However undef should be replaced by xzr anyway. assert(!MI.getOperand(2).isUndef() && "cannot handle undef"); - unsigned AddrReg = MI.getOperand(2).getReg(); - unsigned DesiredReg = MI.getOperand(3).getReg(); - unsigned NewReg = MI.getOperand(4).getReg(); + Register AddrReg = MI.getOperand(2).getReg(); + Register DesiredReg = MI.getOperand(3).getReg(); + Register NewReg = MI.getOperand(4).getReg(); MachineFunction *MF = MBB.getParent(); auto LoadCmpBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); @@ -254,16 +254,16 @@ bool AArch64ExpandPseudo::expandCMP_SWAP_128( DebugLoc DL = MI.getDebugLoc(); MachineOperand &DestLo = MI.getOperand(0); MachineOperand &DestHi = MI.getOperand(1); - unsigned StatusReg = MI.getOperand(2).getReg(); + Register StatusReg = MI.getOperand(2).getReg(); bool StatusDead = MI.getOperand(2).isDead(); // Duplicating undef operands into 2 instructions does not guarantee the same // value on both; However undef should be replaced by xzr anyway. 
assert(!MI.getOperand(3).isUndef() && "cannot handle undef"); - unsigned AddrReg = MI.getOperand(3).getReg(); - unsigned DesiredLoReg = MI.getOperand(4).getReg(); - unsigned DesiredHiReg = MI.getOperand(5).getReg(); - unsigned NewLoReg = MI.getOperand(6).getReg(); - unsigned NewHiReg = MI.getOperand(7).getReg(); + Register AddrReg = MI.getOperand(3).getReg(); + Register DesiredLoReg = MI.getOperand(4).getReg(); + Register DesiredHiReg = MI.getOperand(5).getReg(); + Register NewLoReg = MI.getOperand(6).getReg(); + Register NewHiReg = MI.getOperand(7).getReg(); MachineFunction *MF = MBB.getParent(); auto LoadCmpBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); @@ -475,7 +475,7 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, case AArch64::LOADgot: { MachineFunction *MF = MBB.getParent(); - unsigned DstReg = MI.getOperand(0).getReg(); + Register DstReg = MI.getOperand(0).getReg(); const MachineOperand &MO1 = MI.getOperand(1); unsigned Flags = MO1.getTargetFlags(); @@ -495,12 +495,26 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, } } else { // Small codemodel expand into ADRP + LDR. + MachineFunction &MF = *MI.getParent()->getParent(); + DebugLoc DL = MI.getDebugLoc(); MachineInstrBuilder MIB1 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ADRP), DstReg); - MachineInstrBuilder MIB2 = - BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::LDRXui)) - .add(MI.getOperand(0)) - .addReg(DstReg); + + MachineInstrBuilder MIB2; + if (MF.getSubtarget<AArch64Subtarget>().isTargetILP32()) { + auto TRI = MBB.getParent()->getSubtarget().getRegisterInfo(); + unsigned Reg32 = TRI->getSubReg(DstReg, AArch64::sub_32); + unsigned DstFlags = MI.getOperand(0).getTargetFlags(); + MIB2 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::LDRWui)) + .addDef(Reg32) + .addReg(DstReg, RegState::Kill) + .addReg(DstReg, DstFlags | RegState::Implicit); + } else { + unsigned DstReg = MI.getOperand(0).getReg(); + MIB2 = BuildMI(MBB, MBBI, DL, TII->get(AArch64::LDRXui)) + .add(MI.getOperand(0)) + .addUse(DstReg, RegState::Kill); + } if (MO1.isGlobal()) { MIB1.addGlobalAddress(MO1.getGlobal(), 0, Flags | AArch64II::MO_PAGE); @@ -534,11 +548,28 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, case AArch64::MOVaddrTLS: case AArch64::MOVaddrEXT: { // Expand into ADRP + ADD. - unsigned DstReg = MI.getOperand(0).getReg(); + Register DstReg = MI.getOperand(0).getReg(); MachineInstrBuilder MIB1 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ADRP), DstReg) .add(MI.getOperand(1)); + if (MI.getOperand(1).getTargetFlags() & AArch64II::MO_TAGGED) { + // MO_TAGGED on the page indicates a tagged address. Set the tag now. + // We do so by creating a MOVK that sets bits 48-63 of the register to + // (global address + 0x100000000 - PC) >> 48. This assumes that we're in + // the small code model so we can assume a binary size of <= 4GB, which + // makes the untagged PC relative offset positive. The binary must also be + // loaded into address range [0, 2^48). Both of these properties need to + // be ensured at runtime when using tagged addresses. 
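(To make the arithmetic in the comment above concrete: a standalone model, not LLVM API, of the 16-bit immediate the emitted MOVK writes into bits 48-63 of the address register:

#include <cstdint>

// MO_TAGGED relocation value: bits 48-63 of (global address + 0x100000000 -
// PC). The +2^32 bias keeps the untagged PC-relative offset positive under
// the small code model (binary size <= 4 GiB), so the subtraction in the low
// bits cannot borrow into, and corrupt, the tag bits.
uint16_t movkTagImm(uint64_t TaggedGlobalAddr, uint64_t PC) {
  return static_cast<uint16_t>((TaggedGlobalAddr + 0x100000000ULL - PC) >> 48);
}

The [0, 2^48) load-address requirement mentioned above is what guarantees the untagged part of the sum stays below the bit-48 boundary.)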
+ auto Tag = MI.getOperand(1); + Tag.setTargetFlags(AArch64II::MO_PREL | AArch64II::MO_G3); + Tag.setOffset(0x100000000); + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MOVKXi), DstReg) + .addReg(DstReg) + .add(Tag) + .addImm(48); + } + MachineInstrBuilder MIB2 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ADDXri)) .add(MI.getOperand(0)) @@ -561,7 +592,7 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, return true; case AArch64::MOVbaseTLS: { - unsigned DstReg = MI.getOperand(0).getReg(); + Register DstReg = MI.getOperand(0).getReg(); auto SysReg = AArch64SysReg::TPIDR_EL0; MachineFunction *MF = MBB.getParent(); if (MF->getTarget().getTargetTriple().isOSFuchsia() && @@ -642,11 +673,12 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, // instruction sequence. int BaseOffset = -AFI->getTaggedBasePointerOffset(); unsigned FrameReg; - int FrameRegOffset = TFI->resolveFrameOffsetReference( - MF, BaseOffset, false /*isFixed*/, FrameReg, /*PreferFP=*/false, + StackOffset FrameRegOffset = TFI->resolveFrameOffsetReference( + MF, BaseOffset, false /*isFixed*/, false /*isSVE*/, FrameReg, + /*PreferFP=*/false, /*ForSimm=*/true); Register SrcReg = FrameReg; - if (FrameRegOffset != 0) { + if (FrameRegOffset) { // Use output register as temporary. SrcReg = MI.getOperand(0).getReg(); emitFrameOffset(MBB, &MI, MI.getDebugLoc(), SrcReg, FrameReg, diff --git a/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp b/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp index 3b3182128c4c..b54fc2e51bac 100644 --- a/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp +++ b/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp @@ -642,7 +642,7 @@ static Optional<LoadInfo> getLoadInfo(const MachineInstr &MI) { } // Loads from the stack pointer don't get prefetched. 
- unsigned BaseReg = MI.getOperand(BaseRegIdx).getReg(); + Register BaseReg = MI.getOperand(BaseRegIdx).getReg(); if (BaseReg == AArch64::SP || BaseReg == AArch64::WSP) return None; diff --git a/lib/Target/AArch64/AArch64FastISel.cpp b/lib/Target/AArch64/AArch64FastISel.cpp index 8dc2768b9597..277a3052f1e5 100644 --- a/lib/Target/AArch64/AArch64FastISel.cpp +++ b/lib/Target/AArch64/AArch64FastISel.cpp @@ -459,7 +459,7 @@ unsigned AArch64FastISel::materializeGV(const GlobalValue *GV) { if (!Subtarget->useSmallAddressing() && !Subtarget->isTargetMachO()) return 0; - unsigned char OpFlags = Subtarget->ClassifyGlobalReference(GV, TM); + unsigned OpFlags = Subtarget->ClassifyGlobalReference(GV, TM); EVT DestEVT = TLI.getValueType(DL, GV->getType(), true); if (!DestEVT.isSimple()) @@ -474,12 +474,32 @@ unsigned AArch64FastISel::materializeGV(const GlobalValue *GV) { ADRPReg) .addGlobalAddress(GV, 0, AArch64II::MO_PAGE | OpFlags); - ResultReg = createResultReg(&AArch64::GPR64RegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::LDRXui), + unsigned LdrOpc; + if (Subtarget->isTargetILP32()) { + ResultReg = createResultReg(&AArch64::GPR32RegClass); + LdrOpc = AArch64::LDRWui; + } else { + ResultReg = createResultReg(&AArch64::GPR64RegClass); + LdrOpc = AArch64::LDRXui; + } + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(LdrOpc), ResultReg) - .addReg(ADRPReg) - .addGlobalAddress(GV, 0, - AArch64II::MO_PAGEOFF | AArch64II::MO_NC | OpFlags); + .addReg(ADRPReg) + .addGlobalAddress(GV, 0, AArch64II::MO_GOT | AArch64II::MO_PAGEOFF | + AArch64II::MO_NC | OpFlags); + if (!Subtarget->isTargetILP32()) + return ResultReg; + + // LDRWui produces a 32-bit register, but pointers in-register are 64-bits + // so we must extend the result on ILP32. + unsigned Result64 = createResultReg(&AArch64::GPR64RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::SUBREG_TO_REG)) + .addDef(Result64) + .addImm(0) + .addReg(ResultReg, RegState::Kill) + .addImm(AArch64::sub_32); + return Result64; } else { // ADRP + ADDX BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADRP), @@ -504,6 +524,15 @@ unsigned AArch64FastISel::fastMaterializeConstant(const Constant *C) { if (!CEVT.isSimple()) return 0; MVT VT = CEVT.getSimpleVT(); + // arm64_32 has 32-bit pointers held in 64-bit registers. Because of that, + // 'null' pointers need to have a somewhat special treatment. + if (const auto *CPN = dyn_cast<ConstantPointerNull>(C)) { + (void)CPN; + assert(CPN->getType()->getPointerAddressSpace() == 0 && + "Unexpected address space"); + assert(VT == MVT::i64 && "Expected 64-bit pointers"); + return materializeInt(ConstantInt::get(Type::getInt64Ty(*Context), 0), VT); + } if (const auto *CI = dyn_cast<ConstantInt>(C)) return materializeInt(CI, VT); @@ -946,6 +975,9 @@ bool AArch64FastISel::computeCallAddress(const Value *V, Address &Addr) { bool AArch64FastISel::isTypeLegal(Type *Ty, MVT &VT) { EVT evt = TLI.getValueType(DL, Ty, true); + if (Subtarget->isTargetILP32() && Ty->isPointerTy()) + return false; + // Only handle simple types. 
if (evt == MVT::Other || !evt.isSimple()) return false; @@ -988,6 +1020,9 @@ bool AArch64FastISel::isValueAvailable(const Value *V) const { } bool AArch64FastISel::simplifyAddress(Address &Addr, MVT VT) { + if (Subtarget->isTargetILP32()) + return false; + unsigned ScaleFactor = getImplicitScaleFactor(VT); if (!ScaleFactor) return false; @@ -3165,6 +3200,11 @@ bool AArch64FastISel::fastLowerCall(CallLoweringInfo &CLI) { if (IsTailCall) return false; + // FIXME: we could and should support this, but for now correctness at -O0 is + // more important. + if (Subtarget->isTargetILP32()) + return false; + CodeModel::Model CM = TM.getCodeModel(); // Only support the small-addressing and large code models. if (CM != CodeModel::Large && !Subtarget->useSmallAddressing()) @@ -3434,8 +3474,8 @@ bool AArch64FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) { MFI.setFrameAddressIsTaken(true); const AArch64RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); - unsigned FramePtr = RegInfo->getFrameRegister(*(FuncInfo.MF)); - unsigned SrcReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass); + Register FramePtr = RegInfo->getFrameRegister(*(FuncInfo.MF)); + Register SrcReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), SrcReg).addReg(FramePtr); // Recursively load frame address @@ -3796,6 +3836,11 @@ bool AArch64FastISel::selectRet(const Instruction *I) { if (!FuncInfo.CanLowerReturn) return false; + // FIXME: in principle it could. Mostly just a case of zero extending outgoing + // pointers. + if (Subtarget->isTargetILP32()) + return false; + if (F.isVarArg()) return false; @@ -3842,7 +3887,7 @@ bool AArch64FastISel::selectRet(const Instruction *I) { return false; unsigned SrcReg = Reg + VA.getValNo(); - unsigned DestReg = VA.getLocReg(); + Register DestReg = VA.getLocReg(); // Avoid a cross-class copy. This is very unlikely. if (!MRI.getRegClass(SrcReg)->contains(DestReg)) return false; @@ -3970,7 +4015,7 @@ unsigned AArch64FastISel::emiti1Ext(unsigned SrcReg, MVT DestVT, bool IsZExt) { if (DestVT == MVT::i64) { // We're ZExt i1 to i64. The ANDWri Wd, Ws, #1 implicitly clears the // upper 32 bits. Emit a SUBREG_TO_REG to extend from Wd to Xd. 
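(Aside: the SUBREG_TO_REG idiom used in the hunk below, and in several later hunks, relies on the AArch64 rule that any write to a w-register architecturally zeroes the upper 32 bits of the corresponding x-register. A standalone sketch of the value transformation, illustrative rather than LLVM API:

#include <cstdint>

// ANDWri Wd, Ws, #1 clears bits 1-31, and the w-register write zeroes bits
// 32-63 of Xd, so SUBREG_TO_REG is pure bookkeeping: no extra extend
// instruction is needed for the i1 -> i64 zero-extension.
uint64_t zextI1ToI64(uint32_t Ws) { return static_cast<uint64_t>(Ws & 1u); }

)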
- unsigned Reg64 = MRI.createVirtualRegister(&AArch64::GPR64RegClass); + Register Reg64 = MRI.createVirtualRegister(&AArch64::GPR64RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::SUBREG_TO_REG), Reg64) .addImm(0) @@ -4123,7 +4168,7 @@ unsigned AArch64FastISel::emitLSL_ri(MVT RetVT, MVT SrcVT, unsigned Op0, }; unsigned Opc = OpcTable[IsZExt][Is64Bit]; if (SrcVT.SimpleTy <= MVT::i32 && RetVT == MVT::i64) { - unsigned TmpReg = MRI.createVirtualRegister(RC); + Register TmpReg = MRI.createVirtualRegister(RC); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::SUBREG_TO_REG), TmpReg) .addImm(0) @@ -4244,7 +4289,7 @@ unsigned AArch64FastISel::emitLSR_ri(MVT RetVT, MVT SrcVT, unsigned Op0, }; unsigned Opc = OpcTable[IsZExt][Is64Bit]; if (SrcVT.SimpleTy <= MVT::i32 && RetVT == MVT::i64) { - unsigned TmpReg = MRI.createVirtualRegister(RC); + Register TmpReg = MRI.createVirtualRegister(RC); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::SUBREG_TO_REG), TmpReg) .addImm(0) @@ -4353,7 +4398,7 @@ unsigned AArch64FastISel::emitASR_ri(MVT RetVT, MVT SrcVT, unsigned Op0, }; unsigned Opc = OpcTable[IsZExt][Is64Bit]; if (SrcVT.SimpleTy <= MVT::i32 && RetVT == MVT::i64) { - unsigned TmpReg = MRI.createVirtualRegister(RC); + Register TmpReg = MRI.createVirtualRegister(RC); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::SUBREG_TO_REG), TmpReg) .addImm(0) @@ -4412,7 +4457,7 @@ unsigned AArch64FastISel::emitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT, if (DestVT == MVT::i8 || DestVT == MVT::i16) DestVT = MVT::i32; else if (DestVT == MVT::i64) { - unsigned Src64 = MRI.createVirtualRegister(&AArch64::GPR64RegClass); + Register Src64 = MRI.createVirtualRegister(&AArch64::GPR64RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::SUBREG_TO_REG), Src64) .addImm(0) @@ -4495,7 +4540,7 @@ bool AArch64FastISel::optimizeIntExtLoad(const Instruction *I, MVT RetVT, const auto *LoadMI = MI; if (LoadMI->getOpcode() == TargetOpcode::COPY && LoadMI->getOperand(1).getSubReg() == AArch64::sub_32) { - unsigned LoadReg = MI->getOperand(1).getReg(); + Register LoadReg = MI->getOperand(1).getReg(); LoadMI = MRI.getUniqueVRegDef(LoadReg); assert(LoadMI && "Expected valid instruction"); } diff --git a/lib/Target/AArch64/AArch64FrameLowering.cpp b/lib/Target/AArch64/AArch64FrameLowering.cpp index 8c6e5cbd5c13..68e1e6a30224 100644 --- a/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -44,11 +44,19 @@ // | | // |-----------------------------------| // | | -// | prev_fp, prev_lr | +// | callee-saved gpr registers | <--. +// | | | On Darwin platforms these +// |- - - - - - - - - - - - - - - - - -| | callee saves are swapped, +// | | | (frame record first) +// | prev_fp, prev_lr | <--' // | (a.k.a. "frame record") | // |-----------------------------------| <- fp(=x29) // | | -// | other callee-saved registers | +// | callee-saved fp/simd/SVE regs | +// | | +// |-----------------------------------| +// | | +// | SVE stack objects | // | | // |-----------------------------------| // |.empty.space.to.make.part.below....| @@ -80,6 +88,20 @@ // * A frame pointer is definitely needed when there are local variables with // more-than-default alignment requirements. // +// For Darwin platforms the frame-record (fp, lr) is stored at the top of the +// callee-saved area, since the unwind encoding does not allow for encoding +// this dynamically and existing tools depend on this layout. 
For other +// platforms, the frame-record is stored at the bottom of the (gpr) callee-saved +// area to allow SVE stack objects (allocated directly below the callee-saves, +// if available) to be accessed directly from the framepointer. +// The SVE spill/fill instructions have VL-scaled addressing modes such +// as: +// ldr z8, [fp, #-7 mul vl] +// For SVE the size of the vector length (VL) is not known at compile-time, so +// '#-7 mul vl' is an offset that can only be evaluated at runtime. With this +// layout, we don't need to add an unscaled offset to the framepointer before +// accessing the SVE object in the frame. +// // In some cases when a base pointer is not strictly needed, it is generated // anyway when offsets from the frame pointer to access local variables become // so large that the offset can't be encoded in the immediate fields of loads @@ -94,6 +116,7 @@ #include "AArch64InstrInfo.h" #include "AArch64MachineFunctionInfo.h" #include "AArch64RegisterInfo.h" +#include "AArch64StackOffset.h" #include "AArch64Subtarget.h" #include "AArch64TargetMachine.h" #include "MCTargetDesc/AArch64AddressingModes.h" @@ -173,7 +196,7 @@ static unsigned estimateRSStackSizeLimit(MachineFunction &MF) { if (!MO.isFI()) continue; - int Offset = 0; + StackOffset Offset; if (isAArch64FrameOffsetLegal(MI, Offset, nullptr, nullptr, nullptr) == AArch64FrameOffsetCannotUpdate) return 0; @@ -183,6 +206,12 @@ static unsigned estimateRSStackSizeLimit(MachineFunction &MF) { return DefaultSafeSPDisplacement; } +/// Returns the size of the entire SVE stackframe (calleesaves + spills). +static StackOffset getSVEStackSize(const MachineFunction &MF) { + const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); + return {(int64_t)AFI->getStackSizeSVE(), MVT::nxv1i8}; +} + bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const { if (!EnableRedZone) return false; @@ -195,7 +224,8 @@ bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const { const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); unsigned NumBytes = AFI->getLocalStackSize(); - return !(MFI.hasCalls() || hasFP(MF) || NumBytes > 128); + return !(MFI.hasCalls() || hasFP(MF) || NumBytes > 128 || + getSVEStackSize(MF)); } /// hasFP - Return true if the specified function should have a dedicated frame @@ -273,14 +303,15 @@ MachineBasicBlock::iterator AArch64FrameLowering::eliminateCallFramePseudoInstr( // Most call frames will be allocated at the start of a function so // this is OK, but it is a limitation that needs dealing with. assert(Amount > -0xffffff && Amount < 0xffffff && "call frame too large"); - emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP, Amount, TII); + emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP, {Amount, MVT::i8}, + TII); } } else if (CalleePopAmount != 0) { // If the calling convention demands that the callee pops arguments from the // stack, we want to add it back if we have a reserved call frame. 
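(Aside: the {Amount, MVT::i8} values in the hunk above are the new StackOffset type from AArch64StackOffset.h, which pairs a fixed byte offset with a vector-length-scaled one such as {N, MVT::nxv1i8}. A minimal standalone sketch of the idea, with illustrative names that do not match the real header:

#include <cstdint>

// Illustrative model only: an offset with a fixed part and a part measured
// in "scalable bytes", which must be multiplied by vscale (VL in bits / 128,
// unknown until runtime) before it can be folded into an address.
struct OffsetSketch {
  int64_t Bytes = 0;         // like StackOffset{N, MVT::i8}
  int64_t ScalableBytes = 0; // like StackOffset{N, MVT::nxv1i8}

  OffsetSketch operator+(OffsetSketch O) const {
    return {Bytes + O.Bytes, ScalableBytes + O.ScalableBytes};
  }
  // Concrete byte offset once the runtime vector length is known.
  int64_t resolve(int64_t VScale) const {
    return Bytes + ScalableBytes * VScale;
  }
};

Keeping the two components separate is what lets the frame code emit either a plain immediate add or an addvl/VL-scaled sequence, depending on which part is non-zero.)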
assert(CalleePopAmount < 0xffffff && "call frame too large"); - emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP, -CalleePopAmount, - TII); + emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP, + {-(int64_t)CalleePopAmount, MVT::i8}, TII); } return MBB.erase(I); } @@ -416,6 +447,9 @@ bool AArch64FrameLowering::shouldCombineCSRLocalStackBump( const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>(); const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); + if (MF.getFunction().hasOptSize()) + return false; + if (AFI->getLocalStackSize() == 0) return false; @@ -436,6 +470,11 @@ bool AArch64FrameLowering::shouldCombineCSRLocalStackBump( if (canUseRedZone(MF)) return false; + // When there is an SVE area on the stack, always allocate the + // callee-saves and spills/locals separately. + if (getSVEStackSize(MF)) + return false; + return true; } @@ -474,8 +513,8 @@ static MachineBasicBlock::iterator InsertSEH(MachineBasicBlock::iterator MBBI, Imm = -Imm; LLVM_FALLTHROUGH; case AArch64::STPXpre: { - unsigned Reg0 = MBBI->getOperand(1).getReg(); - unsigned Reg1 = MBBI->getOperand(2).getReg(); + Register Reg0 = MBBI->getOperand(1).getReg(); + Register Reg1 = MBBI->getOperand(2).getReg(); if (Reg0 == AArch64::FP && Reg1 == AArch64::LR) MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFPLR_X)) .addImm(Imm * 8) @@ -523,8 +562,8 @@ static MachineBasicBlock::iterator InsertSEH(MachineBasicBlock::iterator MBBI, } case AArch64::STPXi: case AArch64::LDPXi: { - unsigned Reg0 = MBBI->getOperand(0).getReg(); - unsigned Reg1 = MBBI->getOperand(1).getReg(); + Register Reg0 = MBBI->getOperand(0).getReg(); + Register Reg1 = MBBI->getOperand(1).getReg(); if (Reg0 == AArch64::FP && Reg1 == AArch64::LR) MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFPLR)) .addImm(Imm * 8) @@ -791,6 +830,10 @@ static bool needsWinCFI(const MachineFunction &MF) { F.needsUnwindTableEntry(); } +static bool isTargetDarwin(const MachineFunction &MF) { + return MF.getSubtarget<AArch64Subtarget>().isTargetDarwin(); +} + void AArch64FrameLowering::emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const { MachineBasicBlock::iterator MBBI = MBB.begin(); @@ -846,6 +889,8 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, // Ideally it should match SP value after prologue. AFI->setTaggedBasePointerOffset(MFI.getStackSize()); + const StackOffset &SVEStackSize = getSVEStackSize(MF); + // getStackSize() includes all the locals in its size calculation. We don't // include these locals when computing the stack size of a funclet, as they // are allocated in the parent's stack frame and accessed via the frame @@ -856,6 +901,8 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, : (int)MFI.getStackSize(); if (!AFI->hasStackFrame() && !windowsRequiresStackProbe(MF, NumBytes)) { assert(!HasFP && "unexpected function without stack frame but with FP"); + assert(!SVEStackSize && + "unexpected function without stack frame but with SVE objects"); // All of the stack allocation is for locals. 
AFI->setLocalStackSize(NumBytes); if (!NumBytes) @@ -866,8 +913,9 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, AFI->setHasRedZone(true); ++NumRedZoneFunctions; } else { - emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, -NumBytes, TII, - MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI); + emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, + {-NumBytes, MVT::i8}, TII, MachineInstr::FrameSetup, + false, NeedsWinCFI, &HasWinCFI); if (!NeedsWinCFI) { // Label used to tie together the PROLOG_LABEL and the MachineMoves. MCSymbol *FrameLabel = MMI.getContext().createTempSymbol(); @@ -901,8 +949,10 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, AFI->setLocalStackSize(NumBytes - PrologueSaveSize); bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes); if (CombineSPBump) { - emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, -NumBytes, TII, - MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI); + assert(!SVEStackSize && "Cannot combine SP bump with SVE"); + emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, + {-NumBytes, MVT::i8}, TII, MachineInstr::FrameSetup, false, + NeedsWinCFI, &HasWinCFI); NumBytes = 0; } else if (PrologueSaveSize != 0) { MBBI = convertCalleeSaveRestoreToSPPrePostIncDec( @@ -948,9 +998,9 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, } if (HasFP) { - // Only set up FP if we actually need to. Frame pointer is fp = - // sp - fixedobject - 16. - int FPOffset = AFI->getCalleeSavedStackSize() - 16; + // Only set up FP if we actually need to. + int FPOffset = isTargetDarwin(MF) ? (AFI->getCalleeSavedStackSize() - 16) : 0; + if (CombineSPBump) FPOffset += AFI->getLocalStackSize(); @@ -958,8 +1008,9 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, // mov fp,sp when FPOffset is zero. // Note: All stores of callee-saved registers are marked as "FrameSetup". // This code marks the instruction(s) that set the FP also. - emitFrameOffset(MBB, MBBI, DL, AArch64::FP, AArch64::SP, FPOffset, TII, - MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI); + emitFrameOffset(MBB, MBBI, DL, AArch64::FP, AArch64::SP, + {FPOffset, MVT::i8}, TII, MachineInstr::FrameSetup, false, + NeedsWinCFI, &HasWinCFI); } if (windowsRequiresStackProbe(MF, NumBytes)) { @@ -1056,6 +1107,9 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, NumBytes = 0; } + emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, -SVEStackSize, TII, + MachineInstr::FrameSetup); + // Allocate space for the rest of the frame. if (NumBytes) { const bool NeedsRealignment = RegInfo->needsStackRealignment(MF); @@ -1071,8 +1125,9 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have // the correct value here, as NumBytes also includes padding bytes, // which shouldn't be counted here. 
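(The FPOffset change above is the crux of the layout swap. A one-line standalone restatement, illustrative rather than LLVM API:

#include <cstdint>

// Darwin keeps the frame record (FP, LR) at the top of the callee-save area,
// so FP is set CSStackSize - 16 bytes above the post-spill SP; on other
// targets the frame record now sits at the bottom, so FP lands at offset 0,
// directly above any SVE area.
int64_t fpOffsetFromSP(bool IsDarwin, int64_t CSStackSize) {
  return IsDarwin ? CSStackSize - 16 : 0;
}

)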
- emitFrameOffset(MBB, MBBI, DL, scratchSPReg, AArch64::SP, -NumBytes, TII, - MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI); + emitFrameOffset(MBB, MBBI, DL, scratchSPReg, AArch64::SP, + {-NumBytes, MVT::i8}, TII, MachineInstr::FrameSetup, + false, NeedsWinCFI, &HasWinCFI); if (NeedsRealignment) { const unsigned Alignment = MFI.getMaxAlignment(); @@ -1130,8 +1185,10 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, if (needsFrameMoves) { const DataLayout &TD = MF.getDataLayout(); - const int StackGrowth = -TD.getPointerSize(0); - unsigned FramePtr = RegInfo->getFrameRegister(MF); + const int StackGrowth = isTargetDarwin(MF) + ? (2 * -TD.getPointerSize(0)) + : -AFI->getCalleeSavedStackSize(); + Register FramePtr = RegInfo->getFrameRegister(MF); // An example of the prologue: // // .globl __foo @@ -1202,7 +1259,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, // Define the current CFA rule to use the provided FP. unsigned Reg = RegInfo->getDwarfRegNum(FramePtr, true); unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createDefCfa( - nullptr, Reg, 2 * StackGrowth - FixedObject)); + nullptr, Reg, StackGrowth - FixedObject)); BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex) .setMIFlags(MachineInstr::FrameSetup); @@ -1401,11 +1458,14 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, .setMIFlag(MachineInstr::FrameDestroy); } + const StackOffset &SVEStackSize = getSVEStackSize(MF); + // If there is a single SP update, insert it before the ret and we're done. if (CombineSPBump) { + assert(!SVEStackSize && "Cannot combine SP bump with SVE"); emitFrameOffset(MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP, - NumBytes + AfterCSRPopSize, TII, MachineInstr::FrameDestroy, - false, NeedsWinCFI, &HasWinCFI); + {NumBytes + (int64_t)AfterCSRPopSize, MVT::i8}, TII, + MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI); if (NeedsWinCFI && HasWinCFI) BuildMI(MBB, MBB.getFirstTerminator(), DL, TII->get(AArch64::SEH_EpilogEnd)) @@ -1416,6 +1476,12 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, NumBytes -= PrologueSaveSize; assert(NumBytes >= 0 && "Negative stack allocation size!?"); + // Deallocate the SVE area. + if (SVEStackSize) + if (!AFI->isStackRealigned()) + emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP, SVEStackSize, + TII, MachineInstr::FrameDestroy); + if (!hasFP(MF)) { bool RedZone = canUseRedZone(MF); // If this was a redzone leaf function, we don't need to restore the @@ -1437,8 +1503,8 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, adaptForLdStOpt(MBB, MBB.getFirstTerminator(), LastPopI); emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP, - StackRestoreBytes, TII, MachineInstr::FrameDestroy, false, - NeedsWinCFI, &HasWinCFI); + {StackRestoreBytes, MVT::i8}, TII, + MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI); if (Done) { if (NeedsWinCFI) { HasWinCFI = true; @@ -1456,13 +1522,16 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, // FIXME: Rather than doing the math here, we should instead just use // non-post-indexed loads for the restores if we aren't actually going to // be able to save any instructions. - if (!IsFunclet && (MFI.hasVarSizedObjects() || AFI->isStackRealigned())) + if (!IsFunclet && (MFI.hasVarSizedObjects() || AFI->isStackRealigned())) { + int64_t OffsetToFrameRecord = + isTargetDarwin(MF) ? 
(-(int64_t)AFI->getCalleeSavedStackSize() + 16) : 0; emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::FP, - -AFI->getCalleeSavedStackSize() + 16, TII, - MachineInstr::FrameDestroy, false, NeedsWinCFI); - else if (NumBytes) - emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP, NumBytes, TII, - MachineInstr::FrameDestroy, false, NeedsWinCFI); + {OffsetToFrameRecord, MVT::i8}, + TII, MachineInstr::FrameDestroy, false, NeedsWinCFI); + } else if (NumBytes) + emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP, + {NumBytes, MVT::i8}, TII, MachineInstr::FrameDestroy, false, + NeedsWinCFI); // This must be placed after the callee-save restore code because that code // assumes the SP is at the same location as it was after the callee-save save @@ -1483,8 +1552,8 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, adaptForLdStOpt(MBB, FirstSPPopI, LastPopI); emitFrameOffset(MBB, FirstSPPopI, DL, AArch64::SP, AArch64::SP, - AfterCSRPopSize, TII, MachineInstr::FrameDestroy, false, - NeedsWinCFI, &HasWinCFI); + {(int64_t)AfterCSRPopSize, MVT::i8}, TII, + MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI); } if (NeedsWinCFI && HasWinCFI) BuildMI(MBB, MBB.getFirstTerminator(), DL, TII->get(AArch64::SEH_EpilogEnd)) @@ -1501,10 +1570,11 @@ int AArch64FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, unsigned &FrameReg) const { return resolveFrameIndexReference( - MF, FI, FrameReg, - /*PreferFP=*/ - MF.getFunction().hasFnAttribute(Attribute::SanitizeHWAddress), - /*ForSimm=*/false); + MF, FI, FrameReg, + /*PreferFP=*/ + MF.getFunction().hasFnAttribute(Attribute::SanitizeHWAddress), + /*ForSimm=*/false) + .getBytes(); } int AArch64FrameLowering::getNonLocalFrameIndexReference( @@ -1512,18 +1582,19 @@ int AArch64FrameLowering::getNonLocalFrameIndexReference( return getSEHFrameIndexOffset(MF, FI); } -static int getFPOffset(const MachineFunction &MF, int ObjectOffset) { +static StackOffset getFPOffset(const MachineFunction &MF, int ObjectOffset) { const auto *AFI = MF.getInfo<AArch64FunctionInfo>(); const auto &Subtarget = MF.getSubtarget<AArch64Subtarget>(); bool IsWin64 = Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()); unsigned FixedObject = IsWin64 ? alignTo(AFI->getVarArgsGPRSize(), 16) : 0; - return ObjectOffset + FixedObject + 16; + unsigned FPAdjust = isTargetDarwin(MF) ? 16 : AFI->getCalleeSavedStackSize(); + return {ObjectOffset + FixedObject + FPAdjust, MVT::i8}; } -static int getStackOffset(const MachineFunction &MF, int ObjectOffset) { +static StackOffset getStackOffset(const MachineFunction &MF, int ObjectOffset) { const auto &MFI = MF.getFrameInfo(); - return ObjectOffset + MFI.getStackSize(); + return {ObjectOffset + (int)MFI.getStackSize(), MVT::i8}; } int AArch64FrameLowering::getSEHFrameIndexOffset(const MachineFunction &MF, @@ -1532,23 +1603,23 @@ int AArch64FrameLowering::getSEHFrameIndexOffset(const MachineFunction &MF, MF.getSubtarget().getRegisterInfo()); int ObjectOffset = MF.getFrameInfo().getObjectOffset(FI); return RegInfo->getLocalAddressRegister(MF) == AArch64::FP - ? getFPOffset(MF, ObjectOffset) - : getStackOffset(MF, ObjectOffset); + ? 
getFPOffset(MF, ObjectOffset).getBytes() + : getStackOffset(MF, ObjectOffset).getBytes(); } -int AArch64FrameLowering::resolveFrameIndexReference(const MachineFunction &MF, - int FI, unsigned &FrameReg, - bool PreferFP, - bool ForSimm) const { +StackOffset AArch64FrameLowering::resolveFrameIndexReference( + const MachineFunction &MF, int FI, unsigned &FrameReg, bool PreferFP, + bool ForSimm) const { const auto &MFI = MF.getFrameInfo(); int ObjectOffset = MFI.getObjectOffset(FI); bool isFixed = MFI.isFixedObjectIndex(FI); - return resolveFrameOffsetReference(MF, ObjectOffset, isFixed, FrameReg, + bool isSVE = MFI.getStackID(FI) == TargetStackID::SVEVector; + return resolveFrameOffsetReference(MF, ObjectOffset, isFixed, isSVE, FrameReg, PreferFP, ForSimm); } -int AArch64FrameLowering::resolveFrameOffsetReference( - const MachineFunction &MF, int ObjectOffset, bool isFixed, +StackOffset AArch64FrameLowering::resolveFrameOffsetReference( + const MachineFunction &MF, int ObjectOffset, bool isFixed, bool isSVE, unsigned &FrameReg, bool PreferFP, bool ForSimm) const { const auto &MFI = MF.getFrameInfo(); const auto *RegInfo = static_cast<const AArch64RegisterInfo *>( @@ -1556,17 +1627,23 @@ int AArch64FrameLowering::resolveFrameOffsetReference( const auto *AFI = MF.getInfo<AArch64FunctionInfo>(); const auto &Subtarget = MF.getSubtarget<AArch64Subtarget>(); - int FPOffset = getFPOffset(MF, ObjectOffset); - int Offset = getStackOffset(MF, ObjectOffset); + int FPOffset = getFPOffset(MF, ObjectOffset).getBytes(); + int Offset = getStackOffset(MF, ObjectOffset).getBytes(); bool isCSR = !isFixed && ObjectOffset >= -((int)AFI->getCalleeSavedStackSize()); + const StackOffset &SVEStackSize = getSVEStackSize(MF); + // Use frame pointer to reference fixed objects. Use it for locals if // there are VLAs or a dynamically realigned SP (and thus the SP isn't // reliable as a base). Make sure useFPForScavengingIndex() does the // right thing for the emergency spill slot. bool UseFP = false; - if (AFI->hasStackFrame()) { + if (AFI->hasStackFrame() && !isSVE) { + // We shouldn't prefer using the FP when there is an SVE area + // in between the FP and the non-SVE locals/spills. + PreferFP &= !SVEStackSize; + // Note: Keeping the following as multiple 'if' statements rather than // merging to a single expression for readability. // @@ -1594,8 +1671,10 @@ int AArch64FrameLowering::resolveFrameOffsetReference( bool CanUseBP = RegInfo->hasBasePointer(MF); if (FPOffsetFits && CanUseBP) // Both are ok. Pick the best. UseFP = PreferFP; - else if (!CanUseBP) // Can't use BP. Forced to use FP. + else if (!CanUseBP) { // Can't use BP. Forced to use FP. + assert(!SVEStackSize && "Expected BP to be available"); UseFP = true; + } // else we can use BP and FP, but the offset from FP won't fit. // That will make us scavenge registers which we can probably avoid by // using BP. If it won't fit for BP either, we'll scavenge anyway. @@ -1625,9 +1704,36 @@ int AArch64FrameLowering::resolveFrameOffsetReference( "In the presence of dynamic stack pointer realignment, " "non-argument/CSR objects cannot be accessed through the frame pointer"); + if (isSVE) { + int64_t OffsetToSVEArea = + MFI.getStackSize() - AFI->getCalleeSavedStackSize(); + StackOffset FPOffset = {ObjectOffset, MVT::nxv1i8}; + StackOffset SPOffset = SVEStackSize + + StackOffset(ObjectOffset, MVT::nxv1i8) + + StackOffset(OffsetToSVEArea, MVT::i8); + // Always use the FP for SVE spills if available and beneficial. 
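(The FPOffset/SPOffset pair computed just above can be restated standalone; illustrative only, reusing the fixed-plus-scalable decomposition sketched earlier:

#include <cstdint>

// An SVE object is reachable from FP with a purely VL-scaled offset, since
// FP sits immediately above the SVE area in the new layout; from SP the
// offset is mixed: the non-SVE locals/spills below the SVE area contribute a
// fixed byte part, while the SVE area plus the object offset contribute the
// scalable part.
struct Off { int64_t Bytes; int64_t ScalableBytes; };

Off sveObjViaFP(int64_t ObjScalableOff) { return {0, ObjScalableOff}; }
Off sveObjViaSP(int64_t NonSVEAreaBytes, int64_t SVEAreaScalable,
                int64_t ObjScalableOff) {
  return {NonSVEAreaBytes, SVEAreaScalable + ObjScalableOff};
}

)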
+ if (hasFP(MF) && + (SPOffset.getBytes() || + FPOffset.getScalableBytes() < SPOffset.getScalableBytes() || + RegInfo->needsStackRealignment(MF))) { + FrameReg = RegInfo->getFrameRegister(MF); + return FPOffset; + } + + FrameReg = RegInfo->hasBasePointer(MF) ? RegInfo->getBaseRegister() + : (unsigned)AArch64::SP; + return SPOffset; + } + + StackOffset ScalableOffset = {}; + if (UseFP && !(isFixed || isCSR)) + ScalableOffset = -SVEStackSize; + if (!UseFP && (isFixed || isCSR)) + ScalableOffset = SVEStackSize; + if (UseFP) { FrameReg = RegInfo->getFrameRegister(MF); - return FPOffset; + return StackOffset(FPOffset, MVT::i8) + ScalableOffset; } // Use the base pointer if we have one. @@ -1644,7 +1750,7 @@ int AArch64FrameLowering::resolveFrameOffsetReference( Offset -= AFI->getLocalStackSize(); } - return Offset; + return StackOffset(Offset, MVT::i8) + ScalableOffset; } static unsigned getPrologueDeath(MachineFunction &MF, unsigned Reg) { @@ -1682,6 +1788,23 @@ static bool invalidateWindowsRegisterPairing(unsigned Reg1, unsigned Reg2, return true; } +/// Returns true if Reg1 and Reg2 cannot be paired using a ldp/stp instruction. +/// WindowsCFI requires that only consecutive registers can be paired. +/// LR and FP need to be allocated together when the frame needs to save +/// the frame-record. This means any other register pairing with LR is invalid. +static bool invalidateRegisterPairing(unsigned Reg1, unsigned Reg2, + bool NeedsWinCFI, bool NeedsFrameRecord) { + if (NeedsWinCFI) + return invalidateWindowsRegisterPairing(Reg1, Reg2, true); + + // If we need to store the frame record, don't pair any register + // with LR other than FP. + if (NeedsFrameRecord) + return Reg2 == AArch64::LR; + + return false; +} + namespace { struct RegPairInfo { @@ -1701,7 +1824,7 @@ struct RegPairInfo { static void computeCalleeSaveRegisterPairs( MachineFunction &MF, const std::vector<CalleeSavedInfo> &CSI, const TargetRegisterInfo *TRI, SmallVectorImpl<RegPairInfo> &RegPairs, - bool &NeedShadowCallStackProlog) { + bool &NeedShadowCallStackProlog, bool NeedsFrameRecord) { if (CSI.empty()) return; @@ -1743,7 +1866,8 @@ static void computeCalleeSaveRegisterPairs( switch (RPI.Type) { case RegPairInfo::GPR: if (AArch64::GPR64RegClass.contains(NextReg) && - !invalidateWindowsRegisterPairing(RPI.Reg1, NextReg, NeedsWinCFI)) + !invalidateRegisterPairing(RPI.Reg1, NextReg, NeedsWinCFI, + NeedsFrameRecord)) RPI.Reg2 = NextReg; break; case RegPairInfo::FPR64: @@ -1777,6 +1901,10 @@ static void computeCalleeSaveRegisterPairs( (CSI[i].getFrameIdx() + 1 == CSI[i + 1].getFrameIdx())) && "Out of order callee saved regs!"); + assert((!RPI.isPaired() || !NeedsFrameRecord || RPI.Reg2 != AArch64::FP || + RPI.Reg1 == AArch64::LR) && + "FrameRecord must be allocated together with LR"); + // MachO's compact unwind format relies on all registers being stored in // adjacent register pairs. 
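(Aside on invalidateRegisterPairing above, an illustrative restatement rather than the real signature: with the new CSR order (..., X27, X28, LR, FP), LR must never end up as the second element of a pair, so that the frame record always forms the (LR, FP) pair the assert below checks for:

enum SketchReg { FP, LR, X28 };

// Reject any candidate pair whose second register is LR when a frame record
// is required; (LR, FP) itself has FP second and so remains legal.
bool rejectPairSketch(SketchReg Reg1, SketchReg Reg2, bool NeedsFrameRecord) {
  (void)Reg1; // only the second slot matters for this rule
  return NeedsFrameRecord && Reg2 == LR;
}

)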
assert((!produceCompactUnwindFrame(MF) || @@ -1825,7 +1953,7 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters( bool NeedShadowCallStackProlog = false; computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs, - NeedShadowCallStackProlog); + NeedShadowCallStackProlog, hasFP(MF)); const MachineRegisterInfo &MRI = MF.getRegInfo(); if (NeedShadowCallStackProlog) { @@ -1955,7 +2083,7 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters( bool NeedShadowCallStackProlog = false; computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs, - NeedShadowCallStackProlog); + NeedShadowCallStackProlog, hasFP(MF)); auto EmitMI = [&](const RegPairInfo &RPI) { unsigned Reg1 = RPI.Reg1; @@ -2113,19 +2241,26 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, SavedRegs.set(AArch64::LR); } - LLVM_DEBUG(dbgs() << "*** determineCalleeSaves\nUsed CSRs:"; + LLVM_DEBUG(dbgs() << "*** determineCalleeSaves\nSaved CSRs:"; for (unsigned Reg : SavedRegs.set_bits()) dbgs() << ' ' << printReg(Reg, RegInfo); dbgs() << "\n";); // If any callee-saved registers are used, the frame cannot be eliminated. - bool CanEliminateFrame = SavedRegs.count() == 0; + unsigned MaxAlign = getStackAlignment(); + int64_t SVEStackSize = + alignTo(determineSVEStackSize(MFI, MaxAlign), MaxAlign); + assert(MaxAlign <= 16 && "Cannot align scalable vectors more than 16 bytes"); + bool CanEliminateFrame = (SavedRegs.count() == 0) && !SVEStackSize; // The CSR spill slots have not been allocated yet, so estimateStackSize // won't include them. unsigned EstimatedStackSizeLimit = estimateRSStackSizeLimit(MF); - bool BigStack = (EstimatedStackSize + CSStackSize) > EstimatedStackSizeLimit; + + // Conservatively always assume BigStack when there are SVE spills. + bool BigStack = SVEStackSize || + (EstimatedStackSize + CSStackSize) > EstimatedStackSizeLimit; if (BigStack || !CanEliminateFrame || RegInfo->cannotEliminateFrame(MF)) AFI->setHasStackFrame(true); @@ -2145,7 +2280,7 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, // store the pair. if (produceCompactUnwindFrame(MF)) SavedRegs.set(UnspilledCSGPRPaired); - ExtraCSSpill = UnspilledCSGPRPaired; + ExtraCSSpill = UnspilledCSGPR; } // If we didn't find an extra callee-saved register to spill, create @@ -2181,14 +2316,42 @@ bool AArch64FrameLowering::enableStackSlotScavenging( return AFI->hasCalleeSaveStackFreeSpace(); } +int64_t AArch64FrameLowering::determineSVEStackSize(MachineFrameInfo &MFI, + unsigned &MaxAlign) const { + // Process all fixed stack objects. + int64_t Offset = 0; + for (int I = MFI.getObjectIndexBegin(); I != 0; ++I) + if (MFI.getStackID(I) == TargetStackID::SVEVector) { + int64_t FixedOffset = -MFI.getObjectOffset(I); + if (FixedOffset > Offset) + Offset = FixedOffset; + } + + // Note: We don't take allocatable stack objects into + // account yet, because allocation for those is not yet + // implemented. 
+ return Offset; +} + void AArch64FrameLowering::processFunctionBeforeFrameFinalized( MachineFunction &MF, RegScavenger *RS) const { + MachineFrameInfo &MFI = MF.getFrameInfo(); + + assert(getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown && + "Upwards growing stack unsupported"); + + unsigned MaxAlign = getStackAlignment(); + int64_t SVEStackSize = determineSVEStackSize(MFI, MaxAlign); + + AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); + AFI->setStackSizeSVE(alignTo(SVEStackSize, MaxAlign)); + assert(MaxAlign <= 16 && "Cannot align scalable vectors more than 16 bytes"); + // If this function isn't doing Win64-style C++ EH, we don't need to do // anything. if (!MF.hasEHFunclets()) return; const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); - MachineFrameInfo &MFI = MF.getFrameInfo(); WinEHFuncInfo &EHInfo = *MF.getWinEHFuncInfo(); MachineBasicBlock &MBB = MF.front(); diff --git a/lib/Target/AArch64/AArch64FrameLowering.h b/lib/Target/AArch64/AArch64FrameLowering.h index 6dbd34b2189f..ac150e86c9eb 100644 --- a/lib/Target/AArch64/AArch64FrameLowering.h +++ b/lib/Target/AArch64/AArch64FrameLowering.h @@ -13,6 +13,7 @@ #ifndef LLVM_LIB_TARGET_AARCH64_AARCH64FRAMELOWERING_H #define LLVM_LIB_TARGET_AARCH64_AARCH64FRAMELOWERING_H +#include "AArch64StackOffset.h" #include "llvm/CodeGen/TargetFrameLowering.h" namespace llvm { @@ -20,7 +21,7 @@ namespace llvm { class AArch64FrameLowering : public TargetFrameLowering { public: explicit AArch64FrameLowering() - : TargetFrameLowering(StackGrowsDown, 16, 0, 16, + : TargetFrameLowering(StackGrowsDown, Align(16), 0, Align(16), true /*StackRealignable*/) {} void emitCalleeSavedFrameMoves(MachineBasicBlock &MBB, @@ -39,12 +40,13 @@ public: int getFrameIndexReference(const MachineFunction &MF, int FI, unsigned &FrameReg) const override; - int resolveFrameIndexReference(const MachineFunction &MF, int FI, - unsigned &FrameReg, bool PreferFP, - bool ForSimm) const; - int resolveFrameOffsetReference(const MachineFunction &MF, int ObjectOffset, - bool isFixed, unsigned &FrameReg, - bool PreferFP, bool ForSimm) const; + StackOffset resolveFrameIndexReference(const MachineFunction &MF, int FI, + unsigned &FrameReg, bool PreferFP, + bool ForSimm) const; + StackOffset resolveFrameOffsetReference(const MachineFunction &MF, + int ObjectOffset, bool isFixed, + bool isSVE, unsigned &FrameReg, + bool PreferFP, bool ForSimm) const; bool spillCalleeSavedRegisters(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const std::vector<CalleeSavedInfo> &CSI, @@ -85,9 +87,21 @@ public: int FI) const override; int getSEHFrameIndexOffset(const MachineFunction &MF, int FI) const; + bool isSupportedStackID(TargetStackID::Value ID) const override { + switch (ID) { + default: + return false; + case TargetStackID::Default: + case TargetStackID::SVEVector: + case TargetStackID::NoAlloc: + return true; + } + } + private: bool shouldCombineCSRLocalStackBump(MachineFunction &MF, unsigned StackBumpBytes) const; + int64_t determineSVEStackSize(MachineFrameInfo &MF, unsigned &MaxAlign) const; }; } // End llvm namespace diff --git a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index cd7e927ac80c..1f08505f37e7 100644 --- a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -2053,7 +2053,7 @@ static void getUsefulBitsForUse(SDNode *UserNode, APInt &UsefulBits, } static void getUsefulBits(SDValue Op, APInt &UsefulBits, unsigned Depth) { - if (Depth >= 6) + if (Depth 
>= SelectionDAG::MaxRecursionDepth) return; // Initialize UsefulBits if (!Depth) {
@@ -2913,49 +2913,6 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { return; break;
- case ISD::EXTRACT_VECTOR_ELT: {
- // Extracting lane zero is a special case where we can just use a plain
- // EXTRACT_SUBREG instruction, which will become FMOV. This is easier for
- // the rest of the compiler, especially the register allocator and copy
- // propagation, to reason about, so is preferred when it's possible to
- // use it.
- ConstantSDNode *LaneNode = cast<ConstantSDNode>(Node->getOperand(1));
- // Bail and use the default Select() for non-zero lanes.
- if (LaneNode->getZExtValue() != 0)
- break;
- // If the element type is not the same as the result type, likewise
- // bail and use the default Select(), as there's more to do than just
- // a cross-class COPY. This catches extracts of i8 and i16 elements
- // since they will need an explicit zext.
- if (VT != Node->getOperand(0).getValueType().getVectorElementType())
- break;
- unsigned SubReg;
- switch (Node->getOperand(0)
- .getValueType()
- .getVectorElementType()
- .getSizeInBits()) {
- default:
- llvm_unreachable("Unexpected vector element type!");
- case 64:
- SubReg = AArch64::dsub;
- break;
- case 32:
- SubReg = AArch64::ssub;
- break;
- case 16:
- SubReg = AArch64::hsub;
- break;
- case 8:
- llvm_unreachable("unexpected zext-requiring extract element!");
- }
- SDValue Extract = CurDAG->getTargetExtractSubreg(SubReg, SDLoc(Node), VT,
- Node->getOperand(0));
- LLVM_DEBUG(dbgs() << "ISEL: Custom selection!\n=> ");
- LLVM_DEBUG(Extract->dumpr(CurDAG));
- LLVM_DEBUG(dbgs() << "\n");
- ReplaceNode(Node, Extract.getNode());
- return;
- }
 case ISD::Constant: { // Materialize zero constants as copies from WZR/XZR. This allows // the coalescer to propagate these into other instructions.
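In the AArch64ISelDAGToDAG.cpp changes above, getUsefulBits() swaps its hard-coded recursion bound (Depth >= 6) for the shared SelectionDAG::MaxRecursionDepth constant, so every bounded DAG walk gives up at the same depth. A rough standalone sketch of that depth-guard idiom follows; kMaxRecursionDepth, Node, and countReachable are invented placeholders, not LLVM APIs.

    #include <cstdio>
    #include <vector>

    namespace {

    constexpr unsigned kMaxRecursionDepth = 6; // stand-in for the shared limit

    struct Node {
      std::vector<const Node *> Operands;
    };

    // Depth-guarded DAG walk: bail out past the shared limit instead of
    // hard-coding a magic "6" at every recursive call site.
    unsigned countReachable(const Node *N, unsigned Depth = 0) {
      if (Depth >= kMaxRecursionDepth)
        return 0; // callers treat this as "no further information"
      unsigned Count = 1;
      for (const Node *Op : N->Operands)
        Count += countReachable(Op, Depth + 1);
      return Count;
    }

    } // namespace

    int main() {
      Node Leaf;
      Node Root{{&Leaf, &Leaf}};
      std::printf("%u\n", countReachable(&Root)); // prints 3
    }

Centralizing the bound is the point of the change: when the limit needs tuning, every bounded traversal follows it at once.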
diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp index 7becc99fb5c7..2746117e8ee5 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -23,6 +23,7 @@ #include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringRef.h" @@ -161,6 +162,29 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, addQRTypeForNEON(MVT::v8f16); } + if (Subtarget->hasSVE()) { + // Add legal sve predicate types + addRegisterClass(MVT::nxv2i1, &AArch64::PPRRegClass); + addRegisterClass(MVT::nxv4i1, &AArch64::PPRRegClass); + addRegisterClass(MVT::nxv8i1, &AArch64::PPRRegClass); + addRegisterClass(MVT::nxv16i1, &AArch64::PPRRegClass); + + // Add legal sve data types + addRegisterClass(MVT::nxv16i8, &AArch64::ZPRRegClass); + addRegisterClass(MVT::nxv8i16, &AArch64::ZPRRegClass); + addRegisterClass(MVT::nxv4i32, &AArch64::ZPRRegClass); + addRegisterClass(MVT::nxv2i64, &AArch64::ZPRRegClass); + + addRegisterClass(MVT::nxv2f16, &AArch64::ZPRRegClass); + addRegisterClass(MVT::nxv4f16, &AArch64::ZPRRegClass); + addRegisterClass(MVT::nxv8f16, &AArch64::ZPRRegClass); + addRegisterClass(MVT::nxv1f32, &AArch64::ZPRRegClass); + addRegisterClass(MVT::nxv2f32, &AArch64::ZPRRegClass); + addRegisterClass(MVT::nxv4f32, &AArch64::ZPRRegClass); + addRegisterClass(MVT::nxv1f64, &AArch64::ZPRRegClass); + addRegisterClass(MVT::nxv2f64, &AArch64::ZPRRegClass); + } + // Compute derived properties from the register classes computeRegisterProperties(Subtarget->getRegisterInfo()); @@ -283,7 +307,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, // AArch64 lacks both left-rotate and popcount instructions. setOperationAction(ISD::ROTL, MVT::i32, Expand); setOperationAction(ISD::ROTL, MVT::i64, Expand); - for (MVT VT : MVT::vector_valuetypes()) { + for (MVT VT : MVT::fixedlen_vector_valuetypes()) { setOperationAction(ISD::ROTL, VT, Expand); setOperationAction(ISD::ROTR, VT, Expand); } @@ -297,7 +321,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::SDIVREM, MVT::i32, Expand); setOperationAction(ISD::SDIVREM, MVT::i64, Expand); - for (MVT VT : MVT::vector_valuetypes()) { + for (MVT VT : MVT::fixedlen_vector_valuetypes()) { setOperationAction(ISD::SDIVREM, VT, Expand); setOperationAction(ISD::UDIVREM, VT, Expand); } @@ -606,6 +630,10 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, MaxStoresPerMemmoveOptSize = MaxStoresPerMemmove = 4; + MaxLoadsPerMemcmpOptSize = 4; + MaxLoadsPerMemcmp = Subtarget->requiresStrictAlign() + ? MaxLoadsPerMemcmpOptSize : 8; + setStackPointerRegisterToSaveRestore(AArch64::SP); setSchedulingPreference(Sched::Hybrid); @@ -613,10 +641,10 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, EnableExtLdPromotion = true; // Set required alignment. - setMinFunctionAlignment(2); + setMinFunctionAlignment(Align(4)); // Set preferred alignments. - setPrefFunctionAlignment(STI.getPrefFunctionAlignment()); - setPrefLoopAlignment(STI.getPrefLoopAlignment()); + setPrefLoopAlignment(Align(1ULL << STI.getPrefLoopLogAlignment())); + setPrefFunctionAlignment(Align(1ULL << STI.getPrefFunctionLogAlignment())); // Only change the limit for entries in a jump table if specified by // the sub target, but not at the command line. 
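The constructor hunk above also migrates function and loop alignments to the new Align type: subtargets now report log2-encoded values, rebuilt at the use site as Align(1ULL << LogAlignment). A minimal sketch of that conversion; the Align struct and alignFromLog2 helper below are simplified stand-ins for illustration, not the LLVM types.

    #include <cassert>
    #include <cstdint>
    #include <cstdio>

    // Simplified stand-in for llvm::Align: a validated power-of-two byte count.
    struct Align {
      uint64_t Value;
      explicit Align(uint64_t V) : Value(V) {
        assert(V != 0 && (V & (V - 1)) == 0 && "alignment must be a power of two");
      }
    };

    // Subtargets store log2(alignment); use sites rebuild the byte count with
    // a shift, the Align(1ULL << ...) shape seen in the hunk above.
    Align alignFromLog2(unsigned LogAlign) { return Align(1ULL << LogAlign); }

    int main() {
      // A log2 value of 4 corresponds to 16-byte alignment.
      std::printf("%llu\n", (unsigned long long)alignFromLog2(4).Value); // 16
    }

Note how setMinFunctionAlignment(2) became setMinFunctionAlignment(Align(4)): the old interface took a log2 value, while Align carries the byte count itself, which is why the subtarget accessors gained explicit "Log" names.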
@@ -725,7 +753,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand); // Likewise, narrowing and extending vector loads/stores aren't handled // directly. - for (MVT VT : MVT::vector_valuetypes()) { + for (MVT VT : MVT::fixedlen_vector_valuetypes()) { setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand); if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) { @@ -741,7 +769,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::BSWAP, VT, Expand); setOperationAction(ISD::CTTZ, VT, Expand); - for (MVT InnerVT : MVT::vector_valuetypes()) { + for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) { setTruncStoreAction(VT, InnerVT, Expand); setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand); setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand); @@ -773,6 +801,13 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setTruncStoreAction(MVT::v4i16, MVT::v4i8, Custom); } + if (Subtarget->hasSVE()) { + for (MVT VT : MVT::integer_scalable_vector_valuetypes()) { + if (isTypeLegal(VT) && VT.getVectorElementType() != MVT::i1) + setOperationAction(ISD::SPLAT_VECTOR, VT, Custom); + } + } + PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive(); } @@ -1025,6 +1060,14 @@ void AArch64TargetLowering::computeKnownBitsForTargetNode( Known.One &= Known2.One; break; } + case AArch64ISD::LOADgot: + case AArch64ISD::ADDlow: { + if (!Subtarget->isTargetILP32()) + break; + // In ILP32 mode all valid pointers are in the low 4GB of the address-space. + Known.Zero = APInt::getHighBitsSet(64, 32); + break; + } case ISD::INTRINSIC_W_CHAIN: { ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1)); Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue()); @@ -1100,6 +1143,32 @@ bool AArch64TargetLowering::allowsMisalignedMemoryAccesses( return true; } +// Same as above but handling LLTs instead. +bool AArch64TargetLowering::allowsMisalignedMemoryAccesses( + LLT Ty, unsigned AddrSpace, unsigned Align, MachineMemOperand::Flags Flags, + bool *Fast) const { + if (Subtarget->requiresStrictAlign()) + return false; + + if (Fast) { + // Some CPUs are fine with unaligned stores except for 128-bit ones. + *Fast = !Subtarget->isMisaligned128StoreSlow() || + Ty.getSizeInBytes() != 16 || + // See comments in performSTORECombine() for more details about + // these conditions. + + // Code that uses clang vector extensions can mark that it + // wants unaligned accesses to be treated as fast by + // underspecifying alignment to be 1 or 2. + Align <= 2 || + + // Disregard v2i64. Memcpy lowering produces those and splitting + // them regresses performance on micro-benchmarks and olden/bh. 
+ Ty == LLT::vector(2, 64); + } + return true; +} + FastISel * AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const { @@ -1238,6 +1307,10 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { case AArch64ISD::STZG: return "AArch64ISD::STZG"; case AArch64ISD::ST2G: return "AArch64ISD::ST2G"; case AArch64ISD::STZ2G: return "AArch64ISD::STZ2G"; + case AArch64ISD::SUNPKHI: return "AArch64ISD::SUNPKHI"; + case AArch64ISD::SUNPKLO: return "AArch64ISD::SUNPKLO"; + case AArch64ISD::UUNPKHI: return "AArch64ISD::UUNPKHI"; + case AArch64ISD::UUNPKLO: return "AArch64ISD::UUNPKLO"; } return nullptr; } @@ -1263,9 +1336,9 @@ AArch64TargetLowering::EmitF128CSEL(MachineInstr &MI, DebugLoc DL = MI.getDebugLoc(); MachineFunction::iterator It = ++MBB->getIterator(); - unsigned DestReg = MI.getOperand(0).getReg(); - unsigned IfTrueReg = MI.getOperand(1).getReg(); - unsigned IfFalseReg = MI.getOperand(2).getReg(); + Register DestReg = MI.getOperand(0).getReg(); + Register IfTrueReg = MI.getOperand(1).getReg(); + Register IfFalseReg = MI.getOperand(2).getReg(); unsigned CondCode = MI.getOperand(3).getImm(); bool NZCVKilled = MI.getOperand(4).isKill(); @@ -2140,7 +2213,8 @@ getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) { SDValue AArch64TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG, RTLIB::Libcall Call) const { SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end()); - return makeLibCall(DAG, Call, MVT::f128, Ops, false, SDLoc(Op)).first; + MakeLibCallOptions CallOptions; + return makeLibCall(DAG, Call, MVT::f128, Ops, CallOptions, SDLoc(Op)).first; } // Returns true if the given Op is the overflow flag result of an overflow @@ -2349,7 +2423,8 @@ SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op, // precise. That doesn't take part in the LibCall so we can't directly use // LowerF128Call. 
SDValue SrcVal = Op.getOperand(0); - return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, /*isSigned*/ false, + MakeLibCallOptions CallOptions; + return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, CallOptions, SDLoc(Op)).first; } @@ -2419,7 +2494,8 @@ SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op, LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), Op.getValueType()); SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end()); - return makeLibCall(DAG, LC, Op.getValueType(), Ops, false, SDLoc(Op)).first; + MakeLibCallOptions CallOptions; + return makeLibCall(DAG, LC, Op.getValueType(), Ops, CallOptions, SDLoc(Op)).first; } static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) { @@ -2773,6 +2849,19 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getNode(ISD::UMIN, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::aarch64_sve_sunpkhi: + return DAG.getNode(AArch64ISD::SUNPKHI, dl, Op.getValueType(), + Op.getOperand(1)); + case Intrinsic::aarch64_sve_sunpklo: + return DAG.getNode(AArch64ISD::SUNPKLO, dl, Op.getValueType(), + Op.getOperand(1)); + case Intrinsic::aarch64_sve_uunpkhi: + return DAG.getNode(AArch64ISD::UUNPKHI, dl, Op.getValueType(), + Op.getOperand(1)); + case Intrinsic::aarch64_sve_uunpklo: + return DAG.getNode(AArch64ISD::UUNPKLO, dl, Op.getValueType(), + Op.getOperand(1)); + case Intrinsic::localaddress: { const auto &MF = DAG.getMachineFunction(); const auto *RegInfo = Subtarget->getRegisterInfo(); @@ -2937,6 +3026,8 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op, return LowerBUILD_VECTOR(Op, DAG); case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); + case ISD::SPLAT_VECTOR: + return LowerSPLAT_VECTOR(Op, DAG); case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG); case ISD::SRA: @@ -3014,8 +3105,11 @@ CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC, return CC_AArch64_Win64_VarArg; if (!Subtarget->isTargetDarwin()) return CC_AArch64_AAPCS; - return IsVarArg ? CC_AArch64_DarwinPCS_VarArg : CC_AArch64_DarwinPCS; - case CallingConv::Win64: + if (!IsVarArg) + return CC_AArch64_DarwinPCS; + return Subtarget->isTargetILP32() ? CC_AArch64_DarwinPCS_ILP32_VarArg + : CC_AArch64_DarwinPCS_VarArg; + case CallingConv::Win64: return IsVarArg ? CC_AArch64_Win64_VarArg : CC_AArch64_AAPCS; case CallingConv::AArch64_VectorCall: return CC_AArch64_AAPCS; @@ -3038,6 +3132,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments( // Assign locations to all of the incoming arguments. SmallVector<CCValAssign, 16> ArgLocs; + DenseMap<unsigned, SDValue> CopiedRegs; CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, *DAG.getContext()); @@ -3094,11 +3189,10 @@ SDValue AArch64TargetLowering::LowerFormalArguments( continue; } + SDValue ArgValue; if (VA.isRegLoc()) { // Arguments stored in registers. 
EVT RegVT = VA.getLocVT(); - - SDValue ArgValue; const TargetRegisterClass *RC; if (RegVT == MVT::i32) @@ -3113,6 +3207,11 @@ SDValue AArch64TargetLowering::LowerFormalArguments( RC = &AArch64::FPR64RegClass; else if (RegVT == MVT::f128 || RegVT.is128BitVector()) RC = &AArch64::FPR128RegClass; + else if (RegVT.isScalableVector() && + RegVT.getVectorElementType() == MVT::i1) + RC = &AArch64::PPRRegClass; + else if (RegVT.isScalableVector()) + RC = &AArch64::ZPRRegClass; else llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering"); @@ -3128,20 +3227,23 @@ SDValue AArch64TargetLowering::LowerFormalArguments( llvm_unreachable("Unknown loc info!"); case CCValAssign::Full: break; + case CCValAssign::Indirect: + assert(VA.getValVT().isScalableVector() && + "Only scalable vectors can be passed indirectly"); + llvm_unreachable("Spilling of SVE vectors not yet implemented"); case CCValAssign::BCvt: ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue); break; case CCValAssign::AExt: case CCValAssign::SExt: case CCValAssign::ZExt: - // SelectionDAGBuilder will insert appropriate AssertZExt & AssertSExt - // nodes after our lowering. - assert(RegVT == Ins[i].VT && "incorrect register location selected"); + break; + case CCValAssign::AExtUpper: + ArgValue = DAG.getNode(ISD::SRL, DL, RegVT, ArgValue, + DAG.getConstant(32, DL, RegVT)); + ArgValue = DAG.getZExtOrTrunc(ArgValue, DL, VA.getValVT()); break; } - - InVals.push_back(ArgValue); - } else { // VA.isRegLoc() assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem"); unsigned ArgOffset = VA.getLocMemOffset(); @@ -3156,7 +3258,6 @@ SDValue AArch64TargetLowering::LowerFormalArguments( // Create load nodes to retrieve arguments from the stack. SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); - SDValue ArgValue; // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT) ISD::LoadExtType ExtType = ISD::NON_EXTLOAD; @@ -3165,9 +3266,14 @@ SDValue AArch64TargetLowering::LowerFormalArguments( switch (VA.getLocInfo()) { default: break; + case CCValAssign::Trunc: case CCValAssign::BCvt: MemVT = VA.getLocVT(); break; + case CCValAssign::Indirect: + assert(VA.getValVT().isScalableVector() && + "Only scalable vectors can be passed indirectly"); + llvm_unreachable("Spilling of SVE vectors not yet implemented"); case CCValAssign::SExt: ExtType = ISD::SEXTLOAD; break; @@ -3184,8 +3290,11 @@ SDValue AArch64TargetLowering::LowerFormalArguments( MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), MemVT); - InVals.push_back(ArgValue); } + if (Subtarget->isTargetILP32() && Ins[i].Flags.isPointer()) + ArgValue = DAG.getNode(ISD::AssertZext, DL, ArgValue.getValueType(), + ArgValue, DAG.getValueType(MVT::i32)); + InVals.push_back(ArgValue); } // varargs @@ -3202,8 +3311,8 @@ SDValue AArch64TargetLowering::LowerFormalArguments( // This will point to the next argument passed via stack. unsigned StackOffset = CCInfo.getNextStackOffset(); - // We currently pass all varargs at 8-byte alignment. - StackOffset = ((StackOffset + 7) & ~7); + // We currently pass all varargs at 8-byte alignment, or 4 for ILP32 + StackOffset = alignTo(StackOffset, Subtarget->isTargetILP32() ? 
4 : 8); FuncInfo->setVarArgsStackIndex(MFI.CreateFixedObject(4, StackOffset, true)); if (MFI.hasMustTailInVarArgFunc()) { @@ -3233,8 +3342,8 @@ SDValue AArch64TargetLowering::LowerFormalArguments( assert(!FuncInfo->getSRetReturnReg()); MVT PtrTy = getPointerTy(DAG.getDataLayout()); - unsigned Reg = - MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy)); + Register Reg = + MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy)); FuncInfo->setSRetReturnReg(Reg); SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, Reg, InVals[I]); @@ -3366,6 +3475,7 @@ SDValue AArch64TargetLowering::LowerCallResult( : RetCC_AArch64_AAPCS; // Assign locations to each value returned by this call. SmallVector<CCValAssign, 16> RVLocs; + DenseMap<unsigned, SDValue> CopiedRegs; CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, *DAG.getContext()); CCInfo.AnalyzeCallResult(Ins, RetCC); @@ -3383,10 +3493,16 @@ SDValue AArch64TargetLowering::LowerCallResult( continue; } - SDValue Val = - DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag); - Chain = Val.getValue(1); - InFlag = Val.getValue(2); + // Avoid copying a physreg twice since RegAllocFast is incompetent and only + // allows one use of a physreg per block. + SDValue Val = CopiedRegs.lookup(VA.getLocReg()); + if (!Val) { + Val = + DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag); + Chain = Val.getValue(1); + InFlag = Val.getValue(2); + CopiedRegs[VA.getLocReg()] = Val; + } switch (VA.getLocInfo()) { default: @@ -3396,6 +3512,15 @@ SDValue AArch64TargetLowering::LowerCallResult( case CCValAssign::BCvt: Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val); break; + case CCValAssign::AExtUpper: + Val = DAG.getNode(ISD::SRL, DL, VA.getLocVT(), Val, + DAG.getConstant(32, DL, VA.getLocVT())); + LLVM_FALLTHROUGH; + case CCValAssign::AExt: + LLVM_FALLTHROUGH; + case CCValAssign::ZExt: + Val = DAG.getZExtOrTrunc(Val, DL, VA.getValVT()); + break; } InVals.push_back(Val); @@ -3593,6 +3718,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, bool IsVarArg = CLI.IsVarArg; MachineFunction &MF = DAG.getMachineFunction(); + MachineFunction::CallSiteInfo CSInfo; bool IsThisReturn = false; AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); @@ -3709,6 +3835,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, getPointerTy(DAG.getDataLayout())); SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass; + SmallSet<unsigned, 8> RegsUsed; SmallVector<SDValue, 8> MemOpChains; auto PtrVT = getPointerTy(DAG.getDataLayout()); @@ -3716,7 +3843,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, const auto &Forwards = FuncInfo->getForwardedMustTailRegParms(); for (const auto &F : Forwards) { SDValue Val = DAG.getCopyFromReg(Chain, DL, F.VReg, F.VT); - RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val)); + RegsToPass.emplace_back(F.PReg, Val); } } @@ -3747,12 +3874,25 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, } Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg); break; + case CCValAssign::AExtUpper: + assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits"); + Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg); + Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg, + DAG.getConstant(32, DL, VA.getLocVT())); + break; case CCValAssign::BCvt: - Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg); + Arg = DAG.getBitcast(VA.getLocVT(), Arg); + break; + case CCValAssign::Trunc: + Arg = DAG.getZExtOrTrunc(Arg, DL, 
VA.getLocVT()); break; case CCValAssign::FPExt: Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg); break; + case CCValAssign::Indirect: + assert(VA.getValVT().isScalableVector() && + "Only scalable vectors can be passed indirectly"); + llvm_unreachable("Spilling of SVE vectors not yet implemented"); } if (VA.isRegLoc()) { @@ -3764,7 +3904,33 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, "unexpected use of 'returned'"); IsThisReturn = true; } - RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); + if (RegsUsed.count(VA.getLocReg())) { + // If this register has already been used then we're trying to pack + // parts of an [N x i32] into an X-register. The extension type will + // take care of putting the two halves in the right place but we have to + // combine them. + SDValue &Bits = + std::find_if(RegsToPass.begin(), RegsToPass.end(), + [=](const std::pair<unsigned, SDValue> &Elt) { + return Elt.first == VA.getLocReg(); + }) + ->second; + Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg); + // Call site info is used for function's parameter entry value + // tracking. For now we track only simple cases when parameter + // is transferred through whole register. + CSInfo.erase(std::remove_if(CSInfo.begin(), CSInfo.end(), + [&VA](MachineFunction::ArgRegPair ArgReg) { + return ArgReg.Reg == VA.getLocReg(); + }), + CSInfo.end()); + } else { + RegsToPass.emplace_back(VA.getLocReg(), Arg); + RegsUsed.insert(VA.getLocReg()); + const TargetOptions &Options = DAG.getTarget().Options; + if (Options.EnableDebugEntryValues) + CSInfo.emplace_back(VA.getLocReg(), i); + } } else { assert(VA.isMemLoc()); @@ -3899,6 +4065,20 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, Ops.push_back(DAG.getRegister(RegToPass.first, RegToPass.second.getValueType())); + // Check callee args/returns for SVE registers and set calling convention + // accordingly. + if (CallConv == CallingConv::C) { + bool CalleeOutSVE = any_of(Outs, [](ISD::OutputArg &Out){ + return Out.VT.isScalableVector(); + }); + bool CalleeInSVE = any_of(Ins, [](ISD::InputArg &In){ + return In.VT.isScalableVector(); + }); + + if (CalleeInSVE || CalleeOutSVE) + CallConv = CallingConv::AArch64_SVE_VectorCall; + } + // Add a register mask operand representing the call-preserved registers. const uint32_t *Mask; const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); @@ -3930,12 +4110,15 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, // actual call instruction. if (IsTailCall) { MF.getFrameInfo().setHasTailCall(); - return DAG.getNode(AArch64ISD::TC_RETURN, DL, NodeTys, Ops); + SDValue Ret = DAG.getNode(AArch64ISD::TC_RETURN, DL, NodeTys, Ops); + DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo)); + return Ret; } // Returns a chain and a flag for retval copy to use. Chain = DAG.getNode(AArch64ISD::CALL, DL, NodeTys, Ops); InFlag = Chain.getValue(1); + DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo)); uint64_t CalleePopBytes = DoesCalleeRestoreStack(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : 0; @@ -3983,7 +4166,8 @@ AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, // Copy the result values into the output registers. 
SDValue Flag; - SmallVector<SDValue, 4> RetOps(1, Chain); + SmallVector<std::pair<unsigned, SDValue>, 4> RetVals; + SmallSet<unsigned, 4> RegsUsed; for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size(); ++i, ++realRVLocIdx) { CCValAssign &VA = RVLocs[i]; @@ -4005,11 +4189,38 @@ AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, case CCValAssign::BCvt: Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg); break; + case CCValAssign::AExt: + case CCValAssign::ZExt: + Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT()); + break; + case CCValAssign::AExtUpper: + assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits"); + Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT()); + Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg, + DAG.getConstant(32, DL, VA.getLocVT())); + break; } - Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag); + if (RegsUsed.count(VA.getLocReg())) { + SDValue &Bits = + std::find_if(RetVals.begin(), RetVals.end(), + [=](const std::pair<unsigned, SDValue> &Elt) { + return Elt.first == VA.getLocReg(); + }) + ->second; + Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg); + } else { + RetVals.emplace_back(VA.getLocReg(), Arg); + RegsUsed.insert(VA.getLocReg()); + } + } + + SmallVector<SDValue, 4> RetOps(1, Chain); + for (auto &RetVal : RetVals) { + Chain = DAG.getCopyToReg(Chain, DL, RetVal.first, RetVal.second, Flag); Flag = Chain.getValue(1); - RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); + RetOps.push_back( + DAG.getRegister(RetVal.first, RetVal.second.getValueType())); } // Windows AArch64 ABIs require that for returning structs by value we copy @@ -4139,8 +4350,7 @@ SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op); const GlobalValue *GV = GN->getGlobal(); - unsigned char OpFlags = - Subtarget->ClassifyGlobalReference(GV, getTargetMachine()); + unsigned OpFlags = Subtarget->ClassifyGlobalReference(GV, getTargetMachine()); if (OpFlags != AArch64II::MO_NO_FLAG) assert(cast<GlobalAddressSDNode>(Op)->getOffset() == 0 && @@ -4204,6 +4414,7 @@ AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op, SDLoc DL(Op); MVT PtrVT = getPointerTy(DAG.getDataLayout()); + MVT PtrMemVT = getPointerMemTy(DAG.getDataLayout()); const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); SDValue TLVPAddr = @@ -4214,13 +4425,15 @@ AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op, // to obtain the address of the variable. SDValue Chain = DAG.getEntryNode(); SDValue FuncTLVGet = DAG.getLoad( - MVT::i64, DL, Chain, DescAddr, + PtrMemVT, DL, Chain, DescAddr, MachinePointerInfo::getGOT(DAG.getMachineFunction()), - /* Alignment = */ 8, - MachineMemOperand::MONonTemporal | MachineMemOperand::MOInvariant | - MachineMemOperand::MODereferenceable); + /* Alignment = */ PtrMemVT.getSizeInBits() / 8, + MachineMemOperand::MOInvariant | MachineMemOperand::MODereferenceable); Chain = FuncTLVGet.getValue(1); + // Extend loaded pointer if necessary (i.e. if ILP32) to DAG pointer. + FuncTLVGet = DAG.getZExtOrTrunc(FuncTLVGet, DL, PtrVT); + MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); MFI.setAdjustsStack(true); @@ -4470,7 +4683,7 @@ SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { // value of a libcall against zero, which is just what the rest of LowerBR_CC // is expecting to deal with. 
if (LHS.getValueType() == MVT::f128) { - softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl); + softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS); // If softenSetCCOperands returned a scalar, we need to compare the result // against zero to select between true and false values. @@ -4736,7 +4949,7 @@ SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { // Handle f128 first, since one possible outcome is a normal integer // comparison which gets picked up by the next if statement. if (LHS.getValueType() == MVT::f128) { - softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl); + softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS); // If softenSetCCOperands returned a scalar, use it. if (!RHS.getNode()) { @@ -4798,7 +5011,7 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS, // Handle f128 first, because it will result in a comparison of some RTLIB // call result against zero. if (LHS.getValueType() == MVT::f128) { - softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl); + softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS); // If softenSetCCOperands returned a scalar, we need to compare the result // against zero to select between true and false values. @@ -5096,6 +5309,7 @@ SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op, SDLoc DL(Op); SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), getPointerTy(DAG.getDataLayout())); + FR = DAG.getZExtOrTrunc(FR, DL, getPointerMemTy(DAG.getDataLayout())); const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1), MachinePointerInfo(SV)); @@ -5202,15 +5416,15 @@ SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op, // AAPCS has three pointers and two ints (= 32 bytes), Darwin has single // pointer. SDLoc DL(Op); - unsigned VaListSize = - Subtarget->isTargetDarwin() || Subtarget->isTargetWindows() ? 8 : 32; + unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8; + unsigned VaListSize = (Subtarget->isTargetDarwin() || + Subtarget->isTargetWindows()) ? PtrSize : 32; const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue(); const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); - return DAG.getMemcpy(Op.getOperand(0), DL, Op.getOperand(1), - Op.getOperand(2), - DAG.getConstant(VaListSize, DL, MVT::i32), - 8, false, false, false, MachinePointerInfo(DestSV), + return DAG.getMemcpy(Op.getOperand(0), DL, Op.getOperand(1), Op.getOperand(2), + DAG.getConstant(VaListSize, DL, MVT::i32), PtrSize, + false, false, false, MachinePointerInfo(DestSV), MachinePointerInfo(SrcSV)); } @@ -5224,12 +5438,15 @@ SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { SDValue Chain = Op.getOperand(0); SDValue Addr = Op.getOperand(1); unsigned Align = Op.getConstantOperandVal(3); + unsigned MinSlotSize = Subtarget->isTargetILP32() ? 
4 : 8; auto PtrVT = getPointerTy(DAG.getDataLayout()); - - SDValue VAList = DAG.getLoad(PtrVT, DL, Chain, Addr, MachinePointerInfo(V)); + auto PtrMemVT = getPointerMemTy(DAG.getDataLayout()); + SDValue VAList = + DAG.getLoad(PtrMemVT, DL, Chain, Addr, MachinePointerInfo(V)); Chain = VAList.getValue(1); + VAList = DAG.getZExtOrTrunc(VAList, DL, PtrVT); - if (Align > 8) { + if (Align > MinSlotSize) { assert(((Align & (Align - 1)) == 0) && "Expected Align to be a power of 2"); VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(Align - 1, DL, PtrVT)); @@ -5238,14 +5455,14 @@ SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { } Type *ArgTy = VT.getTypeForEVT(*DAG.getContext()); - uint64_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy); + unsigned ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy); // Scalar integer and FP values smaller than 64 bits are implicitly extended // up to 64 bits. At the very least, we have to increase the striding of the // vaargs list to match this, and for FP values we need to introduce // FP_ROUND nodes as well. if (VT.isInteger() && !VT.isVector()) - ArgSize = 8; + ArgSize = std::max(ArgSize, MinSlotSize); bool NeedFPTrunc = false; if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) { ArgSize = 8; @@ -5255,6 +5472,8 @@ SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { // Increment the pointer, VAList, to the next vaarg SDValue VANext = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(ArgSize, DL, PtrVT)); + VANext = DAG.getZExtOrTrunc(VANext, DL, PtrMemVT); + // Store the incremented VAList to the legalized pointer SDValue APStore = DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V)); @@ -5284,10 +5503,15 @@ SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op, SDLoc DL(Op); unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); SDValue FrameAddr = - DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT); + DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, MVT::i64); while (Depth--) FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr, MachinePointerInfo()); + + if (Subtarget->isTargetILP32()) + FrameAddr = DAG.getNode(ISD::AssertZext, DL, MVT::i64, FrameAddr, + DAG.getValueType(VT)); + return FrameAddr; } @@ -5306,9 +5530,9 @@ SDValue AArch64TargetLowering::LowerSPONENTRY(SDValue Op, // FIXME? Maybe this could be a TableGen attribute on some registers and // this table could be generated automatically from RegInfo. 
-unsigned AArch64TargetLowering::getRegisterByName(const char* RegName, EVT VT, - SelectionDAG &DAG) const { - unsigned Reg = MatchRegisterName(RegName); +Register AArch64TargetLowering:: +getRegisterByName(const char* RegName, EVT VT, const MachineFunction &MF) const { + Register Reg = MatchRegisterName(RegName); if (AArch64::X1 <= Reg && Reg <= AArch64::X28) { const MCRegisterInfo *MRI = Subtarget->getRegisterInfo(); unsigned DwarfRegNum = MRI->getDwarfRegNum(Reg, false); @@ -5653,6 +5877,21 @@ const char *AArch64TargetLowering::LowerXConstraint(EVT ConstraintVT) const { return "r"; } +enum PredicateConstraint { + Upl, + Upa, + Invalid +}; + +static PredicateConstraint parsePredicateConstraint(StringRef Constraint) { + PredicateConstraint P = PredicateConstraint::Invalid; + if (Constraint == "Upa") + P = PredicateConstraint::Upa; + if (Constraint == "Upl") + P = PredicateConstraint::Upl; + return P; +} + /// getConstraintType - Given a constraint letter, return the type of /// constraint it is for this target. AArch64TargetLowering::ConstraintType @@ -5661,19 +5900,30 @@ AArch64TargetLowering::getConstraintType(StringRef Constraint) const { switch (Constraint[0]) { default: break; - case 'z': - return C_Other; case 'x': case 'w': + case 'y': return C_RegisterClass; // An address with a single base register. Due to the way we // currently handle addresses it is the same as 'r'. case 'Q': return C_Memory; + case 'I': + case 'J': + case 'K': + case 'L': + case 'M': + case 'N': + case 'Y': + case 'Z': + return C_Immediate; + case 'z': case 'S': // A symbolic address return C_Other; } - } + } else if (parsePredicateConstraint(Constraint) != + PredicateConstraint::Invalid) + return C_RegisterClass; return TargetLowering::getConstraintType(Constraint); } @@ -5697,12 +5947,17 @@ AArch64TargetLowering::getSingleConstraintMatchWeight( break; case 'x': case 'w': + case 'y': if (type->isFloatingPointTy() || type->isVectorTy()) weight = CW_Register; break; case 'z': weight = CW_Constant; break; + case 'U': + if (parsePredicateConstraint(constraint) != PredicateConstraint::Invalid) + weight = CW_Register; + break; } return weight; } @@ -5719,6 +5974,8 @@ AArch64TargetLowering::getRegForInlineAsmConstraint( case 'w': if (!Subtarget->hasFPARMv8()) break; + if (VT.isScalableVector()) + return std::make_pair(0U, &AArch64::ZPRRegClass); if (VT.getSizeInBits() == 16) return std::make_pair(0U, &AArch64::FPR16RegClass); if (VT.getSizeInBits() == 32) @@ -5733,9 +5990,25 @@ AArch64TargetLowering::getRegForInlineAsmConstraint( case 'x': if (!Subtarget->hasFPARMv8()) break; + if (VT.isScalableVector()) + return std::make_pair(0U, &AArch64::ZPR_4bRegClass); if (VT.getSizeInBits() == 128) return std::make_pair(0U, &AArch64::FPR128_loRegClass); break; + case 'y': + if (!Subtarget->hasFPARMv8()) + break; + if (VT.isScalableVector()) + return std::make_pair(0U, &AArch64::ZPR_3bRegClass); + break; + } + } else { + PredicateConstraint PC = parsePredicateConstraint(Constraint); + if (PC != PredicateConstraint::Invalid) { + assert(VT.isScalableVector()); + bool restricted = (PC == PredicateConstraint::Upl); + return restricted ? 
std::make_pair(0U, &AArch64::PPR_3bRegClass) + : std::make_pair(0U, &AArch64::PPRRegClass); } } if (StringRef("{cc}").equals_lower(Constraint)) @@ -6279,6 +6552,8 @@ static bool isREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) { static bool isZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { unsigned NumElts = VT.getVectorNumElements(); + if (NumElts % 2 != 0) + return false; WhichResult = (M[0] == 0 ? 0 : 1); unsigned Idx = WhichResult * NumElts / 2; for (unsigned i = 0; i != NumElts; i += 2) { @@ -6446,8 +6721,7 @@ static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) { if (!isConcatMask(Mask, VT, SplitV0)) return SDValue(); - EVT CastVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), - VT.getVectorNumElements() / 2); + EVT CastVT = VT.getHalfNumVectorElementsVT(*DAG.getContext()); if (SplitV0) { V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0, DAG.getConstant(0, DL, MVT::i64)); @@ -6790,6 +7064,41 @@ SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, return GenerateTBL(Op, ShuffleMask, DAG); } +SDValue AArch64TargetLowering::LowerSPLAT_VECTOR(SDValue Op, + SelectionDAG &DAG) const { + SDLoc dl(Op); + EVT VT = Op.getValueType(); + EVT ElemVT = VT.getScalarType(); + + SDValue SplatVal = Op.getOperand(0); + + // Extend input splat value where needed to fit into a GPR (32b or 64b only) + // FPRs don't have this restriction. + switch (ElemVT.getSimpleVT().SimpleTy) { + case MVT::i8: + case MVT::i16: + SplatVal = DAG.getAnyExtOrTrunc(SplatVal, dl, MVT::i32); + break; + case MVT::i64: + SplatVal = DAG.getAnyExtOrTrunc(SplatVal, dl, MVT::i64); + break; + case MVT::i32: + // Fine as is + break; + // TODO: we can support splats of i1s and float types, but haven't added + // patterns yet. + case MVT::i1: + case MVT::f16: + case MVT::f32: + case MVT::f64: + default: + llvm_unreachable("Unsupported SPLAT_VECTOR input operand type"); + break; + } + + return DAG.getNode(AArch64ISD::DUP, dl, VT, SplatVal); +} + static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits, APInt &UndefBits) { EVT VT = BVN->getValueType(0); @@ -8063,7 +8372,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1); Info.offset = 0; - Info.align = 0; + Info.align.reset(); // volatile loads with NEON intrinsics not supported Info.flags = MachineMemOperand::MOLoad; return true; @@ -8089,7 +8398,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1); Info.offset = 0; - Info.align = 0; + Info.align.reset(); // volatile stores with NEON intrinsics not supported Info.flags = MachineMemOperand::MOStore; return true; @@ -8101,7 +8410,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.memVT = MVT::getVT(PtrTy->getElementType()); Info.ptrVal = I.getArgOperand(0); Info.offset = 0; - Info.align = DL.getABITypeAlignment(PtrTy->getElementType()); + Info.align = MaybeAlign(DL.getABITypeAlignment(PtrTy->getElementType())); Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile; return true; } @@ -8112,7 +8421,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.memVT = MVT::getVT(PtrTy->getElementType()); Info.ptrVal = I.getArgOperand(1); Info.offset = 0; - Info.align = 
DL.getABITypeAlignment(PtrTy->getElementType()); + Info.align = MaybeAlign(DL.getABITypeAlignment(PtrTy->getElementType())); Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile; return true; } @@ -8122,7 +8431,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.memVT = MVT::i128; Info.ptrVal = I.getArgOperand(0); Info.offset = 0; - Info.align = 16; + Info.align = Align(16); Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile; return true; case Intrinsic::aarch64_stlxp: @@ -8131,7 +8440,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.memVT = MVT::i128; Info.ptrVal = I.getArgOperand(2); Info.offset = 0; - Info.align = 16; + Info.align = Align(16); Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile; return true; default: @@ -8278,7 +8587,7 @@ bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const { // Get the shift amount based on the scaling factor: // log2(sizeof(IdxTy)) - log2(8). uint64_t ShiftAmt = - countTrailingZeros(DL.getTypeStoreSizeInBits(IdxTy)) - 3; + countTrailingZeros(DL.getTypeStoreSizeInBits(IdxTy).getFixedSize()) - 3; // Is the constant foldable in the shift of the addressing mode? // I.e., shift amount is between 1 and 4 inclusive. if (ShiftAmt == 0 || ShiftAmt > 4) @@ -8739,6 +9048,39 @@ EVT AArch64TargetLowering::getOptimalMemOpType( return MVT::Other; } +LLT AArch64TargetLowering::getOptimalMemOpLLT( + uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset, + bool ZeroMemset, bool MemcpyStrSrc, + const AttributeList &FuncAttributes) const { + bool CanImplicitFloat = + !FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat); + bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat; + bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat; + // Only use AdvSIMD to implement memset of 32-byte and above. It would have + // taken one instruction to materialize the v2i64 zero and one store (with + // restrictive addressing mode). Just do i64 stores. + bool IsSmallMemset = IsMemset && Size < 32; + auto AlignmentIsAcceptable = [&](EVT VT, unsigned AlignCheck) { + if (memOpAlign(SrcAlign, DstAlign, AlignCheck)) + return true; + bool Fast; + return allowsMisalignedMemoryAccesses(VT, 0, 1, MachineMemOperand::MONone, + &Fast) && + Fast; + }; + + if (CanUseNEON && IsMemset && !IsSmallMemset && + AlignmentIsAcceptable(MVT::v2i64, 16)) + return LLT::vector(2, 64); + if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, 16)) + return LLT::scalar(128); + if (Size >= 8 && AlignmentIsAcceptable(MVT::i64, 8)) + return LLT::scalar(64); + if (Size >= 4 && AlignmentIsAcceptable(MVT::i32, 4)) + return LLT::scalar(32); + return LLT(); +} + // 12-bit optionally shifted immediates are legal for adds. bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const { if (Immed == std::numeric_limits<int64_t>::min()) { @@ -10065,6 +10407,14 @@ static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) { Opcode = AArch64ISD::SQSHLU_I; IsRightShift = false; break; + case Intrinsic::aarch64_neon_sshl: + case Intrinsic::aarch64_neon_ushl: + // For positive shift amounts we can use SHL, as ushl/sshl perform a regular + // left shift for positive shift amounts. Below, we only replace the current + // node with VSHL, if this condition is met. 
+ Opcode = AArch64ISD::VSHL; + IsRightShift = false; + break; } if (IsRightShift && ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits) { @@ -10151,6 +10501,8 @@ static SDValue performIntrinsicCombine(SDNode *N, case Intrinsic::aarch64_neon_sqshlu: case Intrinsic::aarch64_neon_srshl: case Intrinsic::aarch64_neon_urshl: + case Intrinsic::aarch64_neon_sshl: + case Intrinsic::aarch64_neon_ushl: return tryCombineShiftImm(IID, N, DAG); case Intrinsic::aarch64_crc32b: case Intrinsic::aarch64_crc32cb: @@ -10482,10 +10834,10 @@ static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, return ReplacedSplat; SDLoc DL(S); - unsigned NumElts = VT.getVectorNumElements() / 2; + // Split VT into two. - EVT HalfVT = - EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), NumElts); + EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext()); + unsigned NumElts = HalfVT.getVectorNumElements(); SDValue SubVector0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal, DAG.getConstant(0, DL, MVT::i64)); SDValue SubVector1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal, @@ -10567,7 +10919,7 @@ static SDValue performPostLD1Combine(SDNode *N, // are predecessors to each other or the Vector. SmallPtrSet<const SDNode *, 32> Visited; SmallVector<const SDNode *, 16> Worklist; - Visited.insert(N); + Visited.insert(Addr.getNode()); Worklist.push_back(User); Worklist.push_back(LD); Worklist.push_back(Vector.getNode()); @@ -11983,6 +12335,27 @@ bool AArch64TargetLowering::isMaskAndCmp0FoldingBeneficial( return Mask->getValue().isPowerOf2(); } +bool AArch64TargetLowering:: + shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd( + SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, + unsigned OldShiftOpcode, unsigned NewShiftOpcode, + SelectionDAG &DAG) const { + // Does baseline recommend not to perform the fold by default? + if (!TargetLowering::shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd( + X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG)) + return false; + // Else, if this is a vector shift, prefer 'shl'. + return X.getValueType().isScalarInteger() || NewShiftOpcode == ISD::SHL; +} + +bool AArch64TargetLowering::shouldExpandShift(SelectionDAG &DAG, + SDNode *N) const { + if (DAG.getMachineFunction().getFunction().hasMinSize() && + !Subtarget->isTargetWindows()) + return false; + return true; +} + void AArch64TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const { // Update IsSplitCSR in AArch64unctionInfo. AArch64FunctionInfo *AFI = Entry->getParent()->getInfo<AArch64FunctionInfo>(); @@ -12009,7 +12382,7 @@ void AArch64TargetLowering::insertCopiesSplitCSR( else llvm_unreachable("Unexpected register class in CSRsViaCopy!"); - unsigned NewVR = MRI->createVirtualRegister(RC); + Register NewVR = MRI->createVirtualRegister(RC); // Create copy from CSR to a virtual register. 
// FIXME: this currently does not emit CFI pseudo-instructions, it works // fine for CXX_FAST_TLS since the C++-style TLS access functions should be diff --git a/lib/Target/AArch64/AArch64ISelLowering.h b/lib/Target/AArch64/AArch64ISelLowering.h index 4421c31f65c9..00fa96bc4e6d 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.h +++ b/lib/Target/AArch64/AArch64ISelLowering.h @@ -191,6 +191,11 @@ enum NodeType : unsigned { FRECPE, FRECPS, FRSQRTE, FRSQRTS, + SUNPKHI, + SUNPKLO, + UUNPKHI, + UUNPKLO, + // NEON Load/Store with post-increment base updates LD2post = ISD::FIRST_TARGET_MEMORY_OPCODE, LD3post, @@ -261,6 +266,14 @@ public: const SelectionDAG &DAG, unsigned Depth = 0) const override; + MVT getPointerTy(const DataLayout &DL, uint32_t AS = 0) const override { + // Returning i64 unconditionally here (i.e. even for ILP32) means that the + // *DAG* representation of pointers will always be 64-bits. They will be + // truncated and extended when transferred to memory, but the 64-bit DAG + // allows us to use AArch64's addressing modes much more easily. + return MVT::getIntegerVT(64); + } + bool targetShrinkDemandedConstant(SDValue Op, const APInt &Demanded, TargetLoweringOpt &TLO) const override; @@ -272,6 +285,10 @@ public: EVT VT, unsigned AddrSpace = 0, unsigned Align = 1, MachineMemOperand::Flags Flags = MachineMemOperand::MONone, bool *Fast = nullptr) const override; + /// LLT variant. + bool allowsMisalignedMemoryAccesses( + LLT Ty, unsigned AddrSpace, unsigned Align, MachineMemOperand::Flags Flags, + bool *Fast = nullptr) const override; /// Provide custom lowering hooks for some operations. SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; @@ -358,6 +375,10 @@ public: bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc, const AttributeList &FuncAttributes) const override; + LLT getOptimalMemOpLLT(uint64_t Size, unsigned DstAlign, unsigned SrcAlign, + bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc, + const AttributeList &FuncAttributes) const override; + /// Return true if the addressing mode represented by AM is legal for this /// target, for a load/store of the specified type. 
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, @@ -480,11 +501,12 @@ public: return VT.getSizeInBits() >= 64; // vector 'bic' } - bool shouldExpandShift(SelectionDAG &DAG, SDNode *N) const override { - if (DAG.getMachineFunction().getFunction().hasMinSize()) - return false; - return true; - } + bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd( + SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, + unsigned OldShiftOpcode, unsigned NewShiftOpcode, + SelectionDAG &DAG) const override; + + bool shouldExpandShift(SelectionDAG &DAG, SDNode *N) const override; bool shouldTransformSignedTruncationCheck(EVT XVT, unsigned KeptBits) const override { @@ -655,6 +677,7 @@ private: SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSPLAT_VECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVectorSRA_SRL_SHL(SDValue Op, SelectionDAG &DAG) const; SDValue LowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const; @@ -690,8 +713,8 @@ private: unsigned combineRepeatedFPDivisors() const override; ConstraintType getConstraintType(StringRef Constraint) const override; - unsigned getRegisterByName(const char* RegName, EVT VT, - SelectionDAG &DAG) const override; + Register getRegisterByName(const char* RegName, EVT VT, + const MachineFunction &MF) const override; /// Examine constraint string and operand type and determine a weight value. /// The operand object must already have been set up with the operand type. diff --git a/lib/Target/AArch64/AArch64InstrAtomics.td b/lib/Target/AArch64/AArch64InstrAtomics.td index e22cb44d81ae..459b53923625 100644 --- a/lib/Target/AArch64/AArch64InstrAtomics.td +++ b/lib/Target/AArch64/AArch64InstrAtomics.td @@ -204,19 +204,27 @@ def : Pat<(relaxed_store<atomic_store_64> def ldxr_1 : PatFrag<(ops node:$ptr), (int_aarch64_ldxr node:$ptr), [{ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i8; -}]>; +}]> { + let GISelPredicateCode = [{ return isLoadStoreOfNumBytes(MI, 1); }]; +} def ldxr_2 : PatFrag<(ops node:$ptr), (int_aarch64_ldxr node:$ptr), [{ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i16; -}]>; +}]> { + let GISelPredicateCode = [{ return isLoadStoreOfNumBytes(MI, 2); }]; +} def ldxr_4 : PatFrag<(ops node:$ptr), (int_aarch64_ldxr node:$ptr), [{ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i32; -}]>; +}]> { + let GISelPredicateCode = [{ return isLoadStoreOfNumBytes(MI, 4); }]; +} def ldxr_8 : PatFrag<(ops node:$ptr), (int_aarch64_ldxr node:$ptr), [{ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i64; -}]>; +}]> { + let GISelPredicateCode = [{ return isLoadStoreOfNumBytes(MI, 8); }]; +} def : Pat<(ldxr_1 GPR64sp:$addr), (SUBREG_TO_REG (i64 0), (LDXRB GPR64sp:$addr), sub_32)>; @@ -237,19 +245,27 @@ def : Pat<(and (ldxr_4 GPR64sp:$addr), 0xffffffff), def ldaxr_1 : PatFrag<(ops node:$ptr), (int_aarch64_ldaxr node:$ptr), [{ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i8; -}]>; +}]> { + let GISelPredicateCode = [{ return isLoadStoreOfNumBytes(MI, 1); }]; +} def ldaxr_2 : PatFrag<(ops node:$ptr), (int_aarch64_ldaxr node:$ptr), [{ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i16; -}]>; +}]> { + let GISelPredicateCode = [{ return isLoadStoreOfNumBytes(MI, 2); }]; +} def ldaxr_4 : PatFrag<(ops node:$ptr), (int_aarch64_ldaxr 
node:$ptr), [{ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i32; -}]>; +}]> { + let GISelPredicateCode = [{ return isLoadStoreOfNumBytes(MI, 4); }]; +} def ldaxr_8 : PatFrag<(ops node:$ptr), (int_aarch64_ldaxr node:$ptr), [{ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i64; -}]>; +}]> { + let GISelPredicateCode = [{ return isLoadStoreOfNumBytes(MI, 8); }]; +} def : Pat<(ldaxr_1 GPR64sp:$addr), (SUBREG_TO_REG (i64 0), (LDAXRB GPR64sp:$addr), sub_32)>; @@ -271,22 +287,30 @@ def : Pat<(and (ldaxr_4 GPR64sp:$addr), 0xffffffff), def stxr_1 : PatFrag<(ops node:$val, node:$ptr), (int_aarch64_stxr node:$val, node:$ptr), [{ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i8; -}]>; +}]> { + let GISelPredicateCode = [{ return isLoadStoreOfNumBytes(MI, 1); }]; +} def stxr_2 : PatFrag<(ops node:$val, node:$ptr), (int_aarch64_stxr node:$val, node:$ptr), [{ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i16; -}]>; +}]> { + let GISelPredicateCode = [{ return isLoadStoreOfNumBytes(MI, 2); }]; +} def stxr_4 : PatFrag<(ops node:$val, node:$ptr), (int_aarch64_stxr node:$val, node:$ptr), [{ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i32; -}]>; +}]> { + let GISelPredicateCode = [{ return isLoadStoreOfNumBytes(MI, 4); }]; +} def stxr_8 : PatFrag<(ops node:$val, node:$ptr), (int_aarch64_stxr node:$val, node:$ptr), [{ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i64; -}]>; +}]> { + let GISelPredicateCode = [{ return isLoadStoreOfNumBytes(MI, 8); }]; +} def : Pat<(stxr_1 GPR64:$val, GPR64sp:$addr), @@ -317,22 +341,30 @@ def : Pat<(stxr_4 (and GPR64:$val, 0xffffffff), GPR64sp:$addr), def stlxr_1 : PatFrag<(ops node:$val, node:$ptr), (int_aarch64_stlxr node:$val, node:$ptr), [{ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i8; -}]>; +}]> { + let GISelPredicateCode = [{ return isLoadStoreOfNumBytes(MI, 1); }]; +} def stlxr_2 : PatFrag<(ops node:$val, node:$ptr), (int_aarch64_stlxr node:$val, node:$ptr), [{ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i16; -}]>; +}]> { + let GISelPredicateCode = [{ return isLoadStoreOfNumBytes(MI, 2); }]; +} def stlxr_4 : PatFrag<(ops node:$val, node:$ptr), (int_aarch64_stlxr node:$val, node:$ptr), [{ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i32; -}]>; +}]> { + let GISelPredicateCode = [{ return isLoadStoreOfNumBytes(MI, 4); }]; +} def stlxr_8 : PatFrag<(ops node:$val, node:$ptr), (int_aarch64_stlxr node:$val, node:$ptr), [{ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i64; -}]>; +}]> { + let GISelPredicateCode = [{ return isLoadStoreOfNumBytes(MI, 8); }]; +} def : Pat<(stlxr_1 GPR64:$val, GPR64sp:$addr), @@ -422,4 +454,3 @@ let Predicates = [HasLSE] in { defm : LDOPregister_patterns_mod<"LDADD", "atomic_load_sub", "SUB">; defm : LDOPregister_patterns_mod<"LDCLR", "atomic_load_and", "ORN">; } - diff --git a/lib/Target/AArch64/AArch64InstrFormats.td b/lib/Target/AArch64/AArch64InstrFormats.td index d619137b55c5..f555e4123307 100644 --- a/lib/Target/AArch64/AArch64InstrFormats.td +++ b/lib/Target/AArch64/AArch64InstrFormats.td @@ -480,76 +480,40 @@ def BranchTarget14Operand : BranchTarget<14>; def BranchTarget26Operand : BranchTarget<26>; def PCRelLabel19Operand : PCRelLabel<19>; -def MovZSymbolG3AsmOperand : AsmOperandClass { - let Name = "MovZSymbolG3"; +def MovWSymbolG3AsmOperand : AsmOperandClass { + let Name = "MovWSymbolG3"; let RenderMethod = "addImmOperands"; } -def movz_symbol_g3 : Operand<i32> { - let ParserMatchClass = MovZSymbolG3AsmOperand; +def 
movw_symbol_g3 : Operand<i32> { + let ParserMatchClass = MovWSymbolG3AsmOperand; } -def MovZSymbolG2AsmOperand : AsmOperandClass { - let Name = "MovZSymbolG2"; +def MovWSymbolG2AsmOperand : AsmOperandClass { + let Name = "MovWSymbolG2"; let RenderMethod = "addImmOperands"; } -def movz_symbol_g2 : Operand<i32> { - let ParserMatchClass = MovZSymbolG2AsmOperand; +def movw_symbol_g2 : Operand<i32> { + let ParserMatchClass = MovWSymbolG2AsmOperand; } -def MovZSymbolG1AsmOperand : AsmOperandClass { - let Name = "MovZSymbolG1"; +def MovWSymbolG1AsmOperand : AsmOperandClass { + let Name = "MovWSymbolG1"; let RenderMethod = "addImmOperands"; } -def movz_symbol_g1 : Operand<i32> { - let ParserMatchClass = MovZSymbolG1AsmOperand; +def movw_symbol_g1 : Operand<i32> { + let ParserMatchClass = MovWSymbolG1AsmOperand; } -def MovZSymbolG0AsmOperand : AsmOperandClass { - let Name = "MovZSymbolG0"; +def MovWSymbolG0AsmOperand : AsmOperandClass { + let Name = "MovWSymbolG0"; let RenderMethod = "addImmOperands"; } -def movz_symbol_g0 : Operand<i32> { - let ParserMatchClass = MovZSymbolG0AsmOperand; -} - -def MovKSymbolG3AsmOperand : AsmOperandClass { - let Name = "MovKSymbolG3"; - let RenderMethod = "addImmOperands"; -} - -def movk_symbol_g3 : Operand<i32> { - let ParserMatchClass = MovKSymbolG3AsmOperand; -} - -def MovKSymbolG2AsmOperand : AsmOperandClass { - let Name = "MovKSymbolG2"; - let RenderMethod = "addImmOperands"; -} - -def movk_symbol_g2 : Operand<i32> { - let ParserMatchClass = MovKSymbolG2AsmOperand; -} - -def MovKSymbolG1AsmOperand : AsmOperandClass { - let Name = "MovKSymbolG1"; - let RenderMethod = "addImmOperands"; -} - -def movk_symbol_g1 : Operand<i32> { - let ParserMatchClass = MovKSymbolG1AsmOperand; -} - -def MovKSymbolG0AsmOperand : AsmOperandClass { - let Name = "MovKSymbolG0"; - let RenderMethod = "addImmOperands"; -} - -def movk_symbol_g0 : Operand<i32> { - let ParserMatchClass = MovKSymbolG0AsmOperand; +def movw_symbol_g0 : Operand<i32> { + let ParserMatchClass = MovWSymbolG0AsmOperand; } class fixedpoint_i32<ValueType FloatVT> @@ -673,6 +637,11 @@ def logical_imm64_XFORM : SDNodeXForm<imm, [{ return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i32); }]>; +def gi_logical_imm32_XFORM : GICustomOperandRenderer<"renderLogicalImm32">, + GISDNodeXFormEquiv<logical_imm32_XFORM>; +def gi_logical_imm64_XFORM : GICustomOperandRenderer<"renderLogicalImm64">, + GISDNodeXFormEquiv<logical_imm64_XFORM>; + let DiagnosticType = "LogicalSecondSource" in { def LogicalImm32Operand : AsmOperandClass { let Name = "LogicalImm32"; @@ -714,12 +683,15 @@ def logical_imm64_not : Operand<i64> { let ParserMatchClass = LogicalImm64NotOperand; } -// imm0_65535 predicate - True if the immediate is in the range [0,65535]. -def imm0_65535 : Operand<i32>, ImmLeaf<i32, [{ +// iXX_imm0_65535 predicates - True if the immediate is in the range [0,65535]. +let ParserMatchClass = AsmImmRange<0, 65535>, PrintMethod = "printImmHex" in { +def i32_imm0_65535 : Operand<i32>, TImmLeaf<i32, [{ return ((uint32_t)Imm) < 65536; -}]> { - let ParserMatchClass = AsmImmRange<0, 65535>; - let PrintMethod = "printImmHex"; +}]>; + +def i64_imm0_65535 : Operand<i64>, TImmLeaf<i64, [{ + return ((uint64_t)Imm) < 65536; +}]>; } // imm0_255 predicate - True if the immediate is in the range [0,255]. 
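The gi_logical_imm*_XFORM records above pair each SelectionDAG SDNodeXForm with a GlobalISel custom renderer, so both selection paths encode logical immediates identically. A minimal sketch of the selector-side method that the GICustomOperandRenderer string names (the exact signature and the G_CONSTANT operand layout are assumptions here, not part of this patch):

// Invoked by TableGen-erated selection code for gi_logical_imm32_XFORM.
// Assumes the constant arrives as a G_CONSTANT whose operand 1 is a CImm.
void AArch64InstructionSelector::renderLogicalImm32(MachineInstrBuilder &MIB,
                                                    const MachineInstr &I) const {
  assert(I.getOpcode() == TargetOpcode::G_CONSTANT && "expected G_CONSTANT");
  uint64_t CstVal = I.getOperand(1).getCImm()->getZExtValue();
  // Same re-encoding the DAG path performs in logical_imm32_XFORM.
  MIB.addImm(AArch64_AM::encodeLogicalImmediate(CstVal, 32));
}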
@@ -815,6 +787,14 @@ class arith_shifted_reg<ValueType Ty, RegisterClass regclass, int width> def arith_shifted_reg32 : arith_shifted_reg<i32, GPR32, 32>; def arith_shifted_reg64 : arith_shifted_reg<i64, GPR64, 64>; +def gi_arith_shifted_reg32 : + GIComplexOperandMatcher<s32, "selectArithShiftedRegister">, + GIComplexPatternEquiv<arith_shifted_reg32>; + +def gi_arith_shifted_reg64 : + GIComplexOperandMatcher<s64, "selectArithShiftedRegister">, + GIComplexPatternEquiv<arith_shifted_reg64>; + // An arithmetic shifter operand: // {7-6} - shift type: 00 = lsl, 01 = lsr, 10 = asr, 11 = ror // {5-0} - imm6 @@ -837,6 +817,14 @@ class logical_shifted_reg<ValueType Ty, RegisterClass regclass, Operand shiftop> def logical_shifted_reg32 : logical_shifted_reg<i32, GPR32, logical_shift32>; def logical_shifted_reg64 : logical_shifted_reg<i64, GPR64, logical_shift64>; +def gi_logical_shifted_reg32 : + GIComplexOperandMatcher<s32, "selectLogicalShiftedRegister">, + GIComplexPatternEquiv<logical_shifted_reg32>; + +def gi_logical_shifted_reg64 : + GIComplexOperandMatcher<s64, "selectLogicalShiftedRegister">, + GIComplexPatternEquiv<logical_shifted_reg64>; + // A logical vector shifter operand: // {7-6} - shift type: 00 = lsl // {5-0} - imm6: #0, #8, #16, or #24 @@ -918,6 +906,14 @@ class neg_addsub_shifted_imm<ValueType Ty> def neg_addsub_shifted_imm32 : neg_addsub_shifted_imm<i32>; def neg_addsub_shifted_imm64 : neg_addsub_shifted_imm<i64>; +def gi_neg_addsub_shifted_imm32 : + GIComplexOperandMatcher<s32, "selectNegArithImmed">, + GIComplexPatternEquiv<neg_addsub_shifted_imm32>; + +def gi_neg_addsub_shifted_imm64 : + GIComplexOperandMatcher<s64, "selectNegArithImmed">, + GIComplexPatternEquiv<neg_addsub_shifted_imm64>; + // An extend operand: // {5-3} - extend type // {2-0} - imm3 @@ -948,6 +944,21 @@ class arith_extended_reg32to64<ValueType Ty> : Operand<Ty>, let MIOperandInfo = (ops GPR32, arith_extend64); } +def arith_extended_reg32_i32 : arith_extended_reg32<i32>; +def gi_arith_extended_reg32_i32 : + GIComplexOperandMatcher<s32, "selectArithExtendedRegister">, + GIComplexPatternEquiv<arith_extended_reg32_i32>; + +def arith_extended_reg32_i64 : arith_extended_reg32<i64>; +def gi_arith_extended_reg32_i64 : + GIComplexOperandMatcher<s64, "selectArithExtendedRegister">, + GIComplexPatternEquiv<arith_extended_reg32_i64>; + +def arith_extended_reg32to64_i64 : arith_extended_reg32to64<i64>; +def gi_arith_extended_reg32to64_i64 : + GIComplexOperandMatcher<s64, "selectArithExtendedRegister">, + GIComplexPatternEquiv<arith_extended_reg32to64_i64>; + // Floating-point immediate. 
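Each gi_* record in these hunks binds an existing ComplexPattern-style operand (shifted registers, negated immediates, extended registers) to a named C++ matcher on the GlobalISel instruction selector. The contract is that the method either fails or hands back one renderer callback per sub-operand. A sketch of that shape, showing only the trivial no-shift fallback (the in-tree matchers also fold feeding shifts and extends; the body here is illustrative):

// Shape of a GIComplexOperandMatcher callback; body is a minimal sketch.
InstructionSelector::ComplexRendererFns
AArch64InstructionSelector::selectArithShiftedRegister(MachineOperand &Root) const {
  if (!Root.isReg())
    return None;
  Register Reg = Root.getReg(); // assume no foldable shift feeds Root
  unsigned ShiftImm = AArch64_AM::getShifterImm(AArch64_AM::LSL, 0);
  // One renderer per sub-operand of the shifted-register form.
  return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(Reg); },
           [=](MachineInstrBuilder &MIB) { MIB.addImm(ShiftImm); }}};
}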
def fpimm16 : Operand<f16>, FPImmLeaf<f16, [{ @@ -1000,8 +1011,8 @@ class AsmVectorIndex<int Min, int Max, string NamePrefix=""> : AsmOperandClass { let RenderMethod = "addVectorIndexOperands"; } -class AsmVectorIndexOpnd<AsmOperandClass mc, code pred> - : Operand<i64>, ImmLeaf<i64, pred> { +class AsmVectorIndexOpnd<ValueType ty, AsmOperandClass mc, code pred> + : Operand<ty>, ImmLeaf<ty, pred> { let ParserMatchClass = mc; let PrintMethod = "printVectorIndex"; } @@ -1012,11 +1023,17 @@ def VectorIndexHOperand : AsmVectorIndex<0, 7>; def VectorIndexSOperand : AsmVectorIndex<0, 3>; def VectorIndexDOperand : AsmVectorIndex<0, 1>; -def VectorIndex1 : AsmVectorIndexOpnd<VectorIndex1Operand, [{ return ((uint64_t)Imm) == 1; }]>; -def VectorIndexB : AsmVectorIndexOpnd<VectorIndexBOperand, [{ return ((uint64_t)Imm) < 16; }]>; -def VectorIndexH : AsmVectorIndexOpnd<VectorIndexHOperand, [{ return ((uint64_t)Imm) < 8; }]>; -def VectorIndexS : AsmVectorIndexOpnd<VectorIndexSOperand, [{ return ((uint64_t)Imm) < 4; }]>; -def VectorIndexD : AsmVectorIndexOpnd<VectorIndexDOperand, [{ return ((uint64_t)Imm) < 2; }]>; +def VectorIndex1 : AsmVectorIndexOpnd<i64, VectorIndex1Operand, [{ return ((uint64_t)Imm) == 1; }]>; +def VectorIndexB : AsmVectorIndexOpnd<i64, VectorIndexBOperand, [{ return ((uint64_t)Imm) < 16; }]>; +def VectorIndexH : AsmVectorIndexOpnd<i64, VectorIndexHOperand, [{ return ((uint64_t)Imm) < 8; }]>; +def VectorIndexS : AsmVectorIndexOpnd<i64, VectorIndexSOperand, [{ return ((uint64_t)Imm) < 4; }]>; +def VectorIndexD : AsmVectorIndexOpnd<i64, VectorIndexDOperand, [{ return ((uint64_t)Imm) < 2; }]>; + +def VectorIndex132b : AsmVectorIndexOpnd<i32, VectorIndex1Operand, [{ return ((uint64_t)Imm) == 1; }]>; +def VectorIndexB32b : AsmVectorIndexOpnd<i32, VectorIndexBOperand, [{ return ((uint64_t)Imm) < 16; }]>; +def VectorIndexH32b : AsmVectorIndexOpnd<i32, VectorIndexHOperand, [{ return ((uint64_t)Imm) < 8; }]>; +def VectorIndexS32b : AsmVectorIndexOpnd<i32, VectorIndexSOperand, [{ return ((uint64_t)Imm) < 4; }]>; +def VectorIndexD32b : AsmVectorIndexOpnd<i32, VectorIndexDOperand, [{ return ((uint64_t)Imm) < 2; }]>; def SVEVectorIndexExtDupBOperand : AsmVectorIndex<0, 63, "SVE">; def SVEVectorIndexExtDupHOperand : AsmVectorIndex<0, 31, "SVE">; @@ -1025,15 +1042,15 @@ def SVEVectorIndexExtDupDOperand : AsmVectorIndex<0, 7, "SVE">; def SVEVectorIndexExtDupQOperand : AsmVectorIndex<0, 3, "SVE">; def sve_elm_idx_extdup_b - : AsmVectorIndexOpnd<SVEVectorIndexExtDupBOperand, [{ return ((uint64_t)Imm) < 64; }]>; + : AsmVectorIndexOpnd<i64, SVEVectorIndexExtDupBOperand, [{ return ((uint64_t)Imm) < 64; }]>; def sve_elm_idx_extdup_h - : AsmVectorIndexOpnd<SVEVectorIndexExtDupHOperand, [{ return ((uint64_t)Imm) < 32; }]>; + : AsmVectorIndexOpnd<i64, SVEVectorIndexExtDupHOperand, [{ return ((uint64_t)Imm) < 32; }]>; def sve_elm_idx_extdup_s - : AsmVectorIndexOpnd<SVEVectorIndexExtDupSOperand, [{ return ((uint64_t)Imm) < 16; }]>; + : AsmVectorIndexOpnd<i64, SVEVectorIndexExtDupSOperand, [{ return ((uint64_t)Imm) < 16; }]>; def sve_elm_idx_extdup_d - : AsmVectorIndexOpnd<SVEVectorIndexExtDupDOperand, [{ return ((uint64_t)Imm) < 8; }]>; + : AsmVectorIndexOpnd<i64, SVEVectorIndexExtDupDOperand, [{ return ((uint64_t)Imm) < 8; }]>; def sve_elm_idx_extdup_q - : AsmVectorIndexOpnd<SVEVectorIndexExtDupQOperand, [{ return ((uint64_t)Imm) < 4; }]>; + : AsmVectorIndexOpnd<i64, SVEVectorIndexExtDupQOperand, [{ return ((uint64_t)Imm) < 4; }]>; // 8-bit immediate for AdvSIMD where 64-bit values of the form: // aaaaaaaa 
bbbbbbbb cccccccc dddddddd eeeeeeee ffffffff gggggggg hhhhhhhh @@ -1082,6 +1099,45 @@ class RtSystemI<bit L, dag oops, dag iops, string asm, string operands> let Inst{4-0} = Rt; } +// System instructions for transactional memory extension +class TMBaseSystemI<bit L, bits<4> CRm, bits<3> op2, dag oops, dag iops, + string asm, string operands, list<dag> pattern> + : BaseSystemI<L, oops, iops, asm, operands, pattern>, + Sched<[WriteSys]> { + let Inst{20-12} = 0b000110011; + let Inst{11-8} = CRm; + let Inst{7-5} = op2; + let DecoderMethod = ""; + + let mayLoad = 1; + let mayStore = 1; +} + +// System instructions for transactional memory - single input operand +class TMSystemI<bits<4> CRm, string asm, list<dag> pattern> + : TMBaseSystemI<0b1, CRm, 0b011, + (outs GPR64:$Rt), (ins), asm, "\t$Rt", pattern> { + bits<5> Rt; + let Inst{4-0} = Rt; +} + +// System instructions for transactional memory - no operand +class TMSystemINoOperand<bits<4> CRm, string asm, list<dag> pattern> + : TMBaseSystemI<0b0, CRm, 0b011, (outs), (ins), asm, "", pattern> { + let Inst{4-0} = 0b11111; +} + +// System instructions for exit from transactions +class TMSystemException<bits<3> op1, string asm, list<dag> pattern> + : I<(outs), (ins i64_imm0_65535:$imm), asm, "\t$imm", "", pattern>, + Sched<[WriteSys]> { + bits<16> imm; + let Inst{31-24} = 0b11010100; + let Inst{23-21} = op1; + let Inst{20-5} = imm; + let Inst{4-0} = 0b00000; +} + // Hint instructions that take both a CRm and a 3-bit immediate. // NOTE: ideally, this would have mayStore = 0, mayLoad = 0, but we cannot // model patterns with sufficiently fine granularity @@ -2180,11 +2236,11 @@ multiclass AddSub<bit isSub, string mnemonic, string alias, // Add/Subtract extended register let AddedComplexity = 1, hasSideEffects = 0 in { def Wrx : BaseAddSubEReg<isSub, 0, GPR32sp, GPR32sp, - arith_extended_reg32<i32>, mnemonic, OpNode> { + arith_extended_reg32_i32, mnemonic, OpNode> { let Inst{31} = 0; } def Xrx : BaseAddSubEReg<isSub, 0, GPR64sp, GPR64sp, - arith_extended_reg32to64<i64>, mnemonic, OpNode> { + arith_extended_reg32to64_i64, mnemonic, OpNode> { let Inst{31} = 1; } } @@ -2254,11 +2310,11 @@ multiclass AddSubS<bit isSub, string mnemonic, SDNode OpNode, string cmp, // Add/Subtract extended register let AddedComplexity = 1 in { def Wrx : BaseAddSubEReg<isSub, 1, GPR32, GPR32sp, - arith_extended_reg32<i32>, mnemonic, OpNode> { + arith_extended_reg32_i32, mnemonic, OpNode> { let Inst{31} = 0; } def Xrx : BaseAddSubEReg<isSub, 1, GPR64, GPR64sp, - arith_extended_reg32<i64>, mnemonic, OpNode> { + arith_extended_reg32_i64, mnemonic, OpNode> { let Inst{31} = 1; } } @@ -2969,6 +3025,22 @@ def ro_Xindexed32 : ComplexPattern<i64, 4, "SelectAddrModeXRO<32>", []>; def ro_Xindexed64 : ComplexPattern<i64, 4, "SelectAddrModeXRO<64>", []>; def ro_Xindexed128 : ComplexPattern<i64, 4, "SelectAddrModeXRO<128>", []>; +def gi_ro_Xindexed8 : + GIComplexOperandMatcher<s64, "selectAddrModeXRO<8>">, + GIComplexPatternEquiv<ro_Xindexed8>; +def gi_ro_Xindexed16 : + GIComplexOperandMatcher<s64, "selectAddrModeXRO<16>">, + GIComplexPatternEquiv<ro_Xindexed16>; +def gi_ro_Xindexed32 : + GIComplexOperandMatcher<s64, "selectAddrModeXRO<32>">, + GIComplexPatternEquiv<ro_Xindexed32>; +def gi_ro_Xindexed64 : + GIComplexOperandMatcher<s64, "selectAddrModeXRO<64>">, + GIComplexPatternEquiv<ro_Xindexed64>; +def gi_ro_Xindexed128 : + GIComplexOperandMatcher<s64, "selectAddrModeXRO<128>">, + GIComplexPatternEquiv<ro_Xindexed128>; + def ro_Windexed8 : ComplexPattern<i64, 4, "SelectAddrModeWRO<8>", 
[]>; def ro_Windexed16 : ComplexPattern<i64, 4, "SelectAddrModeWRO<16>", []>; def ro_Windexed32 : ComplexPattern<i64, 4, "SelectAddrModeWRO<32>", []>; @@ -4086,7 +4158,7 @@ multiclass MemTagStore<bits<2> opc1, string insn> { let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in class ExceptionGeneration<bits<3> op1, bits<2> ll, string asm> - : I<(outs), (ins imm0_65535:$imm), asm, "\t$imm", "", []>, + : I<(outs), (ins i32_imm0_65535:$imm), asm, "\t$imm", "", []>, Sched<[WriteSys]> { bits<16> imm; let Inst{31-24} = 0b11010100; diff --git a/lib/Target/AArch64/AArch64InstrInfo.cpp b/lib/Target/AArch64/AArch64InstrInfo.cpp index 215e96a82d0e..5c35e5bcdd30 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -32,6 +32,7 @@ #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/IR/DebugLoc.h" #include "llvm/IR/GlobalValue.h" +#include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstrDesc.h" #include "llvm/Support/Casting.h" @@ -82,6 +83,10 @@ unsigned AArch64InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { return getInlineAsmLength(MI.getOperand(0).getSymbolName(), *MAI); } + // Meta-instructions emit no code. + if (MI.isMetaInstruction()) + return 0; + // FIXME: We currently only handle pseudoinstructions that don't get expanded // before the assembly printer. unsigned NumBytes = 0; @@ -91,12 +96,6 @@ unsigned AArch64InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { // Anything not explicitly designated otherwise is a normal 4-byte insn. NumBytes = 4; break; - case TargetOpcode::DBG_VALUE: - case TargetOpcode::EH_LABEL: - case TargetOpcode::IMPLICIT_DEF: - case TargetOpcode::KILL: - NumBytes = 0; - break; case TargetOpcode::STACKMAP: // The upper bound for a stackmap intrinsic is the full length of its shadow NumBytes = StackMapOpers(&MI).getNumPatchBytes(); @@ -416,7 +415,7 @@ unsigned AArch64InstrInfo::insertBranch( // Find the original register that VReg is copied from. static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg) { - while (TargetRegisterInfo::isVirtualRegister(VReg)) { + while (Register::isVirtualRegister(VReg)) { const MachineInstr *DefMI = MRI.getVRegDef(VReg); if (!DefMI->isFullCopy()) return VReg; @@ -431,7 +430,7 @@ static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg) { static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg, unsigned *NewVReg = nullptr) { VReg = removeCopies(MRI, VReg); - if (!TargetRegisterInfo::isVirtualRegister(VReg)) + if (!Register::isVirtualRegister(VReg)) return 0; bool Is64Bit = AArch64::GPR64allRegClass.hasSubClassEq(MRI.getRegClass(VReg)); @@ -574,7 +573,7 @@ void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB, CC = AArch64CC::NE; break; } - unsigned SrcReg = Cond[2].getReg(); + Register SrcReg = Cond[2].getReg(); if (Is64Bit) { // cmp reg, #0 is actually subs xzr, reg, #0. 
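// Concretely, the select this lowers to looks roughly like (illustrative
// assembly, not emitted text):
//   subs xzr, x<src>, #0         // "cmp reg, #0" spelled as SUBSXri
//   csel x<dst>, x<t>, x<f>, ne  // pick based on the NZCV result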
MRI.constrainRegClass(SrcReg, &AArch64::GPR64spRegClass); @@ -930,7 +929,7 @@ bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI, } bool AArch64InstrInfo::areMemAccessesTriviallyDisjoint( - const MachineInstr &MIa, const MachineInstr &MIb, AliasAnalysis *AA) const { + const MachineInstr &MIa, const MachineInstr &MIb) const { const TargetRegisterInfo *TRI = &getRegisterInfo(); const MachineOperand *BaseOpA = nullptr, *BaseOpB = nullptr; int64_t OffsetA = 0, OffsetB = 0; @@ -1071,8 +1070,8 @@ static bool UpdateOperandRegClass(MachineInstr &Instr) { assert(MO.isReg() && "Operand has register constraints without being a register!"); - unsigned Reg = MO.getReg(); - if (TargetRegisterInfo::isPhysicalRegister(Reg)) { + Register Reg = MO.getReg(); + if (Register::isPhysicalRegister(Reg)) { if (!OpRegCstraints->contains(Reg)) return false; } else if (!OpRegCstraints->hasSubClassEq(MRI->getRegClass(Reg)) && @@ -1472,6 +1471,8 @@ bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { return false; MachineBasicBlock &MBB = *MI.getParent(); + auto &Subtarget = MBB.getParent()->getSubtarget<AArch64Subtarget>(); + auto TRI = Subtarget.getRegisterInfo(); DebugLoc DL = MI.getDebugLoc(); if (MI.getOpcode() == AArch64::CATCHRET) { @@ -1497,21 +1498,32 @@ bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { return true; } - unsigned Reg = MI.getOperand(0).getReg(); + Register Reg = MI.getOperand(0).getReg(); const GlobalValue *GV = cast<GlobalValue>((*MI.memoperands_begin())->getValue()); const TargetMachine &TM = MBB.getParent()->getTarget(); - unsigned char OpFlags = Subtarget.ClassifyGlobalReference(GV, TM); + unsigned OpFlags = Subtarget.ClassifyGlobalReference(GV, TM); const unsigned char MO_NC = AArch64II::MO_NC; if ((OpFlags & AArch64II::MO_GOT) != 0) { BuildMI(MBB, MI, DL, get(AArch64::LOADgot), Reg) .addGlobalAddress(GV, 0, OpFlags); - BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg) - .addReg(Reg, RegState::Kill) - .addImm(0) - .addMemOperand(*MI.memoperands_begin()); + if (Subtarget.isTargetILP32()) { + unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32); + BuildMI(MBB, MI, DL, get(AArch64::LDRWui)) + .addDef(Reg32, RegState::Dead) + .addUse(Reg, RegState::Kill) + .addImm(0) + .addMemOperand(*MI.memoperands_begin()) + .addDef(Reg, RegState::Implicit); + } else { + BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg) + .addReg(Reg, RegState::Kill) + .addImm(0) + .addMemOperand(*MI.memoperands_begin()); + } } else if (TM.getCodeModel() == CodeModel::Large) { + assert(!Subtarget.isTargetILP32() && "how can large exist in ILP32?"); BuildMI(MBB, MI, DL, get(AArch64::MOVZXi), Reg) .addGlobalAddress(GV, 0, AArch64II::MO_G0 | MO_NC) .addImm(0); @@ -1538,10 +1550,20 @@ bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { BuildMI(MBB, MI, DL, get(AArch64::ADRP), Reg) .addGlobalAddress(GV, 0, OpFlags | AArch64II::MO_PAGE); unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | MO_NC; - BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg) - .addReg(Reg, RegState::Kill) - .addGlobalAddress(GV, 0, LoFlags) - .addMemOperand(*MI.memoperands_begin()); + if (Subtarget.isTargetILP32()) { + unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32); + BuildMI(MBB, MI, DL, get(AArch64::LDRWui)) + .addDef(Reg32, RegState::Dead) + .addUse(Reg, RegState::Kill) + .addGlobalAddress(GV, 0, LoFlags) + .addMemOperand(*MI.memoperands_begin()) + .addDef(Reg, RegState::Implicit); + } else { + BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg) + .addReg(Reg, RegState::Kill) + 
.addGlobalAddress(GV, 0, LoFlags) + .addMemOperand(*MI.memoperands_begin()); + } } MBB.erase(MI); @@ -1581,7 +1603,7 @@ bool AArch64InstrInfo::isGPRCopy(const MachineInstr &MI) { break; case TargetOpcode::COPY: { // GPR32 copies will by lowered to ORRXrs - unsigned DstReg = MI.getOperand(0).getReg(); + Register DstReg = MI.getOperand(0).getReg(); return (AArch64::GPR32RegClass.contains(DstReg) || AArch64::GPR64RegClass.contains(DstReg)); } @@ -1611,7 +1633,7 @@ bool AArch64InstrInfo::isFPRCopy(const MachineInstr &MI) { break; case TargetOpcode::COPY: { // FPR64 copies will by lowered to ORR.16b - unsigned DstReg = MI.getOperand(0).getReg(); + Register DstReg = MI.getOperand(0).getReg(); return (AArch64::FPR64RegClass.contains(DstReg) || AArch64::FPR128RegClass.contains(DstReg)); } @@ -1917,7 +1939,7 @@ bool AArch64InstrInfo::isCandidateToMergeOrPair(const MachineInstr &MI) const { // e.g., ldr x0, [x0] // This case will never occur with an FI base. if (MI.getOperand(1).isReg()) { - unsigned BaseReg = MI.getOperand(1).getReg(); + Register BaseReg = MI.getOperand(1).getReg(); const TargetRegisterInfo *TRI = &getRegisterInfo(); if (MI.modifiesRegister(BaseReg, TRI)) return false; @@ -1928,6 +1950,17 @@ bool AArch64InstrInfo::isCandidateToMergeOrPair(const MachineInstr &MI) const { if (isLdStPairSuppressed(MI)) return false; + // Do not pair any callee-save store/reload instructions in the + // prologue/epilogue if the CFI information encoded the operations as separate + // instructions, as that will cause the size of the actual prologue to mismatch + // with the prologue size recorded in the Windows CFI. + const MCAsmInfo *MAI = MI.getMF()->getTarget().getMCAsmInfo(); + bool NeedsWinCFI = MAI->usesWindowsCFI() && + MI.getMF()->getFunction().needsUnwindTableEntry(); + if (NeedsWinCFI && (MI.getFlag(MachineInstr::FrameSetup) || + MI.getFlag(MachineInstr::FrameDestroy))) + return false; + // On some CPUs quad load/store pairs are slower than two single load/stores. if (Subtarget.isPaired128Slow()) { switch (MI.getOpcode()) { @@ -2165,6 +2198,18 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, unsigned &Scale, MinOffset = -256; MaxOffset = 255; break; + case AArch64::LDR_PXI: + case AArch64::STR_PXI: + Scale = Width = 2; + MinOffset = -256; + MaxOffset = 255; + break; + case AArch64::LDR_ZXI: + case AArch64::STR_ZXI: + Scale = Width = 16; + MinOffset = -256; + MaxOffset = 255; + break; case AArch64::ST2GOffset: case AArch64::STZ2GOffset: Scale = 16; @@ -2350,7 +2395,7 @@ static const MachineInstrBuilder &AddSubReg(const MachineInstrBuilder &MIB, if (!SubIdx) return MIB.addReg(Reg, State); - if (TargetRegisterInfo::isPhysicalRegister(Reg)) + if (Register::isPhysicalRegister(Reg)) return MIB.addReg(TRI->getSubReg(Reg, SubIdx), State); return MIB.addReg(Reg, State, SubIdx); } @@ -2474,6 +2519,27 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, return; } + // Copy a Predicate register by ORRing with itself. + if (AArch64::PPRRegClass.contains(DestReg) && + AArch64::PPRRegClass.contains(SrcReg)) { + assert(Subtarget.hasSVE() && "Unexpected SVE register."); + BuildMI(MBB, I, DL, get(AArch64::ORR_PPzPP), DestReg) + .addReg(SrcReg) // Pg + .addReg(SrcReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + return; + } + + // Copy a Z register by ORRing with itself. 
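The loop below peels the offset from the top: each iteration emits one ADD/SUB whose imm12 is optionally shifted left by 12, until nothing remains, and the rewritten emitFrameOffset drives it once for plain bytes and once per SVE component. A standalone sketch of just the arithmetic, assuming nothing beyond the ADDXri-family encoding named in the switch (the function name and output format are illustrative):

#include <algorithm>
#include <cstdint>
#include <cstdio>

// Print the ADD/SUB sequence the imm12 + optional 'lsl #12' encoding forces
// for a given byte offset; mirrors the MaxEncoding/ShiftSize case below.
void decomposeAddSub(int64_t Offset) {
  const uint64_t MaxEncoding = 0xfff, ShiftSize = 12;
  const uint64_t MaxEncodableValue = MaxEncoding << ShiftSize;
  const char *Mn = Offset < 0 ? "sub" : "add";
  uint64_t Rem = Offset < 0 ? -Offset : Offset;
  do {
    uint64_t ThisVal = std::min(Rem, MaxEncodableValue);
    unsigned LocalShift = 0;
    if (ThisVal > MaxEncoding) { // too wide for a bare imm12: use the shifted form
      ThisVal >>= ShiftSize;
      LocalShift = ShiftSize;
    }
    std::printf("%s dst, src, #0x%llx, lsl #%u\n", Mn,
                (unsigned long long)ThisVal, LocalShift);
    Rem -= ThisVal << LocalShift; // what the emitted instruction covered
  } while (Rem);
}

decomposeAddSub(0x123456), for example, prints "add dst, src, #0x123, lsl #12" then "add dst, src, #0x456, lsl #0" — the same two instructions the helper would build.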
+ if (AArch64::ZPRRegClass.contains(DestReg) && + AArch64::ZPRRegClass.contains(SrcReg)) { + assert(Subtarget.hasSVE() && "Unexpected SVE register."); + BuildMI(MBB, I, DL, get(AArch64::ORR_ZZZ), DestReg) + .addReg(SrcReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + return; + } + if (AArch64::GPR64spRegClass.contains(DestReg) && (AArch64::GPR64spRegClass.contains(SrcReg) || SrcReg == AArch64::XZR)) { if (DestReg == AArch64::SP || SrcReg == AArch64::SP) { @@ -2722,7 +2788,7 @@ static void storeRegPairToStackSlot(const TargetRegisterInfo &TRI, MachineMemOperand *MMO) { unsigned SrcReg0 = SrcReg; unsigned SrcReg1 = SrcReg; - if (TargetRegisterInfo::isPhysicalRegister(SrcReg)) { + if (Register::isPhysicalRegister(SrcReg)) { SrcReg0 = TRI.getSubReg(SrcReg, SubIdx0); SubIdx0 = 0; SrcReg1 = TRI.getSubReg(SrcReg, SubIdx1); @@ -2761,7 +2827,7 @@ void AArch64InstrInfo::storeRegToStackSlot( case 4: if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) { Opc = AArch64::STRWui; - if (TargetRegisterInfo::isVirtualRegister(SrcReg)) + if (Register::isVirtualRegister(SrcReg)) MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR32RegClass); else assert(SrcReg != AArch64::WSP); @@ -2771,7 +2837,7 @@ void AArch64InstrInfo::storeRegToStackSlot( case 8: if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) { Opc = AArch64::STRXui; - if (TargetRegisterInfo::isVirtualRegister(SrcReg)) + if (Register::isVirtualRegister(SrcReg)) MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass); else assert(SrcReg != AArch64::SP); @@ -2852,7 +2918,7 @@ static void loadRegPairFromStackSlot(const TargetRegisterInfo &TRI, unsigned DestReg0 = DestReg; unsigned DestReg1 = DestReg; bool IsUndef = true; - if (TargetRegisterInfo::isPhysicalRegister(DestReg)) { + if (Register::isPhysicalRegister(DestReg)) { DestReg0 = TRI.getSubReg(DestReg, SubIdx0); SubIdx0 = 0; DestReg1 = TRI.getSubReg(DestReg, SubIdx1); @@ -2892,7 +2958,7 @@ void AArch64InstrInfo::loadRegFromStackSlot( case 4: if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) { Opc = AArch64::LDRWui; - if (TargetRegisterInfo::isVirtualRegister(DestReg)) + if (Register::isVirtualRegister(DestReg)) MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR32RegClass); else assert(DestReg != AArch64::WSP); @@ -2902,7 +2968,7 @@ void AArch64InstrInfo::loadRegFromStackSlot( case 8: if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) { Opc = AArch64::LDRXui; - if (TargetRegisterInfo::isVirtualRegister(DestReg)) + if (Register::isVirtualRegister(DestReg)) MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR64RegClass); else assert(DestReg != AArch64::SP); @@ -2972,21 +3038,39 @@ void AArch64InstrInfo::loadRegFromStackSlot( MI.addMemOperand(MMO); } -void llvm::emitFrameOffset(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, const DebugLoc &DL, - unsigned DestReg, unsigned SrcReg, int Offset, - const TargetInstrInfo *TII, - MachineInstr::MIFlag Flag, bool SetNZCV, - bool NeedsWinCFI, bool *HasWinCFI) { - if (DestReg == SrcReg && Offset == 0) - return; - - assert((DestReg != AArch64::SP || Offset % 16 == 0) && - "SP increment/decrement not 16-byte aligned"); - - bool isSub = Offset < 0; - if (isSub) - Offset = -Offset; +// Helper function to emit a frame offset adjustment from a given +// pointer (SrcReg), stored into DestReg. This function is explicit +// in that it requires the opcode. 
+static void emitFrameOffsetAdj(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + const DebugLoc &DL, unsigned DestReg, + unsigned SrcReg, int64_t Offset, unsigned Opc, + const TargetInstrInfo *TII, + MachineInstr::MIFlag Flag, bool NeedsWinCFI, + bool *HasWinCFI) { + int Sign = 1; + unsigned MaxEncoding, ShiftSize; + switch (Opc) { + case AArch64::ADDXri: + case AArch64::ADDSXri: + case AArch64::SUBXri: + case AArch64::SUBSXri: + MaxEncoding = 0xfff; + ShiftSize = 12; + break; + case AArch64::ADDVL_XXI: + case AArch64::ADDPL_XXI: + MaxEncoding = 31; + ShiftSize = 0; + if (Offset < 0) { + MaxEncoding = 32; + Sign = -1; + Offset = -Offset; + } + break; + default: + llvm_unreachable("Unsupported opcode"); + } // FIXME: If the offset won't fit in 24-bits, compute the offset into a // scratch register. If DestReg is a virtual register, use it as the @@ -2999,65 +3083,94 @@ void llvm::emitFrameOffset(MachineBasicBlock &MBB, // of code. // assert(Offset < (1 << 24) && "unimplemented reg plus immediate"); - unsigned Opc; - if (SetNZCV) - Opc = isSub ? AArch64::SUBSXri : AArch64::ADDSXri; - else - Opc = isSub ? AArch64::SUBXri : AArch64::ADDXri; - const unsigned MaxEncoding = 0xfff; - const unsigned ShiftSize = 12; const unsigned MaxEncodableValue = MaxEncoding << ShiftSize; - while (((unsigned)Offset) >= (1 << ShiftSize)) { - unsigned ThisVal; - if (((unsigned)Offset) > MaxEncodableValue) { - ThisVal = MaxEncodableValue; - } else { - ThisVal = Offset & MaxEncodableValue; + do { + unsigned ThisVal = std::min<unsigned>(Offset, MaxEncodableValue); + unsigned LocalShiftSize = 0; + if (ThisVal > MaxEncoding) { + ThisVal = ThisVal >> ShiftSize; + LocalShiftSize = ShiftSize; } assert((ThisVal >> ShiftSize) <= MaxEncoding && "Encoding cannot handle value that big"); - BuildMI(MBB, MBBI, DL, TII->get(Opc), DestReg) - .addReg(SrcReg) - .addImm(ThisVal >> ShiftSize) - .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftSize)) - .setMIFlag(Flag); - - if (NeedsWinCFI && SrcReg == AArch64::SP && DestReg == AArch64::SP) { + auto MBI = BuildMI(MBB, MBBI, DL, TII->get(Opc), DestReg) + .addReg(SrcReg) + .addImm(Sign * (int)ThisVal); + if (ShiftSize) + MBI = MBI.addImm( + AArch64_AM::getShifterImm(AArch64_AM::LSL, LocalShiftSize)); + MBI = MBI.setMIFlag(Flag); + + if (NeedsWinCFI) { + assert(Sign == 1 && "SEH directives should always have a positive sign"); + int Imm = (int)(ThisVal << LocalShiftSize); + if ((DestReg == AArch64::FP && SrcReg == AArch64::SP) || + (SrcReg == AArch64::FP && DestReg == AArch64::SP)) { + if (HasWinCFI) + *HasWinCFI = true; + if (Imm == 0) + BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_SetFP)).setMIFlag(Flag); + else + BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_AddFP)) + .addImm(Imm) + .setMIFlag(Flag); + assert((Offset - Imm) == 0 && "Expected remaining offset to be zero to " + "emit a single SEH directive"); + } else if (DestReg == AArch64::SP) { + if (HasWinCFI) + *HasWinCFI = true; + assert(SrcReg == AArch64::SP && "Unexpected SrcReg for SEH_StackAlloc"); + BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc)) + .addImm(Imm) + .setMIFlag(Flag); + } if (HasWinCFI) *HasWinCFI = true; - BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc)) - .addImm(ThisVal) - .setMIFlag(Flag); } SrcReg = DestReg; - Offset -= ThisVal; - if (Offset == 0) - return; - } - BuildMI(MBB, MBBI, DL, TII->get(Opc), DestReg) - .addReg(SrcReg) - .addImm(Offset) - .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)) - .setMIFlag(Flag); + Offset -= ThisVal << LocalShiftSize; + } while 
(Offset); +} - if (NeedsWinCFI) { - if ((DestReg == AArch64::FP && SrcReg == AArch64::SP) || - (SrcReg == AArch64::FP && DestReg == AArch64::SP)) { - if (HasWinCFI) - *HasWinCFI = true; - if (Offset == 0) - BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_SetFP)). - setMIFlag(Flag); - else - BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_AddFP)). - addImm(Offset).setMIFlag(Flag); - } else if (DestReg == AArch64::SP) { - if (HasWinCFI) - *HasWinCFI = true; - BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc)). - addImm(Offset).setMIFlag(Flag); +void llvm::emitFrameOffset(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, const DebugLoc &DL, + unsigned DestReg, unsigned SrcReg, + StackOffset Offset, const TargetInstrInfo *TII, + MachineInstr::MIFlag Flag, bool SetNZCV, + bool NeedsWinCFI, bool *HasWinCFI) { + int64_t Bytes, NumPredicateVectors, NumDataVectors; + Offset.getForFrameOffset(Bytes, NumPredicateVectors, NumDataVectors); + + // First emit non-scalable frame offsets, or a simple 'mov'. + if (Bytes || (!Offset && SrcReg != DestReg)) { + assert((DestReg != AArch64::SP || Bytes % 16 == 0) && + "SP increment/decrement not 16-byte aligned"); + unsigned Opc = SetNZCV ? AArch64::ADDSXri : AArch64::ADDXri; + if (Bytes < 0) { + Bytes = -Bytes; + Opc = SetNZCV ? AArch64::SUBSXri : AArch64::SUBXri; } + emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, Bytes, Opc, TII, Flag, + NeedsWinCFI, HasWinCFI); + SrcReg = DestReg; + } + + assert(!(SetNZCV && (NumPredicateVectors || NumDataVectors)) && + "SetNZCV not supported with SVE vectors"); + assert(!(NeedsWinCFI && (NumPredicateVectors || NumDataVectors)) && + "WinCFI not supported with SVE vectors"); + + if (NumDataVectors) { + emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumDataVectors, + AArch64::ADDVL_XXI, TII, Flag, NeedsWinCFI, nullptr); + SrcReg = DestReg; + } + + if (NumPredicateVectors) { + assert(DestReg != AArch64::SP && "Unaligned access to SP"); + emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumPredicateVectors, + AArch64::ADDPL_XXI, TII, Flag, NeedsWinCFI, nullptr); } } @@ -3079,15 +3192,13 @@ MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl( // <rdar://problem/11522048> // if (MI.isFullCopy()) { - unsigned DstReg = MI.getOperand(0).getReg(); - unsigned SrcReg = MI.getOperand(1).getReg(); - if (SrcReg == AArch64::SP && - TargetRegisterInfo::isVirtualRegister(DstReg)) { + Register DstReg = MI.getOperand(0).getReg(); + Register SrcReg = MI.getOperand(1).getReg(); + if (SrcReg == AArch64::SP && Register::isVirtualRegister(DstReg)) { MF.getRegInfo().constrainRegClass(DstReg, &AArch64::GPR64RegClass); return nullptr; } - if (DstReg == AArch64::SP && - TargetRegisterInfo::isVirtualRegister(SrcReg)) { + if (DstReg == AArch64::SP && Register::isVirtualRegister(SrcReg)) { MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass); return nullptr; } @@ -3127,14 +3238,13 @@ MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl( MachineBasicBlock &MBB = *MI.getParent(); const MachineOperand &DstMO = MI.getOperand(0); const MachineOperand &SrcMO = MI.getOperand(1); - unsigned DstReg = DstMO.getReg(); - unsigned SrcReg = SrcMO.getReg(); + Register DstReg = DstMO.getReg(); + Register SrcReg = SrcMO.getReg(); // This is slightly expensive to compute for physical regs since // getMinimalPhysRegClass is slow. auto getRegClass = [&](unsigned Reg) { - return TargetRegisterInfo::isVirtualRegister(Reg) - ? MRI.getRegClass(Reg) - : TRI.getMinimalPhysRegClass(Reg); + return Register::isVirtualRegister(Reg) ? 
MRI.getRegClass(Reg) + : TRI.getMinimalPhysRegClass(Reg); }; if (DstMO.getSubReg() == 0 && SrcMO.getSubReg() == 0) { @@ -3159,8 +3269,7 @@ MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl( // // STRXui %xzr, %stack.0 // - if (IsSpill && DstMO.isUndef() && - TargetRegisterInfo::isPhysicalRegister(SrcReg)) { + if (IsSpill && DstMO.isUndef() && Register::isPhysicalRegister(SrcReg)) { assert(SrcMO.getSubReg() == 0 && "Unexpected subreg on physical register"); const TargetRegisterClass *SpillRC; @@ -3243,10 +3352,23 @@ MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl( return nullptr; } -int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI, int &Offset, +static bool isSVEScaledImmInstruction(unsigned Opcode) { + switch (Opcode) { + case AArch64::LDR_ZXI: + case AArch64::STR_ZXI: + case AArch64::LDR_PXI: + case AArch64::STR_PXI: + return true; + default: + return false; + } +} + +int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI, + StackOffset &SOffset, bool *OutUseUnscaledOp, unsigned *OutUnscaledOp, - int *EmittableOffset) { + int64_t *EmittableOffset) { // Set output values in case of early exit. if (EmittableOffset) *EmittableOffset = 0; @@ -3285,6 +3407,10 @@ int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI, int &Offset, llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal"); // Construct the complete offset. + bool IsMulVL = isSVEScaledImmInstruction(MI.getOpcode()); + int64_t Offset = + IsMulVL ? (SOffset.getScalableBytes()) : (SOffset.getBytes()); + const MachineOperand &ImmOpnd = MI.getOperand(AArch64InstrInfo::getLoadStoreImmIdx(MI.getOpcode())); Offset += ImmOpnd.getImm() * Scale; @@ -3304,7 +3430,7 @@ int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI, int &Offset, "Cannot have remainder when using unscaled op"); assert(MinOff < MaxOff && "Unexpected Min/Max offsets"); - int NewOffset = Offset / Scale; + int64_t NewOffset = Offset / Scale; if (MinOff <= NewOffset && NewOffset <= MaxOff) Offset = Remainder; else { @@ -3319,27 +3445,33 @@ int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI, int &Offset, if (OutUnscaledOp && UnscaledOp) *OutUnscaledOp = *UnscaledOp; + if (IsMulVL) + SOffset = StackOffset(Offset, MVT::nxv1i8) + + StackOffset(SOffset.getBytes(), MVT::i8); + else + SOffset = StackOffset(Offset, MVT::i8) + + StackOffset(SOffset.getScalableBytes(), MVT::nxv1i8); return AArch64FrameOffsetCanUpdate | - (Offset == 0 ? AArch64FrameOffsetIsLegal : 0); + (SOffset ? 
0 : AArch64FrameOffsetIsLegal); } bool llvm::rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, - unsigned FrameReg, int &Offset, + unsigned FrameReg, StackOffset &Offset, const AArch64InstrInfo *TII) { unsigned Opcode = MI.getOpcode(); unsigned ImmIdx = FrameRegIdx + 1; if (Opcode == AArch64::ADDSXri || Opcode == AArch64::ADDXri) { - Offset += MI.getOperand(ImmIdx).getImm(); + Offset += StackOffset(MI.getOperand(ImmIdx).getImm(), MVT::i8); emitFrameOffset(*MI.getParent(), MI, MI.getDebugLoc(), MI.getOperand(0).getReg(), FrameReg, Offset, TII, MachineInstr::NoFlags, (Opcode == AArch64::ADDSXri)); MI.eraseFromParent(); - Offset = 0; + Offset = StackOffset(); return true; } - int NewOffset; + int64_t NewOffset; unsigned UnscaledOp; bool UseUnscaledOp; int Status = isAArch64FrameOffsetLegal(MI, Offset, &UseUnscaledOp, @@ -3352,7 +3484,7 @@ bool llvm::rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, MI.setDesc(TII->get(UnscaledOp)); MI.getOperand(ImmIdx).ChangeToImmediate(NewOffset); - return Offset == 0; + return !Offset; } return false; @@ -3428,13 +3560,19 @@ static bool isCombineInstrCandidateFP(const MachineInstr &Inst) { switch (Inst.getOpcode()) { default: break; + case AArch64::FADDHrr: case AArch64::FADDSrr: case AArch64::FADDDrr: + case AArch64::FADDv4f16: + case AArch64::FADDv8f16: case AArch64::FADDv2f32: case AArch64::FADDv2f64: case AArch64::FADDv4f32: + case AArch64::FSUBHrr: case AArch64::FSUBSrr: case AArch64::FSUBDrr: + case AArch64::FSUBv4f16: + case AArch64::FSUBv8f16: case AArch64::FSUBv2f32: case AArch64::FSUBv2f64: case AArch64::FSUBv4f32: @@ -3459,7 +3597,7 @@ static bool canCombine(MachineBasicBlock &MBB, MachineOperand &MO, MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); MachineInstr *MI = nullptr; - if (MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg())) + if (MO.isReg() && Register::isVirtualRegister(MO.getReg())) MI = MRI.getUniqueVRegDef(MO.getReg()); // And it needs to be in the trace (otherwise, it won't have a depth). 
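// Example: with  %5:gpr32 = MADDWrrr %1, %2, $wzr  (a plain mul -- the addend
// is wzr) defined in this block and feeding  %6:gpr32 = ADDWrr %5, %3, the
// checks here accept %5, and the combiner later fuses the pair into a single
// MADDWrrr %1, %2, %3. (Illustrative virtual-register names, not from the patch.)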
if (!MI || MI->getParent() != &MBB || (unsigned)MI->getOpcode() != CombineOpc) @@ -3544,86 +3682,48 @@ static bool getMaddPatterns(MachineInstr &Root, Opc = NewOpc; } + auto setFound = [&](int Opcode, int Operand, unsigned ZeroReg, + MachineCombinerPattern Pattern) { + if (canCombineWithMUL(MBB, Root.getOperand(Operand), Opcode, ZeroReg)) { + Patterns.push_back(Pattern); + Found = true; + } + }; + + typedef MachineCombinerPattern MCP; + switch (Opc) { default: break; case AArch64::ADDWrr: assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() && "ADDWrr does not have register operands"); - if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr, - AArch64::WZR)) { - Patterns.push_back(MachineCombinerPattern::MULADDW_OP1); - Found = true; - } - if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDWrrr, - AArch64::WZR)) { - Patterns.push_back(MachineCombinerPattern::MULADDW_OP2); - Found = true; - } + setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDW_OP1); + setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULADDW_OP2); break; case AArch64::ADDXrr: - if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr, - AArch64::XZR)) { - Patterns.push_back(MachineCombinerPattern::MULADDX_OP1); - Found = true; - } - if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDXrrr, - AArch64::XZR)) { - Patterns.push_back(MachineCombinerPattern::MULADDX_OP2); - Found = true; - } + setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDX_OP1); + setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULADDX_OP2); break; case AArch64::SUBWrr: - if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr, - AArch64::WZR)) { - Patterns.push_back(MachineCombinerPattern::MULSUBW_OP1); - Found = true; - } - if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDWrrr, - AArch64::WZR)) { - Patterns.push_back(MachineCombinerPattern::MULSUBW_OP2); - Found = true; - } + setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBW_OP1); + setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULSUBW_OP2); break; case AArch64::SUBXrr: - if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr, - AArch64::XZR)) { - Patterns.push_back(MachineCombinerPattern::MULSUBX_OP1); - Found = true; - } - if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDXrrr, - AArch64::XZR)) { - Patterns.push_back(MachineCombinerPattern::MULSUBX_OP2); - Found = true; - } + setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBX_OP1); + setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULSUBX_OP2); break; case AArch64::ADDWri: - if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr, - AArch64::WZR)) { - Patterns.push_back(MachineCombinerPattern::MULADDWI_OP1); - Found = true; - } + setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDWI_OP1); break; case AArch64::ADDXri: - if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr, - AArch64::XZR)) { - Patterns.push_back(MachineCombinerPattern::MULADDXI_OP1); - Found = true; - } + setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDXI_OP1); break; case AArch64::SUBWri: - if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr, - AArch64::WZR)) { - Patterns.push_back(MachineCombinerPattern::MULSUBWI_OP1); - Found = true; - } + setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBWI_OP1); break; case AArch64::SUBXri: - if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr, - AArch64::XZR)) { - Patterns.push_back(MachineCombinerPattern::MULSUBXI_OP1); - Found = true; - } + 
setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBXI_OP1); break; } return Found; @@ -3640,204 +3740,135 @@ static bool getFMAPatterns(MachineInstr &Root, MachineBasicBlock &MBB = *Root.getParent(); bool Found = false; + auto Match = [&](int Opcode, int Operand, + MachineCombinerPattern Pattern) -> bool { + if (canCombineWithFMUL(MBB, Root.getOperand(Operand), Opcode)) { + Patterns.push_back(Pattern); + return true; + } + return false; + }; + + typedef MachineCombinerPattern MCP; + switch (Root.getOpcode()) { default: assert(false && "Unsupported FP instruction in combiner\n"); break; + case AArch64::FADDHrr: + assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() && + "FADDHrr does not have register operands"); + + Found = Match(AArch64::FMULHrr, 1, MCP::FMULADDH_OP1); + Found |= Match(AArch64::FMULHrr, 2, MCP::FMULADDH_OP2); + break; case AArch64::FADDSrr: assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() && - "FADDWrr does not have register operands"); - if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULSrr)) { - Patterns.push_back(MachineCombinerPattern::FMULADDS_OP1); - Found = true; - } else if (canCombineWithFMUL(MBB, Root.getOperand(1), - AArch64::FMULv1i32_indexed)) { - Patterns.push_back(MachineCombinerPattern::FMLAv1i32_indexed_OP1); - Found = true; - } - if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULSrr)) { - Patterns.push_back(MachineCombinerPattern::FMULADDS_OP2); - Found = true; - } else if (canCombineWithFMUL(MBB, Root.getOperand(2), - AArch64::FMULv1i32_indexed)) { - Patterns.push_back(MachineCombinerPattern::FMLAv1i32_indexed_OP2); - Found = true; - } + "FADDSrr does not have register operands"); + + Found |= Match(AArch64::FMULSrr, 1, MCP::FMULADDS_OP1) || + Match(AArch64::FMULv1i32_indexed, 1, MCP::FMLAv1i32_indexed_OP1); + + Found |= Match(AArch64::FMULSrr, 2, MCP::FMULADDS_OP2) || + Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLAv1i32_indexed_OP2); break; case AArch64::FADDDrr: - if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULDrr)) { - Patterns.push_back(MachineCombinerPattern::FMULADDD_OP1); - Found = true; - } else if (canCombineWithFMUL(MBB, Root.getOperand(1), - AArch64::FMULv1i64_indexed)) { - Patterns.push_back(MachineCombinerPattern::FMLAv1i64_indexed_OP1); - Found = true; - } - if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULDrr)) { - Patterns.push_back(MachineCombinerPattern::FMULADDD_OP2); - Found = true; - } else if (canCombineWithFMUL(MBB, Root.getOperand(2), - AArch64::FMULv1i64_indexed)) { - Patterns.push_back(MachineCombinerPattern::FMLAv1i64_indexed_OP2); - Found = true; - } + Found |= Match(AArch64::FMULDrr, 1, MCP::FMULADDD_OP1) || + Match(AArch64::FMULv1i64_indexed, 1, MCP::FMLAv1i64_indexed_OP1); + + Found |= Match(AArch64::FMULDrr, 2, MCP::FMULADDD_OP2) || + Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLAv1i64_indexed_OP2); + break; + case AArch64::FADDv4f16: + Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLAv4i16_indexed_OP1) || + Match(AArch64::FMULv4f16, 1, MCP::FMLAv4f16_OP1); + + Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLAv4i16_indexed_OP2) || + Match(AArch64::FMULv4f16, 2, MCP::FMLAv4f16_OP2); + break; + case AArch64::FADDv8f16: + Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLAv8i16_indexed_OP1) || + Match(AArch64::FMULv8f16, 1, MCP::FMLAv8f16_OP1); + + Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLAv8i16_indexed_OP2) || + Match(AArch64::FMULv8f16, 2, MCP::FMLAv8f16_OP2); break; case AArch64::FADDv2f32: - if 
(canCombineWithFMUL(MBB, Root.getOperand(1), - AArch64::FMULv2i32_indexed)) { - Patterns.push_back(MachineCombinerPattern::FMLAv2i32_indexed_OP1); - Found = true; - } else if (canCombineWithFMUL(MBB, Root.getOperand(1), - AArch64::FMULv2f32)) { - Patterns.push_back(MachineCombinerPattern::FMLAv2f32_OP1); - Found = true; - } - if (canCombineWithFMUL(MBB, Root.getOperand(2), - AArch64::FMULv2i32_indexed)) { - Patterns.push_back(MachineCombinerPattern::FMLAv2i32_indexed_OP2); - Found = true; - } else if (canCombineWithFMUL(MBB, Root.getOperand(2), - AArch64::FMULv2f32)) { - Patterns.push_back(MachineCombinerPattern::FMLAv2f32_OP2); - Found = true; - } + Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLAv2i32_indexed_OP1) || + Match(AArch64::FMULv2f32, 1, MCP::FMLAv2f32_OP1); + + Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLAv2i32_indexed_OP2) || + Match(AArch64::FMULv2f32, 2, MCP::FMLAv2f32_OP2); break; case AArch64::FADDv2f64: - if (canCombineWithFMUL(MBB, Root.getOperand(1), - AArch64::FMULv2i64_indexed)) { - Patterns.push_back(MachineCombinerPattern::FMLAv2i64_indexed_OP1); - Found = true; - } else if (canCombineWithFMUL(MBB, Root.getOperand(1), - AArch64::FMULv2f64)) { - Patterns.push_back(MachineCombinerPattern::FMLAv2f64_OP1); - Found = true; - } - if (canCombineWithFMUL(MBB, Root.getOperand(2), - AArch64::FMULv2i64_indexed)) { - Patterns.push_back(MachineCombinerPattern::FMLAv2i64_indexed_OP2); - Found = true; - } else if (canCombineWithFMUL(MBB, Root.getOperand(2), - AArch64::FMULv2f64)) { - Patterns.push_back(MachineCombinerPattern::FMLAv2f64_OP2); - Found = true; - } + Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLAv2i64_indexed_OP1) || + Match(AArch64::FMULv2f64, 1, MCP::FMLAv2f64_OP1); + + Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLAv2i64_indexed_OP2) || + Match(AArch64::FMULv2f64, 2, MCP::FMLAv2f64_OP2); break; case AArch64::FADDv4f32: - if (canCombineWithFMUL(MBB, Root.getOperand(1), - AArch64::FMULv4i32_indexed)) { - Patterns.push_back(MachineCombinerPattern::FMLAv4i32_indexed_OP1); - Found = true; - } else if (canCombineWithFMUL(MBB, Root.getOperand(1), - AArch64::FMULv4f32)) { - Patterns.push_back(MachineCombinerPattern::FMLAv4f32_OP1); - Found = true; - } - if (canCombineWithFMUL(MBB, Root.getOperand(2), - AArch64::FMULv4i32_indexed)) { - Patterns.push_back(MachineCombinerPattern::FMLAv4i32_indexed_OP2); - Found = true; - } else if (canCombineWithFMUL(MBB, Root.getOperand(2), - AArch64::FMULv4f32)) { - Patterns.push_back(MachineCombinerPattern::FMLAv4f32_OP2); - Found = true; - } - break; + Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLAv4i32_indexed_OP1) || + Match(AArch64::FMULv4f32, 1, MCP::FMLAv4f32_OP1); + Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLAv4i32_indexed_OP2) || + Match(AArch64::FMULv4f32, 2, MCP::FMLAv4f32_OP2); + break; + case AArch64::FSUBHrr: + Found = Match(AArch64::FMULHrr, 1, MCP::FMULSUBH_OP1); + Found |= Match(AArch64::FMULHrr, 2, MCP::FMULSUBH_OP2); + Found |= Match(AArch64::FNMULHrr, 1, MCP::FNMULSUBH_OP1); + break; case AArch64::FSUBSrr: - if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULSrr)) { - Patterns.push_back(MachineCombinerPattern::FMULSUBS_OP1); - Found = true; - } - if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULSrr)) { - Patterns.push_back(MachineCombinerPattern::FMULSUBS_OP2); - Found = true; - } else if (canCombineWithFMUL(MBB, Root.getOperand(2), - AArch64::FMULv1i32_indexed)) { - Patterns.push_back(MachineCombinerPattern::FMLSv1i32_indexed_OP2); - Found = 
true; - } - if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FNMULSrr)) { - Patterns.push_back(MachineCombinerPattern::FNMULSUBS_OP1); - Found = true; - } + Found = Match(AArch64::FMULSrr, 1, MCP::FMULSUBS_OP1); + + Found |= Match(AArch64::FMULSrr, 2, MCP::FMULSUBS_OP2) || + Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLSv1i32_indexed_OP2); + + Found |= Match(AArch64::FNMULSrr, 1, MCP::FNMULSUBS_OP1); break; case AArch64::FSUBDrr: - if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULDrr)) { - Patterns.push_back(MachineCombinerPattern::FMULSUBD_OP1); - Found = true; - } - if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULDrr)) { - Patterns.push_back(MachineCombinerPattern::FMULSUBD_OP2); - Found = true; - } else if (canCombineWithFMUL(MBB, Root.getOperand(2), - AArch64::FMULv1i64_indexed)) { - Patterns.push_back(MachineCombinerPattern::FMLSv1i64_indexed_OP2); - Found = true; - } - if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FNMULDrr)) { - Patterns.push_back(MachineCombinerPattern::FNMULSUBD_OP1); - Found = true; - } + Found = Match(AArch64::FMULDrr, 1, MCP::FMULSUBD_OP1); + + Found |= Match(AArch64::FMULDrr, 2, MCP::FMULSUBD_OP2) || + Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLSv1i64_indexed_OP2); + + Found |= Match(AArch64::FNMULDrr, 1, MCP::FNMULSUBD_OP1); + break; + case AArch64::FSUBv4f16: + Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLSv4i16_indexed_OP2) || + Match(AArch64::FMULv4f16, 2, MCP::FMLSv4f16_OP2); + + Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLSv4i16_indexed_OP1) || + Match(AArch64::FMULv4f16, 1, MCP::FMLSv4f16_OP1); + break; + case AArch64::FSUBv8f16: + Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLSv8i16_indexed_OP2) || + Match(AArch64::FMULv8f16, 2, MCP::FMLSv8f16_OP2); + + Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLSv8i16_indexed_OP1) || + Match(AArch64::FMULv8f16, 1, MCP::FMLSv8f16_OP1); break; case AArch64::FSUBv2f32: - if (canCombineWithFMUL(MBB, Root.getOperand(2), - AArch64::FMULv2i32_indexed)) { - Patterns.push_back(MachineCombinerPattern::FMLSv2i32_indexed_OP2); - Found = true; - } else if (canCombineWithFMUL(MBB, Root.getOperand(2), - AArch64::FMULv2f32)) { - Patterns.push_back(MachineCombinerPattern::FMLSv2f32_OP2); - Found = true; - } - if (canCombineWithFMUL(MBB, Root.getOperand(1), - AArch64::FMULv2i32_indexed)) { - Patterns.push_back(MachineCombinerPattern::FMLSv2i32_indexed_OP1); - Found = true; - } else if (canCombineWithFMUL(MBB, Root.getOperand(1), - AArch64::FMULv2f32)) { - Patterns.push_back(MachineCombinerPattern::FMLSv2f32_OP1); - Found = true; - } + Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLSv2i32_indexed_OP2) || + Match(AArch64::FMULv2f32, 2, MCP::FMLSv2f32_OP2); + + Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLSv2i32_indexed_OP1) || + Match(AArch64::FMULv2f32, 1, MCP::FMLSv2f32_OP1); break; case AArch64::FSUBv2f64: - if (canCombineWithFMUL(MBB, Root.getOperand(2), - AArch64::FMULv2i64_indexed)) { - Patterns.push_back(MachineCombinerPattern::FMLSv2i64_indexed_OP2); - Found = true; - } else if (canCombineWithFMUL(MBB, Root.getOperand(2), - AArch64::FMULv2f64)) { - Patterns.push_back(MachineCombinerPattern::FMLSv2f64_OP2); - Found = true; - } - if (canCombineWithFMUL(MBB, Root.getOperand(1), - AArch64::FMULv2i64_indexed)) { - Patterns.push_back(MachineCombinerPattern::FMLSv2i64_indexed_OP1); - Found = true; - } else if (canCombineWithFMUL(MBB, Root.getOperand(1), - AArch64::FMULv2f64)) { - Patterns.push_back(MachineCombinerPattern::FMLSv2f64_OP1); - 
Found = true; - } + Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLSv2i64_indexed_OP2) || + Match(AArch64::FMULv2f64, 2, MCP::FMLSv2f64_OP2); + + Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLSv2i64_indexed_OP1) || + Match(AArch64::FMULv2f64, 1, MCP::FMLSv2f64_OP1); break; case AArch64::FSUBv4f32: - if (canCombineWithFMUL(MBB, Root.getOperand(2), - AArch64::FMULv4i32_indexed)) { - Patterns.push_back(MachineCombinerPattern::FMLSv4i32_indexed_OP2); - Found = true; - } else if (canCombineWithFMUL(MBB, Root.getOperand(2), - AArch64::FMULv4f32)) { - Patterns.push_back(MachineCombinerPattern::FMLSv4f32_OP2); - Found = true; - } - if (canCombineWithFMUL(MBB, Root.getOperand(1), - AArch64::FMULv4i32_indexed)) { - Patterns.push_back(MachineCombinerPattern::FMLSv4i32_indexed_OP1); - Found = true; - } else if (canCombineWithFMUL(MBB, Root.getOperand(1), - AArch64::FMULv4f32)) { - Patterns.push_back(MachineCombinerPattern::FMLSv4f32_OP1); - Found = true; - } + Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLSv4i32_indexed_OP2) || + Match(AArch64::FMULv4f32, 2, MCP::FMLSv4f32_OP2); + + Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLSv4i32_indexed_OP1) || + Match(AArch64::FMULv4f32, 1, MCP::FMLSv4f32_OP1); break; } return Found; @@ -3851,6 +3882,10 @@ bool AArch64InstrInfo::isThroughputPattern( switch (Pattern) { default: break; + case MachineCombinerPattern::FMULADDH_OP1: + case MachineCombinerPattern::FMULADDH_OP2: + case MachineCombinerPattern::FMULSUBH_OP1: + case MachineCombinerPattern::FMULSUBH_OP2: case MachineCombinerPattern::FMULADDS_OP1: case MachineCombinerPattern::FMULADDS_OP2: case MachineCombinerPattern::FMULSUBS_OP1: @@ -3859,12 +3894,21 @@ bool AArch64InstrInfo::isThroughputPattern( case MachineCombinerPattern::FMULADDD_OP2: case MachineCombinerPattern::FMULSUBD_OP1: case MachineCombinerPattern::FMULSUBD_OP2: + case MachineCombinerPattern::FNMULSUBH_OP1: case MachineCombinerPattern::FNMULSUBS_OP1: case MachineCombinerPattern::FNMULSUBD_OP1: + case MachineCombinerPattern::FMLAv4i16_indexed_OP1: + case MachineCombinerPattern::FMLAv4i16_indexed_OP2: + case MachineCombinerPattern::FMLAv8i16_indexed_OP1: + case MachineCombinerPattern::FMLAv8i16_indexed_OP2: case MachineCombinerPattern::FMLAv1i32_indexed_OP1: case MachineCombinerPattern::FMLAv1i32_indexed_OP2: case MachineCombinerPattern::FMLAv1i64_indexed_OP1: case MachineCombinerPattern::FMLAv1i64_indexed_OP2: + case MachineCombinerPattern::FMLAv4f16_OP2: + case MachineCombinerPattern::FMLAv4f16_OP1: + case MachineCombinerPattern::FMLAv8f16_OP1: + case MachineCombinerPattern::FMLAv8f16_OP2: case MachineCombinerPattern::FMLAv2f32_OP2: case MachineCombinerPattern::FMLAv2f32_OP1: case MachineCombinerPattern::FMLAv2f64_OP1: @@ -3877,10 +3921,18 @@ bool AArch64InstrInfo::isThroughputPattern( case MachineCombinerPattern::FMLAv4f32_OP2: case MachineCombinerPattern::FMLAv4i32_indexed_OP1: case MachineCombinerPattern::FMLAv4i32_indexed_OP2: + case MachineCombinerPattern::FMLSv4i16_indexed_OP1: + case MachineCombinerPattern::FMLSv4i16_indexed_OP2: + case MachineCombinerPattern::FMLSv8i16_indexed_OP1: + case MachineCombinerPattern::FMLSv8i16_indexed_OP2: case MachineCombinerPattern::FMLSv1i32_indexed_OP2: case MachineCombinerPattern::FMLSv1i64_indexed_OP2: case MachineCombinerPattern::FMLSv2i32_indexed_OP2: case MachineCombinerPattern::FMLSv2i64_indexed_OP2: + case MachineCombinerPattern::FMLSv4f16_OP1: + case MachineCombinerPattern::FMLSv4f16_OP2: + case MachineCombinerPattern::FMLSv8f16_OP1: + case 
MachineCombinerPattern::FMLSv8f16_OP2: case MachineCombinerPattern::FMLSv2f32_OP2: case MachineCombinerPattern::FMLSv2f64_OP2: case MachineCombinerPattern::FMLSv4i32_indexed_OP2: @@ -3933,15 +3985,15 @@ genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI, SmallVectorImpl<MachineInstr *> &InsInstrs, unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC, FMAInstKind kind = FMAInstKind::Default, - const unsigned *ReplacedAddend = nullptr) { + const Register *ReplacedAddend = nullptr) { assert(IdxMulOpd == 1 || IdxMulOpd == 2); unsigned IdxOtherOpd = IdxMulOpd == 1 ? 2 : 1; MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg()); - unsigned ResultReg = Root.getOperand(0).getReg(); - unsigned SrcReg0 = MUL->getOperand(1).getReg(); + Register ResultReg = Root.getOperand(0).getReg(); + Register SrcReg0 = MUL->getOperand(1).getReg(); bool Src0IsKill = MUL->getOperand(1).isKill(); - unsigned SrcReg1 = MUL->getOperand(2).getReg(); + Register SrcReg1 = MUL->getOperand(2).getReg(); bool Src1IsKill = MUL->getOperand(2).isKill(); unsigned SrcReg2; @@ -3955,13 +4007,13 @@ genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI, Src2IsKill = Root.getOperand(IdxOtherOpd).isKill(); } - if (TargetRegisterInfo::isVirtualRegister(ResultReg)) + if (Register::isVirtualRegister(ResultReg)) MRI.constrainRegClass(ResultReg, RC); - if (TargetRegisterInfo::isVirtualRegister(SrcReg0)) + if (Register::isVirtualRegister(SrcReg0)) MRI.constrainRegClass(SrcReg0, RC); - if (TargetRegisterInfo::isVirtualRegister(SrcReg1)) + if (Register::isVirtualRegister(SrcReg1)) MRI.constrainRegClass(SrcReg1, RC); - if (TargetRegisterInfo::isVirtualRegister(SrcReg2)) + if (Register::isVirtualRegister(SrcReg2)) MRI.constrainRegClass(SrcReg2, RC); MachineInstrBuilder MIB; @@ -4015,19 +4067,19 @@ static MachineInstr *genMaddR(MachineFunction &MF, MachineRegisterInfo &MRI, assert(IdxMulOpd == 1 || IdxMulOpd == 2); MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg()); - unsigned ResultReg = Root.getOperand(0).getReg(); - unsigned SrcReg0 = MUL->getOperand(1).getReg(); + Register ResultReg = Root.getOperand(0).getReg(); + Register SrcReg0 = MUL->getOperand(1).getReg(); bool Src0IsKill = MUL->getOperand(1).isKill(); - unsigned SrcReg1 = MUL->getOperand(2).getReg(); + Register SrcReg1 = MUL->getOperand(2).getReg(); bool Src1IsKill = MUL->getOperand(2).isKill(); - if (TargetRegisterInfo::isVirtualRegister(ResultReg)) + if (Register::isVirtualRegister(ResultReg)) MRI.constrainRegClass(ResultReg, RC); - if (TargetRegisterInfo::isVirtualRegister(SrcReg0)) + if (Register::isVirtualRegister(SrcReg0)) MRI.constrainRegClass(SrcReg0, RC); - if (TargetRegisterInfo::isVirtualRegister(SrcReg1)) + if (Register::isVirtualRegister(SrcReg1)) MRI.constrainRegClass(SrcReg1, RC); - if (TargetRegisterInfo::isVirtualRegister(VR)) + if (Register::isVirtualRegister(VR)) MRI.constrainRegClass(VR, RC); MachineInstrBuilder MIB = @@ -4116,7 +4168,7 @@ void AArch64InstrInfo::genAlternativeCodeSequence( Opc = AArch64::MADDXrrr; RC = &AArch64::GPR64RegClass; } - unsigned NewVR = MRI.createVirtualRegister(OrrRC); + Register NewVR = MRI.createVirtualRegister(OrrRC); uint64_t Imm = Root.getOperand(2).getImm(); if (Root.getOperand(3).isImm()) { @@ -4158,7 +4210,7 @@ void AArch64InstrInfo::genAlternativeCodeSequence( Opc = AArch64::MADDXrrr; RC = &AArch64::GPR64RegClass; } - unsigned NewVR = MRI.createVirtualRegister(SubRC); + Register NewVR = MRI.createVirtualRegister(SubRC); // SUB NewVR, 0, C 
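// Editorial aside (not part of the patch): written out, the rewrite this
// step performs is
//   %i = MUL %a, %b              ; followed by %r = SUB %i, %c
//     ==>
//   %newvr = SUB %zero, %c       ; NewVR = -C (WZR/XZR minus C)
//   %r = MADD %a, %b, %newvr     ; R = A*B + (-C) = A*B - C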
MachineInstrBuilder MIB1 = BuildMI(MF, Root.getDebugLoc(), TII->get(SubOpc), NewVR) @@ -4208,7 +4260,7 @@ void AArch64InstrInfo::genAlternativeCodeSequence( Opc = AArch64::MADDXrrr; RC = &AArch64::GPR64RegClass; } - unsigned NewVR = MRI.createVirtualRegister(OrrRC); + Register NewVR = MRI.createVirtualRegister(OrrRC); uint64_t Imm = Root.getOperand(2).getImm(); if (Root.getOperand(3).isImm()) { unsigned Val = Root.getOperand(3).getImm(); @@ -4228,34 +4280,35 @@ void AArch64InstrInfo::genAlternativeCodeSequence( break; } // Floating Point Support + case MachineCombinerPattern::FMULADDH_OP1: + Opc = AArch64::FMADDHrrr; + RC = &AArch64::FPR16RegClass; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); + break; case MachineCombinerPattern::FMULADDS_OP1: + Opc = AArch64::FMADDSrrr; + RC = &AArch64::FPR32RegClass; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); + break; case MachineCombinerPattern::FMULADDD_OP1: - // MUL I=A,B,0 - // ADD R,I,C - // ==> MADD R,A,B,C - // --- Create(MADD); - if (Pattern == MachineCombinerPattern::FMULADDS_OP1) { - Opc = AArch64::FMADDSrrr; - RC = &AArch64::FPR32RegClass; - } else { - Opc = AArch64::FMADDDrrr; - RC = &AArch64::FPR64RegClass; - } + Opc = AArch64::FMADDDrrr; + RC = &AArch64::FPR64RegClass; MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); break; + + case MachineCombinerPattern::FMULADDH_OP2: + Opc = AArch64::FMADDHrrr; + RC = &AArch64::FPR16RegClass; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); + break; case MachineCombinerPattern::FMULADDS_OP2: + Opc = AArch64::FMADDSrrr; + RC = &AArch64::FPR32RegClass; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); + break; case MachineCombinerPattern::FMULADDD_OP2: - // FMUL I=A,B,0 - // FADD R,C,I - // ==> FMADD R,A,B,C - // --- Create(FMADD); - if (Pattern == MachineCombinerPattern::FMULADDS_OP2) { - Opc = AArch64::FMADDSrrr; - RC = &AArch64::FPR32RegClass; - } else { - Opc = AArch64::FMADDDrrr; - RC = &AArch64::FPR64RegClass; - } + Opc = AArch64::FMADDDrrr; + RC = &AArch64::FPR64RegClass; MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); break; @@ -4285,6 +4338,31 @@ void AArch64InstrInfo::genAlternativeCodeSequence( FMAInstKind::Indexed); break; + case MachineCombinerPattern::FMLAv4i16_indexed_OP1: + RC = &AArch64::FPR64RegClass; + Opc = AArch64::FMLAv4i16_indexed; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, + FMAInstKind::Indexed); + break; + case MachineCombinerPattern::FMLAv4f16_OP1: + RC = &AArch64::FPR64RegClass; + Opc = AArch64::FMLAv4f16; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, + FMAInstKind::Accumulator); + break; + case MachineCombinerPattern::FMLAv4i16_indexed_OP2: + RC = &AArch64::FPR64RegClass; + Opc = AArch64::FMLAv4i16_indexed; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Indexed); + break; + case MachineCombinerPattern::FMLAv4f16_OP2: + RC = &AArch64::FPR64RegClass; + Opc = AArch64::FMLAv4f16; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Accumulator); + break; + case MachineCombinerPattern::FMLAv2i32_indexed_OP1: case MachineCombinerPattern::FMLAv2f32_OP1: RC = &AArch64::FPR64RegClass; @@ -4312,6 +4390,31 @@ void AArch64InstrInfo::genAlternativeCodeSequence( } break; + case MachineCombinerPattern::FMLAv8i16_indexed_OP1: + RC = &AArch64::FPR128RegClass; + Opc = AArch64::FMLAv8i16_indexed; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, 
RC, + FMAInstKind::Indexed); + break; + case MachineCombinerPattern::FMLAv8f16_OP1: + RC = &AArch64::FPR128RegClass; + Opc = AArch64::FMLAv8f16; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, + FMAInstKind::Accumulator); + break; + case MachineCombinerPattern::FMLAv8i16_indexed_OP2: + RC = &AArch64::FPR128RegClass; + Opc = AArch64::FMLAv8i16_indexed; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Indexed); + break; + case MachineCombinerPattern::FMLAv8f16_OP2: + RC = &AArch64::FPR128RegClass; + Opc = AArch64::FMLAv8f16; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Accumulator); + break; + case MachineCombinerPattern::FMLAv2i64_indexed_OP1: case MachineCombinerPattern::FMLAv2f64_OP1: RC = &AArch64::FPR128RegClass; @@ -4367,56 +4470,53 @@ void AArch64InstrInfo::genAlternativeCodeSequence( } break; + case MachineCombinerPattern::FMULSUBH_OP1: + Opc = AArch64::FNMSUBHrrr; + RC = &AArch64::FPR16RegClass; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); + break; case MachineCombinerPattern::FMULSUBS_OP1: - case MachineCombinerPattern::FMULSUBD_OP1: { - // FMUL I=A,B,0 - // FSUB R,I,C - // ==> FNMSUB R,A,B,C // = -C + A*B - // --- Create(FNMSUB); - if (Pattern == MachineCombinerPattern::FMULSUBS_OP1) { - Opc = AArch64::FNMSUBSrrr; - RC = &AArch64::FPR32RegClass; - } else { - Opc = AArch64::FNMSUBDrrr; - RC = &AArch64::FPR64RegClass; - } + Opc = AArch64::FNMSUBSrrr; + RC = &AArch64::FPR32RegClass; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); + break; + case MachineCombinerPattern::FMULSUBD_OP1: + Opc = AArch64::FNMSUBDrrr; + RC = &AArch64::FPR64RegClass; MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); break; - } + case MachineCombinerPattern::FNMULSUBH_OP1: + Opc = AArch64::FNMADDHrrr; + RC = &AArch64::FPR16RegClass; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); + break; case MachineCombinerPattern::FNMULSUBS_OP1: - case MachineCombinerPattern::FNMULSUBD_OP1: { - // FNMUL I=A,B,0 - // FSUB R,I,C - // ==> FNMADD R,A,B,C // = -A*B - C - // --- Create(FNMADD); - if (Pattern == MachineCombinerPattern::FNMULSUBS_OP1) { - Opc = AArch64::FNMADDSrrr; - RC = &AArch64::FPR32RegClass; - } else { - Opc = AArch64::FNMADDDrrr; - RC = &AArch64::FPR64RegClass; - } + Opc = AArch64::FNMADDSrrr; + RC = &AArch64::FPR32RegClass; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); + break; + case MachineCombinerPattern::FNMULSUBD_OP1: + Opc = AArch64::FNMADDDrrr; + RC = &AArch64::FPR64RegClass; MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); break; - } + case MachineCombinerPattern::FMULSUBH_OP2: + Opc = AArch64::FMSUBHrrr; + RC = &AArch64::FPR16RegClass; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); + break; case MachineCombinerPattern::FMULSUBS_OP2: - case MachineCombinerPattern::FMULSUBD_OP2: { - // FMUL I=A,B,0 - // FSUB R,C,I - // ==> FMSUB R,A,B,C (computes C - A*B) - // --- Create(FMSUB); - if (Pattern == MachineCombinerPattern::FMULSUBS_OP2) { - Opc = AArch64::FMSUBSrrr; - RC = &AArch64::FPR32RegClass; - } else { - Opc = AArch64::FMSUBDrrr; - RC = &AArch64::FPR64RegClass; - } + Opc = AArch64::FMSUBSrrr; + RC = &AArch64::FPR32RegClass; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); + break; + case MachineCombinerPattern::FMULSUBD_OP2: + Opc = AArch64::FMSUBDrrr; + RC = &AArch64::FPR64RegClass; MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, 
RC); break; - } case MachineCombinerPattern::FMLSv1i32_indexed_OP2: Opc = AArch64::FMLSv1i32_indexed; @@ -4432,6 +4532,39 @@ void AArch64InstrInfo::genAlternativeCodeSequence( FMAInstKind::Indexed); break; + case MachineCombinerPattern::FMLSv4f16_OP1: + case MachineCombinerPattern::FMLSv4i16_indexed_OP1: { + RC = &AArch64::FPR64RegClass; + Register NewVR = MRI.createVirtualRegister(RC); + MachineInstrBuilder MIB1 = + BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv4f16), NewVR) + .add(Root.getOperand(2)); + InsInstrs.push_back(MIB1); + InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); + if (Pattern == MachineCombinerPattern::FMLSv4f16_OP1) { + Opc = AArch64::FMLAv4f16; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, + FMAInstKind::Accumulator, &NewVR); + } else { + Opc = AArch64::FMLAv4i16_indexed; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, + FMAInstKind::Indexed, &NewVR); + } + break; + } + case MachineCombinerPattern::FMLSv4f16_OP2: + RC = &AArch64::FPR64RegClass; + Opc = AArch64::FMLSv4f16; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Accumulator); + break; + case MachineCombinerPattern::FMLSv4i16_indexed_OP2: + RC = &AArch64::FPR64RegClass; + Opc = AArch64::FMLSv4i16_indexed; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Indexed); + break; + case MachineCombinerPattern::FMLSv2f32_OP2: case MachineCombinerPattern::FMLSv2i32_indexed_OP2: RC = &AArch64::FPR64RegClass; @@ -4446,6 +4579,39 @@ void AArch64InstrInfo::genAlternativeCodeSequence( } break; + case MachineCombinerPattern::FMLSv8f16_OP1: + case MachineCombinerPattern::FMLSv8i16_indexed_OP1: { + RC = &AArch64::FPR128RegClass; + Register NewVR = MRI.createVirtualRegister(RC); + MachineInstrBuilder MIB1 = + BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv8f16), NewVR) + .add(Root.getOperand(2)); + InsInstrs.push_back(MIB1); + InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); + if (Pattern == MachineCombinerPattern::FMLSv8f16_OP1) { + Opc = AArch64::FMLAv8f16; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, + FMAInstKind::Accumulator, &NewVR); + } else { + Opc = AArch64::FMLAv8i16_indexed; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, + FMAInstKind::Indexed, &NewVR); + } + break; + } + case MachineCombinerPattern::FMLSv8f16_OP2: + RC = &AArch64::FPR128RegClass; + Opc = AArch64::FMLSv8f16; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Accumulator); + break; + case MachineCombinerPattern::FMLSv8i16_indexed_OP2: + RC = &AArch64::FPR128RegClass; + Opc = AArch64::FMLSv8i16_indexed; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Indexed); + break; + case MachineCombinerPattern::FMLSv2f64_OP2: case MachineCombinerPattern::FMLSv2i64_indexed_OP2: RC = &AArch64::FPR128RegClass; @@ -4476,7 +4642,7 @@ void AArch64InstrInfo::genAlternativeCodeSequence( case MachineCombinerPattern::FMLSv2f32_OP1: case MachineCombinerPattern::FMLSv2i32_indexed_OP1: { RC = &AArch64::FPR64RegClass; - unsigned NewVR = MRI.createVirtualRegister(RC); + Register NewVR = MRI.createVirtualRegister(RC); MachineInstrBuilder MIB1 = BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv2f32), NewVR) .add(Root.getOperand(2)); @@ -4496,7 +4662,7 @@ void AArch64InstrInfo::genAlternativeCodeSequence( case MachineCombinerPattern::FMLSv4f32_OP1: case MachineCombinerPattern::FMLSv4i32_indexed_OP1: { RC = &AArch64::FPR128RegClass; - 
unsigned NewVR = MRI.createVirtualRegister(RC); + Register NewVR = MRI.createVirtualRegister(RC); MachineInstrBuilder MIB1 = BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv4f32), NewVR) .add(Root.getOperand(2)); @@ -4516,7 +4682,7 @@ void AArch64InstrInfo::genAlternativeCodeSequence( case MachineCombinerPattern::FMLSv2f64_OP1: case MachineCombinerPattern::FMLSv2i64_indexed_OP1: { RC = &AArch64::FPR128RegClass; - unsigned NewVR = MRI.createVirtualRegister(RC); + Register NewVR = MRI.createVirtualRegister(RC); MachineInstrBuilder MIB1 = BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv2f64), NewVR) .add(Root.getOperand(2)); @@ -4617,15 +4783,15 @@ bool AArch64InstrInfo::optimizeCondBranch(MachineInstr &MI) const { MachineBasicBlock *MBB = MI.getParent(); MachineFunction *MF = MBB->getParent(); MachineRegisterInfo *MRI = &MF->getRegInfo(); - unsigned VReg = MI.getOperand(0).getReg(); - if (!TargetRegisterInfo::isVirtualRegister(VReg)) + Register VReg = MI.getOperand(0).getReg(); + if (!Register::isVirtualRegister(VReg)) return false; MachineInstr *DefMI = MRI->getVRegDef(VReg); // Look through COPY instructions to find definition. while (DefMI->isCopy()) { - unsigned CopyVReg = DefMI->getOperand(1).getReg(); + Register CopyVReg = DefMI->getOperand(1).getReg(); if (!MRI->hasOneNonDBGUse(CopyVReg)) return false; if (!MRI->hasOneDef(CopyVReg)) @@ -4653,8 +4819,8 @@ bool AArch64InstrInfo::optimizeCondBranch(MachineInstr &MI) const { return false; MachineOperand &MO = DefMI->getOperand(1); - unsigned NewReg = MO.getReg(); - if (!TargetRegisterInfo::isVirtualRegister(NewReg)) + Register NewReg = MO.getReg(); + if (!Register::isVirtualRegister(NewReg)) return false; assert(!MRI->def_empty(NewReg) && "Register must be defined."); @@ -4737,9 +4903,13 @@ AArch64InstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const { static const std::pair<unsigned, const char *> TargetFlags[] = { {MO_COFFSTUB, "aarch64-coffstub"}, - {MO_GOT, "aarch64-got"}, {MO_NC, "aarch64-nc"}, - {MO_S, "aarch64-s"}, {MO_TLS, "aarch64-tls"}, - {MO_DLLIMPORT, "aarch64-dllimport"}}; + {MO_GOT, "aarch64-got"}, + {MO_NC, "aarch64-nc"}, + {MO_S, "aarch64-s"}, + {MO_TLS, "aarch64-tls"}, + {MO_DLLIMPORT, "aarch64-dllimport"}, + {MO_PREL, "aarch64-prel"}, + {MO_TAGGED, "aarch64-tagged"}}; return makeArrayRef(TargetFlags); } diff --git a/lib/Target/AArch64/AArch64InstrInfo.h b/lib/Target/AArch64/AArch64InstrInfo.h index 7be4daba7dc4..1688045e4fb8 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.h +++ b/lib/Target/AArch64/AArch64InstrInfo.h @@ -15,6 +15,7 @@ #include "AArch64.h" #include "AArch64RegisterInfo.h" +#include "AArch64StackOffset.h" #include "llvm/ADT/Optional.h" #include "llvm/CodeGen/MachineCombinerPattern.h" #include "llvm/CodeGen/TargetInstrInfo.h" @@ -55,8 +56,7 @@ public: bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, - const MachineInstr &MIb, - AliasAnalysis *AA = nullptr) const override; + const MachineInstr &MIb) const override; unsigned isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override; @@ -299,7 +299,7 @@ private: /// if necessary, to be replaced by the scavenger at the end of PEI. 
void emitFrameOffset(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, unsigned DestReg, unsigned SrcReg, - int Offset, const TargetInstrInfo *TII, + StackOffset Offset, const TargetInstrInfo *TII, MachineInstr::MIFlag = MachineInstr::NoFlags, bool SetNZCV = false, bool NeedsWinCFI = false, bool *HasWinCFI = nullptr); @@ -308,7 +308,7 @@ void emitFrameOffset(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, /// FP. Return false if the offset could not be handled directly in MI, and /// return the left-over portion by reference. bool rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, - unsigned FrameReg, int &Offset, + unsigned FrameReg, StackOffset &Offset, const AArch64InstrInfo *TII); /// Use to report the frame offset status in isAArch64FrameOffsetLegal. @@ -332,10 +332,10 @@ enum AArch64FrameOffsetStatus { /// If set, @p EmittableOffset contains the amount that can be set in @p MI /// (possibly with @p OutUnscaledOp if OutUseUnscaledOp is true) and that /// is a legal offset. -int isAArch64FrameOffsetLegal(const MachineInstr &MI, int &Offset, +int isAArch64FrameOffsetLegal(const MachineInstr &MI, StackOffset &Offset, bool *OutUseUnscaledOp = nullptr, unsigned *OutUnscaledOp = nullptr, - int *EmittableOffset = nullptr); + int64_t *EmittableOffset = nullptr); static inline bool isUncondBranchOpcode(int Opc) { return Opc == AArch64::B; } diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td index eed53f36d574..1981bd5d3bf0 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.td +++ b/lib/Target/AArch64/AArch64InstrInfo.td @@ -62,6 +62,9 @@ def HasAM : Predicate<"Subtarget->hasAM()">, def HasSEL2 : Predicate<"Subtarget->hasSEL2()">, AssemblerPredicate<"FeatureSEL2", "sel2">; +def HasPMU : Predicate<"Subtarget->hasPMU()">, + AssemblerPredicate<"FeaturePMU", "pmu">; + def HasTLB_RMI : Predicate<"Subtarget->hasTLB_RMI()">, AssemblerPredicate<"FeatureTLB_RMI", "tlb-rmi">; @@ -116,7 +119,7 @@ def HasSVE2SM4 : Predicate<"Subtarget->hasSVE2SM4()">, def HasSVE2SHA3 : Predicate<"Subtarget->hasSVE2SHA3()">, AssemblerPredicate<"FeatureSVE2SHA3", "sve2-sha3">; def HasSVE2BitPerm : Predicate<"Subtarget->hasSVE2BitPerm()">, - AssemblerPredicate<"FeatureSVE2BitPerm", "bitperm">; + AssemblerPredicate<"FeatureSVE2BitPerm", "sve2-bitperm">; def HasRCPC : Predicate<"Subtarget->hasRCPC()">, AssemblerPredicate<"FeatureRCPC", "rcpc">; def HasAltNZCV : Predicate<"Subtarget->hasAlternativeNZCV()">, @@ -133,6 +136,12 @@ def HasBTI : Predicate<"Subtarget->hasBTI()">, AssemblerPredicate<"FeatureBranchTargetId", "bti">; def HasMTE : Predicate<"Subtarget->hasMTE()">, AssemblerPredicate<"FeatureMTE", "mte">; +def HasTME : Predicate<"Subtarget->hasTME()">, + AssemblerPredicate<"FeatureTME", "tme">; +def HasETE : Predicate<"Subtarget->hasETE()">, + AssemblerPredicate<"FeatureETE", "ete">; +def HasTRBE : Predicate<"Subtarget->hasTRBE()">, + AssemblerPredicate<"FeatureTRBE", "trbe">; def IsLE : Predicate<"Subtarget->isLittleEndian()">; def IsBE : Predicate<"!Subtarget->isLittleEndian()">; def IsWindows : Predicate<"Subtarget->isTargetWindows()">; @@ -415,6 +424,14 @@ def AArch64stzg : SDNode<"AArch64ISD::STZG", SDT_AArch64SETTAG, [SDNPHasChain, S def AArch64st2g : SDNode<"AArch64ISD::ST2G", SDT_AArch64SETTAG, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; def AArch64stz2g : SDNode<"AArch64ISD::STZ2G", SDT_AArch64SETTAG, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; +def SDT_AArch64unpk : SDTypeProfile<1, 1, [ + SDTCisInt<0>, SDTCisInt<1>, 
SDTCisOpSmallerThanOp<1, 0> +]>; +def AArch64sunpkhi : SDNode<"AArch64ISD::SUNPKHI", SDT_AArch64unpk>; +def AArch64sunpklo : SDNode<"AArch64ISD::SUNPKLO", SDT_AArch64unpk>; +def AArch64uunpkhi : SDNode<"AArch64ISD::UUNPKHI", SDT_AArch64unpk>; +def AArch64uunpklo : SDNode<"AArch64ISD::UUNPKLO", SDT_AArch64unpk>; + //===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// @@ -431,6 +448,13 @@ let RecomputePerFunction = 1 in { def UseBTI : Predicate<[{ MF->getFunction().hasFnAttribute("branch-target-enforcement") }]>; def NotUseBTI : Predicate<[{ !MF->getFunction().hasFnAttribute("branch-target-enforcement") }]>; + + // Toggles patterns which aren't beneficial in GlobalISel when we aren't + // optimizing. This allows us to selectively use patterns without impacting + // SelectionDAG's behaviour. + // FIXME: One day there will probably be a nicer way to check for this, but + // today is not that day. + def OptimizedGISelOrOtherSelector : Predicate<"!MF->getFunction().hasOptNone() || MF->getProperties().hasProperty(MachineFunctionProperties::Property::FailedISel) || !MF->getProperties().hasProperty(MachineFunctionProperties::Property::Legalized)">; } include "AArch64InstrFormats.td" @@ -785,7 +809,11 @@ def MOVbaseTLS : Pseudo<(outs GPR64:$dst), (ins), let Uses = [ X9 ], Defs = [ X16, X17, LR, NZCV ] in { def HWASAN_CHECK_MEMACCESS : Pseudo< (outs), (ins GPR64noip:$ptr, i32imm:$accessinfo), - [(int_hwasan_check_memaccess X9, GPR64noip:$ptr, (i32 imm:$accessinfo))]>, + [(int_hwasan_check_memaccess X9, GPR64noip:$ptr, (i32 timm:$accessinfo))]>, + Sched<[]>; +def HWASAN_CHECK_MEMACCESS_SHORTGRANULES : Pseudo< + (outs), (ins GPR64noip:$ptr, i32imm:$accessinfo), + [(int_hwasan_check_memaccess_shortgranules X9, GPR64noip:$ptr, (i32 timm:$accessinfo))]>, Sched<[]>; } @@ -804,6 +832,23 @@ def : InstAlias<"sys $op1, $Cn, $Cm, $op2", (SYSxt imm0_7:$op1, sys_cr_op:$Cn, sys_cr_op:$Cm, imm0_7:$op2, XZR)>; + +let Predicates = [HasTME] in { + +def TSTART : TMSystemI<0b0000, "tstart", + [(set GPR64:$Rt, (int_aarch64_tstart))]>; + +def TCOMMIT : TMSystemINoOperand<0b0000, "tcommit", [(int_aarch64_tcommit)]>; + +def TCANCEL : TMSystemException<0b011, "tcancel", + [(int_aarch64_tcancel i64_imm0_65535:$imm)]>; + +def TTEST : TMSystemI<0b0001, "ttest", [(set GPR64:$Rt, (int_aarch64_ttest))]> { + let mayLoad = 0; + let mayStore = 0; +} +} // HasTME + //===----------------------------------------------------------------------===// // Move immediate instructions. //===----------------------------------------------------------------------===// @@ -815,37 +860,37 @@ let PostEncoderMethod = "fixMOVZ" in defm MOVZ : MoveImmediate<0b10, "movz">; // First group of aliases covers an implicit "lsl #0". 
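// Editorial example (not part of the patch): with the aliases below, plain
// "movz w0, #42" or "movk x1, #7" assembles exactly like the explicit
// "movz w0, #42, lsl #0" / "movk x1, #7, lsl #0" forms.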
-def : InstAlias<"movk $dst, $imm", (MOVKWi GPR32:$dst, imm0_65535:$imm, 0), 0>; -def : InstAlias<"movk $dst, $imm", (MOVKXi GPR64:$dst, imm0_65535:$imm, 0), 0>; -def : InstAlias<"movn $dst, $imm", (MOVNWi GPR32:$dst, imm0_65535:$imm, 0)>; -def : InstAlias<"movn $dst, $imm", (MOVNXi GPR64:$dst, imm0_65535:$imm, 0)>; -def : InstAlias<"movz $dst, $imm", (MOVZWi GPR32:$dst, imm0_65535:$imm, 0)>; -def : InstAlias<"movz $dst, $imm", (MOVZXi GPR64:$dst, imm0_65535:$imm, 0)>; +def : InstAlias<"movk $dst, $imm", (MOVKWi GPR32:$dst, i32_imm0_65535:$imm, 0), 0>; +def : InstAlias<"movk $dst, $imm", (MOVKXi GPR64:$dst, i32_imm0_65535:$imm, 0), 0>; +def : InstAlias<"movn $dst, $imm", (MOVNWi GPR32:$dst, i32_imm0_65535:$imm, 0)>; +def : InstAlias<"movn $dst, $imm", (MOVNXi GPR64:$dst, i32_imm0_65535:$imm, 0)>; +def : InstAlias<"movz $dst, $imm", (MOVZWi GPR32:$dst, i32_imm0_65535:$imm, 0)>; +def : InstAlias<"movz $dst, $imm", (MOVZXi GPR64:$dst, i32_imm0_65535:$imm, 0)>; // Next, we have various ELF relocations with the ":XYZ_g0:sym" syntax. -def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g3:$sym, 48)>; -def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g2:$sym, 32)>; -def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g1:$sym, 16)>; -def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g0:$sym, 0)>; +def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movw_symbol_g3:$sym, 48)>; +def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movw_symbol_g2:$sym, 32)>; +def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movw_symbol_g1:$sym, 16)>; +def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movw_symbol_g0:$sym, 0)>; -def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g3:$sym, 48)>; -def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g2:$sym, 32)>; -def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g1:$sym, 16)>; -def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g0:$sym, 0)>; +def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movw_symbol_g3:$sym, 48)>; +def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movw_symbol_g2:$sym, 32)>; +def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movw_symbol_g1:$sym, 16)>; +def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movw_symbol_g0:$sym, 0)>; -def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g3:$sym, 48), 0>; -def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g2:$sym, 32), 0>; -def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g1:$sym, 16), 0>; -def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g0:$sym, 0), 0>; +def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movw_symbol_g3:$sym, 48), 0>; +def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movw_symbol_g2:$sym, 32), 0>; +def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movw_symbol_g1:$sym, 16), 0>; +def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movw_symbol_g0:$sym, 0), 0>; -def : InstAlias<"movz $Rd, $sym", (MOVZWi GPR32:$Rd, movz_symbol_g1:$sym, 16)>; -def : InstAlias<"movz $Rd, $sym", (MOVZWi GPR32:$Rd, movz_symbol_g0:$sym, 0)>; +def : InstAlias<"movz $Rd, $sym", (MOVZWi GPR32:$Rd, movw_symbol_g1:$sym, 16)>; +def : InstAlias<"movz $Rd, $sym", (MOVZWi GPR32:$Rd, movw_symbol_g0:$sym, 0)>; -def : InstAlias<"movn $Rd, $sym", (MOVNWi GPR32:$Rd, movz_symbol_g1:$sym, 16)>; -def : InstAlias<"movn $Rd, $sym", (MOVNWi GPR32:$Rd, movz_symbol_g0:$sym, 0)>; +def : InstAlias<"movn $Rd, $sym", (MOVNWi GPR32:$Rd, 
movw_symbol_g1:$sym, 16)>; +def : InstAlias<"movn $Rd, $sym", (MOVNWi GPR32:$Rd, movw_symbol_g0:$sym, 0)>; -def : InstAlias<"movk $Rd, $sym", (MOVKWi GPR32:$Rd, movk_symbol_g1:$sym, 16), 0>; -def : InstAlias<"movk $Rd, $sym", (MOVKWi GPR32:$Rd, movk_symbol_g0:$sym, 0), 0>; +def : InstAlias<"movk $Rd, $sym", (MOVKWi GPR32:$Rd, movw_symbol_g1:$sym, 16), 0>; +def : InstAlias<"movk $Rd, $sym", (MOVKWi GPR32:$Rd, movw_symbol_g0:$sym, 0), 0>; // Final group of aliases covers true "mov $Rd, $imm" cases. multiclass movw_mov_alias<string basename,Instruction INST, RegisterClass GPR, @@ -917,8 +962,12 @@ def trunc_imm : SDNodeXForm<imm, [{ def gi_trunc_imm : GICustomOperandRenderer<"renderTruncImm">, GISDNodeXFormEquiv<trunc_imm>; +let Predicates = [OptimizedGISelOrOtherSelector] in { +// The SUBREG_TO_REG isn't eliminated at -O0, which can result in pointless +// copies. def : Pat<(i64 i64imm_32bit:$src), (SUBREG_TO_REG (i64 0), (MOVi32imm (trunc_imm imm:$src)), sub_32)>; +} // Materialize FP constants via MOVi32imm/MOVi64imm (MachO large code model). def bitcast_fpimm_to_i32 : SDNodeXForm<fpimm, [{ @@ -1012,10 +1061,10 @@ def : Pat<(sub GPR32:$Rn, arith_shifted_reg32:$Rm), def : Pat<(sub GPR64:$Rn, arith_shifted_reg64:$Rm), (SUBSXrs GPR64:$Rn, arith_shifted_reg64:$Rm)>; let AddedComplexity = 1 in { -def : Pat<(sub GPR32sp:$R2, arith_extended_reg32<i32>:$R3), - (SUBSWrx GPR32sp:$R2, arith_extended_reg32<i32>:$R3)>; -def : Pat<(sub GPR64sp:$R2, arith_extended_reg32to64<i64>:$R3), - (SUBSXrx GPR64sp:$R2, arith_extended_reg32to64<i64>:$R3)>; +def : Pat<(sub GPR32sp:$R2, arith_extended_reg32_i32:$R3), + (SUBSWrx GPR32sp:$R2, arith_extended_reg32_i32:$R3)>; +def : Pat<(sub GPR64sp:$R2, arith_extended_reg32to64_i64:$R3), + (SUBSXrx GPR64sp:$R2, arith_extended_reg32to64_i64:$R3)>; } // Because of the immediate format for add/sub-imm instructions, the @@ -2165,8 +2214,8 @@ def : InstAlias<"prfm $Rt, [$Rn]", (PRFMui prfop:$Rt, GPR64sp:$Rn, 0)>; def alignedglobal : PatLeaf<(iPTR iPTR:$label), [{ if (auto *G = dyn_cast<GlobalAddressSDNode>(N)) { const DataLayout &DL = MF->getDataLayout(); - unsigned Align = G->getGlobal()->getPointerAlignment(DL); - return Align >= 4 && G->getOffset() % 4 == 0; + MaybeAlign Align = G->getGlobal()->getPointerAlignment(DL); + return Align && *Align >= 4 && G->getOffset() % 4 == 0; } if (auto *C = dyn_cast<ConstantPoolSDNode>(N)) return C->getAlignment() >= 4 && C->getOffset() % 4 == 0; @@ -3281,20 +3330,37 @@ defm FNMSUB : ThreeOperandFPData<1, 1, "fnmsub", // N.b. FMSUB etc have the accumulator at the *end* of (outs), unlike // the NEON variant. + +// Here we handle first -(a + b*c) for FNMADD: + +let Predicates = [HasNEON, HasFullFP16] in +def : Pat<(f16 (fma (fneg FPR16:$Rn), FPR16:$Rm, FPR16:$Ra)), + (FMSUBHrrr FPR16:$Rn, FPR16:$Rm, FPR16:$Ra)>; + def : Pat<(f32 (fma (fneg FPR32:$Rn), FPR32:$Rm, FPR32:$Ra)), (FMSUBSrrr FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>; def : Pat<(f64 (fma (fneg FPR64:$Rn), FPR64:$Rm, FPR64:$Ra)), (FMSUBDrrr FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>; -// We handled -(a + b*c) for FNMADD above, now it's time for "(-a) + (-b)*c" and -// "(-a) + b*(-c)". 
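[Editorial aside, not part of the patch: the f16/f32/f64 patterns above and
below rely on simple sign identities of the fused multiply-add family. A
minimal standalone C++ check of those identities (names are ad hoc):

  #include <cassert>
  #include <cmath>

  // AArch64 scalar semantics expressed with std::fma:
  //   FMSUB  d = c - a*b    == fma(-a, b, c)
  //   FNMADD d = -(a*b) - c == fma(-a, b, -c)
  double fmsub(double a, double b, double c)  { return std::fma(-a, b, c); }
  double fnmadd(double a, double b, double c) { return std::fma(-a, b, -c); }

  int main() {
    double a = 2.0, b = 3.0, c = 5.0;
    assert(fmsub(a, b, c) == c - a * b);            // (fma (fneg a), b, c)
    assert(fnmadd(a, b, c) == -(a * b) - c);        // (fma (fneg a), b, (fneg c))
    assert(std::fma(a, -b, -c) == fnmadd(a, b, c)); // a*(-b) + (-c), same value
    return 0;
  }
]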
+// Now it's time for "(-a) + (-b)*c" + +let Predicates = [HasNEON, HasFullFP16] in +def : Pat<(f16 (fma (fneg FPR16:$Rn), FPR16:$Rm, (fneg FPR16:$Ra))), + (FNMADDHrrr FPR16:$Rn, FPR16:$Rm, FPR16:$Ra)>; + def : Pat<(f32 (fma (fneg FPR32:$Rn), FPR32:$Rm, (fneg FPR32:$Ra))), (FNMADDSrrr FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>; def : Pat<(f64 (fma (fneg FPR64:$Rn), FPR64:$Rm, (fneg FPR64:$Ra))), (FNMADDDrrr FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>; +// And here "(-a) + b*(-c)" + +let Predicates = [HasNEON, HasFullFP16] in +def : Pat<(f16 (fma FPR16:$Rn, (fneg FPR16:$Rm), (fneg FPR16:$Ra))), + (FNMADDHrrr FPR16:$Rn, FPR16:$Rm, FPR16:$Ra)>; + def : Pat<(f32 (fma FPR32:$Rn, (fneg FPR32:$Rm), (fneg FPR32:$Ra))), (FNMADDSrrr FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>; @@ -6939,5 +7005,124 @@ def : Pat<(AArch64tcret texternalsym:$dst, (i32 timm:$FPDiff)), def MOVMCSym : Pseudo<(outs GPR64:$dst), (ins i64imm:$sym), []>, Sched<[]>; def : Pat<(i64 (AArch64LocalRecover mcsym:$sym)), (MOVMCSym mcsym:$sym)>; +// Extracting lane zero is a special case where we can just use a plain +// EXTRACT_SUBREG instruction, which will become FMOV. This is easier for the +// rest of the compiler, especially the register allocator and copy propagation, +// to reason about, so is preferred when it's possible to use it. +let AddedComplexity = 10 in { + def : Pat<(i64 (extractelt (v2i64 V128:$V), (i64 0))), (EXTRACT_SUBREG V128:$V, dsub)>; + def : Pat<(i32 (extractelt (v4i32 V128:$V), (i64 0))), (EXTRACT_SUBREG V128:$V, ssub)>; + def : Pat<(i32 (extractelt (v2i32 V64:$V), (i64 0))), (EXTRACT_SUBREG V64:$V, ssub)>; +} + +// dot_v4i8 +class mul_v4i8<SDPatternOperator ldop> : + PatFrag<(ops node:$Rn, node:$Rm, node:$offset), + (mul (ldop (add node:$Rn, node:$offset)), + (ldop (add node:$Rm, node:$offset)))>; +class mulz_v4i8<SDPatternOperator ldop> : + PatFrag<(ops node:$Rn, node:$Rm), + (mul (ldop node:$Rn), (ldop node:$Rm))>; + +def load_v4i8 : + OutPatFrag<(ops node:$R), + (INSERT_SUBREG + (v2i32 (IMPLICIT_DEF)), + (i32 (COPY_TO_REGCLASS (LDRWui node:$R, (i64 0)), FPR32)), + ssub)>; + +class dot_v4i8<Instruction DOT, SDPatternOperator ldop> : + Pat<(i32 (add (mul_v4i8<ldop> GPR64sp:$Rn, GPR64sp:$Rm, (i64 3)), + (add (mul_v4i8<ldop> GPR64sp:$Rn, GPR64sp:$Rm, (i64 2)), + (add (mul_v4i8<ldop> GPR64sp:$Rn, GPR64sp:$Rm, (i64 1)), + (mulz_v4i8<ldop> GPR64sp:$Rn, GPR64sp:$Rm))))), + (EXTRACT_SUBREG (i64 (DOT (DUPv2i32gpr WZR), + (load_v4i8 GPR64sp:$Rn), + (load_v4i8 GPR64sp:$Rm))), + sub_32)>, Requires<[HasDotProd]>; + +// dot_v8i8 +class ee_v8i8<SDPatternOperator extend> : + PatFrag<(ops node:$V, node:$K), + (v4i16 (extract_subvector (v8i16 (extend node:$V)), node:$K))>; + +class mul_v8i8<SDPatternOperator mulop, SDPatternOperator extend> : + PatFrag<(ops node:$M, node:$N, node:$K), + (mulop (v4i16 (ee_v8i8<extend> node:$M, node:$K)), + (v4i16 (ee_v8i8<extend> node:$N, node:$K)))>; + +class idot_v8i8<SDPatternOperator mulop, SDPatternOperator extend> : + PatFrag<(ops node:$M, node:$N), + (i32 (extractelt + (v4i32 (AArch64uaddv + (add (mul_v8i8<mulop, extend> node:$M, node:$N, (i64 0)), + (mul_v8i8<mulop, extend> node:$M, node:$N, (i64 4))))), + (i64 0)))>; + +// vaddv_[su]32 is special; -> ADDP Vd.2S,Vn.2S,Vm.2S; return Vd.s[0];Vn==Vm +def VADDV_32 : OutPatFrag<(ops node:$R), (ADDPv2i32 node:$R, node:$R)>; + +class odot_v8i8<Instruction DOT> : + OutPatFrag<(ops node:$Vm, node:$Vn), + (EXTRACT_SUBREG + (VADDV_32 + (i64 (DOT (DUPv2i32gpr WZR), + (v8i8 node:$Vm), + (v8i8 node:$Vn)))), + sub_32)>; + +class dot_v8i8<Instruction DOT, SDPatternOperator 
mulop, + SDPatternOperator extend> : + Pat<(idot_v8i8<mulop, extend> V64:$Vm, V64:$Vn), + (odot_v8i8<DOT> V64:$Vm, V64:$Vn)>, + Requires<[HasDotProd]>; + +// dot_v16i8 +class ee_v16i8<SDPatternOperator extend> : + PatFrag<(ops node:$V, node:$K1, node:$K2), + (v4i16 (extract_subvector + (v8i16 (extend + (v8i8 (extract_subvector node:$V, node:$K1)))), node:$K2))>; + +class mul_v16i8<SDPatternOperator mulop, SDPatternOperator extend> : + PatFrag<(ops node:$M, node:$N, node:$K1, node:$K2), + (v4i32 + (mulop (v4i16 (ee_v16i8<extend> node:$M, node:$K1, node:$K2)), + (v4i16 (ee_v16i8<extend> node:$N, node:$K1, node:$K2))))>; + +class idot_v16i8<SDPatternOperator m, SDPatternOperator x> : + PatFrag<(ops node:$M, node:$N), + (i32 (extractelt + (v4i32 (AArch64uaddv + (add + (add (mul_v16i8<m, x> node:$M, node:$N, (i64 0), (i64 0)), + (mul_v16i8<m, x> node:$M, node:$N, (i64 8), (i64 0))), + (add (mul_v16i8<m, x> node:$M, node:$N, (i64 0), (i64 4)), + (mul_v16i8<m, x> node:$M, node:$N, (i64 8), (i64 4)))))), + (i64 0)))>; + +class odot_v16i8<Instruction DOT> : + OutPatFrag<(ops node:$Vm, node:$Vn), + (i32 (ADDVv4i32v + (DOT (DUPv4i32gpr WZR), node:$Vm, node:$Vn)))>; + +class dot_v16i8<Instruction DOT, SDPatternOperator mulop, + SDPatternOperator extend> : + Pat<(idot_v16i8<mulop, extend> V128:$Vm, V128:$Vn), + (odot_v16i8<DOT> V128:$Vm, V128:$Vn)>, + Requires<[HasDotProd]>; + +let AddedComplexity = 10 in { + def : dot_v4i8<SDOTv8i8, sextloadi8>; + def : dot_v4i8<UDOTv8i8, zextloadi8>; + def : dot_v8i8<SDOTv8i8, AArch64smull, sext>; + def : dot_v8i8<UDOTv8i8, AArch64umull, zext>; + def : dot_v16i8<SDOTv16i8, AArch64smull, sext>; + def : dot_v16i8<UDOTv16i8, AArch64umull, zext>; + + // FIXME: add patterns to generate vector by element dot product. + // FIXME: add SVE dot-product patterns. +} + include "AArch64InstrAtomics.td" include "AArch64SVEInstrInfo.td" diff --git a/lib/Target/AArch64/AArch64InstructionSelector.cpp b/lib/Target/AArch64/AArch64InstructionSelector.cpp index 4e13fb8e2027..961f38cad1e4 100644 --- a/lib/Target/AArch64/AArch64InstructionSelector.cpp +++ b/lib/Target/AArch64/AArch64InstructionSelector.cpp @@ -51,9 +51,19 @@ public: const AArch64Subtarget &STI, const AArch64RegisterBankInfo &RBI); - bool select(MachineInstr &I, CodeGenCoverage &CoverageInfo) const override; + bool select(MachineInstr &I) override; static const char *getName() { return DEBUG_TYPE; } + void setupMF(MachineFunction &MF, GISelKnownBits &KB, + CodeGenCoverage &CoverageInfo) override { + InstructionSelector::setupMF(MF, KB, CoverageInfo); + + // hasFnAttribute() is expensive to call on every BRCOND selection, so + // cache it here for each run of the selector. + ProduceNonFlagSettingCondBr = + !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening); + } + private: /// tblgen-erated 'select' implementation, used as the initial selector for /// the patterns that don't require complex C++. @@ -68,6 +78,10 @@ private: bool earlySelectSHL(MachineInstr &I, MachineRegisterInfo &MRI) const; + /// Eliminate same-sized cross-bank copies into stores before selectImpl(). 
+ void contractCrossBankCopyIntoStore(MachineInstr &I, + MachineRegisterInfo &MRI) const; + bool selectVaStartAAPCS(MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const; bool selectVaStartDarwin(MachineInstr &I, MachineFunction &MF, @@ -101,8 +115,6 @@ private: bool selectMergeValues(MachineInstr &I, MachineRegisterInfo &MRI) const; bool selectUnmergeValues(MachineInstr &I, MachineRegisterInfo &MRI) const; - void collectShuffleMaskIndices(MachineInstr &I, MachineRegisterInfo &MRI, - SmallVectorImpl<Optional<int>> &Idxs) const; bool selectShuffleVector(MachineInstr &I, MachineRegisterInfo &MRI) const; bool selectExtractElt(MachineInstr &I, MachineRegisterInfo &MRI) const; bool selectConcatVectors(MachineInstr &I, MachineRegisterInfo &MRI) const; @@ -116,6 +128,7 @@ private: bool selectIntrinsicRound(MachineInstr &I, MachineRegisterInfo &MRI) const; bool selectJumpTable(MachineInstr &I, MachineRegisterInfo &MRI) const; bool selectBrJT(MachineInstr &I, MachineRegisterInfo &MRI) const; + bool selectTLSGlobalValue(MachineInstr &I, MachineRegisterInfo &MRI) const; unsigned emitConstantPoolEntry(Constant *CPVal, MachineFunction &MF) const; MachineInstr *emitLoadFromConstantPool(Constant *CPVal, @@ -128,6 +141,8 @@ private: MachineInstr *emitIntegerCompare(MachineOperand &LHS, MachineOperand &RHS, MachineOperand &Predicate, MachineIRBuilder &MIRBuilder) const; + MachineInstr *emitADD(Register DefReg, MachineOperand &LHS, MachineOperand &RHS, + MachineIRBuilder &MIRBuilder) const; MachineInstr *emitCMN(MachineOperand &LHS, MachineOperand &RHS, MachineIRBuilder &MIRBuilder) const; MachineInstr *emitTST(const Register &LHS, const Register &RHS, @@ -155,7 +170,9 @@ private: ComplexRendererFns selectShiftA_64(const MachineOperand &Root) const; ComplexRendererFns selectShiftB_64(const MachineOperand &Root) const; + ComplexRendererFns select12BitValueWithLeftShift(uint64_t Immed) const; ComplexRendererFns selectArithImmed(MachineOperand &Root) const; + ComplexRendererFns selectNegArithImmed(MachineOperand &Root) const; ComplexRendererFns selectAddrModeUnscaled(MachineOperand &Root, unsigned Size) const; @@ -183,11 +200,48 @@ private: return selectAddrModeIndexed(Root, Width / 8); } + bool isWorthFoldingIntoExtendedReg(MachineInstr &MI, + const MachineRegisterInfo &MRI) const; + ComplexRendererFns + selectAddrModeShiftedExtendXReg(MachineOperand &Root, + unsigned SizeInBytes) const; + ComplexRendererFns selectAddrModeRegisterOffset(MachineOperand &Root) const; + ComplexRendererFns selectAddrModeXRO(MachineOperand &Root, + unsigned SizeInBytes) const; + template <int Width> + ComplexRendererFns selectAddrModeXRO(MachineOperand &Root) const { + return selectAddrModeXRO(Root, Width / 8); + } + + ComplexRendererFns selectShiftedRegister(MachineOperand &Root) const; + + ComplexRendererFns selectArithShiftedRegister(MachineOperand &Root) const { + return selectShiftedRegister(Root); + } + + ComplexRendererFns selectLogicalShiftedRegister(MachineOperand &Root) const { + // TODO: selectShiftedRegister should allow for rotates on logical shifts. + // For now, make them the same. The only difference between the two is that + // logical shifts are allowed to fold in rotates. Otherwise, these are + // functionally the same. + return selectShiftedRegister(Root); + } + + /// Instructions that accept extend modifiers like UXTW expect the register + /// being extended to be a GPR32. Narrow ExtReg to a 32-bit register using a + /// subregister copy if necessary. 
Return either ExtReg, or the result of the + /// new copy. + Register narrowExtendRegIfNeeded(Register ExtReg, + MachineIRBuilder &MIB) const; + ComplexRendererFns selectArithExtendedRegister(MachineOperand &Root) const; + void renderTruncImm(MachineInstrBuilder &MIB, const MachineInstr &MI) const; + void renderLogicalImm32(MachineInstrBuilder &MIB, const MachineInstr &I) const; + void renderLogicalImm64(MachineInstrBuilder &MIB, const MachineInstr &I) const; // Materialize a GlobalValue or BlockAddress using a movz+movk sequence. void materializeLargeCMVal(MachineInstr &I, const Value *V, - unsigned char OpFlags) const; + unsigned OpFlags) const; // Optimization methods. bool tryOptVectorShuffle(MachineInstr &I) const; @@ -197,12 +251,22 @@ private: MachineOperand &Predicate, MachineIRBuilder &MIRBuilder) const; + /// Return true if \p MI is a load or store of \p NumBytes bytes. + bool isLoadStoreOfNumBytes(const MachineInstr &MI, unsigned NumBytes) const; + + /// Returns true if \p MI is guaranteed to have the high-half of a 64-bit + /// register zeroed out. In other words, the result of MI has been explicitly + /// zero extended. + bool isDef32(const MachineInstr &MI) const; + const AArch64TargetMachine &TM; const AArch64Subtarget &STI; const AArch64InstrInfo &TII; const AArch64RegisterInfo &TRI; const AArch64RegisterBankInfo &RBI; + bool ProduceNonFlagSettingCondBr = false; + #define GET_GLOBALISEL_PREDICATES_DECL #include "AArch64GenGlobalISel.inc" #undef GET_GLOBALISEL_PREDICATES_DECL @@ -312,7 +376,7 @@ static bool getSubRegForClass(const TargetRegisterClass *RC, SubReg = AArch64::hsub; break; case 32: - if (RC == &AArch64::GPR32RegClass) + if (RC != &AArch64::FPR32RegClass) SubReg = AArch64::sub_32; else SubReg = AArch64::ssub; @@ -357,7 +421,7 @@ static bool unsupportedBinOp(const MachineInstr &I, // so, this will need to be taught about that, and we'll need to get the // bank out of the minimal class for the register. // Either way, this needs to be documented (and possibly verified). - if (!TargetRegisterInfo::isVirtualRegister(MO.getReg())) { + if (!Register::isVirtualRegister(MO.getReg())) { LLVM_DEBUG(dbgs() << "Generic inst has physical register operand\n"); return true; } @@ -492,8 +556,8 @@ static bool isValidCopy(const MachineInstr &I, const RegisterBank &DstBank, const MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI, const RegisterBankInfo &RBI) { - const unsigned DstReg = I.getOperand(0).getReg(); - const unsigned SrcReg = I.getOperand(1).getReg(); + const Register DstReg = I.getOperand(0).getReg(); + const Register SrcReg = I.getOperand(1).getReg(); const unsigned DstSize = RBI.getSizeInBits(DstReg, MRI, TRI); const unsigned SrcSize = RBI.getSizeInBits(SrcReg, MRI, TRI); @@ -502,7 +566,7 @@ static bool isValidCopy(const MachineInstr &I, const RegisterBank &DstBank, (DstSize == SrcSize || // Copies are a means to set up initial types, the number of // bits may not exactly match. - (TargetRegisterInfo::isPhysicalRegister(SrcReg) && DstSize <= SrcSize) || + (Register::isPhysicalRegister(SrcReg) && DstSize <= SrcSize) || // Copies are a means to copy bits around, as long as we are // on the same register class, that's fine. Otherwise, that // means we need some SUBREG_TO_REG or AND & co.
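Editorial sketch (assumed shape, illustrative only, not the patch's code) of
the narrowExtendRegIfNeeded helper declared above: a 32-bit register is
returned unchanged, while a 64-bit register is narrowed with a sub_32
subregister copy so that extend modifiers such as UXTW see a GPR32.

  Register narrowExtendRegIfNeeded(Register ExtReg, MachineIRBuilder &MIB) {
    MachineRegisterInfo &MRI = *MIB.getMRI();
    if (MRI.getType(ExtReg).getSizeInBits() == 32)
      return ExtReg;
    // %wN = COPY %xN.sub_32 -- take the low 32 bits of the wide register.
    auto Copy = MIB.buildInstr(TargetOpcode::COPY, {LLT::scalar(32)}, {})
                    .addReg(ExtReg, 0, AArch64::sub_32);
    return Copy.getReg(0);
  }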
@@ -526,7 +590,7 @@ static bool isValidCopy(const MachineInstr &I, const RegisterBank &DstBank, /// SubRegCopy (To class) = COPY CopyReg:SubReg /// Dst = COPY SubRegCopy static bool selectSubregisterCopy(MachineInstr &I, MachineRegisterInfo &MRI, - const RegisterBankInfo &RBI, unsigned SrcReg, + const RegisterBankInfo &RBI, Register SrcReg, const TargetRegisterClass *From, const TargetRegisterClass *To, unsigned SubReg) { @@ -539,7 +603,7 @@ static bool selectSubregisterCopy(MachineInstr &I, MachineRegisterInfo &MRI, // It's possible that the destination register won't be constrained. Make // sure that happens. - if (!TargetRegisterInfo::isPhysicalRegister(I.getOperand(0).getReg())) + if (!Register::isPhysicalRegister(I.getOperand(0).getReg())) RBI.constrainGenericRegister(I.getOperand(0).getReg(), *To, MRI); return true; @@ -553,8 +617,8 @@ static std::pair<const TargetRegisterClass *, const TargetRegisterClass *> getRegClassesForCopy(MachineInstr &I, const TargetInstrInfo &TII, MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI, const RegisterBankInfo &RBI) { - unsigned DstReg = I.getOperand(0).getReg(); - unsigned SrcReg = I.getOperand(1).getReg(); + Register DstReg = I.getOperand(0).getReg(); + Register SrcReg = I.getOperand(1).getReg(); const RegisterBank &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI); const RegisterBank &SrcRegBank = *RBI.getRegBank(SrcReg, MRI, TRI); unsigned DstSize = RBI.getSizeInBits(DstReg, MRI, TRI); @@ -579,8 +643,8 @@ static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII, MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI, const RegisterBankInfo &RBI) { - unsigned DstReg = I.getOperand(0).getReg(); - unsigned SrcReg = I.getOperand(1).getReg(); + Register DstReg = I.getOperand(0).getReg(); + Register SrcReg = I.getOperand(1).getReg(); const RegisterBank &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI); const RegisterBank &SrcRegBank = *RBI.getRegBank(SrcReg, MRI, TRI); @@ -607,11 +671,10 @@ static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII, // result. auto CheckCopy = [&]() { // If we have a bitcast or something, we can't have physical registers. - assert( - (I.isCopy() || - (!TargetRegisterInfo::isPhysicalRegister(I.getOperand(0).getReg()) && - !TargetRegisterInfo::isPhysicalRegister(I.getOperand(1).getReg()))) && - "No phys reg on generic operator!"); + assert((I.isCopy() || + (!Register::isPhysicalRegister(I.getOperand(0).getReg()) && + !Register::isPhysicalRegister(I.getOperand(1).getReg()))) && + "No phys reg on generic operator!"); assert(KnownValid || isValidCopy(I, DstRegBank, MRI, TRI, RBI)); (void)KnownValid; return true; @@ -626,38 +689,38 @@ static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII, return false; } - // Is this a cross-bank copy? - if (DstRegBank.getID() != SrcRegBank.getID()) { - // If we're doing a cross-bank copy on different-sized registers, we need - // to do a bit more work. - unsigned SrcSize = TRI.getRegSizeInBits(*SrcRC); - unsigned DstSize = TRI.getRegSizeInBits(*DstRC); - - if (SrcSize > DstSize) { - // We're doing a cross-bank copy into a smaller register. We need a - // subregister copy. First, get a register class that's on the same bank - // as the destination, but the same size as the source. - const TargetRegisterClass *SubregRC = - getMinClassForRegBank(DstRegBank, SrcSize, true); - assert(SubregRC && "Didn't get a register class for subreg?"); - - // Get the appropriate subregister for the destination. 
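// Editorial note (not part of the patch): per the getSubRegForClass change
// above, a 32-bit destination now selects sub_32 for every class except
// FPR32, which still selects ssub.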
- unsigned SubReg = 0; - if (!getSubRegForClass(DstRC, TRI, SubReg)) { - LLVM_DEBUG(dbgs() << "Couldn't determine subregister for copy.\n"); - return false; - } - - // Now, insert a subregister copy using the new register class. - selectSubregisterCopy(I, MRI, RBI, SrcReg, SubregRC, DstRC, SubReg); - return CheckCopy(); + unsigned SrcSize = TRI.getRegSizeInBits(*SrcRC); + unsigned DstSize = TRI.getRegSizeInBits(*DstRC); + + // If we're doing a cross-bank copy on different-sized registers, we need + // to do a bit more work. + if (SrcSize > DstSize) { + // We're doing a cross-bank copy into a smaller register. We need a + // subregister copy. First, get a register class that's on the same bank + // as the destination, but the same size as the source. + const TargetRegisterClass *SubregRC = + getMinClassForRegBank(DstRegBank, SrcSize, true); + assert(SubregRC && "Didn't get a register class for subreg?"); + + // Get the appropriate subregister for the destination. + unsigned SubReg = 0; + if (!getSubRegForClass(DstRC, TRI, SubReg)) { + LLVM_DEBUG(dbgs() << "Couldn't determine subregister for copy.\n"); + return false; } - else if (DstRegBank.getID() == AArch64::GPRRegBankID && DstSize == 32 && - SrcSize == 16) { + // Now, insert a subregister copy using the new register class. + selectSubregisterCopy(I, MRI, RBI, SrcReg, SubregRC, DstRC, SubReg); + return CheckCopy(); + } + + // Is this a cross-bank copy? + if (DstRegBank.getID() != SrcRegBank.getID()) { + if (DstRegBank.getID() == AArch64::GPRRegBankID && DstSize == 32 && + SrcSize == 16) { // Special case for FPR16 to GPR32. // FIXME: This can probably be generalized like the above case. - unsigned PromoteReg = + Register PromoteReg = MRI.createVirtualRegister(&AArch64::FPR32RegClass); BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::SUBREG_TO_REG), PromoteReg) @@ -674,7 +737,7 @@ static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII, // If the destination is a physical register, then there's nothing to // change, so we're done. 
- if (TargetRegisterInfo::isPhysicalRegister(DstReg)) + if (Register::isPhysicalRegister(DstReg)) return CheckCopy(); } @@ -955,7 +1018,9 @@ bool AArch64InstructionSelector::selectVectorSHL( return false; unsigned Opc = 0; - if (Ty == LLT::vector(4, 32)) { + if (Ty == LLT::vector(2, 64)) { + Opc = AArch64::USHLv2i64; + } else if (Ty == LLT::vector(4, 32)) { Opc = AArch64::USHLv4i32; } else if (Ty == LLT::vector(2, 32)) { Opc = AArch64::USHLv2i32; @@ -989,7 +1054,11 @@ bool AArch64InstructionSelector::selectVectorASHR( unsigned Opc = 0; unsigned NegOpc = 0; const TargetRegisterClass *RC = nullptr; - if (Ty == LLT::vector(4, 32)) { + if (Ty == LLT::vector(2, 64)) { + Opc = AArch64::SSHLv2i64; + NegOpc = AArch64::NEGv2i64; + RC = &AArch64::FPR128RegClass; + } else if (Ty == LLT::vector(4, 32)) { Opc = AArch64::SSHLv4i32; NegOpc = AArch64::NEGv4i32; RC = &AArch64::FPR128RegClass; @@ -1044,7 +1113,7 @@ bool AArch64InstructionSelector::selectVaStartDarwin( } void AArch64InstructionSelector::materializeLargeCMVal( - MachineInstr &I, const Value *V, unsigned char OpFlags) const { + MachineInstr &I, const Value *V, unsigned OpFlags) const { MachineBasicBlock &MBB = *I.getParent(); MachineFunction &MF = *MBB.getParent(); MachineRegisterInfo &MRI = MF.getRegInfo(); @@ -1097,8 +1166,8 @@ void AArch64InstructionSelector::preISelLower(MachineInstr &I) const { // some reason we receive input GMIR that has an s64 shift amount that's not // a G_CONSTANT, insert a truncate so that we can still select the s32 // register-register variant. - unsigned SrcReg = I.getOperand(1).getReg(); - unsigned ShiftReg = I.getOperand(2).getReg(); + Register SrcReg = I.getOperand(1).getReg(); + Register ShiftReg = I.getOperand(2).getReg(); const LLT ShiftTy = MRI.getType(ShiftReg); const LLT SrcTy = MRI.getType(SrcReg); if (SrcTy.isVector()) @@ -1118,6 +1187,9 @@ void AArch64InstructionSelector::preISelLower(MachineInstr &I) const { } return; } + case TargetOpcode::G_STORE: + contractCrossBankCopyIntoStore(I, MRI); + return; default: return; } @@ -1158,6 +1230,48 @@ bool AArch64InstructionSelector::earlySelectSHL( return constrainSelectedInstRegOperands(*NewI, TII, TRI, RBI); } +void AArch64InstructionSelector::contractCrossBankCopyIntoStore( + MachineInstr &I, MachineRegisterInfo &MRI) const { + assert(I.getOpcode() == TargetOpcode::G_STORE && "Expected G_STORE"); + // If we're storing a scalar, it doesn't matter what register bank that + // scalar is on. All that matters is the size. + // + // So, if we see something like this (with a 32-bit scalar as an example): + // + // %x:gpr(s32) = ... something ... + // %y:fpr(s32) = COPY %x:gpr(s32) + // G_STORE %y:fpr(s32) + // + // We can fix this up into something like this: + // + // G_STORE %x:gpr(s32) + // + // And then continue the selection process normally. + MachineInstr *Def = getDefIgnoringCopies(I.getOperand(0).getReg(), MRI); + if (!Def) + return; + Register DefDstReg = Def->getOperand(0).getReg(); + LLT DefDstTy = MRI.getType(DefDstReg); + Register StoreSrcReg = I.getOperand(0).getReg(); + LLT StoreSrcTy = MRI.getType(StoreSrcReg); + + // If we get something strange like a physical register, then we shouldn't + // go any further. + if (!DefDstTy.isValid()) + return; + + // Are the source and dst types the same size? + if (DefDstTy.getSizeInBits() != StoreSrcTy.getSizeInBits()) + return; + + if (RBI.getRegBank(StoreSrcReg, MRI, TRI) == + RBI.getRegBank(DefDstReg, MRI, TRI)) + return; + + // We have a cross-bank copy, which is entering a store. Let's fold it. 
+ I.getOperand(0).setReg(DefDstReg); +} + bool AArch64InstructionSelector::earlySelect(MachineInstr &I) const { assert(I.getParent() && "Instruction should be in a basic block!"); assert(I.getParent()->getParent() && "Instruction should be in a function!"); @@ -1169,13 +1283,37 @@ bool AArch64InstructionSelector::earlySelect(MachineInstr &I) const { switch (I.getOpcode()) { case TargetOpcode::G_SHL: return earlySelectSHL(I, MRI); + case TargetOpcode::G_CONSTANT: { + bool IsZero = false; + if (I.getOperand(1).isCImm()) + IsZero = I.getOperand(1).getCImm()->getZExtValue() == 0; + else if (I.getOperand(1).isImm()) + IsZero = I.getOperand(1).getImm() == 0; + + if (!IsZero) + return false; + + Register DefReg = I.getOperand(0).getReg(); + LLT Ty = MRI.getType(DefReg); + if (Ty != LLT::scalar(64) && Ty != LLT::scalar(32)) + return false; + + if (Ty == LLT::scalar(64)) { + I.getOperand(1).ChangeToRegister(AArch64::XZR, false); + RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass, MRI); + } else { + I.getOperand(1).ChangeToRegister(AArch64::WZR, false); + RBI.constrainGenericRegister(DefReg, AArch64::GPR32RegClass, MRI); + } + I.setDesc(TII.get(TargetOpcode::COPY)); + return true; + } default: return false; } } -bool AArch64InstructionSelector::select(MachineInstr &I, - CodeGenCoverage &CoverageInfo) const { +bool AArch64InstructionSelector::select(MachineInstr &I) { assert(I.getParent() && "Instruction should be in a basic block!"); assert(I.getParent()->getParent() && "Instruction should be in a function!"); @@ -1244,7 +1382,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I, if (earlySelect(I)) return true; - if (selectImpl(I, CoverageInfo)) + if (selectImpl(I, *CoverageInfo)) return true; LLT Ty = @@ -1439,14 +1577,43 @@ bool AArch64InstructionSelector::select(MachineInstr &I, return true; } case TargetOpcode::G_EXTRACT: { - LLT SrcTy = MRI.getType(I.getOperand(1).getReg()); - LLT DstTy = MRI.getType(I.getOperand(0).getReg()); + Register DstReg = I.getOperand(0).getReg(); + Register SrcReg = I.getOperand(1).getReg(); + LLT SrcTy = MRI.getType(SrcReg); + LLT DstTy = MRI.getType(DstReg); (void)DstTy; unsigned SrcSize = SrcTy.getSizeInBits(); - // Larger extracts are vectors, same-size extracts should be something else - // by now (either split up or simplified to a COPY). - if (SrcTy.getSizeInBits() > 64 || Ty.getSizeInBits() > 32) - return false; + + if (SrcTy.getSizeInBits() > 64) { + // This should be an extract of an s128, which is like a vector extract. + if (SrcTy.getSizeInBits() != 128) + return false; + // Only support extracting 64 bits from an s128 at the moment. + if (DstTy.getSizeInBits() != 64) + return false; + + const RegisterBank &SrcRB = *RBI.getRegBank(SrcReg, MRI, TRI); + const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI); + // Check we have the right regbank always. + assert(SrcRB.getID() == AArch64::FPRRegBankID && + DstRB.getID() == AArch64::FPRRegBankID && + "Wrong extract regbank!"); + (void)SrcRB; + + // Emit the same code as a vector extract. + // Offset must be a multiple of 64. + unsigned Offset = I.getOperand(2).getImm(); + if (Offset % 64 != 0) + return false; + unsigned LaneIdx = Offset / 64; + MachineIRBuilder MIB(I); + MachineInstr *Extract = emitExtractVectorElt( + DstReg, DstRB, LLT::scalar(64), SrcReg, LaneIdx, MIB); + if (!Extract) + return false; + I.eraseFromParent(); + return true; + } I.setDesc(TII.get(SrcSize == 64 ? 
AArch64::UBFMXri : AArch64::UBFMWri)); MachineInstrBuilder(MF, I).addImm(I.getOperand(2).getImm() + @@ -1458,7 +1625,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I, return constrainSelectedInstRegOperands(I, TII, TRI, RBI); } - Register DstReg = MRI.createGenericVirtualRegister(LLT::scalar(64)); + DstReg = MRI.createGenericVirtualRegister(LLT::scalar(64)); MIB.setInsertPt(MIB.getMBB(), std::next(I.getIterator())); MIB.buildInstr(TargetOpcode::COPY, {I.getOperand(0).getReg()}, {}) .addReg(DstReg, 0, AArch64::sub_32); @@ -1521,11 +1688,10 @@ bool AArch64InstructionSelector::select(MachineInstr &I, case TargetOpcode::G_GLOBAL_VALUE: { auto GV = I.getOperand(1).getGlobal(); - if (GV->isThreadLocal()) { - // FIXME: we don't support TLS yet. - return false; - } - unsigned char OpFlags = STI.ClassifyGlobalReference(GV, TM); + if (GV->isThreadLocal()) + return selectTLSGlobalValue(I, MRI); + + unsigned OpFlags = STI.ClassifyGlobalReference(GV, TM); if (OpFlags & AArch64II::MO_GOT) { I.setDesc(TII.get(AArch64::LOADgot)); I.getOperand(1).setTargetFlags(OpFlags); @@ -1562,8 +1728,15 @@ bool AArch64InstructionSelector::select(MachineInstr &I, } auto &MemOp = **I.memoperands_begin(); - if (MemOp.getOrdering() != AtomicOrdering::NotAtomic) { - LLVM_DEBUG(dbgs() << "Atomic load/store not supported yet\n"); + if (MemOp.isAtomic()) { + // For now we just support s8 acquire loads to be able to compile stack + // protector code. + if (MemOp.getOrdering() == AtomicOrdering::Acquire && + MemOp.getSize() == 1) { + I.setDesc(TII.get(AArch64::LDARB)); + return constrainSelectedInstRegOperands(I, TII, TRI, RBI); + } + LLVM_DEBUG(dbgs() << "Atomic load/store not fully supported yet\n"); return false; } unsigned MemSizeInBits = MemOp.getSize() * 8; @@ -1598,7 +1771,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I, const unsigned Size = MemSizeInBits / 8; const unsigned Scale = Log2_32(Size); if ((Imm & (Size - 1)) == 0 && Imm >= 0 && Imm < (0x1000 << Scale)) { - unsigned Ptr2Reg = PtrMI->getOperand(1).getReg(); + Register Ptr2Reg = PtrMI->getOperand(1).getReg(); I.getOperand(1).setReg(Ptr2Reg); PtrMI = MRI.getVRegDef(Ptr2Reg); Offset = Imm / Size; @@ -1688,8 +1861,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I, return selectVectorSHL(I, MRI); LLVM_FALLTHROUGH; case TargetOpcode::G_OR: - case TargetOpcode::G_LSHR: - case TargetOpcode::G_GEP: { + case TargetOpcode::G_LSHR: { // Reject the various things we don't support yet. if (unsupportedBinOp(I, RBI, MRI, TRI)) return false; @@ -1711,6 +1883,13 @@ bool AArch64InstructionSelector::select(MachineInstr &I, return constrainSelectedInstRegOperands(I, TII, TRI, RBI); } + case TargetOpcode::G_GEP: { + MachineIRBuilder MIRBuilder(I); + emitADD(I.getOperand(0).getReg(), I.getOperand(1), I.getOperand(2), + MIRBuilder); + I.eraseFromParent(); + return true; + } case TargetOpcode::G_UADDO: { // TODO: Support other types. 
unsigned OpSize = Ty.getSizeInBits(); @@ -1816,6 +1995,16 @@ bool AArch64InstructionSelector::select(MachineInstr &I, constrainSelectedInstRegOperands(I, TII, TRI, RBI); return true; } + + if (!SrcTy.isVector() && SrcTy.getSizeInBits() == 128) { + MachineIRBuilder MIB(I); + MachineInstr *Extract = emitExtractVectorElt( + DstReg, DstRB, LLT::scalar(DstTy.getSizeInBits()), SrcReg, 0, MIB); + if (!Extract) + return false; + I.eraseFromParent(); + return true; + } } return false; @@ -1868,21 +2057,41 @@ bool AArch64InstructionSelector::select(MachineInstr &I, case TargetOpcode::G_ZEXT: case TargetOpcode::G_SEXT: { unsigned Opcode = I.getOpcode(); - const LLT DstTy = MRI.getType(I.getOperand(0).getReg()), - SrcTy = MRI.getType(I.getOperand(1).getReg()); - const bool isSigned = Opcode == TargetOpcode::G_SEXT; + const bool IsSigned = Opcode == TargetOpcode::G_SEXT; const Register DefReg = I.getOperand(0).getReg(); const Register SrcReg = I.getOperand(1).getReg(); - const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI); + const LLT DstTy = MRI.getType(DefReg); + const LLT SrcTy = MRI.getType(SrcReg); + unsigned DstSize = DstTy.getSizeInBits(); + unsigned SrcSize = SrcTy.getSizeInBits(); - if (RB.getID() != AArch64::GPRRegBankID) { - LLVM_DEBUG(dbgs() << TII.getName(I.getOpcode()) << " on bank: " << RB - << ", expected: GPR\n"); - return false; - } + assert((*RBI.getRegBank(DefReg, MRI, TRI)).getID() == + AArch64::GPRRegBankID && + "Unexpected ext regbank"); + MachineIRBuilder MIB(I); MachineInstr *ExtI; - if (DstTy == LLT::scalar(64)) { + if (DstTy.isVector()) + return false; // Should be handled by imported patterns. + + // First check if we're extending the result of a load which has a dest type + // smaller than 32 bits, then this zext is redundant. GPR32 is the smallest + // GPR register on AArch64 and all loads which are smaller automatically + // zero-extend the upper bits. E.g. + // %v(s8) = G_LOAD %p, :: (load 1) + // %v2(s32) = G_ZEXT %v(s8) + if (!IsSigned) { + auto *LoadMI = getOpcodeDef(TargetOpcode::G_LOAD, SrcReg, MRI); + if (LoadMI && + RBI.getRegBank(SrcReg, MRI, TRI)->getID() == AArch64::GPRRegBankID) { + const MachineMemOperand *MemOp = *LoadMI->memoperands_begin(); + unsigned BytesLoaded = MemOp->getSize(); + if (BytesLoaded < 4 && SrcTy.getSizeInBytes() == BytesLoaded) + return selectCopy(I, TII, MRI, TRI, RBI); + } + } + + if (DstSize == 64) { // FIXME: Can we avoid manually doing this? if (!RBI.constrainGenericRegister(SrcReg, AArch64::GPR32RegClass, MRI)) { LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(Opcode) @@ -1890,33 +2099,26 @@ bool AArch64InstructionSelector::select(MachineInstr &I, return false; } - const Register SrcXReg = - MRI.createVirtualRegister(&AArch64::GPR64RegClass); - BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::SUBREG_TO_REG)) - .addDef(SrcXReg) - .addImm(0) - .addUse(SrcReg) - .addImm(AArch64::sub_32); - - const unsigned NewOpc = isSigned ? AArch64::SBFMXri : AArch64::UBFMXri; - ExtI = BuildMI(MBB, I, I.getDebugLoc(), TII.get(NewOpc)) - .addDef(DefReg) - .addUse(SrcXReg) - .addImm(0) - .addImm(SrcTy.getSizeInBits() - 1); - } else if (DstTy.isScalar() && DstTy.getSizeInBits() <= 32) { - const unsigned NewOpc = isSigned ? 
AArch64::SBFMWri : AArch64::UBFMWri; - ExtI = BuildMI(MBB, I, I.getDebugLoc(), TII.get(NewOpc)) - .addDef(DefReg) - .addUse(SrcReg) - .addImm(0) - .addImm(SrcTy.getSizeInBits() - 1); + auto SubregToReg = + MIB.buildInstr(AArch64::SUBREG_TO_REG, {&AArch64::GPR64RegClass}, {}) + .addImm(0) + .addUse(SrcReg) + .addImm(AArch64::sub_32); + + ExtI = MIB.buildInstr(IsSigned ? AArch64::SBFMXri : AArch64::UBFMXri, + {DefReg}, {SubregToReg}) + .addImm(0) + .addImm(SrcSize - 1); + } else if (DstSize <= 32) { + ExtI = MIB.buildInstr(IsSigned ? AArch64::SBFMWri : AArch64::UBFMWri, + {DefReg}, {SrcReg}) + .addImm(0) + .addImm(SrcSize - 1); } else { return false; } constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI); - I.eraseFromParent(); return true; } @@ -2163,6 +2365,37 @@ bool AArch64InstructionSelector::selectJumpTable( return constrainSelectedInstRegOperands(*MovMI, TII, TRI, RBI); } +bool AArch64InstructionSelector::selectTLSGlobalValue( + MachineInstr &I, MachineRegisterInfo &MRI) const { + if (!STI.isTargetMachO()) + return false; + MachineFunction &MF = *I.getParent()->getParent(); + MF.getFrameInfo().setAdjustsStack(true); + + const GlobalValue &GV = *I.getOperand(1).getGlobal(); + MachineIRBuilder MIB(I); + + MIB.buildInstr(AArch64::LOADgot, {AArch64::X0}, {}) + .addGlobalAddress(&GV, 0, AArch64II::MO_TLS); + + auto Load = MIB.buildInstr(AArch64::LDRXui, {&AArch64::GPR64commonRegClass}, + {Register(AArch64::X0)}) + .addImm(0); + + // TLS calls preserve all registers except those that absolutely must be + // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be + // silly). + MIB.buildInstr(AArch64::BLR, {}, {Load}) + .addDef(AArch64::X0, RegState::Implicit) + .addRegMask(TRI.getTLSCallPreservedMask()); + + MIB.buildCopy(I.getOperand(0).getReg(), Register(AArch64::X0)); + RBI.constrainGenericRegister(I.getOperand(0).getReg(), AArch64::GPR64RegClass, + MRI); + I.eraseFromParent(); + return true; +} + bool AArch64InstructionSelector::selectIntrinsicTrunc( MachineInstr &I, MachineRegisterInfo &MRI) const { const LLT SrcTy = MRI.getType(I.getOperand(0).getReg()); @@ -2478,16 +2711,40 @@ bool AArch64InstructionSelector::selectMergeValues( const LLT DstTy = MRI.getType(I.getOperand(0).getReg()); const LLT SrcTy = MRI.getType(I.getOperand(1).getReg()); assert(!DstTy.isVector() && !SrcTy.isVector() && "invalid merge operation"); + const RegisterBank &RB = *RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI); - // At the moment we only support merging two s32s into an s64. if (I.getNumOperands() != 3) return false; - if (DstTy.getSizeInBits() != 64 || SrcTy.getSizeInBits() != 32) - return false; - const RegisterBank &RB = *RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI); + + // Merging 2 s64s into an s128. 
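As a hedged illustration (not part of the upstream patch), the scalar-s128 branch below aims to produce roughly the following; register names and the exact INS opcode are assumed for the example:

    // Before selection:
    //   %dst:fpr(s128) = G_MERGE_VALUES %lo(s64), %hi(s64)
    // After selection: an undef 128-bit register plus two lane inserts
    // (emitLaneInsert chooses an INSvi64 variant based on the source bank):
    //   %tmp = IMPLICIT_DEF
    //   %ins = INSvi64gpr %tmp, 0, %lo
    //   %dst = INSvi64gpr %ins, 1, %hi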
+ if (DstTy == LLT::scalar(128)) { + if (SrcTy.getSizeInBits() != 64) + return false; + MachineIRBuilder MIB(I); + Register DstReg = I.getOperand(0).getReg(); + Register Src1Reg = I.getOperand(1).getReg(); + Register Src2Reg = I.getOperand(2).getReg(); + auto Tmp = MIB.buildInstr(TargetOpcode::IMPLICIT_DEF, {DstTy}, {}); + MachineInstr *InsMI = + emitLaneInsert(None, Tmp.getReg(0), Src1Reg, /* LaneIdx */ 0, RB, MIB); + if (!InsMI) + return false; + MachineInstr *Ins2MI = emitLaneInsert(DstReg, InsMI->getOperand(0).getReg(), + Src2Reg, /* LaneIdx */ 1, RB, MIB); + if (!Ins2MI) + return false; + constrainSelectedInstRegOperands(*InsMI, TII, TRI, RBI); + constrainSelectedInstRegOperands(*Ins2MI, TII, TRI, RBI); + I.eraseFromParent(); + return true; + } + if (RB.getID() != AArch64::GPRRegBankID) return false; + if (DstTy.getSizeInBits() != 64 || SrcTy.getSizeInBits() != 32) + return false; + auto *DstRC = &AArch64::GPR64RegClass; Register SubToRegDef = MRI.createVirtualRegister(DstRC); MachineInstr &SubRegMI = *BuildMI(*I.getParent(), I, I.getDebugLoc(), @@ -2695,7 +2952,8 @@ bool AArch64InstructionSelector::selectUnmergeValues( const LLT NarrowTy = MRI.getType(I.getOperand(0).getReg()); const LLT WideTy = MRI.getType(SrcReg); (void)WideTy; - assert(WideTy.isVector() && "can only unmerge from vector types!"); + assert((WideTy.isVector() || WideTy.getSizeInBits() == 128) && + "can only unmerge from vector or s128 types!"); assert(WideTy.getSizeInBits() > NarrowTy.getSizeInBits() && "source register size too small!"); @@ -2802,29 +3060,6 @@ bool AArch64InstructionSelector::selectConcatVectors( return true; } -void AArch64InstructionSelector::collectShuffleMaskIndices( - MachineInstr &I, MachineRegisterInfo &MRI, - SmallVectorImpl<Optional<int>> &Idxs) const { - MachineInstr *MaskDef = MRI.getVRegDef(I.getOperand(3).getReg()); - assert( - MaskDef->getOpcode() == TargetOpcode::G_BUILD_VECTOR && - "G_SHUFFLE_VECTOR should have a constant mask operand as G_BUILD_VECTOR"); - // Find the constant indices. - for (unsigned i = 1, e = MaskDef->getNumOperands(); i < e; ++i) { - // Look through copies. - MachineInstr *ScalarDef = - getDefIgnoringCopies(MaskDef->getOperand(i).getReg(), MRI); - assert(ScalarDef && "Could not find vreg def of shufflevec index op"); - if (ScalarDef->getOpcode() != TargetOpcode::G_CONSTANT) { - // This be an undef if not a constant. - assert(ScalarDef->getOpcode() == TargetOpcode::G_IMPLICIT_DEF); - Idxs.push_back(None); - } else { - Idxs.push_back(ScalarDef->getOperand(1).getCImm()->getSExtValue()); - } - } -} - unsigned AArch64InstructionSelector::emitConstantPoolEntry(Constant *CPVal, MachineFunction &MF) const { @@ -2906,6 +3141,31 @@ getInsertVecEltOpInfo(const RegisterBank &RB, unsigned EltSize) { } MachineInstr * +AArch64InstructionSelector::emitADD(Register DefReg, MachineOperand &LHS, + MachineOperand &RHS, + MachineIRBuilder &MIRBuilder) const { + assert(LHS.isReg() && RHS.isReg() && "Expected LHS and RHS to be registers!"); + MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo(); + static const unsigned OpcTable[2][2]{{AArch64::ADDXrr, AArch64::ADDXri}, + {AArch64::ADDWrr, AArch64::ADDWri}}; + bool Is32Bit = MRI.getType(LHS.getReg()).getSizeInBits() == 32; + auto ImmFns = selectArithImmed(RHS); + unsigned Opc = OpcTable[Is32Bit][ImmFns.hasValue()]; + auto AddMI = MIRBuilder.buildInstr(Opc, {DefReg}, {LHS.getReg()}); + + // If we matched a valid constant immediate, add those operands. 
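For readability, a sketch of how the OpcTable above is indexed (an illustrative reading, not taken from the patch):

    // Opc = OpcTable[Is32Bit][ImmFns.hasValue()]:
    //   [0][0] ADDXrr   [0][1] ADDXri
    //   [1][0] ADDWrr   [1][1] ADDWri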
+ if (ImmFns) { + for (auto &RenderFn : *ImmFns) + RenderFn(AddMI); + } else { + AddMI.addUse(RHS.getReg()); + } + + constrainSelectedInstRegOperands(*AddMI, TII, TRI, RBI); + return &*AddMI; +} + +MachineInstr * AArch64InstructionSelector::emitCMN(MachineOperand &LHS, MachineOperand &RHS, MachineIRBuilder &MIRBuilder) const { assert(LHS.isReg() && RHS.isReg() && "Expected LHS and RHS to be registers!"); @@ -3151,7 +3411,7 @@ bool AArch64InstructionSelector::tryOptSelect(MachineInstr &I) const { // Can't see past copies from physregs. if (Opc == TargetOpcode::COPY && - TargetRegisterInfo::isPhysicalRegister(CondDef->getOperand(1).getReg())) + Register::isPhysicalRegister(CondDef->getOperand(1).getReg())) return false; CondDef = MRI.getVRegDef(CondDef->getOperand(1).getReg()); @@ -3342,16 +3602,9 @@ bool AArch64InstructionSelector::tryOptVectorDup(MachineInstr &I) const { return false; // The shuffle's second operand doesn't matter if the mask is all zero. - auto *ZeroVec = getOpcodeDef(G_BUILD_VECTOR, I.getOperand(3).getReg(), MRI); - if (!ZeroVec) + const Constant *Mask = I.getOperand(3).getShuffleMask(); + if (!isa<ConstantAggregateZero>(Mask)) return false; - int64_t Zero = 0; - if (!mi_match(ZeroVec->getOperand(1).getReg(), MRI, m_ICst(Zero)) || Zero) - return false; - for (unsigned i = 1, e = ZeroVec->getNumOperands() - 1; i < e; ++i) { - if (ZeroVec->getOperand(i).getReg() != ZeroVec->getOperand(1).getReg()) - return false; // This wasn't an all zeros vector. - } // We're done, now find out what kind of splat we need. LLT VecTy = MRI.getType(I.getOperand(0).getReg()); @@ -3399,19 +3652,14 @@ bool AArch64InstructionSelector::selectShuffleVector( const LLT Src1Ty = MRI.getType(Src1Reg); Register Src2Reg = I.getOperand(2).getReg(); const LLT Src2Ty = MRI.getType(Src2Reg); + const Constant *ShuffleMask = I.getOperand(3).getShuffleMask(); MachineBasicBlock &MBB = *I.getParent(); MachineFunction &MF = *MBB.getParent(); LLVMContext &Ctx = MF.getFunction().getContext(); - // G_SHUFFLE_VECTOR doesn't really have a strictly enforced constant mask - // operand, it comes in as a normal vector value which we have to analyze to - // find the mask indices. If the mask element is undef, then - // collectShuffleMaskIndices() will add a None entry for that index into - // the list. - SmallVector<Optional<int>, 8> Mask; - collectShuffleMaskIndices(I, MRI, Mask); - assert(!Mask.empty() && "Expected to find mask indices"); + SmallVector<int, 8> Mask; + ShuffleVectorInst::getShuffleMask(ShuffleMask, Mask); // G_SHUFFLE_VECTOR is weird in that the source operands can be scalars, if // it's originated from a <1 x T> type. Those should have been lowered into @@ -3424,10 +3672,10 @@ bool AArch64InstructionSelector::selectShuffleVector( unsigned BytesPerElt = DstTy.getElementType().getSizeInBits() / 8; SmallVector<Constant *, 64> CstIdxs; - for (auto &MaybeVal : Mask) { + for (int Val : Mask) { // For now, any undef indexes we'll just assume to be 0. This should be // optimized in future, e.g. to select DUP etc. - int Val = MaybeVal.hasValue() ? *MaybeVal : 0; + Val = Val < 0 ? 0 : Val; for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) { unsigned Offset = Byte + Val * BytesPerElt; CstIdxs.emplace_back(ConstantInt::get(Type::getInt8Ty(Ctx), Offset)); @@ -3684,21 +3932,6 @@ static unsigned findIntrinsicID(MachineInstr &I) { return IntrinOp->getIntrinsicID(); } -/// Helper function to emit the correct opcode for a llvm.aarch64.stlxr -/// intrinsic. 
-static unsigned getStlxrOpcode(unsigned NumBytesToStore) { - switch (NumBytesToStore) { - // TODO: 1, 2, and 4 byte stores. - case 8: - return AArch64::STLXRX; - default: - LLVM_DEBUG(dbgs() << "Unexpected number of bytes to store! (" - << NumBytesToStore << ")\n"); - break; - } - return 0; -} - bool AArch64InstructionSelector::selectIntrinsicWithSideEffects( MachineInstr &I, MachineRegisterInfo &MRI) const { // Find the intrinsic ID. @@ -3719,32 +3952,6 @@ bool AArch64InstructionSelector::selectIntrinsicWithSideEffects( return false; MIRBuilder.buildInstr(AArch64::BRK, {}, {}).addImm(0xF000); break; - case Intrinsic::aarch64_stlxr: - Register StatReg = I.getOperand(0).getReg(); - assert(RBI.getSizeInBits(StatReg, MRI, TRI) == 32 && - "Status register must be 32 bits!"); - Register SrcReg = I.getOperand(2).getReg(); - - if (RBI.getSizeInBits(SrcReg, MRI, TRI) != 64) { - LLVM_DEBUG(dbgs() << "Only support 64-bit sources right now.\n"); - return false; - } - - Register PtrReg = I.getOperand(3).getReg(); - assert(MRI.getType(PtrReg).isPointer() && "Expected pointer operand"); - - // Expect only one memory operand. - if (!I.hasOneMemOperand()) - return false; - - const MachineMemOperand *MemOp = *I.memoperands_begin(); - unsigned NumBytesToStore = MemOp->getSize(); - unsigned Opc = getStlxrOpcode(NumBytesToStore); - if (!Opc) - return false; - - auto StoreMI = MIRBuilder.buildInstr(Opc, {StatReg}, {SrcReg, PtrReg}); - constrainSelectedInstRegOperands(*StoreMI, TII, TRI, RBI); } I.eraseFromParent(); @@ -3860,6 +4067,30 @@ AArch64InstructionSelector::selectShiftB_64(const MachineOperand &Root) const { return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}}; } +/// Helper to select an immediate value that can be represented as a 12-bit +/// value shifted left by either 0 or 12. If it is possible to do so, return +/// the immediate and shift value. If not, return None. +/// +/// Used by selectArithImmed and selectNegArithImmed. +InstructionSelector::ComplexRendererFns +AArch64InstructionSelector::select12BitValueWithLeftShift( + uint64_t Immed) const { + unsigned ShiftAmt; + if (Immed >> 12 == 0) { + ShiftAmt = 0; + } else if ((Immed & 0xfff) == 0 && Immed >> 24 == 0) { + ShiftAmt = 12; + Immed = Immed >> 12; + } else + return None; + + unsigned ShVal = AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt); + return {{ + [=](MachineInstrBuilder &MIB) { MIB.addImm(Immed); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(ShVal); }, + }}; +} + /// SelectArithImmed - Select an immediate value that can be represented as /// a 12-bit value shifted left by either 0 or 12. If so, return true with /// Val set to the 12-bit value and Shift set to the shifter operand. @@ -3873,22 +4104,229 @@ AArch64InstructionSelector::selectArithImmed(MachineOperand &Root) const { auto MaybeImmed = getImmedFromMO(Root); if (MaybeImmed == None) return None; + return select12BitValueWithLeftShift(*MaybeImmed); +} + +/// SelectNegArithImmed - As above, but negates the value before trying to +/// select it. +InstructionSelector::ComplexRendererFns +AArch64InstructionSelector::selectNegArithImmed(MachineOperand &Root) const { + // We need a register here, because we need to know if we have a 64 or 32 + // bit immediate. 
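The negation performed below is width-sensitive; here is a minimal self-contained C++ restatement of the same arithmetic, illustrative only and not the patch's code:

    #include <cstdint>
    #include <optional>

    // Negate a candidate immediate at the operand's width, then require the
    // result to fit the 24-bit "12-bit value optionally shifted by 12" form.
    std::optional<uint64_t> negateForArithImmed(uint64_t Immed, bool Is32Bit) {
      if (Immed == 0) // cmp/cmn #0 disagree on the C flag, so 0 is rejected.
        return std::nullopt;
      Immed = Is32Bit ? uint64_t(~uint32_t(Immed) + 1u) : ~Immed + 1ULL;
      if (Immed & 0xFFFFFFFFFF000000ULL)
        return std::nullopt;
      return Immed & 0xFFFFFFULL;
    }

For example, with Is32Bit set, an immediate of 0xFFFFFFF0 (-16) negates to 16, so a compare against -16 can be selected as a cmn with #16.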
+ if (!Root.isReg()) + return None; + auto MaybeImmed = getImmedFromMO(Root); + if (MaybeImmed == None) + return None; uint64_t Immed = *MaybeImmed; - unsigned ShiftAmt; - if (Immed >> 12 == 0) { - ShiftAmt = 0; - } else if ((Immed & 0xfff) == 0 && Immed >> 24 == 0) { - ShiftAmt = 12; - Immed = Immed >> 12; - } else + // This negation is almost always valid, but "cmp wN, #0" and "cmn wN, #0" + // have the opposite effect on the C flag, so this pattern mustn't match under + // those circumstances. + if (Immed == 0) return None; - unsigned ShVal = AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt); - return {{ - [=](MachineInstrBuilder &MIB) { MIB.addImm(Immed); }, - [=](MachineInstrBuilder &MIB) { MIB.addImm(ShVal); }, - }}; + // Check if we're dealing with a 32-bit type on the root or a 64-bit type on + // the root. + MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo(); + if (MRI.getType(Root.getReg()).getSizeInBits() == 32) + Immed = ~((uint32_t)Immed) + 1; + else + Immed = ~Immed + 1ULL; + + if (Immed & 0xFFFFFFFFFF000000ULL) + return None; + + Immed &= 0xFFFFFFULL; + return select12BitValueWithLeftShift(Immed); +} + +/// Return true if it is worth folding MI into an extended register. That is, +/// if it's safe to pull it into the addressing mode of a load or store as a +/// shift. +bool AArch64InstructionSelector::isWorthFoldingIntoExtendedReg( + MachineInstr &MI, const MachineRegisterInfo &MRI) const { + // Always fold if there is one use, or if we're optimizing for size. + Register DefReg = MI.getOperand(0).getReg(); + if (MRI.hasOneUse(DefReg) || + MI.getParent()->getParent()->getFunction().hasMinSize()) + return true; + + // It's better to avoid folding and recomputing shifts when we don't have a + // fastpath. + if (!STI.hasLSLFast()) + return false; + + // We have a fastpath, so folding a shift in and potentially computing it + // many times may be beneficial. Check if this is only used in memory ops. + // If it is, then we should fold. + return all_of(MRI.use_instructions(DefReg), + [](MachineInstr &Use) { return Use.mayLoadOrStore(); }); +} + +/// This is used for computing addresses like this: +/// +/// ldr x1, [x2, x3, lsl #3] +/// +/// Where x2 is the base register, and x3 is an offset register. The shift-left +/// is a constant value specific to this load instruction. That is, we'll never +/// see anything other than a 3 here (which corresponds to the size of the +/// element being loaded.) +InstructionSelector::ComplexRendererFns +AArch64InstructionSelector::selectAddrModeShiftedExtendXReg( + MachineOperand &Root, unsigned SizeInBytes) const { + if (!Root.isReg()) + return None; + MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo(); + + // Make sure that the memory op is a valid size. + int64_t LegalShiftVal = Log2_32(SizeInBytes); + if (LegalShiftVal == 0) + return None; + + // We want to find something like this: + // + // val = G_CONSTANT LegalShiftVal + // shift = G_SHL off_reg val + // ptr = G_GEP base_reg shift + // x = G_LOAD ptr + // + // And fold it into this addressing mode: + // + // ldr x, [base_reg, off_reg, lsl #LegalShiftVal] + + // Check if we can find the G_GEP. + MachineInstr *Gep = getOpcodeDef(TargetOpcode::G_GEP, Root.getReg(), MRI); + if (!Gep || !isWorthFoldingIntoExtendedReg(*Gep, MRI)) + return None; + + // Now, try to match an opcode which will match our specific offset. + // We want a G_SHL or a G_MUL. 
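The G_MUL form is accepted because a multiply by a power of two is the same shift; a small standalone C++ equivalent of the constant checks that follow (illustrative, not the patch's code):

    #include <cstdint>

    // A multiply folds into "lsl #LegalShiftVal" only when its constant is a
    // power of two whose log2 equals the scale baked into the load/store.
    bool mulFoldsToLegalShift(uint64_t ImmVal, uint64_t LegalShiftVal) {
      if (ImmVal == 0 || (ImmVal & (ImmVal - 1)) != 0)
        return false; // Not a power of two.
      uint64_t Shift = 0;
      while ((1ULL << Shift) < ImmVal)
        ++Shift; // Shift is now log2(ImmVal).
      return Shift <= 7 && Shift == LegalShiftVal;
    }

So a GEP that scales an index by 8, whether written as a G_SHL by 3 or a G_MUL by 8, can become "ldr x0, [xBase, xIdx, lsl #3]".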
+ MachineInstr *OffsetInst = getDefIgnoringCopies(Gep->getOperand(2).getReg(), MRI); + if (!OffsetInst) + return None; + + unsigned OffsetOpc = OffsetInst->getOpcode(); + if (OffsetOpc != TargetOpcode::G_SHL && OffsetOpc != TargetOpcode::G_MUL) + return None; + + if (!isWorthFoldingIntoExtendedReg(*OffsetInst, MRI)) + return None; + + // Now, try to find the specific G_CONSTANT. Start by assuming that the + // register we will offset is the LHS, and the register containing the + // constant is the RHS. + Register OffsetReg = OffsetInst->getOperand(1).getReg(); + Register ConstantReg = OffsetInst->getOperand(2).getReg(); + auto ValAndVReg = getConstantVRegValWithLookThrough(ConstantReg, MRI); + if (!ValAndVReg) { + // We didn't get a constant on the RHS. If the opcode is a shift, then + // we're done. + if (OffsetOpc == TargetOpcode::G_SHL) + return None; + + // If we have a G_MUL, we can use either register. Try looking at the RHS. + std::swap(OffsetReg, ConstantReg); + ValAndVReg = getConstantVRegValWithLookThrough(ConstantReg, MRI); + if (!ValAndVReg) + return None; + } + + // The value must fit into 3 bits, and must be positive. Make sure that is + // true. + int64_t ImmVal = ValAndVReg->Value; + + // Since we're going to pull this into a shift, the constant value must be + // a power of 2. If we got a multiply, then we need to check this. + if (OffsetOpc == TargetOpcode::G_MUL) { + if (!isPowerOf2_32(ImmVal)) + return None; + + // Got a power of 2. So, the amount we'll shift is the log base-2 of that. + ImmVal = Log2_32(ImmVal); + } + + if ((ImmVal & 0x7) != ImmVal) + return None; + + // We are only allowed to shift by LegalShiftVal. This shift value is built + // into the instruction, so we can't just use whatever we want. + if (ImmVal != LegalShiftVal) + return None; + + // We can use the LHS of the GEP as the base, and the LHS of the shift as an + // offset. Signify that we are shifting by setting the shift flag to 1. + return {{[=](MachineInstrBuilder &MIB) { + MIB.addUse(Gep->getOperand(1).getReg()); + }, + [=](MachineInstrBuilder &MIB) { MIB.addUse(OffsetReg); }, + [=](MachineInstrBuilder &MIB) { + // Need to add both immediates here to make sure that they are both + // added to the instruction. + MIB.addImm(0); + MIB.addImm(1); + }}}; +} + +/// This is used for computing addresses like this: +/// +/// ldr x1, [x2, x3] +/// +/// Where x2 is the base register, and x3 is an offset register. +/// +/// When possible (or profitable) to fold a G_GEP into the address calculation, +/// this will do so. Otherwise, it will return None. +InstructionSelector::ComplexRendererFns +AArch64InstructionSelector::selectAddrModeRegisterOffset( + MachineOperand &Root) const { + MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo(); + + // We need a GEP. + MachineInstr *Gep = MRI.getVRegDef(Root.getReg()); + if (!Gep || Gep->getOpcode() != TargetOpcode::G_GEP) + return None; + + // If this is used more than once, let's not bother folding. + // TODO: Check if they are memory ops. If they are, then we can still fold + // without having to recompute anything. + if (!MRI.hasOneUse(Gep->getOperand(0).getReg())) + return None; + + // Base is the GEP's LHS, offset is its RHS. + return {{[=](MachineInstrBuilder &MIB) { + MIB.addUse(Gep->getOperand(1).getReg()); + }, + [=](MachineInstrBuilder &MIB) { + MIB.addUse(Gep->getOperand(2).getReg()); + }, + [=](MachineInstrBuilder &MIB) { + // Need to add both immediates here to make sure that they are both + // added to the instruction. 
+ MIB.addImm(0); + MIB.addImm(0); + }}}; +} + +/// This is intended to be equivalent to selectAddrModeXRO in +/// AArch64ISelDAGtoDAG. It's used for selecting X register offset loads. +InstructionSelector::ComplexRendererFns +AArch64InstructionSelector::selectAddrModeXRO(MachineOperand &Root, + unsigned SizeInBytes) const { + MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo(); + + // If we have a constant offset, then we probably don't want to match a + // register offset. + if (isBaseWithConstantOffset(Root, MRI)) + return None; + + // Try to fold shifts into the addressing mode. + auto AddrModeFns = selectAddrModeShiftedExtendXReg(Root, SizeInBytes); + if (AddrModeFns) + return AddrModeFns; + + // If that doesn't work, see if it's possible to fold in registers from + // a GEP. + return selectAddrModeRegisterOffset(Root); } /// Select a "register plus unscaled signed 9-bit immediate" address. This @@ -3994,6 +4432,205 @@ AArch64InstructionSelector::selectAddrModeIndexed(MachineOperand &Root, }}; } +/// Given a shift instruction, return the correct shift type for that +/// instruction. +static AArch64_AM::ShiftExtendType getShiftTypeForInst(MachineInstr &MI) { + // TODO: Handle AArch64_AM::ROR + switch (MI.getOpcode()) { + default: + return AArch64_AM::InvalidShiftExtend; + case TargetOpcode::G_SHL: + return AArch64_AM::LSL; + case TargetOpcode::G_LSHR: + return AArch64_AM::LSR; + case TargetOpcode::G_ASHR: + return AArch64_AM::ASR; + } +} + +/// Select a "shifted register" operand. If the value is not shifted, set the +/// shift operand to a default value of "lsl 0". +/// +/// TODO: Allow shifted register to be rotated in logical instructions. +InstructionSelector::ComplexRendererFns +AArch64InstructionSelector::selectShiftedRegister(MachineOperand &Root) const { + if (!Root.isReg()) + return None; + MachineRegisterInfo &MRI = + Root.getParent()->getParent()->getParent()->getRegInfo(); + + // Check if the operand is defined by an instruction which corresponds to + // a ShiftExtendType. E.g. a G_SHL, G_LSHR, etc. + // + // TODO: Handle AArch64_AM::ROR for logical instructions. + MachineInstr *ShiftInst = MRI.getVRegDef(Root.getReg()); + if (!ShiftInst) + return None; + AArch64_AM::ShiftExtendType ShType = getShiftTypeForInst(*ShiftInst); + if (ShType == AArch64_AM::InvalidShiftExtend) + return None; + if (!isWorthFoldingIntoExtendedReg(*ShiftInst, MRI)) + return None; + + // Need an immediate on the RHS. + MachineOperand &ShiftRHS = ShiftInst->getOperand(2); + auto Immed = getImmedFromMO(ShiftRHS); + if (!Immed) + return None; + + // We have something that we can fold. Fold in the shift's LHS and RHS into + // the instruction. + MachineOperand &ShiftLHS = ShiftInst->getOperand(1); + Register ShiftReg = ShiftLHS.getReg(); + + unsigned NumBits = MRI.getType(ShiftReg).getSizeInBits(); + unsigned Val = *Immed & (NumBits - 1); + unsigned ShiftVal = AArch64_AM::getShifterImm(ShType, Val); + + return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(ShiftReg); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(ShiftVal); }}}; +} + +/// Get the correct ShiftExtendType for an extend instruction. +static AArch64_AM::ShiftExtendType +getExtendTypeForInst(MachineInstr &MI, MachineRegisterInfo &MRI) { + unsigned Opc = MI.getOpcode(); + + // Handle explicit extend instructions first. 
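For context, a hedged example of the operand form these extend helpers feed (the concrete registers are invented, not from the patch):

    // An arithmetic instruction with an extended-register operand folds the
    // extend (and an optional left shift of up to 4) into the operation:
    //   add x0, x1, w2, sxtw #2   // x0 = x1 + (sext32to64(w2) << 2)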
+ if (Opc == TargetOpcode::G_SEXT || Opc == TargetOpcode::G_SEXT_INREG) { + unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); + assert(Size != 64 && "Extend from 64 bits?"); + switch (Size) { + case 8: + return AArch64_AM::SXTB; + case 16: + return AArch64_AM::SXTH; + case 32: + return AArch64_AM::SXTW; + default: + return AArch64_AM::InvalidShiftExtend; + } + } + + if (Opc == TargetOpcode::G_ZEXT || Opc == TargetOpcode::G_ANYEXT) { + unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); + assert(Size != 64 && "Extend from 64 bits?"); + switch (Size) { + case 8: + return AArch64_AM::UXTB; + case 16: + return AArch64_AM::UXTH; + case 32: + return AArch64_AM::UXTW; + default: + return AArch64_AM::InvalidShiftExtend; + } + } + + // Don't have an explicit extend. Try to handle a G_AND with a constant mask + // on the RHS. + if (Opc != TargetOpcode::G_AND) + return AArch64_AM::InvalidShiftExtend; + + Optional<uint64_t> MaybeAndMask = getImmedFromMO(MI.getOperand(2)); + if (!MaybeAndMask) + return AArch64_AM::InvalidShiftExtend; + uint64_t AndMask = *MaybeAndMask; + switch (AndMask) { + default: + return AArch64_AM::InvalidShiftExtend; + case 0xFF: + return AArch64_AM::UXTB; + case 0xFFFF: + return AArch64_AM::UXTH; + case 0xFFFFFFFF: + return AArch64_AM::UXTW; + } +} + +Register AArch64InstructionSelector::narrowExtendRegIfNeeded( + Register ExtReg, MachineIRBuilder &MIB) const { + MachineRegisterInfo &MRI = *MIB.getMRI(); + if (MRI.getType(ExtReg).getSizeInBits() == 32) + return ExtReg; + + // Insert a copy to move ExtReg to GPR32. + Register NarrowReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass); + auto Copy = MIB.buildCopy({NarrowReg}, {ExtReg}); + + // Select the copy into a subregister copy. + selectCopy(*Copy, TII, MRI, TRI, RBI); + return Copy.getReg(0); +} + +/// Select an "extended register" operand. This operand folds in an extend +/// followed by an optional left shift. +InstructionSelector::ComplexRendererFns +AArch64InstructionSelector::selectArithExtendedRegister( + MachineOperand &Root) const { + if (!Root.isReg()) + return None; + MachineRegisterInfo &MRI = + Root.getParent()->getParent()->getParent()->getRegInfo(); + + uint64_t ShiftVal = 0; + Register ExtReg; + AArch64_AM::ShiftExtendType Ext; + MachineInstr *RootDef = getDefIgnoringCopies(Root.getReg(), MRI); + if (!RootDef) + return None; + + if (!isWorthFoldingIntoExtendedReg(*RootDef, MRI)) + return None; + + // Check if we can fold a shift and an extend. + if (RootDef->getOpcode() == TargetOpcode::G_SHL) { + // Look for a constant on the RHS of the shift. + MachineOperand &RHS = RootDef->getOperand(2); + Optional<uint64_t> MaybeShiftVal = getImmedFromMO(RHS); + if (!MaybeShiftVal) + return None; + ShiftVal = *MaybeShiftVal; + if (ShiftVal > 4) + return None; + // Look for a valid extend instruction on the LHS of the shift. + MachineOperand &LHS = RootDef->getOperand(1); + MachineInstr *ExtDef = getDefIgnoringCopies(LHS.getReg(), MRI); + if (!ExtDef) + return None; + Ext = getExtendTypeForInst(*ExtDef, MRI); + if (Ext == AArch64_AM::InvalidShiftExtend) + return None; + ExtReg = ExtDef->getOperand(1).getReg(); + } else { + // Didn't get a shift. Try just folding an extend. + Ext = getExtendTypeForInst(*RootDef, MRI); + if (Ext == AArch64_AM::InvalidShiftExtend) + return None; + ExtReg = RootDef->getOperand(1).getReg(); + + // If we have a 32 bit instruction which zeroes out the high half of a + // register, we get an implicit zero extend for free. Check if we have one. 
+ // FIXME: We actually emit the extend right now even though we don't have + // to. + if (Ext == AArch64_AM::UXTW && MRI.getType(ExtReg).getSizeInBits() == 32) { + MachineInstr *ExtInst = MRI.getVRegDef(ExtReg); + if (ExtInst && isDef32(*ExtInst)) + return None; + } + } + + // We require a GPR32 here. Narrow the ExtReg if needed using a subregister + // copy. + MachineIRBuilder MIB(*RootDef); + ExtReg = narrowExtendRegIfNeeded(ExtReg, MIB); + + return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(ExtReg); }, + [=](MachineInstrBuilder &MIB) { + MIB.addImm(getArithExtendImm(Ext, ShiftVal)); + }}}; +} + void AArch64InstructionSelector::renderTruncImm(MachineInstrBuilder &MIB, const MachineInstr &MI) const { const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); @@ -4003,6 +4640,51 @@ void AArch64InstructionSelector::renderTruncImm(MachineInstrBuilder &MIB, MIB.addImm(CstVal.getValue()); } +void AArch64InstructionSelector::renderLogicalImm32( + MachineInstrBuilder &MIB, const MachineInstr &I) const { + assert(I.getOpcode() == TargetOpcode::G_CONSTANT && "Expected G_CONSTANT"); + uint64_t CstVal = I.getOperand(1).getCImm()->getZExtValue(); + uint64_t Enc = AArch64_AM::encodeLogicalImmediate(CstVal, 32); + MIB.addImm(Enc); +} + +void AArch64InstructionSelector::renderLogicalImm64( + MachineInstrBuilder &MIB, const MachineInstr &I) const { + assert(I.getOpcode() == TargetOpcode::G_CONSTANT && "Expected G_CONSTANT"); + uint64_t CstVal = I.getOperand(1).getCImm()->getZExtValue(); + uint64_t Enc = AArch64_AM::encodeLogicalImmediate(CstVal, 64); + MIB.addImm(Enc); +} + +bool AArch64InstructionSelector::isLoadStoreOfNumBytes( + const MachineInstr &MI, unsigned NumBytes) const { + if (!MI.mayLoadOrStore()) + return false; + assert(MI.hasOneMemOperand() && + "Expected load/store to have only one mem op!"); + return (*MI.memoperands_begin())->getSize() == NumBytes; +} + +bool AArch64InstructionSelector::isDef32(const MachineInstr &MI) const { + const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); + if (MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() != 32) + return false; + + // Only return true if we know the operation will zero-out the high half of + // the 64-bit register. Truncates can be subregister copies, which don't + // zero out the high bits. Copies and other copy-like instructions can be + // fed by truncates, or could be lowered as subregister copies. 
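A hedged illustration of the property isDef32 relies on; the assembly below is an example, not part of the patch:

    // Any instruction that writes a W register zeroes bits [63:32] of the
    // corresponding X register, e.g.:
    //   add w8, w9, w10   // x8's high half is now known to be zero
    // A G_TRUNC, by contrast, may be selected as a bare sub-register copy,
    // which guarantees nothing about the high half of the full register.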
+ switch (MI.getOpcode()) { + default: + return true; + case TargetOpcode::COPY: + case TargetOpcode::G_BITCAST: + case TargetOpcode::G_TRUNC: + case TargetOpcode::G_PHI: + return false; + } +} + namespace llvm { InstructionSelector * createAArch64InstructionSelector(const AArch64TargetMachine &TM, diff --git a/lib/Target/AArch64/AArch64LegalizerInfo.cpp b/lib/Target/AArch64/AArch64LegalizerInfo.cpp index a985b330eafa..7a1901bd5b1e 100644 --- a/lib/Target/AArch64/AArch64LegalizerInfo.cpp +++ b/lib/Target/AArch64/AArch64LegalizerInfo.cpp @@ -13,7 +13,9 @@ #include "AArch64LegalizerInfo.h" #include "AArch64Subtarget.h" +#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" +#include "llvm/CodeGen/GlobalISel/Utils.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/TargetOpcodes.h" @@ -50,6 +52,12 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) { const LLT v2s64 = LLT::vector(2, 64); const LLT v2p0 = LLT::vector(2, p0); + // FIXME: support subtargets which have neon/fp-armv8 disabled. + if (!ST.hasNEON() || !ST.hasFPARMv8()) { + computeTables(); + return; + } + getActionDefinitionsBuilder(G_IMPLICIT_DEF) .legalFor({p0, s1, s8, s16, s32, s64, v4s32, v2s64}) .clampScalar(0, s1, s64) @@ -74,7 +82,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) { getActionDefinitionsBuilder(G_BSWAP) .legalFor({s32, s64, v4s32, v2s32, v2s64}) - .clampScalar(0, s16, s64) + .clampScalar(0, s32, s64) .widenScalarToNextPow2(0); getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR}) @@ -104,6 +112,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) { getActionDefinitionsBuilder({G_SDIV, G_UDIV}) .legalFor({s32, s64}) + .libcallFor({s128}) .clampScalar(0, s32, s64) .widenScalarToNextPow2(0) .scalarize(0); @@ -115,8 +124,12 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) { return !SrcTy.isVector() && SrcTy.getSizeInBits() == 32 && AmtTy.getSizeInBits() == 32; }) - .legalFor( - {{s32, s32}, {s32, s64}, {s64, s64}, {v2s32, v2s32}, {v4s32, v4s32}}) + .legalFor({{s32, s32}, + {s32, s64}, + {s64, s64}, + {v2s32, v2s32}, + {v4s32, v4s32}, + {v2s64, v2s64}}) .clampScalar(1, s32, s64) .clampScalar(0, s32, s64) .minScalarSameAs(1, 0); @@ -191,14 +204,14 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) { .legalIf([=](const LegalityQuery &Query) { const LLT &Ty0 = Query.Types[0]; const LLT &Ty1 = Query.Types[1]; - if (Ty1 != s32 && Ty1 != s64) + if (Ty1 != s32 && Ty1 != s64 && Ty1 != s128) return false; if (Ty1 == p0) return true; return isPowerOf2_32(Ty0.getSizeInBits()) && (Ty0.getSizeInBits() == 1 || Ty0.getSizeInBits() >= 8); }) - .clampScalar(1, s32, s64) + .clampScalar(1, s32, s128) .widenScalarToNextPow2(1) .maxScalarIf(typeInSet(1, {s32}), 0, s16) .maxScalarIf(typeInSet(1, {s64}), 0, s32) @@ -236,6 +249,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) { {s32, p0, 32, 8}, {s64, p0, 64, 8}, {p0, p0, 64, 8}, + {s128, p0, 128, 8}, {v8s8, p0, 64, 8}, {v16s8, p0, 128, 8}, {v4s16, p0, 64, 8}, @@ -247,14 +261,12 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) { .legalForTypesWithMemDesc({{s32, p0, 8, 8}, {s32, p0, 16, 8}}) .clampScalar(0, s8, s64) - .widenScalarToNextPow2(0) - // TODO: We could support sum-of-pow2's but the lowering code doesn't know - // how to do that yet. 
- .unsupportedIfMemSizeNotPow2() + .lowerIfMemSizeNotPow2() // Lower any any-extending loads left into G_ANYEXT and G_LOAD .lowerIf([=](const LegalityQuery &Query) { return Query.Types[0].getSizeInBits() != Query.MMODescrs[0].SizeInBits; }) + .widenScalarToNextPow2(0) .clampMaxNumElements(0, s32, 2) .clampMaxNumElements(0, s64, 1) .customIf(IsPtrVecPred); @@ -262,9 +274,12 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) { getActionDefinitionsBuilder(G_STORE) .legalForTypesWithMemDesc({{s8, p0, 8, 8}, {s16, p0, 16, 8}, + {s32, p0, 8, 8}, + {s32, p0, 16, 8}, {s32, p0, 32, 8}, {s64, p0, 64, 8}, {p0, p0, 64, 8}, + {s128, p0, 128, 8}, {v16s8, p0, 128, 8}, {v4s16, p0, 64, 8}, {v8s16, p0, 128, 8}, @@ -272,10 +287,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) { {v4s32, p0, 128, 8}, {v2s64, p0, 128, 8}}) .clampScalar(0, s8, s64) - .widenScalarToNextPow2(0) - // TODO: We could support sum-of-pow2's but the lowering code doesn't know - // how to do that yet. - .unsupportedIfMemSizeNotPow2() + .lowerIfMemSizeNotPow2() .lowerIf([=](const LegalityQuery &Query) { return Query.Types[0].isScalar() && Query.Types[0].getSizeInBits() != Query.MMODescrs[0].SizeInBits; @@ -305,8 +317,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) { {v8s16, v8s16}, {v8s8, v8s8}, {v16s8, v16s8}}) - .clampScalar(0, s32, s32) .clampScalar(1, s32, s64) + .clampScalar(0, s32, s32) .minScalarEltSameAsIf( [=](const LegalityQuery &Query) { const LLT &Ty = Query.Types[0]; @@ -330,33 +342,40 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) { .widenScalarToNextPow2(1); // Extensions - getActionDefinitionsBuilder({G_ZEXT, G_SEXT, G_ANYEXT}) - .legalIf([=](const LegalityQuery &Query) { - unsigned DstSize = Query.Types[0].getSizeInBits(); - - // Make sure that we have something that will fit in a register, and - // make sure it's a power of 2. - if (DstSize < 8 || DstSize > 128 || !isPowerOf2_32(DstSize)) - return false; + auto ExtLegalFunc = [=](const LegalityQuery &Query) { + unsigned DstSize = Query.Types[0].getSizeInBits(); + + if (DstSize == 128 && !Query.Types[0].isVector()) + return false; // Extending to a scalar s128 needs narrowing. + + // Make sure that we have something that will fit in a register, and + // make sure it's a power of 2. + if (DstSize < 8 || DstSize > 128 || !isPowerOf2_32(DstSize)) + return false; - const LLT &SrcTy = Query.Types[1]; + const LLT &SrcTy = Query.Types[1]; - // Special case for s1. - if (SrcTy == s1) - return true; + // Special case for s1. + if (SrcTy == s1) + return true; - // Make sure we fit in a register otherwise. Don't bother checking that - // the source type is below 128 bits. We shouldn't be allowing anything - // through which is wider than the destination in the first place. - unsigned SrcSize = SrcTy.getSizeInBits(); - if (SrcSize < 8 || !isPowerOf2_32(SrcSize)) - return false; + // Make sure we fit in a register otherwise. Don't bother checking that + // the source type is below 128 bits. We shouldn't be allowing anything + // through which is wider than the destination in the first place. + unsigned SrcSize = SrcTy.getSizeInBits(); + if (SrcSize < 8 || !isPowerOf2_32(SrcSize)) + return false; - return true; - }); + return true; + }; + getActionDefinitionsBuilder({G_ZEXT, G_SEXT, G_ANYEXT}) + .legalIf(ExtLegalFunc) + .clampScalar(0, s64, s64); // Just for s128, others are handled above. 
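A condensed, self-contained C++ restatement of the scalar-extension rule above (sizes in bits; illustrative only):

    // Mirrors ExtLegalFunc for scalars: s1 sources are special-cased, both
    // sizes must otherwise be byte-sized powers of two, and a scalar s128
    // destination is rejected (the clampScalar above narrows it to s64).
    bool scalarExtIsLegal(unsigned DstBits, unsigned SrcBits) {
      auto IsPow2 = [](unsigned N) { return N != 0 && (N & (N - 1)) == 0; };
      if (DstBits == 128)
        return false; // Needs narrowing instead.
      if (DstBits < 8 || DstBits > 128 || !IsPow2(DstBits))
        return false;
      if (SrcBits == 1)
        return true;
      return SrcBits >= 8 && IsPow2(SrcBits);
    }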
getActionDefinitionsBuilder(G_TRUNC).alwaysLegal(); + getActionDefinitionsBuilder(G_SEXT_INREG).lower(); + // FP conversions getActionDefinitionsBuilder(G_FPTRUNC).legalFor( {{s16, s32}, {s16, s64}, {s32, s64}, {v4s16, v4s32}, {v2s32, v2s64}}); @@ -591,6 +610,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) { return Query.Types[0] == p0 && Query.Types[1] == s64; }); + getActionDefinitionsBuilder(G_DYN_STACKALLOC).lower(); + computeTables(); verify(*ST.getInstrInfo()); } @@ -617,6 +638,24 @@ bool AArch64LegalizerInfo::legalizeCustom(MachineInstr &MI, llvm_unreachable("expected switch to return"); } +bool AArch64LegalizerInfo::legalizeIntrinsic( + MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &MIRBuilder) const { + switch (MI.getIntrinsicID()) { + case Intrinsic::memcpy: + case Intrinsic::memset: + case Intrinsic::memmove: + if (createMemLibcall(MIRBuilder, MRI, MI) == + LegalizerHelper::UnableToLegalize) + return false; + MI.eraseFromParent(); + return true; + default: + break; + } + return true; +} + bool AArch64LegalizerInfo::legalizeShlAshrLshr( MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder, GISelChangeObserver &Observer) const { @@ -655,7 +694,7 @@ bool AArch64LegalizerInfo::legalizeLoadStore( // legalized. In order to allow further legalization of the inst, we create // a new instruction and erase the existing one. - unsigned ValReg = MI.getOperand(0).getReg(); + Register ValReg = MI.getOperand(0).getReg(); const LLT ValTy = MRI.getType(ValReg); if (!ValTy.isVector() || !ValTy.getElementType().isPointer() || @@ -672,7 +711,7 @@ bool AArch64LegalizerInfo::legalizeLoadStore( auto Bitcast = MIRBuilder.buildBitcast({NewTy}, {ValReg}); MIRBuilder.buildStore(Bitcast.getReg(0), MI.getOperand(1).getReg(), MMO); } else { - unsigned NewReg = MRI.createGenericVirtualRegister(NewTy); + Register NewReg = MRI.createGenericVirtualRegister(NewTy); auto NewLoad = MIRBuilder.buildLoad(NewReg, MI.getOperand(1).getReg(), MMO); MIRBuilder.buildBitcast({ValReg}, {NewLoad}); } diff --git a/lib/Target/AArch64/AArch64LegalizerInfo.h b/lib/Target/AArch64/AArch64LegalizerInfo.h index f3362a18620f..15161bab466c 100644 --- a/lib/Target/AArch64/AArch64LegalizerInfo.h +++ b/lib/Target/AArch64/AArch64LegalizerInfo.h @@ -31,6 +31,9 @@ public: MachineIRBuilder &MIRBuilder, GISelChangeObserver &Observer) const override; + bool legalizeIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &MIRBuilder) const override; + private: bool legalizeVaArg(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder) const; diff --git a/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp index 65b5f906e3f6..a0c4a25bb5b9 100644 --- a/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp +++ b/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp @@ -201,8 +201,22 @@ static bool isNarrowStore(unsigned Opc) { } } +// These instructions set a memory tag and either keep the memory contents unchanged or +// set them to zero, ignoring the address part of the source register. +static bool isTagStore(const MachineInstr &MI) { + switch (MI.getOpcode()) { + default: + return false; + case AArch64::STGOffset: + case AArch64::STZGOffset: + case AArch64::ST2GOffset: + case AArch64::STZ2GOffset: + return true; + } +} + // Scaling factor for unscaled load or store.
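As a hedged assembly sketch of what the tag-store entries added throughout this file enable (registers are illustrative; the example is not taken from the patch):

    // Before the load/store optimizer:
    //   stg x0, [x1]        // store the allocation tag for a 16-byte granule
    //   add x1, x1, #16
    // After update folding (post-indexed STG, scale 16):
    //   stg x0, [x1], #16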
-static int getMemScale(MachineInstr &MI) { +static int getMemScale(const MachineInstr &MI) { switch (MI.getOpcode()) { default: llvm_unreachable("Opcode has unknown scale!"); @@ -255,6 +269,11 @@ static int getMemScale(MachineInstr &MI) { case AArch64::STURQi: case AArch64::LDPQi: case AArch64::STPQi: + case AArch64::STGOffset: + case AArch64::STZGOffset: + case AArch64::ST2GOffset: + case AArch64::STZ2GOffset: + case AArch64::STGPi: return 16; } } @@ -449,6 +468,16 @@ static unsigned getPreIndexedOpcode(unsigned Opc) { return AArch64::STPWpre; case AArch64::STPXi: return AArch64::STPXpre; + case AArch64::STGOffset: + return AArch64::STGPreIndex; + case AArch64::STZGOffset: + return AArch64::STZGPreIndex; + case AArch64::ST2GOffset: + return AArch64::ST2GPreIndex; + case AArch64::STZ2GOffset: + return AArch64::STZ2GPreIndex; + case AArch64::STGPi: + return AArch64::STGPpre; } } @@ -518,6 +547,16 @@ static unsigned getPostIndexedOpcode(unsigned Opc) { return AArch64::STPWpost; case AArch64::STPXi: return AArch64::STPXpost; + case AArch64::STGOffset: + return AArch64::STGPostIndex; + case AArch64::STZGOffset: + return AArch64::STZGPostIndex; + case AArch64::ST2GOffset: + return AArch64::ST2GPostIndex; + case AArch64::STZ2GOffset: + return AArch64::STZ2GPostIndex; + case AArch64::STGPi: + return AArch64::STGPpost; } } @@ -536,10 +575,30 @@ static bool isPairedLdSt(const MachineInstr &MI) { case AArch64::STPQi: case AArch64::STPWi: case AArch64::STPXi: + case AArch64::STGPi: return true; } } +// Returns the scale and offset range of pre/post indexed variants of MI. +static void getPrePostIndexedMemOpInfo(const MachineInstr &MI, int &Scale, + int &MinOffset, int &MaxOffset) { + bool IsPaired = isPairedLdSt(MI); + bool IsTagStore = isTagStore(MI); + // ST*G and all paired ldst have the same scale in pre/post-indexed variants + // as in the "unsigned offset" variant. + // All other pre/post indexed ldst instructions are unscaled. + Scale = (IsTagStore || IsPaired) ? getMemScale(MI) : 1; + + if (IsPaired) { + MinOffset = -64; + MaxOffset = 63; + } else { + MinOffset = -256; + MaxOffset = 255; + } +} + static const MachineOperand &getLdStRegOp(const MachineInstr &MI, unsigned PairedRegOp = 0) { assert(PairedRegOp < 2 && "Unexpected register operand idx."); @@ -618,6 +677,11 @@ static bool isMergeableLdStUpdate(MachineInstr &MI) { case AArch64::LDRWui: case AArch64::LDRHHui: case AArch64::LDRBBui: + case AArch64::STGOffset: + case AArch64::STZGOffset: + case AArch64::ST2GOffset: + case AArch64::STZ2GOffset: + case AArch64::STGPi: // Unscaled instructions. case AArch64::STURSi: case AArch64::STURDi: @@ -808,7 +872,7 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, // STRWui %w1, ... // USE kill %w1 ; need to clear kill flag when moving STRWui downwards // STRW %w0 - unsigned Reg = getLdStRegOp(*I).getReg(); + Register Reg = getLdStRegOp(*I).getReg(); for (MachineInstr &MI : make_range(std::next(I), Paired)) MI.clearRegisterKills(Reg, TRI); } @@ -837,9 +901,9 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, MachineOperand &DstMO = MIB->getOperand(SExtIdx); // Right now, DstMO has the extended register, since it comes from an // extended opcode. - unsigned DstRegX = DstMO.getReg(); + Register DstRegX = DstMO.getReg(); // Get the W variant of that register. - unsigned DstRegW = TRI->getSubReg(DstRegX, AArch64::sub_32); + Register DstRegW = TRI->getSubReg(DstRegX, AArch64::sub_32); // Update the result of LDP to use the W instead of the X variant. 
DstMO.setReg(DstRegW); LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs())); @@ -882,9 +946,9 @@ AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI, int LoadSize = getMemScale(*LoadI); int StoreSize = getMemScale(*StoreI); - unsigned LdRt = getLdStRegOp(*LoadI).getReg(); + Register LdRt = getLdStRegOp(*LoadI).getReg(); const MachineOperand &StMO = getLdStRegOp(*StoreI); - unsigned StRt = getLdStRegOp(*StoreI).getReg(); + Register StRt = getLdStRegOp(*StoreI).getReg(); bool IsStoreXReg = TRI->getRegClass(AArch64::GPR64RegClassID)->contains(StRt); assert((IsStoreXReg || @@ -933,10 +997,10 @@ AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI, ? getLdStOffsetOp(*StoreI).getImm() : getLdStOffsetOp(*StoreI).getImm() * StoreSize; int Width = LoadSize * 8; - unsigned DestReg = IsStoreXReg - ? TRI->getMatchingSuperReg(LdRt, AArch64::sub_32, - &AArch64::GPR64RegClass) - : LdRt; + unsigned DestReg = + IsStoreXReg ? Register(TRI->getMatchingSuperReg( + LdRt, AArch64::sub_32, &AArch64::GPR64RegClass)) + : LdRt; assert((UnscaledLdOffset >= UnscaledStOffset && (UnscaledLdOffset + LoadSize) <= UnscaledStOffset + StoreSize) && @@ -1042,7 +1106,7 @@ bool AArch64LoadStoreOpt::findMatchingStore( MachineBasicBlock::iterator B = I->getParent()->begin(); MachineBasicBlock::iterator MBBI = I; MachineInstr &LoadMI = *I; - unsigned BaseReg = getLdStBaseOp(LoadMI).getReg(); + Register BaseReg = getLdStBaseOp(LoadMI).getReg(); // If the load is the first instruction in the block, there's obviously // not any matching store. @@ -1156,8 +1220,8 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, bool MayLoad = FirstMI.mayLoad(); bool IsUnscaled = TII->isUnscaledLdSt(FirstMI); - unsigned Reg = getLdStRegOp(FirstMI).getReg(); - unsigned BaseReg = getLdStBaseOp(FirstMI).getReg(); + Register Reg = getLdStRegOp(FirstMI).getReg(); + Register BaseReg = getLdStBaseOp(FirstMI).getReg(); int Offset = getLdStOffsetOp(FirstMI).getImm(); int OffsetStride = IsUnscaled ? getMemScale(FirstMI) : 1; bool IsPromotableZeroStore = isPromotableZeroStoreInst(FirstMI); @@ -1188,7 +1252,7 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, // check for +1/-1. Make sure to check the new instruction offset is // actually an immediate and not a symbolic reference destined for // a relocation. - unsigned MIBaseReg = getLdStBaseOp(MI).getReg(); + Register MIBaseReg = getLdStBaseOp(MI).getReg(); int MIOffset = getLdStOffsetOp(MI).getImm(); bool MIIsUnscaled = TII->isUnscaledLdSt(MI); if (IsUnscaled != MIIsUnscaled) { @@ -1328,18 +1392,19 @@ AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I, unsigned NewOpc = IsPreIdx ? getPreIndexedOpcode(I->getOpcode()) : getPostIndexedOpcode(I->getOpcode()); MachineInstrBuilder MIB; + int Scale, MinOffset, MaxOffset; + getPrePostIndexedMemOpInfo(*I, Scale, MinOffset, MaxOffset); if (!isPairedLdSt(*I)) { // Non-paired instruction. MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc)) .add(getLdStRegOp(*Update)) .add(getLdStRegOp(*I)) .add(getLdStBaseOp(*I)) - .addImm(Value) + .addImm(Value / Scale) .setMemRefs(I->memoperands()) .setMIFlags(I->mergeFlagsWith(*Update)); } else { // Paired instruction. 
- int Scale = getMemScale(*I); MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc)) .add(getLdStRegOp(*Update)) .add(getLdStRegOp(*I, 0)) @@ -1395,28 +1460,21 @@ bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI, MI.getOperand(1).getReg() != BaseReg) break; - bool IsPairedInsn = isPairedLdSt(MemMI); int UpdateOffset = MI.getOperand(2).getImm(); if (MI.getOpcode() == AArch64::SUBXri) UpdateOffset = -UpdateOffset; - // For non-paired load/store instructions, the immediate must fit in a - // signed 9-bit integer. - if (!IsPairedInsn && (UpdateOffset > 255 || UpdateOffset < -256)) + // The immediate must be a multiple of the scaling factor of the pre/post + // indexed instruction. + int Scale, MinOffset, MaxOffset; + getPrePostIndexedMemOpInfo(MemMI, Scale, MinOffset, MaxOffset); + if (UpdateOffset % Scale != 0) break; - // For paired load/store instructions, the immediate must be a multiple of - // the scaling factor. The scaled offset must also fit into a signed 7-bit - // integer. - if (IsPairedInsn) { - int Scale = getMemScale(MemMI); - if (UpdateOffset % Scale != 0) - break; - - int ScaledOffset = UpdateOffset / Scale; - if (ScaledOffset > 63 || ScaledOffset < -64) - break; - } + // Scaled offset must fit in the instruction immediate. + int ScaledOffset = UpdateOffset / Scale; + if (ScaledOffset > MaxOffset || ScaledOffset < MinOffset) + break; // If we have a non-zero Offset, we check that it matches the amount // we're adding to the register. @@ -1433,7 +1491,7 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward( MachineInstr &MemMI = *I; MachineBasicBlock::iterator MBBI = I; - unsigned BaseReg = getLdStBaseOp(MemMI).getReg(); + Register BaseReg = getLdStBaseOp(MemMI).getReg(); int MIUnscaledOffset = getLdStOffsetOp(MemMI).getImm() * getMemScale(MemMI); // Scan forward looking for post-index opportunities. Updating instructions @@ -1442,13 +1500,19 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward( if (MIUnscaledOffset != UnscaledOffset) return E; - // If the base register overlaps a destination register, we can't - // merge the update. - bool IsPairedInsn = isPairedLdSt(MemMI); - for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) { - unsigned DestReg = getLdStRegOp(MemMI, i).getReg(); - if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg)) - return E; + // If the base register overlaps a source/destination register, we can't + // merge the update. This does not apply to tag store instructions, which + // ignore the address part of the source register. + // Nor does it apply to STGPi, which, unlike normal stores, has no + // unpredictable behavior in this case and always performs writeback + // after reading the source register value. + if (!isTagStore(MemMI) && MemMI.getOpcode() != AArch64::STGPi) { + bool IsPairedInsn = isPairedLdSt(MemMI); + for (unsigned i = 0, e = IsPairedInsn ?
2 : 1; i != e; ++i) { + Register DestReg = getLdStRegOp(MemMI, i).getReg(); + if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg)) + return E; + } } // Track which register units have been modified and used between the first @@ -1487,7 +1551,7 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward( MachineInstr &MemMI = *I; MachineBasicBlock::iterator MBBI = I; - unsigned BaseReg = getLdStBaseOp(MemMI).getReg(); + Register BaseReg = getLdStBaseOp(MemMI).getReg(); int Offset = getLdStOffsetOp(MemMI).getImm(); // If the load/store is the first instruction in the block, there's obviously @@ -1496,11 +1560,13 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward( return E; // If the base register overlaps a destination register, we can't // merge the update. - bool IsPairedInsn = isPairedLdSt(MemMI); - for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) { - unsigned DestReg = getLdStRegOp(MemMI, i).getReg(); - if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg)) - return E; + if (!isTagStore(MemMI)) { + bool IsPairedInsn = isPairedLdSt(MemMI); + for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) { + Register DestReg = getLdStRegOp(MemMI, i).getReg(); + if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg)) + return E; + } } // Track which register units have been modified and used between the first @@ -1659,7 +1725,7 @@ bool AArch64LoadStoreOpt::tryToMergeLdStUpdate // however, is not, so adjust here. int UnscaledOffset = getLdStOffsetOp(MI).getImm() * getMemScale(MI); - // Look forward to try to find a post-index instruction. For example, + // Look forward to try to find a pre-index instruction. For example, // ldr x1, [x0, #64] // add x0, x0, #64 // merged into: diff --git a/lib/Target/AArch64/AArch64MCInstLower.cpp b/lib/Target/AArch64/AArch64MCInstLower.cpp index e7d4a2789a28..afd5ae6bcbf2 100644 --- a/lib/Target/AArch64/AArch64MCInstLower.cpp +++ b/lib/Target/AArch64/AArch64MCInstLower.cpp @@ -148,6 +148,8 @@ MCOperand AArch64MCInstLower::lowerSymbolOperandELF(const MachineOperand &MO, RefFlags |= AArch64MCExpr::VK_TLSDESC; break; } + } else if (MO.getTargetFlags() & AArch64II::MO_PREL) { + RefFlags |= AArch64MCExpr::VK_PREL; } else { // No modifier means this is a generic reference, classified as absolute for // the cases where it matters (:abs_g0: etc). diff --git a/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/lib/Target/AArch64/AArch64MachineFunctionInfo.h index 0efeeb272ec1..0009fb7b5520 100644 --- a/lib/Target/AArch64/AArch64MachineFunctionInfo.h +++ b/lib/Target/AArch64/AArch64MachineFunctionInfo.h @@ -19,6 +19,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/MachineFunction.h" +#include "llvm/IR/Function.h" #include "llvm/MC/MCLinkerOptimizationHint.h" #include <cassert> @@ -95,6 +96,13 @@ class AArch64FunctionInfo final : public MachineFunctionInfo { /// returned struct in a register. This field holds the virtual register into /// which the sret argument is passed. unsigned SRetReturnReg = 0; + /// The SVE stack size (for predicates and data vectors) is maintained here + /// rather than in FrameInfo, as the placement and Stack IDs are target + /// specific. + uint64_t StackSizeSVE = 0; + + /// HasCalculatedStackSizeSVE indicates whether StackSizeSVE is valid. + bool HasCalculatedStackSizeSVE = false; /// Has a value when it is known whether or not the function uses a /// redzone, and no value otherwise.
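A hedged sketch of the intended usage of the new SVE stack-size fields; the frame-lowering call site and the SVEStackSize variable here are assumptions for illustration, not part of this hunk:

    // Publish the size exactly once; readers may then rely on the flag:
    //   AFI->setStackSizeSVE(SVEStackSize);   // also sets the flag below
    //   ...
    //   assert(AFI->hasCalculatedStackSizeSVE());
    //   uint64_t Size = AFI->getStackSizeSVE();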
@@ -131,6 +139,15 @@ public: ArgumentStackToRestore = bytes; } + bool hasCalculatedStackSizeSVE() const { return HasCalculatedStackSizeSVE; } + + void setStackSizeSVE(uint64_t S) { + HasCalculatedStackSizeSVE = true; + StackSizeSVE = S; + } + + uint64_t getStackSizeSVE() const { return StackSizeSVE; } + bool hasStackFrame() const { return HasStackFrame; } void setHasStackFrame(bool s) { HasStackFrame = s; } diff --git a/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp b/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp index aff861aae6be..d503c39b1f90 100644 --- a/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp +++ b/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp @@ -162,11 +162,11 @@ bool A57ChainingConstraint::addIntraChainConstraint(PBQPRAGraph &G, unsigned Rd, LiveIntervals &LIs = G.getMetadata().LIS; - if (TRI->isPhysicalRegister(Rd) || TRI->isPhysicalRegister(Ra)) { - LLVM_DEBUG(dbgs() << "Rd is a physical reg:" << TRI->isPhysicalRegister(Rd) - << '\n'); - LLVM_DEBUG(dbgs() << "Ra is a physical reg:" << TRI->isPhysicalRegister(Ra) - << '\n'); + if (Register::isPhysicalRegister(Rd) || Register::isPhysicalRegister(Ra)) { + LLVM_DEBUG(dbgs() << "Rd is a physical reg:" + << Register::isPhysicalRegister(Rd) << '\n'); + LLVM_DEBUG(dbgs() << "Ra is a physical reg:" + << Register::isPhysicalRegister(Ra) << '\n'); return false; } @@ -359,8 +359,8 @@ void A57ChainingConstraint::apply(PBQPRAGraph &G) { case AArch64::FMADDDrrr: case AArch64::FNMSUBDrrr: case AArch64::FNMADDDrrr: { - unsigned Rd = MI.getOperand(0).getReg(); - unsigned Ra = MI.getOperand(3).getReg(); + Register Rd = MI.getOperand(0).getReg(); + Register Ra = MI.getOperand(3).getReg(); if (addIntraChainConstraint(G, Rd, Ra)) addInterChainConstraint(G, Rd, Ra); @@ -369,7 +369,7 @@ void A57ChainingConstraint::apply(PBQPRAGraph &G) { case AArch64::FMLAv2f32: case AArch64::FMLSv2f32: { - unsigned Rd = MI.getOperand(0).getReg(); + Register Rd = MI.getOperand(0).getReg(); addInterChainConstraint(G, Rd, Rd); break; } diff --git a/lib/Target/AArch64/AArch64PreLegalizerCombiner.cpp b/lib/Target/AArch64/AArch64PreLegalizerCombiner.cpp index 5f7245bfbd74..d30ea120bae4 100644 --- a/lib/Target/AArch64/AArch64PreLegalizerCombiner.cpp +++ b/lib/Target/AArch64/AArch64PreLegalizerCombiner.cpp @@ -15,7 +15,9 @@ #include "llvm/CodeGen/GlobalISel/Combiner.h" #include "llvm/CodeGen/GlobalISel/CombinerHelper.h" #include "llvm/CodeGen/GlobalISel/CombinerInfo.h" +#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h" #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" +#include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/Support/Debug.h" @@ -25,12 +27,31 @@ using namespace llvm; using namespace MIPatternMatch; +#define AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS +#include "AArch64GenGICombiner.inc" +#undef AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS + namespace { +#define AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H +#include "AArch64GenGICombiner.inc" +#undef AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H + class AArch64PreLegalizerCombinerInfo : public CombinerInfo { + GISelKnownBits *KB; + MachineDominatorTree *MDT; + public: - AArch64PreLegalizerCombinerInfo() + AArch64GenPreLegalizerCombinerHelper Generated; + + AArch64PreLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize, + GISelKnownBits *KB, MachineDominatorTree *MDT) : CombinerInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false, - /*LegalizerInfo*/ nullptr) {} + 
/*LegalizerInfo*/ nullptr, EnableOpt, OptSize, MinSize), + KB(KB), MDT(MDT) { + if (!Generated.parseCommandLineOption()) + report_fatal_error("Invalid rule identifier"); + } + virtual bool combine(GISelChangeObserver &Observer, MachineInstr &MI, MachineIRBuilder &B) const override; }; @@ -38,24 +59,50 @@ public: bool AArch64PreLegalizerCombinerInfo::combine(GISelChangeObserver &Observer, MachineInstr &MI, MachineIRBuilder &B) const { - CombinerHelper Helper(Observer, B); + CombinerHelper Helper(Observer, B, KB, MDT); switch (MI.getOpcode()) { - default: - return false; - case TargetOpcode::COPY: - return Helper.tryCombineCopy(MI); - case TargetOpcode::G_BR: - return Helper.tryCombineBr(MI); + case TargetOpcode::G_CONCAT_VECTORS: + return Helper.tryCombineConcatVectors(MI); + case TargetOpcode::G_SHUFFLE_VECTOR: + return Helper.tryCombineShuffleVector(MI); case TargetOpcode::G_LOAD: case TargetOpcode::G_SEXTLOAD: - case TargetOpcode::G_ZEXTLOAD: - return Helper.tryCombineExtendingLoads(MI); + case TargetOpcode::G_ZEXTLOAD: { + bool Changed = false; + Changed |= Helper.tryCombineExtendingLoads(MI); + Changed |= Helper.tryCombineIndexedLoadStore(MI); + return Changed; + } + case TargetOpcode::G_STORE: + return Helper.tryCombineIndexedLoadStore(MI); + case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS: + switch (MI.getIntrinsicID()) { + case Intrinsic::memcpy: + case Intrinsic::memmove: + case Intrinsic::memset: { + // If we're at -O0 set a maxlen of 32 to inline, otherwise let the other + // heuristics decide. + unsigned MaxLen = EnableOpt ? 0 : 32; + // Try to inline memcpy type calls if optimizations are enabled. + return (!EnableMinSize) ? Helper.tryCombineMemCpyFamily(MI, MaxLen) + : false; + } + default: + break; + } } + if (Generated.tryCombineAll(Observer, MI, B)) + return true; + return false; } +#define AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP +#include "AArch64GenGICombiner.inc" +#undef AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP + // Pass boilerplate // ================ @@ -63,24 +110,33 @@ class AArch64PreLegalizerCombiner : public MachineFunctionPass { public: static char ID; - AArch64PreLegalizerCombiner(); + AArch64PreLegalizerCombiner(bool IsOptNone = false); StringRef getPassName() const override { return "AArch64PreLegalizerCombiner"; } bool runOnMachineFunction(MachineFunction &MF) override; void getAnalysisUsage(AnalysisUsage &AU) const override; +private: + bool IsOptNone; }; -} +} // end anonymous namespace void AArch64PreLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired<TargetPassConfig>(); AU.setPreservesCFG(); getSelectionDAGFallbackAnalysisUsage(AU); + AU.addRequired<GISelKnownBitsAnalysis>(); + AU.addPreserved<GISelKnownBitsAnalysis>(); + if (!IsOptNone) { + AU.addRequired<MachineDominatorTree>(); + AU.addPreserved<MachineDominatorTree>(); + } MachineFunctionPass::getAnalysisUsage(AU); } -AArch64PreLegalizerCombiner::AArch64PreLegalizerCombiner() : MachineFunctionPass(ID) { +AArch64PreLegalizerCombiner::AArch64PreLegalizerCombiner(bool IsOptNone) + : MachineFunctionPass(ID), IsOptNone(IsOptNone) { initializeAArch64PreLegalizerCombinerPass(*PassRegistry::getPassRegistry()); } @@ -89,7 +145,14 @@ bool AArch64PreLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) { MachineFunctionProperties::Property::FailedISel)) return false; auto *TPC = &getAnalysis<TargetPassConfig>(); - AArch64PreLegalizerCombinerInfo PCInfo; + const Function &F = MF.getFunction(); + bool EnableOpt = + MF.getTarget().getOptLevel() != 
CodeGenOpt::None && !skipFunction(F); + GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF); + MachineDominatorTree *MDT = + IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>(); + AArch64PreLegalizerCombinerInfo PCInfo(EnableOpt, F.hasOptSize(), + F.hasMinSize(), KB, MDT); Combiner C(PCInfo, TPC); return C.combineMachineInstrs(MF, /*CSEInfo*/ nullptr); } @@ -99,13 +162,14 @@ INITIALIZE_PASS_BEGIN(AArch64PreLegalizerCombiner, DEBUG_TYPE, "Combine AArch64 machine instrs before legalization", false, false) INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) +INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis) INITIALIZE_PASS_END(AArch64PreLegalizerCombiner, DEBUG_TYPE, "Combine AArch64 machine instrs before legalization", false, false) namespace llvm { -FunctionPass *createAArch64PreLegalizeCombiner() { - return new AArch64PreLegalizerCombiner(); +FunctionPass *createAArch64PreLegalizeCombiner(bool IsOptNone) { + return new AArch64PreLegalizerCombiner(IsOptNone); } } // end namespace llvm diff --git a/lib/Target/AArch64/AArch64RegisterBankInfo.cpp b/lib/Target/AArch64/AArch64RegisterBankInfo.cpp index b52259cc9acd..8ec73aa3c040 100644 --- a/lib/Target/AArch64/AArch64RegisterBankInfo.cpp +++ b/lib/Target/AArch64/AArch64RegisterBankInfo.cpp @@ -563,12 +563,12 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { return getSameKindOfOperandsMapping(MI); } case TargetOpcode::COPY: { - unsigned DstReg = MI.getOperand(0).getReg(); - unsigned SrcReg = MI.getOperand(1).getReg(); + Register DstReg = MI.getOperand(0).getReg(); + Register SrcReg = MI.getOperand(1).getReg(); // Check if one of the register is not a generic register. - if ((TargetRegisterInfo::isPhysicalRegister(DstReg) || + if ((Register::isPhysicalRegister(DstReg) || !MRI.getType(DstReg).isValid()) || - (TargetRegisterInfo::isPhysicalRegister(SrcReg) || + (Register::isPhysicalRegister(SrcReg) || !MRI.getType(SrcReg).isValid())) { const RegisterBank *DstRB = getRegBank(DstReg, MRI, TRI); const RegisterBank *SrcRB = getRegBank(SrcReg, MRI, TRI); @@ -635,6 +635,12 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { // Some of the floating-point instructions have mixed GPR and FPR operands: // fine-tune the computed mapping. switch (Opc) { + case TargetOpcode::G_TRUNC: { + LLT SrcTy = MRI.getType(MI.getOperand(1).getReg()); + if (!SrcTy.isVector() && SrcTy.getSizeInBits() == 128) + OpRegBankIdx = {PMI_FirstFPR, PMI_FirstFPR}; + break; + } case TargetOpcode::G_SITOFP: case TargetOpcode::G_UITOFP: if (MRI.getType(MI.getOperand(0).getReg()).isVector()) @@ -687,7 +693,7 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case TargetOpcode::G_STORE: // Check if that store is fed by fp instructions. if (OpRegBankIdx[0] == PMI_FirstGPR) { - unsigned VReg = MI.getOperand(0).getReg(); + Register VReg = MI.getOperand(0).getReg(); if (!VReg) break; MachineInstr *DefMI = MRI.getVRegDef(VReg); @@ -702,11 +708,10 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { break; // If we're taking in vectors, we have no choice but to put everything on - // FPRs. + // FPRs, except for the condition. The condition must always be on a GPR. 
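[Editor's note] One practical effect of threading EnableOpt/OptSize/MinSize into the AArch64PreLegalizerCombiner hunk above is the memcpy-family gate: at -O0 the combiner inlines only small copies, at minimum-size it leaves them alone, and otherwise it defers to later heuristics. A minimal standalone sketch of that decision (the helper name is illustrative, not LLVM API; only the 32-byte cap and the MinSize bail-out come from the hunk above):

#include <cstdio>

// Illustrative gate for inlining memcpy/memmove/memset in the combiner.
// MaxLen == 0 means "no hard cap; later heuristics decide".
static bool shouldInlineMemTransfer(bool EnableOpt, bool MinSize, unsigned Len) {
  if (MinSize)
    return false;                       // never grow code when minimizing size
  unsigned MaxLen = EnableOpt ? 0 : 32; // at -O0, cap inlining at 32 bytes
  return MaxLen == 0 || Len <= MaxLen;
}

int main() {
  printf("%d\n", shouldInlineMemTransfer(false, false, 24)); // 1: small enough for -O0
  printf("%d\n", shouldInlineMemTransfer(false, false, 64)); // 0: too big for -O0
}

In the real pass the length check is folded into Helper.tryCombineMemCpyFamily(MI, MaxLen). The G_SELECT register-bank handling resumes below.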
LLT SrcTy = MRI.getType(MI.getOperand(2).getReg()); if (SrcTy.isVector()) { - for (unsigned Idx = 0; Idx < 4; ++Idx) - OpRegBankIdx[Idx] = PMI_FirstFPR; + OpRegBankIdx = {PMI_FirstFPR, PMI_FirstGPR, PMI_FirstFPR, PMI_FirstFPR}; break; } @@ -740,7 +745,7 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { // This doesn't check the condition, since it's just whatever is in NZCV. // This isn't passed explicitly in a register to fcsel/csel. for (unsigned Idx = 2; Idx < 4; ++Idx) { - unsigned VReg = MI.getOperand(Idx).getReg(); + Register VReg = MI.getOperand(Idx).getReg(); MachineInstr *DefMI = MRI.getVRegDef(VReg); if (getRegBank(VReg, MRI, TRI) == &AArch64::FPRRegBank || onlyDefinesFP(*DefMI, MRI, TRI)) @@ -750,8 +755,7 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { // If we have more FP constraints than not, then move everything over to // FPR. if (NumFP >= 2) - for (unsigned Idx = 0; Idx < 4; ++Idx) - OpRegBankIdx[Idx] = PMI_FirstFPR; + OpRegBankIdx = {PMI_FirstFPR, PMI_FirstGPR, PMI_FirstFPR, PMI_FirstFPR}; break; } @@ -764,7 +768,7 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { LLT SrcTy = MRI.getType(MI.getOperand(MI.getNumOperands()-1).getReg()); // UNMERGE into scalars from a vector should always use FPR. // Likewise if any of the uses are FP instructions. - if (SrcTy.isVector() || + if (SrcTy.isVector() || SrcTy == LLT::scalar(128) || any_of(MRI.use_instructions(MI.getOperand(0).getReg()), [&](MachineInstr &MI) { return onlyUsesFP(MI, MRI, TRI); })) { // Set the register bank of every operand to FPR. @@ -795,12 +799,21 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { // Index needs to be a GPR. OpRegBankIdx[3] = PMI_FirstGPR; break; + case TargetOpcode::G_EXTRACT: { + // For s128 sources we have to use fpr. + LLT SrcTy = MRI.getType(MI.getOperand(1).getReg()); + if (SrcTy.getSizeInBits() == 128) { + OpRegBankIdx[0] = PMI_FirstFPR; + OpRegBankIdx[1] = PMI_FirstFPR; + } + break; + } case TargetOpcode::G_BUILD_VECTOR: // If the first source operand belongs to a FPR register bank, then make // sure that we preserve that. 
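[Editor's note] The two OpRegBankIdx rewrites above encode the same rule from different directions: whenever a select migrates to the FPR bank, every operand except the condition follows it, because the condition feeds NZCV and must stay on the general-purpose side. A toy model of the "NumFP >= 2" vote (the enum and function are illustrative only, not the RegBankSelect API):

#include <cstdio>

enum Bank { GPR, FPR };

// Operand order mirrors G_SELECT: dst, condition, true value, false value.
static void assignSelectBanks(Bank Dst, Bank TVal, Bank FVal, Bank Out[4]) {
  int NumFP = (Dst == FPR) + (TVal == FPR) + (FVal == FPR);
  Bank ValueBank = NumFP >= 2 ? FPR : GPR;
  Out[0] = Out[2] = Out[3] = ValueBank;
  Out[1] = GPR; // the condition always stays on a general-purpose register
}

int main() {
  Bank Out[4];
  assignSelectBanks(GPR, FPR, FPR, Out); // two FP constraints win the vote
  printf("dst=%d cond=%d tval=%d fval=%d\n", Out[0], Out[1], Out[2], Out[3]);
}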
if (OpRegBankIdx[1] != PMI_FirstGPR) break; - unsigned VReg = MI.getOperand(1).getReg(); + Register VReg = MI.getOperand(1).getReg(); if (!VReg) break; diff --git a/lib/Target/AArch64/AArch64RegisterInfo.cpp b/lib/Target/AArch64/AArch64RegisterInfo.cpp index 6d5a4e3d2f76..de176088595d 100644 --- a/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -15,6 +15,7 @@ #include "AArch64FrameLowering.h" #include "AArch64InstrInfo.h" #include "AArch64MachineFunctionInfo.h" +#include "AArch64StackOffset.h" #include "AArch64Subtarget.h" #include "MCTargetDesc/AArch64AddressingModes.h" #include "llvm/ADT/BitVector.h" @@ -23,10 +24,10 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/RegisterScavenging.h" -#include "llvm/IR/Function.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/IR/DiagnosticInfo.h" +#include "llvm/IR/Function.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetOptions.h" using namespace llvm; @@ -63,8 +64,9 @@ AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { return CSR_AArch64_AAPCS_SwiftError_SaveList; if (MF->getFunction().getCallingConv() == CallingConv::PreserveMost) return CSR_AArch64_RT_MostRegs_SaveList; - else - return CSR_AArch64_AAPCS_SaveList; + if (MF->getSubtarget<AArch64Subtarget>().isTargetDarwin()) + return CSR_Darwin_AArch64_AAPCS_SaveList; + return CSR_AArch64_AAPCS_SaveList; } const MCPhysReg *AArch64RegisterInfo::getCalleeSavedRegsViaCopy( @@ -120,6 +122,8 @@ AArch64RegisterInfo::getCallPreservedMask(const MachineFunction &MF, : CSR_AArch64_CXX_TLS_Darwin_RegMask; if (CC == CallingConv::AArch64_VectorCall) return SCS ? CSR_AArch64_AAVPCS_SCS_RegMask : CSR_AArch64_AAVPCS_RegMask; + if (CC == CallingConv::AArch64_SVE_VectorCall) + return CSR_AArch64_SVE_AAPCS_RegMask; if (MF.getSubtarget<AArch64Subtarget>().getTargetLowering() ->supportSwiftError() && MF.getFunction().getAttributes().hasAttrSomewhere(Attribute::SwiftError)) @@ -388,7 +392,7 @@ bool AArch64RegisterInfo::isFrameOffsetLegal(const MachineInstr *MI, int64_t Offset) const { assert(Offset <= INT_MAX && "Offset too big to fit in int."); assert(MI && "Unable to get the legal offset for nil instruction."); - int SaveOffset = Offset; + StackOffset SaveOffset(Offset, MVT::i8); return isAArch64FrameOffsetLegal(*MI, SaveOffset) & AArch64FrameOffsetIsLegal; } @@ -418,7 +422,9 @@ void AArch64RegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB, void AArch64RegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg, int64_t Offset) const { - int Off = Offset; // ARM doesn't need the general 64-bit offsets + // ARM doesn't need the general 64-bit offsets + StackOffset Off(Offset, MVT::i8); + unsigned i = 0; while (!MI.getOperand(i).isFI()) { @@ -441,40 +447,69 @@ void AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, MachineInstr &MI = *II; MachineBasicBlock &MBB = *MI.getParent(); MachineFunction &MF = *MBB.getParent(); + const MachineFrameInfo &MFI = MF.getFrameInfo(); const AArch64InstrInfo *TII = MF.getSubtarget<AArch64Subtarget>().getInstrInfo(); const AArch64FrameLowering *TFI = getFrameLowering(MF); int FrameIndex = MI.getOperand(FIOperandNum).getIndex(); + bool Tagged = + MI.getOperand(FIOperandNum).getTargetFlags() & AArch64II::MO_TAGGED; unsigned FrameReg; - int Offset; // Special handling of dbg_value, stackmap and patchpoint instructions. 
if (MI.isDebugValue() || MI.getOpcode() == TargetOpcode::STACKMAP || MI.getOpcode() == TargetOpcode::PATCHPOINT) { - Offset = TFI->resolveFrameIndexReference(MF, FrameIndex, FrameReg, - /*PreferFP=*/true, - /*ForSimm=*/false); - Offset += MI.getOperand(FIOperandNum + 1).getImm(); + StackOffset Offset = + TFI->resolveFrameIndexReference(MF, FrameIndex, FrameReg, + /*PreferFP=*/true, + /*ForSimm=*/false); + Offset += StackOffset(MI.getOperand(FIOperandNum + 1).getImm(), MVT::i8); MI.getOperand(FIOperandNum).ChangeToRegister(FrameReg, false /*isDef*/); - MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset); + MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset.getBytes()); return; } if (MI.getOpcode() == TargetOpcode::LOCAL_ESCAPE) { MachineOperand &FI = MI.getOperand(FIOperandNum); - Offset = TFI->getNonLocalFrameIndexReference(MF, FrameIndex); + int Offset = TFI->getNonLocalFrameIndexReference(MF, FrameIndex); FI.ChangeToImmediate(Offset); return; } + StackOffset Offset; if (MI.getOpcode() == AArch64::TAGPstack) { // TAGPstack must use the virtual frame register in its 3rd operand. - const MachineFrameInfo &MFI = MF.getFrameInfo(); const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); FrameReg = MI.getOperand(3).getReg(); - Offset = - MFI.getObjectOffset(FrameIndex) + AFI->getTaggedBasePointerOffset(); + Offset = {MFI.getObjectOffset(FrameIndex) + + AFI->getTaggedBasePointerOffset(), + MVT::i8}; + } else if (Tagged) { + StackOffset SPOffset = { + MFI.getObjectOffset(FrameIndex) + (int64_t)MFI.getStackSize(), MVT::i8}; + if (MFI.hasVarSizedObjects() || + isAArch64FrameOffsetLegal(MI, SPOffset, nullptr, nullptr, nullptr) != + (AArch64FrameOffsetCanUpdate | AArch64FrameOffsetIsLegal)) { + // Can't update to SP + offset in place. Precalculate the tagged pointer + // in a scratch register. + Offset = TFI->resolveFrameIndexReference( + MF, FrameIndex, FrameReg, /*PreferFP=*/false, /*ForSimm=*/true); + Register ScratchReg = + MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass); + emitFrameOffset(MBB, II, MI.getDebugLoc(), ScratchReg, FrameReg, Offset, + TII); + BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(AArch64::LDG), ScratchReg) + .addReg(ScratchReg) + .addReg(ScratchReg) + .addImm(0); + MI.getOperand(FIOperandNum) + .ChangeToRegister(ScratchReg, false, false, true); + return; + } + FrameReg = AArch64::SP; + Offset = {MFI.getObjectOffset(FrameIndex) + (int64_t)MFI.getStackSize(), + MVT::i8}; } else { Offset = TFI->resolveFrameIndexReference( MF, FrameIndex, FrameReg, /*PreferFP=*/false, /*ForSimm=*/true); @@ -490,7 +525,7 @@ void AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, // If we get here, the immediate doesn't fit into the instruction. We folded // as much as possible above. Handle the rest, providing a register that is // SP+LargeImm. 
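[Editor's note] The MO_TAGGED path above has two outcomes: fold the access into SP + immediate in place when the offset is encodable, or precompute the tagged address with an ADD plus LDG into a scratch register. A self-contained model of the encodability side, assuming the STG-family scaled signed 9-bit immediate (multiples of 16 in [-4096, 4080]); the real check goes through isAArch64FrameOffsetLegal and also bails on variable-sized objects:

#include <cstdio>

static bool fitsTaggedImmediate(long Offset) {
  // Signed 9-bit immediate, scaled by the 16-byte tag granule.
  return Offset % 16 == 0 && Offset >= -4096 && Offset <= 4080;
}

int main() {
  printf("%d\n", fitsTaggedImmediate(4080)); // 1: rewrite to SP + offset in place
  printf("%d\n", fitsTaggedImmediate(4096)); // 0: emit ADD + LDG into a scratch reg
}

When the check fails, LDG reloads the allocation tag for the precomputed address, which is exactly the BuildMI sequence in the hunk above.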
- unsigned ScratchReg = + Register ScratchReg = MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass); emitFrameOffset(MBB, II, MI.getDebugLoc(), ScratchReg, FrameReg, Offset, TII); MI.getOperand(FIOperandNum).ChangeToRegister(ScratchReg, false, false, true); diff --git a/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp b/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp index 854670079e40..28a7e680849b 100644 --- a/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp +++ b/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp @@ -426,16 +426,16 @@ bool AArch64SIMDInstrOpt::optimizeVectElement(MachineInstr &MI) { MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); // Get the operands of the current SIMD arithmetic instruction. - unsigned MulDest = MI.getOperand(0).getReg(); - unsigned SrcReg0 = MI.getOperand(1).getReg(); + Register MulDest = MI.getOperand(0).getReg(); + Register SrcReg0 = MI.getOperand(1).getReg(); unsigned Src0IsKill = getKillRegState(MI.getOperand(1).isKill()); - unsigned SrcReg1 = MI.getOperand(2).getReg(); + Register SrcReg1 = MI.getOperand(2).getReg(); unsigned Src1IsKill = getKillRegState(MI.getOperand(2).isKill()); unsigned DupDest; // Instructions of interest have either 4 or 5 operands. if (MI.getNumOperands() == 5) { - unsigned SrcReg2 = MI.getOperand(3).getReg(); + Register SrcReg2 = MI.getOperand(3).getReg(); unsigned Src2IsKill = getKillRegState(MI.getOperand(3).isKill()); unsigned LaneNumber = MI.getOperand(4).getImm(); // Create a new DUP instruction. Note that if an equivalent DUP instruction diff --git a/lib/Target/AArch64/AArch64SVEInstrInfo.td b/lib/Target/AArch64/AArch64SVEInstrInfo.td index 79ab42f4c080..b573eac76754 100644 --- a/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -82,11 +82,11 @@ let Predicates = [HasSVE] in { defm SDIVR_ZPmZ : sve_int_bin_pred_arit_2_div<0b110, "sdivr">; defm UDIVR_ZPmZ : sve_int_bin_pred_arit_2_div<0b111, "udivr">; - defm SDOT_ZZZ : sve_intx_dot<0b0, "sdot">; - defm UDOT_ZZZ : sve_intx_dot<0b1, "udot">; + defm SDOT_ZZZ : sve_intx_dot<0b0, "sdot", int_aarch64_sve_sdot>; + defm UDOT_ZZZ : sve_intx_dot<0b1, "udot", int_aarch64_sve_udot>; - defm SDOT_ZZZI : sve_intx_dot_by_indexed_elem<0b0, "sdot">; - defm UDOT_ZZZI : sve_intx_dot_by_indexed_elem<0b1, "udot">; + defm SDOT_ZZZI : sve_intx_dot_by_indexed_elem<0b0, "sdot", int_aarch64_sve_sdot_lane>; + defm UDOT_ZZZI : sve_intx_dot_by_indexed_elem<0b1, "udot", int_aarch64_sve_udot_lane>; defm SXTB_ZPmZ : sve_int_un_pred_arit_0_h<0b000, "sxtb">; defm UXTB_ZPmZ : sve_int_un_pred_arit_0_h<0b001, "uxtb">; @@ -94,14 +94,14 @@ let Predicates = [HasSVE] in { defm UXTH_ZPmZ : sve_int_un_pred_arit_0_w<0b011, "uxth">; defm SXTW_ZPmZ : sve_int_un_pred_arit_0_d<0b100, "sxtw">; defm UXTW_ZPmZ : sve_int_un_pred_arit_0_d<0b101, "uxtw">; - defm ABS_ZPmZ : sve_int_un_pred_arit_0< 0b110, "abs">; - defm NEG_ZPmZ : sve_int_un_pred_arit_0< 0b111, "neg">; - - defm CLS_ZPmZ : sve_int_un_pred_arit_1< 0b000, "cls">; - defm CLZ_ZPmZ : sve_int_un_pred_arit_1< 0b001, "clz">; - defm CNT_ZPmZ : sve_int_un_pred_arit_1< 0b010, "cnt">; - defm CNOT_ZPmZ : sve_int_un_pred_arit_1< 0b011, "cnot">; - defm NOT_ZPmZ : sve_int_un_pred_arit_1< 0b110, "not">; + defm ABS_ZPmZ : sve_int_un_pred_arit_0< 0b110, "abs", int_aarch64_sve_abs>; + defm NEG_ZPmZ : sve_int_un_pred_arit_0< 0b111, "neg", int_aarch64_sve_neg>; + + defm CLS_ZPmZ : sve_int_un_pred_arit_1< 0b000, "cls", null_frag>; + defm CLZ_ZPmZ : sve_int_un_pred_arit_1< 0b001, "clz", null_frag>; + defm CNT_ZPmZ : sve_int_un_pred_arit_1< 
0b010, "cnt", int_aarch64_sve_cnt>; + defm CNOT_ZPmZ : sve_int_un_pred_arit_1< 0b011, "cnot", null_frag>; + defm NOT_ZPmZ : sve_int_un_pred_arit_1< 0b110, "not", null_frag>; defm FABS_ZPmZ : sve_int_un_pred_arit_1_fp<0b100, "fabs">; defm FNEG_ZPmZ : sve_int_un_pred_arit_1_fp<0b101, "fneg">; @@ -138,12 +138,12 @@ let Predicates = [HasSVE] in { defm FDIVR_ZPmZ : sve_fp_2op_p_zds<0b1100, "fdivr">; defm FDIV_ZPmZ : sve_fp_2op_p_zds<0b1101, "fdiv">; - defm FADD_ZZZ : sve_fp_3op_u_zd<0b000, "fadd">; - defm FSUB_ZZZ : sve_fp_3op_u_zd<0b001, "fsub">; - defm FMUL_ZZZ : sve_fp_3op_u_zd<0b010, "fmul">; - defm FTSMUL_ZZZ : sve_fp_3op_u_zd<0b011, "ftsmul">; - defm FRECPS_ZZZ : sve_fp_3op_u_zd<0b110, "frecps">; - defm FRSQRTS_ZZZ : sve_fp_3op_u_zd<0b111, "frsqrts">; + defm FADD_ZZZ : sve_fp_3op_u_zd<0b000, "fadd", fadd>; + defm FSUB_ZZZ : sve_fp_3op_u_zd<0b001, "fsub", null_frag>; + defm FMUL_ZZZ : sve_fp_3op_u_zd<0b010, "fmul", null_frag>; + defm FTSMUL_ZZZ : sve_fp_3op_u_zd<0b011, "ftsmul", null_frag>; + defm FRECPS_ZZZ : sve_fp_3op_u_zd<0b110, "frecps", null_frag>; + defm FRSQRTS_ZZZ : sve_fp_3op_u_zd<0b111, "frsqrts", null_frag>; defm FTSSEL_ZZZ : sve_int_bin_cons_misc_0_b<"ftssel">; @@ -187,7 +187,7 @@ let Predicates = [HasSVE] in { defm FCPY_ZPmI : sve_int_dup_fpimm_pred<"fcpy">; // Splat scalar register (unpredicated, GPR or vector + element index) - defm DUP_ZR : sve_int_perm_dup_r<"dup">; + defm DUP_ZR : sve_int_perm_dup_r<"dup", AArch64dup>; defm DUP_ZZI : sve_int_perm_dup_i<"dup">; // Splat scalar register (predicated) @@ -211,13 +211,13 @@ let Predicates = [HasSVE] in { defm REV_PP : sve_int_perm_reverse_p<"rev">; defm REV_ZZ : sve_int_perm_reverse_z<"rev">; - defm SUNPKLO_ZZ : sve_int_perm_unpk<0b00, "sunpklo">; - defm SUNPKHI_ZZ : sve_int_perm_unpk<0b01, "sunpkhi">; - defm UUNPKLO_ZZ : sve_int_perm_unpk<0b10, "uunpklo">; - defm UUNPKHI_ZZ : sve_int_perm_unpk<0b11, "uunpkhi">; + defm SUNPKLO_ZZ : sve_int_perm_unpk<0b00, "sunpklo", AArch64sunpklo>; + defm SUNPKHI_ZZ : sve_int_perm_unpk<0b01, "sunpkhi", AArch64sunpkhi>; + defm UUNPKLO_ZZ : sve_int_perm_unpk<0b10, "uunpklo", AArch64uunpklo>; + defm UUNPKHI_ZZ : sve_int_perm_unpk<0b11, "uunpkhi", AArch64uunpkhi>; - def PUNPKLO_PP : sve_int_perm_punpk<0b0, "punpklo">; - def PUNPKHI_PP : sve_int_perm_punpk<0b1, "punpkhi">; + defm PUNPKLO_PP : sve_int_perm_punpk<0b0, "punpklo", int_aarch64_sve_punpklo>; + defm PUNPKHI_PP : sve_int_perm_punpk<0b1, "punpkhi", int_aarch64_sve_punpkhi>; defm MOVPRFX_ZPzZ : sve_int_movprfx_pred_zero<0b000, "movprfx">; defm MOVPRFX_ZPmZ : sve_int_movprfx_pred_merge<0b001, "movprfx">; @@ -1020,6 +1020,56 @@ let Predicates = [HasSVE] in { (FCMGT_PPzZZ_S PPR32:$Zd, PPR3bAny:$Pg, ZPR32:$Zn, ZPR32:$Zm), 0>; def : InstAlias<"fcmlt $Zd, $Pg/z, $Zm, $Zn", (FCMGT_PPzZZ_D PPR64:$Zd, PPR3bAny:$Pg, ZPR64:$Zn, ZPR64:$Zm), 0>; + + def : Pat<(nxv16i8 (bitconvert (nxv8i16 ZPR:$src))), (nxv16i8 ZPR:$src)>; + def : Pat<(nxv16i8 (bitconvert (nxv4i32 ZPR:$src))), (nxv16i8 ZPR:$src)>; + def : Pat<(nxv16i8 (bitconvert (nxv2i64 ZPR:$src))), (nxv16i8 ZPR:$src)>; + def : Pat<(nxv16i8 (bitconvert (nxv8f16 ZPR:$src))), (nxv16i8 ZPR:$src)>; + def : Pat<(nxv16i8 (bitconvert (nxv4f32 ZPR:$src))), (nxv16i8 ZPR:$src)>; + def : Pat<(nxv16i8 (bitconvert (nxv2f64 ZPR:$src))), (nxv16i8 ZPR:$src)>; + + def : Pat<(nxv8i16 (bitconvert (nxv16i8 ZPR:$src))), (nxv8i16 ZPR:$src)>; + def : Pat<(nxv8i16 (bitconvert (nxv4i32 ZPR:$src))), (nxv8i16 ZPR:$src)>; + def : Pat<(nxv8i16 (bitconvert (nxv2i64 ZPR:$src))), (nxv8i16 ZPR:$src)>; + def : Pat<(nxv8i16 (bitconvert 
(nxv8f16 ZPR:$src))), (nxv8i16 ZPR:$src)>; + def : Pat<(nxv8i16 (bitconvert (nxv4f32 ZPR:$src))), (nxv8i16 ZPR:$src)>; + def : Pat<(nxv8i16 (bitconvert (nxv2f64 ZPR:$src))), (nxv8i16 ZPR:$src)>; + + def : Pat<(nxv4i32 (bitconvert (nxv16i8 ZPR:$src))), (nxv4i32 ZPR:$src)>; + def : Pat<(nxv4i32 (bitconvert (nxv8i16 ZPR:$src))), (nxv4i32 ZPR:$src)>; + def : Pat<(nxv4i32 (bitconvert (nxv2i64 ZPR:$src))), (nxv4i32 ZPR:$src)>; + def : Pat<(nxv4i32 (bitconvert (nxv8f16 ZPR:$src))), (nxv4i32 ZPR:$src)>; + def : Pat<(nxv4i32 (bitconvert (nxv4f32 ZPR:$src))), (nxv4i32 ZPR:$src)>; + def : Pat<(nxv4i32 (bitconvert (nxv2f64 ZPR:$src))), (nxv4i32 ZPR:$src)>; + + def : Pat<(nxv2i64 (bitconvert (nxv16i8 ZPR:$src))), (nxv2i64 ZPR:$src)>; + def : Pat<(nxv2i64 (bitconvert (nxv8i16 ZPR:$src))), (nxv2i64 ZPR:$src)>; + def : Pat<(nxv2i64 (bitconvert (nxv4i32 ZPR:$src))), (nxv2i64 ZPR:$src)>; + def : Pat<(nxv2i64 (bitconvert (nxv8f16 ZPR:$src))), (nxv2i64 ZPR:$src)>; + def : Pat<(nxv2i64 (bitconvert (nxv4f32 ZPR:$src))), (nxv2i64 ZPR:$src)>; + def : Pat<(nxv2i64 (bitconvert (nxv2f64 ZPR:$src))), (nxv2i64 ZPR:$src)>; + + def : Pat<(nxv8f16 (bitconvert (nxv16i8 ZPR:$src))), (nxv8f16 ZPR:$src)>; + def : Pat<(nxv8f16 (bitconvert (nxv8i16 ZPR:$src))), (nxv8f16 ZPR:$src)>; + def : Pat<(nxv8f16 (bitconvert (nxv4i32 ZPR:$src))), (nxv8f16 ZPR:$src)>; + def : Pat<(nxv8f16 (bitconvert (nxv2i64 ZPR:$src))), (nxv8f16 ZPR:$src)>; + def : Pat<(nxv8f16 (bitconvert (nxv4f32 ZPR:$src))), (nxv8f16 ZPR:$src)>; + def : Pat<(nxv8f16 (bitconvert (nxv2f64 ZPR:$src))), (nxv8f16 ZPR:$src)>; + + def : Pat<(nxv4f32 (bitconvert (nxv16i8 ZPR:$src))), (nxv4f32 ZPR:$src)>; + def : Pat<(nxv4f32 (bitconvert (nxv8i16 ZPR:$src))), (nxv4f32 ZPR:$src)>; + def : Pat<(nxv4f32 (bitconvert (nxv4i32 ZPR:$src))), (nxv4f32 ZPR:$src)>; + def : Pat<(nxv4f32 (bitconvert (nxv2i64 ZPR:$src))), (nxv4f32 ZPR:$src)>; + def : Pat<(nxv4f32 (bitconvert (nxv8f16 ZPR:$src))), (nxv4f32 ZPR:$src)>; + def : Pat<(nxv4f32 (bitconvert (nxv2f64 ZPR:$src))), (nxv4f32 ZPR:$src)>; + + def : Pat<(nxv2f64 (bitconvert (nxv16i8 ZPR:$src))), (nxv2f64 ZPR:$src)>; + def : Pat<(nxv2f64 (bitconvert (nxv8i16 ZPR:$src))), (nxv2f64 ZPR:$src)>; + def : Pat<(nxv2f64 (bitconvert (nxv4i32 ZPR:$src))), (nxv2f64 ZPR:$src)>; + def : Pat<(nxv2f64 (bitconvert (nxv2i64 ZPR:$src))), (nxv2f64 ZPR:$src)>; + def : Pat<(nxv2f64 (bitconvert (nxv8f16 ZPR:$src))), (nxv2f64 ZPR:$src)>; + def : Pat<(nxv2f64 (bitconvert (nxv4f32 ZPR:$src))), (nxv2f64 ZPR:$src)>; + } let Predicates = [HasSVE2] in { @@ -1164,6 +1214,13 @@ let Predicates = [HasSVE2] in { defm SQRSHLR_ZPmZ : sve2_int_arith_pred<0b011100, "sqrshlr">; defm UQRSHLR_ZPmZ : sve2_int_arith_pred<0b011110, "uqrshlr">; + // SVE2 predicated shifts + defm SQSHL_ZPmI : sve_int_bin_pred_shift_imm_left< 0b0110, "sqshl">; + defm UQSHL_ZPmI : sve_int_bin_pred_shift_imm_left< 0b0111, "uqshl">; + defm SRSHR_ZPmI : sve_int_bin_pred_shift_imm_right<0b1100, "srshr">; + defm URSHR_ZPmI : sve_int_bin_pred_shift_imm_right<0b1101, "urshr">; + defm SQSHLU_ZPmI : sve_int_bin_pred_shift_imm_left< 0b1111, "sqshlu">; + // SVE2 integer add/subtract long defm SADDLB_ZZZ : sve2_wide_int_arith_long<0b00000, "saddlb">; defm SADDLT_ZZZ : sve2_wide_int_arith_long<0b00001, "saddlt">; @@ -1199,14 +1256,14 @@ let Predicates = [HasSVE2] in { defm PMULLT_ZZZ : sve2_pmul_long<0b1, "pmullt">; // SVE2 bitwise shift and insert - defm SRI_ZZI : sve2_int_bin_cons_shift_imm_right<0b0, "sri">; - defm SLI_ZZI : sve2_int_bin_cons_shift_imm_left< 0b1, "sli">; + defm SRI_ZZI : 
sve2_int_bin_shift_imm_right<0b0, "sri">; + defm SLI_ZZI : sve2_int_bin_shift_imm_left< 0b1, "sli">; // SVE2 bitwise shift right and accumulate - defm SSRA_ZZI : sve2_int_bin_accum_cons_shift_imm_right<0b00, "ssra">; - defm USRA_ZZI : sve2_int_bin_accum_cons_shift_imm_right<0b01, "usra">; - defm SRSRA_ZZI : sve2_int_bin_accum_cons_shift_imm_right<0b10, "srsra">; - defm URSRA_ZZI : sve2_int_bin_accum_cons_shift_imm_right<0b11, "ursra">; + defm SSRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b00, "ssra">; + defm USRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b01, "usra">; + defm SRSRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b10, "srsra">; + defm URSRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b11, "ursra">; // SVE2 complex integer add defm CADD_ZZI : sve2_int_cadd<0b0, "cadd">; @@ -1228,41 +1285,47 @@ let Predicates = [HasSVE2] in { defm SBCLB_ZZZ : sve2_int_addsub_long_carry<0b10, "sbclb">; defm SBCLT_ZZZ : sve2_int_addsub_long_carry<0b11, "sbclt">; - // SVE2 bitwise shift right narrow - defm SQSHRUNB_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b0000, "sqshrunb">; - defm SQSHRUNT_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b0001, "sqshrunt">; - defm SQRSHRUNB_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b0010, "sqrshrunb">; - defm SQRSHRUNT_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b0011, "sqrshrunt">; - defm SHRNB_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b0100, "shrnb">; - defm SHRNT_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b0101, "shrnt">; - defm RSHRNB_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b0110, "rshrnb">; - defm RSHRNT_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b0111, "rshrnt">; - defm SQSHRNB_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b1000, "sqshrnb">; - defm SQSHRNT_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b1001, "sqshrnt">; - defm SQRSHRNB_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b1010, "sqrshrnb">; - defm SQRSHRNT_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b1011, "sqrshrnt">; - defm UQSHRNB_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b1100, "uqshrnb">; - defm UQSHRNT_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b1101, "uqshrnt">; - defm UQRSHRNB_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b1110, "uqrshrnb">; - defm UQRSHRNT_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b1111, "uqrshrnt">; - - // SVE2 integer add/subtract narrow high part - defm ADDHNB_ZZZ : sve2_int_addsub_narrow_high<0b000, "addhnb">; - defm ADDHNT_ZZZ : sve2_int_addsub_narrow_high<0b001, "addhnt">; - defm RADDHNB_ZZZ : sve2_int_addsub_narrow_high<0b010, "raddhnb">; - defm RADDHNT_ZZZ : sve2_int_addsub_narrow_high<0b011, "raddhnt">; - defm SUBHNB_ZZZ : sve2_int_addsub_narrow_high<0b100, "subhnb">; - defm SUBHNT_ZZZ : sve2_int_addsub_narrow_high<0b101, "subhnt">; - defm RSUBHNB_ZZZ : sve2_int_addsub_narrow_high<0b110, "rsubhnb">; - defm RSUBHNT_ZZZ : sve2_int_addsub_narrow_high<0b111, "rsubhnt">; - - // SVE2 saturating extract narrow - defm SQXTNB_ZZ : sve2_int_sat_extract_narrow<0b000, "sqxtnb">; - defm SQXTNT_ZZ : sve2_int_sat_extract_narrow<0b001, "sqxtnt">; - defm UQXTNB_ZZ : sve2_int_sat_extract_narrow<0b010, "uqxtnb">; - defm UQXTNT_ZZ : sve2_int_sat_extract_narrow<0b011, "uqxtnt">; - defm SQXTUNB_ZZ : sve2_int_sat_extract_narrow<0b100, "sqxtunb">; - defm SQXTUNT_ZZ : sve2_int_sat_extract_narrow<0b101, "sqxtunt">; + // SVE2 bitwise shift right narrow (bottom) + defm SQSHRUNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b000, "sqshrunb">; + defm SQRSHRUNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b001, 
"sqrshrunb">; + defm SHRNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b010, "shrnb">; + defm RSHRNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b011, "rshrnb">; + defm SQSHRNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b100, "sqshrnb">; + defm SQRSHRNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b101, "sqrshrnb">; + defm UQSHRNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b110, "uqshrnb">; + defm UQRSHRNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b111, "uqrshrnb">; + + // SVE2 bitwise shift right narrow (top) + defm SQSHRUNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b000, "sqshrunt">; + defm SQRSHRUNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b001, "sqrshrunt">; + defm SHRNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b010, "shrnt">; + defm RSHRNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b011, "rshrnt">; + defm SQSHRNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b100, "sqshrnt">; + defm SQRSHRNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b101, "sqrshrnt">; + defm UQSHRNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b110, "uqshrnt">; + defm UQRSHRNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b111, "uqrshrnt">; + + // SVE2 integer add/subtract narrow high part (bottom) + defm ADDHNB_ZZZ : sve2_int_addsub_narrow_high_bottom<0b00, "addhnb">; + defm RADDHNB_ZZZ : sve2_int_addsub_narrow_high_bottom<0b01, "raddhnb">; + defm SUBHNB_ZZZ : sve2_int_addsub_narrow_high_bottom<0b10, "subhnb">; + defm RSUBHNB_ZZZ : sve2_int_addsub_narrow_high_bottom<0b11, "rsubhnb">; + + // SVE2 integer add/subtract narrow high part (top) + defm ADDHNT_ZZZ : sve2_int_addsub_narrow_high_top<0b00, "addhnt">; + defm RADDHNT_ZZZ : sve2_int_addsub_narrow_high_top<0b01, "raddhnt">; + defm SUBHNT_ZZZ : sve2_int_addsub_narrow_high_top<0b10, "subhnt">; + defm RSUBHNT_ZZZ : sve2_int_addsub_narrow_high_top<0b11, "rsubhnt">; + + // SVE2 saturating extract narrow (bottom) + defm SQXTNB_ZZ : sve2_int_sat_extract_narrow_bottom<0b00, "sqxtnb">; + defm UQXTNB_ZZ : sve2_int_sat_extract_narrow_bottom<0b01, "uqxtnb">; + defm SQXTUNB_ZZ : sve2_int_sat_extract_narrow_bottom<0b10, "sqxtunb">; + + // SVE2 saturating extract narrow (top) + defm SQXTNT_ZZ : sve2_int_sat_extract_narrow_top<0b00, "sqxtnt">; + defm UQXTNT_ZZ : sve2_int_sat_extract_narrow_top<0b01, "uqxtnt">; + defm SQXTUNT_ZZ : sve2_int_sat_extract_narrow_top<0b10, "sqxtunt">; // SVE2 character match defm MATCH_PPzZZ : sve2_char_match<0b0, "match">; @@ -1289,10 +1352,14 @@ let Predicates = [HasSVE2] in { // SVE2 histogram generation (vector) defm HISTCNT_ZPzZZ : sve2_hist_gen_vector<"histcnt">; + // SVE2 floating-point base 2 logarithm as integer + defm FLOGB_ZPmZ : sve2_fp_flogb<"flogb">; + // SVE2 floating-point convert precision defm FCVTXNT_ZPmZ : sve2_fp_convert_down_odd_rounding<"fcvtxnt">; defm FCVTNT_ZPmZ : sve2_fp_convert_down_narrow<"fcvtnt">; defm FCVTLT_ZPmZ : sve2_fp_convert_up_long<"fcvtlt">; + def FCVTX_ZPmZ_DtoS : sve_fp_2op_p_zd<0b0001010, "fcvtx", ZPR64, ZPR32, ElementSizeD>; // SVE2 floating-point pairwise operations defm FADDP_ZPmZZ : sve2_fp_pairwise_pred<0b000, "faddp">; @@ -1321,58 +1388,45 @@ let Predicates = [HasSVE2] in { def BSL2N_ZZZZ_D : sve2_int_bitwise_ternary_op_d<0b101, "bsl2n">; def NBSL_ZZZZ_D : sve2_int_bitwise_ternary_op_d<0b111, "nbsl">; - // sve_int_rotate_imm + // SVE2 bitwise xor and rotate right by immediate defm XAR_ZZZI : sve2_int_rotate_right_imm<"xar">; // SVE2 extract vector (immediate offset, constructive) def EXT_ZZI_B : sve2_int_perm_extract_i_cons<"ext">; - // 
SVE floating-point convert precision - def FCVTX_ZPmZ_DtoS : sve_fp_2op_p_zd<0b0001010, "fcvtx", ZPR64, ZPR32, ElementSizeD>; - - // SVE floating-point convert to integer - defm FLOGB_ZPmZ : sve2_fp_flogb<"flogb">; - - // Non-temporal contiguous loads (vector + register) - defm LDNT1SB_ZZR_S : sve2_mem_cldnt_vs<0b00000, "ldnt1sb", Z_s, ZPR32>; - defm LDNT1B_ZZR_S : sve2_mem_cldnt_vs<0b00001, "ldnt1b", Z_s, ZPR32>; - defm LDNT1SH_ZZR_S : sve2_mem_cldnt_vs<0b00100, "ldnt1sh", Z_s, ZPR32>; - defm LDNT1H_ZZR_S : sve2_mem_cldnt_vs<0b00101, "ldnt1h", Z_s, ZPR32>; - defm LDNT1W_ZZR_S : sve2_mem_cldnt_vs<0b01001, "ldnt1w", Z_s, ZPR32>; - - defm LDNT1SB_ZZR_D : sve2_mem_cldnt_vs<0b10000, "ldnt1sb", Z_d, ZPR64>; - defm LDNT1B_ZZR_D : sve2_mem_cldnt_vs<0b10010, "ldnt1b", Z_d, ZPR64>; - defm LDNT1SH_ZZR_D : sve2_mem_cldnt_vs<0b10100, "ldnt1sh", Z_d, ZPR64>; - defm LDNT1H_ZZR_D : sve2_mem_cldnt_vs<0b10110, "ldnt1h", Z_d, ZPR64>; - defm LDNT1SW_ZZR_D : sve2_mem_cldnt_vs<0b11000, "ldnt1sw", Z_d, ZPR64>; - defm LDNT1W_ZZR_D : sve2_mem_cldnt_vs<0b11010, "ldnt1w", Z_d, ZPR64>; - defm LDNT1D_ZZR_D : sve2_mem_cldnt_vs<0b11110, "ldnt1d", Z_d, ZPR64>; + // SVE2 non-temporal gather loads + defm LDNT1SB_ZZR_S : sve2_mem_gldnt_vs<0b00000, "ldnt1sb", Z_s, ZPR32>; + defm LDNT1B_ZZR_S : sve2_mem_gldnt_vs<0b00001, "ldnt1b", Z_s, ZPR32>; + defm LDNT1SH_ZZR_S : sve2_mem_gldnt_vs<0b00100, "ldnt1sh", Z_s, ZPR32>; + defm LDNT1H_ZZR_S : sve2_mem_gldnt_vs<0b00101, "ldnt1h", Z_s, ZPR32>; + defm LDNT1W_ZZR_S : sve2_mem_gldnt_vs<0b01001, "ldnt1w", Z_s, ZPR32>; + + defm LDNT1SB_ZZR_D : sve2_mem_gldnt_vs<0b10000, "ldnt1sb", Z_d, ZPR64>; + defm LDNT1B_ZZR_D : sve2_mem_gldnt_vs<0b10010, "ldnt1b", Z_d, ZPR64>; + defm LDNT1SH_ZZR_D : sve2_mem_gldnt_vs<0b10100, "ldnt1sh", Z_d, ZPR64>; + defm LDNT1H_ZZR_D : sve2_mem_gldnt_vs<0b10110, "ldnt1h", Z_d, ZPR64>; + defm LDNT1SW_ZZR_D : sve2_mem_gldnt_vs<0b11000, "ldnt1sw", Z_d, ZPR64>; + defm LDNT1W_ZZR_D : sve2_mem_gldnt_vs<0b11010, "ldnt1w", Z_d, ZPR64>; + defm LDNT1D_ZZR_D : sve2_mem_gldnt_vs<0b11110, "ldnt1d", Z_d, ZPR64>; // SVE2 vector splice (constructive) defm SPLICE_ZPZZ : sve2_int_perm_splice_cons<"splice">; - // Predicated shifts - defm SQSHL_ZPmI : sve_int_bin_pred_shift_imm_left< 0b0110, "sqshl">; - defm UQSHL_ZPmI : sve_int_bin_pred_shift_imm_left< 0b0111, "uqshl">; - defm SRSHR_ZPmI : sve_int_bin_pred_shift_imm_right<0b1100, "srshr">; - defm URSHR_ZPmI : sve_int_bin_pred_shift_imm_right<0b1101, "urshr">; - defm SQSHLU_ZPmI : sve_int_bin_pred_shift_imm_left< 0b1111, "sqshlu">; - - // Non-temporal contiguous stores (vector + register) - defm STNT1B_ZZR_S : sve2_mem_cstnt_vs<0b001, "stnt1b", Z_s, ZPR32>; - defm STNT1H_ZZR_S : sve2_mem_cstnt_vs<0b011, "stnt1h", Z_s, ZPR32>; - defm STNT1W_ZZR_S : sve2_mem_cstnt_vs<0b101, "stnt1w", Z_s, ZPR32>; + // SVE2 non-temporal scatter stores + defm STNT1B_ZZR_S : sve2_mem_sstnt_vs<0b001, "stnt1b", Z_s, ZPR32>; + defm STNT1H_ZZR_S : sve2_mem_sstnt_vs<0b011, "stnt1h", Z_s, ZPR32>; + defm STNT1W_ZZR_S : sve2_mem_sstnt_vs<0b101, "stnt1w", Z_s, ZPR32>; - defm STNT1B_ZZR_D : sve2_mem_cstnt_vs<0b000, "stnt1b", Z_d, ZPR64>; - defm STNT1H_ZZR_D : sve2_mem_cstnt_vs<0b010, "stnt1h", Z_d, ZPR64>; - defm STNT1W_ZZR_D : sve2_mem_cstnt_vs<0b100, "stnt1w", Z_d, ZPR64>; - defm STNT1D_ZZR_D : sve2_mem_cstnt_vs<0b110, "stnt1d", Z_d, ZPR64>; + defm STNT1B_ZZR_D : sve2_mem_sstnt_vs<0b000, "stnt1b", Z_d, ZPR64>; + defm STNT1H_ZZR_D : sve2_mem_sstnt_vs<0b010, "stnt1h", Z_d, ZPR64>; + defm STNT1W_ZZR_D : sve2_mem_sstnt_vs<0b100, "stnt1w", Z_d, ZPR64>; + defm 
STNT1D_ZZR_D : sve2_mem_sstnt_vs<0b110, "stnt1d", Z_d, ZPR64>; - // SVE table lookup (three sources) + // SVE2 table lookup (three sources) defm TBL_ZZZZ : sve2_int_perm_tbl<"tbl">; defm TBX_ZZZ : sve2_int_perm_tbx<"tbx">; - // SVE integer compare scalar count and limit + // SVE2 integer compare scalar count and limit defm WHILEGE_PWW : sve_int_while4_rr<0b000, "whilege">; defm WHILEGT_PWW : sve_int_while4_rr<0b001, "whilegt">; defm WHILEHS_PWW : sve_int_while4_rr<0b100, "whilehs">; @@ -1383,7 +1437,7 @@ let Predicates = [HasSVE2] in { defm WHILEHS_PXX : sve_int_while8_rr<0b100, "whilehs">; defm WHILEHI_PXX : sve_int_while8_rr<0b101, "whilehi">; - // SVE pointer conflict compare + // SVE2 pointer conflict compare defm WHILEWR_PXX : sve2_int_while_rr<0b0, "whilewr">; defm WHILERW_PXX : sve2_int_while_rr<0b1, "whilerw">; } diff --git a/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp index 60dbace03ca6..ba61ed726e84 100644 --- a/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp +++ b/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp @@ -32,7 +32,7 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemset( const AArch64TargetLowering &TLI = *STI.getTargetLowering(); EVT IntPtr = TLI.getPointerTy(DAG.getDataLayout()); - Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(*DAG.getContext()); + Type *IntPtrTy = Type::getInt8PtrTy(*DAG.getContext()); TargetLowering::ArgListTy Args; TargetLowering::ArgListEntry Entry; Entry.Node = Dst; diff --git a/lib/Target/AArch64/AArch64SpeculationHardening.cpp b/lib/Target/AArch64/AArch64SpeculationHardening.cpp index 3087e6ce441d..7307961ddb5f 100644 --- a/lib/Target/AArch64/AArch64SpeculationHardening.cpp +++ b/lib/Target/AArch64/AArch64SpeculationHardening.cpp @@ -106,6 +106,7 @@ #include "llvm/IR/DebugLoc.h" #include "llvm/Pass.h" #include "llvm/Support/CodeGen.h" +#include "llvm/Support/Debug.h" #include "llvm/Target/TargetMachine.h" #include <cassert> @@ -115,9 +116,9 @@ using namespace llvm; #define AARCH64_SPECULATION_HARDENING_NAME "AArch64 speculation hardening pass" -cl::opt<bool> HardenLoads("aarch64-slh-loads", cl::Hidden, - cl::desc("Sanitize loads from memory."), - cl::init(true)); +static cl::opt<bool> HardenLoads("aarch64-slh-loads", cl::Hidden, + cl::desc("Sanitize loads from memory."), + cl::init(true)); namespace { @@ -521,7 +522,7 @@ bool AArch64SpeculationHardening::slhLoads(MachineBasicBlock &MBB) { for (auto Use : MI.uses()) { if (!Use.isReg()) continue; - unsigned Reg = Use.getReg(); + Register Reg = Use.getReg(); // Some loads of floating point data have implicit defs/uses on a // super register of that floating point data. Some examples: // $s0 = LDRSui $sp, 22, implicit-def $q0 @@ -561,8 +562,8 @@ bool AArch64SpeculationHardening::expandSpeculationSafeValue( // miss-speculation isn't happening because we're already inserting barriers // to guarantee that. if (!UseControlFlowSpeculationBarrier && !UsesFullSpeculationBarrier) { - unsigned DstReg = MI.getOperand(0).getReg(); - unsigned SrcReg = MI.getOperand(1).getReg(); + Register DstReg = MI.getOperand(0).getReg(); + Register SrcReg = MI.getOperand(1).getReg(); // Mark this register and all its aliasing registers as needing to be // value speculation hardened before its next use, by using a CSDB // barrier instruction. 
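[Editor's note] A pattern running through several files in this diff (the PBQP allocator, SIMDInstrOpt, and the speculation hardening pass above) is the mechanical migration from unsigned to the Register wrapper. A toy version showing why the static predicates need no TargetRegisterInfo instance and why the rename is source-compatible; the bit layout is loosely modeled on LLVM's encoding, not an exact copy:

#include <cstdint>
#include <cstdio>

// Toy Register: 0 means "no register", small positive values are physical,
// and values with the top bit set are virtual. The real class also
// reserves a range for stack slots.
struct Register {
  uint32_t Id;
  Register(uint32_t R = 0) : Id(R) {}
  operator uint32_t() const { return Id; } // unsigned-based callers keep compiling
  static bool isVirtualRegister(uint32_t R) { return R & (1u << 31); }
  static bool isPhysicalRegister(uint32_t R) { return R && !isVirtualRegister(R); }
};

int main() {
  Register Virt((1u << 31) | 5), Phys(3);
  printf("virt=%d phys=%d\n", Register::isVirtualRegister(Virt),
         Register::isPhysicalRegister(Phys));
}

The implicit conversion back to uint32_t is what makes the one-line replacements in the hunks above safe to apply without touching the surrounding code.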
diff --git a/lib/Target/AArch64/AArch64StackOffset.h b/lib/Target/AArch64/AArch64StackOffset.h new file mode 100644 index 000000000000..13f12a6c9c30 --- /dev/null +++ b/lib/Target/AArch64/AArch64StackOffset.h @@ -0,0 +1,138 @@ +//==--AArch64StackOffset.h ---------------------------------------*- C++ -*-==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains the declaration of the StackOffset class, which is used to +// describe scalable and non-scalable offsets during frame lowering. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64STACKOFFSET_H +#define LLVM_LIB_TARGET_AARCH64_AARCH64STACKOFFSET_H + +#include "llvm/Support/MachineValueType.h" + +namespace llvm { + +/// StackOffset is a wrapper around scalable and non-scalable offsets and is +/// used in several functions such as 'isAArch64FrameOffsetLegal' and +/// 'emitFrameOffset()'. StackOffsets are described by MVTs, e.g. +// +/// StackOffset(1, MVT::nxv16i8) +// +/// would describe an offset as being the size of a single SVE vector. +/// +/// The class also implements simple arithmetic (addition/subtraction) on these +/// offsets, e.g. +// +/// StackOffset(1, MVT::nxv16i8) + StackOffset(1, MVT::i64) +// +/// describes an offset that spans the combined storage required for an SVE +/// vector and a 64bit GPR. +class StackOffset { + int64_t Bytes; + int64_t ScalableBytes; + + explicit operator int() const; + +public: + using Part = std::pair<int64_t, MVT>; + + StackOffset() : Bytes(0), ScalableBytes(0) {} + + StackOffset(int64_t Offset, MVT::SimpleValueType T) : StackOffset() { + assert(MVT(T).getSizeInBits() % 8 == 0 && + "Offset type is not a multiple of bytes"); + *this += Part(Offset, T); + } + + StackOffset(const StackOffset &Other) + : Bytes(Other.Bytes), ScalableBytes(Other.ScalableBytes) {} + + StackOffset &operator=(const StackOffset &) = default; + + StackOffset &operator+=(const StackOffset::Part &Other) { + int64_t OffsetInBytes = Other.first * (Other.second.getSizeInBits() / 8); + if (Other.second.isScalableVector()) + ScalableBytes += OffsetInBytes; + else + Bytes += OffsetInBytes; + return *this; + } + + StackOffset &operator+=(const StackOffset &Other) { + Bytes += Other.Bytes; + ScalableBytes += Other.ScalableBytes; + return *this; + } + + StackOffset operator+(const StackOffset &Other) const { + StackOffset Res(*this); + Res += Other; + return Res; + } + + StackOffset &operator-=(const StackOffset &Other) { + Bytes -= Other.Bytes; + ScalableBytes -= Other.ScalableBytes; + return *this; + } + + StackOffset operator-(const StackOffset &Other) const { + StackOffset Res(*this); + Res -= Other; + return Res; + } + + StackOffset operator-() const { + StackOffset Res = {}; + const StackOffset Other(*this); + Res -= Other; + return Res; + } + + /// Returns the scalable part of the offset in bytes. + int64_t getScalableBytes() const { return ScalableBytes; } + + /// Returns the non-scalable part of the offset in bytes. + int64_t getBytes() const { return Bytes; } + + /// Returns the offset in parts to which this frame offset can be + /// decomposed for the purpose of describing a frame offset. + /// For non-scalable offsets this is simply its byte size. 
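[Editor's note] The central idea of the StackOffset class above is that fixed and scalable bytes are tracked separately and only combined once the hardware vector length is known. A stripped-down, runnable analogue (struct and variable names, and the example vscale value, are for illustration only):

#include <cstdio>

struct Offset {
  long Bytes = 0;          // fixed part
  long ScalableBytes = 0;  // multiplied by vscale at runtime
};

static Offset operator+(Offset A, Offset B) {
  return {A.Bytes + B.Bytes, A.ScalableBytes + B.ScalableBytes};
}

int main() {
  Offset SVEVector{0, 16}; // StackOffset(1, MVT::nxv16i8)
  Offset GPRSlot{8, 0};    // StackOffset(1, MVT::i64)
  Offset Total = SVEVector + GPRSlot;
  long VScale = 4;         // e.g. 512-bit SVE registers
  printf("%ld + %ld*vscale = %ld bytes\n", Total.Bytes, Total.ScalableBytes,
         Total.Bytes + Total.ScalableBytes * VScale);
}

The decomposition helper that turns the scalable part into ADDVL/ADDPL counts continues in the header below.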
+ void getForFrameOffset(int64_t &NumBytes, int64_t &NumPredicateVectors, + int64_t &NumDataVectors) const { + assert(isValid() && "Invalid frame offset"); + + NumBytes = Bytes; + NumDataVectors = 0; + NumPredicateVectors = ScalableBytes / 2; + // This method is used to get the offsets to adjust the frame offset. + // If the function requires ADDPL to be used and needs more than two ADDPL + // instructions, part of the offset is folded into NumDataVectors so that it + // uses ADDVL for part of it, reducing the number of ADDPL instructions. + if (NumPredicateVectors % 8 == 0 || NumPredicateVectors < -64 || + NumPredicateVectors > 62) { + NumDataVectors = NumPredicateVectors / 8; + NumPredicateVectors -= NumDataVectors * 8; + } + } + + /// Returns whether the offset is known zero. + explicit operator bool() const { return Bytes || ScalableBytes; } + + bool isValid() const { + // The smallest scalable element supported by scaled SVE addressing + // modes are predicates, which are 2 scalable bytes in size. So the scalable + // byte offset must always be a multiple of 2. + return ScalableBytes % 2 == 0; + } +}; + +} // end namespace llvm + +#endif diff --git a/lib/Target/AArch64/AArch64StackTagging.cpp b/lib/Target/AArch64/AArch64StackTagging.cpp index 6e99c48bf1d7..e6dbe01d3807 100644 --- a/lib/Target/AArch64/AArch64StackTagging.cpp +++ b/lib/Target/AArch64/AArch64StackTagging.cpp @@ -19,6 +19,7 @@ #include "llvm/ADT/Optional.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/CFG.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" @@ -55,9 +56,215 @@ using namespace llvm; #define DEBUG_TYPE "stack-tagging" -static constexpr unsigned kTagGranuleSize = 16; +static cl::opt<bool> ClMergeInit( + "stack-tagging-merge-init", cl::Hidden, cl::init(true), cl::ZeroOrMore, + cl::desc("merge stack variable initializers with tagging when possible")); + +static cl::opt<unsigned> ClScanLimit("stack-tagging-merge-init-scan-limit", + cl::init(40), cl::Hidden); + +static const Align kTagGranuleSize = Align(16); namespace { + +class InitializerBuilder { + uint64_t Size; + const DataLayout *DL; + Value *BasePtr; + Function *SetTagFn; + Function *SetTagZeroFn; + Function *StgpFn; + + // List of initializers sorted by start offset. + struct Range { + uint64_t Start, End; + Instruction *Inst; + }; + SmallVector<Range, 4> Ranges; + // 8-aligned offset => 8-byte initializer + // Missing keys are zero initialized. + std::map<uint64_t, Value *> Out; + +public: + InitializerBuilder(uint64_t Size, const DataLayout *DL, Value *BasePtr, + Function *SetTagFn, Function *SetTagZeroFn, + Function *StgpFn) + : Size(Size), DL(DL), BasePtr(BasePtr), SetTagFn(SetTagFn), + SetTagZeroFn(SetTagZeroFn), StgpFn(StgpFn) {} + + bool addRange(uint64_t Start, uint64_t End, Instruction *Inst) { + auto I = std::lower_bound( + Ranges.begin(), Ranges.end(), Start, + [](const Range &LHS, uint64_t RHS) { return LHS.End <= RHS; }); + if (I != Ranges.end() && End > I->Start) { + // Overlap - bail. 
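[Editor's note] The getForFrameOffset logic above is pure integer arithmetic and can be checked standalone. Predicates are 2 scalable bytes and data vectors are 16, i.e. one data vector equals 8 predicate registers; ADDPL takes a signed 6-bit immediate, so two ADDPLs cover [-64, 62] predicate increments, and anything outside that range (or an exact multiple of 8) is routed through ADDVL:

#include <cstdio>

// Mirrors StackOffset::getForFrameOffset for the scalable part.
static void splitScalable(long ScalableBytes, long &NumPredicates,
                          long &NumDataVectors) {
  NumDataVectors = 0;
  NumPredicates = ScalableBytes / 2;
  if (NumPredicates % 8 == 0 || NumPredicates < -64 || NumPredicates > 62) {
    NumDataVectors = NumPredicates / 8;
    NumPredicates -= NumDataVectors * 8;
  }
}

int main() {
  long P, D;
  splitScalable(144, P, D); // 72 predicates -> ADDVL #9, no ADDPL
  printf("ADDVL #%ld, ADDPL #%ld\n", D, P);
  splitScalable(150, P, D); // 75 predicates -> ADDVL #9 + ADDPL #3
  printf("ADDVL #%ld, ADDPL #%ld\n", D, P);
}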
+ return false; + } + Ranges.insert(I, {Start, End, Inst}); + return true; + } + + bool addStore(uint64_t Offset, StoreInst *SI, const DataLayout *DL) { + int64_t StoreSize = DL->getTypeStoreSize(SI->getOperand(0)->getType()); + if (!addRange(Offset, Offset + StoreSize, SI)) + return false; + IRBuilder<> IRB(SI); + applyStore(IRB, Offset, Offset + StoreSize, SI->getOperand(0)); + return true; + } + + bool addMemSet(uint64_t Offset, MemSetInst *MSI) { + uint64_t StoreSize = cast<ConstantInt>(MSI->getLength())->getZExtValue(); + if (!addRange(Offset, Offset + StoreSize, MSI)) + return false; + IRBuilder<> IRB(MSI); + applyMemSet(IRB, Offset, Offset + StoreSize, + cast<ConstantInt>(MSI->getValue())); + return true; + } + + void applyMemSet(IRBuilder<> &IRB, int64_t Start, int64_t End, + ConstantInt *V) { + // Out[] does not distinguish between zero and undef, and we already know + // that this memset does not overlap with any other initializer. Nothing to + // do for memset(0). + if (V->isZero()) + return; + for (int64_t Offset = Start - Start % 8; Offset < End; Offset += 8) { + uint64_t Cst = 0x0101010101010101UL; + int LowBits = Offset < Start ? (Start - Offset) * 8 : 0; + if (LowBits) + Cst = (Cst >> LowBits) << LowBits; + int HighBits = End - Offset < 8 ? (8 - (End - Offset)) * 8 : 0; + if (HighBits) + Cst = (Cst << HighBits) >> HighBits; + ConstantInt *C = + ConstantInt::get(IRB.getInt64Ty(), Cst * V->getZExtValue()); + + Value *&CurrentV = Out[Offset]; + if (!CurrentV) { + CurrentV = C; + } else { + CurrentV = IRB.CreateOr(CurrentV, C); + } + } + } + + // Take a 64-bit slice of the value starting at the given offset (in bytes). + // Offset can be negative. Pad with zeroes on both sides when necessary. + Value *sliceValue(IRBuilder<> &IRB, Value *V, int64_t Offset) { + if (Offset > 0) { + V = IRB.CreateLShr(V, Offset * 8); + V = IRB.CreateZExtOrTrunc(V, IRB.getInt64Ty()); + } else if (Offset < 0) { + V = IRB.CreateZExtOrTrunc(V, IRB.getInt64Ty()); + V = IRB.CreateShl(V, -Offset * 8); + } else { + V = IRB.CreateZExtOrTrunc(V, IRB.getInt64Ty()); + } + return V; + } + + void applyStore(IRBuilder<> &IRB, int64_t Start, int64_t End, + Value *StoredValue) { + StoredValue = flatten(IRB, StoredValue); + for (int64_t Offset = Start - Start % 8; Offset < End; Offset += 8) { + Value *V = sliceValue(IRB, StoredValue, Offset - Start); + Value *&CurrentV = Out[Offset]; + if (!CurrentV) { + CurrentV = V; + } else { + CurrentV = IRB.CreateOr(CurrentV, V); + } + } + } + + void generate(IRBuilder<> &IRB) { + LLVM_DEBUG(dbgs() << "Combined initializer\n"); + // No initializers => the entire allocation is undef. + if (Ranges.empty()) { + emitUndef(IRB, 0, Size); + return; + } + + // Look through 8-byte initializer list 16 bytes at a time; + // If one of the two 8-byte halfs is non-zero non-undef, emit STGP. + // Otherwise, emit zeroes up to next available item. + uint64_t LastOffset = 0; + for (uint64_t Offset = 0; Offset < Size; Offset += 16) { + auto I1 = Out.find(Offset); + auto I2 = Out.find(Offset + 8); + if (I1 == Out.end() && I2 == Out.end()) + continue; + + if (Offset > LastOffset) + emitZeroes(IRB, LastOffset, Offset - LastOffset); + + Value *Store1 = I1 == Out.end() ? Constant::getNullValue(IRB.getInt64Ty()) + : I1->second; + Value *Store2 = I2 == Out.end() ? Constant::getNullValue(IRB.getInt64Ty()) + : I2->second; + emitPair(IRB, Offset, Store1, Store2); + LastOffset = Offset + 16; + } + + // memset(0) does not update Out[], therefore the tail can be either undef + // or zero. 
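[Editor's note] The bit manipulation in applyMemSet above is compact enough to verify in isolation: splat the memset byte across a 64-bit word, then clear the byte lanes outside the covered range (little-endian lane order, matching the pass's stated endianness assumption):

#include <cstdint>
#include <cstdio>

static uint64_t memsetWord(int64_t Offset, int64_t Start, int64_t End, uint8_t V) {
  uint64_t Cst = 0x0101010101010101ULL;
  int LowBits = Offset < Start ? (Start - Offset) * 8 : 0;
  if (LowBits)
    Cst = (Cst >> LowBits) << LowBits;   // zero the byte lanes before Start
  int HighBits = End - Offset < 8 ? (8 - (End - Offset)) * 8 : 0;
  if (HighBits)
    Cst = (Cst << HighBits) >> HighBits; // zero the byte lanes at/after End
  return Cst * V;                        // no carries: each lane is 0x01 * V
}

int main() {
  // memset(0xff) covering bytes [2, 6) of the 8-byte word at offset 0.
  printf("%016llx\n", (unsigned long long)memsetWord(0, 2, 6, 0xff));
  // Prints 0000ffffffff0000: little-endian lanes 2..5 are set.
}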
+ if (LastOffset < Size) + emitZeroes(IRB, LastOffset, Size - LastOffset); + + for (const auto &R : Ranges) { + R.Inst->eraseFromParent(); + } + } + + void emitZeroes(IRBuilder<> &IRB, uint64_t Offset, uint64_t Size) { + LLVM_DEBUG(dbgs() << " [" << Offset << ", " << Offset + Size + << ") zero\n"); + Value *Ptr = BasePtr; + if (Offset) + Ptr = IRB.CreateConstGEP1_32(Ptr, Offset); + IRB.CreateCall(SetTagZeroFn, + {Ptr, ConstantInt::get(IRB.getInt64Ty(), Size)}); + } + + void emitUndef(IRBuilder<> &IRB, uint64_t Offset, uint64_t Size) { + LLVM_DEBUG(dbgs() << " [" << Offset << ", " << Offset + Size + << ") undef\n"); + Value *Ptr = BasePtr; + if (Offset) + Ptr = IRB.CreateConstGEP1_32(Ptr, Offset); + IRB.CreateCall(SetTagFn, {Ptr, ConstantInt::get(IRB.getInt64Ty(), Size)}); + } + + void emitPair(IRBuilder<> &IRB, uint64_t Offset, Value *A, Value *B) { + LLVM_DEBUG(dbgs() << " [" << Offset << ", " << Offset + 16 << "):\n"); + LLVM_DEBUG(dbgs() << " " << *A << "\n " << *B << "\n"); + Value *Ptr = BasePtr; + if (Offset) + Ptr = IRB.CreateConstGEP1_32(Ptr, Offset); + IRB.CreateCall(StgpFn, {Ptr, A, B}); + } + + Value *flatten(IRBuilder<> &IRB, Value *V) { + if (V->getType()->isIntegerTy()) + return V; + // vector of pointers -> vector of ints + if (VectorType *VecTy = dyn_cast<VectorType>(V->getType())) { + LLVMContext &Ctx = IRB.getContext(); + Type *EltTy = VecTy->getElementType(); + if (EltTy->isPointerTy()) { + uint32_t EltSize = DL->getTypeSizeInBits(EltTy); + Type *NewTy = VectorType::get(IntegerType::get(Ctx, EltSize), + VecTy->getNumElements()); + V = IRB.CreatePointerCast(V, NewTy); + } + } + return IRB.CreateBitOrPointerCast( + V, IRB.getIntNTy(DL->getTypeStoreSize(V->getType()) * 8)); + } +}; + class AArch64StackTagging : public FunctionPass { struct AllocaInfo { AllocaInst *AI; @@ -67,10 +274,15 @@ class AArch64StackTagging : public FunctionPass { int Tag; // -1 for non-tagged allocations }; + bool MergeInit; + public: static char ID; // Pass ID, replacement for typeid - AArch64StackTagging() : FunctionPass(ID) { + AArch64StackTagging(bool MergeInit = true) + : FunctionPass(ID), + MergeInit(ClMergeInit.getNumOccurrences() > 0 ? 
ClMergeInit + : MergeInit) { initializeAArch64StackTaggingPass(*PassRegistry::getPassRegistry()); } @@ -81,6 +293,9 @@ public: uint64_t Size); void untagAlloca(AllocaInst *AI, Instruction *InsertBefore, uint64_t Size); + Instruction *collectInitializers(Instruction *StartInst, Value *StartPtr, + uint64_t Size, InitializerBuilder &IB); + Instruction * insertBaseTaggedPointer(const MapVector<AllocaInst *, AllocaInfo> &Allocas, const DominatorTree *DT); @@ -92,9 +307,12 @@ private: Function *F; Function *SetTagFunc; const DataLayout *DL; + AAResults *AA; void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); + if (MergeInit) + AU.addRequired<AAResultsWrapperPass>(); } }; @@ -107,8 +325,68 @@ INITIALIZE_PASS_BEGIN(AArch64StackTagging, DEBUG_TYPE, "AArch64 Stack Tagging", INITIALIZE_PASS_END(AArch64StackTagging, DEBUG_TYPE, "AArch64 Stack Tagging", false, false) -FunctionPass *llvm::createAArch64StackTaggingPass() { - return new AArch64StackTagging(); +FunctionPass *llvm::createAArch64StackTaggingPass(bool MergeInit) { + return new AArch64StackTagging(MergeInit); +} + +Instruction *AArch64StackTagging::collectInitializers(Instruction *StartInst, + Value *StartPtr, + uint64_t Size, + InitializerBuilder &IB) { + MemoryLocation AllocaLoc{StartPtr, Size}; + Instruction *LastInst = StartInst; + BasicBlock::iterator BI(StartInst); + + unsigned Count = 0; + for (; Count < ClScanLimit && !BI->isTerminator(); ++BI) { + if (!isa<DbgInfoIntrinsic>(*BI)) + ++Count; + + if (isNoModRef(AA->getModRefInfo(&*BI, AllocaLoc))) + continue; + + if (!isa<StoreInst>(BI) && !isa<MemSetInst>(BI)) { + // If the instruction is readnone, ignore it, otherwise bail out. We + // don't even allow readonly here because we don't want something like: + // A[1] = 2; strlen(A); A[2] = 2; -> memcpy(A, ...); strlen(A). + if (BI->mayWriteToMemory() || BI->mayReadFromMemory()) + break; + continue; + } + + if (StoreInst *NextStore = dyn_cast<StoreInst>(BI)) { + if (!NextStore->isSimple()) + break; + + // Check to see if this store is to a constant offset from the start ptr. + Optional<int64_t> Offset = + isPointerOffset(StartPtr, NextStore->getPointerOperand(), *DL); + if (!Offset) + break; + + if (!IB.addStore(*Offset, NextStore, DL)) + break; + LastInst = NextStore; + } else { + MemSetInst *MSI = cast<MemSetInst>(BI); + + if (MSI->isVolatile() || !isa<ConstantInt>(MSI->getLength())) + break; + + if (!isa<ConstantInt>(MSI->getValue())) + break; + + // Check to see if this store is to a constant offset from the start ptr. + Optional<int64_t> Offset = isPointerOffset(StartPtr, MSI->getDest(), *DL); + if (!Offset) + break; + + if (!IB.addMemSet(*Offset, MSI)) + break; + LastInst = MSI; + } + } + return LastInst; } bool AArch64StackTagging::isInterestingAlloca(const AllocaInst &AI) { @@ -127,8 +405,23 @@ bool AArch64StackTagging::isInterestingAlloca(const AllocaInst &AI) { void AArch64StackTagging::tagAlloca(AllocaInst *AI, Instruction *InsertBefore, Value *Ptr, uint64_t Size) { + auto SetTagZeroFunc = + Intrinsic::getDeclaration(F->getParent(), Intrinsic::aarch64_settag_zero); + auto StgpFunc = + Intrinsic::getDeclaration(F->getParent(), Intrinsic::aarch64_stgp); + + InitializerBuilder IB(Size, DL, Ptr, SetTagFunc, SetTagZeroFunc, StgpFunc); + bool LittleEndian = + Triple(AI->getModule()->getTargetTriple()).isLittleEndian(); + // Current implementation of initializer merging assumes little endianness. 
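[Editor's note] Putting the tagAlloca pieces together: collectInitializers above harvests adjacent plain stores and constant memsets into an 8-byte word map, and InitializerBuilder::generate then walks that map in 16-byte tag granules, emitting STGP where there is data and a zeroing settag across the gaps and tail. A toy rendering of that walk over plain integers (the printed "instruction" names are illustrative, not real syntax):

#include <cstdint>
#include <cstdio>
#include <map>

static void generate(const std::map<uint64_t, uint64_t> &Words, uint64_t Size) {
  uint64_t Last = 0;
  for (uint64_t Off = 0; Off < Size; Off += 16) {
    auto I1 = Words.find(Off), I2 = Words.find(Off + 8);
    if (I1 == Words.end() && I2 == Words.end())
      continue; // no data: the gap is zero-tagged before the next granule
    if (Off > Last)
      printf("settag_zero [%llu, %llu)\n", (unsigned long long)Last,
             (unsigned long long)Off);
    uint64_t A = I1 == Words.end() ? 0 : I1->second;
    uint64_t B = I2 == Words.end() ? 0 : I2->second;
    printf("stgp @%llu: %016llx %016llx\n", (unsigned long long)Off,
           (unsigned long long)A, (unsigned long long)B);
    Last = Off + 16;
  }
  if (Last < Size) // undef or memset(0) tail
    printf("settag_zero [%llu, %llu)\n", (unsigned long long)Last,
           (unsigned long long)Size);
}

int main() {
  // 48-byte alloca: a store writes word 0, a memset covers bytes [24, 32).
  generate({{0, 0x2a}, {24, 0x0101010101010101ULL}}, 48);
}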
+ if (MergeInit && !F->hasOptNone() && LittleEndian) { + LLVM_DEBUG(dbgs() << "collecting initializers for " << *AI + << ", size = " << Size << "\n"); + InsertBefore = collectInitializers(InsertBefore, Ptr, Size, IB); + } + IRBuilder<> IRB(InsertBefore); - IRB.CreateCall(SetTagFunc, {Ptr, ConstantInt::get(IRB.getInt64Ty(), Size)}); + IB.generate(IRB); } void AArch64StackTagging::untagAlloca(AllocaInst *AI, Instruction *InsertBefore, @@ -166,7 +459,8 @@ Instruction *AArch64StackTagging::insertBaseTaggedPointer( } void AArch64StackTagging::alignAndPadAlloca(AllocaInfo &Info) { - unsigned NewAlignment = std::max(Info.AI->getAlignment(), kTagGranuleSize); + const Align NewAlignment = + max(MaybeAlign(Info.AI->getAlignment()), kTagGranuleSize); Info.AI->setAlignment(NewAlignment); uint64_t Size = Info.AI->getAllocationSizeInBits(*DL).getValue() / 8; @@ -179,7 +473,7 @@ void AArch64StackTagging::alignAndPadAlloca(AllocaInfo &Info) { Info.AI->isArrayAllocation() ? ArrayType::get( Info.AI->getAllocatedType(), - dyn_cast<ConstantInt>(Info.AI->getArraySize())->getZExtValue()) + cast<ConstantInt>(Info.AI->getArraySize())->getZExtValue()) : Info.AI->getAllocatedType(); Type *PaddingType = ArrayType::get(Type::getInt8Ty(F->getContext()), AlignedSize - Size); @@ -187,7 +481,7 @@ void AArch64StackTagging::alignAndPadAlloca(AllocaInfo &Info) { auto *NewAI = new AllocaInst( TypeWithPadding, Info.AI->getType()->getAddressSpace(), nullptr, "", Info.AI); NewAI->takeName(Info.AI); - NewAI->setAlignment(Info.AI->getAlignment()); + NewAI->setAlignment(MaybeAlign(Info.AI->getAlignment())); NewAI->setUsedWithInAlloca(Info.AI->isUsedWithInAlloca()); NewAI->setSwiftError(Info.AI->isSwiftError()); NewAI->copyMetadata(*Info.AI); @@ -198,6 +492,24 @@ void AArch64StackTagging::alignAndPadAlloca(AllocaInfo &Info) { Info.AI = NewAI; } +// Helper function to check for post-dominance. 
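[Editor's note] The postDominates helper declared above delegates the cross-block case to the PostDominatorTree; the corner it handles by hand is two intrinsics in the same block, where post-dominance reduces to instruction order (ignoring mid-block exits, as the pass does). Modeled over a plain list standing in for the basic block's instruction list:

#include <cstdio>
#include <vector>

// Same-block rule: A post-dominates B exactly when B appears no later
// than A, since straight-line execution then passes through A as well.
static bool postDominatesSameBlock(const std::vector<int> &Block, int A, int B) {
  for (int I : Block) {
    if (I == B)
      return true;  // hit B first: A comes later in the block
    if (I == A)
      return false; // hit A first: B is after A
  }
  return false; // the real helper asserts "Corrupt instruction list" here
}

int main() {
  std::vector<int> Block{1, 2, 3};
  printf("%d %d\n", postDominatesSameBlock(Block, 3, 1),  // 1
         postDominatesSameBlock(Block, 1, 3));            // 0
}

This is what lets runOnFunction below place a single untag at the lifetime end when that end provably executes, and fall back to untagging on each reachable return otherwise.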
+static bool postDominates(const PostDominatorTree *PDT, const IntrinsicInst *A, + const IntrinsicInst *B) { + const BasicBlock *ABB = A->getParent(); + const BasicBlock *BBB = B->getParent(); + + if (ABB != BBB) + return PDT->dominates(ABB, BBB); + + for (const Instruction &I : *ABB) { + if (&I == B) + return true; + if (&I == A) + return false; + } + llvm_unreachable("Corrupt instruction list"); +} + // FIXME: check for MTE extension bool AArch64StackTagging::runOnFunction(Function &Fn) { if (!Fn.hasFnAttribute(Attribute::SanitizeMemTag)) @@ -205,6 +517,8 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) { F = &Fn; DL = &Fn.getParent()->getDataLayout(); + if (MergeInit) + AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); MapVector<AllocaInst *, AllocaInfo> Allocas; // need stable iteration order SmallVector<Instruction *, 8> RetVec; @@ -270,23 +584,31 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) { if (NumInterestingAllocas == 0) return true; + std::unique_ptr<DominatorTree> DeleteDT; + DominatorTree *DT = nullptr; + if (auto *P = getAnalysisIfAvailable<DominatorTreeWrapperPass>()) + DT = &P->getDomTree(); + + if (DT == nullptr && (NumInterestingAllocas > 1 || + !F->hasFnAttribute(Attribute::OptimizeNone))) { + DeleteDT = std::make_unique<DominatorTree>(*F); + DT = DeleteDT.get(); + } + + std::unique_ptr<PostDominatorTree> DeletePDT; + PostDominatorTree *PDT = nullptr; + if (auto *P = getAnalysisIfAvailable<PostDominatorTreeWrapperPass>()) + PDT = &P->getPostDomTree(); + + if (PDT == nullptr && !F->hasFnAttribute(Attribute::OptimizeNone)) { + DeletePDT = std::make_unique<PostDominatorTree>(*F); + PDT = DeletePDT.get(); + } + SetTagFunc = Intrinsic::getDeclaration(F->getParent(), Intrinsic::aarch64_settag); - // Compute DT only if the function has the attribute, there are more than 1 - // interesting allocas, and it is not available for free. - Instruction *Base; - if (NumInterestingAllocas > 1) { - auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>(); - if (DTWP) { - Base = insertBaseTaggedPointer(Allocas, &DTWP->getDomTree()); - } else { - DominatorTree DT(*F); - Base = insertBaseTaggedPointer(Allocas, &DT); - } - } else { - Base = insertBaseTaggedPointer(Allocas, nullptr); - } + Instruction *Base = insertBaseTaggedPointer(Allocas, DT); for (auto &I : Allocas) { const AllocaInfo &Info = I.second; @@ -309,11 +631,37 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) { if (UnrecognizedLifetimes.empty() && Info.LifetimeStart.size() == 1 && Info.LifetimeEnd.size() == 1) { IntrinsicInst *Start = Info.LifetimeStart[0]; + IntrinsicInst *End = Info.LifetimeEnd[0]; uint64_t Size = dyn_cast<ConstantInt>(Start->getArgOperand(0))->getZExtValue(); Size = alignTo(Size, kTagGranuleSize); tagAlloca(AI, Start->getNextNode(), Start->getArgOperand(1), Size); - untagAlloca(AI, Info.LifetimeEnd[0], Size); + // We need to ensure that if we tag some object, we certainly untag it + // before the function exits. + if (PDT != nullptr && postDominates(PDT, End, Start)) { + untagAlloca(AI, End, Size); + } else { + SmallVector<Instruction *, 8> ReachableRetVec; + unsigned NumCoveredExits = 0; + for (auto &RI : RetVec) { + if (!isPotentiallyReachable(Start, RI, nullptr, DT)) + continue; + ReachableRetVec.push_back(RI); + if (DT != nullptr && DT->dominates(End, RI)) + ++NumCoveredExits; + } + // If there's a mix of covered and non-covered exits, just put the untag + // on exits, so we avoid the redundancy of untagging twice. 
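The postDominates helper and the exit handling that follows keep tagging balanced: per the comment above, whatever gets tagged must certainly be untagged before the function exits. A purely editorial illustration of why (clang's exact lifetime-marker placement is simplified away here):

    // With the SanitizeMemTag attribute, Buf is tagged when its lifetime
    // starts. If the lone lifetime.end does not post-dominate that start,
    // the early return below can leave the function with the tag still set,
    // so the pass untags before every reachable return instead and erases
    // the now-misleading lifetime.end marker.
    int consume(char *P);

    int F(bool Bail) {
      char Buf[16];
      if (Bail)
        return 0;          // a reachable exit the untag must also cover
      return consume(Buf);
    }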
+ if (NumCoveredExits == ReachableRetVec.size()) { + untagAlloca(AI, End, Size); + } else { + for (auto &RI : ReachableRetVec) + untagAlloca(AI, RI, Size); + // We may have inserted untag outside of the lifetime interval. + // Remove the lifetime end call for this alloca. + End->eraseFromParent(); + } + } } else { uint64_t Size = Info.AI->getAllocationSizeInBits(*DL).getValue() / 8; Value *Ptr = IRB.CreatePointerCast(TagPCall, IRB.getInt8PtrTy()); diff --git a/lib/Target/AArch64/AArch64StackTaggingPreRA.cpp b/lib/Target/AArch64/AArch64StackTaggingPreRA.cpp new file mode 100644 index 000000000000..3cc556f74aea --- /dev/null +++ b/lib/Target/AArch64/AArch64StackTaggingPreRA.cpp @@ -0,0 +1,209 @@ +//===-- AArch64StackTaggingPreRA.cpp --- Stack Tagging for AArch64 -----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + + +#include "AArch64.h" +#include "AArch64MachineFunctionInfo.h" +#include "AArch64InstrInfo.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/MapVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineBranchProbabilityInfo.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineTraceMetrics.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" +#include "llvm/CodeGen/TargetSubtargetInfo.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +#define DEBUG_TYPE "aarch64-stack-tagging-pre-ra" + +enum UncheckedLdStMode { UncheckedNever, UncheckedSafe, UncheckedAlways }; + +cl::opt<UncheckedLdStMode> ClUncheckedLdSt( + "stack-tagging-unchecked-ld-st", cl::Hidden, + cl::init(UncheckedSafe), + cl::desc( + "Unconditionally apply unchecked-ld-st optimization (even for large " + "stack frames, or in the presence of variable sized allocas)."), + cl::values( + clEnumValN(UncheckedNever, "never", "never apply unchecked-ld-st"), + clEnumValN( + UncheckedSafe, "safe", + "apply unchecked-ld-st when the target is definitely within range"), + clEnumValN(UncheckedAlways, "always", "always apply unchecked-ld-st"))); + +namespace { + +class AArch64StackTaggingPreRA : public MachineFunctionPass { + MachineFunction *MF; + AArch64FunctionInfo *AFI; + MachineFrameInfo *MFI; + MachineRegisterInfo *MRI; + const AArch64RegisterInfo *TRI; + const AArch64InstrInfo *TII; + + SmallVector<MachineInstr*, 16> ReTags; + +public: + static char ID; + AArch64StackTaggingPreRA() : MachineFunctionPass(ID) { + initializeAArch64StackTaggingPreRAPass(*PassRegistry::getPassRegistry()); + } + + bool mayUseUncheckedLoadStore(); + void uncheckUsesOf(unsigned TaggedReg, int FI); + void uncheckLoadsAndStores(); + + bool runOnMachineFunction(MachineFunction &Func) override; + StringRef getPassName() const override { + return "AArch64 Stack Tagging PreRA"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; +} // end anonymous namespace + +char AArch64StackTaggingPreRA::ID = 
0; + +INITIALIZE_PASS_BEGIN(AArch64StackTaggingPreRA, "aarch64-stack-tagging-pre-ra", + "AArch64 Stack Tagging PreRA Pass", false, false) +INITIALIZE_PASS_END(AArch64StackTaggingPreRA, "aarch64-stack-tagging-pre-ra", + "AArch64 Stack Tagging PreRA Pass", false, false) + +FunctionPass *llvm::createAArch64StackTaggingPreRAPass() { + return new AArch64StackTaggingPreRA(); +} + +static bool isUncheckedLoadOrStoreOpcode(unsigned Opcode) { + switch (Opcode) { + case AArch64::LDRWui: + case AArch64::LDRSHWui: + case AArch64::LDRXui: + case AArch64::LDRBui: + case AArch64::LDRBBui: + case AArch64::LDRHui: + case AArch64::LDRSui: + case AArch64::LDRDui: + case AArch64::LDRQui: + case AArch64::STRWui: + case AArch64::STRXui: + case AArch64::STRBui: + case AArch64::STRBBui: + case AArch64::STRHui: + case AArch64::STRSui: + case AArch64::STRDui: + case AArch64::STRQui: + return true; + default: + return false; + } +} + +bool AArch64StackTaggingPreRA::mayUseUncheckedLoadStore() { + if (ClUncheckedLdSt == UncheckedNever) + return false; + else if (ClUncheckedLdSt == UncheckedAlways) + return true; + + // This estimate can be improved if we had harder guarantees about stack frame + // layout. With LocalStackAllocation we can estimate SP offset to any + // preallocated slot. AArch64FrameLowering::orderFrameObjects could put tagged + // objects ahead of non-tagged ones, but that's not always desirable. + // + // Underestimating SP offset here may require the use of LDG to materialize + // the tagged address of the stack slot, along with a scratch register + // allocation (post-regalloc!). + // + // For now we do the safe thing here and require that the entire stack frame + // is within range of the shortest of the unchecked instructions. + unsigned FrameSize = 0; + for (unsigned i = 0, e = MFI->getObjectIndexEnd(); i != e; ++i) + FrameSize += MFI->getObjectSize(i); + bool EntireFrameReachableFromSP = FrameSize < 0xf00; + return !MFI->hasVarSizedObjects() && EntireFrameReachableFromSP; +} + +void AArch64StackTaggingPreRA::uncheckUsesOf(unsigned TaggedReg, int FI) { + for (auto UI = MRI->use_instr_begin(TaggedReg), E = MRI->use_instr_end(); + UI != E;) { + MachineInstr *UseI = &*(UI++); + if (isUncheckedLoadOrStoreOpcode(UseI->getOpcode())) { + // FI operand is always the one before the immediate offset. 
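For context on the 0xf00 bound in mayUseUncheckedLoadStore above: AArch64 unsigned-offset loads and stores encode a 12-bit immediate scaled by the access size, so a byte access reaches at most 4095 bytes from the base register. Keeping the whole frame under 0xf00 (3840) bytes leaves every slot reachable by even the shortest-range unchecked instruction, with some slack. A small sketch of that range check (illustrative only):

    #include <cassert>
    #include <cstdint>

    // True if ByteOffset is encodable as a scaled 12-bit unsigned offset.
    static bool fitsUnsignedOffset(uint64_t ByteOffset, uint64_t AccessSize) {
      return ByteOffset % AccessSize == 0 && ByteOffset / AccessSize < 4096;
    }

    int main() {
      assert(fitsUnsignedOffset(0xeff, 1));  // within the LDRB/STRB range
      assert(fitsUnsignedOffset(0xf00, 16)); // wider accesses reach further
      assert(!fitsUnsignedOffset(4096, 1));  // out of range for byte access
    }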
+ unsigned OpIdx = TII->getLoadStoreImmIdx(UseI->getOpcode()) - 1; + if (UseI->getOperand(OpIdx).isReg() && + UseI->getOperand(OpIdx).getReg() == TaggedReg) { + UseI->getOperand(OpIdx).ChangeToFrameIndex(FI); + UseI->getOperand(OpIdx).setTargetFlags(AArch64II::MO_TAGGED); + } + } else if (UseI->isCopy() && + Register::isVirtualRegister(UseI->getOperand(0).getReg())) { + uncheckUsesOf(UseI->getOperand(0).getReg(), FI); + } + } +} + +void AArch64StackTaggingPreRA::uncheckLoadsAndStores() { + for (auto *I : ReTags) { + unsigned TaggedReg = I->getOperand(0).getReg(); + int FI = I->getOperand(1).getIndex(); + uncheckUsesOf(TaggedReg, FI); + } +} + +bool AArch64StackTaggingPreRA::runOnMachineFunction(MachineFunction &Func) { + MF = &Func; + MRI = &MF->getRegInfo(); + AFI = MF->getInfo<AArch64FunctionInfo>(); + TII = static_cast<const AArch64InstrInfo *>(MF->getSubtarget().getInstrInfo()); + TRI = static_cast<const AArch64RegisterInfo *>( + MF->getSubtarget().getRegisterInfo()); + MFI = &MF->getFrameInfo(); + ReTags.clear(); + + assert(MRI->isSSA()); + + LLVM_DEBUG(dbgs() << "********** AArch64 Stack Tagging PreRA **********\n" + << "********** Function: " << MF->getName() << '\n'); + + SmallSetVector<int, 8> TaggedSlots; + for (auto &BB : *MF) { + for (auto &I : BB) { + if (I.getOpcode() == AArch64::TAGPstack) { + ReTags.push_back(&I); + int FI = I.getOperand(1).getIndex(); + TaggedSlots.insert(FI); + // There should be no offsets in TAGP yet. + assert(I.getOperand(2).getImm() == 0); + } + } + } + + if (ReTags.empty()) + return false; + + if (mayUseUncheckedLoadStore()) + uncheckLoadsAndStores(); + + return true; +} diff --git a/lib/Target/AArch64/AArch64StorePairSuppress.cpp b/lib/Target/AArch64/AArch64StorePairSuppress.cpp index 0e84a00df006..5deb601822b8 100644 --- a/lib/Target/AArch64/AArch64StorePairSuppress.cpp +++ b/lib/Target/AArch64/AArch64StorePairSuppress.cpp @@ -151,7 +151,7 @@ bool AArch64StorePairSuppress::runOnMachineFunction(MachineFunction &MF) { int64_t Offset; if (TII->getMemOperandWithOffset(MI, BaseOp, Offset, TRI) && BaseOp->isReg()) { - unsigned BaseReg = BaseOp->getReg(); + Register BaseReg = BaseOp->getReg(); if (PrevBaseReg == BaseReg) { // If this block can take STPs, skip ahead to the next block. 
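Stepping back to uncheckUsesOf above: the walk rewrites the memory operand of each unchecked load or store from the tagged register to the frame index (flagged MO_TAGGED), and recurses through COPYs so that uses of copied values are rewritten as well. A schematic version of that use walk, with invented graph types standing in for MachineRegisterInfo:

    #include <cassert>
    #include <vector>

    struct Inst {
      enum Kind { MemAccess, Copy, Other } K;
      int CopyDef = -1;          // for Copy: the value it defines
      bool UsesFrameIndex = false;
    };

    // Uses[V] lists the instructions that use value V (SSA, so no cycles).
    static void uncheckUsesOf(std::vector<std::vector<Inst *>> &Uses, int V) {
      for (Inst *I : Uses[V]) {
        if (I->K == Inst::MemAccess)
          I->UsesFrameIndex = true;         // rewrite to the frame index
        else if (I->K == Inst::Copy)
          uncheckUsesOf(Uses, I->CopyDef);  // follow the copy's destination
      }
    }

    int main() {
      Inst Load{Inst::MemAccess}, Cp{Inst::Copy, /*CopyDef=*/1};
      Inst Load2{Inst::MemAccess};
      std::vector<std::vector<Inst *>> Uses = {{&Load, &Cp}, {&Load2}};
      uncheckUsesOf(Uses, 0);    // value 0: the TAGPstack-defined pointer
      assert(Load.UsesFrameIndex && Load2.UsesFrameIndex);
    }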
if (!SuppressSTP && shouldAddSTPToBlock(MI.getParent())) diff --git a/lib/Target/AArch64/AArch64Subtarget.cpp b/lib/Target/AArch64/AArch64Subtarget.cpp index 3bc89b91c3f7..558bea368eff 100644 --- a/lib/Target/AArch64/AArch64Subtarget.cpp +++ b/lib/Target/AArch64/AArch64Subtarget.cpp @@ -71,19 +71,22 @@ void AArch64Subtarget::initializeProperties() { case CortexA35: break; case CortexA53: - PrefFunctionAlignment = 3; + PrefFunctionLogAlignment = 3; break; case CortexA55: break; case CortexA57: MaxInterleaveFactor = 4; - PrefFunctionAlignment = 4; + PrefFunctionLogAlignment = 4; + break; + case CortexA65: + PrefFunctionLogAlignment = 3; break; case CortexA72: case CortexA73: case CortexA75: case CortexA76: - PrefFunctionAlignment = 4; + PrefFunctionLogAlignment = 4; break; case Cyclone: CacheLineSize = 64; @@ -94,14 +97,14 @@ void AArch64Subtarget::initializeProperties() { case ExynosM1: MaxInterleaveFactor = 4; MaxJumpTableSize = 8; - PrefFunctionAlignment = 4; - PrefLoopAlignment = 3; + PrefFunctionLogAlignment = 4; + PrefLoopLogAlignment = 3; break; case ExynosM3: MaxInterleaveFactor = 4; MaxJumpTableSize = 20; - PrefFunctionAlignment = 5; - PrefLoopAlignment = 4; + PrefFunctionLogAlignment = 5; + PrefLoopLogAlignment = 4; break; case Falkor: MaxInterleaveFactor = 4; @@ -122,6 +125,12 @@ void AArch64Subtarget::initializeProperties() { // FIXME: remove this to enable 64-bit SLP if performance looks good. MinVectorRegisterBitWidth = 128; break; + case NeoverseE1: + PrefFunctionLogAlignment = 3; + break; + case NeoverseN1: + PrefFunctionLogAlignment = 4; + break; case Saphira: MaxInterleaveFactor = 4; // FIXME: remove this to enable 64-bit SLP if performance looks good. @@ -129,8 +138,8 @@ void AArch64Subtarget::initializeProperties() { break; case ThunderX2T99: CacheLineSize = 64; - PrefFunctionAlignment = 3; - PrefLoopAlignment = 2; + PrefFunctionLogAlignment = 3; + PrefLoopLogAlignment = 2; MaxInterleaveFactor = 4; PrefetchDistance = 128; MinPrefetchStride = 1024; @@ -143,15 +152,15 @@ void AArch64Subtarget::initializeProperties() { case ThunderXT81: case ThunderXT83: CacheLineSize = 128; - PrefFunctionAlignment = 3; - PrefLoopAlignment = 2; + PrefFunctionLogAlignment = 3; + PrefLoopLogAlignment = 2; // FIXME: remove this to enable 64-bit SLP if performance looks good. MinVectorRegisterBitWidth = 128; break; case TSV110: CacheLineSize = 64; - PrefFunctionAlignment = 4; - PrefLoopAlignment = 2; + PrefFunctionLogAlignment = 4; + PrefLoopLogAlignment = 2; break; } } @@ -187,7 +196,7 @@ const CallLowering *AArch64Subtarget::getCallLowering() const { return CallLoweringInfo.get(); } -const InstructionSelector *AArch64Subtarget::getInstructionSelector() const { +InstructionSelector *AArch64Subtarget::getInstructionSelector() const { return InstSelector.get(); } @@ -201,7 +210,7 @@ const RegisterBankInfo *AArch64Subtarget::getRegBankInfo() const { /// Find the target operand flags that describe how a global value should be /// referenced for the current subtarget. -unsigned char +unsigned AArch64Subtarget::ClassifyGlobalReference(const GlobalValue *GV, const TargetMachine &TM) const { // MachO large model always goes via a GOT, simply to get a single 8-byte @@ -224,10 +233,17 @@ AArch64Subtarget::ClassifyGlobalReference(const GlobalValue *GV, GV->hasExternalWeakLinkage()) return AArch64II::MO_GOT; + // References to tagged globals are marked with MO_NC | MO_TAGGED to indicate + // that their nominal addresses are tagged and outside of the code model. 
In + // AArch64ExpandPseudo::expandMI we emit an additional instruction to set the + // tag if necessary based on MO_TAGGED. + if (AllowTaggedGlobals && !isa<FunctionType>(GV->getValueType())) + return AArch64II::MO_NC | AArch64II::MO_TAGGED; + return AArch64II::MO_NO_FLAG; } -unsigned char AArch64Subtarget::classifyGlobalFunctionReference( +unsigned AArch64Subtarget::classifyGlobalFunctionReference( const GlobalValue *GV, const TargetMachine &TM) const { // MachO large model always goes via a GOT, because we don't have the // relocations available to do anything else.. @@ -275,7 +291,7 @@ bool AArch64Subtarget::supportsAddressTopByteIgnored() const { std::unique_ptr<PBQPRAConstraint> AArch64Subtarget::getCustomPBQPConstraints() const { - return balanceFPOps() ? llvm::make_unique<A57ChainingConstraint>() : nullptr; + return balanceFPOps() ? std::make_unique<A57ChainingConstraint>() : nullptr; } void AArch64Subtarget::mirFileLoaded(MachineFunction &MF) const { diff --git a/lib/Target/AArch64/AArch64Subtarget.h b/lib/Target/AArch64/AArch64Subtarget.h index 0c84cfb8329a..f3212fae8e5e 100644 --- a/lib/Target/AArch64/AArch64Subtarget.h +++ b/lib/Target/AArch64/AArch64Subtarget.h @@ -42,6 +42,7 @@ public: CortexA53, CortexA55, CortexA57, + CortexA65, CortexA72, CortexA73, CortexA75, @@ -51,6 +52,8 @@ public: ExynosM3, Falkor, Kryo, + NeoverseE1, + NeoverseN1, Saphira, ThunderX2T99, ThunderX, @@ -113,6 +116,7 @@ protected: bool HasTRACEV8_4 = false; bool HasAM = false; bool HasSEL2 = false; + bool HasPMU = false; bool HasTLB_RMI = false; bool HasFMI = false; bool HasRCPC_IMMO = false; @@ -134,6 +138,7 @@ protected: bool HasBTI = false; bool HasRandGen = false; bool HasMTE = false; + bool HasTME = false; // Arm SVE2 extensions bool HasSVE2AES = false; @@ -141,6 +146,10 @@ protected: bool HasSVE2SHA3 = false; bool HasSVE2BitPerm = false; + // Future architecture extensions. + bool HasETE = false; + bool HasTRBE = false; + // HasZeroCycleRegMove - Has zero-cycle register mov instructions. 
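On the tagged-globals change in ClassifyGlobalReference above: MTE keeps a 4-bit logical tag in address bits 56..59, so the nominal address of a tagged global cannot be formed by a plain adrp+add pair, and MO_TAGGED tells AArch64ExpandPseudo::expandMI to emit the extra instruction that sets those bits. A sketch of the resulting address shape (the concrete values are made up):

    #include <cstdint>
    #include <cstdio>

    // Place a 4-bit MTE tag into bits 56..59 of an address.
    static uint64_t withTag(uint64_t Addr, unsigned Tag) {
      return (Addr & ~(0xfULL << 56)) | (uint64_t(Tag & 0xf) << 56);
    }

    int main() {
      uint64_t P = withTag(0x0000aaaade000010ULL, 0x7);
      std::printf("%#llx\n", (unsigned long long)P); // 0x700aaaade000010
    }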
bool HasZeroCycleRegMove = false; @@ -183,14 +192,15 @@ protected: bool UseEL1ForTP = false; bool UseEL2ForTP = false; bool UseEL3ForTP = false; + bool AllowTaggedGlobals = false; uint8_t MaxInterleaveFactor = 2; uint8_t VectorInsertExtractBaseCost = 3; uint16_t CacheLineSize = 0; uint16_t PrefetchDistance = 0; uint16_t MinPrefetchStride = 1; unsigned MaxPrefetchIterationsAhead = UINT_MAX; - unsigned PrefFunctionAlignment = 0; - unsigned PrefLoopAlignment = 0; + unsigned PrefFunctionLogAlignment = 0; + unsigned PrefLoopLogAlignment = 0; unsigned MaxJumpTableSize = 0; unsigned WideningBaseCost = 0; @@ -247,7 +257,7 @@ public: return &getInstrInfo()->getRegisterInfo(); } const CallLowering *getCallLowering() const override; - const InstructionSelector *getInstructionSelector() const override; + InstructionSelector *getInstructionSelector() const override; const LegalizerInfo *getLegalizerInfo() const override; const RegisterBankInfo *getRegBankInfo() const override; const Triple &getTargetTriple() const { return TargetTriple; } @@ -344,14 +354,16 @@ public: unsigned getVectorInsertExtractBaseCost() const { return VectorInsertExtractBaseCost; } - unsigned getCacheLineSize() const { return CacheLineSize; } - unsigned getPrefetchDistance() const { return PrefetchDistance; } - unsigned getMinPrefetchStride() const { return MinPrefetchStride; } - unsigned getMaxPrefetchIterationsAhead() const { + unsigned getCacheLineSize() const override { return CacheLineSize; } + unsigned getPrefetchDistance() const override { return PrefetchDistance; } + unsigned getMinPrefetchStride() const override { return MinPrefetchStride; } + unsigned getMaxPrefetchIterationsAhead() const override { return MaxPrefetchIterationsAhead; } - unsigned getPrefFunctionAlignment() const { return PrefFunctionAlignment; } - unsigned getPrefLoopAlignment() const { return PrefLoopAlignment; } + unsigned getPrefFunctionLogAlignment() const { + return PrefFunctionLogAlignment; + } + unsigned getPrefLoopLogAlignment() const { return PrefLoopLogAlignment; } unsigned getMaximumJumpTableSize() const { return MaxJumpTableSize; } @@ -380,6 +392,7 @@ public: bool hasBTI() const { return HasBTI; } bool hasRandGen() const { return HasRandGen; } bool hasMTE() const { return HasMTE; } + bool hasTME() const { return HasTME; } // Arm SVE2 extensions bool hasSVE2AES() const { return HasSVE2AES; } bool hasSVE2SM4() const { return HasSVE2SM4; } @@ -399,6 +412,8 @@ public: bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); } bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); } + bool isTargetILP32() const { return TargetTriple.isArch32Bit(); } + bool useAA() const override { return UseAA; } bool hasVH() const { return HasVH; } @@ -421,10 +436,17 @@ public: bool hasTRACEV8_4() const { return HasTRACEV8_4; } bool hasAM() const { return HasAM; } bool hasSEL2() const { return HasSEL2; } + bool hasPMU() const { return HasPMU; } bool hasTLB_RMI() const { return HasTLB_RMI; } bool hasFMI() const { return HasFMI; } bool hasRCPC_IMMO() const { return HasRCPC_IMMO; } + bool addrSinkUsingGEPs() const override { + // Keeping GEPs inbounds is important for exploiting AArch64 + // addressing-modes in ILP32 mode. 
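A note on the Alignment-to-LogAlignment renames running through this subtarget code: the fields now store log2 values, so PrefFunctionLogAlignment = 4 means 16-byte function alignment. Trivial, but worth pinning down (the helper name is illustrative):

    #include <cassert>

    static unsigned bytesFromLogAlign(unsigned LogAlign) {
      return 1u << LogAlign;
    }

    int main() {
      assert(bytesFromLogAlign(3) == 8);  // Cortex-A53/A65, Neoverse E1
      assert(bytesFromLogAlign(4) == 16); // Cortex-A57/A72..A76, Neoverse N1
      assert(bytesFromLogAlign(5) == 32); // Exynos M3
    }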
+ return useAA() || isTargetILP32(); + } + bool useSmallAddressing() const { switch (TLInfo.getTargetMachine().getCodeModel()) { case CodeModel::Kernel: @@ -443,11 +465,11 @@ public: /// ClassifyGlobalReference - Find the target operand flags that describe /// how a global value should be referenced for the current subtarget. - unsigned char ClassifyGlobalReference(const GlobalValue *GV, - const TargetMachine &TM) const; + unsigned ClassifyGlobalReference(const GlobalValue *GV, + const TargetMachine &TM) const; - unsigned char classifyGlobalFunctionReference(const GlobalValue *GV, - const TargetMachine &TM) const; + unsigned classifyGlobalFunctionReference(const GlobalValue *GV, + const TargetMachine &TM) const; void overrideSchedPolicy(MachineSchedPolicy &Policy, unsigned NumRegionInstrs) const override; diff --git a/lib/Target/AArch64/AArch64SystemOperands.td b/lib/Target/AArch64/AArch64SystemOperands.td index 536a6591478b..05249a4ea6a8 100644 --- a/lib/Target/AArch64/AArch64SystemOperands.td +++ b/lib/Target/AArch64/AArch64SystemOperands.td @@ -612,6 +612,7 @@ def : ROSysReg<"ISR_EL1", 0b11, 0b000, 0b1100, 0b0001, 0b000>; def : ROSysReg<"CNTPCT_EL0", 0b11, 0b011, 0b1110, 0b0000, 0b001>; def : ROSysReg<"CNTVCT_EL0", 0b11, 0b011, 0b1110, 0b0000, 0b010>; def : ROSysReg<"ID_MMFR4_EL1", 0b11, 0b000, 0b0000, 0b0010, 0b110>; +def : ROSysReg<"ID_MMFR5_EL1", 0b11, 0b000, 0b0000, 0b0011, 0b110>; // Trace registers // Op0 Op1 CRn CRm Op2 @@ -1321,6 +1322,12 @@ def : RWSysReg<"CNTHPS_CTL_EL2", 0b11, 0b100, 0b1110, 0b0101, 0b001>; def : RWSysReg<"SDER32_EL2", 0b11, 0b100, 0b0001, 0b0011, 0b001>; } // FeatureSEL2 +// v8.4a PMU registers +// Op0 Op1 CRn CRm Op2 +let Requires = [{ {AArch64::FeaturePMU} }] in { +def : RWSysReg<"PMMIR_EL1", 0b11, 0b000, 0b1001, 0b1110, 0b110>; +} // FeaturePMU + // v8.4a RAS registers // Op0 Op1 CRn CRm Op2 let Requires = [{ {AArch64::FeatureRASv8_4} }] in { @@ -1452,14 +1459,37 @@ let Requires = [{ {AArch64::FeatureMTE} }] in { def : RWSysReg<"TCO", 0b11, 0b011, 0b0100, 0b0010, 0b111>; def : RWSysReg<"GCR_EL1", 0b11, 0b000, 0b0001, 0b0000, 0b110>; def : RWSysReg<"RGSR_EL1", 0b11, 0b000, 0b0001, 0b0000, 0b101>; -def : RWSysReg<"TFSR_EL1", 0b11, 0b000, 0b0110, 0b0101, 0b000>; -def : RWSysReg<"TFSR_EL2", 0b11, 0b100, 0b0110, 0b0101, 0b000>; -def : RWSysReg<"TFSR_EL3", 0b11, 0b110, 0b0110, 0b0110, 0b000>; -def : RWSysReg<"TFSR_EL12", 0b11, 0b101, 0b0110, 0b0110, 0b000>; -def : RWSysReg<"TFSRE0_EL1", 0b11, 0b000, 0b0110, 0b0110, 0b001>; +def : RWSysReg<"TFSR_EL1", 0b11, 0b000, 0b0101, 0b0110, 0b000>; +def : RWSysReg<"TFSR_EL2", 0b11, 0b100, 0b0101, 0b0110, 0b000>; +def : RWSysReg<"TFSR_EL3", 0b11, 0b110, 0b0101, 0b0110, 0b000>; +def : RWSysReg<"TFSR_EL12", 0b11, 0b101, 0b0101, 0b0110, 0b000>; +def : RWSysReg<"TFSRE0_EL1", 0b11, 0b000, 0b0101, 0b0110, 0b001>; def : ROSysReg<"GMID_EL1", 0b11, 0b001, 0b0000, 0b0000, 0b100>; } // HasMTE +// Embedded Trace Extension R/W System registers +let Requires = [{ {AArch64::FeatureETE} }] in { +// Name Op0 Op1 CRn CRm Op2 +def : RWSysReg<"TRCRSR", 0b10, 0b001, 0b0000, 0b1010, 0b000>; +// TRCEXTINSELR0 has the same encoding as ETM TRCEXTINSELR +def : RWSysReg<"TRCEXTINSELR0", 0b10, 0b001, 0b0000, 0b1000, 0b100>; +def : RWSysReg<"TRCEXTINSELR1", 0b10, 0b001, 0b0000, 0b1001, 0b100>; +def : RWSysReg<"TRCEXTINSELR2", 0b10, 0b001, 0b0000, 0b1010, 0b100>; +def : RWSysReg<"TRCEXTINSELR3", 0b10, 0b001, 0b0000, 0b1011, 0b100>; +} // FeatureETE + +// Trace Buffer Extension System registers +let Requires = [{ {AArch64::FeatureTRBE} }] in { +// Name 
Op0 Op1 CRn CRm Op2 +def : RWSysReg<"TRBLIMITR_EL1", 0b11, 0b000, 0b1001, 0b1011, 0b000>; +def : RWSysReg<"TRBPTR_EL1", 0b11, 0b000, 0b1001, 0b1011, 0b001>; +def : RWSysReg<"TRBBASER_EL1", 0b11, 0b000, 0b1001, 0b1011, 0b010>; +def : RWSysReg<"TRBSR_EL1", 0b11, 0b000, 0b1001, 0b1011, 0b011>; +def : RWSysReg<"TRBMAR_EL1", 0b11, 0b000, 0b1001, 0b1011, 0b100>; +def : RWSysReg<"TRBTRG_EL1", 0b11, 0b000, 0b1001, 0b1011, 0b110>; +def : ROSysReg<"TRBIDR_EL1", 0b11, 0b000, 0b1001, 0b1011, 0b111>; +} // FeatureTRBE + // Cyclone specific system registers // Op0 Op1 CRn CRm Op2 let Requires = [{ {AArch64::ProcCyclone} }] in diff --git a/lib/Target/AArch64/AArch64TargetMachine.cpp b/lib/Target/AArch64/AArch64TargetMachine.cpp index 865461480499..b3ed96e815be 100644 --- a/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -157,6 +157,8 @@ extern "C" void LLVMInitializeAArch64Target() { RegisterTargetMachine<AArch64leTargetMachine> X(getTheAArch64leTarget()); RegisterTargetMachine<AArch64beTargetMachine> Y(getTheAArch64beTarget()); RegisterTargetMachine<AArch64leTargetMachine> Z(getTheARM64Target()); + RegisterTargetMachine<AArch64leTargetMachine> W(getTheARM64_32Target()); + RegisterTargetMachine<AArch64leTargetMachine> V(getTheAArch64_32Target()); auto PR = PassRegistry::getPassRegistry(); initializeGlobalISel(*PR); initializeAArch64A53Fix835769Pass(*PR); @@ -180,6 +182,7 @@ extern "C" void LLVMInitializeAArch64Target() { initializeLDTLSCleanupPass(*PR); initializeAArch64SpeculationHardeningPass(*PR); initializeAArch64StackTaggingPass(*PR); + initializeAArch64StackTaggingPreRAPass(*PR); } //===----------------------------------------------------------------------===// @@ -187,11 +190,11 @@ extern "C" void LLVMInitializeAArch64Target() { //===----------------------------------------------------------------------===// static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { if (TT.isOSBinFormatMachO()) - return llvm::make_unique<AArch64_MachoTargetObjectFile>(); + return std::make_unique<AArch64_MachoTargetObjectFile>(); if (TT.isOSBinFormatCOFF()) - return llvm::make_unique<AArch64_COFFTargetObjectFile>(); + return std::make_unique<AArch64_COFFTargetObjectFile>(); - return llvm::make_unique<AArch64_ELFTargetObjectFile>(); + return std::make_unique<AArch64_ELFTargetObjectFile>(); } // Helper function to build a DataLayout string @@ -200,8 +203,11 @@ static std::string computeDataLayout(const Triple &TT, bool LittleEndian) { if (Options.getABIName() == "ilp32") return "e-m:e-p:32:32-i8:8-i16:16-i64:64-S128"; - if (TT.isOSBinFormatMachO()) + if (TT.isOSBinFormatMachO()) { + if (TT.getArch() == Triple::aarch64_32) + return "e-m:o-p:32:32-i64:64-i128:128-n32:64-S128"; return "e-m:o-i64:64-i128:128-n32:64-S128"; + } if (TT.isOSBinFormatCOFF()) return "e-m:w-p:64:64-i32:32-i64:64-i128:128-n32:64-S128"; if (LittleEndian) @@ -277,8 +283,11 @@ AArch64TargetMachine::AArch64TargetMachine(const Target &T, const Triple &TT, this->Options.TrapUnreachable = true; } - // Enable GlobalISel at or below EnableGlobalISelAt0. - if (getOptLevel() <= EnableGlobalISelAtO) { + // Enable GlobalISel at or below EnableGlobalISelAt0, unless this is + // MachO/CodeModel::Large, which GlobalISel does not support. 
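On the computeDataLayout change above: the arm64_32 (ILP32 Mach-O) string differs from the LP64 one only in the added "p:32:32" component, i.e. 32-bit pointers with 32-bit alignment, while i64 and i128 keep their 64- and 128-bit alignment. A toy comparison of the two strings copied from the hunk (this is not LLVM's DataLayout parser):

    #include <cassert>
    #include <string>

    int main() {
      const std::string LP64 = "e-m:o-i64:64-i128:128-n32:64-S128";
      const std::string ILP32 = "e-m:o-p:32:32-i64:64-i128:128-n32:64-S128";
      assert(ILP32.find("p:32:32") != std::string::npos); // pointers: 32-bit
      assert(LP64.find("p:32:32") == std::string::npos);  // default: 64-bit
    }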
+ if (getOptLevel() <= EnableGlobalISelAtO && + TT.getArch() != Triple::aarch64_32 && + !(getCodeModel() == CodeModel::Large && TT.isOSBinFormatMachO())) { setGlobalISel(true); setGlobalISelAbort(GlobalISelAbortMode::Disable); } @@ -310,7 +319,7 @@ AArch64TargetMachine::getSubtargetImpl(const Function &F) const { // creation will depend on the TM and the code generation flags on the // function that reside in TargetOptions. resetTargetOptions(F); - I = llvm::make_unique<AArch64Subtarget>(TargetTriple, CPU, FS, *this, + I = std::make_unique<AArch64Subtarget>(TargetTriple, CPU, FS, *this, isLittle); } return I.get(); @@ -448,7 +457,8 @@ void AArch64PassConfig::addIRPasses() { addPass(createLICMPass()); } - addPass(createAArch64StackTaggingPass()); + addPass(createAArch64StackTaggingPass(/* MergeInit = */ TM->getOptLevel() != + CodeGenOpt::None)); } // Pass Pipeline Configuration @@ -502,7 +512,8 @@ bool AArch64PassConfig::addIRTranslator() { } void AArch64PassConfig::addPreLegalizeMachineIR() { - addPass(createAArch64PreLegalizeCombiner()); + bool IsOptNone = getOptLevel() == CodeGenOpt::None; + addPass(createAArch64PreLegalizeCombiner(IsOptNone)); } bool AArch64PassConfig::addLegalizeMachineIR() { @@ -516,9 +527,7 @@ bool AArch64PassConfig::addRegBankSelect() { } void AArch64PassConfig::addPreGlobalInstructionSelect() { - // Workaround the deficiency of the fast register allocator. - if (TM->getOptLevel() == CodeGenOpt::None) - addPass(new Localizer()); + addPass(new Localizer()); } bool AArch64PassConfig::addGlobalInstructionSelect() { @@ -540,6 +549,8 @@ bool AArch64PassConfig::addILPOpts() { if (EnableStPairSuppress) addPass(createAArch64StorePairSuppressPass()); addPass(createAArch64SIMDInstrOptPass()); + if (TM->getOptLevel() != CodeGenOpt::None) + addPass(createAArch64StackTaggingPreRAPass()); return true; } diff --git a/lib/Target/AArch64/AArch64TargetObjectFile.cpp b/lib/Target/AArch64/AArch64TargetObjectFile.cpp index 1c3d5d0743ad..54562094fcf5 100644 --- a/lib/Target/AArch64/AArch64TargetObjectFile.cpp +++ b/lib/Target/AArch64/AArch64TargetObjectFile.cpp @@ -59,8 +59,8 @@ MCSymbol *AArch64_MachoTargetObjectFile::getCFIPersonalitySymbol( } const MCExpr *AArch64_MachoTargetObjectFile::getIndirectSymViaGOTPCRel( - const MCSymbol *Sym, const MCValue &MV, int64_t Offset, - MachineModuleInfo *MMI, MCStreamer &Streamer) const { + const GlobalValue *GV, const MCSymbol *Sym, const MCValue &MV, + int64_t Offset, MachineModuleInfo *MMI, MCStreamer &Streamer) const { assert((Offset+MV.getConstant() == 0) && "Arch64 does not support GOT PC rel with extra offset"); // On ARM64 Darwin, we can reference symbols with foo@GOT-., which diff --git a/lib/Target/AArch64/AArch64TargetObjectFile.h b/lib/Target/AArch64/AArch64TargetObjectFile.h index 7ead363d42fe..1cb4c028c80d 100644 --- a/lib/Target/AArch64/AArch64TargetObjectFile.h +++ b/lib/Target/AArch64/AArch64TargetObjectFile.h @@ -35,7 +35,8 @@ public: const TargetMachine &TM, MachineModuleInfo *MMI) const override; - const MCExpr *getIndirectSymViaGOTPCRel(const MCSymbol *Sym, + const MCExpr *getIndirectSymViaGOTPCRel(const GlobalValue *GV, + const MCSymbol *Sym, const MCValue &MV, int64_t Offset, MachineModuleInfo *MMI, MCStreamer &Streamer) const override; diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index a4b78f2a7d6b..dc916a7b3407 100644 --- a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -618,6 +618,19 @@ int 
AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I); } +AArch64TTIImpl::TTI::MemCmpExpansionOptions +AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const { + TTI::MemCmpExpansionOptions Options; + Options.AllowOverlappingLoads = !ST->requiresStrictAlign(); + Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize); + Options.NumLoadsPerBlock = Options.MaxNumLoads; + // TODO: Though vector loads usually perform well on AArch64, in some targets + // they may wake up the FP unit, which raises the power consumption. Perhaps + // they could be used with no holds barred (-O3). + Options.LoadSizes = {8, 4, 2, 1}; + return Options; +} + int AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty, unsigned Alignment, unsigned AddressSpace, const Instruction *I) { @@ -879,22 +892,6 @@ bool AArch64TTIImpl::shouldConsiderAddressTypePromotion( return Considerable; } -unsigned AArch64TTIImpl::getCacheLineSize() { - return ST->getCacheLineSize(); -} - -unsigned AArch64TTIImpl::getPrefetchDistance() { - return ST->getPrefetchDistance(); -} - -unsigned AArch64TTIImpl::getMinPrefetchStride() { - return ST->getMinPrefetchStride(); -} - -unsigned AArch64TTIImpl::getMaxPrefetchIterationsAhead() { - return ST->getMaxPrefetchIterationsAhead(); -} - bool AArch64TTIImpl::useReductionIntrinsic(unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const { assert(isa<VectorType>(Ty) && "Expected Ty to be a vector type"); diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.h b/lib/Target/AArch64/AArch64TargetTransformInfo.h index 10c15a139b4c..32c59f41e1c3 100644 --- a/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -85,7 +85,8 @@ public: bool enableInterleavedAccessVectorization() { return true; } - unsigned getNumberOfRegisters(bool Vector) { + unsigned getNumberOfRegisters(unsigned ClassID) const { + bool Vector = (ClassID == 1); if (Vector) { if (ST->hasNEON()) return 32; @@ -130,6 +131,9 @@ public: int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, const Instruction *I = nullptr); + TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, + bool IsZeroCmp) const; + int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, unsigned AddressSpace, const Instruction *I = nullptr); @@ -153,14 +157,6 @@ public: shouldConsiderAddressTypePromotion(const Instruction &I, bool &AllowPromotionWithoutCommonHeader); - unsigned getCacheLineSize(); - - unsigned getPrefetchDistance(); - - unsigned getMinPrefetchStride(); - - unsigned getMaxPrefetchIterationsAhead(); - bool shouldExpandReduction(const IntrinsicInst *II) const { return false; } diff --git a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index f4c55d48d215..4fb409f020d9 100644 --- a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -935,48 +935,34 @@ public: return false; } - bool isMovZSymbolG3() const { - return isMovWSymbol(AArch64MCExpr::VK_ABS_G3); + bool isMovWSymbolG3() const { + return isMovWSymbol({AArch64MCExpr::VK_ABS_G3, AArch64MCExpr::VK_PREL_G3}); } - bool isMovZSymbolG2() const { - return isMovWSymbol({AArch64MCExpr::VK_ABS_G2, AArch64MCExpr::VK_ABS_G2_S, - AArch64MCExpr::VK_TPREL_G2, - AArch64MCExpr::VK_DTPREL_G2}); - } - - bool isMovZSymbolG1() const { - return isMovWSymbol({ - AArch64MCExpr::VK_ABS_G1, AArch64MCExpr::VK_ABS_G1_S, - 
AArch64MCExpr::VK_GOTTPREL_G1, AArch64MCExpr::VK_TPREL_G1, - AArch64MCExpr::VK_DTPREL_G1, - }); - } - - bool isMovZSymbolG0() const { - return isMovWSymbol({AArch64MCExpr::VK_ABS_G0, AArch64MCExpr::VK_ABS_G0_S, - AArch64MCExpr::VK_TPREL_G0, - AArch64MCExpr::VK_DTPREL_G0}); - } - - bool isMovKSymbolG3() const { - return isMovWSymbol(AArch64MCExpr::VK_ABS_G3); - } - - bool isMovKSymbolG2() const { - return isMovWSymbol(AArch64MCExpr::VK_ABS_G2_NC); + bool isMovWSymbolG2() const { + return isMovWSymbol( + {AArch64MCExpr::VK_ABS_G2, AArch64MCExpr::VK_ABS_G2_S, + AArch64MCExpr::VK_ABS_G2_NC, AArch64MCExpr::VK_PREL_G2, + AArch64MCExpr::VK_PREL_G2_NC, AArch64MCExpr::VK_TPREL_G2, + AArch64MCExpr::VK_DTPREL_G2}); } - bool isMovKSymbolG1() const { - return isMovWSymbol({AArch64MCExpr::VK_ABS_G1_NC, - AArch64MCExpr::VK_TPREL_G1_NC, - AArch64MCExpr::VK_DTPREL_G1_NC}); + bool isMovWSymbolG1() const { + return isMovWSymbol( + {AArch64MCExpr::VK_ABS_G1, AArch64MCExpr::VK_ABS_G1_S, + AArch64MCExpr::VK_ABS_G1_NC, AArch64MCExpr::VK_PREL_G1, + AArch64MCExpr::VK_PREL_G1_NC, AArch64MCExpr::VK_GOTTPREL_G1, + AArch64MCExpr::VK_TPREL_G1, AArch64MCExpr::VK_TPREL_G1_NC, + AArch64MCExpr::VK_DTPREL_G1, AArch64MCExpr::VK_DTPREL_G1_NC}); } - bool isMovKSymbolG0() const { + bool isMovWSymbolG0() const { return isMovWSymbol( - {AArch64MCExpr::VK_ABS_G0_NC, AArch64MCExpr::VK_GOTTPREL_G0_NC, - AArch64MCExpr::VK_TPREL_G0_NC, AArch64MCExpr::VK_DTPREL_G0_NC}); + {AArch64MCExpr::VK_ABS_G0, AArch64MCExpr::VK_ABS_G0_S, + AArch64MCExpr::VK_ABS_G0_NC, AArch64MCExpr::VK_PREL_G0, + AArch64MCExpr::VK_PREL_G0_NC, AArch64MCExpr::VK_GOTTPREL_G0_NC, + AArch64MCExpr::VK_TPREL_G0, AArch64MCExpr::VK_TPREL_G0_NC, + AArch64MCExpr::VK_DTPREL_G0, AArch64MCExpr::VK_DTPREL_G0_NC}); } template<int RegWidth, int Shift> @@ -1814,7 +1800,7 @@ public: static std::unique_ptr<AArch64Operand> CreateToken(StringRef Str, bool IsSuffix, SMLoc S, MCContext &Ctx) { - auto Op = make_unique<AArch64Operand>(k_Token, Ctx); + auto Op = std::make_unique<AArch64Operand>(k_Token, Ctx); Op->Tok.Data = Str.data(); Op->Tok.Length = Str.size(); Op->Tok.IsSuffix = IsSuffix; @@ -1829,7 +1815,7 @@ public: AArch64_AM::ShiftExtendType ExtTy = AArch64_AM::LSL, unsigned ShiftAmount = 0, unsigned HasExplicitAmount = false) { - auto Op = make_unique<AArch64Operand>(k_Register, Ctx); + auto Op = std::make_unique<AArch64Operand>(k_Register, Ctx); Op->Reg.RegNum = RegNum; Op->Reg.Kind = Kind; Op->Reg.ElementWidth = 0; @@ -1861,7 +1847,7 @@ public: CreateVectorList(unsigned RegNum, unsigned Count, unsigned NumElements, unsigned ElementWidth, RegKind RegisterKind, SMLoc S, SMLoc E, MCContext &Ctx) { - auto Op = make_unique<AArch64Operand>(k_VectorList, Ctx); + auto Op = std::make_unique<AArch64Operand>(k_VectorList, Ctx); Op->VectorList.RegNum = RegNum; Op->VectorList.Count = Count; Op->VectorList.NumElements = NumElements; @@ -1874,7 +1860,7 @@ public: static std::unique_ptr<AArch64Operand> CreateVectorIndex(unsigned Idx, SMLoc S, SMLoc E, MCContext &Ctx) { - auto Op = make_unique<AArch64Operand>(k_VectorIndex, Ctx); + auto Op = std::make_unique<AArch64Operand>(k_VectorIndex, Ctx); Op->VectorIndex.Val = Idx; Op->StartLoc = S; Op->EndLoc = E; @@ -1883,7 +1869,7 @@ public: static std::unique_ptr<AArch64Operand> CreateImm(const MCExpr *Val, SMLoc S, SMLoc E, MCContext &Ctx) { - auto Op = make_unique<AArch64Operand>(k_Immediate, Ctx); + auto Op = std::make_unique<AArch64Operand>(k_Immediate, Ctx); Op->Imm.Val = Val; Op->StartLoc = S; Op->EndLoc = E; @@ -1894,7 +1880,7 @@ public: unsigned 
ShiftAmount, SMLoc S, SMLoc E, MCContext &Ctx) { - auto Op = make_unique<AArch64Operand>(k_ShiftedImm, Ctx); + auto Op = std::make_unique<AArch64Operand>(k_ShiftedImm, Ctx); Op->ShiftedImm .Val = Val; Op->ShiftedImm.ShiftAmount = ShiftAmount; Op->StartLoc = S; @@ -1904,7 +1890,7 @@ public: static std::unique_ptr<AArch64Operand> CreateCondCode(AArch64CC::CondCode Code, SMLoc S, SMLoc E, MCContext &Ctx) { - auto Op = make_unique<AArch64Operand>(k_CondCode, Ctx); + auto Op = std::make_unique<AArch64Operand>(k_CondCode, Ctx); Op->CondCode.Code = Code; Op->StartLoc = S; Op->EndLoc = E; @@ -1913,7 +1899,7 @@ public: static std::unique_ptr<AArch64Operand> CreateFPImm(APFloat Val, bool IsExact, SMLoc S, MCContext &Ctx) { - auto Op = make_unique<AArch64Operand>(k_FPImm, Ctx); + auto Op = std::make_unique<AArch64Operand>(k_FPImm, Ctx); Op->FPImm.Val = Val.bitcastToAPInt().getSExtValue(); Op->FPImm.IsExact = IsExact; Op->StartLoc = S; @@ -1925,7 +1911,7 @@ public: StringRef Str, SMLoc S, MCContext &Ctx) { - auto Op = make_unique<AArch64Operand>(k_Barrier, Ctx); + auto Op = std::make_unique<AArch64Operand>(k_Barrier, Ctx); Op->Barrier.Val = Val; Op->Barrier.Data = Str.data(); Op->Barrier.Length = Str.size(); @@ -1939,7 +1925,7 @@ public: uint32_t MSRReg, uint32_t PStateField, MCContext &Ctx) { - auto Op = make_unique<AArch64Operand>(k_SysReg, Ctx); + auto Op = std::make_unique<AArch64Operand>(k_SysReg, Ctx); Op->SysReg.Data = Str.data(); Op->SysReg.Length = Str.size(); Op->SysReg.MRSReg = MRSReg; @@ -1952,7 +1938,7 @@ public: static std::unique_ptr<AArch64Operand> CreateSysCR(unsigned Val, SMLoc S, SMLoc E, MCContext &Ctx) { - auto Op = make_unique<AArch64Operand>(k_SysCR, Ctx); + auto Op = std::make_unique<AArch64Operand>(k_SysCR, Ctx); Op->SysCRImm.Val = Val; Op->StartLoc = S; Op->EndLoc = E; @@ -1963,7 +1949,7 @@ public: StringRef Str, SMLoc S, MCContext &Ctx) { - auto Op = make_unique<AArch64Operand>(k_Prefetch, Ctx); + auto Op = std::make_unique<AArch64Operand>(k_Prefetch, Ctx); Op->Prefetch.Val = Val; Op->Barrier.Data = Str.data(); Op->Barrier.Length = Str.size(); @@ -1976,7 +1962,7 @@ public: StringRef Str, SMLoc S, MCContext &Ctx) { - auto Op = make_unique<AArch64Operand>(k_PSBHint, Ctx); + auto Op = std::make_unique<AArch64Operand>(k_PSBHint, Ctx); Op->PSBHint.Val = Val; Op->PSBHint.Data = Str.data(); Op->PSBHint.Length = Str.size(); @@ -1989,7 +1975,7 @@ public: StringRef Str, SMLoc S, MCContext &Ctx) { - auto Op = make_unique<AArch64Operand>(k_BTIHint, Ctx); + auto Op = std::make_unique<AArch64Operand>(k_BTIHint, Ctx); Op->BTIHint.Val = Val << 1 | 32; Op->BTIHint.Data = Str.data(); Op->BTIHint.Length = Str.size(); @@ -2001,7 +1987,7 @@ public: static std::unique_ptr<AArch64Operand> CreateShiftExtend(AArch64_AM::ShiftExtendType ShOp, unsigned Val, bool HasExplicitAmount, SMLoc S, SMLoc E, MCContext &Ctx) { - auto Op = make_unique<AArch64Operand>(k_ShiftExtend, Ctx); + auto Op = std::make_unique<AArch64Operand>(k_ShiftExtend, Ctx); Op->ShiftExtend.Type = ShOp; Op->ShiftExtend.Amount = Val; Op->ShiftExtend.HasExplicitAmount = HasExplicitAmount; @@ -2840,7 +2826,7 @@ static const struct Extension { {"sve2-aes", {AArch64::FeatureSVE2AES}}, {"sve2-sm4", {AArch64::FeatureSVE2SM4}}, {"sve2-sha3", {AArch64::FeatureSVE2SHA3}}, - {"bitperm", {AArch64::FeatureSVE2BitPerm}}, + {"sve2-bitperm", {AArch64::FeatureSVE2BitPerm}}, // FIXME: Unsupported extensions {"pan", {}}, {"lor", {}}, @@ -3260,6 +3246,13 @@ bool AArch64AsmParser::parseSymbolicImmVal(const MCExpr *&ImmVal) { .Case("abs_g0", 
AArch64MCExpr::VK_ABS_G0) .Case("abs_g0_s", AArch64MCExpr::VK_ABS_G0_S) .Case("abs_g0_nc", AArch64MCExpr::VK_ABS_G0_NC) + .Case("prel_g3", AArch64MCExpr::VK_PREL_G3) + .Case("prel_g2", AArch64MCExpr::VK_PREL_G2) + .Case("prel_g2_nc", AArch64MCExpr::VK_PREL_G2_NC) + .Case("prel_g1", AArch64MCExpr::VK_PREL_G1) + .Case("prel_g1_nc", AArch64MCExpr::VK_PREL_G1_NC) + .Case("prel_g0", AArch64MCExpr::VK_PREL_G0) + .Case("prel_g0_nc", AArch64MCExpr::VK_PREL_G0_NC) .Case("dtprel_g2", AArch64MCExpr::VK_DTPREL_G2) .Case("dtprel_g1", AArch64MCExpr::VK_DTPREL_G1) .Case("dtprel_g1_nc", AArch64MCExpr::VK_DTPREL_G1_NC) @@ -5283,7 +5276,7 @@ bool AArch64AsmParser::parseDirectiveInst(SMLoc Loc) { auto parseOp = [&]() -> bool { SMLoc L = getLoc(); - const MCExpr *Expr; + const MCExpr *Expr = nullptr; if (check(getParser().parseExpression(Expr), L, "expected expression")) return true; const MCConstantExpr *Value = dyn_cast_or_null<MCConstantExpr>(Expr); @@ -5542,43 +5535,43 @@ unsigned AArch64AsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp, switch (Kind) { default: return Match_InvalidOperand; - case MCK__35_0: + case MCK__HASH_0: ExpectedVal = 0; break; - case MCK__35_1: + case MCK__HASH_1: ExpectedVal = 1; break; - case MCK__35_12: + case MCK__HASH_12: ExpectedVal = 12; break; - case MCK__35_16: + case MCK__HASH_16: ExpectedVal = 16; break; - case MCK__35_2: + case MCK__HASH_2: ExpectedVal = 2; break; - case MCK__35_24: + case MCK__HASH_24: ExpectedVal = 24; break; - case MCK__35_3: + case MCK__HASH_3: ExpectedVal = 3; break; - case MCK__35_32: + case MCK__HASH_32: ExpectedVal = 32; break; - case MCK__35_4: + case MCK__HASH_4: ExpectedVal = 4; break; - case MCK__35_48: + case MCK__HASH_48: ExpectedVal = 48; break; - case MCK__35_6: + case MCK__HASH_6: ExpectedVal = 6; break; - case MCK__35_64: + case MCK__HASH_64: ExpectedVal = 64; break; - case MCK__35_8: + case MCK__HASH_8: ExpectedVal = 8; break; } diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp index 6418211a4f55..21ce5785ea5e 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp @@ -153,9 +153,8 @@ static unsigned AdrImmBits(unsigned Value) { static uint64_t adjustFixupValue(const MCFixup &Fixup, const MCValue &Target, uint64_t Value, MCContext &Ctx, const Triple &TheTriple, bool IsResolved) { - unsigned Kind = Fixup.getKind(); int64_t SignedValue = static_cast<int64_t>(Value); - switch (Kind) { + switch (Fixup.getTargetKind()) { default: llvm_unreachable("Unknown fixup kind!"); case AArch64::fixup_aarch64_pcrel_adr_imm21: @@ -574,7 +573,7 @@ public: case MCCFIInstruction::OpDefCfa: { // Defines a frame pointer. unsigned XReg = - getXRegFromWReg(MRI.getLLVMRegNum(Inst.getRegister(), true)); + getXRegFromWReg(*MRI.getLLVMRegNum(Inst.getRegister(), true)); // Other CFA registers than FP are not supported by compact unwind. // Fallback on DWARF. @@ -593,8 +592,8 @@ public: assert(FPPush.getOperation() == MCCFIInstruction::OpOffset && "Frame pointer not pushed!"); - unsigned LRReg = MRI.getLLVMRegNum(LRPush.getRegister(), true); - unsigned FPReg = MRI.getLLVMRegNum(FPPush.getRegister(), true); + unsigned LRReg = *MRI.getLLVMRegNum(LRPush.getRegister(), true); + unsigned FPReg = *MRI.getLLVMRegNum(FPPush.getRegister(), true); LRReg = getXRegFromWReg(LRReg); FPReg = getXRegFromWReg(FPReg); @@ -615,14 +614,14 @@ public: case MCCFIInstruction::OpOffset: { // Registers are saved in pairs. 
We expect there to be two consecutive // `.cfi_offset' instructions with the appropriate registers specified. - unsigned Reg1 = MRI.getLLVMRegNum(Inst.getRegister(), true); + unsigned Reg1 = *MRI.getLLVMRegNum(Inst.getRegister(), true); if (i + 1 == e) return CU::UNWIND_ARM64_MODE_DWARF; const MCCFIInstruction &Inst2 = Instrs[++i]; if (Inst2.getOperation() != MCCFIInstruction::OpOffset) return CU::UNWIND_ARM64_MODE_DWARF; - unsigned Reg2 = MRI.getLLVMRegNum(Inst2.getRegister(), true); + unsigned Reg2 = *MRI.getLLVMRegNum(Inst2.getRegister(), true); // N.B. The encodings must be in register number order, and the X // registers before the D registers. diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp index c871e2c62eac..0fd1ca187be7 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp @@ -57,7 +57,7 @@ AArch64ELFObjectWriter::AArch64ELFObjectWriter(uint8_t OSABI, bool IsILP32) static bool isNonILP32reloc(const MCFixup &Fixup, AArch64MCExpr::VariantKind RefKind, MCContext &Ctx) { - if ((unsigned)Fixup.getKind() != AArch64::fixup_aarch64_movw) + if (Fixup.getTargetKind() != AArch64::fixup_aarch64_movw) return false; switch (RefKind) { case AArch64MCExpr::VK_ABS_G3: @@ -120,7 +120,7 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx, "Should only be expression-level modifiers here"); if (IsPCRel) { - switch ((unsigned)Fixup.getKind()) { + switch (Fixup.getTargetKind()) { case FK_Data_1: Ctx.reportError(Fixup.getLoc(), "1-byte data relocations not supported"); return ELF::R_AARCH64_NONE; @@ -184,7 +184,7 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx, } else { if (IsILP32 && isNonILP32reloc(Fixup, RefKind, Ctx)) return ELF::R_AARCH64_NONE; - switch ((unsigned)Fixup.getKind()) { + switch (Fixup.getTargetKind()) { case FK_NONE: return ELF::R_AARCH64_NONE; case FK_Data_1: @@ -394,6 +394,20 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx, return R_CLS(MOVW_SABS_G0); if (RefKind == AArch64MCExpr::VK_ABS_G0_NC) return R_CLS(MOVW_UABS_G0_NC); + if (RefKind == AArch64MCExpr::VK_PREL_G3) + return ELF::R_AARCH64_MOVW_PREL_G3; + if (RefKind == AArch64MCExpr::VK_PREL_G2) + return ELF::R_AARCH64_MOVW_PREL_G2; + if (RefKind == AArch64MCExpr::VK_PREL_G2_NC) + return ELF::R_AARCH64_MOVW_PREL_G2_NC; + if (RefKind == AArch64MCExpr::VK_PREL_G1) + return R_CLS(MOVW_PREL_G1); + if (RefKind == AArch64MCExpr::VK_PREL_G1_NC) + return ELF::R_AARCH64_MOVW_PREL_G1_NC; + if (RefKind == AArch64MCExpr::VK_PREL_G0) + return R_CLS(MOVW_PREL_G0); + if (RefKind == AArch64MCExpr::VK_PREL_G0_NC) + return R_CLS(MOVW_PREL_G0_NC); if (RefKind == AArch64MCExpr::VK_DTPREL_G2) return ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G2; if (RefKind == AArch64MCExpr::VK_DTPREL_G1) @@ -434,5 +448,5 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx, std::unique_ptr<MCObjectTargetWriter> llvm::createAArch64ELFObjectWriter(uint8_t OSABI, bool IsILP32) { - return llvm::make_unique<AArch64ELFObjectWriter>(OSABI, IsILP32); + return std::make_unique<AArch64ELFObjectWriter>(OSABI, IsILP32); } diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp index d0a544273b8b..1a16468484ad 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp @@ -172,7 +172,8 @@ void AArch64InstPrinter::printInst(const MCInst 
*MI, raw_ostream &O, int ImmS = MI->getOperand(4).getImm(); if ((Op2.getReg() == AArch64::WZR || Op2.getReg() == AArch64::XZR) && - (ImmR == 0 || ImmS < ImmR)) { + (ImmR == 0 || ImmS < ImmR) && + STI.getFeatureBits()[AArch64::HasV8_2aOps]) { // BFC takes precedence over its entire range, slightly differently to BFI. int BitWidth = Opcode == AArch64::BFMXri ? 64 : 32; int LSB = (BitWidth - ImmR) % BitWidth; diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp index ecff1ab0a8b3..5926a4f81616 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp @@ -30,7 +30,7 @@ static cl::opt<AsmWriterVariantTy> AsmWriterVariant( cl::values(clEnumValN(Generic, "generic", "Emit generic NEON assembly"), clEnumValN(Apple, "apple", "Emit Apple-style NEON assembly"))); -AArch64MCAsmInfoDarwin::AArch64MCAsmInfoDarwin() { +AArch64MCAsmInfoDarwin::AArch64MCAsmInfoDarwin(bool IsILP32) { // We prefer NEON instructions to be printed in the short, Apple-specific // form when targeting Darwin. AssemblerDialect = AsmWriterVariant == Default ? Apple : AsmWriterVariant; @@ -39,7 +39,8 @@ AArch64MCAsmInfoDarwin::AArch64MCAsmInfoDarwin() { PrivateLabelPrefix = "L"; SeparatorString = "%%"; CommentString = ";"; - CodePointerSize = CalleeSaveStackSlotSize = 8; + CalleeSaveStackSlotSize = 8; + CodePointerSize = IsILP32 ? 4 : 8; AlignmentIsInBytes = false; UsesELFSectionDirectiveForBSS = true; diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h index 36ae92afc8c1..7274ae79f74a 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h @@ -23,7 +23,7 @@ class Target; class Triple; struct AArch64MCAsmInfoDarwin : public MCAsmInfoDarwin { - explicit AArch64MCAsmInfoDarwin(); + explicit AArch64MCAsmInfoDarwin(bool IsILP32); const MCExpr * getExprForPersonalitySymbol(const MCSymbol *Sym, unsigned Encoding, MCStreamer &Streamer) const override; diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp index 0a529321edc8..548e399e05a3 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp @@ -42,6 +42,13 @@ StringRef AArch64MCExpr::getVariantKindName() const { case VK_ABS_G0: return ":abs_g0:"; case VK_ABS_G0_S: return ":abs_g0_s:"; case VK_ABS_G0_NC: return ":abs_g0_nc:"; + case VK_PREL_G3: return ":prel_g3:"; + case VK_PREL_G2: return ":prel_g2:"; + case VK_PREL_G2_NC: return ":prel_g2_nc:"; + case VK_PREL_G1: return ":prel_g1:"; + case VK_PREL_G1_NC: return ":prel_g1_nc:"; + case VK_PREL_G0: return ":prel_g0:"; + case VK_PREL_G0_NC: return ":prel_g0_nc:"; case VK_DTPREL_G2: return ":dtprel_g2:"; case VK_DTPREL_G1: return ":dtprel_g1:"; case VK_DTPREL_G1_NC: return ":dtprel_g1_nc:"; diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h index ec9c95911628..a82ff2e91426 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h @@ -27,12 +27,13 @@ public: // symbol. E.g. direct, via the GOT, ...
VK_ABS = 0x001, VK_SABS = 0x002, - VK_GOT = 0x003, - VK_DTPREL = 0x004, - VK_GOTTPREL = 0x005, - VK_TPREL = 0x006, - VK_TLSDESC = 0x007, - VK_SECREL = 0x008, + VK_PREL = 0x003, + VK_GOT = 0x004, + VK_DTPREL = 0x005, + VK_GOTTPREL = 0x006, + VK_TPREL = 0x007, + VK_TLSDESC = 0x008, + VK_SECREL = 0x009, VK_SymLocBits = 0x00f, // Variants specifying which part of the final address calculation is @@ -72,6 +73,13 @@ public: VK_ABS_G0_S = VK_SABS | VK_G0, VK_ABS_G0_NC = VK_ABS | VK_G0 | VK_NC, VK_LO12 = VK_ABS | VK_PAGEOFF | VK_NC, + VK_PREL_G3 = VK_PREL | VK_G3, + VK_PREL_G2 = VK_PREL | VK_G2, + VK_PREL_G2_NC = VK_PREL | VK_G2 | VK_NC, + VK_PREL_G1 = VK_PREL | VK_G1, + VK_PREL_G1_NC = VK_PREL | VK_G1 | VK_NC, + VK_PREL_G0 = VK_PREL | VK_G0, + VK_PREL_G0_NC = VK_PREL | VK_G0 | VK_NC, VK_GOT_LO12 = VK_GOT | VK_PAGEOFF | VK_NC, VK_GOT_PAGE = VK_GOT | VK_PAGE, VK_DTPREL_G2 = VK_DTPREL | VK_G2, diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp index df12274d9470..1d583ec0087b 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp @@ -241,7 +241,7 @@ static MCAsmInfo *createAArch64MCAsmInfo(const MCRegisterInfo &MRI, const Triple &TheTriple) { MCAsmInfo *MAI; if (TheTriple.isOSBinFormatMachO()) - MAI = new AArch64MCAsmInfoDarwin(); + MAI = new AArch64MCAsmInfoDarwin(TheTriple.getArch() == Triple::aarch64_32); else if (TheTriple.isWindowsMSVCEnvironment()) MAI = new AArch64MCAsmInfoMicrosoftCOFF(); else if (TheTriple.isOSBinFormatCOFF()) diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp index b3ce5ef22eef..fc04d37eb362 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp @@ -54,7 +54,7 @@ bool AArch64MachObjectWriter::getAArch64FixupKindMachOInfo( RelocType = unsigned(MachO::ARM64_RELOC_UNSIGNED); Log2Size = ~0U; - switch ((unsigned)Fixup.getKind()) { + switch (Fixup.getTargetKind()) { default: return false; @@ -406,6 +406,6 @@ void AArch64MachObjectWriter::recordRelocation( std::unique_ptr<MCObjectTargetWriter> llvm::createAArch64MachObjectWriter(uint32_t CPUType, uint32_t CPUSubtype, bool IsILP32) { - return llvm::make_unique<AArch64MachObjectWriter>(CPUType, CPUSubtype, + return std::make_unique<AArch64MachObjectWriter>(CPUType, CPUSubtype, IsILP32); } diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp index a45880a07427..aa50bd05cb71 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp @@ -120,7 +120,7 @@ bool AArch64WinCOFFObjectWriter::recordRelocation(const MCFixup &Fixup) const { namespace llvm { std::unique_ptr<MCObjectTargetWriter> createAArch64WinCOFFObjectWriter() { - return llvm::make_unique<AArch64WinCOFFObjectWriter>(); + return std::make_unique<AArch64WinCOFFObjectWriter>(); } } // end namespace llvm diff --git a/lib/Target/AArch64/SVEInstrFormats.td b/lib/Target/AArch64/SVEInstrFormats.td index 808e59467081..8ccf6aa675ba 100644 --- a/lib/Target/AArch64/SVEInstrFormats.td +++ b/lib/Target/AArch64/SVEInstrFormats.td @@ -279,6 +279,19 @@ let Predicates = [HasSVE] in { defm PTRUES : sve_int_ptrue<0b001, "ptrues">; } 
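Before the SVE material below, one more note on the AArch64MCExpr.h renumbering above: a variant kind is a small bitfield, with the low nibble (VK_SymLocBits) holding the symbol-location kind, now including VK_PREL, and higher bits adding the G0..G3 address fragment and the no-check flag. In the sketch, VK_PREL and VK_SymLocBits are copied from the hunk; VK_G1 and VK_NC are assumed values following the same scheme:

    #include <cassert>

    enum VariantKind : unsigned {
      VK_PREL = 0x003,        // from the patch
      VK_SymLocBits = 0x00f,  // from the patch
      VK_G1 = 0x050,          // assumed address-fragment encoding
      VK_NC = 0x100,          // assumed no-check flag
      VK_PREL_G1_NC = VK_PREL | VK_G1 | VK_NC, // composition as in the patch
    };

    int main() {
      assert((VK_PREL_G1_NC & VK_SymLocBits) == VK_PREL); // pc-relative
      assert((VK_PREL_G1_NC & VK_NC) != 0);               // unchecked variant
    }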
+//===----------------------------------------------------------------------===// +// SVE pattern match helpers. +//===----------------------------------------------------------------------===// + +class SVE_1_Op_Pat<ValueType vtd, SDPatternOperator op, ValueType vt1, + Instruction inst> +: Pat<(vtd (op vt1:$Op1)), + (inst $Op1)>; + +class SVE_3_Op_Pat<ValueType vtd, SDPatternOperator op, ValueType vt1, + ValueType vt2, ValueType vt3, Instruction inst> +: Pat<(vtd (op vt1:$Op1, vt2:$Op2, vt3:$Op3)), + (inst $Op1, $Op2, $Op3)>; //===----------------------------------------------------------------------===// // SVE Predicate Misc Group @@ -403,12 +416,12 @@ multiclass sve_int_count_r_x64<bits<5> opc, string asm> { } class sve_int_count_v<bits<2> sz8_64, bits<5> opc, string asm, - ZPRRegOp zprty> -: I<(outs zprty:$Zdn), (ins zprty:$_Zdn, PPRAny:$Pg), - asm, "\t$Zdn, $Pg", + ZPRRegOp zprty, PPRRegOp pprty> +: I<(outs zprty:$Zdn), (ins zprty:$_Zdn, pprty:$Pm), + asm, "\t$Zdn, $Pm", "", []>, Sched<[]> { - bits<4> Pg; + bits<4> Pm; bits<5> Zdn; let Inst{31-24} = 0b00100101; let Inst{23-22} = sz8_64; @@ -416,7 +429,7 @@ class sve_int_count_v<bits<2> sz8_64, bits<5> opc, string asm, let Inst{18-16} = opc{4-2}; let Inst{15-11} = 0b10000; let Inst{10-9} = opc{1-0}; - let Inst{8-5} = Pg; + let Inst{8-5} = Pm; let Inst{4-0} = Zdn; let Constraints = "$Zdn = $_Zdn"; @@ -425,9 +438,16 @@ class sve_int_count_v<bits<2> sz8_64, bits<5> opc, string asm, } multiclass sve_int_count_v<bits<5> opc, string asm> { - def _H : sve_int_count_v<0b01, opc, asm, ZPR16>; - def _S : sve_int_count_v<0b10, opc, asm, ZPR32>; - def _D : sve_int_count_v<0b11, opc, asm, ZPR64>; + def _H : sve_int_count_v<0b01, opc, asm, ZPR16, PPR16>; + def _S : sve_int_count_v<0b10, opc, asm, ZPR32, PPR32>; + def _D : sve_int_count_v<0b11, opc, asm, ZPR64, PPR64>; + + def : InstAlias<asm # "\t$Zdn, $Pm", + (!cast<Instruction>(NAME # "_H") ZPR16:$Zdn, PPRAny:$Pm), 0>; + def : InstAlias<asm # "\t$Zdn, $Pm", + (!cast<Instruction>(NAME # "_S") ZPR32:$Zdn, PPRAny:$Pm), 0>; + def : InstAlias<asm # "\t$Zdn, $Pm", + (!cast<Instruction>(NAME # "_D") ZPR64:$Zdn, PPRAny:$Pm), 0>; } class sve_int_pcount_pred<bits<2> sz8_64, bits<4> opc, string asm, @@ -609,11 +629,12 @@ multiclass sve_int_pred_pattern_b_x64<bits<5> opc, string asm> { //===----------------------------------------------------------------------===// class sve_int_perm_dup_r<bits<2> sz8_64, string asm, ZPRRegOp zprty, - RegisterClass srcRegType> + ValueType vt, RegisterClass srcRegType, + SDPatternOperator op> : I<(outs zprty:$Zd), (ins srcRegType:$Rn), asm, "\t$Zd, $Rn", "", - []>, Sched<[]> { + [(set (vt zprty:$Zd), (op srcRegType:$Rn))]>, Sched<[]> { bits<5> Rn; bits<5> Zd; let Inst{31-24} = 0b00000101; @@ -623,11 +644,11 @@ class sve_int_perm_dup_r<bits<2> sz8_64, string asm, ZPRRegOp zprty, let Inst{4-0} = Zd; } -multiclass sve_int_perm_dup_r<string asm> { - def _B : sve_int_perm_dup_r<0b00, asm, ZPR8, GPR32sp>; - def _H : sve_int_perm_dup_r<0b01, asm, ZPR16, GPR32sp>; - def _S : sve_int_perm_dup_r<0b10, asm, ZPR32, GPR32sp>; - def _D : sve_int_perm_dup_r<0b11, asm, ZPR64, GPR64sp>; +multiclass sve_int_perm_dup_r<string asm, SDPatternOperator op> { + def _B : sve_int_perm_dup_r<0b00, asm, ZPR8, nxv16i8, GPR32sp, op>; + def _H : sve_int_perm_dup_r<0b01, asm, ZPR16, nxv8i16, GPR32sp, op>; + def _S : sve_int_perm_dup_r<0b10, asm, ZPR32, nxv4i32, GPR32sp, op>; + def _D : sve_int_perm_dup_r<0b11, asm, ZPR64, nxv2i64, GPR64sp, op>; def : InstAlias<"mov $Zd, $Rn", (!cast<Instruction>(NAME # _B) 
@@ -744,7 +765,7 @@ multiclass sve2_int_perm_tbl<string asm> {
}

class sve2_int_perm_tbx<bits<2> sz8_64, string asm, ZPRRegOp zprty>
-: I<(outs zprty:$Zd), (ins zprty:$Zn, zprty:$Zm),
+: I<(outs zprty:$Zd), (ins zprty:$_Zd, zprty:$Zn, zprty:$Zm),
  asm, "\t$Zd, $Zn, $Zm",
  "",
  []>, Sched<[]> {
@@ -758,6 +779,8 @@ class sve2_int_perm_tbx<bits<2> sz8_64, string asm, ZPRRegOp zprty>
  let Inst{15-10} = 0b001011;
  let Inst{9-5} = Zn;
  let Inst{4-0} = Zd;
+
+  let Constraints = "$Zd = $_Zd";
}

multiclass sve2_int_perm_tbx<string asm> {
@@ -826,10 +849,14 @@ class sve_int_perm_unpk<bits<2> sz16_64, bits<2> opc, string asm,
  let Inst{4-0} = Zd;
}

-multiclass sve_int_perm_unpk<bits<2> opc, string asm> {
+multiclass sve_int_perm_unpk<bits<2> opc, string asm, SDPatternOperator op> {
  def _H : sve_int_perm_unpk<0b01, opc, asm, ZPR16, ZPR8>;
  def _S : sve_int_perm_unpk<0b10, opc, asm, ZPR32, ZPR16>;
  def _D : sve_int_perm_unpk<0b11, opc, asm, ZPR64, ZPR32>;
+
+  def : SVE_1_Op_Pat<nxv8i16, op, nxv16i8, !cast<Instruction>(NAME # _H)>;
+  def : SVE_1_Op_Pat<nxv4i32, op, nxv8i16, !cast<Instruction>(NAME # _S)>;
+  def : SVE_1_Op_Pat<nxv2i64, op, nxv4i32, !cast<Instruction>(NAME # _D)>;
}

class sve_int_perm_insrs<bits<2> sz8_64, string asm, ZPRRegOp zprty,
@@ -1197,10 +1224,12 @@ multiclass sve_fp_ftmad<string asm> {
//===----------------------------------------------------------------------===//

class sve_fp_3op_u_zd<bits<2> sz, bits<3> opc, string asm,
-                      ZPRRegOp zprty>
+                      ZPRRegOp zprty,
+                      ValueType vt, ValueType vt2, SDPatternOperator op>
: I<(outs zprty:$Zd), (ins zprty:$Zn, zprty:$Zm),
  asm, "\t$Zd, $Zn, $Zm",
-  "", []>, Sched<[]> {
+  "",
+  [(set (vt zprty:$Zd), (op (vt zprty:$Zn), (vt2 zprty:$Zm)))]>, Sched<[]> {
  bits<5> Zd;
  bits<5> Zm;
  bits<5> Zn;
@@ -1214,10 +1243,10 @@ class sve_fp_3op_u_zd<bits<2> sz, bits<3> opc, string asm,
  let Inst{4-0} = Zd;
}

-multiclass sve_fp_3op_u_zd<bits<3> opc, string asm> {
-  def _H : sve_fp_3op_u_zd<0b01, opc, asm, ZPR16>;
-  def _S : sve_fp_3op_u_zd<0b10, opc, asm, ZPR32>;
-  def _D : sve_fp_3op_u_zd<0b11, opc, asm, ZPR64>;
+multiclass sve_fp_3op_u_zd<bits<3> opc, string asm, SDPatternOperator op> {
+  def _H : sve_fp_3op_u_zd<0b01, opc, asm, ZPR16, nxv8f16, nxv8f16, op>;
+  def _S : sve_fp_3op_u_zd<0b10, opc, asm, ZPR32, nxv4f32, nxv4f32, op>;
+  def _D : sve_fp_3op_u_zd<0b11, opc, asm, ZPR64, nxv2f64, nxv2f64, op>;
}

//===----------------------------------------------------------------------===//
@@ -1489,7 +1518,7 @@ multiclass sve_fp_fcadd<string asm> {

class sve2_fp_convert_precision<bits<4> opc, string asm,
                                ZPRRegOp zprty1, ZPRRegOp zprty2>
-: I<(outs zprty1:$Zd), (ins PPR3bAny:$Pg, zprty2:$Zn),
+: I<(outs zprty1:$Zd), (ins zprty1:$_Zd, PPR3bAny:$Pg, zprty2:$Zn),
  asm, "\t$Zd, $Pg/m, $Zn",
  "",
  []>, Sched<[]> {
@@ -1504,6 +1533,8 @@ class sve2_fp_convert_precision<bits<4> opc, string asm,
  let Inst{12-10} = Pg;
  let Inst{9-5} = Zn;
  let Inst{4-0} = Zd;
+
+  let Constraints = "$Zd = $_Zd";
}

multiclass sve2_fp_convert_down_narrow<string asm> {
@@ -1998,12 +2029,14 @@ class sve_intx_dot<bit sz, bit U, string asm, ZPRRegOp zprty1,

  let Constraints = "$Zda = $_Zda";
  let DestructiveInstType = Destructive;
-  let ElementSize = zprty1.ElementSize;
}

-multiclass sve_intx_dot<bit opc, string asm> {
+multiclass sve_intx_dot<bit opc, string asm, SDPatternOperator op> {
  def _S : sve_intx_dot<0b0, opc, asm, ZPR32, ZPR8>;
  def _D : sve_intx_dot<0b1, opc, asm, ZPR64, ZPR16>;
+
+  def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _S)>;
+  def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _D)>;
}
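Usage sketch, not part of this patch: a consumer of the updated multiclass passes the dot-product operator along with the mnemonic. The intrinsic names int_aarch64_sve_sdot/udot are assumptions about code outside this diff.

// Hypothetical instantiations: each defm expands to the _S/_D encodings
// plus the two SVE_3_Op_Pat patterns declared in the multiclass above.
defm SDOT_ZZZ : sve_intx_dot<0b0, "sdot", int_aarch64_sve_sdot>;
defm UDOT_ZZZ : sve_intx_dot<0b1, "udot", int_aarch64_sve_udot>;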
//===----------------------------------------------------------------------===//
@@ -2028,22 +2061,27 @@ class sve_intx_dot_by_indexed_elem<bit sz, bit U, string asm,

  let Constraints = "$Zda = $_Zda";
  let DestructiveInstType = Destructive;
-  let ElementSize = ElementSizeNone;
}

-multiclass sve_intx_dot_by_indexed_elem<bit opc, string asm> {
-  def _S : sve_intx_dot_by_indexed_elem<0b0, opc, asm, ZPR32, ZPR8, ZPR3b8, VectorIndexS> {
+multiclass sve_intx_dot_by_indexed_elem<bit opc, string asm,
+                                        SDPatternOperator op> {
+  def _S : sve_intx_dot_by_indexed_elem<0b0, opc, asm, ZPR32, ZPR8, ZPR3b8, VectorIndexS32b> {
    bits<2> iop;
    bits<3> Zm;
    let Inst{20-19} = iop;
    let Inst{18-16} = Zm;
  }
-  def _D : sve_intx_dot_by_indexed_elem<0b1, opc, asm, ZPR64, ZPR16, ZPR4b16, VectorIndexD> {
+  def _D : sve_intx_dot_by_indexed_elem<0b1, opc, asm, ZPR64, ZPR16, ZPR4b16, VectorIndexD32b> {
    bits<1> iop;
    bits<4> Zm;
    let Inst{20} = iop;
    let Inst{19-16} = Zm;
  }
+
+  def : Pat<(nxv4i32 (op nxv4i32:$Op1, nxv16i8:$Op2, nxv16i8:$Op3, (i32 VectorIndexS32b:$idx))),
+            (!cast<Instruction>(NAME # _S) $Op1, $Op2, $Op3, VectorIndexS32b:$idx)>;
+  def : Pat<(nxv2i64 (op nxv2i64:$Op1, nxv8i16:$Op2, nxv8i16:$Op3, (i32 VectorIndexD32b:$idx))),
+            (!cast<Instruction>(NAME # _D) $Op1, $Op2, $Op3, VectorIndexD32b:$idx)>;
}

//===----------------------------------------------------------------------===//
@@ -2399,21 +2437,40 @@ multiclass sve2_misc_bitwise<bits<4> opc, string asm> {
  def _D : sve2_misc<0b11, opc, asm, ZPR64, ZPR64>;
}

-multiclass sve2_bitwise_xor_interleaved<bit opc, string asm> {
-  let DestructiveInstType = Destructive, ElementSize = ElementSizeNone in {
-    def _B : sve2_misc<0b00, { 0b010, opc }, asm, ZPR8, ZPR8>;
-    def _H : sve2_misc<0b01, { 0b010, opc }, asm, ZPR16, ZPR16>;
-    def _S : sve2_misc<0b10, { 0b010, opc }, asm, ZPR32, ZPR32>;
-    def _D : sve2_misc<0b11, { 0b010, opc }, asm, ZPR64, ZPR64>;
-  }
-}
-
multiclass sve2_misc_int_addsub_long_interleaved<bits<2> opc, string asm> {
  def _H : sve2_misc<0b01, { 0b00, opc }, asm, ZPR16, ZPR8>;
  def _S : sve2_misc<0b10, { 0b00, opc }, asm, ZPR32, ZPR16>;
  def _D : sve2_misc<0b11, { 0b00, opc }, asm, ZPR64, ZPR32>;
}

+class sve2_bitwise_xor_interleaved<bits<2> sz, bits<1> opc, string asm,
+                                   ZPRRegOp zprty1, ZPRRegOp zprty2>
+: I<(outs zprty1:$Zd), (ins zprty1:$_Zd, zprty2:$Zn, zprty2:$Zm),
+  asm, "\t$Zd, $Zn, $Zm", "", []>, Sched<[]> {
+  bits<5> Zd;
+  bits<5> Zn;
+  bits<5> Zm;
+  let Inst{31-24} = 0b01000101;
+  let Inst{23-22} = sz;
+  let Inst{21} = 0b0;
+  let Inst{20-16} = Zm;
+  let Inst{15-11} = 0b10010;
+  let Inst{10} = opc;
+  let Inst{9-5} = Zn;
+  let Inst{4-0} = Zd;
+
+  let Constraints = "$Zd = $_Zd";
+  let DestructiveInstType = Destructive;
+  let ElementSize = ElementSizeNone;
+}
+
+multiclass sve2_bitwise_xor_interleaved<bit opc, string asm> {
+  def _B : sve2_bitwise_xor_interleaved<0b00, opc, asm, ZPR8, ZPR8>;
+  def _H : sve2_bitwise_xor_interleaved<0b01, opc, asm, ZPR16, ZPR16>;
+  def _S : sve2_bitwise_xor_interleaved<0b10, opc, asm, ZPR32, ZPR32>;
+  def _D : sve2_bitwise_xor_interleaved<0b11, opc, asm, ZPR64, ZPR64>;
+}
+
class sve2_bitwise_shift_left_long<bits<3> tsz8_64, bits<2> opc, string asm,
                                   ZPRRegOp zprty1, ZPRRegOp zprty2,
                                   Operand immtype>
@@ -2451,9 +2508,9 @@ multiclass sve2_bitwise_shift_left_long<bits<2> opc, string asm> {
// SVE2 Accumulate Group
//===----------------------------------------------------------------------===//

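Usage sketch, not part of this patch: the interleaved-xor forms now have a dedicated class so the destination can be tied as a destructive operand. The defm names below follow the file's naming pattern but are assumptions.

// Hypothetical instantiations: bit 10 selects the bottom/top variant, and
// the tied $_Zd models the read-modify-write destination.
defm EORBT_ZZZ : sve2_bitwise_xor_interleaved<0b0, "eorbt">;
defm EORTB_ZZZ : sve2_bitwise_xor_interleaved<0b1, "eortb">;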
-class sve2_int_bin_cons_shift_imm<bits<4> tsz8_64, bit opc, string asm,
-                                  ZPRRegOp zprty, Operand immtype>
-: I<(outs zprty:$Zd), (ins zprty:$Zn, immtype:$imm),
+class sve2_int_bin_shift_imm<bits<4> tsz8_64, bit opc, string asm,
+                             ZPRRegOp zprty, Operand immtype>
+: I<(outs zprty:$Zd), (ins zprty:$_Zd, zprty:$Zn, immtype:$imm),
  asm, "\t$Zd, $Zn, $imm",
  "", []>, Sched<[]> {
  bits<5> Zd;
@@ -2468,38 +2525,40 @@ class sve2_int_bin_cons_shift_imm<bits<4> tsz8_64, bit opc, string asm,
  let Inst{10} = opc;
  let Inst{9-5} = Zn;
  let Inst{4-0} = Zd;
+
+  let Constraints = "$Zd = $_Zd";
}

-multiclass sve2_int_bin_cons_shift_imm_left<bit opc, string asm> {
-  def _B : sve2_int_bin_cons_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftL8>;
-  def _H : sve2_int_bin_cons_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftL16> {
+multiclass sve2_int_bin_shift_imm_left<bit opc, string asm> {
+  def _B : sve2_int_bin_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftL8>;
+  def _H : sve2_int_bin_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftL16> {
    let Inst{19} = imm{3};
  }
-  def _S : sve2_int_bin_cons_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftL32> {
+  def _S : sve2_int_bin_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftL32> {
    let Inst{20-19} = imm{4-3};
  }
-  def _D : sve2_int_bin_cons_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftL64> {
+  def _D : sve2_int_bin_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftL64> {
    let Inst{22} = imm{5};
    let Inst{20-19} = imm{4-3};
  }
}

-multiclass sve2_int_bin_cons_shift_imm_right<bit opc, string asm> {
-  def _B : sve2_int_bin_cons_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftR8>;
-  def _H : sve2_int_bin_cons_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftR16> {
+multiclass sve2_int_bin_shift_imm_right<bit opc, string asm> {
+  def _B : sve2_int_bin_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftR8>;
+  def _H : sve2_int_bin_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftR16> {
    let Inst{19} = imm{3};
  }
-  def _S : sve2_int_bin_cons_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftR32> {
+  def _S : sve2_int_bin_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftR32> {
    let Inst{20-19} = imm{4-3};
  }
-  def _D : sve2_int_bin_cons_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftR64> {
+  def _D : sve2_int_bin_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftR64> {
    let Inst{22} = imm{5};
    let Inst{20-19} = imm{4-3};
  }
}

-class sve2_int_bin_accum_cons_shift_imm<bits<4> tsz8_64, bits<2> opc, string asm,
-                                        ZPRRegOp zprty, Operand immtype>
+class sve2_int_bin_accum_shift_imm<bits<4> tsz8_64, bits<2> opc, string asm,
+                                   ZPRRegOp zprty, Operand immtype>
: I<(outs zprty:$Zda), (ins zprty:$_Zda, zprty:$Zn, immtype:$imm),
  asm, "\t$Zda, $Zn, $imm",
  "", []>, Sched<[]> {
@@ -2521,15 +2580,15 @@ class sve2_int_bin_accum_cons_shift_imm<bits<4> tsz8_64, bits<2> opc, string asm
  let ElementSize = ElementSizeNone;
}

-multiclass sve2_int_bin_accum_cons_shift_imm_right<bits<2> opc, string asm> {
-  def _B : sve2_int_bin_accum_cons_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftR8>;
-  def _H : sve2_int_bin_accum_cons_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftR16> {
+multiclass sve2_int_bin_accum_shift_imm_right<bits<2> opc, string asm> {
+  def _B : sve2_int_bin_accum_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftR8>;
+  def _H : sve2_int_bin_accum_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftR16> {
    let Inst{19} = imm{3};
  }
-  def _S : sve2_int_bin_accum_cons_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftR32> {
+  def _S : sve2_int_bin_accum_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftR32> {
    let Inst{20-19} = imm{4-3};
  }
-  def _D : sve2_int_bin_accum_cons_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftR64> {
+  def _D : sve2_int_bin_accum_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftR64> {
    let Inst{22} = imm{5};
    let Inst{20-19} = imm{4-3};
  }
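Usage sketch, not part of this patch: the renamed classes distinguish insert-style shifts (tied $_Zd, e.g. SRI/SLI) from accumulating shifts (tied $Zda, e.g. SSRA). The opcode bits below are illustrative, not verified against the spec.

// Hypothetical instantiations for the two destructive shift families.
defm SRI_ZZI  : sve2_int_bin_shift_imm_right<0b0, "sri">;
defm SLI_ZZI  : sve2_int_bin_shift_imm_left<0b1, "sli">;
defm SSRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b00, "ssra">;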
@@ -2607,9 +2666,9 @@ multiclass sve2_int_addsub_long_carry<bits<2> opc, string asm> {
// SVE2 Narrowing Group
//===----------------------------------------------------------------------===//

-class sve2_int_bin_cons_shift_imm_narrow<bits<3> tsz8_64, bits<4> opc,
-                                         string asm, ZPRRegOp zprty1,
-                                         ZPRRegOp zprty2, Operand immtype>
+class sve2_int_bin_shift_imm_narrow_bottom<bits<3> tsz8_64, bits<3> opc,
+                                           string asm, ZPRRegOp zprty1,
+                                           ZPRRegOp zprty2, Operand immtype>
: I<(outs zprty1:$Zd), (ins zprty2:$Zn, immtype:$imm),
  asm, "\t$Zd, $Zn, $imm",
  "", []>, Sched<[]> {
@@ -2622,26 +2681,63 @@ class sve2_int_bin_cons_shift_imm_narrow<bits<3> tsz8_64, bits<4> opc,
  let Inst{20-19} = tsz8_64{1-0};
  let Inst{18-16} = imm{2-0}; // imm3
  let Inst{15-14} = 0b00;
-  let Inst{13-10} = opc;
+  let Inst{13-11} = opc;
+  let Inst{10} = 0b0;
  let Inst{9-5} = Zn;
  let Inst{4-0} = Zd;
}

-multiclass sve2_int_bin_cons_shift_imm_right_narrow<bits<4> opc, string asm> {
-  def _B : sve2_int_bin_cons_shift_imm_narrow<{0,0,1}, opc, asm, ZPR8, ZPR16,
-                                              vecshiftR8>;
-  def _H : sve2_int_bin_cons_shift_imm_narrow<{0,1,?}, opc, asm, ZPR16, ZPR32,
-                                              vecshiftR16> {
+multiclass sve2_int_bin_shift_imm_right_narrow_bottom<bits<3> opc, string asm> {
+  def _B : sve2_int_bin_shift_imm_narrow_bottom<{0,0,1}, opc, asm, ZPR8, ZPR16,
+                                                vecshiftR8>;
+  def _H : sve2_int_bin_shift_imm_narrow_bottom<{0,1,?}, opc, asm, ZPR16, ZPR32,
+                                                vecshiftR16> {
    let Inst{19} = imm{3};
  }
-  def _S : sve2_int_bin_cons_shift_imm_narrow<{1,?,?}, opc, asm, ZPR32, ZPR64,
-                                              vecshiftR32> {
+  def _S : sve2_int_bin_shift_imm_narrow_bottom<{1,?,?}, opc, asm, ZPR32, ZPR64,
+                                                vecshiftR32> {
    let Inst{20-19} = imm{4-3};
  }
}

-class sve2_int_addsub_narrow_high<bits<2> sz, bits<3> opc, string asm,
-                                  ZPRRegOp zprty1, ZPRRegOp zprty2>
+class sve2_int_bin_shift_imm_narrow_top<bits<3> tsz8_64, bits<3> opc,
+                                        string asm, ZPRRegOp zprty1,
+                                        ZPRRegOp zprty2, Operand immtype>
+: I<(outs zprty1:$Zd), (ins zprty1:$_Zd, zprty2:$Zn, immtype:$imm),
+  asm, "\t$Zd, $Zn, $imm",
+  "", []>, Sched<[]> {
+  bits<5> Zd;
+  bits<5> Zn;
+  bits<5> imm;
+  let Inst{31-23} = 0b010001010;
+  let Inst{22} = tsz8_64{2};
+  let Inst{21} = 0b1;
+  let Inst{20-19} = tsz8_64{1-0};
+  let Inst{18-16} = imm{2-0}; // imm3
+  let Inst{15-14} = 0b00;
+  let Inst{13-11} = opc;
+  let Inst{10} = 0b1;
+  let Inst{9-5} = Zn;
+  let Inst{4-0} = Zd;
+
+  let Constraints = "$Zd = $_Zd";
+}
+
+multiclass sve2_int_bin_shift_imm_right_narrow_top<bits<3> opc, string asm> {
+  def _B : sve2_int_bin_shift_imm_narrow_top<{0,0,1}, opc, asm, ZPR8, ZPR16,
+                                             vecshiftR8>;
+  def _H : sve2_int_bin_shift_imm_narrow_top<{0,1,?}, opc, asm, ZPR16, ZPR32,
+                                             vecshiftR16> {
+    let Inst{19} = imm{3};
+  }
+  def _S : sve2_int_bin_shift_imm_narrow_top<{1,?,?}, opc, asm, ZPR32, ZPR64,
+                                             vecshiftR32> {
+    let Inst{20-19} = imm{4-3};
+  }
+}
+
+class sve2_int_addsub_narrow_high_bottom<bits<2> sz, bits<2> opc, string asm,
+                                         ZPRRegOp zprty1, ZPRRegOp zprty2>
: I<(outs zprty1:$Zd), (ins zprty2:$Zn, zprty2:$Zm),
  asm, "\t$Zd, $Zn, $Zm", "", []>, Sched<[]> {
  bits<5> Zd;
@@ -2652,19 +2748,46 @@ class sve2_int_addsub_narrow_high<bits<2> sz, bits<3> opc, string asm,
  let Inst{21} = 0b1;
  let Inst{20-16} = Zm;
  let Inst{15-13} = 0b011;
-  let Inst{12-10} = opc; // S, R, T
+  let Inst{12-11} = opc; // S, R
+  let Inst{10} = 0b0; // Top
  let Inst{9-5} = Zn;
  let Inst{4-0} = Zd;
}
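Usage sketch, not part of this patch: the narrowing shifts now split into a bottom variant (writes the even-numbered destination elements, bit 10 = 0) and a top variant (merges into the odd-numbered elements, bit 10 = 1, hence the tied $_Zd). Opcode bits below are illustrative only.

// Hypothetical bottom/top pairing for one mnemonic family.
defm SHRNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b000, "shrnb">;
defm SHRNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b000, "shrnt">;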
-multiclass sve2_int_addsub_narrow_high<bits<3> opc, string asm> {
-  def _B : sve2_int_addsub_narrow_high<0b01, opc, asm, ZPR8, ZPR16>;
-  def _H : sve2_int_addsub_narrow_high<0b10, opc, asm, ZPR16, ZPR32>;
-  def _S : sve2_int_addsub_narrow_high<0b11, opc, asm, ZPR32, ZPR64>;
+multiclass sve2_int_addsub_narrow_high_bottom<bits<2> opc, string asm> {
+  def _B : sve2_int_addsub_narrow_high_bottom<0b01, opc, asm, ZPR8, ZPR16>;
+  def _H : sve2_int_addsub_narrow_high_bottom<0b10, opc, asm, ZPR16, ZPR32>;
+  def _S : sve2_int_addsub_narrow_high_bottom<0b11, opc, asm, ZPR32, ZPR64>;
+}
+
+class sve2_int_addsub_narrow_high_top<bits<2> sz, bits<2> opc, string asm,
+                                      ZPRRegOp zprty1, ZPRRegOp zprty2>
+: I<(outs zprty1:$Zd), (ins zprty1:$_Zd, zprty2:$Zn, zprty2:$Zm),
+  asm, "\t$Zd, $Zn, $Zm", "", []>, Sched<[]> {
+  bits<5> Zd;
+  bits<5> Zn;
+  bits<5> Zm;
+  let Inst{31-24} = 0b01000101;
+  let Inst{23-22} = sz;
+  let Inst{21} = 0b1;
+  let Inst{20-16} = Zm;
+  let Inst{15-13} = 0b011;
+  let Inst{12-11} = opc; // S, R
+  let Inst{10} = 0b1; // Top
+  let Inst{9-5} = Zn;
+  let Inst{4-0} = Zd;
+
+  let Constraints = "$Zd = $_Zd";
+}
+
+multiclass sve2_int_addsub_narrow_high_top<bits<2> opc, string asm> {
+  def _B : sve2_int_addsub_narrow_high_top<0b01, opc, asm, ZPR8, ZPR16>;
+  def _H : sve2_int_addsub_narrow_high_top<0b10, opc, asm, ZPR16, ZPR32>;
+  def _S : sve2_int_addsub_narrow_high_top<0b11, opc, asm, ZPR32, ZPR64>;
}

-class sve2_int_sat_extract_narrow<bits<3> tsz8_64, bits<3> opc, string asm,
-                                  ZPRRegOp zprty1, ZPRRegOp zprty2>
+class sve2_int_sat_extract_narrow_bottom<bits<3> tsz8_64, bits<2> opc, string asm,
+                                         ZPRRegOp zprty1, ZPRRegOp zprty2>
: I<(outs zprty1:$Zd), (ins zprty2:$Zn),
  asm, "\t$Zd, $Zn", "", []>, Sched<[]> {
  bits<5> Zd;
@@ -2674,15 +2797,41 @@ class sve2_int_sat_extract_narrow<bits<3> tsz8_64, bits<3> opc, string asm,
  let Inst{21} = 0b1;
  let Inst{20-19} = tsz8_64{1-0};
  let Inst{18-13} = 0b000010;
-  let Inst{12-10} = opc;
+  let Inst{12-11} = opc;
+  let Inst{10} = 0b0;
  let Inst{9-5} = Zn;
  let Inst{4-0} = Zd;
}

-multiclass sve2_int_sat_extract_narrow<bits<3> opc, string asm> {
-  def _B : sve2_int_sat_extract_narrow<0b001, opc, asm, ZPR8, ZPR16>;
-  def _H : sve2_int_sat_extract_narrow<0b010, opc, asm, ZPR16, ZPR32>;
-  def _S : sve2_int_sat_extract_narrow<0b100, opc, asm, ZPR32, ZPR64>;
+multiclass sve2_int_sat_extract_narrow_bottom<bits<2> opc, string asm> {
+  def _B : sve2_int_sat_extract_narrow_bottom<0b001, opc, asm, ZPR8, ZPR16>;
+  def _H : sve2_int_sat_extract_narrow_bottom<0b010, opc, asm, ZPR16, ZPR32>;
+  def _S : sve2_int_sat_extract_narrow_bottom<0b100, opc, asm, ZPR32, ZPR64>;
+}
+
+class sve2_int_sat_extract_narrow_top<bits<3> tsz8_64, bits<2> opc, string asm,
+                                      ZPRRegOp zprty1, ZPRRegOp zprty2>
+: I<(outs zprty1:$Zd), (ins zprty1:$_Zd, zprty2:$Zn),
+  asm, "\t$Zd, $Zn", "", []>, Sched<[]> {
+  bits<5> Zd;
+  bits<5> Zn;
+  let Inst{31-23} = 0b010001010;
+  let Inst{22} = tsz8_64{2};
+  let Inst{21} = 0b1;
+  let Inst{20-19} = tsz8_64{1-0};
+  let Inst{18-13} = 0b000010;
+  let Inst{12-11} = opc;
+  let Inst{10} = 0b1;
+  let Inst{9-5} = Zn;
+  let Inst{4-0} = Zd;
+
+  let Constraints = "$Zd = $_Zd";
+}
+
+multiclass sve2_int_sat_extract_narrow_top<bits<2> opc, string asm> {
+  def _B : sve2_int_sat_extract_narrow_top<0b001, opc, asm, ZPR8, ZPR16>;
+  def _H : sve2_int_sat_extract_narrow_top<0b010, opc, asm, ZPR16, ZPR32>;
+  def _S : sve2_int_sat_extract_narrow_top<0b100, opc, asm, ZPR32, ZPR64>;
}

//===----------------------------------------------------------------------===//
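Usage sketch, not part of this patch: the saturating extract-narrow instructions follow the same bottom/top pairing. Opcode bits below are illustrative only.

// Hypothetical instantiations; the top form is destructive because it
// preserves the even elements of the destination.
defm SQXTNB_ZZ : sve2_int_sat_extract_narrow_bottom<0b00, "sqxtnb">;
defm SQXTNT_ZZ : sve2_int_sat_extract_narrow_top<0b00, "sqxtnt">;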
@@ -2713,11 +2862,17 @@ class sve_int_un_pred_arit<bits<2> sz8_64, bits<4> opc,
  let ElementSize = zprty.ElementSize;
}

-multiclass sve_int_un_pred_arit_0<bits<3> opc, string asm> {
+multiclass sve_int_un_pred_arit_0<bits<3> opc, string asm,
+                                  SDPatternOperator op> {
  def _B : sve_int_un_pred_arit<0b00, { opc, 0b0 }, asm, ZPR8>;
  def _H : sve_int_un_pred_arit<0b01, { opc, 0b0 }, asm, ZPR16>;
  def _S : sve_int_un_pred_arit<0b10, { opc, 0b0 }, asm, ZPR32>;
  def _D : sve_int_un_pred_arit<0b11, { opc, 0b0 }, asm, ZPR64>;
+
+  def : SVE_3_Op_Pat<nxv16i8, op, nxv16i8, nxv16i1, nxv16i8, !cast<Instruction>(NAME # _B)>;
+  def : SVE_3_Op_Pat<nxv8i16, op, nxv8i16, nxv8i1, nxv8i16, !cast<Instruction>(NAME # _H)>;
+  def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv4i1, nxv4i32, !cast<Instruction>(NAME # _S)>;
+  def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv2i1, nxv2i64, !cast<Instruction>(NAME # _D)>;
}

multiclass sve_int_un_pred_arit_0_h<bits<3> opc, string asm> {
@@ -2735,11 +2890,21 @@ multiclass sve_int_un_pred_arit_0_d<bits<3> opc, string asm> {
  def _D : sve_int_un_pred_arit<0b11, { opc, 0b0 }, asm, ZPR64>;
}

-multiclass sve_int_un_pred_arit_1<bits<3> opc, string asm> {
+multiclass sve_int_un_pred_arit_1<bits<3> opc, string asm,
+                                  SDPatternOperator op> {
  def _B : sve_int_un_pred_arit<0b00, { opc, 0b1 }, asm, ZPR8>;
  def _H : sve_int_un_pred_arit<0b01, { opc, 0b1 }, asm, ZPR16>;
  def _S : sve_int_un_pred_arit<0b10, { opc, 0b1 }, asm, ZPR32>;
  def _D : sve_int_un_pred_arit<0b11, { opc, 0b1 }, asm, ZPR64>;
+
+  def : SVE_3_Op_Pat<nxv16i8, op, nxv16i8, nxv16i1, nxv16i8, !cast<Instruction>(NAME # _B)>;
+  def : SVE_3_Op_Pat<nxv8i16, op, nxv8i16, nxv8i1, nxv8i16, !cast<Instruction>(NAME # _H)>;
+  def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv4i1, nxv4i32, !cast<Instruction>(NAME # _S)>;
+  def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv2i1, nxv2i64, !cast<Instruction>(NAME # _D)>;
+
+  def : SVE_3_Op_Pat<nxv8i16, op, nxv8i16, nxv8i1, nxv8f16, !cast<Instruction>(NAME # _H)>;
+  def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv4i1, nxv4f32, !cast<Instruction>(NAME # _S)>;
+  def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv2i1, nxv2f64, !cast<Instruction>(NAME # _D)>;
}

multiclass sve_int_un_pred_arit_1_fp<bits<3> opc, string asm> {
@@ -3886,9 +4051,9 @@ multiclass sve_mem_cstnt_ss<bits<2> msz, string asm, RegisterOperand listty,
                  (!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm), 0>;
}

-class sve2_mem_cstnt_vs_base<bits<3> opc, dag iops, string asm,
-                             RegisterOperand VecList>
-: I<(outs VecList:$Zt), iops,
+class sve2_mem_sstnt_vs_base<bits<3> opc, string asm,
+                             RegisterOperand listty, ZPRRegOp zprty>
+: I<(outs), (ins listty:$Zt, PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm),
  asm, "\t$Zt, $Pg, [$Zn, $Rm]",
  "",
  []>, Sched<[]> {
@@ -3908,17 +4073,14 @@ class sve2_mem_cstnt_vs_base<bits<3> opc, dag iops, string asm,
  let mayStore = 1;
}

-multiclass sve2_mem_cstnt_vs<bits<3> opc, string asm,
+multiclass sve2_mem_sstnt_vs<bits<3> opc, string asm,
                             RegisterOperand listty, ZPRRegOp zprty> {
-  def _REAL : sve2_mem_cstnt_vs_base<opc, (ins PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm),
-                                     asm, listty>;
+  def _REAL : sve2_mem_sstnt_vs_base<opc, asm, listty, zprty>;
  def : InstAlias<asm # "\t$Zt, $Pg, [$Zn, $Rm]",
                  (!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm), 0>;
  def : InstAlias<asm # "\t$Zt, $Pg, [$Zn]",
                  (!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, zprty:$Zn, XZR), 0>;
-  def : InstAlias<asm # "\t$Zt, $Pg, [$Zn, $Rm]",
-                  (!cast<Instruction>(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm), 0>;
  def : InstAlias<asm # "\t$Zt, $Pg, [$Zn]",
                  (!cast<Instruction>(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, zprty:$Zn, XZR), 1>;
}
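Usage sketch, not part of this patch: the sstnt multiclass defines SVE2 non-temporal scatter stores to vector-plus-scalar addresses. The opcode and the single-register list operand name Z_s below are assumptions.

// Hypothetical instantiation for a byte-element scatter store.
defm STNT1B_ZZR_S : sve2_mem_sstnt_vs<0b001, "stnt1b", Z_s, ZPR32>;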
@@ -4147,6 +4309,14 @@ class sve_int_perm_punpk<bit opc, string asm>
  let Inst{3-0} = Pd;
}

+multiclass sve_int_perm_punpk<bit opc, string asm, SDPatternOperator op> {
+  def NAME : sve_int_perm_punpk<opc, asm>;
+
+  def : SVE_1_Op_Pat<nxv8i1, op, nxv16i1, !cast<Instruction>(NAME)>;
+  def : SVE_1_Op_Pat<nxv4i1, op, nxv8i1, !cast<Instruction>(NAME)>;
+  def : SVE_1_Op_Pat<nxv2i1, op, nxv4i1, !cast<Instruction>(NAME)>;
+}
+
class sve_int_rdffr_pred<bit s, string asm>
: I<(outs PPR8:$Pd), (ins PPRAny:$Pg),
  asm, "\t$Pd, $Pg/z",
@@ -5094,7 +5264,7 @@ multiclass sve_mem_p_fill<string asm> {
                  (!cast<Instruction>(NAME) PPRAny:$Pt, GPR64sp:$Rn, 0), 1>;
}

-class sve2_mem_cldnt_vs_base<bits<5> opc, dag iops, string asm,
+class sve2_mem_gldnt_vs_base<bits<5> opc, dag iops, string asm,
                             RegisterOperand VecList>
: I<(outs VecList:$Zt), iops,
  asm, "\t$Zt, $Pg/z, [$Zn, $Rm]",
@@ -5119,17 +5289,15 @@ class sve2_mem_cldnt_vs_base<bits<5> opc, dag iops, string asm,
  let mayLoad = 1;
}

-multiclass sve2_mem_cldnt_vs<bits<5> opc, string asm,
+multiclass sve2_mem_gldnt_vs<bits<5> opc, string asm,
                             RegisterOperand listty, ZPRRegOp zprty> {
-  def _REAL : sve2_mem_cldnt_vs_base<opc, (ins PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm),
+  def _REAL : sve2_mem_gldnt_vs_base<opc, (ins PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm),
                                     asm, listty>;
  def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn, $Rm]",
                  (!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm), 0>;
  def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn]",
                  (!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, zprty:$Zn, XZR), 0>;
-  def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn, $Rm]",
-                  (!cast<Instruction>(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm), 0>;
  def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn]",
                  (!cast<Instruction>(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, zprty:$Zn, XZR), 1>;
}
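Usage sketch, not part of this patch: the punpk multiclass wraps the existing instruction class and attaches the predicate-unpack patterns. The SDNode names AArch64punpklo/AArch64punpkhi are assumptions about code outside this diff.

// Hypothetical instantiations: PUNPKLO/HI double the predicate element
// size; one defm emits the instruction plus the three SVE_1_Op_Pat
// patterns declared in the multiclass above.
defm PUNPKLO_PP : sve_int_perm_punpk<0b0, "punpklo", AArch64punpklo>;
defm PUNPKHI_PP : sve_int_perm_punpk<0b1, "punpkhi", AArch64punpkhi>;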
diff --git a/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp b/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp
index 7bb075c36e79..c27fc7a112ec 100644
--- a/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp
+++ b/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp
@@ -125,7 +125,7 @@ namespace llvm {

uint32_t AArch64SysReg::parseGenericRegister(StringRef Name) {
  // Try to parse an S<op0>_<op1>_<Cn>_<Cm>_<op2> register name
-  Regex GenericRegPattern("^S([0-3])_([0-7])_C([0-9]|1[0-5])_C([0-9]|1[0-5])_([0-7])$");
+  static const Regex GenericRegPattern("^S([0-3])_([0-7])_C([0-9]|1[0-5])_C([0-9]|1[0-5])_([0-7])$");

  std::string UpperName = Name.upper();
  SmallVector<StringRef, 5> Ops;
diff --git a/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/lib/Target/AArch64/Utils/AArch64BaseInfo.h
index e5e2fc2cb0df..7a4fcac09ec4 100644
--- a/lib/Target/AArch64/Utils/AArch64BaseInfo.h
+++ b/lib/Target/AArch64/Utils/AArch64BaseInfo.h
@@ -313,9 +313,9 @@ struct SysAlias {
  uint16_t Encoding;
  FeatureBitset FeaturesRequired;

-  SysAlias (const char *N, uint16_t E) : Name(N), Encoding(E) {};
-  SysAlias (const char *N, uint16_t E, FeatureBitset F) :
-    Name(N), Encoding(E), FeaturesRequired(F) {};
+  constexpr SysAlias(const char *N, uint16_t E) : Name(N), Encoding(E) {}
+  constexpr SysAlias(const char *N, uint16_t E, FeatureBitset F)
+      : Name(N), Encoding(E), FeaturesRequired(F) {}

  bool haveFeatures(FeatureBitset ActiveFeatures) const {
    return (FeaturesRequired & ActiveFeatures) == FeaturesRequired;
@@ -326,9 +326,10 @@ struct SysAlias {
struct SysAliasReg : SysAlias {
  bool NeedsReg;

-  SysAliasReg(const char *N, uint16_t E, bool R) : SysAlias(N, E), NeedsReg(R) {};
-  SysAliasReg(const char *N, uint16_t E, bool R, FeatureBitset F) : SysAlias(N, E, F),
-    NeedsReg(R) {};
+  constexpr SysAliasReg(const char *N, uint16_t E, bool R)
+      : SysAlias(N, E), NeedsReg(R) {}
+  constexpr SysAliasReg(const char *N, uint16_t E, bool R, FeatureBitset F)
+      : SysAlias(N, E, F), NeedsReg(R) {}
};

namespace AArch64AT{
@@ -627,6 +628,18 @@ namespace AArch64II {
  /// MO_S - Indicates that the bits of the symbol operand represented by
  /// MO_G0 etc are signed.
  MO_S = 0x100,
+
+  /// MO_PREL - Indicates that the bits of the symbol operand represented by
+  /// MO_G0 etc are PC relative.
+  MO_PREL = 0x200,
+
+  /// MO_TAGGED - With MO_PAGE, indicates that the page includes a memory tag
+  /// in bits 56-63.
+  /// On a FrameIndex operand, indicates that the underlying memory is tagged
+  /// with an unknown tag value (MTE); this needs to be lowered either to an
+  /// SP-relative load or store instruction (which do not check tags), or to
+  /// an LDG instruction to obtain the tag value.
+  MO_TAGGED = 0x400,
};

} // end namespace AArch64II