author | Dimitry Andric <dim@FreeBSD.org> | 2017-12-20 14:16:56 +0000 |
---|---|---|
committer | Dimitry Andric <dim@FreeBSD.org> | 2017-12-20 14:16:56 +0000 |
commit | 2cab237b5dbfe1b3e9c7aa7a3c02d2b98fcf7462 (patch) | |
tree | 524fe828571f81358bba62fdb6d04c6e5e96a2a4 /contrib/llvm/lib/Target/AArch64 | |
parent | 6c7828a2807ea5e50c79ca42dbedf2b589ce63b2 (diff) | |
parent | 044eb2f6afba375a914ac9d8024f8f5142bb912e (diff) | |
Merge llvm trunk r321017 to contrib/llvm.
Notes:
svn path=/projects/clang600-import/; revision=327023
Diffstat (limited to 'contrib/llvm/lib/Target/AArch64')
85 files changed, 5822 insertions, 2338 deletions
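Many of the hunks below are mechanical upstream API migrations rather than AArch64-specific changes: `MachineFunction::getFunction()` now returns a reference instead of a pointer, and register names in debug output go through `printReg()` instead of `TargetRegisterInfo::getName()`. The following is a minimal sketch of those two post-r321017 idioms as they appear throughout the diff; the function and variable names are illustrative and not taken from this commit.

```cpp
// Illustrative only: shows the post-r321017 idioms used in the hunks below,
// not a function from this diff.
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

static bool skipAndDumpExample(const MachineFunction &MF,
                               const TargetRegisterInfo *TRI, unsigned Reg) {
  // Before the merge: MF.getFunction()->optForMinSize() (pointer return).
  // After the merge: getFunction() returns const Function &.
  if (MF.getFunction().optForMinSize())
    return true;
  // Before: dbgs() << TRI->getName(Reg);
  // After: printReg() formats physical and virtual registers uniformly.
  errs() << "register: " << printReg(Reg, TRI) << '\n';
  return false;
}
```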
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64.h b/contrib/llvm/lib/Target/AArch64/AArch64.h index 1dda746a6be1..edda13ce97ef 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64.h +++ b/contrib/llvm/lib/Target/AArch64/AArch64.h @@ -39,7 +39,7 @@ FunctionPass *createAArch64ISelDag(AArch64TargetMachine &TM, FunctionPass *createAArch64StorePairSuppressPass(); FunctionPass *createAArch64ExpandPseudoPass(); FunctionPass *createAArch64LoadStoreOptimizationPass(); -FunctionPass *createAArch64VectorByElementOptPass(); +FunctionPass *createAArch64SIMDInstrOptPass(); ModulePass *createAArch64PromoteConstantPass(); FunctionPass *createAArch64ConditionOptimizerPass(); FunctionPass *createAArch64A57FPLoadBalancing(); @@ -64,7 +64,7 @@ void initializeAArch64ConditionOptimizerPass(PassRegistry&); void initializeAArch64DeadRegisterDefinitionsPass(PassRegistry&); void initializeAArch64ExpandPseudoPass(PassRegistry&); void initializeAArch64LoadStoreOptPass(PassRegistry&); -void initializeAArch64VectorByElementOptPass(PassRegistry&); +void initializeAArch64SIMDInstrOptPass(PassRegistry&); void initializeAArch64PromoteConstantPass(PassRegistry&); void initializeAArch64RedundantCopyEliminationPass(PassRegistry&); void initializeAArch64StorePairSuppressPass(PassRegistry&); diff --git a/contrib/llvm/lib/Target/AArch64/AArch64.td b/contrib/llvm/lib/Target/AArch64/AArch64.td index 436bf1193304..75fb937de9bf 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64.td +++ b/contrib/llvm/lib/Target/AArch64/AArch64.td @@ -61,6 +61,12 @@ def FeatureZCRegMove : SubtargetFeature<"zcm", "HasZeroCycleRegMove", "true", def FeatureZCZeroing : SubtargetFeature<"zcz", "HasZeroCycleZeroing", "true", "Has zero-cycle zeroing instructions">; +/// ... but the floating-point version doesn't quite work in rare cases on older +/// CPUs. 
+def FeatureZCZeroingFPWorkaround : SubtargetFeature<"zcz-fp-workaround", + "HasZeroCycleZeroingFPWorkaround", "true", + "The zero-cycle floating-point zeroing instruction has a bug">; + def FeatureStrictAlign : SubtargetFeature<"strict-align", "StrictAlign", "true", "Disallow all unaligned memory " @@ -94,6 +100,9 @@ def FeatureSlowMisaligned128Store : SubtargetFeature<"slow-misaligned-128store", def FeatureSlowPaired128 : SubtargetFeature<"slow-paired-128", "Paired128IsSlow", "true", "Paired 128 bit loads and stores are slow">; +def FeatureSlowSTRQro : SubtargetFeature<"slow-strqro-store", "STRQroIsSlow", + "true", "STR of Q register with register offset is slow">; + def FeatureAlternateSExtLoadCVTF32Pattern : SubtargetFeature< "alternate-sextload-cvt-f32-pattern", "UseAlternateSExtLoadCVTF32Pattern", "true", "Use alternative pattern for sextload convert to f32">; @@ -118,10 +127,17 @@ def FeatureDisableLatencySchedHeuristic : SubtargetFeature< "disable-latency-sched-heuristic", "DisableLatencySchedHeuristic", "true", "Disable latency scheduling heuristic">; +def FeatureRCPC : SubtargetFeature<"rcpc", "HasRCPC", "true", + "Enable support for RCPC extension">; + def FeatureUseRSqrt : SubtargetFeature< "use-reciprocal-square-root", "UseRSqrt", "true", "Use the reciprocal square root approximation">; +def FeatureDotProd : SubtargetFeature< + "dotprod", "HasDotProd", "true", + "Enable dot product support">; + def FeatureNoNegativeImmediates : SubtargetFeature<"no-neg-immediates", "NegativeImmediates", "false", "Convert immediates and instructions " @@ -132,6 +148,7 @@ def FeatureNoNegativeImmediates : SubtargetFeature<"no-neg-immediates", def FeatureLSLFast : SubtargetFeature< "lsl-fast", "HasLSLFast", "true", "CPU has a fastpath logical shift of up to 3 places">; + //===----------------------------------------------------------------------===// // Architectures. // @@ -142,6 +159,9 @@ def HasV8_1aOps : SubtargetFeature<"v8.1a", "HasV8_1aOps", "true", def HasV8_2aOps : SubtargetFeature<"v8.2a", "HasV8_2aOps", "true", "Support ARM v8.2a instructions", [HasV8_1aOps, FeatureRAS]>; +def HasV8_3aOps : SubtargetFeature<"v8.3a", "HasV8_3aOps", "true", + "Support ARM v8.3a instructions", [HasV8_2aOps, FeatureRCPC]>; + //===----------------------------------------------------------------------===// // Register File Description //===----------------------------------------------------------------------===// @@ -200,6 +220,19 @@ def ProcA53 : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53", FeatureUseAA ]>; +def ProcA55 : SubtargetFeature<"a55", "ARMProcFamily", "CortexA55", + "Cortex-A55 ARM processors", [ + HasV8_2aOps, + FeatureCrypto, + FeatureFPARMv8, + FeatureFuseAES, + FeatureNEON, + FeatureFullFP16, + FeatureDotProd, + FeatureRCPC, + FeaturePerfMon + ]>; + def ProcA57 : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57", "Cortex-A57 ARM processors", [ FeatureBalanceFPOps, @@ -235,19 +268,36 @@ def ProcA73 : SubtargetFeature<"a73", "ARMProcFamily", "CortexA73", FeaturePerfMon ]>; +def ProcA75 : SubtargetFeature<"a75", "ARMProcFamily", "CortexA75", + "Cortex-A75 ARM processors", [ + HasV8_2aOps, + FeatureCrypto, + FeatureFPARMv8, + FeatureFuseAES, + FeatureNEON, + FeatureFullFP16, + FeatureDotProd, + FeatureRCPC, + FeaturePerfMon + ]>; + +// Note that cyclone does not fuse AES instructions, but newer apple chips do +// perform the fusion and cyclone is used by default when targetting apple OSes. 
def ProcCyclone : SubtargetFeature<"cyclone", "ARMProcFamily", "Cyclone", "Cyclone", [ FeatureAlternateSExtLoadCVTF32Pattern, + FeatureArithmeticBccFusion, + FeatureArithmeticCbzFusion, FeatureCrypto, FeatureDisableLatencySchedHeuristic, FeatureFPARMv8, - FeatureArithmeticBccFusion, - FeatureArithmeticCbzFusion, + FeatureFuseAES, FeatureNEON, FeaturePerfMon, FeatureSlowMisaligned128Store, FeatureZCRegMove, - FeatureZCZeroing + FeatureZCZeroing, + FeatureZCZeroingFPWorkaround ]>; def ProcExynosM1 : SubtargetFeature<"exynosm1", "ARMProcFamily", "ExynosM1", @@ -305,9 +355,24 @@ def ProcFalkor : SubtargetFeature<"falkor", "ARMProcFamily", "Falkor", FeaturePredictableSelectIsExpensive, FeatureRDM, FeatureZCZeroing, - FeatureLSLFast + FeatureLSLFast, + FeatureSlowSTRQro ]>; +def ProcSaphira : SubtargetFeature<"saphira", "ARMProcFamily", "Saphira", + "Qualcomm Saphira processors", [ + FeatureCrypto, + FeatureCustomCheapAsMoveHandling, + FeatureFPARMv8, + FeatureNEON, + FeatureSPE, + FeaturePerfMon, + FeaturePostRAScheduler, + FeaturePredictableSelectIsExpensive, + FeatureZCZeroing, + FeatureLSLFast, + HasV8_3aOps]>; + def ProcThunderX2T99 : SubtargetFeature<"thunderx2t99", "ARMProcFamily", "ThunderX2T99", "Cavium ThunderX2 processors", [ @@ -372,18 +437,21 @@ def : ProcessorModel<"generic", NoSchedModel, [ FeaturePostRAScheduler ]>; -// FIXME: Cortex-A35 is currently modeled as a Cortex-A53. +// FIXME: Cortex-A35 and Cortex-A55 are currently modeled as a Cortex-A53. def : ProcessorModel<"cortex-a35", CortexA53Model, [ProcA35]>; def : ProcessorModel<"cortex-a53", CortexA53Model, [ProcA53]>; +def : ProcessorModel<"cortex-a55", CortexA53Model, [ProcA55]>; def : ProcessorModel<"cortex-a57", CortexA57Model, [ProcA57]>; -// FIXME: Cortex-A72 and Cortex-A73 are currently modeled as a Cortex-A57. +// FIXME: Cortex-A72, Cortex-A73 and Cortex-A75 are currently modeled as a Cortex-A57. 
def : ProcessorModel<"cortex-a72", CortexA57Model, [ProcA72]>; def : ProcessorModel<"cortex-a73", CortexA57Model, [ProcA73]>; +def : ProcessorModel<"cortex-a75", CortexA57Model, [ProcA75]>; def : ProcessorModel<"cyclone", CycloneModel, [ProcCyclone]>; def : ProcessorModel<"exynos-m1", ExynosM1Model, [ProcExynosM1]>; def : ProcessorModel<"exynos-m2", ExynosM1Model, [ProcExynosM2]>; def : ProcessorModel<"exynos-m3", ExynosM1Model, [ProcExynosM2]>; def : ProcessorModel<"falkor", FalkorModel, [ProcFalkor]>; +def : ProcessorModel<"saphira", FalkorModel, [ProcSaphira]>; def : ProcessorModel<"kryo", KryoModel, [ProcKryo]>; // Cavium ThunderX/ThunderX T8X Processors def : ProcessorModel<"thunderx", ThunderXT8XModel, [ProcThunderX]>; diff --git a/contrib/llvm/lib/Target/AArch64/AArch64A53Fix835769.cpp b/contrib/llvm/lib/Target/AArch64/AArch64A53Fix835769.cpp index e6afb42440a7..7de5d0ef66b1 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64A53Fix835769.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64A53Fix835769.cpp @@ -22,9 +22,9 @@ #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetInstrInfo.h" using namespace llvm; diff --git a/contrib/llvm/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp b/contrib/llvm/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp index db1fbe069f4d..38a7e331bb97 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp @@ -161,9 +161,9 @@ namespace { /// A Chain is a sequence of instructions that are linked together by /// an accumulation operand. For example: /// -/// fmul d0<def>, ? -/// fmla d1<def>, ?, ?, d0<kill> -/// fmla d2<def>, ?, ?, d1<kill> +/// fmul def d0, ? +/// fmla def d1, ?, ?, killed d0 +/// fmla def d2, ?, ?, killed d1 /// /// There may be other instructions interleaved in the sequence that /// do not belong to the chain. These other instructions must not use @@ -308,7 +308,7 @@ public: //===----------------------------------------------------------------------===// bool AArch64A57FPLoadBalancing::runOnMachineFunction(MachineFunction &F) { - if (skipFunction(*F.getFunction())) + if (skipFunction(F.getFunction())) return false; if (!F.getSubtarget<AArch64Subtarget>().balanceFPOps()) @@ -538,7 +538,7 @@ bool AArch64A57FPLoadBalancing::colorChain(Chain *G, Color C, DEBUG(dbgs() << "Scavenging (thus coloring) failed!\n"); return false; } - DEBUG(dbgs() << " - Scavenged register: " << TRI->getName(Reg) << "\n"); + DEBUG(dbgs() << " - Scavenged register: " << printReg(Reg, TRI) << "\n"); std::map<unsigned, unsigned> Substs; for (MachineInstr &I : *G) { @@ -611,8 +611,8 @@ void AArch64A57FPLoadBalancing::scanInstruction( // unit. 
unsigned DestReg = MI->getOperand(0).getReg(); - DEBUG(dbgs() << "New chain started for register " - << TRI->getName(DestReg) << " at " << *MI); + DEBUG(dbgs() << "New chain started for register " << printReg(DestReg, TRI) + << " at " << *MI); auto G = llvm::make_unique<Chain>(MI, Idx, getColor(DestReg)); ActiveChains[DestReg] = G.get(); @@ -632,7 +632,7 @@ void AArch64A57FPLoadBalancing::scanInstruction( if (ActiveChains.find(AccumReg) != ActiveChains.end()) { DEBUG(dbgs() << "Chain found for accumulator register " - << TRI->getName(AccumReg) << " in MI " << *MI); + << printReg(AccumReg, TRI) << " in MI " << *MI); // For simplicity we only chain together sequences of MULs/MLAs where the // accumulator register is killed on each instruction. This means we don't @@ -657,7 +657,7 @@ void AArch64A57FPLoadBalancing::scanInstruction( } DEBUG(dbgs() << "Creating new chain for dest register " - << TRI->getName(DestReg) << "\n"); + << printReg(DestReg, TRI) << "\n"); auto G = llvm::make_unique<Chain>(MI, Idx, getColor(DestReg)); ActiveChains[DestReg] = G.get(); AllChains.push_back(std::move(G)); @@ -685,8 +685,8 @@ maybeKillChain(MachineOperand &MO, unsigned Idx, // If this is a KILL of a current chain, record it. if (MO.isKill() && ActiveChains.find(MO.getReg()) != ActiveChains.end()) { - DEBUG(dbgs() << "Kill seen for chain " << TRI->getName(MO.getReg()) - << "\n"); + DEBUG(dbgs() << "Kill seen for chain " << printReg(MO.getReg(), TRI) + << "\n"); ActiveChains[MO.getReg()]->setKill(MI, Idx, /*Immutable=*/MO.isTied()); } ActiveChains.erase(MO.getReg()); @@ -697,7 +697,7 @@ maybeKillChain(MachineOperand &MO, unsigned Idx, I != E;) { if (MO.clobbersPhysReg(I->first)) { DEBUG(dbgs() << "Kill (regmask) seen for chain " - << TRI->getName(I->first) << "\n"); + << printReg(I->first, TRI) << "\n"); I->second->setKill(MI, Idx, /*Immutable=*/true); ActiveChains.erase(I++); } else diff --git a/contrib/llvm/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp b/contrib/llvm/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp index bc2320dd20b3..338daecb49e5 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp @@ -36,7 +36,6 @@ #include "AArch64.h" #include "AArch64InstrInfo.h" #include "AArch64RegisterInfo.h" -#include "AArch64Subtarget.h" #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" @@ -394,7 +393,7 @@ bool AArch64AdvSIMDScalar::runOnMachineFunction(MachineFunction &mf) { bool Changed = false; DEBUG(dbgs() << "***** AArch64AdvSIMDScalar *****\n"); - if (skipFunction(*mf.getFunction())) + if (skipFunction(mf.getFunction())) return false; MRI = &mf.getRegInfo(); diff --git a/contrib/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/contrib/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp index 5ce57926cc03..67138f41dda8 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp @@ -1,4 +1,4 @@ -//===-- AArch64AsmPrinter.cpp - AArch64 LLVM assembly writer --------------===// +//===- AArch64AsmPrinter.cpp - AArch64 LLVM assembly writer ---------------===// // // The LLVM Compiler Infrastructure // @@ -17,32 +17,42 @@ #include "AArch64MachineFunctionInfo.h" #include "AArch64RegisterInfo.h" #include "AArch64Subtarget.h" +#include "AArch64TargetObjectFile.h" #include "InstPrinter/AArch64InstPrinter.h" #include "MCTargetDesc/AArch64AddressingModes.h" -#include "MCTargetDesc/AArch64MCExpr.h" +#include 
"MCTargetDesc/AArch64MCTargetDesc.h" +#include "Utils/AArch64BaseInfo.h" #include "llvm/ADT/SmallString.h" -#include "llvm/ADT/StringSwitch.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/Triple.h" #include "llvm/ADT/Twine.h" #include "llvm/CodeGen/AsmPrinter.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstr.h" -#include "llvm/CodeGen/MachineModuleInfoImpls.h" +#include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/StackMaps.h" -#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/IR/DataLayout.h" -#include "llvm/IR/DebugInfo.h" +#include "llvm/IR/DebugInfoMetadata.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstBuilder.h" -#include "llvm/MC/MCLinkerOptimizationHint.h" -#include "llvm/MC/MCSectionELF.h" -#include "llvm/MC/MCSectionMachO.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbol.h" -#include "llvm/MC/MCSymbolELF.h" -#include "llvm/Support/Debug.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetMachine.h" +#include <algorithm> +#include <cassert> +#include <cstdint> +#include <map> +#include <memory> + using namespace llvm; #define DEBUG_TYPE "asm-printer" @@ -57,7 +67,7 @@ class AArch64AsmPrinter : public AsmPrinter { public: AArch64AsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer) : AsmPrinter(TM, std::move(Streamer)), MCInstLowering(OutContext, *this), - SM(*this), AArch64FI(nullptr) {} + SM(*this) {} StringRef getPassName() const override { return "AArch64 Assembly Printer"; } @@ -118,7 +128,8 @@ private: MCSymbol *GetCPISymbol(unsigned CPID) const override; void EmitEndOfAsmFile(Module &M) override; - AArch64FunctionInfo *AArch64FI; + + AArch64FunctionInfo *AArch64FI = nullptr; /// \brief Emit the LOHs contained in AArch64FI. void EmitLOHs(); @@ -126,13 +137,12 @@ private: /// Emit instruction to set float register to zero. 
void EmitFMov0(const MachineInstr &MI); - typedef std::map<const MachineInstr *, MCSymbol *> MInstToMCSymbol; + using MInstToMCSymbol = std::map<const MachineInstr *, MCSymbol *>; + MInstToMCSymbol LOHInstToLabel; }; -} // end of anonymous namespace - -//===----------------------------------------------------------------------===// +} // end anonymous namespace void AArch64AsmPrinter::LowerPATCHABLE_FUNCTION_ENTER(const MachineInstr &MI) { @@ -200,6 +210,29 @@ void AArch64AsmPrinter::EmitEndOfAsmFile(Module &M) { OutStreamer->EmitAssemblerFlag(MCAF_SubsectionsViaSymbols); SM.serializeToStackMapSection(); } + + if (TT.isOSBinFormatCOFF()) { + const auto &TLOF = + static_cast<const TargetLoweringObjectFileCOFF &>(getObjFileLowering()); + + std::string Flags; + raw_string_ostream OS(Flags); + + for (const auto &Function : M) + TLOF.emitLinkerFlagsForGlobal(OS, &Function); + for (const auto &Global : M.globals()) + TLOF.emitLinkerFlagsForGlobal(OS, &Global); + for (const auto &Alias : M.aliases()) + TLOF.emitLinkerFlagsForGlobal(OS, &Alias); + + OS.flush(); + + // Output collected flags + if (!Flags.empty()) { + OutStreamer->SwitchSection(TLOF.getDrectveSection()); + OutStreamer->EmitBytes(Flags); + } + } } void AArch64AsmPrinter::EmitLOHs() { @@ -490,11 +523,13 @@ void AArch64AsmPrinter::LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM, void AArch64AsmPrinter::EmitFMov0(const MachineInstr &MI) { unsigned DestReg = MI.getOperand(0).getReg(); - if (STI->hasZeroCycleZeroing()) { - // Convert S/D register to corresponding Q register - if (AArch64::S0 <= DestReg && DestReg <= AArch64::S31) { + if (STI->hasZeroCycleZeroing() && !STI->hasZeroCycleZeroingFPWorkaround()) { + // Convert H/S/D register to corresponding Q register + if (AArch64::H0 <= DestReg && DestReg <= AArch64::H31) + DestReg = AArch64::Q0 + (DestReg - AArch64::H0); + else if (AArch64::S0 <= DestReg && DestReg <= AArch64::S31) DestReg = AArch64::Q0 + (DestReg - AArch64::S0); - } else { + else { assert(AArch64::D0 <= DestReg && DestReg <= AArch64::D31); DestReg = AArch64::Q0 + (DestReg - AArch64::D0); } @@ -507,6 +542,11 @@ void AArch64AsmPrinter::EmitFMov0(const MachineInstr &MI) { MCInst FMov; switch (MI.getOpcode()) { default: llvm_unreachable("Unexpected opcode"); + case AArch64::FMOVH0: + FMov.setOpcode(AArch64::FMOVWHr); + FMov.addOperand(MCOperand::createReg(DestReg)); + FMov.addOperand(MCOperand::createReg(AArch64::WZR)); + break; case AArch64::FMOVS0: FMov.setOpcode(AArch64::FMOVWSr); FMov.addOperand(MCOperand::createReg(DestReg)); @@ -626,6 +666,7 @@ void AArch64AsmPrinter::EmitInstruction(const MachineInstr *MI) { return; } + case AArch64::FMOVH0: case AArch64::FMOVS0: case AArch64::FMOVD0: EmitFMov0(*MI); diff --git a/contrib/llvm/lib/Target/AArch64/AArch64CallLowering.cpp b/contrib/llvm/lib/Target/AArch64/AArch64CallLowering.cpp index 29f6d571d6bd..08152c0d83d9 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64CallLowering.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64CallLowering.cpp @@ -32,14 +32,14 @@ #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/MachineValueType.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" +#include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/CodeGen/ValueTypes.h" #include "llvm/IR/Argument.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/Function.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" -#include "llvm/Target/TargetRegisterInfo.h" -#include "llvm/Target/TargetSubtargetInfo.h" #include <algorithm> 
#include <cassert> #include <cstdint> @@ -47,13 +47,10 @@ using namespace llvm; -#ifndef LLVM_BUILD_GLOBAL_ISEL -#error "This shouldn't be built without GISel" -#endif - AArch64CallLowering::AArch64CallLowering(const AArch64TargetLowering &TLI) : CallLowering(&TLI) {} +namespace { struct IncomingArgHandler : public CallLowering::ValueHandler { IncomingArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI, CCAssignFn *AssignFn) @@ -73,8 +70,18 @@ struct IncomingArgHandler : public CallLowering::ValueHandler { void assignValueToReg(unsigned ValVReg, unsigned PhysReg, CCValAssign &VA) override { markPhysRegUsed(PhysReg); - MIRBuilder.buildCopy(ValVReg, PhysReg); - // FIXME: assert extension + switch (VA.getLocInfo()) { + default: + MIRBuilder.buildCopy(ValVReg, PhysReg); + break; + case CCValAssign::LocInfo::SExt: + case CCValAssign::LocInfo::ZExt: + case CCValAssign::LocInfo::AExt: { + auto Copy = MIRBuilder.buildCopy(LLT{VA.getLocVT()}, PhysReg); + MIRBuilder.buildTrunc(ValVReg, Copy); + break; + } + } } void assignValueToAddress(unsigned ValVReg, unsigned Addr, uint64_t Size, @@ -171,10 +178,11 @@ struct OutgoingArgHandler : public CallLowering::ValueHandler { CCAssignFn *AssignFnVarArg; uint64_t StackSize; }; +} // namespace void AArch64CallLowering::splitToValueTypes( const ArgInfo &OrigArg, SmallVectorImpl<ArgInfo> &SplitArgs, - const DataLayout &DL, MachineRegisterInfo &MRI, + const DataLayout &DL, MachineRegisterInfo &MRI, CallingConv::ID CallConv, const SplitArgTy &PerformArgSplit) const { const AArch64TargetLowering &TLI = *getTLI<AArch64TargetLowering>(); LLVMContext &Ctx = OrigArg.Ty->getContext(); @@ -192,14 +200,19 @@ void AArch64CallLowering::splitToValueTypes( } unsigned FirstRegIdx = SplitArgs.size(); + bool NeedsRegBlock = TLI.functionArgumentNeedsConsecutiveRegisters( + OrigArg.Ty, CallConv, false); for (auto SplitVT : SplitVTs) { - // FIXME: set split flags if they're actually used (e.g. i128 on AAPCS). 
Type *SplitTy = SplitVT.getTypeForEVT(Ctx); SplitArgs.push_back( ArgInfo{MRI.createGenericVirtualRegister(getLLTForType(*SplitTy, DL)), SplitTy, OrigArg.Flags, OrigArg.IsFixed}); + if (NeedsRegBlock) + SplitArgs.back().Flags.setInConsecutiveRegs(); } + SplitArgs.back().Flags.setInConsecutiveRegsLast(); + for (unsigned i = 0; i < Offsets.size(); ++i) PerformArgSplit(SplitArgs[FirstRegIdx + i].Reg, Offsets[i] * 8); } @@ -207,7 +220,7 @@ void AArch64CallLowering::splitToValueTypes( bool AArch64CallLowering::lowerReturn(MachineIRBuilder &MIRBuilder, const Value *Val, unsigned VReg) const { MachineFunction &MF = MIRBuilder.getMF(); - const Function &F = *MF.getFunction(); + const Function &F = MF.getFunction(); auto MIB = MIRBuilder.buildInstrNoInsert(AArch64::RET_ReallyLR); assert(((Val && VReg) || (!Val && !VReg)) && "Return value without a vreg"); @@ -222,7 +235,7 @@ bool AArch64CallLowering::lowerReturn(MachineIRBuilder &MIRBuilder, setArgFlags(OrigArg, AttributeList::ReturnIndex, DL, F); SmallVector<ArgInfo, 8> SplitArgs; - splitToValueTypes(OrigArg, SplitArgs, DL, MRI, + splitToValueTypes(OrigArg, SplitArgs, DL, MRI, F.getCallingConv(), [&](unsigned Reg, uint64_t Offset) { MIRBuilder.buildExtract(Reg, VReg, Offset); }); @@ -246,13 +259,15 @@ bool AArch64CallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, SmallVector<ArgInfo, 8> SplitArgs; unsigned i = 0; for (auto &Arg : F.args()) { + if (DL.getTypeStoreSize(Arg.getType()) == 0) + continue; ArgInfo OrigArg{VRegs[i], Arg.getType()}; setArgFlags(OrigArg, i + AttributeList::FirstArgIndex, DL, F); bool Split = false; LLT Ty = MRI.getType(VRegs[i]); unsigned Dst = VRegs[i]; - splitToValueTypes(OrigArg, SplitArgs, DL, MRI, + splitToValueTypes(OrigArg, SplitArgs, DL, MRI, F.getCallingConv(), [&](unsigned Reg, uint64_t Offset) { if (!Split) { Split = true; @@ -307,13 +322,13 @@ bool AArch64CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, const ArgInfo &OrigRet, ArrayRef<ArgInfo> OrigArgs) const { MachineFunction &MF = MIRBuilder.getMF(); - const Function &F = *MF.getFunction(); + const Function &F = MF.getFunction(); MachineRegisterInfo &MRI = MF.getRegInfo(); auto &DL = F.getParent()->getDataLayout(); SmallVector<ArgInfo, 8> SplitArgs; for (auto &OrigArg : OrigArgs) { - splitToValueTypes(OrigArg, SplitArgs, DL, MRI, + splitToValueTypes(OrigArg, SplitArgs, DL, MRI, CallConv, [&](unsigned Reg, uint64_t Offset) { MIRBuilder.buildExtract(Reg, OrigArg.Reg, Offset); }); @@ -366,7 +381,7 @@ bool AArch64CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, SmallVector<uint64_t, 8> RegOffsets; SmallVector<unsigned, 8> SplitRegs; - splitToValueTypes(OrigRet, SplitArgs, DL, MRI, + splitToValueTypes(OrigRet, SplitArgs, DL, MRI, F.getCallingConv(), [&](unsigned Reg, uint64_t Offset) { RegOffsets.push_back(Offset); SplitRegs.push_back(Reg); diff --git a/contrib/llvm/lib/Target/AArch64/AArch64CallLowering.h b/contrib/llvm/lib/Target/AArch64/AArch64CallLowering.h index d96ce95c4de0..68c127fc42e5 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64CallLowering.h +++ b/contrib/llvm/lib/Target/AArch64/AArch64CallLowering.h @@ -1,4 +1,4 @@ -//===--- AArch64CallLowering.h - Call lowering ------------------*- C++ -*-===// +//===- AArch64CallLowering.h - Call lowering --------------------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -17,12 +17,18 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/CodeGen/GlobalISel/CallLowering.h" +#include "llvm/IR/CallingConv.h" #include <cstdint> #include <functional> namespace llvm { class 
AArch64TargetLowering; +class CCValAssign; +class DataLayout; +class MachineIRBuilder; +class MachineRegisterInfo; +class Type; class AArch64CallLowering: public CallLowering { public: @@ -39,18 +45,18 @@ public: ArrayRef<ArgInfo> OrigArgs) const override; private: - typedef std::function<void(MachineIRBuilder &, Type *, unsigned, - CCValAssign &)> - RegHandler; + using RegHandler = std::function<void(MachineIRBuilder &, Type *, unsigned, + CCValAssign &)>; - typedef std::function<void(MachineIRBuilder &, int, CCValAssign &)> - MemHandler; + using MemHandler = + std::function<void(MachineIRBuilder &, int, CCValAssign &)>; - typedef std::function<void(unsigned, uint64_t)> SplitArgTy; + using SplitArgTy = std::function<void(unsigned, uint64_t)>; void splitToValueTypes(const ArgInfo &OrigArgInfo, SmallVectorImpl<ArgInfo> &SplitArgs, const DataLayout &DL, MachineRegisterInfo &MRI, + CallingConv::ID CallConv, const SplitArgTy &SplitArg) const; }; diff --git a/contrib/llvm/lib/Target/AArch64/AArch64CallingConvention.h b/contrib/llvm/lib/Target/AArch64/AArch64CallingConvention.h index bc44bc5f2461..461c01318d4e 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64CallingConvention.h +++ b/contrib/llvm/lib/Target/AArch64/AArch64CallingConvention.h @@ -19,8 +19,8 @@ #include "AArch64InstrInfo.h" #include "AArch64Subtarget.h" #include "llvm/CodeGen/CallingConvLower.h" +#include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/IR/CallingConv.h" -#include "llvm/Target/TargetInstrInfo.h" namespace { using namespace llvm; diff --git a/contrib/llvm/lib/Target/AArch64/AArch64CallingConvention.td b/contrib/llvm/lib/Target/AArch64/AArch64CallingConvention.td index 291bc5ea858e..93a68449de8d 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64CallingConvention.td +++ b/contrib/llvm/lib/Target/AArch64/AArch64CallingConvention.td @@ -49,6 +49,9 @@ def CC_AArch64_AAPCS : CallingConv<[ // Pass SwiftSelf in a callee saved register. CCIfSwiftSelf<CCIfType<[i64], CCAssignToRegWithShadow<[X20], [W20]>>>, + // A SwiftError is passed in X21. 
+ CCIfSwiftError<CCIfType<[i64], CCAssignToRegWithShadow<[X21], [W21]>>>, + CCIfConsecutiveRegs<CCCustom<"CC_AArch64_Custom_Block">>, // Handle i1, i8, i16, i32, i64, f32, f64 and v2f64 by passing in registers, diff --git a/contrib/llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp b/contrib/llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp index b3b738584b40..b88fba4452a1 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp @@ -25,7 +25,6 @@ #include "AArch64.h" #include "AArch64InstrInfo.h" #include "AArch64MachineFunctionInfo.h" -#include "AArch64TargetMachine.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" @@ -43,7 +42,7 @@ struct LDTLSCleanup : public MachineFunctionPass { } bool runOnMachineFunction(MachineFunction &MF) override { - if (skipFunction(*MF.getFunction())) + if (skipFunction(MF.getFunction())) return false; AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); diff --git a/contrib/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp b/contrib/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp index 17aafa0c3d6e..0a9167edcdb3 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp @@ -101,23 +101,19 @@ #include "AArch64.h" #include "AArch64InstrInfo.h" #include "AArch64MachineFunctionInfo.h" -#include "AArch64Subtarget.h" -#include "MCTargetDesc/AArch64AddressingModes.h" #include "llvm/ADT/BitVector.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/MapVector.h" -#include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" -#include "llvm/Target/TargetRegisterInfo.h" using namespace llvm; #define DEBUG_TYPE "aarch64-collect-loh" @@ -486,7 +482,7 @@ static void handleNormalInst(const MachineInstr &MI, LOHInfo *LOHInfos) { } bool AArch64CollectLOH::runOnMachineFunction(MachineFunction &MF) { - if (skipFunction(*MF.getFunction())) + if (skipFunction(MF.getFunction())) return false; DEBUG(dbgs() << "********** AArch64 Collect LOH **********\n" diff --git a/contrib/llvm/lib/Target/AArch64/AArch64CondBrTuning.cpp b/contrib/llvm/lib/Target/AArch64/AArch64CondBrTuning.cpp index 51700f905979..30cefbad884c 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64CondBrTuning.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64CondBrTuning.cpp @@ -32,13 +32,12 @@ #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/MachineTraceMetrics.h" #include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" +#include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetInstrInfo.h" -#include "llvm/Target/TargetRegisterInfo.h" -#include "llvm/Target/TargetSubtargetInfo.h" using namespace llvm; @@ -291,7 +290,7 @@ bool AArch64CondBrTuning::tryToTuneBranch(MachineInstr &MI, } 
bool AArch64CondBrTuning::runOnMachineFunction(MachineFunction &MF) { - if (skipFunction(*MF.getFunction())) + if (skipFunction(MF.getFunction())) return false; DEBUG(dbgs() << "********** AArch64 Conditional Branch Tuning **********\n" diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp b/contrib/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp index 2dfcd2d1c393..d14bde33d94e 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp @@ -60,20 +60,26 @@ #include "AArch64.h" #include "MCTargetDesc/AArch64AddressingModes.h" +#include "Utils/AArch64BaseInfo.h" +#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" -#include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/CodeGen/TargetSubtargetInfo.h" +#include "llvm/Pass.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetInstrInfo.h" -#include "llvm/Target/TargetSubtargetInfo.h" +#include <cassert> #include <cstdlib> #include <tuple> @@ -84,6 +90,7 @@ using namespace llvm; STATISTIC(NumConditionsAdjusted, "Number of conditions adjusted"); namespace { + class AArch64ConditionOptimizer : public MachineFunctionPass { const TargetInstrInfo *TII; MachineDominatorTree *DomTree; @@ -92,12 +99,14 @@ class AArch64ConditionOptimizer : public MachineFunctionPass { public: // Stores immediate, compare instruction opcode and branch condition (in this // order) of adjusted comparison. 
- typedef std::tuple<int, unsigned, AArch64CC::CondCode> CmpInfo; + using CmpInfo = std::tuple<int, unsigned, AArch64CC::CondCode>; static char ID; + AArch64ConditionOptimizer() : MachineFunctionPass(ID) { initializeAArch64ConditionOptimizerPass(*PassRegistry::getPassRegistry()); } + void getAnalysisUsage(AnalysisUsage &AU) const override; MachineInstr *findSuitableCompare(MachineBasicBlock *MBB); CmpInfo adjustCmp(MachineInstr *CmpMI, AArch64CC::CondCode Cmp); @@ -105,10 +114,12 @@ public: bool adjustTo(MachineInstr *CmpMI, AArch64CC::CondCode Cmp, MachineInstr *To, int ToImm); bool runOnMachineFunction(MachineFunction &MF) override; + StringRef getPassName() const override { return "AArch64 Condition Optimizer"; } }; + } // end anonymous namespace char AArch64ConditionOptimizer::ID = 0; @@ -196,7 +207,7 @@ MachineInstr *AArch64ConditionOptimizer::findSuitableCompare( return nullptr; } } - DEBUG(dbgs() << "Flags not defined in BB#" << MBB->getNumber() << '\n'); + DEBUG(dbgs() << "Flags not defined in " << printMBBReference(*MBB) << '\n'); return nullptr; } @@ -316,7 +327,7 @@ bool AArch64ConditionOptimizer::adjustTo(MachineInstr *CmpMI, bool AArch64ConditionOptimizer::runOnMachineFunction(MachineFunction &MF) { DEBUG(dbgs() << "********** AArch64 Conditional Compares **********\n" << "********** Function: " << MF.getName() << '\n'); - if (skipFunction(*MF.getFunction())) + if (skipFunction(MF.getFunction())) return false; TII = MF.getSubtarget().getInstrInfo(); diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp b/contrib/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp index 9eda56c825a9..b0bda7c43c15 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp @@ -31,12 +31,12 @@ #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/MachineTraceMetrics.h" #include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" +#include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetInstrInfo.h" -#include "llvm/Target/TargetRegisterInfo.h" -#include "llvm/Target/TargetSubtargetInfo.h" using namespace llvm; @@ -369,7 +369,7 @@ MachineInstr *SSACCmpConv::findConvertibleCompare(MachineBasicBlock *MBB) { return nullptr; } } - DEBUG(dbgs() << "Flags not defined in BB#" << MBB->getNumber() << '\n'); + DEBUG(dbgs() << "Flags not defined in " << printMBBReference(*MBB) << '\n'); return nullptr; } @@ -383,7 +383,7 @@ bool SSACCmpConv::canSpeculateInstrs(MachineBasicBlock *MBB, // Reject any live-in physregs. It's probably NZCV/EFLAGS, and very hard to // get right. if (!MBB->livein_empty()) { - DEBUG(dbgs() << "BB#" << MBB->getNumber() << " has live-ins.\n"); + DEBUG(dbgs() << printMBBReference(*MBB) << " has live-ins.\n"); return false; } @@ -396,7 +396,7 @@ bool SSACCmpConv::canSpeculateInstrs(MachineBasicBlock *MBB, continue; if (++InstrCount > BlockInstrLimit && !Stress) { - DEBUG(dbgs() << "BB#" << MBB->getNumber() << " has more than " + DEBUG(dbgs() << printMBBReference(*MBB) << " has more than " << BlockInstrLimit << " instructions.\n"); return false; } @@ -458,8 +458,9 @@ bool SSACCmpConv::canConvert(MachineBasicBlock *MBB) { return false; // The CFG topology checks out. 
- DEBUG(dbgs() << "\nTriangle: BB#" << Head->getNumber() << " -> BB#" - << CmpBB->getNumber() << " -> BB#" << Tail->getNumber() << '\n'); + DEBUG(dbgs() << "\nTriangle: " << printMBBReference(*Head) << " -> " + << printMBBReference(*CmpBB) << " -> " + << printMBBReference(*Tail) << '\n'); ++NumConsidered; // Tail is allowed to have many predecessors, but we can't handle PHIs yet. @@ -562,8 +563,9 @@ bool SSACCmpConv::canConvert(MachineBasicBlock *MBB) { } void SSACCmpConv::convert(SmallVectorImpl<MachineBasicBlock *> &RemovedBlocks) { - DEBUG(dbgs() << "Merging BB#" << CmpBB->getNumber() << " into BB#" - << Head->getNumber() << ":\n" << *CmpBB); + DEBUG(dbgs() << "Merging " << printMBBReference(*CmpBB) << " into " + << printMBBReference(*Head) << ":\n" + << *CmpBB); // All CmpBB instructions are moved into Head, and CmpBB is deleted. // Update the CFG first. @@ -922,7 +924,7 @@ bool AArch64ConditionalCompares::tryConvert(MachineBasicBlock *MBB) { bool AArch64ConditionalCompares::runOnMachineFunction(MachineFunction &MF) { DEBUG(dbgs() << "********** AArch64 Conditional Compares **********\n" << "********** Function: " << MF.getName() << '\n'); - if (skipFunction(*MF.getFunction())) + if (skipFunction(MF.getFunction())) return false; TII = MF.getSubtarget().getInstrInfo(); @@ -934,7 +936,7 @@ bool AArch64ConditionalCompares::runOnMachineFunction(MachineFunction &MF) { MBPI = &getAnalysis<MachineBranchProbabilityInfo>(); Traces = &getAnalysis<MachineTraceMetrics>(); MinInstr = nullptr; - MinSize = MF.getFunction()->optForMinSize(); + MinSize = MF.getFunction().optForMinSize(); bool Changed = false; CmpConv.runOnMachineFunction(MF, MBPI); diff --git a/contrib/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp b/contrib/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp index b72f23b109d9..8e7e740da6f6 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp @@ -20,10 +20,10 @@ #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetInstrInfo.h" -#include "llvm/Target/TargetSubtargetInfo.h" using namespace llvm; #define DEBUG_TYPE "aarch64-dead-defs" @@ -55,6 +55,8 @@ public: AU.setPreservesCFG(); MachineFunctionPass::getAnalysisUsage(AU); } + + bool shouldSkip(const MachineInstr &MI, const MachineFunction &MF) const; }; char AArch64DeadRegisterDefinitions::ID = 0; } // end anonymous namespace @@ -69,6 +71,63 @@ static bool usesFrameIndex(const MachineInstr &MI) { return false; } +bool +AArch64DeadRegisterDefinitions::shouldSkip(const MachineInstr &MI, + const MachineFunction &MF) const { + if (!MF.getSubtarget<AArch64Subtarget>().hasLSE()) + return false; + +#define CASE_AARCH64_ATOMIC_(PREFIX) \ + case AArch64::PREFIX##X: \ + case AArch64::PREFIX##W: \ + case AArch64::PREFIX##H: \ + case AArch64::PREFIX##B + + for (const MachineMemOperand *MMO : MI.memoperands()) { + if (MMO->isAtomic()) { + unsigned Opcode = MI.getOpcode(); + switch (Opcode) { + default: + return false; + break; + + CASE_AARCH64_ATOMIC_(LDADDA): + CASE_AARCH64_ATOMIC_(LDADDAL): + + CASE_AARCH64_ATOMIC_(LDCLRA): + CASE_AARCH64_ATOMIC_(LDCLRAL): + + CASE_AARCH64_ATOMIC_(LDEORA): + CASE_AARCH64_ATOMIC_(LDEORAL): + + 
CASE_AARCH64_ATOMIC_(LDSETA): + CASE_AARCH64_ATOMIC_(LDSETAL): + + CASE_AARCH64_ATOMIC_(LDSMAXA): + CASE_AARCH64_ATOMIC_(LDSMAXAL): + + CASE_AARCH64_ATOMIC_(LDSMINA): + CASE_AARCH64_ATOMIC_(LDSMINAL): + + CASE_AARCH64_ATOMIC_(LDUMAXA): + CASE_AARCH64_ATOMIC_(LDUMAXAL): + + CASE_AARCH64_ATOMIC_(LDUMINA): + CASE_AARCH64_ATOMIC_(LDUMINAL): + + CASE_AARCH64_ATOMIC_(SWPA): + CASE_AARCH64_ATOMIC_(SWPAL): + return true; + break; + } + } + } + +#undef CASE_AARCH64_ATOMIC_ + + return false; +} + void AArch64DeadRegisterDefinitions::processMachineBasicBlock( MachineBasicBlock &MBB) { const MachineFunction &MF = *MBB.getParent(); @@ -86,55 +145,12 @@ void AArch64DeadRegisterDefinitions::processMachineBasicBlock( DEBUG(dbgs() << " Ignoring, XZR or WZR already used by the instruction\n"); continue; } - if (MF.getSubtarget<AArch64Subtarget>().hasLSE()) { - // XZ/WZ for LSE can only be used when acquire semantics are not used, - // LDOPAL WZ is an invalid opcode. - switch (MI.getOpcode()) { - case AArch64::CASALb: - case AArch64::CASALh: - case AArch64::CASALs: - case AArch64::CASALd: - case AArch64::SWPALb: - case AArch64::SWPALh: - case AArch64::SWPALs: - case AArch64::SWPALd: - case AArch64::LDADDALb: - case AArch64::LDADDALh: - case AArch64::LDADDALs: - case AArch64::LDADDALd: - case AArch64::LDCLRALb: - case AArch64::LDCLRALh: - case AArch64::LDCLRALs: - case AArch64::LDCLRALd: - case AArch64::LDEORALb: - case AArch64::LDEORALh: - case AArch64::LDEORALs: - case AArch64::LDEORALd: - case AArch64::LDSETALb: - case AArch64::LDSETALh: - case AArch64::LDSETALs: - case AArch64::LDSETALd: - case AArch64::LDSMINALb: - case AArch64::LDSMINALh: - case AArch64::LDSMINALs: - case AArch64::LDSMINALd: - case AArch64::LDSMAXALb: - case AArch64::LDSMAXALh: - case AArch64::LDSMAXALs: - case AArch64::LDSMAXALd: - case AArch64::LDUMINALb: - case AArch64::LDUMINALh: - case AArch64::LDUMINALs: - case AArch64::LDUMINALd: - case AArch64::LDUMAXALb: - case AArch64::LDUMAXALh: - case AArch64::LDUMAXALs: - case AArch64::LDUMAXALd: - continue; - default: - break; - } + + if (shouldSkip(MI, MF)) { + DEBUG(dbgs() << " Ignoring, Atomic instruction with acquire semantics using WZR/XZR\n"); + continue; } + const MCInstrDesc &Desc = MI.getDesc(); for (int I = 0, E = Desc.getNumDefs(); I != E; ++I) { MachineOperand &MO = MI.getOperand(I); @@ -182,7 +198,7 @@ void AArch64DeadRegisterDefinitions::processMachineBasicBlock( // Scan the function for instructions that have a dead definition of a // register. Replace that register with the zero register when possible. 
bool AArch64DeadRegisterDefinitions::runOnMachineFunction(MachineFunction &MF) { - if (skipFunction(*MF.getFunction())) + if (skipFunction(MF.getFunction())) return false; TRI = MF.getSubtarget().getRegisterInfo(); diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/contrib/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp index d52cd84246a1..c3842785f2be 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -1,4 +1,4 @@ -//==-- AArch64ExpandPseudoInsts.cpp - Expand pseudo instructions --*- C++ -*-=// +//===- AArch64ExpandPseudoInsts.cpp - Expand pseudo instructions ----------===// // // The LLVM Compiler Infrastructure // @@ -18,24 +18,44 @@ #include "AArch64Subtarget.h" #include "MCTargetDesc/AArch64AddressingModes.h" #include "Utils/AArch64BaseInfo.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/Triple.h" #include "llvm/CodeGen/LivePhysRegs.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/TargetSubtargetInfo.h" +#include "llvm/IR/DebugLoc.h" +#include "llvm/MC/MCInstrDesc.h" +#include "llvm/Pass.h" +#include "llvm/Support/CodeGen.h" #include "llvm/Support/MathExtras.h" +#include "llvm/Target/TargetMachine.h" +#include <cassert> +#include <cstdint> +#include <iterator> +#include <limits> +#include <utility> + using namespace llvm; #define AARCH64_EXPAND_PSEUDO_NAME "AArch64 pseudo instruction expansion pass" namespace { + class AArch64ExpandPseudo : public MachineFunctionPass { public: + const AArch64InstrInfo *TII; + static char ID; + AArch64ExpandPseudo() : MachineFunctionPass(ID) { initializeAArch64ExpandPseudoPass(*PassRegistry::getPassRegistry()); } - const AArch64InstrInfo *TII; - bool runOnMachineFunction(MachineFunction &Fn) override; StringRef getPassName() const override { return AARCH64_EXPAND_PSEUDO_NAME; } @@ -55,8 +75,10 @@ private: MachineBasicBlock::iterator MBBI, MachineBasicBlock::iterator &NextMBBI); }; + +} // end anonymous namespace + char AArch64ExpandPseudo::ID = 0; -} INITIALIZE_PASS(AArch64ExpandPseudo, "aarch64-expand-pseudo", AARCH64_EXPAND_PSEUDO_NAME, false, false) @@ -151,12 +173,12 @@ static bool canUseOrr(uint64_t Chunk, uint64_t &Encoding) { /// This allows us to materialize constants like |A|B|A|A| or |A|B|C|A| (order /// of the chunks doesn't matter), assuming |A|A|A|A| can be materialized with /// an ORR instruction. -/// static bool tryToreplicateChunks(uint64_t UImm, MachineInstr &MI, MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, const AArch64InstrInfo *TII) { - typedef DenseMap<uint64_t, unsigned> CountMap; + using CountMap = DenseMap<uint64_t, unsigned>; + CountMap Counts; // Scan the constant and count how often every chunk occurs. @@ -242,7 +264,7 @@ static bool tryToreplicateChunks(uint64_t UImm, MachineInstr &MI, /// starts a contiguous sequence of ones if we look at the bits from the LSB /// towards the MSB. static bool isStartChunk(uint64_t Chunk) { - if (Chunk == 0 || Chunk == UINT64_MAX) + if (Chunk == 0 || Chunk == std::numeric_limits<uint64_t>::max()) return false; return isMask_64(~Chunk); @@ -252,7 +274,7 @@ static bool isStartChunk(uint64_t Chunk) { /// ends a contiguous sequence of ones if we look at the bits from the LSB /// towards the MSB. 
static bool isEndChunk(uint64_t Chunk) { - if (Chunk == 0 || Chunk == UINT64_MAX) + if (Chunk == 0 || Chunk == std::numeric_limits<uint64_t>::max()) return false; return isMask_64(Chunk); @@ -285,7 +307,6 @@ static uint64_t updateImm(uint64_t Imm, unsigned Idx, bool Clear) { /// /// We are also looking for constants like |S|A|B|E| where the contiguous /// sequence of ones wraps around the MSB into the LSB. -/// static bool trySequenceOfOnes(uint64_t UImm, MachineInstr &MI, MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, @@ -651,16 +672,15 @@ bool AArch64ExpandPseudo::expandCMP_SWAP( MI.eraseFromParent(); // Recompute livein lists. - const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); LivePhysRegs LiveRegs; - computeLiveIns(LiveRegs, MRI, *DoneBB); - computeLiveIns(LiveRegs, MRI, *StoreBB); - computeLiveIns(LiveRegs, MRI, *LoadCmpBB); + computeAndAddLiveIns(LiveRegs, *DoneBB); + computeAndAddLiveIns(LiveRegs, *StoreBB); + computeAndAddLiveIns(LiveRegs, *LoadCmpBB); // Do an extra pass around the loop to get loop carried registers right. StoreBB->clearLiveIns(); - computeLiveIns(LiveRegs, MRI, *StoreBB); + computeAndAddLiveIns(LiveRegs, *StoreBB); LoadCmpBB->clearLiveIns(); - computeLiveIns(LiveRegs, MRI, *LoadCmpBB); + computeAndAddLiveIns(LiveRegs, *LoadCmpBB); return true; } @@ -668,7 +688,6 @@ bool AArch64ExpandPseudo::expandCMP_SWAP( bool AArch64ExpandPseudo::expandCMP_SWAP_128( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, MachineBasicBlock::iterator &NextMBBI) { - MachineInstr &MI = *MBBI; DebugLoc DL = MI.getDebugLoc(); MachineOperand &DestLo = MI.getOperand(0); @@ -746,16 +765,15 @@ bool AArch64ExpandPseudo::expandCMP_SWAP_128( MI.eraseFromParent(); // Recompute liveness bottom up. - const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); LivePhysRegs LiveRegs; - computeLiveIns(LiveRegs, MRI, *DoneBB); - computeLiveIns(LiveRegs, MRI, *StoreBB); - computeLiveIns(LiveRegs, MRI, *LoadCmpBB); + computeAndAddLiveIns(LiveRegs, *DoneBB); + computeAndAddLiveIns(LiveRegs, *StoreBB); + computeAndAddLiveIns(LiveRegs, *LoadCmpBB); // Do an extra pass in the loop to get the loop carried dependencies right. 
StoreBB->clearLiveIns(); - computeLiveIns(LiveRegs, MRI, *StoreBB); + computeAndAddLiveIns(LiveRegs, *StoreBB); LoadCmpBB->clearLiveIns(); - computeLiveIns(LiveRegs, MRI, *LoadCmpBB); + computeAndAddLiveIns(LiveRegs, *LoadCmpBB); return true; } diff --git a/contrib/llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp b/contrib/llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp index 2c887a9ca5db..d1ddb2e3ef70 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp @@ -1,4 +1,4 @@ -//===-- AArch64FalkorHWPFFix.cpp - Avoid HW prefetcher pitfalls on Falkor--===// +//===- AArch64FalkorHWPFFix.cpp - Avoid HW prefetcher pitfalls on Falkor --===// // // The LLVM Compiler Infrastructure // @@ -15,21 +15,41 @@ #include "AArch64.h" #include "AArch64InstrInfo.h" +#include "AArch64Subtarget.h" #include "AArch64TargetMachine.h" +#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/None.h" +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/CodeGen/LiveRegUnits.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" +#include "llvm/IR/DebugLoc.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" -#include "llvm/IR/Module.h" -#include "llvm/Support/CommandLine.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Metadata.h" +#include "llvm/Pass.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include <cassert> +#include <iterator> +#include <utility> using namespace llvm; @@ -60,6 +80,7 @@ private: class FalkorMarkStridedAccessesLegacy : public FunctionPass { public: static char ID; // Pass ID, replacement for typeid + FalkorMarkStridedAccessesLegacy() : FunctionPass(ID) { initializeFalkorMarkStridedAccessesLegacyPass( *PassRegistry::getPassRegistry()); @@ -71,16 +92,16 @@ public: AU.addRequired<LoopInfoWrapperPass>(); AU.addPreserved<LoopInfoWrapperPass>(); AU.addRequired<ScalarEvolutionWrapperPass>(); - // FIXME: For some reason, preserving SE here breaks LSR (even if - // this pass changes nothing). - // AU.addPreserved<ScalarEvolutionWrapperPass>(); + AU.addPreserved<ScalarEvolutionWrapperPass>(); } bool runOnFunction(Function &F) override; }; -} // namespace + +} // end anonymous namespace char FalkorMarkStridedAccessesLegacy::ID = 0; + INITIALIZE_PASS_BEGIN(FalkorMarkStridedAccessesLegacy, DEBUG_TYPE, "Falkor HW Prefetch Fix", false, false) INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) @@ -165,7 +186,7 @@ public: bool runOnMachineFunction(MachineFunction &Fn) override; - virtual void getAnalysisUsage(AnalysisUsage &AU) const override { + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<MachineLoopInfo>(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -186,17 +207,16 @@ private: /// Bits from load opcodes used to compute HW prefetcher instruction tags. 
struct LoadInfo { - LoadInfo() - : DestReg(0), BaseReg(0), BaseRegIdx(-1), OffsetOpnd(nullptr), - IsPrePost(false) {} - unsigned DestReg; - unsigned BaseReg; - int BaseRegIdx; - const MachineOperand *OffsetOpnd; - bool IsPrePost; + LoadInfo() = default; + + unsigned DestReg = 0; + unsigned BaseReg = 0; + int BaseRegIdx = -1; + const MachineOperand *OffsetOpnd = nullptr; + bool IsPrePost = false; }; -} // namespace +} // end anonymous namespace char FalkorHWPFFix::ID = 0; @@ -618,9 +638,14 @@ static Optional<LoadInfo> getLoadInfo(const MachineInstr &MI) { break; } + // Loads from the stack pointer don't get prefetched. + unsigned BaseReg = MI.getOperand(BaseRegIdx).getReg(); + if (BaseReg == AArch64::SP || BaseReg == AArch64::WSP) + return None; + LoadInfo LI; LI.DestReg = DestRegIdx == -1 ? 0 : MI.getOperand(DestRegIdx).getReg(); - LI.BaseReg = MI.getOperand(BaseRegIdx).getReg(); + LI.BaseReg = BaseReg; LI.BaseRegIdx = BaseRegIdx; LI.OffsetOpnd = OffsetIdx == -1 ? nullptr : &MI.getOperand(OffsetIdx); LI.IsPrePost = IsPrePost; @@ -715,7 +740,7 @@ void FalkorHWPFFix::runOnLoop(MachineLoop &L, MachineFunction &Fn) { if (TagMap.count(NewTag)) continue; - DEBUG(dbgs() << "Changing base reg to: " << PrintReg(ScratchReg, TRI) + DEBUG(dbgs() << "Changing base reg to: " << printReg(ScratchReg, TRI) << '\n'); // Rewrite: @@ -735,7 +760,7 @@ void FalkorHWPFFix::runOnLoop(MachineLoop &L, MachineFunction &Fn) { // well to update the real base register. if (LdI.IsPrePost) { DEBUG(dbgs() << "Doing post MOV of incremented reg: " - << PrintReg(ScratchReg, TRI) << '\n'); + << printReg(ScratchReg, TRI) << '\n'); MI.getOperand(0).setReg( ScratchReg); // Change tied operand pre/post update dest. BuildMI(*MBB, std::next(MachineBasicBlock::iterator(MI)), DL, @@ -773,7 +798,7 @@ bool FalkorHWPFFix::runOnMachineFunction(MachineFunction &Fn) { if (ST.getProcFamily() != AArch64Subtarget::Falkor) return false; - if (skipFunction(*Fn.getFunction())) + if (skipFunction(Fn.getFunction())) return false; TII = static_cast<const AArch64InstrInfo *>(ST.getInstrInfo()); diff --git a/contrib/llvm/lib/Target/AArch64/AArch64FastISel.cpp b/contrib/llvm/lib/Target/AArch64/AArch64FastISel.cpp index 97396057dce0..fd1699fd363d 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64FastISel.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64FastISel.cpp @@ -1,4 +1,4 @@ -//===-- AArch6464FastISel.cpp - AArch64 FastISel implementation -----------===// +//===- AArch6464FastISel.cpp - AArch64 FastISel implementation ------------===// // // The LLVM Compiler Infrastructure // @@ -53,6 +53,7 @@ #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" #include "llvm/IR/Operator.h" #include "llvm/IR/Type.h" #include "llvm/IR/User.h" @@ -63,6 +64,7 @@ #include "llvm/Support/AtomicOrdering.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CodeGen.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" #include <algorithm> @@ -78,10 +80,10 @@ namespace { class AArch64FastISel final : public FastISel { class Address { public: - typedef enum { + using BaseKind = enum { RegBase, FrameIndexBase - } BaseKind; + }; private: BaseKind Kind = RegBase; @@ -944,7 +946,6 @@ bool AArch64FastISel::computeCallAddress(const Value *V, Address &Addr) { return false; } - bool AArch64FastISel::isTypeLegal(Type *Ty, MVT &VT) { EVT evt = TLI.getValueType(DL, Ty, true); diff --git 
a/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index 7c6a99990406..73944359223a 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -110,6 +110,9 @@ #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/RegisterScavenging.h" +#include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" +#include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/CallingConv.h" #include "llvm/IR/DataLayout.h" @@ -121,11 +124,8 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" -#include "llvm/Target/TargetRegisterInfo.h" -#include "llvm/Target/TargetSubtargetInfo.h" #include <cassert> #include <cstdint> #include <iterator> @@ -155,8 +155,8 @@ static unsigned estimateRSStackSizeLimit(MachineFunction &MF) { MI.getOpcode() == AArch64::ADDSXri) continue; - for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { - if (!MI.getOperand(i).isFI()) + for (const MachineOperand &MO : MI.operands()) { + if (!MO.isFI()) continue; int Offset = 0; @@ -174,7 +174,7 @@ bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const { return false; // Don't use the red zone if the function explicitly asks us not to. // This is typically used for kernel code. - if (MF.getFunction()->hasFnAttribute(Attribute::NoRedZone)) + if (MF.getFunction().hasFnAttribute(Attribute::NoRedZone)) return false; const MachineFrameInfo &MFI = MF.getFrameInfo(); @@ -459,13 +459,13 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const { MachineBasicBlock::iterator MBBI = MBB.begin(); const MachineFrameInfo &MFI = MF.getFrameInfo(); - const Function *Fn = MF.getFunction(); + const Function &F = MF.getFunction(); const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>(); const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); const TargetInstrInfo *TII = Subtarget.getInstrInfo(); MachineModuleInfo &MMI = MF.getMMI(); AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); - bool needsFrameMoves = MMI.hasDebugInfo() || Fn->needsUnwindTableEntry(); + bool needsFrameMoves = MMI.hasDebugInfo() || F.needsUnwindTableEntry(); bool HasFP = hasFP(MF); // Debug location must be unknown since the first debug location is used @@ -474,7 +474,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, // All calls are tail calls in GHC calling conv, and functions have no // prologue/epilogue. - if (MF.getFunction()->getCallingConv() == CallingConv::GHC) + if (MF.getFunction().getCallingConv() == CallingConv::GHC) return; int NumBytes = (int)MFI.getStackSize(); @@ -507,7 +507,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, } bool IsWin64 = - Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv()); + Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()); unsigned FixedObject = IsWin64 ? alignTo(AFI->getVarArgsGPRSize(), 16) : 0; auto PrologueSaveSize = AFI->getCalleeSavedStackSize() + FixedObject; @@ -716,7 +716,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, // All calls are tail calls in GHC calling conv, and functions have no // prologue/epilogue. 
- if (MF.getFunction()->getCallingConv() == CallingConv::GHC) + if (MF.getFunction().getCallingConv() == CallingConv::GHC) return; // Initial and residual are named for consistency with the prologue. Note that @@ -765,7 +765,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, // it as the 2nd argument of AArch64ISD::TC_RETURN. bool IsWin64 = - Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv()); + Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()); unsigned FixedObject = IsWin64 ? alignTo(AFI->getVarArgsGPRSize(), 16) : 0; auto PrologueSaveSize = AFI->getCalleeSavedStackSize() + FixedObject; @@ -857,7 +857,7 @@ int AArch64FrameLowering::resolveFrameIndexReference(const MachineFunction &MF, const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>(); bool IsWin64 = - Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv()); + Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()); unsigned FixedObject = IsWin64 ? alignTo(AFI->getVarArgsGPRSize(), 16) : 0; int FPOffset = MFI.getObjectOffset(FI) + FixedObject + 16; int Offset = MFI.getObjectOffset(FI) + MFI.getStackSize(); @@ -928,7 +928,7 @@ static unsigned getPrologueDeath(MachineFunction &MF, unsigned Reg) { static bool produceCompactUnwindFrame(MachineFunction &MF) { const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>(); - AttributeList Attrs = MF.getFunction()->getAttributes(); + AttributeList Attrs = MF.getFunction().getAttributes(); return Subtarget.isTargetMachO() && !(Subtarget.getTargetLowering()->supportSwiftError() && Attrs.hasAttrSomewhere(Attribute::SwiftError)); @@ -959,7 +959,7 @@ static void computeCalleeSaveRegisterPairs( AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); MachineFrameInfo &MFI = MF.getFrameInfo(); - CallingConv::ID CC = MF.getFunction()->getCallingConv(); + CallingConv::ID CC = MF.getFunction().getCallingConv(); unsigned Count = CSI.size(); (void)CC; // MachO's compact unwind format relies on all registers being stored in @@ -1060,9 +1060,9 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters( StrOpc = RPI.isPaired() ? AArch64::STPXi : AArch64::STRXui; else StrOpc = RPI.isPaired() ? AArch64::STPDi : AArch64::STRDui; - DEBUG(dbgs() << "CSR spill: (" << TRI->getName(Reg1); + DEBUG(dbgs() << "CSR spill: (" << printReg(Reg1, TRI); if (RPI.isPaired()) - dbgs() << ", " << TRI->getName(Reg2); + dbgs() << ", " << printReg(Reg2, TRI); dbgs() << ") -> fi#(" << RPI.FrameIdx; if (RPI.isPaired()) dbgs() << ", " << RPI.FrameIdx+1; @@ -1092,7 +1092,7 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters( bool AArch64FrameLowering::restoreCalleeSavedRegisters( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, - const std::vector<CalleeSavedInfo> &CSI, + std::vector<CalleeSavedInfo> &CSI, const TargetRegisterInfo *TRI) const { MachineFunction &MF = *MBB.getParent(); const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); @@ -1123,9 +1123,9 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters( LdrOpc = RPI.isPaired() ? AArch64::LDPXi : AArch64::LDRXui; else LdrOpc = RPI.isPaired() ? 
AArch64::LDPDi : AArch64::LDRDui; - DEBUG(dbgs() << "CSR restore: (" << TRI->getName(Reg1); + DEBUG(dbgs() << "CSR restore: (" << printReg(Reg1, TRI); if (RPI.isPaired()) - dbgs() << ", " << TRI->getName(Reg2); + dbgs() << ", " << printReg(Reg2, TRI); dbgs() << ") -> fi#(" << RPI.FrameIdx; if (RPI.isPaired()) dbgs() << ", " << RPI.FrameIdx+1; @@ -1154,7 +1154,7 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, RegScavenger *RS) const { // All calls are tail calls in GHC calling conv, and functions have no // prologue/epilogue. - if (MF.getFunction()->getCallingConv() == CallingConv::GHC) + if (MF.getFunction().getCallingConv() == CallingConv::GHC) return; TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS); @@ -1208,7 +1208,7 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, DEBUG(dbgs() << "*** determineCalleeSaves\nUsed CSRs:"; for (unsigned Reg : SavedRegs.set_bits()) - dbgs() << ' ' << PrintReg(Reg, RegInfo); + dbgs() << ' ' << printReg(Reg, RegInfo); dbgs() << "\n";); // If any callee-saved registers are used, the frame cannot be eliminated. @@ -1233,8 +1233,8 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, // here. if (BigStack) { if (!ExtraCSSpill && UnspilledCSGPR != AArch64::NoRegister) { - DEBUG(dbgs() << "Spilling " << PrintReg(UnspilledCSGPR, RegInfo) - << " to get a scratch register.\n"); + DEBUG(dbgs() << "Spilling " << printReg(UnspilledCSGPR, RegInfo) + << " to get a scratch register.\n"); SavedRegs.set(UnspilledCSGPR); // MachO's compact unwind format relies on all registers being stored in // pairs, so if we need to spill one extra for BigStack, then we need to diff --git a/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.h b/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.h index f254ea9b70aa..55a256867fab 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.h +++ b/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.h @@ -14,7 +14,7 @@ #ifndef LLVM_LIB_TARGET_AARCH64_AARCH64FRAMELOWERING_H #define LLVM_LIB_TARGET_AARCH64_AARCH64FRAMELOWERING_H -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" namespace llvm { @@ -50,7 +50,7 @@ public: bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, - const std::vector<CalleeSavedInfo> &CSI, + std::vector<CalleeSavedInfo> &CSI, const TargetRegisterInfo *TRI) const override; /// \brief Can this function use the red zone for local allocations. diff --git a/contrib/llvm/lib/Target/AArch64/AArch64GenRegisterBankInfo.def b/contrib/llvm/lib/Target/AArch64/AArch64GenRegisterBankInfo.def index 8b1c9740d2ad..37720cbd32bb 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64GenRegisterBankInfo.def +++ b/contrib/llvm/lib/Target/AArch64/AArch64GenRegisterBankInfo.def @@ -11,26 +11,24 @@ /// \todo This should be generated by TableGen. //===----------------------------------------------------------------------===// -#ifndef LLVM_BUILD_GLOBAL_ISEL -#error "You shouldn't build this" -#endif - namespace llvm { RegisterBankInfo::PartialMapping AArch64GenRegisterBankInfo::PartMappings[]{ /* StartIdx, Length, RegBank */ - // 0: FPR 32-bit value. + // 0: FPR 16-bit value. + {0, 16, AArch64::FPRRegBank}, + // 1: FPR 32-bit value. {0, 32, AArch64::FPRRegBank}, - // 1: FPR 64-bit value. + // 2: FPR 64-bit value. {0, 64, AArch64::FPRRegBank}, - // 2: FPR 128-bit value. + // 3: FPR 128-bit value. {0, 128, AArch64::FPRRegBank}, - // 3: FPR 256-bit value. + // 4: FPR 256-bit value. 
{0, 256, AArch64::FPRRegBank}, - // 4: FPR 512-bit value. + // 5: FPR 512-bit value. {0, 512, AArch64::FPRRegBank}, - // 5: GPR 32-bit value. + // 6: GPR 32-bit value. {0, 32, AArch64::GPRRegBank}, - // 6: GPR 64-bit value. + // 7: GPR 64-bit value. {0, 64, AArch64::GPRRegBank}, }; @@ -41,58 +39,78 @@ RegisterBankInfo::ValueMapping AArch64GenRegisterBankInfo::ValMappings[]{ {nullptr, 0}, // 3-operands instructions (all binary operations should end up with one of // those mapping). - // 1: FPR 32-bit value. <-- This must match First3OpsIdx. + // 1: FPR 16-bit value. <-- This must match First3OpsIdx. + {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR16 - PMI_Min], 1}, + {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR16 - PMI_Min], 1}, + {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR16 - PMI_Min], 1}, + // 4: FPR 32-bit value. <-- This must match First3OpsIdx. {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR32 - PMI_Min], 1}, {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR32 - PMI_Min], 1}, {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR32 - PMI_Min], 1}, - // 4: FPR 64-bit value. + // 7: FPR 64-bit value. {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR64 - PMI_Min], 1}, {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR64 - PMI_Min], 1}, {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR64 - PMI_Min], 1}, - // 7: FPR 128-bit value. + // 10: FPR 128-bit value. {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR128 - PMI_Min], 1}, {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR128 - PMI_Min], 1}, {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR128 - PMI_Min], 1}, - // 10: FPR 256-bit value. + // 13: FPR 256-bit value. {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR256 - PMI_Min], 1}, {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR256 - PMI_Min], 1}, {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR256 - PMI_Min], 1}, - // 13: FPR 512-bit value. + // 16: FPR 512-bit value. {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR512 - PMI_Min], 1}, {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR512 - PMI_Min], 1}, {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR512 - PMI_Min], 1}, - // 16: GPR 32-bit value. + // 19: GPR 32-bit value. {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR32 - PMI_Min], 1}, {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR32 - PMI_Min], 1}, {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR32 - PMI_Min], 1}, - // 19: GPR 64-bit value. <-- This must match Last3OpsIdx. + // 22: GPR 64-bit value. <-- This must match Last3OpsIdx. {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR64 - PMI_Min], 1}, {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR64 - PMI_Min], 1}, {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR64 - PMI_Min], 1}, // Cross register bank copies. - // 22: FPR 32-bit value to GPR 32-bit value. <-- This must match - // FirstCrossRegCpyIdx. + // 25: FPR 16-bit value to GPR 16-bit. <-- This must match + // FirstCrossRegCpyIdx. + // Note: This is the kind of copy we see with physical registers. + {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR16 - PMI_Min], 1}, + {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR32 - PMI_Min], 1}, + // 27: FPR 32-bit value to GPR 32-bit value. {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR32 - PMI_Min], 1}, {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR32 - PMI_Min], 1}, - // 24: FPR 64-bit value to GPR 64-bit value. + // 29: FPR 64-bit value to GPR 64-bit value. 
{&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR64 - PMI_Min], 1}, {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR64 - PMI_Min], 1}, - // 26: FPR 128-bit value to GPR 128-bit value (invalid) + // 31: FPR 128-bit value to GPR 128-bit value (invalid) {nullptr, 1}, {nullptr, 1}, - // 28: FPR 256-bit value to GPR 256-bit value (invalid) + // 33: FPR 256-bit value to GPR 256-bit value (invalid) {nullptr, 1}, {nullptr, 1}, - // 30: FPR 512-bit value to GPR 512-bit value (invalid) + // 35: FPR 512-bit value to GPR 512-bit value (invalid) {nullptr, 1}, {nullptr, 1}, - // 32: GPR 32-bit value to FPR 32-bit value. + // 37: GPR 32-bit value to FPR 32-bit value. {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR32 - PMI_Min], 1}, {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR32 - PMI_Min], 1}, - // 34: GPR 64-bit value to FPR 64-bit value. <-- This must match + // 39: GPR 64-bit value to FPR 64-bit value. <-- This must match // LastCrossRegCpyIdx. {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR64 - PMI_Min], 1}, {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR64 - PMI_Min], 1}, + // 41: FPExt: 16 to 32. <-- This must match FPExt16To32Idx. + {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR32 - PMI_Min], 1}, + {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR16 - PMI_Min], 1}, + // 43: FPExt: 16 to 32. <-- This must match FPExt16To64Idx. + {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR64 - PMI_Min], 1}, + {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR16 - PMI_Min], 1}, + // 45: FPExt: 32 to 64. <-- This must match FPExt32To64Idx. + {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR64 - PMI_Min], 1}, + {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR32 - PMI_Min], 1}, + // 47: FPExt vector: 64 to 128. <-- This must match FPExt64To128Idx. + {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR128 - PMI_Min], 1}, + {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR64 - PMI_Min], 1}, }; bool AArch64GenRegisterBankInfo::checkPartialMap(unsigned Idx, @@ -149,16 +167,18 @@ unsigned AArch64GenRegisterBankInfo::getRegBankBaseIdxOffset(unsigned RBIdx, return -1; } if (RBIdx == PMI_FirstFPR) { - if (Size <= 32) + if (Size <= 16) return 0; - if (Size <= 64) + if (Size <= 32) return 1; - if (Size <= 128) + if (Size <= 64) return 2; - if (Size <= 256) + if (Size <= 128) return 3; - if (Size <= 512) + if (Size <= 256) return 4; + if (Size <= 512) + return 5; return -1; } return -1; @@ -210,4 +230,35 @@ AArch64GenRegisterBankInfo::getCopyMapping(unsigned DstBankID, ValMappingIdx <= LastCrossRegCpyIdx && "Mapping out of bound"); return &ValMappings[ValMappingIdx]; } + +const RegisterBankInfo::ValueMapping * +AArch64GenRegisterBankInfo::getFPExtMapping(unsigned DstSize, + unsigned SrcSize) { + // We support: + // - For Scalar: + // - 16 to 32. + // - 16 to 64. + // - 32 to 64. + // => FPR 16 to FPR 32|64 + // => FPR 32 to FPR 64 + // - For vectors: + // - v4f16 to v4f32 + // - v2f32 to v2f64 + // => FPR 64 to FPR 128 + + // Check that we have been asked sensible sizes. + if (SrcSize == 16) { + assert((DstSize == 32 || DstSize == 64) && "Unexpected half extension"); + if (DstSize == 32) + return &ValMappings[FPExt16To32Idx]; + return &ValMappings[FPExt16To64Idx]; + } + + if (SrcSize == 32) { + assert(DstSize == 64 && "Unexpected float extension"); + return &ValMappings[FPExt32To64Idx]; + } + assert((SrcSize == 64 || DstSize == 128) && "Unexpected vector extension"); + return &ValMappings[FPExt64To128Idx]; +} } // End llvm namespace. 
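The index renumbering in the ValueMapping table above follows from the new 16-bit FPR partial mapping: getRegBankBaseIdxOffset now buckets FPR sizes 16/32/64/128/256/512 into offsets 0 through 5. A minimal standalone sketch of that bucketing, with a hypothetical function name in place of the real PMI_* machinery, so the shift can be checked in isolation:

#include <cassert>

// Hypothetical stand-in for the FPR branch of getRegBankBaseIdxOffset after
// the 16-bit entry was added: bit sizes bucket into offsets 0..5 relative to
// the first FPR partial mapping, and anything larger is rejected with -1.
static int fprPartialMappingOffset(unsigned SizeInBits) {
  static const unsigned Buckets[] = {16, 32, 64, 128, 256, 512};
  for (int I = 0; I != 6; ++I)
    if (SizeInBits <= Buckets[I])
      return I;
  return -1;
}

int main() {
  assert(fprPartialMappingOffset(16) == 0);    // new half-precision bucket
  assert(fprPartialMappingOffset(32) == 1);    // was offset 0 before this change
  assert(fprPartialMappingOffset(512) == 5);
  assert(fprPartialMappingOffset(1024) == -1); // unsupported size
  return 0;
}

The point is only that every pre-existing FPR size now maps one offset later than before, which is why the comment indices in the tables above all moved.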
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/contrib/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index 06005f6b6886..0b10246b0cc8 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -53,7 +53,7 @@ public: } bool runOnMachineFunction(MachineFunction &MF) override { - ForCodeSize = MF.getFunction()->optForSize(); + ForCodeSize = MF.getFunction().optForSize(); Subtarget = &MF.getSubtarget<AArch64Subtarget>(); return SelectionDAGISel::runOnMachineFunction(MF); } diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 9c57926da5f5..1242cf5be188 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -42,6 +42,8 @@ #include "llvm/CodeGen/RuntimeLibcalls.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/SelectionDAGNodes.h" +#include "llvm/CodeGen/TargetCallingConv.h" +#include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/ValueTypes.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/Constants.h" @@ -70,8 +72,6 @@ #include "llvm/Support/KnownBits.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetCallingConv.h" -#include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" #include <algorithm> @@ -166,6 +166,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom); setOperationAction(ISD::SETCC, MVT::i32, Custom); setOperationAction(ISD::SETCC, MVT::i64, Custom); + setOperationAction(ISD::SETCC, MVT::f16, Custom); setOperationAction(ISD::SETCC, MVT::f32, Custom); setOperationAction(ISD::SETCC, MVT::f64, Custom); setOperationAction(ISD::BITREVERSE, MVT::i32, Legal); @@ -173,14 +174,17 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::BRCOND, MVT::Other, Expand); setOperationAction(ISD::BR_CC, MVT::i32, Custom); setOperationAction(ISD::BR_CC, MVT::i64, Custom); + setOperationAction(ISD::BR_CC, MVT::f16, Custom); setOperationAction(ISD::BR_CC, MVT::f32, Custom); setOperationAction(ISD::BR_CC, MVT::f64, Custom); setOperationAction(ISD::SELECT, MVT::i32, Custom); setOperationAction(ISD::SELECT, MVT::i64, Custom); + setOperationAction(ISD::SELECT, MVT::f16, Custom); setOperationAction(ISD::SELECT, MVT::f32, Custom); setOperationAction(ISD::SELECT, MVT::f64, Custom); setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); setOperationAction(ISD::SELECT_CC, MVT::i64, Custom); + setOperationAction(ISD::SELECT_CC, MVT::f16, Custom); setOperationAction(ISD::SELECT_CC, MVT::f32, Custom); setOperationAction(ISD::SELECT_CC, MVT::f64, Custom); setOperationAction(ISD::BR_JT, MVT::Other, Expand); @@ -317,119 +321,118 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::FPOW, MVT::f64, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom); setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); - - // f16 is a storage-only type, always promote it to f32. 
- setOperationAction(ISD::SETCC, MVT::f16, Promote); - setOperationAction(ISD::BR_CC, MVT::f16, Promote); - setOperationAction(ISD::SELECT_CC, MVT::f16, Promote); - setOperationAction(ISD::SELECT, MVT::f16, Promote); - setOperationAction(ISD::FADD, MVT::f16, Promote); - setOperationAction(ISD::FSUB, MVT::f16, Promote); - setOperationAction(ISD::FMUL, MVT::f16, Promote); - setOperationAction(ISD::FDIV, MVT::f16, Promote); - setOperationAction(ISD::FREM, MVT::f16, Promote); - setOperationAction(ISD::FMA, MVT::f16, Promote); - setOperationAction(ISD::FNEG, MVT::f16, Promote); - setOperationAction(ISD::FABS, MVT::f16, Promote); - setOperationAction(ISD::FCEIL, MVT::f16, Promote); - setOperationAction(ISD::FCOPYSIGN, MVT::f16, Promote); - setOperationAction(ISD::FCOS, MVT::f16, Promote); - setOperationAction(ISD::FFLOOR, MVT::f16, Promote); - setOperationAction(ISD::FNEARBYINT, MVT::f16, Promote); - setOperationAction(ISD::FPOW, MVT::f16, Promote); - setOperationAction(ISD::FPOWI, MVT::f16, Promote); - setOperationAction(ISD::FRINT, MVT::f16, Promote); - setOperationAction(ISD::FSIN, MVT::f16, Promote); - setOperationAction(ISD::FSINCOS, MVT::f16, Promote); - setOperationAction(ISD::FSQRT, MVT::f16, Promote); - setOperationAction(ISD::FEXP, MVT::f16, Promote); - setOperationAction(ISD::FEXP2, MVT::f16, Promote); - setOperationAction(ISD::FLOG, MVT::f16, Promote); - setOperationAction(ISD::FLOG2, MVT::f16, Promote); - setOperationAction(ISD::FLOG10, MVT::f16, Promote); - setOperationAction(ISD::FROUND, MVT::f16, Promote); - setOperationAction(ISD::FTRUNC, MVT::f16, Promote); - setOperationAction(ISD::FMINNUM, MVT::f16, Promote); - setOperationAction(ISD::FMAXNUM, MVT::f16, Promote); - setOperationAction(ISD::FMINNAN, MVT::f16, Promote); - setOperationAction(ISD::FMAXNAN, MVT::f16, Promote); - - // v4f16 is also a storage-only type, so promote it to v4f32 when that is - // known to be safe. - setOperationAction(ISD::FADD, MVT::v4f16, Promote); - setOperationAction(ISD::FSUB, MVT::v4f16, Promote); - setOperationAction(ISD::FMUL, MVT::v4f16, Promote); - setOperationAction(ISD::FDIV, MVT::v4f16, Promote); - setOperationAction(ISD::FP_EXTEND, MVT::v4f16, Promote); - setOperationAction(ISD::FP_ROUND, MVT::v4f16, Promote); - AddPromotedToType(ISD::FADD, MVT::v4f16, MVT::v4f32); - AddPromotedToType(ISD::FSUB, MVT::v4f16, MVT::v4f32); - AddPromotedToType(ISD::FMUL, MVT::v4f16, MVT::v4f32); - AddPromotedToType(ISD::FDIV, MVT::v4f16, MVT::v4f32); - AddPromotedToType(ISD::FP_EXTEND, MVT::v4f16, MVT::v4f32); - AddPromotedToType(ISD::FP_ROUND, MVT::v4f16, MVT::v4f32); - - // Expand all other v4f16 operations. 
- // FIXME: We could generate better code by promoting some operations to - // a pair of v4f32s - setOperationAction(ISD::FABS, MVT::v4f16, Expand); - setOperationAction(ISD::FCEIL, MVT::v4f16, Expand); - setOperationAction(ISD::FCOPYSIGN, MVT::v4f16, Expand); - setOperationAction(ISD::FCOS, MVT::v4f16, Expand); - setOperationAction(ISD::FFLOOR, MVT::v4f16, Expand); - setOperationAction(ISD::FMA, MVT::v4f16, Expand); - setOperationAction(ISD::FNEARBYINT, MVT::v4f16, Expand); - setOperationAction(ISD::FNEG, MVT::v4f16, Expand); - setOperationAction(ISD::FPOW, MVT::v4f16, Expand); - setOperationAction(ISD::FREM, MVT::v4f16, Expand); - setOperationAction(ISD::FROUND, MVT::v4f16, Expand); - setOperationAction(ISD::FRINT, MVT::v4f16, Expand); - setOperationAction(ISD::FSIN, MVT::v4f16, Expand); - setOperationAction(ISD::FSINCOS, MVT::v4f16, Expand); - setOperationAction(ISD::FSQRT, MVT::v4f16, Expand); - setOperationAction(ISD::FTRUNC, MVT::v4f16, Expand); - setOperationAction(ISD::SETCC, MVT::v4f16, Expand); - setOperationAction(ISD::BR_CC, MVT::v4f16, Expand); - setOperationAction(ISD::SELECT, MVT::v4f16, Expand); - setOperationAction(ISD::SELECT_CC, MVT::v4f16, Expand); - setOperationAction(ISD::FEXP, MVT::v4f16, Expand); - setOperationAction(ISD::FEXP2, MVT::v4f16, Expand); - setOperationAction(ISD::FLOG, MVT::v4f16, Expand); - setOperationAction(ISD::FLOG2, MVT::v4f16, Expand); - setOperationAction(ISD::FLOG10, MVT::v4f16, Expand); - - - // v8f16 is also a storage-only type, so expand it. - setOperationAction(ISD::FABS, MVT::v8f16, Expand); - setOperationAction(ISD::FADD, MVT::v8f16, Expand); - setOperationAction(ISD::FCEIL, MVT::v8f16, Expand); - setOperationAction(ISD::FCOPYSIGN, MVT::v8f16, Expand); - setOperationAction(ISD::FCOS, MVT::v8f16, Expand); - setOperationAction(ISD::FDIV, MVT::v8f16, Expand); - setOperationAction(ISD::FFLOOR, MVT::v8f16, Expand); - setOperationAction(ISD::FMA, MVT::v8f16, Expand); - setOperationAction(ISD::FMUL, MVT::v8f16, Expand); - setOperationAction(ISD::FNEARBYINT, MVT::v8f16, Expand); - setOperationAction(ISD::FNEG, MVT::v8f16, Expand); - setOperationAction(ISD::FPOW, MVT::v8f16, Expand); - setOperationAction(ISD::FREM, MVT::v8f16, Expand); - setOperationAction(ISD::FROUND, MVT::v8f16, Expand); - setOperationAction(ISD::FRINT, MVT::v8f16, Expand); - setOperationAction(ISD::FSIN, MVT::v8f16, Expand); - setOperationAction(ISD::FSINCOS, MVT::v8f16, Expand); - setOperationAction(ISD::FSQRT, MVT::v8f16, Expand); - setOperationAction(ISD::FSUB, MVT::v8f16, Expand); - setOperationAction(ISD::FTRUNC, MVT::v8f16, Expand); - setOperationAction(ISD::SETCC, MVT::v8f16, Expand); - setOperationAction(ISD::BR_CC, MVT::v8f16, Expand); - setOperationAction(ISD::SELECT, MVT::v8f16, Expand); - setOperationAction(ISD::SELECT_CC, MVT::v8f16, Expand); - setOperationAction(ISD::FP_EXTEND, MVT::v8f16, Expand); - setOperationAction(ISD::FEXP, MVT::v8f16, Expand); - setOperationAction(ISD::FEXP2, MVT::v8f16, Expand); - setOperationAction(ISD::FLOG, MVT::v8f16, Expand); - setOperationAction(ISD::FLOG2, MVT::v8f16, Expand); - setOperationAction(ISD::FLOG10, MVT::v8f16, Expand); + if (Subtarget->hasFullFP16()) + setOperationAction(ISD::FCOPYSIGN, MVT::f16, Custom); + else + setOperationAction(ISD::FCOPYSIGN, MVT::f16, Promote); + + setOperationAction(ISD::FREM, MVT::f16, Promote); + setOperationAction(ISD::FREM, MVT::v4f16, Promote); + setOperationAction(ISD::FREM, MVT::v8f16, Promote); + setOperationAction(ISD::FPOW, MVT::f16, Promote); + setOperationAction(ISD::FPOW, MVT::v4f16, 
Promote); + setOperationAction(ISD::FPOW, MVT::v8f16, Promote); + setOperationAction(ISD::FPOWI, MVT::f16, Promote); + setOperationAction(ISD::FCOS, MVT::f16, Promote); + setOperationAction(ISD::FCOS, MVT::v4f16, Promote); + setOperationAction(ISD::FCOS, MVT::v8f16, Promote); + setOperationAction(ISD::FSIN, MVT::f16, Promote); + setOperationAction(ISD::FSIN, MVT::v4f16, Promote); + setOperationAction(ISD::FSIN, MVT::v8f16, Promote); + setOperationAction(ISD::FSINCOS, MVT::f16, Promote); + setOperationAction(ISD::FSINCOS, MVT::v4f16, Promote); + setOperationAction(ISD::FSINCOS, MVT::v8f16, Promote); + setOperationAction(ISD::FEXP, MVT::f16, Promote); + setOperationAction(ISD::FEXP, MVT::v4f16, Promote); + setOperationAction(ISD::FEXP, MVT::v8f16, Promote); + setOperationAction(ISD::FEXP2, MVT::f16, Promote); + setOperationAction(ISD::FEXP2, MVT::v4f16, Promote); + setOperationAction(ISD::FEXP2, MVT::v8f16, Promote); + setOperationAction(ISD::FLOG, MVT::f16, Promote); + setOperationAction(ISD::FLOG, MVT::v4f16, Promote); + setOperationAction(ISD::FLOG, MVT::v8f16, Promote); + setOperationAction(ISD::FLOG2, MVT::f16, Promote); + setOperationAction(ISD::FLOG2, MVT::v4f16, Promote); + setOperationAction(ISD::FLOG2, MVT::v8f16, Promote); + setOperationAction(ISD::FLOG10, MVT::f16, Promote); + setOperationAction(ISD::FLOG10, MVT::v4f16, Promote); + setOperationAction(ISD::FLOG10, MVT::v8f16, Promote); + + if (!Subtarget->hasFullFP16()) { + setOperationAction(ISD::SELECT, MVT::f16, Promote); + setOperationAction(ISD::SELECT_CC, MVT::f16, Promote); + setOperationAction(ISD::SETCC, MVT::f16, Promote); + setOperationAction(ISD::BR_CC, MVT::f16, Promote); + setOperationAction(ISD::FADD, MVT::f16, Promote); + setOperationAction(ISD::FSUB, MVT::f16, Promote); + setOperationAction(ISD::FMUL, MVT::f16, Promote); + setOperationAction(ISD::FDIV, MVT::f16, Promote); + setOperationAction(ISD::FMA, MVT::f16, Promote); + setOperationAction(ISD::FNEG, MVT::f16, Promote); + setOperationAction(ISD::FABS, MVT::f16, Promote); + setOperationAction(ISD::FCEIL, MVT::f16, Promote); + setOperationAction(ISD::FSQRT, MVT::f16, Promote); + setOperationAction(ISD::FFLOOR, MVT::f16, Promote); + setOperationAction(ISD::FNEARBYINT, MVT::f16, Promote); + setOperationAction(ISD::FRINT, MVT::f16, Promote); + setOperationAction(ISD::FROUND, MVT::f16, Promote); + setOperationAction(ISD::FTRUNC, MVT::f16, Promote); + setOperationAction(ISD::FMINNUM, MVT::f16, Promote); + setOperationAction(ISD::FMAXNUM, MVT::f16, Promote); + setOperationAction(ISD::FMINNAN, MVT::f16, Promote); + setOperationAction(ISD::FMAXNAN, MVT::f16, Promote); + + // promote v4f16 to v4f32 when that is known to be safe. 
+ setOperationAction(ISD::FADD, MVT::v4f16, Promote); + setOperationAction(ISD::FSUB, MVT::v4f16, Promote); + setOperationAction(ISD::FMUL, MVT::v4f16, Promote); + setOperationAction(ISD::FDIV, MVT::v4f16, Promote); + setOperationAction(ISD::FP_EXTEND, MVT::v4f16, Promote); + setOperationAction(ISD::FP_ROUND, MVT::v4f16, Promote); + AddPromotedToType(ISD::FADD, MVT::v4f16, MVT::v4f32); + AddPromotedToType(ISD::FSUB, MVT::v4f16, MVT::v4f32); + AddPromotedToType(ISD::FMUL, MVT::v4f16, MVT::v4f32); + AddPromotedToType(ISD::FDIV, MVT::v4f16, MVT::v4f32); + AddPromotedToType(ISD::FP_EXTEND, MVT::v4f16, MVT::v4f32); + AddPromotedToType(ISD::FP_ROUND, MVT::v4f16, MVT::v4f32); + + setOperationAction(ISD::FABS, MVT::v4f16, Expand); + setOperationAction(ISD::FNEG, MVT::v4f16, Expand); + setOperationAction(ISD::FROUND, MVT::v4f16, Expand); + setOperationAction(ISD::FMA, MVT::v4f16, Expand); + setOperationAction(ISD::SETCC, MVT::v4f16, Expand); + setOperationAction(ISD::BR_CC, MVT::v4f16, Expand); + setOperationAction(ISD::SELECT, MVT::v4f16, Expand); + setOperationAction(ISD::SELECT_CC, MVT::v4f16, Expand); + setOperationAction(ISD::FTRUNC, MVT::v4f16, Expand); + setOperationAction(ISD::FCOPYSIGN, MVT::v4f16, Expand); + setOperationAction(ISD::FFLOOR, MVT::v4f16, Expand); + setOperationAction(ISD::FCEIL, MVT::v4f16, Expand); + setOperationAction(ISD::FRINT, MVT::v4f16, Expand); + setOperationAction(ISD::FNEARBYINT, MVT::v4f16, Expand); + setOperationAction(ISD::FSQRT, MVT::v4f16, Expand); + + setOperationAction(ISD::FABS, MVT::v8f16, Expand); + setOperationAction(ISD::FADD, MVT::v8f16, Expand); + setOperationAction(ISD::FCEIL, MVT::v8f16, Expand); + setOperationAction(ISD::FCOPYSIGN, MVT::v8f16, Expand); + setOperationAction(ISD::FDIV, MVT::v8f16, Expand); + setOperationAction(ISD::FFLOOR, MVT::v8f16, Expand); + setOperationAction(ISD::FMA, MVT::v8f16, Expand); + setOperationAction(ISD::FMUL, MVT::v8f16, Expand); + setOperationAction(ISD::FNEARBYINT, MVT::v8f16, Expand); + setOperationAction(ISD::FNEG, MVT::v8f16, Expand); + setOperationAction(ISD::FROUND, MVT::v8f16, Expand); + setOperationAction(ISD::FRINT, MVT::v8f16, Expand); + setOperationAction(ISD::FSQRT, MVT::v8f16, Expand); + setOperationAction(ISD::FSUB, MVT::v8f16, Expand); + setOperationAction(ISD::FTRUNC, MVT::v8f16, Expand); + setOperationAction(ISD::SETCC, MVT::v8f16, Expand); + setOperationAction(ISD::BR_CC, MVT::v8f16, Expand); + setOperationAction(ISD::SELECT, MVT::v8f16, Expand); + setOperationAction(ISD::SELECT_CC, MVT::v8f16, Expand); + setOperationAction(ISD::FP_EXTEND, MVT::v8f16, Expand); + } // AArch64 has implementations of a lot of rounding-like FP operations. 
for (MVT Ty : {MVT::f32, MVT::f64}) { @@ -445,6 +448,19 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::FMAXNAN, Ty, Legal); } + if (Subtarget->hasFullFP16()) { + setOperationAction(ISD::FNEARBYINT, MVT::f16, Legal); + setOperationAction(ISD::FFLOOR, MVT::f16, Legal); + setOperationAction(ISD::FCEIL, MVT::f16, Legal); + setOperationAction(ISD::FRINT, MVT::f16, Legal); + setOperationAction(ISD::FTRUNC, MVT::f16, Legal); + setOperationAction(ISD::FROUND, MVT::f16, Legal); + setOperationAction(ISD::FMINNUM, MVT::f16, Legal); + setOperationAction(ISD::FMAXNUM, MVT::f16, Legal); + setOperationAction(ISD::FMINNAN, MVT::f16, Legal); + setOperationAction(ISD::FMAXNAN, MVT::f16, Legal); + } + setOperationAction(ISD::PREFETCH, MVT::Other, Custom); setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom); @@ -775,8 +791,9 @@ void AArch64TargetLowering::addTypeForNEON(MVT VT, MVT PromotedBitwiseVT) { for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX}) setOperationAction(Opcode, VT, Legal); - // F[MIN|MAX][NUM|NAN] are available for all FP NEON types (not f16 though!). - if (VT.isFloatingPoint() && VT.getVectorElementType() != MVT::f16) + // F[MIN|MAX][NUM|NAN] are available for all FP NEON types. + if (VT.isFloatingPoint() && + (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16())) for (unsigned Opcode : {ISD::FMINNAN, ISD::FMAXNAN, ISD::FMINNUM, ISD::FMAXNUM}) setOperationAction(Opcode, VT, Legal); @@ -1414,16 +1431,20 @@ static void changeVectorFPCCToAArch64CC(ISD::CondCode CC, static bool isLegalArithImmed(uint64_t C) { // Matches AArch64DAGToDAGISel::SelectArithImmed(). - return (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0); + bool IsLegal = (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0); + DEBUG(dbgs() << "Is imm " << C << " legal: " << (IsLegal ? "yes\n" : "no\n")); + return IsLegal; } static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, const SDLoc &dl, SelectionDAG &DAG) { EVT VT = LHS.getValueType(); + const bool FullFP16 = + static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16(); if (VT.isFloatingPoint()) { assert(VT != MVT::f128); - if (VT == MVT::f16) { + if (VT == MVT::f16 && !FullFP16) { LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS); RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS); VT = MVT::f32; @@ -1513,9 +1534,12 @@ static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS, AArch64CC::CondCode OutCC, const SDLoc &DL, SelectionDAG &DAG) { unsigned Opcode = 0; + const bool FullFP16 = + static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16(); + if (LHS.getValueType().isFloatingPoint()) { assert(LHS.getValueType() != MVT::f128); - if (LHS.getValueType() == MVT::f16) { + if (LHS.getValueType() == MVT::f16 && !FullFP16) { LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS); RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS); } @@ -1948,10 +1972,41 @@ SDValue AArch64TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG, return makeLibCall(DAG, Call, MVT::f128, Ops, false, SDLoc(Op)).first; } +// Returns true if the given Op is the overflow flag result of an overflow +// intrinsic operation. 
+static bool isOverflowIntrOpRes(SDValue Op) { + unsigned Opc = Op.getOpcode(); + return (Op.getResNo() == 1 && + (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO || + Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO)); +} + static SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) { SDValue Sel = Op.getOperand(0); SDValue Other = Op.getOperand(1); + SDLoc dl(Sel); + // If the operand is an overflow checking operation, invert the condition + // code and kill the Not operation. I.e., transform: + // (xor (overflow_op_bool, 1)) + // --> + // (csel 1, 0, invert(cc), overflow_op_bool) + // ... which later gets transformed to just a cset instruction with an + // inverted condition code, rather than a cset + eor sequence. + if (isOneConstant(Other) && isOverflowIntrOpRes(Sel)) { + // Only lower legal XALUO ops. + if (!DAG.getTargetLoweringInfo().isTypeLegal(Sel->getValueType(0))) + return SDValue(); + + SDValue TVal = DAG.getConstant(1, dl, MVT::i32); + SDValue FVal = DAG.getConstant(0, dl, MVT::i32); + AArch64CC::CondCode CC; + SDValue Value, Overflow; + std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Sel.getValue(0), DAG); + SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32); + return DAG.getNode(AArch64ISD::CSEL, dl, Op.getValueType(), TVal, FVal, + CCVal, Overflow); + } // If neither operand is a SELECT_CC, give up. if (Sel.getOpcode() != ISD::SELECT_CC) std::swap(Sel, Other); @@ -1970,7 +2025,6 @@ static SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) { SDValue RHS = Sel.getOperand(1); SDValue TVal = Sel.getOperand(2); SDValue FVal = Sel.getOperand(3); - SDLoc dl(Sel); // FIXME: This could be generalized to non-integer comparisons. if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64) @@ -2171,8 +2225,9 @@ SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op, if (Op.getOperand(0).getValueType().isVector()) return LowerVectorFP_TO_INT(Op, DAG); - // f16 conversions are promoted to f32. - if (Op.getOperand(0).getValueType() == MVT::f16) { + // f16 conversions are promoted to f32 when full fp16 is not supported. + if (Op.getOperand(0).getValueType() == MVT::f16 && + !Subtarget->hasFullFP16()) { SDLoc dl(Op); return DAG.getNode( Op.getOpcode(), dl, Op.getValueType(), @@ -2227,8 +2282,9 @@ SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op, if (Op.getValueType().isVector()) return LowerVectorINT_TO_FP(Op, DAG); - // f16 conversions are promoted to f32. - if (Op.getValueType() == MVT::f16) { + // f16 conversions are promoted to f32 when full fp16 is not supported. 
+ if (Op.getValueType() == MVT::f16 && + !Subtarget->hasFullFP16()) { SDLoc dl(Op); return DAG.getNode( ISD::FP_ROUND, dl, MVT::f16, @@ -2517,6 +2573,9 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SDValue AArch64TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { + DEBUG(dbgs() << "Custom lowering: "); + DEBUG(Op.dump()); + switch (Op.getOpcode()) { default: llvm_unreachable("unimplemented operand"); @@ -2640,7 +2699,7 @@ CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg) const { switch (CC) { default: - llvm_unreachable("Unsupported calling convention."); + report_fatal_error("Unsupported calling convention."); case CallingConv::WebKit_JS: return CC_AArch64_WebKit_JS; case CallingConv::GHC: @@ -2672,7 +2731,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments( SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo &MFI = MF.getFrameInfo(); - bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction()->getCallingConv()); + bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv()); // Assign locations to all of the incoming arguments. SmallVector<CCValAssign, 16> ArgLocs; @@ -2686,7 +2745,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments( // we use a special version of AnalyzeFormalArguments to pass in ValVT and // LocVT. unsigned NumArgs = Ins.size(); - Function::const_arg_iterator CurOrigArg = MF.getFunction()->arg_begin(); + Function::const_arg_iterator CurOrigArg = MF.getFunction().arg_begin(); unsigned CurArgIdx = 0; for (unsigned i = 0; i != NumArgs; ++i) { MVT ValVT = Ins[i].VT; @@ -2876,7 +2935,7 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, MachineFrameInfo &MFI = MF.getFrameInfo(); AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); auto PtrVT = getPointerTy(DAG.getDataLayout()); - bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction()->getCallingConv()); + bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv()); SmallVector<SDValue, 8> MemOps; @@ -3028,15 +3087,15 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization( return false; MachineFunction &MF = DAG.getMachineFunction(); - const Function *CallerF = MF.getFunction(); - CallingConv::ID CallerCC = CallerF->getCallingConv(); + const Function &CallerF = MF.getFunction(); + CallingConv::ID CallerCC = CallerF.getCallingConv(); bool CCMatch = CallerCC == CalleeCC; // Byval parameters hand the function a pointer directly into the stack area // we want to reuse during a tail call. Working around this *is* possible (see // X86) but less efficient and uglier in LowerCall. - for (Function::const_arg_iterator i = CallerF->arg_begin(), - e = CallerF->arg_end(); + for (Function::const_arg_iterator i = CallerF.arg_begin(), + e = CallerF.arg_end(); i != e; ++i) if (i->hasByValAttr()) return false; @@ -3187,7 +3246,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, // Check if it's really possible to do a tail call. 
IsTailCall = isEligibleForTailCallOptimization( Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG); - if (!IsTailCall && CLI.CS && CLI.CS->isMustTailCall()) + if (!IsTailCall && CLI.CS && CLI.CS.isMustTailCall()) report_fatal_error("failed to perform tail call elimination on a call " "site marked musttail"); @@ -3428,6 +3487,10 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, AArch64II::MO_GOT) { Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_GOT); Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee); + } else if (Subtarget->isTargetCOFF() && GV->hasDLLImportStorageClass()) { + assert(Subtarget->isTargetWindows() && + "Windows is the only supported COFF target"); + Callee = getGOT(G, DAG, AArch64II::MO_DLLIMPORT); } else { const GlobalValue *GV = G->getGlobal(); Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0); @@ -3628,11 +3691,12 @@ SDValue AArch64TargetLowering::getTargetNode(BlockAddressSDNode* N, EVT Ty, // (loadGOT sym) template <class NodeTy> -SDValue AArch64TargetLowering::getGOT(NodeTy *N, SelectionDAG &DAG) const { +SDValue AArch64TargetLowering::getGOT(NodeTy *N, SelectionDAG &DAG, + unsigned Flags) const { DEBUG(dbgs() << "AArch64TargetLowering::getGOT\n"); SDLoc DL(N); EVT Ty = getPointerTy(DAG.getDataLayout()); - SDValue GotAddr = getTargetNode(N, Ty, DAG, AArch64II::MO_GOT); + SDValue GotAddr = getTargetNode(N, Ty, DAG, AArch64II::MO_GOT | Flags); // FIXME: Once remat is capable of dealing with instructions with register // operands, expand this into two nodes instead of using a wrapper node. return DAG.getNode(AArch64ISD::LOADgot, DL, Ty, GotAddr); @@ -3640,29 +3704,30 @@ SDValue AArch64TargetLowering::getGOT(NodeTy *N, SelectionDAG &DAG) const { // (wrapper %highest(sym), %higher(sym), %hi(sym), %lo(sym)) template <class NodeTy> -SDValue AArch64TargetLowering::getAddrLarge(NodeTy *N, SelectionDAG &DAG) - const { +SDValue AArch64TargetLowering::getAddrLarge(NodeTy *N, SelectionDAG &DAG, + unsigned Flags) const { DEBUG(dbgs() << "AArch64TargetLowering::getAddrLarge\n"); SDLoc DL(N); EVT Ty = getPointerTy(DAG.getDataLayout()); const unsigned char MO_NC = AArch64II::MO_NC; return DAG.getNode( - AArch64ISD::WrapperLarge, DL, Ty, - getTargetNode(N, Ty, DAG, AArch64II::MO_G3), - getTargetNode(N, Ty, DAG, AArch64II::MO_G2 | MO_NC), - getTargetNode(N, Ty, DAG, AArch64II::MO_G1 | MO_NC), - getTargetNode(N, Ty, DAG, AArch64II::MO_G0 | MO_NC)); + AArch64ISD::WrapperLarge, DL, Ty, + getTargetNode(N, Ty, DAG, AArch64II::MO_G3 | Flags), + getTargetNode(N, Ty, DAG, AArch64II::MO_G2 | MO_NC | Flags), + getTargetNode(N, Ty, DAG, AArch64II::MO_G1 | MO_NC | Flags), + getTargetNode(N, Ty, DAG, AArch64II::MO_G0 | MO_NC | Flags)); } // (addlow (adrp %hi(sym)) %lo(sym)) template <class NodeTy> -SDValue AArch64TargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG) const { +SDValue AArch64TargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG, + unsigned Flags) const { DEBUG(dbgs() << "AArch64TargetLowering::getAddr\n"); SDLoc DL(N); EVT Ty = getPointerTy(DAG.getDataLayout()); - SDValue Hi = getTargetNode(N, Ty, DAG, AArch64II::MO_PAGE); + SDValue Hi = getTargetNode(N, Ty, DAG, AArch64II::MO_PAGE | Flags); SDValue Lo = getTargetNode(N, Ty, DAG, - AArch64II::MO_PAGEOFF | AArch64II::MO_NC); + AArch64II::MO_PAGEOFF | AArch64II::MO_NC | Flags); SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, Ty, Hi); return DAG.getNode(AArch64ISD::ADDlow, DL, Ty, ADRP, Lo); } @@ -3671,6 +3736,9 @@ SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG 
&DAG) const { GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op); const GlobalValue *GV = GN->getGlobal(); + const AArch64II::TOF TargetFlags = + (GV->hasDLLImportStorageClass() ? AArch64II::MO_DLLIMPORT + : AArch64II::MO_NO_FLAG); unsigned char OpFlags = Subtarget->ClassifyGlobalReference(GV, getTargetMachine()); @@ -3679,14 +3747,21 @@ SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op, // This also catches the large code model case for Darwin. if ((OpFlags & AArch64II::MO_GOT) != 0) { - return getGOT(GN, DAG); + return getGOT(GN, DAG, TargetFlags); } + SDValue Result; if (getTargetMachine().getCodeModel() == CodeModel::Large) { - return getAddrLarge(GN, DAG); + Result = getAddrLarge(GN, DAG, TargetFlags); } else { - return getAddr(GN, DAG); + Result = getAddr(GN, DAG, TargetFlags); } + EVT PtrVT = getPointerTy(DAG.getDataLayout()); + SDLoc DL(GN); + if (GV->hasDLLImportStorageClass()) + Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result, + MachinePointerInfo::getGOT(DAG.getMachineFunction())); + return Result; } /// \brief Convert a TLS address reference into the correct sequence of loads @@ -3720,7 +3795,8 @@ SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op, SDValue AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { - assert(Subtarget->isTargetDarwin() && "TLS only supported on Darwin"); + assert(Subtarget->isTargetDarwin() && + "This function expects a Darwin target"); SDLoc DL(Op); MVT PtrVT = getPointerTy(DAG.getDataLayout()); @@ -3809,9 +3885,6 @@ AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op, TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal()); - if (DAG.getTarget().Options.EmulatedTLS) - return LowerToTLSEmulatedModel(GA, DAG); - if (!EnableAArch64ELFLocalDynamicTLSGeneration) { if (Model == TLSModel::LocalDynamic) Model = TLSModel::GeneralDynamic; @@ -3897,6 +3970,10 @@ AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op, SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { + const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); + if (DAG.getTarget().Options.EmulatedTLS) + return LowerToTLSEmulatedModel(GA, DAG); + if (Subtarget->isTargetDarwin()) return LowerDarwinGlobalTLSAddress(Op, DAG); if (Subtarget->isTargetELF()) @@ -3929,12 +4006,8 @@ SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch // instruction. - unsigned Opc = LHS.getOpcode(); - if (LHS.getResNo() == 1 && isOneConstant(RHS) && - (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO || - Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO)) { - assert((CC == ISD::SETEQ || CC == ISD::SETNE) && - "Unexpected condition code."); + if (isOverflowIntrOpRes(LHS) && isOneConstant(RHS) && + (CC == ISD::SETEQ || CC == ISD::SETNE)) { // Only lower legal XALUO ops. if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0))) return SDValue(); @@ -4017,7 +4090,8 @@ SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { Cmp); } - assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64); + assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 || + LHS.getValueType() == MVT::f64); // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally // clean. Some of them require two branches to implement. 
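For the isOverflowIntrOpRes/LowerXOR change a few hunks up, the pattern being targeted is an overflow flag that gets negated before use. A hedged source-level illustration (my own example, not part of the patch), assuming Clang's usual lowering of the builtin to llvm.sadd.with.overflow:

#include <cstdint>

// Clang lowers __builtin_add_overflow on matching signed 64-bit operands to
// llvm.sadd.with.overflow.i64; the second result of that node is what
// isOverflowIntrOpRes() recognizes. The logical negation below reaches the
// DAG as (xor overflow-flag, 1), which LowerXOR now folds into a single
// CSEL/cset on the inverted condition instead of a cset followed by eor.
bool add_no_overflow(int64_t a, int64_t b, int64_t *sum) {
  return !__builtin_add_overflow(a, b, sum);
}

The same isOverflowIntrOpRes() helper is reused by the LowerBR_CC and LowerSELECT hunks above, so branches and selects on an overflow flag take the same shortened path.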
@@ -4051,25 +4125,26 @@ SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op, In2 = DAG.getNode(ISD::FP_ROUND, DL, VT, In2, DAG.getIntPtrConstant(0, DL)); EVT VecVT; - EVT EltVT; uint64_t EltMask; SDValue VecVal1, VecVal2; - if (VT == MVT::f32 || VT == MVT::v2f32 || VT == MVT::v4f32) { - EltVT = MVT::i32; - VecVT = (VT == MVT::v2f32 ? MVT::v2i32 : MVT::v4i32); - EltMask = 0x80000000ULL; + auto setVecVal = [&] (int Idx) { if (!VT.isVector()) { - VecVal1 = DAG.getTargetInsertSubreg(AArch64::ssub, DL, VecVT, + VecVal1 = DAG.getTargetInsertSubreg(Idx, DL, VecVT, DAG.getUNDEF(VecVT), In1); - VecVal2 = DAG.getTargetInsertSubreg(AArch64::ssub, DL, VecVT, + VecVal2 = DAG.getTargetInsertSubreg(Idx, DL, VecVT, DAG.getUNDEF(VecVT), In2); } else { VecVal1 = DAG.getNode(ISD::BITCAST, DL, VecVT, In1); VecVal2 = DAG.getNode(ISD::BITCAST, DL, VecVT, In2); } + }; + + if (VT == MVT::f32 || VT == MVT::v2f32 || VT == MVT::v4f32) { + VecVT = (VT == MVT::v2f32 ? MVT::v2i32 : MVT::v4i32); + EltMask = 0x80000000ULL; + setVecVal(AArch64::ssub); } else if (VT == MVT::f64 || VT == MVT::v2f64) { - EltVT = MVT::i64; VecVT = MVT::v2i64; // We want to materialize a mask with the high bit set, but the AdvSIMD @@ -4077,15 +4152,11 @@ SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op, // 64-bit elements. Instead, materialize zero and then negate it. EltMask = 0; - if (!VT.isVector()) { - VecVal1 = DAG.getTargetInsertSubreg(AArch64::dsub, DL, VecVT, - DAG.getUNDEF(VecVT), In1); - VecVal2 = DAG.getTargetInsertSubreg(AArch64::dsub, DL, VecVT, - DAG.getUNDEF(VecVT), In2); - } else { - VecVal1 = DAG.getNode(ISD::BITCAST, DL, VecVT, In1); - VecVal2 = DAG.getNode(ISD::BITCAST, DL, VecVT, In2); - } + setVecVal(AArch64::dsub); + } else if (VT == MVT::f16 || VT == MVT::v4f16 || VT == MVT::v8f16) { + VecVT = (VT == MVT::v4f16 ? MVT::v4i16 : MVT::v8i16); + EltMask = 0x8000ULL; + setVecVal(AArch64::hsub); } else { llvm_unreachable("Invalid type for copysign!"); } @@ -4103,6 +4174,8 @@ SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op, SDValue Sel = DAG.getNode(AArch64ISD::BIT, DL, VecVT, VecVal1, VecVal2, BuildVec); + if (VT == MVT::f16) + return DAG.getTargetExtractSubreg(AArch64::hsub, DL, VT, Sel); if (VT == MVT::f32) return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, Sel); else if (VT == MVT::f64) @@ -4112,7 +4185,7 @@ SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op, } SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const { - if (DAG.getMachineFunction().getFunction()->hasFnAttribute( + if (DAG.getMachineFunction().getFunction().hasFnAttribute( Attribute::NoImplicitFloat)) return SDValue(); @@ -4185,7 +4258,8 @@ SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { } // Now we know we're dealing with FP values. - assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64); + assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 || + LHS.getValueType() == MVT::f64); // If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead // and do the comparison. @@ -4235,7 +4309,7 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS, } // Also handle f16, for which we need to do a f32 comparison. 
- if (LHS.getValueType() == MVT::f16) { + if (LHS.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) { LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS); RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS); } @@ -4356,13 +4430,13 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS, SDValue CCVal; SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl); - EVT VT = TVal.getValueType(); return DAG.getNode(Opcode, dl, VT, TVal, FVal, CCVal, Cmp); } // Now we know we're dealing with FP values. - assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64); + assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 || + LHS.getValueType() == MVT::f64); assert(LHS.getValueType() == RHS.getValueType()); EVT VT = TVal.getValueType(); SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG); @@ -4423,12 +4497,9 @@ SDValue AArch64TargetLowering::LowerSELECT(SDValue Op, SDValue FVal = Op->getOperand(2); SDLoc DL(Op); - unsigned Opc = CCVal.getOpcode(); // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select // instruction. - if (CCVal.getResNo() == 1 && - (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO || - Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO)) { + if (isOverflowIntrOpRes(CCVal)) { // Only lower legal XALUO ops. if (!DAG.getTargetLoweringInfo().isTypeLegal(CCVal->getValueType(0))) return SDValue(); @@ -4597,7 +4668,7 @@ SDValue AArch64TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); - if (Subtarget->isCallingConvWin64(MF.getFunction()->getCallingConv())) + if (Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv())) return LowerWin64_VASTART(Op, DAG); else if (Subtarget->isTargetDarwin()) return LowerDarwin_VASTART(Op, DAG); @@ -4849,20 +4920,47 @@ SDValue AArch64TargetLowering::LowerShiftLeftParts(SDValue Op, bool AArch64TargetLowering::isOffsetFoldingLegal( const GlobalAddressSDNode *GA) const { - // The AArch64 target doesn't support folding offsets into global addresses. + DEBUG(dbgs() << "Skipping offset folding global address: "); + DEBUG(GA->dump()); + DEBUG(dbgs() << "AArch64 doesn't support folding offsets into global " + "addresses\n"); return false; } bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { // We can materialize #0.0 as fmov $Rd, XZR for 64-bit and 32-bit cases. // FIXME: We should be able to handle f128 as well with a clever lowering. 
- if (Imm.isPosZero() && (VT == MVT::f64 || VT == MVT::f32)) + if (Imm.isPosZero() && (VT == MVT::f16 || VT == MVT::f64 || VT == MVT::f32)) { + DEBUG(dbgs() << "Legal fp imm: materialize 0 using the zero register\n"); + return true; + } + + StringRef FPType; + bool IsLegal = false; + SmallString<128> ImmStrVal; + Imm.toString(ImmStrVal); + + if (VT == MVT::f64) { + FPType = "f64"; + IsLegal = AArch64_AM::getFP64Imm(Imm) != -1; + } else if (VT == MVT::f32) { + FPType = "f32"; + IsLegal = AArch64_AM::getFP32Imm(Imm) != -1; + } else if (VT == MVT::f16 && Subtarget->hasFullFP16()) { + FPType = "f16"; + IsLegal = AArch64_AM::getFP16Imm(Imm) != -1; + } + + if (IsLegal) { + DEBUG(dbgs() << "Legal " << FPType << " imm value: " << ImmStrVal << "\n"); return true; + } + + if (!FPType.empty()) + DEBUG(dbgs() << "Illegal " << FPType << " imm value: " << ImmStrVal << "\n"); + else + DEBUG(dbgs() << "Illegal fp imm " << ImmStrVal << ": unsupported fp type\n"); - if (VT == MVT::f64) - return AArch64_AM::getFP64Imm(Imm) != -1; - else if (VT == MVT::f32) - return AArch64_AM::getFP32Imm(Imm) != -1; return false; } @@ -4884,7 +4982,7 @@ static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode, // the initial estimate is 2^-8. Thus the number of extra steps to refine // the result for float (23 mantissa bits) is 2 and for double (52 // mantissa bits) is 3. - ExtraSteps = VT == MVT::f64 ? 3 : 2; + ExtraSteps = VT.getScalarType() == MVT::f64 ? 3 : 2; return DAG.getNode(Opcode, SDLoc(Operand), VT, Operand); } @@ -5301,6 +5399,7 @@ static SDValue NarrowVector(SDValue V128Reg, SelectionDAG &DAG) { SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op, SelectionDAG &DAG) const { assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!"); + DEBUG(dbgs() << "AArch64TargetLowering::ReconstructShuffle\n"); SDLoc dl(Op); EVT VT = Op.getValueType(); unsigned NumElts = VT.getVectorNumElements(); @@ -5336,8 +5435,10 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op, continue; else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT || !isa<ConstantSDNode>(V.getOperand(1))) { - // A shuffle can only come from building a vector from various - // elements of other vectors, provided their indices are constant. + DEBUG(dbgs() << "Reshuffle failed: " + "a shuffle can only come from building a vector from " + "various elements of other vectors, provided their " + "indices are constant\n"); return SDValue(); } @@ -5353,10 +5454,11 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op, Source->MaxElt = std::max(Source->MaxElt, EltNo); } - // Currently only do something sane when at most two source vectors - // are involved. - if (Sources.size() > 2) + if (Sources.size() > 2) { + DEBUG(dbgs() << "Reshuffle failed: currently only do something sane when at " + "most two source vectors are involved\n"); return SDValue(); + } // Find out the smallest element size among result and two sources, and use // it as element size to build the shuffle_vector. @@ -5400,7 +5502,7 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op, assert(SrcVT.getSizeInBits() == 2 * VT.getSizeInBits()); if (Src.MaxElt - Src.MinElt >= NumSrcElts) { - // Span too large for a VEXT to cope + DEBUG(dbgs() << "Reshuffle failed: span too large for a VEXT to cope\n"); return SDValue(); } @@ -5481,8 +5583,10 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op, } // Final check before we try to produce nonsense... 
- if (!isShuffleMaskLegal(Mask, ShuffleVT)) + if (!isShuffleMaskLegal(Mask, ShuffleVT)) { + DEBUG(dbgs() << "Reshuffle failed: illegal shuffle mask\n"); return SDValue(); + } SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) }; for (unsigned i = 0; i < Sources.size(); ++i) @@ -5490,7 +5594,16 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op, SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0], ShuffleOps[1], Mask); - return DAG.getNode(ISD::BITCAST, dl, VT, Shuffle); + SDValue V = DAG.getNode(ISD::BITCAST, dl, VT, Shuffle); + + DEBUG( + dbgs() << "Reshuffle, creating node: "; + Shuffle.dump(); + dbgs() << "Reshuffle, creating node: "; + V.dump(); + ); + + return V; } // check if an EXT instruction can handle the shuffle mask when the @@ -6703,27 +6816,36 @@ FailedModImm: usesOnlyOneValue = false; } - if (!Value.getNode()) + if (!Value.getNode()) { + DEBUG(dbgs() << "LowerBUILD_VECTOR: value undefined, creating undef node\n"); return DAG.getUNDEF(VT); + } - if (isOnlyLowElement) + if (isOnlyLowElement) { + DEBUG(dbgs() << "LowerBUILD_VECTOR: only low element used, creating 1 " + "SCALAR_TO_VECTOR node\n"); return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value); + } - // Use DUP for non-constant splats. For f32 constant splats, reduce to + // Use DUP for non-constant splats. For f32 constant splats, reduce to // i32 and try again. if (usesOnlyOneValue) { if (!isConstant) { if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT || - Value.getValueType() != VT) + Value.getValueType() != VT) { + DEBUG(dbgs() << "LowerBUILD_VECTOR: use DUP for non-constant splats\n"); return DAG.getNode(AArch64ISD::DUP, dl, VT, Value); + } // This is actually a DUPLANExx operation, which keeps everything vectory. - // DUPLANE works on 128-bit vectors, widen it if necessary. SDValue Lane = Value.getOperand(1); Value = Value.getOperand(0); - if (Value.getValueSizeInBits() == 64) + if (Value.getValueSizeInBits() == 64) { + DEBUG(dbgs() << "LowerBUILD_VECTOR: DUPLANE works on 128-bit vectors, " + "widening it\n"); Value = WidenVector(Value, DAG); + } unsigned Opcode = getDUPLANEOp(VT.getVectorElementType()); return DAG.getNode(Opcode, dl, VT, Value, Lane); @@ -6734,11 +6856,17 @@ FailedModImm: EVT EltTy = VT.getVectorElementType(); assert ((EltTy == MVT::f16 || EltTy == MVT::f32 || EltTy == MVT::f64) && "Unsupported floating-point vector type"); + DEBUG(dbgs() << "LowerBUILD_VECTOR: float constant splats, creating int " + "BITCASTS, and try again\n"); MVT NewType = MVT::getIntegerVT(EltTy.getSizeInBits()); for (unsigned i = 0; i < NumElts; ++i) Ops.push_back(DAG.getNode(ISD::BITCAST, dl, NewType, Op.getOperand(i))); EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts); SDValue Val = DAG.getBuildVector(VecVT, dl, Ops); + DEBUG( + dbgs() << "LowerBUILD_VECTOR: trying to lower new vector: "; + Val.dump(); + ); Val = LowerBUILD_VECTOR(Val, DAG); if (Val.getNode()) return DAG.getNode(ISD::BITCAST, dl, VT, Val); @@ -6764,11 +6892,12 @@ FailedModImm: return Val; } - // If all elements are constants and the case above didn't get hit, fall back - // to the default expansion, which will generate a load from the constant - // pool. - if (isConstant) + // This will generate a load from the constant pool. + if (isConstant) { + DEBUG(dbgs() << "LowerBUILD_VECTOR: all elements are constant, use default " + "expansion\n"); return SDValue(); + } // Empirical tests suggest this is rarely worth it for vectors of length <= 2. 
if (NumElts >= 4) { @@ -6783,6 +6912,9 @@ FailedModImm: // shuffle is valid for the target) and materialization element by element // on the stack followed by a load for everything else. if (!isConstant && !usesOnlyOneValue) { + DEBUG(dbgs() << "LowerBUILD_VECTOR: alternatives failed, creating sequence " + "of INSERT_VECTOR_ELT\n"); + SDValue Vec = DAG.getUNDEF(VT); SDValue Op0 = Op.getOperand(0); unsigned i = 0; @@ -6798,9 +6930,14 @@ FailedModImm: // extended (i32) and it is safe to cast them to the vector type by ignoring // the upper bits of the lowest lane (e.g. v8i8, v4i16). if (!Op0.isUndef()) { + DEBUG(dbgs() << "Creating node for op0, it is not undefined:\n"); Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op0); ++i; } + DEBUG( + if (i < NumElts) + dbgs() << "Creating nodes for the other vector elements:\n"; + ); for (; i < NumElts; ++i) { SDValue V = Op.getOperand(i); if (V.isUndef()) @@ -6811,7 +6948,8 @@ FailedModImm: return Vec; } - // Just use the default expansion. We failed to find a better alternative. + DEBUG(dbgs() << "LowerBUILD_VECTOR: use default expansion, failed to find " + "better alternative\n"); return SDValue(); } @@ -6912,8 +7050,7 @@ SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, return SDValue(); } -bool AArch64TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M, - EVT VT) const { +bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const { if (VT.getVectorNumElements() == 4 && (VT.is128BitVector() || VT.is64BitVector())) { unsigned PFIndexes[4]; @@ -7234,6 +7371,7 @@ SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op, /// specified in the intrinsic calls. bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, + MachineFunction &MF, unsigned Intrinsic) const { auto &DL = I.getModule()->getDataLayout(); switch (Intrinsic) { @@ -7256,9 +7394,8 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1); Info.offset = 0; Info.align = 0; - Info.vol = false; // volatile loads with NEON intrinsics not supported - Info.readMem = true; - Info.writeMem = false; + // volatile loads with NEON intrinsics not supported + Info.flags = MachineMemOperand::MOLoad; return true; } case Intrinsic::aarch64_neon_st2: @@ -7283,9 +7420,8 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1); Info.offset = 0; Info.align = 0; - Info.vol = false; // volatile stores with NEON intrinsics not supported - Info.readMem = false; - Info.writeMem = true; + // volatile stores with NEON intrinsics not supported + Info.flags = MachineMemOperand::MOStore; return true; } case Intrinsic::aarch64_ldaxr: @@ -7296,9 +7432,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Info.align = DL.getABITypeAlignment(PtrTy->getElementType()); - Info.vol = true; - Info.readMem = true; - Info.writeMem = false; + Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile; return true; } case Intrinsic::aarch64_stlxr: @@ -7309,9 +7443,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.ptrVal = I.getArgOperand(1); Info.offset = 0; Info.align = DL.getABITypeAlignment(PtrTy->getElementType()); - Info.vol = true; - Info.readMem = false; - Info.writeMem = true; + Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile; return true; } case Intrinsic::aarch64_ldaxp: 
@@ -7321,9 +7453,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Info.align = 16; - Info.vol = true; - Info.readMem = true; - Info.writeMem = false; + Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile; return true; case Intrinsic::aarch64_stlxp: case Intrinsic::aarch64_stxp: @@ -7332,9 +7462,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.ptrVal = I.getArgOperand(2); Info.offset = 0; Info.align = 16; - Info.vol = true; - Info.readMem = false; - Info.writeMem = true; + Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile; return true; default: break; @@ -7422,7 +7550,7 @@ bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const { if (isa<FPExtInst>(Ext)) return false; - // Vector types are next free. + // Vector types are not free. if (Ext->getType()->isVectorTy()) return false; @@ -7781,9 +7909,9 @@ EVT AArch64TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign, // instruction to materialize the v2i64 zero and one store (with restrictive // addressing mode). Just do two i64 store of zero-registers. bool Fast; - const Function *F = MF.getFunction(); + const Function &F = MF.getFunction(); if (Subtarget->hasFPARMv8() && !IsMemset && Size >= 16 && - !F->hasFnAttribute(Attribute::NoImplicitFloat) && + !F.hasFnAttribute(Attribute::NoImplicitFloat) && (memOpAlign(SrcAlign, DstAlign, 16) || (allowsMisalignedMemoryAccesses(MVT::f128, 0, 1, &Fast) && Fast))) return MVT::f128; @@ -7803,12 +7931,17 @@ EVT AArch64TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign, // 12-bit optionally shifted immediates are legal for adds. bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const { - // Avoid UB for INT64_MIN. - if (Immed == std::numeric_limits<int64_t>::min()) + if (Immed == std::numeric_limits<int64_t>::min()) { + DEBUG(dbgs() << "Illegal add imm " << Immed << ": avoid UB for INT64_MIN\n"); return false; + } // Same encoding for add/sub, just flip the sign. Immed = std::abs(Immed); - return ((Immed >> 12) == 0 || ((Immed & 0xfff) == 0 && Immed >> 24 == 0)); + bool IsLegal = ((Immed >> 12) == 0 || + ((Immed & 0xfff) == 0 && Immed >> 24 == 0)); + DEBUG(dbgs() << "Is " << Immed << " legal add imm: " << + (IsLegal ? "yes" : "no") << "\n"); + return IsLegal; } // Integer comparisons are implemented with ADDS/SUBS, so the range of valid @@ -7821,7 +7954,7 @@ bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const { /// by AM is legal for this target, for a load/store of the specified type. 
bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, - unsigned AS) const { + unsigned AS, Instruction *I) const { // AArch64 has five basic addressing modes: // reg // reg + 9-bit signed offset @@ -8023,7 +8156,7 @@ SDValue AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, std::vector<SDNode *> *Created) const { - AttributeList Attr = DAG.getMachineFunction().getFunction()->getAttributes(); + AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes(); if (isIntDivCheap(N->getValueType(0), Attr)) return SDValue(N,0); // Lower SDIV as SDIV @@ -9420,8 +9553,6 @@ static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode &St) { static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, const AArch64Subtarget *Subtarget) { - if (!DCI.isBeforeLegalize()) - return SDValue(); StoreSDNode *S = cast<StoreSDNode>(N); if (S->isVolatile() || S->isIndexed()) @@ -9446,7 +9577,7 @@ static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, return SDValue(); // Don't split at -Oz. - if (DAG.getMachineFunction().getFunction()->optForMinSize()) + if (DAG.getMachineFunction().getFunction().optForMinSize()) return SDValue(); // Don't split v2i64 vectors. Memcpy lowering produces those and splitting @@ -10267,6 +10398,7 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, SelectionDAG &DAG = DCI.DAG; switch (N->getOpcode()) { default: + DEBUG(dbgs() << "Custom combining: skipping\n"); break; case ISD::ADD: case ISD::SUB: @@ -10740,7 +10872,7 @@ Value *AArch64TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const { return UseTlsOffset(IRB, 0x28); // Fuchsia is similar. - // <magenta/tls.h> defines MX_TLS_STACK_GUARD_OFFSET with this value. + // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value. if (Subtarget->isTargetFuchsia()) return UseTlsOffset(IRB, -0x10); @@ -10755,7 +10887,7 @@ Value *AArch64TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) cons return UseTlsOffset(IRB, 0x48); // Fuchsia is similar. - // <magenta/tls.h> defines MX_TLS_UNSAFE_SP_OFFSET with this value. + // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value. if (Subtarget->isTargetFuchsia()) return UseTlsOffset(IRB, -0x8); @@ -10772,7 +10904,7 @@ bool AArch64TargetLowering::isMaskAndCmp0FoldingBeneficial( ConstantInt* Mask = dyn_cast<ConstantInt>(AndI.getOperand(1)); if (!Mask) return false; - return Mask->getUniqueInteger().isPowerOf2(); + return Mask->getValue().isPowerOf2(); } void AArch64TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const { @@ -10807,7 +10939,7 @@ void AArch64TargetLowering::insertCopiesSplitCSR( // fine for CXX_FAST_TLS since the C++-style TLS access functions should be // nounwind. If we want to generalize this later, we may need to emit // CFI pseudo-instructions. 
- assert(Entry->getParent()->getFunction()->hasFnAttribute( + assert(Entry->getParent()->getFunction().hasFnAttribute( Attribute::NoUnwind) && "Function should be nounwind in insertCopiesSplitCSR!"); Entry->addLiveIn(*I); diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 3b0e0f1de894..8d78b5b6b5b4 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -18,9 +18,9 @@ #include "AArch64.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/CodeGen/TargetLowering.h" #include "llvm/IR/CallingConv.h" #include "llvm/IR/Instruction.h" -#include "llvm/Target/TargetLowering.h" namespace llvm { @@ -290,7 +290,7 @@ public: /// Return true if the given shuffle mask can be codegen'd directly, or if it /// should be stack expanded. - bool isShuffleMaskLegal(const SmallVectorImpl<int> &M, EVT VT) const override; + bool isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const override; /// Return the ISD::SETCC ValueType. EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, @@ -306,6 +306,7 @@ public: MachineBasicBlock *MBB) const override; bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, + MachineFunction &MF, unsigned Intrinsic) const override; bool isTruncateFree(Type *Ty1, Type *Ty2) const override; @@ -338,7 +339,8 @@ public: /// Return true if the addressing mode represented by AM is legal for this /// target, for a load/store of the specified type. bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, - unsigned AS) const override; + unsigned AS, + Instruction *I = nullptr) const override; /// \brief Return the cost of the scaling factor used in the addressing /// mode represented by AM for this target, for a load/store @@ -413,7 +415,7 @@ public: // Do not merge to float value size (128 bytes) if no implicit // float attribute is set. 
- bool NoFloat = DAG.getMachineFunction().getFunction()->hasFnAttribute( + bool NoFloat = DAG.getMachineFunction().getFunction().hasFnAttribute( Attribute::NoImplicitFloat); if (NoFloat) @@ -442,8 +444,8 @@ public: } bool supportSplitCSR(MachineFunction *MF) const override { - return MF->getFunction()->getCallingConv() == CallingConv::CXX_FAST_TLS && - MF->getFunction()->hasFnAttribute(Attribute::NoUnwind); + return MF->getFunction().getCallingConv() == CallingConv::CXX_FAST_TLS && + MF->getFunction().hasFnAttribute(Attribute::NoUnwind); } void initializeSplitCSR(MachineBasicBlock *Entry) const override; void insertCopiesSplitCSR( @@ -470,6 +472,9 @@ public: MachineMemOperand::Flags getMMOFlags(const Instruction &I) const override; + bool functionArgumentNeedsConsecutiveRegisters(Type *Ty, + CallingConv::ID CallConv, + bool isVarArg) const override; private: bool isExtFreeImpl(const Instruction *Ext) const override; @@ -534,10 +539,12 @@ private: unsigned Flag) const; SDValue getTargetNode(BlockAddressSDNode *N, EVT Ty, SelectionDAG &DAG, unsigned Flag) const; - template <class NodeTy> SDValue getGOT(NodeTy *N, SelectionDAG &DAG) const; template <class NodeTy> - SDValue getAddrLarge(NodeTy *N, SelectionDAG &DAG) const; - template <class NodeTy> SDValue getAddr(NodeTy *N, SelectionDAG &DAG) const; + SDValue getGOT(NodeTy *N, SelectionDAG &DAG, unsigned Flags = 0) const; + template <class NodeTy> + SDValue getAddrLarge(NodeTy *N, SelectionDAG &DAG, unsigned Flags = 0) const; + template <class NodeTy> + SDValue getAddr(NodeTy *N, SelectionDAG &DAG, unsigned Flags = 0) const; SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDarwinGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; @@ -639,10 +646,6 @@ private: void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const override; - bool functionArgumentNeedsConsecutiveRegisters(Type *Ty, - CallingConv::ID CallConv, - bool isVarArg) const override; - bool shouldNormalizeToSelectSequence(LLVMContext &, EVT) const override; }; diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstrAtomics.td b/contrib/llvm/lib/Target/AArch64/AArch64InstrAtomics.td index eec41ddbc159..153bcf75cbcd 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64InstrAtomics.td +++ b/contrib/llvm/lib/Target/AArch64/AArch64InstrAtomics.td @@ -30,18 +30,18 @@ def : Pat<(atomic_fence (imm), (imm)), (DMB (i32 0xb))>; // A atomic load operation that actually needs acquire semantics. class acquiring_load<PatFrag base> - : PatFrag<(ops node:$ptr), (base node:$ptr), [{ - AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering(); - return isAcquireOrStronger(Ordering); -}]>; + : PatFrag<(ops node:$ptr), (base node:$ptr)> { + let IsAtomic = 1; + let IsAtomicOrderingAcquireOrStronger = 1; +} // An atomic load operation that does not need either acquire or release // semantics. class relaxed_load<PatFrag base> - : PatFrag<(ops node:$ptr), (base node:$ptr), [{ - AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering(); - return !isAcquireOrStronger(Ordering); -}]>; + : PatFrag<(ops node:$ptr), (base node:$ptr)> { + let IsAtomic = 1; + let IsAtomicOrderingAcquireOrStronger = 0; +} // 8-bit loads def : Pat<(acquiring_load<atomic_load_8> GPR64sp:$ptr), (LDARB GPR64sp:$ptr)>; @@ -113,19 +113,17 @@ def : Pat<(relaxed_load<atomic_load_64> // A store operation that actually needs release semantics. 
class releasing_store<PatFrag base> - : PatFrag<(ops node:$ptr, node:$val), (base node:$ptr, node:$val), [{ - AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering(); - assert(Ordering != AtomicOrdering::AcquireRelease && - "unexpected store ordering"); - return isReleaseOrStronger(Ordering); -}]>; + : PatFrag<(ops node:$ptr, node:$val), (base node:$ptr, node:$val)> { + let IsAtomic = 1; + let IsAtomicOrderingReleaseOrStronger = 1; +} // An atomic store operation that doesn't actually need to be atomic on AArch64. class relaxed_store<PatFrag base> - : PatFrag<(ops node:$ptr, node:$val), (base node:$ptr, node:$val), [{ - AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering(); - return !isReleaseOrStronger(Ordering); -}]>; + : PatFrag<(ops node:$ptr, node:$val), (base node:$ptr, node:$val)> { + let IsAtomic = 1; + let IsAtomicOrderingReleaseOrStronger = 0; +} // 8-bit stores def : Pat<(releasing_store<atomic_store_8> GPR64sp:$ptr, GPR32:$val), @@ -407,57 +405,17 @@ def CMP_SWAP_128 : Pseudo<(outs GPR64:$RdLo, GPR64:$RdHi, GPR32:$scratch), Sched<[WriteAtomic]>; // v8.1 Atomic instructions: -def : Pat<(atomic_load_add_8 GPR64:$Rn, GPR32:$Rs), (LDADDALb GPR32:$Rs, GPR64sp:$Rn)>; -def : Pat<(atomic_load_add_16 GPR64:$Rn, GPR32:$Rs), (LDADDALh GPR32:$Rs, GPR64sp:$Rn)>; -def : Pat<(atomic_load_add_32 GPR64:$Rn, GPR32:$Rs), (LDADDALs GPR32:$Rs, GPR64sp:$Rn)>; -def : Pat<(atomic_load_add_64 GPR64:$Rn, GPR64:$Rs), (LDADDALd GPR64:$Rs, GPR64sp:$Rn)>; - -def : Pat<(atomic_load_or_8 GPR64:$Rn, GPR32:$Rs), (LDSETALb GPR32:$Rs, GPR64sp:$Rn)>; -def : Pat<(atomic_load_or_16 GPR64:$Rn, GPR32:$Rs), (LDSETALh GPR32:$Rs, GPR64sp:$Rn)>; -def : Pat<(atomic_load_or_32 GPR64:$Rn, GPR32:$Rs), (LDSETALs GPR32:$Rs, GPR64sp:$Rn)>; -def : Pat<(atomic_load_or_64 GPR64:$Rn, GPR64:$Rs), (LDSETALd GPR64:$Rs, GPR64sp:$Rn)>; - -def : Pat<(atomic_load_xor_8 GPR64:$Rn, GPR32:$Rs), (LDEORALb GPR32:$Rs, GPR64sp:$Rn)>; -def : Pat<(atomic_load_xor_16 GPR64:$Rn, GPR32:$Rs), (LDEORALh GPR32:$Rs, GPR64sp:$Rn)>; -def : Pat<(atomic_load_xor_32 GPR64:$Rn, GPR32:$Rs), (LDEORALs GPR32:$Rs, GPR64sp:$Rn)>; -def : Pat<(atomic_load_xor_64 GPR64:$Rn, GPR64:$Rs), (LDEORALd GPR64:$Rs, GPR64sp:$Rn)>; - -def : Pat<(atomic_load_max_8 GPR64:$Rn, GPR32:$Rs), (LDSMAXALb GPR32:$Rs, GPR64sp:$Rn)>; -def : Pat<(atomic_load_max_16 GPR64:$Rn, GPR32:$Rs), (LDSMAXALh GPR32:$Rs, GPR64sp:$Rn)>; -def : Pat<(atomic_load_max_32 GPR64:$Rn, GPR32:$Rs), (LDSMAXALs GPR32:$Rs, GPR64sp:$Rn)>; -def : Pat<(atomic_load_max_64 GPR64:$Rn, GPR64:$Rs), (LDSMAXALd GPR64:$Rs, GPR64sp:$Rn)>; - -def : Pat<(atomic_load_umax_8 GPR64:$Rn, GPR32:$Rs), (LDUMAXALb GPR32:$Rs, GPR64sp:$Rn)>; -def : Pat<(atomic_load_umax_16 GPR64:$Rn, GPR32:$Rs), (LDUMAXALh GPR32:$Rs, GPR64sp:$Rn)>; -def : Pat<(atomic_load_umax_32 GPR64:$Rn, GPR32:$Rs), (LDUMAXALs GPR32:$Rs, GPR64sp:$Rn)>; -def : Pat<(atomic_load_umax_64 GPR64:$Rn, GPR64:$Rs), (LDUMAXALd GPR64:$Rs, GPR64sp:$Rn)>; - -def : Pat<(atomic_load_min_8 GPR64:$Rn, GPR32:$Rs), (LDSMINALb GPR32:$Rs, GPR64sp:$Rn)>; -def : Pat<(atomic_load_min_16 GPR64:$Rn, GPR32:$Rs), (LDSMINALh GPR32:$Rs, GPR64sp:$Rn)>; -def : Pat<(atomic_load_min_32 GPR64:$Rn, GPR32:$Rs), (LDSMINALs GPR32:$Rs, GPR64sp:$Rn)>; -def : Pat<(atomic_load_min_64 GPR64:$Rn, GPR64:$Rs), (LDSMINALd GPR64:$Rs, GPR64sp:$Rn)>; - -def : Pat<(atomic_load_umin_8 GPR64:$Rn, GPR32:$Rs), (LDUMINALb GPR32:$Rs, GPR64sp:$Rn)>; -def : Pat<(atomic_load_umin_16 GPR64:$Rn, GPR32:$Rs), (LDUMINALh GPR32:$Rs, GPR64sp:$Rn)>; -def : Pat<(atomic_load_umin_32 GPR64:$Rn, GPR32:$Rs), (LDUMINALs 
GPR32:$Rs, GPR64sp:$Rn)>; -def : Pat<(atomic_load_umin_64 GPR64:$Rn, GPR64:$Rs), (LDUMINALd GPR64:$Rs, GPR64sp:$Rn)>; - -def : Pat<(atomic_cmp_swap_8 GPR64:$Rn, GPR32:$Rold, GPR32:$Rnew), (CASALb GPR32:$Rold, GPR32:$Rnew, GPR64sp:$Rn)>; -def : Pat<(atomic_cmp_swap_16 GPR64:$Rn, GPR32:$Rold, GPR32:$Rnew), (CASALh GPR32:$Rold, GPR32:$Rnew, GPR64sp:$Rn)>; -def : Pat<(atomic_cmp_swap_32 GPR64:$Rn, GPR32:$Rold, GPR32:$Rnew), (CASALs GPR32:$Rold, GPR32:$Rnew, GPR64sp:$Rn)>; -def : Pat<(atomic_cmp_swap_64 GPR64:$Rn, GPR64:$Rold, GPR64:$Rnew), (CASALd GPR64:$Rold, GPR64:$Rnew, GPR64sp:$Rn)>; - -def : Pat<(atomic_swap_8 GPR64:$Rn, GPR32:$Rs), (SWPALb GPR32:$Rs, GPR64sp:$Rn)>; -def : Pat<(atomic_swap_16 GPR64:$Rn, GPR32:$Rs), (SWPALh GPR32:$Rs, GPR64sp:$Rn)>; -def : Pat<(atomic_swap_32 GPR64:$Rn, GPR32:$Rs), (SWPALs GPR32:$Rs, GPR64sp:$Rn)>; -def : Pat<(atomic_swap_64 GPR64:$Rn, GPR64:$Rs), (SWPALd GPR64:$Rs, GPR64sp:$Rn)>; - -def : Pat<(atomic_load_sub_8 GPR64:$Rn, GPR32:$Rs), (LDADDALb (SUBWrr WZR, GPR32:$Rs), GPR64sp:$Rn)>; -def : Pat<(atomic_load_sub_16 GPR64:$Rn, GPR32:$Rs), (LDADDALh (SUBWrr WZR, GPR32:$Rs), GPR64sp:$Rn)>; -def : Pat<(atomic_load_sub_32 GPR64:$Rn, GPR32:$Rs), (LDADDALs (SUBWrr WZR, GPR32:$Rs), GPR64sp:$Rn)>; -def : Pat<(atomic_load_sub_64 GPR64:$Rn, GPR64:$Rs), (LDADDALd (SUBXrr XZR, GPR64:$Rs), GPR64sp:$Rn)>; - -def : Pat<(atomic_load_and_8 GPR64:$Rn, GPR32:$Rs), (LDCLRALb (ORNWrr WZR, GPR32:$Rs), GPR64sp:$Rn)>; -def : Pat<(atomic_load_and_16 GPR64:$Rn, GPR32:$Rs), (LDCLRALh (ORNWrr WZR, GPR32:$Rs), GPR64sp:$Rn)>; -def : Pat<(atomic_load_and_32 GPR64:$Rn, GPR32:$Rs), (LDCLRALs (ORNWrr WZR, GPR32:$Rs), GPR64sp:$Rn)>; -def : Pat<(atomic_load_and_64 GPR64:$Rn, GPR64:$Rs), (LDCLRALd (ORNXrr XZR, GPR64:$Rs), GPR64sp:$Rn)>; +let Predicates = [HasLSE] in { + defm : LDOPregister_patterns<"LDADD", "atomic_load_add">; + defm : LDOPregister_patterns<"LDSET", "atomic_load_or">; + defm : LDOPregister_patterns<"LDEOR", "atomic_load_xor">; + defm : LDOPregister_patterns<"LDSMAX", "atomic_load_max">; + defm : LDOPregister_patterns<"LDSMIN", "atomic_load_min">; + defm : LDOPregister_patterns<"LDUMAX", "atomic_load_umax">; + defm : LDOPregister_patterns<"LDUMIN", "atomic_load_umin">; + defm : LDOPregister_patterns<"SWP", "atomic_swap">; + defm : LDOPregister_patterns_mod<"LDADD", "atomic_load_sub", "SUB">; + defm : LDOPregister_patterns_mod<"LDCLR", "atomic_load_and", "ORN">; + defm : CASregister_patterns<"CAS", "atomic_cmp_swap">; +} + diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/contrib/llvm/lib/Target/AArch64/AArch64InstrFormats.td index c44daf306ea9..80c5092a4eed 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ b/contrib/llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -189,6 +189,11 @@ def GPR32as64 : RegisterOperand<GPR32> { // are encoded as the eight bit value 'abcdefgh'. def SIMDImmType10Operand : AsmOperandClass { let Name = "SIMDImmType10"; } +// Authenticated loads for v8.3 can have scaled 10-bit immediate offsets. +def SImm10s8Operand : AsmOperandClass { + let Name = "SImm10s8"; + let DiagnosticType = "InvalidMemoryIndexedSImm10"; +} //===----------------------------------------------------------------------===// // Operand Definitions. 
@@ -216,6 +221,12 @@ def adrlabel : Operand<i64> { let ParserMatchClass = AdrOperand; } +def simm10Scaled : Operand<i64> { + let ParserMatchClass = SImm10s8Operand; + let DecoderMethod = "DecodeSImm<10>"; + let PrintMethod = "printImmScale<8>"; +} + // simm9 predicate - True if the immediate is in the range [-256, 255]. def SImm9Operand : AsmOperandClass { let Name = "SImm9"; @@ -489,14 +500,14 @@ let DiagnosticType = "LogicalSecondSource" in { let Name = "LogicalImm64Not"; } } -def logical_imm32 : Operand<i32>, PatLeaf<(imm), [{ - return AArch64_AM::isLogicalImmediate(N->getZExtValue(), 32); +def logical_imm32 : Operand<i32>, IntImmLeaf<i32, [{ + return AArch64_AM::isLogicalImmediate(Imm.getZExtValue(), 32); }], logical_imm32_XFORM> { let PrintMethod = "printLogicalImm32"; let ParserMatchClass = LogicalImm32Operand; } -def logical_imm64 : Operand<i64>, PatLeaf<(imm), [{ - return AArch64_AM::isLogicalImmediate(N->getZExtValue(), 64); +def logical_imm64 : Operand<i64>, IntImmLeaf<i64, [{ + return AArch64_AM::isLogicalImmediate(Imm.getZExtValue(), 64); }], logical_imm64_XFORM> { let PrintMethod = "printLogicalImm64"; let ParserMatchClass = LogicalImm64Operand; @@ -743,8 +754,8 @@ class arith_extended_reg32to64<ValueType Ty> : Operand<Ty>, // Floating-point immediate. def fpimm16 : Operand<f16>, - PatLeaf<(f16 fpimm), [{ - return AArch64_AM::getFP16Imm(N->getValueAPF()) != -1; + FPImmLeaf<f16, [{ + return AArch64_AM::getFP16Imm(Imm) != -1; }], SDNodeXForm<fpimm, [{ APFloat InVal = N->getValueAPF(); uint32_t enc = AArch64_AM::getFP16Imm(InVal); @@ -754,8 +765,8 @@ def fpimm16 : Operand<f16>, let PrintMethod = "printFPImmOperand"; } def fpimm32 : Operand<f32>, - PatLeaf<(f32 fpimm), [{ - return AArch64_AM::getFP32Imm(N->getValueAPF()) != -1; + FPImmLeaf<f32, [{ + return AArch64_AM::getFP32Imm(Imm) != -1; }], SDNodeXForm<fpimm, [{ APFloat InVal = N->getValueAPF(); uint32_t enc = AArch64_AM::getFP32Imm(InVal); @@ -765,8 +776,8 @@ def fpimm32 : Operand<f32>, let PrintMethod = "printFPImmOperand"; } def fpimm64 : Operand<f64>, - PatLeaf<(f64 fpimm), [{ - return AArch64_AM::getFP64Imm(N->getValueAPF()) != -1; + FPImmLeaf<f64, [{ + return AArch64_AM::getFP64Imm(Imm) != -1; }], SDNodeXForm<fpimm, [{ APFloat InVal = N->getValueAPF(); uint32_t enc = AArch64_AM::getFP64Imm(InVal); @@ -781,8 +792,8 @@ def fpimm8 : Operand<i32> { let PrintMethod = "printFPImmOperand"; } -def fpimm0 : PatLeaf<(fpimm), [{ - return N->isExactlyValue(+0.0); +def fpimm0 : FPImmLeaf<fAny, [{ + return Imm.isExactlyValue(+0.0); }]>; // Vector lane operands @@ -836,10 +847,9 @@ def VectorIndexD : Operand<i64>, ImmLeaf<i64, [{ // aaaaaaaa bbbbbbbb cccccccc dddddddd eeeeeeee ffffffff gggggggg hhhhhhhh // are encoded as the eight bit value 'abcdefgh'. 
def simdimmtype10 : Operand<i32>, - PatLeaf<(f64 fpimm), [{ - return AArch64_AM::isAdvSIMDModImmType10(N->getValueAPF() - .bitcastToAPInt() - .getZExtValue()); + FPImmLeaf<f64, [{ + return AArch64_AM::isAdvSIMDModImmType10( + Imm.bitcastToAPInt().getZExtValue()); }], SDNodeXForm<fpimm, [{ APFloat InVal = N->getValueAPF(); uint32_t enc = AArch64_AM::encodeAdvSIMDModImmType10(N->getValueAPF() @@ -913,6 +923,17 @@ class CRmSystemI<Operand crmtype, bits<3> opc, string asm, let Inst{7-5} = opc; } +class SystemNoOperands<bits<3> op2, string asm, list<dag> pattern = []> + : SimpleSystemI<0, (ins), asm, "", pattern>, + Sched<[]> { + bits<4> CRm; + let CRm = 0b0011; + let Inst{31-12} = 0b11010101000000110010; + let Inst{11-8} = CRm; + let Inst{7-5} = op2; + let Inst{4-0} = 0b11111; +} + // MRS/MSR system instructions. These have different operand classes because // a different subset of registers can be accessed through each instruction. def MRSSystemRegisterOperand : AsmOperandClass { @@ -1098,6 +1119,83 @@ class SpecialReturn<bits<4> opc, string asm> let Inst{9-5} = 0b11111; } +let mayLoad = 1 in +class RCPCLoad<bits<2> sz, string asm, RegisterClass RC> + : I<(outs RC:$Rt), (ins GPR64sp0:$Rn), asm, "\t$Rt, [$Rn]", "", []>, + Sched<[]> { + bits<5> Rn; + bits<5> Rt; + let Inst{31-30} = sz; + let Inst{29-10} = 0b11100010111111110000; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; +} + +class AuthBase<bits<1> M, dag oops, dag iops, string asm, string operands, + list<dag> pattern> + : I<oops, iops, asm, operands, "", pattern>, Sched<[]> { + let Inst{31-25} = 0b1101011; + let Inst{20-11} = 0b1111100001; + let Inst{10} = M; + let Inst{4-0} = 0b11111; +} + +class AuthBranchTwoOperands<bits<1> op, bits<1> M, string asm> + : AuthBase<M, (outs), (ins GPR64:$Rn, GPR64sp:$Rm), asm, "\t$Rn, $Rm", []> { + bits<5> Rn; + bits<5> Rm; + let Inst{24-22} = 0b100; + let Inst{21} = op; + let Inst{9-5} = Rn; + let Inst{4-0} = Rm; +} + +class AuthOneOperand<bits<3> opc, bits<1> M, string asm> + : AuthBase<M, (outs), (ins GPR64:$Rn), asm, "\t$Rn", []> { + bits<5> Rn; + let Inst{24} = 0; + let Inst{23-21} = opc; + let Inst{9-5} = Rn; +} + +class AuthReturn<bits<3> op, bits<1> M, string asm> + : AuthBase<M, (outs), (ins), asm, "", []> { + let Inst{24} = 0; + let Inst{23-21} = op; + let Inst{9-0} = 0b1111111111; +} + +let mayLoad = 1 in +class BaseAuthLoad<bit M, bit W, dag oops, dag iops, string asm, + string operands, string cstr, Operand opr> + : I<oops, iops, asm, operands, cstr, []>, Sched<[]> { + bits<10> offset; + bits<5> Rn; + bits<5> Rt; + let Inst{31-24} = 0b11111000; + let Inst{23} = M; + let Inst{22} = offset{9}; + let Inst{21} = 1; + let Inst{20-12} = offset{8-0}; + let Inst{11} = W; + let Inst{10} = 1; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; +} + +multiclass AuthLoad<bit M, string asm, Operand opr> { + def indexed : BaseAuthLoad<M, 0, (outs GPR64:$Rt), + (ins GPR64sp:$Rn, opr:$offset), + asm, "\t$Rt, [$Rn, $offset]", "", opr>; + def writeback : BaseAuthLoad<M, 1, (outs GPR64sp:$wback, GPR64:$Rt), + (ins GPR64sp:$Rn, opr:$offset), + asm, "\t$Rt, [$Rn, $offset]!", + "$Rn = $wback,@earlyclobber $wback", opr>; + + def : InstAlias<asm # "\t$Rt, [$Rn]", + (!cast<Instruction>(NAME # "indexed") GPR64:$Rt, GPR64sp:$Rn, 0)>; +} + //--- // Conditional branch instruction. 
//--- @@ -1320,6 +1418,46 @@ class OneXRegData<bits<3> opc, string asm, SDPatternOperator node> let Inst{31} = 1; } +class SignAuthOneData<bits<3> opcode_prefix, bits<2> opcode, string asm> + : I<(outs GPR64:$Rd), (ins GPR64sp:$Rn), asm, "\t$Rd, $Rn", "", + []>, + Sched<[WriteI, ReadI]> { + bits<5> Rd; + bits<5> Rn; + let Inst{31-15} = 0b11011010110000010; + let Inst{14-12} = opcode_prefix; + let Inst{11-10} = opcode; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +class SignAuthZero<bits<3> opcode_prefix, bits<2> opcode, string asm> + : I<(outs GPR64:$Rd), (ins), asm, "\t$Rd", "", []>, Sched<[]> { + bits<5> Rd; + let Inst{31-15} = 0b11011010110000010; + let Inst{14-12} = opcode_prefix; + let Inst{11-10} = opcode; + let Inst{9-5} = 0b11111; + let Inst{4-0} = Rd; +} + +class SignAuthTwoOperand<bits<4> opc, string asm, + SDPatternOperator OpNode> + : I<(outs GPR64:$Rd), (ins GPR64:$Rn, GPR64sp:$Rm), + asm, "\t$Rd, $Rn, $Rm", "", + [(set GPR64:$Rd, (OpNode GPR64:$Rn, GPR64sp:$Rm))]>, + Sched<[WriteI, ReadI, ReadI]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + let Inst{31-21} = 0b10011010110; + let Inst{20-16} = Rm; + let Inst{15-14} = 0b00; + let Inst{13-10} = opc; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + //--- // Basic two-operand data processing instructions. //--- @@ -2378,6 +2516,22 @@ def am_indexed32 : ComplexPattern<i64, 2, "SelectAddrModeIndexed32", []>; def am_indexed64 : ComplexPattern<i64, 2, "SelectAddrModeIndexed64", []>; def am_indexed128 : ComplexPattern<i64, 2, "SelectAddrModeIndexed128", []>; +def gi_am_indexed8 : + GIComplexOperandMatcher<s64, "selectAddrModeIndexed<8>">, + GIComplexPatternEquiv<am_indexed8>; +def gi_am_indexed16 : + GIComplexOperandMatcher<s64, "selectAddrModeIndexed<16>">, + GIComplexPatternEquiv<am_indexed16>; +def gi_am_indexed32 : + GIComplexOperandMatcher<s64, "selectAddrModeIndexed<32>">, + GIComplexPatternEquiv<am_indexed32>; +def gi_am_indexed64 : + GIComplexOperandMatcher<s64, "selectAddrModeIndexed<64>">, + GIComplexPatternEquiv<am_indexed64>; +def gi_am_indexed128 : + GIComplexOperandMatcher<s64, "selectAddrModeIndexed<128>">, + GIComplexPatternEquiv<am_indexed128>; + class UImm12OffsetOperand<int Scale> : AsmOperandClass { let Name = "UImm12Offset" # Scale; let RenderMethod = "addUImm12OffsetOperands<" # Scale # ">"; @@ -2449,6 +2603,23 @@ multiclass StoreUI<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype, (!cast<Instruction>(NAME # "ui") regtype:$Rt, GPR64sp:$Rn, 0)>; } +// Same as StoreUI, but take a RegisterOperand. This is used by GlobalISel to +// substitute zero-registers automatically. +// +// TODO: Roll out zero-register subtitution to GPR32/GPR64 and fold this back +// into StoreUI. 
+multiclass StoreUIz<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype, + Operand indextype, string asm, list<dag> pattern> { + let AddedComplexity = 10, mayLoad = 0, mayStore = 1, hasSideEffects = 0 in + def ui : BaseLoadStoreUI<sz, V, opc, (outs), + (ins regtype:$Rt, GPR64sp:$Rn, indextype:$offset), + asm, pattern>, + Sched<[WriteST]>; + + def : InstAlias<asm # "\t$Rt, [$Rn]", + (!cast<Instruction>(NAME # "ui") regtype:$Rt, GPR64sp:$Rn, 0)>; +} + def PrefetchOperand : AsmOperandClass { let Name = "Prefetch"; let ParserMethod = "tryParsePrefetch"; @@ -2933,22 +3104,18 @@ multiclass Load128RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype, multiclass Store128RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype, string asm, ValueType Ty, SDPatternOperator storeop> { - let AddedComplexity = 10, mayLoad = 0, mayStore = 1, hasSideEffects = 0 in + let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in def roW : LoadStore128RO<sz, V, opc, regtype, asm, (outs), (ins regtype:$Rt, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend128:$extend), - [(storeop (Ty regtype:$Rt), - (ro_Windexed128 GPR64sp:$Rn, GPR32:$Rm, - ro_Wextend128:$extend))]>, + []>, Sched<[WriteSTIdx, ReadAdrBase]> { let Inst{13} = 0b0; } - let AddedComplexity = 10, mayLoad = 0, mayStore = 1, hasSideEffects = 0 in + let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in def roX : LoadStore128RO<sz, V, opc, regtype, asm, (outs), (ins regtype:$Rt, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend128:$extend), - [(storeop (Ty regtype:$Rt), - (ro_Xindexed128 GPR64sp:$Rn, GPR64:$Rm, - ro_Xextend128:$extend))]>, + []>, Sched<[WriteSTIdx, ReadAdrBase]> { let Inst{13} = 0b1; } @@ -3012,6 +3179,23 @@ def am_unscaled32 : ComplexPattern<i64, 2, "SelectAddrModeUnscaled32", []>; def am_unscaled64 : ComplexPattern<i64, 2, "SelectAddrModeUnscaled64", []>; def am_unscaled128 :ComplexPattern<i64, 2, "SelectAddrModeUnscaled128", []>; +def gi_am_unscaled8 : + GIComplexOperandMatcher<s64, "selectAddrModeUnscaled8">, + GIComplexPatternEquiv<am_unscaled8>; +def gi_am_unscaled16 : + GIComplexOperandMatcher<s64, "selectAddrModeUnscaled16">, + GIComplexPatternEquiv<am_unscaled16>; +def gi_am_unscaled32 : + GIComplexOperandMatcher<s64, "selectAddrModeUnscaled32">, + GIComplexPatternEquiv<am_unscaled32>; +def gi_am_unscaled64 : + GIComplexOperandMatcher<s64, "selectAddrModeUnscaled64">, + GIComplexPatternEquiv<am_unscaled64>; +def gi_am_unscaled128 : + GIComplexOperandMatcher<s64, "selectAddrModeUnscaled128">, + GIComplexPatternEquiv<am_unscaled128>; + + class BaseLoadStoreUnscale<bits<2> sz, bit V, bits<2> opc, dag oops, dag iops, string asm, list<dag> pattern> : I<oops, iops, asm, "\t$Rt, [$Rn, $offset]", "", pattern> { @@ -4374,6 +4558,12 @@ class BaseSIMDThreeSameVectorTied<bit Q, bit U, bits<3> size, bits<5> opcode, let Inst{4-0} = Rd; } +class BaseSIMDThreeSameVectorDot<bit Q, bit U, string asm, string kind1, + string kind2> : + BaseSIMDThreeSameVector<Q, U, 0b100, 0b10010, V128, asm, kind1, [] > { + let AsmString = !strconcat(asm, "{\t$Rd" # kind1 # ", $Rn" # kind2 # ", $Rm" # kind2 # "}"); +} + // All operand sizes distinguished in the encoding. 
multiclass SIMDThreeSameVector<bit U, bits<5> opc, string asm, SDPatternOperator OpNode> { @@ -6801,6 +6991,16 @@ class BaseSIMDIndexedTied<bit Q, bit U, bit Scalar, bits<2> size, bits<4> opc, let Inst{4-0} = Rd; } +// ARMv8.2 Index Dot product instructions +class BaseSIMDThreeSameVectorDotIndex<bit Q, bit U, string asm, string dst_kind, + string lhs_kind, string rhs_kind> : + BaseSIMDIndexedTied<Q, U, 0b0, 0b10, 0b1110, V128, V128, V128, VectorIndexS, + asm, "", dst_kind, lhs_kind, rhs_kind, []> { + bits<2> idx; + let Inst{21} = idx{0}; // L + let Inst{11} = idx{1}; // H +} + multiclass SIMDFPIndexed<bit U, bits<4> opc, string asm, SDPatternOperator OpNode> { let Predicates = [HasNEON, HasFullFP16] in { @@ -9241,6 +9441,238 @@ multiclass SIMDIndexedSQRDMLxHSDTied<bit U, bits<4> opc, string asm, } // let Predicates = [HasNeon, HasRDM] //---------------------------------------------------------------------------- +// ARMv8.3 Complex ADD/MLA instructions +//---------------------------------------------------------------------------- + +class ComplexRotationOperand<int Angle, int Remainder, string Type> + : AsmOperandClass { + let PredicateMethod = "isComplexRotation<" # Angle # ", " # Remainder # ">"; + let DiagnosticType = "InvalidComplexRotation" # Type; + let Name = "ComplexRotation" # Type; +} +def complexrotateop : Operand<i32> { + let ParserMatchClass = ComplexRotationOperand<90, 0, "Even">; + let PrintMethod = "printComplexRotationOp<90, 0>"; +} +def complexrotateopodd : Operand<i32> { + let ParserMatchClass = ComplexRotationOperand<180, 90, "Odd">; + let PrintMethod = "printComplexRotationOp<180, 90>"; +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseSIMDThreeSameVectorComplex<bit Q, bit U, bits<2> size, bits<3> opcode, + RegisterOperand regtype, Operand rottype, + string asm, string kind, list<dag> pattern> + : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, rottype:$rot), asm, + "{\t$Rd" # kind # ", $Rn" # kind # ", $Rm" # kind # ", $rot" + "|" # kind # "\t$Rd, $Rn, $Rm, $rot}", "", pattern>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + bits<1> rot; + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29} = U; + let Inst{28-24} = 0b01110; + let Inst{23-22} = size; + let Inst{21} = 0; + let Inst{20-16} = Rm; + let Inst{15-13} = opcode; + // Non-tied version (FCADD) only has one rotation bit + let Inst{12} = rot; + let Inst{11} = 0; + let Inst{10} = 1; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass SIMDThreeSameVectorComplexHSD<bit U, bits<3> opcode, Operand rottype, + string asm, SDPatternOperator OpNode>{ + let Predicates = [HasV8_3a, HasNEON, HasFullFP16] in { + def v4f16 : BaseSIMDThreeSameVectorComplex<0, U, 0b01, opcode, V64, rottype, + asm, ".4h", + [(set (v4f16 V64:$dst), (OpNode (v4f16 V64:$Rd), + (v4f16 V64:$Rn), + (v4f16 V64:$Rm), + (rottype i32:$rot)))]>; + + def v8f16 : BaseSIMDThreeSameVectorComplex<1, U, 0b01, opcode, V128, rottype, + asm, ".8h", + [(set (v8f16 V128:$dst), (OpNode (v8f16 V128:$Rd), + (v8f16 V128:$Rn), + (v8f16 V128:$Rm), + (rottype i32:$rot)))]>; + } + + let Predicates = [HasV8_3a, HasNEON] in { + def v2f32 : BaseSIMDThreeSameVectorComplex<0, U, 0b10, opcode, V64, rottype, + asm, ".2s", + [(set (v2f32 V64:$dst), (OpNode (v2f32 V64:$Rd), + (v2f32 V64:$Rn), + (v2f32 V64:$Rm), + (rottype i32:$rot)))]>; + + def v4f32 : BaseSIMDThreeSameVectorComplex<1, U, 0b10, opcode, V128, rottype, + asm, ".4s", + [(set (v4f32 V128:$dst), (OpNode (v4f32 V128:$Rd), + (v4f32 V128:$Rn), + (v4f32 V128:$Rm), + (rottype 
i32:$rot)))]>; + + def v2f64 : BaseSIMDThreeSameVectorComplex<1, U, 0b11, opcode, V128, rottype, + asm, ".2d", + [(set (v2f64 V128:$dst), (OpNode (v2f64 V128:$Rd), + (v2f64 V128:$Rn), + (v2f64 V128:$Rm), + (rottype i32:$rot)))]>; + } +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseSIMDThreeSameVectorTiedComplex<bit Q, bit U, bits<2> size, + bits<3> opcode, + RegisterOperand regtype, + Operand rottype, string asm, + string kind, list<dag> pattern> + : I<(outs regtype:$dst), + (ins regtype:$Rd, regtype:$Rn, regtype:$Rm, rottype:$rot), asm, + "{\t$Rd" # kind # ", $Rn" # kind # ", $Rm" # kind # ", $rot" + "|" # kind # "\t$Rd, $Rn, $Rm, $rot}", "$Rd = $dst", pattern>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + bits<2> rot; + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29} = U; + let Inst{28-24} = 0b01110; + let Inst{23-22} = size; + let Inst{21} = 0; + let Inst{20-16} = Rm; + let Inst{15-13} = opcode; + let Inst{12-11} = rot; + let Inst{10} = 1; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass SIMDThreeSameVectorTiedComplexHSD<bit U, bits<3> opcode, + Operand rottype, string asm, + SDPatternOperator OpNode> { + let Predicates = [HasV8_3a, HasNEON, HasFullFP16] in { + def v4f16 : BaseSIMDThreeSameVectorTiedComplex<0, U, 0b01, opcode, V64, + rottype, asm, ".4h", + [(set (v4f16 V64:$dst), (OpNode (v4f16 V64:$Rd), + (v4f16 V64:$Rn), + (v4f16 V64:$Rm), + (rottype i32:$rot)))]>; + + def v8f16 : BaseSIMDThreeSameVectorTiedComplex<1, U, 0b01, opcode, V128, + rottype, asm, ".8h", + [(set (v8f16 V128:$dst), (OpNode (v8f16 V128:$Rd), + (v8f16 V128:$Rn), + (v8f16 V128:$Rm), + (rottype i32:$rot)))]>; + } + + let Predicates = [HasV8_3a, HasNEON] in { + def v2f32 : BaseSIMDThreeSameVectorTiedComplex<0, U, 0b10, opcode, V64, + rottype, asm, ".2s", + [(set (v2f32 V64:$dst), (OpNode (v2f32 V64:$Rd), + (v2f32 V64:$Rn), + (v2f32 V64:$Rm), + (rottype i32:$rot)))]>; + + def v4f32 : BaseSIMDThreeSameVectorTiedComplex<1, U, 0b10, opcode, V128, + rottype, asm, ".4s", + [(set (v4f32 V128:$dst), (OpNode (v4f32 V128:$Rd), + (v4f32 V128:$Rn), + (v4f32 V128:$Rm), + (rottype i32:$rot)))]>; + + def v2f64 : BaseSIMDThreeSameVectorTiedComplex<1, U, 0b11, opcode, V128, + rottype, asm, ".2d", + [(set (v2f64 V128:$dst), (OpNode (v2f64 V128:$Rd), + (v2f64 V128:$Rn), + (v2f64 V128:$Rm), + (rottype i32:$rot)))]>; + } +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseSIMDIndexedTiedComplex<bit Q, bit U, bit Scalar, bits<2> size, + bit opc1, bit opc2, RegisterOperand dst_reg, + RegisterOperand lhs_reg, + RegisterOperand rhs_reg, Operand vec_idx, + Operand rottype, string asm, string apple_kind, + string dst_kind, string lhs_kind, + string rhs_kind, list<dag> pattern> + : I<(outs dst_reg:$dst), + (ins dst_reg:$Rd, lhs_reg:$Rn, rhs_reg:$Rm, vec_idx:$idx, rottype:$rot), + asm, + "{\t$Rd" # dst_kind # ", $Rn" # lhs_kind # ", $Rm" # rhs_kind # + "$idx, $rot" # "|" # apple_kind # + "\t$Rd, $Rn, $Rm$idx, $rot}", "$Rd = $dst", pattern>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + bits<2> rot; + + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29} = U; + let Inst{28} = Scalar; + let Inst{27-24} = 0b1111; + let Inst{23-22} = size; + // Bit 21 must be set by the derived class. + let Inst{20-16} = Rm; + let Inst{15} = opc1; + let Inst{14-13} = rot; + let Inst{12} = opc2; + // Bit 11 must be set by the derived class. 
+ let Inst{10} = 0; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +// The complex instructions index by pairs of elements, so the VectorIndexes +// don't match the lane types, and the index bits are different to the other +// classes. +multiclass SIMDIndexedTiedComplexHSD<bit U, bit opc1, bit opc2, Operand rottype, + string asm, SDPatternOperator OpNode> { + let Predicates = [HasV8_3a,HasNEON,HasFullFP16] in { + def v4f16_indexed : BaseSIMDIndexedTiedComplex<0, 1, 0, 0b01, opc1, opc2, V64, + V64, V128, VectorIndexD, rottype, asm, ".4h", ".4h", + ".4h", ".h", []> { + bits<1> idx; + let Inst{11} = 0; + let Inst{21} = idx{0}; + } + + def v8f16_indexed : BaseSIMDIndexedTiedComplex<1, 1, 0, 0b01, opc1, opc2, + V128, V128, V128, VectorIndexS, rottype, asm, ".8h", + ".8h", ".8h", ".h", []> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } + } // Predicates = [HasV8_3a,HasNEON,HasFullFP16] + + let Predicates = [HasV8_3a,HasNEON] in { + def v4f32_indexed : BaseSIMDIndexedTiedComplex<1, 1, 0, 0b10, opc1, opc2, + V128, V128, V128, VectorIndexD, rottype, asm, ".4s", + ".4s", ".4s", ".s", []> { + bits<1> idx; + let Inst{11} = idx{0}; + let Inst{21} = 0; + } + } // Predicates = [HasV8_3a,HasNEON] +} + +//---------------------------------------------------------------------------- // Crypto extensions //---------------------------------------------------------------------------- @@ -9398,10 +9830,10 @@ class BaseCAS<string order, string size, RegisterClass RC> } multiclass CompareAndSwap<bits<1> Acq, bits<1> Rel, string order> { - let Sz = 0b00, Acq = Acq, Rel = Rel in def b : BaseCAS<order, "b", GPR32>; - let Sz = 0b01, Acq = Acq, Rel = Rel in def h : BaseCAS<order, "h", GPR32>; - let Sz = 0b10, Acq = Acq, Rel = Rel in def s : BaseCAS<order, "", GPR32>; - let Sz = 0b11, Acq = Acq, Rel = Rel in def d : BaseCAS<order, "", GPR64>; + let Sz = 0b00, Acq = Acq, Rel = Rel in def B : BaseCAS<order, "b", GPR32>; + let Sz = 0b01, Acq = Acq, Rel = Rel in def H : BaseCAS<order, "h", GPR32>; + let Sz = 0b10, Acq = Acq, Rel = Rel in def W : BaseCAS<order, "", GPR32>; + let Sz = 0b11, Acq = Acq, Rel = Rel in def X : BaseCAS<order, "", GPR64>; } class BaseCASP<string order, string size, RegisterOperand RC> @@ -9413,10 +9845,10 @@ class BaseCASP<string order, string size, RegisterOperand RC> } multiclass CompareAndSwapPair<bits<1> Acq, bits<1> Rel, string order> { - let Sz = 0b00, Acq = Acq, Rel = Rel in - def s : BaseCASP<order, "", WSeqPairClassOperand>; - let Sz = 0b01, Acq = Acq, Rel = Rel in - def d : BaseCASP<order, "", XSeqPairClassOperand>; + let Sz = 0b00, Acq = Acq, Rel = Rel in + def W : BaseCASP<order, "", WSeqPairClassOperand>; + let Sz = 0b01, Acq = Acq, Rel = Rel in + def X : BaseCASP<order, "", XSeqPairClassOperand>; } let Predicates = [HasLSE] in @@ -9446,10 +9878,10 @@ class BaseSWP<string order, string size, RegisterClass RC> } multiclass Swap<bits<1> Acq, bits<1> Rel, string order> { - let Sz = 0b00, Acq = Acq, Rel = Rel in def b : BaseSWP<order, "b", GPR32>; - let Sz = 0b01, Acq = Acq, Rel = Rel in def h : BaseSWP<order, "h", GPR32>; - let Sz = 0b10, Acq = Acq, Rel = Rel in def s : BaseSWP<order, "", GPR32>; - let Sz = 0b11, Acq = Acq, Rel = Rel in def d : BaseSWP<order, "", GPR64>; + let Sz = 0b00, Acq = Acq, Rel = Rel in def B : BaseSWP<order, "b", GPR32>; + let Sz = 0b01, Acq = Acq, Rel = Rel in def H : BaseSWP<order, "h", GPR32>; + let Sz = 0b10, Acq = Acq, Rel = Rel in def W : BaseSWP<order, "", GPR32>; + let Sz = 0b11, Acq = Acq, Rel = Rel in def X : BaseSWP<order, "", 
GPR64>; } let Predicates = [HasLSE], mayLoad = 1, mayStore = 1, hasSideEffects = 1 in @@ -9480,14 +9912,94 @@ class BaseLDOPregister<string op, string order, string size, RegisterClass RC> multiclass LDOPregister<bits<3> opc, string op, bits<1> Acq, bits<1> Rel, string order> { - let Sz = 0b00, Acq = Acq, Rel = Rel, opc = opc in - def b : BaseLDOPregister<op, order, "b", GPR32>; - let Sz = 0b01, Acq = Acq, Rel = Rel, opc = opc in - def h : BaseLDOPregister<op, order, "h", GPR32>; - let Sz = 0b10, Acq = Acq, Rel = Rel, opc = opc in - def s : BaseLDOPregister<op, order, "", GPR32>; - let Sz = 0b11, Acq = Acq, Rel = Rel, opc = opc in - def d : BaseLDOPregister<op, order, "", GPR64>; + let Sz = 0b00, Acq = Acq, Rel = Rel, opc = opc in + def B : BaseLDOPregister<op, order, "b", GPR32>; + let Sz = 0b01, Acq = Acq, Rel = Rel, opc = opc in + def H : BaseLDOPregister<op, order, "h", GPR32>; + let Sz = 0b10, Acq = Acq, Rel = Rel, opc = opc in + def W : BaseLDOPregister<op, order, "", GPR32>; + let Sz = 0b11, Acq = Acq, Rel = Rel, opc = opc in + def X : BaseLDOPregister<op, order, "", GPR64>; +} + +// Differing SrcRHS and DstRHS allow you to cover CLR & SUB by giving a more +// complex DAG for DstRHS. +let Predicates = [HasLSE] in +multiclass LDOPregister_patterns_ord_dag<string inst, string suffix, string op, + string size, dag SrcRHS, dag DstRHS> { + def : Pat<(!cast<SDNode>(op#"_"#size#"_monotonic") GPR64sp:$Rn, SrcRHS), + (!cast<Instruction>(inst # suffix) DstRHS, GPR64sp:$Rn)>; + def : Pat<(!cast<SDNode>(op#"_"#size#"_acquire") GPR64sp:$Rn, SrcRHS), + (!cast<Instruction>(inst # "A" # suffix) DstRHS, GPR64sp:$Rn)>; + def : Pat<(!cast<SDNode>(op#"_"#size#"_release") GPR64sp:$Rn, SrcRHS), + (!cast<Instruction>(inst # "L" # suffix) DstRHS, GPR64sp:$Rn)>; + def : Pat<(!cast<SDNode>(op#"_"#size#"_acq_rel") GPR64sp:$Rn, SrcRHS), + (!cast<Instruction>(inst # "AL" # suffix) DstRHS, GPR64sp:$Rn)>; + def : Pat<(!cast<SDNode>(op#"_"#size#"_seq_cst") GPR64sp:$Rn, SrcRHS), + (!cast<Instruction>(inst # "AL" # suffix) DstRHS, GPR64sp:$Rn)>; +} + +multiclass LDOPregister_patterns_ord<string inst, string suffix, string op, + string size, dag RHS> { + defm : LDOPregister_patterns_ord_dag<inst, suffix, op, size, RHS, RHS>; +} + +multiclass LDOPregister_patterns_ord_mod<string inst, string suffix, string op, + string size, dag LHS, dag RHS> { + defm : LDOPregister_patterns_ord_dag<inst, suffix, op, size, LHS, RHS>; +} + +multiclass LDOPregister_patterns<string inst, string op> { + defm : LDOPregister_patterns_ord<inst, "X", op, "64", (i64 GPR64:$Rm)>; + defm : LDOPregister_patterns_ord<inst, "W", op, "32", (i32 GPR32:$Rm)>; + defm : LDOPregister_patterns_ord<inst, "H", op, "16", (i32 GPR32:$Rm)>; + defm : LDOPregister_patterns_ord<inst, "B", op, "8", (i32 GPR32:$Rm)>; +} + +multiclass LDOPregister_patterns_mod<string inst, string op, string mod> { + defm : LDOPregister_patterns_ord_mod<inst, "X", op, "64", + (i64 GPR64:$Rm), + (i64 (!cast<Instruction>(mod#Xrr) XZR, GPR64:$Rm))>; + defm : LDOPregister_patterns_ord_mod<inst, "W", op, "32", + (i32 GPR32:$Rm), + (i32 (!cast<Instruction>(mod#Wrr) WZR, GPR32:$Rm))>; + defm : LDOPregister_patterns_ord_mod<inst, "H", op, "16", + (i32 GPR32:$Rm), + (i32 (!cast<Instruction>(mod#Wrr) WZR, GPR32:$Rm))>; + defm : LDOPregister_patterns_ord_mod<inst, "B", op, "8", + (i32 GPR32:$Rm), + (i32 (!cast<Instruction>(mod#Wrr) WZR, GPR32:$Rm))>; +} + +let Predicates = [HasLSE] in +multiclass CASregister_patterns_ord_dag<string inst, string suffix, string op, + string size, dag OLD, dag 
NEW> { + def : Pat<(!cast<SDNode>(op#"_"#size#"_monotonic") GPR64sp:$Rn, OLD, NEW), + (!cast<Instruction>(inst # suffix) OLD, NEW, GPR64sp:$Rn)>; + def : Pat<(!cast<SDNode>(op#"_"#size#"_acquire") GPR64sp:$Rn, OLD, NEW), + (!cast<Instruction>(inst # "A" # suffix) OLD, NEW, GPR64sp:$Rn)>; + def : Pat<(!cast<SDNode>(op#"_"#size#"_release") GPR64sp:$Rn, OLD, NEW), + (!cast<Instruction>(inst # "L" # suffix) OLD, NEW, GPR64sp:$Rn)>; + def : Pat<(!cast<SDNode>(op#"_"#size#"_acq_rel") GPR64sp:$Rn, OLD, NEW), + (!cast<Instruction>(inst # "AL" # suffix) OLD, NEW, GPR64sp:$Rn)>; + def : Pat<(!cast<SDNode>(op#"_"#size#"_seq_cst") GPR64sp:$Rn, OLD, NEW), + (!cast<Instruction>(inst # "AL" # suffix) OLD, NEW, GPR64sp:$Rn)>; +} + +multiclass CASregister_patterns_ord<string inst, string suffix, string op, + string size, dag OLD, dag NEW> { + defm : CASregister_patterns_ord_dag<inst, suffix, op, size, OLD, NEW>; +} + +multiclass CASregister_patterns<string inst, string op> { + defm : CASregister_patterns_ord<inst, "X", op, "64", + (i64 GPR64:$Rold), (i64 GPR64:$Rnew)>; + defm : CASregister_patterns_ord<inst, "W", op, "32", + (i32 GPR32:$Rold), (i32 GPR32:$Rnew)>; + defm : CASregister_patterns_ord<inst, "H", op, "16", + (i32 GPR32:$Rold), (i32 GPR32:$Rnew)>; + defm : CASregister_patterns_ord<inst, "B", op, "8", + (i32 GPR32:$Rold), (i32 GPR32:$Rnew)>; } let Predicates = [HasLSE] in @@ -9496,26 +10008,27 @@ class BaseSTOPregister<string asm, RegisterClass OP, Register Reg, InstAlias<asm # "\t$Rs, [$Rn]", (inst Reg, OP:$Rs, GPR64sp:$Rn)>; multiclass STOPregister<string asm, string instr> { - def : BaseSTOPregister<asm # "lb", GPR32, WZR, - !cast<Instruction>(instr # "Lb")>; - def : BaseSTOPregister<asm # "lh", GPR32, WZR, - !cast<Instruction>(instr # "Lh")>; - def : BaseSTOPregister<asm # "l", GPR32, WZR, - !cast<Instruction>(instr # "Ls")>; - def : BaseSTOPregister<asm # "l", GPR64, XZR, - !cast<Instruction>(instr # "Ld")>; - def : BaseSTOPregister<asm # "b", GPR32, WZR, - !cast<Instruction>(instr # "b")>; - def : BaseSTOPregister<asm # "h", GPR32, WZR, - !cast<Instruction>(instr # "h")>; - def : BaseSTOPregister<asm, GPR32, WZR, - !cast<Instruction>(instr # "s")>; - def : BaseSTOPregister<asm, GPR64, XZR, - !cast<Instruction>(instr # "d")>; + def : BaseSTOPregister<asm # "lb", GPR32, WZR, + !cast<Instruction>(instr # "LB")>; + def : BaseSTOPregister<asm # "lh", GPR32, WZR, + !cast<Instruction>(instr # "LH")>; + def : BaseSTOPregister<asm # "l", GPR32, WZR, + !cast<Instruction>(instr # "LW")>; + def : BaseSTOPregister<asm # "l", GPR64, XZR, + !cast<Instruction>(instr # "LX")>; + def : BaseSTOPregister<asm # "b", GPR32, WZR, + !cast<Instruction>(instr # "B")>; + def : BaseSTOPregister<asm # "h", GPR32, WZR, + !cast<Instruction>(instr # "H")>; + def : BaseSTOPregister<asm, GPR32, WZR, + !cast<Instruction>(instr # "W")>; + def : BaseSTOPregister<asm, GPR64, XZR, + !cast<Instruction>(instr # "X")>; } //---------------------------------------------------------------------------- // Allow the size specifier tokens to be upper case, not just lower. 
+def : TokenAlias<".4B", ".4b">; // Add dot product def : TokenAlias<".8B", ".8b">; def : TokenAlias<".4H", ".4h">; def : TokenAlias<".2S", ".2s">; diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index 13c80a46e5b0..c7c560a81328 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -19,6 +19,7 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/LiveRegUnits.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" @@ -27,7 +28,10 @@ #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/StackMaps.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" +#include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/IR/DebugLoc.h" #include "llvm/IR/GlobalValue.h" #include "llvm/MC/MCInst.h" @@ -40,8 +44,6 @@ #include "llvm/Support/MathExtras.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" -#include "llvm/Target/TargetRegisterInfo.h" -#include "llvm/Target/TargetSubtargetInfo.h" #include <cassert> #include <cstdint> #include <iterator> @@ -52,17 +54,17 @@ using namespace llvm; #define GET_INSTRINFO_CTOR_DTOR #include "AArch64GenInstrInfo.inc" -static cl::opt<unsigned> -TBZDisplacementBits("aarch64-tbz-offset-bits", cl::Hidden, cl::init(14), - cl::desc("Restrict range of TB[N]Z instructions (DEBUG)")); +static cl::opt<unsigned> TBZDisplacementBits( + "aarch64-tbz-offset-bits", cl::Hidden, cl::init(14), + cl::desc("Restrict range of TB[N]Z instructions (DEBUG)")); -static cl::opt<unsigned> -CBZDisplacementBits("aarch64-cbz-offset-bits", cl::Hidden, cl::init(19), - cl::desc("Restrict range of CB[N]Z instructions (DEBUG)")); +static cl::opt<unsigned> CBZDisplacementBits( + "aarch64-cbz-offset-bits", cl::Hidden, cl::init(19), + cl::desc("Restrict range of CB[N]Z instructions (DEBUG)")); static cl::opt<unsigned> -BCCDisplacementBits("aarch64-bcc-offset-bits", cl::Hidden, cl::init(19), - cl::desc("Restrict range of Bcc instructions (DEBUG)")); + BCCDisplacementBits("aarch64-bcc-offset-bits", cl::Hidden, cl::init(19), + cl::desc("Restrict range of Bcc instructions (DEBUG)")); AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI) : AArch64GenInstrInfo(AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP), @@ -172,8 +174,8 @@ bool AArch64InstrInfo::isBranchOffsetInRange(unsigned BranchOp, return isIntN(Bits, BrOffset / 4); } -MachineBasicBlock *AArch64InstrInfo::getBranchDestBlock( - const MachineInstr &MI) const { +MachineBasicBlock * +AArch64InstrInfo::getBranchDestBlock(const MachineInstr &MI) const { switch (MI.getOpcode()) { default: llvm_unreachable("unexpected opcode!"); @@ -374,12 +376,9 @@ void AArch64InstrInfo::instantiateCondBranch( } } -unsigned AArch64InstrInfo::insertBranch(MachineBasicBlock &MBB, - MachineBasicBlock *TBB, - MachineBasicBlock *FBB, - ArrayRef<MachineOperand> Cond, - const DebugLoc &DL, - int *BytesAdded) const { +unsigned AArch64InstrInfo::insertBranch( + MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, + ArrayRef<MachineOperand> Cond, const DebugLoc &DL, int *BytesAdded) const { // Shouldn't be a fall through. 
assert(TBB && "insertBranch must not be told to insert a fallthrough"); @@ -485,10 +484,11 @@ static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg, return Opc; } -bool AArch64InstrInfo::canInsertSelect( - const MachineBasicBlock &MBB, ArrayRef<MachineOperand> Cond, - unsigned TrueReg, unsigned FalseReg, int &CondCycles, int &TrueCycles, - int &FalseCycles) const { +bool AArch64InstrInfo::canInsertSelect(const MachineBasicBlock &MBB, + ArrayRef<MachineOperand> Cond, + unsigned TrueReg, unsigned FalseReg, + int &CondCycles, int &TrueCycles, + int &FalseCycles) const { // Check register classes. const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); const TargetRegisterClass *RC = @@ -656,8 +656,10 @@ void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB, MRI.constrainRegClass(FalseReg, RC); // Insert the csel. - BuildMI(MBB, I, DL, get(Opc), DstReg).addReg(TrueReg).addReg(FalseReg).addImm( - CC); + BuildMI(MBB, I, DL, get(Opc), DstReg) + .addReg(TrueReg) + .addReg(FalseReg) + .addImm(CC); } /// Returns true if a MOVi32imm or MOVi64imm can be expanded to an ORRxx. @@ -673,8 +675,9 @@ static bool canBeExpandedToORR(const MachineInstr &MI, unsigned BitSize) { bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const { if (!Subtarget.hasCustomCheapAsMoveHandling()) return MI.isAsCheapAsAMove(); - - unsigned Imm; + if (Subtarget.getProcFamily() == AArch64Subtarget::ExynosM1 && + isExynosShiftLeftFast(MI)) + return true; switch (MI.getOpcode()) { default: @@ -685,17 +688,7 @@ bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const { case AArch64::ADDXri: case AArch64::SUBWri: case AArch64::SUBXri: - return (Subtarget.getProcFamily() == AArch64Subtarget::ExynosM1 || - MI.getOperand(3).getImm() == 0); - - // add/sub on register with shift - case AArch64::ADDWrs: - case AArch64::ADDXrs: - case AArch64::SUBWrs: - case AArch64::SUBXrs: - Imm = MI.getOperand(3).getImm(); - return (Subtarget.getProcFamily() == AArch64Subtarget::ExynosM1 && - AArch64_AM::getArithShiftValue(Imm) < 4); + return (MI.getOperand(3).getImm() == 0); // logical ops on immediate case AArch64::ANDWri: @@ -721,24 +714,6 @@ bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const { case AArch64::ORRXrr: return true; - // logical ops on register with shift - case AArch64::ANDWrs: - case AArch64::ANDXrs: - case AArch64::BICWrs: - case AArch64::BICXrs: - case AArch64::EONWrs: - case AArch64::EONXrs: - case AArch64::EORWrs: - case AArch64::EORXrs: - case AArch64::ORNWrs: - case AArch64::ORNXrs: - case AArch64::ORRWrs: - case AArch64::ORRXrs: - Imm = MI.getOperand(3).getImm(); - return (Subtarget.getProcFamily() == AArch64Subtarget::ExynosM1 && - AArch64_AM::getShiftValue(Imm) < 4 && - AArch64_AM::getShiftType(Imm) == AArch64_AM::LSL); - // If MOVi32imm or MOVi64imm can be expanded into ORRWri or // ORRXri, it is as cheap as MOV case AArch64::MOVi32imm: @@ -748,6 +723,7 @@ bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const { // It is cheap to zero out registers if the subtarget has ZeroCycleZeroing // feature. 
+ case AArch64::FMOVH0: case AArch64::FMOVS0: case AArch64::FMOVD0: return Subtarget.hasZeroCycleZeroing(); @@ -760,6 +736,129 @@ bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const { llvm_unreachable("Unknown opcode to check as cheap as a move!"); } +bool AArch64InstrInfo::isExynosShiftLeftFast(const MachineInstr &MI) const { + unsigned Imm, Shift; + AArch64_AM::ShiftExtendType Ext; + + switch (MI.getOpcode()) { + default: + return false; + + // WriteI + case AArch64::ADDSWri: + case AArch64::ADDSXri: + case AArch64::ADDWri: + case AArch64::ADDXri: + case AArch64::SUBSWri: + case AArch64::SUBSXri: + case AArch64::SUBWri: + case AArch64::SUBXri: + return true; + + // WriteISReg + case AArch64::ADDSWrs: + case AArch64::ADDSXrs: + case AArch64::ADDWrs: + case AArch64::ADDXrs: + case AArch64::ANDSWrs: + case AArch64::ANDSXrs: + case AArch64::ANDWrs: + case AArch64::ANDXrs: + case AArch64::BICSWrs: + case AArch64::BICSXrs: + case AArch64::BICWrs: + case AArch64::BICXrs: + case AArch64::EONWrs: + case AArch64::EONXrs: + case AArch64::EORWrs: + case AArch64::EORXrs: + case AArch64::ORNWrs: + case AArch64::ORNXrs: + case AArch64::ORRWrs: + case AArch64::ORRXrs: + case AArch64::SUBSWrs: + case AArch64::SUBSXrs: + case AArch64::SUBWrs: + case AArch64::SUBXrs: + Imm = MI.getOperand(3).getImm(); + Shift = AArch64_AM::getShiftValue(Imm); + Ext = AArch64_AM::getShiftType(Imm); + return (Shift == 0 || (Shift <= 3 && Ext == AArch64_AM::LSL)); + + // WriteIEReg + case AArch64::ADDSWrx: + case AArch64::ADDSXrx: + case AArch64::ADDSXrx64: + case AArch64::ADDWrx: + case AArch64::ADDXrx: + case AArch64::ADDXrx64: + case AArch64::SUBSWrx: + case AArch64::SUBSXrx: + case AArch64::SUBSXrx64: + case AArch64::SUBWrx: + case AArch64::SUBXrx: + case AArch64::SUBXrx64: + Imm = MI.getOperand(3).getImm(); + Shift = AArch64_AM::getArithShiftValue(Imm); + Ext = AArch64_AM::getArithExtendType(Imm); + return (Shift == 0 || (Shift <= 3 && Ext == AArch64_AM::UXTX)); + + case AArch64::PRFMroW: + case AArch64::PRFMroX: + + // WriteLDIdx + case AArch64::LDRBBroW: + case AArch64::LDRBBroX: + case AArch64::LDRHHroW: + case AArch64::LDRHHroX: + case AArch64::LDRSBWroW: + case AArch64::LDRSBWroX: + case AArch64::LDRSBXroW: + case AArch64::LDRSBXroX: + case AArch64::LDRSHWroW: + case AArch64::LDRSHWroX: + case AArch64::LDRSHXroW: + case AArch64::LDRSHXroX: + case AArch64::LDRSWroW: + case AArch64::LDRSWroX: + case AArch64::LDRWroW: + case AArch64::LDRWroX: + case AArch64::LDRXroW: + case AArch64::LDRXroX: + + case AArch64::LDRBroW: + case AArch64::LDRBroX: + case AArch64::LDRDroW: + case AArch64::LDRDroX: + case AArch64::LDRHroW: + case AArch64::LDRHroX: + case AArch64::LDRSroW: + case AArch64::LDRSroX: + + // WriteSTIdx + case AArch64::STRBBroW: + case AArch64::STRBBroX: + case AArch64::STRHHroW: + case AArch64::STRHHroX: + case AArch64::STRWroW: + case AArch64::STRWroX: + case AArch64::STRXroW: + case AArch64::STRXroX: + + case AArch64::STRBroW: + case AArch64::STRBroX: + case AArch64::STRDroW: + case AArch64::STRDroX: + case AArch64::STRHroW: + case AArch64::STRHroX: + case AArch64::STRSroW: + case AArch64::STRSroX: + Imm = MI.getOperand(3).getImm(); + Ext = AArch64_AM::getMemExtendType(Imm); + return (Ext == AArch64_AM::SXTX || Ext == AArch64_AM::UXTX); + } +} + bool AArch64InstrInfo::isFalkorShiftExtFast(const MachineInstr &MI) const { switch (MI.getOpcode()) { default: @@ -1084,11 +1183,7 @@ static unsigned convertToNonFlagSettingOpc(const MachineInstr &MI) { } } -enum AccessKind { - AK_Write = 0x01, - 
AK_Read = 0x10, - AK_All = 0x11 -}; +enum AccessKind { AK_Write = 0x01, AK_Read = 0x10, AK_All = 0x11 }; /// True when condition flags are accessed (either by writing or reading) /// on the instruction trace starting at From and ending at To. @@ -1117,21 +1212,24 @@ static bool areCFlagsAccessedBetweenInstrs( for (--To; To != From; --To) { const MachineInstr &Instr = *To; - if ( ((AccessToCheck & AK_Write) && Instr.modifiesRegister(AArch64::NZCV, TRI)) || - ((AccessToCheck & AK_Read) && Instr.readsRegister(AArch64::NZCV, TRI))) + if (((AccessToCheck & AK_Write) && + Instr.modifiesRegister(AArch64::NZCV, TRI)) || + ((AccessToCheck & AK_Read) && Instr.readsRegister(AArch64::NZCV, TRI))) return true; } return false; } /// Try to optimize a compare instruction. A compare instruction is an -/// instruction which produces AArch64::NZCV. It can be truly compare instruction +/// instruction which produces AArch64::NZCV. It can be truly compare +/// instruction /// when there are no uses of its destination register. /// /// The following steps are tried in order: /// 1. Convert CmpInstr into an unconditional version. /// 2. Remove CmpInstr if above there is an instruction producing a needed -/// condition code or an instruction which can be converted into such an instruction. +/// condition code or an instruction which can be converted into such an +/// instruction. /// Only comparison with zero is supported. bool AArch64InstrInfo::optimizeCompareInstr( MachineInstr &CmpInstr, unsigned SrcReg, unsigned SrcReg2, int CmpMask, @@ -1193,20 +1291,34 @@ static unsigned sForm(MachineInstr &Instr) { case AArch64::SUBSXri: return Instr.getOpcode(); - case AArch64::ADDWrr: return AArch64::ADDSWrr; - case AArch64::ADDWri: return AArch64::ADDSWri; - case AArch64::ADDXrr: return AArch64::ADDSXrr; - case AArch64::ADDXri: return AArch64::ADDSXri; - case AArch64::ADCWr: return AArch64::ADCSWr; - case AArch64::ADCXr: return AArch64::ADCSXr; - case AArch64::SUBWrr: return AArch64::SUBSWrr; - case AArch64::SUBWri: return AArch64::SUBSWri; - case AArch64::SUBXrr: return AArch64::SUBSXrr; - case AArch64::SUBXri: return AArch64::SUBSXri; - case AArch64::SBCWr: return AArch64::SBCSWr; - case AArch64::SBCXr: return AArch64::SBCSXr; - case AArch64::ANDWri: return AArch64::ANDSWri; - case AArch64::ANDXri: return AArch64::ANDSXri; + case AArch64::ADDWrr: + return AArch64::ADDSWrr; + case AArch64::ADDWri: + return AArch64::ADDSWri; + case AArch64::ADDXrr: + return AArch64::ADDSXrr; + case AArch64::ADDXri: + return AArch64::ADDSXri; + case AArch64::ADCWr: + return AArch64::ADCSWr; + case AArch64::ADCXr: + return AArch64::ADCSXr; + case AArch64::SUBWrr: + return AArch64::SUBSWrr; + case AArch64::SUBWri: + return AArch64::SUBSWri; + case AArch64::SUBXrr: + return AArch64::SUBSXrr; + case AArch64::SUBXri: + return AArch64::SUBSXri; + case AArch64::SBCWr: + return AArch64::SBCSWr; + case AArch64::SBCXr: + return AArch64::SBCSXr; + case AArch64::ANDWri: + return AArch64::ANDSWri; + case AArch64::ANDXri: + return AArch64::ANDSXri; } } @@ -1228,7 +1340,7 @@ struct UsedNZCV { UsedNZCV() = default; - UsedNZCV& operator |=(const UsedNZCV& UsedFlags) { + UsedNZCV &operator|=(const UsedNZCV &UsedFlags) { this->N |= UsedFlags.N; this->Z |= UsedFlags.Z; this->C |= UsedFlags.C; @@ -1244,29 +1356,29 @@ struct UsedNZCV { /// codes or we don't optimize CmpInstr in the presence of such instructions. 
static AArch64CC::CondCode findCondCodeUsedByInstr(const MachineInstr &Instr) { switch (Instr.getOpcode()) { - default: - return AArch64CC::Invalid; + default: + return AArch64CC::Invalid; - case AArch64::Bcc: { - int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV); - assert(Idx >= 2); - return static_cast<AArch64CC::CondCode>(Instr.getOperand(Idx - 2).getImm()); - } + case AArch64::Bcc: { + int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV); + assert(Idx >= 2); + return static_cast<AArch64CC::CondCode>(Instr.getOperand(Idx - 2).getImm()); + } - case AArch64::CSINVWr: - case AArch64::CSINVXr: - case AArch64::CSINCWr: - case AArch64::CSINCXr: - case AArch64::CSELWr: - case AArch64::CSELXr: - case AArch64::CSNEGWr: - case AArch64::CSNEGXr: - case AArch64::FCSELSrrr: - case AArch64::FCSELDrrr: { - int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV); - assert(Idx >= 1); - return static_cast<AArch64CC::CondCode>(Instr.getOperand(Idx - 1).getImm()); - } + case AArch64::CSINVWr: + case AArch64::CSINVXr: + case AArch64::CSINCWr: + case AArch64::CSINCXr: + case AArch64::CSELWr: + case AArch64::CSELXr: + case AArch64::CSNEGWr: + case AArch64::CSNEGXr: + case AArch64::FCSELSrrr: + case AArch64::FCSELDrrr: { + int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV); + assert(Idx >= 1); + return static_cast<AArch64CC::CondCode>(Instr.getOperand(Idx - 1).getImm()); + } } } @@ -1274,42 +1386,42 @@ static UsedNZCV getUsedNZCV(AArch64CC::CondCode CC) { assert(CC != AArch64CC::Invalid); UsedNZCV UsedFlags; switch (CC) { - default: - break; + default: + break; - case AArch64CC::EQ: // Z set - case AArch64CC::NE: // Z clear - UsedFlags.Z = true; - break; + case AArch64CC::EQ: // Z set + case AArch64CC::NE: // Z clear + UsedFlags.Z = true; + break; - case AArch64CC::HI: // Z clear and C set - case AArch64CC::LS: // Z set or C clear - UsedFlags.Z = true; - LLVM_FALLTHROUGH; - case AArch64CC::HS: // C set - case AArch64CC::LO: // C clear - UsedFlags.C = true; - break; + case AArch64CC::HI: // Z clear and C set + case AArch64CC::LS: // Z set or C clear + UsedFlags.Z = true; + LLVM_FALLTHROUGH; + case AArch64CC::HS: // C set + case AArch64CC::LO: // C clear + UsedFlags.C = true; + break; - case AArch64CC::MI: // N set - case AArch64CC::PL: // N clear - UsedFlags.N = true; - break; + case AArch64CC::MI: // N set + case AArch64CC::PL: // N clear + UsedFlags.N = true; + break; - case AArch64CC::VS: // V set - case AArch64CC::VC: // V clear - UsedFlags.V = true; - break; + case AArch64CC::VS: // V set + case AArch64CC::VC: // V clear + UsedFlags.V = true; + break; - case AArch64CC::GT: // Z clear, N and V the same - case AArch64CC::LE: // Z set, N and V differ - UsedFlags.Z = true; - LLVM_FALLTHROUGH; - case AArch64CC::GE: // N and V the same - case AArch64CC::LT: // N and V differ - UsedFlags.N = true; - UsedFlags.V = true; - break; + case AArch64CC::GT: // Z clear, N and V the same + case AArch64CC::LE: // Z set, N and V differ + UsedFlags.Z = true; + LLVM_FALLTHROUGH; + case AArch64CC::GE: // N and V the same + case AArch64CC::LT: // N and V differ + UsedFlags.N = true; + UsedFlags.V = true; + break; } return UsedFlags; } @@ -1334,7 +1446,7 @@ static bool isSUBSRegImm(unsigned Opcode) { /// nor uses of flags between MI and CmpInstr. 
/// - and C/V flags are not used after CmpInstr static bool canInstrSubstituteCmpInstr(MachineInstr *MI, MachineInstr *CmpInstr, - const TargetRegisterInfo *TRI) { + const TargetRegisterInfo *TRI) { assert(MI); assert(sForm(*MI) != AArch64::INSTRUCTION_LIST_END); assert(CmpInstr); @@ -1356,7 +1468,8 @@ static bool canInstrSubstituteCmpInstr(MachineInstr *MI, MachineInstr *CmpInstr, return false; UsedNZCV NZCVUsedAfterCmp; - for (auto I = std::next(CmpInstr->getIterator()), E = CmpInstr->getParent()->instr_end(); + for (auto I = std::next(CmpInstr->getIterator()), + E = CmpInstr->getParent()->instr_end(); I != E; ++I) { const MachineInstr &Instr = *I; if (Instr.readsRegister(AArch64::NZCV, TRI)) { @@ -1369,7 +1482,7 @@ static bool canInstrSubstituteCmpInstr(MachineInstr *MI, MachineInstr *CmpInstr, if (Instr.modifiesRegister(AArch64::NZCV, TRI)) break; } - + return !NZCVUsedAfterCmp.C && !NZCVUsedAfterCmp.V; } @@ -1427,16 +1540,20 @@ bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { .addMemOperand(*MI.memoperands_begin()); } else if (TM.getCodeModel() == CodeModel::Large) { BuildMI(MBB, MI, DL, get(AArch64::MOVZXi), Reg) - .addGlobalAddress(GV, 0, AArch64II::MO_G0 | MO_NC).addImm(0); + .addGlobalAddress(GV, 0, AArch64II::MO_G0 | MO_NC) + .addImm(0); BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg) .addReg(Reg, RegState::Kill) - .addGlobalAddress(GV, 0, AArch64II::MO_G1 | MO_NC).addImm(16); + .addGlobalAddress(GV, 0, AArch64II::MO_G1 | MO_NC) + .addImm(16); BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg) .addReg(Reg, RegState::Kill) - .addGlobalAddress(GV, 0, AArch64II::MO_G2 | MO_NC).addImm(32); + .addGlobalAddress(GV, 0, AArch64II::MO_G2 | MO_NC) + .addImm(32); BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg) .addReg(Reg, RegState::Kill) - .addGlobalAddress(GV, 0, AArch64II::MO_G3).addImm(48); + .addGlobalAddress(GV, 0, AArch64II::MO_G3) + .addImm(48); BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg) .addReg(Reg, RegState::Kill) .addImm(0) @@ -1818,7 +1935,7 @@ bool AArch64InstrInfo::getMemOpBaseRegImmOfsWidth( } else return false; - // Get the scaling factor for the instruction and set the width for the + // Get the scaling factor for the instruction and set the width for the // instruction. unsigned Scale = 0; int64_t Dummy1, Dummy2; @@ -1841,10 +1958,10 @@ bool AArch64InstrInfo::getMemOpBaseRegImmOfsWidth( return true; } -MachineOperand& +MachineOperand & AArch64InstrInfo::getMemOpBaseRegImmOfsOffsetOperand(MachineInstr &LdSt) const { assert(LdSt.mayLoadOrStore() && "Expected a memory operation."); - MachineOperand &OfsOp = LdSt.getOperand(LdSt.getNumExplicitOperands()-1); + MachineOperand &OfsOp = LdSt.getOperand(LdSt.getNumExplicitOperands() - 1); assert(OfsOp.isImm() && "Offset operand wasn't immediate."); return OfsOp; } @@ -1853,7 +1970,7 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, unsigned &Scale, unsigned &Width, int64_t &MinOffset, int64_t &MaxOffset) const { switch (Opcode) { - // Not a memory operation or something we want to handle. + // Not a memory operation or something we want to handle. default: Scale = Width = 0; MinOffset = MaxOffset = 0; @@ -2050,8 +2167,13 @@ static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc) { /// /// Only called for LdSt for which getMemOpBaseRegImmOfs returns true. bool AArch64InstrInfo::shouldClusterMemOps(MachineInstr &FirstLdSt, + unsigned BaseReg1, MachineInstr &SecondLdSt, + unsigned BaseReg2, unsigned NumLoads) const { + if (BaseReg1 != BaseReg2) + return false; + // Only cluster up to a single pair. 
if (NumLoads > 1) return false; @@ -2089,18 +2211,6 @@ bool AArch64InstrInfo::shouldClusterMemOps(MachineInstr &FirstLdSt, return Offset1 + 1 == Offset2; } -MachineInstr *AArch64InstrInfo::emitFrameIndexDebugValue( - MachineFunction &MF, int FrameIx, uint64_t Offset, const MDNode *Var, - const MDNode *Expr, const DebugLoc &DL) const { - MachineInstrBuilder MIB = BuildMI(MF, DL, get(AArch64::DBG_VALUE)) - .addFrameIndex(FrameIx) - .addImm(0) - .addImm(Offset) - .addMetadata(Var) - .addMetadata(Expr); - return &*MIB; -} - static const MachineInstrBuilder &AddSubReg(const MachineInstrBuilder &MIB, unsigned Reg, unsigned SubIdx, unsigned State, @@ -2120,12 +2230,13 @@ static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg, return ((DestReg - SrcReg) & 0x1f) < NumRegs; } -void AArch64InstrInfo::copyPhysRegTuple( - MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, - unsigned DestReg, unsigned SrcReg, bool KillSrc, unsigned Opcode, - ArrayRef<unsigned> Indices) const { - assert(Subtarget.hasNEON() && - "Unexpected register copy without NEON"); +void AArch64InstrInfo::copyPhysRegTuple(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + const DebugLoc &DL, unsigned DestReg, + unsigned SrcReg, bool KillSrc, + unsigned Opcode, + ArrayRef<unsigned> Indices) const { + assert(Subtarget.hasNEON() && "Unexpected register copy without NEON"); const TargetRegisterInfo *TRI = &getRegisterInfo(); uint16_t DestEncoding = TRI->getEncodingValue(DestReg); uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg); @@ -2178,8 +2289,9 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); } } else if (SrcReg == AArch64::WZR && Subtarget.hasZeroCycleZeroing()) { - BuildMI(MBB, I, DL, get(AArch64::MOVZWi), DestReg).addImm(0).addImm( - AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); + BuildMI(MBB, I, DL, get(AArch64::MOVZWi), DestReg) + .addImm(0) + .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); } else { if (Subtarget.hasZeroCycleRegMove()) { // Cyclone recognizes "ORR Xd, XZR, Xm" as a zero-cycle register move. @@ -2214,8 +2326,9 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, .addImm(0) .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); } else if (SrcReg == AArch64::XZR && Subtarget.hasZeroCycleZeroing()) { - BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestReg).addImm(0).addImm( - AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); + BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestReg) + .addImm(0) + .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); } else { // Otherwise, expand to ORR XZR. BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestReg) @@ -2228,8 +2341,8 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, // Copy a DDDD register quad by copying the individual sub-registers. if (AArch64::DDDDRegClass.contains(DestReg) && AArch64::DDDDRegClass.contains(SrcReg)) { - static const unsigned Indices[] = { AArch64::dsub0, AArch64::dsub1, - AArch64::dsub2, AArch64::dsub3 }; + static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1, + AArch64::dsub2, AArch64::dsub3}; copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8, Indices); return; @@ -2238,8 +2351,8 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, // Copy a DDD register triple by copying the individual sub-registers. 
if (AArch64::DDDRegClass.contains(DestReg) && AArch64::DDDRegClass.contains(SrcReg)) { - static const unsigned Indices[] = { AArch64::dsub0, AArch64::dsub1, - AArch64::dsub2 }; + static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1, + AArch64::dsub2}; copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8, Indices); return; @@ -2248,7 +2361,7 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, // Copy a DD register pair by copying the individual sub-registers. if (AArch64::DDRegClass.contains(DestReg) && AArch64::DDRegClass.contains(SrcReg)) { - static const unsigned Indices[] = { AArch64::dsub0, AArch64::dsub1 }; + static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1}; copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8, Indices); return; @@ -2257,8 +2370,8 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, // Copy a QQQQ register quad by copying the individual sub-registers. if (AArch64::QQQQRegClass.contains(DestReg) && AArch64::QQQQRegClass.contains(SrcReg)) { - static const unsigned Indices[] = { AArch64::qsub0, AArch64::qsub1, - AArch64::qsub2, AArch64::qsub3 }; + static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1, + AArch64::qsub2, AArch64::qsub3}; copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8, Indices); return; @@ -2267,8 +2380,8 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, // Copy a QQQ register triple by copying the individual sub-registers. if (AArch64::QQQRegClass.contains(DestReg) && AArch64::QQQRegClass.contains(SrcReg)) { - static const unsigned Indices[] = { AArch64::qsub0, AArch64::qsub1, - AArch64::qsub2 }; + static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1, + AArch64::qsub2}; copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8, Indices); return; @@ -2277,7 +2390,7 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, // Copy a QQ register pair by copying the individual sub-registers. 
if (AArch64::QQRegClass.contains(DestReg) && AArch64::QQRegClass.contains(SrcReg)) { - static const unsigned Indices[] = { AArch64::qsub0, AArch64::qsub1 }; + static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1}; copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8, Indices); return; @@ -2285,28 +2398,28 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, if (AArch64::FPR128RegClass.contains(DestReg) && AArch64::FPR128RegClass.contains(SrcReg)) { - if(Subtarget.hasNEON()) { + if (Subtarget.hasNEON()) { BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg) .addReg(SrcReg) .addReg(SrcReg, getKillRegState(KillSrc)); } else { BuildMI(MBB, I, DL, get(AArch64::STRQpre)) - .addReg(AArch64::SP, RegState::Define) - .addReg(SrcReg, getKillRegState(KillSrc)) - .addReg(AArch64::SP) - .addImm(-16); + .addReg(AArch64::SP, RegState::Define) + .addReg(SrcReg, getKillRegState(KillSrc)) + .addReg(AArch64::SP) + .addImm(-16); BuildMI(MBB, I, DL, get(AArch64::LDRQpre)) - .addReg(AArch64::SP, RegState::Define) - .addReg(DestReg, RegState::Define) - .addReg(AArch64::SP) - .addImm(16); + .addReg(AArch64::SP, RegState::Define) + .addReg(DestReg, RegState::Define) + .addReg(AArch64::SP) + .addImm(16); } return; } if (AArch64::FPR64RegClass.contains(DestReg) && AArch64::FPR64RegClass.contains(SrcReg)) { - if(Subtarget.hasNEON()) { + if (Subtarget.hasNEON()) { DestReg = RI.getMatchingSuperReg(DestReg, AArch64::dsub, &AArch64::FPR128RegClass); SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::dsub, @@ -2323,7 +2436,7 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, if (AArch64::FPR32RegClass.contains(DestReg) && AArch64::FPR32RegClass.contains(SrcReg)) { - if(Subtarget.hasNEON()) { + if (Subtarget.hasNEON()) { DestReg = RI.getMatchingSuperReg(DestReg, AArch64::ssub, &AArch64::FPR128RegClass); SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::ssub, @@ -2340,7 +2453,7 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, if (AArch64::FPR16RegClass.contains(DestReg) && AArch64::FPR16RegClass.contains(SrcReg)) { - if(Subtarget.hasNEON()) { + if (Subtarget.hasNEON()) { DestReg = RI.getMatchingSuperReg(DestReg, AArch64::hsub, &AArch64::FPR128RegClass); SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::hsub, @@ -2361,7 +2474,7 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, if (AArch64::FPR8RegClass.contains(DestReg) && AArch64::FPR8RegClass.contains(SrcReg)) { - if(Subtarget.hasNEON()) { + if (Subtarget.hasNEON()) { DestReg = RI.getMatchingSuperReg(DestReg, AArch64::bsub, &AArch64::FPR128RegClass); SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::bsub, @@ -2410,17 +2523,17 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, if (DestReg == AArch64::NZCV) { assert(AArch64::GPR64RegClass.contains(SrcReg) && "Invalid NZCV copy"); BuildMI(MBB, I, DL, get(AArch64::MSR)) - .addImm(AArch64SysReg::NZCV) - .addReg(SrcReg, getKillRegState(KillSrc)) - .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define); + .addImm(AArch64SysReg::NZCV) + .addReg(SrcReg, getKillRegState(KillSrc)) + .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define); return; } if (SrcReg == AArch64::NZCV) { assert(AArch64::GPR64RegClass.contains(DestReg) && "Invalid NZCV copy"); BuildMI(MBB, I, DL, get(AArch64::MRS), DestReg) - .addImm(AArch64SysReg::NZCV) - .addReg(AArch64::NZCV, RegState::Implicit | getKillRegState(KillSrc)); + .addImm(AArch64SysReg::NZCV) + .addReg(AArch64::NZCV, RegState::Implicit | getKillRegState(KillSrc)); return; } @@ -2476,45 +2589,39 @@ void 
AArch64InstrInfo::storeRegToStackSlot( if (AArch64::FPR128RegClass.hasSubClassEq(RC)) Opc = AArch64::STRQui; else if (AArch64::DDRegClass.hasSubClassEq(RC)) { - assert(Subtarget.hasNEON() && - "Unexpected register store without NEON"); + assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); Opc = AArch64::ST1Twov1d; Offset = false; } break; case 24: if (AArch64::DDDRegClass.hasSubClassEq(RC)) { - assert(Subtarget.hasNEON() && - "Unexpected register store without NEON"); + assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); Opc = AArch64::ST1Threev1d; Offset = false; } break; case 32: if (AArch64::DDDDRegClass.hasSubClassEq(RC)) { - assert(Subtarget.hasNEON() && - "Unexpected register store without NEON"); + assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); Opc = AArch64::ST1Fourv1d; Offset = false; } else if (AArch64::QQRegClass.hasSubClassEq(RC)) { - assert(Subtarget.hasNEON() && - "Unexpected register store without NEON"); + assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); Opc = AArch64::ST1Twov2d; Offset = false; } break; case 48: if (AArch64::QQQRegClass.hasSubClassEq(RC)) { - assert(Subtarget.hasNEON() && - "Unexpected register store without NEON"); + assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); Opc = AArch64::ST1Threev2d; Offset = false; } break; case 64: if (AArch64::QQQQRegClass.hasSubClassEq(RC)) { - assert(Subtarget.hasNEON() && - "Unexpected register store without NEON"); + assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); Opc = AArch64::ST1Fourv2d; Offset = false; } @@ -2523,8 +2630,8 @@ void AArch64InstrInfo::storeRegToStackSlot( assert(Opc && "Unknown register class"); const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DL, get(Opc)) - .addReg(SrcReg, getKillRegState(isKill)) - .addFrameIndex(FI); + .addReg(SrcReg, getKillRegState(isKill)) + .addFrameIndex(FI); if (Offset) MI.addImm(0); @@ -2580,45 +2687,39 @@ void AArch64InstrInfo::loadRegFromStackSlot( if (AArch64::FPR128RegClass.hasSubClassEq(RC)) Opc = AArch64::LDRQui; else if (AArch64::DDRegClass.hasSubClassEq(RC)) { - assert(Subtarget.hasNEON() && - "Unexpected register load without NEON"); + assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); Opc = AArch64::LD1Twov1d; Offset = false; } break; case 24: if (AArch64::DDDRegClass.hasSubClassEq(RC)) { - assert(Subtarget.hasNEON() && - "Unexpected register load without NEON"); + assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); Opc = AArch64::LD1Threev1d; Offset = false; } break; case 32: if (AArch64::DDDDRegClass.hasSubClassEq(RC)) { - assert(Subtarget.hasNEON() && - "Unexpected register load without NEON"); + assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); Opc = AArch64::LD1Fourv1d; Offset = false; } else if (AArch64::QQRegClass.hasSubClassEq(RC)) { - assert(Subtarget.hasNEON() && - "Unexpected register load without NEON"); + assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); Opc = AArch64::LD1Twov2d; Offset = false; } break; case 48: if (AArch64::QQQRegClass.hasSubClassEq(RC)) { - assert(Subtarget.hasNEON() && - "Unexpected register load without NEON"); + assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); Opc = AArch64::LD1Threev2d; Offset = false; } break; case 64: if (AArch64::QQQQRegClass.hasSubClassEq(RC)) { - assert(Subtarget.hasNEON() && - "Unexpected register load without NEON"); + assert(Subtarget.hasNEON() && "Unexpected 
register load without NEON"); Opc = AArch64::LD1Fourv2d; Offset = false; } @@ -2627,8 +2728,8 @@ void AArch64InstrInfo::loadRegFromStackSlot( assert(Opc && "Unknown register class"); const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DL, get(Opc)) - .addReg(DestReg, getDefRegState(true)) - .addFrameIndex(FI); + .addReg(DestReg, getDefRegState(true)) + .addFrameIndex(FI); if (Offset) MI.addImm(0); MI.addMemOperand(MMO); @@ -2701,14 +2802,14 @@ MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl( LiveIntervals *LIS) const { // This is a bit of a hack. Consider this instruction: // - // %vreg0<def> = COPY %SP; GPR64all:%vreg0 + // %0 = COPY %sp; GPR64all:%0 // // We explicitly chose GPR64all for the virtual register so such a copy might // be eliminated by RegisterCoalescer. However, that may not be possible, and - // %vreg0 may even spill. We can't spill %SP, and since it is in the GPR64all + // %0 may even spill. We can't spill %sp, and since it is in the GPR64all // register class, TargetInstrInfo::foldMemoryOperand() is going to try. // - // To prevent that, we are going to constrain the %vreg0 register class here. + // To prevent that, we are going to constrain the %0 register class here. // // <rdar://problem/11522048> // @@ -2730,26 +2831,26 @@ MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl( // Handle the case where a copy is being spilled or filled but the source // and destination register class don't match. For example: // - // %vreg0<def> = COPY %XZR; GPR64common:%vreg0 + // %0 = COPY %xzr; GPR64common:%0 // // In this case we can still safely fold away the COPY and generate the // following spill code: // - // STRXui %XZR, <fi#0> + // STRXui %xzr, %stack.0 // // This also eliminates spilled cross register class COPYs (e.g. between x and // d regs) of the same size. For example: // - // %vreg0<def> = COPY %vreg1; GPR64:%vreg0, FPR64:%vreg1 + // %0 = COPY %1; GPR64:%0, FPR64:%1 // // will be filled as // - // LDRDui %vreg0, fi<#0> + // LDRDui %0, fi<#0> // // instead of // - // LDRXui %vregTemp, fi<#0> - // %vreg0 = FMOV %vregTemp + // LDRXui %Temp, fi<#0> + // %0 = FMOV %Temp // if (MI.isCopy() && Ops.size() == 1 && // Make sure we're only folding the explicit COPY defs/uses. 
@@ -2773,7 +2874,7 @@ MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl( if (DstMO.getSubReg() == 0 && SrcMO.getSubReg() == 0) { assert(TRI.getRegSizeInBits(*getRegClass(DstReg)) == - TRI.getRegSizeInBits(*getRegClass(SrcReg)) && + TRI.getRegSizeInBits(*getRegClass(SrcReg)) && "Mismatched register size in non subreg COPY"); if (IsSpill) storeRegToStackSlot(MBB, InsertPt, SrcReg, SrcMO.isKill(), FrameIndex, @@ -2786,12 +2887,12 @@ MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl( // Handle cases like spilling def of: // - // %vreg0:sub_32<def,read-undef> = COPY %WZR; GPR64common:%vreg0 + // %0:sub_32<def,read-undef> = COPY %wzr; GPR64common:%0 // // where the physical register source can be widened and stored to the full // virtual reg destination stack slot, in this case producing: // - // STRXui %XZR, <fi#0> + // STRXui %xzr, %stack.0 // if (IsSpill && DstMO.isUndef() && TargetRegisterInfo::isPhysicalRegister(SrcReg)) { @@ -2834,12 +2935,12 @@ MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl( // Handle cases like filling use of: // - // %vreg0:sub_32<def,read-undef> = COPY %vreg1; GPR64:%vreg0, GPR32:%vreg1 + // %0:sub_32<def,read-undef> = COPY %1; GPR64:%0, GPR32:%1 // // where we can load the full virtual reg source stack slot, into the subreg // destination, in this case producing: // - // LDRWui %vreg0:sub_32<def,read-undef>, <fi#0> + // LDRWui %0:sub_32<def,read-undef>, %stack.0 // if (IsFill && SrcMO.getSubReg() == 0 && DstMO.isUndef()) { const TargetRegisterClass *FillRC; @@ -3156,10 +3257,7 @@ void AArch64InstrInfo::getNoop(MCInst &NopInst) const { } // AArch64 supports MachineCombiner. -bool AArch64InstrInfo::useMachineCombiner() const { - - return true; -} +bool AArch64InstrInfo::useMachineCombiner() const { return true; } // True when Opc sets flag static bool isCombineInstrSettingFlag(unsigned Opc) { @@ -3293,7 +3391,8 @@ static bool canCombineWithFMUL(MachineBasicBlock &MBB, MachineOperand &MO, // 1. Other data types (integer, vectors) // 2. Other math / logic operations (xor, or) // 3. 
Other forms of the same operation (intrinsics and other variants) -bool AArch64InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst) const { +bool AArch64InstrInfo::isAssociativeAndCommutative( + const MachineInstr &Inst) const { switch (Inst.getOpcode()) { case AArch64::FADDDrr: case AArch64::FADDSrr: @@ -3574,6 +3673,15 @@ static bool getFMAPatterns(MachineInstr &Root, } break; case AArch64::FSUBv2f32: + if (canCombineWithFMUL(MBB, Root.getOperand(1), + AArch64::FMULv2i32_indexed)) { + Patterns.push_back(MachineCombinerPattern::FMLSv2i32_indexed_OP1); + Found = true; + } else if (canCombineWithFMUL(MBB, Root.getOperand(1), + AArch64::FMULv2f32)) { + Patterns.push_back(MachineCombinerPattern::FMLSv2f32_OP1); + Found = true; + } if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULv2i32_indexed)) { Patterns.push_back(MachineCombinerPattern::FMLSv2i32_indexed_OP2); @@ -3585,6 +3693,15 @@ static bool getFMAPatterns(MachineInstr &Root, } break; case AArch64::FSUBv2f64: + if (canCombineWithFMUL(MBB, Root.getOperand(1), + AArch64::FMULv2i64_indexed)) { + Patterns.push_back(MachineCombinerPattern::FMLSv2i64_indexed_OP1); + Found = true; + } else if (canCombineWithFMUL(MBB, Root.getOperand(1), + AArch64::FMULv2f64)) { + Patterns.push_back(MachineCombinerPattern::FMLSv2f64_OP1); + Found = true; + } if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULv2i64_indexed)) { Patterns.push_back(MachineCombinerPattern::FMLSv2i64_indexed_OP2); @@ -3596,6 +3713,15 @@ static bool getFMAPatterns(MachineInstr &Root, } break; case AArch64::FSUBv4f32: + if (canCombineWithFMUL(MBB, Root.getOperand(1), + AArch64::FMULv4i32_indexed)) { + Patterns.push_back(MachineCombinerPattern::FMLSv4i32_indexed_OP1); + Found = true; + } else if (canCombineWithFMUL(MBB, Root.getOperand(1), + AArch64::FMULv4f32)) { + Patterns.push_back(MachineCombinerPattern::FMLSv4f32_OP1); + Found = true; + } if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULv4i32_indexed)) { Patterns.push_back(MachineCombinerPattern::FMLSv4i32_indexed_OP2); @@ -3613,8 +3739,8 @@ static bool getFMAPatterns(MachineInstr &Root, /// Return true when a code sequence can improve throughput. It /// should be called only for instructions in loops. /// \param Pattern - combiner pattern -bool -AArch64InstrInfo::isThroughputPattern(MachineCombinerPattern Pattern) const { +bool AArch64InstrInfo::isThroughputPattern( + MachineCombinerPattern Pattern) const { switch (Pattern) { default: break; @@ -3692,12 +3818,15 @@ enum class FMAInstKind { Default, Indexed, Accumulator }; /// \param MaddOpc the opcode fo the f|madd instruction /// \param RC Register class of operands /// \param kind of fma instruction (addressing mode) to be generated +/// \param ReplacedAddend is the result register from the instruction +/// replacing the non-combined operand, if any. static MachineInstr * genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs, unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC, - FMAInstKind kind = FMAInstKind::Default) { + FMAInstKind kind = FMAInstKind::Default, + const unsigned *ReplacedAddend = nullptr) { assert(IdxMulOpd == 1 || IdxMulOpd == 2); unsigned IdxOtherOpd = IdxMulOpd == 1 ? 
2 : 1; @@ -3707,8 +3836,17 @@ genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI, bool Src0IsKill = MUL->getOperand(1).isKill(); unsigned SrcReg1 = MUL->getOperand(2).getReg(); bool Src1IsKill = MUL->getOperand(2).isKill(); - unsigned SrcReg2 = Root.getOperand(IdxOtherOpd).getReg(); - bool Src2IsKill = Root.getOperand(IdxOtherOpd).isKill(); + + unsigned SrcReg2; + bool Src2IsKill; + if (ReplacedAddend) { + // If we just generated a new addend, we must be it's only use. + SrcReg2 = *ReplacedAddend; + Src2IsKill = true; + } else { + SrcReg2 = Root.getOperand(IdxOtherOpd).getReg(); + Src2IsKill = Root.getOperand(IdxOtherOpd).isKill(); + } if (TargetRegisterInfo::isVirtualRegister(ResultReg)) MRI.constrainRegClass(ResultReg, RC); @@ -3765,8 +3903,8 @@ genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI, static MachineInstr *genMaddR(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs, - unsigned IdxMulOpd, unsigned MaddOpc, - unsigned VR, const TargetRegisterClass *RC) { + unsigned IdxMulOpd, unsigned MaddOpc, unsigned VR, + const TargetRegisterClass *RC) { assert(IdxMulOpd == 1 || IdxMulOpd == 2); MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg()); @@ -3785,11 +3923,11 @@ static MachineInstr *genMaddR(MachineFunction &MF, MachineRegisterInfo &MRI, if (TargetRegisterInfo::isVirtualRegister(VR)) MRI.constrainRegClass(VR, RC); - MachineInstrBuilder MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), - ResultReg) - .addReg(SrcReg0, getKillRegState(Src0IsKill)) - .addReg(SrcReg1, getKillRegState(Src1IsKill)) - .addReg(VR); + MachineInstrBuilder MIB = + BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg) + .addReg(SrcReg0, getKillRegState(Src0IsKill)) + .addReg(SrcReg1, getKillRegState(Src1IsKill)) + .addReg(VR); // Insert the MADD InsInstrs.push_back(MIB); return MUL; @@ -4228,6 +4366,66 @@ void AArch64InstrInfo::genAlternativeCodeSequence( FMAInstKind::Accumulator); } break; + case MachineCombinerPattern::FMLSv2f32_OP1: + case MachineCombinerPattern::FMLSv2i32_indexed_OP1: { + RC = &AArch64::FPR64RegClass; + unsigned NewVR = MRI.createVirtualRegister(RC); + MachineInstrBuilder MIB1 = + BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv2f32), NewVR) + .add(Root.getOperand(2)); + InsInstrs.push_back(MIB1); + InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); + if (Pattern == MachineCombinerPattern::FMLSv2i32_indexed_OP1) { + Opc = AArch64::FMLAv2i32_indexed; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, + FMAInstKind::Indexed, &NewVR); + } else { + Opc = AArch64::FMLAv2f32; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, + FMAInstKind::Accumulator, &NewVR); + } + break; + } + case MachineCombinerPattern::FMLSv4f32_OP1: + case MachineCombinerPattern::FMLSv4i32_indexed_OP1: { + RC = &AArch64::FPR128RegClass; + unsigned NewVR = MRI.createVirtualRegister(RC); + MachineInstrBuilder MIB1 = + BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv4f32), NewVR) + .add(Root.getOperand(2)); + InsInstrs.push_back(MIB1); + InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); + if (Pattern == MachineCombinerPattern::FMLSv4i32_indexed_OP1) { + Opc = AArch64::FMLAv4i32_indexed; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, + FMAInstKind::Indexed, &NewVR); + } else { + Opc = AArch64::FMLAv4f32; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, + FMAInstKind::Accumulator, 
&NewVR); + } + break; + } + case MachineCombinerPattern::FMLSv2f64_OP1: + case MachineCombinerPattern::FMLSv2i64_indexed_OP1: { + RC = &AArch64::FPR128RegClass; + unsigned NewVR = MRI.createVirtualRegister(RC); + MachineInstrBuilder MIB1 = + BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv2f64), NewVR) + .add(Root.getOperand(2)); + InsInstrs.push_back(MIB1); + InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); + if (Pattern == MachineCombinerPattern::FMLSv2i64_indexed_OP1) { + Opc = AArch64::FMLAv2i64_indexed; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, + FMAInstKind::Indexed, &NewVR); + } else { + Opc = AArch64::FMLAv2f64; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, + FMAInstKind::Accumulator, &NewVR); + } + break; + } } // end switch (Pattern) // Record MUL and ADD/SUB for deletion DelInstrs.push_back(MUL); @@ -4419,12 +4617,9 @@ AArch64InstrInfo::getSerializableDirectMachineOperandTargetFlags() const { using namespace AArch64II; static const std::pair<unsigned, const char *> TargetFlags[] = { - {MO_PAGE, "aarch64-page"}, - {MO_PAGEOFF, "aarch64-pageoff"}, - {MO_G3, "aarch64-g3"}, - {MO_G2, "aarch64-g2"}, - {MO_G1, "aarch64-g1"}, - {MO_G0, "aarch64-g0"}, + {MO_PAGE, "aarch64-page"}, {MO_PAGEOFF, "aarch64-pageoff"}, + {MO_G3, "aarch64-g3"}, {MO_G2, "aarch64-g2"}, + {MO_G1, "aarch64-g1"}, {MO_G0, "aarch64-g0"}, {MO_HI12, "aarch64-hi12"}}; return makeArrayRef(TargetFlags); } @@ -4434,9 +4629,7 @@ AArch64InstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const { using namespace AArch64II; static const std::pair<unsigned, const char *> TargetFlags[] = { - {MO_GOT, "aarch64-got"}, - {MO_NC, "aarch64-nc"}, - {MO_TLS, "aarch64-tls"}}; + {MO_GOT, "aarch64-got"}, {MO_NC, "aarch64-nc"}, {MO_TLS, "aarch64-tls"}}; return makeArrayRef(TargetFlags); } @@ -4448,30 +4641,148 @@ AArch64InstrInfo::getSerializableMachineMemOperandTargetFlags() const { return makeArrayRef(TargetFlags); } -unsigned AArch64InstrInfo::getOutliningBenefit(size_t SequenceSize, - size_t Occurrences, - bool CanBeTailCall) const { - unsigned NotOutlinedSize = SequenceSize * Occurrences; - unsigned OutlinedSize; - - // Is this candidate something we can outline as a tail call? - if (CanBeTailCall) { - // If yes, then we just outline the sequence and replace each of its - // occurrences with a branch instruction. - OutlinedSize = SequenceSize + Occurrences; - } else { - // If no, then we outline the sequence (SequenceSize), add a return (+1), - // and replace each occurrence with a save/restore to LR and a call - // (3 * Occurrences) - OutlinedSize = (SequenceSize + 1) + (3 * Occurrences); + /// Constants defining how certain sequences should be outlined. + /// This encompasses how an outlined function should be called, and what kind of + /// frame should be emitted for that outlined function. + /// + /// \p MachineOutlinerDefault implies that the function should be called with + /// a save and restore of LR to the stack. + /// + /// That is, + /// + /// I1 Save LR OUTLINED_FUNCTION: + /// I2 --> BL OUTLINED_FUNCTION I1 + /// I3 Restore LR I2 + /// I3 + /// RET + /// + /// * Call construction overhead: 3 (save + BL + restore) + /// * Frame construction overhead: 1 (ret) + /// * Requires stack fixups? Yes + /// + /// \p MachineOutlinerTailCall implies that the function is being created from + /// a sequence of instructions ending in a return. 
+ /// + /// That is, + /// + /// I1 OUTLINED_FUNCTION: + /// I2 --> B OUTLINED_FUNCTION I1 + /// RET I2 + /// RET + /// + /// * Call construction overhead: 1 (B) + /// * Frame construction overhead: 0 (Return included in sequence) + /// * Requires stack fixups? No + /// + /// \p MachineOutlinerNoLRSave implies that the function should be called using + /// a BL instruction, but doesn't require LR to be saved and restored. This + /// happens when LR is known to be dead. + /// + /// That is, + /// + /// I1 OUTLINED_FUNCTION: + /// I2 --> BL OUTLINED_FUNCTION I1 + /// I3 I2 + /// I3 + /// RET + /// + /// * Call construction overhead: 1 (BL) + /// * Frame construction overhead: 1 (RET) + /// * Requires stack fixups? No + /// +enum MachineOutlinerClass { + MachineOutlinerDefault, /// Emit a save, restore, call, and return. + MachineOutlinerTailCall, /// Only emit a branch. + MachineOutlinerNoLRSave /// Emit a call and return. +}; + +bool AArch64InstrInfo::canOutlineWithoutLRSave( + MachineBasicBlock::iterator &CallInsertionPt) const { + // Was LR saved in the function containing this basic block? + MachineBasicBlock &MBB = *(CallInsertionPt->getParent()); + LiveRegUnits LRU(getRegisterInfo()); + LRU.addLiveOuts(MBB); + + // Get liveness information from the end of the block to the end of the + // prospective outlined region. + std::for_each(MBB.rbegin(), + (MachineBasicBlock::reverse_iterator)CallInsertionPt, + [&LRU](MachineInstr &MI) { LRU.stepBackward(MI); }); + + // If the link register is available at this point, then we can safely outline + // the region without saving/restoring LR. Otherwise, we must emit a save and + // restore. + return LRU.available(AArch64::LR); +} + +AArch64GenInstrInfo::MachineOutlinerInfo +AArch64InstrInfo::getOutlininingCandidateInfo( + std::vector< + std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>> + &RepeatedSequenceLocs) const { + + unsigned CallID = MachineOutlinerDefault; + unsigned FrameID = MachineOutlinerDefault; + unsigned NumInstrsForCall = 3; + unsigned NumInstrsToCreateFrame = 1; + + auto DoesntNeedLRSave = + [this](std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator> + &I) { return canOutlineWithoutLRSave(I.second); }; + + // If the last instruction in any candidate is a terminator, then we should + // tail call all of the candidates. + if (RepeatedSequenceLocs[0].second->isTerminator()) { + CallID = MachineOutlinerTailCall; + FrameID = MachineOutlinerTailCall; + NumInstrsForCall = 1; + NumInstrsToCreateFrame = 0; + } + + else if (std::all_of(RepeatedSequenceLocs.begin(), RepeatedSequenceLocs.end(), + DoesntNeedLRSave)) { + CallID = MachineOutlinerNoLRSave; + FrameID = MachineOutlinerNoLRSave; + NumInstrsForCall = 1; + NumInstrsToCreateFrame = 1; } - // Return the number of instructions saved by outlining this sequence. - return NotOutlinedSize > OutlinedSize ? NotOutlinedSize - OutlinedSize : 0; + // Check if the range contains a call. These require a save + restore of the + // link register. + if (std::any_of(RepeatedSequenceLocs[0].first, RepeatedSequenceLocs[0].second, + [](const MachineInstr &MI) { return MI.isCall(); })) + NumInstrsToCreateFrame += 2; // Save + restore the link register. + + // Handle the last instruction separately. If this is a tail call, then the + // last instruction is a call. We don't want to save + restore in this case. + // However, it could be possible that the last instruction is a call without + // it being valid to tail call this sequence. We should consider this as well. 
+ else if (RepeatedSequenceLocs[0].second->isCall() && + FrameID != MachineOutlinerTailCall) + NumInstrsToCreateFrame += 2; + + return MachineOutlinerInfo(NumInstrsForCall, NumInstrsToCreateFrame, CallID, + FrameID); } -bool AArch64InstrInfo::isFunctionSafeToOutlineFrom(MachineFunction &MF) const { - return MF.getFunction()->hasFnAttribute(Attribute::NoRedZone); +bool AArch64InstrInfo::isFunctionSafeToOutlineFrom( + MachineFunction &MF, bool OutlineFromLinkOnceODRs) const { + const Function &F = MF.getFunction(); + + // If F uses a redzone, then don't outline from it because it might mess up + // the stack. + if (!F.hasFnAttribute(Attribute::NoRedZone)) + return false; + + // If anyone is using the address of this function, don't outline from it. + if (F.hasAddressTaken()) + return false; + + // Can F be deduplicated by the linker? If it can, don't outline from it. + if (!OutlineFromLinkOnceODRs && F.hasLinkOnceODRLinkage()) + return false; + + return true; } AArch64GenInstrInfo::MachineOutlinerInstrType @@ -4493,54 +4804,121 @@ AArch64InstrInfo::getOutliningType(MachineInstr &MI) const { // Is this the end of a function? if (MI.getParent()->succ_empty()) - return MachineOutlinerInstrType::Legal; + return MachineOutlinerInstrType::Legal; // It's not, so don't outline it. return MachineOutlinerInstrType::Illegal; } + // Outline calls without stack parameters or aggregate parameters. + if (MI.isCall()) { + const Module *M = MF->getFunction().getParent(); + assert(M && "No module?"); + + // Get the function associated with the call. Look at each operand and find + // the one that represents the callee and get its name. + Function *Callee = nullptr; + for (const MachineOperand &MOP : MI.operands()) { + if (MOP.isSymbol()) { + Callee = M->getFunction(MOP.getSymbolName()); + break; + } + + else if (MOP.isGlobal()) { + Callee = M->getFunction(MOP.getGlobal()->getGlobalIdentifier()); + break; + } + } + + // Only handle functions that we have information about. + if (!Callee) + return MachineOutlinerInstrType::Illegal; + + // We have a function we have information about. Check it if it's something + // can safely outline. + + // If the callee is vararg, it passes parameters on the stack. Don't touch + // it. + // FIXME: Functions like printf are very common and we should be able to + // outline them. + if (Callee->isVarArg()) + return MachineOutlinerInstrType::Illegal; + + // Check if any of the arguments are a pointer to a struct. We don't want + // to outline these since they might be loaded in two instructions. + for (Argument &Arg : Callee->args()) { + if (Arg.getType()->isPointerTy() && + Arg.getType()->getPointerElementType()->isAggregateType()) + return MachineOutlinerInstrType::Illegal; + } + + // If the thing we're calling doesn't access memory at all, then we're good + // to go. + if (Callee->doesNotAccessMemory()) + return MachineOutlinerInstrType::Legal; + + // It accesses memory. Get the machine function for the callee to see if + // it's safe to outline. + MachineFunction *CalleeMF = MF->getMMI().getMachineFunction(*Callee); + + // We don't know what's going on with the callee at all. Don't touch it. + if (!CalleeMF) + return MachineOutlinerInstrType::Illegal; + + // Does it pass anything on the stack? If it does, don't outline it. + if (CalleeMF->getInfo<AArch64FunctionInfo>()->getBytesInStackArgArea() != 0) + return MachineOutlinerInstrType::Illegal; + + // It doesn't, so it's safe to outline and we're done. + return MachineOutlinerInstrType::Legal; + } + // Don't outline positions. 
if (MI.isPosition()) return MachineOutlinerInstrType::Illegal; + // Don't touch the link register or W30. + if (MI.readsRegister(AArch64::W30, &getRegisterInfo()) || + MI.modifiesRegister(AArch64::W30, &getRegisterInfo())) + return MachineOutlinerInstrType::Illegal; + // Make sure none of the operands are un-outlinable. - for (const MachineOperand &MOP : MI.operands()) + for (const MachineOperand &MOP : MI.operands()) { if (MOP.isCPI() || MOP.isJTI() || MOP.isCFIIndex() || MOP.isFI() || MOP.isTargetIndex()) return MachineOutlinerInstrType::Illegal; - // Don't outline anything that uses the link register. - if (MI.modifiesRegister(AArch64::LR, &RI) || - MI.readsRegister(AArch64::LR, &RI)) + // Don't outline anything that uses the link register. + if (MOP.isReg() && getRegisterInfo().regsOverlap(MOP.getReg(), AArch64::LR)) return MachineOutlinerInstrType::Illegal; + } // Does this use the stack? if (MI.modifiesRegister(AArch64::SP, &RI) || MI.readsRegister(AArch64::SP, &RI)) { - // Is it a memory operation? if (MI.mayLoadOrStore()) { - unsigned Base; // Filled with the base regiser of MI. + unsigned Base; // Filled with the base regiser of MI. int64_t Offset; // Filled with the offset of MI. unsigned DummyWidth; // Does it allow us to offset the base register and is the base SP? if (!getMemOpBaseRegImmOfsWidth(MI, Base, Offset, DummyWidth, &RI) || - Base != AArch64::SP) + Base != AArch64::SP) return MachineOutlinerInstrType::Illegal; // Find the minimum/maximum offset for this instruction and check if // fixing it up would be in range. - int64_t MinOffset, MaxOffset; - unsigned DummyScale; - getMemOpInfo(MI.getOpcode(), DummyScale, DummyWidth, MinOffset, - MaxOffset); + int64_t MinOffset, MaxOffset; // Unscaled offsets for the instruction. + unsigned Scale; // The scale to multiply the offsets by. + getMemOpInfo(MI.getOpcode(), Scale, DummyWidth, MinOffset, MaxOffset); // TODO: We should really test what happens if an instruction overflows. // This is tricky to test with IR tests, but when the outliner is moved // to a MIR test, it really ought to be checked. - if (Offset + 16 < MinOffset || Offset + 16 > MaxOffset) - return MachineOutlinerInstrType::Illegal; + Offset += 16; // Update the offset to what it would be if we outlined. + if (Offset < MinOffset * Scale || Offset > MaxOffset * Scale) + return MachineOutlinerInstrType::Illegal; // It's in range, so we can outline it. return MachineOutlinerInstrType::Legal; @@ -4576,17 +4954,57 @@ void AArch64InstrInfo::fixupPostOutline(MachineBasicBlock &MBB) const { // We've pushed the return address to the stack, so add 16 to the offset. // This is safe, since we already checked if it would overflow when we // checked if this instruction was legal to outline. - int64_t NewImm = (Offset + 16)/Scale; + int64_t NewImm = (Offset + 16) / Scale; StackOffsetOperand.setImm(NewImm); } } -void AArch64InstrInfo::insertOutlinerEpilogue(MachineBasicBlock &MBB, - MachineFunction &MF, - bool IsTailCall) const { +void AArch64InstrInfo::insertOutlinerEpilogue( + MachineBasicBlock &MBB, MachineFunction &MF, + const MachineOutlinerInfo &MInfo) const { + + bool ContainsCalls = false; + + for (MachineInstr &MI : MBB) { + if (MI.isCall()) { + ContainsCalls = true; + break; + } + } + + if (ContainsCalls) { + // Fix up the instructions in the range, since we're going to modify the + // stack. + fixupPostOutline(MBB); + + // LR has to be a live in so that we can save it. 
+ MBB.addLiveIn(AArch64::LR); + + MachineBasicBlock::iterator It = MBB.begin(); + MachineBasicBlock::iterator Et = MBB.end(); + + if (MInfo.FrameConstructionID == MachineOutlinerTailCall) + Et = std::prev(MBB.end()); + + // Insert a save before the outlined region + MachineInstr *STRXpre = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre)) + .addReg(AArch64::SP, RegState::Define) + .addReg(AArch64::LR) + .addReg(AArch64::SP) + .addImm(-16); + It = MBB.insert(It, STRXpre); + + // Insert a restore before the terminator for the function. + MachineInstr *LDRXpost = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost)) + .addReg(AArch64::SP, RegState::Define) + .addReg(AArch64::LR, RegState::Define) + .addReg(AArch64::SP) + .addImm(16); + Et = MBB.insert(Et, LDRXpost); + } // If this is a tail call outlined function, then there's already a return. - if (IsTailCall) + if (MInfo.FrameConstructionID == MachineOutlinerTailCall) return; // It's not a tail call, so we have to insert the return ourselves. @@ -4594,29 +5012,40 @@ void AArch64InstrInfo::insertOutlinerEpilogue(MachineBasicBlock &MBB, .addReg(AArch64::LR, RegState::Undef); MBB.insert(MBB.end(), ret); + // Did we have to modify the stack by saving the link register? + if (MInfo.FrameConstructionID == MachineOutlinerNoLRSave) + return; + + // We modified the stack. // Walk over the basic block and fix up all the stack accesses. fixupPostOutline(MBB); } -void AArch64InstrInfo::insertOutlinerPrologue(MachineBasicBlock &MBB, - MachineFunction &MF, - bool IsTailCall) const {} +void AArch64InstrInfo::insertOutlinerPrologue( + MachineBasicBlock &MBB, MachineFunction &MF, + const MachineOutlinerInfo &MInfo) const {} MachineBasicBlock::iterator AArch64InstrInfo::insertOutlinedCall( Module &M, MachineBasicBlock &MBB, MachineBasicBlock::iterator &It, - MachineFunction &MF, bool IsTailCall) const { + MachineFunction &MF, const MachineOutlinerInfo &MInfo) const { // Are we tail calling? - if (IsTailCall) { + if (MInfo.CallConstructionID == MachineOutlinerTailCall) { // If yes, then we can just branch to the label. - It = MBB.insert(It, - BuildMI(MF, DebugLoc(), get(AArch64::B)) - .addGlobalAddress(M.getNamedValue(MF.getName()))); + It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::B)) + .addGlobalAddress(M.getNamedValue(MF.getName()))); return It; } - // We're not tail calling, so we have to save LR before the call and restore - // it after. + // Are we saving the link register? + if (MInfo.CallConstructionID == MachineOutlinerNoLRSave) { + // No, so just insert the call. + It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL)) + .addGlobalAddress(M.getNamedValue(MF.getName()))); + return It; + } + + // We have a default call. Save the link register. MachineInstr *STRXpre = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre)) .addReg(AArch64::SP, RegState::Define) .addReg(AArch64::LR) @@ -4626,20 +5055,18 @@ MachineBasicBlock::iterator AArch64InstrInfo::insertOutlinedCall( It++; // Insert the call. - It = MBB.insert(It, - BuildMI(MF, DebugLoc(), get(AArch64::BL)) - .addGlobalAddress(M.getNamedValue(MF.getName()))); + It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL)) + .addGlobalAddress(M.getNamedValue(MF.getName()))); It++; // Restore the link register. MachineInstr *LDRXpost = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost)) .addReg(AArch64::SP, RegState::Define) - .addReg(AArch64::LR) + .addReg(AArch64::LR, RegState::Define) .addReg(AArch64::SP) .addImm(16); It = MBB.insert(It, LDRXpost); return It; -} - +}
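For orientation, a sketch of the call sequences the three MachineOutliner strategies above produce, in the same pseudo-assembly style as the comments in getOutlininingCandidateInfo; OUTLINED_FUNCTION_N is a placeholder name for the generated outlined function:

    ; MachineOutlinerDefault: save LR, call, restore LR (3 call instructions)
    str  x30, [sp, #-16]!      ; STRXpre
    bl   OUTLINED_FUNCTION_N
    ldr  x30, [sp], #16        ; LDRXpost
    ; MachineOutlinerNoLRSave: LR is known dead at the call site (1 instruction)
    bl   OUTLINED_FUNCTION_N
    ; MachineOutlinerTailCall: the outlined sequence ends in a return (1 instruction)
    b    OUTLINED_FUNCTION_N

While LR sits on the stack in the default case, SP-relative offsets inside the outlined body are rebased by 16 bytes, which is the (Offset + 16) / Scale adjustment performed in fixupPostOutline.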
\ No newline at end of file diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.h index 1765a0263ea4..2f10bef1e474 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.h +++ b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.h @@ -17,7 +17,7 @@ #include "AArch64.h" #include "AArch64RegisterInfo.h" #include "llvm/CodeGen/MachineCombinerPattern.h" -#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/CodeGen/TargetInstrInfo.h" #define GET_INSTRINFO_HEADER #include "AArch64GenInstrInfo.inc" @@ -136,37 +136,88 @@ public: default: llvm_unreachable("Opcode has no flag setting equivalent!"); // 32-bit cases: - case AArch64::ADDWri: Is64Bit = false; return AArch64::ADDSWri; - case AArch64::ADDWrr: Is64Bit = false; return AArch64::ADDSWrr; - case AArch64::ADDWrs: Is64Bit = false; return AArch64::ADDSWrs; - case AArch64::ADDWrx: Is64Bit = false; return AArch64::ADDSWrx; - case AArch64::ANDWri: Is64Bit = false; return AArch64::ANDSWri; - case AArch64::ANDWrr: Is64Bit = false; return AArch64::ANDSWrr; - case AArch64::ANDWrs: Is64Bit = false; return AArch64::ANDSWrs; - case AArch64::BICWrr: Is64Bit = false; return AArch64::BICSWrr; - case AArch64::BICWrs: Is64Bit = false; return AArch64::BICSWrs; - case AArch64::SUBWri: Is64Bit = false; return AArch64::SUBSWri; - case AArch64::SUBWrr: Is64Bit = false; return AArch64::SUBSWrr; - case AArch64::SUBWrs: Is64Bit = false; return AArch64::SUBSWrs; - case AArch64::SUBWrx: Is64Bit = false; return AArch64::SUBSWrx; + case AArch64::ADDWri: + Is64Bit = false; + return AArch64::ADDSWri; + case AArch64::ADDWrr: + Is64Bit = false; + return AArch64::ADDSWrr; + case AArch64::ADDWrs: + Is64Bit = false; + return AArch64::ADDSWrs; + case AArch64::ADDWrx: + Is64Bit = false; + return AArch64::ADDSWrx; + case AArch64::ANDWri: + Is64Bit = false; + return AArch64::ANDSWri; + case AArch64::ANDWrr: + Is64Bit = false; + return AArch64::ANDSWrr; + case AArch64::ANDWrs: + Is64Bit = false; + return AArch64::ANDSWrs; + case AArch64::BICWrr: + Is64Bit = false; + return AArch64::BICSWrr; + case AArch64::BICWrs: + Is64Bit = false; + return AArch64::BICSWrs; + case AArch64::SUBWri: + Is64Bit = false; + return AArch64::SUBSWri; + case AArch64::SUBWrr: + Is64Bit = false; + return AArch64::SUBSWrr; + case AArch64::SUBWrs: + Is64Bit = false; + return AArch64::SUBSWrs; + case AArch64::SUBWrx: + Is64Bit = false; + return AArch64::SUBSWrx; // 64-bit cases: - case AArch64::ADDXri: Is64Bit = true; return AArch64::ADDSXri; - case AArch64::ADDXrr: Is64Bit = true; return AArch64::ADDSXrr; - case AArch64::ADDXrs: Is64Bit = true; return AArch64::ADDSXrs; - case AArch64::ADDXrx: Is64Bit = true; return AArch64::ADDSXrx; - case AArch64::ANDXri: Is64Bit = true; return AArch64::ANDSXri; - case AArch64::ANDXrr: Is64Bit = true; return AArch64::ANDSXrr; - case AArch64::ANDXrs: Is64Bit = true; return AArch64::ANDSXrs; - case AArch64::BICXrr: Is64Bit = true; return AArch64::BICSXrr; - case AArch64::BICXrs: Is64Bit = true; return AArch64::BICSXrs; - case AArch64::SUBXri: Is64Bit = true; return AArch64::SUBSXri; - case AArch64::SUBXrr: Is64Bit = true; return AArch64::SUBSXrr; - case AArch64::SUBXrs: Is64Bit = true; return AArch64::SUBSXrs; - case AArch64::SUBXrx: Is64Bit = true; return AArch64::SUBSXrx; + case AArch64::ADDXri: + Is64Bit = true; + return AArch64::ADDSXri; + case AArch64::ADDXrr: + Is64Bit = true; + return AArch64::ADDSXrr; + case AArch64::ADDXrs: + Is64Bit = true; + return AArch64::ADDSXrs; + case AArch64::ADDXrx: + 
Is64Bit = true; + return AArch64::ADDSXrx; + case AArch64::ANDXri: + Is64Bit = true; + return AArch64::ANDSXri; + case AArch64::ANDXrr: + Is64Bit = true; + return AArch64::ANDSXrr; + case AArch64::ANDXrs: + Is64Bit = true; + return AArch64::ANDSXrs; + case AArch64::BICXrr: + Is64Bit = true; + return AArch64::BICSXrr; + case AArch64::BICXrs: + Is64Bit = true; + return AArch64::BICSXrs; + case AArch64::SUBXri: + Is64Bit = true; + return AArch64::SUBSXri; + case AArch64::SUBXrr: + Is64Bit = true; + return AArch64::SUBSXrr; + case AArch64::SUBXrs: + Is64Bit = true; + return AArch64::SUBSXrs; + case AArch64::SUBXrx: + Is64Bit = true; + return AArch64::SUBSXrx; } } - /// Return true if this is a load/store that can be potentially paired/merged. bool isCandidateToMergeOrPair(MachineInstr &MI) const; @@ -191,13 +242,10 @@ public: bool getMemOpInfo(unsigned Opcode, unsigned &Scale, unsigned &Width, int64_t &MinOffset, int64_t &MaxOffset) const; - bool shouldClusterMemOps(MachineInstr &FirstLdSt, MachineInstr &SecondLdSt, + bool shouldClusterMemOps(MachineInstr &FirstLdSt, unsigned BaseReg1, + MachineInstr &SecondLdSt, unsigned BaseReg2, unsigned NumLoads) const override; - MachineInstr *emitFrameIndexDebugValue(MachineFunction &MF, int FrameIx, - uint64_t Offset, const MDNode *Var, - const MDNode *Expr, - const DebugLoc &DL) const; void copyPhysRegTuple(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, unsigned DestReg, unsigned SrcReg, bool KillSrc, unsigned Opcode, @@ -275,9 +323,9 @@ public: /// Return true when there is potentially a faster code sequence /// for an instruction chain ending in ``Root``. All potential patterns are /// listed in the ``Patterns`` array. - bool getMachineCombinerPatterns(MachineInstr &Root, - SmallVectorImpl<MachineCombinerPattern> &Patterns) - const override; + bool getMachineCombinerPatterns( + MachineInstr &Root, + SmallVectorImpl<MachineCombinerPattern> &Patterns) const override; /// Return true when Inst is associative and commutative so that it can be /// reassociated. 
bool isAssociativeAndCommutative(const MachineInstr &Inst) const override; @@ -302,27 +350,32 @@ public: ArrayRef<std::pair<MachineMemOperand::Flags, const char *>> getSerializableMachineMemOperandTargetFlags() const override; - bool isFunctionSafeToOutlineFrom(MachineFunction &MF) const override; - unsigned getOutliningBenefit(size_t SequenceSize, size_t Occurrences, - bool CanBeTailCall) const override; + bool + canOutlineWithoutLRSave(MachineBasicBlock::iterator &CallInsertionPt) const; + bool isFunctionSafeToOutlineFrom(MachineFunction &MF, + bool OutlineFromLinkOnceODRs) const override; + MachineOutlinerInfo getOutlininingCandidateInfo( + std::vector< + std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>> + &RepeatedSequenceLocs) const override; AArch64GenInstrInfo::MachineOutlinerInstrType getOutliningType(MachineInstr &MI) const override; - void insertOutlinerEpilogue(MachineBasicBlock &MBB, - MachineFunction &MF, - bool IsTailCall) const override; - void insertOutlinerPrologue(MachineBasicBlock &MBB, - MachineFunction &MF, - bool isTailCall) const override; + void insertOutlinerEpilogue(MachineBasicBlock &MBB, MachineFunction &MF, + const MachineOutlinerInfo &MInfo) const override; + void insertOutlinerPrologue(MachineBasicBlock &MBB, MachineFunction &MF, + const MachineOutlinerInfo &MInfo) const override; MachineBasicBlock::iterator insertOutlinedCall(Module &M, MachineBasicBlock &MBB, - MachineBasicBlock::iterator &It, - MachineFunction &MF, - bool IsTailCall) const override; + MachineBasicBlock::iterator &It, MachineFunction &MF, + const MachineOutlinerInfo &MInfo) const override; + /// Returns true if the instruction has a shift left that can be executed + /// more efficiently. + bool isExynosShiftLeftFast(const MachineInstr &MI) const; /// Returns true if the instruction has a shift by immediate that can be /// executed in one cycle less. bool isFalkorShiftExtFast(const MachineInstr &MI) const; -private: +private: /// \brief Sets the offsets on outlined instructions in \p MBB which use SP /// so that they will be valid post-outlining. /// @@ -350,8 +403,8 @@ void emitFrameOffset(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, /// FP. Return false if the offset could not be handled directly in MI, and /// return the left-over portion by reference. bool rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, - unsigned FrameReg, int &Offset, - const AArch64InstrInfo *TII); + unsigned FrameReg, int &Offset, + const AArch64InstrInfo *TII); /// \brief Use to report the frame offset status in isAArch64FrameOffsetLegal. enum AArch64FrameOffsetStatus { @@ -375,9 +428,9 @@ enum AArch64FrameOffsetStatus { /// (possibly with @p OutUnscaledOp if OutUseUnscaledOp is true) and that /// is a legal offset. 
int isAArch64FrameOffsetLegal(const MachineInstr &MI, int &Offset, - bool *OutUseUnscaledOp = nullptr, - unsigned *OutUnscaledOp = nullptr, - int *EmittableOffset = nullptr); + bool *OutUseUnscaledOp = nullptr, + unsigned *OutUnscaledOp = nullptr, + int *EmittableOffset = nullptr); static inline bool isUncondBranchOpcode(int Opc) { return Opc == AArch64::B; } @@ -398,7 +451,9 @@ static inline bool isCondBranchOpcode(int Opc) { } } -static inline bool isIndirectBranchOpcode(int Opc) { return Opc == AArch64::BR; } +static inline bool isIndirectBranchOpcode(int Opc) { + return Opc == AArch64::BR; +} } // end namespace llvm diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 59719978a3a6..79826ca2ed8d 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -18,12 +18,16 @@ def HasV8_1a : Predicate<"Subtarget->hasV8_1aOps()">, AssemblerPredicate<"HasV8_1aOps", "armv8.1a">; def HasV8_2a : Predicate<"Subtarget->hasV8_2aOps()">, AssemblerPredicate<"HasV8_2aOps", "armv8.2a">; +def HasV8_3a : Predicate<"Subtarget->hasV8_3aOps()">, + AssemblerPredicate<"HasV8_3aOps", "armv8.3a">; def HasFPARMv8 : Predicate<"Subtarget->hasFPARMv8()">, AssemblerPredicate<"FeatureFPARMv8", "fp-armv8">; def HasNEON : Predicate<"Subtarget->hasNEON()">, AssemblerPredicate<"FeatureNEON", "neon">; def HasCrypto : Predicate<"Subtarget->hasCrypto()">, AssemblerPredicate<"FeatureCrypto", "crypto">; +def HasDotProd : Predicate<"Subtarget->hasDotProd()">, + AssemblerPredicate<"FeatureDotProd", "dotprod">; def HasCRC : Predicate<"Subtarget->hasCRC()">, AssemblerPredicate<"FeatureCRC", "crc">; def HasLSE : Predicate<"Subtarget->hasLSE()">, @@ -42,6 +46,8 @@ def HasFuseAES : Predicate<"Subtarget->hasFuseAES()">, "fuse-aes">; def HasSVE : Predicate<"Subtarget->hasSVE()">, AssemblerPredicate<"FeatureSVE", "sve">; +def HasRCPC : Predicate<"Subtarget->hasRCPC()">, + AssemblerPredicate<"FeatureRCPC", "rcpc">; def IsLE : Predicate<"Subtarget->isLittleEndian()">; def IsBE : Predicate<"!Subtarget->isLittleEndian()">; @@ -322,11 +328,14 @@ def AArch64umaxv : SDNode<"AArch64ISD::UMAXV", SDT_AArch64UnaryVec>; // the Function object through the <Target>Subtarget and objections were raised // to that (see post-commit review comments for r301750). let RecomputePerFunction = 1 in { - def ForCodeSize : Predicate<"MF->getFunction()->optForSize()">; - def NotForCodeSize : Predicate<"!MF->getFunction()->optForSize()">; + def ForCodeSize : Predicate<"MF->getFunction().optForSize()">; + def NotForCodeSize : Predicate<"!MF->getFunction().optForSize()">; + // Avoid generating STRQro if it is slow, unless we're optimizing for code size. 
+ def UseSTRQro : Predicate<"!Subtarget->isSTRQroSlow() || MF->getFunction().optForSize()">; } include "AArch64InstrFormats.td" +include "SVEInstrFormats.td" //===----------------------------------------------------------------------===// @@ -432,6 +441,108 @@ def ISB : CRmSystemI<barrier_op, 0b110, "isb", [(int_aarch64_isb (i32 imm32_0_15:$CRm))]>; } +// ARMv8.2 Dot Product +let Predicates = [HasDotProd] in { +def UDOT2S : BaseSIMDThreeSameVectorDot<0, 1, "udot", ".2s", ".8b">; +def SDOT2S : BaseSIMDThreeSameVectorDot<0, 0, "sdot", ".2s", ".8b">; +def UDOT4S : BaseSIMDThreeSameVectorDot<1, 1, "udot", ".4s", ".16b">; +def SDOT4S : BaseSIMDThreeSameVectorDot<1, 0, "sdot", ".4s", ".16b">; +def UDOTIDX2S : BaseSIMDThreeSameVectorDotIndex<0, 1, "udot", ".2s", ".8b", ".4b">; +def SDOTIDX2S : BaseSIMDThreeSameVectorDotIndex<0, 0, "sdot", ".2s", ".8b", ".4b">; +def UDOTIDX4S : BaseSIMDThreeSameVectorDotIndex<1, 1, "udot", ".4s", ".16b", ".4b">; +def SDOTIDX4S : BaseSIMDThreeSameVectorDotIndex<1, 0, "sdot", ".4s", ".16b", ".4b">; +} + +let Predicates = [HasRCPC] in { + // v8.3 Release Consistent Processor Consistent support, optional in v8.2. + def LDAPRB : RCPCLoad<0b00, "ldaprb", GPR32>; + def LDAPRH : RCPCLoad<0b01, "ldaprh", GPR32>; + def LDAPRW : RCPCLoad<0b10, "ldapr", GPR32>; + def LDAPRX : RCPCLoad<0b11, "ldapr", GPR64>; +} + +// v8.3a complex add and multiply-accumulate. No predicate here, that is done +// inside the multiclass as the FP16 versions need different predicates. +defm FCMLA : SIMDThreeSameVectorTiedComplexHSD<1, 0b110, complexrotateop, + "fcmla", null_frag>; +defm FCADD : SIMDThreeSameVectorComplexHSD<1, 0b111, complexrotateopodd, + "fcadd", null_frag>; +defm FCMLA : SIMDIndexedTiedComplexHSD<1, 0, 1, complexrotateop, "fcmla", + null_frag>; + +let Predicates = [HasV8_3a] in { + // v8.3a Pointer Authentication + let Uses = [LR], Defs = [LR] in { + def PACIAZ : SystemNoOperands<0b000, "paciaz">; + def PACIBZ : SystemNoOperands<0b010, "pacibz">; + def AUTIAZ : SystemNoOperands<0b100, "autiaz">; + def AUTIBZ : SystemNoOperands<0b110, "autibz">; + } + let Uses = [LR, SP], Defs = [LR] in { + def PACIASP : SystemNoOperands<0b001, "paciasp">; + def PACIBSP : SystemNoOperands<0b011, "pacibsp">; + def AUTIASP : SystemNoOperands<0b101, "autiasp">; + def AUTIBSP : SystemNoOperands<0b111, "autibsp">; + } + let Uses = [X16, X17], Defs = [X17], CRm = 0b0001 in { + def PACIA1716 : SystemNoOperands<0b000, "pacia1716">; + def PACIB1716 : SystemNoOperands<0b010, "pacib1716">; + def AUTIA1716 : SystemNoOperands<0b100, "autia1716">; + def AUTIB1716 : SystemNoOperands<0b110, "autib1716">; + } + + let Uses = [LR], Defs = [LR], CRm = 0b0000 in { + def XPACLRI : SystemNoOperands<0b111, "xpaclri">; + } + + multiclass SignAuth<bits<3> prefix, bits<3> prefix_z, string asm> { + def IA : SignAuthOneData<prefix, 0b00, !strconcat(asm, "ia")>; + def IB : SignAuthOneData<prefix, 0b01, !strconcat(asm, "ib")>; + def DA : SignAuthOneData<prefix, 0b10, !strconcat(asm, "da")>; + def DB : SignAuthOneData<prefix, 0b11, !strconcat(asm, "db")>; + def IZA : SignAuthZero<prefix_z, 0b00, !strconcat(asm, "iza")>; + def DZA : SignAuthZero<prefix_z, 0b10, !strconcat(asm, "dza")>; + def IZB : SignAuthZero<prefix_z, 0b01, !strconcat(asm, "izb")>; + def DZB : SignAuthZero<prefix_z, 0b11, !strconcat(asm, "dzb")>; + } + + defm PAC : SignAuth<0b000, 0b010, "pac">; + defm AUT : SignAuth<0b001, 0b011, "aut">; + + def XPACI : SignAuthZero<0b100, 0b00, "xpaci">; + def XPACD : SignAuthZero<0b100, 0b01, "xpacd">; + def PACGA : 
SignAuthTwoOperand<0b1100, "pacga", null_frag>; + + // Combined Instructions + def BRAA : AuthBranchTwoOperands<0, 0, "braa">; + def BRAB : AuthBranchTwoOperands<0, 1, "brab">; + def BLRAA : AuthBranchTwoOperands<1, 0, "blraa">; + def BLRAB : AuthBranchTwoOperands<1, 1, "blrab">; + + def BRAAZ : AuthOneOperand<0b000, 0, "braaz">; + def BRABZ : AuthOneOperand<0b000, 1, "brabz">; + def BLRAAZ : AuthOneOperand<0b001, 0, "blraaz">; + def BLRABZ : AuthOneOperand<0b001, 1, "blrabz">; + + let isReturn = 1 in { + def RETAA : AuthReturn<0b010, 0, "retaa">; + def RETAB : AuthReturn<0b010, 1, "retab">; + def ERETAA : AuthReturn<0b100, 0, "eretaa">; + def ERETAB : AuthReturn<0b100, 1, "eretab">; + } + + defm LDRAA : AuthLoad<0, "ldraa", simm10Scaled>; + defm LDRAB : AuthLoad<1, "ldrab", simm10Scaled>; + + // v8.3a floating point conversion for javascript + let Predicates = [HasV8_3a, HasFPARMv8] in + def FJCVTZS : BaseFPToIntegerUnscaled<0b01, 0b11, 0b110, FPR64, GPR32, + "fjcvtzs", []> { + let Inst{31} = 0; + } + +} // HasV8_3A + def : InstAlias<"clrex", (CLREX 0xf)>; def : InstAlias<"isb", (ISB 0xf)>; @@ -468,8 +579,8 @@ let PostEncoderMethod = "fixMOVZ" in defm MOVZ : MoveImmediate<0b10, "movz">; // First group of aliases covers an implicit "lsl #0". -def : InstAlias<"movk $dst, $imm", (MOVKWi GPR32:$dst, imm0_65535:$imm, 0)>; -def : InstAlias<"movk $dst, $imm", (MOVKXi GPR64:$dst, imm0_65535:$imm, 0)>; +def : InstAlias<"movk $dst, $imm", (MOVKWi GPR32:$dst, imm0_65535:$imm, 0), 0>; +def : InstAlias<"movk $dst, $imm", (MOVKXi GPR64:$dst, imm0_65535:$imm, 0), 0>; def : InstAlias<"movn $dst, $imm", (MOVNWi GPR32:$dst, imm0_65535:$imm, 0)>; def : InstAlias<"movn $dst, $imm", (MOVNXi GPR64:$dst, imm0_65535:$imm, 0)>; def : InstAlias<"movz $dst, $imm", (MOVZWi GPR32:$dst, imm0_65535:$imm, 0)>; @@ -486,10 +597,10 @@ def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g2:$sym, 32)>; def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g1:$sym, 16)>; def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g0:$sym, 0)>; -def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g3:$sym, 48)>; -def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g2:$sym, 32)>; -def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g1:$sym, 16)>; -def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g0:$sym, 0)>; +def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g3:$sym, 48), 0>; +def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g2:$sym, 32), 0>; +def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g1:$sym, 16), 0>; +def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g0:$sym, 0), 0>; def : InstAlias<"movz $Rd, $sym", (MOVZWi GPR32:$Rd, movz_symbol_g1:$sym, 16)>; def : InstAlias<"movz $Rd, $sym", (MOVZWi GPR32:$Rd, movz_symbol_g0:$sym, 0)>; @@ -497,8 +608,8 @@ def : InstAlias<"movz $Rd, $sym", (MOVZWi GPR32:$Rd, movz_symbol_g0:$sym, 0)>; def : InstAlias<"movn $Rd, $sym", (MOVNWi GPR32:$Rd, movz_symbol_g1:$sym, 16)>; def : InstAlias<"movn $Rd, $sym", (MOVNWi GPR32:$Rd, movz_symbol_g0:$sym, 0)>; -def : InstAlias<"movk $Rd, $sym", (MOVKWi GPR32:$Rd, movk_symbol_g1:$sym, 16)>; -def : InstAlias<"movk $Rd, $sym", (MOVKWi GPR32:$Rd, movk_symbol_g0:$sym, 0)>; +def : InstAlias<"movk $Rd, $sym", (MOVKWi GPR32:$Rd, movk_symbol_g1:$sym, 16), 0>; +def : InstAlias<"movk $Rd, $sym", (MOVKWi GPR32:$Rd, movk_symbol_g0:$sym, 0), 0>; // Final group of aliases covers true "mov $Rd, $imm" cases. 
multiclass movw_mov_alias<string basename,Instruction INST, RegisterClass GPR, @@ -2039,6 +2150,17 @@ defm STRS : Store32RO<0b10, 1, 0b00, FPR32, "str", f32, store>; defm STRD : Store64RO<0b11, 1, 0b00, FPR64, "str", f64, store>; defm STRQ : Store128RO<0b00, 1, 0b10, FPR128, "str", f128, store>; +let Predicates = [UseSTRQro], AddedComplexity = 10 in { + def : Pat<(store (f128 FPR128:$Rt), + (ro_Windexed128 GPR64sp:$Rn, GPR32:$Rm, + ro_Wextend128:$extend)), + (STRQroW FPR128:$Rt, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend128:$extend)>; + def : Pat<(store (f128 FPR128:$Rt), + (ro_Xindexed128 GPR64sp:$Rn, GPR64:$Rm, + ro_Xextend128:$extend)), + (STRQroX FPR128:$Rt, GPR64sp:$Rn, GPR64:$Rm, ro_Wextend128:$extend)>; +} + multiclass TruncStoreFrom64ROPat<ROAddrMode ro, SDPatternOperator storeop, Instruction STRW, Instruction STRX> { @@ -2086,7 +2208,7 @@ defm : VecROStorePat<ro64, v1i64, FPR64, STRDroW, STRDroX>; defm : VecROStorePat<ro64, v1f64, FPR64, STRDroW, STRDroX>; // Match all store 128 bits width whose type is compatible with FPR128 -let Predicates = [IsLE] in { +let Predicates = [IsLE, UseSTRQro] in { // We must use ST1 to store vectors in big-endian. defm : VecROStorePat<ro128, v2i64, FPR128, STRQroW, STRQroX>; defm : VecROStorePat<ro128, v2f64, FPR128, STRQroW, STRQroX>; @@ -2127,11 +2249,11 @@ let AddedComplexity = 19 in { //--- // (unsigned immediate) -defm STRX : StoreUI<0b11, 0, 0b00, GPR64, uimm12s8, "str", - [(store GPR64:$Rt, +defm STRX : StoreUIz<0b11, 0, 0b00, GPR64z, uimm12s8, "str", + [(store GPR64z:$Rt, (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))]>; -defm STRW : StoreUI<0b10, 0, 0b00, GPR32, uimm12s4, "str", - [(store GPR32:$Rt, +defm STRW : StoreUIz<0b10, 0, 0b00, GPR32z, uimm12s4, "str", + [(store GPR32z:$Rt, (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))]>; defm STRB : StoreUI<0b00, 1, 0b00, FPR8, uimm12s1, "str", [(store FPR8:$Rt, @@ -2147,12 +2269,12 @@ defm STRD : StoreUI<0b11, 1, 0b00, FPR64, uimm12s8, "str", (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))]>; defm STRQ : StoreUI<0b00, 1, 0b10, FPR128, uimm12s16, "str", []>; -defm STRHH : StoreUI<0b01, 0, 0b00, GPR32, uimm12s2, "strh", - [(truncstorei16 GPR32:$Rt, +defm STRHH : StoreUIz<0b01, 0, 0b00, GPR32z, uimm12s2, "strh", + [(truncstorei16 GPR32z:$Rt, (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))]>; -defm STRBB : StoreUI<0b00, 0, 0b00, GPR32, uimm12s1, "strb", - [(truncstorei8 GPR32:$Rt, +defm STRBB : StoreUIz<0b00, 0, 0b00, GPR32z, uimm12s1, "strb", + [(truncstorei8 GPR32z:$Rt, (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))]>; @@ -2590,6 +2712,8 @@ defm FMOV : UnscaledConversion<"fmov">; // Add pseudo ops for FMOV 0 so we can mark them as isReMaterializable let isReMaterializable = 1, isCodeGenOnly = 1, isAsCheapAsAMove = 1 in { +def FMOVH0 : Pseudo<(outs FPR16:$Rd), (ins), [(set f16:$Rd, (fpimm0))]>, + Sched<[WriteF]>; def FMOVS0 : Pseudo<(outs FPR32:$Rd), (ins), [(set f32:$Rd, (fpimm0))]>, Sched<[WriteF]>; def FMOVD0 : Pseudo<(outs FPR64:$Rd), (ins), [(set f64:$Rd, (fpimm0))]>, @@ -4393,20 +4517,20 @@ def : InstAlias<"bic $Vd.8h, $imm", (BICv8i16 V128:$Vd, imm0_255:$imm, 0)>; def : InstAlias<"bic $Vd.2s, $imm", (BICv2i32 V64:$Vd, imm0_255:$imm, 0)>; def : InstAlias<"bic $Vd.4s, $imm", (BICv4i32 V128:$Vd, imm0_255:$imm, 0)>; -def : InstAlias<"bic.4h $Vd, $imm", (BICv4i16 V64:$Vd, imm0_255:$imm, 0), 0>; -def : InstAlias<"bic.8h $Vd, $imm", (BICv8i16 V128:$Vd, imm0_255:$imm, 0), 0>; -def : InstAlias<"bic.2s $Vd, $imm", (BICv2i32 V64:$Vd, imm0_255:$imm, 0), 0>; -def : InstAlias<"bic.4s $Vd, $imm", (BICv4i32 V128:$Vd, 
imm0_255:$imm, 0), 0>; +def : InstAlias<"bic.4h $Vd, $imm", (BICv4i16 V64:$Vd, imm0_255:$imm, 0)>; +def : InstAlias<"bic.8h $Vd, $imm", (BICv8i16 V128:$Vd, imm0_255:$imm, 0)>; +def : InstAlias<"bic.2s $Vd, $imm", (BICv2i32 V64:$Vd, imm0_255:$imm, 0)>; +def : InstAlias<"bic.4s $Vd, $imm", (BICv4i32 V128:$Vd, imm0_255:$imm, 0)>; def : InstAlias<"orr $Vd.4h, $imm", (ORRv4i16 V64:$Vd, imm0_255:$imm, 0)>; def : InstAlias<"orr $Vd.8h, $imm", (ORRv8i16 V128:$Vd, imm0_255:$imm, 0)>; def : InstAlias<"orr $Vd.2s, $imm", (ORRv2i32 V64:$Vd, imm0_255:$imm, 0)>; def : InstAlias<"orr $Vd.4s, $imm", (ORRv4i32 V128:$Vd, imm0_255:$imm, 0)>; -def : InstAlias<"orr.4h $Vd, $imm", (ORRv4i16 V64:$Vd, imm0_255:$imm, 0), 0>; -def : InstAlias<"orr.8h $Vd, $imm", (ORRv8i16 V128:$Vd, imm0_255:$imm, 0), 0>; -def : InstAlias<"orr.2s $Vd, $imm", (ORRv2i32 V64:$Vd, imm0_255:$imm, 0), 0>; -def : InstAlias<"orr.4s $Vd, $imm", (ORRv4i32 V128:$Vd, imm0_255:$imm, 0), 0>; +def : InstAlias<"orr.4h $Vd, $imm", (ORRv4i16 V64:$Vd, imm0_255:$imm, 0)>; +def : InstAlias<"orr.8h $Vd, $imm", (ORRv8i16 V128:$Vd, imm0_255:$imm, 0)>; +def : InstAlias<"orr.2s $Vd, $imm", (ORRv2i32 V64:$Vd, imm0_255:$imm, 0)>; +def : InstAlias<"orr.4s $Vd, $imm", (ORRv4i32 V128:$Vd, imm0_255:$imm, 0)>; // AdvSIMD FMOV def FMOVv2f64_ns : SIMDModifiedImmVectorNoShift<1, 1, 0, 0b1111, V128, fpimm8, @@ -6151,3 +6275,4 @@ def : Pat<(AArch64tcret texternalsym:$dst, (i32 timm:$FPDiff)), (TCRETURNdi texternalsym:$dst, imm:$FPDiff)>; include "AArch64InstrAtomics.td" +include "AArch64SVEInstrInfo.td" diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp b/contrib/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp index 7e275e4d2f46..c2d3ae31c624 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp @@ -20,6 +20,7 @@ #include "AArch64TargetMachine.h" #include "MCTargetDesc/AArch64AddressingModes.h" #include "llvm/CodeGen/GlobalISel/InstructionSelector.h" +#include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h" #include "llvm/CodeGen/GlobalISel/Utils.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunction.h" @@ -33,14 +34,8 @@ #define DEBUG_TYPE "aarch64-isel" -#include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h" - using namespace llvm; -#ifndef LLVM_BUILD_GLOBAL_ISEL -#error "You shouldn't build this" -#endif - namespace { #define GET_GLOBALISEL_PREDICATE_BITSET @@ -53,12 +48,13 @@ public: const AArch64Subtarget &STI, const AArch64RegisterBankInfo &RBI); - bool select(MachineInstr &I) const override; + bool select(MachineInstr &I, CodeGenCoverage &CoverageInfo) const override; + static const char *getName() { return DEBUG_TYPE; } private: /// tblgen-erated 'select' implementation, used as the initial selector for /// the patterns that don't require complex C++. 
- bool selectImpl(MachineInstr &I) const; + bool selectImpl(MachineInstr &I, CodeGenCoverage &CoverageInfo) const; bool selectVaStartAAPCS(MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const; @@ -68,7 +64,33 @@ private: bool selectCompareBranch(MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const; - ComplexRendererFn selectArithImmed(MachineOperand &Root) const; + ComplexRendererFns selectArithImmed(MachineOperand &Root) const; + + ComplexRendererFns selectAddrModeUnscaled(MachineOperand &Root, + unsigned Size) const; + + ComplexRendererFns selectAddrModeUnscaled8(MachineOperand &Root) const { + return selectAddrModeUnscaled(Root, 1); + } + ComplexRendererFns selectAddrModeUnscaled16(MachineOperand &Root) const { + return selectAddrModeUnscaled(Root, 2); + } + ComplexRendererFns selectAddrModeUnscaled32(MachineOperand &Root) const { + return selectAddrModeUnscaled(Root, 4); + } + ComplexRendererFns selectAddrModeUnscaled64(MachineOperand &Root) const { + return selectAddrModeUnscaled(Root, 8); + } + ComplexRendererFns selectAddrModeUnscaled128(MachineOperand &Root) const { + return selectAddrModeUnscaled(Root, 16); + } + + ComplexRendererFns selectAddrModeIndexed(MachineOperand &Root, + unsigned Size) const; + template <int Width> + ComplexRendererFns selectAddrModeIndexed(MachineOperand &Root) const { + return selectAddrModeIndexed(Root, Width / 8); + } const AArch64TargetMachine &TM; const AArch64Subtarget &STI; @@ -321,7 +343,9 @@ static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII, const TargetRegisterClass *RC = nullptr; if (RegBank.getID() == AArch64::FPRRegBankID) { - if (DstSize <= 32) + if (DstSize <= 16) + RC = &AArch64::FPR16RegClass; + else if (DstSize <= 32) RC = &AArch64::FPR32RegClass; else if (DstSize <= 64) RC = &AArch64::FPR64RegClass; @@ -513,6 +537,8 @@ bool AArch64InstructionSelector::selectCompareBranch( const unsigned CondReg = I.getOperand(0).getReg(); MachineBasicBlock *DestMBB = I.getOperand(1).getMBB(); MachineInstr *CCMI = MRI.getVRegDef(CondReg); + if (CCMI->getOpcode() == TargetOpcode::G_TRUNC) + CCMI = MRI.getVRegDef(CCMI->getOperand(1).getReg()); if (CCMI->getOpcode() != TargetOpcode::G_ICMP) return false; @@ -583,7 +609,8 @@ bool AArch64InstructionSelector::selectVaStartDarwin( return true; } -bool AArch64InstructionSelector::select(MachineInstr &I) const { +bool AArch64InstructionSelector::select(MachineInstr &I, + CodeGenCoverage &CoverageInfo) const { assert(I.getParent() && "Instruction should be in a basic block!"); assert(I.getParent()->getParent() && "Instruction should be in a function!"); @@ -592,13 +619,14 @@ bool AArch64InstructionSelector::select(MachineInstr &I) const { MachineRegisterInfo &MRI = MF.getRegInfo(); unsigned Opcode = I.getOpcode(); - if (!isPreISelGenericOpcode(I.getOpcode())) { + // G_PHI requires same handling as PHI + if (!isPreISelGenericOpcode(Opcode) || Opcode == TargetOpcode::G_PHI) { // Certain non-generic instructions also need some special handling. 
if (Opcode == TargetOpcode::LOAD_STACK_GUARD) return constrainSelectedInstRegOperands(I, TII, TRI, RBI); - if (Opcode == TargetOpcode::PHI) { + if (Opcode == TargetOpcode::PHI || Opcode == TargetOpcode::G_PHI) { const unsigned DefReg = I.getOperand(0).getReg(); const LLT DefTy = MRI.getType(DefReg); @@ -623,6 +651,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I) const { } } } + I.setDesc(TII.get(TargetOpcode::PHI)); return RBI.constrainGenericRegister(DefReg, *DefRC, MRI); } @@ -639,7 +668,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I) const { return false; } - if (selectImpl(I)) + if (selectImpl(I, CoverageInfo)) return true; LLT Ty = @@ -703,8 +732,14 @@ bool AArch64InstructionSelector::select(MachineInstr &I) const { << " constant on bank: " << RB << ", expected: FPR\n"); return false; } + + // The case when we have 0.0 is covered by tablegen. Reject it here so we + // can be sure tablegen works correctly and isn't rescued by this code. + if (I.getOperand(1).getFPImm()->getValueAPF().isExactlyValue(0.0)) + return false; } else { - if (Ty != s32 && Ty != s64 && Ty != p0) { + // s32 and s64 are covered by tablegen. + if (Ty != p0) { DEBUG(dbgs() << "Unable to materialize integer " << Ty << " constant, expected: " << s32 << ", " << s64 << ", or " << p0 << '\n'); @@ -758,7 +793,55 @@ bool AArch64InstructionSelector::select(MachineInstr &I) const { constrainSelectedInstRegOperands(I, TII, TRI, RBI); return true; } + case TargetOpcode::G_EXTRACT: { + LLT SrcTy = MRI.getType(I.getOperand(1).getReg()); + // Larger extracts are vectors, same-size extracts should be something else + // by now (either split up or simplified to a COPY). + if (SrcTy.getSizeInBits() > 64 || Ty.getSizeInBits() > 32) + return false; + + I.setDesc(TII.get(AArch64::UBFMXri)); + MachineInstrBuilder(MF, I).addImm(I.getOperand(2).getImm() + + Ty.getSizeInBits() - 1); + + unsigned DstReg = MRI.createGenericVirtualRegister(LLT::scalar(64)); + BuildMI(MBB, std::next(I.getIterator()), I.getDebugLoc(), + TII.get(AArch64::COPY)) + .addDef(I.getOperand(0).getReg()) + .addUse(DstReg, 0, AArch64::sub_32); + RBI.constrainGenericRegister(I.getOperand(0).getReg(), + AArch64::GPR32RegClass, MRI); + I.getOperand(0).setReg(DstReg); + + return constrainSelectedInstRegOperands(I, TII, TRI, RBI); + } + case TargetOpcode::G_INSERT: { + LLT SrcTy = MRI.getType(I.getOperand(2).getReg()); + // Larger inserts are vectors, same-size ones should be something else by + // now (split up or turned into COPYs). + if (Ty.getSizeInBits() > 64 || SrcTy.getSizeInBits() > 32) + return false; + + I.setDesc(TII.get(AArch64::BFMXri)); + unsigned LSB = I.getOperand(3).getImm(); + unsigned Width = MRI.getType(I.getOperand(2).getReg()).getSizeInBits(); + I.getOperand(3).setImm((64 - LSB) % 64); + MachineInstrBuilder(MF, I).addImm(Width - 1); + + unsigned SrcReg = MRI.createGenericVirtualRegister(LLT::scalar(64)); + BuildMI(MBB, I.getIterator(), I.getDebugLoc(), + TII.get(AArch64::SUBREG_TO_REG)) + .addDef(SrcReg) + .addImm(0) + .addUse(I.getOperand(2).getReg()) + .addImm(AArch64::sub_32); + RBI.constrainGenericRegister(I.getOperand(2).getReg(), + AArch64::GPR32RegClass, MRI); + I.getOperand(2).setReg(SrcReg); + + return constrainSelectedInstRegOperands(I, TII, TRI, RBI); + } case TargetOpcode::G_FRAME_INDEX: { // allocas and G_FRAME_INDEX are only supported in addrspace(0). 
if (Ty != LLT::pointer(0, 64)) { @@ -766,7 +849,6 @@ bool AArch64InstructionSelector::select(MachineInstr &I) const { << ", expected: " << LLT::pointer(0, 64) << '\n'); return false; } - I.setDesc(TII.get(AArch64::ADDXri)); // MOs for a #0 shifted immediate. @@ -1117,62 +1199,18 @@ bool AArch64InstructionSelector::select(MachineInstr &I) const { case TargetOpcode::G_INTTOPTR: - case TargetOpcode::G_BITCAST: + // The importer is currently unable to import pointer types since they + // didn't exist in SelectionDAG. return selectCopy(I, TII, MRI, TRI, RBI); - case TargetOpcode::G_FPEXT: { - if (MRI.getType(I.getOperand(0).getReg()) != LLT::scalar(64)) { - DEBUG(dbgs() << "G_FPEXT to type " << Ty - << ", expected: " << LLT::scalar(64) << '\n'); - return false; - } - - if (MRI.getType(I.getOperand(1).getReg()) != LLT::scalar(32)) { - DEBUG(dbgs() << "G_FPEXT from type " << Ty - << ", expected: " << LLT::scalar(32) << '\n'); - return false; - } - - const unsigned DefReg = I.getOperand(0).getReg(); - const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI); - - if (RB.getID() != AArch64::FPRRegBankID) { - DEBUG(dbgs() << "G_FPEXT on bank: " << RB << ", expected: FPR\n"); - return false; - } - - I.setDesc(TII.get(AArch64::FCVTDSr)); - constrainSelectedInstRegOperands(I, TII, TRI, RBI); - - return true; - } - - case TargetOpcode::G_FPTRUNC: { - if (MRI.getType(I.getOperand(0).getReg()) != LLT::scalar(32)) { - DEBUG(dbgs() << "G_FPTRUNC to type " << Ty - << ", expected: " << LLT::scalar(32) << '\n'); - return false; - } - - if (MRI.getType(I.getOperand(1).getReg()) != LLT::scalar(64)) { - DEBUG(dbgs() << "G_FPTRUNC from type " << Ty - << ", expected: " << LLT::scalar(64) << '\n'); - return false; - } - - const unsigned DefReg = I.getOperand(0).getReg(); - const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI); - - if (RB.getID() != AArch64::FPRRegBankID) { - DEBUG(dbgs() << "G_FPTRUNC on bank: " << RB << ", expected: FPR\n"); - return false; - } - - I.setDesc(TII.get(AArch64::FCVTSDr)); - constrainSelectedInstRegOperands(I, TII, TRI, RBI); - - return true; - } + case TargetOpcode::G_BITCAST: + // Imported SelectionDAG rules can handle every bitcast except those that + // bitcast from a type to the same type. Ideally, these shouldn't occur + // but we might not run an optimizer that deletes them. + if (MRI.getType(I.getOperand(0).getReg()) == + MRI.getType(I.getOperand(1).getReg())) + return selectCopy(I, TII, MRI, TRI, RBI); + return false; case TargetOpcode::G_SELECT: { if (MRI.getType(I.getOperand(1).getReg()) != LLT::scalar(1)) { @@ -1214,9 +1252,9 @@ bool AArch64InstructionSelector::select(MachineInstr &I) const { return true; } case TargetOpcode::G_ICMP: { - if (Ty != LLT::scalar(1)) { + if (Ty != LLT::scalar(32)) { DEBUG(dbgs() << "G_ICMP result has type: " << Ty - << ", expected: " << LLT::scalar(1) << '\n'); + << ", expected: " << LLT::scalar(32) << '\n'); return false; } @@ -1261,9 +1299,9 @@ bool AArch64InstructionSelector::select(MachineInstr &I) const { } case TargetOpcode::G_FCMP: { - if (Ty != LLT::scalar(1)) { + if (Ty != LLT::scalar(32)) { DEBUG(dbgs() << "G_FCMP result has type: " << Ty - << ", expected: " << LLT::scalar(1) << '\n'); + << ", expected: " << LLT::scalar(32) << '\n'); return false; } @@ -1336,7 +1374,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I) const { /// SelectArithImmed - Select an immediate value that can be represented as /// a 12-bit value shifted left by either 0 or 12. 
If so, return true with /// Val set to the 12-bit value and Shift set to the shifter operand. -InstructionSelector::ComplexRendererFn +InstructionSelector::ComplexRendererFns AArch64InstructionSelector::selectArithImmed(MachineOperand &Root) const { MachineInstr &MI = *Root.getParent(); MachineBasicBlock &MBB = *MI.getParent(); @@ -1356,13 +1394,13 @@ AArch64InstructionSelector::selectArithImmed(MachineOperand &Root) const { else if (Root.isReg()) { MachineInstr *Def = MRI.getVRegDef(Root.getReg()); if (Def->getOpcode() != TargetOpcode::G_CONSTANT) - return nullptr; + return None; MachineOperand &Op1 = Def->getOperand(1); if (!Op1.isCImm() || Op1.getCImm()->getBitWidth() > 64) - return nullptr; + return None; Immed = Op1.getCImm()->getZExtValue(); } else - return nullptr; + return None; unsigned ShiftAmt; @@ -1372,10 +1410,116 @@ AArch64InstructionSelector::selectArithImmed(MachineOperand &Root) const { ShiftAmt = 12; Immed = Immed >> 12; } else - return nullptr; + return None; unsigned ShVal = AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt); - return [=](MachineInstrBuilder &MIB) { MIB.addImm(Immed).addImm(ShVal); }; + return {{ + [=](MachineInstrBuilder &MIB) { MIB.addImm(Immed); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(ShVal); }, + }}; +} + +/// Select a "register plus unscaled signed 9-bit immediate" address. This +/// should only match when there is an offset that is not valid for a scaled +/// immediate addressing mode. The "Size" argument is the size in bytes of the +/// memory reference, which is needed here to know what is valid for a scaled +/// immediate. +InstructionSelector::ComplexRendererFns +AArch64InstructionSelector::selectAddrModeUnscaled(MachineOperand &Root, + unsigned Size) const { + MachineRegisterInfo &MRI = + Root.getParent()->getParent()->getParent()->getRegInfo(); + + if (!Root.isReg()) + return None; + + if (!isBaseWithConstantOffset(Root, MRI)) + return None; + + MachineInstr *RootDef = MRI.getVRegDef(Root.getReg()); + if (!RootDef) + return None; + + MachineOperand &OffImm = RootDef->getOperand(2); + if (!OffImm.isReg()) + return None; + MachineInstr *RHS = MRI.getVRegDef(OffImm.getReg()); + if (!RHS || RHS->getOpcode() != TargetOpcode::G_CONSTANT) + return None; + int64_t RHSC; + MachineOperand &RHSOp1 = RHS->getOperand(1); + if (!RHSOp1.isCImm() || RHSOp1.getCImm()->getBitWidth() > 64) + return None; + RHSC = RHSOp1.getCImm()->getSExtValue(); + + // If the offset is valid as a scaled immediate, don't match here. + if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 && RHSC < (0x1000 << Log2_32(Size))) + return None; + if (RHSC >= -256 && RHSC < 256) { + MachineOperand &Base = RootDef->getOperand(1); + return {{ + [=](MachineInstrBuilder &MIB) { MIB.add(Base); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(RHSC); }, + }}; + } + return None; +} + +/// Select a "register plus scaled unsigned 12-bit immediate" address. The +/// "Size" argument is the size in bytes of the memory reference, which +/// determines the scale. 
+InstructionSelector::ComplexRendererFns +AArch64InstructionSelector::selectAddrModeIndexed(MachineOperand &Root, + unsigned Size) const { + MachineRegisterInfo &MRI = + Root.getParent()->getParent()->getParent()->getRegInfo(); + + if (!Root.isReg()) + return None; + + MachineInstr *RootDef = MRI.getVRegDef(Root.getReg()); + if (!RootDef) + return None; + + if (RootDef->getOpcode() == TargetOpcode::G_FRAME_INDEX) { + return {{ + [=](MachineInstrBuilder &MIB) { MIB.add(RootDef->getOperand(1)); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, + }}; + } + + if (isBaseWithConstantOffset(Root, MRI)) { + MachineOperand &LHS = RootDef->getOperand(1); + MachineOperand &RHS = RootDef->getOperand(2); + MachineInstr *LHSDef = MRI.getVRegDef(LHS.getReg()); + MachineInstr *RHSDef = MRI.getVRegDef(RHS.getReg()); + if (LHSDef && RHSDef) { + int64_t RHSC = (int64_t)RHSDef->getOperand(1).getCImm()->getZExtValue(); + unsigned Scale = Log2_32(Size); + if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 && RHSC < (0x1000 << Scale)) { + if (LHSDef->getOpcode() == TargetOpcode::G_FRAME_INDEX) + return {{ + [=](MachineInstrBuilder &MIB) { MIB.add(LHSDef->getOperand(1)); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(RHSC >> Scale); }, + }}; + + return {{ + [=](MachineInstrBuilder &MIB) { MIB.add(LHS); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(RHSC >> Scale); }, + }}; + } + } + } + + // Before falling back to our general case, check if the unscaled + // instructions can handle this. If so, that's preferable. + if (selectAddrModeUnscaled(Root, Size).hasValue()) + return None; + + return {{ + [=](MachineInstrBuilder &MIB) { MIB.add(Root); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, + }}; } namespace llvm { diff --git a/contrib/llvm/lib/Target/AArch64/AArch64LegalizerInfo.cpp b/contrib/llvm/lib/Target/AArch64/AArch64LegalizerInfo.cpp index ffb27834c31c..05df51202229 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64LegalizerInfo.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64LegalizerInfo.cpp @@ -13,21 +13,122 @@ //===----------------------------------------------------------------------===// #include "AArch64LegalizerInfo.h" +#include "AArch64Subtarget.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/TargetOpcodes.h" #include "llvm/CodeGen/ValueTypes.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Type.h" -#include "llvm/Target/TargetOpcodes.h" using namespace llvm; -#ifndef LLVM_BUILD_GLOBAL_ISEL -#error "You shouldn't build this" -#endif +/// FIXME: The following static functions are SizeChangeStrategy functions +/// that are meant to temporarily mimic the behaviour of the old legalization +/// based on doubling/halving non-legal types as closely as possible. This is +/// not entirly possible as only legalizing the types that are exactly a power +/// of 2 times the size of the legal types would require specifying all those +/// sizes explicitly. +/// In practice, not specifying those isn't a problem, and the below functions +/// should disappear quickly as we add support for legalizing non-power-of-2 +/// sized types further. 
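To make the FIXME above concrete, here is a hedged, standalone mirror of the widen_1_8 strategy defined just below (plain C++ outside LLVM; the input marking 16/32/64-bit scalars Legal is hypothetical). It prints the (bit width, action) table the strategy returns.

// Illustration only: a self-contained copy of the size-change logic below,
// using local stand-ins for LegalizerInfo::SizeAndActionsVec and its enums.
#include <cstdint>
#include <cstdio>
#include <utility>
#include <vector>

enum Action { Legal, WidenScalar, Unsupported };
using SizeAndActionsVec = std::vector<std::pair<uint16_t, Action>>;

// Same interleaving rule as the addAndInterleaveWithUnsupported helper below.
static void addAndInterleaveWithUnsupported(SizeAndActionsVec &Result,
                                            const SizeAndActionsVec &V) {
  for (unsigned i = 0; i < V.size(); ++i) {
    Result.push_back(V[i]);
    if (i + 1 < V[i].first && i + 1 < V.size() &&
        V[i + 1].first != V[i].first + 1)
      Result.push_back({uint16_t(V[i].first + 1), Unsupported});
  }
}

// Mirror of widen_1_8 (asserts omitted): widen 1- and 8-bit scalars, then add
// a trailing Unsupported entry past the largest specified size.
static SizeAndActionsVec widen_1_8(const SizeAndActionsVec &V) {
  SizeAndActionsVec Result = {
      {1, WidenScalar}, {2, Unsupported}, {8, WidenScalar}, {9, Unsupported}};
  addAndInterleaveWithUnsupported(Result, V);
  Result.push_back({uint16_t(Result.back().first + 1), Unsupported});
  return Result;
}

int main() {
  const char *Names[] = {"Legal", "WidenScalar", "Unsupported"};
  // Hypothetical input: an operation whose 16/32/64-bit scalar forms are Legal.
  for (const auto &E : widen_1_8({{16, Legal}, {32, Legal}, {64, Legal}}))
    std::printf("%3u -> %s\n", unsigned(E.first), Names[E.second]);
  return 0;
}

For that assumed input it prints 1 -> WidenScalar, 2 -> Unsupported, 8 -> WidenScalar, 9 -> Unsupported, 16 -> Legal, 17 -> Unsupported, 32 -> Legal, 33 -> Unsupported, 64 -> Legal, 65 -> Unsupported, which is the "only exact power-of-two-style widths get a non-Unsupported action" approximation the FIXME describes.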
+static void +addAndInterleaveWithUnsupported(LegalizerInfo::SizeAndActionsVec &result, + const LegalizerInfo::SizeAndActionsVec &v) { + for (unsigned i = 0; i < v.size(); ++i) { + result.push_back(v[i]); + if (i + 1 < v[i].first && i + 1 < v.size() && + v[i + 1].first != v[i].first + 1) + result.push_back({v[i].first + 1, LegalizerInfo::Unsupported}); + } +} + +static LegalizerInfo::SizeAndActionsVec +widen_1_narrow_128_ToLargest(const LegalizerInfo::SizeAndActionsVec &v) { + assert(v.size() >= 1); + assert(v[0].first > 2); + LegalizerInfo::SizeAndActionsVec result = {{1, LegalizerInfo::WidenScalar}, + {2, LegalizerInfo::Unsupported}}; + addAndInterleaveWithUnsupported(result, v); + auto Largest = result.back().first; + assert(Largest + 1 < 128); + result.push_back({Largest + 1, LegalizerInfo::Unsupported}); + result.push_back({128, LegalizerInfo::NarrowScalar}); + result.push_back({129, LegalizerInfo::Unsupported}); + return result; +} + +static LegalizerInfo::SizeAndActionsVec +widen_16(const LegalizerInfo::SizeAndActionsVec &v) { + assert(v.size() >= 1); + assert(v[0].first > 17); + LegalizerInfo::SizeAndActionsVec result = {{1, LegalizerInfo::Unsupported}, + {16, LegalizerInfo::WidenScalar}, + {17, LegalizerInfo::Unsupported}}; + addAndInterleaveWithUnsupported(result, v); + auto Largest = result.back().first; + result.push_back({Largest + 1, LegalizerInfo::Unsupported}); + return result; +} + +static LegalizerInfo::SizeAndActionsVec +widen_1_8(const LegalizerInfo::SizeAndActionsVec &v) { + assert(v.size() >= 1); + assert(v[0].first > 9); + LegalizerInfo::SizeAndActionsVec result = { + {1, LegalizerInfo::WidenScalar}, {2, LegalizerInfo::Unsupported}, + {8, LegalizerInfo::WidenScalar}, {9, LegalizerInfo::Unsupported}}; + addAndInterleaveWithUnsupported(result, v); + auto Largest = result.back().first; + result.push_back({Largest + 1, LegalizerInfo::Unsupported}); + return result; +} + +static LegalizerInfo::SizeAndActionsVec +widen_1_8_16(const LegalizerInfo::SizeAndActionsVec &v) { + assert(v.size() >= 1); + assert(v[0].first > 17); + LegalizerInfo::SizeAndActionsVec result = { + {1, LegalizerInfo::WidenScalar}, {2, LegalizerInfo::Unsupported}, + {8, LegalizerInfo::WidenScalar}, {9, LegalizerInfo::Unsupported}, + {16, LegalizerInfo::WidenScalar}, {17, LegalizerInfo::Unsupported}}; + addAndInterleaveWithUnsupported(result, v); + auto Largest = result.back().first; + result.push_back({Largest + 1, LegalizerInfo::Unsupported}); + return result; +} -AArch64LegalizerInfo::AArch64LegalizerInfo() { +static LegalizerInfo::SizeAndActionsVec +widen_1_8_16_narrowToLargest(const LegalizerInfo::SizeAndActionsVec &v) { + assert(v.size() >= 1); + assert(v[0].first > 17); + LegalizerInfo::SizeAndActionsVec result = { + {1, LegalizerInfo::WidenScalar}, {2, LegalizerInfo::Unsupported}, + {8, LegalizerInfo::WidenScalar}, {9, LegalizerInfo::Unsupported}, + {16, LegalizerInfo::WidenScalar}, {17, LegalizerInfo::Unsupported}}; + addAndInterleaveWithUnsupported(result, v); + auto Largest = result.back().first; + result.push_back({Largest + 1, LegalizerInfo::NarrowScalar}); + return result; +} + +static LegalizerInfo::SizeAndActionsVec +widen_1_8_16_32(const LegalizerInfo::SizeAndActionsVec &v) { + assert(v.size() >= 1); + assert(v[0].first > 33); + LegalizerInfo::SizeAndActionsVec result = { + {1, LegalizerInfo::WidenScalar}, {2, LegalizerInfo::Unsupported}, + {8, LegalizerInfo::WidenScalar}, {9, LegalizerInfo::Unsupported}, + {16, LegalizerInfo::WidenScalar}, {17, LegalizerInfo::Unsupported}, + {32, 
LegalizerInfo::WidenScalar}, {33, LegalizerInfo::Unsupported}}; + addAndInterleaveWithUnsupported(result, v); + auto Largest = result.back().first; + result.push_back({Largest + 1, LegalizerInfo::Unsupported}); + return result; +} + +AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) { using namespace TargetOpcode; const LLT p0 = LLT::pointer(0, 64); const LLT s1 = LLT::scalar(1); @@ -35,6 +136,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo() { const LLT s16 = LLT::scalar(16); const LLT s32 = LLT::scalar(32); const LLT s64 = LLT::scalar(64); + const LLT s128 = LLT::scalar(128); const LLT v2s32 = LLT::vector(2, 32); const LLT v4s32 = LLT::vector(4, 32); const LLT v2s64 = LLT::vector(2, 64); @@ -42,21 +144,29 @@ AArch64LegalizerInfo::AArch64LegalizerInfo() { for (auto Ty : {p0, s1, s8, s16, s32, s64}) setAction({G_IMPLICIT_DEF, Ty}, Legal); + for (auto Ty : {s16, s32, s64, p0}) + setAction({G_PHI, Ty}, Legal); + + setLegalizeScalarToDifferentSizeStrategy(G_PHI, 0, widen_1_8); + + for (auto Ty : { s32, s64 }) + setAction({G_BSWAP, Ty}, Legal); + for (unsigned BinOp : {G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR, G_SHL}) { // These operations naturally get the right answer when used on // GPR32, even if the actual type is narrower. for (auto Ty : {s32, s64, v2s32, v4s32, v2s64}) setAction({BinOp, Ty}, Legal); - for (auto Ty : {s1, s8, s16}) - setAction({BinOp, Ty}, WidenScalar); + if (BinOp != G_ADD) + setLegalizeScalarToDifferentSizeStrategy(BinOp, 0, + widen_1_8_16_narrowToLargest); } setAction({G_GEP, p0}, Legal); setAction({G_GEP, 1, s64}, Legal); - for (auto Ty : {s1, s8, s16, s32}) - setAction({G_GEP, 1, Ty}, WidenScalar); + setLegalizeScalarToDifferentSizeStrategy(G_GEP, 1, widen_1_8_16_32); setAction({G_PTR_MASK, p0}, Legal); @@ -64,16 +174,17 @@ AArch64LegalizerInfo::AArch64LegalizerInfo() { for (auto Ty : {s32, s64}) setAction({BinOp, Ty}, Legal); - for (auto Ty : {s1, s8, s16}) - setAction({BinOp, Ty}, WidenScalar); + setLegalizeScalarToDifferentSizeStrategy(BinOp, 0, widen_1_8_16); } for (unsigned BinOp : {G_SREM, G_UREM}) for (auto Ty : { s1, s8, s16, s32, s64 }) setAction({BinOp, Ty}, Lower); - for (unsigned Op : {G_SMULO, G_UMULO}) - setAction({Op, s64}, Lower); + for (unsigned Op : {G_SMULO, G_UMULO}) { + setAction({Op, 0, s64}, Lower); + setAction({Op, 1, s1}, Legal); + } for (unsigned Op : {G_UADDE, G_USUBE, G_SADDO, G_SSUBO, G_SMULH, G_UMULH}) { for (auto Ty : { s32, s64 }) @@ -95,8 +206,9 @@ AArch64LegalizerInfo::AArch64LegalizerInfo() { setAction({G_INSERT, Ty}, Legal); setAction({G_INSERT, 1, Ty}, Legal); } + setLegalizeScalarToDifferentSizeStrategy(G_INSERT, 0, + widen_1_8_16_narrowToLargest); for (auto Ty : {s1, s8, s16}) { - setAction({G_INSERT, Ty}, WidenScalar); setAction({G_INSERT, 1, Ty}, Legal); // FIXME: Can't widen the sources because that violates the constraints on // G_INSERT (It seems entirely reasonable that inputs shouldn't overlap). @@ -112,7 +224,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo() { for (auto Ty : {s8, s16, s32, s64, p0, v2s32}) setAction({MemOp, Ty}, Legal); - setAction({MemOp, s1}, WidenScalar); + setLegalizeScalarToDifferentSizeStrategy(MemOp, 0, + widen_1_narrow_128_ToLargest); // And everything's fine in addrspace 0. 
setAction({MemOp, 1, p0}, Legal); @@ -126,21 +239,19 @@ AArch64LegalizerInfo::AArch64LegalizerInfo() { setAction({G_CONSTANT, p0}, Legal); - for (auto Ty : {s1, s8, s16}) - setAction({TargetOpcode::G_CONSTANT, Ty}, WidenScalar); - - setAction({TargetOpcode::G_FCONSTANT, s16}, WidenScalar); + setLegalizeScalarToDifferentSizeStrategy(G_CONSTANT, 0, widen_1_8_16); + setLegalizeScalarToDifferentSizeStrategy(G_FCONSTANT, 0, widen_16); - setAction({G_ICMP, s1}, Legal); setAction({G_ICMP, 1, s32}, Legal); setAction({G_ICMP, 1, s64}, Legal); setAction({G_ICMP, 1, p0}, Legal); - for (auto Ty : {s1, s8, s16}) { - setAction({G_ICMP, 1, Ty}, WidenScalar); - } + setLegalizeScalarToDifferentSizeStrategy(G_ICMP, 0, widen_1_8_16); + setLegalizeScalarToDifferentSizeStrategy(G_FCMP, 0, widen_1_8_16); + setLegalizeScalarToDifferentSizeStrategy(G_ICMP, 1, widen_1_8_16); - setAction({G_FCMP, s1}, Legal); + setAction({G_ICMP, s32}, Legal); + setAction({G_FCMP, s32}, Legal); setAction({G_FCMP, 1, s32}, Legal); setAction({G_FCMP, 1, s64}, Legal); @@ -151,27 +262,16 @@ AArch64LegalizerInfo::AArch64LegalizerInfo() { setAction({G_ANYEXT, Ty}, Legal); } - for (auto Ty : { s1, s8, s16, s32 }) { - setAction({G_ZEXT, 1, Ty}, Legal); - setAction({G_SEXT, 1, Ty}, Legal); - setAction({G_ANYEXT, 1, Ty}, Legal); - } - - setAction({G_FPEXT, s64}, Legal); - setAction({G_FPEXT, 1, s32}, Legal); - - // Truncations - for (auto Ty : { s16, s32 }) + // FP conversions + for (auto Ty : { s16, s32 }) { setAction({G_FPTRUNC, Ty}, Legal); + setAction({G_FPEXT, 1, Ty}, Legal); + } - for (auto Ty : { s32, s64 }) + for (auto Ty : { s32, s64 }) { setAction({G_FPTRUNC, 1, Ty}, Legal); - - for (auto Ty : { s1, s8, s16, s32 }) - setAction({G_TRUNC, Ty}, Legal); - - for (auto Ty : { s8, s16, s32, s64 }) - setAction({G_TRUNC, 1, Ty}, Legal); + setAction({G_FPEXT, Ty}, Legal); + } // Conversions for (auto Ty : { s32, s64 }) { @@ -180,12 +280,10 @@ AArch64LegalizerInfo::AArch64LegalizerInfo() { setAction({G_SITOFP, 1, Ty}, Legal); setAction({G_UITOFP, 1, Ty}, Legal); } - for (auto Ty : { s1, s8, s16 }) { - setAction({G_FPTOSI, 0, Ty}, WidenScalar); - setAction({G_FPTOUI, 0, Ty}, WidenScalar); - setAction({G_SITOFP, 1, Ty}, WidenScalar); - setAction({G_UITOFP, 1, Ty}, WidenScalar); - } + setLegalizeScalarToDifferentSizeStrategy(G_FPTOSI, 0, widen_1_8_16); + setLegalizeScalarToDifferentSizeStrategy(G_FPTOUI, 0, widen_1_8_16); + setLegalizeScalarToDifferentSizeStrategy(G_SITOFP, 1, widen_1_8_16); + setLegalizeScalarToDifferentSizeStrategy(G_UITOFP, 1, widen_1_8_16); for (auto Ty : { s32, s64 }) { setAction({G_FPTOSI, 1, Ty}, Legal); @@ -200,8 +298,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo() { setAction({G_BRINDIRECT, p0}, Legal); // Select - for (auto Ty : {s1, s8, s16}) - setAction({G_SELECT, Ty}, WidenScalar); + setLegalizeScalarToDifferentSizeStrategy(G_SELECT, 0, widen_1_8_16); for (auto Ty : {s32, s64, p0}) setAction({G_SELECT, Ty}, Legal); @@ -221,7 +318,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo() { setAction({G_INTTOPTR, 1, s64}, Legal); // Casts for 32 and 64-bit width type are just copies. - for (auto Ty : {s1, s8, s16, s32, s64}) { + // Same for 128-bit width type, except they are on the FPR bank. 
+ for (auto Ty : {s1, s8, s16, s32, s64, s128}) { setAction({G_BITCAST, 0, Ty}, Legal); setAction({G_BITCAST, 1, Ty}, Legal); } @@ -252,6 +350,41 @@ AArch64LegalizerInfo::AArch64LegalizerInfo() { for (auto Ty : {s8, s16, s32, s64, p0}) setAction({G_VAARG, Ty}, Custom); + if (ST.hasLSE()) { + for (auto Ty : {s8, s16, s32, s64}) { + setAction({G_ATOMIC_CMPXCHG_WITH_SUCCESS, Ty}, Lower); + setAction({G_ATOMIC_CMPXCHG, Ty}, Legal); + } + setAction({G_ATOMIC_CMPXCHG, 1, p0}, Legal); + + for (unsigned Op : + {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, G_ATOMICRMW_AND, + G_ATOMICRMW_OR, G_ATOMICRMW_XOR, G_ATOMICRMW_MIN, G_ATOMICRMW_MAX, + G_ATOMICRMW_UMIN, G_ATOMICRMW_UMAX}) { + for (auto Ty : {s8, s16, s32, s64}) { + setAction({Op, Ty}, Legal); + } + setAction({Op, 1, p0}, Legal); + } + } + + // Merge/Unmerge + for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) + for (int Sz : {8, 16, 32, 64, 128, 192, 256, 384, 512}) { + LLT ScalarTy = LLT::scalar(Sz); + setAction({Op, ScalarTy}, Legal); + setAction({Op, 1, ScalarTy}, Legal); + if (Sz < 32) + continue; + for (int EltSize = 8; EltSize <= 64; EltSize *= 2) { + if (EltSize >= Sz) + continue; + LLT VecTy = LLT::vector(Sz / EltSize, EltSize); + setAction({Op, VecTy}, Legal); + setAction({Op, 1, VecTy}, Legal); + } + } + computeTables(); } diff --git a/contrib/llvm/lib/Target/AArch64/AArch64LegalizerInfo.h b/contrib/llvm/lib/Target/AArch64/AArch64LegalizerInfo.h index 42d4ac130c5c..a745b0edbc6d 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64LegalizerInfo.h +++ b/contrib/llvm/lib/Target/AArch64/AArch64LegalizerInfo.h @@ -20,11 +20,12 @@ namespace llvm { class LLVMContext; +class AArch64Subtarget; /// This class provides the information for the target register banks. class AArch64LegalizerInfo : public LegalizerInfo { public: - AArch64LegalizerInfo(); + AArch64LegalizerInfo(const AArch64Subtarget &ST); bool legalizeCustom(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder) const override; diff --git a/contrib/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/contrib/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp index 9a7f45bde6c9..8a29456430b9 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp @@ -1,4 +1,4 @@ -//=- AArch64LoadStoreOptimizer.cpp - AArch64 load/store opt. pass -*- C++ -*-=// +//===- AArch64LoadStoreOptimizer.cpp - AArch64 load/store opt. 
pass -------===// // // The LLVM Compiler Infrastructure // @@ -20,12 +20,14 @@ #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/iterator_range.h" +#include "llvm/Analysis/AliasAnalysis.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/IR/DebugLoc.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/Pass.h" @@ -33,7 +35,6 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetRegisterInfo.h" #include <cassert> #include <cstdint> #include <iterator> @@ -64,7 +65,7 @@ static cl::opt<unsigned> UpdateLimit("aarch64-update-scan-limit", cl::init(100), namespace { -typedef struct LdStPairFlags { +using LdStPairFlags = struct LdStPairFlags { // If a matching instruction is found, MergeForward is set to true if the // merge is to remove the first instruction and replace the second with // a pair-wise insn, and false if the reverse is true. @@ -83,8 +84,7 @@ typedef struct LdStPairFlags { void setSExtIdx(int V) { SExtIdx = V; } int getSExtIdx() const { return SExtIdx; } - -} LdStPairFlags; +}; struct AArch64LoadStoreOpt : public MachineFunctionPass { static char ID; @@ -101,7 +101,7 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass { // Track which registers have been modified and used. BitVector ModifiedRegs, UsedRegs; - virtual void getAnalysisUsage(AnalysisUsage &AU) const override { + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<AAResultsWrapperPass>(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -168,6 +168,9 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass { // Find and promote load instructions which read directly from store. bool tryToPromoteLoadFromStore(MachineBasicBlock::iterator &MBBI); + // Find and merge a base register updates before or after a ld/st instruction. + bool tryToMergeLdStUpdate(MachineBasicBlock::iterator &MBBI); + bool optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt); bool runOnMachineFunction(MachineFunction &Fn) override; @@ -578,6 +581,75 @@ static bool isPromotableZeroStoreInst(MachineInstr &MI) { getLdStRegOp(MI).getReg() == AArch64::WZR; } +static bool isPromotableLoadFromStore(MachineInstr &MI) { + switch (MI.getOpcode()) { + default: + return false; + // Scaled instructions. + case AArch64::LDRBBui: + case AArch64::LDRHHui: + case AArch64::LDRWui: + case AArch64::LDRXui: + // Unscaled instructions. + case AArch64::LDURBBi: + case AArch64::LDURHHi: + case AArch64::LDURWi: + case AArch64::LDURXi: + return true; + } +} + +static bool isMergeableLdStUpdate(MachineInstr &MI) { + unsigned Opc = MI.getOpcode(); + switch (Opc) { + default: + return false; + // Scaled instructions. + case AArch64::STRSui: + case AArch64::STRDui: + case AArch64::STRQui: + case AArch64::STRXui: + case AArch64::STRWui: + case AArch64::STRHHui: + case AArch64::STRBBui: + case AArch64::LDRSui: + case AArch64::LDRDui: + case AArch64::LDRQui: + case AArch64::LDRXui: + case AArch64::LDRWui: + case AArch64::LDRHHui: + case AArch64::LDRBBui: + // Unscaled instructions. 
+ case AArch64::STURSi: + case AArch64::STURDi: + case AArch64::STURQi: + case AArch64::STURWi: + case AArch64::STURXi: + case AArch64::LDURSi: + case AArch64::LDURDi: + case AArch64::LDURQi: + case AArch64::LDURWi: + case AArch64::LDURXi: + // Paired instructions. + case AArch64::LDPSi: + case AArch64::LDPSWi: + case AArch64::LDPDi: + case AArch64::LDPQi: + case AArch64::LDPWi: + case AArch64::LDPXi: + case AArch64::STPSi: + case AArch64::STPDi: + case AArch64::STPQi: + case AArch64::STPWi: + case AArch64::STPXi: + // Make sure this is a reg+imm (as opposed to an address reloc). + if (!getLdStOffsetOp(MI).isImm()) + return false; + + return true; + } +} + MachineBasicBlock::iterator AArch64LoadStoreOpt::mergeNarrowZeroStores(MachineBasicBlock::iterator I, MachineBasicBlock::iterator MergeMI, @@ -758,8 +830,8 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, if (SExtIdx != -1) { // Generate the sign extension for the proper result of the ldp. // I.e., with X1, that would be: - // %W1<def> = KILL %W1, %X1<imp-def> - // %X1<def> = SBFMXri %X1<kill>, 0, 31 + // %w1 = KILL %w1, implicit-def %x1 + // %x1 = SBFMXri killed %x1, 0, 31 MachineOperand &DstMO = MIB->getOperand(SExtIdx); // Right now, DstMO has the extended register, since it comes from an // extended opcode. @@ -1294,10 +1366,13 @@ AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I, } (void)MIB; - if (IsPreIdx) + if (IsPreIdx) { + ++NumPreFolded; DEBUG(dbgs() << "Creating pre-indexed load/store."); - else + } else { + ++NumPostFolded; DEBUG(dbgs() << "Creating post-indexed load/store."); + } DEBUG(dbgs() << " Replacing instructions:\n "); DEBUG(I->print(dbgs())); DEBUG(dbgs() << " "); @@ -1558,6 +1633,60 @@ bool AArch64LoadStoreOpt::tryToPairLdStInst(MachineBasicBlock::iterator &MBBI) { return false; } +bool AArch64LoadStoreOpt::tryToMergeLdStUpdate + (MachineBasicBlock::iterator &MBBI) { + MachineInstr &MI = *MBBI; + MachineBasicBlock::iterator E = MI.getParent()->end(); + MachineBasicBlock::iterator Update; + + // Look forward to try to form a post-index instruction. For example, + // ldr x0, [x20] + // add x20, x20, #32 + // merged into: + // ldr x0, [x20], #32 + Update = findMatchingUpdateInsnForward(MBBI, 0, UpdateLimit); + if (Update != E) { + // Merge the update into the ld/st. + MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/false); + return true; + } + + // Don't know how to handle unscaled pre/post-index versions below, so bail. + if (TII->isUnscaledLdSt(MI.getOpcode())) + return false; + + // Look back to try to find a pre-index instruction. For example, + // add x0, x0, #8 + // ldr x1, [x0] + // merged into: + // ldr x1, [x0, #8]! + Update = findMatchingUpdateInsnBackward(MBBI, UpdateLimit); + if (Update != E) { + // Merge the update into the ld/st. + MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true); + return true; + } + + // The immediate in the load/store is scaled by the size of the memory + // operation. The immediate in the add we're looking for, + // however, is not, so adjust here. + int UnscaledOffset = getLdStOffsetOp(MI).getImm() * getMemScale(MI); + + // Look forward to try to find a post-index instruction. For example, + // ldr x1, [x0, #64] + // add x0, x0, #64 + // merged into: + // ldr x1, [x0, #64]! + Update = findMatchingUpdateInsnForward(MBBI, UnscaledOffset, UpdateLimit); + if (Update != E) { + // Merge the update into the ld/st. 
+ MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true); + return true; + } + + return false; +} + bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt) { bool Modified = false; @@ -1573,29 +1702,10 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB, // lsr w2, w1, #16 for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); MBBI != E;) { - MachineInstr &MI = *MBBI; - switch (MI.getOpcode()) { - default: - // Just move on to the next instruction. - ++MBBI; - break; - // Scaled instructions. - case AArch64::LDRBBui: - case AArch64::LDRHHui: - case AArch64::LDRWui: - case AArch64::LDRXui: - // Unscaled instructions. - case AArch64::LDURBBi: - case AArch64::LDURHHi: - case AArch64::LDURWi: - case AArch64::LDURXi: - if (tryToPromoteLoadFromStore(MBBI)) { - Modified = true; - break; - } + if (isPromotableLoadFromStore(*MBBI) && tryToPromoteLoadFromStore(MBBI)) + Modified = true; + else ++MBBI; - break; - } } // 2) Merge adjacent zero stores into a wider store. // e.g., @@ -1608,17 +1718,14 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB, // str wzr, [x0, #4] // ; becomes // str xzr, [x0] - for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); - EnableNarrowZeroStOpt && MBBI != E;) { - if (isPromotableZeroStoreInst(*MBBI)) { - if (tryToMergeZeroStInst(MBBI)) { + if (EnableNarrowZeroStOpt) + for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); + MBBI != E;) { + if (isPromotableZeroStoreInst(*MBBI) && tryToMergeZeroStInst(MBBI)) Modified = true; - } else + else ++MBBI; - } else - ++MBBI; - } - + } // 3) Find loads and stores that can be merged into a single load or store // pair instruction. // e.g., @@ -1642,124 +1749,17 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB, // ldr x0, [x2], #4 for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); MBBI != E;) { - MachineInstr &MI = *MBBI; - // Do update merging. It's simpler to keep this separate from the above - // switchs, though not strictly necessary. - unsigned Opc = MI.getOpcode(); - switch (Opc) { - default: - // Just move on to the next instruction. - ++MBBI; - break; - // Scaled instructions. - case AArch64::STRSui: - case AArch64::STRDui: - case AArch64::STRQui: - case AArch64::STRXui: - case AArch64::STRWui: - case AArch64::STRHHui: - case AArch64::STRBBui: - case AArch64::LDRSui: - case AArch64::LDRDui: - case AArch64::LDRQui: - case AArch64::LDRXui: - case AArch64::LDRWui: - case AArch64::LDRHHui: - case AArch64::LDRBBui: - // Unscaled instructions. - case AArch64::STURSi: - case AArch64::STURDi: - case AArch64::STURQi: - case AArch64::STURWi: - case AArch64::STURXi: - case AArch64::LDURSi: - case AArch64::LDURDi: - case AArch64::LDURQi: - case AArch64::LDURWi: - case AArch64::LDURXi: - // Paired instructions. - case AArch64::LDPSi: - case AArch64::LDPSWi: - case AArch64::LDPDi: - case AArch64::LDPQi: - case AArch64::LDPWi: - case AArch64::LDPXi: - case AArch64::STPSi: - case AArch64::STPDi: - case AArch64::STPQi: - case AArch64::STPWi: - case AArch64::STPXi: { - // Make sure this is a reg+imm (as opposed to an address reloc). - if (!getLdStOffsetOp(MI).isImm()) { - ++MBBI; - break; - } - // Look forward to try to form a post-index instruction. For example, - // ldr x0, [x20] - // add x20, x20, #32 - // merged into: - // ldr x0, [x20], #32 - MachineBasicBlock::iterator Update = - findMatchingUpdateInsnForward(MBBI, 0, UpdateLimit); - if (Update != E) { - // Merge the update into the ld/st. 
- MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/false); - Modified = true; - ++NumPostFolded; - break; - } - - // Don't know how to handle unscaled pre/post-index versions below, so - // move to the next instruction. - if (TII->isUnscaledLdSt(Opc)) { - ++MBBI; - break; - } - - // Look back to try to find a pre-index instruction. For example, - // add x0, x0, #8 - // ldr x1, [x0] - // merged into: - // ldr x1, [x0, #8]! - Update = findMatchingUpdateInsnBackward(MBBI, UpdateLimit); - if (Update != E) { - // Merge the update into the ld/st. - MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true); - Modified = true; - ++NumPreFolded; - break; - } - // The immediate in the load/store is scaled by the size of the memory - // operation. The immediate in the add we're looking for, - // however, is not, so adjust here. - int UnscaledOffset = getLdStOffsetOp(MI).getImm() * getMemScale(MI); - - // Look forward to try to find a post-index instruction. For example, - // ldr x1, [x0, #64] - // add x0, x0, #64 - // merged into: - // ldr x1, [x0, #64]! - Update = findMatchingUpdateInsnForward(MBBI, UnscaledOffset, UpdateLimit); - if (Update != E) { - // Merge the update into the ld/st. - MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true); - Modified = true; - ++NumPreFolded; - break; - } - - // Nothing found. Just move to the next instruction. + if (isMergeableLdStUpdate(*MBBI) && tryToMergeLdStUpdate(MBBI)) + Modified = true; + else ++MBBI; - break; - } - } } return Modified; } bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) { - if (skipFunction(*Fn.getFunction())) + if (skipFunction(Fn.getFunction())) return false; Subtarget = &static_cast<const AArch64Subtarget &>(Fn.getSubtarget()); diff --git a/contrib/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp b/contrib/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp index f82b9dbc2c9f..65dae03a24db 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp @@ -18,7 +18,9 @@ #include "llvm/CodeGen/AsmPrinter.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/TargetLoweringObjectFile.h" #include "llvm/IR/Mangler.h" +#include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" #include "llvm/Support/CodeGen.h" @@ -33,7 +35,25 @@ AArch64MCInstLower::AArch64MCInstLower(MCContext &ctx, AsmPrinter &printer) MCSymbol * AArch64MCInstLower::GetGlobalAddressSymbol(const MachineOperand &MO) const { - return Printer.getSymbol(MO.getGlobal()); + const GlobalValue *GV = MO.getGlobal(); + unsigned TargetFlags = MO.getTargetFlags(); + const Triple &TheTriple = Printer.TM.getTargetTriple(); + if (!TheTriple.isOSBinFormatCOFF()) + return Printer.getSymbol(GV); + + assert(TheTriple.isOSWindows() && + "Windows is the only supported COFF target"); + + bool IsIndirect = (TargetFlags & AArch64II::MO_DLLIMPORT); + if (!IsIndirect) + return Printer.getSymbol(GV); + + SmallString<128> Name; + Name = "__imp_"; + Printer.TM.getNameWithPrefix(Name, GV, + Printer.getObjFileLowering().getMangler()); + + return Ctx.getOrCreateSymbol(Name); } MCSymbol * diff --git a/contrib/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/contrib/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h index f0bffe544158..9f354c009461 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h +++ b/contrib/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h @@ -23,6 +23,8 @@ namespace llvm { +class MachineInstr; 
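The GetGlobalAddressSymbol change above encodes a standard COFF convention: a dllimport global is not referenced directly but through its import-address-table entry, whose symbol is the mangled name prefixed with "__imp_". A minimal standalone sketch of that naming rule (illustration only; the function and parameter names here are made up and this is not the LLVM API):

  #include <string>

  // Pick the symbol a COFF reference should use. On other object formats,
  // or for non-dllimport globals, the plain mangled name is used directly.
  std::string loweredSymbolName(const std::string &MangledName, bool IsCOFF,
                                bool IsDllImport) {
    if (IsCOFF && IsDllImport)
      return "__imp_" + MangledName; // load the pointer stored in the IAT slot
    return MangledName;              // direct reference
  }

So a use of a __declspec(dllimport) global X ends up materializing its address from __imp_X rather than referencing X itself.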
+ /// AArch64FunctionInfo - This class is derived from MachineFunctionInfo and /// contains private AArch64-specific information for each MachineFunction. class AArch64FunctionInfo final : public MachineFunctionInfo { @@ -145,7 +147,7 @@ public: unsigned getVarArgsFPRSize() const { return VarArgsFPRSize; } void setVarArgsFPRSize(unsigned Size) { VarArgsFPRSize = Size; } - typedef SmallPtrSet<const MachineInstr *, 16> SetOfInstructions; + using SetOfInstructions = SmallPtrSet<const MachineInstr *, 16>; const SetOfInstructions &getLOHRelated() const { return LOHRelated; } @@ -157,7 +159,7 @@ public: SmallVector<const MachineInstr *, 3> Args; public: - typedef ArrayRef<const MachineInstr *> LOHArgs; + using LOHArgs = ArrayRef<const MachineInstr *>; MILOHDirective(MCLOHType Kind, LOHArgs Args) : Kind(Kind), Args(Args.begin(), Args.end()) { @@ -168,8 +170,8 @@ public: LOHArgs getArgs() const { return Args; } }; - typedef MILOHDirective::LOHArgs MILOHArgs; - typedef SmallVector<MILOHDirective, 32> MILOHContainer; + using MILOHArgs = MILOHDirective::LOHArgs; + using MILOHContainer = SmallVector<MILOHDirective, 32>; const MILOHContainer &getLOHContainer() const { return LOHContainerSet; } diff --git a/contrib/llvm/lib/Target/AArch64/AArch64MacroFusion.cpp b/contrib/llvm/lib/Target/AArch64/AArch64MacroFusion.cpp index 963cfadc54fd..6930c816b5ae 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64MacroFusion.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64MacroFusion.cpp @@ -12,10 +12,9 @@ // //===----------------------------------------------------------------------===// -#include "AArch64MacroFusion.h" #include "AArch64Subtarget.h" #include "llvm/CodeGen/MacroFusion.h" -#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/CodeGen/TargetInstrInfo.h" using namespace llvm; @@ -33,8 +32,8 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII, // Assume wildcards for unspecified instrs. unsigned FirstOpcode = - FirstMI ? FirstMI->getOpcode() - : static_cast<unsigned>(AArch64::INSTRUCTION_LIST_END); + FirstMI ? 
FirstMI->getOpcode() + : static_cast<unsigned>(AArch64::INSTRUCTION_LIST_END); unsigned SecondOpcode = SecondMI.getOpcode(); if (ST.hasArithmeticBccFusion()) diff --git a/contrib/llvm/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp b/contrib/llvm/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp index fe4ef4b40ece..ee6703aed1e2 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp @@ -20,7 +20,7 @@ #include "AArch64PBQPRegAlloc.h" #include "AArch64.h" #include "AArch64RegisterInfo.h" -#include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineRegisterInfo.h" @@ -247,13 +247,13 @@ void A57ChainingConstraint::addInterChainConstraint(PBQPRAGraph &G, unsigned Rd, // Do some Chain management if (Chains.count(Ra)) { if (Rd != Ra) { - DEBUG(dbgs() << "Moving acc chain from " << PrintReg(Ra, TRI) << " to " - << PrintReg(Rd, TRI) << '\n';); + DEBUG(dbgs() << "Moving acc chain from " << printReg(Ra, TRI) << " to " + << printReg(Rd, TRI) << '\n';); Chains.remove(Ra); Chains.insert(Rd); } } else { - DEBUG(dbgs() << "Creating new acc chain for " << PrintReg(Rd, TRI) + DEBUG(dbgs() << "Creating new acc chain for " << printReg(Rd, TRI) << '\n';); Chains.insert(Rd); } @@ -340,7 +340,7 @@ void A57ChainingConstraint::apply(PBQPRAGraph &G) { for (auto r : Chains) { SmallVector<unsigned, 8> toDel; if(regJustKilledBefore(LIs, r, MI)) { - DEBUG(dbgs() << "Killing chain " << PrintReg(r, TRI) << " at "; + DEBUG(dbgs() << "Killing chain " << printReg(r, TRI) << " at "; MI.print(dbgs());); toDel.push_back(r); } diff --git a/contrib/llvm/lib/Target/AArch64/AArch64PromoteConstant.cpp b/contrib/llvm/lib/Target/AArch64/AArch64PromoteConstant.cpp index 8693f76d7c32..a8dc6e74ef6a 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64PromoteConstant.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64PromoteConstant.cpp @@ -1,4 +1,4 @@ -//=- AArch64PromoteConstant.cpp --- Promote constant to global for AArch64 -==// +//==- AArch64PromoteConstant.cpp - Promote constant to global for AArch64 --==// // // The LLVM Compiler Infrastructure // @@ -22,23 +22,31 @@ #include "AArch64.h" #include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" +#include "llvm/IR/GlobalValue.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InlineAsm.h" #include "llvm/IR/InstIterator.h" +#include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" +#include "llvm/IR/Type.h" #include "llvm/Pass.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include <algorithm> +#include <cassert> +#include <utility> using namespace llvm; @@ -56,6 +64,7 @@ STATISTIC(NumPromotedUses, "Number of promoted constants uses"); //===----------------------------------------------------------------------===// namespace { + /// Promotes interesting constant into global variables. /// The motivating example is: /// static const uint16_t TableA[32] = { @@ -83,13 +92,12 @@ namespace { /// Therefore the final assembly final has 4 different loads. 
With this pass /// enabled, only one load is issued for the constants. class AArch64PromoteConstant : public ModulePass { - public: struct PromotedConstant { bool ShouldConvert = false; GlobalVariable *GV = nullptr; }; - typedef SmallDenseMap<Constant *, PromotedConstant, 16> PromotionCacheTy; + using PromotionCacheTy = SmallDenseMap<Constant *, PromotedConstant, 16>; struct UpdateRecord { Constant *C; @@ -101,6 +109,7 @@ public: }; static char ID; + AArch64PromoteConstant() : ModulePass(ID) { initializeAArch64PromoteConstantPass(*PassRegistry::getPassRegistry()); } @@ -135,9 +144,9 @@ private: } /// Type to store a list of Uses. - typedef SmallVector<std::pair<Instruction *, unsigned>, 4> Uses; + using Uses = SmallVector<std::pair<Instruction *, unsigned>, 4>; /// Map an insertion point to all the uses it dominates. - typedef DenseMap<Instruction *, Uses> InsertionPoints; + using InsertionPoints = DenseMap<Instruction *, Uses>; /// Find the closest point that dominates the given Use. Instruction *findInsertionPoint(Instruction &User, unsigned OpNo); @@ -212,6 +221,7 @@ private: InsertPts.erase(OldInstr); } }; + } // end anonymous namespace char AArch64PromoteConstant::ID = 0; @@ -357,7 +367,6 @@ Instruction *AArch64PromoteConstant::findInsertionPoint(Instruction &User, bool AArch64PromoteConstant::isDominated(Instruction *NewPt, Instruction *User, unsigned OpNo, InsertionPoints &InsertPts) { - DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>( *NewPt->getParent()->getParent()).getDomTree(); diff --git a/contrib/llvm/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp b/contrib/llvm/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp index 22c11c7276d2..e5822b114324 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp @@ -5,27 +5,51 @@ // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // -// This pass removes unnecessary zero copies in BBs that are targets of -// cbz/cbnz instructions. For instance, the copy instruction in the code below -// can be removed because the CBZW jumps to BB#2 when W0 is zero. -// BB#1: -// CBZW %W0, <BB#2> -// BB#2: -// %W0 = COPY %WZR -// Similarly, this pass also handles non-zero copies. -// BB#0: -// cmp x0, #1 +// This pass removes unnecessary copies/moves in BBs based on a dominating +// condition. +// +// We handle three cases: +// 1. For BBs that are targets of CBZ/CBNZ instructions, we know the value of +// the CBZ/CBNZ source register is zero on the taken/not-taken path. For +// instance, the copy instruction in the code below can be removed because +// the CBZW jumps to %bb.2 when w0 is zero. +// +// %bb.1: +// cbz w0, .LBB0_2 +// .LBB0_2: +// mov w0, wzr ; <-- redundant +// +// 2. If the flag setting instruction defines a register other than WZR/XZR, we +// can remove a zero copy in some cases. +// +// %bb.0: +// subs w0, w1, w2 +// str w0, [x1] +// b.ne .LBB0_2 +// %bb.1: +// mov w0, wzr ; <-- redundant +// str w0, [x2] +// .LBB0_2 +// +// 3. Finally, if the flag setting instruction is a comparison against a +// constant (i.e., ADDS[W|X]ri, SUBS[W|X]ri), we can remove a mov immediate +// in some cases. +// +// %bb.0: +// subs xzr, x0, #1 // b.eq .LBB0_1 // .LBB0_1: -// orr x0, xzr, #0x1 +// orr x0, xzr, #0x1 ; <-- redundant // // This pass should be run after register allocation. 
// // FIXME: This could also be extended to check the whole dominance subtree below // the comparison if the compile time regression is acceptable. // +// FIXME: Add support for handling CCMP instructions. +// FIXME: If the known register value is zero, we should be able to rewrite uses +// to use WZR/XZR directly in some cases. //===----------------------------------------------------------------------===// - #include "AArch64.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/SetVector.h" @@ -45,7 +69,13 @@ namespace { class AArch64RedundantCopyElimination : public MachineFunctionPass { const MachineRegisterInfo *MRI; const TargetRegisterInfo *TRI; - BitVector ClobberedRegs; + + // DomBBClobberedRegs is used when computing known values in the dominating + // BB. + BitVector DomBBClobberedRegs; + + // OptBBClobberedRegs is used when optimizing away redundant copies/moves. + BitVector OptBBClobberedRegs; public: static char ID; @@ -60,10 +90,10 @@ public: RegImm(MCPhysReg Reg, int32_t Imm) : Reg(Reg), Imm(Imm) {} }; - Optional<RegImm> knownRegValInBlock(MachineInstr &CondBr, - MachineBasicBlock *MBB, - MachineBasicBlock::iterator &FirstUse); - bool optimizeCopy(MachineBasicBlock *MBB); + bool knownRegValInBlock(MachineInstr &CondBr, MachineBasicBlock *MBB, + SmallVectorImpl<RegImm> &KnownRegs, + MachineBasicBlock::iterator &FirstUse); + bool optimizeBlock(MachineBasicBlock *MBB); bool runOnMachineFunction(MachineFunction &MF) override; MachineFunctionProperties getRequiredProperties() const override { return MachineFunctionProperties().set( @@ -103,13 +133,19 @@ static void trackRegDefs(const MachineInstr &MI, BitVector &ClobberedRegs, /// It's possible to determine the value of a register based on a dominating /// condition. To do so, this function checks to see if the basic block \p MBB -/// is the target to which a conditional branch \p CondBr jumps and whose -/// equality comparison is against a constant. If so, return a known physical -/// register and constant value pair. Otherwise, return None. -Optional<AArch64RedundantCopyElimination::RegImm> -AArch64RedundantCopyElimination::knownRegValInBlock( +/// is the target of a conditional branch \p CondBr with an equality comparison. +/// If the branch is a CBZ/CBNZ, we know the value of its source operand is zero +/// in \p MBB for some cases. Otherwise, we find and inspect the NZCV setting +/// instruction (e.g., SUBS, ADDS). If this instruction defines a register +/// other than WZR/XZR, we know the value of the destination register is zero in +/// \p MMB for some cases. In addition, if the NZCV setting instruction is +/// comparing against a constant we know the other source register is equal to +/// the constant in \p MBB for some cases. If we find any constant values, push +/// a physical register and constant value pair onto the KnownRegs vector and +/// return true. Otherwise, return false if no known values were found. 
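For the constant-compare case described above, the known value is the compare immediate after applying its optional shift, negated when the compare is really a CMN (ADDS). A minimal standalone sketch of that computation, mirroring the KnownImm handling in the code that follows (names are made up, not the LLVM API):

  #include <cstdint>

  // Derive the value a register must hold on the equal path of a compare
  // against an immediate, e.g. "subs xzr, x0, #4, lsl #12" implies x0 == 4 << 12.
  std::int32_t knownImmFromCompare(std::int32_t Imm, unsigned Shift, bool IsCMN) {
    std::int32_t Known = Imm << Shift; // ADDS/SUBS immediates may carry an LSL #12
    return IsCMN ? -Known : Known;     // CMN compares against the negated value
  }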
+bool AArch64RedundantCopyElimination::knownRegValInBlock( MachineInstr &CondBr, MachineBasicBlock *MBB, - MachineBasicBlock::iterator &FirstUse) { + SmallVectorImpl<RegImm> &KnownRegs, MachineBasicBlock::iterator &FirstUse) { unsigned Opc = CondBr.getOpcode(); // Check if the current basic block is the target block to which the @@ -119,41 +155,39 @@ AArch64RedundantCopyElimination::knownRegValInBlock( ((Opc == AArch64::CBNZW || Opc == AArch64::CBNZX) && MBB != CondBr.getOperand(1).getMBB())) { FirstUse = CondBr; - return RegImm(CondBr.getOperand(0).getReg(), 0); + KnownRegs.push_back(RegImm(CondBr.getOperand(0).getReg(), 0)); + return true; } // Otherwise, must be a conditional branch. if (Opc != AArch64::Bcc) - return None; + return false; // Must be an equality check (i.e., == or !=). AArch64CC::CondCode CC = (AArch64CC::CondCode)CondBr.getOperand(0).getImm(); if (CC != AArch64CC::EQ && CC != AArch64CC::NE) - return None; + return false; MachineBasicBlock *BrTarget = CondBr.getOperand(1).getMBB(); if ((CC == AArch64CC::EQ && BrTarget != MBB) || (CC == AArch64CC::NE && BrTarget == MBB)) - return None; + return false; // Stop if we get to the beginning of PredMBB. MachineBasicBlock *PredMBB = *MBB->pred_begin(); assert(PredMBB == CondBr.getParent() && "Conditional branch not in predecessor block!"); if (CondBr == PredMBB->begin()) - return None; + return false; // Registers clobbered in PredMBB between CondBr instruction and current // instruction being checked in loop. - ClobberedRegs.reset(); + DomBBClobberedRegs.reset(); // Find compare instruction that sets NZCV used by CondBr. MachineBasicBlock::reverse_iterator RIt = CondBr.getReverseIterator(); for (MachineInstr &PredI : make_range(std::next(RIt), PredMBB->rend())) { - // Track clobbered registers. - trackRegDefs(PredI, ClobberedRegs, TRI); - bool IsCMN = false; switch (PredI.getOpcode()) { default: @@ -169,37 +203,100 @@ AArch64RedundantCopyElimination::knownRegValInBlock( case AArch64::SUBSXri: { // Sometimes the first operand is a FrameIndex. Bail if tht happens. if (!PredI.getOperand(1).isReg()) - return None; + return false; + MCPhysReg DstReg = PredI.getOperand(0).getReg(); MCPhysReg SrcReg = PredI.getOperand(1).getReg(); - // Must not be a symbolic immediate. - if (!PredI.getOperand(2).isImm()) - return None; - - // The src register must not be modified between the cmp and conditional - // branch. This includes a self-clobbering compare. - if (ClobberedRegs[SrcReg]) - return None; - - // We've found the Cmp that sets NZCV. - int32_t KnownImm = PredI.getOperand(2).getImm(); - int32_t Shift = PredI.getOperand(3).getImm(); - KnownImm <<= Shift; - if (IsCMN) - KnownImm = -KnownImm; + bool Res = false; + // If we're comparing against a non-symbolic immediate and the source + // register of the compare is not modified (including a self-clobbering + // compare) between the compare and conditional branch we known the value + // of the 1st source operand. + if (PredI.getOperand(2).isImm() && !DomBBClobberedRegs[SrcReg] && + SrcReg != DstReg) { + // We've found the instruction that sets NZCV. + int32_t KnownImm = PredI.getOperand(2).getImm(); + int32_t Shift = PredI.getOperand(3).getImm(); + KnownImm <<= Shift; + if (IsCMN) + KnownImm = -KnownImm; + FirstUse = PredI; + KnownRegs.push_back(RegImm(SrcReg, KnownImm)); + Res = true; + } + + // If this instructions defines something other than WZR/XZR, we know it's + // result is zero in some cases. 
+ if (DstReg == AArch64::WZR || DstReg == AArch64::XZR) + return Res; + + // The destination register must not be modified between the NZCV setting + // instruction and the conditional branch. + if (DomBBClobberedRegs[DstReg]) + return Res; + FirstUse = PredI; - return RegImm(SrcReg, KnownImm); + KnownRegs.push_back(RegImm(DstReg, 0)); + return true; + } + + // Look for NZCV setting instructions that define something other than + // WZR/XZR. + case AArch64::ADCSWr: + case AArch64::ADCSXr: + case AArch64::ADDSWrr: + case AArch64::ADDSWrs: + case AArch64::ADDSWrx: + case AArch64::ADDSXrr: + case AArch64::ADDSXrs: + case AArch64::ADDSXrx: + case AArch64::ADDSXrx64: + case AArch64::ANDSWri: + case AArch64::ANDSWrr: + case AArch64::ANDSWrs: + case AArch64::ANDSXri: + case AArch64::ANDSXrr: + case AArch64::ANDSXrs: + case AArch64::BICSWrr: + case AArch64::BICSWrs: + case AArch64::BICSXrs: + case AArch64::BICSXrr: + case AArch64::SBCSWr: + case AArch64::SBCSXr: + case AArch64::SUBSWrr: + case AArch64::SUBSWrs: + case AArch64::SUBSWrx: + case AArch64::SUBSXrr: + case AArch64::SUBSXrs: + case AArch64::SUBSXrx: + case AArch64::SUBSXrx64: { + MCPhysReg DstReg = PredI.getOperand(0).getReg(); + if (DstReg == AArch64::WZR || DstReg == AArch64::XZR) + return false; + + // The destination register of the NZCV setting instruction must not be + // modified before the conditional branch. + if (DomBBClobberedRegs[DstReg]) + return false; + + // We've found the instruction that sets NZCV whose DstReg == 0. + FirstUse = PredI; + KnownRegs.push_back(RegImm(DstReg, 0)); + return true; } } // Bail if we see an instruction that defines NZCV that we don't handle. if (PredI.definesRegister(AArch64::NZCV)) - return None; + return false; + + // Track clobbered registers. + trackRegDefs(PredI, DomBBClobberedRegs, TRI); } - return None; + return false; } -bool AArch64RedundantCopyElimination::optimizeCopy(MachineBasicBlock *MBB) { +bool AArch64RedundantCopyElimination::optimizeBlock(MachineBasicBlock *MBB) { // Check if the current basic block has a single predecessor. if (MBB->pred_size() != 1) return false; @@ -230,14 +327,11 @@ bool AArch64RedundantCopyElimination::optimizeCopy(MachineBasicBlock *MBB) { do { --Itr; - Optional<RegImm> KnownRegImm = knownRegValInBlock(*Itr, MBB, FirstUse); - if (KnownRegImm == None) + if (!knownRegValInBlock(*Itr, MBB, KnownRegs, FirstUse)) continue; - KnownRegs.push_back(*KnownRegImm); - - // Reset the clobber list, which is used by knownRegValInBlock. - ClobberedRegs.reset(); + // Reset the clobber list. + OptBBClobberedRegs.reset(); // Look backward in PredMBB for COPYs from the known reg to find other // registers that are known to be a constant value. @@ -249,11 +343,11 @@ bool AArch64RedundantCopyElimination::optimizeCopy(MachineBasicBlock *MBB) { MCPhysReg CopyDstReg = PredI->getOperand(0).getReg(); MCPhysReg CopySrcReg = PredI->getOperand(1).getReg(); for (auto &KnownReg : KnownRegs) { - if (ClobberedRegs[KnownReg.Reg]) + if (OptBBClobberedRegs[KnownReg.Reg]) continue; // If we have X = COPY Y, and Y is known to be zero, then now X is // known to be zero. - if (CopySrcReg == KnownReg.Reg && !ClobberedRegs[CopyDstReg]) { + if (CopySrcReg == KnownReg.Reg && !OptBBClobberedRegs[CopyDstReg]) { KnownRegs.push_back(RegImm(CopyDstReg, KnownReg.Imm)); if (SeenFirstUse) FirstUse = PredI; @@ -261,7 +355,7 @@ bool AArch64RedundantCopyElimination::optimizeCopy(MachineBasicBlock *MBB) { } // If we have X = COPY Y, and X is known to be zero, then now Y is // known to be zero. 
- if (CopyDstReg == KnownReg.Reg && !ClobberedRegs[CopySrcReg]) { + if (CopyDstReg == KnownReg.Reg && !OptBBClobberedRegs[CopySrcReg]) { KnownRegs.push_back(RegImm(CopySrcReg, KnownReg.Imm)); if (SeenFirstUse) FirstUse = PredI; @@ -274,10 +368,10 @@ bool AArch64RedundantCopyElimination::optimizeCopy(MachineBasicBlock *MBB) { if (PredI == PredMBB->begin()) break; - trackRegDefs(*PredI, ClobberedRegs, TRI); + trackRegDefs(*PredI, OptBBClobberedRegs, TRI); // Stop if all of the known-zero regs have been clobbered. if (all_of(KnownRegs, [&](RegImm KnownReg) { - return ClobberedRegs[KnownReg.Reg]; + return OptBBClobberedRegs[KnownReg.Reg]; })) break; } @@ -293,7 +387,7 @@ bool AArch64RedundantCopyElimination::optimizeCopy(MachineBasicBlock *MBB) { // UsedKnownRegs is the set of KnownRegs that have had uses added to MBB. SmallSetVector<unsigned, 4> UsedKnownRegs; MachineBasicBlock::iterator LastChange = MBB->begin(); - // Remove redundant Copy instructions unless KnownReg is modified. + // Remove redundant copy/move instructions unless KnownReg is modified. for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;) { MachineInstr *MI = &*I; ++I; @@ -391,18 +485,19 @@ bool AArch64RedundantCopyElimination::optimizeCopy(MachineBasicBlock *MBB) { bool AArch64RedundantCopyElimination::runOnMachineFunction( MachineFunction &MF) { - if (skipFunction(*MF.getFunction())) + if (skipFunction(MF.getFunction())) return false; TRI = MF.getSubtarget().getRegisterInfo(); MRI = &MF.getRegInfo(); - // Resize the clobber register bitfield tracker. We do this once per - // function and then clear the bitfield each time we optimize a copy. - ClobberedRegs.resize(TRI->getNumRegs()); + // Resize the clobber register bitfield trackers. We do this once per + // function. 
+ DomBBClobberedRegs.resize(TRI->getNumRegs()); + OptBBClobberedRegs.resize(TRI->getNumRegs()); bool Changed = false; for (MachineBasicBlock &MBB : MF) - Changed |= optimizeCopy(&MBB); + Changed |= optimizeBlock(&MBB); return Changed; } diff --git a/contrib/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.cpp b/contrib/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.cpp index 69124dbd0f83..c497669f937f 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.cpp @@ -22,10 +22,10 @@ #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/TargetOpcodes.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" +#include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Target/TargetOpcodes.h" -#include "llvm/Target/TargetRegisterInfo.h" -#include "llvm/Target/TargetSubtargetInfo.h" #include <algorithm> #include <cassert> @@ -37,10 +37,6 @@ using namespace llvm; -#ifndef LLVM_BUILD_GLOBAL_ISEL -#error "You shouldn't build this" -#endif - AArch64RegisterBankInfo::AArch64RegisterBankInfo(const TargetRegisterInfo &TRI) : AArch64GenRegisterBankInfo() { static bool AlreadyInit = false; @@ -63,10 +59,9 @@ AArch64RegisterBankInfo::AArch64RegisterBankInfo(const TargetRegisterInfo &TRI) assert(&AArch64::FPRRegBank == &RBFPR && "The order in RegBanks is messed up"); - const RegisterBank &RBCCR = getRegBank(AArch64::CCRRegBankID); + const RegisterBank &RBCCR = getRegBank(AArch64::CCRegBankID); (void)RBCCR; - assert(&AArch64::CCRRegBank == &RBCCR && - "The order in RegBanks is messed up"); + assert(&AArch64::CCRegBank == &RBCCR && "The order in RegBanks is messed up"); // The GPR register bank is fully defined by all the registers in // GR64all + its subclasses. @@ -92,9 +87,9 @@ AArch64RegisterBankInfo::AArch64RegisterBankInfo(const TargetRegisterInfo &TRI) assert(checkPartialMappingIdx(PMI_FirstGPR, PMI_LastGPR, {PMI_GPR32, PMI_GPR64}) && "PartialMappingIdx's are incorrectly ordered"); - assert(checkPartialMappingIdx( - PMI_FirstFPR, PMI_LastFPR, - {PMI_FPR32, PMI_FPR64, PMI_FPR128, PMI_FPR256, PMI_FPR512}) && + assert(checkPartialMappingIdx(PMI_FirstFPR, PMI_LastFPR, + {PMI_FPR16, PMI_FPR32, PMI_FPR64, PMI_FPR128, + PMI_FPR256, PMI_FPR512}) && "PartialMappingIdx's are incorrectly ordered"); // Now, the content. // Check partial mapping. 
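A rough standalone sketch of what each partial-mapping entry verified below encodes (simplified stand-ins, not the LLVM RegisterBankInfo types): a partial mapping describes which bits of a value live on which register bank, and the new FPR16 entry lets 16-bit floating-point values be described alongside the wider FPR sizes.

  // Simplified model: start bit, width in bits, and the bank holding those bits.
  struct PartialMappingSketch {
    unsigned StartIdx;
    unsigned Length;
    const char *Bank; // "GPR" or "FPR" in this sketch
  };

  static const PartialMappingSketch FPRSketch[] = {
      {0, 16, "FPR"},  {0, 32, "FPR"},  {0, 64, "FPR"},
      {0, 128, "FPR"}, {0, 256, "FPR"}, {0, 512, "FPR"},
  };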
@@ -107,6 +102,7 @@ AArch64RegisterBankInfo::AArch64RegisterBankInfo(const TargetRegisterInfo &TRI) CHECK_PARTIALMAP(PMI_GPR32, 0, 32, RBGPR); CHECK_PARTIALMAP(PMI_GPR64, 0, 64, RBGPR); + CHECK_PARTIALMAP(PMI_FPR16, 0, 16, RBFPR); CHECK_PARTIALMAP(PMI_FPR32, 0, 32, RBFPR); CHECK_PARTIALMAP(PMI_FPR64, 0, 64, RBFPR); CHECK_PARTIALMAP(PMI_FPR128, 0, 128, RBFPR); @@ -126,6 +122,7 @@ AArch64RegisterBankInfo::AArch64RegisterBankInfo(const TargetRegisterInfo &TRI) CHECK_VALUEMAP(GPR, 32); CHECK_VALUEMAP(GPR, 64); + CHECK_VALUEMAP(FPR, 16); CHECK_VALUEMAP(FPR, 32); CHECK_VALUEMAP(FPR, 64); CHECK_VALUEMAP(FPR, 128); @@ -178,6 +175,30 @@ AArch64RegisterBankInfo::AArch64RegisterBankInfo(const TargetRegisterInfo &TRI) CHECK_VALUEMAP_CROSSREGCPY(FPR, FPR, 64); CHECK_VALUEMAP_CROSSREGCPY(FPR, GPR, 64); +#define CHECK_VALUEMAP_FPEXT(DstSize, SrcSize) \ + do { \ + unsigned PartialMapDstIdx = PMI_FPR##DstSize - PMI_Min; \ + unsigned PartialMapSrcIdx = PMI_FPR##SrcSize - PMI_Min; \ + (void)PartialMapDstIdx; \ + (void)PartialMapSrcIdx; \ + const ValueMapping *Map = getFPExtMapping(DstSize, SrcSize); \ + (void)Map; \ + assert(Map[0].BreakDown == \ + &AArch64GenRegisterBankInfo::PartMappings[PartialMapDstIdx] && \ + Map[0].NumBreakDowns == 1 && "FPR" #DstSize \ + " Dst is incorrectly initialized"); \ + assert(Map[1].BreakDown == \ + &AArch64GenRegisterBankInfo::PartMappings[PartialMapSrcIdx] && \ + Map[1].NumBreakDowns == 1 && "FPR" #SrcSize \ + " Src is incorrectly initialized"); \ + \ + } while (false) + + CHECK_VALUEMAP_FPEXT(32, 16); + CHECK_VALUEMAP_FPEXT(64, 16); + CHECK_VALUEMAP_FPEXT(64, 32); + CHECK_VALUEMAP_FPEXT(128, 64); + assert(verify(TRI) && "Invalid register bank information"); } @@ -233,7 +254,7 @@ const RegisterBank &AArch64RegisterBankInfo::getRegBankFromRegClass( case AArch64::XSeqPairsClassRegClassID: return getRegBank(AArch64::GPRRegBankID); case AArch64::CCRRegClassID: - return getRegBank(AArch64::CCRRegBankID); + return getRegBank(AArch64::CCRegBankID); default: llvm_unreachable("Register class not supported"); } @@ -419,18 +440,22 @@ AArch64RegisterBankInfo::getSameKindOfOperandsMapping( const RegisterBankInfo::InstructionMapping & AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { const unsigned Opc = MI.getOpcode(); - const MachineFunction &MF = *MI.getParent()->getParent(); - const MachineRegisterInfo &MRI = MF.getRegInfo(); // Try the default logic for non-generic instructions that are either copies // or already have some operands assigned to banks. - if (!isPreISelGenericOpcode(Opc)) { + if ((Opc != TargetOpcode::COPY && !isPreISelGenericOpcode(Opc)) || + Opc == TargetOpcode::G_PHI) { const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI); if (Mapping.isValid()) return Mapping; } + const MachineFunction &MF = *MI.getParent()->getParent(); + const MachineRegisterInfo &MRI = MF.getRegInfo(); + const TargetSubtargetInfo &STI = MF.getSubtarget(); + const TargetRegisterInfo &TRI = *STI.getRegisterInfo(); + switch (Opc) { // G_{F|S|U}REM are not listed because they are not legal. // Arithmetic ops. 
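The CHECK_VALUEMAP_FPEXT checks above pin down the new G_FPEXT mapping: both operands stay on the FPR bank, and only the widenings fp16->fp32, fp16->fp64, fp32->fp64 and fp64->fp128 are exercised. A hypothetical helper expressing that contract (illustration only, not the LLVM API):

  #include <cassert>
  #include <utility>

  // Return the (destination, source) sizes a G_FPEXT mapping pairs together;
  // the assert lists the only size combinations getFPExtMapping is checked for.
  inline std::pair<unsigned, unsigned> fpextOperandSizes(unsigned DstSize,
                                                         unsigned SrcSize) {
    assert(((DstSize == 32 && SrcSize == 16) || (DstSize == 64 && SrcSize == 16) ||
            (DstSize == 64 && SrcSize == 32) || (DstSize == 128 && SrcSize == 64)) &&
           "unexpected G_FPEXT size combination");
    return {DstSize, SrcSize}; // both halves map onto FPR<DstSize>/FPR<SrcSize>
  }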
@@ -454,12 +479,47 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case TargetOpcode::G_FMUL: case TargetOpcode::G_FDIV: return getSameKindOfOperandsMapping(MI); + case TargetOpcode::G_FPEXT: { + LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); + LLT SrcTy = MRI.getType(MI.getOperand(1).getReg()); + return getInstructionMapping( + DefaultMappingID, /*Cost*/ 1, + getFPExtMapping(DstTy.getSizeInBits(), SrcTy.getSizeInBits()), + /*NumOperands*/ 2); + } + case TargetOpcode::COPY: { + unsigned DstReg = MI.getOperand(0).getReg(); + unsigned SrcReg = MI.getOperand(1).getReg(); + // Check if one of the register is not a generic register. + if ((TargetRegisterInfo::isPhysicalRegister(DstReg) || + !MRI.getType(DstReg).isValid()) || + (TargetRegisterInfo::isPhysicalRegister(SrcReg) || + !MRI.getType(SrcReg).isValid())) { + const RegisterBank *DstRB = getRegBank(DstReg, MRI, TRI); + const RegisterBank *SrcRB = getRegBank(SrcReg, MRI, TRI); + if (!DstRB) + DstRB = SrcRB; + else if (!SrcRB) + SrcRB = DstRB; + // If both RB are null that means both registers are generic. + // We shouldn't be here. + assert(DstRB && SrcRB && "Both RegBank were nullptr"); + unsigned Size = getSizeInBits(DstReg, MRI, TRI); + return getInstructionMapping( + DefaultMappingID, copyCost(*DstRB, *SrcRB, Size), + getCopyMapping(DstRB->getID(), SrcRB->getID(), Size), + // We only care about the mapping of the destination. + /*NumOperands*/ 1); + } + // Both registers are generic, use G_BITCAST. + LLVM_FALLTHROUGH; + } case TargetOpcode::G_BITCAST: { LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); LLT SrcTy = MRI.getType(MI.getOperand(1).getReg()); unsigned Size = DstTy.getSizeInBits(); - bool DstIsGPR = !DstTy.isVector(); - bool SrcIsGPR = !SrcTy.isVector(); + bool DstIsGPR = !DstTy.isVector() && DstTy.getSizeInBits() <= 64; + bool SrcIsGPR = !SrcTy.isVector() && SrcTy.getSizeInBits() <= 64; const RegisterBank &DstRB = DstIsGPR ? AArch64::GPRRegBank : AArch64::FPRRegBank; const RegisterBank &SrcRB = @@ -467,7 +527,8 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { return getInstructionMapping( DefaultMappingID, copyCost(DstRB, SrcRB, Size), getCopyMapping(DstRB.getID(), SrcRB.getID(), Size), - /*NumOperands*/ 2); + // We only care about the mapping of the destination for COPY. + /*NumOperands*/ Opc == TargetOpcode::G_BITCAST ? 2 : 1); } default: break; @@ -488,7 +549,8 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { // As a top-level guess, vectors go in FPRs, scalars and pointers in GPRs. // For floating-point instructions, scalars go in FPRs. - if (Ty.isVector() || isPreISelGenericFloatingPointOpcode(Opc)) + if (Ty.isVector() || isPreISelGenericFloatingPointOpcode(Opc) || + Ty.getSizeInBits() > 64) OpRegBankIdx[Idx] = PMI_FirstFPR; else OpRegBankIdx[Idx] = PMI_FirstGPR; @@ -532,15 +594,24 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { // In that case, we want the default mapping to be on FPR // instead of blind map every scalar to GPR. for (const MachineInstr &UseMI : - MRI.use_instructions(MI.getOperand(0).getReg())) + MRI.use_instructions(MI.getOperand(0).getReg())) { // If we have at least one direct use in a FP instruction, // assume this was a floating point load in the IR. // If it was not, we would have had a bitcast before // reaching that instruction. 
- if (isPreISelGenericFloatingPointOpcode(UseMI.getOpcode())) { + unsigned UseOpc = UseMI.getOpcode(); + if (isPreISelGenericFloatingPointOpcode(UseOpc) || + // Check if we feed a copy-like instruction with + // floating point constraints. In that case, we are still + // feeding fp instructions, but indirectly + // (e.g., through ABI copies). + ((UseOpc == TargetOpcode::COPY || UseMI.isPHI()) && + getRegBank(UseMI.getOperand(0).getReg(), MRI, TRI) == + &AArch64::FPRRegBank)) { OpRegBankIdx[0] = PMI_FirstFPR; break; } + } break; case TargetOpcode::G_STORE: // Check if that store is fed by fp instructions. @@ -549,7 +620,15 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { if (!VReg) break; MachineInstr *DefMI = MRI.getVRegDef(VReg); - if (isPreISelGenericFloatingPointOpcode(DefMI->getOpcode())) + unsigned DefOpc = DefMI->getOpcode(); + if (isPreISelGenericFloatingPointOpcode(DefOpc) || + // Check if we come from a copy-like instruction with + // floating point constraints. In that case, we are still + // fed by fp instructions, but indirectly + // (e.g., through ABI copies). + ((DefOpc == TargetOpcode::COPY || DefMI->isPHI()) && + getRegBank(DefMI->getOperand(0).getReg(), MRI, TRI) == + &AArch64::FPRRegBank)) OpRegBankIdx[0] = PMI_FirstFPR; break; } diff --git a/contrib/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.h b/contrib/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.h index 6d74a47095a9..008221dbef58 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.h +++ b/contrib/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.h @@ -25,10 +25,10 @@ class TargetRegisterInfo; class AArch64GenRegisterBankInfo : public RegisterBankInfo { protected: - enum PartialMappingIdx { PMI_None = -1, - PMI_FPR32 = 1, + PMI_FPR16 = 1, + PMI_FPR32, PMI_FPR64, PMI_FPR128, PMI_FPR256, @@ -37,7 +37,7 @@ protected: PMI_GPR64, PMI_FirstGPR = PMI_GPR32, PMI_LastGPR = PMI_GPR64, - PMI_FirstFPR = PMI_FPR32, + PMI_FirstFPR = PMI_FPR16, PMI_LastFPR = PMI_FPR512, PMI_Min = PMI_FirstFPR, }; @@ -49,11 +49,15 @@ protected: enum ValueMappingIdx { InvalidIdx = 0, First3OpsIdx = 1, - Last3OpsIdx = 19, + Last3OpsIdx = 22, DistanceBetweenRegBanks = 3, - FirstCrossRegCpyIdx = 22, - LastCrossRegCpyIdx = 34, - DistanceBetweenCrossRegCpy = 2 + FirstCrossRegCpyIdx = 25, + LastCrossRegCpyIdx = 39, + DistanceBetweenCrossRegCpy = 2, + FPExt16To32Idx = 41, + FPExt16To64Idx = 43, + FPExt32To64Idx = 45, + FPExt64To128Idx = 47, }; static bool checkPartialMap(unsigned Idx, unsigned ValStartIdx, @@ -82,6 +86,15 @@ protected: static const RegisterBankInfo::ValueMapping * getCopyMapping(unsigned DstBankID, unsigned SrcBankID, unsigned Size); + /// Get the instruction mapping for G_FPEXT. + /// + /// \pre (DstSize, SrcSize) pair is one of the following: + /// (32, 16), (64, 16), (64, 32), (128, 64) + /// + /// \return An InstructionMapping with statically allocated OperandsMapping. + static const RegisterBankInfo::ValueMapping * + getFPExtMapping(unsigned DstSize, unsigned SrcSize); + #define GET_TARGET_REGBANK_CLASS #include "AArch64GenRegisterBank.inc" }; diff --git a/contrib/llvm/lib/Target/AArch64/AArch64RegisterBanks.td b/contrib/llvm/lib/Target/AArch64/AArch64RegisterBanks.td index c2b6c0b04e9b..eee584708f69 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64RegisterBanks.td +++ b/contrib/llvm/lib/Target/AArch64/AArch64RegisterBanks.td @@ -17,4 +17,4 @@ def GPRRegBank : RegisterBank<"GPR", [GPR64all]>; def FPRRegBank : RegisterBank<"FPR", [QQQQ]>; /// Conditional register: NZCV. 
-def CCRRegBank : RegisterBank<"CCR", [CCR]>; +def CCRegBank : RegisterBank<"CC", [CCR]>; diff --git a/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp index 9f7dcb3fe1c3..88dd297e0079 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -26,7 +26,7 @@ #include "llvm/CodeGen/RegisterScavenging.h" #include "llvm/IR/Function.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/Target/TargetOptions.h" using namespace llvm; @@ -35,27 +35,29 @@ using namespace llvm; #include "AArch64GenRegisterInfo.inc" AArch64RegisterInfo::AArch64RegisterInfo(const Triple &TT) - : AArch64GenRegisterInfo(AArch64::LR), TT(TT) {} + : AArch64GenRegisterInfo(AArch64::LR), TT(TT) { + AArch64_MC::initLLVMToCVRegMapping(this); +} const MCPhysReg * AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { assert(MF && "Invalid MachineFunction pointer."); - if (MF->getFunction()->getCallingConv() == CallingConv::GHC) + if (MF->getFunction().getCallingConv() == CallingConv::GHC) // GHC set of callee saved regs is empty as all those regs are // used for passing STG regs around return CSR_AArch64_NoRegs_SaveList; - if (MF->getFunction()->getCallingConv() == CallingConv::AnyReg) + if (MF->getFunction().getCallingConv() == CallingConv::AnyReg) return CSR_AArch64_AllRegs_SaveList; - if (MF->getFunction()->getCallingConv() == CallingConv::CXX_FAST_TLS) + if (MF->getFunction().getCallingConv() == CallingConv::CXX_FAST_TLS) return MF->getInfo<AArch64FunctionInfo>()->isSplitCSR() ? CSR_AArch64_CXX_TLS_Darwin_PE_SaveList : CSR_AArch64_CXX_TLS_Darwin_SaveList; if (MF->getSubtarget<AArch64Subtarget>().getTargetLowering() ->supportSwiftError() && - MF->getFunction()->getAttributes().hasAttrSomewhere( + MF->getFunction().getAttributes().hasAttrSomewhere( Attribute::SwiftError)) return CSR_AArch64_AAPCS_SwiftError_SaveList; - if (MF->getFunction()->getCallingConv() == CallingConv::PreserveMost) + if (MF->getFunction().getCallingConv() == CallingConv::PreserveMost) return CSR_AArch64_RT_MostRegs_SaveList; else return CSR_AArch64_AAPCS_SaveList; @@ -64,7 +66,7 @@ AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { const MCPhysReg *AArch64RegisterInfo::getCalleeSavedRegsViaCopy( const MachineFunction *MF) const { assert(MF && "Invalid MachineFunction pointer."); - if (MF->getFunction()->getCallingConv() == CallingConv::CXX_FAST_TLS && + if (MF->getFunction().getCallingConv() == CallingConv::CXX_FAST_TLS && MF->getInfo<AArch64FunctionInfo>()->isSplitCSR()) return CSR_AArch64_CXX_TLS_Darwin_ViaCopy_SaveList; return nullptr; @@ -82,7 +84,7 @@ AArch64RegisterInfo::getCallPreservedMask(const MachineFunction &MF, return CSR_AArch64_CXX_TLS_Darwin_RegMask; if (MF.getSubtarget<AArch64Subtarget>().getTargetLowering() ->supportSwiftError() && - MF.getFunction()->getAttributes().hasAttrSomewhere(Attribute::SwiftError)) + MF.getFunction().getAttributes().hasAttrSomewhere(Attribute::SwiftError)) return CSR_AArch64_AAPCS_SwiftError_RegMask; if (CC == CallingConv::PreserveMost) return CSR_AArch64_RT_MostRegs_RegMask; diff --git a/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.td b/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.td index 7e29ee5e9baf..39e3e33b0d27 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.td +++ 
b/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.td @@ -32,6 +32,12 @@ let Namespace = "AArch64" in { def qsub : SubRegIndex<64>; def sube64 : SubRegIndex<64>; def subo64 : SubRegIndex<64>; + // SVE + def zsub : SubRegIndex<128>; + // Note: zsub_hi should never be used directly because it represents + // the scalable part of the SVE vector and cannot be manipulated as a + // subvector in the same way the lower 128bits can. + def zsub_hi : SubRegIndex<128>; // Note: Code depends on these having consecutive numbers def dsub0 : SubRegIndex<64>; def dsub1 : SubRegIndex<64>; @@ -169,6 +175,15 @@ def GPR64sp0 : RegisterOperand<GPR64sp> { let ParserMatchClass = GPR64spPlus0Operand; } +// GPR32/GPR64 but with zero-register substitution enabled. +// TODO: Roll this out to GPR32/GPR64/GPR32all/GPR64all. +def GPR32z : RegisterOperand<GPR32> { + let GIZeroRegister = WZR; +} +def GPR64z : RegisterOperand<GPR64> { + let GIZeroRegister = XZR; +} + // GPR register classes which include WZR/XZR AND SP/WSP. This is not a // constraint used by any instructions, it is used as a common super-class. def GPR32all : RegisterClass<"AArch64", [i32], 32, (add GPR32common, WZR, WSP)>; @@ -451,11 +466,11 @@ def QQQQ : RegisterClass<"AArch64", [untyped], 128, (add QSeqQuads)> { // assmebler matching. def VectorReg64AsmOperand : AsmOperandClass { let Name = "VectorReg64"; - let PredicateMethod = "isVectorReg"; + let PredicateMethod = "isNeonVectorReg"; } def VectorReg128AsmOperand : AsmOperandClass { let Name = "VectorReg128"; - let PredicateMethod = "isVectorReg"; + let PredicateMethod = "isNeonVectorReg"; } def V64 : RegisterOperand<FPR64, "printVRegOperand"> { @@ -466,7 +481,10 @@ def V128 : RegisterOperand<FPR128, "printVRegOperand"> { let ParserMatchClass = VectorReg128AsmOperand; } -def VectorRegLoAsmOperand : AsmOperandClass { let Name = "VectorRegLo"; } +def VectorRegLoAsmOperand : AsmOperandClass { + let Name = "VectorRegLo"; + let PredicateMethod = "isNeonVectorRegLo"; +} def V128_lo : RegisterOperand<FPR128_lo, "printVRegOperand"> { let ParserMatchClass = VectorRegLoAsmOperand; } @@ -633,3 +651,170 @@ def XSeqPairClassOperand : //===----- END: v8.1a atomic CASP register operands -----------------------===// + +// SVE predicate registers +def P0 : AArch64Reg<0, "p0">, DwarfRegNum<[48]>; +def P1 : AArch64Reg<1, "p1">, DwarfRegNum<[49]>; +def P2 : AArch64Reg<2, "p2">, DwarfRegNum<[50]>; +def P3 : AArch64Reg<3, "p3">, DwarfRegNum<[51]>; +def P4 : AArch64Reg<4, "p4">, DwarfRegNum<[52]>; +def P5 : AArch64Reg<5, "p5">, DwarfRegNum<[53]>; +def P6 : AArch64Reg<6, "p6">, DwarfRegNum<[54]>; +def P7 : AArch64Reg<7, "p7">, DwarfRegNum<[55]>; +def P8 : AArch64Reg<8, "p8">, DwarfRegNum<[56]>; +def P9 : AArch64Reg<9, "p9">, DwarfRegNum<[57]>; +def P10 : AArch64Reg<10, "p10">, DwarfRegNum<[58]>; +def P11 : AArch64Reg<11, "p11">, DwarfRegNum<[59]>; +def P12 : AArch64Reg<12, "p12">, DwarfRegNum<[60]>; +def P13 : AArch64Reg<13, "p13">, DwarfRegNum<[61]>; +def P14 : AArch64Reg<14, "p14">, DwarfRegNum<[62]>; +def P15 : AArch64Reg<15, "p15">, DwarfRegNum<[63]>; + +// The part of SVE registers that don't overlap Neon registers. +// These are only used as part of clobber lists. 
+def Z0_HI : AArch64Reg<0, "z0_hi">; +def Z1_HI : AArch64Reg<1, "z1_hi">; +def Z2_HI : AArch64Reg<2, "z2_hi">; +def Z3_HI : AArch64Reg<3, "z3_hi">; +def Z4_HI : AArch64Reg<4, "z4_hi">; +def Z5_HI : AArch64Reg<5, "z5_hi">; +def Z6_HI : AArch64Reg<6, "z6_hi">; +def Z7_HI : AArch64Reg<7, "z7_hi">; +def Z8_HI : AArch64Reg<8, "z8_hi">; +def Z9_HI : AArch64Reg<9, "z9_hi">; +def Z10_HI : AArch64Reg<10, "z10_hi">; +def Z11_HI : AArch64Reg<11, "z11_hi">; +def Z12_HI : AArch64Reg<12, "z12_hi">; +def Z13_HI : AArch64Reg<13, "z13_hi">; +def Z14_HI : AArch64Reg<14, "z14_hi">; +def Z15_HI : AArch64Reg<15, "z15_hi">; +def Z16_HI : AArch64Reg<16, "z16_hi">; +def Z17_HI : AArch64Reg<17, "z17_hi">; +def Z18_HI : AArch64Reg<18, "z18_hi">; +def Z19_HI : AArch64Reg<19, "z19_hi">; +def Z20_HI : AArch64Reg<20, "z20_hi">; +def Z21_HI : AArch64Reg<21, "z21_hi">; +def Z22_HI : AArch64Reg<22, "z22_hi">; +def Z23_HI : AArch64Reg<23, "z23_hi">; +def Z24_HI : AArch64Reg<24, "z24_hi">; +def Z25_HI : AArch64Reg<25, "z25_hi">; +def Z26_HI : AArch64Reg<26, "z26_hi">; +def Z27_HI : AArch64Reg<27, "z27_hi">; +def Z28_HI : AArch64Reg<28, "z28_hi">; +def Z29_HI : AArch64Reg<29, "z29_hi">; +def Z30_HI : AArch64Reg<30, "z30_hi">; +def Z31_HI : AArch64Reg<31, "z31_hi">; + +// SVE variable-size vector registers +let SubRegIndices = [zsub,zsub_hi] in { +def Z0 : AArch64Reg<0, "z0", [Q0, Z0_HI]>, DwarfRegNum<[96]>; +def Z1 : AArch64Reg<1, "z1", [Q1, Z1_HI]>, DwarfRegNum<[97]>; +def Z2 : AArch64Reg<2, "z2", [Q2, Z2_HI]>, DwarfRegNum<[98]>; +def Z3 : AArch64Reg<3, "z3", [Q3, Z3_HI]>, DwarfRegNum<[99]>; +def Z4 : AArch64Reg<4, "z4", [Q4, Z4_HI]>, DwarfRegNum<[100]>; +def Z5 : AArch64Reg<5, "z5", [Q5, Z5_HI]>, DwarfRegNum<[101]>; +def Z6 : AArch64Reg<6, "z6", [Q6, Z6_HI]>, DwarfRegNum<[102]>; +def Z7 : AArch64Reg<7, "z7", [Q7, Z7_HI]>, DwarfRegNum<[103]>; +def Z8 : AArch64Reg<8, "z8", [Q8, Z8_HI]>, DwarfRegNum<[104]>; +def Z9 : AArch64Reg<9, "z9", [Q9, Z9_HI]>, DwarfRegNum<[105]>; +def Z10 : AArch64Reg<10, "z10", [Q10, Z10_HI]>, DwarfRegNum<[106]>; +def Z11 : AArch64Reg<11, "z11", [Q11, Z11_HI]>, DwarfRegNum<[107]>; +def Z12 : AArch64Reg<12, "z12", [Q12, Z12_HI]>, DwarfRegNum<[108]>; +def Z13 : AArch64Reg<13, "z13", [Q13, Z13_HI]>, DwarfRegNum<[109]>; +def Z14 : AArch64Reg<14, "z14", [Q14, Z14_HI]>, DwarfRegNum<[110]>; +def Z15 : AArch64Reg<15, "z15", [Q15, Z15_HI]>, DwarfRegNum<[111]>; +def Z16 : AArch64Reg<16, "z16", [Q16, Z16_HI]>, DwarfRegNum<[112]>; +def Z17 : AArch64Reg<17, "z17", [Q17, Z17_HI]>, DwarfRegNum<[113]>; +def Z18 : AArch64Reg<18, "z18", [Q18, Z18_HI]>, DwarfRegNum<[114]>; +def Z19 : AArch64Reg<19, "z19", [Q19, Z19_HI]>, DwarfRegNum<[115]>; +def Z20 : AArch64Reg<20, "z20", [Q20, Z20_HI]>, DwarfRegNum<[116]>; +def Z21 : AArch64Reg<21, "z21", [Q21, Z21_HI]>, DwarfRegNum<[117]>; +def Z22 : AArch64Reg<22, "z22", [Q22, Z22_HI]>, DwarfRegNum<[118]>; +def Z23 : AArch64Reg<23, "z23", [Q23, Z23_HI]>, DwarfRegNum<[119]>; +def Z24 : AArch64Reg<24, "z24", [Q24, Z24_HI]>, DwarfRegNum<[120]>; +def Z25 : AArch64Reg<25, "z25", [Q25, Z25_HI]>, DwarfRegNum<[121]>; +def Z26 : AArch64Reg<26, "z26", [Q26, Z26_HI]>, DwarfRegNum<[122]>; +def Z27 : AArch64Reg<27, "z27", [Q27, Z27_HI]>, DwarfRegNum<[123]>; +def Z28 : AArch64Reg<28, "z28", [Q28, Z28_HI]>, DwarfRegNum<[124]>; +def Z29 : AArch64Reg<29, "z29", [Q29, Z29_HI]>, DwarfRegNum<[125]>; +def Z30 : AArch64Reg<30, "z30", [Q30, Z30_HI]>, DwarfRegNum<[126]>; +def Z31 : AArch64Reg<31, "z31", [Q31, Z31_HI]>, DwarfRegNum<[127]>; +} + +class SVERegOp <string Suffix, AsmOperandClass C, + 
RegisterClass RC> : RegisterOperand<RC> { + let PrintMethod = !if(!eq(Suffix, ""), + "printSVERegOp<>", + "printSVERegOp<'" # Suffix # "'>"); + let ParserMatchClass = C; +} + +class PPRRegOp <string Suffix, AsmOperandClass C, + RegisterClass RC> : SVERegOp<Suffix, C, RC> {} +class ZPRRegOp <string Suffix, AsmOperandClass C, + RegisterClass RC> : SVERegOp<Suffix, C, RC> {} + +//****************************************************************************** + +// SVE predicate register class. +def PPR : RegisterClass<"AArch64", + [nxv16i1, nxv8i1, nxv4i1, nxv2i1], + 16, (sequence "P%u", 0, 15)> { + let Size = 16; +} + +class PPRAsmOperand <string name, int Width>: AsmOperandClass { + let Name = "SVE" # name # "Reg"; + let PredicateMethod = "isSVEVectorRegOfWidth<" + # Width # ", AArch64::PPRRegClassID>"; + let DiagnosticType = "InvalidSVE" # name # "Reg"; + let RenderMethod = "addRegOperands"; + let ParserMethod = "tryParseSVEPredicateVector"; +} + +def PPRAsmOpAny : PPRAsmOperand<"PredicateAny", -1>; +def PPRAsmOp8 : PPRAsmOperand<"PredicateB", 8>; +def PPRAsmOp16 : PPRAsmOperand<"PredicateH", 16>; +def PPRAsmOp32 : PPRAsmOperand<"PredicateS", 32>; +def PPRAsmOp64 : PPRAsmOperand<"PredicateD", 64>; + +def PPRAny : PPRRegOp<"", PPRAsmOpAny, PPR>; +def PPR8 : PPRRegOp<"b", PPRAsmOp8, PPR>; +def PPR16 : PPRRegOp<"h", PPRAsmOp16, PPR>; +def PPR32 : PPRRegOp<"s", PPRAsmOp32, PPR>; +def PPR64 : PPRRegOp<"d", PPRAsmOp64, PPR>; + +//****************************************************************************** + +// SVE vector register class +def ZPR : RegisterClass<"AArch64", + [nxv16i8, nxv8i16, nxv4i32, nxv2i64, + nxv2f16, nxv4f16, nxv8f16, + nxv1f32, nxv2f32, nxv4f32, + nxv1f64, nxv2f64], + 128, (sequence "Z%u", 0, 31)> { + let Size = 128; +} + +class ZPRAsmOperand <string name, int Width>: AsmOperandClass { + let Name = "SVE" # name # "Reg"; + let PredicateMethod = "isSVEVectorRegOfWidth<" + # Width # ", AArch64::ZPRRegClassID>"; + let RenderMethod = "addRegOperands"; + let ParserMethod = "tryParseSVEDataVector<" + # !if(!eq(Width, -1), "false", "true") # ">"; +} + +def ZPRAsmOpAny : ZPRAsmOperand<"VectorAny", -1>; +def ZPRAsmOp8 : ZPRAsmOperand<"VectorB", 8>; +def ZPRAsmOp16 : ZPRAsmOperand<"VectorH", 16>; +def ZPRAsmOp32 : ZPRAsmOperand<"VectorS", 32>; +def ZPRAsmOp64 : ZPRAsmOperand<"VectorD", 64>; +def ZPRAsmOp128 : ZPRAsmOperand<"VectorQ", 128>; + +def ZPRAny : ZPRRegOp<"", ZPRAsmOpAny, ZPR>; +def ZPR8 : ZPRRegOp<"b", ZPRAsmOp8, ZPR>; +def ZPR16 : ZPRRegOp<"h", ZPRAsmOp16, ZPR>; +def ZPR32 : ZPRRegOp<"s", ZPRAsmOp32, ZPR>; +def ZPR64 : ZPRRegOp<"d", ZPRAsmOp64, ZPR>; +def ZPR128 : ZPRRegOp<"q", ZPRAsmOp128, ZPR>; diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp b/contrib/llvm/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp new file mode 100644 index 000000000000..e1851875abc5 --- /dev/null +++ b/contrib/llvm/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp @@ -0,0 +1,741 @@ +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains a pass that performs optimization on SIMD instructions +// with high latency by splitting them into more efficient series of +// instructions. +// +// 1. Rewrite certain SIMD instructions with vector element due to their +// inefficiency on some targets. 
+// +// For example: +// fmla v0.4s, v1.4s, v2.s[1] +// +// Is rewritten into: +// dup v3.4s, v2.s[1] +// fmla v0.4s, v1.4s, v3.4s +// +// 2. Rewrite interleaved memory access instructions due to their +// inefficiency on some targets. +// +// For example: +// st2 {v0.4s, v1.4s}, addr +// +// Is rewritten into: +// zip1 v2.4s, v0.4s, v1.4s +// zip2 v3.4s, v0.4s, v1.4s +// stp q2, q3, addr +// +//===----------------------------------------------------------------------===// + +#include "AArch64InstrInfo.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/CodeGen/TargetSchedule.h" +#include "llvm/CodeGen/TargetSubtargetInfo.h" +#include "llvm/MC/MCInstrDesc.h" +#include "llvm/MC/MCSchedule.h" +#include "llvm/Pass.h" +#include <unordered_map> + +using namespace llvm; + +#define DEBUG_TYPE "aarch64-simdinstr-opt" + +STATISTIC(NumModifiedInstr, + "Number of SIMD instructions modified"); + +#define AARCH64_VECTOR_BY_ELEMENT_OPT_NAME \ + "AArch64 SIMD instructions optimization pass" + +namespace { + +struct AArch64SIMDInstrOpt : public MachineFunctionPass { + static char ID; + + const TargetInstrInfo *TII; + MachineRegisterInfo *MRI; + TargetSchedModel SchedModel; + + // The two maps below are used to cache decisions instead of recomputing: + // This is used to cache instruction replacement decisions within function + // units and across function units. + std::map<std::pair<unsigned, std::string>, bool> SIMDInstrTable; + // This is used to cache the decision of whether to leave the interleaved + // store instructions replacement pass early or not for a particular target. + std::unordered_map<std::string, bool> InterlEarlyExit; + + typedef enum { + VectorElem, + Interleave + } Subpass; + + // Instruction represented by OrigOpc is replaced by instructions in ReplOpc. 
+ struct InstReplInfo { + unsigned OrigOpc; + std::vector<unsigned> ReplOpc; + const TargetRegisterClass RC; + }; + +#define RuleST2(OpcOrg, OpcR0, OpcR1, OpcR2, RC) \ + {OpcOrg, {OpcR0, OpcR1, OpcR2}, RC} +#define RuleST4(OpcOrg, OpcR0, OpcR1, OpcR2, OpcR3, OpcR4, OpcR5, OpcR6, \ + OpcR7, OpcR8, OpcR9, RC) \ + {OpcOrg, \ + {OpcR0, OpcR1, OpcR2, OpcR3, OpcR4, OpcR5, OpcR6, OpcR7, OpcR8, OpcR9}, RC} + + // The Instruction Replacement Table: + std::vector<InstReplInfo> IRT = { + // ST2 instructions + RuleST2(AArch64::ST2Twov2d, AArch64::ZIP1v2i64, AArch64::ZIP2v2i64, + AArch64::STPQi, AArch64::FPR128RegClass), + RuleST2(AArch64::ST2Twov4s, AArch64::ZIP1v4i32, AArch64::ZIP2v4i32, + AArch64::STPQi, AArch64::FPR128RegClass), + RuleST2(AArch64::ST2Twov2s, AArch64::ZIP1v2i32, AArch64::ZIP2v2i32, + AArch64::STPDi, AArch64::FPR64RegClass), + RuleST2(AArch64::ST2Twov8h, AArch64::ZIP1v8i16, AArch64::ZIP2v8i16, + AArch64::STPQi, AArch64::FPR128RegClass), + RuleST2(AArch64::ST2Twov4h, AArch64::ZIP1v4i16, AArch64::ZIP2v4i16, + AArch64::STPDi, AArch64::FPR64RegClass), + RuleST2(AArch64::ST2Twov16b, AArch64::ZIP1v16i8, AArch64::ZIP2v16i8, + AArch64::STPQi, AArch64::FPR128RegClass), + RuleST2(AArch64::ST2Twov8b, AArch64::ZIP1v8i8, AArch64::ZIP2v8i8, + AArch64::STPDi, AArch64::FPR64RegClass), + // ST4 instructions + RuleST4(AArch64::ST4Fourv2d, AArch64::ZIP1v2i64, AArch64::ZIP2v2i64, + AArch64::ZIP1v2i64, AArch64::ZIP2v2i64, AArch64::ZIP1v2i64, + AArch64::ZIP2v2i64, AArch64::ZIP1v2i64, AArch64::ZIP2v2i64, + AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass), + RuleST4(AArch64::ST4Fourv4s, AArch64::ZIP1v4i32, AArch64::ZIP2v4i32, + AArch64::ZIP1v4i32, AArch64::ZIP2v4i32, AArch64::ZIP1v4i32, + AArch64::ZIP2v4i32, AArch64::ZIP1v4i32, AArch64::ZIP2v4i32, + AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass), + RuleST4(AArch64::ST4Fourv2s, AArch64::ZIP1v2i32, AArch64::ZIP2v2i32, + AArch64::ZIP1v2i32, AArch64::ZIP2v2i32, AArch64::ZIP1v2i32, + AArch64::ZIP2v2i32, AArch64::ZIP1v2i32, AArch64::ZIP2v2i32, + AArch64::STPDi, AArch64::STPDi, AArch64::FPR64RegClass), + RuleST4(AArch64::ST4Fourv8h, AArch64::ZIP1v8i16, AArch64::ZIP2v8i16, + AArch64::ZIP1v8i16, AArch64::ZIP2v8i16, AArch64::ZIP1v8i16, + AArch64::ZIP2v8i16, AArch64::ZIP1v8i16, AArch64::ZIP2v8i16, + AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass), + RuleST4(AArch64::ST4Fourv4h, AArch64::ZIP1v4i16, AArch64::ZIP2v4i16, + AArch64::ZIP1v4i16, AArch64::ZIP2v4i16, AArch64::ZIP1v4i16, + AArch64::ZIP2v4i16, AArch64::ZIP1v4i16, AArch64::ZIP2v4i16, + AArch64::STPDi, AArch64::STPDi, AArch64::FPR64RegClass), + RuleST4(AArch64::ST4Fourv16b, AArch64::ZIP1v16i8, AArch64::ZIP2v16i8, + AArch64::ZIP1v16i8, AArch64::ZIP2v16i8, AArch64::ZIP1v16i8, + AArch64::ZIP2v16i8, AArch64::ZIP1v16i8, AArch64::ZIP2v16i8, + AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass), + RuleST4(AArch64::ST4Fourv8b, AArch64::ZIP1v8i8, AArch64::ZIP2v8i8, + AArch64::ZIP1v8i8, AArch64::ZIP2v8i8, AArch64::ZIP1v8i8, + AArch64::ZIP2v8i8, AArch64::ZIP1v8i8, AArch64::ZIP2v8i8, + AArch64::STPDi, AArch64::STPDi, AArch64::FPR64RegClass) + }; + + // A costly instruction is replaced in this work by N efficient instructions + // The maximum of N is curently 10 and it is for ST4 case. 
+ static const unsigned MaxNumRepl = 10; + + AArch64SIMDInstrOpt() : MachineFunctionPass(ID) { + initializeAArch64SIMDInstrOptPass(*PassRegistry::getPassRegistry()); + } + + /// Based only on latency of instructions, determine if it is cost efficient + /// to replace the instruction InstDesc by the instructions stored in the + /// array InstDescRepl. + /// Return true if replacement is expected to be faster. + bool shouldReplaceInst(MachineFunction *MF, const MCInstrDesc *InstDesc, + SmallVectorImpl<const MCInstrDesc*> &ReplInstrMCID); + + /// Determine if we need to exit the instruction replacement optimization + /// passes early. This makes sure that no compile time is spent in this pass + /// for targets with no need for any of these optimizations. + /// Return true if early exit of the pass is recommended. + bool shouldExitEarly(MachineFunction *MF, Subpass SP); + + /// Check whether an equivalent DUP instruction has already been + /// created or not. + /// Return true when the DUP instruction already exists. In this case, + /// DestReg will point to the destination of the already created DUP. + bool reuseDUP(MachineInstr &MI, unsigned DupOpcode, unsigned SrcReg, + unsigned LaneNumber, unsigned *DestReg) const; + + /// Certain SIMD instructions with vector element operand are not efficient. + /// Rewrite them into SIMD instructions with vector operands. This rewrite + /// is driven by the latency of the instructions. + /// Return true if the SIMD instruction is modified. + bool optimizeVectElement(MachineInstr &MI); + + /// Process The REG_SEQUENCE instruction, and extract the source + /// operands of the ST2/4 instruction from it. + /// Example of such instructions. + /// %dest = REG_SEQUENCE %st2_src1, dsub0, %st2_src2, dsub1; + /// Return true when the instruction is processed successfully. + bool processSeqRegInst(MachineInstr *DefiningMI, unsigned* StReg, + unsigned* StRegKill, unsigned NumArg) const; + + /// Load/Store Interleaving instructions are not always beneficial. + /// Replace them by ZIP instructionand classical load/store. + /// Return true if the SIMD instruction is modified. + bool optimizeLdStInterleave(MachineInstr &MI); + + /// Return the number of useful source registers for this + /// instruction (2 for ST2 and 4 for ST4). + unsigned determineSrcReg(MachineInstr &MI) const; + + bool runOnMachineFunction(MachineFunction &Fn) override; + + StringRef getPassName() const override { + return AARCH64_VECTOR_BY_ELEMENT_OPT_NAME; + } +}; + +char AArch64SIMDInstrOpt::ID = 0; + +} // end anonymous namespace + +INITIALIZE_PASS(AArch64SIMDInstrOpt, "aarch64-simdinstr-opt", + AARCH64_VECTOR_BY_ELEMENT_OPT_NAME, false, false) + +/// Based only on latency of instructions, determine if it is cost efficient +/// to replace the instruction InstDesc by the instructions stored in the +/// array InstDescRepl. +/// Return true if replacement is expected to be faster. +bool AArch64SIMDInstrOpt:: +shouldReplaceInst(MachineFunction *MF, const MCInstrDesc *InstDesc, + SmallVectorImpl<const MCInstrDesc*> &InstDescRepl) { + // Check if replacement decision is already available in the cached table. + // if so, return it. 
+ std::string Subtarget = SchedModel.getSubtargetInfo()->getCPU(); + auto InstID = std::make_pair(InstDesc->getOpcode(), Subtarget); + if (SIMDInstrTable.find(InstID) != SIMDInstrTable.end()) + return SIMDInstrTable[InstID]; + + unsigned SCIdx = InstDesc->getSchedClass(); + const MCSchedClassDesc *SCDesc = + SchedModel.getMCSchedModel()->getSchedClassDesc(SCIdx); + + // If a target does not define resources for the instructions + // of interest, then return false for no replacement. + const MCSchedClassDesc *SCDescRepl; + if (!SCDesc->isValid() || SCDesc->isVariant()) + { + SIMDInstrTable[InstID] = false; + return false; + } + for (auto IDesc : InstDescRepl) + { + SCDescRepl = SchedModel.getMCSchedModel()->getSchedClassDesc( + IDesc->getSchedClass()); + if (!SCDescRepl->isValid() || SCDescRepl->isVariant()) + { + SIMDInstrTable[InstID] = false; + return false; + } + } + + // Replacement cost. + unsigned ReplCost = 0; + for (auto IDesc :InstDescRepl) + ReplCost += SchedModel.computeInstrLatency(IDesc->getOpcode()); + + if (SchedModel.computeInstrLatency(InstDesc->getOpcode()) > ReplCost) + { + SIMDInstrTable[InstID] = true; + return true; + } + else + { + SIMDInstrTable[InstID] = false; + return false; + } +} + +/// Determine if we need to exit this pass for a kind of instruction replacement +/// early. This makes sure that no compile time is spent in this pass for +/// targets with no need for any of these optimizations beyond performing this +/// check. +/// Return true if early exit of this pass for a kind of instruction +/// replacement is recommended for a target. +bool AArch64SIMDInstrOpt::shouldExitEarly(MachineFunction *MF, Subpass SP) { + const MCInstrDesc* OriginalMCID; + SmallVector<const MCInstrDesc*, MaxNumRepl> ReplInstrMCID; + + switch (SP) { + // For this optimization, check by comparing the latency of a representative + // instruction to that of the replacement instructions. + // TODO: check for all concerned instructions. + case VectorElem: + OriginalMCID = &TII->get(AArch64::FMLAv4i32_indexed); + ReplInstrMCID.push_back(&TII->get(AArch64::DUPv4i32lane)); + ReplInstrMCID.push_back(&TII->get(AArch64::FMLAv4f32)); + if (shouldReplaceInst(MF, OriginalMCID, ReplInstrMCID)) + return false; + break; + + // For this optimization, check for all concerned instructions. + case Interleave: + std::string Subtarget = SchedModel.getSubtargetInfo()->getCPU(); + if (InterlEarlyExit.find(Subtarget) != InterlEarlyExit.end()) + return InterlEarlyExit[Subtarget]; + + for (auto &I : IRT) { + OriginalMCID = &TII->get(I.OrigOpc); + for (auto &Repl : I.ReplOpc) + ReplInstrMCID.push_back(&TII->get(Repl)); + if (shouldReplaceInst(MF, OriginalMCID, ReplInstrMCID)) { + InterlEarlyExit[Subtarget] = false; + return false; + } + ReplInstrMCID.clear(); + } + InterlEarlyExit[Subtarget] = true; + break; + } + + return true; +} + +/// Check whether an equivalent DUP instruction has already been +/// created or not. +/// Return true when the DUP instruction already exists. In this case, +/// DestReg will point to the destination of the already created DUP. 
+bool AArch64SIMDInstrOpt::reuseDUP(MachineInstr &MI, unsigned DupOpcode, + unsigned SrcReg, unsigned LaneNumber, + unsigned *DestReg) const { + for (MachineBasicBlock::iterator MII = MI, MIE = MI.getParent()->begin(); + MII != MIE;) { + MII--; + MachineInstr *CurrentMI = &*MII; + + if (CurrentMI->getOpcode() == DupOpcode && + CurrentMI->getNumOperands() == 3 && + CurrentMI->getOperand(1).getReg() == SrcReg && + CurrentMI->getOperand(2).getImm() == LaneNumber) { + *DestReg = CurrentMI->getOperand(0).getReg(); + return true; + } + } + + return false; +} + +/// Certain SIMD instructions with vector element operand are not efficient. +/// Rewrite them into SIMD instructions with vector operands. This rewrite +/// is driven by the latency of the instructions. +/// The instruction of concerns are for the time being FMLA, FMLS, FMUL, +/// and FMULX and hence they are hardcoded. +/// +/// For example: +/// fmla v0.4s, v1.4s, v2.s[1] +/// +/// Is rewritten into +/// dup v3.4s, v2.s[1] // DUP not necessary if redundant +/// fmla v0.4s, v1.4s, v3.4s +/// +/// Return true if the SIMD instruction is modified. +bool AArch64SIMDInstrOpt::optimizeVectElement(MachineInstr &MI) { + const MCInstrDesc *MulMCID, *DupMCID; + const TargetRegisterClass *RC = &AArch64::FPR128RegClass; + + switch (MI.getOpcode()) { + default: + return false; + + // 4X32 instructions + case AArch64::FMLAv4i32_indexed: + DupMCID = &TII->get(AArch64::DUPv4i32lane); + MulMCID = &TII->get(AArch64::FMLAv4f32); + break; + case AArch64::FMLSv4i32_indexed: + DupMCID = &TII->get(AArch64::DUPv4i32lane); + MulMCID = &TII->get(AArch64::FMLSv4f32); + break; + case AArch64::FMULXv4i32_indexed: + DupMCID = &TII->get(AArch64::DUPv4i32lane); + MulMCID = &TII->get(AArch64::FMULXv4f32); + break; + case AArch64::FMULv4i32_indexed: + DupMCID = &TII->get(AArch64::DUPv4i32lane); + MulMCID = &TII->get(AArch64::FMULv4f32); + break; + + // 2X64 instructions + case AArch64::FMLAv2i64_indexed: + DupMCID = &TII->get(AArch64::DUPv2i64lane); + MulMCID = &TII->get(AArch64::FMLAv2f64); + break; + case AArch64::FMLSv2i64_indexed: + DupMCID = &TII->get(AArch64::DUPv2i64lane); + MulMCID = &TII->get(AArch64::FMLSv2f64); + break; + case AArch64::FMULXv2i64_indexed: + DupMCID = &TII->get(AArch64::DUPv2i64lane); + MulMCID = &TII->get(AArch64::FMULXv2f64); + break; + case AArch64::FMULv2i64_indexed: + DupMCID = &TII->get(AArch64::DUPv2i64lane); + MulMCID = &TII->get(AArch64::FMULv2f64); + break; + + // 2X32 instructions + case AArch64::FMLAv2i32_indexed: + RC = &AArch64::FPR64RegClass; + DupMCID = &TII->get(AArch64::DUPv2i32lane); + MulMCID = &TII->get(AArch64::FMLAv2f32); + break; + case AArch64::FMLSv2i32_indexed: + RC = &AArch64::FPR64RegClass; + DupMCID = &TII->get(AArch64::DUPv2i32lane); + MulMCID = &TII->get(AArch64::FMLSv2f32); + break; + case AArch64::FMULXv2i32_indexed: + RC = &AArch64::FPR64RegClass; + DupMCID = &TII->get(AArch64::DUPv2i32lane); + MulMCID = &TII->get(AArch64::FMULXv2f32); + break; + case AArch64::FMULv2i32_indexed: + RC = &AArch64::FPR64RegClass; + DupMCID = &TII->get(AArch64::DUPv2i32lane); + MulMCID = &TII->get(AArch64::FMULv2f32); + break; + } + + SmallVector<const MCInstrDesc*, 2> ReplInstrMCID; + ReplInstrMCID.push_back(DupMCID); + ReplInstrMCID.push_back(MulMCID); + if (!shouldReplaceInst(MI.getParent()->getParent(), &TII->get(MI.getOpcode()), + ReplInstrMCID)) + return false; + + const DebugLoc &DL = MI.getDebugLoc(); + MachineBasicBlock &MBB = *MI.getParent(); + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + + // Get the 
operands of the current SIMD arithmetic instruction. + unsigned MulDest = MI.getOperand(0).getReg(); + unsigned SrcReg0 = MI.getOperand(1).getReg(); + unsigned Src0IsKill = getKillRegState(MI.getOperand(1).isKill()); + unsigned SrcReg1 = MI.getOperand(2).getReg(); + unsigned Src1IsKill = getKillRegState(MI.getOperand(2).isKill()); + unsigned DupDest; + + // Instructions of interest have either 4 or 5 operands. + if (MI.getNumOperands() == 5) { + unsigned SrcReg2 = MI.getOperand(3).getReg(); + unsigned Src2IsKill = getKillRegState(MI.getOperand(3).isKill()); + unsigned LaneNumber = MI.getOperand(4).getImm(); + // Create a new DUP instruction. Note that if an equivalent DUP instruction + // has already been created before, then use that one instead of creating + // a new one. + if (!reuseDUP(MI, DupMCID->getOpcode(), SrcReg2, LaneNumber, &DupDest)) { + DupDest = MRI.createVirtualRegister(RC); + BuildMI(MBB, MI, DL, *DupMCID, DupDest) + .addReg(SrcReg2, Src2IsKill) + .addImm(LaneNumber); + } + BuildMI(MBB, MI, DL, *MulMCID, MulDest) + .addReg(SrcReg0, Src0IsKill) + .addReg(SrcReg1, Src1IsKill) + .addReg(DupDest, Src2IsKill); + } else if (MI.getNumOperands() == 4) { + unsigned LaneNumber = MI.getOperand(3).getImm(); + if (!reuseDUP(MI, DupMCID->getOpcode(), SrcReg1, LaneNumber, &DupDest)) { + DupDest = MRI.createVirtualRegister(RC); + BuildMI(MBB, MI, DL, *DupMCID, DupDest) + .addReg(SrcReg1, Src1IsKill) + .addImm(LaneNumber); + } + BuildMI(MBB, MI, DL, *MulMCID, MulDest) + .addReg(SrcReg0, Src0IsKill) + .addReg(DupDest, Src1IsKill); + } else { + return false; + } + + ++NumModifiedInstr; + return true; +} + +/// Load/Store Interleaving instructions are not always beneficial. +/// Replace them by ZIP instructions and classical load/store. +/// +/// For example: +/// st2 {v0.4s, v1.4s}, addr +/// +/// Is rewritten into: +/// zip1 v2.4s, v0.4s, v1.4s +/// zip2 v3.4s, v0.4s, v1.4s +/// stp q2, q3, addr +// +/// For example: +/// st4 {v0.4s, v1.4s, v2.4s, v3.4s}, addr +/// +/// Is rewritten into: +/// zip1 v4.4s, v0.4s, v2.4s +/// zip2 v5.4s, v0.4s, v2.4s +/// zip1 v6.4s, v1.4s, v3.4s +/// zip2 v7.4s, v1.4s, v3.4s +/// zip1 v8.4s, v4.4s, v6.4s +/// zip2 v9.4s, v4.4s, v6.4s +/// zip1 v10.4s, v5.4s, v7.4s +/// zip2 v11.4s, v5.4s, v7.4s +/// stp q8, q9, addr +/// stp q10, q11, addr+32 +/// +/// Currently only instructions related to ST2 and ST4 are considered. +/// Other may be added later. +/// Return true if the SIMD instruction is modified. +bool AArch64SIMDInstrOpt::optimizeLdStInterleave(MachineInstr &MI) { + + unsigned SeqReg, AddrReg; + unsigned StReg[4], StRegKill[4]; + MachineInstr *DefiningMI; + const DebugLoc &DL = MI.getDebugLoc(); + MachineBasicBlock &MBB = *MI.getParent(); + SmallVector<unsigned, MaxNumRepl> ZipDest; + SmallVector<const MCInstrDesc*, MaxNumRepl> ReplInstrMCID; + + // If current instruction matches any of the rewriting rules, then + // gather information about parameters of the new instructions. + bool Match = false; + for (auto &I : IRT) { + if (MI.getOpcode() == I.OrigOpc) { + SeqReg = MI.getOperand(0).getReg(); + AddrReg = MI.getOperand(1).getReg(); + DefiningMI = MRI->getUniqueVRegDef(SeqReg); + unsigned NumReg = determineSrcReg(MI); + if (!processSeqRegInst(DefiningMI, StReg, StRegKill, NumReg)) + return false; + + for (auto &Repl : I.ReplOpc) { + ReplInstrMCID.push_back(&TII->get(Repl)); + // Generate destination registers but only for non-store instruction. 
+ if (Repl != AArch64::STPQi && Repl != AArch64::STPDi) + ZipDest.push_back(MRI->createVirtualRegister(&I.RC)); + } + Match = true; + break; + } + } + + if (!Match) + return false; + + // Determine if it is profitable to replace MI by the series of instructions + // represented in ReplInstrMCID. + if (!shouldReplaceInst(MI.getParent()->getParent(), &TII->get(MI.getOpcode()), + ReplInstrMCID)) + return false; + + // Generate the replacement instructions composed of ZIP1, ZIP2, and STP (at + // this point, the code generation is hardcoded and does not rely on the IRT + // table used above given that code generation for ST2 replacement is somewhat + // different than for ST4 replacement. We could have added more info into the + // table related to how we build new instructions but we may be adding more + // complexity with that). + switch (MI.getOpcode()) { + default: + return false; + + case AArch64::ST2Twov16b: + case AArch64::ST2Twov8b: + case AArch64::ST2Twov8h: + case AArch64::ST2Twov4h: + case AArch64::ST2Twov4s: + case AArch64::ST2Twov2s: + case AArch64::ST2Twov2d: + // ZIP instructions + BuildMI(MBB, MI, DL, *ReplInstrMCID[0], ZipDest[0]) + .addReg(StReg[0]) + .addReg(StReg[1]); + BuildMI(MBB, MI, DL, *ReplInstrMCID[1], ZipDest[1]) + .addReg(StReg[0], StRegKill[0]) + .addReg(StReg[1], StRegKill[1]); + // STP instructions + BuildMI(MBB, MI, DL, *ReplInstrMCID[2]) + .addReg(ZipDest[0]) + .addReg(ZipDest[1]) + .addReg(AddrReg) + .addImm(0); + break; + + case AArch64::ST4Fourv16b: + case AArch64::ST4Fourv8b: + case AArch64::ST4Fourv8h: + case AArch64::ST4Fourv4h: + case AArch64::ST4Fourv4s: + case AArch64::ST4Fourv2s: + case AArch64::ST4Fourv2d: + // ZIP instructions + BuildMI(MBB, MI, DL, *ReplInstrMCID[0], ZipDest[0]) + .addReg(StReg[0]) + .addReg(StReg[2]); + BuildMI(MBB, MI, DL, *ReplInstrMCID[1], ZipDest[1]) + .addReg(StReg[0], StRegKill[0]) + .addReg(StReg[2], StRegKill[2]); + BuildMI(MBB, MI, DL, *ReplInstrMCID[2], ZipDest[2]) + .addReg(StReg[1]) + .addReg(StReg[3]); + BuildMI(MBB, MI, DL, *ReplInstrMCID[3], ZipDest[3]) + .addReg(StReg[1], StRegKill[1]) + .addReg(StReg[3], StRegKill[3]); + BuildMI(MBB, MI, DL, *ReplInstrMCID[4], ZipDest[4]) + .addReg(ZipDest[0]) + .addReg(ZipDest[2]); + BuildMI(MBB, MI, DL, *ReplInstrMCID[5], ZipDest[5]) + .addReg(ZipDest[0]) + .addReg(ZipDest[2]); + BuildMI(MBB, MI, DL, *ReplInstrMCID[6], ZipDest[6]) + .addReg(ZipDest[1]) + .addReg(ZipDest[3]); + BuildMI(MBB, MI, DL, *ReplInstrMCID[7], ZipDest[7]) + .addReg(ZipDest[1]) + .addReg(ZipDest[3]); + // stp instructions + BuildMI(MBB, MI, DL, *ReplInstrMCID[8]) + .addReg(ZipDest[4]) + .addReg(ZipDest[5]) + .addReg(AddrReg) + .addImm(0); + BuildMI(MBB, MI, DL, *ReplInstrMCID[9]) + .addReg(ZipDest[6]) + .addReg(ZipDest[7]) + .addReg(AddrReg) + .addImm(2); + break; + } + + ++NumModifiedInstr; + return true; +} + +/// Process The REG_SEQUENCE instruction, and extract the source +/// operands of the ST2/4 instruction from it. +/// Example of such instruction. +/// %dest = REG_SEQUENCE %st2_src1, dsub0, %st2_src2, dsub1; +/// Return true when the instruction is processed successfully. 
+bool AArch64SIMDInstrOpt::processSeqRegInst(MachineInstr *DefiningMI, + unsigned* StReg, unsigned* StRegKill, unsigned NumArg) const { + assert (DefiningMI != NULL); + if (DefiningMI->getOpcode() != AArch64::REG_SEQUENCE) + return false; + + for (unsigned i=0; i<NumArg; i++) { + StReg[i] = DefiningMI->getOperand(2*i+1).getReg(); + StRegKill[i] = getKillRegState(DefiningMI->getOperand(2*i+1).isKill()); + + // Sanity check for the other arguments. + if (DefiningMI->getOperand(2*i+2).isImm()) { + switch (DefiningMI->getOperand(2*i+2).getImm()) { + default: + return false; + + case AArch64::dsub0: + case AArch64::dsub1: + case AArch64::dsub2: + case AArch64::dsub3: + case AArch64::qsub0: + case AArch64::qsub1: + case AArch64::qsub2: + case AArch64::qsub3: + break; + } + } + else + return false; + } + return true; +} + +/// Return the number of useful source registers for this instruction +/// (2 for ST2 and 4 for ST4). +unsigned AArch64SIMDInstrOpt::determineSrcReg(MachineInstr &MI) const { + switch (MI.getOpcode()) { + default: + llvm_unreachable("Unsupported instruction for this pass"); + + case AArch64::ST2Twov16b: + case AArch64::ST2Twov8b: + case AArch64::ST2Twov8h: + case AArch64::ST2Twov4h: + case AArch64::ST2Twov4s: + case AArch64::ST2Twov2s: + case AArch64::ST2Twov2d: + return 2; + + case AArch64::ST4Fourv16b: + case AArch64::ST4Fourv8b: + case AArch64::ST4Fourv8h: + case AArch64::ST4Fourv4h: + case AArch64::ST4Fourv4s: + case AArch64::ST4Fourv2s: + case AArch64::ST4Fourv2d: + return 4; + } +} + +bool AArch64SIMDInstrOpt::runOnMachineFunction(MachineFunction &MF) { + if (skipFunction(MF.getFunction())) + return false; + + TII = MF.getSubtarget().getInstrInfo(); + MRI = &MF.getRegInfo(); + const TargetSubtargetInfo &ST = MF.getSubtarget(); + const AArch64InstrInfo *AAII = + static_cast<const AArch64InstrInfo *>(ST.getInstrInfo()); + if (!AAII) + return false; + SchedModel.init(ST.getSchedModel(), &ST, AAII); + if (!SchedModel.hasInstrSchedModel()) + return false; + + bool Changed = false; + for (auto OptimizationKind : {VectorElem, Interleave}) { + if (!shouldExitEarly(&MF, OptimizationKind)) { + SmallVector<MachineInstr *, 8> RemoveMIs; + for (MachineBasicBlock &MBB : MF) { + for (MachineBasicBlock::iterator MII = MBB.begin(), MIE = MBB.end(); + MII != MIE;) { + MachineInstr &MI = *MII; + bool InstRewrite; + if (OptimizationKind == VectorElem) + InstRewrite = optimizeVectElement(MI) ; + else + InstRewrite = optimizeLdStInterleave(MI); + if (InstRewrite) { + // Add MI to the list of instructions to be removed given that it + // has been replaced. + RemoveMIs.push_back(&MI); + Changed = true; + } + ++MII; + } + } + for (MachineInstr *MI : RemoveMIs) + MI->eraseFromParent(); + } + } + + return Changed; +} + +/// Returns an instance of the high cost ASIMD instruction replacement +/// optimization pass. +FunctionPass *llvm::createAArch64SIMDInstrOptPass() { + return new AArch64SIMDInstrOpt(); +} diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/contrib/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td new file mode 100644 index 000000000000..bcd7b60875a2 --- /dev/null +++ b/contrib/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -0,0 +1,23 @@ +//=- AArch64SVEInstrInfo.td - AArch64 SVE Instructions -*- tablegen -*-----=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// AArch64 Scalable Vector Extension (SVE) Instruction definitions. +// +//===----------------------------------------------------------------------===// + +let Predicates = [HasSVE] in { + defm ADD_ZZZ : sve_int_bin_cons_arit_0<0b000, "add">; + defm SUB_ZZZ : sve_int_bin_cons_arit_0<0b001, "sub">; + + defm ZIP1_ZZZ : sve_int_perm_bin_perm_zz<0b000, "zip1">; + defm ZIP2_ZZZ : sve_int_perm_bin_perm_zz<0b001, "zip2">; + + defm ZIP1_PPP : sve_int_perm_bin_perm_pp<0b000, "zip1">; + defm ZIP2_PPP : sve_int_perm_bin_perm_pp<0b001, "zip2">; +} diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SchedA53.td b/contrib/llvm/lib/Target/AArch64/AArch64SchedA53.td index 18d000ace94c..90ebd78f4ab9 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64SchedA53.td +++ b/contrib/llvm/lib/Target/AArch64/AArch64SchedA53.td @@ -26,6 +26,8 @@ def CortexA53Model : SchedMachineModel { // Specification - Instruction Timings" // v 1.0 Spreadsheet let CompleteModel = 1; + + list<Predicate> UnsupportedFeatures = [HasSVE]; } diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SchedA57.td b/contrib/llvm/lib/Target/AArch64/AArch64SchedA57.td index 5d1608ef04af..ade03f23f8c7 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64SchedA57.td +++ b/contrib/llvm/lib/Target/AArch64/AArch64SchedA57.td @@ -31,6 +31,8 @@ def CortexA57Model : SchedMachineModel { // experiments and benchmarking data. let LoopMicroOpBufferSize = 16; let CompleteModel = 1; + + list<Predicate> UnsupportedFeatures = [HasSVE]; } //===----------------------------------------------------------------------===// diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SchedCyclone.td b/contrib/llvm/lib/Target/AArch64/AArch64SchedCyclone.td index 9fd3ae6818e5..7a474ba8ef9b 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64SchedCyclone.td +++ b/contrib/llvm/lib/Target/AArch64/AArch64SchedCyclone.td @@ -18,6 +18,8 @@ def CycloneModel : SchedMachineModel { let LoadLatency = 4; // Optimistic load latency. let MispredictPenalty = 16; // 14-19 cycles are typical. let CompleteModel = 1; + + list<Predicate> UnsupportedFeatures = [HasSVE]; } //===----------------------------------------------------------------------===// diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SchedFalkor.td b/contrib/llvm/lib/Target/AArch64/AArch64SchedFalkor.td index 44fd94fc3d48..7277198b585f 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64SchedFalkor.td +++ b/contrib/llvm/lib/Target/AArch64/AArch64SchedFalkor.td @@ -23,6 +23,8 @@ def FalkorModel : SchedMachineModel { let LoadLatency = 3; // Optimistic load latency. let MispredictPenalty = 11; // Minimum branch misprediction penalty. let CompleteModel = 1; + + list<Predicate> UnsupportedFeatures = [HasSVE]; } //===----------------------------------------------------------------------===// diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SchedKryo.td b/contrib/llvm/lib/Target/AArch64/AArch64SchedKryo.td index 4e491a04c78d..ce2afd499afb 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64SchedKryo.td +++ b/contrib/llvm/lib/Target/AArch64/AArch64SchedKryo.td @@ -27,6 +27,8 @@ def KryoModel : SchedMachineModel { // experiments and benchmarking data. 
let LoopMicroOpBufferSize = 16; let CompleteModel = 1; + + list<Predicate> UnsupportedFeatures = [HasSVE]; } //===----------------------------------------------------------------------===// diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SchedM1.td b/contrib/llvm/lib/Target/AArch64/AArch64SchedM1.td index 3b71cf8399a0..91b6ffcd7083 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64SchedM1.td +++ b/contrib/llvm/lib/Target/AArch64/AArch64SchedM1.td @@ -24,6 +24,8 @@ def ExynosM1Model : SchedMachineModel { let LoadLatency = 4; // Optimistic load cases. let MispredictPenalty = 14; // Minimum branch misprediction penalty. let CompleteModel = 1; // Use the default model otherwise. + + list<Predicate> UnsupportedFeatures = [HasSVE]; } //===----------------------------------------------------------------------===// @@ -62,39 +64,98 @@ let SchedModel = ExynosM1Model in { let SchedModel = ExynosM1Model in { //===----------------------------------------------------------------------===// -// Coarse scheduling model for the Exynos-M1. +// Predicates. + +def M1BranchLinkFastPred : SchedPredicate<[{MI->getOpcode() == AArch64::BLR && + MI->getOperand(0).getReg() != AArch64::LR}]>; +def M1ShiftLeftFastPred : SchedPredicate<[{TII->isExynosShiftLeftFast(*MI)}]>; + +//===----------------------------------------------------------------------===// +// Coarse scheduling model. def M1WriteA1 : SchedWriteRes<[M1UnitALU]> { let Latency = 1; } def M1WriteA2 : SchedWriteRes<[M1UnitALU]> { let Latency = 2; } +def M1WriteAA : SchedWriteRes<[M1UnitALU]> { let Latency = 2; + let ResourceCycles = [2]; } +def M1WriteAB : SchedWriteRes<[M1UnitALU, + M1UnitC]> { let Latency = 1; + let NumMicroOps = 2; } +def M1WriteAC : SchedWriteRes<[M1UnitALU, + M1UnitALU, + M1UnitC]> { let Latency = 2; + let NumMicroOps = 3; } +def M1WriteAD : SchedWriteRes<[M1UnitALU, + M1UnitC]> { let Latency = 2; + let NumMicroOps = 2; } +def M1WriteAX : SchedWriteVariant<[SchedVar<M1ShiftLeftFastPred, [M1WriteA1]>, + SchedVar<NoSchedPred, [M1WriteAA]>]>; def M1WriteC1 : SchedWriteRes<[M1UnitC]> { let Latency = 1; } def M1WriteC2 : SchedWriteRes<[M1UnitC]> { let Latency = 2; } -def M1WriteB1 : SchedWriteRes<[M1UnitB]> { let Latency = 1; } - -def M1WriteL5 : SchedWriteRes<[M1UnitL]> { let Latency = 5; } -def M1WriteLX : SchedWriteVariant<[SchedVar<ScaledIdxPred, [M1WriteL5, - M1WriteA1]>, - SchedVar<NoSchedPred, [M1WriteL5]>]>; - -def M1WriteS1 : SchedWriteRes<[M1UnitS]> { let Latency = 1; } -def M1WriteS2 : SchedWriteRes<[M1UnitS]> { let Latency = 2; } -def M1WriteS4 : SchedWriteRes<[M1UnitS]> { let Latency = 4; } -def M1WriteSX : SchedWriteVariant<[SchedVar<ScaledIdxPred, [M1WriteS2, - M1WriteA1]>, - SchedVar<NoSchedPred, [M1WriteS1]>]>; +def M1WriteB1 : SchedWriteRes<[M1UnitB]> { let Latency = 1; } +def M1WriteBX : SchedWriteVariant<[SchedVar<M1BranchLinkFastPred, [M1WriteAB]>, + SchedVar<NoSchedPred, [M1WriteAC]>]>; + +def M1WriteL5 : SchedWriteRes<[M1UnitL]> { let Latency = 5; } +def M1WriteL6 : SchedWriteRes<[M1UnitL]> { let Latency = 6; } +def M1WriteLA : SchedWriteRes<[M1UnitL]> { let Latency = 6; + let ResourceCycles = [2]; } +def M1WriteLB : SchedWriteRes<[M1UnitL, + M1UnitA]> { let Latency = 4; + let NumMicroOps = 2; } +def M1WriteLC : SchedWriteRes<[M1UnitL, + M1UnitA]> { let Latency = 5; + let NumMicroOps = 2; } +def M1WriteLD : SchedWriteRes<[M1UnitL, + M1UnitA]> { let Latency = 6; + let NumMicroOps = 2; + let ResourceCycles = [2]; } +def M1WriteLH : SchedWriteRes<[]> { let Latency = 5; + let NumMicroOps = 0; } +def M1WriteLX : 
SchedWriteVariant<[SchedVar<M1ShiftLeftFastPred, [M1WriteL5]>, + SchedVar<NoSchedPred, [M1WriteLC]>]>; +def M1WriteLY : SchedWriteVariant<[SchedVar<M1ShiftLeftFastPred, [M1WriteL5]>, + SchedVar<NoSchedPred, [M1WriteLD]>]>; + +def M1WriteS1 : SchedWriteRes<[M1UnitS]> { let Latency = 1; } +def M1WriteS3 : SchedWriteRes<[M1UnitS]> { let Latency = 3; } +def M1WriteS4 : SchedWriteRes<[M1UnitS]> { let Latency = 4; } +def M1WriteSA : SchedWriteRes<[M1UnitS, + M1UnitFST, + M1UnitS, + M1UnitFST]> { let Latency = 1; + let NumMicroOps = 2; } +def M1WriteSB : SchedWriteRes<[M1UnitS, + M1UnitFST, + M1UnitA]> { let Latency = 3; + let NumMicroOps = 2; } +def M1WriteSC : SchedWriteRes<[M1UnitS, + M1UnitFST, + M1UnitS, + M1UnitFST, + M1UnitA]> { let Latency = 3; + let NumMicroOps = 3; } +def M1WriteSD : SchedWriteRes<[M1UnitS, + M1UnitFST, + M1UnitA]> { let Latency = 1; + let NumMicroOps = 2; } +def M1WriteSE : SchedWriteRes<[M1UnitS, + M1UnitA]> { let Latency = 2; + let NumMicroOps = 2; } +def M1WriteSX : SchedWriteVariant<[SchedVar<M1ShiftLeftFastPred, [M1WriteS1]>, + SchedVar<NoSchedPred, [M1WriteSE]>]>; +def M1WriteSY : SchedWriteVariant<[SchedVar<M1ShiftLeftFastPred, [M1WriteS1]>, + SchedVar<NoSchedPred, [M1WriteSB]>]>; def M1ReadAdrBase : SchedReadVariant<[SchedVar<ScaledIdxPred, [ReadDefault]>, SchedVar<NoSchedPred, [ReadDefault]>]>; -def : SchedAlias<ReadAdrBase, M1ReadAdrBase>; // Branch instructions. -// NOTE: Unconditional direct branches actually take neither cycles nor units. -def : WriteRes<WriteBr, [M1UnitB]> { let Latency = 1; } +def : WriteRes<WriteBr, []> { let Latency = 0; } def : WriteRes<WriteBrReg, [M1UnitC]> { let Latency = 1; } // Arithmetic and logical integer instructions. def : WriteRes<WriteI, [M1UnitALU]> { let Latency = 1; } -// TODO: Shift over 3 and some extensions take 2 cycles. def : WriteRes<WriteISReg, [M1UnitALU]> { let Latency = 1; } def : WriteRes<WriteIEReg, [M1UnitALU]> { let Latency = 1; } def : WriteRes<WriteIS, [M1UnitALU]> { let Latency = 1; } @@ -110,21 +171,24 @@ def : WriteRes<WriteID64, [M1UnitC, M1UnitD]> { let Latency = 21; let ResourceCycles = [1, 21]; } // TODO: Long multiplication take 5 cycles and also the ALU. -// TODO: Multiplication with accumulation can be advanced. def : WriteRes<WriteIM32, [M1UnitC]> { let Latency = 3; } -// TODO: 64-bit multiplication has a throughput of 1/2. -def : WriteRes<WriteIM64, [M1UnitC]> { let Latency = 4; } +def : WriteRes<WriteIM64, [M1UnitC]> { let Latency = 4; + let ResourceCycles = [2]; } // Miscellaneous instructions. def : WriteRes<WriteExtr, [M1UnitALU, - M1UnitALU]> { let Latency = 2; } + M1UnitALU]> { let Latency = 2; + let NumMicroOps = 2; } -// TODO: The latency for the post or pre register is 1 cycle. -def : WriteRes<WriteAdr, []> { let Latency = 0; } +// Addressing modes. +def : WriteRes<WriteAdr, []> { let Latency = 1; + let NumMicroOps = 0; } +def : SchedAlias<ReadAdrBase, M1ReadAdrBase>; // Load instructions. def : WriteRes<WriteLD, [M1UnitL]> { let Latency = 4; } -def : WriteRes<WriteLDHi, [M1UnitALU]> { let Latency = 4; } +def : WriteRes<WriteLDHi, []> { let Latency = 4; + let NumMicroOps = 0; } def : SchedAlias<WriteLDIdx, M1WriteLX>; // Store instructions. @@ -135,25 +199,23 @@ def : SchedAlias<WriteSTIdx, M1WriteSX>; // FP data instructions. def : WriteRes<WriteF, [M1UnitFADD]> { let Latency = 3; } -// TODO: FCCMP is much different. 
def : WriteRes<WriteFCmp, [M1UnitNMISC]> { let Latency = 4; } def : WriteRes<WriteFDiv, [M1UnitFVAR]> { let Latency = 15; let ResourceCycles = [15]; } def : WriteRes<WriteFMul, [M1UnitFMAC]> { let Latency = 4; } // FP miscellaneous instructions. -// TODO: Conversion between register files is much different. def : WriteRes<WriteFCvt, [M1UnitFCVT]> { let Latency = 3; } def : WriteRes<WriteFImm, [M1UnitNALU]> { let Latency = 1; } def : WriteRes<WriteFCopy, [M1UnitS]> { let Latency = 4; } // FP load instructions. -// TODO: ASIMD loads are much different. -def : WriteRes<WriteVLD, [M1UnitL]> { let Latency = 5; } +def : WriteRes<WriteVLD, [M1UnitL]> { let Latency = 5; } // FP store instructions. -// TODO: ASIMD stores are much different. -def : WriteRes<WriteVST, [M1UnitS, M1UnitFST]> { let Latency = 1; } +def : WriteRes<WriteVST, [M1UnitS, + M1UnitFST]> { let Latency = 1; + let NumMicroOps = 1; } // ASIMD FP instructions. def : WriteRes<WriteV, [M1UnitFADD]> { let Latency = 3; } @@ -165,55 +227,67 @@ def : WriteRes<WriteHint, []> { let Latency = 1; } def : WriteRes<WriteSys, []> { let Latency = 1; } //===----------------------------------------------------------------------===// -// Generic fast forwarding. +// Fast forwarding. // TODO: Add FP register forwarding rules. - def : ReadAdvance<ReadI, 0>; def : ReadAdvance<ReadISReg, 0>; def : ReadAdvance<ReadIEReg, 0>; def : ReadAdvance<ReadIM, 0>; -// Integer multiply-accumulate. -// TODO: The forwarding for WriteIM64 saves actually 3 cycles. -def : ReadAdvance<ReadIMA, 2, [WriteIM32, WriteIM64]>; +// TODO: The forwarding for WriteIM32 saves actually 2 cycles. +def : ReadAdvance<ReadIMA, 3, [WriteIM32, WriteIM64]>; def : ReadAdvance<ReadID, 0>; def : ReadAdvance<ReadExtrHi, 0>; def : ReadAdvance<ReadAdrBase, 0>; def : ReadAdvance<ReadVLD, 0>; //===----------------------------------------------------------------------===// -// Finer scheduling model for the Exynos-M1. +// Finer scheduling model. 
def M1WriteNEONA : SchedWriteRes<[M1UnitNALU, M1UnitNALU, - M1UnitFADD]> { let Latency = 9; } + M1UnitFADD]> { let Latency = 9; + let NumMicroOps = 3; } def M1WriteNEONB : SchedWriteRes<[M1UnitNALU, - M1UnitFST]> { let Latency = 5; } + M1UnitFST]> { let Latency = 5; + let NumMicroOps = 2;} def M1WriteNEONC : SchedWriteRes<[M1UnitNALU, - M1UnitFST]> { let Latency = 6; } + M1UnitFST]> { let Latency = 6; + let NumMicroOps = 2; } def M1WriteNEOND : SchedWriteRes<[M1UnitNALU, M1UnitFST, - M1UnitL]> { let Latency = 10; } + M1UnitL]> { let Latency = 10; + let NumMicroOps = 3; } def M1WriteNEONE : SchedWriteRes<[M1UnitFCVT, - M1UnitFST]> { let Latency = 8; } + M1UnitFST]> { let Latency = 8; + let NumMicroOps = 2; } def M1WriteNEONF : SchedWriteRes<[M1UnitFCVT, M1UnitFST, - M1UnitL]> { let Latency = 13; } + M1UnitL]> { let Latency = 13; + let NumMicroOps = 3; } def M1WriteNEONG : SchedWriteRes<[M1UnitNMISC, - M1UnitFST]> { let Latency = 6; } + M1UnitFST]> { let Latency = 6; + let NumMicroOps = 2; } def M1WriteNEONH : SchedWriteRes<[M1UnitNALU, - M1UnitFST]> { let Latency = 3; } + M1UnitFST]> { let Latency = 3; + let NumMicroOps = 2; } def M1WriteNEONI : SchedWriteRes<[M1UnitFST, - M1UnitL]> { let Latency = 9; } + M1UnitL]> { let Latency = 9; + let NumMicroOps = 2; } def M1WriteNEONJ : SchedWriteRes<[M1UnitNMISC, - M1UnitFMAC]> { let Latency = 6; } + M1UnitFMAC]> { let Latency = 6; + let NumMicroOps = 2; } def M1WriteNEONK : SchedWriteRes<[M1UnitNMISC, - M1UnitFMAC]> { let Latency = 7; } + M1UnitFMAC]> { let Latency = 7; + let NumMicroOps = 2; } +def M1WriteNEONL : SchedWriteRes<[M1UnitNALU]> { let Latency = 2; + let ResourceCycles = [2]; } def M1WriteFADD3 : SchedWriteRes<[M1UnitFADD]> { let Latency = 3; } def M1WriteFCVT3 : SchedWriteRes<[M1UnitFCVT]> { let Latency = 3; } def M1WriteFCVT4 : SchedWriteRes<[M1UnitFCVT]> { let Latency = 4; } def M1WriteFMAC4 : SchedWriteRes<[M1UnitFMAC]> { let Latency = 4; } def M1WriteFMAC5 : SchedWriteRes<[M1UnitFMAC]> { let Latency = 5; } +// TODO def M1WriteFVAR15 : SchedWriteRes<[M1UnitFVAR]> { let Latency = 15; let ResourceCycles = [15]; } def M1WriteFVAR23 : SchedWriteRes<[M1UnitFVAR]> { let Latency = 23; @@ -230,75 +304,93 @@ def M1WriteNMISC2 : SchedWriteRes<[M1UnitNMISC]> { let Latency = 2; } def M1WriteNMISC3 : SchedWriteRes<[M1UnitNMISC]> { let Latency = 3; } def M1WriteNMISC4 : SchedWriteRes<[M1UnitNMISC]> { let Latency = 4; } def M1WriteTB : SchedWriteRes<[M1UnitC, - M1UnitALU]> { let Latency = 2; } + M1UnitALU]> { let Latency = 2; + let NumMicroOps = 2; } def M1WriteVLDA : SchedWriteRes<[M1UnitL, - M1UnitL]> { let Latency = 6; } + M1UnitL]> { let Latency = 6; + let NumMicroOps = 2; } def M1WriteVLDB : SchedWriteRes<[M1UnitL, M1UnitL, - M1UnitL]> { let Latency = 7; } + M1UnitL]> { let Latency = 7; + let NumMicroOps = 3; } def M1WriteVLDC : SchedWriteRes<[M1UnitL, M1UnitL, M1UnitL, - M1UnitL]> { let Latency = 8; } + M1UnitL]> { let Latency = 8; + let NumMicroOps = 4; } def M1WriteVLDD : SchedWriteRes<[M1UnitL, M1UnitNALU]> { let Latency = 7; + let NumMicroOps = 2; let ResourceCycles = [2]; } def M1WriteVLDE : SchedWriteRes<[M1UnitL, - M1UnitNALU]> { let Latency = 6; } + M1UnitNALU]> { let Latency = 6; + let NumMicroOps = 2; } def M1WriteVLDF : SchedWriteRes<[M1UnitL, M1UnitL]> { let Latency = 10; + let NumMicroOps = 2; let ResourceCycles = [5]; } def M1WriteVLDG : SchedWriteRes<[M1UnitL, M1UnitNALU, M1UnitNALU]> { let Latency = 7; + let NumMicroOps = 3; let ResourceCycles = [2]; } def M1WriteVLDH : SchedWriteRes<[M1UnitL, M1UnitNALU, - M1UnitNALU]> { let 
Latency = 6; } + M1UnitNALU]> { let Latency = 6; + let NumMicroOps = 3; } def M1WriteVLDI : SchedWriteRes<[M1UnitL, M1UnitL, M1UnitL]> { let Latency = 12; + let NumMicroOps = 3; let ResourceCycles = [6]; } def M1WriteVLDJ : SchedWriteRes<[M1UnitL, M1UnitNALU, M1UnitNALU, M1UnitNALU]> { let Latency = 9; + let NumMicroOps = 4; let ResourceCycles = [4]; } def M1WriteVLDK : SchedWriteRes<[M1UnitL, M1UnitNALU, M1UnitNALU, M1UnitNALU, M1UnitNALU]> { let Latency = 9; + let NumMicroOps = 5; let ResourceCycles = [4]; } def M1WriteVLDL : SchedWriteRes<[M1UnitL, M1UnitNALU, M1UnitNALU, + M1UnitL, M1UnitNALU]> { let Latency = 7; + let NumMicroOps = 5; let ResourceCycles = [2]; } def M1WriteVLDM : SchedWriteRes<[M1UnitL, M1UnitNALU, M1UnitNALU, + M1UnitL, M1UnitNALU, M1UnitNALU]> { let Latency = 7; + let NumMicroOps = 6; let ResourceCycles = [2]; } def M1WriteVLDN : SchedWriteRes<[M1UnitL, M1UnitL, M1UnitL, M1UnitL]> { let Latency = 14; + let NumMicroOps = 4; let ResourceCycles = [7]; } - def M1WriteVSTA : WriteSequence<[WriteVST], 2>; def M1WriteVSTB : WriteSequence<[WriteVST], 3>; def M1WriteVSTC : WriteSequence<[WriteVST], 4>; def M1WriteVSTD : SchedWriteRes<[M1UnitS, M1UnitFST, M1UnitFST]> { let Latency = 7; + let NumMicroOps = 2; let ResourceCycles = [7]; } def M1WriteVSTE : SchedWriteRes<[M1UnitS, M1UnitFST, M1UnitS, M1UnitFST, M1UnitFST]> { let Latency = 8; + let NumMicroOps = 3; let ResourceCycles = [8]; } def M1WriteVSTF : SchedWriteRes<[M1UnitNALU, M1UnitS, @@ -307,6 +399,7 @@ def M1WriteVSTF : SchedWriteRes<[M1UnitNALU, M1UnitFST, M1UnitFST, M1UnitFST]> { let Latency = 15; + let NumMicroOps = 5; let ResourceCycles = [15]; } def M1WriteVSTG : SchedWriteRes<[M1UnitNALU, M1UnitS, @@ -317,12 +410,14 @@ def M1WriteVSTG : SchedWriteRes<[M1UnitNALU, M1UnitFST, M1UnitFST, M1UnitFST]> { let Latency = 16; + let NumMicroOps = 6; let ResourceCycles = [16]; } def M1WriteVSTH : SchedWriteRes<[M1UnitNALU, M1UnitS, M1UnitFST, M1UnitFST, M1UnitFST]> { let Latency = 14; + let NumMicroOps = 4; let ResourceCycles = [14]; } def M1WriteVSTI : SchedWriteRes<[M1UnitNALU, M1UnitS, @@ -335,27 +430,30 @@ def M1WriteVSTI : SchedWriteRes<[M1UnitNALU, M1UnitFST, M1UnitFST, M1UnitFST]> { let Latency = 17; + let NumMicroOps = 7; let ResourceCycles = [17]; } // Branch instructions def : InstRW<[M1WriteB1], (instrs Bcc)>; -// NOTE: Conditional branch and link adds a B uop. def : InstRW<[M1WriteA1], (instrs BL)>; -// NOTE: Indirect branch and link with LR adds an ALU uop. -def : InstRW<[M1WriteA1, - M1WriteC1], (instrs BLR)>; +def : InstRW<[M1WriteBX], (instrs BLR)>; def : InstRW<[M1WriteC1], (instregex "^CBN?Z[WX]")>; -def : InstRW<[M1WriteC1, - M1WriteA2], (instregex "^TBN?Z[WX]")>; +def : InstRW<[M1WriteAD], (instregex "^TBN?Z[WX]")>; // Arithmetic and logical integer instructions. def : InstRW<[M1WriteA1], (instrs COPY)>; +def : InstRW<[M1WriteAX], (instregex ".+r[sx](64)?$")>; // Divide and multiply instructions. // Miscellaneous instructions. // Load instructions. +def : InstRW<[M1WriteLB, + WriteLDHi, + WriteAdr], (instregex "^LDP(SW|W|X)(post|pre)")>; +def : InstRW<[M1WriteLX, + ReadAdrBase], (instregex "^PRFMro[WX]")>; // Store instructions. @@ -375,16 +473,51 @@ def : InstRW<[M1WriteFVAR15], (instrs FSQRTSr)>; def : InstRW<[M1WriteFVAR23], (instrs FSQRTDr)>; // FP miscellaneous instructions. 
-def : InstRW<[M1WriteFCVT3], (instregex "^FCVT[DS][DS]r")>; -def : InstRW<[M1WriteNEONF], (instregex "^[FSU]CVT[AMNPZ][SU](_Int)?[SU]?[XW]?[DS]?[rds]i?")>; -def : InstRW<[M1WriteNEONE], (instregex "^[SU]CVTF[SU]")>; -def : InstRW<[M1WriteNALU1], (instregex "^FMOV[DS][ir]")>; -def : InstRW<[M1WriteS4], (instregex "^FMOV[WX][DS](High)?r")>; -def : InstRW<[M1WriteNEONI], (instregex "^FMOV[DS][WX](High)?r")>; +def : InstRW<[M1WriteFCVT3], (instregex "^FCVT[DS][DS]r")>; +def : InstRW<[M1WriteNEONF], (instregex "^[FSU]CVT[AMNPZ][SU](_Int)?[SU]?[XW]?[DS]?[rds]i?")>; +def : InstRW<[M1WriteNEONE], (instregex "^[SU]CVTF[SU]")>; +def : InstRW<[M1WriteNALU1], (instregex "^FMOV[DS][ir]")>; +def : InstRW<[M1WriteFCVT4], (instregex "^[FU](RECP|RSQRT)Ev1")>; +def : InstRW<[M1WriteNMISC1], (instregex "^FRECPXv1")>; +def : InstRW<[M1WriteFMAC5], (instregex "^F(RECP|RSQRT)S(16|32|64)")>; +def : InstRW<[M1WriteS4], (instregex "^FMOV[WX][DS](High)?r")>; +def : InstRW<[M1WriteNEONI], (instregex "^FMOV[DS][WX](High)?r")>; // FP load instructions. +def : InstRW<[WriteVLD], (instregex "^LDR[DSQ]l")>; +def : InstRW<[WriteVLD], (instregex "^LDUR[BDHSQ]i")>; +def : InstRW<[WriteVLD, + WriteAdr], (instregex "^LDR[BDHSQ](post|pre)")>; +def : InstRW<[WriteVLD], (instregex "^LDR[BDHSQ]ui")>; +def : InstRW<[M1WriteLY, + ReadAdrBase], (instregex "^LDR[BDHS]ro[WX]")>; +def : InstRW<[M1WriteLD, + ReadAdrBase], (instregex "^LDRQro[WX]")>; +def : InstRW<[WriteVLD, + M1WriteLH], (instregex "^LDN?P[DS]i")>; +def : InstRW<[M1WriteLA, + M1WriteLH], (instregex "^LDN?PQi")>; +def : InstRW<[M1WriteLC, + M1WriteLH, + WriteAdr], (instregex "^LDP[DS](post|pre)")>; +def : InstRW<[M1WriteLD, + M1WriteLH, + WriteAdr], (instregex "^LDPQ(post|pre)")>; // FP store instructions. +def : InstRW<[WriteVST], (instregex "^STUR[BDHSQ]i")>; +def : InstRW<[WriteVST, + WriteAdr], (instregex "^STR[BDHSQ](post|pre)")>; +def : InstRW<[WriteVST], (instregex "^STR[BDHSQ]ui")>; +def : InstRW<[M1WriteSY, + ReadAdrBase], (instregex "^STR[BDHS]ro[WX]")>; +def : InstRW<[M1WriteSB, + ReadAdrBase], (instregex "^STRQro[WX]")>; +def : InstRW<[WriteVST], (instregex "^STN?P[DSQ]i")>; +def : InstRW<[WriteVST, + WriteAdr], (instregex "^STP[DS](post|pre)")>; +def : InstRW<[M1WriteSC, + WriteAdr], (instregex "^STPQ(post|pre)")>; // ASIMD instructions. def : InstRW<[M1WriteNMISC3], (instregex "^[SU]ABAL?v")>; @@ -409,10 +542,12 @@ def : InstRW<[M1WriteNMISC4], (instregex "^ML[AS]v")>; def : InstRW<[M1WriteNMISC4], (instregex "^(S|U|SQD|SQRD)ML[AS][HL]v")>; def : InstRW<[M1WriteNMISC4], (instregex "^(S|U|SQD)MULLv")>; def : InstRW<[M1WriteNAL13], (instregex "^(S|SR|U|UR)SRAv")>; -def : InstRW<[M1WriteNALU1], (instregex "^[SU]?SH(L|LL|R)2?v")>; -def : InstRW<[M1WriteNALU1], (instregex "^S[LR]Iv")>; -def : InstRW<[M1WriteNAL13], (instregex "^[SU]?(Q|QR|R)?SHR(N|U|UN)?2?v")>; -def : InstRW<[M1WriteNAL13], (instregex "^[SU](Q|QR|R)SHLU?v")>; +def : InstRW<[M1WriteNALU1], (instregex "^SHL[dv]")>; +def : InstRW<[M1WriteNALU1], (instregex "^[SU]SH[LR][dv]")>; +def : InstRW<[M1WriteNALU1], (instregex "^S[RS]I[dv]")>; +def : InstRW<[M1WriteNAL13], (instregex "^(([SU]Q)?R)?SHRU?N[bhsv]")>; +def : InstRW<[M1WriteNAL13], (instregex "^[SU]RSH[LR][dv]")>; +def : InstRW<[M1WriteNAL13], (instregex "^[SU]QR?SHLU?[bdhsv]")>; // ASIMD FP instructions. def : InstRW<[M1WriteNALU1], (instregex "^F(ABS|NEG)v")>; @@ -435,13 +570,16 @@ def : InstRW<[M1WriteFCVT3], (instregex "^FRINT[AIMNPXZ]v")>; // ASIMD miscellaneous instructions. 
def : InstRW<[M1WriteNALU1], (instregex "^RBITv")>; def : InstRW<[M1WriteNAL11], (instregex "^(BIF|BIT|BSL)v")>; -def : InstRW<[M1WriteNALU1], (instregex "^CPY")>; def : InstRW<[M1WriteNEONB], (instregex "^DUPv.+gpr")>; def : InstRW<[M1WriteNALU1], (instregex "^DUPv.+lane")>; +def : InstRW<[M1WriteNALU1], (instregex "^EXTv8")>; +def : InstRW<[M1WriteNEONL], (instregex "^EXTv16")>; def : InstRW<[M1WriteNAL13], (instregex "^[SU]?Q?XTU?Nv")>; -def : InstRW<[M1WriteNEONC], (instregex "^INSv.+gpr")>; -def : InstRW<[M1WriteFCVT4], (instregex "^[FU](RECP|RSQRT)Ev")>; -def : InstRW<[M1WriteNMISC1], (instregex "^[FU](RECP|RSQRT)Xv")>; +def : InstRW<[M1WriteNALU1], (instregex "^CPY")>; +def : InstRW<[M1WriteNALU1], (instregex "^INSv.+lane")>; +def : InstRW<[M1WriteNALU1], (instregex "^MOVI[Dv]")>; +def : InstRW<[M1WriteNALU1], (instregex "^FMOVv")>; +def : InstRW<[M1WriteFCVT4], (instregex "^[FU](RECP|RSQRT)Ev[248]")>; def : InstRW<[M1WriteFMAC5], (instregex "^F(RECP|RSQRT)Sv")>; def : InstRW<[M1WriteNALU1], (instregex "^REV(16|32|64)v")>; def : InstRW<[M1WriteNAL11], (instregex "^TB[LX]v8i8One")>; @@ -459,7 +597,7 @@ def : InstRW<[WriteSequence<[M1WriteNAL12], 3>], def : InstRW<[WriteSequence<[M1WriteNAL12], 4>], (instregex "^TB[LX]v16i8Four")>; def : InstRW<[M1WriteNEOND], (instregex "^[SU]MOVv")>; -def : InstRW<[M1WriteNALU1], (instregex "^INSv.+lane")>; +def : InstRW<[M1WriteNEONC], (instregex "^INSv.+gpr")>; def : InstRW<[M1WriteNALU1], (instregex "^(TRN|UZP)[12](v8i8|v4i16|v2i32)")>; def : InstRW<[M1WriteNALU2], (instregex "^(TRN|UZP)[12](v16i8|v8i16|v4i32|v2i64)")>; def : InstRW<[M1WriteNALU1], (instregex "^ZIP[12]v")>; diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SchedThunderX.td b/contrib/llvm/lib/Target/AArch64/AArch64SchedThunderX.td index 9a0cb702518d..585688aae279 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64SchedThunderX.td +++ b/contrib/llvm/lib/Target/AArch64/AArch64SchedThunderX.td @@ -25,6 +25,8 @@ def ThunderXT8XModel : SchedMachineModel { let MispredictPenalty = 8; // Branch mispredict penalty. let PostRAScheduler = 1; // Use PostRA scheduler. let CompleteModel = 1; + + list<Predicate> UnsupportedFeatures = [HasSVE]; } // Modeling each pipeline with BufferSize == 0 since T8X is in-order. 
@@ -239,20 +241,20 @@ def : ReadAdvance<ReadID, 1, [WriteImm, WriteI, //--- // Branch //--- -def : InstRW<[THXT8XWriteBR], (instregex "^B")>; -def : InstRW<[THXT8XWriteBR], (instregex "^BL")>; -def : InstRW<[THXT8XWriteBR], (instregex "^B.*")>; +def : InstRW<[THXT8XWriteBR], (instregex "^B$")>; +def : InstRW<[THXT8XWriteBR], (instregex "^BL$")>; +def : InstRW<[THXT8XWriteBR], (instregex "^B..$")>; def : InstRW<[THXT8XWriteBR], (instregex "^CBNZ")>; def : InstRW<[THXT8XWriteBR], (instregex "^CBZ")>; def : InstRW<[THXT8XWriteBR], (instregex "^TBNZ")>; def : InstRW<[THXT8XWriteBR], (instregex "^TBZ")>; -def : InstRW<[THXT8XWriteBRR], (instregex "^BR")>; -def : InstRW<[THXT8XWriteBRR], (instregex "^BLR")>; +def : InstRW<[THXT8XWriteBRR], (instregex "^BR$")>; +def : InstRW<[THXT8XWriteBRR], (instregex "^BLR$")>; //--- // Ret //--- -def : InstRW<[THXT8XWriteRET], (instregex "^RET")>; +def : InstRW<[THXT8XWriteRET], (instregex "^RET$")>; //--- // Miscellaneous diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td b/contrib/llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td index 10df50bcf156..22f272edd680 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td +++ b/contrib/llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td @@ -22,9 +22,11 @@ def ThunderX2T99Model : SchedMachineModel { let LoadLatency = 4; // Optimistic load latency. let MispredictPenalty = 12; // Extra cycles for mispredicted branch. // Determined via a mix of micro-arch details and experimentation. - let LoopMicroOpBufferSize = 32; + let LoopMicroOpBufferSize = 128; let PostRAScheduler = 1; // Using PostRA sched. let CompleteModel = 1; + + list<Predicate> UnsupportedFeatures = [HasSVE]; } // Define the issue ports. @@ -315,6 +317,36 @@ def THX2T99Write_8Cyc_LS01_F01 : SchedWriteRes<[THX2T99LS01, THX2T99F01]> { let NumMicroOps = 3; } +// 8 cycles on LS0 or LS1 and I0, I1, or I2. +def THX2T99Write_8Cyc_I012 : SchedWriteRes<[THX2T99LS01, THX2T99I012]> { + let Latency = 8; + let NumMicroOps = 4; +} + +// 12 cycles on LS0 or LS1 and I0, I1, or I2. +def THX2T99Write_12Cyc_I012 : SchedWriteRes<[THX2T99LS01, THX2T99I012]> { + let Latency = 12; + let NumMicroOps = 6; +} + +// 16 cycles on LS0 or LS1 and I0, I1, or I2. +def THX2T99Write_16Cyc_I012 : SchedWriteRes<[THX2T99LS01, THX2T99I012]> { + let Latency = 16; + let NumMicroOps = 8; +} + +// 24 cycles on LS0 or LS1 and I0, I1, or I2. +def THX2T99Write_24Cyc_I012 : SchedWriteRes<[THX2T99LS01, THX2T99I012]> { + let Latency = 24; + let NumMicroOps = 12; +} + +// 32 cycles on LS0 or LS1 and I0, I1, or I2. +def THX2T99Write_32Cyc_I012 : SchedWriteRes<[THX2T99LS01, THX2T99I012]> { + let Latency = 32; + let NumMicroOps = 16; +} + // Define commonly used read types. // No forwarding is provided for these types. 
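The per-CPU latencies, micro-op counts, and resource cycles declared in these scheduling models are what machine passes such as AArch64SIMDInstrOpt read back through TargetSchedModel. A minimal sketch of that query path, using only calls already visible in the pass earlier in this diff (the helper name and its Opcode parameter are illustrative):

#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
using namespace llvm;

// Return the modeled latency of Opcode on MF's subtarget, or 0 when the CPU
// does not provide a per-instruction scheduling model.
static unsigned modeledLatency(const MachineFunction &MF, unsigned Opcode) {
  const TargetSubtargetInfo &ST = MF.getSubtarget();
  TargetSchedModel SchedModel;
  SchedModel.init(ST.getSchedModel(), &ST, ST.getInstrInfo());
  if (!SchedModel.hasInstrSchedModel())
    return 0;
  return SchedModel.computeInstrLatency(Opcode);
}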
@@ -368,7 +400,7 @@ def : WriteRes<WriteAtomic, []> { //--- def : InstRW<[THX2T99Write_1Cyc_I2], (instrs B, BL, BR, BLR)>; def : InstRW<[THX2T99Write_1Cyc_I2], (instrs RET)>; -def : InstRW<[THX2T99Write_1Cyc_I2], (instregex "^B.*")>; +def : InstRW<[THX2T99Write_1Cyc_I2], (instregex "^B..$")>; def : InstRW<[THX2T99Write_1Cyc_I2], (instregex "^CBZ", "^CBNZ", "^TBZ", "^TBNZ")>; @@ -1741,5 +1773,108 @@ def : InstRW<[THX2T99Write_1Cyc_LS01_F01], (instregex "^ST4i(8|16|32|64)$")>; def : InstRW<[THX2T99Write_1Cyc_LS01_F01, WriteAdr], (instregex "^ST4i(8|16|32|64)_POST$")>; +// V8.1a Atomics (LSE) +def : InstRW<[THX2T99Write_8Cyc_I012, WriteAtomic], + (instrs CASB, CASH, CASW, CASX)>; + +def : InstRW<[THX2T99Write_12Cyc_I012, WriteAtomic], + (instrs CASAB, CASAH, CASAW, CASAX)>; + +def : InstRW<[THX2T99Write_12Cyc_I012, WriteAtomic], + (instrs CASLB, CASLH, CASLW, CASLX)>; + +def : InstRW<[THX2T99Write_16Cyc_I012, WriteAtomic], + (instrs CASALB, CASALH, CASALW, CASALX)>; + +def : InstRW<[THX2T99Write_12Cyc_I012, WriteAtomic], + (instrs LDLARB, LDLARH, LDLARW, LDLARX)>; + +def : InstRW<[THX2T99Write_8Cyc_I012, WriteAtomic], + (instrs LDADDB, LDADDH, LDADDW, LDADDX)>; + +def : InstRW<[THX2T99Write_12Cyc_I012, WriteAtomic], + (instrs LDADDAB, LDADDAH, LDADDAW, LDADDAX)>; + +def : InstRW<[THX2T99Write_12Cyc_I012, WriteAtomic], + (instrs LDADDLB, LDADDLH, LDADDLW, LDADDLX)>; + +def : InstRW<[THX2T99Write_16Cyc_I012, WriteAtomic], + (instrs LDADDALB, LDADDALH, LDADDALW, LDADDALX)>; + +def : InstRW<[THX2T99Write_8Cyc_I012, WriteAtomic], + (instrs LDCLRB, LDCLRH, LDCLRW, LDCLRX)>; + +def : InstRW<[THX2T99Write_12Cyc_I012, WriteAtomic], + (instrs LDCLRAB, LDCLRAH, LDCLRAW, LDCLRAX)>; + +def : InstRW<[THX2T99Write_12Cyc_I012, WriteAtomic], + (instrs LDCLRLB, LDCLRLH, LDCLRLW, LDCLRLX)>; + +def : InstRW<[THX2T99Write_16Cyc_I012, WriteAtomic], + (instrs LDCLRALB, LDCLRALH, LDCLRALW, LDCLRALX)>; + +def : InstRW<[THX2T99Write_8Cyc_I012, WriteAtomic], + (instrs LDEORB, LDEORH, LDEORW, LDEORX)>; + +def : InstRW<[THX2T99Write_12Cyc_I012, WriteAtomic], + (instrs LDEORAB, LDEORAH, LDEORAW, LDEORAX)>; + +def : InstRW<[THX2T99Write_12Cyc_I012, WriteAtomic], + (instrs LDEORLB, LDEORLH, LDEORLW, LDEORLX)>; + +def : InstRW<[THX2T99Write_16Cyc_I012, WriteAtomic], + (instrs LDEORALB, LDEORALH, LDEORALW, LDEORALX)>; + +def : InstRW<[THX2T99Write_8Cyc_I012, WriteAtomic], + (instrs LDSETB, LDSETH, LDSETW, LDSETX)>; + +def : InstRW<[THX2T99Write_12Cyc_I012, WriteAtomic], + (instrs LDSETAB, LDSETAH, LDSETAW, LDSETAX)>; + +def : InstRW<[THX2T99Write_12Cyc_I012, WriteAtomic], + (instrs LDSETLB, LDSETLH, LDSETLW, LDSETLX)>; + +def : InstRW<[THX2T99Write_16Cyc_I012, WriteAtomic], + (instrs LDSETALB, LDSETALH, LDSETALW, LDSETALX)>; + +def : InstRW<[THX2T99Write_8Cyc_I012, WriteAtomic], + (instrs LDSMAXB, LDSMAXH, LDSMAXW, LDSMAXX, + LDSMAXAB, LDSMAXAH, LDSMAXAW, LDSMAXAX, + LDSMAXLB, LDSMAXLH, LDSMAXLW, LDSMAXLX, + LDSMAXALB, LDSMAXALH, LDSMAXALW, LDSMAXALX)>; + +def : InstRW<[THX2T99Write_8Cyc_I012, WriteAtomic], + (instrs LDSMINB, LDSMINH, LDSMINW, LDSMINX, + LDSMINAB, LDSMINAH, LDSMINAW, LDSMINAX, + LDSMINLB, LDSMINLH, LDSMINLW, LDSMINLX, + LDSMINALB, LDSMINALH, LDSMINALW, LDSMINALX)>; + +def : InstRW<[THX2T99Write_8Cyc_I012, WriteAtomic], + (instrs LDUMAXB, LDUMAXH, LDUMAXW, LDUMAXX, + LDUMAXAB, LDUMAXAH, LDUMAXAW, LDUMAXAX, + LDUMAXLB, LDUMAXLH, LDUMAXLW, LDUMAXLX, + LDUMAXALB, LDUMAXALH, LDUMAXALW, LDUMAXALX)>; + +def : InstRW<[THX2T99Write_8Cyc_I012, WriteAtomic], + (instrs LDUMINB, LDUMINH, LDUMINW, LDUMINX, + LDUMINAB, 
LDUMINAH, LDUMINAW, LDUMINAX, + LDUMINLB, LDUMINLH, LDUMINLW, LDUMINLX, + LDUMINALB, LDUMINALH, LDUMINALW, LDUMINALX)>; + +def : InstRW<[THX2T99Write_8Cyc_I012, WriteAtomic], + (instrs SWPB, SWPH, SWPW, SWPX)>; + +def : InstRW<[THX2T99Write_12Cyc_I012, WriteAtomic], + (instrs SWPAB, SWPAH, SWPAW, SWPAX)>; + +def : InstRW<[THX2T99Write_12Cyc_I012, WriteAtomic], + (instrs SWPLB, SWPLH, SWPLW, SWPLX)>; + +def : InstRW<[THX2T99Write_16Cyc_I012, WriteAtomic], + (instrs SWPALB, SWPALH, SWPALW, SWPALX)>; + +def : InstRW<[THX2T99Write_8Cyc_I012, WriteAtomic], + (instrs STLLRB, STLLRH, STLLRW, STLLRX)>; + } // SchedModel = ThunderX2T99Model diff --git a/contrib/llvm/lib/Target/AArch64/AArch64StorePairSuppress.cpp b/contrib/llvm/lib/Target/AArch64/AArch64StorePairSuppress.cpp index fe984ccbaf1d..571e61d7083c 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64StorePairSuppress.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64StorePairSuppress.cpp @@ -16,10 +16,10 @@ #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineTraceMetrics.h" +#include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TargetSchedule.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetInstrInfo.h" using namespace llvm; @@ -120,7 +120,7 @@ bool AArch64StorePairSuppress::isNarrowFPStore(const MachineInstr &MI) { } bool AArch64StorePairSuppress::runOnMachineFunction(MachineFunction &MF) { - if (skipFunction(*MF.getFunction())) + if (skipFunction(MF.getFunction())) return false; const TargetSubtargetInfo &ST = MF.getSubtarget(); diff --git a/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.cpp index ea6112452736..e397d585ae77 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.cpp @@ -18,19 +18,12 @@ #include "AArch64PBQPRegAlloc.h" #include "AArch64TargetMachine.h" -#ifdef LLVM_BUILD_GLOBAL_ISEL #include "AArch64CallLowering.h" #include "AArch64LegalizerInfo.h" #include "AArch64RegisterBankInfo.h" -#include "llvm/CodeGen/GlobalISel/GISelAccessor.h" -#include "llvm/CodeGen/GlobalISel/IRTranslator.h" #include "llvm/CodeGen/GlobalISel/InstructionSelect.h" -#include "llvm/CodeGen/GlobalISel/Legalizer.h" -#include "llvm/CodeGen/GlobalISel/RegBankSelect.h" -#endif #include "llvm/CodeGen/MachineScheduler.h" #include "llvm/IR/GlobalValue.h" -#include "llvm/Support/TargetRegistry.h" using namespace llvm; @@ -98,6 +91,11 @@ void AArch64Subtarget::initializeProperties() { MinPrefetchStride = 2048; MaxPrefetchIterationsAhead = 8; break; + case Saphira: + MaxInterleaveFactor = 4; + // FIXME: remove this to enable 64-bit SLP if performance looks good. 
+ MinVectorRegisterBitWidth = 128; + break; case Kryo: MaxInterleaveFactor = 4; VectorInsertExtractBaseCost = 2; @@ -130,93 +128,55 @@ void AArch64Subtarget::initializeProperties() { MinVectorRegisterBitWidth = 128; break; case CortexA35: break; - case CortexA53: break; - case CortexA72: - PrefFunctionAlignment = 4; + case CortexA53: + PrefFunctionAlignment = 3; break; + case CortexA55: break; + case CortexA72: case CortexA73: + case CortexA75: PrefFunctionAlignment = 4; break; case Others: break; } } -#ifdef LLVM_BUILD_GLOBAL_ISEL -namespace { - -struct AArch64GISelActualAccessor : public GISelAccessor { - std::unique_ptr<CallLowering> CallLoweringInfo; - std::unique_ptr<InstructionSelector> InstSelector; - std::unique_ptr<LegalizerInfo> Legalizer; - std::unique_ptr<RegisterBankInfo> RegBankInfo; - - const CallLowering *getCallLowering() const override { - return CallLoweringInfo.get(); - } - - const InstructionSelector *getInstructionSelector() const override { - return InstSelector.get(); - } - - const LegalizerInfo *getLegalizerInfo() const override { - return Legalizer.get(); - } - - const RegisterBankInfo *getRegBankInfo() const override { - return RegBankInfo.get(); - } -}; - -} // end anonymous namespace -#endif - AArch64Subtarget::AArch64Subtarget(const Triple &TT, const std::string &CPU, const std::string &FS, const TargetMachine &TM, bool LittleEndian) : AArch64GenSubtargetInfo(TT, CPU, FS), - ReserveX18(TT.isOSDarwin() || TT.isOSWindows()), - IsLittle(LittleEndian), TargetTriple(TT), FrameLowering(), + ReserveX18(TT.isOSDarwin() || TT.isOSWindows()), IsLittle(LittleEndian), + TargetTriple(TT), FrameLowering(), InstrInfo(initializeSubtargetDependencies(FS, CPU)), TSInfo(), - TLInfo(TM, *this), GISel() { -#ifndef LLVM_BUILD_GLOBAL_ISEL - GISelAccessor *AArch64GISel = new GISelAccessor(); -#else - AArch64GISelActualAccessor *AArch64GISel = new AArch64GISelActualAccessor(); - AArch64GISel->CallLoweringInfo.reset( - new AArch64CallLowering(*getTargetLowering())); - AArch64GISel->Legalizer.reset(new AArch64LegalizerInfo()); + TLInfo(TM, *this) { + CallLoweringInfo.reset(new AArch64CallLowering(*getTargetLowering())); + Legalizer.reset(new AArch64LegalizerInfo(*this)); auto *RBI = new AArch64RegisterBankInfo(*getRegisterInfo()); // FIXME: At this point, we can't rely on Subtarget having RBI. // It's awkward to mix passing RBI and the Subtarget; should we pass // TII/TRI as well? 
- AArch64GISel->InstSelector.reset(createAArch64InstructionSelector( + InstSelector.reset(createAArch64InstructionSelector( *static_cast<const AArch64TargetMachine *>(&TM), *this, *RBI)); - AArch64GISel->RegBankInfo.reset(RBI); -#endif - setGISelAccessor(*AArch64GISel); + RegBankInfo.reset(RBI); } const CallLowering *AArch64Subtarget::getCallLowering() const { - assert(GISel && "Access to GlobalISel APIs not set"); - return GISel->getCallLowering(); + return CallLoweringInfo.get(); } const InstructionSelector *AArch64Subtarget::getInstructionSelector() const { - assert(GISel && "Access to GlobalISel APIs not set"); - return GISel->getInstructionSelector(); + return InstSelector.get(); } const LegalizerInfo *AArch64Subtarget::getLegalizerInfo() const { - assert(GISel && "Access to GlobalISel APIs not set"); - return GISel->getLegalizerInfo(); + return Legalizer.get(); } const RegisterBankInfo *AArch64Subtarget::getRegBankInfo() const { - assert(GISel && "Access to GlobalISel APIs not set"); - return GISel->getRegBankInfo(); + return RegBankInfo.get(); } /// Find the target operand flags that describe how a global value should be diff --git a/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.h b/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.h index 5a1f45ee2552..5d9759d363dd 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.h +++ b/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.h @@ -19,9 +19,12 @@ #include "AArch64InstrInfo.h" #include "AArch64RegisterInfo.h" #include "AArch64SelectionDAGInfo.h" -#include "llvm/CodeGen/GlobalISel/GISelAccessor.h" +#include "llvm/CodeGen/GlobalISel/CallLowering.h" +#include "llvm/CodeGen/GlobalISel/InstructionSelector.h" +#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" +#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" +#include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/IR/DataLayout.h" -#include "llvm/Target/TargetSubtargetInfo.h" #include <string> #define GET_SUBTARGETINFO_HEADER @@ -38,13 +41,16 @@ public: Others, CortexA35, CortexA53, + CortexA55, CortexA57, CortexA72, CortexA73, + CortexA75, Cyclone, ExynosM1, Falkor, Kryo, + Saphira, ThunderX2T99, ThunderX, ThunderXT81, @@ -58,10 +64,12 @@ protected: bool HasV8_1aOps = false; bool HasV8_2aOps = false; + bool HasV8_3aOps = false; bool HasFPARMv8 = false; bool HasNEON = false; bool HasCrypto = false; + bool HasDotProd = false; bool HasCRC = false; bool HasLSE = false; bool HasRAS = false; @@ -71,12 +79,14 @@ protected: bool HasSPE = false; bool HasLSLFast = false; bool HasSVE = false; + bool HasRCPC = false; // HasZeroCycleRegMove - Has zero-cycle register mov instructions. bool HasZeroCycleRegMove = false; // HasZeroCycleZeroing - Has zero-cycle zeroing instructions. bool HasZeroCycleZeroing = false; + bool HasZeroCycleZeroingFPWorkaround = false; // StrictAlign - Disallow unaligned memory accesses. bool StrictAlign = false; @@ -94,6 +104,7 @@ protected: bool UsePostRAScheduler = false; bool Misaligned128StoreIsSlow = false; bool Paired128IsSlow = false; + bool STRQroIsSlow = false; bool UseAlternateSExtLoadCVTF32Pattern = false; bool HasArithmeticBccFusion = false; bool HasArithmeticCbzFusion = false; @@ -124,10 +135,12 @@ protected: AArch64InstrInfo InstrInfo; AArch64SelectionDAGInfo TSInfo; AArch64TargetLowering TLInfo; - /// Gather the accessor points to GlobalISel-related APIs. - /// This is used to avoid ifndefs spreading around while GISel is - /// an optional library. - std::unique_ptr<GISelAccessor> GISel; + + /// GlobalISel related APIs. 
+ std::unique_ptr<CallLowering> CallLoweringInfo; + std::unique_ptr<InstructionSelector> InstSelector; + std::unique_ptr<LegalizerInfo> Legalizer; + std::unique_ptr<RegisterBankInfo> RegBankInfo; private: /// initializeSubtargetDependencies - Initializes using CPUString and the @@ -146,11 +159,6 @@ public: const std::string &FS, const TargetMachine &TM, bool LittleEndian); - /// This object will take onwership of \p GISelAccessor. - void setGISelAccessor(GISelAccessor &GISel) { - this->GISel.reset(&GISel); - } - const AArch64SelectionDAGInfo *getSelectionDAGInfo() const override { return &TSInfo; } @@ -184,11 +192,16 @@ public: bool hasV8_1aOps() const { return HasV8_1aOps; } bool hasV8_2aOps() const { return HasV8_2aOps; } + bool hasV8_3aOps() const { return HasV8_3aOps; } bool hasZeroCycleRegMove() const { return HasZeroCycleRegMove; } bool hasZeroCycleZeroing() const { return HasZeroCycleZeroing; } + bool hasZeroCycleZeroingFPWorkaround() const { + return HasZeroCycleZeroingFPWorkaround; + } + bool requiresStrictAlign() const { return StrictAlign; } bool isXRaySupported() const override { return true; } @@ -201,6 +214,7 @@ public: bool hasFPARMv8() const { return HasFPARMv8; } bool hasNEON() const { return HasNEON; } bool hasCrypto() const { return HasCrypto; } + bool hasDotProd() const { return HasDotProd; } bool hasCRC() const { return HasCRC; } bool hasLSE() const { return HasLSE; } bool hasRAS() const { return HasRAS; } @@ -212,6 +226,7 @@ public: bool hasCustomCheapAsMoveHandling() const { return CustomAsCheapAsMove; } bool isMisaligned128StoreSlow() const { return Misaligned128StoreIsSlow; } bool isPaired128Slow() const { return Paired128IsSlow; } + bool isSTRQroSlow() const { return STRQroIsSlow; } bool useAlternateSExtLoadCVTF32Pattern() const { return UseAlternateSExtLoadCVTF32Pattern; } @@ -253,6 +268,7 @@ public: bool hasSPE() const { return HasSPE; } bool hasLSLFast() const { return HasLSLFast; } bool hasSVE() const { return HasSVE; } + bool hasRCPC() const { return HasRCPC; } bool isLittleEndian() const { return IsLittle; } diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SystemOperands.td b/contrib/llvm/lib/Target/AArch64/AArch64SystemOperands.td index 7c5dcb0853eb..df939add70fa 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64SystemOperands.td +++ b/contrib/llvm/lib/Target/AArch64/AArch64SystemOperands.td @@ -342,6 +342,9 @@ def : ROSysReg<"ID_ISAR2_EL1", 0b11, 0b000, 0b0000, 0b0010, 0b010>; def : ROSysReg<"ID_ISAR3_EL1", 0b11, 0b000, 0b0000, 0b0010, 0b011>; def : ROSysReg<"ID_ISAR4_EL1", 0b11, 0b000, 0b0000, 0b0010, 0b100>; def : ROSysReg<"ID_ISAR5_EL1", 0b11, 0b000, 0b0000, 0b0010, 0b101>; +def : ROSysReg<"ID_ISAR6_EL1", 0b11, 0b000, 0b0000, 0b0010, 0b111> { + let Requires = [{ {AArch64::HasV8_2aOps} }]; +} def : ROSysReg<"ID_AA64PFR0_EL1", 0b11, 0b000, 0b0000, 0b0100, 0b000>; def : ROSysReg<"ID_AA64PFR1_EL1", 0b11, 0b000, 0b0000, 0b0100, 0b001>; def : ROSysReg<"ID_AA64DFR0_EL1", 0b11, 0b000, 0b0000, 0b0101, 0b000>; @@ -1016,6 +1019,21 @@ def : RWSysReg<"VDISR_EL2", 0b11, 0b100, 0b1100, 0b0001, 0b001>; def : RWSysReg<"VSESR_EL2", 0b11, 0b100, 0b0101, 0b0010, 0b011>; } +// v8.3a "Pointer authentication extension" registers +// Op0 Op1 CRn CRm Op2 +let Requires = [{ {AArch64::HasV8_3aOps} }] in { +def : RWSysReg<"APIAKeyLo_EL1", 0b11, 0b000, 0b0010, 0b0001, 0b000>; +def : RWSysReg<"APIAKeyHi_EL1", 0b11, 0b000, 0b0010, 0b0001, 0b001>; +def : RWSysReg<"APIBKeyLo_EL1", 0b11, 0b000, 0b0010, 0b0001, 0b010>; +def : RWSysReg<"APIBKeyHi_EL1", 0b11, 0b000, 0b0010, 0b0001, 
0b011>; +def : RWSysReg<"APDAKeyLo_EL1", 0b11, 0b000, 0b0010, 0b0010, 0b000>; +def : RWSysReg<"APDAKeyHi_EL1", 0b11, 0b000, 0b0010, 0b0010, 0b001>; +def : RWSysReg<"APDBKeyLo_EL1", 0b11, 0b000, 0b0010, 0b0010, 0b010>; +def : RWSysReg<"APDBKeyHi_EL1", 0b11, 0b000, 0b0010, 0b0010, 0b011>; +def : RWSysReg<"APGAKeyLo_EL1", 0b11, 0b000, 0b0010, 0b0011, 0b000>; +def : RWSysReg<"APGAKeyHi_EL1", 0b11, 0b000, 0b0010, 0b0011, 0b001>; +} + // Cyclone specific system registers // Op0 Op1 CRn CRm Op2 let Requires = [{ {AArch64::ProcCyclone} }] in diff --git a/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp index ba28c01a2eff..64583ead73f2 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -27,6 +27,7 @@ #include "llvm/CodeGen/GlobalISel/RegBankSelect.h" #include "llvm/CodeGen/MachineScheduler.h" #include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/TargetLoweringObjectFile.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/Function.h" @@ -35,7 +36,6 @@ #include "llvm/Support/CodeGen.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/TargetRegistry.h" -#include "llvm/Target/TargetLoweringObjectFile.h" #include "llvm/Target/TargetOptions.h" #include "llvm/Transforms/Scalar.h" #include <memory> @@ -157,7 +157,7 @@ extern "C" void LLVMInitializeAArch64Target() { initializeAArch64DeadRegisterDefinitionsPass(*PR); initializeAArch64ExpandPseudoPass(*PR); initializeAArch64LoadStoreOptPass(*PR); - initializeAArch64VectorByElementOptPass(*PR); + initializeAArch64SIMDInstrOptPass(*PR); initializeAArch64PromoteConstantPass(*PR); initializeAArch64RedundantCopyEliminationPass(*PR); initializeAArch64StorePairSuppressPass(*PR); @@ -206,20 +206,42 @@ static Reloc::Model getEffectiveRelocModel(const Triple &TT, return *RM; } +static CodeModel::Model getEffectiveCodeModel(const Triple &TT, + Optional<CodeModel::Model> CM, + bool JIT) { + if (CM) { + if (*CM != CodeModel::Small && *CM != CodeModel::Large) { + if (!TT.isOSFuchsia()) + report_fatal_error( + "Only small and large code models are allowed on AArch64"); + else if (CM != CodeModel::Kernel) + report_fatal_error( + "Only small, kernel, and large code models are allowed on AArch64"); + } + return *CM; + } + // The default MCJIT memory managers make no guarantees about where they can + // find an executable page; JITed code needs to be able to refer to globals + // no matter how far away they are. + if (JIT) + return CodeModel::Large; + return CodeModel::Small; +} + /// Create an AArch64 architecture model. /// -AArch64TargetMachine::AArch64TargetMachine( - const Target &T, const Triple &TT, StringRef CPU, StringRef FS, - const TargetOptions &Options, Optional<Reloc::Model> RM, - CodeModel::Model CM, CodeGenOpt::Level OL, bool LittleEndian) - // This nested ternary is horrible, but DL needs to be properly - // initialized before TLInfo is constructed. 
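// For illustration only (not from the patch), the net effect of
// getEffectiveCodeModel above on common invocations:
//
//   llc -mtriple=aarch64-linux-gnu foo.ll                           // default -> Small
//   llc -mtriple=aarch64-linux-gnu -code-model=large foo.ll         // -> Large
//   llc -mtriple=aarch64-linux-gnu -code-model=kernel foo.ll        // -> fatal error
//   llc -mtriple=aarch64-unknown-fuchsia -code-model=kernel foo.ll  // -> Kernel
//
// and a target machine created for MCJIT (JIT == true) with no explicit model
// defaults to CodeModel::Large so JITed code can reach arbitrarily far globals.
// (foo.ll is a placeholder input file.)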
- : LLVMTargetMachine(T, computeDataLayout(TT, Options.MCOptions, - LittleEndian), - TT, CPU, FS, Options, - getEffectiveRelocModel(TT, RM), CM, OL), - TLOF(createTLOF(getTargetTriple())), - isLittle(LittleEndian) { +AArch64TargetMachine::AArch64TargetMachine(const Target &T, const Triple &TT, + StringRef CPU, StringRef FS, + const TargetOptions &Options, + Optional<Reloc::Model> RM, + Optional<CodeModel::Model> CM, + CodeGenOpt::Level OL, bool JIT, + bool LittleEndian) + : LLVMTargetMachine(T, + computeDataLayout(TT, Options.MCOptions, LittleEndian), + TT, CPU, FS, Options, getEffectiveRelocModel(TT, RM), + getEffectiveCodeModel(TT, CM, JIT), OL), + TLOF(createTLOF(getTargetTriple())), isLittle(LittleEndian) { initAsmInfo(); } @@ -254,16 +276,16 @@ void AArch64leTargetMachine::anchor() { } AArch64leTargetMachine::AArch64leTargetMachine( const Target &T, const Triple &TT, StringRef CPU, StringRef FS, const TargetOptions &Options, Optional<Reloc::Model> RM, - CodeModel::Model CM, CodeGenOpt::Level OL) - : AArch64TargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {} + Optional<CodeModel::Model> CM, CodeGenOpt::Level OL, bool JIT) + : AArch64TargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, JIT, true) {} void AArch64beTargetMachine::anchor() { } AArch64beTargetMachine::AArch64beTargetMachine( const Target &T, const Triple &TT, StringRef CPU, StringRef FS, const TargetOptions &Options, Optional<Reloc::Model> RM, - CodeModel::Model CM, CodeGenOpt::Level OL) - : AArch64TargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {} + Optional<CodeModel::Model> CM, CodeGenOpt::Level OL, bool JIT) + : AArch64TargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, JIT, false) {} namespace { @@ -308,13 +330,11 @@ public: void addIRPasses() override; bool addPreISel() override; bool addInstSelector() override; -#ifdef LLVM_BUILD_GLOBAL_ISEL bool addIRTranslator() override; bool addLegalizeMachineIR() override; bool addRegBankSelect() override; void addPreGlobalInstructionSelect() override; bool addGlobalInstructionSelect() override; -#endif bool addILPOpts() override; void addPreRegAlloc() override; void addPostRegAlloc() override; @@ -345,7 +365,7 @@ void AArch64PassConfig::addIRPasses() { // determine whether it succeeded. We can exploit existing control-flow in // ldrex/strex loops to simplify this, but it needs tidying up. 
if (TM->getOptLevel() != CodeGenOpt::None && EnableAtomicTidy) - addPass(createCFGSimplificationPass()); + addPass(createCFGSimplificationPass(1, true, true, false, true)); // Run LoopDataPrefetch // @@ -410,7 +430,6 @@ bool AArch64PassConfig::addInstSelector() { return false; } -#ifdef LLVM_BUILD_GLOBAL_ISEL bool AArch64PassConfig::addIRTranslator() { addPass(new IRTranslator()); return false; @@ -436,7 +455,6 @@ bool AArch64PassConfig::addGlobalInstructionSelect() { addPass(new InstructionSelect()); return false; } -#endif bool AArch64PassConfig::isGlobalISelEnabled() const { return TM->getOptLevel() <= EnableGlobalISelAtO; @@ -455,7 +473,7 @@ bool AArch64PassConfig::addILPOpts() { addPass(&EarlyIfConverterID); if (EnableStPairSuppress) addPass(createAArch64StorePairSuppressPass()); - addPass(createAArch64VectorByElementOptPass()); + addPass(createAArch64SIMDInstrOptPass()); return true; } diff --git a/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.h b/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.h index 85de02e859e0..2bbfb2da3db6 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.h +++ b/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.h @@ -31,13 +31,14 @@ protected: public: AArch64TargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, const TargetOptions &Options, - Optional<Reloc::Model> RM, CodeModel::Model CM, - CodeGenOpt::Level OL, bool IsLittleEndian); + Optional<Reloc::Model> RM, Optional<CodeModel::Model> CM, + CodeGenOpt::Level OL, bool JIT, bool IsLittleEndian); ~AArch64TargetMachine() override; const AArch64Subtarget *getSubtargetImpl(const Function &F) const override; - // The no argument getSubtargetImpl, while it exists on some, targets is - // deprecated and should not be used. + // DO NOT IMPLEMENT: There is no such thing as a valid default subtarget, + // subtargets are per-function entities based on the target-specific + // attributes of each function. const AArch64Subtarget *getSubtargetImpl() const = delete; // Pass Pipeline Configuration @@ -61,8 +62,9 @@ class AArch64leTargetMachine : public AArch64TargetMachine { public: AArch64leTargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, const TargetOptions &Options, - Optional<Reloc::Model> RM, CodeModel::Model CM, - CodeGenOpt::Level OL); + Optional<Reloc::Model> RM, + Optional<CodeModel::Model> CM, CodeGenOpt::Level OL, + bool JIT); }; // AArch64 big endian target machine. 
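// Sketch, not part of the patch: with the LLVM_BUILD_GLOBAL_ISEL guards gone,
// the GlobalISel hooks above are always compiled in and, when GlobalISel is
// enabled (e.g. `llc -global-isel -O0 -mtriple=aarch64--`), run in this order:
//
//   IRTranslator       - build generic MachineInstrs from LLVM IR
//   Legalizer          - legalize generic operations for AArch64
//   RegBankSelect      - assign GPR/FPR register banks
//   (pre-select hook)  - target-specific preparation before selection
//   InstructionSelect  - pick real AArch64 instructions
//
// By default GlobalISel is only used up to the optimization level given by
// EnableGlobalISelAtO, as isGlobalISelEnabled above shows.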
@@ -72,8 +74,9 @@ class AArch64beTargetMachine : public AArch64TargetMachine { public: AArch64beTargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, const TargetOptions &Options, - Optional<Reloc::Model> RM, CodeModel::Model CM, - CodeGenOpt::Level OL); + Optional<Reloc::Model> RM, + Optional<CodeModel::Model> CM, CodeGenOpt::Level OL, + bool JIT); }; } // end namespace llvm diff --git a/contrib/llvm/lib/Target/AArch64/AArch64TargetObjectFile.h b/contrib/llvm/lib/Target/AArch64/AArch64TargetObjectFile.h index 9077eb7902fd..f081d7caba67 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64TargetObjectFile.h +++ b/contrib/llvm/lib/Target/AArch64/AArch64TargetObjectFile.h @@ -10,8 +10,8 @@ #ifndef LLVM_LIB_TARGET_AARCH64_AARCH64TARGETOBJECTFILE_H #define LLVM_LIB_TARGET_AARCH64_AARCH64TARGETOBJECTFILE_H +#include "llvm/CodeGen/TargetLoweringObjectFile.h" #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" -#include "llvm/Target/TargetLoweringObjectFile.h" namespace llvm { class AArch64TargetMachine; diff --git a/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index a76f080530bb..1820ad959fcb 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -12,9 +12,10 @@ #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/CodeGen/BasicTTIImpl.h" +#include "llvm/CodeGen/CostTable.h" +#include "llvm/CodeGen/TargetLowering.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/Support/Debug.h" -#include "llvm/Target/CostTable.h" -#include "llvm/Target/TargetLowering.h" #include <algorithm> using namespace llvm; diff --git a/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index 31c037354925..08c693ff38a8 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -1,4 +1,4 @@ -//===-- AArch64TargetTransformInfo.h - AArch64 specific TTI -----*- C++ -*-===// +//===- AArch64TargetTransformInfo.h - AArch64 specific TTI ------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -18,17 +18,31 @@ #define LLVM_LIB_TARGET_AARCH64_AARCH64TARGETTRANSFORMINFO_H #include "AArch64.h" +#include "AArch64Subtarget.h" #include "AArch64TargetMachine.h" +#include "llvm/ADT/ArrayRef.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/CodeGen/BasicTTIImpl.h" -#include "llvm/Target/TargetLowering.h" -#include <algorithm> +#include "llvm/IR/Function.h" +#include "llvm/IR/Intrinsics.h" +#include <cstdint> namespace llvm { +class APInt; +class Instruction; +class IntrinsicInst; +class Loop; +class SCEV; +class ScalarEvolution; +class Type; +class Value; +class VectorType; + class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> { - typedef BasicTTIImplBase<AArch64TTIImpl> BaseT; - typedef TargetTransformInfo TTI; + using BaseT = BasicTTIImplBase<AArch64TTIImpl>; + using TTI = TargetTransformInfo; + friend BaseT; const AArch64Subtarget *ST; @@ -157,4 +171,4 @@ public: } // end namespace llvm -#endif +#endif // LLVM_LIB_TARGET_AARCH64_AARCH64TARGETTRANSFORMINFO_H diff --git a/contrib/llvm/lib/Target/AArch64/AArch64VectorByElementOpt.cpp b/contrib/llvm/lib/Target/AArch64/AArch64VectorByElementOpt.cpp deleted file mode 100644 index f53af2315ec9..000000000000 --- 
a/contrib/llvm/lib/Target/AArch64/AArch64VectorByElementOpt.cpp +++ /dev/null @@ -1,388 +0,0 @@ -//=- AArch64VectorByElementOpt.cpp - AArch64 vector by element inst opt pass =// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains a pass that performs optimization for vector by element -// SIMD instructions. -// -// Certain SIMD instructions with vector element operand are not efficient. -// Rewrite them into SIMD instructions with vector operands. This rewrite -// is driven by the latency of the instructions. -// -// Example: -// fmla v0.4s, v1.4s, v2.s[1] -// is rewritten into -// dup v3.4s, v2.s[1] -// fmla v0.4s, v1.4s, v3.4s -// -//===----------------------------------------------------------------------===// - -#include "AArch64InstrInfo.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/ADT/StringRef.h" -#include "llvm/CodeGen/MachineBasicBlock.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstr.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineOperand.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/TargetSchedule.h" -#include "llvm/MC/MCInstrDesc.h" -#include "llvm/MC/MCSchedule.h" -#include "llvm/Pass.h" -#include "llvm/Target/TargetInstrInfo.h" -#include "llvm/Target/TargetSubtargetInfo.h" -#include <map> - -using namespace llvm; - -#define DEBUG_TYPE "aarch64-vectorbyelement-opt" - -STATISTIC(NumModifiedInstr, - "Number of vector by element instructions modified"); - -#define AARCH64_VECTOR_BY_ELEMENT_OPT_NAME \ - "AArch64 vector by element instruction optimization pass" - -namespace { - -struct AArch64VectorByElementOpt : public MachineFunctionPass { - static char ID; - - const TargetInstrInfo *TII; - MachineRegisterInfo *MRI; - TargetSchedModel SchedModel; - - AArch64VectorByElementOpt() : MachineFunctionPass(ID) { - initializeAArch64VectorByElementOptPass(*PassRegistry::getPassRegistry()); - } - - /// Based only on latency of instructions, determine if it is cost efficient - /// to replace the instruction InstDesc by the two instructions InstDescRep1 - /// and InstDescRep2. - /// Return true if replacement is recommended. - bool - shouldReplaceInstruction(MachineFunction *MF, const MCInstrDesc *InstDesc, - const MCInstrDesc *InstDescRep1, - const MCInstrDesc *InstDescRep2, - std::map<unsigned, bool> &VecInstElemTable) const; - - /// Determine if we need to exit the vector by element instruction - /// optimization pass early. This makes sure that Targets with no need - /// for this optimization do not spent any compile time on this pass. - /// This check is done by comparing the latency of an indexed FMLA - /// instruction to the latency of the DUP + the latency of a vector - /// FMLA instruction. We do not check on other related instructions such - /// as FMLS as we assume that if the situation shows up for one - /// instruction, then it is likely to show up for the related ones. - /// Return true if early exit of the pass is recommended. - bool earlyExitVectElement(MachineFunction *MF); - - /// Check whether an equivalent DUP instruction has already been - /// created or not. - /// Return true when the dup instruction already exists. 
In this case, - /// DestReg will point to the destination of the already created DUP. - bool reuseDUP(MachineInstr &MI, unsigned DupOpcode, unsigned SrcReg, - unsigned LaneNumber, unsigned *DestReg) const; - - /// Certain SIMD instructions with vector element operand are not efficient. - /// Rewrite them into SIMD instructions with vector operands. This rewrite - /// is driven by the latency of the instructions. - /// Return true if the SIMD instruction is modified. - bool optimizeVectElement(MachineInstr &MI, - std::map<unsigned, bool> *VecInstElemTable) const; - - bool runOnMachineFunction(MachineFunction &Fn) override; - - StringRef getPassName() const override { - return AARCH64_VECTOR_BY_ELEMENT_OPT_NAME; - } -}; - -char AArch64VectorByElementOpt::ID = 0; - -} // end anonymous namespace - -INITIALIZE_PASS(AArch64VectorByElementOpt, "aarch64-vectorbyelement-opt", - AARCH64_VECTOR_BY_ELEMENT_OPT_NAME, false, false) - -/// Based only on latency of instructions, determine if it is cost efficient -/// to replace the instruction InstDesc by the two instructions InstDescRep1 -/// and InstDescRep2. Note that it is assumed in this fuction that an -/// instruction of type InstDesc is always replaced by the same two -/// instructions as results are cached here. -/// Return true if replacement is recommended. -bool AArch64VectorByElementOpt::shouldReplaceInstruction( - MachineFunction *MF, const MCInstrDesc *InstDesc, - const MCInstrDesc *InstDescRep1, const MCInstrDesc *InstDescRep2, - std::map<unsigned, bool> &VecInstElemTable) const { - // Check if replacment decision is alredy available in the cached table. - // if so, return it. - if (!VecInstElemTable.empty() && - VecInstElemTable.find(InstDesc->getOpcode()) != VecInstElemTable.end()) - return VecInstElemTable[InstDesc->getOpcode()]; - - unsigned SCIdx = InstDesc->getSchedClass(); - unsigned SCIdxRep1 = InstDescRep1->getSchedClass(); - unsigned SCIdxRep2 = InstDescRep2->getSchedClass(); - const MCSchedClassDesc *SCDesc = - SchedModel.getMCSchedModel()->getSchedClassDesc(SCIdx); - const MCSchedClassDesc *SCDescRep1 = - SchedModel.getMCSchedModel()->getSchedClassDesc(SCIdxRep1); - const MCSchedClassDesc *SCDescRep2 = - SchedModel.getMCSchedModel()->getSchedClassDesc(SCIdxRep2); - - // If a subtarget does not define resources for any of the instructions - // of interest, then return false for no replacement. - if (!SCDesc->isValid() || SCDesc->isVariant() || !SCDescRep1->isValid() || - SCDescRep1->isVariant() || !SCDescRep2->isValid() || - SCDescRep2->isVariant()) { - VecInstElemTable[InstDesc->getOpcode()] = false; - return false; - } - - if (SchedModel.computeInstrLatency(InstDesc->getOpcode()) > - SchedModel.computeInstrLatency(InstDescRep1->getOpcode()) + - SchedModel.computeInstrLatency(InstDescRep2->getOpcode())) { - VecInstElemTable[InstDesc->getOpcode()] = true; - return true; - } - VecInstElemTable[InstDesc->getOpcode()] = false; - return false; -} - -/// Determine if we need to exit the vector by element instruction -/// optimization pass early. This makes sure that Targets with no need -/// for this optimization do not spent any compile time on this pass. -/// This check is done by comparing the latency of an indexed FMLA -/// instruction to the latency of the DUP + the latency of a vector -/// FMLA instruction. We do not check on other related instructions such -/// as FMLS as we assume that if the situation shows up for one -/// instruction, then it is likely to show up for the related ones. 
-/// Return true if early exit of the pass is recommended. -bool AArch64VectorByElementOpt::earlyExitVectElement(MachineFunction *MF) { - std::map<unsigned, bool> VecInstElemTable; - const MCInstrDesc *IndexMulMCID = &TII->get(AArch64::FMLAv4i32_indexed); - const MCInstrDesc *DupMCID = &TII->get(AArch64::DUPv4i32lane); - const MCInstrDesc *MulMCID = &TII->get(AArch64::FMULv4f32); - - if (!shouldReplaceInstruction(MF, IndexMulMCID, DupMCID, MulMCID, - VecInstElemTable)) - return true; - return false; -} - -/// Check whether an equivalent DUP instruction has already been -/// created or not. -/// Return true when the dup instruction already exists. In this case, -/// DestReg will point to the destination of the already created DUP. -bool AArch64VectorByElementOpt::reuseDUP(MachineInstr &MI, unsigned DupOpcode, - unsigned SrcReg, unsigned LaneNumber, - unsigned *DestReg) const { - for (MachineBasicBlock::iterator MII = MI, MIE = MI.getParent()->begin(); - MII != MIE;) { - MII--; - MachineInstr *CurrentMI = &*MII; - - if (CurrentMI->getOpcode() == DupOpcode && - CurrentMI->getNumOperands() == 3 && - CurrentMI->getOperand(1).getReg() == SrcReg && - CurrentMI->getOperand(2).getImm() == LaneNumber) { - *DestReg = CurrentMI->getOperand(0).getReg(); - return true; - } - } - - return false; -} - -/// Certain SIMD instructions with vector element operand are not efficient. -/// Rewrite them into SIMD instructions with vector operands. This rewrite -/// is driven by the latency of the instructions. -/// The instruction of concerns are for the time being fmla, fmls, fmul, -/// and fmulx and hence they are hardcoded. -/// -/// Example: -/// fmla v0.4s, v1.4s, v2.s[1] -/// is rewritten into -/// dup v3.4s, v2.s[1] // dup not necessary if redundant -/// fmla v0.4s, v1.4s, v3.4s -/// Return true if the SIMD instruction is modified. 
-bool AArch64VectorByElementOpt::optimizeVectElement( - MachineInstr &MI, std::map<unsigned, bool> *VecInstElemTable) const { - const MCInstrDesc *MulMCID, *DupMCID; - const TargetRegisterClass *RC = &AArch64::FPR128RegClass; - - switch (MI.getOpcode()) { - default: - return false; - - // 4X32 instructions - case AArch64::FMLAv4i32_indexed: - DupMCID = &TII->get(AArch64::DUPv4i32lane); - MulMCID = &TII->get(AArch64::FMLAv4f32); - break; - case AArch64::FMLSv4i32_indexed: - DupMCID = &TII->get(AArch64::DUPv4i32lane); - MulMCID = &TII->get(AArch64::FMLSv4f32); - break; - case AArch64::FMULXv4i32_indexed: - DupMCID = &TII->get(AArch64::DUPv4i32lane); - MulMCID = &TII->get(AArch64::FMULXv4f32); - break; - case AArch64::FMULv4i32_indexed: - DupMCID = &TII->get(AArch64::DUPv4i32lane); - MulMCID = &TII->get(AArch64::FMULv4f32); - break; - - // 2X64 instructions - case AArch64::FMLAv2i64_indexed: - DupMCID = &TII->get(AArch64::DUPv2i64lane); - MulMCID = &TII->get(AArch64::FMLAv2f64); - break; - case AArch64::FMLSv2i64_indexed: - DupMCID = &TII->get(AArch64::DUPv2i64lane); - MulMCID = &TII->get(AArch64::FMLSv2f64); - break; - case AArch64::FMULXv2i64_indexed: - DupMCID = &TII->get(AArch64::DUPv2i64lane); - MulMCID = &TII->get(AArch64::FMULXv2f64); - break; - case AArch64::FMULv2i64_indexed: - DupMCID = &TII->get(AArch64::DUPv2i64lane); - MulMCID = &TII->get(AArch64::FMULv2f64); - break; - - // 2X32 instructions - case AArch64::FMLAv2i32_indexed: - RC = &AArch64::FPR64RegClass; - DupMCID = &TII->get(AArch64::DUPv2i32lane); - MulMCID = &TII->get(AArch64::FMLAv2f32); - break; - case AArch64::FMLSv2i32_indexed: - RC = &AArch64::FPR64RegClass; - DupMCID = &TII->get(AArch64::DUPv2i32lane); - MulMCID = &TII->get(AArch64::FMLSv2f32); - break; - case AArch64::FMULXv2i32_indexed: - RC = &AArch64::FPR64RegClass; - DupMCID = &TII->get(AArch64::DUPv2i32lane); - MulMCID = &TII->get(AArch64::FMULXv2f32); - break; - case AArch64::FMULv2i32_indexed: - RC = &AArch64::FPR64RegClass; - DupMCID = &TII->get(AArch64::DUPv2i32lane); - MulMCID = &TII->get(AArch64::FMULv2f32); - break; - } - - if (!shouldReplaceInstruction(MI.getParent()->getParent(), - &TII->get(MI.getOpcode()), DupMCID, MulMCID, - *VecInstElemTable)) - return false; - - const DebugLoc &DL = MI.getDebugLoc(); - MachineBasicBlock &MBB = *MI.getParent(); - MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); - - // get the operands of the current SIMD arithmetic instruction. - unsigned MulDest = MI.getOperand(0).getReg(); - unsigned SrcReg0 = MI.getOperand(1).getReg(); - unsigned Src0IsKill = getKillRegState(MI.getOperand(1).isKill()); - unsigned SrcReg1 = MI.getOperand(2).getReg(); - unsigned Src1IsKill = getKillRegState(MI.getOperand(2).isKill()); - unsigned DupDest; - - // Instructions of interest have either 4 or 5 operands. - if (MI.getNumOperands() == 5) { - unsigned SrcReg2 = MI.getOperand(3).getReg(); - unsigned Src2IsKill = getKillRegState(MI.getOperand(3).isKill()); - unsigned LaneNumber = MI.getOperand(4).getImm(); - - // Create a new DUP instruction. Note that if an equivalent DUP instruction - // has already been created before, then use that one instread of creating - // a new one. 
- if (!reuseDUP(MI, DupMCID->getOpcode(), SrcReg2, LaneNumber, &DupDest)) { - DupDest = MRI.createVirtualRegister(RC); - BuildMI(MBB, MI, DL, *DupMCID, DupDest) - .addReg(SrcReg2, Src2IsKill) - .addImm(LaneNumber); - } - BuildMI(MBB, MI, DL, *MulMCID, MulDest) - .addReg(SrcReg0, Src0IsKill) - .addReg(SrcReg1, Src1IsKill) - .addReg(DupDest, Src2IsKill); - } else if (MI.getNumOperands() == 4) { - unsigned LaneNumber = MI.getOperand(3).getImm(); - if (!reuseDUP(MI, DupMCID->getOpcode(), SrcReg1, LaneNumber, &DupDest)) { - DupDest = MRI.createVirtualRegister(RC); - BuildMI(MBB, MI, DL, *DupMCID, DupDest) - .addReg(SrcReg1, Src1IsKill) - .addImm(LaneNumber); - } - BuildMI(MBB, MI, DL, *MulMCID, MulDest) - .addReg(SrcReg0, Src0IsKill) - .addReg(DupDest, Src1IsKill); - } else { - return false; - } - - ++NumModifiedInstr; - return true; -} - -bool AArch64VectorByElementOpt::runOnMachineFunction(MachineFunction &MF) { - if (skipFunction(*MF.getFunction())) - return false; - - TII = MF.getSubtarget().getInstrInfo(); - MRI = &MF.getRegInfo(); - const TargetSubtargetInfo &ST = MF.getSubtarget(); - const AArch64InstrInfo *AAII = - static_cast<const AArch64InstrInfo *>(ST.getInstrInfo()); - if (!AAII) - return false; - SchedModel.init(ST.getSchedModel(), &ST, AAII); - if (!SchedModel.hasInstrSchedModel()) - return false; - - // A simple check to exit this pass early for targets that do not need it. - if (earlyExitVectElement(&MF)) - return false; - - bool Changed = false; - std::map<unsigned, bool> VecInstElemTable; - SmallVector<MachineInstr *, 8> RemoveMIs; - - for (MachineBasicBlock &MBB : MF) { - for (MachineBasicBlock::iterator MII = MBB.begin(), MIE = MBB.end(); - MII != MIE;) { - MachineInstr &MI = *MII; - if (optimizeVectElement(MI, &VecInstElemTable)) { - // Add MI to the list of instructions to be removed given that it has - // been replaced. - RemoveMIs.push_back(&MI); - Changed = true; - } - ++MII; - } - } - - for (MachineInstr *MI : RemoveMIs) - MI->eraseFromParent(); - - return Changed; -} - -/// createAArch64VectorByElementOptPass - returns an instance of the -/// vector by element optimization pass. -FunctionPass *llvm::createAArch64VectorByElementOptPass() { - return new AArch64VectorByElementOpt(); -} diff --git a/contrib/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/contrib/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index a79d51820545..aeffbd70fc81 100644 --- a/contrib/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/contrib/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -59,12 +59,19 @@ using namespace llvm; namespace { +enum class RegKind { + Scalar, + NeonVector, + SVEDataVector, + SVEPredicateVector +}; + class AArch64AsmParser : public MCTargetAsmParser { private: StringRef Mnemonic; ///< Instruction mnemonic. // Map of register aliases registers via the .req directive. 
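// Illustration, not from the patch: a plain is-vector flag can no longer
// distinguish alias kinds once SVE registers exist, so the map below records a
// RegKind instead. For example (alias names are made up):
//
//   foo  .req x4        // RegKind::Scalar
//   vacc .req v0        // RegKind::NeonVector
//   zacc .req z0        // RegKind::SVEDataVector
//   pgrd .req p0        // RegKind::SVEPredicateVector
//
// and matchRegisterNameAlias only resolves an alias when it is used where a
// register of that recorded kind is expected.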
- StringMap<std::pair<bool, unsigned>> RegisterReqs; + StringMap<std::pair<RegKind, unsigned>> RegisterReqs; AArch64TargetStreamer &getTargetStreamer() { MCTargetStreamer &TS = *getParser().getStreamer().getTargetStreamer(); @@ -77,7 +84,7 @@ private: void createSysAlias(uint16_t Encoding, OperandVector &Operands, SMLoc S); AArch64CC::CondCode parseCondCodeString(StringRef Cond); bool parseCondCode(OperandVector &Operands, bool invertCondCode); - unsigned matchRegisterNameAlias(StringRef Name, bool isVector); + unsigned matchRegisterNameAlias(StringRef Name, RegKind Kind); int tryParseRegister(); int tryMatchVectorRegister(StringRef &Kind, bool expected); bool parseRegister(OperandVector &Operands); @@ -114,6 +121,8 @@ private: /// } + OperandMatchResultTy tryParseSVERegister(int &Reg, StringRef &Kind, + RegKind MatchKind); OperandMatchResultTy tryParseOptionalShiftExtend(OperandVector &Operands); OperandMatchResultTy tryParseBarrierOperand(OperandVector &Operands); OperandMatchResultTy tryParseMRSSystemRegister(OperandVector &Operands); @@ -126,8 +135,11 @@ private: OperandMatchResultTy tryParseFPImm(OperandVector &Operands); OperandMatchResultTy tryParseAddSubImm(OperandVector &Operands); OperandMatchResultTy tryParseGPR64sp0Operand(OperandVector &Operands); - bool tryParseVectorRegister(OperandVector &Operands); + bool tryParseNeonVectorRegister(OperandVector &Operands); OperandMatchResultTy tryParseGPRSeqPair(OperandVector &Operands); + template <bool ParseSuffix> + OperandMatchResultTy tryParseSVEDataVector(OperandVector &Operands); + OperandMatchResultTy tryParseSVEPredicateVector(OperandVector &Operands); public: enum AArch64MatchResultTy { @@ -139,7 +151,7 @@ public: AArch64AsmParser(const MCSubtargetInfo &STI, MCAsmParser &Parser, const MCInstrInfo &MII, const MCTargetOptions &Options) - : MCTargetAsmParser(Options, STI) { + : MCTargetAsmParser(Options, STI, MII) { IsILP32 = Options.getABIName() == "ilp32"; MCAsmParserExtension::Initialize(Parser); MCStreamer &S = getParser().getStreamer(); @@ -194,7 +206,9 @@ private: struct RegOp { unsigned RegNum; - bool isVector; + RegKind Kind; + + int ElementWidth; }; struct VectorListOp { @@ -465,6 +479,15 @@ public: int64_t Val = MCE->getValue(); return (Val >= -256 && Val < 256); } + bool isSImm10s8() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); + if (!MCE) + return false; + int64_t Val = MCE->getValue(); + return (Val >= -4096 && Val < 4089 && (Val & 7) == 0); + } bool isSImm7s4() const { if (!isImm()) return false; @@ -795,37 +818,76 @@ public: return SysReg.PStateField != -1U; } - bool isReg() const override { return Kind == k_Register && !Reg.isVector; } - bool isVectorReg() const { return Kind == k_Register && Reg.isVector; } + bool isReg() const override { + return Kind == k_Register && Reg.Kind == RegKind::Scalar; + } + + bool isNeonVectorReg() const { + return Kind == k_Register && Reg.Kind == RegKind::NeonVector; + } - bool isVectorRegLo() const { - return Kind == k_Register && Reg.isVector && + bool isNeonVectorRegLo() const { + return Kind == k_Register && Reg.Kind == RegKind::NeonVector && AArch64MCRegisterClasses[AArch64::FPR128_loRegClassID].contains( Reg.RegNum); } + template <unsigned Class> bool isSVEVectorReg() const { + RegKind RK; + switch (Class) { + case AArch64::ZPRRegClassID: + RK = RegKind::SVEDataVector; + break; + case AArch64::PPRRegClassID: + RK = RegKind::SVEPredicateVector; + break; + default: + llvm_unreachable("Unsupport register class"); + } + + 
return (Kind == k_Register && Reg.Kind == RK) && + AArch64MCRegisterClasses[Class].contains(getReg()); + } + + template <int ElementWidth, unsigned Class> + bool isSVEVectorRegOfWidth() const { + return isSVEVectorReg<Class>() && + (ElementWidth == -1 || Reg.ElementWidth == ElementWidth); + } + bool isGPR32as64() const { - return Kind == k_Register && !Reg.isVector && + return Kind == k_Register && Reg.Kind == RegKind::Scalar && AArch64MCRegisterClasses[AArch64::GPR64RegClassID].contains(Reg.RegNum); } bool isWSeqPair() const { - return Kind == k_Register && !Reg.isVector && + return Kind == k_Register && Reg.Kind == RegKind::Scalar && AArch64MCRegisterClasses[AArch64::WSeqPairsClassRegClassID].contains( Reg.RegNum); } bool isXSeqPair() const { - return Kind == k_Register && !Reg.isVector && + return Kind == k_Register && Reg.Kind == RegKind::Scalar && AArch64MCRegisterClasses[AArch64::XSeqPairsClassRegClassID].contains( Reg.RegNum); } bool isGPR64sp0() const { - return Kind == k_Register && !Reg.isVector && + return Kind == k_Register && Reg.Kind == RegKind::Scalar && AArch64MCRegisterClasses[AArch64::GPR64spRegClassID].contains(Reg.RegNum); } + template<int64_t Angle, int64_t Remainder> + bool isComplexRotation() const { + if (!isImm()) return false; + + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) return false; + uint64_t Value = CE->getValue(); + + return (Value % Angle == Remainder && Value <= 270); + } + /// Is this a vector list with the type implicit (presumably attached to the /// instruction itself)? template <unsigned NumRegs> bool isImplicitlyTypedVectorList() const { @@ -1213,6 +1275,12 @@ public: Inst.addOperand(MCOperand::createImm(MCE->getValue())); } + void addSImm10s8Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm()); + Inst.addOperand(MCOperand::createImm(MCE->getValue() / 8)); + } + void addSImm7s4Operands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm()); @@ -1512,6 +1580,18 @@ public: Inst.addOperand(MCOperand::createImm((~Value >> Shift) & 0xffff)); } + void addComplexRotationEvenOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm()); + Inst.addOperand(MCOperand::createImm(MCE->getValue() / 90)); + } + + void addComplexRotationOddOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm()); + Inst.addOperand(MCOperand::createImm((MCE->getValue() - 90) / 180)); + } + void print(raw_ostream &OS) const override; static std::unique_ptr<AArch64Operand> @@ -1526,10 +1606,22 @@ public: } static std::unique_ptr<AArch64Operand> - CreateReg(unsigned RegNum, bool isVector, SMLoc S, SMLoc E, MCContext &Ctx) { + CreateReg(unsigned RegNum, RegKind Kind, SMLoc S, SMLoc E, MCContext &Ctx) { auto Op = make_unique<AArch64Operand>(k_Register, Ctx); Op->Reg.RegNum = RegNum; - Op->Reg.isVector = isVector; + Op->Reg.Kind = Kind; + Op->StartLoc = S; + Op->EndLoc = E; + return Op; + } + + static std::unique_ptr<AArch64Operand> + CreateReg(unsigned RegNum, RegKind Kind, unsigned ElementWidth, + SMLoc S, SMLoc E, MCContext &Ctx) { + auto Op = make_unique<AArch64Operand>(k_Register, Ctx); + Op->Reg.RegNum = RegNum; + Op->Reg.ElementWidth = ElementWidth; + Op->Reg.Kind = Kind; 
Op->StartLoc = S; Op->EndLoc = E; return Op; @@ -1753,7 +1845,7 @@ static unsigned MatchRegisterName(StringRef Name); /// } -static unsigned matchVectorRegName(StringRef Name) { +static unsigned MatchNeonVectorRegName(StringRef Name) { return StringSwitch<unsigned>(Name.lower()) .Case("v0", AArch64::Q0) .Case("v1", AArch64::Q1) @@ -1810,9 +1902,83 @@ static bool isValidVectorKind(StringRef Name) { .Case(".d", true) // Needed for fp16 scalar pairwise reductions .Case(".2h", true) + // another special case for the ARMv8.2a dot product operand + .Case(".4b", true) .Default(false); } +static unsigned matchSVEDataVectorRegName(StringRef Name) { + return StringSwitch<unsigned>(Name.lower()) + .Case("z0", AArch64::Z0) + .Case("z1", AArch64::Z1) + .Case("z2", AArch64::Z2) + .Case("z3", AArch64::Z3) + .Case("z4", AArch64::Z4) + .Case("z5", AArch64::Z5) + .Case("z6", AArch64::Z6) + .Case("z7", AArch64::Z7) + .Case("z8", AArch64::Z8) + .Case("z9", AArch64::Z9) + .Case("z10", AArch64::Z10) + .Case("z11", AArch64::Z11) + .Case("z12", AArch64::Z12) + .Case("z13", AArch64::Z13) + .Case("z14", AArch64::Z14) + .Case("z15", AArch64::Z15) + .Case("z16", AArch64::Z16) + .Case("z17", AArch64::Z17) + .Case("z18", AArch64::Z18) + .Case("z19", AArch64::Z19) + .Case("z20", AArch64::Z20) + .Case("z21", AArch64::Z21) + .Case("z22", AArch64::Z22) + .Case("z23", AArch64::Z23) + .Case("z24", AArch64::Z24) + .Case("z25", AArch64::Z25) + .Case("z26", AArch64::Z26) + .Case("z27", AArch64::Z27) + .Case("z28", AArch64::Z28) + .Case("z29", AArch64::Z29) + .Case("z30", AArch64::Z30) + .Case("z31", AArch64::Z31) + .Default(0); +} + +static unsigned matchSVEPredicateVectorRegName(StringRef Name) { + return StringSwitch<unsigned>(Name.lower()) + .Case("p0", AArch64::P0) + .Case("p1", AArch64::P1) + .Case("p2", AArch64::P2) + .Case("p3", AArch64::P3) + .Case("p4", AArch64::P4) + .Case("p5", AArch64::P5) + .Case("p6", AArch64::P6) + .Case("p7", AArch64::P7) + .Case("p8", AArch64::P8) + .Case("p9", AArch64::P9) + .Case("p10", AArch64::P10) + .Case("p11", AArch64::P11) + .Case("p12", AArch64::P12) + .Case("p13", AArch64::P13) + .Case("p14", AArch64::P14) + .Case("p15", AArch64::P15) + .Default(0); +} + +static bool isValidSVEKind(StringRef Name) { + return StringSwitch<bool>(Name.lower()) + .Case(".b", true) + .Case(".h", true) + .Case(".s", true) + .Case(".d", true) + .Case(".q", true) + .Default(false); +} + +static bool isSVERegister(StringRef Name) { + return Name[0] == 'z' || Name[0] == 'p'; +} + static void parseValidVectorKind(StringRef Name, unsigned &NumElements, char &ElementKind) { assert(isValidVectorKind(Name)); @@ -1841,19 +2007,33 @@ bool AArch64AsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, // Matches a register name or register alias previously defined by '.req' unsigned AArch64AsmParser::matchRegisterNameAlias(StringRef Name, - bool isVector) { - unsigned RegNum = isVector ? matchVectorRegName(Name) - : MatchRegisterName(Name); + RegKind Kind) { + unsigned RegNum; + switch (Kind) { + case RegKind::Scalar: + RegNum = MatchRegisterName(Name); + break; + case RegKind::NeonVector: + RegNum = MatchNeonVectorRegName(Name); + break; + case RegKind::SVEDataVector: + RegNum = matchSVEDataVectorRegName(Name); + break; + case RegKind::SVEPredicateVector: + RegNum = matchSVEPredicateVectorRegName(Name); + break; + } - if (RegNum == 0) { + if (!RegNum) { // Check for aliases registered via .req. Canonicalize to lower case. 
// That's more consistent since register names are case insensitive, and // it's how the original entry was passed in from MC/MCParser/AsmParser. auto Entry = RegisterReqs.find(Name.lower()); if (Entry == RegisterReqs.end()) return 0; + // set RegNum if the match is the right kind of register - if (isVector == Entry->getValue().first) + if (Kind == Entry->getValue().first) RegNum = Entry->getValue().second; } return RegNum; @@ -1869,7 +2049,10 @@ int AArch64AsmParser::tryParseRegister() { return -1; std::string lowerCase = Tok.getString().lower(); - unsigned RegNum = matchRegisterNameAlias(lowerCase, false); + if (isSVERegister(lowerCase)) + return -1; + + unsigned RegNum = matchRegisterNameAlias(lowerCase, RegKind::Scalar); // Also handle a few aliases of registers. if (RegNum == 0) RegNum = StringSwitch<unsigned>(lowerCase) @@ -1900,7 +2083,7 @@ int AArch64AsmParser::tryMatchVectorRegister(StringRef &Kind, bool expected) { // a '.'. size_t Start = 0, Next = Name.find('.'); StringRef Head = Name.slice(Start, Next); - unsigned RegNum = matchRegisterNameAlias(Head, true); + unsigned RegNum = matchRegisterNameAlias(Head, RegKind::NeonVector); if (RegNum) { if (Next != StringRef::npos) { @@ -2519,8 +2702,8 @@ AArch64AsmParser::tryParseSysReg(OperandVector &Operands) { return MatchOperand_Success; } -/// tryParseVectorRegister - Parse a vector register operand. -bool AArch64AsmParser::tryParseVectorRegister(OperandVector &Operands) { +/// tryParseNeonVectorRegister - Parse a vector register operand. +bool AArch64AsmParser::tryParseNeonVectorRegister(OperandVector &Operands) { MCAsmParser &Parser = getParser(); if (Parser.getTok().isNot(AsmToken::Identifier)) return true; @@ -2532,7 +2715,9 @@ bool AArch64AsmParser::tryParseVectorRegister(OperandVector &Operands) { if (Reg == -1) return true; Operands.push_back( - AArch64Operand::CreateReg(Reg, true, S, getLoc(), getContext())); + AArch64Operand::CreateReg(Reg, RegKind::NeonVector, S, getLoc(), + getContext())); + // If there was an explicit qualifier, that goes on as a literal text // operand. if (!Kind.empty()) @@ -2563,19 +2748,85 @@ bool AArch64AsmParser::tryParseVectorRegister(OperandVector &Operands) { return false; } +// tryParseSVEDataVectorRegister - Try to parse a SVE vector register name with +// optional kind specifier. If it is a register specifier, eat the token +// and return it. +OperandMatchResultTy +AArch64AsmParser::tryParseSVERegister(int &Reg, StringRef &Kind, + RegKind MatchKind) { + MCAsmParser &Parser = getParser(); + const AsmToken &Tok = Parser.getTok(); + + if (Tok.isNot(AsmToken::Identifier)) + return MatchOperand_NoMatch; + + StringRef Name = Tok.getString(); + // If there is a kind specifier, it's separated from the register name by + // a '.'. + size_t Start = 0, Next = Name.find('.'); + StringRef Head = Name.slice(Start, Next); + unsigned RegNum = matchRegisterNameAlias(Head, MatchKind); + + if (RegNum) { + if (Next != StringRef::npos) { + Kind = Name.slice(Next, StringRef::npos); + if (!isValidSVEKind(Kind)) { + TokError("invalid sve vector kind qualifier"); + return MatchOperand_ParseFail; + } + } + Parser.Lex(); // Eat the register token. + + Reg = RegNum; + return MatchOperand_Success; + } + + return MatchOperand_NoMatch; +} + +/// tryParseSVEPredicateVector - Parse a SVE predicate register operand. +OperandMatchResultTy +AArch64AsmParser::tryParseSVEPredicateVector(OperandVector &Operands) { + // Check for a SVE predicate register specifier first. 
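// (Illustration, not from the patch: SVE predicate registers are p0-p15 with
// an optional element-size qualifier, e.g. "p0", "p7.b", "p3.h", "p15.d".
// The qualifier maps to an element width just below: .b -> 8, .h -> 16,
// .s -> 32, .d -> 64, and a bare register leaves the width unconstrained.)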
+ const SMLoc S = getLoc(); + StringRef Kind; + int RegNum = -1; + auto Res = tryParseSVERegister(RegNum, Kind, RegKind::SVEPredicateVector); + if (Res != MatchOperand_Success) + return Res; + + unsigned ElementWidth = StringSwitch<unsigned>(Kind.lower()) + .Case("", -1) + .Case(".b", 8) + .Case(".h", 16) + .Case(".s", 32) + .Case(".d", 64) + .Case(".q", 128) + .Default(0); + + if (!ElementWidth) + return MatchOperand_NoMatch; + + Operands.push_back( + AArch64Operand::CreateReg(RegNum, RegKind::SVEPredicateVector, + ElementWidth, S, getLoc(), getContext())); + + return MatchOperand_Success; +} + /// parseRegister - Parse a non-vector register operand. bool AArch64AsmParser::parseRegister(OperandVector &Operands) { SMLoc S = getLoc(); - // Try for a vector register. - if (!tryParseVectorRegister(Operands)) + // Try for a vector (neon) register. + if (!tryParseNeonVectorRegister(Operands)) return false; // Try for a scalar register. int64_t Reg = tryParseRegister(); if (Reg == -1) return true; - Operands.push_back( - AArch64Operand::CreateReg(Reg, false, S, getLoc(), getContext())); + Operands.push_back(AArch64Operand::CreateReg(Reg, RegKind::Scalar, S, + getLoc(), getContext())); return false; } @@ -2743,7 +2994,7 @@ AArch64AsmParser::tryParseGPR64sp0Operand(OperandVector &Operands) { if (!Tok.is(AsmToken::Identifier)) return MatchOperand_NoMatch; - unsigned RegNum = matchRegisterNameAlias(Tok.getString().lower(), false); + unsigned RegNum = matchRegisterNameAlias(Tok.getString().lower(), RegKind::Scalar); MCContext &Ctx = getContext(); const MCRegisterInfo *RI = Ctx.getRegisterInfo(); @@ -2755,7 +3006,7 @@ AArch64AsmParser::tryParseGPR64sp0Operand(OperandVector &Operands) { if (!parseOptionalToken(AsmToken::Comma)) { Operands.push_back( - AArch64Operand::CreateReg(RegNum, false, S, getLoc(), Ctx)); + AArch64Operand::CreateReg(RegNum, RegKind::Scalar, S, getLoc(), Ctx)); return MatchOperand_Success; } @@ -2774,7 +3025,7 @@ AArch64AsmParser::tryParseGPR64sp0Operand(OperandVector &Operands) { } Operands.push_back( - AArch64Operand::CreateReg(RegNum, false, S, getLoc(), Ctx)); + AArch64Operand::CreateReg(RegNum, RegKind::Scalar, S, getLoc(), Ctx)); return MatchOperand_Success; } @@ -2783,9 +3034,12 @@ AArch64AsmParser::tryParseGPR64sp0Operand(OperandVector &Operands) { bool AArch64AsmParser::parseOperand(OperandVector &Operands, bool isCondCode, bool invertCondCode) { MCAsmParser &Parser = getParser(); + + OperandMatchResultTy ResTy = + MatchOperandParserImpl(Operands, Mnemonic, /*ParseForAllFeatures=*/ true); + // Check if the current operand has a custom associated parser, if so, try to // custom parse the operand, or fallback to the general approach. - OperandMatchResultTy ResTy = MatchOperandParserImpl(Operands, Mnemonic); if (ResTy == MatchOperand_Success) return false; // If there wasn't a custom match, try the generic matcher below. 
Otherwise, @@ -3257,7 +3511,8 @@ bool AArch64AsmParser::validateInstruction(MCInst &Inst, } } -std::string AArch64MnemonicSpellCheck(StringRef S, uint64_t FBS); +static std::string AArch64MnemonicSpellCheck(StringRef S, uint64_t FBS, + unsigned VariantID = 0); bool AArch64AsmParser::showMatchError(SMLoc Loc, unsigned ErrCode, OperandVector &Operands) { @@ -3297,6 +3552,8 @@ bool AArch64AsmParser::showMatchError(SMLoc Loc, unsigned ErrCode, "expected compatible register or floating-point constant"); case Match_InvalidMemoryIndexedSImm9: return Error(Loc, "index must be an integer in range [-256, 255]."); + case Match_InvalidMemoryIndexedSImm10: + return Error(Loc, "index must be a multiple of 8 in range [-4096, 4088]."); case Match_InvalidMemoryIndexed4SImm7: return Error(Loc, "index must be a multiple of 4 in range [-256, 252]."); case Match_InvalidMemoryIndexed8SImm7: @@ -3383,12 +3640,22 @@ bool AArch64AsmParser::showMatchError(SMLoc Loc, unsigned ErrCode, return Error(Loc, "expected readable system register"); case Match_MSR: return Error(Loc, "expected writable system register or pstate"); + case Match_InvalidComplexRotationEven: + return Error(Loc, "complex rotation must be 0, 90, 180 or 270."); + case Match_InvalidComplexRotationOdd: + return Error(Loc, "complex rotation must be 90 or 270."); case Match_MnemonicFail: { std::string Suggestion = AArch64MnemonicSpellCheck( ((AArch64Operand &)*Operands[0]).getToken(), ComputeAvailableFeatures(STI->getFeatureBits())); return Error(Loc, "unrecognized instruction mnemonic" + Suggestion); } + case Match_InvalidSVEPredicateAnyReg: + case Match_InvalidSVEPredicateBReg: + case Match_InvalidSVEPredicateHReg: + case Match_InvalidSVEPredicateSReg: + case Match_InvalidSVEPredicateDReg: + return Error(Loc, "invalid predicate register."); default: llvm_unreachable("unexpected error code!"); } @@ -3482,8 +3749,8 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, Operands[0] = AArch64Operand::CreateToken( "bfm", false, Op.getStartLoc(), getContext()); Operands[2] = AArch64Operand::CreateReg( - RegWidth == 32 ? AArch64::WZR : AArch64::XZR, false, SMLoc(), - SMLoc(), getContext()); + RegWidth == 32 ? AArch64::WZR : AArch64::XZR, RegKind::Scalar, + SMLoc(), SMLoc(), getContext()); Operands[3] = AArch64Operand::CreateImm( ImmRExpr, LSBOp.getStartLoc(), LSBOp.getEndLoc(), getContext()); Operands.emplace_back( @@ -3610,6 +3877,31 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, } } } + + // The Cyclone CPU and early successors didn't execute the zero-cycle zeroing + // instruction for FP registers correctly in some rare circumstances. Convert + // it to a safe instruction and warn (because silently changing someone's + // assembly is rude). + if (getSTI().getFeatureBits()[AArch64::FeatureZCZeroingFPWorkaround] && + NumOperands == 4 && Tok == "movi") { + AArch64Operand &Op1 = static_cast<AArch64Operand &>(*Operands[1]); + AArch64Operand &Op2 = static_cast<AArch64Operand &>(*Operands[2]); + AArch64Operand &Op3 = static_cast<AArch64Operand &>(*Operands[3]); + if ((Op1.isToken() && Op2.isNeonVectorReg() && Op3.isImm()) || + (Op1.isNeonVectorReg() && Op2.isToken() && Op3.isImm())) { + StringRef Suffix = Op1.isToken() ? 
Op1.getToken() : Op2.getToken(); + if (Suffix.lower() == ".2d" && + cast<MCConstantExpr>(Op3.getImm())->getValue() == 0) { + Warning(IDLoc, "instruction movi.2d with immediate #0 may not function" + " correctly on this CPU, converting to equivalent movi.16b"); + // Switch the suffix to .16b. + unsigned Idx = Op1.isToken() ? 1 : 2; + Operands[Idx] = AArch64Operand::CreateToken(".16b", false, IDLoc, + getContext()); + } + } + } + // FIXME: Horrible hack for sxtw and uxtw with Wn src and Xd dst operands. // InstAlias can't quite handle this since the reg classes aren't // subclasses. @@ -3619,8 +3911,9 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, AArch64Operand &Op = static_cast<AArch64Operand &>(*Operands[2]); if (Op.isReg()) { unsigned Reg = getXRegFromWReg(Op.getReg()); - Operands[2] = AArch64Operand::CreateReg(Reg, false, Op.getStartLoc(), - Op.getEndLoc(), getContext()); + Operands[2] = AArch64Operand::CreateReg(Reg, RegKind::Scalar, + Op.getStartLoc(), Op.getEndLoc(), + getContext()); } } // FIXME: Likewise for sxt[bh] with a Xd dst operand @@ -3634,7 +3927,8 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, AArch64Operand &Op = static_cast<AArch64Operand &>(*Operands[2]); if (Op.isReg()) { unsigned Reg = getXRegFromWReg(Op.getReg()); - Operands[2] = AArch64Operand::CreateReg(Reg, false, Op.getStartLoc(), + Operands[2] = AArch64Operand::CreateReg(Reg, RegKind::Scalar, + Op.getStartLoc(), Op.getEndLoc(), getContext()); } } @@ -3650,7 +3944,8 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, AArch64Operand &Op = static_cast<AArch64Operand &>(*Operands[1]); if (Op.isReg()) { unsigned Reg = getWRegFromXReg(Op.getReg()); - Operands[1] = AArch64Operand::CreateReg(Reg, false, Op.getStartLoc(), + Operands[1] = AArch64Operand::CreateReg(Reg, RegKind::Scalar, + Op.getStartLoc(), Op.getEndLoc(), getContext()); } } @@ -3764,6 +4059,7 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, case Match_InvalidMemoryIndexed8SImm7: case Match_InvalidMemoryIndexed16SImm7: case Match_InvalidMemoryIndexedSImm9: + case Match_InvalidMemoryIndexedSImm10: case Match_InvalidImm0_1: case Match_InvalidImm0_7: case Match_InvalidImm0_15: @@ -3782,6 +4078,13 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, case Match_InvalidIndexS: case Match_InvalidIndexD: case Match_InvalidLabel: + case Match_InvalidComplexRotationEven: + case Match_InvalidComplexRotationOdd: + case Match_InvalidSVEPredicateAnyReg: + case Match_InvalidSVEPredicateBReg: + case Match_InvalidSVEPredicateHReg: + case Match_InvalidSVEPredicateSReg: + case Match_InvalidSVEPredicateDReg: case Match_MSR: case Match_MRS: { if (ErrorInfo >= Operands.size()) @@ -3862,8 +4165,8 @@ bool AArch64AsmParser::parseDirectiveArch(SMLoc L) { std::tie(Arch, ExtensionString) = getParser().parseStringToEndOfStatement().trim().split('+'); - unsigned ID = AArch64::parseArch(Arch); - if (ID == static_cast<unsigned>(AArch64::ArchKind::AK_INVALID)) + AArch64::ArchKind ID = AArch64::parseArch(Arch); + if (ID == AArch64::ArchKind::INVALID) return Error(ArchLoc, "unknown arch name"); if (parseToken(AsmToken::EndOfStatement)) @@ -4107,18 +4410,46 @@ bool AArch64AsmParser::parseDirectiveReq(StringRef Name, SMLoc L) { MCAsmParser &Parser = getParser(); Parser.Lex(); // Eat the '.req' token. 
SMLoc SRegLoc = getLoc(); - unsigned RegNum = tryParseRegister(); - bool IsVector = false; + int RegNum = tryParseRegister(); + RegKind RegisterKind = RegKind::Scalar; - if (RegNum == static_cast<unsigned>(-1)) { + if (RegNum == -1) { StringRef Kind; + RegisterKind = RegKind::NeonVector; RegNum = tryMatchVectorRegister(Kind, false); if (!Kind.empty()) return Error(SRegLoc, "vector register without type specifier expected"); - IsVector = true; } - if (RegNum == static_cast<unsigned>(-1)) + if (RegNum == -1) { + StringRef Kind; + RegisterKind = RegKind::SVEDataVector; + OperandMatchResultTy Res = + tryParseSVERegister(RegNum, Kind, RegKind::SVEDataVector); + + if (Res == MatchOperand_ParseFail) + return true; + + if (Res == MatchOperand_Success && !Kind.empty()) + return Error(SRegLoc, + "sve vector register without type specifier expected"); + } + + if (RegNum == -1) { + StringRef Kind; + RegisterKind = RegKind::SVEPredicateVector; + OperandMatchResultTy Res = + tryParseSVERegister(RegNum, Kind, RegKind::SVEPredicateVector); + + if (Res == MatchOperand_ParseFail) + return true; + + if (Res == MatchOperand_Success && !Kind.empty()) + return Error(SRegLoc, + "sve predicate register without type specifier expected"); + } + + if (RegNum == -1) return Error(SRegLoc, "register name or alias expected"); // Shouldn't be anything else. @@ -4126,7 +4457,7 @@ bool AArch64AsmParser::parseDirectiveReq(StringRef Name, SMLoc L) { "unexpected input in .req directive")) return true; - auto pair = std::make_pair(IsVector, RegNum); + auto pair = std::make_pair(RegisterKind, (unsigned) RegNum); if (RegisterReqs.insert(std::make_pair(Name, pair)).first->second != pair) Warning(L, "ignoring redefinition of register alias '" + Name + "'"); @@ -4206,6 +4537,7 @@ extern "C" void LLVMInitializeAArch64AsmParser() { #define GET_REGISTER_MATCHER #define GET_SUBTARGET_FEATURE_NAME #define GET_MATCHER_IMPLEMENTATION +#define GET_MNEMONIC_SPELL_CHECKER #include "AArch64GenAsmMatcher.inc" // Define this matcher function after the auto-generated include so we @@ -4337,8 +4669,43 @@ AArch64AsmParser::tryParseGPRSeqPair(OperandVector &Operands) { &AArch64MCRegisterClasses[AArch64::WSeqPairsClassRegClassID]); } - Operands.push_back(AArch64Operand::CreateReg(Pair, false, S, getLoc(), - getContext())); + Operands.push_back(AArch64Operand::CreateReg(Pair, RegKind::Scalar, S, + getLoc(), getContext())); + + return MatchOperand_Success; +} + +template <bool ParseSuffix> +OperandMatchResultTy +AArch64AsmParser::tryParseSVEDataVector(OperandVector &Operands) { + const SMLoc S = getLoc(); + // Check for a SVE vector register specifier first. 
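// (Illustration, not from the patch: SVE data vectors are z0-z31 with an
// optional element-size suffix, e.g. "z3", "z3.b", "z3.h", "z3.s", "z3.d",
// "z3.q". When the ParseSuffix template argument is true the suffix is
// required; the width is resolved below exactly as for predicate registers.)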
+ int RegNum = -1; + StringRef Kind; + + OperandMatchResultTy Res = + tryParseSVERegister(RegNum, Kind, RegKind::SVEDataVector); + + if (Res != MatchOperand_Success) + return Res; + + if (ParseSuffix && Kind.empty()) + return MatchOperand_NoMatch; + + unsigned ElementWidth = StringSwitch<unsigned>(Kind.lower()) + .Case("", -1) + .Case(".b", 8) + .Case(".h", 16) + .Case(".s", 32) + .Case(".d", 64) + .Case(".q", 128) + .Default(0); + if (!ElementWidth) + return MatchOperand_NoMatch; + + Operands.push_back( + AArch64Operand::CreateReg(RegNum, RegKind::SVEDataVector, ElementWidth, + S, S, getContext())); return MatchOperand_Success; } diff --git a/contrib/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp b/contrib/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp index 7870dce5c9c0..ae278caeda69 100644 --- a/contrib/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp +++ b/contrib/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp @@ -1,4 +1,4 @@ -//===- AArch64Disassembler.cpp - Disassembler for AArch64 -------*- C++ -*-===// +//===- AArch64Disassembler.cpp - Disassembler for AArch64 -----------------===// // // The LLVM Compiler Infrastructure // @@ -14,160 +14,168 @@ #include "AArch64ExternalSymbolizer.h" #include "AArch64Subtarget.h" #include "MCTargetDesc/AArch64AddressingModes.h" +#include "MCTargetDesc/AArch64MCTargetDesc.h" #include "Utils/AArch64BaseInfo.h" +#include "llvm-c/Disassembler.h" +#include "llvm/MC/MCDisassembler/MCRelocationInfo.h" #include "llvm/MC/MCFixedLenDisassembler.h" #include "llvm/MC/MCInst.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/TargetRegistry.h" +#include <algorithm> +#include <memory> using namespace llvm; #define DEBUG_TYPE "aarch64-disassembler" // Pull DecodeStatus and its enum values into the global namespace. -typedef llvm::MCDisassembler::DecodeStatus DecodeStatus; +using DecodeStatus = MCDisassembler::DecodeStatus; // Forward declare these because the autogenerated code will reference them. // Definitions are further down. 
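// Rough sketch, not from the patch: each decoder declared below follows the
// same general shape; a register-class decoder, for instance, roughly does
//
//   static DecodeStatus DecodeZPRRegisterClass(MCInst &Inst, unsigned RegNo,
//                                              uint64_t Addr,
//                                              const void *Decoder) {
//     if (RegNo > 31)
//       return Fail;                        // only z0-z31 exist
//     Inst.addOperand(MCOperand::createReg(/* Z register for RegNo */));
//     return Success;
//   }
//
// with the register lookup elided here. The new SVE decoders
// (DecodeZPRRegisterClass, DecodePPRRegisterClass) and the DecodeSImm<Bits>
// helper, which presumably sign-extends a Bits-wide immediate field, extend
// this existing set for the SVE operands added elsewhere in the merge.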
-static DecodeStatus DecodeFPR128RegisterClass(llvm::MCInst &Inst, +static DecodeStatus DecodeFPR128RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder); -static DecodeStatus DecodeFPR128_loRegisterClass(llvm::MCInst &Inst, +static DecodeStatus DecodeFPR128_loRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder); -static DecodeStatus DecodeFPR64RegisterClass(llvm::MCInst &Inst, unsigned RegNo, +static DecodeStatus DecodeFPR64RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder); -static DecodeStatus DecodeFPR32RegisterClass(llvm::MCInst &Inst, unsigned RegNo, +static DecodeStatus DecodeFPR32RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder); -static DecodeStatus DecodeFPR16RegisterClass(llvm::MCInst &Inst, unsigned RegNo, +static DecodeStatus DecodeFPR16RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder); -static DecodeStatus DecodeFPR8RegisterClass(llvm::MCInst &Inst, unsigned RegNo, +static DecodeStatus DecodeFPR8RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder); -static DecodeStatus DecodeGPR64RegisterClass(llvm::MCInst &Inst, unsigned RegNo, +static DecodeStatus DecodeGPR64RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder); -static DecodeStatus DecodeGPR64spRegisterClass(llvm::MCInst &Inst, +static DecodeStatus DecodeGPR64spRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder); -static DecodeStatus DecodeGPR32RegisterClass(llvm::MCInst &Inst, unsigned RegNo, +static DecodeStatus DecodeGPR32RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder); -static DecodeStatus DecodeGPR32spRegisterClass(llvm::MCInst &Inst, +static DecodeStatus DecodeGPR32spRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder); -static DecodeStatus DecodeQQRegisterClass(llvm::MCInst &Inst, unsigned RegNo, +static DecodeStatus DecodeQQRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder); -static DecodeStatus DecodeQQQRegisterClass(llvm::MCInst &Inst, unsigned RegNo, +static DecodeStatus DecodeQQQRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder); -static DecodeStatus DecodeQQQQRegisterClass(llvm::MCInst &Inst, unsigned RegNo, +static DecodeStatus DecodeQQQQRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder); -static DecodeStatus DecodeDDRegisterClass(llvm::MCInst &Inst, unsigned RegNo, +static DecodeStatus DecodeDDRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder); -static DecodeStatus DecodeDDDRegisterClass(llvm::MCInst &Inst, unsigned RegNo, +static DecodeStatus DecodeDDDRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder); -static DecodeStatus DecodeDDDDRegisterClass(llvm::MCInst &Inst, unsigned RegNo, +static DecodeStatus DecodeDDDDRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder); +static DecodeStatus DecodeZPRRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decode); +static DecodeStatus DecodePPRRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decode); -static DecodeStatus DecodeFixedPointScaleImm32(llvm::MCInst &Inst, unsigned Imm, +static DecodeStatus DecodeFixedPointScaleImm32(MCInst &Inst, unsigned Imm, uint64_t Address, const void *Decoder); -static DecodeStatus 
DecodeFixedPointScaleImm64(llvm::MCInst &Inst, unsigned Imm, +static DecodeStatus DecodeFixedPointScaleImm64(MCInst &Inst, unsigned Imm, uint64_t Address, const void *Decoder); -static DecodeStatus DecodePCRelLabel19(llvm::MCInst &Inst, unsigned Imm, +static DecodeStatus DecodePCRelLabel19(MCInst &Inst, unsigned Imm, uint64_t Address, const void *Decoder); -static DecodeStatus DecodeMemExtend(llvm::MCInst &Inst, unsigned Imm, +static DecodeStatus DecodeMemExtend(MCInst &Inst, unsigned Imm, uint64_t Address, const void *Decoder); -static DecodeStatus DecodeMRSSystemRegister(llvm::MCInst &Inst, unsigned Imm, +static DecodeStatus DecodeMRSSystemRegister(MCInst &Inst, unsigned Imm, uint64_t Address, const void *Decoder); -static DecodeStatus DecodeMSRSystemRegister(llvm::MCInst &Inst, unsigned Imm, +static DecodeStatus DecodeMSRSystemRegister(MCInst &Inst, unsigned Imm, uint64_t Address, const void *Decoder); -static DecodeStatus DecodeThreeAddrSRegInstruction(llvm::MCInst &Inst, - uint32_t insn, +static DecodeStatus DecodeThreeAddrSRegInstruction(MCInst &Inst, uint32_t insn, uint64_t Address, const void *Decoder); -static DecodeStatus DecodeMoveImmInstruction(llvm::MCInst &Inst, uint32_t insn, +static DecodeStatus DecodeMoveImmInstruction(MCInst &Inst, uint32_t insn, uint64_t Address, const void *Decoder); -static DecodeStatus DecodeUnsignedLdStInstruction(llvm::MCInst &Inst, - uint32_t insn, +static DecodeStatus DecodeUnsignedLdStInstruction(MCInst &Inst, uint32_t insn, uint64_t Address, const void *Decoder); -static DecodeStatus DecodeSignedLdStInstruction(llvm::MCInst &Inst, - uint32_t insn, uint64_t Address, +static DecodeStatus DecodeSignedLdStInstruction(MCInst &Inst, uint32_t insn, + uint64_t Address, const void *Decoder); -static DecodeStatus DecodeExclusiveLdStInstruction(llvm::MCInst &Inst, - uint32_t insn, +static DecodeStatus DecodeExclusiveLdStInstruction(MCInst &Inst, uint32_t insn, uint64_t Address, const void *Decoder); -static DecodeStatus DecodePairLdStInstruction(llvm::MCInst &Inst, uint32_t insn, +static DecodeStatus DecodePairLdStInstruction(MCInst &Inst, uint32_t insn, uint64_t Address, const void *Decoder); -static DecodeStatus DecodeAddSubERegInstruction(llvm::MCInst &Inst, - uint32_t insn, uint64_t Address, +static DecodeStatus DecodeAddSubERegInstruction(MCInst &Inst, uint32_t insn, + uint64_t Address, const void *Decoder); -static DecodeStatus DecodeLogicalImmInstruction(llvm::MCInst &Inst, - uint32_t insn, uint64_t Address, +static DecodeStatus DecodeLogicalImmInstruction(MCInst &Inst, uint32_t insn, + uint64_t Address, const void *Decoder); -static DecodeStatus DecodeModImmInstruction(llvm::MCInst &Inst, uint32_t insn, +static DecodeStatus DecodeModImmInstruction(MCInst &Inst, uint32_t insn, uint64_t Address, const void *Decoder); -static DecodeStatus DecodeModImmTiedInstruction(llvm::MCInst &Inst, - uint32_t insn, uint64_t Address, +static DecodeStatus DecodeModImmTiedInstruction(MCInst &Inst, uint32_t insn, + uint64_t Address, const void *Decoder); -static DecodeStatus DecodeAdrInstruction(llvm::MCInst &Inst, uint32_t insn, +static DecodeStatus DecodeAdrInstruction(MCInst &Inst, uint32_t insn, uint64_t Address, const void *Decoder); -static DecodeStatus DecodeBaseAddSubImm(llvm::MCInst &Inst, uint32_t insn, +static DecodeStatus DecodeBaseAddSubImm(MCInst &Inst, uint32_t insn, uint64_t Address, const void *Decoder); -static DecodeStatus DecodeUnconditionalBranch(llvm::MCInst &Inst, uint32_t insn, +static DecodeStatus DecodeUnconditionalBranch(MCInst &Inst, 
uint32_t insn, uint64_t Address, const void *Decoder); -static DecodeStatus DecodeSystemPStateInstruction(llvm::MCInst &Inst, - uint32_t insn, +static DecodeStatus DecodeSystemPStateInstruction(MCInst &Inst, uint32_t insn, uint64_t Address, const void *Decoder); -static DecodeStatus DecodeTestAndBranch(llvm::MCInst &Inst, uint32_t insn, +static DecodeStatus DecodeTestAndBranch(MCInst &Inst, uint32_t insn, uint64_t Address, const void *Decoder); -static DecodeStatus DecodeFMOVLaneInstruction(llvm::MCInst &Inst, unsigned Insn, +static DecodeStatus DecodeFMOVLaneInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder); -static DecodeStatus DecodeVecShiftR64Imm(llvm::MCInst &Inst, unsigned Imm, +static DecodeStatus DecodeVecShiftR64Imm(MCInst &Inst, unsigned Imm, uint64_t Addr, const void *Decoder); -static DecodeStatus DecodeVecShiftR64ImmNarrow(llvm::MCInst &Inst, unsigned Imm, +static DecodeStatus DecodeVecShiftR64ImmNarrow(MCInst &Inst, unsigned Imm, uint64_t Addr, const void *Decoder); -static DecodeStatus DecodeVecShiftR32Imm(llvm::MCInst &Inst, unsigned Imm, +static DecodeStatus DecodeVecShiftR32Imm(MCInst &Inst, unsigned Imm, uint64_t Addr, const void *Decoder); -static DecodeStatus DecodeVecShiftR32ImmNarrow(llvm::MCInst &Inst, unsigned Imm, +static DecodeStatus DecodeVecShiftR32ImmNarrow(MCInst &Inst, unsigned Imm, uint64_t Addr, const void *Decoder); -static DecodeStatus DecodeVecShiftR16Imm(llvm::MCInst &Inst, unsigned Imm, +static DecodeStatus DecodeVecShiftR16Imm(MCInst &Inst, unsigned Imm, uint64_t Addr, const void *Decoder); -static DecodeStatus DecodeVecShiftR16ImmNarrow(llvm::MCInst &Inst, unsigned Imm, +static DecodeStatus DecodeVecShiftR16ImmNarrow(MCInst &Inst, unsigned Imm, uint64_t Addr, const void *Decoder); -static DecodeStatus DecodeVecShiftR8Imm(llvm::MCInst &Inst, unsigned Imm, +static DecodeStatus DecodeVecShiftR8Imm(MCInst &Inst, unsigned Imm, uint64_t Addr, const void *Decoder); -static DecodeStatus DecodeVecShiftL64Imm(llvm::MCInst &Inst, unsigned Imm, +static DecodeStatus DecodeVecShiftL64Imm(MCInst &Inst, unsigned Imm, uint64_t Addr, const void *Decoder); -static DecodeStatus DecodeVecShiftL32Imm(llvm::MCInst &Inst, unsigned Imm, +static DecodeStatus DecodeVecShiftL32Imm(MCInst &Inst, unsigned Imm, uint64_t Addr, const void *Decoder); -static DecodeStatus DecodeVecShiftL16Imm(llvm::MCInst &Inst, unsigned Imm, +static DecodeStatus DecodeVecShiftL16Imm(MCInst &Inst, unsigned Imm, uint64_t Addr, const void *Decoder); -static DecodeStatus DecodeVecShiftL8Imm(llvm::MCInst &Inst, unsigned Imm, +static DecodeStatus DecodeVecShiftL8Imm(MCInst &Inst, unsigned Imm, uint64_t Addr, const void *Decoder); static DecodeStatus DecodeWSeqPairsClassRegisterClass(MCInst &Inst, unsigned RegNo, @@ -177,6 +185,9 @@ static DecodeStatus DecodeXSeqPairsClassRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Addr, const void *Decoder); +template<int Bits> +static DecodeStatus DecodeSImm(llvm::MCInst &Inst, uint64_t Imm, + uint64_t Address, const void *Decoder); static bool Check(DecodeStatus &Out, DecodeStatus In) { switch (In) { @@ -196,9 +207,9 @@ static bool Check(DecodeStatus &Out, DecodeStatus In) { #include "AArch64GenDisassemblerTables.inc" #include "AArch64GenInstrInfo.inc" -#define Success llvm::MCDisassembler::Success -#define Fail llvm::MCDisassembler::Fail -#define SoftFail llvm::MCDisassembler::SoftFail +#define Success MCDisassembler::Success +#define Fail MCDisassembler::Fail +#define SoftFail MCDisassembler::SoftFail static MCDisassembler 
*createAArch64Disassembler(const Target &T, const MCSubtargetInfo &STI, @@ -232,8 +243,8 @@ createAArch64ExternalSymbolizer(const Triple &TT, LLVMOpInfoCallback GetOpInfo, LLVMSymbolLookupCallback SymbolLookUp, void *DisInfo, MCContext *Ctx, std::unique_ptr<MCRelocationInfo> &&RelInfo) { - return new llvm::AArch64ExternalSymbolizer(*Ctx, move(RelInfo), GetOpInfo, - SymbolLookUp, DisInfo); + return new AArch64ExternalSymbolizer(*Ctx, std::move(RelInfo), GetOpInfo, + SymbolLookUp, DisInfo); } extern "C" void LLVMInitializeAArch64Disassembler() { @@ -431,6 +442,44 @@ static DecodeStatus DecodeGPR32spRegisterClass(MCInst &Inst, unsigned RegNo, Inst.addOperand(MCOperand::createReg(Register)); return Success; } +static const unsigned ZPRDecoderTable[] = { + AArch64::Z0, AArch64::Z1, AArch64::Z2, AArch64::Z3, + AArch64::Z4, AArch64::Z5, AArch64::Z6, AArch64::Z7, + AArch64::Z8, AArch64::Z9, AArch64::Z10, AArch64::Z11, + AArch64::Z12, AArch64::Z13, AArch64::Z14, AArch64::Z15, + AArch64::Z16, AArch64::Z17, AArch64::Z18, AArch64::Z19, + AArch64::Z20, AArch64::Z21, AArch64::Z22, AArch64::Z23, + AArch64::Z24, AArch64::Z25, AArch64::Z26, AArch64::Z27, + AArch64::Z28, AArch64::Z29, AArch64::Z30, AArch64::Z31 +}; + +static DecodeStatus DecodeZPRRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void* Decoder) { + if (RegNo > 31) + return Fail; + + unsigned Register = ZPRDecoderTable[RegNo]; + Inst.addOperand(MCOperand::createReg(Register)); + return Success; +} + +static const unsigned PPRDecoderTable[] = { + AArch64::P0, AArch64::P1, AArch64::P2, AArch64::P3, + AArch64::P4, AArch64::P5, AArch64::P6, AArch64::P7, + AArch64::P8, AArch64::P9, AArch64::P10, AArch64::P11, + AArch64::P12, AArch64::P13, AArch64::P14, AArch64::P15 +}; + +static DecodeStatus DecodePPRRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Addr, const void *Decoder) { + if (RegNo > 15) + return Fail; + + unsigned Register = PPRDecoderTable[RegNo]; + Inst.addOperand(MCOperand::createReg(Register)); + return Success; +} static const unsigned VectorDecoderTable[] = { AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3, AArch64::Q4, @@ -587,7 +636,7 @@ static DecodeStatus DecodeDDDDRegisterClass(MCInst &Inst, unsigned RegNo, return Success; } -static DecodeStatus DecodeFixedPointScaleImm32(llvm::MCInst &Inst, unsigned Imm, +static DecodeStatus DecodeFixedPointScaleImm32(MCInst &Inst, unsigned Imm, uint64_t Addr, const void *Decoder) { // scale{5} is asserted as 1 in tblgen. 
@@ -596,14 +645,14 @@ static DecodeStatus DecodeFixedPointScaleImm32(llvm::MCInst &Inst, unsigned Imm, return Success; } -static DecodeStatus DecodeFixedPointScaleImm64(llvm::MCInst &Inst, unsigned Imm, +static DecodeStatus DecodeFixedPointScaleImm64(MCInst &Inst, unsigned Imm, uint64_t Addr, const void *Decoder) { Inst.addOperand(MCOperand::createImm(64 - Imm)); return Success; } -static DecodeStatus DecodePCRelLabel19(llvm::MCInst &Inst, unsigned Imm, +static DecodeStatus DecodePCRelLabel19(MCInst &Inst, unsigned Imm, uint64_t Addr, const void *Decoder) { int64_t ImmVal = Imm; const AArch64Disassembler *Dis = @@ -619,14 +668,14 @@ static DecodeStatus DecodePCRelLabel19(llvm::MCInst &Inst, unsigned Imm, return Success; } -static DecodeStatus DecodeMemExtend(llvm::MCInst &Inst, unsigned Imm, +static DecodeStatus DecodeMemExtend(MCInst &Inst, unsigned Imm, uint64_t Address, const void *Decoder) { Inst.addOperand(MCOperand::createImm((Imm >> 1) & 1)); Inst.addOperand(MCOperand::createImm(Imm & 1)); return Success; } -static DecodeStatus DecodeMRSSystemRegister(llvm::MCInst &Inst, unsigned Imm, +static DecodeStatus DecodeMRSSystemRegister(MCInst &Inst, unsigned Imm, uint64_t Address, const void *Decoder) { Inst.addOperand(MCOperand::createImm(Imm)); @@ -636,7 +685,7 @@ static DecodeStatus DecodeMRSSystemRegister(llvm::MCInst &Inst, unsigned Imm, return Success; } -static DecodeStatus DecodeMSRSystemRegister(llvm::MCInst &Inst, unsigned Imm, +static DecodeStatus DecodeMSRSystemRegister(MCInst &Inst, unsigned Imm, uint64_t Address, const void *Decoder) { Inst.addOperand(MCOperand::createImm(Imm)); @@ -644,7 +693,7 @@ static DecodeStatus DecodeMSRSystemRegister(llvm::MCInst &Inst, unsigned Imm, return Success; } -static DecodeStatus DecodeFMOVLaneInstruction(llvm::MCInst &Inst, unsigned Insn, +static DecodeStatus DecodeFMOVLaneInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder) { // This decoder exists to add the dummy Lane operand to the MCInst, which must @@ -667,78 +716,78 @@ static DecodeStatus DecodeFMOVLaneInstruction(llvm::MCInst &Inst, unsigned Insn, return Success; } -static DecodeStatus DecodeVecShiftRImm(llvm::MCInst &Inst, unsigned Imm, +static DecodeStatus DecodeVecShiftRImm(MCInst &Inst, unsigned Imm, unsigned Add) { Inst.addOperand(MCOperand::createImm(Add - Imm)); return Success; } -static DecodeStatus DecodeVecShiftLImm(llvm::MCInst &Inst, unsigned Imm, +static DecodeStatus DecodeVecShiftLImm(MCInst &Inst, unsigned Imm, unsigned Add) { Inst.addOperand(MCOperand::createImm((Imm + Add) & (Add - 1))); return Success; } -static DecodeStatus DecodeVecShiftR64Imm(llvm::MCInst &Inst, unsigned Imm, +static DecodeStatus DecodeVecShiftR64Imm(MCInst &Inst, unsigned Imm, uint64_t Addr, const void *Decoder) { return DecodeVecShiftRImm(Inst, Imm, 64); } -static DecodeStatus DecodeVecShiftR64ImmNarrow(llvm::MCInst &Inst, unsigned Imm, +static DecodeStatus DecodeVecShiftR64ImmNarrow(MCInst &Inst, unsigned Imm, uint64_t Addr, const void *Decoder) { return DecodeVecShiftRImm(Inst, Imm | 0x20, 64); } -static DecodeStatus DecodeVecShiftR32Imm(llvm::MCInst &Inst, unsigned Imm, +static DecodeStatus DecodeVecShiftR32Imm(MCInst &Inst, unsigned Imm, uint64_t Addr, const void *Decoder) { return DecodeVecShiftRImm(Inst, Imm, 32); } -static DecodeStatus DecodeVecShiftR32ImmNarrow(llvm::MCInst &Inst, unsigned Imm, +static DecodeStatus DecodeVecShiftR32ImmNarrow(MCInst &Inst, unsigned Imm, uint64_t Addr, const void *Decoder) { return DecodeVecShiftRImm(Inst, Imm | 0x10, 32); } 
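The DecodeVecShiftRImm/DecodeVecShiftLImm helpers above recover the shift amount from the raw immediate field: right shifts are stored as (lane width minus shift), left shifts wrap modulo the lane width, and the *Narrow variants OR in the lane-size bit (0x20 for the 64-bit form) before the same subtraction. A minimal sketch of just that arithmetic, with the MCInst bookkeeping left out and the sample values made up:

#include <cassert>

// Mirrors the arithmetic of DecodeVecShiftRImm / DecodeVecShiftLImm only.
static unsigned decodeRightShift(unsigned Imm, unsigned ElemBits) {
  return ElemBits - Imm;                    // e.g. 64-bit lanes, Imm=60 -> 4
}
static unsigned decodeLeftShift(unsigned Imm, unsigned ElemBits) {
  return (Imm + ElemBits) & (ElemBits - 1); // wraps modulo the lane width
}

int main() {
  // Plain 64-bit right shift: a field value of 60 decodes to a shift of 4.
  assert(decodeRightShift(60, 64) == 4);
  // Narrow 64-bit right shift: the field is OR'ed with 0x20 first, so a
  // field value of 28 also decodes to a shift of 4.
  assert(decodeRightShift(28 | 0x20, 64) == 4);
  // Left shifts: (Imm + ElemBits) & (ElemBits - 1).
  assert(decodeLeftShift(3, 8) == 3);
  return 0;
}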
-static DecodeStatus DecodeVecShiftR16Imm(llvm::MCInst &Inst, unsigned Imm, +static DecodeStatus DecodeVecShiftR16Imm(MCInst &Inst, unsigned Imm, uint64_t Addr, const void *Decoder) { return DecodeVecShiftRImm(Inst, Imm, 16); } -static DecodeStatus DecodeVecShiftR16ImmNarrow(llvm::MCInst &Inst, unsigned Imm, +static DecodeStatus DecodeVecShiftR16ImmNarrow(MCInst &Inst, unsigned Imm, uint64_t Addr, const void *Decoder) { return DecodeVecShiftRImm(Inst, Imm | 0x8, 16); } -static DecodeStatus DecodeVecShiftR8Imm(llvm::MCInst &Inst, unsigned Imm, +static DecodeStatus DecodeVecShiftR8Imm(MCInst &Inst, unsigned Imm, uint64_t Addr, const void *Decoder) { return DecodeVecShiftRImm(Inst, Imm, 8); } -static DecodeStatus DecodeVecShiftL64Imm(llvm::MCInst &Inst, unsigned Imm, +static DecodeStatus DecodeVecShiftL64Imm(MCInst &Inst, unsigned Imm, uint64_t Addr, const void *Decoder) { return DecodeVecShiftLImm(Inst, Imm, 64); } -static DecodeStatus DecodeVecShiftL32Imm(llvm::MCInst &Inst, unsigned Imm, +static DecodeStatus DecodeVecShiftL32Imm(MCInst &Inst, unsigned Imm, uint64_t Addr, const void *Decoder) { return DecodeVecShiftLImm(Inst, Imm, 32); } -static DecodeStatus DecodeVecShiftL16Imm(llvm::MCInst &Inst, unsigned Imm, +static DecodeStatus DecodeVecShiftL16Imm(MCInst &Inst, unsigned Imm, uint64_t Addr, const void *Decoder) { return DecodeVecShiftLImm(Inst, Imm, 16); } -static DecodeStatus DecodeVecShiftL8Imm(llvm::MCInst &Inst, unsigned Imm, +static DecodeStatus DecodeVecShiftL8Imm(MCInst &Inst, unsigned Imm, uint64_t Addr, const void *Decoder) { return DecodeVecShiftLImm(Inst, Imm, 8); } -static DecodeStatus DecodeThreeAddrSRegInstruction(llvm::MCInst &Inst, - uint32_t insn, uint64_t Addr, +static DecodeStatus DecodeThreeAddrSRegInstruction(MCInst &Inst, uint32_t insn, + uint64_t Addr, const void *Decoder) { unsigned Rd = fieldFromInstruction(insn, 0, 5); unsigned Rn = fieldFromInstruction(insn, 5, 5); @@ -799,7 +848,7 @@ static DecodeStatus DecodeThreeAddrSRegInstruction(llvm::MCInst &Inst, return Success; } -static DecodeStatus DecodeMoveImmInstruction(llvm::MCInst &Inst, uint32_t insn, +static DecodeStatus DecodeMoveImmInstruction(MCInst &Inst, uint32_t insn, uint64_t Addr, const void *Decoder) { unsigned Rd = fieldFromInstruction(insn, 0, 5); @@ -832,8 +881,8 @@ static DecodeStatus DecodeMoveImmInstruction(llvm::MCInst &Inst, uint32_t insn, return Success; } -static DecodeStatus DecodeUnsignedLdStInstruction(llvm::MCInst &Inst, - uint32_t insn, uint64_t Addr, +static DecodeStatus DecodeUnsignedLdStInstruction(MCInst &Inst, uint32_t insn, + uint64_t Addr, const void *Decoder) { unsigned Rt = fieldFromInstruction(insn, 0, 5); unsigned Rn = fieldFromInstruction(insn, 5, 5); @@ -893,8 +942,8 @@ static DecodeStatus DecodeUnsignedLdStInstruction(llvm::MCInst &Inst, return Success; } -static DecodeStatus DecodeSignedLdStInstruction(llvm::MCInst &Inst, - uint32_t insn, uint64_t Addr, +static DecodeStatus DecodeSignedLdStInstruction(MCInst &Inst, uint32_t insn, + uint64_t Addr, const void *Decoder) { unsigned Rt = fieldFromInstruction(insn, 0, 5); unsigned Rn = fieldFromInstruction(insn, 5, 5); @@ -1078,8 +1127,8 @@ static DecodeStatus DecodeSignedLdStInstruction(llvm::MCInst &Inst, return Success; } -static DecodeStatus DecodeExclusiveLdStInstruction(llvm::MCInst &Inst, - uint32_t insn, uint64_t Addr, +static DecodeStatus DecodeExclusiveLdStInstruction(MCInst &Inst, uint32_t insn, + uint64_t Addr, const void *Decoder) { unsigned Rt = fieldFromInstruction(insn, 0, 5); unsigned Rn = 
fieldFromInstruction(insn, 5, 5); @@ -1161,7 +1210,7 @@ static DecodeStatus DecodeExclusiveLdStInstruction(llvm::MCInst &Inst, return Success; } -static DecodeStatus DecodePairLdStInstruction(llvm::MCInst &Inst, uint32_t insn, +static DecodeStatus DecodePairLdStInstruction(MCInst &Inst, uint32_t insn, uint64_t Addr, const void *Decoder) { unsigned Rt = fieldFromInstruction(insn, 0, 5); @@ -1290,8 +1339,8 @@ static DecodeStatus DecodePairLdStInstruction(llvm::MCInst &Inst, uint32_t insn, return Success; } -static DecodeStatus DecodeAddSubERegInstruction(llvm::MCInst &Inst, - uint32_t insn, uint64_t Addr, +static DecodeStatus DecodeAddSubERegInstruction(MCInst &Inst, uint32_t insn, + uint64_t Addr, const void *Decoder) { unsigned Rd = fieldFromInstruction(insn, 0, 5); unsigned Rn = fieldFromInstruction(insn, 5, 5); @@ -1347,8 +1396,8 @@ static DecodeStatus DecodeAddSubERegInstruction(llvm::MCInst &Inst, return Success; } -static DecodeStatus DecodeLogicalImmInstruction(llvm::MCInst &Inst, - uint32_t insn, uint64_t Addr, +static DecodeStatus DecodeLogicalImmInstruction(MCInst &Inst, uint32_t insn, + uint64_t Addr, const void *Decoder) { unsigned Rd = fieldFromInstruction(insn, 0, 5); unsigned Rn = fieldFromInstruction(insn, 5, 5); @@ -1378,7 +1427,7 @@ static DecodeStatus DecodeLogicalImmInstruction(llvm::MCInst &Inst, return Success; } -static DecodeStatus DecodeModImmInstruction(llvm::MCInst &Inst, uint32_t insn, +static DecodeStatus DecodeModImmInstruction(MCInst &Inst, uint32_t insn, uint64_t Addr, const void *Decoder) { unsigned Rd = fieldFromInstruction(insn, 0, 5); @@ -1417,8 +1466,8 @@ static DecodeStatus DecodeModImmInstruction(llvm::MCInst &Inst, uint32_t insn, return Success; } -static DecodeStatus DecodeModImmTiedInstruction(llvm::MCInst &Inst, - uint32_t insn, uint64_t Addr, +static DecodeStatus DecodeModImmTiedInstruction(MCInst &Inst, uint32_t insn, + uint64_t Addr, const void *Decoder) { unsigned Rd = fieldFromInstruction(insn, 0, 5); unsigned cmode = fieldFromInstruction(insn, 12, 4); @@ -1435,7 +1484,7 @@ static DecodeStatus DecodeModImmTiedInstruction(llvm::MCInst &Inst, return Success; } -static DecodeStatus DecodeAdrInstruction(llvm::MCInst &Inst, uint32_t insn, +static DecodeStatus DecodeAdrInstruction(MCInst &Inst, uint32_t insn, uint64_t Addr, const void *Decoder) { unsigned Rd = fieldFromInstruction(insn, 0, 5); int64_t imm = fieldFromInstruction(insn, 5, 19) << 2; @@ -1454,7 +1503,7 @@ static DecodeStatus DecodeAdrInstruction(llvm::MCInst &Inst, uint32_t insn, return Success; } -static DecodeStatus DecodeBaseAddSubImm(llvm::MCInst &Inst, uint32_t insn, +static DecodeStatus DecodeBaseAddSubImm(MCInst &Inst, uint32_t insn, uint64_t Addr, const void *Decoder) { unsigned Rd = fieldFromInstruction(insn, 0, 5); unsigned Rn = fieldFromInstruction(insn, 5, 5); @@ -1490,7 +1539,7 @@ static DecodeStatus DecodeBaseAddSubImm(llvm::MCInst &Inst, uint32_t insn, return Success; } -static DecodeStatus DecodeUnconditionalBranch(llvm::MCInst &Inst, uint32_t insn, +static DecodeStatus DecodeUnconditionalBranch(MCInst &Inst, uint32_t insn, uint64_t Addr, const void *Decoder) { int64_t imm = fieldFromInstruction(insn, 0, 26); @@ -1507,8 +1556,8 @@ static DecodeStatus DecodeUnconditionalBranch(llvm::MCInst &Inst, uint32_t insn, return Success; } -static DecodeStatus DecodeSystemPStateInstruction(llvm::MCInst &Inst, - uint32_t insn, uint64_t Addr, +static DecodeStatus DecodeSystemPStateInstruction(MCInst &Inst, uint32_t insn, + uint64_t Addr, const void *Decoder) { uint64_t op1 = 
fieldFromInstruction(insn, 16, 3); uint64_t op2 = fieldFromInstruction(insn, 5, 3); @@ -1531,7 +1580,7 @@ static DecodeStatus DecodeSystemPStateInstruction(llvm::MCInst &Inst, return Fail; } -static DecodeStatus DecodeTestAndBranch(llvm::MCInst &Inst, uint32_t insn, +static DecodeStatus DecodeTestAndBranch(MCInst &Inst, uint32_t insn, uint64_t Addr, const void *Decoder) { uint64_t Rt = fieldFromInstruction(insn, 0, 5); uint64_t bit = fieldFromInstruction(insn, 31, 1) << 5; @@ -1586,3 +1635,18 @@ static DecodeStatus DecodeXSeqPairsClassRegisterClass(MCInst &Inst, AArch64::XSeqPairsClassRegClassID, RegNo, Addr, Decoder); } + +template<int Bits> +static DecodeStatus DecodeSImm(llvm::MCInst &Inst, uint64_t Imm, + uint64_t Address, const void *Decoder) { + if (Imm & ~((1LL << Bits) - 1)) + return Fail; + + // Imm is a signed immediate, so sign extend it. + if (Imm & (1 << (Bits - 1))) + Imm |= ~((1LL << Bits) - 1); + + Inst.addOperand(MCOperand::createImm(Imm)); + return Success; +} + diff --git a/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp b/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp index fc89657bffd3..bdf71b095fda 100644 --- a/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp +++ b/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp @@ -689,7 +689,7 @@ void AArch64AppleInstPrinter::printInst(const MCInst *MI, raw_ostream &O, StringRef Annot, const MCSubtargetInfo &STI) { unsigned Opcode = MI->getOpcode(); - StringRef Layout, Mnemonic; + StringRef Layout; bool IsTbx; if (isTblTbxInstruction(MI->getOpcode(), Layout, IsTbx)) { @@ -1331,3 +1331,32 @@ void AArch64InstPrinter::printSIMDType10Operand(const MCInst *MI, unsigned OpNo, uint64_t Val = AArch64_AM::decodeAdvSIMDModImmType10(RawVal); O << format("#%#016llx", Val); } + +template<int64_t Angle, int64_t Remainder> +void AArch64InstPrinter::printComplexRotationOp(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + unsigned Val = MI->getOperand(OpNo).getImm(); + O << "#" << (Val * Angle) + Remainder; +} + +template <char suffix> +void AArch64InstPrinter::printSVERegOp(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + switch (suffix) { + case 0: + case 'b': + case 'h': + case 's': + case 'd': + case 'q': + break; + default: llvm_unreachable("Invalid kind specifier."); + } + + unsigned Reg = MI->getOperand(OpNum).getReg(); + O << getRegisterName(Reg); + if (suffix != 0) + O << '.' << suffix; +}
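The DecodeSImm<Bits> template added in this file rejects fields wider than Bits and then sign-extends the remaining value by OR-ing in the high bits when the sign bit is set. A minimal sketch of the same arithmetic, returning the extended value instead of appending an MCInst operand; the width and sample values are only illustrative.

#include <cstdint>
#include <cstdio>

// Same masking and sign-extension steps as DecodeSImm<Bits>, without the
// MCInst plumbing.
template <int Bits>
static bool decodeSImm(uint64_t Imm, int64_t &Out) {
  if (Imm & ~((1ULL << Bits) - 1))
    return false;                     // field uses more than Bits bits: reject
  if (Imm & (1ULL << (Bits - 1)))
    Imm |= ~((1ULL << Bits) - 1);     // negative: fill the upper bits
  Out = static_cast<int64_t>(Imm);
  return true;
}

int main() {
  int64_t V;
  decodeSImm<9>(0x1F0, V);            // 0x1F0 in a 9-bit field -> -16
  std::printf("%lld\n", (long long)V);
  decodeSImm<9>(0x00F, V);            // positive values pass through -> 15
  std::printf("%lld\n", (long long)V);
}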
\ No newline at end of file diff --git a/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h b/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h index a45258cb97b7..76f20f042cef 100644 --- a/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h +++ b/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h @@ -158,10 +158,16 @@ protected: const MCSubtargetInfo &STI, raw_ostream &O); void printSIMDType10Operand(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O); + template<int64_t Angle, int64_t Remainder> + void printComplexRotationOp(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); template<unsigned size> void printGPRSeqPairsClassOperand(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O); + template <char = 0> + void printSVERegOp(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); }; class AArch64AppleInstPrinter : public AArch64InstPrinter { diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp index 2bd0cbf9f7c6..7b33b4b5b542 100644 --- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp +++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp @@ -30,12 +30,14 @@ namespace { class AArch64AsmBackend : public MCAsmBackend { static const unsigned PCRelFlagVal = MCFixupKindInfo::FKF_IsAlignedDownTo32Bits | MCFixupKindInfo::FKF_IsPCRel; + Triple TheTriple; + public: bool IsLittleEndian; public: - AArch64AsmBackend(const Target &T, bool IsLittleEndian) - : MCAsmBackend(), IsLittleEndian(IsLittleEndian) {} + AArch64AsmBackend(const Target &T, const Triple &TT, bool IsLittleEndian) + : MCAsmBackend(), TheTriple(TT), IsLittleEndian(IsLittleEndian) {} unsigned getNumFixupKinds() const override { return AArch64::NumTargetFixupKinds; @@ -88,6 +90,9 @@ public: unsigned getPointerSize() const { return 8; } unsigned getFixupKindContainereSizeInBytes(unsigned Kind) const; + + bool shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target) override; }; } // end anonymous namespace @@ -140,7 +145,8 @@ static unsigned AdrImmBits(unsigned Value) { } static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value, - MCContext &Ctx) { + MCContext &Ctx, const Triple &TheTriple, + bool IsResolved) { unsigned Kind = Fixup.getKind(); int64_t SignedValue = static_cast<int64_t>(Value); switch (Kind) { @@ -151,6 +157,9 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value, Ctx.reportError(Fixup.getLoc(), "fixup value out of range"); return AdrImmBits(Value & 0x1fffffULL); case AArch64::fixup_aarch64_pcrel_adrp_imm21: + assert(!IsResolved); + if (TheTriple.isOSBinFormatCOFF()) + return AdrImmBits(Value & 0x1fffffULL); return AdrImmBits((Value & 0x1fffff000ULL) >> 12); case AArch64::fixup_aarch64_ldr_pcrel_imm19: case AArch64::fixup_aarch64_pcrel_branch19: @@ -163,11 +172,15 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value, return (Value >> 2) & 0x7ffff; case AArch64::fixup_aarch64_add_imm12: case AArch64::fixup_aarch64_ldst_imm12_scale1: + if (TheTriple.isOSBinFormatCOFF() && !IsResolved) + Value &= 0xfff; // Unsigned 12-bit immediate if (Value >= 0x1000) Ctx.reportError(Fixup.getLoc(), "fixup value out of range"); return Value; case AArch64::fixup_aarch64_ldst_imm12_scale2: + if (TheTriple.isOSBinFormatCOFF() && !IsResolved) + Value &= 0xfff; // 
Unsigned 12-bit immediate which gets multiplied by 2 if (Value >= 0x2000) Ctx.reportError(Fixup.getLoc(), "fixup value out of range"); @@ -175,6 +188,8 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value, Ctx.reportError(Fixup.getLoc(), "fixup must be 2-byte aligned"); return Value >> 1; case AArch64::fixup_aarch64_ldst_imm12_scale4: + if (TheTriple.isOSBinFormatCOFF() && !IsResolved) + Value &= 0xfff; // Unsigned 12-bit immediate which gets multiplied by 4 if (Value >= 0x4000) Ctx.reportError(Fixup.getLoc(), "fixup value out of range"); @@ -182,6 +197,8 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value, Ctx.reportError(Fixup.getLoc(), "fixup must be 4-byte aligned"); return Value >> 2; case AArch64::fixup_aarch64_ldst_imm12_scale8: + if (TheTriple.isOSBinFormatCOFF() && !IsResolved) + Value &= 0xfff; // Unsigned 12-bit immediate which gets multiplied by 8 if (Value >= 0x8000) Ctx.reportError(Fixup.getLoc(), "fixup value out of range"); @@ -189,6 +206,8 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value, Ctx.reportError(Fixup.getLoc(), "fixup must be 8-byte aligned"); return Value >> 3; case AArch64::fixup_aarch64_ldst_imm12_scale16: + if (TheTriple.isOSBinFormatCOFF() && !IsResolved) + Value &= 0xfff; // Unsigned 12-bit immediate which gets multiplied by 16 if (Value >= 0x10000) Ctx.reportError(Fixup.getLoc(), "fixup value out of range"); @@ -275,7 +294,7 @@ void AArch64AsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, MCFixupKindInfo Info = getFixupKindInfo(Fixup.getKind()); MCContext &Ctx = Asm.getContext(); // Apply any target-specific value adjustments. - Value = adjustFixupValue(Fixup, Value, Ctx); + Value = adjustFixupValue(Fixup, Value, Ctx, TheTriple, IsResolved); // Shift the value into position. Value <<= Info.TargetOffset; @@ -338,6 +357,26 @@ bool AArch64AsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const { return true; } +bool AArch64AsmBackend::shouldForceRelocation(const MCAssembler &Asm, + const MCFixup &Fixup, + const MCValue &Target) { + // The ADRP instruction adds some multiple of 0x1000 to the current PC & + // ~0xfff. This means that the required offset to reach a symbol can vary by + // up to one step depending on where the ADRP is in memory. For example: + // + // ADRP x0, there + // there: + // + // If the ADRP occurs at address 0xffc then "there" will be at 0x1000 and + // we'll need that as an offset. At any other address "there" will be in the + // same page as the ADRP and the instruction should encode 0x0. Assuming the + // section isn't 0x1000-aligned, we therefore need to delegate this decision + // to the linker -- a relocation! 
+ if ((uint32_t)Fixup.getKind() == AArch64::fixup_aarch64_pcrel_adrp_imm21) + return true; + return false; +} + namespace { namespace CU { @@ -389,10 +428,12 @@ class DarwinAArch64AsmBackend : public AArch64AsmBackend { } public: - DarwinAArch64AsmBackend(const Target &T, const MCRegisterInfo &MRI) - : AArch64AsmBackend(T, /*IsLittleEndian*/true), MRI(MRI) {} + DarwinAArch64AsmBackend(const Target &T, const Triple &TT, + const MCRegisterInfo &MRI) + : AArch64AsmBackend(T, TT, /*IsLittleEndian*/ true), MRI(MRI) {} - MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override { + std::unique_ptr<MCObjectWriter> + createObjectWriter(raw_pwrite_stream &OS) const override { return createAArch64MachObjectWriter(OS, MachO::CPU_TYPE_ARM64, MachO::CPU_SUBTYPE_ARM64_ALL); } @@ -537,47 +578,27 @@ public: uint8_t OSABI; bool IsILP32; - ELFAArch64AsmBackend(const Target &T, uint8_t OSABI, bool IsLittleEndian, - bool IsILP32) - : AArch64AsmBackend(T, IsLittleEndian), OSABI(OSABI), IsILP32(IsILP32) {} + ELFAArch64AsmBackend(const Target &T, const Triple &TT, uint8_t OSABI, + bool IsLittleEndian, bool IsILP32) + : AArch64AsmBackend(T, TT, IsLittleEndian), OSABI(OSABI), + IsILP32(IsILP32) {} - MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override { + std::unique_ptr<MCObjectWriter> + createObjectWriter(raw_pwrite_stream &OS) const override { return createAArch64ELFObjectWriter(OS, OSABI, IsLittleEndian, IsILP32); } - - bool shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup, - const MCValue &Target) override; }; -bool ELFAArch64AsmBackend::shouldForceRelocation(const MCAssembler &Asm, - const MCFixup &Fixup, - const MCValue &Target) { - // The ADRP instruction adds some multiple of 0x1000 to the current PC & - // ~0xfff. This means that the required offset to reach a symbol can vary by - // up to one step depending on where the ADRP is in memory. For example: - // - // ADRP x0, there - // there: - // - // If the ADRP occurs at address 0xffc then "there" will be at 0x1000 and - // we'll need that as an offset. At any other address "there" will be in the - // same page as the ADRP and the instruction should encode 0x0. Assuming the - // section isn't 0x1000-aligned, we therefore need to delegate this decision - // to the linker -- a relocation! 
- if ((uint32_t)Fixup.getKind() == AArch64::fixup_aarch64_pcrel_adrp_imm21) - return true; - return false; -} - } namespace { class COFFAArch64AsmBackend : public AArch64AsmBackend { public: COFFAArch64AsmBackend(const Target &T, const Triple &TheTriple) - : AArch64AsmBackend(T, /*IsLittleEndian*/true) {} + : AArch64AsmBackend(T, TheTriple, /*IsLittleEndian*/ true) {} - MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override { + std::unique_ptr<MCObjectWriter> + createObjectWriter(raw_pwrite_stream &OS) const override { return createAArch64WinCOFFObjectWriter(OS); } }; @@ -589,7 +610,7 @@ MCAsmBackend *llvm::createAArch64leAsmBackend(const Target &T, StringRef CPU, const MCTargetOptions &Options) { if (TheTriple.isOSBinFormatMachO()) - return new DarwinAArch64AsmBackend(T, MRI); + return new DarwinAArch64AsmBackend(T, TheTriple, MRI); if (TheTriple.isOSBinFormatCOFF()) return new COFFAArch64AsmBackend(T, TheTriple); @@ -598,7 +619,8 @@ MCAsmBackend *llvm::createAArch64leAsmBackend(const Target &T, uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TheTriple.getOS()); bool IsILP32 = Options.getABIName() == "ilp32"; - return new ELFAArch64AsmBackend(T, OSABI, /*IsLittleEndian=*/true, IsILP32); + return new ELFAArch64AsmBackend(T, TheTriple, OSABI, /*IsLittleEndian=*/true, + IsILP32); } MCAsmBackend *llvm::createAArch64beAsmBackend(const Target &T, @@ -610,5 +632,6 @@ MCAsmBackend *llvm::createAArch64beAsmBackend(const Target &T, "Big endian is only supported for ELF targets!"); uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TheTriple.getOS()); bool IsILP32 = Options.getABIName() == "ilp32"; - return new ELFAArch64AsmBackend(T, OSABI, /*IsLittleEndian=*/false, IsILP32); + return new ELFAArch64AsmBackend(T, TheTriple, OSABI, /*IsLittleEndian=*/false, + IsILP32); } diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp index 89c3e5b4c76e..2d90e67960f8 100644 --- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp +++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp @@ -19,6 +19,7 @@ #include "llvm/MC/MCContext.h" #include "llvm/MC/MCELFObjectWriter.h" #include "llvm/MC/MCFixup.h" +#include "llvm/MC/MCObjectWriter.h" #include "llvm/MC/MCValue.h" #include "llvm/Support/ErrorHandling.h" #include <cassert> @@ -428,11 +429,10 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx, llvm_unreachable("Unimplemented fixup -> relocation"); } -MCObjectWriter *llvm::createAArch64ELFObjectWriter(raw_pwrite_stream &OS, - uint8_t OSABI, - bool IsLittleEndian, - bool IsILP32) { - MCELFObjectTargetWriter *MOTW = - new AArch64ELFObjectWriter(OSABI, IsLittleEndian, IsILP32); - return createELFObjectWriter(MOTW, OS, IsLittleEndian); +std::unique_ptr<MCObjectWriter> +llvm::createAArch64ELFObjectWriter(raw_pwrite_stream &OS, uint8_t OSABI, + bool IsLittleEndian, bool IsILP32) { + auto MOTW = + llvm::make_unique<AArch64ELFObjectWriter>(OSABI, IsLittleEndian, IsILP32); + return createELFObjectWriter(std::move(MOTW), OS, IsLittleEndian); } diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp index a0de3c39562b..8ee627d50df2 100644 --- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp +++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp @@ -86,10 +86,11 @@ class AArch64ELFStreamer : public MCELFStreamer { 
public: friend class AArch64TargetELFStreamer; - AArch64ELFStreamer(MCContext &Context, MCAsmBackend &TAB, - raw_pwrite_stream &OS, MCCodeEmitter *Emitter) - : MCELFStreamer(Context, TAB, OS, Emitter), MappingSymbolCounter(0), - LastEMS(EMS_None) {} + AArch64ELFStreamer(MCContext &Context, std::unique_ptr<MCAsmBackend> TAB, + raw_pwrite_stream &OS, + std::unique_ptr<MCCodeEmitter> Emitter) + : MCELFStreamer(Context, std::move(TAB), OS, std::move(Emitter)), + MappingSymbolCounter(0), LastEMS(EMS_None) {} void ChangeSection(MCSection *Section, const MCExpr *Subsection) override { // We have to keep track of the mapping symbol state of any sections we @@ -101,6 +102,14 @@ public: MCELFStreamer::ChangeSection(Section, Subsection); } + // Reset state between object emissions + void reset() override { + MappingSymbolCounter = 0; + MCELFStreamer::reset(); + LastMappingSymbols.clear(); + LastEMS = EMS_None; + } + /// This function is the one used to emit instruction data into the ELF /// streamer. We override it to add the appropriate mapping symbol if /// necessary. @@ -198,10 +207,13 @@ MCTargetStreamer *createAArch64AsmTargetStreamer(MCStreamer &S, return new AArch64TargetAsmStreamer(S, OS); } -MCELFStreamer *createAArch64ELFStreamer(MCContext &Context, MCAsmBackend &TAB, +MCELFStreamer *createAArch64ELFStreamer(MCContext &Context, + std::unique_ptr<MCAsmBackend> TAB, raw_pwrite_stream &OS, - MCCodeEmitter *Emitter, bool RelaxAll) { - AArch64ELFStreamer *S = new AArch64ELFStreamer(Context, TAB, OS, Emitter); + std::unique_ptr<MCCodeEmitter> Emitter, + bool RelaxAll) { + AArch64ELFStreamer *S = + new AArch64ELFStreamer(Context, std::move(TAB), OS, std::move(Emitter)); if (RelaxAll) S->getAssembler().setRelaxAll(true); return S; diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.h b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.h index ef48203c8bc0..19b188aa1c61 100644 --- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.h +++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.h @@ -18,9 +18,11 @@ namespace llvm { -MCELFStreamer *createAArch64ELFStreamer(MCContext &Context, MCAsmBackend &TAB, +MCELFStreamer *createAArch64ELFStreamer(MCContext &Context, + std::unique_ptr<MCAsmBackend> TAB, raw_pwrite_stream &OS, - MCCodeEmitter *Emitter, bool RelaxAll); + std::unique_ptr<MCCodeEmitter> Emitter, + bool RelaxAll); } #endif diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp index c25bd8c8f6cc..12b5a27b7699 100644 --- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp +++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp @@ -102,7 +102,24 @@ AArch64MCAsmInfoELF::AArch64MCAsmInfoELF(const Triple &T) { } AArch64MCAsmInfoCOFF::AArch64MCAsmInfoCOFF() { - CommentString = ";"; PrivateGlobalPrefix = ".L"; PrivateLabelPrefix = ".L"; + + Data16bitsDirective = "\t.hword\t"; + Data32bitsDirective = "\t.word\t"; + Data64bitsDirective = "\t.xword\t"; + + AlignmentIsInBytes = false; + SupportsDebugInformation = true; + CodePointerSize = 8; +} + +AArch64MCAsmInfoMicrosoftCOFF::AArch64MCAsmInfoMicrosoftCOFF() { + CommentString = ";"; + ExceptionsType = ExceptionHandling::WinEH; +} + +AArch64MCAsmInfoGNUCOFF::AArch64MCAsmInfoGNUCOFF() { + CommentString = "//"; + ExceptionsType = ExceptionHandling::DwarfCFI; } diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h 
b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h index 2d7107a37244..afde87b40929 100644 --- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h +++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h @@ -38,6 +38,14 @@ struct AArch64MCAsmInfoCOFF : public MCAsmInfoCOFF { explicit AArch64MCAsmInfoCOFF(); }; +struct AArch64MCAsmInfoMicrosoftCOFF : public AArch64MCAsmInfoCOFF { + explicit AArch64MCAsmInfoMicrosoftCOFF(); +}; + +struct AArch64MCAsmInfoGNUCOFF : public AArch64MCAsmInfoCOFF { + explicit AArch64MCAsmInfoGNUCOFF(); +}; + } // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp index 97c92fa0778d..f606d272bcb0 100644 --- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp +++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp @@ -13,7 +13,6 @@ //===----------------------------------------------------------------------===// #include "AArch64MCExpr.h" -#include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbolELF.h" diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp index a2555496cdb9..c3458d625b83 100644 --- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp +++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp @@ -16,6 +16,8 @@ #include "AArch64MCAsmInfo.h" #include "AArch64WinCOFFStreamer.h" #include "InstPrinter/AArch64InstPrinter.h" +#include "llvm/MC/MCAsmBackend.h" +#include "llvm/MC/MCCodeEmitter.h" #include "llvm/MC/MCInstrAnalysis.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" @@ -49,9 +51,18 @@ createAArch64MCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) { return createAArch64MCSubtargetInfoImpl(TT, CPU, FS); } +void AArch64_MC::initLLVMToCVRegMapping(MCRegisterInfo *MRI) { + for (unsigned Reg = AArch64::NoRegister + 1; + Reg < AArch64::NUM_TARGET_REGS; ++Reg) { + unsigned CV = MRI->getEncodingValue(Reg); + MRI->mapLLVMRegToCVReg(Reg, CV); + } +} + static MCRegisterInfo *createAArch64MCRegisterInfo(const Triple &Triple) { MCRegisterInfo *X = new MCRegisterInfo(); InitAArch64MCRegisterInfo(X, AArch64::LR); + AArch64_MC::initLLVMToCVRegMapping(X); return X; } @@ -60,8 +71,10 @@ static MCAsmInfo *createAArch64MCAsmInfo(const MCRegisterInfo &MRI, MCAsmInfo *MAI; if (TheTriple.isOSBinFormatMachO()) MAI = new AArch64MCAsmInfoDarwin(); + else if (TheTriple.isWindowsMSVCEnvironment()) + MAI = new AArch64MCAsmInfoMicrosoftCOFF(); else if (TheTriple.isOSBinFormatCOFF()) - MAI = new AArch64MCAsmInfoCOFF(); + MAI = new AArch64MCAsmInfoGNUCOFF(); else { assert(TheTriple.isOSBinFormatELF() && "Invalid target"); MAI = new AArch64MCAsmInfoELF(TheTriple); @@ -75,28 +88,6 @@ static MCAsmInfo *createAArch64MCAsmInfo(const MCRegisterInfo &MRI, return MAI; } -static void adjustCodeGenOpts(const Triple &TT, Reloc::Model RM, - CodeModel::Model &CM) { - assert((TT.isOSBinFormatELF() || TT.isOSBinFormatMachO() || - TT.isOSBinFormatCOFF()) && "Invalid target"); - - if (CM == CodeModel::Default) - CM = CodeModel::Small; - // The default MCJIT memory managers make no guarantees about where they can - // find an executable page; JITed code needs to be able to refer to globals - // no matter how far away they are. 
- else if (CM == CodeModel::JITDefault) - CM = CodeModel::Large; - else if (CM != CodeModel::Small && CM != CodeModel::Large) { - if (!TT.isOSFuchsia()) - report_fatal_error( - "Only small and large code models are allowed on AArch64"); - else if (CM != CodeModel::Kernel) - report_fatal_error( - "Only small, kernel, and large code models are allowed on AArch64"); - } -} - static MCInstPrinter *createAArch64MCInstPrinter(const Triple &T, unsigned SyntaxVariant, const MCAsmInfo &MAI, @@ -111,25 +102,32 @@ static MCInstPrinter *createAArch64MCInstPrinter(const Triple &T, } static MCStreamer *createELFStreamer(const Triple &T, MCContext &Ctx, - MCAsmBackend &TAB, raw_pwrite_stream &OS, - MCCodeEmitter *Emitter, bool RelaxAll) { - return createAArch64ELFStreamer(Ctx, TAB, OS, Emitter, RelaxAll); + std::unique_ptr<MCAsmBackend> &&TAB, + raw_pwrite_stream &OS, + std::unique_ptr<MCCodeEmitter> &&Emitter, + bool RelaxAll) { + return createAArch64ELFStreamer(Ctx, std::move(TAB), OS, std::move(Emitter), + RelaxAll); } -static MCStreamer *createMachOStreamer(MCContext &Ctx, MCAsmBackend &TAB, +static MCStreamer *createMachOStreamer(MCContext &Ctx, + std::unique_ptr<MCAsmBackend> &&TAB, raw_pwrite_stream &OS, - MCCodeEmitter *Emitter, bool RelaxAll, + std::unique_ptr<MCCodeEmitter> &&Emitter, + bool RelaxAll, bool DWARFMustBeAtTheEnd) { - return createMachOStreamer(Ctx, TAB, OS, Emitter, RelaxAll, - DWARFMustBeAtTheEnd, + return createMachOStreamer(Ctx, std::move(TAB), OS, std::move(Emitter), + RelaxAll, DWARFMustBeAtTheEnd, /*LabelSections*/ true); } -static MCStreamer *createWinCOFFStreamer(MCContext &Ctx, MCAsmBackend &TAB, - raw_pwrite_stream &OS, - MCCodeEmitter *Emitter, bool RelaxAll, - bool IncrementalLinkerCompatible) { - return createAArch64WinCOFFStreamer(Ctx, TAB, OS, Emitter, RelaxAll, +static MCStreamer * +createWinCOFFStreamer(MCContext &Ctx, std::unique_ptr<MCAsmBackend> &&TAB, + raw_pwrite_stream &OS, + std::unique_ptr<MCCodeEmitter> &&Emitter, bool RelaxAll, + bool IncrementalLinkerCompatible) { + return createAArch64WinCOFFStreamer(Ctx, std::move(TAB), OS, + std::move(Emitter), RelaxAll, IncrementalLinkerCompatible); } @@ -144,9 +142,6 @@ extern "C" void LLVMInitializeAArch64TargetMC() { // Register the MC asm info. RegisterMCAsmInfoFn X(*T, createAArch64MCAsmInfo); - // Register the MC codegen info. - TargetRegistry::registerMCAdjustCodeGenOpts(*T, adjustCodeGenOpts); - // Register the MC instruction info. 
TargetRegistry::RegisterMCInstrInfo(*T, createAArch64MCInstrInfo); diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h index 1404926b8124..b9e1673b9317 100644 --- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h +++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h @@ -16,6 +16,8 @@ #include "llvm/Support/DataTypes.h" +#include <memory> + namespace llvm { class formatted_raw_ostream; class MCAsmBackend; @@ -51,16 +53,16 @@ MCAsmBackend *createAArch64beAsmBackend(const Target &T, const Triple &TT, StringRef CPU, const MCTargetOptions &Options); -MCObjectWriter *createAArch64ELFObjectWriter(raw_pwrite_stream &OS, - uint8_t OSABI, - bool IsLittleEndian, - bool IsILP32); +std::unique_ptr<MCObjectWriter> +createAArch64ELFObjectWriter(raw_pwrite_stream &OS, uint8_t OSABI, + bool IsLittleEndian, bool IsILP32); -MCObjectWriter *createAArch64MachObjectWriter(raw_pwrite_stream &OS, - uint32_t CPUType, - uint32_t CPUSubtype); +std::unique_ptr<MCObjectWriter> +createAArch64MachObjectWriter(raw_pwrite_stream &OS, uint32_t CPUType, + uint32_t CPUSubtype); -MCObjectWriter *createAArch64WinCOFFObjectWriter(raw_pwrite_stream &OS); +std::unique_ptr<MCObjectWriter> +createAArch64WinCOFFObjectWriter(raw_pwrite_stream &OS); MCTargetStreamer *createAArch64AsmTargetStreamer(MCStreamer &S, formatted_raw_ostream &OS, @@ -70,6 +72,10 @@ MCTargetStreamer *createAArch64AsmTargetStreamer(MCStreamer &S, MCTargetStreamer *createAArch64ObjectTargetStreamer(MCStreamer &S, const MCSubtargetInfo &STI); +namespace AArch64_MC { +void initLLVMToCVRegMapping(MCRegisterInfo *MRI); +} + } // End llvm namespace // Defines symbolic names for AArch64 registers. 
This defines a mapping from diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp index 19b2576f6895..55151c2b8d21 100644 --- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp +++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp @@ -430,10 +430,10 @@ void AArch64MachObjectWriter::recordRelocation( Writer->addRelocation(RelSymbol, Fragment->getParent(), MRE); } -MCObjectWriter *llvm::createAArch64MachObjectWriter(raw_pwrite_stream &OS, - uint32_t CPUType, - uint32_t CPUSubtype) { +std::unique_ptr<MCObjectWriter> +llvm::createAArch64MachObjectWriter(raw_pwrite_stream &OS, uint32_t CPUType, + uint32_t CPUSubtype) { return createMachObjectWriter( - new AArch64MachObjectWriter(CPUType, CPUSubtype), OS, + llvm::make_unique<AArch64MachObjectWriter>(CPUType, CPUSubtype), OS, /*IsLittleEndian=*/true); } diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp index 31762b9e4cd5..d06c5e8862ae 100644 --- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp +++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp @@ -14,6 +14,7 @@ #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCFixup.h" #include "llvm/MC/MCFixupKindInfo.h" +#include "llvm/MC/MCObjectWriter.h" #include "llvm/MC/MCValue.h" #include "llvm/MC/MCWinCOFFObjectWriter.h" #include "llvm/Support/ErrorHandling.h" @@ -96,9 +97,10 @@ bool AArch64WinCOFFObjectWriter::recordRelocation(const MCFixup &Fixup) const { namespace llvm { -MCObjectWriter *createAArch64WinCOFFObjectWriter(raw_pwrite_stream &OS) { - MCWinCOFFObjectTargetWriter *MOTW = new AArch64WinCOFFObjectWriter(); - return createWinCOFFObjectWriter(MOTW, OS); +std::unique_ptr<MCObjectWriter> +createAArch64WinCOFFObjectWriter(raw_pwrite_stream &OS) { + auto MOTW = llvm::make_unique<AArch64WinCOFFObjectWriter>(); + return createWinCOFFObjectWriter(std::move(MOTW), OS); } } // end namespace llvm diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp index 6c8da27e398f..c88363d2c250 100644 --- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp +++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp @@ -8,6 +8,8 @@ //===----------------------------------------------------------------------===// #include "AArch64WinCOFFStreamer.h" +#include "llvm/MC/MCAsmBackend.h" +#include "llvm/MC/MCCodeEmitter.h" using namespace llvm; @@ -17,19 +19,28 @@ class AArch64WinCOFFStreamer : public MCWinCOFFStreamer { public: friend class AArch64TargetWinCOFFStreamer; - AArch64WinCOFFStreamer(MCContext &C, MCAsmBackend &AB, MCCodeEmitter &CE, - raw_pwrite_stream &OS) - : MCWinCOFFStreamer(C, AB, CE, OS) {} + AArch64WinCOFFStreamer(MCContext &C, std::unique_ptr<MCAsmBackend> AB, + std::unique_ptr<MCCodeEmitter> CE, + raw_pwrite_stream &OS) + : MCWinCOFFStreamer(C, std::move(AB), std::move(CE), OS) {} + + void FinishImpl() override; }; + +void AArch64WinCOFFStreamer::FinishImpl() { + EmitFrames(nullptr); + + MCWinCOFFStreamer::FinishImpl(); +} } // end anonymous namespace namespace llvm { -MCWinCOFFStreamer -*createAArch64WinCOFFStreamer(MCContext &Context, MCAsmBackend &MAB, - raw_pwrite_stream &OS, - MCCodeEmitter *Emitter, bool RelaxAll, - bool 
IncrementalLinkerCompatible) { - auto *S = new AArch64WinCOFFStreamer(Context, MAB, *Emitter, OS); +MCWinCOFFStreamer *createAArch64WinCOFFStreamer( + MCContext &Context, std::unique_ptr<MCAsmBackend> MAB, + raw_pwrite_stream &OS, std::unique_ptr<MCCodeEmitter> Emitter, + bool RelaxAll, bool IncrementalLinkerCompatible) { + auto *S = new AArch64WinCOFFStreamer(Context, std::move(MAB), + std::move(Emitter), OS); S->getAssembler().setIncrementalLinkerCompatible(IncrementalLinkerCompatible); return S; } diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.h b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.h index 1b4fcd6804e2..b67a19e883e9 100644 --- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.h +++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.h @@ -33,11 +33,10 @@ public: namespace llvm { -MCWinCOFFStreamer -*createAArch64WinCOFFStreamer(MCContext &Context, MCAsmBackend &TAB, - raw_pwrite_stream &OS, - MCCodeEmitter *Emitter, bool RelaxAll, - bool IncrementalLinkerCompatible); +MCWinCOFFStreamer *createAArch64WinCOFFStreamer( + MCContext &Context, std::unique_ptr<MCAsmBackend> TAB, + raw_pwrite_stream &OS, std::unique_ptr<MCCodeEmitter> Emitter, + bool RelaxAll, bool IncrementalLinkerCompatible); } // end llvm namespace #endif diff --git a/contrib/llvm/lib/Target/AArch64/SVEInstrFormats.td b/contrib/llvm/lib/Target/AArch64/SVEInstrFormats.td new file mode 100644 index 000000000000..15c1275f259d --- /dev/null +++ b/contrib/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -0,0 +1,103 @@ +//=-- SVEInstrFormats.td - AArch64 SVE Instruction classes -*- tablegen -*--=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// AArch64 Scalable Vector Extension (SVE) Instruction Class Definitions. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// SVE Integer Arithmetic - Unpredicated Group. 
+//===----------------------------------------------------------------------===// + +class sve_int_bin_cons_arit_0<bits<2> sz8_64, bits<3> opc, string asm, + ZPRRegOp zprty> +: I<(outs zprty:$Zd), (ins zprty:$Zn, zprty:$Zm), + asm, "\t$Zd, $Zn, $Zm", + "", []>, Sched<[]> { + bits<5> Zd; + bits<5> Zm; + bits<5> Zn; + let Inst{31-24} = 0b00000100; + let Inst{23-22} = sz8_64; + let Inst{21} = 0b1; + let Inst{20-16} = Zm; + let Inst{15-13} = 0b000; + let Inst{12-10} = opc; + let Inst{9-5} = Zn; + let Inst{4-0} = Zd; +} + +multiclass sve_int_bin_cons_arit_0<bits<3> opc, string asm> { + def _B : sve_int_bin_cons_arit_0<0b00, opc, asm, ZPR8>; + def _H : sve_int_bin_cons_arit_0<0b01, opc, asm, ZPR16>; + def _S : sve_int_bin_cons_arit_0<0b10, opc, asm, ZPR32>; + def _D : sve_int_bin_cons_arit_0<0b11, opc, asm, ZPR64>; +} + +//===----------------------------------------------------------------------===// +// SVE Permute - In Lane Group +//===----------------------------------------------------------------------===// + +class sve_int_perm_bin_perm_zz<bits<3> opc, bits<2> sz8_64, string asm, + ZPRRegOp zprty> +: I<(outs zprty:$Zd), (ins zprty:$Zn, zprty:$Zm), + asm, "\t$Zd, $Zn, $Zm", + "", + []>, Sched<[]> { + bits<5> Zd; + bits<5> Zm; + bits<5> Zn; + let Inst{31-24} = 0b00000101; + let Inst{23-22} = sz8_64; + let Inst{21} = 0b1; + let Inst{20-16} = Zm; + let Inst{15-13} = 0b011; + let Inst{12-10} = opc; + let Inst{9-5} = Zn; + let Inst{4-0} = Zd; +} + +multiclass sve_int_perm_bin_perm_zz<bits<3> opc, string asm> { + def _B : sve_int_perm_bin_perm_zz<opc, 0b00, asm, ZPR8>; + def _H : sve_int_perm_bin_perm_zz<opc, 0b01, asm, ZPR16>; + def _S : sve_int_perm_bin_perm_zz<opc, 0b10, asm, ZPR32>; + def _D : sve_int_perm_bin_perm_zz<opc, 0b11, asm, ZPR64>; +} + +//===----------------------------------------------------------------------===// +// SVE Permute - Predicates Group +//===----------------------------------------------------------------------===// + +class sve_int_perm_bin_perm_pp<bits<3> opc, bits<2> sz8_64, string asm, + PPRRegOp pprty> +: I<(outs pprty:$Pd), (ins pprty:$Pn, pprty:$Pm), + asm, "\t$Pd, $Pn, $Pm", + "", + []>, Sched<[]> { + bits<4> Pd; + bits<4> Pm; + bits<4> Pn; + let Inst{31-24} = 0b00000101; + let Inst{23-22} = sz8_64; + let Inst{21-20} = 0b10; + let Inst{19-16} = Pm; + let Inst{15-13} = 0b010; + let Inst{12-10} = opc; + let Inst{9} = 0b0; + let Inst{8-5} = Pn; + let Inst{4} = 0b0; + let Inst{3-0} = Pd; +} + +multiclass sve_int_perm_bin_perm_pp<bits<3> opc, string asm> { + def _B : sve_int_perm_bin_perm_pp<opc, 0b00, asm, PPR8>; + def _H : sve_int_perm_bin_perm_pp<opc, 0b01, asm, PPR16>; + def _S : sve_int_perm_bin_perm_pp<opc, 0b10, asm, PPR32>; + def _D : sve_int_perm_bin_perm_pp<opc, 0b11, asm, PPR64>; +}
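The new SVEInstrFormats.td classes above pin each instruction's 32-bit encoding down to a handful of fields. Purely as a cross-check of the sve_int_bin_cons_arit_0 layout, here is a small C++ sketch that packs those fields by hand; the opc and register values used in main() are assumed placeholders for illustration, not taken from this diff.

#include <cstdint>
#include <cstdio>

// Field layout of sve_int_bin_cons_arit_0:
//   31-24 = 0b00000100, 23-22 = sz8_64, 21 = 1, 20-16 = Zm,
//   15-13 = 0b000, 12-10 = opc, 9-5 = Zn, 4-0 = Zd
static uint32_t encodeSveIntBinConsArit0(unsigned sz, unsigned opc,
                                         unsigned Zm, unsigned Zn,
                                         unsigned Zd) {
  return (0x04u << 24) | ((sz & 0x3u) << 22) | (1u << 21) |
         ((Zm & 0x1Fu) << 16) | ((opc & 0x7u) << 10) |
         ((Zn & 0x1Fu) << 5) | (Zd & 0x1Fu);
}

int main() {
  // Hypothetical instance: sz=0b00 (the _B variant), opc=0b000,
  // Zd=z0, Zn=z1, Zm=z2 -> 0x04220020.
  std::printf("0x%08x\n", (unsigned)encodeSveIntBinConsArit0(0, 0, 2, 1, 0));
}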
\ No newline at end of file diff --git a/contrib/llvm/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp b/contrib/llvm/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp index 7ac9a5a08484..8fb161574c5b 100644 --- a/contrib/llvm/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp +++ b/contrib/llvm/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp @@ -29,11 +29,11 @@ extern "C" void LLVMInitializeAArch64TargetInfo() { // Now register the "arm64" name for use with "-march". We don't want it to // take possession of the Triple::aarch64 tag though. TargetRegistry::RegisterTarget(getTheARM64Target(), "arm64", - "ARM64 (little endian)", + "ARM64 (little endian)", "AArch64", [](Triple::ArchType) { return false; }, true); RegisterTarget<Triple::aarch64, /*HasJIT=*/true> Z( - getTheAArch64leTarget(), "aarch64", "AArch64 (little endian)"); + getTheAArch64leTarget(), "aarch64", "AArch64 (little endian)", "AArch64"); RegisterTarget<Triple::aarch64_be, /*HasJIT=*/true> W( - getTheAArch64beTarget(), "aarch64_be", "AArch64 (big endian)"); + getTheAArch64beTarget(), "aarch64_be", "AArch64 (big endian)", "AArch64"); } diff --git a/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h index 5d76681cd97b..c1c799b7b349 100644 --- a/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h +++ b/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h @@ -517,7 +517,12 @@ namespace AArch64II { /// thread-local symbol. On Darwin, only one type of thread-local access /// exists (pre linker-relaxation), but on ELF the TLSModel used for the /// referee will affect interpretation. - MO_TLS = 0x40 + MO_TLS = 0x40, + + /// MO_DLLIMPORT - On a symbol operand, this represents that the reference + /// to the symbol is for an import stub. This is used for DLL import + /// storage class indication on Windows. + MO_DLLIMPORT = 0x80, }; } // end namespace AArch64II |
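Returning to the shouldForceRelocation comment earlier in this diff (AArch64AsmBackend.cpp): the point is that the 4 KiB page delta between an ADRP and its target depends on where the ADRP itself lands, so the assembler cannot finalize the immediate and must emit a relocation. A small sketch of that page arithmetic, with made-up addresses matching the comment's example:

#include <cstdint>
#include <cstdio>

// Number of 4 KiB pages an ADRP at PC would have to add to reach Target.
static int64_t adrpPageDelta(uint64_t PC, uint64_t Target) {
  return (int64_t)((Target & ~0xfffULL) - (PC & ~0xfffULL)) >> 12;
}

int main() {
  // "there" is the 4-byte-later address, as in the comment's example.
  std::printf("%lld\n", (long long)adrpPageDelta(0xffc, 0xffc + 4)); // 1 page
  std::printf("%lld\n", (long long)adrpPageDelta(0xff8, 0xff8 + 4)); // 0 pages
}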