author:    Dimitry Andric <dim@FreeBSD.org>  2017-04-16 16:25:46 +0000
committer: Dimitry Andric <dim@FreeBSD.org>  2017-04-16 16:25:46 +0000
commit:    7a7e6055035bfd93ab507051819373a6f171258b (patch)
tree:      dc9ac22b4fea4f445748feaf7232a146623f0dfa /contrib/llvm/lib/Target/AArch64
parent:    b96a714f453e7f5aeeb3c2df2c3e1e8ad749f96f (diff)
parent:    71d5a2540a98c81f5bcaeb48805e0e2881f530ef (diff)
Merge llvm trunk r300422 and resolve conflicts.
Notes:
    svn path=/projects/clang500-import/; revision=317029
Diffstat (limited to 'contrib/llvm/lib/Target/AArch64')
57 files changed, 4560 insertions, 2136 deletions
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64.h b/contrib/llvm/lib/Target/AArch64/AArch64.h
index fd106a8d9b0b..b44b13e36e15 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64.h
+++ b/contrib/llvm/lib/Target/AArch64/AArch64.h
@@ -22,8 +22,11 @@ namespace llvm {
+class AArch64RegisterBankInfo;
+class AArch64Subtarget;
 class AArch64TargetMachine;
 class FunctionPass;
+class InstructionSelector;
 class MachineFunctionPass;

 FunctionPass *createAArch64DeadRegisterDefinitions();
@@ -45,6 +48,9 @@ FunctionPass *createAArch64A53Fix835769();
 FunctionPass *createAArch64CleanupLocalDynamicTLSPass();
 FunctionPass *createAArch64CollectLOHPass();

+InstructionSelector *
+createAArch64InstructionSelector(const AArch64TargetMachine &,
+                                 AArch64Subtarget &, AArch64RegisterBankInfo &);

 void initializeAArch64A53Fix835769Pass(PassRegistry&);
 void initializeAArch64A57FPLoadBalancingPass(PassRegistry&);
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64.td b/contrib/llvm/lib/Target/AArch64/AArch64.td
index 91c335fac32d..519ca2894683 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64.td
+++ b/contrib/llvm/lib/Target/AArch64/AArch64.td
@@ -27,7 +27,7 @@ def FeatureNEON : SubtargetFeature<"neon", "HasNEON", "true",
   "Enable Advanced SIMD instructions", [FeatureFPARMv8]>;

 def FeatureCrypto : SubtargetFeature<"crypto", "HasCrypto", "true",
-  "Enable cryptographic instructions">;
+  "Enable cryptographic instructions", [FeatureNEON]>;

 def FeatureCRC : SubtargetFeature<"crc", "HasCRC", "true",
   "Enable ARMv8 CRC-32 checksum instructions">;
@@ -38,6 +38,9 @@ def FeatureRAS : SubtargetFeature<"ras", "HasRAS", "true",
 def FeatureLSE : SubtargetFeature<"lse", "HasLSE", "true",
   "Enable ARMv8.1 Large System Extension (LSE) atomic instructions">;

+def FeatureRDM : SubtargetFeature<"rdm", "HasRDM", "true",
+  "Enable ARMv8.1 Rounding Double Multiply Add/Subtract instructions">;
+
 def FeaturePerfMon : SubtargetFeature<"perfmon", "HasPerfMon", "true",
   "Enable ARMv8 PMUv3 Performance Monitors extension">;
@@ -100,6 +103,14 @@ def FeatureArithmeticCbzFusion : SubtargetFeature<
     "arith-cbz-fusion", "HasArithmeticCbzFusion", "true",
     "CPU fuses arithmetic + cbz/cbnz operations">;

+def FeatureFuseAES : SubtargetFeature<
+    "fuse-aes", "HasFuseAES", "true",
+    "CPU fuses AES crypto operations">;
+
+def FeatureFuseLiterals : SubtargetFeature<
+    "fuse-literals", "HasFuseLiterals", "true",
+    "CPU fuses literal generation operations">;
+
 def FeatureDisableLatencySchedHeuristic : SubtargetFeature<
     "disable-latency-sched-heuristic", "DisableLatencySchedHeuristic", "true",
     "Disable latency scheduling heuristic">;
@@ -108,12 +119,22 @@ def FeatureUseRSqrt : SubtargetFeature<
     "use-reciprocal-square-root", "UseRSqrt", "true",
     "Use the reciprocal square root approximation">;

+def FeatureNoNegativeImmediates : SubtargetFeature<"no-neg-immediates",
+                                        "NegativeImmediates", "false",
+                                        "Convert immediates and instructions "
+                                        "to their negated or complemented "
+                                        "equivalent when the immediate does "
+                                        "not fit in the encoding.">;
+
+def FeatureLSLFast : SubtargetFeature<
+    "lsl-fast", "HasLSLFast", "true",
+    "CPU has a fastpath logical shift of up to 3 places">;

//===----------------------------------------------------------------------===//
// Architectures.
//

def HasV8_1aOps : SubtargetFeature<"v8.1a", "HasV8_1aOps", "true",
-  "Support ARM v8.1a instructions", [FeatureCRC, FeatureLSE]>;
+  "Support ARM v8.1a instructions", [FeatureCRC, FeatureLSE, FeatureRDM]>;

 def HasV8_2aOps : SubtargetFeature<"v8.2a", "HasV8_2aOps", "true",
   "Support ARM v8.2a instructions", [HasV8_1aOps, FeatureRAS]>;
@@ -123,6 +144,7 @@ def HasV8_2aOps : SubtargetFeature<"v8.2a", "HasV8_2aOps", "true",
//===----------------------------------------------------------------------===//

 include "AArch64RegisterInfo.td"
+include "AArch64RegisterBanks.td"
 include "AArch64CallingConvention.td"

//===----------------------------------------------------------------------===//
@@ -149,7 +171,8 @@ include "AArch64SchedCyclone.td"
 include "AArch64SchedFalkor.td"
 include "AArch64SchedKryo.td"
 include "AArch64SchedM1.td"
-include "AArch64SchedVulcan.td"
+include "AArch64SchedThunderX.td"
+include "AArch64SchedThunderX2T99.td"

 def ProcA35 : SubtargetFeature<"a35", "ARMProcFamily", "CortexA35",
                                "Cortex-A35 ARM processors", [
@@ -180,6 +203,8 @@ def ProcA57 : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57",
                                FeatureCrypto,
                                FeatureCustomCheapAsMoveHandling,
                                FeatureFPARMv8,
+                               FeatureFuseAES,
+                               FeatureFuseLiterals,
                               FeatureNEON,
                               FeaturePerfMon,
                               FeaturePostRAScheduler,
@@ -226,6 +251,7 @@ def ProcExynosM1 : SubtargetFeature<"exynosm1", "ARMProcFamily", "ExynosM1",
                                     FeatureCrypto,
                                     FeatureCustomCheapAsMoveHandling,
                                     FeatureFPARMv8,
+                                    FeatureFuseAES,
                                    FeatureNEON,
                                    FeaturePerfMon,
                                    FeaturePostRAScheduler,
@@ -256,7 +282,8 @@ def ProcKryo : SubtargetFeature<"kryo", "ARMProcFamily", "Kryo",
                                 FeaturePerfMon,
                                 FeaturePostRAScheduler,
                                 FeaturePredictableSelectIsExpensive,
-                                FeatureZCZeroing
+                                FeatureZCZeroing,
+                                FeatureLSLFast
                                 ]>;

 def ProcFalkor : SubtargetFeature<"falkor", "ARMProcFamily", "Falkor",
@@ -269,19 +296,66 @@ def ProcFalkor : SubtargetFeature<"falkor", "ARMProcFamily", "Falkor",
                                   FeaturePerfMon,
                                   FeaturePostRAScheduler,
                                   FeaturePredictableSelectIsExpensive,
-                                  FeatureZCZeroing
+                                  FeatureRDM,
+                                  FeatureZCZeroing,
+                                  FeatureLSLFast
                                   ]>;

-def ProcVulcan : SubtargetFeature<"vulcan", "ARMProcFamily", "Vulcan",
-                                  "Broadcom Vulcan processors", [
-                                  FeatureCRC,
-                                  FeatureCrypto,
-                                  FeatureFPARMv8,
-                                  FeatureArithmeticBccFusion,
-                                  FeatureNEON,
-                                  FeaturePostRAScheduler,
-                                  FeaturePredictableSelectIsExpensive,
-                                  HasV8_1aOps]>;
+def ProcThunderX2T99 : SubtargetFeature<"thunderx2t99", "ARMProcFamily",
+                                        "ThunderX2T99",
+                                        "Cavium ThunderX2 processors", [
+                                        FeatureCRC,
+                                        FeatureCrypto,
+                                        FeatureFPARMv8,
+                                        FeatureArithmeticBccFusion,
+                                        FeatureNEON,
+                                        FeaturePostRAScheduler,
+                                        FeaturePredictableSelectIsExpensive,
+                                        FeatureLSE,
+                                        HasV8_1aOps]>;
+
+def ProcThunderX : SubtargetFeature<"thunderx", "ARMProcFamily", "ThunderX",
+                                    "Cavium ThunderX processors", [
+                                    FeatureCRC,
+                                    FeatureCrypto,
+                                    FeatureFPARMv8,
+                                    FeaturePerfMon,
+                                    FeaturePostRAScheduler,
+                                    FeaturePredictableSelectIsExpensive,
+                                    FeatureNEON]>;
+
+def ProcThunderXT88 : SubtargetFeature<"thunderxt88", "ARMProcFamily",
+                                       "ThunderXT88",
+                                       "Cavium ThunderX processors", [
+                                       FeatureCRC,
+                                       FeatureCrypto,
+                                       FeatureFPARMv8,
+                                       FeaturePerfMon,
+                                       FeaturePostRAScheduler,
+                                       FeaturePredictableSelectIsExpensive,
+                                       FeatureNEON]>;
+
+def ProcThunderXT81 : SubtargetFeature<"thunderxt81", "ARMProcFamily",
+                                       "ThunderXT81",
+                                       "Cavium ThunderX processors", [
+                                       FeatureCRC,
+                                       FeatureCrypto,
+                                       FeatureFPARMv8,
+                                       FeaturePerfMon,
+                                       FeaturePostRAScheduler,
+                                       FeaturePredictableSelectIsExpensive,
+                                       FeatureNEON]>;
+
+def ProcThunderXT83 : SubtargetFeature<"thunderxt83", "ARMProcFamily",
+                                       "ThunderXT83",
+                                       "Cavium ThunderX processors", [
+                                       FeatureCRC,
+                                       FeatureCrypto,
+                                       FeatureFPARMv8,
+                                       FeaturePerfMon,
+                                       FeaturePostRAScheduler,
+                                       FeaturePredictableSelectIsExpensive,
+                                       FeatureNEON]>;

 def : ProcessorModel<"generic", NoSchedModel, [
   FeatureCRC,
@@ -291,11 +365,11 @@ def : ProcessorModel<"generic", NoSchedModel, [
   FeaturePostRAScheduler
   ]>;

-// FIXME: Cortex-A35 is currently modelled as a Cortex-A53
+// FIXME: Cortex-A35 is currently modeled as a Cortex-A53.
 def : ProcessorModel<"cortex-a35", CortexA53Model, [ProcA35]>;
 def : ProcessorModel<"cortex-a53", CortexA53Model, [ProcA53]>;
 def : ProcessorModel<"cortex-a57", CortexA57Model, [ProcA57]>;
-// FIXME: Cortex-A72 and Cortex-A73 are currently modelled as an Cortex-A57.
+// FIXME: Cortex-A72 and Cortex-A73 are currently modeled as a Cortex-A57.
 def : ProcessorModel<"cortex-a72", CortexA57Model, [ProcA72]>;
 def : ProcessorModel<"cortex-a73", CortexA57Model, [ProcA73]>;
 def : ProcessorModel<"cyclone", CycloneModel, [ProcCyclone]>;
@@ -304,7 +378,13 @@ def : ProcessorModel<"exynos-m2", ExynosM1Model, [ProcExynosM2]>;
 def : ProcessorModel<"exynos-m3", ExynosM1Model, [ProcExynosM2]>;
 def : ProcessorModel<"falkor", FalkorModel, [ProcFalkor]>;
 def : ProcessorModel<"kryo", KryoModel, [ProcKryo]>;
-def : ProcessorModel<"vulcan", VulcanModel, [ProcVulcan]>;
+// Cavium ThunderX/ThunderX T8X Processors
+def : ProcessorModel<"thunderx", ThunderXT8XModel, [ProcThunderX]>;
+def : ProcessorModel<"thunderxt88", ThunderXT8XModel, [ProcThunderXT88]>;
+def : ProcessorModel<"thunderxt81", ThunderXT8XModel, [ProcThunderXT81]>;
+def : ProcessorModel<"thunderxt83", ThunderXT8XModel, [ProcThunderXT83]>;
+// Cavium ThunderX2T9X Processors. Formerly Broadcom Vulcan.
+def : ProcessorModel<"thunderx2t99", ThunderX2T99Model, [ProcThunderX2T99]>;

//===----------------------------------------------------------------------===//
// Assembly parser
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp b/contrib/llvm/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp
index 0aa597bcdc56..4a7e0b2b803e 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp
@@ -493,43 +493,30 @@ bool AArch64A57FPLoadBalancing::colorChainSet(std::vector<Chain*> GV,
 int AArch64A57FPLoadBalancing::scavengeRegister(Chain *G, Color C,
                                                 MachineBasicBlock &MBB) {
-  RegScavenger RS;
-  RS.enterBasicBlock(MBB);
-  RS.forward(MachineBasicBlock::iterator(G->getStart()));
-
   // Can we find an appropriate register that is available throughout the life
-  // of the chain?
-  unsigned RegClassID = G->getStart()->getDesc().OpInfo[0].RegClass;
-  BitVector AvailableRegs = RS.getRegsAvailable(TRI->getRegClass(RegClassID));
-  for (MachineBasicBlock::iterator I = G->begin(), E = G->end(); I != E; ++I) {
-    RS.forward(I);
-    AvailableRegs &= RS.getRegsAvailable(TRI->getRegClass(RegClassID));
-
-    // Remove any registers clobbered by a regmask or any def register that is
-    // immediately dead.
-    for (auto J : I->operands()) {
-      if (J.isRegMask())
-        AvailableRegs.clearBitsNotInMask(J.getRegMask());
-
-      if (J.isReg() && J.isDef()) {
-        MCRegAliasIterator AI(J.getReg(), TRI, /*IncludeSelf=*/true);
-        if (J.isDead())
-          for (; AI.isValid(); ++AI)
-            AvailableRegs.reset(*AI);
-#ifndef NDEBUG
-        else
-          for (; AI.isValid(); ++AI)
-            assert(!AvailableRegs[*AI] &&
-                   "Non-dead def should have been removed by now!");
-#endif
-      }
-    }
+  // of the chain? Simulate liveness backwards until the end of the chain.
+  LiveRegUnits Units(*TRI);
+  Units.addLiveOuts(MBB);
+  MachineBasicBlock::iterator I = MBB.end();
+  MachineBasicBlock::iterator ChainEnd = G->end();
+  while (I != ChainEnd) {
+    --I;
+    Units.stepBackward(*I);
   }

+  // Check which register units are alive throughout the chain.
+  MachineBasicBlock::iterator ChainBegin = G->begin();
+  assert(ChainBegin != ChainEnd && "Chain should contain instructions");
+  do {
+    --I;
+    Units.accumulateBackward(*I);
+  } while (I != ChainBegin);
+
+  // Make sure we allocate in-order, to get the cheapest registers first.
+  unsigned RegClassID = ChainBegin->getDesc().OpInfo[0].RegClass;
   auto Ord = RCI.getOrder(TRI->getRegClass(RegClassID));
   for (auto Reg : Ord) {
-    if (!AvailableRegs[Reg])
+    if (!Units.available(Reg))
       continue;
     if (C == getColor(Reg))
       return Reg;
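Aside (not part of the patch): the scavengeRegister rewrite above replaces a forward RegScavenger walk with LLVM's LiveRegUnits used backwards: stepBackward() walks from the block's live-outs up to the chain's end, then accumulateBackward() unions liveness across the chain, so a register qualifies only if it is free at every point of the chain. A minimal standalone model of those two operations, with toy instructions standing in for MachineInstrs:

    #include <cstddef>
    #include <iostream>
    #include <set>
    #include <vector>

    // Toy stand-in for a MachineInstr: the registers it defines and uses.
    struct Inst {
      std::set<int> Defs, Uses;
    };

    // Models LiveRegUnits::stepBackward: moving above an instruction,
    // its defs die and its uses become live.
    void stepBackward(std::set<int> &Live, const Inst &I) {
      for (int R : I.Defs) Live.erase(R);
      for (int R : I.Uses) Live.insert(R);
    }

    // Models LiveRegUnits::accumulateBackward: only ever adds, so the
    // set grows into "live at any point in the walked range".
    void accumulateBackward(std::set<int> &Live, const Inst &I) {
      for (int R : I.Defs) Live.insert(R);
      for (int R : I.Uses) Live.insert(R);
    }

    int main() {
      // Block i0..i3; the "chain" covers i1..i2; r7 is live-out.
      std::vector<Inst> MBB = {
          {{1}, {}},  // i0: def r1
          {{2}, {1}}, // i1: def r2, use r1   (chain begin)
          {{3}, {2}}, // i2: def r3, use r2   (last chain instruction)
          {{7}, {3}}, // i3: def r7, use r3
      };
      std::set<int> Live = {7}; // addLiveOuts(MBB)
      std::size_t I = MBB.size();
      while (I != 3) // step down to the chain's end (just below i2)
        stepBackward(Live, MBB[--I]);
      do {           // then union liveness across the whole chain
        accumulateBackward(Live, MBB[--I]);
      } while (I != 1);
      for (int R = 0; R < 8; ++R)
        if (!Live.count(R))
          std::cout << "r" << R << " is free across the chain\n";
    }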
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64AddressTypePromotion.cpp b/contrib/llvm/lib/Target/AArch64/AArch64AddressTypePromotion.cpp
index 0cbb2db1134a..e1b8ee6d03c3 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64AddressTypePromotion.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64AddressTypePromotion.cpp
@@ -31,16 +31,23 @@
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
-#include "llvm/IR/Module.h"
 #include "llvm/IR/Operator.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
 #include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
+#include <cassert>

 using namespace llvm;
@@ -59,12 +66,12 @@ EnableMerge("aarch64-type-promotion-merge", cl::Hidden,
//===----------------------------------------------------------------------===//

 namespace {
-class AArch64AddressTypePromotion : public FunctionPass {
+class AArch64AddressTypePromotion : public FunctionPass {
 public:
   static char ID;
-  AArch64AddressTypePromotion()
-      : FunctionPass(ID), Func(nullptr), ConsideredSExtType(nullptr) {
+
+  AArch64AddressTypePromotion() : FunctionPass(ID) {
     initializeAArch64AddressTypePromotionPass(*PassRegistry::getPassRegistry());
   }
@@ -76,10 +83,11 @@ public:
 private:
   /// The current function.
-  Function *Func;
+  Function *Func = nullptr;
+
   /// Filter out all sexts that does not have this type.
   /// Currently initialized with Int64Ty.
-  Type *ConsideredSExtType;
+  Type *ConsideredSExtType = nullptr;

   // This transformation requires dominator info.
   void getAnalysisUsage(AnalysisUsage &AU) const override {
@@ -129,7 +137,8 @@ private:
   void mergeSExts(ValueToInsts &ValToSExtendedUses,
                   SetOfInstructions &ToRemove);
 };
-} // end anonymous namespace.
+
+} // end anonymous namespace

 char AArch64AddressTypePromotion::ID = 0;
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64CallLowering.cpp b/contrib/llvm/lib/Target/AArch64/AArch64CallLowering.cpp
index a4950af32097..b2f55a7e1e09 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64CallLowering.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64CallLowering.cpp
@@ -1,4 +1,4 @@
-//===-- llvm/lib/Target/AArch64/AArch64CallLowering.cpp - Call lowering ---===//
+//===--- AArch64CallLowering.cpp - Call lowering --------------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -15,15 +15,36 @@
 #include "AArch64CallLowering.h"
 #include "AArch64ISelLowering.h"
-
+#include "AArch64MachineFunctionInfo.h"
+#include "AArch64Subtarget.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/CodeGen/Analysis.h"
+#include "llvm/CodeGen/CallingConvLower.h"
 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
-#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
 #include "llvm/CodeGen/GlobalISel/Utils.h"
+#include "llvm/CodeGen/LowLevelType.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <iterator>

 using namespace llvm;

 #ifndef LLVM_BUILD_GLOBAL_ISEL
@@ -31,12 +52,12 @@ using namespace llvm;
 #endif

 AArch64CallLowering::AArch64CallLowering(const AArch64TargetLowering &TLI)
-  : CallLowering(&TLI) {
-}
+  : CallLowering(&TLI) {}

 struct IncomingArgHandler : public CallLowering::ValueHandler {
-  IncomingArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI)
-      : ValueHandler(MIRBuilder, MRI) {}
+  IncomingArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
+                     CCAssignFn *AssignFn)
+      : ValueHandler(MIRBuilder, MRI, AssignFn), StackUsed(0) {}

   unsigned getStackAddress(uint64_t Size, int64_t Offset,
                            MachinePointerInfo &MPO) override {
@@ -45,6 +66,7 @@ struct IncomingArgHandler : public CallLowering::ValueHandler {
     MPO = MachinePointerInfo::getFixedStack(MIRBuilder.getMF(), FI);
     unsigned AddrReg = MRI.createGenericVirtualRegister(LLT::pointer(0, 64));
     MIRBuilder.buildFrameIndex(AddrReg, FI);
+    StackUsed = std::max(StackUsed, Size + Offset);
     return AddrReg;
   }
@@ -67,11 +89,14 @@ struct IncomingArgHandler : public CallLowering::ValueHandler {
   /// parameters (it's a basic-block live-in), and a call instruction
   /// (it's an implicit-def of the BL).
   virtual void markPhysRegUsed(unsigned PhysReg) = 0;
+
+  uint64_t StackUsed;
 };

 struct FormalArgHandler : public IncomingArgHandler {
-  FormalArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI)
-      : IncomingArgHandler(MIRBuilder, MRI) {}
+  FormalArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
+                   CCAssignFn *AssignFn)
+      : IncomingArgHandler(MIRBuilder, MRI, AssignFn) {}

   void markPhysRegUsed(unsigned PhysReg) override {
     MIRBuilder.getMBB().addLiveIn(PhysReg);
@@ -80,8 +105,8 @@ struct FormalArgHandler : public IncomingArgHandler {

 struct CallReturnHandler : public IncomingArgHandler {
   CallReturnHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
-                    MachineInstrBuilder MIB)
-      : IncomingArgHandler(MIRBuilder, MRI), MIB(MIB) {}
+                    MachineInstrBuilder MIB, CCAssignFn *AssignFn)
+      : IncomingArgHandler(MIRBuilder, MRI, AssignFn), MIB(MIB) {}

   void markPhysRegUsed(unsigned PhysReg) override {
     MIB.addDef(PhysReg, RegState::Implicit);
@@ -92,8 +117,10 @@ struct CallReturnHandler : public IncomingArgHandler {

 struct OutgoingArgHandler : public CallLowering::ValueHandler {
   OutgoingArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
-                     MachineInstrBuilder MIB)
-      : ValueHandler(MIRBuilder, MRI), MIB(MIB) {}
+                     MachineInstrBuilder MIB, CCAssignFn *AssignFn,
+                     CCAssignFn *AssignFnVarArg)
+      : ValueHandler(MIRBuilder, MRI, AssignFn), MIB(MIB),
+        AssignFnVarArg(AssignFnVarArg), StackSize(0) {}

   unsigned getStackAddress(uint64_t Size, int64_t Offset,
                            MachinePointerInfo &MPO) override {
@@ -126,14 +153,29 @@ struct OutgoingArgHandler : public CallLowering::ValueHandler {
     MIRBuilder.buildStore(ValVReg, Addr, *MMO);
   }

+  bool assignArg(unsigned ValNo, MVT ValVT, MVT LocVT,
+                 CCValAssign::LocInfo LocInfo,
+                 const CallLowering::ArgInfo &Info,
+                 CCState &State) override {
+    bool Res;
+    if (Info.IsFixed)
+      Res = AssignFn(ValNo, ValVT, LocVT, LocInfo, Info.Flags, State);
+    else
+      Res = AssignFnVarArg(ValNo, ValVT, LocVT, LocInfo, Info.Flags, State);
+
+    StackSize = State.getNextStackOffset();
+    return Res;
+  }
+
   MachineInstrBuilder MIB;
+  CCAssignFn *AssignFnVarArg;
+  uint64_t StackSize;
 };

-void AArch64CallLowering::splitToValueTypes(const ArgInfo &OrigArg,
-                                            SmallVectorImpl<ArgInfo> &SplitArgs,
-                                            const DataLayout &DL,
-                                            MachineRegisterInfo &MRI,
-                                            SplitArgTy PerformArgSplit) const {
+void AArch64CallLowering::splitToValueTypes(
+    const ArgInfo &OrigArg, SmallVectorImpl<ArgInfo> &SplitArgs,
+    const DataLayout &DL, MachineRegisterInfo &MRI,
+    const SplitArgTy &PerformArgSplit) const {
   const AArch64TargetLowering &TLI = *getTLI<AArch64TargetLowering>();
   LLVMContext &Ctx = OrigArg.Ty->getContext();
@@ -145,7 +187,7 @@ void AArch64CallLowering::splitToValueTypes(const ArgInfo &OrigArg,
     // No splitting to do, but we want to replace the original type (e.g. [1 x
     // double] -> double).
     SplitArgs.emplace_back(OrigArg.Reg, SplitVTs[0].getTypeForEVT(Ctx),
-                           OrigArg.Flags);
+                           OrigArg.Flags, OrigArg.IsFixed);
     return;
   }
@@ -154,19 +196,12 @@ void AArch64CallLowering::splitToValueTypes(const ArgInfo &OrigArg,
     // FIXME: set split flags if they're actually used (e.g. i128 on AAPCS).
     Type *SplitTy = SplitVT.getTypeForEVT(Ctx);
     SplitArgs.push_back(
-        ArgInfo{MRI.createGenericVirtualRegister(LLT{*SplitTy, DL}), SplitTy,
-                OrigArg.Flags});
+        ArgInfo{MRI.createGenericVirtualRegister(getLLTForType(*SplitTy, DL)),
+                SplitTy, OrigArg.Flags, OrigArg.IsFixed});
   }

-  SmallVector<uint64_t, 4> BitOffsets;
-  for (auto Offset : Offsets)
-    BitOffsets.push_back(Offset * 8);
-
-  SmallVector<unsigned, 8> SplitRegs;
-  for (auto I = &SplitArgs[FirstRegIdx]; I != SplitArgs.end(); ++I)
-    SplitRegs.push_back(I->Reg);
-
-  PerformArgSplit(SplitRegs, BitOffsets);
+  for (unsigned i = 0; i < Offsets.size(); ++i)
+    PerformArgSplit(SplitArgs[FirstRegIdx + i].Reg, Offsets[i] * 8);
 }

 bool AArch64CallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
@@ -184,16 +219,16 @@ bool AArch64CallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
     auto &DL = F.getParent()->getDataLayout();

     ArgInfo OrigArg{VReg, Val->getType()};
-    setArgFlags(OrigArg, AttributeSet::ReturnIndex, DL, F);
+    setArgFlags(OrigArg, AttributeList::ReturnIndex, DL, F);

     SmallVector<ArgInfo, 8> SplitArgs;
     splitToValueTypes(OrigArg, SplitArgs, DL, MRI,
-                      [&](ArrayRef<unsigned> Regs, ArrayRef<uint64_t> Offsets) {
-                        MIRBuilder.buildExtract(Regs, Offsets, VReg);
+                      [&](unsigned Reg, uint64_t Offset) {
+                        MIRBuilder.buildExtract(Reg, VReg, Offset);
                       });

-    OutgoingArgHandler Handler(MIRBuilder, MRI, MIB);
-    Success = handleAssignments(MIRBuilder, AssignFn, SplitArgs, Handler);
+    OutgoingArgHandler Handler(MIRBuilder, MRI, MIB, AssignFn, AssignFn);
+    Success = handleAssignments(MIRBuilder, SplitArgs, Handler);
   }

   MIRBuilder.insertInstr(MIB);
@@ -203,7 +238,6 @@ bool AArch64CallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
 bool AArch64CallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
                                                const Function &F,
                                                ArrayRef<unsigned> VRegs) const {
-  auto &Args = F.getArgumentList();
   MachineFunction &MF = MIRBuilder.getMF();
   MachineBasicBlock &MBB = MIRBuilder.getMBB();
   MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -211,13 +245,27 @@ bool AArch64CallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,

   SmallVector<ArgInfo, 8> SplitArgs;
   unsigned i = 0;
-  for (auto &Arg : Args) {
+  for (auto &Arg : F.args()) {
     ArgInfo OrigArg{VRegs[i], Arg.getType()};
     setArgFlags(OrigArg, i + 1, DL, F);
+    bool Split = false;
+    LLT Ty = MRI.getType(VRegs[i]);
+    unsigned Dst = VRegs[i];
+
     splitToValueTypes(OrigArg, SplitArgs, DL, MRI,
-                      [&](ArrayRef<unsigned> Regs, ArrayRef<uint64_t> Offsets) {
-                        MIRBuilder.buildSequence(VRegs[i], Regs, Offsets);
+                      [&](unsigned Reg, uint64_t Offset) {
+                        if (!Split) {
+                          Split = true;
+                          Dst = MRI.createGenericVirtualRegister(Ty);
+                          MIRBuilder.buildUndef(Dst);
+                        }
+                        unsigned Tmp = MRI.createGenericVirtualRegister(Ty);
+                        MIRBuilder.buildInsert(Tmp, Dst, Reg, Offset);
+                        Dst = Tmp;
                       });
+
+    if (Dst != VRegs[i])
+      MIRBuilder.buildCopy(VRegs[i], Dst);
     ++i;
   }
@@ -228,10 +276,25 @@ bool AArch64CallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
   CCAssignFn *AssignFn =
       TLI.CCAssignFnForCall(F.getCallingConv(), /*IsVarArg=*/false);

-  FormalArgHandler Handler(MIRBuilder, MRI);
-  if (!handleAssignments(MIRBuilder, AssignFn, SplitArgs, Handler))
+  FormalArgHandler Handler(MIRBuilder, MRI, AssignFn);
+  if (!handleAssignments(MIRBuilder, SplitArgs, Handler))
     return false;

+  if (F.isVarArg()) {
+    if (!MF.getSubtarget<AArch64Subtarget>().isTargetDarwin()) {
+      // FIXME: we need to reimplement saveVarArgsRegisters from
+      // AArch64ISelLowering.
+      return false;
+    }
+
+    // We currently pass all varargs at 8-byte alignment.
+    uint64_t StackOffset = alignTo(Handler.StackUsed, 8);
+
+    auto &MFI = MIRBuilder.getMF().getFrameInfo();
+    AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
+    FuncInfo->setVarArgsStackIndex(MFI.CreateFixedObject(4, StackOffset, true));
+  }
+
   // Move back to the end of the basic block.
   MIRBuilder.setMBB(MBB);
@@ -239,6 +302,7 @@ bool AArch64CallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
 }

 bool AArch64CallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
+                                    CallingConv::ID CallConv,
                                     const MachineOperand &Callee,
                                     const ArgInfo &OrigRet,
                                     ArrayRef<ArgInfo> OrigArgs) const {
@@ -250,21 +314,25 @@ bool AArch64CallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
   SmallVector<ArgInfo, 8> SplitArgs;
   for (auto &OrigArg : OrigArgs) {
     splitToValueTypes(OrigArg, SplitArgs, DL, MRI,
-                      [&](ArrayRef<unsigned> Regs, ArrayRef<uint64_t> Offsets) {
-                        MIRBuilder.buildExtract(Regs, Offsets, OrigArg.Reg);
+                      [&](unsigned Reg, uint64_t Offset) {
+                        MIRBuilder.buildExtract(Reg, OrigArg.Reg, Offset);
                       });
   }

   // Find out which ABI gets to decide where things go.
   const AArch64TargetLowering &TLI = *getTLI<AArch64TargetLowering>();
-  CCAssignFn *CallAssignFn =
-      TLI.CCAssignFnForCall(F.getCallingConv(), /*IsVarArg=*/false);
+  CCAssignFn *AssignFnFixed =
+      TLI.CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
+  CCAssignFn *AssignFnVarArg =
+      TLI.CCAssignFnForCall(CallConv, /*IsVarArg=*/true);
+
+  auto CallSeqStart = MIRBuilder.buildInstr(AArch64::ADJCALLSTACKDOWN);

   // Create a temporarily-floating call instruction so we can add the implicit
   // uses of arg registers.
   auto MIB = MIRBuilder.buildInstrNoInsert(Callee.isReg() ? AArch64::BLR
                                                           : AArch64::BL);
-  MIB.addOperand(Callee);
+  MIB.add(Callee);

   // Tell the call which registers are clobbered.
   auto TRI = MF.getSubtarget().getRegisterInfo();
@@ -272,8 +340,9 @@ bool AArch64CallLowering::lowerCall(MachineIRBuilder &MIRBuilder,

   // Do the actual argument marshalling.
   SmallVector<unsigned, 8> PhysRegs;
-  OutgoingArgHandler Handler(MIRBuilder, MRI, MIB);
-  if (!handleAssignments(MIRBuilder, CallAssignFn, SplitArgs, Handler))
+  OutgoingArgHandler Handler(MIRBuilder, MRI, MIB, AssignFnFixed,
+                             AssignFnVarArg);
+  if (!handleAssignments(MIRBuilder, SplitArgs, Handler))
     return false;

   // Now we can add the actual call instruction to the correct basic block.
@@ -298,20 +367,23 @@ bool AArch64CallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
     SmallVector<uint64_t, 8> RegOffsets;
     SmallVector<unsigned, 8> SplitRegs;
     splitToValueTypes(OrigRet, SplitArgs, DL, MRI,
-                      [&](ArrayRef<unsigned> Regs, ArrayRef<uint64_t> Offsets) {
-                        std::copy(Offsets.begin(), Offsets.end(),
-                                  std::back_inserter(RegOffsets));
-                        std::copy(Regs.begin(), Regs.end(),
-                                  std::back_inserter(SplitRegs));
+                      [&](unsigned Reg, uint64_t Offset) {
+                        RegOffsets.push_back(Offset);
+                        SplitRegs.push_back(Reg);
                       });

-    CallReturnHandler Handler(MIRBuilder, MRI, MIB);
-    if (!handleAssignments(MIRBuilder, RetAssignFn, SplitArgs, Handler))
+    CallReturnHandler Handler(MIRBuilder, MRI, MIB, RetAssignFn);
+    if (!handleAssignments(MIRBuilder, SplitArgs, Handler))
       return false;

     if (!RegOffsets.empty())
       MIRBuilder.buildSequence(OrigRet.Reg, SplitRegs, RegOffsets);
   }

+  CallSeqStart.addImm(Handler.StackSize);
+  MIRBuilder.buildInstr(AArch64::ADJCALLSTACKUP)
+      .addImm(Handler.StackSize)
+      .addImm(0);
+
   return true;
 }
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64CallLowering.h b/contrib/llvm/lib/Target/AArch64/AArch64CallLowering.h
index ce6676249df6..d96ce95c4de0 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64CallLowering.h
+++ b/contrib/llvm/lib/Target/AArch64/AArch64CallLowering.h
@@ -1,4 +1,4 @@
-//===-- llvm/lib/Target/AArch64/AArch64CallLowering.h - Call lowering -----===//
+//===--- AArch64CallLowering.h - Call lowering ------------------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,18 +12,20 @@
 ///
//===----------------------------------------------------------------------===//

-#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64CALLLOWERING
-#define LLVM_LIB_TARGET_AARCH64_AARCH64CALLLOWERING
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64CALLLOWERING_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64CALLLOWERING_H

+#include "llvm/ADT/ArrayRef.h"
 #include "llvm/CodeGen/GlobalISel/CallLowering.h"
-#include "llvm/CodeGen/ValueTypes.h"
+#include <cstdint>
+#include <functional>

 namespace llvm {

 class AArch64TargetLowering;

 class AArch64CallLowering: public CallLowering {
- public:
+public:
   AArch64CallLowering(const AArch64TargetLowering &TLI);

   bool lowerReturn(MachineIRBuilder &MIRBuiler, const Value *Val,
@@ -32,8 +34,8 @@ class AArch64CallLowering: public CallLowering {
   bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F,
                             ArrayRef<unsigned> VRegs) const override;

-  bool lowerCall(MachineIRBuilder &MIRBuilder, const MachineOperand &Callee,
-                 const ArgInfo &OrigRet,
+  bool lowerCall(MachineIRBuilder &MIRBuilder, CallingConv::ID CallConv,
+                 const MachineOperand &Callee, const ArgInfo &OrigRet,
                  ArrayRef<ArgInfo> OrigArgs) const override;

 private:
@@ -44,13 +46,14 @@ private:
   typedef std::function<void(MachineIRBuilder &, int, CCValAssign &)>
       MemHandler;

-  typedef std::function<void(ArrayRef<unsigned>, ArrayRef<uint64_t>)>
-      SplitArgTy;
+  typedef std::function<void(unsigned, uint64_t)> SplitArgTy;

   void splitToValueTypes(const ArgInfo &OrigArgInfo,
                          SmallVectorImpl<ArgInfo> &SplitArgs,
                          const DataLayout &DL, MachineRegisterInfo &MRI,
-                         SplitArgTy SplitArg) const;
+                         const SplitArgTy &SplitArg) const;
 };
-} // End of namespace llvm;
-#endif
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_AARCH64_AARCH64CALLLOWERING_H
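Aside (not part of the patch): the header change above also switches SplitArgTy from a callback taking parallel ArrayRefs (all part registers plus all offsets at once) to one invoked once per split part with a (register, bit offset) pair, which is what lets the .cpp drop its BitOffsets/SplitRegs staging vectors. A standalone sketch of the new shape, with toy types rather than LLVM's:

    #include <cstddef>
    #include <cstdint>
    #include <functional>
    #include <iostream>
    #include <vector>

    // Per-part callback, as in the new SplitArgTy typedef above.
    using SplitArgTy = std::function<void(unsigned Reg, uint64_t BitOffset)>;

    // The splitter now drives the callback once per part instead of
    // handing over two parallel arrays.
    void splitToValueTypes(const std::vector<unsigned> &PartRegs,
                           const std::vector<uint64_t> &ByteOffsets,
                           const SplitArgTy &PerformArgSplit) {
      for (std::size_t i = 0; i < PartRegs.size(); ++i)
        PerformArgSplit(PartRegs[i], ByteOffsets[i] * 8); // bytes -> bits
    }

    int main() {
      // e.g. a [2 x i64] argument split into two 64-bit parts.
      splitToValueTypes({100, 101}, {0, 8}, [](unsigned Reg, uint64_t Off) {
        std::cout << "extract %" << Reg << " at bit " << Off << '\n';
      });
    }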
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp b/contrib/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp
index 8b186328d125..2dfcd2d1c393 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp
@@ -265,10 +265,10 @@ void AArch64ConditionOptimizer::modifyCmp(MachineInstr *CmpMI,

   // Change immediate in comparison instruction (ADDS or SUBS).
   BuildMI(*MBB, CmpMI, CmpMI->getDebugLoc(), TII->get(Opc))
-      .addOperand(CmpMI->getOperand(0))
-      .addOperand(CmpMI->getOperand(1))
+      .add(CmpMI->getOperand(0))
+      .add(CmpMI->getOperand(1))
       .addImm(Imm)
-      .addOperand(CmpMI->getOperand(3));
+      .add(CmpMI->getOperand(3));
   CmpMI->eraseFromParent();

   // The fact that this comparison was picked ensures that it's related to the
@@ -278,7 +278,7 @@ void AArch64ConditionOptimizer::modifyCmp(MachineInstr *CmpMI,
   // Change condition in branch instruction.
   BuildMI(*MBB, BrMI, BrMI.getDebugLoc(), TII->get(AArch64::Bcc))
       .addImm(Cmp)
-      .addOperand(BrMI.getOperand(1));
+      .add(BrMI.getOperand(1));
   BrMI.eraseFromParent();

   MBB->updateTerminator();
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp b/contrib/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp
index da09b36cac9c..00a0111f2bd2 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp
@@ -594,7 +594,7 @@ void SSACCmpConv::convert(SmallVectorImpl<MachineBasicBlock *> &RemovedBlocks) {
     // Insert a SUBS Rn, #0 instruction instead of the cbz / cbnz.
     BuildMI(*Head, Head->end(), TermDL, MCID)
         .addReg(DestReg, RegState::Define | RegState::Dead)
-        .addOperand(HeadCond[2])
+        .add(HeadCond[2])
         .addImm(0)
         .addImm(0);
     // SUBS uses the GPR*sp register classes.
@@ -650,13 +650,12 @@ void SSACCmpConv::convert(SmallVectorImpl<MachineBasicBlock *> &RemovedBlocks) {
   if (CmpMI->getOperand(FirstOp + 1).isReg())
     MRI->constrainRegClass(CmpMI->getOperand(FirstOp + 1).getReg(),
                            TII->getRegClass(MCID, 1, TRI, *MF));
-  MachineInstrBuilder MIB =
-      BuildMI(*Head, CmpMI, CmpMI->getDebugLoc(), MCID)
-          .addOperand(CmpMI->getOperand(FirstOp)); // Register Rn
+  MachineInstrBuilder MIB = BuildMI(*Head, CmpMI, CmpMI->getDebugLoc(), MCID)
+                                .add(CmpMI->getOperand(FirstOp)); // Register Rn
   if (isZBranch)
     MIB.addImm(0); // cbz/cbnz Rn -> ccmp Rn, #0
   else
-    MIB.addOperand(CmpMI->getOperand(FirstOp + 1)); // Register Rm / Immediate
+    MIB.add(CmpMI->getOperand(FirstOp + 1)); // Register Rm / Immediate
   MIB.addImm(NZCV).addImm(HeadCmpBBCC);

   // If CmpMI was a terminator, we need a new conditional branch to replace it.
@@ -666,7 +665,7 @@ void SSACCmpConv::convert(SmallVectorImpl<MachineBasicBlock *> &RemovedBlocks) {
                    CmpMI->getOpcode() == AArch64::CBNZX;
     BuildMI(*Head, CmpMI, CmpMI->getDebugLoc(), TII->get(AArch64::Bcc))
         .addImm(isNZ ? AArch64CC::NE : AArch64CC::EQ)
-        .addOperand(CmpMI->getOperand(1)); // Branch target.
+        .add(CmpMI->getOperand(1)); // Branch target.
   }
   CmpMI->eraseFromParent();
   Head->updateTerminator();
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/contrib/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index fe1c0beee0eb..d0c0956b87ca 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -17,6 +17,7 @@
 #include "MCTargetDesc/AArch64AddressingModes.h"
 #include "AArch64InstrInfo.h"
 #include "AArch64Subtarget.h"
+#include "Utils/AArch64BaseInfo.h"
 #include "llvm/CodeGen/LivePhysRegs.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -70,9 +71,9 @@ static void transferImpOps(MachineInstr &OldMI, MachineInstrBuilder &UseMI,
     const MachineOperand &MO = OldMI.getOperand(i);
     assert(MO.isReg() && MO.getReg());
     if (MO.isUse())
-      UseMI.addOperand(MO);
+      UseMI.add(MO);
     else
-      DefMI.addOperand(MO);
+      DefMI.add(MO);
   }
 }
@@ -112,7 +113,7 @@ static bool tryOrrMovk(uint64_t UImm, uint64_t OrrImm, MachineInstr &MI,
   // Create the ORR-immediate instruction.
   MachineInstrBuilder MIB =
       BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ORRXri))
-          .addOperand(MI.getOperand(0))
+          .add(MI.getOperand(0))
           .addReg(AArch64::XZR)
           .addImm(Encoding);
@@ -179,7 +180,7 @@ static bool tryToreplicateChunks(uint64_t UImm, MachineInstr &MI,
   // Create the ORR-immediate instruction.
   MachineInstrBuilder MIB =
       BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ORRXri))
-          .addOperand(MI.getOperand(0))
+          .add(MI.getOperand(0))
           .addReg(AArch64::XZR)
           .addImm(Encoding);
@@ -362,7 +363,7 @@ static bool trySequenceOfOnes(uint64_t UImm, MachineInstr &MI,
   AArch64_AM::processLogicalImmediate(OrrImm, 64, Encoding);
   MachineInstrBuilder MIB =
       BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ORRXri))
-          .addOperand(MI.getOperand(0))
+          .add(MI.getOperand(0))
           .addReg(AArch64::XZR)
           .addImm(Encoding);
@@ -425,7 +426,7 @@ bool AArch64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB,
     unsigned Opc = (BitSize == 32 ? AArch64::ORRWri : AArch64::ORRXri);
     MachineInstrBuilder MIB =
         BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc))
-            .addOperand(MI.getOperand(0))
+            .add(MI.getOperand(0))
             .addReg(BitSize == 32 ? AArch64::WZR : AArch64::XZR)
             .addImm(Encoding);
     transferImpOps(MI, MIB, MIB);
@@ -539,15 +540,15 @@ bool AArch64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB,
   if (Imm != 0) {
     unsigned LZ = countLeadingZeros(Imm);
     unsigned TZ = countTrailingZeros(Imm);
-    Shift = ((63 - LZ) / 16) * 16;
-    LastShift = (TZ / 16) * 16;
+    Shift = (TZ / 16) * 16;
+    LastShift = ((63 - LZ) / 16) * 16;
   }
   unsigned Imm16 = (Imm >> Shift) & Mask;
   bool DstIsDead = MI.getOperand(0).isDead();
   MachineInstrBuilder MIB1 =
       BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(FirstOpc))
           .addReg(DstReg, RegState::Define |
-                  getDeadRegState(DstIsDead && Shift == LastShift))
+                      getDeadRegState(DstIsDead && Shift == LastShift))
           .addImm(Imm16)
           .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, Shift));
@@ -564,15 +565,15 @@ bool AArch64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB,
   MachineInstrBuilder MIB2;
   unsigned Opc = (BitSize == 32 ? AArch64::MOVKWi : AArch64::MOVKXi);
-  while (Shift != LastShift) {
-    Shift -= 16;
+  while (Shift < LastShift) {
+    Shift += 16;
     Imm16 = (Imm >> Shift) & Mask;
     if (Imm16 == (isNeg ? Mask : 0))
       continue; // This 16-bit portion is already set correctly.
     MIB2 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc))
               .addReg(DstReg,
                       RegState::Define |
-                      getDeadRegState(DstIsDead && Shift == LastShift))
+                          getDeadRegState(DstIsDead && Shift == LastShift))
               .addReg(DstReg)
               .addImm(Imm16)
               .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, Shift));
@@ -627,7 +628,7 @@ bool AArch64ExpandPseudo::expandCMP_SWAP(
       .addReg(Addr.getReg());
   BuildMI(LoadCmpBB, DL, TII->get(CmpOp), ZeroReg)
       .addReg(Dest.getReg(), getKillRegState(Dest.isDead()))
-      .addOperand(Desired)
+      .add(Desired)
       .addImm(ExtendImm);
   BuildMI(LoadCmpBB, DL, TII->get(AArch64::Bcc))
       .addImm(AArch64CC::NE)
@@ -643,9 +644,7 @@ bool AArch64ExpandPseudo::expandCMP_SWAP(
   StoreBB->addLiveIn(New.getReg());
   addPostLoopLiveIns(StoreBB, LiveRegs);

-  BuildMI(StoreBB, DL, TII->get(StlrOp), StatusReg)
-      .addOperand(New)
-      .addOperand(Addr);
+  BuildMI(StoreBB, DL, TII->get(StlrOp), StatusReg).add(New).add(Addr);
   BuildMI(StoreBB, DL, TII->get(AArch64::CBNZW))
       .addReg(StatusReg, RegState::Kill)
       .addMBB(LoadCmpBB);
@@ -710,7 +709,7 @@ bool AArch64ExpandPseudo::expandCMP_SWAP_128(
       .addReg(Addr.getReg());
   BuildMI(LoadCmpBB, DL, TII->get(AArch64::SUBSXrs), AArch64::XZR)
       .addReg(DestLo.getReg(), getKillRegState(DestLo.isDead()))
-      .addOperand(DesiredLo)
+      .add(DesiredLo)
       .addImm(0);
   BuildMI(LoadCmpBB, DL, TII->get(AArch64::CSINCWr), StatusReg)
       .addUse(AArch64::WZR)
@@ -718,7 +717,7 @@ bool AArch64ExpandPseudo::expandCMP_SWAP_128(
       .addImm(AArch64CC::EQ);
   BuildMI(LoadCmpBB, DL, TII->get(AArch64::SUBSXrs), AArch64::XZR)
       .addReg(DestHi.getReg(), getKillRegState(DestHi.isDead()))
-      .addOperand(DesiredHi)
+      .add(DesiredHi)
       .addImm(0);
   BuildMI(LoadCmpBB, DL, TII->get(AArch64::CSINCWr), StatusReg)
       .addUse(StatusReg, RegState::Kill)
@@ -738,9 +737,9 @@ bool AArch64ExpandPseudo::expandCMP_SWAP_128(
   StoreBB->addLiveIn(NewHi.getReg());
   addPostLoopLiveIns(StoreBB, LiveRegs);
   BuildMI(StoreBB, DL, TII->get(AArch64::STLXPX), StatusReg)
-      .addOperand(NewLo)
-      .addOperand(NewHi)
-      .addOperand(Addr);
+      .add(NewLo)
+      .add(NewHi)
+      .add(Addr);
   BuildMI(StoreBB, DL, TII->get(AArch64::CBNZW))
       .addReg(StatusReg, RegState::Kill)
       .addMBB(LoadCmpBB);
@@ -825,8 +824,8 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
     MachineInstrBuilder MIB1 =
         BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opcode),
                 MI.getOperand(0).getReg())
-            .addOperand(MI.getOperand(1))
-            .addOperand(MI.getOperand(2))
+            .add(MI.getOperand(1))
+            .add(MI.getOperand(2))
             .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
     transferImpOps(MI, MIB1, MIB1);
     MI.eraseFromParent();
@@ -842,7 +841,7 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
         BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ADRP), DstReg);
     MachineInstrBuilder MIB2 =
         BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::LDRXui))
-            .addOperand(MI.getOperand(0))
+            .add(MI.getOperand(0))
             .addReg(DstReg);

     if (MO1.isGlobal()) {
@@ -878,19 +877,31 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
     unsigned DstReg = MI.getOperand(0).getReg();
     MachineInstrBuilder MIB1 =
         BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ADRP), DstReg)
-            .addOperand(MI.getOperand(1));
+            .add(MI.getOperand(1));

     MachineInstrBuilder MIB2 =
         BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ADDXri))
-            .addOperand(MI.getOperand(0))
+            .add(MI.getOperand(0))
             .addReg(DstReg)
-            .addOperand(MI.getOperand(2))
+            .add(MI.getOperand(2))
             .addImm(0);

     transferImpOps(MI, MIB1, MIB2);
     MI.eraseFromParent();
     return true;
   }
+  case AArch64::MOVbaseTLS: {
+    unsigned DstReg = MI.getOperand(0).getReg();
+    auto SysReg = AArch64SysReg::TPIDR_EL0;
+    MachineFunction *MF = MBB.getParent();
+    if (MF->getTarget().getTargetTriple().isOSFuchsia() &&
+        MF->getTarget().getCodeModel() == CodeModel::Kernel)
+      SysReg = AArch64SysReg::TPIDR_EL1;
+    BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MRS), DstReg)
+        .addImm(SysReg);
+    MI.eraseFromParent();
+    return true;
+  }
   case AArch64::MOVi32imm:
     return expandMOVImm(MBB, MBBI, 32);
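Aside (not part of the patch): the expandMOVImm hunks above reverse the MOVZ/MOVK order. The sequence now starts at the lowest 16-bit chunk containing a set bit and moves upward (Shift += 16) instead of starting at the highest chunk and moving down, still skipping chunks the initial MOVZ already zeroed. A runnable standalone model of the plain MOVZ path follows; the MOVN/negated path of the real code is omitted, and the bit-scan helpers stand in for LLVM's countLeadingZeros/countTrailingZeros.

    #include <cstdint>
    #include <iostream>

    // Model of the reversed MOVZ/MOVK walk in expandMOVImm above.
    void materialize(uint64_t Imm) {
      const uint64_t Mask = 0xFFFF;
      unsigned Shift = 0, LastShift = 0;
      if (Imm != 0) {
        unsigned TZ = 0, LZ = 0;
        while (!((Imm >> TZ) & 1)) ++TZ;                       // trailing zeros
        while (!((Imm << LZ) & 0x8000000000000000ULL)) ++LZ;   // leading zeros
        Shift = (TZ / 16) * 16;            // was: ((63 - LZ) / 16) * 16
        LastShift = ((63 - LZ) / 16) * 16; // was: (TZ / 16) * 16
      }
      std::cout << "movz x0, #0x" << std::hex << ((Imm >> Shift) & Mask)
                << ", lsl #" << std::dec << Shift << '\n';
      while (Shift < LastShift) {          // was: Shift != LastShift; Shift -= 16
        Shift += 16;
        uint64_t Imm16 = (Imm >> Shift) & Mask;
        if (Imm16 == 0)
          continue; // this 16-bit portion is already set correctly
        std::cout << "movk x0, #0x" << std::hex << Imm16
                  << ", lsl #" << std::dec << Shift << '\n';
      }
    }

    int main() {
      materialize(0x0012000000340000ULL); // movz ..., lsl #16; movk ..., lsl #48
    }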
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64FastISel.cpp b/contrib/llvm/lib/Target/AArch64/AArch64FastISel.cpp
index fe2c2d4550a7..4e5e3e43a468 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64FastISel.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64FastISel.cpp
@@ -15,28 +15,62 @@
 #include "AArch64.h"
 #include "AArch64CallingConvention.h"
+#include "AArch64RegisterInfo.h"
 #include "AArch64Subtarget.h"
-#include "AArch64TargetMachine.h"
 #include "MCTargetDesc/AArch64AddressingModes.h"
+#include "Utils/AArch64BaseInfo.h"
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/Analysis/BranchProbabilityInfo.h"
 #include "llvm/CodeGen/CallingConvLower.h"
 #include "llvm/CodeGen/FastISel.h"
 #include "llvm/CodeGen/FunctionLoweringInfo.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineConstantPool.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/CodeGen/RuntimeLibcalls.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GetElementPtrTypeIterator.h"
-#include "llvm/IR/GlobalAlias.h"
-#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Operator.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/AtomicOrdering.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CodeGen.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+#include <utility>

 using namespace llvm;

 namespace {
@@ -50,48 +84,55 @@ class AArch64FastISel final : public FastISel {
   } BaseKind;

 private:
-  BaseKind Kind;
-  AArch64_AM::ShiftExtendType ExtType;
+  BaseKind Kind = RegBase;
+  AArch64_AM::ShiftExtendType ExtType = AArch64_AM::InvalidShiftExtend;
   union {
     unsigned Reg;
     int FI;
   } Base;
-  unsigned OffsetReg;
-  unsigned Shift;
-  int64_t Offset;
-  const GlobalValue *GV;
+  unsigned OffsetReg = 0;
+  unsigned Shift = 0;
+  int64_t Offset = 0;
+  const GlobalValue *GV = nullptr;

 public:
-  Address() : Kind(RegBase), ExtType(AArch64_AM::InvalidShiftExtend),
-              OffsetReg(0), Shift(0), Offset(0), GV(nullptr) { Base.Reg = 0; }
+  Address() { Base.Reg = 0; }
+
   void setKind(BaseKind K) { Kind = K; }
   BaseKind getKind() const { return Kind; }
   void setExtendType(AArch64_AM::ShiftExtendType E) { ExtType = E; }
   AArch64_AM::ShiftExtendType getExtendType() const { return ExtType; }
   bool isRegBase() const { return Kind == RegBase; }
   bool isFIBase() const { return Kind == FrameIndexBase; }
+
   void setReg(unsigned Reg) {
     assert(isRegBase() && "Invalid base register access!");
     Base.Reg = Reg;
   }
+
   unsigned getReg() const {
     assert(isRegBase() && "Invalid base register access!");
     return Base.Reg;
   }
+
   void setOffsetReg(unsigned Reg) { OffsetReg = Reg; }
+
   unsigned getOffsetReg() const { return OffsetReg; }
+
   void setFI(unsigned FI) {
     assert(isFIBase() && "Invalid base frame index access!");
     Base.FI = FI;
   }
+
   unsigned getFI() const {
     assert(isFIBase() && "Invalid base frame index access!");
     return Base.FI;
   }
+
   void setOffset(int64_t O) { Offset = O; }
   int64_t getOffset() { return Offset; }
   void setShift(unsigned S) { Shift = S; }
@@ -417,7 +458,7 @@ unsigned AArch64FastISel::materializeGV(const GlobalValue *GV) {

   // MachO still uses GOT for large code-model accesses, but ELF requires
   // movz/movk sequences, which FastISel doesn't handle yet.
-  if (TM.getCodeModel() != CodeModel::Small && !Subtarget->isTargetMachO())
+  if (!Subtarget->useSmallAddressing() && !Subtarget->isTargetMachO())
     return 0;

   unsigned char OpFlags = Subtarget->ClassifyGlobalReference(GV, TM);
@@ -531,23 +572,23 @@ bool AArch64FastISel::computeAddress(const Value *Obj, Address &Addr, Type *Ty)
   switch (Opcode) {
   default:
     break;
-  case Instruction::BitCast: {
+  case Instruction::BitCast:
     // Look through bitcasts.
     return computeAddress(U->getOperand(0), Addr, Ty);
-  }
+
-  case Instruction::IntToPtr: {
+  case Instruction::IntToPtr:
     // Look past no-op inttoptrs.
     if (TLI.getValueType(DL, U->getOperand(0)->getType()) ==
         TLI.getPointerTy(DL))
       return computeAddress(U->getOperand(0), Addr, Ty);
     break;
-  }
+
-  case Instruction::PtrToInt: {
+  case Instruction::PtrToInt:
     // Look past no-op ptrtoints.
     if (TLI.getValueType(DL, U->getType()) == TLI.getPointerTy(DL))
       return computeAddress(U->getOperand(0), Addr, Ty);
     break;
-  }
+
   case Instruction::GetElementPtr: {
     Address SavedAddr = Addr;
     uint64_t TmpOffset = Addr.getOffset();
@@ -563,7 +604,7 @@ bool AArch64FastISel::computeAddress(const Value *Obj, Address &Addr, Type *Ty)
         TmpOffset += SL->getElementOffset(Idx);
       } else {
         uint64_t S = DL.getTypeAllocSize(GTI.getIndexedType());
-        for (;;) {
+        while (true) {
           if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {
             // Constant-offset addressing.
             TmpOffset += CI->getSExtValue() * S;
@@ -2813,8 +2854,8 @@ bool AArch64FastISel::selectIntToFP(const Instruction *I, bool Signed) {
   MVT DestVT;
   if (!isTypeLegal(I->getType(), DestVT) || DestVT.isVector())
     return false;
-  assert ((DestVT == MVT::f32 || DestVT == MVT::f64) &&
-          "Unexpected value type.");
+  assert((DestVT == MVT::f32 || DestVT == MVT::f64) &&
+         "Unexpected value type.");

   unsigned SrcReg = getRegForValue(I->getOperand(0));
   if (!SrcReg)
@@ -3106,8 +3147,8 @@ bool AArch64FastISel::fastLowerCall(CallLoweringInfo &CLI) {
     return false;

   CodeModel::Model CM = TM.getCodeModel();
-  // Only support the small and large code model.
-  if (CM != CodeModel::Small && CM != CodeModel::Large)
+  // Only support the small-addressing and large code models.
+  if (CM != CodeModel::Large && !Subtarget->useSmallAddressing())
     return false;

   // FIXME: Add large code model support for ELF.
@@ -3158,7 +3199,7 @@ bool AArch64FastISel::fastLowerCall(CallLoweringInfo &CLI) {
   // Issue the call.
   MachineInstrBuilder MIB;
-  if (CM == CodeModel::Small) {
+  if (Subtarget->useSmallAddressing()) {
     const MCInstrDesc &II = TII.get(Addr.getReg() ? AArch64::BLR : AArch64::BL);
     MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II);
     if (Symbol)
@@ -3369,8 +3410,7 @@ bool AArch64FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
     MachineFrameInfo &MFI = FuncInfo.MF->getFrameInfo();
     MFI.setFrameAddressIsTaken(true);

-    const AArch64RegisterInfo *RegInfo =
-        static_cast<const AArch64RegisterInfo *>(Subtarget->getRegisterInfo());
+    const AArch64RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
     unsigned FramePtr = RegInfo->getFrameRegister(*(FuncInfo.MF));
     unsigned SrcReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
@@ -3521,11 +3561,11 @@ bool AArch64FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
     updateValueMap(II, ResultReg);
     return true;
   }
-  case Intrinsic::trap: {
+  case Intrinsic::trap:
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::BRK))
         .addImm(1);
     return true;
-  }
+
   case Intrinsic::sqrt: {
     Type *RetTy = II->getCalledFunction()->getReturnType();
@@ -5092,8 +5132,10 @@ bool AArch64FastISel::fastSelectInstruction(const Instruction *I) {
 }

 namespace llvm {
-llvm::FastISel *AArch64::createFastISel(FunctionLoweringInfo &FuncInfo,
+
+FastISel *AArch64::createFastISel(FunctionLoweringInfo &FuncInfo,
                                   const TargetLibraryInfo *LibInfo) {
   return new AArch64FastISel(FuncInfo, LibInfo);
 }
-}
+
+} // end namespace llvm
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index f5b8c35375f8..550174b22a89 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -90,21 +90,42 @@
 #include "AArch64FrameLowering.h"
 #include "AArch64InstrInfo.h"
 #include "AArch64MachineFunctionInfo.h"
+#include "AArch64RegisterInfo.h"
 #include "AArch64Subtarget.h"
 #include "AArch64TargetMachine.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/CodeGen/LivePhysRegs.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
 #include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/CallingConv.h"
 #include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugLoc.h"
 #include "llvm/IR/Function.h"
+#include "llvm/MC/MCDwarf.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+#include <vector>

 using namespace llvm;
@@ -245,14 +266,13 @@ static unsigned findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB) {
   if (&MF->front() == MBB)
     return AArch64::X9;

-  const TargetRegisterInfo &TRI = *MF->getSubtarget().getRegisterInfo();
-  LivePhysRegs LiveRegs(&TRI);
+  const AArch64Subtarget &Subtarget = MF->getSubtarget<AArch64Subtarget>();
+  const AArch64RegisterInfo *TRI = Subtarget.getRegisterInfo();
+  LivePhysRegs LiveRegs(TRI);
   LiveRegs.addLiveIns(*MBB);

   // Mark callee saved registers as used so we will not choose them.
-  const AArch64Subtarget &Subtarget = MF->getSubtarget<AArch64Subtarget>();
-  const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
-  const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(MF);
+  const MCPhysReg *CSRegs = TRI->getCalleeSavedRegs(MF);
   for (unsigned i = 0; CSRegs[i]; ++i)
     LiveRegs.addReg(CSRegs[i]);
@@ -319,7 +339,6 @@ bool AArch64FrameLowering::shouldCombineCSRLocalStackBump(
 static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec(
     MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
     const DebugLoc &DL, const TargetInstrInfo *TII, int CSStackSizeInc) {
-
   unsigned NewOpc;
   bool NewIsUnscaled = false;
   switch (MBBI->getOpcode()) {
@@ -362,7 +381,7 @@ static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec(
   unsigned OpndIdx = 0;
   for (unsigned OpndEnd = MBBI->getNumOperands() - 1; OpndIdx < OpndEnd;
        ++OpndIdx)
-    MIB.addOperand(MBBI->getOperand(OpndIdx));
+    MIB.add(MBBI->getOperand(OpndIdx));

   assert(MBBI->getOperand(OpndIdx).getImm() == 0 &&
          "Unexpected immediate offset in first/last callee-save save/restore "
@@ -863,22 +882,26 @@ static unsigned getPrologueDeath(MachineFunction &MF, unsigned Reg) {
 static bool produceCompactUnwindFrame(MachineFunction &MF) {
   const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
-  AttributeSet Attrs = MF.getFunction()->getAttributes();
+  AttributeList Attrs = MF.getFunction()->getAttributes();
   return Subtarget.isTargetMachO() &&
          !(Subtarget.getTargetLowering()->supportSwiftError() &&
            Attrs.hasAttrSomewhere(Attribute::SwiftError));
 }

 namespace {
+
 struct RegPairInfo {
-  RegPairInfo() : Reg1(AArch64::NoRegister), Reg2(AArch64::NoRegister) {}
-  unsigned Reg1;
-  unsigned Reg2;
+  unsigned Reg1 = AArch64::NoRegister;
+  unsigned Reg2 = AArch64::NoRegister;
   int FrameIdx;
   int Offset;
   bool IsGPR;
+
+  RegPairInfo() = default;
+
   bool isPaired() const { return Reg2 != AArch64::NoRegister; }
 };
+
 } // end anonymous namespace

 static void computeCalleeSaveRegisterPairs(
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64GenRegisterBankInfo.def b/contrib/llvm/lib/Target/AArch64/AArch64GenRegisterBankInfo.def
index d472a54d9543..8b1c9740d2ad 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64GenRegisterBankInfo.def
+++ b/contrib/llvm/lib/Target/AArch64/AArch64GenRegisterBankInfo.def
@@ -16,281 +16,198 @@
 #endif

 namespace llvm {
-namespace AArch64 {
-
-const uint32_t GPRCoverageData[] = {
-    // Classes 0-31
-    (1u << AArch64::GPR32allRegClassID) | (1u << AArch64::GPR32RegClassID) |
-        (1u << AArch64::GPR32spRegClassID) |
-        (1u << AArch64::GPR32commonRegClassID) |
-        (1u << AArch64::GPR32sponlyRegClassID) |
-        (1u << AArch64::GPR64allRegClassID) | (1u << AArch64::GPR64RegClassID) |
-        (1u << AArch64::GPR64spRegClassID) |
-        (1u << AArch64::GPR64commonRegClassID) |
-        (1u << AArch64::tcGPR64RegClassID) |
-        (1u << AArch64::GPR64sponlyRegClassID),
-    // Classes 32-63
-    0,
-    // FIXME: The entries below this point can be safely removed once this is
-    // tablegenerated. It's only needed because of the hardcoded register class
-    // limit.
-    // Classes 64-96
-    0,
-    // Classes 97-128
-    0,
-    // Classes 129-160
-    0,
-    // Classes 161-192
-    0,
-    // Classes 193-224
-    0,
-};
-
-const uint32_t FPRCoverageData[] = {
-    // Classes 0-31
-    (1u << AArch64::FPR8RegClassID) | (1u << AArch64::FPR16RegClassID) |
-        (1u << AArch64::FPR32RegClassID) | (1u << AArch64::FPR64RegClassID) |
-        (1u << AArch64::DDRegClassID) | (1u << AArch64::FPR128RegClassID) |
-        (1u << AArch64::FPR128_loRegClassID) | (1u << AArch64::DDDRegClassID) |
-        (1u << AArch64::DDDDRegClassID),
-    // Classes 32-63
-    (1u << (AArch64::QQRegClassID - 32)) |
-        (1u << (AArch64::QQ_with_qsub0_in_FPR128_loRegClassID - 32)) |
-        (1u << (AArch64::QQ_with_qsub1_in_FPR128_loRegClassID - 32)) |
-        (1u
-         << (AArch64::
-                 QQQ_with_qsub1_in_FPR128_lo_and_QQQ_with_qsub2_in_FPR128_loRegClassID -
-             32)) |
-        (1u
-         << (AArch64::
-                 QQQ_with_qsub0_in_FPR128_lo_and_QQQ_with_qsub2_in_FPR128_loRegClassID -
-             32)) |
-        (1u << (AArch64::QQQQRegClassID - 32)) |
-        (1u << (AArch64::QQQQ_with_qsub0_in_FPR128_loRegClassID - 32)) |
-        (1u << (AArch64::QQQQ_with_qsub1_in_FPR128_loRegClassID - 32)) |
-        (1u << (AArch64::QQQQ_with_qsub2_in_FPR128_loRegClassID - 32)) |
-        (1u << (AArch64::QQQQ_with_qsub3_in_FPR128_loRegClassID - 32)) |
-        (1u
-         << (AArch64::
                 QQQQ_with_qsub0_in_FPR128_lo_and_QQQQ_with_qsub1_in_FPR128_loRegClassID -
-             32)) |
-        (1u
-         << (AArch64::
-                 QQQQ_with_qsub1_in_FPR128_lo_and_QQQQ_with_qsub2_in_FPR128_loRegClassID -
-             32)) |
-        (1u
-         << (AArch64::
-                 QQQQ_with_qsub2_in_FPR128_lo_and_QQQQ_with_qsub3_in_FPR128_loRegClassID -
-             32)) |
-        (1u
-         << (AArch64::
-                 QQQQ_with_qsub0_in_FPR128_lo_and_QQQQ_with_qsub2_in_FPR128_loRegClassID -
-             32)) |
-        (1u
-         << (AArch64::
-                 QQQQ_with_qsub1_in_FPR128_lo_and_QQQQ_with_qsub3_in_FPR128_loRegClassID -
-             32)) |
-        (1u
-         << (AArch64::
-                 QQQQ_with_qsub0_in_FPR128_lo_and_QQQQ_with_qsub3_in_FPR128_loRegClassID -
-             32)) |
-        (1u
-         << (AArch64::
-                 QQ_with_qsub0_in_FPR128_lo_and_QQ_with_qsub1_in_FPR128_loRegClassID -
-             32)) |
-        (1u << (AArch64::QQQRegClassID - 32)) |
-        (1u << (AArch64::QQQ_with_qsub0_in_FPR128_loRegClassID - 32)) |
-        (1u << (AArch64::QQQ_with_qsub1_in_FPR128_loRegClassID - 32)) |
-        (1u << (AArch64::QQQ_with_qsub2_in_FPR128_loRegClassID - 32)) |
-        (1u
-         << (AArch64::
-                 QQQ_with_qsub0_in_FPR128_lo_and_QQQ_with_qsub1_in_FPR128_loRegClassID -
-             32)),
-    // FIXME: The entries below this point can be safely removed once this
-    // is tablegenerated. It's only needed because of the hardcoded register
-    // class limit.
-    // Classes 64-96
-    0,
-    // Classes 97-128
-    0,
-    // Classes 129-160
-    0,
-    // Classes 161-192
-    0,
-    // Classes 193-224
-    0,
-};
-
-const uint32_t CCRCoverageData[] = {
-    // Classes 0-31
-    1u << AArch64::CCRRegClassID,
-    // Classes 32-63
-    0,
-    // FIXME: The entries below this point can be safely removed once this
-    // is tablegenerated. It's only needed because of the hardcoded register
-    // class limit.
-    // Classes 64-96
-    0,
-    // Classes 97-128
-    0,
-    // Classes 129-160
-    0,
-    // Classes 161-192
-    0,
-    // Classes 193-224
-    0,
-};
-
-RegisterBank GPRRegBank(AArch64::GPRRegBankID, "GPR", 64, GPRCoverageData);
-RegisterBank FPRRegBank(AArch64::FPRRegBankID, "FPR", 512, FPRCoverageData);
-RegisterBank CCRRegBank(AArch64::CCRRegBankID, "CCR", 32, CCRCoverageData);
-
-RegisterBank *RegBanks[] = {&GPRRegBank, &FPRRegBank, &CCRRegBank};
-
-// PartialMappings.
-enum PartialMappingIdx {
-  PMI_None = -1,
-  PMI_GPR32 = 1,
-  PMI_GPR64,
-  PMI_FPR32,
-  PMI_FPR64,
-  PMI_FPR128,
-  PMI_FPR256,
-  PMI_FPR512,
-  PMI_FirstGPR = PMI_GPR32,
-  PMI_LastGPR = PMI_GPR64,
-  PMI_FirstFPR = PMI_FPR32,
-  PMI_LastFPR = PMI_FPR512,
-  PMI_Min = PMI_FirstGPR,
-};
-
-static unsigned getRegBankBaseIdxOffset(unsigned Size) {
-  assert(Size && "0-sized type!!");
-  // Make anything smaller than 32 gets 32
-  Size = ((Size + 31) / 32) * 32;
-  // 32 is 0, 64 is 1, 128 is 2, and so on.
-  return Log2_32(Size) - /*Log2_32(32)=*/ 5;
-}
-
-RegisterBankInfo::PartialMapping PartMappings[] {
-    /* StartIdx, Length, RegBank */
-    // 0: GPR 32-bit value.
-    {0, 32, GPRRegBank},
-    // 1: GPR 64-bit value.
-    {0, 64, GPRRegBank},
-    // 2: FPR 32-bit value.
-    {0, 32, FPRRegBank},
-    // 3: FPR 64-bit value.
-    {0, 64, FPRRegBank},
-    // 4: FPR 128-bit value.
-    {0, 128, FPRRegBank},
-    // 5: FPR 256-bit value.
-    {0, 256, FPRRegBank},
-    // 6: FPR 512-bit value.
-    {0, 512, FPRRegBank}
-};
-
-enum ValueMappingIdx {
-  First3OpsIdx = 0,
-  Last3OpsIdx = 18,
-  DistanceBetweenRegBanks = 3,
-  FirstCrossRegCpyIdx = 21,
-  LastCrossRegCpyIdx = 27,
-  DistanceBetweenCrossRegCpy = 2
+RegisterBankInfo::PartialMapping AArch64GenRegisterBankInfo::PartMappings[]{
+    /* StartIdx, Length, RegBank */
+    // 0: FPR 32-bit value.
+    {0, 32, AArch64::FPRRegBank},
+    // 1: FPR 64-bit value.
+    {0, 64, AArch64::FPRRegBank},
+    // 2: FPR 128-bit value.
+    {0, 128, AArch64::FPRRegBank},
+    // 3: FPR 256-bit value.
+    {0, 256, AArch64::FPRRegBank},
+    // 4: FPR 512-bit value.
+    {0, 512, AArch64::FPRRegBank},
+    // 5: GPR 32-bit value.
+    {0, 32, AArch64::GPRRegBank},
+    // 6: GPR 64-bit value.
+    {0, 64, AArch64::GPRRegBank},
 };

 // ValueMappings.
-RegisterBankInfo::ValueMapping ValMappings[]{
+RegisterBankInfo::ValueMapping AArch64GenRegisterBankInfo::ValMappings[]{
     /* BreakDown, NumBreakDowns */
+    // 0: invalid
+    {nullptr, 0},
     // 3-operands instructions (all binary operations should end up with one of
     // those mapping).
-    // 0: GPR 32-bit value. <-- This must match First3OpsIdx.
-    {&PartMappings[PMI_GPR32 - PMI_Min], 1},
-    {&PartMappings[PMI_GPR32 - PMI_Min], 1},
-    {&PartMappings[PMI_GPR32 - PMI_Min], 1},
-    // 3: GPR 64-bit value.
-    {&PartMappings[PMI_GPR64 - PMI_Min], 1},
-    {&PartMappings[PMI_GPR64 - PMI_Min], 1},
-    {&PartMappings[PMI_GPR64 - PMI_Min], 1},
-    // 6: FPR 32-bit value.
-    {&PartMappings[PMI_FPR32 - PMI_Min], 1},
-    {&PartMappings[PMI_FPR32 - PMI_Min], 1},
-    {&PartMappings[PMI_FPR32 - PMI_Min], 1},
-    // 9: FPR 64-bit value.
-    {&PartMappings[PMI_FPR64 - PMI_Min], 1},
-    {&PartMappings[PMI_FPR64 - PMI_Min], 1},
-    {&PartMappings[PMI_FPR64 - PMI_Min], 1},
-    // 12: FPR 128-bit value.
-    {&PartMappings[PMI_FPR128 - PMI_Min], 1},
-    {&PartMappings[PMI_FPR128 - PMI_Min], 1},
-    {&PartMappings[PMI_FPR128 - PMI_Min], 1},
-    // 15: FPR 256-bit value.
-    {&PartMappings[PMI_FPR256 - PMI_Min], 1},
-    {&PartMappings[PMI_FPR256 - PMI_Min], 1},
-    {&PartMappings[PMI_FPR256 - PMI_Min], 1},
-    // 18: FPR 512-bit value. <-- This must match Last3OpsIdx.
-    {&PartMappings[PMI_FPR512 - PMI_Min], 1},
-    {&PartMappings[PMI_FPR512 - PMI_Min], 1},
-    {&PartMappings[PMI_FPR512 - PMI_Min], 1},
+    // 1: FPR 32-bit value. <-- This must match First3OpsIdx.
+    {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR32 - PMI_Min], 1},
+    {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR32 - PMI_Min], 1},
+    {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR32 - PMI_Min], 1},
+    // 4: FPR 64-bit value.
+    {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR64 - PMI_Min], 1},
+    {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR64 - PMI_Min], 1},
+    {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR64 - PMI_Min], 1},
+    // 7: FPR 128-bit value.
+    {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR128 - PMI_Min], 1},
+    {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR128 - PMI_Min], 1},
+    {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR128 - PMI_Min], 1},
+    // 10: FPR 256-bit value.
+    {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR256 - PMI_Min], 1},
+    {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR256 - PMI_Min], 1},
+    {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR256 - PMI_Min], 1},
+    // 13: FPR 512-bit value.
+    {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR512 - PMI_Min], 1},
+    {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR512 - PMI_Min], 1},
+    {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR512 - PMI_Min], 1},
+    // 16: GPR 32-bit value.
+    {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR32 - PMI_Min], 1},
+    {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR32 - PMI_Min], 1},
+    {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR32 - PMI_Min], 1},
+    // 19: GPR 64-bit value. <-- This must match Last3OpsIdx.
+    {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR64 - PMI_Min], 1},
+    {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR64 - PMI_Min], 1},
+    {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR64 - PMI_Min], 1},
     // Cross register bank copies.
-    // 21: GPR 32-bit value to FPR 32-bit value. <-- This must match
-    //                                               FirstCrossRegCpyIdx.
-    {&PartMappings[PMI_GPR32 - PMI_Min], 1},
-    {&PartMappings[PMI_FPR32 - PMI_Min], 1},
-    // 23: GPR 64-bit value to FPR 64-bit value.
-    {&PartMappings[PMI_GPR64 - PMI_Min], 1},
-    {&PartMappings[PMI_FPR64 - PMI_Min], 1},
-    // 25: FPR 32-bit value to GPR 32-bit value.
-    {&PartMappings[PMI_FPR32 - PMI_Min], 1},
-    {&PartMappings[PMI_GPR32 - PMI_Min], 1},
-    // 27: FPR 64-bit value to GPR 64-bit value. <-- This must match
-    //                                               LastCrossRegCpyIdx.
-    {&PartMappings[PMI_FPR64 - PMI_Min], 1},
-    {&PartMappings[PMI_GPR64 - PMI_Min], 1}
+    // 22: FPR 32-bit value to GPR 32-bit value. <-- This must match
+    //                                               FirstCrossRegCpyIdx.
+    {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR32 - PMI_Min], 1},
+    {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR32 - PMI_Min], 1},
+    // 24: FPR 64-bit value to GPR 64-bit value.
+    {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR64 - PMI_Min], 1},
+    {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR64 - PMI_Min], 1},
+    // 26: FPR 128-bit value to GPR 128-bit value (invalid)
+    {nullptr, 1},
+    {nullptr, 1},
+    // 28: FPR 256-bit value to GPR 256-bit value (invalid)
+    {nullptr, 1},
+    {nullptr, 1},
+    // 30: FPR 512-bit value to GPR 512-bit value (invalid)
+    {nullptr, 1},
+    {nullptr, 1},
+    // 32: GPR 32-bit value to FPR 32-bit value.
+    {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR32 - PMI_Min], 1},
+    {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR32 - PMI_Min], 1},
+    // 34: GPR 64-bit value to FPR 64-bit value. <-- This must match
+    //                                               LastCrossRegCpyIdx.
+    {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR64 - PMI_Min], 1},
+    {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR64 - PMI_Min], 1},
 };

-/// Get the pointer to the ValueMapping representing the RegisterBank
-/// at \p RBIdx with a size of \p Size.
-///
-/// The returned mapping works for instructions with the same kind of
-/// operands for up to 3 operands.
-///
-/// \pre \p RBIdx != PartialMappingIdx::None
+bool AArch64GenRegisterBankInfo::checkPartialMap(unsigned Idx,
+                                                 unsigned ValStartIdx,
+                                                 unsigned ValLength,
+                                                 const RegisterBank &RB) {
+  const PartialMapping &Map = PartMappings[Idx - PartialMappingIdx::PMI_Min];
+  return Map.StartIdx == ValStartIdx && Map.Length == ValLength &&
+         Map.RegBank == &RB;
+}
+
+bool AArch64GenRegisterBankInfo::checkValueMapImpl(unsigned Idx,
+                                                   unsigned FirstInBank,
+                                                   unsigned Size,
+                                                   unsigned Offset) {
+  unsigned PartialMapBaseIdx = Idx - PartialMappingIdx::PMI_Min;
+  const ValueMapping &Map =
+      AArch64GenRegisterBankInfo::getValueMapping((PartialMappingIdx)FirstInBank, Size)[Offset];
+  return Map.BreakDown == &PartMappings[PartialMapBaseIdx] &&
+         Map.NumBreakDowns == 1;
+}
+
+bool AArch64GenRegisterBankInfo::checkPartialMappingIdx(
+    PartialMappingIdx FirstAlias, PartialMappingIdx LastAlias,
+    ArrayRef<PartialMappingIdx> Order) {
+  if (Order.front() != FirstAlias)
+    return false;
+  if (Order.back() != LastAlias)
+    return false;
+  if (Order.front() > Order.back())
+    return false;
+
+  PartialMappingIdx Previous = Order.front();
+  bool First = true;
+  for (const auto &Current : Order) {
+    if (First) {
+      First = false;
+      continue;
+    }
+    if (Previous + 1 != Current)
+      return false;
+    Previous = Current;
+  }
+  return true;
+}
+
+unsigned AArch64GenRegisterBankInfo::getRegBankBaseIdxOffset(unsigned RBIdx,
+                                                             unsigned Size) {
+  if (RBIdx == PMI_FirstGPR) {
+    if (Size <= 32)
+      return 0;
+    if (Size <= 64)
+      return 1;
+    return -1;
+  }
+  if (RBIdx == PMI_FirstFPR) {
+    if (Size <= 32)
+      return 0;
+    if (Size <= 64)
+      return 1;
+    if (Size <= 128)
+      return 2;
+    if (Size <= 256)
+      return 3;
+    if (Size <= 512)
+      return 4;
+    return -1;
+  }
+  return -1;
+}
+
 const RegisterBankInfo::ValueMapping *
-getValueMapping(PartialMappingIdx RBIdx, unsigned Size) {
+AArch64GenRegisterBankInfo::getValueMapping(PartialMappingIdx RBIdx,
+                                            unsigned Size) {
   assert(RBIdx != PartialMappingIdx::PMI_None && "No mapping needed for that");
-  unsigned ValMappingIdx = First3OpsIdx +
-                           (RBIdx - AArch64::PartialMappingIdx::PMI_Min +
-                            getRegBankBaseIdxOffset(Size)) *
-                           ValueMappingIdx::DistanceBetweenRegBanks;
-  assert(ValMappingIdx >= AArch64::First3OpsIdx &&
-         ValMappingIdx <= AArch64::Last3OpsIdx && "Mapping out of bound");
+
+  unsigned BaseIdxOffset = getRegBankBaseIdxOffset(RBIdx, Size);
+  if (BaseIdxOffset == -1u)
+    return &ValMappings[InvalidIdx];
+
+  unsigned ValMappingIdx =
+      First3OpsIdx + (RBIdx - PartialMappingIdx::PMI_Min + BaseIdxOffset) *
+                         ValueMappingIdx::DistanceBetweenRegBanks;
+  assert(ValMappingIdx >= First3OpsIdx && ValMappingIdx <= Last3OpsIdx &&
+         "Mapping out of bound");

   return &ValMappings[ValMappingIdx];
 }

-/// Get the pointer to the ValueMapping of the operands of a copy
-/// instruction from a GPR or FPR register to a GPR or FPR register
-/// with a size of \p Size.
-///
-/// If \p DstIsGPR is true, the destination of the copy is on GPR,
-/// otherwise it is on FPR. Same thing for \p SrcIsGPR.
+AArch64GenRegisterBankInfo::PartialMappingIdx
+    AArch64GenRegisterBankInfo::BankIDToCopyMapIdx[]{
+        PMI_None,     // CCR
+        PMI_FirstFPR, // FPR
+        PMI_FirstGPR, // GPR
+    };
+
 const RegisterBankInfo::ValueMapping *
-getCopyMapping(bool DstIsGPR, bool SrcIsGPR, unsigned Size) {
-  PartialMappingIdx DstRBIdx = DstIsGPR ? PMI_FirstGPR : PMI_FirstFPR;
-  PartialMappingIdx SrcRBIdx = SrcIsGPR ?
PMI_FirstGPR : PMI_FirstFPR; +AArch64GenRegisterBankInfo::getCopyMapping(unsigned DstBankID, + unsigned SrcBankID, unsigned Size) { + assert(DstBankID < AArch64::NumRegisterBanks && "Invalid bank ID"); + assert(SrcBankID < AArch64::NumRegisterBanks && "Invalid bank ID"); + PartialMappingIdx DstRBIdx = BankIDToCopyMapIdx[DstBankID]; + PartialMappingIdx SrcRBIdx = BankIDToCopyMapIdx[SrcBankID]; + assert(DstRBIdx != PMI_None && "No such mapping"); + assert(SrcRBIdx != PMI_None && "No such mapping"); + if (DstRBIdx == SrcRBIdx) return getValueMapping(DstRBIdx, Size); + assert(Size <= 64 && "GPR cannot handle that size"); unsigned ValMappingIdx = FirstCrossRegCpyIdx + - (DstRBIdx - PMI_Min + getRegBankBaseIdxOffset(Size)) * + (DstRBIdx - PMI_Min + getRegBankBaseIdxOffset(DstRBIdx, Size)) * ValueMappingIdx::DistanceBetweenCrossRegCpy; - assert(ValMappingIdx >= AArch64::FirstCrossRegCpyIdx && - ValMappingIdx <= AArch64::LastCrossRegCpyIdx && - "Mapping out of bound"); + assert(ValMappingIdx >= FirstCrossRegCpyIdx && + ValMappingIdx <= LastCrossRegCpyIdx && "Mapping out of bound"); return &ValMappings[ValMappingIdx]; } - -} // End AArch64 namespace. } // End llvm namespace. diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/contrib/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index 3099383e5b32..ae01ea477bb9 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -328,11 +328,52 @@ static AArch64_AM::ShiftExtendType getShiftTypeForNode(SDValue N) { } } +/// \brief Determine whether it is worth it to fold SHL into the addressing +/// mode. +static bool isWorthFoldingSHL(SDValue V) { + assert(V.getOpcode() == ISD::SHL && "invalid opcode"); + // It is worth folding logical shift of up to three places. + auto *CSD = dyn_cast<ConstantSDNode>(V.getOperand(1)); + if (!CSD) + return false; + unsigned ShiftVal = CSD->getZExtValue(); + if (ShiftVal > 3) + return false; + + // Check if this particular node is reused in any non-memory related + // operation. If yes, do not try to fold this node into the address + // computation, since the computation will be kept. + const SDNode *Node = V.getNode(); + for (SDNode *UI : Node->uses()) + if (!isa<MemSDNode>(*UI)) + for (SDNode *UII : UI->uses()) + if (!isa<MemSDNode>(*UII)) + return false; + return true; +} + /// \brief Determine whether it is worth to fold V into an extended register. bool AArch64DAGToDAGISel::isWorthFolding(SDValue V) const { - // it hurts if the value is used at least twice, unless we are optimizing - // for code size. - return ForCodeSize || V.hasOneUse(); + // Trivial if we are optimizing for code size or if there is only + // one use of the value. + if (ForCodeSize || V.hasOneUse()) + return true; + // If a subtarget has a fastpath LSL we can fold a logical shift into + // the addressing mode and save a cycle. + if (Subtarget->hasLSLFast() && V.getOpcode() == ISD::SHL && + isWorthFoldingSHL(V)) + return true; + if (Subtarget->hasLSLFast() && V.getOpcode() == ISD::ADD) { + const SDValue LHS = V.getOperand(0); + const SDValue RHS = V.getOperand(1); + if (LHS.getOpcode() == ISD::SHL && isWorthFoldingSHL(LHS)) + return true; + if (RHS.getOpcode() == ISD::SHL && isWorthFoldingSHL(RHS)) + return true; + } + + // It hurts otherwise, since the value will be reused. + return false; } /// SelectShiftedRegister - Select a "shifted register" operand. 
If the value diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 849058bdfbdb..0d3289ac84c3 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -29,6 +29,7 @@ #include "llvm/ADT/StringSwitch.h" #include "llvm/ADT/Triple.h" #include "llvm/ADT/Twine.h" +#include "llvm/Analysis/VectorUtils.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" @@ -554,8 +555,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setSchedulingPreference(Sched::Hybrid); - // Enable TBZ/TBNZ - MaskAndBranchFoldingIsLegal = true; EnableExtLdPromotion = true; // Set required alignment. @@ -793,7 +792,7 @@ EVT AArch64TargetLowering::getSetCCResultType(const DataLayout &, LLVMContext &, /// KnownZero/KnownOne bitsets. void AArch64TargetLowering::computeKnownBitsForTargetNode( const SDValue Op, APInt &KnownZero, APInt &KnownOne, - const SelectionDAG &DAG, unsigned Depth) const { + const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const { switch (Op.getOpcode()) { default: break; @@ -2113,8 +2112,8 @@ SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op, Entry.Node = Arg; Entry.Ty = ArgTy; - Entry.isSExt = false; - Entry.isZExt = false; + Entry.IsSExt = false; + Entry.IsZExt = false; Args.push_back(Entry); const char *LibcallName = @@ -2124,8 +2123,9 @@ SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op, StructType *RetTy = StructType::get(ArgTy, ArgTy, nullptr); TargetLowering::CallLoweringInfo CLI(DAG); - CLI.setDebugLoc(dl).setChain(DAG.getEntryNode()) - .setCallee(CallingConv::Fast, RetTy, Callee, std::move(Args)); + CLI.setDebugLoc(dl) + .setChain(DAG.getEntryNode()) + .setLibCallee(CallingConv::Fast, RetTy, Callee, std::move(Args)); std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); return CallResult.first; @@ -2231,19 +2231,13 @@ static SDValue skipExtensionForVectorMULL(SDNode *N, SelectionDAG &DAG) { } static bool isSignExtended(SDNode *N, SelectionDAG &DAG) { - if (N->getOpcode() == ISD::SIGN_EXTEND) - return true; - if (isExtendedBUILD_VECTOR(N, DAG, true)) - return true; - return false; + return N->getOpcode() == ISD::SIGN_EXTEND || + isExtendedBUILD_VECTOR(N, DAG, true); } static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) { - if (N->getOpcode() == ISD::ZERO_EXTEND) - return true; - if (isExtendedBUILD_VECTOR(N, DAG, false)) - return true; - return false; + return N->getOpcode() == ISD::ZERO_EXTEND || + isExtendedBUILD_VECTOR(N, DAG, false); } static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) { @@ -3578,7 +3572,7 @@ SDValue AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { assert(Subtarget->isTargetELF() && "This function expects an ELF target"); - assert(getTargetMachine().getCodeModel() == CodeModel::Small && + assert(Subtarget->useSmallAddressing() && "ELF TLS only supported in small memory model"); // Different choices can be made for the maximum size of the TLS area for a // module. 
For the small address model, the default TLS size is 16MiB and the @@ -3679,7 +3673,7 @@ SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { if (Subtarget->isTargetDarwin()) return LowerDarwinGlobalTLSAddress(Op, DAG); - else if (Subtarget->isTargetELF()) + if (Subtarget->isTargetELF()) return LowerELFGlobalTLSAddress(Op, DAG); llvm_unreachable("Unexpected platform trying to use TLS"); @@ -4516,7 +4510,12 @@ unsigned AArch64TargetLowering::getRegisterByName(const char* RegName, EVT VT, SelectionDAG &DAG) const { unsigned Reg = StringSwitch<unsigned>(RegName) .Case("sp", AArch64::SP) + .Case("x18", AArch64::X18) + .Case("w18", AArch64::W18) .Default(0); + if ((Reg == AArch64::X18 || Reg == AArch64::W18) && + !Subtarget->isX18Reserved()) + Reg = 0; if (Reg) return Reg; report_fatal_error(Twine("Invalid register name \"" @@ -6591,21 +6590,20 @@ FailedModImm: if (!isConstant && !usesOnlyOneValue) { SDValue Vec = DAG.getUNDEF(VT); SDValue Op0 = Op.getOperand(0); - unsigned ElemSize = VT.getScalarSizeInBits(); unsigned i = 0; - // For 32 and 64 bit types, use INSERT_SUBREG for lane zero to + + // Use SCALAR_TO_VECTOR for lane zero to // a) Avoid a RMW dependency on the full vector register, and // b) Allow the register coalescer to fold away the copy if the - // value is already in an S or D register. - // Do not do this for UNDEF/LOAD nodes because we have better patterns - // for those avoiding the SCALAR_TO_VECTOR/BUILD_VECTOR. - if (!Op0.isUndef() && Op0.getOpcode() != ISD::LOAD && - (ElemSize == 32 || ElemSize == 64)) { - unsigned SubIdx = ElemSize == 32 ? AArch64::ssub : AArch64::dsub; - MachineSDNode *N = - DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, dl, VT, Vec, Op0, - DAG.getTargetConstant(SubIdx, dl, MVT::i32)); - Vec = SDValue(N, 0); + // value is already in an S or D register, and we're forced to emit an + // INSERT_SUBREG that we can't fold anywhere. + // + // We also allow types like i8 and i16 which are illegal scalar but legal + // vector element types. After type-legalization the inserted value is + // extended (i32) and it is safe to cast them to the vector type by ignoring + // the upper bits of the lowest lane (e.g. v8i8, v4i16). + if (!Op0.isUndef()) { + Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op0); ++i; } for (; i < NumElts; ++i) { @@ -7249,6 +7247,33 @@ bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType, return NumBits == 32 || NumBits == 64; } +/// A helper function for determining the number of interleaved accesses we +/// will generate when lowering accesses of the given type. +unsigned +AArch64TargetLowering::getNumInterleavedAccesses(VectorType *VecTy, + const DataLayout &DL) const { + return (DL.getTypeSizeInBits(VecTy) + 127) / 128; +} + +bool AArch64TargetLowering::isLegalInterleavedAccessType( + VectorType *VecTy, const DataLayout &DL) const { + + unsigned VecSize = DL.getTypeSizeInBits(VecTy); + unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType()); + + // Ensure the number of vector elements is greater than 1. + if (VecTy->getNumElements() < 2) + return false; + + // Ensure the element type is legal. + if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64) + return false; + + // Ensure the total vector size is 64 or a multiple of 128. Types larger than + // 128 will be split into multiple interleaved accesses. + return VecSize == 64 || VecSize % 128 == 0; +} + /// \brief Lower an interleaved load into a ldN intrinsic. /// /// E.g. 
Lower an interleaved load (Factor = 2): @@ -7272,12 +7297,15 @@ bool AArch64TargetLowering::lowerInterleavedLoad( const DataLayout &DL = LI->getModule()->getDataLayout(); VectorType *VecTy = Shuffles[0]->getType(); - unsigned VecSize = DL.getTypeSizeInBits(VecTy); - // Skip if we do not have NEON and skip illegal vector types. - if (!Subtarget->hasNEON() || (VecSize != 64 && VecSize != 128)) + // Skip if we do not have NEON and skip illegal vector types. We can + // "legalize" wide vector types into multiple interleaved accesses as long as + // the vector types are divisible by 128. + if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(VecTy, DL)) return false; + unsigned NumLoads = getNumInterleavedAccesses(VecTy, DL); + // A pointer vector can not be the return type of the ldN intrinsics. Need to // load integer vectors first and then convert to pointer vectors. Type *EltTy = VecTy->getVectorElementType(); @@ -7285,6 +7313,25 @@ bool AArch64TargetLowering::lowerInterleavedLoad( VecTy = VectorType::get(DL.getIntPtrType(EltTy), VecTy->getVectorNumElements()); + IRBuilder<> Builder(LI); + + // The base address of the load. + Value *BaseAddr = LI->getPointerOperand(); + + if (NumLoads > 1) { + // If we're going to generate more than one load, reset the sub-vector type + // to something legal. + VecTy = VectorType::get(VecTy->getVectorElementType(), + VecTy->getVectorNumElements() / NumLoads); + + // We will compute the pointer operand of each load from the original base + // address using GEPs. Cast the base address to a pointer to the scalar + // element type. + BaseAddr = Builder.CreateBitCast( + BaseAddr, VecTy->getVectorElementType()->getPointerTo( + LI->getPointerAddressSpace())); + } + Type *PtrTy = VecTy->getPointerTo(LI->getPointerAddressSpace()); Type *Tys[2] = {VecTy, PtrTy}; static const Intrinsic::ID LoadInts[3] = {Intrinsic::aarch64_neon_ld2, @@ -7293,39 +7340,49 @@ bool AArch64TargetLowering::lowerInterleavedLoad( Function *LdNFunc = Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys); - IRBuilder<> Builder(LI); - Value *Ptr = Builder.CreateBitCast(LI->getPointerOperand(), PtrTy); + // Holds sub-vectors extracted from the load intrinsic return values. The + // sub-vectors are associated with the shufflevector instructions they will + // replace. + DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs; - CallInst *LdN = Builder.CreateCall(LdNFunc, Ptr, "ldN"); + for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) { - // Replace uses of each shufflevector with the corresponding vector loaded - // by ldN. - for (unsigned i = 0; i < Shuffles.size(); i++) { - ShuffleVectorInst *SVI = Shuffles[i]; - unsigned Index = Indices[i]; + // If we're generating more than one load, compute the base address of + // subsequent loads as an offset from the previous. + if (LoadCount > 0) + BaseAddr = Builder.CreateConstGEP1_32( + BaseAddr, VecTy->getVectorNumElements() * Factor); - Value *SubVec = Builder.CreateExtractValue(LdN, Index); + CallInst *LdN = Builder.CreateCall( + LdNFunc, Builder.CreateBitCast(BaseAddr, PtrTy), "ldN"); - // Convert the integer vector to pointer vector if the element is pointer. - if (EltTy->isPointerTy()) - SubVec = Builder.CreateIntToPtr(SubVec, SVI->getType()); + // Extract and store the sub-vectors returned by the load intrinsic. 
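
The splitting logic above leans on the +127/128 rounding in getNumInterleavedAccesses. A standalone sketch of that arithmetic (illustrative helper, not the patch's code):

    // Mirrors getNumInterleavedAccesses(): wide vectors are broken into
    // 128-bit NEON-sized pieces, rounding up.
    static unsigned numInterleavedAccesses(unsigned VecSizeInBits) {
      return (VecSizeInBits + 127) / 128;
    }
    // e.g. numInterleavedAccesses(512) == 4: a <16 x i32> shuffle result is
    // now loaded with four ld2 calls on <4 x i32> sub-vectors, then stitched
    // back together by the concatenation loop below.
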
+ for (unsigned i = 0; i < Shuffles.size(); i++) { + ShuffleVectorInst *SVI = Shuffles[i]; + unsigned Index = Indices[i]; - SVI->replaceAllUsesWith(SubVec); - } + Value *SubVec = Builder.CreateExtractValue(LdN, Index); - return true; -} + // Convert the integer vector to pointer vector if the element is pointer. + if (EltTy->isPointerTy()) + SubVec = Builder.CreateIntToPtr(SubVec, SVI->getType()); -/// \brief Get a mask consisting of sequential integers starting from \p Start. -/// -/// I.e. <Start, Start + 1, ..., Start + NumElts - 1> -static Constant *getSequentialMask(IRBuilder<> &Builder, unsigned Start, - unsigned NumElts) { - SmallVector<Constant *, 16> Mask; - for (unsigned i = 0; i < NumElts; i++) - Mask.push_back(Builder.getInt32(Start + i)); + SubVecs[SVI].push_back(SubVec); + } + } + + // Replace uses of the shufflevector instructions with the sub-vectors + // returned by the load intrinsic. If a shufflevector instruction is + // associated with more than one sub-vector, those sub-vectors will be + // concatenated into a single wide vector. + for (ShuffleVectorInst *SVI : Shuffles) { + auto &SubVec = SubVecs[SVI]; + auto *WideVec = + SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0]; + SVI->replaceAllUsesWith(WideVec); + } - return ConstantVector::get(Mask); + return true; } /// \brief Lower an interleaved store into a stN intrinsic. @@ -7369,12 +7426,15 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI, VectorType *SubVecTy = VectorType::get(EltTy, LaneLen); const DataLayout &DL = SI->getModule()->getDataLayout(); - unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy); - // Skip if we do not have NEON and skip illegal vector types. - if (!Subtarget->hasNEON() || (SubVecSize != 64 && SubVecSize != 128)) + // Skip if we do not have NEON and skip illegal vector types. We can + // "legalize" wide vector types into multiple interleaved accesses as long as + // the vector types are divisible by 128. + if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(SubVecTy, DL)) return false; + unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL); + Value *Op0 = SVI->getOperand(0); Value *Op1 = SVI->getOperand(1); IRBuilder<> Builder(SI); @@ -7394,6 +7454,25 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI, SubVecTy = VectorType::get(IntTy, LaneLen); } + // The base address of the store. + Value *BaseAddr = SI->getPointerOperand(); + + if (NumStores > 1) { + // If we're going to generate more than one store, reset the lane length + // and sub-vector type to something legal. + LaneLen /= NumStores; + SubVecTy = VectorType::get(SubVecTy->getVectorElementType(), LaneLen); + + // We will compute the pointer operand of each store from the original base + // address using GEPs. Cast the base address to a pointer to the scalar + // element type. 
+ BaseAddr = Builder.CreateBitCast( + BaseAddr, SubVecTy->getVectorElementType()->getPointerTo( + SI->getPointerAddressSpace())); + } + + auto Mask = SVI->getShuffleMask(); + Type *PtrTy = SubVecTy->getPointerTo(SI->getPointerAddressSpace()); Type *Tys[2] = {SubVecTy, PtrTy}; static const Intrinsic::ID StoreInts[3] = {Intrinsic::aarch64_neon_st2, @@ -7402,34 +7481,43 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI, Function *StNFunc = Intrinsic::getDeclaration(SI->getModule(), StoreInts[Factor - 2], Tys); - SmallVector<Value *, 5> Ops; + for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) { - // Split the shufflevector operands into sub vectors for the new stN call. - auto Mask = SVI->getShuffleMask(); - for (unsigned i = 0; i < Factor; i++) { - if (Mask[i] >= 0) { - Ops.push_back(Builder.CreateShuffleVector( - Op0, Op1, getSequentialMask(Builder, Mask[i], LaneLen))); - } else { - unsigned StartMask = 0; - for (unsigned j = 1; j < LaneLen; j++) { - if (Mask[j*Factor + i] >= 0) { - StartMask = Mask[j*Factor + i] - j; - break; + SmallVector<Value *, 5> Ops; + + // Split the shufflevector operands into sub vectors for the new stN call. + for (unsigned i = 0; i < Factor; i++) { + unsigned IdxI = StoreCount * LaneLen * Factor + i; + if (Mask[IdxI] >= 0) { + Ops.push_back(Builder.CreateShuffleVector( + Op0, Op1, createSequentialMask(Builder, Mask[IdxI], LaneLen, 0))); + } else { + unsigned StartMask = 0; + for (unsigned j = 1; j < LaneLen; j++) { + unsigned IdxJ = StoreCount * LaneLen * Factor + j; + if (Mask[IdxJ * Factor + IdxI] >= 0) { + StartMask = Mask[IdxJ * Factor + IdxI] - IdxJ; + break; + } } + // Note: Filling undef gaps with random elements is ok, since + // those elements were being written anyway (with undefs). + // In the case of all undefs we're defaulting to using elems from 0 + // Note: StartMask cannot be negative, it's checked in + // isReInterleaveMask + Ops.push_back(Builder.CreateShuffleVector( + Op0, Op1, createSequentialMask(Builder, StartMask, LaneLen, 0))); } - // Note: If all elements in a chunk are undefs, StartMask=0! - // Note: Filling undef gaps with random elements is ok, since - // those elements were being written anyway (with undefs). - // In the case of all undefs we're defaulting to using elems from 0 - // Note: StartMask cannot be negative, it's checked in isReInterleaveMask - Ops.push_back(Builder.CreateShuffleVector( - Op0, Op1, getSequentialMask(Builder, StartMask, LaneLen))); } - } - Ops.push_back(Builder.CreateBitCast(SI->getPointerOperand(), PtrTy)); - Builder.CreateCall(StNFunc, Ops); + // If we generating more than one store, we compute the base address of + // subsequent stores as an offset from the previous. 
+ if (StoreCount > 0) + BaseAddr = Builder.CreateConstGEP1_32(BaseAddr, LaneLen * Factor); + + Ops.push_back(Builder.CreateBitCast(BaseAddr, PtrTy)); + Builder.CreateCall(StNFunc, Ops); + } return true; } @@ -7690,7 +7778,7 @@ SDValue AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, std::vector<SDNode *> *Created) const { - AttributeSet Attr = DAG.getMachineFunction().getFunction()->getAttributes(); + AttributeList Attr = DAG.getMachineFunction().getFunction()->getAttributes(); if (isIntDivCheap(N->getValueType(0), Attr)) return SDValue(N,0); // Lower SDIV as SDIV @@ -9267,7 +9355,7 @@ static SDValue performSTORECombine(SDNode *N, return SDValue(); } - /// This function handles the log2-shuffle pattern produced by the +/// This function handles the log2-shuffle pattern produced by the /// LoopVectorizer for the across vector reduction. It consists of /// log2(NumVectorElements) steps and, in each step, 2^(s) elements /// are reduced, where s is an induction variable from 0 to @@ -10483,9 +10571,9 @@ void AArch64TargetLowering::ReplaceNodeResults( } bool AArch64TargetLowering::useLoadStackGuardNode() const { - if (!Subtarget->isTargetAndroid()) - return true; - return TargetLowering::useLoadStackGuardNode(); + if (Subtarget->isTargetAndroid() || Subtarget->isTargetFuchsia()) + return TargetLowering::useLoadStackGuardNode(); + return true; } unsigned AArch64TargetLowering::combineRepeatedFPDivisors() const { @@ -10623,36 +10711,56 @@ bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &, return false; } -Value *AArch64TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const { - if (!Subtarget->isTargetAndroid()) - return TargetLowering::getIRStackGuard(IRB); - - // Android provides a fixed TLS slot for the stack cookie. See the definition - // of TLS_SLOT_STACK_GUARD in - // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h - const unsigned TlsOffset = 0x28; +static Value *UseTlsOffset(IRBuilder<> &IRB, unsigned Offset) { Module *M = IRB.GetInsertBlock()->getParent()->getParent(); Function *ThreadPointerFunc = Intrinsic::getDeclaration(M, Intrinsic::thread_pointer); return IRB.CreatePointerCast( - IRB.CreateConstGEP1_32(IRB.CreateCall(ThreadPointerFunc), TlsOffset), + IRB.CreateConstGEP1_32(IRB.CreateCall(ThreadPointerFunc), Offset), Type::getInt8PtrTy(IRB.getContext())->getPointerTo(0)); } -Value *AArch64TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const { - if (!Subtarget->isTargetAndroid()) - return TargetLowering::getSafeStackPointerLocation(IRB); +Value *AArch64TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const { + // Android provides a fixed TLS slot for the stack cookie. See the definition + // of TLS_SLOT_STACK_GUARD in + // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h + if (Subtarget->isTargetAndroid()) + return UseTlsOffset(IRB, 0x28); + // Fuchsia is similar. + // <magenta/tls.h> defines MX_TLS_STACK_GUARD_OFFSET with this value. + if (Subtarget->isTargetFuchsia()) + return UseTlsOffset(IRB, -0x10); + + return TargetLowering::getIRStackGuard(IRB); +} + +Value *AArch64TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const { // Android provides a fixed TLS slot for the SafeStack pointer. 
See the // definition of TLS_SLOT_SAFESTACK in // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h - const unsigned TlsOffset = 0x48; - Module *M = IRB.GetInsertBlock()->getParent()->getParent(); - Function *ThreadPointerFunc = - Intrinsic::getDeclaration(M, Intrinsic::thread_pointer); - return IRB.CreatePointerCast( - IRB.CreateConstGEP1_32(IRB.CreateCall(ThreadPointerFunc), TlsOffset), - Type::getInt8PtrTy(IRB.getContext())->getPointerTo(0)); + if (Subtarget->isTargetAndroid()) + return UseTlsOffset(IRB, 0x48); + + // Fuchsia is similar. + // <magenta/tls.h> defines MX_TLS_UNSAFE_SP_OFFSET with this value. + if (Subtarget->isTargetFuchsia()) + return UseTlsOffset(IRB, -0x8); + + return TargetLowering::getSafeStackPointerLocation(IRB); +} + +bool AArch64TargetLowering::isMaskAndCmp0FoldingBeneficial( + const Instruction &AndI) const { + // Only sink 'and' mask to cmp use block if it is masking a single bit, since + // this is likely to be fold the and/cmp/br into a single tbz instruction. It + // may be beneficial to sink in other cases, but we would have to check that + // the cmp would not get folded into the br to form a cbz for these to be + // beneficial. + ConstantInt* Mask = dyn_cast<ConstantInt>(AndI.getOperand(1)); + if (!Mask) + return false; + return Mask->getUniqueInteger().isPowerOf2(); } void AArch64TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const { @@ -10702,7 +10810,7 @@ void AArch64TargetLowering::insertCopiesSplitCSR( } } -bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr) const { +bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const { // Integer division on AArch64 is expensive. However, when aggressively // optimizing for code size, we prefer to use a div instruction, as it is // usually smaller than the alternative sequence. @@ -10711,6 +10819,14 @@ bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr) const { // size, because it will have to be scalarized, while the alternative code // sequence can be performed in vector form. bool OptSize = - Attr.hasAttribute(AttributeSet::FunctionIndex, Attribute::MinSize); + Attr.hasAttribute(AttributeList::FunctionIndex, Attribute::MinSize); return OptSize && !VT.isVector(); } + +unsigned +AArch64TargetLowering::getVaListSizeInBits(const DataLayout &DL) const { + if (Subtarget->isTargetDarwin()) + return getPointerTy(DL).getSizeInBits(); + + return 3 * getPointerTy(DL).getSizeInBits() + 2 * 32; +} diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 054ccc31674f..2ad6c8b23df8 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -251,7 +251,8 @@ public: /// Determine which of the bits specified in Mask are known to be either zero /// or one and return them in the KnownZero/KnownOne bitsets. 
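
The getVaListSizeInBits override earlier in this file encodes the two va_list ABIs: Darwin's va_list is a single pointer, while the generic AAPCS64 one is 3 pointers plus 2 ints, i.e. 3 * 64 + 2 * 32 = 256 bits. For reference, the standard AAPCS64 layout that formula corresponds to (a sketch, not code from the patch):

    typedef struct {
      void *__stack;   // next stacked argument
      void *__gr_top;  // end of the general-register save area
      void *__vr_top;  // end of the FP/SIMD-register save area
      int   __gr_offs; // negative offset to the next GP register argument
      int   __vr_offs; // negative offset to the next FP/SIMD register argument
    } aapcs64_va_list; // 3*8 + 2*4 = 32 bytes on LP64
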
void computeKnownBitsForTargetNode(const SDValue Op, APInt &KnownZero, - APInt &KnownOne, const SelectionDAG &DAG, + APInt &KnownOne, const APInt &DemandedElts, + const SelectionDAG &DAG, unsigned Depth = 0) const override; MVT getScalarShiftAmountTy(const DataLayout &DL, EVT) const override; @@ -402,7 +403,7 @@ public: return AArch64::X1; } - bool isIntDivCheap(EVT VT, AttributeSet Attr) const override; + bool isIntDivCheap(EVT VT, AttributeList Attr) const override; bool isCheapToSpeculateCttz() const override { return true; @@ -412,6 +413,8 @@ public: return true; } + bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override; + bool hasAndNotCompare(SDValue) const override { // 'bics' return true; @@ -435,6 +438,20 @@ public: return true; } + /// Returns the size of the platform's va_list object. + unsigned getVaListSizeInBits(const DataLayout &DL) const override; + + /// Returns true if \p VecTy is a legal interleaved access type. This + /// function checks the vector element type and the overall width of the + /// vector. + bool isLegalInterleavedAccessType(VectorType *VecTy, + const DataLayout &DL) const; + + /// Returns the number of interleaved accesses that will be generated when + /// lowering accesses of the given type. + unsigned getNumInterleavedAccesses(VectorType *VecTy, + const DataLayout &DL) const; + private: bool isExtFreeImpl(const Instruction *Ext) const override; diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/contrib/llvm/lib/Target/AArch64/AArch64InstrFormats.td index cefdf51b50d2..16be4432b160 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ b/contrib/llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -39,6 +39,9 @@ class AArch64Inst<Format f, string cstr> : Instruction { let Constraints = cstr; } +class InstSubst<string Asm, dag Result, bit EmitPriority = 0> + : InstAlias<Asm, Result, EmitPriority>, Requires<[UseNegativeImmediates]>; + // Pseudo instructions (don't have encoding information) class Pseudo<dag oops, dag iops, list<dag> pattern, string cstr = ""> : AArch64Inst<PseudoFrm, cstr> { @@ -257,6 +260,7 @@ def am_indexed7s128 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S128", []>; class AsmImmRange<int Low, int High> : AsmOperandClass { let Name = "Imm" # Low # "_" # High; let DiagnosticType = "InvalidImm" # Low # "_" # High; + let PredicateMethod = "isImmInRange<" # Low # "," # High # ">"; } def Imm1_8Operand : AsmImmRange<1, 8>; @@ -264,6 +268,20 @@ def Imm1_16Operand : AsmImmRange<1, 16>; def Imm1_32Operand : AsmImmRange<1, 32>; def Imm1_64Operand : AsmImmRange<1, 64>; +class BranchTarget<int N> : AsmOperandClass { + let Name = "BranchTarget" # N; + let DiagnosticType = "InvalidLabel"; + let PredicateMethod = "isBranchTarget<" # N # ">"; +} + +class PCRelLabel<int N> : BranchTarget<N> { + let Name = "PCRelLabel" # N; +} + +def BranchTarget14Operand : BranchTarget<14>; +def BranchTarget26Operand : BranchTarget<26>; +def PCRelLabel19Operand : PCRelLabel<19>; + def MovZSymbolG3AsmOperand : AsmOperandClass { let Name = "MovZSymbolG3"; let RenderMethod = "addImmOperands"; @@ -500,7 +518,8 @@ def imm0_65535 : Operand<i32>, ImmLeaf<i32, [{ } // imm0_255 predicate - True if the immediate is in the range [0,255]. 
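
The "isImmInRange<" # Low # "," # High # ">" strings above assume a templated predicate on the assembler's operand class (AArch64Operand, which this patch does not show). A plausible sketch of that method, under that assumption:

    template <int Low, int High> bool isImmInRange() const {
      if (!isImm())
        return false;
      // Only plain constant expressions can be range-checked here.
      auto *MCE = dyn_cast<MCConstantExpr>(getImm());
      if (!MCE)
        return false;
      int64_t Val = MCE->getValue();
      return Val >= Low && Val <= High;
    }
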
-def Imm0_255Operand : AsmOperandClass { let Name = "Imm0_255"; } +def Imm0_255Operand : AsmImmRange<0,255>; + def imm0_255 : Operand<i32>, ImmLeaf<i32, [{ return ((uint32_t)Imm) < 256; }]> { @@ -673,6 +692,14 @@ def addsub_shifted_imm64 : addsub_shifted_imm<i64>; def addsub_shifted_imm32_neg : addsub_shifted_imm_neg<i32>; def addsub_shifted_imm64_neg : addsub_shifted_imm_neg<i64>; +def gi_addsub_shifted_imm32 : + GIComplexOperandMatcher<s32, (ops i32imm, i32imm), "selectArithImmed">, + GIComplexPatternEquiv<addsub_shifted_imm32>; + +def gi_addsub_shifted_imm64 : + GIComplexOperandMatcher<s64, (ops i32imm, i32imm), "selectArithImmed">, + GIComplexPatternEquiv<addsub_shifted_imm64>; + class neg_addsub_shifted_imm<ValueType Ty> : Operand<Ty>, ComplexPattern<Ty, 2, "SelectNegArithImmed", [imm]> { let PrintMethod = "printAddSubImm"; @@ -1094,10 +1121,6 @@ def inv_ccode : Operand<i32> { // Conditional branch target. 19-bit immediate. The low two bits of the target // offset are implied zero and so are not part of the immediate. -def PCRelLabel19Operand : AsmOperandClass { - let Name = "PCRelLabel19"; - let DiagnosticType = "InvalidLabel"; -} def am_brcond : Operand<OtherVT> { let EncoderMethod = "getCondBranchTargetOpValue"; let DecoderMethod = "DecodePCRelLabel19"; @@ -1154,9 +1177,6 @@ multiclass CmpBranch<bit op, string asm, SDNode node> { //--- // Test-and-branch target. 14-bit sign-extended immediate. The low two bits of // the target offset are implied zero and so are not part of the immediate. -def BranchTarget14Operand : AsmOperandClass { - let Name = "BranchTarget14"; -} def am_tbrcond : Operand<OtherVT> { let EncoderMethod = "getTestBranchTargetOpValue"; let PrintMethod = "printAlignedLabel"; @@ -1166,11 +1186,12 @@ def am_tbrcond : Operand<OtherVT> { // AsmOperand classes to emit (or not) special diagnostics def TBZImm0_31Operand : AsmOperandClass { let Name = "TBZImm0_31"; - let PredicateMethod = "isImm0_31"; + let PredicateMethod = "isImmInRange<0,31>"; let RenderMethod = "addImm0_31Operands"; } def TBZImm32_63Operand : AsmOperandClass { let Name = "Imm32_63"; + let PredicateMethod = "isImmInRange<32,63>"; let DiagnosticType = "InvalidImm0_63"; } @@ -1232,10 +1253,6 @@ multiclass TestBranch<bit op, string asm, SDNode node> { //--- // Unconditional branch (immediate) instructions. //--- -def BranchTarget26Operand : AsmOperandClass { - let Name = "BranchTarget26"; - let DiagnosticType = "InvalidLabel"; -} def am_b_target : Operand<OtherVT> { let EncoderMethod = "getBranchTargetOpValue"; let PrintMethod = "printAlignedLabel"; @@ -1784,10 +1801,10 @@ multiclass AddSub<bit isSub, string mnemonic, string alias, } // add Rd, Rb, -imm -> sub Rd, Rn, imm - def : InstAlias<alias#"\t$Rd, $Rn, $imm", + def : InstSubst<alias#"\t$Rd, $Rn, $imm", (!cast<Instruction>(NAME # "Wri") GPR32sp:$Rd, GPR32sp:$Rn, addsub_shifted_imm32_neg:$imm), 0>; - def : InstAlias<alias#"\t$Rd, $Rn, $imm", + def : InstSubst<alias#"\t$Rd, $Rn, $imm", (!cast<Instruction>(NAME # "Xri") GPR64sp:$Rd, GPR64sp:$Rn, addsub_shifted_imm64_neg:$imm), 0>; @@ -1859,10 +1876,10 @@ multiclass AddSubS<bit isSub, string mnemonic, SDNode OpNode, string cmp, } // Defs = [NZCV] // Support negative immediates, e.g. 
adds Rd, Rn, -imm -> subs Rd, Rn, imm - def : InstAlias<alias#"\t$Rd, $Rn, $imm", + def : InstSubst<alias#"\t$Rd, $Rn, $imm", (!cast<Instruction>(NAME # "Wri") GPR32:$Rd, GPR32sp:$Rn, addsub_shifted_imm32_neg:$imm), 0>; - def : InstAlias<alias#"\t$Rd, $Rn, $imm", + def : InstSubst<alias#"\t$Rd, $Rn, $imm", (!cast<Instruction>(NAME # "Xri") GPR64:$Rd, GPR64sp:$Rn, addsub_shifted_imm64_neg:$imm), 0>; @@ -1883,9 +1900,9 @@ multiclass AddSubS<bit isSub, string mnemonic, SDNode OpNode, string cmp, XZR, GPR64:$src1, GPR64:$src2, arith_shift64:$sh), 4>; // Support negative immediates, e.g. cmp Rn, -imm -> cmn Rn, imm - def : InstAlias<cmpAlias#"\t$src, $imm", (!cast<Instruction>(NAME#"Wri") + def : InstSubst<cmpAlias#"\t$src, $imm", (!cast<Instruction>(NAME#"Wri") WZR, GPR32sp:$src, addsub_shifted_imm32_neg:$imm), 0>; - def : InstAlias<cmpAlias#"\t$src, $imm", (!cast<Instruction>(NAME#"Xri") + def : InstSubst<cmpAlias#"\t$src, $imm", (!cast<Instruction>(NAME#"Xri") XZR, GPR64sp:$src, addsub_shifted_imm64_neg:$imm), 0>; // Compare shorthands @@ -2100,10 +2117,10 @@ multiclass LogicalImm<bits<2> opc, string mnemonic, SDNode OpNode, let Inst{31} = 1; } - def : InstAlias<Alias # "\t$Rd, $Rn, $imm", + def : InstSubst<Alias # "\t$Rd, $Rn, $imm", (!cast<Instruction>(NAME # "Wri") GPR32sp:$Rd, GPR32:$Rn, logical_imm32_not:$imm), 0>; - def : InstAlias<Alias # "\t$Rd, $Rn, $imm", + def : InstSubst<Alias # "\t$Rd, $Rn, $imm", (!cast<Instruction>(NAME # "Xri") GPR64sp:$Rd, GPR64:$Rn, logical_imm64_not:$imm), 0>; } @@ -2122,10 +2139,10 @@ multiclass LogicalImmS<bits<2> opc, string mnemonic, SDNode OpNode, } } // end Defs = [NZCV] - def : InstAlias<Alias # "\t$Rd, $Rn, $imm", + def : InstSubst<Alias # "\t$Rd, $Rn, $imm", (!cast<Instruction>(NAME # "Wri") GPR32:$Rd, GPR32:$Rn, logical_imm32_not:$imm), 0>; - def : InstAlias<Alias # "\t$Rd, $Rn, $imm", + def : InstSubst<Alias # "\t$Rd, $Rn, $imm", (!cast<Instruction>(NAME # "Xri") GPR64:$Rd, GPR64:$Rn, logical_imm64_not:$imm), 0>; } @@ -2454,7 +2471,7 @@ class PrefetchUI<bits<2> sz, bit V, bits<2> opc, string asm, list<dag> pat> // Load literal address: 19-bit immediate. The low two bits of the target // offset are implied zero and so are not part of the immediate. 
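
Taken together, the InstSubst aliases in these hunks let the assembler rewrite immediates that do not fit an encoding, unless the no-neg-immediates feature is set. Illustrative substitutions (hand-written examples, not taken from the patch):

    // add  w0, w1, #-4    is accepted and encoded as:  sub w0, w1, #4
    // cmp  x2, #-16       is accepted and encoded as:  cmn x2, #16
    // bic  w3, w4, #0xf   uses logical_imm32_not:      and w3, w4, #0xfffffff0
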
-def am_ldrlit : Operand<OtherVT> { +def am_ldrlit : Operand<iPTR> { let EncoderMethod = "getLoadLiteralOpValue"; let DecoderMethod = "DecodePCRelLabel19"; let PrintMethod = "printAlignedLabel"; @@ -9060,7 +9077,7 @@ multiclass SIMDLdSt4SingleAliases<string asm> { // AdvSIMD v8.1 Rounding Double Multiply Add/Subtract //---------------------------------------------------------------------------- -let Predicates = [HasNEON, HasV8_1a] in { +let Predicates = [HasNEON, HasRDM] in { class BaseSIMDThreeSameVectorTiedR0<bit Q, bit U, bits<2> size, bits<5> opcode, RegisterOperand regtype, string asm, @@ -9221,7 +9238,7 @@ multiclass SIMDIndexedSQRDMLxHSDTied<bit U, bits<4> opc, string asm, let Inst{21} = idx{0}; } } -} // let Predicates = [HasNeon, HasV8_1a] +} // let Predicates = [HasNeon, HasRDM] //---------------------------------------------------------------------------- // Crypto extensions diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index 4c789926e3e4..41fc8eceab5c 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "AArch64InstrInfo.h" +#include "AArch64MachineFunctionInfo.h" #include "AArch64Subtarget.h" #include "MCTargetDesc/AArch64AddressingModes.h" #include "Utils/AArch64BaseInfo.h" @@ -369,7 +370,7 @@ void AArch64InstrInfo::instantiateCondBranch( // Folded compare-and-branch // Note that we use addOperand instead of addReg to keep the flags. const MachineInstrBuilder MIB = - BuildMI(&MBB, DL, get(Cond[1].getImm())).addOperand(Cond[2]); + BuildMI(&MBB, DL, get(Cond[1].getImm())).add(Cond[2]); if (Cond.size() > 3) MIB.addImm(Cond[3].getImm()); MIB.addMBB(TBB); @@ -762,6 +763,17 @@ bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const { llvm_unreachable("Unknown opcode to check as cheap as a move!"); } +bool AArch64InstrInfo::isFalkorLSLFast(const MachineInstr &MI) const { + if (MI.getNumOperands() < 4) + return false; + unsigned ShOpVal = MI.getOperand(3).getImm(); + unsigned ShImm = AArch64_AM::getShiftValue(ShOpVal); + if (AArch64_AM::getShiftType(ShOpVal) == AArch64_AM::LSL && + ShImm < 4) + return true; + return false; +} + bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI, unsigned &SrcReg, unsigned &DstReg, unsigned &SubIdx) const { @@ -1299,16 +1311,16 @@ bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { .addMemOperand(*MI.memoperands_begin()); } else if (TM.getCodeModel() == CodeModel::Large) { BuildMI(MBB, MI, DL, get(AArch64::MOVZXi), Reg) - .addGlobalAddress(GV, 0, AArch64II::MO_G3).addImm(48); + .addGlobalAddress(GV, 0, AArch64II::MO_G0 | MO_NC).addImm(0); BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg) .addReg(Reg, RegState::Kill) - .addGlobalAddress(GV, 0, AArch64II::MO_G2 | MO_NC).addImm(32); + .addGlobalAddress(GV, 0, AArch64II::MO_G1 | MO_NC).addImm(16); BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg) .addReg(Reg, RegState::Kill) - .addGlobalAddress(GV, 0, AArch64II::MO_G1 | MO_NC).addImm(16); + .addGlobalAddress(GV, 0, AArch64II::MO_G2 | MO_NC).addImm(32); BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg) .addReg(Reg, RegState::Kill) - .addGlobalAddress(GV, 0, AArch64II::MO_G0 | MO_NC).addImm(0); + .addGlobalAddress(GV, 0, AArch64II::MO_G3).addImm(48); BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg) .addReg(Reg, RegState::Kill) .addImm(0) @@ -1345,14 +1357,6 @@ bool 
AArch64InstrInfo::hasShiftedReg(const MachineInstr &MI) const { case AArch64::BICSXrs: case AArch64::BICWrs: case AArch64::BICXrs: - case AArch64::CRC32Brr: - case AArch64::CRC32CBrr: - case AArch64::CRC32CHrr: - case AArch64::CRC32CWrr: - case AArch64::CRC32CXrr: - case AArch64::CRC32Hrr: - case AArch64::CRC32Wrr: - case AArch64::CRC32Xrr: case AArch64::EONWrs: case AArch64::EONXrs: case AArch64::EORWrs: @@ -1691,16 +1695,59 @@ bool AArch64InstrInfo::getMemOpBaseRegImmOfsWidth( } else return false; - // Offset is calculated as the immediate operand multiplied by the scaling factor. - // Unscaled instructions have scaling factor set to 1. + // Get the scaling factor for the instruction and set the width for the + // instruction. unsigned Scale = 0; - switch (LdSt.getOpcode()) { + int64_t Dummy1, Dummy2; + + // If this returns false, then it's an instruction we don't want to handle. + if (!getMemOpInfo(LdSt.getOpcode(), Scale, Width, Dummy1, Dummy2)) + return false; + + // Compute the offset. Offset is calculated as the immediate operand + // multiplied by the scaling factor. Unscaled instructions have scaling factor + // set to 1. + if (LdSt.getNumExplicitOperands() == 3) { + BaseReg = LdSt.getOperand(1).getReg(); + Offset = LdSt.getOperand(2).getImm() * Scale; + } else { + assert(LdSt.getNumExplicitOperands() == 4 && "invalid number of operands"); + BaseReg = LdSt.getOperand(2).getReg(); + Offset = LdSt.getOperand(3).getImm() * Scale; + } + return true; +} + +MachineOperand& +AArch64InstrInfo::getMemOpBaseRegImmOfsOffsetOperand(MachineInstr &LdSt) const { + assert(LdSt.mayLoadOrStore() && "Expected a memory operation."); + MachineOperand &OfsOp = LdSt.getOperand(LdSt.getNumExplicitOperands()-1); + assert(OfsOp.isImm() && "Offset operand wasn't immediate."); + return OfsOp; +} + +bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, unsigned &Scale, + unsigned &Width, int64_t &MinOffset, + int64_t &MaxOffset) const { + switch (Opcode) { + // Not a memory operation or something we want to handle. 
default: + Scale = Width = 0; + MinOffset = MaxOffset = 0; return false; + case AArch64::STRWpost: + case AArch64::LDRWpost: + Width = 32; + Scale = 4; + MinOffset = -256; + MaxOffset = 255; + break; case AArch64::LDURQi: case AArch64::STURQi: Width = 16; Scale = 1; + MinOffset = -256; + MaxOffset = 255; break; case AArch64::LDURXi: case AArch64::LDURDi: @@ -1708,6 +1755,8 @@ bool AArch64InstrInfo::getMemOpBaseRegImmOfsWidth( case AArch64::STURDi: Width = 8; Scale = 1; + MinOffset = -256; + MaxOffset = 255; break; case AArch64::LDURWi: case AArch64::LDURSi: @@ -1716,6 +1765,8 @@ bool AArch64InstrInfo::getMemOpBaseRegImmOfsWidth( case AArch64::STURSi: Width = 4; Scale = 1; + MinOffset = -256; + MaxOffset = 255; break; case AArch64::LDURHi: case AArch64::LDURHHi: @@ -1725,6 +1776,8 @@ bool AArch64InstrInfo::getMemOpBaseRegImmOfsWidth( case AArch64::STURHHi: Width = 2; Scale = 1; + MinOffset = -256; + MaxOffset = 255; break; case AArch64::LDURBi: case AArch64::LDURBBi: @@ -1734,6 +1787,8 @@ bool AArch64InstrInfo::getMemOpBaseRegImmOfsWidth( case AArch64::STURBBi: Width = 1; Scale = 1; + MinOffset = -256; + MaxOffset = 255; break; case AArch64::LDPQi: case AArch64::LDNPQi: @@ -1741,10 +1796,14 @@ bool AArch64InstrInfo::getMemOpBaseRegImmOfsWidth( case AArch64::STNPQi: Scale = 16; Width = 32; + MinOffset = -64; + MaxOffset = 63; break; case AArch64::LDRQui: case AArch64::STRQui: Scale = Width = 16; + MinOffset = 0; + MaxOffset = 4095; break; case AArch64::LDPXi: case AArch64::LDPDi: @@ -1756,12 +1815,16 @@ bool AArch64InstrInfo::getMemOpBaseRegImmOfsWidth( case AArch64::STNPDi: Scale = 8; Width = 16; + MinOffset = -64; + MaxOffset = 63; break; case AArch64::LDRXui: case AArch64::LDRDui: case AArch64::STRXui: case AArch64::STRDui: Scale = Width = 8; + MinOffset = 0; + MaxOffset = 4095; break; case AArch64::LDPWi: case AArch64::LDPSi: @@ -1773,6 +1836,8 @@ bool AArch64InstrInfo::getMemOpBaseRegImmOfsWidth( case AArch64::STNPSi: Scale = 4; Width = 8; + MinOffset = -64; + MaxOffset = 63; break; case AArch64::LDRWui: case AArch64::LDRSui: @@ -1780,29 +1845,27 @@ bool AArch64InstrInfo::getMemOpBaseRegImmOfsWidth( case AArch64::STRWui: case AArch64::STRSui: Scale = Width = 4; + MinOffset = 0; + MaxOffset = 4095; break; case AArch64::LDRHui: case AArch64::LDRHHui: case AArch64::STRHui: case AArch64::STRHHui: Scale = Width = 2; + MinOffset = 0; + MaxOffset = 4095; break; case AArch64::LDRBui: case AArch64::LDRBBui: case AArch64::STRBui: case AArch64::STRBBui: Scale = Width = 1; + MinOffset = 0; + MaxOffset = 4095; break; } - if (LdSt.getNumExplicitOperands() == 3) { - BaseReg = LdSt.getOperand(1).getReg(); - Offset = LdSt.getOperand(2).getImm() * Scale; - } else { - assert(LdSt.getNumExplicitOperands() == 4 && "invalid number of operands"); - BaseReg = LdSt.getOperand(2).getReg(); - Offset = LdSt.getOperand(3).getImm() * Scale; - } return true; } @@ -1903,88 +1966,6 @@ bool AArch64InstrInfo::shouldClusterMemOps(MachineInstr &FirstLdSt, return Offset1 + 1 == Offset2; } -bool AArch64InstrInfo::shouldScheduleAdjacent( - const MachineInstr &First, const MachineInstr &Second) const { - if (Subtarget.hasArithmeticBccFusion()) { - // Fuse CMN, CMP, TST followed by Bcc. 
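
A worked reading of the Scale/MinOffset/MaxOffset table being built above: the bounds are stored in immediate units, and the byte offset is imm * Scale. A standalone sketch of the encodability check this enables (the helper is illustrative; the per-opcode values come from the table):

    static bool isEncodableByteOffset(int64_t ByteOff, unsigned Scale,
                                      int64_t MinImm, int64_t MaxImm) {
      if (ByteOff % Scale != 0) // scaled forms need a multiple of Scale
        return false;
      int64_t Imm = ByteOff / Scale;
      return Imm >= MinImm && Imm <= MaxImm;
    }
    // LDRXui: Scale = 8, [0, 4095]   -> byte offsets 0..32760, step 8
    // LDURXi: Scale = 1, [-256, 255] -> any byte offset in [-256, 255]
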
- unsigned SecondOpcode = Second.getOpcode(); - if (SecondOpcode == AArch64::Bcc) { - switch (First.getOpcode()) { - default: - return false; - case AArch64::ADDSWri: - case AArch64::ADDSWrr: - case AArch64::ADDSXri: - case AArch64::ADDSXrr: - case AArch64::ANDSWri: - case AArch64::ANDSWrr: - case AArch64::ANDSXri: - case AArch64::ANDSXrr: - case AArch64::SUBSWri: - case AArch64::SUBSWrr: - case AArch64::SUBSXri: - case AArch64::SUBSXrr: - case AArch64::BICSWrr: - case AArch64::BICSXrr: - return true; - case AArch64::ADDSWrs: - case AArch64::ADDSXrs: - case AArch64::ANDSWrs: - case AArch64::ANDSXrs: - case AArch64::SUBSWrs: - case AArch64::SUBSXrs: - case AArch64::BICSWrs: - case AArch64::BICSXrs: - // Shift value can be 0 making these behave like the "rr" variant... - return !hasShiftedReg(Second); - } - } - } - if (Subtarget.hasArithmeticCbzFusion()) { - // Fuse ALU operations followed by CBZ/CBNZ. - unsigned SecondOpcode = Second.getOpcode(); - if (SecondOpcode == AArch64::CBNZW || SecondOpcode == AArch64::CBNZX || - SecondOpcode == AArch64::CBZW || SecondOpcode == AArch64::CBZX) { - switch (First.getOpcode()) { - default: - return false; - case AArch64::ADDWri: - case AArch64::ADDWrr: - case AArch64::ADDXri: - case AArch64::ADDXrr: - case AArch64::ANDWri: - case AArch64::ANDWrr: - case AArch64::ANDXri: - case AArch64::ANDXrr: - case AArch64::EORWri: - case AArch64::EORWrr: - case AArch64::EORXri: - case AArch64::EORXrr: - case AArch64::ORRWri: - case AArch64::ORRWrr: - case AArch64::ORRXri: - case AArch64::ORRXrr: - case AArch64::SUBWri: - case AArch64::SUBWrr: - case AArch64::SUBXri: - case AArch64::SUBXrr: - return true; - case AArch64::ADDWrs: - case AArch64::ADDXrs: - case AArch64::ANDWrs: - case AArch64::ANDXrs: - case AArch64::SUBWrs: - case AArch64::SUBXrs: - case AArch64::BICWrs: - case AArch64::BICXrs: - // Shift value can be 0 making these behave like the "rr" variant... - return !hasShiftedReg(Second); - } - } - } - return false; -} - MachineInstr *AArch64InstrInfo::emitFrameIndexDebugValue( MachineFunction &MF, int FrameIx, uint64_t Offset, const MDNode *Var, const MDNode *Expr, const DebugLoc &DL) const { @@ -3793,7 +3774,7 @@ void AArch64InstrInfo::genAlternativeCodeSequence( MachineInstrBuilder MIB1 = BuildMI(MF, Root.getDebugLoc(), TII->get(SubOpc), NewVR) .addReg(ZeroReg) - .addOperand(Root.getOperand(2)); + .add(Root.getOperand(2)); InsInstrs.push_back(MIB1); InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC); @@ -4286,3 +4267,199 @@ AArch64InstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const { {MO_TLS, "aarch64-tls"}}; return makeArrayRef(TargetFlags); } + +unsigned AArch64InstrInfo::getOutliningBenefit(size_t SequenceSize, + size_t Occurrences, + bool CanBeTailCall) const { + unsigned NotOutlinedSize = SequenceSize * Occurrences; + unsigned OutlinedSize; + + // Is this candidate something we can outline as a tail call? + if (CanBeTailCall) { + // If yes, then we just outline the sequence and replace each of its + // occurrences with a branch instruction. + OutlinedSize = SequenceSize + Occurrences; + } else { + // If no, then we outline the sequence (SequenceSize), add a return (+1), + // and replace each occurrence with a save/restore to LR and a call + // (3 * Occurrences) + OutlinedSize = (SequenceSize + 1) + (3 * Occurrences); + } + + // Return the number of instructions saved by outlining this sequence. + return NotOutlinedSize > OutlinedSize ? 
NotOutlinedSize - OutlinedSize : 0; +} + +bool AArch64InstrInfo::isFunctionSafeToOutlineFrom(MachineFunction &MF) const { + return MF.getFunction()->hasFnAttribute(Attribute::NoRedZone); +} + +AArch64GenInstrInfo::MachineOutlinerInstrType +AArch64InstrInfo::getOutliningType(MachineInstr &MI) const { + + MachineFunction *MF = MI.getParent()->getParent(); + AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>(); + + // Don't outline LOHs. + if (FuncInfo->getLOHRelated().count(&MI)) + return MachineOutlinerInstrType::Illegal; + + // Don't allow debug values to impact outlining type. + if (MI.isDebugValue() || MI.isIndirectDebugValue()) + return MachineOutlinerInstrType::Invisible; + + // Is this a terminator for a basic block? + if (MI.isTerminator()) { + + // Is this the end of a function? + if (MI.getParent()->succ_empty()) + return MachineOutlinerInstrType::Legal; + + // It's not, so don't outline it. + return MachineOutlinerInstrType::Illegal; + } + + // Don't outline positions. + if (MI.isPosition()) + return MachineOutlinerInstrType::Illegal; + + // Make sure none of the operands are un-outlinable. + for (const MachineOperand &MOP : MI.operands()) + if (MOP.isCPI() || MOP.isJTI() || MOP.isCFIIndex() || MOP.isFI() || + MOP.isTargetIndex()) + return MachineOutlinerInstrType::Illegal; + + // Don't outline anything that uses the link register. + if (MI.modifiesRegister(AArch64::LR, &RI) || + MI.readsRegister(AArch64::LR, &RI)) + return MachineOutlinerInstrType::Illegal; + + // Does this use the stack? + if (MI.modifiesRegister(AArch64::SP, &RI) || + MI.readsRegister(AArch64::SP, &RI)) { + + // Is it a memory operation? + if (MI.mayLoadOrStore()) { + unsigned Base; // Filled with the base regiser of MI. + int64_t Offset; // Filled with the offset of MI. + unsigned DummyWidth; + + // Does it allow us to offset the base register and is the base SP? + if (!getMemOpBaseRegImmOfsWidth(MI, Base, Offset, DummyWidth, &RI) || + Base != AArch64::SP) + return MachineOutlinerInstrType::Illegal; + + // Find the minimum/maximum offset for this instruction and check if + // fixing it up would be in range. + int64_t MinOffset, MaxOffset; + unsigned DummyScale; + getMemOpInfo(MI.getOpcode(), DummyScale, DummyWidth, MinOffset, + MaxOffset); + + // TODO: We should really test what happens if an instruction overflows. + // This is tricky to test with IR tests, but when the outliner is moved + // to a MIR test, it really ought to be checked. + if (Offset + 16 < MinOffset || Offset + 16 > MaxOffset) + return MachineOutlinerInstrType::Illegal; + + // It's in range, so we can outline it. + return MachineOutlinerInstrType::Legal; + } + + // We can't fix it up, so don't outline it. + return MachineOutlinerInstrType::Illegal; + } + + return MachineOutlinerInstrType::Legal; +} + +void AArch64InstrInfo::fixupPostOutline(MachineBasicBlock &MBB) const { + for (MachineInstr &MI : MBB) { + unsigned Base, Width; + int64_t Offset; + + // Is this a load or store with an immediate offset with SP as the base? + if (!MI.mayLoadOrStore() || + !getMemOpBaseRegImmOfsWidth(MI, Base, Offset, Width, &RI) || + Base != AArch64::SP) + continue; + + // It is, so we have to fix it up. 
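
The benefit computation above is pure instruction counting, and plugging in numbers shows why tail calls matter. As a worked example (values invented for illustration): SequenceSize = 5 repeated Occurrences = 3 times gives NotOutlinedSize = 15; outlined as a tail call, OutlinedSize = 5 + 3 = 8, a saving of 7 instructions; outlined as a regular call, OutlinedSize = (5 + 1) + 3 * 3 = 15, a saving of 0.

    // The regular-call case breaks even here: save/restore + call overhead
    // (3 per occurrence) plus the added return eat the whole gain.
    static_assert((5 + 1) + 3 * 3 == 5 * 3,
                  "regular-call outlining only breaks even in this example");
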
+ unsigned Scale; + int64_t Dummy1, Dummy2; + + MachineOperand &StackOffsetOperand = getMemOpBaseRegImmOfsOffsetOperand(MI); + assert(StackOffsetOperand.isImm() && "Stack offset wasn't immediate!"); + getMemOpInfo(MI.getOpcode(), Scale, Width, Dummy1, Dummy2); + assert(Scale != 0 && "Unexpected opcode!"); + + // We've pushed the return address to the stack, so add 16 to the offset. + // This is safe, since we already checked if it would overflow when we + // checked if this instruction was legal to outline. + int64_t NewImm = (Offset + 16)/Scale; + StackOffsetOperand.setImm(NewImm); + } +} + +void AArch64InstrInfo::insertOutlinerEpilogue(MachineBasicBlock &MBB, + MachineFunction &MF, + bool IsTailCall) const { + + // If this is a tail call outlined function, then there's already a return. + if (IsTailCall) + return; + + // It's not a tail call, so we have to insert the return ourselves. + MachineInstr *ret = BuildMI(MF, DebugLoc(), get(AArch64::RET)) + .addReg(AArch64::LR, RegState::Undef); + MBB.insert(MBB.end(), ret); + + // Walk over the basic block and fix up all the stack accesses. + fixupPostOutline(MBB); +} + +void AArch64InstrInfo::insertOutlinerPrologue(MachineBasicBlock &MBB, + MachineFunction &MF, + bool IsTailCall) const {} + +MachineBasicBlock::iterator AArch64InstrInfo::insertOutlinedCall( + Module &M, MachineBasicBlock &MBB, MachineBasicBlock::iterator &It, + MachineFunction &MF, bool IsTailCall) const { + + // Are we tail calling? + if (IsTailCall) { + // If yes, then we can just branch to the label. + It = MBB.insert(It, + BuildMI(MF, DebugLoc(), get(AArch64::B)) + .addGlobalAddress(M.getNamedValue(MF.getName()))); + return It; + } + + // We're not tail calling, so we have to save LR before the call and restore + // it after. + MachineInstr *STRXpre = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre)) + .addReg(AArch64::SP, RegState::Define) + .addReg(AArch64::LR) + .addReg(AArch64::SP) + .addImm(-16); + It = MBB.insert(It, STRXpre); + It++; + + // Insert the call. + It = MBB.insert(It, + BuildMI(MF, DebugLoc(), get(AArch64::BL)) + .addGlobalAddress(M.getNamedValue(MF.getName()))); + + It++; + + // Restore the link register. + MachineInstr *LDRXpost = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost)) + .addReg(AArch64::SP, RegState::Define) + .addReg(AArch64::LR) + .addReg(AArch64::SP) + .addImm(16); + It = MBB.insert(It, LDRXpost); + + return It; +} + diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.h index 5037866925d3..bacce441f6c5 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.h +++ b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.h @@ -133,12 +133,19 @@ public: int64_t &Offset, unsigned &Width, const TargetRegisterInfo *TRI) const; + /// Return the immediate offset of the base register in a load/store \p LdSt. + MachineOperand &getMemOpBaseRegImmOfsOffsetOperand(MachineInstr &LdSt) const; + + /// \brief Returns true if opcode \p Opc is a memory operation. If it is, set + /// \p Scale, \p Width, \p MinOffset, and \p MaxOffset accordingly. + /// + /// For unscaled instructions, \p Scale is set to 1. 
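
insertOutlinedCall above brackets every non-tail call with a 16-byte LR spill, which is also why fixupPostOutline adds 16 to each SP-relative offset inside the outlined body. At each call site the STRXpre/BL/LDRXpost triple amounts to the following (assembly shown as comments for reference):

    // str  x30, [sp, #-16]!       // STRXpre: push LR, SP -= 16
    // bl   <outlined function>    // BL to the function named after MF
    // ldr  x30, [sp], #16         // LDRXpost: pop LR, SP += 16
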
+ bool getMemOpInfo(unsigned Opcode, unsigned &Scale, unsigned &Width, + int64_t &MinOffset, int64_t &MaxOffset) const; + bool shouldClusterMemOps(MachineInstr &FirstLdSt, MachineInstr &SecondLdSt, unsigned NumLoads) const override; - bool shouldScheduleAdjacent(const MachineInstr &First, - const MachineInstr &Second) const override; - MachineInstr *emitFrameIndexDebugValue(MachineFunction &MF, int FrameIx, uint64_t Offset, const MDNode *Var, const MDNode *Expr, @@ -245,7 +252,33 @@ public: ArrayRef<std::pair<unsigned, const char *>> getSerializableBitmaskMachineOperandTargetFlags() const override; + bool isFunctionSafeToOutlineFrom(MachineFunction &MF) const override; + unsigned getOutliningBenefit(size_t SequenceSize, size_t Occurrences, + bool CanBeTailCall) const override; + AArch64GenInstrInfo::MachineOutlinerInstrType + getOutliningType(MachineInstr &MI) const override; + void insertOutlinerEpilogue(MachineBasicBlock &MBB, + MachineFunction &MF, + bool IsTailCall) const override; + void insertOutlinerPrologue(MachineBasicBlock &MBB, + MachineFunction &MF, + bool isTailCall) const override; + MachineBasicBlock::iterator + insertOutlinedCall(Module &M, MachineBasicBlock &MBB, + MachineBasicBlock::iterator &It, + MachineFunction &MF, + bool IsTailCall) const override; + /// Returns true if the instruction has a shift by immediate that can be + /// executed in one cycle less. + bool isFalkorLSLFast(const MachineInstr &MI) const; private: + + /// \brief Sets the offsets on outlined instructions in \p MBB which use SP + /// so that they will be valid post-outlining. + /// + /// \param MBB A \p MachineBasicBlock in an outlined function. + void fixupPostOutline(MachineBasicBlock &MBB) const; + void instantiateCondBranch(MachineBasicBlock &MBB, const DebugLoc &DL, MachineBasicBlock *TBB, ArrayRef<MachineOperand> Cond) const; diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 2244baacca17..4449412532f3 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -30,6 +30,8 @@ def HasLSE : Predicate<"Subtarget->hasLSE()">, AssemblerPredicate<"FeatureLSE", "lse">; def HasRAS : Predicate<"Subtarget->hasRAS()">, AssemblerPredicate<"FeatureRAS", "ras">; +def HasRDM : Predicate<"Subtarget->hasRDM()">, + AssemblerPredicate<"FeatureRDM", "rdm">; def HasPerfMon : Predicate<"Subtarget->hasPerfMon()">; def HasFullFP16 : Predicate<"Subtarget->hasFullFP16()">, AssemblerPredicate<"FeatureFullFP16", "fullfp16">; @@ -41,6 +43,11 @@ def IsBE : Predicate<"!Subtarget->isLittleEndian()">; def UseAlternateSExtLoadCVTF32 : Predicate<"Subtarget->useAlternateSExtLoadCVTF32Pattern()">; +def UseNegativeImmediates + : Predicate<"false">, AssemblerPredicate<"!FeatureNoNegativeImmediates", + "NegativeImmediates">; + + //===----------------------------------------------------------------------===// // AArch64-specific DAG Nodes. // @@ -424,8 +431,10 @@ def MSRpstateImm1 : MSRpstateImm0_1; def MSRpstateImm4 : MSRpstateImm0_15; // The thread pointer (on Linux, at least, where this has been implemented) is -// TPIDR_EL0. -def : Pat<(AArch64threadpointer), (MRS 0xde82)>; +// TPIDR_EL0. Add pseudo op so we can mark it as not having any side effects. +let hasSideEffects = 0 in +def MOVbaseTLS : Pseudo<(outs GPR64:$dst), (ins), + [(set GPR64:$dst, AArch64threadpointer)]>, Sched<[]>; // The cycle counter PMC register is PMCCNTR_EL0. 
let Predicates = [HasPerfMon] in @@ -574,31 +583,31 @@ def : Pat<(f64 fpimm:$in), // sequences. def : Pat<(AArch64WrapperLarge tglobaladdr:$g3, tglobaladdr:$g2, tglobaladdr:$g1, tglobaladdr:$g0), - (MOVKXi (MOVKXi (MOVKXi (MOVZXi tglobaladdr:$g3, 48), - tglobaladdr:$g2, 32), - tglobaladdr:$g1, 16), - tglobaladdr:$g0, 0)>; + (MOVKXi (MOVKXi (MOVKXi (MOVZXi tglobaladdr:$g0, 0), + tglobaladdr:$g1, 16), + tglobaladdr:$g2, 32), + tglobaladdr:$g3, 48)>; def : Pat<(AArch64WrapperLarge tblockaddress:$g3, tblockaddress:$g2, tblockaddress:$g1, tblockaddress:$g0), - (MOVKXi (MOVKXi (MOVKXi (MOVZXi tblockaddress:$g3, 48), - tblockaddress:$g2, 32), - tblockaddress:$g1, 16), - tblockaddress:$g0, 0)>; + (MOVKXi (MOVKXi (MOVKXi (MOVZXi tblockaddress:$g0, 0), + tblockaddress:$g1, 16), + tblockaddress:$g2, 32), + tblockaddress:$g3, 48)>; def : Pat<(AArch64WrapperLarge tconstpool:$g3, tconstpool:$g2, tconstpool:$g1, tconstpool:$g0), - (MOVKXi (MOVKXi (MOVKXi (MOVZXi tconstpool:$g3, 48), - tconstpool:$g2, 32), - tconstpool:$g1, 16), - tconstpool:$g0, 0)>; + (MOVKXi (MOVKXi (MOVKXi (MOVZXi tconstpool:$g0, 0), + tconstpool:$g1, 16), + tconstpool:$g2, 32), + tconstpool:$g3, 48)>; def : Pat<(AArch64WrapperLarge tjumptable:$g3, tjumptable:$g2, tjumptable:$g1, tjumptable:$g0), - (MOVKXi (MOVKXi (MOVKXi (MOVZXi tjumptable:$g3, 48), - tjumptable:$g2, 32), - tjumptable:$g1, 16), - tjumptable:$g0, 0)>; + (MOVKXi (MOVKXi (MOVKXi (MOVZXi tjumptable:$g0, 0), + tjumptable:$g1, 16), + tjumptable:$g2, 32), + tjumptable:$g3, 48)>; //===----------------------------------------------------------------------===// @@ -3284,7 +3293,7 @@ defm UQSHL : SIMDThreeScalarBHSD<1, 0b01001, "uqshl", int_aarch64_neon_uqshl> defm UQSUB : SIMDThreeScalarBHSD<1, 0b00101, "uqsub", int_aarch64_neon_uqsub>; defm URSHL : SIMDThreeScalarD< 1, 0b01010, "urshl", int_aarch64_neon_urshl>; defm USHL : SIMDThreeScalarD< 1, 0b01000, "ushl", int_aarch64_neon_ushl>; -let Predicates = [HasV8_1a] in { +let Predicates = [HasRDM] in { defm SQRDMLAH : SIMDThreeScalarHSTied<1, 0, 0b10000, "sqrdmlah">; defm SQRDMLSH : SIMDThreeScalarHSTied<1, 0, 0b10001, "sqrdmlsh">; def : Pat<(i32 (int_aarch64_neon_sqadd @@ -5029,7 +5038,7 @@ class SExtLoadi16CVTf64Pat<dag addrmode, dag INST> 0), dsub)))>, Requires<[NotForCodeSize, UseAlternateSExtLoadCVTF32]>; - + def : SExtLoadi16CVTf64Pat<(ro16.Wpat GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext), (LDRHroW GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext)>; def : SExtLoadi16CVTf64Pat<(ro16.Xpat GPR64sp:$Rn, GPR64:$Rm, ro16.Xext:$ext), diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp b/contrib/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp index b51473524c72..878dac6bff1e 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp @@ -12,17 +12,19 @@ /// \todo This should be generated by TableGen. 
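Stepping back to the AArch64WrapperLarge rewrite above: the patterns now build the constant from the bottom up, which the following self-contained C++ mirrors (MOVK inserts one 16-bit quarter and leaves the rest intact):

#include <cstdint>

// Emulates the emitted sequence: MOVZ materializes the low 16 bits and zeroes
// the rest; each MOVK patches in one higher quarter at shift 16, 32, or 48.
uint64_t materializeLarge(uint16_t G0, uint16_t G1, uint16_t G2, uint16_t G3) {
  uint64_t X = G0;                                      // MOVZ Xd, G0, lsl #0
  X = (X & ~(0xffffULL << 16)) | ((uint64_t)G1 << 16);  // MOVK Xd, G1, lsl #16
  X = (X & ~(0xffffULL << 32)) | ((uint64_t)G2 << 32);  // MOVK Xd, G2, lsl #32
  X = (X & ~(0xffffULL << 48)) | ((uint64_t)G3 << 48);  // MOVK Xd, G3, lsl #48
  return X;
}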
//===----------------------------------------------------------------------===// -#include "AArch64InstructionSelector.h" #include "AArch64InstrInfo.h" +#include "AArch64MachineFunctionInfo.h" #include "AArch64RegisterBankInfo.h" #include "AArch64RegisterInfo.h" #include "AArch64Subtarget.h" #include "AArch64TargetMachine.h" #include "MCTargetDesc/AArch64AddressingModes.h" +#include "llvm/CodeGen/GlobalISel/InstructionSelector.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/IR/Type.h" #include "llvm/Support/Debug.h" @@ -36,13 +38,61 @@ using namespace llvm; #error "You shouldn't build this" #endif +namespace { + +class AArch64InstructionSelector : public InstructionSelector { +public: + AArch64InstructionSelector(const AArch64TargetMachine &TM, + const AArch64Subtarget &STI, + const AArch64RegisterBankInfo &RBI); + + bool select(MachineInstr &I) const override; + +private: + /// tblgen-erated 'select' implementation, used as the initial selector for + /// the patterns that don't require complex C++. + bool selectImpl(MachineInstr &I) const; + + bool selectVaStartAAPCS(MachineInstr &I, MachineFunction &MF, + MachineRegisterInfo &MRI) const; + bool selectVaStartDarwin(MachineInstr &I, MachineFunction &MF, + MachineRegisterInfo &MRI) const; + + bool selectCompareBranch(MachineInstr &I, MachineFunction &MF, + MachineRegisterInfo &MRI) const; + + bool selectArithImmed(MachineOperand &Root, MachineOperand &Result1, + MachineOperand &Result2) const; + + const AArch64TargetMachine &TM; + const AArch64Subtarget &STI; + const AArch64InstrInfo &TII; + const AArch64RegisterInfo &TRI; + const AArch64RegisterBankInfo &RBI; + +// We declare the temporaries used by selectImpl() in the class to minimize the +// cost of constructing placeholder values. +#define GET_GLOBALISEL_TEMPORARIES_DECL +#include "AArch64GenGlobalISel.inc" +#undef GET_GLOBALISEL_TEMPORARIES_DECL +}; + +} // end anonymous namespace + +#define GET_GLOBALISEL_IMPL #include "AArch64GenGlobalISel.inc" +#undef GET_GLOBALISEL_IMPL AArch64InstructionSelector::AArch64InstructionSelector( const AArch64TargetMachine &TM, const AArch64Subtarget &STI, const AArch64RegisterBankInfo &RBI) - : InstructionSelector(), TM(TM), STI(STI), TII(*STI.getInstrInfo()), - TRI(*STI.getRegisterInfo()), RBI(RBI) {} + : InstructionSelector(), TM(TM), STI(STI), TII(*STI.getInstrInfo()), + TRI(*STI.getRegisterInfo()), RBI(RBI) +#define GET_GLOBALISEL_TEMPORARIES_INIT +#include "AArch64GenGlobalISel.inc" +#undef GET_GLOBALISEL_TEMPORARIES_INIT +{ +} // FIXME: This should be target-independent, inferred from the types declared // for each class in the bank. @@ -119,67 +169,34 @@ static bool unsupportedBinOp(const MachineInstr &I, } /// Select the AArch64 opcode for the basic binary operation \p GenericOpc -/// (such as G_OR or G_ADD), appropriate for the register bank \p RegBankID +/// (such as G_OR or G_SDIV), appropriate for the register bank \p RegBankID /// and of size \p OpSize. /// \returns \p GenericOpc if the combination is unsupported. 
static unsigned selectBinaryOp(unsigned GenericOpc, unsigned RegBankID, unsigned OpSize) { switch (RegBankID) { case AArch64::GPRRegBankID: - if (OpSize <= 32) { - assert((OpSize == 32 || (GenericOpc != TargetOpcode::G_SDIV && - GenericOpc != TargetOpcode::G_UDIV && - GenericOpc != TargetOpcode::G_LSHR && - GenericOpc != TargetOpcode::G_ASHR)) && - "operation should have been legalized before now"); - + if (OpSize == 32) { switch (GenericOpc) { - case TargetOpcode::G_OR: - return AArch64::ORRWrr; - case TargetOpcode::G_XOR: - return AArch64::EORWrr; - case TargetOpcode::G_AND: - return AArch64::ANDWrr; - case TargetOpcode::G_ADD: - assert(OpSize != 32 && "s32 G_ADD should have been selected"); - return AArch64::ADDWrr; - case TargetOpcode::G_SUB: - return AArch64::SUBWrr; case TargetOpcode::G_SHL: return AArch64::LSLVWr; case TargetOpcode::G_LSHR: return AArch64::LSRVWr; case TargetOpcode::G_ASHR: return AArch64::ASRVWr; - case TargetOpcode::G_SDIV: - return AArch64::SDIVWr; - case TargetOpcode::G_UDIV: - return AArch64::UDIVWr; default: return GenericOpc; } } else if (OpSize == 64) { switch (GenericOpc) { - case TargetOpcode::G_OR: - return AArch64::ORRXrr; - case TargetOpcode::G_XOR: - return AArch64::EORXrr; - case TargetOpcode::G_AND: - return AArch64::ANDXrr; case TargetOpcode::G_GEP: return AArch64::ADDXrr; - case TargetOpcode::G_SUB: - return AArch64::SUBXrr; case TargetOpcode::G_SHL: return AArch64::LSLVXr; case TargetOpcode::G_LSHR: return AArch64::LSRVXr; case TargetOpcode::G_ASHR: return AArch64::ASRVXr; - case TargetOpcode::G_SDIV: - return AArch64::SDIVXr; - case TargetOpcode::G_UDIV: - return AArch64::UDIVXr; default: return GenericOpc; } @@ -473,6 +490,82 @@ static void changeFCMPPredToAArch64CC(CmpInst::Predicate P, } } +bool AArch64InstructionSelector::selectCompareBranch( + MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const { + + const unsigned CondReg = I.getOperand(0).getReg(); + MachineBasicBlock *DestMBB = I.getOperand(1).getMBB(); + MachineInstr *CCMI = MRI.getVRegDef(CondReg); + if (CCMI->getOpcode() != TargetOpcode::G_ICMP) + return false; + + unsigned LHS = CCMI->getOperand(2).getReg(); + unsigned RHS = CCMI->getOperand(3).getReg(); + if (!getConstantVRegVal(RHS, MRI)) + std::swap(RHS, LHS); + + const auto RHSImm = getConstantVRegVal(RHS, MRI); + if (!RHSImm || *RHSImm != 0) + return false; + + const RegisterBank &RB = *RBI.getRegBank(LHS, MRI, TRI); + if (RB.getID() != AArch64::GPRRegBankID) + return false; + + const auto Pred = (CmpInst::Predicate)CCMI->getOperand(1).getPredicate(); + if (Pred != CmpInst::ICMP_NE && Pred != CmpInst::ICMP_EQ) + return false; + + const unsigned CmpWidth = MRI.getType(LHS).getSizeInBits(); + unsigned CBOpc = 0; + if (CmpWidth <= 32) + CBOpc = (Pred == CmpInst::ICMP_EQ ? AArch64::CBZW : AArch64::CBNZW); + else if (CmpWidth == 64) + CBOpc = (Pred == CmpInst::ICMP_EQ ? 
AArch64::CBZX : AArch64::CBNZX); + else + return false; + + auto MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(CBOpc)) + .addUse(LHS) + .addMBB(DestMBB); + + constrainSelectedInstRegOperands(*MIB.getInstr(), TII, TRI, RBI); + I.eraseFromParent(); + return true; +} + +bool AArch64InstructionSelector::selectVaStartAAPCS( + MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const { + return false; +} + +bool AArch64InstructionSelector::selectVaStartDarwin( + MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const { + AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); + unsigned ListReg = I.getOperand(0).getReg(); + + unsigned ArgsAddrReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass); + + auto MIB = + BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::ADDXri)) + .addDef(ArgsAddrReg) + .addFrameIndex(FuncInfo->getVarArgsStackIndex()) + .addImm(0) + .addImm(0); + + constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); + + MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::STRXui)) + .addUse(ArgsAddrReg) + .addUse(ListReg) + .addImm(0) + .addMemOperand(*I.memoperands_begin()); + + constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); + I.eraseFromParent(); + return true; +} + bool AArch64InstructionSelector::select(MachineInstr &I) const { assert(I.getParent() && "Instruction should be in a basic block!"); assert(I.getParent()->getParent() && "Instruction should be in a function!"); @@ -549,6 +642,9 @@ bool AArch64InstructionSelector::select(MachineInstr &I) const { const unsigned CondReg = I.getOperand(0).getReg(); MachineBasicBlock *DestMBB = I.getOperand(1).getMBB(); + if (selectCompareBranch(I, MF, MRI)) + return true; + auto MIB = BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::TBNZW)) .addUse(CondReg) .addImm(/*bit offset=*/0) @@ -558,6 +654,11 @@ bool AArch64InstructionSelector::select(MachineInstr &I) const { return constrainSelectedInstRegOperands(*MIB.getInstr(), TII, TRI, RBI); } + case TargetOpcode::G_BRINDIRECT: { + I.setDesc(TII.get(AArch64::BR)); + return constrainSelectedInstRegOperands(I, TII, TRI, RBI); + } + case TargetOpcode::G_FCONSTANT: case TargetOpcode::G_CONSTANT: { const bool isFP = Opcode == TargetOpcode::G_FCONSTANT; @@ -629,9 +730,12 @@ bool AArch64InstructionSelector::select(MachineInstr &I) const { // FIXME: Is going through int64_t always correct? ImmOp.ChangeToImmediate( ImmOp.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue()); - } else { + } else if (I.getOperand(1).isCImm()) { uint64_t Val = I.getOperand(1).getCImm()->getZExtValue(); I.getOperand(1).ChangeToImmediate(Val); + } else if (I.getOperand(1).isImm()) { + uint64_t Val = I.getOperand(1).getImm(); + I.getOperand(1).ChangeToImmediate(Val); } constrainSelectedInstRegOperands(I, TII, TRI, RBI); @@ -686,10 +790,16 @@ bool AArch64InstructionSelector::select(MachineInstr &I) const { return false; } -#ifndef NDEBUG - // Sanity-check the pointer register. + auto &MemOp = **I.memoperands_begin(); + if (MemOp.getOrdering() != AtomicOrdering::NotAtomic) { + DEBUG(dbgs() << "Atomic load/store not supported yet\n"); + return false; + } + const unsigned PtrReg = I.getOperand(1).getReg(); +#ifndef NDEBUG const RegisterBank &PtrRB = *RBI.getRegBank(PtrReg, MRI, TRI); + // Sanity-check the pointer register. 
assert(PtrRB.getID() == AArch64::GPRRegBankID && "Load/Store pointer operand isn't a GPR"); assert(MRI.getType(PtrReg).isPointer() && @@ -706,11 +816,46 @@ bool AArch64InstructionSelector::select(MachineInstr &I) const { I.setDesc(TII.get(NewOpc)); - I.addOperand(MachineOperand::CreateImm(0)); + uint64_t Offset = 0; + auto *PtrMI = MRI.getVRegDef(PtrReg); + + // Try to fold a GEP into our unsigned immediate addressing mode. + if (PtrMI->getOpcode() == TargetOpcode::G_GEP) { + if (auto COff = getConstantVRegVal(PtrMI->getOperand(2).getReg(), MRI)) { + int64_t Imm = *COff; + const unsigned Size = MemTy.getSizeInBits() / 8; + const unsigned Scale = Log2_32(Size); + if ((Imm & (Size - 1)) == 0 && Imm >= 0 && Imm < (0x1000 << Scale)) { + unsigned Ptr2Reg = PtrMI->getOperand(1).getReg(); + I.getOperand(1).setReg(Ptr2Reg); + PtrMI = MRI.getVRegDef(Ptr2Reg); + Offset = Imm / Size; + } + } + } + + // If we haven't folded anything into our addressing mode yet, try to fold + // a frame index into the base+offset. + if (!Offset && PtrMI->getOpcode() == TargetOpcode::G_FRAME_INDEX) + I.getOperand(1).ChangeToFrameIndex(PtrMI->getOperand(1).getIndex()); + + I.addOperand(MachineOperand::CreateImm(Offset)); + + // If we're storing a 0, use WZR/XZR. + if (auto CVal = getConstantVRegVal(ValReg, MRI)) { + if (*CVal == 0 && Opcode == TargetOpcode::G_STORE) { + if (I.getOpcode() == AArch64::STRWui) + I.getOperand(0).setReg(AArch64::WZR); + else if (I.getOpcode() == AArch64::STRXui) + I.getOperand(0).setReg(AArch64::XZR); + } + } + return constrainSelectedInstRegOperands(I, TII, TRI, RBI); } - case TargetOpcode::G_MUL: { + case TargetOpcode::G_SMULH: + case TargetOpcode::G_UMULH: { // Reject the various things we don't support yet. if (unsupportedBinOp(I, RBI, MRI, TRI)) return false; @@ -719,48 +864,33 @@ bool AArch64InstructionSelector::select(MachineInstr &I) const { const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI); if (RB.getID() != AArch64::GPRRegBankID) { - DEBUG(dbgs() << "G_MUL on bank: " << RB << ", expected: GPR\n"); + DEBUG(dbgs() << "G_[SU]MULH on bank: " << RB << ", expected: GPR\n"); return false; } - unsigned ZeroReg; - unsigned NewOpc; - if (Ty.isScalar() && Ty.getSizeInBits() <= 32) { - NewOpc = AArch64::MADDWrrr; - ZeroReg = AArch64::WZR; - } else if (Ty == LLT::scalar(64)) { - NewOpc = AArch64::MADDXrrr; - ZeroReg = AArch64::XZR; - } else { - DEBUG(dbgs() << "G_MUL has type: " << Ty << ", expected: " - << LLT::scalar(32) << " or " << LLT::scalar(64) << '\n'); + if (Ty != LLT::scalar(64)) { + DEBUG(dbgs() << "G_[SU]MULH has type: " << Ty + << ", expected: " << LLT::scalar(64) << '\n'); return false; } + unsigned NewOpc = I.getOpcode() == TargetOpcode::G_SMULH ? AArch64::SMULHrr + : AArch64::UMULHrr; I.setDesc(TII.get(NewOpc)); - I.addOperand(MachineOperand::CreateReg(ZeroReg, /*isDef=*/false)); - // Now that we selected an opcode, we need to constrain the register // operands to use appropriate classes. return constrainSelectedInstRegOperands(I, TII, TRI, RBI); } - case TargetOpcode::G_FADD: case TargetOpcode::G_FSUB: case TargetOpcode::G_FMUL: case TargetOpcode::G_FDIV: case TargetOpcode::G_OR: - case TargetOpcode::G_XOR: - case TargetOpcode::G_AND: case TargetOpcode::G_SHL: case TargetOpcode::G_LSHR: case TargetOpcode::G_ASHR: - case TargetOpcode::G_SDIV: - case TargetOpcode::G_UDIV: - case TargetOpcode::G_ADD: - case TargetOpcode::G_SUB: case TargetOpcode::G_GEP: { // Reject the various things we don't support yet. 
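Returning to the G_GEP fold a few hunks above: the offset test it performs can be restated in a self-contained form (plain C++, not the LLVM API):

#include <cstdint>

// A byte offset Imm folds into the scaled unsigned-immediate addressing mode
// when it is non-negative, a multiple of the access size, and its scaled
// value fits the 12-bit field; the stored immediate is then Imm / Size.
bool foldsIntoUImm12(int64_t Imm, unsigned SizeInBytes) {
  unsigned Scale = 0;
  while ((1u << (Scale + 1)) <= SizeInBytes) // i.e., Log2_32(SizeInBytes)
    ++Scale;
  return (Imm & (SizeInBytes - 1)) == 0 && Imm >= 0 &&
         Imm < ((int64_t)0x1000 << Scale); // 4096 scaled slots
}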
if (unsupportedBinOp(I, RBI, MRI, TRI))
@@ -783,6 +913,17 @@ bool AArch64InstructionSelector::select(MachineInstr &I) const {
 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
 }
+ case TargetOpcode::G_PTR_MASK: {
+ uint64_t Align = I.getOperand(2).getImm();
+ if (Align >= 64 || Align == 0)
+ return false;
+
+ uint64_t Mask = ~((1ULL << Align) - 1);
+ I.setDesc(TII.get(AArch64::ANDXri));
+ I.getOperand(2).setImm(AArch64_AM::encodeLogicalImmediate(Mask, 64));
+
+ return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+ }
 case TargetOpcode::G_PTRTOINT:
 case TargetOpcode::G_TRUNC: {
 const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
@@ -1026,7 +1167,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I) const {
 if (Ty == LLT::scalar(32)) {
 CSelOpc = AArch64::CSELWr;
- } else if (Ty == LLT::scalar(64)) {
+ } else if (Ty == LLT::scalar(64) || Ty == LLT::pointer(0, 64)) {
 CSelOpc = AArch64::CSELXr;
 } else {
 return false;
 }
@@ -1134,7 +1275,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I) const {
 .addDef(Def1Reg)
 .addUse(AArch64::WZR)
 .addUse(AArch64::WZR)
- .addImm(CC1);
+ .addImm(getInvertedCondCode(CC1));
 if (CC2 != AArch64CC::AL) {
 unsigned Def2Reg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
@@ -1143,7 +1284,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I) const {
 .addDef(Def2Reg)
 .addUse(AArch64::WZR)
 .addUse(AArch64::WZR)
- .addImm(CC2);
+ .addImm(getInvertedCondCode(CC2));
 MachineInstr &OrMI =
 *BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::ORRWrr))
 .addDef(DefReg)
@@ -1159,7 +1300,69 @@ bool AArch64InstructionSelector::select(MachineInstr &I) const {
 I.eraseFromParent();
 return true;
 }
+ case TargetOpcode::G_VASTART:
+ return STI.isTargetDarwin() ? selectVaStartDarwin(I, MF, MRI)
+ : selectVaStartAAPCS(I, MF, MRI);
 }
 return false;
 }
+
+/// SelectArithImmed - Select an immediate value that can be represented as
+/// a 12-bit value shifted left by either 0 or 12. If so, return true with
+/// Val set to the 12-bit value and Shift set to the shifter operand.
+bool AArch64InstructionSelector::selectArithImmed(
+ MachineOperand &Root, MachineOperand &Result1,
+ MachineOperand &Result2) const {
+ MachineInstr &MI = *Root.getParent();
+ MachineBasicBlock &MBB = *MI.getParent();
+ MachineFunction &MF = *MBB.getParent();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ // This function is called from the addsub_shifted_imm ComplexPattern,
+ // which lists [imm] as the list of opcodes it's interested in; however,
+ // we still need to check whether the operand is actually an immediate
+ // here because the ComplexPattern opcode list is only used in
+ // root-level opcode matching.
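The immediate check described in that comment reduces to the following self-contained sketch (std::optional stands in for the true/false-plus-outparams convention used by the selector):

#include <cstdint>
#include <optional>

// An add/sub immediate is encodable iff it is a 12-bit value, optionally
// shifted left by 12; the pair (value, shift) feeds the shifter operand.
struct ArithImm { uint64_t Val; unsigned Shift; };

std::optional<ArithImm> splitArithImmed(uint64_t Immed) {
  if (Immed >> 12 == 0)
    return ArithImm{Immed, 0};            // fits directly
  if ((Immed & 0xfff) == 0 && Immed >> 24 == 0)
    return ArithImm{Immed >> 12, 12};     // fits with LSL #12
  return std::nullopt;                    // not encodable this way
}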
+ uint64_t Immed; + if (Root.isImm()) + Immed = Root.getImm(); + else if (Root.isCImm()) + Immed = Root.getCImm()->getZExtValue(); + else if (Root.isReg()) { + MachineInstr *Def = MRI.getVRegDef(Root.getReg()); + if (Def->getOpcode() != TargetOpcode::G_CONSTANT) + return false; + MachineOperand &Op1 = Def->getOperand(1); + if (!Op1.isCImm() || Op1.getCImm()->getBitWidth() > 64) + return false; + Immed = Op1.getCImm()->getZExtValue(); + } else + return false; + + unsigned ShiftAmt; + + if (Immed >> 12 == 0) { + ShiftAmt = 0; + } else if ((Immed & 0xfff) == 0 && Immed >> 24 == 0) { + ShiftAmt = 12; + Immed = Immed >> 12; + } else + return false; + + unsigned ShVal = AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt); + Result1.ChangeToImmediate(Immed); + Result1.clearParent(); + Result2.ChangeToImmediate(ShVal); + Result2.clearParent(); + return true; +} + +namespace llvm { +InstructionSelector * +createAArch64InstructionSelector(const AArch64TargetMachine &TM, + AArch64Subtarget &Subtarget, + AArch64RegisterBankInfo &RBI) { + return new AArch64InstructionSelector(TM, Subtarget, RBI); +} +} diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstructionSelector.h b/contrib/llvm/lib/Target/AArch64/AArch64InstructionSelector.h deleted file mode 100644 index 2c6e5a912fb7..000000000000 --- a/contrib/llvm/lib/Target/AArch64/AArch64InstructionSelector.h +++ /dev/null @@ -1,49 +0,0 @@ -//===- AArch64InstructionSelector --------------------------------*- C++ -*-==// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -/// \file -/// This file declares the targeting of the InstructionSelector class for -/// AArch64. -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64INSTRUCTIONSELECTOR_H -#define LLVM_LIB_TARGET_AARCH64_AARCH64INSTRUCTIONSELECTOR_H - -#include "llvm/CodeGen/GlobalISel/InstructionSelector.h" - -namespace llvm { - -class AArch64InstrInfo; -class AArch64RegisterBankInfo; -class AArch64RegisterInfo; -class AArch64Subtarget; -class AArch64TargetMachine; - -class AArch64InstructionSelector : public InstructionSelector { -public: - AArch64InstructionSelector(const AArch64TargetMachine &TM, - const AArch64Subtarget &STI, - const AArch64RegisterBankInfo &RBI); - - bool select(MachineInstr &I) const override; - -private: - /// tblgen-erated 'select' implementation, used as the initial selector for - /// the patterns that don't require complex C++. 
- bool selectImpl(MachineInstr &I) const; - - const AArch64TargetMachine &TM; - const AArch64Subtarget &STI; - const AArch64InstrInfo &TII; - const AArch64RegisterInfo &TRI; - const AArch64RegisterBankInfo &RBI; -}; - -} // end namespace llvm - -#endif // LLVM_LIB_TARGET_AARCH64_AARCH64INSTRUCTIONSELECTOR_H diff --git a/contrib/llvm/lib/Target/AArch64/AArch64LegalizerInfo.cpp b/contrib/llvm/lib/Target/AArch64/AArch64LegalizerInfo.cpp index 83f276a8161b..6e6daf812295 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64LegalizerInfo.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64LegalizerInfo.cpp @@ -13,7 +13,10 @@ //===----------------------------------------------------------------------===// #include "AArch64LegalizerInfo.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/ValueTypes.h" +#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" #include "llvm/IR/Type.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/Target/TargetOpcodes.h" @@ -36,11 +39,14 @@ AArch64LegalizerInfo::AArch64LegalizerInfo() { const LLT v4s32 = LLT::vector(4, 32); const LLT v2s64 = LLT::vector(2, 64); - for (auto BinOp : {G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR, G_SHL}) { + for (unsigned BinOp : {G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR, G_SHL}) { // These operations naturally get the right answer when used on // GPR32, even if the actual type is narrower. - for (auto Ty : {s1, s8, s16, s32, s64, v2s32, v4s32, v2s64}) + for (auto Ty : {s32, s64, v2s32, v4s32, v2s64}) setAction({BinOp, Ty}, Legal); + + for (auto Ty : {s1, s8, s16}) + setAction({BinOp, Ty}, WidenScalar); } setAction({G_GEP, p0}, Legal); @@ -49,7 +55,9 @@ AArch64LegalizerInfo::AArch64LegalizerInfo() { for (auto Ty : {s1, s8, s16, s32}) setAction({G_GEP, 1, Ty}, WidenScalar); - for (auto BinOp : {G_LSHR, G_ASHR, G_SDIV, G_UDIV}) { + setAction({G_PTR_MASK, p0}, Legal); + + for (unsigned BinOp : {G_LSHR, G_ASHR, G_SDIV, G_UDIV}) { for (auto Ty : {s32, s64}) setAction({BinOp, Ty}, Legal); @@ -57,25 +65,41 @@ AArch64LegalizerInfo::AArch64LegalizerInfo() { setAction({BinOp, Ty}, WidenScalar); } - for (auto BinOp : { G_SREM, G_UREM }) + for (unsigned BinOp : {G_SREM, G_UREM}) for (auto Ty : { s1, s8, s16, s32, s64 }) setAction({BinOp, Ty}, Lower); - for (auto Op : { G_UADDE, G_USUBE, G_SADDO, G_SSUBO, G_SMULO, G_UMULO }) { + for (unsigned Op : {G_SMULO, G_UMULO}) + setAction({Op, s64}, Lower); + + for (unsigned Op : {G_UADDE, G_USUBE, G_SADDO, G_SSUBO, G_SMULH, G_UMULH}) { for (auto Ty : { s32, s64 }) setAction({Op, Ty}, Legal); setAction({Op, 1, s1}, Legal); } - for (auto BinOp : {G_FADD, G_FSUB, G_FMUL, G_FDIV}) + for (unsigned BinOp : {G_FADD, G_FSUB, G_FMUL, G_FDIV}) for (auto Ty : {s32, s64}) setAction({BinOp, Ty}, Legal); - setAction({G_FREM, s32}, Libcall); - setAction({G_FREM, s64}, Libcall); + for (unsigned BinOp : {G_FREM, G_FPOW}) { + setAction({BinOp, s32}, Libcall); + setAction({BinOp, s64}, Libcall); + } - for (auto MemOp : {G_LOAD, G_STORE}) { + for (auto Ty : {s32, s64, p0}) { + setAction({G_INSERT, Ty}, Legal); + setAction({G_INSERT, 1, Ty}, Legal); + } + for (auto Ty : {s1, s8, s16}) { + setAction({G_INSERT, Ty}, WidenScalar); + setAction({G_INSERT, 1, Ty}, Legal); + // FIXME: Can't widen the sources because that violates the constraints on + // G_INSERT (It seems entirely reasonable that inputs shouldn't overlap). 
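A note on the WidenScalar actions set above: widening means the operation is performed at a legal width and only the low bits of the result are meaningful afterwards. A minimal illustration in plain C++:

#include <cstdint>

// An s8 G_ADD widened to s32: compute wide, truncate the result back.
uint8_t addViaWidening(uint8_t A, uint8_t B) {
  uint32_t Wide = (uint32_t)A + (uint32_t)B; // the legal s32 operation
  return (uint8_t)Wide;                      // low bits carry the s8 result
}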
+ } + + for (unsigned MemOp : {G_LOAD, G_STORE}) { for (auto Ty : {s8, s16, s32, s64, p0, v2s32}) setAction({MemOp, Ty}, Legal); @@ -141,12 +165,18 @@ AArch64LegalizerInfo::AArch64LegalizerInfo() { setAction({G_TRUNC, 1, Ty}, Legal); // Conversions - for (auto Ty : { s1, s8, s16, s32, s64 }) { + for (auto Ty : { s32, s64 }) { setAction({G_FPTOSI, 0, Ty}, Legal); setAction({G_FPTOUI, 0, Ty}, Legal); setAction({G_SITOFP, 1, Ty}, Legal); setAction({G_UITOFP, 1, Ty}, Legal); } + for (auto Ty : { s1, s8, s16 }) { + setAction({G_FPTOSI, 0, Ty}, WidenScalar); + setAction({G_FPTOUI, 0, Ty}, WidenScalar); + setAction({G_SITOFP, 1, Ty}, WidenScalar); + setAction({G_UITOFP, 1, Ty}, WidenScalar); + } for (auto Ty : { s32, s64 }) { setAction({G_FPTOSI, 1, Ty}, Legal); @@ -158,9 +188,13 @@ AArch64LegalizerInfo::AArch64LegalizerInfo() { // Control-flow for (auto Ty : {s1, s8, s16, s32}) setAction({G_BRCOND, Ty}, Legal); + setAction({G_BRINDIRECT, p0}, Legal); // Select - for (auto Ty : {s1, s8, s16, s32, s64}) + for (auto Ty : {s1, s8, s16}) + setAction({G_SELECT, Ty}, WidenScalar); + + for (auto Ty : {s32, s64, p0}) setAction({G_SELECT, Ty}, Legal); setAction({G_SELECT, 1, s1}, Legal); @@ -200,5 +234,82 @@ AArch64LegalizerInfo::AArch64LegalizerInfo() { setAction({G_BITCAST, 1, LLT::vector(32/EltSize, EltSize)}, Legal); } + setAction({G_VASTART, p0}, Legal); + + // va_list must be a pointer, but most sized types are pretty easy to handle + // as the destination. + setAction({G_VAARG, 1, p0}, Legal); + + for (auto Ty : {s8, s16, s32, s64, p0}) + setAction({G_VAARG, Ty}, Custom); + computeTables(); } + +bool AArch64LegalizerInfo::legalizeCustom(MachineInstr &MI, + MachineRegisterInfo &MRI, + MachineIRBuilder &MIRBuilder) const { + switch (MI.getOpcode()) { + default: + // No idea what to do. + return false; + case TargetOpcode::G_VAARG: + return legalizeVaArg(MI, MRI, MIRBuilder); + } + + llvm_unreachable("expected switch to return"); +} + +bool AArch64LegalizerInfo::legalizeVaArg(MachineInstr &MI, + MachineRegisterInfo &MRI, + MachineIRBuilder &MIRBuilder) const { + MIRBuilder.setInstr(MI); + MachineFunction &MF = MIRBuilder.getMF(); + unsigned Align = MI.getOperand(2).getImm(); + unsigned Dst = MI.getOperand(0).getReg(); + unsigned ListPtr = MI.getOperand(1).getReg(); + + LLT PtrTy = MRI.getType(ListPtr); + LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits()); + + const unsigned PtrSize = PtrTy.getSizeInBits() / 8; + unsigned List = MRI.createGenericVirtualRegister(PtrTy); + MIRBuilder.buildLoad( + List, ListPtr, + *MF.getMachineMemOperand(MachinePointerInfo(), MachineMemOperand::MOLoad, + PtrSize, /* Align = */ PtrSize)); + + unsigned DstPtr; + if (Align > PtrSize) { + // Realign the list to the actual required alignment. 
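The realignment performed next is the classic round-up-and-mask idiom; the buildGEP + buildPtrMask pair below implements exactly this in generic MIR. A one-line C++ equivalent:

#include <cstdint>

// Overshoot by Align-1, then clear the low bits (Align must be a power of 2).
uint64_t alignPointerUp(uint64_t P, uint64_t Align) {
  return (P + Align - 1) & ~(Align - 1);
}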
+ unsigned AlignMinus1 = MRI.createGenericVirtualRegister(IntPtrTy); + MIRBuilder.buildConstant(AlignMinus1, Align - 1); + + unsigned ListTmp = MRI.createGenericVirtualRegister(PtrTy); + MIRBuilder.buildGEP(ListTmp, List, AlignMinus1); + + DstPtr = MRI.createGenericVirtualRegister(PtrTy); + MIRBuilder.buildPtrMask(DstPtr, ListTmp, Log2_64(Align)); + } else + DstPtr = List; + + uint64_t ValSize = MRI.getType(Dst).getSizeInBits() / 8; + MIRBuilder.buildLoad( + Dst, DstPtr, + *MF.getMachineMemOperand(MachinePointerInfo(), MachineMemOperand::MOLoad, + ValSize, std::max(Align, PtrSize))); + + unsigned SizeReg = MRI.createGenericVirtualRegister(IntPtrTy); + MIRBuilder.buildConstant(SizeReg, alignTo(ValSize, PtrSize)); + + unsigned NewList = MRI.createGenericVirtualRegister(PtrTy); + MIRBuilder.buildGEP(NewList, DstPtr, SizeReg); + + MIRBuilder.buildStore( + NewList, ListPtr, + *MF.getMachineMemOperand(MachinePointerInfo(), MachineMemOperand::MOStore, + PtrSize, /* Align = */ PtrSize)); + + MI.eraseFromParent(); + return true; +} diff --git a/contrib/llvm/lib/Target/AArch64/AArch64LegalizerInfo.h b/contrib/llvm/lib/Target/AArch64/AArch64LegalizerInfo.h index feacbef9f147..42d4ac130c5c 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64LegalizerInfo.h +++ b/contrib/llvm/lib/Target/AArch64/AArch64LegalizerInfo.h @@ -25,6 +25,13 @@ class LLVMContext; class AArch64LegalizerInfo : public LegalizerInfo { public: AArch64LegalizerInfo(); + + bool legalizeCustom(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &MIRBuilder) const override; + +private: + bool legalizeVaArg(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &MIRBuilder) const; }; } // End llvm namespace. #endif diff --git a/contrib/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/contrib/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp index 8e312dcf276f..976498aa70d6 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp @@ -16,19 +16,29 @@ #include "AArch64Subtarget.h" #include "MCTargetDesc/AArch64AddressingModes.h" #include "llvm/ADT/BitVector.h" +#include "llvm/ADT/iterator_range.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringRef.h" #include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/IR/DebugLoc.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetInstrInfo.h" -#include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetRegisterInfo.h" +#include <cassert> +#include <cstdint> +#include <iterator> +#include <limits> + using namespace llvm; #define DEBUG_TYPE "aarch64-ldst-opt" @@ -58,15 +68,15 @@ typedef struct LdStPairFlags { // If a matching instruction is found, MergeForward is set to true if the // merge is to remove the first instruction and replace the second with // a pair-wise insn, and false if the reverse is true. - bool MergeForward; + bool MergeForward = false; // SExtIdx gives the index of the result of the load pair that must be // extended. 
The value of SExtIdx assumes that the paired load produces the // value in this order: (I, returned iterator), i.e., -1 means no value has // to be extended, 0 means I, and 1 means the returned iterator. - int SExtIdx; + int SExtIdx = -1; - LdStPairFlags() : MergeForward(false), SExtIdx(-1) {} + LdStPairFlags() = default; void setMergeForward(bool V = true) { MergeForward = V; } bool getMergeForward() const { return MergeForward; } @@ -78,10 +88,12 @@ typedef struct LdStPairFlags { struct AArch64LoadStoreOpt : public MachineFunctionPass { static char ID; + AArch64LoadStoreOpt() : MachineFunctionPass(ID) { initializeAArch64LoadStoreOptPass(*PassRegistry::getPassRegistry()); } + AliasAnalysis *AA; const AArch64InstrInfo *TII; const TargetRegisterInfo *TRI; const AArch64Subtarget *Subtarget; @@ -89,6 +101,11 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass { // Track which registers have been modified and used. BitVector ModifiedRegs, UsedRegs; + virtual void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<AAResultsWrapperPass>(); + MachineFunctionPass::getAnalysisUsage(AU); + } + // Scan the instructions looking for a load/store that can be combined // with the current instruction into a load/store pair. // Return the matching instruction if one is found, else MBB->end(). @@ -162,8 +179,10 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass { StringRef getPassName() const override { return AARCH64_LOAD_STORE_OPT_NAME; } }; + char AArch64LoadStoreOpt::ID = 0; -} // namespace + +} // end anonymous namespace INITIALIZE_PASS(AArch64LoadStoreOpt, "aarch64-ldst-opt", AARCH64_LOAD_STORE_OPT_NAME, false, false) @@ -246,7 +265,7 @@ static unsigned getMatchingNonSExtOpcode(unsigned Opc, default: if (IsValidLdStrOpc) *IsValidLdStrOpc = false; - return UINT_MAX; + return std::numeric_limits<unsigned>::max(); case AArch64::STRDui: case AArch64::STURDi: case AArch64::STRQui: @@ -595,7 +614,7 @@ AArch64LoadStoreOpt::mergeNarrowZeroStores(MachineBasicBlock::iterator I, MachineInstrBuilder MIB; MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(getMatchingWideOpcode(Opc))) .addReg(isNarrowStore(Opc) ? AArch64::WZR : AArch64::XZR) - .addOperand(BaseRegOp) + .add(BaseRegOp) .addImm(OffsetImm) .setMemRefs(I->mergeMemRefsWith(*MergeMI)); (void)MIB; @@ -709,9 +728,9 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, } } MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(getMatchingPairOpcode(Opc))) - .addOperand(RegOp0) - .addOperand(RegOp1) - .addOperand(BaseRegOp) + .add(RegOp0) + .add(RegOp1) + .add(BaseRegOp) .addImm(OffsetImm) .setMemRefs(I->mergeMemRefsWith(*Paired)); @@ -923,7 +942,7 @@ static int alignTo(int Num, int PowOf2) { } static bool mayAlias(MachineInstr &MIa, MachineInstr &MIb, - const AArch64InstrInfo *TII) { + AliasAnalysis *AA) { // One of the instructions must modify memory. 
if (!MIa.mayStore() && !MIb.mayStore()) return false; @@ -932,14 +951,14 @@ static bool mayAlias(MachineInstr &MIa, MachineInstr &MIb, if (!MIa.mayLoadOrStore() && !MIb.mayLoadOrStore()) return false; - return !TII->areMemAccessesTriviallyDisjoint(MIa, MIb); + return MIa.mayAlias(AA, MIb, /*UseTBAA*/false); } static bool mayAlias(MachineInstr &MIa, SmallVectorImpl<MachineInstr *> &MemInsns, - const AArch64InstrInfo *TII) { + AliasAnalysis *AA) { for (MachineInstr *MIb : MemInsns) - if (mayAlias(MIa, *MIb, TII)) + if (mayAlias(MIa, *MIb, AA)) return true; return false; @@ -997,7 +1016,7 @@ bool AArch64LoadStoreOpt::findMatchingStore( return false; // If we encounter a store aliased with the load, return early. - if (MI.mayStore() && mayAlias(LoadMI, MI, TII)) + if (MI.mayStore() && mayAlias(LoadMI, MI, AA)) return false; } while (MBBI != B && Count < Limit); return false; @@ -1167,7 +1186,7 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, // first. if (!ModifiedRegs[getLdStRegOp(MI).getReg()] && !(MI.mayLoad() && UsedRegs[getLdStRegOp(MI).getReg()]) && - !mayAlias(MI, MemInsns, TII)) { + !mayAlias(MI, MemInsns, AA)) { Flags.setMergeForward(false); return MBBI; } @@ -1178,7 +1197,7 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, // into the second. if (!ModifiedRegs[getLdStRegOp(FirstMI).getReg()] && !(MayLoad && UsedRegs[getLdStRegOp(FirstMI).getReg()]) && - !mayAlias(FirstMI, MemInsns, TII)) { + !mayAlias(FirstMI, MemInsns, AA)) { Flags.setMergeForward(true); return MBBI; } @@ -1233,19 +1252,19 @@ AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I, if (!isPairedLdSt(*I)) { // Non-paired instruction. MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc)) - .addOperand(getLdStRegOp(*Update)) - .addOperand(getLdStRegOp(*I)) - .addOperand(getLdStBaseOp(*I)) + .add(getLdStRegOp(*Update)) + .add(getLdStRegOp(*I)) + .add(getLdStBaseOp(*I)) .addImm(Value) .setMemRefs(I->memoperands_begin(), I->memoperands_end()); } else { // Paired instruction. int Scale = getMemScale(*I); MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc)) - .addOperand(getLdStRegOp(*Update)) - .addOperand(getLdStRegOp(*I, 0)) - .addOperand(getLdStRegOp(*I, 1)) - .addOperand(getLdStBaseOp(*I)) + .add(getLdStRegOp(*Update)) + .add(getLdStRegOp(*I, 0)) + .add(getLdStRegOp(*I, 1)) + .add(getLdStBaseOp(*I)) .addImm(Value / Scale) .setMemRefs(I->memoperands_begin(), I->memoperands_end()); } @@ -1545,7 +1564,7 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB, case AArch64::LDURBBi: case AArch64::LDURHHi: case AArch64::LDURWi: - case AArch64::LDURXi: { + case AArch64::LDURXi: if (tryToPromoteLoadFromStore(MBBI)) { Modified = true; break; @@ -1553,7 +1572,6 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB, ++MBBI; break; } - } } // 2) Merge adjacent zero stores into a wider store. // e.g., @@ -1722,6 +1740,7 @@ bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) { Subtarget = &static_cast<const AArch64Subtarget &>(Fn.getSubtarget()); TII = static_cast<const AArch64InstrInfo *>(Subtarget->getInstrInfo()); TRI = Subtarget->getRegisterInfo(); + AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); // Resize the modified and used register bitfield trackers. 
We do this once // per function and then clear the bitfield each time we optimize a load or diff --git a/contrib/llvm/lib/Target/AArch64/AArch64MacroFusion.cpp b/contrib/llvm/lib/Target/AArch64/AArch64MacroFusion.cpp new file mode 100644 index 000000000000..a6926a6700e1 --- /dev/null +++ b/contrib/llvm/lib/Target/AArch64/AArch64MacroFusion.cpp @@ -0,0 +1,272 @@ +//===- AArch64MacroFusion.cpp - AArch64 Macro Fusion ----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// \file This file contains the AArch64 implementation of the DAG scheduling mutation +// to pair instructions back to back. +// +//===----------------------------------------------------------------------===// + +#include "AArch64MacroFusion.h" +#include "AArch64Subtarget.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Target/TargetInstrInfo.h" + +#define DEBUG_TYPE "misched" + +STATISTIC(NumFused, "Number of instr pairs fused"); + +using namespace llvm; + +static cl::opt<bool> EnableMacroFusion("aarch64-misched-fusion", cl::Hidden, + cl::desc("Enable scheduling for macro fusion."), cl::init(true)); + +namespace { + +/// \brief Verify that the instr pair, FirstMI and SecondMI, should be fused +/// together. Given an anchor instr, when the other instr is unspecified, then +/// check if the anchor instr may be part of a fused pair at all. +static bool shouldScheduleAdjacent(const TargetInstrInfo &TII, + const TargetSubtargetInfo &TSI, + const MachineInstr *FirstMI, + const MachineInstr *SecondMI) { + assert((FirstMI || SecondMI) && "At least one instr must be specified"); + + const AArch64InstrInfo &II = static_cast<const AArch64InstrInfo&>(TII); + const AArch64Subtarget &ST = static_cast<const AArch64Subtarget&>(TSI); + + // Assume wildcards for unspecified instrs. + unsigned FirstOpcode = + FirstMI ? FirstMI->getOpcode() + : static_cast<unsigned>(AArch64::INSTRUCTION_LIST_END); + unsigned SecondOpcode = + SecondMI ? SecondMI->getOpcode() + : static_cast<unsigned>(AArch64::INSTRUCTION_LIST_END); + + if (ST.hasArithmeticBccFusion()) + // Fuse CMN, CMP, TST followed by Bcc. + if (SecondOpcode == AArch64::Bcc) + switch (FirstOpcode) { + default: + return false; + case AArch64::ADDSWri: + case AArch64::ADDSWrr: + case AArch64::ADDSXri: + case AArch64::ADDSXrr: + case AArch64::ANDSWri: + case AArch64::ANDSWrr: + case AArch64::ANDSXri: + case AArch64::ANDSXrr: + case AArch64::SUBSWri: + case AArch64::SUBSWrr: + case AArch64::SUBSXri: + case AArch64::SUBSXrr: + case AArch64::BICSWrr: + case AArch64::BICSXrr: + return true; + case AArch64::ADDSWrs: + case AArch64::ADDSXrs: + case AArch64::ANDSWrs: + case AArch64::ANDSXrs: + case AArch64::SUBSWrs: + case AArch64::SUBSXrs: + case AArch64::BICSWrs: + case AArch64::BICSXrs: + // Shift value can be 0 making these behave like the "rr" variant... + return !II.hasShiftedReg(*FirstMI); + case AArch64::INSTRUCTION_LIST_END: + return true; + } + + if (ST.hasArithmeticCbzFusion()) + // Fuse ALU operations followed by CBZ/CBNZ. 
+ if (SecondOpcode == AArch64::CBNZW || SecondOpcode == AArch64::CBNZX ||
+ SecondOpcode == AArch64::CBZW || SecondOpcode == AArch64::CBZX)
+ switch (FirstOpcode) {
+ default:
+ return false;
+ case AArch64::ADDWri:
+ case AArch64::ADDWrr:
+ case AArch64::ADDXri:
+ case AArch64::ADDXrr:
+ case AArch64::ANDWri:
+ case AArch64::ANDWrr:
+ case AArch64::ANDXri:
+ case AArch64::ANDXrr:
+ case AArch64::EORWri:
+ case AArch64::EORWrr:
+ case AArch64::EORXri:
+ case AArch64::EORXrr:
+ case AArch64::ORRWri:
+ case AArch64::ORRWrr:
+ case AArch64::ORRXri:
+ case AArch64::ORRXrr:
+ case AArch64::SUBWri:
+ case AArch64::SUBWrr:
+ case AArch64::SUBXri:
+ case AArch64::SUBXrr:
+ return true;
+ case AArch64::ADDWrs:
+ case AArch64::ADDXrs:
+ case AArch64::ANDWrs:
+ case AArch64::ANDXrs:
+ case AArch64::SUBWrs:
+ case AArch64::SUBXrs:
+ case AArch64::BICWrs:
+ case AArch64::BICXrs:
+ // Shift value can be 0 making these behave like the "rr" variant...
+ return !II.hasShiftedReg(*FirstMI);
+ case AArch64::INSTRUCTION_LIST_END:
+ return true;
+ }
+
+ if (ST.hasFuseAES())
+ // Fuse AES crypto operations.
+ switch(FirstOpcode) {
+ // AES encode.
+ case AArch64::AESErr:
+ return SecondOpcode == AArch64::AESMCrr ||
+ SecondOpcode == AArch64::INSTRUCTION_LIST_END;
+ // AES decode.
+ case AArch64::AESDrr:
+ return SecondOpcode == AArch64::AESIMCrr ||
+ SecondOpcode == AArch64::INSTRUCTION_LIST_END;
+ }
+
+ if (ST.hasFuseLiterals())
+ // Fuse literal generation operations.
+ switch (FirstOpcode) {
+ // PC relative address.
+ case AArch64::ADRP:
+ return SecondOpcode == AArch64::ADDXri ||
+ SecondOpcode == AArch64::INSTRUCTION_LIST_END;
+ // 32 bit immediate.
+ case AArch64::MOVZWi:
+ return (SecondOpcode == AArch64::MOVKWi &&
+ SecondMI->getOperand(3).getImm() == 16) ||
+ SecondOpcode == AArch64::INSTRUCTION_LIST_END;
+ // Lower half of 64 bit immediate.
+ case AArch64::MOVZXi:
+ return (SecondOpcode == AArch64::MOVKXi &&
+ SecondMI->getOperand(3).getImm() == 16) ||
+ SecondOpcode == AArch64::INSTRUCTION_LIST_END;
+ // Upper half of 64 bit immediate.
+ case AArch64::MOVKXi:
+ return FirstMI->getOperand(3).getImm() == 32 &&
+ ((SecondOpcode == AArch64::MOVKXi &&
+ SecondMI->getOperand(3).getImm() == 48) ||
+ SecondOpcode == AArch64::INSTRUCTION_LIST_END);
+ }
+
+ return false;
+}
+
+/// \brief Implement the fusion of instr pairs in the scheduling DAG,
+/// anchored at the instr in AnchorSU.
+static bool scheduleAdjacentImpl(ScheduleDAGMI *DAG, SUnit &AnchorSU) {
+ const MachineInstr *AnchorMI = AnchorSU.getInstr();
+ if (!AnchorMI || AnchorMI->isPseudo() || AnchorMI->isTransient())
+ return false;
+
+ // If the anchor instr is the ExitSU, then consider its predecessors;
+ // otherwise, its successors.
+ bool Preds = (&AnchorSU == &DAG->ExitSU);
+ SmallVectorImpl<SDep> &AnchorDeps = Preds ? AnchorSU.Preds : AnchorSU.Succs;
+
+ const MachineInstr *FirstMI = Preds ? nullptr : AnchorMI;
+ const MachineInstr *SecondMI = Preds ? AnchorMI : nullptr;
+
+ // Check if the anchor instr may be fused.
+ if (!shouldScheduleAdjacent(*DAG->TII, DAG->MF.getSubtarget(),
+ FirstMI, SecondMI))
+ return false;
+
+ // Explore fusion candidates among the dependencies of the anchor instr.
+ for (SDep &Dep : AnchorDeps) {
+ // Ignore dependencies that don't enforce ordering.
+ if (Dep.isWeak())
+ continue;
+
+ SUnit &DepSU = *Dep.getSUnit();
+ // Ignore the ExitSU if the dependents are successors.
+ if (!Preds && &DepSU == &DAG->ExitSU)
+ continue;
+
+ const MachineInstr *DepMI = DepSU.getInstr();
+ if (!DepMI || DepMI->isPseudo() || DepMI->isTransient())
+ continue;
+
+ FirstMI = Preds ? DepMI : AnchorMI;
+ SecondMI = Preds ? AnchorMI : DepMI;
+ if (!shouldScheduleAdjacent(*DAG->TII, DAG->MF.getSubtarget(),
+ FirstMI, SecondMI))
+ continue;
+
+ // Create a single weak edge between the adjacent instrs. The only effect is
+ // to cause bottom-up scheduling to heavily prioritize the clustered instrs.
+ SUnit &FirstSU = Preds ? DepSU : AnchorSU;
+ SUnit &SecondSU = Preds ? AnchorSU : DepSU;
+ DAG->addEdge(&SecondSU, SDep(&FirstSU, SDep::Cluster));
+
+ // Adjust the latency between the anchor instr and its
+ // predecessors/successors.
+ for (SDep &IDep : AnchorDeps)
+ if (IDep.getSUnit() == &DepSU)
+ IDep.setLatency(0);
+
+ // Adjust the latency between the dependent instr and its
+ // successors/predecessors.
+ for (SDep &IDep : Preds ? DepSU.Succs : DepSU.Preds)
+ if (IDep.getSUnit() == &AnchorSU)
+ IDep.setLatency(0);
+
+ DEBUG(dbgs() << DAG->MF.getName() << "(): Macro fuse ";
+ FirstSU.print(dbgs(), DAG); dbgs() << " - ";
+ SecondSU.print(dbgs(), DAG); dbgs() << " / ";
+ dbgs() << DAG->TII->getName(FirstMI->getOpcode()) << " - " <<
+ DAG->TII->getName(SecondMI->getOpcode()) << '\n'; );
+
+ ++NumFused;
+ return true;
+ }
+
+ return false;
+}
+
+/// \brief Post-process the DAG to create cluster edges between instrs that may
+/// be fused by the processor into a single operation.
+class AArch64MacroFusion : public ScheduleDAGMutation {
+public:
+ AArch64MacroFusion() {}
+
+ void apply(ScheduleDAGInstrs *DAGInstrs) override;
+};
+
+void AArch64MacroFusion::apply(ScheduleDAGInstrs *DAGInstrs) {
+ ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
+
+ // For each of the SUnits in the scheduling block, try to fuse the instr in it
+ // with one in its successors.
+ for (SUnit &ISU : DAG->SUnits)
+ scheduleAdjacentImpl(DAG, ISU);
+
+ // Try to fuse the instr in the ExitSU with one in its predecessors.
+ scheduleAdjacentImpl(DAG, DAG->ExitSU);
+}
+
+} // end namespace
+
+
+namespace llvm {
+
+std::unique_ptr<ScheduleDAGMutation> createAArch64MacroFusionDAGMutation() {
+ return EnableMacroFusion ? make_unique<AArch64MacroFusion>() : nullptr;
+}
+
+} // end namespace llvm
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64MacroFusion.h b/contrib/llvm/lib/Target/AArch64/AArch64MacroFusion.h
new file mode 100644
index 000000000000..e5efedd9fbfd
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64MacroFusion.h
@@ -0,0 +1,29 @@
+//===- AArch64MacroFusion.h - AArch64 Macro Fusion ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the AArch64 definition of the DAG scheduling mutation
+// to pair instructions back to back.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64InstrInfo.h"
+#include "llvm/CodeGen/MachineScheduler.h"
+
+//===----------------------------------------------------------------------===//
+// AArch64MacroFusion - DAG post-processing to encourage fusion of macro ops.
+//===----------------------------------------------------------------------===// + +namespace llvm { + +/// Note that you have to add: +/// DAG.addMutation(createAArch64MacroFusionDAGMutation()); +/// to AArch64PassConfig::createMachineScheduler() to have an effect. +std::unique_ptr<ScheduleDAGMutation> createAArch64MacroFusionDAGMutation(); + +} // llvm diff --git a/contrib/llvm/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp b/contrib/llvm/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp index 8f45e6a80a36..f3c8e7e9bdc2 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp @@ -12,13 +12,14 @@ // CBZW %W0, <BB#2> // BB#2: // %W0 = COPY %WZR -// This pass should be run after register allocation. +// Similarly, this pass also handles non-zero copies. +// BB#0: +// cmp x0, #1 +// b.eq .LBB0_1 +// .LBB0_1: +// orr x0, xzr, #0x1 // -// FIXME: This should be extended to handle any constant other than zero. E.g., -// cmp w0, #1 -// b.eq .BB1 -// BB1: -// mov w0, #1 +// This pass should be run after register allocation. // // FIXME: This could also be extended to check the whole dominance subtree below // the comparison if the compile time regression is acceptable. @@ -26,6 +27,7 @@ //===----------------------------------------------------------------------===// #include "AArch64.h" +#include "llvm/ADT/Optional.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/iterator_range.h" @@ -43,6 +45,7 @@ namespace { class AArch64RedundantCopyElimination : public MachineFunctionPass { const MachineRegisterInfo *MRI; const TargetRegisterInfo *TRI; + BitVector ClobberedRegs; public: static char ID; @@ -50,6 +53,16 @@ public: initializeAArch64RedundantCopyEliminationPass( *PassRegistry::getPassRegistry()); } + + struct RegImm { + MCPhysReg Reg; + int32_t Imm; + RegImm(MCPhysReg Reg, int32_t Imm) : Reg(Reg), Imm(Imm) {} + }; + + Optional<RegImm> knownRegValInBlock(MachineInstr &CondBr, + MachineBasicBlock *MBB, + MachineBasicBlock::iterator &FirstUse); bool optimizeCopy(MachineBasicBlock *MBB); bool runOnMachineFunction(MachineFunction &MF) override; MachineFunctionProperties getRequiredProperties() const override { @@ -66,18 +79,120 @@ char AArch64RedundantCopyElimination::ID = 0; INITIALIZE_PASS(AArch64RedundantCopyElimination, "aarch64-copyelim", "AArch64 redundant copy elimination pass", false, false) -static bool guaranteesZeroRegInBlock(MachineInstr &MI, MachineBasicBlock *MBB) { - unsigned Opc = MI.getOpcode(); +/// Remember what registers the specified instruction modifies. +static void trackRegDefs(const MachineInstr &MI, BitVector &ClobberedRegs, + const TargetRegisterInfo *TRI) { + for (const MachineOperand &MO : MI.operands()) { + if (MO.isRegMask()) { + ClobberedRegs.setBitsNotInMask(MO.getRegMask()); + continue; + } + + if (!MO.isReg()) + continue; + unsigned Reg = MO.getReg(); + if (!Reg) + continue; + if (!MO.isDef()) + continue; + + for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) + ClobberedRegs.set(*AI); + } +} + +/// It's possible to determine the value of a register based on a dominating +/// condition. To do so, this function checks to see if the basic block \p MBB +/// is the target to which a conditional branch \p CondBr jumps and whose +/// equality comparison is against a constant. If so, return a known physical +/// register and constant value pair. Otherwise, return None. 
+Optional<AArch64RedundantCopyElimination::RegImm> +AArch64RedundantCopyElimination::knownRegValInBlock( + MachineInstr &CondBr, MachineBasicBlock *MBB, + MachineBasicBlock::iterator &FirstUse) { + unsigned Opc = CondBr.getOpcode(); + // Check if the current basic block is the target block to which the // CBZ/CBNZ instruction jumps when its Wt/Xt is zero. - if ((Opc == AArch64::CBZW || Opc == AArch64::CBZX) && - MBB == MI.getOperand(1).getMBB()) - return true; - else if ((Opc == AArch64::CBNZW || Opc == AArch64::CBNZX) && - MBB != MI.getOperand(1).getMBB()) - return true; - - return false; + if (((Opc == AArch64::CBZW || Opc == AArch64::CBZX) && + MBB == CondBr.getOperand(1).getMBB()) || + ((Opc == AArch64::CBNZW || Opc == AArch64::CBNZX) && + MBB != CondBr.getOperand(1).getMBB())) { + FirstUse = CondBr; + return RegImm(CondBr.getOperand(0).getReg(), 0); + } + + // Otherwise, must be a conditional branch. + if (Opc != AArch64::Bcc) + return None; + + // Must be an equality check (i.e., == or !=). + AArch64CC::CondCode CC = (AArch64CC::CondCode)CondBr.getOperand(0).getImm(); + if (CC != AArch64CC::EQ && CC != AArch64CC::NE) + return None; + + MachineBasicBlock *BrTarget = CondBr.getOperand(1).getMBB(); + if ((CC == AArch64CC::EQ && BrTarget != MBB) || + (CC == AArch64CC::NE && BrTarget == MBB)) + return None; + + // Stop if we get to the beginning of PredMBB. + MachineBasicBlock *PredMBB = *MBB->pred_begin(); + assert(PredMBB == CondBr.getParent() && + "Conditional branch not in predecessor block!"); + if (CondBr == PredMBB->begin()) + return None; + + // Registers clobbered in PredMBB between CondBr instruction and current + // instruction being checked in loop. + ClobberedRegs.reset(); + + // Find compare instruction that sets NZCV used by CondBr. + MachineBasicBlock::reverse_iterator RIt = CondBr.getReverseIterator(); + for (MachineInstr &PredI : make_range(std::next(RIt), PredMBB->rend())) { + + // Track clobbered registers. + trackRegDefs(PredI, ClobberedRegs, TRI); + + bool IsCMN = false; + switch (PredI.getOpcode()) { + default: + break; + + // CMN is an alias for ADDS with a dead destination register. + case AArch64::ADDSWri: + case AArch64::ADDSXri: + IsCMN = true; + // CMP is an alias for SUBS with a dead destination register. + case AArch64::SUBSWri: + case AArch64::SUBSXri: { + MCPhysReg SrcReg = PredI.getOperand(1).getReg(); + + // Must not be a symbolic immediate. + if (!PredI.getOperand(2).isImm()) + return None; + + // The src register must not be modified between the cmp and conditional + // branch. This includes a self-clobbering compare. + if (ClobberedRegs[SrcReg]) + return None; + + // We've found the Cmp that sets NZCV. + int32_t KnownImm = PredI.getOperand(2).getImm(); + int32_t Shift = PredI.getOperand(3).getImm(); + KnownImm <<= Shift; + if (IsCMN) + KnownImm = -KnownImm; + FirstUse = PredI; + return RegImm(SrcReg, KnownImm); + } + } + + // Bail if we see an instruction that defines NZCV that we don't handle. + if (PredI.definesRegister(AArch64::NZCV)) + return None; + } + return None; } bool AArch64RedundantCopyElimination::optimizeCopy(MachineBasicBlock *MBB) { @@ -85,79 +200,187 @@ bool AArch64RedundantCopyElimination::optimizeCopy(MachineBasicBlock *MBB) { if (MBB->pred_size() != 1) return false; + // Check if the predecessor has two successors, implying the block ends in a + // conditional branch. 
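Before continuing into optimizeCopy, the compare decoding in knownRegValInBlock above can be summarized in a self-contained form (plain C++; the bool parameters stand in for the operand checks performed on the MachineInstr):

#include <cstdint>
#include <optional>

// On the equality edge, CMP (an alias of SUBS) pins the source register to
// +Imm << Shift, while CMN (an alias of ADDS) pins it to the negated value.
std::optional<int32_t> knownValueAfterCompare(int32_t Imm, int32_t Shift,
                                              bool IsCMN, bool OperandIsImm) {
  if (!OperandIsImm)
    return std::nullopt;          // symbolic immediates are rejected
  int32_t Known = Imm << Shift;   // Shift is 0 or 12 in these encodings
  return IsCMN ? -Known : Known;
}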
MachineBasicBlock *PredMBB = *MBB->pred_begin(); - MachineBasicBlock::iterator CompBr = PredMBB->getLastNonDebugInstr(); - if (CompBr == PredMBB->end() || PredMBB->succ_size() != 2) + if (PredMBB->succ_size() != 2) + return false; + + MachineBasicBlock::iterator CondBr = PredMBB->getLastNonDebugInstr(); + if (CondBr == PredMBB->end()) return false; - ++CompBr; + // Keep track of the earliest point in the PredMBB block where kill markers + // need to be removed if a COPY is removed. + MachineBasicBlock::iterator FirstUse; + // After calling knownRegValInBlock, FirstUse will either point to a CBZ/CBNZ + // or a compare (i.e., SUBS). In the latter case, we must take care when + // updating FirstUse when scanning for COPY instructions. In particular, if + // there's a COPY in between the compare and branch the COPY should not + // update FirstUse. + bool SeenFirstUse = false; + // Registers that contain a known value at the start of MBB. + SmallVector<RegImm, 4> KnownRegs; + + MachineBasicBlock::iterator Itr = std::next(CondBr); do { - --CompBr; - if (guaranteesZeroRegInBlock(*CompBr, MBB)) - break; - } while (CompBr != PredMBB->begin() && CompBr->isTerminator()); + --Itr; - // We've not found a CBZ/CBNZ, time to bail out. - if (!guaranteesZeroRegInBlock(*CompBr, MBB)) - return false; + Optional<RegImm> KnownRegImm = knownRegValInBlock(*Itr, MBB, FirstUse); + if (KnownRegImm == None) + continue; - unsigned TargetReg = CompBr->getOperand(0).getReg(); - if (!TargetReg) - return false; - assert(TargetRegisterInfo::isPhysicalRegister(TargetReg) && - "Expect physical register"); + KnownRegs.push_back(*KnownRegImm); + + // Reset the clobber list, which is used by knownRegValInBlock. + ClobberedRegs.reset(); + + // Look backward in PredMBB for COPYs from the known reg to find other + // registers that are known to be a constant value. + for (auto PredI = Itr;; --PredI) { + if (FirstUse == PredI) + SeenFirstUse = true; + + if (PredI->isCopy()) { + MCPhysReg CopyDstReg = PredI->getOperand(0).getReg(); + MCPhysReg CopySrcReg = PredI->getOperand(1).getReg(); + for (auto &KnownReg : KnownRegs) { + if (ClobberedRegs[KnownReg.Reg]) + continue; + // If we have X = COPY Y, and Y is known to hold a constant, then X + // is now known to hold the same constant. + if (CopySrcReg == KnownReg.Reg && !ClobberedRegs[CopyDstReg]) { + KnownRegs.push_back(RegImm(CopyDstReg, KnownReg.Imm)); + if (SeenFirstUse) + FirstUse = PredI; + break; + } + // If we have X = COPY Y, and X is known to hold a constant, then Y + // is now known to hold the same constant. + if (CopyDstReg == KnownReg.Reg && !ClobberedRegs[CopySrcReg]) { + KnownRegs.push_back(RegImm(CopySrcReg, KnownReg.Imm)); + if (SeenFirstUse) + FirstUse = PredI; + break; + } + } + } + + // Stop if we get to the beginning of PredMBB. + if (PredI == PredMBB->begin()) + break; + + trackRegDefs(*PredI, ClobberedRegs, TRI); + // Stop if all of the known regs have been clobbered. + if (all_of(KnownRegs, [&](RegImm KnownReg) { + return ClobberedRegs[KnownReg.Reg]; + })) + break; + } + break; + + } while (Itr != PredMBB->begin() && Itr->isTerminator()); - // Remember all registers aliasing with TargetReg. - SmallSetVector<unsigned, 8> TargetRegs; - for (MCRegAliasIterator AI(TargetReg, TRI, true); AI.isValid(); ++AI) - TargetRegs.insert(*AI); + // We've not found any registers with a known value; time to bail out. + if (KnownRegs.empty()) + return false; bool Changed = false; + // UsedKnownRegs is the set of KnownRegs that have had uses added to MBB.
+ SmallSetVector<unsigned, 4> UsedKnownRegs; MachineBasicBlock::iterator LastChange = MBB->begin(); - unsigned SmallestDef = TargetReg; - // Remove redundant Copy instructions unless TargetReg is modified. + // Remove redundant Copy instructions unless KnownReg is modified. for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;) { MachineInstr *MI = &*I; ++I; - if (MI->isCopy() && MI->getOperand(0).isReg() && - MI->getOperand(1).isReg()) { - - unsigned DefReg = MI->getOperand(0).getReg(); - unsigned SrcReg = MI->getOperand(1).getReg(); - - if ((SrcReg == AArch64::XZR || SrcReg == AArch64::WZR) && - !MRI->isReserved(DefReg) && - (TargetReg == DefReg || TRI->isSuperRegister(DefReg, TargetReg))) { - DEBUG(dbgs() << "Remove redundant Copy : "); - DEBUG((MI)->print(dbgs())); - - MI->eraseFromParent(); - Changed = true; - LastChange = I; - NumCopiesRemoved++; - SmallestDef = - TRI->isSubRegister(SmallestDef, DefReg) ? DefReg : SmallestDef; - continue; + bool RemovedMI = false; + bool IsCopy = MI->isCopy(); + bool IsMoveImm = MI->isMoveImmediate(); + if (IsCopy || IsMoveImm) { + MCPhysReg DefReg = MI->getOperand(0).getReg(); + MCPhysReg SrcReg = IsCopy ? MI->getOperand(1).getReg() : 0; + int64_t SrcImm = IsMoveImm ? MI->getOperand(1).getImm() : 0; + if (!MRI->isReserved(DefReg) && + ((IsCopy && (SrcReg == AArch64::XZR || SrcReg == AArch64::WZR)) || + IsMoveImm)) { + for (RegImm &KnownReg : KnownRegs) { + if (KnownReg.Reg != DefReg && + !TRI->isSuperRegister(DefReg, KnownReg.Reg)) + continue; + + // For a copy, the known value must be zero. + if (IsCopy && KnownReg.Imm != 0) + continue; + + if (IsMoveImm) { + // For a move immediate, the known immediate must match the source + // immediate. + if (KnownReg.Imm != SrcImm) + continue; + + // Don't remove a move immediate that implicitly defines the upper + // bits when only the lower 32 bits are known. + MCPhysReg CmpReg = KnownReg.Reg; + if (any_of(MI->implicit_operands(), [CmpReg](MachineOperand &O) { + return !O.isDead() && O.isReg() && O.isDef() && + O.getReg() != CmpReg; + })) + continue; + } + + if (IsCopy) + DEBUG(dbgs() << "Remove redundant Copy : " << *MI); + else + DEBUG(dbgs() << "Remove redundant Move : " << *MI); + + MI->eraseFromParent(); + Changed = true; + LastChange = I; + NumCopiesRemoved++; + UsedKnownRegs.insert(KnownReg.Reg); + RemovedMI = true; + break; + } } } - if (MI->modifiesRegister(TargetReg, TRI)) + // Skip to the next instruction if we removed the COPY/MovImm. + if (RemovedMI) + continue; + + // Remove any regs the MI clobbers from the KnownRegs set. + for (unsigned RI = 0; RI < KnownRegs.size();) + if (MI->modifiesRegister(KnownRegs[RI].Reg, TRI)) { + std::swap(KnownRegs[RI], KnownRegs[KnownRegs.size() - 1]); + KnownRegs.pop_back(); + // Don't increment RI since we need to now check the swapped-in + // KnownRegs[RI]. + } else { + ++RI; + } + + // Continue until the KnownRegs set is empty. + if (KnownRegs.empty()) break; } if (!Changed) return false; - // Otherwise, we have to fixup the use-def chain, starting with the - // CBZ/CBNZ. Conservatively mark as much as we can live. - CompBr->clearRegisterKills(SmallestDef, TRI); + // Add newly used regs to the block's live-in list if they aren't there + // already. + for (MCPhysReg KnownReg : UsedKnownRegs) + if (!MBB->isLiveIn(KnownReg)) + MBB->addLiveIn(KnownReg); - if (none_of(TargetRegs, [&](unsigned Reg) { return MBB->isLiveIn(Reg); })) - MBB->addLiveIn(TargetReg); - - // Clear any kills of TargetReg between CompBr and the last removed COPY.
+ // Clear kills in the range where changes were made. This is conservative, + // but should be okay since kill markers are being phased out. + DEBUG(dbgs() << "Clearing kill flags.\n\tFirstUse: " << *FirstUse + << "\tLastChange: " << *LastChange); + for (MachineInstr &MMI : make_range(FirstUse, PredMBB->end())) + MMI.clearKillInfo(); for (MachineInstr &MMI : make_range(MBB->begin(), LastChange)) - MMI.clearRegisterKills(SmallestDef, TRI); + MMI.clearKillInfo(); return true; } @@ -168,6 +391,11 @@ bool AArch64RedundantCopyElimination::runOnMachineFunction( return false; TRI = MF.getSubtarget().getRegisterInfo(); MRI = &MF.getRegInfo(); + + // Resize the clobber register bitfield tracker. We do this once per + // function and then clear the bitfield each time we optimize a copy. + ClobberedRegs.resize(TRI->getNumRegs()); + bool Changed = false; for (MachineBasicBlock &MBB : MF) Changed |= optimizeCopy(&MBB); diff --git a/contrib/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.cpp b/contrib/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.cpp index b292c9c87dcd..20a5979f9b4b 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.cpp @@ -1,4 +1,4 @@ -//===- AArch64RegisterBankInfo.cpp -------------------------------*- C++ -*-==// +//===- AArch64RegisterBankInfo.cpp ----------------------------------------===// // // The LLVM Compiler Infrastructure // @@ -13,13 +13,24 @@ //===----------------------------------------------------------------------===// #include "AArch64RegisterBankInfo.h" -#include "AArch64InstrInfo.h" // For XXXRegClassID. +#include "AArch64InstrInfo.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/LowLevelType.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/GlobalISel/RegisterBank.h" #include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Target/TargetOpcodes.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" +#include <algorithm> +#include <cassert> + +#define GET_TARGET_REGBANK_IMPL +#include "AArch64GenRegisterBank.inc" // This file will be TableGen'ed at some point. #include "AArch64GenRegisterBankInfo.def" @@ -31,7 +42,7 @@ using namespace llvm; #endif AArch64RegisterBankInfo::AArch64RegisterBankInfo(const TargetRegisterInfo &TRI) - : RegisterBankInfo(AArch64::RegBanks, AArch64::NumRegisterBanks) { + : AArch64GenRegisterBankInfo() { static bool AlreadyInit = false; // We have only one set of register banks, whatever the subtarget // is. Therefore, the initialization of the RegBanks table should be @@ -78,44 +89,21 @@ AArch64RegisterBankInfo::AArch64RegisterBankInfo(const TargetRegisterInfo &TRI) // Check that the TableGen'ed-like file is in sync with our expectations. // First, the Idx.
- assert(AArch64::PartialMappingIdx::PMI_GPR32 == - AArch64::PartialMappingIdx::PMI_FirstGPR && - "GPR32 index not first in the GPR list"); - assert(AArch64::PartialMappingIdx::PMI_GPR64 == - AArch64::PartialMappingIdx::PMI_LastGPR && - "GPR64 index not last in the GPR list"); - assert(AArch64::PartialMappingIdx::PMI_FirstGPR <= - AArch64::PartialMappingIdx::PMI_LastGPR && - "GPR list is backward"); - assert(AArch64::PartialMappingIdx::PMI_FPR32 == - AArch64::PartialMappingIdx::PMI_FirstFPR && - "FPR32 index not first in the FPR list"); - assert(AArch64::PartialMappingIdx::PMI_FPR512 == - AArch64::PartialMappingIdx::PMI_LastFPR && - "FPR512 index not last in the FPR list"); - assert(AArch64::PartialMappingIdx::PMI_FirstFPR <= - AArch64::PartialMappingIdx::PMI_LastFPR && - "FPR list is backward"); - assert(AArch64::PartialMappingIdx::PMI_FPR32 + 1 == - AArch64::PartialMappingIdx::PMI_FPR64 && - AArch64::PartialMappingIdx::PMI_FPR64 + 1 == - AArch64::PartialMappingIdx::PMI_FPR128 && - AArch64::PartialMappingIdx::PMI_FPR128 + 1 == - AArch64::PartialMappingIdx::PMI_FPR256 && - AArch64::PartialMappingIdx::PMI_FPR256 + 1 == - AArch64::PartialMappingIdx::PMI_FPR512 && - "FPR indices not properly ordered"); + assert(checkPartialMappingIdx(PMI_FirstGPR, PMI_LastGPR, + {PMI_GPR32, PMI_GPR64}) && + "PartialMappingIdx's are incorrectly ordered"); + assert(checkPartialMappingIdx( + PMI_FirstFPR, PMI_LastFPR, + {PMI_FPR32, PMI_FPR64, PMI_FPR128, PMI_FPR256, PMI_FPR512}) && + "PartialMappingIdx's are incorrectly ordered"); // Now, the content. // Check partial mapping. #define CHECK_PARTIALMAP(Idx, ValStartIdx, ValLength, RB) \ do { \ - const PartialMapping &Map = \ - AArch64::PartMappings[AArch64::PartialMappingIdx::Idx - \ - AArch64::PartialMappingIdx::PMI_Min]; \ - (void)Map; \ - assert(Map.StartIdx == ValStartIdx && Map.Length == ValLength && \ - Map.RegBank == &RB && #Idx " is incorrectly initialized"); \ - } while (0) + assert( \ + checkPartialMap(PartialMappingIdx::Idx, ValStartIdx, ValLength, RB) && \ + #Idx " is incorrectly initialized"); \ + } while (false) CHECK_PARTIALMAP(PMI_GPR32, 0, 32, RBGPR); CHECK_PARTIALMAP(PMI_GPR64, 0, 64, RBGPR); @@ -128,17 +116,11 @@ AArch64RegisterBankInfo::AArch64RegisterBankInfo(const TargetRegisterInfo &TRI) // Check value mapping. 
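// (An illustrative expansion, assumed rather than spelled out in the source: // CHECK_VALUEMAP(GPR, 32) below reduces to // checkValueMapImpl(PMI_GPR32, PMI_FirstGPR, 32, 0), // i.e. the 32-bit GPR value mapping must be a single break-down that points // at its partial mapping.)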
#define CHECK_VALUEMAP_IMPL(RBName, Size, Offset) \ do { \ - unsigned PartialMapBaseIdx = \ - AArch64::PartialMappingIdx::PMI_##RBName##Size - \ - AArch64::PartialMappingIdx::PMI_Min; \ - (void)PartialMapBaseIdx; \ - const ValueMapping &Map = AArch64::getValueMapping( \ - AArch64::PartialMappingIdx::PMI_First##RBName, Size)[Offset]; \ - (void)Map; \ - assert(Map.BreakDown == &AArch64::PartMappings[PartialMapBaseIdx] && \ - Map.NumBreakDowns == 1 && #RBName #Size \ - " " #Offset " is incorrectly initialized"); \ - } while (0) + assert(checkValueMapImpl(PartialMappingIdx::PMI_##RBName##Size, \ + PartialMappingIdx::PMI_First##RBName, Size, \ + Offset) && \ + #RBName #Size " " #Offset " is incorrectly initialized"); \ + } while (false) #define CHECK_VALUEMAP(RBName, Size) CHECK_VALUEMAP_IMPL(RBName, Size, 0) @@ -157,7 +139,7 @@ AArch64RegisterBankInfo::AArch64RegisterBankInfo(const TargetRegisterInfo &TRI) CHECK_VALUEMAP_IMPL(RBName, Size, 0); \ CHECK_VALUEMAP_IMPL(RBName, Size, 1); \ CHECK_VALUEMAP_IMPL(RBName, Size, 2); \ - } while (0) + } while (false) CHECK_VALUEMAP_3OPS(GPR, 32); CHECK_VALUEMAP_3OPS(GPR, 64); @@ -169,24 +151,23 @@ AArch64RegisterBankInfo::AArch64RegisterBankInfo(const TargetRegisterInfo &TRI) #define CHECK_VALUEMAP_CROSSREGCPY(RBNameDst, RBNameSrc, Size) \ do { \ - unsigned PartialMapDstIdx = \ - AArch64::PMI_##RBNameDst##Size - AArch64::PMI_Min; \ - unsigned PartialMapSrcIdx = \ - AArch64::PMI_##RBNameSrc##Size - AArch64::PMI_Min; \ - (void) PartialMapDstIdx; \ - (void) PartialMapSrcIdx; \ - const ValueMapping *Map = AArch64::getCopyMapping( \ - AArch64::PMI_First##RBNameDst == AArch64::PMI_FirstGPR, \ - AArch64::PMI_First##RBNameSrc == AArch64::PMI_FirstGPR, Size); \ - (void) Map; \ - assert(Map[0].BreakDown == &AArch64::PartMappings[PartialMapDstIdx] && \ + unsigned PartialMapDstIdx = PMI_##RBNameDst##Size - PMI_Min; \ + unsigned PartialMapSrcIdx = PMI_##RBNameSrc##Size - PMI_Min; \ + (void)PartialMapDstIdx; \ + (void)PartialMapSrcIdx; \ + const ValueMapping *Map = getCopyMapping( \ + AArch64::RBNameDst##RegBankID, AArch64::RBNameSrc##RegBankID, Size); \ + (void)Map; \ + assert(Map[0].BreakDown == \ + &AArch64GenRegisterBankInfo::PartMappings[PartialMapDstIdx] && \ Map[0].NumBreakDowns == 1 && #RBNameDst #Size \ " Dst is incorrectly initialized"); \ - assert(Map[1].BreakDown == &AArch64::PartMappings[PartialMapSrcIdx] && \ + assert(Map[1].BreakDown == \ + &AArch64GenRegisterBankInfo::PartMappings[PartialMapSrcIdx] && \ Map[1].NumBreakDowns == 1 && #RBNameSrc #Size \ " Src is incorrectly initialized"); \ \ - } while (0) + } while (false) CHECK_VALUEMAP_CROSSREGCPY(GPR, GPR, 32); CHECK_VALUEMAP_CROSSREGCPY(GPR, FPR, 32); @@ -280,12 +261,10 @@ AArch64RegisterBankInfo::getInstrAlternativeMappings( break; InstructionMappings AltMappings; InstructionMapping GPRMapping( - /*ID*/ 1, /*Cost*/ 1, - AArch64::getValueMapping(AArch64::PMI_FirstGPR, Size), + /*ID*/ 1, /*Cost*/ 1, getValueMapping(PMI_FirstGPR, Size), /*NumOperands*/ 3); InstructionMapping FPRMapping( - /*ID*/ 2, /*Cost*/ 1, - AArch64::getValueMapping(AArch64::PMI_FirstFPR, Size), + /*ID*/ 2, /*Cost*/ 1, getValueMapping(PMI_FirstFPR, Size), /*NumOperands*/ 3); AltMappings.emplace_back(std::move(GPRMapping)); @@ -305,21 +284,21 @@ AArch64RegisterBankInfo::getInstrAlternativeMappings( InstructionMappings AltMappings; InstructionMapping GPRMapping( /*ID*/ 1, /*Cost*/ 1, - AArch64::getCopyMapping(/*DstIsGPR*/ true, /*SrcIsGPR*/ true, Size), + getCopyMapping(AArch64::GPRRegBankID, AArch64::GPRRegBankID, Size), 
/*NumOperands*/ 2); InstructionMapping FPRMapping( /*ID*/ 2, /*Cost*/ 1, - AArch64::getCopyMapping(/*DstIsGPR*/ false, /*SrcIsGPR*/ false, Size), + getCopyMapping(AArch64::FPRRegBankID, AArch64::FPRRegBankID, Size), /*NumOperands*/ 2); InstructionMapping GPRToFPRMapping( /*ID*/ 3, /*Cost*/ copyCost(AArch64::GPRRegBank, AArch64::FPRRegBank, Size), - AArch64::getCopyMapping(/*DstIsGPR*/ false, /*SrcIsGPR*/ true, Size), + getCopyMapping(AArch64::FPRRegBankID, AArch64::GPRRegBankID, Size), /*NumOperands*/ 2); InstructionMapping FPRToGPRMapping( /*ID*/ 3, /*Cost*/ copyCost(AArch64::GPRRegBank, AArch64::FPRRegBank, Size), - AArch64::getCopyMapping(/*DstIsGPR*/ true, /*SrcIsGPR*/ false, Size), + getCopyMapping(AArch64::GPRRegBankID, AArch64::FPRRegBankID, Size), /*NumOperands*/ 2); AltMappings.emplace_back(std::move(GPRMapping)); @@ -341,17 +320,15 @@ AArch64RegisterBankInfo::getInstrAlternativeMappings( InstructionMappings AltMappings; InstructionMapping GPRMapping( /*ID*/ 1, /*Cost*/ 1, - getOperandsMapping( - {AArch64::getValueMapping(AArch64::PMI_FirstGPR, Size), - // Addresses are GPR 64-bit. - AArch64::getValueMapping(AArch64::PMI_FirstGPR, 64)}), + getOperandsMapping({getValueMapping(PMI_FirstGPR, Size), + // Addresses are GPR 64-bit. + getValueMapping(PMI_FirstGPR, 64)}), /*NumOperands*/ 2); InstructionMapping FPRMapping( /*ID*/ 2, /*Cost*/ 1, - getOperandsMapping( - {AArch64::getValueMapping(AArch64::PMI_FirstFPR, Size), - // Addresses are GPR 64-bit. - AArch64::getValueMapping(AArch64::PMI_FirstGPR, 64)}), + getOperandsMapping({getValueMapping(PMI_FirstFPR, Size), + // Addresses are GPR 64-bit. + getValueMapping(PMI_FirstGPR, 64)}), /*NumOperands*/ 2); AltMappings.emplace_back(std::move(GPRMapping)); @@ -369,13 +346,12 @@ void AArch64RegisterBankInfo::applyMappingImpl( switch (OpdMapper.getMI().getOpcode()) { case TargetOpcode::G_OR: case TargetOpcode::G_BITCAST: - case TargetOpcode::G_LOAD: { + case TargetOpcode::G_LOAD: // Those IDs must match getInstrAlternativeMappings. assert((OpdMapper.getInstrMapping().getID() >= 1 && OpdMapper.getInstrMapping().getID() <= 4) && "Don't know how to handle that ID"); return applyDefaultMapping(OpdMapper); - } default: llvm_unreachable("Don't know how to handle that operation"); } } @@ -411,6 +387,8 @@ AArch64RegisterBankInfo::getSameKindOfOperandsMapping(const MachineInstr &MI) { unsigned Size = Ty.getSizeInBits(); bool IsFPR = Ty.isVector() || isPreISelGenericFloatingPointOpcode(Opc); + PartialMappingIdx RBIdx = IsFPR ? PMI_FirstFPR : PMI_FirstGPR; + #ifndef NDEBUG // Make sure all the operands are using similar size and type. // Should probably be checked by the machine verifier. @@ -422,20 +400,19 @@ AArch64RegisterBankInfo::getSameKindOfOperandsMapping(const MachineInstr &MI) { // for each types. for (unsigned Idx = 1; Idx != NumOperands; ++Idx) { LLT OpTy = MRI.getType(MI.getOperand(Idx).getReg()); - assert(AArch64::getRegBankBaseIdxOffset(OpTy.getSizeInBits()) == - AArch64::getRegBankBaseIdxOffset(Size) && - "Operand has incompatible size"); + assert( + AArch64GenRegisterBankInfo::getRegBankBaseIdxOffset( + RBIdx, OpTy.getSizeInBits()) == + AArch64GenRegisterBankInfo::getRegBankBaseIdxOffset(RBIdx, Size) && + "Operand has incompatible size"); bool OpIsFPR = OpTy.isVector() || isPreISelGenericFloatingPointOpcode(Opc); (void)OpIsFPR; assert(IsFPR == OpIsFPR && "Operand has incompatible type"); } #endif // End NDEBUG. - AArch64::PartialMappingIdx RBIdx = - IsFPR ?
AArch64::PMI_FirstFPR : AArch64::PMI_FirstGPR; - - return InstructionMapping{DefaultMappingID, 1, - AArch64::getValueMapping(RBIdx, Size), NumOperands}; + return InstructionMapping{DefaultMappingID, 1, getValueMapping(RBIdx, Size), + NumOperands}; } RegisterBankInfo::InstructionMapping @@ -485,9 +462,10 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { DstIsGPR ? AArch64::GPRRegBank : AArch64::FPRRegBank; const RegisterBank &SrcRB = SrcIsGPR ? AArch64::GPRRegBank : AArch64::FPRRegBank; - return InstructionMapping{DefaultMappingID, copyCost(DstRB, SrcRB, Size), - AArch64::getCopyMapping(DstIsGPR, SrcIsGPR, Size), - /*NumOperands*/ 2}; + return InstructionMapping{ + DefaultMappingID, copyCost(DstRB, SrcRB, Size), + getCopyMapping(DstRB.getID(), SrcRB.getID(), Size), + /*NumOperands*/ 2}; } case TargetOpcode::G_SEQUENCE: // FIXME: support this, but the generic code is really not going to do @@ -501,7 +479,7 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { // Track the size and bank of each register. We don't do partial mappings. SmallVector<unsigned, 4> OpSize(NumOperands); - SmallVector<AArch64::PartialMappingIdx, 4> OpRegBankIdx(NumOperands); + SmallVector<PartialMappingIdx, 4> OpRegBankIdx(NumOperands); for (unsigned Idx = 0; Idx < NumOperands; ++Idx) { auto &MO = MI.getOperand(Idx); if (!MO.isReg()) @@ -513,9 +491,9 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { // As a top-level guess, vectors go in FPRs, scalars and pointers in GPRs. // For floating-point instructions, scalars go in FPRs. if (Ty.isVector() || isPreISelGenericFloatingPointOpcode(Opc)) - OpRegBankIdx[Idx] = AArch64::PMI_FirstFPR; + OpRegBankIdx[Idx] = PMI_FirstFPR; else - OpRegBankIdx[Idx] = AArch64::PMI_FirstGPR; + OpRegBankIdx[Idx] = PMI_FirstGPR; } unsigned Cost = 1; @@ -523,49 +501,50 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { // fine-tune the computed mapping. switch (Opc) { case TargetOpcode::G_SITOFP: - case TargetOpcode::G_UITOFP: { - OpRegBankIdx = {AArch64::PMI_FirstFPR, AArch64::PMI_FirstGPR}; + case TargetOpcode::G_UITOFP: + OpRegBankIdx = {PMI_FirstFPR, PMI_FirstGPR}; break; - } case TargetOpcode::G_FPTOSI: - case TargetOpcode::G_FPTOUI: { - OpRegBankIdx = {AArch64::PMI_FirstGPR, AArch64::PMI_FirstFPR}; + case TargetOpcode::G_FPTOUI: + OpRegBankIdx = {PMI_FirstGPR, PMI_FirstFPR}; break; - } - case TargetOpcode::G_FCMP: { - OpRegBankIdx = {AArch64::PMI_FirstGPR, - /* Predicate */ AArch64::PMI_None, AArch64::PMI_FirstFPR, - AArch64::PMI_FirstFPR}; + case TargetOpcode::G_FCMP: + OpRegBankIdx = {PMI_FirstGPR, + /* Predicate */ PMI_None, PMI_FirstFPR, PMI_FirstFPR}; break; - } - case TargetOpcode::G_BITCAST: { + case TargetOpcode::G_BITCAST: // This is going to be a cross register bank copy and this is expensive. if (OpRegBankIdx[0] != OpRegBankIdx[1]) - Cost = - copyCost(*AArch64::PartMappings[OpRegBankIdx[0]].RegBank, - *AArch64::PartMappings[OpRegBankIdx[1]].RegBank, OpSize[0]); + Cost = copyCost( + *AArch64GenRegisterBankInfo::PartMappings[OpRegBankIdx[0]].RegBank, + *AArch64GenRegisterBankInfo::PartMappings[OpRegBankIdx[1]].RegBank, + OpSize[0]); break; - } - case TargetOpcode::G_LOAD: { + case TargetOpcode::G_LOAD: // Loading in vector unit is slightly more expensive. // This is actually only true for the LD1R and co instructions, // but anyway for the fast mode this number does not matter and // for the greedy mode the cost of the cross bank copy will // offset this number. 
// FIXME: Should be derived from the scheduling model. - if (OpRegBankIdx[0] >= AArch64::PMI_FirstFPR) + if (OpRegBankIdx[0] >= PMI_FirstFPR) Cost = 2; - } + break; } // Finally construct the computed mapping. RegisterBankInfo::InstructionMapping Mapping = InstructionMapping{DefaultMappingID, Cost, nullptr, NumOperands}; SmallVector<const ValueMapping *, 8> OpdsMapping(NumOperands); - for (unsigned Idx = 0; Idx < NumOperands; ++Idx) - if (MI.getOperand(Idx).isReg()) - OpdsMapping[Idx] = - AArch64::getValueMapping(OpRegBankIdx[Idx], OpSize[Idx]); + for (unsigned Idx = 0; Idx < NumOperands; ++Idx) { + if (MI.getOperand(Idx).isReg()) { + auto Mapping = getValueMapping(OpRegBankIdx[Idx], OpSize[Idx]); + if (!Mapping->isValid()) + return InstructionMapping(); + + OpdsMapping[Idx] = Mapping; + } + } Mapping.setOperandsMapping(getOperandsMapping(OpdsMapping)); return Mapping; diff --git a/contrib/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.h b/contrib/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.h index f763235049d4..0a795a42c0b1 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.h +++ b/contrib/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.h @@ -16,25 +16,78 @@ #include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" +#define GET_REGBANK_DECLARATIONS +#include "AArch64GenRegisterBank.inc" + namespace llvm { class TargetRegisterInfo; -namespace AArch64 { -enum { - GPRRegBankID = 0, /// General Purpose Registers: W, X. - FPRRegBankID = 1, /// Floating Point/Vector Registers: B, H, S, D, Q. - CCRRegBankID = 2, /// Conditional register: NZCV. - NumRegisterBanks -}; +class AArch64GenRegisterBankInfo : public RegisterBankInfo { +protected: + + enum PartialMappingIdx { + PMI_None = -1, + PMI_FPR32 = 1, + PMI_FPR64, + PMI_FPR128, + PMI_FPR256, + PMI_FPR512, + PMI_GPR32, + PMI_GPR64, + PMI_FirstGPR = PMI_GPR32, + PMI_LastGPR = PMI_GPR64, + PMI_FirstFPR = PMI_FPR32, + PMI_LastFPR = PMI_FPR512, + PMI_Min = PMI_FirstFPR, + }; + + static RegisterBankInfo::PartialMapping PartMappings[]; + static RegisterBankInfo::ValueMapping ValMappings[]; + static PartialMappingIdx BankIDToCopyMapIdx[]; + + enum ValueMappingIdx { + InvalidIdx = 0, + First3OpsIdx = 1, + Last3OpsIdx = 19, + DistanceBetweenRegBanks = 3, + FirstCrossRegCpyIdx = 22, + LastCrossRegCpyIdx = 34, + DistanceBetweenCrossRegCpy = 2 + }; + + static bool checkPartialMap(unsigned Idx, unsigned ValStartIdx, + unsigned ValLength, const RegisterBank &RB); + static bool checkValueMapImpl(unsigned Idx, unsigned FirstInBank, + unsigned Size, unsigned Offset); + static bool checkPartialMappingIdx(PartialMappingIdx FirstAlias, + PartialMappingIdx LastAlias, + ArrayRef<PartialMappingIdx> Order); -extern RegisterBank GPRRegBank; -extern RegisterBank FPRRegBank; -extern RegisterBank CCRRegBank; -} // End AArch64 namespace. + static unsigned getRegBankBaseIdxOffset(unsigned RBIdx, unsigned Size); + + /// Get the pointer to the ValueMapping representing the RegisterBank + /// at \p RBIdx with a size of \p Size. + /// + /// The returned mapping works for instructions with the same kind of + /// operands for up to 3 operands. + /// + /// \pre \p RBIdx != PartialMappingIdx::None + static const RegisterBankInfo::ValueMapping * + getValueMapping(PartialMappingIdx RBIdx, unsigned Size); + + /// Get the pointer to the ValueMapping of the operands of a copy + /// instruction from the \p SrcBankID register bank to the \p DstBankID + /// register bank with a size of \p Size. 
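+ /// +  /// For example (illustrative), getCopyMapping(AArch64::GPRRegBankID, +  /// AArch64::FPRRegBankID, 32) would name the mapping used for the two +  /// operands of a 32-bit FPR-to-GPR copy.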
+ static const RegisterBankInfo::ValueMapping * + getCopyMapping(unsigned DstBankID, unsigned SrcBankID, unsigned Size); + +#define GET_TARGET_REGBANK_CLASS +#include "AArch64GenRegisterBank.inc" +}; /// This class provides the information for the target register banks. -class AArch64RegisterBankInfo final : public RegisterBankInfo { +class AArch64RegisterBankInfo final : public AArch64GenRegisterBankInfo { /// See RegisterBankInfo::applyMapping. void applyMappingImpl(const OperandsMapper &OpdMapper) const override; diff --git a/contrib/llvm/lib/Target/AArch64/AArch64RegisterBanks.td b/contrib/llvm/lib/Target/AArch64/AArch64RegisterBanks.td new file mode 100644 index 000000000000..c2b6c0b04e9b --- /dev/null +++ b/contrib/llvm/lib/Target/AArch64/AArch64RegisterBanks.td @@ -0,0 +1,20 @@ +//=- AArch64RegisterBank.td - Describe the AArch64 Banks -----*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + +/// General Purpose Registers: W, X. +def GPRRegBank : RegisterBank<"GPR", [GPR64all]>; + +/// Floating Point/Vector Registers: B, H, S, D, Q. +def FPRRegBank : RegisterBank<"FPR", [QQQQ]>; + +/// Conditional register: NZCV. +def CCRRegBank : RegisterBank<"CCR", [CCR]>; diff --git a/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp index 98fad71aa18a..baf15ac540cf 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -118,25 +118,17 @@ AArch64RegisterInfo::getReservedRegs(const MachineFunction &MF) const { // FIXME: avoid re-calculating this every time. BitVector Reserved(getNumRegs()); - markSuperRegs(Reserved, AArch64::SP); - markSuperRegs(Reserved, AArch64::XZR); markSuperRegs(Reserved, AArch64::WSP); markSuperRegs(Reserved, AArch64::WZR); - if (TFI->hasFP(MF) || TT.isOSDarwin()) { - markSuperRegs(Reserved, AArch64::FP); + if (TFI->hasFP(MF) || TT.isOSDarwin()) markSuperRegs(Reserved, AArch64::W29); - } - if (MF.getSubtarget<AArch64Subtarget>().isX18Reserved()) { - markSuperRegs(Reserved, AArch64::X18); // Platform register - markSuperRegs(Reserved, AArch64::W18); - } + if (MF.getSubtarget<AArch64Subtarget>().isX18Reserved()) + markSuperRegs(Reserved, AArch64::W18); // Platform register - if (hasBasePointer(MF)) { - markSuperRegs(Reserved, AArch64::X19); + if (hasBasePointer(MF)) markSuperRegs(Reserved, AArch64::W19); - } assert(checkAllSuperRegsMarked(Reserved)); return Reserved; diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SchedA53.td b/contrib/llvm/lib/Target/AArch64/AArch64SchedA53.td index 93ca079275c8..18d000ace94c 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64SchedA53.td +++ b/contrib/llvm/lib/Target/AArch64/AArch64SchedA53.td @@ -13,7 +13,7 @@ // ===---------------------------------------------------------------------===// // The following definitions describe the simpler per-operand machine model. -// This works with MachineScheduler. See MCSchedModel.h for details. +// This works with MachineScheduler. See MCSchedule.h for details. // Cortex-A53 machine model for scheduling and other instruction cost heuristics. 
def CortexA53Model : SchedMachineModel { diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SchedA57.td b/contrib/llvm/lib/Target/AArch64/AArch64SchedA57.td index 99c48d0146e4..303398ea0b7f 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64SchedA57.td +++ b/contrib/llvm/lib/Target/AArch64/AArch64SchedA57.td @@ -162,7 +162,9 @@ def : InstRW<[A57Write_2cyc_1M], (instregex "BFM")>; // Cryptography Extensions // ----------------------------------------------------------------------------- -def : InstRW<[A57Write_3cyc_1W], (instregex "^AES")>; +def A57ReadAES : SchedReadAdvance<3, [A57Write_3cyc_1W]>; +def : InstRW<[A57Write_3cyc_1W], (instregex "^AES[DE]")>; +def : InstRW<[A57Write_3cyc_1W, A57ReadAES], (instregex "^AESI?MC")>; def : InstRW<[A57Write_6cyc_2V], (instregex "^SHA1SU0")>; def : InstRW<[A57Write_3cyc_1W], (instregex "^SHA1(H|SU1)")>; def : InstRW<[A57Write_6cyc_2W], (instregex "^SHA1[CMP]")>; diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SchedFalkor.td b/contrib/llvm/lib/Target/AArch64/AArch64SchedFalkor.td index 19a6d6f2a1ad..eec089087fe0 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64SchedFalkor.td +++ b/contrib/llvm/lib/Target/AArch64/AArch64SchedFalkor.td @@ -17,10 +17,112 @@ // instruction cost model. def FalkorModel : SchedMachineModel { - let IssueWidth = 4; // 4-wide issue for expanded uops. + let IssueWidth = 8; // 8 uops are dispatched per cycle. let MicroOpBufferSize = 128; // Out-of-order with temporary unified issue buffer. let LoopMicroOpBufferSize = 16; let LoadLatency = 3; // Optimistic load latency. let MispredictPenalty = 11; // Minimum branch misprediction penalty. - let CompleteModel = 0; + let CompleteModel = 1; +} + +//===----------------------------------------------------------------------===// +// Define each kind of processor resource and number available on Falkor. + +let SchedModel = FalkorModel in { + + def FalkorUnitB : ProcResource<1>; // Branch + def FalkorUnitLD : ProcResource<1>; // Load pipe + def FalkorUnitSD : ProcResource<1>; // Store data + def FalkorUnitST : ProcResource<1>; // Store pipe + def FalkorUnitX : ProcResource<1>; // Complex arithmetic + def FalkorUnitY : ProcResource<1>; // Simple arithmetic + def FalkorUnitZ : ProcResource<1>; // Simple arithmetic + + def FalkorUnitVSD : ProcResource<1>; // Vector store data + def FalkorUnitVX : ProcResource<1>; // Vector X-pipe + def FalkorUnitVY : ProcResource<1>; // Vector Y-pipe + + def FalkorUnitGTOV : ProcResource<1>; // Scalar to Vector + def FalkorUnitVTOG : ProcResource<1>; // Vector to Scalar + + // Define the resource groups. + def FalkorUnitXY : ProcResGroup<[FalkorUnitX, FalkorUnitY]>; + def FalkorUnitXYZ : ProcResGroup<[FalkorUnitX, FalkorUnitY, FalkorUnitZ]>; + def FalkorUnitXYZB : ProcResGroup<[FalkorUnitX, FalkorUnitY, FalkorUnitZ, + FalkorUnitB]>; + def FalkorUnitZB : ProcResGroup<[FalkorUnitZ, FalkorUnitB]>; + def FalkorUnitVXVY : ProcResGroup<[FalkorUnitVX, FalkorUnitVY]>; + +} + +//===----------------------------------------------------------------------===// +// Map the target-defined scheduler read/write resources and latency for +// Falkor. 
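+// +// As an illustrative reading (not part of the source comments): each WriteRes +// below lists the pipes an instruction class occupies and its result latency, so +// def : WriteRes<WriteID32, [FalkorUnitX, FalkorUnitZ]> +// { let Latency = 8; let NumMicroOps = 2; } +// models a 32-bit integer divide as two uops, one on the X pipe and one on the +// Z pipe, with the result ready after 8 cycles.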
+ +let SchedModel = FalkorModel in { +def : WriteRes<WriteImm, [FalkorUnitXYZ]> { let Latency = 1; } +def : WriteRes<WriteI, [FalkorUnitXYZ]> { let Latency = 1; } +def : WriteRes<WriteISReg, [FalkorUnitVXVY, FalkorUnitVXVY]> + { let Latency = 1; let NumMicroOps = 2; } +def : WriteRes<WriteIEReg, [FalkorUnitXYZ, FalkorUnitXYZ]> + { let Latency = 2; let NumMicroOps = 2; } +def : WriteRes<WriteExtr, [FalkorUnitXYZ, FalkorUnitXYZ]> + { let Latency = 2; let NumMicroOps = 2; } +def : WriteRes<WriteIS, [FalkorUnitXYZ]> { let Latency = 1; } +def : WriteRes<WriteID32, [FalkorUnitX, FalkorUnitZ]> + { let Latency = 8; let NumMicroOps = 2; } +def : WriteRes<WriteID64, [FalkorUnitX, FalkorUnitZ]> + { let Latency = 16; let NumMicroOps = 2; } +def : WriteRes<WriteIM32, [FalkorUnitX]> { let Latency = 4; } +def : WriteRes<WriteIM64, [FalkorUnitX]> { let Latency = 5; } +def : WriteRes<WriteBr, [FalkorUnitB]> { let Latency = 1; } +def : WriteRes<WriteBrReg, [FalkorUnitB]> { let Latency = 1; } +def : WriteRes<WriteLD, [FalkorUnitLD]> { let Latency = 3; } +def : WriteRes<WriteST, [FalkorUnitLD, FalkorUnitST, FalkorUnitSD]> + { let Latency = 3; let NumMicroOps = 3; } +def : WriteRes<WriteSTP, [FalkorUnitST, FalkorUnitSD]> + { let Latency = 0; let NumMicroOps = 2; } +def : WriteRes<WriteAdr, [FalkorUnitXYZ]> { let Latency = 5; } +def : WriteRes<WriteLDIdx, [FalkorUnitLD]> { let Latency = 5; } +def : WriteRes<WriteSTIdx, [FalkorUnitLD, FalkorUnitST, FalkorUnitSD]> + { let Latency = 4; let NumMicroOps = 3; } +def : WriteRes<WriteF, [FalkorUnitVXVY, FalkorUnitVXVY]> + { let Latency = 3; let NumMicroOps = 2; } +def : WriteRes<WriteFCmp, [FalkorUnitVXVY]> { let Latency = 2; } +def : WriteRes<WriteFCvt, [FalkorUnitVXVY]> { let Latency = 4; } +def : WriteRes<WriteFCopy, [FalkorUnitVXVY]> { let Latency = 4; } +def : WriteRes<WriteFImm, [FalkorUnitVXVY]> { let Latency = 4; } +def : WriteRes<WriteFMul, [FalkorUnitVXVY, FalkorUnitVXVY]> + { let Latency = 6; let NumMicroOps = 2; } +def : WriteRes<WriteFDiv, [FalkorUnitVXVY, FalkorUnitVXVY]> + { let Latency = 12; let NumMicroOps = 2; } // Fragment -1 / NoRSV +1 +def : WriteRes<WriteV, [FalkorUnitVXVY]> { let Latency = 6; } +def : WriteRes<WriteVLD, [FalkorUnitLD]> { let Latency = 3; } +def : WriteRes<WriteVST, [FalkorUnitST, FalkorUnitVSD]> + { let Latency = 0; let NumMicroOps = 2; } + +def : WriteRes<WriteSys, []> { let Latency = 1; } +def : WriteRes<WriteBarrier, []> { let Latency = 1; } +def : WriteRes<WriteHint, []> { let Latency = 1; } + +def : WriteRes<WriteLDHi, []> { let Latency = 3; } + +def : WriteRes<WriteAtomic, []> { let Unsupported = 1; } + +// No forwarding logic is modelled yet.
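+// A ReadAdvance of 0 means each operand is read at the producer's full +// latency; a positive cycle count (compare the SchedReadAdvance<3, ...> used +// for the Cortex-A57 AES entries above) would model forwarding by letting a +// consumer read the result early.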
+def : ReadAdvance<ReadI, 0>; +def : ReadAdvance<ReadISReg, 0>; +def : ReadAdvance<ReadIEReg, 0>; +def : ReadAdvance<ReadIM, 0>; +def : ReadAdvance<ReadIMA, 0>; +def : ReadAdvance<ReadID, 0>; +def : ReadAdvance<ReadExtrHi, 0>; +def : ReadAdvance<ReadAdrBase, 0>; +def : ReadAdvance<ReadVLD, 0>; + +// Detailed Refinements +// ----------------------------------------------------------------------------- +include "AArch64SchedFalkorDetails.td" + } diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SchedFalkorDetails.td b/contrib/llvm/lib/Target/AArch64/AArch64SchedFalkorDetails.td new file mode 100644 index 000000000000..6bce4ef6b652 --- /dev/null +++ b/contrib/llvm/lib/Target/AArch64/AArch64SchedFalkorDetails.td @@ -0,0 +1,523 @@ +//==- AArch64SchedFalkorDetails.td - Falkor Scheduling Defs -*- tablegen -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the uop and latency details for the machine model for the +// Qualcomm Falkor subtarget. +// +//===----------------------------------------------------------------------===// + +include "AArch64SchedFalkorWriteRes.td" + +//===----------------------------------------------------------------------===// +// Specialize the coarse model by associating instruction groups with the +// subtarget-defined types. As the model is refined, this will override most +// of the earlier mappings. + +// Miscellaneous +// ----------------------------------------------------------------------------- + +def : InstRW<[WriteI], (instrs COPY)>; + +// SIMD Floating-point Instructions +// ----------------------------------------------------------------------------- +def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^(FABS|FNEG)(v2f32|v4f16)$")>; + +def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^(F(MAX|MIN)(NM)?P?|FAC(GE|GT))(v2f32|v4f16|v2i16p|v2i32p)$")>; +def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^FAC(GE|GT)(16|32|64)$")>; +def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^FCM(EQ|GE|GT)(16|32|64|v2f32|v4f16|v2i32|v4i16)$")>; +def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^FCM(EQ|LE|GE|GT|LT)(v1i16|v1i32|v1i64|v2i32|v4i16)rz$")>; +def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^FRINT(A|I|M|N|P|X|Z)(v2f32|v4f16)$")>; + +def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^F(MAX|MIN)(NM)?V(v4i16|v4i32|v8i16)v$")>; +def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(FABD|FADD|FSUB)(v2f32|v4f16)$")>; +def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^FADDP(v2i16p|v2i32p|v2i64p|v2f32|v4f16)$")>; + +def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^FCVT(N|M|P|Z|A)(S|U)(v1i32|v1i64|v1f16|v2f32|v4f16)$")>; +def : InstRW<[FalkorWr_1VXVY_4cyc], (instrs FCVTXNv1i64)>; +def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^FCVTZ(S|U)(v2i32|v4i16)(_shift)?$")>; + +def : InstRW<[FalkorWr_1VXVY_5cyc], (instregex "^(FMUL|FMULX)(v2f32|v4f16|(v1i16_indexed|v4i16_indexed|v1i32_indexed|v2i32_indexed))$")>; +def : InstRW<[FalkorWr_1VXVY_5cyc], (instrs FMULX16, FMULX32)>; + +def : InstRW<[FalkorWr_1VXVY_6cyc], (instregex "^(FMUL|FMULX)v1i64_indexed$")>; +def : InstRW<[FalkorWr_1VXVY_6cyc], (instrs FMULX64)>; + +def : InstRW<[FalkorWr_2VXVY_1cyc], (instregex "^(FABS|FNEG)(v2f64|v4f32|v8f16)$")>; + +def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^(F(MAX|MIN)(NM)?P?|FAC(GE|GT)|FCM(EQ|GE|GT))(v2f64|v4f32|v8f16|v2i64p)$")>; +def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex
"^FCM(EQ|LE|GE|GT|LT)(v2i64|v4i32|v8i16)rz$")>; +def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^FRINT(A|I|M|N|P|X|Z)(v2f64|v4f32|v8f16)$")>; + +def : InstRW<[FalkorWr_1VX_1VY_10cyc],(instregex "^(FDIV|FSQRT)(v2f32|v4f16)$")>; + +def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^(FABD|FADD(P)?|FSUB)(v2f64|v4f32|v8f16)$")>; + +def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^FCVT(N|M|P|Z|A)(S|U)(v2f64|v4f32|v8f16)$")>; +def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^(FCVTL|FCVTL2)(v2i32|v4i16|v4i32|v8i16)$")>; +def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^FCVTZ(S|U)(v2i64|v4i32|v8i16)(_shift)?$")>; + +def : InstRW<[FalkorWr_2VXVY_5cyc], (instregex "^(FMUL|FMULX)(v2f64|v4f32|v8f16|v8i16_indexed|v4i32_indexed)$")>; + +def : InstRW<[FalkorWr_2VXVY_6cyc], (instregex "^(FMUL|FMULX)v2i64_indexed$")>; + +def : InstRW<[FalkorWr_3VXVY_4cyc], (instregex "^(FCVTX?N|FCVTX?N2)(v1i32|v1i64|v1f16|v2f32|v4f16)$")>; + +def : InstRW<[FalkorWr_3VXVY_5cyc], (instregex "^(FCVTX?N|FCVTX?N2)(v2i32|v4i16|v4i32|v8i16|v4f32)$")>; + +def : InstRW<[FalkorWr_2VX_2VY_2cyc], (instregex "^(FDIV|FSQRT)(v2f64|v4f32|v8f16)$")>; + +def : InstRW<[FalkorWr_1VXVY_4cyc, FalkorReadVMA],(instregex "^ML(A|S)(v8i8|v4i16|v2i32)(_indexed)?$")>; +def : InstRW<[FalkorWr_2VXVY_4cyc, FalkorReadVMA],(instregex "^ML(A|S)(v16i8|v8i16|v4i32|v2i64)(_indexed)?$")>; + +def : InstRW<[FalkorWr_1VXVY_5cyc, FalkorReadFMA],(instregex "^FML(A|S)(v2f32|v4f16|(v1i16_indexed|v4i16_indexed|v1i32_indexed|v2i32_indexed))$")>; +def : InstRW<[FalkorWr_1VXVY_6cyc, FalkorReadFMA],(instregex "^FML(A|S)v1i64_indexed$")>; +def : InstRW<[FalkorWr_2VXVY_5cyc, FalkorReadFMA],(instregex "^FML(A|S)(v2f64|v4f32|v8f16|v8i16_indexed|v4i32_indexed)$")>; +def : InstRW<[FalkorWr_2VXVY_6cyc, FalkorReadFMA],(instregex "^FML(A|S)v2i64_indexed$")>; +// SIMD Integer Instructions +// ----------------------------------------------------------------------------- +def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^ADD(v1i64|v2i32|v4i16|v8i8)$")>; +def : InstRW<[FalkorWr_1VXVY_1cyc], (instrs ADDPv2i64p)>; +def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^(AND|ORR|ORN|BIC|EOR)v8i8$")>; +def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^(BIC|ORR)(v2i32|v4i16)$")>; +def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^NEG(v1i64|v2i32|v4i16|v8i8)$")>; +def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^SUB(v1i64|v2i32|v4i16|v8i8)$")>; + +def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^(S|U)(ADDLP|HADD|HSUB|SHL)(v2i32|v4i16|v8i8)(_v.*)?$")>; +def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^(S|U)SHLv1i64$")>; +def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^(S|U)SHR(v2i32|v4i16|v8i8)_shift$")>; +def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^((S|U)?(MAX|MIN)P?|ABS|ADDP|CM(EQ|GE|HS|GT|HI))(v1i64|v2i32|v4i16|v8i8)$")>; +def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^CM(EQ|GE|HS|GT|HI)(v1i64|v2i32|v4i16|v8i8)$")>; +def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^CM(EQ|LE|GE|GT|LT)(v1i64|v2i32|v4i16|v8i8)rz$")>; +def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^CMTST(v1i64|v2i32|v4i16|v8i8)$")>; +def : InstRW<[FalkorWr_1VXVY_2cyc], (instrs PMULv8i8)>; +def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^SHL(v2i32|v4i16|v8i8)_shift$")>; + +def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^SQNEG(v2i32|v4i16|v8i8)$")>; +def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(S|U)R?SRA(d|(v2i32|v4i16|v8i8)_shift)$")>; +def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(S|U)(ABD|ADALP)(v8i8|v4i16|v2i32)(_v.*)?$")>; +def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(S|U)ADDLVv4i16v$")>; +def : 
InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(S|U)QADD(v1i8|v1i16|v2i16|v1i32|v1i64|v2i32|v4i16|v8i8)$")>; +def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(S|U)QSHLU?(d|s|h|b|(v8i8|v4i16|v2i32)_shift)$")>; +def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(S|U)(QSHL|RSHL|QRSHL)(v1i8|v1i16|v1i32|v1i64|v2i32|v4i16|v8i8)$")>; +def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(SQR?SHRN|UQR?SHRN|SQR?SHRUN)(s|h|b)$")>; +def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(S|U)QSUB(v1i8|v1i16|v2i16|v1i32|v1i64|v2i32|v4i16|v8i8)$")>; +def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(S|U)RHADD(v2i32|v4i16|v8i8)$")>; +def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(S|U)RSHR(v2i32|v4i16|v8i8)_shift$")>; +def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(SU|US)QADD(v1i8|v1i16|v2i16|v1i32|v1i64|v2i32|v4i16|v8i8)$")>; +def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(S|U)?(MAX|MIN)V(v4i16v|v4i32v)$")>; +def : InstRW<[FalkorWr_1VXVY_3cyc], (instrs ADDVv4i16v)>; +def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^S(L|R)I(d|(v8i8|v4i16|v2i32)_shift)$")>; +def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^SQABS(v1i8|v1i16|v1i32|v1i64|v2i32|v4i16|v8i8)$")>; +def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^SQNEG(v1i8|v1i16|v1i32|v1i64)$")>; + +def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^(S|U)ADDLVv8i8v$")>; +def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^(S|U)?(MAX|MIN)V(v8i8v|v8i16v)$")>; +def : InstRW<[FalkorWr_1VXVY_4cyc], (instrs ADDVv8i8v)>; +def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^MUL(v2i32|v4i16|v8i8)(_indexed)?$")>; +def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^SQR?DMULH(v8i8|v4i16|v1i32|v2i32|v1i16)(_indexed)?$")>; +def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^SQDMULL(i16|i32)$")>; +def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^SQRDML(A|S)?H(v8i8|v4i16|v1i32|v2i32|v1i16)(_indexed)?$")>; + +def : InstRW<[FalkorWr_1VXVY_5cyc], (instregex "^(S|U)?(MAX|MIN)Vv16i8v$")>; + +def : InstRW<[FalkorWr_2VXVY_3cyc], (instrs ADDVv4i32v)>; + +def : InstRW<[FalkorWr_2VXVY_4cyc], (instrs ADDVv8i16v)>; +def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^(ADD|SUB)HNv.*$")>; +def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^(S|U)ABA(v2i32|v4i16|v8i8)$")>; + +def : InstRW<[FalkorWr_2VXVY_5cyc], (instrs ADDVv16i8v)>; + +def : InstRW<[FalkorWr_2VXVY_6cyc], (instregex "^(SQR?SHRN|UQR?SHRN|SQR?SHRUN)(v8i8|v16i8|v4i16|v8i16|v2i32|v4i32)_shift?$")>; +def : InstRW<[FalkorWr_2VXVY_6cyc], (instregex "^R(ADD|SUB)HNv.*$")>; + +def : InstRW<[FalkorWr_2VXVY_1cyc], (instregex "^ADD(v16i8|v8i16|v4i32|v2i64)$")>; +def : InstRW<[FalkorWr_2VXVY_1cyc], (instrs ADDPv2i64)>; // sz==11 +def : InstRW<[FalkorWr_2VXVY_1cyc], (instregex "^(AND|ORR|ORN|BIC|EOR)v16i8$")>; +def : InstRW<[FalkorWr_2VXVY_1cyc], (instregex "^(BIC|ORR)(v8i16|v4i32)$")>; +def : InstRW<[FalkorWr_2VXVY_1cyc], (instregex "^(NEG|SUB)(v16i8|v8i16|v4i32|v2i64)$")>; + +def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^(S|U)ADDLv.*$")>; +def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^(S|U)(ADDLP|HADD|HSUB|SHL)(v16i8|v2i64|v4i32|v8i16)(_v.*)?$")>; +def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^(S|U)SHLL(v16i8|v8i16|v4i32|v8i8|v4i16|v2i32)(_shift)?$")>; +def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^(S|U)SHR(v16i8|v8i16|v4i32|v2i64)_shift$")>; +def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^(S|U)SUBLv.*$")>; +def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^((S|U)?(MAX|MIN)P?|ABS)(v16i8|v2i64|v4i32|v8i16)$")>; +def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^ADDP(v4i32|v8i16|v16i8)$")>; // sz!=11 +def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex 
"^CM(EQ|GE|HS|GT|HI)(v16i8|v2i64|v4i32|v8i16)$")>; +def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^CM(EQ|LE|GE|GT|LT)(v16i8|v2i64|v4i32|v8i16)rz$")>; +def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^(CMTST|PMUL)(v16i8|v2i64|v4i32|v8i16)$")>; +def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^PMULL2?(v8i8|v16i8)$")>; +def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^SHL(v16i8|v8i16|v4i32|v2i64)_shift$")>; +def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^SHLL(v16i8|v8i16|v4i32|v8i8|v4i16|v2i32)(_shift)?$")>; + +def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^(S|U)R?SRA(v2i64|v4i32|v8i16|v16i8)_shift$")>; +def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^(S|U)ABD(v16i8|v8i16|v4i32|v2i64)$")>; +def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^(S|U)ABDLv.*$")>; +def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^(S|U)(ADALP|QADD)(v16i8|v8i16|v4i32|v2i64)(_v.*)?$")>; +def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^(S|U)QSHLU?(v2i64|v4i32|v8i16|v16i8)_shift$")>; +def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^(S|U)(QSHL|RSHL|QRSHL|QSUB|RHADD)(v16i8|v8i16|v4i32|v2i64)$")>; +def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^(S|U)RSHR(v2i64|v4i32|v8i16|v16i8)_shift$")>; +def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^(SU|US)QADD(v16i8|v8i16|v4i32|v2i64)$")>; +def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^PMULL2?(v1i64|v2i64)$")>; +def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^S(L|R)I(v16i8|v8i16|v4i32|v2i64)_shift$")>; +def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^SQ(ABS|NEG)(v16i8|v8i16|v4i32|v2i64)$")>; + +def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^(MUL|SQR?DMULH)(v16i8|v8i16|v4i32)(_indexed)?$")>; +def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^SQDMULLv.*$")>; +def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^SQRDML(A|S)H(v16i8|v8i16|v4i32)(_indexed)?$")>; + +def : InstRW<[FalkorWr_3VXVY_3cyc], (instregex "^(S|U)ADDLVv4i32v$")>; + +def : InstRW<[FalkorWr_3VXVY_5cyc], (instregex "^(S|U)ADDLVv8i16v$")>; + +def : InstRW<[FalkorWr_3VXVY_6cyc], (instregex "^(S|U)ADDLVv16i8v$")>; + +def : InstRW<[FalkorWr_4VXVY_2cyc], (instregex "^(S|U)(ADD|SUB)Wv.*$")>; + +def : InstRW<[FalkorWr_4VXVY_3cyc], (instregex "^(S|U)ABALv.*$")>; + +def : InstRW<[FalkorWr_4VXVY_4cyc], (instregex "^(S|U)ABA(v16i8|v8i16|v4i32)$")>; + +def : InstRW<[FalkorWr_1VXVY_4cyc, FalkorReadVMA],(instregex "^SQD(MLAL|MLSL)(i16|i32)$")>; +def : InstRW<[FalkorWr_2VXVY_4cyc, FalkorReadVMA],(instregex "^SQD(MLAL|MLSL)v.*$")>; +// SIMD Load Instructions +// ----------------------------------------------------------------------------- +def : InstRW<[WriteVLD], (instregex "^LD1(i64|Onev(8b|4h|2s|1d|16b|8h|4s|2d))$")>; +def : InstRW<[WriteVLD], (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[WriteVLD], (instrs LD2i64)>; +def : InstRW<[WriteVLD, WriteAdr], (instregex "^LD1(i64|Onev(8b|4h|2s|1d|16b|8h|4s|2d))_POST$")>; +def : InstRW<[WriteVLD, WriteAdr], (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : InstRW<[WriteVLD, WriteAdr], (instrs LD2i64_POST)>; + +def : InstRW<[FalkorWr_1LD_1VXVY_4cyc], (instregex "LD1i(8|16|32)$")>; +def : InstRW<[FalkorWr_1LD_1VXVY_4cyc, WriteAdr], (instregex "LD1i(8|16|32)_POST$")>; + +def : InstRW<[FalkorWr_1LD_1none_3cyc], (instregex "^LD1Twov(8b|4h|2s|1d)$")>; +def : InstRW<[FalkorWr_1LD_1none_3cyc], (instregex "^LD2Twov(8b|4h|2s|1d)$")>; +def : InstRW<[FalkorWr_1LD_1none_3cyc], (instregex "^LD2Rv(8b|4h|2s|1d)$")>; +def : InstRW<[FalkorWr_1LD_1none_3cyc, WriteAdr], (instregex "^LD1Twov(8b|4h|2s|1d)_POST$")>; +def : InstRW<[FalkorWr_1LD_1none_3cyc, WriteAdr], 
(instregex "^LD2Twov(8b|4h|2s|1d)_POST$")>; +def : InstRW<[FalkorWr_1LD_1none_3cyc, WriteAdr], (instregex "^LD2Rv(8b|4h|2s|1d)_POST$")>; + +def : InstRW<[FalkorWr_2LD_3cyc], (instregex "^LD1Twov(16b|8h|4s|2d)$")>; +def : InstRW<[FalkorWr_2LD_3cyc], (instregex "^LD2Twov(16b|8h|4s|2d)$")>; +def : InstRW<[FalkorWr_2LD_3cyc], (instregex "^LD2Rv(16b|8h|4s|2d)$")>; +def : InstRW<[FalkorWr_2LD_3cyc], (instrs LD3i64)>; +def : InstRW<[FalkorWr_2LD_3cyc], (instrs LD4i64)>; +def : InstRW<[FalkorWr_2LD_3cyc, WriteAdr], (instregex "^LD1Twov(16b|8h|4s|2d)_POST$")>; +def : InstRW<[FalkorWr_2LD_3cyc, WriteAdr], (instregex "^LD2Twov(16b|8h|4s|2d)_POST$")>; +def : InstRW<[FalkorWr_2LD_3cyc, WriteAdr], (instregex "^LD2Rv(16b|8h|4s|2d)_POST$")>; +def : InstRW<[FalkorWr_2LD_3cyc, WriteAdr], (instrs LD3i64_POST)>; +def : InstRW<[FalkorWr_2LD_3cyc, WriteAdr], (instrs LD4i64_POST)>; + +def : InstRW<[FalkorWr_1LD_2VXVY_4cyc], (instregex "^LD2i(8|16|32)$")>; +def : InstRW<[FalkorWr_1LD_2VXVY_4cyc, WriteAdr], (instregex "^LD2i(8|16|32)_POST$")>; + +def : InstRW<[FalkorWr_2LD_1none_3cyc], (instregex "^LD1Threev(8b|4h|2s|1d)$")>; +def : InstRW<[FalkorWr_2LD_1none_3cyc], (instregex "^LD3Rv(8b|4h|2s|1d)$")>; +def : InstRW<[FalkorWr_2LD_1none_3cyc, WriteAdr], (instregex "^LD1Threev(8b|4h|2s|1d)_POST$")>; +def : InstRW<[FalkorWr_2LD_1none_3cyc, WriteAdr], (instregex "^LD3Rv(8b|4h|2s|1d)_POST$")>; + +def : InstRW<[FalkorWr_3LD_3cyc], (instregex "^LD1Threev(16b|8h|4s|2d)$")>; +def : InstRW<[FalkorWr_3LD_3cyc], (instrs LD3Threev2d)>; +def : InstRW<[FalkorWr_3LD_3cyc], (instregex "^LD3Rv(16b|8h|4s|2d)$")>; +def : InstRW<[FalkorWr_3LD_3cyc, WriteAdr], (instregex "^LD1Threev(16b|8h|4s|2d)_POST$")>; +def : InstRW<[FalkorWr_3LD_3cyc, WriteAdr], (instrs LD3Threev2d_POST)>; +def : InstRW<[FalkorWr_3LD_3cyc, WriteAdr], (instregex "^LD3Rv(16b|8h|4s|2d)_POST$")>; + +def : InstRW<[FalkorWr_1LD_3VXVY_4cyc], (instregex "LD3i(8|16|32)$")>; +def : InstRW<[FalkorWr_1LD_3VXVY_4cyc, WriteAdr], (instregex "LD3i(8|16|32)_POST$")>; + +def : InstRW<[FalkorWr_2LD_2none_3cyc], (instregex "^LD1Fourv(8b|4h|2s|1d)$")>; +def : InstRW<[FalkorWr_2LD_2none_3cyc], (instregex "^LD4Rv(8b|4h|2s|1d)$")>; +def : InstRW<[FalkorWr_2LD_2none_3cyc, WriteAdr], (instregex "^LD1Fourv(8b|4h|2s|1d)_POST$")>; +def : InstRW<[FalkorWr_2LD_2none_3cyc, WriteAdr], (instregex "^LD4Rv(8b|4h|2s|1d)_POST$")>; + +def : InstRW<[FalkorWr_4LD_3cyc], (instregex "^LD1Fourv(16b|8h|4s|2d)$")>; +def : InstRW<[FalkorWr_4LD_3cyc], (instrs LD4Fourv2d)>; +def : InstRW<[FalkorWr_4LD_3cyc], (instregex "^LD4Rv(16b|8h|4s|2d)$")>; +def : InstRW<[FalkorWr_4LD_3cyc, WriteAdr], (instregex "^LD1Fourv(16b|8h|4s|2d)_POST$")>; +def : InstRW<[FalkorWr_4LD_3cyc, WriteAdr], (instrs LD4Fourv2d_POST)>; +def : InstRW<[FalkorWr_4LD_3cyc, WriteAdr], (instregex "^LD4Rv(16b|8h|4s|2d)_POST$")>; + +def : InstRW<[FalkorWr_1LD_4VXVY_4cyc], (instregex "^LD4i(8|16|32)$")>; +def : InstRW<[FalkorWr_1LD_4VXVY_4cyc, WriteAdr], (instregex "^LD4i(8|16|32)_POST$")>; + +def : InstRW<[FalkorWr_2LD_2VXVY_1none_4cyc], (instregex "LD3Threev(8b|4h|2s|1d)$")>; +def : InstRW<[FalkorWr_2LD_2VXVY_1none_4cyc, WriteAdr],(instregex "LD3Threev(8b|4h|2s|1d)_POST$")>; + +def : InstRW<[FalkorWr_2LD_2VXVY_2none_4cyc], (instregex "^LD4Fourv(8b|4h|2s|1d)$")>; +def : InstRW<[FalkorWr_2LD_2VXVY_2none_4cyc, WriteAdr],(instregex "^LD4Fourv(8b|4h|2s|1d)_POST$")>; + +def : InstRW<[FalkorWr_2LD_2VXVY_2LD_2VXVY_4cyc], (instregex "LD3Threev(16b|8h|4s)$")>; +def : InstRW<[FalkorWr_2LD_2VXVY_2LD_2VXVY_4cyc], (instregex "^LD4Fourv(16b|8h|4s)$")>; + +def : 
InstRW<[FalkorWr_2LD_2VXVY_1XYZ_2LD_2VXVY_4cyc, WriteAdr],(instregex "LD3Threev(16b|8h|4s)_POST$")>; +def : InstRW<[FalkorWr_2LD_2VXVY_2LD_1XYZ_2VXVY_4cyc, WriteAdr],(instregex "^LD4Fourv(16b|8h|4s)_POST$")>; + +// Arithmetic and Logical Instructions +// ----------------------------------------------------------------------------- +def : InstRW<[FalkorWr_ADD], (instregex "^ADD(S)?(W|X)r(s|x)$")>; +def : InstRW<[FalkorWr_2XYZ_2cyc], (instregex "^SUB(S)?(W|X)r(s|x)$")>; + +// SIMD Miscellaneous Instructions +// ----------------------------------------------------------------------------- +def : InstRW<[FalkorWr_1GTOV_1cyc], (instregex "^DUP(v8i8|v4i16|v2i32)(gpr|lane)$")>; +def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^DUP(v16i8|v8i16)(gpr|lane)$")>; +def : InstRW<[FalkorWr_1GTOV_1cyc], (instregex "^INSv(i8|i16)(gpr|lane)$")>; +def : InstRW<[FalkorWr_1VTOG_1cyc], (instregex "^(S|U)MOVv.*$")>; +def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^(BIF|BIT|BSL)v8i8$")>; +def : InstRW<[FalkorWr_1VXVY_1cyc], (instrs EXTv8i8)>; +def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "(MOVI|MVNI)(D|v8b_ns|v2i32|v4i16|v2s_msl)$")>; +def : InstRW<[FalkorWr_1VXVY_1cyc], (instrs TBLv8i8One)>; +def : InstRW<[FalkorWr_1VXVY_1cyc], (instrs NOTv8i8)>; +def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^REV(16|32|64)v.*$")>; +def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^(TRN1|TRN2|ZIP1|UZP1|UZP2|ZIP2|XTN|XTN2)(v2i32|v2i64|v4i16|v4i32|v8i8|v8i16|v16i8)$")>; + +def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^(CLS|CLZ|CNT|RBIT)(v4i32|v8i16|v16i8)$")>; + +def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "(S|U)QXTU?Nv.*$")>; +def : InstRW<[FalkorWr_1VXVY_3cyc], (instrs FRECPEv1i32, FRECPEv1i64, FRSQRTEv1i32, FRSQRTEv1i64, FRECPEv2f32, FRSQRTEv2f32)>; +def : InstRW<[FalkorWr_1VXVY_3cyc], (instrs FRECPXv1i32, FRECPXv1i64)>; +def : InstRW<[FalkorWr_1VXVY_3cyc], (instrs URECPEv2i32, URSQRTEv2i32)>; + +def : InstRW<[FalkorWr_1VXVY_5cyc], (instrs FRECPS32, FRSQRTS32, FRECPSv2f32, FRSQRTSv2f32)>; + +def : InstRW<[FalkorWr_1VXVY_6cyc], (instrs FRECPS64, FRSQRTS64)>; + +def : InstRW<[FalkorWr_1GTOV_1VXVY_2cyc],(instregex "^INSv(i32|i64)(gpr|lane)$")>; +def : InstRW<[FalkorWr_2GTOV_1cyc], (instregex "^DUP(v4i32|v2i64)(gpr|lane)$")>; +def : InstRW<[FalkorWr_2VXVY_1cyc], (instrs EXTv16i8)>; +def : InstRW<[FalkorWr_2VXVY_1cyc], (instregex "(MOVI|MVNI)(v2d_ns|v16b_ns|v4i32|v8i16|v4s_msl)$")>; +def : InstRW<[FalkorWr_2VXVY_1cyc], (instrs NOTv16i8)>; +def : InstRW<[FalkorWr_2VXVY_1cyc], (instrs TBLv16i8One)>; + +def : InstRW<[FalkorWr_2VXVY_3cyc], (instrs FRECPEv2f64, FRECPEv4f32, FRSQRTEv2f64, FRSQRTEv4f32)>; +def : InstRW<[FalkorWr_2VXVY_3cyc], (instrs URECPEv4i32, URSQRTEv4i32)>; + +def : InstRW<[FalkorWr_2VXVY_4cyc], (instrs TBLv8i8Two)>; +def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^TBX(v8|v16)i8One$")>; + +def : InstRW<[FalkorWr_2VXVY_5cyc], (instrs FRECPSv4f32, FRSQRTSv4f32)>; + +def : InstRW<[FalkorWr_2VXVY_6cyc], (instrs FRECPSv2f64, FRSQRTSv2f64)>; + +def : InstRW<[FalkorWr_3VXVY_5cyc], (instregex "^TBL(v8i8Three|v16i8Two)$")>; +def : InstRW<[FalkorWr_3VXVY_5cyc], (instregex "^TBX(v8i8Two|v16i8Two)$")>; + +def : InstRW<[FalkorWr_4VXVY_6cyc], (instregex "^TBL(v8i8Four|v16i8Three)$")>; +def : InstRW<[FalkorWr_4VXVY_6cyc], (instregex "^TBX(v8i8Three|v16i8Three)$")>; + +def : InstRW<[FalkorWr_5VXVY_7cyc], (instrs TBLv16i8Four)>; +def : InstRW<[FalkorWr_5VXVY_7cyc], (instregex "^TBX(v8i8Four|v16i8Four)$")>; + +// SIMD Store Instructions +// ----------------------------------------------------------------------------- +def : 
InstRW<[WriteVST], (instregex "^ST1(One(v8b|v4h|v2s|v1d)(_POST)?|(i8|i16|i32|i64)(_POST)?|One(v16b|v8h|v4s|v2d)|Two(v8b|v4h|v2s|v1d))$")>; +def : InstRW<[WriteVST], (instregex "^ST2(Two(v8b|v4h|v2s|v1d)|(i8|i16|i32|i64))$")>; +def : InstRW<[WriteVST, WriteAdr], (instregex "^ST1(One(v16b|v8h|v4s|v2d)|Two(v8b|v4h|v2s|v1d))_POST$")>; +def : InstRW<[WriteVST, WriteAdr], (instregex "^ST2(Two(v8b|v4h|v2s|v1d)|(i8|i16|i32|i64))_POST$")>; + +def : InstRW<[WriteVST, WriteVST], (instregex "^ST1(Two(v16b|v8h|v4s|v2d)|(Three|Four)(v8b|v4h|v2s|v1d))$")>; +def : InstRW<[WriteVST, WriteVST], (instregex "^ST2Two(v16b|v8h|v4s|v2d)$")>; +def : InstRW<[WriteVST, WriteVST], (instregex "^ST3(i8|i16|i32|i64)$")>; +def : InstRW<[WriteVST, WriteVST], (instregex "^ST4(i8|i16|i32|i64)$")>; +def : InstRW<[WriteVST, WriteVST, WriteAdr], (instregex "^ST1(Two(v16b|v8h|v4s|v2d)|(Three|Four)(v8b|v4h|v2s|v1d))_POST$")>; +def : InstRW<[WriteVST, WriteVST, WriteAdr], (instregex "^ST2Two(v16b|v8h|v4s|v2d)_POST$")>; +def : InstRW<[WriteVST, WriteVST, WriteAdr], (instregex "^ST3(i8|i16|i32|i64)_POST$")>; +def : InstRW<[WriteVST, WriteVST, WriteAdr], (instregex "^ST4(i8|i16|i32|i64)_POST$")>; + +def : InstRW<[WriteV, WriteVST, WriteVST], (instregex "^ST3Three(v8b|v4h|v2s|v1d)$")>; +def : InstRW<[WriteV, WriteVST, WriteVST, WriteAdr], (instregex "^ST3Three(v8b|v4h|v2s|v1d)_POST$")>; + +def : InstRW<[WriteVST, WriteVST, WriteVST], (instregex "^ST1Three(v16b|v8h|v4s|v2d)$")>; +def : InstRW<[WriteVST, WriteVST, WriteVST], (instrs ST3Threev2d)>; +def : InstRW<[WriteVST, WriteVST, WriteVST, WriteAdr], (instregex "^ST1Three(v16b|v8h|v4s|v2d)_POST$")>; +def : InstRW<[WriteVST, WriteVST, WriteVST, WriteAdr], (instrs ST3Threev2d_POST)>; + +def : InstRW<[WriteV, WriteV, WriteVST, WriteVST], (instregex "^ST4Four(v8b|v4h|v2s|v1d)$")>; +def : InstRW<[WriteV, WriteV, WriteVST, WriteVST, WriteAdr], (instregex "^ST4Four(v8b|v4h|v2s|v1d)_POST$")>; + +def : InstRW<[WriteVST, WriteVST, WriteVST, WriteVST], (instregex "^ST1Four(v16b|v8h|v4s|v2d)$")>; +def : InstRW<[WriteVST, WriteVST, WriteVST, WriteVST], (instrs ST4Fourv2d)>; +def : InstRW<[WriteVST, WriteVST, WriteVST, WriteVST, WriteAdr], (instregex "^ST1Four(v16b|v8h|v4s|v2d)_POST$")>; +def : InstRW<[WriteVST, WriteVST, WriteVST, WriteVST, WriteAdr], (instrs ST4Fourv2d_POST)>; + +def : InstRW<[WriteV, WriteV, WriteVST, WriteVST, WriteVST, WriteVST], (instregex "^ST3Three(v16b|v8h|v4s)$")>; +def : InstRW<[WriteV, WriteV, WriteVST, WriteVST, WriteVST, WriteVST, WriteAdr],(instregex "^ST3Three(v16b|v8h|v4s)_POST$")>; + +def : InstRW<[WriteV, WriteV, WriteV, WriteV, WriteVST, WriteVST, WriteVST, WriteVST], (instregex "^ST4Four(v16b|v8h|v4s)$")>; +def : InstRW<[WriteV, WriteV, WriteV, WriteV, WriteVST, WriteVST, WriteVST, WriteVST, WriteAdr],(instregex "^ST4Four(v16b|v8h|v4s)_POST$")>; + +// Branch Instructions +// ----------------------------------------------------------------------------- +def : InstRW<[FalkorWr_1none_0cyc], (instrs B)>; +def : InstRW<[FalkorWr_1Z_0cyc], (instregex "^(BR|RET|(CBZ|CBNZ|TBZ|TBNZ)(W|X))$")>; +def : InstRW<[FalkorWr_1ZB_0cyc], (instrs Bcc)>; +def : InstRW<[FalkorWr_1XYZB_0cyc], (instrs BL)>; +def : InstRW<[FalkorWr_1Z_1XY_0cyc], (instrs BLR)>; + +// Cryptography Extensions +// ----------------------------------------------------------------------------- +def : InstRW<[FalkorWr_1VXVY_1cyc], (instrs SHA1Hrr)>; +def : InstRW<[FalkorWr_1VXVY_2cyc], (instrs AESIMCrr, AESMCrr)>; +def : InstRW<[FalkorWr_2VXVY_3cyc], (instrs AESDrr, AESErr)>; +def : 
InstRW<[FalkorWr_2VXVY_2cyc], (instrs SHA1SU0rrr, SHA1SU1rr, SHA256SU0rr)>; +def : InstRW<[FalkorWr_1VX_1VY_4cyc], (instregex "^SHA1(C|M|P)rrr$")>; +def : InstRW<[FalkorWr_1VX_1VY_5cyc], (instrs SHA256H2rrr, SHA256Hrrr)>; +def : InstRW<[FalkorWr_4VXVY_3cyc], (instrs SHA256SU1rrr)>; + +// FP Load Instructions +// ----------------------------------------------------------------------------- +def : InstRW<[WriteLD], (instregex "^LDR((Q|D|S|H|B)ui|(Q|D|S)l)$")>; +def : InstRW<[WriteLD, WriteAdr], (instregex "^LDR(Q|D|S|H|B)(post|pre)$")>; +def : InstRW<[WriteLD], (instregex "^LDUR(Q|D|S|H|B)i$")>; +def : InstRW<[FalkorWr_LDR], (instregex "^LDR(Q|D|H|S|B)ro(W|X)$")>; +def : InstRW<[FalkorWr_2LD_3cyc, WriteLDHi],(instrs LDNPQi)>; +def : InstRW<[FalkorWr_2LD_3cyc, WriteLDHi],(instrs LDPQi)>; +def : InstRW<[FalkorWr_1LD_1none_3cyc, WriteLDHi],(instregex "LDNP(D|S)i$")>; +def : InstRW<[FalkorWr_1LD_1none_3cyc, WriteLDHi],(instregex "LDP(D|S)i$")>; +def : InstRW<[FalkorWr_1LD_1none_3cyc, WriteLDHi, WriteAdr],(instregex "LDP(D|S)(pre|post)$")>; +def : InstRW<[FalkorWr_2LD_3cyc, WriteLDHi, WriteAdr],(instregex "^LDPQ(pre|post)$")>; + +// FP Data Processing Instructions +// ----------------------------------------------------------------------------- +def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^FCCMP(E)?(H|S|D)rr$")>; +def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^FCMP(E)?(H|S|D)r(r|i)$")>; +def : InstRW<[FalkorWr_1VTOG_1cyc], (instregex "^FCVT(A|M|N|P)(S|U)U(W|X)(H|S|D)r$")>; +def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^(FABS|FNEG)(H|S|D)r$")>; +def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^FCSEL(H|S|D)rrr$")>; + +def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^F(MAX|MIN)(NM)?(H|S|D)rr$")>; +def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^F(MAX|MIN)(NM)?Pv2i(16|32|64)p$")>; +def : InstRW<[FalkorWr_1VXVY_2cyc], (instrs FCVTHSr, FCVTHDr)>; +def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^FRINT(A|I|M|N|P|X|Z)(H|S|D)r$")>; + +def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^FABD(16|32|64)$")>; +def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(FADD|FSUB)(H|S|D)rr$")>; +def : InstRW<[FalkorWr_1VXVY_3cyc], (instrs FCVTSHr, FCVTDHr)>; + +def : InstRW<[FalkorWr_1VXVY_4cyc], (instrs FCVTSDr, FCVTDSr)>; + +def : InstRW<[FalkorWr_1VXVY_5cyc], (instregex "^F(N)?MUL(H|S)rr$")>; + +def : InstRW<[FalkorWr_1VXVY_6cyc], (instregex "^F(N)?MULDrr$")>; + +def : InstRW<[FalkorWr_1VX_1VY_10cyc],(instregex "^FDIV(H|S|D)rr$")>; +def : InstRW<[FalkorWr_1VX_1VY_2cyc], (instregex "^FSQRT(H|S|D)r$")>; + +def : InstRW<[FalkorWr_1VXVY_5cyc, FalkorReadFMA],(instregex "^F(N)?M(ADD|SUB)(H|S)rrr$")>; +def : InstRW<[FalkorWr_1VXVY_6cyc, FalkorReadFMA],(instregex "^F(N)?M(ADD|SUB)Drrr$")>; +// FP Miscellaneous Instructions +// ----------------------------------------------------------------------------- +def : InstRW<[FalkorWr_FMOV], (instregex "^FMOV(HW|HX|SW|DX|DXHigh)r$")>; +def : InstRW<[FalkorWr_1VTOG_1cyc], (instregex "^FCVTZ(S|U)(S|U)(W|X)(D|S)ri?$")>; +def : InstRW<[FalkorWr_1VTOG_1cyc], (instregex "^FMOV(WH|WS|XH|XD|XDHigh)r$")>; +def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^FMOV(Hi|Hr|S0|Si|Sr|D0|Di|Dr|v.*_ns)$")>; + +def : InstRW<[FalkorWr_1GTOV_4cyc], (instregex "^(S|U)CVTF(S|U)(W|X)(D|S)ri$")>; +def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^(S|U)CVTF(v1i16|v1i32|v2i32|v1i64|v4i16|v2f32|v4f16|d|s)(_shift)?")>; + +def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^(S|U)CVTF(v2i64|v4i32|v8i16|v2f64|v4f32|v8f16)(_shift)?")>; + + +// Load Instructions +// 
----------------------------------------------------------------------------- +def : InstRW<[FalkorWr_1ST_0cyc], (instrs PRFMui, PRFMl)>; +def : InstRW<[FalkorWr_1ST_0cyc], (instrs PRFUMi)>; + +def : InstRW<[WriteLD, WriteLDHi], (instregex "^LDNP(W|X)i$")>; +def : InstRW<[WriteLD, WriteLDHi], (instregex "^LDP(W|X)i$")>; +def : InstRW<[FalkorWr_1LD_3cyc], (instregex "^LDR(B|H|W|X)ui$")>; +def : InstRW<[WriteLD, WriteAdr], (instregex "^LDR(B|H|W|X)(post|pre)$")>; +def : InstRW<[FalkorWr_1LD_3cyc], (instregex "^LDR(W|X)l$")>; +def : InstRW<[FalkorWr_1LD_3cyc], (instregex "^LDTR(B|H|W|X)i$")>; +def : InstRW<[FalkorWr_1LD_3cyc], (instregex "^LDUR(B|H|W|X)i$")>; + +def : InstRW<[FalkorWr_1LD_4cyc], (instregex "^LDRS(BW|BX|HW|HX|W)ui$")>; +def : InstRW<[FalkorWr_1LD_4cyc], (instrs LDRSWl)>; +def : InstRW<[FalkorWr_1LD_4cyc], (instregex "^LDTRS(BW|BX|HW|HX|W)i$")>; +def : InstRW<[FalkorWr_1LD_4cyc], (instregex "^LDURS(BW|BX|HW|HX|W)i$")>; + +def : InstRW<[FalkorWr_PRFM], (instregex "^PRFMro(W|X)$")>; +def : InstRW<[FalkorWr_LDR], (instregex "^LDR(B|H|W|X)ro(W|X)$")>; + +def : InstRW<[FalkorWr_LDRS], (instregex "^LDRS(BW|BX|HW|HX|W)ro(W|X)$")>; + +def : InstRW<[FalkorWr_1LD_4cyc, WriteAdr],(instregex "^LDRS(BW|BX|HW|HX|W)(post|pre)$")>; +def : InstRW<[WriteLD, WriteLDHi, WriteAdr],(instregex "^LDP(W|X)(post|pre)$")>; +def : InstRW<[FalkorWr_1LD_4cyc, WriteLDHi],(instrs LDPSWi)>; +def : InstRW<[FalkorWr_1LD_4cyc, WriteLDHi, WriteAdr],(instregex "^LDPSW(post|pre)$")>; +// Miscellaneous Data-Processing Instructions +// ----------------------------------------------------------------------------- +def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^(S|U)?BFM(W|X)ri$")>; +def : InstRW<[FalkorWr_1X_2cyc], (instregex "^CRC32.*$")>; +def : InstRW<[FalkorWr_1XYZ_2cyc], (instregex "^(CLS|CLZ|RBIT|REV|REV16|REV32)(W|X)r$")>; +def : InstRW<[FalkorWr_2XYZ_2cyc], (instregex "^EXTR(W|X)rri$")>; + +// Divide and Multiply Instructions +// ----------------------------------------------------------------------------- +def : InstRW<[FalkorWr_1X_4cyc], (instregex "^(S|U)M(ADD|SUB)Lrrr$")>; +def : InstRW<[FalkorWr_1X_4cyc], (instregex "^M(ADD|SUB)Wrrr$")>; + +def : InstRW<[FalkorWr_1X_5cyc], (instregex "^(S|U)MULHrr$")>; +def : InstRW<[FalkorWr_1X_5cyc], (instregex "^M(ADD|SUB)Xrrr$")>; + +def : InstRW<[FalkorWr_1X_1Z_8cyc], (instregex "^(S|U)DIVWr$")>; +def : InstRW<[FalkorWr_1X_1Z_16cyc], (instregex "^(S|U)DIVXr$")>; + +def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^(S|U)(MLAL|MLSL|MULL)v.*$")>; + +// Move and Shift Instructions +// ----------------------------------------------------------------------------- +def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^(LSLV|LSRV|ASRV|RORV|MOVK)(W|X).*")>; +def : InstRW<[FalkorWr_1XYZB_1cyc], (instregex "^ADRP?$")>; +def : InstRW<[FalkorWr_1XYZB_1cyc], (instregex "^MOVN(W|X)i$")>; +def : InstRW<[FalkorWr_MOVZ], (instregex "^MOVZ(W|X)i$")>; + +// Other Instructions +// ----------------------------------------------------------------------------- +def : InstRW<[FalkorWr_1LD_0cyc], (instrs CLREX, DMB, DSB)>; +def : InstRW<[FalkorWr_1none_0cyc], (instrs BRK, DCPS1, DCPS2, DCPS3, HINT, HLT, HVC, ISB, SMC, SVC)>; +def : InstRW<[FalkorWr_1ST_0cyc], (instrs SYSxt, SYSLxt)>; +def : InstRW<[FalkorWr_1Z_0cyc], (instrs MSRpstateImm1, MSRpstateImm4)>; + +def : InstRW<[FalkorWr_1LD_3cyc], (instregex "^(LDAR(B|H|W|X)|LDAXP(W|X)|LDAXR(B|H|W|X)|LDXP(W|X)|LDXR(B|H|W|X))$")>; +def : InstRW<[FalkorWr_1LD_3cyc], (instrs MRS)>; + +def : InstRW<[FalkorWr_1LD_1Z_3cyc], (instrs DRPS)>; + +def : 
InstRW<[FalkorWr_1SD_1ST_0cyc], (instrs MSR)>;
+def : InstRW<[WriteVST], (instrs STNPDi, STNPSi)>;
+def : InstRW<[WriteSTP], (instrs STNPWi, STNPXi)>;
+def : InstRW<[FalkorWr_2LD_1Z_3cyc], (instrs ERET)>;
+
+def : InstRW<[WriteST], (instregex "^LDC.*$")>;
+def : InstRW<[WriteST], (instregex "^STLR(B|H|W|X)$")>;
+def : InstRW<[WriteST], (instregex "^STXP(W|X)$")>;
+def : InstRW<[WriteST], (instregex "^STXR(B|H|W|X)$")>;
+
+def : InstRW<[WriteSTX], (instregex "^STLXP(W|X)$")>;
+def : InstRW<[WriteSTX], (instregex "^STLXR(B|H|W|X)$")>;
+def : InstRW<[WriteVST, WriteVST], (instrs STNPQi)>;
+
+// Store Instructions
+// -----------------------------------------------------------------------------
+def : InstRW<[WriteVST], (instregex "^STP(D|S)(i|post|pre)$")>;
+def : InstRW<[WriteST], (instregex "^STP(W|X)(i|post|pre)$")>;
+def : InstRW<[WriteST], (instregex "^STR(Q|D|S|BB|HH)ui$")>;
+def : InstRW<[WriteST], (instregex "^STUR(Q|D|S|BB|HH)i$")>;
+def : InstRW<[WriteST], (instregex "^STR(B|H|W|X)(post|pre|ui)$")>;
+def : InstRW<[WriteST], (instregex "^STTR(B|H|W|X)i$")>;
+def : InstRW<[WriteST], (instregex "^STUR(B|H|W|X)i$")>;
+
+def : InstRW<[WriteST, WriteAdr], (instregex "^STR(B|H|W|X)ro(W|X)$")>;
+
+def : InstRW<[WriteVST, WriteVST], (instregex "^STPQ(i|post|pre)$")>;
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SchedFalkorWriteRes.td b/contrib/llvm/lib/Target/AArch64/AArch64SchedFalkorWriteRes.td
new file mode 100644
index 000000000000..9cdb4be4246b
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64SchedFalkorWriteRes.td
@@ -0,0 +1,361 @@
+//=- AArch64SchedFalkorWriteRes.td - Falkor Write Res ---*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Contains all of the Falkor specific SchedWriteRes types. The approach
+// below is to define a generic SchedWriteRes for every combination of
+// latency and micro-ops. The naming convention is to use a prefix, one field
+// for latency, and one or more micro-op count/type designators.
+// Prefix: FalkorWr
+// MicroOp Count/Types: #(B|X|Y|Z|LD|ST|SD|VX|VY|VSD)
+// Latency: #cyc
+//
+// e.g. FalkorWr_1Z_6SD_4VX_6cyc means there are 11 micro-ops to be issued
+// down one Z pipe, six SD pipes, and four VX pipes, and the total latency
+// is six cycles.
+//
+// Contains all of the Falkor specific ReadAdvance types for forwarding logic.
+//
+// Contains all of the Falkor specific WriteVariant types for immediate zero
+// and LSLFast.
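As an editorial illustration of the convention above (the name below is hypothetical and is not one of the types this file defines), a write type issuing one X micro-op and two VX/VY micro-ops, with the result ready after four cycles, would be written:

def FalkorWr_1X_2VXVY_4cyc : SchedWriteRes<[FalkorUnitX, FalkorUnitVXVY,
                                            FalkorUnitVXVY]> {
  let Latency = 4;     // result available four cycles after issue
  let NumMicroOps = 3; // 1 + 2 micro-ops, matching the name
}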
+//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Define 1 micro-op types + + +def FalkorWr_1X_2cyc : SchedWriteRes<[FalkorUnitX]> { let Latency = 2; } +def FalkorWr_1X_4cyc : SchedWriteRes<[FalkorUnitX]> { let Latency = 4; } +def FalkorWr_1X_5cyc : SchedWriteRes<[FalkorUnitX]> { let Latency = 5; } +def FalkorWr_1Z_0cyc : SchedWriteRes<[FalkorUnitZ]> { let Latency = 0; } +def FalkorWr_1ZB_0cyc : SchedWriteRes<[FalkorUnitZB]> { let Latency = 0; } +def FalkorWr_1LD_3cyc : SchedWriteRes<[FalkorUnitLD]> { let Latency = 3; } +def FalkorWr_1LD_4cyc : SchedWriteRes<[FalkorUnitLD]> { let Latency = 4; } +def FalkorWr_1XYZ_1cyc : SchedWriteRes<[FalkorUnitXYZ]> { let Latency = 1; } +def FalkorWr_1XYZ_2cyc : SchedWriteRes<[FalkorUnitXYZ]> { let Latency = 2; } +def FalkorWr_1XYZB_0cyc : SchedWriteRes<[FalkorUnitXYZB]>{ let Latency = 0; } +def FalkorWr_1XYZB_1cyc : SchedWriteRes<[FalkorUnitXYZB]>{ let Latency = 1; } +def FalkorWr_1none_0cyc : SchedWriteRes<[]> { let Latency = 0; } + +def FalkorWr_1VXVY_1cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 1; } +def FalkorWr_1VXVY_2cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 2; } +def FalkorWr_1VXVY_3cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 3; } +def FalkorWr_1VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 4; } +def FalkorWr_1VXVY_5cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 5; } +def FalkorWr_1VXVY_6cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 6; } + +def FalkorWr_1LD_0cyc : SchedWriteRes<[FalkorUnitLD]> { let Latency = 0; } +def FalkorWr_1ST_0cyc : SchedWriteRes<[FalkorUnitST]> { let Latency = 0; } +def FalkorWr_1ST_3cyc : SchedWriteRes<[FalkorUnitST]> { let Latency = 3; } + +def FalkorWr_1GTOV_1cyc : SchedWriteRes<[FalkorUnitGTOV]>{ let Latency = 1; } +def FalkorWr_1GTOV_4cyc : SchedWriteRes<[FalkorUnitGTOV]>{ let Latency = 4; } +def FalkorWr_1VTOG_1cyc : SchedWriteRes<[FalkorUnitVTOG]>{ let Latency = 1; } + +//===----------------------------------------------------------------------===// +// Define 2 micro-op types + +def FalkorWr_2VXVY_1cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 1; + let NumMicroOps = 2; +} +def FalkorWr_2VXVY_2cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 2; + let NumMicroOps = 2; +} +def FalkorWr_2VXVY_3cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 3; + let NumMicroOps = 2; +} +def FalkorWr_2VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 4; + let NumMicroOps = 2; +} +def FalkorWr_2VXVY_5cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 5; + let NumMicroOps = 2; +} +def FalkorWr_2VXVY_6cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 6; + let NumMicroOps = 2; +} + +def FalkorWr_1LD_1VXVY_4cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitVXVY]> { + let Latency = 4; + let NumMicroOps = 2; +} +def FalkorWr_1XYZ_1LD_4cyc : SchedWriteRes<[FalkorUnitXYZ, FalkorUnitLD]> { + let Latency = 4; + let NumMicroOps = 2; +} +def FalkorWr_2LD_3cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitLD]> { + let Latency = 3; + let NumMicroOps = 2; +} + +def FalkorWr_1VX_1VY_5cyc : SchedWriteRes<[FalkorUnitVX, FalkorUnitVY]> { + let Latency = 5; + let NumMicroOps = 2; +} + +def FalkorWr_1VX_1VY_2cyc : SchedWriteRes<[FalkorUnitVX, FalkorUnitVY]> { + let Latency = 2; + let NumMicroOps = 2; +} + +def FalkorWr_1VX_1VY_4cyc : 
SchedWriteRes<[FalkorUnitVX, FalkorUnitVY]> { + let Latency = 4; + let NumMicroOps = 2; +} + +def FalkorWr_1VX_1VY_10cyc : SchedWriteRes<[FalkorUnitVX, FalkorUnitVY]> { + let Latency = 10; + let NumMicroOps = 2; +} + +def FalkorWr_1GTOV_1VXVY_2cyc : SchedWriteRes<[FalkorUnitGTOV, FalkorUnitVXVY]> { + let Latency = 2; + let NumMicroOps = 2; +} + +def FalkorWr_2GTOV_1cyc : SchedWriteRes<[FalkorUnitGTOV, FalkorUnitGTOV]> { + let Latency = 1; + let NumMicroOps = 2; +} + +def FalkorWr_1XYZ_1ST_4cyc: SchedWriteRes<[FalkorUnitXYZ, FalkorUnitST]> { + let Latency = 4; + let NumMicroOps = 2; +} +def FalkorWr_1XYZ_1LD_5cyc: SchedWriteRes<[FalkorUnitXYZ, FalkorUnitLD]> { + let Latency = 5; + let NumMicroOps = 2; +} + +def FalkorWr_2XYZ_2cyc : SchedWriteRes<[FalkorUnitXYZ, FalkorUnitXYZ]> { + let Latency = 2; + let NumMicroOps = 2; +} + +def FalkorWr_1Z_1XY_0cyc : SchedWriteRes<[FalkorUnitZ, FalkorUnitXY]> { + let Latency = 0; + let NumMicroOps = 2; +} + +def FalkorWr_1X_1Z_8cyc : SchedWriteRes<[FalkorUnitX, FalkorUnitZ]> { + let Latency = 8; + let ResourceCycles = [2, 8]; +} + +def FalkorWr_1X_1Z_16cyc : SchedWriteRes<[FalkorUnitX, FalkorUnitZ]> { + let Latency = 16; + let ResourceCycles = [2, 16]; +} + +def FalkorWr_1LD_1Z_3cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitZ]> { + let Latency = 3; + let NumMicroOps = 2; +} + +def FalkorWr_1LD_1none_3cyc : SchedWriteRes<[FalkorUnitLD]> { + let Latency = 3; + let NumMicroOps = 2; +} + +def FalkorWr_1SD_1ST_0cyc: SchedWriteRes<[FalkorUnitSD, FalkorUnitST]> { + let Latency = 0; + let NumMicroOps = 2; +} + +//===----------------------------------------------------------------------===// +// Define 3 micro-op types + +def FalkorWr_3VXVY_3cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 3; + let NumMicroOps = 3; +} +def FalkorWr_3VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 4; + let NumMicroOps = 3; +} +def FalkorWr_3VXVY_5cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 5; + let NumMicroOps = 3; +} +def FalkorWr_3VXVY_6cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 6; + let NumMicroOps = 3; +} + +def FalkorWr_1LD_2VXVY_4cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitVXVY]> { + let Latency = 4; + let NumMicroOps = 3; +} +def FalkorWr_2LD_1none_3cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitLD]> { + let Latency = 3; + let NumMicroOps = 3; +} +def FalkorWr_3LD_3cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitLD, + FalkorUnitLD]> { + let Latency = 3; + let NumMicroOps = 3; +} + +def FalkorWr_2LD_1Z_3cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitLD, + FalkorUnitZ]> { + let Latency = 3; + let NumMicroOps = 3; +} + +//===----------------------------------------------------------------------===// +// Define 4 micro-op types + +def FalkorWr_2VX_2VY_2cyc : SchedWriteRes<[FalkorUnitVX, FalkorUnitVY, + FalkorUnitVX, FalkorUnitVY]> { + let Latency = 2; + let NumMicroOps = 4; +} + +def FalkorWr_4VXVY_2cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY, + FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 2; + let NumMicroOps = 4; +} +def FalkorWr_4VXVY_3cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY, + FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 3; + let NumMicroOps = 4; +} +def FalkorWr_4VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY, + FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 4; + let NumMicroOps = 4; +} +def FalkorWr_4VXVY_6cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY, + FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 6; + let 
NumMicroOps = 4; +} + +def FalkorWr_4LD_3cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitLD, + FalkorUnitLD, FalkorUnitLD]> { + let Latency = 3; + let NumMicroOps = 4; +} + +def FalkorWr_1LD_3VXVY_4cyc: SchedWriteRes<[FalkorUnitLD, FalkorUnitVXVY, + FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 4; + let NumMicroOps = 4; +} + +def FalkorWr_2LD_2none_3cyc: SchedWriteRes<[FalkorUnitLD, FalkorUnitLD]> { + let Latency = 3; + let NumMicroOps = 4; +} + +//===----------------------------------------------------------------------===// +// Define 5 micro-op types + +def FalkorWr_1LD_4VXVY_4cyc: SchedWriteRes<[FalkorUnitLD, FalkorUnitVXVY, + FalkorUnitVXVY, FalkorUnitVXVY, + FalkorUnitVXVY]> { + let Latency = 4; + let NumMicroOps = 5; +} +def FalkorWr_2LD_2VXVY_1none_4cyc: SchedWriteRes<[FalkorUnitLD, FalkorUnitLD, + FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 4; + let NumMicroOps = 5; +} +def FalkorWr_5VXVY_7cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY, + FalkorUnitVXVY, FalkorUnitVXVY, + FalkorUnitVXVY]> { + let Latency = 7; + let NumMicroOps = 5; +} + +//===----------------------------------------------------------------------===// +// Define 6 micro-op types + +def FalkorWr_2LD_2VXVY_2none_4cyc: SchedWriteRes<[FalkorUnitLD, FalkorUnitLD, + FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 4; + let NumMicroOps = 6; +} + +//===----------------------------------------------------------------------===// +// Define 8 micro-op types + +def FalkorWr_2LD_2VXVY_2LD_2VXVY_4cyc:SchedWriteRes<[FalkorUnitLD, FalkorUnitLD, + FalkorUnitVXVY, FalkorUnitVXVY, + FalkorUnitLD, FalkorUnitLD, + FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 4; + let NumMicroOps = 8; +} + +//===----------------------------------------------------------------------===// +// Define 9 micro-op types + +def FalkorWr_2LD_2VXVY_2LD_1XYZ_2VXVY_4cyc:SchedWriteRes<[FalkorUnitLD, + FalkorUnitLD, FalkorUnitVXVY, + FalkorUnitVXVY, FalkorUnitLD, + FalkorUnitLD, FalkorUnitXYZ, + FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 4; + let NumMicroOps = 9; +} + +def FalkorWr_2LD_2VXVY_1XYZ_2LD_2VXVY_4cyc:SchedWriteRes<[FalkorUnitLD, + FalkorUnitLD, FalkorUnitVXVY, + FalkorUnitVXVY, FalkorUnitXYZ, + FalkorUnitLD, FalkorUnitLD, + FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 4; + let NumMicroOps = 9; +} + +// Forwarding logic is modeled for vector multiply and accumulate +// ----------------------------------------------------------------------------- +def FalkorReadVMA : SchedReadAdvance<2, [FalkorWr_1VXVY_4cyc, + FalkorWr_2VXVY_4cyc]>; +def FalkorReadFMA : SchedReadAdvance<3, [FalkorWr_1VXVY_5cyc, + FalkorWr_1VXVY_6cyc, + FalkorWr_2VXVY_5cyc, + FalkorWr_2VXVY_6cyc]>; + +// SchedPredicates and WriteVariants for Immediate Zero and LSLFast +// ----------------------------------------------------------------------------- +def FalkorImmZPred : SchedPredicate<[{TII->isGPRZero(*MI)}]>; +def FalkorLSLFastPred : SchedPredicate<[{TII->isFalkorLSLFast(*MI)}]>; + +def FalkorWr_FMOV : SchedWriteVariant<[ + SchedVar<FalkorImmZPred, [FalkorWr_1none_0cyc]>, + SchedVar<NoSchedPred, [FalkorWr_1GTOV_1cyc]>]>; + +def FalkorWr_MOVZ : SchedWriteVariant<[ + SchedVar<FalkorImmZPred, [FalkorWr_1none_0cyc]>, + SchedVar<NoSchedPred, [FalkorWr_1XYZB_1cyc]>]>; + +def FalkorWr_LDR : SchedWriteVariant<[ + SchedVar<FalkorLSLFastPred, [FalkorWr_1LD_3cyc]>, + SchedVar<NoSchedPred, [FalkorWr_1XYZ_1LD_4cyc]>]>; + +def FalkorWr_ADD : SchedWriteVariant<[ + SchedVar<FalkorLSLFastPred, [FalkorWr_1XYZ_1cyc]>, + SchedVar<FalkorImmZPred, [FalkorWr_1XYZ_1cyc]>, + 
SchedVar<NoSchedPred, [FalkorWr_2XYZ_2cyc]>]>; + +def FalkorWr_PRFM : SchedWriteVariant<[ + SchedVar<FalkorLSLFastPred, [FalkorWr_1ST_3cyc]>, + SchedVar<NoSchedPred, [FalkorWr_1XYZ_1ST_4cyc]>]>; + +def FalkorWr_LDRS : SchedWriteVariant<[ + SchedVar<FalkorLSLFastPred, [FalkorWr_1LD_4cyc]>, + SchedVar<NoSchedPred, [FalkorWr_1XYZ_1LD_5cyc]>]>; diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SchedKryoDetails.td b/contrib/llvm/lib/Target/AArch64/AArch64SchedKryoDetails.td index 426ae6103e4b..02cccccd3078 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64SchedKryoDetails.td +++ b/contrib/llvm/lib/Target/AArch64/AArch64SchedKryoDetails.td @@ -776,23 +776,29 @@ def KryoWrite_4cyc_X_X_115ln : } def : InstRW<[KryoWrite_4cyc_X_X_115ln], (instregex "FCVTZ(S|U)(v2f64|v4f32|(v2i64|v4i32)(_shift)?)$")>; -def KryoWrite_1cyc_XA_Y_noRSV_43ln : +def KryoWrite_10cyc_XA_Y_noRSV_43ln : SchedWriteRes<[KryoUnitXA, KryoUnitY]> { - let Latency = 1; let NumMicroOps = 3; + let Latency = 10; let NumMicroOps = 3; } -def : InstRW<[KryoWrite_1cyc_XA_Y_noRSV_43ln], - (instrs FDIVDrr, FDIVSrr)>; -def KryoWrite_1cyc_XA_Y_noRSV_121ln : +def : InstRW<[KryoWrite_10cyc_XA_Y_noRSV_43ln], + (instrs FDIVSrr)>; +def KryoWrite_14cyc_XA_Y_noRSV_43ln : SchedWriteRes<[KryoUnitXA, KryoUnitY]> { - let Latency = 1; let NumMicroOps = 3; + let Latency = 14; let NumMicroOps = 3; } -def : InstRW<[KryoWrite_1cyc_XA_Y_noRSV_121ln], +def : InstRW<[KryoWrite_14cyc_XA_Y_noRSV_43ln], + (instrs FDIVDrr)>; +def KryoWrite_10cyc_XA_Y_noRSV_121ln : + SchedWriteRes<[KryoUnitXA, KryoUnitY]> { + let Latency = 10; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_10cyc_XA_Y_noRSV_121ln], (instrs FDIVv2f32)>; -def KryoWrite_1cyc_XA_Y_XA_Y_123ln : +def KryoWrite_14cyc_XA_Y_XA_Y_123ln : SchedWriteRes<[KryoUnitXA, KryoUnitY, KryoUnitXA, KryoUnitY]> { - let Latency = 1; let NumMicroOps = 4; + let Latency = 14; let NumMicroOps = 4; } -def : InstRW<[KryoWrite_1cyc_XA_Y_XA_Y_123ln], +def : InstRW<[KryoWrite_14cyc_XA_Y_XA_Y_123ln], (instrs FDIVv2f64, FDIVv4f32)>; def KryoWrite_5cyc_X_noRSV_55ln : SchedWriteRes<[KryoUnitX]> { @@ -968,24 +974,36 @@ def KryoWrite_2cyc_XY_XY_109ln : } def : InstRW<[KryoWrite_2cyc_XY_XY_109ln], (instregex "FRINT(A|I|M|N|P|X|Z)(v2f64|v4f32)")>; -def KryoWrite_1cyc_XA_Y_noRSV_42ln : +def KryoWrite_12cyc_XA_Y_noRSV_42ln : SchedWriteRes<[KryoUnitXA, KryoUnitY]> { - let Latency = 1; let NumMicroOps = 3; + let Latency = 12; let NumMicroOps = 3; } -def : InstRW<[KryoWrite_1cyc_XA_Y_noRSV_42ln], - (instregex "FSQRT(S|D)r")>; -def KryoWrite_1cyc_XA_Y_noRSV_120ln : +def : InstRW<[KryoWrite_12cyc_XA_Y_noRSV_42ln], + (instrs FSQRTSr)>; +def KryoWrite_21cyc_XA_Y_noRSV_42ln : SchedWriteRes<[KryoUnitXA, KryoUnitY]> { - let Latency = 1; let NumMicroOps = 3; + let Latency = 21; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_21cyc_XA_Y_noRSV_42ln], + (instrs FSQRTDr)>; +def KryoWrite_12cyc_XA_Y_noRSV_120ln : + SchedWriteRes<[KryoUnitXA, KryoUnitY]> { + let Latency = 12; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_12cyc_XA_Y_noRSV_120ln], + (instrs FSQRTv2f32)>; +def KryoWrite_21cyc_XA_Y_XA_Y_122ln : + SchedWriteRes<[KryoUnitXA, KryoUnitY, KryoUnitXA, KryoUnitY]> { + let Latency = 21; let NumMicroOps = 4; } -def : InstRW<[KryoWrite_1cyc_XA_Y_noRSV_120ln], - (instregex "FSQRTv2f32")>; -def KryoWrite_1cyc_XA_Y_XA_Y_122ln : +def : InstRW<[KryoWrite_21cyc_XA_Y_XA_Y_122ln], + (instrs FSQRTv4f32)>; +def KryoWrite_36cyc_XA_Y_XA_Y_122ln : SchedWriteRes<[KryoUnitXA, KryoUnitY, KryoUnitXA, KryoUnitY]> { - let Latency = 1; let NumMicroOps = 4; + let 
Latency = 36; let NumMicroOps = 4;
}
-def : InstRW<[KryoWrite_1cyc_XA_Y_XA_Y_122ln],
- (instregex "FSQRT(v2f64|v4f32)")>;
+def : InstRW<[KryoWrite_36cyc_XA_Y_XA_Y_122ln],
+ (instrs FSQRTv2f64)>;
def KryoWrite_1cyc_X_201ln :
SchedWriteRes<[KryoUnitX]> {
let Latency = 1; let NumMicroOps = 1;
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SchedM1.td b/contrib/llvm/lib/Target/AArch64/AArch64SchedM1.td
index 14d6891253fa..3fbbc0be682d 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64SchedM1.td
+++ b/contrib/llvm/lib/Target/AArch64/AArch64SchedM1.td
@@ -366,7 +366,8 @@ def : InstRW<[M1WriteNALU1], (instregex "^ZIP[12]v")>;
// Cryptography instructions.
def M1WriteAES : SchedWriteRes<[M1UnitNCRYPT]> { let Latency = 1; }
def M1ReadAES : SchedReadAdvance<1, [M1WriteAES]>;
-def : InstRW<[M1WriteAES, M1ReadAES], (instregex "^AES")>;
+def : InstRW<[M1WriteAES], (instregex "^AES[DE]")>;
+def : InstRW<[M1WriteAES, M1ReadAES], (instregex "^AESI?MC")>;
def : InstRW<[M1WriteNCRYPT1], (instregex "^PMUL")>;
def : InstRW<[M1WriteNCRYPT1], (instregex "^SHA1(H|SU)")>;
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SchedThunderX.td b/contrib/llvm/lib/Target/AArch64/AArch64SchedThunderX.td
new file mode 100644
index 000000000000..9a0cb702518d
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64SchedThunderX.td
@@ -0,0 +1,352 @@
+//==- AArch64SchedThunderX.td - Cavium ThunderX T8X Scheduling Definitions -*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the scheduling model data for the Cavium ThunderX T8X
+// (T88, T81, T83) processors.
+// Loosely based on the Cortex-A53 model, which is broadly similar.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// The following definitions describe the simpler per-operand machine model.
+// This works with MachineScheduler. See llvm/MC/MCSchedule.h for details.
+
+// Cavium ThunderX T8X scheduling machine model.
+def ThunderXT8XModel : SchedMachineModel {
+ let IssueWidth = 2; // 2 micro-ops dispatched per cycle.
+ let MicroOpBufferSize = 0; // ThunderX T88/T81/T83 are in-order.
+ let LoadLatency = 3; // Optimistic load latency.
+ let MispredictPenalty = 8; // Branch mispredict penalty.
+ let PostRAScheduler = 1; // Use PostRA scheduler.
+ let CompleteModel = 1;
+}
+
+// Modeling each pipeline with BufferSize == 0 since T8X is in-order.
+def THXT8XUnitALU : ProcResource<2> { let BufferSize = 0; } // Int ALU
+def THXT8XUnitMAC : ProcResource<1> { let BufferSize = 0; } // Int MAC
+def THXT8XUnitDiv : ProcResource<1> { let BufferSize = 0; } // Int Division
+def THXT8XUnitLdSt : ProcResource<1> { let BufferSize = 0; } // Load/Store
+def THXT8XUnitBr : ProcResource<1> { let BufferSize = 0; } // Branch
+def THXT8XUnitFPALU : ProcResource<1> { let BufferSize = 0; } // FP ALU
+def THXT8XUnitFPMDS : ProcResource<1> { let BufferSize = 0; } // FP Mul/Div/Sqrt
+
+//===----------------------------------------------------------------------===//
+// Subtarget-specific SchedWrite types mapping the ProcResources and
+// latencies.
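An editorial aside on how these pieces compose: with BufferSize = 0 the units are unbuffered, so a micro-op that cannot issue to its unit stalls dispatch, and within a SchedWriteRes the Latency and ResourceCycles fields are independent knobs: the first says when dependent instructions can start, the second says how long the unit stays occupied. A minimal sketch under those semantics (the names are hypothetical, not part of this patch):

// An unbuffered, in-order unit, as above.
def HypoUnit : ProcResource<1> { let BufferSize = 0; }
// Result is ready after 5 cycles, but the unit is only occupied for 3,
// so a second op of this type can start 3 cycles after the first.
def HypoWrite_5cyc : SchedWriteRes<[HypoUnit]> {
  let Latency = 5;
  let ResourceCycles = [3];
}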
+ +let SchedModel = ThunderXT8XModel in { + +// ALU +def : WriteRes<WriteImm, [THXT8XUnitALU]> { let Latency = 1; } +def : WriteRes<WriteI, [THXT8XUnitALU]> { let Latency = 1; } +def : WriteRes<WriteISReg, [THXT8XUnitALU]> { let Latency = 2; } +def : WriteRes<WriteIEReg, [THXT8XUnitALU]> { let Latency = 2; } +def : WriteRes<WriteIS, [THXT8XUnitALU]> { let Latency = 2; } +def : WriteRes<WriteExtr, [THXT8XUnitALU]> { let Latency = 2; } + +// MAC +def : WriteRes<WriteIM32, [THXT8XUnitMAC]> { + let Latency = 4; + let ResourceCycles = [1]; +} + +def : WriteRes<WriteIM64, [THXT8XUnitMAC]> { + let Latency = 4; + let ResourceCycles = [1]; +} + +// Div +def : WriteRes<WriteID32, [THXT8XUnitDiv]> { + let Latency = 12; + let ResourceCycles = [6]; +} + +def : WriteRes<WriteID64, [THXT8XUnitDiv]> { + let Latency = 14; + let ResourceCycles = [8]; +} + +// Load +def : WriteRes<WriteLD, [THXT8XUnitLdSt]> { let Latency = 3; } +def : WriteRes<WriteLDIdx, [THXT8XUnitLdSt]> { let Latency = 3; } +def : WriteRes<WriteLDHi, [THXT8XUnitLdSt]> { let Latency = 3; } + +// Vector Load +def : WriteRes<WriteVLD, [THXT8XUnitLdSt]> { + let Latency = 8; + let ResourceCycles = [3]; +} + +def THXT8XWriteVLD1 : SchedWriteRes<[THXT8XUnitLdSt]> { + let Latency = 6; + let ResourceCycles = [1]; +} + +def THXT8XWriteVLD2 : SchedWriteRes<[THXT8XUnitLdSt]> { + let Latency = 11; + let ResourceCycles = [7]; +} + +def THXT8XWriteVLD3 : SchedWriteRes<[THXT8XUnitLdSt]> { + let Latency = 12; + let ResourceCycles = [8]; +} + +def THXT8XWriteVLD4 : SchedWriteRes<[THXT8XUnitLdSt]> { + let Latency = 13; + let ResourceCycles = [9]; +} + +def THXT8XWriteVLD5 : SchedWriteRes<[THXT8XUnitLdSt]> { + let Latency = 13; + let ResourceCycles = [9]; +} + +// Pre/Post Indexing +def : WriteRes<WriteAdr, []> { let Latency = 0; } + +// Store +def : WriteRes<WriteST, [THXT8XUnitLdSt]> { let Latency = 1; } +def : WriteRes<WriteSTP, [THXT8XUnitLdSt]> { let Latency = 1; } +def : WriteRes<WriteSTIdx, [THXT8XUnitLdSt]> { let Latency = 1; } +def : WriteRes<WriteSTX, [THXT8XUnitLdSt]> { let Latency = 1; } + +// Vector Store +def : WriteRes<WriteVST, [THXT8XUnitLdSt]>; +def THXT8XWriteVST1 : SchedWriteRes<[THXT8XUnitLdSt]>; + +def THXT8XWriteVST2 : SchedWriteRes<[THXT8XUnitLdSt]> { + let Latency = 10; + let ResourceCycles = [9]; +} + +def THXT8XWriteVST3 : SchedWriteRes<[THXT8XUnitLdSt]> { + let Latency = 11; + let ResourceCycles = [10]; +} + +def : WriteRes<WriteAtomic, []> { let Unsupported = 1; } + +// Branch +def : WriteRes<WriteBr, [THXT8XUnitBr]>; +def THXT8XWriteBR : SchedWriteRes<[THXT8XUnitBr]>; +def : WriteRes<WriteBrReg, [THXT8XUnitBr]>; +def THXT8XWriteBRR : SchedWriteRes<[THXT8XUnitBr]>; +def THXT8XWriteRET : SchedWriteRes<[THXT8XUnitALU]>; +def : WriteRes<WriteSys, [THXT8XUnitBr]>; +def : WriteRes<WriteBarrier, [THXT8XUnitBr]>; +def : WriteRes<WriteHint, [THXT8XUnitBr]>; + +// FP ALU +def : WriteRes<WriteF, [THXT8XUnitFPALU]> { let Latency = 6; } +def : WriteRes<WriteFCmp, [THXT8XUnitFPALU]> { let Latency = 6; } +def : WriteRes<WriteFCvt, [THXT8XUnitFPALU]> { let Latency = 6; } +def : WriteRes<WriteFCopy, [THXT8XUnitFPALU]> { let Latency = 6; } +def : WriteRes<WriteFImm, [THXT8XUnitFPALU]> { let Latency = 6; } +def : WriteRes<WriteV, [THXT8XUnitFPALU]> { let Latency = 6; } + +// FP Mul, Div, Sqrt +def : WriteRes<WriteFMul, [THXT8XUnitFPMDS]> { let Latency = 6; } +def : WriteRes<WriteFDiv, [THXT8XUnitFPMDS]> { + let Latency = 22; + let ResourceCycles = [19]; +} + +def THXT8XWriteFMAC : SchedWriteRes<[THXT8XUnitFPMDS]> { let Latency = 10; } + +def 
THXT8XWriteFDivSP : SchedWriteRes<[THXT8XUnitFPMDS]> {
+ let Latency = 12;
+ let ResourceCycles = [9];
+}
+
+def THXT8XWriteFDivDP : SchedWriteRes<[THXT8XUnitFPMDS]> {
+ let Latency = 22;
+ let ResourceCycles = [19];
+}
+
+def THXT8XWriteFSqrtSP : SchedWriteRes<[THXT8XUnitFPMDS]> {
+ let Latency = 17;
+ let ResourceCycles = [14];
+}
+
+def THXT8XWriteFSqrtDP : SchedWriteRes<[THXT8XUnitFPMDS]> {
+ let Latency = 31;
+ let ResourceCycles = [28];
+}
+
+//===----------------------------------------------------------------------===//
+// Subtarget-specific SchedRead types.
+
+// Operands for these reads are not needed until one or two cycles after
+// issue, so a little forwarding is modeled.
+def : ReadAdvance<ReadExtrHi, 1>;
+def : ReadAdvance<ReadAdrBase, 2>;
+def : ReadAdvance<ReadVLD, 2>;
+
+// FIXME: This needs more targeted benchmarking.
+// ALU - Most operands in the ALU pipes are not needed for two cycles. Shiftable
+// operands are needed one cycle later if and only if they are to be
+// shifted. Otherwise, they too are needed two cycles later. This same
+// ReadAdvance applies to Extended registers as well, even though there is
+// a separate SchedPredicate for them.
+def : ReadAdvance<ReadI, 2, [WriteImm, WriteI,
+ WriteISReg, WriteIEReg, WriteIS,
+ WriteID32, WriteID64,
+ WriteIM32, WriteIM64]>;
+def THXT8XReadShifted : SchedReadAdvance<1, [WriteImm, WriteI,
+ WriteISReg, WriteIEReg, WriteIS,
+ WriteID32, WriteID64,
+ WriteIM32, WriteIM64]>;
+def THXT8XReadNotShifted : SchedReadAdvance<2, [WriteImm, WriteI,
+ WriteISReg, WriteIEReg, WriteIS,
+ WriteID32, WriteID64,
+ WriteIM32, WriteIM64]>;
+def THXT8XReadISReg : SchedReadVariant<[
+ SchedVar<RegShiftedPred, [THXT8XReadShifted]>,
+ SchedVar<NoSchedPred, [THXT8XReadNotShifted]>]>;
+def : SchedAlias<ReadISReg, THXT8XReadISReg>;
+
+def THXT8XReadIEReg : SchedReadVariant<[
+ SchedVar<RegExtendedPred, [THXT8XReadShifted]>,
+ SchedVar<NoSchedPred, [THXT8XReadNotShifted]>]>;
+def : SchedAlias<ReadIEReg, THXT8XReadIEReg>;
+
+// MAC - Operands are generally needed one cycle later in the MAC pipe.
+// Accumulator operands are needed two cycles later.
+def : ReadAdvance<ReadIM, 1, [WriteImm, WriteI,
+ WriteISReg, WriteIEReg, WriteIS,
+ WriteID32, WriteID64,
+ WriteIM32, WriteIM64]>;
+def : ReadAdvance<ReadIMA, 2, [WriteImm, WriteI,
+ WriteISReg, WriteIEReg, WriteIS,
+ WriteID32, WriteID64,
+ WriteIM32, WriteIM64]>;
+
+// Div
+def : ReadAdvance<ReadID, 1, [WriteImm, WriteI,
+ WriteISReg, WriteIEReg, WriteIS,
+ WriteID32, WriteID64,
+ WriteIM32, WriteIM64]>;
+
+//===----------------------------------------------------------------------===//
+// Subtarget-specific InstRW.
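An editorial note on the arithmetic here: a ReadAdvance of N means the operand is not read until N cycles after issue, so the effective latency seen through that operand is max(0, producer Latency - N). For example, with the 4-cycle WriteIM32 above and ReadIMA advanced by 2, a dependent accumulate can issue 2 cycles after the producing multiply rather than 4. A sketch with hypothetical names (not part of this patch):

// Producer: result nominally ready 4 cycles after issue.
def HypoWrite_4cyc : SchedWriteRes<[THXT8XUnitMAC]> { let Latency = 4; }
// Consumer operand read 2 cycles after issue: effective dependence
// latency is max(0, 4 - 2) = 2 cycles.
def HypoRead : SchedReadAdvance<2, [HypoWrite_4cyc]>;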
+ +//--- +// Branch +//--- +def : InstRW<[THXT8XWriteBR], (instregex "^B")>; +def : InstRW<[THXT8XWriteBR], (instregex "^BL")>; +def : InstRW<[THXT8XWriteBR], (instregex "^B.*")>; +def : InstRW<[THXT8XWriteBR], (instregex "^CBNZ")>; +def : InstRW<[THXT8XWriteBR], (instregex "^CBZ")>; +def : InstRW<[THXT8XWriteBR], (instregex "^TBNZ")>; +def : InstRW<[THXT8XWriteBR], (instregex "^TBZ")>; +def : InstRW<[THXT8XWriteBRR], (instregex "^BR")>; +def : InstRW<[THXT8XWriteBRR], (instregex "^BLR")>; + +//--- +// Ret +//--- +def : InstRW<[THXT8XWriteRET], (instregex "^RET")>; + +//--- +// Miscellaneous +//--- +def : InstRW<[WriteI], (instrs COPY)>; + +//--- +// Vector Loads +//--- +def : InstRW<[THXT8XWriteVLD1], (instregex "LD1i(8|16|32|64)$")>; +def : InstRW<[THXT8XWriteVLD1], (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[THXT8XWriteVLD1], (instregex "LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[THXT8XWriteVLD2], (instregex "LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[THXT8XWriteVLD3], (instregex "LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[THXT8XWriteVLD4], (instregex "LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[THXT8XWriteVLD1, WriteAdr], (instregex "LD1i(8|16|32|64)_POST$")>; +def : InstRW<[THXT8XWriteVLD1, WriteAdr], (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : InstRW<[THXT8XWriteVLD1, WriteAdr], (instregex "LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : InstRW<[THXT8XWriteVLD2, WriteAdr], (instregex "LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : InstRW<[THXT8XWriteVLD3, WriteAdr], (instregex "LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : InstRW<[THXT8XWriteVLD4, WriteAdr], (instregex "LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +def : InstRW<[THXT8XWriteVLD1], (instregex "LD2i(8|16|32|64)$")>; +def : InstRW<[THXT8XWriteVLD1], (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[THXT8XWriteVLD2], (instregex "LD2Twov(8b|4h|2s)$")>; +def : InstRW<[THXT8XWriteVLD4], (instregex "LD2Twov(16b|8h|4s|2d)$")>; +def : InstRW<[THXT8XWriteVLD1, WriteAdr], (instregex "LD2i(8|16|32|64)(_POST)?$")>; +def : InstRW<[THXT8XWriteVLD1, WriteAdr], (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)(_POST)?$")>; +def : InstRW<[THXT8XWriteVLD2, WriteAdr], (instregex "LD2Twov(8b|4h|2s)(_POST)?$")>; +def : InstRW<[THXT8XWriteVLD4, WriteAdr], (instregex "LD2Twov(16b|8h|4s|2d)(_POST)?$")>; + +def : InstRW<[THXT8XWriteVLD2], (instregex "LD3i(8|16|32|64)$")>; +def : InstRW<[THXT8XWriteVLD2], (instregex "LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[THXT8XWriteVLD4], (instregex "LD3Threev(8b|4h|2s|1d|16b|8h|4s)$")>; +def : InstRW<[THXT8XWriteVLD3], (instregex "LD3Threev(2d)$")>; +def : InstRW<[THXT8XWriteVLD2, WriteAdr], (instregex "LD3i(8|16|32|64)_POST$")>; +def : InstRW<[THXT8XWriteVLD2, WriteAdr], (instregex "LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : InstRW<[THXT8XWriteVLD4, WriteAdr], (instregex "LD3Threev(8b|4h|2s|1d|16b|8h|4s)_POST$")>; +def : InstRW<[THXT8XWriteVLD3, WriteAdr], (instregex "LD3Threev(2d)_POST$")>; + +def : InstRW<[THXT8XWriteVLD2], (instregex "LD4i(8|16|32|64)$")>; +def : InstRW<[THXT8XWriteVLD2], (instregex "LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[THXT8XWriteVLD5], (instregex "LD4Fourv(8b|4h|2s|1d|16b|8h|4s)$")>; +def : InstRW<[THXT8XWriteVLD4], (instregex "LD4Fourv(2d)$")>; +def : InstRW<[THXT8XWriteVLD2, WriteAdr], (instregex "LD4i(8|16|32|64)_POST$")>; +def : InstRW<[THXT8XWriteVLD2, WriteAdr], (instregex "LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : 
InstRW<[THXT8XWriteVLD5, WriteAdr], (instregex "LD4Fourv(8b|4h|2s|1d|16b|8h|4s)_POST$")>; +def : InstRW<[THXT8XWriteVLD4, WriteAdr], (instregex "LD4Fourv(2d)_POST$")>; + +//--- +// Vector Stores +//--- +def : InstRW<[THXT8XWriteVST1], (instregex "ST1i(8|16|32|64)$")>; +def : InstRW<[THXT8XWriteVST1], (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[THXT8XWriteVST1], (instregex "ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[THXT8XWriteVST2], (instregex "ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[THXT8XWriteVST2], (instregex "ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[THXT8XWriteVST1, WriteAdr], (instregex "ST1i(8|16|32|64)_POST$")>; +def : InstRW<[THXT8XWriteVST1, WriteAdr], (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : InstRW<[THXT8XWriteVST1, WriteAdr], (instregex "ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : InstRW<[THXT8XWriteVST2, WriteAdr], (instregex "ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : InstRW<[THXT8XWriteVST2, WriteAdr], (instregex "ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +def : InstRW<[THXT8XWriteVST1], (instregex "ST2i(8|16|32|64)$")>; +def : InstRW<[THXT8XWriteVST1], (instregex "ST2Twov(8b|4h|2s)$")>; +def : InstRW<[THXT8XWriteVST2], (instregex "ST2Twov(16b|8h|4s|2d)$")>; +def : InstRW<[THXT8XWriteVST1, WriteAdr], (instregex "ST2i(8|16|32|64)_POST$")>; +def : InstRW<[THXT8XWriteVST1, WriteAdr], (instregex "ST2Twov(8b|4h|2s)_POST$")>; +def : InstRW<[THXT8XWriteVST2, WriteAdr], (instregex "ST2Twov(16b|8h|4s|2d)_POST$")>; + +def : InstRW<[THXT8XWriteVST2], (instregex "ST3i(8|16|32|64)$")>; +def : InstRW<[THXT8XWriteVST3], (instregex "ST3Threev(8b|4h|2s|1d|16b|8h|4s)$")>; +def : InstRW<[THXT8XWriteVST2], (instregex "ST3Threev(2d)$")>; +def : InstRW<[THXT8XWriteVST2, WriteAdr], (instregex "ST3i(8|16|32|64)_POST$")>; +def : InstRW<[THXT8XWriteVST3, WriteAdr], (instregex "ST3Threev(8b|4h|2s|1d|16b|8h|4s)_POST$")>; +def : InstRW<[THXT8XWriteVST2, WriteAdr], (instregex "ST3Threev(2d)_POST$")>; + +def : InstRW<[THXT8XWriteVST2], (instregex "ST4i(8|16|32|64)$")>; +def : InstRW<[THXT8XWriteVST3], (instregex "ST4Fourv(8b|4h|2s|1d|16b|8h|4s)$")>; +def : InstRW<[THXT8XWriteVST2], (instregex "ST4Fourv(2d)$")>; +def : InstRW<[THXT8XWriteVST2, WriteAdr], (instregex "ST4i(8|16|32|64)_POST$")>; +def : InstRW<[THXT8XWriteVST3, WriteAdr], (instregex "ST4Fourv(8b|4h|2s|1d|16b|8h|4s)_POST$")>; +def : InstRW<[THXT8XWriteVST2, WriteAdr], (instregex "ST4Fourv(2d)_POST$")>; + +//--- +// Floating Point MAC, DIV, SQRT +//--- +def : InstRW<[THXT8XWriteFMAC], (instregex "^FN?M(ADD|SUB).*")>; +def : InstRW<[THXT8XWriteFMAC], (instregex "^FML(A|S).*")>; +def : InstRW<[THXT8XWriteFDivSP], (instrs FDIVSrr)>; +def : InstRW<[THXT8XWriteFDivDP], (instrs FDIVDrr)>; +def : InstRW<[THXT8XWriteFDivSP], (instregex "^FDIVv.*32$")>; +def : InstRW<[THXT8XWriteFDivDP], (instregex "^FDIVv.*64$")>; +def : InstRW<[THXT8XWriteFSqrtSP], (instregex "^.*SQRT.*32$")>; +def : InstRW<[THXT8XWriteFSqrtDP], (instregex "^.*SQRT.*64$")>; + +} diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SchedVulcan.td b/contrib/llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td index 35a40c314bf4..3654eeca530a 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64SchedVulcan.td +++ b/contrib/llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td @@ -1,4 +1,4 @@ -//=- AArch64SchedVulcan.td - Vulcan Scheduling Defs ----------*- tablegen -*-=// +//=- AArch64SchedThunderX2T99.td - Cavium ThunderX T99 Scheduling ---*- tablegen -*-=// // // The LLVM Compiler 
Infrastructure // @@ -6,23 +6,23 @@ // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// -// 1. Introduction // -// This file defines the machine model for Broadcom Vulcan to support -// instruction scheduling and other instruction cost heuristics. +// This file defines the scheduling model for Cavium ThunderX2T99 +// processors. +// Based on Broadcom Vulcan. // //===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// // 2. Pipeline Description. -def VulcanModel : SchedMachineModel { +def ThunderX2T99Model : SchedMachineModel { let IssueWidth = 4; // 4 micro-ops dispatched at a time. let MicroOpBufferSize = 180; // 180 entries in micro-op re-order buffer. let LoadLatency = 4; // Optimistic load latency. let MispredictPenalty = 12; // Extra cycles for mispredicted branch. // Determined via a mix of micro-arch details and experimentation. - let LoopMicroOpBufferSize = 32; + let LoopMicroOpBufferSize = 32; let PostRAScheduler = 1; // Using PostRA sched. let CompleteModel = 1; } @@ -30,155 +30,155 @@ def VulcanModel : SchedMachineModel { // Define the issue ports. // Port 0: ALU, FP/SIMD. -def VulcanP0 : ProcResource<1>; +def THX2T99P0 : ProcResource<1>; // Port 1: ALU, FP/SIMD, integer mul/div. -def VulcanP1 : ProcResource<1>; +def THX2T99P1 : ProcResource<1>; // Port 2: ALU, Branch. -def VulcanP2 : ProcResource<1>; +def THX2T99P2 : ProcResource<1>; // Port 3: Store data. -def VulcanP3 : ProcResource<1>; +def THX2T99P3 : ProcResource<1>; // Port 4: Load/store. -def VulcanP4 : ProcResource<1>; +def THX2T99P4 : ProcResource<1>; // Port 5: Load/store. -def VulcanP5 : ProcResource<1>; +def THX2T99P5 : ProcResource<1>; -let SchedModel = VulcanModel in { +let SchedModel = ThunderX2T99Model in { // Define groups for the functional units on each issue port. Each group // created will be used by a WriteRes later on. // // NOTE: Some groups only contain one member. This is a way to create names for // the various functional units that share a single issue port. For example, -// "VulcanI1" for ALU ops on port 1 and "VulcanF1" for FP ops on port 1. +// "THX2T99I1" for ALU ops on port 1 and "THX2T99F1" for FP ops on port 1. // Integer divide and multiply micro-ops only on port 1. -def VulcanI1 : ProcResGroup<[VulcanP1]>; +def THX2T99I1 : ProcResGroup<[THX2T99P1]>; // Branch micro-ops only on port 2. -def VulcanI2 : ProcResGroup<[VulcanP2]>; +def THX2T99I2 : ProcResGroup<[THX2T99P2]>; // ALU micro-ops on ports 0, 1, and 2. -def VulcanI012 : ProcResGroup<[VulcanP0, VulcanP1, VulcanP2]>; +def THX2T99I012 : ProcResGroup<[THX2T99P0, THX2T99P1, THX2T99P2]>; // Crypto FP/SIMD micro-ops only on port 1. -def VulcanF1 : ProcResGroup<[VulcanP1]>; +def THX2T99F1 : ProcResGroup<[THX2T99P1]>; // FP/SIMD micro-ops on ports 0 and 1. -def VulcanF01 : ProcResGroup<[VulcanP0, VulcanP1]>; +def THX2T99F01 : ProcResGroup<[THX2T99P0, THX2T99P1]>; // Store data micro-ops only on port 3. -def VulcanSD : ProcResGroup<[VulcanP3]>; +def THX2T99SD : ProcResGroup<[THX2T99P3]>; // Load/store micro-ops on ports 4 and 5. -def VulcanLS01 : ProcResGroup<[VulcanP4, VulcanP5]>; +def THX2T99LS01 : ProcResGroup<[THX2T99P4, THX2T99P5]>; // 60 entry unified scheduler. 
-def VulcanAny : ProcResGroup<[VulcanP0, VulcanP1, VulcanP2,
- VulcanP3, VulcanP4, VulcanP5]> {
+def THX2T99Any : ProcResGroup<[THX2T99P0, THX2T99P1, THX2T99P2,
+ THX2T99P3, THX2T99P4, THX2T99P5]> {
let BufferSize = 60;
}
// Define commonly used write types for InstRW specializations.
-// All definitions follow the format: VulcanWrite_<NumCycles>Cyc_<Resources>.
+// All definitions follow the format: THX2T99Write_<NumCycles>Cyc_<Resources>.
// 3 cycles on I1.
-def VulcanWrite_3Cyc_I1 : SchedWriteRes<[VulcanI1]> { let Latency = 3; }
+def THX2T99Write_3Cyc_I1 : SchedWriteRes<[THX2T99I1]> { let Latency = 3; }
// 4 cycles on I1.
-def VulcanWrite_4Cyc_I1 : SchedWriteRes<[VulcanI1]> { let Latency = 4; }
+def THX2T99Write_4Cyc_I1 : SchedWriteRes<[THX2T99I1]> { let Latency = 4; }
// 1 cycle on I0, I1, or I2.
-def VulcanWrite_1Cyc_I012 : SchedWriteRes<[VulcanI012]> { let Latency = 1; }
+def THX2T99Write_1Cyc_I012 : SchedWriteRes<[THX2T99I012]> { let Latency = 1; }
// 5 cycles on F1.
-def VulcanWrite_5Cyc_F1 : SchedWriteRes<[VulcanF1]> { let Latency = 5; }
+def THX2T99Write_5Cyc_F1 : SchedWriteRes<[THX2T99F1]> { let Latency = 5; }
// 7 cycles on F1.
-def VulcanWrite_7Cyc_F1 : SchedWriteRes<[VulcanF1]> { let Latency = 7; }
+def THX2T99Write_7Cyc_F1 : SchedWriteRes<[THX2T99F1]> { let Latency = 7; }
// 4 cycles on F0 or F1.
-def VulcanWrite_4Cyc_F01 : SchedWriteRes<[VulcanF01]> { let Latency = 4; }
+def THX2T99Write_4Cyc_F01 : SchedWriteRes<[THX2T99F01]> { let Latency = 4; }
// 5 cycles on F0 or F1.
-def VulcanWrite_5Cyc_F01 : SchedWriteRes<[VulcanF01]> { let Latency = 5; }
+def THX2T99Write_5Cyc_F01 : SchedWriteRes<[THX2T99F01]> { let Latency = 5; }
// 6 cycles on F0 or F1.
-def VulcanWrite_6Cyc_F01 : SchedWriteRes<[VulcanF01]> { let Latency = 6; }
+def THX2T99Write_6Cyc_F01 : SchedWriteRes<[THX2T99F01]> { let Latency = 6; }
// 7 cycles on F0 or F1.
-def VulcanWrite_7Cyc_F01 : SchedWriteRes<[VulcanF01]> { let Latency = 7; }
+def THX2T99Write_7Cyc_F01 : SchedWriteRes<[THX2T99F01]> { let Latency = 7; }
// 8 cycles on F0 or F1.
-def VulcanWrite_8Cyc_F01 : SchedWriteRes<[VulcanF01]> { let Latency = 8; }
+def THX2T99Write_8Cyc_F01 : SchedWriteRes<[THX2T99F01]> { let Latency = 8; }
// 16 cycles on F0 or F1.
-def VulcanWrite_16Cyc_F01 : SchedWriteRes<[VulcanF01]> {
+def THX2T99Write_16Cyc_F01 : SchedWriteRes<[THX2T99F01]> {
let Latency = 16;
let ResourceCycles = [8];
}
// 23 cycles on F0 or F1.
-def VulcanWrite_23Cyc_F01 : SchedWriteRes<[VulcanF01]> {
+def THX2T99Write_23Cyc_F01 : SchedWriteRes<[THX2T99F01]> {
let Latency = 23;
let ResourceCycles = [11];
}
// 1 cycle on LS0 or LS1.
-def VulcanWrite_1Cyc_LS01 : SchedWriteRes<[VulcanLS01]> { let Latency = 1; }
+def THX2T99Write_1Cyc_LS01 : SchedWriteRes<[THX2T99LS01]> { let Latency = 1; }
// 4 cycles on LS0 or LS1.
-def VulcanWrite_4Cyc_LS01 : SchedWriteRes<[VulcanLS01]> { let Latency = 4; }
+def THX2T99Write_4Cyc_LS01 : SchedWriteRes<[THX2T99LS01]> { let Latency = 4; }
// 5 cycles on LS0 or LS1.
-def VulcanWrite_5Cyc_LS01 : SchedWriteRes<[VulcanLS01]> { let Latency = 5; }
+def THX2T99Write_5Cyc_LS01 : SchedWriteRes<[THX2T99LS01]> { let Latency = 5; }
// 6 cycles on LS0 or LS1.
-def VulcanWrite_6Cyc_LS01 : SchedWriteRes<[VulcanLS01]> { let Latency = 6; }
+def THX2T99Write_6Cyc_LS01 : SchedWriteRes<[THX2T99LS01]> { let Latency = 6; }
// 5 cycles on LS0 or LS1 and I0, I1, or I2.
-def VulcanWrite_5Cyc_LS01_I012 : SchedWriteRes<[VulcanLS01, VulcanI012]> {
+def THX2T99Write_5Cyc_LS01_I012 : SchedWriteRes<[THX2T99LS01, THX2T99I012]> {
let Latency = 5;
let NumMicroOps = 2;
}
// 6 cycles on LS0 or LS1 and 2 of I0, I1, or I2.
-def VulcanWrite_6Cyc_LS01_I012_I012 :
- SchedWriteRes<[VulcanLS01, VulcanI012, VulcanI012]> {
+def THX2T99Write_6Cyc_LS01_I012_I012 :
+ SchedWriteRes<[THX2T99LS01, THX2T99I012, THX2T99I012]> {
let Latency = 6;
let NumMicroOps = 3;
}
// 1 cycle on LS0 or LS1 and F0 or F1.
-def VulcanWrite_1Cyc_LS01_F01 : SchedWriteRes<[VulcanLS01, VulcanF01]> {
+def THX2T99Write_1Cyc_LS01_F01 : SchedWriteRes<[THX2T99LS01, THX2T99F01]> {
let Latency = 1;
let NumMicroOps = 2;
}
// 5 cycles on LS0 or LS1 and F0 or F1.
-def VulcanWrite_5Cyc_LS01_F01 : SchedWriteRes<[VulcanLS01, VulcanF01]> {
+def THX2T99Write_5Cyc_LS01_F01 : SchedWriteRes<[THX2T99LS01, THX2T99F01]> {
let Latency = 5;
let NumMicroOps = 2;
}
// 6 cycles on LS0 or LS1 and F0 or F1.
-def VulcanWrite_6Cyc_LS01_F01 : SchedWriteRes<[VulcanLS01, VulcanF01]> {
+def THX2T99Write_6Cyc_LS01_F01 : SchedWriteRes<[THX2T99LS01, THX2T99F01]> {
let Latency = 6;
let NumMicroOps = 2;
}
// 7 cycles on LS0 or LS1 and F0 or F1.
-def VulcanWrite_7Cyc_LS01_F01 : SchedWriteRes<[VulcanLS01, VulcanF01]> {
+def THX2T99Write_7Cyc_LS01_F01 : SchedWriteRes<[THX2T99LS01, THX2T99F01]> {
let Latency = 7;
let NumMicroOps = 2;
}
// 8 cycles on LS0 or LS1 and F0 or F1.
-def VulcanWrite_8Cyc_LS01_F01 : SchedWriteRes<[VulcanLS01, VulcanF01]> {
+def THX2T99Write_8Cyc_LS01_F01 : SchedWriteRes<[THX2T99LS01, THX2T99F01]> {
let Latency = 8;
let NumMicroOps = 2;
}
@@ -202,7 +202,7 @@ def : ReadAdvance<ReadVLD, 0>;
//===----------------------------------------------------------------------===//
// 3. Instruction Tables.
-let SchedModel = VulcanModel in { +let SchedModel = ThunderX2T99Model in { //--- // 3.1 Branch Instructions @@ -211,7 +211,7 @@ let SchedModel = VulcanModel in { // Branch, immed // Branch and link, immed // Compare and branch -def : WriteRes<WriteBr, [VulcanI2]> { let Latency = 1; } +def : WriteRes<WriteBr, [THX2T99I2]> { let Latency = 1; } def : WriteRes<WriteSys, []> { let Latency = 1; } def : WriteRes<WriteBarrier, []> { let Latency = 1; } @@ -222,7 +222,7 @@ def : WriteRes<WriteAtomic, []> { let Unsupported = 1; } // Branch, register // Branch and link, register != LR // Branch and link, register = LR -def : WriteRes<WriteBrReg, [VulcanI2]> { let Latency = 1; } +def : WriteRes<WriteBrReg, [THX2T99I2]> { let Latency = 1; } //--- // 3.2 Arithmetic and Logical Instructions @@ -233,25 +233,25 @@ def : WriteRes<WriteBrReg, [VulcanI2]> { let Latency = 1; } // Conditional compare // Conditional select // Address generation -def : WriteRes<WriteI, [VulcanI012]> { let Latency = 1; } +def : WriteRes<WriteI, [THX2T99I012]> { let Latency = 1; } def : InstRW<[WriteI], (instrs COPY)>; // ALU, extend and/or shift -def : WriteRes<WriteISReg, [VulcanI012]> { +def : WriteRes<WriteISReg, [THX2T99I012]> { let Latency = 2; let ResourceCycles = [2]; } -def : WriteRes<WriteIEReg, [VulcanI012]> { +def : WriteRes<WriteIEReg, [THX2T99I012]> { let Latency = 2; let ResourceCycles = [2]; } // Move immed -def : WriteRes<WriteImm, [VulcanI012]> { let Latency = 1; } +def : WriteRes<WriteImm, [THX2T99I012]> { let Latency = 1; } // Variable shift -def : WriteRes<WriteIS, [VulcanI012]> { let Latency = 1; } +def : WriteRes<WriteIS, [THX2T99I012]> { let Latency = 1; } //--- // 3.4 Divide and Multiply Instructions @@ -259,33 +259,33 @@ def : WriteRes<WriteIS, [VulcanI012]> { let Latency = 1; } // Divide, W-form // Latency range of 13-23. Take the average. -def : WriteRes<WriteID32, [VulcanI1]> { +def : WriteRes<WriteID32, [THX2T99I1]> { let Latency = 18; let ResourceCycles = [18]; } // Divide, X-form // Latency range of 13-39. Take the average. -def : WriteRes<WriteID64, [VulcanI1]> { +def : WriteRes<WriteID64, [THX2T99I1]> { let Latency = 26; let ResourceCycles = [26]; } // Multiply accumulate, W-form -def : WriteRes<WriteIM32, [VulcanI012]> { let Latency = 5; } +def : WriteRes<WriteIM32, [THX2T99I012]> { let Latency = 5; } // Multiply accumulate, X-form -def : WriteRes<WriteIM64, [VulcanI012]> { let Latency = 5; } +def : WriteRes<WriteIM64, [THX2T99I012]> { let Latency = 5; } // Bitfield extract, two reg -def : WriteRes<WriteExtr, [VulcanI012]> { let Latency = 1; } +def : WriteRes<WriteExtr, [THX2T99I012]> { let Latency = 1; } // Bitfield move, basic // Bitfield move, insert // NOTE: Handled by WriteIS. // Count leading -def : InstRW<[VulcanWrite_3Cyc_I1], (instregex "^CLS(W|X)r$", +def : InstRW<[THX2T99Write_3Cyc_I1], (instregex "^CLS(W|X)r$", "^CLZ(W|X)r$")>; // Reverse bits/bytes @@ -300,13 +300,13 @@ def : InstRW<[VulcanWrite_3Cyc_I1], (instregex "^CLS(W|X)r$", // Load register, unscaled immed // Load register, immed unprivileged // Load register, unsigned immed -def : WriteRes<WriteLD, [VulcanLS01]> { let Latency = 4; } +def : WriteRes<WriteLD, [THX2T99LS01]> { let Latency = 4; } // Load register, immed post-index // NOTE: Handled by WriteLD, WriteI. // Load register, immed pre-index // NOTE: Handled by WriteLD, WriteAdr. 
-def : WriteRes<WriteAdr, [VulcanI012]> { let Latency = 1; } +def : WriteRes<WriteAdr, [THX2T99I012]> { let Latency = 1; } // Load register offset, basic // Load register, register offset, scale by 4/8 @@ -314,15 +314,15 @@ def : WriteRes<WriteAdr, [VulcanI012]> { let Latency = 1; } // Load register offset, extend // Load register, register offset, extend, scale by 4/8 // Load register, register offset, extend, scale by 2 -def VulcanWriteLDIdx : SchedWriteVariant<[ - SchedVar<ScaledIdxPred, [VulcanWrite_6Cyc_LS01_I012_I012]>, - SchedVar<NoSchedPred, [VulcanWrite_5Cyc_LS01_I012]>]>; -def : SchedAlias<WriteLDIdx, VulcanWriteLDIdx>; +def THX2T99WriteLDIdx : SchedWriteVariant<[ + SchedVar<ScaledIdxPred, [THX2T99Write_6Cyc_LS01_I012_I012]>, + SchedVar<NoSchedPred, [THX2T99Write_5Cyc_LS01_I012]>]>; +def : SchedAlias<WriteLDIdx, THX2T99WriteLDIdx>; -def VulcanReadAdrBase : SchedReadVariant<[ +def THX2T99ReadAdrBase : SchedReadVariant<[ SchedVar<ScaledIdxPred, [ReadDefault]>, SchedVar<NoSchedPred, [ReadDefault]>]>; -def : SchedAlias<ReadAdrBase, VulcanReadAdrBase>; +def : SchedAlias<ReadAdrBase, THX2T99ReadAdrBase>; // Load pair, immed offset, normal // Load pair, immed offset, signed words, base != SP @@ -347,7 +347,7 @@ def : WriteRes<WriteLDHi, []> { // Store register, unscaled immed // Store register, immed unprivileged // Store register, unsigned immed -def : WriteRes<WriteST, [VulcanLS01, VulcanSD]> { +def : WriteRes<WriteST, [THX2T99LS01, THX2T99SD]> { let Latency = 1; let NumMicroOps = 2; } @@ -364,14 +364,14 @@ def : WriteRes<WriteST, [VulcanLS01, VulcanSD]> { // Store register, register offset, extend // Store register, register offset, extend, scale by 4/8 // Store register, register offset, extend, scale by 1 -def : WriteRes<WriteSTIdx, [VulcanLS01, VulcanSD, VulcanI012]> { +def : WriteRes<WriteSTIdx, [THX2T99LS01, THX2T99SD, THX2T99I012]> { let Latency = 1; let NumMicroOps = 3; } // Store pair, immed offset, W-form // Store pair, immed offset, X-form -def : WriteRes<WriteSTP, [VulcanLS01, VulcanSD]> { +def : WriteRes<WriteSTP, [THX2T99LS01, THX2T99SD]> { let Latency = 1; let NumMicroOps = 2; } @@ -389,35 +389,35 @@ def : WriteRes<WriteSTP, [VulcanLS01, VulcanSD]> { // FP absolute value // FP min/max // FP negate -def : WriteRes<WriteF, [VulcanF01]> { let Latency = 5; } +def : WriteRes<WriteF, [THX2T99F01]> { let Latency = 5; } // FP arithmetic -def : InstRW<[VulcanWrite_6Cyc_F01], (instregex "^FADD", "^FSUB")>; +def : InstRW<[THX2T99Write_6Cyc_F01], (instregex "^FADD", "^FSUB")>; // FP compare -def : WriteRes<WriteFCmp, [VulcanF01]> { let Latency = 5; } +def : WriteRes<WriteFCmp, [THX2T99F01]> { let Latency = 5; } // FP divide, S-form // FP square root, S-form -def : WriteRes<WriteFDiv, [VulcanF01]> { +def : WriteRes<WriteFDiv, [THX2T99F01]> { let Latency = 16; let ResourceCycles = [8]; } // FP divide, D-form // FP square root, D-form -def : InstRW<[VulcanWrite_23Cyc_F01], (instrs FDIVDrr, FSQRTDr)>; +def : InstRW<[THX2T99Write_23Cyc_F01], (instrs FDIVDrr, FSQRTDr)>; // FP multiply // FP multiply accumulate -def : WriteRes<WriteFMul, [VulcanF01]> { let Latency = 6; } +def : WriteRes<WriteFMul, [THX2T99F01]> { let Latency = 6; } // FP round to integral -def : InstRW<[VulcanWrite_7Cyc_F01], +def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^FRINT(A|I|M|N|P|X|Z)(Sr|Dr)")>; // FP select -def : InstRW<[VulcanWrite_4Cyc_F01], (instregex "^FCSEL")>; +def : InstRW<[THX2T99Write_4Cyc_F01], (instregex "^FCSEL")>; //--- // 3.9 FP Miscellaneous Instructions @@ -426,16 +426,16 @@ def : 
InstRW<[VulcanWrite_4Cyc_F01], (instregex "^FCSEL")>; // FP convert, from vec to vec reg // FP convert, from gen to vec reg // FP convert, from vec to gen reg -def : WriteRes<WriteFCvt, [VulcanF01]> { let Latency = 7; } +def : WriteRes<WriteFCvt, [THX2T99F01]> { let Latency = 7; } // FP move, immed // FP move, register -def : WriteRes<WriteFImm, [VulcanF01]> { let Latency = 4; } +def : WriteRes<WriteFImm, [THX2T99F01]> { let Latency = 4; } // FP transfer, from gen to vec reg // FP transfer, from vec to gen reg -def : WriteRes<WriteFCopy, [VulcanF01]> { let Latency = 4; } -def : InstRW<[VulcanWrite_5Cyc_F01], (instrs FMOVXDHighr, FMOVDXHighr)>; +def : WriteRes<WriteFCopy, [THX2T99F01]> { let Latency = 4; } +def : InstRW<[THX2T99Write_5Cyc_F01], (instrs FMOVXDHighr, FMOVDXHighr)>; //--- // 3.12 ASIMD Integer Instructions @@ -470,39 +470,39 @@ def : InstRW<[VulcanWrite_5Cyc_F01], (instrs FMOVXDHighr, FMOVDXHighr)>; // ASIMD shift by register, basic, Q-form // ASIMD shift by register, complex, D-form // ASIMD shift by register, complex, Q-form -def : WriteRes<WriteV, [VulcanF01]> { let Latency = 7; } +def : WriteRes<WriteV, [THX2T99F01]> { let Latency = 7; } // ASIMD arith, reduce, 4H/4S // ASIMD arith, reduce, 8B/8H // ASIMD arith, reduce, 16B -def : InstRW<[VulcanWrite_5Cyc_F01], +def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^ADDVv", "^SADDLVv", "^UADDLVv")>; // ASIMD logical (MOV, MVN, ORN, ORR) -def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^ORRv", "^ORNv", "^NOTv")>; +def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^ORRv", "^ORNv", "^NOTv")>; // ASIMD polynomial (8x8) multiply long -def : InstRW<[VulcanWrite_5Cyc_F01], (instrs PMULLv8i8, PMULLv16i8)>; +def : InstRW<[THX2T99Write_5Cyc_F01], (instrs PMULLv8i8, PMULLv16i8)>; //--- // 3.13 ASIMD Floating-point Instructions //--- // ASIMD FP absolute value -def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FABSv")>; +def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^FABSv")>; // ASIMD FP arith, normal, D-form // ASIMD FP arith, normal, Q-form -def : InstRW<[VulcanWrite_6Cyc_F01], (instregex "^FABDv", "^FADDv", "^FSUBv")>; +def : InstRW<[THX2T99Write_6Cyc_F01], (instregex "^FABDv", "^FADDv", "^FSUBv")>; // ASIMD FP arith,pairwise, D-form // ASIMD FP arith, pairwise, Q-form -def : InstRW<[VulcanWrite_6Cyc_F01], (instregex "^FADDPv")>; +def : InstRW<[THX2T99Write_6Cyc_F01], (instregex "^FADDPv")>; // ASIMD FP compare, D-form // ASIMD FP compare, Q-form -def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FACGEv", "^FACGTv")>; -def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FCMEQv", "^FCMGEv", +def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^FACGEv", "^FACGTv")>; +def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^FCMEQv", "^FCMGEv", "^FCMGTv", "^FCMLEv", "^FCMLTv")>; @@ -513,42 +513,42 @@ def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FCMEQv", "^FCMGEv", // NOTE: Handled by WriteV. 
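An editorial aside on the pattern used throughout this table: a WriteRes gives each generic SchedWrite class its default (WriteV above is 7 cycles on F0/F1), and InstRW entries then override particular opcodes, as the FDIV entries below do. A sketch of the override shape with a hypothetical write type and a placeholder opcode pattern (neither is part of this patch):

// Hypothetical: carve out a 9-cycle class for some ASIMD opcodes that
// the generic 7-cycle WriteV default does not fit.
def THX2T99Write_9Cyc_F01 : SchedWriteRes<[THX2T99F01]> { let Latency = 9; }
def : InstRW<[THX2T99Write_9Cyc_F01], (instregex "^SOMEOPv")>;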
// ASIMD FP divide, D-form, F32 -def : InstRW<[VulcanWrite_16Cyc_F01], (instrs FDIVv2f32)>; +def : InstRW<[THX2T99Write_16Cyc_F01], (instrs FDIVv2f32)>; // ASIMD FP divide, Q-form, F32 -def : InstRW<[VulcanWrite_16Cyc_F01], (instrs FDIVv4f32)>; +def : InstRW<[THX2T99Write_16Cyc_F01], (instrs FDIVv4f32)>; // ASIMD FP divide, Q-form, F64 -def : InstRW<[VulcanWrite_23Cyc_F01], (instrs FDIVv2f64)>; +def : InstRW<[THX2T99Write_23Cyc_F01], (instrs FDIVv2f64)>; // ASIMD FP max/min, normal, D-form // ASIMD FP max/min, normal, Q-form -def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FMAXv", "^FMAXNMv", +def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^FMAXv", "^FMAXNMv", "^FMINv", "^FMINNMv")>; // ASIMD FP max/min, pairwise, D-form // ASIMD FP max/min, pairwise, Q-form -def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FMAXPv", "^FMAXNMPv", +def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^FMAXPv", "^FMAXNMPv", "^FMINPv", "^FMINNMPv")>; // ASIMD FP max/min, reduce -def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FMAXVv", "^FMAXNMVv", +def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^FMAXVv", "^FMAXNMVv", "^FMINVv", "^FMINNMVv")>; // ASIMD FP multiply, D-form, FZ // ASIMD FP multiply, D-form, no FZ // ASIMD FP multiply, Q-form, FZ // ASIMD FP multiply, Q-form, no FZ -def : InstRW<[VulcanWrite_6Cyc_F01], (instregex "^FMULv", "^FMULXv")>; +def : InstRW<[THX2T99Write_6Cyc_F01], (instregex "^FMULv", "^FMULXv")>; // ASIMD FP multiply accumulate, Dform, FZ // ASIMD FP multiply accumulate, Dform, no FZ // ASIMD FP multiply accumulate, Qform, FZ // ASIMD FP multiply accumulate, Qform, no FZ -def : InstRW<[VulcanWrite_6Cyc_F01], (instregex "^FMLAv", "^FMLSv")>; +def : InstRW<[THX2T99Write_6Cyc_F01], (instregex "^FMLAv", "^FMLSv")>; // ASIMD FP negate -def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FNEGv")>; +def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^FNEGv")>; // ASIMD FP round, D-form // ASIMD FP round, Q-form @@ -559,39 +559,39 @@ def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FNEGv")>; //-- // ASIMD bit reverse -def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^RBITv")>; +def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^RBITv")>; // ASIMD bitwise insert, D-form // ASIMD bitwise insert, Q-form -def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^BIFv", "^BITv", "^BSLv")>; +def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^BIFv", "^BITv", "^BSLv")>; // ASIMD count, D-form // ASIMD count, Q-form -def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^CLSv", "^CLZv", "^CNTv")>; +def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^CLSv", "^CLZv", "^CNTv")>; // ASIMD duplicate, gen reg // ASIMD duplicate, element -def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^DUPv")>; +def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^DUPv")>; // ASIMD extract -def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^EXTv")>; +def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^EXTv")>; // ASIMD extract narrow // ASIMD extract narrow, saturating // NOTE: Handled by WriteV. 
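A reading aid for the scheduling entries above: WriteV gives every ASIMD instruction a 7-cycle default, and the InstRW lines then override individual opcodes matched by regex (FDIVv2f32 at 16 cycles, FDIVv2f64 at 23, FMUL/FMULX at 6). Below is a minimal sketch of that override scheme — illustrative only, using std::regex in place of TableGen's instregex, with hypothetical names not taken from the patch:

    #include <regex>
    #include <string>
    #include <vector>

    struct Override { std::regex Pattern; unsigned Latency; };

    // First matching override wins; otherwise fall back to the WriteV
    // default, mirroring how a specific InstRW entry takes precedence
    // over the catch-all scheduling class.
    unsigned asimdLatency(const std::string &Opcode) {
      static const std::vector<Override> Overrides = {
          {std::regex("^FDIVv2f32"), 16}, // ASIMD FP divide, D-form, F32
          {std::regex("^FDIVv2f64"), 23}, // ASIMD FP divide, Q-form, F64
          {std::regex("^FMULv"), 6},      // ASIMD FP multiply
      };
      for (const auto &O : Overrides)
        if (std::regex_search(Opcode, O.Pattern))
          return O.Latency;
      return 7; // WriteV default
    }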
// ASIMD insert, element to element -def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^INSv")>; +def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^INSv")>; // ASIMD move, integer immed -def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^MOVIv", "^MOVIDv")>; +def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^MOVIv", "^MOVIDv")>; // ASIMD move, FP immed -def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FMOVv")>; +def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^FMOVv")>; // ASIMD reciprocal estimate, D-form // ASIMD reciprocal estimate, Q-form -def : InstRW<[VulcanWrite_5Cyc_F01], +def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^FRECPEv", "^FRECPXv", "^URECPEv", "^FRSQRTEv", "^URSQRTEv")>; @@ -599,31 +599,31 @@ def : InstRW<[VulcanWrite_5Cyc_F01], // ASIMD reciprocal step, D-form, no FZ // ASIMD reciprocal step, Q-form, FZ // ASIMD reciprocal step, Q-form, no FZ -def : InstRW<[VulcanWrite_6Cyc_F01], (instregex "^FRECPSv", "^FRSQRTSv")>; +def : InstRW<[THX2T99Write_6Cyc_F01], (instregex "^FRECPSv", "^FRSQRTSv")>; // ASIMD reverse -def : InstRW<[VulcanWrite_5Cyc_F01], +def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^REV16v", "^REV32v", "^REV64v")>; // ASIMD table lookup, D-form // ASIMD table lookup, Q-form -def : InstRW<[VulcanWrite_8Cyc_F01], (instregex "^TBLv", "^TBXv")>; +def : InstRW<[THX2T99Write_8Cyc_F01], (instregex "^TBLv", "^TBXv")>; // ASIMD transfer, element to word or word -def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^UMOVv")>; +def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^UMOVv")>; // ASIMD transfer, element to gen reg -def : InstRW<[VulcanWrite_6Cyc_F01], (instregex "^SMOVv", "^UMOVv")>; +def : InstRW<[THX2T99Write_6Cyc_F01], (instregex "^SMOVv", "^UMOVv")>; // ASIMD transfer gen reg to element -def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^INSv")>; +def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^INSv")>; // ASIMD transpose -def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^TRN1v", "^TRN2v", +def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^TRN1v", "^TRN2v", "^UZP1v", "^UZP2v")>; // ASIMD unzip/zip -def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^ZIP1v", "^ZIP2v")>; +def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^ZIP1v", "^ZIP2v")>; //-- // 3.15 ASIMD Load Instructions @@ -631,114 +631,114 @@ def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^ZIP1v", "^ZIP2v")>; // ASIMD load, 1 element, multiple, 1 reg, D-form // ASIMD load, 1 element, multiple, 1 reg, Q-form -def : InstRW<[VulcanWrite_4Cyc_LS01], +def : InstRW<[THX2T99Write_4Cyc_LS01], (instregex "^LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[VulcanWrite_4Cyc_LS01, WriteAdr], +def : InstRW<[THX2T99Write_4Cyc_LS01, WriteAdr], (instregex "^LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; // ASIMD load, 1 element, multiple, 2 reg, D-form // ASIMD load, 1 element, multiple, 2 reg, Q-form -def : InstRW<[VulcanWrite_4Cyc_LS01], +def : InstRW<[THX2T99Write_4Cyc_LS01], (instregex "^LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[VulcanWrite_4Cyc_LS01, WriteAdr], +def : InstRW<[THX2T99Write_4Cyc_LS01, WriteAdr], (instregex "^LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; // ASIMD load, 1 element, multiple, 3 reg, D-form // ASIMD load, 1 element, multiple, 3 reg, Q-form -def : InstRW<[VulcanWrite_5Cyc_LS01], +def : InstRW<[THX2T99Write_5Cyc_LS01], (instregex "^LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[VulcanWrite_5Cyc_LS01, WriteAdr], +def : InstRW<[THX2T99Write_5Cyc_LS01, WriteAdr], (instregex "^LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; // ASIMD load, 1 element, multiple, 4 
reg, D-form // ASIMD load, 1 element, multiple, 4 reg, Q-form -def : InstRW<[VulcanWrite_6Cyc_LS01], +def : InstRW<[THX2T99Write_6Cyc_LS01], (instregex "^LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[VulcanWrite_6Cyc_LS01, WriteAdr], +def : InstRW<[THX2T99Write_6Cyc_LS01, WriteAdr], (instregex "^LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; // ASIMD load, 1 element, one lane, B/H/S // ASIMD load, 1 element, one lane, D -def : InstRW<[VulcanWrite_5Cyc_LS01_F01], (instregex "^LD1i(8|16|32|64)$")>; -def : InstRW<[VulcanWrite_5Cyc_LS01_F01, WriteAdr], +def : InstRW<[THX2T99Write_5Cyc_LS01_F01], (instregex "^LD1i(8|16|32|64)$")>; +def : InstRW<[THX2T99Write_5Cyc_LS01_F01, WriteAdr], (instregex "^LD1i(8|16|32|64)_POST$")>; // ASIMD load, 1 element, all lanes, D-form, B/H/S // ASIMD load, 1 element, all lanes, D-form, D // ASIMD load, 1 element, all lanes, Q-form -def : InstRW<[VulcanWrite_5Cyc_LS01_F01], +def : InstRW<[THX2T99Write_5Cyc_LS01_F01], (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[VulcanWrite_5Cyc_LS01_F01, WriteAdr], +def : InstRW<[THX2T99Write_5Cyc_LS01_F01, WriteAdr], (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; // ASIMD load, 2 element, multiple, D-form, B/H/S // ASIMD load, 2 element, multiple, Q-form, D -def : InstRW<[VulcanWrite_5Cyc_LS01_F01], +def : InstRW<[THX2T99Write_5Cyc_LS01_F01], (instregex "^LD2Twov(8b|4h|2s|16b|8h|4s|2d)$")>; -def : InstRW<[VulcanWrite_5Cyc_LS01_F01, WriteAdr], +def : InstRW<[THX2T99Write_5Cyc_LS01_F01, WriteAdr], (instregex "^LD2Twov(8b|4h|2s|16b|8h|4s|2d)_POST$")>; // ASIMD load, 2 element, one lane, B/H // ASIMD load, 2 element, one lane, S // ASIMD load, 2 element, one lane, D -def : InstRW<[VulcanWrite_5Cyc_LS01_F01], (instregex "^LD2i(8|16|32|64)$")>; -def : InstRW<[VulcanWrite_5Cyc_LS01_F01, WriteAdr], +def : InstRW<[THX2T99Write_5Cyc_LS01_F01], (instregex "^LD2i(8|16|32|64)$")>; +def : InstRW<[THX2T99Write_5Cyc_LS01_F01, WriteAdr], (instregex "^LD2i(8|16|32|64)_POST$")>; // ASIMD load, 2 element, all lanes, D-form, B/H/S // ASIMD load, 2 element, all lanes, D-form, D // ASIMD load, 2 element, all lanes, Q-form -def : InstRW<[VulcanWrite_5Cyc_LS01_F01], +def : InstRW<[THX2T99Write_5Cyc_LS01_F01], (instregex "^LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[VulcanWrite_5Cyc_LS01_F01, WriteAdr], +def : InstRW<[THX2T99Write_5Cyc_LS01_F01, WriteAdr], (instregex "^LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; // ASIMD load, 3 element, multiple, D-form, B/H/S // ASIMD load, 3 element, multiple, Q-form, B/H/S // ASIMD load, 3 element, multiple, Q-form, D -def : InstRW<[VulcanWrite_8Cyc_LS01_F01], +def : InstRW<[THX2T99Write_8Cyc_LS01_F01], (instregex "^LD3Threev(8b|4h|2s|16b|8h|4s|2d)$")>; -def : InstRW<[VulcanWrite_8Cyc_LS01_F01, WriteAdr], +def : InstRW<[THX2T99Write_8Cyc_LS01_F01, WriteAdr], (instregex "^LD3Threev(8b|4h|2s|16b|8h|4s|2d)_POST$")>; // ASIMD load, 3 element, one lone, B/H // ASIMD load, 3 element, one lane, S // ASIMD load, 3 element, one lane, D -def : InstRW<[VulcanWrite_7Cyc_LS01_F01], (instregex "^LD3i(8|16|32|64)$")>; -def : InstRW<[VulcanWrite_7Cyc_LS01_F01, WriteAdr], +def : InstRW<[THX2T99Write_7Cyc_LS01_F01], (instregex "^LD3i(8|16|32|64)$")>; +def : InstRW<[THX2T99Write_7Cyc_LS01_F01, WriteAdr], (instregex "^LD3i(8|16|32|64)_POST$")>; // ASIMD load, 3 element, all lanes, D-form, B/H/S // ASIMD load, 3 element, all lanes, D-form, D // ASIMD load, 3 element, all lanes, Q-form, B/H/S // ASIMD load, 3 element, all lanes, Q-form, D -def : InstRW<[VulcanWrite_7Cyc_LS01_F01], +def : 
InstRW<[THX2T99Write_7Cyc_LS01_F01], (instregex "^LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[VulcanWrite_7Cyc_LS01_F01, WriteAdr], +def : InstRW<[THX2T99Write_7Cyc_LS01_F01, WriteAdr], (instregex "^LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; // ASIMD load, 4 element, multiple, D-form, B/H/S // ASIMD load, 4 element, multiple, Q-form, B/H/S // ASIMD load, 4 element, multiple, Q-form, D -def : InstRW<[VulcanWrite_8Cyc_LS01_F01], +def : InstRW<[THX2T99Write_8Cyc_LS01_F01], (instregex "^LD4Fourv(8b|4h|2s|16b|8h|4s|2d)$")>; -def : InstRW<[VulcanWrite_8Cyc_LS01_F01, WriteAdr], +def : InstRW<[THX2T99Write_8Cyc_LS01_F01, WriteAdr], (instregex "^LD4Fourv(8b|4h|2s|16b|8h|4s|2d)_POST$")>; // ASIMD load, 4 element, one lane, B/H // ASIMD load, 4 element, one lane, S // ASIMD load, 4 element, one lane, D -def : InstRW<[VulcanWrite_6Cyc_LS01_F01], (instregex "^LD4i(8|16|32|64)$")>; -def : InstRW<[VulcanWrite_6Cyc_LS01_F01, WriteAdr], +def : InstRW<[THX2T99Write_6Cyc_LS01_F01], (instregex "^LD4i(8|16|32|64)$")>; +def : InstRW<[THX2T99Write_6Cyc_LS01_F01, WriteAdr], (instregex "^LD4i(8|16|32|64)_POST$")>; // ASIMD load, 4 element, all lanes, D-form, B/H/S // ASIMD load, 4 element, all lanes, D-form, D // ASIMD load, 4 element, all lanes, Q-form, B/H/S // ASIMD load, 4 element, all lanes, Q-form, D -def : InstRW<[VulcanWrite_6Cyc_LS01_F01], +def : InstRW<[THX2T99Write_6Cyc_LS01_F01], (instregex "^LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[VulcanWrite_6Cyc_LS01_F01, WriteAdr], +def : InstRW<[THX2T99Write_6Cyc_LS01_F01, WriteAdr], (instregex "^LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; //-- @@ -747,82 +747,82 @@ def : InstRW<[VulcanWrite_6Cyc_LS01_F01, WriteAdr], // ASIMD store, 1 element, multiple, 1 reg, D-form // ASIMD store, 1 element, multiple, 1 reg, Q-form -def : InstRW<[VulcanWrite_1Cyc_LS01], +def : InstRW<[THX2T99Write_1Cyc_LS01], (instregex "^ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[VulcanWrite_1Cyc_LS01, WriteAdr], +def : InstRW<[THX2T99Write_1Cyc_LS01, WriteAdr], (instregex "^ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; // ASIMD store, 1 element, multiple, 2 reg, D-form // ASIMD store, 1 element, multiple, 2 reg, Q-form -def : InstRW<[VulcanWrite_1Cyc_LS01], +def : InstRW<[THX2T99Write_1Cyc_LS01], (instregex "^ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[VulcanWrite_1Cyc_LS01, WriteAdr], +def : InstRW<[THX2T99Write_1Cyc_LS01, WriteAdr], (instregex "^ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; // ASIMD store, 1 element, multiple, 3 reg, D-form // ASIMD store, 1 element, multiple, 3 reg, Q-form -def : InstRW<[VulcanWrite_1Cyc_LS01], +def : InstRW<[THX2T99Write_1Cyc_LS01], (instregex "^ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[VulcanWrite_1Cyc_LS01, WriteAdr], +def : InstRW<[THX2T99Write_1Cyc_LS01, WriteAdr], (instregex "^ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; // ASIMD store, 1 element, multiple, 4 reg, D-form // ASIMD store, 1 element, multiple, 4 reg, Q-form -def : InstRW<[VulcanWrite_1Cyc_LS01], +def : InstRW<[THX2T99Write_1Cyc_LS01], (instregex "^ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[VulcanWrite_1Cyc_LS01, WriteAdr], +def : InstRW<[THX2T99Write_1Cyc_LS01, WriteAdr], (instregex "^ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; // ASIMD store, 1 element, one lane, B/H/S // ASIMD store, 1 element, one lane, D -def : InstRW<[VulcanWrite_1Cyc_LS01_F01], +def : InstRW<[THX2T99Write_1Cyc_LS01_F01], (instregex "^ST1i(8|16|32|64)$")>; -def : InstRW<[VulcanWrite_1Cyc_LS01_F01, WriteAdr], +def : InstRW<[THX2T99Write_1Cyc_LS01_F01, 
WriteAdr], (instregex "^ST1i(8|16|32|64)_POST$")>; // ASIMD store, 2 element, multiple, D-form, B/H/S // ASIMD store, 2 element, multiple, Q-form, B/H/S // ASIMD store, 2 element, multiple, Q-form, D -def : InstRW<[VulcanWrite_1Cyc_LS01_F01], +def : InstRW<[THX2T99Write_1Cyc_LS01_F01], (instregex "^ST2Twov(8b|4h|2s|16b|8h|4s|2d)$")>; -def : InstRW<[VulcanWrite_1Cyc_LS01_F01, WriteAdr], +def : InstRW<[THX2T99Write_1Cyc_LS01_F01, WriteAdr], (instregex "^ST2Twov(8b|4h|2s|16b|8h|4s|2d)_POST$")>; // ASIMD store, 2 element, one lane, B/H/S // ASIMD store, 2 element, one lane, D -def : InstRW<[VulcanWrite_1Cyc_LS01_F01], +def : InstRW<[THX2T99Write_1Cyc_LS01_F01], (instregex "^ST2i(8|16|32|64)$")>; -def : InstRW<[VulcanWrite_1Cyc_LS01_F01, WriteAdr], +def : InstRW<[THX2T99Write_1Cyc_LS01_F01, WriteAdr], (instregex "^ST2i(8|16|32|64)_POST$")>; // ASIMD store, 3 element, multiple, D-form, B/H/S // ASIMD store, 3 element, multiple, Q-form, B/H/S // ASIMD store, 3 element, multiple, Q-form, D -def : InstRW<[VulcanWrite_1Cyc_LS01_F01], +def : InstRW<[THX2T99Write_1Cyc_LS01_F01], (instregex "^ST3Threev(8b|4h|2s|16b|8h|4s|2d)$")>; -def : InstRW<[VulcanWrite_1Cyc_LS01_F01, WriteAdr], +def : InstRW<[THX2T99Write_1Cyc_LS01_F01, WriteAdr], (instregex "^ST3Threev(8b|4h|2s|16b|8h|4s|2d)_POST$")>; // ASIMD store, 3 element, one lane, B/H // ASIMD store, 3 element, one lane, S // ASIMD store, 3 element, one lane, D -def : InstRW<[VulcanWrite_1Cyc_LS01_F01], (instregex "^ST3i(8|16|32|64)$")>; -def : InstRW<[VulcanWrite_1Cyc_LS01_F01, WriteAdr], +def : InstRW<[THX2T99Write_1Cyc_LS01_F01], (instregex "^ST3i(8|16|32|64)$")>; +def : InstRW<[THX2T99Write_1Cyc_LS01_F01, WriteAdr], (instregex "^ST3i(8|16|32|64)_POST$")>; // ASIMD store, 4 element, multiple, D-form, B/H/S // ASIMD store, 4 element, multiple, Q-form, B/H/S // ASIMD store, 4 element, multiple, Q-form, D -def : InstRW<[VulcanWrite_1Cyc_LS01_F01], +def : InstRW<[THX2T99Write_1Cyc_LS01_F01], (instregex "^ST4Fourv(8b|4h|2s|16b|8h|4s|2d)$")>; -def : InstRW<[VulcanWrite_1Cyc_LS01_F01, WriteAdr], +def : InstRW<[THX2T99Write_1Cyc_LS01_F01, WriteAdr], (instregex "^ST4Fourv(8b|4h|2s|16b|8h|4s|2d)_POST$")>; // ASIMD store, 4 element, one lane, B/H // ASIMD store, 4 element, one lane, S // ASIMD store, 4 element, one lane, D -def : InstRW<[VulcanWrite_1Cyc_LS01_F01], (instregex "^ST4i(8|16|32|64)$")>; -def : InstRW<[VulcanWrite_1Cyc_LS01_F01, WriteAdr], +def : InstRW<[THX2T99Write_1Cyc_LS01_F01], (instregex "^ST4i(8|16|32|64)$")>; +def : InstRW<[THX2T99Write_1Cyc_LS01_F01, WriteAdr], (instregex "^ST4i(8|16|32|64)_POST$")>; //-- @@ -830,23 +830,23 @@ def : InstRW<[VulcanWrite_1Cyc_LS01_F01, WriteAdr], //-- // Crypto AES ops -def : InstRW<[VulcanWrite_5Cyc_F1], (instregex "^AES")>; +def : InstRW<[THX2T99Write_5Cyc_F1], (instregex "^AES")>; // Crypto polynomial (64x64) multiply long -def : InstRW<[VulcanWrite_5Cyc_F1], (instrs PMULLv1i64, PMULLv2i64)>; +def : InstRW<[THX2T99Write_5Cyc_F1], (instrs PMULLv1i64, PMULLv2i64)>; // Crypto SHA1 xor ops // Crypto SHA1 schedule acceleration ops // Crypto SHA256 schedule acceleration op (1 u-op) // Crypto SHA256 schedule acceleration op (2 u-ops) // Crypto SHA256 hash acceleration ops -def : InstRW<[VulcanWrite_7Cyc_F1], (instregex "^SHA")>; +def : InstRW<[THX2T99Write_7Cyc_F1], (instregex "^SHA")>; //-- // 3.18 CRC //-- // CRC checksum ops -def : InstRW<[VulcanWrite_4Cyc_I1], (instregex "^CRC32")>; +def : InstRW<[THX2T99Write_4Cyc_I1], (instregex "^CRC32")>; -} // SchedModel = VulcanModel +} // SchedModel = ThunderX2T99Model 
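Beyond the mechanical Vulcan → THX2T99 rename, this file carries the per-CPU tuning data: each WriteRes binds a scheduling class to processor resources, a latency, and a micro-op count (WriteST: 1 cycle, 2 micro-ops on LS01+SD; WriteSTIdx adds an I012 micro-op; WriteFDiv: 16 cycles). As a rough illustration only — this is not the LLVM API, and the table is a hand-transcribed subset — the generated model behaves like a lookup table:

    #include <map>
    #include <string>

    // Hypothetical stand-in for the TableGen-generated machine model:
    // maps a scheduling class to the data declared in the .td file.
    // (ResourceCycles and per-resource pressure are ignored in this sketch.)
    struct SchedEntry {
      unsigned Latency;     // cycles until the result is ready
      unsigned NumMicroOps; // issue slots consumed
    };

    // A few entries transcribed from the ThunderX2T99 model above;
    // micro-op counts default to 1 where the model does not override them.
    const std::map<std::string, SchedEntry> THX2T99Model = {
        {"WriteST",    {1, 2}},  // store, reg or immed offset: LS01 + SD
        {"WriteSTIdx", {1, 3}},  // indexed store also consumes I012
        {"WriteFMul",  {6, 1}},  // FP multiply / multiply-accumulate
        {"WriteFDiv",  {16, 1}}, // FP divide/sqrt, S-form (D-form is 23)
    };

    unsigned latencyOf(const std::string &Class) {
      auto It = THX2T99Model.find(Class);
      return It == THX2T99Model.end() ? 1 : It->second.Latency;
    }

The scheduler consults these numbers when ordering instructions, so getting the latency and micro-op counts right is the whole point of a subtarget-specific .td file like this one.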
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/contrib/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp index 66a8f332513a..7f5507371fa0 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp @@ -42,10 +42,12 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemset( Entry.Node = Size; Args.push_back(Entry); TargetLowering::CallLoweringInfo CLI(DAG); - CLI.setDebugLoc(dl).setChain(Chain) - .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()), - DAG.getExternalSymbol(bzeroEntry, IntPtr), std::move(Args)) - .setDiscardResult(); + CLI.setDebugLoc(dl) + .setChain(Chain) + .setLibCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()), + DAG.getExternalSymbol(bzeroEntry, IntPtr), + std::move(Args)) + .setDiscardResult(); std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI); return CallResult.second; } @@ -53,7 +55,5 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemset( } bool AArch64SelectionDAGInfo::generateFMAsInMachineCombiner( CodeGenOpt::Level OptLevel) const { - if (OptLevel >= CodeGenOpt::Aggressive) - return true; - return false; + return OptLevel >= CodeGenOpt::Aggressive; } diff --git a/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.cpp index 03e01329e036..b3aba4781db8 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.cpp @@ -81,8 +81,22 @@ void AArch64Subtarget::initializeProperties() { MinPrefetchStride = 1024; MaxPrefetchIterationsAhead = 11; break; - case Vulcan: + case ThunderX2T99: + CacheLineSize = 64; + PrefFunctionAlignment = 3; + PrefLoopAlignment = 2; MaxInterleaveFactor = 4; + PrefetchDistance = 128; + MinPrefetchStride = 1024; + MaxPrefetchIterationsAhead = 4; + break; + case ThunderX: + case ThunderXT88: + case ThunderXT81: + case ThunderXT83: + CacheLineSize = 128; + PrefFunctionAlignment = 3; + PrefLoopAlignment = 2; break; case CortexA35: break; case CortexA53: break; @@ -133,9 +147,9 @@ AArch64Subtarget::ClassifyGlobalReference(const GlobalValue *GV, if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV)) return AArch64II::MO_GOT; - // The small code mode's direct accesses use ADRP, which cannot necessarily - // produce the value 0 (if the code is above 4GB). - if (TM.getCodeModel() == CodeModel::Small && GV->hasExternalWeakLinkage()) + // The small code model's direct accesses use ADRP, which cannot + // necessarily produce the value 0 (if the code is above 4GB). + if (useSmallAddressing() && GV->hasExternalWeakLinkage()) return AArch64II::MO_GOT; return AArch64II::MO_NO_FLAG; diff --git a/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.h b/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.h index a99340225082..40ad9185012c 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.h +++ b/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.h @@ -45,7 +45,11 @@ public: ExynosM1, Falkor, Kryo, - Vulcan + ThunderX2T99, + ThunderX, + ThunderXT81, + ThunderXT83, + ThunderXT88 }; protected: @@ -61,9 +65,11 @@ protected: bool HasCRC = false; bool HasLSE = false; bool HasRAS = false; + bool HasRDM = false; bool HasPerfMon = false; bool HasFullFP16 = false; bool HasSPE = false; + bool HasLSLFast = false; // HasZeroCycleRegMove - Has zero-cycle register mov instructions. bool HasZeroCycleRegMove = false; @@ -73,6 +79,10 @@ protected: // StrictAlign - Disallow unaligned memory accesses. 
bool StrictAlign = false; + + // NegativeImmediates - transform instructions with negative immediates + bool NegativeImmediates = true; + bool UseAA = false; bool PredictableSelectIsExpensive = false; bool BalanceFPOps = false; @@ -83,6 +93,8 @@ protected: bool UseAlternateSExtLoadCVTF32Pattern = false; bool HasArithmeticBccFusion = false; bool HasArithmeticCbzFusion = false; + bool HasFuseAES = false; + bool HasFuseLiterals = false; bool DisableLatencySchedHeuristic = false; bool UseRSqrt = false; uint8_t MaxInterleaveFactor = 2; @@ -183,6 +195,7 @@ public: bool hasCRC() const { return HasCRC; } bool hasLSE() const { return HasLSE; } bool hasRAS() const { return HasRAS; } + bool hasRDM() const { return HasRDM; } bool balanceFPOps() const { return BalanceFPOps; } bool predictableSelectIsExpensive() const { return PredictableSelectIsExpensive; @@ -195,6 +208,8 @@ public: } bool hasArithmeticBccFusion() const { return HasArithmeticBccFusion; } bool hasArithmeticCbzFusion() const { return HasArithmeticCbzFusion; } + bool hasFuseAES() const { return HasFuseAES; } + bool hasFuseLiterals() const { return HasFuseLiterals; } bool useRSqrt() const { return UseRSqrt; } unsigned getMaxInterleaveFactor() const { return MaxInterleaveFactor; } unsigned getVectorInsertExtractBaseCost() const { @@ -218,6 +233,7 @@ public: bool hasPerfMon() const { return HasPerfMon; } bool hasFullFP16() const { return HasFullFP16; } bool hasSPE() const { return HasSPE; } + bool hasLSLFast() const { return HasLSLFast; } bool isLittleEndian() const { return IsLittle; } @@ -226,6 +242,7 @@ public: bool isTargetLinux() const { return TargetTriple.isOSLinux(); } bool isTargetWindows() const { return TargetTriple.isOSWindows(); } bool isTargetAndroid() const { return TargetTriple.isAndroid(); } + bool isTargetFuchsia() const { return TargetTriple.isOSFuchsia(); } bool isTargetCOFF() const { return TargetTriple.isOSBinFormatCOFF(); } bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); } @@ -233,9 +250,17 @@ public: bool useAA() const override { return UseAA; } - /// getMaxInlineSizeThreshold - Returns the maximum memset / memcpy size - /// that still makes it profitable to inline the call. - unsigned getMaxInlineSizeThreshold() const { return 64; } + bool useSmallAddressing() const { + switch (TLInfo.getTargetMachine().getCodeModel()) { + case CodeModel::Kernel: + // Kernel is currently allowed only for Fuchsia targets, + // where it is the same as Small for almost all purposes. + case CodeModel::Small: + return true; + default: + return false; + } + } /// ParseSubtargetFeatures - Parses features string setting specified /// subtarget options. Definition of function is auto generated by tblgen. diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SystemOperands.td b/contrib/llvm/lib/Target/AArch64/AArch64SystemOperands.td index a3736c0868fb..7c5dcb0853eb 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64SystemOperands.td +++ b/contrib/llvm/lib/Target/AArch64/AArch64SystemOperands.td @@ -18,35 +18,37 @@ include "llvm/TableGen/SearchableTable.td" // AT (address translate) instruction options. 
//===----------------------------------------------------------------------===// -class AT<string name, bits<2> op0, bits<3> op1, bits<4> crn, bits<4> crm, +class AT<string name, bits<3> op1, bits<4> crn, bits<4> crm, bits<3> op2> : SearchableTable { let SearchableFields = ["Name", "Encoding"]; let EnumValueField = "Encoding"; string Name = name; - bits<16> Encoding; - let Encoding{15-14} = op0; + bits<14> Encoding; let Encoding{13-11} = op1; let Encoding{10-7} = crn; let Encoding{6-3} = crm; let Encoding{2-0} = op2; + code Requires = [{ {} }]; } -def : AT<"S1E1R", 0b01, 0b000, 0b0111, 0b1000, 0b000>; -def : AT<"S1E2R", 0b01, 0b100, 0b0111, 0b1000, 0b000>; -def : AT<"S1E3R", 0b01, 0b110, 0b0111, 0b1000, 0b000>; -def : AT<"S1E1W", 0b01, 0b000, 0b0111, 0b1000, 0b001>; -def : AT<"S1E2W", 0b01, 0b100, 0b0111, 0b1000, 0b001>; -def : AT<"S1E3W", 0b01, 0b110, 0b0111, 0b1000, 0b001>; -def : AT<"S1E0R", 0b01, 0b000, 0b0111, 0b1000, 0b010>; -def : AT<"S1E0W", 0b01, 0b000, 0b0111, 0b1000, 0b011>; -def : AT<"S12E1R", 0b01, 0b100, 0b0111, 0b1000, 0b100>; -def : AT<"S12E1W", 0b01, 0b100, 0b0111, 0b1000, 0b101>; -def : AT<"S12E0R", 0b01, 0b100, 0b0111, 0b1000, 0b110>; -def : AT<"S12E0W", 0b01, 0b100, 0b0111, 0b1000, 0b111>; -def : AT<"S1E1RP", 0b01, 0b000, 0b0111, 0b1001, 0b000>; -def : AT<"S1E1WP", 0b01, 0b000, 0b0111, 0b1001, 0b001>; - +def : AT<"S1E1R", 0b000, 0b0111, 0b1000, 0b000>; +def : AT<"S1E2R", 0b100, 0b0111, 0b1000, 0b000>; +def : AT<"S1E3R", 0b110, 0b0111, 0b1000, 0b000>; +def : AT<"S1E1W", 0b000, 0b0111, 0b1000, 0b001>; +def : AT<"S1E2W", 0b100, 0b0111, 0b1000, 0b001>; +def : AT<"S1E3W", 0b110, 0b0111, 0b1000, 0b001>; +def : AT<"S1E0R", 0b000, 0b0111, 0b1000, 0b010>; +def : AT<"S1E0W", 0b000, 0b0111, 0b1000, 0b011>; +def : AT<"S12E1R", 0b100, 0b0111, 0b1000, 0b100>; +def : AT<"S12E1W", 0b100, 0b0111, 0b1000, 0b101>; +def : AT<"S12E0R", 0b100, 0b0111, 0b1000, 0b110>; +def : AT<"S12E0W", 0b100, 0b0111, 0b1000, 0b111>; + +let Requires = [{ {AArch64::HasV8_2aOps} }] in { +def : AT<"S1E1RP", 0b000, 0b0111, 0b1001, 0b000>; +def : AT<"S1E1WP", 0b000, 0b0111, 0b1001, 0b001>; +} //===----------------------------------------------------------------------===// // DMB/DSB (data barrier) instruction options. @@ -77,28 +79,31 @@ def : DB<"sy", 0xf>; // DC (data cache maintenance) instruction options. 
//===----------------------------------------------------------------------===// -class DC<string name, bits<2> op0, bits<3> op1, bits<4> crn, bits<4> crm, +class DC<string name, bits<3> op1, bits<4> crn, bits<4> crm, bits<3> op2> : SearchableTable { let SearchableFields = ["Name", "Encoding"]; let EnumValueField = "Encoding"; string Name = name; - bits<16> Encoding; - let Encoding{15-14} = op0; + bits<14> Encoding; let Encoding{13-11} = op1; let Encoding{10-7} = crn; let Encoding{6-3} = crm; let Encoding{2-0} = op2; + code Requires = [{ {} }]; } -def : DC<"ZVA", 0b01, 0b011, 0b0111, 0b0100, 0b001>; -def : DC<"IVAC", 0b01, 0b000, 0b0111, 0b0110, 0b001>; -def : DC<"ISW", 0b01, 0b000, 0b0111, 0b0110, 0b010>; -def : DC<"CVAC", 0b01, 0b011, 0b0111, 0b1010, 0b001>; -def : DC<"CSW", 0b01, 0b000, 0b0111, 0b1010, 0b010>; -def : DC<"CVAU", 0b01, 0b011, 0b0111, 0b1011, 0b001>; -def : DC<"CIVAC", 0b01, 0b011, 0b0111, 0b1110, 0b001>; -def : DC<"CISW", 0b01, 0b000, 0b0111, 0b1110, 0b010>; +def : DC<"ZVA", 0b011, 0b0111, 0b0100, 0b001>; +def : DC<"IVAC", 0b000, 0b0111, 0b0110, 0b001>; +def : DC<"ISW", 0b000, 0b0111, 0b0110, 0b010>; +def : DC<"CVAC", 0b011, 0b0111, 0b1010, 0b001>; +def : DC<"CSW", 0b000, 0b0111, 0b1010, 0b010>; +def : DC<"CVAU", 0b011, 0b0111, 0b1011, 0b001>; +def : DC<"CIVAC", 0b011, 0b0111, 0b1110, 0b001>; +def : DC<"CISW", 0b000, 0b0111, 0b1110, 0b010>; + +let Requires = [{ {AArch64::HasV8_2aOps} }] in +def : DC<"CVAP", 0b011, 0b0111, 0b1100, 0b001>; //===----------------------------------------------------------------------===// // IC (instruction cache maintenance) instruction options. @@ -120,7 +125,7 @@ class IC<string name, bits<3> op1, bits<4> crn, bits<4> crm, bits<3> op2, def : IC<"IALLUIS", 0b000, 0b0111, 0b0001, 0b000, 0>; def : IC<"IALLU", 0b000, 0b0111, 0b0101, 0b000, 0>; -def : IC<"IVAU", 0b000, 0b0111, 0b0001, 0b000, 1>; +def : IC<"IVAU", 0b011, 0b0111, 0b0101, 0b001, 1>; //===----------------------------------------------------------------------===// // ISB (instruction-fetch barrier) instruction options. @@ -213,14 +218,13 @@ def : PSB<"csync", 0x11>; // TLBI (translation lookaside buffer invalidate) instruction options. 
//===----------------------------------------------------------------------===// -class TLBI<string name, bits<2> op0, bits<3> op1, bits<4> crn, bits<4> crm, +class TLBI<string name, bits<3> op1, bits<4> crn, bits<4> crm, bits<3> op2, bit needsreg = 1> : SearchableTable { let SearchableFields = ["Name", "Encoding"]; let EnumValueField = "Encoding"; string Name = name; - bits<16> Encoding; - let Encoding{15-14} = op0; + bits<14> Encoding; let Encoding{13-11} = op1; let Encoding{10-7} = crn; let Encoding{6-3} = crm; @@ -228,38 +232,38 @@ class TLBI<string name, bits<2> op0, bits<3> op1, bits<4> crn, bits<4> crm, bit NeedsReg = needsreg; } -def : TLBI<"IPAS2E1IS", 0b01, 0b100, 0b1000, 0b0000, 0b001>; -def : TLBI<"IPAS2LE1IS", 0b01, 0b100, 0b1000, 0b0000, 0b101>; -def : TLBI<"VMALLE1IS", 0b01, 0b000, 0b1000, 0b0011, 0b000, 0>; -def : TLBI<"ALLE2IS", 0b01, 0b100, 0b1000, 0b0011, 0b000, 0>; -def : TLBI<"ALLE3IS", 0b01, 0b110, 0b1000, 0b0011, 0b000, 0>; -def : TLBI<"VAE1IS", 0b01, 0b000, 0b1000, 0b0011, 0b001>; -def : TLBI<"VAE2IS", 0b01, 0b100, 0b1000, 0b0011, 0b001>; -def : TLBI<"VAE3IS", 0b01, 0b110, 0b1000, 0b0011, 0b001>; -def : TLBI<"ASIDE1IS", 0b01, 0b000, 0b1000, 0b0011, 0b010>; -def : TLBI<"VAAE1IS", 0b01, 0b000, 0b1000, 0b0011, 0b011>; -def : TLBI<"ALLE1IS", 0b01, 0b100, 0b1000, 0b0011, 0b100, 0>; -def : TLBI<"VALE1IS", 0b01, 0b000, 0b1000, 0b0011, 0b101>; -def : TLBI<"VALE2IS", 0b01, 0b100, 0b1000, 0b0011, 0b101>; -def : TLBI<"VALE3IS", 0b01, 0b110, 0b1000, 0b0011, 0b101>; -def : TLBI<"VMALLS12E1IS", 0b01, 0b100, 0b1000, 0b0011, 0b110, 0>; -def : TLBI<"VAALE1IS", 0b01, 0b000, 0b1000, 0b0011, 0b111>; -def : TLBI<"IPAS2E1", 0b01, 0b100, 0b1000, 0b0100, 0b001>; -def : TLBI<"IPAS2LE1", 0b01, 0b100, 0b1000, 0b0100, 0b101>; -def : TLBI<"VMALLE1", 0b01, 0b000, 0b1000, 0b0111, 0b000, 0>; -def : TLBI<"ALLE2", 0b01, 0b100, 0b1000, 0b0111, 0b000, 0>; -def : TLBI<"ALLE3", 0b01, 0b110, 0b1000, 0b0111, 0b000, 0>; -def : TLBI<"VAE1", 0b01, 0b000, 0b1000, 0b0111, 0b001>; -def : TLBI<"VAE2", 0b01, 0b100, 0b1000, 0b0111, 0b001>; -def : TLBI<"VAE3", 0b01, 0b110, 0b1000, 0b0111, 0b001>; -def : TLBI<"ASIDE1", 0b01, 0b000, 0b1000, 0b0111, 0b010>; -def : TLBI<"VAAE1", 0b01, 0b000, 0b1000, 0b0111, 0b011>; -def : TLBI<"ALLE1", 0b01, 0b100, 0b1000, 0b0111, 0b100, 0>; -def : TLBI<"VALE1", 0b01, 0b000, 0b1000, 0b0111, 0b101>; -def : TLBI<"VALE2", 0b01, 0b100, 0b1000, 0b0111, 0b101>; -def : TLBI<"VALE3", 0b01, 0b110, 0b1000, 0b0111, 0b101>; -def : TLBI<"VMALLS12E1", 0b01, 0b100, 0b1000, 0b0111, 0b110, 0>; -def : TLBI<"VAALE1", 0b01, 0b000, 0b1000, 0b0111, 0b111>; +def : TLBI<"IPAS2E1IS", 0b100, 0b1000, 0b0000, 0b001>; +def : TLBI<"IPAS2LE1IS", 0b100, 0b1000, 0b0000, 0b101>; +def : TLBI<"VMALLE1IS", 0b000, 0b1000, 0b0011, 0b000, 0>; +def : TLBI<"ALLE2IS", 0b100, 0b1000, 0b0011, 0b000, 0>; +def : TLBI<"ALLE3IS", 0b110, 0b1000, 0b0011, 0b000, 0>; +def : TLBI<"VAE1IS", 0b000, 0b1000, 0b0011, 0b001>; +def : TLBI<"VAE2IS", 0b100, 0b1000, 0b0011, 0b001>; +def : TLBI<"VAE3IS", 0b110, 0b1000, 0b0011, 0b001>; +def : TLBI<"ASIDE1IS", 0b000, 0b1000, 0b0011, 0b010>; +def : TLBI<"VAAE1IS", 0b000, 0b1000, 0b0011, 0b011>; +def : TLBI<"ALLE1IS", 0b100, 0b1000, 0b0011, 0b100, 0>; +def : TLBI<"VALE1IS", 0b000, 0b1000, 0b0011, 0b101>; +def : TLBI<"VALE2IS", 0b100, 0b1000, 0b0011, 0b101>; +def : TLBI<"VALE3IS", 0b110, 0b1000, 0b0011, 0b101>; +def : TLBI<"VMALLS12E1IS", 0b100, 0b1000, 0b0011, 0b110, 0>; +def : TLBI<"VAALE1IS", 0b000, 0b1000, 0b0011, 0b111>; +def : TLBI<"IPAS2E1", 0b100, 0b1000, 0b0100, 0b001>; +def : TLBI<"IPAS2LE1", 
0b100, 0b1000, 0b0100, 0b101>; +def : TLBI<"VMALLE1", 0b000, 0b1000, 0b0111, 0b000, 0>; +def : TLBI<"ALLE2", 0b100, 0b1000, 0b0111, 0b000, 0>; +def : TLBI<"ALLE3", 0b110, 0b1000, 0b0111, 0b000, 0>; +def : TLBI<"VAE1", 0b000, 0b1000, 0b0111, 0b001>; +def : TLBI<"VAE2", 0b100, 0b1000, 0b0111, 0b001>; +def : TLBI<"VAE3", 0b110, 0b1000, 0b0111, 0b001>; +def : TLBI<"ASIDE1", 0b000, 0b1000, 0b0111, 0b010>; +def : TLBI<"VAAE1", 0b000, 0b1000, 0b0111, 0b011>; +def : TLBI<"ALLE1", 0b100, 0b1000, 0b0111, 0b100, 0>; +def : TLBI<"VALE1", 0b000, 0b1000, 0b0111, 0b101>; +def : TLBI<"VALE2", 0b100, 0b1000, 0b0111, 0b101>; +def : TLBI<"VALE3", 0b110, 0b1000, 0b0111, 0b101>; +def : TLBI<"VMALLS12E1", 0b100, 0b1000, 0b0111, 0b110, 0>; +def : TLBI<"VAALE1", 0b000, 0b1000, 0b0111, 0b111>; //===----------------------------------------------------------------------===// diff --git a/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp index d2883941e2c4..dcc51bf02329 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -12,9 +12,11 @@ #include "AArch64.h" #include "AArch64CallLowering.h" -#include "AArch64InstructionSelector.h" #include "AArch64LegalizerInfo.h" +#include "AArch64MacroFusion.h" +#ifdef LLVM_BUILD_GLOBAL_ISEL #include "AArch64RegisterBankInfo.h" +#endif #include "AArch64Subtarget.h" #include "AArch64TargetMachine.h" #include "AArch64TargetObjectFile.h" @@ -115,7 +117,7 @@ EnableA53Fix835769("aarch64-fix-cortex-a53-835769", cl::Hidden, static cl::opt<bool> EnableAddressTypePromotion("aarch64-enable-type-promotion", cl::Hidden, cl::desc("Enable the type promotion pass"), - cl::init(true)); + cl::init(false)); static cl::opt<bool> EnableGEPOpt("aarch64-enable-gep-opt", cl::Hidden, @@ -136,6 +138,11 @@ static cl::opt<bool> cl::desc("Enable the loop data prefetch pass"), cl::init(true)); +static cl::opt<int> EnableGlobalISelAtO( + "aarch64-enable-global-isel-at-O", cl::Hidden, + cl::desc("Enable GlobalISel at or below an opt level (-1 to disable)"), + cl::init(-1)); + extern "C" void LLVMInitializeAArch64Target() { // Register the target. RegisterTargetMachine<AArch64leTargetMachine> X(getTheAArch64leTarget()); @@ -278,7 +285,8 @@ AArch64TargetMachine::getSubtargetImpl(const Function &F) const { // FIXME: At this point, we can't rely on Subtarget having RBI. // It's awkward to mix passing RBI and the Subtarget; should we pass // TII/TRI as well? - GISel->InstSelector.reset(new AArch64InstructionSelector(*this, *I, *RBI)); + GISel->InstSelector.reset( + createAArch64InstructionSelector(*this, *I, *RBI)); GISel->RegBankInfo.reset(RBI); #endif @@ -323,10 +331,24 @@ public: ScheduleDAGMILive *DAG = createGenericSchedLive(C); DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI)); DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI)); - DAG->addMutation(createMacroFusionDAGMutation(DAG->TII)); + DAG->addMutation(createAArch64MacroFusionDAGMutation()); return DAG; } + ScheduleDAGInstrs * + createPostMachineScheduler(MachineSchedContext *C) const override { + const AArch64Subtarget &ST = C->MF->getSubtarget<AArch64Subtarget>(); + if (ST.hasFuseLiterals()) { + // Run the Macro Fusion after RA again since literals are expanded from + // pseudos then (v. addPreSched2()). 
+ ScheduleDAGMI *DAG = createGenericSchedPostRA(C); + DAG->addMutation(createAArch64MacroFusionDAGMutation()); + return DAG; + } + + return nullptr; + } + void addIRPasses() override; bool addPreISel() override; bool addInstSelector() override; @@ -341,6 +363,8 @@ public: void addPostRegAlloc() override; void addPreSched2() override; void addPreEmitPass() override; + + bool isGlobalISelEnabled() const override; }; } // end anonymous namespace @@ -450,6 +474,10 @@ bool AArch64PassConfig::addGlobalInstructionSelect() { } #endif +bool AArch64PassConfig::isGlobalISelEnabled() const { + return TM->getOptLevel() <= EnableGlobalISelAtO; +} + bool AArch64PassConfig::addILPOpts() { if (EnableCondOpt) addPass(createAArch64ConditionOptimizerPass()); diff --git a/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.h b/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.h index 6fa5e83957e1..2c75a3258c1c 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.h +++ b/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.h @@ -21,6 +21,8 @@ namespace llvm { +class AArch64RegisterBankInfo; + class AArch64TargetMachine : public LLVMTargetMachine { protected: std::unique_ptr<TargetLoweringObjectFile> TLOF; diff --git a/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index b8833e5a5552..4d59da0c646d 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -176,7 +176,8 @@ AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) { return TTI::PSK_Software; } -int AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { +int AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, + const Instruction *I) { int ISD = TLI->InstructionOpcodeToISD(Opcode); assert(ISD && "Invalid opcode"); @@ -436,7 +437,7 @@ int AArch64TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE, } int AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, - Type *CondTy) { + Type *CondTy, const Instruction *I) { int ISD = TLI->InstructionOpcodeToISD(Opcode); // We don't lower some vector selects well that are wider than the register @@ -463,11 +464,12 @@ int AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, return Entry->Cost; } } - return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy); + return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I); } int AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty, - unsigned Alignment, unsigned AddressSpace) { + unsigned Alignment, unsigned AddressSpace, + const Instruction *I) { auto LT = TLI->getTypeLegalizationCost(DL, Ty); if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store && @@ -505,12 +507,14 @@ int AArch64TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, if (Factor <= TLI->getMaxSupportedInterleaveFactor()) { unsigned NumElts = VecTy->getVectorNumElements(); - Type *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor); - unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy); + auto *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor); // ldN/stN only support legal vector types of size 64 or 128 in bits. - if (NumElts % Factor == 0 && (SubVecSize == 64 || SubVecSize == 128)) - return Factor; + // Accesses having vector types that are a multiple of 128 bits can be + // matched to more than one ldN/stN instruction. 
+ if (NumElts % Factor == 0 && + TLI->isLegalInterleavedAccessType(SubVecTy, DL)) + return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL); } return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, @@ -594,8 +598,6 @@ bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst, case Intrinsic::aarch64_neon_ld4: Info.ReadMem = true; Info.WriteMem = false; - Info.IsSimple = true; - Info.NumMemRefs = 1; Info.PtrVal = Inst->getArgOperand(0); break; case Intrinsic::aarch64_neon_st2: @@ -603,8 +605,6 @@ bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst, case Intrinsic::aarch64_neon_st4: Info.ReadMem = false; Info.WriteMem = true; - Info.IsSimple = true; - Info.NumMemRefs = 1; Info.PtrVal = Inst->getArgOperand(Inst->getNumArgOperands() - 1); break; } @@ -628,6 +628,38 @@ bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst, return true; } +/// See if \p I should be considered for address type promotion. We check if \p +/// I is a sext with right type and used in memory accesses. If it used in a +/// "complex" getelementptr, we allow it to be promoted without finding other +/// sext instructions that sign extended the same initial value. A getelementptr +/// is considered as "complex" if it has more than 2 operands. +bool AArch64TTIImpl::shouldConsiderAddressTypePromotion( + const Instruction &I, bool &AllowPromotionWithoutCommonHeader) { + bool Considerable = false; + AllowPromotionWithoutCommonHeader = false; + if (!isa<SExtInst>(&I)) + return false; + Type *ConsideredSExtType = + Type::getInt64Ty(I.getParent()->getParent()->getContext()); + if (I.getType() != ConsideredSExtType) + return false; + // See if the sext is the one with the right type and used in at least one + // GetElementPtrInst. + for (const User *U : I.users()) { + if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) { + Considerable = true; + // A getelementptr is considered as "complex" if it has more than 2 + // operands. We will promote a SExt used in such complex GEP as we + // expect some computation to be merged if they are done on 64 bits. + if (GEPInst->getNumOperands() > 2) { + AllowPromotionWithoutCommonHeader = true; + break; + } + } + } + return Considerable; +} + unsigned AArch64TTIImpl::getCacheLineSize() { return ST->getCacheLineSize(); } diff --git a/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index 18287ed6653f..e37c003e064c 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -34,10 +34,6 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> { const AArch64Subtarget *ST; const AArch64TargetLowering *TLI; - /// Estimate the overhead of scalarizing an instruction. Insert and Extract - /// are set if the result needs to be inserted and/or extracted from vectors. 
- unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract); - const AArch64Subtarget *getST() const { return ST; } const AArch64TargetLowering *getTLI() const { return TLI; } @@ -90,7 +86,8 @@ public: unsigned getMaxInterleaveFactor(unsigned VF); - int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src); + int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, + const Instruction *I = nullptr); int getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index); @@ -107,10 +104,11 @@ public: int getAddressComputationCost(Type *Ty, ScalarEvolution *SE, const SCEV *Ptr); - int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy); + int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, + const Instruction *I = nullptr); int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, - unsigned AddressSpace); + unsigned AddressSpace, const Instruction *I = nullptr); int getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys); @@ -125,6 +123,10 @@ public: ArrayRef<unsigned> Indices, unsigned Alignment, unsigned AddressSpace); + bool + shouldConsiderAddressTypePromotion(const Instruction &I, + bool &AllowPromotionWithoutCommonHeader); + unsigned getCacheLineSize(); unsigned getPrefetchDistance(); diff --git a/contrib/llvm/lib/Target/AArch64/AArch64VectorByElementOpt.cpp b/contrib/llvm/lib/Target/AArch64/AArch64VectorByElementOpt.cpp index e3b1d7cea48d..f53af2315ec9 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64VectorByElementOpt.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64VectorByElementOpt.cpp @@ -19,13 +19,27 @@ // is rewritten into // dup v3.4s, v2.s[1] // fmla v0.4s, v1.4s, v3.4s +// //===----------------------------------------------------------------------===// #include "AArch64InstrInfo.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/TargetSchedule.h" +#include "llvm/MC/MCInstrDesc.h" +#include "llvm/MC/MCSchedule.h" +#include "llvm/Pass.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" +#include <map> using namespace llvm; @@ -41,14 +55,15 @@ namespace { struct AArch64VectorByElementOpt : public MachineFunctionPass { static char ID; - AArch64VectorByElementOpt() : MachineFunctionPass(ID) { - initializeAArch64VectorByElementOptPass(*PassRegistry::getPassRegistry()); - } const TargetInstrInfo *TII; MachineRegisterInfo *MRI; TargetSchedModel SchedModel; + AArch64VectorByElementOpt() : MachineFunctionPass(ID) { + initializeAArch64VectorByElementOptPass(*PassRegistry::getPassRegistry()); + } + /// Based only on latency of instructions, determine if it is cost efficient /// to replace the instruction InstDesc by the two instructions InstDescRep1 /// and InstDescRep2. 
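Returning to the shouldConsiderAddressTypePromotion hook added in AArch64TargetTransformInfo.cpp above: it accepts an i64 sext that is used by at least one GEP, and sets AllowPromotionWithoutCommonHeader when that GEP is "complex" (more than 2 operands). A C++ source pattern that typically produces exactly this IR shape — shown as an illustration, since the precise lowering depends on the front end and optimization level — is a 32-bit index into a multi-dimensional array:

    // 'i' and 'j' are sign-extended to 64 bits before indexing on AArch64;
    // rows[i][j] lowers to a getelementptr with more than two operands, so
    // the hook above allows the sext to be promoted without a common header.
    long sumPair(long rows[][64], int i, int j) {
      return rows[i][j] + rows[i][j + 1];
    }

The rationale, per the comment in the hunk, is that once the extension is done on 64 bits, the address computations can be merged.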
@@ -90,8 +105,10 @@ struct AArch64VectorByElementOpt : public MachineFunctionPass { return AARCH64_VECTOR_BY_ELEMENT_OPT_NAME; } }; + char AArch64VectorByElementOpt::ID = 0; -} // namespace + +} // end anonymous namespace INITIALIZE_PASS(AArch64VectorByElementOpt, "aarch64-vectorbyelement-opt", AARCH64_VECTOR_BY_ELEMENT_OPT_NAME, false, false) diff --git a/contrib/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/contrib/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index b86a283b40d4..cbab68979c56 100644 --- a/contrib/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/contrib/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -74,6 +74,7 @@ private: SMLoc getLoc() const { return getParser().getTok().getLoc(); } bool parseSysAlias(StringRef Name, SMLoc NameLoc, OperandVector &Operands); + void createSysAlias(uint16_t Encoding, OperandVector &Operands, SMLoc S); AArch64CC::CondCode parseCondCodeString(StringRef Cond); bool parseCondCode(OperandVector &Operands, bool invertCondCode); unsigned matchRegisterNameAlias(StringRef Name, bool isVector); @@ -537,154 +538,15 @@ public: return (Val % Scale) == 0 && Val >= 0 && (Val / Scale) < 0x1000; } - bool isImm0_1() const { + template <int N, int M> + bool isImmInRange() const { if (!isImm()) return false; const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); if (!MCE) return false; int64_t Val = MCE->getValue(); - return (Val >= 0 && Val < 2); - } - - bool isImm0_7() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); - if (!MCE) - return false; - int64_t Val = MCE->getValue(); - return (Val >= 0 && Val < 8); - } - - bool isImm1_8() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); - if (!MCE) - return false; - int64_t Val = MCE->getValue(); - return (Val > 0 && Val < 9); - } - - bool isImm0_15() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); - if (!MCE) - return false; - int64_t Val = MCE->getValue(); - return (Val >= 0 && Val < 16); - } - - bool isImm1_16() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); - if (!MCE) - return false; - int64_t Val = MCE->getValue(); - return (Val > 0 && Val < 17); - } - - bool isImm0_31() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); - if (!MCE) - return false; - int64_t Val = MCE->getValue(); - return (Val >= 0 && Val < 32); - } - - bool isImm1_31() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); - if (!MCE) - return false; - int64_t Val = MCE->getValue(); - return (Val >= 1 && Val < 32); - } - - bool isImm1_32() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); - if (!MCE) - return false; - int64_t Val = MCE->getValue(); - return (Val >= 1 && Val < 33); - } - - bool isImm0_63() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); - if (!MCE) - return false; - int64_t Val = MCE->getValue(); - return (Val >= 0 && Val < 64); - } - - bool isImm1_63() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); - if (!MCE) - return false; - int64_t Val = MCE->getValue(); - return (Val >= 1 && Val < 64); - } - - bool isImm1_64() const { - if (!isImm()) - return false; - const 
MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); - if (!MCE) - return false; - int64_t Val = MCE->getValue(); - return (Val >= 1 && Val < 65); - } - - bool isImm0_127() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); - if (!MCE) - return false; - int64_t Val = MCE->getValue(); - return (Val >= 0 && Val < 128); - } - - bool isImm0_255() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); - if (!MCE) - return false; - int64_t Val = MCE->getValue(); - return (Val >= 0 && Val < 256); - } - - bool isImm0_65535() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); - if (!MCE) - return false; - int64_t Val = MCE->getValue(); - return (Val >= 0 && Val < 65536); - } - - bool isImm32_63() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); - if (!MCE) - return false; - int64_t Val = MCE->getValue(); - return (Val >= 32 && Val < 64); + return (Val >= N && Val <= M); } bool isLogicalImm32() const { @@ -804,31 +666,8 @@ public: return AArch64_AM::isAdvSIMDModImmType10(MCE->getValue()); } - bool isBranchTarget26() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); - if (!MCE) - return true; - int64_t Val = MCE->getValue(); - if (Val & 0x3) - return false; - return (Val >= -(0x2000000 << 2) && Val <= (0x1ffffff << 2)); - } - - bool isPCRelLabel19() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); - if (!MCE) - return true; - int64_t Val = MCE->getValue(); - if (Val & 0x3) - return false; - return (Val >= -(0x40000 << 2) && Val <= (0x3ffff << 2)); - } - - bool isBranchTarget14() const { + template<int N> + bool isBranchTarget() const { if (!isImm()) return false; const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); @@ -837,7 +676,8 @@ public: int64_t Val = MCE->getValue(); if (Val & 0x3) return false; - return (Val >= -(0x2000 << 2) && Val <= (0x1fff << 2)); + assert(N > 0 && "Branch target immediate cannot be 0 bits!"); + return (Val >= -((1<<(N-1)) << 2) && Val <= (((1<<(N-1))-1) << 2)); } bool @@ -2494,6 +2334,35 @@ AArch64AsmParser::tryParseOptionalShiftExtend(OperandVector &Operands) { return MatchOperand_Success; } +static void setRequiredFeatureString(FeatureBitset FBS, std::string &Str) { + if (FBS[AArch64::HasV8_1aOps]) + Str += "ARMv8.1a"; + else if (FBS[AArch64::HasV8_2aOps]) + Str += "ARMv8.2a"; + else + Str += "(unknown)"; +} + +void AArch64AsmParser::createSysAlias(uint16_t Encoding, OperandVector &Operands, + SMLoc S) { + const uint16_t Op2 = Encoding & 7; + const uint16_t Cm = (Encoding & 0x78) >> 3; + const uint16_t Cn = (Encoding & 0x780) >> 7; + const uint16_t Op1 = (Encoding & 0x3800) >> 11; + + const MCExpr *Expr = MCConstantExpr::create(Op1, getContext()); + + Operands.push_back( + AArch64Operand::CreateImm(Expr, S, getLoc(), getContext())); + Operands.push_back( + AArch64Operand::CreateSysCR(Cn, S, getLoc(), getContext())); + Operands.push_back( + AArch64Operand::CreateSysCR(Cm, S, getLoc(), getContext())); + Expr = MCConstantExpr::create(Op2, getContext()); + Operands.push_back( + AArch64Operand::CreateImm(Expr, S, getLoc(), getContext())); +} + /// parseSysAlias - The IC, DC, AT, and TLBI instructions are simple aliases for /// the SYS instruction. Parse them specially so that we create a SYS MCInst. 
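Two refactors in this hunk are worth unpacking. First, isBranchTarget<N> generalizes the removed fixed-width checks: for N = 26 the accepted range is -(1 << 25) << 2 to ((1 << 25) - 1) << 2, exactly the old isBranchTarget26 bounds. Second, createSysAlias unpacks the 14-bit encoding defined in AArch64SystemOperands.td (op1 in bits 13-11, CRn in 10-7, CRm in 6-3, op2 in 2-0). A standalone round-trip sketch of that packing, using the same masks and shifts as the code above (packSysAlias is a hypothetical helper, not part of the patch):

    #include <cassert>
    #include <cstdint>

    // Pack op1/CRn/CRm/op2 the way the SearchableTable definitions do.
    constexpr uint16_t packSysAlias(uint16_t Op1, uint16_t Cn, uint16_t Cm,
                                    uint16_t Op2) {
      return (Op1 << 11) | (Cn << 7) | (Cm << 3) | Op2;
    }

    int main() {
      // DC CVAU is <op1=0b011, CRn=0b0111, CRm=0b1011, op2=0b001> above.
      const uint16_t Enc = packSysAlias(0b011, 0b0111, 0b1011, 0b001);

      // The same field extraction as createSysAlias.
      assert((Enc & 7) == 0b001);              // op2
      assert(((Enc & 0x78) >> 3) == 0b1011);   // CRm
      assert(((Enc & 0x780) >> 7) == 0b0111);  // CRn
      assert(((Enc & 0x3800) >> 11) == 0b011); // op1
      return 0;
    }

Keeping encode and decode symmetric is what lets the asm parser (here) and the instruction printer (printSysAlias, further down) share one generated table instead of the removed hand-written alias chains.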
bool AArch64AsmParser::parseSysAlias(StringRef Name, SMLoc NameLoc, @@ -2510,228 +2379,48 @@ bool AArch64AsmParser::parseSysAlias(StringRef Name, SMLoc NameLoc, StringRef Op = Tok.getString(); SMLoc S = Tok.getLoc(); - const MCExpr *Expr = nullptr; - -#define SYS_ALIAS(op1, Cn, Cm, op2) \ - do { \ - Expr = MCConstantExpr::create(op1, getContext()); \ - Operands.push_back( \ - AArch64Operand::CreateImm(Expr, S, getLoc(), getContext())); \ - Operands.push_back( \ - AArch64Operand::CreateSysCR(Cn, S, getLoc(), getContext())); \ - Operands.push_back( \ - AArch64Operand::CreateSysCR(Cm, S, getLoc(), getContext())); \ - Expr = MCConstantExpr::create(op2, getContext()); \ - Operands.push_back( \ - AArch64Operand::CreateImm(Expr, S, getLoc(), getContext())); \ - } while (false) - if (Mnemonic == "ic") { - if (!Op.compare_lower("ialluis")) { - // SYS #0, C7, C1, #0 - SYS_ALIAS(0, 7, 1, 0); - } else if (!Op.compare_lower("iallu")) { - // SYS #0, C7, C5, #0 - SYS_ALIAS(0, 7, 5, 0); - } else if (!Op.compare_lower("ivau")) { - // SYS #3, C7, C5, #1 - SYS_ALIAS(3, 7, 5, 1); - } else { + const AArch64IC::IC *IC = AArch64IC::lookupICByName(Op); + if (!IC) return TokError("invalid operand for IC instruction"); + else if (!IC->haveFeatures(getSTI().getFeatureBits())) { + std::string Str("IC " + std::string(IC->Name) + " requires "); + setRequiredFeatureString(IC->getRequiredFeatures(), Str); + return TokError(Str.c_str()); } + createSysAlias(IC->Encoding, Operands, S); } else if (Mnemonic == "dc") { - if (!Op.compare_lower("zva")) { - // SYS #3, C7, C4, #1 - SYS_ALIAS(3, 7, 4, 1); - } else if (!Op.compare_lower("ivac")) { - // SYS #3, C7, C6, #1 - SYS_ALIAS(0, 7, 6, 1); - } else if (!Op.compare_lower("isw")) { - // SYS #0, C7, C6, #2 - SYS_ALIAS(0, 7, 6, 2); - } else if (!Op.compare_lower("cvac")) { - // SYS #3, C7, C10, #1 - SYS_ALIAS(3, 7, 10, 1); - } else if (!Op.compare_lower("csw")) { - // SYS #0, C7, C10, #2 - SYS_ALIAS(0, 7, 10, 2); - } else if (!Op.compare_lower("cvau")) { - // SYS #3, C7, C11, #1 - SYS_ALIAS(3, 7, 11, 1); - } else if (!Op.compare_lower("civac")) { - // SYS #3, C7, C14, #1 - SYS_ALIAS(3, 7, 14, 1); - } else if (!Op.compare_lower("cisw")) { - // SYS #0, C7, C14, #2 - SYS_ALIAS(0, 7, 14, 2); - } else if (!Op.compare_lower("cvap")) { - if (getSTI().getFeatureBits()[AArch64::HasV8_2aOps]) { - // SYS #3, C7, C12, #1 - SYS_ALIAS(3, 7, 12, 1); - } else { - return TokError("DC CVAP requires ARMv8.2a"); - } - } else { + const AArch64DC::DC *DC = AArch64DC::lookupDCByName(Op); + if (!DC) return TokError("invalid operand for DC instruction"); + else if (!DC->haveFeatures(getSTI().getFeatureBits())) { + std::string Str("DC " + std::string(DC->Name) + " requires "); + setRequiredFeatureString(DC->getRequiredFeatures(), Str); + return TokError(Str.c_str()); } + createSysAlias(DC->Encoding, Operands, S); } else if (Mnemonic == "at") { - if (!Op.compare_lower("s1e1r")) { - // SYS #0, C7, C8, #0 - SYS_ALIAS(0, 7, 8, 0); - } else if (!Op.compare_lower("s1e2r")) { - // SYS #4, C7, C8, #0 - SYS_ALIAS(4, 7, 8, 0); - } else if (!Op.compare_lower("s1e3r")) { - // SYS #6, C7, C8, #0 - SYS_ALIAS(6, 7, 8, 0); - } else if (!Op.compare_lower("s1e1w")) { - // SYS #0, C7, C8, #1 - SYS_ALIAS(0, 7, 8, 1); - } else if (!Op.compare_lower("s1e2w")) { - // SYS #4, C7, C8, #1 - SYS_ALIAS(4, 7, 8, 1); - } else if (!Op.compare_lower("s1e3w")) { - // SYS #6, C7, C8, #1 - SYS_ALIAS(6, 7, 8, 1); - } else if (!Op.compare_lower("s1e0r")) { - // SYS #0, C7, C8, #3 - SYS_ALIAS(0, 7, 8, 2); - } else if 
(!Op.compare_lower("s1e0w")) { - // SYS #0, C7, C8, #3 - SYS_ALIAS(0, 7, 8, 3); - } else if (!Op.compare_lower("s12e1r")) { - // SYS #4, C7, C8, #4 - SYS_ALIAS(4, 7, 8, 4); - } else if (!Op.compare_lower("s12e1w")) { - // SYS #4, C7, C8, #5 - SYS_ALIAS(4, 7, 8, 5); - } else if (!Op.compare_lower("s12e0r")) { - // SYS #4, C7, C8, #6 - SYS_ALIAS(4, 7, 8, 6); - } else if (!Op.compare_lower("s12e0w")) { - // SYS #4, C7, C8, #7 - SYS_ALIAS(4, 7, 8, 7); - } else if (!Op.compare_lower("s1e1rp")) { - if (getSTI().getFeatureBits()[AArch64::HasV8_2aOps]) { - // SYS #0, C7, C9, #0 - SYS_ALIAS(0, 7, 9, 0); - } else { - return TokError("AT S1E1RP requires ARMv8.2a"); - } - } else if (!Op.compare_lower("s1e1wp")) { - if (getSTI().getFeatureBits()[AArch64::HasV8_2aOps]) { - // SYS #0, C7, C9, #1 - SYS_ALIAS(0, 7, 9, 1); - } else { - return TokError("AT S1E1WP requires ARMv8.2a"); - } - } else { + const AArch64AT::AT *AT = AArch64AT::lookupATByName(Op); + if (!AT) return TokError("invalid operand for AT instruction"); + else if (!AT->haveFeatures(getSTI().getFeatureBits())) { + std::string Str("AT " + std::string(AT->Name) + " requires "); + setRequiredFeatureString(AT->getRequiredFeatures(), Str); + return TokError(Str.c_str()); } + createSysAlias(AT->Encoding, Operands, S); } else if (Mnemonic == "tlbi") { - if (!Op.compare_lower("vmalle1is")) { - // SYS #0, C8, C3, #0 - SYS_ALIAS(0, 8, 3, 0); - } else if (!Op.compare_lower("alle2is")) { - // SYS #4, C8, C3, #0 - SYS_ALIAS(4, 8, 3, 0); - } else if (!Op.compare_lower("alle3is")) { - // SYS #6, C8, C3, #0 - SYS_ALIAS(6, 8, 3, 0); - } else if (!Op.compare_lower("vae1is")) { - // SYS #0, C8, C3, #1 - SYS_ALIAS(0, 8, 3, 1); - } else if (!Op.compare_lower("vae2is")) { - // SYS #4, C8, C3, #1 - SYS_ALIAS(4, 8, 3, 1); - } else if (!Op.compare_lower("vae3is")) { - // SYS #6, C8, C3, #1 - SYS_ALIAS(6, 8, 3, 1); - } else if (!Op.compare_lower("aside1is")) { - // SYS #0, C8, C3, #2 - SYS_ALIAS(0, 8, 3, 2); - } else if (!Op.compare_lower("vaae1is")) { - // SYS #0, C8, C3, #3 - SYS_ALIAS(0, 8, 3, 3); - } else if (!Op.compare_lower("alle1is")) { - // SYS #4, C8, C3, #4 - SYS_ALIAS(4, 8, 3, 4); - } else if (!Op.compare_lower("vale1is")) { - // SYS #0, C8, C3, #5 - SYS_ALIAS(0, 8, 3, 5); - } else if (!Op.compare_lower("vaale1is")) { - // SYS #0, C8, C3, #7 - SYS_ALIAS(0, 8, 3, 7); - } else if (!Op.compare_lower("vmalle1")) { - // SYS #0, C8, C7, #0 - SYS_ALIAS(0, 8, 7, 0); - } else if (!Op.compare_lower("alle2")) { - // SYS #4, C8, C7, #0 - SYS_ALIAS(4, 8, 7, 0); - } else if (!Op.compare_lower("vale2is")) { - // SYS #4, C8, C3, #5 - SYS_ALIAS(4, 8, 3, 5); - } else if (!Op.compare_lower("vale3is")) { - // SYS #6, C8, C3, #5 - SYS_ALIAS(6, 8, 3, 5); - } else if (!Op.compare_lower("alle3")) { - // SYS #6, C8, C7, #0 - SYS_ALIAS(6, 8, 7, 0); - } else if (!Op.compare_lower("vae1")) { - // SYS #0, C8, C7, #1 - SYS_ALIAS(0, 8, 7, 1); - } else if (!Op.compare_lower("vae2")) { - // SYS #4, C8, C7, #1 - SYS_ALIAS(4, 8, 7, 1); - } else if (!Op.compare_lower("vae3")) { - // SYS #6, C8, C7, #1 - SYS_ALIAS(6, 8, 7, 1); - } else if (!Op.compare_lower("aside1")) { - // SYS #0, C8, C7, #2 - SYS_ALIAS(0, 8, 7, 2); - } else if (!Op.compare_lower("vaae1")) { - // SYS #0, C8, C7, #3 - SYS_ALIAS(0, 8, 7, 3); - } else if (!Op.compare_lower("alle1")) { - // SYS #4, C8, C7, #4 - SYS_ALIAS(4, 8, 7, 4); - } else if (!Op.compare_lower("vale1")) { - // SYS #0, C8, C7, #5 - SYS_ALIAS(0, 8, 7, 5); - } else if (!Op.compare_lower("vale2")) { - // SYS #4, C8, C7, #5 - SYS_ALIAS(4, 8, 7, 5); - } else if 
-      // SYS #6, C8, C7, #5
-      SYS_ALIAS(6, 8, 7, 5);
-    } else if (!Op.compare_lower("vaale1")) {
-      // SYS #0, C8, C7, #7
-      SYS_ALIAS(0, 8, 7, 7);
-    } else if (!Op.compare_lower("ipas2e1")) {
-      // SYS #4, C8, C4, #1
-      SYS_ALIAS(4, 8, 4, 1);
-    } else if (!Op.compare_lower("ipas2le1")) {
-      // SYS #4, C8, C4, #5
-      SYS_ALIAS(4, 8, 4, 5);
-    } else if (!Op.compare_lower("ipas2e1is")) {
-      // SYS #4, C8, C4, #1
-      SYS_ALIAS(4, 8, 0, 1);
-    } else if (!Op.compare_lower("ipas2le1is")) {
-      // SYS #4, C8, C4, #5
-      SYS_ALIAS(4, 8, 0, 5);
-    } else if (!Op.compare_lower("vmalls12e1")) {
-      // SYS #4, C8, C7, #6
-      SYS_ALIAS(4, 8, 7, 6);
-    } else if (!Op.compare_lower("vmalls12e1is")) {
-      // SYS #4, C8, C3, #6
-      SYS_ALIAS(4, 8, 3, 6);
-    } else {
+    const AArch64TLBI::TLBI *TLBI = AArch64TLBI::lookupTLBIByName(Op);
+    if (!TLBI)
       return TokError("invalid operand for TLBI instruction");
+    else if (!TLBI->haveFeatures(getSTI().getFeatureBits())) {
+      std::string Str("TLBI " + std::string(TLBI->Name) + " requires ");
+      setRequiredFeatureString(TLBI->getRequiredFeatures(), Str);
+      return TokError(Str.c_str());
     }
+    createSysAlias(TLBI->Encoding, Operands, S);
   }
 
-#undef SYS_ALIAS
-
   Parser.Lex(); // Eat operand.
 
   bool ExpectRegister = (Op.lower().find("all") == StringRef::npos);
@@ -2744,12 +2433,10 @@ bool AArch64AsmParser::parseSysAlias(StringRef Name, SMLoc NameLoc,
     HasRegister = true;
   }
 
-  if (ExpectRegister && !HasRegister) {
+  if (ExpectRegister && !HasRegister)
     return TokError("specified " + Mnemonic + " op requires a register");
-  }
-  else if (!ExpectRegister && HasRegister) {
+  else if (!ExpectRegister && HasRegister)
     return TokError("specified " + Mnemonic + " op does not use a register");
-  }
 
   if (parseToken(AsmToken::EndOfStatement, "unexpected token in argument list"))
     return true;
@@ -2884,7 +2571,6 @@ bool AArch64AsmParser::tryParseVectorRegister(OperandVector &Operands) {
 
 /// parseRegister - Parse a non-vector register operand.
 bool AArch64AsmParser::parseRegister(OperandVector &Operands) {
-  MCAsmParser &Parser = getParser();
   SMLoc S = getLoc();
   // Try for a vector register.
   if (!tryParseVectorRegister(Operands))
@@ -2897,30 +2583,6 @@ bool AArch64AsmParser::parseRegister(OperandVector &Operands) {
   Operands.push_back(
       AArch64Operand::CreateReg(Reg, false, S, getLoc(), getContext()));
 
-  // A small number of instructions (FMOVXDhighr, for example) have "[1]"
-  // as a string token in the instruction itself.
-  SMLoc LBracS = getLoc();
-  const AsmToken &Tok = Parser.getTok();
-  if (parseOptionalToken(AsmToken::LBrac)) {
-    if (Tok.is(AsmToken::Integer)) {
-      SMLoc IntS = getLoc();
-      int64_t Val = Tok.getIntVal();
-      if (Val == 1) {
-        Parser.Lex();
-        SMLoc RBracS = getLoc();
-        if (parseOptionalToken(AsmToken::RBrac)) {
-          Operands.push_back(
-              AArch64Operand::CreateToken("[", false, LBracS, getContext()));
-          Operands.push_back(
-              AArch64Operand::CreateToken("1", false, IntS, getContext()));
-          Operands.push_back(
-              AArch64Operand::CreateToken("]", false, RBracS, getContext()));
-          return false;
-        }
-      }
-    }
-  }
-
   return false;
 }
@@ -3696,6 +3358,8 @@ bool AArch64AsmParser::showMatchError(SMLoc Loc, unsigned ErrCode) {
     return Error(Loc, "immediate must be an integer in range [0, 63].");
   case Match_InvalidImm0_127:
     return Error(Loc, "immediate must be an integer in range [0, 127].");
+  case Match_InvalidImm0_255:
+    return Error(Loc, "immediate must be an integer in range [0, 255].");
   case Match_InvalidImm0_65535:
     return Error(Loc, "immediate must be an integer in range [0, 65535].");
   case Match_InvalidImm1_8:
@@ -4120,6 +3784,7 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
   case Match_InvalidImm0_31:
   case Match_InvalidImm0_63:
   case Match_InvalidImm0_127:
+  case Match_InvalidImm0_255:
   case Match_InvalidImm0_65535:
   case Match_InvalidImm1_8:
   case Match_InvalidImm1_16:
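The hunks above are the heart of this parser change: the hand-maintained if/else ladders for AT and TLBI operand names give way to TableGen-generated tables queried through lookupATByName and lookupTLBIByName, so each operand's name, packed SYS encoding, and required subtarget features live in one record. A minimal sketch of that shape, assuming a hand-written table (ATEntry, the lookup helper, and the three entries below are illustrative; the real tables are emitted into AArch64GenSystemOperands.inc and cover every operand):

#include <cctype>
#include <cstdint>

// One record per named system operand: name plus the packed SYS
// encoding (op1:CRn:CRm:op2). The generated tables also carry features.
struct ATEntry {
  const char *Name;
  uint16_t Encoding;
};

// Illustrative subset; encodings computed as op2 | CRm<<3 | CRn<<7 | op1<<11.
static const ATEntry ATTable[] = {
    {"s1e1r", 0x3c0},   // SYS #0, C7, C8, #0
    {"s1e0w", 0x3c3},   // SYS #0, C7, C8, #3
    {"s12e1r", 0x23c4}, // SYS #4, C7, C8, #4
};

static const ATEntry *lookupATByName(const char *Op) {
  for (const ATEntry &E : ATTable) {
    const char *A = E.Name, *B = Op;
    // Case-insensitive compare, mirroring Op.compare_lower() above.
    while (*A && *B && std::tolower((unsigned char)*A) ==
                           std::tolower((unsigned char)*B)) { ++A; ++B; }
    if (*A == '\0' && *B == '\0')
      return &E;
  }
  return nullptr; // caller reports "invalid operand for AT instruction"
}

A miss returns null (invalid operand); a hit whose feature set is not active produces the "AT <name> requires ..." diagnostic, which is why the error strings no longer need to be spelled out per operand.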
diff --git a/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp b/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp
index b4f85204714f..41ae70f85e58 100644
--- a/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp
+++ b/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp
@@ -16,12 +16,20 @@
 #include "Utils/AArch64BaseInfo.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/Format.h"
+#include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <cstdint>
+#include <string>
+
 using namespace llvm;
 
 #define DEBUG_TYPE "asm-printer"
@@ -451,8 +459,8 @@ static const LdStNInstrDesc LdStNInstInfo[] = {
   { AArch64::LD3i64, "ld3", ".d", 1, true, 0 },
   { AArch64::LD3i8_POST, "ld3", ".b", 2, true, 3 },
   { AArch64::LD3i16_POST, "ld3", ".h", 2, true, 6 },
-  { AArch64::LD3i32_POST,  "ld3", ".s", 2, true,  12 },
-  { AArch64::LD3i64_POST,  "ld3", ".d", 2, true,  24 },
+  { AArch64::LD3i32_POST, "ld3", ".s", 2, true, 12 },
+  { AArch64::LD3i64_POST, "ld3", ".d", 2, true, 24 },
   { AArch64::LD3Rv16b, "ld3r", ".16b", 0, false, 0 },
   { AArch64::LD3Rv8h, "ld3r", ".8h", 0, false, 0 },
   { AArch64::LD3Rv4s, "ld3r", ".4s", 0, false, 0 },
@@ -731,7 +739,6 @@ bool AArch64InstPrinter::printSysAlias(const MCInst *MI,
   assert(Opcode == AArch64::SYSxt && "Invalid opcode for SYS alias!");
 #endif
 
-  const char *Asm = nullptr;
   const MCOperand &Op1 = MI->getOperand(0);
   const MCOperand &Cn = MI->getOperand(1);
   const MCOperand &Cm = MI->getOperand(2);
@@ -742,230 +749,74 @@ bool AArch64InstPrinter::printSysAlias(const MCInst *MI,
   unsigned CmVal = Cm.getImm();
   unsigned Op2Val = Op2.getImm();
 
+  uint16_t Encoding = Op2Val;
+  Encoding |= CmVal << 3;
+  Encoding |= CnVal << 7;
+  Encoding |= Op1Val << 11;
+
+  bool NeedsReg;
+  std::string Ins;
+  std::string Name;
+
   if (CnVal == 7) {
     switch (CmVal) {
-    default:
-      break;
-
+    default: return false;
     // IC aliases
-    case 1:
-      if (Op1Val == 0 && Op2Val == 0)
-        Asm = "ic\tialluis";
-      break;
-    case 5:
-      if (Op1Val == 0 && Op2Val == 0)
-        Asm = "ic\tiallu";
-      else if (Op1Val == 3 && Op2Val == 1)
-        Asm = "ic\tivau";
-      break;
-
+    case 1: case 5: {
+      const AArch64IC::IC *IC = AArch64IC::lookupICByEncoding(Encoding);
+      if (!IC || !IC->haveFeatures(STI.getFeatureBits()))
+        return false;
+
+      NeedsReg = IC->NeedsReg;
+      Ins = "ic\t";
+      Name = std::string(IC->Name);
    }
+    break;
    // DC aliases
-    case 4:
-      if (Op1Val == 3 && Op2Val == 1)
-        Asm = "dc\tzva";
-      break;
-    case 6:
-      if (Op1Val == 0 && Op2Val == 1)
-        Asm = "dc\tivac";
-      if (Op1Val == 0 && Op2Val == 2)
-        Asm = "dc\tisw";
-      break;
-    case 10:
-      if (Op1Val == 3 && Op2Val == 1)
-        Asm = "dc\tcvac";
-      else if (Op1Val == 0 && Op2Val == 2)
-        Asm = "dc\tcsw";
-      break;
-    case 11:
-      if (Op1Val == 3 && Op2Val == 1)
-        Asm = "dc\tcvau";
-      break;
-    case 12:
-      if (Op1Val == 3 && Op2Val == 1 &&
-          (STI.getFeatureBits()[AArch64::HasV8_2aOps]))
-        Asm = "dc\tcvap";
-      break;
-    case 14:
-      if (Op1Val == 3 && Op2Val == 1)
-        Asm = "dc\tcivac";
-      else if (Op1Val == 0 && Op2Val == 2)
-        Asm = "dc\tcisw";
-      break;
-
+    case 4: case 6: case 10: case 11: case 12: case 14:
+    {
+      const AArch64DC::DC *DC = AArch64DC::lookupDCByEncoding(Encoding);
+      if (!DC || !DC->haveFeatures(STI.getFeatureBits()))
+        return false;
+
+      NeedsReg = true;
+      Ins = "dc\t";
+      Name = std::string(DC->Name);
    }
+    break;
    // AT aliases
-    case 8:
-      switch (Op1Val) {
-      default:
-        break;
-      case 0:
-        switch (Op2Val) {
-        default:
-          break;
-        case 0: Asm = "at\ts1e1r"; break;
-        case 1: Asm = "at\ts1e1w"; break;
-        case 2: Asm = "at\ts1e0r"; break;
-        case 3: Asm = "at\ts1e0w"; break;
-        }
-        break;
-      case 4:
-        switch (Op2Val) {
-        default:
-          break;
-        case 0: Asm = "at\ts1e2r"; break;
-        case 1: Asm = "at\ts1e2w"; break;
-        case 4: Asm = "at\ts12e1r"; break;
-        case 5: Asm = "at\ts12e1w"; break;
-        case 6: Asm = "at\ts12e0r"; break;
-        case 7: Asm = "at\ts12e0w"; break;
-        }
-        break;
-      case 6:
-        switch (Op2Val) {
-        default:
-          break;
-        case 0: Asm = "at\ts1e3r"; break;
-        case 1: Asm = "at\ts1e3w"; break;
-        }
-        break;
-      }
-      break;
-    case 9:
-      switch (Op1Val) {
-      default:
-        break;
-      case 0:
-        if (STI.getFeatureBits()[AArch64::HasV8_2aOps]) {
-          switch (Op2Val) {
-          default:
-            break;
-          case 0: Asm = "at\ts1e1rp"; break;
-          case 1: Asm = "at\ts1e1wp"; break;
-          }
-        }
-        break;
-      }
+    case 8: case 9: {
+      const AArch64AT::AT *AT = AArch64AT::lookupATByEncoding(Encoding);
+      if (!AT || !AT->haveFeatures(STI.getFeatureBits()))
+        return false;
+
+      NeedsReg = true;
+      Ins = "at\t";
+      Name = std::string(AT->Name);
     }
+    break;
     }
   } else if (CnVal == 8) {
     // TLBI aliases
-    switch (CmVal) {
-    default:
-      break;
-    case 3:
-      switch (Op1Val) {
-      default:
-        break;
-      case 0:
-        switch (Op2Val) {
-        default:
-          break;
-        case 0: Asm = "tlbi\tvmalle1is"; break;
-        case 1: Asm = "tlbi\tvae1is"; break;
-        case 2: Asm = "tlbi\taside1is"; break;
-        case 3: Asm = "tlbi\tvaae1is"; break;
-        case 5: Asm = "tlbi\tvale1is"; break;
-        case 7: Asm = "tlbi\tvaale1is"; break;
-        }
-        break;
-      case 4:
-        switch (Op2Val) {
-        default:
-          break;
-        case 0: Asm = "tlbi\talle2is"; break;
-        case 1: Asm = "tlbi\tvae2is"; break;
-        case 4: Asm = "tlbi\talle1is"; break;
-        case 5: Asm = "tlbi\tvale2is"; break;
-        case 6: Asm = "tlbi\tvmalls12e1is"; break;
-        }
-        break;
-      case 6:
-        switch (Op2Val) {
-        default:
-          break;
-        case 0: Asm = "tlbi\talle3is"; break;
-        case 1: Asm = "tlbi\tvae3is"; break;
-        case 5: Asm = "tlbi\tvale3is"; break;
-        }
-        break;
-      }
-      break;
-    case 0:
-      switch (Op1Val) {
-      default:
-        break;
-      case 4:
-        switch (Op2Val) {
-        default:
-          break;
-        case 1: Asm = "tlbi\tipas2e1is"; break;
-        case 5: Asm = "tlbi\tipas2le1is"; break;
-        }
-        break;
-      }
-      break;
-    case 4:
-      switch (Op1Val) {
-      default:
-        break;
-      case 4:
-        switch (Op2Val) {
-        default:
-          break;
-        case 1: Asm = "tlbi\tipas2e1"; break;
-        case 5: Asm = "tlbi\tipas2le1"; break;
-        }
-        break;
-      }
-      break;
-    case 7:
-      switch (Op1Val) {
-      default:
-        break;
-      case 0:
-        switch (Op2Val) {
-        default:
-          break;
-        case 0: Asm = "tlbi\tvmalle1"; break;
-        case 1: Asm = "tlbi\tvae1"; break;
-        case 2: Asm = "tlbi\taside1"; break;
-        case 3: Asm = "tlbi\tvaae1"; break;
-        case 5: Asm = "tlbi\tvale1"; break;
-        case 7: Asm = "tlbi\tvaale1"; break;
-        }
-        break;
-      case 4:
-        switch (Op2Val) {
-        default:
-          break;
-        case 0: Asm = "tlbi\talle2"; break;
-        case 1: Asm = "tlbi\tvae2"; break;
-        case 4: Asm = "tlbi\talle1"; break;
-        case 5: Asm = "tlbi\tvale2"; break;
-        case 6: Asm = "tlbi\tvmalls12e1"; break;
-        }
-        break;
-      case 6:
-        switch (Op2Val) {
-        default:
-          break;
-        case 0: Asm = "tlbi\talle3"; break;
-        case 1: Asm = "tlbi\tvae3"; break;
-        case 5: Asm = "tlbi\tvale3"; break;
-        }
-        break;
-      }
-      break;
-    }
+    const AArch64TLBI::TLBI *TLBI = AArch64TLBI::lookupTLBIByEncoding(Encoding);
+    if (!TLBI || !TLBI->haveFeatures(STI.getFeatureBits()))
+      return false;
+
+    NeedsReg = TLBI->NeedsReg;
+    Ins = "tlbi\t";
+    Name = std::string(TLBI->Name);
   }
+  else
+    return false;
 
-  if (Asm) {
-    unsigned Reg = MI->getOperand(4).getReg();
+  std::string Str = Ins + Name;
+  std::transform(Str.begin(), Str.end(), Str.begin(), ::tolower);
 
-    O << '\t' << Asm;
-    if (StringRef(Asm).lower().find("all") == StringRef::npos)
-      O << ", " << getRegisterName(Reg);
-  }
+  O << '\t' << Str;
+  if (NeedsReg)
+    O << ", " << getRegisterName(MI->getOperand(4).getReg());
 
-  return Asm != nullptr;
+  return true;
 }
 
 void AArch64InstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
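printSysAlias now folds the four SYS operand fields into one 14-bit key before consulting the generated lookupICByEncoding / lookupDCByEncoding / lookupATByEncoding tables: op2 occupies bits 0-2, CRm bits 3-6, CRn bits 7-10, and op1 bits 11-13. A self-contained sketch of that packing, assuming nothing beyond the shifts shown in the hunk (packSysEncoding and the field accessors are illustrative names; the dc zva check just demonstrates a round trip):

#include <cassert>
#include <cstdint>

// Pack SYS operands into the 14-bit key used by the *ByEncoding lookups,
// mirroring "Encoding |= CmVal << 3; ..." in the diff above.
static uint16_t packSysEncoding(unsigned Op1, unsigned Cn, unsigned Cm,
                                unsigned Op2) {
  return static_cast<uint16_t>(Op2 | (Cm << 3) | (Cn << 7) | (Op1 << 11));
}

static unsigned op2Of(uint16_t E) { return E & 0x7; }
static unsigned crmOf(uint16_t E) { return (E >> 3) & 0xf; }
static unsigned crnOf(uint16_t E) { return (E >> 7) & 0xf; }
static unsigned op1Of(uint16_t E) { return (E >> 11) & 0x7; }

int main() {
  // "dc zva" is SYS #3, C7, C4, #1 in the ARMv8 system instruction space.
  uint16_t E = packSysEncoding(3, 7, 4, 1);
  assert(E == 0x1ba1);
  assert(op1Of(E) == 3 && crnOf(E) == 7 && crmOf(E) == 4 && op2Of(E) == 1);
  return 0;
}

Because the key is unique per operand, the printer no longer needs nested switches over each field; one table probe and a feature check decide whether the alias may be printed.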
diff --git a/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h b/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h
index 65dca99ed04e..a45258cb97b7 100644
--- a/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h
+++ b/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h
@@ -15,6 +15,7 @@
 #define LLVM_LIB_TARGET_AARCH64_INSTPRINTER_AARCH64INSTPRINTER_H
 
 #include "MCTargetDesc/AArch64MCTargetDesc.h"
+#include "llvm/ADT/StringRef.h"
 #include "llvm/MC/MCInstPrinter.h"
 
 namespace llvm {
@@ -37,9 +38,11 @@ public:
                                unsigned PrintMethodIdx,
                                const MCSubtargetInfo &STI, raw_ostream &O);
+
   virtual StringRef getRegName(unsigned RegNo) const {
     return getRegisterName(RegNo);
   }
+
   static const char *getRegisterName(unsigned RegNo,
                                      unsigned AltIdx = AArch64::NoRegAltName);
@@ -177,12 +180,15 @@ public:
                                unsigned PrintMethodIdx,
                                const MCSubtargetInfo &STI, raw_ostream &O) override;
+
   StringRef getRegName(unsigned RegNo) const override {
     return getRegisterName(RegNo);
   }
+
   static const char *getRegisterName(unsigned RegNo,
                                      unsigned AltIdx = AArch64::NoRegAltName);
 };
 
-}
-#endif
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_AARCH64_INSTPRINTER_AARCH64INSTPRINTER_H
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
index 14c0327f5fa8..ebf05ae303dd 100644
--- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
@@ -73,7 +73,7 @@ public:
   }
 
   void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
-                  uint64_t Value, bool IsPCRel) const override;
+                  uint64_t Value, bool IsPCRel, MCContext &Ctx) const override;
 
   bool mayNeedRelaxation(const MCInst &Inst) const override;
   bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
@@ -138,15 +138,15 @@ static unsigned AdrImmBits(unsigned Value) {
 }
 
 static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
-                                 MCContext *Ctx) {
+                                 MCContext &Ctx) {
   unsigned Kind = Fixup.getKind();
   int64_t SignedValue = static_cast<int64_t>(Value);
   switch (Kind) {
   default:
     llvm_unreachable("Unknown fixup kind!");
   case AArch64::fixup_aarch64_pcrel_adr_imm21:
-    if (Ctx && (SignedValue > 2097151 || SignedValue < -2097152))
-      Ctx->reportError(Fixup.getLoc(), "fixup value out of range");
+    if (SignedValue > 2097151 || SignedValue < -2097152)
+      Ctx.reportError(Fixup.getLoc(), "fixup value out of range");
     return AdrImmBits(Value & 0x1fffffULL);
   case AArch64::fixup_aarch64_pcrel_adrp_imm21:
     return AdrImmBits((Value & 0x1fffff000ULL) >> 12);
@@ -154,66 +154,65 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
   case AArch64::fixup_aarch64_pcrel_branch19:
     // Signed 21-bit immediate
     if (SignedValue > 2097151 || SignedValue < -2097152)
-      if (Ctx) Ctx->reportError(Fixup.getLoc(), "fixup value out of range");
-    if (Ctx && (Value & 0x3))
-      Ctx->reportError(Fixup.getLoc(), "fixup not sufficiently aligned");
+      Ctx.reportError(Fixup.getLoc(), "fixup value out of range");
+    if (Value & 0x3)
+      Ctx.reportError(Fixup.getLoc(), "fixup not sufficiently aligned");
     // Low two bits are not encoded.
     return (Value >> 2) & 0x7ffff;
   case AArch64::fixup_aarch64_add_imm12:
   case AArch64::fixup_aarch64_ldst_imm12_scale1:
     // Unsigned 12-bit immediate
-    if (Ctx && Value >= 0x1000)
-      Ctx->reportError(Fixup.getLoc(), "fixup value out of range");
+    if (Value >= 0x1000)
+      Ctx.reportError(Fixup.getLoc(), "fixup value out of range");
     return Value;
   case AArch64::fixup_aarch64_ldst_imm12_scale2:
     // Unsigned 12-bit immediate which gets multiplied by 2
-    if (Ctx && (Value >= 0x2000))
-      Ctx->reportError(Fixup.getLoc(), "fixup value out of range");
-    if (Ctx && (Value & 0x1))
-      Ctx->reportError(Fixup.getLoc(), "fixup must be 2-byte aligned");
+    if (Value >= 0x2000)
+      Ctx.reportError(Fixup.getLoc(), "fixup value out of range");
+    if (Value & 0x1)
+      Ctx.reportError(Fixup.getLoc(), "fixup must be 2-byte aligned");
     return Value >> 1;
   case AArch64::fixup_aarch64_ldst_imm12_scale4:
     // Unsigned 12-bit immediate which gets multiplied by 4
-    if (Ctx && (Value >= 0x4000))
-      Ctx->reportError(Fixup.getLoc(), "fixup value out of range");
-    if (Ctx && (Value & 0x3))
-      Ctx->reportError(Fixup.getLoc(), "fixup must be 4-byte aligned");
+    if (Value >= 0x4000)
+      Ctx.reportError(Fixup.getLoc(), "fixup value out of range");
+    if (Value & 0x3)
+      Ctx.reportError(Fixup.getLoc(), "fixup must be 4-byte aligned");
     return Value >> 2;
   case AArch64::fixup_aarch64_ldst_imm12_scale8:
     // Unsigned 12-bit immediate which gets multiplied by 8
-    if (Ctx && (Value >= 0x8000))
-      Ctx->reportError(Fixup.getLoc(), "fixup value out of range");
-    if (Ctx && (Value & 0x7))
-      Ctx->reportError(Fixup.getLoc(), "fixup must be 8-byte aligned");
+    if (Value >= 0x8000)
+      Ctx.reportError(Fixup.getLoc(), "fixup value out of range");
+    if (Value & 0x7)
+      Ctx.reportError(Fixup.getLoc(), "fixup must be 8-byte aligned");
    return Value >> 3;
   case AArch64::fixup_aarch64_ldst_imm12_scale16:
     // Unsigned 12-bit immediate which gets multiplied by 16
-    if (Ctx && (Value >= 0x10000))
-      Ctx->reportError(Fixup.getLoc(), "fixup value out of range");
-    if (Ctx && (Value & 0xf))
-      Ctx->reportError(Fixup.getLoc(), "fixup must be 16-byte aligned");
+    if (Value >= 0x10000)
+      Ctx.reportError(Fixup.getLoc(), "fixup value out of range");
+    if (Value & 0xf)
+      Ctx.reportError(Fixup.getLoc(), "fixup must be 16-byte aligned");
     return Value >> 4;
   case AArch64::fixup_aarch64_movw:
-    if (Ctx)
-      Ctx->reportError(Fixup.getLoc(),
-                       "no resolvable MOVZ/MOVK fixups supported yet");
+    Ctx.reportError(Fixup.getLoc(),
+                    "no resolvable MOVZ/MOVK fixups supported yet");
     return Value;
   case AArch64::fixup_aarch64_pcrel_branch14:
     // Signed 16-bit immediate
-    if (Ctx && (SignedValue > 32767 || SignedValue < -32768))
-      Ctx->reportError(Fixup.getLoc(), "fixup value out of range");
+    if (SignedValue > 32767 || SignedValue < -32768)
+      Ctx.reportError(Fixup.getLoc(), "fixup value out of range");
     // Low two bits are not encoded (4-byte alignment assumed).
-    if (Ctx && (Value & 0x3))
-      Ctx->reportError(Fixup.getLoc(), "fixup not sufficiently aligned");
+    if (Value & 0x3)
+      Ctx.reportError(Fixup.getLoc(), "fixup not sufficiently aligned");
     return (Value >> 2) & 0x3fff;
   case AArch64::fixup_aarch64_pcrel_branch26:
   case AArch64::fixup_aarch64_pcrel_call26:
     // Signed 28-bit immediate
-    if (Ctx && (SignedValue > 134217727 || SignedValue < -134217728))
-      Ctx->reportError(Fixup.getLoc(), "fixup value out of range");
+    if (SignedValue > 134217727 || SignedValue < -134217728)
+      Ctx.reportError(Fixup.getLoc(), "fixup value out of range");
     // Low two bits are not encoded (4-byte alignment assumed).
-    if (Ctx && (Value & 0x3))
-      Ctx->reportError(Fixup.getLoc(), "fixup not sufficiently aligned");
+    if (Value & 0x3)
+      Ctx.reportError(Fixup.getLoc(), "fixup not sufficiently aligned");
     return (Value >> 2) & 0x3ffffff;
   case FK_Data_1:
   case FK_Data_2:
@@ -264,13 +263,13 @@ unsigned AArch64AsmBackend::getFixupKindContainereSizeInBytes(unsigned Kind) con
 
 void AArch64AsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
                                    unsigned DataSize, uint64_t Value,
-                                   bool IsPCRel) const {
+                                   bool IsPCRel, MCContext &Ctx) const {
   unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind());
   if (!Value)
     return; // Doesn't change encoding.
   MCFixupKindInfo Info = getFixupKindInfo(Fixup.getKind());
   // Apply any target-specific value adjustments.
-  Value = adjustFixupValue(Fixup, Value, nullptr);
+  Value = adjustFixupValue(Fixup, Value, Ctx);
 
   // Shift the value into position.
   Value <<= Info.TargetOffset;
@@ -521,17 +520,6 @@ public:
 
     return CompactUnwindEncoding;
   }
-
-  void processFixupValue(const MCAssembler &Asm, const MCAsmLayout &Layout,
-                         const MCFixup &Fixup, const MCFragment *DF,
-                         const MCValue &Target, uint64_t &Value,
-                         bool &IsResolved) override {
-    // Try to get the encoded value for the fixup as-if we're mapping it into
-    // the instruction. This allows adjustFixupValue() to issue a diagnostic
-    // if the value is invalid.
-    if (IsResolved)
-      (void)adjustFixupValue(Fixup, Value, &Asm.getContext());
-  }
 };
 
 } // end anonymous namespace
 
@@ -575,12 +563,6 @@ void ELFAArch64AsmBackend::processFixupValue(
   // to the linker -- a relocation!
   if ((uint32_t)Fixup.getKind() == AArch64::fixup_aarch64_pcrel_adrp_imm21)
     IsResolved = false;
-
-  // Try to get the encoded value for the fixup as-if we're mapping it into
-  // the instruction. This allows adjustFixupValue() to issue a diagnostic
-  // if the value is invalid.
-  if (IsResolved)
-    (void)adjustFixupValue(Fixup, Value, &Asm.getContext());
 }
 }

diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
index 685907a2178e..271263507ae1 100644
--- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
@@ -14,27 +14,23 @@
 //===----------------------------------------------------------------------===//
 
 #include "AArch64TargetStreamer.h"
-#include "llvm/MC/MCELFStreamer.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Triple.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/MC/MCAsmBackend.h"
-#include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCAssembler.h"
 #include "llvm/MC/MCCodeEmitter.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCELFStreamer.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
-#include "llvm/MC/MCObjectStreamer.h"
 #include "llvm/MC/MCSection.h"
-#include "llvm/MC/MCSectionELF.h"
 #include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/MCSymbolELF.h"
-#include "llvm/MC/MCValue.h"
-#include "llvm/Support/Debug.h"
+#include "llvm/Support/Casting.h"
 #include "llvm/Support/ELF.h"
-#include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/FormattedStream.h"
 #include "llvm/Support/raw_ostream.h"
 
@@ -106,8 +102,8 @@ public:
   /// This function is the one used to emit instruction data into the ELF
   /// streamer. We override it to add the appropriate mapping symbol if
   /// necessary.
-  void EmitInstruction(const MCInst &Inst,
-                       const MCSubtargetInfo &STI) override {
+  void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
+                       bool) override {
     EmitA64MappingSymbol();
     MCELFStreamer::EmitInstruction(Inst, STI);
   }
@@ -180,6 +176,7 @@ private:
   DenseMap<const MCSection *, ElfMappingSymbol> LastMappingSymbols;
   ElfMappingSymbol LastEMS;
 };
+
 } // end anonymous namespace
 
 AArch64ELFStreamer &AArch64TargetELFStreamer::getStreamer() {
@@ -191,6 +188,7 @@ void AArch64TargetELFStreamer::emitInst(uint32_t Inst) {
 }
 
 namespace llvm {
+
 MCTargetStreamer *createAArch64AsmTargetStreamer(MCStreamer &S,
                                                  formatted_raw_ostream &OS,
                                                  MCInstPrinter *InstPrint,
@@ -214,4 +212,5 @@ createAArch64ObjectTargetStreamer(MCStreamer &S, const MCSubtargetInfo &STI) {
     return new AArch64TargetELFStreamer(S);
   return nullptr;
 }
-}
+
+} // end namespace llvm
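The thread running through the AArch64AsmBackend hunks above: applyFixup now receives an MCContext&, so adjustFixupValue can report range and alignment violations unconditionally instead of guarding every check with if (Ctx), and the processFixupValue overrides whose only job was to trigger those diagnostics early become redundant and are deleted. A reduced sketch of the resulting pattern, with a hypothetical DiagContext standing in for MCContext (adjustBranch19 mirrors only the fixup_aarch64_pcrel_branch19 case):

#include <cstdint>
#include <iostream>

// Stand-in for MCContext::reportError; the real one also carries a SMLoc.
struct DiagContext {
  void reportError(const char *Msg) { std::cerr << "error: " << Msg << '\n'; }
};

// The value must fit in a signed 21-bit, 4-byte-aligned immediate;
// the low two bits are not encoded.
static uint64_t adjustBranch19(uint64_t Value, DiagContext &Ctx) {
  int64_t SignedValue = static_cast<int64_t>(Value);
  if (SignedValue > 2097151 || SignedValue < -2097152)
    Ctx.reportError("fixup value out of range");
  if (Value & 0x3)
    Ctx.reportError("fixup not sufficiently aligned");
  return (Value >> 2) & 0x7ffff;
}

int main() {
  DiagContext Ctx;
  adjustBranch19(0x1000, Ctx);   // in range and aligned: no diagnostic
  adjustBranch19(0x400000, Ctx); // reports "fixup value out of range"
  return 0;
}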
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
index e9d38d3dcf10..f710065d9bc7 100644
--- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
@@ -84,9 +84,14 @@ static void adjustCodeGenOpts(const Triple &TT, Reloc::Model RM,
   // no matter how far away they are.
   else if (CM == CodeModel::JITDefault)
     CM = CodeModel::Large;
-  else if (CM != CodeModel::Small && CM != CodeModel::Large)
-    report_fatal_error(
-        "Only small and large code models are allowed on AArch64");
+  else if (CM != CodeModel::Small && CM != CodeModel::Large) {
+    if (!TT.isOSFuchsia())
+      report_fatal_error(
+          "Only small and large code models are allowed on AArch64");
+    else if (CM != CodeModel::Kernel)
+      report_fatal_error(
+          "Only small, kernel, and large code models are allowed on AArch64");
+  }
 }
 
 static MCInstPrinter *createAArch64MCInstPrinter(const Triple &T,
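The adjustCodeGenOpts hunk relaxes the old small-or-large rule by exactly one case: Fuchsia targets may also use the kernel code model. A compact sketch of the resulting decision logic (the enum is reduced to three values for illustration; the real function also normalizes Default and JITDefault before this check, as the context lines show):

#include <stdexcept>

enum class CodeModel { Small, Kernel, Large };

// Every target accepts Small and Large; only Fuchsia also accepts Kernel.
static void checkCodeModel(CodeModel CM, bool IsFuchsia) {
  if (CM == CodeModel::Small || CM == CodeModel::Large)
    return;
  if (!IsFuchsia)
    throw std::runtime_error(
        "Only small and large code models are allowed on AArch64");
  if (CM != CodeModel::Kernel)
    throw std::runtime_error(
        "Only small, kernel, and large code models are allowed on AArch64");
}

int main() {
  checkCodeModel(CodeModel::Kernel, /*IsFuchsia=*/true); // accepted
  return 0;
}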
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp
index 53a68527ee8e..3d296ba4806b 100644
--- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp
@@ -16,14 +16,22 @@
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCFixup.h"
+#include "llvm/MC/MCFragment.h"
 #include "llvm/MC/MCMachObjectWriter.h"
+#include "llvm/MC/MCSection.h"
 #include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCSymbol.h"
 #include "llvm/MC/MCValue.h"
-#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Casting.h"
 #include "llvm/Support/MachO.h"
+#include "llvm/Support/MathExtras.h"
+#include <cassert>
+#include <cstdint>
+
 using namespace llvm;
 
 namespace {
+
 class AArch64MachObjectWriter : public MCMachObjectTargetWriter {
   bool getAArch64FixupKindMachOInfo(const MCFixup &Fixup, unsigned &RelocType,
                                     const MCSymbolRefExpr *Sym,
@@ -38,7 +46,8 @@ public:
                         const MCFixup &Fixup, MCValue Target,
                         uint64_t &FixedValue) override;
 };
-}
+
+} // end anonymous namespace
 
 bool AArch64MachObjectWriter::getAArch64FixupKindMachOInfo(
     const MCFixup &Fixup, unsigned &RelocType, const MCSymbolRefExpr *Sym,
@@ -51,18 +60,18 @@ bool AArch64MachObjectWriter::getAArch64FixupKindMachOInfo(
     return false;
 
   case FK_Data_1:
-    Log2Size = llvm::Log2_32(1);
+    Log2Size = Log2_32(1);
     return true;
   case FK_Data_2:
-    Log2Size = llvm::Log2_32(2);
+    Log2Size = Log2_32(2);
     return true;
   case FK_Data_4:
-    Log2Size = llvm::Log2_32(4);
+    Log2Size = Log2_32(4);
     if (Sym->getKind() == MCSymbolRefExpr::VK_GOT)
       RelocType = unsigned(MachO::ARM64_RELOC_POINTER_TO_GOT);
     return true;
   case FK_Data_8:
-    Log2Size = llvm::Log2_32(8);
+    Log2Size = Log2_32(8);
     if (Sym->getKind() == MCSymbolRefExpr::VK_GOT)
       RelocType = unsigned(MachO::ARM64_RELOC_POINTER_TO_GOT);
     return true;
@@ -72,7 +81,7 @@ bool AArch64MachObjectWriter::getAArch64FixupKindMachOInfo(
   case AArch64::fixup_aarch64_ldst_imm12_scale4:
   case AArch64::fixup_aarch64_ldst_imm12_scale8:
   case AArch64::fixup_aarch64_ldst_imm12_scale16:
-    Log2Size = llvm::Log2_32(4);
+    Log2Size = Log2_32(4);
     switch (Sym->getKind()) {
     default:
      return false;
@@ -87,14 +96,13 @@ bool AArch64MachObjectWriter::getAArch64FixupKindMachOInfo(
     return true;
   }
   case AArch64::fixup_aarch64_pcrel_adrp_imm21:
-    Log2Size = llvm::Log2_32(4);
+    Log2Size = Log2_32(4);
     // This encompasses the relocation for the whole 21-bit value.
     switch (Sym->getKind()) {
-    default: {
+    default:
       Asm.getContext().reportError(Fixup.getLoc(),
                                    "ADR/ADRP relocations must be GOT relative");
       return false;
-    }
     case MCSymbolRefExpr::VK_PAGE:
       RelocType = unsigned(MachO::ARM64_RELOC_PAGE21);
       return true;
@@ -108,7 +116,7 @@ bool AArch64MachObjectWriter::getAArch64FixupKindMachOInfo(
     return true;
   case AArch64::fixup_aarch64_pcrel_branch26:
   case AArch64::fixup_aarch64_pcrel_call26:
-    Log2Size = llvm::Log2_32(4);
+    Log2Size = Log2_32(4);
     RelocType = unsigned(MachO::ARM64_RELOC_BRANCH26);
     return true;
   }
 }
diff --git a/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h
index dcc39176031c..5d76681cd97b 100644
--- a/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h
+++ b/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h
@@ -266,82 +266,86 @@ inline static unsigned getNZCVToSatisfyCondCode(CondCode Code) {
   }
 } // end namespace AArch64CC
 
+struct SysAlias {
+  const char *Name;
+  uint16_t Encoding;
+  FeatureBitset FeaturesRequired;
+
+  SysAlias (const char *N, uint16_t E) : Name(N), Encoding(E) {};
+  SysAlias (const char *N, uint16_t E, FeatureBitset F) :
+    Name(N), Encoding(E), FeaturesRequired(F) {};
+
+  bool haveFeatures(FeatureBitset ActiveFeatures) const {
+    return (FeaturesRequired & ActiveFeatures) == FeaturesRequired;
+  }
+
+  FeatureBitset getRequiredFeatures() const { return FeaturesRequired; }
+};
+
+struct SysAliasReg : SysAlias {
+  bool NeedsReg;
+  SysAliasReg(const char *N, uint16_t E, bool R) : SysAlias(N, E), NeedsReg(R) {};
+};
+
 namespace AArch64AT{
-  struct AT {
-    const char *Name;
-    uint16_t Encoding;
+  struct AT : SysAlias {
+    using SysAlias::SysAlias;
   };
-
   #define GET_AT_DECL
   #include "AArch64GenSystemOperands.inc"
-
 }
+
 namespace AArch64DB {
-  struct DB {
-    const char *Name;
-    uint16_t Encoding;
+  struct DB : SysAlias {
+    using SysAlias::SysAlias;
   };
-
   #define GET_DB_DECL
   #include "AArch64GenSystemOperands.inc"
 }
 
 namespace  AArch64DC {
-  struct DC {
-    const char *Name;
-    uint16_t Encoding;
+  struct DC : SysAlias {
+    using SysAlias::SysAlias;
   };
-
   #define GET_DC_DECL
   #include "AArch64GenSystemOperands.inc"
 }
 
 namespace  AArch64IC {
-  struct IC {
-    const char *Name;
-    uint16_t Encoding;
-    bool NeedsReg;
+  struct IC : SysAliasReg {
+    using SysAliasReg::SysAliasReg;
   };
   #define GET_IC_DECL
   #include "AArch64GenSystemOperands.inc"
 }
 
 namespace  AArch64ISB {
-  struct ISB {
-    const char *Name;
-    uint16_t Encoding;
+  struct ISB : SysAlias {
+    using SysAlias::SysAlias;
   };
   #define GET_ISB_DECL
   #include "AArch64GenSystemOperands.inc"
 }
 
 namespace AArch64PRFM {
-  struct PRFM {
-    const char *Name;
-    uint16_t Encoding;
+  struct PRFM : SysAlias {
+    using SysAlias::SysAlias;
   };
   #define GET_PRFM_DECL
   #include "AArch64GenSystemOperands.inc"
 }
 
 namespace AArch64PState {
-  struct PState {
-    const char *Name;
-    uint16_t Encoding;
-    FeatureBitset FeaturesRequired;
-
-    bool haveFeatures(FeatureBitset ActiveFeatures) const {
-      return (FeaturesRequired & ActiveFeatures) == FeaturesRequired;
-    }
+  struct PState : SysAlias{
+    using SysAlias::SysAlias;
   };
   #define GET_PSTATE_DECL
   #include "AArch64GenSystemOperands.inc"
 }
 
 namespace AArch64PSBHint {
-  struct PSB {
-    const char *Name;
-    uint16_t Encoding;
+  struct PSB : SysAlias {
+    using SysAlias::SysAlias;
   };
   #define GET_PSB_DECL
   #include "AArch64GenSystemOperands.inc"
@@ -451,10 +455,8 @@ namespace AArch64SysReg {
 }
 
 namespace AArch64TLBI {
-  struct TLBI {
-    const char *Name;
-    uint16_t Encoding;
-    bool NeedsReg;
+  struct TLBI : SysAliasReg {
+    using SysAliasReg::SysAliasReg;
   };
   #define GET_TLBI_DECL
   #include "AArch64GenSystemOperands.inc"
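The new SysAlias / SysAliasReg base structs collapse the near-identical per-namespace records (AT, DB, DC, IC, ISB, PRFM, PState, PSB, TLBI) into one shape: name, packed encoding, and a FeatureBitset, with haveFeatures() as a plain subset test, (Required & Active) == Required. A sketch of that test with std::bitset standing in for llvm::FeatureBitset (SysAliasSketch, the feature bit index, and the v8.2-only s1e1rp entry are illustrative):

#include <bitset>
#include <cassert>
#include <cstdint>

using FeatureBits = std::bitset<64>; // stand-in for llvm::FeatureBitset

struct SysAliasSketch {
  const char *Name;
  uint16_t Encoding;
  FeatureBits FeaturesRequired;

  // An alias is usable iff every required feature bit is active.
  bool haveFeatures(const FeatureBits &Active) const {
    return (FeaturesRequired & Active) == FeaturesRequired;
  }
};

int main() {
  const unsigned HasV8_2a = 5; // hypothetical feature bit index
  // s1e1rp is SYS #0, C7, C9, #0 and needs ARMv8.2-A.
  SysAliasSketch S1E1RP{"s1e1rp", 0x3c8, FeatureBits{}.set(HasV8_2a)};

  FeatureBits BaseV8;                           // v8.0: bit not set
  FeatureBits V8_2 = FeatureBits{}.set(HasV8_2a);

  assert(!S1E1RP.haveFeatures(BaseV8)); // parser emits "requires ..." error
  assert(S1E1RP.haveFeatures(V8_2));
  return 0;
}

This is also what lets the parser and printer changes earlier in the diff share one error path: the record itself knows which features it needs, so the ARMv8.2a special cases no longer have to be hard-coded at each use site.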