Diffstat (limited to 'contrib/llvm/lib/Target/AArch64')
51 files changed, 4510 insertions, 1837 deletions
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64.td b/contrib/llvm/lib/Target/AArch64/AArch64.td index 9a7d6c884db5..0bff9b592c15 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64.td +++ b/contrib/llvm/lib/Target/AArch64/AArch64.td @@ -32,6 +32,15 @@ def FeatureCrypto : SubtargetFeature<"crypto", "HasCrypto", "true", def FeatureCRC : SubtargetFeature<"crc", "HasCRC", "true", "Enable ARMv8 CRC-32 checksum instructions">; +def FeaturePerfMon : SubtargetFeature<"perfmon", "HasPerfMon", "true", + "Enable ARMv8 PMUv3 Performance Monitors extension">; + +def FeatureFullFP16 : SubtargetFeature<"fullfp16", "HasFullFP16", "true", + "Full FP16", [FeatureFPARMv8]>; + +def FeatureSPE : SubtargetFeature<"spe", "HasSPE", "true", + "Enable Statistical Profiling extension">; + /// Cyclone has register move instructions which are "free". def FeatureZCRegMove : SubtargetFeature<"zcm", "HasZeroCycleRegMove", "true", "Has zero-cycle register moves">; @@ -40,6 +49,15 @@ def FeatureZCRegMove : SubtargetFeature<"zcm", "HasZeroCycleRegMove", "true", def FeatureZCZeroing : SubtargetFeature<"zcz", "HasZeroCycleZeroing", "true", "Has zero-cycle zeroing instructions">; +def FeatureStrictAlign : SubtargetFeature<"strict-align", + "StrictAlign", "true", + "Disallow all unaligned memory " + "access">; + +def FeatureReserveX18 : SubtargetFeature<"reserve-x18", "ReserveX18", "true", + "Reserve X18, making it unavailable " + "as a GPR">; + //===----------------------------------------------------------------------===// // Architectures. // @@ -47,6 +65,9 @@ def FeatureZCZeroing : SubtargetFeature<"zcz", "HasZeroCycleZeroing", "true", def HasV8_1aOps : SubtargetFeature<"v8.1a", "HasV8_1aOps", "true", "Support ARM v8.1a instructions", [FeatureCRC]>; +def HasV8_2aOps : SubtargetFeature<"v8.2a", "HasV8_2aOps", "true", + "Support ARM v8.2a instructions", [HasV8_1aOps]>; + //===----------------------------------------------------------------------===// // Register File Description //===----------------------------------------------------------------------===// @@ -70,19 +91,29 @@ include "AArch64SchedA53.td" include "AArch64SchedA57.td" include "AArch64SchedCyclone.td" +def ProcA35 : SubtargetFeature<"a35", "ARMProcFamily", "CortexA35", + "Cortex-A35 ARM processors", + [FeatureFPARMv8, + FeatureNEON, + FeatureCrypto, + FeatureCRC, + FeaturePerfMon]>; + def ProcA53 : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53", "Cortex-A53 ARM processors", [FeatureFPARMv8, FeatureNEON, FeatureCrypto, - FeatureCRC]>; + FeatureCRC, + FeaturePerfMon]>; def ProcA57 : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57", "Cortex-A57 ARM processors", [FeatureFPARMv8, FeatureNEON, FeatureCrypto, - FeatureCRC]>; + FeatureCRC, + FeaturePerfMon]>; def ProcCyclone : SubtargetFeature<"cyclone", "ARMProcFamily", "Cyclone", "Cyclone", @@ -90,12 +121,16 @@ def ProcCyclone : SubtargetFeature<"cyclone", "ARMProcFamily", "Cyclone", FeatureNEON, FeatureCrypto, FeatureCRC, + FeaturePerfMon, FeatureZCRegMove, FeatureZCZeroing]>; def : ProcessorModel<"generic", NoSchedModel, [FeatureFPARMv8, FeatureNEON, - FeatureCRC]>; + FeatureCRC, + FeaturePerfMon]>; +// FIXME: Cortex-A35 is currently modelled as a Cortex-A53 +def : ProcessorModel<"cortex-a35", CortexA53Model, [ProcA35]>; def : ProcessorModel<"cortex-a53", CortexA53Model, [ProcA53]>; def : ProcessorModel<"cortex-a57", CortexA57Model, [ProcA57]>; // FIXME: Cortex-A72 is currently modelled as an Cortex-A57. 
@@ -109,11 +144,13 @@ def : ProcessorModel<"cyclone", CycloneModel, [ProcCyclone]>; def GenericAsmParserVariant : AsmParserVariant { int Variant = 0; string Name = "generic"; + string BreakCharacters = "."; } def AppleAsmParserVariant : AsmParserVariant { int Variant = 1; string Name = "apple-neon"; + string BreakCharacters = "."; } //===----------------------------------------------------------------------===// diff --git a/contrib/llvm/lib/Target/AArch64/AArch64A53Fix835769.cpp b/contrib/llvm/lib/Target/AArch64/AArch64A53Fix835769.cpp index d7ef3f4ef653..d215d9e831c0 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64A53Fix835769.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64A53Fix835769.cpp @@ -122,7 +122,7 @@ AArch64A53Fix835769::runOnMachineFunction(MachineFunction &F) { static MachineBasicBlock *getBBFallenThrough(MachineBasicBlock *MBB, const TargetInstrInfo *TII) { // Get the previous machine basic block in the function. - MachineFunction::iterator MBBI = *MBB; + MachineFunction::iterator MBBI(MBB); // Can't go off top of function. if (MBBI == MBB->getParent()->begin()) @@ -131,7 +131,7 @@ static MachineBasicBlock *getBBFallenThrough(MachineBasicBlock *MBB, MachineBasicBlock *TBB = nullptr, *FBB = nullptr; SmallVector<MachineOperand, 2> Cond; - MachineBasicBlock *PrevBB = std::prev(MBBI); + MachineBasicBlock *PrevBB = &*std::prev(MBBI); for (MachineBasicBlock *S : MBB->predecessors()) if (S == PrevBB && !TII->AnalyzeBranch(*PrevBB, TBB, FBB, Cond) && !TBB && !FBB) @@ -151,10 +151,9 @@ static MachineInstr *getLastNonPseudo(MachineBasicBlock &MBB, // If there is no non-pseudo in the current block, loop back around and try // the previous block (if there is one). while ((FMBB = getBBFallenThrough(FMBB, TII))) { - for (auto I = FMBB->rbegin(), E = FMBB->rend(); I != E; ++I) { - if (!I->isPseudo()) - return &*I; - } + for (MachineInstr &I : make_range(FMBB->rbegin(), FMBB->rend())) + if (!I.isPseudo()) + return &I; } // There was no previous non-pseudo in the fallen through blocks @@ -217,8 +216,8 @@ AArch64A53Fix835769::runOnBasicBlock(MachineBasicBlock &MBB) { ++Idx; } - DEBUG(dbgs() << "Scan complete, "<< Sequences.size() - << " occurences of pattern found.\n"); + DEBUG(dbgs() << "Scan complete, " << Sequences.size() + << " occurrences of pattern found.\n"); // Then update the basic block, inserting nops between the detected sequences. 
for (auto &MI : Sequences) { diff --git a/contrib/llvm/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp b/contrib/llvm/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp index 9d6dbd641a16..79a84ad8c6c5 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp @@ -593,7 +593,6 @@ bool AArch64A57FPLoadBalancing::colorChain(Chain *G, Color C, if (Change) { Substs[MO.getReg()] = Reg; MO.setReg(Reg); - MRI->setPhysRegUsed(Reg); Changed = true; } diff --git a/contrib/llvm/lib/Target/AArch64/AArch64AddressTypePromotion.cpp b/contrib/llvm/lib/Target/AArch64/AArch64AddressTypePromotion.cpp index 716e1a37b1f7..3afcdfb8b930 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64AddressTypePromotion.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64AddressTypePromotion.cpp @@ -57,6 +57,8 @@ EnableMerge("aarch64-type-promotion-merge", cl::Hidden, " the other."), cl::init(true)); +#define AARCH64_TYPE_PROMO_NAME "AArch64 Address Type Promotion" + //===----------------------------------------------------------------------===// // AArch64AddressTypePromotion //===----------------------------------------------------------------------===// @@ -76,7 +78,7 @@ public: } const char *getPassName() const override { - return "AArch64 Address Type Promotion"; + return AARCH64_TYPE_PROMO_NAME; } /// Iterate over the functions and promote the computation of interesting @@ -143,10 +145,10 @@ private: char AArch64AddressTypePromotion::ID = 0; INITIALIZE_PASS_BEGIN(AArch64AddressTypePromotion, "aarch64-type-promotion", - "AArch64 Type Promotion Pass", false, false) + AARCH64_TYPE_PROMO_NAME, false, false) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_END(AArch64AddressTypePromotion, "aarch64-type-promotion", - "AArch64 Type Promotion Pass", false, false) + AARCH64_TYPE_PROMO_NAME, false, false) FunctionPass *llvm::createAArch64AddressTypePromotionPass() { return new AArch64AddressTypePromotion(); diff --git a/contrib/llvm/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp b/contrib/llvm/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp index 18d21fd38618..1644d71d2821 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp @@ -61,6 +61,12 @@ STATISTIC(NumScalarInsnsUsed, "Number of scalar instructions used"); STATISTIC(NumCopiesDeleted, "Number of cross-class copies deleted"); STATISTIC(NumCopiesInserted, "Number of cross-class copies inserted"); +namespace llvm { +void initializeAArch64AdvSIMDScalarPass(PassRegistry &); +} + +#define AARCH64_ADVSIMD_NAME "AdvSIMD Scalar Operation Optimization" + namespace { class AArch64AdvSIMDScalar : public MachineFunctionPass { MachineRegisterInfo *MRI; @@ -82,12 +88,14 @@ private: public: static char ID; // Pass identification, replacement for typeid. 
- explicit AArch64AdvSIMDScalar() : MachineFunctionPass(ID) {} + explicit AArch64AdvSIMDScalar() : MachineFunctionPass(ID) { + initializeAArch64AdvSIMDScalarPass(*PassRegistry::getPassRegistry()); + } bool runOnMachineFunction(MachineFunction &F) override; const char *getPassName() const override { - return "AdvSIMD Scalar Operation Optimization"; + return AARCH64_ADVSIMD_NAME; } void getAnalysisUsage(AnalysisUsage &AU) const override { @@ -98,6 +106,9 @@ public: char AArch64AdvSIMDScalar::ID = 0; } // end anonymous namespace +INITIALIZE_PASS(AArch64AdvSIMDScalar, "aarch64-simd-scalar", + AARCH64_ADVSIMD_NAME, false, false) + static bool isGPR64(unsigned Reg, unsigned SubReg, const MachineRegisterInfo *MRI) { if (SubReg) @@ -381,7 +392,7 @@ bool AArch64AdvSIMDScalar::runOnMachineFunction(MachineFunction &mf) { // Just check things on a one-block-at-a-time basis. for (MachineFunction::iterator I = mf.begin(), E = mf.end(); I != E; ++I) - if (processMachineBasicBlock(I)) + if (processMachineBasicBlock(&*I)) Changed = true; return Changed; } diff --git a/contrib/llvm/lib/Target/AArch64/AArch64BranchRelaxation.cpp b/contrib/llvm/lib/Target/AArch64/AArch64BranchRelaxation.cpp index d973234dd86a..a614f555a4e9 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64BranchRelaxation.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64BranchRelaxation.cpp @@ -45,6 +45,12 @@ BCCDisplacementBits("aarch64-bcc-offset-bits", cl::Hidden, cl::init(19), STATISTIC(NumSplit, "Number of basic blocks split"); STATISTIC(NumRelaxed, "Number of conditional branches relaxed"); +namespace llvm { +void initializeAArch64BranchRelaxationPass(PassRegistry &); +} + +#define AARCH64_BR_RELAX_NAME "AArch64 branch relaxation pass" + namespace { class AArch64BranchRelaxation : public MachineFunctionPass { /// BasicBlockInfo - Information about the offset and size of a single @@ -93,17 +99,22 @@ class AArch64BranchRelaxation : public MachineFunctionPass { public: static char ID; - AArch64BranchRelaxation() : MachineFunctionPass(ID) {} + AArch64BranchRelaxation() : MachineFunctionPass(ID) { + initializeAArch64BranchRelaxationPass(*PassRegistry::getPassRegistry()); + } bool runOnMachineFunction(MachineFunction &MF) override; const char *getPassName() const override { - return "AArch64 branch relaxation pass"; + return AARCH64_BR_RELAX_NAME; } }; char AArch64BranchRelaxation::ID = 0; } +INITIALIZE_PASS(AArch64BranchRelaxation, "aarch64-branch-relax", + AARCH64_BR_RELAX_NAME, false, false) + /// verify - check BBOffsets, BBSizes, alignment of islands void AArch64BranchRelaxation::verify() { #ifndef NDEBUG @@ -131,14 +142,14 @@ void AArch64BranchRelaxation::dumpBBs() { /// into the block immediately after it. static bool BBHasFallthrough(MachineBasicBlock *MBB) { // Get the next machine basic block in the function. - MachineFunction::iterator MBBI = MBB; + MachineFunction::iterator MBBI(MBB); // Can't fall off end of function. - MachineBasicBlock *NextBB = std::next(MBBI); + auto NextBB = std::next(MBBI); if (NextBB == MBB->getParent()->end()) return false; for (MachineBasicBlock *S : MBB->successors()) - if (S == NextBB) + if (S == &*NextBB) return true; return false; @@ -216,9 +227,7 @@ AArch64BranchRelaxation::splitBlockBeforeInstr(MachineInstr *MI) { // Create a new MBB for the code after the OrigBB. 
MachineBasicBlock *NewBB = MF->CreateMachineBasicBlock(OrigBB->getBasicBlock()); - MachineFunction::iterator MBBI = OrigBB; - ++MBBI; - MF->insert(MBBI, NewBB); + MF->insert(++OrigBB->getIterator(), NewBB); // Splice the instructions starting with MI over to NewBB. NewBB->splice(NewBB->end(), OrigBB, MI, OrigBB->end()); @@ -421,7 +430,7 @@ bool AArch64BranchRelaxation::fixupConditionalBranch(MachineInstr *MI) { MBB->replaceSuccessor(FBB, NewBB); NewBB->addSuccessor(FBB); } - MachineBasicBlock *NextBB = std::next(MachineFunction::iterator(MBB)); + MachineBasicBlock *NextBB = &*std::next(MachineFunction::iterator(MBB)); DEBUG(dbgs() << " Insert B to BB#" << DestBB->getNumber() << ", invert condition and change dest. to BB#" diff --git a/contrib/llvm/lib/Target/AArch64/AArch64CallingConvention.h b/contrib/llvm/lib/Target/AArch64/AArch64CallingConvention.h index 1e2d1c3b93bd..bc44bc5f2461 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64CallingConvention.h +++ b/contrib/llvm/lib/Target/AArch64/AArch64CallingConvention.h @@ -25,30 +25,28 @@ namespace { using namespace llvm; -static const uint16_t XRegList[] = {AArch64::X0, AArch64::X1, AArch64::X2, - AArch64::X3, AArch64::X4, AArch64::X5, - AArch64::X6, AArch64::X7}; -static const uint16_t HRegList[] = {AArch64::H0, AArch64::H1, AArch64::H2, - AArch64::H3, AArch64::H4, AArch64::H5, - AArch64::H6, AArch64::H7}; -static const uint16_t SRegList[] = {AArch64::S0, AArch64::S1, AArch64::S2, - AArch64::S3, AArch64::S4, AArch64::S5, - AArch64::S6, AArch64::S7}; -static const uint16_t DRegList[] = {AArch64::D0, AArch64::D1, AArch64::D2, - AArch64::D3, AArch64::D4, AArch64::D5, - AArch64::D6, AArch64::D7}; -static const uint16_t QRegList[] = {AArch64::Q0, AArch64::Q1, AArch64::Q2, - AArch64::Q3, AArch64::Q4, AArch64::Q5, - AArch64::Q6, AArch64::Q7}; +static const MCPhysReg XRegList[] = {AArch64::X0, AArch64::X1, AArch64::X2, + AArch64::X3, AArch64::X4, AArch64::X5, + AArch64::X6, AArch64::X7}; +static const MCPhysReg HRegList[] = {AArch64::H0, AArch64::H1, AArch64::H2, + AArch64::H3, AArch64::H4, AArch64::H5, + AArch64::H6, AArch64::H7}; +static const MCPhysReg SRegList[] = {AArch64::S0, AArch64::S1, AArch64::S2, + AArch64::S3, AArch64::S4, AArch64::S5, + AArch64::S6, AArch64::S7}; +static const MCPhysReg DRegList[] = {AArch64::D0, AArch64::D1, AArch64::D2, + AArch64::D3, AArch64::D4, AArch64::D5, + AArch64::D6, AArch64::D7}; +static const MCPhysReg QRegList[] = {AArch64::Q0, AArch64::Q1, AArch64::Q2, + AArch64::Q3, AArch64::Q4, AArch64::Q5, + AArch64::Q6, AArch64::Q7}; static bool finishStackBlock(SmallVectorImpl<CCValAssign> &PendingMembers, MVT LocVT, ISD::ArgFlagsTy &ArgFlags, CCState &State, unsigned SlotAlign) { unsigned Size = LocVT.getSizeInBits() / 8; - unsigned StackAlign = State.getMachineFunction() - .getTarget() - .getDataLayout() - ->getStackAlignment(); + unsigned StackAlign = + State.getMachineFunction().getDataLayout().getStackAlignment(); unsigned Align = std::min(ArgFlags.getOrigAlign(), StackAlign); for (auto &It : PendingMembers) { @@ -88,7 +86,7 @@ static bool CC_AArch64_Custom_Block(unsigned &ValNo, MVT &ValVT, MVT &LocVT, ISD::ArgFlagsTy &ArgFlags, CCState &State) { // Try to allocate a contiguous block of registers, each of the correct // size to hold one member. 
- ArrayRef<uint16_t> RegList; + ArrayRef<MCPhysReg> RegList; if (LocVT.SimpleTy == MVT::i64) RegList = XRegList; else if (LocVT.SimpleTy == MVT::f16) diff --git a/contrib/llvm/lib/Target/AArch64/AArch64CallingConvention.td b/contrib/llvm/lib/Target/AArch64/AArch64CallingConvention.td index 815ebef177d8..388d64ec4e99 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64CallingConvention.td +++ b/contrib/llvm/lib/Target/AArch64/AArch64CallingConvention.td @@ -16,7 +16,7 @@ class CCIfAlign<string Align, CCAction A> : CCIf<!strconcat("ArgFlags.getOrigAlign() == ", Align), A>; /// CCIfBigEndian - Match only if we're in big endian mode. class CCIfBigEndian<CCAction A> : - CCIf<"State.getMachineFunction().getTarget().getDataLayout()->isBigEndian()", A>; + CCIf<"State.getMachineFunction().getDataLayout().isBigEndian()", A>; //===----------------------------------------------------------------------===// // ARM AAPCS64 Calling Convention @@ -279,6 +279,23 @@ def CSR_AArch64_TLS_Darwin FP, (sequence "Q%u", 0, 31))>; +// We can only handle a register pair with adjacent registers, the register pair +// should belong to the same class as well. Since the access function on the +// fast path calls a function that follows CSR_AArch64_TLS_Darwin, +// CSR_AArch64_CXX_TLS_Darwin should be a subset of CSR_AArch64_TLS_Darwin. +def CSR_AArch64_CXX_TLS_Darwin + : CalleeSavedRegs<(add CSR_AArch64_AAPCS, + (sub (sequence "X%u", 1, 28), X15, X16, X17, X18), + (sequence "D%u", 0, 31))>; + +// CSRs that are handled by prologue, epilogue. +def CSR_AArch64_CXX_TLS_Darwin_PE + : CalleeSavedRegs<(add LR, FP)>; + +// CSRs that are handled explicitly via copies. +def CSR_AArch64_CXX_TLS_Darwin_ViaCopy + : CalleeSavedRegs<(sub CSR_AArch64_CXX_TLS_Darwin, LR, FP)>; + // The ELF stub used for TLS-descriptor access saves every feasible // register. Only X0 and LR are clobbered. def CSR_AArch64_TLS_ELF diff --git a/contrib/llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp b/contrib/llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp index 06ff9af37fd7..9310ac4a44a2 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp @@ -117,10 +117,10 @@ struct LDTLSCleanup : public MachineFunctionPass { *TLSBaseAddrReg = RegInfo.createVirtualRegister(&AArch64::GPR64RegClass); // Insert a copy from X0 to TLSBaseAddrReg for later. 
- MachineInstr *Next = I->getNextNode(); - MachineInstr *Copy = BuildMI(*I->getParent(), Next, I->getDebugLoc(), - TII->get(TargetOpcode::COPY), - *TLSBaseAddrReg).addReg(AArch64::X0); + MachineInstr *Copy = + BuildMI(*I->getParent(), ++I->getIterator(), I->getDebugLoc(), + TII->get(TargetOpcode::COPY), *TLSBaseAddrReg) + .addReg(AArch64::X0); return Copy; } diff --git a/contrib/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp b/contrib/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp index efdb2e33a36e..78c239b11ef3 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp @@ -168,6 +168,8 @@ namespace llvm { void initializeAArch64CollectLOHPass(PassRegistry &); } +#define AARCH64_COLLECT_LOH_NAME "AArch64 Collect Linker Optimization Hint (LOH)" + namespace { struct AArch64CollectLOH : public MachineFunctionPass { static char ID; @@ -178,7 +180,7 @@ struct AArch64CollectLOH : public MachineFunctionPass { bool runOnMachineFunction(MachineFunction &MF) override; const char *getPassName() const override { - return "AArch64 Collect Linker Optimization Hint (LOH)"; + return AARCH64_COLLECT_LOH_NAME; } void getAnalysisUsage(AnalysisUsage &AU) const override { @@ -220,12 +222,10 @@ typedef SmallVector<unsigned, 32> MapIdToReg; char AArch64CollectLOH::ID = 0; INITIALIZE_PASS_BEGIN(AArch64CollectLOH, "aarch64-collect-loh", - "AArch64 Collect Linker Optimization Hint (LOH)", false, - false) + AARCH64_COLLECT_LOH_NAME, false, false) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) INITIALIZE_PASS_END(AArch64CollectLOH, "aarch64-collect-loh", - "AArch64 Collect Linker Optimization Hint (LOH)", false, - false) + AARCH64_COLLECT_LOH_NAME, false, false) /// Given a couple (MBB, reg) get the corresponding set of instruction from /// the given "sets". @@ -353,9 +353,17 @@ static void initReachingDef(const MachineFunction &MF, for (MCRegAliasIterator AI(CurReg, TRI, true); AI.isValid(); ++AI) { MapRegToId::const_iterator ItRegId = RegToId.find(*AI); - assert(ItRegId != RegToId.end() && - "Sub-register of an " - "involved register, not recorded as involved!"); + // If this alias has not been recorded, then it is not interesting + // for the current analysis. + // We can end up in this situation because of tuple registers. + // E.g., Let say we are interested in S1. When we register + // S1, we will also register its aliases and in particular + // the tuple Q1_Q2. + // Now, when we encounter Q1_Q2, we will look through its aliases + // and will find that S2 is not registered. + if (ItRegId == RegToId.end()) + continue; + BBKillSet.set(ItRegId->second); BBGen[ItRegId->second] = &MI; } @@ -523,6 +531,8 @@ static bool isCandidateStore(const MachineInstr *Instr) { switch (Instr->getOpcode()) { default: return false; + case AArch64::STRBBui: + case AArch64::STRHHui: case AArch64::STRBui: case AArch64::STRHui: case AArch64::STRWui: @@ -884,7 +894,8 @@ static void computeOthers(const InstrToInstrs &UseToDefs, bool IsL2Add = (ImmediateDefOpc == AArch64::ADDXri); // If the chain is three instructions long and ldr is the second element, // then this ldr must load form GOT, otherwise this is not a correct chain. 
- if (L2 && !IsL2Add && L2->getOperand(2).getTargetFlags() != AArch64II::MO_GOT) + if (L2 && !IsL2Add && + !(L2->getOperand(2).getTargetFlags() & AArch64II::MO_GOT)) continue; SmallVector<const MachineInstr *, 3> Args; MCLOHType Kind; @@ -1000,7 +1011,8 @@ static void collectInvolvedReg(const MachineFunction &MF, MapRegToId &RegToId, DEBUG(dbgs() << "** Collect Involved Register\n"); for (const auto &MBB : MF) { for (const MachineInstr &MI : MBB) { - if (!canDefBePartOfLOH(&MI)) + if (!canDefBePartOfLOH(&MI) && + !isCandidateLoad(&MI) && !isCandidateStore(&MI)) continue; // Process defs diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp b/contrib/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp index b9e41c61defe..fc27bfee73d1 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp @@ -59,6 +59,7 @@ //===----------------------------------------------------------------------===// #include "AArch64.h" +#include "MCTargetDesc/AArch64AddressingModes.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" @@ -153,13 +154,20 @@ MachineInstr *AArch64ConditionOptimizer::findSuitableCompare( case AArch64::SUBSXri: // cmn is an alias for adds with a dead destination register. case AArch64::ADDSWri: - case AArch64::ADDSXri: - if (MRI->use_empty(I->getOperand(0).getReg())) - return I; - - DEBUG(dbgs() << "Destination of cmp is not dead, " << *I << '\n'); - return nullptr; - + case AArch64::ADDSXri: { + unsigned ShiftAmt = AArch64_AM::getShiftValue(I->getOperand(3).getImm()); + if (!I->getOperand(2).isImm()) { + DEBUG(dbgs() << "Immediate of cmp is symbolic, " << *I << '\n'); + return nullptr; + } else if (I->getOperand(2).getImm() << ShiftAmt >= 0xfff) { + DEBUG(dbgs() << "Immediate of cmp may be out of range, " << *I << '\n'); + return nullptr; + } else if (!MRI->use_empty(I->getOperand(0).getReg())) { + DEBUG(dbgs() << "Destination of cmp is not dead, " << *I << '\n'); + return nullptr; + } + return I; + } // Prevent false positive case like: // cmp w19, #0 // cinc w0, w19, gt diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp b/contrib/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp index 2b0c92fe02d5..df1320fbd4c9 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp @@ -353,7 +353,7 @@ MachineInstr *SSACCmpConv::findConvertibleCompare(MachineBasicBlock *MBB) { MIOperands::PhysRegInfo PRI = MIOperands(I).analyzePhysReg(AArch64::NZCV, TRI); - if (PRI.Reads) { + if (PRI.Read) { // The ccmp doesn't produce exactly the same flags as the original // compare, so reject the transform if there are uses of the flags // besides the terminators. @@ -362,7 +362,7 @@ MachineInstr *SSACCmpConv::findConvertibleCompare(MachineBasicBlock *MBB) { return nullptr; } - if (PRI.Clobbers) { + if (PRI.Defined || PRI.Clobbered) { DEBUG(dbgs() << "Not convertible compare: " << *I); ++NumUnknNZCVDefs; return nullptr; @@ -567,8 +567,8 @@ void SSACCmpConv::convert(SmallVectorImpl<MachineBasicBlock *> &RemovedBlocks) { // All CmpBB instructions are moved into Head, and CmpBB is deleted. // Update the CFG first. 
updateTailPHIs(); - Head->removeSuccessor(CmpBB); - CmpBB->removeSuccessor(Tail); + Head->removeSuccessor(CmpBB, true); + CmpBB->removeSuccessor(Tail, true); Head->transferSuccessorsAndUpdatePHIs(CmpBB); DebugLoc TermDL = Head->getFirstTerminator()->getDebugLoc(); TII->RemoveBranch(*Head); @@ -786,13 +786,13 @@ void AArch64ConditionalCompares::updateDomTree( // convert() removes CmpBB which was previously dominated by Head. // CmpBB children should be transferred to Head. MachineDomTreeNode *HeadNode = DomTree->getNode(CmpConv.Head); - for (unsigned i = 0, e = Removed.size(); i != e; ++i) { - MachineDomTreeNode *Node = DomTree->getNode(Removed[i]); + for (MachineBasicBlock *RemovedMBB : Removed) { + MachineDomTreeNode *Node = DomTree->getNode(RemovedMBB); assert(Node != HeadNode && "Cannot erase the head node"); assert(Node->getIDom() == HeadNode && "CmpBB should be dominated by Head"); while (Node->getNumChildren()) DomTree->changeImmediateDominator(Node->getChildren().back(), HeadNode); - DomTree->eraseNode(Removed[i]); + DomTree->eraseNode(RemovedMBB); } } @@ -801,8 +801,8 @@ void AArch64ConditionalCompares::updateLoops(ArrayRef<MachineBasicBlock *> Removed) { if (!Loops) return; - for (unsigned i = 0, e = Removed.size(); i != e; ++i) - Loops->removeBlock(Removed[i]); + for (MachineBasicBlock *RemovedMBB : Removed) + Loops->removeBlock(RemovedMBB); } /// Invalidate MachineTraceMetrics before if-conversion. @@ -899,7 +899,7 @@ bool AArch64ConditionalCompares::runOnMachineFunction(MachineFunction &MF) { Loops = getAnalysisIfAvailable<MachineLoopInfo>(); Traces = &getAnalysis<MachineTraceMetrics>(); MinInstr = nullptr; - MinSize = MF.getFunction()->hasFnAttribute(Attribute::MinSize); + MinSize = MF.getFunction()->optForMinSize(); bool Changed = false; CmpConv.runOnMachineFunction(MF); diff --git a/contrib/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp b/contrib/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp index 74fc167433f6..576cf4a74167 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp @@ -26,6 +26,12 @@ using namespace llvm; STATISTIC(NumDeadDefsReplaced, "Number of dead definitions replaced"); +namespace llvm { +void initializeAArch64DeadRegisterDefinitionsPass(PassRegistry &); +} + +#define AARCH64_DEAD_REG_DEF_NAME "AArch64 Dead register definitions" + namespace { class AArch64DeadRegisterDefinitions : public MachineFunctionPass { private: @@ -35,11 +41,14 @@ private: bool usesFrameIndex(const MachineInstr &MI); public: static char ID; // Pass identification, replacement for typeid. 
- explicit AArch64DeadRegisterDefinitions() : MachineFunctionPass(ID) {} + explicit AArch64DeadRegisterDefinitions() : MachineFunctionPass(ID) { + initializeAArch64DeadRegisterDefinitionsPass( + *PassRegistry::getPassRegistry()); + } bool runOnMachineFunction(MachineFunction &F) override; - const char *getPassName() const override { return "Dead register definitions"; } + const char *getPassName() const override { return AARCH64_DEAD_REG_DEF_NAME; } void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); @@ -49,6 +58,9 @@ public: char AArch64DeadRegisterDefinitions::ID = 0; } // end anonymous namespace +INITIALIZE_PASS(AArch64DeadRegisterDefinitions, "aarch64-dead-defs", + AARCH64_DEAD_REG_DEF_NAME, false, false) + bool AArch64DeadRegisterDefinitions::implicitlyDefinesOverlappingReg( unsigned Reg, const MachineInstr &MI) { for (const MachineOperand &MO : MI.implicit_operands()) diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/contrib/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp index c2470f747a38..d24e42a93763 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -22,18 +22,26 @@ #include "llvm/Support/MathExtras.h" using namespace llvm; +namespace llvm { +void initializeAArch64ExpandPseudoPass(PassRegistry &); +} + +#define AARCH64_EXPAND_PSEUDO_NAME "AArch64 pseudo instruction expansion pass" + namespace { class AArch64ExpandPseudo : public MachineFunctionPass { public: static char ID; - AArch64ExpandPseudo() : MachineFunctionPass(ID) {} + AArch64ExpandPseudo() : MachineFunctionPass(ID) { + initializeAArch64ExpandPseudoPass(*PassRegistry::getPassRegistry()); + } const AArch64InstrInfo *TII; bool runOnMachineFunction(MachineFunction &Fn) override; const char *getPassName() const override { - return "AArch64 pseudo instruction expansion pass"; + return AARCH64_EXPAND_PSEUDO_NAME; } private: @@ -45,6 +53,9 @@ private: char AArch64ExpandPseudo::ID = 0; } +INITIALIZE_PASS(AArch64ExpandPseudo, "aarch64-expand-pseudo", + AARCH64_EXPAND_PSEUDO_NAME, false, false) + /// \brief Transfer implicit operands on the pseudo instruction to the /// instructions created from the expansion. static void transferImpOps(MachineInstr &OldMI, MachineInstrBuilder &UseMI, diff --git a/contrib/llvm/lib/Target/AArch64/AArch64FastISel.cpp b/contrib/llvm/lib/Target/AArch64/AArch64FastISel.cpp index 072819836bb3..0ac4b39b0357 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64FastISel.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64FastISel.cpp @@ -523,7 +523,7 @@ bool AArch64FastISel::computeAddress(const Value *Obj, Address &Addr, Type *Ty) U = C; } - if (const PointerType *Ty = dyn_cast<PointerType>(Obj->getType())) + if (auto *Ty = dyn_cast<PointerType>(Obj->getType())) if (Ty->getAddressSpace() > 255) // Fast instruction selection doesn't support the special // address spaces. @@ -969,7 +969,7 @@ bool AArch64FastISel::simplifyAddress(Address &Addr, MVT VT) { // Cannot encode an offset register and an immediate offset in the same // instruction. Fold the immediate offset into the load/store instruction and - // emit an additonal add to take care of the offset register. + // emit an additional add to take care of the offset register. 
if (!ImmediateOffsetNeedsLowering && Addr.getOffset() && Addr.getOffsetReg()) RegisterOffsetNeedsLowering = true; @@ -1058,8 +1058,8 @@ void AArch64FastISel::addLoadStoreOperands(Address &Addr, // FIXME: We shouldn't be using getObjectSize/getObjectAlignment. The size // and alignment should be based on the VT. MMO = FuncInfo.MF->getMachineMemOperand( - MachinePointerInfo::getFixedStack(FI, Offset), Flags, - MFI.getObjectSize(FI), MFI.getObjectAlignment(FI)); + MachinePointerInfo::getFixedStack(*FuncInfo.MF, FI, Offset), Flags, + MFI.getObjectSize(FI), MFI.getObjectAlignment(FI)); // Now add the rest of the operands. MIB.addFrameIndex(FI).addImm(Offset); } else { @@ -1178,7 +1178,7 @@ unsigned AArch64FastISel::emitAddSub(bool UseAdd, MVT RetVT, const Value *LHS, } // Check if the mul can be folded into the instruction. - if (RHS->hasOneUse() && isValueAvailable(RHS)) + if (RHS->hasOneUse() && isValueAvailable(RHS)) { if (isMulPowOf2(RHS)) { const Value *MulLHS = cast<MulOperator>(RHS)->getOperand(0); const Value *MulRHS = cast<MulOperator>(RHS)->getOperand(1); @@ -1193,12 +1193,16 @@ unsigned AArch64FastISel::emitAddSub(bool UseAdd, MVT RetVT, const Value *LHS, if (!RHSReg) return 0; bool RHSIsKill = hasTrivialKill(MulLHS); - return emitAddSub_rs(UseAdd, RetVT, LHSReg, LHSIsKill, RHSReg, RHSIsKill, - AArch64_AM::LSL, ShiftVal, SetFlags, WantResult); + ResultReg = emitAddSub_rs(UseAdd, RetVT, LHSReg, LHSIsKill, RHSReg, + RHSIsKill, AArch64_AM::LSL, ShiftVal, SetFlags, + WantResult); + if (ResultReg) + return ResultReg; } + } // Check if the shift can be folded into the instruction. - if (RHS->hasOneUse() && isValueAvailable(RHS)) + if (RHS->hasOneUse() && isValueAvailable(RHS)) { if (const auto *SI = dyn_cast<BinaryOperator>(RHS)) { if (const auto *C = dyn_cast<ConstantInt>(SI->getOperand(1))) { AArch64_AM::ShiftExtendType ShiftType = AArch64_AM::InvalidShiftExtend; @@ -1214,12 +1218,15 @@ unsigned AArch64FastISel::emitAddSub(bool UseAdd, MVT RetVT, const Value *LHS, if (!RHSReg) return 0; bool RHSIsKill = hasTrivialKill(SI->getOperand(0)); - return emitAddSub_rs(UseAdd, RetVT, LHSReg, LHSIsKill, RHSReg, - RHSIsKill, ShiftType, ShiftVal, SetFlags, - WantResult); + ResultReg = emitAddSub_rs(UseAdd, RetVT, LHSReg, LHSIsKill, RHSReg, + RHSIsKill, ShiftType, ShiftVal, SetFlags, + WantResult); + if (ResultReg) + return ResultReg; } } } + } unsigned RHSReg = getRegForValue(RHS); if (!RHSReg) @@ -1323,6 +1330,10 @@ unsigned AArch64FastISel::emitAddSub_rs(bool UseAdd, MVT RetVT, unsigned LHSReg, if (RetVT != MVT::i32 && RetVT != MVT::i64) return 0; + // Don't deal with undefined shifts. + if (ShiftImm >= RetVT.getSizeInBits()) + return 0; + static const unsigned OpcTable[2][2][2] = { { { AArch64::SUBWrs, AArch64::SUBXrs }, { AArch64::ADDWrs, AArch64::ADDXrs } }, @@ -1360,6 +1371,9 @@ unsigned AArch64FastISel::emitAddSub_rx(bool UseAdd, MVT RetVT, unsigned LHSReg, if (RetVT != MVT::i32 && RetVT != MVT::i64) return 0; + if (ShiftImm >= 4) + return 0; + static const unsigned OpcTable[2][2][2] = { { { AArch64::SUBWrx, AArch64::SUBXrx }, { AArch64::ADDWrx, AArch64::ADDXrx } }, @@ -1542,7 +1556,7 @@ unsigned AArch64FastISel::emitLogicalOp(unsigned ISDOpc, MVT RetVT, return ResultReg; // Check if the mul can be folded into the instruction. 
- if (RHS->hasOneUse() && isValueAvailable(RHS)) + if (RHS->hasOneUse() && isValueAvailable(RHS)) { if (isMulPowOf2(RHS)) { const Value *MulLHS = cast<MulOperator>(RHS)->getOperand(0); const Value *MulRHS = cast<MulOperator>(RHS)->getOperand(1); @@ -1558,12 +1572,15 @@ unsigned AArch64FastISel::emitLogicalOp(unsigned ISDOpc, MVT RetVT, if (!RHSReg) return 0; bool RHSIsKill = hasTrivialKill(MulLHS); - return emitLogicalOp_rs(ISDOpc, RetVT, LHSReg, LHSIsKill, RHSReg, - RHSIsKill, ShiftVal); + ResultReg = emitLogicalOp_rs(ISDOpc, RetVT, LHSReg, LHSIsKill, RHSReg, + RHSIsKill, ShiftVal); + if (ResultReg) + return ResultReg; } + } // Check if the shift can be folded into the instruction. - if (RHS->hasOneUse() && isValueAvailable(RHS)) + if (RHS->hasOneUse() && isValueAvailable(RHS)) { if (const auto *SI = dyn_cast<ShlOperator>(RHS)) if (const auto *C = dyn_cast<ConstantInt>(SI->getOperand(1))) { uint64_t ShiftVal = C->getZExtValue(); @@ -1571,9 +1588,12 @@ unsigned AArch64FastISel::emitLogicalOp(unsigned ISDOpc, MVT RetVT, if (!RHSReg) return 0; bool RHSIsKill = hasTrivialKill(SI->getOperand(0)); - return emitLogicalOp_rs(ISDOpc, RetVT, LHSReg, LHSIsKill, RHSReg, - RHSIsKill, ShiftVal); + ResultReg = emitLogicalOp_rs(ISDOpc, RetVT, LHSReg, LHSIsKill, RHSReg, + RHSIsKill, ShiftVal); + if (ResultReg) + return ResultReg; } + } unsigned RHSReg = getRegForValue(RHS); if (!RHSReg) @@ -1646,6 +1666,11 @@ unsigned AArch64FastISel::emitLogicalOp_rs(unsigned ISDOpc, MVT RetVT, { AArch64::ORRWrs, AArch64::ORRXrs }, { AArch64::EORWrs, AArch64::EORXrs } }; + + // Don't deal with undefined shifts. + if (ShiftImm >= RetVT.getSizeInBits()) + return 0; + const TargetRegisterClass *RC; unsigned Opc; switch (RetVT.SimpleTy) { @@ -2235,14 +2260,7 @@ bool AArch64FastISel::emitCompareAndBranch(const BranchInst *BI) { MIB.addImm(TestBit); MIB.addMBB(TBB); - // Obtain the branch weight and add the TrueBB to the successor list. - uint32_t BranchWeight = 0; - if (FuncInfo.BPI) - BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(), - TBB->getBasicBlock()); - FuncInfo.MBB->addSuccessor(TBB, BranchWeight); - fastEmitBranch(FBB, DbgLoc); - + finishCondBranch(BI->getParent(), TBB, FBB); return true; } @@ -2257,7 +2275,6 @@ bool AArch64FastISel::selectBranch(const Instruction *I) { MachineBasicBlock *TBB = FuncInfo.MBBMap[BI->getSuccessor(0)]; MachineBasicBlock *FBB = FuncInfo.MBBMap[BI->getSuccessor(1)]; - AArch64CC::CondCode CC = AArch64CC::NE; if (const CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition())) { if (CI->hasOneUse() && isValueAvailable(CI)) { // Try to optimize or fold the cmp. @@ -2289,7 +2306,7 @@ bool AArch64FastISel::selectBranch(const Instruction *I) { // FCMP_UEQ and FCMP_ONE cannot be checked with a single branch // instruction. - CC = getCompareCC(Predicate); + AArch64CC::CondCode CC = getCompareCC(Predicate); AArch64CC::CondCode ExtraCC = AArch64CC::AL; switch (Predicate) { default: @@ -2317,52 +2334,7 @@ bool AArch64FastISel::selectBranch(const Instruction *I) { .addImm(CC) .addMBB(TBB); - // Obtain the branch weight and add the TrueBB to the successor list. 
- uint32_t BranchWeight = 0; - if (FuncInfo.BPI) - BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(), - TBB->getBasicBlock()); - FuncInfo.MBB->addSuccessor(TBB, BranchWeight); - - fastEmitBranch(FBB, DbgLoc); - return true; - } - } else if (TruncInst *TI = dyn_cast<TruncInst>(BI->getCondition())) { - MVT SrcVT; - if (TI->hasOneUse() && isValueAvailable(TI) && - isTypeSupported(TI->getOperand(0)->getType(), SrcVT)) { - unsigned CondReg = getRegForValue(TI->getOperand(0)); - if (!CondReg) - return false; - bool CondIsKill = hasTrivialKill(TI->getOperand(0)); - - // Issue an extract_subreg to get the lower 32-bits. - if (SrcVT == MVT::i64) { - CondReg = fastEmitInst_extractsubreg(MVT::i32, CondReg, CondIsKill, - AArch64::sub_32); - CondIsKill = true; - } - - unsigned ANDReg = emitAnd_ri(MVT::i32, CondReg, CondIsKill, 1); - assert(ANDReg && "Unexpected AND instruction emission failure."); - emitICmp_ri(MVT::i32, ANDReg, /*IsKill=*/true, 0); - - if (FuncInfo.MBB->isLayoutSuccessor(TBB)) { - std::swap(TBB, FBB); - CC = AArch64CC::EQ; - } - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::Bcc)) - .addImm(CC) - .addMBB(TBB); - - // Obtain the branch weight and add the TrueBB to the successor list. - uint32_t BranchWeight = 0; - if (FuncInfo.BPI) - BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(), - TBB->getBasicBlock()); - FuncInfo.MBB->addSuccessor(TBB, BranchWeight); - - fastEmitBranch(FBB, DbgLoc); + finishCondBranch(BI->getParent(), TBB, FBB); return true; } } else if (const auto *CI = dyn_cast<ConstantInt>(BI->getCondition())) { @@ -2371,34 +2343,31 @@ bool AArch64FastISel::selectBranch(const Instruction *I) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::B)) .addMBB(Target); - // Obtain the branch weight and add the target to the successor list. - uint32_t BranchWeight = 0; - if (FuncInfo.BPI) - BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(), - Target->getBasicBlock()); - FuncInfo.MBB->addSuccessor(Target, BranchWeight); + // Obtain the branch probability and add the target to the successor list. + if (FuncInfo.BPI) { + auto BranchProbability = FuncInfo.BPI->getEdgeProbability( + BI->getParent(), Target->getBasicBlock()); + FuncInfo.MBB->addSuccessor(Target, BranchProbability); + } else + FuncInfo.MBB->addSuccessorWithoutProb(Target); return true; - } else if (foldXALUIntrinsic(CC, I, BI->getCondition())) { - // Fake request the condition, otherwise the intrinsic might be completely - // optimized away. - unsigned CondReg = getRegForValue(BI->getCondition()); - if (!CondReg) - return false; - - // Emit the branch. - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::Bcc)) - .addImm(CC) - .addMBB(TBB); + } else { + AArch64CC::CondCode CC = AArch64CC::NE; + if (foldXALUIntrinsic(CC, I, BI->getCondition())) { + // Fake request the condition, otherwise the intrinsic might be completely + // optimized away. + unsigned CondReg = getRegForValue(BI->getCondition()); + if (!CondReg) + return false; - // Obtain the branch weight and add the TrueBB to the successor list. - uint32_t BranchWeight = 0; - if (FuncInfo.BPI) - BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(), - TBB->getBasicBlock()); - FuncInfo.MBB->addSuccessor(TBB, BranchWeight); + // Emit the branch. 
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::Bcc)) + .addImm(CC) + .addMBB(TBB); - fastEmitBranch(FBB, DbgLoc); - return true; + finishCondBranch(BI->getParent(), TBB, FBB); + return true; + } } unsigned CondReg = getRegForValue(BI->getCondition()); @@ -2406,32 +2375,22 @@ bool AArch64FastISel::selectBranch(const Instruction *I) { return false; bool CondRegIsKill = hasTrivialKill(BI->getCondition()); - // We've been divorced from our compare! Our block was split, and - // now our compare lives in a predecessor block. We musn't - // re-compare here, as the children of the compare aren't guaranteed - // live across the block boundary (we *could* check for this). - // Regardless, the compare has been done in the predecessor block, - // and it left a value for us in a virtual register. Ergo, we test - // the one-bit value left in the virtual register. - emitICmp_ri(MVT::i32, CondReg, CondRegIsKill, 0); - + // i1 conditions come as i32 values, test the lowest bit with tb(n)z. + unsigned Opcode = AArch64::TBNZW; if (FuncInfo.MBB->isLayoutSuccessor(TBB)) { std::swap(TBB, FBB); - CC = AArch64CC::EQ; + Opcode = AArch64::TBZW; } - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::Bcc)) - .addImm(CC) + const MCInstrDesc &II = TII.get(Opcode); + unsigned ConstrainedCondReg + = constrainOperandRegClass(II, CondReg, II.getNumDefs()); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II) + .addReg(ConstrainedCondReg, getKillRegState(CondRegIsKill)) + .addImm(0) .addMBB(TBB); - // Obtain the branch weight and add the TrueBB to the successor list. - uint32_t BranchWeight = 0; - if (FuncInfo.BPI) - BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(), - TBB->getBasicBlock()); - FuncInfo.MBB->addSuccessor(TBB, BranchWeight); - - fastEmitBranch(FBB, DbgLoc); + finishCondBranch(BI->getParent(), TBB, FBB); return true; } @@ -2447,8 +2406,8 @@ bool AArch64FastISel::selectIndirectBr(const Instruction *I) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II).addReg(AddrReg); // Make sure the CFG is up-to-date. - for (unsigned i = 0, e = BI->getNumSuccessors(); i != e; ++i) - FuncInfo.MBB->addSuccessor(FuncInfo.MBBMap[BI->getSuccessor(i)]); + for (auto *Succ : BI->successors()) + FuncInfo.MBB->addSuccessor(FuncInfo.MBBMap[Succ]); return true; } @@ -2456,6 +2415,10 @@ bool AArch64FastISel::selectIndirectBr(const Instruction *I) { bool AArch64FastISel::selectCmp(const Instruction *I) { const CmpInst *CI = cast<CmpInst>(I); + // Vectors of i1 are weird: bail out. + if (CI->getType()->isVectorTy()) + return false; + // Try to optimize or fold the cmp. CmpInst::Predicate Predicate = optimizeCmpPredicate(CI); unsigned ResultReg = 0; @@ -2954,8 +2917,7 @@ bool AArch64FastISel::processCallArgs(CallLoweringInfo &CLI, .addImm(NumBytes); // Process the args. 
- for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { - CCValAssign &VA = ArgLocs[i]; + for (CCValAssign &VA : ArgLocs) { const Value *ArgVal = CLI.OutVals[VA.getValNo()]; MVT ArgVT = OutVTs[VA.getValNo()]; @@ -3018,8 +2980,8 @@ bool AArch64FastISel::processCallArgs(CallLoweringInfo &CLI, unsigned Alignment = DL.getABITypeAlignment(ArgVal->getType()); MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand( - MachinePointerInfo::getStack(Addr.getOffset()), - MachineMemOperand::MOStore, ArgVT.getStoreSize(), Alignment); + MachinePointerInfo::getStack(*FuncInfo.MF, Addr.getOffset()), + MachineMemOperand::MOStore, ArgVT.getStoreSize(), Alignment); if (!emitStore(ArgVT, ArgReg, Addr, MMO)) return false; @@ -3318,8 +3280,8 @@ bool AArch64FastISel::foldXALUIntrinsic(AArch64CC::CondCode &CC, return false; // Make sure nothing is in the way - BasicBlock::const_iterator Start = I; - BasicBlock::const_iterator End = II; + BasicBlock::const_iterator Start(I); + BasicBlock::const_iterator End(II); for (auto Itr = std::prev(Start); Itr != End; --Itr) { // We only expect extractvalue instructions between the intrinsic and the // instruction to be selected. @@ -3684,6 +3646,9 @@ bool AArch64FastISel::selectRet(const Instruction *I) { if (F.isVarArg()) return false; + if (TLI.supportSplitCSR(FuncInfo.MF)) + return false; + // Build a list of return value registers. SmallVector<unsigned, 4> RetRegs; @@ -3763,8 +3728,8 @@ bool AArch64FastISel::selectRet(const Instruction *I) { MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::RET_ReallyLR)); - for (unsigned i = 0, e = RetRegs.size(); i != e; ++i) - MIB.addReg(RetRegs[i], RegState::Implicit); + for (unsigned RetReg : RetRegs) + MIB.addReg(RetReg, RegState::Implicit); return true; } diff --git a/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index a76473f7e539..11ae8005370d 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -72,9 +72,9 @@ // // For most functions, some of the frame areas are empty. For those functions, // it may not be necessary to set up fp or bp: -// * A base pointer is definitly needed when there are both VLAs and local +// * A base pointer is definitely needed when there are both VLAs and local // variables with more-than-default alignment requirements. -// * A frame pointer is definitly needed when there are local variables with +// * A frame pointer is definitely needed when there are local variables with // more-than-default alignment requirements. // // In some cases when a base pointer is not strictly needed, it is generated @@ -216,11 +216,11 @@ void AArch64FrameLowering::emitCalleeSavedFrameMoves( if (CSI.empty()) return; - const DataLayout *TD = MF.getTarget().getDataLayout(); + const DataLayout &TD = MF.getDataLayout(); bool HasFP = hasFP(MF); // Calculate amount of bytes used for return address storing. - int stackGrowth = -TD->getPointerSize(0); + int stackGrowth = -TD.getPointerSize(0); // Calculate offsets. int64_t saveAreaOffset = (HasFP ? 
2 : 1) * stackGrowth; @@ -280,14 +280,17 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, MachineBasicBlock::iterator MBBI = MBB.begin(); const MachineFrameInfo *MFI = MF.getFrameInfo(); const Function *Fn = MF.getFunction(); - const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>( - MF.getSubtarget().getRegisterInfo()); - const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); + const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>(); + const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); + const TargetInstrInfo *TII = Subtarget.getInstrInfo(); MachineModuleInfo &MMI = MF.getMMI(); AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); bool needsFrameMoves = MMI.hasDebugInfo() || Fn->needsUnwindTableEntry(); bool HasFP = hasFP(MF); - DebugLoc DL = MBB.findDebugLoc(MBBI); + + // Debug location must be unknown since the first debug location is used + // to determine the end of the prologue. + DebugLoc DL; // All calls are tail calls in GHC calling conv, and functions have no // prologue/epilogue. @@ -354,7 +357,6 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, if (NumBytes && NeedsRealignment) { // Use the first callee-saved register as a scratch register. scratchSPReg = AArch64::X9; - MF.getRegInfo().setPhysRegUsed(scratchSPReg); } // If we're a leaf function, try using the red zone. @@ -400,8 +402,8 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, } if (needsFrameMoves) { - const DataLayout *TD = MF.getTarget().getDataLayout(); - const int StackGrowth = -TD->getPointerSize(0); + const DataLayout &TD = MF.getDataLayout(); + const int StackGrowth = -TD.getPointerSize(0); unsigned FramePtr = RegInfo->getFrameRegister(MF); // An example of the prologue: // @@ -513,33 +515,33 @@ static bool isCalleeSavedRegister(unsigned Reg, const MCPhysReg *CSRegs) { return false; } -static bool isCSRestore(MachineInstr *MI, const MCPhysReg *CSRegs) { +/// Checks whether the given instruction restores callee save registers +/// and if so returns how many. 
+static unsigned getNumCSRestores(MachineInstr &MI, const MCPhysReg *CSRegs) { unsigned RtIdx = 0; - if (MI->getOpcode() == AArch64::LDPXpost || - MI->getOpcode() == AArch64::LDPDpost) + switch (MI.getOpcode()) { + case AArch64::LDPXpost: + case AArch64::LDPDpost: RtIdx = 1; - - if (MI->getOpcode() == AArch64::LDPXpost || - MI->getOpcode() == AArch64::LDPDpost || - MI->getOpcode() == AArch64::LDPXi || MI->getOpcode() == AArch64::LDPDi) { - if (!isCalleeSavedRegister(MI->getOperand(RtIdx).getReg(), CSRegs) || - !isCalleeSavedRegister(MI->getOperand(RtIdx + 1).getReg(), CSRegs) || - MI->getOperand(RtIdx + 2).getReg() != AArch64::SP) - return false; - return true; + // FALLTHROUGH + case AArch64::LDPXi: + case AArch64::LDPDi: + if (!isCalleeSavedRegister(MI.getOperand(RtIdx).getReg(), CSRegs) || + !isCalleeSavedRegister(MI.getOperand(RtIdx + 1).getReg(), CSRegs) || + MI.getOperand(RtIdx + 2).getReg() != AArch64::SP) + return 0; + return 2; } - - return false; + return 0; } void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const { MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr(); MachineFrameInfo *MFI = MF.getFrameInfo(); - const AArch64InstrInfo *TII = - static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo()); - const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>( - MF.getSubtarget().getRegisterInfo()); + const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>(); + const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); + const TargetInstrInfo *TII = Subtarget.getInstrInfo(); DebugLoc DL; bool IsTailCallReturn = false; if (MBB.end() != MBBI) { @@ -585,7 +587,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, // ---------------------| --- | // | | | | // | CalleeSavedReg | | | - // | (NumRestores * 16) | | | + // | (NumRestores * 8) | | | // | | | | // ---------------------| | NumBytes // | | StackSize (StackAdjustUp) @@ -606,17 +608,17 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, // Move past the restores of the callee-saved registers. MachineBasicBlock::iterator LastPopI = MBB.getFirstTerminator(); const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF); - if (LastPopI != MBB.begin()) { - do { - ++NumRestores; - --LastPopI; - } while (LastPopI != MBB.begin() && isCSRestore(LastPopI, CSRegs)); - if (!isCSRestore(LastPopI, CSRegs)) { + MachineBasicBlock::iterator Begin = MBB.begin(); + while (LastPopI != Begin) { + --LastPopI; + unsigned Restores = getNumCSRestores(*LastPopI, CSRegs); + NumRestores += Restores; + if (Restores == 0) { ++LastPopI; - --NumRestores; + break; } } - NumBytes -= NumRestores * 16; + NumBytes -= NumRestores * 8; assert(NumBytes >= 0 && "Negative stack allocation size!?"); if (!hasFP(MF)) { @@ -634,15 +636,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, // be able to save any instructions. if (NumBytes || MFI->hasVarSizedObjects()) emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::FP, - -(NumRestores - 1) * 16, TII, MachineInstr::NoFlags); -} - -/// getFrameIndexOffset - Returns the displacement from the frame register to -/// the stack frame of the specified index. 
-int AArch64FrameLowering::getFrameIndexOffset(const MachineFunction &MF, - int FI) const { - unsigned FrameReg; - return getFrameIndexReference(MF, FI, FrameReg); + -(NumRestores - 2) * 8, TII, MachineInstr::NoFlags); } /// getFrameIndexReference - Provide a base+offset reference to an FI slot for @@ -739,9 +733,6 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters( DebugLoc DL; assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!"); - if (MI != MBB.end()) - DL = MI->getDebugLoc(); - for (unsigned i = 0; i < Count; i += 2) { unsigned idx = Count - i - 2; unsigned Reg1 = CSI[idx].getReg(); @@ -911,7 +902,7 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, unsigned NumFPRSpilled = 0; bool ExtraCSSpill = false; bool CanEliminateFrame = true; - DEBUG(dbgs() << "*** processFunctionBeforeCalleeSavedScan\nUsed CSRs:"); + DEBUG(dbgs() << "*** determineCalleeSaves\nUsed CSRs:"); const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF); // Check pairs of consecutive callee-saved registers. diff --git a/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.h b/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.h index 731f031ff855..427afdf4acbf 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.h +++ b/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.h @@ -37,7 +37,6 @@ public: void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override; void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override; - int getFrameIndexOffset(const MachineFunction &MF, int FI) const override; int getFrameIndexReference(const MachineFunction &MF, int FI, unsigned &FrameReg) const override; int resolveFrameIndexReference(const MachineFunction &MF, int FI, @@ -61,6 +60,11 @@ public: void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs, RegScavenger *RS) const override; + + /// Returns true if the target will correctly handle shrink wrapping. + bool enableShrinkWrapping(const MachineFunction &MF) const override { + return true; + } }; } // End llvm namespace diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/contrib/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index 772e894f4f0a..6c868880bcac 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -34,7 +34,6 @@ using namespace llvm; namespace { class AArch64DAGToDAGISel : public SelectionDAGISel { - AArch64TargetMachine &TM; /// Subtarget - Keep a pointer to the AArch64Subtarget around so that we can /// make the right decision when generating code for different targets. 
@@ -45,7 +44,7 @@ class AArch64DAGToDAGISel : public SelectionDAGISel { public: explicit AArch64DAGToDAGISel(AArch64TargetMachine &tm, CodeGenOpt::Level OptLevel) - : SelectionDAGISel(tm, OptLevel), TM(tm), Subtarget(nullptr), + : SelectionDAGISel(tm, OptLevel), Subtarget(nullptr), ForCodeSize(false) {} const char *getPassName() const override { @@ -53,9 +52,7 @@ public: } bool runOnMachineFunction(MachineFunction &MF) override { - ForCodeSize = - MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize) || - MF.getFunction()->hasFnAttribute(Attribute::MinSize); + ForCodeSize = MF.getFunction()->optForSize(); Subtarget = &MF.getSubtarget<AArch64Subtarget>(); return SelectionDAGISel::runOnMachineFunction(MF); } @@ -79,6 +76,21 @@ public: bool SelectLogicalShiftedRegister(SDValue N, SDValue &Reg, SDValue &Shift) { return SelectShiftedRegister(N, true, Reg, Shift); } + bool SelectAddrModeIndexed7S8(SDValue N, SDValue &Base, SDValue &OffImm) { + return SelectAddrModeIndexed7S(N, 1, Base, OffImm); + } + bool SelectAddrModeIndexed7S16(SDValue N, SDValue &Base, SDValue &OffImm) { + return SelectAddrModeIndexed7S(N, 2, Base, OffImm); + } + bool SelectAddrModeIndexed7S32(SDValue N, SDValue &Base, SDValue &OffImm) { + return SelectAddrModeIndexed7S(N, 4, Base, OffImm); + } + bool SelectAddrModeIndexed7S64(SDValue N, SDValue &Base, SDValue &OffImm) { + return SelectAddrModeIndexed7S(N, 8, Base, OffImm); + } + bool SelectAddrModeIndexed7S128(SDValue N, SDValue &Base, SDValue &OffImm) { + return SelectAddrModeIndexed7S(N, 16, Base, OffImm); + } bool SelectAddrModeIndexed8(SDValue N, SDValue &Base, SDValue &OffImm) { return SelectAddrModeIndexed(N, 1, Base, OffImm); } @@ -153,8 +165,7 @@ public: SDNode *SelectBitfieldExtractOp(SDNode *N); SDNode *SelectBitfieldInsertOp(SDNode *N); - - SDNode *SelectLIBM(SDNode *N); + SDNode *SelectBitfieldInsertInZeroOp(SDNode *N); SDNode *SelectReadRegister(SDNode *N); SDNode *SelectWriteRegister(SDNode *N); @@ -165,6 +176,8 @@ public: private: bool SelectShiftedRegister(SDValue N, bool AllowROR, SDValue &Reg, SDValue &Shift); + bool SelectAddrModeIndexed7S(SDValue N, unsigned Size, SDValue &Base, + SDValue &OffImm); bool SelectAddrModeIndexed(SDValue N, unsigned Size, SDValue &Base, SDValue &OffImm); bool SelectAddrModeUnscaled(SDValue N, unsigned Size, SDValue &Base, @@ -422,7 +435,7 @@ static bool checkHighLaneIndex(SDNode *DL, SDValue &LaneOp, int &LaneIdx) { return true; } -// Helper for SelectOpcV64LaneV128 - Recogzine operatinos where one operand is a +// Helper for SelectOpcV64LaneV128 - Recognize operations where one operand is a // high lane extract. static bool checkV64LaneV128(SDValue Op0, SDValue Op1, SDValue &StdOp, SDValue &LaneOp, int &LaneIdx) { @@ -572,7 +585,7 @@ bool AArch64DAGToDAGISel::SelectArithExtendedRegister(SDValue N, SDValue &Reg, } // AArch64 mandates that the RHS of the operation must use the smallest - // register classs that could contain the size being extended from. Thus, + // register class that could contain the size being extended from. Thus, // if we're folding a (sext i8), we need the RHS to be a GPR32, even though // there might not be an actual 32-bit value in the program. We can // (harmlessly) synthesize one by injected an EXTRACT_SUBREG here. @@ -587,7 +600,7 @@ bool AArch64DAGToDAGISel::SelectArithExtendedRegister(SDValue N, SDValue &Reg, /// need to create a real ADD instruction from it anyway and there's no point in /// folding it into the mem op. 
Theoretically, it shouldn't matter, but there's /// a single pseudo-instruction for an ADRP/ADD pair so over-aggressive folding -/// leads to duplaicated ADRP instructions. +/// leads to duplicated ADRP instructions. static bool isWorthFoldingADDlow(SDValue N) { for (auto Use : N->uses()) { if (Use->getOpcode() != ISD::LOAD && Use->getOpcode() != ISD::STORE && @@ -604,6 +617,51 @@ static bool isWorthFoldingADDlow(SDValue N) { return true; } +/// SelectAddrModeIndexed7S - Select a "register plus scaled signed 7-bit +/// immediate" address. The "Size" argument is the size in bytes of the memory +/// reference, which determines the scale. +bool AArch64DAGToDAGISel::SelectAddrModeIndexed7S(SDValue N, unsigned Size, + SDValue &Base, + SDValue &OffImm) { + SDLoc dl(N); + const DataLayout &DL = CurDAG->getDataLayout(); + const TargetLowering *TLI = getTargetLowering(); + if (N.getOpcode() == ISD::FrameIndex) { + int FI = cast<FrameIndexSDNode>(N)->getIndex(); + Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL)); + OffImm = CurDAG->getTargetConstant(0, dl, MVT::i64); + return true; + } + + // As opposed to the (12-bit) Indexed addressing mode below, the 7-bit signed + // selected here doesn't support labels/immediates, only base+offset. + + if (CurDAG->isBaseWithConstantOffset(N)) { + if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) { + int64_t RHSC = RHS->getSExtValue(); + unsigned Scale = Log2_32(Size); + if ((RHSC & (Size - 1)) == 0 && RHSC >= -(0x40 << Scale) && + RHSC < (0x40 << Scale)) { + Base = N.getOperand(0); + if (Base.getOpcode() == ISD::FrameIndex) { + int FI = cast<FrameIndexSDNode>(Base)->getIndex(); + Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL)); + } + OffImm = CurDAG->getTargetConstant(RHSC >> Scale, dl, MVT::i64); + return true; + } + } + } + + // Base only. The address will be materialized into a register before + // the memory is accessed. + // add x0, Xbase, #offset + // stp x1, x2, [x0] + Base = N; + OffImm = CurDAG->getTargetConstant(0, dl, MVT::i64); + return true; +} + /// SelectAddrModeIndexed - Select a "register plus scaled unsigned 12-bit /// immediate" address. The "Size" argument is the size in bytes of the memory /// reference, which determines the scale. @@ -867,7 +925,7 @@ bool AArch64DAGToDAGISel::SelectAddrModeXRO(SDValue N, unsigned Size, if (isa<ConstantSDNode>(RHS)) { int64_t ImmOff = (int64_t)cast<ConstantSDNode>(RHS)->getZExtValue(); unsigned Scale = Log2_32(Size); - // Skip the immediate can be seleced by load/store addressing mode. + // Skip the immediate can be selected by load/store addressing mode. // Also skip the immediate can be encoded by a single ADD (SUB is also // checked by using -ImmOff). if ((ImmOff % Size == 0 && ImmOff >= 0 && ImmOff < (0x1000 << Scale)) || @@ -1034,6 +1092,8 @@ SDNode *AArch64DAGToDAGISel::SelectIndexedLoad(SDNode *N, bool &Done) { // it into an i64. DstVT = MVT::i32; } + } else if (VT == MVT::f16) { + Opcode = IsPre ? AArch64::LDRHpre : AArch64::LDRHpost; } else if (VT == MVT::f32) { Opcode = IsPre ? 
AArch64::LDRSpre : AArch64::LDRSpost; } else if (VT == MVT::f64 || VT.is64BitVector()) { @@ -1222,8 +1282,8 @@ SDNode *AArch64DAGToDAGISel::SelectLoadLane(SDNode *N, unsigned NumVecs, SDValue SuperReg = SDValue(Ld, 0); EVT WideVT = RegSeq.getOperand(1)->getValueType(0); - static unsigned QSubs[] = { AArch64::qsub0, AArch64::qsub1, AArch64::qsub2, - AArch64::qsub3 }; + static const unsigned QSubs[] = { AArch64::qsub0, AArch64::qsub1, + AArch64::qsub2, AArch64::qsub3 }; for (unsigned i = 0; i < NumVecs; ++i) { SDValue NV = CurDAG->getTargetExtractSubreg(QSubs[i], dl, WideVT, SuperReg); if (Narrow) @@ -1275,8 +1335,8 @@ SDNode *AArch64DAGToDAGISel::SelectPostLoadLane(SDNode *N, unsigned NumVecs, Narrow ? NarrowVector(SuperReg, *CurDAG) : SuperReg); } else { EVT WideVT = RegSeq.getOperand(1)->getValueType(0); - static unsigned QSubs[] = { AArch64::qsub0, AArch64::qsub1, AArch64::qsub2, - AArch64::qsub3 }; + static const unsigned QSubs[] = { AArch64::qsub0, AArch64::qsub1, + AArch64::qsub2, AArch64::qsub3 }; for (unsigned i = 0; i < NumVecs; ++i) { SDValue NV = CurDAG->getTargetExtractSubreg(QSubs[i], dl, WideVT, SuperReg); @@ -1420,7 +1480,7 @@ static bool isBitfieldExtractOpFromAnd(SelectionDAG *CurDAG, SDNode *N, // The resulting code will be at least as good as the original one // plus it may expose more opportunities for bitfield insert pattern. // FIXME: Currently we limit this to the bigger pattern, because - // some optimizations expect AND and not UBFM + // some optimizations expect AND and not UBFM. Opd0 = N->getOperand(0); } else return false; @@ -1852,6 +1912,7 @@ static SDValue getLeftShift(SelectionDAG *CurDAG, SDValue Op, int ShlAmount) { /// Does this tree qualify as an attempt to move a bitfield into position, /// essentially "(and (shl VAL, N), Mask)". static bool isBitfieldPositioningOp(SelectionDAG *CurDAG, SDValue Op, + bool BiggerPattern, SDValue &Src, int &ShiftAmount, int &MaskWidth) { EVT VT = Op.getValueType(); @@ -1874,6 +1935,11 @@ static bool isBitfieldPositioningOp(SelectionDAG *CurDAG, SDValue Op, Op = Op.getOperand(0); } + // Don't match if the SHL has more than one use, since then we'll end up + // generating SHL+UBFIZ instead of just keeping SHL+AND. + if (!BiggerPattern && !Op.hasOneUse()) + return false; + uint64_t ShlImm; if (!isOpcWithIntImmediate(Op.getNode(), ISD::SHL, ShlImm)) return false; @@ -1887,7 +1953,11 @@ static bool isBitfieldPositioningOp(SelectionDAG *CurDAG, SDValue Op, // BFI encompasses sufficiently many nodes that it's worth inserting an extra // LSL/LSR if the mask in NonZeroBits doesn't quite match up with the ISD::SHL - // amount. + // amount. BiggerPattern is true when this pattern is being matched for BFI, + // BiggerPattern is false when this pattern is being matched for UBFIZ, in + // which case it is not profitable to insert an extra shift. 
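The new hasOneUse() guard in isBitfieldPositioningOp is there because folding the AND into a UBFIZ only pays off when the shift dies with it; if the SHL has another user it stays live anyway, and SHL+AND is no worse than SHL+UBFIZ. A hypothetical source-level shape (names invented) that the guard is protecting:

    unsigned keep_the_and(unsigned x, unsigned *out) {
      unsigned s = x << 3;   // the SHL has two users, so it cannot be removed
      *out = s;              // user 1: the raw shifted value
      return s & 0xf8u;      // user 2: with the guard this stays an AND instead
                             // of becoming a redundant UBFIZ next to the SHL
    }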
+ if (ShlImm - ShiftAmount != 0 && !BiggerPattern) + return false; Src = getLeftShift(CurDAG, Op, ShlImm - ShiftAmount); return true; @@ -1904,7 +1974,8 @@ static bool isBitfieldPositioningOp(SelectionDAG *CurDAG, SDValue Op, // f = Opc Opd0, Opd1, LSB, MSB ; where Opc is a BFM, LSB = imm, and MSB = imm2 static bool isBitfieldInsertOpFromOr(SDNode *N, unsigned &Opc, SDValue &Dst, SDValue &Src, unsigned &ImmR, - unsigned &ImmS, SelectionDAG *CurDAG) { + unsigned &ImmS, const APInt &UsefulBits, + SelectionDAG *CurDAG) { assert(N->getOpcode() == ISD::OR && "Expect a OR operation"); // Set Opc @@ -1918,23 +1989,30 @@ static bool isBitfieldInsertOpFromOr(SDNode *N, unsigned &Opc, SDValue &Dst, // Because of simplify-demanded-bits in DAGCombine, involved masks may not // have the expected shape. Try to undo that. - APInt UsefulBits; - getUsefulBits(SDValue(N, 0), UsefulBits); unsigned NumberOfIgnoredLowBits = UsefulBits.countTrailingZeros(); unsigned NumberOfIgnoredHighBits = UsefulBits.countLeadingZeros(); - // OR is commutative, check both possibilities (does llvm provide a - // way to do that directely, e.g., via code matcher?) - SDValue OrOpd1Val = N->getOperand(1); - SDNode *OrOpd0 = N->getOperand(0).getNode(); - SDNode *OrOpd1 = N->getOperand(1).getNode(); - for (int i = 0; i < 2; - ++i, std::swap(OrOpd0, OrOpd1), OrOpd1Val = N->getOperand(0)) { + // OR is commutative, check all combinations of operand order and values of + // BiggerPattern, i.e. + // Opd0, Opd1, BiggerPattern=false + // Opd1, Opd0, BiggerPattern=false + // Opd0, Opd1, BiggerPattern=true + // Opd1, Opd0, BiggerPattern=true + // Several of these combinations may match, so check with BiggerPattern=false + // first since that will produce better results by matching more instructions + // and/or inserting fewer extra instructions. + for (int I = 0; I < 4; ++I) { + + bool BiggerPattern = I / 2; + SDNode *OrOpd0 = N->getOperand(I % 2).getNode(); + SDValue OrOpd1Val = N->getOperand((I + 1) % 2); + SDNode *OrOpd1 = OrOpd1Val.getNode(); + unsigned BFXOpc; int DstLSB, Width; if (isBitfieldExtractOp(CurDAG, OrOpd0, BFXOpc, Src, ImmR, ImmS, - NumberOfIgnoredLowBits, true)) { + NumberOfIgnoredLowBits, BiggerPattern)) { // Check that the returned opcode is compatible with the pattern, // i.e., same type and zero extended (U and not S) if ((BFXOpc != AArch64::UBFMXri && VT == MVT::i64) || @@ -1952,8 +2030,9 @@ static bool isBitfieldInsertOpFromOr(SDNode *N, unsigned &Opc, SDValue &Dst, // If the mask on the insertee is correct, we have a BFXIL operation. We // can share the ImmR and ImmS values from the already-computed UBFM. - } else if (isBitfieldPositioningOp(CurDAG, SDValue(OrOpd0, 0), Src, - DstLSB, Width)) { + } else if (isBitfieldPositioningOp(CurDAG, SDValue(OrOpd0, 0), + BiggerPattern, + Src, DstLSB, Width)) { ImmR = (VT.getSizeInBits() - DstLSB) % VT.getSizeInBits(); ImmS = Width - 1; } else @@ -2003,11 +2082,18 @@ SDNode *AArch64DAGToDAGISel::SelectBitfieldInsertOp(SDNode *N) { unsigned Opc; unsigned LSB, MSB; SDValue Opd0, Opd1; + EVT VT = N->getValueType(0); + APInt NUsefulBits; + getUsefulBits(SDValue(N, 0), NUsefulBits); + + // If all bits are not useful, just return UNDEF. 
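For the positioning case handled above, ImmR/ImmS are the rotate and top-source-bit fields of the underlying BFM/UBFM encoding. A worked instance with concrete numbers, assuming the usual BFI/UBFIZ Wd, Wn, #lsb, #width aliases (i32, DstLSB = 4, Width = 8):

    unsigned RegSize = 32, DstLSB = 4, Width = 8;
    unsigned ImmR = (RegSize - DstLSB) % RegSize;  // 28: rotate-right amount
    unsigned ImmS = Width - 1;                     // 7: last source bit moved
    // BFM/UBFM Wd, Wn, #28, #7 is the alias BFI/UBFIZ Wd, Wn, #4, #8, i.e. it
    // places the low 8 bits of Wn at bit position 4 of the destination.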
+ if (!NUsefulBits) + return CurDAG->SelectNodeTo(N, TargetOpcode::IMPLICIT_DEF, VT); - if (!isBitfieldInsertOpFromOr(N, Opc, Opd0, Opd1, LSB, MSB, CurDAG)) + if (!isBitfieldInsertOpFromOr(N, Opc, Opd0, Opd1, LSB, MSB, NUsefulBits, + CurDAG)) return nullptr; - EVT VT = N->getValueType(0); SDLoc dl(N); SDValue Ops[] = { Opd0, Opd1, @@ -2016,58 +2102,37 @@ SDNode *AArch64DAGToDAGISel::SelectBitfieldInsertOp(SDNode *N) { return CurDAG->SelectNodeTo(N, Opc, VT, Ops); } -SDNode *AArch64DAGToDAGISel::SelectLIBM(SDNode *N) { +/// SelectBitfieldInsertInZeroOp - Match a UBFIZ instruction that is the +/// equivalent of a left shift by a constant amount followed by an and masking +/// out a contiguous set of bits. +SDNode *AArch64DAGToDAGISel::SelectBitfieldInsertInZeroOp(SDNode *N) { + if (N->getOpcode() != ISD::AND) + return nullptr; + EVT VT = N->getValueType(0); - unsigned Variant; unsigned Opc; - unsigned FRINTXOpcs[] = { AArch64::FRINTXSr, AArch64::FRINTXDr }; - - if (VT == MVT::f32) { - Variant = 0; - } else if (VT == MVT::f64) { - Variant = 1; - } else - return nullptr; // Unrecognized argument type. Fall back on default codegen. - - // Pick the FRINTX variant needed to set the flags. - unsigned FRINTXOpc = FRINTXOpcs[Variant]; - - switch (N->getOpcode()) { - default: - return nullptr; // Unrecognized libm ISD node. Fall back on default codegen. - case ISD::FCEIL: { - unsigned FRINTPOpcs[] = { AArch64::FRINTPSr, AArch64::FRINTPDr }; - Opc = FRINTPOpcs[Variant]; - break; - } - case ISD::FFLOOR: { - unsigned FRINTMOpcs[] = { AArch64::FRINTMSr, AArch64::FRINTMDr }; - Opc = FRINTMOpcs[Variant]; - break; - } - case ISD::FTRUNC: { - unsigned FRINTZOpcs[] = { AArch64::FRINTZSr, AArch64::FRINTZDr }; - Opc = FRINTZOpcs[Variant]; - break; - } - case ISD::FROUND: { - unsigned FRINTAOpcs[] = { AArch64::FRINTASr, AArch64::FRINTADr }; - Opc = FRINTAOpcs[Variant]; - break; - } - } + if (VT == MVT::i32) + Opc = AArch64::UBFMWri; + else if (VT == MVT::i64) + Opc = AArch64::UBFMXri; + else + return nullptr; - SDLoc dl(N); - SDValue In = N->getOperand(0); - SmallVector<SDValue, 2> Ops; - Ops.push_back(In); + SDValue Op0; + int DstLSB, Width; + if (!isBitfieldPositioningOp(CurDAG, SDValue(N, 0), /*BiggerPattern=*/false, + Op0, DstLSB, Width)) + return nullptr; - if (!TM.Options.UnsafeFPMath) { - SDNode *FRINTX = CurDAG->getMachineNode(FRINTXOpc, dl, VT, MVT::Glue, In); - Ops.push_back(SDValue(FRINTX, 1)); - } + // ImmR is the rotate right amount. + unsigned ImmR = (VT.getSizeInBits() - DstLSB) % VT.getSizeInBits(); + // ImmS is the most significant bit of the source to be moved. + unsigned ImmS = Width - 1; - return CurDAG->getMachineNode(Opc, dl, VT, Ops); + SDLoc DL(N); + SDValue Ops[] = {Op0, CurDAG->getTargetConstant(ImmR, DL, VT), + CurDAG->getTargetConstant(ImmS, DL, VT)}; + return CurDAG->SelectNodeTo(N, Opc, VT, Ops); } bool @@ -2119,7 +2184,7 @@ AArch64DAGToDAGISel::SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos, // into a single value to be used in the MRS/MSR instruction. 
static int getIntOperandFromRegisterString(StringRef RegString) { SmallVector<StringRef, 5> Fields; - RegString.split(Fields, ":"); + RegString.split(Fields, ':'); if (Fields.size() == 1) return -1; @@ -2206,7 +2271,15 @@ SDNode *AArch64DAGToDAGISel::SelectWriteRegister(SDNode *N) { assert (isa<ConstantSDNode>(N->getOperand(2)) && "Expected a constant integer expression."); uint64_t Immed = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue(); - return CurDAG->getMachineNode(AArch64::MSRpstate, DL, MVT::Other, + unsigned State; + if (Reg == AArch64PState::PAN || Reg == AArch64PState::UAO) { + assert(Immed < 2 && "Bad imm"); + State = AArch64::MSRpstateImm1; + } else { + assert(Immed < 16 && "Bad imm"); + State = AArch64::MSRpstateImm4; + } + return CurDAG->getMachineNode(State, DL, MVT::Other, CurDAG->getTargetConstant(Reg, DL, MVT::i32), CurDAG->getTargetConstant(Immed, DL, MVT::i16), N->getOperand(0)); @@ -2279,6 +2352,8 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { case ISD::SRA: if (SDNode *I = SelectBitfieldExtractOp(Node)) return I; + if (SDNode *I = SelectBitfieldInsertInZeroOp(Node)) + return I; break; case ISD::OR: @@ -2802,6 +2877,7 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { break; } } + break; } case AArch64ISD::LD2post: { if (VT == MVT::v8i8) @@ -3214,14 +3290,6 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { return SelectPostStoreLane(Node, 4, AArch64::ST4i64_POST); break; } - - case ISD::FCEIL: - case ISD::FFLOOR: - case ISD::FTRUNC: - case ISD::FROUND: - if (SDNode *I = SelectLIBM(Node)) - return I; - break; } // Select the default instruction diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 3e8f46cf1ecd..9f5beff12100 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -40,23 +40,6 @@ using namespace llvm; STATISTIC(NumTailCalls, "Number of tail calls"); STATISTIC(NumShiftInserts, "Number of vector shift inserts"); -namespace { -enum AlignMode { - StrictAlign, - NoStrictAlign -}; -} - -static cl::opt<AlignMode> -Align(cl::desc("Load/store alignment support"), - cl::Hidden, cl::init(NoStrictAlign), - cl::values( - clEnumValN(StrictAlign, "aarch64-strict-align", - "Disallow all unaligned memory accesses"), - clEnumValN(NoStrictAlign, "aarch64-no-strict-align", - "Allow unaligned memory accesses"), - clEnumValEnd)); - // Place holder until extr generation is tested fully. static cl::opt<bool> EnableAArch64ExtrGeneration("aarch64-extr-generation", cl::Hidden, @@ -76,6 +59,9 @@ cl::opt<bool> EnableAArch64ELFLocalDynamicTLSGeneration( cl::desc("Allow AArch64 Local Dynamic TLS code generation"), cl::init(false)); +/// Value type used for condition codes. +static const MVT MVT_CC = MVT::i32; + AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, const AArch64Subtarget &STI) : TargetLowering(TM), Subtarget(&STI) { @@ -210,11 +196,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand); - // Exception handling. - // FIXME: These are guesses. Has this been defined yet? 
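The MSRpstateImm1/MSRpstateImm4 split above mirrors the architectural immediate widths of the PSTATE fields being written; an assembler-level illustration of my reading of the encodings (not a claim about any particular compilation path):

    // msr PAN, #1        ; PAN and UAO accept only a 1-bit immediate (Imm1 form)
    // msr DAIFSet, #0xf  ; other PSTATE fields take a 4-bit immediate (Imm4 form)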
- setExceptionPointerRegister(AArch64::X0); - setExceptionSelectorRegister(AArch64::X1); - // Constant pool entries setOperationAction(ISD::ConstantPool, MVT::i64, Custom); @@ -234,6 +215,10 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, // AArch64 lacks both left-rotate and popcount instructions. setOperationAction(ISD::ROTL, MVT::i32, Expand); setOperationAction(ISD::ROTL, MVT::i64, Expand); + for (MVT VT : MVT::vector_valuetypes()) { + setOperationAction(ISD::ROTL, VT, Expand); + setOperationAction(ISD::ROTR, VT, Expand); + } // AArch64 doesn't have {U|S}MUL_LOHI. setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand); @@ -252,6 +237,10 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::SDIVREM, MVT::i32, Expand); setOperationAction(ISD::SDIVREM, MVT::i64, Expand); + for (MVT VT : MVT::vector_valuetypes()) { + setOperationAction(ISD::SDIVREM, VT, Expand); + setOperationAction(ISD::UDIVREM, VT, Expand); + } setOperationAction(ISD::SREM, MVT::i32, Expand); setOperationAction(ISD::SREM, MVT::i64, Expand); setOperationAction(ISD::UDIVREM, MVT::i32, Expand); @@ -315,6 +304,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::FTRUNC, MVT::f16, Promote); setOperationAction(ISD::FMINNUM, MVT::f16, Promote); setOperationAction(ISD::FMAXNUM, MVT::f16, Promote); + setOperationAction(ISD::FMINNAN, MVT::f16, Promote); + setOperationAction(ISD::FMAXNAN, MVT::f16, Promote); // v4f16 is also a storage-only type, so promote it to v4f32 when that is // known to be safe. @@ -403,10 +394,19 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::FRINT, Ty, Legal); setOperationAction(ISD::FTRUNC, Ty, Legal); setOperationAction(ISD::FROUND, Ty, Legal); + setOperationAction(ISD::FMINNUM, Ty, Legal); + setOperationAction(ISD::FMAXNUM, Ty, Legal); + setOperationAction(ISD::FMINNAN, Ty, Legal); + setOperationAction(ISD::FMAXNAN, Ty, Legal); } setOperationAction(ISD::PREFETCH, MVT::Other, Custom); + // Lower READCYCLECOUNTER using an mrs from PMCCNTR_EL0. + // This requires the Performance Monitors extension. + if (Subtarget->hasPerfMon()) + setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal); + if (Subtarget->isTargetMachO()) { // For iOS, we don't want to the normal expansion of a libcall to // sincos. We want to issue a libcall to __sincos_stret to avoid memory @@ -456,12 +456,14 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setIndexedLoadAction(im, MVT::i64, Legal); setIndexedLoadAction(im, MVT::f64, Legal); setIndexedLoadAction(im, MVT::f32, Legal); + setIndexedLoadAction(im, MVT::f16, Legal); setIndexedStoreAction(im, MVT::i8, Legal); setIndexedStoreAction(im, MVT::i16, Legal); setIndexedStoreAction(im, MVT::i32, Legal); setIndexedStoreAction(im, MVT::i64, Legal); setIndexedStoreAction(im, MVT::f64, Legal); setIndexedStoreAction(im, MVT::f32, Legal); + setIndexedStoreAction(im, MVT::f16, Legal); } // Trap. 
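With the FeaturePerfMon-gated READCYCLECOUNTER action above, the portable cycle-counter intrinsic can be selected to a single system-register read. A minimal sketch (it assumes the kernel has enabled EL0 access to the counter, which is not guaranteed):

    // Clang's builtin lowers to llvm.readcyclecounter / ISD::READCYCLECOUNTER.
    unsigned long long cycles() {
      return __builtin_readcyclecounter();   // with +perfmon: mrs xN, PMCCNTR_EL0
    }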
@@ -479,6 +481,10 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::SINT_TO_FP); setTargetDAGCombine(ISD::UINT_TO_FP); + setTargetDAGCombine(ISD::FP_TO_SINT); + setTargetDAGCombine(ISD::FP_TO_UINT); + setTargetDAGCombine(ISD::FDIV); + setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); setTargetDAGCombine(ISD::ANY_EXTEND); @@ -487,16 +493,18 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::BITCAST); setTargetDAGCombine(ISD::CONCAT_VECTORS); setTargetDAGCombine(ISD::STORE); + if (Subtarget->supportsAddressTopByteIgnored()) + setTargetDAGCombine(ISD::LOAD); setTargetDAGCombine(ISD::MUL); setTargetDAGCombine(ISD::SELECT); setTargetDAGCombine(ISD::VSELECT); - setTargetDAGCombine(ISD::SELECT_CC); setTargetDAGCombine(ISD::INTRINSIC_VOID); setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN); setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); + setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 8; MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 4; @@ -512,10 +520,10 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setMinFunctionAlignment(2); - RequireStrictAlign = (Align == StrictAlign); - setHasExtractBitsInsn(true); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); + if (Subtarget->hasNEON()) { // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to // silliness like this: @@ -646,6 +654,9 @@ void AArch64TargetLowering::addTypeForNEON(EVT VT, EVT PromotedBitwiseVT) { setOperationAction(ISD::FLOG10, VT.getSimpleVT(), Expand); setOperationAction(ISD::FEXP, VT.getSimpleVT(), Expand); setOperationAction(ISD::FEXP2, VT.getSimpleVT(), Expand); + + // But we do support custom-lowering for FCOPYSIGN. + setOperationAction(ISD::FCOPYSIGN, VT.getSimpleVT(), Custom); } setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT.getSimpleVT(), Custom); @@ -686,6 +697,12 @@ void AArch64TargetLowering::addTypeForNEON(EVT VT, EVT PromotedBitwiseVT) { for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX}) setOperationAction(Opcode, VT.getSimpleVT(), Legal); + // F[MIN|MAX][NUM|NAN] are available for all FP NEON types (not f16 though!). + if (VT.isFloatingPoint() && VT.getVectorElementType() != MVT::f16) + for (unsigned Opcode : {ISD::FMINNAN, ISD::FMAXNAN, + ISD::FMINNUM, ISD::FMAXNUM}) + setOperationAction(Opcode, VT.getSimpleVT(), Legal); + if (Subtarget->isLittleEndian()) { for (unsigned im = (unsigned)ISD::PRE_INC; im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { @@ -730,7 +747,7 @@ void AArch64TargetLowering::computeKnownBitsForTargetNode( break; } case ISD::INTRINSIC_W_CHAIN: { - ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1)); + ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1)); Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue()); switch (IntID) { default: return; @@ -780,6 +797,34 @@ MVT AArch64TargetLowering::getScalarShiftAmountTy(const DataLayout &DL, return MVT::i64; } +bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(EVT VT, + unsigned AddrSpace, + unsigned Align, + bool *Fast) const { + if (Subtarget->requiresStrictAlign()) + return false; + + // FIXME: This is mostly true for Cyclone, but not necessarily others. + if (Fast) { + // FIXME: Define an attribute for slow unaligned accesses instead of + // relying on the CPU type as a proxy. + // On Cyclone, unaligned 128-bit stores are slow. 
+ *Fast = !Subtarget->isCyclone() || VT.getStoreSize() != 16 || + // See comments in performSTORECombine() for more details about + // these conditions. + + // Code that uses clang vector extensions can mark that it + // wants unaligned accesses to be treated as fast by + // underspecifying alignment to be 1 or 2. + Align <= 2 || + + // Disregard v2i64. Memcpy lowering produces those and splitting + // them regresses performance on micro-benchmarks and olden/bh. + VT == MVT::v2i64; + } + return true; +} + FastISel * AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const { @@ -809,9 +854,10 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { case AArch64ISD::ADCS: return "AArch64ISD::ADCS"; case AArch64ISD::SBCS: return "AArch64ISD::SBCS"; case AArch64ISD::ANDS: return "AArch64ISD::ANDS"; + case AArch64ISD::CCMP: return "AArch64ISD::CCMP"; + case AArch64ISD::CCMN: return "AArch64ISD::CCMN"; + case AArch64ISD::FCCMP: return "AArch64ISD::FCCMP"; case AArch64ISD::FCMP: return "AArch64ISD::FCMP"; - case AArch64ISD::FMIN: return "AArch64ISD::FMIN"; - case AArch64ISD::FMAX: return "AArch64ISD::FMAX"; case AArch64ISD::DUP: return "AArch64ISD::DUP"; case AArch64ISD::DUPLANE8: return "AArch64ISD::DUPLANE8"; case AArch64ISD::DUPLANE16: return "AArch64ISD::DUPLANE16"; @@ -931,8 +977,7 @@ AArch64TargetLowering::EmitF128CSEL(MachineInstr *MI, const TargetInstrInfo *TII = Subtarget->getInstrInfo(); const BasicBlock *LLVM_BB = MBB->getBasicBlock(); DebugLoc DL = MI->getDebugLoc(); - MachineFunction::iterator It = MBB; - ++It; + MachineFunction::iterator It = ++MBB->getIterator(); unsigned DestReg = MI->getOperand(0).getReg(); unsigned IfTrueReg = MI->getOperand(1).getReg(); @@ -1141,8 +1186,7 @@ static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, // register to WZR/XZR if it ends up being unused. unsigned Opcode = AArch64ISD::SUBS; - if (RHS.getOpcode() == ISD::SUB && isa<ConstantSDNode>(RHS.getOperand(0)) && - cast<ConstantSDNode>(RHS.getOperand(0))->getZExtValue() == 0 && + if (RHS.getOpcode() == ISD::SUB && isNullConstant(RHS.getOperand(0)) && (CC == ISD::SETEQ || CC == ISD::SETNE)) { // We'd like to combine a (CMP op1, (sub 0, op2) into a CMN instruction on // the grounds that "op1 - (-op2) == op1 + op2". However, the C and V flags @@ -1156,8 +1200,7 @@ static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, // the absence of information about op2. Opcode = AArch64ISD::ADDS; RHS = RHS.getOperand(1); - } else if (LHS.getOpcode() == ISD::AND && isa<ConstantSDNode>(RHS) && - cast<ConstantSDNode>(RHS)->getZExtValue() == 0 && + } else if (LHS.getOpcode() == ISD::AND && isNullConstant(RHS) && !isUnsignedIntSetCC(CC)) { // Similarly, (CMP (and X, Y), 0) can be implemented with a TST // (a.k.a. ANDS) except that the flags are only guaranteed to work for one @@ -1167,14 +1210,230 @@ static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, LHS = LHS.getOperand(0); } - return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS) + return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT_CC), LHS, RHS) .getValue(1); } +/// \defgroup AArch64CCMP CMP;CCMP matching +/// +/// These functions deal with the formation of CMP;CCMP;... sequences. +/// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of +/// a comparison. They set the NZCV flags to a predefined value if their +/// predicate is false. 
This allows to express arbitrary conjunctions, for +/// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B))))" +/// expressed as: +/// cmp A +/// ccmp B, inv(CB), CA +/// check for CB flags +/// +/// In general we can create code for arbitrary "... (and (and A B) C)" +/// sequences. We can also implement some "or" expressions, because "(or A B)" +/// is equivalent to "not (and (not A) (not B))" and we can implement some +/// negation operations: +/// We can negate the results of a single comparison by inverting the flags +/// used when the predicate fails and inverting the flags tested in the next +/// instruction; We can also negate the results of the whole previous +/// conditional compare sequence by inverting the flags tested in the next +/// instruction. However there is no way to negate the result of a partial +/// sequence. +/// +/// Therefore on encountering an "or" expression we can negate the subtree on +/// one side and have to be able to push the negate to the leafs of the subtree +/// on the other side (see also the comments in code). As complete example: +/// "or (or (setCA (cmp A)) (setCB (cmp B))) +/// (and (setCC (cmp C)) (setCD (cmp D)))" +/// is transformed to +/// "not (and (not (and (setCC (cmp C)) (setCC (cmp D)))) +/// (and (not (setCA (cmp A)) (not (setCB (cmp B))))))" +/// and implemented as: +/// cmp C +/// ccmp D, inv(CD), CC +/// ccmp A, CA, inv(CD) +/// ccmp B, CB, inv(CA) +/// check for CB flags +/// A counterexample is "or (and A B) (and C D)" which cannot be implemented +/// by conditional compare sequences. +/// @{ + +/// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate. +static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS, + ISD::CondCode CC, SDValue CCOp, + SDValue Condition, unsigned NZCV, + SDLoc DL, SelectionDAG &DAG) { + unsigned Opcode = 0; + if (LHS.getValueType().isFloatingPoint()) + Opcode = AArch64ISD::FCCMP; + else if (RHS.getOpcode() == ISD::SUB) { + SDValue SubOp0 = RHS.getOperand(0); + if (isNullConstant(SubOp0) && (CC == ISD::SETEQ || CC == ISD::SETNE)) { + // See emitComparison() on why we can only do this for SETEQ and SETNE. + Opcode = AArch64ISD::CCMN; + RHS = RHS.getOperand(1); + } + } + if (Opcode == 0) + Opcode = AArch64ISD::CCMP; + + SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32); + return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp); +} + +/// Returns true if @p Val is a tree of AND/OR/SETCC operations. +/// CanPushNegate is set to true if we can push a negate operation through +/// the tree in a was that we are left with AND operations and negate operations +/// at the leafs only. i.e. "not (or (or x y) z)" can be changed to +/// "and (and (not x) (not y)) (not z)"; "not (or (and x y) z)" cannot be +/// brought into such a form. +static bool isConjunctionDisjunctionTree(const SDValue Val, bool &CanPushNegate, + unsigned Depth = 0) { + if (!Val.hasOneUse()) + return false; + unsigned Opcode = Val->getOpcode(); + if (Opcode == ISD::SETCC) { + CanPushNegate = true; + return true; + } + // Protect against stack overflow. 
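As a concrete instance of the conjunction matching described above, a short-circuit AND of two integer tests can come out as one cmp plus one ccmp. A hedged illustration (register assignment and exact immediates will vary):

    int both(int a, int b) { return a == 0 && b == 7; }
    // roughly:
    //   cmp  w0, #0
    //   ccmp w1, #7, #0, eq    // if a != 0, NZCV is forced to 0b0000, i.e. "ne"
    //   cset w0, eq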
+ if (Depth > 15) + return false; + if (Opcode == ISD::AND || Opcode == ISD::OR) { + SDValue O0 = Val->getOperand(0); + SDValue O1 = Val->getOperand(1); + bool CanPushNegateL; + if (!isConjunctionDisjunctionTree(O0, CanPushNegateL, Depth+1)) + return false; + bool CanPushNegateR; + if (!isConjunctionDisjunctionTree(O1, CanPushNegateR, Depth+1)) + return false; + // We cannot push a negate through an AND operation (it would become an OR), + // we can however change a (not (or x y)) to (and (not x) (not y)) if we can + // push the negate through the x/y subtrees. + CanPushNegate = (Opcode == ISD::OR) && CanPushNegateL && CanPushNegateR; + return true; + } + return false; +} + +/// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain +/// of CCMP/CFCMP ops. See @ref AArch64CCMP. +/// Tries to transform the given i1 producing node @p Val to a series compare +/// and conditional compare operations. @returns an NZCV flags producing node +/// and sets @p OutCC to the flags that should be tested or returns SDValue() if +/// transformation was not possible. +/// On recursive invocations @p PushNegate may be set to true to have negation +/// effects pushed to the tree leafs; @p Predicate is an NZCV flag predicate +/// for the comparisons in the current subtree; @p Depth limits the search +/// depth to avoid stack overflow. +static SDValue emitConjunctionDisjunctionTree(SelectionDAG &DAG, SDValue Val, + AArch64CC::CondCode &OutCC, bool PushNegate = false, + SDValue CCOp = SDValue(), AArch64CC::CondCode Predicate = AArch64CC::AL, + unsigned Depth = 0) { + // We're at a tree leaf, produce a conditional comparison operation. + unsigned Opcode = Val->getOpcode(); + if (Opcode == ISD::SETCC) { + SDValue LHS = Val->getOperand(0); + SDValue RHS = Val->getOperand(1); + ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get(); + bool isInteger = LHS.getValueType().isInteger(); + if (PushNegate) + CC = getSetCCInverse(CC, isInteger); + SDLoc DL(Val); + // Determine OutCC and handle FP special case. + if (isInteger) { + OutCC = changeIntCCToAArch64CC(CC); + } else { + assert(LHS.getValueType().isFloatingPoint()); + AArch64CC::CondCode ExtraCC; + changeFPCCToAArch64CC(CC, OutCC, ExtraCC); + // Surpisingly some floating point conditions can't be tested with a + // single condition code. Construct an additional comparison in this case. + // See comment below on how we deal with OR conditions. + if (ExtraCC != AArch64CC::AL) { + SDValue ExtraCmp; + if (!CCOp.getNode()) + ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG); + else { + SDValue ConditionOp = DAG.getConstant(Predicate, DL, MVT_CC); + // Note that we want the inverse of ExtraCC, so NZCV is not inversed. + unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(ExtraCC); + ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, ConditionOp, + NZCV, DL, DAG); + } + CCOp = ExtraCmp; + Predicate = AArch64CC::getInvertedCondCode(ExtraCC); + OutCC = AArch64CC::getInvertedCondCode(OutCC); + } + } + + // Produce a normal comparison if we are first in the chain + if (!CCOp.getNode()) + return emitComparison(LHS, RHS, CC, DL, DAG); + // Otherwise produce a ccmp. 
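The "extra comparison" branch above exists because a few FP predicates have no single AArch64 condition code; ordered-not-equal, for instance, is mi || gt after an fcmp. A hypothetical source pattern that typically reduces to that predicate (SETONE):

    // True only when neither operand is NaN and a != b.
    int ordered_ne(float a, float b) { return a < b || a > b; }
    // needs two condition codes, hence the additional fcmp/fccmp emitted above.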
+ SDValue ConditionOp = DAG.getConstant(Predicate, DL, MVT_CC); + AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC); + unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC); + return emitConditionalComparison(LHS, RHS, CC, CCOp, ConditionOp, NZCV, DL, + DAG); + } else if ((Opcode != ISD::AND && Opcode != ISD::OR) || !Val->hasOneUse()) + return SDValue(); + + assert((Opcode == ISD::OR || !PushNegate) + && "Can only push negate through OR operation"); + + // Check if both sides can be transformed. + SDValue LHS = Val->getOperand(0); + SDValue RHS = Val->getOperand(1); + bool CanPushNegateL; + if (!isConjunctionDisjunctionTree(LHS, CanPushNegateL, Depth+1)) + return SDValue(); + bool CanPushNegateR; + if (!isConjunctionDisjunctionTree(RHS, CanPushNegateR, Depth+1)) + return SDValue(); + + // Do we need to negate our operands? + bool NegateOperands = Opcode == ISD::OR; + // We can negate the results of all previous operations by inverting the + // predicate flags giving us a free negation for one side. For the other side + // we need to be able to push the negation to the leafs of the tree. + if (NegateOperands) { + if (!CanPushNegateL && !CanPushNegateR) + return SDValue(); + // Order the side where we can push the negate through to LHS. + if (!CanPushNegateL && CanPushNegateR) + std::swap(LHS, RHS); + } else { + bool NeedsNegOutL = LHS->getOpcode() == ISD::OR; + bool NeedsNegOutR = RHS->getOpcode() == ISD::OR; + if (NeedsNegOutL && NeedsNegOutR) + return SDValue(); + // Order the side where we need to negate the output flags to RHS so it + // gets emitted first. + if (NeedsNegOutL) + std::swap(LHS, RHS); + } + + // Emit RHS. If we want to negate the tree we only need to push a negate + // through if we are already in a PushNegate case, otherwise we can negate + // the "flags to test" afterwards. + AArch64CC::CondCode RHSCC; + SDValue CmpR = emitConjunctionDisjunctionTree(DAG, RHS, RHSCC, PushNegate, + CCOp, Predicate, Depth+1); + if (NegateOperands && !PushNegate) + RHSCC = AArch64CC::getInvertedCondCode(RHSCC); + // Emit LHS. We must push the negate through if we need to negate it. + SDValue CmpL = emitConjunctionDisjunctionTree(DAG, LHS, OutCC, NegateOperands, + CmpR, RHSCC, Depth+1); + // If we transformed an OR to and AND then we have to negate the result + // (or absorb a PushNegate resulting in a double negation). + if (Opcode == ISD::OR && !PushNegate) + OutCC = AArch64CC::getInvertedCondCode(OutCC); + return CmpL; +} + +/// @} + static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, SDValue &AArch64cc, SelectionDAG &DAG, SDLoc dl) { - SDValue Cmp; - AArch64CC::CondCode AArch64CC; if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) { EVT VT = RHS.getValueType(); uint64_t C = RHSC->getZExtValue(); @@ -1229,47 +1488,56 @@ static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, } } } - // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095. - // For the i8 operand, the largest immediate is 255, so this can be easily - // encoded in the compare instruction. For the i16 operand, however, the - // largest immediate cannot be encoded in the compare. - // Therefore, use a sign extending load and cmn to avoid materializing the -1 - // constant. For example, - // movz w1, #65535 - // ldrh w0, [x0, #0] - // cmp w0, w1 - // > - // ldrsh w0, [x0, #0] - // cmn w0, #1 - // Fundamental, we're relying on the property that (zext LHS) == (zext RHS) - // if and only if (sext LHS) == (sext RHS). 
The checks are in place to ensure - // both the LHS and RHS are truely zero extended and to make sure the - // transformation is profitable. + SDValue Cmp; + AArch64CC::CondCode AArch64CC; if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa<ConstantSDNode>(RHS)) { - if ((cast<ConstantSDNode>(RHS)->getZExtValue() >> 16 == 0) && - isa<LoadSDNode>(LHS)) { - if (cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD && - cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 && - LHS.getNode()->hasNUsesOfValue(1, 0)) { - int16_t ValueofRHS = cast<ConstantSDNode>(RHS)->getZExtValue(); - if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) { - SDValue SExt = - DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS, - DAG.getValueType(MVT::i16)); - Cmp = emitComparison(SExt, - DAG.getConstant(ValueofRHS, dl, - RHS.getValueType()), - CC, dl, DAG); - AArch64CC = changeIntCCToAArch64CC(CC); - AArch64cc = DAG.getConstant(AArch64CC, dl, MVT::i32); - return Cmp; - } + const ConstantSDNode *RHSC = cast<ConstantSDNode>(RHS); + + // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095. + // For the i8 operand, the largest immediate is 255, so this can be easily + // encoded in the compare instruction. For the i16 operand, however, the + // largest immediate cannot be encoded in the compare. + // Therefore, use a sign extending load and cmn to avoid materializing the + // -1 constant. For example, + // movz w1, #65535 + // ldrh w0, [x0, #0] + // cmp w0, w1 + // > + // ldrsh w0, [x0, #0] + // cmn w0, #1 + // Fundamental, we're relying on the property that (zext LHS) == (zext RHS) + // if and only if (sext LHS) == (sext RHS). The checks are in place to + // ensure both the LHS and RHS are truly zero extended and to make sure the + // transformation is profitable. + if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(LHS) && + cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD && + cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 && + LHS.getNode()->hasNUsesOfValue(1, 0)) { + int16_t ValueofRHS = cast<ConstantSDNode>(RHS)->getZExtValue(); + if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) { + SDValue SExt = + DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS, + DAG.getValueType(MVT::i16)); + Cmp = emitComparison(SExt, DAG.getConstant(ValueofRHS, dl, + RHS.getValueType()), + CC, dl, DAG); + AArch64CC = changeIntCCToAArch64CC(CC); } } + + if (!Cmp && (RHSC->isNullValue() || RHSC->isOne())) { + if ((Cmp = emitConjunctionDisjunctionTree(DAG, LHS, AArch64CC))) { + if ((CC == ISD::SETNE) ^ RHSC->isNullValue()) + AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC); + } + } + } + + if (!Cmp) { + Cmp = emitComparison(LHS, RHS, CC, dl, DAG); + AArch64CC = changeIntCCToAArch64CC(CC); } - Cmp = emitComparison(LHS, RHS, CC, dl, DAG); - AArch64CC = changeIntCCToAArch64CC(CC); - AArch64cc = DAG.getConstant(AArch64CC, dl, MVT::i32); + AArch64cc = DAG.getConstant(AArch64CC, dl, MVT_CC); return Cmp; } @@ -1391,8 +1659,7 @@ getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) { SDValue AArch64TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG, RTLIB::Libcall Call) const { SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end()); - return makeLibCall(DAG, Call, MVT::f128, &Ops[0], Ops.size(), false, - SDLoc(Op)).first; + return makeLibCall(DAG, Call, MVT::f128, Ops, false, SDLoc(Op)).first; } static SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) { @@ -1571,8 +1838,8 @@ SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op, // precise. 
That doesn't take part in the LibCall so we can't directly use // LowerF128Call. SDValue SrcVal = Op.getOperand(0); - return makeLibCall(DAG, LC, Op.getValueType(), &SrcVal, 1, - /*isSigned*/ false, SDLoc(Op)).first; + return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, /*isSigned*/ false, + SDLoc(Op)).first; } static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) { @@ -1581,6 +1848,16 @@ static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) { // in the cost tables. EVT InVT = Op.getOperand(0).getValueType(); EVT VT = Op.getValueType(); + unsigned NumElts = InVT.getVectorNumElements(); + + // f16 vectors are promoted to f32 before a conversion. + if (InVT.getVectorElementType() == MVT::f16) { + MVT NewVT = MVT::getVectorVT(MVT::f32, NumElts); + SDLoc dl(Op); + return DAG.getNode( + Op.getOpcode(), dl, Op.getValueType(), + DAG.getNode(ISD::FP_EXTEND, dl, NewVT, Op.getOperand(0))); + } if (VT.getSizeInBits() < InVT.getSizeInBits()) { SDLoc dl(Op); @@ -1628,8 +1905,7 @@ SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op, LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), Op.getValueType()); SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end()); - return makeLibCall(DAG, LC, Op.getValueType(), &Ops[0], Ops.size(), false, - SDLoc(Op)).first; + return makeLibCall(DAG, LC, Op.getValueType(), Ops, false, SDLoc(Op)).first; } static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) { @@ -1931,6 +2207,31 @@ static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) { DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1)); } +SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, + SelectionDAG &DAG) const { + unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); + SDLoc dl(Op); + switch (IntNo) { + default: return SDValue(); // Don't custom lower most intrinsics. + case Intrinsic::aarch64_thread_pointer: { + EVT PtrVT = getPointerTy(DAG.getDataLayout()); + return DAG.getNode(AArch64ISD::THREAD_POINTER, dl, PtrVT); + } + case Intrinsic::aarch64_neon_smax: + return DAG.getNode(ISD::SMAX, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::aarch64_neon_umax: + return DAG.getNode(ISD::UMAX, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::aarch64_neon_smin: + return DAG.getNode(ISD::SMIN, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::aarch64_neon_umin: + return DAG.getNode(ISD::UMIN, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + } +} + SDValue AArch64TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { @@ -2032,14 +2333,11 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op, return LowerFSINCOS(Op, DAG); case ISD::MUL: return LowerMUL(Op, DAG); + case ISD::INTRINSIC_WO_CHAIN: + return LowerINTRINSIC_WO_CHAIN(Op, DAG); } } -/// getFunctionAlignment - Return the Log2 alignment of this function. 
-unsigned AArch64TargetLowering::getFunctionAlignment(const Function *F) const { - return 2; -} - //===----------------------------------------------------------------------===// // Calling Convention Implementation //===----------------------------------------------------------------------===// @@ -2214,9 +2512,10 @@ SDValue AArch64TargetLowering::LowerFormalArguments( break; } - ArgValue = DAG.getExtLoad(ExtType, DL, VA.getLocVT(), Chain, FIN, - MachinePointerInfo::getFixedStack(FI), - MemVT, false, false, false, 0); + ArgValue = DAG.getExtLoad( + ExtType, DL, VA.getLocVT(), Chain, FIN, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), + MemVT, false, false, false, 0); InVals.push_back(ArgValue); } @@ -2289,9 +2588,10 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) { unsigned VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass); SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64); - SDValue Store = - DAG.getStore(Val.getValue(1), DL, Val, FIN, - MachinePointerInfo::getStack(i * 8), false, false, 0); + SDValue Store = DAG.getStore( + Val.getValue(1), DL, Val, FIN, + MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 8), false, + false, 0); MemOps.push_back(Store); FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT)); @@ -2318,9 +2618,10 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, unsigned VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass); SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128); - SDValue Store = - DAG.getStore(Val.getValue(1), DL, Val, FIN, - MachinePointerInfo::getStack(i * 16), false, false, 0); + SDValue Store = DAG.getStore( + Val.getValue(1), DL, Val, FIN, + MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 16), + false, false, 0); MemOps.push_back(Store); FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(16, DL, PtrVT)); @@ -2453,8 +2754,8 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization( *DAG.getContext()); CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, true)); - for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) - if (!ArgLocs[i].isRegLoc()) + for (const CCValAssign &ArgLoc : ArgLocs) + if (!ArgLoc.isRegLoc()) return false; } @@ -2758,7 +3059,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, int FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true); DstAddr = DAG.getFrameIndex(FI, PtrVT); - DstInfo = MachinePointerInfo::getFixedStack(FI); + DstInfo = + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI); // Make sure any stack arguments overlapping with where we're storing // are loaded before this eventual operation. Otherwise they'll be @@ -2768,7 +3070,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL); DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff); - DstInfo = MachinePointerInfo::getStack(LocMemOffset); + DstInfo = MachinePointerInfo::getStack(DAG.getMachineFunction(), + LocMemOffset); } if (Outs[i].Flags.isByVal()) { @@ -2802,9 +3105,9 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, // Build a sequence of copy-to-reg nodes chained together with token chain // and flag operands which copy the outgoing args into the appropriate regs. 
SDValue InFlag; - for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { - Chain = DAG.getCopyToReg(Chain, DL, RegsToPass[i].first, - RegsToPass[i].second, InFlag); + for (auto &RegToPass : RegsToPass) { + Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first, + RegToPass.second, InFlag); InFlag = Chain.getValue(1); } @@ -2860,9 +3163,9 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, // Add argument registers to the end of the list so that they are known live // into the call. - for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) - Ops.push_back(DAG.getRegister(RegsToPass[i].first, - RegsToPass[i].second.getValueType())); + for (auto &RegToPass : RegsToPass) + Ops.push_back(DAG.getRegister(RegToPass.first, + RegToPass.second.getValueType())); // Add a register mask operand representing the call-preserved registers. const uint32_t *Mask; @@ -2968,6 +3271,19 @@ AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, Flag = Chain.getValue(1); RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); } + const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); + const MCPhysReg *I = + TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction()); + if (I) { + for (; *I; ++I) { + if (AArch64::GPR64RegClass.contains(*I)) + RetOps.push_back(DAG.getRegister(*I, MVT::i64)); + else if (AArch64::FPR64RegClass.contains(*I)) + RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64))); + else + llvm_unreachable("Unexpected register class in CSRsViaCopy!"); + } + } RetOps[0] = Chain; // Update chain. @@ -3010,11 +3326,12 @@ SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op, unsigned char LoFlags = AArch64II::MO_PAGEOFF | AArch64II::MO_NC; SDValue Lo = DAG.getTargetConstantPool(GV, PtrVT, 0, 0, LoFlags); SDValue PoolAddr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo); - SDValue GlobalAddr = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), PoolAddr, - MachinePointerInfo::getConstantPool(), - /*isVolatile=*/ false, - /*isNonTemporal=*/ true, - /*isInvariant=*/ true, 8); + SDValue GlobalAddr = DAG.getLoad( + PtrVT, DL, DAG.getEntryNode(), PoolAddr, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), + /*isVolatile=*/false, + /*isNonTemporal=*/true, + /*isInvariant=*/true, 8); if (GN->getOffset() != 0) return DAG.getNode(ISD::ADD, DL, PtrVT, GlobalAddr, DAG.getConstant(GN->getOffset(), DL, PtrVT)); @@ -3087,8 +3404,9 @@ AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op, // to obtain the address of the variable. SDValue Chain = DAG.getEntryNode(); SDValue FuncTLVGet = - DAG.getLoad(MVT::i64, DL, Chain, DescAddr, MachinePointerInfo::getGOT(), - false, true, true, 8); + DAG.getLoad(MVT::i64, DL, Chain, DescAddr, + MachinePointerInfo::getGOT(DAG.getMachineFunction()), false, + true, true, 8); Chain = FuncTLVGet.getValue(1); MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); @@ -3160,6 +3478,10 @@ AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op, const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal()); + + if (DAG.getTarget().Options.EmulatedTLS) + return LowerToTLSEmulatedModel(GA, DAG); + if (!EnableAArch64ELFLocalDynamicTLSGeneration) { if (Model == TLSModel::LocalDynamic) Model = TLSModel::GeneralDynamic; @@ -3277,8 +3599,7 @@ SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch // instruction. 
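The overflow handling continued below recognizes {s,u}{add,sub,mul}.with.overflow results that feed a branch directly, so the flag-setting form plus a conditional branch is used instead of first materializing the i1. A hedged sketch of source that produces such a node (overflow_handler is an invented helper):

    extern void overflow_handler(void);          // assumed external helper
    long checked_add(long a, long b) {
      long r;
      if (__builtin_add_overflow(a, b, &r))      // llvm.sadd.with.overflow
        overflow_handler();                      // roughly: adds x.., then b.vs
      return r;
    }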
unsigned Opc = LHS.getOpcode(); - if (LHS.getResNo() == 1 && isa<ConstantSDNode>(RHS) && - cast<ConstantSDNode>(RHS)->isOne() && + if (LHS.getResNo() == 1 && isOneConstant(RHS) && (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO || Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO)) { assert((CC == ISD::SETEQ || CC == ISD::SETNE) && @@ -3392,17 +3713,11 @@ SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op, SDValue In1 = Op.getOperand(0); SDValue In2 = Op.getOperand(1); EVT SrcVT = In2.getValueType(); - if (SrcVT != VT) { - if (SrcVT == MVT::f32 && VT == MVT::f64) - In2 = DAG.getNode(ISD::FP_EXTEND, DL, VT, In2); - else if (SrcVT == MVT::f64 && VT == MVT::f32) - In2 = DAG.getNode(ISD::FP_ROUND, DL, VT, In2, - DAG.getIntPtrConstant(0, DL)); - else - // FIXME: Src type is different, bail out for now. Can VT really be a - // vector type? - return SDValue(); - } + + if (SrcVT.bitsLT(VT)) + In2 = DAG.getNode(ISD::FP_EXTEND, DL, VT, In2); + else if (SrcVT.bitsGT(VT)) + In2 = DAG.getNode(ISD::FP_ROUND, DL, VT, In2, DAG.getIntPtrConstant(0, DL)); EVT VecVT; EVT EltVT; @@ -3410,7 +3725,7 @@ SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op, SDValue VecVal1, VecVal2; if (VT == MVT::f32 || VT == MVT::v2f32 || VT == MVT::v4f32) { EltVT = MVT::i32; - VecVT = MVT::v4i32; + VecVT = (VT == MVT::v2f32 ? MVT::v2i32 : MVT::v4i32); EltMask = 0x80000000ULL; if (!VT.isVector()) { @@ -3571,32 +3886,6 @@ SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { } } -/// A SELECT_CC operation is really some kind of max or min if both values being -/// compared are, in some sense, equal to the results in either case. However, -/// it is permissible to compare f32 values and produce directly extended f64 -/// values. -/// -/// Extending the comparison operands would also be allowed, but is less likely -/// to happen in practice since their use is right here. Note that truncate -/// operations would *not* be semantically equivalent. -static bool selectCCOpsAreFMaxCompatible(SDValue Cmp, SDValue Result) { - if (Cmp == Result) - return (Cmp.getValueType() == MVT::f32 || - Cmp.getValueType() == MVT::f64); - - ConstantFPSDNode *CCmp = dyn_cast<ConstantFPSDNode>(Cmp); - ConstantFPSDNode *CResult = dyn_cast<ConstantFPSDNode>(Result); - if (CCmp && CResult && Cmp.getValueType() == MVT::f32 && - Result.getValueType() == MVT::f64) { - bool Lossy; - APFloat CmpVal = CCmp->getValueAPF(); - CmpVal.convert(APFloat::IEEEdouble, APFloat::rmNearestTiesToEven, &Lossy); - return CResult->getValueAPF().bitwiseIsEqual(CmpVal); - } - - return Result->getOpcode() == ISD::FP_EXTEND && Result->getOperand(0) == Cmp; -} - SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS, SDValue RHS, SDValue TVal, SDValue FVal, SDLoc dl, @@ -3614,7 +3903,13 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS, } } - // Handle integers first. + // Also handle f16, for which we need to do a f32 comparison. + if (LHS.getValueType() == MVT::f16) { + LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS); + RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS); + } + + // Next, handle integers. 
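In the integer select path that follows, a true-value of ~x or -x is swapped with the false value (and the condition inverted) so the select can use csinv/csneg directly; hypothetical source shapes for the two cases:

    long sel_not(int c, long a, long b) { return c ? ~a : b; }  // -> csinv after the swap
    long sel_neg(int c, long a, long b) { return c ? -a : b; }  // -> csneg after the swap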
if (LHS.getValueType().isInteger()) { assert((LHS.getValueType() == RHS.getValueType()) && (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64)); @@ -3637,9 +3932,7 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS, } else if (TVal.getOpcode() == ISD::XOR) { // If TVal is a NOT we want to swap TVal and FVal so that we can match // with a CSINV rather than a CSEL. - ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(TVal.getOperand(1)); - - if (CVal && CVal->isAllOnesValue()) { + if (isAllOnesConstant(TVal.getOperand(1))) { std::swap(TVal, FVal); std::swap(CTVal, CFVal); CC = ISD::getSetCCInverse(CC, true); @@ -3647,9 +3940,7 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS, } else if (TVal.getOpcode() == ISD::SUB) { // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so // that we can match with a CSNEG rather than a CSEL. - ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(TVal.getOperand(0)); - - if (CVal && CVal->isNullValue()) { + if (isNullConstant(TVal.getOperand(0))) { std::swap(TVal, FVal); std::swap(CTVal, CFVal); CC = ISD::getSetCCInverse(CC, true); @@ -4109,46 +4400,57 @@ SDValue AArch64TargetLowering::LowerShiftRightParts(SDValue Op, SDValue ShOpLo = Op.getOperand(0); SDValue ShOpHi = Op.getOperand(1); SDValue ShAmt = Op.getOperand(2); - SDValue ARMcc; unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL; assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS); SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, DAG.getConstant(VTBits, dl, MVT::i64), ShAmt); - SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt); + SDValue HiBitsForLo = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt); + + // Unfortunately, if ShAmt == 0, we just calculated "(SHL ShOpHi, 64)" which + // is "undef". We wanted 0, so CSEL it directly. + SDValue Cmp = emitComparison(ShAmt, DAG.getConstant(0, dl, MVT::i64), + ISD::SETEQ, dl, DAG); + SDValue CCVal = DAG.getConstant(AArch64CC::EQ, dl, MVT::i32); + HiBitsForLo = + DAG.getNode(AArch64ISD::CSEL, dl, VT, DAG.getConstant(0, dl, MVT::i64), + HiBitsForLo, CCVal, Cmp); + SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt, DAG.getConstant(VTBits, dl, MVT::i64)); - SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt); - SDValue Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, dl, MVT::i64), - ISD::SETGE, dl, DAG); - SDValue CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32); + SDValue LoBitsForLo = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt); + SDValue LoForNormalShift = + DAG.getNode(ISD::OR, dl, VT, LoBitsForLo, HiBitsForLo); - SDValue FalseValLo = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); - SDValue TrueValLo = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt); - SDValue Lo = - DAG.getNode(AArch64ISD::CSEL, dl, VT, TrueValLo, FalseValLo, CCVal, Cmp); + Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, dl, MVT::i64), ISD::SETGE, + dl, DAG); + CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32); + SDValue LoForBigShift = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt); + SDValue Lo = DAG.getNode(AArch64ISD::CSEL, dl, VT, LoForBigShift, + LoForNormalShift, CCVal, Cmp); // AArch64 shifts larger than the register width are wrapped rather than // clamped, so we can't just emit "hi >> x". - SDValue FalseValHi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt); - SDValue TrueValHi = Opc == ISD::SRA - ? 
DAG.getNode(Opc, dl, VT, ShOpHi, - DAG.getConstant(VTBits - 1, dl, - MVT::i64)) - : DAG.getConstant(0, dl, VT); - SDValue Hi = - DAG.getNode(AArch64ISD::CSEL, dl, VT, TrueValHi, FalseValHi, CCVal, Cmp); + SDValue HiForNormalShift = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt); + SDValue HiForBigShift = + Opc == ISD::SRA + ? DAG.getNode(Opc, dl, VT, ShOpHi, + DAG.getConstant(VTBits - 1, dl, MVT::i64)) + : DAG.getConstant(0, dl, VT); + SDValue Hi = DAG.getNode(AArch64ISD::CSEL, dl, VT, HiForBigShift, + HiForNormalShift, CCVal, Cmp); SDValue Ops[2] = { Lo, Hi }; return DAG.getMergeValues(Ops, dl); } + /// LowerShiftLeftParts - Lower SHL_PARTS, which returns two /// i64 values and take a 2 x i64 value to shift plus a shift amount. SDValue AArch64TargetLowering::LowerShiftLeftParts(SDValue Op, - SelectionDAG &DAG) const { + SelectionDAG &DAG) const { assert(Op.getNumOperands() == 3 && "Not a double-shift!"); EVT VT = Op.getValueType(); unsigned VTBits = VT.getSizeInBits(); @@ -4156,31 +4458,41 @@ SDValue AArch64TargetLowering::LowerShiftLeftParts(SDValue Op, SDValue ShOpLo = Op.getOperand(0); SDValue ShOpHi = Op.getOperand(1); SDValue ShAmt = Op.getOperand(2); - SDValue ARMcc; assert(Op.getOpcode() == ISD::SHL_PARTS); SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, DAG.getConstant(VTBits, dl, MVT::i64), ShAmt); - SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt); + SDValue LoBitsForHi = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt); + + // Unfortunately, if ShAmt == 0, we just calculated "(SRL ShOpLo, 64)" which + // is "undef". We wanted 0, so CSEL it directly. + SDValue Cmp = emitComparison(ShAmt, DAG.getConstant(0, dl, MVT::i64), + ISD::SETEQ, dl, DAG); + SDValue CCVal = DAG.getConstant(AArch64CC::EQ, dl, MVT::i32); + LoBitsForHi = + DAG.getNode(AArch64ISD::CSEL, dl, VT, DAG.getConstant(0, dl, MVT::i64), + LoBitsForHi, CCVal, Cmp); + SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt, DAG.getConstant(VTBits, dl, MVT::i64)); - SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt); - SDValue Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt); + SDValue HiBitsForHi = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt); + SDValue HiForNormalShift = + DAG.getNode(ISD::OR, dl, VT, LoBitsForHi, HiBitsForHi); - SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); + SDValue HiForBigShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt); - SDValue Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, dl, MVT::i64), - ISD::SETGE, dl, DAG); - SDValue CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32); - SDValue Hi = - DAG.getNode(AArch64ISD::CSEL, dl, VT, Tmp3, FalseVal, CCVal, Cmp); + Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, dl, MVT::i64), ISD::SETGE, + dl, DAG); + CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32); + SDValue Hi = DAG.getNode(AArch64ISD::CSEL, dl, VT, HiForBigShift, + HiForNormalShift, CCVal, Cmp); // AArch64 shifts of larger than register sizes are wrapped rather than // clamped, so we can't just emit "lo << a" if a is too big. 
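LowerShiftRightParts/LowerShiftLeftParts, reworked above, implement variable shifts of a 2 x i64 pair; the added CSELs cover the ShAmt == 0 and ShAmt >= 64 cases that a plain lsr/lsl/orr sequence would get wrong. A minimal source-level trigger, assuming the frontend supports __int128:

    unsigned __int128 shr128(unsigned __int128 x, unsigned s) {
      return x >> s;   // SRL_PARTS: lsr/lsl/orr plus csel for s == 0 and s >= 64
    }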
- SDValue TrueValLo = DAG.getConstant(0, dl, VT); - SDValue FalseValLo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); - SDValue Lo = - DAG.getNode(AArch64ISD::CSEL, dl, VT, TrueValLo, FalseValLo, CCVal, Cmp); + SDValue LoForBigShift = DAG.getConstant(0, dl, VT); + SDValue LoForNormalShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); + SDValue Lo = DAG.getNode(AArch64ISD::CSEL, dl, VT, LoForBigShift, + LoForNormalShift, CCVal, Cmp); SDValue Ops[2] = { Lo, Hi }; return DAG.getMergeValues(Ops, dl); @@ -4362,8 +4674,7 @@ void AArch64TargetLowering::LowerAsmOperandForConstraint( // Validate and return a target constant for them if we can. case 'z': { // 'z' maps to xzr or wzr so it needs an input of 0. - ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op); - if (!C || C->getZExtValue() != 0) + if (!isNullConstant(Op)) return; if (Op.getValueType() == MVT::i64) @@ -5653,11 +5964,10 @@ static SDValue NormalizeBuildVector(SDValue Op, return Op; SmallVector<SDValue, 16> Ops; - for (unsigned I = 0, E = VT.getVectorNumElements(); I != E; ++I) { - SDValue Lane = Op.getOperand(I); - if (Lane.getOpcode() == ISD::Constant) { + for (SDValue Lane : Op->ops()) { + if (auto *CstLane = dyn_cast<ConstantSDNode>(Lane)) { APInt LowBits(EltTy.getSizeInBits(), - cast<ConstantSDNode>(Lane)->getZExtValue()); + CstLane->getZExtValue()); Lane = DAG.getConstant(LowBits.getZExtValue(), dl, MVT::i32); } Ops.push_back(Lane); @@ -5997,8 +6307,7 @@ FailedModImm: // Empirical tests suggest this is rarely worth it for vectors of length <= 2. if (NumElts >= 4) { - SDValue shuffle = ReconstructShuffle(Op, DAG); - if (shuffle != SDValue()) + if (SDValue shuffle = ReconstructShuffle(Op, DAG)) return shuffle; } @@ -6017,7 +6326,10 @@ FailedModImm: // a) Avoid a RMW dependency on the full vector register, and // b) Allow the register coalescer to fold away the copy if the // value is already in an S or D register. - if (Op0.getOpcode() != ISD::UNDEF && (ElemSize == 32 || ElemSize == 64)) { + // Do not do this for UNDEF/LOAD nodes because we have better patterns + // for those avoiding the SCALAR_TO_VECTOR/BUILD_VECTOR. + if (Op0.getOpcode() != ISD::UNDEF && Op0.getOpcode() != ISD::LOAD && + (ElemSize == 32 || ElemSize == 64)) { unsigned SubIdx = ElemSize == 32 ? AArch64::ssub : AArch64::dsub; MachineSDNode *N = DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, dl, VT, Vec, Op0, @@ -6123,24 +6435,11 @@ SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, unsigned Val = Cst->getZExtValue(); unsigned Size = Op.getValueType().getSizeInBits(); - if (Val == 0) { - switch (Size) { - case 8: - return DAG.getTargetExtractSubreg(AArch64::bsub, dl, Op.getValueType(), - Op.getOperand(0)); - case 16: - return DAG.getTargetExtractSubreg(AArch64::hsub, dl, Op.getValueType(), - Op.getOperand(0)); - case 32: - return DAG.getTargetExtractSubreg(AArch64::ssub, dl, Op.getValueType(), - Op.getOperand(0)); - case 64: - return DAG.getTargetExtractSubreg(AArch64::dsub, dl, Op.getValueType(), - Op.getOperand(0)); - default: - llvm_unreachable("Unexpected vector type in extract_subvector!"); - } - } + + // This will get lowered to an appropriate EXTRACT_SUBREG in ISel. + if (Val == 0) + return Op; + // If this is extracting the upper 64-bits of a 128-bit vector, we match // that directly. if (Size == 64 && Val * VT.getVectorElementType().getSizeInBits() == 64) @@ -6213,26 +6512,20 @@ static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) { /// 0 <= Value <= ElementBits for a long left shift. 
static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) { assert(VT.isVector() && "vector shift count is not a vector type"); - unsigned ElementBits = VT.getVectorElementType().getSizeInBits(); + int64_t ElementBits = VT.getVectorElementType().getSizeInBits(); if (!getVShiftImm(Op, ElementBits, Cnt)) return false; return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits); } /// isVShiftRImm - Check if this is a valid build_vector for the immediate -/// operand of a vector shift right operation. For a shift opcode, the value -/// is positive, but for an intrinsic the value count must be negative. The -/// absolute value must be in the range: -/// 1 <= |Value| <= ElementBits for a right shift; or -/// 1 <= |Value| <= ElementBits/2 for a narrow right shift. -static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic, - int64_t &Cnt) { +/// operand of a vector shift right operation. The value must be in the range: +/// 1 <= Value <= ElementBits for a right shift; or +static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt) { assert(VT.isVector() && "vector shift count is not a vector type"); - unsigned ElementBits = VT.getVectorElementType().getSizeInBits(); + int64_t ElementBits = VT.getVectorElementType().getSizeInBits(); if (!getVShiftImm(Op, ElementBits, Cnt)) return false; - if (isIntrinsic) - Cnt = -Cnt; return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits)); } @@ -6261,8 +6554,7 @@ SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op, case ISD::SRA: case ISD::SRL: // Right shift immediate - if (isVShiftRImm(Op.getOperand(1), VT, false, false, Cnt) && - Cnt < EltSize) { + if (isVShiftRImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) { unsigned Opc = (Op.getOpcode() == ISD::SRA) ? AArch64ISD::VASHR : AArch64ISD::VLSHR; return DAG.getNode(Opc, DL, VT, Op.getOperand(0), @@ -6451,7 +6743,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, case Intrinsic::aarch64_neon_ld4r: { Info.opc = ISD::INTRINSIC_W_CHAIN; // Conservatively set memVT to the entire set of vectors loaded. - uint64_t NumElts = DL.getTypeAllocSize(I.getType()) / 8; + uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64; Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1); Info.offset = 0; @@ -6477,7 +6769,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Type *ArgTy = I.getArgOperand(ArgI)->getType(); if (!ArgTy->isVectorTy()) break; - NumElts += DL.getTypeAllocSize(ArgTy) / 8; + NumElts += DL.getTypeSizeInBits(ArgTy) / 64; } Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1); @@ -6720,10 +7012,10 @@ bool AArch64TargetLowering::lowerInterleavedLoad( const DataLayout &DL = LI->getModule()->getDataLayout(); VectorType *VecTy = Shuffles[0]->getType(); - unsigned VecSize = DL.getTypeAllocSizeInBits(VecTy); + unsigned VecSize = DL.getTypeSizeInBits(VecTy); - // Skip illegal vector types. - if (VecSize != 64 && VecSize != 128) + // Skip if we do not have NEON and skip illegal vector types. + if (!Subtarget->hasNEON() || (VecSize != 64 && VecSize != 128)) return false; // A pointer vector can not be the return type of the ldN intrinsics. 
Need to @@ -6806,10 +7098,10 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI, VectorType *SubVecTy = VectorType::get(EltTy, NumSubElts); const DataLayout &DL = SI->getModule()->getDataLayout(); - unsigned SubVecSize = DL.getTypeAllocSizeInBits(SubVecTy); + unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy); - // Skip illegal vector types. - if (SubVecSize != 64 && SubVecSize != 128) + // Skip if we do not have NEON and skip illegal vector types. + if (!Subtarget->hasNEON() || (SubVecSize != 64 && SubVecSize != 128)) return false; Value *Op0 = SVI->getOperand(0); @@ -7228,8 +7520,7 @@ static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget) { // First try to optimize away the conversion when it's conditionally from // a constant. Vectors only. - SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG); - if (Res != SDValue()) + if (SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG)) return Res; EVT VT = N->getValueType(0); @@ -7242,7 +7533,7 @@ static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG, // If the result of an integer load is only used by an integer-to-float // conversion, use a fp load instead and a AdvSIMD scalar {S|U}CVTF instead. - // This eliminates an "integer-to-vector-move UOP and improve throughput. + // This eliminates an "integer-to-vector-move" UOP and improves throughput. SDValue N0 = N->getOperand(0); if (Subtarget->hasNEON() && ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() && // Do not change the width of a volatile load. @@ -7265,6 +7556,134 @@ static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } +/// Fold a floating-point multiply by power of two into floating-point to +/// fixed-point conversion. +static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG, + const AArch64Subtarget *Subtarget) { + if (!Subtarget->hasNEON()) + return SDValue(); + + SDValue Op = N->getOperand(0); + if (!Op.getValueType().isVector() || Op.getOpcode() != ISD::FMUL) + return SDValue(); + + SDValue ConstVec = Op->getOperand(1); + if (!isa<BuildVectorSDNode>(ConstVec)) + return SDValue(); + + MVT FloatTy = Op.getSimpleValueType().getVectorElementType(); + uint32_t FloatBits = FloatTy.getSizeInBits(); + if (FloatBits != 32 && FloatBits != 64) + return SDValue(); + + MVT IntTy = N->getSimpleValueType(0).getVectorElementType(); + uint32_t IntBits = IntTy.getSizeInBits(); + if (IntBits != 16 && IntBits != 32 && IntBits != 64) + return SDValue(); + + // Avoid conversions where iN is larger than the float (e.g., float -> i64). + if (IntBits > FloatBits) + return SDValue(); + + BitVector UndefElements; + BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec); + int32_t Bits = IntBits == 64 ? 64 : 32; + int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, Bits + 1); + if (C == -1 || C == 0 || C > Bits) + return SDValue(); + + MVT ResTy; + unsigned NumLanes = Op.getValueType().getVectorNumElements(); + switch (NumLanes) { + default: + return SDValue(); + case 2: + ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64; + break; + case 4: + ResTy = MVT::v4i32; + break; + } + + SDLoc DL(N); + bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT; + unsigned IntrinsicOpcode = IsSigned ? 
Intrinsic::aarch64_neon_vcvtfp2fxs + : Intrinsic::aarch64_neon_vcvtfp2fxu; + SDValue FixConv = + DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ResTy, + DAG.getConstant(IntrinsicOpcode, DL, MVT::i32), + Op->getOperand(0), DAG.getConstant(C, DL, MVT::i32)); + // We can handle smaller integers by generating an extra trunc. + if (IntBits < FloatBits) + FixConv = DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), FixConv); + + return FixConv; +} + +/// Fold a floating-point divide by power of two into fixed-point to +/// floating-point conversion. +static SDValue performFDivCombine(SDNode *N, SelectionDAG &DAG, + const AArch64Subtarget *Subtarget) { + if (!Subtarget->hasNEON()) + return SDValue(); + + SDValue Op = N->getOperand(0); + unsigned Opc = Op->getOpcode(); + if (!Op.getValueType().isVector() || + (Opc != ISD::SINT_TO_FP && Opc != ISD::UINT_TO_FP)) + return SDValue(); + + SDValue ConstVec = N->getOperand(1); + if (!isa<BuildVectorSDNode>(ConstVec)) + return SDValue(); + + MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType(); + int32_t IntBits = IntTy.getSizeInBits(); + if (IntBits != 16 && IntBits != 32 && IntBits != 64) + return SDValue(); + + MVT FloatTy = N->getSimpleValueType(0).getVectorElementType(); + int32_t FloatBits = FloatTy.getSizeInBits(); + if (FloatBits != 32 && FloatBits != 64) + return SDValue(); + + // Avoid conversions where iN is larger than the float (e.g., i64 -> float). + if (IntBits > FloatBits) + return SDValue(); + + BitVector UndefElements; + BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec); + int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, FloatBits + 1); + if (C == -1 || C == 0 || C > FloatBits) + return SDValue(); + + MVT ResTy; + unsigned NumLanes = Op.getValueType().getVectorNumElements(); + switch (NumLanes) { + default: + return SDValue(); + case 2: + ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64; + break; + case 4: + ResTy = MVT::v4i32; + break; + } + + SDLoc DL(N); + SDValue ConvInput = Op.getOperand(0); + bool IsSigned = Opc == ISD::SINT_TO_FP; + if (IntBits < FloatBits) + ConvInput = DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL, + ResTy, ConvInput); + + unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfxs2fp + : Intrinsic::aarch64_neon_vcvtfxu2fp; + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(), + DAG.getConstant(IntrinsicOpcode, DL, MVT::i32), ConvInput, + DAG.getConstant(C, DL, MVT::i32)); +} + /// An EXTR instruction is made up of two shifts, ORed together. This helper /// searches for and classifies those shifts. 
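As a standalone illustration of the pattern described above (not LLVM code; extr64 is a made-up name): EXTR selects a 64-bit window out of the concatenation hi:lo, and the hi == lo case is the ROR alias.

#include <cassert>
#include <cstdint>

// (lo >> lsb) | (hi << (64 - lsb)) is the value of EXTR Xd, hi, lo, #lsb;
// lsb == 0 is handled separately because a 64-bit shift by 64 is undefined
// in C++.
static uint64_t extr64(uint64_t hi, uint64_t lo, unsigned lsb) {
  assert(lsb < 64);
  return lsb == 0 ? lo : (lo >> lsb) | (hi << (64 - lsb));
}

int main() {
  uint64_t x = 0x0123456789abcdefULL;
  // With both operands equal, EXTR is a rotate right (the ROR alias).
  assert(extr64(x, x, 8) == 0xef0123456789abcdULL);
}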
static bool findEXTRHalf(SDValue N, SDValue &Src, uint32_t &ShiftAmount, @@ -7964,7 +8383,6 @@ static SDValue performIntrinsicCombine(SDNode *N, case Intrinsic::aarch64_neon_vcvtfxs2fp: case Intrinsic::aarch64_neon_vcvtfxu2fp: return tryCombineFixedPointConvert(N, DCI, DAG); - break; case Intrinsic::aarch64_neon_saddv: return combineAcrossLanesIntrinsic(AArch64ISD::SADDV, N, DAG); case Intrinsic::aarch64_neon_uaddv: @@ -7978,10 +8396,16 @@ static SDValue performIntrinsicCombine(SDNode *N, case Intrinsic::aarch64_neon_umaxv: return combineAcrossLanesIntrinsic(AArch64ISD::UMAXV, N, DAG); case Intrinsic::aarch64_neon_fmax: - return DAG.getNode(AArch64ISD::FMAX, SDLoc(N), N->getValueType(0), + return DAG.getNode(ISD::FMAXNAN, SDLoc(N), N->getValueType(0), N->getOperand(1), N->getOperand(2)); case Intrinsic::aarch64_neon_fmin: - return DAG.getNode(AArch64ISD::FMIN, SDLoc(N), N->getValueType(0), + return DAG.getNode(ISD::FMINNAN, SDLoc(N), N->getValueType(0), + N->getOperand(1), N->getOperand(2)); + case Intrinsic::aarch64_neon_fmaxnm: + return DAG.getNode(ISD::FMAXNUM, SDLoc(N), N->getValueType(0), + N->getOperand(1), N->getOperand(2)); + case Intrinsic::aarch64_neon_fminnm: + return DAG.getNode(ISD::FMINNUM, SDLoc(N), N->getValueType(0), N->getOperand(1), N->getOperand(2)); case Intrinsic::aarch64_neon_smull: case Intrinsic::aarch64_neon_umull: @@ -8141,7 +8565,7 @@ static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode *St) { unsigned Alignment = std::min(OrigAlignment, EltOffset); // Create scalar stores. This is at least as good as the code sequence for a - // split unaligned store wich is a dup.s, ext.b, and two stores. + // split unaligned store which is a dup.s, ext.b, and two stores. // Most of the time the three stores should be replaced by store pair // instructions (stp). SDLoc DL(St); @@ -8162,10 +8586,9 @@ static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode *St) { return NewST1; } -static SDValue performSTORECombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI, - SelectionDAG &DAG, - const AArch64Subtarget *Subtarget) { +static SDValue split16BStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, + SelectionDAG &DAG, + const AArch64Subtarget *Subtarget) { if (!DCI.isBeforeLegalize()) return SDValue(); @@ -8173,15 +8596,17 @@ static SDValue performSTORECombine(SDNode *N, if (S->isVolatile()) return SDValue(); + // FIXME: The logic for deciding if an unaligned store should be split should + // be included in TLI.allowsMisalignedMemoryAccesses(), and there should be + // a call to that function here. + // Cyclone has bad performance on unaligned 16B stores when crossing line and // page boundaries. We want to split such stores. if (!Subtarget->isCyclone()) return SDValue(); - // Don't split at Oz. - MachineFunction &MF = DAG.getMachineFunction(); - bool IsMinSize = MF.getFunction()->hasFnAttribute(Attribute::MinSize); - if (IsMinSize) + // Don't split at -Oz. + if (DAG.getMachineFunction().getFunction()->optForMinSize()) return SDValue(); SDValue StVal = S->getValue(); @@ -8204,8 +8629,7 @@ static SDValue performSTORECombine(SDNode *N, // If we get a splat of a scalar convert this vector store to a store of // scalars. They will be merged into store pairs thereby removing two // instructions. 
- SDValue ReplacedSplat = replaceSplatVectorStore(DAG, S); - if (ReplacedSplat != SDValue()) + if (SDValue ReplacedSplat = replaceSplatVectorStore(DAG, S)) return ReplacedSplat; SDLoc DL(S); @@ -8326,6 +8750,299 @@ static SDValue performPostLD1Combine(SDNode *N, return SDValue(); } +/// Simplify \Addr given that the top byte of it is ignored by HW during +/// address translation. +static bool performTBISimplification(SDValue Addr, + TargetLowering::DAGCombinerInfo &DCI, + SelectionDAG &DAG) { + APInt DemandedMask = APInt::getLowBitsSet(64, 56); + APInt KnownZero, KnownOne; + TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(), + DCI.isBeforeLegalizeOps()); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (TLI.SimplifyDemandedBits(Addr, DemandedMask, KnownZero, KnownOne, TLO)) { + DCI.CommitTargetLoweringOpt(TLO); + return true; + } + return false; +} + +static SDValue performSTORECombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + SelectionDAG &DAG, + const AArch64Subtarget *Subtarget) { + SDValue Split = split16BStores(N, DCI, DAG, Subtarget); + if (Split.getNode()) + return Split; + + if (Subtarget->supportsAddressTopByteIgnored() && + performTBISimplification(N->getOperand(2), DCI, DAG)) + return SDValue(N, 0); + + return SDValue(); +} + + /// This function handles the log2-shuffle pattern produced by the +/// LoopVectorizer for the across vector reduction. It consists of +/// log2(NumVectorElements) steps and, in each step, 2^(s) elements +/// are reduced, where s is an induction variable from 0 to +/// log2(NumVectorElements). +static SDValue tryMatchAcrossLaneShuffleForReduction(SDNode *N, SDValue OpV, + unsigned Op, + SelectionDAG &DAG) { + EVT VTy = OpV->getOperand(0).getValueType(); + if (!VTy.isVector()) + return SDValue(); + + int NumVecElts = VTy.getVectorNumElements(); + if (Op == ISD::FMAXNUM || Op == ISD::FMINNUM) { + if (NumVecElts != 4) + return SDValue(); + } else { + if (NumVecElts != 4 && NumVecElts != 8 && NumVecElts != 16) + return SDValue(); + } + + int NumExpectedSteps = APInt(8, NumVecElts).logBase2(); + SDValue PreOp = OpV; + // Iterate over each step of the across vector reduction. + for (int CurStep = 0; CurStep != NumExpectedSteps; ++CurStep) { + SDValue CurOp = PreOp.getOperand(0); + SDValue Shuffle = PreOp.getOperand(1); + if (Shuffle.getOpcode() != ISD::VECTOR_SHUFFLE) { + // Try to swap the 1st and 2nd operand as add and min/max instructions + // are commutative. + CurOp = PreOp.getOperand(1); + Shuffle = PreOp.getOperand(0); + if (Shuffle.getOpcode() != ISD::VECTOR_SHUFFLE) + return SDValue(); + } + + // Check if the input vector is fed by the operator we want to handle, + // except the last step; the very first input vector is not necessarily + // the same operator we are handling. + if (CurOp.getOpcode() != Op && (CurStep != (NumExpectedSteps - 1))) + return SDValue(); + + // Check if it forms one step of the across vector reduction. + // E.g., + // %cur = add %1, %0 + // %shuffle = vector_shuffle %cur, <2, 3, u, u> + // %pre = add %cur, %shuffle + if (Shuffle.getOperand(0) != CurOp) + return SDValue(); + + int NumMaskElts = 1 << CurStep; + ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Shuffle)->getMask(); + // Check mask values in each step. + // We expect the shuffle mask in each step follows a specific pattern + // denoted here by the <M, U> form, where M is a sequence of integers + // starting from NumMaskElts, increasing by 1, and the number integers + // in M should be NumMaskElts. 
U is a sequence of UNDEFs and the number + // of undef in U should be NumVecElts - NumMaskElts. + // E.g., for <8 x i16>, mask values in each step should be : + // step 0 : <1,u,u,u,u,u,u,u> + // step 1 : <2,3,u,u,u,u,u,u> + // step 2 : <4,5,6,7,u,u,u,u> + for (int i = 0; i < NumVecElts; ++i) + if ((i < NumMaskElts && Mask[i] != (NumMaskElts + i)) || + (i >= NumMaskElts && !(Mask[i] < 0))) + return SDValue(); + + PreOp = CurOp; + } + unsigned Opcode; + bool IsIntrinsic = false; + + switch (Op) { + default: + llvm_unreachable("Unexpected operator for across vector reduction"); + case ISD::ADD: + Opcode = AArch64ISD::UADDV; + break; + case ISD::SMAX: + Opcode = AArch64ISD::SMAXV; + break; + case ISD::UMAX: + Opcode = AArch64ISD::UMAXV; + break; + case ISD::SMIN: + Opcode = AArch64ISD::SMINV; + break; + case ISD::UMIN: + Opcode = AArch64ISD::UMINV; + break; + case ISD::FMAXNUM: + Opcode = Intrinsic::aarch64_neon_fmaxnmv; + IsIntrinsic = true; + break; + case ISD::FMINNUM: + Opcode = Intrinsic::aarch64_neon_fminnmv; + IsIntrinsic = true; + break; + } + SDLoc DL(N); + + return IsIntrinsic + ? DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, N->getValueType(0), + DAG.getConstant(Opcode, DL, MVT::i32), PreOp) + : DAG.getNode( + ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), + DAG.getNode(Opcode, DL, PreOp.getSimpleValueType(), PreOp), + DAG.getConstant(0, DL, MVT::i64)); +} + +/// Target-specific DAG combine for the across vector min/max reductions. +/// This function specifically handles the final clean-up step of the vector +/// min/max reductions produced by the LoopVectorizer. It is the log2-shuffle +/// pattern, which narrows down and finds the final min/max value from all +/// elements of the vector. +/// For example, for a <16 x i8> vector : +/// svn0 = vector_shuffle %0, undef<8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u> +/// %smax0 = smax %arr, svn0 +/// %svn1 = vector_shuffle %smax0, undef<4,5,6,7,u,u,u,u,u,u,u,u,u,u,u,u> +/// %smax1 = smax %smax0, %svn1 +/// %svn2 = vector_shuffle %smax1, undef<2,3,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +/// %smax2 = smax %smax1, svn2 +/// %svn3 = vector_shuffle %smax2, undef<1,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +/// %sc = setcc %smax2, %svn3, gt +/// %n0 = extract_vector_elt %sc, #0 +/// %n1 = extract_vector_elt %smax2, #0 +/// %n2 = extract_vector_elt $smax2, #1 +/// %result = select %n0, %n1, n2 +/// becomes : +/// %1 = smaxv %0 +/// %result = extract_vector_elt %1, 0 +static SDValue +performAcrossLaneMinMaxReductionCombine(SDNode *N, SelectionDAG &DAG, + const AArch64Subtarget *Subtarget) { + if (!Subtarget->hasNEON()) + return SDValue(); + + SDValue N0 = N->getOperand(0); + SDValue IfTrue = N->getOperand(1); + SDValue IfFalse = N->getOperand(2); + + // Check if the SELECT merges up the final result of the min/max + // from a vector. + if (N0.getOpcode() != ISD::EXTRACT_VECTOR_ELT || + IfTrue.getOpcode() != ISD::EXTRACT_VECTOR_ELT || + IfFalse.getOpcode() != ISD::EXTRACT_VECTOR_ELT) + return SDValue(); + + // Expect N0 is fed by SETCC. + SDValue SetCC = N0.getOperand(0); + EVT SetCCVT = SetCC.getValueType(); + if (SetCC.getOpcode() != ISD::SETCC || !SetCCVT.isVector() || + SetCCVT.getVectorElementType() != MVT::i1) + return SDValue(); + + SDValue VectorOp = SetCC.getOperand(0); + unsigned Op = VectorOp->getOpcode(); + // Check if the input vector is fed by the operator we want to handle. 
+ if (Op != ISD::SMAX && Op != ISD::UMAX && Op != ISD::SMIN && + Op != ISD::UMIN && Op != ISD::FMAXNUM && Op != ISD::FMINNUM) + return SDValue(); + + EVT VTy = VectorOp.getValueType(); + if (!VTy.isVector()) + return SDValue(); + + if (VTy.getSizeInBits() < 64) + return SDValue(); + + EVT EltTy = VTy.getVectorElementType(); + if (Op == ISD::FMAXNUM || Op == ISD::FMINNUM) { + if (EltTy != MVT::f32) + return SDValue(); + } else { + if (EltTy != MVT::i32 && EltTy != MVT::i16 && EltTy != MVT::i8) + return SDValue(); + } + + // Check if extracting from the same vector. + // For example, + // %sc = setcc %vector, %svn1, gt + // %n0 = extract_vector_elt %sc, #0 + // %n1 = extract_vector_elt %vector, #0 + // %n2 = extract_vector_elt $vector, #1 + if (!(VectorOp == IfTrue->getOperand(0) && + VectorOp == IfFalse->getOperand(0))) + return SDValue(); + + // Check if the condition code is matched with the operator type. + ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get(); + if ((Op == ISD::SMAX && CC != ISD::SETGT && CC != ISD::SETGE) || + (Op == ISD::UMAX && CC != ISD::SETUGT && CC != ISD::SETUGE) || + (Op == ISD::SMIN && CC != ISD::SETLT && CC != ISD::SETLE) || + (Op == ISD::UMIN && CC != ISD::SETULT && CC != ISD::SETULE) || + (Op == ISD::FMAXNUM && CC != ISD::SETOGT && CC != ISD::SETOGE && + CC != ISD::SETUGT && CC != ISD::SETUGE && CC != ISD::SETGT && + CC != ISD::SETGE) || + (Op == ISD::FMINNUM && CC != ISD::SETOLT && CC != ISD::SETOLE && + CC != ISD::SETULT && CC != ISD::SETULE && CC != ISD::SETLT && + CC != ISD::SETLE)) + return SDValue(); + + // Expect to check only lane 0 from the vector SETCC. + if (!isNullConstant(N0.getOperand(1))) + return SDValue(); + + // Expect to extract the true value from lane 0. + if (!isNullConstant(IfTrue.getOperand(1))) + return SDValue(); + + // Expect to extract the false value from lane 1. + if (!isOneConstant(IfFalse.getOperand(1))) + return SDValue(); + + return tryMatchAcrossLaneShuffleForReduction(N, SetCC, Op, DAG); +} + +/// Target-specific DAG combine for the across vector add reduction. +/// This function specifically handles the final clean-up step of the vector +/// add reduction produced by the LoopVectorizer. It is the log2-shuffle +/// pattern, which adds all elements of a vector together. +/// For example, for a <4 x i32> vector : +/// %1 = vector_shuffle %0, <2,3,u,u> +/// %2 = add %0, %1 +/// %3 = vector_shuffle %2, <1,u,u,u> +/// %4 = add %2, %3 +/// %result = extract_vector_elt %4, 0 +/// becomes : +/// %0 = uaddv %0 +/// %result = extract_vector_elt %0, 0 +static SDValue +performAcrossLaneAddReductionCombine(SDNode *N, SelectionDAG &DAG, + const AArch64Subtarget *Subtarget) { + if (!Subtarget->hasNEON()) + return SDValue(); + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + + // Check if the input vector is fed by the ADD. + if (N0->getOpcode() != ISD::ADD) + return SDValue(); + + // The vector extract idx must constant zero because we only expect the final + // result of the reduction is placed in lane 0. + if (!isNullConstant(N1)) + return SDValue(); + + EVT VTy = N0.getValueType(); + if (!VTy.isVector()) + return SDValue(); + + EVT EltTy = VTy.getVectorElementType(); + if (EltTy != MVT::i32 && EltTy != MVT::i16 && EltTy != MVT::i8) + return SDValue(); + + if (VTy.getSizeInBits() < 64) + return SDValue(); + + return tryMatchAcrossLaneShuffleForReduction(N, N0, ISD::ADD, DAG); +} + /// Target-specific DAG combine function for NEON load/store intrinsics /// to merge base address updates. 
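A minimal standalone model (not LLVM code; addv4 is a made-up name) of the log2-shuffle ladder that the two across-lane reduction combines above recognise, shown for a 4-lane add: each step folds the upper 2^s lanes onto the lower ones, and lane 0 of the final vector holds the full reduction, which is exactly what a single UADDV produces.

#include <array>
#include <cassert>
#include <cstdint>

// Step 0 corresponds to "shuffle %0, <2,3,u,u>; add", step 1 to
// "shuffle %2, <1,u,u,u>; add"; the combine replaces the whole ladder plus
// the final extract of lane 0 with one across-lanes reduction node.
static uint32_t addv4(std::array<uint32_t, 4> v) {
  for (int i = 0; i < 2; ++i)   // step 0: fold lanes 2..3 onto lanes 0..1
    v[i] += v[i + 2];
  v[0] += v[1];                 // step 1: fold lane 1 onto lane 0
  return v[0];                  // extract_vector_elt ..., #0
}

int main() {
  assert(addv4({1, 2, 3, 4}) == 10);
}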
static SDValue performNEONPostLDSTCombine(SDNode *N, @@ -8751,10 +9468,10 @@ static SDValue performBRCONDCombine(SDNode *N, if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64) return SDValue(); - if (isa<ConstantSDNode>(LHS) && cast<ConstantSDNode>(LHS)->isNullValue()) + if (isNullConstant(LHS)) std::swap(LHS, RHS); - if (!isa<ConstantSDNode>(RHS) || !cast<ConstantSDNode>(RHS)->isNullValue()) + if (!isNullConstant(RHS)) return SDValue(); if (LHS.getOpcode() == ISD::SHL || LHS.getOpcode() == ISD::SRA || @@ -8868,75 +9585,6 @@ static SDValue performSelectCombine(SDNode *N, return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2)); } -/// performSelectCCCombine - Target-specific DAG combining for ISD::SELECT_CC -/// to match FMIN/FMAX patterns. -static SDValue performSelectCCCombine(SDNode *N, SelectionDAG &DAG) { - // Try to use FMIN/FMAX instructions for FP selects like "x < y ? x : y". - // Unless the NoNaNsFPMath option is set, be careful about NaNs: - // vmax/vmin return NaN if either operand is a NaN; - // only do the transformation when it matches that behavior. - - SDValue CondLHS = N->getOperand(0); - SDValue CondRHS = N->getOperand(1); - SDValue LHS = N->getOperand(2); - SDValue RHS = N->getOperand(3); - ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(4))->get(); - - unsigned Opcode; - bool IsReversed; - if (selectCCOpsAreFMaxCompatible(CondLHS, LHS) && - selectCCOpsAreFMaxCompatible(CondRHS, RHS)) { - IsReversed = false; // x CC y ? x : y - } else if (selectCCOpsAreFMaxCompatible(CondRHS, LHS) && - selectCCOpsAreFMaxCompatible(CondLHS, RHS)) { - IsReversed = true ; // x CC y ? y : x - } else { - return SDValue(); - } - - bool IsUnordered = false, IsOrEqual; - switch (CC) { - default: - return SDValue(); - case ISD::SETULT: - case ISD::SETULE: - IsUnordered = true; - case ISD::SETOLT: - case ISD::SETOLE: - case ISD::SETLT: - case ISD::SETLE: - IsOrEqual = (CC == ISD::SETLE || CC == ISD::SETOLE || CC == ISD::SETULE); - Opcode = IsReversed ? AArch64ISD::FMAX : AArch64ISD::FMIN; - break; - - case ISD::SETUGT: - case ISD::SETUGE: - IsUnordered = true; - case ISD::SETOGT: - case ISD::SETOGE: - case ISD::SETGT: - case ISD::SETGE: - IsOrEqual = (CC == ISD::SETGE || CC == ISD::SETOGE || CC == ISD::SETUGE); - Opcode = IsReversed ? AArch64ISD::FMIN : AArch64ISD::FMAX; - break; - } - - // If LHS is NaN, an ordered comparison will be false and the result will be - // the RHS, but FMIN(NaN, RHS) = FMAX(NaN, RHS) = NaN. Avoid this by checking - // that LHS != NaN. Likewise, for unordered comparisons, check for RHS != NaN. - if (!DAG.isKnownNeverNaN(IsUnordered ? RHS : LHS)) - return SDValue(); - - // For xxx-or-equal comparisons, "+0 <= -0" and "-0 >= +0" will both be true, - // but FMIN will return -0, and FMAX will return +0. So FMIN/FMAX can only be - // used for unsafe math or if one of the operands is known to be nonzero. - if (IsOrEqual && !DAG.getTarget().Options.UnsafeFPMath && - !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) - return SDValue(); - - return DAG.getNode(Opcode, SDLoc(N), N->getValueType(0), LHS, RHS); -} - /// Get rid of unnecessary NVCASTs (that don't change the type). 
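The intrinsic mappings earlier in this patch distinguish two maximum semantics, and the select_cc combine deleted above had to reason about the NaN cases by hand; a small standalone illustration of the difference (not LLVM code; fmax_nan is a made-up name), assuming the usual quiet-NaN behaviour:

#include <cmath>
#include <cstdio>

// FMAXNAN-style maximum: like the AArch64 FMAX instruction, it returns NaN
// as soon as either input is NaN.
static float fmax_nan(float a, float b) {
  return (std::isnan(a) || std::isnan(b)) ? NAN : (a > b ? a : b);
}

int main() {
  float qnan = NAN;
  std::printf("FMAXNAN(1, NaN) = %f\n", fmax_nan(1.0f, qnan)); // nan
  // std::fmax has FMAXNUM/FMAXNM-style semantics: a quiet NaN operand loses.
  std::printf("FMAXNUM(1, NaN) = %f\n", std::fmax(1.0f, qnan)); // 1.000000
}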
static SDValue performNVCASTCombine(SDNode *N) { if (N->getValueType(0) == N->getOperand(0).getValueType()) @@ -8961,6 +9609,11 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, case ISD::SINT_TO_FP: case ISD::UINT_TO_FP: return performIntToFpCombine(N, DAG, Subtarget); + case ISD::FP_TO_SINT: + case ISD::FP_TO_UINT: + return performFpToIntCombine(N, DAG, Subtarget); + case ISD::FDIV: + return performFDivCombine(N, DAG, Subtarget); case ISD::OR: return performORCombine(N, DCI, Subtarget); case ISD::INTRINSIC_WO_CHAIN: @@ -8973,12 +9626,18 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, return performBitcastCombine(N, DCI, DAG); case ISD::CONCAT_VECTORS: return performConcatVectorsCombine(N, DCI, DAG); - case ISD::SELECT: - return performSelectCombine(N, DCI); + case ISD::SELECT: { + SDValue RV = performSelectCombine(N, DCI); + if (!RV.getNode()) + RV = performAcrossLaneMinMaxReductionCombine(N, DAG, Subtarget); + return RV; + } case ISD::VSELECT: return performVSelectCombine(N, DCI.DAG); - case ISD::SELECT_CC: - return performSelectCCCombine(N, DCI.DAG); + case ISD::LOAD: + if (performTBISimplification(N->getOperand(1), DCI, DAG)) + return SDValue(N, 0); + break; case ISD::STORE: return performSTORECombine(N, DCI, DAG, Subtarget); case AArch64ISD::BRCOND: @@ -8991,6 +9650,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, return performNVCASTCombine(N); case ISD::INSERT_VECTOR_ELT: return performPostLD1Combine(N, DCI, true); + case ISD::EXTRACT_VECTOR_ELT: + return performAcrossLaneAddReductionCombine(N, DAG, Subtarget); case ISD::INTRINSIC_VOID: case ISD::INTRINSIC_W_CHAIN: switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) { @@ -9157,6 +9818,20 @@ static void ReplaceBITCASTResults(SDNode *N, SmallVectorImpl<SDValue> &Results, Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Op)); } +static void ReplaceReductionResults(SDNode *N, + SmallVectorImpl<SDValue> &Results, + SelectionDAG &DAG, unsigned InterOp, + unsigned AcrossOp) { + EVT LoVT, HiVT; + SDValue Lo, Hi; + SDLoc dl(N); + std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0)); + std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0); + SDValue InterVal = DAG.getNode(InterOp, dl, LoVT, Lo, Hi); + SDValue SplitVal = DAG.getNode(AcrossOp, dl, LoVT, InterVal); + Results.push_back(SplitVal); +} + void AArch64TargetLowering::ReplaceNodeResults( SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const { switch (N->getOpcode()) { @@ -9165,6 +9840,24 @@ void AArch64TargetLowering::ReplaceNodeResults( case ISD::BITCAST: ReplaceBITCASTResults(N, Results, DAG); return; + case AArch64ISD::SADDV: + ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::SADDV); + return; + case AArch64ISD::UADDV: + ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::UADDV); + return; + case AArch64ISD::SMINV: + ReplaceReductionResults(N, Results, DAG, ISD::SMIN, AArch64ISD::SMINV); + return; + case AArch64ISD::UMINV: + ReplaceReductionResults(N, Results, DAG, ISD::UMIN, AArch64ISD::UMINV); + return; + case AArch64ISD::SMAXV: + ReplaceReductionResults(N, Results, DAG, ISD::SMAX, AArch64ISD::SMAXV); + return; + case AArch64ISD::UMAXV: + ReplaceReductionResults(N, Results, DAG, ISD::UMAX, AArch64ISD::UMAXV); + return; case ISD::FP_TO_UINT: case ISD::FP_TO_SINT: assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion"); @@ -9177,10 +9870,10 @@ bool AArch64TargetLowering::useLoadStackGuardNode() const { return true; } -bool 
AArch64TargetLowering::combineRepeatedFPDivisors(unsigned NumUsers) const { +unsigned AArch64TargetLowering::combineRepeatedFPDivisors() const { // Combine multiple FDIVs with the same divisor into multiple FMULs by the // reciprocal if there are three or more FDIVs. - return NumUsers > 2; + return 3; } TargetLoweringBase::LegalizeTypeAction @@ -9206,20 +9899,21 @@ bool AArch64TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { // Loads and stores less than 128-bits are already atomic; ones above that // are doomed anyway, so defer to the default libcall and blame the OS when // things go wrong. -bool AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { +TargetLowering::AtomicExpansionKind +AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { unsigned Size = LI->getType()->getPrimitiveSizeInBits(); - return Size == 128; + return Size == 128 ? AtomicExpansionKind::LLSC : AtomicExpansionKind::None; } // For the real atomic operations, we have ldxr/stxr up to 128 bits, -TargetLoweringBase::AtomicRMWExpansionKind +TargetLowering::AtomicExpansionKind AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { unsigned Size = AI->getType()->getPrimitiveSizeInBits(); - return Size <= 128 ? AtomicRMWExpansionKind::LLSC - : AtomicRMWExpansionKind::None; + return Size <= 128 ? AtomicExpansionKind::LLSC : AtomicExpansionKind::None; } -bool AArch64TargetLowering::hasLoadLinkedStoreConditional() const { +bool AArch64TargetLowering::shouldExpandAtomicCmpXchgInIR( + AtomicCmpXchgInst *AI) const { return true; } @@ -9258,6 +9952,13 @@ Value *AArch64TargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr, cast<PointerType>(Addr->getType())->getElementType()); } +void AArch64TargetLowering::emitAtomicCmpXchgNoStoreLLBalance( + IRBuilder<> &Builder) const { + Module *M = Builder.GetInsertBlock()->getParent()->getParent(); + Builder.CreateCall( + llvm::Intrinsic::getDeclaration(M, Intrinsic::aarch64_clrex)); +} + Value *AArch64TargetLowering::emitStoreConditional(IRBuilder<> &Builder, Value *Val, Value *Addr, AtomicOrdering Ord) const { @@ -9294,3 +9995,70 @@ bool AArch64TargetLowering::functionArgumentNeedsConsecutiveRegisters( Type *Ty, CallingConv::ID CallConv, bool isVarArg) const { return Ty->isArrayTy(); } + +bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &, + EVT) const { + return false; +} + +Value *AArch64TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const { + if (!Subtarget->isTargetAndroid()) + return TargetLowering::getSafeStackPointerLocation(IRB); + + // Android provides a fixed TLS slot for the SafeStack pointer. See the + // definition of TLS_SLOT_SAFESTACK in + // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h + const unsigned TlsOffset = 0x48; + Module *M = IRB.GetInsertBlock()->getParent()->getParent(); + Function *ThreadPointerFunc = + Intrinsic::getDeclaration(M, Intrinsic::aarch64_thread_pointer); + return IRB.CreatePointerCast( + IRB.CreateConstGEP1_32(IRB.CreateCall(ThreadPointerFunc), TlsOffset), + Type::getInt8PtrTy(IRB.getContext())->getPointerTo(0)); +} + +void AArch64TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const { + // Update IsSplitCSR in AArch64unctionInfo. 
+ AArch64FunctionInfo *AFI = Entry->getParent()->getInfo<AArch64FunctionInfo>(); + AFI->setIsSplitCSR(true); +} + +void AArch64TargetLowering::insertCopiesSplitCSR( + MachineBasicBlock *Entry, + const SmallVectorImpl<MachineBasicBlock *> &Exits) const { + const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); + const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent()); + if (!IStart) + return; + + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo(); + for (const MCPhysReg *I = IStart; *I; ++I) { + const TargetRegisterClass *RC = nullptr; + if (AArch64::GPR64RegClass.contains(*I)) + RC = &AArch64::GPR64RegClass; + else if (AArch64::FPR64RegClass.contains(*I)) + RC = &AArch64::FPR64RegClass; + else + llvm_unreachable("Unexpected register class in CSRsViaCopy!"); + + unsigned NewVR = MRI->createVirtualRegister(RC); + // Create copy from CSR to a virtual register. + // FIXME: this currently does not emit CFI pseudo-instructions, it works + // fine for CXX_FAST_TLS since the C++-style TLS access functions should be + // nounwind. If we want to generalize this later, we may need to emit + // CFI pseudo-instructions. + assert(Entry->getParent()->getFunction()->hasFnAttribute( + Attribute::NoUnwind) && + "Function should be nounwind in insertCopiesSplitCSR!"); + Entry->addLiveIn(*I); + BuildMI(*Entry, Entry->begin(), DebugLoc(), TII->get(TargetOpcode::COPY), + NewVR) + .addReg(*I); + + for (auto *Exit : Exits) + BuildMI(*Exit, Exit->begin(), DebugLoc(), TII->get(TargetOpcode::COPY), + *I) + .addReg(NewVR); + } +} diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.h index c73ce1e54b3e..e99616c94068 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -15,6 +15,7 @@ #ifndef LLVM_LIB_TARGET_AARCH64_AARCH64ISELLOWERING_H #define LLVM_LIB_TARGET_AARCH64_AARCH64ISELLOWERING_H +#include "AArch64.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/IR/CallingConv.h" @@ -58,13 +59,14 @@ enum NodeType : unsigned { SBCS, ANDS, + // Conditional compares. Operands: left,right,falsecc,cc,flags + CCMP, + CCMN, + FCCMP, + // Floating point comparison FCMP, - // Floating point max and min instructions. - FMAX, - FMIN, - // Scalar extract EXTR, @@ -217,8 +219,6 @@ class AArch64Subtarget; class AArch64TargetMachine; class AArch64TargetLowering : public TargetLowering { - bool RequireStrictAlign; - public: explicit AArch64TargetLowering(const TargetMachine &TM, const AArch64Subtarget &STI); @@ -226,46 +226,35 @@ public: /// Selects the correct CCAssignFn for a given CallingConvention value. CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg) const; - /// computeKnownBitsForTargetNode - Determine which of the bits specified in - /// Mask are known to be either zero or one and return them in the - /// KnownZero/KnownOne bitsets. + /// Determine which of the bits specified in Mask are known to be either zero + /// or one and return them in the KnownZero/KnownOne bitsets. void computeKnownBitsForTargetNode(const SDValue Op, APInt &KnownZero, APInt &KnownOne, const SelectionDAG &DAG, unsigned Depth = 0) const override; MVT getScalarShiftAmountTy(const DataLayout &DL, EVT) const override; - /// allowsMisalignedMemoryAccesses - Returns true if the target allows - /// unaligned memory accesses of the specified type. 
+ /// Returns true if the target allows unaligned memory accesses of the + /// specified type. bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace = 0, unsigned Align = 1, - bool *Fast = nullptr) const override { - if (RequireStrictAlign) - return false; - // FIXME: True for Cyclone, but not necessary others. - if (Fast) - *Fast = true; - return true; - } + bool *Fast = nullptr) const override; - /// LowerOperation - Provide custom lowering hooks for some operations. + /// Provide custom lowering hooks for some operations. SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; const char *getTargetNodeName(unsigned Opcode) const override; SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; - /// getFunctionAlignment - Return the Log2 alignment of this function. - unsigned getFunctionAlignment(const Function *F) const; - /// Returns true if a cast between SrcAS and DestAS is a noop. bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override { // Addrspacecasts are always noops. return true; } - /// createFastISel - This method returns a target specific FastISel object, - /// or null if the target does not support "fast" ISel. + /// This method returns a target specific FastISel object, or null if the + /// target does not support "fast" ISel. FastISel *createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const override; @@ -273,11 +262,11 @@ public: bool isFPImmLegal(const APFloat &Imm, EVT VT) const override; - /// isShuffleMaskLegal - Return true if the given shuffle mask can be - /// codegen'd directly, or if it should be stack expanded. + /// Return true if the given shuffle mask can be codegen'd directly, or if it + /// should be stack expanded. bool isShuffleMaskLegal(const SmallVectorImpl<int> &M, EVT VT) const override; - /// getSetCCResultType - Return the ISD::SETCC ValueType + /// Return the ISD::SETCC ValueType. EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override; @@ -322,8 +311,8 @@ public: bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc, MachineFunction &MF) const override; - /// isLegalAddressingMode - Return true if the addressing mode represented - /// by AM is legal for this target, for a load/store of the specified type. + /// Return true if the addressing mode represented by AM is legal for this + /// target, for a load/store of the specified type. bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS) const override; @@ -335,10 +324,9 @@ public: int getScalingFactorCost(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS) const override; - /// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster - /// than a pair of fmul and fadd instructions. fmuladd intrinsics will be - /// expanded to FMAs when this method returns true, otherwise fmuladd is - /// expanded to fmul + fadd. + /// Return true if an FMA operation is faster than a pair of fmul and fadd + /// instructions. fmuladd intrinsics will be expanded to FMAs when this method + /// returns true, otherwise fmuladd is expanded to fmul + fadd. 
bool isFMAFasterThanFMulAndFAdd(EVT VT) const override; const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const override; @@ -351,25 +339,65 @@ public: bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override; - bool hasLoadLinkedStoreConditional() const override; Value *emitLoadLinked(IRBuilder<> &Builder, Value *Addr, AtomicOrdering Ord) const override; Value *emitStoreConditional(IRBuilder<> &Builder, Value *Val, Value *Addr, AtomicOrdering Ord) const override; - bool shouldExpandAtomicLoadInIR(LoadInst *LI) const override; + void emitAtomicCmpXchgNoStoreLLBalance(IRBuilder<> &Builder) const override; + + TargetLoweringBase::AtomicExpansionKind + shouldExpandAtomicLoadInIR(LoadInst *LI) const override; bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override; - TargetLoweringBase::AtomicRMWExpansionKind + TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override; + bool shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override; + bool useLoadStackGuardNode() const override; TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(EVT VT) const override; + /// If the target has a standard location for the unsafe stack pointer, + /// returns the address of that location. Otherwise, returns nullptr. + Value *getSafeStackPointerLocation(IRBuilder<> &IRB) const override; + + /// If a physical register, this returns the register that receives the + /// exception address on entry to an EH pad. + unsigned + getExceptionPointerRegister(const Constant *PersonalityFn) const override { + // FIXME: This is a guess. Has this been defined yet? + return AArch64::X0; + } + + /// If a physical register, this returns the register that receives the + /// exception typeid on entry to a landing pad. + unsigned + getExceptionSelectorRegister(const Constant *PersonalityFn) const override { + // FIXME: This is a guess. Has this been defined yet? + return AArch64::X1; + } + + bool isCheapToSpeculateCttz() const override { + return true; + } + + bool isCheapToSpeculateCtlz() const override { + return true; + } + bool supportSplitCSR(MachineFunction *MF) const override { + return MF->getFunction()->getCallingConv() == CallingConv::CXX_FAST_TLS && + MF->getFunction()->hasFnAttribute(Attribute::NoUnwind); + } + void initializeSplitCSR(MachineBasicBlock *Entry) const override; + void insertCopiesSplitCSR( + MachineBasicBlock *Entry, + const SmallVectorImpl<MachineBasicBlock *> &Exits) const override; + private: bool isExtFreeImpl(const Instruction *Ext) const override; - /// Subtarget - Keep a pointer to the AArch64Subtarget around so that we can + /// Keep a pointer to the AArch64Subtarget around so that we can /// make the right decision when generating code for different targets. 
const AArch64Subtarget *Subtarget; @@ -392,6 +420,8 @@ private: SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn, SDValue ThisVal) const; + SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; + bool isEligibleForTailCallOptimization( SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, bool isCalleeStructRet, bool isCallerStructRet, @@ -470,7 +500,7 @@ private: SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, std::vector<SDNode *> *Created) const override; - bool combineRepeatedFPDivisors(unsigned NumUsers) const override; + unsigned combineRepeatedFPDivisors() const override; ConstraintType getConstraintType(StringRef Constraint) const override; unsigned getRegisterByName(const char* RegName, EVT VT, @@ -516,6 +546,8 @@ private: bool functionArgumentNeedsConsecutiveRegisters(Type *Ty, CallingConv::ID CallConv, bool isVarArg) const override; + + bool shouldNormalizeToSelectSequence(LLVMContext &, EVT) const override; }; namespace AArch64 { diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/contrib/llvm/lib/Target/AArch64/AArch64InstrFormats.td index 3f2e772a90c4..6ac2175e5035 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ b/contrib/llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -248,6 +248,12 @@ def simm7s16 : Operand<i32> { let PrintMethod = "printImmScale<16>"; } +def am_indexed7s8 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S8", []>; +def am_indexed7s16 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S16", []>; +def am_indexed7s32 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S32", []>; +def am_indexed7s64 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S64", []>; +def am_indexed7s128 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S128", []>; + class AsmImmRange<int Low, int High> : AsmOperandClass { let Name = "Imm" # Low # "_" # High; let DiagnosticType = "InvalidImm" # Low # "_" # High; @@ -346,9 +352,11 @@ class fixedpoint_i64<ValueType FloatVT> let ParserMatchClass = Imm1_64Operand; } +def fixedpoint_f16_i32 : fixedpoint_i32<f16>; def fixedpoint_f32_i32 : fixedpoint_i32<f32>; def fixedpoint_f64_i32 : fixedpoint_i32<f64>; +def fixedpoint_f16_i64 : fixedpoint_i64<f16>; def fixedpoint_f32_i64 : fixedpoint_i64<f32>; def fixedpoint_f64_i64 : fixedpoint_i64<f64>; @@ -402,6 +410,7 @@ def vecshiftR64Narrow : Operand<i32>, ImmLeaf<i32, [{ let ParserMatchClass = Imm1_32Operand; } +def Imm0_1Operand : AsmImmRange<0, 1>; def Imm0_7Operand : AsmImmRange<0, 7>; def Imm0_15Operand : AsmImmRange<0, 15>; def Imm0_31Operand : AsmImmRange<0, 31>; @@ -525,6 +534,20 @@ def imm0_31 : Operand<i64>, ImmLeaf<i64, [{ let ParserMatchClass = Imm0_31Operand; } +// True if the 32-bit immediate is in the range [0,31] +def imm32_0_31 : Operand<i32>, ImmLeaf<i32, [{ + return ((uint64_t)Imm) < 32; +}]> { + let ParserMatchClass = Imm0_31Operand; +} + +// imm0_1 predicate - True if the immediate is in the range [0,1] +def imm0_1 : Operand<i64>, ImmLeaf<i64, [{ + return ((uint64_t)Imm) < 2; +}]> { + let ParserMatchClass = Imm0_1Operand; +} + // imm0_15 predicate - True if the immediate is in the range [0,15] def imm0_15 : Operand<i64>, ImmLeaf<i64, [{ return ((uint64_t)Imm) < 16; @@ -542,7 +565,9 @@ def imm0_7 : Operand<i64>, ImmLeaf<i64, [{ // imm32_0_15 predicate - True if the 32-bit immediate is in the range [0,15] def imm32_0_15 : Operand<i32>, ImmLeaf<i32, [{ return ((uint32_t)Imm) < 16; -}]>; +}]> { + let ParserMatchClass = Imm0_15Operand; +} // An arithmetic shifter operand: // {7-6} - 
shift type: 00 = lsl, 01 = lsr, 10 = asr @@ -690,6 +715,17 @@ class arith_extended_reg32to64<ValueType Ty> : Operand<Ty>, } // Floating-point immediate. +def fpimm16 : Operand<f16>, + PatLeaf<(f16 fpimm), [{ + return AArch64_AM::getFP16Imm(N->getValueAPF()) != -1; + }], SDNodeXForm<fpimm, [{ + APFloat InVal = N->getValueAPF(); + uint32_t enc = AArch64_AM::getFP16Imm(InVal); + return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i32); + }]>> { + let ParserMatchClass = FPImmOperand; + let PrintMethod = "printFPImmOperand"; +} def fpimm32 : Operand<f32>, PatLeaf<(f32 fpimm), [{ return AArch64_AM::getFP32Imm(N->getValueAPF()) != -1; @@ -822,7 +858,7 @@ class RtSystemI<bit L, dag oops, dag iops, string asm, string operands> // model patterns with sufficiently fine granularity let mayStore = 1, mayLoad = 1, hasSideEffects = 1 in class HintI<string mnemonic> - : SimpleSystemI<0, (ins imm0_127:$imm), mnemonic#" $imm", "", + : SimpleSystemI<0, (ins imm0_127:$imm), mnemonic#"\t$imm", "", [(int_aarch64_hint imm0_127:$imm)]>, Sched<[WriteHint]> { bits <7> imm; @@ -875,6 +911,25 @@ def msr_sysreg_op : Operand<i32> { let PrintMethod = "printMSRSystemRegister"; } +def PSBHintOperand : AsmOperandClass { + let Name = "PSBHint"; + let ParserMethod = "tryParsePSBHint"; +} +def psbhint_op : Operand<i32> { + let ParserMatchClass = PSBHintOperand; + let PrintMethod = "printPSBHintOp"; + let MCOperandPredicate = [{ + // Check, if operand is valid, to fix exhaustive aliasing in disassembly. + // "psb" is an alias to "hint" only for certain values of CRm:Op2 fields. + if (!MCOp.isImm()) + return false; + bool ValidNamed; + (void)AArch64PSBHint::PSBHintMapper().toString(MCOp.getImm(), + STI.getFeatureBits(), ValidNamed); + return ValidNamed; + }]; +} + class MRSI : RtSystemI<1, (outs GPR64:$Rt), (ins mrs_sysreg_op:$systemreg), "mrs", "\t$Rt, $systemreg"> { bits<16> systemreg; @@ -890,19 +945,19 @@ class MSRI : RtSystemI<0, (outs), (ins msr_sysreg_op:$systemreg, GPR64:$Rt), let Inst{20-5} = systemreg; } -def SystemPStateFieldOperand : AsmOperandClass { - let Name = "SystemPStateField"; +def SystemPStateFieldWithImm0_15Operand : AsmOperandClass { + let Name = "SystemPStateFieldWithImm0_15"; let ParserMethod = "tryParseSysReg"; } -def pstatefield_op : Operand<i32> { - let ParserMatchClass = SystemPStateFieldOperand; +def pstatefield4_op : Operand<i32> { + let ParserMatchClass = SystemPStateFieldWithImm0_15Operand; let PrintMethod = "printSystemPStateField"; } let Defs = [NZCV] in -class MSRpstateI - : SimpleSystemI<0, (ins pstatefield_op:$pstate_field, imm0_15:$imm), - "msr", "\t$pstate_field, $imm">, +class MSRpstateImm0_15 + : SimpleSystemI<0, (ins pstatefield4_op:$pstatefield, imm0_15:$imm), + "msr", "\t$pstatefield, $imm">, Sched<[WriteSys]> { bits<6> pstatefield; bits<4> imm; @@ -913,6 +968,37 @@ class MSRpstateI let Inst{7-5} = pstatefield{2-0}; let DecoderMethod = "DecodeSystemPStateInstruction"; + // MSRpstateI aliases with MSRI. When the MSRpstateI decoder method returns + // Fail the decoder should attempt to decode the instruction as MSRI. 
+ let hasCompleteDecoder = 0; +} + +def SystemPStateFieldWithImm0_1Operand : AsmOperandClass { + let Name = "SystemPStateFieldWithImm0_1"; + let ParserMethod = "tryParseSysReg"; +} +def pstatefield1_op : Operand<i32> { + let ParserMatchClass = SystemPStateFieldWithImm0_1Operand; + let PrintMethod = "printSystemPStateField"; +} + +let Defs = [NZCV] in +class MSRpstateImm0_1 + : SimpleSystemI<0, (ins pstatefield1_op:$pstatefield, imm0_1:$imm), + "msr", "\t$pstatefield, $imm">, + Sched<[WriteSys]> { + bits<6> pstatefield; + bit imm; + let Inst{20-19} = 0b00; + let Inst{18-16} = pstatefield{5-3}; + let Inst{15-9} = 0b0100000; + let Inst{8} = imm; + let Inst{7-5} = pstatefield{2-0}; + + let DecoderMethod = "DecodeSystemPStateInstruction"; + // MSRpstateI aliases with MSRI. When the MSRpstateI decoder method returns + // Fail the decoder should attempt to decode the instruction as MSRI. + let hasCompleteDecoder = 0; } // SYS and SYSL generic system instructions. @@ -1341,7 +1427,7 @@ multiclass Shift<bits<2> shift_type, string asm, SDNode OpNode> { } class ShiftAlias<string asm, Instruction inst, RegisterClass regtype> - : InstAlias<asm#" $dst, $src1, $src2", + : InstAlias<asm#"\t$dst, $src1, $src2", (inst regtype:$dst, regtype:$src1, regtype:$src2), 0>; class BaseMulAccum<bit isSub, bits<3> opc, RegisterClass multype, @@ -1407,13 +1493,13 @@ class MulHi<bits<3> opc, string asm, SDNode OpNode> } class MulAccumWAlias<string asm, Instruction inst> - : InstAlias<asm#" $dst, $src1, $src2", + : InstAlias<asm#"\t$dst, $src1, $src2", (inst GPR32:$dst, GPR32:$src1, GPR32:$src2, WZR)>; class MulAccumXAlias<string asm, Instruction inst> - : InstAlias<asm#" $dst, $src1, $src2", + : InstAlias<asm#"\t$dst, $src1, $src2", (inst GPR64:$dst, GPR64:$src1, GPR64:$src2, XZR)>; class WideMulAccumAlias<string asm, Instruction inst> - : InstAlias<asm#" $dst, $src1, $src2", + : InstAlias<asm#"\t$dst, $src1, $src2", (inst GPR64:$dst, GPR32:$src1, GPR32:$src2, XZR)>; class BaseCRC32<bit sf, bits<2> sz, bit C, RegisterClass StreamReg, @@ -1643,7 +1729,7 @@ class BaseAddSubEReg64<bit isSub, bit setFlags, RegisterClass dstRegtype, class AddSubRegAlias<string asm, Instruction inst, RegisterClass dstRegtype, RegisterClass src1Regtype, RegisterClass src2Regtype, int shiftExt> - : InstAlias<asm#" $dst, $src1, $src2", + : InstAlias<asm#"\t$dst, $src1, $src2", (inst dstRegtype:$dst, src1Regtype:$src1, src2Regtype:$src2, shiftExt)>; @@ -1701,10 +1787,10 @@ multiclass AddSub<bit isSub, string mnemonic, string alias, } // add Rd, Rb, -imm -> sub Rd, Rn, imm - def : InstAlias<alias#" $Rd, $Rn, $imm", + def : InstAlias<alias#"\t$Rd, $Rn, $imm", (!cast<Instruction>(NAME # "Wri") GPR32sp:$Rd, GPR32sp:$Rn, addsub_shifted_imm32_neg:$imm), 0>; - def : InstAlias<alias#" $Rd, $Rn, $imm", + def : InstAlias<alias#"\t$Rd, $Rn, $imm", (!cast<Instruction>(NAME # "Xri") GPR64sp:$Rd, GPR64sp:$Rn, addsub_shifted_imm64_neg:$imm), 0>; @@ -1776,43 +1862,43 @@ multiclass AddSubS<bit isSub, string mnemonic, SDNode OpNode, string cmp, } // Defs = [NZCV] // Support negative immediates, e.g. 
adds Rd, Rn, -imm -> subs Rd, Rn, imm - def : InstAlias<alias#" $Rd, $Rn, $imm", + def : InstAlias<alias#"\t$Rd, $Rn, $imm", (!cast<Instruction>(NAME # "Wri") GPR32:$Rd, GPR32sp:$Rn, addsub_shifted_imm32_neg:$imm), 0>; - def : InstAlias<alias#" $Rd, $Rn, $imm", + def : InstAlias<alias#"\t$Rd, $Rn, $imm", (!cast<Instruction>(NAME # "Xri") GPR64:$Rd, GPR64sp:$Rn, addsub_shifted_imm64_neg:$imm), 0>; // Compare aliases - def : InstAlias<cmp#" $src, $imm", (!cast<Instruction>(NAME#"Wri") + def : InstAlias<cmp#"\t$src, $imm", (!cast<Instruction>(NAME#"Wri") WZR, GPR32sp:$src, addsub_shifted_imm32:$imm), 5>; - def : InstAlias<cmp#" $src, $imm", (!cast<Instruction>(NAME#"Xri") + def : InstAlias<cmp#"\t$src, $imm", (!cast<Instruction>(NAME#"Xri") XZR, GPR64sp:$src, addsub_shifted_imm64:$imm), 5>; - def : InstAlias<cmp#" $src1, $src2$sh", (!cast<Instruction>(NAME#"Wrx") + def : InstAlias<cmp#"\t$src1, $src2$sh", (!cast<Instruction>(NAME#"Wrx") WZR, GPR32sp:$src1, GPR32:$src2, arith_extend:$sh), 4>; - def : InstAlias<cmp#" $src1, $src2$sh", (!cast<Instruction>(NAME#"Xrx") + def : InstAlias<cmp#"\t$src1, $src2$sh", (!cast<Instruction>(NAME#"Xrx") XZR, GPR64sp:$src1, GPR32:$src2, arith_extend:$sh), 4>; - def : InstAlias<cmp#" $src1, $src2$sh", (!cast<Instruction>(NAME#"Xrx64") + def : InstAlias<cmp#"\t$src1, $src2$sh", (!cast<Instruction>(NAME#"Xrx64") XZR, GPR64sp:$src1, GPR64:$src2, arith_extendlsl64:$sh), 4>; - def : InstAlias<cmp#" $src1, $src2$sh", (!cast<Instruction>(NAME#"Wrs") + def : InstAlias<cmp#"\t$src1, $src2$sh", (!cast<Instruction>(NAME#"Wrs") WZR, GPR32:$src1, GPR32:$src2, arith_shift32:$sh), 4>; - def : InstAlias<cmp#" $src1, $src2$sh", (!cast<Instruction>(NAME#"Xrs") + def : InstAlias<cmp#"\t$src1, $src2$sh", (!cast<Instruction>(NAME#"Xrs") XZR, GPR64:$src1, GPR64:$src2, arith_shift64:$sh), 4>; // Support negative immediates, e.g. cmp Rn, -imm -> cmn Rn, imm - def : InstAlias<cmpAlias#" $src, $imm", (!cast<Instruction>(NAME#"Wri") + def : InstAlias<cmpAlias#"\t$src, $imm", (!cast<Instruction>(NAME#"Wri") WZR, GPR32sp:$src, addsub_shifted_imm32_neg:$imm), 0>; - def : InstAlias<cmpAlias#" $src, $imm", (!cast<Instruction>(NAME#"Xri") + def : InstAlias<cmpAlias#"\t$src, $imm", (!cast<Instruction>(NAME#"Xri") XZR, GPR64sp:$src, addsub_shifted_imm64_neg:$imm), 0>; // Compare shorthands - def : InstAlias<cmp#" $src1, $src2", (!cast<Instruction>(NAME#"Wrs") + def : InstAlias<cmp#"\t$src1, $src2", (!cast<Instruction>(NAME#"Wrs") WZR, GPR32:$src1, GPR32:$src2, 0), 5>; - def : InstAlias<cmp#" $src1, $src2", (!cast<Instruction>(NAME#"Xrs") + def : InstAlias<cmp#"\t$src1, $src2", (!cast<Instruction>(NAME#"Xrs") XZR, GPR64:$src1, GPR64:$src2, 0), 5>; - def : InstAlias<cmp#" $src1, $src2", (!cast<Instruction>(NAME#"Wrx") + def : InstAlias<cmp#"\t$src1, $src2", (!cast<Instruction>(NAME#"Wrx") WZR, GPR32sponly:$src1, GPR32:$src2, 16), 5>; - def : InstAlias<cmp#" $src1, $src2", (!cast<Instruction>(NAME#"Xrx64") + def : InstAlias<cmp#"\t$src1, $src2", (!cast<Instruction>(NAME#"Xrx64") XZR, GPR64sponly:$src1, GPR64:$src2, 24), 5>; // Register/register aliases with no shift when SP is not used. @@ -1998,7 +2084,7 @@ class BaseLogicalSReg<bits<2> opc, bit N, RegisterClass regtype, // Aliases for register+register logical instructions. 
class LogicalRegAlias<string asm, Instruction inst, RegisterClass regtype> - : InstAlias<asm#" $dst, $src1, $src2", + : InstAlias<asm#"\t$dst, $src1, $src2", (inst regtype:$dst, regtype:$src1, regtype:$src2, 0)>; multiclass LogicalImm<bits<2> opc, string mnemonic, SDNode OpNode, @@ -2017,10 +2103,10 @@ multiclass LogicalImm<bits<2> opc, string mnemonic, SDNode OpNode, let Inst{31} = 1; } - def : InstAlias<Alias # " $Rd, $Rn, $imm", + def : InstAlias<Alias # "\t$Rd, $Rn, $imm", (!cast<Instruction>(NAME # "Wri") GPR32sp:$Rd, GPR32:$Rn, logical_imm32_not:$imm), 0>; - def : InstAlias<Alias # " $Rd, $Rn, $imm", + def : InstAlias<Alias # "\t$Rd, $Rn, $imm", (!cast<Instruction>(NAME # "Xri") GPR64sp:$Rd, GPR64:$Rn, logical_imm64_not:$imm), 0>; } @@ -2039,10 +2125,10 @@ multiclass LogicalImmS<bits<2> opc, string mnemonic, SDNode OpNode, } } // end Defs = [NZCV] - def : InstAlias<Alias # " $Rd, $Rn, $imm", + def : InstAlias<Alias # "\t$Rd, $Rn, $imm", (!cast<Instruction>(NAME # "Wri") GPR32:$Rd, GPR32:$Rn, logical_imm32_not:$imm), 0>; - def : InstAlias<Alias # " $Rd, $Rn, $imm", + def : InstAlias<Alias # "\t$Rd, $Rn, $imm", (!cast<Instruction>(NAME # "Xri") GPR64:$Rd, GPR64:$Rn, logical_imm64_not:$imm), 0>; } @@ -2105,9 +2191,12 @@ multiclass LogicalRegS<bits<2> opc, bit N, string mnemonic, //--- let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseCondSetFlagsImm<bit op, RegisterClass regtype, string asm> - : I<(outs), (ins regtype:$Rn, imm0_31:$imm, imm0_15:$nzcv, ccode:$cond), - asm, "\t$Rn, $imm, $nzcv, $cond", "", []>, +class BaseCondComparisonImm<bit op, RegisterClass regtype, ImmLeaf immtype, + string mnemonic, SDNode OpNode> + : I<(outs), (ins regtype:$Rn, immtype:$imm, imm32_0_15:$nzcv, ccode:$cond), + mnemonic, "\t$Rn, $imm, $nzcv, $cond", "", + [(set NZCV, (OpNode regtype:$Rn, immtype:$imm, (i32 imm:$nzcv), + (i32 imm:$cond), NZCV))]>, Sched<[WriteI, ReadI]> { let Uses = [NZCV]; let Defs = [NZCV]; @@ -2127,19 +2216,13 @@ class BaseCondSetFlagsImm<bit op, RegisterClass regtype, string asm> let Inst{3-0} = nzcv; } -multiclass CondSetFlagsImm<bit op, string asm> { - def Wi : BaseCondSetFlagsImm<op, GPR32, asm> { - let Inst{31} = 0; - } - def Xi : BaseCondSetFlagsImm<op, GPR64, asm> { - let Inst{31} = 1; - } -} - let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseCondSetFlagsReg<bit op, RegisterClass regtype, string asm> - : I<(outs), (ins regtype:$Rn, regtype:$Rm, imm0_15:$nzcv, ccode:$cond), - asm, "\t$Rn, $Rm, $nzcv, $cond", "", []>, +class BaseCondComparisonReg<bit op, RegisterClass regtype, string mnemonic, + SDNode OpNode> + : I<(outs), (ins regtype:$Rn, regtype:$Rm, imm32_0_15:$nzcv, ccode:$cond), + mnemonic, "\t$Rn, $Rm, $nzcv, $cond", "", + [(set NZCV, (OpNode regtype:$Rn, regtype:$Rm, (i32 imm:$nzcv), + (i32 imm:$cond), NZCV))]>, Sched<[WriteI, ReadI, ReadI]> { let Uses = [NZCV]; let Defs = [NZCV]; @@ -2159,11 +2242,19 @@ class BaseCondSetFlagsReg<bit op, RegisterClass regtype, string asm> let Inst{3-0} = nzcv; } -multiclass CondSetFlagsReg<bit op, string asm> { - def Wr : BaseCondSetFlagsReg<op, GPR32, asm> { +multiclass CondComparison<bit op, string mnemonic, SDNode OpNode> { + // immediate operand variants + def Wi : BaseCondComparisonImm<op, GPR32, imm32_0_31, mnemonic, OpNode> { let Inst{31} = 0; } - def Xr : BaseCondSetFlagsReg<op, GPR64, asm> { + def Xi : BaseCondComparisonImm<op, GPR64, imm0_31, mnemonic, OpNode> { + let Inst{31} = 1; + } + // register operand variants + def Wr : BaseCondComparisonReg<op, GPR32, mnemonic, OpNode> { + let Inst{31} = 
0; + } + def Xr : BaseCondComparisonReg<op, GPR64, mnemonic, OpNode> { let Inst{31} = 1; } } @@ -2328,7 +2419,7 @@ multiclass LoadUI<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype, asm, pattern>, Sched<[WriteLD]>; - def : InstAlias<asm # " $Rt, [$Rn]", + def : InstAlias<asm # "\t$Rt, [$Rn]", (!cast<Instruction>(NAME # "ui") regtype:$Rt, GPR64sp:$Rn, 0)>; } @@ -2340,7 +2431,7 @@ multiclass StoreUI<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype, asm, pattern>, Sched<[WriteST]>; - def : InstAlias<asm # " $Rt, [$Rn]", + def : InstAlias<asm # "\t$Rt, [$Rn]", (!cast<Instruction>(NAME # "ui") regtype:$Rt, GPR64sp:$Rn, 0)>; } @@ -2508,7 +2599,7 @@ class LoadStore8RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype, } class ROInstAlias<string asm, RegisterClass regtype, Instruction INST> - : InstAlias<asm # " $Rt, [$Rn, $Rm]", + : InstAlias<asm # "\t$Rt, [$Rn, $Rm]", (INST regtype:$Rt, GPR64sp:$Rn, GPR64:$Rm, 0, 0)>; multiclass Load8RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype, @@ -2934,7 +3025,7 @@ multiclass LoadUnscaled<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype, (ins GPR64sp:$Rn, simm9:$offset), asm, pattern>, Sched<[WriteLD]>; - def : InstAlias<asm # " $Rt, [$Rn]", + def : InstAlias<asm # "\t$Rt, [$Rn]", (!cast<Instruction>(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>; } @@ -2946,7 +3037,7 @@ multiclass StoreUnscaled<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype, asm, pattern>, Sched<[WriteST]>; - def : InstAlias<asm # " $Rt, [$Rn]", + def : InstAlias<asm # "\t$Rt, [$Rn]", (!cast<Instruction>(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>; } @@ -2958,7 +3049,7 @@ multiclass PrefetchUnscaled<bits<2> sz, bit V, bits<2> opc, string asm, asm, pat>, Sched<[WriteLD]>; - def : InstAlias<asm # " $Rt, [$Rn]", + def : InstAlias<asm # "\t$Rt, [$Rn]", (!cast<Instruction>(NAME # "i") prfop:$Rt, GPR64sp:$Rn, 0)>; } @@ -2993,7 +3084,7 @@ multiclass LoadUnprivileged<bits<2> sz, bit V, bits<2> opc, (ins GPR64sp:$Rn, simm9:$offset), asm>, Sched<[WriteLD]>; - def : InstAlias<asm # " $Rt, [$Rn]", + def : InstAlias<asm # "\t$Rt, [$Rn]", (!cast<Instruction>(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>; } @@ -3005,7 +3096,7 @@ multiclass StoreUnprivileged<bits<2> sz, bit V, bits<2> opc, asm>, Sched<[WriteST]>; - def : InstAlias<asm # " $Rt, [$Rn]", + def : InstAlias<asm # "\t$Rt, [$Rn]", (!cast<Instruction>(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>; } @@ -3136,7 +3227,7 @@ multiclass LoadPairOffset<bits<2> opc, bit V, RegisterClass regtype, (ins GPR64sp:$Rn, indextype:$offset), asm>, Sched<[WriteLD, WriteLDHi]>; - def : InstAlias<asm # " $Rt, $Rt2, [$Rn]", + def : InstAlias<asm # "\t$Rt, $Rt2, [$Rn]", (!cast<Instruction>(NAME # "i") regtype:$Rt, regtype:$Rt2, GPR64sp:$Rn, 0)>; } @@ -3151,7 +3242,7 @@ multiclass StorePairOffset<bits<2> opc, bit V, RegisterClass regtype, asm>, Sched<[WriteSTP]>; - def : InstAlias<asm # " $Rt, $Rt2, [$Rn]", + def : InstAlias<asm # "\t$Rt, $Rt2, [$Rn]", (!cast<Instruction>(NAME # "i") regtype:$Rt, regtype:$Rt2, GPR64sp:$Rn, 0)>; } @@ -3230,8 +3321,8 @@ class LoadPairPostIdx<bits<2> opc, bit V, RegisterClass regtype, let mayStore = 1, mayLoad = 0 in class StorePairPostIdx<bits<2> opc, bit V, RegisterClass regtype, Operand idxtype, string asm> - : BaseLoadStorePairPostIdx<opc, V, 0, (outs), - (ins GPR64sp:$wback, regtype:$Rt, regtype:$Rt2, + : BaseLoadStorePairPostIdx<opc, V, 0, (outs GPR64sp:$wback), + (ins regtype:$Rt, regtype:$Rt2, GPR64sp:$Rn, idxtype:$offset), asm>, Sched<[WriteAdr, WriteSTP]>; @@ -3477,6 +3568,20 @@ class 
BaseFPToInteger<bits<2> type, bits<2> rmode, bits<3> opcode, multiclass FPToIntegerUnscaled<bits<2> rmode, bits<3> opcode, string asm, SDPatternOperator OpN> { + // Unscaled half-precision to 32-bit + def UWHr : BaseFPToIntegerUnscaled<0b11, rmode, opcode, FPR16, GPR32, asm, + [(set GPR32:$Rd, (OpN FPR16:$Rn))]> { + let Inst{31} = 0; // 32-bit GPR flag + let Predicates = [HasFullFP16]; + } + + // Unscaled half-precision to 64-bit + def UXHr : BaseFPToIntegerUnscaled<0b11, rmode, opcode, FPR16, GPR64, asm, + [(set GPR64:$Rd, (OpN FPR16:$Rn))]> { + let Inst{31} = 1; // 64-bit GPR flag + let Predicates = [HasFullFP16]; + } + // Unscaled single-precision to 32-bit def UWSr : BaseFPToIntegerUnscaled<0b00, rmode, opcode, FPR32, GPR32, asm, [(set GPR32:$Rd, (OpN FPR32:$Rn))]> { @@ -3504,6 +3609,25 @@ multiclass FPToIntegerUnscaled<bits<2> rmode, bits<3> opcode, string asm, multiclass FPToIntegerScaled<bits<2> rmode, bits<3> opcode, string asm, SDPatternOperator OpN> { + // Scaled half-precision to 32-bit + def SWHri : BaseFPToInteger<0b11, rmode, opcode, FPR16, GPR32, + fixedpoint_f16_i32, asm, + [(set GPR32:$Rd, (OpN (fmul FPR16:$Rn, + fixedpoint_f16_i32:$scale)))]> { + let Inst{31} = 0; // 32-bit GPR flag + let scale{5} = 1; + let Predicates = [HasFullFP16]; + } + + // Scaled half-precision to 64-bit + def SXHri : BaseFPToInteger<0b11, rmode, opcode, FPR16, GPR64, + fixedpoint_f16_i64, asm, + [(set GPR64:$Rd, (OpN (fmul FPR16:$Rn, + fixedpoint_f16_i64:$scale)))]> { + let Inst{31} = 1; // 64-bit GPR flag + let Predicates = [HasFullFP16]; + } + // Scaled single-precision to 32-bit def SWSri : BaseFPToInteger<0b00, rmode, opcode, FPR32, GPR32, fixedpoint_f32_i32, asm, @@ -3553,7 +3677,7 @@ class BaseIntegerToFP<bit isUnsigned, bits<5> Rd; bits<5> Rn; bits<6> scale; - let Inst{30-23} = 0b00111100; + let Inst{30-24} = 0b0011110; let Inst{21-17} = 0b00001; let Inst{16} = isUnsigned; let Inst{15-10} = scale; @@ -3570,7 +3694,7 @@ class BaseIntegerToFPUnscaled<bit isUnsigned, bits<5> Rd; bits<5> Rn; bits<6> scale; - let Inst{30-23} = 0b00111100; + let Inst{30-24} = 0b0011110; let Inst{21-17} = 0b10001; let Inst{16} = isUnsigned; let Inst{15-10} = 0b000000; @@ -3580,33 +3704,55 @@ class BaseIntegerToFPUnscaled<bit isUnsigned, multiclass IntegerToFP<bit isUnsigned, string asm, SDNode node> { // Unscaled + def UWHri: BaseIntegerToFPUnscaled<isUnsigned, GPR32, FPR16, f16, asm, node> { + let Inst{31} = 0; // 32-bit GPR flag + let Inst{23-22} = 0b11; // 16-bit FPR flag + let Predicates = [HasFullFP16]; + } + def UWSri: BaseIntegerToFPUnscaled<isUnsigned, GPR32, FPR32, f32, asm, node> { let Inst{31} = 0; // 32-bit GPR flag - let Inst{22} = 0; // 32-bit FPR flag + let Inst{23-22} = 0b00; // 32-bit FPR flag } def UWDri: BaseIntegerToFPUnscaled<isUnsigned, GPR32, FPR64, f64, asm, node> { let Inst{31} = 0; // 32-bit GPR flag - let Inst{22} = 1; // 64-bit FPR flag + let Inst{23-22} = 0b01; // 64-bit FPR flag + } + + def UXHri: BaseIntegerToFPUnscaled<isUnsigned, GPR64, FPR16, f16, asm, node> { + let Inst{31} = 1; // 64-bit GPR flag + let Inst{23-22} = 0b11; // 16-bit FPR flag + let Predicates = [HasFullFP16]; } def UXSri: BaseIntegerToFPUnscaled<isUnsigned, GPR64, FPR32, f32, asm, node> { let Inst{31} = 1; // 64-bit GPR flag - let Inst{22} = 0; // 32-bit FPR flag + let Inst{23-22} = 0b00; // 32-bit FPR flag } def UXDri: BaseIntegerToFPUnscaled<isUnsigned, GPR64, FPR64, f64, asm, node> { let Inst{31} = 1; // 64-bit GPR flag - let Inst{22} = 1; // 64-bit FPR flag + let Inst{23-22} = 0b01; // 64-bit FPR flag } // 
Scaled + def SWHri: BaseIntegerToFP<isUnsigned, GPR32, FPR16, fixedpoint_f16_i32, asm, + [(set FPR16:$Rd, + (fdiv (node GPR32:$Rn), + fixedpoint_f16_i32:$scale))]> { + let Inst{31} = 0; // 32-bit GPR flag + let Inst{23-22} = 0b11; // 16-bit FPR flag + let scale{5} = 1; + let Predicates = [HasFullFP16]; + } + def SWSri: BaseIntegerToFP<isUnsigned, GPR32, FPR32, fixedpoint_f32_i32, asm, [(set FPR32:$Rd, (fdiv (node GPR32:$Rn), fixedpoint_f32_i32:$scale))]> { let Inst{31} = 0; // 32-bit GPR flag - let Inst{22} = 0; // 32-bit FPR flag + let Inst{23-22} = 0b00; // 32-bit FPR flag let scale{5} = 1; } @@ -3615,16 +3761,25 @@ multiclass IntegerToFP<bit isUnsigned, string asm, SDNode node> { (fdiv (node GPR32:$Rn), fixedpoint_f64_i32:$scale))]> { let Inst{31} = 0; // 32-bit GPR flag - let Inst{22} = 1; // 64-bit FPR flag + let Inst{23-22} = 0b01; // 64-bit FPR flag let scale{5} = 1; } + def SXHri: BaseIntegerToFP<isUnsigned, GPR64, FPR16, fixedpoint_f16_i64, asm, + [(set FPR16:$Rd, + (fdiv (node GPR64:$Rn), + fixedpoint_f16_i64:$scale))]> { + let Inst{31} = 1; // 64-bit GPR flag + let Inst{23-22} = 0b11; // 16-bit FPR flag + let Predicates = [HasFullFP16]; + } + def SXSri: BaseIntegerToFP<isUnsigned, GPR64, FPR32, fixedpoint_f32_i64, asm, [(set FPR32:$Rd, (fdiv (node GPR64:$Rn), fixedpoint_f32_i64:$scale))]> { let Inst{31} = 1; // 64-bit GPR flag - let Inst{22} = 0; // 32-bit FPR flag + let Inst{23-22} = 0b00; // 32-bit FPR flag } def SXDri: BaseIntegerToFP<isUnsigned, GPR64, FPR64, fixedpoint_f64_i64, asm, @@ -3632,7 +3787,7 @@ multiclass IntegerToFP<bit isUnsigned, string asm, SDNode node> { (fdiv (node GPR64:$Rn), fixedpoint_f64_i64:$scale))]> { let Inst{31} = 1; // 64-bit GPR flag - let Inst{22} = 1; // 64-bit FPR flag + let Inst{23-22} = 0b01; // 64-bit FPR flag } } @@ -3654,7 +3809,7 @@ class BaseUnscaledConversion<bits<2> rmode, bits<3> opcode, Sched<[WriteFCopy]> { bits<5> Rd; bits<5> Rn; - let Inst{30-23} = 0b00111100; + let Inst{30-24} = 0b0011110; let Inst{21} = 1; let Inst{20-19} = rmode; let Inst{18-16} = opcode; @@ -3704,26 +3859,49 @@ class BaseUnscaledConversionFromHigh<bits<2> rmode, bits<3> opcode, } - multiclass UnscaledConversion<string asm> { + def WHr : BaseUnscaledConversion<0b00, 0b111, GPR32, FPR16, asm> { + let Inst{31} = 0; // 32-bit GPR flag + let Inst{23-22} = 0b11; // 16-bit FPR flag + let Predicates = [HasFullFP16]; + } + + def XHr : BaseUnscaledConversion<0b00, 0b111, GPR64, FPR16, asm> { + let Inst{31} = 1; // 64-bit GPR flag + let Inst{23-22} = 0b11; // 16-bit FPR flag + let Predicates = [HasFullFP16]; + } + def WSr : BaseUnscaledConversion<0b00, 0b111, GPR32, FPR32, asm> { let Inst{31} = 0; // 32-bit GPR flag - let Inst{22} = 0; // 32-bit FPR flag + let Inst{23-22} = 0b00; // 32-bit FPR flag } def XDr : BaseUnscaledConversion<0b00, 0b111, GPR64, FPR64, asm> { let Inst{31} = 1; // 64-bit GPR flag - let Inst{22} = 1; // 64-bit FPR flag + let Inst{23-22} = 0b01; // 64-bit FPR flag + } + + def HWr : BaseUnscaledConversion<0b00, 0b110, FPR16, GPR32, asm> { + let Inst{31} = 0; // 32-bit GPR flag + let Inst{23-22} = 0b11; // 16-bit FPR flag + let Predicates = [HasFullFP16]; + } + + def HXr : BaseUnscaledConversion<0b00, 0b110, FPR16, GPR64, asm> { + let Inst{31} = 1; // 64-bit GPR flag + let Inst{23-22} = 0b11; // 16-bit FPR flag + let Predicates = [HasFullFP16]; } def SWr : BaseUnscaledConversion<0b00, 0b110, FPR32, GPR32, asm> { let Inst{31} = 0; // 32-bit GPR flag - let Inst{22} = 0; // 32-bit FPR flag + let Inst{23-22} = 0b00; // 32-bit FPR flag } def DXr : 
BaseUnscaledConversion<0b00, 0b110, FPR64, GPR64, asm> { let Inst{31} = 1; // 64-bit GPR flag - let Inst{22} = 1; // 64-bit FPR flag + let Inst{23-22} = 0b01; // 64-bit FPR flag } def XDHighr : BaseUnscaledConversionToHigh<0b01, 0b111, GPR64, V128, @@ -3796,7 +3974,7 @@ class BaseSingleOperandFPData<bits<4> opcode, RegisterClass regtype, Sched<[WriteF]> { bits<5> Rd; bits<5> Rn; - let Inst{31-23} = 0b000111100; + let Inst{31-24} = 0b00011110; let Inst{21-19} = 0b100; let Inst{18-15} = opcode; let Inst{14-10} = 0b10000; @@ -3806,12 +3984,17 @@ class BaseSingleOperandFPData<bits<4> opcode, RegisterClass regtype, multiclass SingleOperandFPData<bits<4> opcode, string asm, SDPatternOperator node = null_frag> { + def Hr : BaseSingleOperandFPData<opcode, FPR16, f16, asm, node> { + let Inst{23-22} = 0b11; // 16-bit size flag + let Predicates = [HasFullFP16]; + } + def Sr : BaseSingleOperandFPData<opcode, FPR32, f32, asm, node> { - let Inst{22} = 0; // 32-bit size flag + let Inst{23-22} = 0b00; // 32-bit size flag } def Dr : BaseSingleOperandFPData<opcode, FPR64, f64, asm, node> { - let Inst{22} = 1; // 64-bit size flag + let Inst{23-22} = 0b01; // 64-bit size flag } } @@ -3828,7 +4011,7 @@ class BaseTwoOperandFPData<bits<4> opcode, RegisterClass regtype, bits<5> Rd; bits<5> Rn; bits<5> Rm; - let Inst{31-23} = 0b000111100; + let Inst{31-24} = 0b00011110; let Inst{21} = 1; let Inst{20-16} = Rm; let Inst{15-12} = opcode; @@ -3839,28 +4022,41 @@ class BaseTwoOperandFPData<bits<4> opcode, RegisterClass regtype, multiclass TwoOperandFPData<bits<4> opcode, string asm, SDPatternOperator node = null_frag> { + def Hrr : BaseTwoOperandFPData<opcode, FPR16, asm, + [(set (f16 FPR16:$Rd), + (node (f16 FPR16:$Rn), (f16 FPR16:$Rm)))]> { + let Inst{23-22} = 0b11; // 16-bit size flag + let Predicates = [HasFullFP16]; + } + def Srr : BaseTwoOperandFPData<opcode, FPR32, asm, [(set (f32 FPR32:$Rd), (node (f32 FPR32:$Rn), (f32 FPR32:$Rm)))]> { - let Inst{22} = 0; // 32-bit size flag + let Inst{23-22} = 0b00; // 32-bit size flag } def Drr : BaseTwoOperandFPData<opcode, FPR64, asm, [(set (f64 FPR64:$Rd), (node (f64 FPR64:$Rn), (f64 FPR64:$Rm)))]> { - let Inst{22} = 1; // 64-bit size flag + let Inst{23-22} = 0b01; // 64-bit size flag } } multiclass TwoOperandFPDataNeg<bits<4> opcode, string asm, SDNode node> { + def Hrr : BaseTwoOperandFPData<opcode, FPR16, asm, + [(set FPR16:$Rd, (fneg (node FPR16:$Rn, (f16 FPR16:$Rm))))]> { + let Inst{23-22} = 0b11; // 16-bit size flag + let Predicates = [HasFullFP16]; + } + def Srr : BaseTwoOperandFPData<opcode, FPR32, asm, [(set FPR32:$Rd, (fneg (node FPR32:$Rn, (f32 FPR32:$Rm))))]> { - let Inst{22} = 0; // 32-bit size flag + let Inst{23-22} = 0b00; // 32-bit size flag } def Drr : BaseTwoOperandFPData<opcode, FPR64, asm, [(set FPR64:$Rd, (fneg (node FPR64:$Rn, (f64 FPR64:$Rm))))]> { - let Inst{22} = 1; // 64-bit size flag + let Inst{23-22} = 0b01; // 64-bit size flag } } @@ -3878,7 +4074,7 @@ class BaseThreeOperandFPData<bit isNegated, bit isSub, bits<5> Rn; bits<5> Rm; bits<5> Ra; - let Inst{31-23} = 0b000111110; + let Inst{31-24} = 0b00011111; let Inst{21} = isNegated; let Inst{20-16} = Rm; let Inst{15} = isSub; @@ -3889,16 +4085,23 @@ class BaseThreeOperandFPData<bit isNegated, bit isSub, multiclass ThreeOperandFPData<bit isNegated, bit isSub,string asm, SDPatternOperator node> { + def Hrrr : BaseThreeOperandFPData<isNegated, isSub, FPR16, asm, + [(set FPR16:$Rd, + (node (f16 FPR16:$Rn), (f16 FPR16:$Rm), (f16 FPR16:$Ra)))]> { + let Inst{23-22} = 0b11; // 16-bit size flag + let 
Predicates = [HasFullFP16]; + } + def Srrr : BaseThreeOperandFPData<isNegated, isSub, FPR32, asm, [(set FPR32:$Rd, (node (f32 FPR32:$Rn), (f32 FPR32:$Rm), (f32 FPR32:$Ra)))]> { - let Inst{22} = 0; // 32-bit size flag + let Inst{23-22} = 0b00; // 32-bit size flag } def Drrr : BaseThreeOperandFPData<isNegated, isSub, FPR64, asm, [(set FPR64:$Rd, (node (f64 FPR64:$Rn), (f64 FPR64:$Rm), (f64 FPR64:$Ra)))]> { - let Inst{22} = 1; // 64-bit size flag + let Inst{23-22} = 0b01; // 64-bit size flag } } @@ -3913,7 +4116,7 @@ class BaseOneOperandFPComparison<bit signalAllNans, : I<(outs), (ins regtype:$Rn), asm, "\t$Rn, #0.0", "", pat>, Sched<[WriteFCmp]> { bits<5> Rn; - let Inst{31-23} = 0b000111100; + let Inst{31-24} = 0b00011110; let Inst{21} = 1; let Inst{15-10} = 0b001000; @@ -3932,7 +4135,7 @@ class BaseTwoOperandFPComparison<bit signalAllNans, RegisterClass regtype, Sched<[WriteFCmp]> { bits<5> Rm; bits<5> Rn; - let Inst{31-23} = 0b000111100; + let Inst{31-24} = 0b00011110; let Inst{21} = 1; let Inst{20-16} = Rm; let Inst{15-10} = 0b001000; @@ -3944,24 +4147,36 @@ class BaseTwoOperandFPComparison<bit signalAllNans, RegisterClass regtype, multiclass FPComparison<bit signalAllNans, string asm, SDPatternOperator OpNode = null_frag> { let Defs = [NZCV] in { + def Hrr : BaseTwoOperandFPComparison<signalAllNans, FPR16, asm, + [(OpNode FPR16:$Rn, (f16 FPR16:$Rm)), (implicit NZCV)]> { + let Inst{23-22} = 0b11; + let Predicates = [HasFullFP16]; + } + + def Hri : BaseOneOperandFPComparison<signalAllNans, FPR16, asm, + [(OpNode (f16 FPR16:$Rn), fpimm0), (implicit NZCV)]> { + let Inst{23-22} = 0b11; + let Predicates = [HasFullFP16]; + } + def Srr : BaseTwoOperandFPComparison<signalAllNans, FPR32, asm, [(OpNode FPR32:$Rn, (f32 FPR32:$Rm)), (implicit NZCV)]> { - let Inst{22} = 0; + let Inst{23-22} = 0b00; } def Sri : BaseOneOperandFPComparison<signalAllNans, FPR32, asm, [(OpNode (f32 FPR32:$Rn), fpimm0), (implicit NZCV)]> { - let Inst{22} = 0; + let Inst{23-22} = 0b00; } def Drr : BaseTwoOperandFPComparison<signalAllNans, FPR64, asm, [(OpNode FPR64:$Rn, (f64 FPR64:$Rm)), (implicit NZCV)]> { - let Inst{22} = 1; + let Inst{23-22} = 0b01; } def Dri : BaseOneOperandFPComparison<signalAllNans, FPR64, asm, [(OpNode (f64 FPR64:$Rn), fpimm0), (implicit NZCV)]> { - let Inst{22} = 1; + let Inst{23-22} = 0b01; } } // Defs = [NZCV] } @@ -3971,17 +4186,20 @@ multiclass FPComparison<bit signalAllNans, string asm, //--- let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseFPCondComparison<bit signalAllNans, - RegisterClass regtype, string asm> - : I<(outs), (ins regtype:$Rn, regtype:$Rm, imm0_15:$nzcv, ccode:$cond), - asm, "\t$Rn, $Rm, $nzcv, $cond", "", []>, +class BaseFPCondComparison<bit signalAllNans, RegisterClass regtype, + string mnemonic, list<dag> pat> + : I<(outs), (ins regtype:$Rn, regtype:$Rm, imm32_0_15:$nzcv, ccode:$cond), + mnemonic, "\t$Rn, $Rm, $nzcv, $cond", "", pat>, Sched<[WriteFCmp]> { + let Uses = [NZCV]; + let Defs = [NZCV]; + bits<5> Rn; bits<5> Rm; bits<4> nzcv; bits<4> cond; - let Inst{31-23} = 0b000111100; + let Inst{31-24} = 0b00011110; let Inst{21} = 1; let Inst{20-16} = Rm; let Inst{15-12} = cond; @@ -3991,16 +4209,24 @@ class BaseFPCondComparison<bit signalAllNans, let Inst{3-0} = nzcv; } -multiclass FPCondComparison<bit signalAllNans, string asm> { - let Defs = [NZCV], Uses = [NZCV] in { - def Srr : BaseFPCondComparison<signalAllNans, FPR32, asm> { - let Inst{22} = 0; +multiclass FPCondComparison<bit signalAllNans, string mnemonic, + SDPatternOperator OpNode = null_frag> { + def 
Hrr : BaseFPCondComparison<signalAllNans, FPR16, mnemonic, []> { + let Inst{23-22} = 0b11; + let Predicates = [HasFullFP16]; } - def Drr : BaseFPCondComparison<signalAllNans, FPR64, asm> { - let Inst{22} = 1; + def Srr : BaseFPCondComparison<signalAllNans, FPR32, mnemonic, + [(set NZCV, (OpNode (f32 FPR32:$Rn), (f32 FPR32:$Rm), (i32 imm:$nzcv), + (i32 imm:$cond), NZCV))]> { + let Inst{23-22} = 0b00; + } + + def Drr : BaseFPCondComparison<signalAllNans, FPR64, mnemonic, + [(set NZCV, (OpNode (f64 FPR64:$Rn), (f64 FPR64:$Rm), (i32 imm:$nzcv), + (i32 imm:$cond), NZCV))]> { + let Inst{23-22} = 0b01; } - } // Defs = [NZCV], Uses = [NZCV] } //--- @@ -4019,7 +4245,7 @@ class BaseFPCondSelect<RegisterClass regtype, ValueType vt, string asm> bits<5> Rm; bits<4> cond; - let Inst{31-23} = 0b000111100; + let Inst{31-24} = 0b00011110; let Inst{21} = 1; let Inst{20-16} = Rm; let Inst{15-12} = cond; @@ -4030,12 +4256,17 @@ class BaseFPCondSelect<RegisterClass regtype, ValueType vt, string asm> multiclass FPCondSelect<string asm> { let Uses = [NZCV] in { + def Hrrr : BaseFPCondSelect<FPR16, f16, asm> { + let Inst{23-22} = 0b11; + let Predicates = [HasFullFP16]; + } + def Srrr : BaseFPCondSelect<FPR32, f32, asm> { - let Inst{22} = 0; + let Inst{23-22} = 0b00; } def Drrr : BaseFPCondSelect<FPR64, f64, asm> { - let Inst{22} = 1; + let Inst{23-22} = 0b01; } } // Uses = [NZCV] } @@ -4050,7 +4281,7 @@ class BaseFPMoveImmediate<RegisterClass regtype, Operand fpimmtype, string asm> Sched<[WriteFImm]> { bits<5> Rd; bits<8> imm; - let Inst{31-23} = 0b000111100; + let Inst{31-24} = 0b00011110; let Inst{21} = 1; let Inst{20-13} = imm; let Inst{12-5} = 0b10000000; @@ -4058,12 +4289,17 @@ class BaseFPMoveImmediate<RegisterClass regtype, Operand fpimmtype, string asm> } multiclass FPMoveImmediate<string asm> { + def Hi : BaseFPMoveImmediate<FPR16, fpimm16, asm> { + let Inst{23-22} = 0b11; + let Predicates = [HasFullFP16]; + } + def Si : BaseFPMoveImmediate<FPR32, fpimm32, asm> { - let Inst{22} = 0; + let Inst{23-22} = 0b00; } def Di : BaseFPMoveImmediate<FPR64, fpimm64, asm> { - let Inst{22} = 1; + let Inst{23-22} = 0b01; } } } // end of 'let Predicates = [HasFPARMv8]' @@ -4079,7 +4315,7 @@ let Predicates = [HasNEON] in { //---------------------------------------------------------------------------- let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseSIMDThreeSameVector<bit Q, bit U, bits<2> size, bits<5> opcode, +class BaseSIMDThreeSameVector<bit Q, bit U, bits<3> size, bits<5> opcode, RegisterOperand regtype, string asm, string kind, list<dag> pattern> : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), asm, @@ -4093,8 +4329,7 @@ class BaseSIMDThreeSameVector<bit Q, bit U, bits<2> size, bits<5> opcode, let Inst{30} = Q; let Inst{29} = U; let Inst{28-24} = 0b01110; - let Inst{23-22} = size; - let Inst{21} = 1; + let Inst{23-21} = size; let Inst{20-16} = Rm; let Inst{15-11} = opcode; let Inst{10} = 1; @@ -4103,7 +4338,7 @@ class BaseSIMDThreeSameVector<bit Q, bit U, bits<2> size, bits<5> opcode, } let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseSIMDThreeSameVectorTied<bit Q, bit U, bits<2> size, bits<5> opcode, +class BaseSIMDThreeSameVectorTied<bit Q, bit U, bits<3> size, bits<5> opcode, RegisterOperand regtype, string asm, string kind, list<dag> pattern> : I<(outs regtype:$dst), (ins regtype:$Rd, regtype:$Rn, regtype:$Rm), asm, @@ -4117,8 +4352,7 @@ class BaseSIMDThreeSameVectorTied<bit Q, bit U, bits<2> size, bits<5> opcode, let Inst{30} = Q; let Inst{29} = U; let Inst{28-24} = 
0b01110; - let Inst{23-22} = size; - let Inst{21} = 1; + let Inst{23-21} = size; let Inst{20-16} = Rm; let Inst{15-11} = opcode; let Inst{10} = 1; @@ -4129,25 +4363,25 @@ class BaseSIMDThreeSameVectorTied<bit Q, bit U, bits<2> size, bits<5> opcode, // All operand sizes distinguished in the encoding. multiclass SIMDThreeSameVector<bit U, bits<5> opc, string asm, SDPatternOperator OpNode> { - def v8i8 : BaseSIMDThreeSameVector<0, U, 0b00, opc, V64, + def v8i8 : BaseSIMDThreeSameVector<0, U, 0b001, opc, V64, asm, ".8b", [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>; - def v16i8 : BaseSIMDThreeSameVector<1, U, 0b00, opc, V128, + def v16i8 : BaseSIMDThreeSameVector<1, U, 0b001, opc, V128, asm, ".16b", [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn), (v16i8 V128:$Rm)))]>; - def v4i16 : BaseSIMDThreeSameVector<0, U, 0b01, opc, V64, + def v4i16 : BaseSIMDThreeSameVector<0, U, 0b011, opc, V64, asm, ".4h", [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>; - def v8i16 : BaseSIMDThreeSameVector<1, U, 0b01, opc, V128, + def v8i16 : BaseSIMDThreeSameVector<1, U, 0b011, opc, V128, asm, ".8h", [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn), (v8i16 V128:$Rm)))]>; - def v2i32 : BaseSIMDThreeSameVector<0, U, 0b10, opc, V64, + def v2i32 : BaseSIMDThreeSameVector<0, U, 0b101, opc, V64, asm, ".2s", [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>; - def v4i32 : BaseSIMDThreeSameVector<1, U, 0b10, opc, V128, + def v4i32 : BaseSIMDThreeSameVector<1, U, 0b101, opc, V128, asm, ".4s", [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn), (v4i32 V128:$Rm)))]>; - def v2i64 : BaseSIMDThreeSameVector<1, U, 0b11, opc, V128, + def v2i64 : BaseSIMDThreeSameVector<1, U, 0b111, opc, V128, asm, ".2d", [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn), (v2i64 V128:$Rm)))]>; } @@ -4155,49 +4389,49 @@ multiclass SIMDThreeSameVector<bit U, bits<5> opc, string asm, // As above, but D sized elements unsupported. 
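// Illustrative sketch (not from the patch): BaseSIMDThreeSameVector and its
// Tied variant widen `size` from two bits plus a hard-wired Inst{21} to a
// single three-bit field over Inst{23-21}. The integer instantiations simply
// fold the old constant bit back in (0b001, 0b011, 0b101, 0b111), while the
// new half-precision vector forms need Inst{21} clear, which the previous
// layout could not express. Hypothetical class names, shown only for the bit
// layout:
class ThreeSameEncSketch {
  bits<32> Inst;
}
class ThreeSameSizeSketch<bits<3> size> : ThreeSameEncSketch {
  // was: let Inst{23-22} = size;  let Inst{21} = 1;
  let Inst{23-21} = size;
}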
multiclass SIMDThreeSameVectorBHS<bit U, bits<5> opc, string asm, SDPatternOperator OpNode> { - def v8i8 : BaseSIMDThreeSameVector<0, U, 0b00, opc, V64, + def v8i8 : BaseSIMDThreeSameVector<0, U, 0b001, opc, V64, asm, ".8b", [(set V64:$Rd, (v8i8 (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm))))]>; - def v16i8 : BaseSIMDThreeSameVector<1, U, 0b00, opc, V128, + def v16i8 : BaseSIMDThreeSameVector<1, U, 0b001, opc, V128, asm, ".16b", [(set V128:$Rd, (v16i8 (OpNode (v16i8 V128:$Rn), (v16i8 V128:$Rm))))]>; - def v4i16 : BaseSIMDThreeSameVector<0, U, 0b01, opc, V64, + def v4i16 : BaseSIMDThreeSameVector<0, U, 0b011, opc, V64, asm, ".4h", [(set V64:$Rd, (v4i16 (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm))))]>; - def v8i16 : BaseSIMDThreeSameVector<1, U, 0b01, opc, V128, + def v8i16 : BaseSIMDThreeSameVector<1, U, 0b011, opc, V128, asm, ".8h", [(set V128:$Rd, (v8i16 (OpNode (v8i16 V128:$Rn), (v8i16 V128:$Rm))))]>; - def v2i32 : BaseSIMDThreeSameVector<0, U, 0b10, opc, V64, + def v2i32 : BaseSIMDThreeSameVector<0, U, 0b101, opc, V64, asm, ".2s", [(set V64:$Rd, (v2i32 (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm))))]>; - def v4i32 : BaseSIMDThreeSameVector<1, U, 0b10, opc, V128, + def v4i32 : BaseSIMDThreeSameVector<1, U, 0b101, opc, V128, asm, ".4s", [(set V128:$Rd, (v4i32 (OpNode (v4i32 V128:$Rn), (v4i32 V128:$Rm))))]>; } multiclass SIMDThreeSameVectorBHSTied<bit U, bits<5> opc, string asm, SDPatternOperator OpNode> { - def v8i8 : BaseSIMDThreeSameVectorTied<0, U, 0b00, opc, V64, + def v8i8 : BaseSIMDThreeSameVectorTied<0, U, 0b001, opc, V64, asm, ".8b", [(set (v8i8 V64:$dst), (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>; - def v16i8 : BaseSIMDThreeSameVectorTied<1, U, 0b00, opc, V128, + def v16i8 : BaseSIMDThreeSameVectorTied<1, U, 0b001, opc, V128, asm, ".16b", [(set (v16i8 V128:$dst), (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn), (v16i8 V128:$Rm)))]>; - def v4i16 : BaseSIMDThreeSameVectorTied<0, U, 0b01, opc, V64, + def v4i16 : BaseSIMDThreeSameVectorTied<0, U, 0b011, opc, V64, asm, ".4h", [(set (v4i16 V64:$dst), (OpNode (v4i16 V64:$Rd), (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>; - def v8i16 : BaseSIMDThreeSameVectorTied<1, U, 0b01, opc, V128, + def v8i16 : BaseSIMDThreeSameVectorTied<1, U, 0b011, opc, V128, asm, ".8h", [(set (v8i16 V128:$dst), (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn), (v8i16 V128:$Rm)))]>; - def v2i32 : BaseSIMDThreeSameVectorTied<0, U, 0b10, opc, V64, + def v2i32 : BaseSIMDThreeSameVectorTied<0, U, 0b101, opc, V64, asm, ".2s", [(set (v2i32 V64:$dst), (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>; - def v4i32 : BaseSIMDThreeSameVectorTied<1, U, 0b10, opc, V128, + def v4i32 : BaseSIMDThreeSameVectorTied<1, U, 0b101, opc, V128, asm, ".4s", [(set (v4i32 V128:$dst), (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn), (v4i32 V128:$Rm)))]>; @@ -4206,54 +4440,80 @@ multiclass SIMDThreeSameVectorBHSTied<bit U, bits<5> opc, string asm, // As above, but only B sized elements supported. multiclass SIMDThreeSameVectorB<bit U, bits<5> opc, string asm, SDPatternOperator OpNode> { - def v8i8 : BaseSIMDThreeSameVector<0, U, 0b00, opc, V64, + def v8i8 : BaseSIMDThreeSameVector<0, U, 0b001, opc, V64, asm, ".8b", [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>; - def v16i8 : BaseSIMDThreeSameVector<1, U, 0b00, opc, V128, + def v16i8 : BaseSIMDThreeSameVector<1, U, 0b001, opc, V128, asm, ".16b", [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn), (v16i8 V128:$Rm)))]>; } -// As above, but only S and D sized floating point elements supported. 
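// Illustrative sketch (not from the patch): the [HasNEON, HasFullFP16]
// predicate lists used throughout these hunks are not declared in this file.
// They presumably live with the other predicates in AArch64InstrInfo.td,
// wired to the FeatureNEON/FeatureFullFP16 subtarget features from
// AArch64.td, along these lines (assumed declarations, not taken from the
// patch):
def HasNEON     : Predicate<"Subtarget->hasNEON()">,
                  AssemblerPredicate<"FeatureNEON", "neon">;
def HasFullFP16 : Predicate<"Subtarget->hasFullFP16()">,
                  AssemblerPredicate<"FeatureFullFP16", "fullfp16">;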
-multiclass SIMDThreeSameVectorFP<bit U, bit S, bits<5> opc, +// As above, but only floating point elements supported. +multiclass SIMDThreeSameVectorFP<bit U, bit S, bits<3> opc, string asm, SDPatternOperator OpNode> { - def v2f32 : BaseSIMDThreeSameVector<0, U, {S,0}, opc, V64, + let Predicates = [HasNEON, HasFullFP16] in { + def v4f16 : BaseSIMDThreeSameVector<0, U, {S,0b10}, {0b00,opc}, V64, + asm, ".4h", + [(set (v4f16 V64:$Rd), (OpNode (v4f16 V64:$Rn), (v4f16 V64:$Rm)))]>; + def v8f16 : BaseSIMDThreeSameVector<1, U, {S,0b10}, {0b00,opc}, V128, + asm, ".8h", + [(set (v8f16 V128:$Rd), (OpNode (v8f16 V128:$Rn), (v8f16 V128:$Rm)))]>; + } // Predicates = [HasNEON, HasFullFP16] + def v2f32 : BaseSIMDThreeSameVector<0, U, {S,0b01}, {0b11,opc}, V64, asm, ".2s", [(set (v2f32 V64:$Rd), (OpNode (v2f32 V64:$Rn), (v2f32 V64:$Rm)))]>; - def v4f32 : BaseSIMDThreeSameVector<1, U, {S,0}, opc, V128, + def v4f32 : BaseSIMDThreeSameVector<1, U, {S,0b01}, {0b11,opc}, V128, asm, ".4s", [(set (v4f32 V128:$Rd), (OpNode (v4f32 V128:$Rn), (v4f32 V128:$Rm)))]>; - def v2f64 : BaseSIMDThreeSameVector<1, U, {S,1}, opc, V128, + def v2f64 : BaseSIMDThreeSameVector<1, U, {S,0b11}, {0b11,opc}, V128, asm, ".2d", [(set (v2f64 V128:$Rd), (OpNode (v2f64 V128:$Rn), (v2f64 V128:$Rm)))]>; } -multiclass SIMDThreeSameVectorFPCmp<bit U, bit S, bits<5> opc, +multiclass SIMDThreeSameVectorFPCmp<bit U, bit S, bits<3> opc, string asm, SDPatternOperator OpNode> { - def v2f32 : BaseSIMDThreeSameVector<0, U, {S,0}, opc, V64, + let Predicates = [HasNEON, HasFullFP16] in { + def v4f16 : BaseSIMDThreeSameVector<0, U, {S,0b10}, {0b00,opc}, V64, + asm, ".4h", + [(set (v4i16 V64:$Rd), (OpNode (v4f16 V64:$Rn), (v4f16 V64:$Rm)))]>; + def v8f16 : BaseSIMDThreeSameVector<1, U, {S,0b10}, {0b00,opc}, V128, + asm, ".8h", + [(set (v8i16 V128:$Rd), (OpNode (v8f16 V128:$Rn), (v8f16 V128:$Rm)))]>; + } // Predicates = [HasNEON, HasFullFP16] + def v2f32 : BaseSIMDThreeSameVector<0, U, {S,0b01}, {0b11,opc}, V64, asm, ".2s", [(set (v2i32 V64:$Rd), (OpNode (v2f32 V64:$Rn), (v2f32 V64:$Rm)))]>; - def v4f32 : BaseSIMDThreeSameVector<1, U, {S,0}, opc, V128, + def v4f32 : BaseSIMDThreeSameVector<1, U, {S,0b01}, {0b11,opc}, V128, asm, ".4s", [(set (v4i32 V128:$Rd), (OpNode (v4f32 V128:$Rn), (v4f32 V128:$Rm)))]>; - def v2f64 : BaseSIMDThreeSameVector<1, U, {S,1}, opc, V128, + def v2f64 : BaseSIMDThreeSameVector<1, U, {S,0b11}, {0b11,opc}, V128, asm, ".2d", [(set (v2i64 V128:$Rd), (OpNode (v2f64 V128:$Rn), (v2f64 V128:$Rm)))]>; } -multiclass SIMDThreeSameVectorFPTied<bit U, bit S, bits<5> opc, +multiclass SIMDThreeSameVectorFPTied<bit U, bit S, bits<3> opc, string asm, SDPatternOperator OpNode> { - def v2f32 : BaseSIMDThreeSameVectorTied<0, U, {S,0}, opc, V64, + let Predicates = [HasNEON, HasFullFP16] in { + def v4f16 : BaseSIMDThreeSameVectorTied<0, U, {S,0b10}, {0b00,opc}, V64, + asm, ".4h", + [(set (v4f16 V64:$dst), + (OpNode (v4f16 V64:$Rd), (v4f16 V64:$Rn), (v4f16 V64:$Rm)))]>; + def v8f16 : BaseSIMDThreeSameVectorTied<1, U, {S,0b10}, {0b00,opc}, V128, + asm, ".8h", + [(set (v8f16 V128:$dst), + (OpNode (v8f16 V128:$Rd), (v8f16 V128:$Rn), (v8f16 V128:$Rm)))]>; + } // Predicates = [HasNEON, HasFullFP16] + def v2f32 : BaseSIMDThreeSameVectorTied<0, U, {S,0b01}, {0b11,opc}, V64, asm, ".2s", [(set (v2f32 V64:$dst), (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn), (v2f32 V64:$Rm)))]>; - def v4f32 : BaseSIMDThreeSameVectorTied<1, U, {S,0}, opc, V128, + def v4f32 : BaseSIMDThreeSameVectorTied<1, U, {S,0b01}, {0b11,opc}, V128, asm, ".4s", [(set (v4f32 V128:$dst), (OpNode 
(v4f32 V128:$Rd), (v4f32 V128:$Rn), (v4f32 V128:$Rm)))]>; - def v2f64 : BaseSIMDThreeSameVectorTied<1, U, {S,1}, opc, V128, + def v2f64 : BaseSIMDThreeSameVectorTied<1, U, {S,0b11}, {0b11,opc}, V128, asm, ".2d", [(set (v2f64 V128:$dst), (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn), (v2f64 V128:$Rm)))]>; @@ -4262,16 +4522,16 @@ multiclass SIMDThreeSameVectorFPTied<bit U, bit S, bits<5> opc, // As above, but D and B sized elements unsupported. multiclass SIMDThreeSameVectorHS<bit U, bits<5> opc, string asm, SDPatternOperator OpNode> { - def v4i16 : BaseSIMDThreeSameVector<0, U, 0b01, opc, V64, + def v4i16 : BaseSIMDThreeSameVector<0, U, 0b011, opc, V64, asm, ".4h", [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>; - def v8i16 : BaseSIMDThreeSameVector<1, U, 0b01, opc, V128, + def v8i16 : BaseSIMDThreeSameVector<1, U, 0b011, opc, V128, asm, ".8h", [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn), (v8i16 V128:$Rm)))]>; - def v2i32 : BaseSIMDThreeSameVector<0, U, 0b10, opc, V64, + def v2i32 : BaseSIMDThreeSameVector<0, U, 0b101, opc, V64, asm, ".2s", [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>; - def v4i32 : BaseSIMDThreeSameVector<1, U, 0b10, opc, V128, + def v4i32 : BaseSIMDThreeSameVector<1, U, 0b101, opc, V128, asm, ".4s", [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn), (v4i32 V128:$Rm)))]>; } @@ -4279,10 +4539,10 @@ multiclass SIMDThreeSameVectorHS<bit U, bits<5> opc, string asm, // Logical three vector ops share opcode bits, and only use B sized elements. multiclass SIMDLogicalThreeVector<bit U, bits<2> size, string asm, SDPatternOperator OpNode = null_frag> { - def v8i8 : BaseSIMDThreeSameVector<0, U, size, 0b00011, V64, + def v8i8 : BaseSIMDThreeSameVector<0, U, {size,1}, 0b00011, V64, asm, ".8b", [(set (v8i8 V64:$Rd), (OpNode V64:$Rn, V64:$Rm))]>; - def v16i8 : BaseSIMDThreeSameVector<1, U, size, 0b00011, V128, + def v16i8 : BaseSIMDThreeSameVector<1, U, {size,1}, 0b00011, V128, asm, ".16b", [(set (v16i8 V128:$Rd), (OpNode V128:$Rn, V128:$Rm))]>; @@ -4303,11 +4563,11 @@ multiclass SIMDLogicalThreeVector<bit U, bits<2> size, string asm, multiclass SIMDLogicalThreeVectorTied<bit U, bits<2> size, string asm, SDPatternOperator OpNode> { - def v8i8 : BaseSIMDThreeSameVectorTied<0, U, size, 0b00011, V64, + def v8i8 : BaseSIMDThreeSameVectorTied<0, U, {size,1}, 0b00011, V64, asm, ".8b", [(set (v8i8 V64:$dst), (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>; - def v16i8 : BaseSIMDThreeSameVectorTied<1, U, size, 0b00011, V128, + def v16i8 : BaseSIMDThreeSameVectorTied<1, U, {size,1}, 0b00011, V128, asm, ".16b", [(set (v16i8 V128:$dst), (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn), @@ -4347,8 +4607,8 @@ multiclass SIMDLogicalThreeVectorTied<bit U, bits<2> size, let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in class BaseSIMDTwoSameVector<bit Q, bit U, bits<2> size, bits<5> opcode, - RegisterOperand regtype, string asm, string dstkind, - string srckind, list<dag> pattern> + bits<2> size2, RegisterOperand regtype, string asm, + string dstkind, string srckind, list<dag> pattern> : I<(outs regtype:$Rd), (ins regtype:$Rn), asm, "{\t$Rd" # dstkind # ", $Rn" # srckind # "|" # dstkind # "\t$Rd, $Rn}", "", pattern>, @@ -4360,7 +4620,9 @@ class BaseSIMDTwoSameVector<bit Q, bit U, bits<2> size, bits<5> opcode, let Inst{29} = U; let Inst{28-24} = 0b01110; let Inst{23-22} = size; - let Inst{21-17} = 0b10000; + let Inst{21} = 0b1; + let Inst{20-19} = size2; + let Inst{18-17} = 0b00; let Inst{16-12} = opcode; let Inst{11-10} = 0b10; let Inst{9-5} = Rn; @@ 
-4369,8 +4631,9 @@ class BaseSIMDTwoSameVector<bit Q, bit U, bits<2> size, bits<5> opcode, let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in class BaseSIMDTwoSameVectorTied<bit Q, bit U, bits<2> size, bits<5> opcode, - RegisterOperand regtype, string asm, string dstkind, - string srckind, list<dag> pattern> + bits<2> size2, RegisterOperand regtype, + string asm, string dstkind, string srckind, + list<dag> pattern> : I<(outs regtype:$dst), (ins regtype:$Rd, regtype:$Rn), asm, "{\t$Rd" # dstkind # ", $Rn" # srckind # "|" # dstkind # "\t$Rd, $Rn}", "$Rd = $dst", pattern>, @@ -4382,7 +4645,9 @@ class BaseSIMDTwoSameVectorTied<bit Q, bit U, bits<2> size, bits<5> opcode, let Inst{29} = U; let Inst{28-24} = 0b01110; let Inst{23-22} = size; - let Inst{21-17} = 0b10000; + let Inst{21} = 0b1; + let Inst{20-19} = size2; + let Inst{18-17} = 0b00; let Inst{16-12} = opcode; let Inst{11-10} = 0b10; let Inst{9-5} = Rn; @@ -4392,22 +4657,22 @@ class BaseSIMDTwoSameVectorTied<bit Q, bit U, bits<2> size, bits<5> opcode, // Supports B, H, and S element sizes. multiclass SIMDTwoVectorBHS<bit U, bits<5> opc, string asm, SDPatternOperator OpNode> { - def v8i8 : BaseSIMDTwoSameVector<0, U, 0b00, opc, V64, + def v8i8 : BaseSIMDTwoSameVector<0, U, 0b00, opc, 0b00, V64, asm, ".8b", ".8b", [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>; - def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, V128, + def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, 0b00, V128, asm, ".16b", ".16b", [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>; - def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, V64, + def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, 0b00, V64, asm, ".4h", ".4h", [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn)))]>; - def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, V128, + def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, 0b00, V128, asm, ".8h", ".8h", [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn)))]>; - def v2i32 : BaseSIMDTwoSameVector<0, U, 0b10, opc, V64, + def v2i32 : BaseSIMDTwoSameVector<0, U, 0b10, opc, 0b00, V64, asm, ".2s", ".2s", [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>; - def v4i32 : BaseSIMDTwoSameVector<1, U, 0b10, opc, V128, + def v4i32 : BaseSIMDTwoSameVector<1, U, 0b10, opc, 0b00, V128, asm, ".4s", ".4s", [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>; } @@ -4450,49 +4715,49 @@ multiclass SIMDVectorLShiftLongBySizeBHS { // Supports all element sizes. 
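// Illustrative sketch (not from the patch): BaseSIMDTwoSameVector and
// BaseSIMDTwoSameVectorTied gain a `size2` parameter. Bits 20-19, previously
// buried in the constant Inst{21-17} = 0b10000, become programmable: the
// existing integer and single/double instantiations pass 0b00 and keep their
// encodings unchanged, while the new half-precision forms set 0b11.
// Hypothetical class names, shown only for the bit layout:
class TwoMiscEncSketch {
  bits<32> Inst;
}
class TwoMiscSizeSketch<bits<2> size, bits<2> size2> : TwoMiscEncSketch {
  let Inst{23-22} = size;
  let Inst{21}    = 0b1;
  let Inst{20-19} = size2;  // 0b00 for existing forms, 0b11 for half precision
  let Inst{18-17} = 0b00;
}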
multiclass SIMDLongTwoVector<bit U, bits<5> opc, string asm, SDPatternOperator OpNode> { - def v8i8_v4i16 : BaseSIMDTwoSameVector<0, U, 0b00, opc, V64, + def v8i8_v4i16 : BaseSIMDTwoSameVector<0, U, 0b00, opc, 0b00, V64, asm, ".4h", ".8b", [(set (v4i16 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>; - def v16i8_v8i16 : BaseSIMDTwoSameVector<1, U, 0b00, opc, V128, + def v16i8_v8i16 : BaseSIMDTwoSameVector<1, U, 0b00, opc, 0b00, V128, asm, ".8h", ".16b", [(set (v8i16 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>; - def v4i16_v2i32 : BaseSIMDTwoSameVector<0, U, 0b01, opc, V64, + def v4i16_v2i32 : BaseSIMDTwoSameVector<0, U, 0b01, opc, 0b00, V64, asm, ".2s", ".4h", [(set (v2i32 V64:$Rd), (OpNode (v4i16 V64:$Rn)))]>; - def v8i16_v4i32 : BaseSIMDTwoSameVector<1, U, 0b01, opc, V128, + def v8i16_v4i32 : BaseSIMDTwoSameVector<1, U, 0b01, opc, 0b00, V128, asm, ".4s", ".8h", [(set (v4i32 V128:$Rd), (OpNode (v8i16 V128:$Rn)))]>; - def v2i32_v1i64 : BaseSIMDTwoSameVector<0, U, 0b10, opc, V64, + def v2i32_v1i64 : BaseSIMDTwoSameVector<0, U, 0b10, opc, 0b00, V64, asm, ".1d", ".2s", [(set (v1i64 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>; - def v4i32_v2i64 : BaseSIMDTwoSameVector<1, U, 0b10, opc, V128, + def v4i32_v2i64 : BaseSIMDTwoSameVector<1, U, 0b10, opc, 0b00, V128, asm, ".2d", ".4s", [(set (v2i64 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>; } multiclass SIMDLongTwoVectorTied<bit U, bits<5> opc, string asm, SDPatternOperator OpNode> { - def v8i8_v4i16 : BaseSIMDTwoSameVectorTied<0, U, 0b00, opc, V64, + def v8i8_v4i16 : BaseSIMDTwoSameVectorTied<0, U, 0b00, opc, 0b00, V64, asm, ".4h", ".8b", [(set (v4i16 V64:$dst), (OpNode (v4i16 V64:$Rd), (v8i8 V64:$Rn)))]>; - def v16i8_v8i16 : BaseSIMDTwoSameVectorTied<1, U, 0b00, opc, V128, + def v16i8_v8i16 : BaseSIMDTwoSameVectorTied<1, U, 0b00, opc, 0b00, V128, asm, ".8h", ".16b", [(set (v8i16 V128:$dst), (OpNode (v8i16 V128:$Rd), (v16i8 V128:$Rn)))]>; - def v4i16_v2i32 : BaseSIMDTwoSameVectorTied<0, U, 0b01, opc, V64, + def v4i16_v2i32 : BaseSIMDTwoSameVectorTied<0, U, 0b01, opc, 0b00, V64, asm, ".2s", ".4h", [(set (v2i32 V64:$dst), (OpNode (v2i32 V64:$Rd), (v4i16 V64:$Rn)))]>; - def v8i16_v4i32 : BaseSIMDTwoSameVectorTied<1, U, 0b01, opc, V128, + def v8i16_v4i32 : BaseSIMDTwoSameVectorTied<1, U, 0b01, opc, 0b00, V128, asm, ".4s", ".8h", [(set (v4i32 V128:$dst), (OpNode (v4i32 V128:$Rd), (v8i16 V128:$Rn)))]>; - def v2i32_v1i64 : BaseSIMDTwoSameVectorTied<0, U, 0b10, opc, V64, + def v2i32_v1i64 : BaseSIMDTwoSameVectorTied<0, U, 0b10, opc, 0b00, V64, asm, ".1d", ".2s", [(set (v1i64 V64:$dst), (OpNode (v1i64 V64:$Rd), (v2i32 V64:$Rn)))]>; - def v4i32_v2i64 : BaseSIMDTwoSameVectorTied<1, U, 0b10, opc, V128, + def v4i32_v2i64 : BaseSIMDTwoSameVectorTied<1, U, 0b10, opc, 0b00, V128, asm, ".2d", ".4s", [(set (v2i64 V128:$dst), (OpNode (v2i64 V128:$Rd), (v4i32 V128:$Rn)))]>; @@ -4501,50 +4766,50 @@ multiclass SIMDLongTwoVectorTied<bit U, bits<5> opc, string asm, // Supports all element sizes, except 1xD. 
multiclass SIMDTwoVectorBHSDTied<bit U, bits<5> opc, string asm, SDPatternOperator OpNode> { - def v8i8 : BaseSIMDTwoSameVectorTied<0, U, 0b00, opc, V64, + def v8i8 : BaseSIMDTwoSameVectorTied<0, U, 0b00, opc, 0b00, V64, asm, ".8b", ".8b", [(set (v8i8 V64:$dst), (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn)))]>; - def v16i8 : BaseSIMDTwoSameVectorTied<1, U, 0b00, opc, V128, + def v16i8 : BaseSIMDTwoSameVectorTied<1, U, 0b00, opc, 0b00, V128, asm, ".16b", ".16b", [(set (v16i8 V128:$dst), (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn)))]>; - def v4i16 : BaseSIMDTwoSameVectorTied<0, U, 0b01, opc, V64, + def v4i16 : BaseSIMDTwoSameVectorTied<0, U, 0b01, opc, 0b00, V64, asm, ".4h", ".4h", [(set (v4i16 V64:$dst), (OpNode (v4i16 V64:$Rd), (v4i16 V64:$Rn)))]>; - def v8i16 : BaseSIMDTwoSameVectorTied<1, U, 0b01, opc, V128, + def v8i16 : BaseSIMDTwoSameVectorTied<1, U, 0b01, opc, 0b00, V128, asm, ".8h", ".8h", [(set (v8i16 V128:$dst), (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn)))]>; - def v2i32 : BaseSIMDTwoSameVectorTied<0, U, 0b10, opc, V64, + def v2i32 : BaseSIMDTwoSameVectorTied<0, U, 0b10, opc, 0b00, V64, asm, ".2s", ".2s", [(set (v2i32 V64:$dst), (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn)))]>; - def v4i32 : BaseSIMDTwoSameVectorTied<1, U, 0b10, opc, V128, + def v4i32 : BaseSIMDTwoSameVectorTied<1, U, 0b10, opc, 0b00, V128, asm, ".4s", ".4s", [(set (v4i32 V128:$dst), (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn)))]>; - def v2i64 : BaseSIMDTwoSameVectorTied<1, U, 0b11, opc, V128, + def v2i64 : BaseSIMDTwoSameVectorTied<1, U, 0b11, opc, 0b00, V128, asm, ".2d", ".2d", [(set (v2i64 V128:$dst), (OpNode (v2i64 V128:$Rd), (v2i64 V128:$Rn)))]>; } multiclass SIMDTwoVectorBHSD<bit U, bits<5> opc, string asm, SDPatternOperator OpNode = null_frag> { - def v8i8 : BaseSIMDTwoSameVector<0, U, 0b00, opc, V64, + def v8i8 : BaseSIMDTwoSameVector<0, U, 0b00, opc, 0b00, V64, asm, ".8b", ".8b", [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>; - def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, V128, + def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, 0b00, V128, asm, ".16b", ".16b", [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>; - def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, V64, + def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, 0b00, V64, asm, ".4h", ".4h", [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn)))]>; - def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, V128, + def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, 0b00, V128, asm, ".8h", ".8h", [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn)))]>; - def v2i32 : BaseSIMDTwoSameVector<0, U, 0b10, opc, V64, + def v2i32 : BaseSIMDTwoSameVector<0, U, 0b10, opc, 0b00, V64, asm, ".2s", ".2s", [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>; - def v4i32 : BaseSIMDTwoSameVector<1, U, 0b10, opc, V128, + def v4i32 : BaseSIMDTwoSameVector<1, U, 0b10, opc, 0b00, V128, asm, ".4s", ".4s", [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>; - def v2i64 : BaseSIMDTwoSameVector<1, U, 0b11, opc, V128, + def v2i64 : BaseSIMDTwoSameVector<1, U, 0b11, opc, 0b00, V128, asm, ".2d", ".2d", [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn)))]>; } @@ -4553,10 +4818,10 @@ multiclass SIMDTwoVectorBHSD<bit U, bits<5> opc, string asm, // Supports only B element sizes. 
 multiclass SIMDTwoVectorB<bit U, bits<2> size, bits<5> opc, string asm,
                           SDPatternOperator OpNode> {
-  def v8i8 : BaseSIMDTwoSameVector<0, U, size, opc, V64,
+  def v8i8 : BaseSIMDTwoSameVector<0, U, size, opc, 0b00, V64,
                                    asm, ".8b", ".8b",
                                    [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>;
-  def v16i8 : BaseSIMDTwoSameVector<1, U, size, opc, V128,
+  def v16i8 : BaseSIMDTwoSameVector<1, U, size, opc, 0b00, V128,
                                     asm, ".16b", ".16b",
                                     [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>;
@@ -4565,16 +4830,16 @@ multiclass SIMDTwoVectorB<bit U, bits<2> size, bits<5> opc, string asm,
 // Supports only B and H element sizes.
 multiclass SIMDTwoVectorBH<bit U, bits<5> opc, string asm,
                            SDPatternOperator OpNode> {
-  def v8i8 : BaseSIMDTwoSameVector<0, U, 0b00, opc, V64,
+  def v8i8 : BaseSIMDTwoSameVector<0, U, 0b00, opc, 0b00, V64,
                                    asm, ".8b", ".8b",
                                    [(set (v8i8 V64:$Rd), (OpNode V64:$Rn))]>;
-  def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, V128,
+  def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, 0b00, V128,
                                     asm, ".16b", ".16b",
                                     [(set (v16i8 V128:$Rd), (OpNode V128:$Rn))]>;
-  def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, V64,
+  def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, 0b00, V64,
                                     asm, ".4h", ".4h",
                                     [(set (v4i16 V64:$Rd), (OpNode V64:$Rn))]>;
-  def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, V128,
+  def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, 0b00, V128,
                                     asm, ".8h", ".8h",
                                     [(set (v8i16 V128:$Rd), (OpNode V128:$Rn))]>;
 }
@@ -4583,13 +4848,21 @@ multiclass SIMDTwoVectorBH<bit U, bits<5> opc, string asm,
 // as an extra opcode bit.
 multiclass SIMDTwoVectorFP<bit U, bit S, bits<5> opc, string asm,
                            SDPatternOperator OpNode> {
-  def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, V64,
+  let Predicates = [HasNEON, HasFullFP16] in {
+  def v4f16 : BaseSIMDTwoSameVector<0, U, {S,1}, opc, 0b11, V64,
+                                    asm, ".4h", ".4h",
+                                    [(set (v4f16 V64:$Rd), (OpNode (v4f16 V64:$Rn)))]>;
+  def v8f16 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, 0b11, V128,
+                                    asm, ".8h", ".8h",
+                                    [(set (v8f16 V128:$Rd), (OpNode (v8f16 V128:$Rn)))]>;
+  } // Predicates = [HasNEON, HasFullFP16]
+  def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, 0b00, V64,
                                     asm, ".2s", ".2s",
                                     [(set (v2f32 V64:$Rd), (OpNode (v2f32 V64:$Rn)))]>;
-  def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, V128,
+  def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, 0b00, V128,
                                     asm, ".4s", ".4s",
                                     [(set (v4f32 V128:$Rd), (OpNode (v4f32 V128:$Rn)))]>;
-  def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, V128,
+  def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, 0b00, V128,
                                     asm, ".2d", ".2d",
                                     [(set (v2f64 V128:$Rd), (OpNode (v2f64 V128:$Rn)))]>;
 }
@@ -4597,10 +4870,10 @@ multiclass SIMDTwoVectorFP<bit U, bit S, bits<5> opc, string asm,
 // Supports only S element size.
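// Illustrative sketch (not from the patch): multiclasses such as
// SIMDTwoVectorFP above are driven from AArch64InstrInfo.td by a single defm,
// which after this change also produces the .4h/.8h variants, gated on the
// fullfp16 feature. A minimal usage sketch with a hypothetical mnemonic and
// opcode, using fabs purely as a stand-in selection operator:
defm FEXAMPLE : SIMDTwoVectorFP<0, 1, 0b11111, "fexample", fabs>;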
multiclass SIMDTwoVectorS<bit U, bit S, bits<5> opc, string asm, SDPatternOperator OpNode> { - def v2i32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, V64, + def v2i32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, 0b00, V64, asm, ".2s", ".2s", [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>; - def v4i32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, V128, + def v4i32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, 0b00, V128, asm, ".4s", ".4s", [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>; } @@ -4608,26 +4881,42 @@ multiclass SIMDTwoVectorS<bit U, bit S, bits<5> opc, string asm, multiclass SIMDTwoVectorFPToInt<bit U, bit S, bits<5> opc, string asm, SDPatternOperator OpNode> { - def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, V64, + let Predicates = [HasNEON, HasFullFP16] in { + def v4f16 : BaseSIMDTwoSameVector<0, U, {S,1}, opc, 0b11, V64, + asm, ".4h", ".4h", + [(set (v4i16 V64:$Rd), (OpNode (v4f16 V64:$Rn)))]>; + def v8f16 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, 0b11, V128, + asm, ".8h", ".8h", + [(set (v8i16 V128:$Rd), (OpNode (v8f16 V128:$Rn)))]>; + } // Predicates = [HasNEON, HasFullFP16] + def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, 0b00, V64, asm, ".2s", ".2s", [(set (v2i32 V64:$Rd), (OpNode (v2f32 V64:$Rn)))]>; - def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, V128, + def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, 0b00, V128, asm, ".4s", ".4s", [(set (v4i32 V128:$Rd), (OpNode (v4f32 V128:$Rn)))]>; - def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, V128, + def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, 0b00, V128, asm, ".2d", ".2d", [(set (v2i64 V128:$Rd), (OpNode (v2f64 V128:$Rn)))]>; } multiclass SIMDTwoVectorIntToFP<bit U, bit S, bits<5> opc, string asm, SDPatternOperator OpNode> { - def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, V64, + let Predicates = [HasNEON, HasFullFP16] in { + def v4f16 : BaseSIMDTwoSameVector<0, U, {S,1}, opc, 0b11, V64, + asm, ".4h", ".4h", + [(set (v4f16 V64:$Rd), (OpNode (v4i16 V64:$Rn)))]>; + def v8f16 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, 0b11, V128, + asm, ".8h", ".8h", + [(set (v8f16 V128:$Rd), (OpNode (v8i16 V128:$Rn)))]>; + } // Predicates = [HasNEON, HasFullFP16] + def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, 0b00, V64, asm, ".2s", ".2s", [(set (v2f32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>; - def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, V128, + def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, 0b00, V128, asm, ".4s", ".4s", [(set (v4f32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>; - def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, V128, + def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, 0b00, V128, asm, ".2d", ".2d", [(set (v2f64 V128:$Rd), (OpNode (v2i64 V128:$Rn)))]>; } @@ -4706,10 +4995,10 @@ multiclass SIMDMixedTwoVector<bit U, bits<5> opc, string asm, (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>; } -class BaseSIMDCmpTwoVector<bit Q, bit U, bits<2> size, bits<5> opcode, - RegisterOperand regtype, - string asm, string kind, string zero, - ValueType dty, ValueType sty, SDNode OpNode> +class BaseSIMDCmpTwoVector<bit Q, bit U, bits<2> size, bits<2> size2, + bits<5> opcode, RegisterOperand regtype, string asm, + string kind, string zero, ValueType dty, + ValueType sty, SDNode OpNode> : I<(outs regtype:$Rd), (ins regtype:$Rn), asm, "{\t$Rd" # kind # ", $Rn" # kind # ", #" # zero # "|" # kind # "\t$Rd, $Rn, #" # zero # "}", "", @@ -4722,7 +5011,9 @@ class BaseSIMDCmpTwoVector<bit Q, bit U, bits<2> size, bits<5> opcode, let Inst{29} = U; let Inst{28-24} = 0b01110; let Inst{23-22} = size; - 
let Inst{21-17} = 0b10000; + let Inst{21} = 0b1; + let Inst{20-19} = size2; + let Inst{18-17} = 0b00; let Inst{16-12} = opcode; let Inst{11-10} = 0b10; let Inst{9-5} = Rn; @@ -4732,54 +5023,74 @@ class BaseSIMDCmpTwoVector<bit Q, bit U, bits<2> size, bits<5> opcode, // Comparisons support all element sizes, except 1xD. multiclass SIMDCmpTwoVector<bit U, bits<5> opc, string asm, SDNode OpNode> { - def v8i8rz : BaseSIMDCmpTwoVector<0, U, 0b00, opc, V64, + def v8i8rz : BaseSIMDCmpTwoVector<0, U, 0b00, 0b00, opc, V64, asm, ".8b", "0", v8i8, v8i8, OpNode>; - def v16i8rz : BaseSIMDCmpTwoVector<1, U, 0b00, opc, V128, + def v16i8rz : BaseSIMDCmpTwoVector<1, U, 0b00, 0b00, opc, V128, asm, ".16b", "0", v16i8, v16i8, OpNode>; - def v4i16rz : BaseSIMDCmpTwoVector<0, U, 0b01, opc, V64, + def v4i16rz : BaseSIMDCmpTwoVector<0, U, 0b01, 0b00, opc, V64, asm, ".4h", "0", v4i16, v4i16, OpNode>; - def v8i16rz : BaseSIMDCmpTwoVector<1, U, 0b01, opc, V128, + def v8i16rz : BaseSIMDCmpTwoVector<1, U, 0b01, 0b00, opc, V128, asm, ".8h", "0", v8i16, v8i16, OpNode>; - def v2i32rz : BaseSIMDCmpTwoVector<0, U, 0b10, opc, V64, + def v2i32rz : BaseSIMDCmpTwoVector<0, U, 0b10, 0b00, opc, V64, asm, ".2s", "0", v2i32, v2i32, OpNode>; - def v4i32rz : BaseSIMDCmpTwoVector<1, U, 0b10, opc, V128, + def v4i32rz : BaseSIMDCmpTwoVector<1, U, 0b10, 0b00, opc, V128, asm, ".4s", "0", v4i32, v4i32, OpNode>; - def v2i64rz : BaseSIMDCmpTwoVector<1, U, 0b11, opc, V128, + def v2i64rz : BaseSIMDCmpTwoVector<1, U, 0b11, 0b00, opc, V128, asm, ".2d", "0", v2i64, v2i64, OpNode>; } -// FP Comparisons support only S and D element sizes. +// FP Comparisons support only S and D element sizes (and H for v8.2a). multiclass SIMDFPCmpTwoVector<bit U, bit S, bits<5> opc, string asm, SDNode OpNode> { - def v2i32rz : BaseSIMDCmpTwoVector<0, U, {S,0}, opc, V64, + let Predicates = [HasNEON, HasFullFP16] in { + def v4i16rz : BaseSIMDCmpTwoVector<0, U, {S,1}, 0b11, opc, V64, + asm, ".4h", "0.0", + v4i16, v4f16, OpNode>; + def v8i16rz : BaseSIMDCmpTwoVector<1, U, {S,1}, 0b11, opc, V128, + asm, ".8h", "0.0", + v8i16, v8f16, OpNode>; + } // Predicates = [HasNEON, HasFullFP16] + def v2i32rz : BaseSIMDCmpTwoVector<0, U, {S,0}, 0b00, opc, V64, asm, ".2s", "0.0", v2i32, v2f32, OpNode>; - def v4i32rz : BaseSIMDCmpTwoVector<1, U, {S,0}, opc, V128, + def v4i32rz : BaseSIMDCmpTwoVector<1, U, {S,0}, 0b00, opc, V128, asm, ".4s", "0.0", v4i32, v4f32, OpNode>; - def v2i64rz : BaseSIMDCmpTwoVector<1, U, {S,1}, opc, V128, + def v2i64rz : BaseSIMDCmpTwoVector<1, U, {S,1}, 0b00, opc, V128, asm, ".2d", "0.0", v2i64, v2f64, OpNode>; - def : InstAlias<asm # " $Vd.2s, $Vn.2s, #0", + let Predicates = [HasNEON, HasFullFP16] in { + def : InstAlias<asm # "\t$Vd.4h, $Vn.4h, #0", + (!cast<Instruction>(NAME # v4i16rz) V64:$Vd, V64:$Vn), 0>; + def : InstAlias<asm # "\t$Vd.8h, $Vn.8h, #0", + (!cast<Instruction>(NAME # v8i16rz) V128:$Vd, V128:$Vn), 0>; + } + def : InstAlias<asm # "\t$Vd.2s, $Vn.2s, #0", (!cast<Instruction>(NAME # v2i32rz) V64:$Vd, V64:$Vn), 0>; - def : InstAlias<asm # " $Vd.4s, $Vn.4s, #0", + def : InstAlias<asm # "\t$Vd.4s, $Vn.4s, #0", (!cast<Instruction>(NAME # v4i32rz) V128:$Vd, V128:$Vn), 0>; - def : InstAlias<asm # " $Vd.2d, $Vn.2d, #0", + def : InstAlias<asm # "\t$Vd.2d, $Vn.2d, #0", (!cast<Instruction>(NAME # v2i64rz) V128:$Vd, V128:$Vn), 0>; - def : InstAlias<asm # ".2s $Vd, $Vn, #0", + let Predicates = [HasNEON, HasFullFP16] in { + def : InstAlias<asm # ".4h\t$Vd, $Vn, #0", + (!cast<Instruction>(NAME # v4i16rz) V64:$Vd, V64:$Vn), 0>; + def : InstAlias<asm # 
".8h\t$Vd, $Vn, #0", + (!cast<Instruction>(NAME # v8i16rz) V128:$Vd, V128:$Vn), 0>; + } + def : InstAlias<asm # ".2s\t$Vd, $Vn, #0", (!cast<Instruction>(NAME # v2i32rz) V64:$Vd, V64:$Vn), 0>; - def : InstAlias<asm # ".4s $Vd, $Vn, #0", + def : InstAlias<asm # ".4s\t$Vd, $Vn, #0", (!cast<Instruction>(NAME # v4i32rz) V128:$Vd, V128:$Vn), 0>; - def : InstAlias<asm # ".2d $Vd, $Vn, #0", + def : InstAlias<asm # ".2d\t$Vd, $Vn, #0", (!cast<Instruction>(NAME # v2i64rz) V128:$Vd, V128:$Vn), 0>; } @@ -5325,7 +5636,7 @@ multiclass SIMDZipVector<bits<3>opc, string asm, //---------------------------------------------------------------------------- let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in -class BaseSIMDThreeScalar<bit U, bits<2> size, bits<5> opcode, +class BaseSIMDThreeScalar<bit U, bits<3> size, bits<5> opcode, RegisterClass regtype, string asm, list<dag> pattern> : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), asm, @@ -5337,8 +5648,7 @@ class BaseSIMDThreeScalar<bit U, bits<2> size, bits<5> opcode, let Inst{31-30} = 0b01; let Inst{29} = U; let Inst{28-24} = 0b11110; - let Inst{23-22} = size; - let Inst{21} = 1; + let Inst{23-21} = size; let Inst{20-16} = Rm; let Inst{15-11} = opcode; let Inst{10} = 1; @@ -5369,17 +5679,17 @@ class BaseSIMDThreeScalarTied<bit U, bits<2> size, bit R, bits<5> opcode, multiclass SIMDThreeScalarD<bit U, bits<5> opc, string asm, SDPatternOperator OpNode> { - def v1i64 : BaseSIMDThreeScalar<U, 0b11, opc, FPR64, asm, + def v1i64 : BaseSIMDThreeScalar<U, 0b111, opc, FPR64, asm, [(set (v1i64 FPR64:$Rd), (OpNode (v1i64 FPR64:$Rn), (v1i64 FPR64:$Rm)))]>; } multiclass SIMDThreeScalarBHSD<bit U, bits<5> opc, string asm, SDPatternOperator OpNode> { - def v1i64 : BaseSIMDThreeScalar<U, 0b11, opc, FPR64, asm, + def v1i64 : BaseSIMDThreeScalar<U, 0b111, opc, FPR64, asm, [(set (v1i64 FPR64:$Rd), (OpNode (v1i64 FPR64:$Rn), (v1i64 FPR64:$Rm)))]>; - def v1i32 : BaseSIMDThreeScalar<U, 0b10, opc, FPR32, asm, []>; - def v1i16 : BaseSIMDThreeScalar<U, 0b01, opc, FPR16, asm, []>; - def v1i8 : BaseSIMDThreeScalar<U, 0b00, opc, FPR8 , asm, []>; + def v1i32 : BaseSIMDThreeScalar<U, 0b101, opc, FPR32, asm, []>; + def v1i16 : BaseSIMDThreeScalar<U, 0b011, opc, FPR16, asm, []>; + def v1i8 : BaseSIMDThreeScalar<U, 0b001, opc, FPR8 , asm, []>; def : Pat<(i64 (OpNode (i64 FPR64:$Rn), (i64 FPR64:$Rm))), (!cast<Instruction>(NAME#"v1i64") FPR64:$Rn, FPR64:$Rm)>; @@ -5389,9 +5699,9 @@ multiclass SIMDThreeScalarBHSD<bit U, bits<5> opc, string asm, multiclass SIMDThreeScalarHS<bit U, bits<5> opc, string asm, SDPatternOperator OpNode> { - def v1i32 : BaseSIMDThreeScalar<U, 0b10, opc, FPR32, asm, + def v1i32 : BaseSIMDThreeScalar<U, 0b101, opc, FPR32, asm, [(set FPR32:$Rd, (OpNode FPR32:$Rn, FPR32:$Rm))]>; - def v1i16 : BaseSIMDThreeScalar<U, 0b01, opc, FPR16, asm, []>; + def v1i16 : BaseSIMDThreeScalar<U, 0b011, opc, FPR16, asm, []>; } multiclass SIMDThreeScalarHSTied<bit U, bit R, bits<5> opc, string asm, @@ -5404,26 +5714,34 @@ multiclass SIMDThreeScalarHSTied<bit U, bit R, bits<5> opc, string asm, asm, []>; } -multiclass SIMDThreeScalarSD<bit U, bit S, bits<5> opc, string asm, +multiclass SIMDFPThreeScalar<bit U, bit S, bits<3> opc, string asm, SDPatternOperator OpNode = null_frag> { let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { - def #NAME#64 : BaseSIMDThreeScalar<U, {S,1}, opc, FPR64, asm, + def #NAME#64 : BaseSIMDThreeScalar<U, {S,0b11}, {0b11,opc}, FPR64, asm, [(set (f64 FPR64:$Rd), (OpNode (f64 FPR64:$Rn), (f64 FPR64:$Rm)))]>; - def #NAME#32 : BaseSIMDThreeScalar<U, 
{S,0}, opc, FPR32, asm, + def #NAME#32 : BaseSIMDThreeScalar<U, {S,0b01}, {0b11,opc}, FPR32, asm, [(set FPR32:$Rd, (OpNode FPR32:$Rn, FPR32:$Rm))]>; + let Predicates = [HasNEON, HasFullFP16] in { + def #NAME#16 : BaseSIMDThreeScalar<U, {S,0b10}, {0b00,opc}, FPR16, asm, + [(set FPR16:$Rd, (OpNode FPR16:$Rn, FPR16:$Rm))]>; + } // Predicates = [HasNEON, HasFullFP16] } def : Pat<(v1f64 (OpNode (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), (!cast<Instruction>(NAME # "64") FPR64:$Rn, FPR64:$Rm)>; } -multiclass SIMDThreeScalarFPCmp<bit U, bit S, bits<5> opc, string asm, +multiclass SIMDThreeScalarFPCmp<bit U, bit S, bits<3> opc, string asm, SDPatternOperator OpNode = null_frag> { let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { - def #NAME#64 : BaseSIMDThreeScalar<U, {S,1}, opc, FPR64, asm, + def #NAME#64 : BaseSIMDThreeScalar<U, {S,0b11}, {0b11,opc}, FPR64, asm, [(set (i64 FPR64:$Rd), (OpNode (f64 FPR64:$Rn), (f64 FPR64:$Rm)))]>; - def #NAME#32 : BaseSIMDThreeScalar<U, {S,0}, opc, FPR32, asm, + def #NAME#32 : BaseSIMDThreeScalar<U, {S,0b01}, {0b11,opc}, FPR32, asm, [(set (i32 FPR32:$Rd), (OpNode (f32 FPR32:$Rn), (f32 FPR32:$Rm)))]>; + let Predicates = [HasNEON, HasFullFP16] in { + def #NAME#16 : BaseSIMDThreeScalar<U, {S,0b10}, {0b00,opc}, FPR16, asm, + []>; + } // Predicates = [HasNEON, HasFullFP16] } def : Pat<(v1i64 (OpNode (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), @@ -5482,7 +5800,7 @@ multiclass SIMDThreeScalarMixedTiedHS<bit U, bits<5> opc, string asm, //---------------------------------------------------------------------------- let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseSIMDTwoScalar<bit U, bits<2> size, bits<5> opcode, +class BaseSIMDTwoScalar<bit U, bits<2> size, bits<2> size2, bits<5> opcode, RegisterClass regtype, RegisterClass regtype2, string asm, list<dag> pat> : I<(outs regtype:$Rd), (ins regtype2:$Rn), asm, @@ -5494,7 +5812,9 @@ class BaseSIMDTwoScalar<bit U, bits<2> size, bits<5> opcode, let Inst{29} = U; let Inst{28-24} = 0b11110; let Inst{23-22} = size; - let Inst{21-17} = 0b10000; + let Inst{21} = 0b1; + let Inst{20-19} = size2; + let Inst{18-17} = 0b00; let Inst{16-12} = opcode; let Inst{11-10} = 0b10; let Inst{9-5} = Rn; @@ -5523,7 +5843,7 @@ class BaseSIMDTwoScalarTied<bit U, bits<2> size, bits<5> opcode, let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseSIMDCmpTwoScalar<bit U, bits<2> size, bits<5> opcode, +class BaseSIMDCmpTwoScalar<bit U, bits<2> size, bits<2> size2, bits<5> opcode, RegisterClass regtype, string asm, string zero> : I<(outs regtype:$Rd), (ins regtype:$Rn), asm, "\t$Rd, $Rn, #" # zero, "", []>, @@ -5534,7 +5854,9 @@ class BaseSIMDCmpTwoScalar<bit U, bits<2> size, bits<5> opcode, let Inst{29} = U; let Inst{28-24} = 0b11110; let Inst{23-22} = size; - let Inst{21-17} = 0b10000; + let Inst{21} = 0b1; + let Inst{20-19} = size2; + let Inst{18-17} = 0b00; let Inst{16-12} = opcode; let Inst{11-10} = 0b10; let Inst{9-5} = Rn; @@ -5556,21 +5878,28 @@ class SIMDInexactCvtTwoScalar<bits<5> opcode, string asm> multiclass SIMDCmpTwoScalarD<bit U, bits<5> opc, string asm, SDPatternOperator OpNode> { - def v1i64rz : BaseSIMDCmpTwoScalar<U, 0b11, opc, FPR64, asm, "0">; + def v1i64rz : BaseSIMDCmpTwoScalar<U, 0b11, 0b00, opc, FPR64, asm, "0">; def : Pat<(v1i64 (OpNode FPR64:$Rn)), (!cast<Instruction>(NAME # v1i64rz) FPR64:$Rn)>; } -multiclass SIMDCmpTwoScalarSD<bit U, bit S, bits<5> opc, string asm, +multiclass SIMDFPCmpTwoScalar<bit U, bit S, bits<5> opc, string asm, SDPatternOperator OpNode> { - def v1i64rz : BaseSIMDCmpTwoScalar<U, 
{S,1}, opc, FPR64, asm, "0.0">; - def v1i32rz : BaseSIMDCmpTwoScalar<U, {S,0}, opc, FPR32, asm, "0.0">; + def v1i64rz : BaseSIMDCmpTwoScalar<U, {S,1}, 0b00, opc, FPR64, asm, "0.0">; + def v1i32rz : BaseSIMDCmpTwoScalar<U, {S,0}, 0b00, opc, FPR32, asm, "0.0">; + let Predicates = [HasNEON, HasFullFP16] in { + def v1i16rz : BaseSIMDCmpTwoScalar<U, {S,1}, 0b11, opc, FPR16, asm, "0.0">; + } - def : InstAlias<asm # " $Rd, $Rn, #0", + def : InstAlias<asm # "\t$Rd, $Rn, #0", (!cast<Instruction>(NAME # v1i64rz) FPR64:$Rd, FPR64:$Rn), 0>; - def : InstAlias<asm # " $Rd, $Rn, #0", + def : InstAlias<asm # "\t$Rd, $Rn, #0", (!cast<Instruction>(NAME # v1i32rz) FPR32:$Rd, FPR32:$Rn), 0>; + let Predicates = [HasNEON, HasFullFP16] in { + def : InstAlias<asm # "\t$Rd, $Rn, #0", + (!cast<Instruction>(NAME # v1i16rz) FPR16:$Rd, FPR16:$Rn), 0>; + } def : Pat<(v1i64 (OpNode (v1f64 FPR64:$Rn))), (!cast<Instruction>(NAME # v1i64rz) FPR64:$Rn)>; @@ -5578,35 +5907,42 @@ multiclass SIMDCmpTwoScalarSD<bit U, bit S, bits<5> opc, string asm, multiclass SIMDTwoScalarD<bit U, bits<5> opc, string asm, SDPatternOperator OpNode = null_frag> { - def v1i64 : BaseSIMDTwoScalar<U, 0b11, opc, FPR64, FPR64, asm, + def v1i64 : BaseSIMDTwoScalar<U, 0b11, 0b00, opc, FPR64, FPR64, asm, [(set (v1i64 FPR64:$Rd), (OpNode (v1i64 FPR64:$Rn)))]>; def : Pat<(i64 (OpNode (i64 FPR64:$Rn))), (!cast<Instruction>(NAME # "v1i64") FPR64:$Rn)>; } -multiclass SIMDTwoScalarSD<bit U, bit S, bits<5> opc, string asm> { - def v1i64 : BaseSIMDTwoScalar<U, {S,1}, opc, FPR64, FPR64, asm,[]>; - def v1i32 : BaseSIMDTwoScalar<U, {S,0}, opc, FPR32, FPR32, asm,[]>; +multiclass SIMDFPTwoScalar<bit U, bit S, bits<5> opc, string asm> { + def v1i64 : BaseSIMDTwoScalar<U, {S,1}, 0b00, opc, FPR64, FPR64, asm,[]>; + def v1i32 : BaseSIMDTwoScalar<U, {S,0}, 0b00, opc, FPR32, FPR32, asm,[]>; + let Predicates = [HasNEON, HasFullFP16] in { + def v1f16 : BaseSIMDTwoScalar<U, {S,1}, 0b11, opc, FPR16, FPR16, asm,[]>; + } } -multiclass SIMDTwoScalarCVTSD<bit U, bit S, bits<5> opc, string asm, +multiclass SIMDFPTwoScalarCVT<bit U, bit S, bits<5> opc, string asm, SDPatternOperator OpNode> { - def v1i64 : BaseSIMDTwoScalar<U, {S,1}, opc, FPR64, FPR64, asm, + def v1i64 : BaseSIMDTwoScalar<U, {S,1}, 0b00, opc, FPR64, FPR64, asm, [(set FPR64:$Rd, (OpNode (f64 FPR64:$Rn)))]>; - def v1i32 : BaseSIMDTwoScalar<U, {S,0}, opc, FPR32, FPR32, asm, + def v1i32 : BaseSIMDTwoScalar<U, {S,0}, 0b00, opc, FPR32, FPR32, asm, [(set FPR32:$Rd, (OpNode (f32 FPR32:$Rn)))]>; + let Predicates = [HasNEON, HasFullFP16] in { + def v1i16 : BaseSIMDTwoScalar<U, {S,1}, 0b11, opc, FPR16, FPR16, asm, + [(set FPR16:$Rd, (OpNode (f16 FPR16:$Rn)))]>; + } } multiclass SIMDTwoScalarBHSD<bit U, bits<5> opc, string asm, SDPatternOperator OpNode = null_frag> { let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { - def v1i64 : BaseSIMDTwoScalar<U, 0b11, opc, FPR64, FPR64, asm, + def v1i64 : BaseSIMDTwoScalar<U, 0b11, 0b00, opc, FPR64, FPR64, asm, [(set (i64 FPR64:$Rd), (OpNode (i64 FPR64:$Rn)))]>; - def v1i32 : BaseSIMDTwoScalar<U, 0b10, opc, FPR32, FPR32, asm, + def v1i32 : BaseSIMDTwoScalar<U, 0b10, 0b00, opc, FPR32, FPR32, asm, [(set (i32 FPR32:$Rd), (OpNode (i32 FPR32:$Rn)))]>; - def v1i16 : BaseSIMDTwoScalar<U, 0b01, opc, FPR16, FPR16, asm, []>; - def v1i8 : BaseSIMDTwoScalar<U, 0b00, opc, FPR8 , FPR8 , asm, []>; + def v1i16 : BaseSIMDTwoScalar<U, 0b01, 0b00, opc, FPR16, FPR16, asm, []>; + def v1i8 : BaseSIMDTwoScalar<U, 0b00, 0b00, opc, FPR8 , FPR8 , asm, []>; } def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rn))), @@ 
-5633,10 +5969,10 @@ multiclass SIMDTwoScalarBHSDTied<bit U, bits<5> opc, string asm, let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in multiclass SIMDTwoScalarMixedBHS<bit U, bits<5> opc, string asm, SDPatternOperator OpNode = null_frag> { - def v1i32 : BaseSIMDTwoScalar<U, 0b10, opc, FPR32, FPR64, asm, + def v1i32 : BaseSIMDTwoScalar<U, 0b10, 0b00, opc, FPR32, FPR64, asm, [(set (i32 FPR32:$Rd), (OpNode (i64 FPR64:$Rn)))]>; - def v1i16 : BaseSIMDTwoScalar<U, 0b01, opc, FPR16, FPR32, asm, []>; - def v1i8 : BaseSIMDTwoScalar<U, 0b00, opc, FPR8 , FPR16, asm, []>; + def v1i16 : BaseSIMDTwoScalar<U, 0b01, 0b00, opc, FPR16, FPR32, asm, []>; + def v1i8 : BaseSIMDTwoScalar<U, 0b00, 0b00, opc, FPR8 , FPR16, asm, []>; } //---------------------------------------------------------------------------- @@ -5668,10 +6004,14 @@ multiclass SIMDPairwiseScalarD<bit U, bits<5> opc, string asm> { asm, ".2d">; } -multiclass SIMDPairwiseScalarSD<bit U, bit S, bits<5> opc, string asm> { - def v2i32p : BaseSIMDPairwiseScalar<U, {S,0}, opc, FPR32Op, V64, +multiclass SIMDFPPairwiseScalar<bit S, bits<5> opc, string asm> { + let Predicates = [HasNEON, HasFullFP16] in { + def v2i16p : BaseSIMDPairwiseScalar<0, {S,0}, opc, FPR16Op, V64, + asm, ".2h">; + } + def v2i32p : BaseSIMDPairwiseScalar<1, {S,0}, opc, FPR32Op, V64, asm, ".2s">; - def v2i64p : BaseSIMDPairwiseScalar<U, {S,1}, opc, FPR64Op, V128, + def v2i64p : BaseSIMDPairwiseScalar<1, {S,1}, opc, FPR64Op, V128, asm, ".2d">; } @@ -5727,8 +6067,16 @@ multiclass SIMDAcrossLanesHSD<bit U, bits<5> opcode, string asm> { asm, ".4s", []>; } -multiclass SIMDAcrossLanesS<bits<5> opcode, bit sz1, string asm, +multiclass SIMDFPAcrossLanes<bits<5> opcode, bit sz1, string asm, Intrinsic intOp> { + let Predicates = [HasNEON, HasFullFP16] in { + def v4i16v : BaseSIMDAcrossLanes<0, 0, {sz1, 0}, opcode, FPR16, V64, + asm, ".4h", + [(set FPR16:$Rd, (intOp (v4f16 V64:$Rn)))]>; + def v8i16v : BaseSIMDAcrossLanes<1, 0, {sz1, 0}, opcode, FPR16, V128, + asm, ".8h", + [(set FPR16:$Rd, (intOp (v8f16 V128:$Rn)))]>; + } // Predicates = [HasNEON, HasFullFP16] def v4i32v : BaseSIMDAcrossLanes<1, 1, {sz1, 0}, opcode, FPR32, V128, asm, ".4s", [(set FPR32:$Rd, (intOp (v4f32 V128:$Rn)))]>; @@ -5925,7 +6273,7 @@ class SIMDInsMainMovAlias<string size, Instruction inst, class SIMDInsElementMovAlias<string size, Instruction inst, Operand idxtype> : InstAlias<"mov" # "{\t$dst" # size # "$idx, $src" # size # "$idx2" # - # "|" # size #" $dst$idx, $src$idx2}", + # "|" # size #"\t$dst$idx, $src$idx2}", (inst V128:$dst, idxtype:$idx, V128:$src, idxtype:$idx2)>; @@ -6215,7 +6563,7 @@ multiclass SIMDScalarCPY<string asm> { // AdvSIMD modified immediate instructions //---------------------------------------------------------------------------- -class BaseSIMDModifiedImm<bit Q, bit op, dag oops, dag iops, +class BaseSIMDModifiedImm<bit Q, bit op, bit op2, dag oops, dag iops, string asm, string op_string, string cstr, list<dag> pattern> : I<oops, iops, asm, op_string, cstr, pattern>, @@ -6227,16 +6575,17 @@ class BaseSIMDModifiedImm<bit Q, bit op, dag oops, dag iops, let Inst{29} = op; let Inst{28-19} = 0b0111100000; let Inst{18-16} = imm8{7-5}; - let Inst{11-10} = 0b01; + let Inst{11} = op2; + let Inst{10} = 1; let Inst{9-5} = imm8{4-0}; let Inst{4-0} = Rd; } -class BaseSIMDModifiedImmVector<bit Q, bit op, RegisterOperand vectype, +class BaseSIMDModifiedImmVector<bit Q, bit op, bit op2, RegisterOperand vectype, Operand immtype, dag opt_shift_iop, string opt_shift, string asm, string kind, list<dag> pattern> 
- : BaseSIMDModifiedImm<Q, op, (outs vectype:$Rd), + : BaseSIMDModifiedImm<Q, op, op2, (outs vectype:$Rd), !con((ins immtype:$imm8), opt_shift_iop), asm, "{\t$Rd" # kind # ", $imm8" # opt_shift # "|" # kind # "\t$Rd, $imm8" # opt_shift # "}", @@ -6248,7 +6597,7 @@ class BaseSIMDModifiedImmVectorTied<bit Q, bit op, RegisterOperand vectype, Operand immtype, dag opt_shift_iop, string opt_shift, string asm, string kind, list<dag> pattern> - : BaseSIMDModifiedImm<Q, op, (outs vectype:$dst), + : BaseSIMDModifiedImm<Q, op, 0, (outs vectype:$dst), !con((ins vectype:$Rd, immtype:$imm8), opt_shift_iop), asm, "{\t$Rd" # kind # ", $imm8" # opt_shift # "|" # kind # "\t$Rd, $imm8" # opt_shift # "}", @@ -6259,7 +6608,7 @@ class BaseSIMDModifiedImmVectorTied<bit Q, bit op, RegisterOperand vectype, class BaseSIMDModifiedImmVectorShift<bit Q, bit op, bits<2> b15_b12, RegisterOperand vectype, string asm, string kind, list<dag> pattern> - : BaseSIMDModifiedImmVector<Q, op, vectype, imm0_255, + : BaseSIMDModifiedImmVector<Q, op, 0, vectype, imm0_255, (ins logical_vec_shift:$shift), "$shift", asm, kind, pattern> { bits<2> shift; @@ -6284,7 +6633,7 @@ class BaseSIMDModifiedImmVectorShiftTied<bit Q, bit op, bits<2> b15_b12, class BaseSIMDModifiedImmVectorShiftHalf<bit Q, bit op, bits<2> b15_b12, RegisterOperand vectype, string asm, string kind, list<dag> pattern> - : BaseSIMDModifiedImmVector<Q, op, vectype, imm0_255, + : BaseSIMDModifiedImmVector<Q, op, 0, vectype, imm0_255, (ins logical_vec_hw_shift:$shift), "$shift", asm, kind, pattern> { bits<2> shift; @@ -6349,7 +6698,7 @@ multiclass SIMDModifiedImmVectorShiftTied<bit op, bits<2> hw_cmode, class SIMDModifiedImmMoveMSL<bit Q, bit op, bits<4> cmode, RegisterOperand vectype, string asm, string kind, list<dag> pattern> - : BaseSIMDModifiedImmVector<Q, op, vectype, imm0_255, + : BaseSIMDModifiedImmVector<Q, op, 0, vectype, imm0_255, (ins move_vec_shift:$shift), "$shift", asm, kind, pattern> { bits<1> shift; @@ -6357,18 +6706,18 @@ class SIMDModifiedImmMoveMSL<bit Q, bit op, bits<4> cmode, let Inst{12} = shift; } -class SIMDModifiedImmVectorNoShift<bit Q, bit op, bits<4> cmode, +class SIMDModifiedImmVectorNoShift<bit Q, bit op, bit op2, bits<4> cmode, RegisterOperand vectype, Operand imm_type, string asm, string kind, list<dag> pattern> - : BaseSIMDModifiedImmVector<Q, op, vectype, imm_type, (ins), "", + : BaseSIMDModifiedImmVector<Q, op, op2, vectype, imm_type, (ins), "", asm, kind, pattern> { let Inst{15-12} = cmode; } class SIMDModifiedImmScalarNoShift<bit Q, bit op, bits<4> cmode, string asm, list<dag> pattern> - : BaseSIMDModifiedImm<Q, op, (outs FPR64:$Rd), (ins simdimmtype10:$imm8), asm, + : BaseSIMDModifiedImm<Q, op, 0, (outs FPR64:$Rd), (ins simdimmtype10:$imm8), asm, "\t$Rd, $imm8", "", pattern> { let Inst{15-12} = cmode; let DecoderMethod = "DecodeModImmInstruction"; @@ -6438,8 +6787,36 @@ class BaseSIMDIndexedTied<bit Q, bit U, bit Scalar, bits<2> size, bits<4> opc, let Inst{4-0} = Rd; } -multiclass SIMDFPIndexedSD<bit U, bits<4> opc, string asm, - SDPatternOperator OpNode> { +multiclass SIMDFPIndexed<bit U, bits<4> opc, string asm, + SDPatternOperator OpNode> { + let Predicates = [HasNEON, HasFullFP16] in { + def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b00, opc, + V64, V64, + V128_lo, VectorIndexH, + asm, ".4h", ".4h", ".4h", ".h", + [(set (v4f16 V64:$Rd), + (OpNode (v4f16 V64:$Rn), + (v4f16 (AArch64duplane16 (v8f16 V128_lo:$Rm), VectorIndexH:$idx))))]> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + 
+ def v8i16_indexed : BaseSIMDIndexed<1, U, 0, 0b00, opc, + V128, V128, + V128_lo, VectorIndexH, + asm, ".8h", ".8h", ".8h", ".h", + [(set (v8f16 V128:$Rd), + (OpNode (v8f16 V128:$Rn), + (v8f16 (AArch64duplane16 (v8f16 V128_lo:$Rm), VectorIndexH:$idx))))]> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + } // Predicates = [HasNEON, HasFullFP16] + def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc, V64, V64, V128, VectorIndexS, @@ -6476,6 +6853,21 @@ multiclass SIMDFPIndexedSD<bit U, bits<4> opc, string asm, let Inst{21} = 0; } + let Predicates = [HasNEON, HasFullFP16] in { + def v1i16_indexed : BaseSIMDIndexed<1, U, 1, 0b00, opc, + FPR16Op, FPR16Op, V128_lo, VectorIndexH, + asm, ".h", "", "", ".h", + [(set (f16 FPR16Op:$Rd), + (OpNode (f16 FPR16Op:$Rn), + (f16 (vector_extract (v8f16 V128_lo:$Rm), + VectorIndexH:$idx))))]> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + } // Predicates = [HasNEON, HasFullFP16] + def v1i32_indexed : BaseSIMDIndexed<1, U, 1, 0b10, opc, FPR32Op, FPR32Op, V128, VectorIndexS, asm, ".s", "", "", ".s", @@ -6501,7 +6893,7 @@ multiclass SIMDFPIndexedSD<bit U, bits<4> opc, string asm, } } -multiclass SIMDFPIndexedSDTiedPatterns<string INST, SDPatternOperator OpNode> { +multiclass SIMDFPIndexedTiedPatterns<string INST, SDPatternOperator OpNode> { // 2 variants for the .2s version: DUPLANE from 128-bit and DUP scalar. def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn), (AArch64duplane32 (v4f32 V128:$Rm), @@ -6553,7 +6945,28 @@ multiclass SIMDFPIndexedSDTiedPatterns<string INST, SDPatternOperator OpNode> { V128:$Rm, VectorIndexD:$idx)>; } -multiclass SIMDFPIndexedSDTied<bit U, bits<4> opc, string asm> { +multiclass SIMDFPIndexedTied<bit U, bits<4> opc, string asm> { + let Predicates = [HasNEON, HasFullFP16] in { + def v4i16_indexed : BaseSIMDIndexedTied<0, U, 0, 0b00, opc, V64, V64, + V128_lo, VectorIndexH, + asm, ".4h", ".4h", ".4h", ".h", []> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + + def v8i16_indexed : BaseSIMDIndexedTied<1, U, 0, 0b00, opc, + V128, V128, + V128_lo, VectorIndexH, + asm, ".8h", ".8h", ".8h", ".h", []> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + } // Predicates = [HasNEON, HasFullFP16] + def v2i32_indexed : BaseSIMDIndexedTied<0, U, 0, 0b10, opc, V64, V64, V128, VectorIndexS, asm, ".2s", ".2s", ".2s", ".s", []> { @@ -6580,6 +6993,16 @@ multiclass SIMDFPIndexedSDTied<bit U, bits<4> opc, string asm> { let Inst{21} = 0; } + let Predicates = [HasNEON, HasFullFP16] in { + def v1i16_indexed : BaseSIMDIndexedTied<1, U, 1, 0b00, opc, + FPR16Op, FPR16Op, V128_lo, VectorIndexH, + asm, ".h", "", "", ".h", []> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + } // Predicates = [HasNEON, HasFullFP16] def v1i32_indexed : BaseSIMDIndexedTied<1, U, 1, 0b10, opc, FPR32Op, FPR32Op, V128, VectorIndexS, @@ -7117,7 +7540,13 @@ class BaseSIMDScalarShiftTied<bit U, bits<5> opc, bits<7> fixed_imm, } -multiclass SIMDScalarRShiftSD<bit U, bits<5> opc, string asm> { +multiclass SIMDFPScalarRShift<bit U, bits<5> opc, string asm> { + let Predicates = [HasNEON, HasFullFP16] in { + def h : BaseSIMDScalarShift<U, opc, {0,0,1,?,?,?,?}, + FPR16, FPR16, vecshiftR16, asm, []> { + let Inst{19-16} = imm{3-0}; + } + } // Predicates = [HasNEON, HasFullFP16] def s : BaseSIMDScalarShift<U, opc, {0,1,?,?,?,?,?}, FPR32, FPR32, 
vecshiftR32, asm, []> { let Inst{20-16} = imm{4-0}; @@ -7297,6 +7726,23 @@ class BaseSIMDVectorShiftTied<bit Q, bit U, bits<5> opc, bits<7> fixed_imm, multiclass SIMDVectorRShiftSD<bit U, bits<5> opc, string asm, Intrinsic OpNode> { + let Predicates = [HasNEON, HasFullFP16] in { + def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?}, + V64, V64, vecshiftR16, + asm, ".4h", ".4h", + [(set (v4i16 V64:$Rd), (OpNode (v4f16 V64:$Rn), (i32 imm:$imm)))]> { + bits<4> imm; + let Inst{19-16} = imm; + } + + def v8i16_shift : BaseSIMDVectorShift<1, U, opc, {0,0,1,?,?,?,?}, + V128, V128, vecshiftR16, + asm, ".8h", ".8h", + [(set (v8i16 V128:$Rd), (OpNode (v8f16 V128:$Rn), (i32 imm:$imm)))]> { + bits<4> imm; + let Inst{19-16} = imm; + } + } // Predicates = [HasNEON, HasFullFP16] def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?}, V64, V64, vecshiftR32, asm, ".2s", ".2s", @@ -7322,8 +7768,26 @@ multiclass SIMDVectorRShiftSD<bit U, bits<5> opc, string asm, } } -multiclass SIMDVectorRShiftSDToFP<bit U, bits<5> opc, string asm, +multiclass SIMDVectorRShiftToFP<bit U, bits<5> opc, string asm, Intrinsic OpNode> { + let Predicates = [HasNEON, HasFullFP16] in { + def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?}, + V64, V64, vecshiftR16, + asm, ".4h", ".4h", + [(set (v4f16 V64:$Rd), (OpNode (v4i16 V64:$Rn), (i32 imm:$imm)))]> { + bits<4> imm; + let Inst{19-16} = imm; + } + + def v8i16_shift : BaseSIMDVectorShift<1, U, opc, {0,0,1,?,?,?,?}, + V128, V128, vecshiftR16, + asm, ".8h", ".8h", + [(set (v8f16 V128:$Rd), (OpNode (v8i16 V128:$Rn), (i32 imm:$imm)))]> { + bits<4> imm; + let Inst{19-16} = imm; + } + } // Predicates = [HasNEON, HasFullFP16] + def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?}, V64, V64, vecshiftR32, asm, ".2s", ".2s", @@ -8604,9 +9068,8 @@ let Predicates = [HasNEON, HasV8_1a] in { class BaseSIMDThreeSameVectorTiedR0<bit Q, bit U, bits<2> size, bits<5> opcode, RegisterOperand regtype, string asm, string kind, list<dag> pattern> - : BaseSIMDThreeSameVectorTied<Q, U, size, opcode, regtype, asm, kind, + : BaseSIMDThreeSameVectorTied<Q, U, {size,0}, opcode, regtype, asm, kind, pattern> { - let Inst{21}=0; } multiclass SIMDThreeSameVectorSQRDMLxHTiedHS<bit U, bits<5> opc, string asm, SDPatternOperator Accum> { @@ -9041,6 +9504,7 @@ def : TokenAlias<".8H", ".8h">; def : TokenAlias<".4S", ".4s">; def : TokenAlias<".2D", ".2d">; def : TokenAlias<".1Q", ".1q">; +def : TokenAlias<".2H", ".2h">; def : TokenAlias<".B", ".b">; def : TokenAlias<".H", ".h">; def : TokenAlias<".S", ".s">; diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index c0b3f2c60916..3ef3c8b840cb 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -12,7 +12,6 @@ //===----------------------------------------------------------------------===// #include "AArch64InstrInfo.h" -#include "AArch64MachineCombinerPattern.h" #include "AArch64Subtarget.h" #include "MCTargetDesc/AArch64AddressingModes.h" #include "llvm/CodeGen/MachineFrameInfo.h" @@ -533,6 +532,14 @@ void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB, CC); } +/// Returns true if a MOVi32imm or MOVi64imm can be expanded to an ORRxx. 
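For illustration (not part of the patch): the helper that follows reduces to truncating the immediate to the operand width and then asking whether the truncated value is encodable as a logical immediate, which the patch delegates to AArch64_AM::processLogicalImmediate. Below is a minimal, standalone sketch of just the truncation step, in plain C++ with illustrative names rather than LLVM API:

    #include <cstdint>
    #include <cassert>

    // Keep only the low BitSize bits of Imm, mirroring the
    // "Imm << (64 - BitSize) >> (64 - BitSize)" idiom used in
    // canBeExpandedToORR (BitSize is 32 or 64, so the shift is well defined).
    static uint64_t truncateToWidth(uint64_t Imm, unsigned BitSize) {
      return Imm << (64 - BitSize) >> (64 - BitSize);
    }

    int main() {
      assert(truncateToWidth(0xFFFFFFFF000000FFULL, 32) == 0xFFULL);
      return 0;
    }
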
+static bool canBeExpandedToORR(const MachineInstr *MI, unsigned BitSize) { + uint64_t Imm = MI->getOperand(1).getImm(); + uint64_t UImm = Imm << (64 - BitSize) >> (64 - BitSize); + uint64_t Encoding; + return AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding); +} + // FIXME: this implementation should be micro-architecture dependent, so a // micro-architecture target hook should be introduced here in future. bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr *MI) const { @@ -573,6 +580,12 @@ bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr *MI) const { case AArch64::ORRWrr: case AArch64::ORRXrr: return true; + // If MOVi32imm or MOVi64imm can be expanded into ORRWri or + // ORRXri, it is as cheap as MOV + case AArch64::MOVi32imm: + return canBeExpandedToORR(MI, 32); + case AArch64::MOVi64imm: + return canBeExpandedToORR(MI, 64); } llvm_unreachable("Unknown opcode to check as cheap as a move!"); @@ -1379,42 +1392,34 @@ bool AArch64InstrInfo::getMemOpBaseRegImmOfsWidth( Width = 1; Scale = 1; break; + case AArch64::LDRQui: + case AArch64::STRQui: + Scale = Width = 16; + break; case AArch64::LDRXui: + case AArch64::LDRDui: case AArch64::STRXui: + case AArch64::STRDui: Scale = Width = 8; break; case AArch64::LDRWui: + case AArch64::LDRSui: case AArch64::STRWui: + case AArch64::STRSui: Scale = Width = 4; break; - case AArch64::LDRBui: - case AArch64::STRBui: - Scale = Width = 1; - break; case AArch64::LDRHui: + case AArch64::LDRHHui: case AArch64::STRHui: + case AArch64::STRHHui: Scale = Width = 2; break; - case AArch64::LDRSui: - case AArch64::STRSui: - Scale = Width = 4; - break; - case AArch64::LDRDui: - case AArch64::STRDui: - Scale = Width = 8; - break; - case AArch64::LDRQui: - case AArch64::STRQui: - Scale = Width = 16; - break; + case AArch64::LDRBui: case AArch64::LDRBBui: + case AArch64::STRBui: case AArch64::STRBBui: Scale = Width = 1; break; - case AArch64::LDRHHui: - case AArch64::STRHHui: - Scale = Width = 2; - break; }; BaseReg = LdSt->getOperand(1).getReg(); @@ -1445,23 +1450,43 @@ bool AArch64InstrInfo::shouldClusterLoads(MachineInstr *FirstLdSt, bool AArch64InstrInfo::shouldScheduleAdjacent(MachineInstr *First, MachineInstr *Second) const { - // Cyclone can fuse CMN, CMP followed by Bcc. - - // FIXME: B0 can also fuse: - // AND, BIC, ORN, ORR, or EOR (optional S) followed by Bcc or CBZ or CBNZ. - if (Second->getOpcode() != AArch64::Bcc) - return false; - switch (First->getOpcode()) { - default: - return false; - case AArch64::SUBSWri: - case AArch64::ADDSWri: - case AArch64::ANDSWri: - case AArch64::SUBSXri: - case AArch64::ADDSXri: - case AArch64::ANDSXri: - return true; + if (Subtarget.isCyclone()) { + // Cyclone can fuse CMN, CMP, TST followed by Bcc. + unsigned SecondOpcode = Second->getOpcode(); + if (SecondOpcode == AArch64::Bcc) { + switch (First->getOpcode()) { + default: + return false; + case AArch64::SUBSWri: + case AArch64::ADDSWri: + case AArch64::ANDSWri: + case AArch64::SUBSXri: + case AArch64::ADDSXri: + case AArch64::ANDSXri: + return true; + } + } + // Cyclone B0 also supports ALU operations followed by CBZ/CBNZ. 
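For illustration (not part of the patch), and only as a hedged sketch of the source-level shape involved: the pairs this hook tries to keep adjacent are a flag-setting ALU operation or compare immediately followed by the conditional branch that consumes it. Code such as the following commonly lowers to exactly that kind of pair:

    // Depending on the compiler, the test below is commonly emitted either as
    // TST (an ANDS writing only flags) followed by B.EQ/B.NE, or as AND
    // followed by CBZ; both are the kinds of adjacent pairs the surrounding
    // hook tries to schedule back to back.
    void callIfAligned(unsigned long addr, void (*fn)()) {
      if ((addr & 7u) == 0)
        fn();
    }
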
+ if (SecondOpcode == AArch64::CBNZW || SecondOpcode == AArch64::CBNZX || + SecondOpcode == AArch64::CBZW || SecondOpcode == AArch64::CBZX) { + switch (First->getOpcode()) { + default: + return false; + case AArch64::ADDWri: + case AArch64::ADDXri: + case AArch64::ANDWri: + case AArch64::ANDXri: + case AArch64::EORWri: + case AArch64::EORXri: + case AArch64::ORRWri: + case AArch64::ORRXri: + case AArch64::SUBWri: + case AArch64::SUBXri: + return true; + } + } } + return false; } MachineInstr *AArch64InstrInfo::emitFrameIndexDebugValue( @@ -1814,7 +1839,7 @@ void AArch64InstrInfo::storeRegToStackSlot( MachineFrameInfo &MFI = *MF.getFrameInfo(); unsigned Align = MFI.getObjectAlignment(FI); - MachinePointerInfo PtrInfo(PseudoSourceValue::getFixedStack(FI)); + MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI); MachineMemOperand *MMO = MF.getMachineMemOperand( PtrInfo, MachineMemOperand::MOStore, MFI.getObjectSize(FI), Align); unsigned Opc = 0; @@ -1911,7 +1936,7 @@ void AArch64InstrInfo::loadRegFromStackSlot( MachineFunction &MF = *MBB.getParent(); MachineFrameInfo &MFI = *MF.getFrameInfo(); unsigned Align = MFI.getObjectAlignment(FI); - MachinePointerInfo PtrInfo(PseudoSourceValue::getFixedStack(FI)); + MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI); MachineMemOperand *MMO = MF.getMachineMemOperand( PtrInfo, MachineMemOperand::MOLoad, MFI.getObjectSize(FI), Align); @@ -2226,11 +2251,19 @@ int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI, int &Offset, case AArch64::LDPDi: case AArch64::STPXi: case AArch64::STPDi: + case AArch64::LDNPXi: + case AArch64::LDNPDi: + case AArch64::STNPXi: + case AArch64::STNPDi: + ImmIdx = 3; IsSigned = true; Scale = 8; break; case AArch64::LDPQi: case AArch64::STPQi: + case AArch64::LDNPQi: + case AArch64::STNPQi: + ImmIdx = 3; IsSigned = true; Scale = 16; break; @@ -2238,6 +2271,11 @@ int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI, int &Offset, case AArch64::LDPSi: case AArch64::STPWi: case AArch64::STPSi: + case AArch64::LDNPWi: + case AArch64::LDNPSi: + case AArch64::STNPWi: + case AArch64::STNPSi: + ImmIdx = 3; IsSigned = true; Scale = 4; break; @@ -2457,7 +2495,7 @@ static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO, bool AArch64InstrInfo::getMachineCombinerPatterns( MachineInstr &Root, - SmallVectorImpl<MachineCombinerPattern::MC_PATTERN> &Patterns) const { + SmallVectorImpl<MachineCombinerPattern> &Patterns) const { unsigned Opc = Root.getOpcode(); MachineBasicBlock &MBB = *Root.getParent(); bool Found = false; @@ -2485,76 +2523,76 @@ bool AArch64InstrInfo::getMachineCombinerPatterns( "ADDWrr does not have register operands"); if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr, AArch64::WZR)) { - Patterns.push_back(MachineCombinerPattern::MC_MULADDW_OP1); + Patterns.push_back(MachineCombinerPattern::MULADDW_OP1); Found = true; } if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDWrrr, AArch64::WZR)) { - Patterns.push_back(MachineCombinerPattern::MC_MULADDW_OP2); + Patterns.push_back(MachineCombinerPattern::MULADDW_OP2); Found = true; } break; case AArch64::ADDXrr: if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr, AArch64::XZR)) { - Patterns.push_back(MachineCombinerPattern::MC_MULADDX_OP1); + Patterns.push_back(MachineCombinerPattern::MULADDX_OP1); Found = true; } if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDXrrr, AArch64::XZR)) { - Patterns.push_back(MachineCombinerPattern::MC_MULADDX_OP2); + 
Patterns.push_back(MachineCombinerPattern::MULADDX_OP2); Found = true; } break; case AArch64::SUBWrr: if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr, AArch64::WZR)) { - Patterns.push_back(MachineCombinerPattern::MC_MULSUBW_OP1); + Patterns.push_back(MachineCombinerPattern::MULSUBW_OP1); Found = true; } if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDWrrr, AArch64::WZR)) { - Patterns.push_back(MachineCombinerPattern::MC_MULSUBW_OP2); + Patterns.push_back(MachineCombinerPattern::MULSUBW_OP2); Found = true; } break; case AArch64::SUBXrr: if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr, AArch64::XZR)) { - Patterns.push_back(MachineCombinerPattern::MC_MULSUBX_OP1); + Patterns.push_back(MachineCombinerPattern::MULSUBX_OP1); Found = true; } if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDXrrr, AArch64::XZR)) { - Patterns.push_back(MachineCombinerPattern::MC_MULSUBX_OP2); + Patterns.push_back(MachineCombinerPattern::MULSUBX_OP2); Found = true; } break; case AArch64::ADDWri: if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr, AArch64::WZR)) { - Patterns.push_back(MachineCombinerPattern::MC_MULADDWI_OP1); + Patterns.push_back(MachineCombinerPattern::MULADDWI_OP1); Found = true; } break; case AArch64::ADDXri: if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr, AArch64::XZR)) { - Patterns.push_back(MachineCombinerPattern::MC_MULADDXI_OP1); + Patterns.push_back(MachineCombinerPattern::MULADDXI_OP1); Found = true; } break; case AArch64::SUBWri: if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr, AArch64::WZR)) { - Patterns.push_back(MachineCombinerPattern::MC_MULSUBWI_OP1); + Patterns.push_back(MachineCombinerPattern::MULSUBWI_OP1); Found = true; } break; case AArch64::SUBXri: if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr, AArch64::XZR)) { - Patterns.push_back(MachineCombinerPattern::MC_MULSUBXI_OP1); + Patterns.push_back(MachineCombinerPattern::MULSUBXI_OP1); Found = true; } break; @@ -2661,7 +2699,7 @@ static MachineInstr *genMaddR(MachineFunction &MF, MachineRegisterInfo &MRI, /// this function generates the instructions that could replace the /// original code sequence void AArch64InstrInfo::genAlternativeCodeSequence( - MachineInstr &Root, MachineCombinerPattern::MC_PATTERN Pattern, + MachineInstr &Root, MachineCombinerPattern Pattern, SmallVectorImpl<MachineInstr *> &InsInstrs, SmallVectorImpl<MachineInstr *> &DelInstrs, DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const { @@ -2677,13 +2715,13 @@ void AArch64InstrInfo::genAlternativeCodeSequence( default: // signal error. 
break; - case MachineCombinerPattern::MC_MULADDW_OP1: - case MachineCombinerPattern::MC_MULADDX_OP1: + case MachineCombinerPattern::MULADDW_OP1: + case MachineCombinerPattern::MULADDX_OP1: // MUL I=A,B,0 // ADD R,I,C // ==> MADD R,A,B,C // --- Create(MADD); - if (Pattern == MachineCombinerPattern::MC_MULADDW_OP1) { + if (Pattern == MachineCombinerPattern::MULADDW_OP1) { Opc = AArch64::MADDWrrr; RC = &AArch64::GPR32RegClass; } else { @@ -2692,13 +2730,13 @@ void AArch64InstrInfo::genAlternativeCodeSequence( } MUL = genMadd(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); break; - case MachineCombinerPattern::MC_MULADDW_OP2: - case MachineCombinerPattern::MC_MULADDX_OP2: + case MachineCombinerPattern::MULADDW_OP2: + case MachineCombinerPattern::MULADDX_OP2: // MUL I=A,B,0 // ADD R,C,I // ==> MADD R,A,B,C // --- Create(MADD); - if (Pattern == MachineCombinerPattern::MC_MULADDW_OP2) { + if (Pattern == MachineCombinerPattern::MULADDW_OP2) { Opc = AArch64::MADDWrrr; RC = &AArch64::GPR32RegClass; } else { @@ -2707,8 +2745,8 @@ void AArch64InstrInfo::genAlternativeCodeSequence( } MUL = genMadd(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); break; - case MachineCombinerPattern::MC_MULADDWI_OP1: - case MachineCombinerPattern::MC_MULADDXI_OP1: { + case MachineCombinerPattern::MULADDWI_OP1: + case MachineCombinerPattern::MULADDXI_OP1: { // MUL I=A,B,0 // ADD R,I,Imm // ==> ORR V, ZR, Imm @@ -2716,7 +2754,7 @@ void AArch64InstrInfo::genAlternativeCodeSequence( // --- Create(MADD); const TargetRegisterClass *OrrRC; unsigned BitSize, OrrOpc, ZeroReg; - if (Pattern == MachineCombinerPattern::MC_MULADDWI_OP1) { + if (Pattern == MachineCombinerPattern::MULADDWI_OP1) { OrrOpc = AArch64::ORRWri; OrrRC = &AArch64::GPR32spRegClass; BitSize = 32; @@ -2751,8 +2789,8 @@ void AArch64InstrInfo::genAlternativeCodeSequence( } break; } - case MachineCombinerPattern::MC_MULSUBW_OP1: - case MachineCombinerPattern::MC_MULSUBX_OP1: { + case MachineCombinerPattern::MULSUBW_OP1: + case MachineCombinerPattern::MULSUBX_OP1: { // MUL I=A,B,0 // SUB R,I, C // ==> SUB V, 0, C @@ -2760,7 +2798,7 @@ void AArch64InstrInfo::genAlternativeCodeSequence( // --- Create(MADD); const TargetRegisterClass *SubRC; unsigned SubOpc, ZeroReg; - if (Pattern == MachineCombinerPattern::MC_MULSUBW_OP1) { + if (Pattern == MachineCombinerPattern::MULSUBW_OP1) { SubOpc = AArch64::SUBWrr; SubRC = &AArch64::GPR32spRegClass; ZeroReg = AArch64::WZR; @@ -2784,13 +2822,13 @@ void AArch64InstrInfo::genAlternativeCodeSequence( MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC); break; } - case MachineCombinerPattern::MC_MULSUBW_OP2: - case MachineCombinerPattern::MC_MULSUBX_OP2: + case MachineCombinerPattern::MULSUBW_OP2: + case MachineCombinerPattern::MULSUBX_OP2: // MUL I=A,B,0 // SUB R,C,I // ==> MSUB R,A,B,C (computes C - A*B) // --- Create(MSUB); - if (Pattern == MachineCombinerPattern::MC_MULSUBW_OP2) { + if (Pattern == MachineCombinerPattern::MULSUBW_OP2) { Opc = AArch64::MSUBWrrr; RC = &AArch64::GPR32RegClass; } else { @@ -2799,8 +2837,8 @@ void AArch64InstrInfo::genAlternativeCodeSequence( } MUL = genMadd(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); break; - case MachineCombinerPattern::MC_MULSUBWI_OP1: - case MachineCombinerPattern::MC_MULSUBXI_OP1: { + case MachineCombinerPattern::MULSUBWI_OP1: + case MachineCombinerPattern::MULSUBXI_OP1: { // MUL I=A,B,0 // SUB R,I, Imm // ==> ORR V, ZR, -Imm @@ -2808,7 +2846,7 @@ void AArch64InstrInfo::genAlternativeCodeSequence( // --- Create(MADD); const TargetRegisterClass *OrrRC; unsigned BitSize, OrrOpc, 
ZeroReg; - if (Pattern == MachineCombinerPattern::MC_MULSUBWI_OP1) { + if (Pattern == MachineCombinerPattern::MULSUBWI_OP1) { OrrOpc = AArch64::ORRWri; OrrRC = &AArch64::GPR32spRegClass; BitSize = 32; @@ -2944,3 +2982,34 @@ bool AArch64InstrInfo::optimizeCondBranch(MachineInstr *MI) const { MI->eraseFromParent(); return true; } + +std::pair<unsigned, unsigned> +AArch64InstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const { + const unsigned Mask = AArch64II::MO_FRAGMENT; + return std::make_pair(TF & Mask, TF & ~Mask); +} + +ArrayRef<std::pair<unsigned, const char *>> +AArch64InstrInfo::getSerializableDirectMachineOperandTargetFlags() const { + using namespace AArch64II; + static const std::pair<unsigned, const char *> TargetFlags[] = { + {MO_PAGE, "aarch64-page"}, + {MO_PAGEOFF, "aarch64-pageoff"}, + {MO_G3, "aarch64-g3"}, + {MO_G2, "aarch64-g2"}, + {MO_G1, "aarch64-g1"}, + {MO_G0, "aarch64-g0"}, + {MO_HI12, "aarch64-hi12"}}; + return makeArrayRef(TargetFlags); +} + +ArrayRef<std::pair<unsigned, const char *>> +AArch64InstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const { + using namespace AArch64II; + static const std::pair<unsigned, const char *> TargetFlags[] = { + {MO_GOT, "aarch64-got"}, + {MO_NC, "aarch64-nc"}, + {MO_TLS, "aarch64-tls"}, + {MO_CONSTPOOL, "aarch64-constant-pool"}}; + return makeArrayRef(TargetFlags); +} diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.h index 68c2a2882580..ae02822a32e6 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.h +++ b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.h @@ -167,13 +167,13 @@ public: /// for an instruction chain ending in <Root>. All potential patterns are /// listed in the <Patterns> array. 
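For illustration (not part of the patch): the MULADD*/MULSUB* patterns named above describe rewriting a multiply that feeds an add or subtract into a single multiply-accumulate. A hedged sketch of source code that exposes such a chain:

    #include <cstdint>

    // A multiply feeding an add; on AArch64 this is typically selected as a
    // single MADD (Xd = Xa + Xn * Xm) rather than a separate MUL and ADD,
    // which is what the MULADD* combiner patterns describe.
    int64_t muladd(int64_t a, int64_t b, int64_t c) {
      return a * b + c;
    }
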
bool getMachineCombinerPatterns(MachineInstr &Root, - SmallVectorImpl<MachineCombinerPattern::MC_PATTERN> &Patterns) + SmallVectorImpl<MachineCombinerPattern> &Patterns) const override; /// When getMachineCombinerPatterns() finds patterns, this function generates /// the instructions that could replace the original code sequence void genAlternativeCodeSequence( - MachineInstr &Root, MachineCombinerPattern::MC_PATTERN Pattern, + MachineInstr &Root, MachineCombinerPattern Pattern, SmallVectorImpl<MachineInstr *> &InsInstrs, SmallVectorImpl<MachineInstr *> &DelInstrs, DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const override; @@ -181,6 +181,14 @@ public: bool useMachineCombiner() const override; bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override; + + std::pair<unsigned, unsigned> + decomposeMachineOperandsTargetFlags(unsigned TF) const override; + ArrayRef<std::pair<unsigned, const char *>> + getSerializableDirectMachineOperandTargetFlags() const override; + ArrayRef<std::pair<unsigned, const char *>> + getSerializableBitmaskMachineOperandTargetFlags() const override; + private: void instantiateCondBranch(MachineBasicBlock &MBB, DebugLoc DL, MachineBasicBlock *TBB, diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.td index fa1a46acba84..d02bc9ff394d 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -16,6 +16,8 @@ // def HasV8_1a : Predicate<"Subtarget->hasV8_1aOps()">, AssemblerPredicate<"HasV8_1aOps", "armv8.1a">; +def HasV8_2a : Predicate<"Subtarget->hasV8_2aOps()">, + AssemblerPredicate<"HasV8_2aOps", "armv8.2a">; def HasFPARMv8 : Predicate<"Subtarget->hasFPARMv8()">, AssemblerPredicate<"FeatureFPARMv8", "fp-armv8">; def HasNEON : Predicate<"Subtarget->hasNEON()">, @@ -24,6 +26,12 @@ def HasCrypto : Predicate<"Subtarget->hasCrypto()">, AssemblerPredicate<"FeatureCrypto", "crypto">; def HasCRC : Predicate<"Subtarget->hasCRC()">, AssemblerPredicate<"FeatureCRC", "crc">; +def HasPerfMon : Predicate<"Subtarget->hasPerfMon()">; +def HasFullFP16 : Predicate<"Subtarget->hasFullFP16()">, + AssemblerPredicate<"FeatureFullFP16", "fullfp16">; +def HasSPE : Predicate<"Subtarget->hasSPE()">, + AssemblerPredicate<"FeatureSPE", "spe">; + def IsLE : Predicate<"Subtarget->isLittleEndian()">; def IsBE : Predicate<"!Subtarget->isLittleEndian()">; def IsCyclone : Predicate<"Subtarget->isCyclone()">; @@ -66,6 +74,20 @@ def SDT_AArch64CSel : SDTypeProfile<1, 4, SDTCisSameAs<0, 2>, SDTCisInt<3>, SDTCisVT<4, i32>]>; +def SDT_AArch64CCMP : SDTypeProfile<1, 5, + [SDTCisVT<0, i32>, + SDTCisInt<1>, + SDTCisSameAs<1, 2>, + SDTCisInt<3>, + SDTCisInt<4>, + SDTCisVT<5, i32>]>; +def SDT_AArch64FCCMP : SDTypeProfile<1, 5, + [SDTCisVT<0, i32>, + SDTCisFP<1>, + SDTCisSameAs<1, 2>, + SDTCisInt<3>, + SDTCisInt<4>, + SDTCisVT<5, i32>]>; def SDT_AArch64FCmp : SDTypeProfile<0, 2, [SDTCisFP<0>, SDTCisSameAs<0, 1>]>; @@ -160,13 +182,14 @@ def AArch64and_flag : SDNode<"AArch64ISD::ANDS", SDTBinaryArithWithFlagsOut, def AArch64adc_flag : SDNode<"AArch64ISD::ADCS", SDTBinaryArithWithFlagsInOut>; def AArch64sbc_flag : SDNode<"AArch64ISD::SBCS", SDTBinaryArithWithFlagsInOut>; +def AArch64ccmp : SDNode<"AArch64ISD::CCMP", SDT_AArch64CCMP>; +def AArch64ccmn : SDNode<"AArch64ISD::CCMN", SDT_AArch64CCMP>; +def AArch64fccmp : SDNode<"AArch64ISD::FCCMP", SDT_AArch64FCCMP>; + def AArch64threadpointer : SDNode<"AArch64ISD::THREAD_POINTER", SDTPtrLeaf>; def AArch64fcmp : 
SDNode<"AArch64ISD::FCMP", SDT_AArch64FCmp>; -def AArch64fmax : SDNode<"AArch64ISD::FMAX", SDTFPBinOp>; -def AArch64fmin : SDNode<"AArch64ISD::FMIN", SDTFPBinOp>; - def AArch64dup : SDNode<"AArch64ISD::DUP", SDT_AArch64Dup>; def AArch64duplane8 : SDNode<"AArch64ISD::DUPLANE8", SDT_AArch64DupLane>; def AArch64duplane16 : SDNode<"AArch64ISD::DUPLANE16", SDT_AArch64DupLane>; @@ -361,6 +384,9 @@ def : InstAlias<"wfi", (HINT 0b011)>; def : InstAlias<"sev", (HINT 0b100)>; def : InstAlias<"sevl", (HINT 0b101)>; +// v8.2a Statistical Profiling extension +def : InstAlias<"psb $op", (HINT psbhint_op:$op)>, Requires<[HasSPE]>; + // As far as LLVM is concerned this writes to the system's exclusive monitors. let mayLoad = 1, mayStore = 1 in def CLREX : CRmSystemI<imm0_15, 0b010, "clrex">; @@ -383,12 +409,17 @@ def : InstAlias<"isb", (ISB 0xf)>; def MRS : MRSI; def MSR : MSRI; -def MSRpstate: MSRpstateI; +def MSRpstateImm1 : MSRpstateImm0_1; +def MSRpstateImm4 : MSRpstateImm0_15; // The thread pointer (on Linux, at least, where this has been implemented) is // TPIDR_EL0. def : Pat<(AArch64threadpointer), (MRS 0xde82)>; +// The cycle counter PMC register is PMCCNTR_EL0. +let Predicates = [HasPerfMon] in +def : Pat<(readcyclecounter), (MRS 0xdce8)>; + // Generic system instructions def SYSxt : SystemXtI<0, "sys">; def SYSLxt : SystemLXtI<1, "sysl">; @@ -595,10 +626,12 @@ def : Pat<(sub GPR32:$Rn, arith_shifted_reg32:$Rm), (SUBSWrs GPR32:$Rn, arith_shifted_reg32:$Rm)>; def : Pat<(sub GPR64:$Rn, arith_shifted_reg64:$Rm), (SUBSXrs GPR64:$Rn, arith_shifted_reg64:$Rm)>; +let AddedComplexity = 1 in { def : Pat<(sub GPR32sp:$R2, arith_extended_reg32<i32>:$R3), (SUBSWrx GPR32sp:$R2, arith_extended_reg32<i32>:$R3)>; def : Pat<(sub GPR64sp:$R2, arith_extended_reg32to64<i64>:$R3), (SUBSXrx GPR64sp:$R2, arith_extended_reg32to64<i64>:$R3)>; +} // Because of the immediate format for add/sub-imm instructions, the // expression (add x, -1) must be transformed to (SUB{W,X}ri x, 1). @@ -823,7 +856,7 @@ defm AND : LogicalReg<0b00, 0, "and", and>; defm BIC : LogicalReg<0b00, 1, "bic", BinOpFrag<(and node:$LHS, (not node:$RHS))>>; defm EON : LogicalReg<0b10, 1, "eon", - BinOpFrag<(xor node:$LHS, (not node:$RHS))>>; + BinOpFrag<(not (xor node:$LHS, node:$RHS))>>; defm EOR : LogicalReg<0b10, 0, "eor", xor>; defm ORN : LogicalReg<0b01, 1, "orn", BinOpFrag<(or node:$LHS, (not node:$RHS))>>; @@ -1020,13 +1053,10 @@ def : InstAlias<"uxth $dst, $src", (UBFMXri GPR64:$dst, GPR64:$src, 0, 15)>; def : InstAlias<"uxtw $dst, $src", (UBFMXri GPR64:$dst, GPR64:$src, 0, 31)>; //===----------------------------------------------------------------------===// -// Conditionally set flags instructions. +// Conditional comparison instructions. //===----------------------------------------------------------------------===// -defm CCMN : CondSetFlagsImm<0, "ccmn">; -defm CCMP : CondSetFlagsImm<1, "ccmp">; - -defm CCMN : CondSetFlagsReg<0, "ccmn">; -defm CCMP : CondSetFlagsReg<1, "ccmp">; +defm CCMN : CondComparison<0, "ccmn", AArch64ccmn>; +defm CCMP : CondComparison<1, "ccmp", AArch64ccmp>; //===----------------------------------------------------------------------===// // Conditional select instructions. 
@@ -2421,6 +2451,26 @@ defm FCVTZS_Int : FPToIntegerScaled<0b11, 0b000, "fcvtzs", int_aarch64_neon_fcvt defm FCVTZU_Int : FPToIntegerScaled<0b11, 0b001, "fcvtzu", int_aarch64_neon_fcvtzu>; } +multiclass FPToIntegerPats<SDNode to_int, SDNode round, string INST> { + def : Pat<(i32 (to_int (round f32:$Rn))), + (!cast<Instruction>(INST # UWSr) f32:$Rn)>; + def : Pat<(i64 (to_int (round f32:$Rn))), + (!cast<Instruction>(INST # UXSr) f32:$Rn)>; + def : Pat<(i32 (to_int (round f64:$Rn))), + (!cast<Instruction>(INST # UWDr) f64:$Rn)>; + def : Pat<(i64 (to_int (round f64:$Rn))), + (!cast<Instruction>(INST # UXDr) f64:$Rn)>; +} + +defm : FPToIntegerPats<fp_to_sint, fceil, "FCVTPS">; +defm : FPToIntegerPats<fp_to_uint, fceil, "FCVTPU">; +defm : FPToIntegerPats<fp_to_sint, ffloor, "FCVTMS">; +defm : FPToIntegerPats<fp_to_uint, ffloor, "FCVTMU">; +defm : FPToIntegerPats<fp_to_sint, ftrunc, "FCVTZS">; +defm : FPToIntegerPats<fp_to_uint, ftrunc, "FCVTZU">; +defm : FPToIntegerPats<fp_to_sint, frnd, "FCVTAS">; +defm : FPToIntegerPats<fp_to_uint, frnd, "FCVTAU">; + //===----------------------------------------------------------------------===// // Scaled integer to floating point conversion instructions. //===----------------------------------------------------------------------===// @@ -2466,14 +2516,7 @@ defm FRINTP : SingleOperandFPData<0b1001, "frintp", fceil>; def : Pat<(v1f64 (int_aarch64_neon_frintn (v1f64 FPR64:$Rn))), (FRINTNDr FPR64:$Rn)>; -// FRINTX is inserted to set the flags as required by FENV_ACCESS ON behavior -// in the C spec. Setting hasSideEffects ensures it is not DCE'd. -// <rdar://problem/13715968> -// TODO: We should really model the FPSR flags correctly. This is really ugly. -let hasSideEffects = 1 in { defm FRINTX : SingleOperandFPData<0b1110, "frintx", frint>; -} - defm FRINTZ : SingleOperandFPData<0b1011, "frintz", ftrunc>; let SchedRW = [WriteFDiv] in { @@ -2488,23 +2531,23 @@ defm FADD : TwoOperandFPData<0b0010, "fadd", fadd>; let SchedRW = [WriteFDiv] in { defm FDIV : TwoOperandFPData<0b0001, "fdiv", fdiv>; } -defm FMAXNM : TwoOperandFPData<0b0110, "fmaxnm", int_aarch64_neon_fmaxnm>; -defm FMAX : TwoOperandFPData<0b0100, "fmax", AArch64fmax>; -defm FMINNM : TwoOperandFPData<0b0111, "fminnm", int_aarch64_neon_fminnm>; -defm FMIN : TwoOperandFPData<0b0101, "fmin", AArch64fmin>; +defm FMAXNM : TwoOperandFPData<0b0110, "fmaxnm", fmaxnum>; +defm FMAX : TwoOperandFPData<0b0100, "fmax", fmaxnan>; +defm FMINNM : TwoOperandFPData<0b0111, "fminnm", fminnum>; +defm FMIN : TwoOperandFPData<0b0101, "fmin", fminnan>; let SchedRW = [WriteFMul] in { defm FMUL : TwoOperandFPData<0b0000, "fmul", fmul>; defm FNMUL : TwoOperandFPDataNeg<0b1000, "fnmul", fmul>; } defm FSUB : TwoOperandFPData<0b0011, "fsub", fsub>; -def : Pat<(v1f64 (AArch64fmax (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), +def : Pat<(v1f64 (fmaxnan (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), (FMAXDrr FPR64:$Rn, FPR64:$Rm)>; -def : Pat<(v1f64 (AArch64fmin (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), +def : Pat<(v1f64 (fminnan (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), (FMINDrr FPR64:$Rn, FPR64:$Rm)>; -def : Pat<(v1f64 (int_aarch64_neon_fmaxnm (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), +def : Pat<(v1f64 (fmaxnum (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), (FMAXNMDrr FPR64:$Rn, FPR64:$Rm)>; -def : Pat<(v1f64 (int_aarch64_neon_fminnm (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), +def : Pat<(v1f64 (fminnum (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), (FMINNMDrr FPR64:$Rn, FPR64:$Rm)>; //===----------------------------------------------------------------------===// @@ 
-2556,7 +2599,7 @@ defm FCMP : FPComparison<0, "fcmp", AArch64fcmp>; //===----------------------------------------------------------------------===// defm FCCMPE : FPCondComparison<1, "fccmpe">; -defm FCCMP : FPCondComparison<0, "fccmp">; +defm FCCMP : FPCondComparison<0, "fccmp", AArch64fccmp>; //===----------------------------------------------------------------------===// // Floating point conditional select instruction. @@ -2589,6 +2632,40 @@ defm FMOV : FPMoveImmediate<"fmov">; // Advanced SIMD two vector instructions. //===----------------------------------------------------------------------===// +defm UABDL : SIMDLongThreeVectorBHSabdl<1, 0b0111, "uabdl", + int_aarch64_neon_uabd>; +// Match UABDL in log2-shuffle patterns. +def : Pat<(xor (v8i16 (AArch64vashr v8i16:$src, (i32 15))), + (v8i16 (add (sub (zext (v8i8 V64:$opA)), + (zext (v8i8 V64:$opB))), + (AArch64vashr v8i16:$src, (i32 15))))), + (UABDLv8i8_v8i16 V64:$opA, V64:$opB)>; +def : Pat<(xor (v8i16 (AArch64vashr v8i16:$src, (i32 15))), + (v8i16 (add (sub (zext (extract_high_v16i8 V128:$opA)), + (zext (extract_high_v16i8 V128:$opB))), + (AArch64vashr v8i16:$src, (i32 15))))), + (UABDLv16i8_v8i16 V128:$opA, V128:$opB)>; +def : Pat<(xor (v4i32 (AArch64vashr v4i32:$src, (i32 31))), + (v4i32 (add (sub (zext (v4i16 V64:$opA)), + (zext (v4i16 V64:$opB))), + (AArch64vashr v4i32:$src, (i32 31))))), + (UABDLv4i16_v4i32 V64:$opA, V64:$opB)>; +def : Pat<(xor (v4i32 (AArch64vashr v4i32:$src, (i32 31))), + (v4i32 (add (sub (zext (extract_high_v8i16 V128:$opA)), + (zext (extract_high_v8i16 V128:$opB))), + (AArch64vashr v4i32:$src, (i32 31))))), + (UABDLv8i16_v4i32 V128:$opA, V128:$opB)>; +def : Pat<(xor (v2i64 (AArch64vashr v2i64:$src, (i32 63))), + (v2i64 (add (sub (zext (v2i32 V64:$opA)), + (zext (v2i32 V64:$opB))), + (AArch64vashr v2i64:$src, (i32 63))))), + (UABDLv2i32_v2i64 V64:$opA, V64:$opB)>; +def : Pat<(xor (v2i64 (AArch64vashr v2i64:$src, (i32 63))), + (v2i64 (add (sub (zext (extract_high_v4i32 V128:$opA)), + (zext (extract_high_v4i32 V128:$opB))), + (AArch64vashr v2i64:$src, (i32 63))))), + (UABDLv4i32_v2i64 V128:$opA, V128:$opB)>; + defm ABS : SIMDTwoVectorBHSD<0, 0b01011, "abs", int_aarch64_neon_abs>; def : Pat<(xor (v8i8 (AArch64vashr V64:$src, (i32 7))), (v8i8 (add V64:$src, (AArch64vashr V64:$src, (i32 7))))), @@ -2780,29 +2857,29 @@ defm CMGT : SIMDThreeSameVector<0, 0b00110, "cmgt", AArch64cmgt>; defm CMHI : SIMDThreeSameVector<1, 0b00110, "cmhi", AArch64cmhi>; defm CMHS : SIMDThreeSameVector<1, 0b00111, "cmhs", AArch64cmhs>; defm CMTST : SIMDThreeSameVector<0, 0b10001, "cmtst", AArch64cmtst>; -defm FABD : SIMDThreeSameVectorFP<1,1,0b11010,"fabd", int_aarch64_neon_fabd>; -defm FACGE : SIMDThreeSameVectorFPCmp<1,0,0b11101,"facge",int_aarch64_neon_facge>; -defm FACGT : SIMDThreeSameVectorFPCmp<1,1,0b11101,"facgt",int_aarch64_neon_facgt>; -defm FADDP : SIMDThreeSameVectorFP<1,0,0b11010,"faddp",int_aarch64_neon_addp>; -defm FADD : SIMDThreeSameVectorFP<0,0,0b11010,"fadd", fadd>; -defm FCMEQ : SIMDThreeSameVectorFPCmp<0, 0, 0b11100, "fcmeq", AArch64fcmeq>; -defm FCMGE : SIMDThreeSameVectorFPCmp<1, 0, 0b11100, "fcmge", AArch64fcmge>; -defm FCMGT : SIMDThreeSameVectorFPCmp<1, 1, 0b11100, "fcmgt", AArch64fcmgt>; -defm FDIV : SIMDThreeSameVectorFP<1,0,0b11111,"fdiv", fdiv>; -defm FMAXNMP : SIMDThreeSameVectorFP<1,0,0b11000,"fmaxnmp", int_aarch64_neon_fmaxnmp>; -defm FMAXNM : SIMDThreeSameVectorFP<0,0,0b11000,"fmaxnm", int_aarch64_neon_fmaxnm>; -defm FMAXP : SIMDThreeSameVectorFP<1,0,0b11110,"fmaxp", int_aarch64_neon_fmaxp>; -defm 
FMAX : SIMDThreeSameVectorFP<0,0,0b11110,"fmax", AArch64fmax>; -defm FMINNMP : SIMDThreeSameVectorFP<1,1,0b11000,"fminnmp", int_aarch64_neon_fminnmp>; -defm FMINNM : SIMDThreeSameVectorFP<0,1,0b11000,"fminnm", int_aarch64_neon_fminnm>; -defm FMINP : SIMDThreeSameVectorFP<1,1,0b11110,"fminp", int_aarch64_neon_fminp>; -defm FMIN : SIMDThreeSameVectorFP<0,1,0b11110,"fmin", AArch64fmin>; +defm FABD : SIMDThreeSameVectorFP<1,1,0b010,"fabd", int_aarch64_neon_fabd>; +defm FACGE : SIMDThreeSameVectorFPCmp<1,0,0b101,"facge",int_aarch64_neon_facge>; +defm FACGT : SIMDThreeSameVectorFPCmp<1,1,0b101,"facgt",int_aarch64_neon_facgt>; +defm FADDP : SIMDThreeSameVectorFP<1,0,0b010,"faddp",int_aarch64_neon_addp>; +defm FADD : SIMDThreeSameVectorFP<0,0,0b010,"fadd", fadd>; +defm FCMEQ : SIMDThreeSameVectorFPCmp<0, 0, 0b100, "fcmeq", AArch64fcmeq>; +defm FCMGE : SIMDThreeSameVectorFPCmp<1, 0, 0b100, "fcmge", AArch64fcmge>; +defm FCMGT : SIMDThreeSameVectorFPCmp<1, 1, 0b100, "fcmgt", AArch64fcmgt>; +defm FDIV : SIMDThreeSameVectorFP<1,0,0b111,"fdiv", fdiv>; +defm FMAXNMP : SIMDThreeSameVectorFP<1,0,0b000,"fmaxnmp", int_aarch64_neon_fmaxnmp>; +defm FMAXNM : SIMDThreeSameVectorFP<0,0,0b000,"fmaxnm", fmaxnum>; +defm FMAXP : SIMDThreeSameVectorFP<1,0,0b110,"fmaxp", int_aarch64_neon_fmaxp>; +defm FMAX : SIMDThreeSameVectorFP<0,0,0b110,"fmax", fmaxnan>; +defm FMINNMP : SIMDThreeSameVectorFP<1,1,0b000,"fminnmp", int_aarch64_neon_fminnmp>; +defm FMINNM : SIMDThreeSameVectorFP<0,1,0b000,"fminnm", fminnum>; +defm FMINP : SIMDThreeSameVectorFP<1,1,0b110,"fminp", int_aarch64_neon_fminp>; +defm FMIN : SIMDThreeSameVectorFP<0,1,0b110,"fmin", fminnan>; // NOTE: The operands of the PatFrag are reordered on FMLA/FMLS because the // instruction expects the addend first, while the fma intrinsic puts it last. -defm FMLA : SIMDThreeSameVectorFPTied<0, 0, 0b11001, "fmla", +defm FMLA : SIMDThreeSameVectorFPTied<0, 0, 0b001, "fmla", TriOpFrag<(fma node:$RHS, node:$MHS, node:$LHS)> >; -defm FMLS : SIMDThreeSameVectorFPTied<0, 1, 0b11001, "fmls", +defm FMLS : SIMDThreeSameVectorFPTied<0, 1, 0b001, "fmls", TriOpFrag<(fma node:$MHS, (fneg node:$RHS), node:$LHS)> >; // The following def pats catch the case where the LHS of an FMA is negated. 
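For illustration (not part of the patch): the FMLS fragment above relies on the identity d - n*m = fma(-n, m, d), which is also what the negated-LHS patterns in the next hunk match. A minimal, standalone C++ check of that identity, with illustrative names:

    #include <cmath>
    #include <cassert>

    // FMLS computes the accumulator minus the product, Rd - Rn*Rm; written
    // through the fma intrinsic that is fma(-n, m, d), matching the
    // TriOpFrag used for FMLS above.
    float fmls_ref(float d, float n, float m) {
      return std::fma(-n, m, d);
    }

    int main() {
      assert(fmls_ref(10.0f, 2.0f, 3.0f) == 4.0f);
      return 0;
    }
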
@@ -2816,11 +2893,11 @@ def : Pat<(v4f32 (fma (fneg V128:$Rn), V128:$Rm, V128:$Rd)), def : Pat<(v2f64 (fma (fneg V128:$Rn), V128:$Rm, V128:$Rd)), (FMLSv2f64 V128:$Rd, V128:$Rn, V128:$Rm)>; -defm FMULX : SIMDThreeSameVectorFP<0,0,0b11011,"fmulx", int_aarch64_neon_fmulx>; -defm FMUL : SIMDThreeSameVectorFP<1,0,0b11011,"fmul", fmul>; -defm FRECPS : SIMDThreeSameVectorFP<0,0,0b11111,"frecps", int_aarch64_neon_frecps>; -defm FRSQRTS : SIMDThreeSameVectorFP<0,1,0b11111,"frsqrts", int_aarch64_neon_frsqrts>; -defm FSUB : SIMDThreeSameVectorFP<0,1,0b11010,"fsub", fsub>; +defm FMULX : SIMDThreeSameVectorFP<0,0,0b011,"fmulx", int_aarch64_neon_fmulx>; +defm FMUL : SIMDThreeSameVectorFP<1,0,0b011,"fmul", fmul>; +defm FRECPS : SIMDThreeSameVectorFP<0,0,0b111,"frecps", int_aarch64_neon_frecps>; +defm FRSQRTS : SIMDThreeSameVectorFP<0,1,0b111,"frsqrts", int_aarch64_neon_frsqrts>; +defm FSUB : SIMDThreeSameVectorFP<0,1,0b010,"fsub", fsub>; defm MLA : SIMDThreeSameVectorBHSTied<0, 0b10010, "mla", TriOpFrag<(add node:$LHS, (mul node:$MHS, node:$RHS))> >; defm MLS : SIMDThreeSameVectorBHSTied<1, 0b10010, "mls", @@ -2833,9 +2910,9 @@ defm SABD : SIMDThreeSameVectorBHS<0,0b01110,"sabd", int_aarch64_neon_sabd>; defm SHADD : SIMDThreeSameVectorBHS<0,0b00000,"shadd", int_aarch64_neon_shadd>; defm SHSUB : SIMDThreeSameVectorBHS<0,0b00100,"shsub", int_aarch64_neon_shsub>; defm SMAXP : SIMDThreeSameVectorBHS<0,0b10100,"smaxp", int_aarch64_neon_smaxp>; -defm SMAX : SIMDThreeSameVectorBHS<0,0b01100,"smax", int_aarch64_neon_smax>; +defm SMAX : SIMDThreeSameVectorBHS<0,0b01100,"smax", smax>; defm SMINP : SIMDThreeSameVectorBHS<0,0b10101,"sminp", int_aarch64_neon_sminp>; -defm SMIN : SIMDThreeSameVectorBHS<0,0b01101,"smin", int_aarch64_neon_smin>; +defm SMIN : SIMDThreeSameVectorBHS<0,0b01101,"smin", smin>; defm SQADD : SIMDThreeSameVector<0,0b00001,"sqadd", int_aarch64_neon_sqadd>; defm SQDMULH : SIMDThreeSameVectorHS<0,0b10110,"sqdmulh",int_aarch64_neon_sqdmulh>; defm SQRDMULH : SIMDThreeSameVectorHS<1,0b10110,"sqrdmulh",int_aarch64_neon_sqrdmulh>; @@ -2852,9 +2929,9 @@ defm UABD : SIMDThreeSameVectorBHS<1,0b01110,"uabd", int_aarch64_neon_uabd>; defm UHADD : SIMDThreeSameVectorBHS<1,0b00000,"uhadd", int_aarch64_neon_uhadd>; defm UHSUB : SIMDThreeSameVectorBHS<1,0b00100,"uhsub", int_aarch64_neon_uhsub>; defm UMAXP : SIMDThreeSameVectorBHS<1,0b10100,"umaxp", int_aarch64_neon_umaxp>; -defm UMAX : SIMDThreeSameVectorBHS<1,0b01100,"umax", int_aarch64_neon_umax>; +defm UMAX : SIMDThreeSameVectorBHS<1,0b01100,"umax", umax>; defm UMINP : SIMDThreeSameVectorBHS<1,0b10101,"uminp", int_aarch64_neon_uminp>; -defm UMIN : SIMDThreeSameVectorBHS<1,0b01101,"umin", int_aarch64_neon_umin>; +defm UMIN : SIMDThreeSameVectorBHS<1,0b01101,"umin", umin>; defm UQADD : SIMDThreeSameVector<1,0b00001,"uqadd", int_aarch64_neon_uqadd>; defm UQRSHL : SIMDThreeSameVector<1,0b01011,"uqrshl", int_aarch64_neon_uqrshl>; defm UQSHL : SIMDThreeSameVector<1,0b01001,"uqshl", int_aarch64_neon_uqshl>; @@ -2879,54 +2956,6 @@ defm ORN : SIMDLogicalThreeVector<0, 0b11, "orn", BinOpFrag<(or node:$LHS, (vnot node:$RHS))> >; defm ORR : SIMDLogicalThreeVector<0, 0b10, "orr", or>; -def : Pat<(v8i8 (smin V64:$Rn, V64:$Rm)), - (SMINv8i8 V64:$Rn, V64:$Rm)>; -def : Pat<(v4i16 (smin V64:$Rn, V64:$Rm)), - (SMINv4i16 V64:$Rn, V64:$Rm)>; -def : Pat<(v2i32 (smin V64:$Rn, V64:$Rm)), - (SMINv2i32 V64:$Rn, V64:$Rm)>; -def : Pat<(v16i8 (smin V128:$Rn, V128:$Rm)), - (SMINv16i8 V128:$Rn, V128:$Rm)>; -def : Pat<(v8i16 (smin V128:$Rn, V128:$Rm)), - (SMINv8i16 V128:$Rn, V128:$Rm)>; 
-def : Pat<(v4i32 (smin V128:$Rn, V128:$Rm)), - (SMINv4i32 V128:$Rn, V128:$Rm)>; -def : Pat<(v8i8 (smax V64:$Rn, V64:$Rm)), - (SMAXv8i8 V64:$Rn, V64:$Rm)>; -def : Pat<(v4i16 (smax V64:$Rn, V64:$Rm)), - (SMAXv4i16 V64:$Rn, V64:$Rm)>; -def : Pat<(v2i32 (smax V64:$Rn, V64:$Rm)), - (SMAXv2i32 V64:$Rn, V64:$Rm)>; -def : Pat<(v16i8 (smax V128:$Rn, V128:$Rm)), - (SMAXv16i8 V128:$Rn, V128:$Rm)>; -def : Pat<(v8i16 (smax V128:$Rn, V128:$Rm)), - (SMAXv8i16 V128:$Rn, V128:$Rm)>; -def : Pat<(v4i32 (smax V128:$Rn, V128:$Rm)), - (SMAXv4i32 V128:$Rn, V128:$Rm)>; -def : Pat<(v8i8 (umin V64:$Rn, V64:$Rm)), - (UMINv8i8 V64:$Rn, V64:$Rm)>; -def : Pat<(v4i16 (umin V64:$Rn, V64:$Rm)), - (UMINv4i16 V64:$Rn, V64:$Rm)>; -def : Pat<(v2i32 (umin V64:$Rn, V64:$Rm)), - (UMINv2i32 V64:$Rn, V64:$Rm)>; -def : Pat<(v16i8 (umin V128:$Rn, V128:$Rm)), - (UMINv16i8 V128:$Rn, V128:$Rm)>; -def : Pat<(v8i16 (umin V128:$Rn, V128:$Rm)), - (UMINv8i16 V128:$Rn, V128:$Rm)>; -def : Pat<(v4i32 (umin V128:$Rn, V128:$Rm)), - (UMINv4i32 V128:$Rn, V128:$Rm)>; -def : Pat<(v8i8 (umax V64:$Rn, V64:$Rm)), - (UMAXv8i8 V64:$Rn, V64:$Rm)>; -def : Pat<(v4i16 (umax V64:$Rn, V64:$Rm)), - (UMAXv4i16 V64:$Rn, V64:$Rm)>; -def : Pat<(v2i32 (umax V64:$Rn, V64:$Rm)), - (UMAXv2i32 V64:$Rn, V64:$Rm)>; -def : Pat<(v16i8 (umax V128:$Rn, V128:$Rm)), - (UMAXv16i8 V128:$Rn, V128:$Rm)>; -def : Pat<(v8i16 (umax V128:$Rn, V128:$Rm)), - (UMAXv8i16 V128:$Rn, V128:$Rm)>; -def : Pat<(v4i32 (umax V128:$Rn, V128:$Rm)), - (UMAXv4i32 V128:$Rn, V128:$Rm)>; def : Pat<(AArch64bsl (v8i8 V64:$Rd), V64:$Rn, V64:$Rm), (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>; @@ -3052,6 +3081,14 @@ def : InstAlias<"{cmlt\t$dst.2d, $src1.2d, $src2.2d" # "|cmlt.2d\t$dst, $src1, $src2}", (CMGTv2i64 V128:$dst, V128:$src2, V128:$src1), 0>; +let Predicates = [HasNEON, HasFullFP16] in { +def : InstAlias<"{fcmle\t$dst.4h, $src1.4h, $src2.4h" # + "|fcmle.4h\t$dst, $src1, $src2}", + (FCMGEv4f16 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{fcmle\t$dst.8h, $src1.8h, $src2.8h" # + "|fcmle.8h\t$dst, $src1, $src2}", + (FCMGEv8f16 V128:$dst, V128:$src2, V128:$src1), 0>; +} def : InstAlias<"{fcmle\t$dst.2s, $src1.2s, $src2.2s" # "|fcmle.2s\t$dst, $src1, $src2}", (FCMGEv2f32 V64:$dst, V64:$src2, V64:$src1), 0>; @@ -3062,6 +3099,14 @@ def : InstAlias<"{fcmle\t$dst.2d, $src1.2d, $src2.2d" # "|fcmle.2d\t$dst, $src1, $src2}", (FCMGEv2f64 V128:$dst, V128:$src2, V128:$src1), 0>; +let Predicates = [HasNEON, HasFullFP16] in { +def : InstAlias<"{fcmlt\t$dst.4h, $src1.4h, $src2.4h" # + "|fcmlt.4h\t$dst, $src1, $src2}", + (FCMGTv4f16 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{fcmlt\t$dst.8h, $src1.8h, $src2.8h" # + "|fcmlt.8h\t$dst, $src1, $src2}", + (FCMGTv8f16 V128:$dst, V128:$src2, V128:$src1), 0>; +} def : InstAlias<"{fcmlt\t$dst.2s, $src1.2s, $src2.2s" # "|fcmlt.2s\t$dst, $src1, $src2}", (FCMGTv2f32 V64:$dst, V64:$src2, V64:$src1), 0>; @@ -3072,6 +3117,14 @@ def : InstAlias<"{fcmlt\t$dst.2d, $src1.2d, $src2.2d" # "|fcmlt.2d\t$dst, $src1, $src2}", (FCMGTv2f64 V128:$dst, V128:$src2, V128:$src1), 0>; +let Predicates = [HasNEON, HasFullFP16] in { +def : InstAlias<"{facle\t$dst.4h, $src1.4h, $src2.4h" # + "|facle.4h\t$dst, $src1, $src2}", + (FACGEv4f16 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{facle\t$dst.8h, $src1.8h, $src2.8h" # + "|facle.8h\t$dst, $src1, $src2}", + (FACGEv8f16 V128:$dst, V128:$src2, V128:$src1), 0>; +} def : InstAlias<"{facle\t$dst.2s, $src1.2s, $src2.2s" # "|facle.2s\t$dst, $src1, $src2}", (FACGEv2f32 V64:$dst, V64:$src2, V64:$src1), 0>; @@ -3082,6 +3135,14 @@ def : 
InstAlias<"{facle\t$dst.2d, $src1.2d, $src2.2d" # "|facle.2d\t$dst, $src1, $src2}", (FACGEv2f64 V128:$dst, V128:$src2, V128:$src1), 0>; +let Predicates = [HasNEON, HasFullFP16] in { +def : InstAlias<"{faclt\t$dst.4h, $src1.4h, $src2.4h" # + "|faclt.4h\t$dst, $src1, $src2}", + (FACGTv4f16 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{faclt\t$dst.8h, $src1.8h, $src2.8h" # + "|faclt.8h\t$dst, $src1, $src2}", + (FACGTv8f16 V128:$dst, V128:$src2, V128:$src1), 0>; +} def : InstAlias<"{faclt\t$dst.2s, $src1.2s, $src2.2s" # "|faclt.2s\t$dst, $src1, $src2}", (FACGTv2f32 V64:$dst, V64:$src2, V64:$src1), 0>; @@ -3103,19 +3164,19 @@ defm CMGT : SIMDThreeScalarD<0, 0b00110, "cmgt", AArch64cmgt>; defm CMHI : SIMDThreeScalarD<1, 0b00110, "cmhi", AArch64cmhi>; defm CMHS : SIMDThreeScalarD<1, 0b00111, "cmhs", AArch64cmhs>; defm CMTST : SIMDThreeScalarD<0, 0b10001, "cmtst", AArch64cmtst>; -defm FABD : SIMDThreeScalarSD<1, 1, 0b11010, "fabd", int_aarch64_sisd_fabd>; +defm FABD : SIMDFPThreeScalar<1, 1, 0b010, "fabd", int_aarch64_sisd_fabd>; def : Pat<(v1f64 (int_aarch64_neon_fabd (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), (FABD64 FPR64:$Rn, FPR64:$Rm)>; -defm FACGE : SIMDThreeScalarFPCmp<1, 0, 0b11101, "facge", +defm FACGE : SIMDThreeScalarFPCmp<1, 0, 0b101, "facge", int_aarch64_neon_facge>; -defm FACGT : SIMDThreeScalarFPCmp<1, 1, 0b11101, "facgt", +defm FACGT : SIMDThreeScalarFPCmp<1, 1, 0b101, "facgt", int_aarch64_neon_facgt>; -defm FCMEQ : SIMDThreeScalarFPCmp<0, 0, 0b11100, "fcmeq", AArch64fcmeq>; -defm FCMGE : SIMDThreeScalarFPCmp<1, 0, 0b11100, "fcmge", AArch64fcmge>; -defm FCMGT : SIMDThreeScalarFPCmp<1, 1, 0b11100, "fcmgt", AArch64fcmgt>; -defm FMULX : SIMDThreeScalarSD<0, 0, 0b11011, "fmulx", int_aarch64_neon_fmulx>; -defm FRECPS : SIMDThreeScalarSD<0, 0, 0b11111, "frecps", int_aarch64_neon_frecps>; -defm FRSQRTS : SIMDThreeScalarSD<0, 1, 0b11111, "frsqrts", int_aarch64_neon_frsqrts>; +defm FCMEQ : SIMDThreeScalarFPCmp<0, 0, 0b100, "fcmeq", AArch64fcmeq>; +defm FCMGE : SIMDThreeScalarFPCmp<1, 0, 0b100, "fcmge", AArch64fcmge>; +defm FCMGT : SIMDThreeScalarFPCmp<1, 1, 0b100, "fcmgt", AArch64fcmgt>; +defm FMULX : SIMDFPThreeScalar<0, 0, 0b011, "fmulx", int_aarch64_neon_fmulx>; +defm FRECPS : SIMDFPThreeScalar<0, 0, 0b111, "frecps", int_aarch64_neon_frecps>; +defm FRSQRTS : SIMDFPThreeScalar<0, 1, 0b111, "frsqrts", int_aarch64_neon_frsqrts>; defm SQADD : SIMDThreeScalarBHSD<0, 0b00001, "sqadd", int_aarch64_neon_sqadd>; defm SQDMULH : SIMDThreeScalarHS< 0, 0b10110, "sqdmulh", int_aarch64_neon_sqdmulh>; defm SQRDMULH : SIMDThreeScalarHS< 1, 0b10110, "sqrdmulh", int_aarch64_neon_sqrdmulh>; @@ -3198,35 +3259,35 @@ defm CMGE : SIMDCmpTwoScalarD< 1, 0b01000, "cmge", AArch64cmgez>; defm CMGT : SIMDCmpTwoScalarD< 0, 0b01000, "cmgt", AArch64cmgtz>; defm CMLE : SIMDCmpTwoScalarD< 1, 0b01001, "cmle", AArch64cmlez>; defm CMLT : SIMDCmpTwoScalarD< 0, 0b01010, "cmlt", AArch64cmltz>; -defm FCMEQ : SIMDCmpTwoScalarSD<0, 1, 0b01101, "fcmeq", AArch64fcmeqz>; -defm FCMGE : SIMDCmpTwoScalarSD<1, 1, 0b01100, "fcmge", AArch64fcmgez>; -defm FCMGT : SIMDCmpTwoScalarSD<0, 1, 0b01100, "fcmgt", AArch64fcmgtz>; -defm FCMLE : SIMDCmpTwoScalarSD<1, 1, 0b01101, "fcmle", AArch64fcmlez>; -defm FCMLT : SIMDCmpTwoScalarSD<0, 1, 0b01110, "fcmlt", AArch64fcmltz>; -defm FCVTAS : SIMDTwoScalarSD< 0, 0, 0b11100, "fcvtas">; -defm FCVTAU : SIMDTwoScalarSD< 1, 0, 0b11100, "fcvtau">; -defm FCVTMS : SIMDTwoScalarSD< 0, 0, 0b11011, "fcvtms">; -defm FCVTMU : SIMDTwoScalarSD< 1, 0, 0b11011, "fcvtmu">; -defm FCVTNS : SIMDTwoScalarSD< 0, 0, 
0b11010, "fcvtns">; -defm FCVTNU : SIMDTwoScalarSD< 1, 0, 0b11010, "fcvtnu">; -defm FCVTPS : SIMDTwoScalarSD< 0, 1, 0b11010, "fcvtps">; -defm FCVTPU : SIMDTwoScalarSD< 1, 1, 0b11010, "fcvtpu">; +defm FCMEQ : SIMDFPCmpTwoScalar<0, 1, 0b01101, "fcmeq", AArch64fcmeqz>; +defm FCMGE : SIMDFPCmpTwoScalar<1, 1, 0b01100, "fcmge", AArch64fcmgez>; +defm FCMGT : SIMDFPCmpTwoScalar<0, 1, 0b01100, "fcmgt", AArch64fcmgtz>; +defm FCMLE : SIMDFPCmpTwoScalar<1, 1, 0b01101, "fcmle", AArch64fcmlez>; +defm FCMLT : SIMDFPCmpTwoScalar<0, 1, 0b01110, "fcmlt", AArch64fcmltz>; +defm FCVTAS : SIMDFPTwoScalar< 0, 0, 0b11100, "fcvtas">; +defm FCVTAU : SIMDFPTwoScalar< 1, 0, 0b11100, "fcvtau">; +defm FCVTMS : SIMDFPTwoScalar< 0, 0, 0b11011, "fcvtms">; +defm FCVTMU : SIMDFPTwoScalar< 1, 0, 0b11011, "fcvtmu">; +defm FCVTNS : SIMDFPTwoScalar< 0, 0, 0b11010, "fcvtns">; +defm FCVTNU : SIMDFPTwoScalar< 1, 0, 0b11010, "fcvtnu">; +defm FCVTPS : SIMDFPTwoScalar< 0, 1, 0b11010, "fcvtps">; +defm FCVTPU : SIMDFPTwoScalar< 1, 1, 0b11010, "fcvtpu">; def FCVTXNv1i64 : SIMDInexactCvtTwoScalar<0b10110, "fcvtxn">; -defm FCVTZS : SIMDTwoScalarSD< 0, 1, 0b11011, "fcvtzs">; -defm FCVTZU : SIMDTwoScalarSD< 1, 1, 0b11011, "fcvtzu">; -defm FRECPE : SIMDTwoScalarSD< 0, 1, 0b11101, "frecpe">; -defm FRECPX : SIMDTwoScalarSD< 0, 1, 0b11111, "frecpx">; -defm FRSQRTE : SIMDTwoScalarSD< 1, 1, 0b11101, "frsqrte">; +defm FCVTZS : SIMDFPTwoScalar< 0, 1, 0b11011, "fcvtzs">; +defm FCVTZU : SIMDFPTwoScalar< 1, 1, 0b11011, "fcvtzu">; +defm FRECPE : SIMDFPTwoScalar< 0, 1, 0b11101, "frecpe">; +defm FRECPX : SIMDFPTwoScalar< 0, 1, 0b11111, "frecpx">; +defm FRSQRTE : SIMDFPTwoScalar< 1, 1, 0b11101, "frsqrte">; defm NEG : SIMDTwoScalarD< 1, 0b01011, "neg", UnOpFrag<(sub immAllZerosV, node:$LHS)> >; -defm SCVTF : SIMDTwoScalarCVTSD< 0, 0, 0b11101, "scvtf", AArch64sitof>; +defm SCVTF : SIMDFPTwoScalarCVT< 0, 0, 0b11101, "scvtf", AArch64sitof>; defm SQABS : SIMDTwoScalarBHSD< 0, 0b00111, "sqabs", int_aarch64_neon_sqabs>; defm SQNEG : SIMDTwoScalarBHSD< 1, 0b00111, "sqneg", int_aarch64_neon_sqneg>; defm SQXTN : SIMDTwoScalarMixedBHS< 0, 0b10100, "sqxtn", int_aarch64_neon_scalar_sqxtn>; defm SQXTUN : SIMDTwoScalarMixedBHS< 1, 0b10010, "sqxtun", int_aarch64_neon_scalar_sqxtun>; defm SUQADD : SIMDTwoScalarBHSDTied< 0, 0b00011, "suqadd", int_aarch64_neon_suqadd>; -defm UCVTF : SIMDTwoScalarCVTSD< 1, 0, 0b11101, "ucvtf", AArch64uitof>; +defm UCVTF : SIMDFPTwoScalarCVT< 1, 0, 0b11101, "ucvtf", AArch64uitof>; defm UQXTN : SIMDTwoScalarMixedBHS<1, 0b10100, "uqxtn", int_aarch64_neon_scalar_uqxtn>; defm USQADD : SIMDTwoScalarBHSDTied< 1, 0b00011, "usqadd", int_aarch64_neon_usqadd>; @@ -3390,8 +3451,6 @@ defm SSUBW : SIMDWideThreeVectorBHS<0, 0b0011, "ssubw", BinOpFrag<(sub node:$LHS, (sext node:$RHS))>>; defm UABAL : SIMDLongThreeVectorTiedBHSabal<1, 0b0101, "uabal", int_aarch64_neon_uabd>; -defm UABDL : SIMDLongThreeVectorBHSabdl<1, 0b0111, "uabdl", - int_aarch64_neon_uabd>; defm UADDL : SIMDLongThreeVectorBHS<1, 0b0000, "uaddl", BinOpFrag<(add (zext node:$LHS), (zext node:$RHS))>>; defm UADDW : SIMDWideThreeVectorBHS<1, 0b0001, "uaddw", @@ -3449,8 +3508,8 @@ defm : Neon_mulacc_widen_patterns< // Patterns for 64-bit pmull def : Pat<(int_aarch64_neon_pmull64 V64:$Rn, V64:$Rm), (PMULLv1i64 V64:$Rn, V64:$Rm)>; -def : Pat<(int_aarch64_neon_pmull64 (vector_extract (v2i64 V128:$Rn), (i64 1)), - (vector_extract (v2i64 V128:$Rm), (i64 1))), +def : Pat<(int_aarch64_neon_pmull64 (extractelt (v2i64 V128:$Rn), (i64 1)), + (extractelt (v2i64 V128:$Rm), (i64 1))), (PMULLv2i64 V128:$Rn, 
V128:$Rm)>; // CodeGen patterns for addhn and subhn instructions, which can actually be @@ -3593,11 +3652,11 @@ defm CPY : SIMDScalarCPY<"cpy">; //---------------------------------------------------------------------------- defm ADDP : SIMDPairwiseScalarD<0, 0b11011, "addp">; -defm FADDP : SIMDPairwiseScalarSD<1, 0, 0b01101, "faddp">; -defm FMAXNMP : SIMDPairwiseScalarSD<1, 0, 0b01100, "fmaxnmp">; -defm FMAXP : SIMDPairwiseScalarSD<1, 0, 0b01111, "fmaxp">; -defm FMINNMP : SIMDPairwiseScalarSD<1, 1, 0b01100, "fminnmp">; -defm FMINP : SIMDPairwiseScalarSD<1, 1, 0b01111, "fminp">; +defm FADDP : SIMDFPPairwiseScalar<0, 0b01101, "faddp">; +defm FMAXNMP : SIMDFPPairwiseScalar<0, 0b01100, "fmaxnmp">; +defm FMAXP : SIMDFPPairwiseScalar<0, 0b01111, "fmaxp">; +defm FMINNMP : SIMDFPPairwiseScalar<1, 0b01100, "fminnmp">; +defm FMINP : SIMDFPPairwiseScalar<1, 0b01111, "fminp">; def : Pat<(v2i64 (AArch64saddv V128:$Rn)), (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), (ADDPv2i64p V128:$Rn), dsub)>; def : Pat<(v2i64 (AArch64uaddv V128:$Rn)), @@ -3713,12 +3772,12 @@ defm : DUPWithTruncPats<v8i16, v2i32, v4i32, i32, DUPv8i16lane, VecIndex_x2>; multiclass DUPWithTrunci64Pats<ValueType ResVT, Instruction DUP, SDNodeXForm IdxXFORM> { - def : Pat<(ResVT (AArch64dup (i32 (trunc (vector_extract (v2i64 V128:$Rn), + def : Pat<(ResVT (AArch64dup (i32 (trunc (extractelt (v2i64 V128:$Rn), imm:$idx))))), (DUP V128:$Rn, (IdxXFORM imm:$idx))>; - def : Pat<(ResVT (AArch64dup (i32 (trunc (vector_extract (v1i64 V64:$Rn), - imm:$idx))))), + def : Pat<(ResVT (AArch64dup (i32 (trunc (extractelt (v1i64 V64:$Rn), + imm:$idx))))), (DUP (SUBREG_TO_REG (i64 0), V64:$Rn, dsub), (IdxXFORM imm:$idx))>; } @@ -3747,6 +3806,13 @@ def : Pat<(sext_inreg (vector_extract (v8i16 V128:$Rn), VectorIndexH:$idx),i16), def : Pat<(sext (i32 (vector_extract (v4i32 V128:$Rn), VectorIndexS:$idx))), (i64 (SMOVvi32to64 V128:$Rn, VectorIndexS:$idx))>; +def : Pat<(sext_inreg (i64 (anyext (i32 (vector_extract (v16i8 V128:$Rn), + VectorIndexB:$idx)))), i8), + (i64 (SMOVvi8to64 V128:$Rn, VectorIndexB:$idx))>; +def : Pat<(sext_inreg (i64 (anyext (i32 (vector_extract (v8i16 V128:$Rn), + VectorIndexH:$idx)))), i16), + (i64 (SMOVvi16to64 V128:$Rn, VectorIndexH:$idx))>; + // Extracting i8 or i16 elements will have the zero-extend transformed to // an 'and' mask by type legalization since neither i8 nor i16 are legal types // for AArch64. 
Match these patterns here since UMOV already zeroes out the high @@ -3784,6 +3850,11 @@ def : Pat<(v2i64 (scalar_to_vector (i64 FPR64:$Rn))), (v2i64 (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), (i64 FPR64:$Rn), dsub))>; +def : Pat<(v4f16 (scalar_to_vector (f16 FPR16:$Rn))), + (INSERT_SUBREG (v4f16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>; +def : Pat<(v8f16 (scalar_to_vector (f16 FPR16:$Rn))), + (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>; + def : Pat<(v4f32 (scalar_to_vector (f32 FPR32:$Rn))), (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR32:$Rn, ssub)>; def : Pat<(v2f32 (scalar_to_vector (f32 FPR32:$Rn))), @@ -3949,10 +4020,10 @@ defm UMAXV : SIMDAcrossLanesBHS<1, 0b01010, "umaxv">; defm UMINV : SIMDAcrossLanesBHS<1, 0b11010, "uminv">; defm SADDLV : SIMDAcrossLanesHSD<0, 0b00011, "saddlv">; defm UADDLV : SIMDAcrossLanesHSD<1, 0b00011, "uaddlv">; -defm FMAXNMV : SIMDAcrossLanesS<0b01100, 0, "fmaxnmv", int_aarch64_neon_fmaxnmv>; -defm FMAXV : SIMDAcrossLanesS<0b01111, 0, "fmaxv", int_aarch64_neon_fmaxv>; -defm FMINNMV : SIMDAcrossLanesS<0b01100, 1, "fminnmv", int_aarch64_neon_fminnmv>; -defm FMINV : SIMDAcrossLanesS<0b01111, 1, "fminv", int_aarch64_neon_fminv>; +defm FMAXNMV : SIMDFPAcrossLanes<0b01100, 0, "fmaxnmv", int_aarch64_neon_fmaxnmv>; +defm FMAXV : SIMDFPAcrossLanes<0b01111, 0, "fmaxv", int_aarch64_neon_fmaxv>; +defm FMINNMV : SIMDFPAcrossLanes<0b01100, 1, "fminnmv", int_aarch64_neon_fminnmv>; +defm FMINV : SIMDFPAcrossLanes<0b01111, 1, "fminv", int_aarch64_neon_fminv>; // Patterns for across-vector intrinsics, that have a node equivalent, that // returns a vector (with only the low lane defined) instead of a scalar. @@ -4199,15 +4270,23 @@ def : InstAlias<"orr.2s $Vd, $imm", (ORRv2i32 V64:$Vd, imm0_255:$imm, 0), 0>; def : InstAlias<"orr.4s $Vd, $imm", (ORRv4i32 V128:$Vd, imm0_255:$imm, 0), 0>; // AdvSIMD FMOV -def FMOVv2f64_ns : SIMDModifiedImmVectorNoShift<1, 1, 0b1111, V128, fpimm8, +def FMOVv2f64_ns : SIMDModifiedImmVectorNoShift<1, 1, 0, 0b1111, V128, fpimm8, "fmov", ".2d", [(set (v2f64 V128:$Rd), (AArch64fmov imm0_255:$imm8))]>; -def FMOVv2f32_ns : SIMDModifiedImmVectorNoShift<0, 0, 0b1111, V64, fpimm8, +def FMOVv2f32_ns : SIMDModifiedImmVectorNoShift<0, 0, 0, 0b1111, V64, fpimm8, "fmov", ".2s", [(set (v2f32 V64:$Rd), (AArch64fmov imm0_255:$imm8))]>; -def FMOVv4f32_ns : SIMDModifiedImmVectorNoShift<1, 0, 0b1111, V128, fpimm8, +def FMOVv4f32_ns : SIMDModifiedImmVectorNoShift<1, 0, 0, 0b1111, V128, fpimm8, "fmov", ".4s", [(set (v4f32 V128:$Rd), (AArch64fmov imm0_255:$imm8))]>; +let Predicates = [HasNEON, HasFullFP16] in { +def FMOVv4f16_ns : SIMDModifiedImmVectorNoShift<0, 0, 1, 0b1111, V64, fpimm8, + "fmov", ".4h", + [(set (v4f16 V64:$Rd), (AArch64fmov imm0_255:$imm8))]>; +def FMOVv8f16_ns : SIMDModifiedImmVectorNoShift<1, 0, 1, 0b1111, V128, fpimm8, + "fmov", ".8h", + [(set (v8f16 V128:$Rd), (AArch64fmov imm0_255:$imm8))]>; +} // Predicates = [HasNEON, HasFullFP16] // AdvSIMD MOVI @@ -4235,7 +4314,7 @@ def : Pat<(v8i8 immAllOnesV), (MOVID (i32 255))>; // The movi_edit node has the immediate value already encoded, so we use // a plain imm0_255 in the pattern let isReMaterializable = 1, isAsCheapAsAMove = 1 in -def MOVIv2d_ns : SIMDModifiedImmVectorNoShift<1, 1, 0b1110, V128, +def MOVIv2d_ns : SIMDModifiedImmVectorNoShift<1, 1, 0, 0b1110, V128, simdimmtype10, "movi", ".2d", [(set (v2i64 V128:$Rd), (AArch64movi_edit imm0_255:$imm8))]>; @@ -4296,10 +4375,10 @@ def MOVIv4s_msl : SIMDModifiedImmMoveMSL<1, 0, {1,1,0,?}, V128, "movi", ".4s", (AArch64movi_msl imm0_255:$imm8, (i32 
imm:$shift)))]>; // Per byte: 8b & 16b -def MOVIv8b_ns : SIMDModifiedImmVectorNoShift<0, 0, 0b1110, V64, imm0_255, +def MOVIv8b_ns : SIMDModifiedImmVectorNoShift<0, 0, 0, 0b1110, V64, imm0_255, "movi", ".8b", [(set (v8i8 V64:$Rd), (AArch64movi imm0_255:$imm8))]>; -def MOVIv16b_ns : SIMDModifiedImmVectorNoShift<1, 0, 0b1110, V128, imm0_255, +def MOVIv16b_ns : SIMDModifiedImmVectorNoShift<1, 0, 0, 0b1110, V128, imm0_255, "movi", ".16b", [(set (v16i8 V128:$Rd), (AArch64movi imm0_255:$imm8))]>; @@ -4340,8 +4419,8 @@ def MVNIv4s_msl : SIMDModifiedImmMoveMSL<1, 1, {1,1,0,?}, V128, "mvni", ".4s", //---------------------------------------------------------------------------- let hasSideEffects = 0 in { - defm FMLA : SIMDFPIndexedSDTied<0, 0b0001, "fmla">; - defm FMLS : SIMDFPIndexedSDTied<0, 0b0101, "fmls">; + defm FMLA : SIMDFPIndexedTied<0, 0b0001, "fmla">; + defm FMLS : SIMDFPIndexedTied<0, 0b0101, "fmls">; } // NOTE: Operands are reordered in the FMLA/FMLS PatFrags because the @@ -4349,18 +4428,18 @@ let hasSideEffects = 0 in { // On the other hand, there are quite a few valid combinatorial options due to // the commutativity of multiplication and the fact that (-x) * y = x * (-y). -defm : SIMDFPIndexedSDTiedPatterns<"FMLA", +defm : SIMDFPIndexedTiedPatterns<"FMLA", TriOpFrag<(fma node:$RHS, node:$MHS, node:$LHS)>>; -defm : SIMDFPIndexedSDTiedPatterns<"FMLA", +defm : SIMDFPIndexedTiedPatterns<"FMLA", TriOpFrag<(fma node:$MHS, node:$RHS, node:$LHS)>>; -defm : SIMDFPIndexedSDTiedPatterns<"FMLS", +defm : SIMDFPIndexedTiedPatterns<"FMLS", TriOpFrag<(fma node:$MHS, (fneg node:$RHS), node:$LHS)> >; -defm : SIMDFPIndexedSDTiedPatterns<"FMLS", +defm : SIMDFPIndexedTiedPatterns<"FMLS", TriOpFrag<(fma node:$RHS, (fneg node:$MHS), node:$LHS)> >; -defm : SIMDFPIndexedSDTiedPatterns<"FMLS", +defm : SIMDFPIndexedTiedPatterns<"FMLS", TriOpFrag<(fma (fneg node:$RHS), node:$MHS, node:$LHS)> >; -defm : SIMDFPIndexedSDTiedPatterns<"FMLS", +defm : SIMDFPIndexedTiedPatterns<"FMLS", TriOpFrag<(fma (fneg node:$MHS), node:$RHS, node:$LHS)> >; multiclass FMLSIndexedAfterNegPatterns<SDPatternOperator OpNode> { @@ -4424,7 +4503,9 @@ multiclass FMLSIndexedAfterNegPatterns<SDPatternOperator OpNode> { (FMLSv1i32_indexed FPR32:$Rd, FPR32:$Rn, V128:$Rm, VectorIndexS:$idx)>; def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn), - (vector_extract (v2f32 (fneg V64:$Rm)), + (vector_extract (v4f32 (insert_subvector undef, + (v2f32 (fneg V64:$Rm)), + (i32 0))), VectorIndexS:$idx))), (FMLSv1i32_indexed FPR32:$Rd, FPR32:$Rn, (SUBREG_TO_REG (i32 0), V64:$Rm, dsub), VectorIndexS:$idx)>; @@ -4442,8 +4523,8 @@ defm : FMLSIndexedAfterNegPatterns< defm : FMLSIndexedAfterNegPatterns< TriOpFrag<(fma node:$MHS, node:$RHS, node:$LHS)> >; -defm FMULX : SIMDFPIndexedSD<1, 0b1001, "fmulx", int_aarch64_neon_fmulx>; -defm FMUL : SIMDFPIndexedSD<0, 0b1001, "fmul", fmul>; +defm FMULX : SIMDFPIndexed<1, 0b1001, "fmulx", int_aarch64_neon_fmulx>; +defm FMUL : SIMDFPIndexed<0, 0b1001, "fmul", fmul>; def : Pat<(v2f32 (fmul V64:$Rn, (AArch64dup (f32 FPR32:$Rm)))), (FMULv2i32_indexed V64:$Rn, @@ -4497,10 +4578,10 @@ def : Pat<(int_aarch64_neon_sqdmulls_scalar (i32 FPR32:$Rn), //---------------------------------------------------------------------------- // AdvSIMD scalar shift instructions //---------------------------------------------------------------------------- -defm FCVTZS : SIMDScalarRShiftSD<0, 0b11111, "fcvtzs">; -defm FCVTZU : SIMDScalarRShiftSD<1, 0b11111, "fcvtzu">; -defm SCVTF : SIMDScalarRShiftSD<0, 0b11100, "scvtf">; -defm UCVTF : 
SIMDScalarRShiftSD<1, 0b11100, "ucvtf">; +defm FCVTZS : SIMDFPScalarRShift<0, 0b11111, "fcvtzs">; +defm FCVTZU : SIMDFPScalarRShift<1, 0b11111, "fcvtzu">; +defm SCVTF : SIMDFPScalarRShift<0, 0b11100, "scvtf">; +defm UCVTF : SIMDFPScalarRShift<1, 0b11100, "ucvtf">; // Codegen patterns for the above. We don't put these directly on the // instructions because TableGen's type inference can't handle the truth. // Having the same base pattern for fp <--> int totally freaks it out. @@ -4573,7 +4654,7 @@ defm USRA : SIMDScalarRShiftDTied< 1, 0b00010, "usra", //---------------------------------------------------------------------------- defm FCVTZS:SIMDVectorRShiftSD<0, 0b11111, "fcvtzs", int_aarch64_neon_vcvtfp2fxs>; defm FCVTZU:SIMDVectorRShiftSD<1, 0b11111, "fcvtzu", int_aarch64_neon_vcvtfp2fxu>; -defm SCVTF: SIMDVectorRShiftSDToFP<0, 0b11100, "scvtf", +defm SCVTF: SIMDVectorRShiftToFP<0, 0b11100, "scvtf", int_aarch64_neon_vcvtfxs2fp>; defm RSHRN : SIMDVectorRShiftNarrowBHS<0, 0b10001, "rshrn", int_aarch64_neon_rshrn>; @@ -4608,7 +4689,7 @@ defm SSHLL : SIMDVectorLShiftLongBHSD<0, 0b10100, "sshll", defm SSHR : SIMDVectorRShiftBHSD<0, 0b00000, "sshr", AArch64vashr>; defm SSRA : SIMDVectorRShiftBHSDTied<0, 0b00010, "ssra", TriOpFrag<(add node:$LHS, (AArch64vashr node:$MHS, node:$RHS))>>; -defm UCVTF : SIMDVectorRShiftSDToFP<1, 0b11100, "ucvtf", +defm UCVTF : SIMDVectorRShiftToFP<1, 0b11100, "ucvtf", int_aarch64_neon_vcvtfxu2fp>; defm UQRSHRN : SIMDVectorRShiftNarrowBHS<1, 0b10011, "uqrshrn", int_aarch64_neon_uqrshrn>; @@ -5133,10 +5214,10 @@ def : Pat<(i64 (zext def32:$src)), (SUBREG_TO_REG (i64 0), GPR32:$src, sub_32)>; def : Pat<(i64 (anyext GPR32:$src)), (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$src, sub_32)>; -// When we need to explicitly zero-extend, we use an unsigned bitfield move -// instruction (UBFM) on the enclosing super-reg. +// When we need to explicitly zero-extend, we use a 32-bit MOV instruction and +// then assert the extension has happened. def : Pat<(i64 (zext GPR32:$src)), - (UBFMXri (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$src, sub_32), 0, 31)>; + (SUBREG_TO_REG (i32 0), (ORRWrs WZR, GPR32:$src, 0), sub_32)>; // To sign extend, we use a signed bitfield move instruction (SBFM) on the // containing super-reg. @@ -5801,6 +5882,21 @@ def : Pat<(v16i8 (bitconvert (v8f16 FPR128:$src))), (v16i8 (REV16v16i8 FPR128:$src))>; } +def : Pat<(v4i16 (extract_subvector V128:$Rn, (i64 0))), + (EXTRACT_SUBREG V128:$Rn, dsub)>; +def : Pat<(v8i8 (extract_subvector V128:$Rn, (i64 0))), + (EXTRACT_SUBREG V128:$Rn, dsub)>; +def : Pat<(v2f32 (extract_subvector V128:$Rn, (i64 0))), + (EXTRACT_SUBREG V128:$Rn, dsub)>; +def : Pat<(v4f16 (extract_subvector V128:$Rn, (i64 0))), + (EXTRACT_SUBREG V128:$Rn, dsub)>; +def : Pat<(v2i32 (extract_subvector V128:$Rn, (i64 0))), + (EXTRACT_SUBREG V128:$Rn, dsub)>; +def : Pat<(v1i64 (extract_subvector V128:$Rn, (i64 0))), + (EXTRACT_SUBREG V128:$Rn, dsub)>; +def : Pat<(v1f64 (extract_subvector V128:$Rn, (i64 0))), + (EXTRACT_SUBREG V128:$Rn, dsub)>; + def : Pat<(v8i8 (extract_subvector (v16i8 FPR128:$Rn), (i64 1))), (EXTRACT_SUBREG (DUPv2i64lane FPR128:$Rn, 1), dsub)>; def : Pat<(v4i16 (extract_subvector (v8i16 FPR128:$Rn), (i64 1))), @@ -5852,6 +5948,45 @@ def : Pat<(i64 (int_aarch64_neon_srshl (i64 FPR64:$Rn), (i64 FPR64:$Rm))), def : Pat<(i64 (int_aarch64_neon_urshl (i64 FPR64:$Rn), (i64 FPR64:$Rm))), (URSHLv1i64 FPR64:$Rn, FPR64:$Rm)>; +// Patterns for nontemporal/no-allocate stores. 
+// We have to resort to tricks to turn a single-input store into a store pair, +// because there is no single-input nontemporal store, only STNP. +let Predicates = [IsLE] in { +let AddedComplexity = 15 in { +class NTStore128Pat<ValueType VT> : + Pat<(nontemporalstore (VT FPR128:$Rt), + (am_indexed7s64 GPR64sp:$Rn, simm7s8:$offset)), + (STNPDi (EXTRACT_SUBREG FPR128:$Rt, dsub), + (CPYi64 FPR128:$Rt, (i64 1)), + GPR64sp:$Rn, simm7s8:$offset)>; + +def : NTStore128Pat<v2i64>; +def : NTStore128Pat<v4i32>; +def : NTStore128Pat<v8i16>; +def : NTStore128Pat<v16i8>; + +class NTStore64Pat<ValueType VT> : + Pat<(nontemporalstore (VT FPR64:$Rt), + (am_indexed7s32 GPR64sp:$Rn, simm7s4:$offset)), + (STNPSi (EXTRACT_SUBREG FPR64:$Rt, ssub), + (CPYi32 (SUBREG_TO_REG (i64 0), FPR64:$Rt, dsub), (i64 1)), + GPR64sp:$Rn, simm7s4:$offset)>; + +// FIXME: Shouldn't v1f64 loads/stores be promoted to v1i64? +def : NTStore64Pat<v1f64>; +def : NTStore64Pat<v1i64>; +def : NTStore64Pat<v2i32>; +def : NTStore64Pat<v4i16>; +def : NTStore64Pat<v8i8>; + +def : Pat<(nontemporalstore GPR64:$Rt, + (am_indexed7s32 GPR64sp:$Rn, simm7s4:$offset)), + (STNPWi (EXTRACT_SUBREG GPR64:$Rt, sub_32), + (EXTRACT_SUBREG (UBFMXri GPR64:$Rt, 0, 31), sub_32), + GPR64sp:$Rn, simm7s4:$offset)>; +} // AddedComplexity=10 +} // Predicates = [IsLE] + // Tail call return handling. These are all compiler pseudo-instructions, // so no encoding information or anything like that. let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [SP] in { diff --git a/contrib/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/contrib/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp index 82f77a77ab5e..566aa2c9a9ba 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp @@ -41,54 +41,85 @@ STATISTIC(NumPostFolded, "Number of post-index updates folded"); STATISTIC(NumPreFolded, "Number of pre-index updates folded"); STATISTIC(NumUnscaledPairCreated, "Number of load/store from unscaled generated"); +STATISTIC(NumNarrowLoadsPromoted, "Number of narrow loads promoted"); +STATISTIC(NumZeroStoresPromoted, "Number of narrow zero stores promoted"); +STATISTIC(NumLoadsFromStoresPromoted, "Number of loads from stores promoted"); static cl::opt<unsigned> ScanLimit("aarch64-load-store-scan-limit", cl::init(20), cl::Hidden); -// Place holder while testing unscaled load/store combining -static cl::opt<bool> EnableAArch64UnscaledMemOp( - "aarch64-unscaled-mem-op", cl::Hidden, - cl::desc("Allow AArch64 unscaled load/store combining"), cl::init(true)); +namespace llvm { +void initializeAArch64LoadStoreOptPass(PassRegistry &); +} + +#define AARCH64_LOAD_STORE_OPT_NAME "AArch64 load / store optimization pass" namespace { + +typedef struct LdStPairFlags { + // If a matching instruction is found, MergeForward is set to true if the + // merge is to remove the first instruction and replace the second with + // a pair-wise insn, and false if the reverse is true. + bool MergeForward; + + // SExtIdx gives the index of the result of the load pair that must be + // extended. The value of SExtIdx assumes that the paired load produces the + // value in this order: (I, returned iterator), i.e., -1 means no value has + // to be extended, 0 means I, and 1 means the returned iterator. 
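A minimal sketch of the LdStPairFlags helper whose fields follow below, showing how findMatchingInsn() hands its result to mergePairedInsns() instead of threading MergeForward and SExtIdx through separate output parameters; the driver in main() is hypothetical and only illustrates the convention:

#include <cstdio>

// Mirrors the struct introduced in this hunk. MergeForward == true means
// the first instruction is erased and its operation folded into the
// second; SExtIdx records which result of the merged load pair still
// needs sign-extension: -1 none, 0 the first instruction, 1 the paired one.
struct LdStPairFlags {
  bool MergeForward = false;
  int SExtIdx = -1;
};

int main() {
  // e.g. pairing "ldrsw x0, [sp]" with "ldr w1, [sp, #4]": the pair is
  // emitted as LDPWi and result 0 still has to be sign-extended to 64 bits.
  LdStPairFlags Flags;
  Flags.SExtIdx = 0;
  std::printf("MergeForward=%d SExtIdx=%d\n", Flags.MergeForward, Flags.SExtIdx);
  return 0;
}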
+ int SExtIdx; + + LdStPairFlags() : MergeForward(false), SExtIdx(-1) {} + + void setMergeForward(bool V = true) { MergeForward = V; } + bool getMergeForward() const { return MergeForward; } + + void setSExtIdx(int V) { SExtIdx = V; } + int getSExtIdx() const { return SExtIdx; } + +} LdStPairFlags; + struct AArch64LoadStoreOpt : public MachineFunctionPass { static char ID; - AArch64LoadStoreOpt() : MachineFunctionPass(ID) {} + AArch64LoadStoreOpt() : MachineFunctionPass(ID) { + initializeAArch64LoadStoreOptPass(*PassRegistry::getPassRegistry()); + } const AArch64InstrInfo *TII; const TargetRegisterInfo *TRI; + const AArch64Subtarget *Subtarget; // Scan the instructions looking for a load/store that can be combined // with the current instruction into a load/store pair. // Return the matching instruction if one is found, else MBB->end(). - // If a matching instruction is found, MergeForward is set to true if the - // merge is to remove the first instruction and replace the second with - // a pair-wise insn, and false if the reverse is true. - // \p SExtIdx[out] gives the index of the result of the load pair that - // must be extended. The value of SExtIdx assumes that the paired load - // produces the value in this order: (I, returned iterator), i.e., - // -1 means no value has to be extended, 0 means I, and 1 means the - // returned iterator. MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I, - bool &MergeForward, int &SExtIdx, + LdStPairFlags &Flags, unsigned Limit); + + // Scan the instructions looking for a store that writes to the address from + // which the current load instruction reads. Return true if one is found. + bool findMatchingStore(MachineBasicBlock::iterator I, unsigned Limit, + MachineBasicBlock::iterator &StoreI); + // Merge the two instructions indicated into a single pair-wise instruction. // If MergeForward is true, erase the first instruction and fold its // operation into the second. If false, the reverse. Return the instruction // following the first instruction (which may change during processing). - // \p SExtIdx index of the result that must be extended for a paired load. - // -1 means none, 0 means I, and 1 means Paired. MachineBasicBlock::iterator mergePairedInsns(MachineBasicBlock::iterator I, - MachineBasicBlock::iterator Paired, bool MergeForward, - int SExtIdx); + MachineBasicBlock::iterator Paired, + const LdStPairFlags &Flags); + + // Promote the load that reads directly from the address stored to. + MachineBasicBlock::iterator + promoteLoadFromStore(MachineBasicBlock::iterator LoadI, + MachineBasicBlock::iterator StoreI); // Scan the instruction list to find a base register update that can // be combined with the current instruction (a load or store) using // pre or post indexed addressing with writeback. Scan forwards. MachineBasicBlock::iterator findMatchingUpdateInsnForward(MachineBasicBlock::iterator I, unsigned Limit, - int Value); + int UnscaledOffset); // Scan the instruction list to find a base register update that can // be combined with the current instruction (a load or store) using @@ -96,97 +127,177 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass { MachineBasicBlock::iterator findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I, unsigned Limit); - // Merge a pre-index base register update into a ld/st instruction. 
- MachineBasicBlock::iterator - mergePreIdxUpdateInsn(MachineBasicBlock::iterator I, - MachineBasicBlock::iterator Update); + // Find an instruction that updates the base register of the ld/st + // instruction. + bool isMatchingUpdateInsn(MachineInstr *MemMI, MachineInstr *MI, + unsigned BaseReg, int Offset); - // Merge a post-index base register update into a ld/st instruction. + // Merge a pre- or post-index base register update into a ld/st instruction. MachineBasicBlock::iterator - mergePostIdxUpdateInsn(MachineBasicBlock::iterator I, - MachineBasicBlock::iterator Update); + mergeUpdateInsn(MachineBasicBlock::iterator I, + MachineBasicBlock::iterator Update, bool IsPreIdx); + + // Find and merge foldable ldr/str instructions. + bool tryToMergeLdStInst(MachineBasicBlock::iterator &MBBI); - bool optimizeBlock(MachineBasicBlock &MBB); + // Find and promote load instructions which read directly from store. + bool tryToPromoteLoadFromStore(MachineBasicBlock::iterator &MBBI); + + // Check if converting two narrow loads into a single wider load with + // bitfield extracts could be enabled. + bool enableNarrowLdMerge(MachineFunction &Fn); + + bool optimizeBlock(MachineBasicBlock &MBB, bool enableNarrowLdOpt); bool runOnMachineFunction(MachineFunction &Fn) override; const char *getPassName() const override { - return "AArch64 load / store optimization pass"; + return AARCH64_LOAD_STORE_OPT_NAME; } - -private: - int getMemSize(MachineInstr *MemMI); }; char AArch64LoadStoreOpt::ID = 0; } // namespace -static bool isUnscaledLdst(unsigned Opc) { +INITIALIZE_PASS(AArch64LoadStoreOpt, "aarch64-ldst-opt", + AARCH64_LOAD_STORE_OPT_NAME, false, false) + +static bool isUnscaledLdSt(unsigned Opc) { switch (Opc) { default: return false; case AArch64::STURSi: - return true; case AArch64::STURDi: - return true; case AArch64::STURQi: - return true; + case AArch64::STURBBi: + case AArch64::STURHHi: case AArch64::STURWi: - return true; case AArch64::STURXi: - return true; case AArch64::LDURSi: - return true; case AArch64::LDURDi: - return true; case AArch64::LDURQi: - return true; case AArch64::LDURWi: - return true; case AArch64::LDURXi: - return true; case AArch64::LDURSWi: + case AArch64::LDURHHi: + case AArch64::LDURBBi: + case AArch64::LDURSBWi: + case AArch64::LDURSHWi: return true; } } -// Size in bytes of the data moved by an unscaled load or store -int AArch64LoadStoreOpt::getMemSize(MachineInstr *MemMI) { - switch (MemMI->getOpcode()) { +static bool isUnscaledLdSt(MachineInstr *MI) { + return isUnscaledLdSt(MI->getOpcode()); +} + +static unsigned getBitExtrOpcode(MachineInstr *MI) { + switch (MI->getOpcode()) { + default: + llvm_unreachable("Unexpected opcode."); + case AArch64::LDRBBui: + case AArch64::LDURBBi: + case AArch64::LDRHHui: + case AArch64::LDURHHi: + return AArch64::UBFMWri; + case AArch64::LDRSBWui: + case AArch64::LDURSBWi: + case AArch64::LDRSHWui: + case AArch64::LDURSHWi: + return AArch64::SBFMWri; + } +} + +static bool isNarrowStore(unsigned Opc) { + switch (Opc) { default: - llvm_unreachable("Opcode has unknown size!"); + return false; + case AArch64::STRBBui: + case AArch64::STURBBi: + case AArch64::STRHHui: + case AArch64::STURHHi: + return true; + } +} + +static bool isNarrowStore(MachineInstr *MI) { + return isNarrowStore(MI->getOpcode()); +} + +static bool isNarrowLoad(unsigned Opc) { + switch (Opc) { + default: + return false; + case AArch64::LDRHHui: + case AArch64::LDURHHi: + case AArch64::LDRBBui: + case AArch64::LDURBBi: + case AArch64::LDRSHWui: + case AArch64::LDURSHWi: 
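A quick aside relating the isUnscaledLdSt()/isNarrowLoad() predicates above to the getMemScale() helper that follows: scaled forms (LDR/STR ...ui) encode their immediate in units of the access size, while unscaled forms (LDUR/STUR ...i) encode it directly in bytes. The tiny sketch below only restates that relationship; the helper name is hypothetical:

#include <cassert>

// Byte offset addressed by a load/store immediate. Scale is the access
// size that getMemScale() returns; unscaled (LDUR/STUR) immediates are
// already byte offsets.
static int byteOffset(int Imm, int Scale, bool IsUnscaled) {
  return IsUnscaled ? Imm : Imm * Scale;
}

int main() {
  assert(byteOffset(2, 4, false) == 8); // ldr  w0, [x1, #8]  (scaled imm 2)
  assert(byteOffset(8, 4, true) == 8);  // ldur w0, [x1, #8]  (unscaled imm 8)
  assert(byteOffset(3, 2, false) == 6); // ldrh w0, [x1, #6]  (scaled imm 3)
  return 0;
}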
+ case AArch64::LDRSBWui: + case AArch64::LDURSBWi: + return true; + } +} + +static bool isNarrowLoad(MachineInstr *MI) { + return isNarrowLoad(MI->getOpcode()); +} + +// Scaling factor for unscaled load or store. +static int getMemScale(MachineInstr *MI) { + switch (MI->getOpcode()) { + default: + llvm_unreachable("Opcode has unknown scale!"); + case AArch64::LDRBBui: + case AArch64::LDURBBi: + case AArch64::LDRSBWui: + case AArch64::LDURSBWi: + case AArch64::STRBBui: + case AArch64::STURBBi: + return 1; + case AArch64::LDRHHui: + case AArch64::LDURHHi: + case AArch64::LDRSHWui: + case AArch64::LDURSHWi: + case AArch64::STRHHui: + case AArch64::STURHHi: + return 2; + case AArch64::LDRSui: + case AArch64::LDURSi: + case AArch64::LDRSWui: + case AArch64::LDURSWi: + case AArch64::LDRWui: + case AArch64::LDURWi: case AArch64::STRSui: case AArch64::STURSi: - return 4; - case AArch64::STRDui: - case AArch64::STURDi: - return 8; - case AArch64::STRQui: - case AArch64::STURQi: - return 16; case AArch64::STRWui: case AArch64::STURWi: - return 4; - case AArch64::STRXui: - case AArch64::STURXi: - return 8; - case AArch64::LDRSui: - case AArch64::LDURSi: + case AArch64::LDPSi: + case AArch64::LDPSWi: + case AArch64::LDPWi: + case AArch64::STPSi: + case AArch64::STPWi: return 4; case AArch64::LDRDui: case AArch64::LDURDi: + case AArch64::LDRXui: + case AArch64::LDURXi: + case AArch64::STRDui: + case AArch64::STURDi: + case AArch64::STRXui: + case AArch64::STURXi: + case AArch64::LDPDi: + case AArch64::LDPXi: + case AArch64::STPDi: + case AArch64::STPXi: return 8; case AArch64::LDRQui: case AArch64::LDURQi: + case AArch64::STRQui: + case AArch64::STURQi: + case AArch64::LDPQi: + case AArch64::STPQi: return 16; - case AArch64::LDRWui: - case AArch64::LDURWi: - return 4; - case AArch64::LDRXui: - case AArch64::LDURXi: - return 8; - case AArch64::LDRSWui: - case AArch64::LDURSWi: - return 4; } } @@ -203,6 +314,10 @@ static unsigned getMatchingNonSExtOpcode(unsigned Opc, case AArch64::STURDi: case AArch64::STRQui: case AArch64::STURQi: + case AArch64::STRBBui: + case AArch64::STURBBi: + case AArch64::STRHHui: + case AArch64::STURHHi: case AArch64::STRWui: case AArch64::STURWi: case AArch64::STRXui: @@ -219,11 +334,23 @@ static unsigned getMatchingNonSExtOpcode(unsigned Opc, case AArch64::STURSi: case AArch64::LDRSui: case AArch64::LDURSi: + case AArch64::LDRHHui: + case AArch64::LDURHHi: + case AArch64::LDRBBui: + case AArch64::LDURBBi: return Opc; case AArch64::LDRSWui: return AArch64::LDRWui; case AArch64::LDURSWi: return AArch64::LDURWi; + case AArch64::LDRSBWui: + return AArch64::LDRBBui; + case AArch64::LDRSHWui: + return AArch64::LDRHHui; + case AArch64::LDURSBWi: + return AArch64::LDURBBi; + case AArch64::LDURSHWi: + return AArch64::LDURHHi; } } @@ -240,6 +367,14 @@ static unsigned getMatchingPairOpcode(unsigned Opc) { case AArch64::STRQui: case AArch64::STURQi: return AArch64::STPQi; + case AArch64::STRBBui: + return AArch64::STRHHui; + case AArch64::STRHHui: + return AArch64::STRWui; + case AArch64::STURBBi: + return AArch64::STURHHi; + case AArch64::STURHHi: + return AArch64::STURWi; case AArch64::STRWui: case AArch64::STURWi: return AArch64::STPWi; @@ -264,6 +399,48 @@ static unsigned getMatchingPairOpcode(unsigned Opc) { case AArch64::LDRSWui: case AArch64::LDURSWi: return AArch64::LDPSWi; + case AArch64::LDRHHui: + case AArch64::LDRSHWui: + return AArch64::LDRWui; + case AArch64::LDURHHi: + case AArch64::LDURSHWi: + return AArch64::LDURWi; + case AArch64::LDRBBui: + case AArch64::LDRSBWui: + 
return AArch64::LDRHHui; + case AArch64::LDURBBi: + case AArch64::LDURSBWi: + return AArch64::LDURHHi; + } +} + +static unsigned isMatchingStore(MachineInstr *LoadInst, + MachineInstr *StoreInst) { + unsigned LdOpc = LoadInst->getOpcode(); + unsigned StOpc = StoreInst->getOpcode(); + switch (LdOpc) { + default: + llvm_unreachable("Unsupported load instruction!"); + case AArch64::LDRBBui: + return StOpc == AArch64::STRBBui || StOpc == AArch64::STRHHui || + StOpc == AArch64::STRWui || StOpc == AArch64::STRXui; + case AArch64::LDURBBi: + return StOpc == AArch64::STURBBi || StOpc == AArch64::STURHHi || + StOpc == AArch64::STURWi || StOpc == AArch64::STURXi; + case AArch64::LDRHHui: + return StOpc == AArch64::STRHHui || StOpc == AArch64::STRWui || + StOpc == AArch64::STRXui; + case AArch64::LDURHHi: + return StOpc == AArch64::STURHHi || StOpc == AArch64::STURWi || + StOpc == AArch64::STURXi; + case AArch64::LDRWui: + return StOpc == AArch64::STRWui || StOpc == AArch64::STRXui; + case AArch64::LDURWi: + return StOpc == AArch64::STURWi || StOpc == AArch64::STURXi; + case AArch64::LDRXui: + return StOpc == AArch64::STRXui; + case AArch64::LDURXi: + return StOpc == AArch64::STURXi; } } @@ -277,6 +454,10 @@ static unsigned getPreIndexedOpcode(unsigned Opc) { return AArch64::STRDpre; case AArch64::STRQui: return AArch64::STRQpre; + case AArch64::STRBBui: + return AArch64::STRBBpre; + case AArch64::STRHHui: + return AArch64::STRHHpre; case AArch64::STRWui: return AArch64::STRWpre; case AArch64::STRXui: @@ -287,12 +468,38 @@ static unsigned getPreIndexedOpcode(unsigned Opc) { return AArch64::LDRDpre; case AArch64::LDRQui: return AArch64::LDRQpre; + case AArch64::LDRBBui: + return AArch64::LDRBBpre; + case AArch64::LDRHHui: + return AArch64::LDRHHpre; case AArch64::LDRWui: return AArch64::LDRWpre; case AArch64::LDRXui: return AArch64::LDRXpre; case AArch64::LDRSWui: return AArch64::LDRSWpre; + case AArch64::LDPSi: + return AArch64::LDPSpre; + case AArch64::LDPSWi: + return AArch64::LDPSWpre; + case AArch64::LDPDi: + return AArch64::LDPDpre; + case AArch64::LDPQi: + return AArch64::LDPQpre; + case AArch64::LDPWi: + return AArch64::LDPWpre; + case AArch64::LDPXi: + return AArch64::LDPXpre; + case AArch64::STPSi: + return AArch64::STPSpre; + case AArch64::STPDi: + return AArch64::STPDpre; + case AArch64::STPQi: + return AArch64::STPQpre; + case AArch64::STPWi: + return AArch64::STPWpre; + case AArch64::STPXi: + return AArch64::STPXpre; } } @@ -306,6 +513,10 @@ static unsigned getPostIndexedOpcode(unsigned Opc) { return AArch64::STRDpost; case AArch64::STRQui: return AArch64::STRQpost; + case AArch64::STRBBui: + return AArch64::STRBBpost; + case AArch64::STRHHui: + return AArch64::STRHHpost; case AArch64::STRWui: return AArch64::STRWpost; case AArch64::STRXui: @@ -316,19 +527,111 @@ static unsigned getPostIndexedOpcode(unsigned Opc) { return AArch64::LDRDpost; case AArch64::LDRQui: return AArch64::LDRQpost; + case AArch64::LDRBBui: + return AArch64::LDRBBpost; + case AArch64::LDRHHui: + return AArch64::LDRHHpost; case AArch64::LDRWui: return AArch64::LDRWpost; case AArch64::LDRXui: return AArch64::LDRXpost; case AArch64::LDRSWui: return AArch64::LDRSWpost; + case AArch64::LDPSi: + return AArch64::LDPSpost; + case AArch64::LDPSWi: + return AArch64::LDPSWpost; + case AArch64::LDPDi: + return AArch64::LDPDpost; + case AArch64::LDPQi: + return AArch64::LDPQpost; + case AArch64::LDPWi: + return AArch64::LDPWpost; + case AArch64::LDPXi: + return AArch64::LDPXpost; + case AArch64::STPSi: + return 
AArch64::STPSpost; + case AArch64::STPDi: + return AArch64::STPDpost; + case AArch64::STPQi: + return AArch64::STPQpost; + case AArch64::STPWi: + return AArch64::STPWpost; + case AArch64::STPXi: + return AArch64::STPXpost; } } +static bool isPairedLdSt(const MachineInstr *MI) { + switch (MI->getOpcode()) { + default: + return false; + case AArch64::LDPSi: + case AArch64::LDPSWi: + case AArch64::LDPDi: + case AArch64::LDPQi: + case AArch64::LDPWi: + case AArch64::LDPXi: + case AArch64::STPSi: + case AArch64::STPDi: + case AArch64::STPQi: + case AArch64::STPWi: + case AArch64::STPXi: + return true; + } +} + +static const MachineOperand &getLdStRegOp(const MachineInstr *MI, + unsigned PairedRegOp = 0) { + assert(PairedRegOp < 2 && "Unexpected register operand idx."); + unsigned Idx = isPairedLdSt(MI) ? PairedRegOp : 0; + return MI->getOperand(Idx); +} + +static const MachineOperand &getLdStBaseOp(const MachineInstr *MI) { + unsigned Idx = isPairedLdSt(MI) ? 2 : 1; + return MI->getOperand(Idx); +} + +static const MachineOperand &getLdStOffsetOp(const MachineInstr *MI) { + unsigned Idx = isPairedLdSt(MI) ? 3 : 2; + return MI->getOperand(Idx); +} + +static bool isLdOffsetInRangeOfSt(MachineInstr *LoadInst, + MachineInstr *StoreInst) { + assert(isMatchingStore(LoadInst, StoreInst) && "Expect only matched ld/st."); + int LoadSize = getMemScale(LoadInst); + int StoreSize = getMemScale(StoreInst); + int UnscaledStOffset = isUnscaledLdSt(StoreInst) + ? getLdStOffsetOp(StoreInst).getImm() + : getLdStOffsetOp(StoreInst).getImm() * StoreSize; + int UnscaledLdOffset = isUnscaledLdSt(LoadInst) + ? getLdStOffsetOp(LoadInst).getImm() + : getLdStOffsetOp(LoadInst).getImm() * LoadSize; + return (UnscaledStOffset <= UnscaledLdOffset) && + (UnscaledLdOffset + LoadSize <= (UnscaledStOffset + StoreSize)); +} + +// Copy MachineMemOperands from Op0 and Op1 to a new array assigned to MI. +static void concatenateMemOperands(MachineInstr *MI, MachineInstr *Op0, + MachineInstr *Op1) { + assert(MI->memoperands_empty() && "expected a new machineinstr"); + size_t numMemRefs = (Op0->memoperands_end() - Op0->memoperands_begin()) + + (Op1->memoperands_end() - Op1->memoperands_begin()); + + MachineFunction *MF = MI->getParent()->getParent(); + MachineSDNode::mmo_iterator MemBegin = MF->allocateMemRefsArray(numMemRefs); + MachineSDNode::mmo_iterator MemEnd = + std::copy(Op0->memoperands_begin(), Op0->memoperands_end(), MemBegin); + MemEnd = std::copy(Op1->memoperands_begin(), Op1->memoperands_end(), MemEnd); + MI->setMemRefs(MemBegin, MemEnd); +} + MachineBasicBlock::iterator AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, MachineBasicBlock::iterator Paired, - bool MergeForward, int SExtIdx) { + const LdStPairFlags &Flags) { MachineBasicBlock::iterator NextI = I; ++NextI; // If NextI is the second of the two instructions to be merged, we need @@ -338,25 +641,26 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, if (NextI == Paired) ++NextI; + int SExtIdx = Flags.getSExtIdx(); unsigned Opc = SExtIdx == -1 ? I->getOpcode() : getMatchingNonSExtOpcode(I->getOpcode()); - bool IsUnscaled = isUnscaledLdst(Opc); - int OffsetStride = - IsUnscaled && EnableAArch64UnscaledMemOp ? getMemSize(I) : 1; + bool IsUnscaled = isUnscaledLdSt(Opc); + int OffsetStride = IsUnscaled ? getMemScale(I) : 1; + bool MergeForward = Flags.getMergeForward(); unsigned NewOpc = getMatchingPairOpcode(Opc); // Insert our new paired instruction after whichever of the paired // instructions MergeForward indicates. 
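A minimal sketch of the containment test isLdOffsetInRangeOfSt() defined above: once both immediates are normalized to byte offsets, a load is only a candidate for promotion when every byte it reads was written by the earlier store. The function name and the spot checks below are the editor's, not part of the patch:

#include <cassert>

static bool loadCoveredByStore(int LdOffset, int LdSize,
                               int StOffset, int StSize) {
  // Same condition as isLdOffsetInRangeOfSt(), with offsets already in bytes.
  return StOffset <= LdOffset && LdOffset + LdSize <= StOffset + StSize;
}

int main() {
  // str x0, [sp, #16] ; ldrh w1, [sp, #22]  -> bytes 22..23 lie inside 16..23
  assert(loadCoveredByStore(22, 2, 16, 8));
  // str w0, [sp, #16] ; ldr  x1, [sp, #16]  -> an 8-byte read is not covered
  assert(!loadCoveredByStore(16, 8, 16, 4));
  return 0;
}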
MachineBasicBlock::iterator InsertionPoint = MergeForward ? Paired : I; // Also based on MergeForward is from where we copy the base register operand // so we get the flags compatible with the input code. - MachineOperand &BaseRegOp = - MergeForward ? Paired->getOperand(1) : I->getOperand(1); + const MachineOperand &BaseRegOp = + MergeForward ? getLdStBaseOp(Paired) : getLdStBaseOp(I); // Which register is Rt and which is Rt2 depends on the offset order. MachineInstr *RtMI, *Rt2MI; - if (I->getOperand(2).getImm() == - Paired->getOperand(2).getImm() + OffsetStride) { + if (getLdStOffsetOp(I).getImm() == + getLdStOffsetOp(Paired).getImm() + OffsetStride) { RtMI = Paired; Rt2MI = I; // Here we swapped the assumption made for SExtIdx. @@ -368,18 +672,135 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, RtMI = I; Rt2MI = Paired; } - // Handle Unscaled - int OffsetImm = RtMI->getOperand(2).getImm(); - if (IsUnscaled && EnableAArch64UnscaledMemOp) - OffsetImm /= OffsetStride; + + int OffsetImm = getLdStOffsetOp(RtMI).getImm(); + + if (isNarrowLoad(Opc)) { + // Change the scaled offset from small to large type. + if (!IsUnscaled) { + assert(((OffsetImm & 1) == 0) && "Unexpected offset to merge"); + OffsetImm /= 2; + } + MachineInstr *RtNewDest = MergeForward ? I : Paired; + // When merging small (< 32 bit) loads for big-endian targets, the order of + // the component parts gets swapped. + if (!Subtarget->isLittleEndian()) + std::swap(RtMI, Rt2MI); + // Construct the new load instruction. + MachineInstr *NewMemMI, *BitExtMI1, *BitExtMI2; + NewMemMI = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), + TII->get(NewOpc)) + .addOperand(getLdStRegOp(RtNewDest)) + .addOperand(BaseRegOp) + .addImm(OffsetImm); + + // Copy MachineMemOperands from the original loads. + concatenateMemOperands(NewMemMI, I, Paired); + + DEBUG( + dbgs() + << "Creating the new load and extract. Replacing instructions:\n "); + DEBUG(I->print(dbgs())); + DEBUG(dbgs() << " "); + DEBUG(Paired->print(dbgs())); + DEBUG(dbgs() << " with instructions:\n "); + DEBUG((NewMemMI)->print(dbgs())); + + int Width = getMemScale(I) == 1 ? 8 : 16; + int LSBLow = 0; + int LSBHigh = Width; + int ImmsLow = LSBLow + Width - 1; + int ImmsHigh = LSBHigh + Width - 1; + MachineInstr *ExtDestMI = MergeForward ? Paired : I; + if ((ExtDestMI == Rt2MI) == Subtarget->isLittleEndian()) { + // Create the bitfield extract for high bits. + BitExtMI1 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), + TII->get(getBitExtrOpcode(Rt2MI))) + .addOperand(getLdStRegOp(Rt2MI)) + .addReg(getLdStRegOp(RtNewDest).getReg()) + .addImm(LSBHigh) + .addImm(ImmsHigh); + // Create the bitfield extract for low bits. + if (RtMI->getOpcode() == getMatchingNonSExtOpcode(RtMI->getOpcode())) { + // For unsigned, prefer to use AND for low bits. + BitExtMI2 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), + TII->get(AArch64::ANDWri)) + .addOperand(getLdStRegOp(RtMI)) + .addReg(getLdStRegOp(RtNewDest).getReg()) + .addImm(ImmsLow); + } else { + BitExtMI2 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), + TII->get(getBitExtrOpcode(RtMI))) + .addOperand(getLdStRegOp(RtMI)) + .addReg(getLdStRegOp(RtNewDest).getReg()) + .addImm(LSBLow) + .addImm(ImmsLow); + } + } else { + // Create the bitfield extract for low bits. + if (RtMI->getOpcode() == getMatchingNonSExtOpcode(RtMI->getOpcode())) { + // For unsigned, prefer to use AND for low bits. 
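A minimal sketch of the narrow-load merge being built in this hunk: two adjacent halfword (or byte) loads become a single load of twice the width, and the two original destinations are recovered with bitfield extracts at LSBLow = 0 and LSBHigh = Width (AND or UBFX for zero-extending loads, SBFX for sign-extending ones). The C++ below models only the little-endian, zero-extending halfword case; the names are hypothetical:

#include <cassert>
#include <cstdint>
#include <cstring>

// Two adjacent zero-extending halfword loads replaced by one 32-bit load
// plus two bitfield extracts (little-endian layout).
static void mergedLoadHH(const uint16_t *P, uint32_t &Lo, uint32_t &Hi) {
  uint32_t Wide;
  std::memcpy(&Wide, P, sizeof(Wide)); // the single LDRWui
  Lo = Wide & 0xFFFF;                  // ANDWri (a signed form would use SBFX)
  Hi = Wide >> 16;                     // UBFX #16, #16
}

int main() {
  const uint16_t Buf[2] = {0x1234, 0xBEEF};
  uint32_t Lo = 0, Hi = 0;
  mergedLoadHH(Buf, Lo, Hi);
  assert(Lo == 0x1234 && Hi == 0xBEEF);
  return 0;
}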
+ BitExtMI1 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), + TII->get(AArch64::ANDWri)) + .addOperand(getLdStRegOp(RtMI)) + .addReg(getLdStRegOp(RtNewDest).getReg()) + .addImm(ImmsLow); + } else { + BitExtMI1 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), + TII->get(getBitExtrOpcode(RtMI))) + .addOperand(getLdStRegOp(RtMI)) + .addReg(getLdStRegOp(RtNewDest).getReg()) + .addImm(LSBLow) + .addImm(ImmsLow); + } + + // Create the bitfield extract for high bits. + BitExtMI2 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), + TII->get(getBitExtrOpcode(Rt2MI))) + .addOperand(getLdStRegOp(Rt2MI)) + .addReg(getLdStRegOp(RtNewDest).getReg()) + .addImm(LSBHigh) + .addImm(ImmsHigh); + } + DEBUG(dbgs() << " "); + DEBUG((BitExtMI1)->print(dbgs())); + DEBUG(dbgs() << " "); + DEBUG((BitExtMI2)->print(dbgs())); + DEBUG(dbgs() << "\n"); + + // Erase the old instructions. + I->eraseFromParent(); + Paired->eraseFromParent(); + return NextI; + } // Construct the new instruction. - MachineInstrBuilder MIB = BuildMI(*I->getParent(), InsertionPoint, - I->getDebugLoc(), TII->get(NewOpc)) - .addOperand(RtMI->getOperand(0)) - .addOperand(Rt2MI->getOperand(0)) - .addOperand(BaseRegOp) - .addImm(OffsetImm); + MachineInstrBuilder MIB; + if (isNarrowStore(Opc)) { + // Change the scaled offset from small to large type. + if (!IsUnscaled) { + assert(((OffsetImm & 1) == 0) && "Unexpected offset to merge"); + OffsetImm /= 2; + } + MIB = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), + TII->get(NewOpc)) + .addOperand(getLdStRegOp(I)) + .addOperand(BaseRegOp) + .addImm(OffsetImm); + // Copy MachineMemOperands from the original stores. + concatenateMemOperands(MIB, I, Paired); + } else { + // Handle Unscaled + if (IsUnscaled) + OffsetImm /= OffsetStride; + MIB = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), + TII->get(NewOpc)) + .addOperand(getLdStRegOp(RtMI)) + .addOperand(getLdStRegOp(Rt2MI)) + .addOperand(BaseRegOp) + .addImm(OffsetImm); + } + (void)MIB; // FIXME: Do we need/want to copy the mem operands from the source @@ -439,13 +860,112 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, return NextI; } +MachineBasicBlock::iterator +AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI, + MachineBasicBlock::iterator StoreI) { + MachineBasicBlock::iterator NextI = LoadI; + ++NextI; + + int LoadSize = getMemScale(LoadI); + int StoreSize = getMemScale(StoreI); + unsigned LdRt = getLdStRegOp(LoadI).getReg(); + unsigned StRt = getLdStRegOp(StoreI).getReg(); + bool IsStoreXReg = TRI->getRegClass(AArch64::GPR64RegClassID)->contains(StRt); + + assert((IsStoreXReg || + TRI->getRegClass(AArch64::GPR32RegClassID)->contains(StRt)) && + "Unexpected RegClass"); + + MachineInstr *BitExtMI; + if (LoadSize == StoreSize && (LoadSize == 4 || LoadSize == 8)) { + // Remove the load, if the destination register of the loads is the same + // register for stored value. + if (StRt == LdRt && LoadSize == 8) { + DEBUG(dbgs() << "Remove load instruction:\n "); + DEBUG(LoadI->print(dbgs())); + DEBUG(dbgs() << "\n"); + LoadI->eraseFromParent(); + return NextI; + } + // Replace the load with a mov if the load and store are in the same size. + BitExtMI = + BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(), + TII->get(IsStoreXReg ? AArch64::ORRXrs : AArch64::ORRWrs), LdRt) + .addReg(IsStoreXReg ? 
AArch64::XZR : AArch64::WZR) + .addReg(StRt) + .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); + } else { + // FIXME: Currently we disable this transformation in big-endian targets as + // performance and correctness are verified only in little-endian. + if (!Subtarget->isLittleEndian()) + return NextI; + bool IsUnscaled = isUnscaledLdSt(LoadI); + assert(IsUnscaled == isUnscaledLdSt(StoreI) && "Unsupported ld/st match"); + assert(LoadSize <= StoreSize && "Invalid load size"); + int UnscaledLdOffset = IsUnscaled + ? getLdStOffsetOp(LoadI).getImm() + : getLdStOffsetOp(LoadI).getImm() * LoadSize; + int UnscaledStOffset = IsUnscaled + ? getLdStOffsetOp(StoreI).getImm() + : getLdStOffsetOp(StoreI).getImm() * StoreSize; + int Width = LoadSize * 8; + int Immr = 8 * (UnscaledLdOffset - UnscaledStOffset); + int Imms = Immr + Width - 1; + unsigned DestReg = IsStoreXReg + ? TRI->getMatchingSuperReg(LdRt, AArch64::sub_32, + &AArch64::GPR64RegClass) + : LdRt; + + assert((UnscaledLdOffset >= UnscaledStOffset && + (UnscaledLdOffset + LoadSize) <= UnscaledStOffset + StoreSize) && + "Invalid offset"); + + Immr = 8 * (UnscaledLdOffset - UnscaledStOffset); + Imms = Immr + Width - 1; + if (UnscaledLdOffset == UnscaledStOffset) { + uint32_t AndMaskEncoded = ((IsStoreXReg ? 1 : 0) << 12) // N + | ((Immr) << 6) // immr + | ((Imms) << 0) // imms + ; + + BitExtMI = + BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(), + TII->get(IsStoreXReg ? AArch64::ANDXri : AArch64::ANDWri), + DestReg) + .addReg(StRt) + .addImm(AndMaskEncoded); + } else { + BitExtMI = + BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(), + TII->get(IsStoreXReg ? AArch64::UBFMXri : AArch64::UBFMWri), + DestReg) + .addReg(StRt) + .addImm(Immr) + .addImm(Imms); + } + } + + DEBUG(dbgs() << "Promoting load by replacing :\n "); + DEBUG(StoreI->print(dbgs())); + DEBUG(dbgs() << " "); + DEBUG(LoadI->print(dbgs())); + DEBUG(dbgs() << " with instructions:\n "); + DEBUG(StoreI->print(dbgs())); + DEBUG(dbgs() << " "); + DEBUG((BitExtMI)->print(dbgs())); + DEBUG(dbgs() << "\n"); + + // Erase the old instructions. + LoadI->eraseFromParent(); + return NextI; +} + /// trackRegDefsUses - Remember what registers the specified instruction uses /// and modifies. -static void trackRegDefsUses(MachineInstr *MI, BitVector &ModifiedRegs, +static void trackRegDefsUses(const MachineInstr *MI, BitVector &ModifiedRegs, BitVector &UsedRegs, const TargetRegisterInfo *TRI) { - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - MachineOperand &MO = MI->getOperand(i); + for (const MachineOperand &MO : MI->operands()) { if (MO.isRegMask()) ModifiedRegs.setBitsNotInMask(MO.getRegMask()); @@ -464,16 +984,12 @@ static void trackRegDefsUses(MachineInstr *MI, BitVector &ModifiedRegs, } static bool inBoundsForPair(bool IsUnscaled, int Offset, int OffsetStride) { - if (!IsUnscaled && (Offset > 63 || Offset < -64)) - return false; - if (IsUnscaled) { - // Convert the byte-offset used by unscaled into an "element" offset used - // by the scaled pair load/store instructions. - int ElemOffset = Offset / OffsetStride; - if (ElemOffset > 63 || ElemOffset < -64) - return false; - } - return true; + // Convert the byte-offset used by unscaled into an "element" offset used + // by the scaled pair load/store instructions. 
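A minimal sketch of the narrower case of promoteLoadFromStore() above: a load that reads a subset of the bytes written by a preceding store is replaced by a bitfield extract of the stored register, with Immr = 8 * (load offset - store offset) and a field width of 8 * load size. Little-endian only, as the code itself notes; the helper name and the sample values are the editor's:

#include <cassert>
#include <cstdint>

// StoredVal is the register written by the store; LdByteOff is
// UnscaledLdOffset - UnscaledStOffset from the hunk above; LdSize is the
// load's access size in bytes (1, 2 or 4 here).
static uint64_t forwardedLoad(uint64_t StoredVal, int LdByteOff, int LdSize) {
  int Immr = 8 * LdByteOff;
  int Width = 8 * LdSize;
  uint64_t Mask = (1ULL << Width) - 1;
  return (StoredVal >> Immr) & Mask; // UBFM #Immr, #(Immr + Width - 1),
                                     // or an AND when the offsets coincide
}

int main() {
  // str x0, [sp] ; ldrh w1, [sp, #2]  ->  ubfx x1, x0, #16, #16
  assert(forwardedLoad(0x11223344AABBCCDDULL, 2, 2) == 0xAABB);
  return 0;
}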
+ if (IsUnscaled) + Offset /= OffsetStride; + + return Offset <= 63 && Offset >= -64; } // Do alignment, specialized to power of 2 and for signed ints, @@ -507,12 +1023,65 @@ static bool mayAlias(MachineInstr *MIa, return false; } +bool AArch64LoadStoreOpt::findMatchingStore( + MachineBasicBlock::iterator I, unsigned Limit, + MachineBasicBlock::iterator &StoreI) { + MachineBasicBlock::iterator E = I->getParent()->begin(); + MachineBasicBlock::iterator MBBI = I; + MachineInstr *FirstMI = I; + unsigned BaseReg = getLdStBaseOp(FirstMI).getReg(); + + // Track which registers have been modified and used between the first insn + // and the second insn. + BitVector ModifiedRegs, UsedRegs; + ModifiedRegs.resize(TRI->getNumRegs()); + UsedRegs.resize(TRI->getNumRegs()); + + for (unsigned Count = 0; MBBI != E && Count < Limit;) { + --MBBI; + MachineInstr *MI = MBBI; + // Skip DBG_VALUE instructions. Otherwise debug info can affect the + // optimization by changing how far we scan. + if (MI->isDebugValue()) + continue; + // Now that we know this is a real instruction, count it. + ++Count; + + // If the load instruction reads directly from the address to which the + // store instruction writes and the stored value is not modified, we can + // promote the load. Since we do not handle stores with pre-/post-index, + // it's unnecessary to check if BaseReg is modified by the store itself. + if (MI->mayStore() && isMatchingStore(FirstMI, MI) && + BaseReg == getLdStBaseOp(MI).getReg() && + isLdOffsetInRangeOfSt(FirstMI, MI) && + !ModifiedRegs[getLdStRegOp(MI).getReg()]) { + StoreI = MBBI; + return true; + } + + if (MI->isCall()) + return false; + + // Update modified / uses register lists. + trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); + + // Otherwise, if the base register is modified, we have no match, so + // return early. + if (ModifiedRegs[BaseReg]) + return false; + + // If we encounter a store aliased with the load, return early. + if (MI->mayStore() && mayAlias(FirstMI, MI, TII)) + return false; + } + return false; +} + /// findMatchingInsn - Scan the instructions looking for a load/store that can /// be combined with the current instruction into a load/store pair. MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, - bool &MergeForward, int &SExtIdx, - unsigned Limit) { + LdStPairFlags &Flags, unsigned Limit) { MachineBasicBlock::iterator E = I->getParent()->end(); MachineBasicBlock::iterator MBBI = I; MachineInstr *FirstMI = I; @@ -520,21 +1089,27 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, unsigned Opc = FirstMI->getOpcode(); bool MayLoad = FirstMI->mayLoad(); - bool IsUnscaled = isUnscaledLdst(Opc); - unsigned Reg = FirstMI->getOperand(0).getReg(); - unsigned BaseReg = FirstMI->getOperand(1).getReg(); - int Offset = FirstMI->getOperand(2).getImm(); + bool IsUnscaled = isUnscaledLdSt(FirstMI); + unsigned Reg = getLdStRegOp(FirstMI).getReg(); + unsigned BaseReg = getLdStBaseOp(FirstMI).getReg(); + int Offset = getLdStOffsetOp(FirstMI).getImm(); + bool IsNarrowStore = isNarrowStore(Opc); + + // For narrow stores, find only the case where the stored value is WZR. + if (IsNarrowStore && Reg != AArch64::WZR) + return E; // Early exit if the first instruction modifies the base register. // e.g., ldr x0, [x0] - // Early exit if the offset if not possible to match. 
(6 bits of positive - // range, plus allow an extra one in case we find a later insn that matches - // with Offset-1 if (FirstMI->modifiesRegister(BaseReg, TRI)) return E; - int OffsetStride = - IsUnscaled && EnableAArch64UnscaledMemOp ? getMemSize(FirstMI) : 1; - if (!inBoundsForPair(IsUnscaled, Offset, OffsetStride)) + + // Early exit if the offset if not possible to match. (6 bits of positive + // range, plus allow an extra one in case we find a later insn that matches + // with Offset-1) + int OffsetStride = IsUnscaled ? getMemScale(FirstMI) : 1; + if (!(isNarrowLoad(Opc) || IsNarrowStore) && + !inBoundsForPair(IsUnscaled, Offset, OffsetStride)) return E; // Track which registers have been modified and used between the first insn @@ -557,18 +1132,19 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, ++Count; bool CanMergeOpc = Opc == MI->getOpcode(); - SExtIdx = -1; + Flags.setSExtIdx(-1); if (!CanMergeOpc) { bool IsValidLdStrOpc; unsigned NonSExtOpc = getMatchingNonSExtOpcode(Opc, &IsValidLdStrOpc); - if (!IsValidLdStrOpc) - continue; + assert(IsValidLdStrOpc && + "Given Opc should be a Load or Store with an immediate"); // Opc will be the first instruction in the pair. - SExtIdx = NonSExtOpc == (unsigned)Opc ? 1 : 0; + Flags.setSExtIdx(NonSExtOpc == (unsigned)Opc ? 1 : 0); CanMergeOpc = NonSExtOpc == getMatchingNonSExtOpcode(MI->getOpcode()); } - if (CanMergeOpc && MI->getOperand(2).isImm()) { + if (CanMergeOpc && getLdStOffsetOp(MI).isImm()) { + assert(MI->mayLoadOrStore() && "Expected memory operation."); // If we've found another instruction with the same opcode, check to see // if the base and offset are compatible with our starting instruction. // These instructions all have scaled immediate operands, so we just @@ -579,8 +1155,8 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, // Pairwise instructions have a 7-bit signed offset field. Single insns // have a 12-bit unsigned offset field. To be a valid combine, the // final offset must be in range. - unsigned MIBaseReg = MI->getOperand(1).getReg(); - int MIOffset = MI->getOperand(2).getImm(); + unsigned MIBaseReg = getLdStBaseOp(MI).getReg(); + int MIOffset = getLdStOffsetOp(MI).getImm(); if (BaseReg == MIBaseReg && ((Offset == MIOffset + OffsetStride) || (Offset + OffsetStride == MIOffset))) { int MinOffset = Offset < MIOffset ? Offset : MIOffset; @@ -591,30 +1167,43 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, return E; // If the resultant immediate offset of merging these instructions // is out of range for a pairwise instruction, bail and keep looking. - bool MIIsUnscaled = isUnscaledLdst(MI->getOpcode()); - if (!inBoundsForPair(MIIsUnscaled, MinOffset, OffsetStride)) { + bool MIIsUnscaled = isUnscaledLdSt(MI); + bool IsNarrowLoad = isNarrowLoad(MI->getOpcode()); + if (!IsNarrowLoad && + !inBoundsForPair(MIIsUnscaled, MinOffset, OffsetStride)) { trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); - if (MI->mayLoadOrStore()) - MemInsns.push_back(MI); + MemInsns.push_back(MI); continue; } - // If the alignment requirements of the paired (scaled) instruction - // can't express the offset of the unscaled input, bail and keep - // looking. 
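A minimal sketch of the adjacency test findMatchingInsn() applies in this hunk: two candidates are only considered for pairing when their immediates differ by exactly one element, where OffsetStride is 1 for scaled forms and the access size for unscaled ones. The function and examples below are the editor's restatement:

#include <cassert>

static bool offsetsAreAdjacent(int Offset, int MIOffset, int OffsetStride) {
  return Offset == MIOffset + OffsetStride ||
         Offset + OffsetStride == MIOffset;
}

int main() {
  assert(offsetsAreAdjacent(4, 5, 1));  // ldr x0,[sp,#32] ; ldr x1,[sp,#40]
  assert(!offsetsAreAdjacent(4, 6, 1)); // two elements apart: not pairable
  assert(offsetsAreAdjacent(0, 8, 8));  // ldur x0,[sp] ; ldur x1,[sp,#8]
  return 0;
}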
- if (IsUnscaled && EnableAArch64UnscaledMemOp && - (alignTo(MinOffset, OffsetStride) != MinOffset)) { - trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); - if (MI->mayLoadOrStore()) + + if (IsNarrowLoad || IsNarrowStore) { + // If the alignment requirements of the scaled wide load/store + // instruction can't express the offset of the scaled narrow + // input, bail and keep looking. + if (!IsUnscaled && alignTo(MinOffset, 2) != MinOffset) { + trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); MemInsns.push_back(MI); - continue; + continue; + } + } else { + // If the alignment requirements of the paired (scaled) instruction + // can't express the offset of the unscaled input, bail and keep + // looking. + if (IsUnscaled && (alignTo(MinOffset, OffsetStride) != MinOffset)) { + trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); + MemInsns.push_back(MI); + continue; + } } // If the destination register of the loads is the same register, bail // and keep looking. A load-pair instruction with both destination // registers the same is UNPREDICTABLE and will result in an exception. - if (MayLoad && Reg == MI->getOperand(0).getReg()) { + // For narrow stores, allow only when the stored value is the same + // (i.e., WZR). + if ((MayLoad && Reg == getLdStRegOp(MI).getReg()) || + (IsNarrowStore && Reg != getLdStRegOp(MI).getReg())) { trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); - if (MI->mayLoadOrStore()) - MemInsns.push_back(MI); + MemInsns.push_back(MI); continue; } @@ -622,10 +1211,10 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, // the two instructions and none of the instructions between the second // and first alias with the second, we can combine the second into the // first. - if (!ModifiedRegs[MI->getOperand(0).getReg()] && - !(MI->mayLoad() && UsedRegs[MI->getOperand(0).getReg()]) && + if (!ModifiedRegs[getLdStRegOp(MI).getReg()] && + !(MI->mayLoad() && UsedRegs[getLdStRegOp(MI).getReg()]) && !mayAlias(MI, MemInsns, TII)) { - MergeForward = false; + Flags.setMergeForward(false); return MBBI; } @@ -633,11 +1222,10 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, // between the two instructions and none of the instructions between the // first and the second alias with the first, we can combine the first // into the second. - if (!ModifiedRegs[FirstMI->getOperand(0).getReg()] && - !(FirstMI->mayLoad() && - UsedRegs[FirstMI->getOperand(0).getReg()]) && + if (!ModifiedRegs[getLdStRegOp(FirstMI).getReg()] && + !(MayLoad && UsedRegs[getLdStRegOp(FirstMI).getReg()]) && !mayAlias(FirstMI, MemInsns, TII)) { - MergeForward = true; + Flags.setMergeForward(true); return MBBI; } // Unable to combine these instructions due to interference in between. @@ -666,51 +1254,9 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, } MachineBasicBlock::iterator -AArch64LoadStoreOpt::mergePreIdxUpdateInsn(MachineBasicBlock::iterator I, - MachineBasicBlock::iterator Update) { - assert((Update->getOpcode() == AArch64::ADDXri || - Update->getOpcode() == AArch64::SUBXri) && - "Unexpected base register update instruction to merge!"); - MachineBasicBlock::iterator NextI = I; - // Return the instruction following the merged instruction, which is - // the instruction following our unmerged load. Unless that's the add/sub - // instruction we're merging, in which case it's the one after that. 
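One detail the flag name only hints at: roughly speaking, merging backward (MergeForward == false) emits the new paired instruction at the position of the earlier access, while merging forward sinks it to the position of the later one; which direction is legal follows from the interference checks above. A schematic of the two shapes, in comment form only:

// Merge backward (MergeForward == false): the pair replaces the first access,
// the second access is erased, everything in between stays where it was.
//   ldr x0, [x2]                   ldp x0, x1, [x2]
//   <insns not touching x1>   =>   <insns not touching x1>
//   ldr x1, [x2, #8]
//
// Merge forward (MergeForward == true): the pair replaces the second access.
//   ldr x0, [x2]                   <insns not touching x0>
//   <insns not touching x0>   =>   ldp x0, x1, [x2]
//   ldr x1, [x2, #8]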
- if (++NextI == Update) - ++NextI; - - int Value = Update->getOperand(2).getImm(); - assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 && - "Can't merge 1 << 12 offset into pre-indexed load / store"); - if (Update->getOpcode() == AArch64::SUBXri) - Value = -Value; - - unsigned NewOpc = getPreIndexedOpcode(I->getOpcode()); - MachineInstrBuilder MIB = - BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc)) - .addOperand(Update->getOperand(0)) - .addOperand(I->getOperand(0)) - .addOperand(I->getOperand(1)) - .addImm(Value); - (void)MIB; - - DEBUG(dbgs() << "Creating pre-indexed load/store."); - DEBUG(dbgs() << " Replacing instructions:\n "); - DEBUG(I->print(dbgs())); - DEBUG(dbgs() << " "); - DEBUG(Update->print(dbgs())); - DEBUG(dbgs() << " with instruction:\n "); - DEBUG(((MachineInstr *)MIB)->print(dbgs())); - DEBUG(dbgs() << "\n"); - - // Erase the old instructions for the block. - I->eraseFromParent(); - Update->eraseFromParent(); - - return NextI; -} - -MachineBasicBlock::iterator AArch64LoadStoreOpt::mergePostIdxUpdateInsn( - MachineBasicBlock::iterator I, MachineBasicBlock::iterator Update) { +AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I, + MachineBasicBlock::iterator Update, + bool IsPreIdx) { assert((Update->getOpcode() == AArch64::ADDXri || Update->getOpcode() == AArch64::SUBXri) && "Unexpected base register update instruction to merge!"); @@ -723,20 +1269,36 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::mergePostIdxUpdateInsn( int Value = Update->getOperand(2).getImm(); assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 && - "Can't merge 1 << 12 offset into post-indexed load / store"); + "Can't merge 1 << 12 offset into pre-/post-indexed load / store"); if (Update->getOpcode() == AArch64::SUBXri) Value = -Value; - unsigned NewOpc = getPostIndexedOpcode(I->getOpcode()); - MachineInstrBuilder MIB = - BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc)) - .addOperand(Update->getOperand(0)) - .addOperand(I->getOperand(0)) - .addOperand(I->getOperand(1)) - .addImm(Value); + unsigned NewOpc = IsPreIdx ? getPreIndexedOpcode(I->getOpcode()) + : getPostIndexedOpcode(I->getOpcode()); + MachineInstrBuilder MIB; + if (!isPairedLdSt(I)) { + // Non-paired instruction. + MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc)) + .addOperand(getLdStRegOp(Update)) + .addOperand(getLdStRegOp(I)) + .addOperand(getLdStBaseOp(I)) + .addImm(Value); + } else { + // Paired instruction. 
+ int Scale = getMemScale(I); + MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc)) + .addOperand(getLdStRegOp(Update)) + .addOperand(getLdStRegOp(I, 0)) + .addOperand(getLdStRegOp(I, 1)) + .addOperand(getLdStBaseOp(I)) + .addImm(Value / Scale); + } (void)MIB; - DEBUG(dbgs() << "Creating post-indexed load/store."); + if (IsPreIdx) + DEBUG(dbgs() << "Creating pre-indexed load/store."); + else + DEBUG(dbgs() << "Creating post-indexed load/store."); DEBUG(dbgs() << " Replacing instructions:\n "); DEBUG(I->print(dbgs())); DEBUG(dbgs() << " "); @@ -752,8 +1314,9 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::mergePostIdxUpdateInsn( return NextI; } -static bool isMatchingUpdateInsn(MachineInstr *MI, unsigned BaseReg, - int Offset) { +bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr *MemMI, + MachineInstr *MI, + unsigned BaseReg, int Offset) { switch (MI->getOpcode()) { default: break; @@ -769,44 +1332,65 @@ static bool isMatchingUpdateInsn(MachineInstr *MI, unsigned BaseReg, // Watch out for 1 << 12 shifted value. if (AArch64_AM::getShiftValue(MI->getOperand(3).getImm())) break; - // If the instruction has the base register as source and dest and the - // immediate will fit in a signed 9-bit integer, then we have a match. - if (MI->getOperand(0).getReg() == BaseReg && - MI->getOperand(1).getReg() == BaseReg && - MI->getOperand(2).getImm() <= 255 && - MI->getOperand(2).getImm() >= -256) { - // If we have a non-zero Offset, we check that it matches the amount - // we're adding to the register. - if (!Offset || Offset == MI->getOperand(2).getImm()) - return true; + + // The update instruction source and destination register must be the + // same as the load/store base register. + if (MI->getOperand(0).getReg() != BaseReg || + MI->getOperand(1).getReg() != BaseReg) + break; + + bool IsPairedInsn = isPairedLdSt(MemMI); + int UpdateOffset = MI->getOperand(2).getImm(); + // For non-paired load/store instructions, the immediate must fit in a + // signed 9-bit integer. + if (!IsPairedInsn && (UpdateOffset > 255 || UpdateOffset < -256)) + break; + + // For paired load/store instructions, the immediate must be a multiple of + // the scaling factor. The scaled offset must also fit into a signed 7-bit + // integer. + if (IsPairedInsn) { + int Scale = getMemScale(MemMI); + if (UpdateOffset % Scale != 0) + break; + + int ScaledOffset = UpdateOffset / Scale; + if (ScaledOffset > 64 || ScaledOffset < -64) + break; } + + // If we have a non-zero Offset, we check that it matches the amount + // we're adding to the register. + if (!Offset || Offset == MI->getOperand(2).getImm()) + return true; break; } return false; } MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward( - MachineBasicBlock::iterator I, unsigned Limit, int Value) { + MachineBasicBlock::iterator I, unsigned Limit, int UnscaledOffset) { MachineBasicBlock::iterator E = I->getParent()->end(); MachineInstr *MemMI = I; MachineBasicBlock::iterator MBBI = I; - const MachineFunction &MF = *MemMI->getParent()->getParent(); - unsigned DestReg = MemMI->getOperand(0).getReg(); - unsigned BaseReg = MemMI->getOperand(1).getReg(); - int Offset = MemMI->getOperand(2).getImm() * - TII->getRegClass(MemMI->getDesc(), 0, TRI, MF)->getSize(); + unsigned BaseReg = getLdStBaseOp(MemMI).getReg(); + int MIUnscaledOffset = getLdStOffsetOp(MemMI).getImm() * getMemScale(MemMI); - // If the base register overlaps the destination register, we can't - // merge the update. 
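The writeback-immediate rules above are the fiddly part of isMatchingUpdateInsn, so here is the same legality condition pulled out into a free function (a sketch only; the pass checks the ADDXri/SUBXri operands in place):

// True if an add/sub of the base register by UpdateOffset bytes can be folded
// into a pre-/post-indexed form of the given memory access.
static bool updateFitsWriteback(int UpdateOffset, bool IsPairedInsn, int Scale) {
  if (!IsPairedInsn)
    return UpdateOffset >= -256 && UpdateOffset <= 255;  // signed 9-bit field
  if (UpdateOffset % Scale != 0)
    return false;                                        // must be a multiple of the scale
  int Scaled = UpdateOffset / Scale;
  return Scaled >= -64 && Scaled <= 64;                  // bound used by the pass above
}

For example, an LDP of two X registers (Scale == 8) can absorb an update of #128 but not #132, while a plain LDR can absorb any update in [-256, 255].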
- if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg)) + // Scan forward looking for post-index opportunities. Updating instructions + // can't be formed if the memory instruction doesn't have the offset we're + // looking for. + if (MIUnscaledOffset != UnscaledOffset) return E; - // Scan forward looking for post-index opportunities. - // Updating instructions can't be formed if the memory insn already - // has an offset other than the value we're looking for. - if (Offset != Value) - return E; + // If the base register overlaps a destination register, we can't + // merge the update. + bool IsPairedInsn = isPairedLdSt(MemMI); + for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) { + unsigned DestReg = getLdStRegOp(MemMI, i).getReg(); + if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg)) + return E; + } // Track which registers have been modified and used between the first insn // (inclusive) and the second insn. @@ -825,7 +1409,7 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward( ++Count; // If we found a match, return it. - if (isMatchingUpdateInsn(MI, BaseReg, Value)) + if (isMatchingUpdateInsn(I, MI, BaseReg, UnscaledOffset)) return MBBI; // Update the status of what the instruction clobbered and used. @@ -845,21 +1429,22 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward( MachineBasicBlock::iterator E = I->getParent()->end(); MachineInstr *MemMI = I; MachineBasicBlock::iterator MBBI = I; - const MachineFunction &MF = *MemMI->getParent()->getParent(); - unsigned DestReg = MemMI->getOperand(0).getReg(); - unsigned BaseReg = MemMI->getOperand(1).getReg(); - int Offset = MemMI->getOperand(2).getImm(); - unsigned RegSize = TII->getRegClass(MemMI->getDesc(), 0, TRI, MF)->getSize(); + unsigned BaseReg = getLdStBaseOp(MemMI).getReg(); + int Offset = getLdStOffsetOp(MemMI).getImm(); // If the load/store is the first instruction in the block, there's obviously // not any matching update. Ditto if the memory offset isn't zero. if (MBBI == B || Offset != 0) return E; - // If the base register overlaps the destination register, we can't + // If the base register overlaps a destination register, we can't // merge the update. - if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg)) - return E; + bool IsPairedInsn = isPairedLdSt(MemMI); + for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) { + unsigned DestReg = getLdStRegOp(MemMI, i).getReg(); + if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg)) + return E; + } // Track which registers have been modified and used between the first insn // (inclusive) and the second insn. @@ -878,7 +1463,7 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward( ++Count; // If we found a match, return it. - if (isMatchingUpdateInsn(MI, BaseReg, RegSize)) + if (isMatchingUpdateInsn(I, MI, BaseReg, Offset)) return MBBI; // Update the status of what the instruction clobbered and used. @@ -892,17 +1477,101 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward( return E; } -bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) { +bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore( + MachineBasicBlock::iterator &MBBI) { + MachineInstr *MI = MBBI; + // If this is a volatile load, don't mess with it. + if (MI->hasOrderedMemoryRef()) + return false; + + // Make sure this is a reg+imm. + // FIXME: It is possible to extend it to handle reg+reg cases. 
+ if (!getLdStOffsetOp(MI).isImm()) + return false; + + // Look backward up to ScanLimit instructions. + MachineBasicBlock::iterator StoreI; + if (findMatchingStore(MBBI, ScanLimit, StoreI)) { + ++NumLoadsFromStoresPromoted; + // Promote the load. Keeping the iterator straight is a + // pain, so we let the merge routine tell us what the next instruction + // is after it's done mucking about. + MBBI = promoteLoadFromStore(MBBI, StoreI); + return true; + } + return false; +} + +bool AArch64LoadStoreOpt::tryToMergeLdStInst( + MachineBasicBlock::iterator &MBBI) { + MachineInstr *MI = MBBI; + MachineBasicBlock::iterator E = MI->getParent()->end(); + // If this is a volatile load/store, don't mess with it. + if (MI->hasOrderedMemoryRef()) + return false; + + // Make sure this is a reg+imm (as opposed to an address reloc). + if (!getLdStOffsetOp(MI).isImm()) + return false; + + // Check if this load/store has a hint to avoid pair formation. + // MachineMemOperands hints are set by the AArch64StorePairSuppress pass. + if (TII->isLdStPairSuppressed(MI)) + return false; + + // Look ahead up to ScanLimit instructions for a pairable instruction. + LdStPairFlags Flags; + MachineBasicBlock::iterator Paired = findMatchingInsn(MBBI, Flags, ScanLimit); + if (Paired != E) { + if (isNarrowLoad(MI)) { + ++NumNarrowLoadsPromoted; + } else if (isNarrowStore(MI)) { + ++NumZeroStoresPromoted; + } else { + ++NumPairCreated; + if (isUnscaledLdSt(MI)) + ++NumUnscaledPairCreated; + } + + // Merge the loads into a pair. Keeping the iterator straight is a + // pain, so we let the merge routine tell us what the next instruction + // is after it's done mucking about. + MBBI = mergePairedInsns(MBBI, Paired, Flags); + return true; + } + return false; +} + +bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB, + bool enableNarrowLdOpt) { bool Modified = false; - // Two tranformations to do here: - // 1) Find loads and stores that can be merged into a single load or store + // Three tranformations to do here: + // 1) Find loads that directly read from stores and promote them by + // replacing with mov instructions. If the store is wider than the load, + // the load will be replaced with a bitfield extract. + // e.g., + // str w1, [x0, #4] + // ldrh w2, [x0, #6] + // ; becomes + // str w1, [x0, #4] + // lsr w2, w1, #16 + // 2) Find narrow loads that can be converted into a single wider load + // with bitfield extract instructions. + // e.g., + // ldrh w0, [x2] + // ldrh w1, [x2, #2] + // ; becomes + // ldr w0, [x2] + // ubfx w1, w0, #16, #16 + // and w0, w0, #ffff + // 3) Find loads and stores that can be merged into a single load or store // pair instruction. // e.g., // ldr x0, [x2] // ldr x1, [x2, #8] // ; becomes // ldp x0, x1, [x2] - // 2) Find base register updates that can be merged into the load or store + // 4) Find base register updates that can be merged into the load or store // as a base-reg writeback. // e.g., // ldr x0, [x2] @@ -918,6 +1587,69 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) { // Just move on to the next instruction. ++MBBI; break; + // Scaled instructions. + case AArch64::LDRBBui: + case AArch64::LDRHHui: + case AArch64::LDRWui: + case AArch64::LDRXui: + // Unscaled instructions. + case AArch64::LDURBBi: + case AArch64::LDURHHi: + case AArch64::LDURWi: + case AArch64::LDURXi: { + if (tryToPromoteLoadFromStore(MBBI)) { + Modified = true; + break; + } + ++MBBI; + break; + } + // FIXME: Do the other instructions. 
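For transformation 1 above the rewrite is just byte-offset arithmetic: the promoted load becomes a shift/extract of the register that was stored. A little-endian sketch of that bookkeeping (the helper name is illustrative; promoteLoadFromStore builds the actual bitfield-extract MachineInstr):

#include <cassert>
#include <utility>

// Given a store covering [StOff, StOff+StSize) and a narrower load of
// [LdOff, LdOff+LdSize) contained within it, return the (lsb, width) of the
// bitfield of the stored register that the load would have produced.
static std::pair<unsigned, unsigned> promotedField(int StOff, int StSize,
                                                   int LdOff, int LdSize) {
  assert(LdOff >= StOff && LdOff + LdSize <= StOff + StSize &&
         "load must read only bytes written by the store");
  unsigned Lsb = unsigned(LdOff - StOff) * 8;   // little-endian byte order
  unsigned Width = unsigned(LdSize) * 8;
  return {Lsb, Width};
}
// e.g. str w1, [x0, #4]; ldrh w2, [x0, #6] gives Lsb 16, Width 16 -- exactly
// the "lsr w2, w1, #16" shown in the comment above.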
+ } + } + + for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); + enableNarrowLdOpt && MBBI != E;) { + MachineInstr *MI = MBBI; + switch (MI->getOpcode()) { + default: + // Just move on to the next instruction. + ++MBBI; + break; + // Scaled instructions. + case AArch64::LDRBBui: + case AArch64::LDRHHui: + case AArch64::LDRSBWui: + case AArch64::LDRSHWui: + case AArch64::STRBBui: + case AArch64::STRHHui: + // Unscaled instructions. + case AArch64::LDURBBi: + case AArch64::LDURHHi: + case AArch64::LDURSBWi: + case AArch64::LDURSHWi: + case AArch64::STURBBi: + case AArch64::STURHHi: { + if (tryToMergeLdStInst(MBBI)) { + Modified = true; + break; + } + ++MBBI; + break; + } + // FIXME: Do the other instructions. + } + } + + for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); + MBBI != E;) { + MachineInstr *MI = MBBI; + switch (MI->getOpcode()) { + default: + // Just move on to the next instruction. + ++MBBI; + break; + // Scaled instructions. case AArch64::STRSui: case AArch64::STRDui: case AArch64::STRQui: @@ -929,7 +1661,7 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) { case AArch64::LDRXui: case AArch64::LDRWui: case AArch64::LDRSWui: - // do the unscaled versions as well + // Unscaled instructions. case AArch64::STURSi: case AArch64::STURDi: case AArch64::STURQi: @@ -941,37 +1673,8 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) { case AArch64::LDURWi: case AArch64::LDURXi: case AArch64::LDURSWi: { - // If this is a volatile load/store, don't mess with it. - if (MI->hasOrderedMemoryRef()) { - ++MBBI; - break; - } - // Make sure this is a reg+imm (as opposed to an address reloc). - if (!MI->getOperand(2).isImm()) { - ++MBBI; - break; - } - // Check if this load/store has a hint to avoid pair formation. - // MachineMemOperands hints are set by the AArch64StorePairSuppress pass. - if (TII->isLdStPairSuppressed(MI)) { - ++MBBI; - break; - } - // Look ahead up to ScanLimit instructions for a pairable instruction. - bool MergeForward = false; - int SExtIdx = -1; - MachineBasicBlock::iterator Paired = - findMatchingInsn(MBBI, MergeForward, SExtIdx, ScanLimit); - if (Paired != E) { - // Merge the loads into a pair. Keeping the iterator straight is a - // pain, so we let the merge routine tell us what the next instruction - // is after it's done mucking about. - MBBI = mergePairedInsns(MBBI, Paired, MergeForward, SExtIdx); - + if (tryToMergeLdStInst(MBBI)) { Modified = true; - ++NumPairCreated; - if (isUnscaledLdst(MI->getOpcode())) - ++NumUnscaledPairCreated; break; } ++MBBI; @@ -992,17 +1695,22 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) { // Just move on to the next instruction. ++MBBI; break; + // Scaled instructions. case AArch64::STRSui: case AArch64::STRDui: case AArch64::STRQui: case AArch64::STRXui: case AArch64::STRWui: + case AArch64::STRHHui: + case AArch64::STRBBui: case AArch64::LDRSui: case AArch64::LDRDui: case AArch64::LDRQui: case AArch64::LDRXui: case AArch64::LDRWui: - // do the unscaled versions as well + case AArch64::LDRHHui: + case AArch64::LDRBBui: + // Unscaled instructions. case AArch64::STURSi: case AArch64::STURDi: case AArch64::STURQi: @@ -1012,25 +1720,41 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) { case AArch64::LDURDi: case AArch64::LDURQi: case AArch64::LDURWi: - case AArch64::LDURXi: { + case AArch64::LDURXi: + // Paired instructions. 
+ case AArch64::LDPSi: + case AArch64::LDPSWi: + case AArch64::LDPDi: + case AArch64::LDPQi: + case AArch64::LDPWi: + case AArch64::LDPXi: + case AArch64::STPSi: + case AArch64::STPDi: + case AArch64::STPQi: + case AArch64::STPWi: + case AArch64::STPXi: { // Make sure this is a reg+imm (as opposed to an address reloc). - if (!MI->getOperand(2).isImm()) { + if (!getLdStOffsetOp(MI).isImm()) { ++MBBI; break; } - // Look ahead up to ScanLimit instructions for a mergable instruction. + // Look forward to try to form a post-index instruction. For example, + // ldr x0, [x20] + // add x20, x20, #32 + // merged into: + // ldr x0, [x20], #32 MachineBasicBlock::iterator Update = findMatchingUpdateInsnForward(MBBI, ScanLimit, 0); if (Update != E) { // Merge the update into the ld/st. - MBBI = mergePostIdxUpdateInsn(MBBI, Update); + MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/false); Modified = true; ++NumPostFolded; break; } // Don't know how to handle pre/post-index versions, so move to the next // instruction. - if (isUnscaledLdst(Opc)) { + if (isUnscaledLdSt(Opc)) { ++MBBI; break; } @@ -1043,28 +1767,25 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) { Update = findMatchingUpdateInsnBackward(MBBI, ScanLimit); if (Update != E) { // Merge the update into the ld/st. - MBBI = mergePreIdxUpdateInsn(MBBI, Update); + MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true); Modified = true; ++NumPreFolded; break; } + // The immediate in the load/store is scaled by the size of the memory + // operation. The immediate in the add we're looking for, + // however, is not, so adjust here. + int UnscaledOffset = getLdStOffsetOp(MI).getImm() * getMemScale(MI); // Look forward to try to find a post-index instruction. For example, // ldr x1, [x0, #64] // add x0, x0, #64 // merged into: // ldr x1, [x0, #64]! - - // The immediate in the load/store is scaled by the size of the register - // being loaded. The immediate in the add we're looking for, - // however, is not, so adjust here. - int Value = MI->getOperand(2).getImm() * - TII->getRegClass(MI->getDesc(), 0, TRI, *(MBB.getParent())) - ->getSize(); - Update = findMatchingUpdateInsnForward(MBBI, ScanLimit, Value); + Update = findMatchingUpdateInsnForward(MBBI, ScanLimit, UnscaledOffset); if (Update != E) { // Merge the update into the ld/st. - MBBI = mergePreIdxUpdateInsn(MBBI, Update); + MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true); Modified = true; ++NumPreFolded; break; @@ -1081,13 +1802,24 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) { return Modified; } +bool AArch64LoadStoreOpt::enableNarrowLdMerge(MachineFunction &Fn) { + bool ProfitableArch = Subtarget->isCortexA57(); + // FIXME: The benefit from converting narrow loads into a wider load could be + // microarchitectural as it assumes that a single load with two bitfield + // extracts is cheaper than two narrow loads. Currently, this conversion is + // enabled only in cortex-a57 on which performance benefits were verified. 
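The comment above is the whole cost argument for enableNarrowLdMerge: one wide load plus the extract/mask sequence has to beat two narrow loads on the target at hand. A toy statement of that trade-off (purely illustrative; the pass relies on the measured Cortex-A57 behaviour, not on a formula):

static int narrowLdMergeGain(int LoadCost, int AluCost) {
  int TwoNarrowLoads = 2 * LoadCost;
  int MergedForm = LoadCost + 2 * AluCost;  // ldr + ubfx + and, as shown earlier
  return TwoNarrowLoads - MergedForm;       // > 0 means merging looks cheaper
}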
+ return ProfitableArch && !Subtarget->requiresStrictAlign(); +} + bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) { - TII = static_cast<const AArch64InstrInfo *>(Fn.getSubtarget().getInstrInfo()); - TRI = Fn.getSubtarget().getRegisterInfo(); + Subtarget = &static_cast<const AArch64Subtarget &>(Fn.getSubtarget()); + TII = static_cast<const AArch64InstrInfo *>(Subtarget->getInstrInfo()); + TRI = Subtarget->getRegisterInfo(); bool Modified = false; + bool enableNarrowLdOpt = enableNarrowLdMerge(Fn); for (auto &MBB : Fn) - Modified |= optimizeBlock(MBB); + Modified |= optimizeBlock(MBB, enableNarrowLdOpt); return Modified; } @@ -1095,8 +1827,8 @@ bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) { // FIXME: Do we need/want a pre-alloc pass like ARM has to try to keep // loads and stores near one another? -/// createARMLoadStoreOptimizationPass - returns an instance of the load / store -/// optimization pass. +/// createAArch64LoadStoreOptimizationPass - returns an instance of the +/// load / store optimization pass. FunctionPass *llvm::createAArch64LoadStoreOptimizationPass() { return new AArch64LoadStoreOpt(); } diff --git a/contrib/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp b/contrib/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp index 580427ab3cc1..2b4cdf1083be 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp @@ -207,9 +207,9 @@ bool AArch64MCInstLower::lowerOperand(const MachineOperand &MO, void AArch64MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const { OutMI.setOpcode(MI->getOpcode()); - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + for (const MachineOperand &MO : MI->operands()) { MCOperand MCOp; - if (lowerOperand(MI->getOperand(i), MCOp)) + if (lowerOperand(MO, MCOp)) OutMI.addOperand(MCOp); } } diff --git a/contrib/llvm/lib/Target/AArch64/AArch64MachineCombinerPattern.h b/contrib/llvm/lib/Target/AArch64/AArch64MachineCombinerPattern.h deleted file mode 100644 index 4164b3364559..000000000000 --- a/contrib/llvm/lib/Target/AArch64/AArch64MachineCombinerPattern.h +++ /dev/null @@ -1,42 +0,0 @@ -//===- AArch64MachineCombinerPattern.h -===// -//===- AArch64 instruction pattern supported by combiner -===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// -// -// This file defines instruction pattern supported by combiner -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64MACHINECOMBINERPATTERN_H -#define LLVM_LIB_TARGET_AARCH64_AARCH64MACHINECOMBINERPATTERN_H - -namespace llvm { - -/// Enumeration of instruction pattern supported by machine combiner -/// -/// -namespace MachineCombinerPattern { -enum MC_PATTERN : int { - MC_NONE = 0, - MC_MULADDW_OP1 = 1, - MC_MULADDW_OP2 = 2, - MC_MULSUBW_OP1 = 3, - MC_MULSUBW_OP2 = 4, - MC_MULADDWI_OP1 = 5, - MC_MULSUBWI_OP1 = 6, - MC_MULADDX_OP1 = 7, - MC_MULADDX_OP2 = 8, - MC_MULSUBX_OP1 = 9, - MC_MULSUBX_OP2 = 10, - MC_MULADDXI_OP1 = 11, - MC_MULSUBXI_OP1 = 12 -}; -} // end namespace MachineCombinerPattern -} // end namespace llvm - -#endif diff --git a/contrib/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/contrib/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h index 536a8d0f97a0..318f83953505 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h +++ b/contrib/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h @@ -1,4 +1,4 @@ -//=- AArch64MachineFuctionInfo.h - AArch64 machine function info --*- C++ -*-=// +//=- AArch64MachineFunctionInfo.h - AArch64 machine function info -*- C++ -*-=// // // The LLVM Compiler Infrastructure // @@ -42,7 +42,7 @@ class AArch64FunctionInfo : public MachineFunctionInfo { unsigned ArgumentStackToRestore; /// HasStackFrame - True if this function has a stack frame. Set by - /// processFunctionBeforeCalleeSavedScan(). + /// determineCalleeSaves(). bool HasStackFrame; /// \brief Amount of stack frame size, not including callee-saved registers. @@ -72,16 +72,22 @@ class AArch64FunctionInfo : public MachineFunctionInfo { /// registers. unsigned VarArgsFPRSize; + /// True if this function has a subset of CSRs that is handled explicitly via + /// copies. + bool IsSplitCSR; + public: AArch64FunctionInfo() : BytesInStackArgArea(0), ArgumentStackToRestore(0), HasStackFrame(false), NumLocalDynamicTLSAccesses(0), VarArgsStackIndex(0), VarArgsGPRIndex(0), - VarArgsGPRSize(0), VarArgsFPRIndex(0), VarArgsFPRSize(0) {} + VarArgsGPRSize(0), VarArgsFPRIndex(0), VarArgsFPRSize(0), + IsSplitCSR(false) {} explicit AArch64FunctionInfo(MachineFunction &MF) : BytesInStackArgArea(0), ArgumentStackToRestore(0), HasStackFrame(false), NumLocalDynamicTLSAccesses(0), VarArgsStackIndex(0), VarArgsGPRIndex(0), - VarArgsGPRSize(0), VarArgsFPRIndex(0), VarArgsFPRSize(0) { + VarArgsGPRSize(0), VarArgsFPRIndex(0), VarArgsFPRSize(0), + IsSplitCSR(false) { (void)MF; } @@ -96,6 +102,9 @@ public: bool hasStackFrame() const { return HasStackFrame; } void setHasStackFrame(bool s) { HasStackFrame = s; } + bool isSplitCSR() const { return IsSplitCSR; } + void setIsSplitCSR(bool s) { IsSplitCSR = s; } + void setLocalStackSize(unsigned Size) { LocalStackSize = Size; } unsigned getLocalStackSize() const { return LocalStackSize; } diff --git a/contrib/llvm/lib/Target/AArch64/AArch64PromoteConstant.cpp b/contrib/llvm/lib/Target/AArch64/AArch64PromoteConstant.cpp index e1b93bf07c89..79c09d9f058d 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64PromoteConstant.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64PromoteConstant.cpp @@ -489,7 +489,7 @@ bool AArch64PromoteConstant::insertDefinitions( for (const auto &IPI : InsertPts) { // Create the load of the global variable. 
- IRBuilder<> Builder(IPI.first->getParent(), IPI.first); + IRBuilder<> Builder(IPI.first); LoadInst *LoadedCst = Builder.CreateLoad(PromotedGV); DEBUG(dbgs() << "**********\n"); DEBUG(dbgs() << "New def: "); @@ -540,7 +540,7 @@ bool AArch64PromoteConstant::runOnFunction(Function &F) { bool LocalChange = false; SmallPtrSet<Constant *, 8> AlreadyChecked; - for (Instruction &I : inst_range(&F)) { + for (Instruction &I : instructions(&F)) { // Traverse the operand, looking for constant vectors. Replace them by a // load of a global variable of constant vector type. for (Value *Op : I.operand_values()) { diff --git a/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp index 841af55f7a65..32b4888f2f64 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -15,6 +15,7 @@ #include "AArch64RegisterInfo.h" #include "AArch64FrameLowering.h" #include "AArch64InstrInfo.h" +#include "AArch64MachineFunctionInfo.h" #include "AArch64Subtarget.h" #include "MCTargetDesc/AArch64AddressingModes.h" #include "llvm/ADT/BitVector.h" @@ -34,10 +35,6 @@ using namespace llvm; #define GET_REGINFO_TARGET_DESC #include "AArch64GenRegisterInfo.inc" -static cl::opt<bool> -ReserveX18("aarch64-reserve-x18", cl::Hidden, - cl::desc("Reserve X18, making it unavailable as GPR")); - AArch64RegisterInfo::AArch64RegisterInfo(const Triple &TT) : AArch64GenRegisterInfo(AArch64::LR), TT(TT) {} @@ -50,10 +47,23 @@ AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { return CSR_AArch64_NoRegs_SaveList; if (MF->getFunction()->getCallingConv() == CallingConv::AnyReg) return CSR_AArch64_AllRegs_SaveList; + if (MF->getFunction()->getCallingConv() == CallingConv::CXX_FAST_TLS) + return MF->getInfo<AArch64FunctionInfo>()->isSplitCSR() ? 
+ CSR_AArch64_CXX_TLS_Darwin_PE_SaveList : + CSR_AArch64_CXX_TLS_Darwin_SaveList; else return CSR_AArch64_AAPCS_SaveList; } +const MCPhysReg *AArch64RegisterInfo::getCalleeSavedRegsViaCopy( + const MachineFunction *MF) const { + assert(MF && "Invalid MachineFunction pointer."); + if (MF->getFunction()->getCallingConv() == CallingConv::CXX_FAST_TLS && + MF->getInfo<AArch64FunctionInfo>()->isSplitCSR()) + return CSR_AArch64_CXX_TLS_Darwin_ViaCopy_SaveList; + return nullptr; +} + const uint32_t * AArch64RegisterInfo::getCallPreservedMask(const MachineFunction &MF, CallingConv::ID CC) const { @@ -62,6 +72,8 @@ AArch64RegisterInfo::getCallPreservedMask(const MachineFunction &MF, return CSR_AArch64_NoRegs_RegMask; if (CC == CallingConv::AnyReg) return CSR_AArch64_AllRegs_RegMask; + if (CC == CallingConv::CXX_FAST_TLS) + return CSR_AArch64_CXX_TLS_Darwin_RegMask; else return CSR_AArch64_AAPCS_RegMask; } @@ -104,7 +116,7 @@ AArch64RegisterInfo::getReservedRegs(const MachineFunction &MF) const { Reserved.set(AArch64::W29); } - if (TT.isOSDarwin() || ReserveX18) { + if (MF.getSubtarget<AArch64Subtarget>().isX18Reserved()) { Reserved.set(AArch64::X18); // Platform register Reserved.set(AArch64::W18); } @@ -131,7 +143,7 @@ bool AArch64RegisterInfo::isReservedReg(const MachineFunction &MF, return true; case AArch64::X18: case AArch64::W18: - return TT.isOSDarwin() || ReserveX18; + return MF.getSubtarget<AArch64Subtarget>().isX18Reserved(); case AArch64::FP: case AArch64::W29: return TFI->hasFP(MF) || TT.isOSDarwin(); @@ -186,29 +198,6 @@ bool AArch64RegisterInfo::hasBasePointer(const MachineFunction &MF) const { return false; } -bool AArch64RegisterInfo::canRealignStack(const MachineFunction &MF) const { - - if (MF.getFunction()->hasFnAttribute("no-realign-stack")) - return false; - - return true; -} - -// FIXME: share this with other backends with identical implementation? 
-bool -AArch64RegisterInfo::needsStackRealignment(const MachineFunction &MF) const { - const MachineFrameInfo *MFI = MF.getFrameInfo(); - const AArch64FrameLowering *TFI = getFrameLowering(MF); - const Function *F = MF.getFunction(); - unsigned StackAlign = TFI->getStackAlignment(); - bool requiresRealignment = - ((MFI->getMaxAlignment() > StackAlign) || - F->getAttributes().hasAttribute(AttributeSet::FunctionIndex, - Attribute::StackAlignment)); - - return requiresRealignment && canRealignStack(MF); -} - unsigned AArch64RegisterInfo::getFrameRegister(const MachineFunction &MF) const { const AArch64FrameLowering *TFI = getFrameLowering(MF); @@ -424,10 +413,11 @@ unsigned AArch64RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, case AArch64::GPR64RegClassID: case AArch64::GPR32commonRegClassID: case AArch64::GPR64commonRegClassID: - return 32 - 1 // XZR/SP - - (TFI->hasFP(MF) || TT.isOSDarwin()) // FP - - (TT.isOSDarwin() || ReserveX18) // X18 reserved as platform register - - hasBasePointer(MF); // X19 + return 32 - 1 // XZR/SP + - (TFI->hasFP(MF) || TT.isOSDarwin()) // FP + - MF.getSubtarget<AArch64Subtarget>() + .isX18Reserved() // X18 reserved as platform register + - hasBasePointer(MF); // X19 case AArch64::FPR8RegClassID: case AArch64::FPR16RegClassID: case AArch64::FPR32RegClassID: diff --git a/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.h b/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.h index 8c379d926108..f33f788fd437 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.h +++ b/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.h @@ -35,6 +35,8 @@ public: /// Code Generation virtual methods... const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override; + const MCPhysReg * + getCalleeSavedRegsViaCopy(const MachineFunction *MF) const override; const uint32_t *getCallPreservedMask(const MachineFunction &MF, CallingConv::ID) const override; @@ -93,9 +95,6 @@ public: unsigned getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const override; - // Base pointer (stack realignment) support. - bool canRealignStack(const MachineFunction &MF) const; - bool needsStackRealignment(const MachineFunction &MF) const override; }; } // end namespace llvm diff --git a/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.td b/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.td index b2efca023372..a8c8b176efa9 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.td +++ b/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.td @@ -407,7 +407,7 @@ def FPR128 : RegisterClass<"AArch64", // The lower 16 vector registers. Some instructions can only take registers // in this range. def FPR128_lo : RegisterClass<"AArch64", - [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64, v8f16], 128, (trunc FPR128, 16)>; // Pairs, triples, and quads of 64-bit vector registers. diff --git a/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.cpp index 486efd6ce3a2..f6ee8cf47a6a 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.cpp @@ -31,6 +31,11 @@ static cl::opt<bool> EnableEarlyIfConvert("aarch64-early-ifcvt", cl::desc("Enable the early if " "converter pass"), cl::init(true), cl::Hidden); +// If OS supports TBI, use this flag to enable it. 
+static cl::opt<bool> +UseAddressTopByteIgnored("aarch64-use-tbi", cl::desc("Assume that top byte of " + "an address is ignored"), cl::init(false), cl::Hidden); + AArch64Subtarget & AArch64Subtarget::initializeSubtargetDependencies(StringRef FS) { // Determine default and user-specified characteristics @@ -46,9 +51,11 @@ AArch64Subtarget::AArch64Subtarget(const Triple &TT, const std::string &CPU, const std::string &FS, const TargetMachine &TM, bool LittleEndian) : AArch64GenSubtargetInfo(TT, CPU, FS), ARMProcFamily(Others), - HasV8_1aOps(false), HasFPARMv8(false), HasNEON(false), HasCrypto(false), - HasCRC(false), HasZeroCycleRegMove(false), HasZeroCycleZeroing(false), - IsLittle(LittleEndian), CPUString(CPU), TargetTriple(TT), FrameLowering(), + HasV8_1aOps(false), HasV8_2aOps(false), HasFPARMv8(false), HasNEON(false), + HasCrypto(false), HasCRC(false), HasPerfMon(false), HasFullFP16(false), + HasZeroCycleRegMove(false), HasZeroCycleZeroing(false), + StrictAlign(false), ReserveX18(TT.isOSDarwin()), IsLittle(LittleEndian), + CPUString(CPU), TargetTriple(TT), FrameLowering(), InstrInfo(initializeSubtargetDependencies(FS)), TSInfo(), TLInfo(TM, *this) {} @@ -113,12 +120,30 @@ void AArch64Subtarget::overrideSchedPolicy(MachineSchedPolicy &Policy, // bi-directional scheduling. 253.perlbmk. Policy.OnlyTopDown = false; Policy.OnlyBottomUp = false; + // Enabling or Disabling the latency heuristic is a close call: It seems to + // help nearly no benchmark on out-of-order architectures, on the other hand + // it regresses register pressure on a few benchmarking. + if (isCyclone()) + Policy.DisableLatencyHeuristic = true; } bool AArch64Subtarget::enableEarlyIfConversion() const { return EnableEarlyIfConvert; } +bool AArch64Subtarget::supportsAddressTopByteIgnored() const { + if (!UseAddressTopByteIgnored) + return false; + + if (TargetTriple.isiOS()) { + unsigned Major, Minor, Micro; + TargetTriple.getiOSVersion(Major, Minor, Micro); + return Major >= 8; + } + + return false; +} + std::unique_ptr<PBQPRAConstraint> AArch64Subtarget::getCustomPBQPConstraints() const { if (!isCortexA57()) diff --git a/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.h b/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.h index 6bb069423060..1b8b9b27719c 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.h +++ b/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.h @@ -33,17 +33,21 @@ class Triple; class AArch64Subtarget : public AArch64GenSubtargetInfo { protected: - enum ARMProcFamilyEnum {Others, CortexA53, CortexA57, Cyclone}; + enum ARMProcFamilyEnum {Others, CortexA35, CortexA53, CortexA57, Cyclone}; /// ARMProcFamily - ARM processor family: Cortex-A53, Cortex-A57, and others. ARMProcFamilyEnum ARMProcFamily; bool HasV8_1aOps; + bool HasV8_2aOps; bool HasFPARMv8; bool HasNEON; bool HasCrypto; bool HasCRC; + bool HasPerfMon; + bool HasFullFP16; + bool HasSPE; // HasZeroCycleRegMove - Has zero-cycle register mov instructions. bool HasZeroCycleRegMove; @@ -51,6 +55,12 @@ protected: // HasZeroCycleZeroing - Has zero-cycle zeroing instructions. bool HasZeroCycleZeroing; + // StrictAlign - Disallow unaligned memory accesses. + bool StrictAlign; + + // ReserveX18 - X18 is not available as a general purpose register. + bool ReserveX18; + bool IsLittle; /// CPUString - String name of used CPU. 
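supportsAddressTopByteIgnored() above keys off the ARMv8 TBI (top-byte-ignore) feature: when the OS enables it, bits 63:56 of a data address are ignored by translation, so a tagged pointer can be dereferenced directly and the backend may fold away explicit clearing of those bits around memory accesses. The kind of user-level tagging this makes cheap, as a sketch (the helper is illustrative, not part of LLVM):

#include <cstdint>

// Pack an 8-bit tag into the architecturally ignored top byte of a pointer.
static inline void *tagPointer(void *P, uint8_t Tag) {
  uintptr_t Bits = reinterpret_cast<uintptr_t>(P) & ~(uintptr_t(0xff) << 56);
  return reinterpret_cast<void *>(Bits | (uintptr_t(Tag) << 56));
}
// With TBI active the tagged pointer can be used as-is in loads and stores;
// without it the tag would have to be masked off before every access.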
@@ -92,19 +102,30 @@ public: const Triple &getTargetTriple() const { return TargetTriple; } bool enableMachineScheduler() const override { return true; } bool enablePostRAScheduler() const override { - return isCortexA53() || isCortexA57(); + return isGeneric() || isCortexA53() || isCortexA57(); } bool hasV8_1aOps() const { return HasV8_1aOps; } + bool hasV8_2aOps() const { return HasV8_2aOps; } bool hasZeroCycleRegMove() const { return HasZeroCycleRegMove; } bool hasZeroCycleZeroing() const { return HasZeroCycleZeroing; } + bool requiresStrictAlign() const { return StrictAlign; } + + bool isX18Reserved() const { return ReserveX18; } bool hasFPARMv8() const { return HasFPARMv8; } bool hasNEON() const { return HasNEON; } bool hasCrypto() const { return HasCrypto; } bool hasCRC() const { return HasCRC; } + /// CPU has TBI (top byte of addresses is ignored during HW address + /// translation) and OS enables it. + bool supportsAddressTopByteIgnored() const; + + bool hasPerfMon() const { return HasPerfMon; } + bool hasFullFP16() const { return HasFullFP16; } + bool hasSPE() const { return HasSPE; } bool isLittleEndian() const { return IsLittle; } @@ -112,11 +133,13 @@ public: bool isTargetIOS() const { return TargetTriple.isiOS(); } bool isTargetLinux() const { return TargetTriple.isOSLinux(); } bool isTargetWindows() const { return TargetTriple.isOSWindows(); } + bool isTargetAndroid() const { return TargetTriple.isAndroid(); } bool isTargetCOFF() const { return TargetTriple.isOSBinFormatCOFF(); } bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); } bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); } + bool isGeneric() const { return CPUString == "generic"; } bool isCyclone() const { return CPUString == "cyclone"; } bool isCortexA57() const { return CPUString == "cortex-a57"; } bool isCortexA53() const { return CPUString == "cortex-a53"; } diff --git a/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp index db6e244337a7..c52c5544fc7e 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -203,7 +203,7 @@ public: } // namespace TargetIRAnalysis AArch64TargetMachine::getTargetIRAnalysis() { - return TargetIRAnalysis([this](Function &F) { + return TargetIRAnalysis([this](const Function &F) { return TargetTransformInfo(AArch64TTIImpl(this, F)); }); } diff --git a/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index e085cca35f1c..9af0e6444789 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -23,7 +23,7 @@ using namespace llvm; /// \brief Calculate the cost of materializing a 64-bit value. This helper /// method might only calculate a fraction of a larger immediate. Therefore it /// is valid to return a cost of ZERO. -unsigned AArch64TTIImpl::getIntImmCost(int64_t Val) { +int AArch64TTIImpl::getIntImmCost(int64_t Val) { // Check if the immediate can be encoded within an instruction. if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64)) return 0; @@ -37,7 +37,7 @@ unsigned AArch64TTIImpl::getIntImmCost(int64_t Val) { } /// \brief Calculate the cost of materializing the given constant. 
-unsigned AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) { +int AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) { assert(Ty->isIntegerTy()); unsigned BitSize = Ty->getPrimitiveSizeInBits(); @@ -51,18 +51,18 @@ unsigned AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) { // Split the constant into 64-bit chunks and calculate the cost for each // chunk. - unsigned Cost = 0; + int Cost = 0; for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) { APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64); int64_t Val = Tmp.getSExtValue(); Cost += getIntImmCost(Val); } // We need at least one instruction to materialze the constant. - return std::max(1U, Cost); + return std::max(1, Cost); } -unsigned AArch64TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, - const APInt &Imm, Type *Ty) { +int AArch64TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, + const APInt &Imm, Type *Ty) { assert(Ty->isIntegerTy()); unsigned BitSize = Ty->getPrimitiveSizeInBits(); @@ -118,17 +118,17 @@ unsigned AArch64TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, } if (Idx == ImmIdx) { - unsigned NumConstants = (BitSize + 63) / 64; - unsigned Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty); + int NumConstants = (BitSize + 63) / 64; + int Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty); return (Cost <= NumConstants * TTI::TCC_Basic) - ? static_cast<unsigned>(TTI::TCC_Free) + ? static_cast<int>(TTI::TCC_Free) : Cost; } return AArch64TTIImpl::getIntImmCost(Imm, Ty); } -unsigned AArch64TTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, - const APInt &Imm, Type *Ty) { +int AArch64TTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, + const APInt &Imm, Type *Ty) { assert(Ty->isIntegerTy()); unsigned BitSize = Ty->getPrimitiveSizeInBits(); @@ -147,10 +147,10 @@ unsigned AArch64TTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, case Intrinsic::smul_with_overflow: case Intrinsic::umul_with_overflow: if (Idx == 1) { - unsigned NumConstants = (BitSize + 63) / 64; - unsigned Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty); + int NumConstants = (BitSize + 63) / 64; + int Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty); return (Cost <= NumConstants * TTI::TCC_Basic) - ? static_cast<unsigned>(TTI::TCC_Free) + ? static_cast<int>(TTI::TCC_Free) : Cost; } break; @@ -176,8 +176,7 @@ AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) { return TTI::PSK_Software; } -unsigned AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, - Type *Src) { +int AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { int ISD = TLI->InstructionOpcodeToISD(Opcode); assert(ISD && "Invalid opcode"); @@ -187,7 +186,31 @@ unsigned AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, if (!SrcTy.isSimple() || !DstTy.isSimple()) return BaseT::getCastInstrCost(Opcode, Dst, Src); - static const TypeConversionCostTblEntry<MVT> ConversionTbl[] = { + static const TypeConversionCostTblEntry + ConversionTbl[] = { + { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 }, + { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 0 }, + { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 }, + { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 }, + + // The number of shll instructions for the extension. 
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, + { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, + { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2 }, + { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2 }, + { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 }, + { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 }, + { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 }, + { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 }, + { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 }, + { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 }, + { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 }, + { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 }, + { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 }, + { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 }, + { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 }, + { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 }, + // LowerVectorINT_TO_FP: { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 }, { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, @@ -210,6 +233,16 @@ unsigned AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 }, { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 }, + // Complex: to v8f32 + { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 }, + { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 }, + { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 }, + { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 }, + + // Complex: to v16f32 + { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 }, + { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 }, + // Complex: to v2f64 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 }, { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 }, @@ -250,22 +283,21 @@ unsigned AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 2 }, }; - int Idx = ConvertCostTableLookup<MVT>( - ConversionTbl, array_lengthof(ConversionTbl), ISD, DstTy.getSimpleVT(), - SrcTy.getSimpleVT()); - if (Idx != -1) - return ConversionTbl[Idx].Cost; + if (const auto *Entry = ConvertCostTableLookup(ConversionTbl, ISD, + DstTy.getSimpleVT(), + SrcTy.getSimpleVT())) + return Entry->Cost; return BaseT::getCastInstrCost(Opcode, Dst, Src); } -unsigned AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, - unsigned Index) { +int AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, + unsigned Index) { assert(Val->isVectorTy() && "This must be a vector type"); if (Index != -1U) { // Legalize the type. - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Val); + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Val); // This type is legalized to a scalar type. if (!LT.second.isVector()) @@ -281,15 +313,15 @@ unsigned AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, } // All other insert/extracts cost this much. - return 2; + return 3; } -unsigned AArch64TTIImpl::getArithmeticInstrCost( +int AArch64TTIImpl::getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo, TTI::OperandValueProperties Opd2PropInfo) { // Legalize the type. - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); int ISD = TLI->InstructionOpcodeToISD(Opcode); @@ -300,10 +332,9 @@ unsigned AArch64TTIImpl::getArithmeticInstrCost( // normally expanded to the sequence ADD + CMP + SELECT + SRA. // The OperandValue properties many not be same as that of previous // operation; conservatively assume OP_None. 
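As a usage sketch for the conversion entries added above (illustrative only; the caller is assumed to already hold an LLVMContext and a TargetTransformInfo computed for an AArch64 function, and the function name is made up):

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/LLVMContext.h"

static int widenCostExample(llvm::LLVMContext &Ctx,
                            const llvm::TargetTransformInfo &TTI) {
  llvm::Type *SrcTy = llvm::VectorType::get(llvm::Type::getInt8Ty(Ctx), 8);
  llvm::Type *DstTy = llvm::VectorType::get(llvm::Type::getInt32Ty(Ctx), 8);
  // With the table above, v8i8 -> v8i32 zero-extension is modelled as 3 shll
  // steps; a v4i32 -> v4i16 truncate would come back as 1.
  return TTI.getCastInstrCost(llvm::Instruction::ZExt, DstTy, SrcTy);
}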
- unsigned Cost = - getArithmeticInstrCost(Instruction::Add, Ty, Opd1Info, Opd2Info, - TargetTransformInfo::OP_None, - TargetTransformInfo::OP_None); + int Cost = getArithmeticInstrCost(Instruction::Add, Ty, Opd1Info, Opd2Info, + TargetTransformInfo::OP_None, + TargetTransformInfo::OP_None); Cost += getArithmeticInstrCost(Instruction::Sub, Ty, Opd1Info, Opd2Info, TargetTransformInfo::OP_None, TargetTransformInfo::OP_None); @@ -331,7 +362,7 @@ unsigned AArch64TTIImpl::getArithmeticInstrCost( } } -unsigned AArch64TTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) { +int AArch64TTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) { // Address computations in vectorized code with non-consecutive addresses will // likely result in more instructions compared to scalar code where the // computation can more often be merged into the index mode. The resulting @@ -346,19 +377,20 @@ unsigned AArch64TTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) { return 1; } -unsigned AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, - Type *CondTy) { +int AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, + Type *CondTy) { int ISD = TLI->InstructionOpcodeToISD(Opcode); - // We don't lower vector selects well that are wider than the register width. + // We don't lower some vector selects well that are wider than the register + // width. if (ValTy->isVectorTy() && ISD == ISD::SELECT) { // We would need this many instructions to hide the scalarization happening. - const unsigned AmortizationCost = 20; - static const TypeConversionCostTblEntry<MVT::SimpleValueType> + const int AmortizationCost = 20; + static const TypeConversionCostTblEntry VectorSelectTbl[] = { - { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 * AmortizationCost }, - { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 * AmortizationCost }, - { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 * AmortizationCost }, + { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 }, + { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 }, + { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 }, { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost }, { ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost }, { ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost } @@ -367,20 +399,18 @@ unsigned AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, EVT SelCondTy = TLI->getValueType(DL, CondTy); EVT SelValTy = TLI->getValueType(DL, ValTy); if (SelCondTy.isSimple() && SelValTy.isSimple()) { - int Idx = - ConvertCostTableLookup(VectorSelectTbl, ISD, SelCondTy.getSimpleVT(), - SelValTy.getSimpleVT()); - if (Idx != -1) - return VectorSelectTbl[Idx].Cost; + if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, ISD, + SelCondTy.getSimpleVT(), + SelValTy.getSimpleVT())) + return Entry->Cost; } } return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy); } -unsigned AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, - unsigned Alignment, - unsigned AddressSpace) { - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Src); +int AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, + unsigned Alignment, unsigned AddressSpace) { + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src); if (Opcode == Instruction::Store && Src->isVectorTy() && Alignment != 16 && Src->getVectorElementType()->isIntegerTy(64)) { @@ -389,7 +419,7 @@ unsigned AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, // practice on inlined memcpy code. 
// We make v2i64 stores expensive so that we will only vectorize if there // are 6 other instructions getting vectorized. - unsigned AmortizationCost = 6; + int AmortizationCost = 6; return LT.first * 2 * AmortizationCost; } @@ -407,16 +437,18 @@ unsigned AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, return LT.first; } -unsigned AArch64TTIImpl::getInterleavedMemoryOpCost( - unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, - unsigned Alignment, unsigned AddressSpace) { +int AArch64TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, + unsigned Factor, + ArrayRef<unsigned> Indices, + unsigned Alignment, + unsigned AddressSpace) { assert(Factor >= 2 && "Invalid interleave factor"); assert(isa<VectorType>(VecTy) && "Expect a vector type"); if (Factor <= TLI->getMaxSupportedInterleaveFactor()) { unsigned NumElts = VecTy->getVectorNumElements(); Type *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor); - unsigned SubVecSize = DL.getTypeAllocSize(SubVecTy); + unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy); // ldN/stN only support legal vector types of size 64 or 128 in bits. if (NumElts % Factor == 0 && (SubVecSize == 64 || SubVecSize == 128)) @@ -427,8 +459,8 @@ unsigned AArch64TTIImpl::getInterleavedMemoryOpCost( Alignment, AddressSpace); } -unsigned AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) { - unsigned Cost = 0; +int AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) { + int Cost = 0; for (auto *I : Tys) { if (!I->isVectorTy()) continue; @@ -506,7 +538,7 @@ bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst, case Intrinsic::aarch64_neon_ld4: Info.ReadMem = true; Info.WriteMem = false; - Info.Vol = false; + Info.IsSimple = true; Info.NumMemRefs = 1; Info.PtrVal = Inst->getArgOperand(0); break; @@ -515,7 +547,7 @@ bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst, case Intrinsic::aarch64_neon_st4: Info.ReadMem = false; Info.WriteMem = true; - Info.Vol = false; + Info.IsSimple = true; Info.NumMemRefs = 1; Info.PtrVal = Inst->getArgOperand(Inst->getNumArgOperands() - 1); break; diff --git a/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index 444d3ccc15e1..ec58c4fe309f 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -48,7 +48,7 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> { }; public: - explicit AArch64TTIImpl(const AArch64TargetMachine *TM, Function &F) + explicit AArch64TTIImpl(const AArch64TargetMachine *TM, const Function &F) : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)), TLI(ST->getTargetLowering()) {} @@ -63,12 +63,11 @@ public: /// @{ using BaseT::getIntImmCost; - unsigned getIntImmCost(int64_t Val); - unsigned getIntImmCost(const APInt &Imm, Type *Ty); - unsigned getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, - Type *Ty); - unsigned getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, - Type *Ty); + int getIntImmCost(int64_t Val); + int getIntImmCost(const APInt &Imm, Type *Ty); + int getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty); + int getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, + Type *Ty); TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth); /// @} @@ -76,6 +75,8 @@ public: /// \name Vector TTI Implementations /// @{ + bool 
enableInterleavedAccessVectorization() { return true; } + unsigned getNumberOfRegisters(bool Vector) { if (Vector) { if (ST->hasNEON()) @@ -96,25 +97,25 @@ public: unsigned getMaxInterleaveFactor(unsigned VF); - unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src); + int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src); - unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index); + int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index); - unsigned getArithmeticInstrCost( + int getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None); - unsigned getAddressComputationCost(Type *Ty, bool IsComplex); + int getAddressComputationCost(Type *Ty, bool IsComplex); - unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy); + int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy); - unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, - unsigned AddressSpace); + int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, + unsigned AddressSpace); - unsigned getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys); + int getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys); void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP); @@ -123,11 +124,9 @@ public: bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info); - unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, - unsigned Factor, - ArrayRef<unsigned> Indices, - unsigned Alignment, - unsigned AddressSpace); + int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, + ArrayRef<unsigned> Indices, unsigned Alignment, + unsigned AddressSpace); /// @} }; diff --git a/contrib/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/contrib/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index 38e8b4d9a938..394c8e78581f 100644 --- a/contrib/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/contrib/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -43,7 +43,6 @@ class AArch64Operand; class AArch64AsmParser : public MCTargetAsmParser { private: StringRef Mnemonic; ///< Instruction mnemonic. - MCSubtargetInfo &STI; // Map of register aliases registers via the .req directive. StringMap<std::pair<bool, unsigned> > RegisterReqs; @@ -101,6 +100,7 @@ private: OperandMatchResultTy tryParseSysReg(OperandVector &Operands); OperandMatchResultTy tryParseSysCROperand(OperandVector &Operands); OperandMatchResultTy tryParsePrefetch(OperandVector &Operands); + OperandMatchResultTy tryParsePSBHint(OperandVector &Operands); OperandMatchResultTy tryParseAdrpLabel(OperandVector &Operands); OperandMatchResultTy tryParseAdrLabel(OperandVector &Operands); OperandMatchResultTy tryParseFPImm(OperandVector &Operands); @@ -115,16 +115,16 @@ public: #define GET_OPERAND_DIAGNOSTIC_TYPES #include "AArch64GenAsmMatcher.inc" }; - AArch64AsmParser(MCSubtargetInfo &STI, MCAsmParser &Parser, + AArch64AsmParser(const MCSubtargetInfo &STI, MCAsmParser &Parser, const MCInstrInfo &MII, const MCTargetOptions &Options) - : MCTargetAsmParser(), STI(STI) { + : MCTargetAsmParser(Options, STI) { MCAsmParserExtension::Initialize(Parser); MCStreamer &S = getParser().getStreamer(); if (S.getTargetStreamer() == nullptr) new AArch64TargetStreamer(S); // Initialize the set of available features. 
- setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits())); + setAvailableFeatures(ComputeAvailableFeatures(getSTI().getFeatureBits())); } bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name, @@ -160,7 +160,8 @@ private: k_Prefetch, k_ShiftExtend, k_FPImm, - k_Barrier + k_Barrier, + k_PSBHint, } Kind; SMLoc StartLoc, EndLoc; @@ -228,6 +229,12 @@ private: unsigned Length; }; + struct PSBHintOp { + unsigned Val; + const char *Data; + unsigned Length; + }; + struct ShiftExtendOp { AArch64_AM::ShiftExtendType Type; unsigned Amount; @@ -251,6 +258,7 @@ private: struct SysRegOp SysReg; struct SysCRImmOp SysCRImm; struct PrefetchOp Prefetch; + struct PSBHintOp PSBHint; struct ShiftExtendOp ShiftExtend; }; @@ -302,6 +310,9 @@ public: case k_Prefetch: Prefetch = o.Prefetch; break; + case k_PSBHint: + PSBHint = o.PSBHint; + break; case k_ShiftExtend: ShiftExtend = o.ShiftExtend; break; @@ -393,6 +404,16 @@ public: return Prefetch.Val; } + unsigned getPSBHint() const { + assert(Kind == k_PSBHint && "Invalid access!"); + return PSBHint.Val; + } + + StringRef getPSBHintName() const { + assert(Kind == k_PSBHint && "Invalid access!"); + return StringRef(PSBHint.Data, PSBHint.Length); + } + StringRef getPrefetchName() const { assert(Kind == k_Prefetch && "Invalid access!"); return StringRef(Prefetch.Data, Prefetch.Length); @@ -497,6 +518,15 @@ public: return (Val % Scale) == 0 && Val >= 0 && (Val / Scale) < 0x1000; } + bool isImm0_1() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); + if (!MCE) + return false; + int64_t Val = MCE->getValue(); + return (Val >= 0 && Val < 2); + } bool isImm0_7() const { if (!isImm()) return false; @@ -876,12 +906,15 @@ public: } bool isMSRSystemRegister() const { if (!isSysReg()) return false; - return SysReg.MSRReg != -1U; } - bool isSystemPStateField() const { + bool isSystemPStateFieldWithImm0_1() const { if (!isSysReg()) return false; - + return (SysReg.PStateField == AArch64PState::PAN || + SysReg.PStateField == AArch64PState::UAO); + } + bool isSystemPStateFieldWithImm0_15() const { + if (!isSysReg() || isSystemPStateFieldWithImm0_1()) return false; return SysReg.PStateField != -1U; } bool isReg() const override { return Kind == k_Register && !Reg.isVector; } @@ -950,6 +983,7 @@ public: } bool isSysCR() const { return Kind == k_SysCR; } bool isPrefetch() const { return Kind == k_Prefetch; } + bool isPSBHint() const { return Kind == k_PSBHint; } bool isShiftExtend() const { return Kind == k_ShiftExtend; } bool isShifter() const { if (!isShiftExtend()) @@ -1175,8 +1209,10 @@ public: template <unsigned NumRegs> void addVectorList64Operands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - static unsigned FirstRegs[] = { AArch64::D0, AArch64::D0_D1, - AArch64::D0_D1_D2, AArch64::D0_D1_D2_D3 }; + static const unsigned FirstRegs[] = { AArch64::D0, + AArch64::D0_D1, + AArch64::D0_D1_D2, + AArch64::D0_D1_D2_D3 }; unsigned FirstReg = FirstRegs[NumRegs - 1]; Inst.addOperand( @@ -1186,8 +1222,10 @@ public: template <unsigned NumRegs> void addVectorList128Operands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - static unsigned FirstRegs[] = { AArch64::Q0, AArch64::Q0_Q1, - AArch64::Q0_Q1_Q2, AArch64::Q0_Q1_Q2_Q3 }; + static const unsigned FirstRegs[] = { AArch64::Q0, + AArch64::Q0_Q1, + AArch64::Q0_Q1_Q2, + AArch64::Q0_Q1_Q2_Q3 }; unsigned FirstReg = FirstRegs[NumRegs - 1]; Inst.addOperand( @@ -1304,6 +1342,12 @@ public: 
Inst.addOperand(MCOperand::createImm(MCE->getValue() / 16)); } + void addImm0_1Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm()); + Inst.addOperand(MCOperand::createImm(MCE->getValue())); + } + void addImm0_7Operands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm()); @@ -1491,7 +1535,13 @@ public: Inst.addOperand(MCOperand::createImm(SysReg.MSRReg)); } - void addSystemPStateFieldOperands(MCInst &Inst, unsigned N) const { + void addSystemPStateFieldWithImm0_1Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + + Inst.addOperand(MCOperand::createImm(SysReg.PStateField)); + } + + void addSystemPStateFieldWithImm0_15Operands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); Inst.addOperand(MCOperand::createImm(SysReg.PStateField)); @@ -1507,6 +1557,11 @@ public: Inst.addOperand(MCOperand::createImm(getPrefetch())); } + void addPSBHintOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::createImm(getPSBHint())); + } + void addShifterOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); unsigned Imm = @@ -1703,6 +1758,19 @@ public: return Op; } + static std::unique_ptr<AArch64Operand> CreatePSBHint(unsigned Val, + StringRef Str, + SMLoc S, + MCContext &Ctx) { + auto Op = make_unique<AArch64Operand>(k_PSBHint, Ctx); + Op->PSBHint.Val = Val; + Op->PSBHint.Data = Str.data(); + Op->PSBHint.Length = Str.size(); + Op->StartLoc = S; + Op->EndLoc = S; + return Op; + } + static std::unique_ptr<AArch64Operand> CreateShiftExtend(AArch64_AM::ShiftExtendType ShOp, unsigned Val, bool HasExplicitAmount, SMLoc S, SMLoc E, MCContext &Ctx) { @@ -1776,6 +1844,10 @@ void AArch64Operand::print(raw_ostream &OS) const { OS << "<prfop invalid #" << getPrefetch() << ">"; break; } + case k_PSBHint: { + OS << getPSBHintName(); + break; + } case k_ShiftExtend: { OS << "<" << AArch64_AM::getShiftExtendName(getShiftExtendType()) << " #" << getShiftExtendAmount(); @@ -1849,6 +1921,8 @@ static bool isValidVectorKind(StringRef Name) { .Case(".h", true) .Case(".s", true) .Case(".d", true) + // Needed for fp16 scalar pairwise reductions + .Case(".2h", true) .Default(false); } @@ -2016,7 +2090,7 @@ AArch64AsmParser::tryParsePrefetch(OperandVector &Operands) { bool Valid; auto Mapper = AArch64PRFM::PRFMMapper(); StringRef Name = - Mapper.toString(MCE->getValue(), STI.getFeatureBits(), Valid); + Mapper.toString(MCE->getValue(), getSTI().getFeatureBits(), Valid); Operands.push_back(AArch64Operand::CreatePrefetch(prfop, Name, S, getContext())); return MatchOperand_Success; @@ -2030,7 +2104,7 @@ AArch64AsmParser::tryParsePrefetch(OperandVector &Operands) { bool Valid; auto Mapper = AArch64PRFM::PRFMMapper(); unsigned prfop = - Mapper.fromString(Tok.getString(), STI.getFeatureBits(), Valid); + Mapper.fromString(Tok.getString(), getSTI().getFeatureBits(), Valid); if (!Valid) { TokError("pre-fetch hint expected"); return MatchOperand_ParseFail; @@ -2042,6 +2116,32 @@ AArch64AsmParser::tryParsePrefetch(OperandVector &Operands) { return MatchOperand_Success; } +/// tryParsePSBHint - Try to parse a PSB operand, mapped to Hint command +AArch64AsmParser::OperandMatchResultTy +AArch64AsmParser::tryParsePSBHint(OperandVector &Operands) { + MCAsmParser &Parser = 
getParser(); + SMLoc S = getLoc(); + const AsmToken &Tok = Parser.getTok(); + if (Tok.isNot(AsmToken::Identifier)) { + TokError("invalid operand for instruction"); + return MatchOperand_ParseFail; + } + + bool Valid; + auto Mapper = AArch64PSBHint::PSBHintMapper(); + unsigned psbhint = + Mapper.fromString(Tok.getString(), getSTI().getFeatureBits(), Valid); + if (!Valid) { + TokError("invalid operand for instruction"); + return MatchOperand_ParseFail; + } + + Parser.Lex(); // Eat identifier token. + Operands.push_back(AArch64Operand::CreatePSBHint(psbhint, Tok.getString(), + S, getContext())); + return MatchOperand_Success; +} + /// tryParseAdrpLabel - Parse and validate a source label for the ADRP /// instruction. AArch64AsmParser::OperandMatchResultTy @@ -2439,6 +2539,13 @@ bool AArch64AsmParser::parseSysAlias(StringRef Name, SMLoc NameLoc, } else if (!Op.compare_lower("cisw")) { // SYS #0, C7, C14, #2 SYS_ALIAS(0, 7, 14, 2); + } else if (!Op.compare_lower("cvap")) { + if (getSTI().getFeatureBits()[AArch64::HasV8_2aOps]) { + // SYS #3, C7, C12, #1 + SYS_ALIAS(3, 7, 12, 1); + } else { + return TokError("DC CVAP requires ARMv8.2a"); + } } else { return TokError("invalid operand for DC instruction"); } @@ -2479,6 +2586,20 @@ bool AArch64AsmParser::parseSysAlias(StringRef Name, SMLoc NameLoc, } else if (!Op.compare_lower("s12e0w")) { // SYS #4, C7, C8, #7 SYS_ALIAS(4, 7, 8, 7); + } else if (!Op.compare_lower("s1e1rp")) { + if (getSTI().getFeatureBits()[AArch64::HasV8_2aOps]) { + // SYS #0, C7, C9, #0 + SYS_ALIAS(0, 7, 9, 0); + } else { + return TokError("AT S1E1RP requires ARMv8.2a"); + } + } else if (!Op.compare_lower("s1e1wp")) { + if (getSTI().getFeatureBits()[AArch64::HasV8_2aOps]) { + // SYS #0, C7, C9, #1 + SYS_ALIAS(0, 7, 9, 1); + } else { + return TokError("AT S1E1WP requires ARMv8.2a"); + } } else { return TokError("invalid operand for AT instruction"); } @@ -2644,7 +2765,7 @@ AArch64AsmParser::tryParseBarrierOperand(OperandVector &Operands) { bool Valid; auto Mapper = AArch64DB::DBarrierMapper(); StringRef Name = - Mapper.toString(MCE->getValue(), STI.getFeatureBits(), Valid); + Mapper.toString(MCE->getValue(), getSTI().getFeatureBits(), Valid); Operands.push_back( AArch64Operand::CreateBarrier(MCE->getValue(), Name, ExprLoc, getContext())); return MatchOperand_Success; @@ -2658,7 +2779,7 @@ AArch64AsmParser::tryParseBarrierOperand(OperandVector &Operands) { bool Valid; auto Mapper = AArch64DB::DBarrierMapper(); unsigned Opt = - Mapper.fromString(Tok.getString(), STI.getFeatureBits(), Valid); + Mapper.fromString(Tok.getString(), getSTI().getFeatureBits(), Valid); if (!Valid) { TokError("invalid barrier option name"); return MatchOperand_ParseFail; @@ -2687,20 +2808,21 @@ AArch64AsmParser::tryParseSysReg(OperandVector &Operands) { bool IsKnown; auto MRSMapper = AArch64SysReg::MRSMapper(); - uint32_t MRSReg = MRSMapper.fromString(Tok.getString(), STI.getFeatureBits(), - IsKnown); + uint32_t MRSReg = MRSMapper.fromString(Tok.getString(), + getSTI().getFeatureBits(), IsKnown); assert(IsKnown == (MRSReg != -1U) && "register should be -1 if and only if it's unknown"); auto MSRMapper = AArch64SysReg::MSRMapper(); - uint32_t MSRReg = MSRMapper.fromString(Tok.getString(), STI.getFeatureBits(), - IsKnown); + uint32_t MSRReg = MSRMapper.fromString(Tok.getString(), + getSTI().getFeatureBits(), IsKnown); assert(IsKnown == (MSRReg != -1U) && "register should be -1 if and only if it's unknown"); auto PStateMapper = AArch64PState::PStateMapper(); uint32_t PStateField = - 
PStateMapper.fromString(Tok.getString(), STI.getFeatureBits(), IsKnown); + PStateMapper.fromString(Tok.getString(), + getSTI().getFeatureBits(), IsKnown); assert(IsKnown == (PStateField != -1U) && "register should be -1 if and only if it's unknown"); @@ -3151,7 +3273,7 @@ bool AArch64AsmParser::parseOperand(OperandVector &Operands, bool isCondCode, if (Operands.size() < 2 || !static_cast<AArch64Operand &>(*Operands[1]).isReg()) - return true; + return Error(Loc, "Only valid when first operand is register"); bool IsXReg = AArch64MCRegisterClasses[AArch64::GPR64allRegClassID].contains( @@ -3183,7 +3305,7 @@ bool AArch64AsmParser::parseOperand(OperandVector &Operands, bool isCondCode, } // If it is a label or an imm that cannot fit in a movz, put it into CP. const MCExpr *CPLoc = - getTargetStreamer().addConstantPoolEntry(SubExprVal, IsXReg ? 8 : 4); + getTargetStreamer().addConstantPoolEntry(SubExprVal, IsXReg ? 8 : 4, Loc); Operands.push_back(AArch64Operand::CreateImm(CPLoc, S, E, Ctx)); return false; } @@ -3601,6 +3723,8 @@ bool AArch64AsmParser::showMatchError(SMLoc Loc, unsigned ErrCode) { return Error(Loc, "index must be a multiple of 8 in range [0, 32760]."); case Match_InvalidMemoryIndexed16: return Error(Loc, "index must be a multiple of 16 in range [0, 65520]."); + case Match_InvalidImm0_1: + return Error(Loc, "immediate must be an integer in range [0, 1]."); case Match_InvalidImm0_7: return Error(Loc, "immediate must be an integer in range [0, 7]."); case Match_InvalidImm0_15: @@ -3912,7 +4036,7 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, AArch64Operand &ImmOp = static_cast<AArch64Operand &>(*Operands[2]); if (RegOp.isReg() && ImmOp.isFPImm() && ImmOp.getFPImm() == (unsigned)-1) { unsigned zreg = - AArch64MCRegisterClasses[AArch64::FPR32RegClassID].contains( + !AArch64MCRegisterClasses[AArch64::FPR64RegClassID].contains( RegOp.getReg()) ? AArch64::WZR : AArch64::XZR; @@ -3929,10 +4053,27 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, // If that fails, try against the alternate table containing long-form NEON: // "fadd v0.2s, v1.2s, v2.2s" - if (MatchResult != Match_Success) + if (MatchResult != Match_Success) { + // But first, save the short-form match result: we can use it in case the + // long-form match also fails. + auto ShortFormNEONErrorInfo = ErrorInfo; + auto ShortFormNEONMatchResult = MatchResult; + MatchResult = MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm, 0); + // Now, both matches failed, and the long-form match failed on the mnemonic + // suffix token operand. The short-form match failure is probably more + // relevant: use it instead. 
+ if (MatchResult == Match_InvalidOperand && ErrorInfo == 1 && + Operands.size() > 1 && ((AArch64Operand &)*Operands[1]).isToken() && + ((AArch64Operand &)*Operands[1]).isTokenSuffix()) { + MatchResult = ShortFormNEONMatchResult; + ErrorInfo = ShortFormNEONErrorInfo; + } + } + + switch (MatchResult) { case Match_Success: { // Perform range checking and other semantic validations @@ -3944,7 +4085,7 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, return true; Inst.setLoc(IDLoc); - Out.EmitInstruction(Inst, STI); + Out.EmitInstruction(Inst, getSTI()); return false; } case Match_MissingFeature: { @@ -3966,6 +4107,7 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, return showMatchError(IDLoc, MatchResult); case Match_InvalidOperand: { SMLoc ErrorLoc = IDLoc; + if (ErrorInfo != ~0ULL) { if (ErrorInfo >= Operands.size()) return Error(IDLoc, "too few operands for instruction"); @@ -4011,6 +4153,7 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, case Match_InvalidMemoryIndexed8SImm7: case Match_InvalidMemoryIndexed16SImm7: case Match_InvalidMemoryIndexedSImm9: + case Match_InvalidImm0_1: case Match_InvalidImm0_7: case Match_InvalidImm0_15: case Match_InvalidImm0_31: @@ -4083,7 +4226,7 @@ bool AArch64AsmParser::parseDirectiveWord(unsigned Size, SMLoc L) { if (getParser().parseExpression(Value)) return true; - getParser().getStreamer().EmitValue(Value, Size); + getParser().getStreamer().EmitValue(Value, Size, L); if (getLexer().is(AsmToken::EndOfStatement)) break; @@ -4155,7 +4298,7 @@ bool AArch64AsmParser::parseDirectiveTLSDescCall(SMLoc L) { Inst.setOpcode(AArch64::TLSDESCCALL); Inst.addOperand(MCOperand::createExpr(Expr)); - getParser().getStreamer().EmitInstruction(Inst, STI); + getParser().getStreamer().EmitInstruction(Inst, getSTI()); return false; } diff --git a/contrib/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp b/contrib/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp index db9fb0e775df..f1f968e73123 100644 --- a/contrib/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp +++ b/contrib/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp @@ -1516,6 +1516,10 @@ static DecodeStatus DecodeSystemPStateInstruction(llvm::MCInst &Inst, uint64_t pstate_field = (op1 << 3) | op2; + if ((pstate_field == AArch64PState::PAN || + pstate_field == AArch64PState::UAO) && crm > 1) + return Fail; + Inst.addOperand(MCOperand::createImm(pstate_field)); Inst.addOperand(MCOperand::createImm(crm)); diff --git a/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp b/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp index 7f56c2cf6bb8..d8a810824370 100644 --- a/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp +++ b/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp @@ -19,6 +19,7 @@ #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSubtargetInfo.h" #include "llvm/Support/Format.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; @@ -55,7 +56,7 @@ void AArch64InstPrinter::printInst(const MCInst *MI, raw_ostream &O, unsigned Opcode = MI->getOpcode(); if (Opcode == AArch64::SYSxt) - if (printSysAlias(MI, O)) { + if (printSysAlias(MI, STI, O)) { printAnnotation(O, Annot); return; } @@ -269,7 +270,7 @@ struct LdStNInstrDesc { int NaturalOffset; }; -static LdStNInstrDesc LdStNInstInfo[] = { +static const LdStNInstrDesc LdStNInstInfo[] = { { 
AArch64::LD1i8, "ld1", ".b", 1, true, 0 }, { AArch64::LD1i16, "ld1", ".h", 1, true, 0 }, { AArch64::LD1i32, "ld1", ".s", 1, true, 0 }, @@ -612,7 +613,7 @@ static LdStNInstrDesc LdStNInstInfo[] = { { AArch64::ST4Fourv2s_POST, "st4", ".2s", 1, false, 32 }, }; -static LdStNInstrDesc *getLdStNInstrDesc(unsigned Opcode) { +static const LdStNInstrDesc *getLdStNInstrDesc(unsigned Opcode) { unsigned Idx; for (Idx = 0; Idx != array_lengthof(LdStNInstInfo); ++Idx) if (LdStNInstInfo[Idx].Opcode == Opcode) @@ -641,7 +642,7 @@ void AArch64AppleInstPrinter::printInst(const MCInst *MI, raw_ostream &O, return; } - if (LdStNInstrDesc *LdStDesc = getLdStNInstrDesc(Opcode)) { + if (const LdStNInstrDesc *LdStDesc = getLdStNInstrDesc(Opcode)) { O << "\t" << LdStDesc->Mnemonic << LdStDesc->Layout << '\t'; // Now onto the operands: first a vector list with possible lane @@ -674,7 +675,9 @@ void AArch64AppleInstPrinter::printInst(const MCInst *MI, raw_ostream &O, AArch64InstPrinter::printInst(MI, O, Annot, STI); } -bool AArch64InstPrinter::printSysAlias(const MCInst *MI, raw_ostream &O) { +bool AArch64InstPrinter::printSysAlias(const MCInst *MI, + const MCSubtargetInfo &STI, + raw_ostream &O) { #ifndef NDEBUG unsigned Opcode = MI->getOpcode(); assert(Opcode == AArch64::SYSxt && "Invalid opcode for SYS alias!"); @@ -729,6 +732,11 @@ bool AArch64InstPrinter::printSysAlias(const MCInst *MI, raw_ostream &O) { if (Op1Val == 3 && Op2Val == 1) Asm = "dc\tcvau"; break; + case 12: + if (Op1Val == 3 && Op2Val == 1 && + (STI.getFeatureBits()[AArch64::HasV8_2aOps])) + Asm = "dc\tcvap"; + break; case 14: if (Op1Val == 3 && Op2Val == 1) Asm = "dc\tcivac"; @@ -773,6 +781,21 @@ bool AArch64InstPrinter::printSysAlias(const MCInst *MI, raw_ostream &O) { break; } break; + case 9: + switch (Op1Val) { + default: + break; + case 0: + if (STI.getFeatureBits()[AArch64::HasV8_2aOps]) { + switch (Op2Val) { + default: + break; + case 0: Asm = "at\ts1e1rp"; break; + case 1: Asm = "at\ts1e1wp"; break; + } + } + break; + } } } else if (CnVal == 8) { // TLBI aliases @@ -1122,6 +1145,19 @@ void AArch64InstPrinter::printPrefetchOp(const MCInst *MI, unsigned OpNum, O << '#' << prfop; } +void AArch64InstPrinter::printPSBHintOp(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + unsigned psbhintop = MI->getOperand(OpNum).getImm(); + bool Valid; + StringRef Name = + AArch64PSBHint::PSBHintMapper().toString(psbhintop, STI.getFeatureBits(), Valid); + if (Valid) + O << Name; + else + O << '#' << psbhintop; +} + void AArch64InstPrinter::printFPImmOperand(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O) { diff --git a/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h b/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h index 15dee978e229..ea68d9848b42 100644 --- a/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h +++ b/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h @@ -15,14 +15,10 @@ #define LLVM_LIB_TARGET_AARCH64_INSTPRINTER_AARCH64INSTPRINTER_H #include "MCTargetDesc/AArch64MCTargetDesc.h" -#include "llvm/ADT/StringRef.h" #include "llvm/MC/MCInstPrinter.h" -#include "llvm/MC/MCSubtargetInfo.h" namespace llvm { -class MCOperand; - class AArch64InstPrinter : public MCInstPrinter { public: AArch64InstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, @@ -48,7 +44,8 @@ public: unsigned AltIdx = AArch64::NoRegAltName); protected: - bool printSysAlias(const MCInst *MI, raw_ostream &O); + bool printSysAlias(const MCInst *MI, const 
MCSubtargetInfo &STI, + raw_ostream &O); // Operand printers void printOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); @@ -122,6 +119,9 @@ protected: void printPrefetchOp(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O); + void printPSBHintOp(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printFPImmOperand(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O); diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h index ed24343a6f2a..648b1dfc8c5e 100644 --- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h +++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h @@ -364,6 +364,32 @@ static inline float getFPImmFloat(unsigned Imm) { return FPUnion.F; } +/// getFP16Imm - Return an 8-bit floating-point version of the 16-bit +/// floating-point value. If the value cannot be represented as an 8-bit +/// floating-point value, then return -1. +static inline int getFP16Imm(const APInt &Imm) { + uint32_t Sign = Imm.lshr(15).getZExtValue() & 1; + int32_t Exp = (Imm.lshr(10).getSExtValue() & 0x1f) - 15; // -14 to 15 + int32_t Mantissa = Imm.getZExtValue() & 0x3ff; // 10 bits + + // We can handle 4 bits of mantissa. + // mantissa = (16+UInt(e:f:g:h))/16. + if (Mantissa & 0x3f) + return -1; + Mantissa >>= 6; + + // We can handle 3 bits of exponent: exp == UInt(NOT(b):c:d)-3 + if (Exp < -3 || Exp > 4) + return -1; + Exp = ((Exp+3) & 0x7) ^ 4; + + return ((int)Sign << 7) | (Exp << 4) | Mantissa; +} + +static inline int getFP16Imm(const APFloat &FPImm) { + return getFP16Imm(FPImm.bitcastToAPInt()); +} + /// getFP32Imm - Return an 8-bit floating-point version of the 32-bit /// floating-point value. If the value cannot be represented as an 8-bit /// floating-point value, then return -1. diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp index 16d53569b231..d26604f5765d 100644 --- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp +++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp @@ -128,10 +128,9 @@ public: /// This is one of the functions used to emit data into an ELF section, so the /// AArch64 streamer overrides it to add the appropriate mapping symbol ($d) /// if necessary. - void EmitValueImpl(const MCExpr *Value, unsigned Size, - const SMLoc &Loc) override { + void EmitValueImpl(const MCExpr *Value, unsigned Size, SMLoc Loc) override { EmitDataMappingSymbol(); - MCELFStreamer::EmitValueImpl(Value, Size); + MCELFStreamer::EmitValueImpl(Value, Size, Loc); } private: diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp index 921c4b94a729..fbce26e1d9a1 100644 --- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp +++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp @@ -48,10 +48,6 @@ AArch64MCAsmInfoDarwin::AArch64MCAsmInfoDarwin() { UseDataRegionDirectives = true; ExceptionsType = ExceptionHandling::DwarfCFI; - - // AArch64 Darwin doesn't have the baggage of X86/ARM, so it's fine to use - // LShr instead of AShr. 
- UseLogicalShr = true; } const MCExpr *AArch64MCAsmInfoDarwin::getExprForPersonalitySymbol( diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp index 28703419514a..a540f49866a9 100644 --- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp +++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp @@ -85,13 +85,13 @@ void AArch64MCExpr::visitUsedExpr(MCStreamer &Streamer) const { Streamer.visitUsedExpr(*getSubExpr()); } -MCSection *AArch64MCExpr::findAssociatedSection() const { +MCFragment *AArch64MCExpr::findAssociatedFragment() const { llvm_unreachable("FIXME: what goes here?"); } bool AArch64MCExpr::evaluateAsRelocatableImpl(MCValue &Res, - const MCAsmLayout *Layout, - const MCFixup *Fixup) const { + const MCAsmLayout *Layout, + const MCFixup *Fixup) const { if (!getSubExpr()->evaluateAsRelocatable(Res, Layout, Fixup)) return false; diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h index 1165314e4105..db36a65564ce 100644 --- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h +++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h @@ -149,11 +149,10 @@ public: void visitUsedExpr(MCStreamer &Streamer) const override; - MCSection *findAssociatedSection() const override; + MCFragment *findAssociatedFragment() const override; - bool evaluateAsRelocatableImpl(MCValue &Res, - const MCAsmLayout *Layout, - const MCFixup *Fixup) const override; + bool evaluateAsRelocatableImpl(MCValue &Res, const MCAsmLayout *Layout, + const MCFixup *Fixup) const override; void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const override; @@ -162,7 +161,6 @@ public: } static bool classof(const AArch64MCExpr *) { return true; } - }; } // end namespace llvm diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp index 741b273073e4..61c96f1d93c1 100644 --- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp +++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp @@ -90,9 +90,11 @@ bool AArch64MachObjectWriter::getAArch64FixupKindMachOInfo( Log2Size = llvm::Log2_32(4); // This encompasses the relocation for the whole 21-bit value. switch (Sym->getKind()) { - default: - Asm.getContext().reportFatalError(Fixup.getLoc(), - "ADR/ADRP relocations must be GOT relative"); + default: { + Asm.getContext().reportError(Fixup.getLoc(), + "ADR/ADRP relocations must be GOT relative"); + return false; + } case MCSymbolRefExpr::VK_PAGE: RelocType = unsigned(MachO::ARM64_RELOC_PAGE21); return true; @@ -170,25 +172,25 @@ void AArch64MachObjectWriter::recordRelocation( // assembler local symbols. If we got here, that's not what we have, // so complain loudly. if (Kind == AArch64::fixup_aarch64_pcrel_branch19) { - Asm.getContext().reportFatalError(Fixup.getLoc(), - "conditional branch requires assembler-local" - " label. '" + - Target.getSymA()->getSymbol().getName() + - "' is external."); + Asm.getContext().reportError(Fixup.getLoc(), + "conditional branch requires assembler-local" + " label. '" + + Target.getSymA()->getSymbol().getName() + + "' is external."); return; } // 14-bit branch relocations should only target internal labels, and so // should never get here. 
if (Kind == AArch64::fixup_aarch64_pcrel_branch14) { - Asm.getContext().reportFatalError(Fixup.getLoc(), - "Invalid relocation on conditional branch!"); + Asm.getContext().reportError(Fixup.getLoc(), + "Invalid relocation on conditional branch!"); return; } if (!getAArch64FixupKindMachOInfo(Fixup, Type, Target.getSymA(), Log2Size, - Asm)) { - Asm.getContext().reportFatalError(Fixup.getLoc(), "unknown AArch64 fixup kind!"); + Asm)) { + Asm.getContext().reportError(Fixup.getLoc(), "unknown AArch64 fixup kind!"); return; } @@ -200,8 +202,9 @@ void AArch64MachObjectWriter::recordRelocation( Type = MachO::ARM64_RELOC_UNSIGNED; if (IsPCRel) { - Asm.getContext().reportFatalError(Fixup.getLoc(), - "PC relative absolute relocation!"); + Asm.getContext().reportError(Fixup.getLoc(), + "PC relative absolute relocation!"); + return; // FIXME: x86_64 sets the type to a branch reloc here. Should we do // something similar? @@ -229,16 +232,20 @@ void AArch64MachObjectWriter::recordRelocation( Writer->addRelocation(A_Base, Fragment->getParent(), MRE); return; } else if (Target.getSymA()->getKind() != MCSymbolRefExpr::VK_None || - Target.getSymB()->getKind() != MCSymbolRefExpr::VK_None) + Target.getSymB()->getKind() != MCSymbolRefExpr::VK_None) { // Otherwise, neither symbol can be modified. - Asm.getContext().reportFatalError(Fixup.getLoc(), - "unsupported relocation of modified symbol"); + Asm.getContext().reportError(Fixup.getLoc(), + "unsupported relocation of modified symbol"); + return; + } // We don't support PCrel relocations of differences. - if (IsPCRel) - Asm.getContext().reportFatalError(Fixup.getLoc(), - "unsupported pc-relative relocation of " - "difference"); + if (IsPCRel) { + Asm.getContext().reportError(Fixup.getLoc(), + "unsupported pc-relative relocation of " + "difference"); + return; + } // AArch64 always uses external relocations. If there is no symbol to use as // a base address (a local symbol with no preceding non-local symbol), @@ -246,20 +253,26 @@ void AArch64MachObjectWriter::recordRelocation( // // FIXME: We should probably just synthesize an external symbol and use // that. - if (!A_Base) - Asm.getContext().reportFatalError( + if (!A_Base) { + Asm.getContext().reportError( Fixup.getLoc(), "unsupported relocation of local symbol '" + A->getName() + "'. Must have non-local symbol earlier in section."); - if (!B_Base) - Asm.getContext().reportFatalError( + return; + } + if (!B_Base) { + Asm.getContext().reportError( Fixup.getLoc(), "unsupported relocation of local symbol '" + B->getName() + "'. Must have non-local symbol earlier in section."); + return; + } - if (A_Base == B_Base && A_Base) - Asm.getContext().reportFatalError(Fixup.getLoc(), - "unsupported relocation with identical base"); + if (A_Base == B_Base && A_Base) { + Asm.getContext().reportError( + Fixup.getLoc(), "unsupported relocation with identical base"); + return; + } Value += (!A->getFragment() ? 0 : Writer->getSymbolAddress(*A, Layout)) - (!A_Base || !A_Base->getFragment() ? 0 : Writer->getSymbolAddress( @@ -309,10 +322,12 @@ void AArch64MachObjectWriter::recordRelocation( // we need to preserve and merge with the new Target? How about // the FixedValue? 
if (!Symbol->getVariableValue()->evaluateAsRelocatable(Target, &Layout, - &Fixup)) - Asm.getContext().reportFatalError(Fixup.getLoc(), - "unable to resolve variable '" + - Symbol->getName() + "'"); + &Fixup)) { + Asm.getContext().reportError(Fixup.getLoc(), + "unable to resolve variable '" + + Symbol->getName() + "'"); + return; + } return recordRelocation(Writer, Asm, Layout, Fragment, Fixup, Target, FixedValue); } @@ -337,11 +352,13 @@ void AArch64MachObjectWriter::recordRelocation( Value += Layout.getSymbolOffset(*Symbol) - Layout.getSymbolOffset(*Base); } else if (Symbol->isInSection()) { - if (!CanUseLocalRelocation) - Asm.getContext().reportFatalError( + if (!CanUseLocalRelocation) { + Asm.getContext().reportError( Fixup.getLoc(), "unsupported relocation of local symbol '" + Symbol->getName() + "'. Must have non-local symbol earlier in section."); + return; + } // Adjust the relocation to be section-relative. // The index is the section ordinal (1-based). const MCSection &Sec = Symbol->getSection(); @@ -361,9 +378,10 @@ void AArch64MachObjectWriter::recordRelocation( return; } } - Asm.getContext().reportFatalError(Fixup.getLoc(), + Asm.getContext().reportError(Fixup.getLoc(), "unsupported relocation of variable '" + Symbol->getName() + "'"); + return; } } diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp index 52b000d15b8d..3e86a42d5be6 100644 --- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp +++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp @@ -26,8 +26,9 @@ AArch64TargetStreamer::~AArch64TargetStreamer() {} // The constant pool handling is shared by all AArch64TargetStreamer // implementations. const MCExpr *AArch64TargetStreamer::addConstantPoolEntry(const MCExpr *Expr, - unsigned Size) { - return ConstantPools->addEntry(Streamer, Expr, Size); + unsigned Size, + SMLoc Loc) { + return ConstantPools->addEntry(Streamer, Expr, Size, Loc); } void AArch64TargetStreamer::emitCurrentConstantPool() { diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h index fcc0d053f6e2..51432830f795 100644 --- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h +++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h @@ -24,7 +24,7 @@ public: /// Callback used to implement the ldr= pseudo. /// Add a new entry to the constant pool for the current section and return an /// MCExpr that can be used to refer to the constant pool location. - const MCExpr *addConstantPoolEntry(const MCExpr *, unsigned Size); + const MCExpr *addConstantPoolEntry(const MCExpr *, unsigned Size, SMLoc Loc); /// Callback used to implemnt the .ltorg directive. /// Emit contents of constant pool for the current section. 
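The AArch64MachObjectWriter hunks above consistently replace reportFatalError with reportError followed by an early return, so an unsupported fixup now produces a diagnostic anchored at the fixup's source location and the writer simply skips that relocation instead of aborting the whole assembly. A minimal sketch of the pattern, assuming only the MCContext::reportError and MCFixup::getLoc calls the hunks themselves use (the emitReloc wrapper is hypothetical, added for illustration only):

#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCFixup.h"
using namespace llvm;

static void emitReloc(MCContext &Ctx, const MCFixup &Fixup, bool Supported) {
  if (!Supported) {
    // Report at the fixup's location and give up on this record only;
    // assembly continues, so later errors can still be collected.
    Ctx.reportError(Fixup.getLoc(), "unsupported relocation");
    return;
  }
  // ... build and record the relocation entry as before ...
}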
diff --git a/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp b/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp index ee85b65bf39a..78f5289ec26d 100644 --- a/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp +++ b/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp @@ -146,11 +146,22 @@ const AArch64NamedImmMapper::Mapping AArch64PState::PStateMapper::PStateMappings // v8.1a "Privileged Access Never" extension-specific PStates {"pan", PAN, {AArch64::HasV8_1aOps}}, + + // v8.2a + {"uao", UAO, {AArch64::HasV8_2aOps}}, }; AArch64PState::PStateMapper::PStateMapper() : AArch64NamedImmMapper(PStateMappings, 0) {} +const AArch64NamedImmMapper::Mapping AArch64PSBHint::PSBHintMapper::PSBHintMappings[] = { + // v8.2a "Statistical Profiling" extension-specific PSB operand + {"csync", CSync, {AArch64::FeatureSPE}}, +}; + +AArch64PSBHint::PSBHintMapper::PSBHintMapper() + : AArch64NamedImmMapper(PSBHintMappings, 0) {} + const AArch64NamedImmMapper::Mapping AArch64SysReg::MRSMapper::MRSMappings[] = { {"mdccsr_el0", MDCCSR_EL0, {}}, {"dbgdtrrx_el0", DBGDTRRX_EL0, {}}, @@ -192,6 +203,7 @@ const AArch64NamedImmMapper::Mapping AArch64SysReg::MRSMapper::MRSMappings[] = { {"id_aa64isar1_el1", ID_A64ISAR1_EL1, {}}, {"id_aa64mmfr0_el1", ID_A64MMFR0_EL1, {}}, {"id_aa64mmfr1_el1", ID_A64MMFR1_EL1, {}}, + {"id_aa64mmfr2_el1", ID_A64MMFR2_EL1, {AArch64::HasV8_2aOps}}, {"mvfr0_el1", MVFR0_EL1, {}}, {"mvfr1_el1", MVFR1_EL1, {}}, {"mvfr2_el1", MVFR2_EL1, {}}, @@ -275,9 +287,6 @@ const AArch64NamedImmMapper::Mapping AArch64SysReg::MSRMapper::MSRMappings[] = { {"icc_sgi1r_el1", ICC_SGI1R_EL1, {}}, {"icc_asgi1r_el1", ICC_ASGI1R_EL1, {}}, {"icc_sgi0r_el1", ICC_SGI0R_EL1, {}}, - - // v8.1a "Privileged Access Never" extension-specific system registers - {"pan", PAN, {AArch64::HasV8_1aOps}}, }; AArch64SysReg::MSRMapper::MSRMapper() { @@ -804,6 +813,24 @@ const AArch64NamedImmMapper::Mapping AArch64SysReg::SysRegMapper::SysRegMappings {"cntv_cval_el02", CNTV_CVAL_EL02, {AArch64::HasV8_1aOps}}, {"spsr_el12", SPSR_EL12, {AArch64::HasV8_1aOps}}, {"elr_el12", ELR_EL12, {AArch64::HasV8_1aOps}}, + + // v8.2a registers + {"uao", UAO, {AArch64::HasV8_2aOps}}, + + // v8.2a "Statistical Profiling extension" registers + {"pmblimitr_el1", PMBLIMITR_EL1, {AArch64::FeatureSPE}}, + {"pmbptr_el1", PMBPTR_EL1, {AArch64::FeatureSPE}}, + {"pmbsr_el1", PMBSR_EL1, {AArch64::FeatureSPE}}, + {"pmbidr_el1", PMBIDR_EL1, {AArch64::FeatureSPE}}, + {"pmscr_el2", PMSCR_EL2, {AArch64::FeatureSPE}}, + {"pmscr_el12", PMSCR_EL12, {AArch64::FeatureSPE}}, + {"pmscr_el1", PMSCR_EL1, {AArch64::FeatureSPE}}, + {"pmsicr_el1", PMSICR_EL1, {AArch64::FeatureSPE}}, + {"pmsirr_el1", PMSIRR_EL1, {AArch64::FeatureSPE}}, + {"pmsfcr_el1", PMSFCR_EL1, {AArch64::FeatureSPE}}, + {"pmsevfr_el1", PMSEVFR_EL1, {AArch64::FeatureSPE}}, + {"pmslatfr_el1", PMSLATFR_EL1, {AArch64::FeatureSPE}}, + {"pmsidr_el1", PMSIDR_EL1, {AArch64::FeatureSPE}}, }; uint32_t diff --git a/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h index 7e42f8e3601e..f649cb9b8a8d 100644 --- a/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h +++ b/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h @@ -337,7 +337,9 @@ namespace AArch64AT { S12E1R = 0x63c4, // 01 100 0111 1000 100 S12E1W = 0x63c5, // 01 100 0111 1000 101 S12E0R = 0x63c6, // 01 100 0111 1000 110 - S12E0W = 0x63c7 // 01 100 0111 1000 111 + S12E0W = 0x63c7, // 01 100 0111 1000 111 + S1E1RP = 0x43c8, // 01 000 0111 1001 000 + S1E1WP = 0x43c9 // 
01 000 0111 1001 001 }; struct ATMapper : AArch64NamedImmMapper { const static Mapping ATMappings[]; ATMapper(); }; @@ -463,6 +465,9 @@ namespace AArch64PState { // v8.1a "Privileged Access Never" extension-specific PStates PAN = 0x04, + + // v8.2a "User Access Override" extension-specific PStates + UAO = 0x03 }; struct PStateMapper : AArch64NamedImmMapper { @@ -473,6 +478,21 @@ namespace AArch64PState { } +namespace AArch64PSBHint { + enum PSBHintValues { + Invalid = -1, + // v8.2a "Statistical Profiling" extension-specific PSB operands + CSync = 0x11, // psb csync = hint #0x11 + }; + + struct PSBHintMapper : AArch64NamedImmMapper { + const static Mapping PSBHintMappings[]; + + PSBHintMapper(); + }; + +} + namespace AArch64SE { enum ShiftExtSpecifiers { Invalid = -1, @@ -594,6 +614,7 @@ namespace AArch64SysReg { ID_A64ISAR1_EL1 = 0xc031, // 11 000 0000 0110 001 ID_A64MMFR0_EL1 = 0xc038, // 11 000 0000 0111 000 ID_A64MMFR1_EL1 = 0xc039, // 11 000 0000 0111 001 + ID_A64MMFR2_EL1 = 0xc03a, // 11 000 0000 0111 010 MVFR0_EL1 = 0xc018, // 11 000 0000 0011 000 MVFR1_EL1 = 0xc019, // 11 000 0000 0011 001 MVFR2_EL1 = 0xc01a, // 11 000 0000 0011 010 @@ -1190,6 +1211,24 @@ namespace AArch64SysReg { SPSR_EL12 = 0xea00, // 11 101 0100 0000 000 ELR_EL12 = 0xea01, // 11 101 0100 0000 001 + // v8.2a registers + UAO = 0xc214, // 11 000 0100 0010 100 + + // v8.2a "Statistical Profiling extension" registers + PMBLIMITR_EL1 = 0xc4d0, // 11 000 1001 1010 000 + PMBPTR_EL1 = 0xc4d1, // 11 000 1001 1010 001 + PMBSR_EL1 = 0xc4d3, // 11 000 1001 1010 011 + PMBIDR_EL1 = 0xc4d7, // 11 000 1001 1010 111 + PMSCR_EL2 = 0xe4c8, // 11 100 1001 1001 000 + PMSCR_EL12 = 0xecc8, // 11 101 1001 1001 000 + PMSCR_EL1 = 0xc4c8, // 11 000 1001 1001 000 + PMSICR_EL1 = 0xc4ca, // 11 000 1001 1001 010 + PMSIRR_EL1 = 0xc4cb, // 11 000 1001 1001 011 + PMSFCR_EL1 = 0xc4cc, // 11 000 1001 1001 100 + PMSEVFR_EL1 = 0xc4cd, // 11 000 1001 1001 101 + PMSLATFR_EL1 = 0xc4ce, // 11 000 1001 1001 110 + PMSIDR_EL1 = 0xc4cf, // 11 000 1001 1001 111 + // Cyclone specific system registers CPM_IOACC_CTL_EL3 = 0xff90, };
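The new Statistical Profiling register constants above follow the 16-bit op0:op1:CRn:CRm:op2 packing spelled out in the adjacent binary comments (2, 3, 4, 4 and 3 bits respectively). A standalone sketch of that layout — the sysReg helper is illustrative only, not an LLVM API — reproduces two of the listed values; compiled with -std=c++14 (for the binary literals), both assertions hold:

#include <cassert>
#include <cstdint>

// op0(2 bits) : op1(3) : CRn(4) : CRm(4) : op2(3), packed into 16 bits.
constexpr uint16_t sysReg(unsigned Op0, unsigned Op1, unsigned CRn,
                          unsigned CRm, unsigned Op2) {
  return static_cast<uint16_t>((Op0 << 14) | (Op1 << 11) | (CRn << 7) |
                               (CRm << 3) | Op2);
}

int main() {
  // PMBLIMITR_EL1 = 0xc4d0, commented as "11 000 1001 1010 000".
  assert(sysReg(0b11, 0b000, 0b1001, 0b1010, 0b000) == 0xc4d0);
  // ID_A64MMFR2_EL1 = 0xc03a, commented as "11 000 0000 0111 010".
  assert(sysReg(0b11, 0b000, 0b0000, 0b0111, 0b010) == 0xc03a);
  return 0;
}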