Diffstat (limited to 'contrib/llvm/lib/Target/AArch64')
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64.td | 43
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64A53Fix835769.cpp | 15
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp | 1
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64AddressTypePromotion.cpp | 8
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp | 17
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64BranchRelaxation.cpp | 27
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64CallingConvention.h | 38
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64CallingConvention.td | 19
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp | 8
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp | 32
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp | 22
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp | 20
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp | 16
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp | 15
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64FastISel.cpp | 221
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp | 93
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.h | 6
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp | 246
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 1446
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.h | 110
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64InstrFormats.td | 1076
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 211
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.h | 12
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.td | 483
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp | 1304
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp | 4
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64MachineCombinerPattern.h | 42
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h | 17
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64PromoteConstant.cpp | 4
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp | 56
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.h | 5
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.td | 2
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64Subtarget.cpp | 31
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64Subtarget.h | 27
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp | 2
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp | 150
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h | 39
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp | 201
-rw-r--r--  contrib/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp | 4
-rw-r--r--  contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp | 46
-rw-r--r--  contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h | 10
-rw-r--r--  contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h | 26
-rw-r--r--  contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp | 5
-rw-r--r--  contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp | 4
-rw-r--r--  contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp | 6
-rw-r--r--  contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h | 8
-rw-r--r--  contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp | 88
-rw-r--r--  contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp | 5
-rw-r--r--  contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h | 2
-rw-r--r--  contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp | 33
-rw-r--r--  contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h | 41
51 files changed, 4510 insertions, 1837 deletions
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64.td b/contrib/llvm/lib/Target/AArch64/AArch64.td
index 9a7d6c884db5..0bff9b592c15 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64.td
+++ b/contrib/llvm/lib/Target/AArch64/AArch64.td
@@ -32,6 +32,15 @@ def FeatureCrypto : SubtargetFeature<"crypto", "HasCrypto", "true",
def FeatureCRC : SubtargetFeature<"crc", "HasCRC", "true",
"Enable ARMv8 CRC-32 checksum instructions">;
+def FeaturePerfMon : SubtargetFeature<"perfmon", "HasPerfMon", "true",
+ "Enable ARMv8 PMUv3 Performance Monitors extension">;
+
+def FeatureFullFP16 : SubtargetFeature<"fullfp16", "HasFullFP16", "true",
+ "Full FP16", [FeatureFPARMv8]>;
+
+def FeatureSPE : SubtargetFeature<"spe", "HasSPE", "true",
+ "Enable Statistical Profiling extension">;
+
/// Cyclone has register move instructions which are "free".
def FeatureZCRegMove : SubtargetFeature<"zcm", "HasZeroCycleRegMove", "true",
"Has zero-cycle register moves">;
@@ -40,6 +49,15 @@ def FeatureZCRegMove : SubtargetFeature<"zcm", "HasZeroCycleRegMove", "true",
def FeatureZCZeroing : SubtargetFeature<"zcz", "HasZeroCycleZeroing", "true",
"Has zero-cycle zeroing instructions">;
+def FeatureStrictAlign : SubtargetFeature<"strict-align",
+ "StrictAlign", "true",
+ "Disallow all unaligned memory "
+ "access">;
+
+def FeatureReserveX18 : SubtargetFeature<"reserve-x18", "ReserveX18", "true",
+ "Reserve X18, making it unavailable "
+ "as a GPR">;
+
//===----------------------------------------------------------------------===//
// Architectures.
//
@@ -47,6 +65,9 @@ def FeatureZCZeroing : SubtargetFeature<"zcz", "HasZeroCycleZeroing", "true",
def HasV8_1aOps : SubtargetFeature<"v8.1a", "HasV8_1aOps", "true",
"Support ARM v8.1a instructions", [FeatureCRC]>;
+def HasV8_2aOps : SubtargetFeature<"v8.2a", "HasV8_2aOps", "true",
+ "Support ARM v8.2a instructions", [HasV8_1aOps]>;
+
//===----------------------------------------------------------------------===//
// Register File Description
//===----------------------------------------------------------------------===//
@@ -70,19 +91,29 @@ include "AArch64SchedA53.td"
include "AArch64SchedA57.td"
include "AArch64SchedCyclone.td"
+def ProcA35 : SubtargetFeature<"a35", "ARMProcFamily", "CortexA35",
+ "Cortex-A35 ARM processors",
+ [FeatureFPARMv8,
+ FeatureNEON,
+ FeatureCrypto,
+ FeatureCRC,
+ FeaturePerfMon]>;
+
def ProcA53 : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53",
"Cortex-A53 ARM processors",
[FeatureFPARMv8,
FeatureNEON,
FeatureCrypto,
- FeatureCRC]>;
+ FeatureCRC,
+ FeaturePerfMon]>;
def ProcA57 : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57",
"Cortex-A57 ARM processors",
[FeatureFPARMv8,
FeatureNEON,
FeatureCrypto,
- FeatureCRC]>;
+ FeatureCRC,
+ FeaturePerfMon]>;
def ProcCyclone : SubtargetFeature<"cyclone", "ARMProcFamily", "Cyclone",
"Cyclone",
@@ -90,12 +121,16 @@ def ProcCyclone : SubtargetFeature<"cyclone", "ARMProcFamily", "Cyclone",
FeatureNEON,
FeatureCrypto,
FeatureCRC,
+ FeaturePerfMon,
FeatureZCRegMove, FeatureZCZeroing]>;
def : ProcessorModel<"generic", NoSchedModel, [FeatureFPARMv8,
FeatureNEON,
- FeatureCRC]>;
+ FeatureCRC,
+ FeaturePerfMon]>;
+// FIXME: Cortex-A35 is currently modelled as a Cortex-A53
+def : ProcessorModel<"cortex-a35", CortexA53Model, [ProcA35]>;
def : ProcessorModel<"cortex-a53", CortexA53Model, [ProcA53]>;
def : ProcessorModel<"cortex-a57", CortexA57Model, [ProcA57]>;
// FIXME: Cortex-A72 is currently modelled as an Cortex-A57.
@@ -109,11 +144,13 @@ def : ProcessorModel<"cyclone", CycloneModel, [ProcCyclone]>;
def GenericAsmParserVariant : AsmParserVariant {
int Variant = 0;
string Name = "generic";
+ string BreakCharacters = ".";
}
def AppleAsmParserVariant : AsmParserVariant {
int Variant = 1;
string Name = "apple-neon";
+ string BreakCharacters = ".";
}
//===----------------------------------------------------------------------===//
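The subtarget features introduced above ("perfmon", "fullfp16", "spe", "strict-align", "reserve-x18") and the new cortex-a35 processor model become selectable through the standard -mcpu/-mattr feature strings. As a minimal sketch of how this part of the import can be exercised (the input file name and the particular attribute combination are placeholders, not taken from this diff):

    llc -mtriple=aarch64 -mcpu=cortex-a35 -mattr=+fullfp16,+spe input.ll -o -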
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64A53Fix835769.cpp b/contrib/llvm/lib/Target/AArch64/AArch64A53Fix835769.cpp
index d7ef3f4ef653..d215d9e831c0 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64A53Fix835769.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64A53Fix835769.cpp
@@ -122,7 +122,7 @@ AArch64A53Fix835769::runOnMachineFunction(MachineFunction &F) {
static MachineBasicBlock *getBBFallenThrough(MachineBasicBlock *MBB,
const TargetInstrInfo *TII) {
// Get the previous machine basic block in the function.
- MachineFunction::iterator MBBI = *MBB;
+ MachineFunction::iterator MBBI(MBB);
// Can't go off top of function.
if (MBBI == MBB->getParent()->begin())
@@ -131,7 +131,7 @@ static MachineBasicBlock *getBBFallenThrough(MachineBasicBlock *MBB,
MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
SmallVector<MachineOperand, 2> Cond;
- MachineBasicBlock *PrevBB = std::prev(MBBI);
+ MachineBasicBlock *PrevBB = &*std::prev(MBBI);
for (MachineBasicBlock *S : MBB->predecessors())
if (S == PrevBB && !TII->AnalyzeBranch(*PrevBB, TBB, FBB, Cond) &&
!TBB && !FBB)
@@ -151,10 +151,9 @@ static MachineInstr *getLastNonPseudo(MachineBasicBlock &MBB,
// If there is no non-pseudo in the current block, loop back around and try
// the previous block (if there is one).
while ((FMBB = getBBFallenThrough(FMBB, TII))) {
- for (auto I = FMBB->rbegin(), E = FMBB->rend(); I != E; ++I) {
- if (!I->isPseudo())
- return &*I;
- }
+ for (MachineInstr &I : make_range(FMBB->rbegin(), FMBB->rend()))
+ if (!I.isPseudo())
+ return &I;
}
// There was no previous non-pseudo in the fallen through blocks
@@ -217,8 +216,8 @@ AArch64A53Fix835769::runOnBasicBlock(MachineBasicBlock &MBB) {
++Idx;
}
- DEBUG(dbgs() << "Scan complete, "<< Sequences.size()
- << " occurences of pattern found.\n");
+ DEBUG(dbgs() << "Scan complete, " << Sequences.size()
+ << " occurrences of pattern found.\n");
// Then update the basic block, inserting nops between the detected sequences.
for (auto &MI : Sequences) {
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp b/contrib/llvm/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp
index 9d6dbd641a16..79a84ad8c6c5 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp
@@ -593,7 +593,6 @@ bool AArch64A57FPLoadBalancing::colorChain(Chain *G, Color C,
if (Change) {
Substs[MO.getReg()] = Reg;
MO.setReg(Reg);
- MRI->setPhysRegUsed(Reg);
Changed = true;
}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64AddressTypePromotion.cpp b/contrib/llvm/lib/Target/AArch64/AArch64AddressTypePromotion.cpp
index 716e1a37b1f7..3afcdfb8b930 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64AddressTypePromotion.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64AddressTypePromotion.cpp
@@ -57,6 +57,8 @@ EnableMerge("aarch64-type-promotion-merge", cl::Hidden,
" the other."),
cl::init(true));
+#define AARCH64_TYPE_PROMO_NAME "AArch64 Address Type Promotion"
+
//===----------------------------------------------------------------------===//
// AArch64AddressTypePromotion
//===----------------------------------------------------------------------===//
@@ -76,7 +78,7 @@ public:
}
const char *getPassName() const override {
- return "AArch64 Address Type Promotion";
+ return AARCH64_TYPE_PROMO_NAME;
}
/// Iterate over the functions and promote the computation of interesting
@@ -143,10 +145,10 @@ private:
char AArch64AddressTypePromotion::ID = 0;
INITIALIZE_PASS_BEGIN(AArch64AddressTypePromotion, "aarch64-type-promotion",
- "AArch64 Type Promotion Pass", false, false)
+ AARCH64_TYPE_PROMO_NAME, false, false)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_END(AArch64AddressTypePromotion, "aarch64-type-promotion",
- "AArch64 Type Promotion Pass", false, false)
+ AARCH64_TYPE_PROMO_NAME, false, false)
FunctionPass *llvm::createAArch64AddressTypePromotionPass() {
return new AArch64AddressTypePromotion();
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp b/contrib/llvm/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp
index 18d21fd38618..1644d71d2821 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp
@@ -61,6 +61,12 @@ STATISTIC(NumScalarInsnsUsed, "Number of scalar instructions used");
STATISTIC(NumCopiesDeleted, "Number of cross-class copies deleted");
STATISTIC(NumCopiesInserted, "Number of cross-class copies inserted");
+namespace llvm {
+void initializeAArch64AdvSIMDScalarPass(PassRegistry &);
+}
+
+#define AARCH64_ADVSIMD_NAME "AdvSIMD Scalar Operation Optimization"
+
namespace {
class AArch64AdvSIMDScalar : public MachineFunctionPass {
MachineRegisterInfo *MRI;
@@ -82,12 +88,14 @@ private:
public:
static char ID; // Pass identification, replacement for typeid.
- explicit AArch64AdvSIMDScalar() : MachineFunctionPass(ID) {}
+ explicit AArch64AdvSIMDScalar() : MachineFunctionPass(ID) {
+ initializeAArch64AdvSIMDScalarPass(*PassRegistry::getPassRegistry());
+ }
bool runOnMachineFunction(MachineFunction &F) override;
const char *getPassName() const override {
- return "AdvSIMD Scalar Operation Optimization";
+ return AARCH64_ADVSIMD_NAME;
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
@@ -98,6 +106,9 @@ public:
char AArch64AdvSIMDScalar::ID = 0;
} // end anonymous namespace
+INITIALIZE_PASS(AArch64AdvSIMDScalar, "aarch64-simd-scalar",
+ AARCH64_ADVSIMD_NAME, false, false)
+
static bool isGPR64(unsigned Reg, unsigned SubReg,
const MachineRegisterInfo *MRI) {
if (SubReg)
@@ -381,7 +392,7 @@ bool AArch64AdvSIMDScalar::runOnMachineFunction(MachineFunction &mf) {
// Just check things on a one-block-at-a-time basis.
for (MachineFunction::iterator I = mf.begin(), E = mf.end(); I != E; ++I)
- if (processMachineBasicBlock(I))
+ if (processMachineBasicBlock(&*I))
Changed = true;
return Changed;
}
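Several passes in this import (AArch64AdvSIMDScalar above, and AArch64BranchRelaxation, AArch64DeadRegisterDefinitions and AArch64ExpandPseudo below) switch to the same registration idiom: a name macro, an initializeXPass() call in the constructor, and an INITIALIZE_PASS at file scope. A minimal sketch of that idiom, using a hypothetical pass name rather than code from this diff, looks like this:

    #include "llvm/CodeGen/MachineFunctionPass.h"
    #include "llvm/Pass.h"

    using namespace llvm;

    #define EXAMPLE_PASS_NAME "Example machine pass"

    namespace llvm {
    void initializeExamplePassPass(PassRegistry &);
    }

    namespace {
    class ExamplePass : public MachineFunctionPass {
    public:
      static char ID;
      ExamplePass() : MachineFunctionPass(ID) {
        // Register with the PassRegistry even when the pass is constructed
        // directly by the target rather than looked up by name.
        initializeExamplePassPass(*PassRegistry::getPassRegistry());
      }
      const char *getPassName() const override { return EXAMPLE_PASS_NAME; }
      bool runOnMachineFunction(MachineFunction &MF) override { return false; }
    };
    } // end anonymous namespace

    char ExamplePass::ID = 0;

    INITIALIZE_PASS(ExamplePass, "example-pass", EXAMPLE_PASS_NAME, false, false)

Having the pass registered by name is what lets options that look passes up by their command-line identifier (for example the print-before/print-after machinery) find these target passes, which appears to be the motivation for the change.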
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64BranchRelaxation.cpp b/contrib/llvm/lib/Target/AArch64/AArch64BranchRelaxation.cpp
index d973234dd86a..a614f555a4e9 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64BranchRelaxation.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64BranchRelaxation.cpp
@@ -45,6 +45,12 @@ BCCDisplacementBits("aarch64-bcc-offset-bits", cl::Hidden, cl::init(19),
STATISTIC(NumSplit, "Number of basic blocks split");
STATISTIC(NumRelaxed, "Number of conditional branches relaxed");
+namespace llvm {
+void initializeAArch64BranchRelaxationPass(PassRegistry &);
+}
+
+#define AARCH64_BR_RELAX_NAME "AArch64 branch relaxation pass"
+
namespace {
class AArch64BranchRelaxation : public MachineFunctionPass {
/// BasicBlockInfo - Information about the offset and size of a single
@@ -93,17 +99,22 @@ class AArch64BranchRelaxation : public MachineFunctionPass {
public:
static char ID;
- AArch64BranchRelaxation() : MachineFunctionPass(ID) {}
+ AArch64BranchRelaxation() : MachineFunctionPass(ID) {
+ initializeAArch64BranchRelaxationPass(*PassRegistry::getPassRegistry());
+ }
bool runOnMachineFunction(MachineFunction &MF) override;
const char *getPassName() const override {
- return "AArch64 branch relaxation pass";
+ return AARCH64_BR_RELAX_NAME;
}
};
char AArch64BranchRelaxation::ID = 0;
}
+INITIALIZE_PASS(AArch64BranchRelaxation, "aarch64-branch-relax",
+ AARCH64_BR_RELAX_NAME, false, false)
+
/// verify - check BBOffsets, BBSizes, alignment of islands
void AArch64BranchRelaxation::verify() {
#ifndef NDEBUG
@@ -131,14 +142,14 @@ void AArch64BranchRelaxation::dumpBBs() {
/// into the block immediately after it.
static bool BBHasFallthrough(MachineBasicBlock *MBB) {
// Get the next machine basic block in the function.
- MachineFunction::iterator MBBI = MBB;
+ MachineFunction::iterator MBBI(MBB);
// Can't fall off end of function.
- MachineBasicBlock *NextBB = std::next(MBBI);
+ auto NextBB = std::next(MBBI);
if (NextBB == MBB->getParent()->end())
return false;
for (MachineBasicBlock *S : MBB->successors())
- if (S == NextBB)
+ if (S == &*NextBB)
return true;
return false;
@@ -216,9 +227,7 @@ AArch64BranchRelaxation::splitBlockBeforeInstr(MachineInstr *MI) {
// Create a new MBB for the code after the OrigBB.
MachineBasicBlock *NewBB =
MF->CreateMachineBasicBlock(OrigBB->getBasicBlock());
- MachineFunction::iterator MBBI = OrigBB;
- ++MBBI;
- MF->insert(MBBI, NewBB);
+ MF->insert(++OrigBB->getIterator(), NewBB);
// Splice the instructions starting with MI over to NewBB.
NewBB->splice(NewBB->end(), OrigBB, MI, OrigBB->end());
@@ -421,7 +430,7 @@ bool AArch64BranchRelaxation::fixupConditionalBranch(MachineInstr *MI) {
MBB->replaceSuccessor(FBB, NewBB);
NewBB->addSuccessor(FBB);
}
- MachineBasicBlock *NextBB = std::next(MachineFunction::iterator(MBB));
+ MachineBasicBlock *NextBB = &*std::next(MachineFunction::iterator(MBB));
DEBUG(dbgs() << " Insert B to BB#" << DestBB->getNumber()
<< ", invert condition and change dest. to BB#"
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64CallingConvention.h b/contrib/llvm/lib/Target/AArch64/AArch64CallingConvention.h
index 1e2d1c3b93bd..bc44bc5f2461 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64CallingConvention.h
+++ b/contrib/llvm/lib/Target/AArch64/AArch64CallingConvention.h
@@ -25,30 +25,28 @@
namespace {
using namespace llvm;
-static const uint16_t XRegList[] = {AArch64::X0, AArch64::X1, AArch64::X2,
- AArch64::X3, AArch64::X4, AArch64::X5,
- AArch64::X6, AArch64::X7};
-static const uint16_t HRegList[] = {AArch64::H0, AArch64::H1, AArch64::H2,
- AArch64::H3, AArch64::H4, AArch64::H5,
- AArch64::H6, AArch64::H7};
-static const uint16_t SRegList[] = {AArch64::S0, AArch64::S1, AArch64::S2,
- AArch64::S3, AArch64::S4, AArch64::S5,
- AArch64::S6, AArch64::S7};
-static const uint16_t DRegList[] = {AArch64::D0, AArch64::D1, AArch64::D2,
- AArch64::D3, AArch64::D4, AArch64::D5,
- AArch64::D6, AArch64::D7};
-static const uint16_t QRegList[] = {AArch64::Q0, AArch64::Q1, AArch64::Q2,
- AArch64::Q3, AArch64::Q4, AArch64::Q5,
- AArch64::Q6, AArch64::Q7};
+static const MCPhysReg XRegList[] = {AArch64::X0, AArch64::X1, AArch64::X2,
+ AArch64::X3, AArch64::X4, AArch64::X5,
+ AArch64::X6, AArch64::X7};
+static const MCPhysReg HRegList[] = {AArch64::H0, AArch64::H1, AArch64::H2,
+ AArch64::H3, AArch64::H4, AArch64::H5,
+ AArch64::H6, AArch64::H7};
+static const MCPhysReg SRegList[] = {AArch64::S0, AArch64::S1, AArch64::S2,
+ AArch64::S3, AArch64::S4, AArch64::S5,
+ AArch64::S6, AArch64::S7};
+static const MCPhysReg DRegList[] = {AArch64::D0, AArch64::D1, AArch64::D2,
+ AArch64::D3, AArch64::D4, AArch64::D5,
+ AArch64::D6, AArch64::D7};
+static const MCPhysReg QRegList[] = {AArch64::Q0, AArch64::Q1, AArch64::Q2,
+ AArch64::Q3, AArch64::Q4, AArch64::Q5,
+ AArch64::Q6, AArch64::Q7};
static bool finishStackBlock(SmallVectorImpl<CCValAssign> &PendingMembers,
MVT LocVT, ISD::ArgFlagsTy &ArgFlags,
CCState &State, unsigned SlotAlign) {
unsigned Size = LocVT.getSizeInBits() / 8;
- unsigned StackAlign = State.getMachineFunction()
- .getTarget()
- .getDataLayout()
- ->getStackAlignment();
+ unsigned StackAlign =
+ State.getMachineFunction().getDataLayout().getStackAlignment();
unsigned Align = std::min(ArgFlags.getOrigAlign(), StackAlign);
for (auto &It : PendingMembers) {
@@ -88,7 +86,7 @@ static bool CC_AArch64_Custom_Block(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
ISD::ArgFlagsTy &ArgFlags, CCState &State) {
// Try to allocate a contiguous block of registers, each of the correct
// size to hold one member.
- ArrayRef<uint16_t> RegList;
+ ArrayRef<MCPhysReg> RegList;
if (LocVT.SimpleTy == MVT::i64)
RegList = XRegList;
else if (LocVT.SimpleTy == MVT::f16)
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64CallingConvention.td b/contrib/llvm/lib/Target/AArch64/AArch64CallingConvention.td
index 815ebef177d8..388d64ec4e99 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64CallingConvention.td
+++ b/contrib/llvm/lib/Target/AArch64/AArch64CallingConvention.td
@@ -16,7 +16,7 @@ class CCIfAlign<string Align, CCAction A> :
CCIf<!strconcat("ArgFlags.getOrigAlign() == ", Align), A>;
/// CCIfBigEndian - Match only if we're in big endian mode.
class CCIfBigEndian<CCAction A> :
- CCIf<"State.getMachineFunction().getTarget().getDataLayout()->isBigEndian()", A>;
+ CCIf<"State.getMachineFunction().getDataLayout().isBigEndian()", A>;
//===----------------------------------------------------------------------===//
// ARM AAPCS64 Calling Convention
@@ -279,6 +279,23 @@ def CSR_AArch64_TLS_Darwin
FP,
(sequence "Q%u", 0, 31))>;
+// We can only handle a register pair with adjacent registers, the register pair
+// should belong to the same class as well. Since the access function on the
+// fast path calls a function that follows CSR_AArch64_TLS_Darwin,
+// CSR_AArch64_CXX_TLS_Darwin should be a subset of CSR_AArch64_TLS_Darwin.
+def CSR_AArch64_CXX_TLS_Darwin
+ : CalleeSavedRegs<(add CSR_AArch64_AAPCS,
+ (sub (sequence "X%u", 1, 28), X15, X16, X17, X18),
+ (sequence "D%u", 0, 31))>;
+
+// CSRs that are handled by prologue, epilogue.
+def CSR_AArch64_CXX_TLS_Darwin_PE
+ : CalleeSavedRegs<(add LR, FP)>;
+
+// CSRs that are handled explicitly via copies.
+def CSR_AArch64_CXX_TLS_Darwin_ViaCopy
+ : CalleeSavedRegs<(sub CSR_AArch64_CXX_TLS_Darwin, LR, FP)>;
+
// The ELF stub used for TLS-descriptor access saves every feasible
// register. Only X0 and LR are clobbered.
def CSR_AArch64_TLS_ELF
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp b/contrib/llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp
index 06ff9af37fd7..9310ac4a44a2 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp
@@ -117,10 +117,10 @@ struct LDTLSCleanup : public MachineFunctionPass {
*TLSBaseAddrReg = RegInfo.createVirtualRegister(&AArch64::GPR64RegClass);
// Insert a copy from X0 to TLSBaseAddrReg for later.
- MachineInstr *Next = I->getNextNode();
- MachineInstr *Copy = BuildMI(*I->getParent(), Next, I->getDebugLoc(),
- TII->get(TargetOpcode::COPY),
- *TLSBaseAddrReg).addReg(AArch64::X0);
+ MachineInstr *Copy =
+ BuildMI(*I->getParent(), ++I->getIterator(), I->getDebugLoc(),
+ TII->get(TargetOpcode::COPY), *TLSBaseAddrReg)
+ .addReg(AArch64::X0);
return Copy;
}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp b/contrib/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp
index efdb2e33a36e..78c239b11ef3 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp
@@ -168,6 +168,8 @@ namespace llvm {
void initializeAArch64CollectLOHPass(PassRegistry &);
}
+#define AARCH64_COLLECT_LOH_NAME "AArch64 Collect Linker Optimization Hint (LOH)"
+
namespace {
struct AArch64CollectLOH : public MachineFunctionPass {
static char ID;
@@ -178,7 +180,7 @@ struct AArch64CollectLOH : public MachineFunctionPass {
bool runOnMachineFunction(MachineFunction &MF) override;
const char *getPassName() const override {
- return "AArch64 Collect Linker Optimization Hint (LOH)";
+ return AARCH64_COLLECT_LOH_NAME;
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
@@ -220,12 +222,10 @@ typedef SmallVector<unsigned, 32> MapIdToReg;
char AArch64CollectLOH::ID = 0;
INITIALIZE_PASS_BEGIN(AArch64CollectLOH, "aarch64-collect-loh",
- "AArch64 Collect Linker Optimization Hint (LOH)", false,
- false)
+ AARCH64_COLLECT_LOH_NAME, false, false)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
INITIALIZE_PASS_END(AArch64CollectLOH, "aarch64-collect-loh",
- "AArch64 Collect Linker Optimization Hint (LOH)", false,
- false)
+ AARCH64_COLLECT_LOH_NAME, false, false)
/// Given a couple (MBB, reg) get the corresponding set of instruction from
/// the given "sets".
@@ -353,9 +353,17 @@ static void initReachingDef(const MachineFunction &MF,
for (MCRegAliasIterator AI(CurReg, TRI, true); AI.isValid(); ++AI) {
MapRegToId::const_iterator ItRegId = RegToId.find(*AI);
- assert(ItRegId != RegToId.end() &&
- "Sub-register of an "
- "involved register, not recorded as involved!");
+ // If this alias has not been recorded, then it is not interesting
+ // for the current analysis.
+ // We can end up in this situation because of tuple registers.
+ // E.g., Let say we are interested in S1. When we register
+ // S1, we will also register its aliases and in particular
+ // the tuple Q1_Q2.
+ // Now, when we encounter Q1_Q2, we will look through its aliases
+ // and will find that S2 is not registered.
+ if (ItRegId == RegToId.end())
+ continue;
+
BBKillSet.set(ItRegId->second);
BBGen[ItRegId->second] = &MI;
}
@@ -523,6 +531,8 @@ static bool isCandidateStore(const MachineInstr *Instr) {
switch (Instr->getOpcode()) {
default:
return false;
+ case AArch64::STRBBui:
+ case AArch64::STRHHui:
case AArch64::STRBui:
case AArch64::STRHui:
case AArch64::STRWui:
@@ -884,7 +894,8 @@ static void computeOthers(const InstrToInstrs &UseToDefs,
bool IsL2Add = (ImmediateDefOpc == AArch64::ADDXri);
// If the chain is three instructions long and ldr is the second element,
// then this ldr must load form GOT, otherwise this is not a correct chain.
- if (L2 && !IsL2Add && L2->getOperand(2).getTargetFlags() != AArch64II::MO_GOT)
+ if (L2 && !IsL2Add &&
+ !(L2->getOperand(2).getTargetFlags() & AArch64II::MO_GOT))
continue;
SmallVector<const MachineInstr *, 3> Args;
MCLOHType Kind;
@@ -1000,7 +1011,8 @@ static void collectInvolvedReg(const MachineFunction &MF, MapRegToId &RegToId,
DEBUG(dbgs() << "** Collect Involved Register\n");
for (const auto &MBB : MF) {
for (const MachineInstr &MI : MBB) {
- if (!canDefBePartOfLOH(&MI))
+ if (!canDefBePartOfLOH(&MI) &&
+ !isCandidateLoad(&MI) && !isCandidateStore(&MI))
continue;
// Process defs
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp b/contrib/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp
index b9e41c61defe..fc27bfee73d1 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp
@@ -59,6 +59,7 @@
//===----------------------------------------------------------------------===//
#include "AArch64.h"
+#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
@@ -153,13 +154,20 @@ MachineInstr *AArch64ConditionOptimizer::findSuitableCompare(
case AArch64::SUBSXri:
// cmn is an alias for adds with a dead destination register.
case AArch64::ADDSWri:
- case AArch64::ADDSXri:
- if (MRI->use_empty(I->getOperand(0).getReg()))
- return I;
-
- DEBUG(dbgs() << "Destination of cmp is not dead, " << *I << '\n');
- return nullptr;
-
+ case AArch64::ADDSXri: {
+ unsigned ShiftAmt = AArch64_AM::getShiftValue(I->getOperand(3).getImm());
+ if (!I->getOperand(2).isImm()) {
+ DEBUG(dbgs() << "Immediate of cmp is symbolic, " << *I << '\n');
+ return nullptr;
+ } else if (I->getOperand(2).getImm() << ShiftAmt >= 0xfff) {
+ DEBUG(dbgs() << "Immediate of cmp may be out of range, " << *I << '\n');
+ return nullptr;
+ } else if (!MRI->use_empty(I->getOperand(0).getReg())) {
+ DEBUG(dbgs() << "Destination of cmp is not dead, " << *I << '\n');
+ return nullptr;
+ }
+ return I;
+ }
// Prevent false positive case like:
// cmp w19, #0
// cinc w0, w19, gt
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp b/contrib/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp
index 2b0c92fe02d5..df1320fbd4c9 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp
@@ -353,7 +353,7 @@ MachineInstr *SSACCmpConv::findConvertibleCompare(MachineBasicBlock *MBB) {
MIOperands::PhysRegInfo PRI =
MIOperands(I).analyzePhysReg(AArch64::NZCV, TRI);
- if (PRI.Reads) {
+ if (PRI.Read) {
// The ccmp doesn't produce exactly the same flags as the original
// compare, so reject the transform if there are uses of the flags
// besides the terminators.
@@ -362,7 +362,7 @@ MachineInstr *SSACCmpConv::findConvertibleCompare(MachineBasicBlock *MBB) {
return nullptr;
}
- if (PRI.Clobbers) {
+ if (PRI.Defined || PRI.Clobbered) {
DEBUG(dbgs() << "Not convertible compare: " << *I);
++NumUnknNZCVDefs;
return nullptr;
@@ -567,8 +567,8 @@ void SSACCmpConv::convert(SmallVectorImpl<MachineBasicBlock *> &RemovedBlocks) {
// All CmpBB instructions are moved into Head, and CmpBB is deleted.
// Update the CFG first.
updateTailPHIs();
- Head->removeSuccessor(CmpBB);
- CmpBB->removeSuccessor(Tail);
+ Head->removeSuccessor(CmpBB, true);
+ CmpBB->removeSuccessor(Tail, true);
Head->transferSuccessorsAndUpdatePHIs(CmpBB);
DebugLoc TermDL = Head->getFirstTerminator()->getDebugLoc();
TII->RemoveBranch(*Head);
@@ -786,13 +786,13 @@ void AArch64ConditionalCompares::updateDomTree(
// convert() removes CmpBB which was previously dominated by Head.
// CmpBB children should be transferred to Head.
MachineDomTreeNode *HeadNode = DomTree->getNode(CmpConv.Head);
- for (unsigned i = 0, e = Removed.size(); i != e; ++i) {
- MachineDomTreeNode *Node = DomTree->getNode(Removed[i]);
+ for (MachineBasicBlock *RemovedMBB : Removed) {
+ MachineDomTreeNode *Node = DomTree->getNode(RemovedMBB);
assert(Node != HeadNode && "Cannot erase the head node");
assert(Node->getIDom() == HeadNode && "CmpBB should be dominated by Head");
while (Node->getNumChildren())
DomTree->changeImmediateDominator(Node->getChildren().back(), HeadNode);
- DomTree->eraseNode(Removed[i]);
+ DomTree->eraseNode(RemovedMBB);
}
}
@@ -801,8 +801,8 @@ void
AArch64ConditionalCompares::updateLoops(ArrayRef<MachineBasicBlock *> Removed) {
if (!Loops)
return;
- for (unsigned i = 0, e = Removed.size(); i != e; ++i)
- Loops->removeBlock(Removed[i]);
+ for (MachineBasicBlock *RemovedMBB : Removed)
+ Loops->removeBlock(RemovedMBB);
}
/// Invalidate MachineTraceMetrics before if-conversion.
@@ -899,7 +899,7 @@ bool AArch64ConditionalCompares::runOnMachineFunction(MachineFunction &MF) {
Loops = getAnalysisIfAvailable<MachineLoopInfo>();
Traces = &getAnalysis<MachineTraceMetrics>();
MinInstr = nullptr;
- MinSize = MF.getFunction()->hasFnAttribute(Attribute::MinSize);
+ MinSize = MF.getFunction()->optForMinSize();
bool Changed = false;
CmpConv.runOnMachineFunction(MF);
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp b/contrib/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp
index 74fc167433f6..576cf4a74167 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp
@@ -26,6 +26,12 @@ using namespace llvm;
STATISTIC(NumDeadDefsReplaced, "Number of dead definitions replaced");
+namespace llvm {
+void initializeAArch64DeadRegisterDefinitionsPass(PassRegistry &);
+}
+
+#define AARCH64_DEAD_REG_DEF_NAME "AArch64 Dead register definitions"
+
namespace {
class AArch64DeadRegisterDefinitions : public MachineFunctionPass {
private:
@@ -35,11 +41,14 @@ private:
bool usesFrameIndex(const MachineInstr &MI);
public:
static char ID; // Pass identification, replacement for typeid.
- explicit AArch64DeadRegisterDefinitions() : MachineFunctionPass(ID) {}
+ explicit AArch64DeadRegisterDefinitions() : MachineFunctionPass(ID) {
+ initializeAArch64DeadRegisterDefinitionsPass(
+ *PassRegistry::getPassRegistry());
+ }
bool runOnMachineFunction(MachineFunction &F) override;
- const char *getPassName() const override { return "Dead register definitions"; }
+ const char *getPassName() const override { return AARCH64_DEAD_REG_DEF_NAME; }
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
@@ -49,6 +58,9 @@ public:
char AArch64DeadRegisterDefinitions::ID = 0;
} // end anonymous namespace
+INITIALIZE_PASS(AArch64DeadRegisterDefinitions, "aarch64-dead-defs",
+ AARCH64_DEAD_REG_DEF_NAME, false, false)
+
bool AArch64DeadRegisterDefinitions::implicitlyDefinesOverlappingReg(
unsigned Reg, const MachineInstr &MI) {
for (const MachineOperand &MO : MI.implicit_operands())
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/contrib/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index c2470f747a38..d24e42a93763 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -22,18 +22,26 @@
#include "llvm/Support/MathExtras.h"
using namespace llvm;
+namespace llvm {
+void initializeAArch64ExpandPseudoPass(PassRegistry &);
+}
+
+#define AARCH64_EXPAND_PSEUDO_NAME "AArch64 pseudo instruction expansion pass"
+
namespace {
class AArch64ExpandPseudo : public MachineFunctionPass {
public:
static char ID;
- AArch64ExpandPseudo() : MachineFunctionPass(ID) {}
+ AArch64ExpandPseudo() : MachineFunctionPass(ID) {
+ initializeAArch64ExpandPseudoPass(*PassRegistry::getPassRegistry());
+ }
const AArch64InstrInfo *TII;
bool runOnMachineFunction(MachineFunction &Fn) override;
const char *getPassName() const override {
- return "AArch64 pseudo instruction expansion pass";
+ return AARCH64_EXPAND_PSEUDO_NAME;
}
private:
@@ -45,6 +53,9 @@ private:
char AArch64ExpandPseudo::ID = 0;
}
+INITIALIZE_PASS(AArch64ExpandPseudo, "aarch64-expand-pseudo",
+ AARCH64_EXPAND_PSEUDO_NAME, false, false)
+
/// \brief Transfer implicit operands on the pseudo instruction to the
/// instructions created from the expansion.
static void transferImpOps(MachineInstr &OldMI, MachineInstrBuilder &UseMI,
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64FastISel.cpp b/contrib/llvm/lib/Target/AArch64/AArch64FastISel.cpp
index 072819836bb3..0ac4b39b0357 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64FastISel.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64FastISel.cpp
@@ -523,7 +523,7 @@ bool AArch64FastISel::computeAddress(const Value *Obj, Address &Addr, Type *Ty)
U = C;
}
- if (const PointerType *Ty = dyn_cast<PointerType>(Obj->getType()))
+ if (auto *Ty = dyn_cast<PointerType>(Obj->getType()))
if (Ty->getAddressSpace() > 255)
// Fast instruction selection doesn't support the special
// address spaces.
@@ -969,7 +969,7 @@ bool AArch64FastISel::simplifyAddress(Address &Addr, MVT VT) {
// Cannot encode an offset register and an immediate offset in the same
// instruction. Fold the immediate offset into the load/store instruction and
- // emit an additonal add to take care of the offset register.
+ // emit an additional add to take care of the offset register.
if (!ImmediateOffsetNeedsLowering && Addr.getOffset() && Addr.getOffsetReg())
RegisterOffsetNeedsLowering = true;
@@ -1058,8 +1058,8 @@ void AArch64FastISel::addLoadStoreOperands(Address &Addr,
// FIXME: We shouldn't be using getObjectSize/getObjectAlignment. The size
// and alignment should be based on the VT.
MMO = FuncInfo.MF->getMachineMemOperand(
- MachinePointerInfo::getFixedStack(FI, Offset), Flags,
- MFI.getObjectSize(FI), MFI.getObjectAlignment(FI));
+ MachinePointerInfo::getFixedStack(*FuncInfo.MF, FI, Offset), Flags,
+ MFI.getObjectSize(FI), MFI.getObjectAlignment(FI));
// Now add the rest of the operands.
MIB.addFrameIndex(FI).addImm(Offset);
} else {
@@ -1178,7 +1178,7 @@ unsigned AArch64FastISel::emitAddSub(bool UseAdd, MVT RetVT, const Value *LHS,
}
// Check if the mul can be folded into the instruction.
- if (RHS->hasOneUse() && isValueAvailable(RHS))
+ if (RHS->hasOneUse() && isValueAvailable(RHS)) {
if (isMulPowOf2(RHS)) {
const Value *MulLHS = cast<MulOperator>(RHS)->getOperand(0);
const Value *MulRHS = cast<MulOperator>(RHS)->getOperand(1);
@@ -1193,12 +1193,16 @@ unsigned AArch64FastISel::emitAddSub(bool UseAdd, MVT RetVT, const Value *LHS,
if (!RHSReg)
return 0;
bool RHSIsKill = hasTrivialKill(MulLHS);
- return emitAddSub_rs(UseAdd, RetVT, LHSReg, LHSIsKill, RHSReg, RHSIsKill,
- AArch64_AM::LSL, ShiftVal, SetFlags, WantResult);
+ ResultReg = emitAddSub_rs(UseAdd, RetVT, LHSReg, LHSIsKill, RHSReg,
+ RHSIsKill, AArch64_AM::LSL, ShiftVal, SetFlags,
+ WantResult);
+ if (ResultReg)
+ return ResultReg;
}
+ }
// Check if the shift can be folded into the instruction.
- if (RHS->hasOneUse() && isValueAvailable(RHS))
+ if (RHS->hasOneUse() && isValueAvailable(RHS)) {
if (const auto *SI = dyn_cast<BinaryOperator>(RHS)) {
if (const auto *C = dyn_cast<ConstantInt>(SI->getOperand(1))) {
AArch64_AM::ShiftExtendType ShiftType = AArch64_AM::InvalidShiftExtend;
@@ -1214,12 +1218,15 @@ unsigned AArch64FastISel::emitAddSub(bool UseAdd, MVT RetVT, const Value *LHS,
if (!RHSReg)
return 0;
bool RHSIsKill = hasTrivialKill(SI->getOperand(0));
- return emitAddSub_rs(UseAdd, RetVT, LHSReg, LHSIsKill, RHSReg,
- RHSIsKill, ShiftType, ShiftVal, SetFlags,
- WantResult);
+ ResultReg = emitAddSub_rs(UseAdd, RetVT, LHSReg, LHSIsKill, RHSReg,
+ RHSIsKill, ShiftType, ShiftVal, SetFlags,
+ WantResult);
+ if (ResultReg)
+ return ResultReg;
}
}
}
+ }
unsigned RHSReg = getRegForValue(RHS);
if (!RHSReg)
@@ -1323,6 +1330,10 @@ unsigned AArch64FastISel::emitAddSub_rs(bool UseAdd, MVT RetVT, unsigned LHSReg,
if (RetVT != MVT::i32 && RetVT != MVT::i64)
return 0;
+ // Don't deal with undefined shifts.
+ if (ShiftImm >= RetVT.getSizeInBits())
+ return 0;
+
static const unsigned OpcTable[2][2][2] = {
{ { AArch64::SUBWrs, AArch64::SUBXrs },
{ AArch64::ADDWrs, AArch64::ADDXrs } },
@@ -1360,6 +1371,9 @@ unsigned AArch64FastISel::emitAddSub_rx(bool UseAdd, MVT RetVT, unsigned LHSReg,
if (RetVT != MVT::i32 && RetVT != MVT::i64)
return 0;
+ if (ShiftImm >= 4)
+ return 0;
+
static const unsigned OpcTable[2][2][2] = {
{ { AArch64::SUBWrx, AArch64::SUBXrx },
{ AArch64::ADDWrx, AArch64::ADDXrx } },
@@ -1542,7 +1556,7 @@ unsigned AArch64FastISel::emitLogicalOp(unsigned ISDOpc, MVT RetVT,
return ResultReg;
// Check if the mul can be folded into the instruction.
- if (RHS->hasOneUse() && isValueAvailable(RHS))
+ if (RHS->hasOneUse() && isValueAvailable(RHS)) {
if (isMulPowOf2(RHS)) {
const Value *MulLHS = cast<MulOperator>(RHS)->getOperand(0);
const Value *MulRHS = cast<MulOperator>(RHS)->getOperand(1);
@@ -1558,12 +1572,15 @@ unsigned AArch64FastISel::emitLogicalOp(unsigned ISDOpc, MVT RetVT,
if (!RHSReg)
return 0;
bool RHSIsKill = hasTrivialKill(MulLHS);
- return emitLogicalOp_rs(ISDOpc, RetVT, LHSReg, LHSIsKill, RHSReg,
- RHSIsKill, ShiftVal);
+ ResultReg = emitLogicalOp_rs(ISDOpc, RetVT, LHSReg, LHSIsKill, RHSReg,
+ RHSIsKill, ShiftVal);
+ if (ResultReg)
+ return ResultReg;
}
+ }
// Check if the shift can be folded into the instruction.
- if (RHS->hasOneUse() && isValueAvailable(RHS))
+ if (RHS->hasOneUse() && isValueAvailable(RHS)) {
if (const auto *SI = dyn_cast<ShlOperator>(RHS))
if (const auto *C = dyn_cast<ConstantInt>(SI->getOperand(1))) {
uint64_t ShiftVal = C->getZExtValue();
@@ -1571,9 +1588,12 @@ unsigned AArch64FastISel::emitLogicalOp(unsigned ISDOpc, MVT RetVT,
if (!RHSReg)
return 0;
bool RHSIsKill = hasTrivialKill(SI->getOperand(0));
- return emitLogicalOp_rs(ISDOpc, RetVT, LHSReg, LHSIsKill, RHSReg,
- RHSIsKill, ShiftVal);
+ ResultReg = emitLogicalOp_rs(ISDOpc, RetVT, LHSReg, LHSIsKill, RHSReg,
+ RHSIsKill, ShiftVal);
+ if (ResultReg)
+ return ResultReg;
}
+ }
unsigned RHSReg = getRegForValue(RHS);
if (!RHSReg)
@@ -1646,6 +1666,11 @@ unsigned AArch64FastISel::emitLogicalOp_rs(unsigned ISDOpc, MVT RetVT,
{ AArch64::ORRWrs, AArch64::ORRXrs },
{ AArch64::EORWrs, AArch64::EORXrs }
};
+
+ // Don't deal with undefined shifts.
+ if (ShiftImm >= RetVT.getSizeInBits())
+ return 0;
+
const TargetRegisterClass *RC;
unsigned Opc;
switch (RetVT.SimpleTy) {
@@ -2235,14 +2260,7 @@ bool AArch64FastISel::emitCompareAndBranch(const BranchInst *BI) {
MIB.addImm(TestBit);
MIB.addMBB(TBB);
- // Obtain the branch weight and add the TrueBB to the successor list.
- uint32_t BranchWeight = 0;
- if (FuncInfo.BPI)
- BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(),
- TBB->getBasicBlock());
- FuncInfo.MBB->addSuccessor(TBB, BranchWeight);
- fastEmitBranch(FBB, DbgLoc);
-
+ finishCondBranch(BI->getParent(), TBB, FBB);
return true;
}
@@ -2257,7 +2275,6 @@ bool AArch64FastISel::selectBranch(const Instruction *I) {
MachineBasicBlock *TBB = FuncInfo.MBBMap[BI->getSuccessor(0)];
MachineBasicBlock *FBB = FuncInfo.MBBMap[BI->getSuccessor(1)];
- AArch64CC::CondCode CC = AArch64CC::NE;
if (const CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition())) {
if (CI->hasOneUse() && isValueAvailable(CI)) {
// Try to optimize or fold the cmp.
@@ -2289,7 +2306,7 @@ bool AArch64FastISel::selectBranch(const Instruction *I) {
// FCMP_UEQ and FCMP_ONE cannot be checked with a single branch
// instruction.
- CC = getCompareCC(Predicate);
+ AArch64CC::CondCode CC = getCompareCC(Predicate);
AArch64CC::CondCode ExtraCC = AArch64CC::AL;
switch (Predicate) {
default:
@@ -2317,52 +2334,7 @@ bool AArch64FastISel::selectBranch(const Instruction *I) {
.addImm(CC)
.addMBB(TBB);
- // Obtain the branch weight and add the TrueBB to the successor list.
- uint32_t BranchWeight = 0;
- if (FuncInfo.BPI)
- BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(),
- TBB->getBasicBlock());
- FuncInfo.MBB->addSuccessor(TBB, BranchWeight);
-
- fastEmitBranch(FBB, DbgLoc);
- return true;
- }
- } else if (TruncInst *TI = dyn_cast<TruncInst>(BI->getCondition())) {
- MVT SrcVT;
- if (TI->hasOneUse() && isValueAvailable(TI) &&
- isTypeSupported(TI->getOperand(0)->getType(), SrcVT)) {
- unsigned CondReg = getRegForValue(TI->getOperand(0));
- if (!CondReg)
- return false;
- bool CondIsKill = hasTrivialKill(TI->getOperand(0));
-
- // Issue an extract_subreg to get the lower 32-bits.
- if (SrcVT == MVT::i64) {
- CondReg = fastEmitInst_extractsubreg(MVT::i32, CondReg, CondIsKill,
- AArch64::sub_32);
- CondIsKill = true;
- }
-
- unsigned ANDReg = emitAnd_ri(MVT::i32, CondReg, CondIsKill, 1);
- assert(ANDReg && "Unexpected AND instruction emission failure.");
- emitICmp_ri(MVT::i32, ANDReg, /*IsKill=*/true, 0);
-
- if (FuncInfo.MBB->isLayoutSuccessor(TBB)) {
- std::swap(TBB, FBB);
- CC = AArch64CC::EQ;
- }
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::Bcc))
- .addImm(CC)
- .addMBB(TBB);
-
- // Obtain the branch weight and add the TrueBB to the successor list.
- uint32_t BranchWeight = 0;
- if (FuncInfo.BPI)
- BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(),
- TBB->getBasicBlock());
- FuncInfo.MBB->addSuccessor(TBB, BranchWeight);
-
- fastEmitBranch(FBB, DbgLoc);
+ finishCondBranch(BI->getParent(), TBB, FBB);
return true;
}
} else if (const auto *CI = dyn_cast<ConstantInt>(BI->getCondition())) {
@@ -2371,34 +2343,31 @@ bool AArch64FastISel::selectBranch(const Instruction *I) {
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::B))
.addMBB(Target);
- // Obtain the branch weight and add the target to the successor list.
- uint32_t BranchWeight = 0;
- if (FuncInfo.BPI)
- BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(),
- Target->getBasicBlock());
- FuncInfo.MBB->addSuccessor(Target, BranchWeight);
+ // Obtain the branch probability and add the target to the successor list.
+ if (FuncInfo.BPI) {
+ auto BranchProbability = FuncInfo.BPI->getEdgeProbability(
+ BI->getParent(), Target->getBasicBlock());
+ FuncInfo.MBB->addSuccessor(Target, BranchProbability);
+ } else
+ FuncInfo.MBB->addSuccessorWithoutProb(Target);
return true;
- } else if (foldXALUIntrinsic(CC, I, BI->getCondition())) {
- // Fake request the condition, otherwise the intrinsic might be completely
- // optimized away.
- unsigned CondReg = getRegForValue(BI->getCondition());
- if (!CondReg)
- return false;
-
- // Emit the branch.
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::Bcc))
- .addImm(CC)
- .addMBB(TBB);
+ } else {
+ AArch64CC::CondCode CC = AArch64CC::NE;
+ if (foldXALUIntrinsic(CC, I, BI->getCondition())) {
+ // Fake request the condition, otherwise the intrinsic might be completely
+ // optimized away.
+ unsigned CondReg = getRegForValue(BI->getCondition());
+ if (!CondReg)
+ return false;
- // Obtain the branch weight and add the TrueBB to the successor list.
- uint32_t BranchWeight = 0;
- if (FuncInfo.BPI)
- BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(),
- TBB->getBasicBlock());
- FuncInfo.MBB->addSuccessor(TBB, BranchWeight);
+ // Emit the branch.
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::Bcc))
+ .addImm(CC)
+ .addMBB(TBB);
- fastEmitBranch(FBB, DbgLoc);
- return true;
+ finishCondBranch(BI->getParent(), TBB, FBB);
+ return true;
+ }
}
unsigned CondReg = getRegForValue(BI->getCondition());
@@ -2406,32 +2375,22 @@ bool AArch64FastISel::selectBranch(const Instruction *I) {
return false;
bool CondRegIsKill = hasTrivialKill(BI->getCondition());
- // We've been divorced from our compare! Our block was split, and
- // now our compare lives in a predecessor block. We musn't
- // re-compare here, as the children of the compare aren't guaranteed
- // live across the block boundary (we *could* check for this).
- // Regardless, the compare has been done in the predecessor block,
- // and it left a value for us in a virtual register. Ergo, we test
- // the one-bit value left in the virtual register.
- emitICmp_ri(MVT::i32, CondReg, CondRegIsKill, 0);
-
+ // i1 conditions come as i32 values, test the lowest bit with tb(n)z.
+ unsigned Opcode = AArch64::TBNZW;
if (FuncInfo.MBB->isLayoutSuccessor(TBB)) {
std::swap(TBB, FBB);
- CC = AArch64CC::EQ;
+ Opcode = AArch64::TBZW;
}
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::Bcc))
- .addImm(CC)
+ const MCInstrDesc &II = TII.get(Opcode);
+ unsigned ConstrainedCondReg
+ = constrainOperandRegClass(II, CondReg, II.getNumDefs());
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
+ .addReg(ConstrainedCondReg, getKillRegState(CondRegIsKill))
+ .addImm(0)
.addMBB(TBB);
- // Obtain the branch weight and add the TrueBB to the successor list.
- uint32_t BranchWeight = 0;
- if (FuncInfo.BPI)
- BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(),
- TBB->getBasicBlock());
- FuncInfo.MBB->addSuccessor(TBB, BranchWeight);
-
- fastEmitBranch(FBB, DbgLoc);
+ finishCondBranch(BI->getParent(), TBB, FBB);
return true;
}
@@ -2447,8 +2406,8 @@ bool AArch64FastISel::selectIndirectBr(const Instruction *I) {
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II).addReg(AddrReg);
// Make sure the CFG is up-to-date.
- for (unsigned i = 0, e = BI->getNumSuccessors(); i != e; ++i)
- FuncInfo.MBB->addSuccessor(FuncInfo.MBBMap[BI->getSuccessor(i)]);
+ for (auto *Succ : BI->successors())
+ FuncInfo.MBB->addSuccessor(FuncInfo.MBBMap[Succ]);
return true;
}
@@ -2456,6 +2415,10 @@ bool AArch64FastISel::selectIndirectBr(const Instruction *I) {
bool AArch64FastISel::selectCmp(const Instruction *I) {
const CmpInst *CI = cast<CmpInst>(I);
+ // Vectors of i1 are weird: bail out.
+ if (CI->getType()->isVectorTy())
+ return false;
+
// Try to optimize or fold the cmp.
CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
unsigned ResultReg = 0;
@@ -2954,8 +2917,7 @@ bool AArch64FastISel::processCallArgs(CallLoweringInfo &CLI,
.addImm(NumBytes);
// Process the args.
- for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
- CCValAssign &VA = ArgLocs[i];
+ for (CCValAssign &VA : ArgLocs) {
const Value *ArgVal = CLI.OutVals[VA.getValNo()];
MVT ArgVT = OutVTs[VA.getValNo()];
@@ -3018,8 +2980,8 @@ bool AArch64FastISel::processCallArgs(CallLoweringInfo &CLI,
unsigned Alignment = DL.getABITypeAlignment(ArgVal->getType());
MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand(
- MachinePointerInfo::getStack(Addr.getOffset()),
- MachineMemOperand::MOStore, ArgVT.getStoreSize(), Alignment);
+ MachinePointerInfo::getStack(*FuncInfo.MF, Addr.getOffset()),
+ MachineMemOperand::MOStore, ArgVT.getStoreSize(), Alignment);
if (!emitStore(ArgVT, ArgReg, Addr, MMO))
return false;
@@ -3318,8 +3280,8 @@ bool AArch64FastISel::foldXALUIntrinsic(AArch64CC::CondCode &CC,
return false;
// Make sure nothing is in the way
- BasicBlock::const_iterator Start = I;
- BasicBlock::const_iterator End = II;
+ BasicBlock::const_iterator Start(I);
+ BasicBlock::const_iterator End(II);
for (auto Itr = std::prev(Start); Itr != End; --Itr) {
// We only expect extractvalue instructions between the intrinsic and the
// instruction to be selected.
@@ -3684,6 +3646,9 @@ bool AArch64FastISel::selectRet(const Instruction *I) {
if (F.isVarArg())
return false;
+ if (TLI.supportSplitCSR(FuncInfo.MF))
+ return false;
+
// Build a list of return value registers.
SmallVector<unsigned, 4> RetRegs;
@@ -3763,8 +3728,8 @@ bool AArch64FastISel::selectRet(const Instruction *I) {
MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(AArch64::RET_ReallyLR));
- for (unsigned i = 0, e = RetRegs.size(); i != e; ++i)
- MIB.addReg(RetRegs[i], RegState::Implicit);
+ for (unsigned RetReg : RetRegs)
+ MIB.addReg(RetReg, RegState::Implicit);
return true;
}
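The selectBranch change above drops the emitICmp_ri + Bcc sequence for plain i1 conditions in favor of a single test-bit branch: an i1 condition arrives as an i32 value with only bit 0 meaningful, so TBNZW/TBZW can branch on that bit directly. In assembly terms this turns a pair such as "cmp w8, #0; b.ne .LBB0_1" into a single "tbnz w8, #0, .LBB0_1" (register and label names here are illustrative only).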
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index a76473f7e539..11ae8005370d 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -72,9 +72,9 @@
//
// For most functions, some of the frame areas are empty. For those functions,
// it may not be necessary to set up fp or bp:
-// * A base pointer is definitly needed when there are both VLAs and local
+// * A base pointer is definitely needed when there are both VLAs and local
// variables with more-than-default alignment requirements.
-// * A frame pointer is definitly needed when there are local variables with
+// * A frame pointer is definitely needed when there are local variables with
// more-than-default alignment requirements.
//
// In some cases when a base pointer is not strictly needed, it is generated
@@ -216,11 +216,11 @@ void AArch64FrameLowering::emitCalleeSavedFrameMoves(
if (CSI.empty())
return;
- const DataLayout *TD = MF.getTarget().getDataLayout();
+ const DataLayout &TD = MF.getDataLayout();
bool HasFP = hasFP(MF);
// Calculate amount of bytes used for return address storing.
- int stackGrowth = -TD->getPointerSize(0);
+ int stackGrowth = -TD.getPointerSize(0);
// Calculate offsets.
int64_t saveAreaOffset = (HasFP ? 2 : 1) * stackGrowth;
@@ -280,14 +280,17 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
MachineBasicBlock::iterator MBBI = MBB.begin();
const MachineFrameInfo *MFI = MF.getFrameInfo();
const Function *Fn = MF.getFunction();
- const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>(
- MF.getSubtarget().getRegisterInfo());
- const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
+ const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
+ const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
MachineModuleInfo &MMI = MF.getMMI();
AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
bool needsFrameMoves = MMI.hasDebugInfo() || Fn->needsUnwindTableEntry();
bool HasFP = hasFP(MF);
- DebugLoc DL = MBB.findDebugLoc(MBBI);
+
+ // Debug location must be unknown since the first debug location is used
+ // to determine the end of the prologue.
+ DebugLoc DL;
// All calls are tail calls in GHC calling conv, and functions have no
// prologue/epilogue.
@@ -354,7 +357,6 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
if (NumBytes && NeedsRealignment) {
// Use the first callee-saved register as a scratch register.
scratchSPReg = AArch64::X9;
- MF.getRegInfo().setPhysRegUsed(scratchSPReg);
}
// If we're a leaf function, try using the red zone.
@@ -400,8 +402,8 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
}
if (needsFrameMoves) {
- const DataLayout *TD = MF.getTarget().getDataLayout();
- const int StackGrowth = -TD->getPointerSize(0);
+ const DataLayout &TD = MF.getDataLayout();
+ const int StackGrowth = -TD.getPointerSize(0);
unsigned FramePtr = RegInfo->getFrameRegister(MF);
// An example of the prologue:
//
@@ -513,33 +515,33 @@ static bool isCalleeSavedRegister(unsigned Reg, const MCPhysReg *CSRegs) {
return false;
}
-static bool isCSRestore(MachineInstr *MI, const MCPhysReg *CSRegs) {
+/// Checks whether the given instruction restores callee save registers
+/// and if so returns how many.
+static unsigned getNumCSRestores(MachineInstr &MI, const MCPhysReg *CSRegs) {
unsigned RtIdx = 0;
- if (MI->getOpcode() == AArch64::LDPXpost ||
- MI->getOpcode() == AArch64::LDPDpost)
+ switch (MI.getOpcode()) {
+ case AArch64::LDPXpost:
+ case AArch64::LDPDpost:
RtIdx = 1;
-
- if (MI->getOpcode() == AArch64::LDPXpost ||
- MI->getOpcode() == AArch64::LDPDpost ||
- MI->getOpcode() == AArch64::LDPXi || MI->getOpcode() == AArch64::LDPDi) {
- if (!isCalleeSavedRegister(MI->getOperand(RtIdx).getReg(), CSRegs) ||
- !isCalleeSavedRegister(MI->getOperand(RtIdx + 1).getReg(), CSRegs) ||
- MI->getOperand(RtIdx + 2).getReg() != AArch64::SP)
- return false;
- return true;
+ // FALLTHROUGH
+ case AArch64::LDPXi:
+ case AArch64::LDPDi:
+ if (!isCalleeSavedRegister(MI.getOperand(RtIdx).getReg(), CSRegs) ||
+ !isCalleeSavedRegister(MI.getOperand(RtIdx + 1).getReg(), CSRegs) ||
+ MI.getOperand(RtIdx + 2).getReg() != AArch64::SP)
+ return 0;
+ return 2;
}
-
- return false;
+ return 0;
}
void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
MachineFrameInfo *MFI = MF.getFrameInfo();
- const AArch64InstrInfo *TII =
- static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
- const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>(
- MF.getSubtarget().getRegisterInfo());
+ const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
+ const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
DebugLoc DL;
bool IsTailCallReturn = false;
if (MBB.end() != MBBI) {
@@ -585,7 +587,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
// ---------------------| --- |
// | | | |
// | CalleeSavedReg | | |
- // | (NumRestores * 16) | | |
+ // | (NumRestores * 8) | | |
// | | | |
// ---------------------| | NumBytes
// | | StackSize (StackAdjustUp)
@@ -606,17 +608,17 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
// Move past the restores of the callee-saved registers.
MachineBasicBlock::iterator LastPopI = MBB.getFirstTerminator();
const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF);
- if (LastPopI != MBB.begin()) {
- do {
- ++NumRestores;
- --LastPopI;
- } while (LastPopI != MBB.begin() && isCSRestore(LastPopI, CSRegs));
- if (!isCSRestore(LastPopI, CSRegs)) {
+ MachineBasicBlock::iterator Begin = MBB.begin();
+ while (LastPopI != Begin) {
+ --LastPopI;
+ unsigned Restores = getNumCSRestores(*LastPopI, CSRegs);
+ NumRestores += Restores;
+ if (Restores == 0) {
++LastPopI;
- --NumRestores;
+ break;
}
}
- NumBytes -= NumRestores * 16;
+ NumBytes -= NumRestores * 8;
assert(NumBytes >= 0 && "Negative stack allocation size!?");
if (!hasFP(MF)) {
@@ -634,15 +636,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
// be able to save any instructions.
if (NumBytes || MFI->hasVarSizedObjects())
emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::FP,
- -(NumRestores - 1) * 16, TII, MachineInstr::NoFlags);
-}
-
-/// getFrameIndexOffset - Returns the displacement from the frame register to
-/// the stack frame of the specified index.
-int AArch64FrameLowering::getFrameIndexOffset(const MachineFunction &MF,
- int FI) const {
- unsigned FrameReg;
- return getFrameIndexReference(MF, FI, FrameReg);
+ -(NumRestores - 2) * 8, TII, MachineInstr::NoFlags);
}
/// getFrameIndexReference - Provide a base+offset reference to an FI slot for
@@ -739,9 +733,6 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
DebugLoc DL;
assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!");
- if (MI != MBB.end())
- DL = MI->getDebugLoc();
-
for (unsigned i = 0; i < Count; i += 2) {
unsigned idx = Count - i - 2;
unsigned Reg1 = CSI[idx].getReg();
@@ -911,7 +902,7 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
unsigned NumFPRSpilled = 0;
bool ExtraCSSpill = false;
bool CanEliminateFrame = true;
- DEBUG(dbgs() << "*** processFunctionBeforeCalleeSavedScan\nUsed CSRs:");
+ DEBUG(dbgs() << "*** determineCalleeSaves\nUsed CSRs:");
const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF);
// Check pairs of consecutive callee-saved registers.
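The accounting change in emitEpilogue above is equivalent to the old one, just expressed per register instead of per instruction: each LDP restores two 8-byte callee-saved registers, so a frame unwound with three LDPs was previously counted as 3 * 16 = 48 bytes and is now counted as 6 * 8 = 48 bytes, and the final emitFrameOffset term changes from -(instructions - 1) * 16 to the matching -(registers - 2) * 8.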
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.h b/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.h
index 731f031ff855..427afdf4acbf 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.h
+++ b/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.h
@@ -37,7 +37,6 @@ public:
void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
- int getFrameIndexOffset(const MachineFunction &MF, int FI) const override;
int getFrameIndexReference(const MachineFunction &MF, int FI,
unsigned &FrameReg) const override;
int resolveFrameIndexReference(const MachineFunction &MF, int FI,
@@ -61,6 +60,11 @@ public:
void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs,
RegScavenger *RS) const override;
+
+ /// Returns true if the target will correctly handle shrink wrapping.
+ bool enableShrinkWrapping(const MachineFunction &MF) const override {
+ return true;
+ }
};
} // End llvm namespace
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/contrib/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 772e894f4f0a..6c868880bcac 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -34,7 +34,6 @@ using namespace llvm;
namespace {
class AArch64DAGToDAGISel : public SelectionDAGISel {
- AArch64TargetMachine &TM;
/// Subtarget - Keep a pointer to the AArch64Subtarget around so that we can
/// make the right decision when generating code for different targets.
@@ -45,7 +44,7 @@ class AArch64DAGToDAGISel : public SelectionDAGISel {
public:
explicit AArch64DAGToDAGISel(AArch64TargetMachine &tm,
CodeGenOpt::Level OptLevel)
- : SelectionDAGISel(tm, OptLevel), TM(tm), Subtarget(nullptr),
+ : SelectionDAGISel(tm, OptLevel), Subtarget(nullptr),
ForCodeSize(false) {}
const char *getPassName() const override {
@@ -53,9 +52,7 @@ public:
}
bool runOnMachineFunction(MachineFunction &MF) override {
- ForCodeSize =
- MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize) ||
- MF.getFunction()->hasFnAttribute(Attribute::MinSize);
+ ForCodeSize = MF.getFunction()->optForSize();
Subtarget = &MF.getSubtarget<AArch64Subtarget>();
return SelectionDAGISel::runOnMachineFunction(MF);
}
@@ -79,6 +76,21 @@ public:
bool SelectLogicalShiftedRegister(SDValue N, SDValue &Reg, SDValue &Shift) {
return SelectShiftedRegister(N, true, Reg, Shift);
}
+ bool SelectAddrModeIndexed7S8(SDValue N, SDValue &Base, SDValue &OffImm) {
+ return SelectAddrModeIndexed7S(N, 1, Base, OffImm);
+ }
+ bool SelectAddrModeIndexed7S16(SDValue N, SDValue &Base, SDValue &OffImm) {
+ return SelectAddrModeIndexed7S(N, 2, Base, OffImm);
+ }
+ bool SelectAddrModeIndexed7S32(SDValue N, SDValue &Base, SDValue &OffImm) {
+ return SelectAddrModeIndexed7S(N, 4, Base, OffImm);
+ }
+ bool SelectAddrModeIndexed7S64(SDValue N, SDValue &Base, SDValue &OffImm) {
+ return SelectAddrModeIndexed7S(N, 8, Base, OffImm);
+ }
+ bool SelectAddrModeIndexed7S128(SDValue N, SDValue &Base, SDValue &OffImm) {
+ return SelectAddrModeIndexed7S(N, 16, Base, OffImm);
+ }
bool SelectAddrModeIndexed8(SDValue N, SDValue &Base, SDValue &OffImm) {
return SelectAddrModeIndexed(N, 1, Base, OffImm);
}
@@ -153,8 +165,7 @@ public:
SDNode *SelectBitfieldExtractOp(SDNode *N);
SDNode *SelectBitfieldInsertOp(SDNode *N);
-
- SDNode *SelectLIBM(SDNode *N);
+ SDNode *SelectBitfieldInsertInZeroOp(SDNode *N);
SDNode *SelectReadRegister(SDNode *N);
SDNode *SelectWriteRegister(SDNode *N);
@@ -165,6 +176,8 @@ public:
private:
bool SelectShiftedRegister(SDValue N, bool AllowROR, SDValue &Reg,
SDValue &Shift);
+ bool SelectAddrModeIndexed7S(SDValue N, unsigned Size, SDValue &Base,
+ SDValue &OffImm);
bool SelectAddrModeIndexed(SDValue N, unsigned Size, SDValue &Base,
SDValue &OffImm);
bool SelectAddrModeUnscaled(SDValue N, unsigned Size, SDValue &Base,
@@ -422,7 +435,7 @@ static bool checkHighLaneIndex(SDNode *DL, SDValue &LaneOp, int &LaneIdx) {
return true;
}
-// Helper for SelectOpcV64LaneV128 - Recogzine operatinos where one operand is a
+// Helper for SelectOpcV64LaneV128 - Recognize operations where one operand is a
// high lane extract.
static bool checkV64LaneV128(SDValue Op0, SDValue Op1, SDValue &StdOp,
SDValue &LaneOp, int &LaneIdx) {
@@ -572,7 +585,7 @@ bool AArch64DAGToDAGISel::SelectArithExtendedRegister(SDValue N, SDValue &Reg,
}
// AArch64 mandates that the RHS of the operation must use the smallest
- // register classs that could contain the size being extended from. Thus,
+ // register class that could contain the size being extended from. Thus,
// if we're folding a (sext i8), we need the RHS to be a GPR32, even though
// there might not be an actual 32-bit value in the program. We can
// (harmlessly) synthesize one by injected an EXTRACT_SUBREG here.
@@ -587,7 +600,7 @@ bool AArch64DAGToDAGISel::SelectArithExtendedRegister(SDValue N, SDValue &Reg,
/// need to create a real ADD instruction from it anyway and there's no point in
/// folding it into the mem op. Theoretically, it shouldn't matter, but there's
/// a single pseudo-instruction for an ADRP/ADD pair so over-aggressive folding
-/// leads to duplaicated ADRP instructions.
+/// leads to duplicated ADRP instructions.
static bool isWorthFoldingADDlow(SDValue N) {
for (auto Use : N->uses()) {
if (Use->getOpcode() != ISD::LOAD && Use->getOpcode() != ISD::STORE &&
@@ -604,6 +617,51 @@ static bool isWorthFoldingADDlow(SDValue N) {
return true;
}
+/// SelectAddrModeIndexed7S - Select a "register plus scaled signed 7-bit
+/// immediate" address. The "Size" argument is the size in bytes of the memory
+/// reference, which determines the scale.
+bool AArch64DAGToDAGISel::SelectAddrModeIndexed7S(SDValue N, unsigned Size,
+ SDValue &Base,
+ SDValue &OffImm) {
+ SDLoc dl(N);
+ const DataLayout &DL = CurDAG->getDataLayout();
+ const TargetLowering *TLI = getTargetLowering();
+ if (N.getOpcode() == ISD::FrameIndex) {
+ int FI = cast<FrameIndexSDNode>(N)->getIndex();
+ Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL));
+ OffImm = CurDAG->getTargetConstant(0, dl, MVT::i64);
+ return true;
+ }
+
+ // As opposed to the (12-bit) Indexed addressing mode below, the 7-bit signed
+ // offset selected here doesn't support labels/immediates, only base+offset.
+
+ if (CurDAG->isBaseWithConstantOffset(N)) {
+ if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+ int64_t RHSC = RHS->getSExtValue();
+ unsigned Scale = Log2_32(Size);
+ if ((RHSC & (Size - 1)) == 0 && RHSC >= -(0x40 << Scale) &&
+ RHSC < (0x40 << Scale)) {
+ Base = N.getOperand(0);
+ if (Base.getOpcode() == ISD::FrameIndex) {
+ int FI = cast<FrameIndexSDNode>(Base)->getIndex();
+ Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL));
+ }
+ OffImm = CurDAG->getTargetConstant(RHSC >> Scale, dl, MVT::i64);
+ return true;
+ }
+ }
+ }
+
+ // Base only. The address will be materialized into a register before
+ // the memory is accessed.
+ // add x0, Xbase, #offset
+ // stp x1, x2, [x0]
+ Base = N;
+ OffImm = CurDAG->getTargetConstant(0, dl, MVT::i64);
+ return true;
+}
+
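
    To make the accepted range concrete, here is a minimal standalone sketch of the same check (an illustration only; for Size = 8, i.e. 64-bit LDP/STP, the accepted byte offsets are the multiples of 8 in [-512, 504], and OffImm holds the offset divided by 8):

    #include <cstdint>

    // Sketch of the scaled signed 7-bit offset test used above. Size is assumed to
    // be a power of two between 1 and 16, matching the SelectAddrModeIndexed7S*
    // wrappers.
    static bool fitsIndexed7S(int64_t ByteOffset, unsigned Size, int64_t &EncodedImm) {
      unsigned Scale = 0;
      while ((1u << Scale) < Size)
        ++Scale;                                    // Scale = log2(Size)
      if ((ByteOffset & (Size - 1)) != 0)
        return false;                               // must be Size-aligned
      if (ByteOffset < -(int64_t(0x40) << Scale) ||
          ByteOffset >= (int64_t(0x40) << Scale))
        return false;                               // outside the signed 7-bit scaled range
      EncodedImm = ByteOffset >> Scale;             // value that ends up in OffImm
      return true;
    }
    // e.g. fitsIndexed7S(-512, 8, Imm) -> true with Imm == -64,
    //      fitsIndexed7S( 512, 8, Imm) -> false (one slot past the top of the range).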
/// SelectAddrModeIndexed - Select a "register plus scaled unsigned 12-bit
/// immediate" address. The "Size" argument is the size in bytes of the memory
/// reference, which determines the scale.
@@ -867,7 +925,7 @@ bool AArch64DAGToDAGISel::SelectAddrModeXRO(SDValue N, unsigned Size,
if (isa<ConstantSDNode>(RHS)) {
int64_t ImmOff = (int64_t)cast<ConstantSDNode>(RHS)->getZExtValue();
unsigned Scale = Log2_32(Size);
- // Skip the immediate can be seleced by load/store addressing mode.
+ // Skip if the immediate can be selected by the load/store addressing mode.
// Also skip if the immediate can be encoded by a single ADD (SUB is also
// checked by using -ImmOff).
if ((ImmOff % Size == 0 && ImmOff >= 0 && ImmOff < (0x1000 << Scale)) ||
@@ -1034,6 +1092,8 @@ SDNode *AArch64DAGToDAGISel::SelectIndexedLoad(SDNode *N, bool &Done) {
// it into an i64.
DstVT = MVT::i32;
}
+ } else if (VT == MVT::f16) {
+ Opcode = IsPre ? AArch64::LDRHpre : AArch64::LDRHpost;
} else if (VT == MVT::f32) {
Opcode = IsPre ? AArch64::LDRSpre : AArch64::LDRSpost;
} else if (VT == MVT::f64 || VT.is64BitVector()) {
@@ -1222,8 +1282,8 @@ SDNode *AArch64DAGToDAGISel::SelectLoadLane(SDNode *N, unsigned NumVecs,
SDValue SuperReg = SDValue(Ld, 0);
EVT WideVT = RegSeq.getOperand(1)->getValueType(0);
- static unsigned QSubs[] = { AArch64::qsub0, AArch64::qsub1, AArch64::qsub2,
- AArch64::qsub3 };
+ static const unsigned QSubs[] = { AArch64::qsub0, AArch64::qsub1,
+ AArch64::qsub2, AArch64::qsub3 };
for (unsigned i = 0; i < NumVecs; ++i) {
SDValue NV = CurDAG->getTargetExtractSubreg(QSubs[i], dl, WideVT, SuperReg);
if (Narrow)
@@ -1275,8 +1335,8 @@ SDNode *AArch64DAGToDAGISel::SelectPostLoadLane(SDNode *N, unsigned NumVecs,
Narrow ? NarrowVector(SuperReg, *CurDAG) : SuperReg);
} else {
EVT WideVT = RegSeq.getOperand(1)->getValueType(0);
- static unsigned QSubs[] = { AArch64::qsub0, AArch64::qsub1, AArch64::qsub2,
- AArch64::qsub3 };
+ static const unsigned QSubs[] = { AArch64::qsub0, AArch64::qsub1,
+ AArch64::qsub2, AArch64::qsub3 };
for (unsigned i = 0; i < NumVecs; ++i) {
SDValue NV = CurDAG->getTargetExtractSubreg(QSubs[i], dl, WideVT,
SuperReg);
@@ -1420,7 +1480,7 @@ static bool isBitfieldExtractOpFromAnd(SelectionDAG *CurDAG, SDNode *N,
// The resulting code will be at least as good as the original one
// plus it may expose more opportunities for bitfield insert pattern.
// FIXME: Currently we limit this to the bigger pattern, because
- // some optimizations expect AND and not UBFM
+ // some optimizations expect AND and not UBFM.
Opd0 = N->getOperand(0);
} else
return false;
@@ -1852,6 +1912,7 @@ static SDValue getLeftShift(SelectionDAG *CurDAG, SDValue Op, int ShlAmount) {
/// Does this tree qualify as an attempt to move a bitfield into position,
/// essentially "(and (shl VAL, N), Mask)".
static bool isBitfieldPositioningOp(SelectionDAG *CurDAG, SDValue Op,
+ bool BiggerPattern,
SDValue &Src, int &ShiftAmount,
int &MaskWidth) {
EVT VT = Op.getValueType();
@@ -1874,6 +1935,11 @@ static bool isBitfieldPositioningOp(SelectionDAG *CurDAG, SDValue Op,
Op = Op.getOperand(0);
}
+ // Don't match if the SHL has more than one use, since then we'll end up
+ // generating SHL+UBFIZ instead of just keeping SHL+AND.
+ if (!BiggerPattern && !Op.hasOneUse())
+ return false;
+
uint64_t ShlImm;
if (!isOpcWithIntImmediate(Op.getNode(), ISD::SHL, ShlImm))
return false;
@@ -1887,7 +1953,11 @@ static bool isBitfieldPositioningOp(SelectionDAG *CurDAG, SDValue Op,
// BFI encompasses sufficiently many nodes that it's worth inserting an extra
// LSL/LSR if the mask in NonZeroBits doesn't quite match up with the ISD::SHL
- // amount.
+ // amount. BiggerPattern is true when this pattern is being matched for BFI;
+ // BiggerPattern is false when this pattern is being matched for UBFIZ, in
+ // which case it is not profitable to insert an extra shift.
+ if (ShlImm - ShiftAmount != 0 && !BiggerPattern)
+ return false;
Src = getLeftShift(CurDAG, Op, ShlImm - ShiftAmount);
return true;
@@ -1904,7 +1974,8 @@ static bool isBitfieldPositioningOp(SelectionDAG *CurDAG, SDValue Op,
// f = Opc Opd0, Opd1, LSB, MSB ; where Opc is a BFM, LSB = imm, and MSB = imm2
static bool isBitfieldInsertOpFromOr(SDNode *N, unsigned &Opc, SDValue &Dst,
SDValue &Src, unsigned &ImmR,
- unsigned &ImmS, SelectionDAG *CurDAG) {
+ unsigned &ImmS, const APInt &UsefulBits,
+ SelectionDAG *CurDAG) {
assert(N->getOpcode() == ISD::OR && "Expect a OR operation");
// Set Opc
@@ -1918,23 +1989,30 @@ static bool isBitfieldInsertOpFromOr(SDNode *N, unsigned &Opc, SDValue &Dst,
// Because of simplify-demanded-bits in DAGCombine, involved masks may not
// have the expected shape. Try to undo that.
- APInt UsefulBits;
- getUsefulBits(SDValue(N, 0), UsefulBits);
unsigned NumberOfIgnoredLowBits = UsefulBits.countTrailingZeros();
unsigned NumberOfIgnoredHighBits = UsefulBits.countLeadingZeros();
- // OR is commutative, check both possibilities (does llvm provide a
- // way to do that directely, e.g., via code matcher?)
- SDValue OrOpd1Val = N->getOperand(1);
- SDNode *OrOpd0 = N->getOperand(0).getNode();
- SDNode *OrOpd1 = N->getOperand(1).getNode();
- for (int i = 0; i < 2;
- ++i, std::swap(OrOpd0, OrOpd1), OrOpd1Val = N->getOperand(0)) {
+ // OR is commutative, check all combinations of operand order and values of
+ // BiggerPattern, i.e.
+ // Opd0, Opd1, BiggerPattern=false
+ // Opd1, Opd0, BiggerPattern=false
+ // Opd0, Opd1, BiggerPattern=true
+ // Opd1, Opd0, BiggerPattern=true
+ // Several of these combinations may match, so check with BiggerPattern=false
+ // first since that will produce better results by matching more instructions
+ // and/or inserting fewer extra instructions.
+ for (int I = 0; I < 4; ++I) {
+
+ bool BiggerPattern = I / 2;
+ SDNode *OrOpd0 = N->getOperand(I % 2).getNode();
+ SDValue OrOpd1Val = N->getOperand((I + 1) % 2);
+ SDNode *OrOpd1 = OrOpd1Val.getNode();
+
unsigned BFXOpc;
int DstLSB, Width;
if (isBitfieldExtractOp(CurDAG, OrOpd0, BFXOpc, Src, ImmR, ImmS,
- NumberOfIgnoredLowBits, true)) {
+ NumberOfIgnoredLowBits, BiggerPattern)) {
// Check that the returned opcode is compatible with the pattern,
// i.e., same type and zero extended (U and not S)
if ((BFXOpc != AArch64::UBFMXri && VT == MVT::i64) ||
@@ -1952,8 +2030,9 @@ static bool isBitfieldInsertOpFromOr(SDNode *N, unsigned &Opc, SDValue &Dst,
// If the mask on the insertee is correct, we have a BFXIL operation. We
// can share the ImmR and ImmS values from the already-computed UBFM.
- } else if (isBitfieldPositioningOp(CurDAG, SDValue(OrOpd0, 0), Src,
- DstLSB, Width)) {
+ } else if (isBitfieldPositioningOp(CurDAG, SDValue(OrOpd0, 0),
+ BiggerPattern,
+ Src, DstLSB, Width)) {
ImmR = (VT.getSizeInBits() - DstLSB) % VT.getSizeInBits();
ImmS = Width - 1;
} else
@@ -2003,11 +2082,18 @@ SDNode *AArch64DAGToDAGISel::SelectBitfieldInsertOp(SDNode *N) {
unsigned Opc;
unsigned LSB, MSB;
SDValue Opd0, Opd1;
+ EVT VT = N->getValueType(0);
+ APInt NUsefulBits;
+ getUsefulBits(SDValue(N, 0), NUsefulBits);
+
+ // If none of the bits are useful, just return UNDEF.
+ if (!NUsefulBits)
+ return CurDAG->SelectNodeTo(N, TargetOpcode::IMPLICIT_DEF, VT);
- if (!isBitfieldInsertOpFromOr(N, Opc, Opd0, Opd1, LSB, MSB, CurDAG))
+ if (!isBitfieldInsertOpFromOr(N, Opc, Opd0, Opd1, LSB, MSB, NUsefulBits,
+ CurDAG))
return nullptr;
- EVT VT = N->getValueType(0);
SDLoc dl(N);
SDValue Ops[] = { Opd0,
Opd1,
@@ -2016,58 +2102,37 @@ SDNode *AArch64DAGToDAGISel::SelectBitfieldInsertOp(SDNode *N) {
return CurDAG->SelectNodeTo(N, Opc, VT, Ops);
}
-SDNode *AArch64DAGToDAGISel::SelectLIBM(SDNode *N) {
+/// SelectBitfieldInsertInZeroOp - Match a UBFIZ instruction that is the
+/// equivalent of a left shift by a constant amount followed by an and masking
+/// out a contiguous set of bits.
+SDNode *AArch64DAGToDAGISel::SelectBitfieldInsertInZeroOp(SDNode *N) {
+ if (N->getOpcode() != ISD::AND)
+ return nullptr;
+
EVT VT = N->getValueType(0);
- unsigned Variant;
unsigned Opc;
- unsigned FRINTXOpcs[] = { AArch64::FRINTXSr, AArch64::FRINTXDr };
-
- if (VT == MVT::f32) {
- Variant = 0;
- } else if (VT == MVT::f64) {
- Variant = 1;
- } else
- return nullptr; // Unrecognized argument type. Fall back on default codegen.
-
- // Pick the FRINTX variant needed to set the flags.
- unsigned FRINTXOpc = FRINTXOpcs[Variant];
-
- switch (N->getOpcode()) {
- default:
- return nullptr; // Unrecognized libm ISD node. Fall back on default codegen.
- case ISD::FCEIL: {
- unsigned FRINTPOpcs[] = { AArch64::FRINTPSr, AArch64::FRINTPDr };
- Opc = FRINTPOpcs[Variant];
- break;
- }
- case ISD::FFLOOR: {
- unsigned FRINTMOpcs[] = { AArch64::FRINTMSr, AArch64::FRINTMDr };
- Opc = FRINTMOpcs[Variant];
- break;
- }
- case ISD::FTRUNC: {
- unsigned FRINTZOpcs[] = { AArch64::FRINTZSr, AArch64::FRINTZDr };
- Opc = FRINTZOpcs[Variant];
- break;
- }
- case ISD::FROUND: {
- unsigned FRINTAOpcs[] = { AArch64::FRINTASr, AArch64::FRINTADr };
- Opc = FRINTAOpcs[Variant];
- break;
- }
- }
+ if (VT == MVT::i32)
+ Opc = AArch64::UBFMWri;
+ else if (VT == MVT::i64)
+ Opc = AArch64::UBFMXri;
+ else
+ return nullptr;
- SDLoc dl(N);
- SDValue In = N->getOperand(0);
- SmallVector<SDValue, 2> Ops;
- Ops.push_back(In);
+ SDValue Op0;
+ int DstLSB, Width;
+ if (!isBitfieldPositioningOp(CurDAG, SDValue(N, 0), /*BiggerPattern=*/false,
+ Op0, DstLSB, Width))
+ return nullptr;
- if (!TM.Options.UnsafeFPMath) {
- SDNode *FRINTX = CurDAG->getMachineNode(FRINTXOpc, dl, VT, MVT::Glue, In);
- Ops.push_back(SDValue(FRINTX, 1));
- }
+ // ImmR is the rotate right amount.
+ unsigned ImmR = (VT.getSizeInBits() - DstLSB) % VT.getSizeInBits();
+ // ImmS is the most significant bit of the source to be moved.
+ unsigned ImmS = Width - 1;
- return CurDAG->getMachineNode(Opc, dl, VT, Ops);
+ SDLoc DL(N);
+ SDValue Ops[] = {Op0, CurDAG->getTargetConstant(ImmR, DL, VT),
+ CurDAG->getTargetConstant(ImmS, DL, VT)};
+ return CurDAG->SelectNodeTo(N, Opc, VT, Ops);
}
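
    A worked instance of the immediate computation above (a sketch using the standard UBFM-to-UBFIZ alias): for a 32-bit (x << 3) & 0xf8, the positioning helper reports DstLSB = 3 and Width = 5, giving ImmR = 29 and ImmS = 4.

    #include <cassert>

    // Sketch: map the (shl + and) bitfield-positioning parameters onto UBFM
    // immediates, exactly as SelectBitfieldInsertInZeroOp does.
    static void ubfizImms(unsigned RegSize, unsigned DstLSB, unsigned Width,
                          unsigned &ImmR, unsigned &ImmS) {
      ImmR = (RegSize - DstLSB) % RegSize; // rotate-right amount
      ImmS = Width - 1;                    // most significant bit of the source field
    }

    int main() {
      unsigned ImmR, ImmS;
      ubfizImms(32, /*DstLSB=*/3, /*Width=*/5, ImmR, ImmS);
      assert(ImmR == 29 && ImmS == 4);     // UBFMWri w0, w1, #29, #4 == ubfiz w0, w1, #3, #5
      return 0;
    }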
bool
@@ -2119,7 +2184,7 @@ AArch64DAGToDAGISel::SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos,
// into a single value to be used in the MRS/MSR instruction.
static int getIntOperandFromRegisterString(StringRef RegString) {
SmallVector<StringRef, 5> Fields;
- RegString.split(Fields, ":");
+ RegString.split(Fields, ':');
if (Fields.size() == 1)
return -1;
@@ -2206,7 +2271,15 @@ SDNode *AArch64DAGToDAGISel::SelectWriteRegister(SDNode *N) {
assert (isa<ConstantSDNode>(N->getOperand(2))
&& "Expected a constant integer expression.");
uint64_t Immed = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
- return CurDAG->getMachineNode(AArch64::MSRpstate, DL, MVT::Other,
+ unsigned State;
+ if (Reg == AArch64PState::PAN || Reg == AArch64PState::UAO) {
+ assert(Immed < 2 && "Bad imm");
+ State = AArch64::MSRpstateImm1;
+ } else {
+ assert(Immed < 16 && "Bad imm");
+ State = AArch64::MSRpstateImm4;
+ }
+ return CurDAG->getMachineNode(State, DL, MVT::Other,
CurDAG->getTargetConstant(Reg, DL, MVT::i32),
CurDAG->getTargetConstant(Immed, DL, MVT::i16),
N->getOperand(0));
@@ -2279,6 +2352,8 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
case ISD::SRA:
if (SDNode *I = SelectBitfieldExtractOp(Node))
return I;
+ if (SDNode *I = SelectBitfieldInsertInZeroOp(Node))
+ return I;
break;
case ISD::OR:
@@ -2802,6 +2877,7 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
break;
}
}
+ break;
}
case AArch64ISD::LD2post: {
if (VT == MVT::v8i8)
@@ -3214,14 +3290,6 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
return SelectPostStoreLane(Node, 4, AArch64::ST4i64_POST);
break;
}
-
- case ISD::FCEIL:
- case ISD::FFLOOR:
- case ISD::FTRUNC:
- case ISD::FROUND:
- if (SDNode *I = SelectLIBM(Node))
- return I;
- break;
}
// Select the default instruction
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 3e8f46cf1ecd..9f5beff12100 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -40,23 +40,6 @@ using namespace llvm;
STATISTIC(NumTailCalls, "Number of tail calls");
STATISTIC(NumShiftInserts, "Number of vector shift inserts");
-namespace {
-enum AlignMode {
- StrictAlign,
- NoStrictAlign
-};
-}
-
-static cl::opt<AlignMode>
-Align(cl::desc("Load/store alignment support"),
- cl::Hidden, cl::init(NoStrictAlign),
- cl::values(
- clEnumValN(StrictAlign, "aarch64-strict-align",
- "Disallow all unaligned memory accesses"),
- clEnumValN(NoStrictAlign, "aarch64-no-strict-align",
- "Allow unaligned memory accesses"),
- clEnumValEnd));
-
// Place holder until extr generation is tested fully.
static cl::opt<bool>
EnableAArch64ExtrGeneration("aarch64-extr-generation", cl::Hidden,
@@ -76,6 +59,9 @@ cl::opt<bool> EnableAArch64ELFLocalDynamicTLSGeneration(
cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
cl::init(false));
+/// Value type used for condition codes.
+static const MVT MVT_CC = MVT::i32;
+
AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
const AArch64Subtarget &STI)
: TargetLowering(TM), Subtarget(&STI) {
@@ -210,11 +196,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
- // Exception handling.
- // FIXME: These are guesses. Has this been defined yet?
- setExceptionPointerRegister(AArch64::X0);
- setExceptionSelectorRegister(AArch64::X1);
-
// Constant pool entries
setOperationAction(ISD::ConstantPool, MVT::i64, Custom);
@@ -234,6 +215,10 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
// AArch64 lacks both left-rotate and popcount instructions.
setOperationAction(ISD::ROTL, MVT::i32, Expand);
setOperationAction(ISD::ROTL, MVT::i64, Expand);
+ for (MVT VT : MVT::vector_valuetypes()) {
+ setOperationAction(ISD::ROTL, VT, Expand);
+ setOperationAction(ISD::ROTR, VT, Expand);
+ }
// AArch64 doesn't have {U|S}MUL_LOHI.
setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
@@ -252,6 +237,10 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
+ for (MVT VT : MVT::vector_valuetypes()) {
+ setOperationAction(ISD::SDIVREM, VT, Expand);
+ setOperationAction(ISD::UDIVREM, VT, Expand);
+ }
setOperationAction(ISD::SREM, MVT::i32, Expand);
setOperationAction(ISD::SREM, MVT::i64, Expand);
setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
@@ -315,6 +304,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FTRUNC, MVT::f16, Promote);
setOperationAction(ISD::FMINNUM, MVT::f16, Promote);
setOperationAction(ISD::FMAXNUM, MVT::f16, Promote);
+ setOperationAction(ISD::FMINNAN, MVT::f16, Promote);
+ setOperationAction(ISD::FMAXNAN, MVT::f16, Promote);
// v4f16 is also a storage-only type, so promote it to v4f32 when that is
// known to be safe.
@@ -403,10 +394,19 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FRINT, Ty, Legal);
setOperationAction(ISD::FTRUNC, Ty, Legal);
setOperationAction(ISD::FROUND, Ty, Legal);
+ setOperationAction(ISD::FMINNUM, Ty, Legal);
+ setOperationAction(ISD::FMAXNUM, Ty, Legal);
+ setOperationAction(ISD::FMINNAN, Ty, Legal);
+ setOperationAction(ISD::FMAXNAN, Ty, Legal);
}
setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
+ // Lower READCYCLECOUNTER using an mrs from PMCCNTR_EL0.
+ // This requires the Performance Monitors extension.
+ if (Subtarget->hasPerfMon())
+ setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
+
if (Subtarget->isTargetMachO()) {
// For iOS, we don't want the normal expansion of a libcall to
// sincos. We want to issue a libcall to __sincos_stret to avoid memory
@@ -456,12 +456,14 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setIndexedLoadAction(im, MVT::i64, Legal);
setIndexedLoadAction(im, MVT::f64, Legal);
setIndexedLoadAction(im, MVT::f32, Legal);
+ setIndexedLoadAction(im, MVT::f16, Legal);
setIndexedStoreAction(im, MVT::i8, Legal);
setIndexedStoreAction(im, MVT::i16, Legal);
setIndexedStoreAction(im, MVT::i32, Legal);
setIndexedStoreAction(im, MVT::i64, Legal);
setIndexedStoreAction(im, MVT::f64, Legal);
setIndexedStoreAction(im, MVT::f32, Legal);
+ setIndexedStoreAction(im, MVT::f16, Legal);
}
// Trap.
@@ -479,6 +481,10 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::SINT_TO_FP);
setTargetDAGCombine(ISD::UINT_TO_FP);
+ setTargetDAGCombine(ISD::FP_TO_SINT);
+ setTargetDAGCombine(ISD::FP_TO_UINT);
+ setTargetDAGCombine(ISD::FDIV);
+
setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
setTargetDAGCombine(ISD::ANY_EXTEND);
@@ -487,16 +493,18 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::BITCAST);
setTargetDAGCombine(ISD::CONCAT_VECTORS);
setTargetDAGCombine(ISD::STORE);
+ if (Subtarget->supportsAddressTopByteIgnored())
+ setTargetDAGCombine(ISD::LOAD);
setTargetDAGCombine(ISD::MUL);
setTargetDAGCombine(ISD::SELECT);
setTargetDAGCombine(ISD::VSELECT);
- setTargetDAGCombine(ISD::SELECT_CC);
setTargetDAGCombine(ISD::INTRINSIC_VOID);
setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
+ setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 8;
MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 4;
@@ -512,10 +520,10 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setMinFunctionAlignment(2);
- RequireStrictAlign = (Align == StrictAlign);
-
setHasExtractBitsInsn(true);
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+
if (Subtarget->hasNEON()) {
// FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
// silliness like this:
@@ -646,6 +654,9 @@ void AArch64TargetLowering::addTypeForNEON(EVT VT, EVT PromotedBitwiseVT) {
setOperationAction(ISD::FLOG10, VT.getSimpleVT(), Expand);
setOperationAction(ISD::FEXP, VT.getSimpleVT(), Expand);
setOperationAction(ISD::FEXP2, VT.getSimpleVT(), Expand);
+
+ // But we do support custom-lowering for FCOPYSIGN.
+ setOperationAction(ISD::FCOPYSIGN, VT.getSimpleVT(), Custom);
}
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT.getSimpleVT(), Custom);
@@ -686,6 +697,12 @@ void AArch64TargetLowering::addTypeForNEON(EVT VT, EVT PromotedBitwiseVT) {
for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
setOperationAction(Opcode, VT.getSimpleVT(), Legal);
+ // F[MIN|MAX][NUM|NAN] are available for all FP NEON types (not f16 though!).
+ if (VT.isFloatingPoint() && VT.getVectorElementType() != MVT::f16)
+ for (unsigned Opcode : {ISD::FMINNAN, ISD::FMAXNAN,
+ ISD::FMINNUM, ISD::FMAXNUM})
+ setOperationAction(Opcode, VT.getSimpleVT(), Legal);
+
if (Subtarget->isLittleEndian()) {
for (unsigned im = (unsigned)ISD::PRE_INC;
im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
@@ -730,7 +747,7 @@ void AArch64TargetLowering::computeKnownBitsForTargetNode(
break;
}
case ISD::INTRINSIC_W_CHAIN: {
- ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
+ ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
switch (IntID) {
default: return;
@@ -780,6 +797,34 @@ MVT AArch64TargetLowering::getScalarShiftAmountTy(const DataLayout &DL,
return MVT::i64;
}
+bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
+ unsigned AddrSpace,
+ unsigned Align,
+ bool *Fast) const {
+ if (Subtarget->requiresStrictAlign())
+ return false;
+
+ // FIXME: This is mostly true for Cyclone, but not necessarily others.
+ if (Fast) {
+ // FIXME: Define an attribute for slow unaligned accesses instead of
+ // relying on the CPU type as a proxy.
+ // On Cyclone, unaligned 128-bit stores are slow.
+ *Fast = !Subtarget->isCyclone() || VT.getStoreSize() != 16 ||
+ // See comments in performSTORECombine() for more details about
+ // these conditions.
+
+ // Code that uses clang vector extensions can mark that it
+ // wants unaligned accesses to be treated as fast by
+ // underspecifying alignment to be 1 or 2.
+ Align <= 2 ||
+
+ // Disregard v2i64. Memcpy lowering produces those and splitting
+ // them regresses performance on micro-benchmarks and olden/bh.
+ VT == MVT::v2i64;
+ }
+ return true;
+}
+
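
    The clang-vector-extension escape hatch the comment refers to looks roughly like the following (an assumed example, not taken from the patch); declaring the alignment as 1 or 2 keeps *Fast true even for 128-bit stores on Cyclone:

    // A 16-byte vector type with deliberately under-specified alignment. Loads and
    // stores of this type reach allowsMisalignedMemoryAccesses with Align <= 2 and
    // are therefore still reported as fast.
    typedef int v4si __attribute__((vector_size(16), aligned(2)));

    v4si copyElements(const v4si *src) {
      return *src; // lowered as an unaligned 128-bit load
    }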
FastISel *
AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
const TargetLibraryInfo *libInfo) const {
@@ -809,9 +854,10 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
case AArch64ISD::ADCS: return "AArch64ISD::ADCS";
case AArch64ISD::SBCS: return "AArch64ISD::SBCS";
case AArch64ISD::ANDS: return "AArch64ISD::ANDS";
+ case AArch64ISD::CCMP: return "AArch64ISD::CCMP";
+ case AArch64ISD::CCMN: return "AArch64ISD::CCMN";
+ case AArch64ISD::FCCMP: return "AArch64ISD::FCCMP";
case AArch64ISD::FCMP: return "AArch64ISD::FCMP";
- case AArch64ISD::FMIN: return "AArch64ISD::FMIN";
- case AArch64ISD::FMAX: return "AArch64ISD::FMAX";
case AArch64ISD::DUP: return "AArch64ISD::DUP";
case AArch64ISD::DUPLANE8: return "AArch64ISD::DUPLANE8";
case AArch64ISD::DUPLANE16: return "AArch64ISD::DUPLANE16";
@@ -931,8 +977,7 @@ AArch64TargetLowering::EmitF128CSEL(MachineInstr *MI,
const TargetInstrInfo *TII = Subtarget->getInstrInfo();
const BasicBlock *LLVM_BB = MBB->getBasicBlock();
DebugLoc DL = MI->getDebugLoc();
- MachineFunction::iterator It = MBB;
- ++It;
+ MachineFunction::iterator It = ++MBB->getIterator();
unsigned DestReg = MI->getOperand(0).getReg();
unsigned IfTrueReg = MI->getOperand(1).getReg();
@@ -1141,8 +1186,7 @@ static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
// register to WZR/XZR if it ends up being unused.
unsigned Opcode = AArch64ISD::SUBS;
- if (RHS.getOpcode() == ISD::SUB && isa<ConstantSDNode>(RHS.getOperand(0)) &&
- cast<ConstantSDNode>(RHS.getOperand(0))->getZExtValue() == 0 &&
+ if (RHS.getOpcode() == ISD::SUB && isNullConstant(RHS.getOperand(0)) &&
(CC == ISD::SETEQ || CC == ISD::SETNE)) {
// We'd like to combine a (CMP op1, (sub 0, op2) into a CMN instruction on
// the grounds that "op1 - (-op2) == op1 + op2". However, the C and V flags
@@ -1156,8 +1200,7 @@ static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
// the absence of information about op2.
Opcode = AArch64ISD::ADDS;
RHS = RHS.getOperand(1);
- } else if (LHS.getOpcode() == ISD::AND && isa<ConstantSDNode>(RHS) &&
- cast<ConstantSDNode>(RHS)->getZExtValue() == 0 &&
+ } else if (LHS.getOpcode() == ISD::AND && isNullConstant(RHS) &&
!isUnsignedIntSetCC(CC)) {
// Similarly, (CMP (and X, Y), 0) can be implemented with a TST
// (a.k.a. ANDS) except that the flags are only guaranteed to work for one
@@ -1167,14 +1210,230 @@ static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
LHS = LHS.getOperand(0);
}
- return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS)
+ return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT_CC), LHS, RHS)
.getValue(1);
}
+/// \defgroup AArch64CCMP CMP;CCMP matching
+///
+/// These functions deal with the formation of CMP;CCMP;... sequences.
+/// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of
+/// a comparison. They set the NZCV flags to a predefined value if their
+/// predicate is false. This allows us to express arbitrary conjunctions, for
+/// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))"
+/// expressed as:
+/// cmp A
+/// ccmp B, inv(CB), CA
+/// check for CB flags
+///
+/// In general we can create code for arbitrary "... (and (and A B) C)"
+/// sequences. We can also implement some "or" expressions, because "(or A B)"
+/// is equivalent to "not (and (not A) (not B))" and we can implement some
+/// negation operations:
+/// We can negate the results of a single comparison by inverting the flags
+/// used when the predicate fails and inverting the flags tested in the next
+/// instruction; we can also negate the results of the whole previous
+/// conditional compare sequence by inverting the flags tested in the next
+/// instruction. However, there is no way to negate the result of a partial
+/// sequence.
+///
+/// Therefore on encountering an "or" expression we can negate the subtree on
+/// one side and have to be able to push the negate to the leaves of the subtree
+/// on the other side (see also the comments in code). As a complete example:
+/// "or (or (setCA (cmp A)) (setCB (cmp B)))
+/// (and (setCC (cmp C)) (setCD (cmp D)))"
+/// is transformed to
+/// "not (and (not (and (setCC (cmp C)) (setCC (cmp D))))
+/// (and (not (setCA (cmp A)) (not (setCB (cmp B))))))"
+/// and implemented as:
+/// cmp C
+/// ccmp D, inv(CD), CC
+/// ccmp A, CA, inv(CD)
+/// ccmp B, CB, inv(CA)
+/// check for CB flags
+/// A counterexample is "or (and A B) (and C D)" which cannot be implemented
+/// by conditional compare sequences.
+/// @{
+
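
    As a concrete (hypothetical) source-level trigger for this matching, a conjunction of two integer comparisons such as the one below can be emitted as a single cmp/ccmp/cset sequence instead of two compares plus a branch; the exact NZCV immediate depends on the second condition:

    // "a == 17 && b > 5" forms (and (setcc a, 17, eq) (setcc b, 5, gt)), which the
    // matcher described above can lower to roughly:
    //   cmp  w0, #17
    //   ccmp w1, #5, #4, eq   // if a != 17, NZCV := 0b0100 so that "gt" is false
    //   cset w0, gt
    bool bothHold(int a, int b) { return a == 17 && b > 5; }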
+/// Create a conditional comparison; use CCMP, CCMN or FCCMP as appropriate.
+static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
+ ISD::CondCode CC, SDValue CCOp,
+ SDValue Condition, unsigned NZCV,
+ SDLoc DL, SelectionDAG &DAG) {
+ unsigned Opcode = 0;
+ if (LHS.getValueType().isFloatingPoint())
+ Opcode = AArch64ISD::FCCMP;
+ else if (RHS.getOpcode() == ISD::SUB) {
+ SDValue SubOp0 = RHS.getOperand(0);
+ if (isNullConstant(SubOp0) && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
+ // See emitComparison() on why we can only do this for SETEQ and SETNE.
+ Opcode = AArch64ISD::CCMN;
+ RHS = RHS.getOperand(1);
+ }
+ }
+ if (Opcode == 0)
+ Opcode = AArch64ISD::CCMP;
+
+ SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
+ return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp);
+}
+
+/// Returns true if @p Val is a tree of AND/OR/SETCC operations.
+/// CanPushNegate is set to true if we can push a negate operation through
+/// the tree in a was that we are left with AND operations and negate operations
+/// at the leafs only. i.e. "not (or (or x y) z)" can be changed to
+/// "and (and (not x) (not y)) (not z)"; "not (or (and x y) z)" cannot be
+/// brought into such a form.
+static bool isConjunctionDisjunctionTree(const SDValue Val, bool &CanPushNegate,
+ unsigned Depth = 0) {
+ if (!Val.hasOneUse())
+ return false;
+ unsigned Opcode = Val->getOpcode();
+ if (Opcode == ISD::SETCC) {
+ CanPushNegate = true;
+ return true;
+ }
+ // Protect against stack overflow.
+ if (Depth > 15)
+ return false;
+ if (Opcode == ISD::AND || Opcode == ISD::OR) {
+ SDValue O0 = Val->getOperand(0);
+ SDValue O1 = Val->getOperand(1);
+ bool CanPushNegateL;
+ if (!isConjunctionDisjunctionTree(O0, CanPushNegateL, Depth+1))
+ return false;
+ bool CanPushNegateR;
+ if (!isConjunctionDisjunctionTree(O1, CanPushNegateR, Depth+1))
+ return false;
+ // We cannot push a negate through an AND operation (it would become an OR),
+ // we can however change a (not (or x y)) to (and (not x) (not y)) if we can
+ // push the negate through the x/y subtrees.
+ CanPushNegate = (Opcode == ISD::OR) && CanPushNegateL && CanPushNegateR;
+ return true;
+ }
+ return false;
+}
+
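
    The CanPushNegate rule is just De Morgan's law restricted to the shapes a CCMP chain can express; a small hedged illustration:

    // A negate distributes over OR, leaving only AND nodes plus negated leaves:
    //   not (or x y)         ==  and (not x) (not y)         -- representable
    //   not (or (and x y) z) ==  and (not (and x y)) (not z) -- stuck: the inner
    //                            negate sits on an AND, not on a leaf comparison.
    bool deMorganHolds(bool x, bool y) { return !(x || y) == (!x && !y); } // always true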
+/// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain
+/// of CCMP/FCCMP ops. See @ref AArch64CCMP.
+/// Tries to transform the given i1-producing node @p Val to a series of
+/// compare and conditional compare operations. @returns an NZCV-flags-producing
+/// node and sets @p OutCC to the flags that should be tested, or returns
+/// SDValue() if the transformation was not possible.
+/// On recursive invocations @p PushNegate may be set to true to have negation
+/// effects pushed to the tree leaves; @p Predicate is an NZCV flag predicate
+/// for the comparisons in the current subtree; @p Depth limits the search
+/// depth to avoid stack overflow.
+static SDValue emitConjunctionDisjunctionTree(SelectionDAG &DAG, SDValue Val,
+ AArch64CC::CondCode &OutCC, bool PushNegate = false,
+ SDValue CCOp = SDValue(), AArch64CC::CondCode Predicate = AArch64CC::AL,
+ unsigned Depth = 0) {
+ // We're at a tree leaf, produce a conditional comparison operation.
+ unsigned Opcode = Val->getOpcode();
+ if (Opcode == ISD::SETCC) {
+ SDValue LHS = Val->getOperand(0);
+ SDValue RHS = Val->getOperand(1);
+ ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get();
+ bool isInteger = LHS.getValueType().isInteger();
+ if (PushNegate)
+ CC = getSetCCInverse(CC, isInteger);
+ SDLoc DL(Val);
+ // Determine OutCC and handle FP special case.
+ if (isInteger) {
+ OutCC = changeIntCCToAArch64CC(CC);
+ } else {
+ assert(LHS.getValueType().isFloatingPoint());
+ AArch64CC::CondCode ExtraCC;
+ changeFPCCToAArch64CC(CC, OutCC, ExtraCC);
+ // Surprisingly, some floating point conditions can't be tested with a
+ // single condition code. Construct an additional comparison in this case.
+ // See comment below on how we deal with OR conditions.
+ if (ExtraCC != AArch64CC::AL) {
+ SDValue ExtraCmp;
+ if (!CCOp.getNode())
+ ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG);
+ else {
+ SDValue ConditionOp = DAG.getConstant(Predicate, DL, MVT_CC);
+ // Note that we want the inverse of ExtraCC, so NZCV is not inverted.
+ unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(ExtraCC);
+ ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, ConditionOp,
+ NZCV, DL, DAG);
+ }
+ CCOp = ExtraCmp;
+ Predicate = AArch64CC::getInvertedCondCode(ExtraCC);
+ OutCC = AArch64CC::getInvertedCondCode(OutCC);
+ }
+ }
+
+ // Produce a normal comparison if we are first in the chain
+ if (!CCOp.getNode())
+ return emitComparison(LHS, RHS, CC, DL, DAG);
+ // Otherwise produce a ccmp.
+ SDValue ConditionOp = DAG.getConstant(Predicate, DL, MVT_CC);
+ AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC);
+ unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
+ return emitConditionalComparison(LHS, RHS, CC, CCOp, ConditionOp, NZCV, DL,
+ DAG);
+ } else if ((Opcode != ISD::AND && Opcode != ISD::OR) || !Val->hasOneUse())
+ return SDValue();
+
+ assert((Opcode == ISD::OR || !PushNegate)
+ && "Can only push negate through OR operation");
+
+ // Check if both sides can be transformed.
+ SDValue LHS = Val->getOperand(0);
+ SDValue RHS = Val->getOperand(1);
+ bool CanPushNegateL;
+ if (!isConjunctionDisjunctionTree(LHS, CanPushNegateL, Depth+1))
+ return SDValue();
+ bool CanPushNegateR;
+ if (!isConjunctionDisjunctionTree(RHS, CanPushNegateR, Depth+1))
+ return SDValue();
+
+ // Do we need to negate our operands?
+ bool NegateOperands = Opcode == ISD::OR;
+ // We can negate the results of all previous operations by inverting the
+ // predicate flags giving us a free negation for one side. For the other side
+ // we need to be able to push the negation to the leaves of the tree.
+ if (NegateOperands) {
+ if (!CanPushNegateL && !CanPushNegateR)
+ return SDValue();
+ // Order the side where we can push the negate through to LHS.
+ if (!CanPushNegateL && CanPushNegateR)
+ std::swap(LHS, RHS);
+ } else {
+ bool NeedsNegOutL = LHS->getOpcode() == ISD::OR;
+ bool NeedsNegOutR = RHS->getOpcode() == ISD::OR;
+ if (NeedsNegOutL && NeedsNegOutR)
+ return SDValue();
+ // Order the side where we need to negate the output flags to RHS so it
+ // gets emitted first.
+ if (NeedsNegOutL)
+ std::swap(LHS, RHS);
+ }
+
+ // Emit RHS. If we want to negate the tree we only need to push a negate
+ // through if we are already in a PushNegate case, otherwise we can negate
+ // the "flags to test" afterwards.
+ AArch64CC::CondCode RHSCC;
+ SDValue CmpR = emitConjunctionDisjunctionTree(DAG, RHS, RHSCC, PushNegate,
+ CCOp, Predicate, Depth+1);
+ if (NegateOperands && !PushNegate)
+ RHSCC = AArch64CC::getInvertedCondCode(RHSCC);
+ // Emit LHS. We must push the negate through if we need to negate it.
+ SDValue CmpL = emitConjunctionDisjunctionTree(DAG, LHS, OutCC, NegateOperands,
+ CmpR, RHSCC, Depth+1);
+ // If we transformed an OR to an AND then we have to negate the result
+ // (or absorb a PushNegate resulting in a double negation).
+ if (Opcode == ISD::OR && !PushNegate)
+ OutCC = AArch64CC::getInvertedCondCode(OutCC);
+ return CmpL;
+}
+
+/// @}
+
static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
SDValue &AArch64cc, SelectionDAG &DAG, SDLoc dl) {
- SDValue Cmp;
- AArch64CC::CondCode AArch64CC;
if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
EVT VT = RHS.getValueType();
uint64_t C = RHSC->getZExtValue();
@@ -1229,47 +1488,56 @@ static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
}
}
}
- // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
- // For the i8 operand, the largest immediate is 255, so this can be easily
- // encoded in the compare instruction. For the i16 operand, however, the
- // largest immediate cannot be encoded in the compare.
- // Therefore, use a sign extending load and cmn to avoid materializing the -1
- // constant. For example,
- // movz w1, #65535
- // ldrh w0, [x0, #0]
- // cmp w0, w1
- // >
- // ldrsh w0, [x0, #0]
- // cmn w0, #1
- // Fundamental, we're relying on the property that (zext LHS) == (zext RHS)
- // if and only if (sext LHS) == (sext RHS). The checks are in place to ensure
- // both the LHS and RHS are truely zero extended and to make sure the
- // transformation is profitable.
+ SDValue Cmp;
+ AArch64CC::CondCode AArch64CC;
if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa<ConstantSDNode>(RHS)) {
- if ((cast<ConstantSDNode>(RHS)->getZExtValue() >> 16 == 0) &&
- isa<LoadSDNode>(LHS)) {
- if (cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD &&
- cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 &&
- LHS.getNode()->hasNUsesOfValue(1, 0)) {
- int16_t ValueofRHS = cast<ConstantSDNode>(RHS)->getZExtValue();
- if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) {
- SDValue SExt =
- DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS,
- DAG.getValueType(MVT::i16));
- Cmp = emitComparison(SExt,
- DAG.getConstant(ValueofRHS, dl,
- RHS.getValueType()),
- CC, dl, DAG);
- AArch64CC = changeIntCCToAArch64CC(CC);
- AArch64cc = DAG.getConstant(AArch64CC, dl, MVT::i32);
- return Cmp;
- }
+ const ConstantSDNode *RHSC = cast<ConstantSDNode>(RHS);
+
+ // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
+ // For the i8 operand, the largest immediate is 255, so this can be easily
+ // encoded in the compare instruction. For the i16 operand, however, the
+ // largest immediate cannot be encoded in the compare.
+ // Therefore, use a sign extending load and cmn to avoid materializing the
+ // -1 constant. For example,
+ // movz w1, #65535
+ // ldrh w0, [x0, #0]
+ // cmp w0, w1
+ // >
+ // ldrsh w0, [x0, #0]
+ // cmn w0, #1
+ // Fundamentally, we're relying on the property that (zext LHS) == (zext RHS)
+ // if and only if (sext LHS) == (sext RHS). The checks are in place to
+ // ensure both the LHS and RHS are truly zero extended and to make sure the
+ // transformation is profitable.
+ if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(LHS) &&
+ cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD &&
+ cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 &&
+ LHS.getNode()->hasNUsesOfValue(1, 0)) {
+ int16_t ValueofRHS = cast<ConstantSDNode>(RHS)->getZExtValue();
+ if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) {
+ SDValue SExt =
+ DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS,
+ DAG.getValueType(MVT::i16));
+ Cmp = emitComparison(SExt, DAG.getConstant(ValueofRHS, dl,
+ RHS.getValueType()),
+ CC, dl, DAG);
+ AArch64CC = changeIntCCToAArch64CC(CC);
}
}
+
+ if (!Cmp && (RHSC->isNullValue() || RHSC->isOne())) {
+ if ((Cmp = emitConjunctionDisjunctionTree(DAG, LHS, AArch64CC))) {
+ if ((CC == ISD::SETNE) ^ RHSC->isNullValue())
+ AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
+ }
+ }
+ }
+
+ if (!Cmp) {
+ Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
+ AArch64CC = changeIntCCToAArch64CC(CC);
}
- Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
- AArch64CC = changeIntCCToAArch64CC(CC);
- AArch64cc = DAG.getConstant(AArch64CC, dl, MVT::i32);
+ AArch64cc = DAG.getConstant(AArch64CC, dl, MVT_CC);
return Cmp;
}
@@ -1391,8 +1659,7 @@ getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
SDValue AArch64TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG,
RTLIB::Libcall Call) const {
SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end());
- return makeLibCall(DAG, Call, MVT::f128, &Ops[0], Ops.size(), false,
- SDLoc(Op)).first;
+ return makeLibCall(DAG, Call, MVT::f128, Ops, false, SDLoc(Op)).first;
}
static SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) {
@@ -1571,8 +1838,8 @@ SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
// precise. That doesn't take part in the LibCall so we can't directly use
// LowerF128Call.
SDValue SrcVal = Op.getOperand(0);
- return makeLibCall(DAG, LC, Op.getValueType(), &SrcVal, 1,
- /*isSigned*/ false, SDLoc(Op)).first;
+ return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, /*isSigned*/ false,
+ SDLoc(Op)).first;
}
static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
@@ -1581,6 +1848,16 @@ static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
// in the cost tables.
EVT InVT = Op.getOperand(0).getValueType();
EVT VT = Op.getValueType();
+ unsigned NumElts = InVT.getVectorNumElements();
+
+ // f16 vectors are promoted to f32 before a conversion.
+ if (InVT.getVectorElementType() == MVT::f16) {
+ MVT NewVT = MVT::getVectorVT(MVT::f32, NumElts);
+ SDLoc dl(Op);
+ return DAG.getNode(
+ Op.getOpcode(), dl, Op.getValueType(),
+ DAG.getNode(ISD::FP_EXTEND, dl, NewVT, Op.getOperand(0)));
+ }
if (VT.getSizeInBits() < InVT.getSizeInBits()) {
SDLoc dl(Op);
@@ -1628,8 +1905,7 @@ SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), Op.getValueType());
SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end());
- return makeLibCall(DAG, LC, Op.getValueType(), &Ops[0], Ops.size(), false,
- SDLoc(Op)).first;
+ return makeLibCall(DAG, LC, Op.getValueType(), Ops, false, SDLoc(Op)).first;
}
static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
@@ -1931,6 +2207,31 @@ static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
}
+SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
+ SelectionDAG &DAG) const {
+ unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ SDLoc dl(Op);
+ switch (IntNo) {
+ default: return SDValue(); // Don't custom lower most intrinsics.
+ case Intrinsic::aarch64_thread_pointer: {
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
+ return DAG.getNode(AArch64ISD::THREAD_POINTER, dl, PtrVT);
+ }
+ case Intrinsic::aarch64_neon_smax:
+ return DAG.getNode(ISD::SMAX, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
+ case Intrinsic::aarch64_neon_umax:
+ return DAG.getNode(ISD::UMAX, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
+ case Intrinsic::aarch64_neon_smin:
+ return DAG.getNode(ISD::SMIN, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
+ case Intrinsic::aarch64_neon_umin:
+ return DAG.getNode(ISD::UMIN, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
+ }
+}
+
SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
SelectionDAG &DAG) const {
switch (Op.getOpcode()) {
@@ -2032,14 +2333,11 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
return LowerFSINCOS(Op, DAG);
case ISD::MUL:
return LowerMUL(Op, DAG);
+ case ISD::INTRINSIC_WO_CHAIN:
+ return LowerINTRINSIC_WO_CHAIN(Op, DAG);
}
}
-/// getFunctionAlignment - Return the Log2 alignment of this function.
-unsigned AArch64TargetLowering::getFunctionAlignment(const Function *F) const {
- return 2;
-}
-
//===----------------------------------------------------------------------===//
// Calling Convention Implementation
//===----------------------------------------------------------------------===//
@@ -2214,9 +2512,10 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
break;
}
- ArgValue = DAG.getExtLoad(ExtType, DL, VA.getLocVT(), Chain, FIN,
- MachinePointerInfo::getFixedStack(FI),
- MemVT, false, false, false, 0);
+ ArgValue = DAG.getExtLoad(
+ ExtType, DL, VA.getLocVT(), Chain, FIN,
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
+ MemVT, false, false, false, 0);
InVals.push_back(ArgValue);
}
@@ -2289,9 +2588,10 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) {
unsigned VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass);
SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
- SDValue Store =
- DAG.getStore(Val.getValue(1), DL, Val, FIN,
- MachinePointerInfo::getStack(i * 8), false, false, 0);
+ SDValue Store = DAG.getStore(
+ Val.getValue(1), DL, Val, FIN,
+ MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 8), false,
+ false, 0);
MemOps.push_back(Store);
FIN =
DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT));
@@ -2318,9 +2618,10 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
unsigned VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass);
SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128);
- SDValue Store =
- DAG.getStore(Val.getValue(1), DL, Val, FIN,
- MachinePointerInfo::getStack(i * 16), false, false, 0);
+ SDValue Store = DAG.getStore(
+ Val.getValue(1), DL, Val, FIN,
+ MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 16),
+ false, false, 0);
MemOps.push_back(Store);
FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN,
DAG.getConstant(16, DL, PtrVT));
@@ -2453,8 +2754,8 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization(
*DAG.getContext());
CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, true));
- for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
- if (!ArgLocs[i].isRegLoc())
+ for (const CCValAssign &ArgLoc : ArgLocs)
+ if (!ArgLoc.isRegLoc())
return false;
}
@@ -2758,7 +3059,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
int FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
DstAddr = DAG.getFrameIndex(FI, PtrVT);
- DstInfo = MachinePointerInfo::getFixedStack(FI);
+ DstInfo =
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI);
// Make sure any stack arguments overlapping with where we're storing
// are loaded before this eventual operation. Otherwise they'll be
@@ -2768,7 +3070,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
- DstInfo = MachinePointerInfo::getStack(LocMemOffset);
+ DstInfo = MachinePointerInfo::getStack(DAG.getMachineFunction(),
+ LocMemOffset);
}
if (Outs[i].Flags.isByVal()) {
@@ -2802,9 +3105,9 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
// Build a sequence of copy-to-reg nodes chained together with token chain
// and flag operands which copy the outgoing args into the appropriate regs.
SDValue InFlag;
- for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
- Chain = DAG.getCopyToReg(Chain, DL, RegsToPass[i].first,
- RegsToPass[i].second, InFlag);
+ for (auto &RegToPass : RegsToPass) {
+ Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
+ RegToPass.second, InFlag);
InFlag = Chain.getValue(1);
}
@@ -2860,9 +3163,9 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
// Add argument registers to the end of the list so that they are known live
// into the call.
- for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
- Ops.push_back(DAG.getRegister(RegsToPass[i].first,
- RegsToPass[i].second.getValueType()));
+ for (auto &RegToPass : RegsToPass)
+ Ops.push_back(DAG.getRegister(RegToPass.first,
+ RegToPass.second.getValueType()));
// Add a register mask operand representing the call-preserved registers.
const uint32_t *Mask;
@@ -2968,6 +3271,19 @@ AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
Flag = Chain.getValue(1);
RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
}
+ const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
+ const MCPhysReg *I =
+ TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
+ if (I) {
+ for (; *I; ++I) {
+ if (AArch64::GPR64RegClass.contains(*I))
+ RetOps.push_back(DAG.getRegister(*I, MVT::i64));
+ else if (AArch64::FPR64RegClass.contains(*I))
+ RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
+ else
+ llvm_unreachable("Unexpected register class in CSRsViaCopy!");
+ }
+ }
RetOps[0] = Chain; // Update chain.
@@ -3010,11 +3326,12 @@ SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
unsigned char LoFlags = AArch64II::MO_PAGEOFF | AArch64II::MO_NC;
SDValue Lo = DAG.getTargetConstantPool(GV, PtrVT, 0, 0, LoFlags);
SDValue PoolAddr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo);
- SDValue GlobalAddr = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), PoolAddr,
- MachinePointerInfo::getConstantPool(),
- /*isVolatile=*/ false,
- /*isNonTemporal=*/ true,
- /*isInvariant=*/ true, 8);
+ SDValue GlobalAddr = DAG.getLoad(
+ PtrVT, DL, DAG.getEntryNode(), PoolAddr,
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
+ /*isVolatile=*/false,
+ /*isNonTemporal=*/true,
+ /*isInvariant=*/true, 8);
if (GN->getOffset() != 0)
return DAG.getNode(ISD::ADD, DL, PtrVT, GlobalAddr,
DAG.getConstant(GN->getOffset(), DL, PtrVT));
@@ -3087,8 +3404,9 @@ AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
// to obtain the address of the variable.
SDValue Chain = DAG.getEntryNode();
SDValue FuncTLVGet =
- DAG.getLoad(MVT::i64, DL, Chain, DescAddr, MachinePointerInfo::getGOT(),
- false, true, true, 8);
+ DAG.getLoad(MVT::i64, DL, Chain, DescAddr,
+ MachinePointerInfo::getGOT(DAG.getMachineFunction()), false,
+ true, true, 8);
Chain = FuncTLVGet.getValue(1);
MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
@@ -3160,6 +3478,10 @@ AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op,
const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal());
+
+ if (DAG.getTarget().Options.EmulatedTLS)
+ return LowerToTLSEmulatedModel(GA, DAG);
+
if (!EnableAArch64ELFLocalDynamicTLSGeneration) {
if (Model == TLSModel::LocalDynamic)
Model = TLSModel::GeneralDynamic;
@@ -3277,8 +3599,7 @@ SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
// Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
// instruction.
unsigned Opc = LHS.getOpcode();
- if (LHS.getResNo() == 1 && isa<ConstantSDNode>(RHS) &&
- cast<ConstantSDNode>(RHS)->isOne() &&
+ if (LHS.getResNo() == 1 && isOneConstant(RHS) &&
(Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO)) {
assert((CC == ISD::SETEQ || CC == ISD::SETNE) &&
@@ -3392,17 +3713,11 @@ SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
SDValue In1 = Op.getOperand(0);
SDValue In2 = Op.getOperand(1);
EVT SrcVT = In2.getValueType();
- if (SrcVT != VT) {
- if (SrcVT == MVT::f32 && VT == MVT::f64)
- In2 = DAG.getNode(ISD::FP_EXTEND, DL, VT, In2);
- else if (SrcVT == MVT::f64 && VT == MVT::f32)
- In2 = DAG.getNode(ISD::FP_ROUND, DL, VT, In2,
- DAG.getIntPtrConstant(0, DL));
- else
- // FIXME: Src type is different, bail out for now. Can VT really be a
- // vector type?
- return SDValue();
- }
+
+ if (SrcVT.bitsLT(VT))
+ In2 = DAG.getNode(ISD::FP_EXTEND, DL, VT, In2);
+ else if (SrcVT.bitsGT(VT))
+ In2 = DAG.getNode(ISD::FP_ROUND, DL, VT, In2, DAG.getIntPtrConstant(0, DL));
EVT VecVT;
EVT EltVT;
@@ -3410,7 +3725,7 @@ SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
SDValue VecVal1, VecVal2;
if (VT == MVT::f32 || VT == MVT::v2f32 || VT == MVT::v4f32) {
EltVT = MVT::i32;
- VecVT = MVT::v4i32;
+ VecVT = (VT == MVT::v2f32 ? MVT::v2i32 : MVT::v4i32);
EltMask = 0x80000000ULL;
if (!VT.isVector()) {
@@ -3571,32 +3886,6 @@ SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
}
}
-/// A SELECT_CC operation is really some kind of max or min if both values being
-/// compared are, in some sense, equal to the results in either case. However,
-/// it is permissible to compare f32 values and produce directly extended f64
-/// values.
-///
-/// Extending the comparison operands would also be allowed, but is less likely
-/// to happen in practice since their use is right here. Note that truncate
-/// operations would *not* be semantically equivalent.
-static bool selectCCOpsAreFMaxCompatible(SDValue Cmp, SDValue Result) {
- if (Cmp == Result)
- return (Cmp.getValueType() == MVT::f32 ||
- Cmp.getValueType() == MVT::f64);
-
- ConstantFPSDNode *CCmp = dyn_cast<ConstantFPSDNode>(Cmp);
- ConstantFPSDNode *CResult = dyn_cast<ConstantFPSDNode>(Result);
- if (CCmp && CResult && Cmp.getValueType() == MVT::f32 &&
- Result.getValueType() == MVT::f64) {
- bool Lossy;
- APFloat CmpVal = CCmp->getValueAPF();
- CmpVal.convert(APFloat::IEEEdouble, APFloat::rmNearestTiesToEven, &Lossy);
- return CResult->getValueAPF().bitwiseIsEqual(CmpVal);
- }
-
- return Result->getOpcode() == ISD::FP_EXTEND && Result->getOperand(0) == Cmp;
-}
-
SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS,
SDValue RHS, SDValue TVal,
SDValue FVal, SDLoc dl,
@@ -3614,7 +3903,13 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS,
}
}
- // Handle integers first.
+ // Also handle f16, for which we need to do a f32 comparison.
+ if (LHS.getValueType() == MVT::f16) {
+ LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
+ RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
+ }
+
+ // Next, handle integers.
if (LHS.getValueType().isInteger()) {
assert((LHS.getValueType() == RHS.getValueType()) &&
(LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
@@ -3637,9 +3932,7 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS,
} else if (TVal.getOpcode() == ISD::XOR) {
// If TVal is a NOT we want to swap TVal and FVal so that we can match
// with a CSINV rather than a CSEL.
- ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(TVal.getOperand(1));
-
- if (CVal && CVal->isAllOnesValue()) {
+ if (isAllOnesConstant(TVal.getOperand(1))) {
std::swap(TVal, FVal);
std::swap(CTVal, CFVal);
CC = ISD::getSetCCInverse(CC, true);
@@ -3647,9 +3940,7 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS,
} else if (TVal.getOpcode() == ISD::SUB) {
// If TVal is a negation (SUB from 0) we want to swap TVal and FVal so
// that we can match with a CSNEG rather than a CSEL.
- ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(TVal.getOperand(0));
-
- if (CVal && CVal->isNullValue()) {
+ if (isNullConstant(TVal.getOperand(0))) {
std::swap(TVal, FVal);
std::swap(CTVal, CFVal);
CC = ISD::getSetCCInverse(CC, true);
@@ -4109,46 +4400,57 @@ SDValue AArch64TargetLowering::LowerShiftRightParts(SDValue Op,
SDValue ShOpLo = Op.getOperand(0);
SDValue ShOpHi = Op.getOperand(1);
SDValue ShAmt = Op.getOperand(2);
- SDValue ARMcc;
unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64,
DAG.getConstant(VTBits, dl, MVT::i64), ShAmt);
- SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
+ SDValue HiBitsForLo = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
+
+ // Unfortunately, if ShAmt == 0, we just calculated "(SHL ShOpHi, 64)" which
+ // is "undef". We wanted 0, so CSEL it directly.
+ SDValue Cmp = emitComparison(ShAmt, DAG.getConstant(0, dl, MVT::i64),
+ ISD::SETEQ, dl, DAG);
+ SDValue CCVal = DAG.getConstant(AArch64CC::EQ, dl, MVT::i32);
+ HiBitsForLo =
+ DAG.getNode(AArch64ISD::CSEL, dl, VT, DAG.getConstant(0, dl, MVT::i64),
+ HiBitsForLo, CCVal, Cmp);
+
SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt,
DAG.getConstant(VTBits, dl, MVT::i64));
- SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
- SDValue Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, dl, MVT::i64),
- ISD::SETGE, dl, DAG);
- SDValue CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32);
+ SDValue LoBitsForLo = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
+ SDValue LoForNormalShift =
+ DAG.getNode(ISD::OR, dl, VT, LoBitsForLo, HiBitsForLo);
- SDValue FalseValLo = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
- SDValue TrueValLo = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
- SDValue Lo =
- DAG.getNode(AArch64ISD::CSEL, dl, VT, TrueValLo, FalseValLo, CCVal, Cmp);
+ Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, dl, MVT::i64), ISD::SETGE,
+ dl, DAG);
+ CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32);
+ SDValue LoForBigShift = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
+ SDValue Lo = DAG.getNode(AArch64ISD::CSEL, dl, VT, LoForBigShift,
+ LoForNormalShift, CCVal, Cmp);
// AArch64 shifts larger than the register width are wrapped rather than
// clamped, so we can't just emit "hi >> x".
- SDValue FalseValHi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
- SDValue TrueValHi = Opc == ISD::SRA
- ? DAG.getNode(Opc, dl, VT, ShOpHi,
- DAG.getConstant(VTBits - 1, dl,
- MVT::i64))
- : DAG.getConstant(0, dl, VT);
- SDValue Hi =
- DAG.getNode(AArch64ISD::CSEL, dl, VT, TrueValHi, FalseValHi, CCVal, Cmp);
+ SDValue HiForNormalShift = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
+ SDValue HiForBigShift =
+ Opc == ISD::SRA
+ ? DAG.getNode(Opc, dl, VT, ShOpHi,
+ DAG.getConstant(VTBits - 1, dl, MVT::i64))
+ : DAG.getConstant(0, dl, VT);
+ SDValue Hi = DAG.getNode(AArch64ISD::CSEL, dl, VT, HiForBigShift,
+ HiForNormalShift, CCVal, Cmp);
SDValue Ops[2] = { Lo, Hi };
return DAG.getMergeValues(Ops, dl);
}
+
/// LowerShiftLeftParts - Lower SHL_PARTS, which returns two
/// i64 values and take a 2 x i64 value to shift plus a shift amount.
SDValue AArch64TargetLowering::LowerShiftLeftParts(SDValue Op,
- SelectionDAG &DAG) const {
+ SelectionDAG &DAG) const {
assert(Op.getNumOperands() == 3 && "Not a double-shift!");
EVT VT = Op.getValueType();
unsigned VTBits = VT.getSizeInBits();
@@ -4156,31 +4458,41 @@ SDValue AArch64TargetLowering::LowerShiftLeftParts(SDValue Op,
SDValue ShOpLo = Op.getOperand(0);
SDValue ShOpHi = Op.getOperand(1);
SDValue ShAmt = Op.getOperand(2);
- SDValue ARMcc;
assert(Op.getOpcode() == ISD::SHL_PARTS);
SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64,
DAG.getConstant(VTBits, dl, MVT::i64), ShAmt);
- SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
+ SDValue LoBitsForHi = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
+
+ // Unfortunately, if ShAmt == 0, we just calculated "(SRL ShOpLo, 64)" which
+ // is "undef". We wanted 0, so CSEL it directly.
+ SDValue Cmp = emitComparison(ShAmt, DAG.getConstant(0, dl, MVT::i64),
+ ISD::SETEQ, dl, DAG);
+ SDValue CCVal = DAG.getConstant(AArch64CC::EQ, dl, MVT::i32);
+ LoBitsForHi =
+ DAG.getNode(AArch64ISD::CSEL, dl, VT, DAG.getConstant(0, dl, MVT::i64),
+ LoBitsForHi, CCVal, Cmp);
+
SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt,
DAG.getConstant(VTBits, dl, MVT::i64));
- SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
- SDValue Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
+ SDValue HiBitsForHi = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
+ SDValue HiForNormalShift =
+ DAG.getNode(ISD::OR, dl, VT, LoBitsForHi, HiBitsForHi);
- SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
+ SDValue HiForBigShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
- SDValue Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, dl, MVT::i64),
- ISD::SETGE, dl, DAG);
- SDValue CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32);
- SDValue Hi =
- DAG.getNode(AArch64ISD::CSEL, dl, VT, Tmp3, FalseVal, CCVal, Cmp);
+ Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, dl, MVT::i64), ISD::SETGE,
+ dl, DAG);
+ CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32);
+ SDValue Hi = DAG.getNode(AArch64ISD::CSEL, dl, VT, HiForBigShift,
+ HiForNormalShift, CCVal, Cmp);
// AArch64 shifts of larger than register sizes are wrapped rather than
// clamped, so we can't just emit "lo << a" if a is too big.
- SDValue TrueValLo = DAG.getConstant(0, dl, VT);
- SDValue FalseValLo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
- SDValue Lo =
- DAG.getNode(AArch64ISD::CSEL, dl, VT, TrueValLo, FalseValLo, CCVal, Cmp);
+ SDValue LoForBigShift = DAG.getConstant(0, dl, VT);
+ SDValue LoForNormalShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
+ SDValue Lo = DAG.getNode(AArch64ISD::CSEL, dl, VT, LoForBigShift,
+ LoForNormalShift, CCVal, Cmp);
SDValue Ops[2] = { Lo, Hi };
return DAG.getMergeValues(Ops, dl);
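
For orientation, a minimal scalar reference model of the SHL_PARTS semantics lowered above (illustration only, not code from the patch; shlParts128 is an invented name). The Amt == 0 and Amt >= 64 branches are exactly the cases the extra CSELs have to guard, because AArch64 wraps shift amounts of 64 and above instead of clamping them.

    #include <cstdint>
    #include <utility>

    // Lo/Hi form one 128-bit value; Amt may be 0..127.
    static std::pair<uint64_t, uint64_t> shlParts128(uint64_t Lo, uint64_t Hi,
                                                     unsigned Amt) {
      if (Amt == 0)
        return {Lo, Hi};                     // (SRL Lo, 64) would be undef
      if (Amt < 64)
        return {Lo << Amt, (Hi << Amt) | (Lo >> (64 - Amt))};
      return {0, Lo << (Amt - 64)};          // "big shift": Lo spills into Hi
    }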
@@ -4362,8 +4674,7 @@ void AArch64TargetLowering::LowerAsmOperandForConstraint(
// Validate and return a target constant for them if we can.
case 'z': {
// 'z' maps to xzr or wzr so it needs an input of 0.
- ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
- if (!C || C->getZExtValue() != 0)
+ if (!isNullConstant(Op))
return;
if (Op.getValueType() == MVT::i64)
@@ -5653,11 +5964,10 @@ static SDValue NormalizeBuildVector(SDValue Op,
return Op;
SmallVector<SDValue, 16> Ops;
- for (unsigned I = 0, E = VT.getVectorNumElements(); I != E; ++I) {
- SDValue Lane = Op.getOperand(I);
- if (Lane.getOpcode() == ISD::Constant) {
+ for (SDValue Lane : Op->ops()) {
+ if (auto *CstLane = dyn_cast<ConstantSDNode>(Lane)) {
APInt LowBits(EltTy.getSizeInBits(),
- cast<ConstantSDNode>(Lane)->getZExtValue());
+ CstLane->getZExtValue());
Lane = DAG.getConstant(LowBits.getZExtValue(), dl, MVT::i32);
}
Ops.push_back(Lane);
@@ -5997,8 +6307,7 @@ FailedModImm:
// Empirical tests suggest this is rarely worth it for vectors of length <= 2.
if (NumElts >= 4) {
- SDValue shuffle = ReconstructShuffle(Op, DAG);
- if (shuffle != SDValue())
+ if (SDValue shuffle = ReconstructShuffle(Op, DAG))
return shuffle;
}
@@ -6017,7 +6326,10 @@ FailedModImm:
// a) Avoid a RMW dependency on the full vector register, and
// b) Allow the register coalescer to fold away the copy if the
// value is already in an S or D register.
- if (Op0.getOpcode() != ISD::UNDEF && (ElemSize == 32 || ElemSize == 64)) {
+ // Do not do this for UNDEF/LOAD nodes because we have better patterns
+ // for those that avoid the SCALAR_TO_VECTOR/BUILD_VECTOR.
+ if (Op0.getOpcode() != ISD::UNDEF && Op0.getOpcode() != ISD::LOAD &&
+ (ElemSize == 32 || ElemSize == 64)) {
unsigned SubIdx = ElemSize == 32 ? AArch64::ssub : AArch64::dsub;
MachineSDNode *N =
DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, dl, VT, Vec, Op0,
@@ -6123,24 +6435,11 @@ SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
unsigned Val = Cst->getZExtValue();
unsigned Size = Op.getValueType().getSizeInBits();
- if (Val == 0) {
- switch (Size) {
- case 8:
- return DAG.getTargetExtractSubreg(AArch64::bsub, dl, Op.getValueType(),
- Op.getOperand(0));
- case 16:
- return DAG.getTargetExtractSubreg(AArch64::hsub, dl, Op.getValueType(),
- Op.getOperand(0));
- case 32:
- return DAG.getTargetExtractSubreg(AArch64::ssub, dl, Op.getValueType(),
- Op.getOperand(0));
- case 64:
- return DAG.getTargetExtractSubreg(AArch64::dsub, dl, Op.getValueType(),
- Op.getOperand(0));
- default:
- llvm_unreachable("Unexpected vector type in extract_subvector!");
- }
- }
+
+ // This will get lowered to an appropriate EXTRACT_SUBREG in ISel.
+ if (Val == 0)
+ return Op;
+
// If this is extracting the upper 64-bits of a 128-bit vector, we match
// that directly.
if (Size == 64 && Val * VT.getVectorElementType().getSizeInBits() == 64)
@@ -6213,26 +6512,20 @@ static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
/// 0 <= Value <= ElementBits for a long left shift.
static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
assert(VT.isVector() && "vector shift count is not a vector type");
- unsigned ElementBits = VT.getVectorElementType().getSizeInBits();
+ int64_t ElementBits = VT.getVectorElementType().getSizeInBits();
if (!getVShiftImm(Op, ElementBits, Cnt))
return false;
return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
}
/// isVShiftRImm - Check if this is a valid build_vector for the immediate
-/// operand of a vector shift right operation. For a shift opcode, the value
-/// is positive, but for an intrinsic the value count must be negative. The
-/// absolute value must be in the range:
-/// 1 <= |Value| <= ElementBits for a right shift; or
-/// 1 <= |Value| <= ElementBits/2 for a narrow right shift.
-static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic,
- int64_t &Cnt) {
+/// operand of a vector shift right operation. The value must be in the range:
+/// 1 <= Value <= ElementBits for a right shift.
+static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt) {
assert(VT.isVector() && "vector shift count is not a vector type");
- unsigned ElementBits = VT.getVectorElementType().getSizeInBits();
+ int64_t ElementBits = VT.getVectorElementType().getSizeInBits();
if (!getVShiftImm(Op, ElementBits, Cnt))
return false;
- if (isIntrinsic)
- Cnt = -Cnt;
return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
}
@@ -6261,8 +6554,7 @@ SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
case ISD::SRA:
case ISD::SRL:
// Right shift immediate
- if (isVShiftRImm(Op.getOperand(1), VT, false, false, Cnt) &&
- Cnt < EltSize) {
+ if (isVShiftRImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) {
unsigned Opc =
(Op.getOpcode() == ISD::SRA) ? AArch64ISD::VASHR : AArch64ISD::VLSHR;
return DAG.getNode(Opc, DL, VT, Op.getOperand(0),
@@ -6451,7 +6743,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
case Intrinsic::aarch64_neon_ld4r: {
Info.opc = ISD::INTRINSIC_W_CHAIN;
// Conservatively set memVT to the entire set of vectors loaded.
- uint64_t NumElts = DL.getTypeAllocSize(I.getType()) / 8;
+ uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1);
Info.offset = 0;
@@ -6477,7 +6769,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Type *ArgTy = I.getArgOperand(ArgI)->getType();
if (!ArgTy->isVectorTy())
break;
- NumElts += DL.getTypeAllocSize(ArgTy) / 8;
+ NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
}
Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1);
@@ -6720,10 +7012,10 @@ bool AArch64TargetLowering::lowerInterleavedLoad(
const DataLayout &DL = LI->getModule()->getDataLayout();
VectorType *VecTy = Shuffles[0]->getType();
- unsigned VecSize = DL.getTypeAllocSizeInBits(VecTy);
+ unsigned VecSize = DL.getTypeSizeInBits(VecTy);
- // Skip illegal vector types.
- if (VecSize != 64 && VecSize != 128)
+ // Skip if we do not have NEON and skip illegal vector types.
+ if (!Subtarget->hasNEON() || (VecSize != 64 && VecSize != 128))
return false;
// A pointer vector can not be the return type of the ldN intrinsics. Need to
@@ -6806,10 +7098,10 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
VectorType *SubVecTy = VectorType::get(EltTy, NumSubElts);
const DataLayout &DL = SI->getModule()->getDataLayout();
- unsigned SubVecSize = DL.getTypeAllocSizeInBits(SubVecTy);
+ unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy);
- // Skip illegal vector types.
- if (SubVecSize != 64 && SubVecSize != 128)
+ // Skip if we do not have NEON and skip illegal vector types.
+ if (!Subtarget->hasNEON() || (SubVecSize != 64 && SubVecSize != 128))
return false;
Value *Op0 = SVI->getOperand(0);
@@ -7228,8 +7520,7 @@ static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
const AArch64Subtarget *Subtarget) {
// First try to optimize away the conversion when it's conditionally from
// a constant. Vectors only.
- SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG);
- if (Res != SDValue())
+ if (SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG))
return Res;
EVT VT = N->getValueType(0);
@@ -7242,7 +7533,7 @@ static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
// If the result of an integer load is only used by an integer-to-float
// conversion, use a fp load instead and a AdvSIMD scalar {S|U}CVTF instead.
- // This eliminates an "integer-to-vector-move UOP and improve throughput.
+ // This eliminates an "integer-to-vector-move" UOP and improves throughput.
SDValue N0 = N->getOperand(0);
if (Subtarget->hasNEON() && ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
// Do not change the width of a volatile load.
@@ -7265,6 +7556,134 @@ static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+/// Fold a floating-point multiply by power of two into floating-point to
+/// fixed-point conversion.
+static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG,
+ const AArch64Subtarget *Subtarget) {
+ if (!Subtarget->hasNEON())
+ return SDValue();
+
+ SDValue Op = N->getOperand(0);
+ if (!Op.getValueType().isVector() || Op.getOpcode() != ISD::FMUL)
+ return SDValue();
+
+ SDValue ConstVec = Op->getOperand(1);
+ if (!isa<BuildVectorSDNode>(ConstVec))
+ return SDValue();
+
+ MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
+ uint32_t FloatBits = FloatTy.getSizeInBits();
+ if (FloatBits != 32 && FloatBits != 64)
+ return SDValue();
+
+ MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
+ uint32_t IntBits = IntTy.getSizeInBits();
+ if (IntBits != 16 && IntBits != 32 && IntBits != 64)
+ return SDValue();
+
+ // Avoid conversions where iN is larger than the float (e.g., float -> i64).
+ if (IntBits > FloatBits)
+ return SDValue();
+
+ BitVector UndefElements;
+ BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
+ int32_t Bits = IntBits == 64 ? 64 : 32;
+ int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, Bits + 1);
+ if (C == -1 || C == 0 || C > Bits)
+ return SDValue();
+
+ MVT ResTy;
+ unsigned NumLanes = Op.getValueType().getVectorNumElements();
+ switch (NumLanes) {
+ default:
+ return SDValue();
+ case 2:
+ ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64;
+ break;
+ case 4:
+ ResTy = MVT::v4i32;
+ break;
+ }
+
+ SDLoc DL(N);
+ bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
+ unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfp2fxs
+ : Intrinsic::aarch64_neon_vcvtfp2fxu;
+ SDValue FixConv =
+ DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ResTy,
+ DAG.getConstant(IntrinsicOpcode, DL, MVT::i32),
+ Op->getOperand(0), DAG.getConstant(C, DL, MVT::i32));
+ // We can handle smaller integers by generating an extra trunc.
+ if (IntBits < FloatBits)
+ FixConv = DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), FixConv);
+
+ return FixConv;
+}
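
A scalar model of the fold this combine performs, as a sanity check (illustration only, not part of the patch; floatToFixed is an invented name, and this model rounds the product to float first, whereas the fixed-point convert scales inside the conversion):

    #include <cstdint>

    // (int)(x * 2^C) behaves like a float->fixed conversion with C fractional
    // bits, which is what vcvtfp2fxs/vcvtfp2fxu with immediate #C compute per
    // lane; both the cast and the instruction truncate toward zero.
    static int32_t floatToFixed(float X, unsigned C) { // assumes C < 31
      return static_cast<int32_t>(X * static_cast<float>(1u << C));
    }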
+
+/// Fold a floating-point divide by power of two into fixed-point to
+/// floating-point conversion.
+static SDValue performFDivCombine(SDNode *N, SelectionDAG &DAG,
+ const AArch64Subtarget *Subtarget) {
+ if (!Subtarget->hasNEON())
+ return SDValue();
+
+ SDValue Op = N->getOperand(0);
+ unsigned Opc = Op->getOpcode();
+ if (!Op.getValueType().isVector() ||
+ (Opc != ISD::SINT_TO_FP && Opc != ISD::UINT_TO_FP))
+ return SDValue();
+
+ SDValue ConstVec = N->getOperand(1);
+ if (!isa<BuildVectorSDNode>(ConstVec))
+ return SDValue();
+
+ MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType();
+ int32_t IntBits = IntTy.getSizeInBits();
+ if (IntBits != 16 && IntBits != 32 && IntBits != 64)
+ return SDValue();
+
+ MVT FloatTy = N->getSimpleValueType(0).getVectorElementType();
+ int32_t FloatBits = FloatTy.getSizeInBits();
+ if (FloatBits != 32 && FloatBits != 64)
+ return SDValue();
+
+ // Avoid conversions where iN is larger than the float (e.g., i64 -> float).
+ if (IntBits > FloatBits)
+ return SDValue();
+
+ BitVector UndefElements;
+ BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
+ int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, FloatBits + 1);
+ if (C == -1 || C == 0 || C > FloatBits)
+ return SDValue();
+
+ MVT ResTy;
+ unsigned NumLanes = Op.getValueType().getVectorNumElements();
+ switch (NumLanes) {
+ default:
+ return SDValue();
+ case 2:
+ ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64;
+ break;
+ case 4:
+ ResTy = MVT::v4i32;
+ break;
+ }
+
+ SDLoc DL(N);
+ SDValue ConvInput = Op.getOperand(0);
+ bool IsSigned = Opc == ISD::SINT_TO_FP;
+ if (IntBits < FloatBits)
+ ConvInput = DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
+ ResTy, ConvInput);
+
+ unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfxs2fp
+ : Intrinsic::aarch64_neon_vcvtfxu2fp;
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
+ DAG.getConstant(IntrinsicOpcode, DL, MVT::i32), ConvInput,
+ DAG.getConstant(C, DL, MVT::i32));
+}
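
The mirror image for the FDIV fold, again as a hedged scalar sketch (fixedToFloat is an invented name):

    #include <cstdint>

    // (float)n / 2^C behaves like a fixed->float conversion of n with C
    // fractional bits, i.e. vcvtfxs2fp/vcvtfxu2fp with immediate #C.
    static float fixedToFloat(int32_t N, unsigned C) { // assumes C < 31
      return static_cast<float>(N) / static_cast<float>(1u << C);
    }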
+
/// An EXTR instruction is made up of two shifts, ORed together. This helper
/// searches for and classifies those shifts.
static bool findEXTRHalf(SDValue N, SDValue &Src, uint32_t &ShiftAmount,
@@ -7964,7 +8383,6 @@ static SDValue performIntrinsicCombine(SDNode *N,
case Intrinsic::aarch64_neon_vcvtfxs2fp:
case Intrinsic::aarch64_neon_vcvtfxu2fp:
return tryCombineFixedPointConvert(N, DCI, DAG);
- break;
case Intrinsic::aarch64_neon_saddv:
return combineAcrossLanesIntrinsic(AArch64ISD::SADDV, N, DAG);
case Intrinsic::aarch64_neon_uaddv:
@@ -7978,10 +8396,16 @@ static SDValue performIntrinsicCombine(SDNode *N,
case Intrinsic::aarch64_neon_umaxv:
return combineAcrossLanesIntrinsic(AArch64ISD::UMAXV, N, DAG);
case Intrinsic::aarch64_neon_fmax:
- return DAG.getNode(AArch64ISD::FMAX, SDLoc(N), N->getValueType(0),
+ return DAG.getNode(ISD::FMAXNAN, SDLoc(N), N->getValueType(0),
N->getOperand(1), N->getOperand(2));
case Intrinsic::aarch64_neon_fmin:
- return DAG.getNode(AArch64ISD::FMIN, SDLoc(N), N->getValueType(0),
+ return DAG.getNode(ISD::FMINNAN, SDLoc(N), N->getValueType(0),
+ N->getOperand(1), N->getOperand(2));
+ case Intrinsic::aarch64_neon_fmaxnm:
+ return DAG.getNode(ISD::FMAXNUM, SDLoc(N), N->getValueType(0),
+ N->getOperand(1), N->getOperand(2));
+ case Intrinsic::aarch64_neon_fminnm:
+ return DAG.getNode(ISD::FMINNUM, SDLoc(N), N->getValueType(0),
N->getOperand(1), N->getOperand(2));
case Intrinsic::aarch64_neon_smull:
case Intrinsic::aarch64_neon_umull:
@@ -8141,7 +8565,7 @@ static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode *St) {
unsigned Alignment = std::min(OrigAlignment, EltOffset);
// Create scalar stores. This is at least as good as the code sequence for a
- // split unaligned store wich is a dup.s, ext.b, and two stores.
+ // split unaligned store which is a dup.s, ext.b, and two stores.
// Most of the time the three stores should be replaced by store pair
// instructions (stp).
SDLoc DL(St);
@@ -8162,10 +8586,9 @@ static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode *St) {
return NewST1;
}
-static SDValue performSTORECombine(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI,
- SelectionDAG &DAG,
- const AArch64Subtarget *Subtarget) {
+static SDValue split16BStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
+ SelectionDAG &DAG,
+ const AArch64Subtarget *Subtarget) {
if (!DCI.isBeforeLegalize())
return SDValue();
@@ -8173,15 +8596,17 @@ static SDValue performSTORECombine(SDNode *N,
if (S->isVolatile())
return SDValue();
+ // FIXME: The logic for deciding if an unaligned store should be split should
+ // be included in TLI.allowsMisalignedMemoryAccesses(), and there should be
+ // a call to that function here.
+
// Cyclone has bad performance on unaligned 16B stores when crossing line and
// page boundaries. We want to split such stores.
if (!Subtarget->isCyclone())
return SDValue();
- // Don't split at Oz.
- MachineFunction &MF = DAG.getMachineFunction();
- bool IsMinSize = MF.getFunction()->hasFnAttribute(Attribute::MinSize);
- if (IsMinSize)
+ // Don't split at -Oz.
+ if (DAG.getMachineFunction().getFunction()->optForMinSize())
return SDValue();
SDValue StVal = S->getValue();
@@ -8204,8 +8629,7 @@ static SDValue performSTORECombine(SDNode *N,
// If we get a splat of a scalar convert this vector store to a store of
// scalars. They will be merged into store pairs thereby removing two
// instructions.
- SDValue ReplacedSplat = replaceSplatVectorStore(DAG, S);
- if (ReplacedSplat != SDValue())
+ if (SDValue ReplacedSplat = replaceSplatVectorStore(DAG, S))
return ReplacedSplat;
SDLoc DL(S);
@@ -8326,6 +8750,299 @@ static SDValue performPostLD1Combine(SDNode *N,
return SDValue();
}
+/// Simplify \p Addr given that the top byte of it is ignored by HW during
+/// address translation.
+static bool performTBISimplification(SDValue Addr,
+ TargetLowering::DAGCombinerInfo &DCI,
+ SelectionDAG &DAG) {
+ APInt DemandedMask = APInt::getLowBitsSet(64, 56);
+ APInt KnownZero, KnownOne;
+ TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(),
+ DCI.isBeforeLegalizeOps());
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (TLI.SimplifyDemandedBits(Addr, DemandedMask, KnownZero, KnownOne, TLO)) {
+ DCI.CommitTargetLoweringOpt(TLO);
+ return true;
+ }
+ return false;
+}
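
For reference, a sketch (not patch code) of the demanded-bits mask the helper above hands to SimplifyDemandedBits: with Top Byte Ignore only the low 56 address bits take part in translation, so anything that feeds only bits 63..56 of the address can be discarded.

    #include <cstdint>

    constexpr uint64_t TBIDemandedMask = (1ULL << 56) - 1; // low 56 bits set
    static_assert(TBIDemandedMask == 0x00FFFFFFFFFFFFFFULL,
                  "APInt::getLowBitsSet(64, 56) builds this mask");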
+
+static SDValue performSTORECombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ SelectionDAG &DAG,
+ const AArch64Subtarget *Subtarget) {
+ SDValue Split = split16BStores(N, DCI, DAG, Subtarget);
+ if (Split.getNode())
+ return Split;
+
+ if (Subtarget->supportsAddressTopByteIgnored() &&
+ performTBISimplification(N->getOperand(2), DCI, DAG))
+ return SDValue(N, 0);
+
+ return SDValue();
+}
+
+/// This function handles the log2-shuffle pattern produced by the
+/// LoopVectorizer for the across vector reduction. It consists of
+/// log2(NumVectorElements) steps and, in each step, 2^(s) elements
+/// are reduced, where s is an induction variable from 0 to
+/// log2(NumVectorElements).
+static SDValue tryMatchAcrossLaneShuffleForReduction(SDNode *N, SDValue OpV,
+ unsigned Op,
+ SelectionDAG &DAG) {
+ EVT VTy = OpV->getOperand(0).getValueType();
+ if (!VTy.isVector())
+ return SDValue();
+
+ int NumVecElts = VTy.getVectorNumElements();
+ if (Op == ISD::FMAXNUM || Op == ISD::FMINNUM) {
+ if (NumVecElts != 4)
+ return SDValue();
+ } else {
+ if (NumVecElts != 4 && NumVecElts != 8 && NumVecElts != 16)
+ return SDValue();
+ }
+
+ int NumExpectedSteps = APInt(8, NumVecElts).logBase2();
+ SDValue PreOp = OpV;
+ // Iterate over each step of the across vector reduction.
+ for (int CurStep = 0; CurStep != NumExpectedSteps; ++CurStep) {
+ SDValue CurOp = PreOp.getOperand(0);
+ SDValue Shuffle = PreOp.getOperand(1);
+ if (Shuffle.getOpcode() != ISD::VECTOR_SHUFFLE) {
+ // Try to swap the 1st and 2nd operand as add and min/max instructions
+ // are commutative.
+ CurOp = PreOp.getOperand(1);
+ Shuffle = PreOp.getOperand(0);
+ if (Shuffle.getOpcode() != ISD::VECTOR_SHUFFLE)
+ return SDValue();
+ }
+
+ // Check if the input vector is fed by the operator we want to handle,
+ // except the last step; the very first input vector is not necessarily
+ // the same operator we are handling.
+ if (CurOp.getOpcode() != Op && (CurStep != (NumExpectedSteps - 1)))
+ return SDValue();
+
+ // Check if it forms one step of the across vector reduction.
+ // E.g.,
+ // %cur = add %1, %0
+ // %shuffle = vector_shuffle %cur, <2, 3, u, u>
+ // %pre = add %cur, %shuffle
+ if (Shuffle.getOperand(0) != CurOp)
+ return SDValue();
+
+ int NumMaskElts = 1 << CurStep;
+ ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Shuffle)->getMask();
+ // Check mask values in each step.
+ // We expect the shuffle mask in each step follows a specific pattern
+ // denoted here by the <M, U> form, where M is a sequence of integers
+ // starting from NumMaskElts, increasing by 1, and the number of integers
+ // in M should be NumMaskElts. U is a sequence of UNDEFs and the number
+ // of undefs in U should be NumVecElts - NumMaskElts.
+ // E.g., for <8 x i16>, mask values in each step should be:
+ // step 0 : <1,u,u,u,u,u,u,u>
+ // step 1 : <2,3,u,u,u,u,u,u>
+ // step 2 : <4,5,6,7,u,u,u,u>
+ for (int i = 0; i < NumVecElts; ++i)
+ if ((i < NumMaskElts && Mask[i] != (NumMaskElts + i)) ||
+ (i >= NumMaskElts && !(Mask[i] < 0)))
+ return SDValue();
+
+ PreOp = CurOp;
+ }
+ unsigned Opcode;
+ bool IsIntrinsic = false;
+
+ switch (Op) {
+ default:
+ llvm_unreachable("Unexpected operator for across vector reduction");
+ case ISD::ADD:
+ Opcode = AArch64ISD::UADDV;
+ break;
+ case ISD::SMAX:
+ Opcode = AArch64ISD::SMAXV;
+ break;
+ case ISD::UMAX:
+ Opcode = AArch64ISD::UMAXV;
+ break;
+ case ISD::SMIN:
+ Opcode = AArch64ISD::SMINV;
+ break;
+ case ISD::UMIN:
+ Opcode = AArch64ISD::UMINV;
+ break;
+ case ISD::FMAXNUM:
+ Opcode = Intrinsic::aarch64_neon_fmaxnmv;
+ IsIntrinsic = true;
+ break;
+ case ISD::FMINNUM:
+ Opcode = Intrinsic::aarch64_neon_fminnmv;
+ IsIntrinsic = true;
+ break;
+ }
+ SDLoc DL(N);
+
+ return IsIntrinsic
+ ? DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, N->getValueType(0),
+ DAG.getConstant(Opcode, DL, MVT::i32), PreOp)
+ : DAG.getNode(
+ ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0),
+ DAG.getNode(Opcode, DL, PreOp.getSimpleValueType(), PreOp),
+ DAG.getConstant(0, DL, MVT::i64));
+}
+
+/// Target-specific DAG combine for the across vector min/max reductions.
+/// This function specifically handles the final clean-up step of the vector
+/// min/max reductions produced by the LoopVectorizer. It is the log2-shuffle
+/// pattern, which narrows down and finds the final min/max value from all
+/// elements of the vector.
+/// For example, for a <16 x i8> vector :
+/// %svn0 = vector_shuffle %0, undef<8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u>
+/// %smax0 = smax %0, %svn0
+/// %svn1 = vector_shuffle %smax0, undef<4,5,6,7,u,u,u,u,u,u,u,u,u,u,u,u>
+/// %smax1 = smax %smax0, %svn1
+/// %svn2 = vector_shuffle %smax1, undef<2,3,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+/// %smax2 = smax %smax1, %svn2
+/// %svn3 = vector_shuffle %smax2, undef<1,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+/// %sc = setcc %smax2, %svn3, gt
+/// %n0 = extract_vector_elt %sc, #0
+/// %n1 = extract_vector_elt %smax2, #0
+/// %n2 = extract_vector_elt %smax2, #1
+/// %result = select %n0, %n1, %n2
+/// becomes :
+/// %1 = smaxv %0
+/// %result = extract_vector_elt %1, 0
+static SDValue
+performAcrossLaneMinMaxReductionCombine(SDNode *N, SelectionDAG &DAG,
+ const AArch64Subtarget *Subtarget) {
+ if (!Subtarget->hasNEON())
+ return SDValue();
+
+ SDValue N0 = N->getOperand(0);
+ SDValue IfTrue = N->getOperand(1);
+ SDValue IfFalse = N->getOperand(2);
+
+ // Check if the SELECT merges up the final result of the min/max
+ // from a vector.
+ if (N0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ IfTrue.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ IfFalse.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+ return SDValue();
+
+ // Expect N0 is fed by SETCC.
+ SDValue SetCC = N0.getOperand(0);
+ EVT SetCCVT = SetCC.getValueType();
+ if (SetCC.getOpcode() != ISD::SETCC || !SetCCVT.isVector() ||
+ SetCCVT.getVectorElementType() != MVT::i1)
+ return SDValue();
+
+ SDValue VectorOp = SetCC.getOperand(0);
+ unsigned Op = VectorOp->getOpcode();
+ // Check if the input vector is fed by the operator we want to handle.
+ if (Op != ISD::SMAX && Op != ISD::UMAX && Op != ISD::SMIN &&
+ Op != ISD::UMIN && Op != ISD::FMAXNUM && Op != ISD::FMINNUM)
+ return SDValue();
+
+ EVT VTy = VectorOp.getValueType();
+ if (!VTy.isVector())
+ return SDValue();
+
+ if (VTy.getSizeInBits() < 64)
+ return SDValue();
+
+ EVT EltTy = VTy.getVectorElementType();
+ if (Op == ISD::FMAXNUM || Op == ISD::FMINNUM) {
+ if (EltTy != MVT::f32)
+ return SDValue();
+ } else {
+ if (EltTy != MVT::i32 && EltTy != MVT::i16 && EltTy != MVT::i8)
+ return SDValue();
+ }
+
+ // Check if extracting from the same vector.
+ // For example,
+ // %sc = setcc %vector, %svn1, gt
+ // %n0 = extract_vector_elt %sc, #0
+ // %n1 = extract_vector_elt %vector, #0
+ // %n2 = extract_vector_elt %vector, #1
+ if (!(VectorOp == IfTrue->getOperand(0) &&
+ VectorOp == IfFalse->getOperand(0)))
+ return SDValue();
+
+ // Check if the condition code is matched with the operator type.
+ ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
+ if ((Op == ISD::SMAX && CC != ISD::SETGT && CC != ISD::SETGE) ||
+ (Op == ISD::UMAX && CC != ISD::SETUGT && CC != ISD::SETUGE) ||
+ (Op == ISD::SMIN && CC != ISD::SETLT && CC != ISD::SETLE) ||
+ (Op == ISD::UMIN && CC != ISD::SETULT && CC != ISD::SETULE) ||
+ (Op == ISD::FMAXNUM && CC != ISD::SETOGT && CC != ISD::SETOGE &&
+ CC != ISD::SETUGT && CC != ISD::SETUGE && CC != ISD::SETGT &&
+ CC != ISD::SETGE) ||
+ (Op == ISD::FMINNUM && CC != ISD::SETOLT && CC != ISD::SETOLE &&
+ CC != ISD::SETULT && CC != ISD::SETULE && CC != ISD::SETLT &&
+ CC != ISD::SETLE))
+ return SDValue();
+
+ // Expect to check only lane 0 from the vector SETCC.
+ if (!isNullConstant(N0.getOperand(1)))
+ return SDValue();
+
+ // Expect to extract the true value from lane 0.
+ if (!isNullConstant(IfTrue.getOperand(1)))
+ return SDValue();
+
+ // Expect to extract the false value from lane 1.
+ if (!isOneConstant(IfFalse.getOperand(1)))
+ return SDValue();
+
+ return tryMatchAcrossLaneShuffleForReduction(N, SetCC, Op, DAG);
+}
+
+/// Target-specific DAG combine for the across vector add reduction.
+/// This function specifically handles the final clean-up step of the vector
+/// add reduction produced by the LoopVectorizer. It is the log2-shuffle
+/// pattern, which adds all elements of a vector together.
+/// For example, for a <4 x i32> vector :
+/// %1 = vector_shuffle %0, <2,3,u,u>
+/// %2 = add %0, %1
+/// %3 = vector_shuffle %2, <1,u,u,u>
+/// %4 = add %2, %3
+/// %result = extract_vector_elt %4, 0
+/// becomes :
+/// %0 = uaddv %0
+/// %result = extract_vector_elt %0, 0
+static SDValue
+performAcrossLaneAddReductionCombine(SDNode *N, SelectionDAG &DAG,
+ const AArch64Subtarget *Subtarget) {
+ if (!Subtarget->hasNEON())
+ return SDValue();
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+
+ // Check if the input vector is fed by the ADD.
+ if (N0->getOpcode() != ISD::ADD)
+ return SDValue();
+
+ // The vector extract index must be constant zero because we only expect the
+ // final result of the reduction to be placed in lane 0.
+ if (!isNullConstant(N1))
+ return SDValue();
+
+ EVT VTy = N0.getValueType();
+ if (!VTy.isVector())
+ return SDValue();
+
+ EVT EltTy = VTy.getVectorElementType();
+ if (EltTy != MVT::i32 && EltTy != MVT::i16 && EltTy != MVT::i8)
+ return SDValue();
+
+ if (VTy.getSizeInBits() < 64)
+ return SDValue();
+
+ return tryMatchAcrossLaneShuffleForReduction(N, N0, ISD::ADD, DAG);
+}
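
A scalar equivalent of the <4 x i32> pattern in the comment above, to make the matched shuffle steps concrete (illustration only; addv_v4i32 is an invented name):

    #include <cstdint>

    static int32_t addv_v4i32(const int32_t V[4]) {
      // step 0: vector_shuffle <2,3,u,u> + add
      int32_t S0 = V[0] + V[2], S1 = V[1] + V[3];
      // step 1: vector_shuffle <1,u,u,u> + add; lane 0 now holds the total,
      // which is what a single ADDV/UADDV computes directly.
      return S0 + S1;
    }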
+
/// Target-specific DAG combine function for NEON load/store intrinsics
/// to merge base address updates.
static SDValue performNEONPostLDSTCombine(SDNode *N,
@@ -8751,10 +9468,10 @@ static SDValue performBRCONDCombine(SDNode *N,
if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
return SDValue();
- if (isa<ConstantSDNode>(LHS) && cast<ConstantSDNode>(LHS)->isNullValue())
+ if (isNullConstant(LHS))
std::swap(LHS, RHS);
- if (!isa<ConstantSDNode>(RHS) || !cast<ConstantSDNode>(RHS)->isNullValue())
+ if (!isNullConstant(RHS))
return SDValue();
if (LHS.getOpcode() == ISD::SHL || LHS.getOpcode() == ISD::SRA ||
@@ -8868,75 +9585,6 @@ static SDValue performSelectCombine(SDNode *N,
return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2));
}
-/// performSelectCCCombine - Target-specific DAG combining for ISD::SELECT_CC
-/// to match FMIN/FMAX patterns.
-static SDValue performSelectCCCombine(SDNode *N, SelectionDAG &DAG) {
- // Try to use FMIN/FMAX instructions for FP selects like "x < y ? x : y".
- // Unless the NoNaNsFPMath option is set, be careful about NaNs:
- // vmax/vmin return NaN if either operand is a NaN;
- // only do the transformation when it matches that behavior.
-
- SDValue CondLHS = N->getOperand(0);
- SDValue CondRHS = N->getOperand(1);
- SDValue LHS = N->getOperand(2);
- SDValue RHS = N->getOperand(3);
- ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(4))->get();
-
- unsigned Opcode;
- bool IsReversed;
- if (selectCCOpsAreFMaxCompatible(CondLHS, LHS) &&
- selectCCOpsAreFMaxCompatible(CondRHS, RHS)) {
- IsReversed = false; // x CC y ? x : y
- } else if (selectCCOpsAreFMaxCompatible(CondRHS, LHS) &&
- selectCCOpsAreFMaxCompatible(CondLHS, RHS)) {
- IsReversed = true ; // x CC y ? y : x
- } else {
- return SDValue();
- }
-
- bool IsUnordered = false, IsOrEqual;
- switch (CC) {
- default:
- return SDValue();
- case ISD::SETULT:
- case ISD::SETULE:
- IsUnordered = true;
- case ISD::SETOLT:
- case ISD::SETOLE:
- case ISD::SETLT:
- case ISD::SETLE:
- IsOrEqual = (CC == ISD::SETLE || CC == ISD::SETOLE || CC == ISD::SETULE);
- Opcode = IsReversed ? AArch64ISD::FMAX : AArch64ISD::FMIN;
- break;
-
- case ISD::SETUGT:
- case ISD::SETUGE:
- IsUnordered = true;
- case ISD::SETOGT:
- case ISD::SETOGE:
- case ISD::SETGT:
- case ISD::SETGE:
- IsOrEqual = (CC == ISD::SETGE || CC == ISD::SETOGE || CC == ISD::SETUGE);
- Opcode = IsReversed ? AArch64ISD::FMIN : AArch64ISD::FMAX;
- break;
- }
-
- // If LHS is NaN, an ordered comparison will be false and the result will be
- // the RHS, but FMIN(NaN, RHS) = FMAX(NaN, RHS) = NaN. Avoid this by checking
- // that LHS != NaN. Likewise, for unordered comparisons, check for RHS != NaN.
- if (!DAG.isKnownNeverNaN(IsUnordered ? RHS : LHS))
- return SDValue();
-
- // For xxx-or-equal comparisons, "+0 <= -0" and "-0 >= +0" will both be true,
- // but FMIN will return -0, and FMAX will return +0. So FMIN/FMAX can only be
- // used for unsafe math or if one of the operands is known to be nonzero.
- if (IsOrEqual && !DAG.getTarget().Options.UnsafeFPMath &&
- !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
- return SDValue();
-
- return DAG.getNode(Opcode, SDLoc(N), N->getValueType(0), LHS, RHS);
-}
-
/// Get rid of unnecessary NVCASTs (that don't change the type).
static SDValue performNVCASTCombine(SDNode *N) {
if (N->getValueType(0) == N->getOperand(0).getValueType())
@@ -8961,6 +9609,11 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::SINT_TO_FP:
case ISD::UINT_TO_FP:
return performIntToFpCombine(N, DAG, Subtarget);
+ case ISD::FP_TO_SINT:
+ case ISD::FP_TO_UINT:
+ return performFpToIntCombine(N, DAG, Subtarget);
+ case ISD::FDIV:
+ return performFDivCombine(N, DAG, Subtarget);
case ISD::OR:
return performORCombine(N, DCI, Subtarget);
case ISD::INTRINSIC_WO_CHAIN:
@@ -8973,12 +9626,18 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
return performBitcastCombine(N, DCI, DAG);
case ISD::CONCAT_VECTORS:
return performConcatVectorsCombine(N, DCI, DAG);
- case ISD::SELECT:
- return performSelectCombine(N, DCI);
+ case ISD::SELECT: {
+ SDValue RV = performSelectCombine(N, DCI);
+ if (!RV.getNode())
+ RV = performAcrossLaneMinMaxReductionCombine(N, DAG, Subtarget);
+ return RV;
+ }
case ISD::VSELECT:
return performVSelectCombine(N, DCI.DAG);
- case ISD::SELECT_CC:
- return performSelectCCCombine(N, DCI.DAG);
+ case ISD::LOAD:
+ if (performTBISimplification(N->getOperand(1), DCI, DAG))
+ return SDValue(N, 0);
+ break;
case ISD::STORE:
return performSTORECombine(N, DCI, DAG, Subtarget);
case AArch64ISD::BRCOND:
@@ -8991,6 +9650,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
return performNVCASTCombine(N);
case ISD::INSERT_VECTOR_ELT:
return performPostLD1Combine(N, DCI, true);
+ case ISD::EXTRACT_VECTOR_ELT:
+ return performAcrossLaneAddReductionCombine(N, DAG, Subtarget);
case ISD::INTRINSIC_VOID:
case ISD::INTRINSIC_W_CHAIN:
switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
@@ -9157,6 +9818,20 @@ static void ReplaceBITCASTResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Op));
}
+static void ReplaceReductionResults(SDNode *N,
+ SmallVectorImpl<SDValue> &Results,
+ SelectionDAG &DAG, unsigned InterOp,
+ unsigned AcrossOp) {
+ EVT LoVT, HiVT;
+ SDValue Lo, Hi;
+ SDLoc dl(N);
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
+ std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
+ SDValue InterVal = DAG.getNode(InterOp, dl, LoVT, Lo, Hi);
+ SDValue SplitVal = DAG.getNode(AcrossOp, dl, LoVT, InterVal);
+ Results.push_back(SplitVal);
+}
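
A hedged sketch of the splitting strategy above, for a reduction whose operand type is too wide to be legal (assumed example: an unsigned add reduction over 8 lanes when only 4-lane vectors are legal; the helper name is invented):

    #include <cstdint>

    static uint32_t uaddvOverSplit(const uint32_t V[8]) {
      uint32_t Half[4];
      for (int I = 0; I < 4; ++I)
        Half[I] = V[I] + V[I + 4];                  // InterOp: lane-wise ADD of Lo/Hi
      return Half[0] + Half[1] + Half[2] + Half[3]; // AcrossOp: UADDV on the legal half
    }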
+
void AArch64TargetLowering::ReplaceNodeResults(
SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
switch (N->getOpcode()) {
@@ -9165,6 +9840,24 @@ void AArch64TargetLowering::ReplaceNodeResults(
case ISD::BITCAST:
ReplaceBITCASTResults(N, Results, DAG);
return;
+ case AArch64ISD::SADDV:
+ ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::SADDV);
+ return;
+ case AArch64ISD::UADDV:
+ ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::UADDV);
+ return;
+ case AArch64ISD::SMINV:
+ ReplaceReductionResults(N, Results, DAG, ISD::SMIN, AArch64ISD::SMINV);
+ return;
+ case AArch64ISD::UMINV:
+ ReplaceReductionResults(N, Results, DAG, ISD::UMIN, AArch64ISD::UMINV);
+ return;
+ case AArch64ISD::SMAXV:
+ ReplaceReductionResults(N, Results, DAG, ISD::SMAX, AArch64ISD::SMAXV);
+ return;
+ case AArch64ISD::UMAXV:
+ ReplaceReductionResults(N, Results, DAG, ISD::UMAX, AArch64ISD::UMAXV);
+ return;
case ISD::FP_TO_UINT:
case ISD::FP_TO_SINT:
assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion");
@@ -9177,10 +9870,10 @@ bool AArch64TargetLowering::useLoadStackGuardNode() const {
return true;
}
-bool AArch64TargetLowering::combineRepeatedFPDivisors(unsigned NumUsers) const {
+unsigned AArch64TargetLowering::combineRepeatedFPDivisors() const {
// Combine multiple FDIVs with the same divisor into multiple FMULs by the
// reciprocal if there are three or more FDIVs.
- return NumUsers > 2;
+ return 3;
}
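
Shape of the transformation the new threshold controls, as a hedged sketch (the DAG combiner only performs this when fast-math permits the reassociation; divideAll is an invented name): with three or more divisions by the same value, one shared reciprocal plus per-use multiplies beats repeated fdivs.

    static void divideAll(float *Out, const float *In, unsigned N, float D) {
      float R = 1.0f / D;           // one division, shared by every user
      for (unsigned I = 0; I != N; ++I)
        Out[I] = In[I] * R;         // each In[I] / D becomes a multiply
    }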
TargetLoweringBase::LegalizeTypeAction
@@ -9206,20 +9899,21 @@ bool AArch64TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
// Loads and stores less than 128-bits are already atomic; ones above that
// are doomed anyway, so defer to the default libcall and blame the OS when
// things go wrong.
-bool AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
+TargetLowering::AtomicExpansionKind
+AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
unsigned Size = LI->getType()->getPrimitiveSizeInBits();
- return Size == 128;
+ return Size == 128 ? AtomicExpansionKind::LLSC : AtomicExpansionKind::None;
}
// For the real atomic operations, we have ldxr/stxr up to 128 bits,
-TargetLoweringBase::AtomicRMWExpansionKind
+TargetLowering::AtomicExpansionKind
AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
unsigned Size = AI->getType()->getPrimitiveSizeInBits();
- return Size <= 128 ? AtomicRMWExpansionKind::LLSC
- : AtomicRMWExpansionKind::None;
+ return Size <= 128 ? AtomicExpansionKind::LLSC : AtomicExpansionKind::None;
}
-bool AArch64TargetLowering::hasLoadLinkedStoreConditional() const {
+bool AArch64TargetLowering::shouldExpandAtomicCmpXchgInIR(
+ AtomicCmpXchgInst *AI) const {
return true;
}
@@ -9258,6 +9952,13 @@ Value *AArch64TargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
cast<PointerType>(Addr->getType())->getElementType());
}
+void AArch64TargetLowering::emitAtomicCmpXchgNoStoreLLBalance(
+ IRBuilder<> &Builder) const {
+ Module *M = Builder.GetInsertBlock()->getParent()->getParent();
+ Builder.CreateCall(
+ llvm::Intrinsic::getDeclaration(M, Intrinsic::aarch64_clrex));
+}
+
Value *AArch64TargetLowering::emitStoreConditional(IRBuilder<> &Builder,
Value *Val, Value *Addr,
AtomicOrdering Ord) const {
@@ -9294,3 +9995,70 @@ bool AArch64TargetLowering::functionArgumentNeedsConsecutiveRegisters(
Type *Ty, CallingConv::ID CallConv, bool isVarArg) const {
return Ty->isArrayTy();
}
+
+bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &,
+ EVT) const {
+ return false;
+}
+
+Value *AArch64TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
+ if (!Subtarget->isTargetAndroid())
+ return TargetLowering::getSafeStackPointerLocation(IRB);
+
+ // Android provides a fixed TLS slot for the SafeStack pointer. See the
+ // definition of TLS_SLOT_SAFESTACK in
+ // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
+ const unsigned TlsOffset = 0x48;
+ Module *M = IRB.GetInsertBlock()->getParent()->getParent();
+ Function *ThreadPointerFunc =
+ Intrinsic::getDeclaration(M, Intrinsic::aarch64_thread_pointer);
+ return IRB.CreatePointerCast(
+ IRB.CreateConstGEP1_32(IRB.CreateCall(ThreadPointerFunc), TlsOffset),
+ Type::getInt8PtrTy(IRB.getContext())->getPointerTo(0));
+}
+
+void AArch64TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
+ // Update IsSplitCSR in AArch64FunctionInfo.
+ AArch64FunctionInfo *AFI = Entry->getParent()->getInfo<AArch64FunctionInfo>();
+ AFI->setIsSplitCSR(true);
+}
+
+void AArch64TargetLowering::insertCopiesSplitCSR(
+ MachineBasicBlock *Entry,
+ const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
+ const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
+ const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
+ if (!IStart)
+ return;
+
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+ MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
+ for (const MCPhysReg *I = IStart; *I; ++I) {
+ const TargetRegisterClass *RC = nullptr;
+ if (AArch64::GPR64RegClass.contains(*I))
+ RC = &AArch64::GPR64RegClass;
+ else if (AArch64::FPR64RegClass.contains(*I))
+ RC = &AArch64::FPR64RegClass;
+ else
+ llvm_unreachable("Unexpected register class in CSRsViaCopy!");
+
+ unsigned NewVR = MRI->createVirtualRegister(RC);
+ // Create copy from CSR to a virtual register.
+ // FIXME: this currently does not emit CFI pseudo-instructions, it works
+ // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
+ // nounwind. If we want to generalize this later, we may need to emit
+ // CFI pseudo-instructions.
+ assert(Entry->getParent()->getFunction()->hasFnAttribute(
+ Attribute::NoUnwind) &&
+ "Function should be nounwind in insertCopiesSplitCSR!");
+ Entry->addLiveIn(*I);
+ BuildMI(*Entry, Entry->begin(), DebugLoc(), TII->get(TargetOpcode::COPY),
+ NewVR)
+ .addReg(*I);
+
+ for (auto *Exit : Exits)
+ BuildMI(*Exit, Exit->begin(), DebugLoc(), TII->get(TargetOpcode::COPY),
+ *I)
+ .addReg(NewVR);
+ }
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index c73ce1e54b3e..e99616c94068 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -15,6 +15,7 @@
#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64ISELLOWERING_H
#define LLVM_LIB_TARGET_AARCH64_AARCH64ISELLOWERING_H
+#include "AArch64.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/IR/CallingConv.h"
@@ -58,13 +59,14 @@ enum NodeType : unsigned {
SBCS,
ANDS,
+ // Conditional compares. Operands: left,right,falsecc,cc,flags
+ CCMP,
+ CCMN,
+ FCCMP,
+
// Floating point comparison
FCMP,
- // Floating point max and min instructions.
- FMAX,
- FMIN,
-
// Scalar extract
EXTR,
@@ -217,8 +219,6 @@ class AArch64Subtarget;
class AArch64TargetMachine;
class AArch64TargetLowering : public TargetLowering {
- bool RequireStrictAlign;
-
public:
explicit AArch64TargetLowering(const TargetMachine &TM,
const AArch64Subtarget &STI);
@@ -226,46 +226,35 @@ public:
/// Selects the correct CCAssignFn for a given CallingConvention value.
CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg) const;
- /// computeKnownBitsForTargetNode - Determine which of the bits specified in
- /// Mask are known to be either zero or one and return them in the
- /// KnownZero/KnownOne bitsets.
+ /// Determine which of the bits specified in Mask are known to be either zero
+ /// or one and return them in the KnownZero/KnownOne bitsets.
void computeKnownBitsForTargetNode(const SDValue Op, APInt &KnownZero,
APInt &KnownOne, const SelectionDAG &DAG,
unsigned Depth = 0) const override;
MVT getScalarShiftAmountTy(const DataLayout &DL, EVT) const override;
- /// allowsMisalignedMemoryAccesses - Returns true if the target allows
- /// unaligned memory accesses of the specified type.
+ /// Returns true if the target allows unaligned memory accesses of the
+ /// specified type.
bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace = 0,
unsigned Align = 1,
- bool *Fast = nullptr) const override {
- if (RequireStrictAlign)
- return false;
- // FIXME: True for Cyclone, but not necessary others.
- if (Fast)
- *Fast = true;
- return true;
- }
+ bool *Fast = nullptr) const override;
- /// LowerOperation - Provide custom lowering hooks for some operations.
+ /// Provide custom lowering hooks for some operations.
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
const char *getTargetNodeName(unsigned Opcode) const override;
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
- /// getFunctionAlignment - Return the Log2 alignment of this function.
- unsigned getFunctionAlignment(const Function *F) const;
-
/// Returns true if a cast between SrcAS and DestAS is a noop.
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override {
// Addrspacecasts are always noops.
return true;
}
- /// createFastISel - This method returns a target specific FastISel object,
- /// or null if the target does not support "fast" ISel.
+ /// This method returns a target specific FastISel object, or null if the
+ /// target does not support "fast" ISel.
FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
const TargetLibraryInfo *libInfo) const override;
@@ -273,11 +262,11 @@ public:
bool isFPImmLegal(const APFloat &Imm, EVT VT) const override;
- /// isShuffleMaskLegal - Return true if the given shuffle mask can be
- /// codegen'd directly, or if it should be stack expanded.
+ /// Return true if the given shuffle mask can be codegen'd directly, or if it
+ /// should be stack expanded.
bool isShuffleMaskLegal(const SmallVectorImpl<int> &M, EVT VT) const override;
- /// getSetCCResultType - Return the ISD::SETCC ValueType
+ /// Return the ISD::SETCC ValueType.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
EVT VT) const override;
@@ -322,8 +311,8 @@ public:
bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc,
MachineFunction &MF) const override;
- /// isLegalAddressingMode - Return true if the addressing mode represented
- /// by AM is legal for this target, for a load/store of the specified type.
+ /// Return true if the addressing mode represented by AM is legal for this
+ /// target, for a load/store of the specified type.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty,
unsigned AS) const override;
@@ -335,10 +324,9 @@ public:
int getScalingFactorCost(const DataLayout &DL, const AddrMode &AM, Type *Ty,
unsigned AS) const override;
- /// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster
- /// than a pair of fmul and fadd instructions. fmuladd intrinsics will be
- /// expanded to FMAs when this method returns true, otherwise fmuladd is
- /// expanded to fmul + fadd.
+ /// Return true if an FMA operation is faster than a pair of fmul and fadd
+ /// instructions. fmuladd intrinsics will be expanded to FMAs when this method
+ /// returns true, otherwise fmuladd is expanded to fmul + fadd.
bool isFMAFasterThanFMulAndFAdd(EVT VT) const override;
const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const override;
@@ -351,25 +339,65 @@ public:
bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
Type *Ty) const override;
- bool hasLoadLinkedStoreConditional() const override;
Value *emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
AtomicOrdering Ord) const override;
Value *emitStoreConditional(IRBuilder<> &Builder, Value *Val,
Value *Addr, AtomicOrdering Ord) const override;
- bool shouldExpandAtomicLoadInIR(LoadInst *LI) const override;
+ void emitAtomicCmpXchgNoStoreLLBalance(IRBuilder<> &Builder) const override;
+
+ TargetLoweringBase::AtomicExpansionKind
+ shouldExpandAtomicLoadInIR(LoadInst *LI) const override;
bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override;
- TargetLoweringBase::AtomicRMWExpansionKind
+ TargetLoweringBase::AtomicExpansionKind
shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override;
+ bool shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override;
+
bool useLoadStackGuardNode() const override;
TargetLoweringBase::LegalizeTypeAction
getPreferredVectorAction(EVT VT) const override;
+ /// If the target has a standard location for the unsafe stack pointer,
+ /// returns the address of that location. Otherwise, returns nullptr.
+ Value *getSafeStackPointerLocation(IRBuilder<> &IRB) const override;
+
+ /// If a physical register, this returns the register that receives the
+ /// exception address on entry to an EH pad.
+ unsigned
+ getExceptionPointerRegister(const Constant *PersonalityFn) const override {
+ // FIXME: This is a guess. Has this been defined yet?
+ return AArch64::X0;
+ }
+
+ /// If a physical register, this returns the register that receives the
+ /// exception typeid on entry to a landing pad.
+ unsigned
+ getExceptionSelectorRegister(const Constant *PersonalityFn) const override {
+ // FIXME: This is a guess. Has this been defined yet?
+ return AArch64::X1;
+ }
+
+ bool isCheapToSpeculateCttz() const override {
+ return true;
+ }
+
+ bool isCheapToSpeculateCtlz() const override {
+ return true;
+ }
+ bool supportSplitCSR(MachineFunction *MF) const override {
+ return MF->getFunction()->getCallingConv() == CallingConv::CXX_FAST_TLS &&
+ MF->getFunction()->hasFnAttribute(Attribute::NoUnwind);
+ }
+ void initializeSplitCSR(MachineBasicBlock *Entry) const override;
+ void insertCopiesSplitCSR(
+ MachineBasicBlock *Entry,
+ const SmallVectorImpl<MachineBasicBlock *> &Exits) const override;
+
private:
bool isExtFreeImpl(const Instruction *Ext) const override;
- /// Subtarget - Keep a pointer to the AArch64Subtarget around so that we can
+ /// Keep a pointer to the AArch64Subtarget around so that we can
/// make the right decision when generating code for different targets.
const AArch64Subtarget *Subtarget;
@@ -392,6 +420,8 @@ private:
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
bool isThisReturn, SDValue ThisVal) const;
+ SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
+
bool isEligibleForTailCallOptimization(
SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
bool isCalleeStructRet, bool isCallerStructRet,
@@ -470,7 +500,7 @@ private:
SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
std::vector<SDNode *> *Created) const override;
- bool combineRepeatedFPDivisors(unsigned NumUsers) const override;
+ unsigned combineRepeatedFPDivisors() const override;
ConstraintType getConstraintType(StringRef Constraint) const override;
unsigned getRegisterByName(const char* RegName, EVT VT,
@@ -516,6 +546,8 @@ private:
bool functionArgumentNeedsConsecutiveRegisters(Type *Ty,
CallingConv::ID CallConv,
bool isVarArg) const override;
+
+ bool shouldNormalizeToSelectSequence(LLVMContext &, EVT) const override;
};
namespace AArch64 {
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/contrib/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index 3f2e772a90c4..6ac2175e5035 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/contrib/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -248,6 +248,12 @@ def simm7s16 : Operand<i32> {
let PrintMethod = "printImmScale<16>";
}
+def am_indexed7s8 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S8", []>;
+def am_indexed7s16 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S16", []>;
+def am_indexed7s32 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S32", []>;
+def am_indexed7s64 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S64", []>;
+def am_indexed7s128 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S128", []>;
+
class AsmImmRange<int Low, int High> : AsmOperandClass {
let Name = "Imm" # Low # "_" # High;
let DiagnosticType = "InvalidImm" # Low # "_" # High;
@@ -346,9 +352,11 @@ class fixedpoint_i64<ValueType FloatVT>
let ParserMatchClass = Imm1_64Operand;
}
+def fixedpoint_f16_i32 : fixedpoint_i32<f16>;
def fixedpoint_f32_i32 : fixedpoint_i32<f32>;
def fixedpoint_f64_i32 : fixedpoint_i32<f64>;
+def fixedpoint_f16_i64 : fixedpoint_i64<f16>;
def fixedpoint_f32_i64 : fixedpoint_i64<f32>;
def fixedpoint_f64_i64 : fixedpoint_i64<f64>;
@@ -402,6 +410,7 @@ def vecshiftR64Narrow : Operand<i32>, ImmLeaf<i32, [{
let ParserMatchClass = Imm1_32Operand;
}
+def Imm0_1Operand : AsmImmRange<0, 1>;
def Imm0_7Operand : AsmImmRange<0, 7>;
def Imm0_15Operand : AsmImmRange<0, 15>;
def Imm0_31Operand : AsmImmRange<0, 31>;
@@ -525,6 +534,20 @@ def imm0_31 : Operand<i64>, ImmLeaf<i64, [{
let ParserMatchClass = Imm0_31Operand;
}
+// True if the 32-bit immediate is in the range [0,31]
+def imm32_0_31 : Operand<i32>, ImmLeaf<i32, [{
+ return ((uint64_t)Imm) < 32;
+}]> {
+ let ParserMatchClass = Imm0_31Operand;
+}
+
+// imm0_1 predicate - True if the immediate is in the range [0,1]
+def imm0_1 : Operand<i64>, ImmLeaf<i64, [{
+ return ((uint64_t)Imm) < 2;
+}]> {
+ let ParserMatchClass = Imm0_1Operand;
+}
+
// imm0_15 predicate - True if the immediate is in the range [0,15]
def imm0_15 : Operand<i64>, ImmLeaf<i64, [{
return ((uint64_t)Imm) < 16;
@@ -542,7 +565,9 @@ def imm0_7 : Operand<i64>, ImmLeaf<i64, [{
// imm32_0_15 predicate - True if the 32-bit immediate is in the range [0,15]
def imm32_0_15 : Operand<i32>, ImmLeaf<i32, [{
return ((uint32_t)Imm) < 16;
-}]>;
+}]> {
+ let ParserMatchClass = Imm0_15Operand;
+}
// An arithmetic shifter operand:
// {7-6} - shift type: 00 = lsl, 01 = lsr, 10 = asr
@@ -690,6 +715,17 @@ class arith_extended_reg32to64<ValueType Ty> : Operand<Ty>,
}
// Floating-point immediate.
+def fpimm16 : Operand<f16>,
+ PatLeaf<(f16 fpimm), [{
+ return AArch64_AM::getFP16Imm(N->getValueAPF()) != -1;
+ }], SDNodeXForm<fpimm, [{
+ APFloat InVal = N->getValueAPF();
+ uint32_t enc = AArch64_AM::getFP16Imm(InVal);
+ return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i32);
+ }]>> {
+ let ParserMatchClass = FPImmOperand;
+ let PrintMethod = "printFPImmOperand";
+}
def fpimm32 : Operand<f32>,
PatLeaf<(f32 fpimm), [{
return AArch64_AM::getFP32Imm(N->getValueAPF()) != -1;
@@ -822,7 +858,7 @@ class RtSystemI<bit L, dag oops, dag iops, string asm, string operands>
// model patterns with sufficiently fine granularity
let mayStore = 1, mayLoad = 1, hasSideEffects = 1 in
class HintI<string mnemonic>
- : SimpleSystemI<0, (ins imm0_127:$imm), mnemonic#" $imm", "",
+ : SimpleSystemI<0, (ins imm0_127:$imm), mnemonic#"\t$imm", "",
[(int_aarch64_hint imm0_127:$imm)]>,
Sched<[WriteHint]> {
bits <7> imm;
@@ -875,6 +911,25 @@ def msr_sysreg_op : Operand<i32> {
let PrintMethod = "printMSRSystemRegister";
}
+def PSBHintOperand : AsmOperandClass {
+ let Name = "PSBHint";
+ let ParserMethod = "tryParsePSBHint";
+}
+def psbhint_op : Operand<i32> {
+ let ParserMatchClass = PSBHintOperand;
+ let PrintMethod = "printPSBHintOp";
+ let MCOperandPredicate = [{
+ // Check if the operand is valid, to fix exhaustive aliasing in disassembly.
+ // "psb" is an alias to "hint" only for certain values of CRm:Op2 fields.
+ if (!MCOp.isImm())
+ return false;
+ bool ValidNamed;
+ (void)AArch64PSBHint::PSBHintMapper().toString(MCOp.getImm(),
+ STI.getFeatureBits(), ValidNamed);
+ return ValidNamed;
+ }];
+}
+
class MRSI : RtSystemI<1, (outs GPR64:$Rt), (ins mrs_sysreg_op:$systemreg),
"mrs", "\t$Rt, $systemreg"> {
bits<16> systemreg;
@@ -890,19 +945,19 @@ class MSRI : RtSystemI<0, (outs), (ins msr_sysreg_op:$systemreg, GPR64:$Rt),
let Inst{20-5} = systemreg;
}
-def SystemPStateFieldOperand : AsmOperandClass {
- let Name = "SystemPStateField";
+def SystemPStateFieldWithImm0_15Operand : AsmOperandClass {
+ let Name = "SystemPStateFieldWithImm0_15";
let ParserMethod = "tryParseSysReg";
}
-def pstatefield_op : Operand<i32> {
- let ParserMatchClass = SystemPStateFieldOperand;
+def pstatefield4_op : Operand<i32> {
+ let ParserMatchClass = SystemPStateFieldWithImm0_15Operand;
let PrintMethod = "printSystemPStateField";
}
let Defs = [NZCV] in
-class MSRpstateI
- : SimpleSystemI<0, (ins pstatefield_op:$pstate_field, imm0_15:$imm),
- "msr", "\t$pstate_field, $imm">,
+class MSRpstateImm0_15
+ : SimpleSystemI<0, (ins pstatefield4_op:$pstatefield, imm0_15:$imm),
+ "msr", "\t$pstatefield, $imm">,
Sched<[WriteSys]> {
bits<6> pstatefield;
bits<4> imm;
@@ -913,6 +968,37 @@ class MSRpstateI
let Inst{7-5} = pstatefield{2-0};
let DecoderMethod = "DecodeSystemPStateInstruction";
+  // MSRpstateI aliases with MSRI. When the MSRpstateI decoder method returns
+  // Fail, the decoder should attempt to decode the instruction as MSRI.
+ let hasCompleteDecoder = 0;
+}
+
+def SystemPStateFieldWithImm0_1Operand : AsmOperandClass {
+ let Name = "SystemPStateFieldWithImm0_1";
+ let ParserMethod = "tryParseSysReg";
+}
+def pstatefield1_op : Operand<i32> {
+ let ParserMatchClass = SystemPStateFieldWithImm0_1Operand;
+ let PrintMethod = "printSystemPStateField";
+}
+
+let Defs = [NZCV] in
+class MSRpstateImm0_1
+ : SimpleSystemI<0, (ins pstatefield1_op:$pstatefield, imm0_1:$imm),
+ "msr", "\t$pstatefield, $imm">,
+ Sched<[WriteSys]> {
+ bits<6> pstatefield;
+ bit imm;
+ let Inst{20-19} = 0b00;
+ let Inst{18-16} = pstatefield{5-3};
+ let Inst{15-9} = 0b0100000;
+ let Inst{8} = imm;
+ let Inst{7-5} = pstatefield{2-0};
+
+ let DecoderMethod = "DecodeSystemPStateInstruction";
+  // MSRpstateI aliases with MSRI. When the MSRpstateI decoder method returns
+  // Fail, the decoder should attempt to decode the instruction as MSRI.
+ let hasCompleteDecoder = 0;
}
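
(A minimal sketch of how the MSRpstateImm0_1 field assignments above compose:
only the bits set by the 'let Inst{...}' lines are packed here, and the fixed
opcode bits contributed by the base system-instruction classes are omitted.
The helper name is illustrative, not code from this patch.)

#include <cassert>
#include <cstdio>

static unsigned packMSRpstateImm0_1Fields(unsigned PStateField, unsigned Imm) {
  assert(PStateField < 64 && Imm < 2);
  unsigned W = 0;
  W |= 0u << 19;                            // Inst{20-19} = 0b00
  W |= ((PStateField >> 3) & 0x7u) << 16;   // Inst{18-16} = pstatefield{5-3}
  W |= 0x20u << 9;                          // Inst{15-9}  = 0b0100000
  W |= (Imm & 0x1u) << 8;                   // Inst{8}     = imm
  W |= (PStateField & 0x7u) << 5;           // Inst{7-5}   = pstatefield{2-0}
  return W;
}

int main() {
  std::printf("0x%08x\n", packMSRpstateImm0_1Fields(0x3, 1));
  return 0;
}
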
// SYS and SYSL generic system instructions.
@@ -1341,7 +1427,7 @@ multiclass Shift<bits<2> shift_type, string asm, SDNode OpNode> {
}
class ShiftAlias<string asm, Instruction inst, RegisterClass regtype>
- : InstAlias<asm#" $dst, $src1, $src2",
+ : InstAlias<asm#"\t$dst, $src1, $src2",
(inst regtype:$dst, regtype:$src1, regtype:$src2), 0>;
class BaseMulAccum<bit isSub, bits<3> opc, RegisterClass multype,
@@ -1407,13 +1493,13 @@ class MulHi<bits<3> opc, string asm, SDNode OpNode>
}
class MulAccumWAlias<string asm, Instruction inst>
- : InstAlias<asm#" $dst, $src1, $src2",
+ : InstAlias<asm#"\t$dst, $src1, $src2",
(inst GPR32:$dst, GPR32:$src1, GPR32:$src2, WZR)>;
class MulAccumXAlias<string asm, Instruction inst>
- : InstAlias<asm#" $dst, $src1, $src2",
+ : InstAlias<asm#"\t$dst, $src1, $src2",
(inst GPR64:$dst, GPR64:$src1, GPR64:$src2, XZR)>;
class WideMulAccumAlias<string asm, Instruction inst>
- : InstAlias<asm#" $dst, $src1, $src2",
+ : InstAlias<asm#"\t$dst, $src1, $src2",
(inst GPR64:$dst, GPR32:$src1, GPR32:$src2, XZR)>;
class BaseCRC32<bit sf, bits<2> sz, bit C, RegisterClass StreamReg,
@@ -1643,7 +1729,7 @@ class BaseAddSubEReg64<bit isSub, bit setFlags, RegisterClass dstRegtype,
class AddSubRegAlias<string asm, Instruction inst, RegisterClass dstRegtype,
RegisterClass src1Regtype, RegisterClass src2Regtype,
int shiftExt>
- : InstAlias<asm#" $dst, $src1, $src2",
+ : InstAlias<asm#"\t$dst, $src1, $src2",
(inst dstRegtype:$dst, src1Regtype:$src1, src2Regtype:$src2,
shiftExt)>;
@@ -1701,10 +1787,10 @@ multiclass AddSub<bit isSub, string mnemonic, string alias,
}
  // add Rd, Rn, -imm -> sub Rd, Rn, imm
- def : InstAlias<alias#" $Rd, $Rn, $imm",
+ def : InstAlias<alias#"\t$Rd, $Rn, $imm",
(!cast<Instruction>(NAME # "Wri") GPR32sp:$Rd, GPR32sp:$Rn,
addsub_shifted_imm32_neg:$imm), 0>;
- def : InstAlias<alias#" $Rd, $Rn, $imm",
+ def : InstAlias<alias#"\t$Rd, $Rn, $imm",
(!cast<Instruction>(NAME # "Xri") GPR64sp:$Rd, GPR64sp:$Rn,
addsub_shifted_imm64_neg:$imm), 0>;
@@ -1776,43 +1862,43 @@ multiclass AddSubS<bit isSub, string mnemonic, SDNode OpNode, string cmp,
} // Defs = [NZCV]
// Support negative immediates, e.g. adds Rd, Rn, -imm -> subs Rd, Rn, imm
- def : InstAlias<alias#" $Rd, $Rn, $imm",
+ def : InstAlias<alias#"\t$Rd, $Rn, $imm",
(!cast<Instruction>(NAME # "Wri") GPR32:$Rd, GPR32sp:$Rn,
addsub_shifted_imm32_neg:$imm), 0>;
- def : InstAlias<alias#" $Rd, $Rn, $imm",
+ def : InstAlias<alias#"\t$Rd, $Rn, $imm",
(!cast<Instruction>(NAME # "Xri") GPR64:$Rd, GPR64sp:$Rn,
addsub_shifted_imm64_neg:$imm), 0>;
// Compare aliases
- def : InstAlias<cmp#" $src, $imm", (!cast<Instruction>(NAME#"Wri")
+ def : InstAlias<cmp#"\t$src, $imm", (!cast<Instruction>(NAME#"Wri")
WZR, GPR32sp:$src, addsub_shifted_imm32:$imm), 5>;
- def : InstAlias<cmp#" $src, $imm", (!cast<Instruction>(NAME#"Xri")
+ def : InstAlias<cmp#"\t$src, $imm", (!cast<Instruction>(NAME#"Xri")
XZR, GPR64sp:$src, addsub_shifted_imm64:$imm), 5>;
- def : InstAlias<cmp#" $src1, $src2$sh", (!cast<Instruction>(NAME#"Wrx")
+ def : InstAlias<cmp#"\t$src1, $src2$sh", (!cast<Instruction>(NAME#"Wrx")
WZR, GPR32sp:$src1, GPR32:$src2, arith_extend:$sh), 4>;
- def : InstAlias<cmp#" $src1, $src2$sh", (!cast<Instruction>(NAME#"Xrx")
+ def : InstAlias<cmp#"\t$src1, $src2$sh", (!cast<Instruction>(NAME#"Xrx")
XZR, GPR64sp:$src1, GPR32:$src2, arith_extend:$sh), 4>;
- def : InstAlias<cmp#" $src1, $src2$sh", (!cast<Instruction>(NAME#"Xrx64")
+ def : InstAlias<cmp#"\t$src1, $src2$sh", (!cast<Instruction>(NAME#"Xrx64")
XZR, GPR64sp:$src1, GPR64:$src2, arith_extendlsl64:$sh), 4>;
- def : InstAlias<cmp#" $src1, $src2$sh", (!cast<Instruction>(NAME#"Wrs")
+ def : InstAlias<cmp#"\t$src1, $src2$sh", (!cast<Instruction>(NAME#"Wrs")
WZR, GPR32:$src1, GPR32:$src2, arith_shift32:$sh), 4>;
- def : InstAlias<cmp#" $src1, $src2$sh", (!cast<Instruction>(NAME#"Xrs")
+ def : InstAlias<cmp#"\t$src1, $src2$sh", (!cast<Instruction>(NAME#"Xrs")
XZR, GPR64:$src1, GPR64:$src2, arith_shift64:$sh), 4>;
// Support negative immediates, e.g. cmp Rn, -imm -> cmn Rn, imm
- def : InstAlias<cmpAlias#" $src, $imm", (!cast<Instruction>(NAME#"Wri")
+ def : InstAlias<cmpAlias#"\t$src, $imm", (!cast<Instruction>(NAME#"Wri")
WZR, GPR32sp:$src, addsub_shifted_imm32_neg:$imm), 0>;
- def : InstAlias<cmpAlias#" $src, $imm", (!cast<Instruction>(NAME#"Xri")
+ def : InstAlias<cmpAlias#"\t$src, $imm", (!cast<Instruction>(NAME#"Xri")
XZR, GPR64sp:$src, addsub_shifted_imm64_neg:$imm), 0>;
// Compare shorthands
- def : InstAlias<cmp#" $src1, $src2", (!cast<Instruction>(NAME#"Wrs")
+ def : InstAlias<cmp#"\t$src1, $src2", (!cast<Instruction>(NAME#"Wrs")
WZR, GPR32:$src1, GPR32:$src2, 0), 5>;
- def : InstAlias<cmp#" $src1, $src2", (!cast<Instruction>(NAME#"Xrs")
+ def : InstAlias<cmp#"\t$src1, $src2", (!cast<Instruction>(NAME#"Xrs")
XZR, GPR64:$src1, GPR64:$src2, 0), 5>;
- def : InstAlias<cmp#" $src1, $src2", (!cast<Instruction>(NAME#"Wrx")
+ def : InstAlias<cmp#"\t$src1, $src2", (!cast<Instruction>(NAME#"Wrx")
WZR, GPR32sponly:$src1, GPR32:$src2, 16), 5>;
- def : InstAlias<cmp#" $src1, $src2", (!cast<Instruction>(NAME#"Xrx64")
+ def : InstAlias<cmp#"\t$src1, $src2", (!cast<Instruction>(NAME#"Xrx64")
XZR, GPR64sponly:$src1, GPR64:$src2, 24), 5>;
// Register/register aliases with no shift when SP is not used.
@@ -1998,7 +2084,7 @@ class BaseLogicalSReg<bits<2> opc, bit N, RegisterClass regtype,
// Aliases for register+register logical instructions.
class LogicalRegAlias<string asm, Instruction inst, RegisterClass regtype>
- : InstAlias<asm#" $dst, $src1, $src2",
+ : InstAlias<asm#"\t$dst, $src1, $src2",
(inst regtype:$dst, regtype:$src1, regtype:$src2, 0)>;
multiclass LogicalImm<bits<2> opc, string mnemonic, SDNode OpNode,
@@ -2017,10 +2103,10 @@ multiclass LogicalImm<bits<2> opc, string mnemonic, SDNode OpNode,
let Inst{31} = 1;
}
- def : InstAlias<Alias # " $Rd, $Rn, $imm",
+ def : InstAlias<Alias # "\t$Rd, $Rn, $imm",
(!cast<Instruction>(NAME # "Wri") GPR32sp:$Rd, GPR32:$Rn,
logical_imm32_not:$imm), 0>;
- def : InstAlias<Alias # " $Rd, $Rn, $imm",
+ def : InstAlias<Alias # "\t$Rd, $Rn, $imm",
(!cast<Instruction>(NAME # "Xri") GPR64sp:$Rd, GPR64:$Rn,
logical_imm64_not:$imm), 0>;
}
@@ -2039,10 +2125,10 @@ multiclass LogicalImmS<bits<2> opc, string mnemonic, SDNode OpNode,
}
} // end Defs = [NZCV]
- def : InstAlias<Alias # " $Rd, $Rn, $imm",
+ def : InstAlias<Alias # "\t$Rd, $Rn, $imm",
(!cast<Instruction>(NAME # "Wri") GPR32:$Rd, GPR32:$Rn,
logical_imm32_not:$imm), 0>;
- def : InstAlias<Alias # " $Rd, $Rn, $imm",
+ def : InstAlias<Alias # "\t$Rd, $Rn, $imm",
(!cast<Instruction>(NAME # "Xri") GPR64:$Rd, GPR64:$Rn,
logical_imm64_not:$imm), 0>;
}
@@ -2105,9 +2191,12 @@ multiclass LogicalRegS<bits<2> opc, bit N, string mnemonic,
//---
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
-class BaseCondSetFlagsImm<bit op, RegisterClass regtype, string asm>
- : I<(outs), (ins regtype:$Rn, imm0_31:$imm, imm0_15:$nzcv, ccode:$cond),
- asm, "\t$Rn, $imm, $nzcv, $cond", "", []>,
+class BaseCondComparisonImm<bit op, RegisterClass regtype, ImmLeaf immtype,
+ string mnemonic, SDNode OpNode>
+ : I<(outs), (ins regtype:$Rn, immtype:$imm, imm32_0_15:$nzcv, ccode:$cond),
+ mnemonic, "\t$Rn, $imm, $nzcv, $cond", "",
+ [(set NZCV, (OpNode regtype:$Rn, immtype:$imm, (i32 imm:$nzcv),
+ (i32 imm:$cond), NZCV))]>,
Sched<[WriteI, ReadI]> {
let Uses = [NZCV];
let Defs = [NZCV];
@@ -2127,19 +2216,13 @@ class BaseCondSetFlagsImm<bit op, RegisterClass regtype, string asm>
let Inst{3-0} = nzcv;
}
-multiclass CondSetFlagsImm<bit op, string asm> {
- def Wi : BaseCondSetFlagsImm<op, GPR32, asm> {
- let Inst{31} = 0;
- }
- def Xi : BaseCondSetFlagsImm<op, GPR64, asm> {
- let Inst{31} = 1;
- }
-}
-
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
-class BaseCondSetFlagsReg<bit op, RegisterClass regtype, string asm>
- : I<(outs), (ins regtype:$Rn, regtype:$Rm, imm0_15:$nzcv, ccode:$cond),
- asm, "\t$Rn, $Rm, $nzcv, $cond", "", []>,
+class BaseCondComparisonReg<bit op, RegisterClass regtype, string mnemonic,
+ SDNode OpNode>
+ : I<(outs), (ins regtype:$Rn, regtype:$Rm, imm32_0_15:$nzcv, ccode:$cond),
+ mnemonic, "\t$Rn, $Rm, $nzcv, $cond", "",
+ [(set NZCV, (OpNode regtype:$Rn, regtype:$Rm, (i32 imm:$nzcv),
+ (i32 imm:$cond), NZCV))]>,
Sched<[WriteI, ReadI, ReadI]> {
let Uses = [NZCV];
let Defs = [NZCV];
@@ -2159,11 +2242,19 @@ class BaseCondSetFlagsReg<bit op, RegisterClass regtype, string asm>
let Inst{3-0} = nzcv;
}
-multiclass CondSetFlagsReg<bit op, string asm> {
- def Wr : BaseCondSetFlagsReg<op, GPR32, asm> {
+multiclass CondComparison<bit op, string mnemonic, SDNode OpNode> {
+ // immediate operand variants
+ def Wi : BaseCondComparisonImm<op, GPR32, imm32_0_31, mnemonic, OpNode> {
let Inst{31} = 0;
}
- def Xr : BaseCondSetFlagsReg<op, GPR64, asm> {
+ def Xi : BaseCondComparisonImm<op, GPR64, imm0_31, mnemonic, OpNode> {
+ let Inst{31} = 1;
+ }
+ // register operand variants
+ def Wr : BaseCondComparisonReg<op, GPR32, mnemonic, OpNode> {
+ let Inst{31} = 0;
+ }
+ def Xr : BaseCondComparisonReg<op, GPR64, mnemonic, OpNode> {
let Inst{31} = 1;
}
}
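
(These patterns model the architectural behaviour of conditional compare:
when the condition holds, NZCV is set from the comparison of the two operands;
otherwise NZCV is loaded with the literal 4-bit nzcv operand. The sketch below
illustrates that for a ccmp-style subtract compare on the 64-bit immediate
form; the helper is illustrative, not code from this patch.)

#include <cstdint>
#include <cstdio>

static unsigned ccmpImmFlags(uint64_t Rn, uint64_t Imm, unsigned NZCVImm,
                             bool CondHolds) {
  if (!CondHolds)
    return NZCVImm & 0xF;                  // take the immediate flags verbatim
  uint64_t Res = Rn - Imm;                 // flag-setting compare (SUBS)
  unsigned N = (Res >> 63) & 1;
  unsigned Z = (Res == 0);
  unsigned C = (Rn >= Imm);                // unsigned: carry = no borrow
  unsigned V = static_cast<unsigned>(((Rn ^ Imm) & (Rn ^ Res)) >> 63) & 1;
  return (N << 3) | (Z << 2) | (C << 1) | V;
}

int main() {
  std::printf("cond true:  %x\n", ccmpImmFlags(5, 7, 0x2, true));   // 0x8 (N set)
  std::printf("cond false: %x\n", ccmpImmFlags(5, 7, 0x2, false));  // 0x2
  return 0;
}
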
@@ -2328,7 +2419,7 @@ multiclass LoadUI<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
asm, pattern>,
Sched<[WriteLD]>;
- def : InstAlias<asm # " $Rt, [$Rn]",
+ def : InstAlias<asm # "\t$Rt, [$Rn]",
(!cast<Instruction>(NAME # "ui") regtype:$Rt, GPR64sp:$Rn, 0)>;
}
@@ -2340,7 +2431,7 @@ multiclass StoreUI<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
asm, pattern>,
Sched<[WriteST]>;
- def : InstAlias<asm # " $Rt, [$Rn]",
+ def : InstAlias<asm # "\t$Rt, [$Rn]",
(!cast<Instruction>(NAME # "ui") regtype:$Rt, GPR64sp:$Rn, 0)>;
}
@@ -2508,7 +2599,7 @@ class LoadStore8RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
}
class ROInstAlias<string asm, RegisterClass regtype, Instruction INST>
- : InstAlias<asm # " $Rt, [$Rn, $Rm]",
+ : InstAlias<asm # "\t$Rt, [$Rn, $Rm]",
(INST regtype:$Rt, GPR64sp:$Rn, GPR64:$Rm, 0, 0)>;
multiclass Load8RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
@@ -2934,7 +3025,7 @@ multiclass LoadUnscaled<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
(ins GPR64sp:$Rn, simm9:$offset), asm, pattern>,
Sched<[WriteLD]>;
- def : InstAlias<asm # " $Rt, [$Rn]",
+ def : InstAlias<asm # "\t$Rt, [$Rn]",
(!cast<Instruction>(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>;
}
@@ -2946,7 +3037,7 @@ multiclass StoreUnscaled<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
asm, pattern>,
Sched<[WriteST]>;
- def : InstAlias<asm # " $Rt, [$Rn]",
+ def : InstAlias<asm # "\t$Rt, [$Rn]",
(!cast<Instruction>(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>;
}
@@ -2958,7 +3049,7 @@ multiclass PrefetchUnscaled<bits<2> sz, bit V, bits<2> opc, string asm,
asm, pat>,
Sched<[WriteLD]>;
- def : InstAlias<asm # " $Rt, [$Rn]",
+ def : InstAlias<asm # "\t$Rt, [$Rn]",
(!cast<Instruction>(NAME # "i") prfop:$Rt, GPR64sp:$Rn, 0)>;
}
@@ -2993,7 +3084,7 @@ multiclass LoadUnprivileged<bits<2> sz, bit V, bits<2> opc,
(ins GPR64sp:$Rn, simm9:$offset), asm>,
Sched<[WriteLD]>;
- def : InstAlias<asm # " $Rt, [$Rn]",
+ def : InstAlias<asm # "\t$Rt, [$Rn]",
(!cast<Instruction>(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>;
}
@@ -3005,7 +3096,7 @@ multiclass StoreUnprivileged<bits<2> sz, bit V, bits<2> opc,
asm>,
Sched<[WriteST]>;
- def : InstAlias<asm # " $Rt, [$Rn]",
+ def : InstAlias<asm # "\t$Rt, [$Rn]",
(!cast<Instruction>(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>;
}
@@ -3136,7 +3227,7 @@ multiclass LoadPairOffset<bits<2> opc, bit V, RegisterClass regtype,
(ins GPR64sp:$Rn, indextype:$offset), asm>,
Sched<[WriteLD, WriteLDHi]>;
- def : InstAlias<asm # " $Rt, $Rt2, [$Rn]",
+ def : InstAlias<asm # "\t$Rt, $Rt2, [$Rn]",
(!cast<Instruction>(NAME # "i") regtype:$Rt, regtype:$Rt2,
GPR64sp:$Rn, 0)>;
}
@@ -3151,7 +3242,7 @@ multiclass StorePairOffset<bits<2> opc, bit V, RegisterClass regtype,
asm>,
Sched<[WriteSTP]>;
- def : InstAlias<asm # " $Rt, $Rt2, [$Rn]",
+ def : InstAlias<asm # "\t$Rt, $Rt2, [$Rn]",
(!cast<Instruction>(NAME # "i") regtype:$Rt, regtype:$Rt2,
GPR64sp:$Rn, 0)>;
}
@@ -3230,8 +3321,8 @@ class LoadPairPostIdx<bits<2> opc, bit V, RegisterClass regtype,
let mayStore = 1, mayLoad = 0 in
class StorePairPostIdx<bits<2> opc, bit V, RegisterClass regtype,
Operand idxtype, string asm>
- : BaseLoadStorePairPostIdx<opc, V, 0, (outs),
- (ins GPR64sp:$wback, regtype:$Rt, regtype:$Rt2,
+ : BaseLoadStorePairPostIdx<opc, V, 0, (outs GPR64sp:$wback),
+ (ins regtype:$Rt, regtype:$Rt2,
GPR64sp:$Rn, idxtype:$offset),
asm>,
Sched<[WriteAdr, WriteSTP]>;
@@ -3477,6 +3568,20 @@ class BaseFPToInteger<bits<2> type, bits<2> rmode, bits<3> opcode,
multiclass FPToIntegerUnscaled<bits<2> rmode, bits<3> opcode, string asm,
SDPatternOperator OpN> {
+ // Unscaled half-precision to 32-bit
+ def UWHr : BaseFPToIntegerUnscaled<0b11, rmode, opcode, FPR16, GPR32, asm,
+ [(set GPR32:$Rd, (OpN FPR16:$Rn))]> {
+ let Inst{31} = 0; // 32-bit GPR flag
+ let Predicates = [HasFullFP16];
+ }
+
+ // Unscaled half-precision to 64-bit
+ def UXHr : BaseFPToIntegerUnscaled<0b11, rmode, opcode, FPR16, GPR64, asm,
+ [(set GPR64:$Rd, (OpN FPR16:$Rn))]> {
+ let Inst{31} = 1; // 64-bit GPR flag
+ let Predicates = [HasFullFP16];
+ }
+
// Unscaled single-precision to 32-bit
def UWSr : BaseFPToIntegerUnscaled<0b00, rmode, opcode, FPR32, GPR32, asm,
[(set GPR32:$Rd, (OpN FPR32:$Rn))]> {
@@ -3504,6 +3609,25 @@ multiclass FPToIntegerUnscaled<bits<2> rmode, bits<3> opcode, string asm,
multiclass FPToIntegerScaled<bits<2> rmode, bits<3> opcode, string asm,
SDPatternOperator OpN> {
+ // Scaled half-precision to 32-bit
+ def SWHri : BaseFPToInteger<0b11, rmode, opcode, FPR16, GPR32,
+ fixedpoint_f16_i32, asm,
+ [(set GPR32:$Rd, (OpN (fmul FPR16:$Rn,
+ fixedpoint_f16_i32:$scale)))]> {
+ let Inst{31} = 0; // 32-bit GPR flag
+ let scale{5} = 1;
+ let Predicates = [HasFullFP16];
+ }
+
+ // Scaled half-precision to 64-bit
+ def SXHri : BaseFPToInteger<0b11, rmode, opcode, FPR16, GPR64,
+ fixedpoint_f16_i64, asm,
+ [(set GPR64:$Rd, (OpN (fmul FPR16:$Rn,
+ fixedpoint_f16_i64:$scale)))]> {
+ let Inst{31} = 1; // 64-bit GPR flag
+ let Predicates = [HasFullFP16];
+ }
+
// Scaled single-precision to 32-bit
def SWSri : BaseFPToInteger<0b00, rmode, opcode, FPR32, GPR32,
fixedpoint_f32_i32, asm,
@@ -3553,7 +3677,7 @@ class BaseIntegerToFP<bit isUnsigned,
bits<5> Rd;
bits<5> Rn;
bits<6> scale;
- let Inst{30-23} = 0b00111100;
+ let Inst{30-24} = 0b0011110;
let Inst{21-17} = 0b00001;
let Inst{16} = isUnsigned;
let Inst{15-10} = scale;
@@ -3570,7 +3694,7 @@ class BaseIntegerToFPUnscaled<bit isUnsigned,
bits<5> Rd;
bits<5> Rn;
bits<6> scale;
- let Inst{30-23} = 0b00111100;
+ let Inst{30-24} = 0b0011110;
let Inst{21-17} = 0b10001;
let Inst{16} = isUnsigned;
let Inst{15-10} = 0b000000;
@@ -3580,33 +3704,55 @@ class BaseIntegerToFPUnscaled<bit isUnsigned,
multiclass IntegerToFP<bit isUnsigned, string asm, SDNode node> {
// Unscaled
+ def UWHri: BaseIntegerToFPUnscaled<isUnsigned, GPR32, FPR16, f16, asm, node> {
+ let Inst{31} = 0; // 32-bit GPR flag
+ let Inst{23-22} = 0b11; // 16-bit FPR flag
+ let Predicates = [HasFullFP16];
+ }
+
def UWSri: BaseIntegerToFPUnscaled<isUnsigned, GPR32, FPR32, f32, asm, node> {
let Inst{31} = 0; // 32-bit GPR flag
- let Inst{22} = 0; // 32-bit FPR flag
+ let Inst{23-22} = 0b00; // 32-bit FPR flag
}
def UWDri: BaseIntegerToFPUnscaled<isUnsigned, GPR32, FPR64, f64, asm, node> {
let Inst{31} = 0; // 32-bit GPR flag
- let Inst{22} = 1; // 64-bit FPR flag
+ let Inst{23-22} = 0b01; // 64-bit FPR flag
+ }
+
+ def UXHri: BaseIntegerToFPUnscaled<isUnsigned, GPR64, FPR16, f16, asm, node> {
+ let Inst{31} = 1; // 64-bit GPR flag
+ let Inst{23-22} = 0b11; // 16-bit FPR flag
+ let Predicates = [HasFullFP16];
}
def UXSri: BaseIntegerToFPUnscaled<isUnsigned, GPR64, FPR32, f32, asm, node> {
let Inst{31} = 1; // 64-bit GPR flag
- let Inst{22} = 0; // 32-bit FPR flag
+ let Inst{23-22} = 0b00; // 32-bit FPR flag
}
def UXDri: BaseIntegerToFPUnscaled<isUnsigned, GPR64, FPR64, f64, asm, node> {
let Inst{31} = 1; // 64-bit GPR flag
- let Inst{22} = 1; // 64-bit FPR flag
+ let Inst{23-22} = 0b01; // 64-bit FPR flag
}
// Scaled
+ def SWHri: BaseIntegerToFP<isUnsigned, GPR32, FPR16, fixedpoint_f16_i32, asm,
+ [(set FPR16:$Rd,
+ (fdiv (node GPR32:$Rn),
+ fixedpoint_f16_i32:$scale))]> {
+ let Inst{31} = 0; // 32-bit GPR flag
+ let Inst{23-22} = 0b11; // 16-bit FPR flag
+ let scale{5} = 1;
+ let Predicates = [HasFullFP16];
+ }
+
def SWSri: BaseIntegerToFP<isUnsigned, GPR32, FPR32, fixedpoint_f32_i32, asm,
[(set FPR32:$Rd,
(fdiv (node GPR32:$Rn),
fixedpoint_f32_i32:$scale))]> {
let Inst{31} = 0; // 32-bit GPR flag
- let Inst{22} = 0; // 32-bit FPR flag
+ let Inst{23-22} = 0b00; // 32-bit FPR flag
let scale{5} = 1;
}
@@ -3615,16 +3761,25 @@ multiclass IntegerToFP<bit isUnsigned, string asm, SDNode node> {
(fdiv (node GPR32:$Rn),
fixedpoint_f64_i32:$scale))]> {
let Inst{31} = 0; // 32-bit GPR flag
- let Inst{22} = 1; // 64-bit FPR flag
+ let Inst{23-22} = 0b01; // 64-bit FPR flag
let scale{5} = 1;
}
+ def SXHri: BaseIntegerToFP<isUnsigned, GPR64, FPR16, fixedpoint_f16_i64, asm,
+ [(set FPR16:$Rd,
+ (fdiv (node GPR64:$Rn),
+ fixedpoint_f16_i64:$scale))]> {
+ let Inst{31} = 1; // 64-bit GPR flag
+ let Inst{23-22} = 0b11; // 16-bit FPR flag
+ let Predicates = [HasFullFP16];
+ }
+
def SXSri: BaseIntegerToFP<isUnsigned, GPR64, FPR32, fixedpoint_f32_i64, asm,
[(set FPR32:$Rd,
(fdiv (node GPR64:$Rn),
fixedpoint_f32_i64:$scale))]> {
let Inst{31} = 1; // 64-bit GPR flag
- let Inst{22} = 0; // 32-bit FPR flag
+ let Inst{23-22} = 0b00; // 32-bit FPR flag
}
def SXDri: BaseIntegerToFP<isUnsigned, GPR64, FPR64, fixedpoint_f64_i64, asm,
@@ -3632,7 +3787,7 @@ multiclass IntegerToFP<bit isUnsigned, string asm, SDNode node> {
(fdiv (node GPR64:$Rn),
fixedpoint_f64_i64:$scale))]> {
let Inst{31} = 1; // 64-bit GPR flag
- let Inst{22} = 1; // 64-bit FPR flag
+ let Inst{23-22} = 0b01; // 64-bit FPR flag
}
}
@@ -3654,7 +3809,7 @@ class BaseUnscaledConversion<bits<2> rmode, bits<3> opcode,
Sched<[WriteFCopy]> {
bits<5> Rd;
bits<5> Rn;
- let Inst{30-23} = 0b00111100;
+ let Inst{30-24} = 0b0011110;
let Inst{21} = 1;
let Inst{20-19} = rmode;
let Inst{18-16} = opcode;
@@ -3704,26 +3859,49 @@ class BaseUnscaledConversionFromHigh<bits<2> rmode, bits<3> opcode,
}
-
multiclass UnscaledConversion<string asm> {
+ def WHr : BaseUnscaledConversion<0b00, 0b111, GPR32, FPR16, asm> {
+ let Inst{31} = 0; // 32-bit GPR flag
+ let Inst{23-22} = 0b11; // 16-bit FPR flag
+ let Predicates = [HasFullFP16];
+ }
+
+ def XHr : BaseUnscaledConversion<0b00, 0b111, GPR64, FPR16, asm> {
+ let Inst{31} = 1; // 64-bit GPR flag
+ let Inst{23-22} = 0b11; // 16-bit FPR flag
+ let Predicates = [HasFullFP16];
+ }
+
def WSr : BaseUnscaledConversion<0b00, 0b111, GPR32, FPR32, asm> {
let Inst{31} = 0; // 32-bit GPR flag
- let Inst{22} = 0; // 32-bit FPR flag
+ let Inst{23-22} = 0b00; // 32-bit FPR flag
}
def XDr : BaseUnscaledConversion<0b00, 0b111, GPR64, FPR64, asm> {
let Inst{31} = 1; // 64-bit GPR flag
- let Inst{22} = 1; // 64-bit FPR flag
+ let Inst{23-22} = 0b01; // 64-bit FPR flag
+ }
+
+ def HWr : BaseUnscaledConversion<0b00, 0b110, FPR16, GPR32, asm> {
+ let Inst{31} = 0; // 32-bit GPR flag
+ let Inst{23-22} = 0b11; // 16-bit FPR flag
+ let Predicates = [HasFullFP16];
+ }
+
+ def HXr : BaseUnscaledConversion<0b00, 0b110, FPR16, GPR64, asm> {
+ let Inst{31} = 1; // 64-bit GPR flag
+ let Inst{23-22} = 0b11; // 16-bit FPR flag
+ let Predicates = [HasFullFP16];
}
def SWr : BaseUnscaledConversion<0b00, 0b110, FPR32, GPR32, asm> {
let Inst{31} = 0; // 32-bit GPR flag
- let Inst{22} = 0; // 32-bit FPR flag
+ let Inst{23-22} = 0b00; // 32-bit FPR flag
}
def DXr : BaseUnscaledConversion<0b00, 0b110, FPR64, GPR64, asm> {
let Inst{31} = 1; // 64-bit GPR flag
- let Inst{22} = 1; // 64-bit FPR flag
+ let Inst{23-22} = 0b01; // 64-bit FPR flag
}
def XDHighr : BaseUnscaledConversionToHigh<0b01, 0b111, GPR64, V128,
@@ -3796,7 +3974,7 @@ class BaseSingleOperandFPData<bits<4> opcode, RegisterClass regtype,
Sched<[WriteF]> {
bits<5> Rd;
bits<5> Rn;
- let Inst{31-23} = 0b000111100;
+ let Inst{31-24} = 0b00011110;
let Inst{21-19} = 0b100;
let Inst{18-15} = opcode;
let Inst{14-10} = 0b10000;
@@ -3806,12 +3984,17 @@ class BaseSingleOperandFPData<bits<4> opcode, RegisterClass regtype,
multiclass SingleOperandFPData<bits<4> opcode, string asm,
SDPatternOperator node = null_frag> {
+ def Hr : BaseSingleOperandFPData<opcode, FPR16, f16, asm, node> {
+ let Inst{23-22} = 0b11; // 16-bit size flag
+ let Predicates = [HasFullFP16];
+ }
+
def Sr : BaseSingleOperandFPData<opcode, FPR32, f32, asm, node> {
- let Inst{22} = 0; // 32-bit size flag
+ let Inst{23-22} = 0b00; // 32-bit size flag
}
def Dr : BaseSingleOperandFPData<opcode, FPR64, f64, asm, node> {
- let Inst{22} = 1; // 64-bit size flag
+ let Inst{23-22} = 0b01; // 64-bit size flag
}
}
@@ -3828,7 +4011,7 @@ class BaseTwoOperandFPData<bits<4> opcode, RegisterClass regtype,
bits<5> Rd;
bits<5> Rn;
bits<5> Rm;
- let Inst{31-23} = 0b000111100;
+ let Inst{31-24} = 0b00011110;
let Inst{21} = 1;
let Inst{20-16} = Rm;
let Inst{15-12} = opcode;
@@ -3839,28 +4022,41 @@ class BaseTwoOperandFPData<bits<4> opcode, RegisterClass regtype,
multiclass TwoOperandFPData<bits<4> opcode, string asm,
SDPatternOperator node = null_frag> {
+ def Hrr : BaseTwoOperandFPData<opcode, FPR16, asm,
+ [(set (f16 FPR16:$Rd),
+ (node (f16 FPR16:$Rn), (f16 FPR16:$Rm)))]> {
+ let Inst{23-22} = 0b11; // 16-bit size flag
+ let Predicates = [HasFullFP16];
+ }
+
def Srr : BaseTwoOperandFPData<opcode, FPR32, asm,
[(set (f32 FPR32:$Rd),
(node (f32 FPR32:$Rn), (f32 FPR32:$Rm)))]> {
- let Inst{22} = 0; // 32-bit size flag
+ let Inst{23-22} = 0b00; // 32-bit size flag
}
def Drr : BaseTwoOperandFPData<opcode, FPR64, asm,
[(set (f64 FPR64:$Rd),
(node (f64 FPR64:$Rn), (f64 FPR64:$Rm)))]> {
- let Inst{22} = 1; // 64-bit size flag
+ let Inst{23-22} = 0b01; // 64-bit size flag
}
}
multiclass TwoOperandFPDataNeg<bits<4> opcode, string asm, SDNode node> {
+ def Hrr : BaseTwoOperandFPData<opcode, FPR16, asm,
+ [(set FPR16:$Rd, (fneg (node FPR16:$Rn, (f16 FPR16:$Rm))))]> {
+ let Inst{23-22} = 0b11; // 16-bit size flag
+ let Predicates = [HasFullFP16];
+ }
+
def Srr : BaseTwoOperandFPData<opcode, FPR32, asm,
[(set FPR32:$Rd, (fneg (node FPR32:$Rn, (f32 FPR32:$Rm))))]> {
- let Inst{22} = 0; // 32-bit size flag
+ let Inst{23-22} = 0b00; // 32-bit size flag
}
def Drr : BaseTwoOperandFPData<opcode, FPR64, asm,
[(set FPR64:$Rd, (fneg (node FPR64:$Rn, (f64 FPR64:$Rm))))]> {
- let Inst{22} = 1; // 64-bit size flag
+ let Inst{23-22} = 0b01; // 64-bit size flag
}
}
@@ -3878,7 +4074,7 @@ class BaseThreeOperandFPData<bit isNegated, bit isSub,
bits<5> Rn;
bits<5> Rm;
bits<5> Ra;
- let Inst{31-23} = 0b000111110;
+ let Inst{31-24} = 0b00011111;
let Inst{21} = isNegated;
let Inst{20-16} = Rm;
let Inst{15} = isSub;
@@ -3889,16 +4085,23 @@ class BaseThreeOperandFPData<bit isNegated, bit isSub,
multiclass ThreeOperandFPData<bit isNegated, bit isSub,string asm,
SDPatternOperator node> {
+ def Hrrr : BaseThreeOperandFPData<isNegated, isSub, FPR16, asm,
+ [(set FPR16:$Rd,
+ (node (f16 FPR16:$Rn), (f16 FPR16:$Rm), (f16 FPR16:$Ra)))]> {
+ let Inst{23-22} = 0b11; // 16-bit size flag
+ let Predicates = [HasFullFP16];
+ }
+
def Srrr : BaseThreeOperandFPData<isNegated, isSub, FPR32, asm,
[(set FPR32:$Rd,
(node (f32 FPR32:$Rn), (f32 FPR32:$Rm), (f32 FPR32:$Ra)))]> {
- let Inst{22} = 0; // 32-bit size flag
+ let Inst{23-22} = 0b00; // 32-bit size flag
}
def Drrr : BaseThreeOperandFPData<isNegated, isSub, FPR64, asm,
[(set FPR64:$Rd,
(node (f64 FPR64:$Rn), (f64 FPR64:$Rm), (f64 FPR64:$Ra)))]> {
- let Inst{22} = 1; // 64-bit size flag
+ let Inst{23-22} = 0b01; // 64-bit size flag
}
}
@@ -3913,7 +4116,7 @@ class BaseOneOperandFPComparison<bit signalAllNans,
: I<(outs), (ins regtype:$Rn), asm, "\t$Rn, #0.0", "", pat>,
Sched<[WriteFCmp]> {
bits<5> Rn;
- let Inst{31-23} = 0b000111100;
+ let Inst{31-24} = 0b00011110;
let Inst{21} = 1;
let Inst{15-10} = 0b001000;
@@ -3932,7 +4135,7 @@ class BaseTwoOperandFPComparison<bit signalAllNans, RegisterClass regtype,
Sched<[WriteFCmp]> {
bits<5> Rm;
bits<5> Rn;
- let Inst{31-23} = 0b000111100;
+ let Inst{31-24} = 0b00011110;
let Inst{21} = 1;
let Inst{20-16} = Rm;
let Inst{15-10} = 0b001000;
@@ -3944,24 +4147,36 @@ class BaseTwoOperandFPComparison<bit signalAllNans, RegisterClass regtype,
multiclass FPComparison<bit signalAllNans, string asm,
SDPatternOperator OpNode = null_frag> {
let Defs = [NZCV] in {
+ def Hrr : BaseTwoOperandFPComparison<signalAllNans, FPR16, asm,
+ [(OpNode FPR16:$Rn, (f16 FPR16:$Rm)), (implicit NZCV)]> {
+ let Inst{23-22} = 0b11;
+ let Predicates = [HasFullFP16];
+ }
+
+ def Hri : BaseOneOperandFPComparison<signalAllNans, FPR16, asm,
+ [(OpNode (f16 FPR16:$Rn), fpimm0), (implicit NZCV)]> {
+ let Inst{23-22} = 0b11;
+ let Predicates = [HasFullFP16];
+ }
+
def Srr : BaseTwoOperandFPComparison<signalAllNans, FPR32, asm,
[(OpNode FPR32:$Rn, (f32 FPR32:$Rm)), (implicit NZCV)]> {
- let Inst{22} = 0;
+ let Inst{23-22} = 0b00;
}
def Sri : BaseOneOperandFPComparison<signalAllNans, FPR32, asm,
[(OpNode (f32 FPR32:$Rn), fpimm0), (implicit NZCV)]> {
- let Inst{22} = 0;
+ let Inst{23-22} = 0b00;
}
def Drr : BaseTwoOperandFPComparison<signalAllNans, FPR64, asm,
[(OpNode FPR64:$Rn, (f64 FPR64:$Rm)), (implicit NZCV)]> {
- let Inst{22} = 1;
+ let Inst{23-22} = 0b01;
}
def Dri : BaseOneOperandFPComparison<signalAllNans, FPR64, asm,
[(OpNode (f64 FPR64:$Rn), fpimm0), (implicit NZCV)]> {
- let Inst{22} = 1;
+ let Inst{23-22} = 0b01;
}
} // Defs = [NZCV]
}
@@ -3971,17 +4186,20 @@ multiclass FPComparison<bit signalAllNans, string asm,
//---
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
-class BaseFPCondComparison<bit signalAllNans,
- RegisterClass regtype, string asm>
- : I<(outs), (ins regtype:$Rn, regtype:$Rm, imm0_15:$nzcv, ccode:$cond),
- asm, "\t$Rn, $Rm, $nzcv, $cond", "", []>,
+class BaseFPCondComparison<bit signalAllNans, RegisterClass regtype,
+ string mnemonic, list<dag> pat>
+ : I<(outs), (ins regtype:$Rn, regtype:$Rm, imm32_0_15:$nzcv, ccode:$cond),
+ mnemonic, "\t$Rn, $Rm, $nzcv, $cond", "", pat>,
Sched<[WriteFCmp]> {
+ let Uses = [NZCV];
+ let Defs = [NZCV];
+
bits<5> Rn;
bits<5> Rm;
bits<4> nzcv;
bits<4> cond;
- let Inst{31-23} = 0b000111100;
+ let Inst{31-24} = 0b00011110;
let Inst{21} = 1;
let Inst{20-16} = Rm;
let Inst{15-12} = cond;
@@ -3991,16 +4209,24 @@ class BaseFPCondComparison<bit signalAllNans,
let Inst{3-0} = nzcv;
}
-multiclass FPCondComparison<bit signalAllNans, string asm> {
- let Defs = [NZCV], Uses = [NZCV] in {
- def Srr : BaseFPCondComparison<signalAllNans, FPR32, asm> {
- let Inst{22} = 0;
+multiclass FPCondComparison<bit signalAllNans, string mnemonic,
+ SDPatternOperator OpNode = null_frag> {
+ def Hrr : BaseFPCondComparison<signalAllNans, FPR16, mnemonic, []> {
+ let Inst{23-22} = 0b11;
+ let Predicates = [HasFullFP16];
}
- def Drr : BaseFPCondComparison<signalAllNans, FPR64, asm> {
- let Inst{22} = 1;
+ def Srr : BaseFPCondComparison<signalAllNans, FPR32, mnemonic,
+ [(set NZCV, (OpNode (f32 FPR32:$Rn), (f32 FPR32:$Rm), (i32 imm:$nzcv),
+ (i32 imm:$cond), NZCV))]> {
+ let Inst{23-22} = 0b00;
+ }
+
+ def Drr : BaseFPCondComparison<signalAllNans, FPR64, mnemonic,
+ [(set NZCV, (OpNode (f64 FPR64:$Rn), (f64 FPR64:$Rm), (i32 imm:$nzcv),
+ (i32 imm:$cond), NZCV))]> {
+ let Inst{23-22} = 0b01;
}
- } // Defs = [NZCV], Uses = [NZCV]
}
//---
@@ -4019,7 +4245,7 @@ class BaseFPCondSelect<RegisterClass regtype, ValueType vt, string asm>
bits<5> Rm;
bits<4> cond;
- let Inst{31-23} = 0b000111100;
+ let Inst{31-24} = 0b00011110;
let Inst{21} = 1;
let Inst{20-16} = Rm;
let Inst{15-12} = cond;
@@ -4030,12 +4256,17 @@ class BaseFPCondSelect<RegisterClass regtype, ValueType vt, string asm>
multiclass FPCondSelect<string asm> {
let Uses = [NZCV] in {
+ def Hrrr : BaseFPCondSelect<FPR16, f16, asm> {
+ let Inst{23-22} = 0b11;
+ let Predicates = [HasFullFP16];
+ }
+
def Srrr : BaseFPCondSelect<FPR32, f32, asm> {
- let Inst{22} = 0;
+ let Inst{23-22} = 0b00;
}
def Drrr : BaseFPCondSelect<FPR64, f64, asm> {
- let Inst{22} = 1;
+ let Inst{23-22} = 0b01;
}
} // Uses = [NZCV]
}
@@ -4050,7 +4281,7 @@ class BaseFPMoveImmediate<RegisterClass regtype, Operand fpimmtype, string asm>
Sched<[WriteFImm]> {
bits<5> Rd;
bits<8> imm;
- let Inst{31-23} = 0b000111100;
+ let Inst{31-24} = 0b00011110;
let Inst{21} = 1;
let Inst{20-13} = imm;
let Inst{12-5} = 0b10000000;
@@ -4058,12 +4289,17 @@ class BaseFPMoveImmediate<RegisterClass regtype, Operand fpimmtype, string asm>
}
multiclass FPMoveImmediate<string asm> {
+ def Hi : BaseFPMoveImmediate<FPR16, fpimm16, asm> {
+ let Inst{23-22} = 0b11;
+ let Predicates = [HasFullFP16];
+ }
+
def Si : BaseFPMoveImmediate<FPR32, fpimm32, asm> {
- let Inst{22} = 0;
+ let Inst{23-22} = 0b00;
}
def Di : BaseFPMoveImmediate<FPR64, fpimm64, asm> {
- let Inst{22} = 1;
+ let Inst{23-22} = 0b01;
}
}
} // end of 'let Predicates = [HasFPARMv8]'
@@ -4079,7 +4315,7 @@ let Predicates = [HasNEON] in {
//----------------------------------------------------------------------------
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
-class BaseSIMDThreeSameVector<bit Q, bit U, bits<2> size, bits<5> opcode,
+class BaseSIMDThreeSameVector<bit Q, bit U, bits<3> size, bits<5> opcode,
RegisterOperand regtype, string asm, string kind,
list<dag> pattern>
: I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), asm,
@@ -4093,8 +4329,7 @@ class BaseSIMDThreeSameVector<bit Q, bit U, bits<2> size, bits<5> opcode,
let Inst{30} = Q;
let Inst{29} = U;
let Inst{28-24} = 0b01110;
- let Inst{23-22} = size;
- let Inst{21} = 1;
+ let Inst{23-21} = size;
let Inst{20-16} = Rm;
let Inst{15-11} = opcode;
let Inst{10} = 1;
@@ -4103,7 +4338,7 @@ class BaseSIMDThreeSameVector<bit Q, bit U, bits<2> size, bits<5> opcode,
}
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
-class BaseSIMDThreeSameVectorTied<bit Q, bit U, bits<2> size, bits<5> opcode,
+class BaseSIMDThreeSameVectorTied<bit Q, bit U, bits<3> size, bits<5> opcode,
RegisterOperand regtype, string asm, string kind,
list<dag> pattern>
: I<(outs regtype:$dst), (ins regtype:$Rd, regtype:$Rn, regtype:$Rm), asm,
@@ -4117,8 +4352,7 @@ class BaseSIMDThreeSameVectorTied<bit Q, bit U, bits<2> size, bits<5> opcode,
let Inst{30} = Q;
let Inst{29} = U;
let Inst{28-24} = 0b01110;
- let Inst{23-22} = size;
- let Inst{21} = 1;
+ let Inst{23-21} = size;
let Inst{20-16} = Rm;
let Inst{15-11} = opcode;
let Inst{10} = 1;
@@ -4129,25 +4363,25 @@ class BaseSIMDThreeSameVectorTied<bit Q, bit U, bits<2> size, bits<5> opcode,
// All operand sizes distinguished in the encoding.
multiclass SIMDThreeSameVector<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode> {
- def v8i8 : BaseSIMDThreeSameVector<0, U, 0b00, opc, V64,
+ def v8i8 : BaseSIMDThreeSameVector<0, U, 0b001, opc, V64,
asm, ".8b",
[(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>;
- def v16i8 : BaseSIMDThreeSameVector<1, U, 0b00, opc, V128,
+ def v16i8 : BaseSIMDThreeSameVector<1, U, 0b001, opc, V128,
asm, ".16b",
[(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn), (v16i8 V128:$Rm)))]>;
- def v4i16 : BaseSIMDThreeSameVector<0, U, 0b01, opc, V64,
+ def v4i16 : BaseSIMDThreeSameVector<0, U, 0b011, opc, V64,
asm, ".4h",
[(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>;
- def v8i16 : BaseSIMDThreeSameVector<1, U, 0b01, opc, V128,
+ def v8i16 : BaseSIMDThreeSameVector<1, U, 0b011, opc, V128,
asm, ".8h",
[(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn), (v8i16 V128:$Rm)))]>;
- def v2i32 : BaseSIMDThreeSameVector<0, U, 0b10, opc, V64,
+ def v2i32 : BaseSIMDThreeSameVector<0, U, 0b101, opc, V64,
asm, ".2s",
[(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>;
- def v4i32 : BaseSIMDThreeSameVector<1, U, 0b10, opc, V128,
+ def v4i32 : BaseSIMDThreeSameVector<1, U, 0b101, opc, V128,
asm, ".4s",
[(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn), (v4i32 V128:$Rm)))]>;
- def v2i64 : BaseSIMDThreeSameVector<1, U, 0b11, opc, V128,
+ def v2i64 : BaseSIMDThreeSameVector<1, U, 0b111, opc, V128,
asm, ".2d",
[(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn), (v2i64 V128:$Rm)))]>;
}
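
(The 'size' widening above folds the previously hard-wired Inst{21} = 1 into
the low bit of a 3-bit field, which is why every existing integer def changes
0b00/0b01/0b10/0b11 into 0b001/0b011/0b101/0b111, while the new half-precision
forms are free to clear Inst{21}, e.g. {S,0b10}. A quick sketch of that
mapping:)

#include <cstdio>

static unsigned widenIntSize(unsigned OldSize) {  // old bits<2> value, 0..3
  return (OldSize << 1) | 1;                      // Inst{21} folded into bit 0
}

int main() {
  for (unsigned S = 0; S <= 3; ++S) {
    unsigned W = widenIntSize(S);
    std::printf("old size 0b%u%u -> new size 0b%u%u%u\n",
                (S >> 1) & 1, S & 1, (W >> 2) & 1, (W >> 1) & 1, W & 1);
  }
  return 0;
}
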
@@ -4155,49 +4389,49 @@ multiclass SIMDThreeSameVector<bit U, bits<5> opc, string asm,
// As above, but D sized elements unsupported.
multiclass SIMDThreeSameVectorBHS<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode> {
- def v8i8 : BaseSIMDThreeSameVector<0, U, 0b00, opc, V64,
+ def v8i8 : BaseSIMDThreeSameVector<0, U, 0b001, opc, V64,
asm, ".8b",
[(set V64:$Rd, (v8i8 (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm))))]>;
- def v16i8 : BaseSIMDThreeSameVector<1, U, 0b00, opc, V128,
+ def v16i8 : BaseSIMDThreeSameVector<1, U, 0b001, opc, V128,
asm, ".16b",
[(set V128:$Rd, (v16i8 (OpNode (v16i8 V128:$Rn), (v16i8 V128:$Rm))))]>;
- def v4i16 : BaseSIMDThreeSameVector<0, U, 0b01, opc, V64,
+ def v4i16 : BaseSIMDThreeSameVector<0, U, 0b011, opc, V64,
asm, ".4h",
[(set V64:$Rd, (v4i16 (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm))))]>;
- def v8i16 : BaseSIMDThreeSameVector<1, U, 0b01, opc, V128,
+ def v8i16 : BaseSIMDThreeSameVector<1, U, 0b011, opc, V128,
asm, ".8h",
[(set V128:$Rd, (v8i16 (OpNode (v8i16 V128:$Rn), (v8i16 V128:$Rm))))]>;
- def v2i32 : BaseSIMDThreeSameVector<0, U, 0b10, opc, V64,
+ def v2i32 : BaseSIMDThreeSameVector<0, U, 0b101, opc, V64,
asm, ".2s",
[(set V64:$Rd, (v2i32 (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm))))]>;
- def v4i32 : BaseSIMDThreeSameVector<1, U, 0b10, opc, V128,
+ def v4i32 : BaseSIMDThreeSameVector<1, U, 0b101, opc, V128,
asm, ".4s",
[(set V128:$Rd, (v4i32 (OpNode (v4i32 V128:$Rn), (v4i32 V128:$Rm))))]>;
}
multiclass SIMDThreeSameVectorBHSTied<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode> {
- def v8i8 : BaseSIMDThreeSameVectorTied<0, U, 0b00, opc, V64,
+ def v8i8 : BaseSIMDThreeSameVectorTied<0, U, 0b001, opc, V64,
asm, ".8b",
[(set (v8i8 V64:$dst),
(OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>;
- def v16i8 : BaseSIMDThreeSameVectorTied<1, U, 0b00, opc, V128,
+ def v16i8 : BaseSIMDThreeSameVectorTied<1, U, 0b001, opc, V128,
asm, ".16b",
[(set (v16i8 V128:$dst),
(OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn), (v16i8 V128:$Rm)))]>;
- def v4i16 : BaseSIMDThreeSameVectorTied<0, U, 0b01, opc, V64,
+ def v4i16 : BaseSIMDThreeSameVectorTied<0, U, 0b011, opc, V64,
asm, ".4h",
[(set (v4i16 V64:$dst),
(OpNode (v4i16 V64:$Rd), (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>;
- def v8i16 : BaseSIMDThreeSameVectorTied<1, U, 0b01, opc, V128,
+ def v8i16 : BaseSIMDThreeSameVectorTied<1, U, 0b011, opc, V128,
asm, ".8h",
[(set (v8i16 V128:$dst),
(OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn), (v8i16 V128:$Rm)))]>;
- def v2i32 : BaseSIMDThreeSameVectorTied<0, U, 0b10, opc, V64,
+ def v2i32 : BaseSIMDThreeSameVectorTied<0, U, 0b101, opc, V64,
asm, ".2s",
[(set (v2i32 V64:$dst),
(OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>;
- def v4i32 : BaseSIMDThreeSameVectorTied<1, U, 0b10, opc, V128,
+ def v4i32 : BaseSIMDThreeSameVectorTied<1, U, 0b101, opc, V128,
asm, ".4s",
[(set (v4i32 V128:$dst),
(OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn), (v4i32 V128:$Rm)))]>;
@@ -4206,54 +4440,80 @@ multiclass SIMDThreeSameVectorBHSTied<bit U, bits<5> opc, string asm,
// As above, but only B sized elements supported.
multiclass SIMDThreeSameVectorB<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode> {
- def v8i8 : BaseSIMDThreeSameVector<0, U, 0b00, opc, V64,
+ def v8i8 : BaseSIMDThreeSameVector<0, U, 0b001, opc, V64,
asm, ".8b",
[(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>;
- def v16i8 : BaseSIMDThreeSameVector<1, U, 0b00, opc, V128,
+ def v16i8 : BaseSIMDThreeSameVector<1, U, 0b001, opc, V128,
asm, ".16b",
[(set (v16i8 V128:$Rd),
(OpNode (v16i8 V128:$Rn), (v16i8 V128:$Rm)))]>;
}
-// As above, but only S and D sized floating point elements supported.
-multiclass SIMDThreeSameVectorFP<bit U, bit S, bits<5> opc,
+// As above, but only floating point elements supported.
+multiclass SIMDThreeSameVectorFP<bit U, bit S, bits<3> opc,
string asm, SDPatternOperator OpNode> {
- def v2f32 : BaseSIMDThreeSameVector<0, U, {S,0}, opc, V64,
+ let Predicates = [HasNEON, HasFullFP16] in {
+ def v4f16 : BaseSIMDThreeSameVector<0, U, {S,0b10}, {0b00,opc}, V64,
+ asm, ".4h",
+ [(set (v4f16 V64:$Rd), (OpNode (v4f16 V64:$Rn), (v4f16 V64:$Rm)))]>;
+ def v8f16 : BaseSIMDThreeSameVector<1, U, {S,0b10}, {0b00,opc}, V128,
+ asm, ".8h",
+ [(set (v8f16 V128:$Rd), (OpNode (v8f16 V128:$Rn), (v8f16 V128:$Rm)))]>;
+ } // Predicates = [HasNEON, HasFullFP16]
+ def v2f32 : BaseSIMDThreeSameVector<0, U, {S,0b01}, {0b11,opc}, V64,
asm, ".2s",
[(set (v2f32 V64:$Rd), (OpNode (v2f32 V64:$Rn), (v2f32 V64:$Rm)))]>;
- def v4f32 : BaseSIMDThreeSameVector<1, U, {S,0}, opc, V128,
+ def v4f32 : BaseSIMDThreeSameVector<1, U, {S,0b01}, {0b11,opc}, V128,
asm, ".4s",
[(set (v4f32 V128:$Rd), (OpNode (v4f32 V128:$Rn), (v4f32 V128:$Rm)))]>;
- def v2f64 : BaseSIMDThreeSameVector<1, U, {S,1}, opc, V128,
+ def v2f64 : BaseSIMDThreeSameVector<1, U, {S,0b11}, {0b11,opc}, V128,
asm, ".2d",
[(set (v2f64 V128:$Rd), (OpNode (v2f64 V128:$Rn), (v2f64 V128:$Rm)))]>;
}
-multiclass SIMDThreeSameVectorFPCmp<bit U, bit S, bits<5> opc,
+multiclass SIMDThreeSameVectorFPCmp<bit U, bit S, bits<3> opc,
string asm,
SDPatternOperator OpNode> {
- def v2f32 : BaseSIMDThreeSameVector<0, U, {S,0}, opc, V64,
+ let Predicates = [HasNEON, HasFullFP16] in {
+ def v4f16 : BaseSIMDThreeSameVector<0, U, {S,0b10}, {0b00,opc}, V64,
+ asm, ".4h",
+ [(set (v4i16 V64:$Rd), (OpNode (v4f16 V64:$Rn), (v4f16 V64:$Rm)))]>;
+ def v8f16 : BaseSIMDThreeSameVector<1, U, {S,0b10}, {0b00,opc}, V128,
+ asm, ".8h",
+ [(set (v8i16 V128:$Rd), (OpNode (v8f16 V128:$Rn), (v8f16 V128:$Rm)))]>;
+ } // Predicates = [HasNEON, HasFullFP16]
+ def v2f32 : BaseSIMDThreeSameVector<0, U, {S,0b01}, {0b11,opc}, V64,
asm, ".2s",
[(set (v2i32 V64:$Rd), (OpNode (v2f32 V64:$Rn), (v2f32 V64:$Rm)))]>;
- def v4f32 : BaseSIMDThreeSameVector<1, U, {S,0}, opc, V128,
+ def v4f32 : BaseSIMDThreeSameVector<1, U, {S,0b01}, {0b11,opc}, V128,
asm, ".4s",
[(set (v4i32 V128:$Rd), (OpNode (v4f32 V128:$Rn), (v4f32 V128:$Rm)))]>;
- def v2f64 : BaseSIMDThreeSameVector<1, U, {S,1}, opc, V128,
+ def v2f64 : BaseSIMDThreeSameVector<1, U, {S,0b11}, {0b11,opc}, V128,
asm, ".2d",
[(set (v2i64 V128:$Rd), (OpNode (v2f64 V128:$Rn), (v2f64 V128:$Rm)))]>;
}
-multiclass SIMDThreeSameVectorFPTied<bit U, bit S, bits<5> opc,
+multiclass SIMDThreeSameVectorFPTied<bit U, bit S, bits<3> opc,
string asm, SDPatternOperator OpNode> {
- def v2f32 : BaseSIMDThreeSameVectorTied<0, U, {S,0}, opc, V64,
+ let Predicates = [HasNEON, HasFullFP16] in {
+ def v4f16 : BaseSIMDThreeSameVectorTied<0, U, {S,0b10}, {0b00,opc}, V64,
+ asm, ".4h",
+ [(set (v4f16 V64:$dst),
+ (OpNode (v4f16 V64:$Rd), (v4f16 V64:$Rn), (v4f16 V64:$Rm)))]>;
+ def v8f16 : BaseSIMDThreeSameVectorTied<1, U, {S,0b10}, {0b00,opc}, V128,
+ asm, ".8h",
+ [(set (v8f16 V128:$dst),
+ (OpNode (v8f16 V128:$Rd), (v8f16 V128:$Rn), (v8f16 V128:$Rm)))]>;
+ } // Predicates = [HasNEON, HasFullFP16]
+ def v2f32 : BaseSIMDThreeSameVectorTied<0, U, {S,0b01}, {0b11,opc}, V64,
asm, ".2s",
[(set (v2f32 V64:$dst),
(OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn), (v2f32 V64:$Rm)))]>;
- def v4f32 : BaseSIMDThreeSameVectorTied<1, U, {S,0}, opc, V128,
+ def v4f32 : BaseSIMDThreeSameVectorTied<1, U, {S,0b01}, {0b11,opc}, V128,
asm, ".4s",
[(set (v4f32 V128:$dst),
(OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn), (v4f32 V128:$Rm)))]>;
- def v2f64 : BaseSIMDThreeSameVectorTied<1, U, {S,1}, opc, V128,
+ def v2f64 : BaseSIMDThreeSameVectorTied<1, U, {S,0b11}, {0b11,opc}, V128,
asm, ".2d",
[(set (v2f64 V128:$dst),
(OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn), (v2f64 V128:$Rm)))]>;
@@ -4262,16 +4522,16 @@ multiclass SIMDThreeSameVectorFPTied<bit U, bit S, bits<5> opc,
// As above, but D and B sized elements unsupported.
multiclass SIMDThreeSameVectorHS<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode> {
- def v4i16 : BaseSIMDThreeSameVector<0, U, 0b01, opc, V64,
+ def v4i16 : BaseSIMDThreeSameVector<0, U, 0b011, opc, V64,
asm, ".4h",
[(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>;
- def v8i16 : BaseSIMDThreeSameVector<1, U, 0b01, opc, V128,
+ def v8i16 : BaseSIMDThreeSameVector<1, U, 0b011, opc, V128,
asm, ".8h",
[(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn), (v8i16 V128:$Rm)))]>;
- def v2i32 : BaseSIMDThreeSameVector<0, U, 0b10, opc, V64,
+ def v2i32 : BaseSIMDThreeSameVector<0, U, 0b101, opc, V64,
asm, ".2s",
[(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>;
- def v4i32 : BaseSIMDThreeSameVector<1, U, 0b10, opc, V128,
+ def v4i32 : BaseSIMDThreeSameVector<1, U, 0b101, opc, V128,
asm, ".4s",
[(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn), (v4i32 V128:$Rm)))]>;
}
@@ -4279,10 +4539,10 @@ multiclass SIMDThreeSameVectorHS<bit U, bits<5> opc, string asm,
// Logical three vector ops share opcode bits, and only use B sized elements.
multiclass SIMDLogicalThreeVector<bit U, bits<2> size, string asm,
SDPatternOperator OpNode = null_frag> {
- def v8i8 : BaseSIMDThreeSameVector<0, U, size, 0b00011, V64,
+ def v8i8 : BaseSIMDThreeSameVector<0, U, {size,1}, 0b00011, V64,
asm, ".8b",
[(set (v8i8 V64:$Rd), (OpNode V64:$Rn, V64:$Rm))]>;
- def v16i8 : BaseSIMDThreeSameVector<1, U, size, 0b00011, V128,
+ def v16i8 : BaseSIMDThreeSameVector<1, U, {size,1}, 0b00011, V128,
asm, ".16b",
[(set (v16i8 V128:$Rd), (OpNode V128:$Rn, V128:$Rm))]>;
@@ -4303,11 +4563,11 @@ multiclass SIMDLogicalThreeVector<bit U, bits<2> size, string asm,
multiclass SIMDLogicalThreeVectorTied<bit U, bits<2> size,
string asm, SDPatternOperator OpNode> {
- def v8i8 : BaseSIMDThreeSameVectorTied<0, U, size, 0b00011, V64,
+ def v8i8 : BaseSIMDThreeSameVectorTied<0, U, {size,1}, 0b00011, V64,
asm, ".8b",
[(set (v8i8 V64:$dst),
(OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>;
- def v16i8 : BaseSIMDThreeSameVectorTied<1, U, size, 0b00011, V128,
+ def v16i8 : BaseSIMDThreeSameVectorTied<1, U, {size,1}, 0b00011, V128,
asm, ".16b",
[(set (v16i8 V128:$dst),
(OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn),
@@ -4347,8 +4607,8 @@ multiclass SIMDLogicalThreeVectorTied<bit U, bits<2> size,
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
class BaseSIMDTwoSameVector<bit Q, bit U, bits<2> size, bits<5> opcode,
- RegisterOperand regtype, string asm, string dstkind,
- string srckind, list<dag> pattern>
+ bits<2> size2, RegisterOperand regtype, string asm,
+ string dstkind, string srckind, list<dag> pattern>
: I<(outs regtype:$Rd), (ins regtype:$Rn), asm,
"{\t$Rd" # dstkind # ", $Rn" # srckind #
"|" # dstkind # "\t$Rd, $Rn}", "", pattern>,
@@ -4360,7 +4620,9 @@ class BaseSIMDTwoSameVector<bit Q, bit U, bits<2> size, bits<5> opcode,
let Inst{29} = U;
let Inst{28-24} = 0b01110;
let Inst{23-22} = size;
- let Inst{21-17} = 0b10000;
+ let Inst{21} = 0b1;
+ let Inst{20-19} = size2;
+ let Inst{18-17} = 0b00;
let Inst{16-12} = opcode;
let Inst{11-10} = 0b10;
let Inst{9-5} = Rn;
@@ -4369,8 +4631,9 @@ class BaseSIMDTwoSameVector<bit Q, bit U, bits<2> size, bits<5> opcode,
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
class BaseSIMDTwoSameVectorTied<bit Q, bit U, bits<2> size, bits<5> opcode,
- RegisterOperand regtype, string asm, string dstkind,
- string srckind, list<dag> pattern>
+ bits<2> size2, RegisterOperand regtype,
+ string asm, string dstkind, string srckind,
+ list<dag> pattern>
: I<(outs regtype:$dst), (ins regtype:$Rd, regtype:$Rn), asm,
"{\t$Rd" # dstkind # ", $Rn" # srckind #
"|" # dstkind # "\t$Rd, $Rn}", "$Rd = $dst", pattern>,
@@ -4382,7 +4645,9 @@ class BaseSIMDTwoSameVectorTied<bit Q, bit U, bits<2> size, bits<5> opcode,
let Inst{29} = U;
let Inst{28-24} = 0b01110;
let Inst{23-22} = size;
- let Inst{21-17} = 0b10000;
+ let Inst{21} = 0b1;
+ let Inst{20-19} = size2;
+ let Inst{18-17} = 0b00;
let Inst{16-12} = opcode;
let Inst{11-10} = 0b10;
let Inst{9-5} = Rn;
@@ -4392,22 +4657,22 @@ class BaseSIMDTwoSameVectorTied<bit Q, bit U, bits<2> size, bits<5> opcode,
// Supports B, H, and S element sizes.
multiclass SIMDTwoVectorBHS<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode> {
- def v8i8 : BaseSIMDTwoSameVector<0, U, 0b00, opc, V64,
+ def v8i8 : BaseSIMDTwoSameVector<0, U, 0b00, opc, 0b00, V64,
asm, ".8b", ".8b",
[(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>;
- def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, V128,
+ def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, 0b00, V128,
asm, ".16b", ".16b",
[(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>;
- def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, V64,
+ def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, 0b00, V64,
asm, ".4h", ".4h",
[(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn)))]>;
- def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, V128,
+ def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, 0b00, V128,
asm, ".8h", ".8h",
[(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn)))]>;
- def v2i32 : BaseSIMDTwoSameVector<0, U, 0b10, opc, V64,
+ def v2i32 : BaseSIMDTwoSameVector<0, U, 0b10, opc, 0b00, V64,
asm, ".2s", ".2s",
[(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>;
- def v4i32 : BaseSIMDTwoSameVector<1, U, 0b10, opc, V128,
+ def v4i32 : BaseSIMDTwoSameVector<1, U, 0b10, opc, 0b00, V128,
asm, ".4s", ".4s",
[(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>;
}
@@ -4450,49 +4715,49 @@ multiclass SIMDVectorLShiftLongBySizeBHS {
// Supports all element sizes.
multiclass SIMDLongTwoVector<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode> {
- def v8i8_v4i16 : BaseSIMDTwoSameVector<0, U, 0b00, opc, V64,
+ def v8i8_v4i16 : BaseSIMDTwoSameVector<0, U, 0b00, opc, 0b00, V64,
asm, ".4h", ".8b",
[(set (v4i16 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>;
- def v16i8_v8i16 : BaseSIMDTwoSameVector<1, U, 0b00, opc, V128,
+ def v16i8_v8i16 : BaseSIMDTwoSameVector<1, U, 0b00, opc, 0b00, V128,
asm, ".8h", ".16b",
[(set (v8i16 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>;
- def v4i16_v2i32 : BaseSIMDTwoSameVector<0, U, 0b01, opc, V64,
+ def v4i16_v2i32 : BaseSIMDTwoSameVector<0, U, 0b01, opc, 0b00, V64,
asm, ".2s", ".4h",
[(set (v2i32 V64:$Rd), (OpNode (v4i16 V64:$Rn)))]>;
- def v8i16_v4i32 : BaseSIMDTwoSameVector<1, U, 0b01, opc, V128,
+ def v8i16_v4i32 : BaseSIMDTwoSameVector<1, U, 0b01, opc, 0b00, V128,
asm, ".4s", ".8h",
[(set (v4i32 V128:$Rd), (OpNode (v8i16 V128:$Rn)))]>;
- def v2i32_v1i64 : BaseSIMDTwoSameVector<0, U, 0b10, opc, V64,
+ def v2i32_v1i64 : BaseSIMDTwoSameVector<0, U, 0b10, opc, 0b00, V64,
asm, ".1d", ".2s",
[(set (v1i64 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>;
- def v4i32_v2i64 : BaseSIMDTwoSameVector<1, U, 0b10, opc, V128,
+ def v4i32_v2i64 : BaseSIMDTwoSameVector<1, U, 0b10, opc, 0b00, V128,
asm, ".2d", ".4s",
[(set (v2i64 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>;
}
multiclass SIMDLongTwoVectorTied<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode> {
- def v8i8_v4i16 : BaseSIMDTwoSameVectorTied<0, U, 0b00, opc, V64,
+ def v8i8_v4i16 : BaseSIMDTwoSameVectorTied<0, U, 0b00, opc, 0b00, V64,
asm, ".4h", ".8b",
[(set (v4i16 V64:$dst), (OpNode (v4i16 V64:$Rd),
(v8i8 V64:$Rn)))]>;
- def v16i8_v8i16 : BaseSIMDTwoSameVectorTied<1, U, 0b00, opc, V128,
+ def v16i8_v8i16 : BaseSIMDTwoSameVectorTied<1, U, 0b00, opc, 0b00, V128,
asm, ".8h", ".16b",
[(set (v8i16 V128:$dst), (OpNode (v8i16 V128:$Rd),
(v16i8 V128:$Rn)))]>;
- def v4i16_v2i32 : BaseSIMDTwoSameVectorTied<0, U, 0b01, opc, V64,
+ def v4i16_v2i32 : BaseSIMDTwoSameVectorTied<0, U, 0b01, opc, 0b00, V64,
asm, ".2s", ".4h",
[(set (v2i32 V64:$dst), (OpNode (v2i32 V64:$Rd),
(v4i16 V64:$Rn)))]>;
- def v8i16_v4i32 : BaseSIMDTwoSameVectorTied<1, U, 0b01, opc, V128,
+ def v8i16_v4i32 : BaseSIMDTwoSameVectorTied<1, U, 0b01, opc, 0b00, V128,
asm, ".4s", ".8h",
[(set (v4i32 V128:$dst), (OpNode (v4i32 V128:$Rd),
(v8i16 V128:$Rn)))]>;
- def v2i32_v1i64 : BaseSIMDTwoSameVectorTied<0, U, 0b10, opc, V64,
+ def v2i32_v1i64 : BaseSIMDTwoSameVectorTied<0, U, 0b10, opc, 0b00, V64,
asm, ".1d", ".2s",
[(set (v1i64 V64:$dst), (OpNode (v1i64 V64:$Rd),
(v2i32 V64:$Rn)))]>;
- def v4i32_v2i64 : BaseSIMDTwoSameVectorTied<1, U, 0b10, opc, V128,
+ def v4i32_v2i64 : BaseSIMDTwoSameVectorTied<1, U, 0b10, opc, 0b00, V128,
asm, ".2d", ".4s",
[(set (v2i64 V128:$dst), (OpNode (v2i64 V128:$Rd),
(v4i32 V128:$Rn)))]>;
@@ -4501,50 +4766,50 @@ multiclass SIMDLongTwoVectorTied<bit U, bits<5> opc, string asm,
// Supports all element sizes, except 1xD.
multiclass SIMDTwoVectorBHSDTied<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode> {
- def v8i8 : BaseSIMDTwoSameVectorTied<0, U, 0b00, opc, V64,
+ def v8i8 : BaseSIMDTwoSameVectorTied<0, U, 0b00, opc, 0b00, V64,
asm, ".8b", ".8b",
[(set (v8i8 V64:$dst), (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn)))]>;
- def v16i8 : BaseSIMDTwoSameVectorTied<1, U, 0b00, opc, V128,
+ def v16i8 : BaseSIMDTwoSameVectorTied<1, U, 0b00, opc, 0b00, V128,
asm, ".16b", ".16b",
[(set (v16i8 V128:$dst), (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn)))]>;
- def v4i16 : BaseSIMDTwoSameVectorTied<0, U, 0b01, opc, V64,
+ def v4i16 : BaseSIMDTwoSameVectorTied<0, U, 0b01, opc, 0b00, V64,
asm, ".4h", ".4h",
[(set (v4i16 V64:$dst), (OpNode (v4i16 V64:$Rd), (v4i16 V64:$Rn)))]>;
- def v8i16 : BaseSIMDTwoSameVectorTied<1, U, 0b01, opc, V128,
+ def v8i16 : BaseSIMDTwoSameVectorTied<1, U, 0b01, opc, 0b00, V128,
asm, ".8h", ".8h",
[(set (v8i16 V128:$dst), (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn)))]>;
- def v2i32 : BaseSIMDTwoSameVectorTied<0, U, 0b10, opc, V64,
+ def v2i32 : BaseSIMDTwoSameVectorTied<0, U, 0b10, opc, 0b00, V64,
asm, ".2s", ".2s",
[(set (v2i32 V64:$dst), (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn)))]>;
- def v4i32 : BaseSIMDTwoSameVectorTied<1, U, 0b10, opc, V128,
+ def v4i32 : BaseSIMDTwoSameVectorTied<1, U, 0b10, opc, 0b00, V128,
asm, ".4s", ".4s",
[(set (v4i32 V128:$dst), (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn)))]>;
- def v2i64 : BaseSIMDTwoSameVectorTied<1, U, 0b11, opc, V128,
+ def v2i64 : BaseSIMDTwoSameVectorTied<1, U, 0b11, opc, 0b00, V128,
asm, ".2d", ".2d",
[(set (v2i64 V128:$dst), (OpNode (v2i64 V128:$Rd), (v2i64 V128:$Rn)))]>;
}
multiclass SIMDTwoVectorBHSD<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode = null_frag> {
- def v8i8 : BaseSIMDTwoSameVector<0, U, 0b00, opc, V64,
+ def v8i8 : BaseSIMDTwoSameVector<0, U, 0b00, opc, 0b00, V64,
asm, ".8b", ".8b",
[(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>;
- def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, V128,
+ def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, 0b00, V128,
asm, ".16b", ".16b",
[(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>;
- def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, V64,
+ def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, 0b00, V64,
asm, ".4h", ".4h",
[(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn)))]>;
- def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, V128,
+ def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, 0b00, V128,
asm, ".8h", ".8h",
[(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn)))]>;
- def v2i32 : BaseSIMDTwoSameVector<0, U, 0b10, opc, V64,
+ def v2i32 : BaseSIMDTwoSameVector<0, U, 0b10, opc, 0b00, V64,
asm, ".2s", ".2s",
[(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>;
- def v4i32 : BaseSIMDTwoSameVector<1, U, 0b10, opc, V128,
+ def v4i32 : BaseSIMDTwoSameVector<1, U, 0b10, opc, 0b00, V128,
asm, ".4s", ".4s",
[(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>;
- def v2i64 : BaseSIMDTwoSameVector<1, U, 0b11, opc, V128,
+ def v2i64 : BaseSIMDTwoSameVector<1, U, 0b11, opc, 0b00, V128,
asm, ".2d", ".2d",
[(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn)))]>;
}
@@ -4553,10 +4818,10 @@ multiclass SIMDTwoVectorBHSD<bit U, bits<5> opc, string asm,
// Supports only B element sizes.
multiclass SIMDTwoVectorB<bit U, bits<2> size, bits<5> opc, string asm,
SDPatternOperator OpNode> {
- def v8i8 : BaseSIMDTwoSameVector<0, U, size, opc, V64,
+ def v8i8 : BaseSIMDTwoSameVector<0, U, size, opc, 0b00, V64,
asm, ".8b", ".8b",
[(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>;
- def v16i8 : BaseSIMDTwoSameVector<1, U, size, opc, V128,
+ def v16i8 : BaseSIMDTwoSameVector<1, U, size, opc, 0b00, V128,
asm, ".16b", ".16b",
[(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>;
@@ -4565,16 +4830,16 @@ multiclass SIMDTwoVectorB<bit U, bits<2> size, bits<5> opc, string asm,
// Supports only B and H element sizes.
multiclass SIMDTwoVectorBH<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode> {
- def v8i8 : BaseSIMDTwoSameVector<0, U, 0b00, opc, V64,
+ def v8i8 : BaseSIMDTwoSameVector<0, U, 0b00, opc, 0b00, V64,
asm, ".8b", ".8b",
[(set (v8i8 V64:$Rd), (OpNode V64:$Rn))]>;
- def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, V128,
+ def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, 0b00, V128,
asm, ".16b", ".16b",
[(set (v16i8 V128:$Rd), (OpNode V128:$Rn))]>;
- def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, V64,
+ def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, 0b00, V64,
asm, ".4h", ".4h",
[(set (v4i16 V64:$Rd), (OpNode V64:$Rn))]>;
- def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, V128,
+ def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, 0b00, V128,
asm, ".8h", ".8h",
[(set (v8i16 V128:$Rd), (OpNode V128:$Rn))]>;
}
@@ -4583,13 +4848,21 @@ multiclass SIMDTwoVectorBH<bit U, bits<5> opc, string asm,
// as an extra opcode bit.
multiclass SIMDTwoVectorFP<bit U, bit S, bits<5> opc, string asm,
SDPatternOperator OpNode> {
- def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, V64,
+ let Predicates = [HasNEON, HasFullFP16] in {
+ def v4f16 : BaseSIMDTwoSameVector<0, U, {S,1}, opc, 0b11, V64,
+ asm, ".4h", ".4h",
+ [(set (v4f16 V64:$Rd), (OpNode (v4f16 V64:$Rn)))]>;
+ def v8f16 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, 0b11, V128,
+ asm, ".8h", ".8h",
+ [(set (v8f16 V128:$Rd), (OpNode (v8f16 V128:$Rn)))]>;
+ } // Predicates = [HasNEON, HasFullFP16]
+ def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, 0b00, V64,
asm, ".2s", ".2s",
[(set (v2f32 V64:$Rd), (OpNode (v2f32 V64:$Rn)))]>;
- def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, V128,
+ def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, 0b00, V128,
asm, ".4s", ".4s",
[(set (v4f32 V128:$Rd), (OpNode (v4f32 V128:$Rn)))]>;
- def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, V128,
+ def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, 0b00, V128,
asm, ".2d", ".2d",
[(set (v2f64 V128:$Rd), (OpNode (v2f64 V128:$Rn)))]>;
}
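
(In the two-operand classes the disambiguation works differently: 'size' stays
two bits and a new size2 field occupies Inst{20-19}, previously fixed zeros
inside the 0b10000 run. The v4f16/v8f16 defs reuse size = {S,1}, the pattern
the double-precision forms already use, and are told apart from v2f64 only by
size2 = 0b11. The snippet below just restates the field values visible in the
defs above.)

#include <cstdio>

struct TwoVecFPFields {
  const char *Kind;
  unsigned SizeLow;   // low bit of Inst{23-22}, the 'x' in {S,x}
  unsigned Size2;     // Inst{20-19}
};

int main() {
  const TwoVecFPFields Rows[] = {
      {".4h/.8h (HasFullFP16)", 1, 0x3},  // half precision
      {".2s/.4s",               0, 0x0},  // single precision
      {".2d",                   1, 0x0},  // double precision
  };
  for (const TwoVecFPFields &R : Rows)
    std::printf("%-22s size{0}=%u size2=0b%u%u\n", R.Kind, R.SizeLow,
                (R.Size2 >> 1) & 1, R.Size2 & 1);
  return 0;
}
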
@@ -4597,10 +4870,10 @@ multiclass SIMDTwoVectorFP<bit U, bit S, bits<5> opc, string asm,
// Supports only S element size.
multiclass SIMDTwoVectorS<bit U, bit S, bits<5> opc, string asm,
SDPatternOperator OpNode> {
- def v2i32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, V64,
+ def v2i32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, 0b00, V64,
asm, ".2s", ".2s",
[(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>;
- def v4i32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, V128,
+ def v4i32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, 0b00, V128,
asm, ".4s", ".4s",
[(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>;
}
@@ -4608,26 +4881,42 @@ multiclass SIMDTwoVectorS<bit U, bit S, bits<5> opc, string asm,
multiclass SIMDTwoVectorFPToInt<bit U, bit S, bits<5> opc, string asm,
SDPatternOperator OpNode> {
- def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, V64,
+ let Predicates = [HasNEON, HasFullFP16] in {
+ def v4f16 : BaseSIMDTwoSameVector<0, U, {S,1}, opc, 0b11, V64,
+ asm, ".4h", ".4h",
+ [(set (v4i16 V64:$Rd), (OpNode (v4f16 V64:$Rn)))]>;
+ def v8f16 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, 0b11, V128,
+ asm, ".8h", ".8h",
+ [(set (v8i16 V128:$Rd), (OpNode (v8f16 V128:$Rn)))]>;
+ } // Predicates = [HasNEON, HasFullFP16]
+ def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, 0b00, V64,
asm, ".2s", ".2s",
[(set (v2i32 V64:$Rd), (OpNode (v2f32 V64:$Rn)))]>;
- def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, V128,
+ def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, 0b00, V128,
asm, ".4s", ".4s",
[(set (v4i32 V128:$Rd), (OpNode (v4f32 V128:$Rn)))]>;
- def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, V128,
+ def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, 0b00, V128,
asm, ".2d", ".2d",
[(set (v2i64 V128:$Rd), (OpNode (v2f64 V128:$Rn)))]>;
}
multiclass SIMDTwoVectorIntToFP<bit U, bit S, bits<5> opc, string asm,
SDPatternOperator OpNode> {
- def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, V64,
+ let Predicates = [HasNEON, HasFullFP16] in {
+ def v4f16 : BaseSIMDTwoSameVector<0, U, {S,1}, opc, 0b11, V64,
+ asm, ".4h", ".4h",
+ [(set (v4f16 V64:$Rd), (OpNode (v4i16 V64:$Rn)))]>;
+ def v8f16 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, 0b11, V128,
+ asm, ".8h", ".8h",
+ [(set (v8f16 V128:$Rd), (OpNode (v8i16 V128:$Rn)))]>;
+ } // Predicates = [HasNEON, HasFullFP16]
+ def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, 0b00, V64,
asm, ".2s", ".2s",
[(set (v2f32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>;
- def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, V128,
+ def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, 0b00, V128,
asm, ".4s", ".4s",
[(set (v4f32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>;
- def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, V128,
+ def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, 0b00, V128,
asm, ".2d", ".2d",
[(set (v2f64 V128:$Rd), (OpNode (v2i64 V128:$Rn)))]>;
}
@@ -4706,10 +4995,10 @@ multiclass SIMDMixedTwoVector<bit U, bits<5> opc, string asm,
(INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>;
}
-class BaseSIMDCmpTwoVector<bit Q, bit U, bits<2> size, bits<5> opcode,
- RegisterOperand regtype,
- string asm, string kind, string zero,
- ValueType dty, ValueType sty, SDNode OpNode>
+class BaseSIMDCmpTwoVector<bit Q, bit U, bits<2> size, bits<2> size2,
+ bits<5> opcode, RegisterOperand regtype, string asm,
+ string kind, string zero, ValueType dty,
+ ValueType sty, SDNode OpNode>
: I<(outs regtype:$Rd), (ins regtype:$Rn), asm,
"{\t$Rd" # kind # ", $Rn" # kind # ", #" # zero #
"|" # kind # "\t$Rd, $Rn, #" # zero # "}", "",
@@ -4722,7 +5011,9 @@ class BaseSIMDCmpTwoVector<bit Q, bit U, bits<2> size, bits<5> opcode,
let Inst{29} = U;
let Inst{28-24} = 0b01110;
let Inst{23-22} = size;
- let Inst{21-17} = 0b10000;
+ let Inst{21} = 0b1;
+ let Inst{20-19} = size2;
+ let Inst{18-17} = 0b00;
let Inst{16-12} = opcode;
let Inst{11-10} = 0b10;
let Inst{9-5} = Rn;
@@ -4732,54 +5023,74 @@ class BaseSIMDCmpTwoVector<bit Q, bit U, bits<2> size, bits<5> opcode,
// Comparisons support all element sizes, except 1xD.
multiclass SIMDCmpTwoVector<bit U, bits<5> opc, string asm,
SDNode OpNode> {
- def v8i8rz : BaseSIMDCmpTwoVector<0, U, 0b00, opc, V64,
+ def v8i8rz : BaseSIMDCmpTwoVector<0, U, 0b00, 0b00, opc, V64,
asm, ".8b", "0",
v8i8, v8i8, OpNode>;
- def v16i8rz : BaseSIMDCmpTwoVector<1, U, 0b00, opc, V128,
+ def v16i8rz : BaseSIMDCmpTwoVector<1, U, 0b00, 0b00, opc, V128,
asm, ".16b", "0",
v16i8, v16i8, OpNode>;
- def v4i16rz : BaseSIMDCmpTwoVector<0, U, 0b01, opc, V64,
+ def v4i16rz : BaseSIMDCmpTwoVector<0, U, 0b01, 0b00, opc, V64,
asm, ".4h", "0",
v4i16, v4i16, OpNode>;
- def v8i16rz : BaseSIMDCmpTwoVector<1, U, 0b01, opc, V128,
+ def v8i16rz : BaseSIMDCmpTwoVector<1, U, 0b01, 0b00, opc, V128,
asm, ".8h", "0",
v8i16, v8i16, OpNode>;
- def v2i32rz : BaseSIMDCmpTwoVector<0, U, 0b10, opc, V64,
+ def v2i32rz : BaseSIMDCmpTwoVector<0, U, 0b10, 0b00, opc, V64,
asm, ".2s", "0",
v2i32, v2i32, OpNode>;
- def v4i32rz : BaseSIMDCmpTwoVector<1, U, 0b10, opc, V128,
+ def v4i32rz : BaseSIMDCmpTwoVector<1, U, 0b10, 0b00, opc, V128,
asm, ".4s", "0",
v4i32, v4i32, OpNode>;
- def v2i64rz : BaseSIMDCmpTwoVector<1, U, 0b11, opc, V128,
+ def v2i64rz : BaseSIMDCmpTwoVector<1, U, 0b11, 0b00, opc, V128,
asm, ".2d", "0",
v2i64, v2i64, OpNode>;
}
-// FP Comparisons support only S and D element sizes.
+// FP Comparisons support only S and D element sizes (and H for v8.2a).
multiclass SIMDFPCmpTwoVector<bit U, bit S, bits<5> opc,
string asm, SDNode OpNode> {
- def v2i32rz : BaseSIMDCmpTwoVector<0, U, {S,0}, opc, V64,
+ let Predicates = [HasNEON, HasFullFP16] in {
+ def v4i16rz : BaseSIMDCmpTwoVector<0, U, {S,1}, 0b11, opc, V64,
+ asm, ".4h", "0.0",
+ v4i16, v4f16, OpNode>;
+ def v8i16rz : BaseSIMDCmpTwoVector<1, U, {S,1}, 0b11, opc, V128,
+ asm, ".8h", "0.0",
+ v8i16, v8f16, OpNode>;
+ } // Predicates = [HasNEON, HasFullFP16]
+ def v2i32rz : BaseSIMDCmpTwoVector<0, U, {S,0}, 0b00, opc, V64,
asm, ".2s", "0.0",
v2i32, v2f32, OpNode>;
- def v4i32rz : BaseSIMDCmpTwoVector<1, U, {S,0}, opc, V128,
+ def v4i32rz : BaseSIMDCmpTwoVector<1, U, {S,0}, 0b00, opc, V128,
asm, ".4s", "0.0",
v4i32, v4f32, OpNode>;
- def v2i64rz : BaseSIMDCmpTwoVector<1, U, {S,1}, opc, V128,
+ def v2i64rz : BaseSIMDCmpTwoVector<1, U, {S,1}, 0b00, opc, V128,
asm, ".2d", "0.0",
v2i64, v2f64, OpNode>;
- def : InstAlias<asm # " $Vd.2s, $Vn.2s, #0",
+ let Predicates = [HasNEON, HasFullFP16] in {
+ def : InstAlias<asm # "\t$Vd.4h, $Vn.4h, #0",
+ (!cast<Instruction>(NAME # v4i16rz) V64:$Vd, V64:$Vn), 0>;
+ def : InstAlias<asm # "\t$Vd.8h, $Vn.8h, #0",
+ (!cast<Instruction>(NAME # v8i16rz) V128:$Vd, V128:$Vn), 0>;
+ }
+ def : InstAlias<asm # "\t$Vd.2s, $Vn.2s, #0",
(!cast<Instruction>(NAME # v2i32rz) V64:$Vd, V64:$Vn), 0>;
- def : InstAlias<asm # " $Vd.4s, $Vn.4s, #0",
+ def : InstAlias<asm # "\t$Vd.4s, $Vn.4s, #0",
(!cast<Instruction>(NAME # v4i32rz) V128:$Vd, V128:$Vn), 0>;
- def : InstAlias<asm # " $Vd.2d, $Vn.2d, #0",
+ def : InstAlias<asm # "\t$Vd.2d, $Vn.2d, #0",
(!cast<Instruction>(NAME # v2i64rz) V128:$Vd, V128:$Vn), 0>;
- def : InstAlias<asm # ".2s $Vd, $Vn, #0",
+ let Predicates = [HasNEON, HasFullFP16] in {
+ def : InstAlias<asm # ".4h\t$Vd, $Vn, #0",
+ (!cast<Instruction>(NAME # v4i16rz) V64:$Vd, V64:$Vn), 0>;
+ def : InstAlias<asm # ".8h\t$Vd, $Vn, #0",
+ (!cast<Instruction>(NAME # v8i16rz) V128:$Vd, V128:$Vn), 0>;
+ }
+ def : InstAlias<asm # ".2s\t$Vd, $Vn, #0",
(!cast<Instruction>(NAME # v2i32rz) V64:$Vd, V64:$Vn), 0>;
- def : InstAlias<asm # ".4s $Vd, $Vn, #0",
+ def : InstAlias<asm # ".4s\t$Vd, $Vn, #0",
(!cast<Instruction>(NAME # v4i32rz) V128:$Vd, V128:$Vn), 0>;
- def : InstAlias<asm # ".2d $Vd, $Vn, #0",
+ def : InstAlias<asm # ".2d\t$Vd, $Vn, #0",
(!cast<Instruction>(NAME # v2i64rz) V128:$Vd, V128:$Vn), 0>;
}
@@ -5325,7 +5636,7 @@ multiclass SIMDZipVector<bits<3>opc, string asm,
//----------------------------------------------------------------------------
let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
-class BaseSIMDThreeScalar<bit U, bits<2> size, bits<5> opcode,
+class BaseSIMDThreeScalar<bit U, bits<3> size, bits<5> opcode,
RegisterClass regtype, string asm,
list<dag> pattern>
: I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), asm,
@@ -5337,8 +5648,7 @@ class BaseSIMDThreeScalar<bit U, bits<2> size, bits<5> opcode,
let Inst{31-30} = 0b01;
let Inst{29} = U;
let Inst{28-24} = 0b11110;
- let Inst{23-22} = size;
- let Inst{21} = 1;
+ let Inst{23-21} = size;
let Inst{20-16} = Rm;
let Inst{15-11} = opcode;
let Inst{10} = 1;
@@ -5369,17 +5679,17 @@ class BaseSIMDThreeScalarTied<bit U, bits<2> size, bit R, bits<5> opcode,
multiclass SIMDThreeScalarD<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode> {
- def v1i64 : BaseSIMDThreeScalar<U, 0b11, opc, FPR64, asm,
+ def v1i64 : BaseSIMDThreeScalar<U, 0b111, opc, FPR64, asm,
[(set (v1i64 FPR64:$Rd), (OpNode (v1i64 FPR64:$Rn), (v1i64 FPR64:$Rm)))]>;
}
multiclass SIMDThreeScalarBHSD<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode> {
- def v1i64 : BaseSIMDThreeScalar<U, 0b11, opc, FPR64, asm,
+ def v1i64 : BaseSIMDThreeScalar<U, 0b111, opc, FPR64, asm,
[(set (v1i64 FPR64:$Rd), (OpNode (v1i64 FPR64:$Rn), (v1i64 FPR64:$Rm)))]>;
- def v1i32 : BaseSIMDThreeScalar<U, 0b10, opc, FPR32, asm, []>;
- def v1i16 : BaseSIMDThreeScalar<U, 0b01, opc, FPR16, asm, []>;
- def v1i8 : BaseSIMDThreeScalar<U, 0b00, opc, FPR8 , asm, []>;
+ def v1i32 : BaseSIMDThreeScalar<U, 0b101, opc, FPR32, asm, []>;
+ def v1i16 : BaseSIMDThreeScalar<U, 0b011, opc, FPR16, asm, []>;
+ def v1i8 : BaseSIMDThreeScalar<U, 0b001, opc, FPR8 , asm, []>;
def : Pat<(i64 (OpNode (i64 FPR64:$Rn), (i64 FPR64:$Rm))),
(!cast<Instruction>(NAME#"v1i64") FPR64:$Rn, FPR64:$Rm)>;
@@ -5389,9 +5699,9 @@ multiclass SIMDThreeScalarBHSD<bit U, bits<5> opc, string asm,
multiclass SIMDThreeScalarHS<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode> {
- def v1i32 : BaseSIMDThreeScalar<U, 0b10, opc, FPR32, asm,
+ def v1i32 : BaseSIMDThreeScalar<U, 0b101, opc, FPR32, asm,
[(set FPR32:$Rd, (OpNode FPR32:$Rn, FPR32:$Rm))]>;
- def v1i16 : BaseSIMDThreeScalar<U, 0b01, opc, FPR16, asm, []>;
+ def v1i16 : BaseSIMDThreeScalar<U, 0b011, opc, FPR16, asm, []>;
}
multiclass SIMDThreeScalarHSTied<bit U, bit R, bits<5> opc, string asm,
@@ -5404,26 +5714,34 @@ multiclass SIMDThreeScalarHSTied<bit U, bit R, bits<5> opc, string asm,
asm, []>;
}
-multiclass SIMDThreeScalarSD<bit U, bit S, bits<5> opc, string asm,
+multiclass SIMDFPThreeScalar<bit U, bit S, bits<3> opc, string asm,
SDPatternOperator OpNode = null_frag> {
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
- def #NAME#64 : BaseSIMDThreeScalar<U, {S,1}, opc, FPR64, asm,
+ def #NAME#64 : BaseSIMDThreeScalar<U, {S,0b11}, {0b11,opc}, FPR64, asm,
[(set (f64 FPR64:$Rd), (OpNode (f64 FPR64:$Rn), (f64 FPR64:$Rm)))]>;
- def #NAME#32 : BaseSIMDThreeScalar<U, {S,0}, opc, FPR32, asm,
+ def #NAME#32 : BaseSIMDThreeScalar<U, {S,0b01}, {0b11,opc}, FPR32, asm,
[(set FPR32:$Rd, (OpNode FPR32:$Rn, FPR32:$Rm))]>;
+ let Predicates = [HasNEON, HasFullFP16] in {
+ def #NAME#16 : BaseSIMDThreeScalar<U, {S,0b10}, {0b00,opc}, FPR16, asm,
+ [(set FPR16:$Rd, (OpNode FPR16:$Rn, FPR16:$Rm))]>;
+ } // Predicates = [HasNEON, HasFullFP16]
}
def : Pat<(v1f64 (OpNode (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
(!cast<Instruction>(NAME # "64") FPR64:$Rn, FPR64:$Rm)>;
}
-multiclass SIMDThreeScalarFPCmp<bit U, bit S, bits<5> opc, string asm,
+multiclass SIMDThreeScalarFPCmp<bit U, bit S, bits<3> opc, string asm,
SDPatternOperator OpNode = null_frag> {
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
- def #NAME#64 : BaseSIMDThreeScalar<U, {S,1}, opc, FPR64, asm,
+ def #NAME#64 : BaseSIMDThreeScalar<U, {S,0b11}, {0b11,opc}, FPR64, asm,
[(set (i64 FPR64:$Rd), (OpNode (f64 FPR64:$Rn), (f64 FPR64:$Rm)))]>;
- def #NAME#32 : BaseSIMDThreeScalar<U, {S,0}, opc, FPR32, asm,
+ def #NAME#32 : BaseSIMDThreeScalar<U, {S,0b01}, {0b11,opc}, FPR32, asm,
[(set (i32 FPR32:$Rd), (OpNode (f32 FPR32:$Rn), (f32 FPR32:$Rm)))]>;
+ let Predicates = [HasNEON, HasFullFP16] in {
+ def #NAME#16 : BaseSIMDThreeScalar<U, {S,0b10}, {0b00,opc}, FPR16, asm,
+ []>;
+ } // Predicates = [HasNEON, HasFullFP16]
}
def : Pat<(v1i64 (OpNode (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
@@ -5482,7 +5800,7 @@ multiclass SIMDThreeScalarMixedTiedHS<bit U, bits<5> opc, string asm,
//----------------------------------------------------------------------------
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
-class BaseSIMDTwoScalar<bit U, bits<2> size, bits<5> opcode,
+class BaseSIMDTwoScalar<bit U, bits<2> size, bits<2> size2, bits<5> opcode,
RegisterClass regtype, RegisterClass regtype2,
string asm, list<dag> pat>
: I<(outs regtype:$Rd), (ins regtype2:$Rn), asm,
@@ -5494,7 +5812,9 @@ class BaseSIMDTwoScalar<bit U, bits<2> size, bits<5> opcode,
let Inst{29} = U;
let Inst{28-24} = 0b11110;
let Inst{23-22} = size;
- let Inst{21-17} = 0b10000;
+ let Inst{21} = 0b1;
+ let Inst{20-19} = size2;
+ let Inst{18-17} = 0b00;
let Inst{16-12} = opcode;
let Inst{11-10} = 0b10;
let Inst{9-5} = Rn;
@@ -5523,7 +5843,7 @@ class BaseSIMDTwoScalarTied<bit U, bits<2> size, bits<5> opcode,
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
-class BaseSIMDCmpTwoScalar<bit U, bits<2> size, bits<5> opcode,
+class BaseSIMDCmpTwoScalar<bit U, bits<2> size, bits<2> size2, bits<5> opcode,
RegisterClass regtype, string asm, string zero>
: I<(outs regtype:$Rd), (ins regtype:$Rn), asm,
"\t$Rd, $Rn, #" # zero, "", []>,
@@ -5534,7 +5854,9 @@ class BaseSIMDCmpTwoScalar<bit U, bits<2> size, bits<5> opcode,
let Inst{29} = U;
let Inst{28-24} = 0b11110;
let Inst{23-22} = size;
- let Inst{21-17} = 0b10000;
+ let Inst{21} = 0b1;
+ let Inst{20-19} = size2;
+ let Inst{18-17} = 0b00;
let Inst{16-12} = opcode;
let Inst{11-10} = 0b10;
let Inst{9-5} = Rn;
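
A recurring change in the two-register classes above (BaseSIMDTwoSameVector, BaseSIMDCmpTwoVector, BaseSIMDTwoScalar, BaseSIMDCmpTwoScalar) is that the fixed field Inst{21-17} = 0b10000 is split into Inst{21} = 1 plus a new two-bit size2 field in Inst{20-19}, with Inst{18-17} still 0b00. Passing size2 = 0b00 reproduces the old 0b10000 pattern bit for bit, so the existing B/H/S/D encodings do not change, while size2 = 0b11 is what the new v8.2-A FP16 variants guarded by [HasNEON, HasFullFP16] pass. The standalone C++ sketch below just packs those fields to make that equivalence visible; bits 31-30, the Rd field, and the sample values are illustrative assumptions, not taken from this diff.

// Standalone sketch (not from the patch): packs the scalar two-register
// fields exactly as the hunks above lay them out.
#include <cstdint>
#include <cstdio>

static uint32_t packTwoScalar(unsigned U, unsigned size, unsigned size2,
                              unsigned opcode, unsigned Rn, unsigned Rd) {
  uint32_t Inst = 0;
  Inst |= 0b01u << 30;             // assumed scalar prefix (not in this hunk)
  Inst |= (U & 1u) << 29;          // Inst{29}    = U
  Inst |= 0b11110u << 24;          // Inst{28-24} = 0b11110
  Inst |= (size & 3u) << 22;       // Inst{23-22} = size
  Inst |= 1u << 21;                // Inst{21}    = 0b1
  Inst |= (size2 & 3u) << 19;      // Inst{20-19} = size2 (new field)
                                   // Inst{18-17} = 0b00
  Inst |= (opcode & 0x1fu) << 12;  // Inst{16-12} = opcode
  Inst |= 0b10u << 10;             // Inst{11-10} = 0b10
  Inst |= (Rn & 0x1fu) << 5;       // Inst{9-5}   = Rn
  Inst |= Rd & 0x1fu;              // Inst{4-0}   = Rd (assumed)
  return Inst;
}

int main() {
  // size2 == 0b00 reproduces the old fixed Inst{21-17} = 0b10000 pattern,
  // so pre-existing encodings are unchanged ...
  printf("%08x\n", (unsigned)packTwoScalar(0, 0b11, 0b00, 0b01101, 1, 0));
  // ... while the FP16 defs added above pass size2 == 0b11.
  printf("%08x\n", (unsigned)packTwoScalar(0, 0b11, 0b11, 0b01101, 1, 0));
  return 0;
}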
@@ -5556,21 +5878,28 @@ class SIMDInexactCvtTwoScalar<bits<5> opcode, string asm>
multiclass SIMDCmpTwoScalarD<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode> {
- def v1i64rz : BaseSIMDCmpTwoScalar<U, 0b11, opc, FPR64, asm, "0">;
+ def v1i64rz : BaseSIMDCmpTwoScalar<U, 0b11, 0b00, opc, FPR64, asm, "0">;
def : Pat<(v1i64 (OpNode FPR64:$Rn)),
(!cast<Instruction>(NAME # v1i64rz) FPR64:$Rn)>;
}
-multiclass SIMDCmpTwoScalarSD<bit U, bit S, bits<5> opc, string asm,
+multiclass SIMDFPCmpTwoScalar<bit U, bit S, bits<5> opc, string asm,
SDPatternOperator OpNode> {
- def v1i64rz : BaseSIMDCmpTwoScalar<U, {S,1}, opc, FPR64, asm, "0.0">;
- def v1i32rz : BaseSIMDCmpTwoScalar<U, {S,0}, opc, FPR32, asm, "0.0">;
+ def v1i64rz : BaseSIMDCmpTwoScalar<U, {S,1}, 0b00, opc, FPR64, asm, "0.0">;
+ def v1i32rz : BaseSIMDCmpTwoScalar<U, {S,0}, 0b00, opc, FPR32, asm, "0.0">;
+ let Predicates = [HasNEON, HasFullFP16] in {
+ def v1i16rz : BaseSIMDCmpTwoScalar<U, {S,1}, 0b11, opc, FPR16, asm, "0.0">;
+ }
- def : InstAlias<asm # " $Rd, $Rn, #0",
+ def : InstAlias<asm # "\t$Rd, $Rn, #0",
(!cast<Instruction>(NAME # v1i64rz) FPR64:$Rd, FPR64:$Rn), 0>;
- def : InstAlias<asm # " $Rd, $Rn, #0",
+ def : InstAlias<asm # "\t$Rd, $Rn, #0",
(!cast<Instruction>(NAME # v1i32rz) FPR32:$Rd, FPR32:$Rn), 0>;
+ let Predicates = [HasNEON, HasFullFP16] in {
+ def : InstAlias<asm # "\t$Rd, $Rn, #0",
+ (!cast<Instruction>(NAME # v1i16rz) FPR16:$Rd, FPR16:$Rn), 0>;
+ }
def : Pat<(v1i64 (OpNode (v1f64 FPR64:$Rn))),
(!cast<Instruction>(NAME # v1i64rz) FPR64:$Rn)>;
@@ -5578,35 +5907,42 @@ multiclass SIMDCmpTwoScalarSD<bit U, bit S, bits<5> opc, string asm,
multiclass SIMDTwoScalarD<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode = null_frag> {
- def v1i64 : BaseSIMDTwoScalar<U, 0b11, opc, FPR64, FPR64, asm,
+ def v1i64 : BaseSIMDTwoScalar<U, 0b11, 0b00, opc, FPR64, FPR64, asm,
[(set (v1i64 FPR64:$Rd), (OpNode (v1i64 FPR64:$Rn)))]>;
def : Pat<(i64 (OpNode (i64 FPR64:$Rn))),
(!cast<Instruction>(NAME # "v1i64") FPR64:$Rn)>;
}
-multiclass SIMDTwoScalarSD<bit U, bit S, bits<5> opc, string asm> {
- def v1i64 : BaseSIMDTwoScalar<U, {S,1}, opc, FPR64, FPR64, asm,[]>;
- def v1i32 : BaseSIMDTwoScalar<U, {S,0}, opc, FPR32, FPR32, asm,[]>;
+multiclass SIMDFPTwoScalar<bit U, bit S, bits<5> opc, string asm> {
+ def v1i64 : BaseSIMDTwoScalar<U, {S,1}, 0b00, opc, FPR64, FPR64, asm,[]>;
+ def v1i32 : BaseSIMDTwoScalar<U, {S,0}, 0b00, opc, FPR32, FPR32, asm,[]>;
+ let Predicates = [HasNEON, HasFullFP16] in {
+ def v1f16 : BaseSIMDTwoScalar<U, {S,1}, 0b11, opc, FPR16, FPR16, asm,[]>;
+ }
}
-multiclass SIMDTwoScalarCVTSD<bit U, bit S, bits<5> opc, string asm,
+multiclass SIMDFPTwoScalarCVT<bit U, bit S, bits<5> opc, string asm,
SDPatternOperator OpNode> {
- def v1i64 : BaseSIMDTwoScalar<U, {S,1}, opc, FPR64, FPR64, asm,
+ def v1i64 : BaseSIMDTwoScalar<U, {S,1}, 0b00, opc, FPR64, FPR64, asm,
[(set FPR64:$Rd, (OpNode (f64 FPR64:$Rn)))]>;
- def v1i32 : BaseSIMDTwoScalar<U, {S,0}, opc, FPR32, FPR32, asm,
+ def v1i32 : BaseSIMDTwoScalar<U, {S,0}, 0b00, opc, FPR32, FPR32, asm,
[(set FPR32:$Rd, (OpNode (f32 FPR32:$Rn)))]>;
+ let Predicates = [HasNEON, HasFullFP16] in {
+ def v1i16 : BaseSIMDTwoScalar<U, {S,1}, 0b11, opc, FPR16, FPR16, asm,
+ [(set FPR16:$Rd, (OpNode (f16 FPR16:$Rn)))]>;
+ }
}
multiclass SIMDTwoScalarBHSD<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode = null_frag> {
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
- def v1i64 : BaseSIMDTwoScalar<U, 0b11, opc, FPR64, FPR64, asm,
+ def v1i64 : BaseSIMDTwoScalar<U, 0b11, 0b00, opc, FPR64, FPR64, asm,
[(set (i64 FPR64:$Rd), (OpNode (i64 FPR64:$Rn)))]>;
- def v1i32 : BaseSIMDTwoScalar<U, 0b10, opc, FPR32, FPR32, asm,
+ def v1i32 : BaseSIMDTwoScalar<U, 0b10, 0b00, opc, FPR32, FPR32, asm,
[(set (i32 FPR32:$Rd), (OpNode (i32 FPR32:$Rn)))]>;
- def v1i16 : BaseSIMDTwoScalar<U, 0b01, opc, FPR16, FPR16, asm, []>;
- def v1i8 : BaseSIMDTwoScalar<U, 0b00, opc, FPR8 , FPR8 , asm, []>;
+ def v1i16 : BaseSIMDTwoScalar<U, 0b01, 0b00, opc, FPR16, FPR16, asm, []>;
+ def v1i8 : BaseSIMDTwoScalar<U, 0b00, 0b00, opc, FPR8 , FPR8 , asm, []>;
}
def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rn))),
@@ -5633,10 +5969,10 @@ multiclass SIMDTwoScalarBHSDTied<bit U, bits<5> opc, string asm,
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
multiclass SIMDTwoScalarMixedBHS<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode = null_frag> {
- def v1i32 : BaseSIMDTwoScalar<U, 0b10, opc, FPR32, FPR64, asm,
+ def v1i32 : BaseSIMDTwoScalar<U, 0b10, 0b00, opc, FPR32, FPR64, asm,
[(set (i32 FPR32:$Rd), (OpNode (i64 FPR64:$Rn)))]>;
- def v1i16 : BaseSIMDTwoScalar<U, 0b01, opc, FPR16, FPR32, asm, []>;
- def v1i8 : BaseSIMDTwoScalar<U, 0b00, opc, FPR8 , FPR16, asm, []>;
+ def v1i16 : BaseSIMDTwoScalar<U, 0b01, 0b00, opc, FPR16, FPR32, asm, []>;
+ def v1i8 : BaseSIMDTwoScalar<U, 0b00, 0b00, opc, FPR8 , FPR16, asm, []>;
}
//----------------------------------------------------------------------------
@@ -5668,10 +6004,14 @@ multiclass SIMDPairwiseScalarD<bit U, bits<5> opc, string asm> {
asm, ".2d">;
}
-multiclass SIMDPairwiseScalarSD<bit U, bit S, bits<5> opc, string asm> {
- def v2i32p : BaseSIMDPairwiseScalar<U, {S,0}, opc, FPR32Op, V64,
+multiclass SIMDFPPairwiseScalar<bit S, bits<5> opc, string asm> {
+ let Predicates = [HasNEON, HasFullFP16] in {
+ def v2i16p : BaseSIMDPairwiseScalar<0, {S,0}, opc, FPR16Op, V64,
+ asm, ".2h">;
+ }
+ def v2i32p : BaseSIMDPairwiseScalar<1, {S,0}, opc, FPR32Op, V64,
asm, ".2s">;
- def v2i64p : BaseSIMDPairwiseScalar<U, {S,1}, opc, FPR64Op, V128,
+ def v2i64p : BaseSIMDPairwiseScalar<1, {S,1}, opc, FPR64Op, V128,
asm, ".2d">;
}
@@ -5727,8 +6067,16 @@ multiclass SIMDAcrossLanesHSD<bit U, bits<5> opcode, string asm> {
asm, ".4s", []>;
}
-multiclass SIMDAcrossLanesS<bits<5> opcode, bit sz1, string asm,
+multiclass SIMDFPAcrossLanes<bits<5> opcode, bit sz1, string asm,
Intrinsic intOp> {
+ let Predicates = [HasNEON, HasFullFP16] in {
+ def v4i16v : BaseSIMDAcrossLanes<0, 0, {sz1, 0}, opcode, FPR16, V64,
+ asm, ".4h",
+ [(set FPR16:$Rd, (intOp (v4f16 V64:$Rn)))]>;
+ def v8i16v : BaseSIMDAcrossLanes<1, 0, {sz1, 0}, opcode, FPR16, V128,
+ asm, ".8h",
+ [(set FPR16:$Rd, (intOp (v8f16 V128:$Rn)))]>;
+ } // Predicates = [HasNEON, HasFullFP16]
def v4i32v : BaseSIMDAcrossLanes<1, 1, {sz1, 0}, opcode, FPR32, V128,
asm, ".4s",
[(set FPR32:$Rd, (intOp (v4f32 V128:$Rn)))]>;
@@ -5925,7 +6273,7 @@ class SIMDInsMainMovAlias<string size, Instruction inst,
class SIMDInsElementMovAlias<string size, Instruction inst,
Operand idxtype>
: InstAlias<"mov" # "{\t$dst" # size # "$idx, $src" # size # "$idx2" #
- # "|" # size #" $dst$idx, $src$idx2}",
+ # "|" # size #"\t$dst$idx, $src$idx2}",
(inst V128:$dst, idxtype:$idx, V128:$src, idxtype:$idx2)>;
@@ -6215,7 +6563,7 @@ multiclass SIMDScalarCPY<string asm> {
// AdvSIMD modified immediate instructions
//----------------------------------------------------------------------------
-class BaseSIMDModifiedImm<bit Q, bit op, dag oops, dag iops,
+class BaseSIMDModifiedImm<bit Q, bit op, bit op2, dag oops, dag iops,
string asm, string op_string,
string cstr, list<dag> pattern>
: I<oops, iops, asm, op_string, cstr, pattern>,
@@ -6227,16 +6575,17 @@ class BaseSIMDModifiedImm<bit Q, bit op, dag oops, dag iops,
let Inst{29} = op;
let Inst{28-19} = 0b0111100000;
let Inst{18-16} = imm8{7-5};
- let Inst{11-10} = 0b01;
+ let Inst{11} = op2;
+ let Inst{10} = 1;
let Inst{9-5} = imm8{4-0};
let Inst{4-0} = Rd;
}
-class BaseSIMDModifiedImmVector<bit Q, bit op, RegisterOperand vectype,
+class BaseSIMDModifiedImmVector<bit Q, bit op, bit op2, RegisterOperand vectype,
Operand immtype, dag opt_shift_iop,
string opt_shift, string asm, string kind,
list<dag> pattern>
- : BaseSIMDModifiedImm<Q, op, (outs vectype:$Rd),
+ : BaseSIMDModifiedImm<Q, op, op2, (outs vectype:$Rd),
!con((ins immtype:$imm8), opt_shift_iop), asm,
"{\t$Rd" # kind # ", $imm8" # opt_shift #
"|" # kind # "\t$Rd, $imm8" # opt_shift # "}",
@@ -6248,7 +6597,7 @@ class BaseSIMDModifiedImmVectorTied<bit Q, bit op, RegisterOperand vectype,
Operand immtype, dag opt_shift_iop,
string opt_shift, string asm, string kind,
list<dag> pattern>
- : BaseSIMDModifiedImm<Q, op, (outs vectype:$dst),
+ : BaseSIMDModifiedImm<Q, op, 0, (outs vectype:$dst),
!con((ins vectype:$Rd, immtype:$imm8), opt_shift_iop),
asm, "{\t$Rd" # kind # ", $imm8" # opt_shift #
"|" # kind # "\t$Rd, $imm8" # opt_shift # "}",
@@ -6259,7 +6608,7 @@ class BaseSIMDModifiedImmVectorTied<bit Q, bit op, RegisterOperand vectype,
class BaseSIMDModifiedImmVectorShift<bit Q, bit op, bits<2> b15_b12,
RegisterOperand vectype, string asm,
string kind, list<dag> pattern>
- : BaseSIMDModifiedImmVector<Q, op, vectype, imm0_255,
+ : BaseSIMDModifiedImmVector<Q, op, 0, vectype, imm0_255,
(ins logical_vec_shift:$shift),
"$shift", asm, kind, pattern> {
bits<2> shift;
@@ -6284,7 +6633,7 @@ class BaseSIMDModifiedImmVectorShiftTied<bit Q, bit op, bits<2> b15_b12,
class BaseSIMDModifiedImmVectorShiftHalf<bit Q, bit op, bits<2> b15_b12,
RegisterOperand vectype, string asm,
string kind, list<dag> pattern>
- : BaseSIMDModifiedImmVector<Q, op, vectype, imm0_255,
+ : BaseSIMDModifiedImmVector<Q, op, 0, vectype, imm0_255,
(ins logical_vec_hw_shift:$shift),
"$shift", asm, kind, pattern> {
bits<2> shift;
@@ -6349,7 +6698,7 @@ multiclass SIMDModifiedImmVectorShiftTied<bit op, bits<2> hw_cmode,
class SIMDModifiedImmMoveMSL<bit Q, bit op, bits<4> cmode,
RegisterOperand vectype, string asm,
string kind, list<dag> pattern>
- : BaseSIMDModifiedImmVector<Q, op, vectype, imm0_255,
+ : BaseSIMDModifiedImmVector<Q, op, 0, vectype, imm0_255,
(ins move_vec_shift:$shift),
"$shift", asm, kind, pattern> {
bits<1> shift;
@@ -6357,18 +6706,18 @@ class SIMDModifiedImmMoveMSL<bit Q, bit op, bits<4> cmode,
let Inst{12} = shift;
}
-class SIMDModifiedImmVectorNoShift<bit Q, bit op, bits<4> cmode,
+class SIMDModifiedImmVectorNoShift<bit Q, bit op, bit op2, bits<4> cmode,
RegisterOperand vectype,
Operand imm_type, string asm,
string kind, list<dag> pattern>
- : BaseSIMDModifiedImmVector<Q, op, vectype, imm_type, (ins), "",
+ : BaseSIMDModifiedImmVector<Q, op, op2, vectype, imm_type, (ins), "",
asm, kind, pattern> {
let Inst{15-12} = cmode;
}
class SIMDModifiedImmScalarNoShift<bit Q, bit op, bits<4> cmode, string asm,
list<dag> pattern>
- : BaseSIMDModifiedImm<Q, op, (outs FPR64:$Rd), (ins simdimmtype10:$imm8), asm,
+ : BaseSIMDModifiedImm<Q, op, 0, (outs FPR64:$Rd), (ins simdimmtype10:$imm8), asm,
"\t$Rd, $imm8", "", pattern> {
let Inst{15-12} = cmode;
let DecoderMethod = "DecodeModImmInstruction";
@@ -6438,8 +6787,36 @@ class BaseSIMDIndexedTied<bit Q, bit U, bit Scalar, bits<2> size, bits<4> opc,
let Inst{4-0} = Rd;
}
-multiclass SIMDFPIndexedSD<bit U, bits<4> opc, string asm,
- SDPatternOperator OpNode> {
+multiclass SIMDFPIndexed<bit U, bits<4> opc, string asm,
+ SDPatternOperator OpNode> {
+ let Predicates = [HasNEON, HasFullFP16] in {
+ def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b00, opc,
+ V64, V64,
+ V128_lo, VectorIndexH,
+ asm, ".4h", ".4h", ".4h", ".h",
+ [(set (v4f16 V64:$Rd),
+ (OpNode (v4f16 V64:$Rn),
+ (v4f16 (AArch64duplane16 (v8f16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
+ bits<3> idx;
+ let Inst{11} = idx{2};
+ let Inst{21} = idx{1};
+ let Inst{20} = idx{0};
+ }
+
+ def v8i16_indexed : BaseSIMDIndexed<1, U, 0, 0b00, opc,
+ V128, V128,
+ V128_lo, VectorIndexH,
+ asm, ".8h", ".8h", ".8h", ".h",
+ [(set (v8f16 V128:$Rd),
+ (OpNode (v8f16 V128:$Rn),
+ (v8f16 (AArch64duplane16 (v8f16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
+ bits<3> idx;
+ let Inst{11} = idx{2};
+ let Inst{21} = idx{1};
+ let Inst{20} = idx{0};
+ }
+ } // Predicates = [HasNEON, HasFullFP16]
+
def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc,
V64, V64,
V128, VectorIndexS,
@@ -6476,6 +6853,21 @@ multiclass SIMDFPIndexedSD<bit U, bits<4> opc, string asm,
let Inst{21} = 0;
}
+ let Predicates = [HasNEON, HasFullFP16] in {
+ def v1i16_indexed : BaseSIMDIndexed<1, U, 1, 0b00, opc,
+ FPR16Op, FPR16Op, V128_lo, VectorIndexH,
+ asm, ".h", "", "", ".h",
+ [(set (f16 FPR16Op:$Rd),
+ (OpNode (f16 FPR16Op:$Rn),
+ (f16 (vector_extract (v8f16 V128_lo:$Rm),
+ VectorIndexH:$idx))))]> {
+ bits<3> idx;
+ let Inst{11} = idx{2};
+ let Inst{21} = idx{1};
+ let Inst{20} = idx{0};
+ }
+ } // Predicates = [HasNEON, HasFullFP16]
+
def v1i32_indexed : BaseSIMDIndexed<1, U, 1, 0b10, opc,
FPR32Op, FPR32Op, V128, VectorIndexS,
asm, ".s", "", "", ".s",
@@ -6501,7 +6893,7 @@ multiclass SIMDFPIndexedSD<bit U, bits<4> opc, string asm,
}
}
-multiclass SIMDFPIndexedSDTiedPatterns<string INST, SDPatternOperator OpNode> {
+multiclass SIMDFPIndexedTiedPatterns<string INST, SDPatternOperator OpNode> {
// 2 variants for the .2s version: DUPLANE from 128-bit and DUP scalar.
def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn),
(AArch64duplane32 (v4f32 V128:$Rm),
@@ -6553,7 +6945,28 @@ multiclass SIMDFPIndexedSDTiedPatterns<string INST, SDPatternOperator OpNode> {
V128:$Rm, VectorIndexD:$idx)>;
}
-multiclass SIMDFPIndexedSDTied<bit U, bits<4> opc, string asm> {
+multiclass SIMDFPIndexedTied<bit U, bits<4> opc, string asm> {
+ let Predicates = [HasNEON, HasFullFP16] in {
+ def v4i16_indexed : BaseSIMDIndexedTied<0, U, 0, 0b00, opc, V64, V64,
+ V128_lo, VectorIndexH,
+ asm, ".4h", ".4h", ".4h", ".h", []> {
+ bits<3> idx;
+ let Inst{11} = idx{2};
+ let Inst{21} = idx{1};
+ let Inst{20} = idx{0};
+ }
+
+ def v8i16_indexed : BaseSIMDIndexedTied<1, U, 0, 0b00, opc,
+ V128, V128,
+ V128_lo, VectorIndexH,
+ asm, ".8h", ".8h", ".8h", ".h", []> {
+ bits<3> idx;
+ let Inst{11} = idx{2};
+ let Inst{21} = idx{1};
+ let Inst{20} = idx{0};
+ }
+ } // Predicates = [HasNEON, HasFullFP16]
+
def v2i32_indexed : BaseSIMDIndexedTied<0, U, 0, 0b10, opc, V64, V64,
V128, VectorIndexS,
asm, ".2s", ".2s", ".2s", ".s", []> {
@@ -6580,6 +6993,16 @@ multiclass SIMDFPIndexedSDTied<bit U, bits<4> opc, string asm> {
let Inst{21} = 0;
}
+ let Predicates = [HasNEON, HasFullFP16] in {
+ def v1i16_indexed : BaseSIMDIndexedTied<1, U, 1, 0b00, opc,
+ FPR16Op, FPR16Op, V128_lo, VectorIndexH,
+ asm, ".h", "", "", ".h", []> {
+ bits<3> idx;
+ let Inst{11} = idx{2};
+ let Inst{21} = idx{1};
+ let Inst{20} = idx{0};
+ }
+ } // Predicates = [HasNEON, HasFullFP16]
def v1i32_indexed : BaseSIMDIndexedTied<1, U, 1, 0b10, opc,
FPR32Op, FPR32Op, V128, VectorIndexS,
@@ -7117,7 +7540,13 @@ class BaseSIMDScalarShiftTied<bit U, bits<5> opc, bits<7> fixed_imm,
}
-multiclass SIMDScalarRShiftSD<bit U, bits<5> opc, string asm> {
+multiclass SIMDFPScalarRShift<bit U, bits<5> opc, string asm> {
+ let Predicates = [HasNEON, HasFullFP16] in {
+ def h : BaseSIMDScalarShift<U, opc, {0,0,1,?,?,?,?},
+ FPR16, FPR16, vecshiftR16, asm, []> {
+ let Inst{19-16} = imm{3-0};
+ }
+ } // Predicates = [HasNEON, HasFullFP16]
def s : BaseSIMDScalarShift<U, opc, {0,1,?,?,?,?,?},
FPR32, FPR32, vecshiftR32, asm, []> {
let Inst{20-16} = imm{4-0};
@@ -7297,6 +7726,23 @@ class BaseSIMDVectorShiftTied<bit Q, bit U, bits<5> opc, bits<7> fixed_imm,
multiclass SIMDVectorRShiftSD<bit U, bits<5> opc, string asm,
Intrinsic OpNode> {
+ let Predicates = [HasNEON, HasFullFP16] in {
+ def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?},
+ V64, V64, vecshiftR16,
+ asm, ".4h", ".4h",
+ [(set (v4i16 V64:$Rd), (OpNode (v4f16 V64:$Rn), (i32 imm:$imm)))]> {
+ bits<4> imm;
+ let Inst{19-16} = imm;
+ }
+
+ def v8i16_shift : BaseSIMDVectorShift<1, U, opc, {0,0,1,?,?,?,?},
+ V128, V128, vecshiftR16,
+ asm, ".8h", ".8h",
+ [(set (v8i16 V128:$Rd), (OpNode (v8f16 V128:$Rn), (i32 imm:$imm)))]> {
+ bits<4> imm;
+ let Inst{19-16} = imm;
+ }
+ } // Predicates = [HasNEON, HasFullFP16]
def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?},
V64, V64, vecshiftR32,
asm, ".2s", ".2s",
@@ -7322,8 +7768,26 @@ multiclass SIMDVectorRShiftSD<bit U, bits<5> opc, string asm,
}
}
-multiclass SIMDVectorRShiftSDToFP<bit U, bits<5> opc, string asm,
+multiclass SIMDVectorRShiftToFP<bit U, bits<5> opc, string asm,
Intrinsic OpNode> {
+ let Predicates = [HasNEON, HasFullFP16] in {
+ def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?},
+ V64, V64, vecshiftR16,
+ asm, ".4h", ".4h",
+ [(set (v4f16 V64:$Rd), (OpNode (v4i16 V64:$Rn), (i32 imm:$imm)))]> {
+ bits<4> imm;
+ let Inst{19-16} = imm;
+ }
+
+ def v8i16_shift : BaseSIMDVectorShift<1, U, opc, {0,0,1,?,?,?,?},
+ V128, V128, vecshiftR16,
+ asm, ".8h", ".8h",
+ [(set (v8f16 V128:$Rd), (OpNode (v8i16 V128:$Rn), (i32 imm:$imm)))]> {
+ bits<4> imm;
+ let Inst{19-16} = imm;
+ }
+ } // Predicates = [HasNEON, HasFullFP16]
+
def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?},
V64, V64, vecshiftR32,
asm, ".2s", ".2s",
@@ -8604,9 +9068,8 @@ let Predicates = [HasNEON, HasV8_1a] in {
class BaseSIMDThreeSameVectorTiedR0<bit Q, bit U, bits<2> size, bits<5> opcode,
RegisterOperand regtype, string asm,
string kind, list<dag> pattern>
- : BaseSIMDThreeSameVectorTied<Q, U, size, opcode, regtype, asm, kind,
+ : BaseSIMDThreeSameVectorTied<Q, U, {size,0}, opcode, regtype, asm, kind,
pattern> {
- let Inst{21}=0;
}
multiclass SIMDThreeSameVectorSQRDMLxHTiedHS<bit U, bits<5> opc, string asm,
SDPatternOperator Accum> {
@@ -9041,6 +9504,7 @@ def : TokenAlias<".8H", ".8h">;
def : TokenAlias<".4S", ".4s">;
def : TokenAlias<".2D", ".2d">;
def : TokenAlias<".1Q", ".1q">;
+def : TokenAlias<".2H", ".2h">;
def : TokenAlias<".B", ".b">;
def : TokenAlias<".H", ".h">;
def : TokenAlias<".S", ".s">;
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index c0b3f2c60916..3ef3c8b840cb 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -12,7 +12,6 @@
//===----------------------------------------------------------------------===//
#include "AArch64InstrInfo.h"
-#include "AArch64MachineCombinerPattern.h"
#include "AArch64Subtarget.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
@@ -533,6 +532,14 @@ void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB,
CC);
}
+/// Returns true if a MOVi32imm or MOVi64imm can be expanded to an ORRxx.
+static bool canBeExpandedToORR(const MachineInstr *MI, unsigned BitSize) {
+ uint64_t Imm = MI->getOperand(1).getImm();
+ uint64_t UImm = Imm << (64 - BitSize) >> (64 - BitSize);
+ uint64_t Encoding;
+ return AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding);
+}
+
// FIXME: this implementation should be micro-architecture dependent, so a
// micro-architecture target hook should be introduced here in future.
bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr *MI) const {
@@ -573,6 +580,12 @@ bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr *MI) const {
case AArch64::ORRWrr:
case AArch64::ORRXrr:
return true;
+ // If MOVi32imm or MOVi64imm can be expanded into ORRWri or
+ // ORRXri, it is as cheap as MOV
+ case AArch64::MOVi32imm:
+ return canBeExpandedToORR(MI, 32);
+ case AArch64::MOVi64imm:
+ return canBeExpandedToORR(MI, 64);
}
llvm_unreachable("Unknown opcode to check as cheap as a move!");
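
The new canBeExpandedToORR() helper above decides whether a MOVi32imm/MOVi64imm is really a logical immediate in disguise, in which case it expands to an ORR from the zero register and can be reported as as-cheap-as-a-move. The double shift Imm << (64 - BitSize) >> (64 - BitSize) zero-extends the low BitSize bits before handing them to processLogicalImmediate(). Below is a small standalone illustration of just that truncation step; the example value and the claim that it encodes are mine, not taken from the patch.

#include <cstdint>
#include <cstdio>

// Mirrors only the UImm computation from canBeExpandedToORR(); the real
// encodability test is AArch64_AM::processLogicalImmediate().
static uint64_t lowBits(uint64_t Imm, unsigned BitSize) {
  return Imm << (64 - BitSize) >> (64 - BitSize);
}

int main() {
  // If a 32-bit immediate such as -256 arrives sign-extended in the
  // MachineOperand, the double shift discards the copies of the sign bit
  // above bit 31 ...
  uint64_t Imm = static_cast<uint64_t>(int64_t{-256});
  printf("%016llx\n", static_cast<unsigned long long>(lowBits(Imm, 32)));
  // ... leaving 0x00000000ffffff00, a contiguous run of ones that a 32-bit
  // logical immediate (ORRWri) can encode, so such a MOVi32imm would be
  // treated as cheap.
  return 0;
}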
@@ -1379,42 +1392,34 @@ bool AArch64InstrInfo::getMemOpBaseRegImmOfsWidth(
Width = 1;
Scale = 1;
break;
+ case AArch64::LDRQui:
+ case AArch64::STRQui:
+ Scale = Width = 16;
+ break;
case AArch64::LDRXui:
+ case AArch64::LDRDui:
case AArch64::STRXui:
+ case AArch64::STRDui:
Scale = Width = 8;
break;
case AArch64::LDRWui:
+ case AArch64::LDRSui:
case AArch64::STRWui:
+ case AArch64::STRSui:
Scale = Width = 4;
break;
- case AArch64::LDRBui:
- case AArch64::STRBui:
- Scale = Width = 1;
- break;
case AArch64::LDRHui:
+ case AArch64::LDRHHui:
case AArch64::STRHui:
+ case AArch64::STRHHui:
Scale = Width = 2;
break;
- case AArch64::LDRSui:
- case AArch64::STRSui:
- Scale = Width = 4;
- break;
- case AArch64::LDRDui:
- case AArch64::STRDui:
- Scale = Width = 8;
- break;
- case AArch64::LDRQui:
- case AArch64::STRQui:
- Scale = Width = 16;
- break;
+ case AArch64::LDRBui:
case AArch64::LDRBBui:
+ case AArch64::STRBui:
case AArch64::STRBBui:
Scale = Width = 1;
break;
- case AArch64::LDRHHui:
- case AArch64::STRHHui:
- Scale = Width = 2;
- break;
};
BaseReg = LdSt->getOperand(1).getReg();
@@ -1445,23 +1450,43 @@ bool AArch64InstrInfo::shouldClusterLoads(MachineInstr *FirstLdSt,
bool AArch64InstrInfo::shouldScheduleAdjacent(MachineInstr *First,
MachineInstr *Second) const {
- // Cyclone can fuse CMN, CMP followed by Bcc.
-
- // FIXME: B0 can also fuse:
- // AND, BIC, ORN, ORR, or EOR (optional S) followed by Bcc or CBZ or CBNZ.
- if (Second->getOpcode() != AArch64::Bcc)
- return false;
- switch (First->getOpcode()) {
- default:
- return false;
- case AArch64::SUBSWri:
- case AArch64::ADDSWri:
- case AArch64::ANDSWri:
- case AArch64::SUBSXri:
- case AArch64::ADDSXri:
- case AArch64::ANDSXri:
- return true;
+ if (Subtarget.isCyclone()) {
+ // Cyclone can fuse CMN, CMP, TST followed by Bcc.
+ unsigned SecondOpcode = Second->getOpcode();
+ if (SecondOpcode == AArch64::Bcc) {
+ switch (First->getOpcode()) {
+ default:
+ return false;
+ case AArch64::SUBSWri:
+ case AArch64::ADDSWri:
+ case AArch64::ANDSWri:
+ case AArch64::SUBSXri:
+ case AArch64::ADDSXri:
+ case AArch64::ANDSXri:
+ return true;
+ }
+ }
+ // Cyclone B0 also supports ALU operations followed by CBZ/CBNZ.
+ if (SecondOpcode == AArch64::CBNZW || SecondOpcode == AArch64::CBNZX ||
+ SecondOpcode == AArch64::CBZW || SecondOpcode == AArch64::CBZX) {
+ switch (First->getOpcode()) {
+ default:
+ return false;
+ case AArch64::ADDWri:
+ case AArch64::ADDXri:
+ case AArch64::ANDWri:
+ case AArch64::ANDXri:
+ case AArch64::EORWri:
+ case AArch64::EORXri:
+ case AArch64::ORRWri:
+ case AArch64::ORRXri:
+ case AArch64::SUBWri:
+ case AArch64::SUBXri:
+ return true;
+ }
+ }
}
+ return false;
}
MachineInstr *AArch64InstrInfo::emitFrameIndexDebugValue(
@@ -1814,7 +1839,7 @@ void AArch64InstrInfo::storeRegToStackSlot(
MachineFrameInfo &MFI = *MF.getFrameInfo();
unsigned Align = MFI.getObjectAlignment(FI);
- MachinePointerInfo PtrInfo(PseudoSourceValue::getFixedStack(FI));
+ MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
MachineMemOperand *MMO = MF.getMachineMemOperand(
PtrInfo, MachineMemOperand::MOStore, MFI.getObjectSize(FI), Align);
unsigned Opc = 0;
@@ -1911,7 +1936,7 @@ void AArch64InstrInfo::loadRegFromStackSlot(
MachineFunction &MF = *MBB.getParent();
MachineFrameInfo &MFI = *MF.getFrameInfo();
unsigned Align = MFI.getObjectAlignment(FI);
- MachinePointerInfo PtrInfo(PseudoSourceValue::getFixedStack(FI));
+ MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
MachineMemOperand *MMO = MF.getMachineMemOperand(
PtrInfo, MachineMemOperand::MOLoad, MFI.getObjectSize(FI), Align);
@@ -2226,11 +2251,19 @@ int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI, int &Offset,
case AArch64::LDPDi:
case AArch64::STPXi:
case AArch64::STPDi:
+ case AArch64::LDNPXi:
+ case AArch64::LDNPDi:
+ case AArch64::STNPXi:
+ case AArch64::STNPDi:
+ ImmIdx = 3;
IsSigned = true;
Scale = 8;
break;
case AArch64::LDPQi:
case AArch64::STPQi:
+ case AArch64::LDNPQi:
+ case AArch64::STNPQi:
+ ImmIdx = 3;
IsSigned = true;
Scale = 16;
break;
@@ -2238,6 +2271,11 @@ int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI, int &Offset,
case AArch64::LDPSi:
case AArch64::STPWi:
case AArch64::STPSi:
+ case AArch64::LDNPWi:
+ case AArch64::LDNPSi:
+ case AArch64::STNPWi:
+ case AArch64::STNPSi:
+ ImmIdx = 3;
IsSigned = true;
Scale = 4;
break;
@@ -2457,7 +2495,7 @@ static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO,
bool AArch64InstrInfo::getMachineCombinerPatterns(
MachineInstr &Root,
- SmallVectorImpl<MachineCombinerPattern::MC_PATTERN> &Patterns) const {
+ SmallVectorImpl<MachineCombinerPattern> &Patterns) const {
unsigned Opc = Root.getOpcode();
MachineBasicBlock &MBB = *Root.getParent();
bool Found = false;
@@ -2485,76 +2523,76 @@ bool AArch64InstrInfo::getMachineCombinerPatterns(
"ADDWrr does not have register operands");
if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr,
AArch64::WZR)) {
- Patterns.push_back(MachineCombinerPattern::MC_MULADDW_OP1);
+ Patterns.push_back(MachineCombinerPattern::MULADDW_OP1);
Found = true;
}
if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDWrrr,
AArch64::WZR)) {
- Patterns.push_back(MachineCombinerPattern::MC_MULADDW_OP2);
+ Patterns.push_back(MachineCombinerPattern::MULADDW_OP2);
Found = true;
}
break;
case AArch64::ADDXrr:
if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr,
AArch64::XZR)) {
- Patterns.push_back(MachineCombinerPattern::MC_MULADDX_OP1);
+ Patterns.push_back(MachineCombinerPattern::MULADDX_OP1);
Found = true;
}
if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDXrrr,
AArch64::XZR)) {
- Patterns.push_back(MachineCombinerPattern::MC_MULADDX_OP2);
+ Patterns.push_back(MachineCombinerPattern::MULADDX_OP2);
Found = true;
}
break;
case AArch64::SUBWrr:
if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr,
AArch64::WZR)) {
- Patterns.push_back(MachineCombinerPattern::MC_MULSUBW_OP1);
+ Patterns.push_back(MachineCombinerPattern::MULSUBW_OP1);
Found = true;
}
if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDWrrr,
AArch64::WZR)) {
- Patterns.push_back(MachineCombinerPattern::MC_MULSUBW_OP2);
+ Patterns.push_back(MachineCombinerPattern::MULSUBW_OP2);
Found = true;
}
break;
case AArch64::SUBXrr:
if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr,
AArch64::XZR)) {
- Patterns.push_back(MachineCombinerPattern::MC_MULSUBX_OP1);
+ Patterns.push_back(MachineCombinerPattern::MULSUBX_OP1);
Found = true;
}
if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDXrrr,
AArch64::XZR)) {
- Patterns.push_back(MachineCombinerPattern::MC_MULSUBX_OP2);
+ Patterns.push_back(MachineCombinerPattern::MULSUBX_OP2);
Found = true;
}
break;
case AArch64::ADDWri:
if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr,
AArch64::WZR)) {
- Patterns.push_back(MachineCombinerPattern::MC_MULADDWI_OP1);
+ Patterns.push_back(MachineCombinerPattern::MULADDWI_OP1);
Found = true;
}
break;
case AArch64::ADDXri:
if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr,
AArch64::XZR)) {
- Patterns.push_back(MachineCombinerPattern::MC_MULADDXI_OP1);
+ Patterns.push_back(MachineCombinerPattern::MULADDXI_OP1);
Found = true;
}
break;
case AArch64::SUBWri:
if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr,
AArch64::WZR)) {
- Patterns.push_back(MachineCombinerPattern::MC_MULSUBWI_OP1);
+ Patterns.push_back(MachineCombinerPattern::MULSUBWI_OP1);
Found = true;
}
break;
case AArch64::SUBXri:
if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr,
AArch64::XZR)) {
- Patterns.push_back(MachineCombinerPattern::MC_MULSUBXI_OP1);
+ Patterns.push_back(MachineCombinerPattern::MULSUBXI_OP1);
Found = true;
}
break;
@@ -2661,7 +2699,7 @@ static MachineInstr *genMaddR(MachineFunction &MF, MachineRegisterInfo &MRI,
/// this function generates the instructions that could replace the
/// original code sequence
void AArch64InstrInfo::genAlternativeCodeSequence(
- MachineInstr &Root, MachineCombinerPattern::MC_PATTERN Pattern,
+ MachineInstr &Root, MachineCombinerPattern Pattern,
SmallVectorImpl<MachineInstr *> &InsInstrs,
SmallVectorImpl<MachineInstr *> &DelInstrs,
DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const {
@@ -2677,13 +2715,13 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
default:
// signal error.
break;
- case MachineCombinerPattern::MC_MULADDW_OP1:
- case MachineCombinerPattern::MC_MULADDX_OP1:
+ case MachineCombinerPattern::MULADDW_OP1:
+ case MachineCombinerPattern::MULADDX_OP1:
// MUL I=A,B,0
// ADD R,I,C
// ==> MADD R,A,B,C
// --- Create(MADD);
- if (Pattern == MachineCombinerPattern::MC_MULADDW_OP1) {
+ if (Pattern == MachineCombinerPattern::MULADDW_OP1) {
Opc = AArch64::MADDWrrr;
RC = &AArch64::GPR32RegClass;
} else {
@@ -2692,13 +2730,13 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
}
MUL = genMadd(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
break;
- case MachineCombinerPattern::MC_MULADDW_OP2:
- case MachineCombinerPattern::MC_MULADDX_OP2:
+ case MachineCombinerPattern::MULADDW_OP2:
+ case MachineCombinerPattern::MULADDX_OP2:
// MUL I=A,B,0
// ADD R,C,I
// ==> MADD R,A,B,C
// --- Create(MADD);
- if (Pattern == MachineCombinerPattern::MC_MULADDW_OP2) {
+ if (Pattern == MachineCombinerPattern::MULADDW_OP2) {
Opc = AArch64::MADDWrrr;
RC = &AArch64::GPR32RegClass;
} else {
@@ -2707,8 +2745,8 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
}
MUL = genMadd(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
break;
- case MachineCombinerPattern::MC_MULADDWI_OP1:
- case MachineCombinerPattern::MC_MULADDXI_OP1: {
+ case MachineCombinerPattern::MULADDWI_OP1:
+ case MachineCombinerPattern::MULADDXI_OP1: {
// MUL I=A,B,0
// ADD R,I,Imm
// ==> ORR V, ZR, Imm
@@ -2716,7 +2754,7 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
// --- Create(MADD);
const TargetRegisterClass *OrrRC;
unsigned BitSize, OrrOpc, ZeroReg;
- if (Pattern == MachineCombinerPattern::MC_MULADDWI_OP1) {
+ if (Pattern == MachineCombinerPattern::MULADDWI_OP1) {
OrrOpc = AArch64::ORRWri;
OrrRC = &AArch64::GPR32spRegClass;
BitSize = 32;
@@ -2751,8 +2789,8 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
}
break;
}
- case MachineCombinerPattern::MC_MULSUBW_OP1:
- case MachineCombinerPattern::MC_MULSUBX_OP1: {
+ case MachineCombinerPattern::MULSUBW_OP1:
+ case MachineCombinerPattern::MULSUBX_OP1: {
// MUL I=A,B,0
// SUB R,I, C
// ==> SUB V, 0, C
@@ -2760,7 +2798,7 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
// --- Create(MADD);
const TargetRegisterClass *SubRC;
unsigned SubOpc, ZeroReg;
- if (Pattern == MachineCombinerPattern::MC_MULSUBW_OP1) {
+ if (Pattern == MachineCombinerPattern::MULSUBW_OP1) {
SubOpc = AArch64::SUBWrr;
SubRC = &AArch64::GPR32spRegClass;
ZeroReg = AArch64::WZR;
@@ -2784,13 +2822,13 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
break;
}
- case MachineCombinerPattern::MC_MULSUBW_OP2:
- case MachineCombinerPattern::MC_MULSUBX_OP2:
+ case MachineCombinerPattern::MULSUBW_OP2:
+ case MachineCombinerPattern::MULSUBX_OP2:
// MUL I=A,B,0
// SUB R,C,I
// ==> MSUB R,A,B,C (computes C - A*B)
// --- Create(MSUB);
- if (Pattern == MachineCombinerPattern::MC_MULSUBW_OP2) {
+ if (Pattern == MachineCombinerPattern::MULSUBW_OP2) {
Opc = AArch64::MSUBWrrr;
RC = &AArch64::GPR32RegClass;
} else {
@@ -2799,8 +2837,8 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
}
MUL = genMadd(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
break;
- case MachineCombinerPattern::MC_MULSUBWI_OP1:
- case MachineCombinerPattern::MC_MULSUBXI_OP1: {
+ case MachineCombinerPattern::MULSUBWI_OP1:
+ case MachineCombinerPattern::MULSUBXI_OP1: {
// MUL I=A,B,0
// SUB R,I, Imm
// ==> ORR V, ZR, -Imm
@@ -2808,7 +2846,7 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
// --- Create(MADD);
const TargetRegisterClass *OrrRC;
unsigned BitSize, OrrOpc, ZeroReg;
- if (Pattern == MachineCombinerPattern::MC_MULSUBWI_OP1) {
+ if (Pattern == MachineCombinerPattern::MULSUBWI_OP1) {
OrrOpc = AArch64::ORRWri;
OrrRC = &AArch64::GPR32spRegClass;
BitSize = 32;
@@ -2944,3 +2982,34 @@ bool AArch64InstrInfo::optimizeCondBranch(MachineInstr *MI) const {
MI->eraseFromParent();
return true;
}
+
+std::pair<unsigned, unsigned>
+AArch64InstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
+ const unsigned Mask = AArch64II::MO_FRAGMENT;
+ return std::make_pair(TF & Mask, TF & ~Mask);
+}
+
+ArrayRef<std::pair<unsigned, const char *>>
+AArch64InstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
+ using namespace AArch64II;
+ static const std::pair<unsigned, const char *> TargetFlags[] = {
+ {MO_PAGE, "aarch64-page"},
+ {MO_PAGEOFF, "aarch64-pageoff"},
+ {MO_G3, "aarch64-g3"},
+ {MO_G2, "aarch64-g2"},
+ {MO_G1, "aarch64-g1"},
+ {MO_G0, "aarch64-g0"},
+ {MO_HI12, "aarch64-hi12"}};
+ return makeArrayRef(TargetFlags);
+}
+
+ArrayRef<std::pair<unsigned, const char *>>
+AArch64InstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const {
+ using namespace AArch64II;
+ static const std::pair<unsigned, const char *> TargetFlags[] = {
+ {MO_GOT, "aarch64-got"},
+ {MO_NC, "aarch64-nc"},
+ {MO_TLS, "aarch64-tls"},
+ {MO_CONSTPOOL, "aarch64-constant-pool"}};
+ return makeArrayRef(TargetFlags);
+}
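
The two new hooks added at the end of the file split an operand's target flags into a "direct" part and a "bitmask" part so they can be serialized symbolically: the bits selected by AArch64II::MO_FRAGMENT name exactly one relocation fragment (page, pageoff, g0-g3, hi12), while everything outside the mask is an OR-able modifier (got, nc, tls, constant-pool), and the two getSerializable*TargetFlags() tables supply the names. A minimal sketch of that split follows; the mask and flag values are placeholders, not the real AArch64II constants.

#include <cstdio>
#include <utility>

// kFragmentMask stands in for AArch64II::MO_FRAGMENT; the bit values here
// are invented for the demo.
constexpr unsigned kFragmentMask = 0x7;
constexpr unsigned kPageOff      = 0x3;   // a "direct" fragment value
constexpr unsigned kNC           = 0x20;  // an OR-able "bitmask" flag

static std::pair<unsigned, unsigned> decomposeFlags(unsigned TF) {
  return {TF & kFragmentMask, TF & ~kFragmentMask};
}

int main() {
  std::pair<unsigned, unsigned> Split = decomposeFlags(kPageOff | kNC);
  // Prints "direct=0x3 bitmask=0x20": the direct part picks exactly one
  // fragment name, the bitmask part can carry several modifiers at once.
  printf("direct=%#x bitmask=%#x\n", Split.first, Split.second);
  return 0;
}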
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.h
index 68c2a2882580..ae02822a32e6 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -167,13 +167,13 @@ public:
/// for an instruction chain ending in <Root>. All potential patterns are
/// listed in the <Patterns> array.
bool getMachineCombinerPatterns(MachineInstr &Root,
- SmallVectorImpl<MachineCombinerPattern::MC_PATTERN> &Patterns)
+ SmallVectorImpl<MachineCombinerPattern> &Patterns)
const override;
/// When getMachineCombinerPatterns() finds patterns, this function generates
/// the instructions that could replace the original code sequence
void genAlternativeCodeSequence(
- MachineInstr &Root, MachineCombinerPattern::MC_PATTERN Pattern,
+ MachineInstr &Root, MachineCombinerPattern Pattern,
SmallVectorImpl<MachineInstr *> &InsInstrs,
SmallVectorImpl<MachineInstr *> &DelInstrs,
DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const override;
@@ -181,6 +181,14 @@ public:
bool useMachineCombiner() const override;
bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override;
+
+ std::pair<unsigned, unsigned>
+ decomposeMachineOperandsTargetFlags(unsigned TF) const override;
+ ArrayRef<std::pair<unsigned, const char *>>
+ getSerializableDirectMachineOperandTargetFlags() const override;
+ ArrayRef<std::pair<unsigned, const char *>>
+ getSerializableBitmaskMachineOperandTargetFlags() const override;
+
private:
void instantiateCondBranch(MachineBasicBlock &MBB, DebugLoc DL,
MachineBasicBlock *TBB,
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index fa1a46acba84..d02bc9ff394d 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -16,6 +16,8 @@
//
def HasV8_1a : Predicate<"Subtarget->hasV8_1aOps()">,
AssemblerPredicate<"HasV8_1aOps", "armv8.1a">;
+def HasV8_2a : Predicate<"Subtarget->hasV8_2aOps()">,
+ AssemblerPredicate<"HasV8_2aOps", "armv8.2a">;
def HasFPARMv8 : Predicate<"Subtarget->hasFPARMv8()">,
AssemblerPredicate<"FeatureFPARMv8", "fp-armv8">;
def HasNEON : Predicate<"Subtarget->hasNEON()">,
@@ -24,6 +26,12 @@ def HasCrypto : Predicate<"Subtarget->hasCrypto()">,
AssemblerPredicate<"FeatureCrypto", "crypto">;
def HasCRC : Predicate<"Subtarget->hasCRC()">,
AssemblerPredicate<"FeatureCRC", "crc">;
+def HasPerfMon : Predicate<"Subtarget->hasPerfMon()">;
+def HasFullFP16 : Predicate<"Subtarget->hasFullFP16()">,
+ AssemblerPredicate<"FeatureFullFP16", "fullfp16">;
+def HasSPE : Predicate<"Subtarget->hasSPE()">,
+ AssemblerPredicate<"FeatureSPE", "spe">;
+
def IsLE : Predicate<"Subtarget->isLittleEndian()">;
def IsBE : Predicate<"!Subtarget->isLittleEndian()">;
def IsCyclone : Predicate<"Subtarget->isCyclone()">;
@@ -66,6 +74,20 @@ def SDT_AArch64CSel : SDTypeProfile<1, 4,
SDTCisSameAs<0, 2>,
SDTCisInt<3>,
SDTCisVT<4, i32>]>;
+def SDT_AArch64CCMP : SDTypeProfile<1, 5,
+ [SDTCisVT<0, i32>,
+ SDTCisInt<1>,
+ SDTCisSameAs<1, 2>,
+ SDTCisInt<3>,
+ SDTCisInt<4>,
+ SDTCisVT<5, i32>]>;
+def SDT_AArch64FCCMP : SDTypeProfile<1, 5,
+ [SDTCisVT<0, i32>,
+ SDTCisFP<1>,
+ SDTCisSameAs<1, 2>,
+ SDTCisInt<3>,
+ SDTCisInt<4>,
+ SDTCisVT<5, i32>]>;
def SDT_AArch64FCmp : SDTypeProfile<0, 2,
[SDTCisFP<0>,
SDTCisSameAs<0, 1>]>;
@@ -160,13 +182,14 @@ def AArch64and_flag : SDNode<"AArch64ISD::ANDS", SDTBinaryArithWithFlagsOut,
def AArch64adc_flag : SDNode<"AArch64ISD::ADCS", SDTBinaryArithWithFlagsInOut>;
def AArch64sbc_flag : SDNode<"AArch64ISD::SBCS", SDTBinaryArithWithFlagsInOut>;
+def AArch64ccmp : SDNode<"AArch64ISD::CCMP", SDT_AArch64CCMP>;
+def AArch64ccmn : SDNode<"AArch64ISD::CCMN", SDT_AArch64CCMP>;
+def AArch64fccmp : SDNode<"AArch64ISD::FCCMP", SDT_AArch64FCCMP>;
+
def AArch64threadpointer : SDNode<"AArch64ISD::THREAD_POINTER", SDTPtrLeaf>;
def AArch64fcmp : SDNode<"AArch64ISD::FCMP", SDT_AArch64FCmp>;
-def AArch64fmax : SDNode<"AArch64ISD::FMAX", SDTFPBinOp>;
-def AArch64fmin : SDNode<"AArch64ISD::FMIN", SDTFPBinOp>;
-
def AArch64dup : SDNode<"AArch64ISD::DUP", SDT_AArch64Dup>;
def AArch64duplane8 : SDNode<"AArch64ISD::DUPLANE8", SDT_AArch64DupLane>;
def AArch64duplane16 : SDNode<"AArch64ISD::DUPLANE16", SDT_AArch64DupLane>;
@@ -361,6 +384,9 @@ def : InstAlias<"wfi", (HINT 0b011)>;
def : InstAlias<"sev", (HINT 0b100)>;
def : InstAlias<"sevl", (HINT 0b101)>;
+// v8.2a Statistical Profiling extension
+def : InstAlias<"psb $op", (HINT psbhint_op:$op)>, Requires<[HasSPE]>;
+
// As far as LLVM is concerned this writes to the system's exclusive monitors.
let mayLoad = 1, mayStore = 1 in
def CLREX : CRmSystemI<imm0_15, 0b010, "clrex">;
@@ -383,12 +409,17 @@ def : InstAlias<"isb", (ISB 0xf)>;
def MRS : MRSI;
def MSR : MSRI;
-def MSRpstate: MSRpstateI;
+def MSRpstateImm1 : MSRpstateImm0_1;
+def MSRpstateImm4 : MSRpstateImm0_15;
// The thread pointer (on Linux, at least, where this has been implemented) is
// TPIDR_EL0.
def : Pat<(AArch64threadpointer), (MRS 0xde82)>;
+// The cycle counter PMC register is PMCCNTR_EL0.
+let Predicates = [HasPerfMon] in
+def : Pat<(readcyclecounter), (MRS 0xdce8)>;
+
// Generic system instructions
def SYSxt : SystemXtI<0, "sys">;
def SYSLxt : SystemLXtI<1, "sysl">;
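
The readcyclecounter pattern added above maps straight to an MRS of PMCCNTR_EL0, and 0xdce8 is the packed (op0, op1, CRn, CRm, op2) system-register encoding of that register. The quick check below reproduces it, along with the 0xde82 thread-pointer constant from the surrounding context; the packing layout is my reading of those constants, not something this diff states.

#include <cassert>

// Packs (op0, op1, CRn, CRm, op2) the way the MRS patterns above appear to;
// inferred from the constants, not an API taken from the patch.
static unsigned sysreg(unsigned op0, unsigned op1, unsigned crn,
                       unsigned crm, unsigned op2) {
  return (op0 << 14) | (op1 << 11) | (crn << 7) | (crm << 3) | op2;
}

int main() {
  assert(sysreg(3, 3, 9, 13, 0) == 0xdce8);  // PMCCNTR_EL0, readcyclecounter
  assert(sysreg(3, 3, 13, 0, 2) == 0xde82);  // TPIDR_EL0, thread pointer
  return 0;
}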
@@ -595,10 +626,12 @@ def : Pat<(sub GPR32:$Rn, arith_shifted_reg32:$Rm),
(SUBSWrs GPR32:$Rn, arith_shifted_reg32:$Rm)>;
def : Pat<(sub GPR64:$Rn, arith_shifted_reg64:$Rm),
(SUBSXrs GPR64:$Rn, arith_shifted_reg64:$Rm)>;
+let AddedComplexity = 1 in {
def : Pat<(sub GPR32sp:$R2, arith_extended_reg32<i32>:$R3),
(SUBSWrx GPR32sp:$R2, arith_extended_reg32<i32>:$R3)>;
def : Pat<(sub GPR64sp:$R2, arith_extended_reg32to64<i64>:$R3),
(SUBSXrx GPR64sp:$R2, arith_extended_reg32to64<i64>:$R3)>;
+}
// Because of the immediate format for add/sub-imm instructions, the
// expression (add x, -1) must be transformed to (SUB{W,X}ri x, 1).
@@ -823,7 +856,7 @@ defm AND : LogicalReg<0b00, 0, "and", and>;
defm BIC : LogicalReg<0b00, 1, "bic",
BinOpFrag<(and node:$LHS, (not node:$RHS))>>;
defm EON : LogicalReg<0b10, 1, "eon",
- BinOpFrag<(xor node:$LHS, (not node:$RHS))>>;
+ BinOpFrag<(not (xor node:$LHS, node:$RHS))>>;
defm EOR : LogicalReg<0b10, 0, "eor", xor>;
defm ORN : LogicalReg<0b01, 1, "orn",
BinOpFrag<(or node:$LHS, (not node:$RHS))>>;
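
The EON fragment change above is semantics-preserving: for all x and y, x ^ ~y equals ~(x ^ y), so the rewrite only switches the pattern to the other shape of the same operation, presumably the one the DAG actually produces after combining. A two-line check of the identity:

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t vals[] = {0u, 1u, 0x12345678u, 0xdeadbeefu, 0xffffffffu};
  for (uint32_t x : vals)
    for (uint32_t y : vals)
      assert((x ^ ~y) == ~(x ^ y));  // old fragment == new fragment
  return 0;
}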
@@ -1020,13 +1053,10 @@ def : InstAlias<"uxth $dst, $src", (UBFMXri GPR64:$dst, GPR64:$src, 0, 15)>;
def : InstAlias<"uxtw $dst, $src", (UBFMXri GPR64:$dst, GPR64:$src, 0, 31)>;
//===----------------------------------------------------------------------===//
-// Conditionally set flags instructions.
+// Conditional comparison instructions.
//===----------------------------------------------------------------------===//
-defm CCMN : CondSetFlagsImm<0, "ccmn">;
-defm CCMP : CondSetFlagsImm<1, "ccmp">;
-
-defm CCMN : CondSetFlagsReg<0, "ccmn">;
-defm CCMP : CondSetFlagsReg<1, "ccmp">;
+defm CCMN : CondComparison<0, "ccmn", AArch64ccmn>;
+defm CCMP : CondComparison<1, "ccmp", AArch64ccmp>;
//===----------------------------------------------------------------------===//
// Conditional select instructions.
@@ -2421,6 +2451,26 @@ defm FCVTZS_Int : FPToIntegerScaled<0b11, 0b000, "fcvtzs", int_aarch64_neon_fcvt
defm FCVTZU_Int : FPToIntegerScaled<0b11, 0b001, "fcvtzu", int_aarch64_neon_fcvtzu>;
}
+multiclass FPToIntegerPats<SDNode to_int, SDNode round, string INST> {
+ def : Pat<(i32 (to_int (round f32:$Rn))),
+ (!cast<Instruction>(INST # UWSr) f32:$Rn)>;
+ def : Pat<(i64 (to_int (round f32:$Rn))),
+ (!cast<Instruction>(INST # UXSr) f32:$Rn)>;
+ def : Pat<(i32 (to_int (round f64:$Rn))),
+ (!cast<Instruction>(INST # UWDr) f64:$Rn)>;
+ def : Pat<(i64 (to_int (round f64:$Rn))),
+ (!cast<Instruction>(INST # UXDr) f64:$Rn)>;
+}
+
+defm : FPToIntegerPats<fp_to_sint, fceil, "FCVTPS">;
+defm : FPToIntegerPats<fp_to_uint, fceil, "FCVTPU">;
+defm : FPToIntegerPats<fp_to_sint, ffloor, "FCVTMS">;
+defm : FPToIntegerPats<fp_to_uint, ffloor, "FCVTMU">;
+defm : FPToIntegerPats<fp_to_sint, ftrunc, "FCVTZS">;
+defm : FPToIntegerPats<fp_to_uint, ftrunc, "FCVTZU">;
+defm : FPToIntegerPats<fp_to_sint, frnd, "FCVTAS">;
+defm : FPToIntegerPats<fp_to_uint, frnd, "FCVTAU">;
+
//===----------------------------------------------------------------------===//
// Scaled integer to floating point conversion instructions.
//===----------------------------------------------------------------------===//
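
The FPToIntegerPats multiclass added in the hunk above folds an explicit rounding node followed by a float-to-int conversion into the single AArch64 rounding-convert instructions: FCVTPS/FCVTPU for ceil, FCVTMS/FCVTMU for floor, FCVTZS/FCVTZU for trunc, FCVTAS/FCVTAU for round. In source terms, round-then-cast pairs like the ones below should each become one instruction instead of a frint* followed by a separate fcvtzs; whether a given build actually intrinsifies the libm calls depends on optimization flags, so read the asm comments as the intent of the patterns rather than a guarantee.

#include <cmath>
#include <cstdint>

int64_t floor_to_i64(double x) { return (int64_t)std::floor(x); } // fcvtms x0, d0
int64_t ceil_to_i64(double x)  { return (int64_t)std::ceil(x);  } // fcvtps x0, d0
int64_t trunc_to_i64(double x) { return (int64_t)std::trunc(x); } // fcvtzs x0, d0
int64_t round_to_i64(double x) { return (int64_t)std::round(x); } // fcvtas x0, d0
int32_t floor_to_i32(float x)  { return (int32_t)std::floor(x); } // fcvtms w0, s0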
@@ -2466,14 +2516,7 @@ defm FRINTP : SingleOperandFPData<0b1001, "frintp", fceil>;
def : Pat<(v1f64 (int_aarch64_neon_frintn (v1f64 FPR64:$Rn))),
(FRINTNDr FPR64:$Rn)>;
-// FRINTX is inserted to set the flags as required by FENV_ACCESS ON behavior
-// in the C spec. Setting hasSideEffects ensures it is not DCE'd.
-// <rdar://problem/13715968>
-// TODO: We should really model the FPSR flags correctly. This is really ugly.
-let hasSideEffects = 1 in {
defm FRINTX : SingleOperandFPData<0b1110, "frintx", frint>;
-}
-
defm FRINTZ : SingleOperandFPData<0b1011, "frintz", ftrunc>;
let SchedRW = [WriteFDiv] in {
@@ -2488,23 +2531,23 @@ defm FADD : TwoOperandFPData<0b0010, "fadd", fadd>;
let SchedRW = [WriteFDiv] in {
defm FDIV : TwoOperandFPData<0b0001, "fdiv", fdiv>;
}
-defm FMAXNM : TwoOperandFPData<0b0110, "fmaxnm", int_aarch64_neon_fmaxnm>;
-defm FMAX : TwoOperandFPData<0b0100, "fmax", AArch64fmax>;
-defm FMINNM : TwoOperandFPData<0b0111, "fminnm", int_aarch64_neon_fminnm>;
-defm FMIN : TwoOperandFPData<0b0101, "fmin", AArch64fmin>;
+defm FMAXNM : TwoOperandFPData<0b0110, "fmaxnm", fmaxnum>;
+defm FMAX : TwoOperandFPData<0b0100, "fmax", fmaxnan>;
+defm FMINNM : TwoOperandFPData<0b0111, "fminnm", fminnum>;
+defm FMIN : TwoOperandFPData<0b0101, "fmin", fminnan>;
let SchedRW = [WriteFMul] in {
defm FMUL : TwoOperandFPData<0b0000, "fmul", fmul>;
defm FNMUL : TwoOperandFPDataNeg<0b1000, "fnmul", fmul>;
}
defm FSUB : TwoOperandFPData<0b0011, "fsub", fsub>;
-def : Pat<(v1f64 (AArch64fmax (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
+def : Pat<(v1f64 (fmaxnan (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
(FMAXDrr FPR64:$Rn, FPR64:$Rm)>;
-def : Pat<(v1f64 (AArch64fmin (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
+def : Pat<(v1f64 (fminnan (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
(FMINDrr FPR64:$Rn, FPR64:$Rm)>;
-def : Pat<(v1f64 (int_aarch64_neon_fmaxnm (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
+def : Pat<(v1f64 (fmaxnum (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
(FMAXNMDrr FPR64:$Rn, FPR64:$Rm)>;
-def : Pat<(v1f64 (int_aarch64_neon_fminnm (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
+def : Pat<(v1f64 (fminnum (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
(FMINNMDrr FPR64:$Rn, FPR64:$Rm)>;
//===----------------------------------------------------------------------===//
@@ -2556,7 +2599,7 @@ defm FCMP : FPComparison<0, "fcmp", AArch64fcmp>;
//===----------------------------------------------------------------------===//
defm FCCMPE : FPCondComparison<1, "fccmpe">;
-defm FCCMP : FPCondComparison<0, "fccmp">;
+defm FCCMP : FPCondComparison<0, "fccmp", AArch64fccmp>;
//===----------------------------------------------------------------------===//
// Floating point conditional select instruction.
@@ -2589,6 +2632,40 @@ defm FMOV : FPMoveImmediate<"fmov">;
// Advanced SIMD two vector instructions.
//===----------------------------------------------------------------------===//
+defm UABDL : SIMDLongThreeVectorBHSabdl<1, 0b0111, "uabdl",
+ int_aarch64_neon_uabd>;
+// Match UABDL in log2-shuffle patterns.
+def : Pat<(xor (v8i16 (AArch64vashr v8i16:$src, (i32 15))),
+ (v8i16 (add (sub (zext (v8i8 V64:$opA)),
+ (zext (v8i8 V64:$opB))),
+ (AArch64vashr v8i16:$src, (i32 15))))),
+ (UABDLv8i8_v8i16 V64:$opA, V64:$opB)>;
+def : Pat<(xor (v8i16 (AArch64vashr v8i16:$src, (i32 15))),
+ (v8i16 (add (sub (zext (extract_high_v16i8 V128:$opA)),
+ (zext (extract_high_v16i8 V128:$opB))),
+ (AArch64vashr v8i16:$src, (i32 15))))),
+ (UABDLv16i8_v8i16 V128:$opA, V128:$opB)>;
+def : Pat<(xor (v4i32 (AArch64vashr v4i32:$src, (i32 31))),
+ (v4i32 (add (sub (zext (v4i16 V64:$opA)),
+ (zext (v4i16 V64:$opB))),
+ (AArch64vashr v4i32:$src, (i32 31))))),
+ (UABDLv4i16_v4i32 V64:$opA, V64:$opB)>;
+def : Pat<(xor (v4i32 (AArch64vashr v4i32:$src, (i32 31))),
+ (v4i32 (add (sub (zext (extract_high_v8i16 V128:$opA)),
+ (zext (extract_high_v8i16 V128:$opB))),
+ (AArch64vashr v4i32:$src, (i32 31))))),
+ (UABDLv8i16_v4i32 V128:$opA, V128:$opB)>;
+def : Pat<(xor (v2i64 (AArch64vashr v2i64:$src, (i32 63))),
+ (v2i64 (add (sub (zext (v2i32 V64:$opA)),
+ (zext (v2i32 V64:$opB))),
+ (AArch64vashr v2i64:$src, (i32 63))))),
+ (UABDLv2i32_v2i64 V64:$opA, V64:$opB)>;
+def : Pat<(xor (v2i64 (AArch64vashr v2i64:$src, (i32 63))),
+ (v2i64 (add (sub (zext (extract_high_v4i32 V128:$opA)),
+ (zext (extract_high_v4i32 V128:$opB))),
+ (AArch64vashr v2i64:$src, (i32 63))))),
+ (UABDLv4i32_v2i64 V128:$opA, V128:$opB)>;
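// In each pattern above, s = x arithmetically shifted right by
// (element bits - 1) is the sign mask of x = zext(opA) - zext(opB), and
// (x + s) ^ s is |x|, i.e. the widening absolute difference that
// UABDL/UABDL2 compute directly.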
+
defm ABS : SIMDTwoVectorBHSD<0, 0b01011, "abs", int_aarch64_neon_abs>;
def : Pat<(xor (v8i8 (AArch64vashr V64:$src, (i32 7))),
(v8i8 (add V64:$src, (AArch64vashr V64:$src, (i32 7))))),
@@ -2780,29 +2857,29 @@ defm CMGT : SIMDThreeSameVector<0, 0b00110, "cmgt", AArch64cmgt>;
defm CMHI : SIMDThreeSameVector<1, 0b00110, "cmhi", AArch64cmhi>;
defm CMHS : SIMDThreeSameVector<1, 0b00111, "cmhs", AArch64cmhs>;
defm CMTST : SIMDThreeSameVector<0, 0b10001, "cmtst", AArch64cmtst>;
-defm FABD : SIMDThreeSameVectorFP<1,1,0b11010,"fabd", int_aarch64_neon_fabd>;
-defm FACGE : SIMDThreeSameVectorFPCmp<1,0,0b11101,"facge",int_aarch64_neon_facge>;
-defm FACGT : SIMDThreeSameVectorFPCmp<1,1,0b11101,"facgt",int_aarch64_neon_facgt>;
-defm FADDP : SIMDThreeSameVectorFP<1,0,0b11010,"faddp",int_aarch64_neon_addp>;
-defm FADD : SIMDThreeSameVectorFP<0,0,0b11010,"fadd", fadd>;
-defm FCMEQ : SIMDThreeSameVectorFPCmp<0, 0, 0b11100, "fcmeq", AArch64fcmeq>;
-defm FCMGE : SIMDThreeSameVectorFPCmp<1, 0, 0b11100, "fcmge", AArch64fcmge>;
-defm FCMGT : SIMDThreeSameVectorFPCmp<1, 1, 0b11100, "fcmgt", AArch64fcmgt>;
-defm FDIV : SIMDThreeSameVectorFP<1,0,0b11111,"fdiv", fdiv>;
-defm FMAXNMP : SIMDThreeSameVectorFP<1,0,0b11000,"fmaxnmp", int_aarch64_neon_fmaxnmp>;
-defm FMAXNM : SIMDThreeSameVectorFP<0,0,0b11000,"fmaxnm", int_aarch64_neon_fmaxnm>;
-defm FMAXP : SIMDThreeSameVectorFP<1,0,0b11110,"fmaxp", int_aarch64_neon_fmaxp>;
-defm FMAX : SIMDThreeSameVectorFP<0,0,0b11110,"fmax", AArch64fmax>;
-defm FMINNMP : SIMDThreeSameVectorFP<1,1,0b11000,"fminnmp", int_aarch64_neon_fminnmp>;
-defm FMINNM : SIMDThreeSameVectorFP<0,1,0b11000,"fminnm", int_aarch64_neon_fminnm>;
-defm FMINP : SIMDThreeSameVectorFP<1,1,0b11110,"fminp", int_aarch64_neon_fminp>;
-defm FMIN : SIMDThreeSameVectorFP<0,1,0b11110,"fmin", AArch64fmin>;
+defm FABD : SIMDThreeSameVectorFP<1,1,0b010,"fabd", int_aarch64_neon_fabd>;
+defm FACGE : SIMDThreeSameVectorFPCmp<1,0,0b101,"facge",int_aarch64_neon_facge>;
+defm FACGT : SIMDThreeSameVectorFPCmp<1,1,0b101,"facgt",int_aarch64_neon_facgt>;
+defm FADDP : SIMDThreeSameVectorFP<1,0,0b010,"faddp",int_aarch64_neon_addp>;
+defm FADD : SIMDThreeSameVectorFP<0,0,0b010,"fadd", fadd>;
+defm FCMEQ : SIMDThreeSameVectorFPCmp<0, 0, 0b100, "fcmeq", AArch64fcmeq>;
+defm FCMGE : SIMDThreeSameVectorFPCmp<1, 0, 0b100, "fcmge", AArch64fcmge>;
+defm FCMGT : SIMDThreeSameVectorFPCmp<1, 1, 0b100, "fcmgt", AArch64fcmgt>;
+defm FDIV : SIMDThreeSameVectorFP<1,0,0b111,"fdiv", fdiv>;
+defm FMAXNMP : SIMDThreeSameVectorFP<1,0,0b000,"fmaxnmp", int_aarch64_neon_fmaxnmp>;
+defm FMAXNM : SIMDThreeSameVectorFP<0,0,0b000,"fmaxnm", fmaxnum>;
+defm FMAXP : SIMDThreeSameVectorFP<1,0,0b110,"fmaxp", int_aarch64_neon_fmaxp>;
+defm FMAX : SIMDThreeSameVectorFP<0,0,0b110,"fmax", fmaxnan>;
+defm FMINNMP : SIMDThreeSameVectorFP<1,1,0b000,"fminnmp", int_aarch64_neon_fminnmp>;
+defm FMINNM : SIMDThreeSameVectorFP<0,1,0b000,"fminnm", fminnum>;
+defm FMINP : SIMDThreeSameVectorFP<1,1,0b110,"fminp", int_aarch64_neon_fminp>;
+defm FMIN : SIMDThreeSameVectorFP<0,1,0b110,"fmin", fminnan>;
// NOTE: The operands of the PatFrag are reordered on FMLA/FMLS because the
// instruction expects the addend first, while the fma intrinsic puts it last.
-defm FMLA : SIMDThreeSameVectorFPTied<0, 0, 0b11001, "fmla",
+defm FMLA : SIMDThreeSameVectorFPTied<0, 0, 0b001, "fmla",
TriOpFrag<(fma node:$RHS, node:$MHS, node:$LHS)> >;
-defm FMLS : SIMDThreeSameVectorFPTied<0, 1, 0b11001, "fmls",
+defm FMLS : SIMDThreeSameVectorFPTied<0, 1, 0b001, "fmls",
TriOpFrag<(fma node:$MHS, (fneg node:$RHS), node:$LHS)> >;
// The following def pats catch the case where the LHS of an FMA is negated.
@@ -2816,11 +2893,11 @@ def : Pat<(v4f32 (fma (fneg V128:$Rn), V128:$Rm, V128:$Rd)),
def : Pat<(v2f64 (fma (fneg V128:$Rn), V128:$Rm, V128:$Rd)),
(FMLSv2f64 V128:$Rd, V128:$Rn, V128:$Rm)>;
-defm FMULX : SIMDThreeSameVectorFP<0,0,0b11011,"fmulx", int_aarch64_neon_fmulx>;
-defm FMUL : SIMDThreeSameVectorFP<1,0,0b11011,"fmul", fmul>;
-defm FRECPS : SIMDThreeSameVectorFP<0,0,0b11111,"frecps", int_aarch64_neon_frecps>;
-defm FRSQRTS : SIMDThreeSameVectorFP<0,1,0b11111,"frsqrts", int_aarch64_neon_frsqrts>;
-defm FSUB : SIMDThreeSameVectorFP<0,1,0b11010,"fsub", fsub>;
+defm FMULX : SIMDThreeSameVectorFP<0,0,0b011,"fmulx", int_aarch64_neon_fmulx>;
+defm FMUL : SIMDThreeSameVectorFP<1,0,0b011,"fmul", fmul>;
+defm FRECPS : SIMDThreeSameVectorFP<0,0,0b111,"frecps", int_aarch64_neon_frecps>;
+defm FRSQRTS : SIMDThreeSameVectorFP<0,1,0b111,"frsqrts", int_aarch64_neon_frsqrts>;
+defm FSUB : SIMDThreeSameVectorFP<0,1,0b010,"fsub", fsub>;
defm MLA : SIMDThreeSameVectorBHSTied<0, 0b10010, "mla",
TriOpFrag<(add node:$LHS, (mul node:$MHS, node:$RHS))> >;
defm MLS : SIMDThreeSameVectorBHSTied<1, 0b10010, "mls",
@@ -2833,9 +2910,9 @@ defm SABD : SIMDThreeSameVectorBHS<0,0b01110,"sabd", int_aarch64_neon_sabd>;
defm SHADD : SIMDThreeSameVectorBHS<0,0b00000,"shadd", int_aarch64_neon_shadd>;
defm SHSUB : SIMDThreeSameVectorBHS<0,0b00100,"shsub", int_aarch64_neon_shsub>;
defm SMAXP : SIMDThreeSameVectorBHS<0,0b10100,"smaxp", int_aarch64_neon_smaxp>;
-defm SMAX : SIMDThreeSameVectorBHS<0,0b01100,"smax", int_aarch64_neon_smax>;
+defm SMAX : SIMDThreeSameVectorBHS<0,0b01100,"smax", smax>;
defm SMINP : SIMDThreeSameVectorBHS<0,0b10101,"sminp", int_aarch64_neon_sminp>;
-defm SMIN : SIMDThreeSameVectorBHS<0,0b01101,"smin", int_aarch64_neon_smin>;
+defm SMIN : SIMDThreeSameVectorBHS<0,0b01101,"smin", smin>;
defm SQADD : SIMDThreeSameVector<0,0b00001,"sqadd", int_aarch64_neon_sqadd>;
defm SQDMULH : SIMDThreeSameVectorHS<0,0b10110,"sqdmulh",int_aarch64_neon_sqdmulh>;
defm SQRDMULH : SIMDThreeSameVectorHS<1,0b10110,"sqrdmulh",int_aarch64_neon_sqrdmulh>;
@@ -2852,9 +2929,9 @@ defm UABD : SIMDThreeSameVectorBHS<1,0b01110,"uabd", int_aarch64_neon_uabd>;
defm UHADD : SIMDThreeSameVectorBHS<1,0b00000,"uhadd", int_aarch64_neon_uhadd>;
defm UHSUB : SIMDThreeSameVectorBHS<1,0b00100,"uhsub", int_aarch64_neon_uhsub>;
defm UMAXP : SIMDThreeSameVectorBHS<1,0b10100,"umaxp", int_aarch64_neon_umaxp>;
-defm UMAX : SIMDThreeSameVectorBHS<1,0b01100,"umax", int_aarch64_neon_umax>;
+defm UMAX : SIMDThreeSameVectorBHS<1,0b01100,"umax", umax>;
defm UMINP : SIMDThreeSameVectorBHS<1,0b10101,"uminp", int_aarch64_neon_uminp>;
-defm UMIN : SIMDThreeSameVectorBHS<1,0b01101,"umin", int_aarch64_neon_umin>;
+defm UMIN : SIMDThreeSameVectorBHS<1,0b01101,"umin", umin>;
defm UQADD : SIMDThreeSameVector<1,0b00001,"uqadd", int_aarch64_neon_uqadd>;
defm UQRSHL : SIMDThreeSameVector<1,0b01011,"uqrshl", int_aarch64_neon_uqrshl>;
defm UQSHL : SIMDThreeSameVector<1,0b01001,"uqshl", int_aarch64_neon_uqshl>;
@@ -2879,54 +2956,6 @@ defm ORN : SIMDLogicalThreeVector<0, 0b11, "orn",
BinOpFrag<(or node:$LHS, (vnot node:$RHS))> >;
defm ORR : SIMDLogicalThreeVector<0, 0b10, "orr", or>;
-def : Pat<(v8i8 (smin V64:$Rn, V64:$Rm)),
- (SMINv8i8 V64:$Rn, V64:$Rm)>;
-def : Pat<(v4i16 (smin V64:$Rn, V64:$Rm)),
- (SMINv4i16 V64:$Rn, V64:$Rm)>;
-def : Pat<(v2i32 (smin V64:$Rn, V64:$Rm)),
- (SMINv2i32 V64:$Rn, V64:$Rm)>;
-def : Pat<(v16i8 (smin V128:$Rn, V128:$Rm)),
- (SMINv16i8 V128:$Rn, V128:$Rm)>;
-def : Pat<(v8i16 (smin V128:$Rn, V128:$Rm)),
- (SMINv8i16 V128:$Rn, V128:$Rm)>;
-def : Pat<(v4i32 (smin V128:$Rn, V128:$Rm)),
- (SMINv4i32 V128:$Rn, V128:$Rm)>;
-def : Pat<(v8i8 (smax V64:$Rn, V64:$Rm)),
- (SMAXv8i8 V64:$Rn, V64:$Rm)>;
-def : Pat<(v4i16 (smax V64:$Rn, V64:$Rm)),
- (SMAXv4i16 V64:$Rn, V64:$Rm)>;
-def : Pat<(v2i32 (smax V64:$Rn, V64:$Rm)),
- (SMAXv2i32 V64:$Rn, V64:$Rm)>;
-def : Pat<(v16i8 (smax V128:$Rn, V128:$Rm)),
- (SMAXv16i8 V128:$Rn, V128:$Rm)>;
-def : Pat<(v8i16 (smax V128:$Rn, V128:$Rm)),
- (SMAXv8i16 V128:$Rn, V128:$Rm)>;
-def : Pat<(v4i32 (smax V128:$Rn, V128:$Rm)),
- (SMAXv4i32 V128:$Rn, V128:$Rm)>;
-def : Pat<(v8i8 (umin V64:$Rn, V64:$Rm)),
- (UMINv8i8 V64:$Rn, V64:$Rm)>;
-def : Pat<(v4i16 (umin V64:$Rn, V64:$Rm)),
- (UMINv4i16 V64:$Rn, V64:$Rm)>;
-def : Pat<(v2i32 (umin V64:$Rn, V64:$Rm)),
- (UMINv2i32 V64:$Rn, V64:$Rm)>;
-def : Pat<(v16i8 (umin V128:$Rn, V128:$Rm)),
- (UMINv16i8 V128:$Rn, V128:$Rm)>;
-def : Pat<(v8i16 (umin V128:$Rn, V128:$Rm)),
- (UMINv8i16 V128:$Rn, V128:$Rm)>;
-def : Pat<(v4i32 (umin V128:$Rn, V128:$Rm)),
- (UMINv4i32 V128:$Rn, V128:$Rm)>;
-def : Pat<(v8i8 (umax V64:$Rn, V64:$Rm)),
- (UMAXv8i8 V64:$Rn, V64:$Rm)>;
-def : Pat<(v4i16 (umax V64:$Rn, V64:$Rm)),
- (UMAXv4i16 V64:$Rn, V64:$Rm)>;
-def : Pat<(v2i32 (umax V64:$Rn, V64:$Rm)),
- (UMAXv2i32 V64:$Rn, V64:$Rm)>;
-def : Pat<(v16i8 (umax V128:$Rn, V128:$Rm)),
- (UMAXv16i8 V128:$Rn, V128:$Rm)>;
-def : Pat<(v8i16 (umax V128:$Rn, V128:$Rm)),
- (UMAXv8i16 V128:$Rn, V128:$Rm)>;
-def : Pat<(v4i32 (umax V128:$Rn, V128:$Rm)),
- (UMAXv4i32 V128:$Rn, V128:$Rm)>;
def : Pat<(AArch64bsl (v8i8 V64:$Rd), V64:$Rn, V64:$Rm),
(BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>;
@@ -3052,6 +3081,14 @@ def : InstAlias<"{cmlt\t$dst.2d, $src1.2d, $src2.2d" #
"|cmlt.2d\t$dst, $src1, $src2}",
(CMGTv2i64 V128:$dst, V128:$src2, V128:$src1), 0>;
+let Predicates = [HasNEON, HasFullFP16] in {
+def : InstAlias<"{fcmle\t$dst.4h, $src1.4h, $src2.4h" #
+ "|fcmle.4h\t$dst, $src1, $src2}",
+ (FCMGEv4f16 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{fcmle\t$dst.8h, $src1.8h, $src2.8h" #
+ "|fcmle.8h\t$dst, $src1, $src2}",
+ (FCMGEv8f16 V128:$dst, V128:$src2, V128:$src1), 0>;
+}
def : InstAlias<"{fcmle\t$dst.2s, $src1.2s, $src2.2s" #
"|fcmle.2s\t$dst, $src1, $src2}",
(FCMGEv2f32 V64:$dst, V64:$src2, V64:$src1), 0>;
@@ -3062,6 +3099,14 @@ def : InstAlias<"{fcmle\t$dst.2d, $src1.2d, $src2.2d" #
"|fcmle.2d\t$dst, $src1, $src2}",
(FCMGEv2f64 V128:$dst, V128:$src2, V128:$src1), 0>;
+let Predicates = [HasNEON, HasFullFP16] in {
+def : InstAlias<"{fcmlt\t$dst.4h, $src1.4h, $src2.4h" #
+ "|fcmlt.4h\t$dst, $src1, $src2}",
+ (FCMGTv4f16 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{fcmlt\t$dst.8h, $src1.8h, $src2.8h" #
+ "|fcmlt.8h\t$dst, $src1, $src2}",
+ (FCMGTv8f16 V128:$dst, V128:$src2, V128:$src1), 0>;
+}
def : InstAlias<"{fcmlt\t$dst.2s, $src1.2s, $src2.2s" #
"|fcmlt.2s\t$dst, $src1, $src2}",
(FCMGTv2f32 V64:$dst, V64:$src2, V64:$src1), 0>;
@@ -3072,6 +3117,14 @@ def : InstAlias<"{fcmlt\t$dst.2d, $src1.2d, $src2.2d" #
"|fcmlt.2d\t$dst, $src1, $src2}",
(FCMGTv2f64 V128:$dst, V128:$src2, V128:$src1), 0>;
+let Predicates = [HasNEON, HasFullFP16] in {
+def : InstAlias<"{facle\t$dst.4h, $src1.4h, $src2.4h" #
+ "|facle.4h\t$dst, $src1, $src2}",
+ (FACGEv4f16 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{facle\t$dst.8h, $src1.8h, $src2.8h" #
+ "|facle.8h\t$dst, $src1, $src2}",
+ (FACGEv8f16 V128:$dst, V128:$src2, V128:$src1), 0>;
+}
def : InstAlias<"{facle\t$dst.2s, $src1.2s, $src2.2s" #
"|facle.2s\t$dst, $src1, $src2}",
(FACGEv2f32 V64:$dst, V64:$src2, V64:$src1), 0>;
@@ -3082,6 +3135,14 @@ def : InstAlias<"{facle\t$dst.2d, $src1.2d, $src2.2d" #
"|facle.2d\t$dst, $src1, $src2}",
(FACGEv2f64 V128:$dst, V128:$src2, V128:$src1), 0>;
+let Predicates = [HasNEON, HasFullFP16] in {
+def : InstAlias<"{faclt\t$dst.4h, $src1.4h, $src2.4h" #
+ "|faclt.4h\t$dst, $src1, $src2}",
+ (FACGTv4f16 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{faclt\t$dst.8h, $src1.8h, $src2.8h" #
+ "|faclt.8h\t$dst, $src1, $src2}",
+ (FACGTv8f16 V128:$dst, V128:$src2, V128:$src1), 0>;
+}
def : InstAlias<"{faclt\t$dst.2s, $src1.2s, $src2.2s" #
"|faclt.2s\t$dst, $src1, $src2}",
(FACGTv2f32 V64:$dst, V64:$src2, V64:$src1), 0>;
@@ -3103,19 +3164,19 @@ defm CMGT : SIMDThreeScalarD<0, 0b00110, "cmgt", AArch64cmgt>;
defm CMHI : SIMDThreeScalarD<1, 0b00110, "cmhi", AArch64cmhi>;
defm CMHS : SIMDThreeScalarD<1, 0b00111, "cmhs", AArch64cmhs>;
defm CMTST : SIMDThreeScalarD<0, 0b10001, "cmtst", AArch64cmtst>;
-defm FABD : SIMDThreeScalarSD<1, 1, 0b11010, "fabd", int_aarch64_sisd_fabd>;
+defm FABD : SIMDFPThreeScalar<1, 1, 0b010, "fabd", int_aarch64_sisd_fabd>;
def : Pat<(v1f64 (int_aarch64_neon_fabd (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
(FABD64 FPR64:$Rn, FPR64:$Rm)>;
-defm FACGE : SIMDThreeScalarFPCmp<1, 0, 0b11101, "facge",
+defm FACGE : SIMDThreeScalarFPCmp<1, 0, 0b101, "facge",
int_aarch64_neon_facge>;
-defm FACGT : SIMDThreeScalarFPCmp<1, 1, 0b11101, "facgt",
+defm FACGT : SIMDThreeScalarFPCmp<1, 1, 0b101, "facgt",
int_aarch64_neon_facgt>;
-defm FCMEQ : SIMDThreeScalarFPCmp<0, 0, 0b11100, "fcmeq", AArch64fcmeq>;
-defm FCMGE : SIMDThreeScalarFPCmp<1, 0, 0b11100, "fcmge", AArch64fcmge>;
-defm FCMGT : SIMDThreeScalarFPCmp<1, 1, 0b11100, "fcmgt", AArch64fcmgt>;
-defm FMULX : SIMDThreeScalarSD<0, 0, 0b11011, "fmulx", int_aarch64_neon_fmulx>;
-defm FRECPS : SIMDThreeScalarSD<0, 0, 0b11111, "frecps", int_aarch64_neon_frecps>;
-defm FRSQRTS : SIMDThreeScalarSD<0, 1, 0b11111, "frsqrts", int_aarch64_neon_frsqrts>;
+defm FCMEQ : SIMDThreeScalarFPCmp<0, 0, 0b100, "fcmeq", AArch64fcmeq>;
+defm FCMGE : SIMDThreeScalarFPCmp<1, 0, 0b100, "fcmge", AArch64fcmge>;
+defm FCMGT : SIMDThreeScalarFPCmp<1, 1, 0b100, "fcmgt", AArch64fcmgt>;
+defm FMULX : SIMDFPThreeScalar<0, 0, 0b011, "fmulx", int_aarch64_neon_fmulx>;
+defm FRECPS : SIMDFPThreeScalar<0, 0, 0b111, "frecps", int_aarch64_neon_frecps>;
+defm FRSQRTS : SIMDFPThreeScalar<0, 1, 0b111, "frsqrts", int_aarch64_neon_frsqrts>;
defm SQADD : SIMDThreeScalarBHSD<0, 0b00001, "sqadd", int_aarch64_neon_sqadd>;
defm SQDMULH : SIMDThreeScalarHS< 0, 0b10110, "sqdmulh", int_aarch64_neon_sqdmulh>;
defm SQRDMULH : SIMDThreeScalarHS< 1, 0b10110, "sqrdmulh", int_aarch64_neon_sqrdmulh>;
@@ -3198,35 +3259,35 @@ defm CMGE : SIMDCmpTwoScalarD< 1, 0b01000, "cmge", AArch64cmgez>;
defm CMGT : SIMDCmpTwoScalarD< 0, 0b01000, "cmgt", AArch64cmgtz>;
defm CMLE : SIMDCmpTwoScalarD< 1, 0b01001, "cmle", AArch64cmlez>;
defm CMLT : SIMDCmpTwoScalarD< 0, 0b01010, "cmlt", AArch64cmltz>;
-defm FCMEQ : SIMDCmpTwoScalarSD<0, 1, 0b01101, "fcmeq", AArch64fcmeqz>;
-defm FCMGE : SIMDCmpTwoScalarSD<1, 1, 0b01100, "fcmge", AArch64fcmgez>;
-defm FCMGT : SIMDCmpTwoScalarSD<0, 1, 0b01100, "fcmgt", AArch64fcmgtz>;
-defm FCMLE : SIMDCmpTwoScalarSD<1, 1, 0b01101, "fcmle", AArch64fcmlez>;
-defm FCMLT : SIMDCmpTwoScalarSD<0, 1, 0b01110, "fcmlt", AArch64fcmltz>;
-defm FCVTAS : SIMDTwoScalarSD< 0, 0, 0b11100, "fcvtas">;
-defm FCVTAU : SIMDTwoScalarSD< 1, 0, 0b11100, "fcvtau">;
-defm FCVTMS : SIMDTwoScalarSD< 0, 0, 0b11011, "fcvtms">;
-defm FCVTMU : SIMDTwoScalarSD< 1, 0, 0b11011, "fcvtmu">;
-defm FCVTNS : SIMDTwoScalarSD< 0, 0, 0b11010, "fcvtns">;
-defm FCVTNU : SIMDTwoScalarSD< 1, 0, 0b11010, "fcvtnu">;
-defm FCVTPS : SIMDTwoScalarSD< 0, 1, 0b11010, "fcvtps">;
-defm FCVTPU : SIMDTwoScalarSD< 1, 1, 0b11010, "fcvtpu">;
+defm FCMEQ : SIMDFPCmpTwoScalar<0, 1, 0b01101, "fcmeq", AArch64fcmeqz>;
+defm FCMGE : SIMDFPCmpTwoScalar<1, 1, 0b01100, "fcmge", AArch64fcmgez>;
+defm FCMGT : SIMDFPCmpTwoScalar<0, 1, 0b01100, "fcmgt", AArch64fcmgtz>;
+defm FCMLE : SIMDFPCmpTwoScalar<1, 1, 0b01101, "fcmle", AArch64fcmlez>;
+defm FCMLT : SIMDFPCmpTwoScalar<0, 1, 0b01110, "fcmlt", AArch64fcmltz>;
+defm FCVTAS : SIMDFPTwoScalar< 0, 0, 0b11100, "fcvtas">;
+defm FCVTAU : SIMDFPTwoScalar< 1, 0, 0b11100, "fcvtau">;
+defm FCVTMS : SIMDFPTwoScalar< 0, 0, 0b11011, "fcvtms">;
+defm FCVTMU : SIMDFPTwoScalar< 1, 0, 0b11011, "fcvtmu">;
+defm FCVTNS : SIMDFPTwoScalar< 0, 0, 0b11010, "fcvtns">;
+defm FCVTNU : SIMDFPTwoScalar< 1, 0, 0b11010, "fcvtnu">;
+defm FCVTPS : SIMDFPTwoScalar< 0, 1, 0b11010, "fcvtps">;
+defm FCVTPU : SIMDFPTwoScalar< 1, 1, 0b11010, "fcvtpu">;
def FCVTXNv1i64 : SIMDInexactCvtTwoScalar<0b10110, "fcvtxn">;
-defm FCVTZS : SIMDTwoScalarSD< 0, 1, 0b11011, "fcvtzs">;
-defm FCVTZU : SIMDTwoScalarSD< 1, 1, 0b11011, "fcvtzu">;
-defm FRECPE : SIMDTwoScalarSD< 0, 1, 0b11101, "frecpe">;
-defm FRECPX : SIMDTwoScalarSD< 0, 1, 0b11111, "frecpx">;
-defm FRSQRTE : SIMDTwoScalarSD< 1, 1, 0b11101, "frsqrte">;
+defm FCVTZS : SIMDFPTwoScalar< 0, 1, 0b11011, "fcvtzs">;
+defm FCVTZU : SIMDFPTwoScalar< 1, 1, 0b11011, "fcvtzu">;
+defm FRECPE : SIMDFPTwoScalar< 0, 1, 0b11101, "frecpe">;
+defm FRECPX : SIMDFPTwoScalar< 0, 1, 0b11111, "frecpx">;
+defm FRSQRTE : SIMDFPTwoScalar< 1, 1, 0b11101, "frsqrte">;
defm NEG : SIMDTwoScalarD< 1, 0b01011, "neg",
UnOpFrag<(sub immAllZerosV, node:$LHS)> >;
-defm SCVTF : SIMDTwoScalarCVTSD< 0, 0, 0b11101, "scvtf", AArch64sitof>;
+defm SCVTF : SIMDFPTwoScalarCVT< 0, 0, 0b11101, "scvtf", AArch64sitof>;
defm SQABS : SIMDTwoScalarBHSD< 0, 0b00111, "sqabs", int_aarch64_neon_sqabs>;
defm SQNEG : SIMDTwoScalarBHSD< 1, 0b00111, "sqneg", int_aarch64_neon_sqneg>;
defm SQXTN : SIMDTwoScalarMixedBHS< 0, 0b10100, "sqxtn", int_aarch64_neon_scalar_sqxtn>;
defm SQXTUN : SIMDTwoScalarMixedBHS< 1, 0b10010, "sqxtun", int_aarch64_neon_scalar_sqxtun>;
defm SUQADD : SIMDTwoScalarBHSDTied< 0, 0b00011, "suqadd",
int_aarch64_neon_suqadd>;
-defm UCVTF : SIMDTwoScalarCVTSD< 1, 0, 0b11101, "ucvtf", AArch64uitof>;
+defm UCVTF : SIMDFPTwoScalarCVT< 1, 0, 0b11101, "ucvtf", AArch64uitof>;
defm UQXTN : SIMDTwoScalarMixedBHS<1, 0b10100, "uqxtn", int_aarch64_neon_scalar_uqxtn>;
defm USQADD : SIMDTwoScalarBHSDTied< 1, 0b00011, "usqadd",
int_aarch64_neon_usqadd>;
@@ -3390,8 +3451,6 @@ defm SSUBW : SIMDWideThreeVectorBHS<0, 0b0011, "ssubw",
BinOpFrag<(sub node:$LHS, (sext node:$RHS))>>;
defm UABAL : SIMDLongThreeVectorTiedBHSabal<1, 0b0101, "uabal",
int_aarch64_neon_uabd>;
-defm UABDL : SIMDLongThreeVectorBHSabdl<1, 0b0111, "uabdl",
- int_aarch64_neon_uabd>;
defm UADDL : SIMDLongThreeVectorBHS<1, 0b0000, "uaddl",
BinOpFrag<(add (zext node:$LHS), (zext node:$RHS))>>;
defm UADDW : SIMDWideThreeVectorBHS<1, 0b0001, "uaddw",
@@ -3449,8 +3508,8 @@ defm : Neon_mulacc_widen_patterns<
// Patterns for 64-bit pmull
def : Pat<(int_aarch64_neon_pmull64 V64:$Rn, V64:$Rm),
(PMULLv1i64 V64:$Rn, V64:$Rm)>;
-def : Pat<(int_aarch64_neon_pmull64 (vector_extract (v2i64 V128:$Rn), (i64 1)),
- (vector_extract (v2i64 V128:$Rm), (i64 1))),
+def : Pat<(int_aarch64_neon_pmull64 (extractelt (v2i64 V128:$Rn), (i64 1)),
+ (extractelt (v2i64 V128:$Rm), (i64 1))),
(PMULLv2i64 V128:$Rn, V128:$Rm)>;
// CodeGen patterns for addhn and subhn instructions, which can actually be
@@ -3593,11 +3652,11 @@ defm CPY : SIMDScalarCPY<"cpy">;
//----------------------------------------------------------------------------
defm ADDP : SIMDPairwiseScalarD<0, 0b11011, "addp">;
-defm FADDP : SIMDPairwiseScalarSD<1, 0, 0b01101, "faddp">;
-defm FMAXNMP : SIMDPairwiseScalarSD<1, 0, 0b01100, "fmaxnmp">;
-defm FMAXP : SIMDPairwiseScalarSD<1, 0, 0b01111, "fmaxp">;
-defm FMINNMP : SIMDPairwiseScalarSD<1, 1, 0b01100, "fminnmp">;
-defm FMINP : SIMDPairwiseScalarSD<1, 1, 0b01111, "fminp">;
+defm FADDP : SIMDFPPairwiseScalar<0, 0b01101, "faddp">;
+defm FMAXNMP : SIMDFPPairwiseScalar<0, 0b01100, "fmaxnmp">;
+defm FMAXP : SIMDFPPairwiseScalar<0, 0b01111, "fmaxp">;
+defm FMINNMP : SIMDFPPairwiseScalar<1, 0b01100, "fminnmp">;
+defm FMINP : SIMDFPPairwiseScalar<1, 0b01111, "fminp">;
def : Pat<(v2i64 (AArch64saddv V128:$Rn)),
(INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), (ADDPv2i64p V128:$Rn), dsub)>;
def : Pat<(v2i64 (AArch64uaddv V128:$Rn)),
@@ -3713,12 +3772,12 @@ defm : DUPWithTruncPats<v8i16, v2i32, v4i32, i32, DUPv8i16lane, VecIndex_x2>;
multiclass DUPWithTrunci64Pats<ValueType ResVT, Instruction DUP,
SDNodeXForm IdxXFORM> {
- def : Pat<(ResVT (AArch64dup (i32 (trunc (vector_extract (v2i64 V128:$Rn),
+ def : Pat<(ResVT (AArch64dup (i32 (trunc (extractelt (v2i64 V128:$Rn),
imm:$idx))))),
(DUP V128:$Rn, (IdxXFORM imm:$idx))>;
- def : Pat<(ResVT (AArch64dup (i32 (trunc (vector_extract (v1i64 V64:$Rn),
- imm:$idx))))),
+ def : Pat<(ResVT (AArch64dup (i32 (trunc (extractelt (v1i64 V64:$Rn),
+ imm:$idx))))),
(DUP (SUBREG_TO_REG (i64 0), V64:$Rn, dsub), (IdxXFORM imm:$idx))>;
}
@@ -3747,6 +3806,13 @@ def : Pat<(sext_inreg (vector_extract (v8i16 V128:$Rn), VectorIndexH:$idx),i16),
def : Pat<(sext (i32 (vector_extract (v4i32 V128:$Rn), VectorIndexS:$idx))),
(i64 (SMOVvi32to64 V128:$Rn, VectorIndexS:$idx))>;
+def : Pat<(sext_inreg (i64 (anyext (i32 (vector_extract (v16i8 V128:$Rn),
+ VectorIndexB:$idx)))), i8),
+ (i64 (SMOVvi8to64 V128:$Rn, VectorIndexB:$idx))>;
+def : Pat<(sext_inreg (i64 (anyext (i32 (vector_extract (v8i16 V128:$Rn),
+ VectorIndexH:$idx)))), i16),
+ (i64 (SMOVvi16to64 V128:$Rn, VectorIndexH:$idx))>;
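// A sign extension of an i8/i16 lane all the way to i64 reaches instruction
// selection as an anyext to i64 wrapped in sext_inreg; SMOV straight to an
// X register performs both steps at once.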
+
// Extracting i8 or i16 elements will have the zero-extend transformed to
// an 'and' mask by type legalization since neither i8 nor i16 are legal types
// for AArch64. Match these patterns here since UMOV already zeroes out the high
@@ -3784,6 +3850,11 @@ def : Pat<(v2i64 (scalar_to_vector (i64 FPR64:$Rn))),
(v2i64 (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)),
(i64 FPR64:$Rn), dsub))>;
+def : Pat<(v4f16 (scalar_to_vector (f16 FPR16:$Rn))),
+ (INSERT_SUBREG (v4f16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>;
+def : Pat<(v8f16 (scalar_to_vector (f16 FPR16:$Rn))),
+ (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>;
+
def : Pat<(v4f32 (scalar_to_vector (f32 FPR32:$Rn))),
(INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR32:$Rn, ssub)>;
def : Pat<(v2f32 (scalar_to_vector (f32 FPR32:$Rn))),
@@ -3949,10 +4020,10 @@ defm UMAXV : SIMDAcrossLanesBHS<1, 0b01010, "umaxv">;
defm UMINV : SIMDAcrossLanesBHS<1, 0b11010, "uminv">;
defm SADDLV : SIMDAcrossLanesHSD<0, 0b00011, "saddlv">;
defm UADDLV : SIMDAcrossLanesHSD<1, 0b00011, "uaddlv">;
-defm FMAXNMV : SIMDAcrossLanesS<0b01100, 0, "fmaxnmv", int_aarch64_neon_fmaxnmv>;
-defm FMAXV : SIMDAcrossLanesS<0b01111, 0, "fmaxv", int_aarch64_neon_fmaxv>;
-defm FMINNMV : SIMDAcrossLanesS<0b01100, 1, "fminnmv", int_aarch64_neon_fminnmv>;
-defm FMINV : SIMDAcrossLanesS<0b01111, 1, "fminv", int_aarch64_neon_fminv>;
+defm FMAXNMV : SIMDFPAcrossLanes<0b01100, 0, "fmaxnmv", int_aarch64_neon_fmaxnmv>;
+defm FMAXV : SIMDFPAcrossLanes<0b01111, 0, "fmaxv", int_aarch64_neon_fmaxv>;
+defm FMINNMV : SIMDFPAcrossLanes<0b01100, 1, "fminnmv", int_aarch64_neon_fminnmv>;
+defm FMINV : SIMDFPAcrossLanes<0b01111, 1, "fminv", int_aarch64_neon_fminv>;
// Patterns for across-vector intrinsics, that have a node equivalent, that
// returns a vector (with only the low lane defined) instead of a scalar.
@@ -4199,15 +4270,23 @@ def : InstAlias<"orr.2s $Vd, $imm", (ORRv2i32 V64:$Vd, imm0_255:$imm, 0), 0>;
def : InstAlias<"orr.4s $Vd, $imm", (ORRv4i32 V128:$Vd, imm0_255:$imm, 0), 0>;
// AdvSIMD FMOV
-def FMOVv2f64_ns : SIMDModifiedImmVectorNoShift<1, 1, 0b1111, V128, fpimm8,
+def FMOVv2f64_ns : SIMDModifiedImmVectorNoShift<1, 1, 0, 0b1111, V128, fpimm8,
"fmov", ".2d",
[(set (v2f64 V128:$Rd), (AArch64fmov imm0_255:$imm8))]>;
-def FMOVv2f32_ns : SIMDModifiedImmVectorNoShift<0, 0, 0b1111, V64, fpimm8,
+def FMOVv2f32_ns : SIMDModifiedImmVectorNoShift<0, 0, 0, 0b1111, V64, fpimm8,
"fmov", ".2s",
[(set (v2f32 V64:$Rd), (AArch64fmov imm0_255:$imm8))]>;
-def FMOVv4f32_ns : SIMDModifiedImmVectorNoShift<1, 0, 0b1111, V128, fpimm8,
+def FMOVv4f32_ns : SIMDModifiedImmVectorNoShift<1, 0, 0, 0b1111, V128, fpimm8,
"fmov", ".4s",
[(set (v4f32 V128:$Rd), (AArch64fmov imm0_255:$imm8))]>;
+let Predicates = [HasNEON, HasFullFP16] in {
+def FMOVv4f16_ns : SIMDModifiedImmVectorNoShift<0, 0, 1, 0b1111, V64, fpimm8,
+ "fmov", ".4h",
+ [(set (v4f16 V64:$Rd), (AArch64fmov imm0_255:$imm8))]>;
+def FMOVv8f16_ns : SIMDModifiedImmVectorNoShift<1, 0, 1, 0b1111, V128, fpimm8,
+ "fmov", ".8h",
+ [(set (v8f16 V128:$Rd), (AArch64fmov imm0_255:$imm8))]>;
+} // Predicates = [HasNEON, HasFullFP16]
// AdvSIMD MOVI
@@ -4235,7 +4314,7 @@ def : Pat<(v8i8 immAllOnesV), (MOVID (i32 255))>;
// The movi_edit node has the immediate value already encoded, so we use
// a plain imm0_255 in the pattern
let isReMaterializable = 1, isAsCheapAsAMove = 1 in
-def MOVIv2d_ns : SIMDModifiedImmVectorNoShift<1, 1, 0b1110, V128,
+def MOVIv2d_ns : SIMDModifiedImmVectorNoShift<1, 1, 0, 0b1110, V128,
simdimmtype10,
"movi", ".2d",
[(set (v2i64 V128:$Rd), (AArch64movi_edit imm0_255:$imm8))]>;
@@ -4296,10 +4375,10 @@ def MOVIv4s_msl : SIMDModifiedImmMoveMSL<1, 0, {1,1,0,?}, V128, "movi", ".4s",
(AArch64movi_msl imm0_255:$imm8, (i32 imm:$shift)))]>;
// Per byte: 8b & 16b
-def MOVIv8b_ns : SIMDModifiedImmVectorNoShift<0, 0, 0b1110, V64, imm0_255,
+def MOVIv8b_ns : SIMDModifiedImmVectorNoShift<0, 0, 0, 0b1110, V64, imm0_255,
"movi", ".8b",
[(set (v8i8 V64:$Rd), (AArch64movi imm0_255:$imm8))]>;
-def MOVIv16b_ns : SIMDModifiedImmVectorNoShift<1, 0, 0b1110, V128, imm0_255,
+def MOVIv16b_ns : SIMDModifiedImmVectorNoShift<1, 0, 0, 0b1110, V128, imm0_255,
"movi", ".16b",
[(set (v16i8 V128:$Rd), (AArch64movi imm0_255:$imm8))]>;
@@ -4340,8 +4419,8 @@ def MVNIv4s_msl : SIMDModifiedImmMoveMSL<1, 1, {1,1,0,?}, V128, "mvni", ".4s",
//----------------------------------------------------------------------------
let hasSideEffects = 0 in {
- defm FMLA : SIMDFPIndexedSDTied<0, 0b0001, "fmla">;
- defm FMLS : SIMDFPIndexedSDTied<0, 0b0101, "fmls">;
+ defm FMLA : SIMDFPIndexedTied<0, 0b0001, "fmla">;
+ defm FMLS : SIMDFPIndexedTied<0, 0b0101, "fmls">;
}
// NOTE: Operands are reordered in the FMLA/FMLS PatFrags because the
@@ -4349,18 +4428,18 @@ let hasSideEffects = 0 in {
// On the other hand, there are quite a few valid combinatorial options due to
// the commutativity of multiplication and the fact that (-x) * y = x * (-y).
-defm : SIMDFPIndexedSDTiedPatterns<"FMLA",
+defm : SIMDFPIndexedTiedPatterns<"FMLA",
TriOpFrag<(fma node:$RHS, node:$MHS, node:$LHS)>>;
-defm : SIMDFPIndexedSDTiedPatterns<"FMLA",
+defm : SIMDFPIndexedTiedPatterns<"FMLA",
TriOpFrag<(fma node:$MHS, node:$RHS, node:$LHS)>>;
-defm : SIMDFPIndexedSDTiedPatterns<"FMLS",
+defm : SIMDFPIndexedTiedPatterns<"FMLS",
TriOpFrag<(fma node:$MHS, (fneg node:$RHS), node:$LHS)> >;
-defm : SIMDFPIndexedSDTiedPatterns<"FMLS",
+defm : SIMDFPIndexedTiedPatterns<"FMLS",
TriOpFrag<(fma node:$RHS, (fneg node:$MHS), node:$LHS)> >;
-defm : SIMDFPIndexedSDTiedPatterns<"FMLS",
+defm : SIMDFPIndexedTiedPatterns<"FMLS",
TriOpFrag<(fma (fneg node:$RHS), node:$MHS, node:$LHS)> >;
-defm : SIMDFPIndexedSDTiedPatterns<"FMLS",
+defm : SIMDFPIndexedTiedPatterns<"FMLS",
TriOpFrag<(fma (fneg node:$MHS), node:$RHS, node:$LHS)> >;
multiclass FMLSIndexedAfterNegPatterns<SDPatternOperator OpNode> {
@@ -4424,7 +4503,9 @@ multiclass FMLSIndexedAfterNegPatterns<SDPatternOperator OpNode> {
(FMLSv1i32_indexed FPR32:$Rd, FPR32:$Rn,
V128:$Rm, VectorIndexS:$idx)>;
def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn),
- (vector_extract (v2f32 (fneg V64:$Rm)),
+ (vector_extract (v4f32 (insert_subvector undef,
+ (v2f32 (fneg V64:$Rm)),
+ (i32 0))),
VectorIndexS:$idx))),
(FMLSv1i32_indexed FPR32:$Rd, FPR32:$Rn,
(SUBREG_TO_REG (i32 0), V64:$Rm, dsub), VectorIndexS:$idx)>;
@@ -4442,8 +4523,8 @@ defm : FMLSIndexedAfterNegPatterns<
defm : FMLSIndexedAfterNegPatterns<
TriOpFrag<(fma node:$MHS, node:$RHS, node:$LHS)> >;
-defm FMULX : SIMDFPIndexedSD<1, 0b1001, "fmulx", int_aarch64_neon_fmulx>;
-defm FMUL : SIMDFPIndexedSD<0, 0b1001, "fmul", fmul>;
+defm FMULX : SIMDFPIndexed<1, 0b1001, "fmulx", int_aarch64_neon_fmulx>;
+defm FMUL : SIMDFPIndexed<0, 0b1001, "fmul", fmul>;
def : Pat<(v2f32 (fmul V64:$Rn, (AArch64dup (f32 FPR32:$Rm)))),
(FMULv2i32_indexed V64:$Rn,
@@ -4497,10 +4578,10 @@ def : Pat<(int_aarch64_neon_sqdmulls_scalar (i32 FPR32:$Rn),
//----------------------------------------------------------------------------
// AdvSIMD scalar shift instructions
//----------------------------------------------------------------------------
-defm FCVTZS : SIMDScalarRShiftSD<0, 0b11111, "fcvtzs">;
-defm FCVTZU : SIMDScalarRShiftSD<1, 0b11111, "fcvtzu">;
-defm SCVTF : SIMDScalarRShiftSD<0, 0b11100, "scvtf">;
-defm UCVTF : SIMDScalarRShiftSD<1, 0b11100, "ucvtf">;
+defm FCVTZS : SIMDFPScalarRShift<0, 0b11111, "fcvtzs">;
+defm FCVTZU : SIMDFPScalarRShift<1, 0b11111, "fcvtzu">;
+defm SCVTF : SIMDFPScalarRShift<0, 0b11100, "scvtf">;
+defm UCVTF : SIMDFPScalarRShift<1, 0b11100, "ucvtf">;
// Codegen patterns for the above. We don't put these directly on the
// instructions because TableGen's type inference can't handle the truth.
// Having the same base pattern for fp <--> int totally freaks it out.
@@ -4573,7 +4654,7 @@ defm USRA : SIMDScalarRShiftDTied< 1, 0b00010, "usra",
//----------------------------------------------------------------------------
defm FCVTZS:SIMDVectorRShiftSD<0, 0b11111, "fcvtzs", int_aarch64_neon_vcvtfp2fxs>;
defm FCVTZU:SIMDVectorRShiftSD<1, 0b11111, "fcvtzu", int_aarch64_neon_vcvtfp2fxu>;
-defm SCVTF: SIMDVectorRShiftSDToFP<0, 0b11100, "scvtf",
+defm SCVTF: SIMDVectorRShiftToFP<0, 0b11100, "scvtf",
int_aarch64_neon_vcvtfxs2fp>;
defm RSHRN : SIMDVectorRShiftNarrowBHS<0, 0b10001, "rshrn",
int_aarch64_neon_rshrn>;
@@ -4608,7 +4689,7 @@ defm SSHLL : SIMDVectorLShiftLongBHSD<0, 0b10100, "sshll",
defm SSHR : SIMDVectorRShiftBHSD<0, 0b00000, "sshr", AArch64vashr>;
defm SSRA : SIMDVectorRShiftBHSDTied<0, 0b00010, "ssra",
TriOpFrag<(add node:$LHS, (AArch64vashr node:$MHS, node:$RHS))>>;
-defm UCVTF : SIMDVectorRShiftSDToFP<1, 0b11100, "ucvtf",
+defm UCVTF : SIMDVectorRShiftToFP<1, 0b11100, "ucvtf",
int_aarch64_neon_vcvtfxu2fp>;
defm UQRSHRN : SIMDVectorRShiftNarrowBHS<1, 0b10011, "uqrshrn",
int_aarch64_neon_uqrshrn>;
@@ -5133,10 +5214,10 @@ def : Pat<(i64 (zext def32:$src)), (SUBREG_TO_REG (i64 0), GPR32:$src, sub_32)>;
def : Pat<(i64 (anyext GPR32:$src)),
(INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$src, sub_32)>;
-// When we need to explicitly zero-extend, we use an unsigned bitfield move
-// instruction (UBFM) on the enclosing super-reg.
+// When we need to explicitly zero-extend, we use a 32-bit MOV instruction and
+// then assert the extension has happened.
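// ORRWrs WZR, $src, 0 is a plain MOV, and any write to a W register zeroes
// bits [63:32] of the corresponding X register, so SUBREG_TO_REG can
// legitimately claim the high half is already zero.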
def : Pat<(i64 (zext GPR32:$src)),
- (UBFMXri (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$src, sub_32), 0, 31)>;
+ (SUBREG_TO_REG (i32 0), (ORRWrs WZR, GPR32:$src, 0), sub_32)>;
// To sign extend, we use a signed bitfield move instruction (SBFM) on the
// containing super-reg.
@@ -5801,6 +5882,21 @@ def : Pat<(v16i8 (bitconvert (v8f16 FPR128:$src))),
(v16i8 (REV16v16i8 FPR128:$src))>;
}
+def : Pat<(v4i16 (extract_subvector V128:$Rn, (i64 0))),
+ (EXTRACT_SUBREG V128:$Rn, dsub)>;
+def : Pat<(v8i8 (extract_subvector V128:$Rn, (i64 0))),
+ (EXTRACT_SUBREG V128:$Rn, dsub)>;
+def : Pat<(v2f32 (extract_subvector V128:$Rn, (i64 0))),
+ (EXTRACT_SUBREG V128:$Rn, dsub)>;
+def : Pat<(v4f16 (extract_subvector V128:$Rn, (i64 0))),
+ (EXTRACT_SUBREG V128:$Rn, dsub)>;
+def : Pat<(v2i32 (extract_subvector V128:$Rn, (i64 0))),
+ (EXTRACT_SUBREG V128:$Rn, dsub)>;
+def : Pat<(v1i64 (extract_subvector V128:$Rn, (i64 0))),
+ (EXTRACT_SUBREG V128:$Rn, dsub)>;
+def : Pat<(v1f64 (extract_subvector V128:$Rn, (i64 0))),
+ (EXTRACT_SUBREG V128:$Rn, dsub)>;
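// Taking the low 64-bit half of a 128-bit vector needs no instruction at all:
// it is simply the D subregister of the Q register.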
+
def : Pat<(v8i8 (extract_subvector (v16i8 FPR128:$Rn), (i64 1))),
(EXTRACT_SUBREG (DUPv2i64lane FPR128:$Rn, 1), dsub)>;
def : Pat<(v4i16 (extract_subvector (v8i16 FPR128:$Rn), (i64 1))),
@@ -5852,6 +5948,45 @@ def : Pat<(i64 (int_aarch64_neon_srshl (i64 FPR64:$Rn), (i64 FPR64:$Rm))),
def : Pat<(i64 (int_aarch64_neon_urshl (i64 FPR64:$Rn), (i64 FPR64:$Rm))),
(URSHLv1i64 FPR64:$Rn, FPR64:$Rm)>;
+// Patterns for nontemporal/no-allocate stores.
+// We have to resort to tricks to turn a single-input store into a store pair,
+// because there is no single-input nontemporal store, only STNP.
+let Predicates = [IsLE] in {
+let AddedComplexity = 15 in {
+class NTStore128Pat<ValueType VT> :
+ Pat<(nontemporalstore (VT FPR128:$Rt),
+ (am_indexed7s64 GPR64sp:$Rn, simm7s8:$offset)),
+ (STNPDi (EXTRACT_SUBREG FPR128:$Rt, dsub),
+ (CPYi64 FPR128:$Rt, (i64 1)),
+ GPR64sp:$Rn, simm7s8:$offset)>;
+
+def : NTStore128Pat<v2i64>;
+def : NTStore128Pat<v4i32>;
+def : NTStore128Pat<v8i16>;
+def : NTStore128Pat<v16i8>;
+
+class NTStore64Pat<ValueType VT> :
+ Pat<(nontemporalstore (VT FPR64:$Rt),
+ (am_indexed7s32 GPR64sp:$Rn, simm7s4:$offset)),
+ (STNPSi (EXTRACT_SUBREG FPR64:$Rt, ssub),
+ (CPYi32 (SUBREG_TO_REG (i64 0), FPR64:$Rt, dsub), (i64 1)),
+ GPR64sp:$Rn, simm7s4:$offset)>;
+
+// FIXME: Shouldn't v1f64 loads/stores be promoted to v1i64?
+def : NTStore64Pat<v1f64>;
+def : NTStore64Pat<v1i64>;
+def : NTStore64Pat<v2i32>;
+def : NTStore64Pat<v4i16>;
+def : NTStore64Pat<v8i8>;
+
+def : Pat<(nontemporalstore GPR64:$Rt,
+ (am_indexed7s32 GPR64sp:$Rn, simm7s4:$offset)),
+ (STNPWi (EXTRACT_SUBREG GPR64:$Rt, sub_32),
+                  (EXTRACT_SUBREG (UBFMXri GPR64:$Rt, 32, 63), sub_32),
+ GPR64sp:$Rn, simm7s4:$offset)>;
+} // AddedComplexity=15
+} // Predicates = [IsLE]
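// In effect: a 128-bit value is stored as an STNP of its two 64-bit halves
// (the dsub subregister plus a CPYi64 copy of lane 1), a 64-bit FPR value as
// an STNP of two 32-bit words, and a 64-bit GPR value as its low word plus
// the high word obtained with an unsigned bitfield extract of bits 63:32.
// The IsLE predicate is presumably required because the two halves would land
// in the wrong order on a big-endian target.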
+
// Tail call return handling. These are all compiler pseudo-instructions,
// so no encoding information or anything like that.
let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [SP] in {
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/contrib/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
index 82f77a77ab5e..566aa2c9a9ba 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -41,54 +41,85 @@ STATISTIC(NumPostFolded, "Number of post-index updates folded");
STATISTIC(NumPreFolded, "Number of pre-index updates folded");
STATISTIC(NumUnscaledPairCreated,
"Number of load/store from unscaled generated");
+STATISTIC(NumNarrowLoadsPromoted, "Number of narrow loads promoted");
+STATISTIC(NumZeroStoresPromoted, "Number of narrow zero stores promoted");
+STATISTIC(NumLoadsFromStoresPromoted, "Number of loads from stores promoted");
static cl::opt<unsigned> ScanLimit("aarch64-load-store-scan-limit",
cl::init(20), cl::Hidden);
-// Place holder while testing unscaled load/store combining
-static cl::opt<bool> EnableAArch64UnscaledMemOp(
- "aarch64-unscaled-mem-op", cl::Hidden,
- cl::desc("Allow AArch64 unscaled load/store combining"), cl::init(true));
+namespace llvm {
+void initializeAArch64LoadStoreOptPass(PassRegistry &);
+}
+
+#define AARCH64_LOAD_STORE_OPT_NAME "AArch64 load / store optimization pass"
namespace {
+
+typedef struct LdStPairFlags {
+ // If a matching instruction is found, MergeForward is set to true if the
+ // merge is to remove the first instruction and replace the second with
+ // a pair-wise insn, and false if the reverse is true.
+ bool MergeForward;
+
+ // SExtIdx gives the index of the result of the load pair that must be
+ // extended. The value of SExtIdx assumes that the paired load produces the
+ // value in this order: (I, returned iterator), i.e., -1 means no value has
+ // to be extended, 0 means I, and 1 means the returned iterator.
+ int SExtIdx;
+
+ LdStPairFlags() : MergeForward(false), SExtIdx(-1) {}
+
+ void setMergeForward(bool V = true) { MergeForward = V; }
+ bool getMergeForward() const { return MergeForward; }
+
+ void setSExtIdx(int V) { SExtIdx = V; }
+ int getSExtIdx() const { return SExtIdx; }
+
+} LdStPairFlags;
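// A sketch (with illustrative iterator names) of how these flags travel
// between the matcher and the merger:
//   LdStPairFlags Flags;
//   MachineBasicBlock::iterator Paired = findMatchingInsn(MBBI, Flags, ScanLimit);
//   if (Paired != MBB.end())
//     MBBI = mergePairedInsns(MBBI, Paired, Flags);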
+
struct AArch64LoadStoreOpt : public MachineFunctionPass {
static char ID;
- AArch64LoadStoreOpt() : MachineFunctionPass(ID) {}
+ AArch64LoadStoreOpt() : MachineFunctionPass(ID) {
+ initializeAArch64LoadStoreOptPass(*PassRegistry::getPassRegistry());
+ }
const AArch64InstrInfo *TII;
const TargetRegisterInfo *TRI;
+ const AArch64Subtarget *Subtarget;
// Scan the instructions looking for a load/store that can be combined
// with the current instruction into a load/store pair.
// Return the matching instruction if one is found, else MBB->end().
- // If a matching instruction is found, MergeForward is set to true if the
- // merge is to remove the first instruction and replace the second with
- // a pair-wise insn, and false if the reverse is true.
- // \p SExtIdx[out] gives the index of the result of the load pair that
- // must be extended. The value of SExtIdx assumes that the paired load
- // produces the value in this order: (I, returned iterator), i.e.,
- // -1 means no value has to be extended, 0 means I, and 1 means the
- // returned iterator.
MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I,
- bool &MergeForward, int &SExtIdx,
+ LdStPairFlags &Flags,
unsigned Limit);
+
+ // Scan the instructions looking for a store that writes to the address from
+ // which the current load instruction reads. Return true if one is found.
+ bool findMatchingStore(MachineBasicBlock::iterator I, unsigned Limit,
+ MachineBasicBlock::iterator &StoreI);
+
// Merge the two instructions indicated into a single pair-wise instruction.
// If MergeForward is true, erase the first instruction and fold its
// operation into the second. If false, the reverse. Return the instruction
// following the first instruction (which may change during processing).
- // \p SExtIdx index of the result that must be extended for a paired load.
- // -1 means none, 0 means I, and 1 means Paired.
MachineBasicBlock::iterator
mergePairedInsns(MachineBasicBlock::iterator I,
- MachineBasicBlock::iterator Paired, bool MergeForward,
- int SExtIdx);
+ MachineBasicBlock::iterator Paired,
+ const LdStPairFlags &Flags);
+
+ // Promote the load that reads directly from the address stored to.
+ MachineBasicBlock::iterator
+ promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
+ MachineBasicBlock::iterator StoreI);
// Scan the instruction list to find a base register update that can
// be combined with the current instruction (a load or store) using
// pre or post indexed addressing with writeback. Scan forwards.
MachineBasicBlock::iterator
findMatchingUpdateInsnForward(MachineBasicBlock::iterator I, unsigned Limit,
- int Value);
+ int UnscaledOffset);
// Scan the instruction list to find a base register update that can
// be combined with the current instruction (a load or store) using
@@ -96,97 +127,177 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
MachineBasicBlock::iterator
findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I, unsigned Limit);
- // Merge a pre-index base register update into a ld/st instruction.
- MachineBasicBlock::iterator
- mergePreIdxUpdateInsn(MachineBasicBlock::iterator I,
- MachineBasicBlock::iterator Update);
+  // Check whether MI is an instruction that updates BaseReg (the base register
+  // of the load/store MemMI) in a way that can be folded in as a pre- or
+  // post-index update.
+ bool isMatchingUpdateInsn(MachineInstr *MemMI, MachineInstr *MI,
+ unsigned BaseReg, int Offset);
- // Merge a post-index base register update into a ld/st instruction.
+ // Merge a pre- or post-index base register update into a ld/st instruction.
MachineBasicBlock::iterator
- mergePostIdxUpdateInsn(MachineBasicBlock::iterator I,
- MachineBasicBlock::iterator Update);
+ mergeUpdateInsn(MachineBasicBlock::iterator I,
+ MachineBasicBlock::iterator Update, bool IsPreIdx);
+
+ // Find and merge foldable ldr/str instructions.
+ bool tryToMergeLdStInst(MachineBasicBlock::iterator &MBBI);
- bool optimizeBlock(MachineBasicBlock &MBB);
+ // Find and promote load instructions which read directly from store.
+ bool tryToPromoteLoadFromStore(MachineBasicBlock::iterator &MBBI);
+
+ // Check if converting two narrow loads into a single wider load with
+ // bitfield extracts could be enabled.
+ bool enableNarrowLdMerge(MachineFunction &Fn);
+
+ bool optimizeBlock(MachineBasicBlock &MBB, bool enableNarrowLdOpt);
bool runOnMachineFunction(MachineFunction &Fn) override;
const char *getPassName() const override {
- return "AArch64 load / store optimization pass";
+ return AARCH64_LOAD_STORE_OPT_NAME;
}
-
-private:
- int getMemSize(MachineInstr *MemMI);
};
char AArch64LoadStoreOpt::ID = 0;
} // namespace
-static bool isUnscaledLdst(unsigned Opc) {
+INITIALIZE_PASS(AArch64LoadStoreOpt, "aarch64-ldst-opt",
+ AARCH64_LOAD_STORE_OPT_NAME, false, false)
+
+static bool isUnscaledLdSt(unsigned Opc) {
switch (Opc) {
default:
return false;
case AArch64::STURSi:
- return true;
case AArch64::STURDi:
- return true;
case AArch64::STURQi:
- return true;
+ case AArch64::STURBBi:
+ case AArch64::STURHHi:
case AArch64::STURWi:
- return true;
case AArch64::STURXi:
- return true;
case AArch64::LDURSi:
- return true;
case AArch64::LDURDi:
- return true;
case AArch64::LDURQi:
- return true;
case AArch64::LDURWi:
- return true;
case AArch64::LDURXi:
- return true;
case AArch64::LDURSWi:
+ case AArch64::LDURHHi:
+ case AArch64::LDURBBi:
+ case AArch64::LDURSBWi:
+ case AArch64::LDURSHWi:
return true;
}
}
-// Size in bytes of the data moved by an unscaled load or store
-int AArch64LoadStoreOpt::getMemSize(MachineInstr *MemMI) {
- switch (MemMI->getOpcode()) {
+static bool isUnscaledLdSt(MachineInstr *MI) {
+ return isUnscaledLdSt(MI->getOpcode());
+}
+
+static unsigned getBitExtrOpcode(MachineInstr *MI) {
+ switch (MI->getOpcode()) {
+ default:
+ llvm_unreachable("Unexpected opcode.");
+ case AArch64::LDRBBui:
+ case AArch64::LDURBBi:
+ case AArch64::LDRHHui:
+ case AArch64::LDURHHi:
+ return AArch64::UBFMWri;
+ case AArch64::LDRSBWui:
+ case AArch64::LDURSBWi:
+ case AArch64::LDRSHWui:
+ case AArch64::LDURSHWi:
+ return AArch64::SBFMWri;
+ }
+}
+
+static bool isNarrowStore(unsigned Opc) {
+ switch (Opc) {
default:
- llvm_unreachable("Opcode has unknown size!");
+ return false;
+ case AArch64::STRBBui:
+ case AArch64::STURBBi:
+ case AArch64::STRHHui:
+ case AArch64::STURHHi:
+ return true;
+ }
+}
+
+static bool isNarrowStore(MachineInstr *MI) {
+ return isNarrowStore(MI->getOpcode());
+}
+
+static bool isNarrowLoad(unsigned Opc) {
+ switch (Opc) {
+ default:
+ return false;
+ case AArch64::LDRHHui:
+ case AArch64::LDURHHi:
+ case AArch64::LDRBBui:
+ case AArch64::LDURBBi:
+ case AArch64::LDRSHWui:
+ case AArch64::LDURSHWi:
+ case AArch64::LDRSBWui:
+ case AArch64::LDURSBWi:
+ return true;
+ }
+}
+
+static bool isNarrowLoad(MachineInstr *MI) {
+ return isNarrowLoad(MI->getOpcode());
+}
+
+// Size in bytes of the memory access, which is also the scale applied to the
+// immediate offset of the scaled (non-unscaled) addressing forms.
+static int getMemScale(MachineInstr *MI) {
+ switch (MI->getOpcode()) {
+ default:
+ llvm_unreachable("Opcode has unknown scale!");
+ case AArch64::LDRBBui:
+ case AArch64::LDURBBi:
+ case AArch64::LDRSBWui:
+ case AArch64::LDURSBWi:
+ case AArch64::STRBBui:
+ case AArch64::STURBBi:
+ return 1;
+ case AArch64::LDRHHui:
+ case AArch64::LDURHHi:
+ case AArch64::LDRSHWui:
+ case AArch64::LDURSHWi:
+ case AArch64::STRHHui:
+ case AArch64::STURHHi:
+ return 2;
+ case AArch64::LDRSui:
+ case AArch64::LDURSi:
+ case AArch64::LDRSWui:
+ case AArch64::LDURSWi:
+ case AArch64::LDRWui:
+ case AArch64::LDURWi:
case AArch64::STRSui:
case AArch64::STURSi:
- return 4;
- case AArch64::STRDui:
- case AArch64::STURDi:
- return 8;
- case AArch64::STRQui:
- case AArch64::STURQi:
- return 16;
case AArch64::STRWui:
case AArch64::STURWi:
- return 4;
- case AArch64::STRXui:
- case AArch64::STURXi:
- return 8;
- case AArch64::LDRSui:
- case AArch64::LDURSi:
+ case AArch64::LDPSi:
+ case AArch64::LDPSWi:
+ case AArch64::LDPWi:
+ case AArch64::STPSi:
+ case AArch64::STPWi:
return 4;
case AArch64::LDRDui:
case AArch64::LDURDi:
+ case AArch64::LDRXui:
+ case AArch64::LDURXi:
+ case AArch64::STRDui:
+ case AArch64::STURDi:
+ case AArch64::STRXui:
+ case AArch64::STURXi:
+ case AArch64::LDPDi:
+ case AArch64::LDPXi:
+ case AArch64::STPDi:
+ case AArch64::STPXi:
return 8;
case AArch64::LDRQui:
case AArch64::LDURQi:
+ case AArch64::STRQui:
+ case AArch64::STURQi:
+ case AArch64::LDPQi:
+ case AArch64::STPQi:
return 16;
- case AArch64::LDRWui:
- case AArch64::LDURWi:
- return 4;
- case AArch64::LDRXui:
- case AArch64::LDURXi:
- return 8;
- case AArch64::LDRSWui:
- case AArch64::LDURSWi:
- return 4;
}
}
@@ -203,6 +314,10 @@ static unsigned getMatchingNonSExtOpcode(unsigned Opc,
case AArch64::STURDi:
case AArch64::STRQui:
case AArch64::STURQi:
+ case AArch64::STRBBui:
+ case AArch64::STURBBi:
+ case AArch64::STRHHui:
+ case AArch64::STURHHi:
case AArch64::STRWui:
case AArch64::STURWi:
case AArch64::STRXui:
@@ -219,11 +334,23 @@ static unsigned getMatchingNonSExtOpcode(unsigned Opc,
case AArch64::STURSi:
case AArch64::LDRSui:
case AArch64::LDURSi:
+ case AArch64::LDRHHui:
+ case AArch64::LDURHHi:
+ case AArch64::LDRBBui:
+ case AArch64::LDURBBi:
return Opc;
case AArch64::LDRSWui:
return AArch64::LDRWui;
case AArch64::LDURSWi:
return AArch64::LDURWi;
+ case AArch64::LDRSBWui:
+ return AArch64::LDRBBui;
+ case AArch64::LDRSHWui:
+ return AArch64::LDRHHui;
+ case AArch64::LDURSBWi:
+ return AArch64::LDURBBi;
+ case AArch64::LDURSHWi:
+ return AArch64::LDURHHi;
}
}
@@ -240,6 +367,14 @@ static unsigned getMatchingPairOpcode(unsigned Opc) {
case AArch64::STRQui:
case AArch64::STURQi:
return AArch64::STPQi;
+ case AArch64::STRBBui:
+ return AArch64::STRHHui;
+ case AArch64::STRHHui:
+ return AArch64::STRWui;
+ case AArch64::STURBBi:
+ return AArch64::STURHHi;
+ case AArch64::STURHHi:
+ return AArch64::STURWi;
case AArch64::STRWui:
case AArch64::STURWi:
return AArch64::STPWi;
@@ -264,6 +399,48 @@ static unsigned getMatchingPairOpcode(unsigned Opc) {
case AArch64::LDRSWui:
case AArch64::LDURSWi:
return AArch64::LDPSWi;
+ case AArch64::LDRHHui:
+ case AArch64::LDRSHWui:
+ return AArch64::LDRWui;
+ case AArch64::LDURHHi:
+ case AArch64::LDURSHWi:
+ return AArch64::LDURWi;
+ case AArch64::LDRBBui:
+ case AArch64::LDRSBWui:
+ return AArch64::LDRHHui;
+ case AArch64::LDURBBi:
+ case AArch64::LDURSBWi:
+ return AArch64::LDURHHi;
+ }
+}
+
+static unsigned isMatchingStore(MachineInstr *LoadInst,
+ MachineInstr *StoreInst) {
+ unsigned LdOpc = LoadInst->getOpcode();
+ unsigned StOpc = StoreInst->getOpcode();
+ switch (LdOpc) {
+ default:
+ llvm_unreachable("Unsupported load instruction!");
+ case AArch64::LDRBBui:
+ return StOpc == AArch64::STRBBui || StOpc == AArch64::STRHHui ||
+ StOpc == AArch64::STRWui || StOpc == AArch64::STRXui;
+ case AArch64::LDURBBi:
+ return StOpc == AArch64::STURBBi || StOpc == AArch64::STURHHi ||
+ StOpc == AArch64::STURWi || StOpc == AArch64::STURXi;
+ case AArch64::LDRHHui:
+ return StOpc == AArch64::STRHHui || StOpc == AArch64::STRWui ||
+ StOpc == AArch64::STRXui;
+ case AArch64::LDURHHi:
+ return StOpc == AArch64::STURHHi || StOpc == AArch64::STURWi ||
+ StOpc == AArch64::STURXi;
+ case AArch64::LDRWui:
+ return StOpc == AArch64::STRWui || StOpc == AArch64::STRXui;
+ case AArch64::LDURWi:
+ return StOpc == AArch64::STURWi || StOpc == AArch64::STURXi;
+ case AArch64::LDRXui:
+ return StOpc == AArch64::STRXui;
+ case AArch64::LDURXi:
+ return StOpc == AArch64::STURXi;
}
}
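// This only checks width and addressing-form compatibility (the store must be
// at least as wide as the load and both must be scaled or both unscaled);
// whether the loaded bytes actually lie inside the stored bytes is checked
// separately by isLdOffsetInRangeOfSt below.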
@@ -277,6 +454,10 @@ static unsigned getPreIndexedOpcode(unsigned Opc) {
return AArch64::STRDpre;
case AArch64::STRQui:
return AArch64::STRQpre;
+ case AArch64::STRBBui:
+ return AArch64::STRBBpre;
+ case AArch64::STRHHui:
+ return AArch64::STRHHpre;
case AArch64::STRWui:
return AArch64::STRWpre;
case AArch64::STRXui:
@@ -287,12 +468,38 @@ static unsigned getPreIndexedOpcode(unsigned Opc) {
return AArch64::LDRDpre;
case AArch64::LDRQui:
return AArch64::LDRQpre;
+ case AArch64::LDRBBui:
+ return AArch64::LDRBBpre;
+ case AArch64::LDRHHui:
+ return AArch64::LDRHHpre;
case AArch64::LDRWui:
return AArch64::LDRWpre;
case AArch64::LDRXui:
return AArch64::LDRXpre;
case AArch64::LDRSWui:
return AArch64::LDRSWpre;
+ case AArch64::LDPSi:
+ return AArch64::LDPSpre;
+ case AArch64::LDPSWi:
+ return AArch64::LDPSWpre;
+ case AArch64::LDPDi:
+ return AArch64::LDPDpre;
+ case AArch64::LDPQi:
+ return AArch64::LDPQpre;
+ case AArch64::LDPWi:
+ return AArch64::LDPWpre;
+ case AArch64::LDPXi:
+ return AArch64::LDPXpre;
+ case AArch64::STPSi:
+ return AArch64::STPSpre;
+ case AArch64::STPDi:
+ return AArch64::STPDpre;
+ case AArch64::STPQi:
+ return AArch64::STPQpre;
+ case AArch64::STPWi:
+ return AArch64::STPWpre;
+ case AArch64::STPXi:
+ return AArch64::STPXpre;
}
}
@@ -306,6 +513,10 @@ static unsigned getPostIndexedOpcode(unsigned Opc) {
return AArch64::STRDpost;
case AArch64::STRQui:
return AArch64::STRQpost;
+ case AArch64::STRBBui:
+ return AArch64::STRBBpost;
+ case AArch64::STRHHui:
+ return AArch64::STRHHpost;
case AArch64::STRWui:
return AArch64::STRWpost;
case AArch64::STRXui:
@@ -316,19 +527,111 @@ static unsigned getPostIndexedOpcode(unsigned Opc) {
return AArch64::LDRDpost;
case AArch64::LDRQui:
return AArch64::LDRQpost;
+ case AArch64::LDRBBui:
+ return AArch64::LDRBBpost;
+ case AArch64::LDRHHui:
+ return AArch64::LDRHHpost;
case AArch64::LDRWui:
return AArch64::LDRWpost;
case AArch64::LDRXui:
return AArch64::LDRXpost;
case AArch64::LDRSWui:
return AArch64::LDRSWpost;
+ case AArch64::LDPSi:
+ return AArch64::LDPSpost;
+ case AArch64::LDPSWi:
+ return AArch64::LDPSWpost;
+ case AArch64::LDPDi:
+ return AArch64::LDPDpost;
+ case AArch64::LDPQi:
+ return AArch64::LDPQpost;
+ case AArch64::LDPWi:
+ return AArch64::LDPWpost;
+ case AArch64::LDPXi:
+ return AArch64::LDPXpost;
+ case AArch64::STPSi:
+ return AArch64::STPSpost;
+ case AArch64::STPDi:
+ return AArch64::STPDpost;
+ case AArch64::STPQi:
+ return AArch64::STPQpost;
+ case AArch64::STPWi:
+ return AArch64::STPWpost;
+ case AArch64::STPXi:
+ return AArch64::STPXpost;
}
}
+static bool isPairedLdSt(const MachineInstr *MI) {
+ switch (MI->getOpcode()) {
+ default:
+ return false;
+ case AArch64::LDPSi:
+ case AArch64::LDPSWi:
+ case AArch64::LDPDi:
+ case AArch64::LDPQi:
+ case AArch64::LDPWi:
+ case AArch64::LDPXi:
+ case AArch64::STPSi:
+ case AArch64::STPDi:
+ case AArch64::STPQi:
+ case AArch64::STPWi:
+ case AArch64::STPXi:
+ return true;
+ }
+}
+
+static const MachineOperand &getLdStRegOp(const MachineInstr *MI,
+ unsigned PairedRegOp = 0) {
+ assert(PairedRegOp < 2 && "Unexpected register operand idx.");
+ unsigned Idx = isPairedLdSt(MI) ? PairedRegOp : 0;
+ return MI->getOperand(Idx);
+}
+
+static const MachineOperand &getLdStBaseOp(const MachineInstr *MI) {
+ unsigned Idx = isPairedLdSt(MI) ? 2 : 1;
+ return MI->getOperand(Idx);
+}
+
+static const MachineOperand &getLdStOffsetOp(const MachineInstr *MI) {
+ unsigned Idx = isPairedLdSt(MI) ? 3 : 2;
+ return MI->getOperand(Idx);
+}
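// Operand layout assumed by the helpers above: a single load/store is
// (Rt, base, imm) and a paired one is (Rt, Rt2, base, imm), hence the
// index shift of one for the paired forms.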
+
+static bool isLdOffsetInRangeOfSt(MachineInstr *LoadInst,
+ MachineInstr *StoreInst) {
+ assert(isMatchingStore(LoadInst, StoreInst) && "Expect only matched ld/st.");
+ int LoadSize = getMemScale(LoadInst);
+ int StoreSize = getMemScale(StoreInst);
+ int UnscaledStOffset = isUnscaledLdSt(StoreInst)
+ ? getLdStOffsetOp(StoreInst).getImm()
+ : getLdStOffsetOp(StoreInst).getImm() * StoreSize;
+ int UnscaledLdOffset = isUnscaledLdSt(LoadInst)
+ ? getLdStOffsetOp(LoadInst).getImm()
+ : getLdStOffsetOp(LoadInst).getImm() * LoadSize;
+ return (UnscaledStOffset <= UnscaledLdOffset) &&
+ (UnscaledLdOffset + LoadSize <= (UnscaledStOffset + StoreSize));
+}
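// Example: an STRXui with immediate 1 stores bytes 8..15, and an LDRWui with
// immediate 3 reads bytes 12..15, so 8 <= 12 and 12 + 4 <= 16 and the load is
// fully covered by the store.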
+
+// Copy MachineMemOperands from Op0 and Op1 to a new array assigned to MI.
+static void concatenateMemOperands(MachineInstr *MI, MachineInstr *Op0,
+ MachineInstr *Op1) {
+ assert(MI->memoperands_empty() && "expected a new machineinstr");
+ size_t numMemRefs = (Op0->memoperands_end() - Op0->memoperands_begin()) +
+ (Op1->memoperands_end() - Op1->memoperands_begin());
+
+ MachineFunction *MF = MI->getParent()->getParent();
+ MachineSDNode::mmo_iterator MemBegin = MF->allocateMemRefsArray(numMemRefs);
+ MachineSDNode::mmo_iterator MemEnd =
+ std::copy(Op0->memoperands_begin(), Op0->memoperands_end(), MemBegin);
+ MemEnd = std::copy(Op1->memoperands_begin(), Op1->memoperands_end(), MemEnd);
+ MI->setMemRefs(MemBegin, MemEnd);
+}
+
MachineBasicBlock::iterator
AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
MachineBasicBlock::iterator Paired,
- bool MergeForward, int SExtIdx) {
+ const LdStPairFlags &Flags) {
MachineBasicBlock::iterator NextI = I;
++NextI;
// If NextI is the second of the two instructions to be merged, we need
@@ -338,25 +641,26 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
if (NextI == Paired)
++NextI;
+ int SExtIdx = Flags.getSExtIdx();
unsigned Opc =
SExtIdx == -1 ? I->getOpcode() : getMatchingNonSExtOpcode(I->getOpcode());
- bool IsUnscaled = isUnscaledLdst(Opc);
- int OffsetStride =
- IsUnscaled && EnableAArch64UnscaledMemOp ? getMemSize(I) : 1;
+ bool IsUnscaled = isUnscaledLdSt(Opc);
+ int OffsetStride = IsUnscaled ? getMemScale(I) : 1;
+ bool MergeForward = Flags.getMergeForward();
unsigned NewOpc = getMatchingPairOpcode(Opc);
// Insert our new paired instruction after whichever of the paired
// instructions MergeForward indicates.
MachineBasicBlock::iterator InsertionPoint = MergeForward ? Paired : I;
// Also based on MergeForward is from where we copy the base register operand
// so we get the flags compatible with the input code.
- MachineOperand &BaseRegOp =
- MergeForward ? Paired->getOperand(1) : I->getOperand(1);
+ const MachineOperand &BaseRegOp =
+ MergeForward ? getLdStBaseOp(Paired) : getLdStBaseOp(I);
// Which register is Rt and which is Rt2 depends on the offset order.
MachineInstr *RtMI, *Rt2MI;
- if (I->getOperand(2).getImm() ==
- Paired->getOperand(2).getImm() + OffsetStride) {
+ if (getLdStOffsetOp(I).getImm() ==
+ getLdStOffsetOp(Paired).getImm() + OffsetStride) {
RtMI = Paired;
Rt2MI = I;
// Here we swapped the assumption made for SExtIdx.
@@ -368,18 +672,135 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
RtMI = I;
Rt2MI = Paired;
}
- // Handle Unscaled
- int OffsetImm = RtMI->getOperand(2).getImm();
- if (IsUnscaled && EnableAArch64UnscaledMemOp)
- OffsetImm /= OffsetStride;
+
+ int OffsetImm = getLdStOffsetOp(RtMI).getImm();
+
+ if (isNarrowLoad(Opc)) {
+ // Change the scaled offset from small to large type.
+ if (!IsUnscaled) {
+ assert(((OffsetImm & 1) == 0) && "Unexpected offset to merge");
+ OffsetImm /= 2;
+ }
+ MachineInstr *RtNewDest = MergeForward ? I : Paired;
+ // When merging small (< 32 bit) loads for big-endian targets, the order of
+ // the component parts gets swapped.
+ if (!Subtarget->isLittleEndian())
+ std::swap(RtMI, Rt2MI);
+ // Construct the new load instruction.
+ MachineInstr *NewMemMI, *BitExtMI1, *BitExtMI2;
+ NewMemMI = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
+ TII->get(NewOpc))
+ .addOperand(getLdStRegOp(RtNewDest))
+ .addOperand(BaseRegOp)
+ .addImm(OffsetImm);
+
+ // Copy MachineMemOperands from the original loads.
+ concatenateMemOperands(NewMemMI, I, Paired);
+
+ DEBUG(
+ dbgs()
+ << "Creating the new load and extract. Replacing instructions:\n ");
+ DEBUG(I->print(dbgs()));
+ DEBUG(dbgs() << " ");
+ DEBUG(Paired->print(dbgs()));
+ DEBUG(dbgs() << " with instructions:\n ");
+ DEBUG((NewMemMI)->print(dbgs()));
+
+ int Width = getMemScale(I) == 1 ? 8 : 16;
+ int LSBLow = 0;
+ int LSBHigh = Width;
+ int ImmsLow = LSBLow + Width - 1;
+ int ImmsHigh = LSBHigh + Width - 1;
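    // Example: merging two adjacent LDRBBui loads yields one LDRHHui plus, for
    // the unsigned case, "and wLo, wNew, #0xff" (ANDWri with encoded immediate
    // ImmsLow = 7) for the low byte and "ubfx wHi, wNew, #8, #8" (UBFMWri with
    // immr = 8, imms = 15) for the high byte.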
+ MachineInstr *ExtDestMI = MergeForward ? Paired : I;
+ if ((ExtDestMI == Rt2MI) == Subtarget->isLittleEndian()) {
+ // Create the bitfield extract for high bits.
+ BitExtMI1 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
+ TII->get(getBitExtrOpcode(Rt2MI)))
+ .addOperand(getLdStRegOp(Rt2MI))
+ .addReg(getLdStRegOp(RtNewDest).getReg())
+ .addImm(LSBHigh)
+ .addImm(ImmsHigh);
+ // Create the bitfield extract for low bits.
+ if (RtMI->getOpcode() == getMatchingNonSExtOpcode(RtMI->getOpcode())) {
+ // For unsigned, prefer to use AND for low bits.
+ BitExtMI2 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
+ TII->get(AArch64::ANDWri))
+ .addOperand(getLdStRegOp(RtMI))
+ .addReg(getLdStRegOp(RtNewDest).getReg())
+ .addImm(ImmsLow);
+ } else {
+ BitExtMI2 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
+ TII->get(getBitExtrOpcode(RtMI)))
+ .addOperand(getLdStRegOp(RtMI))
+ .addReg(getLdStRegOp(RtNewDest).getReg())
+ .addImm(LSBLow)
+ .addImm(ImmsLow);
+ }
+ } else {
+ // Create the bitfield extract for low bits.
+ if (RtMI->getOpcode() == getMatchingNonSExtOpcode(RtMI->getOpcode())) {
+ // For unsigned, prefer to use AND for low bits.
+ BitExtMI1 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
+ TII->get(AArch64::ANDWri))
+ .addOperand(getLdStRegOp(RtMI))
+ .addReg(getLdStRegOp(RtNewDest).getReg())
+ .addImm(ImmsLow);
+ } else {
+ BitExtMI1 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
+ TII->get(getBitExtrOpcode(RtMI)))
+ .addOperand(getLdStRegOp(RtMI))
+ .addReg(getLdStRegOp(RtNewDest).getReg())
+ .addImm(LSBLow)
+ .addImm(ImmsLow);
+ }
+
+ // Create the bitfield extract for high bits.
+ BitExtMI2 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
+ TII->get(getBitExtrOpcode(Rt2MI)))
+ .addOperand(getLdStRegOp(Rt2MI))
+ .addReg(getLdStRegOp(RtNewDest).getReg())
+ .addImm(LSBHigh)
+ .addImm(ImmsHigh);
+ }
+ DEBUG(dbgs() << " ");
+ DEBUG((BitExtMI1)->print(dbgs()));
+ DEBUG(dbgs() << " ");
+ DEBUG((BitExtMI2)->print(dbgs()));
+ DEBUG(dbgs() << "\n");
+
+ // Erase the old instructions.
+ I->eraseFromParent();
+ Paired->eraseFromParent();
+ return NextI;
+ }
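
For reference, the immediate arithmetic above can be checked with a minimal standalone C++ sketch; it is not part of the patch, and the variable names and the ldrh/ldr example are illustrative assumptions rather than the pass's own code.

// Standalone sketch: how the bitfield-extract immediates above are derived
// when two 16-bit loads are merged into one 32-bit load.
#include <cstdio>

int main() {
  int MemScale = 2;                        // getMemScale() of an LDRHHui load
  int Width    = MemScale == 1 ? 8 : 16;   // bits contributed by each narrow load
  int LSBLow   = 0, LSBHigh = Width;
  int ImmsLow  = LSBLow + Width - 1;       // UBFM imms for the low half
  int ImmsHigh = LSBHigh + Width - 1;      // UBFM imms for the high half
  // ldrh w0, [x2] ; ldrh w1, [x2, #2]  ->  ldr w0, [x2]
  //   high half: ubfm w1, w0, #16, #31   (== ubfx w1, w0, #16, #16)
  //   low half : and  w0, w0, #0xffff    (logical immediate with imms == 15)
  printf("low : lsb=%d imms=%d\nhigh: lsb=%d imms=%d\n",
         LSBLow, ImmsLow, LSBHigh, ImmsHigh);
  return 0;
}
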
// Construct the new instruction.
- MachineInstrBuilder MIB = BuildMI(*I->getParent(), InsertionPoint,
- I->getDebugLoc(), TII->get(NewOpc))
- .addOperand(RtMI->getOperand(0))
- .addOperand(Rt2MI->getOperand(0))
- .addOperand(BaseRegOp)
- .addImm(OffsetImm);
+ MachineInstrBuilder MIB;
+ if (isNarrowStore(Opc)) {
+ // Change the scaled offset from small to large type.
+ if (!IsUnscaled) {
+ assert(((OffsetImm & 1) == 0) && "Unexpected offset to merge");
+ OffsetImm /= 2;
+ }
+ MIB = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
+ TII->get(NewOpc))
+ .addOperand(getLdStRegOp(I))
+ .addOperand(BaseRegOp)
+ .addImm(OffsetImm);
+ // Copy MachineMemOperands from the original stores.
+ concatenateMemOperands(MIB, I, Paired);
+ } else {
+ // Handle Unscaled
+ if (IsUnscaled)
+ OffsetImm /= OffsetStride;
+ MIB = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
+ TII->get(NewOpc))
+ .addOperand(getLdStRegOp(RtMI))
+ .addOperand(getLdStRegOp(Rt2MI))
+ .addOperand(BaseRegOp)
+ .addImm(OffsetImm);
+ }
+
(void)MIB;
// FIXME: Do we need/want to copy the mem operands from the source
@@ -439,13 +860,112 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
return NextI;
}
+MachineBasicBlock::iterator
+AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
+ MachineBasicBlock::iterator StoreI) {
+ MachineBasicBlock::iterator NextI = LoadI;
+ ++NextI;
+
+ int LoadSize = getMemScale(LoadI);
+ int StoreSize = getMemScale(StoreI);
+ unsigned LdRt = getLdStRegOp(LoadI).getReg();
+ unsigned StRt = getLdStRegOp(StoreI).getReg();
+ bool IsStoreXReg = TRI->getRegClass(AArch64::GPR64RegClassID)->contains(StRt);
+
+ assert((IsStoreXReg ||
+ TRI->getRegClass(AArch64::GPR32RegClassID)->contains(StRt)) &&
+ "Unexpected RegClass");
+
+ MachineInstr *BitExtMI;
+ if (LoadSize == StoreSize && (LoadSize == 4 || LoadSize == 8)) {
+ // Remove the load if the destination register of the load is the same as
+ // the register holding the stored value.
+ if (StRt == LdRt && LoadSize == 8) {
+ DEBUG(dbgs() << "Remove load instruction:\n ");
+ DEBUG(LoadI->print(dbgs()));
+ DEBUG(dbgs() << "\n");
+ LoadI->eraseFromParent();
+ return NextI;
+ }
+ // Replace the load with a mov if the load and store have the same size.
+ BitExtMI =
+ BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
+ TII->get(IsStoreXReg ? AArch64::ORRXrs : AArch64::ORRWrs), LdRt)
+ .addReg(IsStoreXReg ? AArch64::XZR : AArch64::WZR)
+ .addReg(StRt)
+ .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
+ } else {
+ // FIXME: Currently we disable this transformation on big-endian targets as
+ // performance and correctness have only been verified on little-endian.
+ if (!Subtarget->isLittleEndian())
+ return NextI;
+ bool IsUnscaled = isUnscaledLdSt(LoadI);
+ assert(IsUnscaled == isUnscaledLdSt(StoreI) && "Unsupported ld/st match");
+ assert(LoadSize <= StoreSize && "Invalid load size");
+ int UnscaledLdOffset = IsUnscaled
+ ? getLdStOffsetOp(LoadI).getImm()
+ : getLdStOffsetOp(LoadI).getImm() * LoadSize;
+ int UnscaledStOffset = IsUnscaled
+ ? getLdStOffsetOp(StoreI).getImm()
+ : getLdStOffsetOp(StoreI).getImm() * StoreSize;
+ int Width = LoadSize * 8;
+ int Immr = 8 * (UnscaledLdOffset - UnscaledStOffset);
+ int Imms = Immr + Width - 1;
+ unsigned DestReg = IsStoreXReg
+ ? TRI->getMatchingSuperReg(LdRt, AArch64::sub_32,
+ &AArch64::GPR64RegClass)
+ : LdRt;
+
+ assert((UnscaledLdOffset >= UnscaledStOffset &&
+ (UnscaledLdOffset + LoadSize) <= UnscaledStOffset + StoreSize) &&
+ "Invalid offset");
+
+ Immr = 8 * (UnscaledLdOffset - UnscaledStOffset);
+ Imms = Immr + Width - 1;
+ if (UnscaledLdOffset == UnscaledStOffset) {
+ uint32_t AndMaskEncoded = ((IsStoreXReg ? 1 : 0) << 12) // N
+ | ((Immr) << 6) // immr
+ | ((Imms) << 0) // imms
+ ;
+
+ BitExtMI =
+ BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
+ TII->get(IsStoreXReg ? AArch64::ANDXri : AArch64::ANDWri),
+ DestReg)
+ .addReg(StRt)
+ .addImm(AndMaskEncoded);
+ } else {
+ BitExtMI =
+ BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
+ TII->get(IsStoreXReg ? AArch64::UBFMXri : AArch64::UBFMWri),
+ DestReg)
+ .addReg(StRt)
+ .addImm(Immr)
+ .addImm(Imms);
+ }
+ }
+
+ DEBUG(dbgs() << "Promoting load by replacing :\n ");
+ DEBUG(StoreI->print(dbgs()));
+ DEBUG(dbgs() << " ");
+ DEBUG(LoadI->print(dbgs()));
+ DEBUG(dbgs() << " with instructions:\n ");
+ DEBUG(StoreI->print(dbgs()));
+ DEBUG(dbgs() << " ");
+ DEBUG((BitExtMI)->print(dbgs()));
+ DEBUG(dbgs() << "\n");
+
+ // Erase the old instructions.
+ LoadI->eraseFromParent();
+ return NextI;
+}
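
As a rough cross-check of the UBFM immediates computed in promoteLoadFromStore, here is a minimal standalone C++ sketch; it is not taken from the patch, and the str/ldrh offsets are made-up example values.

// Standalone sketch: bitfield-extract operands when a narrow load is forwarded
// from a wider store, e.g.  str w1, [x0, #4] ; ldrh w2, [x0, #6].
#include <cassert>
#include <cstdio>

int main() {
  int LoadSize = 2, StoreSize = 4;                 // ldrh reads 2 bytes, str writes 4
  int UnscaledLdOffset = 6, UnscaledStOffset = 4;  // byte offsets of the accesses
  assert(UnscaledLdOffset >= UnscaledStOffset &&
         UnscaledLdOffset + LoadSize <= UnscaledStOffset + StoreSize);
  int Width = LoadSize * 8;                               // 16 bits
  int Immr  = 8 * (UnscaledLdOffset - UnscaledStOffset);  // 16
  int Imms  = Immr + Width - 1;                           // 31
  // UBFMWri w2, w1, #16, #31 is the alias ubfx w2, w1, #16, #16, i.e. lsr w2, w1, #16.
  printf("ubfx w2, w1, #%d, #%d\n", Immr, Imms - Immr + 1);
  return 0;
}
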
+
/// trackRegDefsUses - Remember what registers the specified instruction uses
/// and modifies.
-static void trackRegDefsUses(MachineInstr *MI, BitVector &ModifiedRegs,
+static void trackRegDefsUses(const MachineInstr *MI, BitVector &ModifiedRegs,
BitVector &UsedRegs,
const TargetRegisterInfo *TRI) {
- for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
- MachineOperand &MO = MI->getOperand(i);
+ for (const MachineOperand &MO : MI->operands()) {
if (MO.isRegMask())
ModifiedRegs.setBitsNotInMask(MO.getRegMask());
@@ -464,16 +984,12 @@ static void trackRegDefsUses(MachineInstr *MI, BitVector &ModifiedRegs,
}
static bool inBoundsForPair(bool IsUnscaled, int Offset, int OffsetStride) {
- if (!IsUnscaled && (Offset > 63 || Offset < -64))
- return false;
- if (IsUnscaled) {
- // Convert the byte-offset used by unscaled into an "element" offset used
- // by the scaled pair load/store instructions.
- int ElemOffset = Offset / OffsetStride;
- if (ElemOffset > 63 || ElemOffset < -64)
- return false;
- }
- return true;
+ // Convert the byte-offset used by unscaled into an "element" offset used
+ // by the scaled pair load/store instructions.
+ if (IsUnscaled)
+ Offset /= OffsetStride;
+
+ return Offset <= 63 && Offset >= -64;
}
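
Restated as a standalone C++ sketch (not LLVM code; the example offsets are invented): the pair's immediate is a signed 7-bit element count, so an unscaled byte offset is first divided by the access size.

// Standalone sketch of the check above: LDP/STP take a signed 7-bit element
// offset, so unscaled (LDUR/STUR) byte offsets are scaled down first.
#include <cstdio>

static bool inBoundsForPairSketch(bool IsUnscaled, int Offset, int OffsetStride) {
  if (IsUnscaled)
    Offset /= OffsetStride;            // byte offset -> element offset
  return Offset <= 63 && Offset >= -64;
}

int main() {
  // ldur x0, [x2, #-512] with an 8-byte stride: element offset -64, in range.
  printf("%d\n", inBoundsForPairSketch(true, -512, 8));  // prints 1
  // ldr x0, [x2, #64]: already an element offset, out of pair range.
  printf("%d\n", inBoundsForPairSketch(false, 64, 1));   // prints 0
  return 0;
}
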
// Do alignment, specialized to power of 2 and for signed ints,
@@ -507,12 +1023,65 @@ static bool mayAlias(MachineInstr *MIa,
return false;
}
+bool AArch64LoadStoreOpt::findMatchingStore(
+ MachineBasicBlock::iterator I, unsigned Limit,
+ MachineBasicBlock::iterator &StoreI) {
+ MachineBasicBlock::iterator E = I->getParent()->begin();
+ MachineBasicBlock::iterator MBBI = I;
+ MachineInstr *FirstMI = I;
+ unsigned BaseReg = getLdStBaseOp(FirstMI).getReg();
+
+ // Track which registers have been modified and used between the first insn
+ // and the second insn.
+ BitVector ModifiedRegs, UsedRegs;
+ ModifiedRegs.resize(TRI->getNumRegs());
+ UsedRegs.resize(TRI->getNumRegs());
+
+ for (unsigned Count = 0; MBBI != E && Count < Limit;) {
+ --MBBI;
+ MachineInstr *MI = MBBI;
+ // Skip DBG_VALUE instructions. Otherwise debug info can affect the
+ // optimization by changing how far we scan.
+ if (MI->isDebugValue())
+ continue;
+ // Now that we know this is a real instruction, count it.
+ ++Count;
+
+ // If the load instruction reads directly from the address to which the
+ // store instruction writes and the stored value is not modified, we can
+ // promote the load. Since we do not handle stores with pre-/post-index,
+ // it's unnecessary to check if BaseReg is modified by the store itself.
+ if (MI->mayStore() && isMatchingStore(FirstMI, MI) &&
+ BaseReg == getLdStBaseOp(MI).getReg() &&
+ isLdOffsetInRangeOfSt(FirstMI, MI) &&
+ !ModifiedRegs[getLdStRegOp(MI).getReg()]) {
+ StoreI = MBBI;
+ return true;
+ }
+
+ if (MI->isCall())
+ return false;
+
+ // Update modified / uses register lists.
+ trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
+
+ // Otherwise, if the base register is modified, we have no match, so
+ // return early.
+ if (ModifiedRegs[BaseReg])
+ return false;
+
+ // If we encounter a store aliased with the load, return early.
+ if (MI->mayStore() && mayAlias(FirstMI, MI, TII))
+ return false;
+ }
+ return false;
+}
+
/// findMatchingInsn - Scan the instructions looking for a load/store that can
/// be combined with the current instruction into a load/store pair.
MachineBasicBlock::iterator
AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
- bool &MergeForward, int &SExtIdx,
- unsigned Limit) {
+ LdStPairFlags &Flags, unsigned Limit) {
MachineBasicBlock::iterator E = I->getParent()->end();
MachineBasicBlock::iterator MBBI = I;
MachineInstr *FirstMI = I;
@@ -520,21 +1089,27 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
unsigned Opc = FirstMI->getOpcode();
bool MayLoad = FirstMI->mayLoad();
- bool IsUnscaled = isUnscaledLdst(Opc);
- unsigned Reg = FirstMI->getOperand(0).getReg();
- unsigned BaseReg = FirstMI->getOperand(1).getReg();
- int Offset = FirstMI->getOperand(2).getImm();
+ bool IsUnscaled = isUnscaledLdSt(FirstMI);
+ unsigned Reg = getLdStRegOp(FirstMI).getReg();
+ unsigned BaseReg = getLdStBaseOp(FirstMI).getReg();
+ int Offset = getLdStOffsetOp(FirstMI).getImm();
+ bool IsNarrowStore = isNarrowStore(Opc);
+
+ // For narrow stores, find only the case where the stored value is WZR.
+ if (IsNarrowStore && Reg != AArch64::WZR)
+ return E;
// Early exit if the first instruction modifies the base register.
// e.g., ldr x0, [x0]
- // Early exit if the offset if not possible to match. (6 bits of positive
- // range, plus allow an extra one in case we find a later insn that matches
- // with Offset-1
if (FirstMI->modifiesRegister(BaseReg, TRI))
return E;
- int OffsetStride =
- IsUnscaled && EnableAArch64UnscaledMemOp ? getMemSize(FirstMI) : 1;
- if (!inBoundsForPair(IsUnscaled, Offset, OffsetStride))
+
+ // Early exit if the offset is not possible to match. (6 bits of positive
+ // range, plus allow an extra one in case we find a later insn that matches
+ // with Offset-1)
+ int OffsetStride = IsUnscaled ? getMemScale(FirstMI) : 1;
+ if (!(isNarrowLoad(Opc) || IsNarrowStore) &&
+ !inBoundsForPair(IsUnscaled, Offset, OffsetStride))
return E;
// Track which registers have been modified and used between the first insn
@@ -557,18 +1132,19 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
++Count;
bool CanMergeOpc = Opc == MI->getOpcode();
- SExtIdx = -1;
+ Flags.setSExtIdx(-1);
if (!CanMergeOpc) {
bool IsValidLdStrOpc;
unsigned NonSExtOpc = getMatchingNonSExtOpcode(Opc, &IsValidLdStrOpc);
- if (!IsValidLdStrOpc)
- continue;
+ assert(IsValidLdStrOpc &&
+ "Given Opc should be a Load or Store with an immediate");
// Opc will be the first instruction in the pair.
- SExtIdx = NonSExtOpc == (unsigned)Opc ? 1 : 0;
+ Flags.setSExtIdx(NonSExtOpc == (unsigned)Opc ? 1 : 0);
CanMergeOpc = NonSExtOpc == getMatchingNonSExtOpcode(MI->getOpcode());
}
- if (CanMergeOpc && MI->getOperand(2).isImm()) {
+ if (CanMergeOpc && getLdStOffsetOp(MI).isImm()) {
+ assert(MI->mayLoadOrStore() && "Expected memory operation.");
// If we've found another instruction with the same opcode, check to see
// if the base and offset are compatible with our starting instruction.
// These instructions all have scaled immediate operands, so we just
@@ -579,8 +1155,8 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
// Pairwise instructions have a 7-bit signed offset field. Single insns
// have a 12-bit unsigned offset field. To be a valid combine, the
// final offset must be in range.
- unsigned MIBaseReg = MI->getOperand(1).getReg();
- int MIOffset = MI->getOperand(2).getImm();
+ unsigned MIBaseReg = getLdStBaseOp(MI).getReg();
+ int MIOffset = getLdStOffsetOp(MI).getImm();
if (BaseReg == MIBaseReg && ((Offset == MIOffset + OffsetStride) ||
(Offset + OffsetStride == MIOffset))) {
int MinOffset = Offset < MIOffset ? Offset : MIOffset;
@@ -591,30 +1167,43 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
return E;
// If the resultant immediate offset of merging these instructions
// is out of range for a pairwise instruction, bail and keep looking.
- bool MIIsUnscaled = isUnscaledLdst(MI->getOpcode());
- if (!inBoundsForPair(MIIsUnscaled, MinOffset, OffsetStride)) {
+ bool MIIsUnscaled = isUnscaledLdSt(MI);
+ bool IsNarrowLoad = isNarrowLoad(MI->getOpcode());
+ if (!IsNarrowLoad &&
+ !inBoundsForPair(MIIsUnscaled, MinOffset, OffsetStride)) {
trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
- if (MI->mayLoadOrStore())
- MemInsns.push_back(MI);
+ MemInsns.push_back(MI);
continue;
}
- // If the alignment requirements of the paired (scaled) instruction
- // can't express the offset of the unscaled input, bail and keep
- // looking.
- if (IsUnscaled && EnableAArch64UnscaledMemOp &&
- (alignTo(MinOffset, OffsetStride) != MinOffset)) {
- trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
- if (MI->mayLoadOrStore())
+
+ if (IsNarrowLoad || IsNarrowStore) {
+ // If the alignment requirements of the scaled wide load/store
+ // instruction can't express the offset of the scaled narrow
+ // input, bail and keep looking.
+ if (!IsUnscaled && alignTo(MinOffset, 2) != MinOffset) {
+ trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
MemInsns.push_back(MI);
- continue;
+ continue;
+ }
+ } else {
+ // If the alignment requirements of the paired (scaled) instruction
+ // can't express the offset of the unscaled input, bail and keep
+ // looking.
+ if (IsUnscaled && (alignTo(MinOffset, OffsetStride) != MinOffset)) {
+ trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
+ MemInsns.push_back(MI);
+ continue;
+ }
}
// If the destination register of the loads is the same register, bail
// and keep looking. A load-pair instruction with both destination
// registers the same is UNPREDICTABLE and will result in an exception.
- if (MayLoad && Reg == MI->getOperand(0).getReg()) {
+ // For narrow stores, allow only when the stored value is the same
+ // (i.e., WZR).
+ if ((MayLoad && Reg == getLdStRegOp(MI).getReg()) ||
+ (IsNarrowStore && Reg != getLdStRegOp(MI).getReg())) {
trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
- if (MI->mayLoadOrStore())
- MemInsns.push_back(MI);
+ MemInsns.push_back(MI);
continue;
}
@@ -622,10 +1211,10 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
// the two instructions and none of the instructions between the second
// and first alias with the second, we can combine the second into the
// first.
- if (!ModifiedRegs[MI->getOperand(0).getReg()] &&
- !(MI->mayLoad() && UsedRegs[MI->getOperand(0).getReg()]) &&
+ if (!ModifiedRegs[getLdStRegOp(MI).getReg()] &&
+ !(MI->mayLoad() && UsedRegs[getLdStRegOp(MI).getReg()]) &&
!mayAlias(MI, MemInsns, TII)) {
- MergeForward = false;
+ Flags.setMergeForward(false);
return MBBI;
}
@@ -633,11 +1222,10 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
// between the two instructions and none of the instructions between the
// first and the second alias with the first, we can combine the first
// into the second.
- if (!ModifiedRegs[FirstMI->getOperand(0).getReg()] &&
- !(FirstMI->mayLoad() &&
- UsedRegs[FirstMI->getOperand(0).getReg()]) &&
+ if (!ModifiedRegs[getLdStRegOp(FirstMI).getReg()] &&
+ !(MayLoad && UsedRegs[getLdStRegOp(FirstMI).getReg()]) &&
!mayAlias(FirstMI, MemInsns, TII)) {
- MergeForward = true;
+ Flags.setMergeForward(true);
return MBBI;
}
// Unable to combine these instructions due to interference in between.
@@ -666,51 +1254,9 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
}
MachineBasicBlock::iterator
-AArch64LoadStoreOpt::mergePreIdxUpdateInsn(MachineBasicBlock::iterator I,
- MachineBasicBlock::iterator Update) {
- assert((Update->getOpcode() == AArch64::ADDXri ||
- Update->getOpcode() == AArch64::SUBXri) &&
- "Unexpected base register update instruction to merge!");
- MachineBasicBlock::iterator NextI = I;
- // Return the instruction following the merged instruction, which is
- // the instruction following our unmerged load. Unless that's the add/sub
- // instruction we're merging, in which case it's the one after that.
- if (++NextI == Update)
- ++NextI;
-
- int Value = Update->getOperand(2).getImm();
- assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 &&
- "Can't merge 1 << 12 offset into pre-indexed load / store");
- if (Update->getOpcode() == AArch64::SUBXri)
- Value = -Value;
-
- unsigned NewOpc = getPreIndexedOpcode(I->getOpcode());
- MachineInstrBuilder MIB =
- BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
- .addOperand(Update->getOperand(0))
- .addOperand(I->getOperand(0))
- .addOperand(I->getOperand(1))
- .addImm(Value);
- (void)MIB;
-
- DEBUG(dbgs() << "Creating pre-indexed load/store.");
- DEBUG(dbgs() << " Replacing instructions:\n ");
- DEBUG(I->print(dbgs()));
- DEBUG(dbgs() << " ");
- DEBUG(Update->print(dbgs()));
- DEBUG(dbgs() << " with instruction:\n ");
- DEBUG(((MachineInstr *)MIB)->print(dbgs()));
- DEBUG(dbgs() << "\n");
-
- // Erase the old instructions for the block.
- I->eraseFromParent();
- Update->eraseFromParent();
-
- return NextI;
-}
-
-MachineBasicBlock::iterator AArch64LoadStoreOpt::mergePostIdxUpdateInsn(
- MachineBasicBlock::iterator I, MachineBasicBlock::iterator Update) {
+AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I,
+ MachineBasicBlock::iterator Update,
+ bool IsPreIdx) {
assert((Update->getOpcode() == AArch64::ADDXri ||
Update->getOpcode() == AArch64::SUBXri) &&
"Unexpected base register update instruction to merge!");
@@ -723,20 +1269,36 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::mergePostIdxUpdateInsn(
int Value = Update->getOperand(2).getImm();
assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 &&
- "Can't merge 1 << 12 offset into post-indexed load / store");
+ "Can't merge 1 << 12 offset into pre-/post-indexed load / store");
if (Update->getOpcode() == AArch64::SUBXri)
Value = -Value;
- unsigned NewOpc = getPostIndexedOpcode(I->getOpcode());
- MachineInstrBuilder MIB =
- BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
- .addOperand(Update->getOperand(0))
- .addOperand(I->getOperand(0))
- .addOperand(I->getOperand(1))
- .addImm(Value);
+ unsigned NewOpc = IsPreIdx ? getPreIndexedOpcode(I->getOpcode())
+ : getPostIndexedOpcode(I->getOpcode());
+ MachineInstrBuilder MIB;
+ if (!isPairedLdSt(I)) {
+ // Non-paired instruction.
+ MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
+ .addOperand(getLdStRegOp(Update))
+ .addOperand(getLdStRegOp(I))
+ .addOperand(getLdStBaseOp(I))
+ .addImm(Value);
+ } else {
+ // Paired instruction.
+ int Scale = getMemScale(I);
+ MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
+ .addOperand(getLdStRegOp(Update))
+ .addOperand(getLdStRegOp(I, 0))
+ .addOperand(getLdStRegOp(I, 1))
+ .addOperand(getLdStBaseOp(I))
+ .addImm(Value / Scale);
+ }
(void)MIB;
- DEBUG(dbgs() << "Creating post-indexed load/store.");
+ if (IsPreIdx)
+ DEBUG(dbgs() << "Creating pre-indexed load/store.");
+ else
+ DEBUG(dbgs() << "Creating post-indexed load/store.");
DEBUG(dbgs() << " Replacing instructions:\n ");
DEBUG(I->print(dbgs()));
DEBUG(dbgs() << " ");
@@ -752,8 +1314,9 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::mergePostIdxUpdateInsn(
return NextI;
}
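
A small standalone sketch of the immediate adjustment in the paired branch above; it is not part of the patch, and the add amount and scale are example values.

// Standalone sketch: for paired loads/stores, the pre-/post-index writeback
// amount from the ADD/SUB is stored in element units, i.e. divided by the
// memory scale, when it is attached to the new LDP/STP.
#include <cstdio>

int main() {
  int Value = 16;                  // add x2, x2, #16
  int Scale = 8;                   // getMemScale() of an X-register LDP/STP
  int EncodedImm = Value / Scale;  // immediate placed on the paired instruction
  // ldp x0, x1, [x2] ; add x2, x2, #16   becomes   ldp x0, x1, [x2], #16,
  // with the machine operand holding EncodedImm == 2.
  printf("encoded imm = %d\n", EncodedImm);
  return 0;
}
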
-static bool isMatchingUpdateInsn(MachineInstr *MI, unsigned BaseReg,
- int Offset) {
+bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr *MemMI,
+ MachineInstr *MI,
+ unsigned BaseReg, int Offset) {
switch (MI->getOpcode()) {
default:
break;
@@ -769,44 +1332,65 @@ static bool isMatchingUpdateInsn(MachineInstr *MI, unsigned BaseReg,
// Watch out for 1 << 12 shifted value.
if (AArch64_AM::getShiftValue(MI->getOperand(3).getImm()))
break;
- // If the instruction has the base register as source and dest and the
- // immediate will fit in a signed 9-bit integer, then we have a match.
- if (MI->getOperand(0).getReg() == BaseReg &&
- MI->getOperand(1).getReg() == BaseReg &&
- MI->getOperand(2).getImm() <= 255 &&
- MI->getOperand(2).getImm() >= -256) {
- // If we have a non-zero Offset, we check that it matches the amount
- // we're adding to the register.
- if (!Offset || Offset == MI->getOperand(2).getImm())
- return true;
+
+ // The update instruction source and destination register must be the
+ // same as the load/store base register.
+ if (MI->getOperand(0).getReg() != BaseReg ||
+ MI->getOperand(1).getReg() != BaseReg)
+ break;
+
+ bool IsPairedInsn = isPairedLdSt(MemMI);
+ int UpdateOffset = MI->getOperand(2).getImm();
+ // For non-paired load/store instructions, the immediate must fit in a
+ // signed 9-bit integer.
+ if (!IsPairedInsn && (UpdateOffset > 255 || UpdateOffset < -256))
+ break;
+
+ // For paired load/store instructions, the immediate must be a multiple of
+ // the scaling factor. The scaled offset must also fit into a signed 7-bit
+ // integer.
+ if (IsPairedInsn) {
+ int Scale = getMemScale(MemMI);
+ if (UpdateOffset % Scale != 0)
+ break;
+
+ int ScaledOffset = UpdateOffset / Scale;
+ if (ScaledOffset > 64 || ScaledOffset < -64)
+ break;
}
+
+ // If we have a non-zero Offset, we check that it matches the amount
+ // we're adding to the register.
+ if (!Offset || Offset == MI->getOperand(2).getImm())
+ return true;
break;
}
return false;
}
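
The acceptance conditions above can be summarized with a small standalone C++ sketch; it simply mirrors the checks in isMatchingUpdateInsn and is not part of the patch.

// Standalone sketch mirroring isMatchingUpdateInsn: a non-paired load/store can
// absorb a signed 9-bit writeback, while a paired one needs a multiple of the
// memory scale whose scaled value passes the range check used above.
#include <cstdio>

static bool updateOffsetOK(bool IsPaired, int Scale, int UpdateOffset) {
  if (!IsPaired)
    return UpdateOffset <= 255 && UpdateOffset >= -256;
  if (UpdateOffset % Scale != 0)
    return false;
  int ScaledOffset = UpdateOffset / Scale;
  return ScaledOffset <= 64 && ScaledOffset >= -64;
}

int main() {
  printf("%d\n", updateOffsetOK(false, 8, 255));  // 1: ldr x0, [x1], #255
  printf("%d\n", updateOffsetOK(true, 8, 20));    // 0: not a multiple of 8
  printf("%d\n", updateOffsetOK(true, 8, 256));   // 1: ldp x0, x1, [x2], #256
  return 0;
}
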
MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
- MachineBasicBlock::iterator I, unsigned Limit, int Value) {
+ MachineBasicBlock::iterator I, unsigned Limit, int UnscaledOffset) {
MachineBasicBlock::iterator E = I->getParent()->end();
MachineInstr *MemMI = I;
MachineBasicBlock::iterator MBBI = I;
- const MachineFunction &MF = *MemMI->getParent()->getParent();
- unsigned DestReg = MemMI->getOperand(0).getReg();
- unsigned BaseReg = MemMI->getOperand(1).getReg();
- int Offset = MemMI->getOperand(2).getImm() *
- TII->getRegClass(MemMI->getDesc(), 0, TRI, MF)->getSize();
+ unsigned BaseReg = getLdStBaseOp(MemMI).getReg();
+ int MIUnscaledOffset = getLdStOffsetOp(MemMI).getImm() * getMemScale(MemMI);
- // If the base register overlaps the destination register, we can't
- // merge the update.
- if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
+ // Scan forward looking for post-index opportunities. Updating instructions
+ // can't be formed if the memory instruction doesn't have the offset we're
+ // looking for.
+ if (MIUnscaledOffset != UnscaledOffset)
return E;
- // Scan forward looking for post-index opportunities.
- // Updating instructions can't be formed if the memory insn already
- // has an offset other than the value we're looking for.
- if (Offset != Value)
- return E;
+ // If the base register overlaps a destination register, we can't
+ // merge the update.
+ bool IsPairedInsn = isPairedLdSt(MemMI);
+ for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
+ unsigned DestReg = getLdStRegOp(MemMI, i).getReg();
+ if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
+ return E;
+ }
// Track which registers have been modified and used between the first insn
// (inclusive) and the second insn.
@@ -825,7 +1409,7 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
++Count;
// If we found a match, return it.
- if (isMatchingUpdateInsn(MI, BaseReg, Value))
+ if (isMatchingUpdateInsn(I, MI, BaseReg, UnscaledOffset))
return MBBI;
// Update the status of what the instruction clobbered and used.
@@ -845,21 +1429,22 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
MachineBasicBlock::iterator E = I->getParent()->end();
MachineInstr *MemMI = I;
MachineBasicBlock::iterator MBBI = I;
- const MachineFunction &MF = *MemMI->getParent()->getParent();
- unsigned DestReg = MemMI->getOperand(0).getReg();
- unsigned BaseReg = MemMI->getOperand(1).getReg();
- int Offset = MemMI->getOperand(2).getImm();
- unsigned RegSize = TII->getRegClass(MemMI->getDesc(), 0, TRI, MF)->getSize();
+ unsigned BaseReg = getLdStBaseOp(MemMI).getReg();
+ int Offset = getLdStOffsetOp(MemMI).getImm();
// If the load/store is the first instruction in the block, there's obviously
// not any matching update. Ditto if the memory offset isn't zero.
if (MBBI == B || Offset != 0)
return E;
- // If the base register overlaps the destination register, we can't
+ // If the base register overlaps a destination register, we can't
// merge the update.
- if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
- return E;
+ bool IsPairedInsn = isPairedLdSt(MemMI);
+ for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
+ unsigned DestReg = getLdStRegOp(MemMI, i).getReg();
+ if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
+ return E;
+ }
// Track which registers have been modified and used between the first insn
// (inclusive) and the second insn.
@@ -878,7 +1463,7 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
++Count;
// If we found a match, return it.
- if (isMatchingUpdateInsn(MI, BaseReg, RegSize))
+ if (isMatchingUpdateInsn(I, MI, BaseReg, Offset))
return MBBI;
// Update the status of what the instruction clobbered and used.
@@ -892,17 +1477,101 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
return E;
}
-bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) {
+bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore(
+ MachineBasicBlock::iterator &MBBI) {
+ MachineInstr *MI = MBBI;
+ // If this is a volatile load, don't mess with it.
+ if (MI->hasOrderedMemoryRef())
+ return false;
+
+ // Make sure this is a reg+imm.
+ // FIXME: It is possible to extend it to handle reg+reg cases.
+ if (!getLdStOffsetOp(MI).isImm())
+ return false;
+
+ // Look backward up to ScanLimit instructions.
+ MachineBasicBlock::iterator StoreI;
+ if (findMatchingStore(MBBI, ScanLimit, StoreI)) {
+ ++NumLoadsFromStoresPromoted;
+ // Promote the load. Keeping the iterator straight is a
+ // pain, so we let the merge routine tell us what the next instruction
+ // is after it's done mucking about.
+ MBBI = promoteLoadFromStore(MBBI, StoreI);
+ return true;
+ }
+ return false;
+}
+
+bool AArch64LoadStoreOpt::tryToMergeLdStInst(
+ MachineBasicBlock::iterator &MBBI) {
+ MachineInstr *MI = MBBI;
+ MachineBasicBlock::iterator E = MI->getParent()->end();
+ // If this is a volatile load/store, don't mess with it.
+ if (MI->hasOrderedMemoryRef())
+ return false;
+
+ // Make sure this is a reg+imm (as opposed to an address reloc).
+ if (!getLdStOffsetOp(MI).isImm())
+ return false;
+
+ // Check if this load/store has a hint to avoid pair formation.
+ // MachineMemOperands hints are set by the AArch64StorePairSuppress pass.
+ if (TII->isLdStPairSuppressed(MI))
+ return false;
+
+ // Look ahead up to ScanLimit instructions for a pairable instruction.
+ LdStPairFlags Flags;
+ MachineBasicBlock::iterator Paired = findMatchingInsn(MBBI, Flags, ScanLimit);
+ if (Paired != E) {
+ if (isNarrowLoad(MI)) {
+ ++NumNarrowLoadsPromoted;
+ } else if (isNarrowStore(MI)) {
+ ++NumZeroStoresPromoted;
+ } else {
+ ++NumPairCreated;
+ if (isUnscaledLdSt(MI))
+ ++NumUnscaledPairCreated;
+ }
+
+ // Merge the loads into a pair. Keeping the iterator straight is a
+ // pain, so we let the merge routine tell us what the next instruction
+ // is after it's done mucking about.
+ MBBI = mergePairedInsns(MBBI, Paired, Flags);
+ return true;
+ }
+ return false;
+}
+
+bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
+ bool enableNarrowLdOpt) {
bool Modified = false;
- // Two tranformations to do here:
- // 1) Find loads and stores that can be merged into a single load or store
+ // Four transformations to do here:
+ // 1) Find loads that directly read from stores and promote them by
+ // replacing them with mov instructions. If the store is wider than the load,
+ // the load will be replaced with a bitfield extract.
+ // e.g.,
+ // str w1, [x0, #4]
+ // ldrh w2, [x0, #6]
+ // ; becomes
+ // str w1, [x0, #4]
+ // lsr w2, w1, #16
+ // 2) Find narrow loads that can be converted into a single wider load
+ // with bitfield extract instructions.
+ // e.g.,
+ // ldrh w0, [x2]
+ // ldrh w1, [x2, #2]
+ // ; becomes
+ // ldr w0, [x2]
+ // ubfx w1, w0, #16, #16
+ // and w0, w0, #ffff
+ // 3) Find loads and stores that can be merged into a single load or store
// pair instruction.
// e.g.,
// ldr x0, [x2]
// ldr x1, [x2, #8]
// ; becomes
// ldp x0, x1, [x2]
- // 2) Find base register updates that can be merged into the load or store
+ // 4) Find base register updates that can be merged into the load or store
// as a base-reg writeback.
// e.g.,
// ldr x0, [x2]
@@ -918,6 +1587,69 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) {
// Just move on to the next instruction.
++MBBI;
break;
+ // Scaled instructions.
+ case AArch64::LDRBBui:
+ case AArch64::LDRHHui:
+ case AArch64::LDRWui:
+ case AArch64::LDRXui:
+ // Unscaled instructions.
+ case AArch64::LDURBBi:
+ case AArch64::LDURHHi:
+ case AArch64::LDURWi:
+ case AArch64::LDURXi: {
+ if (tryToPromoteLoadFromStore(MBBI)) {
+ Modified = true;
+ break;
+ }
+ ++MBBI;
+ break;
+ }
+ // FIXME: Do the other instructions.
+ }
+ }
+
+ for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+ enableNarrowLdOpt && MBBI != E;) {
+ MachineInstr *MI = MBBI;
+ switch (MI->getOpcode()) {
+ default:
+ // Just move on to the next instruction.
+ ++MBBI;
+ break;
+ // Scaled instructions.
+ case AArch64::LDRBBui:
+ case AArch64::LDRHHui:
+ case AArch64::LDRSBWui:
+ case AArch64::LDRSHWui:
+ case AArch64::STRBBui:
+ case AArch64::STRHHui:
+ // Unscaled instructions.
+ case AArch64::LDURBBi:
+ case AArch64::LDURHHi:
+ case AArch64::LDURSBWi:
+ case AArch64::LDURSHWi:
+ case AArch64::STURBBi:
+ case AArch64::STURHHi: {
+ if (tryToMergeLdStInst(MBBI)) {
+ Modified = true;
+ break;
+ }
+ ++MBBI;
+ break;
+ }
+ // FIXME: Do the other instructions.
+ }
+ }
+
+ for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+ MBBI != E;) {
+ MachineInstr *MI = MBBI;
+ switch (MI->getOpcode()) {
+ default:
+ // Just move on to the next instruction.
+ ++MBBI;
+ break;
+ // Scaled instructions.
case AArch64::STRSui:
case AArch64::STRDui:
case AArch64::STRQui:
@@ -929,7 +1661,7 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) {
case AArch64::LDRXui:
case AArch64::LDRWui:
case AArch64::LDRSWui:
- // do the unscaled versions as well
+ // Unscaled instructions.
case AArch64::STURSi:
case AArch64::STURDi:
case AArch64::STURQi:
@@ -941,37 +1673,8 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) {
case AArch64::LDURWi:
case AArch64::LDURXi:
case AArch64::LDURSWi: {
- // If this is a volatile load/store, don't mess with it.
- if (MI->hasOrderedMemoryRef()) {
- ++MBBI;
- break;
- }
- // Make sure this is a reg+imm (as opposed to an address reloc).
- if (!MI->getOperand(2).isImm()) {
- ++MBBI;
- break;
- }
- // Check if this load/store has a hint to avoid pair formation.
- // MachineMemOperands hints are set by the AArch64StorePairSuppress pass.
- if (TII->isLdStPairSuppressed(MI)) {
- ++MBBI;
- break;
- }
- // Look ahead up to ScanLimit instructions for a pairable instruction.
- bool MergeForward = false;
- int SExtIdx = -1;
- MachineBasicBlock::iterator Paired =
- findMatchingInsn(MBBI, MergeForward, SExtIdx, ScanLimit);
- if (Paired != E) {
- // Merge the loads into a pair. Keeping the iterator straight is a
- // pain, so we let the merge routine tell us what the next instruction
- // is after it's done mucking about.
- MBBI = mergePairedInsns(MBBI, Paired, MergeForward, SExtIdx);
-
+ if (tryToMergeLdStInst(MBBI)) {
Modified = true;
- ++NumPairCreated;
- if (isUnscaledLdst(MI->getOpcode()))
- ++NumUnscaledPairCreated;
break;
}
++MBBI;
@@ -992,17 +1695,22 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) {
// Just move on to the next instruction.
++MBBI;
break;
+ // Scaled instructions.
case AArch64::STRSui:
case AArch64::STRDui:
case AArch64::STRQui:
case AArch64::STRXui:
case AArch64::STRWui:
+ case AArch64::STRHHui:
+ case AArch64::STRBBui:
case AArch64::LDRSui:
case AArch64::LDRDui:
case AArch64::LDRQui:
case AArch64::LDRXui:
case AArch64::LDRWui:
- // do the unscaled versions as well
+ case AArch64::LDRHHui:
+ case AArch64::LDRBBui:
+ // Unscaled instructions.
case AArch64::STURSi:
case AArch64::STURDi:
case AArch64::STURQi:
@@ -1012,25 +1720,41 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) {
case AArch64::LDURDi:
case AArch64::LDURQi:
case AArch64::LDURWi:
- case AArch64::LDURXi: {
+ case AArch64::LDURXi:
+ // Paired instructions.
+ case AArch64::LDPSi:
+ case AArch64::LDPSWi:
+ case AArch64::LDPDi:
+ case AArch64::LDPQi:
+ case AArch64::LDPWi:
+ case AArch64::LDPXi:
+ case AArch64::STPSi:
+ case AArch64::STPDi:
+ case AArch64::STPQi:
+ case AArch64::STPWi:
+ case AArch64::STPXi: {
// Make sure this is a reg+imm (as opposed to an address reloc).
- if (!MI->getOperand(2).isImm()) {
+ if (!getLdStOffsetOp(MI).isImm()) {
++MBBI;
break;
}
- // Look ahead up to ScanLimit instructions for a mergable instruction.
+ // Look forward to try to form a post-index instruction. For example,
+ // ldr x0, [x20]
+ // add x20, x20, #32
+ // merged into:
+ // ldr x0, [x20], #32
MachineBasicBlock::iterator Update =
findMatchingUpdateInsnForward(MBBI, ScanLimit, 0);
if (Update != E) {
// Merge the update into the ld/st.
- MBBI = mergePostIdxUpdateInsn(MBBI, Update);
+ MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/false);
Modified = true;
++NumPostFolded;
break;
}
// Don't know how to handle pre/post-index versions, so move to the next
// instruction.
- if (isUnscaledLdst(Opc)) {
+ if (isUnscaledLdSt(Opc)) {
++MBBI;
break;
}
@@ -1043,28 +1767,25 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) {
Update = findMatchingUpdateInsnBackward(MBBI, ScanLimit);
if (Update != E) {
// Merge the update into the ld/st.
- MBBI = mergePreIdxUpdateInsn(MBBI, Update);
+ MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
Modified = true;
++NumPreFolded;
break;
}
+ // The immediate in the load/store is scaled by the size of the memory
+ // operation. The immediate in the add we're looking for,
+ // however, is not, so adjust here.
+ int UnscaledOffset = getLdStOffsetOp(MI).getImm() * getMemScale(MI);
// Look forward to try to find a post-index instruction. For example,
// ldr x1, [x0, #64]
// add x0, x0, #64
// merged into:
// ldr x1, [x0, #64]!
-
- // The immediate in the load/store is scaled by the size of the register
- // being loaded. The immediate in the add we're looking for,
- // however, is not, so adjust here.
- int Value = MI->getOperand(2).getImm() *
- TII->getRegClass(MI->getDesc(), 0, TRI, *(MBB.getParent()))
- ->getSize();
- Update = findMatchingUpdateInsnForward(MBBI, ScanLimit, Value);
+ Update = findMatchingUpdateInsnForward(MBBI, ScanLimit, UnscaledOffset);
if (Update != E) {
// Merge the update into the ld/st.
- MBBI = mergePreIdxUpdateInsn(MBBI, Update);
+ MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
Modified = true;
++NumPreFolded;
break;
@@ -1081,13 +1802,24 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) {
return Modified;
}
+bool AArch64LoadStoreOpt::enableNarrowLdMerge(MachineFunction &Fn) {
+ bool ProfitableArch = Subtarget->isCortexA57();
+ // FIXME: The benefit from converting narrow loads into a wider load could be
+ // microarchitectural, as it assumes that a single load with two bitfield
+ // extracts is cheaper than two narrow loads. Currently, this conversion is
+ // enabled only on Cortex-A57, where the performance benefit was verified.
+ return ProfitableArch && !Subtarget->requiresStrictAlign();
+}
+
bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
- TII = static_cast<const AArch64InstrInfo *>(Fn.getSubtarget().getInstrInfo());
- TRI = Fn.getSubtarget().getRegisterInfo();
+ Subtarget = &static_cast<const AArch64Subtarget &>(Fn.getSubtarget());
+ TII = static_cast<const AArch64InstrInfo *>(Subtarget->getInstrInfo());
+ TRI = Subtarget->getRegisterInfo();
bool Modified = false;
+ bool enableNarrowLdOpt = enableNarrowLdMerge(Fn);
for (auto &MBB : Fn)
- Modified |= optimizeBlock(MBB);
+ Modified |= optimizeBlock(MBB, enableNarrowLdOpt);
return Modified;
}
@@ -1095,8 +1827,8 @@ bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
// FIXME: Do we need/want a pre-alloc pass like ARM has to try to keep
// loads and stores near one another?
-/// createARMLoadStoreOptimizationPass - returns an instance of the load / store
-/// optimization pass.
+/// createAArch64LoadStoreOptimizationPass - returns an instance of the
+/// load / store optimization pass.
FunctionPass *llvm::createAArch64LoadStoreOptimizationPass() {
return new AArch64LoadStoreOpt();
}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp b/contrib/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp
index 580427ab3cc1..2b4cdf1083be 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp
@@ -207,9 +207,9 @@ bool AArch64MCInstLower::lowerOperand(const MachineOperand &MO,
void AArch64MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
OutMI.setOpcode(MI->getOpcode());
- for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ for (const MachineOperand &MO : MI->operands()) {
MCOperand MCOp;
- if (lowerOperand(MI->getOperand(i), MCOp))
+ if (lowerOperand(MO, MCOp))
OutMI.addOperand(MCOp);
}
}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64MachineCombinerPattern.h b/contrib/llvm/lib/Target/AArch64/AArch64MachineCombinerPattern.h
deleted file mode 100644
index 4164b3364559..000000000000
--- a/contrib/llvm/lib/Target/AArch64/AArch64MachineCombinerPattern.h
+++ /dev/null
@@ -1,42 +0,0 @@
-//===- AArch64MachineCombinerPattern.h -===//
-//===- AArch64 instruction pattern supported by combiner -===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines instruction pattern supported by combiner
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64MACHINECOMBINERPATTERN_H
-#define LLVM_LIB_TARGET_AARCH64_AARCH64MACHINECOMBINERPATTERN_H
-
-namespace llvm {
-
-/// Enumeration of instruction pattern supported by machine combiner
-///
-///
-namespace MachineCombinerPattern {
-enum MC_PATTERN : int {
- MC_NONE = 0,
- MC_MULADDW_OP1 = 1,
- MC_MULADDW_OP2 = 2,
- MC_MULSUBW_OP1 = 3,
- MC_MULSUBW_OP2 = 4,
- MC_MULADDWI_OP1 = 5,
- MC_MULSUBWI_OP1 = 6,
- MC_MULADDX_OP1 = 7,
- MC_MULADDX_OP2 = 8,
- MC_MULSUBX_OP1 = 9,
- MC_MULSUBX_OP2 = 10,
- MC_MULADDXI_OP1 = 11,
- MC_MULSUBXI_OP1 = 12
-};
-} // end namespace MachineCombinerPattern
-} // end namespace llvm
-
-#endif
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/contrib/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
index 536a8d0f97a0..318f83953505 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ b/contrib/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -1,4 +1,4 @@
-//=- AArch64MachineFuctionInfo.h - AArch64 machine function info --*- C++ -*-=//
+//=- AArch64MachineFunctionInfo.h - AArch64 machine function info -*- C++ -*-=//
//
// The LLVM Compiler Infrastructure
//
@@ -42,7 +42,7 @@ class AArch64FunctionInfo : public MachineFunctionInfo {
unsigned ArgumentStackToRestore;
/// HasStackFrame - True if this function has a stack frame. Set by
- /// processFunctionBeforeCalleeSavedScan().
+ /// determineCalleeSaves().
bool HasStackFrame;
/// \brief Amount of stack frame size, not including callee-saved registers.
@@ -72,16 +72,22 @@ class AArch64FunctionInfo : public MachineFunctionInfo {
/// registers.
unsigned VarArgsFPRSize;
+ /// True if this function has a subset of CSRs that is handled explicitly via
+ /// copies.
+ bool IsSplitCSR;
+
public:
AArch64FunctionInfo()
: BytesInStackArgArea(0), ArgumentStackToRestore(0), HasStackFrame(false),
NumLocalDynamicTLSAccesses(0), VarArgsStackIndex(0), VarArgsGPRIndex(0),
- VarArgsGPRSize(0), VarArgsFPRIndex(0), VarArgsFPRSize(0) {}
+ VarArgsGPRSize(0), VarArgsFPRIndex(0), VarArgsFPRSize(0),
+ IsSplitCSR(false) {}
explicit AArch64FunctionInfo(MachineFunction &MF)
: BytesInStackArgArea(0), ArgumentStackToRestore(0), HasStackFrame(false),
NumLocalDynamicTLSAccesses(0), VarArgsStackIndex(0), VarArgsGPRIndex(0),
- VarArgsGPRSize(0), VarArgsFPRIndex(0), VarArgsFPRSize(0) {
+ VarArgsGPRSize(0), VarArgsFPRIndex(0), VarArgsFPRSize(0),
+ IsSplitCSR(false) {
(void)MF;
}
@@ -96,6 +102,9 @@ public:
bool hasStackFrame() const { return HasStackFrame; }
void setHasStackFrame(bool s) { HasStackFrame = s; }
+ bool isSplitCSR() const { return IsSplitCSR; }
+ void setIsSplitCSR(bool s) { IsSplitCSR = s; }
+
void setLocalStackSize(unsigned Size) { LocalStackSize = Size; }
unsigned getLocalStackSize() const { return LocalStackSize; }
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64PromoteConstant.cpp b/contrib/llvm/lib/Target/AArch64/AArch64PromoteConstant.cpp
index e1b93bf07c89..79c09d9f058d 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64PromoteConstant.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64PromoteConstant.cpp
@@ -489,7 +489,7 @@ bool AArch64PromoteConstant::insertDefinitions(
for (const auto &IPI : InsertPts) {
// Create the load of the global variable.
- IRBuilder<> Builder(IPI.first->getParent(), IPI.first);
+ IRBuilder<> Builder(IPI.first);
LoadInst *LoadedCst = Builder.CreateLoad(PromotedGV);
DEBUG(dbgs() << "**********\n");
DEBUG(dbgs() << "New def: ");
@@ -540,7 +540,7 @@ bool AArch64PromoteConstant::runOnFunction(Function &F) {
bool LocalChange = false;
SmallPtrSet<Constant *, 8> AlreadyChecked;
- for (Instruction &I : inst_range(&F)) {
+ for (Instruction &I : instructions(&F)) {
// Traverse the operand, looking for constant vectors. Replace them by a
// load of a global variable of constant vector type.
for (Value *Op : I.operand_values()) {
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
index 841af55f7a65..32b4888f2f64 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -15,6 +15,7 @@
#include "AArch64RegisterInfo.h"
#include "AArch64FrameLowering.h"
#include "AArch64InstrInfo.h"
+#include "AArch64MachineFunctionInfo.h"
#include "AArch64Subtarget.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/ADT/BitVector.h"
@@ -34,10 +35,6 @@ using namespace llvm;
#define GET_REGINFO_TARGET_DESC
#include "AArch64GenRegisterInfo.inc"
-static cl::opt<bool>
-ReserveX18("aarch64-reserve-x18", cl::Hidden,
- cl::desc("Reserve X18, making it unavailable as GPR"));
-
AArch64RegisterInfo::AArch64RegisterInfo(const Triple &TT)
: AArch64GenRegisterInfo(AArch64::LR), TT(TT) {}
@@ -50,10 +47,23 @@ AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
return CSR_AArch64_NoRegs_SaveList;
if (MF->getFunction()->getCallingConv() == CallingConv::AnyReg)
return CSR_AArch64_AllRegs_SaveList;
+ if (MF->getFunction()->getCallingConv() == CallingConv::CXX_FAST_TLS)
+ return MF->getInfo<AArch64FunctionInfo>()->isSplitCSR() ?
+ CSR_AArch64_CXX_TLS_Darwin_PE_SaveList :
+ CSR_AArch64_CXX_TLS_Darwin_SaveList;
else
return CSR_AArch64_AAPCS_SaveList;
}
+const MCPhysReg *AArch64RegisterInfo::getCalleeSavedRegsViaCopy(
+ const MachineFunction *MF) const {
+ assert(MF && "Invalid MachineFunction pointer.");
+ if (MF->getFunction()->getCallingConv() == CallingConv::CXX_FAST_TLS &&
+ MF->getInfo<AArch64FunctionInfo>()->isSplitCSR())
+ return CSR_AArch64_CXX_TLS_Darwin_ViaCopy_SaveList;
+ return nullptr;
+}
+
const uint32_t *
AArch64RegisterInfo::getCallPreservedMask(const MachineFunction &MF,
CallingConv::ID CC) const {
@@ -62,6 +72,8 @@ AArch64RegisterInfo::getCallPreservedMask(const MachineFunction &MF,
return CSR_AArch64_NoRegs_RegMask;
if (CC == CallingConv::AnyReg)
return CSR_AArch64_AllRegs_RegMask;
+ if (CC == CallingConv::CXX_FAST_TLS)
+ return CSR_AArch64_CXX_TLS_Darwin_RegMask;
else
return CSR_AArch64_AAPCS_RegMask;
}
@@ -104,7 +116,7 @@ AArch64RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
Reserved.set(AArch64::W29);
}
- if (TT.isOSDarwin() || ReserveX18) {
+ if (MF.getSubtarget<AArch64Subtarget>().isX18Reserved()) {
Reserved.set(AArch64::X18); // Platform register
Reserved.set(AArch64::W18);
}
@@ -131,7 +143,7 @@ bool AArch64RegisterInfo::isReservedReg(const MachineFunction &MF,
return true;
case AArch64::X18:
case AArch64::W18:
- return TT.isOSDarwin() || ReserveX18;
+ return MF.getSubtarget<AArch64Subtarget>().isX18Reserved();
case AArch64::FP:
case AArch64::W29:
return TFI->hasFP(MF) || TT.isOSDarwin();
@@ -186,29 +198,6 @@ bool AArch64RegisterInfo::hasBasePointer(const MachineFunction &MF) const {
return false;
}
-bool AArch64RegisterInfo::canRealignStack(const MachineFunction &MF) const {
-
- if (MF.getFunction()->hasFnAttribute("no-realign-stack"))
- return false;
-
- return true;
-}
-
-// FIXME: share this with other backends with identical implementation?
-bool
-AArch64RegisterInfo::needsStackRealignment(const MachineFunction &MF) const {
- const MachineFrameInfo *MFI = MF.getFrameInfo();
- const AArch64FrameLowering *TFI = getFrameLowering(MF);
- const Function *F = MF.getFunction();
- unsigned StackAlign = TFI->getStackAlignment();
- bool requiresRealignment =
- ((MFI->getMaxAlignment() > StackAlign) ||
- F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
- Attribute::StackAlignment));
-
- return requiresRealignment && canRealignStack(MF);
-}
-
unsigned
AArch64RegisterInfo::getFrameRegister(const MachineFunction &MF) const {
const AArch64FrameLowering *TFI = getFrameLowering(MF);
@@ -424,10 +413,11 @@ unsigned AArch64RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
case AArch64::GPR64RegClassID:
case AArch64::GPR32commonRegClassID:
case AArch64::GPR64commonRegClassID:
- return 32 - 1 // XZR/SP
- - (TFI->hasFP(MF) || TT.isOSDarwin()) // FP
- - (TT.isOSDarwin() || ReserveX18) // X18 reserved as platform register
- - hasBasePointer(MF); // X19
+ return 32 - 1 // XZR/SP
+ - (TFI->hasFP(MF) || TT.isOSDarwin()) // FP
+ - MF.getSubtarget<AArch64Subtarget>()
+ .isX18Reserved() // X18 reserved as platform register
+ - hasBasePointer(MF); // X19
case AArch64::FPR8RegClassID:
case AArch64::FPR16RegClassID:
case AArch64::FPR32RegClassID:
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.h b/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.h
index 8c379d926108..f33f788fd437 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.h
+++ b/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.h
@@ -35,6 +35,8 @@ public:
/// Code Generation virtual methods...
const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override;
+ const MCPhysReg *
+ getCalleeSavedRegsViaCopy(const MachineFunction *MF) const override;
const uint32_t *getCallPreservedMask(const MachineFunction &MF,
CallingConv::ID) const override;
@@ -93,9 +95,6 @@ public:
unsigned getRegPressureLimit(const TargetRegisterClass *RC,
MachineFunction &MF) const override;
- // Base pointer (stack realignment) support.
- bool canRealignStack(const MachineFunction &MF) const;
- bool needsStackRealignment(const MachineFunction &MF) const override;
};
} // end namespace llvm
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.td b/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
index b2efca023372..a8c8b176efa9 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
+++ b/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
@@ -407,7 +407,7 @@ def FPR128 : RegisterClass<"AArch64",
// The lower 16 vector registers. Some instructions can only take registers
// in this range.
def FPR128_lo : RegisterClass<"AArch64",
- [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64, v8f16],
128, (trunc FPR128, 16)>;
// Pairs, triples, and quads of 64-bit vector registers.
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
index 486efd6ce3a2..f6ee8cf47a6a 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -31,6 +31,11 @@ static cl::opt<bool>
EnableEarlyIfConvert("aarch64-early-ifcvt", cl::desc("Enable the early if "
"converter pass"), cl::init(true), cl::Hidden);
+// If the OS supports TBI, use this flag to enable it.
+static cl::opt<bool>
+UseAddressTopByteIgnored("aarch64-use-tbi", cl::desc("Assume that top byte of "
+ "an address is ignored"), cl::init(false), cl::Hidden);
+
AArch64Subtarget &
AArch64Subtarget::initializeSubtargetDependencies(StringRef FS) {
// Determine default and user-specified characteristics
@@ -46,9 +51,11 @@ AArch64Subtarget::AArch64Subtarget(const Triple &TT, const std::string &CPU,
const std::string &FS,
const TargetMachine &TM, bool LittleEndian)
: AArch64GenSubtargetInfo(TT, CPU, FS), ARMProcFamily(Others),
- HasV8_1aOps(false), HasFPARMv8(false), HasNEON(false), HasCrypto(false),
- HasCRC(false), HasZeroCycleRegMove(false), HasZeroCycleZeroing(false),
- IsLittle(LittleEndian), CPUString(CPU), TargetTriple(TT), FrameLowering(),
+ HasV8_1aOps(false), HasV8_2aOps(false), HasFPARMv8(false), HasNEON(false),
+ HasCrypto(false), HasCRC(false), HasPerfMon(false), HasFullFP16(false),
+ HasZeroCycleRegMove(false), HasZeroCycleZeroing(false),
+ StrictAlign(false), ReserveX18(TT.isOSDarwin()), IsLittle(LittleEndian),
+ CPUString(CPU), TargetTriple(TT), FrameLowering(),
InstrInfo(initializeSubtargetDependencies(FS)), TSInfo(),
TLInfo(TM, *this) {}
@@ -113,12 +120,30 @@ void AArch64Subtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
// bi-directional scheduling. 253.perlbmk.
Policy.OnlyTopDown = false;
Policy.OnlyBottomUp = false;
+ // Enabling or disabling the latency heuristic is a close call: it seems to
+ // help nearly no benchmark on out-of-order architectures, while on the other
+ // hand it regresses register pressure on a few benchmarks.
+ if (isCyclone())
+ Policy.DisableLatencyHeuristic = true;
}
bool AArch64Subtarget::enableEarlyIfConversion() const {
return EnableEarlyIfConvert;
}
+bool AArch64Subtarget::supportsAddressTopByteIgnored() const {
+ if (!UseAddressTopByteIgnored)
+ return false;
+
+ if (TargetTriple.isiOS()) {
+ unsigned Major, Minor, Micro;
+ TargetTriple.getiOSVersion(Major, Minor, Micro);
+ return Major >= 8;
+ }
+
+ return false;
+}
+
std::unique_ptr<PBQPRAConstraint>
AArch64Subtarget::getCustomPBQPConstraints() const {
if (!isCortexA57())
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.h b/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.h
index 6bb069423060..1b8b9b27719c 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.h
+++ b/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.h
@@ -33,17 +33,21 @@ class Triple;
class AArch64Subtarget : public AArch64GenSubtargetInfo {
protected:
- enum ARMProcFamilyEnum {Others, CortexA53, CortexA57, Cyclone};
+ enum ARMProcFamilyEnum {Others, CortexA35, CortexA53, CortexA57, Cyclone};
/// ARMProcFamily - ARM processor family: Cortex-A53, Cortex-A57, and others.
ARMProcFamilyEnum ARMProcFamily;
bool HasV8_1aOps;
+ bool HasV8_2aOps;
bool HasFPARMv8;
bool HasNEON;
bool HasCrypto;
bool HasCRC;
+ bool HasPerfMon;
+ bool HasFullFP16;
+ bool HasSPE;
// HasZeroCycleRegMove - Has zero-cycle register mov instructions.
bool HasZeroCycleRegMove;
@@ -51,6 +55,12 @@ protected:
// HasZeroCycleZeroing - Has zero-cycle zeroing instructions.
bool HasZeroCycleZeroing;
+ // StrictAlign - Disallow unaligned memory accesses.
+ bool StrictAlign;
+
+ // ReserveX18 - X18 is not available as a general purpose register.
+ bool ReserveX18;
+
bool IsLittle;
/// CPUString - String name of used CPU.
@@ -92,19 +102,30 @@ public:
const Triple &getTargetTriple() const { return TargetTriple; }
bool enableMachineScheduler() const override { return true; }
bool enablePostRAScheduler() const override {
- return isCortexA53() || isCortexA57();
+ return isGeneric() || isCortexA53() || isCortexA57();
}
bool hasV8_1aOps() const { return HasV8_1aOps; }
+ bool hasV8_2aOps() const { return HasV8_2aOps; }
bool hasZeroCycleRegMove() const { return HasZeroCycleRegMove; }
bool hasZeroCycleZeroing() const { return HasZeroCycleZeroing; }
+ bool requiresStrictAlign() const { return StrictAlign; }
+
+ bool isX18Reserved() const { return ReserveX18; }
bool hasFPARMv8() const { return HasFPARMv8; }
bool hasNEON() const { return HasNEON; }
bool hasCrypto() const { return HasCrypto; }
bool hasCRC() const { return HasCRC; }
+ /// CPU has TBI (top byte of addresses is ignored during HW address
+ /// translation) and OS enables it.
+ bool supportsAddressTopByteIgnored() const;
+
+ bool hasPerfMon() const { return HasPerfMon; }
+ bool hasFullFP16() const { return HasFullFP16; }
+ bool hasSPE() const { return HasSPE; }
bool isLittleEndian() const { return IsLittle; }
@@ -112,11 +133,13 @@ public:
bool isTargetIOS() const { return TargetTriple.isiOS(); }
bool isTargetLinux() const { return TargetTriple.isOSLinux(); }
bool isTargetWindows() const { return TargetTriple.isOSWindows(); }
+ bool isTargetAndroid() const { return TargetTriple.isAndroid(); }
bool isTargetCOFF() const { return TargetTriple.isOSBinFormatCOFF(); }
bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); }
bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); }
+ bool isGeneric() const { return CPUString == "generic"; }
bool isCyclone() const { return CPUString == "cyclone"; }
bool isCortexA57() const { return CPUString == "cortex-a57"; }
bool isCortexA53() const { return CPUString == "cortex-a53"; }
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
index db6e244337a7..c52c5544fc7e 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -203,7 +203,7 @@ public:
} // namespace
TargetIRAnalysis AArch64TargetMachine::getTargetIRAnalysis() {
- return TargetIRAnalysis([this](Function &F) {
+ return TargetIRAnalysis([this](const Function &F) {
return TargetTransformInfo(AArch64TTIImpl(this, F));
});
}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index e085cca35f1c..9af0e6444789 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -23,7 +23,7 @@ using namespace llvm;
/// \brief Calculate the cost of materializing a 64-bit value. This helper
/// method might only calculate a fraction of a larger immediate. Therefore it
/// is valid to return a cost of ZERO.
-unsigned AArch64TTIImpl::getIntImmCost(int64_t Val) {
+int AArch64TTIImpl::getIntImmCost(int64_t Val) {
// Check if the immediate can be encoded within an instruction.
if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64))
return 0;
@@ -37,7 +37,7 @@ unsigned AArch64TTIImpl::getIntImmCost(int64_t Val) {
}
/// \brief Calculate the cost of materializing the given constant.
-unsigned AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
+int AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
assert(Ty->isIntegerTy());
unsigned BitSize = Ty->getPrimitiveSizeInBits();
@@ -51,18 +51,18 @@ unsigned AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
// Split the constant into 64-bit chunks and calculate the cost for each
// chunk.
- unsigned Cost = 0;
+ int Cost = 0;
for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
int64_t Val = Tmp.getSExtValue();
Cost += getIntImmCost(Val);
}
// We need at least one instruction to materialze the constant.
- return std::max(1U, Cost);
+ return std::max(1, Cost);
}
-unsigned AArch64TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx,
- const APInt &Imm, Type *Ty) {
+int AArch64TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx,
+ const APInt &Imm, Type *Ty) {
assert(Ty->isIntegerTy());
unsigned BitSize = Ty->getPrimitiveSizeInBits();
@@ -118,17 +118,17 @@ unsigned AArch64TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx,
}
if (Idx == ImmIdx) {
- unsigned NumConstants = (BitSize + 63) / 64;
- unsigned Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty);
+ int NumConstants = (BitSize + 63) / 64;
+ int Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty);
return (Cost <= NumConstants * TTI::TCC_Basic)
- ? static_cast<unsigned>(TTI::TCC_Free)
+ ? static_cast<int>(TTI::TCC_Free)
: Cost;
}
return AArch64TTIImpl::getIntImmCost(Imm, Ty);
}
-unsigned AArch64TTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx,
- const APInt &Imm, Type *Ty) {
+int AArch64TTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx,
+ const APInt &Imm, Type *Ty) {
assert(Ty->isIntegerTy());
unsigned BitSize = Ty->getPrimitiveSizeInBits();
@@ -147,10 +147,10 @@ unsigned AArch64TTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx,
case Intrinsic::smul_with_overflow:
case Intrinsic::umul_with_overflow:
if (Idx == 1) {
- unsigned NumConstants = (BitSize + 63) / 64;
- unsigned Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty);
+ int NumConstants = (BitSize + 63) / 64;
+ int Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty);
return (Cost <= NumConstants * TTI::TCC_Basic)
- ? static_cast<unsigned>(TTI::TCC_Free)
+ ? static_cast<int>(TTI::TCC_Free)
: Cost;
}
break;
@@ -176,8 +176,7 @@ AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) {
return TTI::PSK_Software;
}
-unsigned AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
- Type *Src) {
+int AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
int ISD = TLI->InstructionOpcodeToISD(Opcode);
assert(ISD && "Invalid opcode");
@@ -187,7 +186,31 @@ unsigned AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
if (!SrcTy.isSimple() || !DstTy.isSimple())
return BaseT::getCastInstrCost(Opcode, Dst, Src);
- static const TypeConversionCostTblEntry<MVT> ConversionTbl[] = {
+ static const TypeConversionCostTblEntry
+ ConversionTbl[] = {
+ { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 },
+ { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 0 },
+ { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 },
+ { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 },
+
+ // The number of shll instructions for the extension.
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
+ { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
+ { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
+ { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
+ { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
+ { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
+ { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
+ { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
+ { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
+ { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
+ { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
+ { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
+ { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
+ { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
+ { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
+
// LowerVectorINT_TO_FP:
{ ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
{ ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
@@ -210,6 +233,16 @@ unsigned AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
{ ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
{ ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
+ // Complex: to v8f32
+ { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 },
+ { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
+ { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 },
+ { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
+
+ // Complex: to v16f32
+ { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 },
+ { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 },
+
// Complex: to v2f64
{ ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
{ ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 },
@@ -250,22 +283,21 @@ unsigned AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
{ ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 2 },
};
- int Idx = ConvertCostTableLookup<MVT>(
- ConversionTbl, array_lengthof(ConversionTbl), ISD, DstTy.getSimpleVT(),
- SrcTy.getSimpleVT());
- if (Idx != -1)
- return ConversionTbl[Idx].Cost;
+ if (const auto *Entry = ConvertCostTableLookup(ConversionTbl, ISD,
+ DstTy.getSimpleVT(),
+ SrcTy.getSimpleVT()))
+ return Entry->Cost;
return BaseT::getCastInstrCost(Opcode, Dst, Src);
}
-unsigned AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
- unsigned Index) {
+int AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
+ unsigned Index) {
assert(Val->isVectorTy() && "This must be a vector type");
if (Index != -1U) {
// Legalize the type.
- std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Val);
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Val);
// This type is legalized to a scalar type.
if (!LT.second.isVector())
@@ -281,15 +313,15 @@ unsigned AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
}
// All other insert/extracts cost this much.
- return 2;
+ return 3;
}
-unsigned AArch64TTIImpl::getArithmeticInstrCost(
+int AArch64TTIImpl::getArithmeticInstrCost(
unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info,
TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
TTI::OperandValueProperties Opd2PropInfo) {
// Legalize the type.
- std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
int ISD = TLI->InstructionOpcodeToISD(Opcode);
@@ -300,10 +332,9 @@ unsigned AArch64TTIImpl::getArithmeticInstrCost(
// normally expanded to the sequence ADD + CMP + SELECT + SRA.
  // The OperandValue properties may not be the same as that of the previous
// operation; conservatively assume OP_None.
- unsigned Cost =
- getArithmeticInstrCost(Instruction::Add, Ty, Opd1Info, Opd2Info,
- TargetTransformInfo::OP_None,
- TargetTransformInfo::OP_None);
+ int Cost = getArithmeticInstrCost(Instruction::Add, Ty, Opd1Info, Opd2Info,
+ TargetTransformInfo::OP_None,
+ TargetTransformInfo::OP_None);
Cost += getArithmeticInstrCost(Instruction::Sub, Ty, Opd1Info, Opd2Info,
TargetTransformInfo::OP_None,
TargetTransformInfo::OP_None);
@@ -331,7 +362,7 @@ unsigned AArch64TTIImpl::getArithmeticInstrCost(
}
}
-unsigned AArch64TTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) {
+int AArch64TTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) {
// Address computations in vectorized code with non-consecutive addresses will
// likely result in more instructions compared to scalar code where the
// computation can more often be merged into the index mode. The resulting
@@ -346,19 +377,20 @@ unsigned AArch64TTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) {
return 1;
}
-unsigned AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
- Type *CondTy) {
+int AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
+ Type *CondTy) {
int ISD = TLI->InstructionOpcodeToISD(Opcode);
- // We don't lower vector selects well that are wider than the register width.
+ // We don't lower some vector selects well that are wider than the register
+ // width.
if (ValTy->isVectorTy() && ISD == ISD::SELECT) {
// We would need this many instructions to hide the scalarization happening.
- const unsigned AmortizationCost = 20;
- static const TypeConversionCostTblEntry<MVT::SimpleValueType>
+ const int AmortizationCost = 20;
+ static const TypeConversionCostTblEntry
VectorSelectTbl[] = {
- { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 * AmortizationCost },
- { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 * AmortizationCost },
- { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 * AmortizationCost },
+ { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 },
+ { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 },
+ { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 },
{ ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost },
{ ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost },
{ ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost }
@@ -367,20 +399,18 @@ unsigned AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
EVT SelCondTy = TLI->getValueType(DL, CondTy);
EVT SelValTy = TLI->getValueType(DL, ValTy);
if (SelCondTy.isSimple() && SelValTy.isSimple()) {
- int Idx =
- ConvertCostTableLookup(VectorSelectTbl, ISD, SelCondTy.getSimpleVT(),
- SelValTy.getSimpleVT());
- if (Idx != -1)
- return VectorSelectTbl[Idx].Cost;
+ if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, ISD,
+ SelCondTy.getSimpleVT(),
+ SelValTy.getSimpleVT()))
+ return Entry->Cost;
}
}
return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy);
}
-unsigned AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
- unsigned Alignment,
- unsigned AddressSpace) {
- std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
+int AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
+ unsigned Alignment, unsigned AddressSpace) {
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
if (Opcode == Instruction::Store && Src->isVectorTy() && Alignment != 16 &&
Src->getVectorElementType()->isIntegerTy(64)) {
@@ -389,7 +419,7 @@ unsigned AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
// practice on inlined memcpy code.
// We make v2i64 stores expensive so that we will only vectorize if there
// are 6 other instructions getting vectorized.
- unsigned AmortizationCost = 6;
+ int AmortizationCost = 6;
return LT.first * 2 * AmortizationCost;
}
@@ -407,16 +437,18 @@ unsigned AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
return LT.first;
}
-unsigned AArch64TTIImpl::getInterleavedMemoryOpCost(
- unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
- unsigned Alignment, unsigned AddressSpace) {
+int AArch64TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
+ unsigned Factor,
+ ArrayRef<unsigned> Indices,
+ unsigned Alignment,
+ unsigned AddressSpace) {
assert(Factor >= 2 && "Invalid interleave factor");
assert(isa<VectorType>(VecTy) && "Expect a vector type");
if (Factor <= TLI->getMaxSupportedInterleaveFactor()) {
unsigned NumElts = VecTy->getVectorNumElements();
Type *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor);
- unsigned SubVecSize = DL.getTypeAllocSize(SubVecTy);
+ unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy);
// ldN/stN only support legal vector types of size 64 or 128 in bits.
if (NumElts % Factor == 0 && (SubVecSize == 64 || SubVecSize == 128))
@@ -427,8 +459,8 @@ unsigned AArch64TTIImpl::getInterleavedMemoryOpCost(
Alignment, AddressSpace);
}
-unsigned AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) {
- unsigned Cost = 0;
+int AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) {
+ int Cost = 0;
for (auto *I : Tys) {
if (!I->isVectorTy())
continue;
@@ -506,7 +538,7 @@ bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
case Intrinsic::aarch64_neon_ld4:
Info.ReadMem = true;
Info.WriteMem = false;
- Info.Vol = false;
+ Info.IsSimple = true;
Info.NumMemRefs = 1;
Info.PtrVal = Inst->getArgOperand(0);
break;
@@ -515,7 +547,7 @@ bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
case Intrinsic::aarch64_neon_st4:
Info.ReadMem = false;
Info.WriteMem = true;
- Info.Vol = false;
+ Info.IsSimple = true;
Info.NumMemRefs = 1;
Info.PtrVal = Inst->getArgOperand(Inst->getNumArgOperands() - 1);
break;
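
A side note on the cost-table hunks above: ConvertCostTableLookup now hands back a pointer to the matching entry instead of an index into the table. A minimal standalone sketch of that lookup shape follows; the struct and helper names are illustrative stand-ins, not the LLVM API.

  #include <cstddef>
  #include <cstdio>

  // Simplified stand-in for TypeConversionCostTblEntry.
  struct ConvEntry {
    int ISD;      // conversion opcode (e.g. SIGN_EXTEND)
    int Dst, Src; // destination and source type ids
    int Cost;
  };

  // Return a pointer to the first matching entry, or nullptr -- the same
  // shape as "if (const auto *Entry = ...) return Entry->Cost;".
  template <std::size_t N>
  const ConvEntry *lookupConvCost(const ConvEntry (&Tbl)[N], int ISD, int Dst,
                                  int Src) {
    for (const ConvEntry &E : Tbl)
      if (E.ISD == ISD && E.Dst == Dst && E.Src == Src)
        return &E;
    return nullptr;
  }

  int main() {
    static const ConvEntry Tbl[] = {{1, 10, 20, 3}, {2, 10, 20, 6}};
    if (const auto *Entry = lookupConvCost(Tbl, 2, 10, 20))
      std::printf("cost = %d\n", Entry->Cost); // prints 6
    return 0;
  }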
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 444d3ccc15e1..ec58c4fe309f 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -48,7 +48,7 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
};
public:
- explicit AArch64TTIImpl(const AArch64TargetMachine *TM, Function &F)
+ explicit AArch64TTIImpl(const AArch64TargetMachine *TM, const Function &F)
: BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)),
TLI(ST->getTargetLowering()) {}
@@ -63,12 +63,11 @@ public:
/// @{
using BaseT::getIntImmCost;
- unsigned getIntImmCost(int64_t Val);
- unsigned getIntImmCost(const APInt &Imm, Type *Ty);
- unsigned getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
- Type *Ty);
- unsigned getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
- Type *Ty);
+ int getIntImmCost(int64_t Val);
+ int getIntImmCost(const APInt &Imm, Type *Ty);
+ int getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty);
+ int getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
+ Type *Ty);
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth);
/// @}
@@ -76,6 +75,8 @@ public:
/// \name Vector TTI Implementations
/// @{
+ bool enableInterleavedAccessVectorization() { return true; }
+
unsigned getNumberOfRegisters(bool Vector) {
if (Vector) {
if (ST->hasNEON())
@@ -96,25 +97,25 @@ public:
unsigned getMaxInterleaveFactor(unsigned VF);
- unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src);
+ int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src);
- unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
+ int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
- unsigned getArithmeticInstrCost(
+ int getArithmeticInstrCost(
unsigned Opcode, Type *Ty,
TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None);
- unsigned getAddressComputationCost(Type *Ty, bool IsComplex);
+ int getAddressComputationCost(Type *Ty, bool IsComplex);
- unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy);
+ int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy);
- unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
- unsigned AddressSpace);
+ int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
+ unsigned AddressSpace);
- unsigned getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys);
+ int getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys);
void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP);
@@ -123,11 +124,9 @@ public:
bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info);
- unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
- unsigned Factor,
- ArrayRef<unsigned> Indices,
- unsigned Alignment,
- unsigned AddressSpace);
+ int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor,
+ ArrayRef<unsigned> Indices, unsigned Alignment,
+ unsigned AddressSpace);
/// @}
};
diff --git a/contrib/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/contrib/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
index 38e8b4d9a938..394c8e78581f 100644
--- a/contrib/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
@@ -43,7 +43,6 @@ class AArch64Operand;
class AArch64AsmParser : public MCTargetAsmParser {
private:
StringRef Mnemonic; ///< Instruction mnemonic.
- MCSubtargetInfo &STI;
  // Map of register aliases registered via the .req directive.
StringMap<std::pair<bool, unsigned> > RegisterReqs;
@@ -101,6 +100,7 @@ private:
OperandMatchResultTy tryParseSysReg(OperandVector &Operands);
OperandMatchResultTy tryParseSysCROperand(OperandVector &Operands);
OperandMatchResultTy tryParsePrefetch(OperandVector &Operands);
+ OperandMatchResultTy tryParsePSBHint(OperandVector &Operands);
OperandMatchResultTy tryParseAdrpLabel(OperandVector &Operands);
OperandMatchResultTy tryParseAdrLabel(OperandVector &Operands);
OperandMatchResultTy tryParseFPImm(OperandVector &Operands);
@@ -115,16 +115,16 @@ public:
#define GET_OPERAND_DIAGNOSTIC_TYPES
#include "AArch64GenAsmMatcher.inc"
};
- AArch64AsmParser(MCSubtargetInfo &STI, MCAsmParser &Parser,
+ AArch64AsmParser(const MCSubtargetInfo &STI, MCAsmParser &Parser,
const MCInstrInfo &MII, const MCTargetOptions &Options)
- : MCTargetAsmParser(), STI(STI) {
+ : MCTargetAsmParser(Options, STI) {
MCAsmParserExtension::Initialize(Parser);
MCStreamer &S = getParser().getStreamer();
if (S.getTargetStreamer() == nullptr)
new AArch64TargetStreamer(S);
// Initialize the set of available features.
- setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
+ setAvailableFeatures(ComputeAvailableFeatures(getSTI().getFeatureBits()));
}
bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
@@ -160,7 +160,8 @@ private:
k_Prefetch,
k_ShiftExtend,
k_FPImm,
- k_Barrier
+ k_Barrier,
+ k_PSBHint,
} Kind;
SMLoc StartLoc, EndLoc;
@@ -228,6 +229,12 @@ private:
unsigned Length;
};
+ struct PSBHintOp {
+ unsigned Val;
+ const char *Data;
+ unsigned Length;
+ };
+
struct ShiftExtendOp {
AArch64_AM::ShiftExtendType Type;
unsigned Amount;
@@ -251,6 +258,7 @@ private:
struct SysRegOp SysReg;
struct SysCRImmOp SysCRImm;
struct PrefetchOp Prefetch;
+ struct PSBHintOp PSBHint;
struct ShiftExtendOp ShiftExtend;
};
@@ -302,6 +310,9 @@ public:
case k_Prefetch:
Prefetch = o.Prefetch;
break;
+ case k_PSBHint:
+ PSBHint = o.PSBHint;
+ break;
case k_ShiftExtend:
ShiftExtend = o.ShiftExtend;
break;
@@ -393,6 +404,16 @@ public:
return Prefetch.Val;
}
+ unsigned getPSBHint() const {
+ assert(Kind == k_PSBHint && "Invalid access!");
+ return PSBHint.Val;
+ }
+
+ StringRef getPSBHintName() const {
+ assert(Kind == k_PSBHint && "Invalid access!");
+ return StringRef(PSBHint.Data, PSBHint.Length);
+ }
+
StringRef getPrefetchName() const {
assert(Kind == k_Prefetch && "Invalid access!");
return StringRef(Prefetch.Data, Prefetch.Length);
@@ -497,6 +518,15 @@ public:
return (Val % Scale) == 0 && Val >= 0 && (Val / Scale) < 0x1000;
}
+ bool isImm0_1() const {
+ if (!isImm())
+ return false;
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+ if (!MCE)
+ return false;
+ int64_t Val = MCE->getValue();
+ return (Val >= 0 && Val < 2);
+ }
bool isImm0_7() const {
if (!isImm())
return false;
@@ -876,12 +906,15 @@ public:
}
bool isMSRSystemRegister() const {
if (!isSysReg()) return false;
-
return SysReg.MSRReg != -1U;
}
- bool isSystemPStateField() const {
+ bool isSystemPStateFieldWithImm0_1() const {
if (!isSysReg()) return false;
-
+ return (SysReg.PStateField == AArch64PState::PAN ||
+ SysReg.PStateField == AArch64PState::UAO);
+ }
+ bool isSystemPStateFieldWithImm0_15() const {
+ if (!isSysReg() || isSystemPStateFieldWithImm0_1()) return false;
return SysReg.PStateField != -1U;
}
bool isReg() const override { return Kind == k_Register && !Reg.isVector; }
@@ -950,6 +983,7 @@ public:
}
bool isSysCR() const { return Kind == k_SysCR; }
bool isPrefetch() const { return Kind == k_Prefetch; }
+ bool isPSBHint() const { return Kind == k_PSBHint; }
bool isShiftExtend() const { return Kind == k_ShiftExtend; }
bool isShifter() const {
if (!isShiftExtend())
@@ -1175,8 +1209,10 @@ public:
template <unsigned NumRegs>
void addVectorList64Operands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
- static unsigned FirstRegs[] = { AArch64::D0, AArch64::D0_D1,
- AArch64::D0_D1_D2, AArch64::D0_D1_D2_D3 };
+ static const unsigned FirstRegs[] = { AArch64::D0,
+ AArch64::D0_D1,
+ AArch64::D0_D1_D2,
+ AArch64::D0_D1_D2_D3 };
unsigned FirstReg = FirstRegs[NumRegs - 1];
Inst.addOperand(
@@ -1186,8 +1222,10 @@ public:
template <unsigned NumRegs>
void addVectorList128Operands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
- static unsigned FirstRegs[] = { AArch64::Q0, AArch64::Q0_Q1,
- AArch64::Q0_Q1_Q2, AArch64::Q0_Q1_Q2_Q3 };
+ static const unsigned FirstRegs[] = { AArch64::Q0,
+ AArch64::Q0_Q1,
+ AArch64::Q0_Q1_Q2,
+ AArch64::Q0_Q1_Q2_Q3 };
unsigned FirstReg = FirstRegs[NumRegs - 1];
Inst.addOperand(
@@ -1304,6 +1342,12 @@ public:
Inst.addOperand(MCOperand::createImm(MCE->getValue() / 16));
}
+ void addImm0_1Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
+ Inst.addOperand(MCOperand::createImm(MCE->getValue()));
+ }
+
void addImm0_7Operands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
@@ -1491,7 +1535,13 @@ public:
Inst.addOperand(MCOperand::createImm(SysReg.MSRReg));
}
- void addSystemPStateFieldOperands(MCInst &Inst, unsigned N) const {
+ void addSystemPStateFieldWithImm0_1Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+
+ Inst.addOperand(MCOperand::createImm(SysReg.PStateField));
+ }
+
+ void addSystemPStateFieldWithImm0_15Operands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
Inst.addOperand(MCOperand::createImm(SysReg.PStateField));
@@ -1507,6 +1557,11 @@ public:
Inst.addOperand(MCOperand::createImm(getPrefetch()));
}
+ void addPSBHintOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::createImm(getPSBHint()));
+ }
+
void addShifterOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
unsigned Imm =
@@ -1703,6 +1758,19 @@ public:
return Op;
}
+ static std::unique_ptr<AArch64Operand> CreatePSBHint(unsigned Val,
+ StringRef Str,
+ SMLoc S,
+ MCContext &Ctx) {
+ auto Op = make_unique<AArch64Operand>(k_PSBHint, Ctx);
+ Op->PSBHint.Val = Val;
+ Op->PSBHint.Data = Str.data();
+ Op->PSBHint.Length = Str.size();
+ Op->StartLoc = S;
+ Op->EndLoc = S;
+ return Op;
+ }
+
static std::unique_ptr<AArch64Operand>
CreateShiftExtend(AArch64_AM::ShiftExtendType ShOp, unsigned Val,
bool HasExplicitAmount, SMLoc S, SMLoc E, MCContext &Ctx) {
@@ -1776,6 +1844,10 @@ void AArch64Operand::print(raw_ostream &OS) const {
OS << "<prfop invalid #" << getPrefetch() << ">";
break;
}
+ case k_PSBHint: {
+ OS << getPSBHintName();
+ break;
+ }
case k_ShiftExtend: {
OS << "<" << AArch64_AM::getShiftExtendName(getShiftExtendType()) << " #"
<< getShiftExtendAmount();
@@ -1849,6 +1921,8 @@ static bool isValidVectorKind(StringRef Name) {
.Case(".h", true)
.Case(".s", true)
.Case(".d", true)
+ // Needed for fp16 scalar pairwise reductions
+ .Case(".2h", true)
.Default(false);
}
@@ -2016,7 +2090,7 @@ AArch64AsmParser::tryParsePrefetch(OperandVector &Operands) {
bool Valid;
auto Mapper = AArch64PRFM::PRFMMapper();
StringRef Name =
- Mapper.toString(MCE->getValue(), STI.getFeatureBits(), Valid);
+ Mapper.toString(MCE->getValue(), getSTI().getFeatureBits(), Valid);
Operands.push_back(AArch64Operand::CreatePrefetch(prfop, Name,
S, getContext()));
return MatchOperand_Success;
@@ -2030,7 +2104,7 @@ AArch64AsmParser::tryParsePrefetch(OperandVector &Operands) {
bool Valid;
auto Mapper = AArch64PRFM::PRFMMapper();
unsigned prfop =
- Mapper.fromString(Tok.getString(), STI.getFeatureBits(), Valid);
+ Mapper.fromString(Tok.getString(), getSTI().getFeatureBits(), Valid);
if (!Valid) {
TokError("pre-fetch hint expected");
return MatchOperand_ParseFail;
@@ -2042,6 +2116,32 @@ AArch64AsmParser::tryParsePrefetch(OperandVector &Operands) {
return MatchOperand_Success;
}
+/// tryParsePSBHint - Try to parse a PSB hint operand, mapped to the HINT instruction.
+AArch64AsmParser::OperandMatchResultTy
+AArch64AsmParser::tryParsePSBHint(OperandVector &Operands) {
+ MCAsmParser &Parser = getParser();
+ SMLoc S = getLoc();
+ const AsmToken &Tok = Parser.getTok();
+ if (Tok.isNot(AsmToken::Identifier)) {
+ TokError("invalid operand for instruction");
+ return MatchOperand_ParseFail;
+ }
+
+ bool Valid;
+ auto Mapper = AArch64PSBHint::PSBHintMapper();
+ unsigned psbhint =
+ Mapper.fromString(Tok.getString(), getSTI().getFeatureBits(), Valid);
+ if (!Valid) {
+ TokError("invalid operand for instruction");
+ return MatchOperand_ParseFail;
+ }
+
+ Parser.Lex(); // Eat identifier token.
+ Operands.push_back(AArch64Operand::CreatePSBHint(psbhint, Tok.getString(),
+ S, getContext()));
+ return MatchOperand_Success;
+}
+
/// tryParseAdrpLabel - Parse and validate a source label for the ADRP
/// instruction.
AArch64AsmParser::OperandMatchResultTy
@@ -2439,6 +2539,13 @@ bool AArch64AsmParser::parseSysAlias(StringRef Name, SMLoc NameLoc,
} else if (!Op.compare_lower("cisw")) {
// SYS #0, C7, C14, #2
SYS_ALIAS(0, 7, 14, 2);
+ } else if (!Op.compare_lower("cvap")) {
+ if (getSTI().getFeatureBits()[AArch64::HasV8_2aOps]) {
+ // SYS #3, C7, C12, #1
+ SYS_ALIAS(3, 7, 12, 1);
+ } else {
+ return TokError("DC CVAP requires ARMv8.2a");
+ }
} else {
return TokError("invalid operand for DC instruction");
}
@@ -2479,6 +2586,20 @@ bool AArch64AsmParser::parseSysAlias(StringRef Name, SMLoc NameLoc,
} else if (!Op.compare_lower("s12e0w")) {
// SYS #4, C7, C8, #7
SYS_ALIAS(4, 7, 8, 7);
+ } else if (!Op.compare_lower("s1e1rp")) {
+ if (getSTI().getFeatureBits()[AArch64::HasV8_2aOps]) {
+ // SYS #0, C7, C9, #0
+ SYS_ALIAS(0, 7, 9, 0);
+ } else {
+ return TokError("AT S1E1RP requires ARMv8.2a");
+ }
+ } else if (!Op.compare_lower("s1e1wp")) {
+ if (getSTI().getFeatureBits()[AArch64::HasV8_2aOps]) {
+ // SYS #0, C7, C9, #1
+ SYS_ALIAS(0, 7, 9, 1);
+ } else {
+ return TokError("AT S1E1WP requires ARMv8.2a");
+ }
} else {
return TokError("invalid operand for AT instruction");
}
@@ -2644,7 +2765,7 @@ AArch64AsmParser::tryParseBarrierOperand(OperandVector &Operands) {
bool Valid;
auto Mapper = AArch64DB::DBarrierMapper();
StringRef Name =
- Mapper.toString(MCE->getValue(), STI.getFeatureBits(), Valid);
+ Mapper.toString(MCE->getValue(), getSTI().getFeatureBits(), Valid);
Operands.push_back( AArch64Operand::CreateBarrier(MCE->getValue(), Name,
ExprLoc, getContext()));
return MatchOperand_Success;
@@ -2658,7 +2779,7 @@ AArch64AsmParser::tryParseBarrierOperand(OperandVector &Operands) {
bool Valid;
auto Mapper = AArch64DB::DBarrierMapper();
unsigned Opt =
- Mapper.fromString(Tok.getString(), STI.getFeatureBits(), Valid);
+ Mapper.fromString(Tok.getString(), getSTI().getFeatureBits(), Valid);
if (!Valid) {
TokError("invalid barrier option name");
return MatchOperand_ParseFail;
@@ -2687,20 +2808,21 @@ AArch64AsmParser::tryParseSysReg(OperandVector &Operands) {
bool IsKnown;
auto MRSMapper = AArch64SysReg::MRSMapper();
- uint32_t MRSReg = MRSMapper.fromString(Tok.getString(), STI.getFeatureBits(),
- IsKnown);
+ uint32_t MRSReg = MRSMapper.fromString(Tok.getString(),
+ getSTI().getFeatureBits(), IsKnown);
assert(IsKnown == (MRSReg != -1U) &&
"register should be -1 if and only if it's unknown");
auto MSRMapper = AArch64SysReg::MSRMapper();
- uint32_t MSRReg = MSRMapper.fromString(Tok.getString(), STI.getFeatureBits(),
- IsKnown);
+ uint32_t MSRReg = MSRMapper.fromString(Tok.getString(),
+ getSTI().getFeatureBits(), IsKnown);
assert(IsKnown == (MSRReg != -1U) &&
"register should be -1 if and only if it's unknown");
auto PStateMapper = AArch64PState::PStateMapper();
uint32_t PStateField =
- PStateMapper.fromString(Tok.getString(), STI.getFeatureBits(), IsKnown);
+ PStateMapper.fromString(Tok.getString(),
+ getSTI().getFeatureBits(), IsKnown);
assert(IsKnown == (PStateField != -1U) &&
"register should be -1 if and only if it's unknown");
@@ -3151,7 +3273,7 @@ bool AArch64AsmParser::parseOperand(OperandVector &Operands, bool isCondCode,
if (Operands.size() < 2 ||
!static_cast<AArch64Operand &>(*Operands[1]).isReg())
- return true;
+ return Error(Loc, "Only valid when first operand is register");
bool IsXReg =
AArch64MCRegisterClasses[AArch64::GPR64allRegClassID].contains(
@@ -3183,7 +3305,7 @@ bool AArch64AsmParser::parseOperand(OperandVector &Operands, bool isCondCode,
}
// If it is a label or an imm that cannot fit in a movz, put it into CP.
const MCExpr *CPLoc =
- getTargetStreamer().addConstantPoolEntry(SubExprVal, IsXReg ? 8 : 4);
+ getTargetStreamer().addConstantPoolEntry(SubExprVal, IsXReg ? 8 : 4, Loc);
Operands.push_back(AArch64Operand::CreateImm(CPLoc, S, E, Ctx));
return false;
}
@@ -3601,6 +3723,8 @@ bool AArch64AsmParser::showMatchError(SMLoc Loc, unsigned ErrCode) {
return Error(Loc, "index must be a multiple of 8 in range [0, 32760].");
case Match_InvalidMemoryIndexed16:
return Error(Loc, "index must be a multiple of 16 in range [0, 65520].");
+ case Match_InvalidImm0_1:
+ return Error(Loc, "immediate must be an integer in range [0, 1].");
case Match_InvalidImm0_7:
return Error(Loc, "immediate must be an integer in range [0, 7].");
case Match_InvalidImm0_15:
@@ -3912,7 +4036,7 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
AArch64Operand &ImmOp = static_cast<AArch64Operand &>(*Operands[2]);
if (RegOp.isReg() && ImmOp.isFPImm() && ImmOp.getFPImm() == (unsigned)-1) {
unsigned zreg =
- AArch64MCRegisterClasses[AArch64::FPR32RegClassID].contains(
+ !AArch64MCRegisterClasses[AArch64::FPR64RegClassID].contains(
RegOp.getReg())
? AArch64::WZR
: AArch64::XZR;
@@ -3929,10 +4053,27 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
// If that fails, try against the alternate table containing long-form NEON:
// "fadd v0.2s, v1.2s, v2.2s"
- if (MatchResult != Match_Success)
+ if (MatchResult != Match_Success) {
+ // But first, save the short-form match result: we can use it in case the
+ // long-form match also fails.
+ auto ShortFormNEONErrorInfo = ErrorInfo;
+ auto ShortFormNEONMatchResult = MatchResult;
+
MatchResult =
MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm, 0);
+ // If both matches failed and the long-form match failed on the mnemonic
+ // suffix token operand, the short-form match failure is probably more
+ // relevant: use it instead.
+ if (MatchResult == Match_InvalidOperand && ErrorInfo == 1 &&
+ Operands.size() > 1 && ((AArch64Operand &)*Operands[1]).isToken() &&
+ ((AArch64Operand &)*Operands[1]).isTokenSuffix()) {
+ MatchResult = ShortFormNEONMatchResult;
+ ErrorInfo = ShortFormNEONErrorInfo;
+ }
+ }
+
+
switch (MatchResult) {
case Match_Success: {
// Perform range checking and other semantic validations
@@ -3944,7 +4085,7 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
return true;
Inst.setLoc(IDLoc);
- Out.EmitInstruction(Inst, STI);
+ Out.EmitInstruction(Inst, getSTI());
return false;
}
case Match_MissingFeature: {
@@ -3966,6 +4107,7 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
return showMatchError(IDLoc, MatchResult);
case Match_InvalidOperand: {
SMLoc ErrorLoc = IDLoc;
+
if (ErrorInfo != ~0ULL) {
if (ErrorInfo >= Operands.size())
return Error(IDLoc, "too few operands for instruction");
@@ -4011,6 +4153,7 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
case Match_InvalidMemoryIndexed8SImm7:
case Match_InvalidMemoryIndexed16SImm7:
case Match_InvalidMemoryIndexedSImm9:
+ case Match_InvalidImm0_1:
case Match_InvalidImm0_7:
case Match_InvalidImm0_15:
case Match_InvalidImm0_31:
@@ -4083,7 +4226,7 @@ bool AArch64AsmParser::parseDirectiveWord(unsigned Size, SMLoc L) {
if (getParser().parseExpression(Value))
return true;
- getParser().getStreamer().EmitValue(Value, Size);
+ getParser().getStreamer().EmitValue(Value, Size, L);
if (getLexer().is(AsmToken::EndOfStatement))
break;
@@ -4155,7 +4298,7 @@ bool AArch64AsmParser::parseDirectiveTLSDescCall(SMLoc L) {
Inst.setOpcode(AArch64::TLSDESCCALL);
Inst.addOperand(MCOperand::createExpr(Expr));
- getParser().getStreamer().EmitInstruction(Inst, STI);
+ getParser().getStreamer().EmitInstruction(Inst, getSTI());
return false;
}
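
The MatchAndEmitInstruction hunk above keeps the short-form NEON diagnostic when the long-form table only fails on the mnemonic suffix token. A simplified, self-contained sketch of that fallback, with stub matchers standing in for the generated tables (the stubs and the SuffixToken flag are assumptions for the sketch):

  #include <cstdint>
  #include <cstdio>

  enum MatchResult { Match_Success, Match_InvalidOperand };

  // Hypothetical stubs: the short form rejects operand 2, the long form
  // rejects the suffix token at operand index 1.
  static MatchResult matchShortForm(uint64_t &ErrorInfo) {
    ErrorInfo = 2;
    return Match_InvalidOperand;
  }
  static MatchResult matchLongFormNEON(uint64_t &ErrorInfo) {
    ErrorInfo = 1;
    return Match_InvalidOperand;
  }

  int main() {
    uint64_t ErrorInfo = 0;
    MatchResult Result = matchShortForm(ErrorInfo);
    if (Result != Match_Success) {
      // Save the short-form diagnostic before trying the long-form table.
      uint64_t ShortErrorInfo = ErrorInfo;
      MatchResult ShortResult = Result;
      bool OperandOneIsSuffixToken = true; // assumed for the sketch

      Result = matchLongFormNEON(ErrorInfo);
      if (Result == Match_InvalidOperand && ErrorInfo == 1 &&
          OperandOneIsSuffixToken) {
        Result = ShortResult;        // prefer the short-form failure
        ErrorInfo = ShortErrorInfo;
      }
    }
    std::printf("diagnose operand %llu\n",
                (unsigned long long)ErrorInfo); // prints 2
    return 0;
  }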
diff --git a/contrib/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp b/contrib/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
index db9fb0e775df..f1f968e73123 100644
--- a/contrib/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
+++ b/contrib/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
@@ -1516,6 +1516,10 @@ static DecodeStatus DecodeSystemPStateInstruction(llvm::MCInst &Inst,
uint64_t pstate_field = (op1 << 3) | op2;
+ if ((pstate_field == AArch64PState::PAN ||
+ pstate_field == AArch64PState::UAO) && crm > 1)
+ return Fail;
+
Inst.addOperand(MCOperand::createImm(pstate_field));
Inst.addOperand(MCOperand::createImm(crm));
diff --git a/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp b/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp
index 7f56c2cf6bb8..d8a810824370 100644
--- a/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp
+++ b/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp
@@ -19,6 +19,7 @@
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
@@ -55,7 +56,7 @@ void AArch64InstPrinter::printInst(const MCInst *MI, raw_ostream &O,
unsigned Opcode = MI->getOpcode();
if (Opcode == AArch64::SYSxt)
- if (printSysAlias(MI, O)) {
+ if (printSysAlias(MI, STI, O)) {
printAnnotation(O, Annot);
return;
}
@@ -269,7 +270,7 @@ struct LdStNInstrDesc {
int NaturalOffset;
};
-static LdStNInstrDesc LdStNInstInfo[] = {
+static const LdStNInstrDesc LdStNInstInfo[] = {
{ AArch64::LD1i8, "ld1", ".b", 1, true, 0 },
{ AArch64::LD1i16, "ld1", ".h", 1, true, 0 },
{ AArch64::LD1i32, "ld1", ".s", 1, true, 0 },
@@ -612,7 +613,7 @@ static LdStNInstrDesc LdStNInstInfo[] = {
{ AArch64::ST4Fourv2s_POST, "st4", ".2s", 1, false, 32 },
};
-static LdStNInstrDesc *getLdStNInstrDesc(unsigned Opcode) {
+static const LdStNInstrDesc *getLdStNInstrDesc(unsigned Opcode) {
unsigned Idx;
for (Idx = 0; Idx != array_lengthof(LdStNInstInfo); ++Idx)
if (LdStNInstInfo[Idx].Opcode == Opcode)
@@ -641,7 +642,7 @@ void AArch64AppleInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
return;
}
- if (LdStNInstrDesc *LdStDesc = getLdStNInstrDesc(Opcode)) {
+ if (const LdStNInstrDesc *LdStDesc = getLdStNInstrDesc(Opcode)) {
O << "\t" << LdStDesc->Mnemonic << LdStDesc->Layout << '\t';
// Now onto the operands: first a vector list with possible lane
@@ -674,7 +675,9 @@ void AArch64AppleInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
AArch64InstPrinter::printInst(MI, O, Annot, STI);
}
-bool AArch64InstPrinter::printSysAlias(const MCInst *MI, raw_ostream &O) {
+bool AArch64InstPrinter::printSysAlias(const MCInst *MI,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
#ifndef NDEBUG
unsigned Opcode = MI->getOpcode();
assert(Opcode == AArch64::SYSxt && "Invalid opcode for SYS alias!");
@@ -729,6 +732,11 @@ bool AArch64InstPrinter::printSysAlias(const MCInst *MI, raw_ostream &O) {
if (Op1Val == 3 && Op2Val == 1)
Asm = "dc\tcvau";
break;
+ case 12:
+ if (Op1Val == 3 && Op2Val == 1 &&
+ (STI.getFeatureBits()[AArch64::HasV8_2aOps]))
+ Asm = "dc\tcvap";
+ break;
case 14:
if (Op1Val == 3 && Op2Val == 1)
Asm = "dc\tcivac";
@@ -773,6 +781,21 @@ bool AArch64InstPrinter::printSysAlias(const MCInst *MI, raw_ostream &O) {
break;
}
break;
+ case 9:
+ switch (Op1Val) {
+ default:
+ break;
+ case 0:
+ if (STI.getFeatureBits()[AArch64::HasV8_2aOps]) {
+ switch (Op2Val) {
+ default:
+ break;
+ case 0: Asm = "at\ts1e1rp"; break;
+ case 1: Asm = "at\ts1e1wp"; break;
+ }
+ }
+ break;
+ }
}
} else if (CnVal == 8) {
// TLBI aliases
@@ -1122,6 +1145,19 @@ void AArch64InstPrinter::printPrefetchOp(const MCInst *MI, unsigned OpNum,
O << '#' << prfop;
}
+void AArch64InstPrinter::printPSBHintOp(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ unsigned psbhintop = MI->getOperand(OpNum).getImm();
+ bool Valid;
+ StringRef Name =
+ AArch64PSBHint::PSBHintMapper().toString(psbhintop, STI.getFeatureBits(), Valid);
+ if (Valid)
+ O << Name;
+ else
+ O << '#' << psbhintop;
+}
+
void AArch64InstPrinter::printFPImmOperand(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI,
raw_ostream &O) {
diff --git a/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h b/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h
index 15dee978e229..ea68d9848b42 100644
--- a/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h
+++ b/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h
@@ -15,14 +15,10 @@
#define LLVM_LIB_TARGET_AARCH64_INSTPRINTER_AARCH64INSTPRINTER_H
#include "MCTargetDesc/AArch64MCTargetDesc.h"
-#include "llvm/ADT/StringRef.h"
#include "llvm/MC/MCInstPrinter.h"
-#include "llvm/MC/MCSubtargetInfo.h"
namespace llvm {
-class MCOperand;
-
class AArch64InstPrinter : public MCInstPrinter {
public:
AArch64InstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
@@ -48,7 +44,8 @@ public:
unsigned AltIdx = AArch64::NoRegAltName);
protected:
- bool printSysAlias(const MCInst *MI, raw_ostream &O);
+ bool printSysAlias(const MCInst *MI, const MCSubtargetInfo &STI,
+ raw_ostream &O);
// Operand printers
void printOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);
@@ -122,6 +119,9 @@ protected:
void printPrefetchOp(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI, raw_ostream &O);
+ void printPSBHintOp(const MCInst *MI, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+
void printFPImmOperand(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI, raw_ostream &O);
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h
index ed24343a6f2a..648b1dfc8c5e 100644
--- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h
@@ -364,6 +364,32 @@ static inline float getFPImmFloat(unsigned Imm) {
return FPUnion.F;
}
+/// getFP16Imm - Return an 8-bit floating-point version of the 16-bit
+/// floating-point value. If the value cannot be represented as an 8-bit
+/// floating-point value, then return -1.
+static inline int getFP16Imm(const APInt &Imm) {
+ uint32_t Sign = Imm.lshr(15).getZExtValue() & 1;
+ int32_t Exp = (Imm.lshr(10).getSExtValue() & 0x1f) - 15; // -14 to 15
+ int32_t Mantissa = Imm.getZExtValue() & 0x3ff; // 10 bits
+
+ // We can handle 4 bits of mantissa.
+ // mantissa = (16+UInt(e:f:g:h))/16.
+ if (Mantissa & 0x3f)
+ return -1;
+ Mantissa >>= 6;
+
+ // We can handle 3 bits of exponent: exp == UInt(NOT(b):c:d)-3
+ if (Exp < -3 || Exp > 4)
+ return -1;
+ Exp = ((Exp+3) & 0x7) ^ 4;
+
+ return ((int)Sign << 7) | (Exp << 4) | Mantissa;
+}
+
+static inline int getFP16Imm(const APFloat &FPImm) {
+ return getFP16Imm(FPImm.bitcastToAPInt());
+}
+
/// getFP32Imm - Return an 8-bit floating-point version of the 32-bit
/// floating-point value. If the value cannot be represented as an 8-bit
/// floating-point value, then return -1.
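
For reference, a standalone restatement of the getFP16Imm() logic added above, applied to a raw IEEE-754 binary16 bit pattern instead of an APInt/APFloat (1.0 is 0x3c00 and maps to imm8 0x70):

  #include <cassert>
  #include <cstdint>

  // Mirrors getFP16Imm(): returns the 8-bit FMOV immediate, or -1 if the
  // half-precision value is not representable.
  static int encodeFP16Imm8(uint16_t Bits) {
    uint32_t Sign = (Bits >> 15) & 1;
    int32_t Exp = (int32_t)((Bits >> 10) & 0x1f) - 15; // unbiased exponent
    int32_t Mantissa = Bits & 0x3ff;                   // 10 mantissa bits

    if (Mantissa & 0x3f)        // only 4 mantissa bits are representable
      return -1;
    Mantissa >>= 6;

    if (Exp < -3 || Exp > 4)    // only 3 exponent bits are representable
      return -1;
    Exp = ((Exp + 3) & 0x7) ^ 4;

    return ((int)Sign << 7) | (Exp << 4) | Mantissa;
  }

  int main() {
    assert(encodeFP16Imm8(0x3c00) == 0x70); // 1.0 encodes as imm8 0x70
    assert(encodeFP16Imm8(0x3c01) == -1);   // too many mantissa bits
    return 0;
  }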
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
index 16d53569b231..d26604f5765d 100644
--- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
@@ -128,10 +128,9 @@ public:
/// This is one of the functions used to emit data into an ELF section, so the
/// AArch64 streamer overrides it to add the appropriate mapping symbol ($d)
/// if necessary.
- void EmitValueImpl(const MCExpr *Value, unsigned Size,
- const SMLoc &Loc) override {
+ void EmitValueImpl(const MCExpr *Value, unsigned Size, SMLoc Loc) override {
EmitDataMappingSymbol();
- MCELFStreamer::EmitValueImpl(Value, Size);
+ MCELFStreamer::EmitValueImpl(Value, Size, Loc);
}
private:
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
index 921c4b94a729..fbce26e1d9a1 100644
--- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
@@ -48,10 +48,6 @@ AArch64MCAsmInfoDarwin::AArch64MCAsmInfoDarwin() {
UseDataRegionDirectives = true;
ExceptionsType = ExceptionHandling::DwarfCFI;
-
- // AArch64 Darwin doesn't have the baggage of X86/ARM, so it's fine to use
- // LShr instead of AShr.
- UseLogicalShr = true;
}
const MCExpr *AArch64MCAsmInfoDarwin::getExprForPersonalitySymbol(
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
index 28703419514a..a540f49866a9 100644
--- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
@@ -85,13 +85,13 @@ void AArch64MCExpr::visitUsedExpr(MCStreamer &Streamer) const {
Streamer.visitUsedExpr(*getSubExpr());
}
-MCSection *AArch64MCExpr::findAssociatedSection() const {
+MCFragment *AArch64MCExpr::findAssociatedFragment() const {
llvm_unreachable("FIXME: what goes here?");
}
bool AArch64MCExpr::evaluateAsRelocatableImpl(MCValue &Res,
- const MCAsmLayout *Layout,
- const MCFixup *Fixup) const {
+ const MCAsmLayout *Layout,
+ const MCFixup *Fixup) const {
if (!getSubExpr()->evaluateAsRelocatable(Res, Layout, Fixup))
return false;
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h
index 1165314e4105..db36a65564ce 100644
--- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h
@@ -149,11 +149,10 @@ public:
void visitUsedExpr(MCStreamer &Streamer) const override;
- MCSection *findAssociatedSection() const override;
+ MCFragment *findAssociatedFragment() const override;
- bool evaluateAsRelocatableImpl(MCValue &Res,
- const MCAsmLayout *Layout,
- const MCFixup *Fixup) const override;
+ bool evaluateAsRelocatableImpl(MCValue &Res, const MCAsmLayout *Layout,
+ const MCFixup *Fixup) const override;
void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const override;
@@ -162,7 +161,6 @@ public:
}
static bool classof(const AArch64MCExpr *) { return true; }
-
};
} // end namespace llvm
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp
index 741b273073e4..61c96f1d93c1 100644
--- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp
@@ -90,9 +90,11 @@ bool AArch64MachObjectWriter::getAArch64FixupKindMachOInfo(
Log2Size = llvm::Log2_32(4);
// This encompasses the relocation for the whole 21-bit value.
switch (Sym->getKind()) {
- default:
- Asm.getContext().reportFatalError(Fixup.getLoc(),
- "ADR/ADRP relocations must be GOT relative");
+ default: {
+ Asm.getContext().reportError(Fixup.getLoc(),
+ "ADR/ADRP relocations must be GOT relative");
+ return false;
+ }
case MCSymbolRefExpr::VK_PAGE:
RelocType = unsigned(MachO::ARM64_RELOC_PAGE21);
return true;
@@ -170,25 +172,25 @@ void AArch64MachObjectWriter::recordRelocation(
// assembler local symbols. If we got here, that's not what we have,
// so complain loudly.
if (Kind == AArch64::fixup_aarch64_pcrel_branch19) {
- Asm.getContext().reportFatalError(Fixup.getLoc(),
- "conditional branch requires assembler-local"
- " label. '" +
- Target.getSymA()->getSymbol().getName() +
- "' is external.");
+ Asm.getContext().reportError(Fixup.getLoc(),
+ "conditional branch requires assembler-local"
+ " label. '" +
+ Target.getSymA()->getSymbol().getName() +
+ "' is external.");
return;
}
// 14-bit branch relocations should only target internal labels, and so
// should never get here.
if (Kind == AArch64::fixup_aarch64_pcrel_branch14) {
- Asm.getContext().reportFatalError(Fixup.getLoc(),
- "Invalid relocation on conditional branch!");
+ Asm.getContext().reportError(Fixup.getLoc(),
+ "Invalid relocation on conditional branch!");
return;
}
if (!getAArch64FixupKindMachOInfo(Fixup, Type, Target.getSymA(), Log2Size,
- Asm)) {
- Asm.getContext().reportFatalError(Fixup.getLoc(), "unknown AArch64 fixup kind!");
+ Asm)) {
+ Asm.getContext().reportError(Fixup.getLoc(), "unknown AArch64 fixup kind!");
return;
}
@@ -200,8 +202,9 @@ void AArch64MachObjectWriter::recordRelocation(
Type = MachO::ARM64_RELOC_UNSIGNED;
if (IsPCRel) {
- Asm.getContext().reportFatalError(Fixup.getLoc(),
- "PC relative absolute relocation!");
+ Asm.getContext().reportError(Fixup.getLoc(),
+ "PC relative absolute relocation!");
+ return;
// FIXME: x86_64 sets the type to a branch reloc here. Should we do
// something similar?
@@ -229,16 +232,20 @@ void AArch64MachObjectWriter::recordRelocation(
Writer->addRelocation(A_Base, Fragment->getParent(), MRE);
return;
} else if (Target.getSymA()->getKind() != MCSymbolRefExpr::VK_None ||
- Target.getSymB()->getKind() != MCSymbolRefExpr::VK_None)
+ Target.getSymB()->getKind() != MCSymbolRefExpr::VK_None) {
// Otherwise, neither symbol can be modified.
- Asm.getContext().reportFatalError(Fixup.getLoc(),
- "unsupported relocation of modified symbol");
+ Asm.getContext().reportError(Fixup.getLoc(),
+ "unsupported relocation of modified symbol");
+ return;
+ }
// We don't support PCrel relocations of differences.
- if (IsPCRel)
- Asm.getContext().reportFatalError(Fixup.getLoc(),
- "unsupported pc-relative relocation of "
- "difference");
+ if (IsPCRel) {
+ Asm.getContext().reportError(Fixup.getLoc(),
+ "unsupported pc-relative relocation of "
+ "difference");
+ return;
+ }
// AArch64 always uses external relocations. If there is no symbol to use as
// a base address (a local symbol with no preceding non-local symbol),
@@ -246,20 +253,26 @@ void AArch64MachObjectWriter::recordRelocation(
//
// FIXME: We should probably just synthesize an external symbol and use
// that.
- if (!A_Base)
- Asm.getContext().reportFatalError(
+ if (!A_Base) {
+ Asm.getContext().reportError(
Fixup.getLoc(),
"unsupported relocation of local symbol '" + A->getName() +
"'. Must have non-local symbol earlier in section.");
- if (!B_Base)
- Asm.getContext().reportFatalError(
+ return;
+ }
+ if (!B_Base) {
+ Asm.getContext().reportError(
Fixup.getLoc(),
"unsupported relocation of local symbol '" + B->getName() +
"'. Must have non-local symbol earlier in section.");
+ return;
+ }
- if (A_Base == B_Base && A_Base)
- Asm.getContext().reportFatalError(Fixup.getLoc(),
- "unsupported relocation with identical base");
+ if (A_Base == B_Base && A_Base) {
+ Asm.getContext().reportError(
+ Fixup.getLoc(), "unsupported relocation with identical base");
+ return;
+ }
Value += (!A->getFragment() ? 0 : Writer->getSymbolAddress(*A, Layout)) -
(!A_Base || !A_Base->getFragment() ? 0 : Writer->getSymbolAddress(
@@ -309,10 +322,12 @@ void AArch64MachObjectWriter::recordRelocation(
// we need to preserve and merge with the new Target? How about
// the FixedValue?
if (!Symbol->getVariableValue()->evaluateAsRelocatable(Target, &Layout,
- &Fixup))
- Asm.getContext().reportFatalError(Fixup.getLoc(),
- "unable to resolve variable '" +
- Symbol->getName() + "'");
+ &Fixup)) {
+ Asm.getContext().reportError(Fixup.getLoc(),
+ "unable to resolve variable '" +
+ Symbol->getName() + "'");
+ return;
+ }
return recordRelocation(Writer, Asm, Layout, Fragment, Fixup, Target,
FixedValue);
}
@@ -337,11 +352,13 @@ void AArch64MachObjectWriter::recordRelocation(
Value +=
Layout.getSymbolOffset(*Symbol) - Layout.getSymbolOffset(*Base);
} else if (Symbol->isInSection()) {
- if (!CanUseLocalRelocation)
- Asm.getContext().reportFatalError(
+ if (!CanUseLocalRelocation) {
+ Asm.getContext().reportError(
Fixup.getLoc(),
"unsupported relocation of local symbol '" + Symbol->getName() +
"'. Must have non-local symbol earlier in section.");
+ return;
+ }
// Adjust the relocation to be section-relative.
// The index is the section ordinal (1-based).
const MCSection &Sec = Symbol->getSection();
@@ -361,9 +378,10 @@ void AArch64MachObjectWriter::recordRelocation(
return;
}
}
- Asm.getContext().reportFatalError(Fixup.getLoc(),
+ Asm.getContext().reportError(Fixup.getLoc(),
"unsupported relocation of variable '" +
Symbol->getName() + "'");
+ return;
}
}
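
The MachObjectWriter hunks above consistently replace reportFatalError with reportError followed by an early return, so assembly can continue and collect further diagnostics. A minimal sketch of that shape, with a toy context type standing in for MCContext:

  #include <cstdio>
  #include <string>
  #include <vector>

  // Toy stand-in: reportError records the message and lets the caller
  // keep going, unlike the old fatal-error path.
  struct Context {
    std::vector<std::string> Errors;
    void reportError(const std::string &Msg) { Errors.push_back(Msg); }
  };

  // Sketch of the "report and return" pattern the hunks adopt.
  void recordRelocation(Context &Ctx, bool IsPCRel, bool HasBase) {
    if (IsPCRel) {
      Ctx.reportError("PC relative absolute relocation!");
      return; // bail out of this relocation only
    }
    if (!HasBase) {
      Ctx.reportError("unsupported relocation of local symbol");
      return;
    }
    // ... emit the relocation ...
  }

  int main() {
    Context Ctx;
    recordRelocation(Ctx, /*IsPCRel=*/true, /*HasBase=*/true);
    recordRelocation(Ctx, /*IsPCRel=*/false, /*HasBase=*/false);
    std::printf("%zu diagnostics collected\n", Ctx.Errors.size()); // 2
    return 0;
  }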
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp
index 52b000d15b8d..3e86a42d5be6 100644
--- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp
@@ -26,8 +26,9 @@ AArch64TargetStreamer::~AArch64TargetStreamer() {}
// The constant pool handling is shared by all AArch64TargetStreamer
// implementations.
const MCExpr *AArch64TargetStreamer::addConstantPoolEntry(const MCExpr *Expr,
- unsigned Size) {
- return ConstantPools->addEntry(Streamer, Expr, Size);
+ unsigned Size,
+ SMLoc Loc) {
+ return ConstantPools->addEntry(Streamer, Expr, Size, Loc);
}
void AArch64TargetStreamer::emitCurrentConstantPool() {
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h
index fcc0d053f6e2..51432830f795 100644
--- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h
@@ -24,7 +24,7 @@ public:
/// Callback used to implement the ldr= pseudo.
/// Add a new entry to the constant pool for the current section and return an
/// MCExpr that can be used to refer to the constant pool location.
- const MCExpr *addConstantPoolEntry(const MCExpr *, unsigned Size);
+ const MCExpr *addConstantPoolEntry(const MCExpr *, unsigned Size, SMLoc Loc);
  /// Callback used to implement the .ltorg directive.
/// Emit contents of constant pool for the current section.
diff --git a/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp b/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp
index ee85b65bf39a..78f5289ec26d 100644
--- a/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp
+++ b/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp
@@ -146,11 +146,22 @@ const AArch64NamedImmMapper::Mapping AArch64PState::PStateMapper::PStateMappings
// v8.1a "Privileged Access Never" extension-specific PStates
{"pan", PAN, {AArch64::HasV8_1aOps}},
+
+ // v8.2a
+ {"uao", UAO, {AArch64::HasV8_2aOps}},
};
AArch64PState::PStateMapper::PStateMapper()
: AArch64NamedImmMapper(PStateMappings, 0) {}
+const AArch64NamedImmMapper::Mapping AArch64PSBHint::PSBHintMapper::PSBHintMappings[] = {
+ // v8.2a "Statistical Profiling" extension-specific PSB operand
+ {"csync", CSync, {AArch64::FeatureSPE}},
+};
+
+AArch64PSBHint::PSBHintMapper::PSBHintMapper()
+ : AArch64NamedImmMapper(PSBHintMappings, 0) {}
+
const AArch64NamedImmMapper::Mapping AArch64SysReg::MRSMapper::MRSMappings[] = {
{"mdccsr_el0", MDCCSR_EL0, {}},
{"dbgdtrrx_el0", DBGDTRRX_EL0, {}},
@@ -192,6 +203,7 @@ const AArch64NamedImmMapper::Mapping AArch64SysReg::MRSMapper::MRSMappings[] = {
{"id_aa64isar1_el1", ID_A64ISAR1_EL1, {}},
{"id_aa64mmfr0_el1", ID_A64MMFR0_EL1, {}},
{"id_aa64mmfr1_el1", ID_A64MMFR1_EL1, {}},
+ {"id_aa64mmfr2_el1", ID_A64MMFR2_EL1, {AArch64::HasV8_2aOps}},
{"mvfr0_el1", MVFR0_EL1, {}},
{"mvfr1_el1", MVFR1_EL1, {}},
{"mvfr2_el1", MVFR2_EL1, {}},
@@ -275,9 +287,6 @@ const AArch64NamedImmMapper::Mapping AArch64SysReg::MSRMapper::MSRMappings[] = {
{"icc_sgi1r_el1", ICC_SGI1R_EL1, {}},
{"icc_asgi1r_el1", ICC_ASGI1R_EL1, {}},
{"icc_sgi0r_el1", ICC_SGI0R_EL1, {}},
-
- // v8.1a "Privileged Access Never" extension-specific system registers
- {"pan", PAN, {AArch64::HasV8_1aOps}},
};
AArch64SysReg::MSRMapper::MSRMapper() {
@@ -804,6 +813,24 @@ const AArch64NamedImmMapper::Mapping AArch64SysReg::SysRegMapper::SysRegMappings
{"cntv_cval_el02", CNTV_CVAL_EL02, {AArch64::HasV8_1aOps}},
{"spsr_el12", SPSR_EL12, {AArch64::HasV8_1aOps}},
{"elr_el12", ELR_EL12, {AArch64::HasV8_1aOps}},
+
+ // v8.2a registers
+ {"uao", UAO, {AArch64::HasV8_2aOps}},
+
+ // v8.2a "Statistical Profiling extension" registers
+ {"pmblimitr_el1", PMBLIMITR_EL1, {AArch64::FeatureSPE}},
+ {"pmbptr_el1", PMBPTR_EL1, {AArch64::FeatureSPE}},
+ {"pmbsr_el1", PMBSR_EL1, {AArch64::FeatureSPE}},
+ {"pmbidr_el1", PMBIDR_EL1, {AArch64::FeatureSPE}},
+ {"pmscr_el2", PMSCR_EL2, {AArch64::FeatureSPE}},
+ {"pmscr_el12", PMSCR_EL12, {AArch64::FeatureSPE}},
+ {"pmscr_el1", PMSCR_EL1, {AArch64::FeatureSPE}},
+ {"pmsicr_el1", PMSICR_EL1, {AArch64::FeatureSPE}},
+ {"pmsirr_el1", PMSIRR_EL1, {AArch64::FeatureSPE}},
+ {"pmsfcr_el1", PMSFCR_EL1, {AArch64::FeatureSPE}},
+ {"pmsevfr_el1", PMSEVFR_EL1, {AArch64::FeatureSPE}},
+ {"pmslatfr_el1", PMSLATFR_EL1, {AArch64::FeatureSPE}},
+ {"pmsidr_el1", PMSIDR_EL1, {AArch64::FeatureSPE}},
};
uint32_t
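
A simplified sketch of the feature-gated named-immediate lookup that the new PSBHint mapping relies on ("csync" is only accepted when SPE is available). The types here are stand-ins for illustration, not the AArch64NamedImmMapper API:

  #include <cstdint>
  #include <cstdio>
  #include <cstring>

  struct Mapping {
    const char *Name;
    uint32_t Value;
    bool NeedsSPE;
  };

  static const Mapping PSBHintMappings[] = {
      {"csync", 0x11, true}, // psb csync = hint #0x11
  };

  // Returns the encoded value, or -1 when the name is unknown or the
  // required feature is missing.
  int fromString(const char *Name, bool HasSPE) {
    for (const Mapping &M : PSBHintMappings)
      if (std::strcmp(M.Name, Name) == 0 && (!M.NeedsSPE || HasSPE))
        return (int)M.Value;
    return -1;
  }

  int main() {
    std::printf("%d\n", fromString("csync", /*HasSPE=*/true));  // 17
    std::printf("%d\n", fromString("csync", /*HasSPE=*/false)); // -1
    return 0;
  }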
diff --git a/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h
index 7e42f8e3601e..f649cb9b8a8d 100644
--- a/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h
+++ b/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h
@@ -337,7 +337,9 @@ namespace AArch64AT {
S12E1R = 0x63c4, // 01 100 0111 1000 100
S12E1W = 0x63c5, // 01 100 0111 1000 101
S12E0R = 0x63c6, // 01 100 0111 1000 110
- S12E0W = 0x63c7 // 01 100 0111 1000 111
+ S12E0W = 0x63c7, // 01 100 0111 1000 111
+ S1E1RP = 0x43c8, // 01 000 0111 1001 000
+ S1E1WP = 0x43c9 // 01 000 0111 1001 001
};
struct ATMapper : AArch64NamedImmMapper {
@@ -463,6 +465,9 @@ namespace AArch64PState {
// v8.1a "Privileged Access Never" extension-specific PStates
PAN = 0x04,
+
+ // v8.2a "User Access Override" extension-specific PStates
+ UAO = 0x03
};
struct PStateMapper : AArch64NamedImmMapper {
@@ -473,6 +478,21 @@ namespace AArch64PState {
}
+namespace AArch64PSBHint {
+ enum PSBHintValues {
+ Invalid = -1,
+ // v8.2a "Statistical Profiling" extension-specific PSB operands
+ CSync = 0x11, // psb csync = hint #0x11
+ };
+
+ struct PSBHintMapper : AArch64NamedImmMapper {
+ const static Mapping PSBHintMappings[];
+
+ PSBHintMapper();
+ };
+
+}
+
namespace AArch64SE {
enum ShiftExtSpecifiers {
Invalid = -1,
@@ -594,6 +614,7 @@ namespace AArch64SysReg {
ID_A64ISAR1_EL1 = 0xc031, // 11 000 0000 0110 001
ID_A64MMFR0_EL1 = 0xc038, // 11 000 0000 0111 000
ID_A64MMFR1_EL1 = 0xc039, // 11 000 0000 0111 001
+ ID_A64MMFR2_EL1 = 0xc03a, // 11 000 0000 0111 010
MVFR0_EL1 = 0xc018, // 11 000 0000 0011 000
MVFR1_EL1 = 0xc019, // 11 000 0000 0011 001
MVFR2_EL1 = 0xc01a, // 11 000 0000 0011 010
@@ -1190,6 +1211,24 @@ namespace AArch64SysReg {
SPSR_EL12 = 0xea00, // 11 101 0100 0000 000
ELR_EL12 = 0xea01, // 11 101 0100 0000 001
+ // v8.2a registers
+ UAO = 0xc214, // 11 000 0100 0010 100
+
+ // v8.2a "Statistical Profiling extension" registers
+ PMBLIMITR_EL1 = 0xc4d0, // 11 000 1001 1010 000
+ PMBPTR_EL1 = 0xc4d1, // 11 000 1001 1010 001
+ PMBSR_EL1 = 0xc4d3, // 11 000 1001 1010 011
+ PMBIDR_EL1 = 0xc4d7, // 11 000 1001 1010 111
+ PMSCR_EL2 = 0xe4c8, // 11 100 1001 1001 000
+ PMSCR_EL12 = 0xecc8, // 11 101 1001 1001 000
+ PMSCR_EL1 = 0xc4c8, // 11 000 1001 1001 000
+ PMSICR_EL1 = 0xc4ca, // 11 000 1001 1001 010
+ PMSIRR_EL1 = 0xc4cb, // 11 000 1001 1001 011
+ PMSFCR_EL1 = 0xc4cc, // 11 000 1001 1001 100
+ PMSEVFR_EL1 = 0xc4cd, // 11 000 1001 1001 101
+ PMSLATFR_EL1 = 0xc4ce, // 11 000 1001 1001 110
+ PMSIDR_EL1 = 0xc4cf, // 11 000 1001 1001 111
+
// Cyclone specific system registers
CPM_IOACC_CTL_EL3 = 0xff90,
};
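
The bit-group comments on these register numbers ("11 000 1001 1010 000" and so on) reflect the op0/op1/CRn/CRm/op2 packing of the 16-bit encoding. A small standalone decoder sketch, with field widths taken from those comments:

  #include <cstdint>
  #include <cstdio>

  // op0[15:14] op1[13:11] CRn[10:7] CRm[6:3] op2[2:0]
  static void printSysRegFields(uint16_t Bits) {
    unsigned Op0 = (Bits >> 14) & 0x3;
    unsigned Op1 = (Bits >> 11) & 0x7;
    unsigned CRn = (Bits >> 7) & 0xf;
    unsigned CRm = (Bits >> 3) & 0xf;
    unsigned Op2 = Bits & 0x7;
    std::printf("S%u_%u_c%u_c%u_%u\n", Op0, Op1, CRn, CRm, Op2);
  }

  int main() {
    printSysRegFields(0xc4d0); // PMBLIMITR_EL1 -> S3_0_c9_c10_0
    printSysRegFields(0xc214); // UAO           -> S3_0_c4_c2_4
    return 0;
  }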