aboutsummaryrefslogtreecommitdiff
path: root/lib/Target
diff options
context:
space:
mode:
Diffstat (limited to 'lib/Target')
-rw-r--r--lib/Target/AArch64/AArch64AsmPrinter.cpp5
-rw-r--r--lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp119
-rw-r--r--lib/Target/AArch64/AArch64FrameLowering.cpp11
-rw-r--r--lib/Target/AArch64/AArch64ISelLowering.cpp31
-rw-r--r--lib/Target/AArch64/AArch64InstrInfo.cpp127
-rw-r--r--lib/Target/AArch64/AArch64InstrInfo.h2
-rw-r--r--lib/Target/AArch64/AArch64InstrInfo.td2
-rw-r--r--lib/Target/AArch64/AArch64MacroFusion.cpp13
-rw-r--r--lib/Target/AArch64/AArch64SchedFalkor.td86
-rw-r--r--lib/Target/AArch64/AArch64SchedFalkorDetails.td1109
-rw-r--r--lib/Target/AArch64/AArch64SchedFalkorWriteRes.td403
-rw-r--r--lib/Target/AArch64/AArch64Subtarget.cpp1
-rw-r--r--lib/Target/AArch64/AArch64TargetMachine.cpp12
-rw-r--r--lib/Target/AMDGPU/AMDGPU.td20
-rw-r--r--lib/Target/AMDGPU/AMDGPUISelLowering.cpp45
-rw-r--r--lib/Target/AMDGPU/AMDGPUISelLowering.h3
-rw-r--r--lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp2
-rw-r--r--lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp213
-rw-r--r--lib/Target/AMDGPU/AMDGPUSubtarget.h8
-rw-r--r--lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp83
-rw-r--r--lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp93
-rw-r--r--lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h5
-rw-r--r--lib/Target/AMDGPU/GCNIterativeScheduler.cpp2
-rw-r--r--lib/Target/AMDGPU/GCNMinRegStrategy.cpp2
-rw-r--r--lib/Target/AMDGPU/GCNRegPressure.cpp6
-rw-r--r--lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h12
-rw-r--r--lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp46
-rw-r--r--lib/Target/AMDGPU/R600ISelLowering.cpp8
-rw-r--r--lib/Target/AMDGPU/R600ISelLowering.h2
-rw-r--r--lib/Target/AMDGPU/R600RegisterInfo.td2
-rw-r--r--lib/Target/AMDGPU/SIDefines.h19
-rw-r--r--lib/Target/AMDGPU/SIISelLowering.cpp52
-rw-r--r--lib/Target/AMDGPU/SIISelLowering.h2
-rw-r--r--lib/Target/AMDGPU/SIInstrInfo.cpp4
-rw-r--r--lib/Target/AMDGPU/SIInstrInfo.td180
-rw-r--r--lib/Target/AMDGPU/SOPInstructions.td4
-rw-r--r--lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp11
-rw-r--r--lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h4
-rw-r--r--lib/Target/AMDGPU/VOP1Instructions.td33
-rw-r--r--lib/Target/AMDGPU/VOP2Instructions.td70
-rw-r--r--lib/Target/AMDGPU/VOP3Instructions.td9
-rw-r--r--lib/Target/AMDGPU/VOPCInstructions.td37
-rw-r--r--lib/Target/AMDGPU/VOPInstructions.td114
-rw-r--r--lib/Target/ARM/ARMCallLowering.cpp111
-rw-r--r--lib/Target/ARM/ARMCallLowering.h5
-rw-r--r--lib/Target/ARM/ARMExpandPseudoInsts.cpp4
-rw-r--r--lib/Target/ARM/ARMISelLowering.cpp8
-rw-r--r--lib/Target/ARM/ARMISelLowering.h2
-rw-r--r--lib/Target/ARM/ARMInstrNEON.td496
-rw-r--r--lib/Target/ARM/ARMSchedule.td11
-rw-r--r--lib/Target/ARM/ARMScheduleA9.td9
-rw-r--r--lib/Target/ARM/ARMScheduleR52.td103
-rw-r--r--lib/Target/ARM/ARMScheduleSwift.td10
-rw-r--r--lib/Target/ARM/ARMTargetMachine.cpp63
-rw-r--r--lib/Target/ARM/ARMTargetMachine.h62
-rw-r--r--lib/Target/ARM/ARMTargetObjectFile.cpp4
-rw-r--r--lib/Target/ARM/Thumb1FrameLowering.cpp10
-rw-r--r--lib/Target/AVR/AVRInstrInfo.td1
-rw-r--r--lib/Target/BPF/BPFISelLowering.cpp9
-rw-r--r--lib/Target/BPF/BPFISelLowering.h4
-rw-r--r--lib/Target/Hexagon/HexagonFrameLowering.cpp2
-rw-r--r--lib/Target/Hexagon/HexagonInstrInfo.cpp4
-rw-r--r--lib/Target/Hexagon/HexagonPseudo.td39
-rw-r--r--lib/Target/Hexagon/HexagonRegisterInfo.cpp5
-rw-r--r--lib/Target/Hexagon/HexagonRegisterInfo.h1
-rw-r--r--lib/Target/Hexagon/HexagonVLIWPacketizer.cpp4
-rw-r--r--lib/Target/LLVMBuild.txt1
-rw-r--r--lib/Target/MSP430/MSP430.td14
-rw-r--r--lib/Target/MSP430/MSP430ISelDAGToDAG.cpp4
-rw-r--r--lib/Target/MSP430/MSP430ISelLowering.cpp27
-rw-r--r--lib/Target/MSP430/MSP430InstrInfo.td5
-rw-r--r--lib/Target/MSP430/MSP430RegisterInfo.cpp2
-rw-r--r--lib/Target/MSP430/MSP430Subtarget.cpp27
-rw-r--r--lib/Target/MSP430/MSP430Subtarget.h11
-rw-r--r--lib/Target/Mips/MipsISelLowering.cpp34
-rw-r--r--lib/Target/Mips/MipsSubtarget.cpp7
-rw-r--r--lib/Target/Mips/MipsSubtarget.h7
-rw-r--r--lib/Target/Nios2/CMakeLists.txt18
-rw-r--r--lib/Target/Nios2/LLVMBuild.txt61
-rw-r--r--lib/Target/Nios2/MCTargetDesc/CMakeLists.txt2
-rw-r--r--lib/Target/Nios2/MCTargetDesc/LLVMBuild.txt25
-rw-r--r--lib/Target/Nios2/MCTargetDesc/Nios2MCTargetDesc.cpp25
-rw-r--r--lib/Target/Nios2/MCTargetDesc/Nios2MCTargetDesc.h34
-rw-r--r--lib/Target/Nios2/Nios2.h25
-rw-r--r--lib/Target/Nios2/Nios2.td29
-rw-r--r--lib/Target/Nios2/Nios2InstrFormats.td117
-rw-r--r--lib/Target/Nios2/Nios2InstrInfo.td50
-rw-r--r--lib/Target/Nios2/Nios2RegisterInfo.td60
-rw-r--r--lib/Target/Nios2/Nios2TargetMachine.cpp46
-rw-r--r--lib/Target/Nios2/Nios2TargetMachine.h30
-rw-r--r--lib/Target/Nios2/TargetInfo/CMakeLists.txt1
-rw-r--r--lib/Target/Nios2/TargetInfo/LLVMBuild.txt23
-rw-r--r--lib/Target/Nios2/TargetInfo/Nios2TargetInfo.cpp24
-rw-r--r--lib/Target/PowerPC/PPCExpandISEL.cpp2
-rw-r--r--lib/Target/PowerPC/PPCISelLowering.cpp92
-rw-r--r--lib/Target/PowerPC/PPCISelLowering.h6
-rw-r--r--lib/Target/PowerPC/PPCInstr64Bit.td4
-rw-r--r--lib/Target/PowerPC/PPCInstrInfo.cpp2
-rw-r--r--lib/Target/PowerPC/PPCInstrInfo.td2
-rw-r--r--lib/Target/PowerPC/PPCInstrVSX.td76
-rw-r--r--lib/Target/SystemZ/SystemZExpandPseudo.cpp2
-rw-r--r--lib/Target/SystemZ/SystemZInstrInfo.cpp30
-rw-r--r--lib/Target/SystemZ/SystemZTargetTransformInfo.h1
-rw-r--r--lib/Target/X86/AsmParser/X86AsmParser.cpp5
-rw-r--r--lib/Target/X86/CMakeLists.txt1
-rw-r--r--lib/Target/X86/X86.td3
-rw-r--r--lib/Target/X86/X86FloatingPoint.cpp4
-rw-r--r--lib/Target/X86/X86ISelLowering.cpp8
-rw-r--r--lib/Target/X86/X86InstrAVX512.td152
-rw-r--r--lib/Target/X86/X86InstrArithmetic.td24
-rw-r--r--lib/Target/X86/X86InstrFMA.td13
-rw-r--r--lib/Target/X86/X86InstrFormats.td10
-rw-r--r--lib/Target/X86/X86InstrInfo.cpp3406
-rw-r--r--lib/Target/X86/X86InstrInfo.td23
-rw-r--r--lib/Target/X86/X86InstrMMX.td5
-rw-r--r--lib/Target/X86/X86InstrSSE.td66
-rw-r--r--lib/Target/X86/X86InstrXOP.td8
-rw-r--r--lib/Target/X86/X86InstructionSelector.cpp20
-rw-r--r--lib/Target/X86/X86LegalizerInfo.cpp17
-rw-r--r--lib/Target/X86/X86LegalizerInfo.h1
-rw-r--r--lib/Target/X86/X86Subtarget.cpp1
-rw-r--r--lib/Target/X86/X86Subtarget.h4
122 files changed, 3581 insertions, 5095 deletions
diff --git a/lib/Target/AArch64/AArch64AsmPrinter.cpp b/lib/Target/AArch64/AArch64AsmPrinter.cpp
index 056ffd58b521..981fd22c213c 100644
--- a/lib/Target/AArch64/AArch64AsmPrinter.cpp
+++ b/lib/Target/AArch64/AArch64AsmPrinter.cpp
@@ -320,6 +320,9 @@ bool AArch64AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
switch (ExtraCode[0]) {
default:
return true; // Unknown modifier.
+ case 'a': // Print 'a' modifier
+ PrintAsmMemoryOperand(MI, OpNum, AsmVariant, ExtraCode, O);
+ return false;
case 'w': // Print W register
case 'x': // Print X register
if (MO.isReg())
@@ -388,7 +391,7 @@ bool AArch64AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
unsigned AsmVariant,
const char *ExtraCode,
raw_ostream &O) {
- if (ExtraCode && ExtraCode[0])
+ if (ExtraCode && ExtraCode[0] && ExtraCode[0] != 'a')
return true; // Unknown modifier.
const MachineOperand &MO = MI->getOperand(OpNum);
diff --git a/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index 629ad5c61b78..33fec74998d6 100644
--- a/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -584,27 +584,21 @@ bool AArch64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB,
return true;
}
-static void addPostLoopLiveIns(MachineBasicBlock *MBB, LivePhysRegs &LiveRegs) {
- for (auto I = LiveRegs.begin(); I != LiveRegs.end(); ++I)
- MBB->addLiveIn(*I);
-}
-
bool AArch64ExpandPseudo::expandCMP_SWAP(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned LdarOp,
unsigned StlrOp, unsigned CmpOp, unsigned ExtendImm, unsigned ZeroReg,
MachineBasicBlock::iterator &NextMBBI) {
MachineInstr &MI = *MBBI;
DebugLoc DL = MI.getDebugLoc();
- MachineOperand &Dest = MI.getOperand(0);
+ const MachineOperand &Dest = MI.getOperand(0);
unsigned StatusReg = MI.getOperand(1).getReg();
- MachineOperand &Addr = MI.getOperand(2);
- MachineOperand &Desired = MI.getOperand(3);
- MachineOperand &New = MI.getOperand(4);
-
- LivePhysRegs LiveRegs(&TII->getRegisterInfo());
- LiveRegs.addLiveOuts(MBB);
- for (auto I = std::prev(MBB.end()); I != MBBI; --I)
- LiveRegs.stepBackward(*I);
+ bool StatusDead = MI.getOperand(1).isDead();
+  // Duplicating undef operands into 2 instructions does not guarantee the same
+  // value on both; however, undef should be replaced by xzr anyway.
+ assert(!MI.getOperand(2).isUndef() && "cannot handle undef");
+ unsigned AddrReg = MI.getOperand(2).getReg();
+ unsigned DesiredReg = MI.getOperand(3).getReg();
+ unsigned NewReg = MI.getOperand(4).getReg();
MachineFunction *MF = MBB.getParent();
auto LoadCmpBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
@@ -616,19 +610,18 @@ bool AArch64ExpandPseudo::expandCMP_SWAP(
MF->insert(++StoreBB->getIterator(), DoneBB);
// .Lloadcmp:
+ // mov wStatus, 0
// ldaxr xDest, [xAddr]
// cmp xDest, xDesired
// b.ne .Ldone
- LoadCmpBB->addLiveIn(Addr.getReg());
- LoadCmpBB->addLiveIn(Dest.getReg());
- LoadCmpBB->addLiveIn(Desired.getReg());
- addPostLoopLiveIns(LoadCmpBB, LiveRegs);
-
+ if (!StatusDead)
+ BuildMI(LoadCmpBB, DL, TII->get(AArch64::MOVZWi), StatusReg)
+ .addImm(0).addImm(0);
BuildMI(LoadCmpBB, DL, TII->get(LdarOp), Dest.getReg())
- .addReg(Addr.getReg());
+ .addReg(AddrReg);
BuildMI(LoadCmpBB, DL, TII->get(CmpOp), ZeroReg)
.addReg(Dest.getReg(), getKillRegState(Dest.isDead()))
- .add(Desired)
+ .addReg(DesiredReg)
.addImm(ExtendImm);
BuildMI(LoadCmpBB, DL, TII->get(AArch64::Bcc))
.addImm(AArch64CC::NE)
@@ -640,25 +633,35 @@ bool AArch64ExpandPseudo::expandCMP_SWAP(
// .Lstore:
// stlxr wStatus, xNew, [xAddr]
// cbnz wStatus, .Lloadcmp
- StoreBB->addLiveIn(Addr.getReg());
- StoreBB->addLiveIn(New.getReg());
- addPostLoopLiveIns(StoreBB, LiveRegs);
-
- BuildMI(StoreBB, DL, TII->get(StlrOp), StatusReg).add(New).add(Addr);
+ BuildMI(StoreBB, DL, TII->get(StlrOp), StatusReg)
+ .addReg(NewReg)
+ .addReg(AddrReg);
BuildMI(StoreBB, DL, TII->get(AArch64::CBNZW))
- .addReg(StatusReg, RegState::Kill)
+ .addReg(StatusReg, getKillRegState(StatusDead))
.addMBB(LoadCmpBB);
StoreBB->addSuccessor(LoadCmpBB);
StoreBB->addSuccessor(DoneBB);
DoneBB->splice(DoneBB->end(), &MBB, MI, MBB.end());
DoneBB->transferSuccessors(&MBB);
- addPostLoopLiveIns(DoneBB, LiveRegs);
MBB.addSuccessor(LoadCmpBB);
NextMBBI = MBB.end();
MI.eraseFromParent();
+
+ // Recompute livein lists.
+ const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ LivePhysRegs LiveRegs;
+ computeLiveIns(LiveRegs, MRI, *DoneBB);
+ computeLiveIns(LiveRegs, MRI, *StoreBB);
+ computeLiveIns(LiveRegs, MRI, *LoadCmpBB);
+ // Do an extra pass around the loop to get loop carried registers right.
+ StoreBB->clearLiveIns();
+ computeLiveIns(LiveRegs, MRI, *StoreBB);
+ LoadCmpBB->clearLiveIns();
+ computeLiveIns(LiveRegs, MRI, *LoadCmpBB);
+
return true;
}
@@ -671,16 +674,15 @@ bool AArch64ExpandPseudo::expandCMP_SWAP_128(
MachineOperand &DestLo = MI.getOperand(0);
MachineOperand &DestHi = MI.getOperand(1);
unsigned StatusReg = MI.getOperand(2).getReg();
- MachineOperand &Addr = MI.getOperand(3);
- MachineOperand &DesiredLo = MI.getOperand(4);
- MachineOperand &DesiredHi = MI.getOperand(5);
- MachineOperand &NewLo = MI.getOperand(6);
- MachineOperand &NewHi = MI.getOperand(7);
-
- LivePhysRegs LiveRegs(&TII->getRegisterInfo());
- LiveRegs.addLiveOuts(MBB);
- for (auto I = std::prev(MBB.end()); I != MBBI; --I)
- LiveRegs.stepBackward(*I);
+ bool StatusDead = MI.getOperand(2).isDead();
+  // Duplicating undef operands into 2 instructions does not guarantee the same
+  // value on both; however, undef should be replaced by xzr anyway.
+ assert(!MI.getOperand(3).isUndef() && "cannot handle undef");
+ unsigned AddrReg = MI.getOperand(3).getReg();
+ unsigned DesiredLoReg = MI.getOperand(4).getReg();
+ unsigned DesiredHiReg = MI.getOperand(5).getReg();
+ unsigned NewLoReg = MI.getOperand(6).getReg();
+ unsigned NewHiReg = MI.getOperand(7).getReg();
MachineFunction *MF = MBB.getParent();
auto LoadCmpBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
@@ -696,20 +698,13 @@ bool AArch64ExpandPseudo::expandCMP_SWAP_128(
// cmp xDestLo, xDesiredLo
// sbcs xDestHi, xDesiredHi
// b.ne .Ldone
- LoadCmpBB->addLiveIn(Addr.getReg());
- LoadCmpBB->addLiveIn(DestLo.getReg());
- LoadCmpBB->addLiveIn(DestHi.getReg());
- LoadCmpBB->addLiveIn(DesiredLo.getReg());
- LoadCmpBB->addLiveIn(DesiredHi.getReg());
- addPostLoopLiveIns(LoadCmpBB, LiveRegs);
-
BuildMI(LoadCmpBB, DL, TII->get(AArch64::LDAXPX))
.addReg(DestLo.getReg(), RegState::Define)
.addReg(DestHi.getReg(), RegState::Define)
- .addReg(Addr.getReg());
+ .addReg(AddrReg);
BuildMI(LoadCmpBB, DL, TII->get(AArch64::SUBSXrs), AArch64::XZR)
.addReg(DestLo.getReg(), getKillRegState(DestLo.isDead()))
- .add(DesiredLo)
+ .addReg(DesiredLoReg)
.addImm(0);
BuildMI(LoadCmpBB, DL, TII->get(AArch64::CSINCWr), StatusReg)
.addUse(AArch64::WZR)
@@ -717,14 +712,14 @@ bool AArch64ExpandPseudo::expandCMP_SWAP_128(
.addImm(AArch64CC::EQ);
BuildMI(LoadCmpBB, DL, TII->get(AArch64::SUBSXrs), AArch64::XZR)
.addReg(DestHi.getReg(), getKillRegState(DestHi.isDead()))
- .add(DesiredHi)
+ .addReg(DesiredHiReg)
.addImm(0);
BuildMI(LoadCmpBB, DL, TII->get(AArch64::CSINCWr), StatusReg)
.addUse(StatusReg, RegState::Kill)
.addUse(StatusReg, RegState::Kill)
.addImm(AArch64CC::EQ);
BuildMI(LoadCmpBB, DL, TII->get(AArch64::CBNZW))
- .addUse(StatusReg, RegState::Kill)
+ .addUse(StatusReg, getKillRegState(StatusDead))
.addMBB(DoneBB);
LoadCmpBB->addSuccessor(DoneBB);
LoadCmpBB->addSuccessor(StoreBB);
@@ -732,28 +727,36 @@ bool AArch64ExpandPseudo::expandCMP_SWAP_128(
// .Lstore:
// stlxp wStatus, xNewLo, xNewHi, [xAddr]
// cbnz wStatus, .Lloadcmp
- StoreBB->addLiveIn(Addr.getReg());
- StoreBB->addLiveIn(NewLo.getReg());
- StoreBB->addLiveIn(NewHi.getReg());
- addPostLoopLiveIns(StoreBB, LiveRegs);
BuildMI(StoreBB, DL, TII->get(AArch64::STLXPX), StatusReg)
- .add(NewLo)
- .add(NewHi)
- .add(Addr);
+ .addReg(NewLoReg)
+ .addReg(NewHiReg)
+ .addReg(AddrReg);
BuildMI(StoreBB, DL, TII->get(AArch64::CBNZW))
- .addReg(StatusReg, RegState::Kill)
+ .addReg(StatusReg, getKillRegState(StatusDead))
.addMBB(LoadCmpBB);
StoreBB->addSuccessor(LoadCmpBB);
StoreBB->addSuccessor(DoneBB);
DoneBB->splice(DoneBB->end(), &MBB, MI, MBB.end());
DoneBB->transferSuccessors(&MBB);
- addPostLoopLiveIns(DoneBB, LiveRegs);
MBB.addSuccessor(LoadCmpBB);
NextMBBI = MBB.end();
MI.eraseFromParent();
+
+ // Recompute liveness bottom up.
+ const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ LivePhysRegs LiveRegs;
+ computeLiveIns(LiveRegs, MRI, *DoneBB);
+ computeLiveIns(LiveRegs, MRI, *StoreBB);
+ computeLiveIns(LiveRegs, MRI, *LoadCmpBB);
+ // Do an extra pass in the loop to get the loop carried dependencies right.
+ StoreBB->clearLiveIns();
+ computeLiveIns(LiveRegs, MRI, *StoreBB);
+ LoadCmpBB->clearLiveIns();
+ computeLiveIns(LiveRegs, MRI, *LoadCmpBB);
+
return true;
}
diff --git a/lib/Target/AArch64/AArch64FrameLowering.cpp b/lib/Target/AArch64/AArch64FrameLowering.cpp
index 1aec602a2a36..0b92249580c8 100644
--- a/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -267,12 +267,12 @@ static unsigned findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB) {
return AArch64::X9;
const AArch64Subtarget &Subtarget = MF->getSubtarget<AArch64Subtarget>();
- const AArch64RegisterInfo *TRI = Subtarget.getRegisterInfo();
+ const AArch64RegisterInfo &TRI = *Subtarget.getRegisterInfo();
LivePhysRegs LiveRegs(TRI);
LiveRegs.addLiveIns(*MBB);
// Mark callee saved registers as used so we will not choose them.
- const MCPhysReg *CSRegs = TRI->getCalleeSavedRegs(MF);
+ const MCPhysReg *CSRegs = TRI.getCalleeSavedRegs(MF);
for (unsigned i = 0; CSRegs[i]; ++i)
LiveRegs.addReg(CSRegs[i]);
@@ -991,6 +991,7 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
SmallVector<RegPairInfo, 8> RegPairs;
computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs);
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
for (auto RPII = RegPairs.rbegin(), RPIE = RegPairs.rend(); RPII != RPIE;
++RPII) {
@@ -1022,9 +1023,11 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
dbgs() << ")\n");
MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StrOpc));
- MBB.addLiveIn(Reg1);
+ if (!MRI.isReserved(Reg1))
+ MBB.addLiveIn(Reg1);
if (RPI.isPaired()) {
- MBB.addLiveIn(Reg2);
+ if (!MRI.isReserved(Reg2))
+ MBB.addLiveIn(Reg2);
MIB.addReg(Reg2, getPrologueDeath(MF, Reg2));
MIB.addMemOperand(MF.getMachineMemOperand(
MachinePointerInfo::getFixedStack(MF, RPI.FrameIdx + 1),
diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp
index 1af36086ad90..62f4c953830b 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -886,18 +886,21 @@ static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm,
// Create the new constant immediate node.
EVT VT = Op.getValueType();
SDLoc DL(Op);
+ SDValue New;
// If the new constant immediate is all-zeros or all-ones, let the target
// independent DAG combine optimize this node.
- if (NewImm == 0 || NewImm == OrigMask)
- return TLO.CombineTo(Op.getOperand(1), TLO.DAG.getConstant(NewImm, DL, VT));
-
+ if (NewImm == 0 || NewImm == OrigMask) {
+ New = TLO.DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
+ TLO.DAG.getConstant(NewImm, DL, VT));
// Otherwise, create a machine node so that target independent DAG combine
// doesn't undo this optimization.
- Enc = AArch64_AM::encodeLogicalImmediate(NewImm, Size);
- SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT);
- SDValue New(
- TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0);
+ } else {
+ Enc = AArch64_AM::encodeLogicalImmediate(NewImm, Size);
+ SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT);
+ New = SDValue(
+ TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0);
+ }
return TLO.CombineTo(Op, New);
}
@@ -9219,16 +9222,26 @@ static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St,
// instructions (stp).
SDLoc DL(&St);
SDValue BasePtr = St.getBasePtr();
+ uint64_t BaseOffset = 0;
+
const MachinePointerInfo &PtrInfo = St.getPointerInfo();
SDValue NewST1 =
DAG.getStore(St.getChain(), DL, SplatVal, BasePtr, PtrInfo,
OrigAlignment, St.getMemOperand()->getFlags());
+  // As this is in ISel, we will not merge this add which may degrade results.
+ if (BasePtr->getOpcode() == ISD::ADD &&
+ isa<ConstantSDNode>(BasePtr->getOperand(1))) {
+ BaseOffset = cast<ConstantSDNode>(BasePtr->getOperand(1))->getSExtValue();
+ BasePtr = BasePtr->getOperand(0);
+ }
+
unsigned Offset = EltOffset;
while (--NumVecElts) {
unsigned Alignment = MinAlign(OrigAlignment, Offset);
- SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
- DAG.getConstant(Offset, DL, MVT::i64));
+ SDValue OffsetPtr =
+ DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
+ DAG.getConstant(BaseOffset + Offset, DL, MVT::i64));
NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr,
PtrInfo.getWithOffset(Offset), Alignment,
St.getMemOperand()->getFlags());
diff --git a/lib/Target/AArch64/AArch64InstrInfo.cpp b/lib/Target/AArch64/AArch64InstrInfo.cpp
index c42738da7ab0..faf39be9b41e 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -763,15 +763,126 @@ bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const {
llvm_unreachable("Unknown opcode to check as cheap as a move!");
}
-bool AArch64InstrInfo::isFalkorLSLFast(const MachineInstr &MI) const {
- if (MI.getNumOperands() < 4)
+bool AArch64InstrInfo::isFalkorShiftExtFast(const MachineInstr &MI) const {
+ switch (MI.getOpcode()) {
+ default:
return false;
- unsigned ShOpVal = MI.getOperand(3).getImm();
- unsigned ShImm = AArch64_AM::getShiftValue(ShOpVal);
- if (AArch64_AM::getShiftType(ShOpVal) == AArch64_AM::LSL &&
- ShImm < 4)
- return true;
- return false;
+
+ case AArch64::ADDWrs:
+ case AArch64::ADDXrs:
+ case AArch64::ADDSWrs:
+ case AArch64::ADDSXrs: {
+ unsigned Imm = MI.getOperand(3).getImm();
+ unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
+ if (ShiftVal == 0)
+ return true;
+ return AArch64_AM::getShiftType(Imm) == AArch64_AM::LSL && ShiftVal <= 5;
+ }
+
+ case AArch64::ADDWrx:
+ case AArch64::ADDXrx:
+ case AArch64::ADDXrx64:
+ case AArch64::ADDSWrx:
+ case AArch64::ADDSXrx:
+ case AArch64::ADDSXrx64: {
+ unsigned Imm = MI.getOperand(3).getImm();
+ switch (AArch64_AM::getArithExtendType(Imm)) {
+ default:
+ return false;
+ case AArch64_AM::UXTB:
+ case AArch64_AM::UXTH:
+ case AArch64_AM::UXTW:
+ case AArch64_AM::UXTX:
+ return AArch64_AM::getArithShiftValue(Imm) <= 4;
+ }
+ }
+
+ case AArch64::SUBWrs:
+ case AArch64::SUBSWrs: {
+ unsigned Imm = MI.getOperand(3).getImm();
+ unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
+ return ShiftVal == 0 ||
+ (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 31);
+ }
+
+ case AArch64::SUBXrs:
+ case AArch64::SUBSXrs: {
+ unsigned Imm = MI.getOperand(3).getImm();
+ unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
+ return ShiftVal == 0 ||
+ (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 63);
+ }
+
+ case AArch64::SUBWrx:
+ case AArch64::SUBXrx:
+ case AArch64::SUBXrx64:
+ case AArch64::SUBSWrx:
+ case AArch64::SUBSXrx:
+ case AArch64::SUBSXrx64: {
+ unsigned Imm = MI.getOperand(3).getImm();
+ switch (AArch64_AM::getArithExtendType(Imm)) {
+ default:
+ return false;
+ case AArch64_AM::UXTB:
+ case AArch64_AM::UXTH:
+ case AArch64_AM::UXTW:
+ case AArch64_AM::UXTX:
+ return AArch64_AM::getArithShiftValue(Imm) == 0;
+ }
+ }
+
+ case AArch64::LDRBBroW:
+ case AArch64::LDRBBroX:
+ case AArch64::LDRBroW:
+ case AArch64::LDRBroX:
+ case AArch64::LDRDroW:
+ case AArch64::LDRDroX:
+ case AArch64::LDRHHroW:
+ case AArch64::LDRHHroX:
+ case AArch64::LDRHroW:
+ case AArch64::LDRHroX:
+ case AArch64::LDRQroW:
+ case AArch64::LDRQroX:
+ case AArch64::LDRSBWroW:
+ case AArch64::LDRSBWroX:
+ case AArch64::LDRSBXroW:
+ case AArch64::LDRSBXroX:
+ case AArch64::LDRSHWroW:
+ case AArch64::LDRSHWroX:
+ case AArch64::LDRSHXroW:
+ case AArch64::LDRSHXroX:
+ case AArch64::LDRSWroW:
+ case AArch64::LDRSWroX:
+ case AArch64::LDRSroW:
+ case AArch64::LDRSroX:
+ case AArch64::LDRWroW:
+ case AArch64::LDRWroX:
+ case AArch64::LDRXroW:
+ case AArch64::LDRXroX:
+ case AArch64::PRFMroW:
+ case AArch64::PRFMroX:
+ case AArch64::STRBBroW:
+ case AArch64::STRBBroX:
+ case AArch64::STRBroW:
+ case AArch64::STRBroX:
+ case AArch64::STRDroW:
+ case AArch64::STRDroX:
+ case AArch64::STRHHroW:
+ case AArch64::STRHHroX:
+ case AArch64::STRHroW:
+ case AArch64::STRHroX:
+ case AArch64::STRQroW:
+ case AArch64::STRQroX:
+ case AArch64::STRSroW:
+ case AArch64::STRSroX:
+ case AArch64::STRWroW:
+ case AArch64::STRWroX:
+ case AArch64::STRXroW:
+ case AArch64::STRXroX: {
+ unsigned IsSigned = MI.getOperand(3).getImm();
+ return !IsSigned;
+ }
+ }
}
bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
diff --git a/lib/Target/AArch64/AArch64InstrInfo.h b/lib/Target/AArch64/AArch64InstrInfo.h
index 4cd14db633b9..59f3405fe439 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/lib/Target/AArch64/AArch64InstrInfo.h
@@ -270,7 +270,7 @@ public:
bool IsTailCall) const override;
/// Returns true if the instruction has a shift by immediate that can be
/// executed in one cycle less.
- bool isFalkorLSLFast(const MachineInstr &MI) const;
+ bool isFalkorShiftExtFast(const MachineInstr &MI) const;
private:
/// \brief Sets the offsets on outlined instructions in \p MBB which use SP
diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td
index da68f3165c5e..ad24612239fa 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/lib/Target/AArch64/AArch64InstrInfo.td
@@ -442,7 +442,7 @@ def MSRpstateImm4 : MSRpstateImm0_15;
// TPIDR_EL0. Add pseudo op so we can mark it as not having any side effects.
let hasSideEffects = 0 in
def MOVbaseTLS : Pseudo<(outs GPR64:$dst), (ins),
- [(set GPR64:$dst, AArch64threadpointer)]>, Sched<[]>;
+ [(set GPR64:$dst, AArch64threadpointer)]>, Sched<[WriteSys]>;
// The cycle counter PMC register is PMCCNTR_EL0.
let Predicates = [HasPerfMon] in
diff --git a/lib/Target/AArch64/AArch64MacroFusion.cpp b/lib/Target/AArch64/AArch64MacroFusion.cpp
index a6926a6700e1..3b71d529db59 100644
--- a/lib/Target/AArch64/AArch64MacroFusion.cpp
+++ b/lib/Target/AArch64/AArch64MacroFusion.cpp
@@ -232,6 +232,19 @@ static bool scheduleAdjacentImpl(ScheduleDAGMI *DAG, SUnit &AnchorSU) {
dbgs() << DAG->TII->getName(FirstMI->getOpcode()) << " - " <<
DAG->TII->getName(SecondMI->getOpcode()) << '\n'; );
+ if (&SecondSU != &DAG->ExitSU)
+ // Make instructions dependent on FirstSU also dependent on SecondSU to
+    // prevent them from being scheduled between FirstSU and SecondSU.
+ for (SUnit::const_succ_iterator
+ SI = FirstSU.Succs.begin(), SE = FirstSU.Succs.end();
+ SI != SE; ++SI) {
+ if (!SI->getSUnit() || SI->getSUnit() == &SecondSU)
+ continue;
+ DEBUG(dbgs() << " Copy Succ ";
+ SI->getSUnit()->print(dbgs(), DAG); dbgs() << '\n';);
+ DAG->addEdge(SI->getSUnit(), SDep(&SecondSU, SDep::Artificial));
+ }
+
++NumFused;
return true;
}
diff --git a/lib/Target/AArch64/AArch64SchedFalkor.td b/lib/Target/AArch64/AArch64SchedFalkor.td
index cf1c0b66db58..44fd94fc3d48 100644
--- a/lib/Target/AArch64/AArch64SchedFalkor.td
+++ b/lib/Target/AArch64/AArch64SchedFalkor.td
@@ -61,56 +61,42 @@ let SchedModel = FalkorModel in {
let SchedModel = FalkorModel in {
-def : WriteRes<WriteImm, [FalkorUnitXYZ]> { let Latency = 1; }
-def : WriteRes<WriteI, [FalkorUnitXYZ]> { let Latency = 1; }
-def : WriteRes<WriteISReg, [FalkorUnitVXVY, FalkorUnitVXVY]>
- { let Latency = 1; let NumMicroOps = 2; }
-def : WriteRes<WriteIEReg, [FalkorUnitXYZ, FalkorUnitXYZ]>
- { let Latency = 2; let NumMicroOps = 2; }
-def : WriteRes<WriteExtr, [FalkorUnitXYZ, FalkorUnitXYZ]>
- { let Latency = 2; let NumMicroOps = 2; }
-def : WriteRes<WriteIS, [FalkorUnitXYZ]> { let Latency = 1; }
-def : WriteRes<WriteID32, [FalkorUnitX, FalkorUnitZ]>
- { let Latency = 8; let NumMicroOps = 2; }
-def : WriteRes<WriteID64, [FalkorUnitX, FalkorUnitZ]>
- { let Latency = 16; let NumMicroOps = 2; }
-def : WriteRes<WriteIM32, [FalkorUnitX]> { let Latency = 4; }
-def : WriteRes<WriteIM64, [FalkorUnitX]> { let Latency = 5; }
-def : WriteRes<WriteBr, [FalkorUnitB]> { let Latency = 1; }
-def : WriteRes<WriteBrReg, [FalkorUnitB]> { let Latency = 1; }
-def : WriteRes<WriteLD, [FalkorUnitLD]> { let Latency = 3; }
-def : WriteRes<WriteST, [FalkorUnitST, FalkorUnitSD]>
- { let Latency = 0; let NumMicroOps = 2; }
-def : WriteRes<WriteSTP, [FalkorUnitST, FalkorUnitSD]>
- { let Latency = 0; let NumMicroOps = 2; }
-def : WriteRes<WriteAdr, [FalkorUnitXYZ]> { let Latency = 1; }
-def : WriteRes<WriteLDIdx, [FalkorUnitLD]> { let Latency = 5; }
-def : WriteRes<WriteSTIdx, [FalkorUnitST, FalkorUnitSD]>
- { let Latency = 0; let NumMicroOps = 2; }
-def : WriteRes<WriteF, [FalkorUnitVXVY, FalkorUnitVXVY]>
- { let Latency = 3; let NumMicroOps = 2; }
-def : WriteRes<WriteFCmp, [FalkorUnitVXVY]> { let Latency = 2; }
-def : WriteRes<WriteFCvt, [FalkorUnitVXVY]> { let Latency = 4; }
-def : WriteRes<WriteFCopy, [FalkorUnitVXVY]> { let Latency = 4; }
-def : WriteRes<WriteFImm, [FalkorUnitVXVY]> { let Latency = 4; }
-def : WriteRes<WriteFMul, [FalkorUnitVXVY, FalkorUnitVXVY]>
- { let Latency = 6; let NumMicroOps = 2; }
-def : WriteRes<WriteFDiv, [FalkorUnitVXVY, FalkorUnitVXVY]>
- { let Latency = 12; let NumMicroOps = 2; } // Fragent -1 / NoRSV +1
-def : WriteRes<WriteV, [FalkorUnitVXVY]> { let Latency = 6; }
-def : WriteRes<WriteVLD, [FalkorUnitLD]> { let Latency = 3; }
-def : WriteRes<WriteVST, [FalkorUnitST, FalkorUnitVSD]>
- { let Latency = 0; let NumMicroOps = 2; }
-
-def : WriteRes<WriteSys, []> { let Latency = 1; }
-def : WriteRes<WriteBarrier, []> { let Latency = 1; }
-def : WriteRes<WriteHint, []> { let Latency = 1; }
-
-def : WriteRes<WriteLDHi, []> { let Latency = 3; }
-
-def : WriteRes<WriteAtomic, []> { let Unsupported = 1; }
-
-// No forwarding logic is modelled yet.
+// These WriteRes entries are not used in the Falkor sched model.
+def : WriteRes<WriteImm, []> { let Unsupported = 1; }
+def : WriteRes<WriteI, []> { let Unsupported = 1; }
+def : WriteRes<WriteISReg, []> { let Unsupported = 1; }
+def : WriteRes<WriteIEReg, []> { let Unsupported = 1; }
+def : WriteRes<WriteExtr, []> { let Unsupported = 1; }
+def : WriteRes<WriteIS, []> { let Unsupported = 1; }
+def : WriteRes<WriteID32, []> { let Unsupported = 1; }
+def : WriteRes<WriteID64, []> { let Unsupported = 1; }
+def : WriteRes<WriteIM32, []> { let Unsupported = 1; }
+def : WriteRes<WriteIM64, []> { let Unsupported = 1; }
+def : WriteRes<WriteBr, []> { let Unsupported = 1; }
+def : WriteRes<WriteBrReg, []> { let Unsupported = 1; }
+def : WriteRes<WriteLD, []> { let Unsupported = 1; }
+def : WriteRes<WriteST, []> { let Unsupported = 1; }
+def : WriteRes<WriteSTP, []> { let Unsupported = 1; }
+def : WriteRes<WriteAdr, []> { let Unsupported = 1; }
+def : WriteRes<WriteLDIdx, []> { let Unsupported = 1; }
+def : WriteRes<WriteSTIdx, []> { let Unsupported = 1; }
+def : WriteRes<WriteF, []> { let Unsupported = 1; }
+def : WriteRes<WriteFCmp, []> { let Unsupported = 1; }
+def : WriteRes<WriteFCvt, []> { let Unsupported = 1; }
+def : WriteRes<WriteFCopy, []> { let Unsupported = 1; }
+def : WriteRes<WriteFImm, []> { let Unsupported = 1; }
+def : WriteRes<WriteFMul, []> { let Unsupported = 1; }
+def : WriteRes<WriteFDiv, []> { let Unsupported = 1; }
+def : WriteRes<WriteV, []> { let Unsupported = 1; }
+def : WriteRes<WriteVLD, []> { let Unsupported = 1; }
+def : WriteRes<WriteVST, []> { let Unsupported = 1; }
+def : WriteRes<WriteSys, []> { let Unsupported = 1; }
+def : WriteRes<WriteBarrier, []> { let Unsupported = 1; }
+def : WriteRes<WriteHint, []> { let Unsupported = 1; }
+def : WriteRes<WriteLDHi, []> { let Unsupported = 1; }
+def : WriteRes<WriteAtomic, []> { let Unsupported = 1; }
+
+// These ReadAdvance entries are not used in the Falkor sched model.
def : ReadAdvance<ReadI, 0>;
def : ReadAdvance<ReadISReg, 0>;
def : ReadAdvance<ReadIEReg, 0>;
diff --git a/lib/Target/AArch64/AArch64SchedFalkorDetails.td b/lib/Target/AArch64/AArch64SchedFalkorDetails.td
index a9b4d44a523e..d098cf7a5a37 100644
--- a/lib/Target/AArch64/AArch64SchedFalkorDetails.td
+++ b/lib/Target/AArch64/AArch64SchedFalkorDetails.td
@@ -12,7 +12,509 @@
//
//===----------------------------------------------------------------------===//
-include "AArch64SchedFalkorWriteRes.td"
+// Contains all of the Falkor specific SchedWriteRes types. The approach
+// below is to define a generic SchedWriteRes for every combination of
+// latency and microOps. The naming conventions is to use a prefix, one field
+// for latency, and one or more microOp count/type designators.
+// Prefix: FalkorWr
+// MicroOp Count/Types: #(B|X|Y|Z|LD|ST|SD|VX|VY|VSD)
+// Latency: #cyc
+//
+// e.g. FalkorWr_1Z_6SD_4VX_6cyc means there are 11 micro-ops to be issued
+// down one Z pipe, six SD pipes, four VX pipes and the total latency is
+// six cycles.
+//
+// Contains all of the Falkor specific ReadAdvance types for forwarding logic.
+//
+// Contains all of the Falkor specific WriteVariant types for immediate zero
+// and LSLFast.
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Define 0 micro-op types
+def FalkorWr_none_1cyc : SchedWriteRes<[]> {
+ let Latency = 1;
+ let NumMicroOps = 0;
+}
+def FalkorWr_none_3cyc : SchedWriteRes<[]> {
+ let Latency = 3;
+ let NumMicroOps = 0;
+}
+def FalkorWr_none_4cyc : SchedWriteRes<[]> {
+ let Latency = 4;
+ let NumMicroOps = 0;
+}
+
+//===----------------------------------------------------------------------===//
+// Define 1 micro-op types
+
+def FalkorWr_1X_2cyc : SchedWriteRes<[FalkorUnitX]> { let Latency = 2; }
+def FalkorWr_IMUL32_1X_2cyc : SchedWriteRes<[FalkorUnitX]> { let Latency = 4; }
+def FalkorWr_IMUL64_1X_4cyc : SchedWriteRes<[FalkorUnitX]> { let Latency = 4; }
+def FalkorWr_IMUL64_1X_5cyc : SchedWriteRes<[FalkorUnitX]> { let Latency = 5; }
+def FalkorWr_1Z_0cyc : SchedWriteRes<[FalkorUnitZ]> { let Latency = 0; }
+def FalkorWr_1ZB_0cyc : SchedWriteRes<[FalkorUnitZB]> { let Latency = 0; }
+def FalkorWr_1LD_3cyc : SchedWriteRes<[FalkorUnitLD]> { let Latency = 3; }
+def FalkorWr_1LD_4cyc : SchedWriteRes<[FalkorUnitLD]> { let Latency = 4; }
+def FalkorWr_1XYZ_1cyc : SchedWriteRes<[FalkorUnitXYZ]> { let Latency = 1; }
+def FalkorWr_1XYZ_2cyc : SchedWriteRes<[FalkorUnitXYZ]> { let Latency = 2; }
+def FalkorWr_1XYZB_0cyc : SchedWriteRes<[FalkorUnitXYZB]>{ let Latency = 0; }
+def FalkorWr_1XYZB_1cyc : SchedWriteRes<[FalkorUnitXYZB]>{ let Latency = 1; }
+def FalkorWr_1none_0cyc : SchedWriteRes<[]> { let Latency = 0; }
+
+def FalkorWr_1VXVY_1cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 1; }
+def FalkorWr_1VXVY_2cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 2; }
+def FalkorWr_1VXVY_3cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 3; }
+def FalkorWr_1VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 4; }
+def FalkorWr_VMUL32_1VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 4; }
+def FalkorWr_1VXVY_5cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 5; }
+def FalkorWr_FMUL32_1VXVY_5cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 5; }
+def FalkorWr_1VXVY_6cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 6; }
+def FalkorWr_FMUL64_1VXVY_6cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 6; }
+
+def FalkorWr_1LD_0cyc : SchedWriteRes<[FalkorUnitLD]> { let Latency = 0; }
+def FalkorWr_1ST_0cyc : SchedWriteRes<[FalkorUnitST]> { let Latency = 0; }
+def FalkorWr_1ST_3cyc : SchedWriteRes<[FalkorUnitST]> { let Latency = 3; }
+
+def FalkorWr_1GTOV_1cyc : SchedWriteRes<[FalkorUnitGTOV]>{ let Latency = 1; }
+def FalkorWr_1GTOV_4cyc : SchedWriteRes<[FalkorUnitGTOV]>{ let Latency = 4; }
+def FalkorWr_1VTOG_1cyc : SchedWriteRes<[FalkorUnitVTOG]>{ let Latency = 1; }
+
+//===----------------------------------------------------------------------===//
+// Define 2 micro-op types
+
+def FalkorWr_2VXVY_1cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+def FalkorWr_2VXVY_2cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+}
+def FalkorWr_2VXVY_3cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 3;
+ let NumMicroOps = 2;
+}
+def FalkorWr_2VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
+def FalkorWr_VMUL32_2VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
+def FalkorWr_2VXVY_5cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
+def FalkorWr_FMUL32_2VXVY_5cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
+def FalkorWr_2VXVY_6cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+}
+def FalkorWr_FMUL64_2VXVY_6cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+}
+
+def FalkorWr_1LD_1VXVY_4cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitVXVY]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
+def FalkorWr_1XYZ_1LD_4cyc : SchedWriteRes<[FalkorUnitXYZ, FalkorUnitLD]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
+def FalkorWr_2LD_3cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitLD]> {
+ let Latency = 3;
+ let NumMicroOps = 2;
+}
+
+def FalkorWr_1VX_1VY_5cyc : SchedWriteRes<[FalkorUnitVX, FalkorUnitVY]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
+
+def FalkorWr_1VX_1VY_2cyc : SchedWriteRes<[FalkorUnitVX, FalkorUnitVY]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+}
+
+def FalkorWr_1VX_1VY_4cyc : SchedWriteRes<[FalkorUnitVX, FalkorUnitVY]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
+
+def FalkorWr_1VX_1VY_10cyc : SchedWriteRes<[FalkorUnitVX, FalkorUnitVY]> {
+ let Latency = 10;
+ let NumMicroOps = 2;
+}
+
+def FalkorWr_1GTOV_1VXVY_2cyc : SchedWriteRes<[FalkorUnitGTOV, FalkorUnitVXVY]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+}
+
+def FalkorWr_2GTOV_1cyc : SchedWriteRes<[FalkorUnitGTOV, FalkorUnitGTOV]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+
+def FalkorWr_1XYZ_1ST_4cyc: SchedWriteRes<[FalkorUnitXYZ, FalkorUnitST]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
+def FalkorWr_1XYZ_1LD_5cyc: SchedWriteRes<[FalkorUnitXYZ, FalkorUnitLD]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
+
+def FalkorWr_2XYZ_2cyc : SchedWriteRes<[FalkorUnitXYZ, FalkorUnitXYZ]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+}
+
+def FalkorWr_1Z_1XY_0cyc : SchedWriteRes<[FalkorUnitZ, FalkorUnitXY]> {
+ let Latency = 0;
+ let NumMicroOps = 2;
+}
+
+def FalkorWr_1X_1Z_8cyc : SchedWriteRes<[FalkorUnitX, FalkorUnitZ]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+ let ResourceCycles = [2, 8];
+}
+
+def FalkorWr_1X_1Z_16cyc : SchedWriteRes<[FalkorUnitX, FalkorUnitZ]> {
+ let Latency = 16;
+ let NumMicroOps = 2;
+ let ResourceCycles = [2, 16];
+}
+
+def FalkorWr_1LD_1Z_3cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitZ]> {
+ let Latency = 3;
+ let NumMicroOps = 2;
+}
+
+def FalkorWr_1LD_1none_3cyc : SchedWriteRes<[FalkorUnitLD]> {
+ let Latency = 3;
+ let NumMicroOps = 2;
+}
+
+def FalkorWr_1SD_1ST_0cyc: SchedWriteRes<[FalkorUnitSD, FalkorUnitST]> {
+ let Latency = 0;
+ let NumMicroOps = 2;
+}
+
+def FalkorWr_1VSD_1ST_0cyc: SchedWriteRes<[FalkorUnitVSD, FalkorUnitST]> {
+ let Latency = 0;
+ let NumMicroOps = 2;
+}
+
+//===----------------------------------------------------------------------===//
+// Define 3 micro-op types
+
+def FalkorWr_1ST_1SD_1LD_0cyc : SchedWriteRes<[FalkorUnitST, FalkorUnitSD,
+ FalkorUnitLD]> {
+ let Latency = 0;
+ let NumMicroOps = 3;
+}
+
+def FalkorWr_1ST_1SD_1LD_3cyc : SchedWriteRes<[FalkorUnitST, FalkorUnitSD,
+ FalkorUnitLD]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+}
+
+def FalkorWr_3VXVY_3cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+}
+
+def FalkorWr_3VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 4;
+ let NumMicroOps = 3;
+}
+
+def FalkorWr_3VXVY_5cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 5;
+ let NumMicroOps = 3;
+}
+
+def FalkorWr_3VXVY_6cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 6;
+ let NumMicroOps = 3;
+}
+
+def FalkorWr_1LD_2VXVY_4cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitVXVY]> {
+ let Latency = 4;
+ let NumMicroOps = 3;
+}
+
+def FalkorWr_2LD_1none_3cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitLD]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+}
+
+def FalkorWr_3LD_3cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitLD,
+ FalkorUnitLD]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+}
+
+def FalkorWr_2LD_1Z_3cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitLD,
+ FalkorUnitZ]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+}
+
+def FalkorWr_1XYZ_1SD_1ST_0cyc: SchedWriteRes<[FalkorUnitXYZ, FalkorUnitSD, FalkorUnitST]> {
+ let Latency = 0;
+ let NumMicroOps = 3;
+}
+def FalkorWr_1XYZ_1VSD_1ST_0cyc: SchedWriteRes<[FalkorUnitXYZ, FalkorUnitVSD, FalkorUnitST]> {
+ let Latency = 0;
+ let NumMicroOps = 3;
+}
+//===----------------------------------------------------------------------===//
+// Define 4 micro-op types
+
+def FalkorWr_2VX_2VY_2cyc : SchedWriteRes<[FalkorUnitVX, FalkorUnitVY,
+ FalkorUnitVX, FalkorUnitVY]> {
+ let Latency = 2;
+ let NumMicroOps = 4;
+}
+
+def FalkorWr_4VXVY_2cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY,
+ FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 2;
+ let NumMicroOps = 4;
+}
+def FalkorWr_4VXVY_3cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY,
+ FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 3;
+ let NumMicroOps = 4;
+}
+def FalkorWr_4VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY,
+ FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 4;
+ let NumMicroOps = 4;
+}
+def FalkorWr_4VXVY_6cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY,
+ FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 6;
+ let NumMicroOps = 4;
+}
+
+def FalkorWr_4LD_3cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitLD,
+ FalkorUnitLD, FalkorUnitLD]> {
+ let Latency = 3;
+ let NumMicroOps = 4;
+}
+
+def FalkorWr_1LD_3VXVY_4cyc: SchedWriteRes<[FalkorUnitLD, FalkorUnitVXVY,
+ FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 4;
+ let NumMicroOps = 4;
+}
+
+def FalkorWr_2LD_2none_3cyc: SchedWriteRes<[FalkorUnitLD, FalkorUnitLD]> {
+ let Latency = 3;
+ let NumMicroOps = 4;
+}
+
+def FalkorWr_2LD_1ST_1SD_3cyc: SchedWriteRes<[FalkorUnitLD, FalkorUnitST,
+ FalkorUnitSD, FalkorUnitLD]> {
+ let Latency = 3;
+ let NumMicroOps = 4;
+}
+
+def FalkorWr_2VSD_2ST_0cyc: SchedWriteRes<[FalkorUnitST, FalkorUnitVSD,
+ FalkorUnitST, FalkorUnitVSD]> {
+ let Latency = 0;
+ let NumMicroOps = 4;
+}
+
+//===----------------------------------------------------------------------===//
+// Define 5 micro-op types
+
+def FalkorWr_1LD_4VXVY_4cyc: SchedWriteRes<[FalkorUnitLD, FalkorUnitVXVY,
+ FalkorUnitVXVY, FalkorUnitVXVY,
+ FalkorUnitVXVY]> {
+ let Latency = 4;
+ let NumMicroOps = 5;
+}
+def FalkorWr_2LD_2VXVY_1none_4cyc: SchedWriteRes<[FalkorUnitLD, FalkorUnitLD,
+ FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 4;
+ let NumMicroOps = 5;
+}
+def FalkorWr_5VXVY_7cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY,
+ FalkorUnitVXVY, FalkorUnitVXVY,
+ FalkorUnitVXVY]> {
+ let Latency = 7;
+ let NumMicroOps = 5;
+}
+def FalkorWr_1XYZ_2ST_2VSD_0cyc: SchedWriteRes<[FalkorUnitXYZ, FalkorUnitST,
+ FalkorUnitVSD, FalkorUnitST,
+ FalkorUnitVSD]> {
+ let Latency = 0;
+ let NumMicroOps = 5;
+}
+def FalkorWr_1VXVY_2ST_2VSD_0cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitST,
+ FalkorUnitVSD, FalkorUnitST,
+ FalkorUnitVSD]> {
+ let Latency = 0;
+ let NumMicroOps = 5;
+}
+//===----------------------------------------------------------------------===//
+// Define 6 micro-op types
+
+def FalkorWr_2LD_2VXVY_2none_4cyc: SchedWriteRes<[FalkorUnitLD, FalkorUnitLD,
+ FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 4;
+ let NumMicroOps = 6;
+}
+
+def FalkorWr_2XYZ_2ST_2VSD_0cyc: SchedWriteRes<[FalkorUnitXYZ, FalkorUnitST,
+ FalkorUnitVSD, FalkorUnitXYZ,
+ FalkorUnitST, FalkorUnitVSD]> {
+ let Latency = 0;
+ let NumMicroOps = 6;
+}
+
+def FalkorWr_2VXVY_2ST_2VSD_0cyc: SchedWriteRes<[FalkorUnitVXVY, FalkorUnitST,
+ FalkorUnitVSD, FalkorUnitVXVY,
+ FalkorUnitST, FalkorUnitVSD]> {
+ let Latency = 0;
+ let NumMicroOps = 6;
+}
+
+def FalkorWr_3VSD_3ST_0cyc: SchedWriteRes<[FalkorUnitST, FalkorUnitVSD,
+ FalkorUnitST, FalkorUnitVSD,
+ FalkorUnitST, FalkorUnitVSD]> {
+ let Latency = 0;
+ let NumMicroOps = 6;
+}
+
+//===----------------------------------------------------------------------===//
+// Define 8 micro-op types
+
+def FalkorWr_2LD_2VXVY_2LD_2VXVY_4cyc:SchedWriteRes<[FalkorUnitLD, FalkorUnitLD,
+ FalkorUnitVXVY, FalkorUnitVXVY,
+ FalkorUnitLD, FalkorUnitLD,
+ FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 4;
+ let NumMicroOps = 8;
+}
+
+def FalkorWr_4VSD_4ST_0cyc: SchedWriteRes<[FalkorUnitST, FalkorUnitVSD,
+ FalkorUnitST, FalkorUnitVSD,
+ FalkorUnitST, FalkorUnitVSD,
+ FalkorUnitST, FalkorUnitVSD]> {
+ let Latency = 0;
+ let NumMicroOps = 8;
+}
+
+//===----------------------------------------------------------------------===//
+// Define 9 micro-op types
+
+def FalkorWr_2LD_2VXVY_2LD_1XYZ_2VXVY_4cyc:SchedWriteRes<[FalkorUnitLD,
+ FalkorUnitLD, FalkorUnitVXVY,
+ FalkorUnitVXVY, FalkorUnitLD,
+ FalkorUnitLD, FalkorUnitXYZ,
+ FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 4;
+ let NumMicroOps = 9;
+}
+
+def FalkorWr_2LD_2VXVY_1XYZ_2LD_2VXVY_4cyc:SchedWriteRes<[FalkorUnitLD,
+ FalkorUnitLD, FalkorUnitVXVY,
+ FalkorUnitVXVY, FalkorUnitXYZ,
+ FalkorUnitLD, FalkorUnitLD,
+ FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 4;
+ let NumMicroOps = 9;
+}
+
+//===----------------------------------------------------------------------===//
+// Define 10 micro-op types
+
+def FalkorWr_2VXVY_4ST_4VSD_0cyc: SchedWriteRes<[FalkorUnitVXVY, FalkorUnitST,
+ FalkorUnitVSD, FalkorUnitVXVY,
+ FalkorUnitST, FalkorUnitVSD,
+ FalkorUnitST, FalkorUnitVSD,
+ FalkorUnitST, FalkorUnitVSD]> {
+ let Latency = 0;
+ let NumMicroOps = 10;
+}
+
+//===----------------------------------------------------------------------===//
+// Define 12 micro-op types
+
+def FalkorWr_4VXVY_4ST_4VSD_0cyc: SchedWriteRes<[FalkorUnitVXVY, FalkorUnitST,
+ FalkorUnitVSD, FalkorUnitVXVY,
+ FalkorUnitST, FalkorUnitVSD,
+ FalkorUnitVXVY, FalkorUnitST,
+ FalkorUnitVSD, FalkorUnitVXVY,
+ FalkorUnitST, FalkorUnitVSD]> {
+ let Latency = 0;
+ let NumMicroOps = 12;
+}
+
+// Forwarding logic is modeled for multiply add/accumulate.
+// -----------------------------------------------------------------------------
+def FalkorReadIMA32 : SchedReadAdvance<3, [FalkorWr_IMUL32_1X_2cyc]>;
+def FalkorReadIMA64 : SchedReadAdvance<4, [FalkorWr_IMUL64_1X_4cyc, FalkorWr_IMUL64_1X_5cyc]>;
+def FalkorReadVMA : SchedReadAdvance<3, [FalkorWr_VMUL32_1VXVY_4cyc, FalkorWr_VMUL32_2VXVY_4cyc]>;
+def FalkorReadFMA32 : SchedReadAdvance<1, [FalkorWr_FMUL32_1VXVY_5cyc, FalkorWr_FMUL32_2VXVY_5cyc]>;
+def FalkorReadFMA64 : SchedReadAdvance<2, [FalkorWr_FMUL64_1VXVY_6cyc, FalkorWr_FMUL64_2VXVY_6cyc]>;
+
+// SchedPredicates and WriteVariants for Immediate Zero and LSLFast/ASRFast
+// -----------------------------------------------------------------------------
+def FalkorImmZPred : SchedPredicate<[{MI->getOperand(1).getImm() == 0}]>;
+def FalkorFMOVZrReg : SchedPredicate<[{MI->getOperand(1).getReg() == AArch64::WZR ||
+ MI->getOperand(1).getReg() == AArch64::XZR}]>;
+def FalkorShiftExtFastPred : SchedPredicate<[{TII->isFalkorShiftExtFast(*MI)}]>;
+
+def FalkorWr_FMOV : SchedWriteVariant<[
+ SchedVar<FalkorFMOVZrReg, [FalkorWr_1none_0cyc]>,
+ SchedVar<NoSchedPred, [FalkorWr_1GTOV_1cyc]>]>;
+
+def FalkorWr_MOVZ : SchedWriteVariant<[
+ SchedVar<FalkorImmZPred, [FalkorWr_1none_0cyc]>,
+ SchedVar<NoSchedPred, [FalkorWr_1XYZB_1cyc]>]>;
+
+def FalkorWr_ADDSUBsx : SchedWriteVariant<[
+ SchedVar<FalkorShiftExtFastPred, [FalkorWr_1XYZ_1cyc]>,
+ SchedVar<NoSchedPred, [FalkorWr_2XYZ_2cyc]>]>;
+
+def FalkorWr_LDRro : SchedWriteVariant<[
+ SchedVar<FalkorShiftExtFastPred, [FalkorWr_1LD_3cyc]>,
+ SchedVar<NoSchedPred, [FalkorWr_1XYZ_1LD_4cyc]>]>;
+
+def FalkorWr_LDRSro : SchedWriteVariant<[
+ SchedVar<FalkorShiftExtFastPred, [FalkorWr_1LD_4cyc]>,
+ SchedVar<NoSchedPred, [FalkorWr_1XYZ_1LD_5cyc]>]>;
+
+def FalkorWr_PRFMro : SchedWriteVariant<[
+ SchedVar<FalkorShiftExtFastPred, [FalkorWr_1ST_3cyc]>,
+ SchedVar<NoSchedPred, [FalkorWr_1XYZ_1ST_4cyc]>]>;
+
+def FalkorWr_STRVro : SchedWriteVariant<[
+ SchedVar<FalkorShiftExtFastPred, [FalkorWr_1VSD_1ST_0cyc]>,
+ SchedVar<NoSchedPred, [FalkorWr_1XYZ_1VSD_1ST_0cyc]>]>;
+
+def FalkorWr_STRQro : SchedWriteVariant<[
+ SchedVar<FalkorShiftExtFastPred, [FalkorWr_1XYZ_2ST_2VSD_0cyc]>,
+ SchedVar<NoSchedPred, [FalkorWr_2XYZ_2ST_2VSD_0cyc]>]>;
+
+def FalkorWr_STRro : SchedWriteVariant<[
+ SchedVar<FalkorShiftExtFastPred, [FalkorWr_1SD_1ST_0cyc]>,
+ SchedVar<NoSchedPred, [FalkorWr_1XYZ_1SD_1ST_0cyc]>]>;
//===----------------------------------------------------------------------===//
// Specialize the coarse model by associating instruction groups with the
@@ -22,63 +524,76 @@ include "AArch64SchedFalkorWriteRes.td"
// Miscellaneous
// -----------------------------------------------------------------------------
-def : InstRW<[WriteI], (instrs COPY)>;
+// FIXME: This could be better modeled by looking at the regclasses of the operands.
+def : InstRW<[FalkorWr_1XYZ_1cyc], (instrs COPY)>;
// SIMD Floating-point Instructions
// -----------------------------------------------------------------------------
-def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^(FABS|FNEG)(v2f32|v4f16)$")>;
+def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^(FABS|FNEG)v2f32$")>;
-def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^(F(MAX|MIN)(NM)?P?|FAC(GE|GT))(v2f32|v4f16|v2i16p|v2i32p)$")>;
-def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^FAC(GE|GT)(16|32|64)$")>;
-def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^FCM(EQ|GE|GT)(16|32|64|v2f32|v4f16|v2i32|v4i16)$")>;
-def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^FCM(EQ|LE|GE|GT|LT)(v1i16|v1i32|v1i64|v2i32|v4i16)rz$")>;
-def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^FRINT(A|I|M|N|P|X|Z)(v2f32|v4f16)$")>;
+def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^(F(MAX|MIN)(NM)?P?|FAC(GE|GT))(v2f32|v2i32p)$")>;
+def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^FAC(GE|GT)(32|64)$")>;
+def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^FCM(EQ|GE|GT)(32|64|v2f32|v2i32)$")>;
+def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^FCM(EQ|LE|GE|GT|LT)(v1i32|v1i64|v2i32)rz$")>;
+def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^FRINT(A|I|M|N|P|X|Z)v2f32$")>;
-def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^F(MAX|MIN)(NM)?V(v4i16|v4i32|v8i16)v$")>;
-def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(FABD|FADD|FSUB)(v2f32|v4f16)$")>;
-def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^FADDP(v2i16p|v2i32p|v2i64p|v2f32|v4f16)$")>;
+def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^F(MAX|MIN)(NM)?Vv4i32v$")>;
+def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(FABD|FADD|FSUB)v2f32$")>;
+def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^FADDP(v2i32p|v2i64p|v2f32)$")>;
-def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^FCVT(N|M|P|Z|A)(S|U)(v1i32|v1i64|v1f16|v2f32|v4f16)$")>;
+def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^FCVT(N|M|P|Z|A)(S|U)(v1i32|v1i64|v2f32)$")>;
def : InstRW<[FalkorWr_1VXVY_4cyc], (instrs FCVTXNv1i64)>;
-def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^FCVTZ(S|U)(v2i32|v4i16)(_shift)?$")>;
+def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^FCVTZ(S|U)v2i32(_shift)?$")>;
-def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc], (instregex "^(FMUL|FMULX)(v2f32|v4f16|(v1i16_indexed|v4i16_indexed|v1i32_indexed|v2i32_indexed))$")>;
-def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc], (instrs FMULX16, FMULX32)>;
+def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc],
+ (instregex "^(FMUL|FMULX)(v2f32|(v1i32_indexed|v2i32_indexed))$")>;
+def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc],
+ (instrs FMULX32)>;
-def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc], (instregex "^(FMUL|FMULX)v1i64_indexed$")>;
-def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc], (instrs FMULX64)>;
+def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc],
+ (instregex "^(FMUL|FMULX)v1i64_indexed$")>;
+def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc],
+ (instrs FMULX64)>;
-def : InstRW<[FalkorWr_2VXVY_1cyc], (instregex "^(FABS|FNEG)(v2f64|v4f32|v8f16)$")>;
+def : InstRW<[FalkorWr_2VXVY_1cyc], (instregex "^(FABS|FNEG)(v2f64|v4f32)$")>;
-def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^(F(MAX|MIN)(NM)?P?|FAC(GE|GT)|FCM(EQ|GE|GT))(v2f64|v4f32|v8f16|v2i64p)$")>;
-def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^FCM(EQ|LE|GE|GT|LT)(v2i64|v4i32|v8i16)rz$")>;
-def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^FRINT(A|I|M|N|P|X|Z)(v2f64|v4f32|v8f16)$")>;
+def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^(F(MAX|MIN)(NM)?P?|FAC(GE|GT)|FCM(EQ|GE|GT))(v2f64|v4f32|v2i64p)$")>;
+def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^FCM(EQ|LE|GE|GT|LT)(v2i64|v4i32)rz$")>;
+def : InstRW<[FalkorWr_2VXVY_2cyc], (instrs FCVTLv4i16, FCVTLv2i32)>;
+def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^FRINT(A|I|M|N|P|X|Z)(v2f64|v4f32)$")>;
-def : InstRW<[FalkorWr_1VX_1VY_10cyc],(instregex "^(FDIV|FSQRT)(v2f32|v4f16)$")>;
+def : InstRW<[FalkorWr_1VX_1VY_10cyc],(instregex "^(FDIV|FSQRT)v2f32$")>;
-def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^(FABD|FADD(P)?|FSUB)(v2f64|v4f32|v8f16)$")>;
+def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^(FABD|FADD(P)?|FSUB)(v2f64|v4f32)$")>;
-def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^FCVT(N|M|P|Z|A)(S|U)(v2f64|v4f32|v8f16)$")>;
-def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^(FCVTL|FCVTL2)(v2i32|v4i16|v4i32|v8i16)$")>;
-def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^FCVTZ(S|U)(v2i64|v4i32|v8i16)(_shift)?$")>;
+def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^FCVT(N|M|P|Z|A)(S|U)(v2f64|v4f32)$")>;
+def : InstRW<[FalkorWr_2VXVY_4cyc], (instrs FCVTLv8i16, FCVTLv4i32)>;
+def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^FCVTZ(S|U)(v2i64|v4i32)(_shift)?$")>;
-def : InstRW<[FalkorWr_FMUL32_2VXVY_5cyc], (instregex "^(FMUL|FMULX)(v2f64|v4f32|v8f16|v8i16_indexed|v4i32_indexed)$")>;
+def : InstRW<[FalkorWr_FMUL32_2VXVY_5cyc],
+ (instregex "^(FMUL|FMULX)(v2f64|v4f32|v4i32_indexed)$")>;
-def : InstRW<[FalkorWr_FMUL64_2VXVY_6cyc], (instregex "^(FMUL|FMULX)v2i64_indexed$")>;
+def : InstRW<[FalkorWr_FMUL64_2VXVY_6cyc],
+ (instregex "^(FMUL|FMULX)v2i64_indexed$")>;
-def : InstRW<[FalkorWr_3VXVY_4cyc], (instregex "^(FCVTX?N|FCVTX?N2)(v1i32|v1i64|v1f16|v2f32|v4f16)$")>;
+def : InstRW<[FalkorWr_3VXVY_4cyc], (instrs FCVTNv4i16, FCVTNv2i32, FCVTXNv2f32)>;
+def : InstRW<[FalkorWr_3VXVY_5cyc], (instrs FCVTNv8i16, FCVTNv4i32, FCVTXNv4f32)>;
-def : InstRW<[FalkorWr_3VXVY_5cyc], (instregex "^(FCVTX?N|FCVTX?N2)(v2i32|v4i16|v4i32|v8i16|v4f32)$")>;
+def : InstRW<[FalkorWr_2VX_2VY_2cyc], (instregex "^(FDIV|FSQRT)(v2f64|v4f32)$")>;
-def : InstRW<[FalkorWr_2VX_2VY_2cyc], (instregex "^(FDIV|FSQRT)(v2f64|v4f32|v8f16)$")>;
+def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc, FalkorReadVMA],
+ (instregex "^ML(A|S)(v8i8|v4i16|v2i32)(_indexed)?$")>;
+def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc, FalkorReadVMA],
+ (instregex "^ML(A|S)(v16i8|v8i16|v4i32|v2i64)(_indexed)?$")>;
-def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc, FalkorReadVMA], (instregex "^ML(A|S)(v8i8|v4i16|v2i32)(_indexed)?$")>;
-def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc, FalkorReadVMA], (instregex "^ML(A|S)(v16i8|v8i16|v4i32|v2i64)(_indexed)?$")>;
-
-def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc, FalkorReadFMA32], (instregex "^FML(A|S)(v2f32|v4f16|(v1i16_indexed|v4i16_indexed|v1i32_indexed|v2i32_indexed))$")>;
-def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc, FalkorReadFMA64], (instregex "^FML(A|S)v1i64_indexed$")>;
-def : InstRW<[FalkorWr_FMUL32_2VXVY_5cyc, FalkorReadFMA32], (instregex "^FML(A|S)(v4f32|v8f16|v8i16_indexed|v4i32_indexed)$")>;
-def : InstRW<[FalkorWr_FMUL64_2VXVY_6cyc, FalkorReadFMA64], (instregex "^FML(A|S)(v2f64|v2i64_indexed)$")>;
+def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc, FalkorReadFMA32],
+ (instregex "^FML(A|S)(v2f32|(v1i32_indexed|v2i32_indexed))$")>;
+def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc, FalkorReadFMA64],
+ (instregex "^FML(A|S)v1i64_indexed$")>;
+def : InstRW<[FalkorWr_FMUL32_2VXVY_5cyc, FalkorReadFMA32],
+ (instregex "^FML(A|S)(v4f32|v4i32_indexed)$")>;
+def : InstRW<[FalkorWr_FMUL64_2VXVY_6cyc, FalkorReadFMA64],
+ (instregex "^FML(A|S)(v2f64|v2i64_indexed)$")>;
// SIMD Integer Instructions
// -----------------------------------------------------------------------------
@@ -92,12 +607,14 @@ def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^SUB(v1i64|v2i32|v4i16|v8i8)$"
def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^(S|U)(ADDLP|HADD|HSUB|SHL)(v2i32|v4i16|v8i8)(_v.*)?$")>;
def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^(S|U)SHLv1i64$")>;
def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^(S|U)SHR(v2i32|v4i16|v8i8)_shift$")>;
+def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^(S|U)SHRd$")>;
def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^((S|U)?(MAX|MIN)P?|ABS|ADDP|CM(EQ|GE|HS|GT|HI))(v1i64|v2i32|v4i16|v8i8)$")>;
def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^CM(EQ|GE|HS|GT|HI)(v1i64|v2i32|v4i16|v8i8)$")>;
def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^CM(EQ|LE|GE|GT|LT)(v1i64|v2i32|v4i16|v8i8)rz$")>;
def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^CMTST(v1i64|v2i32|v4i16|v8i8)$")>;
def : InstRW<[FalkorWr_1VXVY_2cyc], (instrs PMULv8i8)>;
def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^SHL(v2i32|v4i16|v8i8)_shift$")>;
+def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^SHLd$")>;
def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^SQNEG(v2i32|v4i16|v8i8)$")>;
def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(S|U)R?SRA(d|(v2i32|v4i16|v8i8)_shift)$")>;
@@ -110,6 +627,8 @@ def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(SQR?SHRN|UQR?SHRN|SQR?SHRUN)
def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(S|U)QSUB(v1i8|v1i16|v2i16|v1i32|v1i64|v2i32|v4i16|v8i8)$")>;
def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(S|U)RHADD(v2i32|v4i16|v8i8)$")>;
def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(S|U)RSHR(v2i32|v4i16|v8i8)_shift$")>;
+def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(S|U)RSHRd$")>;
+def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^R?SHRN(v2i32|v4i16|v8i8)_shift$")>;
def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(SU|US)QADD(v1i8|v1i16|v2i16|v1i32|v1i64|v2i32|v4i16|v8i8)$")>;
def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(S|U)?(MAX|MIN)V(v4i16v|v4i32v)$")>;
def : InstRW<[FalkorWr_1VXVY_3cyc], (instrs ADDVv4i16v)>;
@@ -120,10 +639,14 @@ def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^SQNEG(v1i8|v1i16|v1i32|v1i64)
def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^(S|U)ADDLVv8i8v$")>;
def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^(S|U)?(MAX|MIN)V(v8i8v|v8i16v)$")>;
def : InstRW<[FalkorWr_1VXVY_4cyc], (instrs ADDVv8i8v)>;
-def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc], (instregex "^MUL(v2i32|v4i16|v8i8)(_indexed)?$")>;
-def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc], (instregex "^SQR?DMULH(v8i8|v4i16|v1i32|v2i32|v1i16)(_indexed)?$")>;
-def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc], (instregex "^SQDMULL(i16|i32)$")>;
-def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc, FalkorReadVMA], (instregex "^SQRDML(A|S)H(i16|i32|v8i8|v4i16|v1i32|v2i32|v1i16)(_indexed)?$")>;
+def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc],
+ (instregex "^MUL(v2i32|v4i16|v8i8)(_indexed)?$")>;
+def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc],
+ (instregex "^SQR?DMULH(v8i8|v4i16|v1i32|v2i32|v1i16)(_indexed)?$")>;
+def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc],
+ (instregex "^SQDMULL(i16|i32)$")>;
+def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc, FalkorReadVMA],
+ (instregex "^SQRDML(A|S)H(i16|i32|v8i8|v4i16|v1i32|v2i32|v1i16)(_indexed)?$")>;
def : InstRW<[FalkorWr_1VXVY_5cyc], (instregex "^(S|U)?(MAX|MIN)Vv16i8v$")>;
@@ -154,7 +677,7 @@ def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^ADDP(v4i32|v8i16|v16i8)$")>;
def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^CM(EQ|GE|HS|GT|HI)(v16i8|v2i64|v4i32|v8i16)$")>;
def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^CM(EQ|LE|GE|GT|LT)(v16i8|v2i64|v4i32|v8i16)rz$")>;
def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^(CMTST|PMUL)(v16i8|v2i64|v4i32|v8i16)$")>;
-def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^PMULL2?(v8i8|v16i8)$")>;
+def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^PMULL(v8i8|v16i8)$")>;
def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^SHL(v16i8|v8i16|v4i32|v2i64)_shift$")>;
def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^SHLL(v16i8|v8i16|v4i32|v8i8|v4i16|v2i32)(_shift)?$")>;
@@ -165,14 +688,18 @@ def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^(S|U)(ADALP|QADD)(v16i8|v8i16
def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^(S|U)QSHLU?(v2i64|v4i32|v8i16|v16i8)_shift$")>;
def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^(S|U)(QSHL|RSHL|QRSHL|QSUB|RHADD)(v16i8|v8i16|v4i32|v2i64)$")>;
def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^(S|U)RSHR(v2i64|v4i32|v8i16|v16i8)_shift$")>;
+def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^R?SHRN(v2i64|v4i32|v8i16|v16i8)_shift$")>;
def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^(SU|US)QADD(v16i8|v8i16|v4i32|v2i64)$")>;
-def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^PMULL2?(v1i64|v2i64)$")>;
+def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^PMULL(v1i64|v2i64)$")>;
def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^S(L|R)I(v16i8|v8i16|v4i32|v2i64)_shift$")>;
def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^SQ(ABS|NEG)(v16i8|v8i16|v4i32|v2i64)$")>;
-def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc], (instregex "^(MUL|SQR?DMULH)(v16i8|v8i16|v4i32)(_indexed)?$")>;
-def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc], (instregex "^SQDMULLv.*$")>;
-def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc, FalkorReadVMA], (instregex "^SQRDML(A|S)H(v16i8|v8i16|v4i32)(_indexed)?$")>;
+def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc],
+ (instregex "^(MUL|SQR?DMULH)(v16i8|v8i16|v4i32)(_indexed)?$")>;
+def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc],
+ (instregex "^SQDMULLv.*$")>;
+def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc, FalkorReadVMA],
+ (instregex "^SQRDML(A|S)H(v16i8|v8i16|v4i32)(_indexed)?$")>;
def : InstRW<[FalkorWr_3VXVY_3cyc], (instregex "^(S|U)ADDLVv4i32v$")>;
@@ -186,99 +713,114 @@ def : InstRW<[FalkorWr_4VXVY_3cyc], (instregex "^(S|U)ABALv.*$")>;
def : InstRW<[FalkorWr_4VXVY_4cyc], (instregex "^(S|U)ABA(v16i8|v8i16|v4i32)$")>;
-def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc, FalkorReadVMA], (instregex "^SQD(MLAL|MLSL)(i16|i32|v1i32_indexed|v1i64_indexed)$")>;
-def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc, FalkorReadVMA], (instregex "^SQD(MLAL|MLSL)v[248].*$")>;
+def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc, FalkorReadVMA],
+ (instregex "^SQD(MLAL|MLSL)(i16|i32|v1i32_indexed|v1i64_indexed)$")>;
+def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc, FalkorReadVMA],
+ (instregex "^SQD(MLAL|MLSL)v[248].*$")>;
// SIMD Load Instructions
// -----------------------------------------------------------------------------
-def : InstRW<[WriteVLD], (instregex "^LD1(i64|Onev(8b|4h|2s|1d|16b|8h|4s|2d))$")>;
-def : InstRW<[WriteVLD], (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
-def : InstRW<[WriteVLD], (instrs LD2i64)>;
-def : InstRW<[WriteVLD, WriteAdr], (instregex "^LD1(i64|Onev(8b|4h|2s|1d|16b|8h|4s|2d))_POST$")>;
-def : InstRW<[WriteVLD, WriteAdr], (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
-def : InstRW<[WriteVLD, WriteAdr], (instrs LD2i64_POST)>;
-
-def : InstRW<[FalkorWr_1LD_1VXVY_4cyc], (instregex "LD1i(8|16|32)$")>;
-def : InstRW<[FalkorWr_1LD_1VXVY_4cyc, WriteAdr], (instregex "LD1i(8|16|32)_POST$")>;
-
-def : InstRW<[FalkorWr_1LD_1none_3cyc], (instregex "^LD1Twov(8b|4h|2s|1d)$")>;
-def : InstRW<[FalkorWr_1LD_1none_3cyc], (instregex "^LD2Twov(8b|4h|2s|1d)$")>;
-def : InstRW<[FalkorWr_1LD_1none_3cyc], (instregex "^LD2Rv(8b|4h|2s|1d)$")>;
-def : InstRW<[FalkorWr_1LD_1none_3cyc, WriteAdr], (instregex "^LD1Twov(8b|4h|2s|1d)_POST$")>;
-def : InstRW<[FalkorWr_1LD_1none_3cyc, WriteAdr], (instregex "^LD2Twov(8b|4h|2s|1d)_POST$")>;
-def : InstRW<[FalkorWr_1LD_1none_3cyc, WriteAdr], (instregex "^LD2Rv(8b|4h|2s|1d)_POST$")>;
-
-def : InstRW<[FalkorWr_2LD_3cyc], (instregex "^LD1Twov(16b|8h|4s|2d)$")>;
-def : InstRW<[FalkorWr_2LD_3cyc], (instregex "^LD2Twov(16b|8h|4s|2d)$")>;
-def : InstRW<[FalkorWr_2LD_3cyc], (instregex "^LD2Rv(16b|8h|4s|2d)$")>;
-def : InstRW<[FalkorWr_2LD_3cyc], (instrs LD3i64)>;
-def : InstRW<[FalkorWr_2LD_3cyc], (instrs LD4i64)>;
-def : InstRW<[FalkorWr_2LD_3cyc, WriteAdr], (instregex "^LD1Twov(16b|8h|4s|2d)_POST$")>;
-def : InstRW<[FalkorWr_2LD_3cyc, WriteAdr], (instregex "^LD2Twov(16b|8h|4s|2d)_POST$")>;
-def : InstRW<[FalkorWr_2LD_3cyc, WriteAdr], (instregex "^LD2Rv(16b|8h|4s|2d)_POST$")>;
-def : InstRW<[FalkorWr_2LD_3cyc, WriteAdr], (instrs LD3i64_POST)>;
-def : InstRW<[FalkorWr_2LD_3cyc, WriteAdr], (instrs LD4i64_POST)>;
-
-def : InstRW<[FalkorWr_1LD_2VXVY_4cyc], (instregex "^LD2i(8|16|32)$")>;
-def : InstRW<[FalkorWr_1LD_2VXVY_4cyc, WriteAdr], (instregex "^LD2i(8|16|32)_POST$")>;
-
-def : InstRW<[FalkorWr_2LD_1none_3cyc], (instregex "^LD1Threev(8b|4h|2s|1d)$")>;
-def : InstRW<[FalkorWr_2LD_1none_3cyc], (instregex "^LD3Rv(8b|4h|2s|1d)$")>;
-def : InstRW<[FalkorWr_2LD_1none_3cyc, WriteAdr], (instregex "^LD1Threev(8b|4h|2s|1d)_POST$")>;
-def : InstRW<[FalkorWr_2LD_1none_3cyc, WriteAdr], (instregex "^LD3Rv(8b|4h|2s|1d)_POST$")>;
-
-def : InstRW<[FalkorWr_3LD_3cyc], (instregex "^LD1Threev(16b|8h|4s|2d)$")>;
-def : InstRW<[FalkorWr_3LD_3cyc], (instrs LD3Threev2d)>;
-def : InstRW<[FalkorWr_3LD_3cyc], (instregex "^LD3Rv(16b|8h|4s|2d)$")>;
-def : InstRW<[FalkorWr_3LD_3cyc, WriteAdr], (instregex "^LD1Threev(16b|8h|4s|2d)_POST$")>;
-def : InstRW<[FalkorWr_3LD_3cyc, WriteAdr], (instrs LD3Threev2d_POST)>;
-def : InstRW<[FalkorWr_3LD_3cyc, WriteAdr], (instregex "^LD3Rv(16b|8h|4s|2d)_POST$")>;
-
-def : InstRW<[FalkorWr_1LD_3VXVY_4cyc], (instregex "LD3i(8|16|32)$")>;
-def : InstRW<[FalkorWr_1LD_3VXVY_4cyc, WriteAdr], (instregex "LD3i(8|16|32)_POST$")>;
-
-def : InstRW<[FalkorWr_2LD_2none_3cyc], (instregex "^LD1Fourv(8b|4h|2s|1d)$")>;
-def : InstRW<[FalkorWr_2LD_2none_3cyc], (instregex "^LD4Rv(8b|4h|2s|1d)$")>;
-def : InstRW<[FalkorWr_2LD_2none_3cyc, WriteAdr], (instregex "^LD1Fourv(8b|4h|2s|1d)_POST$")>;
-def : InstRW<[FalkorWr_2LD_2none_3cyc, WriteAdr], (instregex "^LD4Rv(8b|4h|2s|1d)_POST$")>;
-
-def : InstRW<[FalkorWr_4LD_3cyc], (instregex "^LD1Fourv(16b|8h|4s|2d)$")>;
-def : InstRW<[FalkorWr_4LD_3cyc], (instrs LD4Fourv2d)>;
-def : InstRW<[FalkorWr_4LD_3cyc], (instregex "^LD4Rv(16b|8h|4s|2d)$")>;
-def : InstRW<[FalkorWr_4LD_3cyc, WriteAdr], (instregex "^LD1Fourv(16b|8h|4s|2d)_POST$")>;
-def : InstRW<[FalkorWr_4LD_3cyc, WriteAdr], (instrs LD4Fourv2d_POST)>;
-def : InstRW<[FalkorWr_4LD_3cyc, WriteAdr], (instregex "^LD4Rv(16b|8h|4s|2d)_POST$")>;
-
-def : InstRW<[FalkorWr_1LD_4VXVY_4cyc], (instregex "^LD4i(8|16|32)$")>;
-def : InstRW<[FalkorWr_1LD_4VXVY_4cyc, WriteAdr], (instregex "^LD4i(8|16|32)_POST$")>;
-
-def : InstRW<[FalkorWr_2LD_2VXVY_1none_4cyc], (instregex "LD3Threev(8b|4h|2s|1d)$")>;
-def : InstRW<[FalkorWr_2LD_2VXVY_1none_4cyc, WriteAdr],(instregex "LD3Threev(8b|4h|2s|1d)_POST$")>;
-
-def : InstRW<[FalkorWr_2LD_2VXVY_2none_4cyc], (instregex "^LD4Fourv(8b|4h|2s|1d)$")>;
-def : InstRW<[FalkorWr_2LD_2VXVY_2none_4cyc, WriteAdr],(instregex "^LD4Fourv(8b|4h|2s|1d)_POST$")>;
-
-def : InstRW<[FalkorWr_2LD_2VXVY_2LD_2VXVY_4cyc], (instregex "LD3Threev(16b|8h|4s)$")>;
-def : InstRW<[FalkorWr_2LD_2VXVY_2LD_2VXVY_4cyc], (instregex "^LD4Fourv(16b|8h|4s)$")>;
-
-def : InstRW<[FalkorWr_2LD_2VXVY_1XYZ_2LD_2VXVY_4cyc, WriteAdr],(instregex "LD3Threev(16b|8h|4s)_POST$")>;
-def : InstRW<[FalkorWr_2LD_2VXVY_2LD_1XYZ_2VXVY_4cyc, WriteAdr],(instregex "^LD4Fourv(16b|8h|4s)_POST$")>;
+def : InstRW<[FalkorWr_1LD_3cyc], (instregex "^LD1(i64|Onev(8b|4h|2s|1d|16b|8h|4s|2d))$")>;
+def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_3cyc], (instregex "^LD1(i64|Onev(8b|4h|2s|1d|16b|8h|4s|2d))_POST$")>;
+def : InstRW<[FalkorWr_1LD_3cyc], (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_3cyc], (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[FalkorWr_1LD_3cyc], (instrs LD2i64)>;
+def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_3cyc], (instrs LD2i64_POST)>;
+
+def : InstRW<[FalkorWr_1LD_1VXVY_4cyc], (instregex "^LD1i(8|16|32)$")>;
+def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_1VXVY_4cyc], (instregex "^LD1i(8|16|32)_POST$")>;
+
+def : InstRW<[FalkorWr_1LD_1none_3cyc], (instregex "^LD1Twov(8b|4h|2s|1d)$")>;
+def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_1none_3cyc], (instregex "^LD1Twov(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[FalkorWr_1LD_1none_3cyc], (instregex "^LD2Twov(8b|4h|2s|1d)$")>;
+def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_1none_3cyc], (instregex "^LD2Twov(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[FalkorWr_1LD_1none_3cyc], (instregex "^LD2Rv(8b|4h|2s|1d)$")>;
+def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_1none_3cyc], (instregex "^LD2Rv(8b|4h|2s|1d)_POST$")>;
+
+def : InstRW<[FalkorWr_2LD_3cyc], (instregex "^LD1Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_3cyc], (instregex "^LD1Twov(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[FalkorWr_2LD_3cyc], (instregex "^LD2Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_3cyc], (instregex "^LD2Twov(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[FalkorWr_2LD_3cyc], (instregex "^LD2Rv(16b|8h|4s|2d)$")>;
+def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_3cyc], (instregex "^LD2Rv(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[FalkorWr_2LD_3cyc], (instrs LD3i64)>;
+def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_3cyc], (instrs LD3i64_POST)>;
+def : InstRW<[FalkorWr_2LD_3cyc], (instrs LD4i64)>;
+def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_3cyc], (instrs LD4i64_POST)>;
+
+def : InstRW<[FalkorWr_1LD_2VXVY_4cyc], (instregex "^LD2i(8|16|32)$")>;
+def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_2VXVY_4cyc], (instregex "^LD2i(8|16|32)_POST$")>;
+
+def : InstRW<[FalkorWr_2LD_1none_3cyc], (instregex "^LD1Threev(8b|4h|2s|1d)$")>;
+def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_1none_3cyc], (instregex "^LD1Threev(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[FalkorWr_2LD_1none_3cyc], (instregex "^LD3Rv(8b|4h|2s|1d)$")>;
+def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_1none_3cyc], (instregex "^LD3Rv(8b|4h|2s|1d)_POST$")>;
+
+def : InstRW<[FalkorWr_3LD_3cyc], (instregex "^LD1Threev(16b|8h|4s|2d)$")>;
+def : InstRW<[FalkorWr_none_1cyc, FalkorWr_3LD_3cyc], (instregex "^LD1Threev(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[FalkorWr_3LD_3cyc], (instrs LD3Threev2d)>;
+def : InstRW<[FalkorWr_none_1cyc, FalkorWr_3LD_3cyc], (instrs LD3Threev2d_POST)>;
+def : InstRW<[FalkorWr_3LD_3cyc], (instregex "^LD3Rv(16b|8h|4s|2d)$")>;
+def : InstRW<[FalkorWr_none_1cyc, FalkorWr_3LD_3cyc], (instregex "^LD3Rv(16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[FalkorWr_1LD_3VXVY_4cyc], (instregex "^LD3i(8|16|32)$")>;
+def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_3VXVY_4cyc], (instregex "^LD3i(8|16|32)_POST$")>;
+
+def : InstRW<[FalkorWr_2LD_2none_3cyc], (instregex "^LD1Fourv(8b|4h|2s|1d)$")>;
+def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_2none_3cyc], (instregex "^LD1Fourv(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[FalkorWr_2LD_2none_3cyc], (instregex "^LD4Rv(8b|4h|2s|1d)$")>;
+def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_2none_3cyc], (instregex "^LD4Rv(8b|4h|2s|1d)_POST$")>;
+
+def : InstRW<[FalkorWr_4LD_3cyc], (instregex "^LD1Fourv(16b|8h|4s|2d)$")>;
+def : InstRW<[FalkorWr_none_1cyc, FalkorWr_4LD_3cyc], (instregex "^LD1Fourv(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[FalkorWr_4LD_3cyc], (instrs LD4Fourv2d)>;
+def : InstRW<[FalkorWr_none_1cyc, FalkorWr_4LD_3cyc], (instrs LD4Fourv2d_POST)>;
+def : InstRW<[FalkorWr_4LD_3cyc], (instregex "^LD4Rv(16b|8h|4s|2d)$")>;
+def : InstRW<[FalkorWr_none_1cyc, FalkorWr_4LD_3cyc], (instregex "^LD4Rv(16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[FalkorWr_1LD_4VXVY_4cyc], (instregex "^LD4i(8|16|32)$")>;
+def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_4VXVY_4cyc], (instregex "^LD4i(8|16|32)_POST$")>;
+
+def : InstRW<[FalkorWr_2LD_2VXVY_1none_4cyc], (instregex "^LD3Threev(8b|4h|2s|1d)$")>;
+def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_2VXVY_1none_4cyc],
+ (instregex "^LD3Threev(8b|4h|2s|1d)_POST$")>;
+
+def : InstRW<[FalkorWr_2LD_2VXVY_2none_4cyc], (instregex "^LD4Fourv(8b|4h|2s|1d)$")>;
+def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_2VXVY_2none_4cyc],
+ (instregex "^LD4Fourv(8b|4h|2s|1d)_POST$")>;
+
+def : InstRW<[FalkorWr_2LD_2VXVY_2LD_2VXVY_4cyc], (instregex "^LD3Threev(16b|8h|4s)$")>;
+
+def : InstRW<[FalkorWr_2LD_2VXVY_2LD_2VXVY_4cyc], (instregex "^LD4Fourv(16b|8h|4s)$")>;
+
+def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_2VXVY_1XYZ_2LD_2VXVY_4cyc],
+ (instregex "^LD3Threev(16b|8h|4s)_POST$")>;
+
+def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_2VXVY_2LD_1XYZ_2VXVY_4cyc],
+ (instregex "^LD4Fourv(16b|8h|4s)_POST$")>;
// Arithmetic and Logical Instructions
// -----------------------------------------------------------------------------
-def : InstRW<[FalkorWr_ADD], (instregex "^ADD(S)?(W|X)r(s|x)$")>;
+def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^(CCMN|CCMP)(W|X)(r|i)$")>;
+def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^ADC(S)?(W|X)r$")>;
+def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^ADD(S)?(W|X)r(r|i)$")>;
+def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^(CSEL|CSINC|CSINV|CSNEG)(W|X)r$")>;
def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^AND(S)?(W|X)r(i|r|s)$")>;
def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^BIC(S)?(W|X)r(r|s)$")>;
def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^EON(W|X)r(r|s)$")>;
def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^EOR(W|X)r(i|r|s)$")>;
def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^ORN(W|X)r(r|s)$")>;
def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^ORR(W|X)r(i|r|s)$")>;
-def : InstRW<[FalkorWr_2XYZ_2cyc], (instregex "^SUB(S)?(W|X)r(s|x)$")>;
+def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^SBC(S)?(W|X)r$")>;
+def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^SUB(S)?(W|X)r(r|i)$")>;
+def : InstRW<[FalkorWr_ADDSUBsx], (instregex "^ADD(S)?(W|X)r(s|x|x64)$")>;
+def : InstRW<[FalkorWr_ADDSUBsx], (instregex "^SUB(S)?(W|X)r(s|x|x64)$")>;
// SIMD Miscellaneous Instructions
// -----------------------------------------------------------------------------
def : InstRW<[FalkorWr_1GTOV_1cyc], (instregex "^DUP(v8i8|v4i16|v2i32)(gpr|lane)$")>;
def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^DUP(v16i8|v8i16)(gpr|lane)$")>;
+def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^CPY(i8|i16|i32|i64)$")>;
def : InstRW<[FalkorWr_1GTOV_1cyc], (instregex "^INSv(i8|i16)(gpr|lane)$")>;
def : InstRW<[FalkorWr_1VTOG_1cyc], (instregex "^(S|U)MOVv.*$")>;
def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^(BIF|BIT|BSL)v8i8$")>;
@@ -287,35 +829,42 @@ def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "(MOVI|MVNI)(D|v8b_ns|v2i32|v4i
def : InstRW<[FalkorWr_1VXVY_1cyc], (instrs TBLv8i8One)>;
def : InstRW<[FalkorWr_1VXVY_1cyc], (instrs NOTv8i8)>;
def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^REV(16|32|64)v.*$")>;
-def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^(TRN1|TRN2|ZIP1|UZP1|UZP2|ZIP2|XTN|XTN2)(v2i32|v2i64|v4i16|v4i32|v8i8|v8i16|v16i8)$")>;
+def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^(TRN1|TRN2|ZIP1|UZP1|UZP2|ZIP2|XTN)(v2i32|v2i64|v4i16|v4i32|v8i8|v8i16|v16i8)$")>;
-def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^(CLS|CLZ|CNT|RBIT)(v4i32|v8i16|v16i8)$")>;
+def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^(CLS|CLZ|CNT|RBIT)(v2i32|v4i16|v8i8)$")>;
def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "(S|U)QXTU?Nv.*$")>;
def : InstRW<[FalkorWr_1VXVY_3cyc], (instrs FRECPEv1i32, FRECPEv1i64, FRSQRTEv1i32, FRSQRTEv1i64, FRECPEv2f32, FRSQRTEv2f32)>;
def : InstRW<[FalkorWr_1VXVY_3cyc], (instrs FRECPXv1i32, FRECPXv1i64)>;
def : InstRW<[FalkorWr_1VXVY_3cyc], (instrs URECPEv2i32, URSQRTEv2i32)>;
-def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc], (instrs FRECPS32, FRSQRTS32, FRECPSv2f32, FRSQRTSv2f32)>;
+def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc],
+ (instrs FRECPS32, FRSQRTS32, FRECPSv2f32, FRSQRTSv2f32)>;
-def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc], (instrs FRECPS64, FRSQRTS64)>;
+def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc],
+ (instrs FRECPS64, FRSQRTS64)>;
-def : InstRW<[FalkorWr_1GTOV_1VXVY_2cyc],(instregex "^INSv(i32|i64)(gpr|lane)$")>;
+def : InstRW<[FalkorWr_1GTOV_1VXVY_2cyc],
+ (instregex "^INSv(i32|i64)(gpr|lane)$")>;
def : InstRW<[FalkorWr_2GTOV_1cyc], (instregex "^DUP(v4i32|v2i64)(gpr|lane)$")>;
+def : InstRW<[FalkorWr_2VXVY_1cyc], (instregex "^(BIF|BIT|BSL)v16i8$")>;
def : InstRW<[FalkorWr_2VXVY_1cyc], (instrs EXTv16i8)>;
def : InstRW<[FalkorWr_2VXVY_1cyc], (instregex "(MOVI|MVNI)(v2d_ns|v16b_ns|v4i32|v8i16|v4s_msl)$")>;
def : InstRW<[FalkorWr_2VXVY_1cyc], (instrs NOTv16i8)>;
def : InstRW<[FalkorWr_2VXVY_1cyc], (instrs TBLv16i8One)>;
+def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^(CLS|CLZ|CNT|RBIT)(v4i32|v8i16|v16i8)$")>;
def : InstRW<[FalkorWr_2VXVY_3cyc], (instrs FRECPEv2f64, FRECPEv4f32, FRSQRTEv2f64, FRSQRTEv4f32)>;
def : InstRW<[FalkorWr_2VXVY_3cyc], (instrs URECPEv4i32, URSQRTEv4i32)>;
def : InstRW<[FalkorWr_2VXVY_4cyc], (instrs TBLv8i8Two)>;
def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^TBX(v8|v16)i8One$")>;
-def : InstRW<[FalkorWr_FMUL32_2VXVY_5cyc], (instrs FRECPSv4f32, FRSQRTSv4f32)>;
+def : InstRW<[FalkorWr_FMUL32_2VXVY_5cyc],
+ (instrs FRECPSv4f32, FRSQRTSv4f32)>;
-def : InstRW<[FalkorWr_FMUL64_2VXVY_6cyc], (instrs FRECPSv2f64, FRSQRTSv2f64)>;
+def : InstRW<[FalkorWr_FMUL64_2VXVY_6cyc],
+ (instrs FRECPSv2f64, FRSQRTSv2f64)>;
def : InstRW<[FalkorWr_3VXVY_5cyc], (instregex "^TBL(v8i8Three|v16i8Two)$")>;
def : InstRW<[FalkorWr_3VXVY_5cyc], (instregex "^TBX(v8i8Two|v16i8Two)$")>;
@@ -328,50 +877,95 @@ def : InstRW<[FalkorWr_5VXVY_7cyc], (instregex "^TBX(v8i8Four|v16i8Four)$")>;
// SIMD Store Instructions
// -----------------------------------------------------------------------------
-def : InstRW<[WriteVST], (instregex "^STP(D|S)(i)$")>;
-def : InstRW<[WriteVST, WriteAdr], (instregex "^STP(D|S)(post|pre)$")>;
-def : InstRW<[FalkorWr_2XYZ_2ST_2VSD_0cyc], (instregex "^STRQro(W|X)$")>;
-
-def : InstRW<[WriteVST], (instregex "^ST1(One(v8b|v4h|v2s|v1d)(_POST)?|(i8|i16|i32|i64)(_POST)?|One(v16b|v8h|v4s|v2d)|Two(v8b|v4h|v2s|v1d))$")>;
-def : InstRW<[WriteVST], (instregex "^ST2(Two(v8b|v4h|v2s|v1d)|(i8|i16|i32|i64))$")>;
-def : InstRW<[WriteVST, WriteAdr], (instregex "^ST1(One(v16b|v8h|v4s|v2d)|Two(v8b|v4h|v2s|v1d))_POST$")>;
-def : InstRW<[WriteVST, WriteAdr], (instregex "^ST2(Two(v8b|v4h|v2s|v1d)|(i8|i16|i32|i64))_POST$")>;
-
-def : InstRW<[WriteVST, WriteVST], (instregex "^ST1(Two(v16b|v8h|v4s|v2d)|(Three|Four)(v8b|v4h|v2s|v1d))$")>;
-def : InstRW<[WriteVST, WriteVST], (instregex "^ST2Two(v16b|v8h|v4s|v2d)$")>;
-def : InstRW<[WriteVST, WriteVST], (instregex "^ST3(i8|i16|i32|i64)$")>;
-def : InstRW<[WriteVST, WriteVST], (instregex "^ST4(i8|i16|i32|i64)$")>;
-def : InstRW<[WriteVST, WriteVST, WriteAdr], (instregex "^ST1(Two(v16b|v8h|v4s|v2d)|(Three|Four)(v8b|v4h|v2s|v1d))_POST$")>;
-def : InstRW<[WriteVST, WriteVST, WriteAdr], (instregex "^ST2Two(v16b|v8h|v4s|v2d)_POST$")>;
-def : InstRW<[WriteVST, WriteVST, WriteAdr], (instregex "^ST3(i8|i16|i32|i64)_POST$")>;
-def : InstRW<[WriteVST, WriteVST, WriteAdr], (instregex "^ST4(i8|i16|i32|i64)_POST$")>;
-
-def : InstRW<[WriteV, WriteVST, WriteVST], (instregex "^ST3Three(v8b|v4h|v2s|v1d)$")>;
-def : InstRW<[WriteV, WriteVST, WriteVST, WriteAdr], (instregex "^ST3Three(v8b|v4h|v2s|v1d)_POST$")>;
-
-def : InstRW<[WriteVST, WriteVST, WriteVST], (instregex "^ST1Three(v16b|v8h|v4s|v2d)$")>;
-def : InstRW<[WriteVST, WriteVST, WriteVST], (instrs ST3Threev2d)>;
-def : InstRW<[WriteVST, WriteVST, WriteVST, WriteAdr], (instregex "^ST1Three(v16b|v8h|v4s|v2d)_POST$")>;
-def : InstRW<[WriteVST, WriteVST, WriteVST, WriteAdr], (instrs ST3Threev2d_POST)>;
-
-def : InstRW<[WriteV, WriteV, WriteVST, WriteVST], (instregex "^ST4Four(v8b|v4h|v2s|v1d)$")>;
-def : InstRW<[WriteV, WriteV, WriteVST, WriteVST, WriteAdr], (instregex "^ST4Four(v8b|v4h|v2s|v1d)_POST$")>;
-
-def : InstRW<[WriteVST, WriteVST, WriteVST, WriteVST], (instregex "^ST1Four(v16b|v8h|v4s|v2d)$")>;
-def : InstRW<[WriteVST, WriteVST, WriteVST, WriteVST], (instrs ST4Fourv2d)>;
-def : InstRW<[WriteVST, WriteVST, WriteVST, WriteVST, WriteAdr], (instregex "^ST1Four(v16b|v8h|v4s|v2d)_POST$")>;
-def : InstRW<[WriteVST, WriteVST, WriteVST, WriteVST, WriteAdr], (instrs ST4Fourv2d_POST)>;
-
-def : InstRW<[WriteV, WriteV, WriteVST, WriteVST, WriteVST, WriteVST], (instregex "^ST3Three(v16b|v8h|v4s)$")>;
-def : InstRW<[WriteV, WriteV, WriteVST, WriteVST, WriteVST, WriteVST, WriteAdr],(instregex "^ST3Three(v16b|v8h|v4s)_POST$")>;
-
-def : InstRW<[WriteV, WriteV, WriteV, WriteV, WriteVST, WriteVST, WriteVST, WriteVST], (instregex "^ST4Four(v16b|v8h|v4s)$")>;
-def : InstRW<[WriteV, WriteV, WriteV, WriteV, WriteVST, WriteVST, WriteVST, WriteVST, WriteAdr],(instregex "^ST4Four(v16b|v8h|v4s)_POST$")>;
+
+def : InstRW<[FalkorWr_1VSD_1ST_0cyc], (instregex "^STR(Q|D|S|H|B)ui$")>;
+def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1VSD_1ST_0cyc],
+ (instregex "^STR(Q|D|S|H|B)(post|pre)$")>;
+def : InstRW<[FalkorWr_STRVro], (instregex "^STR(D|S|H|B)ro(W|X)$")>;
+def : InstRW<[FalkorWr_2VSD_2ST_0cyc], (instregex "^STPQi$")>;
+def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2VSD_2ST_0cyc],
+ (instregex "^STPQ(post|pre)$")>;
+def : InstRW<[FalkorWr_1VSD_1ST_0cyc], (instregex "^STP(D|S)(i)$")>;
+def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1VSD_1ST_0cyc],
+ (instregex "^STP(D|S)(post|pre)$")>;
+def : InstRW<[FalkorWr_STRQro], (instregex "^STRQro(W|X)$")>;
+def : InstRW<[FalkorWr_1VSD_1ST_0cyc], (instregex "^STUR(Q|D|S|B|H)i$")>;
+def : InstRW<[FalkorWr_1VSD_1ST_0cyc], (instrs STNPDi, STNPSi)>;
+def : InstRW<[FalkorWr_2VSD_2ST_0cyc], (instrs STNPQi)>;
+
+def : InstRW<[FalkorWr_1VSD_1ST_0cyc], (instregex "^ST1(One(v8b|v4h|v2s|v1d)|(i8|i16|i32|i64)|One(v16b|v8h|v4s|v2d)|Two(v8b|v4h|v2s|v1d))$")>;
+def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1VSD_1ST_0cyc],
+ (instregex "^ST1(One(v8b|v4h|v2s|v1d)_POST|(i8|i16|i32|i64)_POST)$")>;
+def : InstRW<[FalkorWr_1VSD_1ST_0cyc], (instregex "^ST2(Two(v8b|v4h|v2s|v1d)|(i8|i16|i32|i64))$")>;
+def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_1VSD_1ST_0cyc],
+ (instregex "^ST1(One(v16b|v8h|v4s|v2d)|Two(v8b|v4h|v2s|v1d))_POST$")>;
+def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_1VSD_1ST_0cyc],
+ (instregex "^ST2(Two(v8b|v4h|v2s|v1d)|(i8|i16|i32|i64))_POST$")>;
+
+def : InstRW<[FalkorWr_2VSD_2ST_0cyc], (instregex "^ST1(Two(v16b|v8h|v4s|v2d)|(Three|Four)(v8b|v4h|v2s|v1d))$")>;
+def : InstRW<[FalkorWr_2VSD_2ST_0cyc], (instregex "^ST2Two(v16b|v8h|v4s|v2d)$")>;
+def : InstRW<[FalkorWr_2VSD_2ST_0cyc], (instregex "^ST3(i8|i16|i32|i64)$")>;
+def : InstRW<[FalkorWr_2VSD_2ST_0cyc], (instregex "^ST4(i8|i16|i32|i64)$")>;
+// FIXME: This is overly conservative in the imm POST case (no XYZ used in that case).
+def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_2VSD_2ST_0cyc],
+ (instregex "^ST1(Two(v16b|v8h|v4s|v2d)|(Three|Four)(v8b|v4h|v2s|v1d))_POST$")>;
+// FIXME: This is overly conservative in the imm POST case (no XYZ used in that case).
+def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_2VSD_2ST_0cyc],
+ (instregex "^ST2Two(v16b|v8h|v4s|v2d)_POST$")>;
+// FIXME: This is overly conservative in the imm POST case (no XYZ used in that case).
+def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_2VSD_2ST_0cyc],
+ (instregex "^ST3(i8|i16|i32|i64)_POST$")>;
+// FIXME: This is overly conservative in the imm POST case (no XYZ used in that case).
+def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_2VSD_2ST_0cyc],
+ (instregex "^ST4(i8|i16|i32|i64)_POST$")>;
+
+def : InstRW<[FalkorWr_1VXVY_2ST_2VSD_0cyc],
+ (instregex "^ST3Three(v8b|v4h|v2s|v1d)$")>;
+// FIXME: This is overly conservative in the imm POST case (no XYZ used in that case).
+def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_1VXVY_2ST_2VSD_0cyc],
+ (instregex "^ST3Three(v8b|v4h|v2s|v1d)_POST$")>;
+
+def : InstRW<[FalkorWr_3VSD_3ST_0cyc], (instregex "^ST1Three(v16b|v8h|v4s|v2d)$")>;
+def : InstRW<[FalkorWr_3VSD_3ST_0cyc], (instrs ST3Threev2d)>;
+// FIXME: This is overly conservative in the imm POST case (no XYZ used in that case).
+def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_3VSD_3ST_0cyc],
+ (instregex "^ST1Three(v16b|v8h|v4s|v2d)_POST$")>;
+// FIXME: This is overly conservative in the imm POST case (no XYZ used in that case).
+def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_3VSD_3ST_0cyc],
+ (instrs ST3Threev2d_POST)>;
+
+def : InstRW<[FalkorWr_2VXVY_2ST_2VSD_0cyc],
+ (instregex "^ST4Four(v8b|v4h|v2s|v1d)$")>;
+// FIXME: This is overly conservative in the imm POST case (no XYZ used in that case).
+def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_2VXVY_2ST_2VSD_0cyc],
+ (instregex "^ST4Four(v8b|v4h|v2s|v1d)_POST$")>;
+
+def : InstRW<[FalkorWr_4VSD_4ST_0cyc], (instregex "^ST1Four(v16b|v8h|v4s|v2d)$")>;
+def : InstRW<[FalkorWr_4VSD_4ST_0cyc], (instrs ST4Fourv2d)>;
+// FIXME: This is overly conservative in the imm POST case (no XYZ used in that case).
+def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_4VSD_4ST_0cyc],
+ (instregex "^ST1Four(v16b|v8h|v4s|v2d)_POST$")>;
+// FIXME: This is overly conservative in the imm POST case (no XYZ used in that case).
+def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_4VSD_4ST_0cyc],
+ (instrs ST4Fourv2d_POST)>;
+
+def : InstRW<[FalkorWr_2VXVY_4ST_4VSD_0cyc],
+ (instregex "^ST3Three(v16b|v8h|v4s)$")>;
+// FIXME: This is overly conservative in the imm POST case (no XYZ used in that case).
+def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_2VXVY_4ST_4VSD_0cyc],
+ (instregex "^ST3Three(v16b|v8h|v4s)_POST$")>;
+
+def : InstRW<[FalkorWr_4VXVY_4ST_4VSD_0cyc],
+ (instregex "^ST4Four(v16b|v8h|v4s)$")>;
+// FIXME: This is overly conservative in the imm POST case (no XYZ used in that case).
+def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_4VXVY_4ST_4VSD_0cyc],
+ (instregex "^ST4Four(v16b|v8h|v4s)_POST$")>;
// Branch Instructions
// -----------------------------------------------------------------------------
-def : InstRW<[FalkorWr_1none_0cyc], (instrs B)>;
+def : InstRW<[FalkorWr_1none_0cyc], (instrs B, TCRETURNdi)>;
def : InstRW<[FalkorWr_1Z_0cyc], (instregex "^(BR|RET|(CBZ|CBNZ|TBZ|TBNZ)(W|X))$")>;
+def : InstRW<[FalkorWr_1Z_0cyc], (instrs RET_ReallyLR, TCRETURNri)>;
def : InstRW<[FalkorWr_1ZB_0cyc], (instrs Bcc)>;
def : InstRW<[FalkorWr_1XYZB_0cyc], (instrs BL)>;
def : InstRW<[FalkorWr_1Z_1XY_0cyc], (instrs BLR)>;
@@ -388,89 +982,103 @@ def : InstRW<[FalkorWr_4VXVY_3cyc], (instrs SHA256SU1rrr)>;
// FP Load Instructions
// -----------------------------------------------------------------------------
-def : InstRW<[WriteLD], (instregex "^LDR((Q|D|S|H|B)ui|(Q|D|S)l)$")>;
-def : InstRW<[WriteLD, WriteAdr], (instregex "^LDR(Q|D|S|H|B)(post|pre)$")>;
-def : InstRW<[WriteLD], (instregex "^LDUR(Q|D|S|H|B)i$")>;
-def : InstRW<[FalkorWr_LDR], (instregex "^LDR(Q|D|H|S|B)ro(W|X)$")>;
-def : InstRW<[FalkorWr_2LD_3cyc, WriteLDHi],(instrs LDNPQi)>;
-def : InstRW<[FalkorWr_2LD_3cyc, WriteLDHi],(instrs LDPQi)>;
-def : InstRW<[FalkorWr_1LD_1none_3cyc, WriteLDHi],(instregex "LDNP(D|S)i$")>;
-def : InstRW<[FalkorWr_1LD_1none_3cyc, WriteLDHi],(instregex "LDP(D|S)i$")>;
-def : InstRW<[FalkorWr_1LD_1none_3cyc, WriteLDHi, WriteAdr],(instregex "LDP(D|S)(pre|post)$")>;
-def : InstRW<[FalkorWr_2LD_3cyc, WriteLDHi, WriteAdr],(instregex "^LDPQ(pre|post)$")>;
+def : InstRW<[FalkorWr_1LD_3cyc], (instregex "^LDR((Q|D|S|H|B)ui|(Q|D|S)l)$")>;
+def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_3cyc],
+ (instregex "^LDR(Q|D|S|H|B)(post|pre)$")>;
+def : InstRW<[FalkorWr_1LD_3cyc], (instregex "^LDUR(Q|D|S|H|B)i$")>;
+def : InstRW<[FalkorWr_LDRro], (instregex "^LDR(Q|D|H|S|B)ro(W|X)$")>;
+def : InstRW<[FalkorWr_2LD_3cyc, FalkorWr_none_3cyc],
+ (instrs LDNPQi)>;
+def : InstRW<[FalkorWr_2LD_3cyc, FalkorWr_none_3cyc],
+ (instrs LDPQi)>;
+def : InstRW<[FalkorWr_1LD_1none_3cyc, FalkorWr_none_3cyc],
+ (instregex "LDNP(D|S)i$")>;
+def : InstRW<[FalkorWr_1LD_1none_3cyc, FalkorWr_none_3cyc],
+ (instregex "LDP(D|S)i$")>;
+def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_1none_3cyc, FalkorWr_none_3cyc],
+ (instregex "LDP(D|S)(pre|post)$")>;
+def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_3cyc, FalkorWr_none_3cyc],
+ (instregex "^LDPQ(pre|post)$")>;
// FP Data Processing Instructions
// -----------------------------------------------------------------------------
-def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^FCCMP(E)?(H|S|D)rr$")>;
-def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^FCMP(E)?(H|S|D)r(r|i)$")>;
-def : InstRW<[FalkorWr_1VTOG_1cyc], (instregex "^FCVT(A|M|N|P)(S|U)U(W|X)(H|S|D)r$")>;
-def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^(FABS|FNEG)(H|S|D)r$")>;
-def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^FCSEL(H|S|D)rrr$")>;
+def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^FCCMP(E)?(S|D)rr$")>;
+def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^FCMP(E)?(S|D)r(r|i)$")>;
+def : InstRW<[FalkorWr_1VTOG_1cyc], (instregex "^FCVT(A|M|N|P|Z)(S|U)U(W|X)(S|D)r$")>;
+def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^(FABS|FNEG)(S|D)r$")>;
+def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^FCSEL(S|D)rrr$")>;
-def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^F(MAX|MIN)(NM)?(H|S|D)rr$")>;
-def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^F(MAX|MIN)(NM)?Pv2i(16|32|64)p$")>;
-def : InstRW<[FalkorWr_1VXVY_2cyc], (instrs FCVTHSr, FCVTHDr)>;
-def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^FRINT(A|I|M|N|P|X|Z)(H|S|D)r$")>;
+def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^F(MAX|MIN)(NM)?(S|D)rr$")>;
+def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^F(MAX|MIN)(NM)?Pv2i(32|64)p$")>;
+def : InstRW<[FalkorWr_1VXVY_2cyc], (instrs FCVTSHr, FCVTDHr)>;
+def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^FRINT(A|I|M|N|P|X|Z)(S|D)r$")>;
-def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^FABD(16|32|64)$")>;
-def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(FADD|FSUB)(H|S|D)rr$")>;
-def : InstRW<[FalkorWr_1VXVY_3cyc], (instrs FCVTSHr, FCVTDHr)>;
+def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^FABD(32|64)$")>;
+def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(FADD|FSUB)(S|D)rr$")>;
+def : InstRW<[FalkorWr_1VXVY_3cyc], (instrs FCVTHSr, FCVTHDr)>;
def : InstRW<[FalkorWr_1VXVY_4cyc], (instrs FCVTSDr, FCVTDSr)>;
-def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc], (instregex "^F(N)?MUL(H|S)rr$")>;
+def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc],
+ (instregex "^F(N)?MULSrr$")>;
-def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc], (instregex "^F(N)?MULDrr$")>;
+def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc],
+ (instregex "^F(N)?MULDrr$")>;
-def : InstRW<[FalkorWr_1VX_1VY_10cyc],(instregex "^FDIV(H|S|D)rr$")>;
-def : InstRW<[FalkorWr_1VX_1VY_2cyc], (instregex "^FSQRT(H|S|D)r$")>;
+def : InstRW<[FalkorWr_1VX_1VY_10cyc],(instregex "^FDIV(S|D)rr$")>;
+def : InstRW<[FalkorWr_1VX_1VY_2cyc], (instregex "^FSQRT(S|D)r$")>;
-def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc, ReadDefault, ReadDefault, FalkorReadFMA32], (instregex "^F(N)?M(ADD|SUB)(H|S)rrr$")>;
-def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc, ReadDefault, ReadDefault, FalkorReadFMA64], (instregex "^F(N)?M(ADD|SUB)Drrr$")>;
+def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc, ReadDefault, ReadDefault, FalkorReadFMA32],
+ (instregex "^F(N)?M(ADD|SUB)Srrr$")>;
+def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc, ReadDefault, ReadDefault, FalkorReadFMA64],
+ (instregex "^F(N)?M(ADD|SUB)Drrr$")>;
// FP Miscellaneous Instructions
// -----------------------------------------------------------------------------
-def : InstRW<[FalkorWr_FMOV], (instregex "^FMOV(H|S|D)i$")>;
-def : InstRW<[FalkorWr_1GTOV_1cyc], (instregex "^FMOV(HW|HX|SW|DX|DXHigh)r$")>;
-def : InstRW<[FalkorWr_1VTOG_1cyc], (instregex "^FCVTZ(S|U)(S|U)(W|X)(D|S)ri?$")>;
-def : InstRW<[FalkorWr_1VTOG_1cyc], (instregex "^FMOV(WH|WS|XH|XD|XDHigh)r$")>;
-def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^FMOV(Hr|Sr|Dr|v.*_ns)$")>;
-// FIXME: We are currently generating movi v0.2d, #0 for these, which is worse than fmov 0.0
+def : InstRW<[FalkorWr_FMOV], (instregex "^FMOV(WS|XD|XDHigh)r$")>;
+def : InstRW<[FalkorWr_1GTOV_1cyc], (instregex "^FMOV(S|D)i$")>;
+def : InstRW<[FalkorWr_1VTOG_1cyc], (instregex "^FCVTZ(S|U)S(W|X)(D|S)ri$")>;
+def : InstRW<[FalkorWr_1VTOG_1cyc], (instregex "^FCVTZ(S|U)(d|s)$")>;
+def : InstRW<[FalkorWr_1VTOG_1cyc], (instregex "^FMOV(SW|DX|DXHigh)r$")>;
+def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^FMOV(Sr|Dr|v.*_ns)$")>;
+// FIXME: We are currently generating movi v0.2d, #0 for these, which is worse than fmov wzr/xzr
def : InstRW<[FalkorWr_2VXVY_1cyc], (instrs FMOVD0, FMOVS0)>;
def : InstRW<[FalkorWr_1GTOV_4cyc], (instregex "^(S|U)CVTF(S|U)(W|X)(D|S)ri$")>;
-def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^(S|U)CVTF(v1i16|v1i32|v2i32|v1i64|v4i16|v2f32|v4f16|d|s)(_shift)?")>;
+def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^(S|U)CVTF(v1i32|v2i32|v1i64|v2f32|d|s)(_shift)?")>;
-def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^(S|U)CVTF(v2i64|v4i32|v8i16|v2f64|v4f32|v8f16)(_shift)?")>;
+def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^(S|U)CVTF(v2i64|v4i32|v2f64|v4f32)(_shift)?")>;
// Load Instructions
// -----------------------------------------------------------------------------
def : InstRW<[FalkorWr_1ST_0cyc], (instrs PRFMui, PRFMl)>;
def : InstRW<[FalkorWr_1ST_0cyc], (instrs PRFUMi)>;
-
-def : InstRW<[WriteLD, WriteLDHi], (instregex "^LDNP(W|X)i$")>;
-def : InstRW<[WriteLD, WriteLDHi], (instregex "^LDP(W|X)i$")>;
-def : InstRW<[FalkorWr_1LD_3cyc], (instregex "^LDR(B|H|W|X)ui$")>;
-def : InstRW<[WriteLD, WriteAdr], (instregex "^LDR(B|H|W|X)(post|pre)$")>;
+def : InstRW<[FalkorWr_1LD_3cyc, FalkorWr_none_3cyc],
+ (instregex "^LDNP(W|X)i$")>;
+def : InstRW<[FalkorWr_1LD_3cyc, FalkorWr_none_3cyc],
+ (instregex "^LDP(W|X)i$")>;
+def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_3cyc, FalkorWr_none_3cyc],
+ (instregex "^LDP(W|X)(post|pre)$")>;
+def : InstRW<[FalkorWr_1LD_3cyc], (instregex "^LDR(BB|HH|W|X)ui$")>;
+def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_3cyc],
+ (instregex "^LDR(BB|HH|W|X)(post|pre)$")>;
+def : InstRW<[FalkorWr_LDRro], (instregex "^LDR(BB|HH|W|X)ro(W|X)$")>;
def : InstRW<[FalkorWr_1LD_3cyc], (instregex "^LDR(W|X)l$")>;
def : InstRW<[FalkorWr_1LD_3cyc], (instregex "^LDTR(B|H|W|X)i$")>;
-def : InstRW<[FalkorWr_1LD_3cyc], (instregex "^LDUR(B|H|W|X)i$")>;
-
+def : InstRW<[FalkorWr_1LD_3cyc], (instregex "^LDUR(BB|HH|W|X)i$")>;
+def : InstRW<[FalkorWr_PRFMro], (instregex "^PRFMro(W|X)$")>;
+def : InstRW<[FalkorWr_1LD_4cyc, FalkorWr_none_4cyc],
+ (instrs LDPSWi)>;
+def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_4cyc, FalkorWr_none_4cyc],
+ (instregex "^LDPSW(post|pre)$")>;
def : InstRW<[FalkorWr_1LD_4cyc], (instregex "^LDRS(BW|BX|HW|HX|W)ui$")>;
+def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_4cyc],
+ (instregex "^LDRS(BW|BX|HW|HX|W)(post|pre)$")>;
+def : InstRW<[FalkorWr_LDRSro], (instregex "^LDRS(BW|BX|HW|HX|W)ro(W|X)$")>;
def : InstRW<[FalkorWr_1LD_4cyc], (instrs LDRSWl)>;
def : InstRW<[FalkorWr_1LD_4cyc], (instregex "^LDTRS(BW|BX|HW|HX|W)i$")>;
def : InstRW<[FalkorWr_1LD_4cyc], (instregex "^LDURS(BW|BX|HW|HX|W)i$")>;
-def : InstRW<[FalkorWr_PRFM], (instregex "^PRFMro(W|X)$")>;
-def : InstRW<[FalkorWr_LDR], (instregex "^LDR(B|H|W|X)ro(W|X)$")>;
-
-def : InstRW<[FalkorWr_LDRS], (instregex "^LDRS(BW|BX|HW|HX|W)ro(W|X)$")>;
-
-def : InstRW<[FalkorWr_1LD_4cyc, WriteAdr],(instregex "^LDRS(BW|BX|HW|HX|W)(post|pre)$")>;
-def : InstRW<[WriteLD, WriteLDHi, WriteAdr],(instregex "^LDP(W|X)(post|pre)$")>;
-def : InstRW<[FalkorWr_1LD_4cyc, WriteLDHi],(instrs LDPSWi)>;
-def : InstRW<[FalkorWr_1LD_4cyc, WriteLDHi, WriteAdr],(instregex "^LDPSW(post|pre)$")>;
-
// Miscellaneous Data-Processing Instructions
// -----------------------------------------------------------------------------
def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^(S|U)?BFM(W|X)ri$")>;
@@ -480,17 +1088,22 @@ def : InstRW<[FalkorWr_2XYZ_2cyc], (instregex "^EXTR(W|X)rri$")>;
// Divide and Multiply Instructions
// -----------------------------------------------------------------------------
-def : InstRW<[FalkorWr_IMUL64_1X_4cyc, ReadDefault, ReadDefault, FalkorReadIMA64], (instregex "^(S|U)M(ADD|SUB)Lrrr$")>;
-def : InstRW<[FalkorWr_IMUL32_1X_2cyc, ReadDefault, ReadDefault, FalkorReadIMA32], (instregex "^M(ADD|SUB)Wrrr$")>;
+def : InstRW<[FalkorWr_IMUL64_1X_4cyc, ReadDefault, ReadDefault, FalkorReadIMA64],
+ (instregex "^(S|U)M(ADD|SUB)Lrrr$")>;
+def : InstRW<[FalkorWr_IMUL32_1X_2cyc, ReadDefault, ReadDefault, FalkorReadIMA32],
+ (instregex "^M(ADD|SUB)Wrrr$")>;
-def : InstRW<[FalkorWr_IMUL64_1X_5cyc], (instregex "^(S|U)MULHrr$")>;
-def : InstRW<[FalkorWr_IMUL64_1X_5cyc, ReadDefault, ReadDefault, FalkorReadIMA64], (instregex "^M(ADD|SUB)Xrrr$")>;
+def : InstRW<[FalkorWr_IMUL64_1X_5cyc], (instregex "^(S|U)MULHrr$")>;
+def : InstRW<[FalkorWr_IMUL64_1X_5cyc, ReadDefault, ReadDefault, FalkorReadIMA64],
+ (instregex "^M(ADD|SUB)Xrrr$")>;
-def : InstRW<[FalkorWr_1X_1Z_8cyc], (instregex "^(S|U)DIVWr$")>;
-def : InstRW<[FalkorWr_1X_1Z_16cyc], (instregex "^(S|U)DIVXr$")>;
+def : InstRW<[FalkorWr_1X_1Z_8cyc], (instregex "^(S|U)DIVWr$")>;
+def : InstRW<[FalkorWr_1X_1Z_16cyc], (instregex "^(S|U)DIVXr$")>;
-def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc], (instregex "^(S|U)MULLv.*$")>;
-def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc, FalkorReadVMA], (instregex "^(S|U)(MLAL|MLSL)v.*$")>;
+def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc],
+ (instregex "^(S|U)MULLv.*$")>;
+def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc, FalkorReadVMA],
+ (instregex "^(S|U)(MLAL|MLSL)v.*$")>;
// Move and Shift Instructions
// -----------------------------------------------------------------------------
@@ -498,6 +1111,11 @@ def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^(LSLV|LSRV|ASRV|RORV|MOVK)(W|
def : InstRW<[FalkorWr_1XYZB_1cyc], (instregex "^ADRP?$")>;
def : InstRW<[FalkorWr_1XYZB_1cyc], (instregex "^MOVN(W|X)i$")>;
def : InstRW<[FalkorWr_MOVZ], (instregex "^MOVZ(W|X)i$")>;
+def : InstRW<[FalkorWr_1XYZ_1cyc], (instrs MOVi32imm, MOVi64imm)>;
+def : InstRW<[WriteSequence<[FalkorWr_1XYZ_1cyc, FalkorWr_1XYZ_1cyc]>],
+ (instrs MOVaddr, MOVaddrBA, MOVaddrCP, MOVaddrEXT, MOVaddrJT, MOVaddrTLS)>;
+def : InstRW<[WriteSequence<[FalkorWr_1LD_3cyc, FalkorWr_1XYZ_1cyc]>],
+ (instrs LOADgot)>;
// Other Instructions
// -----------------------------------------------------------------------------
@@ -507,13 +1125,12 @@ def : InstRW<[FalkorWr_1ST_0cyc], (instrs SYSxt, SYSLxt)>;
def : InstRW<[FalkorWr_1Z_0cyc], (instrs MSRpstateImm1, MSRpstateImm4)>;
def : InstRW<[FalkorWr_1LD_3cyc], (instregex "^(LDAR(B|H|W|X)|LDAXP(W|X)|LDAXR(B|H|W|X)|LDXP(W|X)|LDXR(B|H|W|X))$")>;
-def : InstRW<[FalkorWr_1LD_3cyc], (instrs MRS)>;
+def : InstRW<[FalkorWr_1LD_3cyc], (instrs MRS, MOVbaseTLS)>;
def : InstRW<[FalkorWr_1LD_1Z_3cyc], (instrs DRPS)>;
def : InstRW<[FalkorWr_1SD_1ST_0cyc], (instrs MSR)>;
-def : InstRW<[WriteVST], (instrs STNPDi, STNPSi)>;
-def : InstRW<[WriteSTP], (instrs STNPWi, STNPXi)>;
+def : InstRW<[FalkorWr_1SD_1ST_0cyc], (instrs STNPWi, STNPXi)>;
def : InstRW<[FalkorWr_2LD_1Z_3cyc], (instrs ERET)>;
def : InstRW<[FalkorWr_1ST_1SD_1LD_3cyc], (instregex "^LDC.*$")>;
@@ -523,20 +1140,16 @@ def : InstRW<[FalkorWr_1ST_1SD_1LD_0cyc], (instregex "^STXR(B|H|W|X)$")>;
def : InstRW<[FalkorWr_2LD_1ST_1SD_3cyc], (instregex "^STLXP(W|X)$")>;
def : InstRW<[FalkorWr_2LD_1ST_1SD_3cyc], (instregex "^STLXR(B|H|W|X)$")>;
-def : InstRW<[WriteVST, WriteVST], (instrs STNPQi)>;
// Store Instructions
// -----------------------------------------------------------------------------
-def : InstRW<[WriteST], (instregex "^STP(W|X)i$")>;
-def : InstRW<[WriteST, WriteAdr], (instregex "^STP(W|X)(post|pre)$")>;
-def : InstRW<[WriteST], (instregex "^STR(Q|D|S|BB|HH)ui$")>;
-def : InstRW<[WriteST], (instregex "^STUR(Q|D|S|BB|HH)i$")>;
-def : InstRW<[WriteST], (instregex "^STR(B|H|W|X)ui$")>;
-def : InstRW<[WriteST, WriteAdr], (instregex "^STR(B|H|W|X)(post|pre)$")>;
-def : InstRW<[WriteST], (instregex "^STTR(B|H|W|X)i$")>;
-def : InstRW<[WriteST], (instregex "^STUR(B|H|W|X)i$")>;
-
-def : InstRW<[WriteST, WriteAdr], (instregex "^STR(B|H|W|X)ro(W|X)$")>;
-
-def : InstRW<[WriteVST, WriteVST], (instregex "^STPQi$")>;
-def : InstRW<[WriteVST, WriteVST, WriteAdr], (instregex "^STPQ(post|pre)$")>;
+def : InstRW<[FalkorWr_1SD_1ST_0cyc], (instregex "^STP(W|X)i$")>;
+def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1SD_1ST_0cyc],
+ (instregex "^STP(W|X)(post|pre)$")>;
+def : InstRW<[FalkorWr_1SD_1ST_0cyc], (instregex "^STR(BB|HH|W|X)ui$")>;
+def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1SD_1ST_0cyc],
+ (instregex "^STR(BB|HH|W|X)(post|pre)$")>;
+def : InstRW<[FalkorWr_STRro], (instregex "^STR(BB|HH|W|X)ro(W|X)$")>;
+def : InstRW<[FalkorWr_1SD_1ST_0cyc], (instregex "^STTR(B|H|W|X)i$")>;
+def : InstRW<[FalkorWr_1SD_1ST_0cyc], (instregex "^STUR(BB|HH|W|X)i$")>;
+
diff --git a/lib/Target/AArch64/AArch64SchedFalkorWriteRes.td b/lib/Target/AArch64/AArch64SchedFalkorWriteRes.td
deleted file mode 100644
index 6526cc28e806..000000000000
--- a/lib/Target/AArch64/AArch64SchedFalkorWriteRes.td
+++ /dev/null
@@ -1,403 +0,0 @@
-//=- AArch64SchedFalkorWrRes.td - Falkor Write Res ---*- tablegen -*-=//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// Contains all of the Falkor specific SchedWriteRes types. The approach
-// below is to define a generic SchedWriteRes for every combination of
-// latency and microOps. The naming conventions is to use a prefix, one field
-// for latency, and one or more microOp count/type designators.
-// Prefix: FalkorWr
-// MicroOp Count/Types: #(B|X|Y|Z|LD|ST|SD|VX|VY|VSD)
-// Latency: #cyc
-//
-// e.g. FalkorWr_1Z_6SD_4VX_6cyc means there are 11 micro-ops to be issued
-// down one Z pipe, six SD pipes, four VX pipes and the total latency is
-// six cycles.
-//
-// Contains all of the Falkor specific ReadAdvance types for forwarding logic.
-//
-// Contains all of the Falkor specific WriteVariant types for immediate zero
-// and LSLFast.
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// Define 1 micro-op types
-
-def FalkorWr_1X_2cyc : SchedWriteRes<[FalkorUnitX]> { let Latency = 2; }
-def FalkorWr_IMUL32_1X_2cyc : SchedWriteRes<[FalkorUnitX]> { let Latency = 4; }
-def FalkorWr_IMUL64_1X_4cyc : SchedWriteRes<[FalkorUnitX]> { let Latency = 4; }
-def FalkorWr_IMUL64_1X_5cyc : SchedWriteRes<[FalkorUnitX]> { let Latency = 5; }
-def FalkorWr_1Z_0cyc : SchedWriteRes<[FalkorUnitZ]> { let Latency = 0; }
-def FalkorWr_1ZB_0cyc : SchedWriteRes<[FalkorUnitZB]> { let Latency = 0; }
-def FalkorWr_1LD_3cyc : SchedWriteRes<[FalkorUnitLD]> { let Latency = 3; }
-def FalkorWr_1LD_4cyc : SchedWriteRes<[FalkorUnitLD]> { let Latency = 4; }
-def FalkorWr_1XYZ_1cyc : SchedWriteRes<[FalkorUnitXYZ]> { let Latency = 1; }
-def FalkorWr_1XYZ_2cyc : SchedWriteRes<[FalkorUnitXYZ]> { let Latency = 2; }
-def FalkorWr_1XYZB_0cyc : SchedWriteRes<[FalkorUnitXYZB]>{ let Latency = 0; }
-def FalkorWr_1XYZB_1cyc : SchedWriteRes<[FalkorUnitXYZB]>{ let Latency = 1; }
-def FalkorWr_1none_0cyc : SchedWriteRes<[]> { let Latency = 0; }
-
-def FalkorWr_1VXVY_1cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 1; }
-def FalkorWr_1VXVY_2cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 2; }
-def FalkorWr_1VXVY_3cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 3; }
-def FalkorWr_1VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 4; }
-def FalkorWr_VMUL32_1VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 4; }
-def FalkorWr_1VXVY_5cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 5; }
-def FalkorWr_FMUL32_1VXVY_5cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 5; }
-def FalkorWr_FMUL64_1VXVY_6cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 6; }
-
-def FalkorWr_1LD_0cyc : SchedWriteRes<[FalkorUnitLD]> { let Latency = 0; }
-def FalkorWr_1ST_0cyc : SchedWriteRes<[FalkorUnitST]> { let Latency = 0; }
-def FalkorWr_1ST_3cyc : SchedWriteRes<[FalkorUnitST]> { let Latency = 3; }
-
-def FalkorWr_1GTOV_1cyc : SchedWriteRes<[FalkorUnitGTOV]>{ let Latency = 1; }
-def FalkorWr_1GTOV_4cyc : SchedWriteRes<[FalkorUnitGTOV]>{ let Latency = 4; }
-def FalkorWr_1VTOG_1cyc : SchedWriteRes<[FalkorUnitVTOG]>{ let Latency = 1; }
-
-//===----------------------------------------------------------------------===//
-// Define 2 micro-op types
-
-def FalkorWr_2VXVY_1cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> {
- let Latency = 1;
- let NumMicroOps = 2;
-}
-def FalkorWr_2VXVY_2cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> {
- let Latency = 2;
- let NumMicroOps = 2;
-}
-def FalkorWr_2VXVY_3cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> {
- let Latency = 3;
- let NumMicroOps = 2;
-}
-def FalkorWr_2VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> {
- let Latency = 4;
- let NumMicroOps = 2;
-}
-def FalkorWr_VMUL32_2VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> {
- let Latency = 4;
- let NumMicroOps = 2;
-}
-def FalkorWr_2VXVY_5cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> {
- let Latency = 5;
- let NumMicroOps = 2;
-}
-def FalkorWr_FMUL32_2VXVY_5cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> {
- let Latency = 5;
- let NumMicroOps = 2;
-}
-def FalkorWr_2VXVY_6cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> {
- let Latency = 6;
- let NumMicroOps = 2;
-}
-def FalkorWr_FMUL64_2VXVY_6cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> {
- let Latency = 6;
- let NumMicroOps = 2;
-}
-
-def FalkorWr_1LD_1VXVY_4cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitVXVY]> {
- let Latency = 4;
- let NumMicroOps = 2;
-}
-def FalkorWr_1XYZ_1LD_4cyc : SchedWriteRes<[FalkorUnitXYZ, FalkorUnitLD]> {
- let Latency = 4;
- let NumMicroOps = 2;
-}
-def FalkorWr_2LD_3cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitLD]> {
- let Latency = 3;
- let NumMicroOps = 2;
-}
-
-def FalkorWr_1VX_1VY_5cyc : SchedWriteRes<[FalkorUnitVX, FalkorUnitVY]> {
- let Latency = 5;
- let NumMicroOps = 2;
-}
-
-def FalkorWr_1VX_1VY_2cyc : SchedWriteRes<[FalkorUnitVX, FalkorUnitVY]> {
- let Latency = 2;
- let NumMicroOps = 2;
-}
-
-def FalkorWr_1VX_1VY_4cyc : SchedWriteRes<[FalkorUnitVX, FalkorUnitVY]> {
- let Latency = 4;
- let NumMicroOps = 2;
-}
-
-def FalkorWr_1VX_1VY_10cyc : SchedWriteRes<[FalkorUnitVX, FalkorUnitVY]> {
- let Latency = 10;
- let NumMicroOps = 2;
-}
-
-def FalkorWr_1GTOV_1VXVY_2cyc : SchedWriteRes<[FalkorUnitGTOV, FalkorUnitVXVY]> {
- let Latency = 2;
- let NumMicroOps = 2;
-}
-
-def FalkorWr_2GTOV_1cyc : SchedWriteRes<[FalkorUnitGTOV, FalkorUnitGTOV]> {
- let Latency = 1;
- let NumMicroOps = 2;
-}
-
-def FalkorWr_1XYZ_1ST_4cyc: SchedWriteRes<[FalkorUnitXYZ, FalkorUnitST]> {
- let Latency = 4;
- let NumMicroOps = 2;
-}
-def FalkorWr_1XYZ_1LD_5cyc: SchedWriteRes<[FalkorUnitXYZ, FalkorUnitLD]> {
- let Latency = 5;
- let NumMicroOps = 2;
-}
-
-def FalkorWr_2XYZ_2cyc : SchedWriteRes<[FalkorUnitXYZ, FalkorUnitXYZ]> {
- let Latency = 2;
- let NumMicroOps = 2;
-}
-
-def FalkorWr_1Z_1XY_0cyc : SchedWriteRes<[FalkorUnitZ, FalkorUnitXY]> {
- let Latency = 0;
- let NumMicroOps = 2;
-}
-
-def FalkorWr_1X_1Z_8cyc : SchedWriteRes<[FalkorUnitX, FalkorUnitZ]> {
- let Latency = 8;
- let ResourceCycles = [2, 8];
-}
-
-def FalkorWr_1X_1Z_16cyc : SchedWriteRes<[FalkorUnitX, FalkorUnitZ]> {
- let Latency = 16;
- let ResourceCycles = [2, 16];
-}
-
-def FalkorWr_1LD_1Z_3cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitZ]> {
- let Latency = 3;
- let NumMicroOps = 2;
-}
-
-def FalkorWr_1LD_1none_3cyc : SchedWriteRes<[FalkorUnitLD]> {
- let Latency = 3;
- let NumMicroOps = 2;
-}
-
-def FalkorWr_1SD_1ST_0cyc: SchedWriteRes<[FalkorUnitSD, FalkorUnitST]> {
- let Latency = 0;
- let NumMicroOps = 2;
-}
-
-//===----------------------------------------------------------------------===//
-// Define 3 micro-op types
-
-def FalkorWr_1ST_1SD_1LD_0cyc : SchedWriteRes<[FalkorUnitST, FalkorUnitSD,
- FalkorUnitLD]> {
- let Latency = 0;
- let NumMicroOps = 3;
-}
-
-def FalkorWr_1ST_1SD_1LD_3cyc : SchedWriteRes<[FalkorUnitST, FalkorUnitSD,
- FalkorUnitLD]> {
- let Latency = 3;
- let NumMicroOps = 3;
-}
-
-def FalkorWr_3VXVY_3cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> {
- let Latency = 3;
- let NumMicroOps = 3;
-}
-
-def FalkorWr_3VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> {
- let Latency = 4;
- let NumMicroOps = 3;
-}
-
-def FalkorWr_3VXVY_5cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> {
- let Latency = 5;
- let NumMicroOps = 3;
-}
-
-def FalkorWr_3VXVY_6cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> {
- let Latency = 6;
- let NumMicroOps = 3;
-}
-
-def FalkorWr_1LD_2VXVY_4cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitVXVY]> {
- let Latency = 4;
- let NumMicroOps = 3;
-}
-
-def FalkorWr_2LD_1none_3cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitLD]> {
- let Latency = 3;
- let NumMicroOps = 3;
-}
-
-def FalkorWr_3LD_3cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitLD,
- FalkorUnitLD]> {
- let Latency = 3;
- let NumMicroOps = 3;
-}
-
-def FalkorWr_2LD_1Z_3cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitLD,
- FalkorUnitZ]> {
- let Latency = 3;
- let NumMicroOps = 3;
-}
-
-//===----------------------------------------------------------------------===//
-// Define 4 micro-op types
-
-def FalkorWr_2VX_2VY_2cyc : SchedWriteRes<[FalkorUnitVX, FalkorUnitVY,
- FalkorUnitVX, FalkorUnitVY]> {
- let Latency = 2;
- let NumMicroOps = 4;
-}
-
-def FalkorWr_4VXVY_2cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY,
- FalkorUnitVXVY, FalkorUnitVXVY]> {
- let Latency = 2;
- let NumMicroOps = 4;
-}
-def FalkorWr_4VXVY_3cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY,
- FalkorUnitVXVY, FalkorUnitVXVY]> {
- let Latency = 3;
- let NumMicroOps = 4;
-}
-def FalkorWr_4VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY,
- FalkorUnitVXVY, FalkorUnitVXVY]> {
- let Latency = 4;
- let NumMicroOps = 4;
-}
-def FalkorWr_4VXVY_6cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY,
- FalkorUnitVXVY, FalkorUnitVXVY]> {
- let Latency = 6;
- let NumMicroOps = 4;
-}
-
-def FalkorWr_4LD_3cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitLD,
- FalkorUnitLD, FalkorUnitLD]> {
- let Latency = 3;
- let NumMicroOps = 4;
-}
-
-def FalkorWr_1LD_3VXVY_4cyc: SchedWriteRes<[FalkorUnitLD, FalkorUnitVXVY,
- FalkorUnitVXVY, FalkorUnitVXVY]> {
- let Latency = 4;
- let NumMicroOps = 4;
-}
-
-def FalkorWr_2LD_2none_3cyc: SchedWriteRes<[FalkorUnitLD, FalkorUnitLD]> {
- let Latency = 3;
- let NumMicroOps = 4;
-}
-
-def FalkorWr_2LD_1ST_1SD_3cyc: SchedWriteRes<[FalkorUnitLD, FalkorUnitST,
- FalkorUnitSD, FalkorUnitLD]> {
- let Latency = 3;
- let NumMicroOps = 4;
-}
-
-//===----------------------------------------------------------------------===//
-// Define 5 micro-op types
-
-def FalkorWr_1LD_4VXVY_4cyc: SchedWriteRes<[FalkorUnitLD, FalkorUnitVXVY,
- FalkorUnitVXVY, FalkorUnitVXVY,
- FalkorUnitVXVY]> {
- let Latency = 4;
- let NumMicroOps = 5;
-}
-def FalkorWr_2LD_2VXVY_1none_4cyc: SchedWriteRes<[FalkorUnitLD, FalkorUnitLD,
- FalkorUnitVXVY, FalkorUnitVXVY]> {
- let Latency = 4;
- let NumMicroOps = 5;
-}
-def FalkorWr_5VXVY_7cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY,
- FalkorUnitVXVY, FalkorUnitVXVY,
- FalkorUnitVXVY]> {
- let Latency = 7;
- let NumMicroOps = 5;
-}
-
-//===----------------------------------------------------------------------===//
-// Define 6 micro-op types
-
-def FalkorWr_2LD_2VXVY_2none_4cyc: SchedWriteRes<[FalkorUnitLD, FalkorUnitLD,
- FalkorUnitVXVY, FalkorUnitVXVY]> {
- let Latency = 4;
- let NumMicroOps = 6;
-}
-
-def FalkorWr_2XYZ_2ST_2VSD_0cyc: SchedWriteRes<[FalkorUnitXYZ, FalkorUnitST,
- FalkorUnitVSD, FalkorUnitXYZ,
- FalkorUnitST, FalkorUnitVSD]> {
- let Latency = 0;
- let NumMicroOps = 6;
-}
-
-//===----------------------------------------------------------------------===//
-// Define 8 micro-op types
-
-def FalkorWr_2LD_2VXVY_2LD_2VXVY_4cyc:SchedWriteRes<[FalkorUnitLD, FalkorUnitLD,
- FalkorUnitVXVY, FalkorUnitVXVY,
- FalkorUnitLD, FalkorUnitLD,
- FalkorUnitVXVY, FalkorUnitVXVY]> {
- let Latency = 4;
- let NumMicroOps = 8;
-}
-
-//===----------------------------------------------------------------------===//
-// Define 9 micro-op types
-
-def FalkorWr_2LD_2VXVY_2LD_1XYZ_2VXVY_4cyc:SchedWriteRes<[FalkorUnitLD,
- FalkorUnitLD, FalkorUnitVXVY,
- FalkorUnitVXVY, FalkorUnitLD,
- FalkorUnitLD, FalkorUnitXYZ,
- FalkorUnitVXVY, FalkorUnitVXVY]> {
- let Latency = 4;
- let NumMicroOps = 9;
-}
-
-def FalkorWr_2LD_2VXVY_1XYZ_2LD_2VXVY_4cyc:SchedWriteRes<[FalkorUnitLD,
- FalkorUnitLD, FalkorUnitVXVY,
- FalkorUnitVXVY, FalkorUnitXYZ,
- FalkorUnitLD, FalkorUnitLD,
- FalkorUnitVXVY, FalkorUnitVXVY]> {
- let Latency = 4;
- let NumMicroOps = 9;
-}
-
-// Forwarding logic is modeled for multiply add/accumulate.
-// -----------------------------------------------------------------------------
-def FalkorReadIMA32 : SchedReadAdvance<3, [FalkorWr_IMUL32_1X_2cyc]>;
-def FalkorReadIMA64 : SchedReadAdvance<4, [FalkorWr_IMUL64_1X_4cyc, FalkorWr_IMUL64_1X_5cyc]>;
-def FalkorReadVMA : SchedReadAdvance<3, [FalkorWr_VMUL32_1VXVY_4cyc, FalkorWr_VMUL32_2VXVY_4cyc]>;
-def FalkorReadFMA32 : SchedReadAdvance<1, [FalkorWr_FMUL32_1VXVY_5cyc, FalkorWr_FMUL32_2VXVY_5cyc]>;
-def FalkorReadFMA64 : SchedReadAdvance<2, [FalkorWr_FMUL64_1VXVY_6cyc, FalkorWr_FMUL64_2VXVY_6cyc]>;
-
-// SchedPredicates and WriteVariants for Immediate Zero and LSLFast
-// -----------------------------------------------------------------------------
-def FalkorImmZPred : SchedPredicate<[{MI->getOperand(1).getImm() == 0}]>;
-def FalkorLSLFastPred : SchedPredicate<[{TII->isFalkorLSLFast(*MI)}]>;
-
-def FalkorWr_FMOV : SchedWriteVariant<[
- SchedVar<FalkorImmZPred, [FalkorWr_1none_0cyc]>,
- SchedVar<NoSchedPred, [FalkorWr_1GTOV_1cyc]>]>;
-
-def FalkorWr_MOVZ : SchedWriteVariant<[
- SchedVar<FalkorImmZPred, [FalkorWr_1none_0cyc]>,
- SchedVar<NoSchedPred, [FalkorWr_1XYZB_1cyc]>]>;
-
-def FalkorWr_LDR : SchedWriteVariant<[
- SchedVar<FalkorLSLFastPred, [FalkorWr_1LD_3cyc]>,
- SchedVar<NoSchedPred, [FalkorWr_1XYZ_1LD_4cyc]>]>;
-
-def FalkorWr_ADD : SchedWriteVariant<[
- SchedVar<FalkorLSLFastPred, [FalkorWr_1XYZ_1cyc]>,
- SchedVar<NoSchedPred, [FalkorWr_2XYZ_2cyc]>]>;
-
-def FalkorWr_PRFM : SchedWriteVariant<[
- SchedVar<FalkorLSLFastPred, [FalkorWr_1ST_3cyc]>,
- SchedVar<NoSchedPred, [FalkorWr_1XYZ_1ST_4cyc]>]>;
-
-def FalkorWr_LDRS : SchedWriteVariant<[
- SchedVar<FalkorLSLFastPred, [FalkorWr_1LD_4cyc]>,
- SchedVar<NoSchedPred, [FalkorWr_1XYZ_1LD_5cyc]>]>;
diff --git a/lib/Target/AArch64/AArch64Subtarget.cpp b/lib/Target/AArch64/AArch64Subtarget.cpp
index b369ee7e4ba2..d3cab1ad3397 100644
--- a/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -90,7 +90,6 @@ void AArch64Subtarget::initializeProperties() {
break;
case Falkor:
MaxInterleaveFactor = 4;
- VectorInsertExtractBaseCost = 2;
// FIXME: remove this to enable 64-bit SLP if performance looks good.
MinVectorRegisterBitWidth = 128;
break;
diff --git a/lib/Target/AArch64/AArch64TargetMachine.cpp b/lib/Target/AArch64/AArch64TargetMachine.cpp
index 132f192f2a9a..cb3f72a524f5 100644
--- a/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -10,10 +10,10 @@
//
//===----------------------------------------------------------------------===//
+#include "AArch64TargetMachine.h"
#include "AArch64.h"
#include "AArch64MacroFusion.h"
#include "AArch64Subtarget.h"
-#include "AArch64TargetMachine.h"
#include "AArch64TargetObjectFile.h"
#include "AArch64TargetTransformInfo.h"
#include "MCTargetDesc/AArch64MCTargetDesc.h"
@@ -23,6 +23,7 @@
#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
#include "llvm/CodeGen/GlobalISel/Legalizer.h"
+#include "llvm/CodeGen/GlobalISel/Localizer.h"
#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/Passes.h"
@@ -277,7 +278,7 @@ public:
ScheduleDAGInstrs *
createPostMachineScheduler(MachineSchedContext *C) const override {
const AArch64Subtarget &ST = C->MF->getSubtarget<AArch64Subtarget>();
- if (ST.hasFuseLiterals()) {
+ if (ST.hasFuseAES() || ST.hasFuseLiterals()) {
// Run the Macro Fusion after RA again since literals are expanded from
// pseudos then (v. addPreSched2()).
ScheduleDAGMI *DAG = createGenericSchedPostRA(C);
@@ -295,6 +296,7 @@ public:
bool addIRTranslator() override;
bool addLegalizeMachineIR() override;
bool addRegBankSelect() override;
+ void addPreGlobalInstructionSelect() override;
bool addGlobalInstructionSelect() override;
#endif
bool addILPOpts() override;
@@ -404,6 +406,12 @@ bool AArch64PassConfig::addRegBankSelect() {
return false;
}
+void AArch64PassConfig::addPreGlobalInstructionSelect() {
+ // Workaround the deficiency of the fast register allocator.
+ if (TM->getOptLevel() == CodeGenOpt::None)
+ addPass(new Localizer());
+}
+
bool AArch64PassConfig::addGlobalInstructionSelect() {
addPass(new InstructionSelect());
return false;
diff --git a/lib/Target/AMDGPU/AMDGPU.td b/lib/Target/AMDGPU/AMDGPU.td
index b279bd61e180..e7ebb37a9d62 100644
--- a/lib/Target/AMDGPU/AMDGPU.td
+++ b/lib/Target/AMDGPU/AMDGPU.td
@@ -425,7 +425,7 @@ def FeatureGFX9 : SubtargetFeatureGeneration<"GFX9",
FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts,
FeatureSMemRealTime, FeatureScalarStores, FeatureInv2PiInlineImm,
FeatureApertureRegs, FeatureGFX9Insts, FeatureVOP3P, FeatureVGPRIndexMode,
- FeatureFastFMAF32, FeatureDPP,
+ FeatureFastFMAF32, FeatureSDWA, FeatureDPP,
FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts
]
>;
@@ -534,10 +534,12 @@ def AMDGPUAsmVariants {
int VOP3_ID = 1;
string SDWA = "SDWA";
int SDWA_ID = 2;
+ string SDWA9 = "SDWA9";
+ int SDWA9_ID = 3;
string DPP = "DPP";
- int DPP_ID = 3;
+ int DPP_ID = 4;
string Disable = "Disable";
- int Disable_ID = 4;
+ int Disable_ID = 5;
}
def DefaultAMDGPUAsmParserVariant : AsmParserVariant {
@@ -555,6 +557,12 @@ def SDWAAsmParserVariant : AsmParserVariant {
let Name = AMDGPUAsmVariants.SDWA;
}
+def SDWA9AsmParserVariant : AsmParserVariant {
+ let Variant = AMDGPUAsmVariants.SDWA9_ID;
+ let Name = AMDGPUAsmVariants.SDWA9;
+}
+
+
def DPPAsmParserVariant : AsmParserVariant {
let Variant = AMDGPUAsmVariants.DPP_ID;
let Name = AMDGPUAsmVariants.DPP;
@@ -567,6 +575,7 @@ def AMDGPU : Target {
let AssemblyParserVariants = [DefaultAMDGPUAsmParserVariant,
VOP3AsmParserVariant,
SDWAAsmParserVariant,
+ SDWA9AsmParserVariant,
DPPAsmParserVariant];
let AssemblyWriters = [AMDGPUAsmWriter];
}
@@ -607,7 +616,10 @@ def HasVOP3PInsts : Predicate<"Subtarget->hasVOP3PInsts()">,
AssemblerPredicate<"FeatureVOP3P">;
def HasSDWA : Predicate<"Subtarget->hasSDWA()">,
- AssemblerPredicate<"FeatureSDWA">;
+ AssemblerPredicate<"FeatureSDWA,FeatureVolcanicIslands">;
+
+def HasSDWA9 : Predicate<"Subtarget->hasSDWA()">,
+ AssemblerPredicate<"FeatureSDWA,FeatureGFX9">;
def HasDPP : Predicate<"Subtarget->hasDPP()">,
AssemblerPredicate<"FeatureDPP">;
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 5ec46a8294c0..723e8a7b54e2 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -127,6 +127,29 @@ EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
}
+bool AMDGPUTargetLowering::isOrEquivalentToAdd(SelectionDAG &DAG, SDValue Op)
+{
+ assert(Op.getOpcode() == ISD::OR);
+
+ SDValue N0 = Op->getOperand(0);
+ SDValue N1 = Op->getOperand(1);
+ EVT VT = N0.getValueType();
+
+ if (VT.isInteger() && !VT.isVector()) {
+ KnownBits LHSKnown, RHSKnown;
+ DAG.computeKnownBits(N0, LHSKnown);
+
+ if (LHSKnown.Zero.getBoolValue()) {
+ DAG.computeKnownBits(N1, RHSKnown);
+
+ if (!(~RHSKnown.Zero & ~LHSKnown.Zero))
+ return true;
+ }
+ }
+
+ return false;
+}
+
AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
const AMDGPUSubtarget &STI)
: TargetLowering(TM), Subtarget(&STI) {
@@ -2596,8 +2619,6 @@ SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
EVT VT = N->getValueType(0);
- if (VT != MVT::i64)
- return SDValue();
ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
if (!RHS)
@@ -2618,6 +2639,8 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
case ISD::SIGN_EXTEND:
case ISD::ANY_EXTEND: {
// shl (ext x) => zext (shl x), if shift does not overflow int
+ if (VT != MVT::i64)
+ break;
KnownBits Known;
SDValue X = LHS->getOperand(0);
DAG.computeKnownBits(X, Known);
@@ -2628,8 +2651,23 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(RHS, 0));
return DAG.getZExtOrTrunc(Shl, SL, VT);
}
+ case ISD::OR: if (!isOrEquivalentToAdd(DAG, LHS)) break;
+ case ISD::ADD: { // Fall through from above
+ // shl (or|add x, c2), c1 => or|add (shl x, c1), (c2 << c1)
+ if (ConstantSDNode *C2 = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
+ SDValue Shl = DAG.getNode(ISD::SHL, SL, VT, LHS->getOperand(0),
+ SDValue(RHS, 0));
+ SDValue C2V = DAG.getConstant(C2->getAPIntValue() << RHSVal,
+ SDLoc(C2), VT);
+ return DAG.getNode(LHS->getOpcode(), SL, VT, Shl, C2V);
+ }
+ break;
+ }
}
+ if (VT != MVT::i64)
+ return SDValue();
+
// i64 (shl x, C) -> (build_pair 0, (shl x, C -32))
// On some subtargets, 64-bit shift is a quarter rate instruction. In the
@@ -3440,7 +3478,8 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
DL);
}
- if ((OffsetVal + WidthVal) >= 32) {
+ if ((OffsetVal + WidthVal) >= 32 &&
+ !(Subtarget->hasSDWA() && OffsetVal == 16 && WidthVal == 16)) {
SDValue ShiftVal = DAG.getConstant(OffsetVal, DL, MVT::i32);
return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32,
BitsFrom, ShiftVal);
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h
index fb2f15022d25..0d066cdbdff4 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -34,6 +34,9 @@ private:
/// compare.
SDValue getFFBH_U32(SelectionDAG &DAG, SDValue Op, const SDLoc &DL) const;
+public:
+ static bool isOrEquivalentToAdd(SelectionDAG &DAG, SDValue Op);
+
protected:
const AMDGPUSubtarget *Subtarget;
AMDGPUAS AMDGPUASI;
diff --git a/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 9de302994e68..57905be18813 100644
--- a/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -36,6 +36,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo() {
setAction({G_CONSTANT, S32}, Legal);
setAction({G_CONSTANT, S64}, Legal);
+ setAction({G_FCONSTANT, S32}, Legal);
+
setAction({G_GEP, P1}, Legal);
setAction({G_GEP, P2}, Legal);
setAction({G_GEP, 1, S64}, Legal);
diff --git a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 85184b363905..07f92918a43f 100644
--- a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -97,6 +97,9 @@ private:
Instruction *UseInst,
int OpIdx0, int OpIdx1) const;
+ /// Check whether we have enough local memory for promotion.
+ bool hasSufficientLocalMem(const Function &F);
+
public:
static char ID;
@@ -107,7 +110,7 @@ public:
StringRef getPassName() const override { return "AMDGPU Promote Alloca"; }
- void handleAlloca(AllocaInst &I);
+ bool handleAlloca(AllocaInst &I, bool SufficientLDS);
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
@@ -147,105 +150,21 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F);
if (!ST.isPromoteAllocaEnabled())
return false;
- AS = AMDGPU::getAMDGPUAS(*F.getParent());
-
- FunctionType *FTy = F.getFunctionType();
-
- // If the function has any arguments in the local address space, then it's
- // possible these arguments require the entire local memory space, so
- // we cannot use local memory in the pass.
- for (Type *ParamTy : FTy->params()) {
- PointerType *PtrTy = dyn_cast<PointerType>(ParamTy);
- if (PtrTy && PtrTy->getAddressSpace() == AS.LOCAL_ADDRESS) {
- LocalMemLimit = 0;
- DEBUG(dbgs() << "Function has local memory argument. Promoting to "
- "local memory disabled.\n");
- return false;
- }
- }
-
- LocalMemLimit = ST.getLocalMemorySize();
- if (LocalMemLimit == 0)
- return false;
-
- const DataLayout &DL = Mod->getDataLayout();
-
- // Check how much local memory is being used by global objects
- CurrentLocalMemUsage = 0;
- for (GlobalVariable &GV : Mod->globals()) {
- if (GV.getType()->getAddressSpace() != AS.LOCAL_ADDRESS)
- continue;
-
- for (const User *U : GV.users()) {
- const Instruction *Use = dyn_cast<Instruction>(U);
- if (!Use)
- continue;
-
- if (Use->getParent()->getParent() == &F) {
- unsigned Align = GV.getAlignment();
- if (Align == 0)
- Align = DL.getABITypeAlignment(GV.getValueType());
- // FIXME: Try to account for padding here. The padding is currently
- // determined from the inverse order of uses in the function. I'm not
- // sure if the use list order is in any way connected to this, so the
- // total reported size is likely incorrect.
- uint64_t AllocSize = DL.getTypeAllocSize(GV.getValueType());
- CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Align);
- CurrentLocalMemUsage += AllocSize;
- break;
- }
- }
- }
-
- unsigned MaxOccupancy = ST.getOccupancyWithLocalMemSize(CurrentLocalMemUsage,
- F);
-
- // Restrict local memory usage so that we don't drastically reduce occupancy,
- // unless it is already significantly reduced.
-
- // TODO: Have some sort of hint or other heuristics to guess occupancy based
- // on other factors..
- unsigned OccupancyHint = ST.getWavesPerEU(F).second;
- if (OccupancyHint == 0)
- OccupancyHint = 7;
-
- // Clamp to max value.
- OccupancyHint = std::min(OccupancyHint, ST.getMaxWavesPerEU());
-
- // Check the hint but ignore it if it's obviously wrong from the existing LDS
- // usage.
- MaxOccupancy = std::min(OccupancyHint, MaxOccupancy);
-
-
- // Round up to the next tier of usage.
- unsigned MaxSizeWithWaveCount
- = ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy, F);
-
- // Program is possibly broken by using more local mem than available.
- if (CurrentLocalMemUsage > MaxSizeWithWaveCount)
- return false;
-
- LocalMemLimit = MaxSizeWithWaveCount;
-
- DEBUG(
- dbgs() << F.getName() << " uses " << CurrentLocalMemUsage << " bytes of LDS\n"
- << " Rounding size to " << MaxSizeWithWaveCount
- << " with a maximum occupancy of " << MaxOccupancy << '\n'
- << " and " << (LocalMemLimit - CurrentLocalMemUsage)
- << " available for promotion\n"
- );
+ AS = AMDGPU::getAMDGPUAS(*F.getParent());
+ bool SufficientLDS = hasSufficientLocalMem(F);
+ bool Changed = false;
BasicBlock &EntryBB = *F.begin();
for (auto I = EntryBB.begin(), E = EntryBB.end(); I != E; ) {
AllocaInst *AI = dyn_cast<AllocaInst>(I);
++I;
if (AI)
- handleAlloca(*AI);
+ Changed |= handleAlloca(*AI, SufficientLDS);
}
- return true;
+ return Changed;
}
std::pair<Value *, Value *>
@@ -661,12 +580,105 @@ bool AMDGPUPromoteAlloca::collectUsesWithPtrTypes(
return true;
}
+bool AMDGPUPromoteAlloca::hasSufficientLocalMem(const Function &F) {
+
+ FunctionType *FTy = F.getFunctionType();
+ const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F);
+
+ // If the function has any arguments in the local address space, then it's
+ // possible these arguments require the entire local memory space, so
+ // we cannot use local memory in the pass.
+ for (Type *ParamTy : FTy->params()) {
+ PointerType *PtrTy = dyn_cast<PointerType>(ParamTy);
+ if (PtrTy && PtrTy->getAddressSpace() == AS.LOCAL_ADDRESS) {
+ LocalMemLimit = 0;
+ DEBUG(dbgs() << "Function has local memory argument. Promoting to "
+ "local memory disabled.\n");
+ return false;
+ }
+ }
+
+ LocalMemLimit = ST.getLocalMemorySize();
+ if (LocalMemLimit == 0)
+ return false;
+
+ const DataLayout &DL = Mod->getDataLayout();
+
+ // Check how much local memory is being used by global objects
+ CurrentLocalMemUsage = 0;
+ for (GlobalVariable &GV : Mod->globals()) {
+ if (GV.getType()->getAddressSpace() != AS.LOCAL_ADDRESS)
+ continue;
+
+ for (const User *U : GV.users()) {
+ const Instruction *Use = dyn_cast<Instruction>(U);
+ if (!Use)
+ continue;
+
+ if (Use->getParent()->getParent() == &F) {
+ unsigned Align = GV.getAlignment();
+ if (Align == 0)
+ Align = DL.getABITypeAlignment(GV.getValueType());
+
+ // FIXME: Try to account for padding here. The padding is currently
+ // determined from the inverse order of uses in the function. I'm not
+ // sure if the use list order is in any way connected to this, so the
+ // total reported size is likely incorrect.
+ uint64_t AllocSize = DL.getTypeAllocSize(GV.getValueType());
+ CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Align);
+ CurrentLocalMemUsage += AllocSize;
+ break;
+ }
+ }
+ }
+
+ unsigned MaxOccupancy = ST.getOccupancyWithLocalMemSize(CurrentLocalMemUsage,
+ F);
+
+ // Restrict local memory usage so that we don't drastically reduce occupancy,
+ // unless it is already significantly reduced.
+
+ // TODO: Have some sort of hint or other heuristics to guess occupancy based
+ // on other factors..
+ unsigned OccupancyHint = ST.getWavesPerEU(F).second;
+ if (OccupancyHint == 0)
+ OccupancyHint = 7;
+
+ // Clamp to max value.
+ OccupancyHint = std::min(OccupancyHint, ST.getMaxWavesPerEU());
+
+ // Check the hint but ignore it if it's obviously wrong from the existing LDS
+ // usage.
+ MaxOccupancy = std::min(OccupancyHint, MaxOccupancy);
+
+
+ // Round up to the next tier of usage.
+ unsigned MaxSizeWithWaveCount
+ = ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy, F);
+
+ // Program is possibly broken by using more local mem than available.
+ if (CurrentLocalMemUsage > MaxSizeWithWaveCount)
+ return false;
+
+ LocalMemLimit = MaxSizeWithWaveCount;
+
+ DEBUG(
+ dbgs() << F.getName() << " uses " << CurrentLocalMemUsage << " bytes of LDS\n"
+ << " Rounding size to " << MaxSizeWithWaveCount
+ << " with a maximum occupancy of " << MaxOccupancy << '\n'
+ << " and " << (LocalMemLimit - CurrentLocalMemUsage)
+ << " available for promotion\n"
+ );
+
+ return true;
+}
+
// FIXME: Should try to pick the most likely to be profitable allocas first.
-void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
+bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) {
// Array allocations are probably not worth handling, since an allocation of
// the array type is the canonical form.
if (!I.isStaticAlloca() || I.isArrayAllocation())
- return;
+ return false;
IRBuilder<> Builder(&I);
@@ -675,10 +687,8 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
DEBUG(dbgs() << "Trying to promote " << I << '\n');
- if (tryPromoteAllocaToVector(&I, AS)) {
- DEBUG(dbgs() << " alloca is not a candidate for vectorization.\n");
- return;
- }
+ if (tryPromoteAllocaToVector(&I, AS))
+ return true; // Promoted to vector.
const Function &ContainingFunction = *I.getParent()->getParent();
CallingConv::ID CC = ContainingFunction.getCallingConv();
@@ -692,9 +702,13 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
break;
default:
DEBUG(dbgs() << " promote alloca to LDS not supported with calling convention.\n");
- return;
+ return false;
}
+ // Not likely to have sufficient local memory for promotion.
+ if (!SufficientLDS)
+ return false;
+
const AMDGPUSubtarget &ST =
TM->getSubtarget<AMDGPUSubtarget>(ContainingFunction);
unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(ContainingFunction).second;
@@ -718,7 +732,7 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
if (NewSize > LocalMemLimit) {
DEBUG(dbgs() << " " << AllocSize
<< " bytes of local memory not available to promote\n");
- return;
+ return false;
}
CurrentLocalMemUsage = NewSize;
@@ -727,7 +741,7 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
if (!collectUsesWithPtrTypes(&I, &I, WorkList)) {
DEBUG(dbgs() << " Do not know how to convert all uses\n");
- return;
+ return false;
}
DEBUG(dbgs() << "Promoting alloca to local memory\n");
@@ -873,6 +887,7 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
llvm_unreachable("Don't know how to promote alloca intrinsic use.");
}
}
+ return true;
}
FunctionPass *llvm::createAMDGPUPromoteAlloca() {
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h
index e543cae07ada..660879426810 100644
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -416,6 +416,10 @@ public:
return getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
}
+ bool hasSDWA() const {
+ return HasSDWA;
+ }
+
/// \brief Returns the offset in bytes from the start of the input buffer
/// of the first explicit kernel argument.
unsigned getExplicitKernelArgOffset(const MachineFunction &MF) const {
@@ -670,10 +674,6 @@ public:
return HasInv2PiInlineImm;
}
- bool hasSDWA() const {
- return HasSDWA;
- }
-
bool hasDPP() const {
return HasDPP;
}
diff --git a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index b52ea2b3a2c6..f5541e08e1b7 100644
--- a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -881,6 +881,10 @@ public:
return AMDGPU::isVI(getSTI());
}
+ bool isGFX9() const {
+ return AMDGPU::isGFX9(getSTI());
+ }
+
bool hasInv2PiInlineImm() const {
return getFeatureBits()[AMDGPU::FeatureInv2PiInlineImm];
}
@@ -989,7 +993,6 @@ private:
bool usesConstantBus(const MCInst &Inst, unsigned OpIdx);
bool isInlineConstant(const MCInst &Inst, unsigned OpIdx) const;
unsigned findImplicitSGPRReadInVOP(const MCInst &Inst) const;
- bool isSGPR(unsigned Reg);
public:
OperandMatchResultTy parseOptionalOperand(OperandVector &Operands);
@@ -1042,9 +1045,10 @@ public:
OperandMatchResultTy parseSDWADstUnused(OperandVector &Operands);
void cvtSdwaVOP1(MCInst &Inst, const OperandVector &Operands);
void cvtSdwaVOP2(MCInst &Inst, const OperandVector &Operands);
+ void cvtSdwaVOP2b(MCInst &Inst, const OperandVector &Operands);
void cvtSdwaVOPC(MCInst &Inst, const OperandVector &Operands);
void cvtSDWA(MCInst &Inst, const OperandVector &Operands,
- uint64_t BasicInstType);
+ uint64_t BasicInstType, bool skipVcc = false);
};
struct OptionalOperand {
@@ -1966,7 +1970,8 @@ ArrayRef<unsigned> AMDGPUAsmParser::getMatchedVariants() const {
}
if (isForcedSDWA()) {
- static const unsigned Variants[] = {AMDGPUAsmVariants::SDWA};
+ static const unsigned Variants[] = {AMDGPUAsmVariants::SDWA,
+ AMDGPUAsmVariants::SDWA9};
return makeArrayRef(Variants);
}
@@ -1977,7 +1982,7 @@ ArrayRef<unsigned> AMDGPUAsmParser::getMatchedVariants() const {
static const unsigned Variants[] = {
AMDGPUAsmVariants::DEFAULT, AMDGPUAsmVariants::VOP3,
- AMDGPUAsmVariants::SDWA, AMDGPUAsmVariants::DPP
+ AMDGPUAsmVariants::SDWA, AMDGPUAsmVariants::SDWA9, AMDGPUAsmVariants::DPP
};
return makeArrayRef(Variants);
@@ -2000,14 +2005,6 @@ unsigned AMDGPUAsmParser::findImplicitSGPRReadInVOP(const MCInst &Inst) const {
return AMDGPU::NoRegister;
}
-bool AMDGPUAsmParser::isSGPR(unsigned Reg) {
- const MCRegisterInfo *TRI = getContext().getRegisterInfo();
- const MCRegisterClass SGPRClass = TRI->getRegClass(AMDGPU::SReg_32RegClassID);
- const unsigned FirstSubReg = TRI->getSubReg(Reg, 1);
- return SGPRClass.contains(FirstSubReg != 0 ? FirstSubReg : Reg) ||
- Reg == AMDGPU::SCC;
-}
-
// NB: This code is correct only when used to check constant
// bus limitations because GFX7 support no f16 inline constants.
// Note that there are no cases when a GFX7 opcode violates
@@ -2049,7 +2046,8 @@ bool AMDGPUAsmParser::usesConstantBus(const MCInst &Inst, unsigned OpIdx) {
if (MO.isImm()) {
return !isInlineConstant(Inst, OpIdx);
}
- return !MO.isReg() || isSGPR(mc2PseudoReg(MO.getReg()));
+ return !MO.isReg() ||
+ isSGPR(mc2PseudoReg(MO.getReg()), getContext().getRegisterInfo());
}
bool AMDGPUAsmParser::validateOperandLimitations(const MCInst &Inst) {
@@ -2060,7 +2058,8 @@ bool AMDGPUAsmParser::validateOperandLimitations(const MCInst &Inst) {
if (Desc.TSFlags &
(SIInstrFlags::VOPC |
SIInstrFlags::VOP1 | SIInstrFlags::VOP2 |
- SIInstrFlags::VOP3 | SIInstrFlags::VOP3P)) {
+ SIInstrFlags::VOP3 | SIInstrFlags::VOP3P |
+ SIInstrFlags::SDWA)) {
// Check special imm operands (used by madmk, etc)
if (AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm) != -1) {
@@ -4151,14 +4150,19 @@ void AMDGPUAsmParser::cvtSdwaVOP2(MCInst &Inst, const OperandVector &Operands) {
cvtSDWA(Inst, Operands, SIInstrFlags::VOP2);
}
+void AMDGPUAsmParser::cvtSdwaVOP2b(MCInst &Inst, const OperandVector &Operands) {
+ cvtSDWA(Inst, Operands, SIInstrFlags::VOP2, true);
+}
+
void AMDGPUAsmParser::cvtSdwaVOPC(MCInst &Inst, const OperandVector &Operands) {
- cvtSDWA(Inst, Operands, SIInstrFlags::VOPC);
+ cvtSDWA(Inst, Operands, SIInstrFlags::VOPC, isVI());
}
void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands,
- uint64_t BasicInstType) {
+ uint64_t BasicInstType, bool skipVcc) {
using namespace llvm::AMDGPU::SDWA;
OptionalImmIndexMap OptionalIdx;
+ bool skippedVcc = false;
unsigned I = 1;
const MCInstrDesc &Desc = MII.get(Inst.getOpcode());
@@ -4168,15 +4172,22 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands,
for (unsigned E = Operands.size(); I != E; ++I) {
AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]);
- // Add the register arguments
- if ((BasicInstType == SIInstrFlags::VOPC ||
- BasicInstType == SIInstrFlags::VOP2)&&
- Op.isReg() &&
- Op.Reg.RegNo == AMDGPU::VCC) {
- // VOPC and VOP2b (v_add_u32, v_sub_u32 ...) sdwa use "vcc" token as dst.
- // Skip it.
- continue;
- } else if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) {
+ if (skipVcc && !skippedVcc && Op.isReg() && Op.Reg.RegNo == AMDGPU::VCC) {
+ // VOP2b (v_add_u32, v_sub_u32 ...) sdwa use "vcc" token as dst.
+ // Skip it if it's 2nd (e.g. v_add_i32_sdwa v1, vcc, v2, v3)
+ // or 4th (v_addc_u32_sdwa v1, vcc, v2, v3, vcc) operand.
+ // Skip VCC only if we didn't skip it on previous iteration.
+ if (BasicInstType == SIInstrFlags::VOP2 &&
+ (Inst.getNumOperands() == 1 || Inst.getNumOperands() == 5)) {
+ skippedVcc = true;
+ continue;
+ } else if (BasicInstType == SIInstrFlags::VOPC &&
+ Inst.getNumOperands() == 0) {
+ skippedVcc = true;
+ continue;
+ }
+ }
+ if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) {
Op.addRegWithInputModsOperands(Inst, 2);
} else if (Op.isImm()) {
// Handle optional arguments
@@ -4184,20 +4195,30 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands,
} else {
llvm_unreachable("Invalid operand type");
}
+ skippedVcc = false;
}
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI, 0);
-
- if (Inst.getOpcode() != AMDGPU::V_NOP_sdwa_vi) {
+ if (Inst.getOpcode() != AMDGPU::V_NOP_sdwa_gfx9 &&
+ Inst.getOpcode() != AMDGPU::V_NOP_sdwa_vi) {
// V_NOP_sdwa_vi has no optional sdwa arguments
switch (BasicInstType) {
case SIInstrFlags::VOP1:
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI, 0);
+ if (isGFX9() &&
+ AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::omod) != -1) {
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOModSI, 0);
+ }
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstSel, SdwaSel::DWORD);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstUnused, DstUnused::UNUSED_PRESERVE);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc0Sel, SdwaSel::DWORD);
break;
case SIInstrFlags::VOP2:
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI, 0);
+ if (isGFX9() &&
+ AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::omod) != -1) {
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOModSI, 0);
+ }
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstSel, SdwaSel::DWORD);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstUnused, DstUnused::UNUSED_PRESERVE);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc0Sel, SdwaSel::DWORD);
@@ -4205,6 +4226,9 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands,
break;
case SIInstrFlags::VOPC:
+ if (isVI()) {
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI, 0);
+ }
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc0Sel, SdwaSel::DWORD);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc1Sel, SdwaSel::DWORD);
break;
@@ -4220,10 +4244,9 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands,
Inst.getOpcode() == AMDGPU::V_MAC_F16_sdwa_vi) {
auto it = Inst.begin();
std::advance(
- it, AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::src2));
+ it, AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::src2));
Inst.insert(it, Inst.getOperand(0)); // src2 = dst
}
-
}
/// Force static initialization.
diff --git a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index 137b5cca96ce..9b3cde7c4df6 100644
--- a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -62,32 +62,33 @@ static DecodeStatus decodeSoppBrTarget(MCInst &Inst, unsigned Imm,
return addOperand(Inst, MCOperand::createImm(Imm));
}
-#define DECODE_OPERAND2(RegClass, DecName) \
-static DecodeStatus Decode##RegClass##RegisterClass(MCInst &Inst, \
- unsigned Imm, \
- uint64_t /*Addr*/, \
- const void *Decoder) { \
+#define DECODE_OPERAND(StaticDecoderName, DecoderName) \
+static DecodeStatus StaticDecoderName(MCInst &Inst, \
+ unsigned Imm, \
+ uint64_t /*Addr*/, \
+ const void *Decoder) { \
auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder); \
- return addOperand(Inst, DAsm->decodeOperand_##DecName(Imm)); \
+ return addOperand(Inst, DAsm->DecoderName(Imm)); \
}
-#define DECODE_OPERAND(RegClass) DECODE_OPERAND2(RegClass, RegClass)
+#define DECODE_OPERAND_REG(RegClass) \
+DECODE_OPERAND(Decode##RegClass##RegisterClass, decodeOperand_##RegClass)
-DECODE_OPERAND(VGPR_32)
-DECODE_OPERAND(VS_32)
-DECODE_OPERAND(VS_64)
+DECODE_OPERAND_REG(VGPR_32)
+DECODE_OPERAND_REG(VS_32)
+DECODE_OPERAND_REG(VS_64)
-DECODE_OPERAND(VReg_64)
-DECODE_OPERAND(VReg_96)
-DECODE_OPERAND(VReg_128)
+DECODE_OPERAND_REG(VReg_64)
+DECODE_OPERAND_REG(VReg_96)
+DECODE_OPERAND_REG(VReg_128)
-DECODE_OPERAND(SReg_32)
-DECODE_OPERAND(SReg_32_XM0_XEXEC)
-DECODE_OPERAND(SReg_64)
-DECODE_OPERAND(SReg_64_XEXEC)
-DECODE_OPERAND(SReg_128)
-DECODE_OPERAND(SReg_256)
-DECODE_OPERAND(SReg_512)
+DECODE_OPERAND_REG(SReg_32)
+DECODE_OPERAND_REG(SReg_32_XM0_XEXEC)
+DECODE_OPERAND_REG(SReg_64)
+DECODE_OPERAND_REG(SReg_64_XEXEC)
+DECODE_OPERAND_REG(SReg_128)
+DECODE_OPERAND_REG(SReg_256)
+DECODE_OPERAND_REG(SReg_512)
static DecodeStatus decodeOperand_VSrc16(MCInst &Inst,
@@ -106,6 +107,13 @@ static DecodeStatus decodeOperand_VSrcV216(MCInst &Inst,
return addOperand(Inst, DAsm->decodeOperand_VSrcV216(Imm));
}
+#define DECODE_SDWA9(DecName) \
+DECODE_OPERAND(decodeSDWA9##DecName, decodeSDWA9##DecName)
+
+DECODE_SDWA9(Src32)
+DECODE_SDWA9(Src16)
+DECODE_SDWA9(VopcDst)
+
#include "AMDGPUGenDisassemblerTables.inc"
//===----------------------------------------------------------------------===//
@@ -164,6 +172,9 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
Res = tryDecodeInst(DecoderTableSDWA64, MI, QW, Address);
if (Res) break;
+
+ Res = tryDecodeInst(DecoderTableSDWA964, MI, QW, Address);
+ if (Res) break;
}
// Reinitialize Bytes as DPP64 could have eaten too much
@@ -582,6 +593,48 @@ MCOperand AMDGPUDisassembler::decodeSpecialReg64(unsigned Val) const {
return errOperand(Val, "unknown operand encoding " + Twine(Val));
}
+MCOperand AMDGPUDisassembler::decodeSDWA9Src(const OpWidthTy Width,
+ unsigned Val) const {
+ using namespace AMDGPU::SDWA;
+
+ if (SDWA9EncValues::SRC_VGPR_MIN <= Val &&
+ Val <= SDWA9EncValues::SRC_VGPR_MAX) {
+ return createRegOperand(getVgprClassId(Width),
+ Val - SDWA9EncValues::SRC_VGPR_MIN);
+ }
+ if (SDWA9EncValues::SRC_SGPR_MIN <= Val &&
+ Val <= SDWA9EncValues::SRC_SGPR_MAX) {
+ return createSRegOperand(getSgprClassId(Width),
+ Val - SDWA9EncValues::SRC_SGPR_MIN);
+ }
+
+ return decodeSpecialReg32(Val - SDWA9EncValues::SRC_SGPR_MIN);
+}
+
+MCOperand AMDGPUDisassembler::decodeSDWA9Src16(unsigned Val) const {
+ return decodeSDWA9Src(OPW16, Val);
+}
+
+MCOperand AMDGPUDisassembler::decodeSDWA9Src32(unsigned Val) const {
+ return decodeSDWA9Src(OPW32, Val);
+}
+
+
+MCOperand AMDGPUDisassembler::decodeSDWA9VopcDst(unsigned Val) const {
+ using namespace AMDGPU::SDWA;
+
+ if (Val & SDWA9EncValues::VOPC_DST_VCC_MASK) {
+ Val &= SDWA9EncValues::VOPC_DST_SGPR_MASK;
+ if (Val > AMDGPU::EncValues::SGPR_MAX) {
+ return decodeSpecialReg64(Val);
+ } else {
+ return createSRegOperand(getSgprClassId(OPW64), Val);
+ }
+ } else {
+ return createRegOperand(AMDGPU::VCC);
+ }
+}
+
//===----------------------------------------------------------------------===//
// AMDGPUSymbolizer
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
index 620bae0a6d1a..0ff405a71e9b 100644
--- a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
+++ b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
@@ -104,6 +104,11 @@ public:
MCOperand decodeSrcOp(const OpWidthTy Width, unsigned Val) const;
MCOperand decodeSpecialReg32(unsigned Val) const;
MCOperand decodeSpecialReg64(unsigned Val) const;
+
+ MCOperand decodeSDWA9Src(const OpWidthTy Width, unsigned Val) const;
+ MCOperand decodeSDWA9Src16(unsigned Val) const;
+ MCOperand decodeSDWA9Src32(unsigned Val) const;
+ MCOperand decodeSDWA9VopcDst(unsigned Val) const;
};
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/AMDGPU/GCNIterativeScheduler.cpp b/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
index 3bb5c9bc22b7..8ead48067336 100644
--- a/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
+++ b/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
@@ -191,6 +191,7 @@ public:
}
};
+namespace {
// just a stub to make base class happy
class SchedStrategyStub : public MachineSchedStrategy {
public:
@@ -202,6 +203,7 @@ public:
void releaseTopNode(SUnit *SU) override {}
void releaseBottomNode(SUnit *SU) override {}
};
+} // namespace
GCNIterativeScheduler::GCNIterativeScheduler(MachineSchedContext *C,
StrategyKind S)
diff --git a/lib/Target/AMDGPU/GCNMinRegStrategy.cpp b/lib/Target/AMDGPU/GCNMinRegStrategy.cpp
index c6d0f2179950..d378df674be9 100644
--- a/lib/Target/AMDGPU/GCNMinRegStrategy.cpp
+++ b/lib/Target/AMDGPU/GCNMinRegStrategy.cpp
@@ -17,6 +17,7 @@ using namespace llvm;
#define DEBUG_TYPE "misched"
+namespace {
class GCNMinRegScheduler {
struct Candidate : ilist_node<Candidate> {
const SUnit *SU;
@@ -71,6 +72,7 @@ public:
std::vector<const SUnit*> schedule(ArrayRef<const SUnit*> TopRoots,
const ScheduleDAG &DAG);
};
+} // namespace
void GCNMinRegScheduler::initNumPreds(const decltype(ScheduleDAG::SUnits) &SUnits) {
NumPreds.resize(SUnits.size());
diff --git a/lib/Target/AMDGPU/GCNRegPressure.cpp b/lib/Target/AMDGPU/GCNRegPressure.cpp
index 18374dca3f84..390a8286c76a 100644
--- a/lib/Target/AMDGPU/GCNRegPressure.cpp
+++ b/lib/Target/AMDGPU/GCNRegPressure.cpp
@@ -211,9 +211,9 @@ static LaneBitmask getUsedRegMask(const MachineOperand &MO,
return getLiveLaneMask(MO.getReg(), SI, LIS, MRI);
}
-SmallVector<RegisterMaskPair, 8> collectVirtualRegUses(const MachineInstr &MI,
- const LiveIntervals &LIS,
- const MachineRegisterInfo &MRI) {
+static SmallVector<RegisterMaskPair, 8>
+collectVirtualRegUses(const MachineInstr &MI, const LiveIntervals &LIS,
+ const MachineRegisterInfo &MRI) {
SmallVector<RegisterMaskPair, 8> Res;
for (const auto &MO : MI.operands()) {
if (!MO.isReg() || !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h
index 3d3858ab47ec..a856b17a228f 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h
@@ -52,6 +52,18 @@ public:
return 0;
}
+ virtual unsigned getSDWA9SrcEncoding(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ return 0;
+ }
+
+ virtual unsigned getSDWA9VopcDstEncoding(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ return 0;
+ }
+
protected:
uint64_t computeAvailableFeatures(const FeatureBitset &FB) const;
void verifyInstructionPredicates(const MCInst &MI,
diff --git a/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp b/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
index bda0928036fd..e02acf516c0d 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
@@ -69,6 +69,14 @@ public:
unsigned getSOPPBrEncoding(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const override;
+
+ unsigned getSDWA9SrcEncoding(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const override;
+
+ unsigned getSDWA9VopcDstEncoding(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const override;
};
} // end anonymous namespace
@@ -319,6 +327,44 @@ unsigned SIMCCodeEmitter::getSOPPBrEncoding(const MCInst &MI, unsigned OpNo,
return getMachineOpValue(MI, MO, Fixups, STI);
}
+unsigned
+SIMCCodeEmitter::getSDWA9SrcEncoding(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ using namespace AMDGPU::SDWA;
+
+ uint64_t RegEnc = 0;
+
+ const MCOperand &MO = MI.getOperand(OpNo);
+
+ unsigned Reg = MO.getReg();
+ RegEnc |= MRI.getEncodingValue(Reg);
+ RegEnc &= SDWA9EncValues::SRC_VGPR_MASK;
+ if (AMDGPU::isSGPR(AMDGPU::mc2PseudoReg(Reg), &MRI)) {
+ RegEnc |= SDWA9EncValues::SRC_SGPR_MASK;
+ }
+ return RegEnc;
+}
+
+unsigned
+SIMCCodeEmitter::getSDWA9VopcDstEncoding(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ using namespace AMDGPU::SDWA;
+
+ uint64_t RegEnc = 0;
+
+ const MCOperand &MO = MI.getOperand(OpNo);
+
+ unsigned Reg = MO.getReg();
+ if (Reg != AMDGPU::VCC) {
+ RegEnc |= MRI.getEncodingValue(Reg);
+ RegEnc &= SDWA9EncValues::VOPC_DST_SGPR_MASK;
+ RegEnc |= SDWA9EncValues::VOPC_DST_VCC_MASK;
+ }
+ return RegEnc;
+}
+
uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst &MI,
const MCOperand &MO,
SmallVectorImpl<MCFixup> &Fixups,
diff --git a/lib/Target/AMDGPU/R600ISelLowering.cpp b/lib/Target/AMDGPU/R600ISelLowering.cpp
index 3590a9b05e1d..60b913cfd39a 100644
--- a/lib/Target/AMDGPU/R600ISelLowering.cpp
+++ b/lib/Target/AMDGPU/R600ISelLowering.cpp
@@ -1618,6 +1618,14 @@ EVT R600TargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
return VT.changeVectorElementTypeToInteger();
}
+bool R600TargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT) const {
+ // Local and Private addresses do not handle vectors. Limit to i32
+ if ((AS == AMDGPUASI.LOCAL_ADDRESS || AS == AMDGPUASI.PRIVATE_ADDRESS)) {
+ return (MemVT.getSizeInBits() <= 32);
+ }
+ return true;
+}
+
bool R600TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
unsigned AddrSpace,
unsigned Align,
diff --git a/lib/Target/AMDGPU/R600ISelLowering.h b/lib/Target/AMDGPU/R600ISelLowering.h
index 9700ce14c6f3..d6a0876a6ee7 100644
--- a/lib/Target/AMDGPU/R600ISelLowering.h
+++ b/lib/Target/AMDGPU/R600ISelLowering.h
@@ -44,6 +44,8 @@ public:
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &,
EVT VT) const override;
+ bool canMergeStoresTo(unsigned AS, EVT MemVT) const override;
+
bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS,
unsigned Align,
bool *IsFast) const override;
diff --git a/lib/Target/AMDGPU/R600RegisterInfo.td b/lib/Target/AMDGPU/R600RegisterInfo.td
index cc667d985a82..3c1e8527284c 100644
--- a/lib/Target/AMDGPU/R600RegisterInfo.td
+++ b/lib/Target/AMDGPU/R600RegisterInfo.td
@@ -226,7 +226,7 @@ def R600_Reg32 : RegisterClass <"AMDGPU", [f32, i32], 32, (add
R600_Addr,
R600_KC0, R600_KC1,
ZERO, HALF, ONE, ONE_INT, PV_X, ALU_LITERAL_X, NEG_ONE, NEG_HALF,
- ALU_CONST, ALU_PARAM, OQAP
+ ALU_CONST, ALU_PARAM, OQAP, INDIRECT_BASE_ADDR
)>;
def R600_Predicate : RegisterClass <"AMDGPU", [i32], 32, (add
diff --git a/lib/Target/AMDGPU/SIDefines.h b/lib/Target/AMDGPU/SIDefines.h
index a01330cb9171..80967edee0ab 100644
--- a/lib/Target/AMDGPU/SIDefines.h
+++ b/lib/Target/AMDGPU/SIDefines.h
@@ -118,6 +118,10 @@ namespace AMDGPU {
// Operand for source modifiers for VOP instructions
OPERAND_INPUT_MODS,
+ // Operand for GFX9 SDWA instructions
+ OPERAND_SDWA9_SRC,
+ OPERAND_SDWA9_VOPC_DST,
+
/// Operand with 32-bit immediate that uses the constant bus.
OPERAND_KIMM32,
OPERAND_KIMM16
@@ -160,7 +164,8 @@ namespace AMDGPUAsmVariants {
DEFAULT = 0,
VOP3 = 1,
SDWA = 2,
- DPP = 3
+ SDWA9 = 3,
+ DPP = 4
};
}
@@ -294,6 +299,18 @@ enum DstUnused {
UNUSED_PRESERVE = 2,
};
+enum SDWA9EncValues{
+ SRC_SGPR_MASK = 0x100,
+ SRC_VGPR_MASK = 0xFF,
+ VOPC_DST_VCC_MASK = 0x80,
+ VOPC_DST_SGPR_MASK = 0x7F,
+
+ SRC_VGPR_MIN = 0,
+ SRC_VGPR_MAX = 255,
+ SRC_SGPR_MIN = 256,
+ SRC_SGPR_MAX = 357,
+};
+
} // namespace SDWA
} // namespace AMDGPU
diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp
index 01c1f78e7ca4..76c2644867aa 100644
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -698,6 +698,18 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
}
}
+bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT) const {
+ if (AS == AMDGPUASI.GLOBAL_ADDRESS || AS == AMDGPUASI.FLAT_ADDRESS) {
+ return (MemVT.getSizeInBits() <= 4 * 32);
+ } else if (AS == AMDGPUASI.PRIVATE_ADDRESS) {
+ unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
+ return (MemVT.getSizeInBits() <= MaxPrivateBits);
+ } else if (AS == AMDGPUASI.LOCAL_ADDRESS) {
+ return (MemVT.getSizeInBits() <= 2 * 32);
+ }
+ return true;
+}
+
bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
unsigned AddrSpace,
unsigned Align,
@@ -4229,12 +4241,40 @@ SDValue SITargetLowering::performAndCombine(SDNode *N,
SDValue RHS = N->getOperand(1);
- if (VT == MVT::i64) {
- const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
- if (CRHS) {
- if (SDValue Split
- = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
- return Split;
+ const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
+ if (VT == MVT::i64 && CRHS) {
+ if (SDValue Split
+ = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
+ return Split;
+ }
+
+ if (CRHS && VT == MVT::i32) {
+ // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
+ // nb = number of trailing zeroes in mask
+ // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
+ // given that we are selecting 8 or 16 bit fields starting at byte boundary.
+ uint64_t Mask = CRHS->getZExtValue();
+ unsigned Bits = countPopulation(Mask);
+ if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
+ (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) {
+ if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
+ unsigned Shift = CShift->getZExtValue();
+ unsigned NB = CRHS->getAPIntValue().countTrailingZeros();
+ unsigned Offset = NB + Shift;
+ if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
+ SDLoc SL(N);
+ SDValue BFE = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
+ LHS->getOperand(0),
+ DAG.getConstant(Offset, SL, MVT::i32),
+ DAG.getConstant(Bits, SL, MVT::i32));
+ EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits);
+ SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE,
+ DAG.getValueType(NarrowVT));
+ SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext,
+ DAG.getConstant(NB, SDLoc(CRHS), MVT::i32));
+ return Shl;
+ }
+ }
}
}
diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h
index e68837747491..8e2ec40b224c 100644
--- a/lib/Target/AMDGPU/SIISelLowering.h
+++ b/lib/Target/AMDGPU/SIISelLowering.h
@@ -150,6 +150,8 @@ public:
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty,
unsigned AS) const override;
+ bool canMergeStoresTo(unsigned AS, EVT MemVT) const override;
+
bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS,
unsigned Align,
bool *IsFast) const override;
diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp
index 38a16b525a75..36d29b8ecf06 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2331,6 +2331,10 @@ static bool isSubRegOf(const SIRegisterInfo &TRI,
bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
StringRef &ErrInfo) const {
uint16_t Opcode = MI.getOpcode();
+
+ if (SIInstrInfo::isGenericOpcode(MI.getOpcode()))
+ return true;
+
const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td
index 7b052844f177..c5287c7f64ba 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/lib/Target/AMDGPU/SIInstrInfo.td
@@ -439,6 +439,27 @@ def ExpSrc3 : RegisterOperand<VGPR_32> {
let ParserMatchClass = VReg32OrOffClass;
}
+class SDWA9Src : RegisterOperand<VS_32> {
+ let OperandNamespace = "AMDGPU";
+ let OperandType = "OPERAND_SDWA9_SRC";
+ let EncoderMethod = "getSDWA9SrcEncoding";
+}
+
+def SDWA9Src32 : SDWA9Src {
+ let DecoderMethod = "decodeSDWA9Src32";
+}
+
+def SDWA9Src16 : SDWA9Src {
+ let DecoderMethod = "decodeSDWA9Src16";
+}
+
+def SDWA9VopcDst : VOPDstOperand<SReg_64> {
+ let OperandNamespace = "AMDGPU";
+ let OperandType = "OPERAND_SDWA9_VOPC_DST";
+ let EncoderMethod = "getSDWA9VopcDstEncoding";
+ let DecoderMethod = "decodeSDWA9VopcDst";
+}
+
class NamedMatchClass<string CName, bit Optional = 1> : AsmOperandClass {
let Name = "Imm"#CName;
let PredicateMethod = "is"#CName;
@@ -588,6 +609,16 @@ class IntInputMods <IntInputModsMatchClass matchClass> : InputMods <matchClass>
def Int32InputMods : IntInputMods<Int32InputModsMatchClass>;
def Int64InputMods : IntInputMods<Int64InputModsMatchClass>;
+def FPRegInputModsMatchClass : AsmOperandClass {
+ let Name = "RegWithFPInputMods";
+ let ParserMethod = "parseRegWithFPInputMods";
+ let PredicateMethod = "isRegKind";
+}
+
+def FPRegInputMods : InputMods <FPRegInputModsMatchClass> {
+ let PrintMethod = "printOperandAndFPInputMods";
+}
+
def FPVRegInputModsMatchClass : AsmOperandClass {
let Name = "VRegWithFPInputMods";
let ParserMethod = "parseRegWithFPInputMods";
@@ -598,6 +629,17 @@ def FPVRegInputMods : InputMods <FPVRegInputModsMatchClass> {
let PrintMethod = "printOperandAndFPInputMods";
}
+
+def IntRegInputModsMatchClass : AsmOperandClass {
+ let Name = "RegWithIntInputMods";
+ let ParserMethod = "parseRegWithIntInputMods";
+ let PredicateMethod = "isRegKind";
+}
+
+def IntRegInputMods : InputMods <IntRegInputModsMatchClass> {
+ let PrintMethod = "printOperandAndIntInputMods";
+}
+
def IntVRegInputModsMatchClass : AsmOperandClass {
let Name = "VRegWithIntInputMods";
let ParserMethod = "parseRegWithIntInputMods";
@@ -783,6 +825,14 @@ class getVALUDstForVT<ValueType VT> {
VOPDstOperand<SReg_64>)))); // else VT == i1
}
+// Returns the register class to use for the destination of VOP[12C]
+// instructions with GFX9 SDWA extension
+class getSDWA9DstForVT<ValueType VT> {
+ RegisterOperand ret = !if(!eq(VT.Size, 1),
+ SDWA9VopcDst, // VOPC
+ VOPDstOperand<VGPR_32>); // VOP1/2 32-bit dst
+}
+
// Returns the register class to use for source 0 of VOP[12C]
// instructions for the given VT.
class getVOPSrc0ForVT<ValueType VT> {
@@ -823,6 +873,9 @@ class getVregSrcForVT<ValueType VT> {
!if(!eq(VT.Size, 64), VReg_64, VGPR_32));
}
+class getSDWA9SrcForVT <ValueType VT> {
+ RegisterOperand ret = !if(!eq(VT.Size, 16), SDWA9Src16, SDWA9Src32);
+}
// Returns the register class to use for sources of VOP3 instructions for the
// given VT.
@@ -926,6 +979,15 @@ class getSrcModExt <ValueType VT> {
Operand ret = !if(isFP, FPVRegInputMods, IntVRegInputMods);
}
+// Returns the type of the input modifiers operand to use for the specified input operand for GFX9 SDWA
+class getSrcModSDWA9 <ValueType VT> {
+ bit isFP = !if(!eq(VT.Value, f16.Value), 1,
+ !if(!eq(VT.Value, f32.Value), 1,
+ !if(!eq(VT.Value, f64.Value), 1,
+ 0)));
+ Operand ret = !if(isFP, FPRegInputMods, IntRegInputMods);
+}
+
// Returns the input arguments for VOP[12C] instructions for the given SrcVT.
class getIns32 <RegisterOperand Src0RC, RegisterClass Src1RC, int NumSrcArgs> {
dag ret = !if(!eq(NumSrcArgs, 1), (ins Src0RC:$src0), // VOP1
@@ -1062,6 +1124,7 @@ class getInsSDWA <RegisterClass Src0RC, RegisterClass Src1RC, int NumSrcArgs,
// VOP1 without input operands (V_NOP)
(ins),
!if(!eq(NumSrcArgs, 1),
+ // VOP1_SDWA
(ins Src0Mod:$src0_modifiers, Src0RC:$src0,
clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused,
src0_sel:$src0_sel),
@@ -1071,7 +1134,7 @@ class getInsSDWA <RegisterClass Src0RC, RegisterClass Src1RC, int NumSrcArgs,
(ins Src0Mod:$src0_modifiers, Src0RC:$src0,
Src1Mod:$src1_modifiers, Src1RC:$src1,
clampmod:$clamp, src0_sel:$src0_sel, src1_sel:$src1_sel),
- // VOP2_SDWA or VOPC_SDWA with modifiers
+ // VOP2_SDWA with modifiers
(ins Src0Mod:$src0_modifiers, Src0RC:$src0,
Src1Mod:$src1_modifiers, Src1RC:$src1,
clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused,
@@ -1079,12 +1142,65 @@ class getInsSDWA <RegisterClass Src0RC, RegisterClass Src1RC, int NumSrcArgs,
(ins)/* endif */)));
}
+// Ins for GFX9 SDWA
+class getInsSDWA9 <RegisterOperand Src0RC, RegisterOperand Src1RC, int NumSrcArgs,
+ bit HasSDWAOMod, Operand Src0Mod, Operand Src1Mod,
+ ValueType DstVT> {
+
+ dag ret = !if(!eq(NumSrcArgs, 0),
+ // VOP1 without input operands (V_NOP)
+ (ins),
+ !if(!eq(NumSrcArgs, 1),
+ // VOP1
+ !if(!eq(HasSDWAOMod, 0),
+ // VOP1_SDWA9 without omod
+ (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
+ clampmod:$clamp,
+ dst_sel:$dst_sel, dst_unused:$dst_unused,
+ src0_sel:$src0_sel),
+ // VOP1_SDWA9 with omod
+ (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
+ clampmod:$clamp, omod:$omod,
+ dst_sel:$dst_sel, dst_unused:$dst_unused,
+ src0_sel:$src0_sel)),
+ !if(!eq(NumSrcArgs, 2),
+ !if(!eq(DstVT.Size, 1),
+ // VOPC_SDWA9
+ (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
+ Src1Mod:$src1_modifiers, Src1RC:$src1,
+ src0_sel:$src0_sel, src1_sel:$src1_sel),
+ // VOP2_SDWA9
+ !if(!eq(HasSDWAOMod, 0),
+ // VOP2_SDWA9 without omod
+ (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
+ Src1Mod:$src1_modifiers, Src1RC:$src1,
+ clampmod:$clamp,
+ dst_sel:$dst_sel, dst_unused:$dst_unused,
+ src0_sel:$src0_sel, src1_sel:$src1_sel),
+                   // VOP2_SDWA9 with omod
+ (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
+ Src1Mod:$src1_modifiers, Src1RC:$src1,
+ clampmod:$clamp, omod:$omod,
+ dst_sel:$dst_sel, dst_unused:$dst_unused,
+ src0_sel:$src0_sel, src1_sel:$src1_sel))),
+ (ins)/* endif */)));
+}
+
// Outs for DPP and SDWA
-class getOutsExt <bit HasDst, ValueType DstVT, RegisterOperand DstRCDPP> {
+class getOutsExt <bit HasDst, ValueType DstVT, RegisterOperand DstRCExt> {
dag ret = !if(HasDst,
!if(!eq(DstVT.Size, 1),
(outs), // no dst for VOPC, we use "vcc"-token as dst in SDWA VOPC instructions
- (outs DstRCDPP:$vdst)),
+ (outs DstRCExt:$vdst)),
+ (outs)); // V_NOP
+}
+
+// Outs for GFX9 SDWA
+class getOutsSDWA9 <bit HasDst, ValueType DstVT, RegisterOperand DstRCSDWA9> {
+ dag ret = !if(HasDst,
+ !if(!eq(DstVT.Size, 1),
+ (outs DstRCSDWA9:$sdst),
+ (outs DstRCSDWA9:$vdst)),
(outs)); // V_NOP
}
@@ -1153,8 +1269,7 @@ class getAsmDPP <bit HasDst, int NumSrcArgs, bit HasModifiers, ValueType DstVT =
string ret = dst#args#" $dpp_ctrl$row_mask$bank_mask$bound_ctrl";
}
-class getAsmSDWA <bit HasDst, int NumSrcArgs, bit HasFloatModifiers,
- ValueType DstVT = i32> {
+class getAsmSDWA <bit HasDst, int NumSrcArgs, ValueType DstVT = i32> {
string dst = !if(HasDst,
!if(!eq(DstVT.Size, 1),
" vcc", // use vcc token as dst for VOPC instructioins
@@ -1182,6 +1297,35 @@ class getAsmSDWA <bit HasDst, int NumSrcArgs, bit HasFloatModifiers,
string ret = dst#args#sdwa;
}
+class getAsmSDWA9 <bit HasDst, bit HasOMod, int NumSrcArgs,
+ ValueType DstVT = i32> {
+ string dst = !if(HasDst,
+ !if(!eq(DstVT.Size, 1),
+ "$sdst", // VOPC
+ "$vdst"), // VOP1/2
+ "");
+ string src0 = "$src0_modifiers";
+ string src1 = "$src1_modifiers";
+ string out_mods = !if(!eq(HasOMod, 0), "$clamp", "$clamp$omod");
+ string args = !if(!eq(NumSrcArgs, 0), "",
+ !if(!eq(NumSrcArgs, 1),
+ ", "#src0,
+ ", "#src0#", "#src1
+ )
+ );
+ string sdwa = !if(!eq(NumSrcArgs, 0), "",
+ !if(!eq(NumSrcArgs, 1),
+ out_mods#" $dst_sel $dst_unused $src0_sel",
+ !if(!eq(DstVT.Size, 1),
+ " $src0_sel $src1_sel", // No dst_sel, dst_unused and output modifiers for VOPC
+ out_mods#" $dst_sel $dst_unused $src0_sel $src1_sel"
+ )
+ )
+ );
+ string ret = dst#args#sdwa;
+}
+
+
// Function that checks if instruction supports DPP and SDWA
class getHasExt <int NumSrcArgs, ValueType DstVT = i32, ValueType Src0VT = i32,
ValueType Src1VT = i32> {
@@ -1219,6 +1363,7 @@ class VOPProfile <list<ValueType> _ArgVT> {
field RegisterOperand DstRC = getVALUDstForVT<DstVT>.ret;
field RegisterOperand DstRCDPP = getVALUDstForVT<DstVT>.ret;
field RegisterOperand DstRCSDWA = getVALUDstForVT<DstVT>.ret;
+ field RegisterOperand DstRCSDWA9 = getSDWA9DstForVT<DstVT>.ret;
field RegisterOperand Src0RC32 = getVOPSrc0ForVT<Src0VT>.ret;
field RegisterClass Src1RC32 = getVregSrcForVT<Src1VT>.ret;
field RegisterOperand Src0RC64 = getVOP3SrcForVT<Src0VT>.ret;
@@ -1228,6 +1373,8 @@ class VOPProfile <list<ValueType> _ArgVT> {
field RegisterClass Src1DPP = getVregSrcForVT<Src1VT>.ret;
field RegisterClass Src0SDWA = getVregSrcForVT<Src0VT>.ret;
field RegisterClass Src1SDWA = getVregSrcForVT<Src1VT>.ret;
+ field RegisterOperand Src0SDWA9 = getSDWA9SrcForVT<Src0VT>.ret;
+ field RegisterOperand Src1SDWA9 = getSDWA9SrcForVT<Src0VT>.ret;
field Operand Src0Mod = getSrcMod<Src0VT>.ret;
field Operand Src1Mod = getSrcMod<Src1VT>.ret;
field Operand Src2Mod = getSrcMod<Src2VT>.ret;
@@ -1235,6 +1382,8 @@ class VOPProfile <list<ValueType> _ArgVT> {
field Operand Src1ModDPP = getSrcModExt<Src1VT>.ret;
field Operand Src0ModSDWA = getSrcModExt<Src0VT>.ret;
field Operand Src1ModSDWA = getSrcModExt<Src1VT>.ret;
+ field Operand Src0ModSDWA9 = getSrcModSDWA9<Src0VT>.ret;
+ field Operand Src1ModSDWA9 = getSrcModSDWA9<Src1VT>.ret;
field bit HasDst = !if(!eq(DstVT.Value, untyped.Value), 0, 1);
@@ -1261,14 +1410,16 @@ class VOPProfile <list<ValueType> _ArgVT> {
field bit HasSrc2Mods = !if(HasModifiers, BitOr<HasSrc2FloatMods, HasSrc2IntMods>.ret, 0);
field bit HasClamp = HasModifiers;
- field bit HasSDWAClamp = HasSrc0;
+ field bit HasSDWAClamp = EmitDst;
field bit HasFPClamp = BitAnd<isFloatType<DstVT>.ret, HasClamp>.ret;
field bit IsPacked = isPackedType<Src0VT>.ret;
field bit HasOpSel = IsPacked;
field bit HasOMod = !if(HasOpSel, 0, HasModifiers);
+ field bit HasSDWAOMod = isFloatType<DstVT>.ret;
field bit HasExt = getHasExt<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret;
+ field bit HasSDWA9 = HasExt;
field Operand Src0PackedMod = !if(HasSrc0FloatMods, PackedF16InputMods, PackedI16InputMods);
field Operand Src1PackedMod = !if(HasSrc1FloatMods, PackedF16InputMods, PackedI16InputMods);
@@ -1282,6 +1433,7 @@ class VOPProfile <list<ValueType> _ArgVT> {
field dag Outs64 = Outs;
field dag OutsDPP = getOutsExt<HasDst, DstVT, DstRCDPP>.ret;
field dag OutsSDWA = getOutsExt<HasDst, DstVT, DstRCSDWA>.ret;
+ field dag OutsSDWA9 = getOutsSDWA9<HasDst, DstVT, DstRCSDWA9>.ret;
field dag Ins32 = getIns32<Src0RC32, Src1RC32, NumSrcArgs>.ret;
field dag Ins64 = getIns64<Src0RC64, Src1RC64, Src2RC64, NumSrcArgs,
@@ -1296,16 +1448,21 @@ class VOPProfile <list<ValueType> _ArgVT> {
field dag InsSDWA = getInsSDWA<Src0SDWA, Src1SDWA, NumSrcArgs,
HasModifiers, Src0ModSDWA, Src1ModSDWA,
DstVT>.ret;
+ field dag InsSDWA9 = getInsSDWA9<Src0SDWA9, Src1SDWA9, NumSrcArgs,
+ HasSDWAOMod, Src0ModSDWA9, Src1ModSDWA9,
+ DstVT>.ret;
field string Asm32 = getAsm32<HasDst, NumSrcArgs, DstVT>.ret;
field string Asm64 = getAsm64<HasDst, NumSrcArgs, HasModifiers, HasOMod, DstVT>.ret;
field string AsmVOP3P = getAsmVOP3P<HasDst, NumSrcArgs, HasModifiers, HasClamp, DstVT>.ret;
field string AsmDPP = getAsmDPP<HasDst, NumSrcArgs, HasModifiers, DstVT>.ret;
- field string AsmSDWA = getAsmSDWA<HasDst, NumSrcArgs, HasModifiers, DstVT>.ret;
+ field string AsmSDWA = getAsmSDWA<HasDst, NumSrcArgs, DstVT>.ret;
+ field string AsmSDWA9 = getAsmSDWA9<HasDst, HasSDWAOMod, NumSrcArgs, DstVT>.ret;
}
class VOP_NO_EXT <VOPProfile p> : VOPProfile <p.ArgVT> {
let HasExt = 0;
+ let HasSDWA9 = 0;
}
def VOP_F16_F16 : VOPProfile <[f16, f16, untyped, untyped]>;
@@ -1446,6 +1603,15 @@ def getSDWAOp : InstrMapping {
let ValueCols = [["SDWA"]];
}
+// Maps ordinary instructions to their SDWA GFX9 counterparts
+def getSDWA9Op : InstrMapping {
+ let FilterClass = "VOP";
+ let RowFields = ["OpName"];
+ let ColFields = ["AsmVariantName"];
+ let KeyCol = ["Default"];
+ let ValueCols = [["SDWA9"]];
+}
+
def getMaskedMIMGOp : InstrMapping {
let FilterClass = "MIMG_Mask";
let RowFields = ["Op"];
diff --git a/lib/Target/AMDGPU/SOPInstructions.td b/lib/Target/AMDGPU/SOPInstructions.td
index f2d8b6f7b7a4..ec29a66c8bbb 100644
--- a/lib/Target/AMDGPU/SOPInstructions.td
+++ b/lib/Target/AMDGPU/SOPInstructions.td
@@ -184,7 +184,9 @@ def S_BITSET0_B32 : SOP1_32 <"s_bitset0_b32">;
def S_BITSET0_B64 : SOP1_64_32 <"s_bitset0_b64">;
def S_BITSET1_B32 : SOP1_32 <"s_bitset1_b32">;
def S_BITSET1_B64 : SOP1_64_32 <"s_bitset1_b64">;
-def S_GETPC_B64 : SOP1_64_0 <"s_getpc_b64">;
+def S_GETPC_B64 : SOP1_64_0 <"s_getpc_b64",
+ [(set i64:$sdst, (int_amdgcn_s_getpc))]
+>;
let isTerminator = 1, isBarrier = 1, SchedRW = [WriteBranch] in {
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 2abd4afad3b6..630f469eabf0 100644
--- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -544,6 +544,17 @@ bool isVI(const MCSubtargetInfo &STI) {
return STI.getFeatureBits()[AMDGPU::FeatureVolcanicIslands];
}
+bool isGFX9(const MCSubtargetInfo &STI) {
+ return STI.getFeatureBits()[AMDGPU::FeatureGFX9];
+}
+
+bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI) {
+ const MCRegisterClass SGPRClass = TRI->getRegClass(AMDGPU::SReg_32RegClassID);
+ const unsigned FirstSubReg = TRI->getSubReg(Reg, 1);
+ return SGPRClass.contains(FirstSubReg != 0 ? FirstSubReg : Reg) ||
+ Reg == AMDGPU::SCC;
+}
+
unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI) {
switch(Reg) {
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 8e74aa2cc9a8..19888ad7556a 100644
--- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -273,6 +273,10 @@ inline bool isKernel(CallingConv::ID CC) {
bool isSI(const MCSubtargetInfo &STI);
bool isCI(const MCSubtargetInfo &STI);
bool isVI(const MCSubtargetInfo &STI);
+bool isGFX9(const MCSubtargetInfo &STI);
+
+/// \brief Is \p Reg a scalar register?
+bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI);
/// If \p Reg is a pseudo reg, return the correct hardware register given
/// \p STI otherwise return \p Reg.
diff --git a/lib/Target/AMDGPU/VOP1Instructions.td b/lib/Target/AMDGPU/VOP1Instructions.td
index 1febc6bf8ec2..95b5ef0a49db 100644
--- a/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/lib/Target/AMDGPU/VOP1Instructions.td
@@ -30,6 +30,15 @@ class VOP1_SDWAe <bits<8> op, VOPProfile P> : VOP_SDWAe <P> {
let Inst{31-25} = 0x3f; // encoding
}
+class VOP1_SDWA9Ae <bits<8> op, VOPProfile P> : VOP_SDWA9Ae <P> {
+ bits<8> vdst;
+
+ let Inst{8-0} = 0xf9; // sdwa
+ let Inst{16-9} = op;
+ let Inst{24-17} = !if(P.EmitDst, vdst{7-0}, 0);
+ let Inst{31-25} = 0x3f; // encoding
+}
+
class VOP1_Pseudo <string opName, VOPProfile P, list<dag> pattern=[], bit VOP1Only = 0> :
InstSI <P.Outs32, P.Ins32, "", pattern>,
VOP <opName>,
@@ -84,6 +93,11 @@ class VOP1_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
let AsmMatchConverter = "cvtSdwaVOP1";
}
+class VOP1_SDWA9_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
+ VOP_SDWA9_Pseudo <OpName, P, pattern> {
+ let AsmMatchConverter = "cvtSdwaVOP1";
+}
+
class getVOP1Pat64 <SDPatternOperator node, VOPProfile P> : LetDummies {
list<dag> ret =
!if(P.HasModifiers,
@@ -103,6 +117,7 @@ multiclass VOP1Inst <string opName, VOPProfile P,
def _e32 : VOP1_Pseudo <opName, P>;
def _e64 : VOP3_Pseudo <opName, P, getVOP1Pat64<node, P>.ret>;
def _sdwa : VOP1_SDWA_Pseudo <opName, P>;
+ def _sdwa9 : VOP1_SDWA9_Pseudo <opName, P>;
}
// Special profile for instructions which have clamp
@@ -243,6 +258,7 @@ def VOP_I32_VI32_NO_EXT : VOPProfile<[i32, i32, untyped, untyped]> {
let Src0RC64 = VRegSrc_32;
let HasExt = 0;
+ let HasSDWA9 = 0;
}
// Special case because there are no true output operands. Hack vdst
@@ -258,16 +274,21 @@ def VOP_MOVRELD : VOPProfile<[untyped, i32, untyped, untyped]> {
let Ins64 = (ins Src0RC64:$vdst, VSrc_b32:$src0);
let InsDPP = (ins Src0RC32:$vdst, Src0RC32:$src0, dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
bank_mask:$bank_mask, bound_ctrl:$bound_ctrl);
- let InsSDWA = (ins Src0RC32:$vdst, Src0ModSDWA:$src0_modifiers, VCSrc_b32:$src0,
+ let InsSDWA = (ins Src0RC32:$vdst, Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0,
clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused,
src0_sel:$src0_sel);
+ let InsSDWA9 = (ins Src0RC32:$vdst, Src0ModSDWA9:$src0_modifiers, Src0SDWA9:$src0,
+ clampmod:$clamp, omod:$omod, dst_sel:$dst_sel, dst_unused:$dst_unused,
+ src0_sel:$src0_sel);
let Asm32 = getAsm32<1, 1>.ret;
let Asm64 = getAsm64<1, 1, 0, 1>.ret;
let AsmDPP = getAsmDPP<1, 1, 0>.ret;
- let AsmSDWA = getAsmSDWA<1, 1, 0>.ret;
+ let AsmSDWA = getAsmSDWA<1, 1>.ret;
+ let AsmSDWA9 = getAsmSDWA9<1, 0, 1>.ret;
let HasExt = 0;
+ let HasSDWA9 = 0;
let HasDst = 0;
let EmitDst = 1; // force vdst emission
}
@@ -324,7 +345,7 @@ defm V_EXP_LEGACY_F32 : VOP1Inst <"v_exp_legacy_f32", VOP_F32_F32>;
} // End SubtargetPredicate = isCIVI
-let SubtargetPredicate = isVI in {
+let SubtargetPredicate = Has16BitInsts in {
defm V_CVT_F16_U16 : VOP1Inst <"v_cvt_f16_u16", VOP1_F16_I16, uint_to_fp>;
defm V_CVT_F16_I16 : VOP1Inst <"v_cvt_f16_i16", VOP1_F16_I16, sint_to_fp>;
@@ -347,7 +368,7 @@ defm V_COS_F16 : VOP1Inst <"v_cos_f16", VOP_F16_F16, AMDGPUcos>;
}
-let Predicates = [isVI] in {
+let Predicates = [Has16BitInsts] in {
def : Pat<
(f32 (f16_to_fp i16:$src)),
@@ -523,6 +544,10 @@ multiclass VOP1_Real_vi <bits<10> op> {
VOP_SDWA_Real <!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>,
VOP1_SDWAe <op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
+ def _sdwa_gfx9 :
+ VOP_SDWA9_Real <!cast<VOP1_SDWA9_Pseudo>(NAME#"_sdwa9")>,
+ VOP1_SDWA9Ae <op{7-0}, !cast<VOP1_SDWA9_Pseudo>(NAME#"_sdwa9").Pfl>;
+
// For now left dpp only for asm/dasm
// TODO: add corresponding pseudo
def _dpp : VOP1_DPP<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32")>;
diff --git a/lib/Target/AMDGPU/VOP2Instructions.td b/lib/Target/AMDGPU/VOP2Instructions.td
index 4a11d9471f1d..657cacaa792c 100644
--- a/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/lib/Target/AMDGPU/VOP2Instructions.td
@@ -48,6 +48,18 @@ class VOP2_SDWAe <bits<6> op, VOPProfile P> : VOP_SDWAe <P> {
let Inst{31} = 0x0; // encoding
}
+class VOP2_SDWA9Ae <bits<6> op, VOPProfile P> : VOP_SDWA9Ae <P> {
+ bits<8> vdst;
+ bits<9> src1;
+
+ let Inst{8-0} = 0xf9; // sdwa
+ let Inst{16-9} = !if(P.HasSrc1, src1{7-0}, 0);
+ let Inst{24-17} = !if(P.EmitDst, vdst{7-0}, 0);
+ let Inst{30-25} = op;
+ let Inst{31} = 0x0; // encoding
+ let Inst{63} = !if(P.HasSrc1, src1{8}, 0); // src1_sgpr
+}
+
class VOP2_Pseudo <string opName, VOPProfile P, list<dag> pattern=[], string suffix = "_e32"> :
InstSI <P.Outs32, P.Ins32, "", pattern>,
VOP <opName>,
@@ -102,6 +114,11 @@ class VOP2_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
let AsmMatchConverter = "cvtSdwaVOP2";
}
+class VOP2_SDWA9_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
+ VOP_SDWA9_Pseudo <OpName, P, pattern> {
+ let AsmMatchConverter = "cvtSdwaVOP2";
+}
+
class getVOP2Pat64 <SDPatternOperator node, VOPProfile P> : LetDummies {
list<dag> ret = !if(P.HasModifiers,
[(set P.DstVT:$vdst,
@@ -121,10 +138,10 @@ multiclass VOP2Inst <string opName,
def _e64 : VOP3_Pseudo <opName, P, getVOP2Pat64<node, P>.ret>,
Commutable_REV<revOp#"_e64", !eq(revOp, opName)>;
- def _sdwa : VOP2_SDWA_Pseudo <opName, P>;
+ def _sdwa : VOP2_SDWA_Pseudo <opName, P>;
+ def _sdwa9 : VOP2_SDWA9_Pseudo <opName, P>;
}
-// TODO: add SDWA pseudo instructions for VOP2bInst and VOP2eInst
multiclass VOP2bInst <string opName,
VOPProfile P,
SDPatternOperator node = null_frag,
@@ -136,7 +153,13 @@ multiclass VOP2bInst <string opName,
def _e32 : VOP2_Pseudo <opName, P>,
Commutable_REV<revOp#"_e32", !eq(revOp, opName)>;
- def _sdwa : VOP2_SDWA_Pseudo <opName, P>;
+ def _sdwa : VOP2_SDWA_Pseudo <opName, P> {
+ let AsmMatchConverter = "cvtSdwaVOP2b";
+ }
+
+ def _sdwa9 : VOP2_SDWA9_Pseudo <opName, P> {
+ let AsmMatchConverter = "cvtSdwaVOP2b";
+ }
}
def _e64 : VOP3_Pseudo <opName, P, getVOP2Pat64<node, P>.ret>,
@@ -203,13 +226,21 @@ class VOP_MAC <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> {
VGPR_32:$src2, // stub argument
clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused,
src0_sel:$src0_sel, src1_sel:$src1_sel);
+ let InsSDWA9 = (ins Src0ModSDWA9:$src0_modifiers, Src0SDWA9:$src0,
+ Src1ModSDWA9:$src1_modifiers, Src1SDWA9:$src1,
+ VGPR_32:$src2, // stub argument
+ clampmod:$clamp, omod:$omod,
+ dst_sel:$dst_sel, dst_unused:$dst_unused,
+ src0_sel:$src0_sel, src1_sel:$src1_sel);
let Asm32 = getAsm32<1, 2, vt>.ret;
let Asm64 = getAsm64<1, 2, HasModifiers, HasOMod, vt>.ret;
let AsmDPP = getAsmDPP<1, 2, HasModifiers, vt>.ret;
- let AsmSDWA = getAsmSDWA<1, 2, HasModifiers, vt>.ret;
+ let AsmSDWA = getAsmSDWA<1, 2, vt>.ret;
+ let AsmSDWA9 = getAsmSDWA9<1, 1, 2, vt>.ret;
let HasSrc2 = 0;
let HasSrc2Mods = 0;
let HasExt = 1;
+ let HasSDWA9 = 0;
}
def VOP_MAC_F16 : VOP_MAC <f16> {
@@ -229,6 +260,7 @@ def VOP2b_I32_I1_I32_I32 : VOPProfile<[i32, i32, i32, untyped]> {
let Asm32 = "$vdst, vcc, $src0, $src1";
let Asm64 = "$vdst, $sdst, $src0, $src1";
let AsmSDWA = "$vdst, vcc, $src0_modifiers, $src1_modifiers$clamp $dst_sel $dst_unused $src0_sel $src1_sel";
+ let AsmSDWA9 = "$vdst, vcc, $src0_modifiers, $src1_modifiers$clamp $dst_sel $dst_unused $src0_sel $src1_sel";
let AsmDPP = "$vdst, vcc, $src0, $src1 $dpp_ctrl$row_mask$bank_mask$bound_ctrl";
let Outs32 = (outs DstRC:$vdst);
let Outs64 = (outs DstRC:$vdst, SReg_64:$sdst);
@@ -246,6 +278,7 @@ def VOP2b_I32_I1_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1]> {
let Asm32 = "$vdst, vcc, $src0, $src1, vcc";
let Asm64 = "$vdst, $sdst, $src0, $src1, $src2";
let AsmSDWA = "$vdst, vcc, $src0_modifiers, $src1_modifiers, vcc $clamp $dst_sel $dst_unused $src0_sel $src1_sel";
+ let AsmSDWA9 = "$vdst, vcc, $src0_modifiers, $src1_modifiers, vcc $clamp $dst_sel $dst_unused $src0_sel $src1_sel";
let AsmDPP = "$vdst, vcc, $src0, $src1, vcc $dpp_ctrl$row_mask$bank_mask$bound_ctrl";
let Outs32 = (outs DstRC:$vdst);
let Outs64 = (outs DstRC:$vdst, SReg_64:$sdst);
@@ -254,16 +287,23 @@ def VOP2b_I32_I1_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1]> {
// implicit VCC use.
let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1);
- let InsSDWA = (ins Src0Mod:$src0_modifiers, Src0SDWA:$src0,
- Src1Mod:$src1_modifiers, Src1SDWA:$src1,
+ let InsSDWA = (ins Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0,
+ Src1ModSDWA:$src1_modifiers, Src1SDWA:$src1,
clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused,
src0_sel:$src0_sel, src1_sel:$src1_sel);
+ let InsSDWA9 = (ins Src0ModSDWA9:$src0_modifiers, Src0SDWA9:$src0,
+ Src1ModSDWA9:$src1_modifiers, Src1SDWA9:$src1,
+ clampmod:$clamp, omod:$omod,
+ dst_sel:$dst_sel, dst_unused:$dst_unused,
+ src0_sel:$src0_sel, src1_sel:$src1_sel);
+
let InsDPP = (ins Src0Mod:$src0_modifiers, Src0DPP:$src0,
Src1Mod:$src1_modifiers, Src1DPP:$src1,
dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
bank_mask:$bank_mask, bound_ctrl:$bound_ctrl);
let HasExt = 1;
+ let HasSDWA9 = 1;
}
// Read in from vcc or arbitrary SGPR
@@ -387,7 +427,7 @@ defm V_LSHL_B32 : VOP2Inst <"v_lshl_b32", VOP_I32_I32_I32>;
} // End let SubtargetPredicate = SICI
-let SubtargetPredicate = isVI in {
+let SubtargetPredicate = Has16BitInsts in {
def V_MADMK_F16 : VOP2_Pseudo <"v_madmk_f16", VOP_MADMK_F16, [], "">;
defm V_LSHLREV_B16 : VOP2Inst <"v_lshlrev_b16", VOP_I16_I16_I16>;
@@ -418,7 +458,7 @@ defm V_MAC_F16 : VOP2Inst <"v_mac_f16", VOP_MAC_F16>;
}
} // End isCommutable = 1
-} // End SubtargetPredicate = isVI
+} // End SubtargetPredicate = Has16BitInsts
// Note: 16-bit instructions produce a 0 result in the high 16-bits.
multiclass Arithmetic_i16_Pats <SDPatternOperator op, Instruction inst> {
@@ -468,7 +508,7 @@ class ZExt_i16_i1_Pat <SDNode ext> : Pat <
(V_CNDMASK_B32_e64 (i32 0), (i32 1), $src)
>;
-let Predicates = [isVI] in {
+let Predicates = [Has16BitInsts] in {
defm : Arithmetic_i16_Pats<add, V_ADD_U16_e64>;
defm : Arithmetic_i16_Pats<mul, V_MUL_LO_U16_e64>;
@@ -513,7 +553,7 @@ def : Pat<
(V_SUB_U16_e64 $src0, NegSubInlineConst16:$src1)
>;
-} // End Predicates = [isVI]
+} // End Predicates = [Has16BitInsts]
//===----------------------------------------------------------------------===//
// SI
@@ -686,15 +726,21 @@ multiclass VOP2_SDWA_Real <bits<6> op> {
VOP2_SDWAe <op{5-0}, !cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
}
+multiclass VOP2_SDWA9_Real <bits<6> op> {
+ def _sdwa_gfx9 :
+ VOP_SDWA9_Real <!cast<VOP2_SDWA9_Pseudo>(NAME#"_sdwa9")>,
+ VOP2_SDWA9Ae <op{5-0}, !cast<VOP2_SDWA9_Pseudo>(NAME#"_sdwa9").Pfl>;
+}
+
multiclass VOP2be_Real_e32e64_vi <bits<6> op> :
- Base_VOP2be_Real_e32e64_vi<op>, VOP2_SDWA_Real<op> {
+ Base_VOP2be_Real_e32e64_vi<op>, VOP2_SDWA_Real<op>, VOP2_SDWA9_Real<op> {
// For now left dpp only for asm/dasm
// TODO: add corresponding pseudo
def _dpp : VOP2_DPP<op, !cast<VOP2_Pseudo>(NAME#"_e32")>;
}
multiclass VOP2_Real_e32e64_vi <bits<6> op> :
- Base_VOP2_Real_e32e64_vi<op>, VOP2_SDWA_Real<op> {
+ Base_VOP2_Real_e32e64_vi<op>, VOP2_SDWA_Real<op>, VOP2_SDWA9_Real<op> {
// For now left dpp only for asm/dasm
// TODO: add corresponding pseudo
def _dpp : VOP2_DPP<op, !cast<VOP2_Pseudo>(NAME#"_e32")>;
diff --git a/lib/Target/AMDGPU/VOP3Instructions.td b/lib/Target/AMDGPU/VOP3Instructions.td
index c0b5069948fb..001fc960b228 100644
--- a/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/lib/Target/AMDGPU/VOP3Instructions.td
@@ -243,7 +243,7 @@ def V_MAD_I64_I32 : VOP3Inst <"v_mad_i64_i32", VOP3b_I64_I1_I32_I32_I64>;
} // End SubtargetPredicate = isCIVI
-let SubtargetPredicate = isVI in {
+let SubtargetPredicate = Has16BitInsts in {
let isCommutable = 1 in {
@@ -258,12 +258,13 @@ def V_MAD_U16 : VOP3Inst <"v_mad_u16", VOP3_Profile<VOP_I16_I16_I16_I16>>;
def V_MAD_I16 : VOP3Inst <"v_mad_i16", VOP3_Profile<VOP_I16_I16_I16_I16>>;
} // End isCommutable = 1
+} // End SubtargetPredicate = Has16BitInsts
+let SubtargetPredicate = isVI in {
def V_PERM_B32 : VOP3Inst <"v_perm_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
-
} // End SubtargetPredicate = isVI
-let Predicates = [isVI] in {
+let Predicates = [Has16BitInsts] in {
multiclass Ternary_i16_Pats <SDPatternOperator op1, SDPatternOperator op2,
Instruction inst, SDPatternOperator op3> {
@@ -288,7 +289,7 @@ def : Pat<
defm: Ternary_i16_Pats<mul, add, V_MAD_U16, zext>;
defm: Ternary_i16_Pats<mul, add, V_MAD_I16, sext>;
-} // End Predicates = [isVI]
+} // End Predicates = [Has16BitInsts]
let SubtargetPredicate = isGFX9 in {
def V_PACK_B32_F16 : VOP3Inst <"v_pack_b32_f16", VOP3_Profile<VOP_B32_F16_F16>>;
diff --git a/lib/Target/AMDGPU/VOPCInstructions.td b/lib/Target/AMDGPU/VOPCInstructions.td
index a3550a63677b..cd347b86d305 100644
--- a/lib/Target/AMDGPU/VOPCInstructions.td
+++ b/lib/Target/AMDGPU/VOPCInstructions.td
@@ -34,6 +34,17 @@ class VOPC_SDWAe <bits<8> op, VOPProfile P> : VOP_SDWAe <P> {
let Inst{44-43} = SDWA.UNUSED_PRESERVE;
}
+class VOPC_SDWA9e <bits<8> op, VOPProfile P> : VOP_SDWA9Be <P> {
+ bits<9> src1;
+
+ let Inst{8-0} = 0xf9; // sdwa
+ let Inst{16-9} = !if(P.HasSrc1, src1{7-0}, 0);
+ let Inst{24-17} = op;
+ let Inst{31-25} = 0x3e; // encoding
+ let Inst{63} = !if(P.HasSrc1, src1{8}, 0); // src1_sgpr
+}
+
+
//===----------------------------------------------------------------------===//
// VOPC classes
//===----------------------------------------------------------------------===//
@@ -102,6 +113,11 @@ class VOPC_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
let AsmMatchConverter = "cvtSdwaVOPC";
}
+class VOPC_SDWA9_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
+ VOP_SDWA9_Pseudo <OpName, P, pattern> {
+ let AsmMatchConverter = "cvtSdwaVOPC";
+}
+
// This class is used only with VOPC instructions. Use $sdst for out operand
class VOPCInstAlias <VOP3_Pseudo ps, Instruction inst, VOPProfile p = ps.Pfl> :
InstAlias <ps.OpName#" "#p.Asm32, (inst)>, PredicateControl {
@@ -173,6 +189,13 @@ multiclass VOPC_Pseudos <string opName,
let isConvergent = DefExec;
let isCompare = 1;
}
+
+ def _sdwa9 : VOPC_SDWA9_Pseudo <opName, P> {
+ let Defs = !if(DefExec, [VCC, EXEC], [VCC]);
+ let SchedRW = P.Schedule;
+ let isConvergent = DefExec;
+ let isCompare = 1;
+ }
}
def VOPC_I1_F16_F16 : VOPC_Profile<[Write32Bit], f16>;
@@ -520,7 +543,11 @@ class VOPC_Class_Profile<list<SchedReadWrite> sched, ValueType vt> :
let InsSDWA = (ins Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0,
Src1ModSDWA:$src1_modifiers, Src1SDWA:$src1,
clampmod:$clamp, src0_sel:$src0_sel, src1_sel:$src1_sel);
+ let InsSDWA9 = (ins Src0ModSDWA9:$src0_modifiers, Src0SDWA9:$src0,
+ Src1ModSDWA9:$src1_modifiers, Src1SDWA9:$src1,
+ src0_sel:$src0_sel, src1_sel:$src1_sel);
let AsmSDWA = " vcc, $src0_modifiers, $src1_modifiers$clamp $src0_sel $src1_sel";
+ //let AsmSDWA9 = " $sdst, $src0_modifiers, $src1_modifiers $src0_sel $src1_sel";
let HasSrc1Mods = 0;
let HasClamp = 0;
let HasOMod = 0;
@@ -553,6 +580,12 @@ multiclass VOPC_Class_Pseudos <string opName, VOPC_Profile p, bit DefExec> {
let SchedRW = p.Schedule;
let isConvergent = DefExec;
}
+
+ def _sdwa9 : VOPC_SDWA9_Pseudo <opName, p> {
+ let Defs = !if(DefExec, [VCC, EXEC], [VCC]);
+ let SchedRW = p.Schedule;
+ let isConvergent = DefExec;
+ }
}
def VOPC_I1_F16_I32 : VOPC_Class_Profile<[Write32Bit], f16>;
@@ -920,6 +953,10 @@ multiclass VOPC_Real_vi <bits<10> op> {
VOP_SDWA_Real <!cast<VOPC_SDWA_Pseudo>(NAME#"_sdwa")>,
VOPC_SDWAe <op{7-0}, !cast<VOPC_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
+ def _sdwa_gfx9 :
+ VOP_SDWA9_Real <!cast<VOPC_SDWA9_Pseudo>(NAME#"_sdwa9")>,
+ VOPC_SDWA9e <op{7-0}, !cast<VOPC_SDWA9_Pseudo>(NAME#"_sdwa9").Pfl>;
+
def : VOPCInstAlias <!cast<VOP3_Pseudo>(NAME#"_e64"),
!cast<Instruction>(NAME#"_e32_vi")> {
let AssemblerPredicate = isVI;
diff --git a/lib/Target/AMDGPU/VOPInstructions.td b/lib/Target/AMDGPU/VOPInstructions.td
index 69906c419db3..4da654f84f9d 100644
--- a/lib/Target/AMDGPU/VOPInstructions.td
+++ b/lib/Target/AMDGPU/VOPInstructions.td
@@ -293,11 +293,52 @@ class VOP_SDWAe<VOPProfile P> : Enc64 {
let Inst{44-43} = !if(P.EmitDst, dst_unused{1-0}, SDWA.UNUSED_PRESERVE);
let Inst{45} = !if(P.HasSDWAClamp, clamp{0}, 0);
let Inst{50-48} = !if(P.HasSrc0, src0_sel{2-0}, SDWA.DWORD);
- let Inst{53-52} = !if(P.HasSrc0FloatMods, src0_modifiers{1-0}, 0);
let Inst{51} = !if(P.HasSrc0IntMods, src0_modifiers{0}, 0);
+ let Inst{53-52} = !if(P.HasSrc0FloatMods, src0_modifiers{1-0}, 0);
let Inst{58-56} = !if(P.HasSrc1, src1_sel{2-0}, SDWA.DWORD);
+ let Inst{59} = !if(P.HasSrc1IntMods, src1_modifiers{0}, 0);
let Inst{61-60} = !if(P.HasSrc1FloatMods, src1_modifiers{1-0}, 0);
+}
+
+// gfx9 SDWA basic encoding
+class VOP_SDWA9e<VOPProfile P> : Enc64 {
+ bits<9> src0; // {src0_sgpr{0}, src0{7-0}}
+ bits<3> src0_sel;
+ bits<2> src0_modifiers; // float: {abs,neg}, int {sext}
+ bits<3> src1_sel;
+ bits<2> src1_modifiers;
+ bits<1> src1_sgpr;
+
+ let Inst{39-32} = !if(P.HasSrc0, src0{7-0}, 0);
+ let Inst{50-48} = !if(P.HasSrc0, src0_sel{2-0}, SDWA.DWORD);
+ let Inst{51} = !if(P.HasSrc0IntMods, src0_modifiers{0}, 0);
+ let Inst{53-52} = !if(P.HasSrc0FloatMods, src0_modifiers{1-0}, 0);
+ let Inst{55} = !if(P.HasSrc0, src0{8}, 0);
+ let Inst{58-56} = !if(P.HasSrc1, src1_sel{2-0}, SDWA.DWORD);
let Inst{59} = !if(P.HasSrc1IntMods, src1_modifiers{0}, 0);
+ let Inst{61-60} = !if(P.HasSrc1FloatMods, src1_modifiers{1-0}, 0);
+ let Inst{63} = 0; // src1_sgpr - should be specified in subclass
+}
+
+// gfx9 SDWA-A
+class VOP_SDWA9Ae<VOPProfile P> : VOP_SDWA9e<P> {
+ bits<3> dst_sel;
+ bits<2> dst_unused;
+ bits<1> clamp;
+ bits<2> omod;
+
+ let Inst{42-40} = !if(P.EmitDst, dst_sel{2-0}, SDWA.DWORD);
+ let Inst{44-43} = !if(P.EmitDst, dst_unused{1-0}, SDWA.UNUSED_PRESERVE);
+ let Inst{45} = !if(P.HasSDWAClamp, clamp{0}, 0);
+ let Inst{47-46} = !if(P.HasSDWAOMod, omod{1-0}, 0);
+}
+
+// gfx9 SDWA-B
+class VOP_SDWA9Be<VOPProfile P> : VOP_SDWA9e<P> {
+ bits<8> sdst; // {vcc_sdst{0}, sdst{6-0}}
+
+ let Inst{46-40} = !if(P.EmitDst, sdst{6-0}, 0);
+ let Inst{47} = !if(P.EmitDst, sdst{7}, 0);
}
class VOP_SDWA_Pseudo <string opName, VOPProfile P, list<dag> pattern=[]> :
@@ -331,6 +372,50 @@ class VOP_SDWA_Pseudo <string opName, VOPProfile P, list<dag> pattern=[]> :
VOPProfile Pfl = P;
}
+// GFX9 adds two features to SDWA:
+// 1. Add 3 fields to the SDWA microcode word: S0, S1 and OMOD.
+// a. S0 and S1 indicate that source 0 and 1 respectively are SGPRs rather
+// than VGPRs (at most 1 can be an SGPR);
+// b. OMOD is the standard output modifier (result *2, *4, /2)
+// 2. Add a new version of the SDWA microcode word for VOPC: SDWAB. This
+// replaces OMOD and the dest fields with SD and SDST (SGPR destination)
+// field.
+// a. When SD=1, the SDST is used as the destination for the compare result;
+// b. When SD=0, VCC is used.
+//
+// In GFX9, V_MAC_F16, V_MAC_F32 opcodes cannot be used with SDWA
+
+class VOP_SDWA9_Pseudo <string opName, VOPProfile P, list<dag> pattern=[]> :
+ InstSI <P.OutsSDWA9, P.InsSDWA9, "", pattern>,
+ VOP <opName>,
+ SIMCInstr <opName#"_sdwa9", SIEncodingFamily.NONE>,
+ MnemonicAlias <opName#"_sdwa9", opName> {
+
+ let isPseudo = 1;
+ let isCodeGenOnly = 1;
+ let UseNamedOperandTable = 1;
+
+ string Mnemonic = opName;
+ string AsmOperands = P.AsmSDWA9;
+
+ let Size = 8;
+ let mayLoad = 0;
+ let mayStore = 0;
+ let hasSideEffects = 0;
+
+ let VALU = 1;
+ let SDWA = 1;
+ let Uses = [EXEC];
+
+ let SubtargetPredicate = !if(P.HasSDWA9, HasSDWA9, DisableInst);
+ let AssemblerPredicate = !if(P.HasSDWA9, HasSDWA9, DisableInst);
+ let AsmVariantName = !if(P.HasSDWA9, AMDGPUAsmVariants.SDWA9,
+ AMDGPUAsmVariants.Disable);
+ let DecoderNamespace = "SDWA9";
+
+ VOPProfile Pfl = P;
+}
+
class VOP_SDWA_Real <VOP_SDWA_Pseudo ps> :
InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>,
SIMCInstr <ps.PseudoInstr, SIEncodingFamily.VI> {
@@ -358,6 +443,33 @@ class VOP_SDWA_Real <VOP_SDWA_Pseudo ps> :
let TSFlags = ps.TSFlags;
}
+class VOP_SDWA9_Real <VOP_SDWA9_Pseudo ps> :
+ InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>,
+ SIMCInstr <ps.PseudoInstr, SIEncodingFamily.VI> {
+
+ let isPseudo = 0;
+ let isCodeGenOnly = 0;
+
+ let Defs = ps.Defs;
+ let Uses = ps.Uses;
+ let SchedRW = ps.SchedRW;
+ let hasSideEffects = ps.hasSideEffects;
+
+ let Constraints = ps.Constraints;
+ let DisableEncoding = ps.DisableEncoding;
+
+ // Copy relevant pseudo op flags
+ let SubtargetPredicate = ps.SubtargetPredicate;
+ let AssemblerPredicate = ps.AssemblerPredicate;
+ let AsmMatchConverter = ps.AsmMatchConverter;
+ let AsmVariantName = ps.AsmVariantName;
+ let UseNamedOperandTable = ps.UseNamedOperandTable;
+ let DecoderNamespace = ps.DecoderNamespace;
+ let Constraints = ps.Constraints;
+ let DisableEncoding = ps.DisableEncoding;
+ let TSFlags = ps.TSFlags;
+}
+
class VOP_DPPe<VOPProfile P> : Enc64 {
bits<2> src0_modifiers;
bits<8> src0;
diff --git a/lib/Target/ARM/ARMCallLowering.cpp b/lib/Target/ARM/ARMCallLowering.cpp
index 46ac4d0ad933..31a2f499a9a7 100644
--- a/lib/Target/ARM/ARMCallLowering.cpp
+++ b/lib/Target/ARM/ARMCallLowering.cpp
@@ -34,6 +34,9 @@ ARMCallLowering::ARMCallLowering(const ARMTargetLowering &TLI)
static bool isSupportedType(const DataLayout &DL, const ARMTargetLowering &TLI,
Type *T) {
+ if (T->isArrayTy())
+ return true;
+
EVT VT = TLI.getValueType(DL, T, true);
if (!VT.isSimple() || VT.isVector() ||
!(VT.isInteger() || VT.isFloatingPoint()))
@@ -148,23 +151,47 @@ struct OutgoingValueHandler : public CallLowering::ValueHandler {
};
} // End anonymous namespace.
-void ARMCallLowering::splitToValueTypes(const ArgInfo &OrigArg,
- SmallVectorImpl<ArgInfo> &SplitArgs,
- const DataLayout &DL,
- MachineRegisterInfo &MRI) const {
+void ARMCallLowering::splitToValueTypes(
+ const ArgInfo &OrigArg, SmallVectorImpl<ArgInfo> &SplitArgs,
+ MachineFunction &MF, const SplitArgTy &PerformArgSplit) const {
const ARMTargetLowering &TLI = *getTLI<ARMTargetLowering>();
LLVMContext &Ctx = OrigArg.Ty->getContext();
+ const DataLayout &DL = MF.getDataLayout();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ const Function *F = MF.getFunction();
SmallVector<EVT, 4> SplitVTs;
SmallVector<uint64_t, 4> Offsets;
ComputeValueVTs(TLI, DL, OrigArg.Ty, SplitVTs, &Offsets, 0);
- assert(SplitVTs.size() == 1 && "Unsupported type");
+ if (SplitVTs.size() == 1) {
+ // Even if there is no splitting to do, we still want to replace the
+ // original type (e.g. pointer type -> integer).
+ SplitArgs.emplace_back(OrigArg.Reg, SplitVTs[0].getTypeForEVT(Ctx),
+ OrigArg.Flags, OrigArg.IsFixed);
+ return;
+ }
+
+ unsigned FirstRegIdx = SplitArgs.size();
+ for (unsigned i = 0, e = SplitVTs.size(); i != e; ++i) {
+ EVT SplitVT = SplitVTs[i];
+ Type *SplitTy = SplitVT.getTypeForEVT(Ctx);
+ auto Flags = OrigArg.Flags;
+ bool NeedsConsecutiveRegisters =
+ TLI.functionArgumentNeedsConsecutiveRegisters(
+ SplitTy, F->getCallingConv(), F->isVarArg());
+ if (NeedsConsecutiveRegisters) {
+ Flags.setInConsecutiveRegs();
+ if (i == e - 1)
+ Flags.setInConsecutiveRegsLast();
+ }
+ SplitArgs.push_back(
+ ArgInfo{MRI.createGenericVirtualRegister(getLLTForType(*SplitTy, DL)),
+ SplitTy, Flags, OrigArg.IsFixed});
+ }
- // Even if there is no splitting to do, we still want to replace the original
- // type (e.g. pointer type -> integer).
- SplitArgs.emplace_back(OrigArg.Reg, SplitVTs[0].getTypeForEVT(Ctx),
- OrigArg.Flags, OrigArg.IsFixed);
+ for (unsigned i = 0; i < Offsets.size(); ++i)
+ PerformArgSplit(SplitArgs[FirstRegIdx + i].Reg, Offsets[i] * 8);
}
/// Lower the return value for the already existing \p Ret. This assumes that
@@ -187,7 +214,9 @@ bool ARMCallLowering::lowerReturnVal(MachineIRBuilder &MIRBuilder,
SmallVector<ArgInfo, 4> SplitVTs;
ArgInfo RetInfo(VReg, Val->getType());
setArgFlags(RetInfo, AttributeList::ReturnIndex, DL, F);
- splitToValueTypes(RetInfo, SplitVTs, DL, MF.getRegInfo());
+ splitToValueTypes(RetInfo, SplitVTs, MF, [&](unsigned Reg, uint64_t Offset) {
+ MIRBuilder.buildExtract(Reg, VReg, Offset);
+ });
CCAssignFn *AssignFn =
TLI.CCAssignFnForReturn(F.getCallingConv(), F.isVarArg());
@@ -307,6 +336,26 @@ struct IncomingValueHandler : public CallLowering::ValueHandler {
return 1;
}
+ /// Merge the values in \p SrcRegs into \p DstReg at offsets \p SrcOffsets.
+ /// Note that the source registers are not required to have homogeneous types,
+ /// so we use G_INSERT rather than G_MERGE_VALUES.
+ // FIXME: Use G_MERGE_VALUES if the types are homogeneous.
+ void mergeRegisters(unsigned DstReg, ArrayRef<unsigned> SrcRegs,
+ ArrayRef<uint64_t> SrcOffsets) {
+ LLT Ty = MRI.getType(DstReg);
+
+ unsigned Dst = MRI.createGenericVirtualRegister(Ty);
+ MIRBuilder.buildUndef(Dst);
+
+ for (unsigned i = 0; i < SrcRegs.size(); ++i) {
+ unsigned Tmp = MRI.createGenericVirtualRegister(Ty);
+ MIRBuilder.buildInsert(Tmp, Dst, SrcRegs[i], SrcOffsets[i]);
+ Dst = Tmp;
+ }
+
+ MIRBuilder.buildCopy(DstReg, Dst);
+ }
+
/// Marking a physical register as used is different between formal
/// parameters, where it's a basic block live-in, and call returns, where it's
/// an implicit-def of the call instruction.
@@ -335,6 +384,7 @@ bool ARMCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
return false;
auto &MF = MIRBuilder.getMF();
+ auto &MBB = MIRBuilder.getMBB();
auto DL = MF.getDataLayout();
auto &TLI = *getTLI<ARMTargetLowering>();
@@ -350,17 +400,34 @@ bool ARMCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
CCAssignFn *AssignFn =
TLI.CCAssignFnForCall(F.getCallingConv(), F.isVarArg());
+ FormalArgHandler ArgHandler(MIRBuilder, MIRBuilder.getMF().getRegInfo(),
+ AssignFn);
+
SmallVector<ArgInfo, 8> ArgInfos;
+ SmallVector<unsigned, 4> SplitRegs;
+ SmallVector<uint64_t, 4> RegOffsets;
unsigned Idx = 0;
for (auto &Arg : F.args()) {
ArgInfo AInfo(VRegs[Idx], Arg.getType());
setArgFlags(AInfo, Idx + AttributeList::FirstArgIndex, DL, F);
- splitToValueTypes(AInfo, ArgInfos, DL, MF.getRegInfo());
+
+ SplitRegs.clear();
+ RegOffsets.clear();
+
+ splitToValueTypes(AInfo, ArgInfos, MF, [&](unsigned Reg, uint64_t Offset) {
+ SplitRegs.push_back(Reg);
+ RegOffsets.push_back(Offset);
+ });
+
+ if (!SplitRegs.empty())
+ ArgHandler.mergeRegisters(VRegs[Idx], SplitRegs, RegOffsets);
+
Idx++;
}
- FormalArgHandler ArgHandler(MIRBuilder, MIRBuilder.getMF().getRegInfo(),
- AssignFn);
+ if (!MBB.empty())
+ MIRBuilder.setInstr(*MBB.begin());
+
return handleAssignments(MIRBuilder, ArgInfos, ArgHandler);
}
@@ -407,7 +474,9 @@ bool ARMCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
if (!Arg.IsFixed)
return false;
- splitToValueTypes(Arg, ArgInfos, DL, MRI);
+ splitToValueTypes(Arg, ArgInfos, MF, [&](unsigned Reg, uint64_t Offset) {
+ MIRBuilder.buildExtract(Reg, Arg.Reg, Offset);
+ });
}
auto ArgAssignFn = TLI.CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
@@ -423,12 +492,24 @@ bool ARMCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
return false;
ArgInfos.clear();
- splitToValueTypes(OrigRet, ArgInfos, DL, MRI);
+ SmallVector<uint64_t, 8> RegOffsets;
+ SmallVector<unsigned, 8> SplitRegs;
+ splitToValueTypes(OrigRet, ArgInfos, MF,
+ [&](unsigned Reg, uint64_t Offset) {
+ RegOffsets.push_back(Offset);
+ SplitRegs.push_back(Reg);
+ });
auto RetAssignFn = TLI.CCAssignFnForReturn(CallConv, /*IsVarArg=*/false);
CallReturnHandler RetHandler(MIRBuilder, MRI, MIB, RetAssignFn);
if (!handleAssignments(MIRBuilder, ArgInfos, RetHandler))
return false;
+
+ if (!RegOffsets.empty()) {
+ // We have split the value and allocated each individual piece, now build
+ // it up again.
+ RetHandler.mergeRegisters(OrigRet.Reg, SplitRegs, RegOffsets);
+ }
}
// We now know the size of the stack - update the ADJCALLSTACKDOWN
diff --git a/lib/Target/ARM/ARMCallLowering.h b/lib/Target/ARM/ARMCallLowering.h
index 6404c7a2689e..f5a6872336f6 100644
--- a/lib/Target/ARM/ARMCallLowering.h
+++ b/lib/Target/ARM/ARMCallLowering.h
@@ -42,11 +42,14 @@ private:
bool lowerReturnVal(MachineIRBuilder &MIRBuilder, const Value *Val,
unsigned VReg, MachineInstrBuilder &Ret) const;
+ typedef std::function<void(unsigned Reg, uint64_t Offset)> SplitArgTy;
+
/// Split an argument into one or more arguments that the CC lowering can cope
/// with (e.g. replace pointers with integers).
void splitToValueTypes(const ArgInfo &OrigArg,
SmallVectorImpl<ArgInfo> &SplitArgs,
- const DataLayout &DL, MachineRegisterInfo &MRI) const;
+ MachineFunction &MF,
+ const SplitArgTy &PerformArgSplit) const;
};
} // End of namespace llvm
#endif
diff --git a/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/lib/Target/ARM/ARMExpandPseudoInsts.cpp
index 78a9144bd321..90baabcdb652 100644
--- a/lib/Target/ARM/ARMExpandPseudoInsts.cpp
+++ b/lib/Target/ARM/ARMExpandPseudoInsts.cpp
@@ -779,7 +779,7 @@ bool ARMExpandPseudo::ExpandCMP_SWAP(MachineBasicBlock &MBB,
MachineOperand &Desired = MI.getOperand(3);
MachineOperand &New = MI.getOperand(4);
- LivePhysRegs LiveRegs(&TII->getRegisterInfo());
+ LivePhysRegs LiveRegs(TII->getRegisterInfo());
LiveRegs.addLiveOuts(MBB);
for (auto I = std::prev(MBB.end()); I != MBBI; --I)
LiveRegs.stepBackward(*I);
@@ -903,7 +903,7 @@ bool ARMExpandPseudo::ExpandCMP_SWAP_64(MachineBasicBlock &MBB,
unsigned DesiredLo = TRI->getSubReg(Desired.getReg(), ARM::gsub_0);
unsigned DesiredHi = TRI->getSubReg(Desired.getReg(), ARM::gsub_1);
- LivePhysRegs LiveRegs(&TII->getRegisterInfo());
+ LivePhysRegs LiveRegs(TII->getRegisterInfo());
LiveRegs.addLiveOuts(MBB);
for (auto I = std::prev(MBB.end()); I != MBBI; --I)
LiveRegs.stepBackward(*I);
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index f8b584db7b99..62e774d869da 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -127,7 +127,7 @@ static cl::opt<bool> EnableConstpoolPromotion(
"arm-promote-constant", cl::Hidden,
cl::desc("Enable / disable promotion of unnamed_addr constants into "
"constant pools"),
- cl::init(true));
+ cl::init(false)); // FIXME: set to true by default once PR32780 is fixed
static cl::opt<unsigned> ConstpoolPromotionMaxSize(
"arm-promote-constant-max-size", cl::Hidden,
cl::desc("Maximum size of constant to promote into a constant pool"),
@@ -12147,12 +12147,6 @@ EVT ARMTargetLowering::getOptimalMemOpType(uint64_t Size,
}
}
- // Lowering to i32/i16 if the size permits.
- if (Size >= 4)
- return MVT::i32;
- else if (Size >= 2)
- return MVT::i16;
-
// Let the target-independent logic figure it out.
return MVT::Other;
}
diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h
index 875c06210ae6..26da528c19e6 100644
--- a/lib/Target/ARM/ARMISelLowering.h
+++ b/lib/Target/ARM/ARMISelLowering.h
@@ -510,7 +510,7 @@ class InstrItineraryData;
bool canCombineStoreAndExtract(Type *VectorTy, Value *Idx,
unsigned &Cost) const override;
- bool canMergeStoresTo(EVT MemVT) const override {
+ bool canMergeStoresTo(unsigned AddressSpace, EVT MemVT) const override {
// Do not merge to larger than i32.
return (MemVT.getSizeInBits() <= 32);
}
diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td
index 51290e5a5b93..858136a82078 100644
--- a/lib/Target/ARM/ARMInstrNEON.td
+++ b/lib/Target/ARM/ARMInstrNEON.td
@@ -674,7 +674,7 @@ let mayLoad = 1, hasSideEffects = 0, hasExtraDefRegAllocReq = 1 in {
class VLD1D<bits<4> op7_4, string Dt, Operand AddrMode>
: NLdSt<0,0b10,0b0111,op7_4, (outs VecListOneD:$Vd),
(ins AddrMode:$Rn), IIC_VLD1,
- "vld1", Dt, "$Vd, $Rn", "", []> {
+ "vld1", Dt, "$Vd, $Rn", "", []>, Sched<[WriteVLD1]> {
let Rm = 0b1111;
let Inst{4} = Rn{4};
let DecoderMethod = "DecodeVLDST1Instruction";
@@ -682,7 +682,7 @@ class VLD1D<bits<4> op7_4, string Dt, Operand AddrMode>
class VLD1Q<bits<4> op7_4, string Dt, Operand AddrMode>
: NLdSt<0,0b10,0b1010,op7_4, (outs VecListDPair:$Vd),
(ins AddrMode:$Rn), IIC_VLD1x2,
- "vld1", Dt, "$Vd, $Rn", "", []> {
+ "vld1", Dt, "$Vd, $Rn", "", []>, Sched<[WriteVLD2]> {
let Rm = 0b1111;
let Inst{5-4} = Rn{5-4};
let DecoderMethod = "DecodeVLDST1Instruction";
@@ -703,7 +703,7 @@ multiclass VLD1DWB<bits<4> op7_4, string Dt, Operand AddrMode> {
def _fixed : NLdSt<0,0b10, 0b0111,op7_4, (outs VecListOneD:$Vd, GPR:$wb),
(ins AddrMode:$Rn), IIC_VLD1u,
"vld1", Dt, "$Vd, $Rn!",
- "$Rn.addr = $wb", []> {
+ "$Rn.addr = $wb", []>, Sched<[WriteVLD1]> {
let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
let Inst{4} = Rn{4};
let DecoderMethod = "DecodeVLDST1Instruction";
@@ -711,7 +711,7 @@ multiclass VLD1DWB<bits<4> op7_4, string Dt, Operand AddrMode> {
def _register : NLdSt<0,0b10,0b0111,op7_4, (outs VecListOneD:$Vd, GPR:$wb),
(ins AddrMode:$Rn, rGPR:$Rm), IIC_VLD1u,
"vld1", Dt, "$Vd, $Rn, $Rm",
- "$Rn.addr = $wb", []> {
+ "$Rn.addr = $wb", []>, Sched<[WriteVLD1]> {
let Inst{4} = Rn{4};
let DecoderMethod = "DecodeVLDST1Instruction";
}
@@ -720,7 +720,7 @@ multiclass VLD1QWB<bits<4> op7_4, string Dt, Operand AddrMode> {
def _fixed : NLdSt<0,0b10,0b1010,op7_4, (outs VecListDPair:$Vd, GPR:$wb),
(ins AddrMode:$Rn), IIC_VLD1x2u,
"vld1", Dt, "$Vd, $Rn!",
- "$Rn.addr = $wb", []> {
+ "$Rn.addr = $wb", []>, Sched<[WriteVLD2]> {
let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
let Inst{5-4} = Rn{5-4};
let DecoderMethod = "DecodeVLDST1Instruction";
@@ -728,7 +728,7 @@ multiclass VLD1QWB<bits<4> op7_4, string Dt, Operand AddrMode> {
def _register : NLdSt<0,0b10,0b1010,op7_4, (outs VecListDPair:$Vd, GPR:$wb),
(ins AddrMode:$Rn, rGPR:$Rm), IIC_VLD1x2u,
"vld1", Dt, "$Vd, $Rn, $Rm",
- "$Rn.addr = $wb", []> {
+ "$Rn.addr = $wb", []>, Sched<[WriteVLD2]> {
let Inst{5-4} = Rn{5-4};
let DecoderMethod = "DecodeVLDST1Instruction";
}
@@ -747,7 +747,7 @@ defm VLD1q64wb : VLD1QWB<{1,1,?,?}, "64", addrmode6align64or128>;
class VLD1D3<bits<4> op7_4, string Dt, Operand AddrMode>
: NLdSt<0,0b10,0b0110,op7_4, (outs VecListThreeD:$Vd),
(ins AddrMode:$Rn), IIC_VLD1x3, "vld1", Dt,
- "$Vd, $Rn", "", []> {
+ "$Vd, $Rn", "", []>, Sched<[WriteVLD3]> {
let Rm = 0b1111;
let Inst{4} = Rn{4};
let DecoderMethod = "DecodeVLDST1Instruction";
@@ -756,7 +756,7 @@ multiclass VLD1D3WB<bits<4> op7_4, string Dt, Operand AddrMode> {
def _fixed : NLdSt<0,0b10,0b0110, op7_4, (outs VecListThreeD:$Vd, GPR:$wb),
(ins AddrMode:$Rn), IIC_VLD1x2u,
"vld1", Dt, "$Vd, $Rn!",
- "$Rn.addr = $wb", []> {
+ "$Rn.addr = $wb", []>, Sched<[WriteVLD3]> {
let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
let Inst{4} = Rn{4};
let DecoderMethod = "DecodeVLDST1Instruction";
@@ -764,7 +764,7 @@ multiclass VLD1D3WB<bits<4> op7_4, string Dt, Operand AddrMode> {
def _register : NLdSt<0,0b10,0b0110,op7_4, (outs VecListThreeD:$Vd, GPR:$wb),
(ins AddrMode:$Rn, rGPR:$Rm), IIC_VLD1x2u,
"vld1", Dt, "$Vd, $Rn, $Rm",
- "$Rn.addr = $wb", []> {
+ "$Rn.addr = $wb", []>, Sched<[WriteVLD3]> {
let Inst{4} = Rn{4};
let DecoderMethod = "DecodeVLDST1Instruction";
}
@@ -780,15 +780,15 @@ defm VLD1d16Twb : VLD1D3WB<{0,1,0,?}, "16", addrmode6align64>;
defm VLD1d32Twb : VLD1D3WB<{1,0,0,?}, "32", addrmode6align64>;
defm VLD1d64Twb : VLD1D3WB<{1,1,0,?}, "64", addrmode6align64>;
-def VLD1d64TPseudo : VLDQQPseudo<IIC_VLD1x3>;
-def VLD1d64TPseudoWB_fixed : VLDQQWBfixedPseudo<IIC_VLD1x3>;
-def VLD1d64TPseudoWB_register : VLDQQWBregisterPseudo<IIC_VLD1x3>;
+def VLD1d64TPseudo : VLDQQPseudo<IIC_VLD1x3>, Sched<[WriteVLD3]>;
+def VLD1d64TPseudoWB_fixed : VLDQQWBfixedPseudo<IIC_VLD1x3>, Sched<[WriteVLD3]>;
+def VLD1d64TPseudoWB_register : VLDQQWBregisterPseudo<IIC_VLD1x3>, Sched<[WriteVLD3]>;
// ...with 4 registers
class VLD1D4<bits<4> op7_4, string Dt, Operand AddrMode>
: NLdSt<0, 0b10, 0b0010, op7_4, (outs VecListFourD:$Vd),
(ins AddrMode:$Rn), IIC_VLD1x4, "vld1", Dt,
- "$Vd, $Rn", "", []> {
+ "$Vd, $Rn", "", []>, Sched<[WriteVLD4]> {
let Rm = 0b1111;
let Inst{5-4} = Rn{5-4};
let DecoderMethod = "DecodeVLDST1Instruction";
@@ -797,7 +797,7 @@ multiclass VLD1D4WB<bits<4> op7_4, string Dt, Operand AddrMode> {
def _fixed : NLdSt<0,0b10,0b0010, op7_4, (outs VecListFourD:$Vd, GPR:$wb),
(ins AddrMode:$Rn), IIC_VLD1x2u,
"vld1", Dt, "$Vd, $Rn!",
- "$Rn.addr = $wb", []> {
+ "$Rn.addr = $wb", []>, Sched<[WriteVLD4]> {
let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
let Inst{5-4} = Rn{5-4};
let DecoderMethod = "DecodeVLDST1Instruction";
@@ -805,7 +805,7 @@ multiclass VLD1D4WB<bits<4> op7_4, string Dt, Operand AddrMode> {
def _register : NLdSt<0,0b10,0b0010,op7_4, (outs VecListFourD:$Vd, GPR:$wb),
(ins AddrMode:$Rn, rGPR:$Rm), IIC_VLD1x2u,
"vld1", Dt, "$Vd, $Rn, $Rm",
- "$Rn.addr = $wb", []> {
+ "$Rn.addr = $wb", []>, Sched<[WriteVLD4]> {
let Inst{5-4} = Rn{5-4};
let DecoderMethod = "DecodeVLDST1Instruction";
}
@@ -821,9 +821,9 @@ defm VLD1d16Qwb : VLD1D4WB<{0,1,?,?}, "16", addrmode6align64or128or256>;
defm VLD1d32Qwb : VLD1D4WB<{1,0,?,?}, "32", addrmode6align64or128or256>;
defm VLD1d64Qwb : VLD1D4WB<{1,1,?,?}, "64", addrmode6align64or128or256>;
-def VLD1d64QPseudo : VLDQQPseudo<IIC_VLD1x4>;
-def VLD1d64QPseudoWB_fixed : VLDQQWBfixedPseudo<IIC_VLD1x4>;
-def VLD1d64QPseudoWB_register : VLDQQWBregisterPseudo<IIC_VLD1x4>;
+def VLD1d64QPseudo : VLDQQPseudo<IIC_VLD1x4>, Sched<[WriteVLD4]>;
+def VLD1d64QPseudoWB_fixed : VLDQQWBfixedPseudo<IIC_VLD1x4>, Sched<[WriteVLD4]>;
+def VLD1d64QPseudoWB_register : VLDQQWBregisterPseudo<IIC_VLD1x4>, Sched<[WriteVLD4]>;
// VLD2 : Vector Load (multiple 2-element structures)
class VLD2<bits<4> op11_8, bits<4> op7_4, string Dt, RegisterOperand VdTy,
@@ -837,22 +837,22 @@ class VLD2<bits<4> op11_8, bits<4> op7_4, string Dt, RegisterOperand VdTy,
}
def VLD2d8 : VLD2<0b1000, {0,0,?,?}, "8", VecListDPair, IIC_VLD2,
- addrmode6align64or128>;
+ addrmode6align64or128>, Sched<[WriteVLD2]>;
def VLD2d16 : VLD2<0b1000, {0,1,?,?}, "16", VecListDPair, IIC_VLD2,
- addrmode6align64or128>;
+ addrmode6align64or128>, Sched<[WriteVLD2]>;
def VLD2d32 : VLD2<0b1000, {1,0,?,?}, "32", VecListDPair, IIC_VLD2,
- addrmode6align64or128>;
+ addrmode6align64or128>, Sched<[WriteVLD2]>;
def VLD2q8 : VLD2<0b0011, {0,0,?,?}, "8", VecListFourD, IIC_VLD2x2,
- addrmode6align64or128or256>;
+ addrmode6align64or128or256>, Sched<[WriteVLD4]>;
def VLD2q16 : VLD2<0b0011, {0,1,?,?}, "16", VecListFourD, IIC_VLD2x2,
- addrmode6align64or128or256>;
+ addrmode6align64or128or256>, Sched<[WriteVLD4]>;
def VLD2q32 : VLD2<0b0011, {1,0,?,?}, "32", VecListFourD, IIC_VLD2x2,
- addrmode6align64or128or256>;
+ addrmode6align64or128or256>, Sched<[WriteVLD4]>;
-def VLD2q8Pseudo : VLDQQPseudo<IIC_VLD2x2>;
-def VLD2q16Pseudo : VLDQQPseudo<IIC_VLD2x2>;
-def VLD2q32Pseudo : VLDQQPseudo<IIC_VLD2x2>;
+def VLD2q8Pseudo : VLDQQPseudo<IIC_VLD2x2>, Sched<[WriteVLD4]>;
+def VLD2q16Pseudo : VLDQQPseudo<IIC_VLD2x2>, Sched<[WriteVLD4]>;
+def VLD2q32Pseudo : VLDQQPseudo<IIC_VLD2x2>, Sched<[WriteVLD4]>;
// ...with address register writeback:
multiclass VLD2WB<bits<4> op11_8, bits<4> op7_4, string Dt,
@@ -875,45 +875,45 @@ multiclass VLD2WB<bits<4> op11_8, bits<4> op7_4, string Dt,
}
defm VLD2d8wb : VLD2WB<0b1000, {0,0,?,?}, "8", VecListDPair, IIC_VLD2u,
- addrmode6align64or128>;
+ addrmode6align64or128>, Sched<[WriteVLD2]>;
defm VLD2d16wb : VLD2WB<0b1000, {0,1,?,?}, "16", VecListDPair, IIC_VLD2u,
- addrmode6align64or128>;
+ addrmode6align64or128>, Sched<[WriteVLD2]>;
defm VLD2d32wb : VLD2WB<0b1000, {1,0,?,?}, "32", VecListDPair, IIC_VLD2u,
- addrmode6align64or128>;
+ addrmode6align64or128>, Sched<[WriteVLD2]>;
defm VLD2q8wb : VLD2WB<0b0011, {0,0,?,?}, "8", VecListFourD, IIC_VLD2x2u,
- addrmode6align64or128or256>;
+ addrmode6align64or128or256>, Sched<[WriteVLD4]>;
defm VLD2q16wb : VLD2WB<0b0011, {0,1,?,?}, "16", VecListFourD, IIC_VLD2x2u,
- addrmode6align64or128or256>;
+ addrmode6align64or128or256>, Sched<[WriteVLD4]>;
defm VLD2q32wb : VLD2WB<0b0011, {1,0,?,?}, "32", VecListFourD, IIC_VLD2x2u,
- addrmode6align64or128or256>;
+ addrmode6align64or128or256>, Sched<[WriteVLD4]>;
-def VLD2q8PseudoWB_fixed : VLDQQWBfixedPseudo<IIC_VLD2x2u>;
-def VLD2q16PseudoWB_fixed : VLDQQWBfixedPseudo<IIC_VLD2x2u>;
-def VLD2q32PseudoWB_fixed : VLDQQWBfixedPseudo<IIC_VLD2x2u>;
-def VLD2q8PseudoWB_register : VLDQQWBregisterPseudo<IIC_VLD2x2u>;
-def VLD2q16PseudoWB_register : VLDQQWBregisterPseudo<IIC_VLD2x2u>;
-def VLD2q32PseudoWB_register : VLDQQWBregisterPseudo<IIC_VLD2x2u>;
+def VLD2q8PseudoWB_fixed : VLDQQWBfixedPseudo<IIC_VLD2x2u>, Sched<[WriteVLD4]>;
+def VLD2q16PseudoWB_fixed : VLDQQWBfixedPseudo<IIC_VLD2x2u>, Sched<[WriteVLD4]>;
+def VLD2q32PseudoWB_fixed : VLDQQWBfixedPseudo<IIC_VLD2x2u>, Sched<[WriteVLD4]>;
+def VLD2q8PseudoWB_register : VLDQQWBregisterPseudo<IIC_VLD2x2u>, Sched<[WriteVLD4]>;
+def VLD2q16PseudoWB_register : VLDQQWBregisterPseudo<IIC_VLD2x2u>, Sched<[WriteVLD4]>;
+def VLD2q32PseudoWB_register : VLDQQWBregisterPseudo<IIC_VLD2x2u>, Sched<[WriteVLD4]>;
// ...with double-spaced registers
def VLD2b8 : VLD2<0b1001, {0,0,?,?}, "8", VecListDPairSpaced, IIC_VLD2,
- addrmode6align64or128>;
+ addrmode6align64or128>, Sched<[WriteVLD2]>;
def VLD2b16 : VLD2<0b1001, {0,1,?,?}, "16", VecListDPairSpaced, IIC_VLD2,
- addrmode6align64or128>;
+ addrmode6align64or128>, Sched<[WriteVLD2]>;
def VLD2b32 : VLD2<0b1001, {1,0,?,?}, "32", VecListDPairSpaced, IIC_VLD2,
- addrmode6align64or128>;
+ addrmode6align64or128>, Sched<[WriteVLD2]>;
defm VLD2b8wb : VLD2WB<0b1001, {0,0,?,?}, "8", VecListDPairSpaced, IIC_VLD2u,
- addrmode6align64or128>;
+ addrmode6align64or128>, Sched<[WriteVLD2]>;
defm VLD2b16wb : VLD2WB<0b1001, {0,1,?,?}, "16", VecListDPairSpaced, IIC_VLD2u,
- addrmode6align64or128>;
+ addrmode6align64or128>, Sched<[WriteVLD2]>;
defm VLD2b32wb : VLD2WB<0b1001, {1,0,?,?}, "32", VecListDPairSpaced, IIC_VLD2u,
- addrmode6align64or128>;
+ addrmode6align64or128>, Sched<[WriteVLD2]>;
// VLD3 : Vector Load (multiple 3-element structures)
class VLD3D<bits<4> op11_8, bits<4> op7_4, string Dt>
: NLdSt<0, 0b10, op11_8, op7_4, (outs DPR:$Vd, DPR:$dst2, DPR:$dst3),
(ins addrmode6:$Rn), IIC_VLD3,
- "vld3", Dt, "\\{$Vd, $dst2, $dst3\\}, $Rn", "", []> {
+ "vld3", Dt, "\\{$Vd, $dst2, $dst3\\}, $Rn", "", []>, Sched<[WriteVLD3]> {
let Rm = 0b1111;
let Inst{4} = Rn{4};
let DecoderMethod = "DecodeVLDST3Instruction";
@@ -923,9 +923,9 @@ def VLD3d8 : VLD3D<0b0100, {0,0,0,?}, "8">;
def VLD3d16 : VLD3D<0b0100, {0,1,0,?}, "16">;
def VLD3d32 : VLD3D<0b0100, {1,0,0,?}, "32">;
-def VLD3d8Pseudo : VLDQQPseudo<IIC_VLD3>;
-def VLD3d16Pseudo : VLDQQPseudo<IIC_VLD3>;
-def VLD3d32Pseudo : VLDQQPseudo<IIC_VLD3>;
+def VLD3d8Pseudo : VLDQQPseudo<IIC_VLD3>, Sched<[WriteVLD3]>;
+def VLD3d16Pseudo : VLDQQPseudo<IIC_VLD3>, Sched<[WriteVLD3]>;
+def VLD3d32Pseudo : VLDQQPseudo<IIC_VLD3>, Sched<[WriteVLD3]>;
// ...with address register writeback:
class VLD3DWB<bits<4> op11_8, bits<4> op7_4, string Dt>
@@ -933,7 +933,7 @@ class VLD3DWB<bits<4> op11_8, bits<4> op7_4, string Dt>
(outs DPR:$Vd, DPR:$dst2, DPR:$dst3, GPR:$wb),
(ins addrmode6:$Rn, am6offset:$Rm), IIC_VLD3u,
"vld3", Dt, "\\{$Vd, $dst2, $dst3\\}, $Rn$Rm",
- "$Rn.addr = $wb", []> {
+ "$Rn.addr = $wb", []>, Sched<[WriteVLD3]> {
let Inst{4} = Rn{4};
let DecoderMethod = "DecodeVLDST3Instruction";
}
@@ -942,9 +942,9 @@ def VLD3d8_UPD : VLD3DWB<0b0100, {0,0,0,?}, "8">;
def VLD3d16_UPD : VLD3DWB<0b0100, {0,1,0,?}, "16">;
def VLD3d32_UPD : VLD3DWB<0b0100, {1,0,0,?}, "32">;
-def VLD3d8Pseudo_UPD : VLDQQWBPseudo<IIC_VLD3u>;
-def VLD3d16Pseudo_UPD : VLDQQWBPseudo<IIC_VLD3u>;
-def VLD3d32Pseudo_UPD : VLDQQWBPseudo<IIC_VLD3u>;
+def VLD3d8Pseudo_UPD : VLDQQWBPseudo<IIC_VLD3u>, Sched<[WriteVLD3]>;
+def VLD3d16Pseudo_UPD : VLDQQWBPseudo<IIC_VLD3u>, Sched<[WriteVLD3]>;
+def VLD3d32Pseudo_UPD : VLDQQWBPseudo<IIC_VLD3u>, Sched<[WriteVLD3]>;
// ...with double-spaced registers:
def VLD3q8 : VLD3D<0b0101, {0,0,0,?}, "8">;
@@ -954,25 +954,26 @@ def VLD3q8_UPD : VLD3DWB<0b0101, {0,0,0,?}, "8">;
def VLD3q16_UPD : VLD3DWB<0b0101, {0,1,0,?}, "16">;
def VLD3q32_UPD : VLD3DWB<0b0101, {1,0,0,?}, "32">;
-def VLD3q8Pseudo_UPD : VLDQQQQWBPseudo<IIC_VLD3u>;
-def VLD3q16Pseudo_UPD : VLDQQQQWBPseudo<IIC_VLD3u>;
-def VLD3q32Pseudo_UPD : VLDQQQQWBPseudo<IIC_VLD3u>;
+def VLD3q8Pseudo_UPD : VLDQQQQWBPseudo<IIC_VLD3u>, Sched<[WriteVLD3]>;
+def VLD3q16Pseudo_UPD : VLDQQQQWBPseudo<IIC_VLD3u>, Sched<[WriteVLD3]>;
+def VLD3q32Pseudo_UPD : VLDQQQQWBPseudo<IIC_VLD3u>, Sched<[WriteVLD3]>;
// ...alternate versions to be allocated odd register numbers:
-def VLD3q8oddPseudo : VLDQQQQPseudo<IIC_VLD3>;
-def VLD3q16oddPseudo : VLDQQQQPseudo<IIC_VLD3>;
-def VLD3q32oddPseudo : VLDQQQQPseudo<IIC_VLD3>;
+def VLD3q8oddPseudo : VLDQQQQPseudo<IIC_VLD3>, Sched<[WriteVLD3]>;
+def VLD3q16oddPseudo : VLDQQQQPseudo<IIC_VLD3>, Sched<[WriteVLD3]>;
+def VLD3q32oddPseudo : VLDQQQQPseudo<IIC_VLD3>, Sched<[WriteVLD3]>;
-def VLD3q8oddPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD3u>;
-def VLD3q16oddPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD3u>;
-def VLD3q32oddPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD3u>;
+def VLD3q8oddPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD3u>, Sched<[WriteVLD3]>;
+def VLD3q16oddPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD3u>, Sched<[WriteVLD3]>;
+def VLD3q32oddPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD3u>, Sched<[WriteVLD3]>;
// VLD4 : Vector Load (multiple 4-element structures)
class VLD4D<bits<4> op11_8, bits<4> op7_4, string Dt>
: NLdSt<0, 0b10, op11_8, op7_4,
(outs DPR:$Vd, DPR:$dst2, DPR:$dst3, DPR:$dst4),
(ins addrmode6:$Rn), IIC_VLD4,
- "vld4", Dt, "\\{$Vd, $dst2, $dst3, $dst4\\}, $Rn", "", []> {
+ "vld4", Dt, "\\{$Vd, $dst2, $dst3, $dst4\\}, $Rn", "", []>,
+ Sched<[WriteVLD4]> {
let Rm = 0b1111;
let Inst{5-4} = Rn{5-4};
let DecoderMethod = "DecodeVLDST4Instruction";
@@ -982,9 +983,9 @@ def VLD4d8 : VLD4D<0b0000, {0,0,?,?}, "8">;
def VLD4d16 : VLD4D<0b0000, {0,1,?,?}, "16">;
def VLD4d32 : VLD4D<0b0000, {1,0,?,?}, "32">;
-def VLD4d8Pseudo : VLDQQPseudo<IIC_VLD4>;
-def VLD4d16Pseudo : VLDQQPseudo<IIC_VLD4>;
-def VLD4d32Pseudo : VLDQQPseudo<IIC_VLD4>;
+def VLD4d8Pseudo : VLDQQPseudo<IIC_VLD4>, Sched<[WriteVLD4]>;
+def VLD4d16Pseudo : VLDQQPseudo<IIC_VLD4>, Sched<[WriteVLD4]>;
+def VLD4d32Pseudo : VLDQQPseudo<IIC_VLD4>, Sched<[WriteVLD4]>;
// ...with address register writeback:
class VLD4DWB<bits<4> op11_8, bits<4> op7_4, string Dt>
@@ -992,7 +993,7 @@ class VLD4DWB<bits<4> op11_8, bits<4> op7_4, string Dt>
(outs DPR:$Vd, DPR:$dst2, DPR:$dst3, DPR:$dst4, GPR:$wb),
(ins addrmode6:$Rn, am6offset:$Rm), IIC_VLD4u,
"vld4", Dt, "\\{$Vd, $dst2, $dst3, $dst4\\}, $Rn$Rm",
- "$Rn.addr = $wb", []> {
+ "$Rn.addr = $wb", []>, Sched<[WriteVLD4]> {
let Inst{5-4} = Rn{5-4};
let DecoderMethod = "DecodeVLDST4Instruction";
}
@@ -1001,9 +1002,9 @@ def VLD4d8_UPD : VLD4DWB<0b0000, {0,0,?,?}, "8">;
def VLD4d16_UPD : VLD4DWB<0b0000, {0,1,?,?}, "16">;
def VLD4d32_UPD : VLD4DWB<0b0000, {1,0,?,?}, "32">;
-def VLD4d8Pseudo_UPD : VLDQQWBPseudo<IIC_VLD4u>;
-def VLD4d16Pseudo_UPD : VLDQQWBPseudo<IIC_VLD4u>;
-def VLD4d32Pseudo_UPD : VLDQQWBPseudo<IIC_VLD4u>;
+def VLD4d8Pseudo_UPD : VLDQQWBPseudo<IIC_VLD4u>, Sched<[WriteVLD4]>;
+def VLD4d16Pseudo_UPD : VLDQQWBPseudo<IIC_VLD4u>, Sched<[WriteVLD4]>;
+def VLD4d32Pseudo_UPD : VLDQQWBPseudo<IIC_VLD4u>, Sched<[WriteVLD4]>;
// ...with double-spaced registers:
def VLD4q8 : VLD4D<0b0001, {0,0,?,?}, "8">;
@@ -1013,18 +1014,18 @@ def VLD4q8_UPD : VLD4DWB<0b0001, {0,0,?,?}, "8">;
def VLD4q16_UPD : VLD4DWB<0b0001, {0,1,?,?}, "16">;
def VLD4q32_UPD : VLD4DWB<0b0001, {1,0,?,?}, "32">;
-def VLD4q8Pseudo_UPD : VLDQQQQWBPseudo<IIC_VLD4u>;
-def VLD4q16Pseudo_UPD : VLDQQQQWBPseudo<IIC_VLD4u>;
-def VLD4q32Pseudo_UPD : VLDQQQQWBPseudo<IIC_VLD4u>;
+def VLD4q8Pseudo_UPD : VLDQQQQWBPseudo<IIC_VLD4u>, Sched<[WriteVLD4]>;
+def VLD4q16Pseudo_UPD : VLDQQQQWBPseudo<IIC_VLD4u>, Sched<[WriteVLD4]>;
+def VLD4q32Pseudo_UPD : VLDQQQQWBPseudo<IIC_VLD4u>, Sched<[WriteVLD4]>;
// ...alternate versions to be allocated odd register numbers:
-def VLD4q8oddPseudo : VLDQQQQPseudo<IIC_VLD4>;
-def VLD4q16oddPseudo : VLDQQQQPseudo<IIC_VLD4>;
-def VLD4q32oddPseudo : VLDQQQQPseudo<IIC_VLD4>;
+def VLD4q8oddPseudo : VLDQQQQPseudo<IIC_VLD4>, Sched<[WriteVLD4]>;
+def VLD4q16oddPseudo : VLDQQQQPseudo<IIC_VLD4>, Sched<[WriteVLD4]>;
+def VLD4q32oddPseudo : VLDQQQQPseudo<IIC_VLD4>, Sched<[WriteVLD4]>;
-def VLD4q8oddPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD4u>;
-def VLD4q16oddPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD4u>;
-def VLD4q32oddPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD4u>;
+def VLD4q8oddPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD4u>, Sched<[WriteVLD4]>;
+def VLD4q16oddPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD4u>, Sched<[WriteVLD4]>;
+def VLD4q32oddPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD4u>, Sched<[WriteVLD4]>;
} // mayLoad = 1, hasSideEffects = 0, hasExtraDefRegAllocReq = 1
@@ -1076,11 +1077,12 @@ class VLD1LN32<bits<4> op11_8, bits<4> op7_4, string Dt, ValueType Ty,
"$src = $Vd",
[(set DPR:$Vd, (vector_insert (Ty DPR:$src),
(i32 (LoadOp addrmode6oneL32:$Rn)),
- imm:$lane))]> {
+ imm:$lane))]>, Sched<[WriteVLD1]> {
let Rm = 0b1111;
let DecoderMethod = "DecodeVLD1LN";
}
-class VLD1QLNPseudo<ValueType Ty, PatFrag LoadOp> : VLDQLNPseudo<IIC_VLD1ln> {
+class VLD1QLNPseudo<ValueType Ty, PatFrag LoadOp> : VLDQLNPseudo<IIC_VLD1ln>,
+ Sched<[WriteVLD1]> {
let Pattern = [(set QPR:$dst, (vector_insert (Ty QPR:$src),
(i32 (LoadOp addrmode6:$addr)),
imm:$lane))];
@@ -1117,7 +1119,7 @@ class VLD1LNWB<bits<4> op11_8, bits<4> op7_4, string Dt>
(ins addrmode6:$Rn, am6offset:$Rm,
DPR:$src, nohash_imm:$lane), IIC_VLD1lnu, "vld1", Dt,
"\\{$Vd[$lane]\\}, $Rn$Rm",
- "$src = $Vd, $Rn.addr = $wb", []> {
+ "$src = $Vd, $Rn.addr = $wb", []>, Sched<[WriteVLD1]> {
let DecoderMethod = "DecodeVLD1LN";
}
@@ -1134,16 +1136,16 @@ def VLD1LNd32_UPD : VLD1LNWB<0b1000, {?,0,?,?}, "32"> {
let Inst{4} = Rn{4};
}
-def VLD1LNq8Pseudo_UPD : VLDQLNWBPseudo<IIC_VLD1lnu>;
-def VLD1LNq16Pseudo_UPD : VLDQLNWBPseudo<IIC_VLD1lnu>;
-def VLD1LNq32Pseudo_UPD : VLDQLNWBPseudo<IIC_VLD1lnu>;
+def VLD1LNq8Pseudo_UPD : VLDQLNWBPseudo<IIC_VLD1lnu>, Sched<[WriteVLD1]>;
+def VLD1LNq16Pseudo_UPD : VLDQLNWBPseudo<IIC_VLD1lnu>, Sched<[WriteVLD1]>;
+def VLD1LNq32Pseudo_UPD : VLDQLNWBPseudo<IIC_VLD1lnu>, Sched<[WriteVLD1]>;
// VLD2LN : Vector Load (single 2-element structure to one lane)
class VLD2LN<bits<4> op11_8, bits<4> op7_4, string Dt>
: NLdStLn<1, 0b10, op11_8, op7_4, (outs DPR:$Vd, DPR:$dst2),
(ins addrmode6:$Rn, DPR:$src1, DPR:$src2, nohash_imm:$lane),
IIC_VLD2ln, "vld2", Dt, "\\{$Vd[$lane], $dst2[$lane]\\}, $Rn",
- "$src1 = $Vd, $src2 = $dst2", []> {
+ "$src1 = $Vd, $src2 = $dst2", []>, Sched<[WriteVLD1]> {
let Rm = 0b1111;
let Inst{4} = Rn{4};
let DecoderMethod = "DecodeVLD2LN";
@@ -1159,9 +1161,9 @@ def VLD2LNd32 : VLD2LN<0b1001, {?,0,0,?}, "32"> {
let Inst{7} = lane{0};
}
-def VLD2LNd8Pseudo : VLDQLNPseudo<IIC_VLD2ln>;
-def VLD2LNd16Pseudo : VLDQLNPseudo<IIC_VLD2ln>;
-def VLD2LNd32Pseudo : VLDQLNPseudo<IIC_VLD2ln>;
+def VLD2LNd8Pseudo : VLDQLNPseudo<IIC_VLD2ln>, Sched<[WriteVLD1]>;
+def VLD2LNd16Pseudo : VLDQLNPseudo<IIC_VLD2ln>, Sched<[WriteVLD1]>;
+def VLD2LNd32Pseudo : VLDQLNPseudo<IIC_VLD2ln>, Sched<[WriteVLD1]>;
// ...with double-spaced registers:
def VLD2LNq16 : VLD2LN<0b0101, {?,?,1,?}, "16"> {
@@ -1171,8 +1173,8 @@ def VLD2LNq32 : VLD2LN<0b1001, {?,1,0,?}, "32"> {
let Inst{7} = lane{0};
}
-def VLD2LNq16Pseudo : VLDQQLNPseudo<IIC_VLD2ln>;
-def VLD2LNq32Pseudo : VLDQQLNPseudo<IIC_VLD2ln>;
+def VLD2LNq16Pseudo : VLDQQLNPseudo<IIC_VLD2ln>, Sched<[WriteVLD1]>;
+def VLD2LNq32Pseudo : VLDQQLNPseudo<IIC_VLD2ln>, Sched<[WriteVLD1]>;
// ...with address register writeback:
class VLD2LNWB<bits<4> op11_8, bits<4> op7_4, string Dt>
@@ -1195,9 +1197,9 @@ def VLD2LNd32_UPD : VLD2LNWB<0b1001, {?,0,0,?}, "32"> {
let Inst{7} = lane{0};
}
-def VLD2LNd8Pseudo_UPD : VLDQLNWBPseudo<IIC_VLD2lnu>;
-def VLD2LNd16Pseudo_UPD : VLDQLNWBPseudo<IIC_VLD2lnu>;
-def VLD2LNd32Pseudo_UPD : VLDQLNWBPseudo<IIC_VLD2lnu>;
+def VLD2LNd8Pseudo_UPD : VLDQLNWBPseudo<IIC_VLD2lnu>, Sched<[WriteVLD1]>;
+def VLD2LNd16Pseudo_UPD : VLDQLNWBPseudo<IIC_VLD2lnu>, Sched<[WriteVLD1]>;
+def VLD2LNd32Pseudo_UPD : VLDQLNWBPseudo<IIC_VLD2lnu>, Sched<[WriteVLD1]>;
def VLD2LNq16_UPD : VLD2LNWB<0b0101, {?,?,1,?}, "16"> {
let Inst{7-6} = lane{1-0};
@@ -1206,8 +1208,8 @@ def VLD2LNq32_UPD : VLD2LNWB<0b1001, {?,1,0,?}, "32"> {
let Inst{7} = lane{0};
}
-def VLD2LNq16Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD2lnu>;
-def VLD2LNq32Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD2lnu>;
+def VLD2LNq16Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD2lnu>, Sched<[WriteVLD1]>;
+def VLD2LNq32Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD2lnu>, Sched<[WriteVLD1]>;
// VLD3LN : Vector Load (single 3-element structure to one lane)
class VLD3LN<bits<4> op11_8, bits<4> op7_4, string Dt>
@@ -1215,7 +1217,7 @@ class VLD3LN<bits<4> op11_8, bits<4> op7_4, string Dt>
(ins addrmode6:$Rn, DPR:$src1, DPR:$src2, DPR:$src3,
nohash_imm:$lane), IIC_VLD3ln, "vld3", Dt,
"\\{$Vd[$lane], $dst2[$lane], $dst3[$lane]\\}, $Rn",
- "$src1 = $Vd, $src2 = $dst2, $src3 = $dst3", []> {
+ "$src1 = $Vd, $src2 = $dst2, $src3 = $dst3", []>, Sched<[WriteVLD2]> {
let Rm = 0b1111;
let DecoderMethod = "DecodeVLD3LN";
}
@@ -1230,9 +1232,9 @@ def VLD3LNd32 : VLD3LN<0b1010, {?,0,0,0}, "32"> {
let Inst{7} = lane{0};
}
-def VLD3LNd8Pseudo : VLDQQLNPseudo<IIC_VLD3ln>;
-def VLD3LNd16Pseudo : VLDQQLNPseudo<IIC_VLD3ln>;
-def VLD3LNd32Pseudo : VLDQQLNPseudo<IIC_VLD3ln>;
+def VLD3LNd8Pseudo : VLDQQLNPseudo<IIC_VLD3ln>, Sched<[WriteVLD2]>;
+def VLD3LNd16Pseudo : VLDQQLNPseudo<IIC_VLD3ln>, Sched<[WriteVLD2]>;
+def VLD3LNd32Pseudo : VLDQQLNPseudo<IIC_VLD3ln>, Sched<[WriteVLD2]>;
// ...with double-spaced registers:
def VLD3LNq16 : VLD3LN<0b0110, {?,?,1,0}, "16"> {
@@ -1242,8 +1244,8 @@ def VLD3LNq32 : VLD3LN<0b1010, {?,1,0,0}, "32"> {
let Inst{7} = lane{0};
}
-def VLD3LNq16Pseudo : VLDQQQQLNPseudo<IIC_VLD3ln>;
-def VLD3LNq32Pseudo : VLDQQQQLNPseudo<IIC_VLD3ln>;
+def VLD3LNq16Pseudo : VLDQQQQLNPseudo<IIC_VLD3ln>, Sched<[WriteVLD2]>;
+def VLD3LNq32Pseudo : VLDQQQQLNPseudo<IIC_VLD3ln>, Sched<[WriteVLD2]>;
// ...with address register writeback:
class VLD3LNWB<bits<4> op11_8, bits<4> op7_4, string Dt>
@@ -1254,7 +1256,7 @@ class VLD3LNWB<bits<4> op11_8, bits<4> op7_4, string Dt>
IIC_VLD3lnu, "vld3", Dt,
"\\{$Vd[$lane], $dst2[$lane], $dst3[$lane]\\}, $Rn$Rm",
"$src1 = $Vd, $src2 = $dst2, $src3 = $dst3, $Rn.addr = $wb",
- []> {
+ []>, Sched<[WriteVLD2]> {
let DecoderMethod = "DecodeVLD3LN";
}
@@ -1268,9 +1270,9 @@ def VLD3LNd32_UPD : VLD3LNWB<0b1010, {?,0,0,0}, "32"> {
let Inst{7} = lane{0};
}
-def VLD3LNd8Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD3lnu>;
-def VLD3LNd16Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD3lnu>;
-def VLD3LNd32Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD3lnu>;
+def VLD3LNd8Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD3lnu>, Sched<[WriteVLD2]>;
+def VLD3LNd16Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD3lnu>, Sched<[WriteVLD2]>;
+def VLD3LNd32Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD3lnu>, Sched<[WriteVLD2]>;
def VLD3LNq16_UPD : VLD3LNWB<0b0110, {?,?,1,0}, "16"> {
let Inst{7-6} = lane{1-0};
@@ -1279,8 +1281,8 @@ def VLD3LNq32_UPD : VLD3LNWB<0b1010, {?,1,0,0}, "32"> {
let Inst{7} = lane{0};
}
-def VLD3LNq16Pseudo_UPD : VLDQQQQLNWBPseudo<IIC_VLD3lnu>;
-def VLD3LNq32Pseudo_UPD : VLDQQQQLNWBPseudo<IIC_VLD3lnu>;
+def VLD3LNq16Pseudo_UPD : VLDQQQQLNWBPseudo<IIC_VLD3lnu>, Sched<[WriteVLD2]>;
+def VLD3LNq32Pseudo_UPD : VLDQQQQLNWBPseudo<IIC_VLD3lnu>, Sched<[WriteVLD2]>;
// VLD4LN : Vector Load (single 4-element structure to one lane)
class VLD4LN<bits<4> op11_8, bits<4> op7_4, string Dt>
@@ -1289,7 +1291,8 @@ class VLD4LN<bits<4> op11_8, bits<4> op7_4, string Dt>
(ins addrmode6:$Rn, DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4,
nohash_imm:$lane), IIC_VLD4ln, "vld4", Dt,
"\\{$Vd[$lane], $dst2[$lane], $dst3[$lane], $dst4[$lane]\\}, $Rn",
- "$src1 = $Vd, $src2 = $dst2, $src3 = $dst3, $src4 = $dst4", []> {
+ "$src1 = $Vd, $src2 = $dst2, $src3 = $dst3, $src4 = $dst4", []>,
+ Sched<[WriteVLD2]> {
let Rm = 0b1111;
let Inst{4} = Rn{4};
let DecoderMethod = "DecodeVLD4LN";
@@ -1306,9 +1309,9 @@ def VLD4LNd32 : VLD4LN<0b1011, {?,0,?,?}, "32"> {
let Inst{5} = Rn{5};
}
-def VLD4LNd8Pseudo : VLDQQLNPseudo<IIC_VLD4ln>;
-def VLD4LNd16Pseudo : VLDQQLNPseudo<IIC_VLD4ln>;
-def VLD4LNd32Pseudo : VLDQQLNPseudo<IIC_VLD4ln>;
+def VLD4LNd8Pseudo : VLDQQLNPseudo<IIC_VLD4ln>, Sched<[WriteVLD2]>;
+def VLD4LNd16Pseudo : VLDQQLNPseudo<IIC_VLD4ln>, Sched<[WriteVLD2]>;
+def VLD4LNd32Pseudo : VLDQQLNPseudo<IIC_VLD4ln>, Sched<[WriteVLD2]>;
// ...with double-spaced registers:
def VLD4LNq16 : VLD4LN<0b0111, {?,?,1,?}, "16"> {
@@ -1319,8 +1322,8 @@ def VLD4LNq32 : VLD4LN<0b1011, {?,1,?,?}, "32"> {
let Inst{5} = Rn{5};
}
-def VLD4LNq16Pseudo : VLDQQQQLNPseudo<IIC_VLD4ln>;
-def VLD4LNq32Pseudo : VLDQQQQLNPseudo<IIC_VLD4ln>;
+def VLD4LNq16Pseudo : VLDQQQQLNPseudo<IIC_VLD4ln>, Sched<[WriteVLD2]>;
+def VLD4LNq32Pseudo : VLDQQQQLNPseudo<IIC_VLD4ln>, Sched<[WriteVLD2]>;
// ...with address register writeback:
class VLD4LNWB<bits<4> op11_8, bits<4> op7_4, string Dt>
@@ -1347,9 +1350,9 @@ def VLD4LNd32_UPD : VLD4LNWB<0b1011, {?,0,?,?}, "32"> {
let Inst{5} = Rn{5};
}
-def VLD4LNd8Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD4lnu>;
-def VLD4LNd16Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD4lnu>;
-def VLD4LNd32Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD4lnu>;
+def VLD4LNd8Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD4lnu>, Sched<[WriteVLD2]>;
+def VLD4LNd16Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD4lnu>, Sched<[WriteVLD2]>;
+def VLD4LNd32Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD4lnu>, Sched<[WriteVLD2]>;
def VLD4LNq16_UPD : VLD4LNWB<0b0111, {?,?,1,?}, "16"> {
let Inst{7-6} = lane{1-0};
@@ -1359,8 +1362,8 @@ def VLD4LNq32_UPD : VLD4LNWB<0b1011, {?,1,?,?}, "32"> {
let Inst{5} = Rn{5};
}
-def VLD4LNq16Pseudo_UPD : VLDQQQQLNWBPseudo<IIC_VLD4lnu>;
-def VLD4LNq32Pseudo_UPD : VLDQQQQLNWBPseudo<IIC_VLD4lnu>;
+def VLD4LNq16Pseudo_UPD : VLDQQQQLNWBPseudo<IIC_VLD4lnu>, Sched<[WriteVLD2]>;
+def VLD4LNq32Pseudo_UPD : VLDQQQQLNWBPseudo<IIC_VLD4lnu>, Sched<[WriteVLD2]>;
} // mayLoad = 1, hasSideEffects = 0, hasExtraDefRegAllocReq = 1
@@ -1371,7 +1374,8 @@ class VLD1DUP<bits<4> op7_4, string Dt, ValueType Ty, PatFrag LoadOp,
(ins AddrMode:$Rn),
IIC_VLD1dup, "vld1", Dt, "$Vd, $Rn", "",
[(set VecListOneDAllLanes:$Vd,
- (Ty (NEONvdup (i32 (LoadOp AddrMode:$Rn)))))]> {
+ (Ty (NEONvdup (i32 (LoadOp AddrMode:$Rn)))))]>,
+ Sched<[WriteVLD2]> {
let Rm = 0b1111;
let Inst{4} = Rn{4};
let DecoderMethod = "DecodeVLD1DupInstruction";
@@ -1434,7 +1438,7 @@ multiclass VLD1QDUPWB<bits<4> op7_4, string Dt, Operand AddrMode> {
(outs VecListDPairAllLanes:$Vd, GPR:$wb),
(ins AddrMode:$Rn), IIC_VLD1dupu,
"vld1", Dt, "$Vd, $Rn!",
- "$Rn.addr = $wb", []> {
+ "$Rn.addr = $wb", []>, Sched<[WriteVLD1]> {
let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
let Inst{4} = Rn{4};
let DecoderMethod = "DecodeVLD1DupInstruction";
@@ -1491,7 +1495,7 @@ multiclass VLD2DUPWB<bits<4> op7_4, string Dt, RegisterOperand VdTy,
(outs VdTy:$Vd, GPR:$wb),
(ins AddrMode:$Rn), IIC_VLD2dupu,
"vld2", Dt, "$Vd, $Rn!",
- "$Rn.addr = $wb", []> {
+ "$Rn.addr = $wb", []>, Sched<[WriteVLD1]> {
let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
let Inst{4} = Rn{4};
let DecoderMethod = "DecodeVLD2DupInstruction";
@@ -1500,7 +1504,7 @@ multiclass VLD2DUPWB<bits<4> op7_4, string Dt, RegisterOperand VdTy,
(outs VdTy:$Vd, GPR:$wb),
(ins AddrMode:$Rn, rGPR:$Rm), IIC_VLD2dupu,
"vld2", Dt, "$Vd, $Rn, $Rm",
- "$Rn.addr = $wb", []> {
+ "$Rn.addr = $wb", []>, Sched<[WriteVLD1]> {
let Inst{4} = Rn{4};
let DecoderMethod = "DecodeVLD2DupInstruction";
}
@@ -1524,7 +1528,8 @@ defm VLD2DUPd32x2wb : VLD2DUPWB<{1,0,1,?}, "32", VecListDPairSpacedAllLanes,
class VLD3DUP<bits<4> op7_4, string Dt>
: NLdSt<1, 0b10, 0b1110, op7_4, (outs DPR:$Vd, DPR:$dst2, DPR:$dst3),
(ins addrmode6dup:$Rn), IIC_VLD3dup,
- "vld3", Dt, "\\{$Vd[], $dst2[], $dst3[]\\}, $Rn", "", []> {
+ "vld3", Dt, "\\{$Vd[], $dst2[], $dst3[]\\}, $Rn", "", []>,
+ Sched<[WriteVLD2]> {
let Rm = 0b1111;
let Inst{4} = 0;
let DecoderMethod = "DecodeVLD3DupInstruction";
@@ -1534,9 +1539,9 @@ def VLD3DUPd8 : VLD3DUP<{0,0,0,?}, "8">;
def VLD3DUPd16 : VLD3DUP<{0,1,0,?}, "16">;
def VLD3DUPd32 : VLD3DUP<{1,0,0,?}, "32">;
-def VLD3DUPd8Pseudo : VLDQQPseudo<IIC_VLD3dup>;
-def VLD3DUPd16Pseudo : VLDQQPseudo<IIC_VLD3dup>;
-def VLD3DUPd32Pseudo : VLDQQPseudo<IIC_VLD3dup>;
+def VLD3DUPd8Pseudo : VLDQQPseudo<IIC_VLD3dup>, Sched<[WriteVLD2]>;
+def VLD3DUPd16Pseudo : VLDQQPseudo<IIC_VLD3dup>, Sched<[WriteVLD2]>;
+def VLD3DUPd32Pseudo : VLDQQPseudo<IIC_VLD3dup>, Sched<[WriteVLD2]>;
// ...with double-spaced registers (not used for codegen):
def VLD3DUPq8 : VLD3DUP<{0,0,1,?}, "8">;
@@ -1548,7 +1553,7 @@ class VLD3DUPWB<bits<4> op7_4, string Dt, Operand AddrMode>
: NLdSt<1, 0b10, 0b1110, op7_4, (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, GPR:$wb),
(ins AddrMode:$Rn, am6offset:$Rm), IIC_VLD3dupu,
"vld3", Dt, "\\{$Vd[], $dst2[], $dst3[]\\}, $Rn$Rm",
- "$Rn.addr = $wb", []> {
+ "$Rn.addr = $wb", []>, Sched<[WriteVLD2]> {
let Inst{4} = 0;
let DecoderMethod = "DecodeVLD3DupInstruction";
}
@@ -1561,9 +1566,9 @@ def VLD3DUPq8_UPD : VLD3DUPWB<{0,0,1,0}, "8", addrmode6dupalign64>;
def VLD3DUPq16_UPD : VLD3DUPWB<{0,1,1,?}, "16", addrmode6dupalign64>;
def VLD3DUPq32_UPD : VLD3DUPWB<{1,0,1,?}, "32", addrmode6dupalign64>;
-def VLD3DUPd8Pseudo_UPD : VLDQQWBPseudo<IIC_VLD3dupu>;
-def VLD3DUPd16Pseudo_UPD : VLDQQWBPseudo<IIC_VLD3dupu>;
-def VLD3DUPd32Pseudo_UPD : VLDQQWBPseudo<IIC_VLD3dupu>;
+def VLD3DUPd8Pseudo_UPD : VLDQQWBPseudo<IIC_VLD3dupu>, Sched<[WriteVLD2]>;
+def VLD3DUPd16Pseudo_UPD : VLDQQWBPseudo<IIC_VLD3dupu>, Sched<[WriteVLD2]>;
+def VLD3DUPd32Pseudo_UPD : VLDQQWBPseudo<IIC_VLD3dupu>, Sched<[WriteVLD2]>;
// VLD4DUP : Vector Load (single 4-element structure to all lanes)
class VLD4DUP<bits<4> op7_4, string Dt>
@@ -1580,9 +1585,9 @@ def VLD4DUPd8 : VLD4DUP<{0,0,0,?}, "8">;
def VLD4DUPd16 : VLD4DUP<{0,1,0,?}, "16">;
def VLD4DUPd32 : VLD4DUP<{1,?,0,?}, "32"> { let Inst{6} = Rn{5}; }
-def VLD4DUPd8Pseudo : VLDQQPseudo<IIC_VLD4dup>;
-def VLD4DUPd16Pseudo : VLDQQPseudo<IIC_VLD4dup>;
-def VLD4DUPd32Pseudo : VLDQQPseudo<IIC_VLD4dup>;
+def VLD4DUPd8Pseudo : VLDQQPseudo<IIC_VLD4dup>, Sched<[WriteVLD2]>;
+def VLD4DUPd16Pseudo : VLDQQPseudo<IIC_VLD4dup>, Sched<[WriteVLD2]>;
+def VLD4DUPd32Pseudo : VLDQQPseudo<IIC_VLD4dup>, Sched<[WriteVLD2]>;
// ...with double-spaced registers (not used for codegen):
def VLD4DUPq8 : VLD4DUP<{0,0,1,?}, "8">;
@@ -1595,7 +1600,7 @@ class VLD4DUPWB<bits<4> op7_4, string Dt>
(outs DPR:$Vd, DPR:$dst2, DPR:$dst3, DPR:$dst4, GPR:$wb),
(ins addrmode6dup:$Rn, am6offset:$Rm), IIC_VLD4dupu,
"vld4", Dt, "\\{$Vd[], $dst2[], $dst3[], $dst4[]\\}, $Rn$Rm",
- "$Rn.addr = $wb", []> {
+ "$Rn.addr = $wb", []>, Sched<[WriteVLD2]> {
let Inst{4} = Rn{4};
let DecoderMethod = "DecodeVLD4DupInstruction";
}
@@ -1608,9 +1613,9 @@ def VLD4DUPq8_UPD : VLD4DUPWB<{0,0,1,0}, "8">;
def VLD4DUPq16_UPD : VLD4DUPWB<{0,1,1,?}, "16">;
def VLD4DUPq32_UPD : VLD4DUPWB<{1,?,1,?}, "32"> { let Inst{6} = Rn{5}; }
-def VLD4DUPd8Pseudo_UPD : VLDQQWBPseudo<IIC_VLD4dupu>;
-def VLD4DUPd16Pseudo_UPD : VLDQQWBPseudo<IIC_VLD4dupu>;
-def VLD4DUPd32Pseudo_UPD : VLDQQWBPseudo<IIC_VLD4dupu>;
+def VLD4DUPd8Pseudo_UPD : VLDQQWBPseudo<IIC_VLD4dupu>, Sched<[WriteVLD2]>;
+def VLD4DUPd16Pseudo_UPD : VLDQQWBPseudo<IIC_VLD4dupu>, Sched<[WriteVLD2]>;
+def VLD4DUPd32Pseudo_UPD : VLDQQWBPseudo<IIC_VLD4dupu>, Sched<[WriteVLD2]>;
} // mayLoad = 1, hasSideEffects = 0, hasExtraDefRegAllocReq = 1
@@ -1657,14 +1662,14 @@ class VSTQQQQWBPseudo<InstrItinClass itin>
// VST1 : Vector Store (multiple single elements)
class VST1D<bits<4> op7_4, string Dt, Operand AddrMode>
: NLdSt<0,0b00,0b0111,op7_4, (outs), (ins AddrMode:$Rn, VecListOneD:$Vd),
- IIC_VST1, "vst1", Dt, "$Vd, $Rn", "", []> {
+ IIC_VST1, "vst1", Dt, "$Vd, $Rn", "", []>, Sched<[WriteVST1]> {
let Rm = 0b1111;
let Inst{4} = Rn{4};
let DecoderMethod = "DecodeVLDST1Instruction";
}
class VST1Q<bits<4> op7_4, string Dt, Operand AddrMode>
: NLdSt<0,0b00,0b1010,op7_4, (outs), (ins AddrMode:$Rn, VecListDPair:$Vd),
- IIC_VST1x2, "vst1", Dt, "$Vd, $Rn", "", []> {
+ IIC_VST1x2, "vst1", Dt, "$Vd, $Rn", "", []>, Sched<[WriteVST2]> {
let Rm = 0b1111;
let Inst{5-4} = Rn{5-4};
let DecoderMethod = "DecodeVLDST1Instruction";
@@ -1685,7 +1690,7 @@ multiclass VST1DWB<bits<4> op7_4, string Dt, Operand AddrMode> {
def _fixed : NLdSt<0,0b00, 0b0111,op7_4, (outs GPR:$wb),
(ins AddrMode:$Rn, VecListOneD:$Vd), IIC_VLD1u,
"vst1", Dt, "$Vd, $Rn!",
- "$Rn.addr = $wb", []> {
+ "$Rn.addr = $wb", []>, Sched<[WriteVST1]> {
let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
let Inst{4} = Rn{4};
let DecoderMethod = "DecodeVLDST1Instruction";
@@ -1694,7 +1699,7 @@ multiclass VST1DWB<bits<4> op7_4, string Dt, Operand AddrMode> {
(ins AddrMode:$Rn, rGPR:$Rm, VecListOneD:$Vd),
IIC_VLD1u,
"vst1", Dt, "$Vd, $Rn, $Rm",
- "$Rn.addr = $wb", []> {
+ "$Rn.addr = $wb", []>, Sched<[WriteVST1]> {
let Inst{4} = Rn{4};
let DecoderMethod = "DecodeVLDST1Instruction";
}
@@ -1703,7 +1708,7 @@ multiclass VST1QWB<bits<4> op7_4, string Dt, Operand AddrMode> {
def _fixed : NLdSt<0,0b00,0b1010,op7_4, (outs GPR:$wb),
(ins AddrMode:$Rn, VecListDPair:$Vd), IIC_VLD1x2u,
"vst1", Dt, "$Vd, $Rn!",
- "$Rn.addr = $wb", []> {
+ "$Rn.addr = $wb", []>, Sched<[WriteVST2]> {
let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
let Inst{5-4} = Rn{5-4};
let DecoderMethod = "DecodeVLDST1Instruction";
@@ -1712,7 +1717,7 @@ multiclass VST1QWB<bits<4> op7_4, string Dt, Operand AddrMode> {
(ins AddrMode:$Rn, rGPR:$Rm, VecListDPair:$Vd),
IIC_VLD1x2u,
"vst1", Dt, "$Vd, $Rn, $Rm",
- "$Rn.addr = $wb", []> {
+ "$Rn.addr = $wb", []>, Sched<[WriteVST2]> {
let Inst{5-4} = Rn{5-4};
let DecoderMethod = "DecodeVLDST1Instruction";
}
@@ -1732,7 +1737,7 @@ defm VST1q64wb : VST1QWB<{1,1,?,?}, "64", addrmode6align64or128>;
class VST1D3<bits<4> op7_4, string Dt, Operand AddrMode>
: NLdSt<0, 0b00, 0b0110, op7_4, (outs),
(ins AddrMode:$Rn, VecListThreeD:$Vd),
- IIC_VST1x3, "vst1", Dt, "$Vd, $Rn", "", []> {
+ IIC_VST1x3, "vst1", Dt, "$Vd, $Rn", "", []>, Sched<[WriteVST3]> {
let Rm = 0b1111;
let Inst{4} = Rn{4};
let DecoderMethod = "DecodeVLDST1Instruction";
@@ -1741,7 +1746,7 @@ multiclass VST1D3WB<bits<4> op7_4, string Dt, Operand AddrMode> {
def _fixed : NLdSt<0,0b00,0b0110,op7_4, (outs GPR:$wb),
(ins AddrMode:$Rn, VecListThreeD:$Vd), IIC_VLD1x3u,
"vst1", Dt, "$Vd, $Rn!",
- "$Rn.addr = $wb", []> {
+ "$Rn.addr = $wb", []>, Sched<[WriteVST3]> {
let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
let Inst{5-4} = Rn{5-4};
let DecoderMethod = "DecodeVLDST1Instruction";
@@ -1750,7 +1755,7 @@ multiclass VST1D3WB<bits<4> op7_4, string Dt, Operand AddrMode> {
(ins AddrMode:$Rn, rGPR:$Rm, VecListThreeD:$Vd),
IIC_VLD1x3u,
"vst1", Dt, "$Vd, $Rn, $Rm",
- "$Rn.addr = $wb", []> {
+ "$Rn.addr = $wb", []>, Sched<[WriteVST3]> {
let Inst{5-4} = Rn{5-4};
let DecoderMethod = "DecodeVLDST1Instruction";
}
@@ -1766,16 +1771,16 @@ defm VST1d16Twb : VST1D3WB<{0,1,0,?}, "16", addrmode6align64>;
defm VST1d32Twb : VST1D3WB<{1,0,0,?}, "32", addrmode6align64>;
defm VST1d64Twb : VST1D3WB<{1,1,0,?}, "64", addrmode6align64>;
-def VST1d64TPseudo : VSTQQPseudo<IIC_VST1x3>;
-def VST1d64TPseudoWB_fixed : VSTQQWBfixedPseudo<IIC_VST1x3u>;
-def VST1d64TPseudoWB_register : VSTQQWBPseudo<IIC_VST1x3u>;
+def VST1d64TPseudo : VSTQQPseudo<IIC_VST1x3>, Sched<[WriteVST3]>;
+def VST1d64TPseudoWB_fixed : VSTQQWBfixedPseudo<IIC_VST1x3u>, Sched<[WriteVST3]>;
+def VST1d64TPseudoWB_register : VSTQQWBPseudo<IIC_VST1x3u>, Sched<[WriteVST3]>;
// ...with 4 registers
class VST1D4<bits<4> op7_4, string Dt, Operand AddrMode>
: NLdSt<0, 0b00, 0b0010, op7_4, (outs),
(ins AddrMode:$Rn, VecListFourD:$Vd),
IIC_VST1x4, "vst1", Dt, "$Vd, $Rn", "",
- []> {
+ []>, Sched<[WriteVST4]> {
let Rm = 0b1111;
let Inst{5-4} = Rn{5-4};
let DecoderMethod = "DecodeVLDST1Instruction";
@@ -1784,7 +1789,7 @@ multiclass VST1D4WB<bits<4> op7_4, string Dt, Operand AddrMode> {
def _fixed : NLdSt<0,0b00,0b0010,op7_4, (outs GPR:$wb),
(ins AddrMode:$Rn, VecListFourD:$Vd), IIC_VLD1x4u,
"vst1", Dt, "$Vd, $Rn!",
- "$Rn.addr = $wb", []> {
+ "$Rn.addr = $wb", []>, Sched<[WriteVST4]> {
let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
let Inst{5-4} = Rn{5-4};
let DecoderMethod = "DecodeVLDST1Instruction";
@@ -1793,7 +1798,7 @@ multiclass VST1D4WB<bits<4> op7_4, string Dt, Operand AddrMode> {
(ins AddrMode:$Rn, rGPR:$Rm, VecListFourD:$Vd),
IIC_VLD1x4u,
"vst1", Dt, "$Vd, $Rn, $Rm",
- "$Rn.addr = $wb", []> {
+ "$Rn.addr = $wb", []>, Sched<[WriteVST4]> {
let Inst{5-4} = Rn{5-4};
let DecoderMethod = "DecodeVLDST1Instruction";
}
@@ -1809,9 +1814,9 @@ defm VST1d16Qwb : VST1D4WB<{0,1,?,?}, "16", addrmode6align64or128or256>;
defm VST1d32Qwb : VST1D4WB<{1,0,?,?}, "32", addrmode6align64or128or256>;
defm VST1d64Qwb : VST1D4WB<{1,1,?,?}, "64", addrmode6align64or128or256>;
-def VST1d64QPseudo : VSTQQPseudo<IIC_VST1x4>;
-def VST1d64QPseudoWB_fixed : VSTQQWBfixedPseudo<IIC_VST1x4u>;
-def VST1d64QPseudoWB_register : VSTQQWBPseudo<IIC_VST1x4u>;
+def VST1d64QPseudo : VSTQQPseudo<IIC_VST1x4>, Sched<[WriteVST4]>;
+def VST1d64QPseudoWB_fixed : VSTQQWBfixedPseudo<IIC_VST1x4u>, Sched<[WriteVST4]>;
+def VST1d64QPseudoWB_register : VSTQQWBPseudo<IIC_VST1x4u>, Sched<[WriteVST4]>;
// VST2 : Vector Store (multiple 2-element structures)
class VST2<bits<4> op11_8, bits<4> op7_4, string Dt, RegisterOperand VdTy,
@@ -1824,22 +1829,22 @@ class VST2<bits<4> op11_8, bits<4> op7_4, string Dt, RegisterOperand VdTy,
}
def VST2d8 : VST2<0b1000, {0,0,?,?}, "8", VecListDPair, IIC_VST2,
- addrmode6align64or128>;
+ addrmode6align64or128>, Sched<[WriteVST2]>;
def VST2d16 : VST2<0b1000, {0,1,?,?}, "16", VecListDPair, IIC_VST2,
- addrmode6align64or128>;
+ addrmode6align64or128>, Sched<[WriteVST2]>;
def VST2d32 : VST2<0b1000, {1,0,?,?}, "32", VecListDPair, IIC_VST2,
- addrmode6align64or128>;
+ addrmode6align64or128>, Sched<[WriteVST2]>;
def VST2q8 : VST2<0b0011, {0,0,?,?}, "8", VecListFourD, IIC_VST2x2,
- addrmode6align64or128or256>;
+ addrmode6align64or128or256>, Sched<[WriteVST4]>;
def VST2q16 : VST2<0b0011, {0,1,?,?}, "16", VecListFourD, IIC_VST2x2,
- addrmode6align64or128or256>;
+ addrmode6align64or128or256>, Sched<[WriteVST4]>;
def VST2q32 : VST2<0b0011, {1,0,?,?}, "32", VecListFourD, IIC_VST2x2,
- addrmode6align64or128or256>;
+ addrmode6align64or128or256>, Sched<[WriteVST4]>;
-def VST2q8Pseudo : VSTQQPseudo<IIC_VST2x2>;
-def VST2q16Pseudo : VSTQQPseudo<IIC_VST2x2>;
-def VST2q32Pseudo : VSTQQPseudo<IIC_VST2x2>;
+def VST2q8Pseudo : VSTQQPseudo<IIC_VST2x2>, Sched<[WriteVST4]>;
+def VST2q16Pseudo : VSTQQPseudo<IIC_VST2x2>, Sched<[WriteVST4]>;
+def VST2q32Pseudo : VSTQQPseudo<IIC_VST2x2>, Sched<[WriteVST4]>;
// ...with address register writeback:
multiclass VST2DWB<bits<4> op11_8, bits<4> op7_4, string Dt,
@@ -1847,7 +1852,7 @@ multiclass VST2DWB<bits<4> op11_8, bits<4> op7_4, string Dt,
def _fixed : NLdSt<0, 0b00, op11_8, op7_4, (outs GPR:$wb),
(ins AddrMode:$Rn, VdTy:$Vd), IIC_VLD1u,
"vst2", Dt, "$Vd, $Rn!",
- "$Rn.addr = $wb", []> {
+ "$Rn.addr = $wb", []>, Sched<[WriteVST2]> {
let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
let Inst{5-4} = Rn{5-4};
let DecoderMethod = "DecodeVLDST2Instruction";
@@ -1855,7 +1860,7 @@ multiclass VST2DWB<bits<4> op11_8, bits<4> op7_4, string Dt,
def _register : NLdSt<0, 0b00, op11_8, op7_4, (outs GPR:$wb),
(ins AddrMode:$Rn, rGPR:$Rm, VdTy:$Vd), IIC_VLD1u,
"vst2", Dt, "$Vd, $Rn, $Rm",
- "$Rn.addr = $wb", []> {
+ "$Rn.addr = $wb", []>, Sched<[WriteVST2]> {
let Inst{5-4} = Rn{5-4};
let DecoderMethod = "DecodeVLDST2Instruction";
}
@@ -1864,7 +1869,7 @@ multiclass VST2QWB<bits<4> op7_4, string Dt, Operand AddrMode> {
def _fixed : NLdSt<0, 0b00, 0b0011, op7_4, (outs GPR:$wb),
(ins AddrMode:$Rn, VecListFourD:$Vd), IIC_VLD1u,
"vst2", Dt, "$Vd, $Rn!",
- "$Rn.addr = $wb", []> {
+ "$Rn.addr = $wb", []>, Sched<[WriteVST4]> {
let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
let Inst{5-4} = Rn{5-4};
let DecoderMethod = "DecodeVLDST2Instruction";
@@ -1873,7 +1878,7 @@ multiclass VST2QWB<bits<4> op7_4, string Dt, Operand AddrMode> {
(ins AddrMode:$Rn, rGPR:$Rm, VecListFourD:$Vd),
IIC_VLD1u,
"vst2", Dt, "$Vd, $Rn, $Rm",
- "$Rn.addr = $wb", []> {
+ "$Rn.addr = $wb", []>, Sched<[WriteVST4]> {
let Inst{5-4} = Rn{5-4};
let DecoderMethod = "DecodeVLDST2Instruction";
}
@@ -1890,12 +1895,12 @@ defm VST2q8wb : VST2QWB<{0,0,?,?}, "8", addrmode6align64or128or256>;
defm VST2q16wb : VST2QWB<{0,1,?,?}, "16", addrmode6align64or128or256>;
defm VST2q32wb : VST2QWB<{1,0,?,?}, "32", addrmode6align64or128or256>;
-def VST2q8PseudoWB_fixed : VSTQQWBfixedPseudo<IIC_VST2x2u>;
-def VST2q16PseudoWB_fixed : VSTQQWBfixedPseudo<IIC_VST2x2u>;
-def VST2q32PseudoWB_fixed : VSTQQWBfixedPseudo<IIC_VST2x2u>;
-def VST2q8PseudoWB_register : VSTQQWBregisterPseudo<IIC_VST2x2u>;
-def VST2q16PseudoWB_register : VSTQQWBregisterPseudo<IIC_VST2x2u>;
-def VST2q32PseudoWB_register : VSTQQWBregisterPseudo<IIC_VST2x2u>;
+def VST2q8PseudoWB_fixed : VSTQQWBfixedPseudo<IIC_VST2x2u>, Sched<[WriteVST4]>;
+def VST2q16PseudoWB_fixed : VSTQQWBfixedPseudo<IIC_VST2x2u>, Sched<[WriteVST4]>;
+def VST2q32PseudoWB_fixed : VSTQQWBfixedPseudo<IIC_VST2x2u>, Sched<[WriteVST4]>;
+def VST2q8PseudoWB_register : VSTQQWBregisterPseudo<IIC_VST2x2u>, Sched<[WriteVST4]>;
+def VST2q16PseudoWB_register : VSTQQWBregisterPseudo<IIC_VST2x2u>, Sched<[WriteVST4]>;
+def VST2q32PseudoWB_register : VSTQQWBregisterPseudo<IIC_VST2x2u>, Sched<[WriteVST4]>;
// ...with double-spaced registers
def VST2b8 : VST2<0b1001, {0,0,?,?}, "8", VecListDPairSpaced, IIC_VST2,
@@ -1915,7 +1920,7 @@ defm VST2b32wb : VST2DWB<0b1001, {1,0,?,?}, "32", VecListDPairSpaced,
class VST3D<bits<4> op11_8, bits<4> op7_4, string Dt>
: NLdSt<0, 0b00, op11_8, op7_4, (outs),
(ins addrmode6:$Rn, DPR:$Vd, DPR:$src2, DPR:$src3), IIC_VST3,
- "vst3", Dt, "\\{$Vd, $src2, $src3\\}, $Rn", "", []> {
+ "vst3", Dt, "\\{$Vd, $src2, $src3\\}, $Rn", "", []>, Sched<[WriteVST3]> {
let Rm = 0b1111;
let Inst{4} = Rn{4};
let DecoderMethod = "DecodeVLDST3Instruction";
@@ -1925,9 +1930,9 @@ def VST3d8 : VST3D<0b0100, {0,0,0,?}, "8">;
def VST3d16 : VST3D<0b0100, {0,1,0,?}, "16">;
def VST3d32 : VST3D<0b0100, {1,0,0,?}, "32">;
-def VST3d8Pseudo : VSTQQPseudo<IIC_VST3>;
-def VST3d16Pseudo : VSTQQPseudo<IIC_VST3>;
-def VST3d32Pseudo : VSTQQPseudo<IIC_VST3>;
+def VST3d8Pseudo : VSTQQPseudo<IIC_VST3>, Sched<[WriteVST3]>;
+def VST3d16Pseudo : VSTQQPseudo<IIC_VST3>, Sched<[WriteVST3]>;
+def VST3d32Pseudo : VSTQQPseudo<IIC_VST3>, Sched<[WriteVST3]>;
// ...with address register writeback:
class VST3DWB<bits<4> op11_8, bits<4> op7_4, string Dt>
@@ -1935,7 +1940,7 @@ class VST3DWB<bits<4> op11_8, bits<4> op7_4, string Dt>
(ins addrmode6:$Rn, am6offset:$Rm,
DPR:$Vd, DPR:$src2, DPR:$src3), IIC_VST3u,
"vst3", Dt, "\\{$Vd, $src2, $src3\\}, $Rn$Rm",
- "$Rn.addr = $wb", []> {
+ "$Rn.addr = $wb", []>, Sched<[WriteVST3]> {
let Inst{4} = Rn{4};
let DecoderMethod = "DecodeVLDST3Instruction";
}
@@ -1944,9 +1949,9 @@ def VST3d8_UPD : VST3DWB<0b0100, {0,0,0,?}, "8">;
def VST3d16_UPD : VST3DWB<0b0100, {0,1,0,?}, "16">;
def VST3d32_UPD : VST3DWB<0b0100, {1,0,0,?}, "32">;
-def VST3d8Pseudo_UPD : VSTQQWBPseudo<IIC_VST3u>;
-def VST3d16Pseudo_UPD : VSTQQWBPseudo<IIC_VST3u>;
-def VST3d32Pseudo_UPD : VSTQQWBPseudo<IIC_VST3u>;
+def VST3d8Pseudo_UPD : VSTQQWBPseudo<IIC_VST3u>, Sched<[WriteVST3]>;
+def VST3d16Pseudo_UPD : VSTQQWBPseudo<IIC_VST3u>, Sched<[WriteVST3]>;
+def VST3d32Pseudo_UPD : VSTQQWBPseudo<IIC_VST3u>, Sched<[WriteVST3]>;
// ...with double-spaced registers:
def VST3q8 : VST3D<0b0101, {0,0,0,?}, "8">;
@@ -1956,25 +1961,25 @@ def VST3q8_UPD : VST3DWB<0b0101, {0,0,0,?}, "8">;
def VST3q16_UPD : VST3DWB<0b0101, {0,1,0,?}, "16">;
def VST3q32_UPD : VST3DWB<0b0101, {1,0,0,?}, "32">;
-def VST3q8Pseudo_UPD : VSTQQQQWBPseudo<IIC_VST3u>;
-def VST3q16Pseudo_UPD : VSTQQQQWBPseudo<IIC_VST3u>;
-def VST3q32Pseudo_UPD : VSTQQQQWBPseudo<IIC_VST3u>;
+def VST3q8Pseudo_UPD : VSTQQQQWBPseudo<IIC_VST3u>, Sched<[WriteVST3]>;
+def VST3q16Pseudo_UPD : VSTQQQQWBPseudo<IIC_VST3u>, Sched<[WriteVST3]>;
+def VST3q32Pseudo_UPD : VSTQQQQWBPseudo<IIC_VST3u>, Sched<[WriteVST3]>;
// ...alternate versions to be allocated odd register numbers:
-def VST3q8oddPseudo : VSTQQQQPseudo<IIC_VST3>;
-def VST3q16oddPseudo : VSTQQQQPseudo<IIC_VST3>;
-def VST3q32oddPseudo : VSTQQQQPseudo<IIC_VST3>;
+def VST3q8oddPseudo : VSTQQQQPseudo<IIC_VST3>, Sched<[WriteVST3]>;
+def VST3q16oddPseudo : VSTQQQQPseudo<IIC_VST3>, Sched<[WriteVST3]>;
+def VST3q32oddPseudo : VSTQQQQPseudo<IIC_VST3>, Sched<[WriteVST3]>;
-def VST3q8oddPseudo_UPD : VSTQQQQWBPseudo<IIC_VST3u>;
-def VST3q16oddPseudo_UPD : VSTQQQQWBPseudo<IIC_VST3u>;
-def VST3q32oddPseudo_UPD : VSTQQQQWBPseudo<IIC_VST3u>;
+def VST3q8oddPseudo_UPD : VSTQQQQWBPseudo<IIC_VST3u>, Sched<[WriteVST3]>;
+def VST3q16oddPseudo_UPD : VSTQQQQWBPseudo<IIC_VST3u>, Sched<[WriteVST3]>;
+def VST3q32oddPseudo_UPD : VSTQQQQWBPseudo<IIC_VST3u>, Sched<[WriteVST3]>;
// VST4 : Vector Store (multiple 4-element structures)
class VST4D<bits<4> op11_8, bits<4> op7_4, string Dt>
: NLdSt<0, 0b00, op11_8, op7_4, (outs),
(ins addrmode6:$Rn, DPR:$Vd, DPR:$src2, DPR:$src3, DPR:$src4),
IIC_VST4, "vst4", Dt, "\\{$Vd, $src2, $src3, $src4\\}, $Rn",
- "", []> {
+ "", []>, Sched<[WriteVST4]> {
let Rm = 0b1111;
let Inst{5-4} = Rn{5-4};
let DecoderMethod = "DecodeVLDST4Instruction";
@@ -1984,9 +1989,9 @@ def VST4d8 : VST4D<0b0000, {0,0,?,?}, "8">;
def VST4d16 : VST4D<0b0000, {0,1,?,?}, "16">;
def VST4d32 : VST4D<0b0000, {1,0,?,?}, "32">;
-def VST4d8Pseudo : VSTQQPseudo<IIC_VST4>;
-def VST4d16Pseudo : VSTQQPseudo<IIC_VST4>;
-def VST4d32Pseudo : VSTQQPseudo<IIC_VST4>;
+def VST4d8Pseudo : VSTQQPseudo<IIC_VST4>, Sched<[WriteVST4]>;
+def VST4d16Pseudo : VSTQQPseudo<IIC_VST4>, Sched<[WriteVST4]>;
+def VST4d32Pseudo : VSTQQPseudo<IIC_VST4>, Sched<[WriteVST4]>;
// ...with address register writeback:
class VST4DWB<bits<4> op11_8, bits<4> op7_4, string Dt>
@@ -1994,7 +1999,7 @@ class VST4DWB<bits<4> op11_8, bits<4> op7_4, string Dt>
(ins addrmode6:$Rn, am6offset:$Rm,
DPR:$Vd, DPR:$src2, DPR:$src3, DPR:$src4), IIC_VST4u,
"vst4", Dt, "\\{$Vd, $src2, $src3, $src4\\}, $Rn$Rm",
- "$Rn.addr = $wb", []> {
+ "$Rn.addr = $wb", []>, Sched<[WriteVST4]> {
let Inst{5-4} = Rn{5-4};
let DecoderMethod = "DecodeVLDST4Instruction";
}
@@ -2003,9 +2008,9 @@ def VST4d8_UPD : VST4DWB<0b0000, {0,0,?,?}, "8">;
def VST4d16_UPD : VST4DWB<0b0000, {0,1,?,?}, "16">;
def VST4d32_UPD : VST4DWB<0b0000, {1,0,?,?}, "32">;
-def VST4d8Pseudo_UPD : VSTQQWBPseudo<IIC_VST4u>;
-def VST4d16Pseudo_UPD : VSTQQWBPseudo<IIC_VST4u>;
-def VST4d32Pseudo_UPD : VSTQQWBPseudo<IIC_VST4u>;
+def VST4d8Pseudo_UPD : VSTQQWBPseudo<IIC_VST4u>, Sched<[WriteVST4]>;
+def VST4d16Pseudo_UPD : VSTQQWBPseudo<IIC_VST4u>, Sched<[WriteVST4]>;
+def VST4d32Pseudo_UPD : VSTQQWBPseudo<IIC_VST4u>, Sched<[WriteVST4]>;
// ...with double-spaced registers:
def VST4q8 : VST4D<0b0001, {0,0,?,?}, "8">;
@@ -2015,18 +2020,18 @@ def VST4q8_UPD : VST4DWB<0b0001, {0,0,?,?}, "8">;
def VST4q16_UPD : VST4DWB<0b0001, {0,1,?,?}, "16">;
def VST4q32_UPD : VST4DWB<0b0001, {1,0,?,?}, "32">;
-def VST4q8Pseudo_UPD : VSTQQQQWBPseudo<IIC_VST4u>;
-def VST4q16Pseudo_UPD : VSTQQQQWBPseudo<IIC_VST4u>;
-def VST4q32Pseudo_UPD : VSTQQQQWBPseudo<IIC_VST4u>;
+def VST4q8Pseudo_UPD : VSTQQQQWBPseudo<IIC_VST4u>, Sched<[WriteVST4]>;
+def VST4q16Pseudo_UPD : VSTQQQQWBPseudo<IIC_VST4u>, Sched<[WriteVST4]>;
+def VST4q32Pseudo_UPD : VSTQQQQWBPseudo<IIC_VST4u>, Sched<[WriteVST4]>;
// ...alternate versions to be allocated odd register numbers:
-def VST4q8oddPseudo : VSTQQQQPseudo<IIC_VST4>;
-def VST4q16oddPseudo : VSTQQQQPseudo<IIC_VST4>;
-def VST4q32oddPseudo : VSTQQQQPseudo<IIC_VST4>;
+def VST4q8oddPseudo : VSTQQQQPseudo<IIC_VST4>, Sched<[WriteVST4]>;
+def VST4q16oddPseudo : VSTQQQQPseudo<IIC_VST4>, Sched<[WriteVST4]>;
+def VST4q32oddPseudo : VSTQQQQPseudo<IIC_VST4>, Sched<[WriteVST4]>;
-def VST4q8oddPseudo_UPD : VSTQQQQWBPseudo<IIC_VST4u>;
-def VST4q16oddPseudo_UPD : VSTQQQQWBPseudo<IIC_VST4u>;
-def VST4q32oddPseudo_UPD : VSTQQQQWBPseudo<IIC_VST4u>;
+def VST4q8oddPseudo_UPD : VSTQQQQWBPseudo<IIC_VST4u>, Sched<[WriteVST4]>;
+def VST4q16oddPseudo_UPD : VSTQQQQWBPseudo<IIC_VST4u>, Sched<[WriteVST4]>;
+def VST4q32oddPseudo_UPD : VSTQQQQWBPseudo<IIC_VST4u>, Sched<[WriteVST4]>;
} // mayStore = 1, hasSideEffects = 0, hasExtraSrcRegAllocReq = 1
@@ -2060,12 +2065,13 @@ class VST1LN<bits<4> op11_8, bits<4> op7_4, string Dt, ValueType Ty,
: NLdStLn<1, 0b00, op11_8, op7_4, (outs),
(ins AddrMode:$Rn, DPR:$Vd, nohash_imm:$lane),
IIC_VST1ln, "vst1", Dt, "\\{$Vd[$lane]\\}, $Rn", "",
- [(StoreOp (ExtractOp (Ty DPR:$Vd), imm:$lane), AddrMode:$Rn)]> {
+ [(StoreOp (ExtractOp (Ty DPR:$Vd), imm:$lane), AddrMode:$Rn)]>,
+ Sched<[WriteVST1]> {
let Rm = 0b1111;
let DecoderMethod = "DecodeVST1LN";
}
class VST1QLNPseudo<ValueType Ty, PatFrag StoreOp, SDNode ExtractOp>
- : VSTQLNPseudo<IIC_VST1ln> {
+ : VSTQLNPseudo<IIC_VST1ln>, Sched<[WriteVST1]> {
let Pattern = [(StoreOp (ExtractOp (Ty QPR:$src), imm:$lane),
addrmode6:$addr)];
}
@@ -2104,11 +2110,12 @@ class VST1LNWB<bits<4> op11_8, bits<4> op7_4, string Dt, ValueType Ty,
"\\{$Vd[$lane]\\}, $Rn$Rm",
"$Rn.addr = $wb",
[(set GPR:$wb, (StoreOp (ExtractOp (Ty DPR:$Vd), imm:$lane),
- AdrMode:$Rn, am6offset:$Rm))]> {
+ AdrMode:$Rn, am6offset:$Rm))]>,
+ Sched<[WriteVST1]> {
let DecoderMethod = "DecodeVST1LN";
}
class VST1QLNWBPseudo<ValueType Ty, PatFrag StoreOp, SDNode ExtractOp>
- : VSTQLNWBPseudo<IIC_VST1lnu> {
+ : VSTQLNWBPseudo<IIC_VST1lnu>, Sched<[WriteVST1]> {
let Pattern = [(set GPR:$wb, (StoreOp (ExtractOp (Ty QPR:$src), imm:$lane),
addrmode6:$addr, am6offset:$offset))];
}
@@ -2139,7 +2146,7 @@ class VST2LN<bits<4> op11_8, bits<4> op7_4, string Dt>
: NLdStLn<1, 0b00, op11_8, op7_4, (outs),
(ins addrmode6:$Rn, DPR:$Vd, DPR:$src2, nohash_imm:$lane),
IIC_VST2ln, "vst2", Dt, "\\{$Vd[$lane], $src2[$lane]\\}, $Rn",
- "", []> {
+ "", []>, Sched<[WriteVST1]> {
let Rm = 0b1111;
let Inst{4} = Rn{4};
let DecoderMethod = "DecodeVST2LN";
@@ -2155,9 +2162,9 @@ def VST2LNd32 : VST2LN<0b1001, {?,0,0,?}, "32"> {
let Inst{7} = lane{0};
}
-def VST2LNd8Pseudo : VSTQLNPseudo<IIC_VST2ln>;
-def VST2LNd16Pseudo : VSTQLNPseudo<IIC_VST2ln>;
-def VST2LNd32Pseudo : VSTQLNPseudo<IIC_VST2ln>;
+def VST2LNd8Pseudo : VSTQLNPseudo<IIC_VST2ln>, Sched<[WriteVST1]>;
+def VST2LNd16Pseudo : VSTQLNPseudo<IIC_VST2ln>, Sched<[WriteVST1]>;
+def VST2LNd32Pseudo : VSTQLNPseudo<IIC_VST2ln>, Sched<[WriteVST1]>;
// ...with double-spaced registers:
def VST2LNq16 : VST2LN<0b0101, {?,?,1,?}, "16"> {
@@ -2169,8 +2176,8 @@ def VST2LNq32 : VST2LN<0b1001, {?,1,0,?}, "32"> {
let Inst{4} = Rn{4};
}
-def VST2LNq16Pseudo : VSTQQLNPseudo<IIC_VST2ln>;
-def VST2LNq32Pseudo : VSTQQLNPseudo<IIC_VST2ln>;
+def VST2LNq16Pseudo : VSTQQLNPseudo<IIC_VST2ln>, Sched<[WriteVST1]>;
+def VST2LNq32Pseudo : VSTQQLNPseudo<IIC_VST2ln>, Sched<[WriteVST1]>;
// ...with address register writeback:
class VST2LNWB<bits<4> op11_8, bits<4> op7_4, string Dt>
@@ -2193,9 +2200,9 @@ def VST2LNd32_UPD : VST2LNWB<0b1001, {?,0,0,?}, "32"> {
let Inst{7} = lane{0};
}
-def VST2LNd8Pseudo_UPD : VSTQLNWBPseudo<IIC_VST2lnu>;
-def VST2LNd16Pseudo_UPD : VSTQLNWBPseudo<IIC_VST2lnu>;
-def VST2LNd32Pseudo_UPD : VSTQLNWBPseudo<IIC_VST2lnu>;
+def VST2LNd8Pseudo_UPD : VSTQLNWBPseudo<IIC_VST2lnu>, Sched<[WriteVST1]>;
+def VST2LNd16Pseudo_UPD : VSTQLNWBPseudo<IIC_VST2lnu>, Sched<[WriteVST1]>;
+def VST2LNd32Pseudo_UPD : VSTQLNWBPseudo<IIC_VST2lnu>, Sched<[WriteVST1]>;
def VST2LNq16_UPD : VST2LNWB<0b0101, {?,?,1,?}, "16"> {
let Inst{7-6} = lane{1-0};
@@ -2204,15 +2211,16 @@ def VST2LNq32_UPD : VST2LNWB<0b1001, {?,1,0,?}, "32"> {
let Inst{7} = lane{0};
}
-def VST2LNq16Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST2lnu>;
-def VST2LNq32Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST2lnu>;
+def VST2LNq16Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST2lnu>, Sched<[WriteVST1]>;
+def VST2LNq32Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST2lnu>, Sched<[WriteVST1]>;
// VST3LN : Vector Store (single 3-element structure from one lane)
class VST3LN<bits<4> op11_8, bits<4> op7_4, string Dt>
: NLdStLn<1, 0b00, op11_8, op7_4, (outs),
(ins addrmode6:$Rn, DPR:$Vd, DPR:$src2, DPR:$src3,
nohash_imm:$lane), IIC_VST3ln, "vst3", Dt,
- "\\{$Vd[$lane], $src2[$lane], $src3[$lane]\\}, $Rn", "", []> {
+ "\\{$Vd[$lane], $src2[$lane], $src3[$lane]\\}, $Rn", "", []>,
+ Sched<[WriteVST2]> {
let Rm = 0b1111;
let DecoderMethod = "DecodeVST3LN";
}
@@ -2227,9 +2235,9 @@ def VST3LNd32 : VST3LN<0b1010, {?,0,0,0}, "32"> {
let Inst{7} = lane{0};
}
-def VST3LNd8Pseudo : VSTQQLNPseudo<IIC_VST3ln>;
-def VST3LNd16Pseudo : VSTQQLNPseudo<IIC_VST3ln>;
-def VST3LNd32Pseudo : VSTQQLNPseudo<IIC_VST3ln>;
+def VST3LNd8Pseudo : VSTQQLNPseudo<IIC_VST3ln>, Sched<[WriteVST2]>;
+def VST3LNd16Pseudo : VSTQQLNPseudo<IIC_VST3ln>, Sched<[WriteVST2]>;
+def VST3LNd32Pseudo : VSTQQLNPseudo<IIC_VST3ln>, Sched<[WriteVST2]>;
// ...with double-spaced registers:
def VST3LNq16 : VST3LN<0b0110, {?,?,1,0}, "16"> {
@@ -2263,9 +2271,9 @@ def VST3LNd32_UPD : VST3LNWB<0b1010, {?,0,0,0}, "32"> {
let Inst{7} = lane{0};
}
-def VST3LNd8Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST3lnu>;
-def VST3LNd16Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST3lnu>;
-def VST3LNd32Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST3lnu>;
+def VST3LNd8Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST3lnu>, Sched<[WriteVST2]>;
+def VST3LNd16Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST3lnu>, Sched<[WriteVST2]>;
+def VST3LNd32Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST3lnu>, Sched<[WriteVST2]>;
def VST3LNq16_UPD : VST3LNWB<0b0110, {?,?,1,0}, "16"> {
let Inst{7-6} = lane{1-0};
@@ -2274,8 +2282,8 @@ def VST3LNq32_UPD : VST3LNWB<0b1010, {?,1,0,0}, "32"> {
let Inst{7} = lane{0};
}
-def VST3LNq16Pseudo_UPD : VSTQQQQLNWBPseudo<IIC_VST3lnu>;
-def VST3LNq32Pseudo_UPD : VSTQQQQLNWBPseudo<IIC_VST3lnu>;
+def VST3LNq16Pseudo_UPD : VSTQQQQLNWBPseudo<IIC_VST3lnu>, Sched<[WriteVST2]>;
+def VST3LNq32Pseudo_UPD : VSTQQQQLNWBPseudo<IIC_VST3lnu>, Sched<[WriteVST2]>;
// VST4LN : Vector Store (single 4-element structure from one lane)
class VST4LN<bits<4> op11_8, bits<4> op7_4, string Dt>
@@ -2283,7 +2291,7 @@ class VST4LN<bits<4> op11_8, bits<4> op7_4, string Dt>
(ins addrmode6:$Rn, DPR:$Vd, DPR:$src2, DPR:$src3, DPR:$src4,
nohash_imm:$lane), IIC_VST4ln, "vst4", Dt,
"\\{$Vd[$lane], $src2[$lane], $src3[$lane], $src4[$lane]\\}, $Rn",
- "", []> {
+ "", []>, Sched<[WriteVST2]> {
let Rm = 0b1111;
let Inst{4} = Rn{4};
let DecoderMethod = "DecodeVST4LN";
@@ -2300,9 +2308,9 @@ def VST4LNd32 : VST4LN<0b1011, {?,0,?,?}, "32"> {
let Inst{5} = Rn{5};
}
-def VST4LNd8Pseudo : VSTQQLNPseudo<IIC_VST4ln>;
-def VST4LNd16Pseudo : VSTQQLNPseudo<IIC_VST4ln>;
-def VST4LNd32Pseudo : VSTQQLNPseudo<IIC_VST4ln>;
+def VST4LNd8Pseudo : VSTQQLNPseudo<IIC_VST4ln>, Sched<[WriteVST2]>;
+def VST4LNd16Pseudo : VSTQQLNPseudo<IIC_VST4ln>, Sched<[WriteVST2]>;
+def VST4LNd32Pseudo : VSTQQLNPseudo<IIC_VST4ln>, Sched<[WriteVST2]>;
// ...with double-spaced registers:
def VST4LNq16 : VST4LN<0b0111, {?,?,1,?}, "16"> {
@@ -2313,8 +2321,8 @@ def VST4LNq32 : VST4LN<0b1011, {?,1,?,?}, "32"> {
let Inst{5} = Rn{5};
}
-def VST4LNq16Pseudo : VSTQQQQLNPseudo<IIC_VST4ln>;
-def VST4LNq32Pseudo : VSTQQQQLNPseudo<IIC_VST4ln>;
+def VST4LNq16Pseudo : VSTQQQQLNPseudo<IIC_VST4ln>, Sched<[WriteVST2]>;
+def VST4LNq32Pseudo : VSTQQQQLNPseudo<IIC_VST4ln>, Sched<[WriteVST2]>;
// ...with address register writeback:
class VST4LNWB<bits<4> op11_8, bits<4> op7_4, string Dt>
@@ -2339,9 +2347,9 @@ def VST4LNd32_UPD : VST4LNWB<0b1011, {?,0,?,?}, "32"> {
let Inst{5} = Rn{5};
}
-def VST4LNd8Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST4lnu>;
-def VST4LNd16Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST4lnu>;
-def VST4LNd32Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST4lnu>;
+def VST4LNd8Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST4lnu>, Sched<[WriteVST2]>;
+def VST4LNd16Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST4lnu>, Sched<[WriteVST2]>;
+def VST4LNd32Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST4lnu>, Sched<[WriteVST2]>;
def VST4LNq16_UPD : VST4LNWB<0b0111, {?,?,1,?}, "16"> {
let Inst{7-6} = lane{1-0};
@@ -2351,8 +2359,8 @@ def VST4LNq32_UPD : VST4LNWB<0b1011, {?,1,?,?}, "32"> {
let Inst{5} = Rn{5};
}
-def VST4LNq16Pseudo_UPD : VSTQQQQLNWBPseudo<IIC_VST4lnu>;
-def VST4LNq32Pseudo_UPD : VSTQQQQLNWBPseudo<IIC_VST4lnu>;
+def VST4LNq16Pseudo_UPD : VSTQQQQLNWBPseudo<IIC_VST4lnu>, Sched<[WriteVST2]>;
+def VST4LNq32Pseudo_UPD : VSTQQQQLNWBPseudo<IIC_VST4lnu>, Sched<[WriteVST2]>;
} // mayStore = 1, hasSideEffects = 0, hasExtraSrcRegAllocReq = 1
diff --git a/lib/Target/ARM/ARMSchedule.td b/lib/Target/ARM/ARMSchedule.td
index 87eb4c2b9074..ec5b97cba8cd 100644
--- a/lib/Target/ARM/ARMSchedule.td
+++ b/lib/Target/ARM/ARMSchedule.td
@@ -131,6 +131,17 @@ def WriteFPDIV64 : SchedWrite;
def WriteFPSQRT32 : SchedWrite;
def WriteFPSQRT64 : SchedWrite;
+// Vector load and stores
+def WriteVLD1 : SchedWrite;
+def WriteVLD2 : SchedWrite;
+def WriteVLD3 : SchedWrite;
+def WriteVLD4 : SchedWrite;
+def WriteVST1 : SchedWrite;
+def WriteVST2 : SchedWrite;
+def WriteVST3 : SchedWrite;
+def WriteVST4 : SchedWrite;
+
+
// Define TII for use in SchedVariant Predicates.
def : PredicateProlog<[{
const ARMBaseInstrInfo *TII =
diff --git a/lib/Target/ARM/ARMScheduleA9.td b/lib/Target/ARM/ARMScheduleA9.td
index 8fb8a2a3b6d2..4e72b13d94cb 100644
--- a/lib/Target/ARM/ARMScheduleA9.td
+++ b/lib/Target/ARM/ARMScheduleA9.td
@@ -1981,6 +1981,15 @@ def A9WriteV7 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 7; }
def A9WriteV9 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 9; }
def A9WriteV10 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 10; }
+def : WriteRes<WriteVLD1, []>;
+def : WriteRes<WriteVLD2, []>;
+def : WriteRes<WriteVLD3, []>;
+def : WriteRes<WriteVLD4, []>;
+def : WriteRes<WriteVST1, []>;
+def : WriteRes<WriteVST2, []>;
+def : WriteRes<WriteVST3, []>;
+def : WriteRes<WriteVST4, []>;
+
// Reserve A9UnitFP for 2 consecutive cycles.
def A9Write2V4 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> {
let Latency = 4;
diff --git a/lib/Target/ARM/ARMScheduleR52.td b/lib/Target/ARM/ARMScheduleR52.td
index 537e5da9669f..782be9b60a7a 100644
--- a/lib/Target/ARM/ARMScheduleR52.td
+++ b/lib/Target/ARM/ARMScheduleR52.td
@@ -120,6 +120,12 @@ def : WriteRes<WriteFPDIV64, [R52UnitDiv]> {
def : WriteRes<WriteFPSQRT32, [R52UnitDiv]> { let Latency = 7; }
def : WriteRes<WriteFPSQRT64, [R52UnitDiv]> { let Latency = 17; }
+// Overridden via InstRW for this processor.
+def : WriteRes<WriteVST1, []>;
+def : WriteRes<WriteVST2, []>;
+def : WriteRes<WriteVST3, []>;
+def : WriteRes<WriteVST4, []>;
+
def : ReadAdvance<ReadFPMUL, 1>; // mul operand read in F1
def : ReadAdvance<ReadFPMAC, 1>; // fp-mac operand read in F1
@@ -712,20 +718,20 @@ def R52WriteSTM : SchedWriteVariant<[
// Vector Load/Stores. Can issue only in slot-0. Can dual-issue with
// another instruction in slot-1, but only in the last issue.
-def R52WriteVLD1Mem : SchedWriteRes<[R52UnitLd]> { let Latency = 5;}
-def R52WriteVLD2Mem : SchedWriteRes<[R52UnitLd]> {
+def : WriteRes<WriteVLD1, [R52UnitLd]> { let Latency = 5;}
+def : WriteRes<WriteVLD2, [R52UnitLd]> {
let Latency = 6;
let NumMicroOps = 3;
let ResourceCycles = [2];
let SingleIssue = 1;
}
-def R52WriteVLD3Mem : SchedWriteRes<[R52UnitLd]> {
+def : WriteRes<WriteVLD3, [R52UnitLd]> {
let Latency = 7;
let NumMicroOps = 5;
let ResourceCycles = [3];
let SingleIssue = 1;
}
-def R52WriteVLD4Mem : SchedWriteRes<[R52UnitLd]> {
+def : WriteRes<WriteVLD4, [R52UnitLd]> {
let Latency = 8;
let NumMicroOps = 7;
let ResourceCycles = [4];
@@ -829,95 +835,6 @@ def : InstRW<[R52WriteFPALU_F4, R52Read_F1, R52Read_F1], (instregex "VRSHL", "VR
def : InstRW<[R52WriteFPALU_F3, R52Read_F1, R52Read_F1], (instregex "VSWP", "VTRN", "VUZP", "VZIP")>;
//---
-// VLDx. Vector Loads
-//---
-// 1-element structure load
-def : InstRW<[R52WriteVLD1Mem, R52Read_ISS], (instregex "VLD1d(8|16|32|64)$")>;
-def : InstRW<[R52WriteVLD2Mem, R52Read_ISS], (instregex "VLD1q(8|16|32|64)$")>;
-def : InstRW<[R52WriteVLD3Mem, R52Read_ISS], (instregex "VLD1d(8|16|32|64)T$")>;
-def : InstRW<[R52WriteVLD4Mem, R52Read_ISS], (instregex "VLD1d(8|16|32|64)Q$")>;
-def : InstRW<[R52WriteVLD3Mem, R52Read_ISS], (instregex "VLD1d64TPseudo$")>;
-def : InstRW<[R52WriteVLD4Mem, R52Read_ISS], (instregex "VLD1d64QPseudo$")>;
-
-def : InstRW<[R52WriteVLD1Mem, R52Read_ISS], (instregex "VLD1(LN|DUP)d(8|16|32)$")>;
-def : InstRW<[R52WriteVLD1Mem, R52Read_ISS], (instregex "VLD1LNdAsm_(8|16|32)")>;
-def : InstRW<[R52WriteVLD1Mem, R52Read_ISS], (instregex "VLD1(LN|DUP)q(8|16|32)Pseudo$")>;
-
-def : InstRW<[R52WriteVLD1Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD1d(8|16|32|64)wb")>;
-def : InstRW<[R52WriteVLD2Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD1q(8|16|32|64)wb")>;
-def : InstRW<[R52WriteVLD3Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD1d(8|16|32|64)Twb")>;
-def : InstRW<[R52WriteVLD4Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD1d(8|16|32|64)Qwb")>;
-def : InstRW<[R52WriteVLD3Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD1d64TPseudoWB")>;
-def : InstRW<[R52WriteVLD4Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD1d64QPseudoWB")>;
-
-def : InstRW<[R52WriteVLD1Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD1LNd(8|16|32)_UPD")>;
-def : InstRW<[R52WriteVLD1Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD1LNdWB_(fixed|register)_Asm_(8|16|32)")>;
-def : InstRW<[R52WriteVLD1Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD1DUP(d|q)(8|16|32)wb")>;
-def : InstRW<[R52WriteVLD1Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD1(LN|DUP)q(8|16|32)Pseudo_UPD")>;
-
-// 2-element structure load
-def : InstRW<[R52WriteVLD2Mem, R52Read_ISS], (instregex "VLD2(d|b)(8|16|32)$")>;
-def : InstRW<[R52WriteVLD4Mem, R52Read_ISS], (instregex "VLD2q(8|16|32)$")>;
-def : InstRW<[R52WriteVLD2Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD2(d|b)(8|16|32)wb")>;
-def : InstRW<[R52WriteVLD4Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD2q(8|16|32)wb")>;
-def : InstRW<[R52WriteVLD4Mem, R52Read_ISS], (instregex "VLD2q(8|16|32)Pseudo$")>;
-def : InstRW<[R52WriteVLD4Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD2q(8|16|32)PseudoWB")>;
-
-def : InstRW<[R52WriteVLD1Mem, R52Read_ISS], (instregex "VLD2LNd(8|16|32)$")>;
-def : InstRW<[R52WriteVLD1Mem, R52Read_ISS], (instregex "VLD2LNdAsm_(8|16|32)$")>;
-def : InstRW<[R52WriteVLD1Mem, R52Read_ISS], (instregex "VLD2LNq(16|32)$")>;
-def : InstRW<[R52WriteVLD1Mem, R52Read_ISS], (instregex "VLD2LNqAsm_(16|32)$")>;
-def : InstRW<[R52WriteVLD1Mem, R52Read_ISS], (instregex "VLD2DUPd(8|16|32)$")>;
-def : InstRW<[R52WriteVLD1Mem, R52Read_ISS], (instregex "VLD2DUPd(8|16|32)x2$")>;
-def : InstRW<[R52WriteVLD1Mem, R52Read_ISS], (instregex "VLD2LNd(8|16|32)Pseudo")>;
-def : InstRW<[R52WriteVLD1Mem, R52Read_ISS], (instregex "VLD2LNq(16|32)Pseudo")>;
-
-def : InstRW<[R52WriteVLD1Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD2LNd(8|16|32)_UPD")>;
-def : InstRW<[R52WriteVLD1Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD2LNdWB_(fixed|register)_Asm_(8|16|32)")>;
-
-def : InstRW<[R52WriteVLD1Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD2LNq(16|32)_UPD")>;
-def : InstRW<[R52WriteVLD1Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD2LNqWB_(fixed|register)_Asm_(16|32)")>;
-
-def : InstRW<[R52WriteVLD1Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD2DUPd(8|16|32)wb")>;
-def : InstRW<[R52WriteVLD1Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD2DUPd(8|16|32)x2wb")>;
-def : InstRW<[R52WriteVLD1Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD2LNd(8|16|32)Pseudo_UPD")>;
-def : InstRW<[R52WriteVLD1Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD2LNq(16|32)Pseudo_UPD")>;
-
-// 3-element structure load
-def : InstRW<[R52WriteVLD3Mem, R52Read_ISS], (instregex "VLD3(d|q)(8|16|32)$")>;
-def : InstRW<[R52WriteVLD3Mem, R52Read_ISS], (instregex "VLD3(d|q)Asm_(8|16|32)$")>;
-def : InstRW<[R52WriteVLD3Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD3(d|q)(8|16|32)_UPD")>;
-def : InstRW<[R52WriteVLD3Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD3(d|q)WB_(fixed|register)_Asm_(8|16|32)")>;
-def : InstRW<[R52WriteVLD3Mem, R52Read_ISS], (instregex "VLD3(d|q)(8|16|32)(oddP|P)seudo")>;
-def : InstRW<[R52WriteVLD3Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD3(d|q)(8|16|32)(oddP|P)seudo_UPD")>;
-
-def : InstRW<[R52WriteVLD2Mem, R52Read_ISS], (instregex "VLD3(LN|DUP)(d|q)(8|16|32)$")>;
-def : InstRW<[R52WriteVLD2Mem, R52Read_ISS], (instregex "VLD3(LN|DUP)(d|q)Asm_(8|16|32)$")>;
-def : InstRW<[R52WriteVLD2Mem, R52Read_ISS], (instregex "VLD3(LN|DUP)(d|q)(8|16|32)Pseudo$")>;
-
-def : InstRW<[R52WriteVLD2Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD3(LN|DUP)(d|q)(8|16|32)_UPD")>;
-def : InstRW<[R52WriteVLD2Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD3(LN|DUP)(d|q)WB_(fixed|register)_Asm_(8|16|32)")>;
-def : InstRW<[R52WriteVLD2Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD3(LN|DUP)(d|q)WB_(fixed|register)_Asm_(8|16|32)")>;
-def : InstRW<[R52WriteVLD2Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD3(LN|DUP)(d|q)(8|16|32)Pseudo_UPD")>;
-
-// 4-element structure load
-def : InstRW<[R52WriteVLD4Mem, R52Read_ISS], (instregex "VLD4(d|q)(8|16|32)$")>;
-def : InstRW<[R52WriteVLD4Mem, R52Read_ISS], (instregex "VLD4(d|q)Asm_(8|16|32)$")>;
-def : InstRW<[R52WriteVLD4Mem, R52Read_ISS], (instregex "VLD4(d|q)(8|16|32)(oddP|P)seudo")>;
-def : InstRW<[R52WriteVLD4Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD4(d|q)(8|16|32)_UPD")>;
-def : InstRW<[R52WriteVLD4Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD4(d|q)WB_(fixed|register)_Asm_(8|16|32)")>;
-def : InstRW<[R52WriteVLD4Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD4(d|q)(8|16|32)(oddP|P)seudo_UPD")>;
-
-
-def : InstRW<[R52WriteVLD2Mem, R52Read_ISS], (instregex "VLD4(LN|DUP)(d|q)(8|16|32)$")>;
-def : InstRW<[R52WriteVLD2Mem, R52Read_ISS], (instregex "VLD4(LN|DUP)(d|q)Asm_(8|16|32)$")>;
-def : InstRW<[R52WriteVLD2Mem, R52Read_ISS], (instregex "VLD4LN(d|q)(8|16|32)Pseudo$")>;
-def : InstRW<[R52WriteVLD2Mem, R52Read_ISS], (instregex "VLD4DUPd(8|16|32)Pseudo$")>;
-def : InstRW<[R52WriteVLD2Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD4(LN|DUP)(d|q)(8|16|32)_UPD")>;
-def : InstRW<[R52WriteVLD2Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD4(LN|DUP)(d|q)WB_(fixed|register)_Asm_(8|16|32)")>;
-def : InstRW<[R52WriteVLD2Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD4(LN|DUP)(d|q)(8|16|32)Pseudo_UPD")>;
-
-//---
// VSTx. Vector Stores
//---
// 1-element structure store
diff --git a/lib/Target/ARM/ARMScheduleSwift.td b/lib/Target/ARM/ARMScheduleSwift.td
index dc041c6c6006..b838688c6f04 100644
--- a/lib/Target/ARM/ARMScheduleSwift.td
+++ b/lib/Target/ARM/ARMScheduleSwift.td
@@ -1070,6 +1070,16 @@ let SchedModel = SwiftModel in {
def : ReadAdvance<ReadFPMUL, 0>;
def : ReadAdvance<ReadFPMAC, 0>;
+ // Overridden via InstRW for this processor.
+ def : WriteRes<WriteVLD1, []>;
+ def : WriteRes<WriteVLD2, []>;
+ def : WriteRes<WriteVLD3, []>;
+ def : WriteRes<WriteVLD4, []>;
+ def : WriteRes<WriteVST1, []>;
+ def : WriteRes<WriteVST2, []>;
+ def : WriteRes<WriteVST3, []>;
+ def : WriteRes<WriteVST4, []>;
+
// Not specified.
def : InstRW<[SwiftWriteP01OneCycle2x], (instregex "ABS")>;
// Preload.
diff --git a/lib/Target/ARM/ARMTargetMachine.cpp b/lib/Target/ARM/ARMTargetMachine.cpp
index 1979cbf50125..c4f23c66e4ea 100644
--- a/lib/Target/ARM/ARMTargetMachine.cpp
+++ b/lib/Target/ARM/ARMTargetMachine.cpp
@@ -85,9 +85,9 @@ namespace llvm {
extern "C" void LLVMInitializeARMTarget() {
// Register the target.
RegisterTargetMachine<ARMLETargetMachine> X(getTheARMLETarget());
+ RegisterTargetMachine<ARMLETargetMachine> A(getTheThumbLETarget());
RegisterTargetMachine<ARMBETargetMachine> Y(getTheARMBETarget());
- RegisterTargetMachine<ThumbLETargetMachine> A(getTheThumbLETarget());
- RegisterTargetMachine<ThumbBETargetMachine> B(getTheThumbBETarget());
+ RegisterTargetMachine<ARMBETargetMachine> B(getTheThumbBETarget());
PassRegistry &Registry = *PassRegistry::getPassRegistry();
initializeGlobalISel(Registry);
@@ -263,6 +263,11 @@ ARMBaseTargetMachine::ARMBaseTargetMachine(const Target &T, const Triple &TT,
else
this->Options.EABIVersion = EABI::EABI5;
}
+
+ initAsmInfo();
+ if (!Subtarget.isThumb() && !Subtarget.hasARMOps())
+ report_fatal_error("CPU: '" + Subtarget.getCPUString() + "' does not "
+ "support ARM mode execution!");
}
ARMBaseTargetMachine::~ARMBaseTargetMachine() = default;
@@ -355,22 +360,6 @@ TargetIRAnalysis ARMBaseTargetMachine::getTargetIRAnalysis() {
});
}
-void ARMTargetMachine::anchor() {}
-
-ARMTargetMachine::ARMTargetMachine(const Target &T, const Triple &TT,
- StringRef CPU, StringRef FS,
- const TargetOptions &Options,
- Optional<Reloc::Model> RM,
- CodeModel::Model CM, CodeGenOpt::Level OL,
- bool isLittle)
- : ARMBaseTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, isLittle) {
- initAsmInfo();
- if (!Subtarget.hasARMOps())
- report_fatal_error("CPU: '" + Subtarget.getCPUString() + "' does not "
- "support ARM mode execution!");
-}
-
-void ARMLETargetMachine::anchor() {}
ARMLETargetMachine::ARMLETargetMachine(const Target &T, const Triple &TT,
StringRef CPU, StringRef FS,
@@ -378,9 +367,7 @@ ARMLETargetMachine::ARMLETargetMachine(const Target &T, const Triple &TT,
Optional<Reloc::Model> RM,
CodeModel::Model CM,
CodeGenOpt::Level OL)
- : ARMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {}
-
-void ARMBETargetMachine::anchor() {}
+ : ARMBaseTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {}
ARMBETargetMachine::ARMBETargetMachine(const Target &T, const Triple &TT,
StringRef CPU, StringRef FS,
@@ -388,39 +375,7 @@ ARMBETargetMachine::ARMBETargetMachine(const Target &T, const Triple &TT,
Optional<Reloc::Model> RM,
CodeModel::Model CM,
CodeGenOpt::Level OL)
- : ARMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {}
-
-void ThumbTargetMachine::anchor() {}
-
-ThumbTargetMachine::ThumbTargetMachine(const Target &T, const Triple &TT,
- StringRef CPU, StringRef FS,
- const TargetOptions &Options,
- Optional<Reloc::Model> RM,
- CodeModel::Model CM,
- CodeGenOpt::Level OL, bool isLittle)
- : ARMBaseTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, isLittle) {
- initAsmInfo();
-}
-
-void ThumbLETargetMachine::anchor() {}
-
-ThumbLETargetMachine::ThumbLETargetMachine(const Target &T, const Triple &TT,
- StringRef CPU, StringRef FS,
- const TargetOptions &Options,
- Optional<Reloc::Model> RM,
- CodeModel::Model CM,
- CodeGenOpt::Level OL)
- : ThumbTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {}
-
-void ThumbBETargetMachine::anchor() {}
-
-ThumbBETargetMachine::ThumbBETargetMachine(const Target &T, const Triple &TT,
- StringRef CPU, StringRef FS,
- const TargetOptions &Options,
- Optional<Reloc::Model> RM,
- CodeModel::Model CM,
- CodeGenOpt::Level OL)
- : ThumbTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {}
+ : ARMBaseTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {}
namespace {
diff --git a/lib/Target/ARM/ARMTargetMachine.h b/lib/Target/ARM/ARMTargetMachine.h
index f0ca9427d9fb..e5eb27114c72 100644
--- a/lib/Target/ARM/ARMTargetMachine.h
+++ b/lib/Target/ARM/ARMTargetMachine.h
@@ -62,23 +62,9 @@ public:
}
};
-/// ARM target machine.
+/// ARM/Thumb little endian target machine.
///
-class ARMTargetMachine : public ARMBaseTargetMachine {
- virtual void anchor();
-
-public:
- ARMTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
- StringRef FS, const TargetOptions &Options,
- Optional<Reloc::Model> RM, CodeModel::Model CM,
- CodeGenOpt::Level OL, bool isLittle);
-};
-
-/// ARM little endian target machine.
-///
-class ARMLETargetMachine : public ARMTargetMachine {
- void anchor() override;
-
+class ARMLETargetMachine : public ARMBaseTargetMachine {
public:
ARMLETargetMachine(const Target &T, const Triple &TT, StringRef CPU,
StringRef FS, const TargetOptions &Options,
@@ -86,11 +72,9 @@ public:
CodeGenOpt::Level OL);
};
-/// ARM big endian target machine.
+/// ARM/Thumb big endian target machine.
///
-class ARMBETargetMachine : public ARMTargetMachine {
- void anchor() override;
-
+class ARMBETargetMachine : public ARMBaseTargetMachine {
public:
ARMBETargetMachine(const Target &T, const Triple &TT, StringRef CPU,
StringRef FS, const TargetOptions &Options,
@@ -98,44 +82,6 @@ public:
CodeGenOpt::Level OL);
};
-/// Thumb target machine.
-/// Due to the way architectures are handled, this represents both
-/// Thumb-1 and Thumb-2.
-///
-class ThumbTargetMachine : public ARMBaseTargetMachine {
- virtual void anchor();
-
-public:
- ThumbTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
- StringRef FS, const TargetOptions &Options,
- Optional<Reloc::Model> RM, CodeModel::Model CM,
- CodeGenOpt::Level OL, bool isLittle);
-};
-
-/// Thumb little endian target machine.
-///
-class ThumbLETargetMachine : public ThumbTargetMachine {
- void anchor() override;
-
-public:
- ThumbLETargetMachine(const Target &T, const Triple &TT, StringRef CPU,
- StringRef FS, const TargetOptions &Options,
- Optional<Reloc::Model> RM, CodeModel::Model CM,
- CodeGenOpt::Level OL);
-};
-
-/// Thumb big endian target machine.
-///
-class ThumbBETargetMachine : public ThumbTargetMachine {
- void anchor() override;
-
-public:
- ThumbBETargetMachine(const Target &T, const Triple &TT, StringRef CPU,
- StringRef FS, const TargetOptions &Options,
- Optional<Reloc::Model> RM, CodeModel::Model CM,
- CodeGenOpt::Level OL);
-};
-
} // end namespace llvm
#endif // LLVM_LIB_TARGET_ARM_ARMTARGETMACHINE_H
diff --git a/lib/Target/ARM/ARMTargetObjectFile.cpp b/lib/Target/ARM/ARMTargetObjectFile.cpp
index 94f9e8dfebbf..edbf2b99126c 100644
--- a/lib/Target/ARM/ARMTargetObjectFile.cpp
+++ b/lib/Target/ARM/ARMTargetObjectFile.cpp
@@ -30,8 +30,8 @@ using namespace dwarf;
void ARMElfTargetObjectFile::Initialize(MCContext &Ctx,
const TargetMachine &TM) {
- const ARMTargetMachine &ARM_TM = static_cast<const ARMTargetMachine &>(TM);
- bool isAAPCS_ABI = ARM_TM.TargetABI == ARMTargetMachine::ARMABI::ARM_ABI_AAPCS;
+ const ARMBaseTargetMachine &ARM_TM = static_cast<const ARMBaseTargetMachine &>(TM);
+ bool isAAPCS_ABI = ARM_TM.TargetABI == ARMBaseTargetMachine::ARMABI::ARM_ABI_AAPCS;
genExecuteOnly = ARM_TM.getSubtargetImpl()->genExecuteOnly();
TargetLoweringObjectFileELF::Initialize(Ctx, TM);
diff --git a/lib/Target/ARM/Thumb1FrameLowering.cpp b/lib/Target/ARM/Thumb1FrameLowering.cpp
index 1a17d4e33e4f..f917c35b9ceb 100644
--- a/lib/Target/ARM/Thumb1FrameLowering.cpp
+++ b/lib/Target/ARM/Thumb1FrameLowering.cpp
@@ -535,14 +535,14 @@ bool Thumb1FrameLowering::emitPopSpecialFixUp(MachineBasicBlock &MBB,
// Look for a temporary register to use.
// First, compute the liveness information.
- LivePhysRegs UsedRegs(STI.getRegisterInfo());
+ const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
+ LivePhysRegs UsedRegs(TRI);
UsedRegs.addLiveOuts(MBB);
// The semantic of pristines changed recently and now,
// the callee-saved registers that are touched in the function
// are not part of the pristines set anymore.
// Add those callee-saved now.
- const TargetRegisterInfo *TRI = STI.getRegisterInfo();
- const MCPhysReg *CSRegs = TRI->getCalleeSavedRegs(&MF);
+ const MCPhysReg *CSRegs = TRI.getCalleeSavedRegs(&MF);
for (unsigned i = 0; CSRegs[i]; ++i)
UsedRegs.addReg(CSRegs[i]);
@@ -561,12 +561,12 @@ bool Thumb1FrameLowering::emitPopSpecialFixUp(MachineBasicBlock &MBB,
// And some temporary register, just in case.
unsigned TemporaryReg = 0;
BitVector PopFriendly =
- TRI->getAllocatableSet(MF, TRI->getRegClass(ARM::tGPRRegClassID));
+ TRI.getAllocatableSet(MF, TRI.getRegClass(ARM::tGPRRegClassID));
assert(PopFriendly.any() && "No allocatable pop-friendly register?!");
// Rebuild the GPRs from the high registers because they are removed
// form the GPR reg class for thumb1.
BitVector GPRsNoLRSP =
- TRI->getAllocatableSet(MF, TRI->getRegClass(ARM::hGPRRegClassID));
+ TRI.getAllocatableSet(MF, TRI.getRegClass(ARM::hGPRRegClassID));
GPRsNoLRSP |= PopFriendly;
GPRsNoLRSP.reset(ARM::LR);
GPRsNoLRSP.reset(ARM::SP);
diff --git a/lib/Target/AVR/AVRInstrInfo.td b/lib/Target/AVR/AVRInstrInfo.td
index 06ad2b3ffdf8..f10ca394f36c 100644
--- a/lib/Target/AVR/AVRInstrInfo.td
+++ b/lib/Target/AVR/AVRInstrInfo.td
@@ -902,7 +902,6 @@ let Defs = [SREG] in
// CPI Rd, K
// Compares a register with an 8 bit immediate.
- let Uses = [SREG] in
def CPIRdK : FRdK<0b0011,
(outs),
(ins GPR8:$rd, imm_ldi8:$k),
diff --git a/lib/Target/BPF/BPFISelLowering.cpp b/lib/Target/BPF/BPFISelLowering.cpp
index 6897161c903c..cc7a7c3849bc 100644
--- a/lib/Target/BPF/BPFISelLowering.cpp
+++ b/lib/Target/BPF/BPFISelLowering.cpp
@@ -132,6 +132,10 @@ BPFTargetLowering::BPFTargetLowering(const TargetMachine &TM,
MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = 128;
}
+bool BPFTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
+ return false;
+}
+
SDValue BPFTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
switch (Op.getOpcode()) {
case ISD::BR_CC:
@@ -496,8 +500,11 @@ const char *BPFTargetLowering::getTargetNodeName(unsigned Opcode) const {
SDValue BPFTargetLowering::LowerGlobalAddress(SDValue Op,
SelectionDAG &DAG) const {
+ auto N = cast<GlobalAddressSDNode>(Op);
+ assert(N->getOffset() == 0 && "Invalid offset for global address");
+
SDLoc DL(Op);
- const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
+ const GlobalValue *GV = N->getGlobal();
SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i64);
return DAG.getNode(BPFISD::Wrapper, DL, MVT::i64, GA);
diff --git a/lib/Target/BPF/BPFISelLowering.h b/lib/Target/BPF/BPFISelLowering.h
index 3d1726be286e..0b8a8ca20c3b 100644
--- a/lib/Target/BPF/BPFISelLowering.h
+++ b/lib/Target/BPF/BPFISelLowering.h
@@ -42,6 +42,10 @@ public:
// This method returns the name of a target specific DAG node.
const char *getTargetNodeName(unsigned Opcode) const override;
+ // This method decides whether folding a constant offset
+ // with the given GlobalAddress is legal.
+ bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override;
+
MachineBasicBlock *
EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *BB) const override;
diff --git a/lib/Target/Hexagon/HexagonFrameLowering.cpp b/lib/Target/Hexagon/HexagonFrameLowering.cpp
index a04aca4afa0f..25018b9ed510 100644
--- a/lib/Target/Hexagon/HexagonFrameLowering.cpp
+++ b/lib/Target/Hexagon/HexagonFrameLowering.cpp
@@ -1657,7 +1657,7 @@ bool HexagonFrameLowering::expandStoreVec2(MachineBasicBlock &B,
// defined. From the point of view of the liveness tracking, it is ok to
// store it as a whole, but if we break it up we may end up storing a
// register that is entirely undefined.
- LivePhysRegs LPR(&HRI);
+ LivePhysRegs LPR(HRI);
LPR.addLiveIns(B);
SmallVector<std::pair<unsigned, const MachineOperand*>,2> Clobbers;
for (auto R = B.begin(); R != It; ++R) {
diff --git a/lib/Target/Hexagon/HexagonInstrInfo.cpp b/lib/Target/Hexagon/HexagonInstrInfo.cpp
index 03794511414e..66e07c67958e 100644
--- a/lib/Target/Hexagon/HexagonInstrInfo.cpp
+++ b/lib/Target/Hexagon/HexagonInstrInfo.cpp
@@ -1254,7 +1254,7 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
const MachineOperand &Op1 = MI.getOperand(1);
const MachineOperand &Op2 = MI.getOperand(2);
const MachineOperand &Op3 = MI.getOperand(3);
- LivePhysRegs LiveAtMI(&HRI);
+ LivePhysRegs LiveAtMI(HRI);
getLiveRegsAt(LiveAtMI, MI);
bool IsDestLive = !LiveAtMI.available(MRI, Op0.getReg());
if (Op0.getReg() != Op2.getReg()) {
@@ -1283,7 +1283,7 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MachineOperand &Op1 = MI.getOperand(1);
MachineOperand &Op2 = MI.getOperand(2);
MachineOperand &Op3 = MI.getOperand(3);
- LivePhysRegs LiveAtMI(&HRI);
+ LivePhysRegs LiveAtMI(HRI);
getLiveRegsAt(LiveAtMI, MI);
bool IsDestLive = !LiveAtMI.available(MRI, Op0.getReg());
diff --git a/lib/Target/Hexagon/HexagonPseudo.td b/lib/Target/Hexagon/HexagonPseudo.td
index 0f99dfe342b8..93fb688fc1c0 100644
--- a/lib/Target/Hexagon/HexagonPseudo.td
+++ b/lib/Target/Hexagon/HexagonPseudo.td
@@ -412,6 +412,15 @@ def PS_vstorerwu_ai: STrivv_template<VecDblRegs, V6_vS32Ub_ai>,
def PS_vstorerwu_ai_128B: STrivv_template<VecDblRegs128B, V6_vS32Ub_ai_128B>,
Requires<[HasV60T,UseHVXDbl]>;
+let isPseudo = 1, isCodeGenOnly = 1, mayStore = 1, hasSideEffects = 0 in {
+ def PS_vstorerq_ai: Pseudo<(outs),
+ (ins IntRegs:$Rs, s32_0Imm:$Off, VecPredRegs:$Qt), "", []>,
+ Requires<[HasV60T,UseHVXSgl]>;
+ def PS_vstorerq_ai_128B: Pseudo<(outs),
+ (ins IntRegs:$Rs, s32_0Imm:$Off, VecPredRegs128B:$Qt), "", []>,
+ Requires<[HasV60T,UseHVXDbl]>;
+}
+
// Vector load pseudos
let Predicates = [HasV60T, UseHVX], isPseudo = 1, isCodeGenOnly = 1,
mayLoad = 1, hasSideEffects = 0 in
@@ -429,30 +438,16 @@ def PS_vloadrwu_ai: LDrivv_template<VecDblRegs, V6_vL32Ub_ai>,
def PS_vloadrwu_ai_128B: LDrivv_template<VecDblRegs128B, V6_vL32Ub_ai_128B>,
Requires<[HasV60T,UseHVXDbl]>;
-// Store vector predicate pseudo.
-let isExtendable = 1, opExtendable = 1, isExtentSigned = 1, opExtentBits = 13,
- isCodeGenOnly = 1, isPseudo = 1, mayStore = 1, hasSideEffects = 0 in {
- def PS_vstorerq_ai : STInst<(outs),
- (ins IntRegs:$base, s32_0Imm:$offset, VecPredRegs:$src1),
- ".error \"should not emit\" ", []>,
- Requires<[HasV60T,UseHVXSgl]>;
-
- def PS_vstorerq_ai_128B : STInst<(outs),
- (ins IntRegs:$base, s32_0Imm:$offset, VectorRegs:$src1),
- ".error \"should not emit\" ", []>,
- Requires<[HasV60T,UseHVXSgl]>;
-
- def PS_vloadrq_ai : STInst<(outs),
- (ins IntRegs:$base, s32_0Imm:$offset, VecPredRegs128B:$src1),
- ".error \"should not emit\" ", []>,
- Requires<[HasV60T,UseHVXDbl]>;
-
- def PS_vloadrq_ai_128B : STInst<(outs),
- (ins IntRegs:$base, s32_0Imm:$offset, VecPredRegs128B:$src1),
- ".error \"should not emit\" ", []>,
- Requires<[HasV60T,UseHVXDbl]>;
+let isPseudo = 1, isCodeGenOnly = 1, mayLoad = 1, hasSideEffects = 0 in {
+ def PS_vloadrq_ai: Pseudo<(outs VecPredRegs:$Qd),
+ (ins IntRegs:$Rs, s32_0Imm:$Off), "", []>,
+ Requires<[HasV60T,UseHVXSgl]>;
+ def PS_vloadrq_ai_128B: Pseudo<(outs VecPredRegs128B:$Qd),
+ (ins IntRegs:$Rs, s32_0Imm:$Off), "", []>,
+ Requires<[HasV60T,UseHVXDbl]>;
}
+
let isCodeGenOnly = 1, isPseudo = 1, hasSideEffects = 0 in
class VSELInst<dag outs, dag ins, InstHexagon rootInst>
: InstHexagon<outs, ins, "", [], "", rootInst.Itinerary, rootInst.Type>;
diff --git a/lib/Target/Hexagon/HexagonRegisterInfo.cpp b/lib/Target/Hexagon/HexagonRegisterInfo.cpp
index 2a1bb63af789..1fc157900ed5 100644
--- a/lib/Target/Hexagon/HexagonRegisterInfo.cpp
+++ b/lib/Target/Hexagon/HexagonRegisterInfo.cpp
@@ -50,11 +50,6 @@ bool HexagonRegisterInfo::isEHReturnCalleeSaveReg(unsigned R) const {
R == Hexagon::R3 || R == Hexagon::D0 || R == Hexagon::D1;
}
-bool HexagonRegisterInfo::isCalleeSaveReg(unsigned Reg) const {
- return Hexagon::R16 <= Reg && Reg <= Hexagon::R27;
-}
-
-
const MCPhysReg *
HexagonRegisterInfo::getCallerSavedRegs(const MachineFunction *MF,
const TargetRegisterClass *RC) const {
diff --git a/lib/Target/Hexagon/HexagonRegisterInfo.h b/lib/Target/Hexagon/HexagonRegisterInfo.h
index 8a3f175b8488..5f65fad2cc04 100644
--- a/lib/Target/Hexagon/HexagonRegisterInfo.h
+++ b/lib/Target/Hexagon/HexagonRegisterInfo.h
@@ -77,7 +77,6 @@ public:
unsigned getFirstCallerSavedNonParamReg() const;
bool isEHReturnCalleeSaveReg(unsigned Reg) const;
- bool isCalleeSaveReg(unsigned Reg) const;
};
} // end namespace llvm
diff --git a/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp b/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
index c21b6e2515d3..cd474921d4bc 100644
--- a/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
+++ b/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
@@ -214,12 +214,12 @@ bool HexagonPacketizer::runOnMachineFunction(MachineFunction &MF) {
for (auto &MB : MF) {
auto Begin = MB.begin(), End = MB.end();
while (Begin != End) {
- // First the first non-boundary starting from the end of the last
+ // Find the first non-boundary starting from the end of the last
// scheduling region.
MachineBasicBlock::iterator RB = Begin;
while (RB != End && HII->isSchedulingBoundary(*RB, &MB, MF))
++RB;
- // First the first boundary starting from the beginning of the new
+ // Find the first boundary starting from the beginning of the new
// region.
MachineBasicBlock::iterator RE = RB;
while (RE != End && !HII->isSchedulingBoundary(*RE, &MB, MF))
diff --git a/lib/Target/LLVMBuild.txt b/lib/Target/LLVMBuild.txt
index 8be2a898e380..34b966df7761 100644
--- a/lib/Target/LLVMBuild.txt
+++ b/lib/Target/LLVMBuild.txt
@@ -29,6 +29,7 @@ subdirectories =
MSP430
NVPTX
Mips
+ Nios2
PowerPC
RISCV
Sparc
diff --git a/lib/Target/MSP430/MSP430.td b/lib/Target/MSP430/MSP430.td
index dfea669f3ba1..203864dd4065 100644
--- a/lib/Target/MSP430/MSP430.td
+++ b/lib/Target/MSP430/MSP430.td
@@ -22,6 +22,18 @@ def FeatureX
: SubtargetFeature<"ext", "ExtendedInsts", "true",
"Enable MSP430-X extensions">;
+def FeatureHWMult16
+ : SubtargetFeature<"hwmult16", "HWMultMode", "HWMult16",
+ "Enable 16-bit hardware multiplier">;
+
+def FeatureHWMult32
+ : SubtargetFeature<"hwmult32", "HWMultMode", "HWMult32",
+ "Enable 32-bit hardware multiplier">;
+
+def FeatureHWMultF5
+ : SubtargetFeature<"hwmultf5", "HWMultMode", "HWMultF5",
+ "Enable F5 series hardware multiplier">;
+
//===----------------------------------------------------------------------===//
// MSP430 supported processors.
//===----------------------------------------------------------------------===//
@@ -29,6 +41,8 @@ class Proc<string Name, list<SubtargetFeature> Features>
: Processor<Name, NoItineraries, Features>;
def : Proc<"generic", []>;
+def : Proc<"msp430", []>;
+def : Proc<"msp430x", [FeatureX]>;
//===----------------------------------------------------------------------===//
// Register File Description
diff --git a/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp b/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp
index cd58eda5d924..0b02f79f472a 100644
--- a/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp
+++ b/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp
@@ -403,12 +403,12 @@ void MSP430DAGToDAGISel::Select(SDNode *Node) {
int FI = cast<FrameIndexSDNode>(Node)->getIndex();
SDValue TFI = CurDAG->getTargetFrameIndex(FI, MVT::i16);
if (Node->hasOneUse()) {
- CurDAG->SelectNodeTo(Node, MSP430::ADD16ri, MVT::i16, TFI,
+ CurDAG->SelectNodeTo(Node, MSP430::ADDframe, MVT::i16, TFI,
CurDAG->getTargetConstant(0, dl, MVT::i16));
return;
}
ReplaceNode(Node, CurDAG->getMachineNode(
- MSP430::ADD16ri, dl, MVT::i16, TFI,
+ MSP430::ADDframe, dl, MVT::i16, TFI,
CurDAG->getTargetConstant(0, dl, MVT::i16)));
return;
}
diff --git a/lib/Target/MSP430/MSP430ISelLowering.cpp b/lib/Target/MSP430/MSP430ISelLowering.cpp
index cc6e64043f54..dae14fd301ee 100644
--- a/lib/Target/MSP430/MSP430ISelLowering.cpp
+++ b/lib/Target/MSP430/MSP430ISelLowering.cpp
@@ -38,27 +38,6 @@ using namespace llvm;
#define DEBUG_TYPE "msp430-lower"
-typedef enum {
- NoHWMult,
- HWMult16,
- HWMult32,
- HWMultF5
-} HWMultUseMode;
-
-static cl::opt<HWMultUseMode>
-HWMultMode("mhwmult", cl::Hidden,
- cl::desc("Hardware multiplier use mode"),
- cl::init(NoHWMult),
- cl::values(
- clEnumValN(NoHWMult, "none",
- "Do not use hardware multiplier"),
- clEnumValN(HWMult16, "16bit",
- "Use 16-bit hardware multiplier"),
- clEnumValN(HWMult32, "32bit",
- "Use 32-bit hardware multiplier"),
- clEnumValN(HWMultF5, "f5series",
- "Use F5 series hardware multiplier")));
-
MSP430TargetLowering::MSP430TargetLowering(const TargetMachine &TM,
const MSP430Subtarget &STI)
: TargetLowering(TM) {
@@ -262,7 +241,7 @@ MSP430TargetLowering::MSP430TargetLowering(const TargetMachine &TM,
setCmpLibcallCC(LC.Op, LC.Cond);
}
- if (HWMultMode == HWMult16) {
+ if (STI.hasHWMult16()) {
const struct {
const RTLIB::Libcall Op;
const char * const Name;
@@ -277,7 +256,7 @@ MSP430TargetLowering::MSP430TargetLowering(const TargetMachine &TM,
for (const auto &LC : LibraryCalls) {
setLibcallName(LC.Op, LC.Name);
}
- } else if (HWMultMode == HWMult32) {
+ } else if (STI.hasHWMult32()) {
const struct {
const RTLIB::Libcall Op;
const char * const Name;
@@ -292,7 +271,7 @@ MSP430TargetLowering::MSP430TargetLowering(const TargetMachine &TM,
for (const auto &LC : LibraryCalls) {
setLibcallName(LC.Op, LC.Name);
}
- } else if (HWMultMode == HWMultF5) {
+ } else if (STI.hasHWMultF5()) {
const struct {
const RTLIB::Libcall Op;
const char * const Name;
diff --git a/lib/Target/MSP430/MSP430InstrInfo.td b/lib/Target/MSP430/MSP430InstrInfo.td
index 1cd18611e52c..cec43040f60d 100644
--- a/lib/Target/MSP430/MSP430InstrInfo.td
+++ b/lib/Target/MSP430/MSP430InstrInfo.td
@@ -122,6 +122,11 @@ def ADJCALLSTACKUP : Pseudo<(outs), (ins i16imm:$amt1, i16imm:$amt2),
[(MSP430callseq_end timm:$amt1, timm:$amt2)]>;
}
+let Defs = [SR], Uses = [SP] in {
+def ADDframe : Pseudo<(outs GR16:$dst), (ins i16imm:$base, i16imm:$offset),
+ "# ADDframe PSEUDO", []>;
+}
+
let usesCustomInserter = 1 in {
let Uses = [SR] in {
def Select8 : Pseudo<(outs GR8:$dst), (ins GR8:$src, GR8:$src2, i8imm:$cc),
diff --git a/lib/Target/MSP430/MSP430RegisterInfo.cpp b/lib/Target/MSP430/MSP430RegisterInfo.cpp
index 9600bc28f100..7a3b7a8bd5ff 100644
--- a/lib/Target/MSP430/MSP430RegisterInfo.cpp
+++ b/lib/Target/MSP430/MSP430RegisterInfo.cpp
@@ -127,7 +127,7 @@ MSP430RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
// Fold imm into offset
Offset += MI.getOperand(FIOperandNum + 1).getImm();
- if (MI.getOpcode() == MSP430::ADD16ri) {
+ if (MI.getOpcode() == MSP430::ADDframe) {
// This is actually "load effective address" of the stack slot
// instruction. We have only two-address instructions, thus we need to
// expand it into mov + add
diff --git a/lib/Target/MSP430/MSP430Subtarget.cpp b/lib/Target/MSP430/MSP430Subtarget.cpp
index 6216348e4d71..776a9dcb11d4 100644
--- a/lib/Target/MSP430/MSP430Subtarget.cpp
+++ b/lib/Target/MSP430/MSP430Subtarget.cpp
@@ -19,6 +19,20 @@ using namespace llvm;
#define DEBUG_TYPE "msp430-subtarget"
+static cl::opt<MSP430Subtarget::HWMultEnum>
+HWMultModeOption("mhwmult", cl::Hidden,
+ cl::desc("Hardware multiplier use mode for MSP430"),
+ cl::init(MSP430Subtarget::NoHWMult),
+ cl::values(
+ clEnumValN(MSP430Subtarget::NoHWMult, "none",
+ "Do not use hardware multiplier"),
+ clEnumValN(MSP430Subtarget::HWMult16, "16bit",
+ "Use 16-bit hardware multiplier"),
+ clEnumValN(MSP430Subtarget::HWMult32, "32bit",
+ "Use 32-bit hardware multiplier"),
+ clEnumValN(MSP430Subtarget::HWMultF5, "f5series",
+ "Use F5 series hardware multiplier")));
+
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#include "MSP430GenSubtargetInfo.inc"
@@ -27,7 +41,18 @@ void MSP430Subtarget::anchor() { }
MSP430Subtarget &
MSP430Subtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS) {
- ParseSubtargetFeatures("generic", FS);
+ ExtendedInsts = false;
+ HWMultMode = NoHWMult;
+
+ std::string CPUName = CPU;
+ if (CPUName.empty())
+ CPUName = "msp430";
+
+ ParseSubtargetFeatures(CPUName, FS);
+
+ if (HWMultModeOption != NoHWMult)
+ HWMultMode = HWMultModeOption;
+
return *this;
}
diff --git a/lib/Target/MSP430/MSP430Subtarget.h b/lib/Target/MSP430/MSP430Subtarget.h
index 1a00d85e01cb..8828dfd65878 100644
--- a/lib/Target/MSP430/MSP430Subtarget.h
+++ b/lib/Target/MSP430/MSP430Subtarget.h
@@ -30,8 +30,15 @@ namespace llvm {
class StringRef;
class MSP430Subtarget : public MSP430GenSubtargetInfo {
+public:
+ enum HWMultEnum {
+ NoHWMult, HWMult16, HWMult32, HWMultF5
+ };
+
+private:
virtual void anchor();
bool ExtendedInsts;
+ HWMultEnum HWMultMode;
MSP430FrameLowering FrameLowering;
MSP430InstrInfo InstrInfo;
MSP430TargetLowering TLInfo;
@@ -50,6 +57,10 @@ public:
/// subtarget options. Definition of function is auto generated by tblgen.
void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+ bool hasHWMult16() const { return HWMultMode == HWMult16; }
+ bool hasHWMult32() const { return HWMultMode == HWMult32; }
+ bool hasHWMultF5() const { return HWMultMode == HWMultF5; }
+
const TargetFrameLowering *getFrameLowering() const override {
return &FrameLowering;
}
diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp
index 3641a70d61b5..8fe4e75f3e18 100644
--- a/lib/Target/Mips/MipsISelLowering.cpp
+++ b/lib/Target/Mips/MipsISelLowering.cpp
@@ -813,28 +813,28 @@ static SDValue performORCombine(SDNode *N, SelectionDAG &DAG,
!isShiftedMask(CN->getZExtValue(), SMPos1, SMSize1))
return SDValue();
- // The shift masks must have the same position and size.
- if (SMPos0 != SMPos1 || SMSize0 != SMSize1)
- return SDValue();
+ // The shift masks must have the same position and size.
+ if (SMPos0 != SMPos1 || SMSize0 != SMSize1)
+ return SDValue();
- SDValue Shl = And1.getOperand(0);
+ SDValue Shl = And1.getOperand(0);
- if (!(CN = dyn_cast<ConstantSDNode>(Shl.getOperand(1))))
- return SDValue();
+ if (!(CN = dyn_cast<ConstantSDNode>(Shl.getOperand(1))))
+ return SDValue();
- unsigned Shamt = CN->getZExtValue();
+ unsigned Shamt = CN->getZExtValue();
- // Return if the shift amount and the first bit position of mask are not the
- // same.
- EVT ValTy = N->getValueType(0);
- if ((Shamt != SMPos0) || (SMPos0 + SMSize0 > ValTy.getSizeInBits()))
- return SDValue();
+ // Return if the shift amount and the first bit position of mask are not the
+ // same.
+ EVT ValTy = N->getValueType(0);
+ if ((Shamt != SMPos0) || (SMPos0 + SMSize0 > ValTy.getSizeInBits()))
+ return SDValue();
- SDLoc DL(N);
- return DAG.getNode(MipsISD::Ins, DL, ValTy, Shl.getOperand(0),
- DAG.getConstant(SMPos0, DL, MVT::i32),
- DAG.getConstant(SMSize0, DL, MVT::i32),
- And0.getOperand(0));
+ SDLoc DL(N);
+ return DAG.getNode(MipsISD::Ins, DL, ValTy, Shl.getOperand(0),
+ DAG.getConstant(SMPos0, DL, MVT::i32),
+ DAG.getConstant(SMSize0, DL, MVT::i32),
+ And0.getOperand(0));
} else {
// Pattern match DINS.
// $dst = or (and $src, mask0), mask1
diff --git a/lib/Target/Mips/MipsSubtarget.cpp b/lib/Target/Mips/MipsSubtarget.cpp
index 8f5ecadecdea..1f4e933db2a2 100644
--- a/lib/Target/Mips/MipsSubtarget.cpp
+++ b/lib/Target/Mips/MipsSubtarget.cpp
@@ -59,9 +59,8 @@ static cl::opt<bool>
void MipsSubtarget::anchor() { }
-MipsSubtarget::MipsSubtarget(const Triple &TT, const std::string &CPU,
- const std::string &FS, bool little,
- const MipsTargetMachine &TM)
+MipsSubtarget::MipsSubtarget(const Triple &TT, StringRef CPU, StringRef FS,
+ bool little, const MipsTargetMachine &TM)
: MipsGenSubtargetInfo(TT, CPU, FS), MipsArchVersion(MipsDefault),
IsLittle(little), IsSoftFloat(false), IsSingleFloat(false), IsFPXX(false),
NoABICalls(false), IsFP64bit(false), UseOddSPReg(true),
@@ -77,8 +76,6 @@ MipsSubtarget::MipsSubtarget(const Triple &TT, const std::string &CPU,
FrameLowering(MipsFrameLowering::create(*this)),
TLInfo(MipsTargetLowering::create(TM, *this)) {
- PreviousInMips16Mode = InMips16Mode;
-
if (MipsArchVersion == MipsDefault)
MipsArchVersion = Mips32;
diff --git a/lib/Target/Mips/MipsSubtarget.h b/lib/Target/Mips/MipsSubtarget.h
index cca2cb8a4660..b4d15ee361ff 100644
--- a/lib/Target/Mips/MipsSubtarget.h
+++ b/lib/Target/Mips/MipsSubtarget.h
@@ -119,9 +119,6 @@ class MipsSubtarget : public MipsGenSubtargetInfo {
// Mips16 hard float
bool InMips16HardFloat;
- // PreviousInMips16 -- the function we just processed was in Mips 16 Mode
- bool PreviousInMips16Mode;
-
// InMicroMips -- can process MicroMips instructions
bool InMicroMipsMode;
@@ -178,8 +175,8 @@ public:
/// This constructor initializes the data members to match that
/// of the specified triple.
- MipsSubtarget(const Triple &TT, const std::string &CPU, const std::string &FS,
- bool little, const MipsTargetMachine &TM);
+ MipsSubtarget(const Triple &TT, StringRef CPU, StringRef FS, bool little,
+ const MipsTargetMachine &TM);
/// ParseSubtargetFeatures - Parses features string setting specified
/// subtarget options. Definition of function is auto generated by tblgen.
diff --git a/lib/Target/Nios2/CMakeLists.txt b/lib/Target/Nios2/CMakeLists.txt
new file mode 100644
index 000000000000..78db452094bd
--- /dev/null
+++ b/lib/Target/Nios2/CMakeLists.txt
@@ -0,0 +1,18 @@
+set(LLVM_TARGET_DEFINITIONS Nios2.td)
+
+#Generate Nios2GenRegisterInfo.inc and Nios2GenInstrInfo.inc which included by
+#your hand code C++ files.
+#Nios2GenRegisterInfo.inc came from Nios2RegisterInfo.td, Nios2GenInstrInfo.inc
+#came from Nios2InstrInfo.td.
+tablegen(LLVM Nios2GenRegisterInfo.inc -gen-register-info)
+tablegen(LLVM Nios2GenInstrInfo.inc -gen-instr-info)
+
+#Nios2CommonTableGen must be defined
+add_public_tablegen_target(Nios2CommonTableGen)
+
+#Nios2CodeGen should match with LLVMBuild.txt Nios2CodeGen
+add_llvm_target(Nios2CodeGen Nios2TargetMachine.cpp)
+
+#Should match with "subdirectories = MCTargetDesc TargetInfo" in LLVMBuild.txt
+add_subdirectory(TargetInfo)
+add_subdirectory(MCTargetDesc)
diff --git a/lib/Target/Nios2/LLVMBuild.txt b/lib/Target/Nios2/LLVMBuild.txt
new file mode 100644
index 000000000000..b40a76379706
--- /dev/null
+++ b/lib/Target/Nios2/LLVMBuild.txt
@@ -0,0 +1,61 @@
+;===- ./lib/Target/Nios2/LLVMBuild.txt -------------------------*- Conf -*--===;
+;
+; The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+; http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+#Following comments extracted from http: // llvm.org/docs/LLVMBuild.html
+
+[common]
+subdirectories =
+ MCTargetDesc
+ TargetInfo
+
+[component_0]
+#TargetGroup components are an extension of LibraryGroups, specifically for
+#defining LLVM targets(which are handled specially in a few places).
+type = TargetGroup
+#The name of the component should always be the name of the target.(should
+#match "def Nios2 : Target" in Nios2.td)
+name = Nios2
+#Nios2 component is located in directory Target /
+parent = Target
+#Whether this target defines an assembly parser, assembly printer, disassembler
+#, and supports JIT compilation.They are optional.
+
+[component_1]
+#component_1 is a Library type and name is Nios2CodeGen.After build it will
+#in lib / libLLVMNios2CodeGen.a of your build command directory.
+type = Library
+name = Nios2CodeGen
+#Nios2CodeGen component(Library) is located in directory Nios2 /
+parent = Nios2
+#If given, a list of the names of Library or LibraryGroup components which
+#must also be linked in whenever this library is used.That is, the link time
+#dependencies for this component.When tools are built, the build system will
+#include the transitive closure of all required_libraries for the components
+#the tool needs.
+required_libraries = CodeGen
+ Core
+ GlobalISel
+ MC
+ Nios2Desc
+ Nios2Info
+ Support
+ Target
+#end of required_libraries
+
+#All LLVMBuild.txt in Target / Nios2 and subdirectory use 'add_to_library_groups
+#= Nios2'
+add_to_library_groups = Nios2
diff --git a/lib/Target/Nios2/MCTargetDesc/CMakeLists.txt b/lib/Target/Nios2/MCTargetDesc/CMakeLists.txt
new file mode 100644
index 000000000000..21def509a232
--- /dev/null
+++ b/lib/Target/Nios2/MCTargetDesc/CMakeLists.txt
@@ -0,0 +1,2 @@
+#MCTargetDesc / CMakeLists.txt
+add_llvm_library(LLVMNios2Desc Nios2MCTargetDesc.cpp)
diff --git a/lib/Target/Nios2/MCTargetDesc/LLVMBuild.txt b/lib/Target/Nios2/MCTargetDesc/LLVMBuild.txt
new file mode 100644
index 000000000000..4dc6995e7f5c
--- /dev/null
+++ b/lib/Target/Nios2/MCTargetDesc/LLVMBuild.txt
@@ -0,0 +1,25 @@
+;===- ./lib/Target/Nios2/MCTargetDesc/LLVMBuild.txt ------------*- Conf -*--===;
+;
+; The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+; http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Library
+name = Nios2Desc
+parent = Nios2
+required_libraries = MC
+ Nios2Info
+ Support
+add_to_library_groups = Nios2
diff --git a/lib/Target/Nios2/MCTargetDesc/Nios2MCTargetDesc.cpp b/lib/Target/Nios2/MCTargetDesc/Nios2MCTargetDesc.cpp
new file mode 100644
index 000000000000..d913166399c6
--- /dev/null
+++ b/lib/Target/Nios2/MCTargetDesc/Nios2MCTargetDesc.cpp
@@ -0,0 +1,25 @@
+//===-- Nios2MCTargetDesc.cpp - Nios2 Target Descriptions -----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides Nios2 specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Nios2MCTargetDesc.h"
+#include "llvm/MC/MCInstrInfo.h"
+
+using namespace llvm;
+
+#define GET_INSTRINFO_MC_DESC
+#include "Nios2GenInstrInfo.inc"
+
+#define GET_REGINFO_MC_DESC
+#include "Nios2GenRegisterInfo.inc"
+
+extern "C" void LLVMInitializeNios2TargetMC() {}
diff --git a/lib/Target/Nios2/MCTargetDesc/Nios2MCTargetDesc.h b/lib/Target/Nios2/MCTargetDesc/Nios2MCTargetDesc.h
new file mode 100644
index 000000000000..d426062db168
--- /dev/null
+++ b/lib/Target/Nios2/MCTargetDesc/Nios2MCTargetDesc.h
@@ -0,0 +1,34 @@
+//===-- Nios2MCTargetDesc.h - Nios2 Target Descriptions ---------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides Nios2 specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_NIOS2_MCTARGETDESC_NIOS2MCTARGETDESC_H
+#define LLVM_LIB_TARGET_NIOS2_MCTARGETDESC_NIOS2MCTARGETDESC_H
+
+namespace llvm {
+class Target;
+class Triple;
+
+Target &getTheNios2Target();
+
+} // namespace llvm
+
+// Defines symbolic names for Nios2 registers. This defines a mapping from
+// register name to register number.
+#define GET_REGINFO_ENUM
+#include "Nios2GenRegisterInfo.inc"
+
+// Defines symbolic names for the Nios2 instructions.
+#define GET_INSTRINFO_ENUM
+#include "Nios2GenInstrInfo.inc"
+
+#endif
diff --git a/lib/Target/Nios2/Nios2.h b/lib/Target/Nios2/Nios2.h
new file mode 100644
index 000000000000..87202f48cfbe
--- /dev/null
+++ b/lib/Target/Nios2/Nios2.h
@@ -0,0 +1,25 @@
+//===-- Nios2.h - Top-level interface for Nios2 representation --*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in
+// the LLVM Nios2 back-end.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_NIOS2_NIOS2_H
+#define LLVM_LIB_TARGET_NIOS2_NIOS2_H
+
+#include "MCTargetDesc/Nios2MCTargetDesc.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+class Nios2TargetMachine;
+} // namespace llvm
+
+#endif
diff --git a/lib/Target/Nios2/Nios2.td b/lib/Target/Nios2/Nios2.td
new file mode 100644
index 000000000000..e8abba863370
--- /dev/null
+++ b/lib/Target/Nios2/Nios2.td
@@ -0,0 +1,29 @@
+//===-- Nios2.td - Describe the Nios2 Target Machine -------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Target-independent interfaces
+//===----------------------------------------------------------------------===//
+
+include "llvm/Target/Target.td"
+
+//===----------------------------------------------------------------------===//
+// Target-dependent interfaces
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Calling Conv, Instruction Descriptions
+//===----------------------------------------------------------------------===//
+
+include "Nios2RegisterInfo.td"
+include "Nios2InstrInfo.td"
+
+def Nios2InstrInfo : InstrInfo;
+
+def Nios2 : Target { let InstructionSet = Nios2InstrInfo; }
diff --git a/lib/Target/Nios2/Nios2InstrFormats.td b/lib/Target/Nios2/Nios2InstrFormats.td
new file mode 100644
index 000000000000..79868be48a48
--- /dev/null
+++ b/lib/Target/Nios2/Nios2InstrFormats.td
@@ -0,0 +1,117 @@
+//===-- Nios2InstrFormats.td - Nios2 Instruction Formats ---*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Describe NIOS2 instructions format
+//
+//
+//===----------------------------------------------------------------------===//
+
+// Format specifies the encoding used by the instruction. This is part of the
+// ad-hoc solution used to emit machine instruction encodings by our machine
+// code emitter.
+class Format<bits<3> val> {
+ bits<3> Value = val;
+}
+
+def Pseudo : Format<0>;
+def FrmI : Format<1>;
+def FrmR : Format<2>;
+def FrmJ : Format<3>;
+def FrmOther : Format<4>; // Instruction w/ a custom format
+
+// Generic Nios2 Format
+class Nios2Inst<dag outs, dag ins, string asmstr, list<dag> pattern, Format f>
+ : Instruction {
+ field bits<32> Inst;
+ Format Form = f;
+
+ let Namespace = "Nios2";
+
+ let Size = 4;
+
+ bits<6> Opcode = 0;
+
+ // Bottom 6 bits are the 'opcode' field
+ let Inst{5 - 0} = Opcode;
+
+ let OutOperandList = outs;
+ let InOperandList = ins;
+
+ let AsmString = asmstr;
+ let Pattern = pattern;
+
+ //
+ // Attributes specific to Nios2 instructions:
+ //
+ bits<3> FormBits = Form.Value;
+
+ // TSFlags layout should be kept in sync with Nios2InstrInfo.h.
+ let TSFlags{2 - 0} = FormBits;
+
+ let DecoderNamespace = "Nios2";
+}
+
+// Nios2 Instruction Format
+class InstSE<dag outs, dag ins, string asmstr, list<dag> pattern, Format f>
+ : Nios2Inst<outs, ins, asmstr, pattern, f> {
+}
+
+//===----------------------------------------------------------------------===//
+// Format I instruction class in Nios2 : <|A|B|immediate|opcode|>
+//===----------------------------------------------------------------------===//
+
+class FI<bits<6> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSE<outs, ins, asmstr, pattern, FrmI> {
+ bits<5> rA;
+ bits<5> rB;
+ bits<16> imm;
+
+ let Opcode = op;
+
+ let Inst{31 - 27} = rA;
+ let Inst{26 - 22} = rB;
+ let Inst{21 - 6} = imm;
+}
+
+//===----------------------------------------------------------------------===//
+// Format R instruction : <|A|B|C|opx|imm|opcode|>
+//===----------------------------------------------------------------------===//
+
+class FR<bits<6> opx, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSE<outs, ins, asmstr, pattern, FrmR> {
+ bits<5> rA;
+ bits<5> rB;
+ bits<5> rC;
+ bits<5> imm = 0;
+
+ // opcode is always 0x3a for R instr.
+ let Opcode = 0x3a;
+
+ let Inst{31 - 27} = rA;
+ let Inst{26 - 22} = rB;
+ let Inst{21 - 17} = rC;
+ // opx stands for opcode extension
+ let Inst{16 - 11} = opx;
+ // optional 5-bit immediate value
+ let Inst{10 - 6} = imm;
+}
+
+//===----------------------------------------------------------------------===//
+// Format J instruction class in Nios2 : <|address|opcode|>
+//===----------------------------------------------------------------------===//
+
+class FJ<bits<6> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSE<outs, ins, asmstr, pattern, FrmJ> {
+ bits<26> addr;
+
+ let Opcode = op;
+
+ let Inst{31 - 6} = addr;
+}
diff --git a/lib/Target/Nios2/Nios2InstrInfo.td b/lib/Target/Nios2/Nios2InstrInfo.td
new file mode 100644
index 000000000000..5e4815ab3e16
--- /dev/null
+++ b/lib/Target/Nios2/Nios2InstrInfo.td
@@ -0,0 +1,50 @@
+//===- Nios2InstrInfo.td - Target Description for Nios2 ------*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Nios2 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Instruction format superclass
+//===----------------------------------------------------------------------===//
+
+include "Nios2InstrFormats.td"
+
+//===----------------------------------------------------------------------===//
+// Nios2 Operand, Complex Patterns and Transformations Definitions.
+//===----------------------------------------------------------------------===//
+
+def simm16 : Operand<i32> {
+ let DecoderMethod= "DecodeSimm16";
+}
+
+// Node immediate fits as 16-bit sign extended on target immediate.
+// e.g. addi, andi
+def immSExt16 : PatLeaf<(imm), [{ return isInt<16>(N->getSExtValue()); }]>;
+
+//===----------------------------------------------------------------------===//
+// Instructions specific format
+//===----------------------------------------------------------------------===//
+
+// Arithmetic and logical instructions with 2 register operands.
+class ArithLogicI<bits<6> op, string instr_asm, SDNode OpNode,
+ Operand Od, PatLeaf imm_type, RegisterClass RC> :
+ FI<op, (outs RC:$rB), (ins RC:$rA, Od:$imm16),
+ !strconcat(instr_asm, "\t$rB, $rA, $imm16"),
+ [(set RC:$rB, (OpNode RC:$rA, imm_type:$imm16))]> {
+ let isReMaterializable = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// Nios2 R1 Instructions
+//===----------------------------------------------------------------------===//
+
+/// Arithmetic Instructions (ALU Immediate)
+def ADDi : ArithLogicI<0x04, "addi", add, simm16, immSExt16, CPURegs>;
diff --git a/lib/Target/Nios2/Nios2RegisterInfo.td b/lib/Target/Nios2/Nios2RegisterInfo.td
new file mode 100644
index 000000000000..1808815816f3
--- /dev/null
+++ b/lib/Target/Nios2/Nios2RegisterInfo.td
@@ -0,0 +1,60 @@
+//===-- Nios2RegisterInfo.td - Nios2 Register defs ---------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+// We have bank of 32 registers.
+class Nios2Reg<string n> : Register<n> {
+ field bits<5> Num;
+ let Namespace = "Nios2";
+}
+
+// Nios2 CPU Registers
+class Nios2GPRReg<bits<5> num, string n> : Nios2Reg<n> {
+ let Num = num;
+}
+
+//===----------------------------------------------------------------------===//
+// Registers
+//===----------------------------------------------------------------------===//
+
+let Namespace = "Nios2" in {
+ // General Purpose Registers
+ def ZERO : Nios2GPRReg<0, "zero">, DwarfRegNum<[ 0 ]>;
+ def AT : Nios2GPRReg<1, "at">, DwarfRegNum<[ 1 ]>;
+ foreach RegNum = 2 - 23 in {
+ def R #RegNum : Nios2GPRReg<RegNum, "r" #RegNum>, DwarfRegNum<[ RegNum ]>;
+ }
+ def ET : Nios2GPRReg<24, "et">, DwarfRegNum<[ 24 ]>;
+ def BT : Nios2GPRReg<25, "bt">, DwarfRegNum<[ 25 ]>;
+ def GP : Nios2GPRReg<26, "gp">, DwarfRegNum<[ 26 ]>;
+ def SP : Nios2GPRReg<27, "sp">, DwarfRegNum<[ 27 ]>;
+ def FP : Nios2GPRReg<28, "fp">, DwarfRegNum<[ 28 ]>;
+ def EA : Nios2GPRReg<29, "ea">, DwarfRegNum<[ 29 ]>;
+ def BA : Nios2GPRReg<30, "ba">, DwarfRegNum<[ 30 ]>;
+ def RA : Nios2GPRReg<31, "ra">, DwarfRegNum<[ 31 ]>;
+ def PC : Nios2Reg<"pc">, DwarfRegNum<[ 32 ]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Register Classes
+//===----------------------------------------------------------------------===//
+
+def CPURegs : RegisterClass<"Nios2", [ i32 ], 32,
+ (add
+ // Reserved
+ ZERO,
+ AT,
+ // Return Values and Arguments
+ (sequence "R%u", 2, 7),
+ // Not preserved across procedure calls
+ // Caller saved
+ (sequence "R%u", 8, 15),
+ // Callee saved
+ (sequence "R%u", 16, 23),
+ // Reserved
+ ET, BT, GP, SP, FP, EA, BA, RA, PC)>;
diff --git a/lib/Target/Nios2/Nios2TargetMachine.cpp b/lib/Target/Nios2/Nios2TargetMachine.cpp
new file mode 100644
index 000000000000..16d4eabcfaf7
--- /dev/null
+++ b/lib/Target/Nios2/Nios2TargetMachine.cpp
@@ -0,0 +1,46 @@
+//===-- Nios2TargetMachine.cpp - Define TargetMachine for Nios2 -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Implements the info about Nios2 target spec.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Nios2TargetMachine.h"
+#include "Nios2.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "nios2"
+
+extern "C" void LLVMInitializeNios2Target() {
+ // Register the target.
+}
+
+static std::string computeDataLayout(const Triple &TT, StringRef CPU,
+ const TargetOptions &Options) {
+ return "e-p:32:32:32-i8:8:32-i16:16:32-n32";
+}
+
+static Reloc::Model getEffectiveRelocModel(CodeModel::Model CM,
+ Optional<Reloc::Model> RM) {
+ if (!RM.hasValue() || CM == CodeModel::JITDefault)
+ return Reloc::Static;
+ return *RM;
+}
+
+Nios2TargetMachine::Nios2TargetMachine(const Target &T, const Triple &TT,
+ StringRef CPU, StringRef FS,
+ const TargetOptions &Options,
+ Optional<Reloc::Model> RM,
+ CodeModel::Model CM,
+ CodeGenOpt::Level OL)
+ : LLVMTargetMachine(T, computeDataLayout(TT, CPU, Options), TT, CPU, FS,
+ Options, getEffectiveRelocModel(CM, RM), CM, OL) {}
+
+Nios2TargetMachine::~Nios2TargetMachine() {}
diff --git a/lib/Target/Nios2/Nios2TargetMachine.h b/lib/Target/Nios2/Nios2TargetMachine.h
new file mode 100644
index 000000000000..7f145c82f32c
--- /dev/null
+++ b/lib/Target/Nios2/Nios2TargetMachine.h
@@ -0,0 +1,30 @@
+//===-- Nios2TargetMachine.h - Define TargetMachine for Nios2 ---*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the Nios2 specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_NIOS2_NIOS2TARGETMACHINE_H
+#define LLVM_LIB_TARGET_NIOS2_NIOS2TARGETMACHINE_H
+
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+class Nios2TargetMachine : public LLVMTargetMachine {
+public:
+ Nios2TargetMachine(const Target &T, const Triple &TT, StringRef CPU,
+ StringRef FS, const TargetOptions &Options,
+ Optional<Reloc::Model> RM, CodeModel::Model CM,
+ CodeGenOpt::Level OL);
+ ~Nios2TargetMachine() override;
+};
+} // namespace llvm
+
+#endif
diff --git a/lib/Target/Nios2/TargetInfo/CMakeLists.txt b/lib/Target/Nios2/TargetInfo/CMakeLists.txt
new file mode 100644
index 000000000000..394d2c2680b7
--- /dev/null
+++ b/lib/Target/Nios2/TargetInfo/CMakeLists.txt
@@ -0,0 +1 @@
+add_llvm_library(LLVMNios2Info Nios2TargetInfo.cpp)
diff --git a/lib/Target/Nios2/TargetInfo/LLVMBuild.txt b/lib/Target/Nios2/TargetInfo/LLVMBuild.txt
new file mode 100644
index 000000000000..558f7501ea6b
--- /dev/null
+++ b/lib/Target/Nios2/TargetInfo/LLVMBuild.txt
@@ -0,0 +1,23 @@
+;===- ./lib/Target/Nios2/TargetInfo/LLVMBuild.txt --------------*- Conf -*--===;
+;
+; The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+; http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Library
+name = Nios2Info
+parent = Nios2
+required_libraries = Support
+add_to_library_groups = Nios2
diff --git a/lib/Target/Nios2/TargetInfo/Nios2TargetInfo.cpp b/lib/Target/Nios2/TargetInfo/Nios2TargetInfo.cpp
new file mode 100644
index 000000000000..e317686140f7
--- /dev/null
+++ b/lib/Target/Nios2/TargetInfo/Nios2TargetInfo.cpp
@@ -0,0 +1,24 @@
+//===-- Nios2TargetInfo.cpp - Nios2 Target Implementation -----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Nios2.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+Target &llvm::getTheNios2Target() {
+ static Target TheNios2Target;
+ return TheNios2Target;
+}
+
+extern "C" void LLVMInitializeNios2TargetInfo() {
+ RegisterTarget<Triple::nios2,
+ /*HasJIT=*/true>
+ X(getTheNios2Target(), "nios2", "Nios2");
+}
diff --git a/lib/Target/PowerPC/PPCExpandISEL.cpp b/lib/Target/PowerPC/PPCExpandISEL.cpp
index ebd414baf1d2..41e3190c3eec 100644
--- a/lib/Target/PowerPC/PPCExpandISEL.cpp
+++ b/lib/Target/PowerPC/PPCExpandISEL.cpp
@@ -339,7 +339,7 @@ void PPCExpandISEL::reorganizeBlockLayout(BlockISELList &BIL,
// Note: Cannot use stepBackward instead since we are using the Reg
// liveness state at the end of MBB (liveOut of MBB) as the liveIn for
// NewSuccessor. Otherwise, will cause cyclic dependence.
- LivePhysRegs LPR(MF->getSubtarget<PPCSubtarget>().getRegisterInfo());
+ LivePhysRegs LPR(*MF->getSubtarget<PPCSubtarget>().getRegisterInfo());
SmallVector<std::pair<unsigned, const MachineOperand *>, 2> Clobbers;
for (MachineInstr &MI : *MBB)
LPR.stepForward(MI, Clobbers);
diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp
index e65b1f1aa0a5..b90a5ee28342 100644
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -1596,9 +1596,8 @@ bool PPC::isSplatShuffleMask(ShuffleVectorSDNode *N, unsigned EltSize) {
return true;
}
-bool PPC::isXXINSERTWMask(ShuffleVectorSDNode *N, unsigned &ShiftElts,
- unsigned &InsertAtByte, bool &Swap, bool IsLE) {
// Check that the mask is shuffling words
+static bool isWordShuffleMask(ShuffleVectorSDNode *N) {
for (unsigned i = 0; i < 4; ++i) {
unsigned B0 = N->getMaskElt(i*4);
unsigned B1 = N->getMaskElt(i*4+1);
@@ -1610,6 +1609,14 @@ bool PPC::isXXINSERTWMask(ShuffleVectorSDNode *N, unsigned &ShiftElts,
return false;
}
+ return true;
+}
+
+bool PPC::isXXINSERTWMask(ShuffleVectorSDNode *N, unsigned &ShiftElts,
+ unsigned &InsertAtByte, bool &Swap, bool IsLE) {
+ if (!isWordShuffleMask(N))
+ return false;
+
// Now we look at mask elements 0,4,8,12
unsigned M0 = N->getMaskElt(0) / 4;
unsigned M1 = N->getMaskElt(4) / 4;
@@ -1680,6 +1687,69 @@ bool PPC::isXXINSERTWMask(ShuffleVectorSDNode *N, unsigned &ShiftElts,
return false;
}
+bool PPC::isXXSLDWIShuffleMask(ShuffleVectorSDNode *N, unsigned &ShiftElts,
+ bool &Swap, bool IsLE) {
+ assert(N->getValueType(0) == MVT::v16i8 && "Shuffle vector expects v16i8");
+ // Ensure each byte index of the word is consecutive.
+ if (!isWordShuffleMask(N))
+ return false;
+
+ // Now we look at mask elements 0,4,8,12, which are the beginning of words.
+ unsigned M0 = N->getMaskElt(0) / 4;
+ unsigned M1 = N->getMaskElt(4) / 4;
+ unsigned M2 = N->getMaskElt(8) / 4;
+ unsigned M3 = N->getMaskElt(12) / 4;
+
+ // If both vector operands for the shuffle are the same vector, the mask will
+ // contain only elements from the first one and the second one will be undef.
+ if (N->getOperand(1).isUndef()) {
+ assert(M0 < 4 && "Indexing into an undef vector?");
+ if (M1 != (M0 + 1) % 4 || M2 != (M1 + 1) % 4 || M3 != (M2 + 1) % 4)
+ return false;
+
+ ShiftElts = IsLE ? (4 - M0) % 4 : M0;
+ Swap = false;
+ return true;
+ }
+
+ // Ensure each word index of the ShuffleVector Mask is consecutive.
+ if (M1 != (M0 + 1) % 8 || M2 != (M1 + 1) % 8 || M3 != (M2 + 1) % 8)
+ return false;
+
+ if (IsLE) {
+ if (M0 == 0 || M0 == 7 || M0 == 6 || M0 == 5) {
+ // Input vectors don't need to be swapped if the leading element
+ // of the result is one of the 3 left elements of the second vector
+ // (or if there is no shift to be done at all).
+ Swap = false;
+ ShiftElts = (8 - M0) % 8;
+ } else if (M0 == 4 || M0 == 3 || M0 == 2 || M0 == 1) {
+ // Input vectors need to be swapped if the leading element
+ // of the result is one of the 3 left elements of the first vector
+ // (or if we're shifting by 4 - thereby simply swapping the vectors).
+ Swap = true;
+ ShiftElts = (4 - M0) % 4;
+ }
+
+ return true;
+ } else { // BE
+ if (M0 == 0 || M0 == 1 || M0 == 2 || M0 == 3) {
+ // Input vectors don't need to be swapped if the leading element
+ // of the result is one of the 4 elements of the first vector.
+ Swap = false;
+ ShiftElts = M0;
+ } else if (M0 == 4 || M0 == 5 || M0 == 6 || M0 == 7) {
+ // Input vectors need to be swapped if the leading element
+ // of the result is one of the 4 elements of the right vector.
+ Swap = true;
+ ShiftElts = M0 - 4;
+ }
+
+ return true;
+ }
+}
+
+
/// getVSPLTImmediate - Return the appropriate VSPLT* immediate to splat the
/// specified isSplatShuffleMask VECTOR_SHUFFLE mask.
unsigned PPC::getVSPLTImmediate(SDNode *N, unsigned EltSize,
@@ -7679,6 +7749,20 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins);
}
+
+ if (Subtarget.hasVSX() &&
+ PPC::isXXSLDWIShuffleMask(SVOp, ShiftElts, Swap, isLittleEndian)) {
+ if (Swap)
+ std::swap(V1, V2);
+ SDValue Conv1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1);
+ SDValue Conv2 =
+ DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V2.isUndef() ? V1 : V2);
+
+ SDValue Shl = DAG.getNode(PPCISD::VECSHL, dl, MVT::v4i32, Conv1, Conv2,
+ DAG.getConstant(ShiftElts, dl, MVT::i32));
+ return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Shl);
+ }
+
if (Subtarget.hasVSX()) {
if (V2.isUndef() && PPC::isSplatShuffleMask(SVOp, 4)) {
int SplatIdx = PPC::getVSPLTImmediate(SVOp, 4, DAG);
@@ -8212,10 +8296,12 @@ SDValue PPCTargetLowering::LowerINTRINSIC_VOID(SDValue Op,
SDLoc DL(Op);
switch (cast<ConstantSDNode>(Op.getOperand(ArgStart))->getZExtValue()) {
case Intrinsic::ppc_cfence: {
+ assert(ArgStart == 1 && "llvm.ppc.cfence must carry a chain argument.");
assert(Subtarget.isPPC64() && "Only 64-bit is supported for now.");
return SDValue(DAG.getMachineNode(PPC::CFENCE8, DL, MVT::Other,
DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64,
- Op.getOperand(ArgStart + 1))),
+ Op.getOperand(ArgStart + 1)),
+ Op.getOperand(0)),
0);
}
default:
diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h
index acb77943b118..2f9eb95f6de6 100644
--- a/lib/Target/PowerPC/PPCISelLowering.h
+++ b/lib/Target/PowerPC/PPCISelLowering.h
@@ -450,7 +450,11 @@ namespace llvm {
/// a VMRGEW or VMRGOW instruction
bool isVMRGEOShuffleMask(ShuffleVectorSDNode *N, bool CheckEven,
unsigned ShuffleKind, SelectionDAG &DAG);
-
+ /// isXXSLDWIShuffleMask - Return true if this is a shuffle mask suitable
+ /// for a XXSLDWI instruction.
+ bool isXXSLDWIShuffleMask(ShuffleVectorSDNode *N, unsigned &ShiftElts,
+ bool &Swap, bool IsLE);
+
/// isVSLDOIShuffleMask - If this is a vsldoi shuffle mask, return the
/// shift amount, otherwise return -1.
int isVSLDOIShuffleMask(SDNode *N, unsigned ShuffleKind,
diff --git a/lib/Target/PowerPC/PPCInstr64Bit.td b/lib/Target/PowerPC/PPCInstr64Bit.td
index a3f894c81a01..165970f9678c 100644
--- a/lib/Target/PowerPC/PPCInstr64Bit.td
+++ b/lib/Target/PowerPC/PPCInstr64Bit.td
@@ -1001,7 +1001,9 @@ def ADDItlsgdL : Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp),
isPPC64;
// LR8 is a true define, while the rest of the Defs are clobbers. X3 is
// explicitly defined when this op is created, so not mentioned here.
-let hasExtraSrcRegAllocReq = 1, hasExtraDefRegAllocReq = 1,
+// This is lowered to BL8_NOP_TLS by the assembly printer, so the size must be
+// correct because the branch select pass is relying on it.
+let hasExtraSrcRegAllocReq = 1, hasExtraDefRegAllocReq = 1, Size = 8,
Defs = [X0,X4,X5,X6,X7,X8,X9,X10,X11,X12,LR8,CTR8,CR0,CR1,CR5,CR6,CR7] in
def GETtlsADDR : Pseudo<(outs g8rc:$rD), (ins g8rc:$reg, tlsgd:$sym),
"#GETtlsADDR",
diff --git a/lib/Target/PowerPC/PPCInstrInfo.cpp b/lib/Target/PowerPC/PPCInstrInfo.cpp
index 46f103141bc1..fd6785e963a6 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.cpp
+++ b/lib/Target/PowerPC/PPCInstrInfo.cpp
@@ -1931,6 +1931,8 @@ bool PPCInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
case PPC::DFSTOREf64: {
assert(Subtarget.hasP9Vector() &&
"Invalid D-Form Pseudo-ops on non-P9 target.");
+ assert(MI.getOperand(2).isReg() && MI.getOperand(1).isImm() &&
+ "D-form op must have register and immediate operands");
unsigned UpperOpcode, LowerOpcode;
switch (MI.getOpcode()) {
case PPC::DFLOADf32:
diff --git a/lib/Target/PowerPC/PPCInstrInfo.td b/lib/Target/PowerPC/PPCInstrInfo.td
index 0766cfe4a987..26b99eced23c 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.td
+++ b/lib/Target/PowerPC/PPCInstrInfo.td
@@ -46,7 +46,7 @@ def SDT_PPCVecSplat : SDTypeProfile<1, 2, [ SDTCisVec<0>,
]>;
def SDT_PPCVecShift : SDTypeProfile<1, 3, [ SDTCisVec<0>,
- SDTCisVec<1>, SDTCisVec<2>, SDTCisInt<3>
+ SDTCisVec<1>, SDTCisVec<2>, SDTCisPtrTy<3>
]>;
def SDT_PPCVecInsert : SDTypeProfile<1, 3, [ SDTCisVec<0>,
diff --git a/lib/Target/PowerPC/PPCInstrVSX.td b/lib/Target/PowerPC/PPCInstrVSX.td
index b98140fedfc0..1589ab03e507 100644
--- a/lib/Target/PowerPC/PPCInstrVSX.td
+++ b/lib/Target/PowerPC/PPCInstrVSX.td
@@ -1066,6 +1066,10 @@ def : Pat<(v4f32 (PPCxxswapd v4f32:$src)), (XXPERMDI $src, $src, 2)>;
def : Pat<(v4i32 (PPCxxswapd v4i32:$src)), (XXPERMDI $src, $src, 2)>;
def : Pat<(v2f64 (PPCswapNoChain v2f64:$src)), (XXPERMDI $src, $src, 2)>;
+// PPCvecshl XT, XA, XA, 2 can be selected to both XXSLDWI XT,XA,XA,2 and
+// XXSWAPD XT,XA (i.e. XXPERMDI XT,XA,XA,2); the latter is more profitable.
+def : Pat<(v4i32 (PPCvecshl v4i32:$src, v4i32:$src, 2)), (XXPERMDI $src, $src, 2)>;
+
// Selects.
def : Pat<(v2f64 (selectcc i1:$lhs, i1:$rhs, v2f64:$tval, v2f64:$fval, SETLT)),
(SELECT_VSRC (CRANDC $lhs, $rhs), $tval, $fval)>;
@@ -2379,8 +2383,7 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
// Load Vector Indexed
def LXVX : X_XT6_RA5_RB5<31, 268, "lxvx" , vsrc,
- [(set v2f64:$XT, (load xoaddr:$src))]>;
-
+ [(set v2f64:$XT, (load xaddr:$src))]>;
// Load Vector (Left-justified) with Length
def LXVL : XX1Form<31, 269, (outs vsrc:$XT), (ins memr:$src, g8rc:$rB),
"lxvl $XT, $src, $rB", IIC_LdStLoad,
@@ -2430,7 +2433,7 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
// Store Vector Indexed
def STXVX : X_XS6_RA5_RB5<31, 396, "stxvx" , vsrc,
- [(store v2f64:$XT, xoaddr:$dst)]>;
+ [(store v2f64:$XT, xaddr:$dst)]>;
// Store Vector (Left-justified) with Length
def STXVL : XX1Form<31, 397, (outs), (ins vsrc:$XT, memr:$dst, g8rc:$rB),
@@ -2498,21 +2501,38 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
(v4f32 (XXINSERTW v4f32:$A, AlignValues.F32_TO_BE_WORD1, 12))>;
} // IsLittleEndian, HasP9Vector
- def : Pat<(v2f64 (load xoaddr:$src)), (LXVX xoaddr:$src)>;
- def : Pat<(v2i64 (load xoaddr:$src)), (LXVX xoaddr:$src)>;
- def : Pat<(v4f32 (load xoaddr:$src)), (LXVX xoaddr:$src)>;
- def : Pat<(v4i32 (load xoaddr:$src)), (LXVX xoaddr:$src)>;
- def : Pat<(v4i32 (int_ppc_vsx_lxvw4x xoaddr:$src)), (LXVX xoaddr:$src)>;
- def : Pat<(v2f64 (int_ppc_vsx_lxvd2x xoaddr:$src)), (LXVX xoaddr:$src)>;
- def : Pat<(store v2f64:$rS, xoaddr:$dst), (STXVX $rS, xoaddr:$dst)>;
- def : Pat<(store v2i64:$rS, xoaddr:$dst), (STXVX $rS, xoaddr:$dst)>;
- def : Pat<(store v4f32:$rS, xoaddr:$dst), (STXVX $rS, xoaddr:$dst)>;
- def : Pat<(store v4i32:$rS, xoaddr:$dst), (STXVX $rS, xoaddr:$dst)>;
- def : Pat<(int_ppc_vsx_stxvw4x v4i32:$rS, xoaddr:$dst),
- (STXVX $rS, xoaddr:$dst)>;
- def : Pat<(int_ppc_vsx_stxvd2x v2f64:$rS, xoaddr:$dst),
- (STXVX $rS, xoaddr:$dst)>;
-
+ // D-Form Load/Store
+ def : Pat<(v4i32 (load iaddr:$src)), (LXV memrix16:$src)>;
+ def : Pat<(v4f32 (load iaddr:$src)), (LXV memrix16:$src)>;
+ def : Pat<(v2i64 (load iaddr:$src)), (LXV memrix16:$src)>;
+ def : Pat<(v2f64 (load iaddr:$src)), (LXV memrix16:$src)>;
+ def : Pat<(v4i32 (int_ppc_vsx_lxvw4x iaddr:$src)), (LXV memrix16:$src)>;
+ def : Pat<(v2f64 (int_ppc_vsx_lxvd2x iaddr:$src)), (LXV memrix16:$src)>;
+
+ def : Pat<(store v4f32:$rS, iaddr:$dst), (STXV $rS, memrix16:$dst)>;
+ def : Pat<(store v4i32:$rS, iaddr:$dst), (STXV $rS, memrix16:$dst)>;
+ def : Pat<(store v2f64:$rS, iaddr:$dst), (STXV $rS, memrix16:$dst)>;
+ def : Pat<(store v2i64:$rS, iaddr:$dst), (STXV $rS, memrix16:$dst)>;
+ def : Pat<(int_ppc_vsx_stxvw4x v4i32:$rS, iaddr:$dst),
+ (STXV $rS, memrix16:$dst)>;
+ def : Pat<(int_ppc_vsx_stxvd2x v2f64:$rS, iaddr:$dst),
+ (STXV $rS, memrix16:$dst)>;
+
+
+ def : Pat<(v2f64 (load xaddr:$src)), (LXVX xaddr:$src)>;
+ def : Pat<(v2i64 (load xaddr:$src)), (LXVX xaddr:$src)>;
+ def : Pat<(v4f32 (load xaddr:$src)), (LXVX xaddr:$src)>;
+ def : Pat<(v4i32 (load xaddr:$src)), (LXVX xaddr:$src)>;
+ def : Pat<(v4i32 (int_ppc_vsx_lxvw4x xaddr:$src)), (LXVX xaddr:$src)>;
+ def : Pat<(v2f64 (int_ppc_vsx_lxvd2x xaddr:$src)), (LXVX xaddr:$src)>;
+ def : Pat<(store v2f64:$rS, xaddr:$dst), (STXVX $rS, xaddr:$dst)>;
+ def : Pat<(store v2i64:$rS, xaddr:$dst), (STXVX $rS, xaddr:$dst)>;
+ def : Pat<(store v4f32:$rS, xaddr:$dst), (STXVX $rS, xaddr:$dst)>;
+ def : Pat<(store v4i32:$rS, xaddr:$dst), (STXVX $rS, xaddr:$dst)>;
+ def : Pat<(int_ppc_vsx_stxvw4x v4i32:$rS, xaddr:$dst),
+ (STXVX $rS, xaddr:$dst)>;
+ def : Pat<(int_ppc_vsx_stxvd2x v2f64:$rS, xaddr:$dst),
+ (STXVX $rS, xaddr:$dst)>;
def : Pat<(v4i32 (scalar_to_vector (i32 (load xoaddr:$src)))),
(v4i32 (LXVWSX xoaddr:$src))>;
def : Pat<(v4f32 (scalar_to_vector (f32 (load xoaddr:$src)))),
@@ -2704,9 +2724,15 @@ def FltToUIntLoad {
def FltToLongLoad {
dag A = (i64 (PPCmfvsr (PPCfctidz (f64 (extloadf32 xoaddr:$A)))));
}
+def FltToLongLoadP9 {
+ dag A = (i64 (PPCmfvsr (PPCfctidz (f64 (extloadf32 iaddr:$A)))));
+}
def FltToULongLoad {
dag A = (i64 (PPCmfvsr (PPCfctiduz (f64 (extloadf32 xoaddr:$A)))));
}
+def FltToULongLoadP9 {
+ dag A = (i64 (PPCmfvsr (PPCfctiduz (f64 (extloadf32 iaddr:$A)))));
+}
def FltToLong {
dag A = (i64 (PPCmfvsr (PPCfctidz (fpextend f32:$A))));
}
@@ -2728,9 +2754,15 @@ def DblToULong {
def DblToIntLoad {
dag A = (i32 (PPCmfvsr (PPCfctiwz (f64 (load xoaddr:$A)))));
}
+def DblToIntLoadP9 {
+ dag A = (i32 (PPCmfvsr (PPCfctiwz (f64 (load iaddr:$A)))));
+}
def DblToUIntLoad {
dag A = (i32 (PPCmfvsr (PPCfctiwuz (f64 (load xoaddr:$A)))));
}
+def DblToUIntLoadP9 {
+ dag A = (i32 (PPCmfvsr (PPCfctiwuz (f64 (load iaddr:$A)))));
+}
def DblToLongLoad {
dag A = (i64 (PPCmfvsr (PPCfctidz (f64 (load xoaddr:$A)))));
}
@@ -2898,17 +2930,17 @@ let AddedComplexity = 400 in {
(v4i32 (XVCVSPSXWS (LXVWSX xoaddr:$A)))>;
def : Pat<(v4i32 (scalar_to_vector FltToUIntLoad.A)),
(v4i32 (XVCVSPUXWS (LXVWSX xoaddr:$A)))>;
- def : Pat<(v4i32 (scalar_to_vector DblToIntLoad.A)),
+ def : Pat<(v4i32 (scalar_to_vector DblToIntLoadP9.A)),
(v4i32 (XXSPLTW (COPY_TO_REGCLASS
(XSCVDPSXWS (DFLOADf64 iaddr:$A)), VSRC), 1))>;
- def : Pat<(v4i32 (scalar_to_vector DblToUIntLoad.A)),
+ def : Pat<(v4i32 (scalar_to_vector DblToUIntLoadP9.A)),
(v4i32 (XXSPLTW (COPY_TO_REGCLASS
(XSCVDPUXWS (DFLOADf64 iaddr:$A)), VSRC), 1))>;
- def : Pat<(v2i64 (scalar_to_vector FltToLongLoad.A)),
+ def : Pat<(v2i64 (scalar_to_vector FltToLongLoadP9.A)),
(v2i64 (XXPERMDIs (XSCVDPSXDS (COPY_TO_REGCLASS
(DFLOADf32 iaddr:$A),
VSFRC)), 0))>;
- def : Pat<(v2i64 (scalar_to_vector FltToULongLoad.A)),
+ def : Pat<(v2i64 (scalar_to_vector FltToULongLoadP9.A)),
(v2i64 (XXPERMDIs (XSCVDPUXDS (COPY_TO_REGCLASS
(DFLOADf32 iaddr:$A),
VSFRC)), 0))>;
diff --git a/lib/Target/SystemZ/SystemZExpandPseudo.cpp b/lib/Target/SystemZ/SystemZExpandPseudo.cpp
index 92ce8089c24f..d02db9a617a3 100644
--- a/lib/Target/SystemZ/SystemZExpandPseudo.cpp
+++ b/lib/Target/SystemZ/SystemZExpandPseudo.cpp
@@ -74,7 +74,7 @@ bool SystemZExpandPseudo::expandLOCRMux(MachineBasicBlock &MBB,
unsigned CCValid = MI.getOperand(3).getImm();
unsigned CCMask = MI.getOperand(4).getImm();
- LivePhysRegs LiveRegs(&TII->getRegisterInfo());
+ LivePhysRegs LiveRegs(TII->getRegisterInfo());
LiveRegs.addLiveOuts(MBB);
for (auto I = std::prev(MBB.end()); I != MBBI; --I)
LiveRegs.stepBackward(*I);
diff --git a/lib/Target/SystemZ/SystemZInstrInfo.cpp b/lib/Target/SystemZ/SystemZInstrInfo.cpp
index a30bf34857b5..b34c181124de 100644
--- a/lib/Target/SystemZ/SystemZInstrInfo.cpp
+++ b/lib/Target/SystemZ/SystemZInstrInfo.cpp
@@ -236,32 +236,30 @@ void SystemZInstrInfo::expandZExtPseudo(MachineInstr &MI, unsigned LowOpcode,
void SystemZInstrInfo::expandLoadStackGuard(MachineInstr *MI) const {
MachineBasicBlock *MBB = MI->getParent();
MachineFunction &MF = *MBB->getParent();
- const unsigned Reg = MI->getOperand(0).getReg();
+ const unsigned Reg64 = MI->getOperand(0).getReg();
+ const unsigned Reg32 = RI.getSubReg(Reg64, SystemZ::subreg_l32);
- // Conveniently, all 4 instructions are cloned from LOAD_STACK_GUARD,
- // so they already have operand 0 set to reg.
+ // EAR can only load the low subregister, so use a shift for %a0 to produce
+ // the GR containing %a0 and %a1.
// ear <reg>, %a0
- MachineInstr *Ear1MI = MF.CloneMachineInstr(MI);
- MBB->insert(MI, Ear1MI);
- Ear1MI->setDesc(get(SystemZ::EAR));
- MachineInstrBuilder(MF, Ear1MI).addReg(SystemZ::A0);
+ BuildMI(*MBB, MI, MI->getDebugLoc(), get(SystemZ::EAR), Reg32)
+ .addReg(SystemZ::A0)
+ .addReg(Reg64, RegState::ImplicitDefine);
// sllg <reg>, <reg>, 32
- MachineInstr *SllgMI = MF.CloneMachineInstr(MI);
- MBB->insert(MI, SllgMI);
- SllgMI->setDesc(get(SystemZ::SLLG));
- MachineInstrBuilder(MF, SllgMI).addReg(Reg).addReg(0).addImm(32);
+ BuildMI(*MBB, MI, MI->getDebugLoc(), get(SystemZ::SLLG), Reg64)
+ .addReg(Reg64)
+ .addReg(0)
+ .addImm(32);
// ear <reg>, %a1
- MachineInstr *Ear2MI = MF.CloneMachineInstr(MI);
- MBB->insert(MI, Ear2MI);
- Ear2MI->setDesc(get(SystemZ::EAR));
- MachineInstrBuilder(MF, Ear2MI).addReg(SystemZ::A1);
+ BuildMI(*MBB, MI, MI->getDebugLoc(), get(SystemZ::EAR), Reg32)
+ .addReg(SystemZ::A1);
// lg <reg>, 40(<reg>)
MI->setDesc(get(SystemZ::LG));
- MachineInstrBuilder(MF, MI).addReg(Reg).addImm(40).addReg(0);
+ MachineInstrBuilder(MF, MI).addReg(Reg64).addImm(40).addReg(0);
}
// Emit a zero-extending move from 32-bit GPR SrcReg to 32-bit GPR
diff --git a/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/lib/Target/SystemZ/SystemZTargetTransformInfo.h
index 3766ed45b8c4..ad597f5c65f0 100644
--- a/lib/Target/SystemZ/SystemZTargetTransformInfo.h
+++ b/lib/Target/SystemZ/SystemZTargetTransformInfo.h
@@ -55,6 +55,7 @@ public:
unsigned getNumberOfRegisters(bool Vector);
unsigned getRegisterBitWidth(bool Vector);
+ bool prefersVectorizedAddressing() { return false; }
bool supportsEfficientVectorElementLoadStore() { return true; }
bool enableInterleavedAccessVectorization() { return true; }
diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp
index 32ab475f1186..e5d3209ec6a9 100644
--- a/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -1316,16 +1316,17 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
while (!Done) {
bool UpdateLocLex = true;
+ AsmToken::TokenKind TK = getLexer().getKind();
// The period in the dot operator (e.g., [ebx].foo.bar) is parsed as an
// identifier. Don't try an parse it as a register.
- if (PrevTK != AsmToken::Error && Tok.getString().startswith("."))
+ if (PrevTK != AsmToken::Error && Tok.getString().startswith(".") &&
+ TK != AsmToken::Identifier)
break;
// If we're parsing an immediate expression, we don't expect a '['.
if (SM.getStopOnLBrac() && getLexer().getKind() == AsmToken::LBrac)
break;
- AsmToken::TokenKind TK = getLexer().getKind();
switch (TK) {
default: {
if (SM.isValidEndState()) {
diff --git a/lib/Target/X86/CMakeLists.txt b/lib/Target/X86/CMakeLists.txt
index fc4adddc149b..7471373334f6 100644
--- a/lib/Target/X86/CMakeLists.txt
+++ b/lib/Target/X86/CMakeLists.txt
@@ -11,6 +11,7 @@ tablegen(LLVM X86GenFastISel.inc -gen-fast-isel)
tablegen(LLVM X86GenCallingConv.inc -gen-callingconv)
tablegen(LLVM X86GenSubtargetInfo.inc -gen-subtarget)
tablegen(LLVM X86GenEVEX2VEXTables.inc -gen-x86-EVEX2VEX-tables)
+tablegen(LLVM X86GenFoldTables.inc -gen-x86-fold-tables)
if(LLVM_BUILD_GLOBAL_ISEL)
tablegen(LLVM X86GenRegisterBank.inc -gen-register-bank)
tablegen(LLVM X86GenGlobalISel.inc -gen-global-isel)
diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td
index 3a421fe77392..fe105298f5c1 100644
--- a/lib/Target/X86/X86.td
+++ b/lib/Target/X86/X86.td
@@ -127,6 +127,9 @@ def FeatureERI : SubtargetFeature<"avx512er", "HasERI", "true",
def FeatureCDI : SubtargetFeature<"avx512cd", "HasCDI", "true",
"Enable AVX-512 Conflict Detection Instructions",
[FeatureAVX512]>;
+def FeatureVPOPCNTDQ : SubtargetFeature<"avx512vpopcntdq", "HasVPOPCNTDQ",
+ "true", "Enable AVX-512 Population Count Instructions",
+ [FeatureAVX512]>;
def FeaturePFI : SubtargetFeature<"avx512pf", "HasPFI", "true",
"Enable AVX-512 PreFetch Instructions",
[FeatureAVX512]>;
diff --git a/lib/Target/X86/X86FloatingPoint.cpp b/lib/Target/X86/X86FloatingPoint.cpp
index a5489b9aa8b7..313920e02c3e 100644
--- a/lib/Target/X86/X86FloatingPoint.cpp
+++ b/lib/Target/X86/X86FloatingPoint.cpp
@@ -1655,8 +1655,8 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &Inst) {
}
void FPS::setKillFlags(MachineBasicBlock &MBB) const {
- const TargetRegisterInfo *TRI =
- MBB.getParent()->getSubtarget().getRegisterInfo();
+ const TargetRegisterInfo &TRI =
+ *MBB.getParent()->getSubtarget().getRegisterInfo();
LivePhysRegs LPR(TRI);
LPR.addLiveOuts(MBB);
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 37b248416e4a..86744b064132 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -1364,6 +1364,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::MUL, MVT::v8i64, Legal);
}
+ if (Subtarget.hasVPOPCNTDQ()) {
+ // VPOPCNTDQ sub-targets extend 128/256 vectors to use the avx512
+ // version of popcntd/q.
+ for (auto VT : {MVT::v16i32, MVT::v8i64, MVT::v8i32, MVT::v4i64,
+ MVT::v4i32, MVT::v2i64})
+ setOperationAction(ISD::CTPOP, VT, Legal);
+ }
+
// Custom lower several nodes.
for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index f9344413bbcf..d8702693884d 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -2693,22 +2693,22 @@ multiclass avx512_load_vl<bits<8> opc, string OpcodeStr,
}
multiclass avx512_store<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
- PatFrag st_frag, PatFrag mstore> {
+ PatFrag st_frag, PatFrag mstore, string Name> {
let hasSideEffects = 0 in {
def rr_REV : AVX512PI<opc, MRMDestReg, (outs _.RC:$dst), (ins _.RC:$src),
OpcodeStr # ".s\t{$src, $dst|$dst, $src}",
- [], _.ExeDomain>, EVEX;
+ [], _.ExeDomain>, EVEX, FoldGenData<Name#rr>;
def rrk_REV : AVX512PI<opc, MRMDestReg, (outs _.RC:$dst),
(ins _.KRCWM:$mask, _.RC:$src),
OpcodeStr # ".s\t{$src, ${dst} {${mask}}|"#
"${dst} {${mask}}, $src}",
- [], _.ExeDomain>, EVEX, EVEX_K;
+ [], _.ExeDomain>, EVEX, EVEX_K, FoldGenData<Name#rrk>;
def rrkz_REV : AVX512PI<opc, MRMDestReg, (outs _.RC:$dst),
(ins _.KRCWM:$mask, _.RC:$src),
OpcodeStr # ".s\t{$src, ${dst} {${mask}} {z}|" #
"${dst} {${mask}} {z}, $src}",
- [], _.ExeDomain>, EVEX, EVEX_KZ;
+ [], _.ExeDomain>, EVEX, EVEX_KZ, FoldGenData<Name#rrkz>;
}
def mr : AVX512PI<opc, MRMDestMem, (outs), (ins _.MemOp:$dst, _.RC:$src),
@@ -2726,80 +2726,92 @@ multiclass avx512_store<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
multiclass avx512_store_vl< bits<8> opc, string OpcodeStr,
- AVX512VLVectorVTInfo _, Predicate prd> {
+ AVX512VLVectorVTInfo _, Predicate prd,
+ string Name> {
let Predicates = [prd] in
defm Z : avx512_store<opc, OpcodeStr, _.info512, store,
- masked_store_unaligned>, EVEX_V512;
+ masked_store_unaligned, Name#Z>, EVEX_V512;
let Predicates = [prd, HasVLX] in {
defm Z256 : avx512_store<opc, OpcodeStr, _.info256, store,
- masked_store_unaligned>, EVEX_V256;
+ masked_store_unaligned, Name#Z256>, EVEX_V256;
defm Z128 : avx512_store<opc, OpcodeStr, _.info128, store,
- masked_store_unaligned>, EVEX_V128;
+ masked_store_unaligned, Name#Z128>, EVEX_V128;
}
}
multiclass avx512_alignedstore_vl<bits<8> opc, string OpcodeStr,
- AVX512VLVectorVTInfo _, Predicate prd> {
+ AVX512VLVectorVTInfo _, Predicate prd,
+ string Name> {
let Predicates = [prd] in
defm Z : avx512_store<opc, OpcodeStr, _.info512, alignedstore512,
- masked_store_aligned512>, EVEX_V512;
+ masked_store_aligned512, Name#Z>, EVEX_V512;
let Predicates = [prd, HasVLX] in {
defm Z256 : avx512_store<opc, OpcodeStr, _.info256, alignedstore256,
- masked_store_aligned256>, EVEX_V256;
+ masked_store_aligned256, Name#Z256>, EVEX_V256;
defm Z128 : avx512_store<opc, OpcodeStr, _.info128, alignedstore,
- masked_store_aligned128>, EVEX_V128;
+ masked_store_aligned128, Name#Z128>, EVEX_V128;
}
}
defm VMOVAPS : avx512_alignedload_vl<0x28, "vmovaps", avx512vl_f32_info,
HasAVX512>,
avx512_alignedstore_vl<0x29, "vmovaps", avx512vl_f32_info,
- HasAVX512>, PS, EVEX_CD8<32, CD8VF>;
+ HasAVX512, "VMOVAPS">,
+ PS, EVEX_CD8<32, CD8VF>;
defm VMOVAPD : avx512_alignedload_vl<0x28, "vmovapd", avx512vl_f64_info,
HasAVX512>,
avx512_alignedstore_vl<0x29, "vmovapd", avx512vl_f64_info,
- HasAVX512>, PD, VEX_W, EVEX_CD8<64, CD8VF>;
+ HasAVX512, "VMOVAPD">,
+ PD, VEX_W, EVEX_CD8<64, CD8VF>;
defm VMOVUPS : avx512_load_vl<0x10, "vmovups", avx512vl_f32_info, HasAVX512,
null_frag>,
- avx512_store_vl<0x11, "vmovups", avx512vl_f32_info, HasAVX512>,
+ avx512_store_vl<0x11, "vmovups", avx512vl_f32_info, HasAVX512,
+ "VMOVUPS">,
PS, EVEX_CD8<32, CD8VF>;
defm VMOVUPD : avx512_load_vl<0x10, "vmovupd", avx512vl_f64_info, HasAVX512,
null_frag>,
- avx512_store_vl<0x11, "vmovupd", avx512vl_f64_info, HasAVX512>,
+ avx512_store_vl<0x11, "vmovupd", avx512vl_f64_info, HasAVX512,
+ "VMOVUPD">,
PD, VEX_W, EVEX_CD8<64, CD8VF>;
defm VMOVDQA32 : avx512_alignedload_vl<0x6F, "vmovdqa32", avx512vl_i32_info,
HasAVX512>,
avx512_alignedstore_vl<0x7F, "vmovdqa32", avx512vl_i32_info,
- HasAVX512>, PD, EVEX_CD8<32, CD8VF>;
+ HasAVX512, "VMOVDQA32">,
+ PD, EVEX_CD8<32, CD8VF>;
defm VMOVDQA64 : avx512_alignedload_vl<0x6F, "vmovdqa64", avx512vl_i64_info,
HasAVX512>,
avx512_alignedstore_vl<0x7F, "vmovdqa64", avx512vl_i64_info,
- HasAVX512>, PD, VEX_W, EVEX_CD8<64, CD8VF>;
+ HasAVX512, "VMOVDQA64">,
+ PD, VEX_W, EVEX_CD8<64, CD8VF>;
defm VMOVDQU8 : avx512_load_vl<0x6F, "vmovdqu8", avx512vl_i8_info, HasBWI>,
- avx512_store_vl<0x7F, "vmovdqu8", avx512vl_i8_info,
- HasBWI>, XD, EVEX_CD8<8, CD8VF>;
+ avx512_store_vl<0x7F, "vmovdqu8", avx512vl_i8_info,
+ HasBWI, "VMOVDQU8">,
+ XD, EVEX_CD8<8, CD8VF>;
defm VMOVDQU16 : avx512_load_vl<0x6F, "vmovdqu16", avx512vl_i16_info, HasBWI>,
avx512_store_vl<0x7F, "vmovdqu16", avx512vl_i16_info,
- HasBWI>, XD, VEX_W, EVEX_CD8<16, CD8VF>;
+ HasBWI, "VMOVDQU16">,
+ XD, VEX_W, EVEX_CD8<16, CD8VF>;
defm VMOVDQU32 : avx512_load_vl<0x6F, "vmovdqu32", avx512vl_i32_info, HasAVX512,
null_frag>,
avx512_store_vl<0x7F, "vmovdqu32", avx512vl_i32_info,
- HasAVX512>, XS, EVEX_CD8<32, CD8VF>;
+ HasAVX512, "VMOVDQU32">,
+ XS, EVEX_CD8<32, CD8VF>;
defm VMOVDQU64 : avx512_load_vl<0x6F, "vmovdqu64", avx512vl_i64_info, HasAVX512,
null_frag>,
avx512_store_vl<0x7F, "vmovdqu64", avx512vl_i64_info,
- HasAVX512>, XS, VEX_W, EVEX_CD8<64, CD8VF>;
+ HasAVX512, "VMOVDQU64">,
+ XS, VEX_W, EVEX_CD8<64, CD8VF>;
// Special instructions to help with spilling when we don't have VLX. We need
// to load or store from a ZMM register instead. These are converted in
@@ -3354,17 +3366,52 @@ def : Pat<(int_x86_avx512_mask_store_ss addr:$dst, VR128X:$src, GR8:$mask),
(VMOVSSZmrk addr:$dst, (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), GR8:$mask, sub_8bit)), VK1WM),
(COPY_TO_REGCLASS VR128X:$src, FR32X))>;
-let hasSideEffects = 0 in
-defm VMOVSSZrr_REV : AVX512_maskable_in_asm<0x11, MRMDestReg, f32x_info,
- (outs VR128X:$dst), (ins VR128X:$src1, FR32X:$src2),
- "vmovss.s", "$src2, $src1", "$src1, $src2", []>,
- XS, EVEX_4V, VEX_LIG;
-
-let hasSideEffects = 0 in
-defm VMOVSDZrr_REV : AVX512_maskable_in_asm<0x11, MRMDestReg, f64x_info,
- (outs VR128X:$dst), (ins VR128X:$src1, FR64X:$src2),
- "vmovsd.s", "$src2, $src1", "$src1, $src2", []>,
- XD, EVEX_4V, VEX_LIG, VEX_W;
+let hasSideEffects = 0 in {
+ def VMOVSSZrr_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst),
+ (ins VR128X:$src1, FR32X:$src2),
+ "vmovss.s\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [], NoItinerary>, XS, EVEX_4V, VEX_LIG,
+ FoldGenData<"VMOVSSZrr">;
+
+let Constraints = "$src0 = $dst" in
+ def VMOVSSZrrk_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst),
+ (ins f32x_info.RC:$src0, f32x_info.KRCWM:$mask,
+ VR128X:$src1, FR32X:$src2),
+ "vmovss.s\t{$src2, $src1, $dst {${mask}}|"#
+ "$dst {${mask}}, $src1, $src2}",
+ [], NoItinerary>, EVEX_K, XS, EVEX_4V, VEX_LIG,
+ FoldGenData<"VMOVSSZrrk">;
+
+ def VMOVSSZrrkz_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst),
+ (ins f32x_info.KRCWM:$mask, VR128X:$src1, FR32X:$src2),
+ "vmovss.s\t{$src2, $src1, $dst {${mask}} {z}|"#
+ "$dst {${mask}} {z}, $src1, $src2}",
+ [], NoItinerary>, EVEX_KZ, XS, EVEX_4V, VEX_LIG,
+ FoldGenData<"VMOVSSZrrkz">;
+
+ def VMOVSDZrr_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst),
+ (ins VR128X:$src1, FR64X:$src2),
+ "vmovsd.s\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [], NoItinerary>, XD, EVEX_4V, VEX_LIG, VEX_W,
+ FoldGenData<"VMOVSDZrr">;
+
+let Constraints = "$src0 = $dst" in
+ def VMOVSDZrrk_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst),
+ (ins f64x_info.RC:$src0, f64x_info.KRCWM:$mask,
+ VR128X:$src1, FR64X:$src2),
+ "vmovsd.s\t{$src2, $src1, $dst {${mask}}|"#
+ "$dst {${mask}}, $src1, $src2}",
+ [], NoItinerary>, EVEX_K, XD, EVEX_4V, VEX_LIG,
+ VEX_W, FoldGenData<"VMOVSDZrrk">;
+
+ def VMOVSDZrrkz_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst),
+ (ins f64x_info.KRCWM:$mask, VR128X:$src1,
+ FR64X:$src2),
+ "vmovsd.s\t{$src2, $src1, $dst {${mask}} {z}|"#
+ "$dst {${mask}} {z}, $src1, $src2}",
+ [], NoItinerary>, EVEX_KZ, XD, EVEX_4V, VEX_LIG,
+ VEX_W, FoldGenData<"VMOVSDZrrkz">;
+}
let Predicates = [HasAVX512] in {
let AddedComplexity = 15 in {
@@ -8649,6 +8696,41 @@ let Predicates = [HasCDI, NoVLX] in {
}
//===---------------------------------------------------------------------===//
+// Counts number of ones - VPOPCNTD and VPOPCNTQ
+//===---------------------------------------------------------------------===//
+
+multiclass avx512_unary_rmb_popcnt<bits<8> opc, string OpcodeStr, X86VectorVTInfo VTInfo> {
+ let Predicates = [HasVPOPCNTDQ] in
+ defm Z : avx512_unary_rmb<opc, OpcodeStr, ctpop, VTInfo>, EVEX_V512;
+}
+
+// Use 512bit version to implement 128/256 bit.
+multiclass avx512_unary_lowering<SDNode OpNode, AVX512VLVectorVTInfo _, Predicate prd> {
+ let Predicates = [prd] in {
+ def Z256_Alt : Pat<(_.info256.VT(OpNode _.info256.RC:$src1)),
+ (EXTRACT_SUBREG
+ (!cast<Instruction>(NAME # "Zrr")
+ (INSERT_SUBREG(_.info512.VT(IMPLICIT_DEF)),
+ _.info256.RC:$src1,
+ _.info256.SubRegIdx)),
+ _.info256.SubRegIdx)>;
+
+ def Z128_Alt : Pat<(_.info128.VT(OpNode _.info128.RC:$src1)),
+ (EXTRACT_SUBREG
+ (!cast<Instruction>(NAME # "Zrr")
+ (INSERT_SUBREG(_.info512.VT(IMPLICIT_DEF)),
+ _.info128.RC:$src1,
+ _.info128.SubRegIdx)),
+ _.info128.SubRegIdx)>;
+ }
+}
+
+defm VPOPCNTD : avx512_unary_rmb_popcnt<0x55, "vpopcntd", v16i32_info>,
+ avx512_unary_lowering<ctpop, avx512vl_i32_info, HasVPOPCNTDQ>;
+defm VPOPCNTQ : avx512_unary_rmb_popcnt<0x55, "vpopcntq", v8i64_info>,
+ avx512_unary_lowering<ctpop, avx512vl_i64_info, HasVPOPCNTDQ>, VEX_W;
+
+//===---------------------------------------------------------------------===//
// Replicate Single FP - MOVSHDUP and MOVSLDUP
//===---------------------------------------------------------------------===//
multiclass avx512_replicate<bits<8> opc, string OpcodeStr, SDNode OpNode>{
@@ -8795,7 +8877,7 @@ multiclass avx512_extract_elt_w<string OpcodeStr, X86VectorVTInfo _> {
def rr_REV : AVX512Ii8<0x15, MRMDestReg, (outs GR32orGR64:$dst),
(ins _.RC:$src1, u8imm:$src2),
OpcodeStr#".s\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
- EVEX, TAPD;
+ EVEX, TAPD, FoldGenData<NAME#rr>;
defm NAME : avx512_extract_elt_bw_m<0x15, OpcodeStr, X86pextrw, _>, TAPD;
}
diff --git a/lib/Target/X86/X86InstrArithmetic.td b/lib/Target/X86/X86InstrArithmetic.td
index 66382014f6e8..e38bbc9b3d36 100644
--- a/lib/Target/X86/X86InstrArithmetic.td
+++ b/lib/Target/X86/X86InstrArithmetic.td
@@ -964,10 +964,10 @@ multiclass ArithBinOp_RF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,
} // isConvertibleToThreeAddress
} // isCommutable
- def NAME#8rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi8>;
- def NAME#16rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi16>;
- def NAME#32rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi32>;
- def NAME#64rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi64>;
+ def NAME#8rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi8>, FoldGenData<NAME#8rr>;
+ def NAME#16rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi16>, FoldGenData<NAME#16rr>;
+ def NAME#32rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi32>, FoldGenData<NAME#32rr>;
+ def NAME#64rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi64>, FoldGenData<NAME#64rr>;
def NAME#8rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi8 , opnodeflag>;
def NAME#16rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi16, opnodeflag>;
@@ -1049,10 +1049,10 @@ multiclass ArithBinOp_RFF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,
} // isConvertibleToThreeAddress
} // isCommutable
- def NAME#8rr_REV : BinOpRR_RFF_Rev<BaseOpc2, mnemonic, Xi8>;
- def NAME#16rr_REV : BinOpRR_RFF_Rev<BaseOpc2, mnemonic, Xi16>;
- def NAME#32rr_REV : BinOpRR_RFF_Rev<BaseOpc2, mnemonic, Xi32>;
- def NAME#64rr_REV : BinOpRR_RFF_Rev<BaseOpc2, mnemonic, Xi64>;
+ def NAME#8rr_REV : BinOpRR_RFF_Rev<BaseOpc2, mnemonic, Xi8>, FoldGenData<NAME#8rr>;
+ def NAME#16rr_REV : BinOpRR_RFF_Rev<BaseOpc2, mnemonic, Xi16>, FoldGenData<NAME#16rr>;
+ def NAME#32rr_REV : BinOpRR_RFF_Rev<BaseOpc2, mnemonic, Xi32>, FoldGenData<NAME#32rr>;
+ def NAME#64rr_REV : BinOpRR_RFF_Rev<BaseOpc2, mnemonic, Xi64>, FoldGenData<NAME#64rr>;
def NAME#8rm : BinOpRM_RFF<BaseOpc2, mnemonic, Xi8 , opnode>;
def NAME#16rm : BinOpRM_RFF<BaseOpc2, mnemonic, Xi16, opnode>;
@@ -1129,10 +1129,10 @@ multiclass ArithBinOp_F<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,
}
} // isCommutable
- def NAME#8rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi8>;
- def NAME#16rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi16>;
- def NAME#32rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi32>;
- def NAME#64rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi64>;
+ def NAME#8rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi8>, FoldGenData<NAME#8rr>;
+ def NAME#16rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi16>, FoldGenData<NAME#16rr>;
+ def NAME#32rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi32>, FoldGenData<NAME#32rr>;
+ def NAME#64rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi64>, FoldGenData<NAME#64rr>;
def NAME#8rm : BinOpRM_F<BaseOpc2, mnemonic, Xi8 , opnode>;
def NAME#16rm : BinOpRM_F<BaseOpc2, mnemonic, Xi16, opnode>;
diff --git a/lib/Target/X86/X86InstrFMA.td b/lib/Target/X86/X86InstrFMA.td
index 1941ae57f0f1..3a3cdc9fa574 100644
--- a/lib/Target/X86/X86InstrFMA.td
+++ b/lib/Target/X86/X86InstrFMA.td
@@ -297,7 +297,7 @@ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
(ins RC:$src1, RC:$src2, RC:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>,
- VEX_LIG;
+ VEX_LIG, FoldGenData<NAME#rr>;
}
multiclass fma4s_int<bits<8> opc, string OpcodeStr, Operand memop,
@@ -321,6 +321,12 @@ let isCodeGenOnly = 1 in {
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR128:$dst,
(Int VR128:$src1, mem_cpat:$src2, VR128:$src3))]>, VEX_LIG;
+let hasSideEffects = 0 in
+ def rr_Int_REV : FMA4<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, VR128:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ []>, VEX_LIG, FoldGenData<NAME#rr_Int>;
} // isCodeGenOnly = 1
}
@@ -372,12 +378,13 @@ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
def rr_REV : FMA4<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, VR128:$src3),
!strconcat(OpcodeStr,
- "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>;
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>,
+ FoldGenData<NAME#rr>;
def Yrr_REV : FMA4<opc, MRMSrcReg, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2, VR256:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>,
- VEX_L;
+ VEX_L, FoldGenData<NAME#Yrr>;
} // isCodeGenOnly = 1
}
diff --git a/lib/Target/X86/X86InstrFormats.td b/lib/Target/X86/X86InstrFormats.td
index c2fe786732dc..bfcbf71d252f 100644
--- a/lib/Target/X86/X86InstrFormats.td
+++ b/lib/Target/X86/X86InstrFormats.td
@@ -225,6 +225,12 @@ class Has3DNow0F0FOpcode { bit has3DNow0F0FOpcode = 1; }
class XOP { Encoding OpEnc = EncXOP; }
class XOP_4V : XOP { bit hasVEX_4V = 1; }
+// Specify the alternative register form instruction to replace the current
+// instruction in case it was picked during generation of memory folding tables
+class FoldGenData<string _RegisterForm> {
+ string FoldGenRegForm = _RegisterForm;
+}
+
class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins,
string AsmStr,
InstrItinClass itin,
@@ -304,6 +310,10 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins,
CD8_EltSize,
!srl(VectSize, CD8_Form{1-0}))), 0);
+ // Used in the memory folding generation (TableGen backend) to point to an alternative
+ // instruction to replace the current one in case it got picked during generation.
+ string FoldGenRegForm = ?;
+
// TSFlags layout should be kept in sync with X86BaseInfo.h.
let TSFlags{6-0} = FormBits;
let TSFlags{8-7} = OpSizeBits;
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index f7083a7448ce..33fbd41bb631 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -121,172 +121,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
(STI.is64Bit() ? X86::RETQ : X86::RETL)),
Subtarget(STI), RI(STI.getTargetTriple()) {
- static const X86MemoryFoldTableEntry MemoryFoldTable2Addr[] = {
- { X86::ADC32ri, X86::ADC32mi, 0 },
- { X86::ADC32ri8, X86::ADC32mi8, 0 },
- { X86::ADC32rr, X86::ADC32mr, 0 },
- { X86::ADC64ri32, X86::ADC64mi32, 0 },
- { X86::ADC64ri8, X86::ADC64mi8, 0 },
- { X86::ADC64rr, X86::ADC64mr, 0 },
- { X86::ADD16ri, X86::ADD16mi, 0 },
- { X86::ADD16ri8, X86::ADD16mi8, 0 },
- { X86::ADD16ri_DB, X86::ADD16mi, TB_NO_REVERSE },
- { X86::ADD16ri8_DB, X86::ADD16mi8, TB_NO_REVERSE },
- { X86::ADD16rr, X86::ADD16mr, 0 },
- { X86::ADD16rr_DB, X86::ADD16mr, TB_NO_REVERSE },
- { X86::ADD32ri, X86::ADD32mi, 0 },
- { X86::ADD32ri8, X86::ADD32mi8, 0 },
- { X86::ADD32ri_DB, X86::ADD32mi, TB_NO_REVERSE },
- { X86::ADD32ri8_DB, X86::ADD32mi8, TB_NO_REVERSE },
- { X86::ADD32rr, X86::ADD32mr, 0 },
- { X86::ADD32rr_DB, X86::ADD32mr, TB_NO_REVERSE },
- { X86::ADD64ri32, X86::ADD64mi32, 0 },
- { X86::ADD64ri8, X86::ADD64mi8, 0 },
- { X86::ADD64ri32_DB,X86::ADD64mi32, TB_NO_REVERSE },
- { X86::ADD64ri8_DB, X86::ADD64mi8, TB_NO_REVERSE },
- { X86::ADD64rr, X86::ADD64mr, 0 },
- { X86::ADD64rr_DB, X86::ADD64mr, TB_NO_REVERSE },
- { X86::ADD8ri, X86::ADD8mi, 0 },
- { X86::ADD8rr, X86::ADD8mr, 0 },
- { X86::AND16ri, X86::AND16mi, 0 },
- { X86::AND16ri8, X86::AND16mi8, 0 },
- { X86::AND16rr, X86::AND16mr, 0 },
- { X86::AND32ri, X86::AND32mi, 0 },
- { X86::AND32ri8, X86::AND32mi8, 0 },
- { X86::AND32rr, X86::AND32mr, 0 },
- { X86::AND64ri32, X86::AND64mi32, 0 },
- { X86::AND64ri8, X86::AND64mi8, 0 },
- { X86::AND64rr, X86::AND64mr, 0 },
- { X86::AND8ri, X86::AND8mi, 0 },
- { X86::AND8rr, X86::AND8mr, 0 },
- { X86::DEC16r, X86::DEC16m, 0 },
- { X86::DEC32r, X86::DEC32m, 0 },
- { X86::DEC64r, X86::DEC64m, 0 },
- { X86::DEC8r, X86::DEC8m, 0 },
- { X86::INC16r, X86::INC16m, 0 },
- { X86::INC32r, X86::INC32m, 0 },
- { X86::INC64r, X86::INC64m, 0 },
- { X86::INC8r, X86::INC8m, 0 },
- { X86::NEG16r, X86::NEG16m, 0 },
- { X86::NEG32r, X86::NEG32m, 0 },
- { X86::NEG64r, X86::NEG64m, 0 },
- { X86::NEG8r, X86::NEG8m, 0 },
- { X86::NOT16r, X86::NOT16m, 0 },
- { X86::NOT32r, X86::NOT32m, 0 },
- { X86::NOT64r, X86::NOT64m, 0 },
- { X86::NOT8r, X86::NOT8m, 0 },
- { X86::OR16ri, X86::OR16mi, 0 },
- { X86::OR16ri8, X86::OR16mi8, 0 },
- { X86::OR16rr, X86::OR16mr, 0 },
- { X86::OR32ri, X86::OR32mi, 0 },
- { X86::OR32ri8, X86::OR32mi8, 0 },
- { X86::OR32rr, X86::OR32mr, 0 },
- { X86::OR64ri32, X86::OR64mi32, 0 },
- { X86::OR64ri8, X86::OR64mi8, 0 },
- { X86::OR64rr, X86::OR64mr, 0 },
- { X86::OR8ri, X86::OR8mi, 0 },
- { X86::OR8rr, X86::OR8mr, 0 },
- { X86::ROL16r1, X86::ROL16m1, 0 },
- { X86::ROL16rCL, X86::ROL16mCL, 0 },
- { X86::ROL16ri, X86::ROL16mi, 0 },
- { X86::ROL32r1, X86::ROL32m1, 0 },
- { X86::ROL32rCL, X86::ROL32mCL, 0 },
- { X86::ROL32ri, X86::ROL32mi, 0 },
- { X86::ROL64r1, X86::ROL64m1, 0 },
- { X86::ROL64rCL, X86::ROL64mCL, 0 },
- { X86::ROL64ri, X86::ROL64mi, 0 },
- { X86::ROL8r1, X86::ROL8m1, 0 },
- { X86::ROL8rCL, X86::ROL8mCL, 0 },
- { X86::ROL8ri, X86::ROL8mi, 0 },
- { X86::ROR16r1, X86::ROR16m1, 0 },
- { X86::ROR16rCL, X86::ROR16mCL, 0 },
- { X86::ROR16ri, X86::ROR16mi, 0 },
- { X86::ROR32r1, X86::ROR32m1, 0 },
- { X86::ROR32rCL, X86::ROR32mCL, 0 },
- { X86::ROR32ri, X86::ROR32mi, 0 },
- { X86::ROR64r1, X86::ROR64m1, 0 },
- { X86::ROR64rCL, X86::ROR64mCL, 0 },
- { X86::ROR64ri, X86::ROR64mi, 0 },
- { X86::ROR8r1, X86::ROR8m1, 0 },
- { X86::ROR8rCL, X86::ROR8mCL, 0 },
- { X86::ROR8ri, X86::ROR8mi, 0 },
- { X86::SAR16r1, X86::SAR16m1, 0 },
- { X86::SAR16rCL, X86::SAR16mCL, 0 },
- { X86::SAR16ri, X86::SAR16mi, 0 },
- { X86::SAR32r1, X86::SAR32m1, 0 },
- { X86::SAR32rCL, X86::SAR32mCL, 0 },
- { X86::SAR32ri, X86::SAR32mi, 0 },
- { X86::SAR64r1, X86::SAR64m1, 0 },
- { X86::SAR64rCL, X86::SAR64mCL, 0 },
- { X86::SAR64ri, X86::SAR64mi, 0 },
- { X86::SAR8r1, X86::SAR8m1, 0 },
- { X86::SAR8rCL, X86::SAR8mCL, 0 },
- { X86::SAR8ri, X86::SAR8mi, 0 },
- { X86::SBB32ri, X86::SBB32mi, 0 },
- { X86::SBB32ri8, X86::SBB32mi8, 0 },
- { X86::SBB32rr, X86::SBB32mr, 0 },
- { X86::SBB64ri32, X86::SBB64mi32, 0 },
- { X86::SBB64ri8, X86::SBB64mi8, 0 },
- { X86::SBB64rr, X86::SBB64mr, 0 },
- { X86::SHL16r1, X86::SHL16m1, 0 },
- { X86::SHL16rCL, X86::SHL16mCL, 0 },
- { X86::SHL16ri, X86::SHL16mi, 0 },
- { X86::SHL32r1, X86::SHL32m1, 0 },
- { X86::SHL32rCL, X86::SHL32mCL, 0 },
- { X86::SHL32ri, X86::SHL32mi, 0 },
- { X86::SHL64r1, X86::SHL64m1, 0 },
- { X86::SHL64rCL, X86::SHL64mCL, 0 },
- { X86::SHL64ri, X86::SHL64mi, 0 },
- { X86::SHL8r1, X86::SHL8m1, 0 },
- { X86::SHL8rCL, X86::SHL8mCL, 0 },
- { X86::SHL8ri, X86::SHL8mi, 0 },
- { X86::SHLD16rrCL, X86::SHLD16mrCL, 0 },
- { X86::SHLD16rri8, X86::SHLD16mri8, 0 },
- { X86::SHLD32rrCL, X86::SHLD32mrCL, 0 },
- { X86::SHLD32rri8, X86::SHLD32mri8, 0 },
- { X86::SHLD64rrCL, X86::SHLD64mrCL, 0 },
- { X86::SHLD64rri8, X86::SHLD64mri8, 0 },
- { X86::SHR16r1, X86::SHR16m1, 0 },
- { X86::SHR16rCL, X86::SHR16mCL, 0 },
- { X86::SHR16ri, X86::SHR16mi, 0 },
- { X86::SHR32r1, X86::SHR32m1, 0 },
- { X86::SHR32rCL, X86::SHR32mCL, 0 },
- { X86::SHR32ri, X86::SHR32mi, 0 },
- { X86::SHR64r1, X86::SHR64m1, 0 },
- { X86::SHR64rCL, X86::SHR64mCL, 0 },
- { X86::SHR64ri, X86::SHR64mi, 0 },
- { X86::SHR8r1, X86::SHR8m1, 0 },
- { X86::SHR8rCL, X86::SHR8mCL, 0 },
- { X86::SHR8ri, X86::SHR8mi, 0 },
- { X86::SHRD16rrCL, X86::SHRD16mrCL, 0 },
- { X86::SHRD16rri8, X86::SHRD16mri8, 0 },
- { X86::SHRD32rrCL, X86::SHRD32mrCL, 0 },
- { X86::SHRD32rri8, X86::SHRD32mri8, 0 },
- { X86::SHRD64rrCL, X86::SHRD64mrCL, 0 },
- { X86::SHRD64rri8, X86::SHRD64mri8, 0 },
- { X86::SUB16ri, X86::SUB16mi, 0 },
- { X86::SUB16ri8, X86::SUB16mi8, 0 },
- { X86::SUB16rr, X86::SUB16mr, 0 },
- { X86::SUB32ri, X86::SUB32mi, 0 },
- { X86::SUB32ri8, X86::SUB32mi8, 0 },
- { X86::SUB32rr, X86::SUB32mr, 0 },
- { X86::SUB64ri32, X86::SUB64mi32, 0 },
- { X86::SUB64ri8, X86::SUB64mi8, 0 },
- { X86::SUB64rr, X86::SUB64mr, 0 },
- { X86::SUB8ri, X86::SUB8mi, 0 },
- { X86::SUB8rr, X86::SUB8mr, 0 },
- { X86::XOR16ri, X86::XOR16mi, 0 },
- { X86::XOR16ri8, X86::XOR16mi8, 0 },
- { X86::XOR16rr, X86::XOR16mr, 0 },
- { X86::XOR32ri, X86::XOR32mi, 0 },
- { X86::XOR32ri8, X86::XOR32mi8, 0 },
- { X86::XOR32rr, X86::XOR32mr, 0 },
- { X86::XOR64ri32, X86::XOR64mi32, 0 },
- { X86::XOR64ri8, X86::XOR64mi8, 0 },
- { X86::XOR64rr, X86::XOR64mr, 0 },
- { X86::XOR8ri, X86::XOR8mi, 0 },
- { X86::XOR8rr, X86::XOR8mr, 0 }
- };
+// Generated memory folding tables.
+#include "X86GenFoldTables.inc"
for (X86MemoryFoldTableEntry Entry : MemoryFoldTable2Addr) {
AddTableEntry(RegOp2MemOpTable2Addr, MemOp2RegOpTable,
@@ -295,744 +131,11 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
Entry.Flags | TB_INDEX_0 | TB_FOLDED_LOAD | TB_FOLDED_STORE);
}
- static const X86MemoryFoldTableEntry MemoryFoldTable0[] = {
- { X86::BT16ri8, X86::BT16mi8, TB_FOLDED_LOAD },
- { X86::BT32ri8, X86::BT32mi8, TB_FOLDED_LOAD },
- { X86::BT64ri8, X86::BT64mi8, TB_FOLDED_LOAD },
- { X86::CALL32r, X86::CALL32m, TB_FOLDED_LOAD },
- { X86::CALL64r, X86::CALL64m, TB_FOLDED_LOAD },
- { X86::CMP16ri, X86::CMP16mi, TB_FOLDED_LOAD },
- { X86::CMP16ri8, X86::CMP16mi8, TB_FOLDED_LOAD },
- { X86::CMP16rr, X86::CMP16mr, TB_FOLDED_LOAD },
- { X86::CMP32ri, X86::CMP32mi, TB_FOLDED_LOAD },
- { X86::CMP32ri8, X86::CMP32mi8, TB_FOLDED_LOAD },
- { X86::CMP32rr, X86::CMP32mr, TB_FOLDED_LOAD },
- { X86::CMP64ri32, X86::CMP64mi32, TB_FOLDED_LOAD },
- { X86::CMP64ri8, X86::CMP64mi8, TB_FOLDED_LOAD },
- { X86::CMP64rr, X86::CMP64mr, TB_FOLDED_LOAD },
- { X86::CMP8ri, X86::CMP8mi, TB_FOLDED_LOAD },
- { X86::CMP8rr, X86::CMP8mr, TB_FOLDED_LOAD },
- { X86::DIV16r, X86::DIV16m, TB_FOLDED_LOAD },
- { X86::DIV32r, X86::DIV32m, TB_FOLDED_LOAD },
- { X86::DIV64r, X86::DIV64m, TB_FOLDED_LOAD },
- { X86::DIV8r, X86::DIV8m, TB_FOLDED_LOAD },
- { X86::EXTRACTPSrr, X86::EXTRACTPSmr, TB_FOLDED_STORE },
- { X86::IDIV16r, X86::IDIV16m, TB_FOLDED_LOAD },
- { X86::IDIV32r, X86::IDIV32m, TB_FOLDED_LOAD },
- { X86::IDIV64r, X86::IDIV64m, TB_FOLDED_LOAD },
- { X86::IDIV8r, X86::IDIV8m, TB_FOLDED_LOAD },
- { X86::IMUL16r, X86::IMUL16m, TB_FOLDED_LOAD },
- { X86::IMUL32r, X86::IMUL32m, TB_FOLDED_LOAD },
- { X86::IMUL64r, X86::IMUL64m, TB_FOLDED_LOAD },
- { X86::IMUL8r, X86::IMUL8m, TB_FOLDED_LOAD },
- { X86::JMP32r, X86::JMP32m, TB_FOLDED_LOAD },
- { X86::JMP64r, X86::JMP64m, TB_FOLDED_LOAD },
- { X86::MOV16ri, X86::MOV16mi, TB_FOLDED_STORE },
- { X86::MOV16rr, X86::MOV16mr, TB_FOLDED_STORE },
- { X86::MOV32ri, X86::MOV32mi, TB_FOLDED_STORE },
- { X86::MOV32rr, X86::MOV32mr, TB_FOLDED_STORE },
- { X86::MOV64ri32, X86::MOV64mi32, TB_FOLDED_STORE },
- { X86::MOV64rr, X86::MOV64mr, TB_FOLDED_STORE },
- { X86::MOV8ri, X86::MOV8mi, TB_FOLDED_STORE },
- { X86::MOV8rr, X86::MOV8mr, TB_FOLDED_STORE },
- { X86::MOV8rr_NOREX, X86::MOV8mr_NOREX, TB_FOLDED_STORE },
- { X86::MOVAPDrr, X86::MOVAPDmr, TB_FOLDED_STORE | TB_ALIGN_16 },
- { X86::MOVAPSrr, X86::MOVAPSmr, TB_FOLDED_STORE | TB_ALIGN_16 },
- { X86::MOVDQArr, X86::MOVDQAmr, TB_FOLDED_STORE | TB_ALIGN_16 },
- { X86::MOVDQUrr, X86::MOVDQUmr, TB_FOLDED_STORE },
- { X86::MOVPDI2DIrr, X86::MOVPDI2DImr, TB_FOLDED_STORE },
- { X86::MOVPQIto64rr,X86::MOVPQI2QImr, TB_FOLDED_STORE },
- { X86::MOVSDto64rr, X86::MOVSDto64mr, TB_FOLDED_STORE },
- { X86::MOVSS2DIrr, X86::MOVSS2DImr, TB_FOLDED_STORE },
- { X86::MOVUPDrr, X86::MOVUPDmr, TB_FOLDED_STORE },
- { X86::MOVUPSrr, X86::MOVUPSmr, TB_FOLDED_STORE },
- { X86::MUL16r, X86::MUL16m, TB_FOLDED_LOAD },
- { X86::MUL32r, X86::MUL32m, TB_FOLDED_LOAD },
- { X86::MUL64r, X86::MUL64m, TB_FOLDED_LOAD },
- { X86::MUL8r, X86::MUL8m, TB_FOLDED_LOAD },
- { X86::PEXTRDrr, X86::PEXTRDmr, TB_FOLDED_STORE },
- { X86::PEXTRQrr, X86::PEXTRQmr, TB_FOLDED_STORE },
- { X86::PUSH16r, X86::PUSH16rmm, TB_FOLDED_LOAD },
- { X86::PUSH32r, X86::PUSH32rmm, TB_FOLDED_LOAD },
- { X86::PUSH64r, X86::PUSH64rmm, TB_FOLDED_LOAD },
- { X86::SETAEr, X86::SETAEm, TB_FOLDED_STORE },
- { X86::SETAr, X86::SETAm, TB_FOLDED_STORE },
- { X86::SETBEr, X86::SETBEm, TB_FOLDED_STORE },
- { X86::SETBr, X86::SETBm, TB_FOLDED_STORE },
- { X86::SETEr, X86::SETEm, TB_FOLDED_STORE },
- { X86::SETGEr, X86::SETGEm, TB_FOLDED_STORE },
- { X86::SETGr, X86::SETGm, TB_FOLDED_STORE },
- { X86::SETLEr, X86::SETLEm, TB_FOLDED_STORE },
- { X86::SETLr, X86::SETLm, TB_FOLDED_STORE },
- { X86::SETNEr, X86::SETNEm, TB_FOLDED_STORE },
- { X86::SETNOr, X86::SETNOm, TB_FOLDED_STORE },
- { X86::SETNPr, X86::SETNPm, TB_FOLDED_STORE },
- { X86::SETNSr, X86::SETNSm, TB_FOLDED_STORE },
- { X86::SETOr, X86::SETOm, TB_FOLDED_STORE },
- { X86::SETPr, X86::SETPm, TB_FOLDED_STORE },
- { X86::SETSr, X86::SETSm, TB_FOLDED_STORE },
- { X86::TAILJMPr, X86::TAILJMPm, TB_FOLDED_LOAD },
- { X86::TAILJMPr64, X86::TAILJMPm64, TB_FOLDED_LOAD },
- { X86::TAILJMPr64_REX, X86::TAILJMPm64_REX, TB_FOLDED_LOAD },
- { X86::TEST16ri, X86::TEST16mi, TB_FOLDED_LOAD },
- { X86::TEST32ri, X86::TEST32mi, TB_FOLDED_LOAD },
- { X86::TEST64ri32, X86::TEST64mi32, TB_FOLDED_LOAD },
- { X86::TEST8ri, X86::TEST8mi, TB_FOLDED_LOAD },
-
- // AVX 128-bit versions of foldable instructions
- { X86::VEXTRACTPSrr,X86::VEXTRACTPSmr, TB_FOLDED_STORE },
- { X86::VEXTRACTF128rr, X86::VEXTRACTF128mr, TB_FOLDED_STORE | TB_ALIGN_16 },
- { X86::VMOVAPDrr, X86::VMOVAPDmr, TB_FOLDED_STORE | TB_ALIGN_16 },
- { X86::VMOVAPSrr, X86::VMOVAPSmr, TB_FOLDED_STORE | TB_ALIGN_16 },
- { X86::VMOVDQArr, X86::VMOVDQAmr, TB_FOLDED_STORE | TB_ALIGN_16 },
- { X86::VMOVDQUrr, X86::VMOVDQUmr, TB_FOLDED_STORE },
- { X86::VMOVPDI2DIrr,X86::VMOVPDI2DImr, TB_FOLDED_STORE },
- { X86::VMOVPQIto64rr, X86::VMOVPQI2QImr,TB_FOLDED_STORE },
- { X86::VMOVSDto64rr,X86::VMOVSDto64mr, TB_FOLDED_STORE },
- { X86::VMOVSS2DIrr, X86::VMOVSS2DImr, TB_FOLDED_STORE },
- { X86::VMOVUPDrr, X86::VMOVUPDmr, TB_FOLDED_STORE },
- { X86::VMOVUPSrr, X86::VMOVUPSmr, TB_FOLDED_STORE },
- { X86::VPEXTRDrr, X86::VPEXTRDmr, TB_FOLDED_STORE },
- { X86::VPEXTRQrr, X86::VPEXTRQmr, TB_FOLDED_STORE },
-
- // AVX 256-bit foldable instructions
- { X86::VEXTRACTI128rr, X86::VEXTRACTI128mr, TB_FOLDED_STORE | TB_ALIGN_16 },
- { X86::VMOVAPDYrr, X86::VMOVAPDYmr, TB_FOLDED_STORE | TB_ALIGN_32 },
- { X86::VMOVAPSYrr, X86::VMOVAPSYmr, TB_FOLDED_STORE | TB_ALIGN_32 },
- { X86::VMOVDQAYrr, X86::VMOVDQAYmr, TB_FOLDED_STORE | TB_ALIGN_32 },
- { X86::VMOVDQUYrr, X86::VMOVDQUYmr, TB_FOLDED_STORE },
- { X86::VMOVUPDYrr, X86::VMOVUPDYmr, TB_FOLDED_STORE },
- { X86::VMOVUPSYrr, X86::VMOVUPSYmr, TB_FOLDED_STORE },
-
- // AVX-512 foldable instructions
- { X86::VEXTRACTF32x4Zrr,X86::VEXTRACTF32x4Zmr, TB_FOLDED_STORE },
- { X86::VEXTRACTF32x8Zrr,X86::VEXTRACTF32x8Zmr, TB_FOLDED_STORE },
- { X86::VEXTRACTF64x2Zrr,X86::VEXTRACTF64x2Zmr, TB_FOLDED_STORE },
- { X86::VEXTRACTF64x4Zrr,X86::VEXTRACTF64x4Zmr, TB_FOLDED_STORE },
- { X86::VEXTRACTI32x4Zrr,X86::VEXTRACTI32x4Zmr, TB_FOLDED_STORE },
- { X86::VEXTRACTI32x8Zrr,X86::VEXTRACTI32x8Zmr, TB_FOLDED_STORE },
- { X86::VEXTRACTI64x2Zrr,X86::VEXTRACTI64x2Zmr, TB_FOLDED_STORE },
- { X86::VEXTRACTI64x4Zrr,X86::VEXTRACTI64x4Zmr, TB_FOLDED_STORE },
- { X86::VEXTRACTPSZrr, X86::VEXTRACTPSZmr, TB_FOLDED_STORE },
- { X86::VMOVAPDZrr, X86::VMOVAPDZmr, TB_FOLDED_STORE | TB_ALIGN_64 },
- { X86::VMOVAPSZrr, X86::VMOVAPSZmr, TB_FOLDED_STORE | TB_ALIGN_64 },
- { X86::VMOVDQA32Zrr, X86::VMOVDQA32Zmr, TB_FOLDED_STORE | TB_ALIGN_64 },
- { X86::VMOVDQA64Zrr, X86::VMOVDQA64Zmr, TB_FOLDED_STORE | TB_ALIGN_64 },
- { X86::VMOVDQU8Zrr, X86::VMOVDQU8Zmr, TB_FOLDED_STORE },
- { X86::VMOVDQU16Zrr, X86::VMOVDQU16Zmr, TB_FOLDED_STORE },
- { X86::VMOVDQU32Zrr, X86::VMOVDQU32Zmr, TB_FOLDED_STORE },
- { X86::VMOVDQU64Zrr, X86::VMOVDQU64Zmr, TB_FOLDED_STORE },
- { X86::VMOVPDI2DIZrr, X86::VMOVPDI2DIZmr, TB_FOLDED_STORE },
- { X86::VMOVPQIto64Zrr, X86::VMOVPQI2QIZmr, TB_FOLDED_STORE },
- { X86::VMOVSDto64Zrr, X86::VMOVSDto64Zmr, TB_FOLDED_STORE },
- { X86::VMOVSS2DIZrr, X86::VMOVSS2DIZmr, TB_FOLDED_STORE },
- { X86::VMOVUPDZrr, X86::VMOVUPDZmr, TB_FOLDED_STORE },
- { X86::VMOVUPSZrr, X86::VMOVUPSZmr, TB_FOLDED_STORE },
- { X86::VPEXTRDZrr, X86::VPEXTRDZmr, TB_FOLDED_STORE },
- { X86::VPEXTRQZrr, X86::VPEXTRQZmr, TB_FOLDED_STORE },
- { X86::VPMOVDBZrr, X86::VPMOVDBZmr, TB_FOLDED_STORE },
- { X86::VPMOVDWZrr, X86::VPMOVDWZmr, TB_FOLDED_STORE },
- { X86::VPMOVQDZrr, X86::VPMOVQDZmr, TB_FOLDED_STORE },
- { X86::VPMOVQWZrr, X86::VPMOVQWZmr, TB_FOLDED_STORE },
- { X86::VPMOVWBZrr, X86::VPMOVWBZmr, TB_FOLDED_STORE },
- { X86::VPMOVSDBZrr, X86::VPMOVSDBZmr, TB_FOLDED_STORE },
- { X86::VPMOVSDWZrr, X86::VPMOVSDWZmr, TB_FOLDED_STORE },
- { X86::VPMOVSQDZrr, X86::VPMOVSQDZmr, TB_FOLDED_STORE },
- { X86::VPMOVSQWZrr, X86::VPMOVSQWZmr, TB_FOLDED_STORE },
- { X86::VPMOVSWBZrr, X86::VPMOVSWBZmr, TB_FOLDED_STORE },
- { X86::VPMOVUSDBZrr, X86::VPMOVUSDBZmr, TB_FOLDED_STORE },
- { X86::VPMOVUSDWZrr, X86::VPMOVUSDWZmr, TB_FOLDED_STORE },
- { X86::VPMOVUSQDZrr, X86::VPMOVUSQDZmr, TB_FOLDED_STORE },
- { X86::VPMOVUSQWZrr, X86::VPMOVUSQWZmr, TB_FOLDED_STORE },
- { X86::VPMOVUSWBZrr, X86::VPMOVUSWBZmr, TB_FOLDED_STORE },
-
- // AVX-512 foldable instructions (256-bit versions)
- { X86::VEXTRACTF32x4Z256rr,X86::VEXTRACTF32x4Z256mr, TB_FOLDED_STORE },
- { X86::VEXTRACTF64x2Z256rr,X86::VEXTRACTF64x2Z256mr, TB_FOLDED_STORE },
- { X86::VEXTRACTI32x4Z256rr,X86::VEXTRACTI32x4Z256mr, TB_FOLDED_STORE },
- { X86::VEXTRACTI64x2Z256rr,X86::VEXTRACTI64x2Z256mr, TB_FOLDED_STORE },
- { X86::VMOVAPDZ256rr, X86::VMOVAPDZ256mr, TB_FOLDED_STORE | TB_ALIGN_32 },
- { X86::VMOVAPSZ256rr, X86::VMOVAPSZ256mr, TB_FOLDED_STORE | TB_ALIGN_32 },
- { X86::VMOVDQA32Z256rr, X86::VMOVDQA32Z256mr, TB_FOLDED_STORE | TB_ALIGN_32 },
- { X86::VMOVDQA64Z256rr, X86::VMOVDQA64Z256mr, TB_FOLDED_STORE | TB_ALIGN_32 },
- { X86::VMOVUPDZ256rr, X86::VMOVUPDZ256mr, TB_FOLDED_STORE },
- { X86::VMOVUPSZ256rr, X86::VMOVUPSZ256mr, TB_FOLDED_STORE },
- { X86::VMOVDQU8Z256rr, X86::VMOVDQU8Z256mr, TB_FOLDED_STORE },
- { X86::VMOVDQU16Z256rr, X86::VMOVDQU16Z256mr, TB_FOLDED_STORE },
- { X86::VMOVDQU32Z256rr, X86::VMOVDQU32Z256mr, TB_FOLDED_STORE },
- { X86::VMOVDQU64Z256rr, X86::VMOVDQU64Z256mr, TB_FOLDED_STORE },
- { X86::VPMOVDWZ256rr, X86::VPMOVDWZ256mr, TB_FOLDED_STORE },
- { X86::VPMOVQDZ256rr, X86::VPMOVQDZ256mr, TB_FOLDED_STORE },
- { X86::VPMOVWBZ256rr, X86::VPMOVWBZ256mr, TB_FOLDED_STORE },
- { X86::VPMOVSDWZ256rr, X86::VPMOVSDWZ256mr, TB_FOLDED_STORE },
- { X86::VPMOVSQDZ256rr, X86::VPMOVSQDZ256mr, TB_FOLDED_STORE },
- { X86::VPMOVSWBZ256rr, X86::VPMOVSWBZ256mr, TB_FOLDED_STORE },
- { X86::VPMOVUSDWZ256rr, X86::VPMOVUSDWZ256mr, TB_FOLDED_STORE },
- { X86::VPMOVUSQDZ256rr, X86::VPMOVUSQDZ256mr, TB_FOLDED_STORE },
- { X86::VPMOVUSWBZ256rr, X86::VPMOVUSWBZ256mr, TB_FOLDED_STORE },
-
- // AVX-512 foldable instructions (128-bit versions)
- { X86::VMOVAPDZ128rr, X86::VMOVAPDZ128mr, TB_FOLDED_STORE | TB_ALIGN_16 },
- { X86::VMOVAPSZ128rr, X86::VMOVAPSZ128mr, TB_FOLDED_STORE | TB_ALIGN_16 },
- { X86::VMOVDQA32Z128rr, X86::VMOVDQA32Z128mr, TB_FOLDED_STORE | TB_ALIGN_16 },
- { X86::VMOVDQA64Z128rr, X86::VMOVDQA64Z128mr, TB_FOLDED_STORE | TB_ALIGN_16 },
- { X86::VMOVUPDZ128rr, X86::VMOVUPDZ128mr, TB_FOLDED_STORE },
- { X86::VMOVUPSZ128rr, X86::VMOVUPSZ128mr, TB_FOLDED_STORE },
- { X86::VMOVDQU8Z128rr, X86::VMOVDQU8Z128mr, TB_FOLDED_STORE },
- { X86::VMOVDQU16Z128rr, X86::VMOVDQU16Z128mr, TB_FOLDED_STORE },
- { X86::VMOVDQU32Z128rr, X86::VMOVDQU32Z128mr, TB_FOLDED_STORE },
- { X86::VMOVDQU64Z128rr, X86::VMOVDQU64Z128mr, TB_FOLDED_STORE },
-
- // F16C foldable instructions
- { X86::VCVTPS2PHrr, X86::VCVTPS2PHmr, TB_FOLDED_STORE },
- { X86::VCVTPS2PHYrr, X86::VCVTPS2PHYmr, TB_FOLDED_STORE }
- };
-
for (X86MemoryFoldTableEntry Entry : MemoryFoldTable0) {
AddTableEntry(RegOp2MemOpTable0, MemOp2RegOpTable,
Entry.RegOp, Entry.MemOp, TB_INDEX_0 | Entry.Flags);
}
- static const X86MemoryFoldTableEntry MemoryFoldTable1[] = {
- { X86::BSF16rr, X86::BSF16rm, 0 },
- { X86::BSF32rr, X86::BSF32rm, 0 },
- { X86::BSF64rr, X86::BSF64rm, 0 },
- { X86::BSR16rr, X86::BSR16rm, 0 },
- { X86::BSR32rr, X86::BSR32rm, 0 },
- { X86::BSR64rr, X86::BSR64rm, 0 },
- { X86::CMP16rr, X86::CMP16rm, 0 },
- { X86::CMP32rr, X86::CMP32rm, 0 },
- { X86::CMP64rr, X86::CMP64rm, 0 },
- { X86::CMP8rr, X86::CMP8rm, 0 },
- { X86::CVTSD2SSrr, X86::CVTSD2SSrm, 0 },
- { X86::CVTSI2SD64rr, X86::CVTSI2SD64rm, 0 },
- { X86::CVTSI2SDrr, X86::CVTSI2SDrm, 0 },
- { X86::CVTSI2SS64rr, X86::CVTSI2SS64rm, 0 },
- { X86::CVTSI2SSrr, X86::CVTSI2SSrm, 0 },
- { X86::CVTSS2SDrr, X86::CVTSS2SDrm, 0 },
- { X86::CVTTSD2SI64rr, X86::CVTTSD2SI64rm, 0 },
- { X86::CVTTSD2SIrr, X86::CVTTSD2SIrm, 0 },
- { X86::CVTTSS2SI64rr, X86::CVTTSS2SI64rm, 0 },
- { X86::CVTTSS2SIrr, X86::CVTTSS2SIrm, 0 },
- { X86::IMUL16rri, X86::IMUL16rmi, 0 },
- { X86::IMUL16rri8, X86::IMUL16rmi8, 0 },
- { X86::IMUL32rri, X86::IMUL32rmi, 0 },
- { X86::IMUL32rri8, X86::IMUL32rmi8, 0 },
- { X86::IMUL64rri32, X86::IMUL64rmi32, 0 },
- { X86::IMUL64rri8, X86::IMUL64rmi8, 0 },
- { X86::Int_COMISDrr, X86::Int_COMISDrm, TB_NO_REVERSE },
- { X86::Int_COMISSrr, X86::Int_COMISSrm, TB_NO_REVERSE },
- { X86::CVTSD2SI64rr, X86::CVTSD2SI64rm, TB_NO_REVERSE },
- { X86::CVTSD2SIrr, X86::CVTSD2SIrm, TB_NO_REVERSE },
- { X86::CVTSS2SI64rr, X86::CVTSS2SI64rm, TB_NO_REVERSE },
- { X86::CVTSS2SIrr, X86::CVTSS2SIrm, TB_NO_REVERSE },
- { X86::CVTDQ2PDrr, X86::CVTDQ2PDrm, TB_NO_REVERSE },
- { X86::CVTDQ2PSrr, X86::CVTDQ2PSrm, TB_ALIGN_16 },
- { X86::CVTPD2DQrr, X86::CVTPD2DQrm, TB_ALIGN_16 },
- { X86::CVTPD2PSrr, X86::CVTPD2PSrm, TB_ALIGN_16 },
- { X86::CVTPS2DQrr, X86::CVTPS2DQrm, TB_ALIGN_16 },
- { X86::CVTPS2PDrr, X86::CVTPS2PDrm, TB_NO_REVERSE },
- { X86::CVTTPD2DQrr, X86::CVTTPD2DQrm, TB_ALIGN_16 },
- { X86::CVTTPS2DQrr, X86::CVTTPS2DQrm, TB_ALIGN_16 },
- { X86::Int_CVTTSD2SI64rr,X86::Int_CVTTSD2SI64rm, TB_NO_REVERSE },
- { X86::Int_CVTTSD2SIrr, X86::Int_CVTTSD2SIrm, TB_NO_REVERSE },
- { X86::Int_CVTTSS2SI64rr,X86::Int_CVTTSS2SI64rm, TB_NO_REVERSE },
- { X86::Int_CVTTSS2SIrr, X86::Int_CVTTSS2SIrm, TB_NO_REVERSE },
- { X86::Int_UCOMISDrr, X86::Int_UCOMISDrm, TB_NO_REVERSE },
- { X86::Int_UCOMISSrr, X86::Int_UCOMISSrm, TB_NO_REVERSE },
- { X86::MOV16rr, X86::MOV16rm, 0 },
- { X86::MOV32rr, X86::MOV32rm, 0 },
- { X86::MOV64rr, X86::MOV64rm, 0 },
- { X86::MOV64toPQIrr, X86::MOVQI2PQIrm, 0 },
- { X86::MOV64toSDrr, X86::MOV64toSDrm, 0 },
- { X86::MOV8rr, X86::MOV8rm, 0 },
- { X86::MOVAPDrr, X86::MOVAPDrm, TB_ALIGN_16 },
- { X86::MOVAPSrr, X86::MOVAPSrm, TB_ALIGN_16 },
- { X86::MOVDDUPrr, X86::MOVDDUPrm, TB_NO_REVERSE },
- { X86::MOVDI2PDIrr, X86::MOVDI2PDIrm, 0 },
- { X86::MOVDI2SSrr, X86::MOVDI2SSrm, 0 },
- { X86::MOVDQArr, X86::MOVDQArm, TB_ALIGN_16 },
- { X86::MOVDQUrr, X86::MOVDQUrm, 0 },
- { X86::MOVSHDUPrr, X86::MOVSHDUPrm, TB_ALIGN_16 },
- { X86::MOVSLDUPrr, X86::MOVSLDUPrm, TB_ALIGN_16 },
- { X86::MOVSX16rr8, X86::MOVSX16rm8, 0 },
- { X86::MOVSX32rr16, X86::MOVSX32rm16, 0 },
- { X86::MOVSX32rr8, X86::MOVSX32rm8, 0 },
- { X86::MOVSX64rr16, X86::MOVSX64rm16, 0 },
- { X86::MOVSX64rr32, X86::MOVSX64rm32, 0 },
- { X86::MOVSX64rr8, X86::MOVSX64rm8, 0 },
- { X86::MOVUPDrr, X86::MOVUPDrm, 0 },
- { X86::MOVUPSrr, X86::MOVUPSrm, 0 },
- { X86::MOVZPQILo2PQIrr, X86::MOVQI2PQIrm, TB_NO_REVERSE },
- { X86::MOVZX16rr8, X86::MOVZX16rm8, 0 },
- { X86::MOVZX32rr16, X86::MOVZX32rm16, 0 },
- { X86::MOVZX32_NOREXrr8, X86::MOVZX32_NOREXrm8, 0 },
- { X86::MOVZX32rr8, X86::MOVZX32rm8, 0 },
- { X86::PABSBrr, X86::PABSBrm, TB_ALIGN_16 },
- { X86::PABSDrr, X86::PABSDrm, TB_ALIGN_16 },
- { X86::PABSWrr, X86::PABSWrm, TB_ALIGN_16 },
- { X86::PCMPESTRIrr, X86::PCMPESTRIrm, TB_ALIGN_16 },
- { X86::PCMPESTRM128rr, X86::PCMPESTRM128rm, TB_ALIGN_16 },
- { X86::PCMPISTRIrr, X86::PCMPISTRIrm, TB_ALIGN_16 },
- { X86::PCMPISTRM128rr, X86::PCMPISTRM128rm, TB_ALIGN_16 },
- { X86::PHMINPOSUWrr128, X86::PHMINPOSUWrm128, TB_ALIGN_16 },
- { X86::PMOVSXBDrr, X86::PMOVSXBDrm, TB_NO_REVERSE },
- { X86::PMOVSXBQrr, X86::PMOVSXBQrm, TB_NO_REVERSE },
- { X86::PMOVSXBWrr, X86::PMOVSXBWrm, TB_NO_REVERSE },
- { X86::PMOVSXDQrr, X86::PMOVSXDQrm, TB_NO_REVERSE },
- { X86::PMOVSXWDrr, X86::PMOVSXWDrm, TB_NO_REVERSE },
- { X86::PMOVSXWQrr, X86::PMOVSXWQrm, TB_NO_REVERSE },
- { X86::PMOVZXBDrr, X86::PMOVZXBDrm, TB_NO_REVERSE },
- { X86::PMOVZXBQrr, X86::PMOVZXBQrm, TB_NO_REVERSE },
- { X86::PMOVZXBWrr, X86::PMOVZXBWrm, TB_NO_REVERSE },
- { X86::PMOVZXDQrr, X86::PMOVZXDQrm, TB_NO_REVERSE },
- { X86::PMOVZXWDrr, X86::PMOVZXWDrm, TB_NO_REVERSE },
- { X86::PMOVZXWQrr, X86::PMOVZXWQrm, TB_NO_REVERSE },
- { X86::PSHUFDri, X86::PSHUFDmi, TB_ALIGN_16 },
- { X86::PSHUFHWri, X86::PSHUFHWmi, TB_ALIGN_16 },
- { X86::PSHUFLWri, X86::PSHUFLWmi, TB_ALIGN_16 },
- { X86::PTESTrr, X86::PTESTrm, TB_ALIGN_16 },
- { X86::RCPPSr, X86::RCPPSm, TB_ALIGN_16 },
- { X86::RCPSSr, X86::RCPSSm, 0 },
- { X86::RCPSSr_Int, X86::RCPSSm_Int, TB_NO_REVERSE },
- { X86::ROUNDPDr, X86::ROUNDPDm, TB_ALIGN_16 },
- { X86::ROUNDPSr, X86::ROUNDPSm, TB_ALIGN_16 },
- { X86::ROUNDSDr, X86::ROUNDSDm, 0 },
- { X86::ROUNDSSr, X86::ROUNDSSm, 0 },
- { X86::RSQRTPSr, X86::RSQRTPSm, TB_ALIGN_16 },
- { X86::RSQRTSSr, X86::RSQRTSSm, 0 },
- { X86::RSQRTSSr_Int, X86::RSQRTSSm_Int, TB_NO_REVERSE },
- { X86::SQRTPDr, X86::SQRTPDm, TB_ALIGN_16 },
- { X86::SQRTPSr, X86::SQRTPSm, TB_ALIGN_16 },
- { X86::SQRTSDr, X86::SQRTSDm, 0 },
- { X86::SQRTSDr_Int, X86::SQRTSDm_Int, TB_NO_REVERSE },
- { X86::SQRTSSr, X86::SQRTSSm, 0 },
- { X86::SQRTSSr_Int, X86::SQRTSSm_Int, TB_NO_REVERSE },
- { X86::TEST16rr, X86::TEST16rm, 0 },
- { X86::TEST32rr, X86::TEST32rm, 0 },
- { X86::TEST64rr, X86::TEST64rm, 0 },
- { X86::TEST8rr, X86::TEST8rm, 0 },
- // FIXME: TEST*rr EAX,EAX ---> CMP [mem], 0
- { X86::UCOMISDrr, X86::UCOMISDrm, 0 },
- { X86::UCOMISSrr, X86::UCOMISSrm, 0 },
-
- // MMX version of foldable instructions
- { X86::MMX_CVTPD2PIirr, X86::MMX_CVTPD2PIirm, 0 },
- { X86::MMX_CVTPI2PDirr, X86::MMX_CVTPI2PDirm, 0 },
- { X86::MMX_CVTPS2PIirr, X86::MMX_CVTPS2PIirm, 0 },
- { X86::MMX_CVTTPD2PIirr, X86::MMX_CVTTPD2PIirm, 0 },
- { X86::MMX_CVTTPS2PIirr, X86::MMX_CVTTPS2PIirm, 0 },
- { X86::MMX_MOVD64to64rr, X86::MMX_MOVQ64rm, 0 },
- { X86::MMX_PABSBrr64, X86::MMX_PABSBrm64, 0 },
- { X86::MMX_PABSDrr64, X86::MMX_PABSDrm64, 0 },
- { X86::MMX_PABSWrr64, X86::MMX_PABSWrm64, 0 },
- { X86::MMX_PSHUFWri, X86::MMX_PSHUFWmi, 0 },
-
- // 3DNow! version of foldable instructions
- { X86::PF2IDrr, X86::PF2IDrm, 0 },
- { X86::PF2IWrr, X86::PF2IWrm, 0 },
- { X86::PFRCPrr, X86::PFRCPrm, 0 },
- { X86::PFRSQRTrr, X86::PFRSQRTrm, 0 },
- { X86::PI2FDrr, X86::PI2FDrm, 0 },
- { X86::PI2FWrr, X86::PI2FWrm, 0 },
- { X86::PSWAPDrr, X86::PSWAPDrm, 0 },
-
- // AVX 128-bit versions of foldable instructions
- { X86::Int_VCOMISDrr, X86::Int_VCOMISDrm, TB_NO_REVERSE },
- { X86::Int_VCOMISSrr, X86::Int_VCOMISSrm, TB_NO_REVERSE },
- { X86::Int_VUCOMISDrr, X86::Int_VUCOMISDrm, TB_NO_REVERSE },
- { X86::Int_VUCOMISSrr, X86::Int_VUCOMISSrm, TB_NO_REVERSE },
- { X86::VCVTTSD2SI64rr, X86::VCVTTSD2SI64rm, 0 },
- { X86::Int_VCVTTSD2SI64rr,X86::Int_VCVTTSD2SI64rm,TB_NO_REVERSE },
- { X86::VCVTTSD2SIrr, X86::VCVTTSD2SIrm, 0 },
- { X86::Int_VCVTTSD2SIrr,X86::Int_VCVTTSD2SIrm, TB_NO_REVERSE },
- { X86::VCVTTSS2SI64rr, X86::VCVTTSS2SI64rm, 0 },
- { X86::Int_VCVTTSS2SI64rr,X86::Int_VCVTTSS2SI64rm,TB_NO_REVERSE },
- { X86::VCVTTSS2SIrr, X86::VCVTTSS2SIrm, 0 },
- { X86::Int_VCVTTSS2SIrr,X86::Int_VCVTTSS2SIrm, TB_NO_REVERSE },
- { X86::VCVTSD2SI64rr, X86::VCVTSD2SI64rm, TB_NO_REVERSE },
- { X86::VCVTSD2SIrr, X86::VCVTSD2SIrm, TB_NO_REVERSE },
- { X86::VCVTSS2SI64rr, X86::VCVTSS2SI64rm, TB_NO_REVERSE },
- { X86::VCVTSS2SIrr, X86::VCVTSS2SIrm, TB_NO_REVERSE },
- { X86::VCVTDQ2PDrr, X86::VCVTDQ2PDrm, TB_NO_REVERSE },
- { X86::VCVTDQ2PSrr, X86::VCVTDQ2PSrm, 0 },
- { X86::VCVTPD2DQrr, X86::VCVTPD2DQrm, 0 },
- { X86::VCVTPD2PSrr, X86::VCVTPD2PSrm, 0 },
- { X86::VCVTPS2DQrr, X86::VCVTPS2DQrm, 0 },
- { X86::VCVTPS2PDrr, X86::VCVTPS2PDrm, TB_NO_REVERSE },
- { X86::VCVTTPD2DQrr, X86::VCVTTPD2DQrm, 0 },
- { X86::VCVTTPS2DQrr, X86::VCVTTPS2DQrm, 0 },
- { X86::VMOV64toPQIrr, X86::VMOVQI2PQIrm, 0 },
- { X86::VMOV64toSDrr, X86::VMOV64toSDrm, 0 },
- { X86::VMOVAPDrr, X86::VMOVAPDrm, TB_ALIGN_16 },
- { X86::VMOVAPSrr, X86::VMOVAPSrm, TB_ALIGN_16 },
- { X86::VMOVDDUPrr, X86::VMOVDDUPrm, TB_NO_REVERSE },
- { X86::VMOVDI2PDIrr, X86::VMOVDI2PDIrm, 0 },
- { X86::VMOVDI2SSrr, X86::VMOVDI2SSrm, 0 },
- { X86::VMOVDQArr, X86::VMOVDQArm, TB_ALIGN_16 },
- { X86::VMOVDQUrr, X86::VMOVDQUrm, 0 },
- { X86::VMOVSLDUPrr, X86::VMOVSLDUPrm, 0 },
- { X86::VMOVSHDUPrr, X86::VMOVSHDUPrm, 0 },
- { X86::VMOVUPDrr, X86::VMOVUPDrm, 0 },
- { X86::VMOVUPSrr, X86::VMOVUPSrm, 0 },
- { X86::VMOVZPQILo2PQIrr,X86::VMOVQI2PQIrm, TB_NO_REVERSE },
- { X86::VPABSBrr, X86::VPABSBrm, 0 },
- { X86::VPABSDrr, X86::VPABSDrm, 0 },
- { X86::VPABSWrr, X86::VPABSWrm, 0 },
- { X86::VPCMPESTRIrr, X86::VPCMPESTRIrm, 0 },
- { X86::VPCMPESTRM128rr, X86::VPCMPESTRM128rm, 0 },
- { X86::VPCMPISTRIrr, X86::VPCMPISTRIrm, 0 },
- { X86::VPCMPISTRM128rr, X86::VPCMPISTRM128rm, 0 },
- { X86::VPHMINPOSUWrr128, X86::VPHMINPOSUWrm128, 0 },
- { X86::VPERMILPDri, X86::VPERMILPDmi, 0 },
- { X86::VPERMILPSri, X86::VPERMILPSmi, 0 },
- { X86::VPMOVSXBDrr, X86::VPMOVSXBDrm, TB_NO_REVERSE },
- { X86::VPMOVSXBQrr, X86::VPMOVSXBQrm, TB_NO_REVERSE },
- { X86::VPMOVSXBWrr, X86::VPMOVSXBWrm, TB_NO_REVERSE },
- { X86::VPMOVSXDQrr, X86::VPMOVSXDQrm, TB_NO_REVERSE },
- { X86::VPMOVSXWDrr, X86::VPMOVSXWDrm, TB_NO_REVERSE },
- { X86::VPMOVSXWQrr, X86::VPMOVSXWQrm, TB_NO_REVERSE },
- { X86::VPMOVZXBDrr, X86::VPMOVZXBDrm, TB_NO_REVERSE },
- { X86::VPMOVZXBQrr, X86::VPMOVZXBQrm, TB_NO_REVERSE },
- { X86::VPMOVZXBWrr, X86::VPMOVZXBWrm, TB_NO_REVERSE },
- { X86::VPMOVZXDQrr, X86::VPMOVZXDQrm, TB_NO_REVERSE },
- { X86::VPMOVZXWDrr, X86::VPMOVZXWDrm, TB_NO_REVERSE },
- { X86::VPMOVZXWQrr, X86::VPMOVZXWQrm, TB_NO_REVERSE },
- { X86::VPSHUFDri, X86::VPSHUFDmi, 0 },
- { X86::VPSHUFHWri, X86::VPSHUFHWmi, 0 },
- { X86::VPSHUFLWri, X86::VPSHUFLWmi, 0 },
- { X86::VPTESTrr, X86::VPTESTrm, 0 },
- { X86::VRCPPSr, X86::VRCPPSm, 0 },
- { X86::VROUNDPDr, X86::VROUNDPDm, 0 },
- { X86::VROUNDPSr, X86::VROUNDPSm, 0 },
- { X86::VRSQRTPSr, X86::VRSQRTPSm, 0 },
- { X86::VSQRTPDr, X86::VSQRTPDm, 0 },
- { X86::VSQRTPSr, X86::VSQRTPSm, 0 },
- { X86::VTESTPDrr, X86::VTESTPDrm, 0 },
- { X86::VTESTPSrr, X86::VTESTPSrm, 0 },
- { X86::VUCOMISDrr, X86::VUCOMISDrm, 0 },
- { X86::VUCOMISSrr, X86::VUCOMISSrm, 0 },
-
- // AVX 256-bit foldable instructions
- { X86::VCVTDQ2PDYrr, X86::VCVTDQ2PDYrm, TB_NO_REVERSE },
- { X86::VCVTDQ2PSYrr, X86::VCVTDQ2PSYrm, 0 },
- { X86::VCVTPD2DQYrr, X86::VCVTPD2DQYrm, 0 },
- { X86::VCVTPD2PSYrr, X86::VCVTPD2PSYrm, 0 },
- { X86::VCVTPS2DQYrr, X86::VCVTPS2DQYrm, 0 },
- { X86::VCVTPS2PDYrr, X86::VCVTPS2PDYrm, TB_NO_REVERSE },
- { X86::VCVTTPD2DQYrr, X86::VCVTTPD2DQYrm, 0 },
- { X86::VCVTTPS2DQYrr, X86::VCVTTPS2DQYrm, 0 },
- { X86::VMOVAPDYrr, X86::VMOVAPDYrm, TB_ALIGN_32 },
- { X86::VMOVAPSYrr, X86::VMOVAPSYrm, TB_ALIGN_32 },
- { X86::VMOVDDUPYrr, X86::VMOVDDUPYrm, 0 },
- { X86::VMOVDQAYrr, X86::VMOVDQAYrm, TB_ALIGN_32 },
- { X86::VMOVDQUYrr, X86::VMOVDQUYrm, 0 },
- { X86::VMOVSLDUPYrr, X86::VMOVSLDUPYrm, 0 },
- { X86::VMOVSHDUPYrr, X86::VMOVSHDUPYrm, 0 },
- { X86::VMOVUPDYrr, X86::VMOVUPDYrm, 0 },
- { X86::VMOVUPSYrr, X86::VMOVUPSYrm, 0 },
- { X86::VPERMILPDYri, X86::VPERMILPDYmi, 0 },
- { X86::VPERMILPSYri, X86::VPERMILPSYmi, 0 },
- { X86::VPTESTYrr, X86::VPTESTYrm, 0 },
- { X86::VRCPPSYr, X86::VRCPPSYm, 0 },
- { X86::VROUNDYPDr, X86::VROUNDYPDm, 0 },
- { X86::VROUNDYPSr, X86::VROUNDYPSm, 0 },
- { X86::VRSQRTPSYr, X86::VRSQRTPSYm, 0 },
- { X86::VSQRTPDYr, X86::VSQRTPDYm, 0 },
- { X86::VSQRTPSYr, X86::VSQRTPSYm, 0 },
- { X86::VTESTPDYrr, X86::VTESTPDYrm, 0 },
- { X86::VTESTPSYrr, X86::VTESTPSYrm, 0 },
-
- // AVX2 foldable instructions
-
- // VBROADCASTS{SD}rr register instructions were an AVX2 addition while the
- // VBROADCASTS{SD}rm memory instructions were available from AVX1.
- // TB_NO_REVERSE prevents unfolding from introducing an illegal instruction
- // on AVX1 targets. The VPBROADCAST instructions are all AVX2 instructions
- // so they don't need an equivalent limitation.
- { X86::VBROADCASTSSrr, X86::VBROADCASTSSrm, TB_NO_REVERSE },
- { X86::VBROADCASTSSYrr, X86::VBROADCASTSSYrm, TB_NO_REVERSE },
- { X86::VBROADCASTSDYrr, X86::VBROADCASTSDYrm, TB_NO_REVERSE },
- { X86::VPABSBYrr, X86::VPABSBYrm, 0 },
- { X86::VPABSDYrr, X86::VPABSDYrm, 0 },
- { X86::VPABSWYrr, X86::VPABSWYrm, 0 },
- { X86::VPBROADCASTBrr, X86::VPBROADCASTBrm, TB_NO_REVERSE },
- { X86::VPBROADCASTBYrr, X86::VPBROADCASTBYrm, TB_NO_REVERSE },
- { X86::VPBROADCASTDrr, X86::VPBROADCASTDrm, TB_NO_REVERSE },
- { X86::VPBROADCASTDYrr, X86::VPBROADCASTDYrm, TB_NO_REVERSE },
- { X86::VPBROADCASTQrr, X86::VPBROADCASTQrm, TB_NO_REVERSE },
- { X86::VPBROADCASTQYrr, X86::VPBROADCASTQYrm, TB_NO_REVERSE },
- { X86::VPBROADCASTWrr, X86::VPBROADCASTWrm, TB_NO_REVERSE },
- { X86::VPBROADCASTWYrr, X86::VPBROADCASTWYrm, TB_NO_REVERSE },
- { X86::VPERMPDYri, X86::VPERMPDYmi, 0 },
- { X86::VPERMQYri, X86::VPERMQYmi, 0 },
- { X86::VPMOVSXBDYrr, X86::VPMOVSXBDYrm, TB_NO_REVERSE },
- { X86::VPMOVSXBQYrr, X86::VPMOVSXBQYrm, TB_NO_REVERSE },
- { X86::VPMOVSXBWYrr, X86::VPMOVSXBWYrm, 0 },
- { X86::VPMOVSXDQYrr, X86::VPMOVSXDQYrm, 0 },
- { X86::VPMOVSXWDYrr, X86::VPMOVSXWDYrm, 0 },
- { X86::VPMOVSXWQYrr, X86::VPMOVSXWQYrm, TB_NO_REVERSE },
- { X86::VPMOVZXBDYrr, X86::VPMOVZXBDYrm, TB_NO_REVERSE },
- { X86::VPMOVZXBQYrr, X86::VPMOVZXBQYrm, TB_NO_REVERSE },
- { X86::VPMOVZXBWYrr, X86::VPMOVZXBWYrm, 0 },
- { X86::VPMOVZXDQYrr, X86::VPMOVZXDQYrm, 0 },
- { X86::VPMOVZXWDYrr, X86::VPMOVZXWDYrm, 0 },
- { X86::VPMOVZXWQYrr, X86::VPMOVZXWQYrm, TB_NO_REVERSE },
- { X86::VPSHUFDYri, X86::VPSHUFDYmi, 0 },
- { X86::VPSHUFHWYri, X86::VPSHUFHWYmi, 0 },
- { X86::VPSHUFLWYri, X86::VPSHUFLWYmi, 0 },
-
- // XOP foldable instructions
- { X86::VFRCZPDrr, X86::VFRCZPDrm, 0 },
- { X86::VFRCZPDrrY, X86::VFRCZPDrmY, 0 },
- { X86::VFRCZPSrr, X86::VFRCZPSrm, 0 },
- { X86::VFRCZPSrrY, X86::VFRCZPSrmY, 0 },
- { X86::VFRCZSDrr, X86::VFRCZSDrm, 0 },
- { X86::VFRCZSSrr, X86::VFRCZSSrm, 0 },
- { X86::VPHADDBDrr, X86::VPHADDBDrm, 0 },
- { X86::VPHADDBQrr, X86::VPHADDBQrm, 0 },
- { X86::VPHADDBWrr, X86::VPHADDBWrm, 0 },
- { X86::VPHADDDQrr, X86::VPHADDDQrm, 0 },
- { X86::VPHADDWDrr, X86::VPHADDWDrm, 0 },
- { X86::VPHADDWQrr, X86::VPHADDWQrm, 0 },
- { X86::VPHADDUBDrr, X86::VPHADDUBDrm, 0 },
- { X86::VPHADDUBQrr, X86::VPHADDUBQrm, 0 },
- { X86::VPHADDUBWrr, X86::VPHADDUBWrm, 0 },
- { X86::VPHADDUDQrr, X86::VPHADDUDQrm, 0 },
- { X86::VPHADDUWDrr, X86::VPHADDUWDrm, 0 },
- { X86::VPHADDUWQrr, X86::VPHADDUWQrm, 0 },
- { X86::VPHSUBBWrr, X86::VPHSUBBWrm, 0 },
- { X86::VPHSUBDQrr, X86::VPHSUBDQrm, 0 },
- { X86::VPHSUBWDrr, X86::VPHSUBWDrm, 0 },
- { X86::VPROTBri, X86::VPROTBmi, 0 },
- { X86::VPROTBrr, X86::VPROTBmr, 0 },
- { X86::VPROTDri, X86::VPROTDmi, 0 },
- { X86::VPROTDrr, X86::VPROTDmr, 0 },
- { X86::VPROTQri, X86::VPROTQmi, 0 },
- { X86::VPROTQrr, X86::VPROTQmr, 0 },
- { X86::VPROTWri, X86::VPROTWmi, 0 },
- { X86::VPROTWrr, X86::VPROTWmr, 0 },
- { X86::VPSHABrr, X86::VPSHABmr, 0 },
- { X86::VPSHADrr, X86::VPSHADmr, 0 },
- { X86::VPSHAQrr, X86::VPSHAQmr, 0 },
- { X86::VPSHAWrr, X86::VPSHAWmr, 0 },
- { X86::VPSHLBrr, X86::VPSHLBmr, 0 },
- { X86::VPSHLDrr, X86::VPSHLDmr, 0 },
- { X86::VPSHLQrr, X86::VPSHLQmr, 0 },
- { X86::VPSHLWrr, X86::VPSHLWmr, 0 },
-
- // LWP foldable instructions
- { X86::LWPINS32rri, X86::LWPINS32rmi, 0 },
- { X86::LWPINS64rri, X86::LWPINS64rmi, 0 },
- { X86::LWPVAL32rri, X86::LWPVAL32rmi, 0 },
- { X86::LWPVAL64rri, X86::LWPVAL64rmi, 0 },
-
- // BMI/BMI2/LZCNT/POPCNT/TBM foldable instructions
- { X86::BEXTR32rr, X86::BEXTR32rm, 0 },
- { X86::BEXTR64rr, X86::BEXTR64rm, 0 },
- { X86::BEXTRI32ri, X86::BEXTRI32mi, 0 },
- { X86::BEXTRI64ri, X86::BEXTRI64mi, 0 },
- { X86::BLCFILL32rr, X86::BLCFILL32rm, 0 },
- { X86::BLCFILL64rr, X86::BLCFILL64rm, 0 },
- { X86::BLCI32rr, X86::BLCI32rm, 0 },
- { X86::BLCI64rr, X86::BLCI64rm, 0 },
- { X86::BLCIC32rr, X86::BLCIC32rm, 0 },
- { X86::BLCIC64rr, X86::BLCIC64rm, 0 },
- { X86::BLCMSK32rr, X86::BLCMSK32rm, 0 },
- { X86::BLCMSK64rr, X86::BLCMSK64rm, 0 },
- { X86::BLCS32rr, X86::BLCS32rm, 0 },
- { X86::BLCS64rr, X86::BLCS64rm, 0 },
- { X86::BLSFILL32rr, X86::BLSFILL32rm, 0 },
- { X86::BLSFILL64rr, X86::BLSFILL64rm, 0 },
- { X86::BLSI32rr, X86::BLSI32rm, 0 },
- { X86::BLSI64rr, X86::BLSI64rm, 0 },
- { X86::BLSIC32rr, X86::BLSIC32rm, 0 },
- { X86::BLSIC64rr, X86::BLSIC64rm, 0 },
- { X86::BLSMSK32rr, X86::BLSMSK32rm, 0 },
- { X86::BLSMSK64rr, X86::BLSMSK64rm, 0 },
- { X86::BLSR32rr, X86::BLSR32rm, 0 },
- { X86::BLSR64rr, X86::BLSR64rm, 0 },
- { X86::BZHI32rr, X86::BZHI32rm, 0 },
- { X86::BZHI64rr, X86::BZHI64rm, 0 },
- { X86::LZCNT16rr, X86::LZCNT16rm, 0 },
- { X86::LZCNT32rr, X86::LZCNT32rm, 0 },
- { X86::LZCNT64rr, X86::LZCNT64rm, 0 },
- { X86::POPCNT16rr, X86::POPCNT16rm, 0 },
- { X86::POPCNT32rr, X86::POPCNT32rm, 0 },
- { X86::POPCNT64rr, X86::POPCNT64rm, 0 },
- { X86::RORX32ri, X86::RORX32mi, 0 },
- { X86::RORX64ri, X86::RORX64mi, 0 },
- { X86::SARX32rr, X86::SARX32rm, 0 },
- { X86::SARX64rr, X86::SARX64rm, 0 },
- { X86::SHRX32rr, X86::SHRX32rm, 0 },
- { X86::SHRX64rr, X86::SHRX64rm, 0 },
- { X86::SHLX32rr, X86::SHLX32rm, 0 },
- { X86::SHLX64rr, X86::SHLX64rm, 0 },
- { X86::T1MSKC32rr, X86::T1MSKC32rm, 0 },
- { X86::T1MSKC64rr, X86::T1MSKC64rm, 0 },
- { X86::TZCNT16rr, X86::TZCNT16rm, 0 },
- { X86::TZCNT32rr, X86::TZCNT32rm, 0 },
- { X86::TZCNT64rr, X86::TZCNT64rm, 0 },
- { X86::TZMSK32rr, X86::TZMSK32rm, 0 },
- { X86::TZMSK64rr, X86::TZMSK64rm, 0 },
-
- // AVX-512 foldable instructions
- { X86::VBROADCASTSSZr, X86::VBROADCASTSSZm, TB_NO_REVERSE },
- { X86::VBROADCASTSDZr, X86::VBROADCASTSDZm, TB_NO_REVERSE },
- { X86::VMOV64toPQIZrr, X86::VMOVQI2PQIZrm, 0 },
- { X86::VMOV64toSDZrr, X86::VMOV64toSDZrm, 0 },
- { X86::VMOVDI2PDIZrr, X86::VMOVDI2PDIZrm, 0 },
- { X86::VMOVDI2SSZrr, X86::VMOVDI2SSZrm, 0 },
- { X86::VMOVAPDZrr, X86::VMOVAPDZrm, TB_ALIGN_64 },
- { X86::VMOVAPSZrr, X86::VMOVAPSZrm, TB_ALIGN_64 },
- { X86::VMOVDQA32Zrr, X86::VMOVDQA32Zrm, TB_ALIGN_64 },
- { X86::VMOVDQA64Zrr, X86::VMOVDQA64Zrm, TB_ALIGN_64 },
- { X86::VMOVDQU8Zrr, X86::VMOVDQU8Zrm, 0 },
- { X86::VMOVDQU16Zrr, X86::VMOVDQU16Zrm, 0 },
- { X86::VMOVDQU32Zrr, X86::VMOVDQU32Zrm, 0 },
- { X86::VMOVDQU64Zrr, X86::VMOVDQU64Zrm, 0 },
- { X86::VMOVUPDZrr, X86::VMOVUPDZrm, 0 },
- { X86::VMOVUPSZrr, X86::VMOVUPSZrm, 0 },
- { X86::VMOVZPQILo2PQIZrr,X86::VMOVQI2PQIZrm, TB_NO_REVERSE },
- { X86::VPABSBZrr, X86::VPABSBZrm, 0 },
- { X86::VPABSDZrr, X86::VPABSDZrm, 0 },
- { X86::VPABSQZrr, X86::VPABSQZrm, 0 },
- { X86::VPABSWZrr, X86::VPABSWZrm, 0 },
- { X86::VPERMILPDZri, X86::VPERMILPDZmi, 0 },
- { X86::VPERMILPSZri, X86::VPERMILPSZmi, 0 },
- { X86::VPERMPDZri, X86::VPERMPDZmi, 0 },
- { X86::VPERMQZri, X86::VPERMQZmi, 0 },
- { X86::VPMOVSXBDZrr, X86::VPMOVSXBDZrm, 0 },
- { X86::VPMOVSXBQZrr, X86::VPMOVSXBQZrm, TB_NO_REVERSE },
- { X86::VPMOVSXBWZrr, X86::VPMOVSXBWZrm, 0 },
- { X86::VPMOVSXDQZrr, X86::VPMOVSXDQZrm, 0 },
- { X86::VPMOVSXWDZrr, X86::VPMOVSXWDZrm, 0 },
- { X86::VPMOVSXWQZrr, X86::VPMOVSXWQZrm, 0 },
- { X86::VPMOVZXBDZrr, X86::VPMOVZXBDZrm, 0 },
- { X86::VPMOVZXBQZrr, X86::VPMOVZXBQZrm, TB_NO_REVERSE },
- { X86::VPMOVZXBWZrr, X86::VPMOVZXBWZrm, 0 },
- { X86::VPMOVZXDQZrr, X86::VPMOVZXDQZrm, 0 },
- { X86::VPMOVZXWDZrr, X86::VPMOVZXWDZrm, 0 },
- { X86::VPMOVZXWQZrr, X86::VPMOVZXWQZrm, 0 },
- { X86::VPSHUFDZri, X86::VPSHUFDZmi, 0 },
- { X86::VPSHUFHWZri, X86::VPSHUFHWZmi, 0 },
- { X86::VPSHUFLWZri, X86::VPSHUFLWZmi, 0 },
- { X86::VPSLLDQZ512rr, X86::VPSLLDQZ512rm, 0 },
- { X86::VPSLLDZri, X86::VPSLLDZmi, 0 },
- { X86::VPSLLQZri, X86::VPSLLQZmi, 0 },
- { X86::VPSLLWZri, X86::VPSLLWZmi, 0 },
- { X86::VPSRADZri, X86::VPSRADZmi, 0 },
- { X86::VPSRAQZri, X86::VPSRAQZmi, 0 },
- { X86::VPSRAWZri, X86::VPSRAWZmi, 0 },
- { X86::VPSRLDQZ512rr, X86::VPSRLDQZ512rm, 0 },
- { X86::VPSRLDZri, X86::VPSRLDZmi, 0 },
- { X86::VPSRLQZri, X86::VPSRLQZmi, 0 },
- { X86::VPSRLWZri, X86::VPSRLWZmi, 0 },
-
- // AVX-512 foldable instructions (256-bit versions)
- { X86::VBROADCASTSSZ256r, X86::VBROADCASTSSZ256m, TB_NO_REVERSE },
- { X86::VBROADCASTSDZ256r, X86::VBROADCASTSDZ256m, TB_NO_REVERSE },
- { X86::VMOVAPDZ256rr, X86::VMOVAPDZ256rm, TB_ALIGN_32 },
- { X86::VMOVAPSZ256rr, X86::VMOVAPSZ256rm, TB_ALIGN_32 },
- { X86::VMOVDQA32Z256rr, X86::VMOVDQA32Z256rm, TB_ALIGN_32 },
- { X86::VMOVDQA64Z256rr, X86::VMOVDQA64Z256rm, TB_ALIGN_32 },
- { X86::VMOVDQU8Z256rr, X86::VMOVDQU8Z256rm, 0 },
- { X86::VMOVDQU16Z256rr, X86::VMOVDQU16Z256rm, 0 },
- { X86::VMOVDQU32Z256rr, X86::VMOVDQU32Z256rm, 0 },
- { X86::VMOVDQU64Z256rr, X86::VMOVDQU64Z256rm, 0 },
- { X86::VMOVUPDZ256rr, X86::VMOVUPDZ256rm, 0 },
- { X86::VMOVUPSZ256rr, X86::VMOVUPSZ256rm, 0 },
- { X86::VPABSBZ256rr, X86::VPABSBZ256rm, 0 },
- { X86::VPABSDZ256rr, X86::VPABSDZ256rm, 0 },
- { X86::VPABSQZ256rr, X86::VPABSQZ256rm, 0 },
- { X86::VPABSWZ256rr, X86::VPABSWZ256rm, 0 },
- { X86::VPERMILPDZ256ri, X86::VPERMILPDZ256mi, 0 },
- { X86::VPERMILPSZ256ri, X86::VPERMILPSZ256mi, 0 },
- { X86::VPERMPDZ256ri, X86::VPERMPDZ256mi, 0 },
- { X86::VPERMQZ256ri, X86::VPERMQZ256mi, 0 },
- { X86::VPMOVSXBDZ256rr, X86::VPMOVSXBDZ256rm, TB_NO_REVERSE },
- { X86::VPMOVSXBQZ256rr, X86::VPMOVSXBQZ256rm, TB_NO_REVERSE },
- { X86::VPMOVSXBWZ256rr, X86::VPMOVSXBWZ256rm, 0 },
- { X86::VPMOVSXDQZ256rr, X86::VPMOVSXDQZ256rm, 0 },
- { X86::VPMOVSXWDZ256rr, X86::VPMOVSXWDZ256rm, 0 },
- { X86::VPMOVSXWQZ256rr, X86::VPMOVSXWQZ256rm, TB_NO_REVERSE },
- { X86::VPMOVZXBDZ256rr, X86::VPMOVZXBDZ256rm, TB_NO_REVERSE },
- { X86::VPMOVZXBQZ256rr, X86::VPMOVZXBQZ256rm, TB_NO_REVERSE },
- { X86::VPMOVZXBWZ256rr, X86::VPMOVZXBWZ256rm, 0 },
- { X86::VPMOVZXDQZ256rr, X86::VPMOVZXDQZ256rm, 0 },
- { X86::VPMOVZXWDZ256rr, X86::VPMOVZXWDZ256rm, 0 },
- { X86::VPMOVZXWQZ256rr, X86::VPMOVZXWQZ256rm, TB_NO_REVERSE },
- { X86::VPSHUFDZ256ri, X86::VPSHUFDZ256mi, 0 },
- { X86::VPSHUFHWZ256ri, X86::VPSHUFHWZ256mi, 0 },
- { X86::VPSHUFLWZ256ri, X86::VPSHUFLWZ256mi, 0 },
- { X86::VPSLLDQZ256rr, X86::VPSLLDQZ256rm, 0 },
- { X86::VPSLLDZ256ri, X86::VPSLLDZ256mi, 0 },
- { X86::VPSLLQZ256ri, X86::VPSLLQZ256mi, 0 },
- { X86::VPSLLWZ256ri, X86::VPSLLWZ256mi, 0 },
- { X86::VPSRADZ256ri, X86::VPSRADZ256mi, 0 },
- { X86::VPSRAQZ256ri, X86::VPSRAQZ256mi, 0 },
- { X86::VPSRAWZ256ri, X86::VPSRAWZ256mi, 0 },
- { X86::VPSRLDQZ256rr, X86::VPSRLDQZ256rm, 0 },
- { X86::VPSRLDZ256ri, X86::VPSRLDZ256mi, 0 },
- { X86::VPSRLQZ256ri, X86::VPSRLQZ256mi, 0 },
- { X86::VPSRLWZ256ri, X86::VPSRLWZ256mi, 0 },
-
- // AVX-512 foldable instructions (128-bit versions)
- { X86::VBROADCASTSSZ128r, X86::VBROADCASTSSZ128m, TB_NO_REVERSE },
- { X86::VMOVAPDZ128rr, X86::VMOVAPDZ128rm, TB_ALIGN_16 },
- { X86::VMOVAPSZ128rr, X86::VMOVAPSZ128rm, TB_ALIGN_16 },
- { X86::VMOVDQA32Z128rr, X86::VMOVDQA32Z128rm, TB_ALIGN_16 },
- { X86::VMOVDQA64Z128rr, X86::VMOVDQA64Z128rm, TB_ALIGN_16 },
- { X86::VMOVDQU8Z128rr, X86::VMOVDQU8Z128rm, 0 },
- { X86::VMOVDQU16Z128rr, X86::VMOVDQU16Z128rm, 0 },
- { X86::VMOVDQU32Z128rr, X86::VMOVDQU32Z128rm, 0 },
- { X86::VMOVDQU64Z128rr, X86::VMOVDQU64Z128rm, 0 },
- { X86::VMOVUPDZ128rr, X86::VMOVUPDZ128rm, 0 },
- { X86::VMOVUPSZ128rr, X86::VMOVUPSZ128rm, 0 },
- { X86::VPABSBZ128rr, X86::VPABSBZ128rm, 0 },
- { X86::VPABSDZ128rr, X86::VPABSDZ128rm, 0 },
- { X86::VPABSQZ128rr, X86::VPABSQZ128rm, 0 },
- { X86::VPABSWZ128rr, X86::VPABSWZ128rm, 0 },
- { X86::VPERMILPDZ128ri, X86::VPERMILPDZ128mi, 0 },
- { X86::VPERMILPSZ128ri, X86::VPERMILPSZ128mi, 0 },
- { X86::VPMOVSXBDZ128rr, X86::VPMOVSXBDZ128rm, TB_NO_REVERSE },
- { X86::VPMOVSXBQZ128rr, X86::VPMOVSXBQZ128rm, TB_NO_REVERSE },
- { X86::VPMOVSXBWZ128rr, X86::VPMOVSXBWZ128rm, TB_NO_REVERSE },
- { X86::VPMOVSXDQZ128rr, X86::VPMOVSXDQZ128rm, TB_NO_REVERSE },
- { X86::VPMOVSXWDZ128rr, X86::VPMOVSXWDZ128rm, TB_NO_REVERSE },
- { X86::VPMOVSXWQZ128rr, X86::VPMOVSXWQZ128rm, TB_NO_REVERSE },
- { X86::VPMOVZXBDZ128rr, X86::VPMOVZXBDZ128rm, TB_NO_REVERSE },
- { X86::VPMOVZXBQZ128rr, X86::VPMOVZXBQZ128rm, TB_NO_REVERSE },
- { X86::VPMOVZXBWZ128rr, X86::VPMOVZXBWZ128rm, TB_NO_REVERSE },
- { X86::VPMOVZXDQZ128rr, X86::VPMOVZXDQZ128rm, TB_NO_REVERSE },
- { X86::VPMOVZXWDZ128rr, X86::VPMOVZXWDZ128rm, TB_NO_REVERSE },
- { X86::VPMOVZXWQZ128rr, X86::VPMOVZXWQZ128rm, TB_NO_REVERSE },
- { X86::VPSHUFDZ128ri, X86::VPSHUFDZ128mi, 0 },
- { X86::VPSHUFHWZ128ri, X86::VPSHUFHWZ128mi, 0 },
- { X86::VPSHUFLWZ128ri, X86::VPSHUFLWZ128mi, 0 },
- { X86::VPSLLDQZ128rr, X86::VPSLLDQZ128rm, 0 },
- { X86::VPSLLDZ128ri, X86::VPSLLDZ128mi, 0 },
- { X86::VPSLLQZ128ri, X86::VPSLLQZ128mi, 0 },
- { X86::VPSLLWZ128ri, X86::VPSLLWZ128mi, 0 },
- { X86::VPSRADZ128ri, X86::VPSRADZ128mi, 0 },
- { X86::VPSRAQZ128ri, X86::VPSRAQZ128mi, 0 },
- { X86::VPSRAWZ128ri, X86::VPSRAWZ128mi, 0 },
- { X86::VPSRLDQZ128rr, X86::VPSRLDQZ128rm, 0 },
- { X86::VPSRLDZ128ri, X86::VPSRLDZ128mi, 0 },
- { X86::VPSRLQZ128ri, X86::VPSRLQZ128mi, 0 },
- { X86::VPSRLWZ128ri, X86::VPSRLWZ128mi, 0 },
-
- // F16C foldable instructions
- { X86::VCVTPH2PSrr, X86::VCVTPH2PSrm, 0 },
- { X86::VCVTPH2PSYrr, X86::VCVTPH2PSYrm, 0 },
-
- // AES foldable instructions
- { X86::AESIMCrr, X86::AESIMCrm, TB_ALIGN_16 },
- { X86::AESKEYGENASSIST128rr, X86::AESKEYGENASSIST128rm, TB_ALIGN_16 },
- { X86::VAESIMCrr, X86::VAESIMCrm, 0 },
- { X86::VAESKEYGENASSIST128rr, X86::VAESKEYGENASSIST128rm, 0 }
- };
-
for (X86MemoryFoldTableEntry Entry : MemoryFoldTable1) {
AddTableEntry(RegOp2MemOpTable1, MemOp2RegOpTable,
Entry.RegOp, Entry.MemOp,
@@ -1040,1394 +143,6 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
Entry.Flags | TB_INDEX_1 | TB_FOLDED_LOAD);
}
- static const X86MemoryFoldTableEntry MemoryFoldTable2[] = {
- { X86::ADC32rr, X86::ADC32rm, 0 },
- { X86::ADC64rr, X86::ADC64rm, 0 },
- { X86::ADD16rr, X86::ADD16rm, 0 },
- { X86::ADD16rr_DB, X86::ADD16rm, TB_NO_REVERSE },
- { X86::ADD32rr, X86::ADD32rm, 0 },
- { X86::ADD32rr_DB, X86::ADD32rm, TB_NO_REVERSE },
- { X86::ADD64rr, X86::ADD64rm, 0 },
- { X86::ADD64rr_DB, X86::ADD64rm, TB_NO_REVERSE },
- { X86::ADD8rr, X86::ADD8rm, 0 },
- { X86::ADDPDrr, X86::ADDPDrm, TB_ALIGN_16 },
- { X86::ADDPSrr, X86::ADDPSrm, TB_ALIGN_16 },
- { X86::ADDSDrr, X86::ADDSDrm, 0 },
- { X86::ADDSDrr_Int, X86::ADDSDrm_Int, TB_NO_REVERSE },
- { X86::ADDSSrr, X86::ADDSSrm, 0 },
- { X86::ADDSSrr_Int, X86::ADDSSrm_Int, TB_NO_REVERSE },
- { X86::ADDSUBPDrr, X86::ADDSUBPDrm, TB_ALIGN_16 },
- { X86::ADDSUBPSrr, X86::ADDSUBPSrm, TB_ALIGN_16 },
- { X86::AND16rr, X86::AND16rm, 0 },
- { X86::AND32rr, X86::AND32rm, 0 },
- { X86::AND64rr, X86::AND64rm, 0 },
- { X86::AND8rr, X86::AND8rm, 0 },
- { X86::ANDNPDrr, X86::ANDNPDrm, TB_ALIGN_16 },
- { X86::ANDNPSrr, X86::ANDNPSrm, TB_ALIGN_16 },
- { X86::ANDPDrr, X86::ANDPDrm, TB_ALIGN_16 },
- { X86::ANDPSrr, X86::ANDPSrm, TB_ALIGN_16 },
- { X86::BLENDPDrri, X86::BLENDPDrmi, TB_ALIGN_16 },
- { X86::BLENDPSrri, X86::BLENDPSrmi, TB_ALIGN_16 },
- { X86::BLENDVPDrr0, X86::BLENDVPDrm0, TB_ALIGN_16 },
- { X86::BLENDVPSrr0, X86::BLENDVPSrm0, TB_ALIGN_16 },
- { X86::CMOVA16rr, X86::CMOVA16rm, 0 },
- { X86::CMOVA32rr, X86::CMOVA32rm, 0 },
- { X86::CMOVA64rr, X86::CMOVA64rm, 0 },
- { X86::CMOVAE16rr, X86::CMOVAE16rm, 0 },
- { X86::CMOVAE32rr, X86::CMOVAE32rm, 0 },
- { X86::CMOVAE64rr, X86::CMOVAE64rm, 0 },
- { X86::CMOVB16rr, X86::CMOVB16rm, 0 },
- { X86::CMOVB32rr, X86::CMOVB32rm, 0 },
- { X86::CMOVB64rr, X86::CMOVB64rm, 0 },
- { X86::CMOVBE16rr, X86::CMOVBE16rm, 0 },
- { X86::CMOVBE32rr, X86::CMOVBE32rm, 0 },
- { X86::CMOVBE64rr, X86::CMOVBE64rm, 0 },
- { X86::CMOVE16rr, X86::CMOVE16rm, 0 },
- { X86::CMOVE32rr, X86::CMOVE32rm, 0 },
- { X86::CMOVE64rr, X86::CMOVE64rm, 0 },
- { X86::CMOVG16rr, X86::CMOVG16rm, 0 },
- { X86::CMOVG32rr, X86::CMOVG32rm, 0 },
- { X86::CMOVG64rr, X86::CMOVG64rm, 0 },
- { X86::CMOVGE16rr, X86::CMOVGE16rm, 0 },
- { X86::CMOVGE32rr, X86::CMOVGE32rm, 0 },
- { X86::CMOVGE64rr, X86::CMOVGE64rm, 0 },
- { X86::CMOVL16rr, X86::CMOVL16rm, 0 },
- { X86::CMOVL32rr, X86::CMOVL32rm, 0 },
- { X86::CMOVL64rr, X86::CMOVL64rm, 0 },
- { X86::CMOVLE16rr, X86::CMOVLE16rm, 0 },
- { X86::CMOVLE32rr, X86::CMOVLE32rm, 0 },
- { X86::CMOVLE64rr, X86::CMOVLE64rm, 0 },
- { X86::CMOVNE16rr, X86::CMOVNE16rm, 0 },
- { X86::CMOVNE32rr, X86::CMOVNE32rm, 0 },
- { X86::CMOVNE64rr, X86::CMOVNE64rm, 0 },
- { X86::CMOVNO16rr, X86::CMOVNO16rm, 0 },
- { X86::CMOVNO32rr, X86::CMOVNO32rm, 0 },
- { X86::CMOVNO64rr, X86::CMOVNO64rm, 0 },
- { X86::CMOVNP16rr, X86::CMOVNP16rm, 0 },
- { X86::CMOVNP32rr, X86::CMOVNP32rm, 0 },
- { X86::CMOVNP64rr, X86::CMOVNP64rm, 0 },
- { X86::CMOVNS16rr, X86::CMOVNS16rm, 0 },
- { X86::CMOVNS32rr, X86::CMOVNS32rm, 0 },
- { X86::CMOVNS64rr, X86::CMOVNS64rm, 0 },
- { X86::CMOVO16rr, X86::CMOVO16rm, 0 },
- { X86::CMOVO32rr, X86::CMOVO32rm, 0 },
- { X86::CMOVO64rr, X86::CMOVO64rm, 0 },
- { X86::CMOVP16rr, X86::CMOVP16rm, 0 },
- { X86::CMOVP32rr, X86::CMOVP32rm, 0 },
- { X86::CMOVP64rr, X86::CMOVP64rm, 0 },
- { X86::CMOVS16rr, X86::CMOVS16rm, 0 },
- { X86::CMOVS32rr, X86::CMOVS32rm, 0 },
- { X86::CMOVS64rr, X86::CMOVS64rm, 0 },
- { X86::CMPPDrri, X86::CMPPDrmi, TB_ALIGN_16 },
- { X86::CMPPSrri, X86::CMPPSrmi, TB_ALIGN_16 },
- { X86::CMPSDrr, X86::CMPSDrm, 0 },
- { X86::CMPSSrr, X86::CMPSSrm, 0 },
- { X86::CRC32r32r32, X86::CRC32r32m32, 0 },
- { X86::CRC32r64r64, X86::CRC32r64m64, 0 },
- { X86::DIVPDrr, X86::DIVPDrm, TB_ALIGN_16 },
- { X86::DIVPSrr, X86::DIVPSrm, TB_ALIGN_16 },
- { X86::DIVSDrr, X86::DIVSDrm, 0 },
- { X86::DIVSDrr_Int, X86::DIVSDrm_Int, TB_NO_REVERSE },
- { X86::DIVSSrr, X86::DIVSSrm, 0 },
- { X86::DIVSSrr_Int, X86::DIVSSrm_Int, TB_NO_REVERSE },
- { X86::DPPDrri, X86::DPPDrmi, TB_ALIGN_16 },
- { X86::DPPSrri, X86::DPPSrmi, TB_ALIGN_16 },
- { X86::HADDPDrr, X86::HADDPDrm, TB_ALIGN_16 },
- { X86::HADDPSrr, X86::HADDPSrm, TB_ALIGN_16 },
- { X86::HSUBPDrr, X86::HSUBPDrm, TB_ALIGN_16 },
- { X86::HSUBPSrr, X86::HSUBPSrm, TB_ALIGN_16 },
- { X86::IMUL16rr, X86::IMUL16rm, 0 },
- { X86::IMUL32rr, X86::IMUL32rm, 0 },
- { X86::IMUL64rr, X86::IMUL64rm, 0 },
- { X86::Int_CMPSDrr, X86::Int_CMPSDrm, TB_NO_REVERSE },
- { X86::Int_CMPSSrr, X86::Int_CMPSSrm, TB_NO_REVERSE },
- { X86::Int_CVTSD2SSrr, X86::Int_CVTSD2SSrm, TB_NO_REVERSE },
- { X86::Int_CVTSI2SD64rr,X86::Int_CVTSI2SD64rm, 0 },
- { X86::Int_CVTSI2SDrr, X86::Int_CVTSI2SDrm, 0 },
- { X86::Int_CVTSI2SS64rr,X86::Int_CVTSI2SS64rm, 0 },
- { X86::Int_CVTSI2SSrr, X86::Int_CVTSI2SSrm, 0 },
- { X86::Int_CVTSS2SDrr, X86::Int_CVTSS2SDrm, TB_NO_REVERSE },
- { X86::MAXPDrr, X86::MAXPDrm, TB_ALIGN_16 },
- { X86::MAXCPDrr, X86::MAXCPDrm, TB_ALIGN_16 },
- { X86::MAXPSrr, X86::MAXPSrm, TB_ALIGN_16 },
- { X86::MAXCPSrr, X86::MAXCPSrm, TB_ALIGN_16 },
- { X86::MAXSDrr, X86::MAXSDrm, 0 },
- { X86::MAXCSDrr, X86::MAXCSDrm, 0 },
- { X86::MAXSDrr_Int, X86::MAXSDrm_Int, TB_NO_REVERSE },
- { X86::MAXSSrr, X86::MAXSSrm, 0 },
- { X86::MAXCSSrr, X86::MAXCSSrm, 0 },
- { X86::MAXSSrr_Int, X86::MAXSSrm_Int, TB_NO_REVERSE },
- { X86::MINPDrr, X86::MINPDrm, TB_ALIGN_16 },
- { X86::MINCPDrr, X86::MINCPDrm, TB_ALIGN_16 },
- { X86::MINPSrr, X86::MINPSrm, TB_ALIGN_16 },
- { X86::MINCPSrr, X86::MINCPSrm, TB_ALIGN_16 },
- { X86::MINSDrr, X86::MINSDrm, 0 },
- { X86::MINCSDrr, X86::MINCSDrm, 0 },
- { X86::MINSDrr_Int, X86::MINSDrm_Int, TB_NO_REVERSE },
- { X86::MINSSrr, X86::MINSSrm, 0 },
- { X86::MINCSSrr, X86::MINCSSrm, 0 },
- { X86::MINSSrr_Int, X86::MINSSrm_Int, TB_NO_REVERSE },
- { X86::MOVLHPSrr, X86::MOVHPSrm, TB_NO_REVERSE },
- { X86::MPSADBWrri, X86::MPSADBWrmi, TB_ALIGN_16 },
- { X86::MULPDrr, X86::MULPDrm, TB_ALIGN_16 },
- { X86::MULPSrr, X86::MULPSrm, TB_ALIGN_16 },
- { X86::MULSDrr, X86::MULSDrm, 0 },
- { X86::MULSDrr_Int, X86::MULSDrm_Int, TB_NO_REVERSE },
- { X86::MULSSrr, X86::MULSSrm, 0 },
- { X86::MULSSrr_Int, X86::MULSSrm_Int, TB_NO_REVERSE },
- { X86::OR16rr, X86::OR16rm, 0 },
- { X86::OR32rr, X86::OR32rm, 0 },
- { X86::OR64rr, X86::OR64rm, 0 },
- { X86::OR8rr, X86::OR8rm, 0 },
- { X86::ORPDrr, X86::ORPDrm, TB_ALIGN_16 },
- { X86::ORPSrr, X86::ORPSrm, TB_ALIGN_16 },
- { X86::PACKSSDWrr, X86::PACKSSDWrm, TB_ALIGN_16 },
- { X86::PACKSSWBrr, X86::PACKSSWBrm, TB_ALIGN_16 },
- { X86::PACKUSDWrr, X86::PACKUSDWrm, TB_ALIGN_16 },
- { X86::PACKUSWBrr, X86::PACKUSWBrm, TB_ALIGN_16 },
- { X86::PADDBrr, X86::PADDBrm, TB_ALIGN_16 },
- { X86::PADDDrr, X86::PADDDrm, TB_ALIGN_16 },
- { X86::PADDQrr, X86::PADDQrm, TB_ALIGN_16 },
- { X86::PADDSBrr, X86::PADDSBrm, TB_ALIGN_16 },
- { X86::PADDSWrr, X86::PADDSWrm, TB_ALIGN_16 },
- { X86::PADDUSBrr, X86::PADDUSBrm, TB_ALIGN_16 },
- { X86::PADDUSWrr, X86::PADDUSWrm, TB_ALIGN_16 },
- { X86::PADDWrr, X86::PADDWrm, TB_ALIGN_16 },
- { X86::PALIGNRrri, X86::PALIGNRrmi, TB_ALIGN_16 },
- { X86::PANDNrr, X86::PANDNrm, TB_ALIGN_16 },
- { X86::PANDrr, X86::PANDrm, TB_ALIGN_16 },
- { X86::PAVGBrr, X86::PAVGBrm, TB_ALIGN_16 },
- { X86::PAVGWrr, X86::PAVGWrm, TB_ALIGN_16 },
- { X86::PBLENDVBrr0, X86::PBLENDVBrm0, TB_ALIGN_16 },
- { X86::PBLENDWrri, X86::PBLENDWrmi, TB_ALIGN_16 },
- { X86::PCLMULQDQrr, X86::PCLMULQDQrm, TB_ALIGN_16 },
- { X86::PCMPEQBrr, X86::PCMPEQBrm, TB_ALIGN_16 },
- { X86::PCMPEQDrr, X86::PCMPEQDrm, TB_ALIGN_16 },
- { X86::PCMPEQQrr, X86::PCMPEQQrm, TB_ALIGN_16 },
- { X86::PCMPEQWrr, X86::PCMPEQWrm, TB_ALIGN_16 },
- { X86::PCMPGTBrr, X86::PCMPGTBrm, TB_ALIGN_16 },
- { X86::PCMPGTDrr, X86::PCMPGTDrm, TB_ALIGN_16 },
- { X86::PCMPGTQrr, X86::PCMPGTQrm, TB_ALIGN_16 },
- { X86::PCMPGTWrr, X86::PCMPGTWrm, TB_ALIGN_16 },
- { X86::PHADDDrr, X86::PHADDDrm, TB_ALIGN_16 },
- { X86::PHADDWrr, X86::PHADDWrm, TB_ALIGN_16 },
- { X86::PHADDSWrr128, X86::PHADDSWrm128, TB_ALIGN_16 },
- { X86::PHSUBDrr, X86::PHSUBDrm, TB_ALIGN_16 },
- { X86::PHSUBSWrr128, X86::PHSUBSWrm128, TB_ALIGN_16 },
- { X86::PHSUBWrr, X86::PHSUBWrm, TB_ALIGN_16 },
- { X86::PINSRBrr, X86::PINSRBrm, 0 },
- { X86::PINSRDrr, X86::PINSRDrm, 0 },
- { X86::PINSRQrr, X86::PINSRQrm, 0 },
- { X86::PINSRWrri, X86::PINSRWrmi, 0 },
- { X86::PMADDUBSWrr, X86::PMADDUBSWrm, TB_ALIGN_16 },
- { X86::PMADDWDrr, X86::PMADDWDrm, TB_ALIGN_16 },
- { X86::PMAXSBrr, X86::PMAXSBrm, TB_ALIGN_16 },
- { X86::PMAXSDrr, X86::PMAXSDrm, TB_ALIGN_16 },
- { X86::PMAXSWrr, X86::PMAXSWrm, TB_ALIGN_16 },
- { X86::PMAXUBrr, X86::PMAXUBrm, TB_ALIGN_16 },
- { X86::PMAXUDrr, X86::PMAXUDrm, TB_ALIGN_16 },
- { X86::PMAXUWrr, X86::PMAXUWrm, TB_ALIGN_16 },
- { X86::PMINSBrr, X86::PMINSBrm, TB_ALIGN_16 },
- { X86::PMINSDrr, X86::PMINSDrm, TB_ALIGN_16 },
- { X86::PMINSWrr, X86::PMINSWrm, TB_ALIGN_16 },
- { X86::PMINUBrr, X86::PMINUBrm, TB_ALIGN_16 },
- { X86::PMINUDrr, X86::PMINUDrm, TB_ALIGN_16 },
- { X86::PMINUWrr, X86::PMINUWrm, TB_ALIGN_16 },
- { X86::PMULDQrr, X86::PMULDQrm, TB_ALIGN_16 },
- { X86::PMULHRSWrr, X86::PMULHRSWrm, TB_ALIGN_16 },
- { X86::PMULHUWrr, X86::PMULHUWrm, TB_ALIGN_16 },
- { X86::PMULHWrr, X86::PMULHWrm, TB_ALIGN_16 },
- { X86::PMULLDrr, X86::PMULLDrm, TB_ALIGN_16 },
- { X86::PMULLWrr, X86::PMULLWrm, TB_ALIGN_16 },
- { X86::PMULUDQrr, X86::PMULUDQrm, TB_ALIGN_16 },
- { X86::PORrr, X86::PORrm, TB_ALIGN_16 },
- { X86::PSADBWrr, X86::PSADBWrm, TB_ALIGN_16 },
- { X86::PSHUFBrr, X86::PSHUFBrm, TB_ALIGN_16 },
- { X86::PSIGNBrr128, X86::PSIGNBrm128, TB_ALIGN_16 },
- { X86::PSIGNWrr128, X86::PSIGNWrm128, TB_ALIGN_16 },
- { X86::PSIGNDrr128, X86::PSIGNDrm128, TB_ALIGN_16 },
- { X86::PSLLDrr, X86::PSLLDrm, TB_ALIGN_16 },
- { X86::PSLLQrr, X86::PSLLQrm, TB_ALIGN_16 },
- { X86::PSLLWrr, X86::PSLLWrm, TB_ALIGN_16 },
- { X86::PSRADrr, X86::PSRADrm, TB_ALIGN_16 },
- { X86::PSRAWrr, X86::PSRAWrm, TB_ALIGN_16 },
- { X86::PSRLDrr, X86::PSRLDrm, TB_ALIGN_16 },
- { X86::PSRLQrr, X86::PSRLQrm, TB_ALIGN_16 },
- { X86::PSRLWrr, X86::PSRLWrm, TB_ALIGN_16 },
- { X86::PSUBBrr, X86::PSUBBrm, TB_ALIGN_16 },
- { X86::PSUBDrr, X86::PSUBDrm, TB_ALIGN_16 },
- { X86::PSUBQrr, X86::PSUBQrm, TB_ALIGN_16 },
- { X86::PSUBSBrr, X86::PSUBSBrm, TB_ALIGN_16 },
- { X86::PSUBSWrr, X86::PSUBSWrm, TB_ALIGN_16 },
- { X86::PSUBUSBrr, X86::PSUBUSBrm, TB_ALIGN_16 },
- { X86::PSUBUSWrr, X86::PSUBUSWrm, TB_ALIGN_16 },
- { X86::PSUBWrr, X86::PSUBWrm, TB_ALIGN_16 },
- { X86::PUNPCKHBWrr, X86::PUNPCKHBWrm, TB_ALIGN_16 },
- { X86::PUNPCKHDQrr, X86::PUNPCKHDQrm, TB_ALIGN_16 },
- { X86::PUNPCKHQDQrr, X86::PUNPCKHQDQrm, TB_ALIGN_16 },
- { X86::PUNPCKHWDrr, X86::PUNPCKHWDrm, TB_ALIGN_16 },
- { X86::PUNPCKLBWrr, X86::PUNPCKLBWrm, TB_ALIGN_16 },
- { X86::PUNPCKLDQrr, X86::PUNPCKLDQrm, TB_ALIGN_16 },
- { X86::PUNPCKLQDQrr, X86::PUNPCKLQDQrm, TB_ALIGN_16 },
- { X86::PUNPCKLWDrr, X86::PUNPCKLWDrm, TB_ALIGN_16 },
- { X86::PXORrr, X86::PXORrm, TB_ALIGN_16 },
- { X86::ROUNDSDr_Int, X86::ROUNDSDm_Int, TB_NO_REVERSE },
- { X86::ROUNDSSr_Int, X86::ROUNDSSm_Int, TB_NO_REVERSE },
- { X86::SBB32rr, X86::SBB32rm, 0 },
- { X86::SBB64rr, X86::SBB64rm, 0 },
- { X86::SHUFPDrri, X86::SHUFPDrmi, TB_ALIGN_16 },
- { X86::SHUFPSrri, X86::SHUFPSrmi, TB_ALIGN_16 },
- { X86::SUB16rr, X86::SUB16rm, 0 },
- { X86::SUB32rr, X86::SUB32rm, 0 },
- { X86::SUB64rr, X86::SUB64rm, 0 },
- { X86::SUB8rr, X86::SUB8rm, 0 },
- { X86::SUBPDrr, X86::SUBPDrm, TB_ALIGN_16 },
- { X86::SUBPSrr, X86::SUBPSrm, TB_ALIGN_16 },
- { X86::SUBSDrr, X86::SUBSDrm, 0 },
- { X86::SUBSDrr_Int, X86::SUBSDrm_Int, TB_NO_REVERSE },
- { X86::SUBSSrr, X86::SUBSSrm, 0 },
- { X86::SUBSSrr_Int, X86::SUBSSrm_Int, TB_NO_REVERSE },
- // FIXME: TEST*rr -> swapped operand of TEST*mr.
- { X86::UNPCKHPDrr, X86::UNPCKHPDrm, TB_ALIGN_16 },
- { X86::UNPCKHPSrr, X86::UNPCKHPSrm, TB_ALIGN_16 },
- { X86::UNPCKLPDrr, X86::UNPCKLPDrm, TB_ALIGN_16 },
- { X86::UNPCKLPSrr, X86::UNPCKLPSrm, TB_ALIGN_16 },
- { X86::XOR16rr, X86::XOR16rm, 0 },
- { X86::XOR32rr, X86::XOR32rm, 0 },
- { X86::XOR64rr, X86::XOR64rm, 0 },
- { X86::XOR8rr, X86::XOR8rm, 0 },
- { X86::XORPDrr, X86::XORPDrm, TB_ALIGN_16 },
- { X86::XORPSrr, X86::XORPSrm, TB_ALIGN_16 },
-
- // MMX version of foldable instructions
- { X86::MMX_CVTPI2PSirr, X86::MMX_CVTPI2PSirm, 0 },
- { X86::MMX_PACKSSDWirr, X86::MMX_PACKSSDWirm, 0 },
- { X86::MMX_PACKSSWBirr, X86::MMX_PACKSSWBirm, 0 },
- { X86::MMX_PACKUSWBirr, X86::MMX_PACKUSWBirm, 0 },
- { X86::MMX_PADDBirr, X86::MMX_PADDBirm, 0 },
- { X86::MMX_PADDDirr, X86::MMX_PADDDirm, 0 },
- { X86::MMX_PADDQirr, X86::MMX_PADDQirm, 0 },
- { X86::MMX_PADDSBirr, X86::MMX_PADDSBirm, 0 },
- { X86::MMX_PADDSWirr, X86::MMX_PADDSWirm, 0 },
- { X86::MMX_PADDUSBirr, X86::MMX_PADDUSBirm, 0 },
- { X86::MMX_PADDUSWirr, X86::MMX_PADDUSWirm, 0 },
- { X86::MMX_PADDWirr, X86::MMX_PADDWirm, 0 },
- { X86::MMX_PALIGNR64irr, X86::MMX_PALIGNR64irm, 0 },
- { X86::MMX_PANDNirr, X86::MMX_PANDNirm, 0 },
- { X86::MMX_PANDirr, X86::MMX_PANDirm, 0 },
- { X86::MMX_PAVGBirr, X86::MMX_PAVGBirm, 0 },
- { X86::MMX_PAVGWirr, X86::MMX_PAVGWirm, 0 },
- { X86::MMX_PCMPEQBirr, X86::MMX_PCMPEQBirm, 0 },
- { X86::MMX_PCMPEQDirr, X86::MMX_PCMPEQDirm, 0 },
- { X86::MMX_PCMPEQWirr, X86::MMX_PCMPEQWirm, 0 },
- { X86::MMX_PCMPGTBirr, X86::MMX_PCMPGTBirm, 0 },
- { X86::MMX_PCMPGTDirr, X86::MMX_PCMPGTDirm, 0 },
- { X86::MMX_PCMPGTWirr, X86::MMX_PCMPGTWirm, 0 },
- { X86::MMX_PHADDSWrr64, X86::MMX_PHADDSWrm64, 0 },
- { X86::MMX_PHADDWrr64, X86::MMX_PHADDWrm64, 0 },
- { X86::MMX_PHADDrr64, X86::MMX_PHADDrm64, 0 },
- { X86::MMX_PHSUBDrr64, X86::MMX_PHSUBDrm64, 0 },
- { X86::MMX_PHSUBSWrr64, X86::MMX_PHSUBSWrm64, 0 },
- { X86::MMX_PHSUBWrr64, X86::MMX_PHSUBWrm64, 0 },
- { X86::MMX_PINSRWirri, X86::MMX_PINSRWirmi, 0 },
- { X86::MMX_PMADDUBSWrr64, X86::MMX_PMADDUBSWrm64, 0 },
- { X86::MMX_PMADDWDirr, X86::MMX_PMADDWDirm, 0 },
- { X86::MMX_PMAXSWirr, X86::MMX_PMAXSWirm, 0 },
- { X86::MMX_PMAXUBirr, X86::MMX_PMAXUBirm, 0 },
- { X86::MMX_PMINSWirr, X86::MMX_PMINSWirm, 0 },
- { X86::MMX_PMINUBirr, X86::MMX_PMINUBirm, 0 },
- { X86::MMX_PMULHRSWrr64, X86::MMX_PMULHRSWrm64, 0 },
- { X86::MMX_PMULHUWirr, X86::MMX_PMULHUWirm, 0 },
- { X86::MMX_PMULHWirr, X86::MMX_PMULHWirm, 0 },
- { X86::MMX_PMULLWirr, X86::MMX_PMULLWirm, 0 },
- { X86::MMX_PMULUDQirr, X86::MMX_PMULUDQirm, 0 },
- { X86::MMX_PORirr, X86::MMX_PORirm, 0 },
- { X86::MMX_PSADBWirr, X86::MMX_PSADBWirm, 0 },
- { X86::MMX_PSHUFBrr64, X86::MMX_PSHUFBrm64, 0 },
- { X86::MMX_PSIGNBrr64, X86::MMX_PSIGNBrm64, 0 },
- { X86::MMX_PSIGNDrr64, X86::MMX_PSIGNDrm64, 0 },
- { X86::MMX_PSIGNWrr64, X86::MMX_PSIGNWrm64, 0 },
- { X86::MMX_PSLLDrr, X86::MMX_PSLLDrm, 0 },
- { X86::MMX_PSLLQrr, X86::MMX_PSLLQrm, 0 },
- { X86::MMX_PSLLWrr, X86::MMX_PSLLWrm, 0 },
- { X86::MMX_PSRADrr, X86::MMX_PSRADrm, 0 },
- { X86::MMX_PSRAWrr, X86::MMX_PSRAWrm, 0 },
- { X86::MMX_PSRLDrr, X86::MMX_PSRLDrm, 0 },
- { X86::MMX_PSRLQrr, X86::MMX_PSRLQrm, 0 },
- { X86::MMX_PSRLWrr, X86::MMX_PSRLWrm, 0 },
- { X86::MMX_PSUBBirr, X86::MMX_PSUBBirm, 0 },
- { X86::MMX_PSUBDirr, X86::MMX_PSUBDirm, 0 },
- { X86::MMX_PSUBQirr, X86::MMX_PSUBQirm, 0 },
- { X86::MMX_PSUBSBirr, X86::MMX_PSUBSBirm, 0 },
- { X86::MMX_PSUBSWirr, X86::MMX_PSUBSWirm, 0 },
- { X86::MMX_PSUBUSBirr, X86::MMX_PSUBUSBirm, 0 },
- { X86::MMX_PSUBUSWirr, X86::MMX_PSUBUSWirm, 0 },
- { X86::MMX_PSUBWirr, X86::MMX_PSUBWirm, 0 },
- { X86::MMX_PUNPCKHBWirr, X86::MMX_PUNPCKHBWirm, 0 },
- { X86::MMX_PUNPCKHDQirr, X86::MMX_PUNPCKHDQirm, 0 },
- { X86::MMX_PUNPCKHWDirr, X86::MMX_PUNPCKHWDirm, 0 },
- { X86::MMX_PUNPCKLBWirr, X86::MMX_PUNPCKLBWirm, 0 },
- { X86::MMX_PUNPCKLDQirr, X86::MMX_PUNPCKLDQirm, 0 },
- { X86::MMX_PUNPCKLWDirr, X86::MMX_PUNPCKLWDirm, 0 },
- { X86::MMX_PXORirr, X86::MMX_PXORirm, 0 },
-
- // 3DNow! version of foldable instructions
- { X86::PAVGUSBrr, X86::PAVGUSBrm, 0 },
- { X86::PFACCrr, X86::PFACCrm, 0 },
- { X86::PFADDrr, X86::PFADDrm, 0 },
- { X86::PFCMPEQrr, X86::PFCMPEQrm, 0 },
- { X86::PFCMPGErr, X86::PFCMPGErm, 0 },
- { X86::PFCMPGTrr, X86::PFCMPGTrm, 0 },
- { X86::PFMAXrr, X86::PFMAXrm, 0 },
- { X86::PFMINrr, X86::PFMINrm, 0 },
- { X86::PFMULrr, X86::PFMULrm, 0 },
- { X86::PFNACCrr, X86::PFNACCrm, 0 },
- { X86::PFPNACCrr, X86::PFPNACCrm, 0 },
- { X86::PFRCPIT1rr, X86::PFRCPIT1rm, 0 },
- { X86::PFRCPIT2rr, X86::PFRCPIT2rm, 0 },
- { X86::PFRSQIT1rr, X86::PFRSQIT1rm, 0 },
- { X86::PFSUBrr, X86::PFSUBrm, 0 },
- { X86::PFSUBRrr, X86::PFSUBRrm, 0 },
- { X86::PMULHRWrr, X86::PMULHRWrm, 0 },
-
- // AVX 128-bit versions of foldable instructions
- { X86::VCVTSI2SD64rr, X86::VCVTSI2SD64rm, 0 },
- { X86::Int_VCVTSI2SD64rr, X86::Int_VCVTSI2SD64rm, 0 },
- { X86::VCVTSI2SDrr, X86::VCVTSI2SDrm, 0 },
- { X86::Int_VCVTSI2SDrr, X86::Int_VCVTSI2SDrm, 0 },
- { X86::VCVTSI2SS64rr, X86::VCVTSI2SS64rm, 0 },
- { X86::Int_VCVTSI2SS64rr, X86::Int_VCVTSI2SS64rm, 0 },
- { X86::VCVTSI2SSrr, X86::VCVTSI2SSrm, 0 },
- { X86::Int_VCVTSI2SSrr, X86::Int_VCVTSI2SSrm, 0 },
- { X86::VADDPDrr, X86::VADDPDrm, 0 },
- { X86::VADDPSrr, X86::VADDPSrm, 0 },
- { X86::VADDSDrr, X86::VADDSDrm, 0 },
- { X86::VADDSDrr_Int, X86::VADDSDrm_Int, TB_NO_REVERSE },
- { X86::VADDSSrr, X86::VADDSSrm, 0 },
- { X86::VADDSSrr_Int, X86::VADDSSrm_Int, TB_NO_REVERSE },
- { X86::VADDSUBPDrr, X86::VADDSUBPDrm, 0 },
- { X86::VADDSUBPSrr, X86::VADDSUBPSrm, 0 },
- { X86::VANDNPDrr, X86::VANDNPDrm, 0 },
- { X86::VANDNPSrr, X86::VANDNPSrm, 0 },
- { X86::VANDPDrr, X86::VANDPDrm, 0 },
- { X86::VANDPSrr, X86::VANDPSrm, 0 },
- { X86::VBLENDPDrri, X86::VBLENDPDrmi, 0 },
- { X86::VBLENDPSrri, X86::VBLENDPSrmi, 0 },
- { X86::VBLENDVPDrr, X86::VBLENDVPDrm, 0 },
- { X86::VBLENDVPSrr, X86::VBLENDVPSrm, 0 },
- { X86::VCMPPDrri, X86::VCMPPDrmi, 0 },
- { X86::VCMPPSrri, X86::VCMPPSrmi, 0 },
- { X86::VCMPSDrr, X86::VCMPSDrm, 0 },
- { X86::VCMPSSrr, X86::VCMPSSrm, 0 },
- { X86::VDIVPDrr, X86::VDIVPDrm, 0 },
- { X86::VDIVPSrr, X86::VDIVPSrm, 0 },
- { X86::VDIVSDrr, X86::VDIVSDrm, 0 },
- { X86::VDIVSDrr_Int, X86::VDIVSDrm_Int, TB_NO_REVERSE },
- { X86::VDIVSSrr, X86::VDIVSSrm, 0 },
- { X86::VDIVSSrr_Int, X86::VDIVSSrm_Int, TB_NO_REVERSE },
- { X86::VDPPDrri, X86::VDPPDrmi, 0 },
- { X86::VDPPSrri, X86::VDPPSrmi, 0 },
- { X86::VHADDPDrr, X86::VHADDPDrm, 0 },
- { X86::VHADDPSrr, X86::VHADDPSrm, 0 },
- { X86::VHSUBPDrr, X86::VHSUBPDrm, 0 },
- { X86::VHSUBPSrr, X86::VHSUBPSrm, 0 },
- { X86::Int_VCMPSDrr, X86::Int_VCMPSDrm, TB_NO_REVERSE },
- { X86::Int_VCMPSSrr, X86::Int_VCMPSSrm, TB_NO_REVERSE },
- { X86::VMAXCPDrr, X86::VMAXCPDrm, 0 },
- { X86::VMAXCPSrr, X86::VMAXCPSrm, 0 },
- { X86::VMAXCSDrr, X86::VMAXCSDrm, 0 },
- { X86::VMAXCSSrr, X86::VMAXCSSrm, 0 },
- { X86::VMAXPDrr, X86::VMAXPDrm, 0 },
- { X86::VMAXPSrr, X86::VMAXPSrm, 0 },
- { X86::VMAXSDrr, X86::VMAXSDrm, 0 },
- { X86::VMAXSDrr_Int, X86::VMAXSDrm_Int, TB_NO_REVERSE },
- { X86::VMAXSSrr, X86::VMAXSSrm, 0 },
- { X86::VMAXSSrr_Int, X86::VMAXSSrm_Int, TB_NO_REVERSE },
- { X86::VMINCPDrr, X86::VMINCPDrm, 0 },
- { X86::VMINCPSrr, X86::VMINCPSrm, 0 },
- { X86::VMINCSDrr, X86::VMINCSDrm, 0 },
- { X86::VMINCSSrr, X86::VMINCSSrm, 0 },
- { X86::VMINPDrr, X86::VMINPDrm, 0 },
- { X86::VMINPSrr, X86::VMINPSrm, 0 },
- { X86::VMINSDrr, X86::VMINSDrm, 0 },
- { X86::VMINSDrr_Int, X86::VMINSDrm_Int, TB_NO_REVERSE },
- { X86::VMINSSrr, X86::VMINSSrm, 0 },
- { X86::VMINSSrr_Int, X86::VMINSSrm_Int, TB_NO_REVERSE },
- { X86::VMOVLHPSrr, X86::VMOVHPSrm, TB_NO_REVERSE },
- { X86::VMPSADBWrri, X86::VMPSADBWrmi, 0 },
- { X86::VMULPDrr, X86::VMULPDrm, 0 },
- { X86::VMULPSrr, X86::VMULPSrm, 0 },
- { X86::VMULSDrr, X86::VMULSDrm, 0 },
- { X86::VMULSDrr_Int, X86::VMULSDrm_Int, TB_NO_REVERSE },
- { X86::VMULSSrr, X86::VMULSSrm, 0 },
- { X86::VMULSSrr_Int, X86::VMULSSrm_Int, TB_NO_REVERSE },
- { X86::VORPDrr, X86::VORPDrm, 0 },
- { X86::VORPSrr, X86::VORPSrm, 0 },
- { X86::VPACKSSDWrr, X86::VPACKSSDWrm, 0 },
- { X86::VPACKSSWBrr, X86::VPACKSSWBrm, 0 },
- { X86::VPACKUSDWrr, X86::VPACKUSDWrm, 0 },
- { X86::VPACKUSWBrr, X86::VPACKUSWBrm, 0 },
- { X86::VPADDBrr, X86::VPADDBrm, 0 },
- { X86::VPADDDrr, X86::VPADDDrm, 0 },
- { X86::VPADDQrr, X86::VPADDQrm, 0 },
- { X86::VPADDSBrr, X86::VPADDSBrm, 0 },
- { X86::VPADDSWrr, X86::VPADDSWrm, 0 },
- { X86::VPADDUSBrr, X86::VPADDUSBrm, 0 },
- { X86::VPADDUSWrr, X86::VPADDUSWrm, 0 },
- { X86::VPADDWrr, X86::VPADDWrm, 0 },
- { X86::VPALIGNRrri, X86::VPALIGNRrmi, 0 },
- { X86::VPANDNrr, X86::VPANDNrm, 0 },
- { X86::VPANDrr, X86::VPANDrm, 0 },
- { X86::VPAVGBrr, X86::VPAVGBrm, 0 },
- { X86::VPAVGWrr, X86::VPAVGWrm, 0 },
- { X86::VPBLENDVBrr, X86::VPBLENDVBrm, 0 },
- { X86::VPBLENDWrri, X86::VPBLENDWrmi, 0 },
- { X86::VPCLMULQDQrr, X86::VPCLMULQDQrm, 0 },
- { X86::VPCMPEQBrr, X86::VPCMPEQBrm, 0 },
- { X86::VPCMPEQDrr, X86::VPCMPEQDrm, 0 },
- { X86::VPCMPEQQrr, X86::VPCMPEQQrm, 0 },
- { X86::VPCMPEQWrr, X86::VPCMPEQWrm, 0 },
- { X86::VPCMPGTBrr, X86::VPCMPGTBrm, 0 },
- { X86::VPCMPGTDrr, X86::VPCMPGTDrm, 0 },
- { X86::VPCMPGTQrr, X86::VPCMPGTQrm, 0 },
- { X86::VPCMPGTWrr, X86::VPCMPGTWrm, 0 },
- { X86::VPHADDDrr, X86::VPHADDDrm, 0 },
- { X86::VPHADDSWrr128, X86::VPHADDSWrm128, 0 },
- { X86::VPHADDWrr, X86::VPHADDWrm, 0 },
- { X86::VPHSUBDrr, X86::VPHSUBDrm, 0 },
- { X86::VPHSUBSWrr128, X86::VPHSUBSWrm128, 0 },
- { X86::VPHSUBWrr, X86::VPHSUBWrm, 0 },
- { X86::VPERMILPDrr, X86::VPERMILPDrm, 0 },
- { X86::VPERMILPSrr, X86::VPERMILPSrm, 0 },
- { X86::VPINSRBrr, X86::VPINSRBrm, 0 },
- { X86::VPINSRDrr, X86::VPINSRDrm, 0 },
- { X86::VPINSRQrr, X86::VPINSRQrm, 0 },
- { X86::VPINSRWrri, X86::VPINSRWrmi, 0 },
- { X86::VPMADDUBSWrr, X86::VPMADDUBSWrm, 0 },
- { X86::VPMADDWDrr, X86::VPMADDWDrm, 0 },
- { X86::VPMAXSBrr, X86::VPMAXSBrm, 0 },
- { X86::VPMAXSDrr, X86::VPMAXSDrm, 0 },
- { X86::VPMAXSWrr, X86::VPMAXSWrm, 0 },
- { X86::VPMAXUBrr, X86::VPMAXUBrm, 0 },
- { X86::VPMAXUDrr, X86::VPMAXUDrm, 0 },
- { X86::VPMAXUWrr, X86::VPMAXUWrm, 0 },
- { X86::VPMINSBrr, X86::VPMINSBrm, 0 },
- { X86::VPMINSDrr, X86::VPMINSDrm, 0 },
- { X86::VPMINSWrr, X86::VPMINSWrm, 0 },
- { X86::VPMINUBrr, X86::VPMINUBrm, 0 },
- { X86::VPMINUDrr, X86::VPMINUDrm, 0 },
- { X86::VPMINUWrr, X86::VPMINUWrm, 0 },
- { X86::VPMULDQrr, X86::VPMULDQrm, 0 },
- { X86::VPMULHRSWrr, X86::VPMULHRSWrm, 0 },
- { X86::VPMULHUWrr, X86::VPMULHUWrm, 0 },
- { X86::VPMULHWrr, X86::VPMULHWrm, 0 },
- { X86::VPMULLDrr, X86::VPMULLDrm, 0 },
- { X86::VPMULLWrr, X86::VPMULLWrm, 0 },
- { X86::VPMULUDQrr, X86::VPMULUDQrm, 0 },
- { X86::VPORrr, X86::VPORrm, 0 },
- { X86::VPSADBWrr, X86::VPSADBWrm, 0 },
- { X86::VPSHUFBrr, X86::VPSHUFBrm, 0 },
- { X86::VPSIGNBrr128, X86::VPSIGNBrm128, 0 },
- { X86::VPSIGNWrr128, X86::VPSIGNWrm128, 0 },
- { X86::VPSIGNDrr128, X86::VPSIGNDrm128, 0 },
- { X86::VPSLLDrr, X86::VPSLLDrm, 0 },
- { X86::VPSLLQrr, X86::VPSLLQrm, 0 },
- { X86::VPSLLWrr, X86::VPSLLWrm, 0 },
- { X86::VPSRADrr, X86::VPSRADrm, 0 },
- { X86::VPSRAWrr, X86::VPSRAWrm, 0 },
- { X86::VPSRLDrr, X86::VPSRLDrm, 0 },
- { X86::VPSRLQrr, X86::VPSRLQrm, 0 },
- { X86::VPSRLWrr, X86::VPSRLWrm, 0 },
- { X86::VPSUBBrr, X86::VPSUBBrm, 0 },
- { X86::VPSUBDrr, X86::VPSUBDrm, 0 },
- { X86::VPSUBQrr, X86::VPSUBQrm, 0 },
- { X86::VPSUBSBrr, X86::VPSUBSBrm, 0 },
- { X86::VPSUBSWrr, X86::VPSUBSWrm, 0 },
- { X86::VPSUBUSBrr, X86::VPSUBUSBrm, 0 },
- { X86::VPSUBUSWrr, X86::VPSUBUSWrm, 0 },
- { X86::VPSUBWrr, X86::VPSUBWrm, 0 },
- { X86::VPUNPCKHBWrr, X86::VPUNPCKHBWrm, 0 },
- { X86::VPUNPCKHDQrr, X86::VPUNPCKHDQrm, 0 },
- { X86::VPUNPCKHQDQrr, X86::VPUNPCKHQDQrm, 0 },
- { X86::VPUNPCKHWDrr, X86::VPUNPCKHWDrm, 0 },
- { X86::VPUNPCKLBWrr, X86::VPUNPCKLBWrm, 0 },
- { X86::VPUNPCKLDQrr, X86::VPUNPCKLDQrm, 0 },
- { X86::VPUNPCKLQDQrr, X86::VPUNPCKLQDQrm, 0 },
- { X86::VPUNPCKLWDrr, X86::VPUNPCKLWDrm, 0 },
- { X86::VPXORrr, X86::VPXORrm, 0 },
- { X86::VRCPSSr, X86::VRCPSSm, 0 },
- { X86::VRCPSSr_Int, X86::VRCPSSm_Int, TB_NO_REVERSE },
- { X86::VRSQRTSSr, X86::VRSQRTSSm, 0 },
- { X86::VRSQRTSSr_Int, X86::VRSQRTSSm_Int, TB_NO_REVERSE },
- { X86::VROUNDSDr, X86::VROUNDSDm, 0 },
- { X86::VROUNDSDr_Int, X86::VROUNDSDm_Int, TB_NO_REVERSE },
- { X86::VROUNDSSr, X86::VROUNDSSm, 0 },
- { X86::VROUNDSSr_Int, X86::VROUNDSSm_Int, TB_NO_REVERSE },
- { X86::VSHUFPDrri, X86::VSHUFPDrmi, 0 },
- { X86::VSHUFPSrri, X86::VSHUFPSrmi, 0 },
- { X86::VSQRTSDr, X86::VSQRTSDm, 0 },
- { X86::VSQRTSDr_Int, X86::VSQRTSDm_Int, TB_NO_REVERSE },
- { X86::VSQRTSSr, X86::VSQRTSSm, 0 },
- { X86::VSQRTSSr_Int, X86::VSQRTSSm_Int, TB_NO_REVERSE },
- { X86::VSUBPDrr, X86::VSUBPDrm, 0 },
- { X86::VSUBPSrr, X86::VSUBPSrm, 0 },
- { X86::VSUBSDrr, X86::VSUBSDrm, 0 },
- { X86::VSUBSDrr_Int, X86::VSUBSDrm_Int, TB_NO_REVERSE },
- { X86::VSUBSSrr, X86::VSUBSSrm, 0 },
- { X86::VSUBSSrr_Int, X86::VSUBSSrm_Int, TB_NO_REVERSE },
- { X86::VUNPCKHPDrr, X86::VUNPCKHPDrm, 0 },
- { X86::VUNPCKHPSrr, X86::VUNPCKHPSrm, 0 },
- { X86::VUNPCKLPDrr, X86::VUNPCKLPDrm, 0 },
- { X86::VUNPCKLPSrr, X86::VUNPCKLPSrm, 0 },
- { X86::VXORPDrr, X86::VXORPDrm, 0 },
- { X86::VXORPSrr, X86::VXORPSrm, 0 },
-
- // AVX 256-bit foldable instructions
- { X86::VADDPDYrr, X86::VADDPDYrm, 0 },
- { X86::VADDPSYrr, X86::VADDPSYrm, 0 },
- { X86::VADDSUBPDYrr, X86::VADDSUBPDYrm, 0 },
- { X86::VADDSUBPSYrr, X86::VADDSUBPSYrm, 0 },
- { X86::VANDNPDYrr, X86::VANDNPDYrm, 0 },
- { X86::VANDNPSYrr, X86::VANDNPSYrm, 0 },
- { X86::VANDPDYrr, X86::VANDPDYrm, 0 },
- { X86::VANDPSYrr, X86::VANDPSYrm, 0 },
- { X86::VBLENDPDYrri, X86::VBLENDPDYrmi, 0 },
- { X86::VBLENDPSYrri, X86::VBLENDPSYrmi, 0 },
- { X86::VBLENDVPDYrr, X86::VBLENDVPDYrm, 0 },
- { X86::VBLENDVPSYrr, X86::VBLENDVPSYrm, 0 },
- { X86::VCMPPDYrri, X86::VCMPPDYrmi, 0 },
- { X86::VCMPPSYrri, X86::VCMPPSYrmi, 0 },
- { X86::VDIVPDYrr, X86::VDIVPDYrm, 0 },
- { X86::VDIVPSYrr, X86::VDIVPSYrm, 0 },
- { X86::VDPPSYrri, X86::VDPPSYrmi, 0 },
- { X86::VHADDPDYrr, X86::VHADDPDYrm, 0 },
- { X86::VHADDPSYrr, X86::VHADDPSYrm, 0 },
- { X86::VHSUBPDYrr, X86::VHSUBPDYrm, 0 },
- { X86::VHSUBPSYrr, X86::VHSUBPSYrm, 0 },
- { X86::VINSERTF128rr, X86::VINSERTF128rm, 0 },
- { X86::VMAXCPDYrr, X86::VMAXCPDYrm, 0 },
- { X86::VMAXCPSYrr, X86::VMAXCPSYrm, 0 },
- { X86::VMAXPDYrr, X86::VMAXPDYrm, 0 },
- { X86::VMAXPSYrr, X86::VMAXPSYrm, 0 },
- { X86::VMINCPDYrr, X86::VMINCPDYrm, 0 },
- { X86::VMINCPSYrr, X86::VMINCPSYrm, 0 },
- { X86::VMINPDYrr, X86::VMINPDYrm, 0 },
- { X86::VMINPSYrr, X86::VMINPSYrm, 0 },
- { X86::VMULPDYrr, X86::VMULPDYrm, 0 },
- { X86::VMULPSYrr, X86::VMULPSYrm, 0 },
- { X86::VORPDYrr, X86::VORPDYrm, 0 },
- { X86::VORPSYrr, X86::VORPSYrm, 0 },
- { X86::VPERM2F128rr, X86::VPERM2F128rm, 0 },
- { X86::VPERMILPDYrr, X86::VPERMILPDYrm, 0 },
- { X86::VPERMILPSYrr, X86::VPERMILPSYrm, 0 },
- { X86::VSHUFPDYrri, X86::VSHUFPDYrmi, 0 },
- { X86::VSHUFPSYrri, X86::VSHUFPSYrmi, 0 },
- { X86::VSUBPDYrr, X86::VSUBPDYrm, 0 },
- { X86::VSUBPSYrr, X86::VSUBPSYrm, 0 },
- { X86::VUNPCKHPDYrr, X86::VUNPCKHPDYrm, 0 },
- { X86::VUNPCKHPSYrr, X86::VUNPCKHPSYrm, 0 },
- { X86::VUNPCKLPDYrr, X86::VUNPCKLPDYrm, 0 },
- { X86::VUNPCKLPSYrr, X86::VUNPCKLPSYrm, 0 },
- { X86::VXORPDYrr, X86::VXORPDYrm, 0 },
- { X86::VXORPSYrr, X86::VXORPSYrm, 0 },
-
- // AVX2 foldable instructions
- { X86::VINSERTI128rr, X86::VINSERTI128rm, 0 },
- { X86::VPACKSSDWYrr, X86::VPACKSSDWYrm, 0 },
- { X86::VPACKSSWBYrr, X86::VPACKSSWBYrm, 0 },
- { X86::VPACKUSDWYrr, X86::VPACKUSDWYrm, 0 },
- { X86::VPACKUSWBYrr, X86::VPACKUSWBYrm, 0 },
- { X86::VPADDBYrr, X86::VPADDBYrm, 0 },
- { X86::VPADDDYrr, X86::VPADDDYrm, 0 },
- { X86::VPADDQYrr, X86::VPADDQYrm, 0 },
- { X86::VPADDSBYrr, X86::VPADDSBYrm, 0 },
- { X86::VPADDSWYrr, X86::VPADDSWYrm, 0 },
- { X86::VPADDUSBYrr, X86::VPADDUSBYrm, 0 },
- { X86::VPADDUSWYrr, X86::VPADDUSWYrm, 0 },
- { X86::VPADDWYrr, X86::VPADDWYrm, 0 },
- { X86::VPALIGNRYrri, X86::VPALIGNRYrmi, 0 },
- { X86::VPANDNYrr, X86::VPANDNYrm, 0 },
- { X86::VPANDYrr, X86::VPANDYrm, 0 },
- { X86::VPAVGBYrr, X86::VPAVGBYrm, 0 },
- { X86::VPAVGWYrr, X86::VPAVGWYrm, 0 },
- { X86::VPBLENDDrri, X86::VPBLENDDrmi, 0 },
- { X86::VPBLENDDYrri, X86::VPBLENDDYrmi, 0 },
- { X86::VPBLENDVBYrr, X86::VPBLENDVBYrm, 0 },
- { X86::VPBLENDWYrri, X86::VPBLENDWYrmi, 0 },
- { X86::VPCMPEQBYrr, X86::VPCMPEQBYrm, 0 },
- { X86::VPCMPEQDYrr, X86::VPCMPEQDYrm, 0 },
- { X86::VPCMPEQQYrr, X86::VPCMPEQQYrm, 0 },
- { X86::VPCMPEQWYrr, X86::VPCMPEQWYrm, 0 },
- { X86::VPCMPGTBYrr, X86::VPCMPGTBYrm, 0 },
- { X86::VPCMPGTDYrr, X86::VPCMPGTDYrm, 0 },
- { X86::VPCMPGTQYrr, X86::VPCMPGTQYrm, 0 },
- { X86::VPCMPGTWYrr, X86::VPCMPGTWYrm, 0 },
- { X86::VPERM2I128rr, X86::VPERM2I128rm, 0 },
- { X86::VPERMDYrr, X86::VPERMDYrm, 0 },
- { X86::VPERMPSYrr, X86::VPERMPSYrm, 0 },
- { X86::VPHADDDYrr, X86::VPHADDDYrm, 0 },
- { X86::VPHADDSWrr256, X86::VPHADDSWrm256, 0 },
- { X86::VPHADDWYrr, X86::VPHADDWYrm, 0 },
- { X86::VPHSUBDYrr, X86::VPHSUBDYrm, 0 },
- { X86::VPHSUBSWrr256, X86::VPHSUBSWrm256, 0 },
- { X86::VPHSUBWYrr, X86::VPHSUBWYrm, 0 },
- { X86::VPMADDUBSWYrr, X86::VPMADDUBSWYrm, 0 },
- { X86::VPMADDWDYrr, X86::VPMADDWDYrm, 0 },
- { X86::VPMAXSBYrr, X86::VPMAXSBYrm, 0 },
- { X86::VPMAXSDYrr, X86::VPMAXSDYrm, 0 },
- { X86::VPMAXSWYrr, X86::VPMAXSWYrm, 0 },
- { X86::VPMAXUBYrr, X86::VPMAXUBYrm, 0 },
- { X86::VPMAXUDYrr, X86::VPMAXUDYrm, 0 },
- { X86::VPMAXUWYrr, X86::VPMAXUWYrm, 0 },
- { X86::VPMINSBYrr, X86::VPMINSBYrm, 0 },
- { X86::VPMINSDYrr, X86::VPMINSDYrm, 0 },
- { X86::VPMINSWYrr, X86::VPMINSWYrm, 0 },
- { X86::VPMINUBYrr, X86::VPMINUBYrm, 0 },
- { X86::VPMINUDYrr, X86::VPMINUDYrm, 0 },
- { X86::VPMINUWYrr, X86::VPMINUWYrm, 0 },
- { X86::VMPSADBWYrri, X86::VMPSADBWYrmi, 0 },
- { X86::VPMULDQYrr, X86::VPMULDQYrm, 0 },
- { X86::VPMULHRSWYrr, X86::VPMULHRSWYrm, 0 },
- { X86::VPMULHUWYrr, X86::VPMULHUWYrm, 0 },
- { X86::VPMULHWYrr, X86::VPMULHWYrm, 0 },
- { X86::VPMULLDYrr, X86::VPMULLDYrm, 0 },
- { X86::VPMULLWYrr, X86::VPMULLWYrm, 0 },
- { X86::VPMULUDQYrr, X86::VPMULUDQYrm, 0 },
- { X86::VPORYrr, X86::VPORYrm, 0 },
- { X86::VPSADBWYrr, X86::VPSADBWYrm, 0 },
- { X86::VPSHUFBYrr, X86::VPSHUFBYrm, 0 },
- { X86::VPSIGNBYrr256, X86::VPSIGNBYrm256, 0 },
- { X86::VPSIGNWYrr256, X86::VPSIGNWYrm256, 0 },
- { X86::VPSIGNDYrr256, X86::VPSIGNDYrm256, 0 },
- { X86::VPSLLDYrr, X86::VPSLLDYrm, 0 },
- { X86::VPSLLQYrr, X86::VPSLLQYrm, 0 },
- { X86::VPSLLWYrr, X86::VPSLLWYrm, 0 },
- { X86::VPSLLVDrr, X86::VPSLLVDrm, 0 },
- { X86::VPSLLVDYrr, X86::VPSLLVDYrm, 0 },
- { X86::VPSLLVQrr, X86::VPSLLVQrm, 0 },
- { X86::VPSLLVQYrr, X86::VPSLLVQYrm, 0 },
- { X86::VPSRADYrr, X86::VPSRADYrm, 0 },
- { X86::VPSRAWYrr, X86::VPSRAWYrm, 0 },
- { X86::VPSRAVDrr, X86::VPSRAVDrm, 0 },
- { X86::VPSRAVDYrr, X86::VPSRAVDYrm, 0 },
- { X86::VPSRLDYrr, X86::VPSRLDYrm, 0 },
- { X86::VPSRLQYrr, X86::VPSRLQYrm, 0 },
- { X86::VPSRLWYrr, X86::VPSRLWYrm, 0 },
- { X86::VPSRLVDrr, X86::VPSRLVDrm, 0 },
- { X86::VPSRLVDYrr, X86::VPSRLVDYrm, 0 },
- { X86::VPSRLVQrr, X86::VPSRLVQrm, 0 },
- { X86::VPSRLVQYrr, X86::VPSRLVQYrm, 0 },
- { X86::VPSUBBYrr, X86::VPSUBBYrm, 0 },
- { X86::VPSUBDYrr, X86::VPSUBDYrm, 0 },
- { X86::VPSUBQYrr, X86::VPSUBQYrm, 0 },
- { X86::VPSUBSBYrr, X86::VPSUBSBYrm, 0 },
- { X86::VPSUBSWYrr, X86::VPSUBSWYrm, 0 },
- { X86::VPSUBUSBYrr, X86::VPSUBUSBYrm, 0 },
- { X86::VPSUBUSWYrr, X86::VPSUBUSWYrm, 0 },
- { X86::VPSUBWYrr, X86::VPSUBWYrm, 0 },
- { X86::VPUNPCKHBWYrr, X86::VPUNPCKHBWYrm, 0 },
- { X86::VPUNPCKHDQYrr, X86::VPUNPCKHDQYrm, 0 },
- { X86::VPUNPCKHQDQYrr, X86::VPUNPCKHQDQYrm, 0 },
- { X86::VPUNPCKHWDYrr, X86::VPUNPCKHWDYrm, 0 },
- { X86::VPUNPCKLBWYrr, X86::VPUNPCKLBWYrm, 0 },
- { X86::VPUNPCKLDQYrr, X86::VPUNPCKLDQYrm, 0 },
- { X86::VPUNPCKLQDQYrr, X86::VPUNPCKLQDQYrm, 0 },
- { X86::VPUNPCKLWDYrr, X86::VPUNPCKLWDYrm, 0 },
- { X86::VPXORYrr, X86::VPXORYrm, 0 },
-
- // FMA4 foldable patterns
- { X86::VFMADDSS4rr, X86::VFMADDSS4mr, TB_ALIGN_NONE },
- { X86::VFMADDSS4rr_Int, X86::VFMADDSS4mr_Int, TB_NO_REVERSE },
- { X86::VFMADDSD4rr, X86::VFMADDSD4mr, TB_ALIGN_NONE },
- { X86::VFMADDSD4rr_Int, X86::VFMADDSD4mr_Int, TB_NO_REVERSE },
- { X86::VFMADDPS4rr, X86::VFMADDPS4mr, TB_ALIGN_NONE },
- { X86::VFMADDPD4rr, X86::VFMADDPD4mr, TB_ALIGN_NONE },
- { X86::VFMADDPS4Yrr, X86::VFMADDPS4Ymr, TB_ALIGN_NONE },
- { X86::VFMADDPD4Yrr, X86::VFMADDPD4Ymr, TB_ALIGN_NONE },
- { X86::VFNMADDSS4rr, X86::VFNMADDSS4mr, TB_ALIGN_NONE },
- { X86::VFNMADDSS4rr_Int, X86::VFNMADDSS4mr_Int, TB_NO_REVERSE },
- { X86::VFNMADDSD4rr, X86::VFNMADDSD4mr, TB_ALIGN_NONE },
- { X86::VFNMADDSD4rr_Int, X86::VFNMADDSD4mr_Int, TB_NO_REVERSE },
- { X86::VFNMADDPS4rr, X86::VFNMADDPS4mr, TB_ALIGN_NONE },
- { X86::VFNMADDPD4rr, X86::VFNMADDPD4mr, TB_ALIGN_NONE },
- { X86::VFNMADDPS4Yrr, X86::VFNMADDPS4Ymr, TB_ALIGN_NONE },
- { X86::VFNMADDPD4Yrr, X86::VFNMADDPD4Ymr, TB_ALIGN_NONE },
- { X86::VFMSUBSS4rr, X86::VFMSUBSS4mr, TB_ALIGN_NONE },
- { X86::VFMSUBSS4rr_Int, X86::VFMSUBSS4mr_Int, TB_NO_REVERSE },
- { X86::VFMSUBSD4rr, X86::VFMSUBSD4mr, TB_ALIGN_NONE },
- { X86::VFMSUBSD4rr_Int, X86::VFMSUBSD4mr_Int, TB_NO_REVERSE },
- { X86::VFMSUBPS4rr, X86::VFMSUBPS4mr, TB_ALIGN_NONE },
- { X86::VFMSUBPD4rr, X86::VFMSUBPD4mr, TB_ALIGN_NONE },
- { X86::VFMSUBPS4Yrr, X86::VFMSUBPS4Ymr, TB_ALIGN_NONE },
- { X86::VFMSUBPD4Yrr, X86::VFMSUBPD4Ymr, TB_ALIGN_NONE },
- { X86::VFNMSUBSS4rr, X86::VFNMSUBSS4mr, TB_ALIGN_NONE },
- { X86::VFNMSUBSS4rr_Int, X86::VFNMSUBSS4mr_Int, TB_NO_REVERSE },
- { X86::VFNMSUBSD4rr, X86::VFNMSUBSD4mr, TB_ALIGN_NONE },
- { X86::VFNMSUBSD4rr_Int, X86::VFNMSUBSD4mr_Int, TB_NO_REVERSE },
- { X86::VFNMSUBPS4rr, X86::VFNMSUBPS4mr, TB_ALIGN_NONE },
- { X86::VFNMSUBPD4rr, X86::VFNMSUBPD4mr, TB_ALIGN_NONE },
- { X86::VFNMSUBPS4Yrr, X86::VFNMSUBPS4Ymr, TB_ALIGN_NONE },
- { X86::VFNMSUBPD4Yrr, X86::VFNMSUBPD4Ymr, TB_ALIGN_NONE },
- { X86::VFMADDSUBPS4rr, X86::VFMADDSUBPS4mr, TB_ALIGN_NONE },
- { X86::VFMADDSUBPD4rr, X86::VFMADDSUBPD4mr, TB_ALIGN_NONE },
- { X86::VFMADDSUBPS4Yrr, X86::VFMADDSUBPS4Ymr, TB_ALIGN_NONE },
- { X86::VFMADDSUBPD4Yrr, X86::VFMADDSUBPD4Ymr, TB_ALIGN_NONE },
- { X86::VFMSUBADDPS4rr, X86::VFMSUBADDPS4mr, TB_ALIGN_NONE },
- { X86::VFMSUBADDPD4rr, X86::VFMSUBADDPD4mr, TB_ALIGN_NONE },
- { X86::VFMSUBADDPS4Yrr, X86::VFMSUBADDPS4Ymr, TB_ALIGN_NONE },
- { X86::VFMSUBADDPD4Yrr, X86::VFMSUBADDPD4Ymr, TB_ALIGN_NONE },
-
- // XOP foldable instructions
- { X86::VPCMOVrrr, X86::VPCMOVrmr, 0 },
- { X86::VPCMOVYrrr, X86::VPCMOVYrmr, 0 },
- { X86::VPCOMBri, X86::VPCOMBmi, 0 },
- { X86::VPCOMDri, X86::VPCOMDmi, 0 },
- { X86::VPCOMQri, X86::VPCOMQmi, 0 },
- { X86::VPCOMWri, X86::VPCOMWmi, 0 },
- { X86::VPCOMUBri, X86::VPCOMUBmi, 0 },
- { X86::VPCOMUDri, X86::VPCOMUDmi, 0 },
- { X86::VPCOMUQri, X86::VPCOMUQmi, 0 },
- { X86::VPCOMUWri, X86::VPCOMUWmi, 0 },
- { X86::VPERMIL2PDrr, X86::VPERMIL2PDmr, 0 },
- { X86::VPERMIL2PDYrr, X86::VPERMIL2PDYmr, 0 },
- { X86::VPERMIL2PSrr, X86::VPERMIL2PSmr, 0 },
- { X86::VPERMIL2PSYrr, X86::VPERMIL2PSYmr, 0 },
- { X86::VPMACSDDrr, X86::VPMACSDDrm, 0 },
- { X86::VPMACSDQHrr, X86::VPMACSDQHrm, 0 },
- { X86::VPMACSDQLrr, X86::VPMACSDQLrm, 0 },
- { X86::VPMACSSDDrr, X86::VPMACSSDDrm, 0 },
- { X86::VPMACSSDQHrr, X86::VPMACSSDQHrm, 0 },
- { X86::VPMACSSDQLrr, X86::VPMACSSDQLrm, 0 },
- { X86::VPMACSSWDrr, X86::VPMACSSWDrm, 0 },
- { X86::VPMACSSWWrr, X86::VPMACSSWWrm, 0 },
- { X86::VPMACSWDrr, X86::VPMACSWDrm, 0 },
- { X86::VPMACSWWrr, X86::VPMACSWWrm, 0 },
- { X86::VPMADCSSWDrr, X86::VPMADCSSWDrm, 0 },
- { X86::VPMADCSWDrr, X86::VPMADCSWDrm, 0 },
- { X86::VPPERMrrr, X86::VPPERMrmr, 0 },
- { X86::VPROTBrr, X86::VPROTBrm, 0 },
- { X86::VPROTDrr, X86::VPROTDrm, 0 },
- { X86::VPROTQrr, X86::VPROTQrm, 0 },
- { X86::VPROTWrr, X86::VPROTWrm, 0 },
- { X86::VPSHABrr, X86::VPSHABrm, 0 },
- { X86::VPSHADrr, X86::VPSHADrm, 0 },
- { X86::VPSHAQrr, X86::VPSHAQrm, 0 },
- { X86::VPSHAWrr, X86::VPSHAWrm, 0 },
- { X86::VPSHLBrr, X86::VPSHLBrm, 0 },
- { X86::VPSHLDrr, X86::VPSHLDrm, 0 },
- { X86::VPSHLQrr, X86::VPSHLQrm, 0 },
- { X86::VPSHLWrr, X86::VPSHLWrm, 0 },
-
- // BMI/BMI2 foldable instructions
- { X86::ANDN32rr, X86::ANDN32rm, 0 },
- { X86::ANDN64rr, X86::ANDN64rm, 0 },
- { X86::MULX32rr, X86::MULX32rm, 0 },
- { X86::MULX64rr, X86::MULX64rm, 0 },
- { X86::PDEP32rr, X86::PDEP32rm, 0 },
- { X86::PDEP64rr, X86::PDEP64rm, 0 },
- { X86::PEXT32rr, X86::PEXT32rm, 0 },
- { X86::PEXT64rr, X86::PEXT64rm, 0 },
-
- // ADX foldable instructions
- { X86::ADCX32rr, X86::ADCX32rm, 0 },
- { X86::ADCX64rr, X86::ADCX64rm, 0 },
- { X86::ADOX32rr, X86::ADOX32rm, 0 },
- { X86::ADOX64rr, X86::ADOX64rm, 0 },
-
- // AVX-512 foldable instructions
- { X86::VADDPDZrr, X86::VADDPDZrm, 0 },
- { X86::VADDPSZrr, X86::VADDPSZrm, 0 },
- { X86::VADDSDZrr, X86::VADDSDZrm, 0 },
- { X86::VADDSDZrr_Int, X86::VADDSDZrm_Int, TB_NO_REVERSE },
- { X86::VADDSSZrr, X86::VADDSSZrm, 0 },
- { X86::VADDSSZrr_Int, X86::VADDSSZrm_Int, TB_NO_REVERSE },
- { X86::VALIGNDZrri, X86::VALIGNDZrmi, 0 },
- { X86::VALIGNQZrri, X86::VALIGNQZrmi, 0 },
- { X86::VANDNPDZrr, X86::VANDNPDZrm, 0 },
- { X86::VANDNPSZrr, X86::VANDNPSZrm, 0 },
- { X86::VANDPDZrr, X86::VANDPDZrm, 0 },
- { X86::VANDPSZrr, X86::VANDPSZrm, 0 },
- { X86::VCMPPDZrri, X86::VCMPPDZrmi, 0 },
- { X86::VCMPPSZrri, X86::VCMPPSZrmi, 0 },
- { X86::VCMPSDZrr, X86::VCMPSDZrm, 0 },
- { X86::VCMPSDZrr_Int, X86::VCMPSDZrm_Int, TB_NO_REVERSE },
- { X86::VCMPSSZrr, X86::VCMPSSZrm, 0 },
- { X86::VCMPSSZrr_Int, X86::VCMPSSZrm_Int, TB_NO_REVERSE },
- { X86::VDIVPDZrr, X86::VDIVPDZrm, 0 },
- { X86::VDIVPSZrr, X86::VDIVPSZrm, 0 },
- { X86::VDIVSDZrr, X86::VDIVSDZrm, 0 },
- { X86::VDIVSDZrr_Int, X86::VDIVSDZrm_Int, TB_NO_REVERSE },
- { X86::VDIVSSZrr, X86::VDIVSSZrm, 0 },
- { X86::VDIVSSZrr_Int, X86::VDIVSSZrm_Int, TB_NO_REVERSE },
- { X86::VINSERTF32x4Zrr, X86::VINSERTF32x4Zrm, 0 },
- { X86::VINSERTF32x8Zrr, X86::VINSERTF32x8Zrm, 0 },
- { X86::VINSERTF64x2Zrr, X86::VINSERTF64x2Zrm, 0 },
- { X86::VINSERTF64x4Zrr, X86::VINSERTF64x4Zrm, 0 },
- { X86::VINSERTI32x4Zrr, X86::VINSERTI32x4Zrm, 0 },
- { X86::VINSERTI32x8Zrr, X86::VINSERTI32x8Zrm, 0 },
- { X86::VINSERTI64x2Zrr, X86::VINSERTI64x2Zrm, 0 },
- { X86::VINSERTI64x4Zrr, X86::VINSERTI64x4Zrm, 0 },
- { X86::VMAXCPDZrr, X86::VMAXCPDZrm, 0 },
- { X86::VMAXCPSZrr, X86::VMAXCPSZrm, 0 },
- { X86::VMAXCSDZrr, X86::VMAXCSDZrm, 0 },
- { X86::VMAXCSSZrr, X86::VMAXCSSZrm, 0 },
- { X86::VMAXPDZrr, X86::VMAXPDZrm, 0 },
- { X86::VMAXPSZrr, X86::VMAXPSZrm, 0 },
- { X86::VMAXSDZrr, X86::VMAXSDZrm, 0 },
- { X86::VMAXSDZrr_Int, X86::VMAXSDZrm_Int, TB_NO_REVERSE },
- { X86::VMAXSSZrr, X86::VMAXSSZrm, 0 },
- { X86::VMAXSSZrr_Int, X86::VMAXSSZrm_Int, TB_NO_REVERSE },
- { X86::VMINCPDZrr, X86::VMINCPDZrm, 0 },
- { X86::VMINCPSZrr, X86::VMINCPSZrm, 0 },
- { X86::VMINCSDZrr, X86::VMINCSDZrm, 0 },
- { X86::VMINCSSZrr, X86::VMINCSSZrm, 0 },
- { X86::VMINPDZrr, X86::VMINPDZrm, 0 },
- { X86::VMINPSZrr, X86::VMINPSZrm, 0 },
- { X86::VMINSDZrr, X86::VMINSDZrm, 0 },
- { X86::VMINSDZrr_Int, X86::VMINSDZrm_Int, TB_NO_REVERSE },
- { X86::VMINSSZrr, X86::VMINSSZrm, 0 },
- { X86::VMINSSZrr_Int, X86::VMINSSZrm_Int, TB_NO_REVERSE },
- { X86::VMOVLHPSZrr, X86::VMOVHPSZ128rm, TB_NO_REVERSE },
- { X86::VMULPDZrr, X86::VMULPDZrm, 0 },
- { X86::VMULPSZrr, X86::VMULPSZrm, 0 },
- { X86::VMULSDZrr, X86::VMULSDZrm, 0 },
- { X86::VMULSDZrr_Int, X86::VMULSDZrm_Int, TB_NO_REVERSE },
- { X86::VMULSSZrr, X86::VMULSSZrm, 0 },
- { X86::VMULSSZrr_Int, X86::VMULSSZrm_Int, TB_NO_REVERSE },
- { X86::VORPDZrr, X86::VORPDZrm, 0 },
- { X86::VORPSZrr, X86::VORPSZrm, 0 },
- { X86::VPACKSSDWZrr, X86::VPACKSSDWZrm, 0 },
- { X86::VPACKSSWBZrr, X86::VPACKSSWBZrm, 0 },
- { X86::VPACKUSDWZrr, X86::VPACKUSDWZrm, 0 },
- { X86::VPACKUSWBZrr, X86::VPACKUSWBZrm, 0 },
- { X86::VPADDBZrr, X86::VPADDBZrm, 0 },
- { X86::VPADDDZrr, X86::VPADDDZrm, 0 },
- { X86::VPADDQZrr, X86::VPADDQZrm, 0 },
- { X86::VPADDSBZrr, X86::VPADDSBZrm, 0 },
- { X86::VPADDSWZrr, X86::VPADDSWZrm, 0 },
- { X86::VPADDUSBZrr, X86::VPADDUSBZrm, 0 },
- { X86::VPADDUSWZrr, X86::VPADDUSWZrm, 0 },
- { X86::VPADDWZrr, X86::VPADDWZrm, 0 },
- { X86::VPALIGNRZrri, X86::VPALIGNRZrmi, 0 },
- { X86::VPANDDZrr, X86::VPANDDZrm, 0 },
- { X86::VPANDNDZrr, X86::VPANDNDZrm, 0 },
- { X86::VPANDNQZrr, X86::VPANDNQZrm, 0 },
- { X86::VPANDQZrr, X86::VPANDQZrm, 0 },
- { X86::VPAVGBZrr, X86::VPAVGBZrm, 0 },
- { X86::VPAVGWZrr, X86::VPAVGWZrm, 0 },
- { X86::VPCMPBZrri, X86::VPCMPBZrmi, 0 },
- { X86::VPCMPDZrri, X86::VPCMPDZrmi, 0 },
- { X86::VPCMPEQBZrr, X86::VPCMPEQBZrm, 0 },
- { X86::VPCMPEQDZrr, X86::VPCMPEQDZrm, 0 },
- { X86::VPCMPEQQZrr, X86::VPCMPEQQZrm, 0 },
- { X86::VPCMPEQWZrr, X86::VPCMPEQWZrm, 0 },
- { X86::VPCMPGTBZrr, X86::VPCMPGTBZrm, 0 },
- { X86::VPCMPGTDZrr, X86::VPCMPGTDZrm, 0 },
- { X86::VPCMPGTQZrr, X86::VPCMPGTQZrm, 0 },
- { X86::VPCMPGTWZrr, X86::VPCMPGTWZrm, 0 },
- { X86::VPCMPQZrri, X86::VPCMPQZrmi, 0 },
- { X86::VPCMPUBZrri, X86::VPCMPUBZrmi, 0 },
- { X86::VPCMPUDZrri, X86::VPCMPUDZrmi, 0 },
- { X86::VPCMPUQZrri, X86::VPCMPUQZrmi, 0 },
- { X86::VPCMPUWZrri, X86::VPCMPUWZrmi, 0 },
- { X86::VPCMPWZrri, X86::VPCMPWZrmi, 0 },
- { X86::VPERMBZrr, X86::VPERMBZrm, 0 },
- { X86::VPERMDZrr, X86::VPERMDZrm, 0 },
- { X86::VPERMILPDZrr, X86::VPERMILPDZrm, 0 },
- { X86::VPERMILPSZrr, X86::VPERMILPSZrm, 0 },
- { X86::VPERMPDZrr, X86::VPERMPDZrm, 0 },
- { X86::VPERMPSZrr, X86::VPERMPSZrm, 0 },
- { X86::VPERMQZrr, X86::VPERMQZrm, 0 },
- { X86::VPERMWZrr, X86::VPERMWZrm, 0 },
- { X86::VPINSRBZrr, X86::VPINSRBZrm, 0 },
- { X86::VPINSRDZrr, X86::VPINSRDZrm, 0 },
- { X86::VPINSRQZrr, X86::VPINSRQZrm, 0 },
- { X86::VPINSRWZrr, X86::VPINSRWZrm, 0 },
- { X86::VPMADDUBSWZrr, X86::VPMADDUBSWZrm, 0 },
- { X86::VPMADDWDZrr, X86::VPMADDWDZrm, 0 },
- { X86::VPMAXSBZrr, X86::VPMAXSBZrm, 0 },
- { X86::VPMAXSDZrr, X86::VPMAXSDZrm, 0 },
- { X86::VPMAXSQZrr, X86::VPMAXSQZrm, 0 },
- { X86::VPMAXSWZrr, X86::VPMAXSWZrm, 0 },
- { X86::VPMAXUBZrr, X86::VPMAXUBZrm, 0 },
- { X86::VPMAXUDZrr, X86::VPMAXUDZrm, 0 },
- { X86::VPMAXUQZrr, X86::VPMAXUQZrm, 0 },
- { X86::VPMAXUWZrr, X86::VPMAXUWZrm, 0 },
- { X86::VPMINSBZrr, X86::VPMINSBZrm, 0 },
- { X86::VPMINSDZrr, X86::VPMINSDZrm, 0 },
- { X86::VPMINSQZrr, X86::VPMINSQZrm, 0 },
- { X86::VPMINSWZrr, X86::VPMINSWZrm, 0 },
- { X86::VPMINUBZrr, X86::VPMINUBZrm, 0 },
- { X86::VPMINUDZrr, X86::VPMINUDZrm, 0 },
- { X86::VPMINUQZrr, X86::VPMINUQZrm, 0 },
- { X86::VPMINUWZrr, X86::VPMINUWZrm, 0 },
- { X86::VPMULDQZrr, X86::VPMULDQZrm, 0 },
- { X86::VPMULLDZrr, X86::VPMULLDZrm, 0 },
- { X86::VPMULLQZrr, X86::VPMULLQZrm, 0 },
- { X86::VPMULLWZrr, X86::VPMULLWZrm, 0 },
- { X86::VPMULUDQZrr, X86::VPMULUDQZrm, 0 },
- { X86::VPORDZrr, X86::VPORDZrm, 0 },
- { X86::VPORQZrr, X86::VPORQZrm, 0 },
- { X86::VPSADBWZ512rr, X86::VPSADBWZ512rm, 0 },
- { X86::VPSHUFBZrr, X86::VPSHUFBZrm, 0 },
- { X86::VPSLLDZrr, X86::VPSLLDZrm, 0 },
- { X86::VPSLLQZrr, X86::VPSLLQZrm, 0 },
- { X86::VPSLLVDZrr, X86::VPSLLVDZrm, 0 },
- { X86::VPSLLVQZrr, X86::VPSLLVQZrm, 0 },
- { X86::VPSLLVWZrr, X86::VPSLLVWZrm, 0 },
- { X86::VPSLLWZrr, X86::VPSLLWZrm, 0 },
- { X86::VPSRADZrr, X86::VPSRADZrm, 0 },
- { X86::VPSRAQZrr, X86::VPSRAQZrm, 0 },
- { X86::VPSRAVDZrr, X86::VPSRAVDZrm, 0 },
- { X86::VPSRAVQZrr, X86::VPSRAVQZrm, 0 },
- { X86::VPSRAVWZrr, X86::VPSRAVWZrm, 0 },
- { X86::VPSRAWZrr, X86::VPSRAWZrm, 0 },
- { X86::VPSRLDZrr, X86::VPSRLDZrm, 0 },
- { X86::VPSRLQZrr, X86::VPSRLQZrm, 0 },
- { X86::VPSRLVDZrr, X86::VPSRLVDZrm, 0 },
- { X86::VPSRLVQZrr, X86::VPSRLVQZrm, 0 },
- { X86::VPSRLVWZrr, X86::VPSRLVWZrm, 0 },
- { X86::VPSRLWZrr, X86::VPSRLWZrm, 0 },
- { X86::VPSUBBZrr, X86::VPSUBBZrm, 0 },
- { X86::VPSUBDZrr, X86::VPSUBDZrm, 0 },
- { X86::VPSUBQZrr, X86::VPSUBQZrm, 0 },
- { X86::VPSUBSBZrr, X86::VPSUBSBZrm, 0 },
- { X86::VPSUBSWZrr, X86::VPSUBSWZrm, 0 },
- { X86::VPSUBUSBZrr, X86::VPSUBUSBZrm, 0 },
- { X86::VPSUBUSWZrr, X86::VPSUBUSWZrm, 0 },
- { X86::VPSUBWZrr, X86::VPSUBWZrm, 0 },
- { X86::VPUNPCKHBWZrr, X86::VPUNPCKHBWZrm, 0 },
- { X86::VPUNPCKHDQZrr, X86::VPUNPCKHDQZrm, 0 },
- { X86::VPUNPCKHQDQZrr, X86::VPUNPCKHQDQZrm, 0 },
- { X86::VPUNPCKHWDZrr, X86::VPUNPCKHWDZrm, 0 },
- { X86::VPUNPCKLBWZrr, X86::VPUNPCKLBWZrm, 0 },
- { X86::VPUNPCKLDQZrr, X86::VPUNPCKLDQZrm, 0 },
- { X86::VPUNPCKLQDQZrr, X86::VPUNPCKLQDQZrm, 0 },
- { X86::VPUNPCKLWDZrr, X86::VPUNPCKLWDZrm, 0 },
- { X86::VPXORDZrr, X86::VPXORDZrm, 0 },
- { X86::VPXORQZrr, X86::VPXORQZrm, 0 },
- { X86::VSHUFPDZrri, X86::VSHUFPDZrmi, 0 },
- { X86::VSHUFPSZrri, X86::VSHUFPSZrmi, 0 },
- { X86::VSUBPDZrr, X86::VSUBPDZrm, 0 },
- { X86::VSUBPSZrr, X86::VSUBPSZrm, 0 },
- { X86::VSUBSDZrr, X86::VSUBSDZrm, 0 },
- { X86::VSUBSDZrr_Int, X86::VSUBSDZrm_Int, TB_NO_REVERSE },
- { X86::VSUBSSZrr, X86::VSUBSSZrm, 0 },
- { X86::VSUBSSZrr_Int, X86::VSUBSSZrm_Int, TB_NO_REVERSE },
- { X86::VUNPCKHPDZrr, X86::VUNPCKHPDZrm, 0 },
- { X86::VUNPCKHPSZrr, X86::VUNPCKHPSZrm, 0 },
- { X86::VUNPCKLPDZrr, X86::VUNPCKLPDZrm, 0 },
- { X86::VUNPCKLPSZrr, X86::VUNPCKLPSZrm, 0 },
- { X86::VXORPDZrr, X86::VXORPDZrm, 0 },
- { X86::VXORPSZrr, X86::VXORPSZrm, 0 },
-
- // AVX-512{F,VL} foldable instructions
- { X86::VADDPDZ128rr, X86::VADDPDZ128rm, 0 },
- { X86::VADDPDZ256rr, X86::VADDPDZ256rm, 0 },
- { X86::VADDPSZ128rr, X86::VADDPSZ128rm, 0 },
- { X86::VADDPSZ256rr, X86::VADDPSZ256rm, 0 },
- { X86::VALIGNDZ128rri, X86::VALIGNDZ128rmi, 0 },
- { X86::VALIGNDZ256rri, X86::VALIGNDZ256rmi, 0 },
- { X86::VALIGNQZ128rri, X86::VALIGNQZ128rmi, 0 },
- { X86::VALIGNQZ256rri, X86::VALIGNQZ256rmi, 0 },
- { X86::VANDNPDZ128rr, X86::VANDNPDZ128rm, 0 },
- { X86::VANDNPDZ256rr, X86::VANDNPDZ256rm, 0 },
- { X86::VANDNPSZ128rr, X86::VANDNPSZ128rm, 0 },
- { X86::VANDNPSZ256rr, X86::VANDNPSZ256rm, 0 },
- { X86::VANDPDZ128rr, X86::VANDPDZ128rm, 0 },
- { X86::VANDPDZ256rr, X86::VANDPDZ256rm, 0 },
- { X86::VANDPSZ128rr, X86::VANDPSZ128rm, 0 },
- { X86::VANDPSZ256rr, X86::VANDPSZ256rm, 0 },
- { X86::VCMPPDZ128rri, X86::VCMPPDZ128rmi, 0 },
- { X86::VCMPPDZ256rri, X86::VCMPPDZ256rmi, 0 },
- { X86::VCMPPSZ128rri, X86::VCMPPSZ128rmi, 0 },
- { X86::VCMPPSZ256rri, X86::VCMPPSZ256rmi, 0 },
- { X86::VDIVPDZ128rr, X86::VDIVPDZ128rm, 0 },
- { X86::VDIVPDZ256rr, X86::VDIVPDZ256rm, 0 },
- { X86::VDIVPSZ128rr, X86::VDIVPSZ128rm, 0 },
- { X86::VDIVPSZ256rr, X86::VDIVPSZ256rm, 0 },
- { X86::VINSERTF32x4Z256rr,X86::VINSERTF32x4Z256rm, 0 },
- { X86::VINSERTF64x2Z256rr,X86::VINSERTF64x2Z256rm, 0 },
- { X86::VINSERTI32x4Z256rr,X86::VINSERTI32x4Z256rm, 0 },
- { X86::VINSERTI64x2Z256rr,X86::VINSERTI64x2Z256rm, 0 },
- { X86::VMAXCPDZ128rr, X86::VMAXCPDZ128rm, 0 },
- { X86::VMAXCPDZ256rr, X86::VMAXCPDZ256rm, 0 },
- { X86::VMAXCPSZ128rr, X86::VMAXCPSZ128rm, 0 },
- { X86::VMAXCPSZ256rr, X86::VMAXCPSZ256rm, 0 },
- { X86::VMAXPDZ128rr, X86::VMAXPDZ128rm, 0 },
- { X86::VMAXPDZ256rr, X86::VMAXPDZ256rm, 0 },
- { X86::VMAXPSZ128rr, X86::VMAXPSZ128rm, 0 },
- { X86::VMAXPSZ256rr, X86::VMAXPSZ256rm, 0 },
- { X86::VMINCPDZ128rr, X86::VMINCPDZ128rm, 0 },
- { X86::VMINCPDZ256rr, X86::VMINCPDZ256rm, 0 },
- { X86::VMINCPSZ128rr, X86::VMINCPSZ128rm, 0 },
- { X86::VMINCPSZ256rr, X86::VMINCPSZ256rm, 0 },
- { X86::VMINPDZ128rr, X86::VMINPDZ128rm, 0 },
- { X86::VMINPDZ256rr, X86::VMINPDZ256rm, 0 },
- { X86::VMINPSZ128rr, X86::VMINPSZ128rm, 0 },
- { X86::VMINPSZ256rr, X86::VMINPSZ256rm, 0 },
- { X86::VMULPDZ128rr, X86::VMULPDZ128rm, 0 },
- { X86::VMULPDZ256rr, X86::VMULPDZ256rm, 0 },
- { X86::VMULPSZ128rr, X86::VMULPSZ128rm, 0 },
- { X86::VMULPSZ256rr, X86::VMULPSZ256rm, 0 },
- { X86::VORPDZ128rr, X86::VORPDZ128rm, 0 },
- { X86::VORPDZ256rr, X86::VORPDZ256rm, 0 },
- { X86::VORPSZ128rr, X86::VORPSZ128rm, 0 },
- { X86::VORPSZ256rr, X86::VORPSZ256rm, 0 },
- { X86::VPACKSSDWZ256rr, X86::VPACKSSDWZ256rm, 0 },
- { X86::VPACKSSDWZ128rr, X86::VPACKSSDWZ128rm, 0 },
- { X86::VPACKSSWBZ256rr, X86::VPACKSSWBZ256rm, 0 },
- { X86::VPACKSSWBZ128rr, X86::VPACKSSWBZ128rm, 0 },
- { X86::VPACKUSDWZ256rr, X86::VPACKUSDWZ256rm, 0 },
- { X86::VPACKUSDWZ128rr, X86::VPACKUSDWZ128rm, 0 },
- { X86::VPACKUSWBZ256rr, X86::VPACKUSWBZ256rm, 0 },
- { X86::VPACKUSWBZ128rr, X86::VPACKUSWBZ128rm, 0 },
- { X86::VPADDBZ128rr, X86::VPADDBZ128rm, 0 },
- { X86::VPADDBZ256rr, X86::VPADDBZ256rm, 0 },
- { X86::VPADDDZ128rr, X86::VPADDDZ128rm, 0 },
- { X86::VPADDDZ256rr, X86::VPADDDZ256rm, 0 },
- { X86::VPADDQZ128rr, X86::VPADDQZ128rm, 0 },
- { X86::VPADDQZ256rr, X86::VPADDQZ256rm, 0 },
- { X86::VPADDSBZ128rr, X86::VPADDSBZ128rm, 0 },
- { X86::VPADDSBZ256rr, X86::VPADDSBZ256rm, 0 },
- { X86::VPADDSWZ128rr, X86::VPADDSWZ128rm, 0 },
- { X86::VPADDSWZ256rr, X86::VPADDSWZ256rm, 0 },
- { X86::VPADDUSBZ128rr, X86::VPADDUSBZ128rm, 0 },
- { X86::VPADDUSBZ256rr, X86::VPADDUSBZ256rm, 0 },
- { X86::VPADDUSWZ128rr, X86::VPADDUSWZ128rm, 0 },
- { X86::VPADDUSWZ256rr, X86::VPADDUSWZ256rm, 0 },
- { X86::VPADDWZ128rr, X86::VPADDWZ128rm, 0 },
- { X86::VPADDWZ256rr, X86::VPADDWZ256rm, 0 },
- { X86::VPALIGNRZ128rri, X86::VPALIGNRZ128rmi, 0 },
- { X86::VPALIGNRZ256rri, X86::VPALIGNRZ256rmi, 0 },
- { X86::VPANDDZ128rr, X86::VPANDDZ128rm, 0 },
- { X86::VPANDDZ256rr, X86::VPANDDZ256rm, 0 },
- { X86::VPANDNDZ128rr, X86::VPANDNDZ128rm, 0 },
- { X86::VPANDNDZ256rr, X86::VPANDNDZ256rm, 0 },
- { X86::VPANDNQZ128rr, X86::VPANDNQZ128rm, 0 },
- { X86::VPANDNQZ256rr, X86::VPANDNQZ256rm, 0 },
- { X86::VPANDQZ128rr, X86::VPANDQZ128rm, 0 },
- { X86::VPANDQZ256rr, X86::VPANDQZ256rm, 0 },
- { X86::VPAVGBZ128rr, X86::VPAVGBZ128rm, 0 },
- { X86::VPAVGBZ256rr, X86::VPAVGBZ256rm, 0 },
- { X86::VPAVGWZ128rr, X86::VPAVGWZ128rm, 0 },
- { X86::VPAVGWZ256rr, X86::VPAVGWZ256rm, 0 },
- { X86::VPCMPBZ128rri, X86::VPCMPBZ128rmi, 0 },
- { X86::VPCMPBZ256rri, X86::VPCMPBZ256rmi, 0 },
- { X86::VPCMPDZ128rri, X86::VPCMPDZ128rmi, 0 },
- { X86::VPCMPDZ256rri, X86::VPCMPDZ256rmi, 0 },
- { X86::VPCMPEQBZ128rr, X86::VPCMPEQBZ128rm, 0 },
- { X86::VPCMPEQBZ256rr, X86::VPCMPEQBZ256rm, 0 },
- { X86::VPCMPEQDZ128rr, X86::VPCMPEQDZ128rm, 0 },
- { X86::VPCMPEQDZ256rr, X86::VPCMPEQDZ256rm, 0 },
- { X86::VPCMPEQQZ128rr, X86::VPCMPEQQZ128rm, 0 },
- { X86::VPCMPEQQZ256rr, X86::VPCMPEQQZ256rm, 0 },
- { X86::VPCMPEQWZ128rr, X86::VPCMPEQWZ128rm, 0 },
- { X86::VPCMPEQWZ256rr, X86::VPCMPEQWZ256rm, 0 },
- { X86::VPCMPGTBZ128rr, X86::VPCMPGTBZ128rm, 0 },
- { X86::VPCMPGTBZ256rr, X86::VPCMPGTBZ256rm, 0 },
- { X86::VPCMPGTDZ128rr, X86::VPCMPGTDZ128rm, 0 },
- { X86::VPCMPGTDZ256rr, X86::VPCMPGTDZ256rm, 0 },
- { X86::VPCMPGTQZ128rr, X86::VPCMPGTQZ128rm, 0 },
- { X86::VPCMPGTQZ256rr, X86::VPCMPGTQZ256rm, 0 },
- { X86::VPCMPGTWZ128rr, X86::VPCMPGTWZ128rm, 0 },
- { X86::VPCMPGTWZ256rr, X86::VPCMPGTWZ256rm, 0 },
- { X86::VPCMPQZ128rri, X86::VPCMPQZ128rmi, 0 },
- { X86::VPCMPQZ256rri, X86::VPCMPQZ256rmi, 0 },
- { X86::VPCMPUBZ128rri, X86::VPCMPUBZ128rmi, 0 },
- { X86::VPCMPUBZ256rri, X86::VPCMPUBZ256rmi, 0 },
- { X86::VPCMPUDZ128rri, X86::VPCMPUDZ128rmi, 0 },
- { X86::VPCMPUDZ256rri, X86::VPCMPUDZ256rmi, 0 },
- { X86::VPCMPUQZ128rri, X86::VPCMPUQZ128rmi, 0 },
- { X86::VPCMPUQZ256rri, X86::VPCMPUQZ256rmi, 0 },
- { X86::VPCMPUWZ128rri, X86::VPCMPUWZ128rmi, 0 },
- { X86::VPCMPUWZ256rri, X86::VPCMPUWZ256rmi, 0 },
- { X86::VPCMPWZ128rri, X86::VPCMPWZ128rmi, 0 },
- { X86::VPCMPWZ256rri, X86::VPCMPWZ256rmi, 0 },
- { X86::VPERMBZ128rr, X86::VPERMBZ128rm, 0 },
- { X86::VPERMBZ256rr, X86::VPERMBZ256rm, 0 },
- { X86::VPERMDZ256rr, X86::VPERMDZ256rm, 0 },
- { X86::VPERMILPDZ128rr, X86::VPERMILPDZ128rm, 0 },
- { X86::VPERMILPDZ256rr, X86::VPERMILPDZ256rm, 0 },
- { X86::VPERMILPSZ128rr, X86::VPERMILPSZ128rm, 0 },
- { X86::VPERMILPSZ256rr, X86::VPERMILPSZ256rm, 0 },
- { X86::VPERMPDZ256rr, X86::VPERMPDZ256rm, 0 },
- { X86::VPERMPSZ256rr, X86::VPERMPSZ256rm, 0 },
- { X86::VPERMQZ256rr, X86::VPERMQZ256rm, 0 },
- { X86::VPERMWZ128rr, X86::VPERMWZ128rm, 0 },
- { X86::VPERMWZ256rr, X86::VPERMWZ256rm, 0 },
- { X86::VPMADDUBSWZ128rr, X86::VPMADDUBSWZ128rm, 0 },
- { X86::VPMADDUBSWZ256rr, X86::VPMADDUBSWZ256rm, 0 },
- { X86::VPMADDWDZ128rr, X86::VPMADDWDZ128rm, 0 },
- { X86::VPMADDWDZ256rr, X86::VPMADDWDZ256rm, 0 },
- { X86::VPMAXSBZ128rr, X86::VPMAXSBZ128rm, 0 },
- { X86::VPMAXSBZ256rr, X86::VPMAXSBZ256rm, 0 },
- { X86::VPMAXSDZ128rr, X86::VPMAXSDZ128rm, 0 },
- { X86::VPMAXSDZ256rr, X86::VPMAXSDZ256rm, 0 },
- { X86::VPMAXSQZ128rr, X86::VPMAXSQZ128rm, 0 },
- { X86::VPMAXSQZ256rr, X86::VPMAXSQZ256rm, 0 },
- { X86::VPMAXSWZ128rr, X86::VPMAXSWZ128rm, 0 },
- { X86::VPMAXSWZ256rr, X86::VPMAXSWZ256rm, 0 },
- { X86::VPMAXUBZ128rr, X86::VPMAXUBZ128rm, 0 },
- { X86::VPMAXUBZ256rr, X86::VPMAXUBZ256rm, 0 },
- { X86::VPMAXUDZ128rr, X86::VPMAXUDZ128rm, 0 },
- { X86::VPMAXUDZ256rr, X86::VPMAXUDZ256rm, 0 },
- { X86::VPMAXUQZ128rr, X86::VPMAXUQZ128rm, 0 },
- { X86::VPMAXUQZ256rr, X86::VPMAXUQZ256rm, 0 },
- { X86::VPMAXUWZ128rr, X86::VPMAXUWZ128rm, 0 },
- { X86::VPMAXUWZ256rr, X86::VPMAXUWZ256rm, 0 },
- { X86::VPMINSBZ128rr, X86::VPMINSBZ128rm, 0 },
- { X86::VPMINSBZ256rr, X86::VPMINSBZ256rm, 0 },
- { X86::VPMINSDZ128rr, X86::VPMINSDZ128rm, 0 },
- { X86::VPMINSDZ256rr, X86::VPMINSDZ256rm, 0 },
- { X86::VPMINSQZ128rr, X86::VPMINSQZ128rm, 0 },
- { X86::VPMINSQZ256rr, X86::VPMINSQZ256rm, 0 },
- { X86::VPMINSWZ128rr, X86::VPMINSWZ128rm, 0 },
- { X86::VPMINSWZ256rr, X86::VPMINSWZ256rm, 0 },
- { X86::VPMINUBZ128rr, X86::VPMINUBZ128rm, 0 },
- { X86::VPMINUBZ256rr, X86::VPMINUBZ256rm, 0 },
- { X86::VPMINUDZ128rr, X86::VPMINUDZ128rm, 0 },
- { X86::VPMINUDZ256rr, X86::VPMINUDZ256rm, 0 },
- { X86::VPMINUQZ128rr, X86::VPMINUQZ128rm, 0 },
- { X86::VPMINUQZ256rr, X86::VPMINUQZ256rm, 0 },
- { X86::VPMINUWZ128rr, X86::VPMINUWZ128rm, 0 },
- { X86::VPMINUWZ256rr, X86::VPMINUWZ256rm, 0 },
- { X86::VPMULDQZ128rr, X86::VPMULDQZ128rm, 0 },
- { X86::VPMULDQZ256rr, X86::VPMULDQZ256rm, 0 },
- { X86::VPMULLDZ128rr, X86::VPMULLDZ128rm, 0 },
- { X86::VPMULLDZ256rr, X86::VPMULLDZ256rm, 0 },
- { X86::VPMULLQZ128rr, X86::VPMULLQZ128rm, 0 },
- { X86::VPMULLQZ256rr, X86::VPMULLQZ256rm, 0 },
- { X86::VPMULLWZ128rr, X86::VPMULLWZ128rm, 0 },
- { X86::VPMULLWZ256rr, X86::VPMULLWZ256rm, 0 },
- { X86::VPMULUDQZ128rr, X86::VPMULUDQZ128rm, 0 },
- { X86::VPMULUDQZ256rr, X86::VPMULUDQZ256rm, 0 },
- { X86::VPORDZ128rr, X86::VPORDZ128rm, 0 },
- { X86::VPORDZ256rr, X86::VPORDZ256rm, 0 },
- { X86::VPORQZ128rr, X86::VPORQZ128rm, 0 },
- { X86::VPORQZ256rr, X86::VPORQZ256rm, 0 },
- { X86::VPSADBWZ128rr, X86::VPSADBWZ128rm, 0 },
- { X86::VPSADBWZ256rr, X86::VPSADBWZ256rm, 0 },
- { X86::VPSHUFBZ128rr, X86::VPSHUFBZ128rm, 0 },
- { X86::VPSHUFBZ256rr, X86::VPSHUFBZ256rm, 0 },
- { X86::VPSLLDZ128rr, X86::VPSLLDZ128rm, 0 },
- { X86::VPSLLDZ256rr, X86::VPSLLDZ256rm, 0 },
- { X86::VPSLLQZ128rr, X86::VPSLLQZ128rm, 0 },
- { X86::VPSLLQZ256rr, X86::VPSLLQZ256rm, 0 },
- { X86::VPSLLVDZ128rr, X86::VPSLLVDZ128rm, 0 },
- { X86::VPSLLVDZ256rr, X86::VPSLLVDZ256rm, 0 },
- { X86::VPSLLVQZ128rr, X86::VPSLLVQZ128rm, 0 },
- { X86::VPSLLVQZ256rr, X86::VPSLLVQZ256rm, 0 },
- { X86::VPSLLVWZ128rr, X86::VPSLLVWZ128rm, 0 },
- { X86::VPSLLVWZ256rr, X86::VPSLLVWZ256rm, 0 },
- { X86::VPSLLWZ128rr, X86::VPSLLWZ128rm, 0 },
- { X86::VPSLLWZ256rr, X86::VPSLLWZ256rm, 0 },
- { X86::VPSRADZ128rr, X86::VPSRADZ128rm, 0 },
- { X86::VPSRADZ256rr, X86::VPSRADZ256rm, 0 },
- { X86::VPSRAQZ128rr, X86::VPSRAQZ128rm, 0 },
- { X86::VPSRAQZ256rr, X86::VPSRAQZ256rm, 0 },
- { X86::VPSRAVDZ128rr, X86::VPSRAVDZ128rm, 0 },
- { X86::VPSRAVDZ256rr, X86::VPSRAVDZ256rm, 0 },
- { X86::VPSRAVQZ128rr, X86::VPSRAVQZ128rm, 0 },
- { X86::VPSRAVQZ256rr, X86::VPSRAVQZ256rm, 0 },
- { X86::VPSRAVWZ128rr, X86::VPSRAVWZ128rm, 0 },
- { X86::VPSRAVWZ256rr, X86::VPSRAVWZ256rm, 0 },
- { X86::VPSRAWZ128rr, X86::VPSRAWZ128rm, 0 },
- { X86::VPSRAWZ256rr, X86::VPSRAWZ256rm, 0 },
- { X86::VPSRLDZ128rr, X86::VPSRLDZ128rm, 0 },
- { X86::VPSRLDZ256rr, X86::VPSRLDZ256rm, 0 },
- { X86::VPSRLQZ128rr, X86::VPSRLQZ128rm, 0 },
- { X86::VPSRLQZ256rr, X86::VPSRLQZ256rm, 0 },
- { X86::VPSRLVDZ128rr, X86::VPSRLVDZ128rm, 0 },
- { X86::VPSRLVDZ256rr, X86::VPSRLVDZ256rm, 0 },
- { X86::VPSRLVQZ128rr, X86::VPSRLVQZ128rm, 0 },
- { X86::VPSRLVQZ256rr, X86::VPSRLVQZ256rm, 0 },
- { X86::VPSRLVWZ128rr, X86::VPSRLVWZ128rm, 0 },
- { X86::VPSRLVWZ256rr, X86::VPSRLVWZ256rm, 0 },
- { X86::VPSRLWZ128rr, X86::VPSRLWZ128rm, 0 },
- { X86::VPSRLWZ256rr, X86::VPSRLWZ256rm, 0 },
- { X86::VPSUBBZ128rr, X86::VPSUBBZ128rm, 0 },
- { X86::VPSUBBZ256rr, X86::VPSUBBZ256rm, 0 },
- { X86::VPSUBDZ128rr, X86::VPSUBDZ128rm, 0 },
- { X86::VPSUBDZ256rr, X86::VPSUBDZ256rm, 0 },
- { X86::VPSUBQZ128rr, X86::VPSUBQZ128rm, 0 },
- { X86::VPSUBQZ256rr, X86::VPSUBQZ256rm, 0 },
- { X86::VPSUBSBZ128rr, X86::VPSUBSBZ128rm, 0 },
- { X86::VPSUBSBZ256rr, X86::VPSUBSBZ256rm, 0 },
- { X86::VPSUBSWZ128rr, X86::VPSUBSWZ128rm, 0 },
- { X86::VPSUBSWZ256rr, X86::VPSUBSWZ256rm, 0 },
- { X86::VPSUBUSBZ128rr, X86::VPSUBUSBZ128rm, 0 },
- { X86::VPSUBUSBZ256rr, X86::VPSUBUSBZ256rm, 0 },
- { X86::VPSUBUSWZ128rr, X86::VPSUBUSWZ128rm, 0 },
- { X86::VPSUBUSWZ256rr, X86::VPSUBUSWZ256rm, 0 },
- { X86::VPSUBWZ128rr, X86::VPSUBWZ128rm, 0 },
- { X86::VPSUBWZ256rr, X86::VPSUBWZ256rm, 0 },
- { X86::VPUNPCKHBWZ128rr, X86::VPUNPCKHBWZ128rm, 0 },
- { X86::VPUNPCKHBWZ256rr, X86::VPUNPCKHBWZ256rm, 0 },
- { X86::VPUNPCKHDQZ128rr, X86::VPUNPCKHDQZ128rm, 0 },
- { X86::VPUNPCKHDQZ256rr, X86::VPUNPCKHDQZ256rm, 0 },
- { X86::VPUNPCKHQDQZ128rr, X86::VPUNPCKHQDQZ128rm, 0 },
- { X86::VPUNPCKHQDQZ256rr, X86::VPUNPCKHQDQZ256rm, 0 },
- { X86::VPUNPCKHWDZ128rr, X86::VPUNPCKHWDZ128rm, 0 },
- { X86::VPUNPCKHWDZ256rr, X86::VPUNPCKHWDZ256rm, 0 },
- { X86::VPUNPCKLBWZ128rr, X86::VPUNPCKLBWZ128rm, 0 },
- { X86::VPUNPCKLBWZ256rr, X86::VPUNPCKLBWZ256rm, 0 },
- { X86::VPUNPCKLDQZ128rr, X86::VPUNPCKLDQZ128rm, 0 },
- { X86::VPUNPCKLDQZ256rr, X86::VPUNPCKLDQZ256rm, 0 },
- { X86::VPUNPCKLQDQZ128rr, X86::VPUNPCKLQDQZ128rm, 0 },
- { X86::VPUNPCKLQDQZ256rr, X86::VPUNPCKLQDQZ256rm, 0 },
- { X86::VPUNPCKLWDZ128rr, X86::VPUNPCKLWDZ128rm, 0 },
- { X86::VPUNPCKLWDZ256rr, X86::VPUNPCKLWDZ256rm, 0 },
- { X86::VPXORDZ128rr, X86::VPXORDZ128rm, 0 },
- { X86::VPXORDZ256rr, X86::VPXORDZ256rm, 0 },
- { X86::VPXORQZ128rr, X86::VPXORQZ128rm, 0 },
- { X86::VPXORQZ256rr, X86::VPXORQZ256rm, 0 },
- { X86::VSHUFPDZ128rri, X86::VSHUFPDZ128rmi, 0 },
- { X86::VSHUFPDZ256rri, X86::VSHUFPDZ256rmi, 0 },
- { X86::VSHUFPSZ128rri, X86::VSHUFPSZ128rmi, 0 },
- { X86::VSHUFPSZ256rri, X86::VSHUFPSZ256rmi, 0 },
- { X86::VSUBPDZ128rr, X86::VSUBPDZ128rm, 0 },
- { X86::VSUBPDZ256rr, X86::VSUBPDZ256rm, 0 },
- { X86::VSUBPSZ128rr, X86::VSUBPSZ128rm, 0 },
- { X86::VSUBPSZ256rr, X86::VSUBPSZ256rm, 0 },
- { X86::VUNPCKHPDZ128rr, X86::VUNPCKHPDZ128rm, 0 },
- { X86::VUNPCKHPDZ256rr, X86::VUNPCKHPDZ256rm, 0 },
- { X86::VUNPCKHPSZ128rr, X86::VUNPCKHPSZ128rm, 0 },
- { X86::VUNPCKHPSZ256rr, X86::VUNPCKHPSZ256rm, 0 },
- { X86::VUNPCKLPDZ128rr, X86::VUNPCKLPDZ128rm, 0 },
- { X86::VUNPCKLPDZ256rr, X86::VUNPCKLPDZ256rm, 0 },
- { X86::VUNPCKLPSZ128rr, X86::VUNPCKLPSZ128rm, 0 },
- { X86::VUNPCKLPSZ256rr, X86::VUNPCKLPSZ256rm, 0 },
- { X86::VXORPDZ128rr, X86::VXORPDZ128rm, 0 },
- { X86::VXORPDZ256rr, X86::VXORPDZ256rm, 0 },
- { X86::VXORPSZ128rr, X86::VXORPSZ128rm, 0 },
- { X86::VXORPSZ256rr, X86::VXORPSZ256rm, 0 },
-
- // AVX-512 masked foldable instructions
- { X86::VBROADCASTSSZrkz, X86::VBROADCASTSSZmkz, TB_NO_REVERSE },
- { X86::VBROADCASTSDZrkz, X86::VBROADCASTSDZmkz, TB_NO_REVERSE },
- { X86::VPABSBZrrkz, X86::VPABSBZrmkz, 0 },
- { X86::VPABSDZrrkz, X86::VPABSDZrmkz, 0 },
- { X86::VPABSQZrrkz, X86::VPABSQZrmkz, 0 },
- { X86::VPABSWZrrkz, X86::VPABSWZrmkz, 0 },
- { X86::VPERMILPDZrikz, X86::VPERMILPDZmikz, 0 },
- { X86::VPERMILPSZrikz, X86::VPERMILPSZmikz, 0 },
- { X86::VPERMPDZrikz, X86::VPERMPDZmikz, 0 },
- { X86::VPERMQZrikz, X86::VPERMQZmikz, 0 },
- { X86::VPMOVSXBDZrrkz, X86::VPMOVSXBDZrmkz, 0 },
- { X86::VPMOVSXBQZrrkz, X86::VPMOVSXBQZrmkz, TB_NO_REVERSE },
- { X86::VPMOVSXBWZrrkz, X86::VPMOVSXBWZrmkz, 0 },
- { X86::VPMOVSXDQZrrkz, X86::VPMOVSXDQZrmkz, 0 },
- { X86::VPMOVSXWDZrrkz, X86::VPMOVSXWDZrmkz, 0 },
- { X86::VPMOVSXWQZrrkz, X86::VPMOVSXWQZrmkz, 0 },
- { X86::VPMOVZXBDZrrkz, X86::VPMOVZXBDZrmkz, 0 },
- { X86::VPMOVZXBQZrrkz, X86::VPMOVZXBQZrmkz, TB_NO_REVERSE },
- { X86::VPMOVZXBWZrrkz, X86::VPMOVZXBWZrmkz, 0 },
- { X86::VPMOVZXDQZrrkz, X86::VPMOVZXDQZrmkz, 0 },
- { X86::VPMOVZXWDZrrkz, X86::VPMOVZXWDZrmkz, 0 },
- { X86::VPMOVZXWQZrrkz, X86::VPMOVZXWQZrmkz, 0 },
- { X86::VPSHUFDZrikz, X86::VPSHUFDZmikz, 0 },
- { X86::VPSHUFHWZrikz, X86::VPSHUFHWZmikz, 0 },
- { X86::VPSHUFLWZrikz, X86::VPSHUFLWZmikz, 0 },
- { X86::VPSLLDZrikz, X86::VPSLLDZmikz, 0 },
- { X86::VPSLLQZrikz, X86::VPSLLQZmikz, 0 },
- { X86::VPSLLWZrikz, X86::VPSLLWZmikz, 0 },
- { X86::VPSRADZrikz, X86::VPSRADZmikz, 0 },
- { X86::VPSRAQZrikz, X86::VPSRAQZmikz, 0 },
- { X86::VPSRAWZrikz, X86::VPSRAWZmikz, 0 },
- { X86::VPSRLDZrikz, X86::VPSRLDZmikz, 0 },
- { X86::VPSRLQZrikz, X86::VPSRLQZmikz, 0 },
- { X86::VPSRLWZrikz, X86::VPSRLWZmikz, 0 },
-
- // AVX-512VL 256-bit masked foldable instructions
- { X86::VBROADCASTSDZ256rkz, X86::VBROADCASTSDZ256mkz, TB_NO_REVERSE },
- { X86::VBROADCASTSSZ256rkz, X86::VBROADCASTSSZ256mkz, TB_NO_REVERSE },
- { X86::VPABSBZ256rrkz, X86::VPABSBZ256rmkz, 0 },
- { X86::VPABSDZ256rrkz, X86::VPABSDZ256rmkz, 0 },
- { X86::VPABSQZ256rrkz, X86::VPABSQZ256rmkz, 0 },
- { X86::VPABSWZ256rrkz, X86::VPABSWZ256rmkz, 0 },
- { X86::VPERMILPDZ256rikz, X86::VPERMILPDZ256mikz, 0 },
- { X86::VPERMILPSZ256rikz, X86::VPERMILPSZ256mikz, 0 },
- { X86::VPERMPDZ256rikz, X86::VPERMPDZ256mikz, 0 },
- { X86::VPERMQZ256rikz, X86::VPERMQZ256mikz, 0 },
- { X86::VPMOVSXBDZ256rrkz, X86::VPMOVSXBDZ256rmkz, TB_NO_REVERSE },
- { X86::VPMOVSXBQZ256rrkz, X86::VPMOVSXBQZ256rmkz, TB_NO_REVERSE },
- { X86::VPMOVSXBWZ256rrkz, X86::VPMOVSXBWZ256rmkz, 0 },
- { X86::VPMOVSXDQZ256rrkz, X86::VPMOVSXDQZ256rmkz, 0 },
- { X86::VPMOVSXWDZ256rrkz, X86::VPMOVSXWDZ256rmkz, 0 },
- { X86::VPMOVSXWQZ256rrkz, X86::VPMOVSXWQZ256rmkz, TB_NO_REVERSE },
- { X86::VPMOVZXBDZ256rrkz, X86::VPMOVZXBDZ256rmkz, TB_NO_REVERSE },
- { X86::VPMOVZXBQZ256rrkz, X86::VPMOVZXBQZ256rmkz, TB_NO_REVERSE },
- { X86::VPMOVZXBWZ256rrkz, X86::VPMOVZXBWZ256rmkz, 0 },
- { X86::VPMOVZXDQZ256rrkz, X86::VPMOVZXDQZ256rmkz, 0 },
- { X86::VPMOVZXWDZ256rrkz, X86::VPMOVZXWDZ256rmkz, 0 },
- { X86::VPMOVZXWQZ256rrkz, X86::VPMOVZXWQZ256rmkz, TB_NO_REVERSE },
- { X86::VPSHUFDZ256rikz, X86::VPSHUFDZ256mikz, 0 },
- { X86::VPSHUFHWZ256rikz, X86::VPSHUFHWZ256mikz, 0 },
- { X86::VPSHUFLWZ256rikz, X86::VPSHUFLWZ256mikz, 0 },
- { X86::VPSLLDZ256rikz, X86::VPSLLDZ256mikz, 0 },
- { X86::VPSLLQZ256rikz, X86::VPSLLQZ256mikz, 0 },
- { X86::VPSLLWZ256rikz, X86::VPSLLWZ256mikz, 0 },
- { X86::VPSRADZ256rikz, X86::VPSRADZ256mikz, 0 },
- { X86::VPSRAQZ256rikz, X86::VPSRAQZ256mikz, 0 },
- { X86::VPSRAWZ256rikz, X86::VPSRAWZ256mikz, 0 },
- { X86::VPSRLDZ256rikz, X86::VPSRLDZ256mikz, 0 },
- { X86::VPSRLQZ256rikz, X86::VPSRLQZ256mikz, 0 },
- { X86::VPSRLWZ256rikz, X86::VPSRLWZ256mikz, 0 },
-
- // AVX-512VL 128-bit masked foldable instructions
- { X86::VBROADCASTSSZ128rkz, X86::VBROADCASTSSZ128mkz, TB_NO_REVERSE },
- { X86::VPABSBZ128rrkz, X86::VPABSBZ128rmkz, 0 },
- { X86::VPABSDZ128rrkz, X86::VPABSDZ128rmkz, 0 },
- { X86::VPABSQZ128rrkz, X86::VPABSQZ128rmkz, 0 },
- { X86::VPABSWZ128rrkz, X86::VPABSWZ128rmkz, 0 },
- { X86::VPERMILPDZ128rikz, X86::VPERMILPDZ128mikz, 0 },
- { X86::VPERMILPSZ128rikz, X86::VPERMILPSZ128mikz, 0 },
- { X86::VPMOVSXBDZ128rrkz, X86::VPMOVSXBDZ128rmkz, TB_NO_REVERSE },
- { X86::VPMOVSXBQZ128rrkz, X86::VPMOVSXBQZ128rmkz, TB_NO_REVERSE },
- { X86::VPMOVSXBWZ128rrkz, X86::VPMOVSXBWZ128rmkz, TB_NO_REVERSE },
- { X86::VPMOVSXDQZ128rrkz, X86::VPMOVSXDQZ128rmkz, TB_NO_REVERSE },
- { X86::VPMOVSXWDZ128rrkz, X86::VPMOVSXWDZ128rmkz, TB_NO_REVERSE },
- { X86::VPMOVSXWQZ128rrkz, X86::VPMOVSXWQZ128rmkz, TB_NO_REVERSE },
- { X86::VPMOVZXBDZ128rrkz, X86::VPMOVZXBDZ128rmkz, TB_NO_REVERSE },
- { X86::VPMOVZXBQZ128rrkz, X86::VPMOVZXBQZ128rmkz, TB_NO_REVERSE },
- { X86::VPMOVZXBWZ128rrkz, X86::VPMOVZXBWZ128rmkz, TB_NO_REVERSE },
- { X86::VPMOVZXDQZ128rrkz, X86::VPMOVZXDQZ128rmkz, TB_NO_REVERSE },
- { X86::VPMOVZXWDZ128rrkz, X86::VPMOVZXWDZ128rmkz, TB_NO_REVERSE },
- { X86::VPMOVZXWQZ128rrkz, X86::VPMOVZXWQZ128rmkz, TB_NO_REVERSE },
- { X86::VPSHUFDZ128rikz, X86::VPSHUFDZ128mikz, 0 },
- { X86::VPSHUFHWZ128rikz, X86::VPSHUFHWZ128mikz, 0 },
- { X86::VPSHUFLWZ128rikz, X86::VPSHUFLWZ128mikz, 0 },
- { X86::VPSLLDZ128rikz, X86::VPSLLDZ128mikz, 0 },
- { X86::VPSLLQZ128rikz, X86::VPSLLQZ128mikz, 0 },
- { X86::VPSLLWZ128rikz, X86::VPSLLWZ128mikz, 0 },
- { X86::VPSRADZ128rikz, X86::VPSRADZ128mikz, 0 },
- { X86::VPSRAQZ128rikz, X86::VPSRAQZ128mikz, 0 },
- { X86::VPSRAWZ128rikz, X86::VPSRAWZ128mikz, 0 },
- { X86::VPSRLDZ128rikz, X86::VPSRLDZ128mikz, 0 },
- { X86::VPSRLQZ128rikz, X86::VPSRLQZ128mikz, 0 },
- { X86::VPSRLWZ128rikz, X86::VPSRLWZ128mikz, 0 },
-
- // AES foldable instructions
- { X86::AESDECLASTrr, X86::AESDECLASTrm, TB_ALIGN_16 },
- { X86::AESDECrr, X86::AESDECrm, TB_ALIGN_16 },
- { X86::AESENCLASTrr, X86::AESENCLASTrm, TB_ALIGN_16 },
- { X86::AESENCrr, X86::AESENCrm, TB_ALIGN_16 },
- { X86::VAESDECLASTrr, X86::VAESDECLASTrm, 0 },
- { X86::VAESDECrr, X86::VAESDECrm, 0 },
- { X86::VAESENCLASTrr, X86::VAESENCLASTrm, 0 },
- { X86::VAESENCrr, X86::VAESENCrm, 0 },
-
- // SHA foldable instructions
- { X86::SHA1MSG1rr, X86::SHA1MSG1rm, TB_ALIGN_16 },
- { X86::SHA1MSG2rr, X86::SHA1MSG2rm, TB_ALIGN_16 },
- { X86::SHA1NEXTErr, X86::SHA1NEXTErm, TB_ALIGN_16 },
- { X86::SHA1RNDS4rri, X86::SHA1RNDS4rmi, TB_ALIGN_16 },
- { X86::SHA256MSG1rr, X86::SHA256MSG1rm, TB_ALIGN_16 },
- { X86::SHA256MSG2rr, X86::SHA256MSG2rm, TB_ALIGN_16 },
- { X86::SHA256RNDS2rr, X86::SHA256RNDS2rm, TB_ALIGN_16 }
- };
-
for (X86MemoryFoldTableEntry Entry : MemoryFoldTable2) {
AddTableEntry(RegOp2MemOpTable2, MemOp2RegOpTable,
Entry.RegOp, Entry.MemOp,
@@ -2435,1103 +150,12 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
Entry.Flags | TB_INDEX_2 | TB_FOLDED_LOAD);
}
- static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
- // FMA4 foldable patterns
- { X86::VFMADDSS4rr, X86::VFMADDSS4rm, TB_ALIGN_NONE },
- { X86::VFMADDSS4rr_Int, X86::VFMADDSS4rm_Int, TB_NO_REVERSE },
- { X86::VFMADDSD4rr, X86::VFMADDSD4rm, TB_ALIGN_NONE },
- { X86::VFMADDSD4rr_Int, X86::VFMADDSD4rm_Int, TB_NO_REVERSE },
- { X86::VFMADDPS4rr, X86::VFMADDPS4rm, TB_ALIGN_NONE },
- { X86::VFMADDPD4rr, X86::VFMADDPD4rm, TB_ALIGN_NONE },
- { X86::VFMADDPS4Yrr, X86::VFMADDPS4Yrm, TB_ALIGN_NONE },
- { X86::VFMADDPD4Yrr, X86::VFMADDPD4Yrm, TB_ALIGN_NONE },
- { X86::VFNMADDSS4rr, X86::VFNMADDSS4rm, TB_ALIGN_NONE },
- { X86::VFNMADDSS4rr_Int, X86::VFNMADDSS4rm_Int, TB_NO_REVERSE },
- { X86::VFNMADDSD4rr, X86::VFNMADDSD4rm, TB_ALIGN_NONE },
- { X86::VFNMADDSD4rr_Int, X86::VFNMADDSD4rm_Int, TB_NO_REVERSE },
- { X86::VFNMADDPS4rr, X86::VFNMADDPS4rm, TB_ALIGN_NONE },
- { X86::VFNMADDPD4rr, X86::VFNMADDPD4rm, TB_ALIGN_NONE },
- { X86::VFNMADDPS4Yrr, X86::VFNMADDPS4Yrm, TB_ALIGN_NONE },
- { X86::VFNMADDPD4Yrr, X86::VFNMADDPD4Yrm, TB_ALIGN_NONE },
- { X86::VFMSUBSS4rr, X86::VFMSUBSS4rm, TB_ALIGN_NONE },
- { X86::VFMSUBSS4rr_Int, X86::VFMSUBSS4rm_Int, TB_NO_REVERSE },
- { X86::VFMSUBSD4rr, X86::VFMSUBSD4rm, TB_ALIGN_NONE },
- { X86::VFMSUBSD4rr_Int, X86::VFMSUBSD4rm_Int, TB_NO_REVERSE },
- { X86::VFMSUBPS4rr, X86::VFMSUBPS4rm, TB_ALIGN_NONE },
- { X86::VFMSUBPD4rr, X86::VFMSUBPD4rm, TB_ALIGN_NONE },
- { X86::VFMSUBPS4Yrr, X86::VFMSUBPS4Yrm, TB_ALIGN_NONE },
- { X86::VFMSUBPD4Yrr, X86::VFMSUBPD4Yrm, TB_ALIGN_NONE },
- { X86::VFNMSUBSS4rr, X86::VFNMSUBSS4rm, TB_ALIGN_NONE },
- { X86::VFNMSUBSS4rr_Int, X86::VFNMSUBSS4rm_Int, TB_NO_REVERSE },
- { X86::VFNMSUBSD4rr, X86::VFNMSUBSD4rm, TB_ALIGN_NONE },
- { X86::VFNMSUBSD4rr_Int, X86::VFNMSUBSD4rm_Int, TB_NO_REVERSE },
- { X86::VFNMSUBPS4rr, X86::VFNMSUBPS4rm, TB_ALIGN_NONE },
- { X86::VFNMSUBPD4rr, X86::VFNMSUBPD4rm, TB_ALIGN_NONE },
- { X86::VFNMSUBPS4Yrr, X86::VFNMSUBPS4Yrm, TB_ALIGN_NONE },
- { X86::VFNMSUBPD4Yrr, X86::VFNMSUBPD4Yrm, TB_ALIGN_NONE },
- { X86::VFMADDSUBPS4rr, X86::VFMADDSUBPS4rm, TB_ALIGN_NONE },
- { X86::VFMADDSUBPD4rr, X86::VFMADDSUBPD4rm, TB_ALIGN_NONE },
- { X86::VFMADDSUBPS4Yrr, X86::VFMADDSUBPS4Yrm, TB_ALIGN_NONE },
- { X86::VFMADDSUBPD4Yrr, X86::VFMADDSUBPD4Yrm, TB_ALIGN_NONE },
- { X86::VFMSUBADDPS4rr, X86::VFMSUBADDPS4rm, TB_ALIGN_NONE },
- { X86::VFMSUBADDPD4rr, X86::VFMSUBADDPD4rm, TB_ALIGN_NONE },
- { X86::VFMSUBADDPS4Yrr, X86::VFMSUBADDPS4Yrm, TB_ALIGN_NONE },
- { X86::VFMSUBADDPD4Yrr, X86::VFMSUBADDPD4Yrm, TB_ALIGN_NONE },
-
- // XOP foldable instructions
- { X86::VPCMOVrrr, X86::VPCMOVrrm, 0 },
- { X86::VPCMOVYrrr, X86::VPCMOVYrrm, 0 },
- { X86::VPERMIL2PDrr, X86::VPERMIL2PDrm, 0 },
- { X86::VPERMIL2PDYrr, X86::VPERMIL2PDYrm, 0 },
- { X86::VPERMIL2PSrr, X86::VPERMIL2PSrm, 0 },
- { X86::VPERMIL2PSYrr, X86::VPERMIL2PSYrm, 0 },
- { X86::VPPERMrrr, X86::VPPERMrrm, 0 },
-
- // AVX-512 instructions with 3 source operands.
- { X86::VPERMI2Brr, X86::VPERMI2Brm, 0 },
- { X86::VPERMI2Drr, X86::VPERMI2Drm, 0 },
- { X86::VPERMI2PSrr, X86::VPERMI2PSrm, 0 },
- { X86::VPERMI2PDrr, X86::VPERMI2PDrm, 0 },
- { X86::VPERMI2Qrr, X86::VPERMI2Qrm, 0 },
- { X86::VPERMI2Wrr, X86::VPERMI2Wrm, 0 },
- { X86::VPERMT2Brr, X86::VPERMT2Brm, 0 },
- { X86::VPERMT2Drr, X86::VPERMT2Drm, 0 },
- { X86::VPERMT2PSrr, X86::VPERMT2PSrm, 0 },
- { X86::VPERMT2PDrr, X86::VPERMT2PDrm, 0 },
- { X86::VPERMT2Qrr, X86::VPERMT2Qrm, 0 },
- { X86::VPERMT2Wrr, X86::VPERMT2Wrm, 0 },
- { X86::VPTERNLOGDZrri, X86::VPTERNLOGDZrmi, 0 },
- { X86::VPTERNLOGQZrri, X86::VPTERNLOGQZrmi, 0 },
-
- // AVX-512VL 256-bit instructions with 3 source operands.
- { X86::VPERMI2B256rr, X86::VPERMI2B256rm, 0 },
- { X86::VPERMI2D256rr, X86::VPERMI2D256rm, 0 },
- { X86::VPERMI2PD256rr, X86::VPERMI2PD256rm, 0 },
- { X86::VPERMI2PS256rr, X86::VPERMI2PS256rm, 0 },
- { X86::VPERMI2Q256rr, X86::VPERMI2Q256rm, 0 },
- { X86::VPERMI2W256rr, X86::VPERMI2W256rm, 0 },
- { X86::VPERMT2B256rr, X86::VPERMT2B256rm, 0 },
- { X86::VPERMT2D256rr, X86::VPERMT2D256rm, 0 },
- { X86::VPERMT2PD256rr, X86::VPERMT2PD256rm, 0 },
- { X86::VPERMT2PS256rr, X86::VPERMT2PS256rm, 0 },
- { X86::VPERMT2Q256rr, X86::VPERMT2Q256rm, 0 },
- { X86::VPERMT2W256rr, X86::VPERMT2W256rm, 0 },
- { X86::VPTERNLOGDZ256rri, X86::VPTERNLOGDZ256rmi, 0 },
- { X86::VPTERNLOGQZ256rri, X86::VPTERNLOGQZ256rmi, 0 },
-
- // AVX-512VL 128-bit instructions with 3 source operands.
- { X86::VPERMI2B128rr, X86::VPERMI2B128rm, 0 },
- { X86::VPERMI2D128rr, X86::VPERMI2D128rm, 0 },
- { X86::VPERMI2PD128rr, X86::VPERMI2PD128rm, 0 },
- { X86::VPERMI2PS128rr, X86::VPERMI2PS128rm, 0 },
- { X86::VPERMI2Q128rr, X86::VPERMI2Q128rm, 0 },
- { X86::VPERMI2W128rr, X86::VPERMI2W128rm, 0 },
- { X86::VPERMT2B128rr, X86::VPERMT2B128rm, 0 },
- { X86::VPERMT2D128rr, X86::VPERMT2D128rm, 0 },
- { X86::VPERMT2PD128rr, X86::VPERMT2PD128rm, 0 },
- { X86::VPERMT2PS128rr, X86::VPERMT2PS128rm, 0 },
- { X86::VPERMT2Q128rr, X86::VPERMT2Q128rm, 0 },
- { X86::VPERMT2W128rr, X86::VPERMT2W128rm, 0 },
- { X86::VPTERNLOGDZ128rri, X86::VPTERNLOGDZ128rmi, 0 },
- { X86::VPTERNLOGQZ128rri, X86::VPTERNLOGQZ128rmi, 0 },
-
- // AVX-512 masked instructions
- { X86::VADDPDZrrkz, X86::VADDPDZrmkz, 0 },
- { X86::VADDPSZrrkz, X86::VADDPSZrmkz, 0 },
- { X86::VADDSDZrr_Intkz, X86::VADDSDZrm_Intkz, TB_NO_REVERSE },
- { X86::VADDSSZrr_Intkz, X86::VADDSSZrm_Intkz, TB_NO_REVERSE },
- { X86::VALIGNDZrrikz, X86::VALIGNDZrmikz, 0 },
- { X86::VALIGNQZrrikz, X86::VALIGNQZrmikz, 0 },
- { X86::VANDNPDZrrkz, X86::VANDNPDZrmkz, 0 },
- { X86::VANDNPSZrrkz, X86::VANDNPSZrmkz, 0 },
- { X86::VANDPDZrrkz, X86::VANDPDZrmkz, 0 },
- { X86::VANDPSZrrkz, X86::VANDPSZrmkz, 0 },
- { X86::VDIVPDZrrkz, X86::VDIVPDZrmkz, 0 },
- { X86::VDIVPSZrrkz, X86::VDIVPSZrmkz, 0 },
- { X86::VDIVSDZrr_Intkz, X86::VDIVSDZrm_Intkz, TB_NO_REVERSE },
- { X86::VDIVSSZrr_Intkz, X86::VDIVSSZrm_Intkz, TB_NO_REVERSE },
- { X86::VINSERTF32x4Zrrkz, X86::VINSERTF32x4Zrmkz, 0 },
- { X86::VINSERTF32x8Zrrkz, X86::VINSERTF32x8Zrmkz, 0 },
- { X86::VINSERTF64x2Zrrkz, X86::VINSERTF64x2Zrmkz, 0 },
- { X86::VINSERTF64x4Zrrkz, X86::VINSERTF64x4Zrmkz, 0 },
- { X86::VINSERTI32x4Zrrkz, X86::VINSERTI32x4Zrmkz, 0 },
- { X86::VINSERTI32x8Zrrkz, X86::VINSERTI32x8Zrmkz, 0 },
- { X86::VINSERTI64x2Zrrkz, X86::VINSERTI64x2Zrmkz, 0 },
- { X86::VINSERTI64x4Zrrkz, X86::VINSERTI64x4Zrmkz, 0 },
- { X86::VMAXCPDZrrkz, X86::VMAXCPDZrmkz, 0 },
- { X86::VMAXCPSZrrkz, X86::VMAXCPSZrmkz, 0 },
- { X86::VMAXPDZrrkz, X86::VMAXPDZrmkz, 0 },
- { X86::VMAXPSZrrkz, X86::VMAXPSZrmkz, 0 },
- { X86::VMAXSDZrr_Intkz, X86::VMAXSDZrm_Intkz, 0 },
- { X86::VMAXSSZrr_Intkz, X86::VMAXSSZrm_Intkz, 0 },
- { X86::VMINCPDZrrkz, X86::VMINCPDZrmkz, 0 },
- { X86::VMINCPSZrrkz, X86::VMINCPSZrmkz, 0 },
- { X86::VMINPDZrrkz, X86::VMINPDZrmkz, 0 },
- { X86::VMINPSZrrkz, X86::VMINPSZrmkz, 0 },
- { X86::VMINSDZrr_Intkz, X86::VMINSDZrm_Intkz, 0 },
- { X86::VMINSSZrr_Intkz, X86::VMINSSZrm_Intkz, 0 },
- { X86::VMULPDZrrkz, X86::VMULPDZrmkz, 0 },
- { X86::VMULPSZrrkz, X86::VMULPSZrmkz, 0 },
- { X86::VMULSDZrr_Intkz, X86::VMULSDZrm_Intkz, TB_NO_REVERSE },
- { X86::VMULSSZrr_Intkz, X86::VMULSSZrm_Intkz, TB_NO_REVERSE },
- { X86::VORPDZrrkz, X86::VORPDZrmkz, 0 },
- { X86::VORPSZrrkz, X86::VORPSZrmkz, 0 },
- { X86::VPACKSSDWZrrkz, X86::VPACKSSDWZrmkz, 0 },
- { X86::VPACKSSWBZrrkz, X86::VPACKSSWBZrmkz, 0 },
- { X86::VPACKUSDWZrrkz, X86::VPACKUSDWZrmkz, 0 },
- { X86::VPACKUSWBZrrkz, X86::VPACKUSWBZrmkz, 0 },
- { X86::VPADDBZrrkz, X86::VPADDBZrmkz, 0 },
- { X86::VPADDDZrrkz, X86::VPADDDZrmkz, 0 },
- { X86::VPADDQZrrkz, X86::VPADDQZrmkz, 0 },
- { X86::VPADDSBZrrkz, X86::VPADDSBZrmkz, 0 },
- { X86::VPADDSWZrrkz, X86::VPADDSWZrmkz, 0 },
- { X86::VPADDUSBZrrkz, X86::VPADDUSBZrmkz, 0 },
- { X86::VPADDUSWZrrkz, X86::VPADDUSWZrmkz, 0 },
- { X86::VPADDWZrrkz, X86::VPADDWZrmkz, 0 },
- { X86::VPALIGNRZrrikz, X86::VPALIGNRZrmikz, 0 },
- { X86::VPANDDZrrkz, X86::VPANDDZrmkz, 0 },
- { X86::VPANDNDZrrkz, X86::VPANDNDZrmkz, 0 },
- { X86::VPANDNQZrrkz, X86::VPANDNQZrmkz, 0 },
- { X86::VPANDQZrrkz, X86::VPANDQZrmkz, 0 },
- { X86::VPAVGBZrrkz, X86::VPAVGBZrmkz, 0 },
- { X86::VPAVGWZrrkz, X86::VPAVGWZrmkz, 0 },
- { X86::VPERMBZrrkz, X86::VPERMBZrmkz, 0 },
- { X86::VPERMDZrrkz, X86::VPERMDZrmkz, 0 },
- { X86::VPERMILPDZrrkz, X86::VPERMILPDZrmkz, 0 },
- { X86::VPERMILPSZrrkz, X86::VPERMILPSZrmkz, 0 },
- { X86::VPERMPDZrrkz, X86::VPERMPDZrmkz, 0 },
- { X86::VPERMPSZrrkz, X86::VPERMPSZrmkz, 0 },
- { X86::VPERMQZrrkz, X86::VPERMQZrmkz, 0 },
- { X86::VPERMWZrrkz, X86::VPERMWZrmkz, 0 },
- { X86::VPMADDUBSWZrrkz, X86::VPMADDUBSWZrmkz, 0 },
- { X86::VPMADDWDZrrkz, X86::VPMADDWDZrmkz, 0 },
- { X86::VPMAXSBZrrkz, X86::VPMAXSBZrmkz, 0 },
- { X86::VPMAXSDZrrkz, X86::VPMAXSDZrmkz, 0 },
- { X86::VPMAXSQZrrkz, X86::VPMAXSQZrmkz, 0 },
- { X86::VPMAXSWZrrkz, X86::VPMAXSWZrmkz, 0 },
- { X86::VPMAXUBZrrkz, X86::VPMAXUBZrmkz, 0 },
- { X86::VPMAXUDZrrkz, X86::VPMAXUDZrmkz, 0 },
- { X86::VPMAXUQZrrkz, X86::VPMAXUQZrmkz, 0 },
- { X86::VPMAXUWZrrkz, X86::VPMAXUWZrmkz, 0 },
- { X86::VPMINSBZrrkz, X86::VPMINSBZrmkz, 0 },
- { X86::VPMINSDZrrkz, X86::VPMINSDZrmkz, 0 },
- { X86::VPMINSQZrrkz, X86::VPMINSQZrmkz, 0 },
- { X86::VPMINSWZrrkz, X86::VPMINSWZrmkz, 0 },
- { X86::VPMINUBZrrkz, X86::VPMINUBZrmkz, 0 },
- { X86::VPMINUDZrrkz, X86::VPMINUDZrmkz, 0 },
- { X86::VPMINUQZrrkz, X86::VPMINUQZrmkz, 0 },
- { X86::VPMINUWZrrkz, X86::VPMINUWZrmkz, 0 },
- { X86::VPMULLDZrrkz, X86::VPMULLDZrmkz, 0 },
- { X86::VPMULLQZrrkz, X86::VPMULLQZrmkz, 0 },
- { X86::VPMULLWZrrkz, X86::VPMULLWZrmkz, 0 },
- { X86::VPMULDQZrrkz, X86::VPMULDQZrmkz, 0 },
- { X86::VPMULUDQZrrkz, X86::VPMULUDQZrmkz, 0 },
- { X86::VPORDZrrkz, X86::VPORDZrmkz, 0 },
- { X86::VPORQZrrkz, X86::VPORQZrmkz, 0 },
- { X86::VPSHUFBZrrkz, X86::VPSHUFBZrmkz, 0 },
- { X86::VPSLLDZrrkz, X86::VPSLLDZrmkz, 0 },
- { X86::VPSLLQZrrkz, X86::VPSLLQZrmkz, 0 },
- { X86::VPSLLVDZrrkz, X86::VPSLLVDZrmkz, 0 },
- { X86::VPSLLVQZrrkz, X86::VPSLLVQZrmkz, 0 },
- { X86::VPSLLVWZrrkz, X86::VPSLLVWZrmkz, 0 },
- { X86::VPSLLWZrrkz, X86::VPSLLWZrmkz, 0 },
- { X86::VPSRADZrrkz, X86::VPSRADZrmkz, 0 },
- { X86::VPSRAQZrrkz, X86::VPSRAQZrmkz, 0 },
- { X86::VPSRAVDZrrkz, X86::VPSRAVDZrmkz, 0 },
- { X86::VPSRAVQZrrkz, X86::VPSRAVQZrmkz, 0 },
- { X86::VPSRAVWZrrkz, X86::VPSRAVWZrmkz, 0 },
- { X86::VPSRAWZrrkz, X86::VPSRAWZrmkz, 0 },
- { X86::VPSRLDZrrkz, X86::VPSRLDZrmkz, 0 },
- { X86::VPSRLQZrrkz, X86::VPSRLQZrmkz, 0 },
- { X86::VPSRLVDZrrkz, X86::VPSRLVDZrmkz, 0 },
- { X86::VPSRLVQZrrkz, X86::VPSRLVQZrmkz, 0 },
- { X86::VPSRLVWZrrkz, X86::VPSRLVWZrmkz, 0 },
- { X86::VPSRLWZrrkz, X86::VPSRLWZrmkz, 0 },
- { X86::VPSUBBZrrkz, X86::VPSUBBZrmkz, 0 },
- { X86::VPSUBDZrrkz, X86::VPSUBDZrmkz, 0 },
- { X86::VPSUBQZrrkz, X86::VPSUBQZrmkz, 0 },
- { X86::VPSUBSBZrrkz, X86::VPSUBSBZrmkz, 0 },
- { X86::VPSUBSWZrrkz, X86::VPSUBSWZrmkz, 0 },
- { X86::VPSUBUSBZrrkz, X86::VPSUBUSBZrmkz, 0 },
- { X86::VPSUBUSWZrrkz, X86::VPSUBUSWZrmkz, 0 },
- { X86::VPSUBWZrrkz, X86::VPSUBWZrmkz, 0 },
- { X86::VPUNPCKHBWZrrkz, X86::VPUNPCKHBWZrmkz, 0 },
- { X86::VPUNPCKHDQZrrkz, X86::VPUNPCKHDQZrmkz, 0 },
- { X86::VPUNPCKHQDQZrrkz, X86::VPUNPCKHQDQZrmkz, 0 },
- { X86::VPUNPCKHWDZrrkz, X86::VPUNPCKHWDZrmkz, 0 },
- { X86::VPUNPCKLBWZrrkz, X86::VPUNPCKLBWZrmkz, 0 },
- { X86::VPUNPCKLDQZrrkz, X86::VPUNPCKLDQZrmkz, 0 },
- { X86::VPUNPCKLQDQZrrkz, X86::VPUNPCKLQDQZrmkz, 0 },
- { X86::VPUNPCKLWDZrrkz, X86::VPUNPCKLWDZrmkz, 0 },
- { X86::VPXORDZrrkz, X86::VPXORDZrmkz, 0 },
- { X86::VPXORQZrrkz, X86::VPXORQZrmkz, 0 },
- { X86::VSHUFPDZrrikz, X86::VSHUFPDZrmikz, 0 },
- { X86::VSHUFPSZrrikz, X86::VSHUFPSZrmikz, 0 },
- { X86::VSUBPDZrrkz, X86::VSUBPDZrmkz, 0 },
- { X86::VSUBPSZrrkz, X86::VSUBPSZrmkz, 0 },
- { X86::VSUBSDZrr_Intkz, X86::VSUBSDZrm_Intkz, TB_NO_REVERSE },
- { X86::VSUBSSZrr_Intkz, X86::VSUBSSZrm_Intkz, TB_NO_REVERSE },
- { X86::VUNPCKHPDZrrkz, X86::VUNPCKHPDZrmkz, 0 },
- { X86::VUNPCKHPSZrrkz, X86::VUNPCKHPSZrmkz, 0 },
- { X86::VUNPCKLPDZrrkz, X86::VUNPCKLPDZrmkz, 0 },
- { X86::VUNPCKLPSZrrkz, X86::VUNPCKLPSZrmkz, 0 },
- { X86::VXORPDZrrkz, X86::VXORPDZrmkz, 0 },
- { X86::VXORPSZrrkz, X86::VXORPSZrmkz, 0 },
-
- // AVX-512{F,VL} masked arithmetic instructions 256-bit
- { X86::VADDPDZ256rrkz, X86::VADDPDZ256rmkz, 0 },
- { X86::VADDPSZ256rrkz, X86::VADDPSZ256rmkz, 0 },
- { X86::VALIGNDZ256rrikz, X86::VALIGNDZ256rmikz, 0 },
- { X86::VALIGNQZ256rrikz, X86::VALIGNQZ256rmikz, 0 },
- { X86::VANDNPDZ256rrkz, X86::VANDNPDZ256rmkz, 0 },
- { X86::VANDNPSZ256rrkz, X86::VANDNPSZ256rmkz, 0 },
- { X86::VANDPDZ256rrkz, X86::VANDPDZ256rmkz, 0 },
- { X86::VANDPSZ256rrkz, X86::VANDPSZ256rmkz, 0 },
- { X86::VDIVPDZ256rrkz, X86::VDIVPDZ256rmkz, 0 },
- { X86::VDIVPSZ256rrkz, X86::VDIVPSZ256rmkz, 0 },
- { X86::VINSERTF32x4Z256rrkz, X86::VINSERTF32x4Z256rmkz, 0 },
- { X86::VINSERTF64x2Z256rrkz, X86::VINSERTF64x2Z256rmkz, 0 },
- { X86::VINSERTI32x4Z256rrkz, X86::VINSERTI32x4Z256rmkz, 0 },
- { X86::VINSERTI64x2Z256rrkz, X86::VINSERTI64x2Z256rmkz, 0 },
- { X86::VMAXCPDZ256rrkz, X86::VMAXCPDZ256rmkz, 0 },
- { X86::VMAXCPSZ256rrkz, X86::VMAXCPSZ256rmkz, 0 },
- { X86::VMAXPDZ256rrkz, X86::VMAXPDZ256rmkz, 0 },
- { X86::VMAXPSZ256rrkz, X86::VMAXPSZ256rmkz, 0 },
- { X86::VMINCPDZ256rrkz, X86::VMINCPDZ256rmkz, 0 },
- { X86::VMINCPSZ256rrkz, X86::VMINCPSZ256rmkz, 0 },
- { X86::VMINPDZ256rrkz, X86::VMINPDZ256rmkz, 0 },
- { X86::VMINPSZ256rrkz, X86::VMINPSZ256rmkz, 0 },
- { X86::VMULPDZ256rrkz, X86::VMULPDZ256rmkz, 0 },
- { X86::VMULPSZ256rrkz, X86::VMULPSZ256rmkz, 0 },
- { X86::VORPDZ256rrkz, X86::VORPDZ256rmkz, 0 },
- { X86::VORPSZ256rrkz, X86::VORPSZ256rmkz, 0 },
- { X86::VPACKSSDWZ256rrkz, X86::VPACKSSDWZ256rmkz, 0 },
- { X86::VPACKSSWBZ256rrkz, X86::VPACKSSWBZ256rmkz, 0 },
- { X86::VPACKUSDWZ256rrkz, X86::VPACKUSDWZ256rmkz, 0 },
- { X86::VPACKUSWBZ256rrkz, X86::VPACKUSWBZ256rmkz, 0 },
- { X86::VPADDBZ256rrkz, X86::VPADDBZ256rmkz, 0 },
- { X86::VPADDDZ256rrkz, X86::VPADDDZ256rmkz, 0 },
- { X86::VPADDQZ256rrkz, X86::VPADDQZ256rmkz, 0 },
- { X86::VPADDSBZ256rrkz, X86::VPADDSBZ256rmkz, 0 },
- { X86::VPADDSWZ256rrkz, X86::VPADDSWZ256rmkz, 0 },
- { X86::VPADDUSBZ256rrkz, X86::VPADDUSBZ256rmkz, 0 },
- { X86::VPADDUSWZ256rrkz, X86::VPADDUSWZ256rmkz, 0 },
- { X86::VPADDWZ256rrkz, X86::VPADDWZ256rmkz, 0 },
- { X86::VPALIGNRZ256rrikz, X86::VPALIGNRZ256rmikz, 0 },
- { X86::VPANDDZ256rrkz, X86::VPANDDZ256rmkz, 0 },
- { X86::VPANDNDZ256rrkz, X86::VPANDNDZ256rmkz, 0 },
- { X86::VPANDNQZ256rrkz, X86::VPANDNQZ256rmkz, 0 },
- { X86::VPANDQZ256rrkz, X86::VPANDQZ256rmkz, 0 },
- { X86::VPAVGBZ256rrkz, X86::VPAVGBZ256rmkz, 0 },
- { X86::VPAVGWZ256rrkz, X86::VPAVGWZ256rmkz, 0 },
- { X86::VPERMBZ256rrkz, X86::VPERMBZ256rmkz, 0 },
- { X86::VPERMDZ256rrkz, X86::VPERMDZ256rmkz, 0 },
- { X86::VPERMILPDZ256rrkz, X86::VPERMILPDZ256rmkz, 0 },
- { X86::VPERMILPSZ256rrkz, X86::VPERMILPSZ256rmkz, 0 },
- { X86::VPERMPDZ256rrkz, X86::VPERMPDZ256rmkz, 0 },
- { X86::VPERMPSZ256rrkz, X86::VPERMPSZ256rmkz, 0 },
- { X86::VPERMQZ256rrkz, X86::VPERMQZ256rmkz, 0 },
- { X86::VPERMWZ256rrkz, X86::VPERMWZ256rmkz, 0 },
- { X86::VPMADDUBSWZ256rrkz, X86::VPMADDUBSWZ256rmkz, 0 },
- { X86::VPMADDWDZ256rrkz, X86::VPMADDWDZ256rmkz, 0 },
- { X86::VPMAXSBZ256rrkz, X86::VPMAXSBZ256rmkz, 0 },
- { X86::VPMAXSDZ256rrkz, X86::VPMAXSDZ256rmkz, 0 },
- { X86::VPMAXSQZ256rrkz, X86::VPMAXSQZ256rmkz, 0 },
- { X86::VPMAXSWZ256rrkz, X86::VPMAXSWZ256rmkz, 0 },
- { X86::VPMAXUBZ256rrkz, X86::VPMAXUBZ256rmkz, 0 },
- { X86::VPMAXUDZ256rrkz, X86::VPMAXUDZ256rmkz, 0 },
- { X86::VPMAXUQZ256rrkz, X86::VPMAXUQZ256rmkz, 0 },
- { X86::VPMAXUWZ256rrkz, X86::VPMAXUWZ256rmkz, 0 },
- { X86::VPMINSBZ256rrkz, X86::VPMINSBZ256rmkz, 0 },
- { X86::VPMINSDZ256rrkz, X86::VPMINSDZ256rmkz, 0 },
- { X86::VPMINSQZ256rrkz, X86::VPMINSQZ256rmkz, 0 },
- { X86::VPMINSWZ256rrkz, X86::VPMINSWZ256rmkz, 0 },
- { X86::VPMINUBZ256rrkz, X86::VPMINUBZ256rmkz, 0 },
- { X86::VPMINUDZ256rrkz, X86::VPMINUDZ256rmkz, 0 },
- { X86::VPMINUQZ256rrkz, X86::VPMINUQZ256rmkz, 0 },
- { X86::VPMINUWZ256rrkz, X86::VPMINUWZ256rmkz, 0 },
- { X86::VPMULDQZ256rrkz, X86::VPMULDQZ256rmkz, 0 },
- { X86::VPMULLDZ256rrkz, X86::VPMULLDZ256rmkz, 0 },
- { X86::VPMULLQZ256rrkz, X86::VPMULLQZ256rmkz, 0 },
- { X86::VPMULLWZ256rrkz, X86::VPMULLWZ256rmkz, 0 },
- { X86::VPMULUDQZ256rrkz, X86::VPMULUDQZ256rmkz, 0 },
- { X86::VPORDZ256rrkz, X86::VPORDZ256rmkz, 0 },
- { X86::VPORQZ256rrkz, X86::VPORQZ256rmkz, 0 },
- { X86::VPSHUFBZ256rrkz, X86::VPSHUFBZ256rmkz, 0 },
- { X86::VPSLLDZ256rrkz, X86::VPSLLDZ256rmkz, 0 },
- { X86::VPSLLQZ256rrkz, X86::VPSLLQZ256rmkz, 0 },
- { X86::VPSLLVDZ256rrkz, X86::VPSLLVDZ256rmkz, 0 },
- { X86::VPSLLVQZ256rrkz, X86::VPSLLVQZ256rmkz, 0 },
- { X86::VPSLLVWZ256rrkz, X86::VPSLLVWZ256rmkz, 0 },
- { X86::VPSLLWZ256rrkz, X86::VPSLLWZ256rmkz, 0 },
- { X86::VPSRADZ256rrkz, X86::VPSRADZ256rmkz, 0 },
- { X86::VPSRAQZ256rrkz, X86::VPSRAQZ256rmkz, 0 },
- { X86::VPSRAVDZ256rrkz, X86::VPSRAVDZ256rmkz, 0 },
- { X86::VPSRAVQZ256rrkz, X86::VPSRAVQZ256rmkz, 0 },
- { X86::VPSRAVWZ256rrkz, X86::VPSRAVWZ256rmkz, 0 },
- { X86::VPSRAWZ256rrkz, X86::VPSRAWZ256rmkz, 0 },
- { X86::VPSRLDZ256rrkz, X86::VPSRLDZ256rmkz, 0 },
- { X86::VPSRLQZ256rrkz, X86::VPSRLQZ256rmkz, 0 },
- { X86::VPSRLVDZ256rrkz, X86::VPSRLVDZ256rmkz, 0 },
- { X86::VPSRLVQZ256rrkz, X86::VPSRLVQZ256rmkz, 0 },
- { X86::VPSRLVWZ256rrkz, X86::VPSRLVWZ256rmkz, 0 },
- { X86::VPSRLWZ256rrkz, X86::VPSRLWZ256rmkz, 0 },
- { X86::VPSUBBZ256rrkz, X86::VPSUBBZ256rmkz, 0 },
- { X86::VPSUBDZ256rrkz, X86::VPSUBDZ256rmkz, 0 },
- { X86::VPSUBQZ256rrkz, X86::VPSUBQZ256rmkz, 0 },
- { X86::VPSUBSBZ256rrkz, X86::VPSUBSBZ256rmkz, 0 },
- { X86::VPSUBSWZ256rrkz, X86::VPSUBSWZ256rmkz, 0 },
- { X86::VPSUBUSBZ256rrkz, X86::VPSUBUSBZ256rmkz, 0 },
- { X86::VPSUBUSWZ256rrkz, X86::VPSUBUSWZ256rmkz, 0 },
- { X86::VPSUBWZ256rrkz, X86::VPSUBWZ256rmkz, 0 },
- { X86::VPUNPCKHBWZ256rrkz, X86::VPUNPCKHBWZ256rmkz, 0 },
- { X86::VPUNPCKHDQZ256rrkz, X86::VPUNPCKHDQZ256rmkz, 0 },
- { X86::VPUNPCKHQDQZ256rrkz, X86::VPUNPCKHQDQZ256rmkz, 0 },
- { X86::VPUNPCKHWDZ256rrkz, X86::VPUNPCKHWDZ256rmkz, 0 },
- { X86::VPUNPCKLBWZ256rrkz, X86::VPUNPCKLBWZ256rmkz, 0 },
- { X86::VPUNPCKLDQZ256rrkz, X86::VPUNPCKLDQZ256rmkz, 0 },
- { X86::VPUNPCKLQDQZ256rrkz, X86::VPUNPCKLQDQZ256rmkz, 0 },
- { X86::VPUNPCKLWDZ256rrkz, X86::VPUNPCKLWDZ256rmkz, 0 },
- { X86::VPXORDZ256rrkz, X86::VPXORDZ256rmkz, 0 },
- { X86::VPXORQZ256rrkz, X86::VPXORQZ256rmkz, 0 },
- { X86::VSHUFPDZ256rrikz, X86::VSHUFPDZ256rmikz, 0 },
- { X86::VSHUFPSZ256rrikz, X86::VSHUFPSZ256rmikz, 0 },
- { X86::VSUBPDZ256rrkz, X86::VSUBPDZ256rmkz, 0 },
- { X86::VSUBPSZ256rrkz, X86::VSUBPSZ256rmkz, 0 },
- { X86::VUNPCKHPDZ256rrkz, X86::VUNPCKHPDZ256rmkz, 0 },
- { X86::VUNPCKHPSZ256rrkz, X86::VUNPCKHPSZ256rmkz, 0 },
- { X86::VUNPCKLPDZ256rrkz, X86::VUNPCKLPDZ256rmkz, 0 },
- { X86::VUNPCKLPSZ256rrkz, X86::VUNPCKLPSZ256rmkz, 0 },
- { X86::VXORPDZ256rrkz, X86::VXORPDZ256rmkz, 0 },
- { X86::VXORPSZ256rrkz, X86::VXORPSZ256rmkz, 0 },
-
- // AVX-512{F,VL} masked arithmetic instructions 128-bit
- { X86::VADDPDZ128rrkz, X86::VADDPDZ128rmkz, 0 },
- { X86::VADDPSZ128rrkz, X86::VADDPSZ128rmkz, 0 },
- { X86::VALIGNDZ128rrikz, X86::VALIGNDZ128rmikz, 0 },
- { X86::VALIGNQZ128rrikz, X86::VALIGNQZ128rmikz, 0 },
- { X86::VANDNPDZ128rrkz, X86::VANDNPDZ128rmkz, 0 },
- { X86::VANDNPSZ128rrkz, X86::VANDNPSZ128rmkz, 0 },
- { X86::VANDPDZ128rrkz, X86::VANDPDZ128rmkz, 0 },
- { X86::VANDPSZ128rrkz, X86::VANDPSZ128rmkz, 0 },
- { X86::VDIVPDZ128rrkz, X86::VDIVPDZ128rmkz, 0 },
- { X86::VDIVPSZ128rrkz, X86::VDIVPSZ128rmkz, 0 },
- { X86::VMAXCPDZ128rrkz, X86::VMAXCPDZ128rmkz, 0 },
- { X86::VMAXCPSZ128rrkz, X86::VMAXCPSZ128rmkz, 0 },
- { X86::VMAXPDZ128rrkz, X86::VMAXPDZ128rmkz, 0 },
- { X86::VMAXPSZ128rrkz, X86::VMAXPSZ128rmkz, 0 },
- { X86::VMINCPDZ128rrkz, X86::VMINCPDZ128rmkz, 0 },
- { X86::VMINCPSZ128rrkz, X86::VMINCPSZ128rmkz, 0 },
- { X86::VMINPDZ128rrkz, X86::VMINPDZ128rmkz, 0 },
- { X86::VMINPSZ128rrkz, X86::VMINPSZ128rmkz, 0 },
- { X86::VMULPDZ128rrkz, X86::VMULPDZ128rmkz, 0 },
- { X86::VMULPSZ128rrkz, X86::VMULPSZ128rmkz, 0 },
- { X86::VORPDZ128rrkz, X86::VORPDZ128rmkz, 0 },
- { X86::VORPSZ128rrkz, X86::VORPSZ128rmkz, 0 },
- { X86::VPACKSSDWZ128rrkz, X86::VPACKSSDWZ128rmkz, 0 },
- { X86::VPACKSSWBZ128rrkz, X86::VPACKSSWBZ128rmkz, 0 },
- { X86::VPACKUSDWZ128rrkz, X86::VPACKUSDWZ128rmkz, 0 },
- { X86::VPACKUSWBZ128rrkz, X86::VPACKUSWBZ128rmkz, 0 },
- { X86::VPADDBZ128rrkz, X86::VPADDBZ128rmkz, 0 },
- { X86::VPADDDZ128rrkz, X86::VPADDDZ128rmkz, 0 },
- { X86::VPADDQZ128rrkz, X86::VPADDQZ128rmkz, 0 },
- { X86::VPADDSBZ128rrkz, X86::VPADDSBZ128rmkz, 0 },
- { X86::VPADDSWZ128rrkz, X86::VPADDSWZ128rmkz, 0 },
- { X86::VPADDUSBZ128rrkz, X86::VPADDUSBZ128rmkz, 0 },
- { X86::VPADDUSWZ128rrkz, X86::VPADDUSWZ128rmkz, 0 },
- { X86::VPADDWZ128rrkz, X86::VPADDWZ128rmkz, 0 },
- { X86::VPALIGNRZ128rrikz, X86::VPALIGNRZ128rmikz, 0 },
- { X86::VPANDDZ128rrkz, X86::VPANDDZ128rmkz, 0 },
- { X86::VPANDNDZ128rrkz, X86::VPANDNDZ128rmkz, 0 },
- { X86::VPANDNQZ128rrkz, X86::VPANDNQZ128rmkz, 0 },
- { X86::VPANDQZ128rrkz, X86::VPANDQZ128rmkz, 0 },
- { X86::VPAVGBZ128rrkz, X86::VPAVGBZ128rmkz, 0 },
- { X86::VPAVGWZ128rrkz, X86::VPAVGWZ128rmkz, 0 },
- { X86::VPERMBZ128rrkz, X86::VPERMBZ128rmkz, 0 },
- { X86::VPERMILPDZ128rrkz, X86::VPERMILPDZ128rmkz, 0 },
- { X86::VPERMILPSZ128rrkz, X86::VPERMILPSZ128rmkz, 0 },
- { X86::VPERMWZ128rrkz, X86::VPERMWZ128rmkz, 0 },
- { X86::VPMADDUBSWZ128rrkz, X86::VPMADDUBSWZ128rmkz, 0 },
- { X86::VPMADDWDZ128rrkz, X86::VPMADDWDZ128rmkz, 0 },
- { X86::VPMAXSBZ128rrkz, X86::VPMAXSBZ128rmkz, 0 },
- { X86::VPMAXSDZ128rrkz, X86::VPMAXSDZ128rmkz, 0 },
- { X86::VPMAXSQZ128rrkz, X86::VPMAXSQZ128rmkz, 0 },
- { X86::VPMAXSWZ128rrkz, X86::VPMAXSWZ128rmkz, 0 },
- { X86::VPMAXUBZ128rrkz, X86::VPMAXUBZ128rmkz, 0 },
- { X86::VPMAXUDZ128rrkz, X86::VPMAXUDZ128rmkz, 0 },
- { X86::VPMAXUQZ128rrkz, X86::VPMAXUQZ128rmkz, 0 },
- { X86::VPMAXUWZ128rrkz, X86::VPMAXUWZ128rmkz, 0 },
- { X86::VPMINSBZ128rrkz, X86::VPMINSBZ128rmkz, 0 },
- { X86::VPMINSDZ128rrkz, X86::VPMINSDZ128rmkz, 0 },
- { X86::VPMINSQZ128rrkz, X86::VPMINSQZ128rmkz, 0 },
- { X86::VPMINSWZ128rrkz, X86::VPMINSWZ128rmkz, 0 },
- { X86::VPMINUBZ128rrkz, X86::VPMINUBZ128rmkz, 0 },
- { X86::VPMINUDZ128rrkz, X86::VPMINUDZ128rmkz, 0 },
- { X86::VPMINUQZ128rrkz, X86::VPMINUQZ128rmkz, 0 },
- { X86::VPMINUWZ128rrkz, X86::VPMINUWZ128rmkz, 0 },
- { X86::VPMULDQZ128rrkz, X86::VPMULDQZ128rmkz, 0 },
- { X86::VPMULLDZ128rrkz, X86::VPMULLDZ128rmkz, 0 },
- { X86::VPMULLQZ128rrkz, X86::VPMULLQZ128rmkz, 0 },
- { X86::VPMULLWZ128rrkz, X86::VPMULLWZ128rmkz, 0 },
- { X86::VPMULUDQZ128rrkz, X86::VPMULUDQZ128rmkz, 0 },
- { X86::VPORDZ128rrkz, X86::VPORDZ128rmkz, 0 },
- { X86::VPORQZ128rrkz, X86::VPORQZ128rmkz, 0 },
- { X86::VPSHUFBZ128rrkz, X86::VPSHUFBZ128rmkz, 0 },
- { X86::VPSLLDZ128rrkz, X86::VPSLLDZ128rmkz, 0 },
- { X86::VPSLLQZ128rrkz, X86::VPSLLQZ128rmkz, 0 },
- { X86::VPSLLVDZ128rrkz, X86::VPSLLVDZ128rmkz, 0 },
- { X86::VPSLLVQZ128rrkz, X86::VPSLLVQZ128rmkz, 0 },
- { X86::VPSLLVWZ128rrkz, X86::VPSLLVWZ128rmkz, 0 },
- { X86::VPSLLWZ128rrkz, X86::VPSLLWZ128rmkz, 0 },
- { X86::VPSRADZ128rrkz, X86::VPSRADZ128rmkz, 0 },
- { X86::VPSRAQZ128rrkz, X86::VPSRAQZ128rmkz, 0 },
- { X86::VPSRAVDZ128rrkz, X86::VPSRAVDZ128rmkz, 0 },
- { X86::VPSRAVQZ128rrkz, X86::VPSRAVQZ128rmkz, 0 },
- { X86::VPSRAVWZ128rrkz, X86::VPSRAVWZ128rmkz, 0 },
- { X86::VPSRAWZ128rrkz, X86::VPSRAWZ128rmkz, 0 },
- { X86::VPSRLDZ128rrkz, X86::VPSRLDZ128rmkz, 0 },
- { X86::VPSRLQZ128rrkz, X86::VPSRLQZ128rmkz, 0 },
- { X86::VPSRLVDZ128rrkz, X86::VPSRLVDZ128rmkz, 0 },
- { X86::VPSRLVQZ128rrkz, X86::VPSRLVQZ128rmkz, 0 },
- { X86::VPSRLVWZ128rrkz, X86::VPSRLVWZ128rmkz, 0 },
- { X86::VPSRLWZ128rrkz, X86::VPSRLWZ128rmkz, 0 },
- { X86::VPSUBBZ128rrkz, X86::VPSUBBZ128rmkz, 0 },
- { X86::VPSUBDZ128rrkz, X86::VPSUBDZ128rmkz, 0 },
- { X86::VPSUBQZ128rrkz, X86::VPSUBQZ128rmkz, 0 },
- { X86::VPSUBSBZ128rrkz, X86::VPSUBSBZ128rmkz, 0 },
- { X86::VPSUBSWZ128rrkz, X86::VPSUBSWZ128rmkz, 0 },
- { X86::VPSUBUSBZ128rrkz, X86::VPSUBUSBZ128rmkz, 0 },
- { X86::VPSUBUSWZ128rrkz, X86::VPSUBUSWZ128rmkz, 0 },
- { X86::VPSUBWZ128rrkz, X86::VPSUBWZ128rmkz, 0 },
- { X86::VPUNPCKHBWZ128rrkz, X86::VPUNPCKHBWZ128rmkz, 0 },
- { X86::VPUNPCKHDQZ128rrkz, X86::VPUNPCKHDQZ128rmkz, 0 },
- { X86::VPUNPCKHQDQZ128rrkz, X86::VPUNPCKHQDQZ128rmkz, 0 },
- { X86::VPUNPCKHWDZ128rrkz, X86::VPUNPCKHWDZ128rmkz, 0 },
- { X86::VPUNPCKLBWZ128rrkz, X86::VPUNPCKLBWZ128rmkz, 0 },
- { X86::VPUNPCKLDQZ128rrkz, X86::VPUNPCKLDQZ128rmkz, 0 },
- { X86::VPUNPCKLQDQZ128rrkz, X86::VPUNPCKLQDQZ128rmkz, 0 },
- { X86::VPUNPCKLWDZ128rrkz, X86::VPUNPCKLWDZ128rmkz, 0 },
- { X86::VPXORDZ128rrkz, X86::VPXORDZ128rmkz, 0 },
- { X86::VPXORQZ128rrkz, X86::VPXORQZ128rmkz, 0 },
- { X86::VSHUFPDZ128rrikz, X86::VSHUFPDZ128rmikz, 0 },
- { X86::VSHUFPSZ128rrikz, X86::VSHUFPSZ128rmikz, 0 },
- { X86::VSUBPDZ128rrkz, X86::VSUBPDZ128rmkz, 0 },
- { X86::VSUBPSZ128rrkz, X86::VSUBPSZ128rmkz, 0 },
- { X86::VUNPCKHPDZ128rrkz, X86::VUNPCKHPDZ128rmkz, 0 },
- { X86::VUNPCKHPSZ128rrkz, X86::VUNPCKHPSZ128rmkz, 0 },
- { X86::VUNPCKLPDZ128rrkz, X86::VUNPCKLPDZ128rmkz, 0 },
- { X86::VUNPCKLPSZ128rrkz, X86::VUNPCKLPSZ128rmkz, 0 },
- { X86::VXORPDZ128rrkz, X86::VXORPDZ128rmkz, 0 },
- { X86::VXORPSZ128rrkz, X86::VXORPSZ128rmkz, 0 },
-
- // AVX-512 masked foldable instructions
- { X86::VBROADCASTSSZrk, X86::VBROADCASTSSZmk, TB_NO_REVERSE },
- { X86::VBROADCASTSDZrk, X86::VBROADCASTSDZmk, TB_NO_REVERSE },
- { X86::VPABSBZrrk, X86::VPABSBZrmk, 0 },
- { X86::VPABSDZrrk, X86::VPABSDZrmk, 0 },
- { X86::VPABSQZrrk, X86::VPABSQZrmk, 0 },
- { X86::VPABSWZrrk, X86::VPABSWZrmk, 0 },
- { X86::VPERMILPDZrik, X86::VPERMILPDZmik, 0 },
- { X86::VPERMILPSZrik, X86::VPERMILPSZmik, 0 },
- { X86::VPERMPDZrik, X86::VPERMPDZmik, 0 },
- { X86::VPERMQZrik, X86::VPERMQZmik, 0 },
- { X86::VPMOVSXBDZrrk, X86::VPMOVSXBDZrmk, 0 },
- { X86::VPMOVSXBQZrrk, X86::VPMOVSXBQZrmk, TB_NO_REVERSE },
- { X86::VPMOVSXBWZrrk, X86::VPMOVSXBWZrmk, 0 },
- { X86::VPMOVSXDQZrrk, X86::VPMOVSXDQZrmk, 0 },
- { X86::VPMOVSXWDZrrk, X86::VPMOVSXWDZrmk, 0 },
- { X86::VPMOVSXWQZrrk, X86::VPMOVSXWQZrmk, 0 },
- { X86::VPMOVZXBDZrrk, X86::VPMOVZXBDZrmk, 0 },
- { X86::VPMOVZXBQZrrk, X86::VPMOVZXBQZrmk, TB_NO_REVERSE },
- { X86::VPMOVZXBWZrrk, X86::VPMOVZXBWZrmk, 0 },
- { X86::VPMOVZXDQZrrk, X86::VPMOVZXDQZrmk, 0 },
- { X86::VPMOVZXWDZrrk, X86::VPMOVZXWDZrmk, 0 },
- { X86::VPMOVZXWQZrrk, X86::VPMOVZXWQZrmk, 0 },
- { X86::VPSHUFDZrik, X86::VPSHUFDZmik, 0 },
- { X86::VPSHUFHWZrik, X86::VPSHUFHWZmik, 0 },
- { X86::VPSHUFLWZrik, X86::VPSHUFLWZmik, 0 },
- { X86::VPSLLDZrik, X86::VPSLLDZmik, 0 },
- { X86::VPSLLQZrik, X86::VPSLLQZmik, 0 },
- { X86::VPSLLWZrik, X86::VPSLLWZmik, 0 },
- { X86::VPSRADZrik, X86::VPSRADZmik, 0 },
- { X86::VPSRAQZrik, X86::VPSRAQZmik, 0 },
- { X86::VPSRAWZrik, X86::VPSRAWZmik, 0 },
- { X86::VPSRLDZrik, X86::VPSRLDZmik, 0 },
- { X86::VPSRLQZrik, X86::VPSRLQZmik, 0 },
- { X86::VPSRLWZrik, X86::VPSRLWZmik, 0 },
-
- // AVX-512VL 256-bit masked foldable instructions
- { X86::VBROADCASTSSZ256rk, X86::VBROADCASTSSZ256mk, TB_NO_REVERSE },
- { X86::VBROADCASTSDZ256rk, X86::VBROADCASTSDZ256mk, TB_NO_REVERSE },
- { X86::VPABSBZ256rrk, X86::VPABSBZ256rmk, 0 },
- { X86::VPABSDZ256rrk, X86::VPABSDZ256rmk, 0 },
- { X86::VPABSQZ256rrk, X86::VPABSQZ256rmk, 0 },
- { X86::VPABSWZ256rrk, X86::VPABSWZ256rmk, 0 },
- { X86::VPERMILPDZ256rik, X86::VPERMILPDZ256mik, 0 },
- { X86::VPERMILPSZ256rik, X86::VPERMILPSZ256mik, 0 },
- { X86::VPERMPDZ256rik, X86::VPERMPDZ256mik, 0 },
- { X86::VPERMQZ256rik, X86::VPERMQZ256mik, 0 },
- { X86::VPMOVSXBDZ256rrk, X86::VPMOVSXBDZ256rmk, TB_NO_REVERSE },
- { X86::VPMOVSXBQZ256rrk, X86::VPMOVSXBQZ256rmk, TB_NO_REVERSE },
- { X86::VPMOVSXBWZ256rrk, X86::VPMOVSXBWZ256rmk, 0 },
- { X86::VPMOVSXDQZ256rrk, X86::VPMOVSXDQZ256rmk, 0 },
- { X86::VPMOVSXWDZ256rrk, X86::VPMOVSXWDZ256rmk, 0 },
- { X86::VPMOVSXWQZ256rrk, X86::VPMOVSXWQZ256rmk, TB_NO_REVERSE },
- { X86::VPMOVZXBDZ256rrk, X86::VPMOVZXBDZ256rmk, TB_NO_REVERSE },
- { X86::VPMOVZXBQZ256rrk, X86::VPMOVZXBQZ256rmk, TB_NO_REVERSE },
- { X86::VPMOVZXBWZ256rrk, X86::VPMOVZXBWZ256rmk, 0 },
- { X86::VPMOVZXDQZ256rrk, X86::VPMOVZXDQZ256rmk, 0 },
- { X86::VPMOVZXWDZ256rrk, X86::VPMOVZXWDZ256rmk, 0 },
- { X86::VPMOVZXWQZ256rrk, X86::VPMOVZXWQZ256rmk, TB_NO_REVERSE },
- { X86::VPSHUFDZ256rik, X86::VPSHUFDZ256mik, 0 },
- { X86::VPSHUFHWZ256rik, X86::VPSHUFHWZ256mik, 0 },
- { X86::VPSHUFLWZ256rik, X86::VPSHUFLWZ256mik, 0 },
- { X86::VPSLLDZ256rik, X86::VPSLLDZ256mik, 0 },
- { X86::VPSLLQZ256rik, X86::VPSLLQZ256mik, 0 },
- { X86::VPSLLWZ256rik, X86::VPSLLWZ256mik, 0 },
- { X86::VPSRADZ256rik, X86::VPSRADZ256mik, 0 },
- { X86::VPSRAQZ256rik, X86::VPSRAQZ256mik, 0 },
- { X86::VPSRAWZ256rik, X86::VPSRAWZ256mik, 0 },
- { X86::VPSRLDZ256rik, X86::VPSRLDZ256mik, 0 },
- { X86::VPSRLQZ256rik, X86::VPSRLQZ256mik, 0 },
- { X86::VPSRLWZ256rik, X86::VPSRLWZ256mik, 0 },
-
- // AVX-512VL 128-bit masked foldable instructions
- { X86::VBROADCASTSSZ128rk, X86::VBROADCASTSSZ128mk, TB_NO_REVERSE },
- { X86::VPABSBZ128rrk, X86::VPABSBZ128rmk, 0 },
- { X86::VPABSDZ128rrk, X86::VPABSDZ128rmk, 0 },
- { X86::VPABSQZ128rrk, X86::VPABSQZ128rmk, 0 },
- { X86::VPABSWZ128rrk, X86::VPABSWZ128rmk, 0 },
- { X86::VPERMILPDZ128rik, X86::VPERMILPDZ128mik, 0 },
- { X86::VPERMILPSZ128rik, X86::VPERMILPSZ128mik, 0 },
- { X86::VPMOVSXBDZ128rrk, X86::VPMOVSXBDZ128rmk, TB_NO_REVERSE },
- { X86::VPMOVSXBQZ128rrk, X86::VPMOVSXBQZ128rmk, TB_NO_REVERSE },
- { X86::VPMOVSXBWZ128rrk, X86::VPMOVSXBWZ128rmk, TB_NO_REVERSE },
- { X86::VPMOVSXDQZ128rrk, X86::VPMOVSXDQZ128rmk, TB_NO_REVERSE },
- { X86::VPMOVSXWDZ128rrk, X86::VPMOVSXWDZ128rmk, TB_NO_REVERSE },
- { X86::VPMOVSXWQZ128rrk, X86::VPMOVSXWQZ128rmk, TB_NO_REVERSE },
- { X86::VPMOVZXBDZ128rrk, X86::VPMOVZXBDZ128rmk, TB_NO_REVERSE },
- { X86::VPMOVZXBQZ128rrk, X86::VPMOVZXBQZ128rmk, TB_NO_REVERSE },
- { X86::VPMOVZXBWZ128rrk, X86::VPMOVZXBWZ128rmk, TB_NO_REVERSE },
- { X86::VPMOVZXDQZ128rrk, X86::VPMOVZXDQZ128rmk, TB_NO_REVERSE },
- { X86::VPMOVZXWDZ128rrk, X86::VPMOVZXWDZ128rmk, TB_NO_REVERSE },
- { X86::VPMOVZXWQZ128rrk, X86::VPMOVZXWQZ128rmk, TB_NO_REVERSE },
- { X86::VPSHUFDZ128rik, X86::VPSHUFDZ128mik, 0 },
- { X86::VPSHUFHWZ128rik, X86::VPSHUFHWZ128mik, 0 },
- { X86::VPSHUFLWZ128rik, X86::VPSHUFLWZ128mik, 0 },
- { X86::VPSLLDZ128rik, X86::VPSLLDZ128mik, 0 },
- { X86::VPSLLQZ128rik, X86::VPSLLQZ128mik, 0 },
- { X86::VPSLLWZ128rik, X86::VPSLLWZ128mik, 0 },
- { X86::VPSRADZ128rik, X86::VPSRADZ128mik, 0 },
- { X86::VPSRAQZ128rik, X86::VPSRAQZ128mik, 0 },
- { X86::VPSRAWZ128rik, X86::VPSRAWZ128mik, 0 },
- { X86::VPSRLDZ128rik, X86::VPSRLDZ128mik, 0 },
- { X86::VPSRLQZ128rik, X86::VPSRLQZ128mik, 0 },
- { X86::VPSRLWZ128rik, X86::VPSRLWZ128mik, 0 },
- };
-
for (X86MemoryFoldTableEntry Entry : MemoryFoldTable3) {
AddTableEntry(RegOp2MemOpTable3, MemOp2RegOpTable,
Entry.RegOp, Entry.MemOp,
// Index 3, folded load
Entry.Flags | TB_INDEX_3 | TB_FOLDED_LOAD);
}
- auto I = X86InstrFMA3Info::rm_begin();
- auto E = X86InstrFMA3Info::rm_end();
- for (; I != E; ++I) {
- if (!I.getGroup()->isKMasked()) {
- // Intrinsic forms need to pass TB_NO_REVERSE.
- if (I.getGroup()->isIntrinsic()) {
- AddTableEntry(RegOp2MemOpTable3, MemOp2RegOpTable,
- I.getRegOpcode(), I.getMemOpcode(),
- TB_ALIGN_NONE | TB_INDEX_3 | TB_FOLDED_LOAD | TB_NO_REVERSE);
- } else {
- AddTableEntry(RegOp2MemOpTable3, MemOp2RegOpTable,
- I.getRegOpcode(), I.getMemOpcode(),
- TB_ALIGN_NONE | TB_INDEX_3 | TB_FOLDED_LOAD);
- }
- }
- }
-
- static const X86MemoryFoldTableEntry MemoryFoldTable4[] = {
- // AVX-512 foldable masked instructions
- { X86::VADDPDZrrk, X86::VADDPDZrmk, 0 },
- { X86::VADDPSZrrk, X86::VADDPSZrmk, 0 },
- { X86::VADDSDZrr_Intk, X86::VADDSDZrm_Intk, TB_NO_REVERSE },
- { X86::VADDSSZrr_Intk, X86::VADDSSZrm_Intk, TB_NO_REVERSE },
- { X86::VALIGNDZrrik, X86::VALIGNDZrmik, 0 },
- { X86::VALIGNQZrrik, X86::VALIGNQZrmik, 0 },
- { X86::VANDNPDZrrk, X86::VANDNPDZrmk, 0 },
- { X86::VANDNPSZrrk, X86::VANDNPSZrmk, 0 },
- { X86::VANDPDZrrk, X86::VANDPDZrmk, 0 },
- { X86::VANDPSZrrk, X86::VANDPSZrmk, 0 },
- { X86::VDIVPDZrrk, X86::VDIVPDZrmk, 0 },
- { X86::VDIVPSZrrk, X86::VDIVPSZrmk, 0 },
- { X86::VDIVSDZrr_Intk, X86::VDIVSDZrm_Intk, TB_NO_REVERSE },
- { X86::VDIVSSZrr_Intk, X86::VDIVSSZrm_Intk, TB_NO_REVERSE },
- { X86::VINSERTF32x4Zrrk, X86::VINSERTF32x4Zrmk, 0 },
- { X86::VINSERTF32x8Zrrk, X86::VINSERTF32x8Zrmk, 0 },
- { X86::VINSERTF64x2Zrrk, X86::VINSERTF64x2Zrmk, 0 },
- { X86::VINSERTF64x4Zrrk, X86::VINSERTF64x4Zrmk, 0 },
- { X86::VINSERTI32x4Zrrk, X86::VINSERTI32x4Zrmk, 0 },
- { X86::VINSERTI32x8Zrrk, X86::VINSERTI32x8Zrmk, 0 },
- { X86::VINSERTI64x2Zrrk, X86::VINSERTI64x2Zrmk, 0 },
- { X86::VINSERTI64x4Zrrk, X86::VINSERTI64x4Zrmk, 0 },
- { X86::VMAXCPDZrrk, X86::VMAXCPDZrmk, 0 },
- { X86::VMAXCPSZrrk, X86::VMAXCPSZrmk, 0 },
- { X86::VMAXPDZrrk, X86::VMAXPDZrmk, 0 },
- { X86::VMAXPSZrrk, X86::VMAXPSZrmk, 0 },
- { X86::VMAXSDZrr_Intk, X86::VMAXSDZrm_Intk, 0 },
- { X86::VMAXSSZrr_Intk, X86::VMAXSSZrm_Intk, 0 },
- { X86::VMINCPDZrrk, X86::VMINCPDZrmk, 0 },
- { X86::VMINCPSZrrk, X86::VMINCPSZrmk, 0 },
- { X86::VMINPDZrrk, X86::VMINPDZrmk, 0 },
- { X86::VMINPSZrrk, X86::VMINPSZrmk, 0 },
- { X86::VMINSDZrr_Intk, X86::VMINSDZrm_Intk, 0 },
- { X86::VMINSSZrr_Intk, X86::VMINSSZrm_Intk, 0 },
- { X86::VMULPDZrrk, X86::VMULPDZrmk, 0 },
- { X86::VMULPSZrrk, X86::VMULPSZrmk, 0 },
- { X86::VMULSDZrr_Intk, X86::VMULSDZrm_Intk, TB_NO_REVERSE },
- { X86::VMULSSZrr_Intk, X86::VMULSSZrm_Intk, TB_NO_REVERSE },
- { X86::VORPDZrrk, X86::VORPDZrmk, 0 },
- { X86::VORPSZrrk, X86::VORPSZrmk, 0 },
- { X86::VPACKSSDWZrrk, X86::VPACKSSDWZrmk, 0 },
- { X86::VPACKSSWBZrrk, X86::VPACKSSWBZrmk, 0 },
- { X86::VPACKUSDWZrrk, X86::VPACKUSDWZrmk, 0 },
- { X86::VPACKUSWBZrrk, X86::VPACKUSWBZrmk, 0 },
- { X86::VPADDBZrrk, X86::VPADDBZrmk, 0 },
- { X86::VPADDDZrrk, X86::VPADDDZrmk, 0 },
- { X86::VPADDQZrrk, X86::VPADDQZrmk, 0 },
- { X86::VPADDSBZrrk, X86::VPADDSBZrmk, 0 },
- { X86::VPADDSWZrrk, X86::VPADDSWZrmk, 0 },
- { X86::VPADDUSBZrrk, X86::VPADDUSBZrmk, 0 },
- { X86::VPADDUSWZrrk, X86::VPADDUSWZrmk, 0 },
- { X86::VPADDWZrrk, X86::VPADDWZrmk, 0 },
- { X86::VPALIGNRZrrik, X86::VPALIGNRZrmik, 0 },
- { X86::VPANDDZrrk, X86::VPANDDZrmk, 0 },
- { X86::VPANDNDZrrk, X86::VPANDNDZrmk, 0 },
- { X86::VPANDNQZrrk, X86::VPANDNQZrmk, 0 },
- { X86::VPANDQZrrk, X86::VPANDQZrmk, 0 },
- { X86::VPAVGBZrrk, X86::VPAVGBZrmk, 0 },
- { X86::VPAVGWZrrk, X86::VPAVGWZrmk, 0 },
- { X86::VPERMBZrrk, X86::VPERMBZrmk, 0 },
- { X86::VPERMDZrrk, X86::VPERMDZrmk, 0 },
- { X86::VPERMI2Brrk, X86::VPERMI2Brmk, 0 },
- { X86::VPERMI2Drrk, X86::VPERMI2Drmk, 0 },
- { X86::VPERMI2PSrrk, X86::VPERMI2PSrmk, 0 },
- { X86::VPERMI2PDrrk, X86::VPERMI2PDrmk, 0 },
- { X86::VPERMI2Qrrk, X86::VPERMI2Qrmk, 0 },
- { X86::VPERMI2Wrrk, X86::VPERMI2Wrmk, 0 },
- { X86::VPERMILPDZrrk, X86::VPERMILPDZrmk, 0 },
- { X86::VPERMILPSZrrk, X86::VPERMILPSZrmk, 0 },
- { X86::VPERMPDZrrk, X86::VPERMPDZrmk, 0 },
- { X86::VPERMPSZrrk, X86::VPERMPSZrmk, 0 },
- { X86::VPERMQZrrk, X86::VPERMQZrmk, 0 },
- { X86::VPERMT2Brrk, X86::VPERMT2Brmk, 0 },
- { X86::VPERMT2Drrk, X86::VPERMT2Drmk, 0 },
- { X86::VPERMT2PSrrk, X86::VPERMT2PSrmk, 0 },
- { X86::VPERMT2PDrrk, X86::VPERMT2PDrmk, 0 },
- { X86::VPERMT2Qrrk, X86::VPERMT2Qrmk, 0 },
- { X86::VPERMT2Wrrk, X86::VPERMT2Wrmk, 0 },
- { X86::VPERMWZrrk, X86::VPERMWZrmk, 0 },
- { X86::VPMADDUBSWZrrk, X86::VPMADDUBSWZrmk, 0 },
- { X86::VPMADDWDZrrk, X86::VPMADDWDZrmk, 0 },
- { X86::VPMAXSBZrrk, X86::VPMAXSBZrmk, 0 },
- { X86::VPMAXSDZrrk, X86::VPMAXSDZrmk, 0 },
- { X86::VPMAXSQZrrk, X86::VPMAXSQZrmk, 0 },
- { X86::VPMAXSWZrrk, X86::VPMAXSWZrmk, 0 },
- { X86::VPMAXUBZrrk, X86::VPMAXUBZrmk, 0 },
- { X86::VPMAXUDZrrk, X86::VPMAXUDZrmk, 0 },
- { X86::VPMAXUQZrrk, X86::VPMAXUQZrmk, 0 },
- { X86::VPMAXUWZrrk, X86::VPMAXUWZrmk, 0 },
- { X86::VPMINSBZrrk, X86::VPMINSBZrmk, 0 },
- { X86::VPMINSDZrrk, X86::VPMINSDZrmk, 0 },
- { X86::VPMINSQZrrk, X86::VPMINSQZrmk, 0 },
- { X86::VPMINSWZrrk, X86::VPMINSWZrmk, 0 },
- { X86::VPMINUBZrrk, X86::VPMINUBZrmk, 0 },
- { X86::VPMINUDZrrk, X86::VPMINUDZrmk, 0 },
- { X86::VPMINUQZrrk, X86::VPMINUQZrmk, 0 },
- { X86::VPMINUWZrrk, X86::VPMINUWZrmk, 0 },
- { X86::VPMULDQZrrk, X86::VPMULDQZrmk, 0 },
- { X86::VPMULLDZrrk, X86::VPMULLDZrmk, 0 },
- { X86::VPMULLQZrrk, X86::VPMULLQZrmk, 0 },
- { X86::VPMULLWZrrk, X86::VPMULLWZrmk, 0 },
- { X86::VPMULUDQZrrk, X86::VPMULUDQZrmk, 0 },
- { X86::VPORDZrrk, X86::VPORDZrmk, 0 },
- { X86::VPORQZrrk, X86::VPORQZrmk, 0 },
- { X86::VPSHUFBZrrk, X86::VPSHUFBZrmk, 0 },
- { X86::VPSLLDZrrk, X86::VPSLLDZrmk, 0 },
- { X86::VPSLLQZrrk, X86::VPSLLQZrmk, 0 },
- { X86::VPSLLVDZrrk, X86::VPSLLVDZrmk, 0 },
- { X86::VPSLLVQZrrk, X86::VPSLLVQZrmk, 0 },
- { X86::VPSLLVWZrrk, X86::VPSLLVWZrmk, 0 },
- { X86::VPSLLWZrrk, X86::VPSLLWZrmk, 0 },
- { X86::VPSRADZrrk, X86::VPSRADZrmk, 0 },
- { X86::VPSRAQZrrk, X86::VPSRAQZrmk, 0 },
- { X86::VPSRAVDZrrk, X86::VPSRAVDZrmk, 0 },
- { X86::VPSRAVQZrrk, X86::VPSRAVQZrmk, 0 },
- { X86::VPSRAVWZrrk, X86::VPSRAVWZrmk, 0 },
- { X86::VPSRAWZrrk, X86::VPSRAWZrmk, 0 },
- { X86::VPSRLDZrrk, X86::VPSRLDZrmk, 0 },
- { X86::VPSRLQZrrk, X86::VPSRLQZrmk, 0 },
- { X86::VPSRLVDZrrk, X86::VPSRLVDZrmk, 0 },
- { X86::VPSRLVQZrrk, X86::VPSRLVQZrmk, 0 },
- { X86::VPSRLVWZrrk, X86::VPSRLVWZrmk, 0 },
- { X86::VPSRLWZrrk, X86::VPSRLWZrmk, 0 },
- { X86::VPSUBBZrrk, X86::VPSUBBZrmk, 0 },
- { X86::VPSUBDZrrk, X86::VPSUBDZrmk, 0 },
- { X86::VPSUBQZrrk, X86::VPSUBQZrmk, 0 },
- { X86::VPSUBSBZrrk, X86::VPSUBSBZrmk, 0 },
- { X86::VPSUBSWZrrk, X86::VPSUBSWZrmk, 0 },
- { X86::VPSUBUSBZrrk, X86::VPSUBUSBZrmk, 0 },
- { X86::VPSUBUSWZrrk, X86::VPSUBUSWZrmk, 0 },
- { X86::VPTERNLOGDZrrik, X86::VPTERNLOGDZrmik, 0 },
- { X86::VPTERNLOGQZrrik, X86::VPTERNLOGQZrmik, 0 },
- { X86::VPUNPCKHBWZrrk, X86::VPUNPCKHBWZrmk, 0 },
- { X86::VPUNPCKHDQZrrk, X86::VPUNPCKHDQZrmk, 0 },
- { X86::VPUNPCKHQDQZrrk, X86::VPUNPCKHQDQZrmk, 0 },
- { X86::VPUNPCKHWDZrrk, X86::VPUNPCKHWDZrmk, 0 },
- { X86::VPUNPCKLBWZrrk, X86::VPUNPCKLBWZrmk, 0 },
- { X86::VPUNPCKLDQZrrk, X86::VPUNPCKLDQZrmk, 0 },
- { X86::VPUNPCKLQDQZrrk, X86::VPUNPCKLQDQZrmk, 0 },
- { X86::VPUNPCKLWDZrrk, X86::VPUNPCKLWDZrmk, 0 },
- { X86::VPXORDZrrk, X86::VPXORDZrmk, 0 },
- { X86::VPXORQZrrk, X86::VPXORQZrmk, 0 },
- { X86::VSHUFPDZrrik, X86::VSHUFPDZrmik, 0 },
- { X86::VSHUFPSZrrik, X86::VSHUFPSZrmik, 0 },
- { X86::VSUBPDZrrk, X86::VSUBPDZrmk, 0 },
- { X86::VSUBPSZrrk, X86::VSUBPSZrmk, 0 },
- { X86::VSUBSDZrr_Intk, X86::VSUBSDZrm_Intk, TB_NO_REVERSE },
- { X86::VSUBSSZrr_Intk, X86::VSUBSSZrm_Intk, TB_NO_REVERSE },
- { X86::VUNPCKHPDZrrk, X86::VUNPCKHPDZrmk, 0 },
- { X86::VUNPCKHPSZrrk, X86::VUNPCKHPSZrmk, 0 },
- { X86::VUNPCKLPDZrrk, X86::VUNPCKLPDZrmk, 0 },
- { X86::VUNPCKLPSZrrk, X86::VUNPCKLPSZrmk, 0 },
- { X86::VXORPDZrrk, X86::VXORPDZrmk, 0 },
- { X86::VXORPSZrrk, X86::VXORPSZrmk, 0 },
-
- // AVX-512{F,VL} foldable masked instructions 256-bit
- { X86::VADDPDZ256rrk, X86::VADDPDZ256rmk, 0 },
- { X86::VADDPSZ256rrk, X86::VADDPSZ256rmk, 0 },
- { X86::VALIGNDZ256rrik, X86::VALIGNDZ256rmik, 0 },
- { X86::VALIGNQZ256rrik, X86::VALIGNQZ256rmik, 0 },
- { X86::VANDNPDZ256rrk, X86::VANDNPDZ256rmk, 0 },
- { X86::VANDNPSZ256rrk, X86::VANDNPSZ256rmk, 0 },
- { X86::VANDPDZ256rrk, X86::VANDPDZ256rmk, 0 },
- { X86::VANDPSZ256rrk, X86::VANDPSZ256rmk, 0 },
- { X86::VDIVPDZ256rrk, X86::VDIVPDZ256rmk, 0 },
- { X86::VDIVPSZ256rrk, X86::VDIVPSZ256rmk, 0 },
- { X86::VINSERTF32x4Z256rrk,X86::VINSERTF32x4Z256rmk, 0 },
- { X86::VINSERTF64x2Z256rrk,X86::VINSERTF64x2Z256rmk, 0 },
- { X86::VINSERTI32x4Z256rrk,X86::VINSERTI32x4Z256rmk, 0 },
- { X86::VINSERTI64x2Z256rrk,X86::VINSERTI64x2Z256rmk, 0 },
- { X86::VMAXCPDZ256rrk, X86::VMAXCPDZ256rmk, 0 },
- { X86::VMAXCPSZ256rrk, X86::VMAXCPSZ256rmk, 0 },
- { X86::VMAXPDZ256rrk, X86::VMAXPDZ256rmk, 0 },
- { X86::VMAXPSZ256rrk, X86::VMAXPSZ256rmk, 0 },
- { X86::VMINCPDZ256rrk, X86::VMINCPDZ256rmk, 0 },
- { X86::VMINCPSZ256rrk, X86::VMINCPSZ256rmk, 0 },
- { X86::VMINPDZ256rrk, X86::VMINPDZ256rmk, 0 },
- { X86::VMINPSZ256rrk, X86::VMINPSZ256rmk, 0 },
- { X86::VMULPDZ256rrk, X86::VMULPDZ256rmk, 0 },
- { X86::VMULPSZ256rrk, X86::VMULPSZ256rmk, 0 },
- { X86::VORPDZ256rrk, X86::VORPDZ256rmk, 0 },
- { X86::VORPSZ256rrk, X86::VORPSZ256rmk, 0 },
- { X86::VPACKSSDWZ256rrk, X86::VPACKSSDWZ256rmk, 0 },
- { X86::VPACKSSWBZ256rrk, X86::VPACKSSWBZ256rmk, 0 },
- { X86::VPACKUSDWZ256rrk, X86::VPACKUSDWZ256rmk, 0 },
- { X86::VPACKUSWBZ256rrk, X86::VPACKUSWBZ256rmk, 0 },
- { X86::VPADDBZ256rrk, X86::VPADDBZ256rmk, 0 },
- { X86::VPADDDZ256rrk, X86::VPADDDZ256rmk, 0 },
- { X86::VPADDQZ256rrk, X86::VPADDQZ256rmk, 0 },
- { X86::VPADDSBZ256rrk, X86::VPADDSBZ256rmk, 0 },
- { X86::VPADDSWZ256rrk, X86::VPADDSWZ256rmk, 0 },
- { X86::VPADDUSBZ256rrk, X86::VPADDUSBZ256rmk, 0 },
- { X86::VPADDUSWZ256rrk, X86::VPADDUSWZ256rmk, 0 },
- { X86::VPADDWZ256rrk, X86::VPADDWZ256rmk, 0 },
- { X86::VPALIGNRZ256rrik, X86::VPALIGNRZ256rmik, 0 },
- { X86::VPANDDZ256rrk, X86::VPANDDZ256rmk, 0 },
- { X86::VPANDNDZ256rrk, X86::VPANDNDZ256rmk, 0 },
- { X86::VPANDNQZ256rrk, X86::VPANDNQZ256rmk, 0 },
- { X86::VPANDQZ256rrk, X86::VPANDQZ256rmk, 0 },
- { X86::VPAVGBZ256rrk, X86::VPAVGBZ256rmk, 0 },
- { X86::VPAVGWZ256rrk, X86::VPAVGWZ256rmk, 0 },
- { X86::VPERMBZ256rrk, X86::VPERMBZ256rmk, 0 },
- { X86::VPERMDZ256rrk, X86::VPERMDZ256rmk, 0 },
- { X86::VPERMI2B256rrk, X86::VPERMI2B256rmk, 0 },
- { X86::VPERMI2D256rrk, X86::VPERMI2D256rmk, 0 },
- { X86::VPERMI2PD256rrk, X86::VPERMI2PD256rmk, 0 },
- { X86::VPERMI2PS256rrk, X86::VPERMI2PS256rmk, 0 },
- { X86::VPERMI2Q256rrk, X86::VPERMI2Q256rmk, 0 },
- { X86::VPERMI2W256rrk, X86::VPERMI2W256rmk, 0 },
- { X86::VPERMILPDZ256rrk, X86::VPERMILPDZ256rmk, 0 },
- { X86::VPERMILPSZ256rrk, X86::VPERMILPSZ256rmk, 0 },
- { X86::VPERMPDZ256rrk, X86::VPERMPDZ256rmk, 0 },
- { X86::VPERMPSZ256rrk, X86::VPERMPSZ256rmk, 0 },
- { X86::VPERMQZ256rrk, X86::VPERMQZ256rmk, 0 },
- { X86::VPERMT2B256rrk, X86::VPERMT2B256rmk, 0 },
- { X86::VPERMT2D256rrk, X86::VPERMT2D256rmk, 0 },
- { X86::VPERMT2PD256rrk, X86::VPERMT2PD256rmk, 0 },
- { X86::VPERMT2PS256rrk, X86::VPERMT2PS256rmk, 0 },
- { X86::VPERMT2Q256rrk, X86::VPERMT2Q256rmk, 0 },
- { X86::VPERMT2W256rrk, X86::VPERMT2W256rmk, 0 },
- { X86::VPERMWZ256rrk, X86::VPERMWZ256rmk, 0 },
- { X86::VPMADDUBSWZ256rrk, X86::VPMADDUBSWZ256rmk, 0 },
- { X86::VPMADDWDZ256rrk, X86::VPMADDWDZ256rmk, 0 },
- { X86::VPMAXSBZ256rrk, X86::VPMAXSBZ256rmk, 0 },
- { X86::VPMAXSDZ256rrk, X86::VPMAXSDZ256rmk, 0 },
- { X86::VPMAXSQZ256rrk, X86::VPMAXSQZ256rmk, 0 },
- { X86::VPMAXSWZ256rrk, X86::VPMAXSWZ256rmk, 0 },
- { X86::VPMAXUBZ256rrk, X86::VPMAXUBZ256rmk, 0 },
- { X86::VPMAXUDZ256rrk, X86::VPMAXUDZ256rmk, 0 },
- { X86::VPMAXUQZ256rrk, X86::VPMAXUQZ256rmk, 0 },
- { X86::VPMAXUWZ256rrk, X86::VPMAXUWZ256rmk, 0 },
- { X86::VPMINSBZ256rrk, X86::VPMINSBZ256rmk, 0 },
- { X86::VPMINSDZ256rrk, X86::VPMINSDZ256rmk, 0 },
- { X86::VPMINSQZ256rrk, X86::VPMINSQZ256rmk, 0 },
- { X86::VPMINSWZ256rrk, X86::VPMINSWZ256rmk, 0 },
- { X86::VPMINUBZ256rrk, X86::VPMINUBZ256rmk, 0 },
- { X86::VPMINUDZ256rrk, X86::VPMINUDZ256rmk, 0 },
- { X86::VPMINUQZ256rrk, X86::VPMINUQZ256rmk, 0 },
- { X86::VPMINUWZ256rrk, X86::VPMINUWZ256rmk, 0 },
- { X86::VPMULDQZ256rrk, X86::VPMULDQZ256rmk, 0 },
- { X86::VPMULLDZ256rrk, X86::VPMULLDZ256rmk, 0 },
- { X86::VPMULLQZ256rrk, X86::VPMULLQZ256rmk, 0 },
- { X86::VPMULLWZ256rrk, X86::VPMULLWZ256rmk, 0 },
- { X86::VPMULUDQZ256rrk, X86::VPMULUDQZ256rmk, 0 },
- { X86::VPORDZ256rrk, X86::VPORDZ256rmk, 0 },
- { X86::VPORQZ256rrk, X86::VPORQZ256rmk, 0 },
- { X86::VPSHUFBZ256rrk, X86::VPSHUFBZ256rmk, 0 },
- { X86::VPSLLDZ256rrk, X86::VPSLLDZ256rmk, 0 },
- { X86::VPSLLQZ256rrk, X86::VPSLLQZ256rmk, 0 },
- { X86::VPSLLVDZ256rrk, X86::VPSLLVDZ256rmk, 0 },
- { X86::VPSLLVQZ256rrk, X86::VPSLLVQZ256rmk, 0 },
- { X86::VPSLLVWZ256rrk, X86::VPSLLVWZ256rmk, 0 },
- { X86::VPSLLWZ256rrk, X86::VPSLLWZ256rmk, 0 },
- { X86::VPSRADZ256rrk, X86::VPSRADZ256rmk, 0 },
- { X86::VPSRAQZ256rrk, X86::VPSRAQZ256rmk, 0 },
- { X86::VPSRAVDZ256rrk, X86::VPSRAVDZ256rmk, 0 },
- { X86::VPSRAVQZ256rrk, X86::VPSRAVQZ256rmk, 0 },
- { X86::VPSRAVWZ256rrk, X86::VPSRAVWZ256rmk, 0 },
- { X86::VPSRAWZ256rrk, X86::VPSRAWZ256rmk, 0 },
- { X86::VPSRLDZ256rrk, X86::VPSRLDZ256rmk, 0 },
- { X86::VPSRLQZ256rrk, X86::VPSRLQZ256rmk, 0 },
- { X86::VPSRLVDZ256rrk, X86::VPSRLVDZ256rmk, 0 },
- { X86::VPSRLVQZ256rrk, X86::VPSRLVQZ256rmk, 0 },
- { X86::VPSRLVWZ256rrk, X86::VPSRLVWZ256rmk, 0 },
- { X86::VPSRLWZ256rrk, X86::VPSRLWZ256rmk, 0 },
- { X86::VPSUBBZ256rrk, X86::VPSUBBZ256rmk, 0 },
- { X86::VPSUBDZ256rrk, X86::VPSUBDZ256rmk, 0 },
- { X86::VPSUBQZ256rrk, X86::VPSUBQZ256rmk, 0 },
- { X86::VPSUBSBZ256rrk, X86::VPSUBSBZ256rmk, 0 },
- { X86::VPSUBSWZ256rrk, X86::VPSUBSWZ256rmk, 0 },
- { X86::VPSUBUSBZ256rrk, X86::VPSUBUSBZ256rmk, 0 },
- { X86::VPSUBUSWZ256rrk, X86::VPSUBUSWZ256rmk, 0 },
- { X86::VPSUBWZ256rrk, X86::VPSUBWZ256rmk, 0 },
- { X86::VPTERNLOGDZ256rrik, X86::VPTERNLOGDZ256rmik, 0 },
- { X86::VPTERNLOGQZ256rrik, X86::VPTERNLOGQZ256rmik, 0 },
- { X86::VPUNPCKHBWZ256rrk, X86::VPUNPCKHBWZ256rmk, 0 },
- { X86::VPUNPCKHDQZ256rrk, X86::VPUNPCKHDQZ256rmk, 0 },
- { X86::VPUNPCKHQDQZ256rrk, X86::VPUNPCKHQDQZ256rmk, 0 },
- { X86::VPUNPCKHWDZ256rrk, X86::VPUNPCKHWDZ256rmk, 0 },
- { X86::VPUNPCKLBWZ256rrk, X86::VPUNPCKLBWZ256rmk, 0 },
- { X86::VPUNPCKLDQZ256rrk, X86::VPUNPCKLDQZ256rmk, 0 },
- { X86::VPUNPCKLQDQZ256rrk, X86::VPUNPCKLQDQZ256rmk, 0 },
- { X86::VPUNPCKLWDZ256rrk, X86::VPUNPCKLWDZ256rmk, 0 },
- { X86::VPXORDZ256rrk, X86::VPXORDZ256rmk, 0 },
- { X86::VPXORQZ256rrk, X86::VPXORQZ256rmk, 0 },
- { X86::VSHUFPDZ256rrik, X86::VSHUFPDZ256rmik, 0 },
- { X86::VSHUFPSZ256rrik, X86::VSHUFPSZ256rmik, 0 },
- { X86::VSUBPDZ256rrk, X86::VSUBPDZ256rmk, 0 },
- { X86::VSUBPSZ256rrk, X86::VSUBPSZ256rmk, 0 },
- { X86::VUNPCKHPDZ256rrk, X86::VUNPCKHPDZ256rmk, 0 },
- { X86::VUNPCKHPSZ256rrk, X86::VUNPCKHPSZ256rmk, 0 },
- { X86::VUNPCKLPDZ256rrk, X86::VUNPCKLPDZ256rmk, 0 },
- { X86::VUNPCKLPSZ256rrk, X86::VUNPCKLPSZ256rmk, 0 },
- { X86::VXORPDZ256rrk, X86::VXORPDZ256rmk, 0 },
- { X86::VXORPSZ256rrk, X86::VXORPSZ256rmk, 0 },
-
- // AVX-512{F,VL} foldable instructions 128-bit
- { X86::VADDPDZ128rrk, X86::VADDPDZ128rmk, 0 },
- { X86::VADDPSZ128rrk, X86::VADDPSZ128rmk, 0 },
- { X86::VALIGNDZ128rrik, X86::VALIGNDZ128rmik, 0 },
- { X86::VALIGNQZ128rrik, X86::VALIGNQZ128rmik, 0 },
- { X86::VANDNPDZ128rrk, X86::VANDNPDZ128rmk, 0 },
- { X86::VANDNPSZ128rrk, X86::VANDNPSZ128rmk, 0 },
- { X86::VANDPDZ128rrk, X86::VANDPDZ128rmk, 0 },
- { X86::VANDPSZ128rrk, X86::VANDPSZ128rmk, 0 },
- { X86::VDIVPDZ128rrk, X86::VDIVPDZ128rmk, 0 },
- { X86::VDIVPSZ128rrk, X86::VDIVPSZ128rmk, 0 },
- { X86::VMAXCPDZ128rrk, X86::VMAXCPDZ128rmk, 0 },
- { X86::VMAXCPSZ128rrk, X86::VMAXCPSZ128rmk, 0 },
- { X86::VMAXPDZ128rrk, X86::VMAXPDZ128rmk, 0 },
- { X86::VMAXPSZ128rrk, X86::VMAXPSZ128rmk, 0 },
- { X86::VMINCPDZ128rrk, X86::VMINCPDZ128rmk, 0 },
- { X86::VMINCPSZ128rrk, X86::VMINCPSZ128rmk, 0 },
- { X86::VMINPDZ128rrk, X86::VMINPDZ128rmk, 0 },
- { X86::VMINPSZ128rrk, X86::VMINPSZ128rmk, 0 },
- { X86::VMULPDZ128rrk, X86::VMULPDZ128rmk, 0 },
- { X86::VMULPSZ128rrk, X86::VMULPSZ128rmk, 0 },
- { X86::VORPDZ128rrk, X86::VORPDZ128rmk, 0 },
- { X86::VORPSZ128rrk, X86::VORPSZ128rmk, 0 },
- { X86::VPACKSSDWZ128rrk, X86::VPACKSSDWZ128rmk, 0 },
- { X86::VPACKSSWBZ128rrk, X86::VPACKSSWBZ128rmk, 0 },
- { X86::VPACKUSDWZ128rrk, X86::VPACKUSDWZ128rmk, 0 },
- { X86::VPACKUSWBZ128rrk, X86::VPACKUSWBZ128rmk, 0 },
- { X86::VPADDBZ128rrk, X86::VPADDBZ128rmk, 0 },
- { X86::VPADDDZ128rrk, X86::VPADDDZ128rmk, 0 },
- { X86::VPADDQZ128rrk, X86::VPADDQZ128rmk, 0 },
- { X86::VPADDSBZ128rrk, X86::VPADDSBZ128rmk, 0 },
- { X86::VPADDSWZ128rrk, X86::VPADDSWZ128rmk, 0 },
- { X86::VPADDUSBZ128rrk, X86::VPADDUSBZ128rmk, 0 },
- { X86::VPADDUSWZ128rrk, X86::VPADDUSWZ128rmk, 0 },
- { X86::VPADDWZ128rrk, X86::VPADDWZ128rmk, 0 },
- { X86::VPALIGNRZ128rrik, X86::VPALIGNRZ128rmik, 0 },
- { X86::VPANDDZ128rrk, X86::VPANDDZ128rmk, 0 },
- { X86::VPANDNDZ128rrk, X86::VPANDNDZ128rmk, 0 },
- { X86::VPANDNQZ128rrk, X86::VPANDNQZ128rmk, 0 },
- { X86::VPANDQZ128rrk, X86::VPANDQZ128rmk, 0 },
- { X86::VPAVGBZ128rrk, X86::VPAVGBZ128rmk, 0 },
- { X86::VPAVGWZ128rrk, X86::VPAVGWZ128rmk, 0 },
- { X86::VPERMBZ128rrk, X86::VPERMBZ128rmk, 0 },
- { X86::VPERMI2B128rrk, X86::VPERMI2B128rmk, 0 },
- { X86::VPERMI2D128rrk, X86::VPERMI2D128rmk, 0 },
- { X86::VPERMI2PD128rrk, X86::VPERMI2PD128rmk, 0 },
- { X86::VPERMI2PS128rrk, X86::VPERMI2PS128rmk, 0 },
- { X86::VPERMI2Q128rrk, X86::VPERMI2Q128rmk, 0 },
- { X86::VPERMI2W128rrk, X86::VPERMI2W128rmk, 0 },
- { X86::VPERMILPDZ128rrk, X86::VPERMILPDZ128rmk, 0 },
- { X86::VPERMILPSZ128rrk, X86::VPERMILPSZ128rmk, 0 },
- { X86::VPERMT2B128rrk, X86::VPERMT2B128rmk, 0 },
- { X86::VPERMT2D128rrk, X86::VPERMT2D128rmk, 0 },
- { X86::VPERMT2PD128rrk, X86::VPERMT2PD128rmk, 0 },
- { X86::VPERMT2PS128rrk, X86::VPERMT2PS128rmk, 0 },
- { X86::VPERMT2Q128rrk, X86::VPERMT2Q128rmk, 0 },
- { X86::VPERMT2W128rrk, X86::VPERMT2W128rmk, 0 },
- { X86::VPERMWZ128rrk, X86::VPERMWZ128rmk, 0 },
- { X86::VPMADDUBSWZ128rrk, X86::VPMADDUBSWZ128rmk, 0 },
- { X86::VPMADDWDZ128rrk, X86::VPMADDWDZ128rmk, 0 },
- { X86::VPMAXSBZ128rrk, X86::VPMAXSBZ128rmk, 0 },
- { X86::VPMAXSDZ128rrk, X86::VPMAXSDZ128rmk, 0 },
- { X86::VPMAXSQZ128rrk, X86::VPMAXSQZ128rmk, 0 },
- { X86::VPMAXSWZ128rrk, X86::VPMAXSWZ128rmk, 0 },
- { X86::VPMAXUBZ128rrk, X86::VPMAXUBZ128rmk, 0 },
- { X86::VPMAXUDZ128rrk, X86::VPMAXUDZ128rmk, 0 },
- { X86::VPMAXUQZ128rrk, X86::VPMAXUQZ128rmk, 0 },
- { X86::VPMAXUWZ128rrk, X86::VPMAXUWZ128rmk, 0 },
- { X86::VPMINSBZ128rrk, X86::VPMINSBZ128rmk, 0 },
- { X86::VPMINSDZ128rrk, X86::VPMINSDZ128rmk, 0 },
- { X86::VPMINSQZ128rrk, X86::VPMINSQZ128rmk, 0 },
- { X86::VPMINSWZ128rrk, X86::VPMINSWZ128rmk, 0 },
- { X86::VPMINUBZ128rrk, X86::VPMINUBZ128rmk, 0 },
- { X86::VPMINUDZ128rrk, X86::VPMINUDZ128rmk, 0 },
- { X86::VPMINUQZ128rrk, X86::VPMINUQZ128rmk, 0 },
- { X86::VPMINUWZ128rrk, X86::VPMINUWZ128rmk, 0 },
- { X86::VPMULDQZ128rrk, X86::VPMULDQZ128rmk, 0 },
- { X86::VPMULLDZ128rrk, X86::VPMULLDZ128rmk, 0 },
- { X86::VPMULLQZ128rrk, X86::VPMULLQZ128rmk, 0 },
- { X86::VPMULLWZ128rrk, X86::VPMULLWZ128rmk, 0 },
- { X86::VPMULUDQZ128rrk, X86::VPMULUDQZ128rmk, 0 },
- { X86::VPORDZ128rrk, X86::VPORDZ128rmk, 0 },
- { X86::VPORQZ128rrk, X86::VPORQZ128rmk, 0 },
- { X86::VPSHUFBZ128rrk, X86::VPSHUFBZ128rmk, 0 },
- { X86::VPSLLDZ128rrk, X86::VPSLLDZ128rmk, 0 },
- { X86::VPSLLQZ128rrk, X86::VPSLLQZ128rmk, 0 },
- { X86::VPSLLVDZ128rrk, X86::VPSLLVDZ128rmk, 0 },
- { X86::VPSLLVQZ128rrk, X86::VPSLLVQZ128rmk, 0 },
- { X86::VPSLLVWZ128rrk, X86::VPSLLVWZ128rmk, 0 },
- { X86::VPSLLWZ128rrk, X86::VPSLLWZ128rmk, 0 },
- { X86::VPSRADZ128rrk, X86::VPSRADZ128rmk, 0 },
- { X86::VPSRAQZ128rrk, X86::VPSRAQZ128rmk, 0 },
- { X86::VPSRAVDZ128rrk, X86::VPSRAVDZ128rmk, 0 },
- { X86::VPSRAVQZ128rrk, X86::VPSRAVQZ128rmk, 0 },
- { X86::VPSRAVWZ128rrk, X86::VPSRAVWZ128rmk, 0 },
- { X86::VPSRAWZ128rrk, X86::VPSRAWZ128rmk, 0 },
- { X86::VPSRLDZ128rrk, X86::VPSRLDZ128rmk, 0 },
- { X86::VPSRLQZ128rrk, X86::VPSRLQZ128rmk, 0 },
- { X86::VPSRLVDZ128rrk, X86::VPSRLVDZ128rmk, 0 },
- { X86::VPSRLVQZ128rrk, X86::VPSRLVQZ128rmk, 0 },
- { X86::VPSRLVWZ128rrk, X86::VPSRLVWZ128rmk, 0 },
- { X86::VPSRLWZ128rrk, X86::VPSRLWZ128rmk, 0 },
- { X86::VPSUBBZ128rrk, X86::VPSUBBZ128rmk, 0 },
- { X86::VPSUBDZ128rrk, X86::VPSUBDZ128rmk, 0 },
- { X86::VPSUBQZ128rrk, X86::VPSUBQZ128rmk, 0 },
- { X86::VPSUBSBZ128rrk, X86::VPSUBSBZ128rmk, 0 },
- { X86::VPSUBSWZ128rrk, X86::VPSUBSWZ128rmk, 0 },
- { X86::VPSUBUSBZ128rrk, X86::VPSUBUSBZ128rmk, 0 },
- { X86::VPSUBUSWZ128rrk, X86::VPSUBUSWZ128rmk, 0 },
- { X86::VPSUBWZ128rrk, X86::VPSUBWZ128rmk, 0 },
- { X86::VPTERNLOGDZ128rrik, X86::VPTERNLOGDZ128rmik, 0 },
- { X86::VPTERNLOGQZ128rrik, X86::VPTERNLOGQZ128rmik, 0 },
- { X86::VPUNPCKHBWZ128rrk, X86::VPUNPCKHBWZ128rmk, 0 },
- { X86::VPUNPCKHDQZ128rrk, X86::VPUNPCKHDQZ128rmk, 0 },
- { X86::VPUNPCKHQDQZ128rrk, X86::VPUNPCKHQDQZ128rmk, 0 },
- { X86::VPUNPCKHWDZ128rrk, X86::VPUNPCKHWDZ128rmk, 0 },
- { X86::VPUNPCKLBWZ128rrk, X86::VPUNPCKLBWZ128rmk, 0 },
- { X86::VPUNPCKLDQZ128rrk, X86::VPUNPCKLDQZ128rmk, 0 },
- { X86::VPUNPCKLQDQZ128rrk, X86::VPUNPCKLQDQZ128rmk, 0 },
- { X86::VPUNPCKLWDZ128rrk, X86::VPUNPCKLWDZ128rmk, 0 },
- { X86::VPXORDZ128rrk, X86::VPXORDZ128rmk, 0 },
- { X86::VPXORQZ128rrk, X86::VPXORQZ128rmk, 0 },
- { X86::VSHUFPDZ128rrik, X86::VSHUFPDZ128rmik, 0 },
- { X86::VSHUFPSZ128rrik, X86::VSHUFPSZ128rmik, 0 },
- { X86::VSUBPDZ128rrk, X86::VSUBPDZ128rmk, 0 },
- { X86::VSUBPSZ128rrk, X86::VSUBPSZ128rmk, 0 },
- { X86::VUNPCKHPDZ128rrk, X86::VUNPCKHPDZ128rmk, 0 },
- { X86::VUNPCKHPSZ128rrk, X86::VUNPCKHPSZ128rmk, 0 },
- { X86::VUNPCKLPDZ128rrk, X86::VUNPCKLPDZ128rmk, 0 },
- { X86::VUNPCKLPSZ128rrk, X86::VUNPCKLPSZ128rmk, 0 },
- { X86::VXORPDZ128rrk, X86::VXORPDZ128rmk, 0 },
- { X86::VXORPSZ128rrk, X86::VXORPSZ128rmk, 0 },
-
- // 512-bit three source instructions with zero masking.
- { X86::VPERMI2Brrkz, X86::VPERMI2Brmkz, 0 },
- { X86::VPERMI2Drrkz, X86::VPERMI2Drmkz, 0 },
- { X86::VPERMI2PSrrkz, X86::VPERMI2PSrmkz, 0 },
- { X86::VPERMI2PDrrkz, X86::VPERMI2PDrmkz, 0 },
- { X86::VPERMI2Qrrkz, X86::VPERMI2Qrmkz, 0 },
- { X86::VPERMI2Wrrkz, X86::VPERMI2Wrmkz, 0 },
- { X86::VPERMT2Brrkz, X86::VPERMT2Brmkz, 0 },
- { X86::VPERMT2Drrkz, X86::VPERMT2Drmkz, 0 },
- { X86::VPERMT2PSrrkz, X86::VPERMT2PSrmkz, 0 },
- { X86::VPERMT2PDrrkz, X86::VPERMT2PDrmkz, 0 },
- { X86::VPERMT2Qrrkz, X86::VPERMT2Qrmkz, 0 },
- { X86::VPERMT2Wrrkz, X86::VPERMT2Wrmkz, 0 },
- { X86::VPTERNLOGDZrrikz, X86::VPTERNLOGDZrmikz, 0 },
- { X86::VPTERNLOGQZrrikz, X86::VPTERNLOGQZrmikz, 0 },
-
- // 256-bit three source instructions with zero masking.
- { X86::VPERMI2B256rrkz, X86::VPERMI2B256rmkz, 0 },
- { X86::VPERMI2D256rrkz, X86::VPERMI2D256rmkz, 0 },
- { X86::VPERMI2PD256rrkz, X86::VPERMI2PD256rmkz, 0 },
- { X86::VPERMI2PS256rrkz, X86::VPERMI2PS256rmkz, 0 },
- { X86::VPERMI2Q256rrkz, X86::VPERMI2Q256rmkz, 0 },
- { X86::VPERMI2W256rrkz, X86::VPERMI2W256rmkz, 0 },
- { X86::VPERMT2B256rrkz, X86::VPERMT2B256rmkz, 0 },
- { X86::VPERMT2D256rrkz, X86::VPERMT2D256rmkz, 0 },
- { X86::VPERMT2PD256rrkz, X86::VPERMT2PD256rmkz, 0 },
- { X86::VPERMT2PS256rrkz, X86::VPERMT2PS256rmkz, 0 },
- { X86::VPERMT2Q256rrkz, X86::VPERMT2Q256rmkz, 0 },
- { X86::VPERMT2W256rrkz, X86::VPERMT2W256rmkz, 0 },
- { X86::VPTERNLOGDZ256rrikz,X86::VPTERNLOGDZ256rmikz, 0 },
- { X86::VPTERNLOGQZ256rrikz,X86::VPTERNLOGQZ256rmikz, 0 },
-
- // 128-bit three source instructions with zero masking.
- { X86::VPERMI2B128rrkz, X86::VPERMI2B128rmkz, 0 },
- { X86::VPERMI2D128rrkz, X86::VPERMI2D128rmkz, 0 },
- { X86::VPERMI2PD128rrkz, X86::VPERMI2PD128rmkz, 0 },
- { X86::VPERMI2PS128rrkz, X86::VPERMI2PS128rmkz, 0 },
- { X86::VPERMI2Q128rrkz, X86::VPERMI2Q128rmkz, 0 },
- { X86::VPERMI2W128rrkz, X86::VPERMI2W128rmkz, 0 },
- { X86::VPERMT2B128rrkz, X86::VPERMT2B128rmkz, 0 },
- { X86::VPERMT2D128rrkz, X86::VPERMT2D128rmkz, 0 },
- { X86::VPERMT2PD128rrkz, X86::VPERMT2PD128rmkz, 0 },
- { X86::VPERMT2PS128rrkz, X86::VPERMT2PS128rmkz, 0 },
- { X86::VPERMT2Q128rrkz, X86::VPERMT2Q128rmkz, 0 },
- { X86::VPERMT2W128rrkz, X86::VPERMT2W128rmkz, 0 },
- { X86::VPTERNLOGDZ128rrikz,X86::VPTERNLOGDZ128rmikz, 0 },
- { X86::VPTERNLOGQZ128rrikz,X86::VPTERNLOGQZ128rmikz, 0 },
- };
for (X86MemoryFoldTableEntry Entry : MemoryFoldTable4) {
AddTableEntry(RegOp2MemOpTable4, MemOp2RegOpTable,
@@ -3539,20 +163,6 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
// Index 4, folded load
Entry.Flags | TB_INDEX_4 | TB_FOLDED_LOAD);
}
- for (I = X86InstrFMA3Info::rm_begin(); I != E; ++I) {
- if (I.getGroup()->isKMasked()) {
- // Intrinsics need to pass TB_NO_REVERSE.
- if (I.getGroup()->isIntrinsic()) {
- AddTableEntry(RegOp2MemOpTable4, MemOp2RegOpTable,
- I.getRegOpcode(), I.getMemOpcode(),
- TB_ALIGN_NONE | TB_INDEX_4 | TB_FOLDED_LOAD | TB_NO_REVERSE);
- } else {
- AddTableEntry(RegOp2MemOpTable4, MemOp2RegOpTable,
- I.getRegOpcode(), I.getMemOpcode(),
- TB_ALIGN_NONE | TB_INDEX_4 | TB_FOLDED_LOAD);
- }
- }
- }
}
void
@@ -5930,7 +2540,7 @@ void X86InstrInfo::replaceBranchWithTailCall(
// Add implicit uses and defs of all live regs potentially clobbered by the
// call. This way they still appear live across the call.
- LivePhysRegs LiveRegs(&getRegisterInfo());
+ LivePhysRegs LiveRegs(getRegisterInfo());
LiveRegs.addLiveOuts(MBB);
SmallVector<std::pair<unsigned, const MachineOperand *>, 8> Clobbers;
LiveRegs.stepForward(*MIB, Clobbers);
@@ -6545,9 +3155,9 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
// first frame index.
// See X86ISelLowering.cpp - X86::hasCopyImplyingStackAdjustment.
- const TargetRegisterInfo *TRI = &getRegisterInfo();
+ const TargetRegisterInfo &TRI = getRegisterInfo();
MachineBasicBlock::LivenessQueryResult LQR =
- MBB.computeRegisterLiveness(TRI, AX, MI);
+ MBB.computeRegisterLiveness(&TRI, AX, MI);
// We do not want to save and restore AX if we do not have to.
// Moreover, if we do so whereas AX is dead, we would need to set
// an undef flag on the use of AX, otherwise the verifier will
@@ -6564,7 +3174,7 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
}
// AX contains the top most register in the aliasing hierarchy.
// It may not be live, but one of its aliases may be.
- for (MCRegAliasIterator AI(AX, TRI, true);
+ for (MCRegAliasIterator AI(AX, &TRI, true);
AI.isValid() && LQR != MachineBasicBlock::LQR_Live; ++AI)
LQR = LPR.contains(*AI) ? MachineBasicBlock::LQR_Live
: MachineBasicBlock::LQR_Dead;
@@ -8374,7 +4984,7 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI,
unsigned Opc = LoadMI.getOpcode();
unsigned UserOpc = UserMI.getOpcode();
const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
- const TargetRegisterClass *RC =
+ const TargetRegisterClass *RC =
MF.getRegInfo().getRegClass(LoadMI.getOperand(0).getReg());
unsigned RegSize = TRI.getRegSizeInBits(*RC);
@@ -10473,7 +7083,7 @@ X86InstrInfo::getOutliningType(MachineInstr &MI) const {
// catch it.
if (MI.modifiesRegister(X86::RSP, &RI) || MI.readsRegister(X86::RSP, &RI) ||
MI.getDesc().hasImplicitUseOfPhysReg(X86::RSP) ||
- MI.getDesc().hasImplicitDefOfPhysReg(X86::RSP))
+ MI.getDesc().hasImplicitDefOfPhysReg(X86::RSP))
return MachineOutlinerInstrType::Illegal;
// Outlined calls change the instruction pointer, so don't read from it.
diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td
index 01df07e1715f..fab70e918b8a 100644
--- a/lib/Target/X86/X86InstrInfo.td
+++ b/lib/Target/X86/X86InstrInfo.td
@@ -813,6 +813,8 @@ def UseAVX2 : Predicate<"Subtarget->hasAVX2() && !Subtarget->hasAVX512()">;
def NoAVX512 : Predicate<"!Subtarget->hasAVX512()">;
def HasCDI : Predicate<"Subtarget->hasCDI()">,
AssemblerPredicate<"FeatureCDI", "AVX-512 CD ISA">;
+def HasVPOPCNTDQ : Predicate<"Subtarget->hasVPOPCNTDQ()">,
+ AssemblerPredicate<"FeatureVPOPCNTDQ", "AVX-512 VPOPCNTDQ ISA">;
def HasPFI : Predicate<"Subtarget->hasPFI()">,
AssemblerPredicate<"FeaturePFI", "AVX-512 PF ISA">;
def HasERI : Predicate<"Subtarget->hasERI()">,
@@ -1436,11 +1438,14 @@ def MOV64ri : RIi64<0xB8, AddRegFrm, (outs GR64:$dst), (ins i64imm:$src),
// Longer forms that use a ModR/M byte. Needed for disassembler
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
def MOV8ri_alt : Ii8 <0xC6, MRM0r, (outs GR8 :$dst), (ins i8imm :$src),
- "mov{b}\t{$src, $dst|$dst, $src}", [], IIC_MOV>;
+ "mov{b}\t{$src, $dst|$dst, $src}", [], IIC_MOV>,
+ FoldGenData<"MOV8ri">;
def MOV16ri_alt : Ii16<0xC7, MRM0r, (outs GR16:$dst), (ins i16imm:$src),
- "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize16;
+ "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize16,
+ FoldGenData<"MOV16ri">;
def MOV32ri_alt : Ii32<0xC7, MRM0r, (outs GR32:$dst), (ins i32imm:$src),
- "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize32;
+ "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize32,
+ FoldGenData<"MOV32ri">;
}
} // SchedRW
@@ -1563,13 +1568,17 @@ def MOV64o64a : RIi64<0xA3, RawFrmMemOffs, (outs), (ins offset64_64:$dst),
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
SchedRW = [WriteMove] in {
def MOV8rr_REV : I<0x8A, MRMSrcReg, (outs GR8:$dst), (ins GR8:$src),
- "mov{b}\t{$src, $dst|$dst, $src}", [], IIC_MOV>;
+ "mov{b}\t{$src, $dst|$dst, $src}", [], IIC_MOV>,
+ FoldGenData<"MOV8rr">;
def MOV16rr_REV : I<0x8B, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
- "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize16;
+ "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize16,
+ FoldGenData<"MOV16rr">;
def MOV32rr_REV : I<0x8B, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
- "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize32;
+ "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize32,
+ FoldGenData<"MOV32rr">;
def MOV64rr_REV : RI<0x8B, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
- "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV>;
+ "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV>,
+ FoldGenData<"MOV64rr">;
}
let canFoldAsLoad = 1, isReMaterializable = 1, SchedRW = [WriteLoad] in {
diff --git a/lib/Target/X86/X86InstrMMX.td b/lib/Target/X86/X86InstrMMX.td
index dc3800ce381b..2c047722db24 100644
--- a/lib/Target/X86/X86InstrMMX.td
+++ b/lib/Target/X86/X86InstrMMX.td
@@ -248,7 +248,8 @@ def MMX_MOVD64grr : MMXI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR64:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set GR32:$dst,
(MMX_X86movd2w (x86mmx VR64:$src)))],
- IIC_MMX_MOV_REG_MM>, Sched<[WriteMove]>;
+ IIC_MMX_MOV_REG_MM>, Sched<[WriteMove]>,
+ FoldGenData<"MMX_MOVD64rr">;
let isBitcast = 1 in
def MMX_MOVD64to64rr : MMXRI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR64:$src),
@@ -277,7 +278,7 @@ def MMX_MOVQ64rr : MMXI<0x6F, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src),
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
def MMX_MOVQ64rr_REV : MMXI<0x7F, MRMDestReg, (outs VR64:$dst), (ins VR64:$src),
"movq\t{$src, $dst|$dst, $src}", [],
- IIC_MMX_MOVQ_RR>;
+ IIC_MMX_MOVQ_RR>, FoldGenData<"MMX_MOVQ64rr">;
}
} // SchedRW
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index f73d85e7e01b..a3e677209305 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -507,7 +507,8 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
multiclass sse12_move_rr<RegisterClass RC, SDNode OpNode, ValueType vt,
X86MemOperand x86memop, string base_opc,
- string asm_opr, Domain d = GenericDomain> {
+ string asm_opr, Domain d = GenericDomain,
+ string Name> {
let isCommutable = 1 in
def rr : SI<0x10, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, RC:$src2),
@@ -521,15 +522,17 @@ multiclass sse12_move_rr<RegisterClass RC, SDNode OpNode, ValueType vt,
def rr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst),
(ins VR128:$src1, RC:$src2),
!strconcat(base_opc, asm_opr),
- [], IIC_SSE_MOV_S_RR>, Sched<[WriteFShuffle]>;
+ [], IIC_SSE_MOV_S_RR>, Sched<[WriteFShuffle]>,
+ FoldGenData<Name#rr>;
}
multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt,
X86MemOperand x86memop, string OpcodeStr,
- Domain d = GenericDomain> {
+ Domain d = GenericDomain, string Name> {
// AVX
defm V#NAME : sse12_move_rr<RC, OpNode, vt, x86memop, OpcodeStr,
- "\t{$src2, $src1, $dst|$dst, $src1, $src2}", d>,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}", d,
+ "V"#Name>,
VEX_4V, VEX_LIG, VEX_WIG;
def V#NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
@@ -539,7 +542,7 @@ multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt,
// SSE1 & 2
let Constraints = "$src1 = $dst" in {
defm NAME : sse12_move_rr<RC, OpNode, vt, x86memop, OpcodeStr,
- "\t{$src2, $dst|$dst, $src2}", d>;
+ "\t{$src2, $dst|$dst, $src2}", d, Name>;
}
def NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
@@ -563,9 +566,9 @@ multiclass sse12_move_rm<RegisterClass RC, X86MemOperand x86memop,
}
defm MOVSS : sse12_move<FR32, X86Movss, v4f32, f32mem, "movss",
- SSEPackedSingle>, XS;
+ SSEPackedSingle, "MOVSS">, XS;
defm MOVSD : sse12_move<FR64, X86Movsd, v2f64, f64mem, "movsd",
- SSEPackedDouble>, XD;
+ SSEPackedDouble, "MOVSD">, XD;
let canFoldAsLoad = 1, isReMaterializable = 1 in {
defm MOVSS : sse12_move_rm<FR32, f32mem, loadf32, "movss",
@@ -864,35 +867,43 @@ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
def VMOVAPSrr_REV : VPSI<0x29, MRMDestReg, (outs VR128:$dst),
(ins VR128:$src),
"movaps\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_MOVA_P_RR>, VEX, VEX_WIG;
+ IIC_SSE_MOVA_P_RR>, VEX, VEX_WIG,
+ FoldGenData<"VMOVAPSrr">;
def VMOVAPDrr_REV : VPDI<0x29, MRMDestReg, (outs VR128:$dst),
(ins VR128:$src),
"movapd\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_MOVA_P_RR>, VEX, VEX_WIG;
+ IIC_SSE_MOVA_P_RR>, VEX, VEX_WIG,
+ FoldGenData<"VMOVAPDrr">;
def VMOVUPSrr_REV : VPSI<0x11, MRMDestReg, (outs VR128:$dst),
(ins VR128:$src),
"movups\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_MOVU_P_RR>, VEX, VEX_WIG;
+ IIC_SSE_MOVU_P_RR>, VEX, VEX_WIG,
+ FoldGenData<"VMOVUPSrr">;
def VMOVUPDrr_REV : VPDI<0x11, MRMDestReg, (outs VR128:$dst),
(ins VR128:$src),
"movupd\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_MOVU_P_RR>, VEX, VEX_WIG;
+ IIC_SSE_MOVU_P_RR>, VEX, VEX_WIG,
+ FoldGenData<"VMOVUPDrr">;
def VMOVAPSYrr_REV : VPSI<0x29, MRMDestReg, (outs VR256:$dst),
(ins VR256:$src),
"movaps\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_MOVA_P_RR>, VEX, VEX_L, VEX_WIG;
+ IIC_SSE_MOVA_P_RR>, VEX, VEX_L, VEX_WIG,
+ FoldGenData<"VMOVAPSYrr">;
def VMOVAPDYrr_REV : VPDI<0x29, MRMDestReg, (outs VR256:$dst),
(ins VR256:$src),
"movapd\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_MOVA_P_RR>, VEX, VEX_L, VEX_WIG;
+ IIC_SSE_MOVA_P_RR>, VEX, VEX_L, VEX_WIG,
+ FoldGenData<"VMOVAPDYrr">;
def VMOVUPSYrr_REV : VPSI<0x11, MRMDestReg, (outs VR256:$dst),
(ins VR256:$src),
"movups\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_MOVU_P_RR>, VEX, VEX_L, VEX_WIG;
+ IIC_SSE_MOVU_P_RR>, VEX, VEX_L, VEX_WIG,
+ FoldGenData<"VMOVUPSYrr">;
def VMOVUPDYrr_REV : VPDI<0x11, MRMDestReg, (outs VR256:$dst),
(ins VR256:$src),
"movupd\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_MOVU_P_RR>, VEX, VEX_L, VEX_WIG;
+ IIC_SSE_MOVU_P_RR>, VEX, VEX_L, VEX_WIG,
+ FoldGenData<"VMOVUPDYrr">;
}
// Aliases to help the assembler pick two byte VEX encodings by swapping the
@@ -938,16 +949,16 @@ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
SchedRW = [WriteFShuffle] in {
def MOVAPSrr_REV : PSI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
"movaps\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_MOVA_P_RR>;
+ IIC_SSE_MOVA_P_RR>, FoldGenData<"MOVAPSrr">;
def MOVAPDrr_REV : PDI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
"movapd\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_MOVA_P_RR>;
+ IIC_SSE_MOVA_P_RR>, FoldGenData<"MOVAPDrr">;
def MOVUPSrr_REV : PSI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
"movups\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_MOVU_P_RR>;
+ IIC_SSE_MOVU_P_RR>, FoldGenData<"MOVUPSrr">;
def MOVUPDrr_REV : PDI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
"movupd\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_MOVU_P_RR>;
+ IIC_SSE_MOVU_P_RR>, FoldGenData<"MOVUPDrr">;
}
let Predicates = [HasAVX, NoVLX] in {
@@ -3752,17 +3763,19 @@ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
def VMOVDQArr_REV : VPDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
"movdqa\t{$src, $dst|$dst, $src}", [],
IIC_SSE_MOVA_P_RR>,
- VEX, VEX_WIG;
+ VEX, VEX_WIG, FoldGenData<"VMOVDQArr">;
def VMOVDQAYrr_REV : VPDI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
"movdqa\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_MOVA_P_RR>, VEX, VEX_L, VEX_WIG;
+ IIC_SSE_MOVA_P_RR>, VEX, VEX_L, VEX_WIG,
+ FoldGenData<"VMOVDQAYrr">;
def VMOVDQUrr_REV : VSSI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
"movdqu\t{$src, $dst|$dst, $src}", [],
IIC_SSE_MOVU_P_RR>,
- VEX, VEX_WIG;
+ VEX, VEX_WIG, FoldGenData<"VMOVDQUrr">;
def VMOVDQUYrr_REV : VSSI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
"movdqu\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_MOVU_P_RR>, VEX, VEX_L, VEX_WIG;
+ IIC_SSE_MOVU_P_RR>, VEX, VEX_L, VEX_WIG,
+ FoldGenData<"VMOVDQUYrr">;
}
let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1,
@@ -3820,11 +3833,12 @@ def MOVDQUrr : I<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
def MOVDQArr_REV : PDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
"movdqa\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_MOVA_P_RR>;
+ IIC_SSE_MOVA_P_RR>, FoldGenData<"MOVDQArr">;
def MOVDQUrr_REV : I<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
"movdqu\t{$src, $dst|$dst, $src}",
- [], IIC_SSE_MOVU_P_RR>, XS, Requires<[UseSSE2]>;
+ [], IIC_SSE_MOVU_P_RR>, XS, Requires<[UseSSE2]>,
+ FoldGenData<"MOVDQUrr">;
}
} // SchedRW
@@ -5915,7 +5929,7 @@ multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> {
(ins VR128:$src1, u8imm:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- []>, Sched<[WriteShuffle]>;
+ []>, Sched<[WriteShuffle]>, FoldGenData<NAME#ri>;
let hasSideEffects = 0, mayStore = 1,
SchedRW = [WriteShuffleLd, WriteRMW] in
diff --git a/lib/Target/X86/X86InstrXOP.td b/lib/Target/X86/X86InstrXOP.td
index 53224431c0e9..5dde2d07babe 100644
--- a/lib/Target/X86/X86InstrXOP.td
+++ b/lib/Target/X86/X86InstrXOP.td
@@ -111,7 +111,7 @@ multiclass xop3op<bits<8> opc, string OpcodeStr, SDNode OpNode,
(ins VR128:$src1, VR128:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[]>,
- XOP_4V, VEX_W, Sched<[WriteVarVecShift]>;
+ XOP_4V, VEX_W, Sched<[WriteVarVecShift]>, FoldGenData<NAME#rr>;
}
let ExeDomain = SSEPackedInt in {
@@ -282,7 +282,7 @@ multiclass xop4op<bits<8> opc, string OpcodeStr, SDNode OpNode,
(ins VR128:$src1, VR128:$src2, VR128:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- []>, XOP_4V, VEX_W;
+ []>, XOP_4V, VEX_W, FoldGenData<NAME#rrr>;
}
let ExeDomain = SSEPackedInt in {
@@ -318,7 +318,7 @@ multiclass xop4op_int<bits<8> opc, string OpcodeStr, RegisterClass RC,
(ins RC:$src1, RC:$src2, RC:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- []>, XOP_4V, VEX_W;
+ []>, XOP_4V, VEX_W, FoldGenData<NAME#rrr>;
}
let ExeDomain = SSEPackedInt in {
@@ -357,7 +357,7 @@ multiclass xop_vpermil2<bits<8> Opc, string OpcodeStr, RegisterClass RC,
(ins RC:$src1, RC:$src2, RC:$src3, u8imm:$src4),
!strconcat(OpcodeStr,
"\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
- []>, VEX_W;
+ []>, VEX_W, FoldGenData<NAME#rr>;
}
let ExeDomain = SSEPackedDouble in {
diff --git a/lib/Target/X86/X86InstructionSelector.cpp b/lib/Target/X86/X86InstructionSelector.cpp
index 61956f741820..77dead8d2413 100644
--- a/lib/Target/X86/X86InstructionSelector.cpp
+++ b/lib/Target/X86/X86InstructionSelector.cpp
@@ -302,6 +302,26 @@ unsigned X86InstructionSelector::getLoadStoreOp(LLT &Ty, const RegisterBank &RB,
: HasAVX512
? X86::VMOVUPSZ128mr_NOVLX
: HasAVX ? X86::VMOVUPSmr : X86::MOVUPSmr);
+ } else if (Ty.isVector() && Ty.getSizeInBits() == 256) {
+ if (Alignment >= 32)
+ return Isload ? (HasVLX ? X86::VMOVAPSZ256rm
+ : HasAVX512 ? X86::VMOVAPSZ256rm_NOVLX
+ : X86::VMOVAPSYrm)
+ : (HasVLX ? X86::VMOVAPSZ256mr
+ : HasAVX512 ? X86::VMOVAPSZ256mr_NOVLX
+ : X86::VMOVAPSYmr);
+ else
+ return Isload ? (HasVLX ? X86::VMOVUPSZ256rm
+ : HasAVX512 ? X86::VMOVUPSZ256rm_NOVLX
+ : X86::VMOVUPSYrm)
+ : (HasVLX ? X86::VMOVUPSZ256mr
+ : HasAVX512 ? X86::VMOVUPSZ256mr_NOVLX
+ : X86::VMOVUPSYmr);
+ } else if (Ty.isVector() && Ty.getSizeInBits() == 512) {
+ if (Alignment >= 64)
+ return Isload ? X86::VMOVAPSZrm : X86::VMOVAPSZmr;
+ else
+ return Isload ? X86::VMOVUPSZrm : X86::VMOVUPSZmr;
}
return Opc;
}
diff --git a/lib/Target/X86/X86LegalizerInfo.cpp b/lib/Target/X86/X86LegalizerInfo.cpp
index da724f5d8989..979aaee110aa 100644
--- a/lib/Target/X86/X86LegalizerInfo.cpp
+++ b/lib/Target/X86/X86LegalizerInfo.cpp
@@ -35,6 +35,7 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI,
setLegalizerInfoSSE1();
setLegalizerInfoSSE2();
setLegalizerInfoSSE41();
+ setLegalizerInfoAVX();
setLegalizerInfoAVX2();
setLegalizerInfoAVX512();
setLegalizerInfoAVX512DQ();
@@ -209,6 +210,18 @@ void X86LegalizerInfo::setLegalizerInfoSSE41() {
setAction({G_MUL, v4s32}, Legal);
}
+void X86LegalizerInfo::setLegalizerInfoAVX() {
+ if (!Subtarget.hasAVX())
+ return;
+
+ const LLT v8s32 = LLT::vector(8, 32);
+ const LLT v4s64 = LLT::vector(4, 64);
+
+ for (unsigned MemOp : {G_LOAD, G_STORE})
+ for (auto Ty : {v8s32, v4s64})
+ setAction({MemOp, Ty}, Legal);
+}
+
void X86LegalizerInfo::setLegalizerInfoAVX2() {
if (!Subtarget.hasAVX2())
return;
@@ -239,6 +252,10 @@ void X86LegalizerInfo::setLegalizerInfoAVX512() {
setAction({G_MUL, v16s32}, Legal);
+ for (unsigned MemOp : {G_LOAD, G_STORE})
+ for (auto Ty : {v16s32, v8s64})
+ setAction({MemOp, Ty}, Legal);
+
/************ VLX *******************/
if (!Subtarget.hasVLX())
return;
diff --git a/lib/Target/X86/X86LegalizerInfo.h b/lib/Target/X86/X86LegalizerInfo.h
index ab5405a70427..135950a95f84 100644
--- a/lib/Target/X86/X86LegalizerInfo.h
+++ b/lib/Target/X86/X86LegalizerInfo.h
@@ -39,6 +39,7 @@ private:
void setLegalizerInfoSSE1();
void setLegalizerInfoSSE2();
void setLegalizerInfoSSE41();
+ void setLegalizerInfoAVX();
void setLegalizerInfoAVX2();
void setLegalizerInfoAVX512();
void setLegalizerInfoAVX512DQ();
diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp
index 2b1f43bffd71..84ec98484f8e 100644
--- a/lib/Target/X86/X86Subtarget.cpp
+++ b/lib/Target/X86/X86Subtarget.cpp
@@ -286,6 +286,7 @@ void X86Subtarget::initializeEnvironment() {
HasCDI = false;
HasPFI = false;
HasDQI = false;
+ HasVPOPCNTDQ = false;
HasBWI = false;
HasVLX = false;
HasADX = false;
diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h
index a9f3a2aee1be..550e95c39ab5 100644
--- a/lib/Target/X86/X86Subtarget.h
+++ b/lib/Target/X86/X86Subtarget.h
@@ -270,6 +270,9 @@ protected:
/// Processor has AVX-512 Conflict Detection Instructions
bool HasCDI;
+ /// Processor has AVX-512 population count Instructions
+ bool HasVPOPCNTDQ;
+
/// Processor has AVX-512 Doubleword and Quadword instructions
bool HasDQI;
@@ -494,6 +497,7 @@ public:
bool slow3OpsLEA() const { return Slow3OpsLEA; }
bool slowIncDec() const { return SlowIncDec; }
bool hasCDI() const { return HasCDI; }
+ bool hasVPOPCNTDQ() const { return HasVPOPCNTDQ; }
bool hasPFI() const { return HasPFI; }
bool hasERI() const { return HasERI; }
bool hasDQI() const { return HasDQI; }