Diffstat (limited to 'lib/Target')
122 files changed, 3581 insertions, 5095 deletions
diff --git a/lib/Target/AArch64/AArch64AsmPrinter.cpp b/lib/Target/AArch64/AArch64AsmPrinter.cpp index 056ffd58b521..981fd22c213c 100644 --- a/lib/Target/AArch64/AArch64AsmPrinter.cpp +++ b/lib/Target/AArch64/AArch64AsmPrinter.cpp @@ -320,6 +320,9 @@ bool AArch64AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, switch (ExtraCode[0]) { default: return true; // Unknown modifier. + case 'a': // Print 'a' modifier + PrintAsmMemoryOperand(MI, OpNum, AsmVariant, ExtraCode, O); + return false; case 'w': // Print W register case 'x': // Print X register if (MO.isReg()) @@ -388,7 +391,7 @@ bool AArch64AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, unsigned AsmVariant, const char *ExtraCode, raw_ostream &O) { - if (ExtraCode && ExtraCode[0]) + if (ExtraCode && ExtraCode[0] && ExtraCode[0] != 'a') return true; // Unknown modifier. const MachineOperand &MO = MI->getOperand(OpNum); diff --git a/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp index 629ad5c61b78..33fec74998d6 100644 --- a/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ b/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -584,27 +584,21 @@ bool AArch64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB, return true; } -static void addPostLoopLiveIns(MachineBasicBlock *MBB, LivePhysRegs &LiveRegs) { - for (auto I = LiveRegs.begin(); I != LiveRegs.end(); ++I) - MBB->addLiveIn(*I); -} - bool AArch64ExpandPseudo::expandCMP_SWAP( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned LdarOp, unsigned StlrOp, unsigned CmpOp, unsigned ExtendImm, unsigned ZeroReg, MachineBasicBlock::iterator &NextMBBI) { MachineInstr &MI = *MBBI; DebugLoc DL = MI.getDebugLoc(); - MachineOperand &Dest = MI.getOperand(0); + const MachineOperand &Dest = MI.getOperand(0); unsigned StatusReg = MI.getOperand(1).getReg(); - MachineOperand &Addr = MI.getOperand(2); - MachineOperand &Desired = MI.getOperand(3); - MachineOperand &New = MI.getOperand(4); - - LivePhysRegs LiveRegs(&TII->getRegisterInfo()); - LiveRegs.addLiveOuts(MBB); - for (auto I = std::prev(MBB.end()); I != MBBI; --I) - LiveRegs.stepBackward(*I); + bool StatusDead = MI.getOperand(1).isDead(); + // Duplicating undef operands into 2 instructions does not guarantee the same + // value on both; However undef should be replaced by xzr anyway. 
+ assert(!MI.getOperand(2).isUndef() && "cannot handle undef"); + unsigned AddrReg = MI.getOperand(2).getReg(); + unsigned DesiredReg = MI.getOperand(3).getReg(); + unsigned NewReg = MI.getOperand(4).getReg(); MachineFunction *MF = MBB.getParent(); auto LoadCmpBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); @@ -616,19 +610,18 @@ bool AArch64ExpandPseudo::expandCMP_SWAP( MF->insert(++StoreBB->getIterator(), DoneBB); // .Lloadcmp: + // mov wStatus, 0 // ldaxr xDest, [xAddr] // cmp xDest, xDesired // b.ne .Ldone - LoadCmpBB->addLiveIn(Addr.getReg()); - LoadCmpBB->addLiveIn(Dest.getReg()); - LoadCmpBB->addLiveIn(Desired.getReg()); - addPostLoopLiveIns(LoadCmpBB, LiveRegs); - + if (!StatusDead) + BuildMI(LoadCmpBB, DL, TII->get(AArch64::MOVZWi), StatusReg) + .addImm(0).addImm(0); BuildMI(LoadCmpBB, DL, TII->get(LdarOp), Dest.getReg()) - .addReg(Addr.getReg()); + .addReg(AddrReg); BuildMI(LoadCmpBB, DL, TII->get(CmpOp), ZeroReg) .addReg(Dest.getReg(), getKillRegState(Dest.isDead())) - .add(Desired) + .addReg(DesiredReg) .addImm(ExtendImm); BuildMI(LoadCmpBB, DL, TII->get(AArch64::Bcc)) .addImm(AArch64CC::NE) @@ -640,25 +633,35 @@ bool AArch64ExpandPseudo::expandCMP_SWAP( // .Lstore: // stlxr wStatus, xNew, [xAddr] // cbnz wStatus, .Lloadcmp - StoreBB->addLiveIn(Addr.getReg()); - StoreBB->addLiveIn(New.getReg()); - addPostLoopLiveIns(StoreBB, LiveRegs); - - BuildMI(StoreBB, DL, TII->get(StlrOp), StatusReg).add(New).add(Addr); + BuildMI(StoreBB, DL, TII->get(StlrOp), StatusReg) + .addReg(NewReg) + .addReg(AddrReg); BuildMI(StoreBB, DL, TII->get(AArch64::CBNZW)) - .addReg(StatusReg, RegState::Kill) + .addReg(StatusReg, getKillRegState(StatusDead)) .addMBB(LoadCmpBB); StoreBB->addSuccessor(LoadCmpBB); StoreBB->addSuccessor(DoneBB); DoneBB->splice(DoneBB->end(), &MBB, MI, MBB.end()); DoneBB->transferSuccessors(&MBB); - addPostLoopLiveIns(DoneBB, LiveRegs); MBB.addSuccessor(LoadCmpBB); NextMBBI = MBB.end(); MI.eraseFromParent(); + + // Recompute livein lists. + const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + LivePhysRegs LiveRegs; + computeLiveIns(LiveRegs, MRI, *DoneBB); + computeLiveIns(LiveRegs, MRI, *StoreBB); + computeLiveIns(LiveRegs, MRI, *LoadCmpBB); + // Do an extra pass around the loop to get loop carried registers right. + StoreBB->clearLiveIns(); + computeLiveIns(LiveRegs, MRI, *StoreBB); + LoadCmpBB->clearLiveIns(); + computeLiveIns(LiveRegs, MRI, *LoadCmpBB); + return true; } @@ -671,16 +674,15 @@ bool AArch64ExpandPseudo::expandCMP_SWAP_128( MachineOperand &DestLo = MI.getOperand(0); MachineOperand &DestHi = MI.getOperand(1); unsigned StatusReg = MI.getOperand(2).getReg(); - MachineOperand &Addr = MI.getOperand(3); - MachineOperand &DesiredLo = MI.getOperand(4); - MachineOperand &DesiredHi = MI.getOperand(5); - MachineOperand &NewLo = MI.getOperand(6); - MachineOperand &NewHi = MI.getOperand(7); - - LivePhysRegs LiveRegs(&TII->getRegisterInfo()); - LiveRegs.addLiveOuts(MBB); - for (auto I = std::prev(MBB.end()); I != MBBI; --I) - LiveRegs.stepBackward(*I); + bool StatusDead = MI.getOperand(2).isDead(); + // Duplicating undef operands into 2 instructions does not guarantee the same + // value on both; However undef should be replaced by xzr anyway. 
+ assert(!MI.getOperand(3).isUndef() && "cannot handle undef"); + unsigned AddrReg = MI.getOperand(3).getReg(); + unsigned DesiredLoReg = MI.getOperand(4).getReg(); + unsigned DesiredHiReg = MI.getOperand(5).getReg(); + unsigned NewLoReg = MI.getOperand(6).getReg(); + unsigned NewHiReg = MI.getOperand(7).getReg(); MachineFunction *MF = MBB.getParent(); auto LoadCmpBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); @@ -696,20 +698,13 @@ bool AArch64ExpandPseudo::expandCMP_SWAP_128( // cmp xDestLo, xDesiredLo // sbcs xDestHi, xDesiredHi // b.ne .Ldone - LoadCmpBB->addLiveIn(Addr.getReg()); - LoadCmpBB->addLiveIn(DestLo.getReg()); - LoadCmpBB->addLiveIn(DestHi.getReg()); - LoadCmpBB->addLiveIn(DesiredLo.getReg()); - LoadCmpBB->addLiveIn(DesiredHi.getReg()); - addPostLoopLiveIns(LoadCmpBB, LiveRegs); - BuildMI(LoadCmpBB, DL, TII->get(AArch64::LDAXPX)) .addReg(DestLo.getReg(), RegState::Define) .addReg(DestHi.getReg(), RegState::Define) - .addReg(Addr.getReg()); + .addReg(AddrReg); BuildMI(LoadCmpBB, DL, TII->get(AArch64::SUBSXrs), AArch64::XZR) .addReg(DestLo.getReg(), getKillRegState(DestLo.isDead())) - .add(DesiredLo) + .addReg(DesiredLoReg) .addImm(0); BuildMI(LoadCmpBB, DL, TII->get(AArch64::CSINCWr), StatusReg) .addUse(AArch64::WZR) @@ -717,14 +712,14 @@ bool AArch64ExpandPseudo::expandCMP_SWAP_128( .addImm(AArch64CC::EQ); BuildMI(LoadCmpBB, DL, TII->get(AArch64::SUBSXrs), AArch64::XZR) .addReg(DestHi.getReg(), getKillRegState(DestHi.isDead())) - .add(DesiredHi) + .addReg(DesiredHiReg) .addImm(0); BuildMI(LoadCmpBB, DL, TII->get(AArch64::CSINCWr), StatusReg) .addUse(StatusReg, RegState::Kill) .addUse(StatusReg, RegState::Kill) .addImm(AArch64CC::EQ); BuildMI(LoadCmpBB, DL, TII->get(AArch64::CBNZW)) - .addUse(StatusReg, RegState::Kill) + .addUse(StatusReg, getKillRegState(StatusDead)) .addMBB(DoneBB); LoadCmpBB->addSuccessor(DoneBB); LoadCmpBB->addSuccessor(StoreBB); @@ -732,28 +727,36 @@ bool AArch64ExpandPseudo::expandCMP_SWAP_128( // .Lstore: // stlxp wStatus, xNewLo, xNewHi, [xAddr] // cbnz wStatus, .Lloadcmp - StoreBB->addLiveIn(Addr.getReg()); - StoreBB->addLiveIn(NewLo.getReg()); - StoreBB->addLiveIn(NewHi.getReg()); - addPostLoopLiveIns(StoreBB, LiveRegs); BuildMI(StoreBB, DL, TII->get(AArch64::STLXPX), StatusReg) - .add(NewLo) - .add(NewHi) - .add(Addr); + .addReg(NewLoReg) + .addReg(NewHiReg) + .addReg(AddrReg); BuildMI(StoreBB, DL, TII->get(AArch64::CBNZW)) - .addReg(StatusReg, RegState::Kill) + .addReg(StatusReg, getKillRegState(StatusDead)) .addMBB(LoadCmpBB); StoreBB->addSuccessor(LoadCmpBB); StoreBB->addSuccessor(DoneBB); DoneBB->splice(DoneBB->end(), &MBB, MI, MBB.end()); DoneBB->transferSuccessors(&MBB); - addPostLoopLiveIns(DoneBB, LiveRegs); MBB.addSuccessor(LoadCmpBB); NextMBBI = MBB.end(); MI.eraseFromParent(); + + // Recompute liveness bottom up. + const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + LivePhysRegs LiveRegs; + computeLiveIns(LiveRegs, MRI, *DoneBB); + computeLiveIns(LiveRegs, MRI, *StoreBB); + computeLiveIns(LiveRegs, MRI, *LoadCmpBB); + // Do an extra pass in the loop to get the loop carried dependencies right. 
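Read together, the livein recomputation added here (the computeLiveIns calls above plus the clearLiveIns/computeLiveIns pairs that follow) amounts to the helper sketched below. This is a minimal reading of the pattern, not code from the patch: the name recomputeLoopLiveIns is hypothetical, and the computeLiveIns(LivePhysRegs &, const MachineRegisterInfo &, MachineBasicBlock &) overload is the one this revision itself calls.

static void recomputeLoopLiveIns(const MachineRegisterInfo &MRI,
                                 MachineBasicBlock &DoneBB,
                                 MachineBasicBlock &StoreBB,
                                 MachineBasicBlock &LoadCmpBB) {
  LivePhysRegs LiveRegs;
  // Walk the new blocks bottom-up so each block's live-ins reflect what the
  // blocks after it still need.
  computeLiveIns(LiveRegs, MRI, DoneBB);
  computeLiveIns(LiveRegs, MRI, StoreBB);
  computeLiveIns(LiveRegs, MRI, LoadCmpBB);
  // One extra pass over the two loop blocks so registers carried around the
  // StoreBB -> LoadCmpBB back edge appear in both live-in lists.
  StoreBB.clearLiveIns();
  computeLiveIns(LiveRegs, MRI, StoreBB);
  LoadCmpBB.clearLiveIns();
  computeLiveIns(LiveRegs, MRI, LoadCmpBB);
}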
+ StoreBB->clearLiveIns(); + computeLiveIns(LiveRegs, MRI, *StoreBB); + LoadCmpBB->clearLiveIns(); + computeLiveIns(LiveRegs, MRI, *LoadCmpBB); + return true; } diff --git a/lib/Target/AArch64/AArch64FrameLowering.cpp b/lib/Target/AArch64/AArch64FrameLowering.cpp index 1aec602a2a36..0b92249580c8 100644 --- a/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -267,12 +267,12 @@ static unsigned findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB) { return AArch64::X9; const AArch64Subtarget &Subtarget = MF->getSubtarget<AArch64Subtarget>(); - const AArch64RegisterInfo *TRI = Subtarget.getRegisterInfo(); + const AArch64RegisterInfo &TRI = *Subtarget.getRegisterInfo(); LivePhysRegs LiveRegs(TRI); LiveRegs.addLiveIns(*MBB); // Mark callee saved registers as used so we will not choose them. - const MCPhysReg *CSRegs = TRI->getCalleeSavedRegs(MF); + const MCPhysReg *CSRegs = TRI.getCalleeSavedRegs(MF); for (unsigned i = 0; CSRegs[i]; ++i) LiveRegs.addReg(CSRegs[i]); @@ -991,6 +991,7 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters( SmallVector<RegPairInfo, 8> RegPairs; computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs); + const MachineRegisterInfo &MRI = MF.getRegInfo(); for (auto RPII = RegPairs.rbegin(), RPIE = RegPairs.rend(); RPII != RPIE; ++RPII) { @@ -1022,9 +1023,11 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters( dbgs() << ")\n"); MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StrOpc)); - MBB.addLiveIn(Reg1); + if (!MRI.isReserved(Reg1)) + MBB.addLiveIn(Reg1); if (RPI.isPaired()) { - MBB.addLiveIn(Reg2); + if (!MRI.isReserved(Reg2)) + MBB.addLiveIn(Reg2); MIB.addReg(Reg2, getPrologueDeath(MF, Reg2)); MIB.addMemOperand(MF.getMachineMemOperand( MachinePointerInfo::getFixedStack(MF, RPI.FrameIdx + 1), diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp index 1af36086ad90..62f4c953830b 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -886,18 +886,21 @@ static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm, // Create the new constant immediate node. EVT VT = Op.getValueType(); SDLoc DL(Op); + SDValue New; // If the new constant immediate is all-zeros or all-ones, let the target // independent DAG combine optimize this node. - if (NewImm == 0 || NewImm == OrigMask) - return TLO.CombineTo(Op.getOperand(1), TLO.DAG.getConstant(NewImm, DL, VT)); - + if (NewImm == 0 || NewImm == OrigMask) { + New = TLO.DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0), + TLO.DAG.getConstant(NewImm, DL, VT)); // Otherwise, create a machine node so that target independent DAG combine // doesn't undo this optimization. - Enc = AArch64_AM::encodeLogicalImmediate(NewImm, Size); - SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT); - SDValue New( - TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0); + } else { + Enc = AArch64_AM::encodeLogicalImmediate(NewImm, Size); + SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT); + New = SDValue( + TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0); + } return TLO.CombineTo(Op, New); } @@ -9219,16 +9222,26 @@ static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St, // instructions (stp). 
SDLoc DL(&St); SDValue BasePtr = St.getBasePtr(); + uint64_t BaseOffset = 0; + const MachinePointerInfo &PtrInfo = St.getPointerInfo(); SDValue NewST1 = DAG.getStore(St.getChain(), DL, SplatVal, BasePtr, PtrInfo, OrigAlignment, St.getMemOperand()->getFlags()); + // As this in ISel, we will not merge this add which may degrade results. + if (BasePtr->getOpcode() == ISD::ADD && + isa<ConstantSDNode>(BasePtr->getOperand(1))) { + BaseOffset = cast<ConstantSDNode>(BasePtr->getOperand(1))->getSExtValue(); + BasePtr = BasePtr->getOperand(0); + } + unsigned Offset = EltOffset; while (--NumVecElts) { unsigned Alignment = MinAlign(OrigAlignment, Offset); - SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, - DAG.getConstant(Offset, DL, MVT::i64)); + SDValue OffsetPtr = + DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, + DAG.getConstant(BaseOffset + Offset, DL, MVT::i64)); NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr, PtrInfo.getWithOffset(Offset), Alignment, St.getMemOperand()->getFlags()); diff --git a/lib/Target/AArch64/AArch64InstrInfo.cpp b/lib/Target/AArch64/AArch64InstrInfo.cpp index c42738da7ab0..faf39be9b41e 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -763,15 +763,126 @@ bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const { llvm_unreachable("Unknown opcode to check as cheap as a move!"); } -bool AArch64InstrInfo::isFalkorLSLFast(const MachineInstr &MI) const { - if (MI.getNumOperands() < 4) +bool AArch64InstrInfo::isFalkorShiftExtFast(const MachineInstr &MI) const { + switch (MI.getOpcode()) { + default: return false; - unsigned ShOpVal = MI.getOperand(3).getImm(); - unsigned ShImm = AArch64_AM::getShiftValue(ShOpVal); - if (AArch64_AM::getShiftType(ShOpVal) == AArch64_AM::LSL && - ShImm < 4) - return true; - return false; + + case AArch64::ADDWrs: + case AArch64::ADDXrs: + case AArch64::ADDSWrs: + case AArch64::ADDSXrs: { + unsigned Imm = MI.getOperand(3).getImm(); + unsigned ShiftVal = AArch64_AM::getShiftValue(Imm); + if (ShiftVal == 0) + return true; + return AArch64_AM::getShiftType(Imm) == AArch64_AM::LSL && ShiftVal <= 5; + } + + case AArch64::ADDWrx: + case AArch64::ADDXrx: + case AArch64::ADDXrx64: + case AArch64::ADDSWrx: + case AArch64::ADDSXrx: + case AArch64::ADDSXrx64: { + unsigned Imm = MI.getOperand(3).getImm(); + switch (AArch64_AM::getArithExtendType(Imm)) { + default: + return false; + case AArch64_AM::UXTB: + case AArch64_AM::UXTH: + case AArch64_AM::UXTW: + case AArch64_AM::UXTX: + return AArch64_AM::getArithShiftValue(Imm) <= 4; + } + } + + case AArch64::SUBWrs: + case AArch64::SUBSWrs: { + unsigned Imm = MI.getOperand(3).getImm(); + unsigned ShiftVal = AArch64_AM::getShiftValue(Imm); + return ShiftVal == 0 || + (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 31); + } + + case AArch64::SUBXrs: + case AArch64::SUBSXrs: { + unsigned Imm = MI.getOperand(3).getImm(); + unsigned ShiftVal = AArch64_AM::getShiftValue(Imm); + return ShiftVal == 0 || + (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 63); + } + + case AArch64::SUBWrx: + case AArch64::SUBXrx: + case AArch64::SUBXrx64: + case AArch64::SUBSWrx: + case AArch64::SUBSXrx: + case AArch64::SUBSXrx64: { + unsigned Imm = MI.getOperand(3).getImm(); + switch (AArch64_AM::getArithExtendType(Imm)) { + default: + return false; + case AArch64_AM::UXTB: + case AArch64_AM::UXTH: + case AArch64_AM::UXTW: + case AArch64_AM::UXTX: + return AArch64_AM::getArithShiftValue(Imm) == 0; + } + } + + 
case AArch64::LDRBBroW: + case AArch64::LDRBBroX: + case AArch64::LDRBroW: + case AArch64::LDRBroX: + case AArch64::LDRDroW: + case AArch64::LDRDroX: + case AArch64::LDRHHroW: + case AArch64::LDRHHroX: + case AArch64::LDRHroW: + case AArch64::LDRHroX: + case AArch64::LDRQroW: + case AArch64::LDRQroX: + case AArch64::LDRSBWroW: + case AArch64::LDRSBWroX: + case AArch64::LDRSBXroW: + case AArch64::LDRSBXroX: + case AArch64::LDRSHWroW: + case AArch64::LDRSHWroX: + case AArch64::LDRSHXroW: + case AArch64::LDRSHXroX: + case AArch64::LDRSWroW: + case AArch64::LDRSWroX: + case AArch64::LDRSroW: + case AArch64::LDRSroX: + case AArch64::LDRWroW: + case AArch64::LDRWroX: + case AArch64::LDRXroW: + case AArch64::LDRXroX: + case AArch64::PRFMroW: + case AArch64::PRFMroX: + case AArch64::STRBBroW: + case AArch64::STRBBroX: + case AArch64::STRBroW: + case AArch64::STRBroX: + case AArch64::STRDroW: + case AArch64::STRDroX: + case AArch64::STRHHroW: + case AArch64::STRHHroX: + case AArch64::STRHroW: + case AArch64::STRHroX: + case AArch64::STRQroW: + case AArch64::STRQroX: + case AArch64::STRSroW: + case AArch64::STRSroX: + case AArch64::STRWroW: + case AArch64::STRWroX: + case AArch64::STRXroW: + case AArch64::STRXroX: { + unsigned IsSigned = MI.getOperand(3).getImm(); + return !IsSigned; + } + } } bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI, diff --git a/lib/Target/AArch64/AArch64InstrInfo.h b/lib/Target/AArch64/AArch64InstrInfo.h index 4cd14db633b9..59f3405fe439 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.h +++ b/lib/Target/AArch64/AArch64InstrInfo.h @@ -270,7 +270,7 @@ public: bool IsTailCall) const override; /// Returns true if the instruction has a shift by immediate that can be /// executed in one cycle less. - bool isFalkorLSLFast(const MachineInstr &MI) const; + bool isFalkorShiftExtFast(const MachineInstr &MI) const; private: /// \brief Sets the offsets on outlined instructions in \p MBB which use SP diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td index da68f3165c5e..ad24612239fa 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.td +++ b/lib/Target/AArch64/AArch64InstrInfo.td @@ -442,7 +442,7 @@ def MSRpstateImm4 : MSRpstateImm0_15; // TPIDR_EL0. Add pseudo op so we can mark it as not having any side effects. let hasSideEffects = 0 in def MOVbaseTLS : Pseudo<(outs GPR64:$dst), (ins), - [(set GPR64:$dst, AArch64threadpointer)]>, Sched<[]>; + [(set GPR64:$dst, AArch64threadpointer)]>, Sched<[WriteSys]>; // The cycle counter PMC register is PMCCNTR_EL0. let Predicates = [HasPerfMon] in diff --git a/lib/Target/AArch64/AArch64MacroFusion.cpp b/lib/Target/AArch64/AArch64MacroFusion.cpp index a6926a6700e1..3b71d529db59 100644 --- a/lib/Target/AArch64/AArch64MacroFusion.cpp +++ b/lib/Target/AArch64/AArch64MacroFusion.cpp @@ -232,6 +232,19 @@ static bool scheduleAdjacentImpl(ScheduleDAGMI *DAG, SUnit &AnchorSU) { dbgs() << DAG->TII->getName(FirstMI->getOpcode()) << " - " << DAG->TII->getName(SecondMI->getOpcode()) << '\n'; ); + if (&SecondSU != &DAG->ExitSU) + // Make instructions dependent on FirstSU also dependent on SecondSU to + // prevent them from being scheduled between FirstSU and and SecondSU. 
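In effect, the loop that follows copies each of FirstSU's successor edges onto SecondSU. A minimal sketch of the same idea with a range-based loop (same names as in the hunk; this is a reading of the added code, not a replacement for it):

for (const SDep &Dep : FirstSU.Succs) {
  SUnit *Succ = Dep.getSUnit();
  if (!Succ || Succ == &SecondSU)
    continue;
  // The artificial edge makes Succ depend on SecondSU as well, so it can no
  // longer be scheduled between the fused FirstSU/SecondSU pair.
  DAG->addEdge(Succ, SDep(&SecondSU, SDep::Artificial));
}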
+ for (SUnit::const_succ_iterator + SI = FirstSU.Succs.begin(), SE = FirstSU.Succs.end(); + SI != SE; ++SI) { + if (!SI->getSUnit() || SI->getSUnit() == &SecondSU) + continue; + DEBUG(dbgs() << " Copy Succ "; + SI->getSUnit()->print(dbgs(), DAG); dbgs() << '\n';); + DAG->addEdge(SI->getSUnit(), SDep(&SecondSU, SDep::Artificial)); + } + ++NumFused; return true; } diff --git a/lib/Target/AArch64/AArch64SchedFalkor.td b/lib/Target/AArch64/AArch64SchedFalkor.td index cf1c0b66db58..44fd94fc3d48 100644 --- a/lib/Target/AArch64/AArch64SchedFalkor.td +++ b/lib/Target/AArch64/AArch64SchedFalkor.td @@ -61,56 +61,42 @@ let SchedModel = FalkorModel in { let SchedModel = FalkorModel in { -def : WriteRes<WriteImm, [FalkorUnitXYZ]> { let Latency = 1; } -def : WriteRes<WriteI, [FalkorUnitXYZ]> { let Latency = 1; } -def : WriteRes<WriteISReg, [FalkorUnitVXVY, FalkorUnitVXVY]> - { let Latency = 1; let NumMicroOps = 2; } -def : WriteRes<WriteIEReg, [FalkorUnitXYZ, FalkorUnitXYZ]> - { let Latency = 2; let NumMicroOps = 2; } -def : WriteRes<WriteExtr, [FalkorUnitXYZ, FalkorUnitXYZ]> - { let Latency = 2; let NumMicroOps = 2; } -def : WriteRes<WriteIS, [FalkorUnitXYZ]> { let Latency = 1; } -def : WriteRes<WriteID32, [FalkorUnitX, FalkorUnitZ]> - { let Latency = 8; let NumMicroOps = 2; } -def : WriteRes<WriteID64, [FalkorUnitX, FalkorUnitZ]> - { let Latency = 16; let NumMicroOps = 2; } -def : WriteRes<WriteIM32, [FalkorUnitX]> { let Latency = 4; } -def : WriteRes<WriteIM64, [FalkorUnitX]> { let Latency = 5; } -def : WriteRes<WriteBr, [FalkorUnitB]> { let Latency = 1; } -def : WriteRes<WriteBrReg, [FalkorUnitB]> { let Latency = 1; } -def : WriteRes<WriteLD, [FalkorUnitLD]> { let Latency = 3; } -def : WriteRes<WriteST, [FalkorUnitST, FalkorUnitSD]> - { let Latency = 0; let NumMicroOps = 2; } -def : WriteRes<WriteSTP, [FalkorUnitST, FalkorUnitSD]> - { let Latency = 0; let NumMicroOps = 2; } -def : WriteRes<WriteAdr, [FalkorUnitXYZ]> { let Latency = 1; } -def : WriteRes<WriteLDIdx, [FalkorUnitLD]> { let Latency = 5; } -def : WriteRes<WriteSTIdx, [FalkorUnitST, FalkorUnitSD]> - { let Latency = 0; let NumMicroOps = 2; } -def : WriteRes<WriteF, [FalkorUnitVXVY, FalkorUnitVXVY]> - { let Latency = 3; let NumMicroOps = 2; } -def : WriteRes<WriteFCmp, [FalkorUnitVXVY]> { let Latency = 2; } -def : WriteRes<WriteFCvt, [FalkorUnitVXVY]> { let Latency = 4; } -def : WriteRes<WriteFCopy, [FalkorUnitVXVY]> { let Latency = 4; } -def : WriteRes<WriteFImm, [FalkorUnitVXVY]> { let Latency = 4; } -def : WriteRes<WriteFMul, [FalkorUnitVXVY, FalkorUnitVXVY]> - { let Latency = 6; let NumMicroOps = 2; } -def : WriteRes<WriteFDiv, [FalkorUnitVXVY, FalkorUnitVXVY]> - { let Latency = 12; let NumMicroOps = 2; } // Fragent -1 / NoRSV +1 -def : WriteRes<WriteV, [FalkorUnitVXVY]> { let Latency = 6; } -def : WriteRes<WriteVLD, [FalkorUnitLD]> { let Latency = 3; } -def : WriteRes<WriteVST, [FalkorUnitST, FalkorUnitVSD]> - { let Latency = 0; let NumMicroOps = 2; } - -def : WriteRes<WriteSys, []> { let Latency = 1; } -def : WriteRes<WriteBarrier, []> { let Latency = 1; } -def : WriteRes<WriteHint, []> { let Latency = 1; } - -def : WriteRes<WriteLDHi, []> { let Latency = 3; } - -def : WriteRes<WriteAtomic, []> { let Unsupported = 1; } - -// No forwarding logic is modelled yet. +// These WriteRes entries are not used in the Falkor sched model. 
+def : WriteRes<WriteImm, []> { let Unsupported = 1; } +def : WriteRes<WriteI, []> { let Unsupported = 1; } +def : WriteRes<WriteISReg, []> { let Unsupported = 1; } +def : WriteRes<WriteIEReg, []> { let Unsupported = 1; } +def : WriteRes<WriteExtr, []> { let Unsupported = 1; } +def : WriteRes<WriteIS, []> { let Unsupported = 1; } +def : WriteRes<WriteID32, []> { let Unsupported = 1; } +def : WriteRes<WriteID64, []> { let Unsupported = 1; } +def : WriteRes<WriteIM32, []> { let Unsupported = 1; } +def : WriteRes<WriteIM64, []> { let Unsupported = 1; } +def : WriteRes<WriteBr, []> { let Unsupported = 1; } +def : WriteRes<WriteBrReg, []> { let Unsupported = 1; } +def : WriteRes<WriteLD, []> { let Unsupported = 1; } +def : WriteRes<WriteST, []> { let Unsupported = 1; } +def : WriteRes<WriteSTP, []> { let Unsupported = 1; } +def : WriteRes<WriteAdr, []> { let Unsupported = 1; } +def : WriteRes<WriteLDIdx, []> { let Unsupported = 1; } +def : WriteRes<WriteSTIdx, []> { let Unsupported = 1; } +def : WriteRes<WriteF, []> { let Unsupported = 1; } +def : WriteRes<WriteFCmp, []> { let Unsupported = 1; } +def : WriteRes<WriteFCvt, []> { let Unsupported = 1; } +def : WriteRes<WriteFCopy, []> { let Unsupported = 1; } +def : WriteRes<WriteFImm, []> { let Unsupported = 1; } +def : WriteRes<WriteFMul, []> { let Unsupported = 1; } +def : WriteRes<WriteFDiv, []> { let Unsupported = 1; } +def : WriteRes<WriteV, []> { let Unsupported = 1; } +def : WriteRes<WriteVLD, []> { let Unsupported = 1; } +def : WriteRes<WriteVST, []> { let Unsupported = 1; } +def : WriteRes<WriteSys, []> { let Unsupported = 1; } +def : WriteRes<WriteBarrier, []> { let Unsupported = 1; } +def : WriteRes<WriteHint, []> { let Unsupported = 1; } +def : WriteRes<WriteLDHi, []> { let Unsupported = 1; } +def : WriteRes<WriteAtomic, []> { let Unsupported = 1; } + +// These ReadAdvance entries are not used in the Falkor sched model. def : ReadAdvance<ReadI, 0>; def : ReadAdvance<ReadISReg, 0>; def : ReadAdvance<ReadIEReg, 0>; diff --git a/lib/Target/AArch64/AArch64SchedFalkorDetails.td b/lib/Target/AArch64/AArch64SchedFalkorDetails.td index a9b4d44a523e..d098cf7a5a37 100644 --- a/lib/Target/AArch64/AArch64SchedFalkorDetails.td +++ b/lib/Target/AArch64/AArch64SchedFalkorDetails.td @@ -12,7 +12,509 @@ // //===----------------------------------------------------------------------===// -include "AArch64SchedFalkorWriteRes.td" +// Contains all of the Falkor specific SchedWriteRes types. The approach +// below is to define a generic SchedWriteRes for every combination of +// latency and microOps. The naming conventions is to use a prefix, one field +// for latency, and one or more microOp count/type designators. +// Prefix: FalkorWr +// MicroOp Count/Types: #(B|X|Y|Z|LD|ST|SD|VX|VY|VSD) +// Latency: #cyc +// +// e.g. FalkorWr_1Z_6SD_4VX_6cyc means there are 11 micro-ops to be issued +// down one Z pipe, six SD pipes, four VX pipes and the total latency is +// six cycles. +// +// Contains all of the Falkor specific ReadAdvance types for forwarding logic. +// +// Contains all of the Falkor specific WriteVariant types for immediate zero +// and LSLFast. 
+//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Define 0 micro-op types +def FalkorWr_none_1cyc : SchedWriteRes<[]> { + let Latency = 1; + let NumMicroOps = 0; +} +def FalkorWr_none_3cyc : SchedWriteRes<[]> { + let Latency = 3; + let NumMicroOps = 0; +} +def FalkorWr_none_4cyc : SchedWriteRes<[]> { + let Latency = 4; + let NumMicroOps = 0; +} + +//===----------------------------------------------------------------------===// +// Define 1 micro-op types + +def FalkorWr_1X_2cyc : SchedWriteRes<[FalkorUnitX]> { let Latency = 2; } +def FalkorWr_IMUL32_1X_2cyc : SchedWriteRes<[FalkorUnitX]> { let Latency = 4; } +def FalkorWr_IMUL64_1X_4cyc : SchedWriteRes<[FalkorUnitX]> { let Latency = 4; } +def FalkorWr_IMUL64_1X_5cyc : SchedWriteRes<[FalkorUnitX]> { let Latency = 5; } +def FalkorWr_1Z_0cyc : SchedWriteRes<[FalkorUnitZ]> { let Latency = 0; } +def FalkorWr_1ZB_0cyc : SchedWriteRes<[FalkorUnitZB]> { let Latency = 0; } +def FalkorWr_1LD_3cyc : SchedWriteRes<[FalkorUnitLD]> { let Latency = 3; } +def FalkorWr_1LD_4cyc : SchedWriteRes<[FalkorUnitLD]> { let Latency = 4; } +def FalkorWr_1XYZ_1cyc : SchedWriteRes<[FalkorUnitXYZ]> { let Latency = 1; } +def FalkorWr_1XYZ_2cyc : SchedWriteRes<[FalkorUnitXYZ]> { let Latency = 2; } +def FalkorWr_1XYZB_0cyc : SchedWriteRes<[FalkorUnitXYZB]>{ let Latency = 0; } +def FalkorWr_1XYZB_1cyc : SchedWriteRes<[FalkorUnitXYZB]>{ let Latency = 1; } +def FalkorWr_1none_0cyc : SchedWriteRes<[]> { let Latency = 0; } + +def FalkorWr_1VXVY_1cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 1; } +def FalkorWr_1VXVY_2cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 2; } +def FalkorWr_1VXVY_3cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 3; } +def FalkorWr_1VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 4; } +def FalkorWr_VMUL32_1VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 4; } +def FalkorWr_1VXVY_5cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 5; } +def FalkorWr_FMUL32_1VXVY_5cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 5; } +def FalkorWr_1VXVY_6cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 6; } +def FalkorWr_FMUL64_1VXVY_6cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 6; } + +def FalkorWr_1LD_0cyc : SchedWriteRes<[FalkorUnitLD]> { let Latency = 0; } +def FalkorWr_1ST_0cyc : SchedWriteRes<[FalkorUnitST]> { let Latency = 0; } +def FalkorWr_1ST_3cyc : SchedWriteRes<[FalkorUnitST]> { let Latency = 3; } + +def FalkorWr_1GTOV_1cyc : SchedWriteRes<[FalkorUnitGTOV]>{ let Latency = 1; } +def FalkorWr_1GTOV_4cyc : SchedWriteRes<[FalkorUnitGTOV]>{ let Latency = 4; } +def FalkorWr_1VTOG_1cyc : SchedWriteRes<[FalkorUnitVTOG]>{ let Latency = 1; } + +//===----------------------------------------------------------------------===// +// Define 2 micro-op types + +def FalkorWr_2VXVY_1cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 1; + let NumMicroOps = 2; +} +def FalkorWr_2VXVY_2cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 2; + let NumMicroOps = 2; +} +def FalkorWr_2VXVY_3cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 3; + let NumMicroOps = 2; +} +def FalkorWr_2VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 4; + let NumMicroOps = 2; +} +def FalkorWr_VMUL32_2VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 4; + let NumMicroOps = 2; +} +def FalkorWr_2VXVY_5cyc : 
SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 5; + let NumMicroOps = 2; +} +def FalkorWr_FMUL32_2VXVY_5cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 5; + let NumMicroOps = 2; +} +def FalkorWr_2VXVY_6cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 6; + let NumMicroOps = 2; +} +def FalkorWr_FMUL64_2VXVY_6cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 6; + let NumMicroOps = 2; +} + +def FalkorWr_1LD_1VXVY_4cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitVXVY]> { + let Latency = 4; + let NumMicroOps = 2; +} +def FalkorWr_1XYZ_1LD_4cyc : SchedWriteRes<[FalkorUnitXYZ, FalkorUnitLD]> { + let Latency = 4; + let NumMicroOps = 2; +} +def FalkorWr_2LD_3cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitLD]> { + let Latency = 3; + let NumMicroOps = 2; +} + +def FalkorWr_1VX_1VY_5cyc : SchedWriteRes<[FalkorUnitVX, FalkorUnitVY]> { + let Latency = 5; + let NumMicroOps = 2; +} + +def FalkorWr_1VX_1VY_2cyc : SchedWriteRes<[FalkorUnitVX, FalkorUnitVY]> { + let Latency = 2; + let NumMicroOps = 2; +} + +def FalkorWr_1VX_1VY_4cyc : SchedWriteRes<[FalkorUnitVX, FalkorUnitVY]> { + let Latency = 4; + let NumMicroOps = 2; +} + +def FalkorWr_1VX_1VY_10cyc : SchedWriteRes<[FalkorUnitVX, FalkorUnitVY]> { + let Latency = 10; + let NumMicroOps = 2; +} + +def FalkorWr_1GTOV_1VXVY_2cyc : SchedWriteRes<[FalkorUnitGTOV, FalkorUnitVXVY]> { + let Latency = 2; + let NumMicroOps = 2; +} + +def FalkorWr_2GTOV_1cyc : SchedWriteRes<[FalkorUnitGTOV, FalkorUnitGTOV]> { + let Latency = 1; + let NumMicroOps = 2; +} + +def FalkorWr_1XYZ_1ST_4cyc: SchedWriteRes<[FalkorUnitXYZ, FalkorUnitST]> { + let Latency = 4; + let NumMicroOps = 2; +} +def FalkorWr_1XYZ_1LD_5cyc: SchedWriteRes<[FalkorUnitXYZ, FalkorUnitLD]> { + let Latency = 5; + let NumMicroOps = 2; +} + +def FalkorWr_2XYZ_2cyc : SchedWriteRes<[FalkorUnitXYZ, FalkorUnitXYZ]> { + let Latency = 2; + let NumMicroOps = 2; +} + +def FalkorWr_1Z_1XY_0cyc : SchedWriteRes<[FalkorUnitZ, FalkorUnitXY]> { + let Latency = 0; + let NumMicroOps = 2; +} + +def FalkorWr_1X_1Z_8cyc : SchedWriteRes<[FalkorUnitX, FalkorUnitZ]> { + let Latency = 8; + let NumMicroOps = 2; + let ResourceCycles = [2, 8]; +} + +def FalkorWr_1X_1Z_16cyc : SchedWriteRes<[FalkorUnitX, FalkorUnitZ]> { + let Latency = 16; + let NumMicroOps = 2; + let ResourceCycles = [2, 16]; +} + +def FalkorWr_1LD_1Z_3cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitZ]> { + let Latency = 3; + let NumMicroOps = 2; +} + +def FalkorWr_1LD_1none_3cyc : SchedWriteRes<[FalkorUnitLD]> { + let Latency = 3; + let NumMicroOps = 2; +} + +def FalkorWr_1SD_1ST_0cyc: SchedWriteRes<[FalkorUnitSD, FalkorUnitST]> { + let Latency = 0; + let NumMicroOps = 2; +} + +def FalkorWr_1VSD_1ST_0cyc: SchedWriteRes<[FalkorUnitVSD, FalkorUnitST]> { + let Latency = 0; + let NumMicroOps = 2; +} + +//===----------------------------------------------------------------------===// +// Define 3 micro-op types + +def FalkorWr_1ST_1SD_1LD_0cyc : SchedWriteRes<[FalkorUnitST, FalkorUnitSD, + FalkorUnitLD]> { + let Latency = 0; + let NumMicroOps = 3; +} + +def FalkorWr_1ST_1SD_1LD_3cyc : SchedWriteRes<[FalkorUnitST, FalkorUnitSD, + FalkorUnitLD]> { + let Latency = 3; + let NumMicroOps = 3; +} + +def FalkorWr_3VXVY_3cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 3; + let NumMicroOps = 3; +} + +def FalkorWr_3VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 4; + let NumMicroOps = 3; +} + +def FalkorWr_3VXVY_5cyc : SchedWriteRes<[FalkorUnitVXVY, 
FalkorUnitVXVY]> { + let Latency = 5; + let NumMicroOps = 3; +} + +def FalkorWr_3VXVY_6cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 6; + let NumMicroOps = 3; +} + +def FalkorWr_1LD_2VXVY_4cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitVXVY]> { + let Latency = 4; + let NumMicroOps = 3; +} + +def FalkorWr_2LD_1none_3cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitLD]> { + let Latency = 3; + let NumMicroOps = 3; +} + +def FalkorWr_3LD_3cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitLD, + FalkorUnitLD]> { + let Latency = 3; + let NumMicroOps = 3; +} + +def FalkorWr_2LD_1Z_3cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitLD, + FalkorUnitZ]> { + let Latency = 3; + let NumMicroOps = 3; +} + +def FalkorWr_1XYZ_1SD_1ST_0cyc: SchedWriteRes<[FalkorUnitXYZ, FalkorUnitSD, FalkorUnitST]> { + let Latency = 0; + let NumMicroOps = 3; +} +def FalkorWr_1XYZ_1VSD_1ST_0cyc: SchedWriteRes<[FalkorUnitXYZ, FalkorUnitVSD, FalkorUnitST]> { + let Latency = 0; + let NumMicroOps = 3; +} +//===----------------------------------------------------------------------===// +// Define 4 micro-op types + +def FalkorWr_2VX_2VY_2cyc : SchedWriteRes<[FalkorUnitVX, FalkorUnitVY, + FalkorUnitVX, FalkorUnitVY]> { + let Latency = 2; + let NumMicroOps = 4; +} + +def FalkorWr_4VXVY_2cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY, + FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 2; + let NumMicroOps = 4; +} +def FalkorWr_4VXVY_3cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY, + FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 3; + let NumMicroOps = 4; +} +def FalkorWr_4VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY, + FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 4; + let NumMicroOps = 4; +} +def FalkorWr_4VXVY_6cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY, + FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 6; + let NumMicroOps = 4; +} + +def FalkorWr_4LD_3cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitLD, + FalkorUnitLD, FalkorUnitLD]> { + let Latency = 3; + let NumMicroOps = 4; +} + +def FalkorWr_1LD_3VXVY_4cyc: SchedWriteRes<[FalkorUnitLD, FalkorUnitVXVY, + FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 4; + let NumMicroOps = 4; +} + +def FalkorWr_2LD_2none_3cyc: SchedWriteRes<[FalkorUnitLD, FalkorUnitLD]> { + let Latency = 3; + let NumMicroOps = 4; +} + +def FalkorWr_2LD_1ST_1SD_3cyc: SchedWriteRes<[FalkorUnitLD, FalkorUnitST, + FalkorUnitSD, FalkorUnitLD]> { + let Latency = 3; + let NumMicroOps = 4; +} + +def FalkorWr_2VSD_2ST_0cyc: SchedWriteRes<[FalkorUnitST, FalkorUnitVSD, + FalkorUnitST, FalkorUnitVSD]> { + let Latency = 0; + let NumMicroOps = 4; +} + +//===----------------------------------------------------------------------===// +// Define 5 micro-op types + +def FalkorWr_1LD_4VXVY_4cyc: SchedWriteRes<[FalkorUnitLD, FalkorUnitVXVY, + FalkorUnitVXVY, FalkorUnitVXVY, + FalkorUnitVXVY]> { + let Latency = 4; + let NumMicroOps = 5; +} +def FalkorWr_2LD_2VXVY_1none_4cyc: SchedWriteRes<[FalkorUnitLD, FalkorUnitLD, + FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 4; + let NumMicroOps = 5; +} +def FalkorWr_5VXVY_7cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY, + FalkorUnitVXVY, FalkorUnitVXVY, + FalkorUnitVXVY]> { + let Latency = 7; + let NumMicroOps = 5; +} +def FalkorWr_1XYZ_2ST_2VSD_0cyc: SchedWriteRes<[FalkorUnitXYZ, FalkorUnitST, + FalkorUnitVSD, FalkorUnitST, + FalkorUnitVSD]> { + let Latency = 0; + let NumMicroOps = 5; +} +def FalkorWr_1VXVY_2ST_2VSD_0cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitST, + FalkorUnitVSD, FalkorUnitST, + FalkorUnitVSD]> { + let 
Latency = 0; + let NumMicroOps = 5; +} +//===----------------------------------------------------------------------===// +// Define 6 micro-op types + +def FalkorWr_2LD_2VXVY_2none_4cyc: SchedWriteRes<[FalkorUnitLD, FalkorUnitLD, + FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 4; + let NumMicroOps = 6; +} + +def FalkorWr_2XYZ_2ST_2VSD_0cyc: SchedWriteRes<[FalkorUnitXYZ, FalkorUnitST, + FalkorUnitVSD, FalkorUnitXYZ, + FalkorUnitST, FalkorUnitVSD]> { + let Latency = 0; + let NumMicroOps = 6; +} + +def FalkorWr_2VXVY_2ST_2VSD_0cyc: SchedWriteRes<[FalkorUnitVXVY, FalkorUnitST, + FalkorUnitVSD, FalkorUnitVXVY, + FalkorUnitST, FalkorUnitVSD]> { + let Latency = 0; + let NumMicroOps = 6; +} + +def FalkorWr_3VSD_3ST_0cyc: SchedWriteRes<[FalkorUnitST, FalkorUnitVSD, + FalkorUnitST, FalkorUnitVSD, + FalkorUnitST, FalkorUnitVSD]> { + let Latency = 0; + let NumMicroOps = 6; +} + +//===----------------------------------------------------------------------===// +// Define 8 micro-op types + +def FalkorWr_2LD_2VXVY_2LD_2VXVY_4cyc:SchedWriteRes<[FalkorUnitLD, FalkorUnitLD, + FalkorUnitVXVY, FalkorUnitVXVY, + FalkorUnitLD, FalkorUnitLD, + FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 4; + let NumMicroOps = 8; +} + +def FalkorWr_4VSD_4ST_0cyc: SchedWriteRes<[FalkorUnitST, FalkorUnitVSD, + FalkorUnitST, FalkorUnitVSD, + FalkorUnitST, FalkorUnitVSD, + FalkorUnitST, FalkorUnitVSD]> { + let Latency = 0; + let NumMicroOps = 8; +} + +//===----------------------------------------------------------------------===// +// Define 9 micro-op types + +def FalkorWr_2LD_2VXVY_2LD_1XYZ_2VXVY_4cyc:SchedWriteRes<[FalkorUnitLD, + FalkorUnitLD, FalkorUnitVXVY, + FalkorUnitVXVY, FalkorUnitLD, + FalkorUnitLD, FalkorUnitXYZ, + FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 4; + let NumMicroOps = 9; +} + +def FalkorWr_2LD_2VXVY_1XYZ_2LD_2VXVY_4cyc:SchedWriteRes<[FalkorUnitLD, + FalkorUnitLD, FalkorUnitVXVY, + FalkorUnitVXVY, FalkorUnitXYZ, + FalkorUnitLD, FalkorUnitLD, + FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 4; + let NumMicroOps = 9; +} + +//===----------------------------------------------------------------------===// +// Define 10 micro-op types + +def FalkorWr_2VXVY_4ST_4VSD_0cyc: SchedWriteRes<[FalkorUnitVXVY, FalkorUnitST, + FalkorUnitVSD, FalkorUnitVXVY, + FalkorUnitST, FalkorUnitVSD, + FalkorUnitST, FalkorUnitVSD, + FalkorUnitST, FalkorUnitVSD]> { + let Latency = 0; + let NumMicroOps = 10; +} + +//===----------------------------------------------------------------------===// +// Define 12 micro-op types + +def FalkorWr_4VXVY_4ST_4VSD_0cyc: SchedWriteRes<[FalkorUnitVXVY, FalkorUnitST, + FalkorUnitVSD, FalkorUnitVXVY, + FalkorUnitST, FalkorUnitVSD, + FalkorUnitVXVY, FalkorUnitST, + FalkorUnitVSD, FalkorUnitVXVY, + FalkorUnitST, FalkorUnitVSD]> { + let Latency = 0; + let NumMicroOps = 12; +} + +// Forwarding logic is modeled for multiply add/accumulate. 
+// ----------------------------------------------------------------------------- +def FalkorReadIMA32 : SchedReadAdvance<3, [FalkorWr_IMUL32_1X_2cyc]>; +def FalkorReadIMA64 : SchedReadAdvance<4, [FalkorWr_IMUL64_1X_4cyc, FalkorWr_IMUL64_1X_5cyc]>; +def FalkorReadVMA : SchedReadAdvance<3, [FalkorWr_VMUL32_1VXVY_4cyc, FalkorWr_VMUL32_2VXVY_4cyc]>; +def FalkorReadFMA32 : SchedReadAdvance<1, [FalkorWr_FMUL32_1VXVY_5cyc, FalkorWr_FMUL32_2VXVY_5cyc]>; +def FalkorReadFMA64 : SchedReadAdvance<2, [FalkorWr_FMUL64_1VXVY_6cyc, FalkorWr_FMUL64_2VXVY_6cyc]>; + +// SchedPredicates and WriteVariants for Immediate Zero and LSLFast/ASRFast +// ----------------------------------------------------------------------------- +def FalkorImmZPred : SchedPredicate<[{MI->getOperand(1).getImm() == 0}]>; +def FalkorFMOVZrReg : SchedPredicate<[{MI->getOperand(1).getReg() == AArch64::WZR || + MI->getOperand(1).getReg() == AArch64::XZR}]>; +def FalkorShiftExtFastPred : SchedPredicate<[{TII->isFalkorShiftExtFast(*MI)}]>; + +def FalkorWr_FMOV : SchedWriteVariant<[ + SchedVar<FalkorFMOVZrReg, [FalkorWr_1none_0cyc]>, + SchedVar<NoSchedPred, [FalkorWr_1GTOV_1cyc]>]>; + +def FalkorWr_MOVZ : SchedWriteVariant<[ + SchedVar<FalkorImmZPred, [FalkorWr_1none_0cyc]>, + SchedVar<NoSchedPred, [FalkorWr_1XYZB_1cyc]>]>; + +def FalkorWr_ADDSUBsx : SchedWriteVariant<[ + SchedVar<FalkorShiftExtFastPred, [FalkorWr_1XYZ_1cyc]>, + SchedVar<NoSchedPred, [FalkorWr_2XYZ_2cyc]>]>; + +def FalkorWr_LDRro : SchedWriteVariant<[ + SchedVar<FalkorShiftExtFastPred, [FalkorWr_1LD_3cyc]>, + SchedVar<NoSchedPred, [FalkorWr_1XYZ_1LD_4cyc]>]>; + +def FalkorWr_LDRSro : SchedWriteVariant<[ + SchedVar<FalkorShiftExtFastPred, [FalkorWr_1LD_4cyc]>, + SchedVar<NoSchedPred, [FalkorWr_1XYZ_1LD_5cyc]>]>; + +def FalkorWr_PRFMro : SchedWriteVariant<[ + SchedVar<FalkorShiftExtFastPred, [FalkorWr_1ST_3cyc]>, + SchedVar<NoSchedPred, [FalkorWr_1XYZ_1ST_4cyc]>]>; + +def FalkorWr_STRVro : SchedWriteVariant<[ + SchedVar<FalkorShiftExtFastPred, [FalkorWr_1VSD_1ST_0cyc]>, + SchedVar<NoSchedPred, [FalkorWr_1XYZ_1VSD_1ST_0cyc]>]>; + +def FalkorWr_STRQro : SchedWriteVariant<[ + SchedVar<FalkorShiftExtFastPred, [FalkorWr_1XYZ_2ST_2VSD_0cyc]>, + SchedVar<NoSchedPred, [FalkorWr_2XYZ_2ST_2VSD_0cyc]>]>; + +def FalkorWr_STRro : SchedWriteVariant<[ + SchedVar<FalkorShiftExtFastPred, [FalkorWr_1SD_1ST_0cyc]>, + SchedVar<NoSchedPred, [FalkorWr_1XYZ_1SD_1ST_0cyc]>]>; //===----------------------------------------------------------------------===// // Specialize the coarse model by associating instruction groups with the @@ -22,63 +524,76 @@ include "AArch64SchedFalkorWriteRes.td" // Miscellaneous // ----------------------------------------------------------------------------- -def : InstRW<[WriteI], (instrs COPY)>; +// FIXME: This could be better modeled by looking at the regclasses of the operands. 
+def : InstRW<[FalkorWr_1XYZ_1cyc], (instrs COPY)>; // SIMD Floating-point Instructions // ----------------------------------------------------------------------------- -def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^(FABS|FNEG)(v2f32|v4f16)$")>; +def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^(FABS|FNEG)v2f32$")>; -def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^(F(MAX|MIN)(NM)?P?|FAC(GE|GT))(v2f32|v4f16|v2i16p|v2i32p)$")>; -def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^FAC(GE|GT)(16|32|64)$")>; -def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^FCM(EQ|GE|GT)(16|32|64|v2f32|v4f16|v2i32|v4i16)$")>; -def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^FCM(EQ|LE|GE|GT|LT)(v1i16|v1i32|v1i64|v2i32|v4i16)rz$")>; -def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^FRINT(A|I|M|N|P|X|Z)(v2f32|v4f16)$")>; +def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^(F(MAX|MIN)(NM)?P?|FAC(GE|GT))(v2f32|v2i32p)$")>; +def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^FAC(GE|GT)(32|64)$")>; +def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^FCM(EQ|GE|GT)(32|64|v2f32|v2i32)$")>; +def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^FCM(EQ|LE|GE|GT|LT)(v1i32|v1i64|v2i32)rz$")>; +def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^FRINT(A|I|M|N|P|X|Z)v2f32$")>; -def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^F(MAX|MIN)(NM)?V(v4i16|v4i32|v8i16)v$")>; -def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(FABD|FADD|FSUB)(v2f32|v4f16)$")>; -def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^FADDP(v2i16p|v2i32p|v2i64p|v2f32|v4f16)$")>; +def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^F(MAX|MIN)(NM)?Vv4i32v$")>; +def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(FABD|FADD|FSUB)v2f32$")>; +def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^FADDP(v2i32p|v2i64p|v2f32)$")>; -def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^FCVT(N|M|P|Z|A)(S|U)(v1i32|v1i64|v1f16|v2f32|v4f16)$")>; +def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^FCVT(N|M|P|Z|A)(S|U)(v1i32|v1i64|v2f32)$")>; def : InstRW<[FalkorWr_1VXVY_4cyc], (instrs FCVTXNv1i64)>; -def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^FCVTZ(S|U)(v2i32|v4i16)(_shift)?$")>; +def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^FCVTZ(S|U)v2i32(_shift)?$")>; -def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc], (instregex "^(FMUL|FMULX)(v2f32|v4f16|(v1i16_indexed|v4i16_indexed|v1i32_indexed|v2i32_indexed))$")>; -def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc], (instrs FMULX16, FMULX32)>; +def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc], + (instregex "^(FMUL|FMULX)(v2f32|(v1i32_indexed|v2i32_indexed))$")>; +def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc], + (instrs FMULX32)>; -def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc], (instregex "^(FMUL|FMULX)v1i64_indexed$")>; -def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc], (instrs FMULX64)>; +def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc], + (instregex "^(FMUL|FMULX)v1i64_indexed$")>; +def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc], + (instrs FMULX64)>; -def : InstRW<[FalkorWr_2VXVY_1cyc], (instregex "^(FABS|FNEG)(v2f64|v4f32|v8f16)$")>; +def : InstRW<[FalkorWr_2VXVY_1cyc], (instregex "^(FABS|FNEG)(v2f64|v4f32)$")>; -def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^(F(MAX|MIN)(NM)?P?|FAC(GE|GT)|FCM(EQ|GE|GT))(v2f64|v4f32|v8f16|v2i64p)$")>; -def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^FCM(EQ|LE|GE|GT|LT)(v2i64|v4i32|v8i16)rz$")>; -def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^FRINT(A|I|M|N|P|X|Z)(v2f64|v4f32|v8f16)$")>; +def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^(F(MAX|MIN)(NM)?P?|FAC(GE|GT)|FCM(EQ|GE|GT))(v2f64|v4f32|v2i64p)$")>; +def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex 
"^FCM(EQ|LE|GE|GT|LT)(v2i64|v4i32)rz$")>; +def : InstRW<[FalkorWr_2VXVY_2cyc], (instrs FCVTLv4i16, FCVTLv2i32)>; +def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^FRINT(A|I|M|N|P|X|Z)(v2f64|v4f32)$")>; -def : InstRW<[FalkorWr_1VX_1VY_10cyc],(instregex "^(FDIV|FSQRT)(v2f32|v4f16)$")>; +def : InstRW<[FalkorWr_1VX_1VY_10cyc],(instregex "^(FDIV|FSQRT)v2f32$")>; -def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^(FABD|FADD(P)?|FSUB)(v2f64|v4f32|v8f16)$")>; +def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^(FABD|FADD(P)?|FSUB)(v2f64|v4f32)$")>; -def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^FCVT(N|M|P|Z|A)(S|U)(v2f64|v4f32|v8f16)$")>; -def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^(FCVTL|FCVTL2)(v2i32|v4i16|v4i32|v8i16)$")>; -def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^FCVTZ(S|U)(v2i64|v4i32|v8i16)(_shift)?$")>; +def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^FCVT(N|M|P|Z|A)(S|U)(v2f64|v4f32)$")>; +def : InstRW<[FalkorWr_2VXVY_4cyc], (instrs FCVTLv8i16, FCVTLv4i32)>; +def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^FCVTZ(S|U)(v2i64|v4i32)(_shift)?$")>; -def : InstRW<[FalkorWr_FMUL32_2VXVY_5cyc], (instregex "^(FMUL|FMULX)(v2f64|v4f32|v8f16|v8i16_indexed|v4i32_indexed)$")>; +def : InstRW<[FalkorWr_FMUL32_2VXVY_5cyc], + (instregex "^(FMUL|FMULX)(v2f64|v4f32|v4i32_indexed)$")>; -def : InstRW<[FalkorWr_FMUL64_2VXVY_6cyc], (instregex "^(FMUL|FMULX)v2i64_indexed$")>; +def : InstRW<[FalkorWr_FMUL64_2VXVY_6cyc], + (instregex "^(FMUL|FMULX)v2i64_indexed$")>; -def : InstRW<[FalkorWr_3VXVY_4cyc], (instregex "^(FCVTX?N|FCVTX?N2)(v1i32|v1i64|v1f16|v2f32|v4f16)$")>; +def : InstRW<[FalkorWr_3VXVY_4cyc], (instrs FCVTNv4i16, FCVTNv2i32, FCVTXNv2f32)>; +def : InstRW<[FalkorWr_3VXVY_5cyc], (instrs FCVTNv8i16, FCVTNv4i32, FCVTXNv4f32)>; -def : InstRW<[FalkorWr_3VXVY_5cyc], (instregex "^(FCVTX?N|FCVTX?N2)(v2i32|v4i16|v4i32|v8i16|v4f32)$")>; +def : InstRW<[FalkorWr_2VX_2VY_2cyc], (instregex "^(FDIV|FSQRT)(v2f64|v4f32)$")>; -def : InstRW<[FalkorWr_2VX_2VY_2cyc], (instregex "^(FDIV|FSQRT)(v2f64|v4f32|v8f16)$")>; +def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc, FalkorReadVMA], + (instregex "^ML(A|S)(v8i8|v4i16|v2i32)(_indexed)?$")>; +def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc, FalkorReadVMA], + (instregex "^ML(A|S)(v16i8|v8i16|v4i32|v2i64)(_indexed)?$")>; -def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc, FalkorReadVMA], (instregex "^ML(A|S)(v8i8|v4i16|v2i32)(_indexed)?$")>; -def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc, FalkorReadVMA], (instregex "^ML(A|S)(v16i8|v8i16|v4i32|v2i64)(_indexed)?$")>; - -def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc, FalkorReadFMA32], (instregex "^FML(A|S)(v2f32|v4f16|(v1i16_indexed|v4i16_indexed|v1i32_indexed|v2i32_indexed))$")>; -def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc, FalkorReadFMA64], (instregex "^FML(A|S)v1i64_indexed$")>; -def : InstRW<[FalkorWr_FMUL32_2VXVY_5cyc, FalkorReadFMA32], (instregex "^FML(A|S)(v4f32|v8f16|v8i16_indexed|v4i32_indexed)$")>; -def : InstRW<[FalkorWr_FMUL64_2VXVY_6cyc, FalkorReadFMA64], (instregex "^FML(A|S)(v2f64|v2i64_indexed)$")>; +def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc, FalkorReadFMA32], + (instregex "^FML(A|S)(v2f32|(v1i32_indexed|v2i32_indexed))$")>; +def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc, FalkorReadFMA64], + (instregex "^FML(A|S)v1i64_indexed$")>; +def : InstRW<[FalkorWr_FMUL32_2VXVY_5cyc, FalkorReadFMA32], + (instregex "^FML(A|S)(v4f32|v4i32_indexed)$")>; +def : InstRW<[FalkorWr_FMUL64_2VXVY_6cyc, FalkorReadFMA64], + (instregex "^FML(A|S)(v2f64|v2i64_indexed)$")>; // SIMD Integer Instructions // 
----------------------------------------------------------------------------- @@ -92,12 +607,14 @@ def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^SUB(v1i64|v2i32|v4i16|v8i8)$" def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^(S|U)(ADDLP|HADD|HSUB|SHL)(v2i32|v4i16|v8i8)(_v.*)?$")>; def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^(S|U)SHLv1i64$")>; def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^(S|U)SHR(v2i32|v4i16|v8i8)_shift$")>; +def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^(S|U)SHRd$")>; def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^((S|U)?(MAX|MIN)P?|ABS|ADDP|CM(EQ|GE|HS|GT|HI))(v1i64|v2i32|v4i16|v8i8)$")>; def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^CM(EQ|GE|HS|GT|HI)(v1i64|v2i32|v4i16|v8i8)$")>; def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^CM(EQ|LE|GE|GT|LT)(v1i64|v2i32|v4i16|v8i8)rz$")>; def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^CMTST(v1i64|v2i32|v4i16|v8i8)$")>; def : InstRW<[FalkorWr_1VXVY_2cyc], (instrs PMULv8i8)>; def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^SHL(v2i32|v4i16|v8i8)_shift$")>; +def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^SHLd$")>; def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^SQNEG(v2i32|v4i16|v8i8)$")>; def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(S|U)R?SRA(d|(v2i32|v4i16|v8i8)_shift)$")>; @@ -110,6 +627,8 @@ def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(SQR?SHRN|UQR?SHRN|SQR?SHRUN) def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(S|U)QSUB(v1i8|v1i16|v2i16|v1i32|v1i64|v2i32|v4i16|v8i8)$")>; def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(S|U)RHADD(v2i32|v4i16|v8i8)$")>; def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(S|U)RSHR(v2i32|v4i16|v8i8)_shift$")>; +def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(S|U)RSHRd$")>; +def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^R?SHRN(v2i32|v4i16|v8i8)_shift$")>; def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(SU|US)QADD(v1i8|v1i16|v2i16|v1i32|v1i64|v2i32|v4i16|v8i8)$")>; def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(S|U)?(MAX|MIN)V(v4i16v|v4i32v)$")>; def : InstRW<[FalkorWr_1VXVY_3cyc], (instrs ADDVv4i16v)>; @@ -120,10 +639,14 @@ def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^SQNEG(v1i8|v1i16|v1i32|v1i64) def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^(S|U)ADDLVv8i8v$")>; def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^(S|U)?(MAX|MIN)V(v8i8v|v8i16v)$")>; def : InstRW<[FalkorWr_1VXVY_4cyc], (instrs ADDVv8i8v)>; -def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc], (instregex "^MUL(v2i32|v4i16|v8i8)(_indexed)?$")>; -def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc], (instregex "^SQR?DMULH(v8i8|v4i16|v1i32|v2i32|v1i16)(_indexed)?$")>; -def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc], (instregex "^SQDMULL(i16|i32)$")>; -def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc, FalkorReadVMA], (instregex "^SQRDML(A|S)H(i16|i32|v8i8|v4i16|v1i32|v2i32|v1i16)(_indexed)?$")>; +def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc], + (instregex "^MUL(v2i32|v4i16|v8i8)(_indexed)?$")>; +def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc], + (instregex "^SQR?DMULH(v8i8|v4i16|v1i32|v2i32|v1i16)(_indexed)?$")>; +def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc], + (instregex "^SQDMULL(i16|i32)$")>; +def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc, FalkorReadVMA], + (instregex "^SQRDML(A|S)H(i16|i32|v8i8|v4i16|v1i32|v2i32|v1i16)(_indexed)?$")>; def : InstRW<[FalkorWr_1VXVY_5cyc], (instregex "^(S|U)?(MAX|MIN)Vv16i8v$")>; @@ -154,7 +677,7 @@ def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^ADDP(v4i32|v8i16|v16i8)$")>; def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^CM(EQ|GE|HS|GT|HI)(v16i8|v2i64|v4i32|v8i16)$")>; def : InstRW<[FalkorWr_2VXVY_2cyc], 
(instregex "^CM(EQ|LE|GE|GT|LT)(v16i8|v2i64|v4i32|v8i16)rz$")>; def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^(CMTST|PMUL)(v16i8|v2i64|v4i32|v8i16)$")>; -def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^PMULL2?(v8i8|v16i8)$")>; +def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^PMULL(v8i8|v16i8)$")>; def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^SHL(v16i8|v8i16|v4i32|v2i64)_shift$")>; def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^SHLL(v16i8|v8i16|v4i32|v8i8|v4i16|v2i32)(_shift)?$")>; @@ -165,14 +688,18 @@ def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^(S|U)(ADALP|QADD)(v16i8|v8i16 def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^(S|U)QSHLU?(v2i64|v4i32|v8i16|v16i8)_shift$")>; def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^(S|U)(QSHL|RSHL|QRSHL|QSUB|RHADD)(v16i8|v8i16|v4i32|v2i64)$")>; def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^(S|U)RSHR(v2i64|v4i32|v8i16|v16i8)_shift$")>; +def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^R?SHRN(v2i64|v4i32|v8i16|v16i8)_shift$")>; def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^(SU|US)QADD(v16i8|v8i16|v4i32|v2i64)$")>; -def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^PMULL2?(v1i64|v2i64)$")>; +def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^PMULL(v1i64|v2i64)$")>; def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^S(L|R)I(v16i8|v8i16|v4i32|v2i64)_shift$")>; def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^SQ(ABS|NEG)(v16i8|v8i16|v4i32|v2i64)$")>; -def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc], (instregex "^(MUL|SQR?DMULH)(v16i8|v8i16|v4i32)(_indexed)?$")>; -def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc], (instregex "^SQDMULLv.*$")>; -def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc, FalkorReadVMA], (instregex "^SQRDML(A|S)H(v16i8|v8i16|v4i32)(_indexed)?$")>; +def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc], + (instregex "^(MUL|SQR?DMULH)(v16i8|v8i16|v4i32)(_indexed)?$")>; +def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc], + (instregex "^SQDMULLv.*$")>; +def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc, FalkorReadVMA], + (instregex "^SQRDML(A|S)H(v16i8|v8i16|v4i32)(_indexed)?$")>; def : InstRW<[FalkorWr_3VXVY_3cyc], (instregex "^(S|U)ADDLVv4i32v$")>; @@ -186,99 +713,114 @@ def : InstRW<[FalkorWr_4VXVY_3cyc], (instregex "^(S|U)ABALv.*$")>; def : InstRW<[FalkorWr_4VXVY_4cyc], (instregex "^(S|U)ABA(v16i8|v8i16|v4i32)$")>; -def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc, FalkorReadVMA], (instregex "^SQD(MLAL|MLSL)(i16|i32|v1i32_indexed|v1i64_indexed)$")>; -def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc, FalkorReadVMA], (instregex "^SQD(MLAL|MLSL)v[248].*$")>; +def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc, FalkorReadVMA], + (instregex "^SQD(MLAL|MLSL)(i16|i32|v1i32_indexed|v1i64_indexed)$")>; +def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc, FalkorReadVMA], + (instregex "^SQD(MLAL|MLSL)v[248].*$")>; // SIMD Load Instructions // ----------------------------------------------------------------------------- -def : InstRW<[WriteVLD], (instregex "^LD1(i64|Onev(8b|4h|2s|1d|16b|8h|4s|2d))$")>; -def : InstRW<[WriteVLD], (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[WriteVLD], (instrs LD2i64)>; -def : InstRW<[WriteVLD, WriteAdr], (instregex "^LD1(i64|Onev(8b|4h|2s|1d|16b|8h|4s|2d))_POST$")>; -def : InstRW<[WriteVLD, WriteAdr], (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; -def : InstRW<[WriteVLD, WriteAdr], (instrs LD2i64_POST)>; - -def : InstRW<[FalkorWr_1LD_1VXVY_4cyc], (instregex "LD1i(8|16|32)$")>; -def : InstRW<[FalkorWr_1LD_1VXVY_4cyc, WriteAdr], (instregex "LD1i(8|16|32)_POST$")>; - -def : InstRW<[FalkorWr_1LD_1none_3cyc], (instregex "^LD1Twov(8b|4h|2s|1d)$")>; -def : 
InstRW<[FalkorWr_1LD_1none_3cyc], (instregex "^LD2Twov(8b|4h|2s|1d)$")>; -def : InstRW<[FalkorWr_1LD_1none_3cyc], (instregex "^LD2Rv(8b|4h|2s|1d)$")>; -def : InstRW<[FalkorWr_1LD_1none_3cyc, WriteAdr], (instregex "^LD1Twov(8b|4h|2s|1d)_POST$")>; -def : InstRW<[FalkorWr_1LD_1none_3cyc, WriteAdr], (instregex "^LD2Twov(8b|4h|2s|1d)_POST$")>; -def : InstRW<[FalkorWr_1LD_1none_3cyc, WriteAdr], (instregex "^LD2Rv(8b|4h|2s|1d)_POST$")>; - -def : InstRW<[FalkorWr_2LD_3cyc], (instregex "^LD1Twov(16b|8h|4s|2d)$")>; -def : InstRW<[FalkorWr_2LD_3cyc], (instregex "^LD2Twov(16b|8h|4s|2d)$")>; -def : InstRW<[FalkorWr_2LD_3cyc], (instregex "^LD2Rv(16b|8h|4s|2d)$")>; -def : InstRW<[FalkorWr_2LD_3cyc], (instrs LD3i64)>; -def : InstRW<[FalkorWr_2LD_3cyc], (instrs LD4i64)>; -def : InstRW<[FalkorWr_2LD_3cyc, WriteAdr], (instregex "^LD1Twov(16b|8h|4s|2d)_POST$")>; -def : InstRW<[FalkorWr_2LD_3cyc, WriteAdr], (instregex "^LD2Twov(16b|8h|4s|2d)_POST$")>; -def : InstRW<[FalkorWr_2LD_3cyc, WriteAdr], (instregex "^LD2Rv(16b|8h|4s|2d)_POST$")>; -def : InstRW<[FalkorWr_2LD_3cyc, WriteAdr], (instrs LD3i64_POST)>; -def : InstRW<[FalkorWr_2LD_3cyc, WriteAdr], (instrs LD4i64_POST)>; - -def : InstRW<[FalkorWr_1LD_2VXVY_4cyc], (instregex "^LD2i(8|16|32)$")>; -def : InstRW<[FalkorWr_1LD_2VXVY_4cyc, WriteAdr], (instregex "^LD2i(8|16|32)_POST$")>; - -def : InstRW<[FalkorWr_2LD_1none_3cyc], (instregex "^LD1Threev(8b|4h|2s|1d)$")>; -def : InstRW<[FalkorWr_2LD_1none_3cyc], (instregex "^LD3Rv(8b|4h|2s|1d)$")>; -def : InstRW<[FalkorWr_2LD_1none_3cyc, WriteAdr], (instregex "^LD1Threev(8b|4h|2s|1d)_POST$")>; -def : InstRW<[FalkorWr_2LD_1none_3cyc, WriteAdr], (instregex "^LD3Rv(8b|4h|2s|1d)_POST$")>; - -def : InstRW<[FalkorWr_3LD_3cyc], (instregex "^LD1Threev(16b|8h|4s|2d)$")>; -def : InstRW<[FalkorWr_3LD_3cyc], (instrs LD3Threev2d)>; -def : InstRW<[FalkorWr_3LD_3cyc], (instregex "^LD3Rv(16b|8h|4s|2d)$")>; -def : InstRW<[FalkorWr_3LD_3cyc, WriteAdr], (instregex "^LD1Threev(16b|8h|4s|2d)_POST$")>; -def : InstRW<[FalkorWr_3LD_3cyc, WriteAdr], (instrs LD3Threev2d_POST)>; -def : InstRW<[FalkorWr_3LD_3cyc, WriteAdr], (instregex "^LD3Rv(16b|8h|4s|2d)_POST$")>; - -def : InstRW<[FalkorWr_1LD_3VXVY_4cyc], (instregex "LD3i(8|16|32)$")>; -def : InstRW<[FalkorWr_1LD_3VXVY_4cyc, WriteAdr], (instregex "LD3i(8|16|32)_POST$")>; - -def : InstRW<[FalkorWr_2LD_2none_3cyc], (instregex "^LD1Fourv(8b|4h|2s|1d)$")>; -def : InstRW<[FalkorWr_2LD_2none_3cyc], (instregex "^LD4Rv(8b|4h|2s|1d)$")>; -def : InstRW<[FalkorWr_2LD_2none_3cyc, WriteAdr], (instregex "^LD1Fourv(8b|4h|2s|1d)_POST$")>; -def : InstRW<[FalkorWr_2LD_2none_3cyc, WriteAdr], (instregex "^LD4Rv(8b|4h|2s|1d)_POST$")>; - -def : InstRW<[FalkorWr_4LD_3cyc], (instregex "^LD1Fourv(16b|8h|4s|2d)$")>; -def : InstRW<[FalkorWr_4LD_3cyc], (instrs LD4Fourv2d)>; -def : InstRW<[FalkorWr_4LD_3cyc], (instregex "^LD4Rv(16b|8h|4s|2d)$")>; -def : InstRW<[FalkorWr_4LD_3cyc, WriteAdr], (instregex "^LD1Fourv(16b|8h|4s|2d)_POST$")>; -def : InstRW<[FalkorWr_4LD_3cyc, WriteAdr], (instrs LD4Fourv2d_POST)>; -def : InstRW<[FalkorWr_4LD_3cyc, WriteAdr], (instregex "^LD4Rv(16b|8h|4s|2d)_POST$")>; - -def : InstRW<[FalkorWr_1LD_4VXVY_4cyc], (instregex "^LD4i(8|16|32)$")>; -def : InstRW<[FalkorWr_1LD_4VXVY_4cyc, WriteAdr], (instregex "^LD4i(8|16|32)_POST$")>; - -def : InstRW<[FalkorWr_2LD_2VXVY_1none_4cyc], (instregex "LD3Threev(8b|4h|2s|1d)$")>; -def : InstRW<[FalkorWr_2LD_2VXVY_1none_4cyc, WriteAdr],(instregex "LD3Threev(8b|4h|2s|1d)_POST$")>; - -def : InstRW<[FalkorWr_2LD_2VXVY_2none_4cyc], (instregex 
"^LD4Fourv(8b|4h|2s|1d)$")>; -def : InstRW<[FalkorWr_2LD_2VXVY_2none_4cyc, WriteAdr],(instregex "^LD4Fourv(8b|4h|2s|1d)_POST$")>; - -def : InstRW<[FalkorWr_2LD_2VXVY_2LD_2VXVY_4cyc], (instregex "LD3Threev(16b|8h|4s)$")>; -def : InstRW<[FalkorWr_2LD_2VXVY_2LD_2VXVY_4cyc], (instregex "^LD4Fourv(16b|8h|4s)$")>; - -def : InstRW<[FalkorWr_2LD_2VXVY_1XYZ_2LD_2VXVY_4cyc, WriteAdr],(instregex "LD3Threev(16b|8h|4s)_POST$")>; -def : InstRW<[FalkorWr_2LD_2VXVY_2LD_1XYZ_2VXVY_4cyc, WriteAdr],(instregex "^LD4Fourv(16b|8h|4s)_POST$")>; +def : InstRW<[FalkorWr_1LD_3cyc], (instregex "^LD1(i64|Onev(8b|4h|2s|1d|16b|8h|4s|2d))$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_3cyc], (instregex "^LD1(i64|Onev(8b|4h|2s|1d|16b|8h|4s|2d))_POST$")>; +def : InstRW<[FalkorWr_1LD_3cyc], (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_3cyc], (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : InstRW<[FalkorWr_1LD_3cyc], (instrs LD2i64)>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_3cyc], (instrs LD2i64_POST)>; + +def : InstRW<[FalkorWr_1LD_1VXVY_4cyc], (instregex "^LD1i(8|16|32)$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_1VXVY_4cyc], (instregex "^LD1i(8|16|32)_POST$")>; + +def : InstRW<[FalkorWr_1LD_1none_3cyc], (instregex "^LD1Twov(8b|4h|2s|1d)$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_1none_3cyc], (instregex "^LD1Twov(8b|4h|2s|1d)_POST$")>; +def : InstRW<[FalkorWr_1LD_1none_3cyc], (instregex "^LD2Twov(8b|4h|2s|1d)$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_1none_3cyc], (instregex "^LD2Twov(8b|4h|2s|1d)_POST$")>; +def : InstRW<[FalkorWr_1LD_1none_3cyc], (instregex "^LD2Rv(8b|4h|2s|1d)$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_1none_3cyc], (instregex "^LD2Rv(8b|4h|2s|1d)_POST$")>; + +def : InstRW<[FalkorWr_2LD_3cyc], (instregex "^LD1Twov(16b|8h|4s|2d)$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_3cyc], (instregex "^LD1Twov(16b|8h|4s|2d)_POST$")>; +def : InstRW<[FalkorWr_2LD_3cyc], (instregex "^LD2Twov(16b|8h|4s|2d)$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_3cyc], (instregex "^LD2Twov(16b|8h|4s|2d)_POST$")>; +def : InstRW<[FalkorWr_2LD_3cyc], (instregex "^LD2Rv(16b|8h|4s|2d)$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_3cyc], (instregex "^LD2Rv(16b|8h|4s|2d)_POST$")>; +def : InstRW<[FalkorWr_2LD_3cyc], (instrs LD3i64)>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_3cyc], (instrs LD3i64_POST)>; +def : InstRW<[FalkorWr_2LD_3cyc], (instrs LD4i64)>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_3cyc], (instrs LD4i64_POST)>; + +def : InstRW<[FalkorWr_1LD_2VXVY_4cyc], (instregex "^LD2i(8|16|32)$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_2VXVY_4cyc], (instregex "^LD2i(8|16|32)_POST$")>; + +def : InstRW<[FalkorWr_2LD_1none_3cyc], (instregex "^LD1Threev(8b|4h|2s|1d)$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_1none_3cyc], (instregex "^LD1Threev(8b|4h|2s|1d)_POST$")>; +def : InstRW<[FalkorWr_2LD_1none_3cyc], (instregex "^LD3Rv(8b|4h|2s|1d)$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_1none_3cyc], (instregex "^LD3Rv(8b|4h|2s|1d)_POST$")>; + +def : InstRW<[FalkorWr_3LD_3cyc], (instregex "^LD1Threev(16b|8h|4s|2d)$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_3LD_3cyc], (instregex "^LD1Threev(16b|8h|4s|2d)_POST$")>; +def : InstRW<[FalkorWr_3LD_3cyc], (instrs LD3Threev2d)>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_3LD_3cyc], (instrs LD3Threev2d_POST)>; +def : InstRW<[FalkorWr_3LD_3cyc], (instregex "^LD3Rv(16b|8h|4s|2d)$")>; +def : InstRW<[FalkorWr_none_1cyc, 
FalkorWr_3LD_3cyc], (instregex "^LD3Rv(16b|8h|4s|2d)_POST$")>; + +def : InstRW<[FalkorWr_1LD_3VXVY_4cyc], (instregex "^LD3i(8|16|32)$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_3VXVY_4cyc], (instregex "^LD3i(8|16|32)_POST$")>; + +def : InstRW<[FalkorWr_2LD_2none_3cyc], (instregex "^LD1Fourv(8b|4h|2s|1d)$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_2none_3cyc], (instregex "^LD1Fourv(8b|4h|2s|1d)_POST$")>; +def : InstRW<[FalkorWr_2LD_2none_3cyc], (instregex "^LD4Rv(8b|4h|2s|1d)$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_2none_3cyc], (instregex "^LD4Rv(8b|4h|2s|1d)_POST$")>; + +def : InstRW<[FalkorWr_4LD_3cyc], (instregex "^LD1Fourv(16b|8h|4s|2d)$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_4LD_3cyc], (instregex "^LD1Fourv(16b|8h|4s|2d)_POST$")>; +def : InstRW<[FalkorWr_4LD_3cyc], (instrs LD4Fourv2d)>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_4LD_3cyc], (instrs LD4Fourv2d_POST)>; +def : InstRW<[FalkorWr_4LD_3cyc], (instregex "^LD4Rv(16b|8h|4s|2d)$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_4LD_3cyc], (instregex "^LD4Rv(16b|8h|4s|2d)_POST$")>; + +def : InstRW<[FalkorWr_1LD_4VXVY_4cyc], (instregex "^LD4i(8|16|32)$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_4VXVY_4cyc], (instregex "^LD4i(8|16|32)_POST$")>; + +def : InstRW<[FalkorWr_2LD_2VXVY_1none_4cyc], (instregex "^LD3Threev(8b|4h|2s|1d)$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_2VXVY_1none_4cyc], + (instregex "^LD3Threev(8b|4h|2s|1d)_POST$")>; + +def : InstRW<[FalkorWr_2LD_2VXVY_2none_4cyc], (instregex "^LD4Fourv(8b|4h|2s|1d)$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_2VXVY_2none_4cyc], + (instregex "^LD4Fourv(8b|4h|2s|1d)_POST$")>; + +def : InstRW<[FalkorWr_2LD_2VXVY_2LD_2VXVY_4cyc], (instregex "^LD3Threev(16b|8h|4s)$")>; + +def : InstRW<[FalkorWr_2LD_2VXVY_2LD_2VXVY_4cyc], (instregex "^LD4Fourv(16b|8h|4s)$")>; + +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_2VXVY_1XYZ_2LD_2VXVY_4cyc], + (instregex "^LD3Threev(16b|8h|4s)_POST$")>; + +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_2VXVY_2LD_1XYZ_2VXVY_4cyc], + (instregex "^LD4Fourv(16b|8h|4s)_POST$")>; // Arithmetic and Logical Instructions // ----------------------------------------------------------------------------- -def : InstRW<[FalkorWr_ADD], (instregex "^ADD(S)?(W|X)r(s|x)$")>; +def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^(CCMN|CCMP)(W|X)(r|i)$")>; +def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^ADC(S)?(W|X)r$")>; +def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^ADD(S)?(W|X)r(r|i)$")>; +def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^(CSEL|CSINC|CSINV|CSNEG)(W|X)r$")>; def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^AND(S)?(W|X)r(i|r|s)$")>; def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^BIC(S)?(W|X)r(r|s)$")>; def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^EON(W|X)r(r|s)$")>; def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^EOR(W|X)r(i|r|s)$")>; def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^ORN(W|X)r(r|s)$")>; def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^ORR(W|X)r(i|r|s)$")>; -def : InstRW<[FalkorWr_2XYZ_2cyc], (instregex "^SUB(S)?(W|X)r(s|x)$")>; +def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^SBC(S)?(W|X)r$")>; +def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^SUB(S)?(W|X)r(r|i)$")>; +def : InstRW<[FalkorWr_ADDSUBsx], (instregex "^ADD(S)?(W|X)r(s|x|x64)$")>; +def : InstRW<[FalkorWr_ADDSUBsx], (instregex "^SUB(S)?(W|X)r(s|x|x64)$")>; // SIMD Miscellaneous Instructions // ----------------------------------------------------------------------------- def : InstRW<[FalkorWr_1GTOV_1cyc], (instregex 
"^DUP(v8i8|v4i16|v2i32)(gpr|lane)$")>; def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^DUP(v16i8|v8i16)(gpr|lane)$")>; +def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^CPY(i8|i16|i32|i64)$")>; def : InstRW<[FalkorWr_1GTOV_1cyc], (instregex "^INSv(i8|i16)(gpr|lane)$")>; def : InstRW<[FalkorWr_1VTOG_1cyc], (instregex "^(S|U)MOVv.*$")>; def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^(BIF|BIT|BSL)v8i8$")>; @@ -287,35 +829,42 @@ def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "(MOVI|MVNI)(D|v8b_ns|v2i32|v4i def : InstRW<[FalkorWr_1VXVY_1cyc], (instrs TBLv8i8One)>; def : InstRW<[FalkorWr_1VXVY_1cyc], (instrs NOTv8i8)>; def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^REV(16|32|64)v.*$")>; -def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^(TRN1|TRN2|ZIP1|UZP1|UZP2|ZIP2|XTN|XTN2)(v2i32|v2i64|v4i16|v4i32|v8i8|v8i16|v16i8)$")>; +def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^(TRN1|TRN2|ZIP1|UZP1|UZP2|ZIP2|XTN)(v2i32|v2i64|v4i16|v4i32|v8i8|v8i16|v16i8)$")>; -def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^(CLS|CLZ|CNT|RBIT)(v4i32|v8i16|v16i8)$")>; +def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^(CLS|CLZ|CNT|RBIT)(v2i32|v4i16|v8i8)$")>; def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "(S|U)QXTU?Nv.*$")>; def : InstRW<[FalkorWr_1VXVY_3cyc], (instrs FRECPEv1i32, FRECPEv1i64, FRSQRTEv1i32, FRSQRTEv1i64, FRECPEv2f32, FRSQRTEv2f32)>; def : InstRW<[FalkorWr_1VXVY_3cyc], (instrs FRECPXv1i32, FRECPXv1i64)>; def : InstRW<[FalkorWr_1VXVY_3cyc], (instrs URECPEv2i32, URSQRTEv2i32)>; -def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc], (instrs FRECPS32, FRSQRTS32, FRECPSv2f32, FRSQRTSv2f32)>; +def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc], + (instrs FRECPS32, FRSQRTS32, FRECPSv2f32, FRSQRTSv2f32)>; -def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc], (instrs FRECPS64, FRSQRTS64)>; +def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc], + (instrs FRECPS64, FRSQRTS64)>; -def : InstRW<[FalkorWr_1GTOV_1VXVY_2cyc],(instregex "^INSv(i32|i64)(gpr|lane)$")>; +def : InstRW<[FalkorWr_1GTOV_1VXVY_2cyc], + (instregex "^INSv(i32|i64)(gpr|lane)$")>; def : InstRW<[FalkorWr_2GTOV_1cyc], (instregex "^DUP(v4i32|v2i64)(gpr|lane)$")>; +def : InstRW<[FalkorWr_2VXVY_1cyc], (instregex "^(BIF|BIT|BSL)v16i8$")>; def : InstRW<[FalkorWr_2VXVY_1cyc], (instrs EXTv16i8)>; def : InstRW<[FalkorWr_2VXVY_1cyc], (instregex "(MOVI|MVNI)(v2d_ns|v16b_ns|v4i32|v8i16|v4s_msl)$")>; def : InstRW<[FalkorWr_2VXVY_1cyc], (instrs NOTv16i8)>; def : InstRW<[FalkorWr_2VXVY_1cyc], (instrs TBLv16i8One)>; +def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^(CLS|CLZ|CNT|RBIT)(v4i32|v8i16|v16i8)$")>; def : InstRW<[FalkorWr_2VXVY_3cyc], (instrs FRECPEv2f64, FRECPEv4f32, FRSQRTEv2f64, FRSQRTEv4f32)>; def : InstRW<[FalkorWr_2VXVY_3cyc], (instrs URECPEv4i32, URSQRTEv4i32)>; def : InstRW<[FalkorWr_2VXVY_4cyc], (instrs TBLv8i8Two)>; def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^TBX(v8|v16)i8One$")>; -def : InstRW<[FalkorWr_FMUL32_2VXVY_5cyc], (instrs FRECPSv4f32, FRSQRTSv4f32)>; +def : InstRW<[FalkorWr_FMUL32_2VXVY_5cyc], + (instrs FRECPSv4f32, FRSQRTSv4f32)>; -def : InstRW<[FalkorWr_FMUL64_2VXVY_6cyc], (instrs FRECPSv2f64, FRSQRTSv2f64)>; +def : InstRW<[FalkorWr_FMUL64_2VXVY_6cyc], + (instrs FRECPSv2f64, FRSQRTSv2f64)>; def : InstRW<[FalkorWr_3VXVY_5cyc], (instregex "^TBL(v8i8Three|v16i8Two)$")>; def : InstRW<[FalkorWr_3VXVY_5cyc], (instregex "^TBX(v8i8Two|v16i8Two)$")>; @@ -328,50 +877,95 @@ def : InstRW<[FalkorWr_5VXVY_7cyc], (instregex "^TBX(v8i8Four|v16i8Four)$")>; // SIMD Store Instructions // ----------------------------------------------------------------------------- -def : 
InstRW<[WriteVST], (instregex "^STP(D|S)(i)$")>; -def : InstRW<[WriteVST, WriteAdr], (instregex "^STP(D|S)(post|pre)$")>; -def : InstRW<[FalkorWr_2XYZ_2ST_2VSD_0cyc], (instregex "^STRQro(W|X)$")>; - -def : InstRW<[WriteVST], (instregex "^ST1(One(v8b|v4h|v2s|v1d)(_POST)?|(i8|i16|i32|i64)(_POST)?|One(v16b|v8h|v4s|v2d)|Two(v8b|v4h|v2s|v1d))$")>; -def : InstRW<[WriteVST], (instregex "^ST2(Two(v8b|v4h|v2s|v1d)|(i8|i16|i32|i64))$")>; -def : InstRW<[WriteVST, WriteAdr], (instregex "^ST1(One(v16b|v8h|v4s|v2d)|Two(v8b|v4h|v2s|v1d))_POST$")>; -def : InstRW<[WriteVST, WriteAdr], (instregex "^ST2(Two(v8b|v4h|v2s|v1d)|(i8|i16|i32|i64))_POST$")>; - -def : InstRW<[WriteVST, WriteVST], (instregex "^ST1(Two(v16b|v8h|v4s|v2d)|(Three|Four)(v8b|v4h|v2s|v1d))$")>; -def : InstRW<[WriteVST, WriteVST], (instregex "^ST2Two(v16b|v8h|v4s|v2d)$")>; -def : InstRW<[WriteVST, WriteVST], (instregex "^ST3(i8|i16|i32|i64)$")>; -def : InstRW<[WriteVST, WriteVST], (instregex "^ST4(i8|i16|i32|i64)$")>; -def : InstRW<[WriteVST, WriteVST, WriteAdr], (instregex "^ST1(Two(v16b|v8h|v4s|v2d)|(Three|Four)(v8b|v4h|v2s|v1d))_POST$")>; -def : InstRW<[WriteVST, WriteVST, WriteAdr], (instregex "^ST2Two(v16b|v8h|v4s|v2d)_POST$")>; -def : InstRW<[WriteVST, WriteVST, WriteAdr], (instregex "^ST3(i8|i16|i32|i64)_POST$")>; -def : InstRW<[WriteVST, WriteVST, WriteAdr], (instregex "^ST4(i8|i16|i32|i64)_POST$")>; - -def : InstRW<[WriteV, WriteVST, WriteVST], (instregex "^ST3Three(v8b|v4h|v2s|v1d)$")>; -def : InstRW<[WriteV, WriteVST, WriteVST, WriteAdr], (instregex "^ST3Three(v8b|v4h|v2s|v1d)_POST$")>; - -def : InstRW<[WriteVST, WriteVST, WriteVST], (instregex "^ST1Three(v16b|v8h|v4s|v2d)$")>; -def : InstRW<[WriteVST, WriteVST, WriteVST], (instrs ST3Threev2d)>; -def : InstRW<[WriteVST, WriteVST, WriteVST, WriteAdr], (instregex "^ST1Three(v16b|v8h|v4s|v2d)_POST$")>; -def : InstRW<[WriteVST, WriteVST, WriteVST, WriteAdr], (instrs ST3Threev2d_POST)>; - -def : InstRW<[WriteV, WriteV, WriteVST, WriteVST], (instregex "^ST4Four(v8b|v4h|v2s|v1d)$")>; -def : InstRW<[WriteV, WriteV, WriteVST, WriteVST, WriteAdr], (instregex "^ST4Four(v8b|v4h|v2s|v1d)_POST$")>; - -def : InstRW<[WriteVST, WriteVST, WriteVST, WriteVST], (instregex "^ST1Four(v16b|v8h|v4s|v2d)$")>; -def : InstRW<[WriteVST, WriteVST, WriteVST, WriteVST], (instrs ST4Fourv2d)>; -def : InstRW<[WriteVST, WriteVST, WriteVST, WriteVST, WriteAdr], (instregex "^ST1Four(v16b|v8h|v4s|v2d)_POST$")>; -def : InstRW<[WriteVST, WriteVST, WriteVST, WriteVST, WriteAdr], (instrs ST4Fourv2d_POST)>; - -def : InstRW<[WriteV, WriteV, WriteVST, WriteVST, WriteVST, WriteVST], (instregex "^ST3Three(v16b|v8h|v4s)$")>; -def : InstRW<[WriteV, WriteV, WriteVST, WriteVST, WriteVST, WriteVST, WriteAdr],(instregex "^ST3Three(v16b|v8h|v4s)_POST$")>; - -def : InstRW<[WriteV, WriteV, WriteV, WriteV, WriteVST, WriteVST, WriteVST, WriteVST], (instregex "^ST4Four(v16b|v8h|v4s)$")>; -def : InstRW<[WriteV, WriteV, WriteV, WriteV, WriteVST, WriteVST, WriteVST, WriteVST, WriteAdr],(instregex "^ST4Four(v16b|v8h|v4s)_POST$")>; + +def : InstRW<[FalkorWr_1VSD_1ST_0cyc], (instregex "^STR(Q|D|S|H|B)ui$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1VSD_1ST_0cyc], + (instregex "^STR(Q|D|S|H|B)(post|pre)$")>; +def : InstRW<[FalkorWr_STRVro], (instregex "^STR(D|S|H|B)ro(W|X)$")>; +def : InstRW<[FalkorWr_2VSD_2ST_0cyc], (instregex "^STPQi$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2VSD_2ST_0cyc], + (instregex "^STPQ(post|pre)$")>; +def : InstRW<[FalkorWr_1VSD_1ST_0cyc], (instregex "^STP(D|S)(i)$")>; +def : InstRW<[FalkorWr_none_1cyc, 
FalkorWr_1VSD_1ST_0cyc], + (instregex "^STP(D|S)(post|pre)$")>; +def : InstRW<[FalkorWr_STRQro], (instregex "^STRQro(W|X)$")>; +def : InstRW<[FalkorWr_1VSD_1ST_0cyc], (instregex "^STUR(Q|D|S|B|H)i$")>; +def : InstRW<[FalkorWr_1VSD_1ST_0cyc], (instrs STNPDi, STNPSi)>; +def : InstRW<[FalkorWr_2VSD_2ST_0cyc], (instrs STNPQi)>; + +def : InstRW<[FalkorWr_1VSD_1ST_0cyc], (instregex "^ST1(One(v8b|v4h|v2s|v1d)|(i8|i16|i32|i64)|One(v16b|v8h|v4s|v2d)|Two(v8b|v4h|v2s|v1d))$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1VSD_1ST_0cyc], + (instregex "^ST1(One(v8b|v4h|v2s|v1d)_POST|(i8|i16|i32|i64)_POST)$")>; +def : InstRW<[FalkorWr_1VSD_1ST_0cyc], (instregex "^ST2(Two(v8b|v4h|v2s|v1d)|(i8|i16|i32|i64))$")>; +def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_1VSD_1ST_0cyc], + (instregex "^ST1(One(v16b|v8h|v4s|v2d)|Two(v8b|v4h|v2s|v1d))_POST$")>; +def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_1VSD_1ST_0cyc], + (instregex "^ST2(Two(v8b|v4h|v2s|v1d)|(i8|i16|i32|i64))_POST$")>; + +def : InstRW<[FalkorWr_2VSD_2ST_0cyc], (instregex "^ST1(Two(v16b|v8h|v4s|v2d)|(Three|Four)(v8b|v4h|v2s|v1d))$")>; +def : InstRW<[FalkorWr_2VSD_2ST_0cyc], (instregex "^ST2Two(v16b|v8h|v4s|v2d)$")>; +def : InstRW<[FalkorWr_2VSD_2ST_0cyc], (instregex "^ST3(i8|i16|i32|i64)$")>; +def : InstRW<[FalkorWr_2VSD_2ST_0cyc], (instregex "^ST4(i8|i16|i32|i64)$")>; +// FIXME: This is overly conservative in the imm POST case (no XYZ used in that case). +def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_2VSD_2ST_0cyc], + (instregex "^ST1(Two(v16b|v8h|v4s|v2d)|(Three|Four)(v8b|v4h|v2s|v1d))_POST$")>; +// FIXME: This is overly conservative in the imm POST case (no XYZ used in that case). +def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_2VSD_2ST_0cyc], + (instregex "^ST2Two(v16b|v8h|v4s|v2d)_POST$")>; +// FIXME: This is overly conservative in the imm POST case (no XYZ used in that case). +def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_2VSD_2ST_0cyc], + (instregex "^ST3(i8|i16|i32|i64)_POST$")>; +// FIXME: This is overly conservative in the imm POST case (no XYZ used in that case). +def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_2VSD_2ST_0cyc], + (instregex "^ST4(i8|i16|i32|i64)_POST$")>; + +def : InstRW<[FalkorWr_1VXVY_2ST_2VSD_0cyc], + (instregex "^ST3Three(v8b|v4h|v2s|v1d)$")>; +// FIXME: This is overly conservative in the imm POST case (no XYZ used in that case). +def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_1VXVY_2ST_2VSD_0cyc], + (instregex "^ST3Three(v8b|v4h|v2s|v1d)_POST$")>; + +def : InstRW<[FalkorWr_3VSD_3ST_0cyc], (instregex "^ST1Three(v16b|v8h|v4s|v2d)$")>; +def : InstRW<[FalkorWr_3VSD_3ST_0cyc], (instrs ST3Threev2d)>; +// FIXME: This is overly conservative in the imm POST case (no XYZ used in that case). +def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_3VSD_3ST_0cyc], + (instregex "^ST1Three(v16b|v8h|v4s|v2d)_POST$")>; +// FIXME: This is overly conservative in the imm POST case (no XYZ used in that case). +def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_3VSD_3ST_0cyc], + (instrs ST3Threev2d_POST)>; + +def : InstRW<[FalkorWr_2VXVY_2ST_2VSD_0cyc], + (instregex "^ST4Four(v8b|v4h|v2s|v1d)$")>; +// FIXME: This is overly conservative in the imm POST case (no XYZ used in that case). +def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_2VXVY_2ST_2VSD_0cyc], + (instregex "^ST4Four(v8b|v4h|v2s|v1d)_POST$")>; + +def : InstRW<[FalkorWr_4VSD_4ST_0cyc], (instregex "^ST1Four(v16b|v8h|v4s|v2d)$")>; +def : InstRW<[FalkorWr_4VSD_4ST_0cyc], (instrs ST4Fourv2d)>; +// FIXME: This is overly conservative in the imm POST case (no XYZ used in that case). 
+def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_4VSD_4ST_0cyc], + (instregex "^ST1Four(v16b|v8h|v4s|v2d)_POST$")>; +// FIXME: This is overly conservative in the imm POST case (no XYZ used in that case). +def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_4VSD_4ST_0cyc], + (instrs ST4Fourv2d_POST)>; + +def : InstRW<[FalkorWr_2VXVY_4ST_4VSD_0cyc], + (instregex "^ST3Three(v16b|v8h|v4s)$")>; +// FIXME: This is overly conservative in the imm POST case (no XYZ used in that case). +def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_2VXVY_4ST_4VSD_0cyc], + (instregex "^ST3Three(v16b|v8h|v4s)_POST$")>; + +def : InstRW<[FalkorWr_4VXVY_4ST_4VSD_0cyc], + (instregex "^ST4Four(v16b|v8h|v4s)$")>; +// FIXME: This is overly conservative in the imm POST case (no XYZ used in that case). +def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_4VXVY_4ST_4VSD_0cyc], + (instregex "^ST4Four(v16b|v8h|v4s)_POST$")>; // Branch Instructions // ----------------------------------------------------------------------------- -def : InstRW<[FalkorWr_1none_0cyc], (instrs B)>; +def : InstRW<[FalkorWr_1none_0cyc], (instrs B, TCRETURNdi)>; def : InstRW<[FalkorWr_1Z_0cyc], (instregex "^(BR|RET|(CBZ|CBNZ|TBZ|TBNZ)(W|X))$")>; +def : InstRW<[FalkorWr_1Z_0cyc], (instrs RET_ReallyLR, TCRETURNri)>; def : InstRW<[FalkorWr_1ZB_0cyc], (instrs Bcc)>; def : InstRW<[FalkorWr_1XYZB_0cyc], (instrs BL)>; def : InstRW<[FalkorWr_1Z_1XY_0cyc], (instrs BLR)>; @@ -388,89 +982,103 @@ def : InstRW<[FalkorWr_4VXVY_3cyc], (instrs SHA256SU1rrr)>; // FP Load Instructions // ----------------------------------------------------------------------------- -def : InstRW<[WriteLD], (instregex "^LDR((Q|D|S|H|B)ui|(Q|D|S)l)$")>; -def : InstRW<[WriteLD, WriteAdr], (instregex "^LDR(Q|D|S|H|B)(post|pre)$")>; -def : InstRW<[WriteLD], (instregex "^LDUR(Q|D|S|H|B)i$")>; -def : InstRW<[FalkorWr_LDR], (instregex "^LDR(Q|D|H|S|B)ro(W|X)$")>; -def : InstRW<[FalkorWr_2LD_3cyc, WriteLDHi],(instrs LDNPQi)>; -def : InstRW<[FalkorWr_2LD_3cyc, WriteLDHi],(instrs LDPQi)>; -def : InstRW<[FalkorWr_1LD_1none_3cyc, WriteLDHi],(instregex "LDNP(D|S)i$")>; -def : InstRW<[FalkorWr_1LD_1none_3cyc, WriteLDHi],(instregex "LDP(D|S)i$")>; -def : InstRW<[FalkorWr_1LD_1none_3cyc, WriteLDHi, WriteAdr],(instregex "LDP(D|S)(pre|post)$")>; -def : InstRW<[FalkorWr_2LD_3cyc, WriteLDHi, WriteAdr],(instregex "^LDPQ(pre|post)$")>; +def : InstRW<[FalkorWr_1LD_3cyc], (instregex "^LDR((Q|D|S|H|B)ui|(Q|D|S)l)$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_3cyc], + (instregex "^LDR(Q|D|S|H|B)(post|pre)$")>; +def : InstRW<[FalkorWr_1LD_3cyc], (instregex "^LDUR(Q|D|S|H|B)i$")>; +def : InstRW<[FalkorWr_LDRro], (instregex "^LDR(Q|D|H|S|B)ro(W|X)$")>; +def : InstRW<[FalkorWr_2LD_3cyc, FalkorWr_none_3cyc], + (instrs LDNPQi)>; +def : InstRW<[FalkorWr_2LD_3cyc, FalkorWr_none_3cyc], + (instrs LDPQi)>; +def : InstRW<[FalkorWr_1LD_1none_3cyc, FalkorWr_none_3cyc], + (instregex "LDNP(D|S)i$")>; +def : InstRW<[FalkorWr_1LD_1none_3cyc, FalkorWr_none_3cyc], + (instregex "LDP(D|S)i$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_1none_3cyc, FalkorWr_none_3cyc], + (instregex "LDP(D|S)(pre|post)$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_3cyc, FalkorWr_none_3cyc], + (instregex "^LDPQ(pre|post)$")>; // FP Data Processing Instructions // ----------------------------------------------------------------------------- -def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^FCCMP(E)?(H|S|D)rr$")>; -def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^FCMP(E)?(H|S|D)r(r|i)$")>; -def : InstRW<[FalkorWr_1VTOG_1cyc], (instregex 
"^FCVT(A|M|N|P)(S|U)U(W|X)(H|S|D)r$")>; -def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^(FABS|FNEG)(H|S|D)r$")>; -def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^FCSEL(H|S|D)rrr$")>; +def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^FCCMP(E)?(S|D)rr$")>; +def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^FCMP(E)?(S|D)r(r|i)$")>; +def : InstRW<[FalkorWr_1VTOG_1cyc], (instregex "^FCVT(A|M|N|P|Z)(S|U)U(W|X)(S|D)r$")>; +def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^(FABS|FNEG)(S|D)r$")>; +def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^FCSEL(S|D)rrr$")>; -def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^F(MAX|MIN)(NM)?(H|S|D)rr$")>; -def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^F(MAX|MIN)(NM)?Pv2i(16|32|64)p$")>; -def : InstRW<[FalkorWr_1VXVY_2cyc], (instrs FCVTHSr, FCVTHDr)>; -def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^FRINT(A|I|M|N|P|X|Z)(H|S|D)r$")>; +def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^F(MAX|MIN)(NM)?(S|D)rr$")>; +def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^F(MAX|MIN)(NM)?Pv2i(32|64)p$")>; +def : InstRW<[FalkorWr_1VXVY_2cyc], (instrs FCVTSHr, FCVTDHr)>; +def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^FRINT(A|I|M|N|P|X|Z)(S|D)r$")>; -def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^FABD(16|32|64)$")>; -def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(FADD|FSUB)(H|S|D)rr$")>; -def : InstRW<[FalkorWr_1VXVY_3cyc], (instrs FCVTSHr, FCVTDHr)>; +def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^FABD(32|64)$")>; +def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(FADD|FSUB)(S|D)rr$")>; +def : InstRW<[FalkorWr_1VXVY_3cyc], (instrs FCVTHSr, FCVTHDr)>; def : InstRW<[FalkorWr_1VXVY_4cyc], (instrs FCVTSDr, FCVTDSr)>; -def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc], (instregex "^F(N)?MUL(H|S)rr$")>; +def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc], + (instregex "^F(N)?MULSrr$")>; -def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc], (instregex "^F(N)?MULDrr$")>; +def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc], + (instregex "^F(N)?MULDrr$")>; -def : InstRW<[FalkorWr_1VX_1VY_10cyc],(instregex "^FDIV(H|S|D)rr$")>; -def : InstRW<[FalkorWr_1VX_1VY_2cyc], (instregex "^FSQRT(H|S|D)r$")>; +def : InstRW<[FalkorWr_1VX_1VY_10cyc],(instregex "^FDIV(S|D)rr$")>; +def : InstRW<[FalkorWr_1VX_1VY_2cyc], (instregex "^FSQRT(S|D)r$")>; -def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc, ReadDefault, ReadDefault, FalkorReadFMA32], (instregex "^F(N)?M(ADD|SUB)(H|S)rrr$")>; -def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc, ReadDefault, ReadDefault, FalkorReadFMA64], (instregex "^F(N)?M(ADD|SUB)Drrr$")>; +def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc, ReadDefault, ReadDefault, FalkorReadFMA32], + (instregex "^F(N)?M(ADD|SUB)Srrr$")>; +def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc, ReadDefault, ReadDefault, FalkorReadFMA64], + (instregex "^F(N)?M(ADD|SUB)Drrr$")>; // FP Miscellaneous Instructions // ----------------------------------------------------------------------------- -def : InstRW<[FalkorWr_FMOV], (instregex "^FMOV(H|S|D)i$")>; -def : InstRW<[FalkorWr_1GTOV_1cyc], (instregex "^FMOV(HW|HX|SW|DX|DXHigh)r$")>; -def : InstRW<[FalkorWr_1VTOG_1cyc], (instregex "^FCVTZ(S|U)(S|U)(W|X)(D|S)ri?$")>; -def : InstRW<[FalkorWr_1VTOG_1cyc], (instregex "^FMOV(WH|WS|XH|XD|XDHigh)r$")>; -def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^FMOV(Hr|Sr|Dr|v.*_ns)$")>; -// FIXME: We are currently generating movi v0.2d, #0 for these, which is worse than fmov 0.0 +def : InstRW<[FalkorWr_FMOV], (instregex "^FMOV(WS|XD|XDHigh)r$")>; +def : InstRW<[FalkorWr_1GTOV_1cyc], (instregex "^FMOV(S|D)i$")>; +def : InstRW<[FalkorWr_1VTOG_1cyc], (instregex 
"^FCVTZ(S|U)S(W|X)(D|S)ri$")>; +def : InstRW<[FalkorWr_1VTOG_1cyc], (instregex "^FCVTZ(S|U)(d|s)$")>; +def : InstRW<[FalkorWr_1VTOG_1cyc], (instregex "^FMOV(SW|DX|DXHigh)r$")>; +def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^FMOV(Sr|Dr|v.*_ns)$")>; +// FIXME: We are currently generating movi v0.2d, #0 for these, which is worse than fmov wzr/xzr def : InstRW<[FalkorWr_2VXVY_1cyc], (instrs FMOVD0, FMOVS0)>; def : InstRW<[FalkorWr_1GTOV_4cyc], (instregex "^(S|U)CVTF(S|U)(W|X)(D|S)ri$")>; -def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^(S|U)CVTF(v1i16|v1i32|v2i32|v1i64|v4i16|v2f32|v4f16|d|s)(_shift)?")>; +def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^(S|U)CVTF(v1i32|v2i32|v1i64|v2f32|d|s)(_shift)?")>; -def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^(S|U)CVTF(v2i64|v4i32|v8i16|v2f64|v4f32|v8f16)(_shift)?")>; +def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^(S|U)CVTF(v2i64|v4i32|v2f64|v4f32)(_shift)?")>; // Load Instructions // ----------------------------------------------------------------------------- def : InstRW<[FalkorWr_1ST_0cyc], (instrs PRFMui, PRFMl)>; def : InstRW<[FalkorWr_1ST_0cyc], (instrs PRFUMi)>; - -def : InstRW<[WriteLD, WriteLDHi], (instregex "^LDNP(W|X)i$")>; -def : InstRW<[WriteLD, WriteLDHi], (instregex "^LDP(W|X)i$")>; -def : InstRW<[FalkorWr_1LD_3cyc], (instregex "^LDR(B|H|W|X)ui$")>; -def : InstRW<[WriteLD, WriteAdr], (instregex "^LDR(B|H|W|X)(post|pre)$")>; +def : InstRW<[FalkorWr_1LD_3cyc, FalkorWr_none_3cyc], + (instregex "^LDNP(W|X)i$")>; +def : InstRW<[FalkorWr_1LD_3cyc, FalkorWr_none_3cyc], + (instregex "^LDP(W|X)i$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_3cyc, FalkorWr_none_3cyc], + (instregex "^LDP(W|X)(post|pre)$")>; +def : InstRW<[FalkorWr_1LD_3cyc], (instregex "^LDR(BB|HH|W|X)ui$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_3cyc], + (instregex "^LDR(BB|HH|W|X)(post|pre)$")>; +def : InstRW<[FalkorWr_LDRro], (instregex "^LDR(BB|HH|W|X)ro(W|X)$")>; def : InstRW<[FalkorWr_1LD_3cyc], (instregex "^LDR(W|X)l$")>; def : InstRW<[FalkorWr_1LD_3cyc], (instregex "^LDTR(B|H|W|X)i$")>; -def : InstRW<[FalkorWr_1LD_3cyc], (instregex "^LDUR(B|H|W|X)i$")>; - +def : InstRW<[FalkorWr_1LD_3cyc], (instregex "^LDUR(BB|HH|W|X)i$")>; +def : InstRW<[FalkorWr_PRFMro], (instregex "^PRFMro(W|X)$")>; +def : InstRW<[FalkorWr_1LD_4cyc, FalkorWr_none_4cyc], + (instrs LDPSWi)>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_4cyc, FalkorWr_none_4cyc], + (instregex "^LDPSW(post|pre)$")>; def : InstRW<[FalkorWr_1LD_4cyc], (instregex "^LDRS(BW|BX|HW|HX|W)ui$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_4cyc], + (instregex "^LDRS(BW|BX|HW|HX|W)(post|pre)$")>; +def : InstRW<[FalkorWr_LDRSro], (instregex "^LDRS(BW|BX|HW|HX|W)ro(W|X)$")>; def : InstRW<[FalkorWr_1LD_4cyc], (instrs LDRSWl)>; def : InstRW<[FalkorWr_1LD_4cyc], (instregex "^LDTRS(BW|BX|HW|HX|W)i$")>; def : InstRW<[FalkorWr_1LD_4cyc], (instregex "^LDURS(BW|BX|HW|HX|W)i$")>; -def : InstRW<[FalkorWr_PRFM], (instregex "^PRFMro(W|X)$")>; -def : InstRW<[FalkorWr_LDR], (instregex "^LDR(B|H|W|X)ro(W|X)$")>; - -def : InstRW<[FalkorWr_LDRS], (instregex "^LDRS(BW|BX|HW|HX|W)ro(W|X)$")>; - -def : InstRW<[FalkorWr_1LD_4cyc, WriteAdr],(instregex "^LDRS(BW|BX|HW|HX|W)(post|pre)$")>; -def : InstRW<[WriteLD, WriteLDHi, WriteAdr],(instregex "^LDP(W|X)(post|pre)$")>; -def : InstRW<[FalkorWr_1LD_4cyc, WriteLDHi],(instrs LDPSWi)>; -def : InstRW<[FalkorWr_1LD_4cyc, WriteLDHi, WriteAdr],(instregex "^LDPSW(post|pre)$")>; - // Miscellaneous Data-Processing Instructions // 
----------------------------------------------------------------------------- def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^(S|U)?BFM(W|X)ri$")>; @@ -480,17 +1088,22 @@ def : InstRW<[FalkorWr_2XYZ_2cyc], (instregex "^EXTR(W|X)rri$")>; // Divide and Multiply Instructions // ----------------------------------------------------------------------------- -def : InstRW<[FalkorWr_IMUL64_1X_4cyc, ReadDefault, ReadDefault, FalkorReadIMA64], (instregex "^(S|U)M(ADD|SUB)Lrrr$")>; -def : InstRW<[FalkorWr_IMUL32_1X_2cyc, ReadDefault, ReadDefault, FalkorReadIMA32], (instregex "^M(ADD|SUB)Wrrr$")>; +def : InstRW<[FalkorWr_IMUL64_1X_4cyc, ReadDefault, ReadDefault, FalkorReadIMA64], + (instregex "^(S|U)M(ADD|SUB)Lrrr$")>; +def : InstRW<[FalkorWr_IMUL32_1X_2cyc, ReadDefault, ReadDefault, FalkorReadIMA32], + (instregex "^M(ADD|SUB)Wrrr$")>; -def : InstRW<[FalkorWr_IMUL64_1X_5cyc], (instregex "^(S|U)MULHrr$")>; -def : InstRW<[FalkorWr_IMUL64_1X_5cyc, ReadDefault, ReadDefault, FalkorReadIMA64], (instregex "^M(ADD|SUB)Xrrr$")>; +def : InstRW<[FalkorWr_IMUL64_1X_5cyc], (instregex "^(S|U)MULHrr$")>; +def : InstRW<[FalkorWr_IMUL64_1X_5cyc, ReadDefault, ReadDefault, FalkorReadIMA64], + (instregex "^M(ADD|SUB)Xrrr$")>; -def : InstRW<[FalkorWr_1X_1Z_8cyc], (instregex "^(S|U)DIVWr$")>; -def : InstRW<[FalkorWr_1X_1Z_16cyc], (instregex "^(S|U)DIVXr$")>; +def : InstRW<[FalkorWr_1X_1Z_8cyc], (instregex "^(S|U)DIVWr$")>; +def : InstRW<[FalkorWr_1X_1Z_16cyc], (instregex "^(S|U)DIVXr$")>; -def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc], (instregex "^(S|U)MULLv.*$")>; -def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc, FalkorReadVMA], (instregex "^(S|U)(MLAL|MLSL)v.*$")>; +def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc], + (instregex "^(S|U)MULLv.*$")>; +def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc, FalkorReadVMA], + (instregex "^(S|U)(MLAL|MLSL)v.*$")>; // Move and Shift Instructions // ----------------------------------------------------------------------------- @@ -498,6 +1111,11 @@ def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^(LSLV|LSRV|ASRV|RORV|MOVK)(W| def : InstRW<[FalkorWr_1XYZB_1cyc], (instregex "^ADRP?$")>; def : InstRW<[FalkorWr_1XYZB_1cyc], (instregex "^MOVN(W|X)i$")>; def : InstRW<[FalkorWr_MOVZ], (instregex "^MOVZ(W|X)i$")>; +def : InstRW<[FalkorWr_1XYZ_1cyc], (instrs MOVi32imm, MOVi64imm)>; +def : InstRW<[WriteSequence<[FalkorWr_1XYZ_1cyc, FalkorWr_1XYZ_1cyc]>], + (instrs MOVaddr, MOVaddrBA, MOVaddrCP, MOVaddrEXT, MOVaddrJT, MOVaddrTLS)>; +def : InstRW<[WriteSequence<[FalkorWr_1LD_3cyc, FalkorWr_1XYZ_1cyc]>], + (instrs LOADgot)>; // Other Instructions // ----------------------------------------------------------------------------- @@ -507,13 +1125,12 @@ def : InstRW<[FalkorWr_1ST_0cyc], (instrs SYSxt, SYSLxt)>; def : InstRW<[FalkorWr_1Z_0cyc], (instrs MSRpstateImm1, MSRpstateImm4)>; def : InstRW<[FalkorWr_1LD_3cyc], (instregex "^(LDAR(B|H|W|X)|LDAXP(W|X)|LDAXR(B|H|W|X)|LDXP(W|X)|LDXR(B|H|W|X))$")>; -def : InstRW<[FalkorWr_1LD_3cyc], (instrs MRS)>; +def : InstRW<[FalkorWr_1LD_3cyc], (instrs MRS, MOVbaseTLS)>; def : InstRW<[FalkorWr_1LD_1Z_3cyc], (instrs DRPS)>; def : InstRW<[FalkorWr_1SD_1ST_0cyc], (instrs MSR)>; -def : InstRW<[WriteVST], (instrs STNPDi, STNPSi)>; -def : InstRW<[WriteSTP], (instrs STNPWi, STNPXi)>; +def : InstRW<[FalkorWr_1SD_1ST_0cyc], (instrs STNPWi, STNPXi)>; def : InstRW<[FalkorWr_2LD_1Z_3cyc], (instrs ERET)>; def : InstRW<[FalkorWr_1ST_1SD_1LD_3cyc], (instregex "^LDC.*$")>; @@ -523,20 +1140,16 @@ def : InstRW<[FalkorWr_1ST_1SD_1LD_0cyc], (instregex "^STXR(B|H|W|X)$")>; def : 
InstRW<[FalkorWr_2LD_1ST_1SD_3cyc], (instregex "^STLXP(W|X)$")>; def : InstRW<[FalkorWr_2LD_1ST_1SD_3cyc], (instregex "^STLXR(B|H|W|X)$")>; -def : InstRW<[WriteVST, WriteVST], (instrs STNPQi)>; // Store Instructions // ----------------------------------------------------------------------------- -def : InstRW<[WriteST], (instregex "^STP(W|X)i$")>; -def : InstRW<[WriteST, WriteAdr], (instregex "^STP(W|X)(post|pre)$")>; -def : InstRW<[WriteST], (instregex "^STR(Q|D|S|BB|HH)ui$")>; -def : InstRW<[WriteST], (instregex "^STUR(Q|D|S|BB|HH)i$")>; -def : InstRW<[WriteST], (instregex "^STR(B|H|W|X)ui$")>; -def : InstRW<[WriteST, WriteAdr], (instregex "^STR(B|H|W|X)(post|pre)$")>; -def : InstRW<[WriteST], (instregex "^STTR(B|H|W|X)i$")>; -def : InstRW<[WriteST], (instregex "^STUR(B|H|W|X)i$")>; - -def : InstRW<[WriteST, WriteAdr], (instregex "^STR(B|H|W|X)ro(W|X)$")>; - -def : InstRW<[WriteVST, WriteVST], (instregex "^STPQi$")>; -def : InstRW<[WriteVST, WriteVST, WriteAdr], (instregex "^STPQ(post|pre)$")>; +def : InstRW<[FalkorWr_1SD_1ST_0cyc], (instregex "^STP(W|X)i$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1SD_1ST_0cyc], + (instregex "^STP(W|X)(post|pre)$")>; +def : InstRW<[FalkorWr_1SD_1ST_0cyc], (instregex "^STR(BB|HH|W|X)ui$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1SD_1ST_0cyc], + (instregex "^STR(BB|HH|W|X)(post|pre)$")>; +def : InstRW<[FalkorWr_STRro], (instregex "^STR(BB|HH|W|X)ro(W|X)$")>; +def : InstRW<[FalkorWr_1SD_1ST_0cyc], (instregex "^STTR(B|H|W|X)i$")>; +def : InstRW<[FalkorWr_1SD_1ST_0cyc], (instregex "^STUR(BB|HH|W|X)i$")>; + diff --git a/lib/Target/AArch64/AArch64SchedFalkorWriteRes.td b/lib/Target/AArch64/AArch64SchedFalkorWriteRes.td deleted file mode 100644 index 6526cc28e806..000000000000 --- a/lib/Target/AArch64/AArch64SchedFalkorWriteRes.td +++ /dev/null @@ -1,403 +0,0 @@ -//=- AArch64SchedFalkorWrRes.td - Falkor Write Res ---*- tablegen -*-=// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// Contains all of the Falkor specific SchedWriteRes types. The approach -// below is to define a generic SchedWriteRes for every combination of -// latency and microOps. The naming conventions is to use a prefix, one field -// for latency, and one or more microOp count/type designators. -// Prefix: FalkorWr -// MicroOp Count/Types: #(B|X|Y|Z|LD|ST|SD|VX|VY|VSD) -// Latency: #cyc -// -// e.g. FalkorWr_1Z_6SD_4VX_6cyc means there are 11 micro-ops to be issued -// down one Z pipe, six SD pipes, four VX pipes and the total latency is -// six cycles. -// -// Contains all of the Falkor specific ReadAdvance types for forwarding logic. -// -// Contains all of the Falkor specific WriteVariant types for immediate zero -// and LSLFast. 
-//===----------------------------------------------------------------------===// - -//===----------------------------------------------------------------------===// -// Define 1 micro-op types - -def FalkorWr_1X_2cyc : SchedWriteRes<[FalkorUnitX]> { let Latency = 2; } -def FalkorWr_IMUL32_1X_2cyc : SchedWriteRes<[FalkorUnitX]> { let Latency = 4; } -def FalkorWr_IMUL64_1X_4cyc : SchedWriteRes<[FalkorUnitX]> { let Latency = 4; } -def FalkorWr_IMUL64_1X_5cyc : SchedWriteRes<[FalkorUnitX]> { let Latency = 5; } -def FalkorWr_1Z_0cyc : SchedWriteRes<[FalkorUnitZ]> { let Latency = 0; } -def FalkorWr_1ZB_0cyc : SchedWriteRes<[FalkorUnitZB]> { let Latency = 0; } -def FalkorWr_1LD_3cyc : SchedWriteRes<[FalkorUnitLD]> { let Latency = 3; } -def FalkorWr_1LD_4cyc : SchedWriteRes<[FalkorUnitLD]> { let Latency = 4; } -def FalkorWr_1XYZ_1cyc : SchedWriteRes<[FalkorUnitXYZ]> { let Latency = 1; } -def FalkorWr_1XYZ_2cyc : SchedWriteRes<[FalkorUnitXYZ]> { let Latency = 2; } -def FalkorWr_1XYZB_0cyc : SchedWriteRes<[FalkorUnitXYZB]>{ let Latency = 0; } -def FalkorWr_1XYZB_1cyc : SchedWriteRes<[FalkorUnitXYZB]>{ let Latency = 1; } -def FalkorWr_1none_0cyc : SchedWriteRes<[]> { let Latency = 0; } - -def FalkorWr_1VXVY_1cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 1; } -def FalkorWr_1VXVY_2cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 2; } -def FalkorWr_1VXVY_3cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 3; } -def FalkorWr_1VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 4; } -def FalkorWr_VMUL32_1VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 4; } -def FalkorWr_1VXVY_5cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 5; } -def FalkorWr_FMUL32_1VXVY_5cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 5; } -def FalkorWr_FMUL64_1VXVY_6cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 6; } - -def FalkorWr_1LD_0cyc : SchedWriteRes<[FalkorUnitLD]> { let Latency = 0; } -def FalkorWr_1ST_0cyc : SchedWriteRes<[FalkorUnitST]> { let Latency = 0; } -def FalkorWr_1ST_3cyc : SchedWriteRes<[FalkorUnitST]> { let Latency = 3; } - -def FalkorWr_1GTOV_1cyc : SchedWriteRes<[FalkorUnitGTOV]>{ let Latency = 1; } -def FalkorWr_1GTOV_4cyc : SchedWriteRes<[FalkorUnitGTOV]>{ let Latency = 4; } -def FalkorWr_1VTOG_1cyc : SchedWriteRes<[FalkorUnitVTOG]>{ let Latency = 1; } - -//===----------------------------------------------------------------------===// -// Define 2 micro-op types - -def FalkorWr_2VXVY_1cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { - let Latency = 1; - let NumMicroOps = 2; -} -def FalkorWr_2VXVY_2cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { - let Latency = 2; - let NumMicroOps = 2; -} -def FalkorWr_2VXVY_3cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { - let Latency = 3; - let NumMicroOps = 2; -} -def FalkorWr_2VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { - let Latency = 4; - let NumMicroOps = 2; -} -def FalkorWr_VMUL32_2VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { - let Latency = 4; - let NumMicroOps = 2; -} -def FalkorWr_2VXVY_5cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { - let Latency = 5; - let NumMicroOps = 2; -} -def FalkorWr_FMUL32_2VXVY_5cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { - let Latency = 5; - let NumMicroOps = 2; -} -def FalkorWr_2VXVY_6cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { - let Latency = 6; - let NumMicroOps = 2; -} -def FalkorWr_FMUL64_2VXVY_6cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { - let Latency = 6; - let NumMicroOps = 2; -} - 
-def FalkorWr_1LD_1VXVY_4cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitVXVY]> { - let Latency = 4; - let NumMicroOps = 2; -} -def FalkorWr_1XYZ_1LD_4cyc : SchedWriteRes<[FalkorUnitXYZ, FalkorUnitLD]> { - let Latency = 4; - let NumMicroOps = 2; -} -def FalkorWr_2LD_3cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitLD]> { - let Latency = 3; - let NumMicroOps = 2; -} - -def FalkorWr_1VX_1VY_5cyc : SchedWriteRes<[FalkorUnitVX, FalkorUnitVY]> { - let Latency = 5; - let NumMicroOps = 2; -} - -def FalkorWr_1VX_1VY_2cyc : SchedWriteRes<[FalkorUnitVX, FalkorUnitVY]> { - let Latency = 2; - let NumMicroOps = 2; -} - -def FalkorWr_1VX_1VY_4cyc : SchedWriteRes<[FalkorUnitVX, FalkorUnitVY]> { - let Latency = 4; - let NumMicroOps = 2; -} - -def FalkorWr_1VX_1VY_10cyc : SchedWriteRes<[FalkorUnitVX, FalkorUnitVY]> { - let Latency = 10; - let NumMicroOps = 2; -} - -def FalkorWr_1GTOV_1VXVY_2cyc : SchedWriteRes<[FalkorUnitGTOV, FalkorUnitVXVY]> { - let Latency = 2; - let NumMicroOps = 2; -} - -def FalkorWr_2GTOV_1cyc : SchedWriteRes<[FalkorUnitGTOV, FalkorUnitGTOV]> { - let Latency = 1; - let NumMicroOps = 2; -} - -def FalkorWr_1XYZ_1ST_4cyc: SchedWriteRes<[FalkorUnitXYZ, FalkorUnitST]> { - let Latency = 4; - let NumMicroOps = 2; -} -def FalkorWr_1XYZ_1LD_5cyc: SchedWriteRes<[FalkorUnitXYZ, FalkorUnitLD]> { - let Latency = 5; - let NumMicroOps = 2; -} - -def FalkorWr_2XYZ_2cyc : SchedWriteRes<[FalkorUnitXYZ, FalkorUnitXYZ]> { - let Latency = 2; - let NumMicroOps = 2; -} - -def FalkorWr_1Z_1XY_0cyc : SchedWriteRes<[FalkorUnitZ, FalkorUnitXY]> { - let Latency = 0; - let NumMicroOps = 2; -} - -def FalkorWr_1X_1Z_8cyc : SchedWriteRes<[FalkorUnitX, FalkorUnitZ]> { - let Latency = 8; - let ResourceCycles = [2, 8]; -} - -def FalkorWr_1X_1Z_16cyc : SchedWriteRes<[FalkorUnitX, FalkorUnitZ]> { - let Latency = 16; - let ResourceCycles = [2, 16]; -} - -def FalkorWr_1LD_1Z_3cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitZ]> { - let Latency = 3; - let NumMicroOps = 2; -} - -def FalkorWr_1LD_1none_3cyc : SchedWriteRes<[FalkorUnitLD]> { - let Latency = 3; - let NumMicroOps = 2; -} - -def FalkorWr_1SD_1ST_0cyc: SchedWriteRes<[FalkorUnitSD, FalkorUnitST]> { - let Latency = 0; - let NumMicroOps = 2; -} - -//===----------------------------------------------------------------------===// -// Define 3 micro-op types - -def FalkorWr_1ST_1SD_1LD_0cyc : SchedWriteRes<[FalkorUnitST, FalkorUnitSD, - FalkorUnitLD]> { - let Latency = 0; - let NumMicroOps = 3; -} - -def FalkorWr_1ST_1SD_1LD_3cyc : SchedWriteRes<[FalkorUnitST, FalkorUnitSD, - FalkorUnitLD]> { - let Latency = 3; - let NumMicroOps = 3; -} - -def FalkorWr_3VXVY_3cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { - let Latency = 3; - let NumMicroOps = 3; -} - -def FalkorWr_3VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { - let Latency = 4; - let NumMicroOps = 3; -} - -def FalkorWr_3VXVY_5cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { - let Latency = 5; - let NumMicroOps = 3; -} - -def FalkorWr_3VXVY_6cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { - let Latency = 6; - let NumMicroOps = 3; -} - -def FalkorWr_1LD_2VXVY_4cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitVXVY]> { - let Latency = 4; - let NumMicroOps = 3; -} - -def FalkorWr_2LD_1none_3cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitLD]> { - let Latency = 3; - let NumMicroOps = 3; -} - -def FalkorWr_3LD_3cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitLD, - FalkorUnitLD]> { - let Latency = 3; - let NumMicroOps = 3; -} - -def FalkorWr_2LD_1Z_3cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitLD, - 
FalkorUnitZ]> { - let Latency = 3; - let NumMicroOps = 3; -} - -//===----------------------------------------------------------------------===// -// Define 4 micro-op types - -def FalkorWr_2VX_2VY_2cyc : SchedWriteRes<[FalkorUnitVX, FalkorUnitVY, - FalkorUnitVX, FalkorUnitVY]> { - let Latency = 2; - let NumMicroOps = 4; -} - -def FalkorWr_4VXVY_2cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY, - FalkorUnitVXVY, FalkorUnitVXVY]> { - let Latency = 2; - let NumMicroOps = 4; -} -def FalkorWr_4VXVY_3cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY, - FalkorUnitVXVY, FalkorUnitVXVY]> { - let Latency = 3; - let NumMicroOps = 4; -} -def FalkorWr_4VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY, - FalkorUnitVXVY, FalkorUnitVXVY]> { - let Latency = 4; - let NumMicroOps = 4; -} -def FalkorWr_4VXVY_6cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY, - FalkorUnitVXVY, FalkorUnitVXVY]> { - let Latency = 6; - let NumMicroOps = 4; -} - -def FalkorWr_4LD_3cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitLD, - FalkorUnitLD, FalkorUnitLD]> { - let Latency = 3; - let NumMicroOps = 4; -} - -def FalkorWr_1LD_3VXVY_4cyc: SchedWriteRes<[FalkorUnitLD, FalkorUnitVXVY, - FalkorUnitVXVY, FalkorUnitVXVY]> { - let Latency = 4; - let NumMicroOps = 4; -} - -def FalkorWr_2LD_2none_3cyc: SchedWriteRes<[FalkorUnitLD, FalkorUnitLD]> { - let Latency = 3; - let NumMicroOps = 4; -} - -def FalkorWr_2LD_1ST_1SD_3cyc: SchedWriteRes<[FalkorUnitLD, FalkorUnitST, - FalkorUnitSD, FalkorUnitLD]> { - let Latency = 3; - let NumMicroOps = 4; -} - -//===----------------------------------------------------------------------===// -// Define 5 micro-op types - -def FalkorWr_1LD_4VXVY_4cyc: SchedWriteRes<[FalkorUnitLD, FalkorUnitVXVY, - FalkorUnitVXVY, FalkorUnitVXVY, - FalkorUnitVXVY]> { - let Latency = 4; - let NumMicroOps = 5; -} -def FalkorWr_2LD_2VXVY_1none_4cyc: SchedWriteRes<[FalkorUnitLD, FalkorUnitLD, - FalkorUnitVXVY, FalkorUnitVXVY]> { - let Latency = 4; - let NumMicroOps = 5; -} -def FalkorWr_5VXVY_7cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY, - FalkorUnitVXVY, FalkorUnitVXVY, - FalkorUnitVXVY]> { - let Latency = 7; - let NumMicroOps = 5; -} - -//===----------------------------------------------------------------------===// -// Define 6 micro-op types - -def FalkorWr_2LD_2VXVY_2none_4cyc: SchedWriteRes<[FalkorUnitLD, FalkorUnitLD, - FalkorUnitVXVY, FalkorUnitVXVY]> { - let Latency = 4; - let NumMicroOps = 6; -} - -def FalkorWr_2XYZ_2ST_2VSD_0cyc: SchedWriteRes<[FalkorUnitXYZ, FalkorUnitST, - FalkorUnitVSD, FalkorUnitXYZ, - FalkorUnitST, FalkorUnitVSD]> { - let Latency = 0; - let NumMicroOps = 6; -} - -//===----------------------------------------------------------------------===// -// Define 8 micro-op types - -def FalkorWr_2LD_2VXVY_2LD_2VXVY_4cyc:SchedWriteRes<[FalkorUnitLD, FalkorUnitLD, - FalkorUnitVXVY, FalkorUnitVXVY, - FalkorUnitLD, FalkorUnitLD, - FalkorUnitVXVY, FalkorUnitVXVY]> { - let Latency = 4; - let NumMicroOps = 8; -} - -//===----------------------------------------------------------------------===// -// Define 9 micro-op types - -def FalkorWr_2LD_2VXVY_2LD_1XYZ_2VXVY_4cyc:SchedWriteRes<[FalkorUnitLD, - FalkorUnitLD, FalkorUnitVXVY, - FalkorUnitVXVY, FalkorUnitLD, - FalkorUnitLD, FalkorUnitXYZ, - FalkorUnitVXVY, FalkorUnitVXVY]> { - let Latency = 4; - let NumMicroOps = 9; -} - -def FalkorWr_2LD_2VXVY_1XYZ_2LD_2VXVY_4cyc:SchedWriteRes<[FalkorUnitLD, - FalkorUnitLD, FalkorUnitVXVY, - FalkorUnitVXVY, FalkorUnitXYZ, - FalkorUnitLD, FalkorUnitLD, - FalkorUnitVXVY, FalkorUnitVXVY]> { - let Latency = 
4; - let NumMicroOps = 9; -} - -// Forwarding logic is modeled for multiply add/accumulate. -// ----------------------------------------------------------------------------- -def FalkorReadIMA32 : SchedReadAdvance<3, [FalkorWr_IMUL32_1X_2cyc]>; -def FalkorReadIMA64 : SchedReadAdvance<4, [FalkorWr_IMUL64_1X_4cyc, FalkorWr_IMUL64_1X_5cyc]>; -def FalkorReadVMA : SchedReadAdvance<3, [FalkorWr_VMUL32_1VXVY_4cyc, FalkorWr_VMUL32_2VXVY_4cyc]>; -def FalkorReadFMA32 : SchedReadAdvance<1, [FalkorWr_FMUL32_1VXVY_5cyc, FalkorWr_FMUL32_2VXVY_5cyc]>; -def FalkorReadFMA64 : SchedReadAdvance<2, [FalkorWr_FMUL64_1VXVY_6cyc, FalkorWr_FMUL64_2VXVY_6cyc]>; - -// SchedPredicates and WriteVariants for Immediate Zero and LSLFast -// ----------------------------------------------------------------------------- -def FalkorImmZPred : SchedPredicate<[{MI->getOperand(1).getImm() == 0}]>; -def FalkorLSLFastPred : SchedPredicate<[{TII->isFalkorLSLFast(*MI)}]>; - -def FalkorWr_FMOV : SchedWriteVariant<[ - SchedVar<FalkorImmZPred, [FalkorWr_1none_0cyc]>, - SchedVar<NoSchedPred, [FalkorWr_1GTOV_1cyc]>]>; - -def FalkorWr_MOVZ : SchedWriteVariant<[ - SchedVar<FalkorImmZPred, [FalkorWr_1none_0cyc]>, - SchedVar<NoSchedPred, [FalkorWr_1XYZB_1cyc]>]>; - -def FalkorWr_LDR : SchedWriteVariant<[ - SchedVar<FalkorLSLFastPred, [FalkorWr_1LD_3cyc]>, - SchedVar<NoSchedPred, [FalkorWr_1XYZ_1LD_4cyc]>]>; - -def FalkorWr_ADD : SchedWriteVariant<[ - SchedVar<FalkorLSLFastPred, [FalkorWr_1XYZ_1cyc]>, - SchedVar<NoSchedPred, [FalkorWr_2XYZ_2cyc]>]>; - -def FalkorWr_PRFM : SchedWriteVariant<[ - SchedVar<FalkorLSLFastPred, [FalkorWr_1ST_3cyc]>, - SchedVar<NoSchedPred, [FalkorWr_1XYZ_1ST_4cyc]>]>; - -def FalkorWr_LDRS : SchedWriteVariant<[ - SchedVar<FalkorLSLFastPred, [FalkorWr_1LD_4cyc]>, - SchedVar<NoSchedPred, [FalkorWr_1XYZ_1LD_5cyc]>]>; diff --git a/lib/Target/AArch64/AArch64Subtarget.cpp b/lib/Target/AArch64/AArch64Subtarget.cpp index b369ee7e4ba2..d3cab1ad3397 100644 --- a/lib/Target/AArch64/AArch64Subtarget.cpp +++ b/lib/Target/AArch64/AArch64Subtarget.cpp @@ -90,7 +90,6 @@ void AArch64Subtarget::initializeProperties() { break; case Falkor: MaxInterleaveFactor = 4; - VectorInsertExtractBaseCost = 2; // FIXME: remove this to enable 64-bit SLP if performance looks good. 
MinVectorRegisterBitWidth = 128; break; diff --git a/lib/Target/AArch64/AArch64TargetMachine.cpp b/lib/Target/AArch64/AArch64TargetMachine.cpp index 132f192f2a9a..cb3f72a524f5 100644 --- a/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -10,10 +10,10 @@ // //===----------------------------------------------------------------------===// +#include "AArch64TargetMachine.h" #include "AArch64.h" #include "AArch64MacroFusion.h" #include "AArch64Subtarget.h" -#include "AArch64TargetMachine.h" #include "AArch64TargetObjectFile.h" #include "AArch64TargetTransformInfo.h" #include "MCTargetDesc/AArch64MCTargetDesc.h" @@ -23,6 +23,7 @@ #include "llvm/CodeGen/GlobalISel/IRTranslator.h" #include "llvm/CodeGen/GlobalISel/InstructionSelect.h" #include "llvm/CodeGen/GlobalISel/Legalizer.h" +#include "llvm/CodeGen/GlobalISel/Localizer.h" #include "llvm/CodeGen/GlobalISel/RegBankSelect.h" #include "llvm/CodeGen/MachineScheduler.h" #include "llvm/CodeGen/Passes.h" @@ -277,7 +278,7 @@ public: ScheduleDAGInstrs * createPostMachineScheduler(MachineSchedContext *C) const override { const AArch64Subtarget &ST = C->MF->getSubtarget<AArch64Subtarget>(); - if (ST.hasFuseLiterals()) { + if (ST.hasFuseAES() || ST.hasFuseLiterals()) { // Run the Macro Fusion after RA again since literals are expanded from // pseudos then (v. addPreSched2()). ScheduleDAGMI *DAG = createGenericSchedPostRA(C); @@ -295,6 +296,7 @@ public: bool addIRTranslator() override; bool addLegalizeMachineIR() override; bool addRegBankSelect() override; + void addPreGlobalInstructionSelect() override; bool addGlobalInstructionSelect() override; #endif bool addILPOpts() override; @@ -404,6 +406,12 @@ bool AArch64PassConfig::addRegBankSelect() { return false; } +void AArch64PassConfig::addPreGlobalInstructionSelect() { + // Workaround the deficiency of the fast register allocator. 
+ if (TM->getOptLevel() == CodeGenOpt::None) + addPass(new Localizer()); +} + bool AArch64PassConfig::addGlobalInstructionSelect() { addPass(new InstructionSelect()); return false; diff --git a/lib/Target/AMDGPU/AMDGPU.td b/lib/Target/AMDGPU/AMDGPU.td index b279bd61e180..e7ebb37a9d62 100644 --- a/lib/Target/AMDGPU/AMDGPU.td +++ b/lib/Target/AMDGPU/AMDGPU.td @@ -425,7 +425,7 @@ def FeatureGFX9 : SubtargetFeatureGeneration<"GFX9", FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts, FeatureSMemRealTime, FeatureScalarStores, FeatureInv2PiInlineImm, FeatureApertureRegs, FeatureGFX9Insts, FeatureVOP3P, FeatureVGPRIndexMode, - FeatureFastFMAF32, FeatureDPP, + FeatureFastFMAF32, FeatureSDWA, FeatureDPP, FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts ] >; @@ -534,10 +534,12 @@ def AMDGPUAsmVariants { int VOP3_ID = 1; string SDWA = "SDWA"; int SDWA_ID = 2; + string SDWA9 = "SDWA9"; + int SDWA9_ID = 3; string DPP = "DPP"; - int DPP_ID = 3; + int DPP_ID = 4; string Disable = "Disable"; - int Disable_ID = 4; + int Disable_ID = 5; } def DefaultAMDGPUAsmParserVariant : AsmParserVariant { @@ -555,6 +557,12 @@ def SDWAAsmParserVariant : AsmParserVariant { let Name = AMDGPUAsmVariants.SDWA; } +def SDWA9AsmParserVariant : AsmParserVariant { + let Variant = AMDGPUAsmVariants.SDWA9_ID; + let Name = AMDGPUAsmVariants.SDWA9; +} + + def DPPAsmParserVariant : AsmParserVariant { let Variant = AMDGPUAsmVariants.DPP_ID; let Name = AMDGPUAsmVariants.DPP; @@ -567,6 +575,7 @@ def AMDGPU : Target { let AssemblyParserVariants = [DefaultAMDGPUAsmParserVariant, VOP3AsmParserVariant, SDWAAsmParserVariant, + SDWA9AsmParserVariant, DPPAsmParserVariant]; let AssemblyWriters = [AMDGPUAsmWriter]; } @@ -607,7 +616,10 @@ def HasVOP3PInsts : Predicate<"Subtarget->hasVOP3PInsts()">, AssemblerPredicate<"FeatureVOP3P">; def HasSDWA : Predicate<"Subtarget->hasSDWA()">, - AssemblerPredicate<"FeatureSDWA">; + AssemblerPredicate<"FeatureSDWA,FeatureVolcanicIslands">; + +def HasSDWA9 : Predicate<"Subtarget->hasSDWA()">, + AssemblerPredicate<"FeatureSDWA,FeatureGFX9">; def HasDPP : Predicate<"Subtarget->hasDPP()">, AssemblerPredicate<"FeatureDPP">; diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 5ec46a8294c0..723e8a7b54e2 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -127,6 +127,29 @@ EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) { return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32); } +bool AMDGPUTargetLowering::isOrEquivalentToAdd(SelectionDAG &DAG, SDValue Op) +{ + assert(Op.getOpcode() == ISD::OR); + + SDValue N0 = Op->getOperand(0); + SDValue N1 = Op->getOperand(1); + EVT VT = N0.getValueType(); + + if (VT.isInteger() && !VT.isVector()) { + KnownBits LHSKnown, RHSKnown; + DAG.computeKnownBits(N0, LHSKnown); + + if (LHSKnown.Zero.getBoolValue()) { + DAG.computeKnownBits(N1, RHSKnown); + + if (!(~RHSKnown.Zero & ~LHSKnown.Zero)) + return true; + } + } + + return false; +} + AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, const AMDGPUSubtarget &STI) : TargetLowering(TM), Subtarget(&STI) { @@ -2596,8 +2619,6 @@ SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl( SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N, DAGCombinerInfo &DCI) const { EVT VT = N->getValueType(0); - if (VT != MVT::i64) - return SDValue(); ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1)); if (!RHS) @@ -2618,6 +2639,8 @@ SDValue 
AMDGPUTargetLowering::performShlCombine(SDNode *N, case ISD::SIGN_EXTEND: case ISD::ANY_EXTEND: { // shl (ext x) => zext (shl x), if shift does not overflow int + if (VT != MVT::i64) + break; KnownBits Known; SDValue X = LHS->getOperand(0); DAG.computeKnownBits(X, Known); @@ -2628,8 +2651,23 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N, SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(RHS, 0)); return DAG.getZExtOrTrunc(Shl, SL, VT); } + case ISD::OR: if (!isOrEquivalentToAdd(DAG, LHS)) break; + case ISD::ADD: { // Fall through from above + // shl (or|add x, c2), c1 => or|add (shl x, c1), (c2 << c1) + if (ConstantSDNode *C2 = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) { + SDValue Shl = DAG.getNode(ISD::SHL, SL, VT, LHS->getOperand(0), + SDValue(RHS, 0)); + SDValue C2V = DAG.getConstant(C2->getAPIntValue() << RHSVal, + SDLoc(C2), VT); + return DAG.getNode(LHS->getOpcode(), SL, VT, Shl, C2V); + } + break; + } } + if (VT != MVT::i64) + return SDValue(); + // i64 (shl x, C) -> (build_pair 0, (shl x, C -32)) // On some subtargets, 64-bit shift is a quarter rate instruction. In the @@ -3440,7 +3478,8 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, DL); } - if ((OffsetVal + WidthVal) >= 32) { + if ((OffsetVal + WidthVal) >= 32 && + !(Subtarget->hasSDWA() && OffsetVal == 16 && WidthVal == 16)) { SDValue ShiftVal = DAG.getConstant(OffsetVal, DL, MVT::i32); return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32, BitsFrom, ShiftVal); diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h index fb2f15022d25..0d066cdbdff4 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -34,6 +34,9 @@ private: /// compare. SDValue getFFBH_U32(SelectionDAG &DAG, SDValue Op, const SDLoc &DL) const; +public: + static bool isOrEquivalentToAdd(SelectionDAG &DAG, SDValue Op); + protected: const AMDGPUSubtarget *Subtarget; AMDGPUAS AMDGPUASI; diff --git a/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 9de302994e68..57905be18813 100644 --- a/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -36,6 +36,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo() { setAction({G_CONSTANT, S32}, Legal); setAction({G_CONSTANT, S64}, Legal); + setAction({G_FCONSTANT, S32}, Legal); + setAction({G_GEP, P1}, Legal); setAction({G_GEP, P2}, Legal); setAction({G_GEP, 1, S64}, Legal); diff --git a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index 85184b363905..07f92918a43f 100644 --- a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -97,6 +97,9 @@ private: Instruction *UseInst, int OpIdx0, int OpIdx1) const; + /// Check whether we have enough local memory for promotion. 
+ bool hasSufficientLocalMem(const Function &F); + public: static char ID; @@ -107,7 +110,7 @@ public: StringRef getPassName() const override { return "AMDGPU Promote Alloca"; } - void handleAlloca(AllocaInst &I); + bool handleAlloca(AllocaInst &I, bool SufficientLDS); void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); @@ -147,105 +150,21 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) { const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F); if (!ST.isPromoteAllocaEnabled()) return false; - AS = AMDGPU::getAMDGPUAS(*F.getParent()); - - FunctionType *FTy = F.getFunctionType(); - - // If the function has any arguments in the local address space, then it's - // possible these arguments require the entire local memory space, so - // we cannot use local memory in the pass. - for (Type *ParamTy : FTy->params()) { - PointerType *PtrTy = dyn_cast<PointerType>(ParamTy); - if (PtrTy && PtrTy->getAddressSpace() == AS.LOCAL_ADDRESS) { - LocalMemLimit = 0; - DEBUG(dbgs() << "Function has local memory argument. Promoting to " - "local memory disabled.\n"); - return false; - } - } - - LocalMemLimit = ST.getLocalMemorySize(); - if (LocalMemLimit == 0) - return false; - - const DataLayout &DL = Mod->getDataLayout(); - - // Check how much local memory is being used by global objects - CurrentLocalMemUsage = 0; - for (GlobalVariable &GV : Mod->globals()) { - if (GV.getType()->getAddressSpace() != AS.LOCAL_ADDRESS) - continue; - - for (const User *U : GV.users()) { - const Instruction *Use = dyn_cast<Instruction>(U); - if (!Use) - continue; - - if (Use->getParent()->getParent() == &F) { - unsigned Align = GV.getAlignment(); - if (Align == 0) - Align = DL.getABITypeAlignment(GV.getValueType()); - // FIXME: Try to account for padding here. The padding is currently - // determined from the inverse order of uses in the function. I'm not - // sure if the use list order is in any way connected to this, so the - // total reported size is likely incorrect. - uint64_t AllocSize = DL.getTypeAllocSize(GV.getValueType()); - CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Align); - CurrentLocalMemUsage += AllocSize; - break; - } - } - } - - unsigned MaxOccupancy = ST.getOccupancyWithLocalMemSize(CurrentLocalMemUsage, - F); - - // Restrict local memory usage so that we don't drastically reduce occupancy, - // unless it is already significantly reduced. - - // TODO: Have some sort of hint or other heuristics to guess occupancy based - // on other factors.. - unsigned OccupancyHint = ST.getWavesPerEU(F).second; - if (OccupancyHint == 0) - OccupancyHint = 7; - - // Clamp to max value. - OccupancyHint = std::min(OccupancyHint, ST.getMaxWavesPerEU()); - - // Check the hint but ignore it if it's obviously wrong from the existing LDS - // usage. - MaxOccupancy = std::min(OccupancyHint, MaxOccupancy); - - - // Round up to the next tier of usage. - unsigned MaxSizeWithWaveCount - = ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy, F); - - // Program is possibly broken by using more local mem than available. 
- if (CurrentLocalMemUsage > MaxSizeWithWaveCount) - return false; - - LocalMemLimit = MaxSizeWithWaveCount; - - DEBUG( - dbgs() << F.getName() << " uses " << CurrentLocalMemUsage << " bytes of LDS\n" - << " Rounding size to " << MaxSizeWithWaveCount - << " with a maximum occupancy of " << MaxOccupancy << '\n' - << " and " << (LocalMemLimit - CurrentLocalMemUsage) - << " available for promotion\n" - ); + AS = AMDGPU::getAMDGPUAS(*F.getParent()); + bool SufficientLDS = hasSufficientLocalMem(F); + bool Changed = false; BasicBlock &EntryBB = *F.begin(); for (auto I = EntryBB.begin(), E = EntryBB.end(); I != E; ) { AllocaInst *AI = dyn_cast<AllocaInst>(I); ++I; if (AI) - handleAlloca(*AI); + Changed |= handleAlloca(*AI, SufficientLDS); } - return true; + return Changed; } std::pair<Value *, Value *> @@ -661,12 +580,105 @@ bool AMDGPUPromoteAlloca::collectUsesWithPtrTypes( return true; } +bool AMDGPUPromoteAlloca::hasSufficientLocalMem(const Function &F) { + + FunctionType *FTy = F.getFunctionType(); + const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F); + + // If the function has any arguments in the local address space, then it's + // possible these arguments require the entire local memory space, so + // we cannot use local memory in the pass. + for (Type *ParamTy : FTy->params()) { + PointerType *PtrTy = dyn_cast<PointerType>(ParamTy); + if (PtrTy && PtrTy->getAddressSpace() == AS.LOCAL_ADDRESS) { + LocalMemLimit = 0; + DEBUG(dbgs() << "Function has local memory argument. Promoting to " + "local memory disabled.\n"); + return false; + } + } + + LocalMemLimit = ST.getLocalMemorySize(); + if (LocalMemLimit == 0) + return false; + + const DataLayout &DL = Mod->getDataLayout(); + + // Check how much local memory is being used by global objects + CurrentLocalMemUsage = 0; + for (GlobalVariable &GV : Mod->globals()) { + if (GV.getType()->getAddressSpace() != AS.LOCAL_ADDRESS) + continue; + + for (const User *U : GV.users()) { + const Instruction *Use = dyn_cast<Instruction>(U); + if (!Use) + continue; + + if (Use->getParent()->getParent() == &F) { + unsigned Align = GV.getAlignment(); + if (Align == 0) + Align = DL.getABITypeAlignment(GV.getValueType()); + + // FIXME: Try to account for padding here. The padding is currently + // determined from the inverse order of uses in the function. I'm not + // sure if the use list order is in any way connected to this, so the + // total reported size is likely incorrect. + uint64_t AllocSize = DL.getTypeAllocSize(GV.getValueType()); + CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Align); + CurrentLocalMemUsage += AllocSize; + break; + } + } + } + + unsigned MaxOccupancy = ST.getOccupancyWithLocalMemSize(CurrentLocalMemUsage, + F); + + // Restrict local memory usage so that we don't drastically reduce occupancy, + // unless it is already significantly reduced. + + // TODO: Have some sort of hint or other heuristics to guess occupancy based + // on other factors.. + unsigned OccupancyHint = ST.getWavesPerEU(F).second; + if (OccupancyHint == 0) + OccupancyHint = 7; + + // Clamp to max value. + OccupancyHint = std::min(OccupancyHint, ST.getMaxWavesPerEU()); + + // Check the hint but ignore it if it's obviously wrong from the existing LDS + // usage. + MaxOccupancy = std::min(OccupancyHint, MaxOccupancy); + + + // Round up to the next tier of usage. + unsigned MaxSizeWithWaveCount + = ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy, F); + + // Program is possibly broken by using more local mem than available. 
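The check that immediately follows ties the budget computation together: the pass takes the occupancy implied by the LDS already claimed by the module's globals, clamps it by the waves-per-EU hint, converts that occupancy back into an LDS byte budget, and gives up if the globals alone already exceed it. A minimal sketch of that flow; getOccupancyForLDS and getLDSBudgetForOccupancy are hypothetical stand-ins for the AMDGPUSubtarget queries used above:

#include <algorithm>
#include <cstdint>

// Hypothetical stand-ins for getOccupancyWithLocalMemSize and
// getMaxLocalMemSizeWithWaveCount; real values are subtarget dependent.
unsigned getOccupancyForLDS(uint64_t Bytes);
uint64_t getLDSBudgetForOccupancy(unsigned Waves);

// Returns 0 when promotion should be disabled, otherwise the LDS byte budget.
uint64_t computeLDSBudget(uint64_t UsedByGlobals, unsigned OccupancyHint,
                          unsigned MaxWavesPerEU) {
  unsigned MaxOccupancy = getOccupancyForLDS(UsedByGlobals);
  if (OccupancyHint == 0)
    OccupancyHint = 7;                              // same default as the pass
  OccupancyHint = std::min(OccupancyHint, MaxWavesPerEU);
  MaxOccupancy = std::min(OccupancyHint, MaxOccupancy);

  uint64_t Budget = getLDSBudgetForOccupancy(MaxOccupancy);
  return UsedByGlobals > Budget ? 0 : Budget;       // globals already overflow
}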
+ if (CurrentLocalMemUsage > MaxSizeWithWaveCount) + return false; + + LocalMemLimit = MaxSizeWithWaveCount; + + DEBUG( + dbgs() << F.getName() << " uses " << CurrentLocalMemUsage << " bytes of LDS\n" + << " Rounding size to " << MaxSizeWithWaveCount + << " with a maximum occupancy of " << MaxOccupancy << '\n' + << " and " << (LocalMemLimit - CurrentLocalMemUsage) + << " available for promotion\n" + ); + + return true; +} + // FIXME: Should try to pick the most likely to be profitable allocas first. -void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) { +bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) { // Array allocations are probably not worth handling, since an allocation of // the array type is the canonical form. if (!I.isStaticAlloca() || I.isArrayAllocation()) - return; + return false; IRBuilder<> Builder(&I); @@ -675,10 +687,8 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) { DEBUG(dbgs() << "Trying to promote " << I << '\n'); - if (tryPromoteAllocaToVector(&I, AS)) { - DEBUG(dbgs() << " alloca is not a candidate for vectorization.\n"); - return; - } + if (tryPromoteAllocaToVector(&I, AS)) + return true; // Promoted to vector. const Function &ContainingFunction = *I.getParent()->getParent(); CallingConv::ID CC = ContainingFunction.getCallingConv(); @@ -692,9 +702,13 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) { break; default: DEBUG(dbgs() << " promote alloca to LDS not supported with calling convention.\n"); - return; + return false; } + // Not likely to have sufficient local memory for promotion. + if (!SufficientLDS) + return false; + const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(ContainingFunction); unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(ContainingFunction).second; @@ -718,7 +732,7 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) { if (NewSize > LocalMemLimit) { DEBUG(dbgs() << " " << AllocSize << " bytes of local memory not available to promote\n"); - return; + return false; } CurrentLocalMemUsage = NewSize; @@ -727,7 +741,7 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) { if (!collectUsesWithPtrTypes(&I, &I, WorkList)) { DEBUG(dbgs() << " Do not know how to convert all uses\n"); - return; + return false; } DEBUG(dbgs() << "Promoting alloca to local memory\n"); @@ -873,6 +887,7 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) { llvm_unreachable("Don't know how to promote alloca intrinsic use."); } } + return true; } FunctionPass *llvm::createAMDGPUPromoteAlloca() { diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h index e543cae07ada..660879426810 100644 --- a/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -416,6 +416,10 @@ public: return getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS; } + bool hasSDWA() const { + return HasSDWA; + } + /// \brief Returns the offset in bytes from the start of the input buffer /// of the first explicit kernel argument. 
unsigned getExplicitKernelArgOffset(const MachineFunction &MF) const { @@ -670,10 +674,6 @@ public: return HasInv2PiInlineImm; } - bool hasSDWA() const { - return HasSDWA; - } - bool hasDPP() const { return HasDPP; } diff --git a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index b52ea2b3a2c6..f5541e08e1b7 100644 --- a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -881,6 +881,10 @@ public: return AMDGPU::isVI(getSTI()); } + bool isGFX9() const { + return AMDGPU::isGFX9(getSTI()); + } + bool hasInv2PiInlineImm() const { return getFeatureBits()[AMDGPU::FeatureInv2PiInlineImm]; } @@ -989,7 +993,6 @@ private: bool usesConstantBus(const MCInst &Inst, unsigned OpIdx); bool isInlineConstant(const MCInst &Inst, unsigned OpIdx) const; unsigned findImplicitSGPRReadInVOP(const MCInst &Inst) const; - bool isSGPR(unsigned Reg); public: OperandMatchResultTy parseOptionalOperand(OperandVector &Operands); @@ -1042,9 +1045,10 @@ public: OperandMatchResultTy parseSDWADstUnused(OperandVector &Operands); void cvtSdwaVOP1(MCInst &Inst, const OperandVector &Operands); void cvtSdwaVOP2(MCInst &Inst, const OperandVector &Operands); + void cvtSdwaVOP2b(MCInst &Inst, const OperandVector &Operands); void cvtSdwaVOPC(MCInst &Inst, const OperandVector &Operands); void cvtSDWA(MCInst &Inst, const OperandVector &Operands, - uint64_t BasicInstType); + uint64_t BasicInstType, bool skipVcc = false); }; struct OptionalOperand { @@ -1966,7 +1970,8 @@ ArrayRef<unsigned> AMDGPUAsmParser::getMatchedVariants() const { } if (isForcedSDWA()) { - static const unsigned Variants[] = {AMDGPUAsmVariants::SDWA}; + static const unsigned Variants[] = {AMDGPUAsmVariants::SDWA, + AMDGPUAsmVariants::SDWA9}; return makeArrayRef(Variants); } @@ -1977,7 +1982,7 @@ ArrayRef<unsigned> AMDGPUAsmParser::getMatchedVariants() const { static const unsigned Variants[] = { AMDGPUAsmVariants::DEFAULT, AMDGPUAsmVariants::VOP3, - AMDGPUAsmVariants::SDWA, AMDGPUAsmVariants::DPP + AMDGPUAsmVariants::SDWA, AMDGPUAsmVariants::SDWA9, AMDGPUAsmVariants::DPP }; return makeArrayRef(Variants); @@ -2000,14 +2005,6 @@ unsigned AMDGPUAsmParser::findImplicitSGPRReadInVOP(const MCInst &Inst) const { return AMDGPU::NoRegister; } -bool AMDGPUAsmParser::isSGPR(unsigned Reg) { - const MCRegisterInfo *TRI = getContext().getRegisterInfo(); - const MCRegisterClass SGPRClass = TRI->getRegClass(AMDGPU::SReg_32RegClassID); - const unsigned FirstSubReg = TRI->getSubReg(Reg, 1); - return SGPRClass.contains(FirstSubReg != 0 ? FirstSubReg : Reg) || - Reg == AMDGPU::SCC; -} - // NB: This code is correct only when used to check constant // bus limitations because GFX7 support no f16 inline constants. 
// Note that there are no cases when a GFX7 opcode violates @@ -2049,7 +2046,8 @@ bool AMDGPUAsmParser::usesConstantBus(const MCInst &Inst, unsigned OpIdx) { if (MO.isImm()) { return !isInlineConstant(Inst, OpIdx); } - return !MO.isReg() || isSGPR(mc2PseudoReg(MO.getReg())); + return !MO.isReg() || + isSGPR(mc2PseudoReg(MO.getReg()), getContext().getRegisterInfo()); } bool AMDGPUAsmParser::validateOperandLimitations(const MCInst &Inst) { @@ -2060,7 +2058,8 @@ bool AMDGPUAsmParser::validateOperandLimitations(const MCInst &Inst) { if (Desc.TSFlags & (SIInstrFlags::VOPC | SIInstrFlags::VOP1 | SIInstrFlags::VOP2 | - SIInstrFlags::VOP3 | SIInstrFlags::VOP3P)) { + SIInstrFlags::VOP3 | SIInstrFlags::VOP3P | + SIInstrFlags::SDWA)) { // Check special imm operands (used by madmk, etc) if (AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm) != -1) { @@ -4151,14 +4150,19 @@ void AMDGPUAsmParser::cvtSdwaVOP2(MCInst &Inst, const OperandVector &Operands) { cvtSDWA(Inst, Operands, SIInstrFlags::VOP2); } +void AMDGPUAsmParser::cvtSdwaVOP2b(MCInst &Inst, const OperandVector &Operands) { + cvtSDWA(Inst, Operands, SIInstrFlags::VOP2, true); +} + void AMDGPUAsmParser::cvtSdwaVOPC(MCInst &Inst, const OperandVector &Operands) { - cvtSDWA(Inst, Operands, SIInstrFlags::VOPC); + cvtSDWA(Inst, Operands, SIInstrFlags::VOPC, isVI()); } void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands, - uint64_t BasicInstType) { + uint64_t BasicInstType, bool skipVcc) { using namespace llvm::AMDGPU::SDWA; OptionalImmIndexMap OptionalIdx; + bool skippedVcc = false; unsigned I = 1; const MCInstrDesc &Desc = MII.get(Inst.getOpcode()); @@ -4168,15 +4172,22 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands, for (unsigned E = Operands.size(); I != E; ++I) { AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]); - // Add the register arguments - if ((BasicInstType == SIInstrFlags::VOPC || - BasicInstType == SIInstrFlags::VOP2)&& - Op.isReg() && - Op.Reg.RegNo == AMDGPU::VCC) { - // VOPC and VOP2b (v_add_u32, v_sub_u32 ...) sdwa use "vcc" token as dst. - // Skip it. - continue; - } else if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) { + if (skipVcc && !skippedVcc && Op.isReg() && Op.Reg.RegNo == AMDGPU::VCC) { + // VOP2b (v_add_u32, v_sub_u32 ...) sdwa use "vcc" token as dst. + // Skip it if it's 2nd (e.g. v_add_i32_sdwa v1, vcc, v2, v3) + // or 4th (v_addc_u32_sdwa v1, vcc, v2, v3, vcc) operand. + // Skip VCC only if we didn't skip it on previous iteration. 
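Spelling the two VOP2b shapes out makes the operand counts in the condition below less cryptic; each converted source contributes two MCOperands (its modifiers immediate plus the register), so the written 'vcc' tokens are met when the MCInst holds exactly 1 or 5 operands:

// v_add_u32_sdwa  v1, vcc, v2, v3         ; 'vcc' is the 2nd written operand,
//                                            seen while the MCInst holds only
//                                            the dst (1 operand) -> skipped.
// v_addc_u32_sdwa v1, vcc, v2, v3, vcc    ; the trailing 'vcc' (carry-in) is
//                                            seen after dst plus two sources
//                                            were converted (5 operands) -> skipped.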
+ if (BasicInstType == SIInstrFlags::VOP2 && + (Inst.getNumOperands() == 1 || Inst.getNumOperands() == 5)) { + skippedVcc = true; + continue; + } else if (BasicInstType == SIInstrFlags::VOPC && + Inst.getNumOperands() == 0) { + skippedVcc = true; + continue; + } + } + if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) { Op.addRegWithInputModsOperands(Inst, 2); } else if (Op.isImm()) { // Handle optional arguments @@ -4184,20 +4195,30 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands, } else { llvm_unreachable("Invalid operand type"); } + skippedVcc = false; } - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI, 0); - - if (Inst.getOpcode() != AMDGPU::V_NOP_sdwa_vi) { + if (Inst.getOpcode() != AMDGPU::V_NOP_sdwa_gfx9 && + Inst.getOpcode() != AMDGPU::V_NOP_sdwa_vi) { // V_NOP_sdwa_vi has no optional sdwa arguments switch (BasicInstType) { case SIInstrFlags::VOP1: + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI, 0); + if (isGFX9() && + AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::omod) != -1) { + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOModSI, 0); + } addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstSel, SdwaSel::DWORD); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstUnused, DstUnused::UNUSED_PRESERVE); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc0Sel, SdwaSel::DWORD); break; case SIInstrFlags::VOP2: + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI, 0); + if (isGFX9() && + AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::omod) != -1) { + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOModSI, 0); + } addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstSel, SdwaSel::DWORD); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstUnused, DstUnused::UNUSED_PRESERVE); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc0Sel, SdwaSel::DWORD); @@ -4205,6 +4226,9 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands, break; case SIInstrFlags::VOPC: + if (isVI()) { + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI, 0); + } addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc0Sel, SdwaSel::DWORD); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc1Sel, SdwaSel::DWORD); break; @@ -4220,10 +4244,9 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands, Inst.getOpcode() == AMDGPU::V_MAC_F16_sdwa_vi) { auto it = Inst.begin(); std::advance( - it, AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::src2)); + it, AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::src2)); Inst.insert(it, Inst.getOperand(0)); // src2 = dst } - } /// Force static initialization. 
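One detail from the converter above that is easy to miss: v_mac is a multiply-accumulate, so its destination doubles as the third source. The SDWA pseudo carries src2 only as a stub operand, and cvtSDWA re-inserts the destination register at the src2 position, so that roughly:

// v_mac_f32_sdwa v1, v2, v3   computes   v1 = v2 * v3 + v1
// i.e. after conversion the MCInst holds dst = v1 and src2 = v1 (copied from dst)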
diff --git a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index 137b5cca96ce..9b3cde7c4df6 100644 --- a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -62,32 +62,33 @@ static DecodeStatus decodeSoppBrTarget(MCInst &Inst, unsigned Imm, return addOperand(Inst, MCOperand::createImm(Imm)); } -#define DECODE_OPERAND2(RegClass, DecName) \ -static DecodeStatus Decode##RegClass##RegisterClass(MCInst &Inst, \ - unsigned Imm, \ - uint64_t /*Addr*/, \ - const void *Decoder) { \ +#define DECODE_OPERAND(StaticDecoderName, DecoderName) \ +static DecodeStatus StaticDecoderName(MCInst &Inst, \ + unsigned Imm, \ + uint64_t /*Addr*/, \ + const void *Decoder) { \ auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder); \ - return addOperand(Inst, DAsm->decodeOperand_##DecName(Imm)); \ + return addOperand(Inst, DAsm->DecoderName(Imm)); \ } -#define DECODE_OPERAND(RegClass) DECODE_OPERAND2(RegClass, RegClass) +#define DECODE_OPERAND_REG(RegClass) \ +DECODE_OPERAND(Decode##RegClass##RegisterClass, decodeOperand_##RegClass) -DECODE_OPERAND(VGPR_32) -DECODE_OPERAND(VS_32) -DECODE_OPERAND(VS_64) +DECODE_OPERAND_REG(VGPR_32) +DECODE_OPERAND_REG(VS_32) +DECODE_OPERAND_REG(VS_64) -DECODE_OPERAND(VReg_64) -DECODE_OPERAND(VReg_96) -DECODE_OPERAND(VReg_128) +DECODE_OPERAND_REG(VReg_64) +DECODE_OPERAND_REG(VReg_96) +DECODE_OPERAND_REG(VReg_128) -DECODE_OPERAND(SReg_32) -DECODE_OPERAND(SReg_32_XM0_XEXEC) -DECODE_OPERAND(SReg_64) -DECODE_OPERAND(SReg_64_XEXEC) -DECODE_OPERAND(SReg_128) -DECODE_OPERAND(SReg_256) -DECODE_OPERAND(SReg_512) +DECODE_OPERAND_REG(SReg_32) +DECODE_OPERAND_REG(SReg_32_XM0_XEXEC) +DECODE_OPERAND_REG(SReg_64) +DECODE_OPERAND_REG(SReg_64_XEXEC) +DECODE_OPERAND_REG(SReg_128) +DECODE_OPERAND_REG(SReg_256) +DECODE_OPERAND_REG(SReg_512) static DecodeStatus decodeOperand_VSrc16(MCInst &Inst, @@ -106,6 +107,13 @@ static DecodeStatus decodeOperand_VSrcV216(MCInst &Inst, return addOperand(Inst, DAsm->decodeOperand_VSrcV216(Imm)); } +#define DECODE_SDWA9(DecName) \ +DECODE_OPERAND(decodeSDWA9##DecName, decodeSDWA9##DecName) + +DECODE_SDWA9(Src32) +DECODE_SDWA9(Src16) +DECODE_SDWA9(VopcDst) + #include "AMDGPUGenDisassemblerTables.inc" //===----------------------------------------------------------------------===// @@ -164,6 +172,9 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, Res = tryDecodeInst(DecoderTableSDWA64, MI, QW, Address); if (Res) break; + + Res = tryDecodeInst(DecoderTableSDWA964, MI, QW, Address); + if (Res) break; } // Reinitialize Bytes as DPP64 could have eaten too much @@ -582,6 +593,48 @@ MCOperand AMDGPUDisassembler::decodeSpecialReg64(unsigned Val) const { return errOperand(Val, "unknown operand encoding " + Twine(Val)); } +MCOperand AMDGPUDisassembler::decodeSDWA9Src(const OpWidthTy Width, + unsigned Val) const { + using namespace AMDGPU::SDWA; + + if (SDWA9EncValues::SRC_VGPR_MIN <= Val && + Val <= SDWA9EncValues::SRC_VGPR_MAX) { + return createRegOperand(getVgprClassId(Width), + Val - SDWA9EncValues::SRC_VGPR_MIN); + } + if (SDWA9EncValues::SRC_SGPR_MIN <= Val && + Val <= SDWA9EncValues::SRC_SGPR_MAX) { + return createSRegOperand(getSgprClassId(Width), + Val - SDWA9EncValues::SRC_SGPR_MIN); + } + + return decodeSpecialReg32(Val - SDWA9EncValues::SRC_SGPR_MIN); +} + +MCOperand AMDGPUDisassembler::decodeSDWA9Src16(unsigned Val) const { + return decodeSDWA9Src(OPW16, Val); +} + +MCOperand 
AMDGPUDisassembler::decodeSDWA9Src32(unsigned Val) const { + return decodeSDWA9Src(OPW32, Val); +} + + +MCOperand AMDGPUDisassembler::decodeSDWA9VopcDst(unsigned Val) const { + using namespace AMDGPU::SDWA; + + if (Val & SDWA9EncValues::VOPC_DST_VCC_MASK) { + Val &= SDWA9EncValues::VOPC_DST_SGPR_MASK; + if (Val > AMDGPU::EncValues::SGPR_MAX) { + return decodeSpecialReg64(Val); + } else { + return createSRegOperand(getSgprClassId(OPW64), Val); + } + } else { + return createRegOperand(AMDGPU::VCC); + } +} + //===----------------------------------------------------------------------===// // AMDGPUSymbolizer //===----------------------------------------------------------------------===// diff --git a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h index 620bae0a6d1a..0ff405a71e9b 100644 --- a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h +++ b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h @@ -104,6 +104,11 @@ public: MCOperand decodeSrcOp(const OpWidthTy Width, unsigned Val) const; MCOperand decodeSpecialReg32(unsigned Val) const; MCOperand decodeSpecialReg64(unsigned Val) const; + + MCOperand decodeSDWA9Src(const OpWidthTy Width, unsigned Val) const; + MCOperand decodeSDWA9Src16(unsigned Val) const; + MCOperand decodeSDWA9Src32(unsigned Val) const; + MCOperand decodeSDWA9VopcDst(unsigned Val) const; }; //===----------------------------------------------------------------------===// diff --git a/lib/Target/AMDGPU/GCNIterativeScheduler.cpp b/lib/Target/AMDGPU/GCNIterativeScheduler.cpp index 3bb5c9bc22b7..8ead48067336 100644 --- a/lib/Target/AMDGPU/GCNIterativeScheduler.cpp +++ b/lib/Target/AMDGPU/GCNIterativeScheduler.cpp @@ -191,6 +191,7 @@ public: } }; +namespace { // just a stub to make base class happy class SchedStrategyStub : public MachineSchedStrategy { public: @@ -202,6 +203,7 @@ public: void releaseTopNode(SUnit *SU) override {} void releaseBottomNode(SUnit *SU) override {} }; +} // namespace GCNIterativeScheduler::GCNIterativeScheduler(MachineSchedContext *C, StrategyKind S) diff --git a/lib/Target/AMDGPU/GCNMinRegStrategy.cpp b/lib/Target/AMDGPU/GCNMinRegStrategy.cpp index c6d0f2179950..d378df674be9 100644 --- a/lib/Target/AMDGPU/GCNMinRegStrategy.cpp +++ b/lib/Target/AMDGPU/GCNMinRegStrategy.cpp @@ -17,6 +17,7 @@ using namespace llvm; #define DEBUG_TYPE "misched" +namespace { class GCNMinRegScheduler { struct Candidate : ilist_node<Candidate> { const SUnit *SU; @@ -71,6 +72,7 @@ public: std::vector<const SUnit*> schedule(ArrayRef<const SUnit*> TopRoots, const ScheduleDAG &DAG); }; +} // namespace void GCNMinRegScheduler::initNumPreds(const decltype(ScheduleDAG::SUnits) &SUnits) { NumPreds.resize(SUnits.size()); diff --git a/lib/Target/AMDGPU/GCNRegPressure.cpp b/lib/Target/AMDGPU/GCNRegPressure.cpp index 18374dca3f84..390a8286c76a 100644 --- a/lib/Target/AMDGPU/GCNRegPressure.cpp +++ b/lib/Target/AMDGPU/GCNRegPressure.cpp @@ -211,9 +211,9 @@ static LaneBitmask getUsedRegMask(const MachineOperand &MO, return getLiveLaneMask(MO.getReg(), SI, LIS, MRI); } -SmallVector<RegisterMaskPair, 8> collectVirtualRegUses(const MachineInstr &MI, - const LiveIntervals &LIS, - const MachineRegisterInfo &MRI) { +static SmallVector<RegisterMaskPair, 8> +collectVirtualRegUses(const MachineInstr &MI, const LiveIntervals &LIS, + const MachineRegisterInfo &MRI) { SmallVector<RegisterMaskPair, 8> Res; for (const auto &MO : MI.operands()) { if (!MO.isReg() || !TargetRegisterInfo::isVirtualRegister(MO.getReg())) diff --git 
a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h index 3d3858ab47ec..a856b17a228f 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h @@ -52,6 +52,18 @@ public: return 0; } + virtual unsigned getSDWA9SrcEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + return 0; + } + + virtual unsigned getSDWA9VopcDstEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + return 0; + } + protected: uint64_t computeAvailableFeatures(const FeatureBitset &FB) const; void verifyInstructionPredicates(const MCInst &MI, diff --git a/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp b/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp index bda0928036fd..e02acf516c0d 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp @@ -69,6 +69,14 @@ public: unsigned getSOPPBrEncoding(const MCInst &MI, unsigned OpNo, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const override; + + unsigned getSDWA9SrcEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const override; + + unsigned getSDWA9VopcDstEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const override; }; } // end anonymous namespace @@ -319,6 +327,44 @@ unsigned SIMCCodeEmitter::getSOPPBrEncoding(const MCInst &MI, unsigned OpNo, return getMachineOpValue(MI, MO, Fixups, STI); } +unsigned +SIMCCodeEmitter::getSDWA9SrcEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + using namespace AMDGPU::SDWA; + + uint64_t RegEnc = 0; + + const MCOperand &MO = MI.getOperand(OpNo); + + unsigned Reg = MO.getReg(); + RegEnc |= MRI.getEncodingValue(Reg); + RegEnc &= SDWA9EncValues::SRC_VGPR_MASK; + if (AMDGPU::isSGPR(AMDGPU::mc2PseudoReg(Reg), &MRI)) { + RegEnc |= SDWA9EncValues::SRC_SGPR_MASK; + } + return RegEnc; +} + +unsigned +SIMCCodeEmitter::getSDWA9VopcDstEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + using namespace AMDGPU::SDWA; + + uint64_t RegEnc = 0; + + const MCOperand &MO = MI.getOperand(OpNo); + + unsigned Reg = MO.getReg(); + if (Reg != AMDGPU::VCC) { + RegEnc |= MRI.getEncodingValue(Reg); + RegEnc &= SDWA9EncValues::VOPC_DST_SGPR_MASK; + RegEnc |= SDWA9EncValues::VOPC_DST_VCC_MASK; + } + return RegEnc; +} + uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst &MI, const MCOperand &MO, SmallVectorImpl<MCFixup> &Fixups, diff --git a/lib/Target/AMDGPU/R600ISelLowering.cpp b/lib/Target/AMDGPU/R600ISelLowering.cpp index 3590a9b05e1d..60b913cfd39a 100644 --- a/lib/Target/AMDGPU/R600ISelLowering.cpp +++ b/lib/Target/AMDGPU/R600ISelLowering.cpp @@ -1618,6 +1618,14 @@ EVT R600TargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &, return VT.changeVectorElementTypeToInteger(); } +bool R600TargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT) const { + // Local and Private addresses do not handle vectors. 
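The getSDWA9SrcEncoding/getSDWA9VopcDstEncoding hooks above and the decodeSDWA9* routines before them are two views of the same small tagged fields: a GFX9 SDWA source is nine bits whose top bit marks an SGPR rather than a VGPR, and a VOPC destination is seven bits plus a flag bit that marks an explicit SGPR pair rather than the implicit vcc. A compact restatement of the source field, using the SDWA9EncValues constants defined in SIDefines.h further down (illustration only, register numbering simplified):

#include <cassert>

constexpr unsigned SRC_SGPR_MASK = 0x100; // bit 8 set: source is an SGPR
constexpr unsigned SRC_VGPR_MASK = 0xFF;  // low 8 bits: register encoding

// Mirror of getSDWA9SrcEncoding / decodeSDWA9Src for the source field.
unsigned encodeSrc(unsigned RegEnc, bool IsSGPR) {
  unsigned Enc = RegEnc & SRC_VGPR_MASK;
  return IsSGPR ? (Enc | SRC_SGPR_MASK) : Enc;
}
bool decodedAsSGPR(unsigned Enc) { return (Enc & SRC_SGPR_MASK) != 0; }

int main() {
  assert(encodeSrc(5, /*IsSGPR=*/true) == 0x105);   // s5 -> 0x105
  assert(!decodedAsSGPR(encodeSrc(5, false)));      // v5 -> 0x005
  return 0;
}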
Limit to i32 + if ((AS == AMDGPUASI.LOCAL_ADDRESS || AS == AMDGPUASI.PRIVATE_ADDRESS)) { + return (MemVT.getSizeInBits() <= 32); + } + return true; +} + bool R600TargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace, unsigned Align, diff --git a/lib/Target/AMDGPU/R600ISelLowering.h b/lib/Target/AMDGPU/R600ISelLowering.h index 9700ce14c6f3..d6a0876a6ee7 100644 --- a/lib/Target/AMDGPU/R600ISelLowering.h +++ b/lib/Target/AMDGPU/R600ISelLowering.h @@ -44,6 +44,8 @@ public: EVT getSetCCResultType(const DataLayout &DL, LLVMContext &, EVT VT) const override; + bool canMergeStoresTo(unsigned AS, EVT MemVT) const override; + bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS, unsigned Align, bool *IsFast) const override; diff --git a/lib/Target/AMDGPU/R600RegisterInfo.td b/lib/Target/AMDGPU/R600RegisterInfo.td index cc667d985a82..3c1e8527284c 100644 --- a/lib/Target/AMDGPU/R600RegisterInfo.td +++ b/lib/Target/AMDGPU/R600RegisterInfo.td @@ -226,7 +226,7 @@ def R600_Reg32 : RegisterClass <"AMDGPU", [f32, i32], 32, (add R600_Addr, R600_KC0, R600_KC1, ZERO, HALF, ONE, ONE_INT, PV_X, ALU_LITERAL_X, NEG_ONE, NEG_HALF, - ALU_CONST, ALU_PARAM, OQAP + ALU_CONST, ALU_PARAM, OQAP, INDIRECT_BASE_ADDR )>; def R600_Predicate : RegisterClass <"AMDGPU", [i32], 32, (add diff --git a/lib/Target/AMDGPU/SIDefines.h b/lib/Target/AMDGPU/SIDefines.h index a01330cb9171..80967edee0ab 100644 --- a/lib/Target/AMDGPU/SIDefines.h +++ b/lib/Target/AMDGPU/SIDefines.h @@ -118,6 +118,10 @@ namespace AMDGPU { // Operand for source modifiers for VOP instructions OPERAND_INPUT_MODS, + // Operand for GFX9 SDWA instructions + OPERAND_SDWA9_SRC, + OPERAND_SDWA9_VOPC_DST, + /// Operand with 32-bit immediate that uses the constant bus. OPERAND_KIMM32, OPERAND_KIMM16 @@ -160,7 +164,8 @@ namespace AMDGPUAsmVariants { DEFAULT = 0, VOP3 = 1, SDWA = 2, - DPP = 3 + SDWA9 = 3, + DPP = 4 }; } @@ -294,6 +299,18 @@ enum DstUnused { UNUSED_PRESERVE = 2, }; +enum SDWA9EncValues{ + SRC_SGPR_MASK = 0x100, + SRC_VGPR_MASK = 0xFF, + VOPC_DST_VCC_MASK = 0x80, + VOPC_DST_SGPR_MASK = 0x7F, + + SRC_VGPR_MIN = 0, + SRC_VGPR_MAX = 255, + SRC_SGPR_MIN = 256, + SRC_SGPR_MAX = 357, +}; + } // namespace SDWA } // namespace AMDGPU diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp index 01c1f78e7ca4..76c2644867aa 100644 --- a/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/lib/Target/AMDGPU/SIISelLowering.cpp @@ -698,6 +698,18 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, } } +bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT) const { + if (AS == AMDGPUASI.GLOBAL_ADDRESS || AS == AMDGPUASI.FLAT_ADDRESS) { + return (MemVT.getSizeInBits() <= 4 * 32); + } else if (AS == AMDGPUASI.PRIVATE_ADDRESS) { + unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize(); + return (MemVT.getSizeInBits() <= MaxPrivateBits); + } else if (AS == AMDGPUASI.LOCAL_ADDRESS) { + return (MemVT.getSizeInBits() <= 2 * 32); + } + return true; +} + bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace, unsigned Align, @@ -4229,12 +4241,40 @@ SDValue SITargetLowering::performAndCombine(SDNode *N, SDValue RHS = N->getOperand(1); - if (VT == MVT::i64) { - const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS); - if (CRHS) { - if (SDValue Split - = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS)) - return Split; + const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS); + if (VT == MVT::i64 && CRHS) { + if (SDValue Split + = 
splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS)) + return Split; + } + + if (CRHS && VT == MVT::i32) { + // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb + // nb = number of trailing zeroes in mask + // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass, + // given that we are selecting 8 or 16 bit fields starting at byte boundary. + uint64_t Mask = CRHS->getZExtValue(); + unsigned Bits = countPopulation(Mask); + if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL && + (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) { + if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) { + unsigned Shift = CShift->getZExtValue(); + unsigned NB = CRHS->getAPIntValue().countTrailingZeros(); + unsigned Offset = NB + Shift; + if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary. + SDLoc SL(N); + SDValue BFE = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32, + LHS->getOperand(0), + DAG.getConstant(Offset, SL, MVT::i32), + DAG.getConstant(Bits, SL, MVT::i32)); + EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits); + SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE, + DAG.getValueType(NarrowVT)); + SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext, + DAG.getConstant(NB, SDLoc(CRHS), MVT::i32)); + return Shl; + } + } } } diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h index e68837747491..8e2ec40b224c 100644 --- a/lib/Target/AMDGPU/SIISelLowering.h +++ b/lib/Target/AMDGPU/SIISelLowering.h @@ -150,6 +150,8 @@ public: bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS) const override; + bool canMergeStoresTo(unsigned AS, EVT MemVT) const override; + bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS, unsigned Align, bool *IsFast) const override; diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp index 38a16b525a75..36d29b8ecf06 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -2331,6 +2331,10 @@ static bool isSubRegOf(const SIRegisterInfo &TRI, bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, StringRef &ErrInfo) const { uint16_t Opcode = MI.getOpcode(); + + if (SIInstrInfo::isGenericOpcode(MI.getOpcode())) + return true; + const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0); int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1); diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td index 7b052844f177..c5287c7f64ba 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.td +++ b/lib/Target/AMDGPU/SIInstrInfo.td @@ -439,6 +439,27 @@ def ExpSrc3 : RegisterOperand<VGPR_32> { let ParserMatchClass = VReg32OrOffClass; } +class SDWA9Src : RegisterOperand<VS_32> { + let OperandNamespace = "AMDGPU"; + let OperandType = "OPERAND_SDWA9_SRC"; + let EncoderMethod = "getSDWA9SrcEncoding"; +} + +def SDWA9Src32 : SDWA9Src { + let DecoderMethod = "decodeSDWA9Src32"; +} + +def SDWA9Src16 : SDWA9Src { + let DecoderMethod = "decodeSDWA9Src16"; +} + +def SDWA9VopcDst : VOPDstOperand<SReg_64> { + let OperandNamespace = "AMDGPU"; + let OperandType = "OPERAND_SDWA9_VOPC_DST"; + let EncoderMethod = "getSDWA9VopcDstEncoding"; + let DecoderMethod = "decodeSDWA9VopcDst"; +} + class NamedMatchClass<string CName, bit Optional = 1> : AsmOperandClass { let Name = "Imm"#CName; let PredicateMethod = "is"#CName; @@ -588,6 +609,16 @@ class 
IntInputMods <IntInputModsMatchClass matchClass> : InputMods <matchClass> def Int32InputMods : IntInputMods<Int32InputModsMatchClass>; def Int64InputMods : IntInputMods<Int64InputModsMatchClass>; +def FPRegInputModsMatchClass : AsmOperandClass { + let Name = "RegWithFPInputMods"; + let ParserMethod = "parseRegWithFPInputMods"; + let PredicateMethod = "isRegKind"; +} + +def FPRegInputMods : InputMods <FPRegInputModsMatchClass> { + let PrintMethod = "printOperandAndFPInputMods"; +} + def FPVRegInputModsMatchClass : AsmOperandClass { let Name = "VRegWithFPInputMods"; let ParserMethod = "parseRegWithFPInputMods"; @@ -598,6 +629,17 @@ def FPVRegInputMods : InputMods <FPVRegInputModsMatchClass> { let PrintMethod = "printOperandAndFPInputMods"; } + +def IntRegInputModsMatchClass : AsmOperandClass { + let Name = "RegWithIntInputMods"; + let ParserMethod = "parseRegWithIntInputMods"; + let PredicateMethod = "isRegKind"; +} + +def IntRegInputMods : InputMods <IntRegInputModsMatchClass> { + let PrintMethod = "printOperandAndIntInputMods"; +} + def IntVRegInputModsMatchClass : AsmOperandClass { let Name = "VRegWithIntInputMods"; let ParserMethod = "parseRegWithIntInputMods"; @@ -783,6 +825,14 @@ class getVALUDstForVT<ValueType VT> { VOPDstOperand<SReg_64>)))); // else VT == i1 } +// Returns the register class to use for the destination of VOP[12C] +// instructions with GFX9 SDWA extension +class getSDWA9DstForVT<ValueType VT> { + RegisterOperand ret = !if(!eq(VT.Size, 1), + SDWA9VopcDst, // VOPC + VOPDstOperand<VGPR_32>); // VOP1/2 32-bit dst +} + // Returns the register class to use for source 0 of VOP[12C] // instructions for the given VT. class getVOPSrc0ForVT<ValueType VT> { @@ -823,6 +873,9 @@ class getVregSrcForVT<ValueType VT> { !if(!eq(VT.Size, 64), VReg_64, VGPR_32)); } +class getSDWA9SrcForVT <ValueType VT> { + RegisterOperand ret = !if(!eq(VT.Size, 16), SDWA9Src16, SDWA9Src32); +} // Returns the register class to use for sources of VOP3 instructions for the // given VT. @@ -926,6 +979,15 @@ class getSrcModExt <ValueType VT> { Operand ret = !if(isFP, FPVRegInputMods, IntVRegInputMods); } +// Return type of input modifiers operand specified input operand for SDWA 9 +class getSrcModSDWA9 <ValueType VT> { + bit isFP = !if(!eq(VT.Value, f16.Value), 1, + !if(!eq(VT.Value, f32.Value), 1, + !if(!eq(VT.Value, f64.Value), 1, + 0))); + Operand ret = !if(isFP, FPRegInputMods, IntRegInputMods); +} + // Returns the input arguments for VOP[12C] instructions for the given SrcVT. 
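The performAndCombine addition above, "and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb", is easiest to check with numbers: for mask = 0xFF00 the field is 8 bits wide starting nb = 8 bits up, so the and/srl pair is just a byte extract followed by a shift, which is what the SDWA peephole can later absorb. A self-contained check of the identity:

#include <cassert>
#include <cstdint>

// Software model of the BFE_U32 node: extract Width bits starting at Offset.
uint32_t bfe_u32(uint32_t X, unsigned Offset, unsigned Width) {
  return (X >> Offset) & ((1u << Width) - 1);
}

int main() {
  const uint32_t X = 0x12345678;
  const unsigned C = 8;            // srl amount
  const uint32_t Mask = 0xFF00;    // 8-bit shifted mask, NB = 8 trailing zeros
  const unsigned NB = 8, Bits = 8; // Offset = NB + C = 16, a byte boundary

  uint32_t Original = (X >> C) & Mask;                 // 0x3400
  uint32_t Combined = bfe_u32(X, NB + C, Bits) << NB;  // bfe(X,16,8) = 0x34
  assert(Original == Combined);
  return 0;
}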
class getIns32 <RegisterOperand Src0RC, RegisterClass Src1RC, int NumSrcArgs> { dag ret = !if(!eq(NumSrcArgs, 1), (ins Src0RC:$src0), // VOP1 @@ -1062,6 +1124,7 @@ class getInsSDWA <RegisterClass Src0RC, RegisterClass Src1RC, int NumSrcArgs, // VOP1 without input operands (V_NOP) (ins), !if(!eq(NumSrcArgs, 1), + // VOP1_SDWA (ins Src0Mod:$src0_modifiers, Src0RC:$src0, clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused, src0_sel:$src0_sel), @@ -1071,7 +1134,7 @@ class getInsSDWA <RegisterClass Src0RC, RegisterClass Src1RC, int NumSrcArgs, (ins Src0Mod:$src0_modifiers, Src0RC:$src0, Src1Mod:$src1_modifiers, Src1RC:$src1, clampmod:$clamp, src0_sel:$src0_sel, src1_sel:$src1_sel), - // VOP2_SDWA or VOPC_SDWA with modifiers + // VOP2_SDWA with modifiers (ins Src0Mod:$src0_modifiers, Src0RC:$src0, Src1Mod:$src1_modifiers, Src1RC:$src1, clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused, @@ -1079,12 +1142,65 @@ class getInsSDWA <RegisterClass Src0RC, RegisterClass Src1RC, int NumSrcArgs, (ins)/* endif */))); } +// Ins for GFX9 SDWA +class getInsSDWA9 <RegisterOperand Src0RC, RegisterOperand Src1RC, int NumSrcArgs, + bit HasSDWAOMod, Operand Src0Mod, Operand Src1Mod, + ValueType DstVT> { + + dag ret = !if(!eq(NumSrcArgs, 0), + // VOP1 without input operands (V_NOP) + (ins), + !if(!eq(NumSrcArgs, 1), + // VOP1 + !if(!eq(HasSDWAOMod, 0), + // VOP1_SDWA9 without omod + (ins Src0Mod:$src0_modifiers, Src0RC:$src0, + clampmod:$clamp, + dst_sel:$dst_sel, dst_unused:$dst_unused, + src0_sel:$src0_sel), + // VOP1_SDWA9 with omod + (ins Src0Mod:$src0_modifiers, Src0RC:$src0, + clampmod:$clamp, omod:$omod, + dst_sel:$dst_sel, dst_unused:$dst_unused, + src0_sel:$src0_sel)), + !if(!eq(NumSrcArgs, 2), + !if(!eq(DstVT.Size, 1), + // VOPC_SDWA9 + (ins Src0Mod:$src0_modifiers, Src0RC:$src0, + Src1Mod:$src1_modifiers, Src1RC:$src1, + src0_sel:$src0_sel, src1_sel:$src1_sel), + // VOP2_SDWA9 + !if(!eq(HasSDWAOMod, 0), + // VOP2_SDWA9 without omod + (ins Src0Mod:$src0_modifiers, Src0RC:$src0, + Src1Mod:$src1_modifiers, Src1RC:$src1, + clampmod:$clamp, + dst_sel:$dst_sel, dst_unused:$dst_unused, + src0_sel:$src0_sel, src1_sel:$src1_sel), + // VOP1_SDWA9 with omod + (ins Src0Mod:$src0_modifiers, Src0RC:$src0, + Src1Mod:$src1_modifiers, Src1RC:$src1, + clampmod:$clamp, omod:$omod, + dst_sel:$dst_sel, dst_unused:$dst_unused, + src0_sel:$src0_sel, src1_sel:$src1_sel))), + (ins)/* endif */))); +} + // Outs for DPP and SDWA -class getOutsExt <bit HasDst, ValueType DstVT, RegisterOperand DstRCDPP> { +class getOutsExt <bit HasDst, ValueType DstVT, RegisterOperand DstRCExt> { dag ret = !if(HasDst, !if(!eq(DstVT.Size, 1), (outs), // no dst for VOPC, we use "vcc"-token as dst in SDWA VOPC instructions - (outs DstRCDPP:$vdst)), + (outs DstRCExt:$vdst)), + (outs)); // V_NOP +} + +// Outs for GFX9 SDWA +class getOutsSDWA9 <bit HasDst, ValueType DstVT, RegisterOperand DstRCSDWA9> { + dag ret = !if(HasDst, + !if(!eq(DstVT.Size, 1), + (outs DstRCSDWA9:$sdst), + (outs DstRCSDWA9:$vdst)), (outs)); // V_NOP } @@ -1153,8 +1269,7 @@ class getAsmDPP <bit HasDst, int NumSrcArgs, bit HasModifiers, ValueType DstVT = string ret = dst#args#" $dpp_ctrl$row_mask$bank_mask$bound_ctrl"; } -class getAsmSDWA <bit HasDst, int NumSrcArgs, bit HasFloatModifiers, - ValueType DstVT = i32> { +class getAsmSDWA <bit HasDst, int NumSrcArgs, ValueType DstVT = i32> { string dst = !if(HasDst, !if(!eq(DstVT.Size, 1), " vcc", // use vcc token as dst for VOPC instructioins @@ -1182,6 +1297,35 @@ class getAsmSDWA <bit HasDst, int NumSrcArgs, bit 
HasFloatModifiers, string ret = dst#args#sdwa; } +class getAsmSDWA9 <bit HasDst, bit HasOMod, int NumSrcArgs, + ValueType DstVT = i32> { + string dst = !if(HasDst, + !if(!eq(DstVT.Size, 1), + "$sdst", // VOPC + "$vdst"), // VOP1/2 + ""); + string src0 = "$src0_modifiers"; + string src1 = "$src1_modifiers"; + string out_mods = !if(!eq(HasOMod, 0), "$clamp", "$clamp$omod"); + string args = !if(!eq(NumSrcArgs, 0), "", + !if(!eq(NumSrcArgs, 1), + ", "#src0, + ", "#src0#", "#src1 + ) + ); + string sdwa = !if(!eq(NumSrcArgs, 0), "", + !if(!eq(NumSrcArgs, 1), + out_mods#" $dst_sel $dst_unused $src0_sel", + !if(!eq(DstVT.Size, 1), + " $src0_sel $src1_sel", // No dst_sel, dst_unused and output modifiers for VOPC + out_mods#" $dst_sel $dst_unused $src0_sel $src1_sel" + ) + ) + ); + string ret = dst#args#sdwa; +} + + // Function that checks if instruction supports DPP and SDWA class getHasExt <int NumSrcArgs, ValueType DstVT = i32, ValueType Src0VT = i32, ValueType Src1VT = i32> { @@ -1219,6 +1363,7 @@ class VOPProfile <list<ValueType> _ArgVT> { field RegisterOperand DstRC = getVALUDstForVT<DstVT>.ret; field RegisterOperand DstRCDPP = getVALUDstForVT<DstVT>.ret; field RegisterOperand DstRCSDWA = getVALUDstForVT<DstVT>.ret; + field RegisterOperand DstRCSDWA9 = getSDWA9DstForVT<DstVT>.ret; field RegisterOperand Src0RC32 = getVOPSrc0ForVT<Src0VT>.ret; field RegisterClass Src1RC32 = getVregSrcForVT<Src1VT>.ret; field RegisterOperand Src0RC64 = getVOP3SrcForVT<Src0VT>.ret; @@ -1228,6 +1373,8 @@ class VOPProfile <list<ValueType> _ArgVT> { field RegisterClass Src1DPP = getVregSrcForVT<Src1VT>.ret; field RegisterClass Src0SDWA = getVregSrcForVT<Src0VT>.ret; field RegisterClass Src1SDWA = getVregSrcForVT<Src1VT>.ret; + field RegisterOperand Src0SDWA9 = getSDWA9SrcForVT<Src0VT>.ret; + field RegisterOperand Src1SDWA9 = getSDWA9SrcForVT<Src0VT>.ret; field Operand Src0Mod = getSrcMod<Src0VT>.ret; field Operand Src1Mod = getSrcMod<Src1VT>.ret; field Operand Src2Mod = getSrcMod<Src2VT>.ret; @@ -1235,6 +1382,8 @@ class VOPProfile <list<ValueType> _ArgVT> { field Operand Src1ModDPP = getSrcModExt<Src1VT>.ret; field Operand Src0ModSDWA = getSrcModExt<Src0VT>.ret; field Operand Src1ModSDWA = getSrcModExt<Src1VT>.ret; + field Operand Src0ModSDWA9 = getSrcModSDWA9<Src0VT>.ret; + field Operand Src1ModSDWA9 = getSrcModSDWA9<Src1VT>.ret; field bit HasDst = !if(!eq(DstVT.Value, untyped.Value), 0, 1); @@ -1261,14 +1410,16 @@ class VOPProfile <list<ValueType> _ArgVT> { field bit HasSrc2Mods = !if(HasModifiers, BitOr<HasSrc2FloatMods, HasSrc2IntMods>.ret, 0); field bit HasClamp = HasModifiers; - field bit HasSDWAClamp = HasSrc0; + field bit HasSDWAClamp = EmitDst; field bit HasFPClamp = BitAnd<isFloatType<DstVT>.ret, HasClamp>.ret; field bit IsPacked = isPackedType<Src0VT>.ret; field bit HasOpSel = IsPacked; field bit HasOMod = !if(HasOpSel, 0, HasModifiers); + field bit HasSDWAOMod = isFloatType<DstVT>.ret; field bit HasExt = getHasExt<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret; + field bit HasSDWA9 = HasExt; field Operand Src0PackedMod = !if(HasSrc0FloatMods, PackedF16InputMods, PackedI16InputMods); field Operand Src1PackedMod = !if(HasSrc1FloatMods, PackedF16InputMods, PackedI16InputMods); @@ -1282,6 +1433,7 @@ class VOPProfile <list<ValueType> _ArgVT> { field dag Outs64 = Outs; field dag OutsDPP = getOutsExt<HasDst, DstVT, DstRCDPP>.ret; field dag OutsSDWA = getOutsExt<HasDst, DstVT, DstRCSDWA>.ret; + field dag OutsSDWA9 = getOutsSDWA9<HasDst, DstVT, DstRCSDWA9>.ret; field dag Ins32 = getIns32<Src0RC32, Src1RC32, 
NumSrcArgs>.ret; field dag Ins64 = getIns64<Src0RC64, Src1RC64, Src2RC64, NumSrcArgs, @@ -1296,16 +1448,21 @@ class VOPProfile <list<ValueType> _ArgVT> { field dag InsSDWA = getInsSDWA<Src0SDWA, Src1SDWA, NumSrcArgs, HasModifiers, Src0ModSDWA, Src1ModSDWA, DstVT>.ret; + field dag InsSDWA9 = getInsSDWA9<Src0SDWA9, Src1SDWA9, NumSrcArgs, + HasSDWAOMod, Src0ModSDWA9, Src1ModSDWA9, + DstVT>.ret; field string Asm32 = getAsm32<HasDst, NumSrcArgs, DstVT>.ret; field string Asm64 = getAsm64<HasDst, NumSrcArgs, HasModifiers, HasOMod, DstVT>.ret; field string AsmVOP3P = getAsmVOP3P<HasDst, NumSrcArgs, HasModifiers, HasClamp, DstVT>.ret; field string AsmDPP = getAsmDPP<HasDst, NumSrcArgs, HasModifiers, DstVT>.ret; - field string AsmSDWA = getAsmSDWA<HasDst, NumSrcArgs, HasModifiers, DstVT>.ret; + field string AsmSDWA = getAsmSDWA<HasDst, NumSrcArgs, DstVT>.ret; + field string AsmSDWA9 = getAsmSDWA9<HasDst, HasSDWAOMod, NumSrcArgs, DstVT>.ret; } class VOP_NO_EXT <VOPProfile p> : VOPProfile <p.ArgVT> { let HasExt = 0; + let HasSDWA9 = 0; } def VOP_F16_F16 : VOPProfile <[f16, f16, untyped, untyped]>; @@ -1446,6 +1603,15 @@ def getSDWAOp : InstrMapping { let ValueCols = [["SDWA"]]; } +// Maps ordinary instructions to their SDWA GFX9 counterparts +def getSDWA9Op : InstrMapping { + let FilterClass = "VOP"; + let RowFields = ["OpName"]; + let ColFields = ["AsmVariantName"]; + let KeyCol = ["Default"]; + let ValueCols = [["SDWA9"]]; +} + def getMaskedMIMGOp : InstrMapping { let FilterClass = "MIMG_Mask"; let RowFields = ["Op"]; diff --git a/lib/Target/AMDGPU/SOPInstructions.td b/lib/Target/AMDGPU/SOPInstructions.td index f2d8b6f7b7a4..ec29a66c8bbb 100644 --- a/lib/Target/AMDGPU/SOPInstructions.td +++ b/lib/Target/AMDGPU/SOPInstructions.td @@ -184,7 +184,9 @@ def S_BITSET0_B32 : SOP1_32 <"s_bitset0_b32">; def S_BITSET0_B64 : SOP1_64_32 <"s_bitset0_b64">; def S_BITSET1_B32 : SOP1_32 <"s_bitset1_b32">; def S_BITSET1_B64 : SOP1_64_32 <"s_bitset1_b64">; -def S_GETPC_B64 : SOP1_64_0 <"s_getpc_b64">; +def S_GETPC_B64 : SOP1_64_0 <"s_getpc_b64", + [(set i64:$sdst, (int_amdgcn_s_getpc))] +>; let isTerminator = 1, isBarrier = 1, SchedRW = [WriteBranch] in { diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 2abd4afad3b6..630f469eabf0 100644 --- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -544,6 +544,17 @@ bool isVI(const MCSubtargetInfo &STI) { return STI.getFeatureBits()[AMDGPU::FeatureVolcanicIslands]; } +bool isGFX9(const MCSubtargetInfo &STI) { + return STI.getFeatureBits()[AMDGPU::FeatureGFX9]; +} + +bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI) { + const MCRegisterClass SGPRClass = TRI->getRegClass(AMDGPU::SReg_32RegClassID); + const unsigned FirstSubReg = TRI->getSubReg(Reg, 1); + return SGPRClass.contains(FirstSubReg != 0 ? 
FirstSubReg : Reg) || + Reg == AMDGPU::SCC; +} + unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI) { switch(Reg) { diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 8e74aa2cc9a8..19888ad7556a 100644 --- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -273,6 +273,10 @@ inline bool isKernel(CallingConv::ID CC) { bool isSI(const MCSubtargetInfo &STI); bool isCI(const MCSubtargetInfo &STI); bool isVI(const MCSubtargetInfo &STI); +bool isGFX9(const MCSubtargetInfo &STI); + +/// \brief Is Reg - scalar register +bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI); /// If \p Reg is a pseudo reg, return the correct hardware register given /// \p STI otherwise return \p Reg. diff --git a/lib/Target/AMDGPU/VOP1Instructions.td b/lib/Target/AMDGPU/VOP1Instructions.td index 1febc6bf8ec2..95b5ef0a49db 100644 --- a/lib/Target/AMDGPU/VOP1Instructions.td +++ b/lib/Target/AMDGPU/VOP1Instructions.td @@ -30,6 +30,15 @@ class VOP1_SDWAe <bits<8> op, VOPProfile P> : VOP_SDWAe <P> { let Inst{31-25} = 0x3f; // encoding } +class VOP1_SDWA9Ae <bits<8> op, VOPProfile P> : VOP_SDWA9Ae <P> { + bits<8> vdst; + + let Inst{8-0} = 0xf9; // sdwa + let Inst{16-9} = op; + let Inst{24-17} = !if(P.EmitDst, vdst{7-0}, 0); + let Inst{31-25} = 0x3f; // encoding +} + class VOP1_Pseudo <string opName, VOPProfile P, list<dag> pattern=[], bit VOP1Only = 0> : InstSI <P.Outs32, P.Ins32, "", pattern>, VOP <opName>, @@ -84,6 +93,11 @@ class VOP1_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> : let AsmMatchConverter = "cvtSdwaVOP1"; } +class VOP1_SDWA9_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> : + VOP_SDWA9_Pseudo <OpName, P, pattern> { + let AsmMatchConverter = "cvtSdwaVOP1"; +} + class getVOP1Pat64 <SDPatternOperator node, VOPProfile P> : LetDummies { list<dag> ret = !if(P.HasModifiers, @@ -103,6 +117,7 @@ multiclass VOP1Inst <string opName, VOPProfile P, def _e32 : VOP1_Pseudo <opName, P>; def _e64 : VOP3_Pseudo <opName, P, getVOP1Pat64<node, P>.ret>; def _sdwa : VOP1_SDWA_Pseudo <opName, P>; + def _sdwa9 : VOP1_SDWA9_Pseudo <opName, P>; } // Special profile for instructions which have clamp @@ -243,6 +258,7 @@ def VOP_I32_VI32_NO_EXT : VOPProfile<[i32, i32, untyped, untyped]> { let Src0RC64 = VRegSrc_32; let HasExt = 0; + let HasSDWA9 = 0; } // Special case because there are no true output operands. 
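The VOP1_SDWA9Ae class above fixes the first dword of a GFX9 VOP1 SDWA instruction: bits 8-0 hold the 0xf9 marker in the slot where a plain VOP1 would name src0, bits 16-9 the opcode, bits 24-17 the destination VGPR, and bits 31-25 the VOP1 encoding 0x3f; the second dword (VOP_SDWA9e, shown at the end of this section) carries src0 and the sel/modifier controls. A quick bit-packing sketch of the first dword only:

#include <cstdint>

// First dword of a GFX9 VOP1 SDWA instruction, following VOP1_SDWA9Ae.
uint32_t packVOP1SDWA9Word0(unsigned Op, unsigned VDst) {
  uint32_t W = 0;
  W |= 0xF9u;                  // bits 8-0:   SDWA marker in the src0 slot
  W |= (Op   & 0xFFu) << 9;    // bits 16-9:  VOP1 opcode
  W |= (VDst & 0xFFu) << 17;   // bits 24-17: destination VGPR
  W |= 0x3Fu << 25;            // bits 31-25: VOP1 encoding
  return W;
}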
Hack vdst @@ -258,16 +274,21 @@ def VOP_MOVRELD : VOPProfile<[untyped, i32, untyped, untyped]> { let Ins64 = (ins Src0RC64:$vdst, VSrc_b32:$src0); let InsDPP = (ins Src0RC32:$vdst, Src0RC32:$src0, dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, bank_mask:$bank_mask, bound_ctrl:$bound_ctrl); - let InsSDWA = (ins Src0RC32:$vdst, Src0ModSDWA:$src0_modifiers, VCSrc_b32:$src0, + let InsSDWA = (ins Src0RC32:$vdst, Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0, clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused, src0_sel:$src0_sel); + let InsSDWA9 = (ins Src0RC32:$vdst, Src0ModSDWA9:$src0_modifiers, Src0SDWA9:$src0, + clampmod:$clamp, omod:$omod, dst_sel:$dst_sel, dst_unused:$dst_unused, + src0_sel:$src0_sel); let Asm32 = getAsm32<1, 1>.ret; let Asm64 = getAsm64<1, 1, 0, 1>.ret; let AsmDPP = getAsmDPP<1, 1, 0>.ret; - let AsmSDWA = getAsmSDWA<1, 1, 0>.ret; + let AsmSDWA = getAsmSDWA<1, 1>.ret; + let AsmSDWA9 = getAsmSDWA9<1, 0, 1>.ret; let HasExt = 0; + let HasSDWA9 = 0; let HasDst = 0; let EmitDst = 1; // force vdst emission } @@ -324,7 +345,7 @@ defm V_EXP_LEGACY_F32 : VOP1Inst <"v_exp_legacy_f32", VOP_F32_F32>; } // End SubtargetPredicate = isCIVI -let SubtargetPredicate = isVI in { +let SubtargetPredicate = Has16BitInsts in { defm V_CVT_F16_U16 : VOP1Inst <"v_cvt_f16_u16", VOP1_F16_I16, uint_to_fp>; defm V_CVT_F16_I16 : VOP1Inst <"v_cvt_f16_i16", VOP1_F16_I16, sint_to_fp>; @@ -347,7 +368,7 @@ defm V_COS_F16 : VOP1Inst <"v_cos_f16", VOP_F16_F16, AMDGPUcos>; } -let Predicates = [isVI] in { +let Predicates = [Has16BitInsts] in { def : Pat< (f32 (f16_to_fp i16:$src)), @@ -523,6 +544,10 @@ multiclass VOP1_Real_vi <bits<10> op> { VOP_SDWA_Real <!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>, VOP1_SDWAe <op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl>; + def _sdwa_gfx9 : + VOP_SDWA9_Real <!cast<VOP1_SDWA9_Pseudo>(NAME#"_sdwa9")>, + VOP1_SDWA9Ae <op{7-0}, !cast<VOP1_SDWA9_Pseudo>(NAME#"_sdwa9").Pfl>; + // For now left dpp only for asm/dasm // TODO: add corresponding pseudo def _dpp : VOP1_DPP<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32")>; diff --git a/lib/Target/AMDGPU/VOP2Instructions.td b/lib/Target/AMDGPU/VOP2Instructions.td index 4a11d9471f1d..657cacaa792c 100644 --- a/lib/Target/AMDGPU/VOP2Instructions.td +++ b/lib/Target/AMDGPU/VOP2Instructions.td @@ -48,6 +48,18 @@ class VOP2_SDWAe <bits<6> op, VOPProfile P> : VOP_SDWAe <P> { let Inst{31} = 0x0; // encoding } +class VOP2_SDWA9Ae <bits<6> op, VOPProfile P> : VOP_SDWA9Ae <P> { + bits<8> vdst; + bits<9> src1; + + let Inst{8-0} = 0xf9; // sdwa + let Inst{16-9} = !if(P.HasSrc1, src1{7-0}, 0); + let Inst{24-17} = !if(P.EmitDst, vdst{7-0}, 0); + let Inst{30-25} = op; + let Inst{31} = 0x0; // encoding + let Inst{63} = !if(P.HasSrc1, src1{8}, 0); // src1_sgpr +} + class VOP2_Pseudo <string opName, VOPProfile P, list<dag> pattern=[], string suffix = "_e32"> : InstSI <P.Outs32, P.Ins32, "", pattern>, VOP <opName>, @@ -102,6 +114,11 @@ class VOP2_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> : let AsmMatchConverter = "cvtSdwaVOP2"; } +class VOP2_SDWA9_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> : + VOP_SDWA9_Pseudo <OpName, P, pattern> { + let AsmMatchConverter = "cvtSdwaVOP2"; +} + class getVOP2Pat64 <SDPatternOperator node, VOPProfile P> : LetDummies { list<dag> ret = !if(P.HasModifiers, [(set P.DstVT:$vdst, @@ -121,10 +138,10 @@ multiclass VOP2Inst <string opName, def _e64 : VOP3_Pseudo <opName, P, getVOP2Pat64<node, P>.ret>, Commutable_REV<revOp#"_e64", !eq(revOp, opName)>; - def _sdwa : VOP2_SDWA_Pseudo <opName, P>; + def _sdwa : 
VOP2_SDWA_Pseudo <opName, P>; + def _sdwa9 : VOP2_SDWA9_Pseudo <opName, P>; } -// TODO: add SDWA pseudo instructions for VOP2bInst and VOP2eInst multiclass VOP2bInst <string opName, VOPProfile P, SDPatternOperator node = null_frag, @@ -136,7 +153,13 @@ multiclass VOP2bInst <string opName, def _e32 : VOP2_Pseudo <opName, P>, Commutable_REV<revOp#"_e32", !eq(revOp, opName)>; - def _sdwa : VOP2_SDWA_Pseudo <opName, P>; + def _sdwa : VOP2_SDWA_Pseudo <opName, P> { + let AsmMatchConverter = "cvtSdwaVOP2b"; + } + + def _sdwa9 : VOP2_SDWA9_Pseudo <opName, P> { + let AsmMatchConverter = "cvtSdwaVOP2b"; + } } def _e64 : VOP3_Pseudo <opName, P, getVOP2Pat64<node, P>.ret>, @@ -203,13 +226,21 @@ class VOP_MAC <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> { VGPR_32:$src2, // stub argument clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused, src0_sel:$src0_sel, src1_sel:$src1_sel); + let InsSDWA9 = (ins Src0ModSDWA9:$src0_modifiers, Src0SDWA9:$src0, + Src1ModSDWA9:$src1_modifiers, Src1SDWA9:$src1, + VGPR_32:$src2, // stub argument + clampmod:$clamp, omod:$omod, + dst_sel:$dst_sel, dst_unused:$dst_unused, + src0_sel:$src0_sel, src1_sel:$src1_sel); let Asm32 = getAsm32<1, 2, vt>.ret; let Asm64 = getAsm64<1, 2, HasModifiers, HasOMod, vt>.ret; let AsmDPP = getAsmDPP<1, 2, HasModifiers, vt>.ret; - let AsmSDWA = getAsmSDWA<1, 2, HasModifiers, vt>.ret; + let AsmSDWA = getAsmSDWA<1, 2, vt>.ret; + let AsmSDWA9 = getAsmSDWA9<1, 1, 2, vt>.ret; let HasSrc2 = 0; let HasSrc2Mods = 0; let HasExt = 1; + let HasSDWA9 = 0; } def VOP_MAC_F16 : VOP_MAC <f16> { @@ -229,6 +260,7 @@ def VOP2b_I32_I1_I32_I32 : VOPProfile<[i32, i32, i32, untyped]> { let Asm32 = "$vdst, vcc, $src0, $src1"; let Asm64 = "$vdst, $sdst, $src0, $src1"; let AsmSDWA = "$vdst, vcc, $src0_modifiers, $src1_modifiers$clamp $dst_sel $dst_unused $src0_sel $src1_sel"; + let AsmSDWA9 = "$vdst, vcc, $src0_modifiers, $src1_modifiers$clamp $dst_sel $dst_unused $src0_sel $src1_sel"; let AsmDPP = "$vdst, vcc, $src0, $src1 $dpp_ctrl$row_mask$bank_mask$bound_ctrl"; let Outs32 = (outs DstRC:$vdst); let Outs64 = (outs DstRC:$vdst, SReg_64:$sdst); @@ -246,6 +278,7 @@ def VOP2b_I32_I1_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1]> { let Asm32 = "$vdst, vcc, $src0, $src1, vcc"; let Asm64 = "$vdst, $sdst, $src0, $src1, $src2"; let AsmSDWA = "$vdst, vcc, $src0_modifiers, $src1_modifiers, vcc $clamp $dst_sel $dst_unused $src0_sel $src1_sel"; + let AsmSDWA9 = "$vdst, vcc, $src0_modifiers, $src1_modifiers, vcc $clamp $dst_sel $dst_unused $src0_sel $src1_sel"; let AsmDPP = "$vdst, vcc, $src0, $src1, vcc $dpp_ctrl$row_mask$bank_mask$bound_ctrl"; let Outs32 = (outs DstRC:$vdst); let Outs64 = (outs DstRC:$vdst, SReg_64:$sdst); @@ -254,16 +287,23 @@ def VOP2b_I32_I1_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1]> { // implicit VCC use. 
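These VOP2b profiles cover the carry-propagating adds; in the 32-bit, DPP and SDWA forms the carry-out (and, for v_addc, the carry-in) is pinned to vcc, which is why the asm strings hard-code the 'vcc' token, while the _e64 (VOP3) form exposes $sdst/$src2 so either can live in an arbitrary SGPR pair. Roughly:

// v_addc_u32_e32 v0, vcc, v1, v2, vcc        ; carry-in and carry-out fixed to vcc
// v_addc_u32_e64 v0, s[0:1], v1, v2, s[2:3]  ; VOP3 form: any SGPR pair for either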
let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1); - let InsSDWA = (ins Src0Mod:$src0_modifiers, Src0SDWA:$src0, - Src1Mod:$src1_modifiers, Src1SDWA:$src1, + let InsSDWA = (ins Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0, + Src1ModSDWA:$src1_modifiers, Src1SDWA:$src1, clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused, src0_sel:$src0_sel, src1_sel:$src1_sel); + let InsSDWA9 = (ins Src0ModSDWA9:$src0_modifiers, Src0SDWA9:$src0, + Src1ModSDWA9:$src1_modifiers, Src1SDWA9:$src1, + clampmod:$clamp, omod:$omod, + dst_sel:$dst_sel, dst_unused:$dst_unused, + src0_sel:$src0_sel, src1_sel:$src1_sel); + let InsDPP = (ins Src0Mod:$src0_modifiers, Src0DPP:$src0, Src1Mod:$src1_modifiers, Src1DPP:$src1, dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, bank_mask:$bank_mask, bound_ctrl:$bound_ctrl); let HasExt = 1; + let HasSDWA9 = 1; } // Read in from vcc or arbitrary SGPR @@ -387,7 +427,7 @@ defm V_LSHL_B32 : VOP2Inst <"v_lshl_b32", VOP_I32_I32_I32>; } // End let SubtargetPredicate = SICI -let SubtargetPredicate = isVI in { +let SubtargetPredicate = Has16BitInsts in { def V_MADMK_F16 : VOP2_Pseudo <"v_madmk_f16", VOP_MADMK_F16, [], "">; defm V_LSHLREV_B16 : VOP2Inst <"v_lshlrev_b16", VOP_I16_I16_I16>; @@ -418,7 +458,7 @@ defm V_MAC_F16 : VOP2Inst <"v_mac_f16", VOP_MAC_F16>; } } // End isCommutable = 1 -} // End SubtargetPredicate = isVI +} // End SubtargetPredicate = Has16BitInsts // Note: 16-bit instructions produce a 0 result in the high 16-bits. multiclass Arithmetic_i16_Pats <SDPatternOperator op, Instruction inst> { @@ -468,7 +508,7 @@ class ZExt_i16_i1_Pat <SDNode ext> : Pat < (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src) >; -let Predicates = [isVI] in { +let Predicates = [Has16BitInsts] in { defm : Arithmetic_i16_Pats<add, V_ADD_U16_e64>; defm : Arithmetic_i16_Pats<mul, V_MUL_LO_U16_e64>; @@ -513,7 +553,7 @@ def : Pat< (V_SUB_U16_e64 $src0, NegSubInlineConst16:$src1) >; -} // End Predicates = [isVI] +} // End Predicates = [Has16BitInsts] //===----------------------------------------------------------------------===// // SI @@ -686,15 +726,21 @@ multiclass VOP2_SDWA_Real <bits<6> op> { VOP2_SDWAe <op{5-0}, !cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa").Pfl>; } +multiclass VOP2_SDWA9_Real <bits<6> op> { + def _sdwa_gfx9 : + VOP_SDWA9_Real <!cast<VOP2_SDWA9_Pseudo>(NAME#"_sdwa9")>, + VOP2_SDWA9Ae <op{5-0}, !cast<VOP2_SDWA9_Pseudo>(NAME#"_sdwa9").Pfl>; +} + multiclass VOP2be_Real_e32e64_vi <bits<6> op> : - Base_VOP2be_Real_e32e64_vi<op>, VOP2_SDWA_Real<op> { + Base_VOP2be_Real_e32e64_vi<op>, VOP2_SDWA_Real<op>, VOP2_SDWA9_Real<op> { // For now left dpp only for asm/dasm // TODO: add corresponding pseudo def _dpp : VOP2_DPP<op, !cast<VOP2_Pseudo>(NAME#"_e32")>; } multiclass VOP2_Real_e32e64_vi <bits<6> op> : - Base_VOP2_Real_e32e64_vi<op>, VOP2_SDWA_Real<op> { + Base_VOP2_Real_e32e64_vi<op>, VOP2_SDWA_Real<op>, VOP2_SDWA9_Real<op> { // For now left dpp only for asm/dasm // TODO: add corresponding pseudo def _dpp : VOP2_DPP<op, !cast<VOP2_Pseudo>(NAME#"_e32")>; diff --git a/lib/Target/AMDGPU/VOP3Instructions.td b/lib/Target/AMDGPU/VOP3Instructions.td index c0b5069948fb..001fc960b228 100644 --- a/lib/Target/AMDGPU/VOP3Instructions.td +++ b/lib/Target/AMDGPU/VOP3Instructions.td @@ -243,7 +243,7 @@ def V_MAD_I64_I32 : VOP3Inst <"v_mad_i64_i32", VOP3b_I64_I1_I32_I32_I64>; } // End SubtargetPredicate = isCIVI -let SubtargetPredicate = isVI in { +let SubtargetPredicate = Has16BitInsts in { let isCommutable = 1 in { @@ -258,12 +258,13 @@ def V_MAD_U16 : VOP3Inst <"v_mad_u16", VOP3_Profile<VOP_I16_I16_I16_I16>>; def V_MAD_I16 
: VOP3Inst <"v_mad_i16", VOP3_Profile<VOP_I16_I16_I16_I16>>; } // End isCommutable = 1 +} // End SubtargetPredicate = Has16BitInsts +let SubtargetPredicate = isVI in { def V_PERM_B32 : VOP3Inst <"v_perm_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>; - } // End SubtargetPredicate = isVI -let Predicates = [isVI] in { +let Predicates = [Has16BitInsts] in { multiclass Ternary_i16_Pats <SDPatternOperator op1, SDPatternOperator op2, Instruction inst, SDPatternOperator op3> { @@ -288,7 +289,7 @@ def : Pat< defm: Ternary_i16_Pats<mul, add, V_MAD_U16, zext>; defm: Ternary_i16_Pats<mul, add, V_MAD_I16, sext>; -} // End Predicates = [isVI] +} // End Predicates = [Has16BitInsts] let SubtargetPredicate = isGFX9 in { def V_PACK_B32_F16 : VOP3Inst <"v_pack_b32_f16", VOP3_Profile<VOP_B32_F16_F16>>; diff --git a/lib/Target/AMDGPU/VOPCInstructions.td b/lib/Target/AMDGPU/VOPCInstructions.td index a3550a63677b..cd347b86d305 100644 --- a/lib/Target/AMDGPU/VOPCInstructions.td +++ b/lib/Target/AMDGPU/VOPCInstructions.td @@ -34,6 +34,17 @@ class VOPC_SDWAe <bits<8> op, VOPProfile P> : VOP_SDWAe <P> { let Inst{44-43} = SDWA.UNUSED_PRESERVE; } +class VOPC_SDWA9e <bits<8> op, VOPProfile P> : VOP_SDWA9Be <P> { + bits<9> src1; + + let Inst{8-0} = 0xf9; // sdwa + let Inst{16-9} = !if(P.HasSrc1, src1{7-0}, 0); + let Inst{24-17} = op; + let Inst{31-25} = 0x3e; // encoding + let Inst{63} = !if(P.HasSrc1, src1{8}, 0); // src1_sgpr +} + + //===----------------------------------------------------------------------===// // VOPC classes //===----------------------------------------------------------------------===// @@ -102,6 +113,11 @@ class VOPC_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> : let AsmMatchConverter = "cvtSdwaVOPC"; } +class VOPC_SDWA9_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> : + VOP_SDWA9_Pseudo <OpName, P, pattern> { + let AsmMatchConverter = "cvtSdwaVOPC"; +} + // This class is used only with VOPC instructions. 
Use $sdst for out operand class VOPCInstAlias <VOP3_Pseudo ps, Instruction inst, VOPProfile p = ps.Pfl> : InstAlias <ps.OpName#" "#p.Asm32, (inst)>, PredicateControl { @@ -173,6 +189,13 @@ multiclass VOPC_Pseudos <string opName, let isConvergent = DefExec; let isCompare = 1; } + + def _sdwa9 : VOPC_SDWA9_Pseudo <opName, P> { + let Defs = !if(DefExec, [VCC, EXEC], [VCC]); + let SchedRW = P.Schedule; + let isConvergent = DefExec; + let isCompare = 1; + } } def VOPC_I1_F16_F16 : VOPC_Profile<[Write32Bit], f16>; @@ -520,7 +543,11 @@ class VOPC_Class_Profile<list<SchedReadWrite> sched, ValueType vt> : let InsSDWA = (ins Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0, Src1ModSDWA:$src1_modifiers, Src1SDWA:$src1, clampmod:$clamp, src0_sel:$src0_sel, src1_sel:$src1_sel); + let InsSDWA9 = (ins Src0ModSDWA9:$src0_modifiers, Src0SDWA9:$src0, + Src1ModSDWA9:$src1_modifiers, Src1SDWA9:$src1, + src0_sel:$src0_sel, src1_sel:$src1_sel); let AsmSDWA = " vcc, $src0_modifiers, $src1_modifiers$clamp $src0_sel $src1_sel"; + //let AsmSDWA9 = " $sdst, $src0_modifiers, $src1_modifiers $src0_sel $src1_sel"; let HasSrc1Mods = 0; let HasClamp = 0; let HasOMod = 0; @@ -553,6 +580,12 @@ multiclass VOPC_Class_Pseudos <string opName, VOPC_Profile p, bit DefExec> { let SchedRW = p.Schedule; let isConvergent = DefExec; } + + def _sdwa9 : VOPC_SDWA9_Pseudo <opName, p> { + let Defs = !if(DefExec, [VCC, EXEC], [VCC]); + let SchedRW = p.Schedule; + let isConvergent = DefExec; + } } def VOPC_I1_F16_I32 : VOPC_Class_Profile<[Write32Bit], f16>; @@ -920,6 +953,10 @@ multiclass VOPC_Real_vi <bits<10> op> { VOP_SDWA_Real <!cast<VOPC_SDWA_Pseudo>(NAME#"_sdwa")>, VOPC_SDWAe <op{7-0}, !cast<VOPC_SDWA_Pseudo>(NAME#"_sdwa").Pfl>; + def _sdwa_gfx9 : + VOP_SDWA9_Real <!cast<VOPC_SDWA9_Pseudo>(NAME#"_sdwa9")>, + VOPC_SDWA9e <op{7-0}, !cast<VOPC_SDWA9_Pseudo>(NAME#"_sdwa9").Pfl>; + def : VOPCInstAlias <!cast<VOP3_Pseudo>(NAME#"_e64"), !cast<Instruction>(NAME#"_e32_vi")> { let AssemblerPredicate = isVI; diff --git a/lib/Target/AMDGPU/VOPInstructions.td b/lib/Target/AMDGPU/VOPInstructions.td index 69906c419db3..4da654f84f9d 100644 --- a/lib/Target/AMDGPU/VOPInstructions.td +++ b/lib/Target/AMDGPU/VOPInstructions.td @@ -293,11 +293,52 @@ class VOP_SDWAe<VOPProfile P> : Enc64 { let Inst{44-43} = !if(P.EmitDst, dst_unused{1-0}, SDWA.UNUSED_PRESERVE); let Inst{45} = !if(P.HasSDWAClamp, clamp{0}, 0); let Inst{50-48} = !if(P.HasSrc0, src0_sel{2-0}, SDWA.DWORD); - let Inst{53-52} = !if(P.HasSrc0FloatMods, src0_modifiers{1-0}, 0); let Inst{51} = !if(P.HasSrc0IntMods, src0_modifiers{0}, 0); + let Inst{53-52} = !if(P.HasSrc0FloatMods, src0_modifiers{1-0}, 0); let Inst{58-56} = !if(P.HasSrc1, src1_sel{2-0}, SDWA.DWORD); + let Inst{59} = !if(P.HasSrc1IntMods, src1_modifiers{0}, 0); let Inst{61-60} = !if(P.HasSrc1FloatMods, src1_modifiers{1-0}, 0); +} + +// gfx9 SDWA basic encoding +class VOP_SDWA9e<VOPProfile P> : Enc64 { + bits<9> src0; // {src0_sgpr{0}, src0{7-0}} + bits<3> src0_sel; + bits<2> src0_modifiers; // float: {abs,neg}, int {sext} + bits<3> src1_sel; + bits<2> src1_modifiers; + bits<1> src1_sgpr; + + let Inst{39-32} = !if(P.HasSrc0, src0{7-0}, 0); + let Inst{50-48} = !if(P.HasSrc0, src0_sel{2-0}, SDWA.DWORD); + let Inst{51} = !if(P.HasSrc0IntMods, src0_modifiers{0}, 0); + let Inst{53-52} = !if(P.HasSrc0FloatMods, src0_modifiers{1-0}, 0); + let Inst{55} = !if(P.HasSrc0, src0{8}, 0); + let Inst{58-56} = !if(P.HasSrc1, src1_sel{2-0}, SDWA.DWORD); let Inst{59} = !if(P.HasSrc1IntMods, src1_modifiers{0}, 0); + let Inst{61-60} = !if(P.HasSrc1FloatMods, 
src1_modifiers{1-0}, 0); + let Inst{63} = 0; // src1_sgpr - should be specified in subclass +} + +// gfx9 SDWA-A +class VOP_SDWA9Ae<VOPProfile P> : VOP_SDWA9e<P> { + bits<3> dst_sel; + bits<2> dst_unused; + bits<1> clamp; + bits<2> omod; + + let Inst{42-40} = !if(P.EmitDst, dst_sel{2-0}, SDWA.DWORD); + let Inst{44-43} = !if(P.EmitDst, dst_unused{1-0}, SDWA.UNUSED_PRESERVE); + let Inst{45} = !if(P.HasSDWAClamp, clamp{0}, 0); + let Inst{47-46} = !if(P.HasSDWAOMod, omod{1-0}, 0); +} + +// gfx9 SDWA-B +class VOP_SDWA9Be<VOPProfile P> : VOP_SDWA9e<P> { + bits<8> sdst; // {vcc_sdst{0}, sdst{6-0}} + + let Inst{46-40} = !if(P.EmitDst, sdst{6-0}, 0); + let Inst{47} = !if(P.EmitDst, sdst{7}, 0); } class VOP_SDWA_Pseudo <string opName, VOPProfile P, list<dag> pattern=[]> : @@ -331,6 +372,50 @@ class VOP_SDWA_Pseudo <string opName, VOPProfile P, list<dag> pattern=[]> : VOPProfile Pfl = P; } +// GFX9 adds two features to SDWA: +// 1. Add 3 fields to the SDWA microcode word: S0, S1 and OMOD. +// a. S0 and S1 indicate that source 0 and 1 respectively are SGPRs rather +// than VGPRs (at most 1 can be an SGPR); +// b. OMOD is the standard output modifier (result *2, *4, /2) +// 2. Add a new version of the SDWA microcode word for VOPC: SDWAB. This +// replaces OMOD and the dest fields with SD and SDST (SGPR destination) +// field. +// a. When SD=1, the SDST is used as the destination for the compare result; +// b. When SD=0, VCC is used. +// +// In GFX9, V_MAC_F16, V_MAC_F32 opcodes cannot be used with SDWA. + +class VOP_SDWA9_Pseudo <string opName, VOPProfile P, list<dag> pattern=[]> : + InstSI <P.OutsSDWA9, P.InsSDWA9, "", pattern>, + VOP <opName>, + SIMCInstr <opName#"_sdwa9", SIEncodingFamily.NONE>, + MnemonicAlias <opName#"_sdwa9", opName> { + + let isPseudo = 1; + let isCodeGenOnly = 1; + let UseNamedOperandTable = 1; + + string Mnemonic = opName; + string AsmOperands = P.AsmSDWA9; + + let Size = 8; + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; + + let VALU = 1; + let SDWA = 1; + let Uses = [EXEC]; + + let SubtargetPredicate = !if(P.HasSDWA9, HasSDWA9, DisableInst); + let AssemblerPredicate = !if(P.HasSDWA9, HasSDWA9, DisableInst); + let AsmVariantName = !if(P.HasSDWA9, AMDGPUAsmVariants.SDWA9, + AMDGPUAsmVariants.Disable); + let DecoderNamespace = "SDWA9"; + + VOPProfile Pfl = P; +} + class VOP_SDWA_Real <VOP_SDWA_Pseudo ps> : InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>, SIMCInstr <ps.PseudoInstr, SIEncodingFamily.VI> { @@ -358,6 +443,33 @@ class VOP_SDWA_Real <VOP_SDWA_Pseudo ps> : let TSFlags = ps.TSFlags; } +class VOP_SDWA9_Real <VOP_SDWA9_Pseudo ps> : + InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>, + SIMCInstr <ps.PseudoInstr, SIEncodingFamily.VI> { + + let isPseudo = 0; + let isCodeGenOnly = 0; + + let Defs = ps.Defs; + let Uses = ps.Uses; + let SchedRW = ps.SchedRW; + let hasSideEffects = ps.hasSideEffects; + + let Constraints = ps.Constraints; + let DisableEncoding = ps.DisableEncoding; + + // Copy relevant pseudo op flags + let SubtargetPredicate = ps.SubtargetPredicate; + let AssemblerPredicate = ps.AssemblerPredicate; + let AsmMatchConverter = ps.AsmMatchConverter; + let AsmVariantName = ps.AsmVariantName; + let UseNamedOperandTable = ps.UseNamedOperandTable; + let DecoderNamespace = ps.DecoderNamespace; + let Constraints = ps.Constraints; + let DisableEncoding = ps.DisableEncoding; + let TSFlags = ps.TSFlags; +} + class VOP_DPPe<VOPProfile P> : Enc64 { bits<2> src0_modifiers; bits<8> src0; diff --git
a/lib/Target/ARM/ARMCallLowering.cpp b/lib/Target/ARM/ARMCallLowering.cpp index 46ac4d0ad933..31a2f499a9a7 100644 --- a/lib/Target/ARM/ARMCallLowering.cpp +++ b/lib/Target/ARM/ARMCallLowering.cpp @@ -34,6 +34,9 @@ ARMCallLowering::ARMCallLowering(const ARMTargetLowering &TLI) static bool isSupportedType(const DataLayout &DL, const ARMTargetLowering &TLI, Type *T) { + if (T->isArrayTy()) + return true; + EVT VT = TLI.getValueType(DL, T, true); if (!VT.isSimple() || VT.isVector() || !(VT.isInteger() || VT.isFloatingPoint())) @@ -148,23 +151,47 @@ struct OutgoingValueHandler : public CallLowering::ValueHandler { }; } // End anonymous namespace. -void ARMCallLowering::splitToValueTypes(const ArgInfo &OrigArg, - SmallVectorImpl<ArgInfo> &SplitArgs, - const DataLayout &DL, - MachineRegisterInfo &MRI) const { +void ARMCallLowering::splitToValueTypes( + const ArgInfo &OrigArg, SmallVectorImpl<ArgInfo> &SplitArgs, + MachineFunction &MF, const SplitArgTy &PerformArgSplit) const { const ARMTargetLowering &TLI = *getTLI<ARMTargetLowering>(); LLVMContext &Ctx = OrigArg.Ty->getContext(); + const DataLayout &DL = MF.getDataLayout(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + const Function *F = MF.getFunction(); SmallVector<EVT, 4> SplitVTs; SmallVector<uint64_t, 4> Offsets; ComputeValueVTs(TLI, DL, OrigArg.Ty, SplitVTs, &Offsets, 0); - assert(SplitVTs.size() == 1 && "Unsupported type"); + if (SplitVTs.size() == 1) { + // Even if there is no splitting to do, we still want to replace the + // original type (e.g. pointer type -> integer). + SplitArgs.emplace_back(OrigArg.Reg, SplitVTs[0].getTypeForEVT(Ctx), + OrigArg.Flags, OrigArg.IsFixed); + return; + } + + unsigned FirstRegIdx = SplitArgs.size(); + for (unsigned i = 0, e = SplitVTs.size(); i != e; ++i) { + EVT SplitVT = SplitVTs[i]; + Type *SplitTy = SplitVT.getTypeForEVT(Ctx); + auto Flags = OrigArg.Flags; + bool NeedsConsecutiveRegisters = + TLI.functionArgumentNeedsConsecutiveRegisters( + SplitTy, F->getCallingConv(), F->isVarArg()); + if (NeedsConsecutiveRegisters) { + Flags.setInConsecutiveRegs(); + if (i == e - 1) + Flags.setInConsecutiveRegsLast(); + } + SplitArgs.push_back( + ArgInfo{MRI.createGenericVirtualRegister(getLLTForType(*SplitTy, DL)), + SplitTy, Flags, OrigArg.IsFixed}); + } - // Even if there is no splitting to do, we still want to replace the original - // type (e.g. pointer type -> integer). - SplitArgs.emplace_back(OrigArg.Reg, SplitVTs[0].getTypeForEVT(Ctx), - OrigArg.Flags, OrigArg.IsFixed); + for (unsigned i = 0; i < Offsets.size(); ++i) + PerformArgSplit(SplitArgs[FirstRegIdx + i].Reg, Offsets[i] * 8); } /// Lower the return value for the already existing \p Ret. This assumes that @@ -187,7 +214,9 @@ bool ARMCallLowering::lowerReturnVal(MachineIRBuilder &MIRBuilder, SmallVector<ArgInfo, 4> SplitVTs; ArgInfo RetInfo(VReg, Val->getType()); setArgFlags(RetInfo, AttributeList::ReturnIndex, DL, F); - splitToValueTypes(RetInfo, SplitVTs, DL, MF.getRegInfo()); + splitToValueTypes(RetInfo, SplitVTs, MF, [&](unsigned Reg, uint64_t Offset) { + MIRBuilder.buildExtract(Reg, VReg, Offset); + }); CCAssignFn *AssignFn = TLI.CCAssignFnForReturn(F.getCallingConv(), F.isVarArg()); @@ -307,6 +336,26 @@ struct IncomingValueHandler : public CallLowering::ValueHandler { return 1; } + /// Merge the values in \p SrcRegs into \p DstReg at offsets \p SrcOffsets. + /// Note that the source registers are not required to have homogeneous types, + /// so we use G_INSERT rather than G_MERGE_VALUES. 
+ // FIXME: Use G_MERGE_VALUES if the types are homogeneous. + void mergeRegisters(unsigned DstReg, ArrayRef<unsigned> SrcRegs, + ArrayRef<uint64_t> SrcOffsets) { + LLT Ty = MRI.getType(DstReg); + + unsigned Dst = MRI.createGenericVirtualRegister(Ty); + MIRBuilder.buildUndef(Dst); + + for (unsigned i = 0; i < SrcRegs.size(); ++i) { + unsigned Tmp = MRI.createGenericVirtualRegister(Ty); + MIRBuilder.buildInsert(Tmp, Dst, SrcRegs[i], SrcOffsets[i]); + Dst = Tmp; + } + + MIRBuilder.buildCopy(DstReg, Dst); + } + /// Marking a physical register as used is different between formal /// parameters, where it's a basic block live-in, and call returns, where it's /// an implicit-def of the call instruction. @@ -335,6 +384,7 @@ bool ARMCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, return false; auto &MF = MIRBuilder.getMF(); + auto &MBB = MIRBuilder.getMBB(); auto DL = MF.getDataLayout(); auto &TLI = *getTLI<ARMTargetLowering>(); @@ -350,17 +400,34 @@ bool ARMCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, CCAssignFn *AssignFn = TLI.CCAssignFnForCall(F.getCallingConv(), F.isVarArg()); + FormalArgHandler ArgHandler(MIRBuilder, MIRBuilder.getMF().getRegInfo(), + AssignFn); + SmallVector<ArgInfo, 8> ArgInfos; + SmallVector<unsigned, 4> SplitRegs; + SmallVector<uint64_t, 4> RegOffsets; unsigned Idx = 0; for (auto &Arg : F.args()) { ArgInfo AInfo(VRegs[Idx], Arg.getType()); setArgFlags(AInfo, Idx + AttributeList::FirstArgIndex, DL, F); - splitToValueTypes(AInfo, ArgInfos, DL, MF.getRegInfo()); + + SplitRegs.clear(); + RegOffsets.clear(); + + splitToValueTypes(AInfo, ArgInfos, MF, [&](unsigned Reg, uint64_t Offset) { + SplitRegs.push_back(Reg); + RegOffsets.push_back(Offset); + }); + + if (!SplitRegs.empty()) + ArgHandler.mergeRegisters(VRegs[Idx], SplitRegs, RegOffsets); + Idx++; } - FormalArgHandler ArgHandler(MIRBuilder, MIRBuilder.getMF().getRegInfo(), - AssignFn); + if (!MBB.empty()) + MIRBuilder.setInstr(*MBB.begin()); + return handleAssignments(MIRBuilder, ArgInfos, ArgHandler); } @@ -407,7 +474,9 @@ bool ARMCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, if (!Arg.IsFixed) return false; - splitToValueTypes(Arg, ArgInfos, DL, MRI); + splitToValueTypes(Arg, ArgInfos, MF, [&](unsigned Reg, uint64_t Offset) { + MIRBuilder.buildExtract(Reg, Arg.Reg, Offset); + }); } auto ArgAssignFn = TLI.CCAssignFnForCall(CallConv, /*IsVarArg=*/false); @@ -423,12 +492,24 @@ bool ARMCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, return false; ArgInfos.clear(); - splitToValueTypes(OrigRet, ArgInfos, DL, MRI); + SmallVector<uint64_t, 8> RegOffsets; + SmallVector<unsigned, 8> SplitRegs; + splitToValueTypes(OrigRet, ArgInfos, MF, + [&](unsigned Reg, uint64_t Offset) { + RegOffsets.push_back(Offset); + SplitRegs.push_back(Reg); + }); auto RetAssignFn = TLI.CCAssignFnForReturn(CallConv, /*IsVarArg=*/false); CallReturnHandler RetHandler(MIRBuilder, MRI, MIB, RetAssignFn); if (!handleAssignments(MIRBuilder, ArgInfos, RetHandler)) return false; + + if (!RegOffsets.empty()) { + // We have split the value and allocated each individual piece, now build + // it up again. 
+ RetHandler.mergeRegisters(OrigRet.Reg, SplitRegs, RegOffsets); + } } // We now know the size of the stack - update the ADJCALLSTACKDOWN diff --git a/lib/Target/ARM/ARMCallLowering.h b/lib/Target/ARM/ARMCallLowering.h index 6404c7a2689e..f5a6872336f6 100644 --- a/lib/Target/ARM/ARMCallLowering.h +++ b/lib/Target/ARM/ARMCallLowering.h @@ -42,11 +42,14 @@ private: bool lowerReturnVal(MachineIRBuilder &MIRBuilder, const Value *Val, unsigned VReg, MachineInstrBuilder &Ret) const; + typedef std::function<void(unsigned Reg, uint64_t Offset)> SplitArgTy; + /// Split an argument into one or more arguments that the CC lowering can cope /// with (e.g. replace pointers with integers). void splitToValueTypes(const ArgInfo &OrigArg, SmallVectorImpl<ArgInfo> &SplitArgs, - const DataLayout &DL, MachineRegisterInfo &MRI) const; + MachineFunction &MF, + const SplitArgTy &PerformArgSplit) const; }; } // End of namespace llvm #endif diff --git a/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/lib/Target/ARM/ARMExpandPseudoInsts.cpp index 78a9144bd321..90baabcdb652 100644 --- a/lib/Target/ARM/ARMExpandPseudoInsts.cpp +++ b/lib/Target/ARM/ARMExpandPseudoInsts.cpp @@ -779,7 +779,7 @@ bool ARMExpandPseudo::ExpandCMP_SWAP(MachineBasicBlock &MBB, MachineOperand &Desired = MI.getOperand(3); MachineOperand &New = MI.getOperand(4); - LivePhysRegs LiveRegs(&TII->getRegisterInfo()); + LivePhysRegs LiveRegs(TII->getRegisterInfo()); LiveRegs.addLiveOuts(MBB); for (auto I = std::prev(MBB.end()); I != MBBI; --I) LiveRegs.stepBackward(*I); @@ -903,7 +903,7 @@ bool ARMExpandPseudo::ExpandCMP_SWAP_64(MachineBasicBlock &MBB, unsigned DesiredLo = TRI->getSubReg(Desired.getReg(), ARM::gsub_0); unsigned DesiredHi = TRI->getSubReg(Desired.getReg(), ARM::gsub_1); - LivePhysRegs LiveRegs(&TII->getRegisterInfo()); + LivePhysRegs LiveRegs(TII->getRegisterInfo()); LiveRegs.addLiveOuts(MBB); for (auto I = std::prev(MBB.end()); I != MBBI; --I) LiveRegs.stepBackward(*I); diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index f8b584db7b99..62e774d869da 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -127,7 +127,7 @@ static cl::opt<bool> EnableConstpoolPromotion( "arm-promote-constant", cl::Hidden, cl::desc("Enable / disable promotion of unnamed_addr constants into " "constant pools"), - cl::init(true)); + cl::init(false)); // FIXME: set to true by default once PR32780 is fixed static cl::opt<unsigned> ConstpoolPromotionMaxSize( "arm-promote-constant-max-size", cl::Hidden, cl::desc("Maximum size of constant to promote into a constant pool"), @@ -12147,12 +12147,6 @@ EVT ARMTargetLowering::getOptimalMemOpType(uint64_t Size, } } - // Lowering to i32/i16 if the size permits. - if (Size >= 4) - return MVT::i32; - else if (Size >= 2) - return MVT::i16; - // Let the target-independent logic figure it out. return MVT::Other; } diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h index 875c06210ae6..26da528c19e6 100644 --- a/lib/Target/ARM/ARMISelLowering.h +++ b/lib/Target/ARM/ARMISelLowering.h @@ -510,7 +510,7 @@ class InstrItineraryData; bool canCombineStoreAndExtract(Type *VectorTy, Value *Idx, unsigned &Cost) const override; - bool canMergeStoresTo(EVT MemVT) const override { + bool canMergeStoresTo(unsigned AddressSpace, EVT MemVT) const override { // Do not merge to larger than i32. 
return (MemVT.getSizeInBits() <= 32); } diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td index 51290e5a5b93..858136a82078 100644 --- a/lib/Target/ARM/ARMInstrNEON.td +++ b/lib/Target/ARM/ARMInstrNEON.td @@ -674,7 +674,7 @@ let mayLoad = 1, hasSideEffects = 0, hasExtraDefRegAllocReq = 1 in { class VLD1D<bits<4> op7_4, string Dt, Operand AddrMode> : NLdSt<0,0b10,0b0111,op7_4, (outs VecListOneD:$Vd), (ins AddrMode:$Rn), IIC_VLD1, - "vld1", Dt, "$Vd, $Rn", "", []> { + "vld1", Dt, "$Vd, $Rn", "", []>, Sched<[WriteVLD1]> { let Rm = 0b1111; let Inst{4} = Rn{4}; let DecoderMethod = "DecodeVLDST1Instruction"; @@ -682,7 +682,7 @@ class VLD1D<bits<4> op7_4, string Dt, Operand AddrMode> class VLD1Q<bits<4> op7_4, string Dt, Operand AddrMode> : NLdSt<0,0b10,0b1010,op7_4, (outs VecListDPair:$Vd), (ins AddrMode:$Rn), IIC_VLD1x2, - "vld1", Dt, "$Vd, $Rn", "", []> { + "vld1", Dt, "$Vd, $Rn", "", []>, Sched<[WriteVLD2]> { let Rm = 0b1111; let Inst{5-4} = Rn{5-4}; let DecoderMethod = "DecodeVLDST1Instruction"; @@ -703,7 +703,7 @@ multiclass VLD1DWB<bits<4> op7_4, string Dt, Operand AddrMode> { def _fixed : NLdSt<0,0b10, 0b0111,op7_4, (outs VecListOneD:$Vd, GPR:$wb), (ins AddrMode:$Rn), IIC_VLD1u, "vld1", Dt, "$Vd, $Rn!", - "$Rn.addr = $wb", []> { + "$Rn.addr = $wb", []>, Sched<[WriteVLD1]> { let Rm = 0b1101; // NLdSt will assign to the right encoding bits. let Inst{4} = Rn{4}; let DecoderMethod = "DecodeVLDST1Instruction"; @@ -711,7 +711,7 @@ multiclass VLD1DWB<bits<4> op7_4, string Dt, Operand AddrMode> { def _register : NLdSt<0,0b10,0b0111,op7_4, (outs VecListOneD:$Vd, GPR:$wb), (ins AddrMode:$Rn, rGPR:$Rm), IIC_VLD1u, "vld1", Dt, "$Vd, $Rn, $Rm", - "$Rn.addr = $wb", []> { + "$Rn.addr = $wb", []>, Sched<[WriteVLD1]> { let Inst{4} = Rn{4}; let DecoderMethod = "DecodeVLDST1Instruction"; } @@ -720,7 +720,7 @@ multiclass VLD1QWB<bits<4> op7_4, string Dt, Operand AddrMode> { def _fixed : NLdSt<0,0b10,0b1010,op7_4, (outs VecListDPair:$Vd, GPR:$wb), (ins AddrMode:$Rn), IIC_VLD1x2u, "vld1", Dt, "$Vd, $Rn!", - "$Rn.addr = $wb", []> { + "$Rn.addr = $wb", []>, Sched<[WriteVLD2]> { let Rm = 0b1101; // NLdSt will assign to the right encoding bits. let Inst{5-4} = Rn{5-4}; let DecoderMethod = "DecodeVLDST1Instruction"; @@ -728,7 +728,7 @@ multiclass VLD1QWB<bits<4> op7_4, string Dt, Operand AddrMode> { def _register : NLdSt<0,0b10,0b1010,op7_4, (outs VecListDPair:$Vd, GPR:$wb), (ins AddrMode:$Rn, rGPR:$Rm), IIC_VLD1x2u, "vld1", Dt, "$Vd, $Rn, $Rm", - "$Rn.addr = $wb", []> { + "$Rn.addr = $wb", []>, Sched<[WriteVLD2]> { let Inst{5-4} = Rn{5-4}; let DecoderMethod = "DecodeVLDST1Instruction"; } @@ -747,7 +747,7 @@ defm VLD1q64wb : VLD1QWB<{1,1,?,?}, "64", addrmode6align64or128>; class VLD1D3<bits<4> op7_4, string Dt, Operand AddrMode> : NLdSt<0,0b10,0b0110,op7_4, (outs VecListThreeD:$Vd), (ins AddrMode:$Rn), IIC_VLD1x3, "vld1", Dt, - "$Vd, $Rn", "", []> { + "$Vd, $Rn", "", []>, Sched<[WriteVLD3]> { let Rm = 0b1111; let Inst{4} = Rn{4}; let DecoderMethod = "DecodeVLDST1Instruction"; @@ -756,7 +756,7 @@ multiclass VLD1D3WB<bits<4> op7_4, string Dt, Operand AddrMode> { def _fixed : NLdSt<0,0b10,0b0110, op7_4, (outs VecListThreeD:$Vd, GPR:$wb), (ins AddrMode:$Rn), IIC_VLD1x2u, "vld1", Dt, "$Vd, $Rn!", - "$Rn.addr = $wb", []> { + "$Rn.addr = $wb", []>, Sched<[WriteVLD3]> { let Rm = 0b1101; // NLdSt will assign to the right encoding bits. 
let Inst{4} = Rn{4}; let DecoderMethod = "DecodeVLDST1Instruction"; @@ -764,7 +764,7 @@ multiclass VLD1D3WB<bits<4> op7_4, string Dt, Operand AddrMode> { def _register : NLdSt<0,0b10,0b0110,op7_4, (outs VecListThreeD:$Vd, GPR:$wb), (ins AddrMode:$Rn, rGPR:$Rm), IIC_VLD1x2u, "vld1", Dt, "$Vd, $Rn, $Rm", - "$Rn.addr = $wb", []> { + "$Rn.addr = $wb", []>, Sched<[WriteVLD3]> { let Inst{4} = Rn{4}; let DecoderMethod = "DecodeVLDST1Instruction"; } @@ -780,15 +780,15 @@ defm VLD1d16Twb : VLD1D3WB<{0,1,0,?}, "16", addrmode6align64>; defm VLD1d32Twb : VLD1D3WB<{1,0,0,?}, "32", addrmode6align64>; defm VLD1d64Twb : VLD1D3WB<{1,1,0,?}, "64", addrmode6align64>; -def VLD1d64TPseudo : VLDQQPseudo<IIC_VLD1x3>; -def VLD1d64TPseudoWB_fixed : VLDQQWBfixedPseudo<IIC_VLD1x3>; -def VLD1d64TPseudoWB_register : VLDQQWBregisterPseudo<IIC_VLD1x3>; +def VLD1d64TPseudo : VLDQQPseudo<IIC_VLD1x3>, Sched<[WriteVLD3]>; +def VLD1d64TPseudoWB_fixed : VLDQQWBfixedPseudo<IIC_VLD1x3>, Sched<[WriteVLD3]>; +def VLD1d64TPseudoWB_register : VLDQQWBregisterPseudo<IIC_VLD1x3>, Sched<[WriteVLD3]>; // ...with 4 registers class VLD1D4<bits<4> op7_4, string Dt, Operand AddrMode> : NLdSt<0, 0b10, 0b0010, op7_4, (outs VecListFourD:$Vd), (ins AddrMode:$Rn), IIC_VLD1x4, "vld1", Dt, - "$Vd, $Rn", "", []> { + "$Vd, $Rn", "", []>, Sched<[WriteVLD4]> { let Rm = 0b1111; let Inst{5-4} = Rn{5-4}; let DecoderMethod = "DecodeVLDST1Instruction"; @@ -797,7 +797,7 @@ multiclass VLD1D4WB<bits<4> op7_4, string Dt, Operand AddrMode> { def _fixed : NLdSt<0,0b10,0b0010, op7_4, (outs VecListFourD:$Vd, GPR:$wb), (ins AddrMode:$Rn), IIC_VLD1x2u, "vld1", Dt, "$Vd, $Rn!", - "$Rn.addr = $wb", []> { + "$Rn.addr = $wb", []>, Sched<[WriteVLD4]> { let Rm = 0b1101; // NLdSt will assign to the right encoding bits. let Inst{5-4} = Rn{5-4}; let DecoderMethod = "DecodeVLDST1Instruction"; @@ -805,7 +805,7 @@ multiclass VLD1D4WB<bits<4> op7_4, string Dt, Operand AddrMode> { def _register : NLdSt<0,0b10,0b0010,op7_4, (outs VecListFourD:$Vd, GPR:$wb), (ins AddrMode:$Rn, rGPR:$Rm), IIC_VLD1x2u, "vld1", Dt, "$Vd, $Rn, $Rm", - "$Rn.addr = $wb", []> { + "$Rn.addr = $wb", []>, Sched<[WriteVLD4]> { let Inst{5-4} = Rn{5-4}; let DecoderMethod = "DecodeVLDST1Instruction"; } @@ -821,9 +821,9 @@ defm VLD1d16Qwb : VLD1D4WB<{0,1,?,?}, "16", addrmode6align64or128or256>; defm VLD1d32Qwb : VLD1D4WB<{1,0,?,?}, "32", addrmode6align64or128or256>; defm VLD1d64Qwb : VLD1D4WB<{1,1,?,?}, "64", addrmode6align64or128or256>; -def VLD1d64QPseudo : VLDQQPseudo<IIC_VLD1x4>; -def VLD1d64QPseudoWB_fixed : VLDQQWBfixedPseudo<IIC_VLD1x4>; -def VLD1d64QPseudoWB_register : VLDQQWBregisterPseudo<IIC_VLD1x4>; +def VLD1d64QPseudo : VLDQQPseudo<IIC_VLD1x4>, Sched<[WriteVLD4]>; +def VLD1d64QPseudoWB_fixed : VLDQQWBfixedPseudo<IIC_VLD1x4>, Sched<[WriteVLD4]>; +def VLD1d64QPseudoWB_register : VLDQQWBregisterPseudo<IIC_VLD1x4>, Sched<[WriteVLD4]>; // VLD2 : Vector Load (multiple 2-element structures) class VLD2<bits<4> op11_8, bits<4> op7_4, string Dt, RegisterOperand VdTy, @@ -837,22 +837,22 @@ class VLD2<bits<4> op11_8, bits<4> op7_4, string Dt, RegisterOperand VdTy, } def VLD2d8 : VLD2<0b1000, {0,0,?,?}, "8", VecListDPair, IIC_VLD2, - addrmode6align64or128>; + addrmode6align64or128>, Sched<[WriteVLD2]>; def VLD2d16 : VLD2<0b1000, {0,1,?,?}, "16", VecListDPair, IIC_VLD2, - addrmode6align64or128>; + addrmode6align64or128>, Sched<[WriteVLD2]>; def VLD2d32 : VLD2<0b1000, {1,0,?,?}, "32", VecListDPair, IIC_VLD2, - addrmode6align64or128>; + addrmode6align64or128>, Sched<[WriteVLD2]>; def VLD2q8 : VLD2<0b0011, {0,0,?,?}, 
"8", VecListFourD, IIC_VLD2x2, - addrmode6align64or128or256>; + addrmode6align64or128or256>, Sched<[WriteVLD4]>; def VLD2q16 : VLD2<0b0011, {0,1,?,?}, "16", VecListFourD, IIC_VLD2x2, - addrmode6align64or128or256>; + addrmode6align64or128or256>, Sched<[WriteVLD4]>; def VLD2q32 : VLD2<0b0011, {1,0,?,?}, "32", VecListFourD, IIC_VLD2x2, - addrmode6align64or128or256>; + addrmode6align64or128or256>, Sched<[WriteVLD4]>; -def VLD2q8Pseudo : VLDQQPseudo<IIC_VLD2x2>; -def VLD2q16Pseudo : VLDQQPseudo<IIC_VLD2x2>; -def VLD2q32Pseudo : VLDQQPseudo<IIC_VLD2x2>; +def VLD2q8Pseudo : VLDQQPseudo<IIC_VLD2x2>, Sched<[WriteVLD4]>; +def VLD2q16Pseudo : VLDQQPseudo<IIC_VLD2x2>, Sched<[WriteVLD4]>; +def VLD2q32Pseudo : VLDQQPseudo<IIC_VLD2x2>, Sched<[WriteVLD4]>; // ...with address register writeback: multiclass VLD2WB<bits<4> op11_8, bits<4> op7_4, string Dt, @@ -875,45 +875,45 @@ multiclass VLD2WB<bits<4> op11_8, bits<4> op7_4, string Dt, } defm VLD2d8wb : VLD2WB<0b1000, {0,0,?,?}, "8", VecListDPair, IIC_VLD2u, - addrmode6align64or128>; + addrmode6align64or128>, Sched<[WriteVLD2]>; defm VLD2d16wb : VLD2WB<0b1000, {0,1,?,?}, "16", VecListDPair, IIC_VLD2u, - addrmode6align64or128>; + addrmode6align64or128>, Sched<[WriteVLD2]>; defm VLD2d32wb : VLD2WB<0b1000, {1,0,?,?}, "32", VecListDPair, IIC_VLD2u, - addrmode6align64or128>; + addrmode6align64or128>, Sched<[WriteVLD2]>; defm VLD2q8wb : VLD2WB<0b0011, {0,0,?,?}, "8", VecListFourD, IIC_VLD2x2u, - addrmode6align64or128or256>; + addrmode6align64or128or256>, Sched<[WriteVLD4]>; defm VLD2q16wb : VLD2WB<0b0011, {0,1,?,?}, "16", VecListFourD, IIC_VLD2x2u, - addrmode6align64or128or256>; + addrmode6align64or128or256>, Sched<[WriteVLD4]>; defm VLD2q32wb : VLD2WB<0b0011, {1,0,?,?}, "32", VecListFourD, IIC_VLD2x2u, - addrmode6align64or128or256>; + addrmode6align64or128or256>, Sched<[WriteVLD4]>; -def VLD2q8PseudoWB_fixed : VLDQQWBfixedPseudo<IIC_VLD2x2u>; -def VLD2q16PseudoWB_fixed : VLDQQWBfixedPseudo<IIC_VLD2x2u>; -def VLD2q32PseudoWB_fixed : VLDQQWBfixedPseudo<IIC_VLD2x2u>; -def VLD2q8PseudoWB_register : VLDQQWBregisterPseudo<IIC_VLD2x2u>; -def VLD2q16PseudoWB_register : VLDQQWBregisterPseudo<IIC_VLD2x2u>; -def VLD2q32PseudoWB_register : VLDQQWBregisterPseudo<IIC_VLD2x2u>; +def VLD2q8PseudoWB_fixed : VLDQQWBfixedPseudo<IIC_VLD2x2u>, Sched<[WriteVLD4]>; +def VLD2q16PseudoWB_fixed : VLDQQWBfixedPseudo<IIC_VLD2x2u>, Sched<[WriteVLD4]>; +def VLD2q32PseudoWB_fixed : VLDQQWBfixedPseudo<IIC_VLD2x2u>, Sched<[WriteVLD4]>; +def VLD2q8PseudoWB_register : VLDQQWBregisterPseudo<IIC_VLD2x2u>, Sched<[WriteVLD4]>; +def VLD2q16PseudoWB_register : VLDQQWBregisterPseudo<IIC_VLD2x2u>, Sched<[WriteVLD4]>; +def VLD2q32PseudoWB_register : VLDQQWBregisterPseudo<IIC_VLD2x2u>, Sched<[WriteVLD4]>; // ...with double-spaced registers def VLD2b8 : VLD2<0b1001, {0,0,?,?}, "8", VecListDPairSpaced, IIC_VLD2, - addrmode6align64or128>; + addrmode6align64or128>, Sched<[WriteVLD2]>; def VLD2b16 : VLD2<0b1001, {0,1,?,?}, "16", VecListDPairSpaced, IIC_VLD2, - addrmode6align64or128>; + addrmode6align64or128>, Sched<[WriteVLD2]>; def VLD2b32 : VLD2<0b1001, {1,0,?,?}, "32", VecListDPairSpaced, IIC_VLD2, - addrmode6align64or128>; + addrmode6align64or128>, Sched<[WriteVLD2]>; defm VLD2b8wb : VLD2WB<0b1001, {0,0,?,?}, "8", VecListDPairSpaced, IIC_VLD2u, - addrmode6align64or128>; + addrmode6align64or128>, Sched<[WriteVLD2]>; defm VLD2b16wb : VLD2WB<0b1001, {0,1,?,?}, "16", VecListDPairSpaced, IIC_VLD2u, - addrmode6align64or128>; + addrmode6align64or128>, Sched<[WriteVLD2]>; defm VLD2b32wb : VLD2WB<0b1001, {1,0,?,?}, 
"32", VecListDPairSpaced, IIC_VLD2u, - addrmode6align64or128>; + addrmode6align64or128>, Sched<[WriteVLD2]>; // VLD3 : Vector Load (multiple 3-element structures) class VLD3D<bits<4> op11_8, bits<4> op7_4, string Dt> : NLdSt<0, 0b10, op11_8, op7_4, (outs DPR:$Vd, DPR:$dst2, DPR:$dst3), (ins addrmode6:$Rn), IIC_VLD3, - "vld3", Dt, "\\{$Vd, $dst2, $dst3\\}, $Rn", "", []> { + "vld3", Dt, "\\{$Vd, $dst2, $dst3\\}, $Rn", "", []>, Sched<[WriteVLD3]> { let Rm = 0b1111; let Inst{4} = Rn{4}; let DecoderMethod = "DecodeVLDST3Instruction"; @@ -923,9 +923,9 @@ def VLD3d8 : VLD3D<0b0100, {0,0,0,?}, "8">; def VLD3d16 : VLD3D<0b0100, {0,1,0,?}, "16">; def VLD3d32 : VLD3D<0b0100, {1,0,0,?}, "32">; -def VLD3d8Pseudo : VLDQQPseudo<IIC_VLD3>; -def VLD3d16Pseudo : VLDQQPseudo<IIC_VLD3>; -def VLD3d32Pseudo : VLDQQPseudo<IIC_VLD3>; +def VLD3d8Pseudo : VLDQQPseudo<IIC_VLD3>, Sched<[WriteVLD3]>; +def VLD3d16Pseudo : VLDQQPseudo<IIC_VLD3>, Sched<[WriteVLD3]>; +def VLD3d32Pseudo : VLDQQPseudo<IIC_VLD3>, Sched<[WriteVLD3]>; // ...with address register writeback: class VLD3DWB<bits<4> op11_8, bits<4> op7_4, string Dt> @@ -933,7 +933,7 @@ class VLD3DWB<bits<4> op11_8, bits<4> op7_4, string Dt> (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, GPR:$wb), (ins addrmode6:$Rn, am6offset:$Rm), IIC_VLD3u, "vld3", Dt, "\\{$Vd, $dst2, $dst3\\}, $Rn$Rm", - "$Rn.addr = $wb", []> { + "$Rn.addr = $wb", []>, Sched<[WriteVLD3]> { let Inst{4} = Rn{4}; let DecoderMethod = "DecodeVLDST3Instruction"; } @@ -942,9 +942,9 @@ def VLD3d8_UPD : VLD3DWB<0b0100, {0,0,0,?}, "8">; def VLD3d16_UPD : VLD3DWB<0b0100, {0,1,0,?}, "16">; def VLD3d32_UPD : VLD3DWB<0b0100, {1,0,0,?}, "32">; -def VLD3d8Pseudo_UPD : VLDQQWBPseudo<IIC_VLD3u>; -def VLD3d16Pseudo_UPD : VLDQQWBPseudo<IIC_VLD3u>; -def VLD3d32Pseudo_UPD : VLDQQWBPseudo<IIC_VLD3u>; +def VLD3d8Pseudo_UPD : VLDQQWBPseudo<IIC_VLD3u>, Sched<[WriteVLD3]>; +def VLD3d16Pseudo_UPD : VLDQQWBPseudo<IIC_VLD3u>, Sched<[WriteVLD3]>; +def VLD3d32Pseudo_UPD : VLDQQWBPseudo<IIC_VLD3u>, Sched<[WriteVLD3]>; // ...with double-spaced registers: def VLD3q8 : VLD3D<0b0101, {0,0,0,?}, "8">; @@ -954,25 +954,26 @@ def VLD3q8_UPD : VLD3DWB<0b0101, {0,0,0,?}, "8">; def VLD3q16_UPD : VLD3DWB<0b0101, {0,1,0,?}, "16">; def VLD3q32_UPD : VLD3DWB<0b0101, {1,0,0,?}, "32">; -def VLD3q8Pseudo_UPD : VLDQQQQWBPseudo<IIC_VLD3u>; -def VLD3q16Pseudo_UPD : VLDQQQQWBPseudo<IIC_VLD3u>; -def VLD3q32Pseudo_UPD : VLDQQQQWBPseudo<IIC_VLD3u>; +def VLD3q8Pseudo_UPD : VLDQQQQWBPseudo<IIC_VLD3u>, Sched<[WriteVLD3]>; +def VLD3q16Pseudo_UPD : VLDQQQQWBPseudo<IIC_VLD3u>, Sched<[WriteVLD3]>; +def VLD3q32Pseudo_UPD : VLDQQQQWBPseudo<IIC_VLD3u>, Sched<[WriteVLD3]>; // ...alternate versions to be allocated odd register numbers: -def VLD3q8oddPseudo : VLDQQQQPseudo<IIC_VLD3>; -def VLD3q16oddPseudo : VLDQQQQPseudo<IIC_VLD3>; -def VLD3q32oddPseudo : VLDQQQQPseudo<IIC_VLD3>; +def VLD3q8oddPseudo : VLDQQQQPseudo<IIC_VLD3>, Sched<[WriteVLD3]>; +def VLD3q16oddPseudo : VLDQQQQPseudo<IIC_VLD3>, Sched<[WriteVLD3]>; +def VLD3q32oddPseudo : VLDQQQQPseudo<IIC_VLD3>, Sched<[WriteVLD3]>; -def VLD3q8oddPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD3u>; -def VLD3q16oddPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD3u>; -def VLD3q32oddPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD3u>; +def VLD3q8oddPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD3u>, Sched<[WriteVLD3]>; +def VLD3q16oddPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD3u>, Sched<[WriteVLD3]>; +def VLD3q32oddPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD3u>, Sched<[WriteVLD3]>; // VLD4 : Vector Load (multiple 4-element structures) class VLD4D<bits<4> op11_8, bits<4> op7_4, string Dt> 
: NLdSt<0, 0b10, op11_8, op7_4, (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, DPR:$dst4), (ins addrmode6:$Rn), IIC_VLD4, - "vld4", Dt, "\\{$Vd, $dst2, $dst3, $dst4\\}, $Rn", "", []> { + "vld4", Dt, "\\{$Vd, $dst2, $dst3, $dst4\\}, $Rn", "", []>, + Sched<[WriteVLD4]> { let Rm = 0b1111; let Inst{5-4} = Rn{5-4}; let DecoderMethod = "DecodeVLDST4Instruction"; @@ -982,9 +983,9 @@ def VLD4d8 : VLD4D<0b0000, {0,0,?,?}, "8">; def VLD4d16 : VLD4D<0b0000, {0,1,?,?}, "16">; def VLD4d32 : VLD4D<0b0000, {1,0,?,?}, "32">; -def VLD4d8Pseudo : VLDQQPseudo<IIC_VLD4>; -def VLD4d16Pseudo : VLDQQPseudo<IIC_VLD4>; -def VLD4d32Pseudo : VLDQQPseudo<IIC_VLD4>; +def VLD4d8Pseudo : VLDQQPseudo<IIC_VLD4>, Sched<[WriteVLD4]>; +def VLD4d16Pseudo : VLDQQPseudo<IIC_VLD4>, Sched<[WriteVLD4]>; +def VLD4d32Pseudo : VLDQQPseudo<IIC_VLD4>, Sched<[WriteVLD4]>; // ...with address register writeback: class VLD4DWB<bits<4> op11_8, bits<4> op7_4, string Dt> @@ -992,7 +993,7 @@ class VLD4DWB<bits<4> op11_8, bits<4> op7_4, string Dt> (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, DPR:$dst4, GPR:$wb), (ins addrmode6:$Rn, am6offset:$Rm), IIC_VLD4u, "vld4", Dt, "\\{$Vd, $dst2, $dst3, $dst4\\}, $Rn$Rm", - "$Rn.addr = $wb", []> { + "$Rn.addr = $wb", []>, Sched<[WriteVLD4]> { let Inst{5-4} = Rn{5-4}; let DecoderMethod = "DecodeVLDST4Instruction"; } @@ -1001,9 +1002,9 @@ def VLD4d8_UPD : VLD4DWB<0b0000, {0,0,?,?}, "8">; def VLD4d16_UPD : VLD4DWB<0b0000, {0,1,?,?}, "16">; def VLD4d32_UPD : VLD4DWB<0b0000, {1,0,?,?}, "32">; -def VLD4d8Pseudo_UPD : VLDQQWBPseudo<IIC_VLD4u>; -def VLD4d16Pseudo_UPD : VLDQQWBPseudo<IIC_VLD4u>; -def VLD4d32Pseudo_UPD : VLDQQWBPseudo<IIC_VLD4u>; +def VLD4d8Pseudo_UPD : VLDQQWBPseudo<IIC_VLD4u>, Sched<[WriteVLD4]>; +def VLD4d16Pseudo_UPD : VLDQQWBPseudo<IIC_VLD4u>, Sched<[WriteVLD4]>; +def VLD4d32Pseudo_UPD : VLDQQWBPseudo<IIC_VLD4u>, Sched<[WriteVLD4]>; // ...with double-spaced registers: def VLD4q8 : VLD4D<0b0001, {0,0,?,?}, "8">; @@ -1013,18 +1014,18 @@ def VLD4q8_UPD : VLD4DWB<0b0001, {0,0,?,?}, "8">; def VLD4q16_UPD : VLD4DWB<0b0001, {0,1,?,?}, "16">; def VLD4q32_UPD : VLD4DWB<0b0001, {1,0,?,?}, "32">; -def VLD4q8Pseudo_UPD : VLDQQQQWBPseudo<IIC_VLD4u>; -def VLD4q16Pseudo_UPD : VLDQQQQWBPseudo<IIC_VLD4u>; -def VLD4q32Pseudo_UPD : VLDQQQQWBPseudo<IIC_VLD4u>; +def VLD4q8Pseudo_UPD : VLDQQQQWBPseudo<IIC_VLD4u>, Sched<[WriteVLD4]>; +def VLD4q16Pseudo_UPD : VLDQQQQWBPseudo<IIC_VLD4u>, Sched<[WriteVLD4]>; +def VLD4q32Pseudo_UPD : VLDQQQQWBPseudo<IIC_VLD4u>, Sched<[WriteVLD4]>; // ...alternate versions to be allocated odd register numbers: -def VLD4q8oddPseudo : VLDQQQQPseudo<IIC_VLD4>; -def VLD4q16oddPseudo : VLDQQQQPseudo<IIC_VLD4>; -def VLD4q32oddPseudo : VLDQQQQPseudo<IIC_VLD4>; +def VLD4q8oddPseudo : VLDQQQQPseudo<IIC_VLD4>, Sched<[WriteVLD4]>; +def VLD4q16oddPseudo : VLDQQQQPseudo<IIC_VLD4>, Sched<[WriteVLD4]>; +def VLD4q32oddPseudo : VLDQQQQPseudo<IIC_VLD4>, Sched<[WriteVLD4]>; -def VLD4q8oddPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD4u>; -def VLD4q16oddPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD4u>; -def VLD4q32oddPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD4u>; +def VLD4q8oddPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD4u>, Sched<[WriteVLD4]>; +def VLD4q16oddPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD4u>, Sched<[WriteVLD4]>; +def VLD4q32oddPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD4u>, Sched<[WriteVLD4]>; } // mayLoad = 1, hasSideEffects = 0, hasExtraDefRegAllocReq = 1 @@ -1076,11 +1077,12 @@ class VLD1LN32<bits<4> op11_8, bits<4> op7_4, string Dt, ValueType Ty, "$src = $Vd", [(set DPR:$Vd, (vector_insert (Ty DPR:$src), (i32 (LoadOp addrmode6oneL32:$Rn)), - 
imm:$lane))]> { + imm:$lane))]>, Sched<[WriteVLD1]> { let Rm = 0b1111; let DecoderMethod = "DecodeVLD1LN"; } -class VLD1QLNPseudo<ValueType Ty, PatFrag LoadOp> : VLDQLNPseudo<IIC_VLD1ln> { +class VLD1QLNPseudo<ValueType Ty, PatFrag LoadOp> : VLDQLNPseudo<IIC_VLD1ln>, + Sched<[WriteVLD1]> { let Pattern = [(set QPR:$dst, (vector_insert (Ty QPR:$src), (i32 (LoadOp addrmode6:$addr)), imm:$lane))]; @@ -1117,7 +1119,7 @@ class VLD1LNWB<bits<4> op11_8, bits<4> op7_4, string Dt> (ins addrmode6:$Rn, am6offset:$Rm, DPR:$src, nohash_imm:$lane), IIC_VLD1lnu, "vld1", Dt, "\\{$Vd[$lane]\\}, $Rn$Rm", - "$src = $Vd, $Rn.addr = $wb", []> { + "$src = $Vd, $Rn.addr = $wb", []>, Sched<[WriteVLD1]> { let DecoderMethod = "DecodeVLD1LN"; } @@ -1134,16 +1136,16 @@ def VLD1LNd32_UPD : VLD1LNWB<0b1000, {?,0,?,?}, "32"> { let Inst{4} = Rn{4}; } -def VLD1LNq8Pseudo_UPD : VLDQLNWBPseudo<IIC_VLD1lnu>; -def VLD1LNq16Pseudo_UPD : VLDQLNWBPseudo<IIC_VLD1lnu>; -def VLD1LNq32Pseudo_UPD : VLDQLNWBPseudo<IIC_VLD1lnu>; +def VLD1LNq8Pseudo_UPD : VLDQLNWBPseudo<IIC_VLD1lnu>, Sched<[WriteVLD1]>; +def VLD1LNq16Pseudo_UPD : VLDQLNWBPseudo<IIC_VLD1lnu>, Sched<[WriteVLD1]>; +def VLD1LNq32Pseudo_UPD : VLDQLNWBPseudo<IIC_VLD1lnu>, Sched<[WriteVLD1]>; // VLD2LN : Vector Load (single 2-element structure to one lane) class VLD2LN<bits<4> op11_8, bits<4> op7_4, string Dt> : NLdStLn<1, 0b10, op11_8, op7_4, (outs DPR:$Vd, DPR:$dst2), (ins addrmode6:$Rn, DPR:$src1, DPR:$src2, nohash_imm:$lane), IIC_VLD2ln, "vld2", Dt, "\\{$Vd[$lane], $dst2[$lane]\\}, $Rn", - "$src1 = $Vd, $src2 = $dst2", []> { + "$src1 = $Vd, $src2 = $dst2", []>, Sched<[WriteVLD1]> { let Rm = 0b1111; let Inst{4} = Rn{4}; let DecoderMethod = "DecodeVLD2LN"; @@ -1159,9 +1161,9 @@ def VLD2LNd32 : VLD2LN<0b1001, {?,0,0,?}, "32"> { let Inst{7} = lane{0}; } -def VLD2LNd8Pseudo : VLDQLNPseudo<IIC_VLD2ln>; -def VLD2LNd16Pseudo : VLDQLNPseudo<IIC_VLD2ln>; -def VLD2LNd32Pseudo : VLDQLNPseudo<IIC_VLD2ln>; +def VLD2LNd8Pseudo : VLDQLNPseudo<IIC_VLD2ln>, Sched<[WriteVLD1]>; +def VLD2LNd16Pseudo : VLDQLNPseudo<IIC_VLD2ln>, Sched<[WriteVLD1]>; +def VLD2LNd32Pseudo : VLDQLNPseudo<IIC_VLD2ln>, Sched<[WriteVLD1]>; // ...with double-spaced registers: def VLD2LNq16 : VLD2LN<0b0101, {?,?,1,?}, "16"> { @@ -1171,8 +1173,8 @@ def VLD2LNq32 : VLD2LN<0b1001, {?,1,0,?}, "32"> { let Inst{7} = lane{0}; } -def VLD2LNq16Pseudo : VLDQQLNPseudo<IIC_VLD2ln>; -def VLD2LNq32Pseudo : VLDQQLNPseudo<IIC_VLD2ln>; +def VLD2LNq16Pseudo : VLDQQLNPseudo<IIC_VLD2ln>, Sched<[WriteVLD1]>; +def VLD2LNq32Pseudo : VLDQQLNPseudo<IIC_VLD2ln>, Sched<[WriteVLD1]>; // ...with address register writeback: class VLD2LNWB<bits<4> op11_8, bits<4> op7_4, string Dt> @@ -1195,9 +1197,9 @@ def VLD2LNd32_UPD : VLD2LNWB<0b1001, {?,0,0,?}, "32"> { let Inst{7} = lane{0}; } -def VLD2LNd8Pseudo_UPD : VLDQLNWBPseudo<IIC_VLD2lnu>; -def VLD2LNd16Pseudo_UPD : VLDQLNWBPseudo<IIC_VLD2lnu>; -def VLD2LNd32Pseudo_UPD : VLDQLNWBPseudo<IIC_VLD2lnu>; +def VLD2LNd8Pseudo_UPD : VLDQLNWBPseudo<IIC_VLD2lnu>, Sched<[WriteVLD1]>; +def VLD2LNd16Pseudo_UPD : VLDQLNWBPseudo<IIC_VLD2lnu>, Sched<[WriteVLD1]>; +def VLD2LNd32Pseudo_UPD : VLDQLNWBPseudo<IIC_VLD2lnu>, Sched<[WriteVLD1]>; def VLD2LNq16_UPD : VLD2LNWB<0b0101, {?,?,1,?}, "16"> { let Inst{7-6} = lane{1-0}; @@ -1206,8 +1208,8 @@ def VLD2LNq32_UPD : VLD2LNWB<0b1001, {?,1,0,?}, "32"> { let Inst{7} = lane{0}; } -def VLD2LNq16Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD2lnu>; -def VLD2LNq32Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD2lnu>; +def VLD2LNq16Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD2lnu>, Sched<[WriteVLD1]>; +def 
VLD2LNq32Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD2lnu>, Sched<[WriteVLD1]>; // VLD3LN : Vector Load (single 3-element structure to one lane) class VLD3LN<bits<4> op11_8, bits<4> op7_4, string Dt> @@ -1215,7 +1217,7 @@ class VLD3LN<bits<4> op11_8, bits<4> op7_4, string Dt> (ins addrmode6:$Rn, DPR:$src1, DPR:$src2, DPR:$src3, nohash_imm:$lane), IIC_VLD3ln, "vld3", Dt, "\\{$Vd[$lane], $dst2[$lane], $dst3[$lane]\\}, $Rn", - "$src1 = $Vd, $src2 = $dst2, $src3 = $dst3", []> { + "$src1 = $Vd, $src2 = $dst2, $src3 = $dst3", []>, Sched<[WriteVLD2]> { let Rm = 0b1111; let DecoderMethod = "DecodeVLD3LN"; } @@ -1230,9 +1232,9 @@ def VLD3LNd32 : VLD3LN<0b1010, {?,0,0,0}, "32"> { let Inst{7} = lane{0}; } -def VLD3LNd8Pseudo : VLDQQLNPseudo<IIC_VLD3ln>; -def VLD3LNd16Pseudo : VLDQQLNPseudo<IIC_VLD3ln>; -def VLD3LNd32Pseudo : VLDQQLNPseudo<IIC_VLD3ln>; +def VLD3LNd8Pseudo : VLDQQLNPseudo<IIC_VLD3ln>, Sched<[WriteVLD2]>; +def VLD3LNd16Pseudo : VLDQQLNPseudo<IIC_VLD3ln>, Sched<[WriteVLD2]>; +def VLD3LNd32Pseudo : VLDQQLNPseudo<IIC_VLD3ln>, Sched<[WriteVLD2]>; // ...with double-spaced registers: def VLD3LNq16 : VLD3LN<0b0110, {?,?,1,0}, "16"> { @@ -1242,8 +1244,8 @@ def VLD3LNq32 : VLD3LN<0b1010, {?,1,0,0}, "32"> { let Inst{7} = lane{0}; } -def VLD3LNq16Pseudo : VLDQQQQLNPseudo<IIC_VLD3ln>; -def VLD3LNq32Pseudo : VLDQQQQLNPseudo<IIC_VLD3ln>; +def VLD3LNq16Pseudo : VLDQQQQLNPseudo<IIC_VLD3ln>, Sched<[WriteVLD2]>; +def VLD3LNq32Pseudo : VLDQQQQLNPseudo<IIC_VLD3ln>, Sched<[WriteVLD2]>; // ...with address register writeback: class VLD3LNWB<bits<4> op11_8, bits<4> op7_4, string Dt> @@ -1254,7 +1256,7 @@ class VLD3LNWB<bits<4> op11_8, bits<4> op7_4, string Dt> IIC_VLD3lnu, "vld3", Dt, "\\{$Vd[$lane], $dst2[$lane], $dst3[$lane]\\}, $Rn$Rm", "$src1 = $Vd, $src2 = $dst2, $src3 = $dst3, $Rn.addr = $wb", - []> { + []>, Sched<[WriteVLD2]> { let DecoderMethod = "DecodeVLD3LN"; } @@ -1268,9 +1270,9 @@ def VLD3LNd32_UPD : VLD3LNWB<0b1010, {?,0,0,0}, "32"> { let Inst{7} = lane{0}; } -def VLD3LNd8Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD3lnu>; -def VLD3LNd16Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD3lnu>; -def VLD3LNd32Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD3lnu>; +def VLD3LNd8Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD3lnu>, Sched<[WriteVLD2]>; +def VLD3LNd16Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD3lnu>, Sched<[WriteVLD2]>; +def VLD3LNd32Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD3lnu>, Sched<[WriteVLD2]>; def VLD3LNq16_UPD : VLD3LNWB<0b0110, {?,?,1,0}, "16"> { let Inst{7-6} = lane{1-0}; @@ -1279,8 +1281,8 @@ def VLD3LNq32_UPD : VLD3LNWB<0b1010, {?,1,0,0}, "32"> { let Inst{7} = lane{0}; } -def VLD3LNq16Pseudo_UPD : VLDQQQQLNWBPseudo<IIC_VLD3lnu>; -def VLD3LNq32Pseudo_UPD : VLDQQQQLNWBPseudo<IIC_VLD3lnu>; +def VLD3LNq16Pseudo_UPD : VLDQQQQLNWBPseudo<IIC_VLD3lnu>, Sched<[WriteVLD2]>; +def VLD3LNq32Pseudo_UPD : VLDQQQQLNWBPseudo<IIC_VLD3lnu>, Sched<[WriteVLD2]>; // VLD4LN : Vector Load (single 4-element structure to one lane) class VLD4LN<bits<4> op11_8, bits<4> op7_4, string Dt> @@ -1289,7 +1291,8 @@ class VLD4LN<bits<4> op11_8, bits<4> op7_4, string Dt> (ins addrmode6:$Rn, DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4, nohash_imm:$lane), IIC_VLD4ln, "vld4", Dt, "\\{$Vd[$lane], $dst2[$lane], $dst3[$lane], $dst4[$lane]\\}, $Rn", - "$src1 = $Vd, $src2 = $dst2, $src3 = $dst3, $src4 = $dst4", []> { + "$src1 = $Vd, $src2 = $dst2, $src3 = $dst3, $src4 = $dst4", []>, + Sched<[WriteVLD2]> { let Rm = 0b1111; let Inst{4} = Rn{4}; let DecoderMethod = "DecodeVLD4LN"; @@ -1306,9 +1309,9 @@ def VLD4LNd32 : VLD4LN<0b1011, {?,0,?,?}, "32"> { let Inst{5} = Rn{5}; } -def 
VLD4LNd8Pseudo : VLDQQLNPseudo<IIC_VLD4ln>; -def VLD4LNd16Pseudo : VLDQQLNPseudo<IIC_VLD4ln>; -def VLD4LNd32Pseudo : VLDQQLNPseudo<IIC_VLD4ln>; +def VLD4LNd8Pseudo : VLDQQLNPseudo<IIC_VLD4ln>, Sched<[WriteVLD2]>; +def VLD4LNd16Pseudo : VLDQQLNPseudo<IIC_VLD4ln>, Sched<[WriteVLD2]>; +def VLD4LNd32Pseudo : VLDQQLNPseudo<IIC_VLD4ln>, Sched<[WriteVLD2]>; // ...with double-spaced registers: def VLD4LNq16 : VLD4LN<0b0111, {?,?,1,?}, "16"> { @@ -1319,8 +1322,8 @@ def VLD4LNq32 : VLD4LN<0b1011, {?,1,?,?}, "32"> { let Inst{5} = Rn{5}; } -def VLD4LNq16Pseudo : VLDQQQQLNPseudo<IIC_VLD4ln>; -def VLD4LNq32Pseudo : VLDQQQQLNPseudo<IIC_VLD4ln>; +def VLD4LNq16Pseudo : VLDQQQQLNPseudo<IIC_VLD4ln>, Sched<[WriteVLD2]>; +def VLD4LNq32Pseudo : VLDQQQQLNPseudo<IIC_VLD4ln>, Sched<[WriteVLD2]>; // ...with address register writeback: class VLD4LNWB<bits<4> op11_8, bits<4> op7_4, string Dt> @@ -1347,9 +1350,9 @@ def VLD4LNd32_UPD : VLD4LNWB<0b1011, {?,0,?,?}, "32"> { let Inst{5} = Rn{5}; } -def VLD4LNd8Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD4lnu>; -def VLD4LNd16Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD4lnu>; -def VLD4LNd32Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD4lnu>; +def VLD4LNd8Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD4lnu>, Sched<[WriteVLD2]>; +def VLD4LNd16Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD4lnu>, Sched<[WriteVLD2]>; +def VLD4LNd32Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD4lnu>, Sched<[WriteVLD2]>; def VLD4LNq16_UPD : VLD4LNWB<0b0111, {?,?,1,?}, "16"> { let Inst{7-6} = lane{1-0}; @@ -1359,8 +1362,8 @@ def VLD4LNq32_UPD : VLD4LNWB<0b1011, {?,1,?,?}, "32"> { let Inst{5} = Rn{5}; } -def VLD4LNq16Pseudo_UPD : VLDQQQQLNWBPseudo<IIC_VLD4lnu>; -def VLD4LNq32Pseudo_UPD : VLDQQQQLNWBPseudo<IIC_VLD4lnu>; +def VLD4LNq16Pseudo_UPD : VLDQQQQLNWBPseudo<IIC_VLD4lnu>, Sched<[WriteVLD2]>; +def VLD4LNq32Pseudo_UPD : VLDQQQQLNWBPseudo<IIC_VLD4lnu>, Sched<[WriteVLD2]>; } // mayLoad = 1, hasSideEffects = 0, hasExtraDefRegAllocReq = 1 @@ -1371,7 +1374,8 @@ class VLD1DUP<bits<4> op7_4, string Dt, ValueType Ty, PatFrag LoadOp, (ins AddrMode:$Rn), IIC_VLD1dup, "vld1", Dt, "$Vd, $Rn", "", [(set VecListOneDAllLanes:$Vd, - (Ty (NEONvdup (i32 (LoadOp AddrMode:$Rn)))))]> { + (Ty (NEONvdup (i32 (LoadOp AddrMode:$Rn)))))]>, + Sched<[WriteVLD2]> { let Rm = 0b1111; let Inst{4} = Rn{4}; let DecoderMethod = "DecodeVLD1DupInstruction"; @@ -1434,7 +1438,7 @@ multiclass VLD1QDUPWB<bits<4> op7_4, string Dt, Operand AddrMode> { (outs VecListDPairAllLanes:$Vd, GPR:$wb), (ins AddrMode:$Rn), IIC_VLD1dupu, "vld1", Dt, "$Vd, $Rn!", - "$Rn.addr = $wb", []> { + "$Rn.addr = $wb", []>, Sched<[WriteVLD1]> { let Rm = 0b1101; // NLdSt will assign to the right encoding bits. let Inst{4} = Rn{4}; let DecoderMethod = "DecodeVLD1DupInstruction"; @@ -1491,7 +1495,7 @@ multiclass VLD2DUPWB<bits<4> op7_4, string Dt, RegisterOperand VdTy, (outs VdTy:$Vd, GPR:$wb), (ins AddrMode:$Rn), IIC_VLD2dupu, "vld2", Dt, "$Vd, $Rn!", - "$Rn.addr = $wb", []> { + "$Rn.addr = $wb", []>, Sched<[WriteVLD1]> { let Rm = 0b1101; // NLdSt will assign to the right encoding bits. 
let Inst{4} = Rn{4}; let DecoderMethod = "DecodeVLD2DupInstruction"; @@ -1500,7 +1504,7 @@ multiclass VLD2DUPWB<bits<4> op7_4, string Dt, RegisterOperand VdTy, (outs VdTy:$Vd, GPR:$wb), (ins AddrMode:$Rn, rGPR:$Rm), IIC_VLD2dupu, "vld2", Dt, "$Vd, $Rn, $Rm", - "$Rn.addr = $wb", []> { + "$Rn.addr = $wb", []>, Sched<[WriteVLD1]> { let Inst{4} = Rn{4}; let DecoderMethod = "DecodeVLD2DupInstruction"; } @@ -1524,7 +1528,8 @@ defm VLD2DUPd32x2wb : VLD2DUPWB<{1,0,1,?}, "32", VecListDPairSpacedAllLanes, class VLD3DUP<bits<4> op7_4, string Dt> : NLdSt<1, 0b10, 0b1110, op7_4, (outs DPR:$Vd, DPR:$dst2, DPR:$dst3), (ins addrmode6dup:$Rn), IIC_VLD3dup, - "vld3", Dt, "\\{$Vd[], $dst2[], $dst3[]\\}, $Rn", "", []> { + "vld3", Dt, "\\{$Vd[], $dst2[], $dst3[]\\}, $Rn", "", []>, + Sched<[WriteVLD2]> { let Rm = 0b1111; let Inst{4} = 0; let DecoderMethod = "DecodeVLD3DupInstruction"; @@ -1534,9 +1539,9 @@ def VLD3DUPd8 : VLD3DUP<{0,0,0,?}, "8">; def VLD3DUPd16 : VLD3DUP<{0,1,0,?}, "16">; def VLD3DUPd32 : VLD3DUP<{1,0,0,?}, "32">; -def VLD3DUPd8Pseudo : VLDQQPseudo<IIC_VLD3dup>; -def VLD3DUPd16Pseudo : VLDQQPseudo<IIC_VLD3dup>; -def VLD3DUPd32Pseudo : VLDQQPseudo<IIC_VLD3dup>; +def VLD3DUPd8Pseudo : VLDQQPseudo<IIC_VLD3dup>, Sched<[WriteVLD2]>; +def VLD3DUPd16Pseudo : VLDQQPseudo<IIC_VLD3dup>, Sched<[WriteVLD2]>; +def VLD3DUPd32Pseudo : VLDQQPseudo<IIC_VLD3dup>, Sched<[WriteVLD2]>; // ...with double-spaced registers (not used for codegen): def VLD3DUPq8 : VLD3DUP<{0,0,1,?}, "8">; @@ -1548,7 +1553,7 @@ class VLD3DUPWB<bits<4> op7_4, string Dt, Operand AddrMode> : NLdSt<1, 0b10, 0b1110, op7_4, (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, GPR:$wb), (ins AddrMode:$Rn, am6offset:$Rm), IIC_VLD3dupu, "vld3", Dt, "\\{$Vd[], $dst2[], $dst3[]\\}, $Rn$Rm", - "$Rn.addr = $wb", []> { + "$Rn.addr = $wb", []>, Sched<[WriteVLD2]> { let Inst{4} = 0; let DecoderMethod = "DecodeVLD3DupInstruction"; } @@ -1561,9 +1566,9 @@ def VLD3DUPq8_UPD : VLD3DUPWB<{0,0,1,0}, "8", addrmode6dupalign64>; def VLD3DUPq16_UPD : VLD3DUPWB<{0,1,1,?}, "16", addrmode6dupalign64>; def VLD3DUPq32_UPD : VLD3DUPWB<{1,0,1,?}, "32", addrmode6dupalign64>; -def VLD3DUPd8Pseudo_UPD : VLDQQWBPseudo<IIC_VLD3dupu>; -def VLD3DUPd16Pseudo_UPD : VLDQQWBPseudo<IIC_VLD3dupu>; -def VLD3DUPd32Pseudo_UPD : VLDQQWBPseudo<IIC_VLD3dupu>; +def VLD3DUPd8Pseudo_UPD : VLDQQWBPseudo<IIC_VLD3dupu>, Sched<[WriteVLD2]>; +def VLD3DUPd16Pseudo_UPD : VLDQQWBPseudo<IIC_VLD3dupu>, Sched<[WriteVLD2]>; +def VLD3DUPd32Pseudo_UPD : VLDQQWBPseudo<IIC_VLD3dupu>, Sched<[WriteVLD2]>; // VLD4DUP : Vector Load (single 4-element structure to all lanes) class VLD4DUP<bits<4> op7_4, string Dt> @@ -1580,9 +1585,9 @@ def VLD4DUPd8 : VLD4DUP<{0,0,0,?}, "8">; def VLD4DUPd16 : VLD4DUP<{0,1,0,?}, "16">; def VLD4DUPd32 : VLD4DUP<{1,?,0,?}, "32"> { let Inst{6} = Rn{5}; } -def VLD4DUPd8Pseudo : VLDQQPseudo<IIC_VLD4dup>; -def VLD4DUPd16Pseudo : VLDQQPseudo<IIC_VLD4dup>; -def VLD4DUPd32Pseudo : VLDQQPseudo<IIC_VLD4dup>; +def VLD4DUPd8Pseudo : VLDQQPseudo<IIC_VLD4dup>, Sched<[WriteVLD2]>; +def VLD4DUPd16Pseudo : VLDQQPseudo<IIC_VLD4dup>, Sched<[WriteVLD2]>; +def VLD4DUPd32Pseudo : VLDQQPseudo<IIC_VLD4dup>, Sched<[WriteVLD2]>; // ...with double-spaced registers (not used for codegen): def VLD4DUPq8 : VLD4DUP<{0,0,1,?}, "8">; @@ -1595,7 +1600,7 @@ class VLD4DUPWB<bits<4> op7_4, string Dt> (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, DPR:$dst4, GPR:$wb), (ins addrmode6dup:$Rn, am6offset:$Rm), IIC_VLD4dupu, "vld4", Dt, "\\{$Vd[], $dst2[], $dst3[], $dst4[]\\}, $Rn$Rm", - "$Rn.addr = $wb", []> { + "$Rn.addr = $wb", []>, 
Sched<[WriteVLD2]> { let Inst{4} = Rn{4}; let DecoderMethod = "DecodeVLD4DupInstruction"; } @@ -1608,9 +1613,9 @@ def VLD4DUPq8_UPD : VLD4DUPWB<{0,0,1,0}, "8">; def VLD4DUPq16_UPD : VLD4DUPWB<{0,1,1,?}, "16">; def VLD4DUPq32_UPD : VLD4DUPWB<{1,?,1,?}, "32"> { let Inst{6} = Rn{5}; } -def VLD4DUPd8Pseudo_UPD : VLDQQWBPseudo<IIC_VLD4dupu>; -def VLD4DUPd16Pseudo_UPD : VLDQQWBPseudo<IIC_VLD4dupu>; -def VLD4DUPd32Pseudo_UPD : VLDQQWBPseudo<IIC_VLD4dupu>; +def VLD4DUPd8Pseudo_UPD : VLDQQWBPseudo<IIC_VLD4dupu>, Sched<[WriteVLD2]>; +def VLD4DUPd16Pseudo_UPD : VLDQQWBPseudo<IIC_VLD4dupu>, Sched<[WriteVLD2]>; +def VLD4DUPd32Pseudo_UPD : VLDQQWBPseudo<IIC_VLD4dupu>, Sched<[WriteVLD2]>; } // mayLoad = 1, hasSideEffects = 0, hasExtraDefRegAllocReq = 1 @@ -1657,14 +1662,14 @@ class VSTQQQQWBPseudo<InstrItinClass itin> // VST1 : Vector Store (multiple single elements) class VST1D<bits<4> op7_4, string Dt, Operand AddrMode> : NLdSt<0,0b00,0b0111,op7_4, (outs), (ins AddrMode:$Rn, VecListOneD:$Vd), - IIC_VST1, "vst1", Dt, "$Vd, $Rn", "", []> { + IIC_VST1, "vst1", Dt, "$Vd, $Rn", "", []>, Sched<[WriteVST1]> { let Rm = 0b1111; let Inst{4} = Rn{4}; let DecoderMethod = "DecodeVLDST1Instruction"; } class VST1Q<bits<4> op7_4, string Dt, Operand AddrMode> : NLdSt<0,0b00,0b1010,op7_4, (outs), (ins AddrMode:$Rn, VecListDPair:$Vd), - IIC_VST1x2, "vst1", Dt, "$Vd, $Rn", "", []> { + IIC_VST1x2, "vst1", Dt, "$Vd, $Rn", "", []>, Sched<[WriteVST2]> { let Rm = 0b1111; let Inst{5-4} = Rn{5-4}; let DecoderMethod = "DecodeVLDST1Instruction"; @@ -1685,7 +1690,7 @@ multiclass VST1DWB<bits<4> op7_4, string Dt, Operand AddrMode> { def _fixed : NLdSt<0,0b00, 0b0111,op7_4, (outs GPR:$wb), (ins AddrMode:$Rn, VecListOneD:$Vd), IIC_VLD1u, "vst1", Dt, "$Vd, $Rn!", - "$Rn.addr = $wb", []> { + "$Rn.addr = $wb", []>, Sched<[WriteVST1]> { let Rm = 0b1101; // NLdSt will assign to the right encoding bits. let Inst{4} = Rn{4}; let DecoderMethod = "DecodeVLDST1Instruction"; @@ -1694,7 +1699,7 @@ multiclass VST1DWB<bits<4> op7_4, string Dt, Operand AddrMode> { (ins AddrMode:$Rn, rGPR:$Rm, VecListOneD:$Vd), IIC_VLD1u, "vst1", Dt, "$Vd, $Rn, $Rm", - "$Rn.addr = $wb", []> { + "$Rn.addr = $wb", []>, Sched<[WriteVST1]> { let Inst{4} = Rn{4}; let DecoderMethod = "DecodeVLDST1Instruction"; } @@ -1703,7 +1708,7 @@ multiclass VST1QWB<bits<4> op7_4, string Dt, Operand AddrMode> { def _fixed : NLdSt<0,0b00,0b1010,op7_4, (outs GPR:$wb), (ins AddrMode:$Rn, VecListDPair:$Vd), IIC_VLD1x2u, "vst1", Dt, "$Vd, $Rn!", - "$Rn.addr = $wb", []> { + "$Rn.addr = $wb", []>, Sched<[WriteVST2]> { let Rm = 0b1101; // NLdSt will assign to the right encoding bits. 
let Inst{5-4} = Rn{5-4}; let DecoderMethod = "DecodeVLDST1Instruction"; @@ -1712,7 +1717,7 @@ multiclass VST1QWB<bits<4> op7_4, string Dt, Operand AddrMode> { (ins AddrMode:$Rn, rGPR:$Rm, VecListDPair:$Vd), IIC_VLD1x2u, "vst1", Dt, "$Vd, $Rn, $Rm", - "$Rn.addr = $wb", []> { + "$Rn.addr = $wb", []>, Sched<[WriteVST2]> { let Inst{5-4} = Rn{5-4}; let DecoderMethod = "DecodeVLDST1Instruction"; } @@ -1732,7 +1737,7 @@ defm VST1q64wb : VST1QWB<{1,1,?,?}, "64", addrmode6align64or128>; class VST1D3<bits<4> op7_4, string Dt, Operand AddrMode> : NLdSt<0, 0b00, 0b0110, op7_4, (outs), (ins AddrMode:$Rn, VecListThreeD:$Vd), - IIC_VST1x3, "vst1", Dt, "$Vd, $Rn", "", []> { + IIC_VST1x3, "vst1", Dt, "$Vd, $Rn", "", []>, Sched<[WriteVST3]> { let Rm = 0b1111; let Inst{4} = Rn{4}; let DecoderMethod = "DecodeVLDST1Instruction"; @@ -1741,7 +1746,7 @@ multiclass VST1D3WB<bits<4> op7_4, string Dt, Operand AddrMode> { def _fixed : NLdSt<0,0b00,0b0110,op7_4, (outs GPR:$wb), (ins AddrMode:$Rn, VecListThreeD:$Vd), IIC_VLD1x3u, "vst1", Dt, "$Vd, $Rn!", - "$Rn.addr = $wb", []> { + "$Rn.addr = $wb", []>, Sched<[WriteVST3]> { let Rm = 0b1101; // NLdSt will assign to the right encoding bits. let Inst{5-4} = Rn{5-4}; let DecoderMethod = "DecodeVLDST1Instruction"; @@ -1750,7 +1755,7 @@ multiclass VST1D3WB<bits<4> op7_4, string Dt, Operand AddrMode> { (ins AddrMode:$Rn, rGPR:$Rm, VecListThreeD:$Vd), IIC_VLD1x3u, "vst1", Dt, "$Vd, $Rn, $Rm", - "$Rn.addr = $wb", []> { + "$Rn.addr = $wb", []>, Sched<[WriteVST3]> { let Inst{5-4} = Rn{5-4}; let DecoderMethod = "DecodeVLDST1Instruction"; } @@ -1766,16 +1771,16 @@ defm VST1d16Twb : VST1D3WB<{0,1,0,?}, "16", addrmode6align64>; defm VST1d32Twb : VST1D3WB<{1,0,0,?}, "32", addrmode6align64>; defm VST1d64Twb : VST1D3WB<{1,1,0,?}, "64", addrmode6align64>; -def VST1d64TPseudo : VSTQQPseudo<IIC_VST1x3>; -def VST1d64TPseudoWB_fixed : VSTQQWBfixedPseudo<IIC_VST1x3u>; -def VST1d64TPseudoWB_register : VSTQQWBPseudo<IIC_VST1x3u>; +def VST1d64TPseudo : VSTQQPseudo<IIC_VST1x3>, Sched<[WriteVST3]>; +def VST1d64TPseudoWB_fixed : VSTQQWBfixedPseudo<IIC_VST1x3u>, Sched<[WriteVST3]>; +def VST1d64TPseudoWB_register : VSTQQWBPseudo<IIC_VST1x3u>, Sched<[WriteVST3]>; // ...with 4 registers class VST1D4<bits<4> op7_4, string Dt, Operand AddrMode> : NLdSt<0, 0b00, 0b0010, op7_4, (outs), (ins AddrMode:$Rn, VecListFourD:$Vd), IIC_VST1x4, "vst1", Dt, "$Vd, $Rn", "", - []> { + []>, Sched<[WriteVST4]> { let Rm = 0b1111; let Inst{5-4} = Rn{5-4}; let DecoderMethod = "DecodeVLDST1Instruction"; @@ -1784,7 +1789,7 @@ multiclass VST1D4WB<bits<4> op7_4, string Dt, Operand AddrMode> { def _fixed : NLdSt<0,0b00,0b0010,op7_4, (outs GPR:$wb), (ins AddrMode:$Rn, VecListFourD:$Vd), IIC_VLD1x4u, "vst1", Dt, "$Vd, $Rn!", - "$Rn.addr = $wb", []> { + "$Rn.addr = $wb", []>, Sched<[WriteVST4]> { let Rm = 0b1101; // NLdSt will assign to the right encoding bits. 
let Inst{5-4} = Rn{5-4}; let DecoderMethod = "DecodeVLDST1Instruction"; @@ -1793,7 +1798,7 @@ multiclass VST1D4WB<bits<4> op7_4, string Dt, Operand AddrMode> { (ins AddrMode:$Rn, rGPR:$Rm, VecListFourD:$Vd), IIC_VLD1x4u, "vst1", Dt, "$Vd, $Rn, $Rm", - "$Rn.addr = $wb", []> { + "$Rn.addr = $wb", []>, Sched<[WriteVST4]> { let Inst{5-4} = Rn{5-4}; let DecoderMethod = "DecodeVLDST1Instruction"; } @@ -1809,9 +1814,9 @@ defm VST1d16Qwb : VST1D4WB<{0,1,?,?}, "16", addrmode6align64or128or256>; defm VST1d32Qwb : VST1D4WB<{1,0,?,?}, "32", addrmode6align64or128or256>; defm VST1d64Qwb : VST1D4WB<{1,1,?,?}, "64", addrmode6align64or128or256>; -def VST1d64QPseudo : VSTQQPseudo<IIC_VST1x4>; -def VST1d64QPseudoWB_fixed : VSTQQWBfixedPseudo<IIC_VST1x4u>; -def VST1d64QPseudoWB_register : VSTQQWBPseudo<IIC_VST1x4u>; +def VST1d64QPseudo : VSTQQPseudo<IIC_VST1x4>, Sched<[WriteVST4]>; +def VST1d64QPseudoWB_fixed : VSTQQWBfixedPseudo<IIC_VST1x4u>, Sched<[WriteVST4]>; +def VST1d64QPseudoWB_register : VSTQQWBPseudo<IIC_VST1x4u>, Sched<[WriteVST4]>; // VST2 : Vector Store (multiple 2-element structures) class VST2<bits<4> op11_8, bits<4> op7_4, string Dt, RegisterOperand VdTy, @@ -1824,22 +1829,22 @@ class VST2<bits<4> op11_8, bits<4> op7_4, string Dt, RegisterOperand VdTy, } def VST2d8 : VST2<0b1000, {0,0,?,?}, "8", VecListDPair, IIC_VST2, - addrmode6align64or128>; + addrmode6align64or128>, Sched<[WriteVST2]>; def VST2d16 : VST2<0b1000, {0,1,?,?}, "16", VecListDPair, IIC_VST2, - addrmode6align64or128>; + addrmode6align64or128>, Sched<[WriteVST2]>; def VST2d32 : VST2<0b1000, {1,0,?,?}, "32", VecListDPair, IIC_VST2, - addrmode6align64or128>; + addrmode6align64or128>, Sched<[WriteVST2]>; def VST2q8 : VST2<0b0011, {0,0,?,?}, "8", VecListFourD, IIC_VST2x2, - addrmode6align64or128or256>; + addrmode6align64or128or256>, Sched<[WriteVST4]>; def VST2q16 : VST2<0b0011, {0,1,?,?}, "16", VecListFourD, IIC_VST2x2, - addrmode6align64or128or256>; + addrmode6align64or128or256>, Sched<[WriteVST4]>; def VST2q32 : VST2<0b0011, {1,0,?,?}, "32", VecListFourD, IIC_VST2x2, - addrmode6align64or128or256>; + addrmode6align64or128or256>, Sched<[WriteVST4]>; -def VST2q8Pseudo : VSTQQPseudo<IIC_VST2x2>; -def VST2q16Pseudo : VSTQQPseudo<IIC_VST2x2>; -def VST2q32Pseudo : VSTQQPseudo<IIC_VST2x2>; +def VST2q8Pseudo : VSTQQPseudo<IIC_VST2x2>, Sched<[WriteVST4]>; +def VST2q16Pseudo : VSTQQPseudo<IIC_VST2x2>, Sched<[WriteVST4]>; +def VST2q32Pseudo : VSTQQPseudo<IIC_VST2x2>, Sched<[WriteVST4]>; // ...with address register writeback: multiclass VST2DWB<bits<4> op11_8, bits<4> op7_4, string Dt, @@ -1847,7 +1852,7 @@ multiclass VST2DWB<bits<4> op11_8, bits<4> op7_4, string Dt, def _fixed : NLdSt<0, 0b00, op11_8, op7_4, (outs GPR:$wb), (ins AddrMode:$Rn, VdTy:$Vd), IIC_VLD1u, "vst2", Dt, "$Vd, $Rn!", - "$Rn.addr = $wb", []> { + "$Rn.addr = $wb", []>, Sched<[WriteVST2]> { let Rm = 0b1101; // NLdSt will assign to the right encoding bits. 
let Inst{5-4} = Rn{5-4}; let DecoderMethod = "DecodeVLDST2Instruction"; @@ -1855,7 +1860,7 @@ multiclass VST2DWB<bits<4> op11_8, bits<4> op7_4, string Dt, def _register : NLdSt<0, 0b00, op11_8, op7_4, (outs GPR:$wb), (ins AddrMode:$Rn, rGPR:$Rm, VdTy:$Vd), IIC_VLD1u, "vst2", Dt, "$Vd, $Rn, $Rm", - "$Rn.addr = $wb", []> { + "$Rn.addr = $wb", []>, Sched<[WriteVST2]> { let Inst{5-4} = Rn{5-4}; let DecoderMethod = "DecodeVLDST2Instruction"; } @@ -1864,7 +1869,7 @@ multiclass VST2QWB<bits<4> op7_4, string Dt, Operand AddrMode> { def _fixed : NLdSt<0, 0b00, 0b0011, op7_4, (outs GPR:$wb), (ins AddrMode:$Rn, VecListFourD:$Vd), IIC_VLD1u, "vst2", Dt, "$Vd, $Rn!", - "$Rn.addr = $wb", []> { + "$Rn.addr = $wb", []>, Sched<[WriteVST4]> { let Rm = 0b1101; // NLdSt will assign to the right encoding bits. let Inst{5-4} = Rn{5-4}; let DecoderMethod = "DecodeVLDST2Instruction"; @@ -1873,7 +1878,7 @@ multiclass VST2QWB<bits<4> op7_4, string Dt, Operand AddrMode> { (ins AddrMode:$Rn, rGPR:$Rm, VecListFourD:$Vd), IIC_VLD1u, "vst2", Dt, "$Vd, $Rn, $Rm", - "$Rn.addr = $wb", []> { + "$Rn.addr = $wb", []>, Sched<[WriteVST4]> { let Inst{5-4} = Rn{5-4}; let DecoderMethod = "DecodeVLDST2Instruction"; } @@ -1890,12 +1895,12 @@ defm VST2q8wb : VST2QWB<{0,0,?,?}, "8", addrmode6align64or128or256>; defm VST2q16wb : VST2QWB<{0,1,?,?}, "16", addrmode6align64or128or256>; defm VST2q32wb : VST2QWB<{1,0,?,?}, "32", addrmode6align64or128or256>; -def VST2q8PseudoWB_fixed : VSTQQWBfixedPseudo<IIC_VST2x2u>; -def VST2q16PseudoWB_fixed : VSTQQWBfixedPseudo<IIC_VST2x2u>; -def VST2q32PseudoWB_fixed : VSTQQWBfixedPseudo<IIC_VST2x2u>; -def VST2q8PseudoWB_register : VSTQQWBregisterPseudo<IIC_VST2x2u>; -def VST2q16PseudoWB_register : VSTQQWBregisterPseudo<IIC_VST2x2u>; -def VST2q32PseudoWB_register : VSTQQWBregisterPseudo<IIC_VST2x2u>; +def VST2q8PseudoWB_fixed : VSTQQWBfixedPseudo<IIC_VST2x2u>, Sched<[WriteVST4]>; +def VST2q16PseudoWB_fixed : VSTQQWBfixedPseudo<IIC_VST2x2u>, Sched<[WriteVST4]>; +def VST2q32PseudoWB_fixed : VSTQQWBfixedPseudo<IIC_VST2x2u>, Sched<[WriteVST4]>; +def VST2q8PseudoWB_register : VSTQQWBregisterPseudo<IIC_VST2x2u>, Sched<[WriteVST4]>; +def VST2q16PseudoWB_register : VSTQQWBregisterPseudo<IIC_VST2x2u>, Sched<[WriteVST4]>; +def VST2q32PseudoWB_register : VSTQQWBregisterPseudo<IIC_VST2x2u>, Sched<[WriteVST4]>; // ...with double-spaced registers def VST2b8 : VST2<0b1001, {0,0,?,?}, "8", VecListDPairSpaced, IIC_VST2, @@ -1915,7 +1920,7 @@ defm VST2b32wb : VST2DWB<0b1001, {1,0,?,?}, "32", VecListDPairSpaced, class VST3D<bits<4> op11_8, bits<4> op7_4, string Dt> : NLdSt<0, 0b00, op11_8, op7_4, (outs), (ins addrmode6:$Rn, DPR:$Vd, DPR:$src2, DPR:$src3), IIC_VST3, - "vst3", Dt, "\\{$Vd, $src2, $src3\\}, $Rn", "", []> { + "vst3", Dt, "\\{$Vd, $src2, $src3\\}, $Rn", "", []>, Sched<[WriteVST3]> { let Rm = 0b1111; let Inst{4} = Rn{4}; let DecoderMethod = "DecodeVLDST3Instruction"; @@ -1925,9 +1930,9 @@ def VST3d8 : VST3D<0b0100, {0,0,0,?}, "8">; def VST3d16 : VST3D<0b0100, {0,1,0,?}, "16">; def VST3d32 : VST3D<0b0100, {1,0,0,?}, "32">; -def VST3d8Pseudo : VSTQQPseudo<IIC_VST3>; -def VST3d16Pseudo : VSTQQPseudo<IIC_VST3>; -def VST3d32Pseudo : VSTQQPseudo<IIC_VST3>; +def VST3d8Pseudo : VSTQQPseudo<IIC_VST3>, Sched<[WriteVST3]>; +def VST3d16Pseudo : VSTQQPseudo<IIC_VST3>, Sched<[WriteVST3]>; +def VST3d32Pseudo : VSTQQPseudo<IIC_VST3>, Sched<[WriteVST3]>; // ...with address register writeback: class VST3DWB<bits<4> op11_8, bits<4> op7_4, string Dt> @@ -1935,7 +1940,7 @@ class VST3DWB<bits<4> op11_8, bits<4> op7_4, string Dt> (ins 
addrmode6:$Rn, am6offset:$Rm, DPR:$Vd, DPR:$src2, DPR:$src3), IIC_VST3u, "vst3", Dt, "\\{$Vd, $src2, $src3\\}, $Rn$Rm", - "$Rn.addr = $wb", []> { + "$Rn.addr = $wb", []>, Sched<[WriteVST3]> { let Inst{4} = Rn{4}; let DecoderMethod = "DecodeVLDST3Instruction"; } @@ -1944,9 +1949,9 @@ def VST3d8_UPD : VST3DWB<0b0100, {0,0,0,?}, "8">; def VST3d16_UPD : VST3DWB<0b0100, {0,1,0,?}, "16">; def VST3d32_UPD : VST3DWB<0b0100, {1,0,0,?}, "32">; -def VST3d8Pseudo_UPD : VSTQQWBPseudo<IIC_VST3u>; -def VST3d16Pseudo_UPD : VSTQQWBPseudo<IIC_VST3u>; -def VST3d32Pseudo_UPD : VSTQQWBPseudo<IIC_VST3u>; +def VST3d8Pseudo_UPD : VSTQQWBPseudo<IIC_VST3u>, Sched<[WriteVST3]>; +def VST3d16Pseudo_UPD : VSTQQWBPseudo<IIC_VST3u>, Sched<[WriteVST3]>; +def VST3d32Pseudo_UPD : VSTQQWBPseudo<IIC_VST3u>, Sched<[WriteVST3]>; // ...with double-spaced registers: def VST3q8 : VST3D<0b0101, {0,0,0,?}, "8">; @@ -1956,25 +1961,25 @@ def VST3q8_UPD : VST3DWB<0b0101, {0,0,0,?}, "8">; def VST3q16_UPD : VST3DWB<0b0101, {0,1,0,?}, "16">; def VST3q32_UPD : VST3DWB<0b0101, {1,0,0,?}, "32">; -def VST3q8Pseudo_UPD : VSTQQQQWBPseudo<IIC_VST3u>; -def VST3q16Pseudo_UPD : VSTQQQQWBPseudo<IIC_VST3u>; -def VST3q32Pseudo_UPD : VSTQQQQWBPseudo<IIC_VST3u>; +def VST3q8Pseudo_UPD : VSTQQQQWBPseudo<IIC_VST3u>, Sched<[WriteVST3]>; +def VST3q16Pseudo_UPD : VSTQQQQWBPseudo<IIC_VST3u>, Sched<[WriteVST3]>; +def VST3q32Pseudo_UPD : VSTQQQQWBPseudo<IIC_VST3u>, Sched<[WriteVST3]>; // ...alternate versions to be allocated odd register numbers: -def VST3q8oddPseudo : VSTQQQQPseudo<IIC_VST3>; -def VST3q16oddPseudo : VSTQQQQPseudo<IIC_VST3>; -def VST3q32oddPseudo : VSTQQQQPseudo<IIC_VST3>; +def VST3q8oddPseudo : VSTQQQQPseudo<IIC_VST3>, Sched<[WriteVST3]>; +def VST3q16oddPseudo : VSTQQQQPseudo<IIC_VST3>, Sched<[WriteVST3]>; +def VST3q32oddPseudo : VSTQQQQPseudo<IIC_VST3>, Sched<[WriteVST3]>; -def VST3q8oddPseudo_UPD : VSTQQQQWBPseudo<IIC_VST3u>; -def VST3q16oddPseudo_UPD : VSTQQQQWBPseudo<IIC_VST3u>; -def VST3q32oddPseudo_UPD : VSTQQQQWBPseudo<IIC_VST3u>; +def VST3q8oddPseudo_UPD : VSTQQQQWBPseudo<IIC_VST3u>, Sched<[WriteVST3]>; +def VST3q16oddPseudo_UPD : VSTQQQQWBPseudo<IIC_VST3u>, Sched<[WriteVST3]>; +def VST3q32oddPseudo_UPD : VSTQQQQWBPseudo<IIC_VST3u>, Sched<[WriteVST3]>; // VST4 : Vector Store (multiple 4-element structures) class VST4D<bits<4> op11_8, bits<4> op7_4, string Dt> : NLdSt<0, 0b00, op11_8, op7_4, (outs), (ins addrmode6:$Rn, DPR:$Vd, DPR:$src2, DPR:$src3, DPR:$src4), IIC_VST4, "vst4", Dt, "\\{$Vd, $src2, $src3, $src4\\}, $Rn", - "", []> { + "", []>, Sched<[WriteVST4]> { let Rm = 0b1111; let Inst{5-4} = Rn{5-4}; let DecoderMethod = "DecodeVLDST4Instruction"; @@ -1984,9 +1989,9 @@ def VST4d8 : VST4D<0b0000, {0,0,?,?}, "8">; def VST4d16 : VST4D<0b0000, {0,1,?,?}, "16">; def VST4d32 : VST4D<0b0000, {1,0,?,?}, "32">; -def VST4d8Pseudo : VSTQQPseudo<IIC_VST4>; -def VST4d16Pseudo : VSTQQPseudo<IIC_VST4>; -def VST4d32Pseudo : VSTQQPseudo<IIC_VST4>; +def VST4d8Pseudo : VSTQQPseudo<IIC_VST4>, Sched<[WriteVST4]>; +def VST4d16Pseudo : VSTQQPseudo<IIC_VST4>, Sched<[WriteVST4]>; +def VST4d32Pseudo : VSTQQPseudo<IIC_VST4>, Sched<[WriteVST4]>; // ...with address register writeback: class VST4DWB<bits<4> op11_8, bits<4> op7_4, string Dt> @@ -1994,7 +1999,7 @@ class VST4DWB<bits<4> op11_8, bits<4> op7_4, string Dt> (ins addrmode6:$Rn, am6offset:$Rm, DPR:$Vd, DPR:$src2, DPR:$src3, DPR:$src4), IIC_VST4u, "vst4", Dt, "\\{$Vd, $src2, $src3, $src4\\}, $Rn$Rm", - "$Rn.addr = $wb", []> { + "$Rn.addr = $wb", []>, Sched<[WriteVST4]> { let Inst{5-4} = Rn{5-4}; let 
DecoderMethod = "DecodeVLDST4Instruction"; } @@ -2003,9 +2008,9 @@ def VST4d8_UPD : VST4DWB<0b0000, {0,0,?,?}, "8">; def VST4d16_UPD : VST4DWB<0b0000, {0,1,?,?}, "16">; def VST4d32_UPD : VST4DWB<0b0000, {1,0,?,?}, "32">; -def VST4d8Pseudo_UPD : VSTQQWBPseudo<IIC_VST4u>; -def VST4d16Pseudo_UPD : VSTQQWBPseudo<IIC_VST4u>; -def VST4d32Pseudo_UPD : VSTQQWBPseudo<IIC_VST4u>; +def VST4d8Pseudo_UPD : VSTQQWBPseudo<IIC_VST4u>, Sched<[WriteVST4]>; +def VST4d16Pseudo_UPD : VSTQQWBPseudo<IIC_VST4u>, Sched<[WriteVST4]>; +def VST4d32Pseudo_UPD : VSTQQWBPseudo<IIC_VST4u>, Sched<[WriteVST4]>; // ...with double-spaced registers: def VST4q8 : VST4D<0b0001, {0,0,?,?}, "8">; @@ -2015,18 +2020,18 @@ def VST4q8_UPD : VST4DWB<0b0001, {0,0,?,?}, "8">; def VST4q16_UPD : VST4DWB<0b0001, {0,1,?,?}, "16">; def VST4q32_UPD : VST4DWB<0b0001, {1,0,?,?}, "32">; -def VST4q8Pseudo_UPD : VSTQQQQWBPseudo<IIC_VST4u>; -def VST4q16Pseudo_UPD : VSTQQQQWBPseudo<IIC_VST4u>; -def VST4q32Pseudo_UPD : VSTQQQQWBPseudo<IIC_VST4u>; +def VST4q8Pseudo_UPD : VSTQQQQWBPseudo<IIC_VST4u>, Sched<[WriteVST4]>; +def VST4q16Pseudo_UPD : VSTQQQQWBPseudo<IIC_VST4u>, Sched<[WriteVST4]>; +def VST4q32Pseudo_UPD : VSTQQQQWBPseudo<IIC_VST4u>, Sched<[WriteVST4]>; // ...alternate versions to be allocated odd register numbers: -def VST4q8oddPseudo : VSTQQQQPseudo<IIC_VST4>; -def VST4q16oddPseudo : VSTQQQQPseudo<IIC_VST4>; -def VST4q32oddPseudo : VSTQQQQPseudo<IIC_VST4>; +def VST4q8oddPseudo : VSTQQQQPseudo<IIC_VST4>, Sched<[WriteVST4]>; +def VST4q16oddPseudo : VSTQQQQPseudo<IIC_VST4>, Sched<[WriteVST4]>; +def VST4q32oddPseudo : VSTQQQQPseudo<IIC_VST4>, Sched<[WriteVST4]>; -def VST4q8oddPseudo_UPD : VSTQQQQWBPseudo<IIC_VST4u>; -def VST4q16oddPseudo_UPD : VSTQQQQWBPseudo<IIC_VST4u>; -def VST4q32oddPseudo_UPD : VSTQQQQWBPseudo<IIC_VST4u>; +def VST4q8oddPseudo_UPD : VSTQQQQWBPseudo<IIC_VST4u>, Sched<[WriteVST4]>; +def VST4q16oddPseudo_UPD : VSTQQQQWBPseudo<IIC_VST4u>, Sched<[WriteVST4]>; +def VST4q32oddPseudo_UPD : VSTQQQQWBPseudo<IIC_VST4u>, Sched<[WriteVST4]>; } // mayStore = 1, hasSideEffects = 0, hasExtraSrcRegAllocReq = 1 @@ -2060,12 +2065,13 @@ class VST1LN<bits<4> op11_8, bits<4> op7_4, string Dt, ValueType Ty, : NLdStLn<1, 0b00, op11_8, op7_4, (outs), (ins AddrMode:$Rn, DPR:$Vd, nohash_imm:$lane), IIC_VST1ln, "vst1", Dt, "\\{$Vd[$lane]\\}, $Rn", "", - [(StoreOp (ExtractOp (Ty DPR:$Vd), imm:$lane), AddrMode:$Rn)]> { + [(StoreOp (ExtractOp (Ty DPR:$Vd), imm:$lane), AddrMode:$Rn)]>, + Sched<[WriteVST1]> { let Rm = 0b1111; let DecoderMethod = "DecodeVST1LN"; } class VST1QLNPseudo<ValueType Ty, PatFrag StoreOp, SDNode ExtractOp> - : VSTQLNPseudo<IIC_VST1ln> { + : VSTQLNPseudo<IIC_VST1ln>, Sched<[WriteVST1]> { let Pattern = [(StoreOp (ExtractOp (Ty QPR:$src), imm:$lane), addrmode6:$addr)]; } @@ -2104,11 +2110,12 @@ class VST1LNWB<bits<4> op11_8, bits<4> op7_4, string Dt, ValueType Ty, "\\{$Vd[$lane]\\}, $Rn$Rm", "$Rn.addr = $wb", [(set GPR:$wb, (StoreOp (ExtractOp (Ty DPR:$Vd), imm:$lane), - AdrMode:$Rn, am6offset:$Rm))]> { + AdrMode:$Rn, am6offset:$Rm))]>, + Sched<[WriteVST1]> { let DecoderMethod = "DecodeVST1LN"; } class VST1QLNWBPseudo<ValueType Ty, PatFrag StoreOp, SDNode ExtractOp> - : VSTQLNWBPseudo<IIC_VST1lnu> { + : VSTQLNWBPseudo<IIC_VST1lnu>, Sched<[WriteVST1]> { let Pattern = [(set GPR:$wb, (StoreOp (ExtractOp (Ty QPR:$src), imm:$lane), addrmode6:$addr, am6offset:$offset))]; } @@ -2139,7 +2146,7 @@ class VST2LN<bits<4> op11_8, bits<4> op7_4, string Dt> : NLdStLn<1, 0b00, op11_8, op7_4, (outs), (ins addrmode6:$Rn, DPR:$Vd, DPR:$src2, 
nohash_imm:$lane), IIC_VST2ln, "vst2", Dt, "\\{$Vd[$lane], $src2[$lane]\\}, $Rn", - "", []> { + "", []>, Sched<[WriteVST1]> { let Rm = 0b1111; let Inst{4} = Rn{4}; let DecoderMethod = "DecodeVST2LN"; @@ -2155,9 +2162,9 @@ def VST2LNd32 : VST2LN<0b1001, {?,0,0,?}, "32"> { let Inst{7} = lane{0}; } -def VST2LNd8Pseudo : VSTQLNPseudo<IIC_VST2ln>; -def VST2LNd16Pseudo : VSTQLNPseudo<IIC_VST2ln>; -def VST2LNd32Pseudo : VSTQLNPseudo<IIC_VST2ln>; +def VST2LNd8Pseudo : VSTQLNPseudo<IIC_VST2ln>, Sched<[WriteVST1]>; +def VST2LNd16Pseudo : VSTQLNPseudo<IIC_VST2ln>, Sched<[WriteVST1]>; +def VST2LNd32Pseudo : VSTQLNPseudo<IIC_VST2ln>, Sched<[WriteVST1]>; // ...with double-spaced registers: def VST2LNq16 : VST2LN<0b0101, {?,?,1,?}, "16"> { @@ -2169,8 +2176,8 @@ def VST2LNq32 : VST2LN<0b1001, {?,1,0,?}, "32"> { let Inst{4} = Rn{4}; } -def VST2LNq16Pseudo : VSTQQLNPseudo<IIC_VST2ln>; -def VST2LNq32Pseudo : VSTQQLNPseudo<IIC_VST2ln>; +def VST2LNq16Pseudo : VSTQQLNPseudo<IIC_VST2ln>, Sched<[WriteVST1]>; +def VST2LNq32Pseudo : VSTQQLNPseudo<IIC_VST2ln>, Sched<[WriteVST1]>; // ...with address register writeback: class VST2LNWB<bits<4> op11_8, bits<4> op7_4, string Dt> @@ -2193,9 +2200,9 @@ def VST2LNd32_UPD : VST2LNWB<0b1001, {?,0,0,?}, "32"> { let Inst{7} = lane{0}; } -def VST2LNd8Pseudo_UPD : VSTQLNWBPseudo<IIC_VST2lnu>; -def VST2LNd16Pseudo_UPD : VSTQLNWBPseudo<IIC_VST2lnu>; -def VST2LNd32Pseudo_UPD : VSTQLNWBPseudo<IIC_VST2lnu>; +def VST2LNd8Pseudo_UPD : VSTQLNWBPseudo<IIC_VST2lnu>, Sched<[WriteVST1]>; +def VST2LNd16Pseudo_UPD : VSTQLNWBPseudo<IIC_VST2lnu>, Sched<[WriteVST1]>; +def VST2LNd32Pseudo_UPD : VSTQLNWBPseudo<IIC_VST2lnu>, Sched<[WriteVST1]>; def VST2LNq16_UPD : VST2LNWB<0b0101, {?,?,1,?}, "16"> { let Inst{7-6} = lane{1-0}; @@ -2204,15 +2211,16 @@ def VST2LNq32_UPD : VST2LNWB<0b1001, {?,1,0,?}, "32"> { let Inst{7} = lane{0}; } -def VST2LNq16Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST2lnu>; -def VST2LNq32Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST2lnu>; +def VST2LNq16Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST2lnu>, Sched<[WriteVST1]>; +def VST2LNq32Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST2lnu>, Sched<[WriteVST1]>; // VST3LN : Vector Store (single 3-element structure from one lane) class VST3LN<bits<4> op11_8, bits<4> op7_4, string Dt> : NLdStLn<1, 0b00, op11_8, op7_4, (outs), (ins addrmode6:$Rn, DPR:$Vd, DPR:$src2, DPR:$src3, nohash_imm:$lane), IIC_VST3ln, "vst3", Dt, - "\\{$Vd[$lane], $src2[$lane], $src3[$lane]\\}, $Rn", "", []> { + "\\{$Vd[$lane], $src2[$lane], $src3[$lane]\\}, $Rn", "", []>, + Sched<[WriteVST2]> { let Rm = 0b1111; let DecoderMethod = "DecodeVST3LN"; } @@ -2227,9 +2235,9 @@ def VST3LNd32 : VST3LN<0b1010, {?,0,0,0}, "32"> { let Inst{7} = lane{0}; } -def VST3LNd8Pseudo : VSTQQLNPseudo<IIC_VST3ln>; -def VST3LNd16Pseudo : VSTQQLNPseudo<IIC_VST3ln>; -def VST3LNd32Pseudo : VSTQQLNPseudo<IIC_VST3ln>; +def VST3LNd8Pseudo : VSTQQLNPseudo<IIC_VST3ln>, Sched<[WriteVST2]>; +def VST3LNd16Pseudo : VSTQQLNPseudo<IIC_VST3ln>, Sched<[WriteVST2]>; +def VST3LNd32Pseudo : VSTQQLNPseudo<IIC_VST3ln>, Sched<[WriteVST2]>; // ...with double-spaced registers: def VST3LNq16 : VST3LN<0b0110, {?,?,1,0}, "16"> { @@ -2263,9 +2271,9 @@ def VST3LNd32_UPD : VST3LNWB<0b1010, {?,0,0,0}, "32"> { let Inst{7} = lane{0}; } -def VST3LNd8Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST3lnu>; -def VST3LNd16Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST3lnu>; -def VST3LNd32Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST3lnu>; +def VST3LNd8Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST3lnu>, Sched<[WriteVST2]>; +def VST3LNd16Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST3lnu>, Sched<[WriteVST2]>; 
+def VST3LNd32Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST3lnu>, Sched<[WriteVST2]>; def VST3LNq16_UPD : VST3LNWB<0b0110, {?,?,1,0}, "16"> { let Inst{7-6} = lane{1-0}; @@ -2274,8 +2282,8 @@ def VST3LNq32_UPD : VST3LNWB<0b1010, {?,1,0,0}, "32"> { let Inst{7} = lane{0}; } -def VST3LNq16Pseudo_UPD : VSTQQQQLNWBPseudo<IIC_VST3lnu>; -def VST3LNq32Pseudo_UPD : VSTQQQQLNWBPseudo<IIC_VST3lnu>; +def VST3LNq16Pseudo_UPD : VSTQQQQLNWBPseudo<IIC_VST3lnu>, Sched<[WriteVST2]>; +def VST3LNq32Pseudo_UPD : VSTQQQQLNWBPseudo<IIC_VST3lnu>, Sched<[WriteVST2]>; // VST4LN : Vector Store (single 4-element structure from one lane) class VST4LN<bits<4> op11_8, bits<4> op7_4, string Dt> @@ -2283,7 +2291,7 @@ class VST4LN<bits<4> op11_8, bits<4> op7_4, string Dt> (ins addrmode6:$Rn, DPR:$Vd, DPR:$src2, DPR:$src3, DPR:$src4, nohash_imm:$lane), IIC_VST4ln, "vst4", Dt, "\\{$Vd[$lane], $src2[$lane], $src3[$lane], $src4[$lane]\\}, $Rn", - "", []> { + "", []>, Sched<[WriteVST2]> { let Rm = 0b1111; let Inst{4} = Rn{4}; let DecoderMethod = "DecodeVST4LN"; @@ -2300,9 +2308,9 @@ def VST4LNd32 : VST4LN<0b1011, {?,0,?,?}, "32"> { let Inst{5} = Rn{5}; } -def VST4LNd8Pseudo : VSTQQLNPseudo<IIC_VST4ln>; -def VST4LNd16Pseudo : VSTQQLNPseudo<IIC_VST4ln>; -def VST4LNd32Pseudo : VSTQQLNPseudo<IIC_VST4ln>; +def VST4LNd8Pseudo : VSTQQLNPseudo<IIC_VST4ln>, Sched<[WriteVST2]>; +def VST4LNd16Pseudo : VSTQQLNPseudo<IIC_VST4ln>, Sched<[WriteVST2]>; +def VST4LNd32Pseudo : VSTQQLNPseudo<IIC_VST4ln>, Sched<[WriteVST2]>; // ...with double-spaced registers: def VST4LNq16 : VST4LN<0b0111, {?,?,1,?}, "16"> { @@ -2313,8 +2321,8 @@ def VST4LNq32 : VST4LN<0b1011, {?,1,?,?}, "32"> { let Inst{5} = Rn{5}; } -def VST4LNq16Pseudo : VSTQQQQLNPseudo<IIC_VST4ln>; -def VST4LNq32Pseudo : VSTQQQQLNPseudo<IIC_VST4ln>; +def VST4LNq16Pseudo : VSTQQQQLNPseudo<IIC_VST4ln>, Sched<[WriteVST2]>; +def VST4LNq32Pseudo : VSTQQQQLNPseudo<IIC_VST4ln>, Sched<[WriteVST2]>; // ...with address register writeback: class VST4LNWB<bits<4> op11_8, bits<4> op7_4, string Dt> @@ -2339,9 +2347,9 @@ def VST4LNd32_UPD : VST4LNWB<0b1011, {?,0,?,?}, "32"> { let Inst{5} = Rn{5}; } -def VST4LNd8Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST4lnu>; -def VST4LNd16Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST4lnu>; -def VST4LNd32Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST4lnu>; +def VST4LNd8Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST4lnu>, Sched<[WriteVST2]>; +def VST4LNd16Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST4lnu>, Sched<[WriteVST2]>; +def VST4LNd32Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST4lnu>, Sched<[WriteVST2]>; def VST4LNq16_UPD : VST4LNWB<0b0111, {?,?,1,?}, "16"> { let Inst{7-6} = lane{1-0}; @@ -2351,8 +2359,8 @@ def VST4LNq32_UPD : VST4LNWB<0b1011, {?,1,?,?}, "32"> { let Inst{5} = Rn{5}; } -def VST4LNq16Pseudo_UPD : VSTQQQQLNWBPseudo<IIC_VST4lnu>; -def VST4LNq32Pseudo_UPD : VSTQQQQLNWBPseudo<IIC_VST4lnu>; +def VST4LNq16Pseudo_UPD : VSTQQQQLNWBPseudo<IIC_VST4lnu>, Sched<[WriteVST2]>; +def VST4LNq32Pseudo_UPD : VSTQQQQLNWBPseudo<IIC_VST4lnu>, Sched<[WriteVST2]>; } // mayStore = 1, hasSideEffects = 0, hasExtraSrcRegAllocReq = 1 diff --git a/lib/Target/ARM/ARMSchedule.td b/lib/Target/ARM/ARMSchedule.td index 87eb4c2b9074..ec5b97cba8cd 100644 --- a/lib/Target/ARM/ARMSchedule.td +++ b/lib/Target/ARM/ARMSchedule.td @@ -131,6 +131,17 @@ def WriteFPDIV64 : SchedWrite; def WriteFPSQRT32 : SchedWrite; def WriteFPSQRT64 : SchedWrite; +// Vector load and stores +def WriteVLD1 : SchedWrite; +def WriteVLD2 : SchedWrite; +def WriteVLD3 : SchedWrite; +def WriteVLD4 : SchedWrite; +def WriteVST1 : SchedWrite; +def WriteVST2 : SchedWrite; +def WriteVST3 : 
SchedWrite; +def WriteVST4 : SchedWrite; + + // Define TII for use in SchedVariant Predicates. def : PredicateProlog<[{ const ARMBaseInstrInfo *TII = diff --git a/lib/Target/ARM/ARMScheduleA9.td b/lib/Target/ARM/ARMScheduleA9.td index 8fb8a2a3b6d2..4e72b13d94cb 100644 --- a/lib/Target/ARM/ARMScheduleA9.td +++ b/lib/Target/ARM/ARMScheduleA9.td @@ -1981,6 +1981,15 @@ def A9WriteV7 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 7; } def A9WriteV9 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 9; } def A9WriteV10 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 10; } +def : WriteRes<WriteVLD1, []>; +def : WriteRes<WriteVLD2, []>; +def : WriteRes<WriteVLD3, []>; +def : WriteRes<WriteVLD4, []>; +def : WriteRes<WriteVST1, []>; +def : WriteRes<WriteVST2, []>; +def : WriteRes<WriteVST3, []>; +def : WriteRes<WriteVST4, []>; + // Reserve A9UnitFP for 2 consecutive cycles. def A9Write2V4 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 4; diff --git a/lib/Target/ARM/ARMScheduleR52.td b/lib/Target/ARM/ARMScheduleR52.td index 537e5da9669f..782be9b60a7a 100644 --- a/lib/Target/ARM/ARMScheduleR52.td +++ b/lib/Target/ARM/ARMScheduleR52.td @@ -120,6 +120,12 @@ def : WriteRes<WriteFPDIV64, [R52UnitDiv]> { def : WriteRes<WriteFPSQRT32, [R52UnitDiv]> { let Latency = 7; } def : WriteRes<WriteFPSQRT64, [R52UnitDiv]> { let Latency = 17; } +// Overriden via InstRW for this processor. +def : WriteRes<WriteVST1, []>; +def : WriteRes<WriteVST2, []>; +def : WriteRes<WriteVST3, []>; +def : WriteRes<WriteVST4, []>; + def : ReadAdvance<ReadFPMUL, 1>; // mul operand read in F1 def : ReadAdvance<ReadFPMAC, 1>; // fp-mac operand read in F1 @@ -712,20 +718,20 @@ def R52WriteSTM : SchedWriteVariant<[ // Vector Load/Stores. Can issue only in slot-0. Can dual-issue with // another instruction in slot-1, but only in the last issue. -def R52WriteVLD1Mem : SchedWriteRes<[R52UnitLd]> { let Latency = 5;} -def R52WriteVLD2Mem : SchedWriteRes<[R52UnitLd]> { +def : WriteRes<WriteVLD1, [R52UnitLd]> { let Latency = 5;} +def : WriteRes<WriteVLD2, [R52UnitLd]> { let Latency = 6; let NumMicroOps = 3; let ResourceCycles = [2]; let SingleIssue = 1; } -def R52WriteVLD3Mem : SchedWriteRes<[R52UnitLd]> { +def : WriteRes<WriteVLD3, [R52UnitLd]> { let Latency = 7; let NumMicroOps = 5; let ResourceCycles = [3]; let SingleIssue = 1; } -def R52WriteVLD4Mem : SchedWriteRes<[R52UnitLd]> { +def : WriteRes<WriteVLD4, [R52UnitLd]> { let Latency = 8; let NumMicroOps = 7; let ResourceCycles = [4]; @@ -829,95 +835,6 @@ def : InstRW<[R52WriteFPALU_F4, R52Read_F1, R52Read_F1], (instregex "VRSHL", "VR def : InstRW<[R52WriteFPALU_F3, R52Read_F1, R52Read_F1], (instregex "VSWP", "VTRN", "VUZP", "VZIP")>; //--- -// VLDx. 
Vector Loads -//--- -// 1-element structure load -def : InstRW<[R52WriteVLD1Mem, R52Read_ISS], (instregex "VLD1d(8|16|32|64)$")>; -def : InstRW<[R52WriteVLD2Mem, R52Read_ISS], (instregex "VLD1q(8|16|32|64)$")>; -def : InstRW<[R52WriteVLD3Mem, R52Read_ISS], (instregex "VLD1d(8|16|32|64)T$")>; -def : InstRW<[R52WriteVLD4Mem, R52Read_ISS], (instregex "VLD1d(8|16|32|64)Q$")>; -def : InstRW<[R52WriteVLD3Mem, R52Read_ISS], (instregex "VLD1d64TPseudo$")>; -def : InstRW<[R52WriteVLD4Mem, R52Read_ISS], (instregex "VLD1d64QPseudo$")>; - -def : InstRW<[R52WriteVLD1Mem, R52Read_ISS], (instregex "VLD1(LN|DUP)d(8|16|32)$")>; -def : InstRW<[R52WriteVLD1Mem, R52Read_ISS], (instregex "VLD1LNdAsm_(8|16|32)")>; -def : InstRW<[R52WriteVLD1Mem, R52Read_ISS], (instregex "VLD1(LN|DUP)q(8|16|32)Pseudo$")>; - -def : InstRW<[R52WriteVLD1Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD1d(8|16|32|64)wb")>; -def : InstRW<[R52WriteVLD2Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD1q(8|16|32|64)wb")>; -def : InstRW<[R52WriteVLD3Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD1d(8|16|32|64)Twb")>; -def : InstRW<[R52WriteVLD4Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD1d(8|16|32|64)Qwb")>; -def : InstRW<[R52WriteVLD3Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD1d64TPseudoWB")>; -def : InstRW<[R52WriteVLD4Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD1d64QPseudoWB")>; - -def : InstRW<[R52WriteVLD1Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD1LNd(8|16|32)_UPD")>; -def : InstRW<[R52WriteVLD1Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD1LNdWB_(fixed|register)_Asm_(8|16|32)")>; -def : InstRW<[R52WriteVLD1Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD1DUP(d|q)(8|16|32)wb")>; -def : InstRW<[R52WriteVLD1Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD1(LN|DUP)q(8|16|32)Pseudo_UPD")>; - -// 2-element structure load -def : InstRW<[R52WriteVLD2Mem, R52Read_ISS], (instregex "VLD2(d|b)(8|16|32)$")>; -def : InstRW<[R52WriteVLD4Mem, R52Read_ISS], (instregex "VLD2q(8|16|32)$")>; -def : InstRW<[R52WriteVLD2Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD2(d|b)(8|16|32)wb")>; -def : InstRW<[R52WriteVLD4Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD2q(8|16|32)wb")>; -def : InstRW<[R52WriteVLD4Mem, R52Read_ISS], (instregex "VLD2q(8|16|32)Pseudo$")>; -def : InstRW<[R52WriteVLD4Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD2q(8|16|32)PseudoWB")>; - -def : InstRW<[R52WriteVLD1Mem, R52Read_ISS], (instregex "VLD2LNd(8|16|32)$")>; -def : InstRW<[R52WriteVLD1Mem, R52Read_ISS], (instregex "VLD2LNdAsm_(8|16|32)$")>; -def : InstRW<[R52WriteVLD1Mem, R52Read_ISS], (instregex "VLD2LNq(16|32)$")>; -def : InstRW<[R52WriteVLD1Mem, R52Read_ISS], (instregex "VLD2LNqAsm_(16|32)$")>; -def : InstRW<[R52WriteVLD1Mem, R52Read_ISS], (instregex "VLD2DUPd(8|16|32)$")>; -def : InstRW<[R52WriteVLD1Mem, R52Read_ISS], (instregex "VLD2DUPd(8|16|32)x2$")>; -def : InstRW<[R52WriteVLD1Mem, R52Read_ISS], (instregex "VLD2LNd(8|16|32)Pseudo")>; -def : InstRW<[R52WriteVLD1Mem, R52Read_ISS], (instregex "VLD2LNq(16|32)Pseudo")>; - -def : InstRW<[R52WriteVLD1Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD2LNd(8|16|32)_UPD")>; -def : InstRW<[R52WriteVLD1Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD2LNdWB_(fixed|register)_Asm_(8|16|32)")>; - -def : InstRW<[R52WriteVLD1Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD2LNq(16|32)_UPD")>; -def : InstRW<[R52WriteVLD1Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD2LNqWB_(fixed|register)_Asm_(16|32)")>; - -def : InstRW<[R52WriteVLD1Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD2DUPd(8|16|32)wb")>; -def : InstRW<[R52WriteVLD1Mem, 
R52WriteAdr, R52Read_ISS], (instregex "VLD2DUPd(8|16|32)x2wb")>; -def : InstRW<[R52WriteVLD1Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD2LNd(8|16|32)Pseudo_UPD")>; -def : InstRW<[R52WriteVLD1Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD2LNq(16|32)Pseudo_UPD")>; - -// 3-element structure load -def : InstRW<[R52WriteVLD3Mem, R52Read_ISS], (instregex "VLD3(d|q)(8|16|32)$")>; -def : InstRW<[R52WriteVLD3Mem, R52Read_ISS], (instregex "VLD3(d|q)Asm_(8|16|32)$")>; -def : InstRW<[R52WriteVLD3Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD3(d|q)(8|16|32)_UPD")>; -def : InstRW<[R52WriteVLD3Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD3(d|q)WB_(fixed|register)_Asm_(8|16|32)")>; -def : InstRW<[R52WriteVLD3Mem, R52Read_ISS], (instregex "VLD3(d|q)(8|16|32)(oddP|P)seudo")>; -def : InstRW<[R52WriteVLD3Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD3(d|q)(8|16|32)(oddP|P)seudo_UPD")>; - -def : InstRW<[R52WriteVLD2Mem, R52Read_ISS], (instregex "VLD3(LN|DUP)(d|q)(8|16|32)$")>; -def : InstRW<[R52WriteVLD2Mem, R52Read_ISS], (instregex "VLD3(LN|DUP)(d|q)Asm_(8|16|32)$")>; -def : InstRW<[R52WriteVLD2Mem, R52Read_ISS], (instregex "VLD3(LN|DUP)(d|q)(8|16|32)Pseudo$")>; - -def : InstRW<[R52WriteVLD2Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD3(LN|DUP)(d|q)(8|16|32)_UPD")>; -def : InstRW<[R52WriteVLD2Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD3(LN|DUP)(d|q)WB_(fixed|register)_Asm_(8|16|32)")>; -def : InstRW<[R52WriteVLD2Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD3(LN|DUP)(d|q)WB_(fixed|register)_Asm_(8|16|32)")>; -def : InstRW<[R52WriteVLD2Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD3(LN|DUP)(d|q)(8|16|32)Pseudo_UPD")>; - -// 4-element structure load -def : InstRW<[R52WriteVLD4Mem, R52Read_ISS], (instregex "VLD4(d|q)(8|16|32)$")>; -def : InstRW<[R52WriteVLD4Mem, R52Read_ISS], (instregex "VLD4(d|q)Asm_(8|16|32)$")>; -def : InstRW<[R52WriteVLD4Mem, R52Read_ISS], (instregex "VLD4(d|q)(8|16|32)(oddP|P)seudo")>; -def : InstRW<[R52WriteVLD4Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD4(d|q)(8|16|32)_UPD")>; -def : InstRW<[R52WriteVLD4Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD4(d|q)WB_(fixed|register)_Asm_(8|16|32)")>; -def : InstRW<[R52WriteVLD4Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD4(d|q)(8|16|32)(oddP|P)seudo_UPD")>; - - -def : InstRW<[R52WriteVLD2Mem, R52Read_ISS], (instregex "VLD4(LN|DUP)(d|q)(8|16|32)$")>; -def : InstRW<[R52WriteVLD2Mem, R52Read_ISS], (instregex "VLD4(LN|DUP)(d|q)Asm_(8|16|32)$")>; -def : InstRW<[R52WriteVLD2Mem, R52Read_ISS], (instregex "VLD4LN(d|q)(8|16|32)Pseudo$")>; -def : InstRW<[R52WriteVLD2Mem, R52Read_ISS], (instregex "VLD4DUPd(8|16|32)Pseudo$")>; -def : InstRW<[R52WriteVLD2Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD4(LN|DUP)(d|q)(8|16|32)_UPD")>; -def : InstRW<[R52WriteVLD2Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD4(LN|DUP)(d|q)WB_(fixed|register)_Asm_(8|16|32)")>; -def : InstRW<[R52WriteVLD2Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD4(LN|DUP)(d|q)(8|16|32)Pseudo_UPD")>; - -//--- // VSTx. Vector Stores //--- // 1-element structure store diff --git a/lib/Target/ARM/ARMScheduleSwift.td b/lib/Target/ARM/ARMScheduleSwift.td index dc041c6c6006..b838688c6f04 100644 --- a/lib/Target/ARM/ARMScheduleSwift.td +++ b/lib/Target/ARM/ARMScheduleSwift.td @@ -1070,6 +1070,16 @@ let SchedModel = SwiftModel in { def : ReadAdvance<ReadFPMUL, 0>; def : ReadAdvance<ReadFPMAC, 0>; + // Overriden via InstRW for this processor. 
+  def : WriteRes<WriteVLD1, []>;
+  def : WriteRes<WriteVLD2, []>;
+  def : WriteRes<WriteVLD3, []>;
+  def : WriteRes<WriteVLD4, []>;
+  def : WriteRes<WriteVST1, []>;
+  def : WriteRes<WriteVST2, []>;
+  def : WriteRes<WriteVST3, []>;
+  def : WriteRes<WriteVST4, []>;
+
   // Not specified.
   def : InstRW<[SwiftWriteP01OneCycle2x], (instregex "ABS")>;
   // Preload.
diff --git a/lib/Target/ARM/ARMTargetMachine.cpp b/lib/Target/ARM/ARMTargetMachine.cpp
index 1979cbf50125..c4f23c66e4ea 100644
--- a/lib/Target/ARM/ARMTargetMachine.cpp
+++ b/lib/Target/ARM/ARMTargetMachine.cpp
@@ -85,9 +85,9 @@ namespace llvm {
 extern "C" void LLVMInitializeARMTarget() {
   // Register the target.
   RegisterTargetMachine<ARMLETargetMachine> X(getTheARMLETarget());
+  RegisterTargetMachine<ARMLETargetMachine> A(getTheThumbLETarget());
   RegisterTargetMachine<ARMBETargetMachine> Y(getTheARMBETarget());
-  RegisterTargetMachine<ThumbLETargetMachine> A(getTheThumbLETarget());
-  RegisterTargetMachine<ThumbBETargetMachine> B(getTheThumbBETarget());
+  RegisterTargetMachine<ARMBETargetMachine> B(getTheThumbBETarget());
 
   PassRegistry &Registry = *PassRegistry::getPassRegistry();
   initializeGlobalISel(Registry);
@@ -263,6 +263,11 @@ ARMBaseTargetMachine::ARMBaseTargetMachine(const Target &T, const Triple &TT,
     else
       this->Options.EABIVersion = EABI::EABI5;
   }
+
+  initAsmInfo();
+  if (!Subtarget.isThumb() && !Subtarget.hasARMOps())
+    report_fatal_error("CPU: '" + Subtarget.getCPUString() + "' does not "
+                       "support ARM mode execution!");
 }
 
 ARMBaseTargetMachine::~ARMBaseTargetMachine() = default;
@@ -355,22 +360,6 @@ TargetIRAnalysis ARMBaseTargetMachine::getTargetIRAnalysis() {
   });
 }
 
-void ARMTargetMachine::anchor() {}
-
-ARMTargetMachine::ARMTargetMachine(const Target &T, const Triple &TT,
-                                   StringRef CPU, StringRef FS,
-                                   const TargetOptions &Options,
-                                   Optional<Reloc::Model> RM,
-                                   CodeModel::Model CM, CodeGenOpt::Level OL,
-                                   bool isLittle)
-    : ARMBaseTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, isLittle) {
-  initAsmInfo();
-  if (!Subtarget.hasARMOps())
-    report_fatal_error("CPU: '" + Subtarget.getCPUString() + "' does not "
-                       "support ARM mode execution!");
-}
-
-void ARMLETargetMachine::anchor() {}
 
 ARMLETargetMachine::ARMLETargetMachine(const Target &T, const Triple &TT,
                                        StringRef CPU, StringRef FS,
@@ -378,9 +367,7 @@ ARMLETargetMachine::ARMLETargetMachine(const Target &T, const Triple &TT,
                                        Optional<Reloc::Model> RM,
                                        CodeModel::Model CM,
                                        CodeGenOpt::Level OL)
-    : ARMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {}
-
-void ARMBETargetMachine::anchor() {}
+    : ARMBaseTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {}
 
 ARMBETargetMachine::ARMBETargetMachine(const Target &T, const Triple &TT,
                                        StringRef CPU, StringRef FS,
@@ -388,39 +375,7 @@ ARMBETargetMachine::ARMBETargetMachine(const Target &T, const Triple &TT,
                                        Optional<Reloc::Model> RM,
                                        CodeModel::Model CM,
                                        CodeGenOpt::Level OL)
-    : ARMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {}
-
-void ThumbTargetMachine::anchor() {}
-
-ThumbTargetMachine::ThumbTargetMachine(const Target &T, const Triple &TT,
-                                       StringRef CPU, StringRef FS,
-                                       const TargetOptions &Options,
-                                       Optional<Reloc::Model> RM,
-                                       CodeModel::Model CM,
-                                       CodeGenOpt::Level OL, bool isLittle)
-    : ARMBaseTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, isLittle) {
-  initAsmInfo();
-}
-
-void ThumbLETargetMachine::anchor() {}
-
-ThumbLETargetMachine::ThumbLETargetMachine(const Target &T, const Triple &TT,
-                                           StringRef CPU, StringRef FS,
-                                           const TargetOptions &Options,
-                                           Optional<Reloc::Model> RM,
-                                           CodeModel::Model CM,
-                                           CodeGenOpt::Level OL)
-    : ThumbTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {}
-
-void ThumbBETargetMachine::anchor() {}
-
-ThumbBETargetMachine::ThumbBETargetMachine(const Target &T, const Triple &TT,
-                                           StringRef CPU, StringRef FS,
-                                           const TargetOptions &Options,
-                                           Optional<Reloc::Model> RM,
-                                           CodeModel::Model CM,
-                                           CodeGenOpt::Level OL)
-    : ThumbTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {}
+    : ARMBaseTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {}
 
 namespace {
diff --git a/lib/Target/ARM/ARMTargetMachine.h b/lib/Target/ARM/ARMTargetMachine.h
index f0ca9427d9fb..e5eb27114c72 100644
--- a/lib/Target/ARM/ARMTargetMachine.h
+++ b/lib/Target/ARM/ARMTargetMachine.h
@@ -62,23 +62,9 @@ public:
   }
 };
 
-/// ARM target machine.
+/// ARM/Thumb little endian target machine.
 ///
-class ARMTargetMachine : public ARMBaseTargetMachine {
-  virtual void anchor();
-
-public:
-  ARMTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
-                   StringRef FS, const TargetOptions &Options,
-                   Optional<Reloc::Model> RM, CodeModel::Model CM,
-                   CodeGenOpt::Level OL, bool isLittle);
-};
-
-/// ARM little endian target machine.
-///
-class ARMLETargetMachine : public ARMTargetMachine {
-  void anchor() override;
-
+class ARMLETargetMachine : public ARMBaseTargetMachine {
 public:
   ARMLETargetMachine(const Target &T, const Triple &TT, StringRef CPU,
                      StringRef FS, const TargetOptions &Options,
@@ -86,11 +72,9 @@ public:
                      CodeGenOpt::Level OL);
 };
 
-/// ARM big endian target machine.
+/// ARM/Thumb big endian target machine.
 ///
-class ARMBETargetMachine : public ARMTargetMachine {
-  void anchor() override;
-
+class ARMBETargetMachine : public ARMBaseTargetMachine {
 public:
   ARMBETargetMachine(const Target &T, const Triple &TT, StringRef CPU,
                      StringRef FS, const TargetOptions &Options,
@@ -98,44 +82,6 @@ public:
                      CodeGenOpt::Level OL);
 };
 
-/// Thumb target machine.
-/// Due to the way architectures are handled, this represents both
-/// Thumb-1 and Thumb-2.
-///
-class ThumbTargetMachine : public ARMBaseTargetMachine {
-  virtual void anchor();
-
-public:
-  ThumbTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
-                     StringRef FS, const TargetOptions &Options,
-                     Optional<Reloc::Model> RM, CodeModel::Model CM,
-                     CodeGenOpt::Level OL, bool isLittle);
-};
-
-/// Thumb little endian target machine.
-///
-class ThumbLETargetMachine : public ThumbTargetMachine {
-  void anchor() override;
-
-public:
-  ThumbLETargetMachine(const Target &T, const Triple &TT, StringRef CPU,
-                       StringRef FS, const TargetOptions &Options,
-                       Optional<Reloc::Model> RM, CodeModel::Model CM,
-                       CodeGenOpt::Level OL);
-};
-
-/// Thumb big endian target machine.
-///
-class ThumbBETargetMachine : public ThumbTargetMachine {
-  void anchor() override;
-
-public:
-  ThumbBETargetMachine(const Target &T, const Triple &TT, StringRef CPU,
-                       StringRef FS, const TargetOptions &Options,
-                       Optional<Reloc::Model> RM, CodeModel::Model CM,
-                       CodeGenOpt::Level OL);
-};
-
 } // end namespace llvm
 
 #endif // LLVM_LIB_TARGET_ARM_ARMTARGETMACHINE_H
diff --git a/lib/Target/ARM/ARMTargetObjectFile.cpp b/lib/Target/ARM/ARMTargetObjectFile.cpp
index 94f9e8dfebbf..edbf2b99126c 100644
--- a/lib/Target/ARM/ARMTargetObjectFile.cpp
+++ b/lib/Target/ARM/ARMTargetObjectFile.cpp
@@ -30,8 +30,8 @@ using namespace dwarf;
 
 void ARMElfTargetObjectFile::Initialize(MCContext &Ctx,
                                         const TargetMachine &TM) {
-  const ARMTargetMachine &ARM_TM = static_cast<const ARMTargetMachine &>(TM);
-  bool isAAPCS_ABI = ARM_TM.TargetABI == ARMTargetMachine::ARMABI::ARM_ABI_AAPCS;
+  const ARMBaseTargetMachine &ARM_TM = static_cast<const ARMBaseTargetMachine &>(TM);
+  bool isAAPCS_ABI = ARM_TM.TargetABI == ARMBaseTargetMachine::ARMABI::ARM_ABI_AAPCS;
   genExecuteOnly = ARM_TM.getSubtargetImpl()->genExecuteOnly();
 
   TargetLoweringObjectFileELF::Initialize(Ctx, TM);
diff --git a/lib/Target/ARM/Thumb1FrameLowering.cpp b/lib/Target/ARM/Thumb1FrameLowering.cpp
index 1a17d4e33e4f..f917c35b9ceb 100644
--- a/lib/Target/ARM/Thumb1FrameLowering.cpp
+++ b/lib/Target/ARM/Thumb1FrameLowering.cpp
@@ -535,14 +535,14 @@ bool Thumb1FrameLowering::emitPopSpecialFixUp(MachineBasicBlock &MBB,
   // Look for a temporary register to use.
   // First, compute the liveness information.
-  LivePhysRegs UsedRegs(STI.getRegisterInfo());
+  const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
+  LivePhysRegs UsedRegs(TRI);
   UsedRegs.addLiveOuts(MBB);
   // The semantic of pristines changed recently and now,
   // the callee-saved registers that are touched in the function
   // are not part of the pristines set anymore.
   // Add those callee-saved now.
-  const TargetRegisterInfo *TRI = STI.getRegisterInfo();
-  const MCPhysReg *CSRegs = TRI->getCalleeSavedRegs(&MF);
+  const MCPhysReg *CSRegs = TRI.getCalleeSavedRegs(&MF);
   for (unsigned i = 0; CSRegs[i]; ++i)
     UsedRegs.addReg(CSRegs[i]);
 
@@ -561,12 +561,12 @@ bool Thumb1FrameLowering::emitPopSpecialFixUp(MachineBasicBlock &MBB,
   // And some temporary register, just in case.
   unsigned TemporaryReg = 0;
   BitVector PopFriendly =
-    TRI->getAllocatableSet(MF, TRI->getRegClass(ARM::tGPRRegClassID));
+    TRI.getAllocatableSet(MF, TRI.getRegClass(ARM::tGPRRegClassID));
   assert(PopFriendly.any() && "No allocatable pop-friendly register?!");
   // Rebuild the GPRs from the high registers because they are removed
   // form the GPR reg class for thumb1.
   BitVector GPRsNoLRSP =
-    TRI->getAllocatableSet(MF, TRI->getRegClass(ARM::hGPRRegClassID));
+    TRI.getAllocatableSet(MF, TRI.getRegClass(ARM::hGPRRegClassID));
   GPRsNoLRSP |= PopFriendly;
   GPRsNoLRSP.reset(ARM::LR);
   GPRsNoLRSP.reset(ARM::SP);
diff --git a/lib/Target/AVR/AVRInstrInfo.td b/lib/Target/AVR/AVRInstrInfo.td
index 06ad2b3ffdf8..f10ca394f36c 100644
--- a/lib/Target/AVR/AVRInstrInfo.td
+++ b/lib/Target/AVR/AVRInstrInfo.td
@@ -902,7 +902,6 @@ let Defs = [SREG] in
   // CPI Rd, K
   // Compares a register with an 8 bit immediate.
-  let Uses = [SREG] in
   def CPIRdK : FRdK<0b0011,
                     (outs),
                     (ins GPR8:$rd, imm_ldi8:$k),
diff --git a/lib/Target/BPF/BPFISelLowering.cpp b/lib/Target/BPF/BPFISelLowering.cpp
index 6897161c903c..cc7a7c3849bc 100644
--- a/lib/Target/BPF/BPFISelLowering.cpp
+++ b/lib/Target/BPF/BPFISelLowering.cpp
@@ -132,6 +132,10 @@ BPFTargetLowering::BPFTargetLowering(const TargetMachine &TM,
   MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = 128;
 }
 
+bool BPFTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
+  return false;
+}
+
 SDValue BPFTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   switch (Op.getOpcode()) {
   case ISD::BR_CC:
@@ -496,8 +500,11 @@ const char *BPFTargetLowering::getTargetNodeName(unsigned Opcode) const {
 
 SDValue BPFTargetLowering::LowerGlobalAddress(SDValue Op,
                                               SelectionDAG &DAG) const {
+  auto N = cast<GlobalAddressSDNode>(Op);
+  assert(N->getOffset() == 0 && "Invalid offset for global address");
+
   SDLoc DL(Op);
-  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
+  const GlobalValue *GV = N->getGlobal();
   SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i64);
 
   return DAG.getNode(BPFISD::Wrapper, DL, MVT::i64, GA);
diff --git a/lib/Target/BPF/BPFISelLowering.h b/lib/Target/BPF/BPFISelLowering.h
index 3d1726be286e..0b8a8ca20c3b 100644
--- a/lib/Target/BPF/BPFISelLowering.h
+++ b/lib/Target/BPF/BPFISelLowering.h
@@ -42,6 +42,10 @@ public:
   // This method returns the name of a target specific DAG node.
   const char *getTargetNodeName(unsigned Opcode) const override;
 
+  // This method decides whether folding a constant offset
+  // with the given GlobalAddress is legal.
+  bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override;
+
   MachineBasicBlock *
   EmitInstrWithCustomInserter(MachineInstr &MI,
                               MachineBasicBlock *BB) const override;
diff --git a/lib/Target/Hexagon/HexagonFrameLowering.cpp b/lib/Target/Hexagon/HexagonFrameLowering.cpp
index a04aca4afa0f..25018b9ed510 100644
--- a/lib/Target/Hexagon/HexagonFrameLowering.cpp
+++ b/lib/Target/Hexagon/HexagonFrameLowering.cpp
@@ -1657,7 +1657,7 @@ bool HexagonFrameLowering::expandStoreVec2(MachineBasicBlock &B,
   // defined. From the point of view of the liveness tracking, it is ok to
   // store it as a whole, but if we break it up we may end up storing a
   // register that is entirely undefined.
-  LivePhysRegs LPR(&HRI);
+  LivePhysRegs LPR(HRI);
   LPR.addLiveIns(B);
   SmallVector<std::pair<unsigned, const MachineOperand*>,2> Clobbers;
   for (auto R = B.begin(); R != It; ++R) {
diff --git a/lib/Target/Hexagon/HexagonInstrInfo.cpp b/lib/Target/Hexagon/HexagonInstrInfo.cpp
index 03794511414e..66e07c67958e 100644
--- a/lib/Target/Hexagon/HexagonInstrInfo.cpp
+++ b/lib/Target/Hexagon/HexagonInstrInfo.cpp
@@ -1254,7 +1254,7 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
       const MachineOperand &Op1 = MI.getOperand(1);
       const MachineOperand &Op2 = MI.getOperand(2);
       const MachineOperand &Op3 = MI.getOperand(3);
-      LivePhysRegs LiveAtMI(&HRI);
+      LivePhysRegs LiveAtMI(HRI);
       getLiveRegsAt(LiveAtMI, MI);
       bool IsDestLive = !LiveAtMI.available(MRI, Op0.getReg());
       if (Op0.getReg() != Op2.getReg()) {
@@ -1283,7 +1283,7 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
       MachineOperand &Op1 = MI.getOperand(1);
       MachineOperand &Op2 = MI.getOperand(2);
       MachineOperand &Op3 = MI.getOperand(3);
-      LivePhysRegs LiveAtMI(&HRI);
+      LivePhysRegs LiveAtMI(HRI);
       getLiveRegsAt(LiveAtMI, MI);
       bool IsDestLive = !LiveAtMI.available(MRI, Op0.getReg());
diff --git a/lib/Target/Hexagon/HexagonPseudo.td b/lib/Target/Hexagon/HexagonPseudo.td
index 0f99dfe342b8..93fb688fc1c0 100644
--- a/lib/Target/Hexagon/HexagonPseudo.td
+++ b/lib/Target/Hexagon/HexagonPseudo.td
@@ -412,6 +412,15 @@ def PS_vstorerwu_ai: STrivv_template<VecDblRegs, V6_vS32Ub_ai>,
 def PS_vstorerwu_ai_128B: STrivv_template<VecDblRegs128B, V6_vS32Ub_ai_128B>,
   Requires<[HasV60T,UseHVXDbl]>;
 
+let isPseudo = 1, isCodeGenOnly = 1, mayStore = 1, hasSideEffects = 0 in {
+  def PS_vstorerq_ai: Pseudo<(outs),
+        (ins IntRegs:$Rs, s32_0Imm:$Off, VecPredRegs:$Qt), "", []>,
+        Requires<[HasV60T,UseHVXSgl]>;
+  def PS_vstorerq_ai_128B: Pseudo<(outs),
+        (ins IntRegs:$Rs, s32_0Imm:$Off, VecPredRegs128B:$Qt), "", []>,
+        Requires<[HasV60T,UseHVXDbl]>;
+}
+
 // Vector load pseudos
 let Predicates = [HasV60T, UseHVX], isPseudo = 1, isCodeGenOnly = 1,
     mayLoad = 1, hasSideEffects = 0 in
@@ -429,30 +438,16 @@ def PS_vloadrwu_ai: LDrivv_template<VecDblRegs, V6_vL32Ub_ai>,
 def PS_vloadrwu_ai_128B: LDrivv_template<VecDblRegs128B, V6_vL32Ub_ai_128B>,
   Requires<[HasV60T,UseHVXDbl]>;
 
-// Store vector predicate pseudo.
-let isExtendable = 1, opExtendable = 1, isExtentSigned = 1, opExtentBits = 13,
-    isCodeGenOnly = 1, isPseudo = 1, mayStore = 1, hasSideEffects = 0 in {
-  def PS_vstorerq_ai : STInst<(outs),
-        (ins IntRegs:$base, s32_0Imm:$offset, VecPredRegs:$src1),
-        ".error \"should not emit\" ", []>,
-        Requires<[HasV60T,UseHVXSgl]>;
-
-  def PS_vstorerq_ai_128B : STInst<(outs),
-        (ins IntRegs:$base, s32_0Imm:$offset, VectorRegs:$src1),
-        ".error \"should not emit\" ", []>,
-        Requires<[HasV60T,UseHVXSgl]>;
-
-  def PS_vloadrq_ai : STInst<(outs),
-        (ins IntRegs:$base, s32_0Imm:$offset, VecPredRegs128B:$src1),
-        ".error \"should not emit\" ", []>,
-        Requires<[HasV60T,UseHVXDbl]>;
-
-  def PS_vloadrq_ai_128B : STInst<(outs),
-        (ins IntRegs:$base, s32_0Imm:$offset, VecPredRegs128B:$src1),
-        ".error \"should not emit\" ", []>,
-        Requires<[HasV60T,UseHVXDbl]>;
+let isPseudo = 1, isCodeGenOnly = 1, mayLoad = 1, hasSideEffects = 0 in {
+  def PS_vloadrq_ai: Pseudo<(outs VecPredRegs:$Qd),
+        (ins IntRegs:$Rs, s32_0Imm:$Off), "", []>,
+        Requires<[HasV60T,UseHVXSgl]>;
+  def PS_vloadrq_ai_128B: Pseudo<(outs VecPredRegs128B:$Qd),
+        (ins IntRegs:$Rs, s32_0Imm:$Off), "", []>,
+        Requires<[HasV60T,UseHVXDbl]>;
 }
+
 let isCodeGenOnly = 1, isPseudo = 1, hasSideEffects = 0 in
 class VSELInst<dag outs, dag ins, InstHexagon rootInst>
   : InstHexagon<outs, ins, "", [], "", rootInst.Itinerary, rootInst.Type>;
diff --git a/lib/Target/Hexagon/HexagonRegisterInfo.cpp b/lib/Target/Hexagon/HexagonRegisterInfo.cpp
index 2a1bb63af789..1fc157900ed5 100644
--- a/lib/Target/Hexagon/HexagonRegisterInfo.cpp
+++ b/lib/Target/Hexagon/HexagonRegisterInfo.cpp
@@ -50,11 +50,6 @@ bool HexagonRegisterInfo::isEHReturnCalleeSaveReg(unsigned R) const {
          R == Hexagon::R3 || R == Hexagon::D0 || R == Hexagon::D1;
 }
 
-bool HexagonRegisterInfo::isCalleeSaveReg(unsigned Reg) const {
-  return Hexagon::R16 <= Reg && Reg <= Hexagon::R27;
-}
-
-
 const MCPhysReg *
 HexagonRegisterInfo::getCallerSavedRegs(const MachineFunction *MF,
                                         const TargetRegisterClass *RC) const {
diff --git a/lib/Target/Hexagon/HexagonRegisterInfo.h b/lib/Target/Hexagon/HexagonRegisterInfo.h
index 8a3f175b8488..5f65fad2cc04 100644
--- a/lib/Target/Hexagon/HexagonRegisterInfo.h
+++ b/lib/Target/Hexagon/HexagonRegisterInfo.h
@@ -77,7 +77,6 @@ public:
   unsigned getFirstCallerSavedNonParamReg() const;
 
   bool isEHReturnCalleeSaveReg(unsigned Reg) const;
-  bool isCalleeSaveReg(unsigned Reg) const;
 };
 
 } // end namespace llvm
diff --git a/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp b/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
index c21b6e2515d3..cd474921d4bc 100644
--- a/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
+++ b/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
@@ -214,12 +214,12 @@ bool HexagonPacketizer::runOnMachineFunction(MachineFunction &MF) {
   for (auto &MB : MF) {
     auto Begin = MB.begin(), End = MB.end();
     while (Begin != End) {
-      // First the first non-boundary starting from the end of the last
+      // Find the first non-boundary starting from the end of the last
       // scheduling region.
       MachineBasicBlock::iterator RB = Begin;
       while (RB != End && HII->isSchedulingBoundary(*RB, &MB, MF))
         ++RB;
-      // First the first boundary starting from the beginning of the new
+      // Find the first boundary starting from the beginning of the new
      // region.
MachineBasicBlock::iterator RE = RB; while (RE != End && !HII->isSchedulingBoundary(*RE, &MB, MF)) diff --git a/lib/Target/LLVMBuild.txt b/lib/Target/LLVMBuild.txt index 8be2a898e380..34b966df7761 100644 --- a/lib/Target/LLVMBuild.txt +++ b/lib/Target/LLVMBuild.txt @@ -29,6 +29,7 @@ subdirectories = MSP430 NVPTX Mips + Nios2 PowerPC RISCV Sparc diff --git a/lib/Target/MSP430/MSP430.td b/lib/Target/MSP430/MSP430.td index dfea669f3ba1..203864dd4065 100644 --- a/lib/Target/MSP430/MSP430.td +++ b/lib/Target/MSP430/MSP430.td @@ -22,6 +22,18 @@ def FeatureX : SubtargetFeature<"ext", "ExtendedInsts", "true", "Enable MSP430-X extensions">; +def FeatureHWMult16 + : SubtargetFeature<"hwmult16", "HWMultMode", "HWMult16", + "Enable 16-bit hardware multiplier">; + +def FeatureHWMult32 + : SubtargetFeature<"hwmult32", "HWMultMode", "HWMult32", + "Enable 32-bit hardware multiplier">; + +def FeatureHWMultF5 + : SubtargetFeature<"hwmultf5", "HWMultMode", "HWMultF5", + "Enable F5 series hardware multiplier">; + //===----------------------------------------------------------------------===// // MSP430 supported processors. //===----------------------------------------------------------------------===// @@ -29,6 +41,8 @@ class Proc<string Name, list<SubtargetFeature> Features> : Processor<Name, NoItineraries, Features>; def : Proc<"generic", []>; +def : Proc<"msp430", []>; +def : Proc<"msp430x", [FeatureX]>; //===----------------------------------------------------------------------===// // Register File Description diff --git a/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp b/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp index cd58eda5d924..0b02f79f472a 100644 --- a/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp +++ b/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp @@ -403,12 +403,12 @@ void MSP430DAGToDAGISel::Select(SDNode *Node) { int FI = cast<FrameIndexSDNode>(Node)->getIndex(); SDValue TFI = CurDAG->getTargetFrameIndex(FI, MVT::i16); if (Node->hasOneUse()) { - CurDAG->SelectNodeTo(Node, MSP430::ADD16ri, MVT::i16, TFI, + CurDAG->SelectNodeTo(Node, MSP430::ADDframe, MVT::i16, TFI, CurDAG->getTargetConstant(0, dl, MVT::i16)); return; } ReplaceNode(Node, CurDAG->getMachineNode( - MSP430::ADD16ri, dl, MVT::i16, TFI, + MSP430::ADDframe, dl, MVT::i16, TFI, CurDAG->getTargetConstant(0, dl, MVT::i16))); return; } diff --git a/lib/Target/MSP430/MSP430ISelLowering.cpp b/lib/Target/MSP430/MSP430ISelLowering.cpp index cc6e64043f54..dae14fd301ee 100644 --- a/lib/Target/MSP430/MSP430ISelLowering.cpp +++ b/lib/Target/MSP430/MSP430ISelLowering.cpp @@ -38,27 +38,6 @@ using namespace llvm; #define DEBUG_TYPE "msp430-lower" -typedef enum { - NoHWMult, - HWMult16, - HWMult32, - HWMultF5 -} HWMultUseMode; - -static cl::opt<HWMultUseMode> -HWMultMode("mhwmult", cl::Hidden, - cl::desc("Hardware multiplier use mode"), - cl::init(NoHWMult), - cl::values( - clEnumValN(NoHWMult, "none", - "Do not use hardware multiplier"), - clEnumValN(HWMult16, "16bit", - "Use 16-bit hardware multiplier"), - clEnumValN(HWMult32, "32bit", - "Use 32-bit hardware multiplier"), - clEnumValN(HWMultF5, "f5series", - "Use F5 series hardware multiplier"))); - MSP430TargetLowering::MSP430TargetLowering(const TargetMachine &TM, const MSP430Subtarget &STI) : TargetLowering(TM) { @@ -262,7 +241,7 @@ MSP430TargetLowering::MSP430TargetLowering(const TargetMachine &TM, setCmpLibcallCC(LC.Op, LC.Cond); } - if (HWMultMode == HWMult16) { + if (STI.hasHWMult16()) { const struct { const RTLIB::Libcall Op; const char * const Name; @@ -277,7 +256,7 @@ 
MSP430TargetLowering::MSP430TargetLowering(const TargetMachine &TM, for (const auto &LC : LibraryCalls) { setLibcallName(LC.Op, LC.Name); } - } else if (HWMultMode == HWMult32) { + } else if (STI.hasHWMult32()) { const struct { const RTLIB::Libcall Op; const char * const Name; @@ -292,7 +271,7 @@ MSP430TargetLowering::MSP430TargetLowering(const TargetMachine &TM, for (const auto &LC : LibraryCalls) { setLibcallName(LC.Op, LC.Name); } - } else if (HWMultMode == HWMultF5) { + } else if (STI.hasHWMultF5()) { const struct { const RTLIB::Libcall Op; const char * const Name; diff --git a/lib/Target/MSP430/MSP430InstrInfo.td b/lib/Target/MSP430/MSP430InstrInfo.td index 1cd18611e52c..cec43040f60d 100644 --- a/lib/Target/MSP430/MSP430InstrInfo.td +++ b/lib/Target/MSP430/MSP430InstrInfo.td @@ -122,6 +122,11 @@ def ADJCALLSTACKUP : Pseudo<(outs), (ins i16imm:$amt1, i16imm:$amt2), [(MSP430callseq_end timm:$amt1, timm:$amt2)]>; } +let Defs = [SR], Uses = [SP] in { +def ADDframe : Pseudo<(outs GR16:$dst), (ins i16imm:$base, i16imm:$offset), + "# ADDframe PSEUDO", []>; +} + let usesCustomInserter = 1 in { let Uses = [SR] in { def Select8 : Pseudo<(outs GR8:$dst), (ins GR8:$src, GR8:$src2, i8imm:$cc), diff --git a/lib/Target/MSP430/MSP430RegisterInfo.cpp b/lib/Target/MSP430/MSP430RegisterInfo.cpp index 9600bc28f100..7a3b7a8bd5ff 100644 --- a/lib/Target/MSP430/MSP430RegisterInfo.cpp +++ b/lib/Target/MSP430/MSP430RegisterInfo.cpp @@ -127,7 +127,7 @@ MSP430RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, // Fold imm into offset Offset += MI.getOperand(FIOperandNum + 1).getImm(); - if (MI.getOpcode() == MSP430::ADD16ri) { + if (MI.getOpcode() == MSP430::ADDframe) { // This is actually "load effective address" of the stack slot // instruction. We have only two-address instructions, thus we need to // expand it into mov + add diff --git a/lib/Target/MSP430/MSP430Subtarget.cpp b/lib/Target/MSP430/MSP430Subtarget.cpp index 6216348e4d71..776a9dcb11d4 100644 --- a/lib/Target/MSP430/MSP430Subtarget.cpp +++ b/lib/Target/MSP430/MSP430Subtarget.cpp @@ -19,6 +19,20 @@ using namespace llvm; #define DEBUG_TYPE "msp430-subtarget" +static cl::opt<MSP430Subtarget::HWMultEnum> +HWMultModeOption("mhwmult", cl::Hidden, + cl::desc("Hardware multiplier use mode for MSP430"), + cl::init(MSP430Subtarget::NoHWMult), + cl::values( + clEnumValN(MSP430Subtarget::NoHWMult, "none", + "Do not use hardware multiplier"), + clEnumValN(MSP430Subtarget::HWMult16, "16bit", + "Use 16-bit hardware multiplier"), + clEnumValN(MSP430Subtarget::HWMult32, "32bit", + "Use 32-bit hardware multiplier"), + clEnumValN(MSP430Subtarget::HWMultF5, "f5series", + "Use F5 series hardware multiplier"))); + #define GET_SUBTARGETINFO_TARGET_DESC #define GET_SUBTARGETINFO_CTOR #include "MSP430GenSubtargetInfo.inc" @@ -27,7 +41,18 @@ void MSP430Subtarget::anchor() { } MSP430Subtarget & MSP430Subtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS) { - ParseSubtargetFeatures("generic", FS); + ExtendedInsts = false; + HWMultMode = NoHWMult; + + std::string CPUName = CPU; + if (CPUName.empty()) + CPUName = "msp430"; + + ParseSubtargetFeatures(CPUName, FS); + + if (HWMultModeOption != NoHWMult) + HWMultMode = HWMultModeOption; + return *this; } diff --git a/lib/Target/MSP430/MSP430Subtarget.h b/lib/Target/MSP430/MSP430Subtarget.h index 1a00d85e01cb..8828dfd65878 100644 --- a/lib/Target/MSP430/MSP430Subtarget.h +++ b/lib/Target/MSP430/MSP430Subtarget.h @@ -30,8 +30,15 @@ namespace llvm { class StringRef; class MSP430Subtarget : public 
MSP430GenSubtargetInfo { +public: + enum HWMultEnum { + NoHWMult, HWMult16, HWMult32, HWMultF5 + }; + +private: virtual void anchor(); bool ExtendedInsts; + HWMultEnum HWMultMode; MSP430FrameLowering FrameLowering; MSP430InstrInfo InstrInfo; MSP430TargetLowering TLInfo; @@ -50,6 +57,10 @@ public: /// subtarget options. Definition of function is auto generated by tblgen. void ParseSubtargetFeatures(StringRef CPU, StringRef FS); + bool hasHWMult16() const { return HWMultMode == HWMult16; } + bool hasHWMult32() const { return HWMultMode == HWMult32; } + bool hasHWMultF5() const { return HWMultMode == HWMultF5; } + const TargetFrameLowering *getFrameLowering() const override { return &FrameLowering; } diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp index 3641a70d61b5..8fe4e75f3e18 100644 --- a/lib/Target/Mips/MipsISelLowering.cpp +++ b/lib/Target/Mips/MipsISelLowering.cpp @@ -813,28 +813,28 @@ static SDValue performORCombine(SDNode *N, SelectionDAG &DAG, !isShiftedMask(CN->getZExtValue(), SMPos1, SMSize1)) return SDValue(); - // The shift masks must have the same position and size. - if (SMPos0 != SMPos1 || SMSize0 != SMSize1) - return SDValue(); + // The shift masks must have the same position and size. + if (SMPos0 != SMPos1 || SMSize0 != SMSize1) + return SDValue(); - SDValue Shl = And1.getOperand(0); + SDValue Shl = And1.getOperand(0); - if (!(CN = dyn_cast<ConstantSDNode>(Shl.getOperand(1)))) - return SDValue(); + if (!(CN = dyn_cast<ConstantSDNode>(Shl.getOperand(1)))) + return SDValue(); - unsigned Shamt = CN->getZExtValue(); + unsigned Shamt = CN->getZExtValue(); - // Return if the shift amount and the first bit position of mask are not the - // same. - EVT ValTy = N->getValueType(0); - if ((Shamt != SMPos0) || (SMPos0 + SMSize0 > ValTy.getSizeInBits())) - return SDValue(); + // Return if the shift amount and the first bit position of mask are not the + // same. + EVT ValTy = N->getValueType(0); + if ((Shamt != SMPos0) || (SMPos0 + SMSize0 > ValTy.getSizeInBits())) + return SDValue(); - SDLoc DL(N); - return DAG.getNode(MipsISD::Ins, DL, ValTy, Shl.getOperand(0), - DAG.getConstant(SMPos0, DL, MVT::i32), - DAG.getConstant(SMSize0, DL, MVT::i32), - And0.getOperand(0)); + SDLoc DL(N); + return DAG.getNode(MipsISD::Ins, DL, ValTy, Shl.getOperand(0), + DAG.getConstant(SMPos0, DL, MVT::i32), + DAG.getConstant(SMSize0, DL, MVT::i32), + And0.getOperand(0)); } else { // Pattern match DINS. 
// $dst = or (and $src, mask0), mask1 diff --git a/lib/Target/Mips/MipsSubtarget.cpp b/lib/Target/Mips/MipsSubtarget.cpp index 8f5ecadecdea..1f4e933db2a2 100644 --- a/lib/Target/Mips/MipsSubtarget.cpp +++ b/lib/Target/Mips/MipsSubtarget.cpp @@ -59,9 +59,8 @@ static cl::opt<bool> void MipsSubtarget::anchor() { } -MipsSubtarget::MipsSubtarget(const Triple &TT, const std::string &CPU, - const std::string &FS, bool little, - const MipsTargetMachine &TM) +MipsSubtarget::MipsSubtarget(const Triple &TT, StringRef CPU, StringRef FS, + bool little, const MipsTargetMachine &TM) : MipsGenSubtargetInfo(TT, CPU, FS), MipsArchVersion(MipsDefault), IsLittle(little), IsSoftFloat(false), IsSingleFloat(false), IsFPXX(false), NoABICalls(false), IsFP64bit(false), UseOddSPReg(true), @@ -77,8 +76,6 @@ MipsSubtarget::MipsSubtarget(const Triple &TT, const std::string &CPU, FrameLowering(MipsFrameLowering::create(*this)), TLInfo(MipsTargetLowering::create(TM, *this)) { - PreviousInMips16Mode = InMips16Mode; - if (MipsArchVersion == MipsDefault) MipsArchVersion = Mips32; diff --git a/lib/Target/Mips/MipsSubtarget.h b/lib/Target/Mips/MipsSubtarget.h index cca2cb8a4660..b4d15ee361ff 100644 --- a/lib/Target/Mips/MipsSubtarget.h +++ b/lib/Target/Mips/MipsSubtarget.h @@ -119,9 +119,6 @@ class MipsSubtarget : public MipsGenSubtargetInfo { // Mips16 hard float bool InMips16HardFloat; - // PreviousInMips16 -- the function we just processed was in Mips 16 Mode - bool PreviousInMips16Mode; - // InMicroMips -- can process MicroMips instructions bool InMicroMipsMode; @@ -178,8 +175,8 @@ public: /// This constructor initializes the data members to match that /// of the specified triple. - MipsSubtarget(const Triple &TT, const std::string &CPU, const std::string &FS, - bool little, const MipsTargetMachine &TM); + MipsSubtarget(const Triple &TT, StringRef CPU, StringRef FS, bool little, + const MipsTargetMachine &TM); /// ParseSubtargetFeatures - Parses features string setting specified /// subtarget options. Definition of function is auto generated by tblgen. diff --git a/lib/Target/Nios2/CMakeLists.txt b/lib/Target/Nios2/CMakeLists.txt new file mode 100644 index 000000000000..78db452094bd --- /dev/null +++ b/lib/Target/Nios2/CMakeLists.txt @@ -0,0 +1,18 @@ +set(LLVM_TARGET_DEFINITIONS Nios2.td) + +#Generate Nios2GenRegisterInfo.inc and Nios2GenInstrInfo.inc which included by +#your hand code C++ files. +#Nios2GenRegisterInfo.inc came from Nios2RegisterInfo.td, Nios2GenInstrInfo.inc +#came from Nios2InstrInfo.td. +tablegen(LLVM Nios2GenRegisterInfo.inc -gen-register-info) +tablegen(LLVM Nios2GenInstrInfo.inc -gen-instr-info) + +#Nios2CommonTableGen must be defined +add_public_tablegen_target(Nios2CommonTableGen) + +#Nios2CodeGen should match with LLVMBuild.txt Nios2CodeGen +add_llvm_target(Nios2CodeGen Nios2TargetMachine.cpp) + +#Should match with "subdirectories = MCTargetDesc TargetInfo" in LLVMBuild.txt +add_subdirectory(TargetInfo) +add_subdirectory(MCTargetDesc) diff --git a/lib/Target/Nios2/LLVMBuild.txt b/lib/Target/Nios2/LLVMBuild.txt new file mode 100644 index 000000000000..b40a76379706 --- /dev/null +++ b/lib/Target/Nios2/LLVMBuild.txt @@ -0,0 +1,61 @@ +;===- ./lib/Target/Nios2/LLVMBuild.txt -------------------------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. 
+; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. +; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +#Following comments extracted from http: // llvm.org/docs/LLVMBuild.html + +[common] +subdirectories = + MCTargetDesc + TargetInfo + +[component_0] +#TargetGroup components are an extension of LibraryGroups, specifically for +#defining LLVM targets(which are handled specially in a few places). +type = TargetGroup +#The name of the component should always be the name of the target.(should +#match "def Nios2 : Target" in Nios2.td) +name = Nios2 +#Nios2 component is located in directory Target / +parent = Target +#Whether this target defines an assembly parser, assembly printer, disassembler +#, and supports JIT compilation.They are optional. + +[component_1] +#component_1 is a Library type and name is Nios2CodeGen.After build it will +#in lib / libLLVMNios2CodeGen.a of your build command directory. +type = Library +name = Nios2CodeGen +#Nios2CodeGen component(Library) is located in directory Nios2 / +parent = Nios2 +#If given, a list of the names of Library or LibraryGroup components which +#must also be linked in whenever this library is used.That is, the link time +#dependencies for this component.When tools are built, the build system will +#include the transitive closure of all required_libraries for the components +#the tool needs. +required_libraries = CodeGen + Core + GlobalISel + MC + Nios2Desc + Nios2Info + Support + Target +#end of required_libraries + +#All LLVMBuild.txt in Target / Nios2 and subdirectory use 'add_to_library_groups +#= Nios2' +add_to_library_groups = Nios2 diff --git a/lib/Target/Nios2/MCTargetDesc/CMakeLists.txt b/lib/Target/Nios2/MCTargetDesc/CMakeLists.txt new file mode 100644 index 000000000000..21def509a232 --- /dev/null +++ b/lib/Target/Nios2/MCTargetDesc/CMakeLists.txt @@ -0,0 +1,2 @@ +#MCTargetDesc / CMakeLists.txt +add_llvm_library(LLVMNios2Desc Nios2MCTargetDesc.cpp) diff --git a/lib/Target/Nios2/MCTargetDesc/LLVMBuild.txt b/lib/Target/Nios2/MCTargetDesc/LLVMBuild.txt new file mode 100644 index 000000000000..4dc6995e7f5c --- /dev/null +++ b/lib/Target/Nios2/MCTargetDesc/LLVMBuild.txt @@ -0,0 +1,25 @@ +;===- ./lib/Target/Nios2/MCTargetDesc/LLVMBuild.txt ------------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. 
+; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Library +name = Nios2Desc +parent = Nios2 +required_libraries = MC + Nios2Info + Support +add_to_library_groups = Nios2 diff --git a/lib/Target/Nios2/MCTargetDesc/Nios2MCTargetDesc.cpp b/lib/Target/Nios2/MCTargetDesc/Nios2MCTargetDesc.cpp new file mode 100644 index 000000000000..d913166399c6 --- /dev/null +++ b/lib/Target/Nios2/MCTargetDesc/Nios2MCTargetDesc.cpp @@ -0,0 +1,25 @@ +//===-- Nios2MCTargetDesc.cpp - Nios2 Target Descriptions -----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file provides Nios2 specific target descriptions. +// +//===----------------------------------------------------------------------===// + +#include "Nios2MCTargetDesc.h" +#include "llvm/MC/MCInstrInfo.h" + +using namespace llvm; + +#define GET_INSTRINFO_MC_DESC +#include "Nios2GenInstrInfo.inc" + +#define GET_REGINFO_MC_DESC +#include "Nios2GenRegisterInfo.inc" + +extern "C" void LLVMInitializeNios2TargetMC() {} diff --git a/lib/Target/Nios2/MCTargetDesc/Nios2MCTargetDesc.h b/lib/Target/Nios2/MCTargetDesc/Nios2MCTargetDesc.h new file mode 100644 index 000000000000..d426062db168 --- /dev/null +++ b/lib/Target/Nios2/MCTargetDesc/Nios2MCTargetDesc.h @@ -0,0 +1,34 @@ +//===-- Nios2MCTargetDesc.h - Nios2 Target Descriptions ---------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file provides Nios2 specific target descriptions. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_NIOS2_MCTARGETDESC_NIOS2MCTARGETDESC_H +#define LLVM_LIB_TARGET_NIOS2_MCTARGETDESC_NIOS2MCTARGETDESC_H + +namespace llvm { +class Target; +class Triple; + +Target &getTheNios2Target(); + +} // namespace llvm + +// Defines symbolic names for Nios2 registers. This defines a mapping from +// register name to register number. +#define GET_REGINFO_ENUM +#include "Nios2GenRegisterInfo.inc" + +// Defines symbolic names for the Nios2 instructions. +#define GET_INSTRINFO_ENUM +#include "Nios2GenInstrInfo.inc" + +#endif diff --git a/lib/Target/Nios2/Nios2.h b/lib/Target/Nios2/Nios2.h new file mode 100644 index 000000000000..87202f48cfbe --- /dev/null +++ b/lib/Target/Nios2/Nios2.h @@ -0,0 +1,25 @@ +//===-- Nios2.h - Top-level interface for Nios2 representation --*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the entry points for global functions defined in +// the LLVM Nios2 back-end. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_NIOS2_NIOS2_H +#define LLVM_LIB_TARGET_NIOS2_NIOS2_H + +#include "MCTargetDesc/Nios2MCTargetDesc.h" +#include "llvm/Target/TargetMachine.h" + +namespace llvm { +class Nios2TargetMachine; +} // namespace llvm + +#endif diff --git a/lib/Target/Nios2/Nios2.td b/lib/Target/Nios2/Nios2.td new file mode 100644 index 000000000000..e8abba863370 --- /dev/null +++ b/lib/Target/Nios2/Nios2.td @@ -0,0 +1,29 @@ +//===-- Nios2.td - Describe the Nios2 Target Machine -------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Target-independent interfaces +//===----------------------------------------------------------------------===// + +include "llvm/Target/Target.td" + +//===----------------------------------------------------------------------===// +// Target-dependent interfaces +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Calling Conv, Instruction Descriptions +//===----------------------------------------------------------------------===// + +include "Nios2RegisterInfo.td" +include "Nios2InstrInfo.td" + +def Nios2InstrInfo : InstrInfo; + +def Nios2 : Target { let InstructionSet = Nios2InstrInfo; } diff --git a/lib/Target/Nios2/Nios2InstrFormats.td b/lib/Target/Nios2/Nios2InstrFormats.td new file mode 100644 index 000000000000..79868be48a48 --- /dev/null +++ b/lib/Target/Nios2/Nios2InstrFormats.td @@ -0,0 +1,117 @@ +//===-- Nios2InstrFormats.td - Nios2 Instruction Formats ---*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Describe NIOS2 instructions format +// +// +//===----------------------------------------------------------------------===// + +// Format specifies the encoding used by the instruction. This is part of the +// ad-hoc solution used to emit machine instruction encodings by our machine +// code emitter. +class Format<bits<3> val> { + bits<3> Value = val; +} + +def Pseudo : Format<0>; +def FrmI : Format<1>; +def FrmR : Format<2>; +def FrmJ : Format<3>; +def FrmOther : Format<4>; // Instruction w/ a custom format + +// Generic Nios2 Format +class Nios2Inst<dag outs, dag ins, string asmstr, list<dag> pattern, Format f> + : Instruction { + field bits<32> Inst; + Format Form = f; + + let Namespace = "Nios2"; + + let Size = 4; + + bits<6> Opcode = 0; + + // Bottom 6 bits are the 'opcode' field + let Inst{5 - 0} = Opcode; + + let OutOperandList = outs; + let InOperandList = ins; + + let AsmString = asmstr; + let Pattern = pattern; + + // + // Attributes specific to Nios2 instructions: + // + bits<3> FormBits = Form.Value; + + // TSFlags layout should be kept in sync with Nios2InstrInfo.h. 
+ let TSFlags{2 - 0} = FormBits; + + let DecoderNamespace = "Nios2"; +} + +// Nios2 Instruction Format +class InstSE<dag outs, dag ins, string asmstr, list<dag> pattern, Format f> + : Nios2Inst<outs, ins, asmstr, pattern, f> { +} + +//===----------------------------------------------------------------------===// +// Format I instruction class in Nios2 : <|A|B|immediate|opcode|> +//===----------------------------------------------------------------------===// + +class FI<bits<6> op, dag outs, dag ins, string asmstr, list<dag> pattern> + : InstSE<outs, ins, asmstr, pattern, FrmI> { + bits<5> rA; + bits<5> rB; + bits<16> imm; + + let Opcode = op; + + let Inst{31 - 27} = rA; + let Inst{26 - 22} = rB; + let Inst{21 - 6} = imm; +} + +//===----------------------------------------------------------------------===// +// Format R instruction : <|A|B|C|opx|imm|opcode|> +//===----------------------------------------------------------------------===// + +class FR<bits<6> opx, dag outs, dag ins, string asmstr, list<dag> pattern> + : InstSE<outs, ins, asmstr, pattern, FrmR> { + bits<5> rA; + bits<5> rB; + bits<5> rC; + bits<5> imm = 0; + + // opcode is always 0x3a for R instr. + let Opcode = 0x3a; + + let Inst{31 - 27} = rA; + let Inst{26 - 22} = rB; + let Inst{21 - 17} = rC; + // opx stands for opcode extension + let Inst{16 - 11} = opx; + // optional 5-bit immediate value + let Inst{10 - 6} = imm; +} + +//===----------------------------------------------------------------------===// +// Format J instruction class in Nios2 : <|address|opcode|> +//===----------------------------------------------------------------------===// + +class FJ<bits<6> op, dag outs, dag ins, string asmstr, list<dag> pattern> + : InstSE<outs, ins, asmstr, pattern, FrmJ> { + bits<26> addr; + + let Opcode = op; + + let Inst{31 - 6} = addr; +} diff --git a/lib/Target/Nios2/Nios2InstrInfo.td b/lib/Target/Nios2/Nios2InstrInfo.td new file mode 100644 index 000000000000..5e4815ab3e16 --- /dev/null +++ b/lib/Target/Nios2/Nios2InstrInfo.td @@ -0,0 +1,50 @@ +//===- Nios2InstrInfo.td - Target Description for Nios2 ------*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the Nios2 implementation of the TargetInstrInfo class. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Instruction format superclass +//===----------------------------------------------------------------------===// + +include "Nios2InstrFormats.td" + +//===----------------------------------------------------------------------===// +// Nios2 Operand, Complex Patterns and Transformations Definitions. +//===----------------------------------------------------------------------===// + +def simm16 : Operand<i32> { + let DecoderMethod= "DecodeSimm16"; +} + +// Node immediate fits as 16-bit sign extended on target immediate. +// e.g. addi, andi +def immSExt16 : PatLeaf<(imm), [{ return isInt<16>(N->getSExtValue()); }]>; + +//===----------------------------------------------------------------------===// +// Instructions specific format +//===----------------------------------------------------------------------===// + +// Arithmetic and logical instructions with 2 register operands. 
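As a quick cross-check of the FI bit layout defined above (rA in Inst{31-27}, rB in Inst{26-22}, imm in Inst{21-6}, opcode in Inst{5-0}), here is a minimal stand-alone C++ sketch that packs the same fields by hand; the encodeFI helper and the concrete operand values are illustrative assumptions and not part of the patch, and the opcode 0x04 is taken from the ADDi definition that follows.

#include <cassert>
#include <cstdint>

// Packs an FI-format word the same way the `let Inst{...}` assignments above do:
// rA -> bits 31-27, rB -> bits 26-22, imm16 -> bits 21-6, opcode -> bits 5-0.
static uint32_t encodeFI(uint32_t rA, uint32_t rB, uint32_t imm16, uint32_t opcode) {
  assert(rA < 32 && rB < 32 && imm16 < (1u << 16) && opcode < 64);
  return (rA << 27) | (rB << 22) | (imm16 << 6) | opcode;
}

int main() {
  // "addi r2, r3, 100" per the asm string "addi\t$rB, $rA, $imm16":
  // rB (destination) = 2, rA (source) = 3, imm16 = 100, opcode = 0x04.
  uint32_t Word = encodeFI(/*rA=*/3, /*rB=*/2, /*imm16=*/100, /*opcode=*/0x04);
  assert(Word == 0x18801904u);
  return 0;
}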
+class ArithLogicI<bits<6> op, string instr_asm, SDNode OpNode, + Operand Od, PatLeaf imm_type, RegisterClass RC> : + FI<op, (outs RC:$rB), (ins RC:$rA, Od:$imm16), + !strconcat(instr_asm, "\t$rB, $rA, $imm16"), + [(set RC:$rB, (OpNode RC:$rA, imm_type:$imm16))]> { + let isReMaterializable = 1; +} + +//===----------------------------------------------------------------------===// +// Nios2 R1 Instructions +//===----------------------------------------------------------------------===// + +/// Arithmetic Instructions (ALU Immediate) +def ADDi : ArithLogicI<0x04, "addi", add, simm16, immSExt16, CPURegs>; diff --git a/lib/Target/Nios2/Nios2RegisterInfo.td b/lib/Target/Nios2/Nios2RegisterInfo.td new file mode 100644 index 000000000000..1808815816f3 --- /dev/null +++ b/lib/Target/Nios2/Nios2RegisterInfo.td @@ -0,0 +1,60 @@ +//===-- Nios2RegisterInfo.td - Nios2 Register defs ---------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +// We have bank of 32 registers. +class Nios2Reg<string n> : Register<n> { + field bits<5> Num; + let Namespace = "Nios2"; +} + +// Nios2 CPU Registers +class Nios2GPRReg<bits<5> num, string n> : Nios2Reg<n> { + let Num = num; +} + +//===----------------------------------------------------------------------===// +// Registers +//===----------------------------------------------------------------------===// + +let Namespace = "Nios2" in { + // General Purpose Registers + def ZERO : Nios2GPRReg<0, "zero">, DwarfRegNum<[ 0 ]>; + def AT : Nios2GPRReg<1, "at">, DwarfRegNum<[ 1 ]>; + foreach RegNum = 2 - 23 in { + def R #RegNum : Nios2GPRReg<RegNum, "r" #RegNum>, DwarfRegNum<[ RegNum ]>; + } + def ET : Nios2GPRReg<24, "et">, DwarfRegNum<[ 24 ]>; + def BT : Nios2GPRReg<25, "bt">, DwarfRegNum<[ 25 ]>; + def GP : Nios2GPRReg<26, "gp">, DwarfRegNum<[ 26 ]>; + def SP : Nios2GPRReg<27, "sp">, DwarfRegNum<[ 27 ]>; + def FP : Nios2GPRReg<28, "fp">, DwarfRegNum<[ 28 ]>; + def EA : Nios2GPRReg<29, "ea">, DwarfRegNum<[ 29 ]>; + def BA : Nios2GPRReg<30, "ba">, DwarfRegNum<[ 30 ]>; + def RA : Nios2GPRReg<31, "ra">, DwarfRegNum<[ 31 ]>; + def PC : Nios2Reg<"pc">, DwarfRegNum<[ 32 ]>; +} + +//===----------------------------------------------------------------------===// +// Register Classes +//===----------------------------------------------------------------------===// + +def CPURegs : RegisterClass<"Nios2", [ i32 ], 32, + (add + // Reserved + ZERO, + AT, + // Return Values and Arguments + (sequence "R%u", 2, 7), + // Not preserved across procedure calls + // Caller saved + (sequence "R%u", 8, 15), + // Callee saved + (sequence "R%u", 16, 23), + // Reserved + ET, BT, GP, SP, FP, EA, BA, RA, PC)>; diff --git a/lib/Target/Nios2/Nios2TargetMachine.cpp b/lib/Target/Nios2/Nios2TargetMachine.cpp new file mode 100644 index 000000000000..16d4eabcfaf7 --- /dev/null +++ b/lib/Target/Nios2/Nios2TargetMachine.cpp @@ -0,0 +1,46 @@ +//===-- Nios2TargetMachine.cpp - Define TargetMachine for Nios2 -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Implements the info about Nios2 target spec. 
+// +//===----------------------------------------------------------------------===// + +#include "Nios2TargetMachine.h" +#include "Nios2.h" + +using namespace llvm; + +#define DEBUG_TYPE "nios2" + +extern "C" void LLVMInitializeNios2Target() { + // Register the target. +} + +static std::string computeDataLayout(const Triple &TT, StringRef CPU, + const TargetOptions &Options) { + return "e-p:32:32:32-i8:8:32-i16:16:32-n32"; +} + +static Reloc::Model getEffectiveRelocModel(CodeModel::Model CM, + Optional<Reloc::Model> RM) { + if (!RM.hasValue() || CM == CodeModel::JITDefault) + return Reloc::Static; + return *RM; +} + +Nios2TargetMachine::Nios2TargetMachine(const Target &T, const Triple &TT, + StringRef CPU, StringRef FS, + const TargetOptions &Options, + Optional<Reloc::Model> RM, + CodeModel::Model CM, + CodeGenOpt::Level OL) + : LLVMTargetMachine(T, computeDataLayout(TT, CPU, Options), TT, CPU, FS, + Options, getEffectiveRelocModel(CM, RM), CM, OL) {} + +Nios2TargetMachine::~Nios2TargetMachine() {} diff --git a/lib/Target/Nios2/Nios2TargetMachine.h b/lib/Target/Nios2/Nios2TargetMachine.h new file mode 100644 index 000000000000..7f145c82f32c --- /dev/null +++ b/lib/Target/Nios2/Nios2TargetMachine.h @@ -0,0 +1,30 @@ +//===-- Nios2TargetMachine.h - Define TargetMachine for Nios2 ---*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file declares the Nios2 specific subclass of TargetMachine. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_NIOS2_NIOS2TARGETMACHINE_H +#define LLVM_LIB_TARGET_NIOS2_NIOS2TARGETMACHINE_H + +#include "llvm/Target/TargetMachine.h" + +namespace llvm { +class Nios2TargetMachine : public LLVMTargetMachine { +public: + Nios2TargetMachine(const Target &T, const Triple &TT, StringRef CPU, + StringRef FS, const TargetOptions &Options, + Optional<Reloc::Model> RM, CodeModel::Model CM, + CodeGenOpt::Level OL); + ~Nios2TargetMachine() override; +}; +} // namespace llvm + +#endif diff --git a/lib/Target/Nios2/TargetInfo/CMakeLists.txt b/lib/Target/Nios2/TargetInfo/CMakeLists.txt new file mode 100644 index 000000000000..394d2c2680b7 --- /dev/null +++ b/lib/Target/Nios2/TargetInfo/CMakeLists.txt @@ -0,0 +1 @@ +add_llvm_library(LLVMNios2Info Nios2TargetInfo.cpp) diff --git a/lib/Target/Nios2/TargetInfo/LLVMBuild.txt b/lib/Target/Nios2/TargetInfo/LLVMBuild.txt new file mode 100644 index 000000000000..558f7501ea6b --- /dev/null +++ b/lib/Target/Nios2/TargetInfo/LLVMBuild.txt @@ -0,0 +1,23 @@ +;===- ./lib/Target/Nios2/TargetInfo/LLVMBuild.txt --------------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. 
+; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Library +name = Nios2Info +parent = Nios2 +required_libraries = Support +add_to_library_groups = Nios2 diff --git a/lib/Target/Nios2/TargetInfo/Nios2TargetInfo.cpp b/lib/Target/Nios2/TargetInfo/Nios2TargetInfo.cpp new file mode 100644 index 000000000000..e317686140f7 --- /dev/null +++ b/lib/Target/Nios2/TargetInfo/Nios2TargetInfo.cpp @@ -0,0 +1,24 @@ +//===-- Nios2TargetInfo.cpp - Nios2 Target Implementation -----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "Nios2.h" +#include "llvm/Support/TargetRegistry.h" + +using namespace llvm; + +Target &llvm::getTheNios2Target() { + static Target TheNios2Target; + return TheNios2Target; +} + +extern "C" void LLVMInitializeNios2TargetInfo() { + RegisterTarget<Triple::nios2, + /*HasJIT=*/true> + X(getTheNios2Target(), "nios2", "Nios2"); +} diff --git a/lib/Target/PowerPC/PPCExpandISEL.cpp b/lib/Target/PowerPC/PPCExpandISEL.cpp index ebd414baf1d2..41e3190c3eec 100644 --- a/lib/Target/PowerPC/PPCExpandISEL.cpp +++ b/lib/Target/PowerPC/PPCExpandISEL.cpp @@ -339,7 +339,7 @@ void PPCExpandISEL::reorganizeBlockLayout(BlockISELList &BIL, // Note: Cannot use stepBackward instead since we are using the Reg // liveness state at the end of MBB (liveOut of MBB) as the liveIn for // NewSuccessor. Otherwise, will cause cyclic dependence. - LivePhysRegs LPR(MF->getSubtarget<PPCSubtarget>().getRegisterInfo()); + LivePhysRegs LPR(*MF->getSubtarget<PPCSubtarget>().getRegisterInfo()); SmallVector<std::pair<unsigned, const MachineOperand *>, 2> Clobbers; for (MachineInstr &MI : *MBB) LPR.stepForward(MI, Clobbers); diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp index e65b1f1aa0a5..b90a5ee28342 100644 --- a/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/lib/Target/PowerPC/PPCISelLowering.cpp @@ -1596,9 +1596,8 @@ bool PPC::isSplatShuffleMask(ShuffleVectorSDNode *N, unsigned EltSize) { return true; } -bool PPC::isXXINSERTWMask(ShuffleVectorSDNode *N, unsigned &ShiftElts, - unsigned &InsertAtByte, bool &Swap, bool IsLE) { // Check that the mask is shuffling words +static bool isWordShuffleMask(ShuffleVectorSDNode *N) { for (unsigned i = 0; i < 4; ++i) { unsigned B0 = N->getMaskElt(i*4); unsigned B1 = N->getMaskElt(i*4+1); @@ -1610,6 +1609,14 @@ bool PPC::isXXINSERTWMask(ShuffleVectorSDNode *N, unsigned &ShiftElts, return false; } + return true; +} + +bool PPC::isXXINSERTWMask(ShuffleVectorSDNode *N, unsigned &ShiftElts, + unsigned &InsertAtByte, bool &Swap, bool IsLE) { + if (!isWordShuffleMask(N)) + return false; + // Now we look at mask elements 0,4,8,12 unsigned M0 = N->getMaskElt(0) / 4; unsigned M1 = N->getMaskElt(4) / 4; @@ -1680,6 +1687,69 @@ bool PPC::isXXINSERTWMask(ShuffleVectorSDNode *N, unsigned &ShiftElts, return false; } +bool PPC::isXXSLDWIShuffleMask(ShuffleVectorSDNode *N, unsigned &ShiftElts, + bool &Swap, bool IsLE) { + assert(N->getValueType(0) == MVT::v16i8 && "Shuffle vector expects v16i8"); + // Ensure each byte index of the word is consecutive. + if (!isWordShuffleMask(N)) + return false; + + // Now we look at mask elements 0,4,8,12, which are the beginning of words. 
+ unsigned M0 = N->getMaskElt(0) / 4; + unsigned M1 = N->getMaskElt(4) / 4; + unsigned M2 = N->getMaskElt(8) / 4; + unsigned M3 = N->getMaskElt(12) / 4; + + // If both vector operands for the shuffle are the same vector, the mask will + // contain only elements from the first one and the second one will be undef. + if (N->getOperand(1).isUndef()) { + assert(M0 < 4 && "Indexing into an undef vector?"); + if (M1 != (M0 + 1) % 4 || M2 != (M1 + 1) % 4 || M3 != (M2 + 1) % 4) + return false; + + ShiftElts = IsLE ? (4 - M0) % 4 : M0; + Swap = false; + return true; + } + + // Ensure each word index of the ShuffleVector Mask is consecutive. + if (M1 != (M0 + 1) % 8 || M2 != (M1 + 1) % 8 || M3 != (M2 + 1) % 8) + return false; + + if (IsLE) { + if (M0 == 0 || M0 == 7 || M0 == 6 || M0 == 5) { + // Input vectors don't need to be swapped if the leading element + // of the result is one of the 3 left elements of the second vector + // (or if there is no shift to be done at all). + Swap = false; + ShiftElts = (8 - M0) % 8; + } else if (M0 == 4 || M0 == 3 || M0 == 2 || M0 == 1) { + // Input vectors need to be swapped if the leading element + // of the result is one of the 3 left elements of the first vector + // (or if we're shifting by 4 - thereby simply swapping the vectors). + Swap = true; + ShiftElts = (4 - M0) % 4; + } + + return true; + } else { // BE + if (M0 == 0 || M0 == 1 || M0 == 2 || M0 == 3) { + // Input vectors don't need to be swapped if the leading element + // of the result is one of the 4 elements of the first vector. + Swap = false; + ShiftElts = M0; + } else if (M0 == 4 || M0 == 5 || M0 == 6 || M0 == 7) { + // Input vectors need to be swapped if the leading element + // of the result is one of the 4 elements of the right vector. + Swap = true; + ShiftElts = M0 - 4; + } + + return true; + } +} + + /// getVSPLTImmediate - Return the appropriate VSPLT* immediate to splat the /// specified isSplatShuffleMask VECTOR_SHUFFLE mask. unsigned PPC::getVSPLTImmediate(SDNode *N, unsigned EltSize, @@ -7679,6 +7749,20 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins); } + + if (Subtarget.hasVSX() && + PPC::isXXSLDWIShuffleMask(SVOp, ShiftElts, Swap, isLittleEndian)) { + if (Swap) + std::swap(V1, V2); + SDValue Conv1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1); + SDValue Conv2 = + DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V2.isUndef() ? 
V1 : V2); + + SDValue Shl = DAG.getNode(PPCISD::VECSHL, dl, MVT::v4i32, Conv1, Conv2, + DAG.getConstant(ShiftElts, dl, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Shl); + } + if (Subtarget.hasVSX()) { if (V2.isUndef() && PPC::isSplatShuffleMask(SVOp, 4)) { int SplatIdx = PPC::getVSPLTImmediate(SVOp, 4, DAG); @@ -8212,10 +8296,12 @@ SDValue PPCTargetLowering::LowerINTRINSIC_VOID(SDValue Op, SDLoc DL(Op); switch (cast<ConstantSDNode>(Op.getOperand(ArgStart))->getZExtValue()) { case Intrinsic::ppc_cfence: { + assert(ArgStart == 1 && "llvm.ppc.cfence must carry a chain argument."); assert(Subtarget.isPPC64() && "Only 64-bit is supported for now."); return SDValue(DAG.getMachineNode(PPC::CFENCE8, DL, MVT::Other, DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, - Op.getOperand(ArgStart + 1))), + Op.getOperand(ArgStart + 1)), + Op.getOperand(0)), 0); } default: diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h index acb77943b118..2f9eb95f6de6 100644 --- a/lib/Target/PowerPC/PPCISelLowering.h +++ b/lib/Target/PowerPC/PPCISelLowering.h @@ -450,7 +450,11 @@ namespace llvm { /// a VMRGEW or VMRGOW instruction bool isVMRGEOShuffleMask(ShuffleVectorSDNode *N, bool CheckEven, unsigned ShuffleKind, SelectionDAG &DAG); - + /// isXXSLDWIShuffleMask - Return true if this is a shuffle mask suitable + /// for a XXSLDWI instruction. + bool isXXSLDWIShuffleMask(ShuffleVectorSDNode *N, unsigned &ShiftElts, + bool &Swap, bool IsLE); + /// isVSLDOIShuffleMask - If this is a vsldoi shuffle mask, return the /// shift amount, otherwise return -1. int isVSLDOIShuffleMask(SDNode *N, unsigned ShuffleKind, diff --git a/lib/Target/PowerPC/PPCInstr64Bit.td b/lib/Target/PowerPC/PPCInstr64Bit.td index a3f894c81a01..165970f9678c 100644 --- a/lib/Target/PowerPC/PPCInstr64Bit.td +++ b/lib/Target/PowerPC/PPCInstr64Bit.td @@ -1001,7 +1001,9 @@ def ADDItlsgdL : Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp), isPPC64; // LR8 is a true define, while the rest of the Defs are clobbers. X3 is // explicitly defined when this op is created, so not mentioned here. -let hasExtraSrcRegAllocReq = 1, hasExtraDefRegAllocReq = 1, +// This is lowered to BL8_NOP_TLS by the assembly printer, so the size must be +// correct because the branch select pass is relying on it. 
+let hasExtraSrcRegAllocReq = 1, hasExtraDefRegAllocReq = 1, Size = 8, Defs = [X0,X4,X5,X6,X7,X8,X9,X10,X11,X12,LR8,CTR8,CR0,CR1,CR5,CR6,CR7] in def GETtlsADDR : Pseudo<(outs g8rc:$rD), (ins g8rc:$reg, tlsgd:$sym), "#GETtlsADDR", diff --git a/lib/Target/PowerPC/PPCInstrInfo.cpp b/lib/Target/PowerPC/PPCInstrInfo.cpp index 46f103141bc1..fd6785e963a6 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -1931,6 +1931,8 @@ bool PPCInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { case PPC::DFSTOREf64: { assert(Subtarget.hasP9Vector() && "Invalid D-Form Pseudo-ops on non-P9 target."); + assert(MI.getOperand(2).isReg() && MI.getOperand(1).isImm() && + "D-form op must have register and immediate operands"); unsigned UpperOpcode, LowerOpcode; switch (MI.getOpcode()) { case PPC::DFLOADf32: diff --git a/lib/Target/PowerPC/PPCInstrInfo.td b/lib/Target/PowerPC/PPCInstrInfo.td index 0766cfe4a987..26b99eced23c 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.td +++ b/lib/Target/PowerPC/PPCInstrInfo.td @@ -46,7 +46,7 @@ def SDT_PPCVecSplat : SDTypeProfile<1, 2, [ SDTCisVec<0>, ]>; def SDT_PPCVecShift : SDTypeProfile<1, 3, [ SDTCisVec<0>, - SDTCisVec<1>, SDTCisVec<2>, SDTCisInt<3> + SDTCisVec<1>, SDTCisVec<2>, SDTCisPtrTy<3> ]>; def SDT_PPCVecInsert : SDTypeProfile<1, 3, [ SDTCisVec<0>, diff --git a/lib/Target/PowerPC/PPCInstrVSX.td b/lib/Target/PowerPC/PPCInstrVSX.td index b98140fedfc0..1589ab03e507 100644 --- a/lib/Target/PowerPC/PPCInstrVSX.td +++ b/lib/Target/PowerPC/PPCInstrVSX.td @@ -1066,6 +1066,10 @@ def : Pat<(v4f32 (PPCxxswapd v4f32:$src)), (XXPERMDI $src, $src, 2)>; def : Pat<(v4i32 (PPCxxswapd v4i32:$src)), (XXPERMDI $src, $src, 2)>; def : Pat<(v2f64 (PPCswapNoChain v2f64:$src)), (XXPERMDI $src, $src, 2)>; +// PPCvecshl XT, XA, XA, 2 can be selected to both XXSLDWI XT,XA,XA,2 and +// XXSWAPD XT,XA (i.e. XXPERMDI XT,XA,XA,2), the later one is more profitable. +def : Pat<(v4i32 (PPCvecshl v4i32:$src, v4i32:$src, 2)), (XXPERMDI $src, $src, 2)>; + // Selects. 
def : Pat<(v2f64 (selectcc i1:$lhs, i1:$rhs, v2f64:$tval, v2f64:$fval, SETLT)), (SELECT_VSRC (CRANDC $lhs, $rhs), $tval, $fval)>; @@ -2379,8 +2383,7 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in { // Load Vector Indexed def LXVX : X_XT6_RA5_RB5<31, 268, "lxvx" , vsrc, - [(set v2f64:$XT, (load xoaddr:$src))]>; - + [(set v2f64:$XT, (load xaddr:$src))]>; // Load Vector (Left-justified) with Length def LXVL : XX1Form<31, 269, (outs vsrc:$XT), (ins memr:$src, g8rc:$rB), "lxvl $XT, $src, $rB", IIC_LdStLoad, @@ -2430,7 +2433,7 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in { // Store Vector Indexed def STXVX : X_XS6_RA5_RB5<31, 396, "stxvx" , vsrc, - [(store v2f64:$XT, xoaddr:$dst)]>; + [(store v2f64:$XT, xaddr:$dst)]>; // Store Vector (Left-justified) with Length def STXVL : XX1Form<31, 397, (outs), (ins vsrc:$XT, memr:$dst, g8rc:$rB), @@ -2498,21 +2501,38 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in { (v4f32 (XXINSERTW v4f32:$A, AlignValues.F32_TO_BE_WORD1, 12))>; } // IsLittleEndian, HasP9Vector - def : Pat<(v2f64 (load xoaddr:$src)), (LXVX xoaddr:$src)>; - def : Pat<(v2i64 (load xoaddr:$src)), (LXVX xoaddr:$src)>; - def : Pat<(v4f32 (load xoaddr:$src)), (LXVX xoaddr:$src)>; - def : Pat<(v4i32 (load xoaddr:$src)), (LXVX xoaddr:$src)>; - def : Pat<(v4i32 (int_ppc_vsx_lxvw4x xoaddr:$src)), (LXVX xoaddr:$src)>; - def : Pat<(v2f64 (int_ppc_vsx_lxvd2x xoaddr:$src)), (LXVX xoaddr:$src)>; - def : Pat<(store v2f64:$rS, xoaddr:$dst), (STXVX $rS, xoaddr:$dst)>; - def : Pat<(store v2i64:$rS, xoaddr:$dst), (STXVX $rS, xoaddr:$dst)>; - def : Pat<(store v4f32:$rS, xoaddr:$dst), (STXVX $rS, xoaddr:$dst)>; - def : Pat<(store v4i32:$rS, xoaddr:$dst), (STXVX $rS, xoaddr:$dst)>; - def : Pat<(int_ppc_vsx_stxvw4x v4i32:$rS, xoaddr:$dst), - (STXVX $rS, xoaddr:$dst)>; - def : Pat<(int_ppc_vsx_stxvd2x v2f64:$rS, xoaddr:$dst), - (STXVX $rS, xoaddr:$dst)>; - + // D-Form Load/Store + def : Pat<(v4i32 (load iaddr:$src)), (LXV memrix16:$src)>; + def : Pat<(v4f32 (load iaddr:$src)), (LXV memrix16:$src)>; + def : Pat<(v2i64 (load iaddr:$src)), (LXV memrix16:$src)>; + def : Pat<(v2f64 (load iaddr:$src)), (LXV memrix16:$src)>; + def : Pat<(v4i32 (int_ppc_vsx_lxvw4x iaddr:$src)), (LXV memrix16:$src)>; + def : Pat<(v2f64 (int_ppc_vsx_lxvd2x iaddr:$src)), (LXV memrix16:$src)>; + + def : Pat<(store v4f32:$rS, iaddr:$dst), (STXV $rS, memrix16:$dst)>; + def : Pat<(store v4i32:$rS, iaddr:$dst), (STXV $rS, memrix16:$dst)>; + def : Pat<(store v2f64:$rS, iaddr:$dst), (STXV $rS, memrix16:$dst)>; + def : Pat<(store v2i64:$rS, iaddr:$dst), (STXV $rS, memrix16:$dst)>; + def : Pat<(int_ppc_vsx_stxvw4x v4i32:$rS, iaddr:$dst), + (STXV $rS, memrix16:$dst)>; + def : Pat<(int_ppc_vsx_stxvd2x v2f64:$rS, iaddr:$dst), + (STXV $rS, memrix16:$dst)>; + + + def : Pat<(v2f64 (load xaddr:$src)), (LXVX xaddr:$src)>; + def : Pat<(v2i64 (load xaddr:$src)), (LXVX xaddr:$src)>; + def : Pat<(v4f32 (load xaddr:$src)), (LXVX xaddr:$src)>; + def : Pat<(v4i32 (load xaddr:$src)), (LXVX xaddr:$src)>; + def : Pat<(v4i32 (int_ppc_vsx_lxvw4x xaddr:$src)), (LXVX xaddr:$src)>; + def : Pat<(v2f64 (int_ppc_vsx_lxvd2x xaddr:$src)), (LXVX xaddr:$src)>; + def : Pat<(store v2f64:$rS, xaddr:$dst), (STXVX $rS, xaddr:$dst)>; + def : Pat<(store v2i64:$rS, xaddr:$dst), (STXVX $rS, xaddr:$dst)>; + def : Pat<(store v4f32:$rS, xaddr:$dst), (STXVX $rS, xaddr:$dst)>; + def : Pat<(store v4i32:$rS, xaddr:$dst), (STXVX $rS, xaddr:$dst)>; + def : Pat<(int_ppc_vsx_stxvw4x v4i32:$rS, xaddr:$dst), + (STXVX $rS, xaddr:$dst)>; + def : 
Pat<(int_ppc_vsx_stxvd2x v2f64:$rS, xaddr:$dst), + (STXVX $rS, xaddr:$dst)>; def : Pat<(v4i32 (scalar_to_vector (i32 (load xoaddr:$src)))), (v4i32 (LXVWSX xoaddr:$src))>; def : Pat<(v4f32 (scalar_to_vector (f32 (load xoaddr:$src)))), @@ -2704,9 +2724,15 @@ def FltToUIntLoad { def FltToLongLoad { dag A = (i64 (PPCmfvsr (PPCfctidz (f64 (extloadf32 xoaddr:$A))))); } +def FltToLongLoadP9 { + dag A = (i64 (PPCmfvsr (PPCfctidz (f64 (extloadf32 iaddr:$A))))); +} def FltToULongLoad { dag A = (i64 (PPCmfvsr (PPCfctiduz (f64 (extloadf32 xoaddr:$A))))); } +def FltToULongLoadP9 { + dag A = (i64 (PPCmfvsr (PPCfctiduz (f64 (extloadf32 iaddr:$A))))); +} def FltToLong { dag A = (i64 (PPCmfvsr (PPCfctidz (fpextend f32:$A)))); } @@ -2728,9 +2754,15 @@ def DblToULong { def DblToIntLoad { dag A = (i32 (PPCmfvsr (PPCfctiwz (f64 (load xoaddr:$A))))); } +def DblToIntLoadP9 { + dag A = (i32 (PPCmfvsr (PPCfctiwz (f64 (load iaddr:$A))))); +} def DblToUIntLoad { dag A = (i32 (PPCmfvsr (PPCfctiwuz (f64 (load xoaddr:$A))))); } +def DblToUIntLoadP9 { + dag A = (i32 (PPCmfvsr (PPCfctiwuz (f64 (load iaddr:$A))))); +} def DblToLongLoad { dag A = (i64 (PPCmfvsr (PPCfctidz (f64 (load xoaddr:$A))))); } @@ -2898,17 +2930,17 @@ let AddedComplexity = 400 in { (v4i32 (XVCVSPSXWS (LXVWSX xoaddr:$A)))>; def : Pat<(v4i32 (scalar_to_vector FltToUIntLoad.A)), (v4i32 (XVCVSPUXWS (LXVWSX xoaddr:$A)))>; - def : Pat<(v4i32 (scalar_to_vector DblToIntLoad.A)), + def : Pat<(v4i32 (scalar_to_vector DblToIntLoadP9.A)), (v4i32 (XXSPLTW (COPY_TO_REGCLASS (XSCVDPSXWS (DFLOADf64 iaddr:$A)), VSRC), 1))>; - def : Pat<(v4i32 (scalar_to_vector DblToUIntLoad.A)), + def : Pat<(v4i32 (scalar_to_vector DblToUIntLoadP9.A)), (v4i32 (XXSPLTW (COPY_TO_REGCLASS (XSCVDPUXWS (DFLOADf64 iaddr:$A)), VSRC), 1))>; - def : Pat<(v2i64 (scalar_to_vector FltToLongLoad.A)), + def : Pat<(v2i64 (scalar_to_vector FltToLongLoadP9.A)), (v2i64 (XXPERMDIs (XSCVDPSXDS (COPY_TO_REGCLASS (DFLOADf32 iaddr:$A), VSFRC)), 0))>; - def : Pat<(v2i64 (scalar_to_vector FltToULongLoad.A)), + def : Pat<(v2i64 (scalar_to_vector FltToULongLoadP9.A)), (v2i64 (XXPERMDIs (XSCVDPUXDS (COPY_TO_REGCLASS (DFLOADf32 iaddr:$A), VSFRC)), 0))>; diff --git a/lib/Target/SystemZ/SystemZExpandPseudo.cpp b/lib/Target/SystemZ/SystemZExpandPseudo.cpp index 92ce8089c24f..d02db9a617a3 100644 --- a/lib/Target/SystemZ/SystemZExpandPseudo.cpp +++ b/lib/Target/SystemZ/SystemZExpandPseudo.cpp @@ -74,7 +74,7 @@ bool SystemZExpandPseudo::expandLOCRMux(MachineBasicBlock &MBB, unsigned CCValid = MI.getOperand(3).getImm(); unsigned CCMask = MI.getOperand(4).getImm(); - LivePhysRegs LiveRegs(&TII->getRegisterInfo()); + LivePhysRegs LiveRegs(TII->getRegisterInfo()); LiveRegs.addLiveOuts(MBB); for (auto I = std::prev(MBB.end()); I != MBBI; --I) LiveRegs.stepBackward(*I); diff --git a/lib/Target/SystemZ/SystemZInstrInfo.cpp b/lib/Target/SystemZ/SystemZInstrInfo.cpp index a30bf34857b5..b34c181124de 100644 --- a/lib/Target/SystemZ/SystemZInstrInfo.cpp +++ b/lib/Target/SystemZ/SystemZInstrInfo.cpp @@ -236,32 +236,30 @@ void SystemZInstrInfo::expandZExtPseudo(MachineInstr &MI, unsigned LowOpcode, void SystemZInstrInfo::expandLoadStackGuard(MachineInstr *MI) const { MachineBasicBlock *MBB = MI->getParent(); MachineFunction &MF = *MBB->getParent(); - const unsigned Reg = MI->getOperand(0).getReg(); + const unsigned Reg64 = MI->getOperand(0).getReg(); + const unsigned Reg32 = RI.getSubReg(Reg64, SystemZ::subreg_l32); - // Conveniently, all 4 instructions are cloned from LOAD_STACK_GUARD, - // so they already have operand 0 set to 
reg. + // EAR can only load the low subregister so us a shift for %a0 to produce + // the GR containing %a0 and %a1. // ear <reg>, %a0 - MachineInstr *Ear1MI = MF.CloneMachineInstr(MI); - MBB->insert(MI, Ear1MI); - Ear1MI->setDesc(get(SystemZ::EAR)); - MachineInstrBuilder(MF, Ear1MI).addReg(SystemZ::A0); + BuildMI(*MBB, MI, MI->getDebugLoc(), get(SystemZ::EAR), Reg32) + .addReg(SystemZ::A0) + .addReg(Reg64, RegState::ImplicitDefine); // sllg <reg>, <reg>, 32 - MachineInstr *SllgMI = MF.CloneMachineInstr(MI); - MBB->insert(MI, SllgMI); - SllgMI->setDesc(get(SystemZ::SLLG)); - MachineInstrBuilder(MF, SllgMI).addReg(Reg).addReg(0).addImm(32); + BuildMI(*MBB, MI, MI->getDebugLoc(), get(SystemZ::SLLG), Reg64) + .addReg(Reg64) + .addReg(0) + .addImm(32); // ear <reg>, %a1 - MachineInstr *Ear2MI = MF.CloneMachineInstr(MI); - MBB->insert(MI, Ear2MI); - Ear2MI->setDesc(get(SystemZ::EAR)); - MachineInstrBuilder(MF, Ear2MI).addReg(SystemZ::A1); + BuildMI(*MBB, MI, MI->getDebugLoc(), get(SystemZ::EAR), Reg32) + .addReg(SystemZ::A1); // lg <reg>, 40(<reg>) MI->setDesc(get(SystemZ::LG)); - MachineInstrBuilder(MF, MI).addReg(Reg).addImm(40).addReg(0); + MachineInstrBuilder(MF, MI).addReg(Reg64).addImm(40).addReg(0); } // Emit a zero-extending move from 32-bit GPR SrcReg to 32-bit GPR diff --git a/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/lib/Target/SystemZ/SystemZTargetTransformInfo.h index 3766ed45b8c4..ad597f5c65f0 100644 --- a/lib/Target/SystemZ/SystemZTargetTransformInfo.h +++ b/lib/Target/SystemZ/SystemZTargetTransformInfo.h @@ -55,6 +55,7 @@ public: unsigned getNumberOfRegisters(bool Vector); unsigned getRegisterBitWidth(bool Vector); + bool prefersVectorizedAddressing() { return false; } bool supportsEfficientVectorElementLoadStore() { return true; } bool enableInterleavedAccessVectorization() { return true; } diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp index 32ab475f1186..e5d3209ec6a9 100644 --- a/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -1316,16 +1316,17 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) { while (!Done) { bool UpdateLocLex = true; + AsmToken::TokenKind TK = getLexer().getKind(); // The period in the dot operator (e.g., [ebx].foo.bar) is parsed as an // identifier. Don't try an parse it as a register. - if (PrevTK != AsmToken::Error && Tok.getString().startswith(".")) + if (PrevTK != AsmToken::Error && Tok.getString().startswith(".") && + TK != AsmToken::Identifier) break; // If we're parsing an immediate expression, we don't expect a '['. 
if (SM.getStopOnLBrac() && getLexer().getKind() == AsmToken::LBrac) break; - AsmToken::TokenKind TK = getLexer().getKind(); switch (TK) { default: { if (SM.isValidEndState()) { diff --git a/lib/Target/X86/CMakeLists.txt b/lib/Target/X86/CMakeLists.txt index fc4adddc149b..7471373334f6 100644 --- a/lib/Target/X86/CMakeLists.txt +++ b/lib/Target/X86/CMakeLists.txt @@ -11,6 +11,7 @@ tablegen(LLVM X86GenFastISel.inc -gen-fast-isel) tablegen(LLVM X86GenCallingConv.inc -gen-callingconv) tablegen(LLVM X86GenSubtargetInfo.inc -gen-subtarget) tablegen(LLVM X86GenEVEX2VEXTables.inc -gen-x86-EVEX2VEX-tables) +tablegen(LLVM X86GenFoldTables.inc -gen-x86-fold-tables) if(LLVM_BUILD_GLOBAL_ISEL) tablegen(LLVM X86GenRegisterBank.inc -gen-register-bank) tablegen(LLVM X86GenGlobalISel.inc -gen-global-isel) diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td index 3a421fe77392..fe105298f5c1 100644 --- a/lib/Target/X86/X86.td +++ b/lib/Target/X86/X86.td @@ -127,6 +127,9 @@ def FeatureERI : SubtargetFeature<"avx512er", "HasERI", "true", def FeatureCDI : SubtargetFeature<"avx512cd", "HasCDI", "true", "Enable AVX-512 Conflict Detection Instructions", [FeatureAVX512]>; +def FeatureVPOPCNTDQ : SubtargetFeature<"avx512vpopcntdq", "HasVPOPCNTDQ", + "true", "Enable AVX-512 Population Count Instructions", + [FeatureAVX512]>; def FeaturePFI : SubtargetFeature<"avx512pf", "HasPFI", "true", "Enable AVX-512 PreFetch Instructions", [FeatureAVX512]>; diff --git a/lib/Target/X86/X86FloatingPoint.cpp b/lib/Target/X86/X86FloatingPoint.cpp index a5489b9aa8b7..313920e02c3e 100644 --- a/lib/Target/X86/X86FloatingPoint.cpp +++ b/lib/Target/X86/X86FloatingPoint.cpp @@ -1655,8 +1655,8 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &Inst) { } void FPS::setKillFlags(MachineBasicBlock &MBB) const { - const TargetRegisterInfo *TRI = - MBB.getParent()->getSubtarget().getRegisterInfo(); + const TargetRegisterInfo &TRI = + *MBB.getParent()->getSubtarget().getRegisterInfo(); LivePhysRegs LPR(TRI); LPR.addLiveOuts(MBB); diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 37b248416e4a..86744b064132 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -1364,6 +1364,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::MUL, MVT::v8i64, Legal); } + if (Subtarget.hasVPOPCNTDQ()) { + // VPOPCNTDQ sub-targets extend 128/256 vectors to use the avx512 + // version of popcntd/q. + for (auto VT : {MVT::v16i32, MVT::v8i64, MVT::v8i32, MVT::v4i64, + MVT::v4i32, MVT::v2i64}) + setOperationAction(ISD::CTPOP, VT, Legal); + } + // Custom lower several nodes. 
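The CTPOP legalization added just above means that, on a subtarget with the avx512vpopcntdq feature, whole-vector population counts no longer have to be scalarized. The small C++ example below is only a sketch of code that can benefit; whether the loop actually ends up as vpopcntd/vpopcntq depends on the vectorizer and the usual AVX-512 flags, so it is not a guaranteed codegen outcome.

#include <cstddef>
#include <cstdint>

// Per-element population count over a buffer; with AVX-512 plus the
// avx512vpopcntdq feature, the vector CTPOP for v8i64 (and friends) is now
// legal, so the vectorized loop does not have to fall back to scalar popcnts.
void popcount_all(const uint64_t *in, uint64_t *out, size_t n) {
  for (size_t i = 0; i < n; ++i)
    out[i] = static_cast<uint64_t>(__builtin_popcountll(in[i]));
}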
for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64, MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) { diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td index f9344413bbcf..d8702693884d 100644 --- a/lib/Target/X86/X86InstrAVX512.td +++ b/lib/Target/X86/X86InstrAVX512.td @@ -2693,22 +2693,22 @@ multiclass avx512_load_vl<bits<8> opc, string OpcodeStr, } multiclass avx512_store<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, - PatFrag st_frag, PatFrag mstore> { + PatFrag st_frag, PatFrag mstore, string Name> { let hasSideEffects = 0 in { def rr_REV : AVX512PI<opc, MRMDestReg, (outs _.RC:$dst), (ins _.RC:$src), OpcodeStr # ".s\t{$src, $dst|$dst, $src}", - [], _.ExeDomain>, EVEX; + [], _.ExeDomain>, EVEX, FoldGenData<Name#rr>; def rrk_REV : AVX512PI<opc, MRMDestReg, (outs _.RC:$dst), (ins _.KRCWM:$mask, _.RC:$src), OpcodeStr # ".s\t{$src, ${dst} {${mask}}|"# "${dst} {${mask}}, $src}", - [], _.ExeDomain>, EVEX, EVEX_K; + [], _.ExeDomain>, EVEX, EVEX_K, FoldGenData<Name#rrk>; def rrkz_REV : AVX512PI<opc, MRMDestReg, (outs _.RC:$dst), (ins _.KRCWM:$mask, _.RC:$src), OpcodeStr # ".s\t{$src, ${dst} {${mask}} {z}|" # "${dst} {${mask}} {z}, $src}", - [], _.ExeDomain>, EVEX, EVEX_KZ; + [], _.ExeDomain>, EVEX, EVEX_KZ, FoldGenData<Name#rrkz>; } def mr : AVX512PI<opc, MRMDestMem, (outs), (ins _.MemOp:$dst, _.RC:$src), @@ -2726,80 +2726,92 @@ multiclass avx512_store<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, multiclass avx512_store_vl< bits<8> opc, string OpcodeStr, - AVX512VLVectorVTInfo _, Predicate prd> { + AVX512VLVectorVTInfo _, Predicate prd, + string Name> { let Predicates = [prd] in defm Z : avx512_store<opc, OpcodeStr, _.info512, store, - masked_store_unaligned>, EVEX_V512; + masked_store_unaligned, Name#Z>, EVEX_V512; let Predicates = [prd, HasVLX] in { defm Z256 : avx512_store<opc, OpcodeStr, _.info256, store, - masked_store_unaligned>, EVEX_V256; + masked_store_unaligned, Name#Z256>, EVEX_V256; defm Z128 : avx512_store<opc, OpcodeStr, _.info128, store, - masked_store_unaligned>, EVEX_V128; + masked_store_unaligned, Name#Z128>, EVEX_V128; } } multiclass avx512_alignedstore_vl<bits<8> opc, string OpcodeStr, - AVX512VLVectorVTInfo _, Predicate prd> { + AVX512VLVectorVTInfo _, Predicate prd, + string Name> { let Predicates = [prd] in defm Z : avx512_store<opc, OpcodeStr, _.info512, alignedstore512, - masked_store_aligned512>, EVEX_V512; + masked_store_aligned512, Name#Z>, EVEX_V512; let Predicates = [prd, HasVLX] in { defm Z256 : avx512_store<opc, OpcodeStr, _.info256, alignedstore256, - masked_store_aligned256>, EVEX_V256; + masked_store_aligned256, Name#Z256>, EVEX_V256; defm Z128 : avx512_store<opc, OpcodeStr, _.info128, alignedstore, - masked_store_aligned128>, EVEX_V128; + masked_store_aligned128, Name#Z128>, EVEX_V128; } } defm VMOVAPS : avx512_alignedload_vl<0x28, "vmovaps", avx512vl_f32_info, HasAVX512>, avx512_alignedstore_vl<0x29, "vmovaps", avx512vl_f32_info, - HasAVX512>, PS, EVEX_CD8<32, CD8VF>; + HasAVX512, "VMOVAPS">, + PS, EVEX_CD8<32, CD8VF>; defm VMOVAPD : avx512_alignedload_vl<0x28, "vmovapd", avx512vl_f64_info, HasAVX512>, avx512_alignedstore_vl<0x29, "vmovapd", avx512vl_f64_info, - HasAVX512>, PD, VEX_W, EVEX_CD8<64, CD8VF>; + HasAVX512, "VMOVAPD">, + PD, VEX_W, EVEX_CD8<64, CD8VF>; defm VMOVUPS : avx512_load_vl<0x10, "vmovups", avx512vl_f32_info, HasAVX512, null_frag>, - avx512_store_vl<0x11, "vmovups", avx512vl_f32_info, HasAVX512>, + avx512_store_vl<0x11, "vmovups", avx512vl_f32_info, HasAVX512, + "VMOVUPS">, PS, 
EVEX_CD8<32, CD8VF>; defm VMOVUPD : avx512_load_vl<0x10, "vmovupd", avx512vl_f64_info, HasAVX512, null_frag>, - avx512_store_vl<0x11, "vmovupd", avx512vl_f64_info, HasAVX512>, + avx512_store_vl<0x11, "vmovupd", avx512vl_f64_info, HasAVX512, + "VMOVUPD">, PD, VEX_W, EVEX_CD8<64, CD8VF>; defm VMOVDQA32 : avx512_alignedload_vl<0x6F, "vmovdqa32", avx512vl_i32_info, HasAVX512>, avx512_alignedstore_vl<0x7F, "vmovdqa32", avx512vl_i32_info, - HasAVX512>, PD, EVEX_CD8<32, CD8VF>; + HasAVX512, "VMOVDQA32">, + PD, EVEX_CD8<32, CD8VF>; defm VMOVDQA64 : avx512_alignedload_vl<0x6F, "vmovdqa64", avx512vl_i64_info, HasAVX512>, avx512_alignedstore_vl<0x7F, "vmovdqa64", avx512vl_i64_info, - HasAVX512>, PD, VEX_W, EVEX_CD8<64, CD8VF>; + HasAVX512, "VMOVDQA64">, + PD, VEX_W, EVEX_CD8<64, CD8VF>; defm VMOVDQU8 : avx512_load_vl<0x6F, "vmovdqu8", avx512vl_i8_info, HasBWI>, - avx512_store_vl<0x7F, "vmovdqu8", avx512vl_i8_info, - HasBWI>, XD, EVEX_CD8<8, CD8VF>; + avx512_store_vl<0x7F, "vmovdqu8", avx512vl_i8_info, + HasBWI, "VMOVDQU8">, + XD, EVEX_CD8<8, CD8VF>; defm VMOVDQU16 : avx512_load_vl<0x6F, "vmovdqu16", avx512vl_i16_info, HasBWI>, avx512_store_vl<0x7F, "vmovdqu16", avx512vl_i16_info, - HasBWI>, XD, VEX_W, EVEX_CD8<16, CD8VF>; + HasBWI, "VMOVDQU16">, + XD, VEX_W, EVEX_CD8<16, CD8VF>; defm VMOVDQU32 : avx512_load_vl<0x6F, "vmovdqu32", avx512vl_i32_info, HasAVX512, null_frag>, avx512_store_vl<0x7F, "vmovdqu32", avx512vl_i32_info, - HasAVX512>, XS, EVEX_CD8<32, CD8VF>; + HasAVX512, "VMOVDQU32">, + XS, EVEX_CD8<32, CD8VF>; defm VMOVDQU64 : avx512_load_vl<0x6F, "vmovdqu64", avx512vl_i64_info, HasAVX512, null_frag>, avx512_store_vl<0x7F, "vmovdqu64", avx512vl_i64_info, - HasAVX512>, XS, VEX_W, EVEX_CD8<64, CD8VF>; + HasAVX512, "VMOVDQU64">, + XS, VEX_W, EVEX_CD8<64, CD8VF>; // Special instructions to help with spilling when we don't have VLX. We need // to load or store from a ZMM register instead. 
These are converted in @@ -3354,17 +3366,52 @@ def : Pat<(int_x86_avx512_mask_store_ss addr:$dst, VR128X:$src, GR8:$mask), (VMOVSSZmrk addr:$dst, (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), GR8:$mask, sub_8bit)), VK1WM), (COPY_TO_REGCLASS VR128X:$src, FR32X))>; -let hasSideEffects = 0 in -defm VMOVSSZrr_REV : AVX512_maskable_in_asm<0x11, MRMDestReg, f32x_info, - (outs VR128X:$dst), (ins VR128X:$src1, FR32X:$src2), - "vmovss.s", "$src2, $src1", "$src1, $src2", []>, - XS, EVEX_4V, VEX_LIG; - -let hasSideEffects = 0 in -defm VMOVSDZrr_REV : AVX512_maskable_in_asm<0x11, MRMDestReg, f64x_info, - (outs VR128X:$dst), (ins VR128X:$src1, FR64X:$src2), - "vmovsd.s", "$src2, $src1", "$src1, $src2", []>, - XD, EVEX_4V, VEX_LIG, VEX_W; +let hasSideEffects = 0 in { + def VMOVSSZrr_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), + (ins VR128X:$src1, FR32X:$src2), + "vmovss.s\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [], NoItinerary>, XS, EVEX_4V, VEX_LIG, + FoldGenData<"VMOVSSZrr">; + +let Constraints = "$src0 = $dst" in + def VMOVSSZrrk_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), + (ins f32x_info.RC:$src0, f32x_info.KRCWM:$mask, + VR128X:$src1, FR32X:$src2), + "vmovss.s\t{$src2, $src1, $dst {${mask}}|"# + "$dst {${mask}}, $src1, $src2}", + [], NoItinerary>, EVEX_K, XS, EVEX_4V, VEX_LIG, + FoldGenData<"VMOVSSZrrk">; + + def VMOVSSZrrkz_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), + (ins f32x_info.KRCWM:$mask, VR128X:$src1, FR32X:$src2), + "vmovss.s\t{$src2, $src1, $dst {${mask}} {z}|"# + "$dst {${mask}} {z}, $src1, $src2}", + [], NoItinerary>, EVEX_KZ, XS, EVEX_4V, VEX_LIG, + FoldGenData<"VMOVSSZrrkz">; + + def VMOVSDZrr_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), + (ins VR128X:$src1, FR64X:$src2), + "vmovsd.s\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [], NoItinerary>, XD, EVEX_4V, VEX_LIG, VEX_W, + FoldGenData<"VMOVSDZrr">; + +let Constraints = "$src0 = $dst" in + def VMOVSDZrrk_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), + (ins f64x_info.RC:$src0, f64x_info.KRCWM:$mask, + VR128X:$src1, FR64X:$src2), + "vmovsd.s\t{$src2, $src1, $dst {${mask}}|"# + "$dst {${mask}}, $src1, $src2}", + [], NoItinerary>, EVEX_K, XD, EVEX_4V, VEX_LIG, + VEX_W, FoldGenData<"VMOVSDZrrk">; + + def VMOVSDZrrkz_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), + (ins f64x_info.KRCWM:$mask, VR128X:$src1, + FR64X:$src2), + "vmovsd.s\t{$src2, $src1, $dst {${mask}} {z}|"# + "$dst {${mask}} {z}, $src1, $src2}", + [], NoItinerary>, EVEX_KZ, XD, EVEX_4V, VEX_LIG, + VEX_W, FoldGenData<"VMOVSDZrrkz">; +} let Predicates = [HasAVX512] in { let AddedComplexity = 15 in { @@ -8649,6 +8696,41 @@ let Predicates = [HasCDI, NoVLX] in { } //===---------------------------------------------------------------------===// +// Counts number of ones - VPOPCNTD and VPOPCNTQ +//===---------------------------------------------------------------------===// + +multiclass avx512_unary_rmb_popcnt<bits<8> opc, string OpcodeStr, X86VectorVTInfo VTInfo> { + let Predicates = [HasVPOPCNTDQ] in + defm Z : avx512_unary_rmb<opc, OpcodeStr, ctpop, VTInfo>, EVEX_V512; +} + +// Use 512bit version to implement 128/256 bit. 
+multiclass avx512_unary_lowering<SDNode OpNode, AVX512VLVectorVTInfo _, Predicate prd> { + let Predicates = [prd] in { + def Z256_Alt : Pat<(_.info256.VT(OpNode _.info256.RC:$src1)), + (EXTRACT_SUBREG + (!cast<Instruction>(NAME # "Zrr") + (INSERT_SUBREG(_.info512.VT(IMPLICIT_DEF)), + _.info256.RC:$src1, + _.info256.SubRegIdx)), + _.info256.SubRegIdx)>; + + def Z128_Alt : Pat<(_.info128.VT(OpNode _.info128.RC:$src1)), + (EXTRACT_SUBREG + (!cast<Instruction>(NAME # "Zrr") + (INSERT_SUBREG(_.info512.VT(IMPLICIT_DEF)), + _.info128.RC:$src1, + _.info128.SubRegIdx)), + _.info128.SubRegIdx)>; + } +} + +defm VPOPCNTD : avx512_unary_rmb_popcnt<0x55, "vpopcntd", v16i32_info>, + avx512_unary_lowering<ctpop, avx512vl_i32_info, HasVPOPCNTDQ>; +defm VPOPCNTQ : avx512_unary_rmb_popcnt<0x55, "vpopcntq", v8i64_info>, + avx512_unary_lowering<ctpop, avx512vl_i64_info, HasVPOPCNTDQ>, VEX_W; + +//===---------------------------------------------------------------------===// // Replicate Single FP - MOVSHDUP and MOVSLDUP //===---------------------------------------------------------------------===// multiclass avx512_replicate<bits<8> opc, string OpcodeStr, SDNode OpNode>{ @@ -8795,7 +8877,7 @@ multiclass avx512_extract_elt_w<string OpcodeStr, X86VectorVTInfo _> { def rr_REV : AVX512Ii8<0x15, MRMDestReg, (outs GR32orGR64:$dst), (ins _.RC:$src1, u8imm:$src2), OpcodeStr#".s\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, - EVEX, TAPD; + EVEX, TAPD, FoldGenData<NAME#rr>; defm NAME : avx512_extract_elt_bw_m<0x15, OpcodeStr, X86pextrw, _>, TAPD; } diff --git a/lib/Target/X86/X86InstrArithmetic.td b/lib/Target/X86/X86InstrArithmetic.td index 66382014f6e8..e38bbc9b3d36 100644 --- a/lib/Target/X86/X86InstrArithmetic.td +++ b/lib/Target/X86/X86InstrArithmetic.td @@ -964,10 +964,10 @@ multiclass ArithBinOp_RF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, } // isConvertibleToThreeAddress } // isCommutable - def NAME#8rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi8>; - def NAME#16rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi16>; - def NAME#32rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi32>; - def NAME#64rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi64>; + def NAME#8rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi8>, FoldGenData<NAME#8rr>; + def NAME#16rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi16>, FoldGenData<NAME#16rr>; + def NAME#32rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi32>, FoldGenData<NAME#32rr>; + def NAME#64rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi64>, FoldGenData<NAME#64rr>; def NAME#8rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi8 , opnodeflag>; def NAME#16rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi16, opnodeflag>; @@ -1049,10 +1049,10 @@ multiclass ArithBinOp_RFF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, } // isConvertibleToThreeAddress } // isCommutable - def NAME#8rr_REV : BinOpRR_RFF_Rev<BaseOpc2, mnemonic, Xi8>; - def NAME#16rr_REV : BinOpRR_RFF_Rev<BaseOpc2, mnemonic, Xi16>; - def NAME#32rr_REV : BinOpRR_RFF_Rev<BaseOpc2, mnemonic, Xi32>; - def NAME#64rr_REV : BinOpRR_RFF_Rev<BaseOpc2, mnemonic, Xi64>; + def NAME#8rr_REV : BinOpRR_RFF_Rev<BaseOpc2, mnemonic, Xi8>, FoldGenData<NAME#8rr>; + def NAME#16rr_REV : BinOpRR_RFF_Rev<BaseOpc2, mnemonic, Xi16>, FoldGenData<NAME#16rr>; + def NAME#32rr_REV : BinOpRR_RFF_Rev<BaseOpc2, mnemonic, Xi32>, FoldGenData<NAME#32rr>; + def NAME#64rr_REV : BinOpRR_RFF_Rev<BaseOpc2, mnemonic, Xi64>, FoldGenData<NAME#64rr>; def NAME#8rm : BinOpRM_RFF<BaseOpc2, mnemonic, Xi8 , opnode>; def NAME#16rm : BinOpRM_RFF<BaseOpc2, mnemonic, Xi16, opnode>; @@ -1129,10 +1129,10 @@ multiclass 
ArithBinOp_F<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, } } // isCommutable - def NAME#8rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi8>; - def NAME#16rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi16>; - def NAME#32rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi32>; - def NAME#64rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi64>; + def NAME#8rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi8>, FoldGenData<NAME#8rr>; + def NAME#16rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi16>, FoldGenData<NAME#16rr>; + def NAME#32rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi32>, FoldGenData<NAME#32rr>; + def NAME#64rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi64>, FoldGenData<NAME#64rr>; def NAME#8rm : BinOpRM_F<BaseOpc2, mnemonic, Xi8 , opnode>; def NAME#16rm : BinOpRM_F<BaseOpc2, mnemonic, Xi16, opnode>; diff --git a/lib/Target/X86/X86InstrFMA.td b/lib/Target/X86/X86InstrFMA.td index 1941ae57f0f1..3a3cdc9fa574 100644 --- a/lib/Target/X86/X86InstrFMA.td +++ b/lib/Target/X86/X86InstrFMA.td @@ -297,7 +297,7 @@ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in (ins RC:$src1, RC:$src2, RC:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>, - VEX_LIG; + VEX_LIG, FoldGenData<NAME#rr>; } multiclass fma4s_int<bits<8> opc, string OpcodeStr, Operand memop, @@ -321,6 +321,12 @@ let isCodeGenOnly = 1 in { "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set VR128:$dst, (Int VR128:$src1, mem_cpat:$src2, VR128:$src3))]>, VEX_LIG; +let hasSideEffects = 0 in + def rr_Int_REV : FMA4<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, VR128:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + []>, VEX_LIG, FoldGenData<NAME#rr_Int>; } // isCodeGenOnly = 1 } @@ -372,12 +378,13 @@ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in { def rr_REV : FMA4<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, VR128:$src3), !strconcat(OpcodeStr, - "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>; + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>, + FoldGenData<NAME#rr>; def Yrr_REV : FMA4<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src1, VR256:$src2, VR256:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>, - VEX_L; + VEX_L, FoldGenData<NAME#Yrr>; } // isCodeGenOnly = 1 } diff --git a/lib/Target/X86/X86InstrFormats.td b/lib/Target/X86/X86InstrFormats.td index c2fe786732dc..bfcbf71d252f 100644 --- a/lib/Target/X86/X86InstrFormats.td +++ b/lib/Target/X86/X86InstrFormats.td @@ -225,6 +225,12 @@ class Has3DNow0F0FOpcode { bit has3DNow0F0FOpcode = 1; } class XOP { Encoding OpEnc = EncXOP; } class XOP_4V : XOP { bit hasVEX_4V = 1; } +// Specify the alternative register form instruction to replace the current +// instruction in case it was picked during generation of memory folding tables +class FoldGenData<string _RegisterForm> { + string FoldGenRegForm = _RegisterForm; +} + class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins, string AsmStr, InstrItinClass itin, @@ -304,6 +310,10 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins, CD8_EltSize, !srl(VectSize, CD8_Form{1-0}))), 0); + // Used in the memory folding generation (TableGen backend) to point to an alternative + // instruction to replace the current one in case it got picked during generation. + string FoldGenRegForm = ?; + // TSFlags layout should be kept in sync with X86BaseInfo.h. 
let TSFlags{6-0} = FormBits; let TSFlags{8-7} = OpSizeBits; diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index f7083a7448ce..33fbd41bb631 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -121,172 +121,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) (STI.is64Bit() ? X86::RETQ : X86::RETL)), Subtarget(STI), RI(STI.getTargetTriple()) { - static const X86MemoryFoldTableEntry MemoryFoldTable2Addr[] = { - { X86::ADC32ri, X86::ADC32mi, 0 }, - { X86::ADC32ri8, X86::ADC32mi8, 0 }, - { X86::ADC32rr, X86::ADC32mr, 0 }, - { X86::ADC64ri32, X86::ADC64mi32, 0 }, - { X86::ADC64ri8, X86::ADC64mi8, 0 }, - { X86::ADC64rr, X86::ADC64mr, 0 }, - { X86::ADD16ri, X86::ADD16mi, 0 }, - { X86::ADD16ri8, X86::ADD16mi8, 0 }, - { X86::ADD16ri_DB, X86::ADD16mi, TB_NO_REVERSE }, - { X86::ADD16ri8_DB, X86::ADD16mi8, TB_NO_REVERSE }, - { X86::ADD16rr, X86::ADD16mr, 0 }, - { X86::ADD16rr_DB, X86::ADD16mr, TB_NO_REVERSE }, - { X86::ADD32ri, X86::ADD32mi, 0 }, - { X86::ADD32ri8, X86::ADD32mi8, 0 }, - { X86::ADD32ri_DB, X86::ADD32mi, TB_NO_REVERSE }, - { X86::ADD32ri8_DB, X86::ADD32mi8, TB_NO_REVERSE }, - { X86::ADD32rr, X86::ADD32mr, 0 }, - { X86::ADD32rr_DB, X86::ADD32mr, TB_NO_REVERSE }, - { X86::ADD64ri32, X86::ADD64mi32, 0 }, - { X86::ADD64ri8, X86::ADD64mi8, 0 }, - { X86::ADD64ri32_DB,X86::ADD64mi32, TB_NO_REVERSE }, - { X86::ADD64ri8_DB, X86::ADD64mi8, TB_NO_REVERSE }, - { X86::ADD64rr, X86::ADD64mr, 0 }, - { X86::ADD64rr_DB, X86::ADD64mr, TB_NO_REVERSE }, - { X86::ADD8ri, X86::ADD8mi, 0 }, - { X86::ADD8rr, X86::ADD8mr, 0 }, - { X86::AND16ri, X86::AND16mi, 0 }, - { X86::AND16ri8, X86::AND16mi8, 0 }, - { X86::AND16rr, X86::AND16mr, 0 }, - { X86::AND32ri, X86::AND32mi, 0 }, - { X86::AND32ri8, X86::AND32mi8, 0 }, - { X86::AND32rr, X86::AND32mr, 0 }, - { X86::AND64ri32, X86::AND64mi32, 0 }, - { X86::AND64ri8, X86::AND64mi8, 0 }, - { X86::AND64rr, X86::AND64mr, 0 }, - { X86::AND8ri, X86::AND8mi, 0 }, - { X86::AND8rr, X86::AND8mr, 0 }, - { X86::DEC16r, X86::DEC16m, 0 }, - { X86::DEC32r, X86::DEC32m, 0 }, - { X86::DEC64r, X86::DEC64m, 0 }, - { X86::DEC8r, X86::DEC8m, 0 }, - { X86::INC16r, X86::INC16m, 0 }, - { X86::INC32r, X86::INC32m, 0 }, - { X86::INC64r, X86::INC64m, 0 }, - { X86::INC8r, X86::INC8m, 0 }, - { X86::NEG16r, X86::NEG16m, 0 }, - { X86::NEG32r, X86::NEG32m, 0 }, - { X86::NEG64r, X86::NEG64m, 0 }, - { X86::NEG8r, X86::NEG8m, 0 }, - { X86::NOT16r, X86::NOT16m, 0 }, - { X86::NOT32r, X86::NOT32m, 0 }, - { X86::NOT64r, X86::NOT64m, 0 }, - { X86::NOT8r, X86::NOT8m, 0 }, - { X86::OR16ri, X86::OR16mi, 0 }, - { X86::OR16ri8, X86::OR16mi8, 0 }, - { X86::OR16rr, X86::OR16mr, 0 }, - { X86::OR32ri, X86::OR32mi, 0 }, - { X86::OR32ri8, X86::OR32mi8, 0 }, - { X86::OR32rr, X86::OR32mr, 0 }, - { X86::OR64ri32, X86::OR64mi32, 0 }, - { X86::OR64ri8, X86::OR64mi8, 0 }, - { X86::OR64rr, X86::OR64mr, 0 }, - { X86::OR8ri, X86::OR8mi, 0 }, - { X86::OR8rr, X86::OR8mr, 0 }, - { X86::ROL16r1, X86::ROL16m1, 0 }, - { X86::ROL16rCL, X86::ROL16mCL, 0 }, - { X86::ROL16ri, X86::ROL16mi, 0 }, - { X86::ROL32r1, X86::ROL32m1, 0 }, - { X86::ROL32rCL, X86::ROL32mCL, 0 }, - { X86::ROL32ri, X86::ROL32mi, 0 }, - { X86::ROL64r1, X86::ROL64m1, 0 }, - { X86::ROL64rCL, X86::ROL64mCL, 0 }, - { X86::ROL64ri, X86::ROL64mi, 0 }, - { X86::ROL8r1, X86::ROL8m1, 0 }, - { X86::ROL8rCL, X86::ROL8mCL, 0 }, - { X86::ROL8ri, X86::ROL8mi, 0 }, - { X86::ROR16r1, X86::ROR16m1, 0 }, - { X86::ROR16rCL, X86::ROR16mCL, 0 }, - { X86::ROR16ri, X86::ROR16mi, 0 }, - { X86::ROR32r1, X86::ROR32m1, 0 }, - { 
X86::ROR32rCL, X86::ROR32mCL, 0 }, - { X86::ROR32ri, X86::ROR32mi, 0 }, - { X86::ROR64r1, X86::ROR64m1, 0 }, - { X86::ROR64rCL, X86::ROR64mCL, 0 }, - { X86::ROR64ri, X86::ROR64mi, 0 }, - { X86::ROR8r1, X86::ROR8m1, 0 }, - { X86::ROR8rCL, X86::ROR8mCL, 0 }, - { X86::ROR8ri, X86::ROR8mi, 0 }, - { X86::SAR16r1, X86::SAR16m1, 0 }, - { X86::SAR16rCL, X86::SAR16mCL, 0 }, - { X86::SAR16ri, X86::SAR16mi, 0 }, - { X86::SAR32r1, X86::SAR32m1, 0 }, - { X86::SAR32rCL, X86::SAR32mCL, 0 }, - { X86::SAR32ri, X86::SAR32mi, 0 }, - { X86::SAR64r1, X86::SAR64m1, 0 }, - { X86::SAR64rCL, X86::SAR64mCL, 0 }, - { X86::SAR64ri, X86::SAR64mi, 0 }, - { X86::SAR8r1, X86::SAR8m1, 0 }, - { X86::SAR8rCL, X86::SAR8mCL, 0 }, - { X86::SAR8ri, X86::SAR8mi, 0 }, - { X86::SBB32ri, X86::SBB32mi, 0 }, - { X86::SBB32ri8, X86::SBB32mi8, 0 }, - { X86::SBB32rr, X86::SBB32mr, 0 }, - { X86::SBB64ri32, X86::SBB64mi32, 0 }, - { X86::SBB64ri8, X86::SBB64mi8, 0 }, - { X86::SBB64rr, X86::SBB64mr, 0 }, - { X86::SHL16r1, X86::SHL16m1, 0 }, - { X86::SHL16rCL, X86::SHL16mCL, 0 }, - { X86::SHL16ri, X86::SHL16mi, 0 }, - { X86::SHL32r1, X86::SHL32m1, 0 }, - { X86::SHL32rCL, X86::SHL32mCL, 0 }, - { X86::SHL32ri, X86::SHL32mi, 0 }, - { X86::SHL64r1, X86::SHL64m1, 0 }, - { X86::SHL64rCL, X86::SHL64mCL, 0 }, - { X86::SHL64ri, X86::SHL64mi, 0 }, - { X86::SHL8r1, X86::SHL8m1, 0 }, - { X86::SHL8rCL, X86::SHL8mCL, 0 }, - { X86::SHL8ri, X86::SHL8mi, 0 }, - { X86::SHLD16rrCL, X86::SHLD16mrCL, 0 }, - { X86::SHLD16rri8, X86::SHLD16mri8, 0 }, - { X86::SHLD32rrCL, X86::SHLD32mrCL, 0 }, - { X86::SHLD32rri8, X86::SHLD32mri8, 0 }, - { X86::SHLD64rrCL, X86::SHLD64mrCL, 0 }, - { X86::SHLD64rri8, X86::SHLD64mri8, 0 }, - { X86::SHR16r1, X86::SHR16m1, 0 }, - { X86::SHR16rCL, X86::SHR16mCL, 0 }, - { X86::SHR16ri, X86::SHR16mi, 0 }, - { X86::SHR32r1, X86::SHR32m1, 0 }, - { X86::SHR32rCL, X86::SHR32mCL, 0 }, - { X86::SHR32ri, X86::SHR32mi, 0 }, - { X86::SHR64r1, X86::SHR64m1, 0 }, - { X86::SHR64rCL, X86::SHR64mCL, 0 }, - { X86::SHR64ri, X86::SHR64mi, 0 }, - { X86::SHR8r1, X86::SHR8m1, 0 }, - { X86::SHR8rCL, X86::SHR8mCL, 0 }, - { X86::SHR8ri, X86::SHR8mi, 0 }, - { X86::SHRD16rrCL, X86::SHRD16mrCL, 0 }, - { X86::SHRD16rri8, X86::SHRD16mri8, 0 }, - { X86::SHRD32rrCL, X86::SHRD32mrCL, 0 }, - { X86::SHRD32rri8, X86::SHRD32mri8, 0 }, - { X86::SHRD64rrCL, X86::SHRD64mrCL, 0 }, - { X86::SHRD64rri8, X86::SHRD64mri8, 0 }, - { X86::SUB16ri, X86::SUB16mi, 0 }, - { X86::SUB16ri8, X86::SUB16mi8, 0 }, - { X86::SUB16rr, X86::SUB16mr, 0 }, - { X86::SUB32ri, X86::SUB32mi, 0 }, - { X86::SUB32ri8, X86::SUB32mi8, 0 }, - { X86::SUB32rr, X86::SUB32mr, 0 }, - { X86::SUB64ri32, X86::SUB64mi32, 0 }, - { X86::SUB64ri8, X86::SUB64mi8, 0 }, - { X86::SUB64rr, X86::SUB64mr, 0 }, - { X86::SUB8ri, X86::SUB8mi, 0 }, - { X86::SUB8rr, X86::SUB8mr, 0 }, - { X86::XOR16ri, X86::XOR16mi, 0 }, - { X86::XOR16ri8, X86::XOR16mi8, 0 }, - { X86::XOR16rr, X86::XOR16mr, 0 }, - { X86::XOR32ri, X86::XOR32mi, 0 }, - { X86::XOR32ri8, X86::XOR32mi8, 0 }, - { X86::XOR32rr, X86::XOR32mr, 0 }, - { X86::XOR64ri32, X86::XOR64mi32, 0 }, - { X86::XOR64ri8, X86::XOR64mi8, 0 }, - { X86::XOR64rr, X86::XOR64mr, 0 }, - { X86::XOR8ri, X86::XOR8mi, 0 }, - { X86::XOR8rr, X86::XOR8mr, 0 } - }; +// Generated memory folding tables. 
+#include "X86GenFoldTables.inc" for (X86MemoryFoldTableEntry Entry : MemoryFoldTable2Addr) { AddTableEntry(RegOp2MemOpTable2Addr, MemOp2RegOpTable, @@ -295,744 +131,11 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) Entry.Flags | TB_INDEX_0 | TB_FOLDED_LOAD | TB_FOLDED_STORE); } - static const X86MemoryFoldTableEntry MemoryFoldTable0[] = { - { X86::BT16ri8, X86::BT16mi8, TB_FOLDED_LOAD }, - { X86::BT32ri8, X86::BT32mi8, TB_FOLDED_LOAD }, - { X86::BT64ri8, X86::BT64mi8, TB_FOLDED_LOAD }, - { X86::CALL32r, X86::CALL32m, TB_FOLDED_LOAD }, - { X86::CALL64r, X86::CALL64m, TB_FOLDED_LOAD }, - { X86::CMP16ri, X86::CMP16mi, TB_FOLDED_LOAD }, - { X86::CMP16ri8, X86::CMP16mi8, TB_FOLDED_LOAD }, - { X86::CMP16rr, X86::CMP16mr, TB_FOLDED_LOAD }, - { X86::CMP32ri, X86::CMP32mi, TB_FOLDED_LOAD }, - { X86::CMP32ri8, X86::CMP32mi8, TB_FOLDED_LOAD }, - { X86::CMP32rr, X86::CMP32mr, TB_FOLDED_LOAD }, - { X86::CMP64ri32, X86::CMP64mi32, TB_FOLDED_LOAD }, - { X86::CMP64ri8, X86::CMP64mi8, TB_FOLDED_LOAD }, - { X86::CMP64rr, X86::CMP64mr, TB_FOLDED_LOAD }, - { X86::CMP8ri, X86::CMP8mi, TB_FOLDED_LOAD }, - { X86::CMP8rr, X86::CMP8mr, TB_FOLDED_LOAD }, - { X86::DIV16r, X86::DIV16m, TB_FOLDED_LOAD }, - { X86::DIV32r, X86::DIV32m, TB_FOLDED_LOAD }, - { X86::DIV64r, X86::DIV64m, TB_FOLDED_LOAD }, - { X86::DIV8r, X86::DIV8m, TB_FOLDED_LOAD }, - { X86::EXTRACTPSrr, X86::EXTRACTPSmr, TB_FOLDED_STORE }, - { X86::IDIV16r, X86::IDIV16m, TB_FOLDED_LOAD }, - { X86::IDIV32r, X86::IDIV32m, TB_FOLDED_LOAD }, - { X86::IDIV64r, X86::IDIV64m, TB_FOLDED_LOAD }, - { X86::IDIV8r, X86::IDIV8m, TB_FOLDED_LOAD }, - { X86::IMUL16r, X86::IMUL16m, TB_FOLDED_LOAD }, - { X86::IMUL32r, X86::IMUL32m, TB_FOLDED_LOAD }, - { X86::IMUL64r, X86::IMUL64m, TB_FOLDED_LOAD }, - { X86::IMUL8r, X86::IMUL8m, TB_FOLDED_LOAD }, - { X86::JMP32r, X86::JMP32m, TB_FOLDED_LOAD }, - { X86::JMP64r, X86::JMP64m, TB_FOLDED_LOAD }, - { X86::MOV16ri, X86::MOV16mi, TB_FOLDED_STORE }, - { X86::MOV16rr, X86::MOV16mr, TB_FOLDED_STORE }, - { X86::MOV32ri, X86::MOV32mi, TB_FOLDED_STORE }, - { X86::MOV32rr, X86::MOV32mr, TB_FOLDED_STORE }, - { X86::MOV64ri32, X86::MOV64mi32, TB_FOLDED_STORE }, - { X86::MOV64rr, X86::MOV64mr, TB_FOLDED_STORE }, - { X86::MOV8ri, X86::MOV8mi, TB_FOLDED_STORE }, - { X86::MOV8rr, X86::MOV8mr, TB_FOLDED_STORE }, - { X86::MOV8rr_NOREX, X86::MOV8mr_NOREX, TB_FOLDED_STORE }, - { X86::MOVAPDrr, X86::MOVAPDmr, TB_FOLDED_STORE | TB_ALIGN_16 }, - { X86::MOVAPSrr, X86::MOVAPSmr, TB_FOLDED_STORE | TB_ALIGN_16 }, - { X86::MOVDQArr, X86::MOVDQAmr, TB_FOLDED_STORE | TB_ALIGN_16 }, - { X86::MOVDQUrr, X86::MOVDQUmr, TB_FOLDED_STORE }, - { X86::MOVPDI2DIrr, X86::MOVPDI2DImr, TB_FOLDED_STORE }, - { X86::MOVPQIto64rr,X86::MOVPQI2QImr, TB_FOLDED_STORE }, - { X86::MOVSDto64rr, X86::MOVSDto64mr, TB_FOLDED_STORE }, - { X86::MOVSS2DIrr, X86::MOVSS2DImr, TB_FOLDED_STORE }, - { X86::MOVUPDrr, X86::MOVUPDmr, TB_FOLDED_STORE }, - { X86::MOVUPSrr, X86::MOVUPSmr, TB_FOLDED_STORE }, - { X86::MUL16r, X86::MUL16m, TB_FOLDED_LOAD }, - { X86::MUL32r, X86::MUL32m, TB_FOLDED_LOAD }, - { X86::MUL64r, X86::MUL64m, TB_FOLDED_LOAD }, - { X86::MUL8r, X86::MUL8m, TB_FOLDED_LOAD }, - { X86::PEXTRDrr, X86::PEXTRDmr, TB_FOLDED_STORE }, - { X86::PEXTRQrr, X86::PEXTRQmr, TB_FOLDED_STORE }, - { X86::PUSH16r, X86::PUSH16rmm, TB_FOLDED_LOAD }, - { X86::PUSH32r, X86::PUSH32rmm, TB_FOLDED_LOAD }, - { X86::PUSH64r, X86::PUSH64rmm, TB_FOLDED_LOAD }, - { X86::SETAEr, X86::SETAEm, TB_FOLDED_STORE }, - { X86::SETAr, X86::SETAm, TB_FOLDED_STORE }, - { X86::SETBEr, X86::SETBEm, TB_FOLDED_STORE }, - { 
X86::SETBr, X86::SETBm, TB_FOLDED_STORE }, - { X86::SETEr, X86::SETEm, TB_FOLDED_STORE }, - { X86::SETGEr, X86::SETGEm, TB_FOLDED_STORE }, - { X86::SETGr, X86::SETGm, TB_FOLDED_STORE }, - { X86::SETLEr, X86::SETLEm, TB_FOLDED_STORE }, - { X86::SETLr, X86::SETLm, TB_FOLDED_STORE }, - { X86::SETNEr, X86::SETNEm, TB_FOLDED_STORE }, - { X86::SETNOr, X86::SETNOm, TB_FOLDED_STORE }, - { X86::SETNPr, X86::SETNPm, TB_FOLDED_STORE }, - { X86::SETNSr, X86::SETNSm, TB_FOLDED_STORE }, - { X86::SETOr, X86::SETOm, TB_FOLDED_STORE }, - { X86::SETPr, X86::SETPm, TB_FOLDED_STORE }, - { X86::SETSr, X86::SETSm, TB_FOLDED_STORE }, - { X86::TAILJMPr, X86::TAILJMPm, TB_FOLDED_LOAD }, - { X86::TAILJMPr64, X86::TAILJMPm64, TB_FOLDED_LOAD }, - { X86::TAILJMPr64_REX, X86::TAILJMPm64_REX, TB_FOLDED_LOAD }, - { X86::TEST16ri, X86::TEST16mi, TB_FOLDED_LOAD }, - { X86::TEST32ri, X86::TEST32mi, TB_FOLDED_LOAD }, - { X86::TEST64ri32, X86::TEST64mi32, TB_FOLDED_LOAD }, - { X86::TEST8ri, X86::TEST8mi, TB_FOLDED_LOAD }, - - // AVX 128-bit versions of foldable instructions - { X86::VEXTRACTPSrr,X86::VEXTRACTPSmr, TB_FOLDED_STORE }, - { X86::VEXTRACTF128rr, X86::VEXTRACTF128mr, TB_FOLDED_STORE | TB_ALIGN_16 }, - { X86::VMOVAPDrr, X86::VMOVAPDmr, TB_FOLDED_STORE | TB_ALIGN_16 }, - { X86::VMOVAPSrr, X86::VMOVAPSmr, TB_FOLDED_STORE | TB_ALIGN_16 }, - { X86::VMOVDQArr, X86::VMOVDQAmr, TB_FOLDED_STORE | TB_ALIGN_16 }, - { X86::VMOVDQUrr, X86::VMOVDQUmr, TB_FOLDED_STORE }, - { X86::VMOVPDI2DIrr,X86::VMOVPDI2DImr, TB_FOLDED_STORE }, - { X86::VMOVPQIto64rr, X86::VMOVPQI2QImr,TB_FOLDED_STORE }, - { X86::VMOVSDto64rr,X86::VMOVSDto64mr, TB_FOLDED_STORE }, - { X86::VMOVSS2DIrr, X86::VMOVSS2DImr, TB_FOLDED_STORE }, - { X86::VMOVUPDrr, X86::VMOVUPDmr, TB_FOLDED_STORE }, - { X86::VMOVUPSrr, X86::VMOVUPSmr, TB_FOLDED_STORE }, - { X86::VPEXTRDrr, X86::VPEXTRDmr, TB_FOLDED_STORE }, - { X86::VPEXTRQrr, X86::VPEXTRQmr, TB_FOLDED_STORE }, - - // AVX 256-bit foldable instructions - { X86::VEXTRACTI128rr, X86::VEXTRACTI128mr, TB_FOLDED_STORE | TB_ALIGN_16 }, - { X86::VMOVAPDYrr, X86::VMOVAPDYmr, TB_FOLDED_STORE | TB_ALIGN_32 }, - { X86::VMOVAPSYrr, X86::VMOVAPSYmr, TB_FOLDED_STORE | TB_ALIGN_32 }, - { X86::VMOVDQAYrr, X86::VMOVDQAYmr, TB_FOLDED_STORE | TB_ALIGN_32 }, - { X86::VMOVDQUYrr, X86::VMOVDQUYmr, TB_FOLDED_STORE }, - { X86::VMOVUPDYrr, X86::VMOVUPDYmr, TB_FOLDED_STORE }, - { X86::VMOVUPSYrr, X86::VMOVUPSYmr, TB_FOLDED_STORE }, - - // AVX-512 foldable instructions - { X86::VEXTRACTF32x4Zrr,X86::VEXTRACTF32x4Zmr, TB_FOLDED_STORE }, - { X86::VEXTRACTF32x8Zrr,X86::VEXTRACTF32x8Zmr, TB_FOLDED_STORE }, - { X86::VEXTRACTF64x2Zrr,X86::VEXTRACTF64x2Zmr, TB_FOLDED_STORE }, - { X86::VEXTRACTF64x4Zrr,X86::VEXTRACTF64x4Zmr, TB_FOLDED_STORE }, - { X86::VEXTRACTI32x4Zrr,X86::VEXTRACTI32x4Zmr, TB_FOLDED_STORE }, - { X86::VEXTRACTI32x8Zrr,X86::VEXTRACTI32x8Zmr, TB_FOLDED_STORE }, - { X86::VEXTRACTI64x2Zrr,X86::VEXTRACTI64x2Zmr, TB_FOLDED_STORE }, - { X86::VEXTRACTI64x4Zrr,X86::VEXTRACTI64x4Zmr, TB_FOLDED_STORE }, - { X86::VEXTRACTPSZrr, X86::VEXTRACTPSZmr, TB_FOLDED_STORE }, - { X86::VMOVAPDZrr, X86::VMOVAPDZmr, TB_FOLDED_STORE | TB_ALIGN_64 }, - { X86::VMOVAPSZrr, X86::VMOVAPSZmr, TB_FOLDED_STORE | TB_ALIGN_64 }, - { X86::VMOVDQA32Zrr, X86::VMOVDQA32Zmr, TB_FOLDED_STORE | TB_ALIGN_64 }, - { X86::VMOVDQA64Zrr, X86::VMOVDQA64Zmr, TB_FOLDED_STORE | TB_ALIGN_64 }, - { X86::VMOVDQU8Zrr, X86::VMOVDQU8Zmr, TB_FOLDED_STORE }, - { X86::VMOVDQU16Zrr, X86::VMOVDQU16Zmr, TB_FOLDED_STORE }, - { X86::VMOVDQU32Zrr, X86::VMOVDQU32Zmr, TB_FOLDED_STORE }, - { 
X86::VMOVDQU64Zrr, X86::VMOVDQU64Zmr, TB_FOLDED_STORE }, - { X86::VMOVPDI2DIZrr, X86::VMOVPDI2DIZmr, TB_FOLDED_STORE }, - { X86::VMOVPQIto64Zrr, X86::VMOVPQI2QIZmr, TB_FOLDED_STORE }, - { X86::VMOVSDto64Zrr, X86::VMOVSDto64Zmr, TB_FOLDED_STORE }, - { X86::VMOVSS2DIZrr, X86::VMOVSS2DIZmr, TB_FOLDED_STORE }, - { X86::VMOVUPDZrr, X86::VMOVUPDZmr, TB_FOLDED_STORE }, - { X86::VMOVUPSZrr, X86::VMOVUPSZmr, TB_FOLDED_STORE }, - { X86::VPEXTRDZrr, X86::VPEXTRDZmr, TB_FOLDED_STORE }, - { X86::VPEXTRQZrr, X86::VPEXTRQZmr, TB_FOLDED_STORE }, - { X86::VPMOVDBZrr, X86::VPMOVDBZmr, TB_FOLDED_STORE }, - { X86::VPMOVDWZrr, X86::VPMOVDWZmr, TB_FOLDED_STORE }, - { X86::VPMOVQDZrr, X86::VPMOVQDZmr, TB_FOLDED_STORE }, - { X86::VPMOVQWZrr, X86::VPMOVQWZmr, TB_FOLDED_STORE }, - { X86::VPMOVWBZrr, X86::VPMOVWBZmr, TB_FOLDED_STORE }, - { X86::VPMOVSDBZrr, X86::VPMOVSDBZmr, TB_FOLDED_STORE }, - { X86::VPMOVSDWZrr, X86::VPMOVSDWZmr, TB_FOLDED_STORE }, - { X86::VPMOVSQDZrr, X86::VPMOVSQDZmr, TB_FOLDED_STORE }, - { X86::VPMOVSQWZrr, X86::VPMOVSQWZmr, TB_FOLDED_STORE }, - { X86::VPMOVSWBZrr, X86::VPMOVSWBZmr, TB_FOLDED_STORE }, - { X86::VPMOVUSDBZrr, X86::VPMOVUSDBZmr, TB_FOLDED_STORE }, - { X86::VPMOVUSDWZrr, X86::VPMOVUSDWZmr, TB_FOLDED_STORE }, - { X86::VPMOVUSQDZrr, X86::VPMOVUSQDZmr, TB_FOLDED_STORE }, - { X86::VPMOVUSQWZrr, X86::VPMOVUSQWZmr, TB_FOLDED_STORE }, - { X86::VPMOVUSWBZrr, X86::VPMOVUSWBZmr, TB_FOLDED_STORE }, - - // AVX-512 foldable instructions (256-bit versions) - { X86::VEXTRACTF32x4Z256rr,X86::VEXTRACTF32x4Z256mr, TB_FOLDED_STORE }, - { X86::VEXTRACTF64x2Z256rr,X86::VEXTRACTF64x2Z256mr, TB_FOLDED_STORE }, - { X86::VEXTRACTI32x4Z256rr,X86::VEXTRACTI32x4Z256mr, TB_FOLDED_STORE }, - { X86::VEXTRACTI64x2Z256rr,X86::VEXTRACTI64x2Z256mr, TB_FOLDED_STORE }, - { X86::VMOVAPDZ256rr, X86::VMOVAPDZ256mr, TB_FOLDED_STORE | TB_ALIGN_32 }, - { X86::VMOVAPSZ256rr, X86::VMOVAPSZ256mr, TB_FOLDED_STORE | TB_ALIGN_32 }, - { X86::VMOVDQA32Z256rr, X86::VMOVDQA32Z256mr, TB_FOLDED_STORE | TB_ALIGN_32 }, - { X86::VMOVDQA64Z256rr, X86::VMOVDQA64Z256mr, TB_FOLDED_STORE | TB_ALIGN_32 }, - { X86::VMOVUPDZ256rr, X86::VMOVUPDZ256mr, TB_FOLDED_STORE }, - { X86::VMOVUPSZ256rr, X86::VMOVUPSZ256mr, TB_FOLDED_STORE }, - { X86::VMOVDQU8Z256rr, X86::VMOVDQU8Z256mr, TB_FOLDED_STORE }, - { X86::VMOVDQU16Z256rr, X86::VMOVDQU16Z256mr, TB_FOLDED_STORE }, - { X86::VMOVDQU32Z256rr, X86::VMOVDQU32Z256mr, TB_FOLDED_STORE }, - { X86::VMOVDQU64Z256rr, X86::VMOVDQU64Z256mr, TB_FOLDED_STORE }, - { X86::VPMOVDWZ256rr, X86::VPMOVDWZ256mr, TB_FOLDED_STORE }, - { X86::VPMOVQDZ256rr, X86::VPMOVQDZ256mr, TB_FOLDED_STORE }, - { X86::VPMOVWBZ256rr, X86::VPMOVWBZ256mr, TB_FOLDED_STORE }, - { X86::VPMOVSDWZ256rr, X86::VPMOVSDWZ256mr, TB_FOLDED_STORE }, - { X86::VPMOVSQDZ256rr, X86::VPMOVSQDZ256mr, TB_FOLDED_STORE }, - { X86::VPMOVSWBZ256rr, X86::VPMOVSWBZ256mr, TB_FOLDED_STORE }, - { X86::VPMOVUSDWZ256rr, X86::VPMOVUSDWZ256mr, TB_FOLDED_STORE }, - { X86::VPMOVUSQDZ256rr, X86::VPMOVUSQDZ256mr, TB_FOLDED_STORE }, - { X86::VPMOVUSWBZ256rr, X86::VPMOVUSWBZ256mr, TB_FOLDED_STORE }, - - // AVX-512 foldable instructions (128-bit versions) - { X86::VMOVAPDZ128rr, X86::VMOVAPDZ128mr, TB_FOLDED_STORE | TB_ALIGN_16 }, - { X86::VMOVAPSZ128rr, X86::VMOVAPSZ128mr, TB_FOLDED_STORE | TB_ALIGN_16 }, - { X86::VMOVDQA32Z128rr, X86::VMOVDQA32Z128mr, TB_FOLDED_STORE | TB_ALIGN_16 }, - { X86::VMOVDQA64Z128rr, X86::VMOVDQA64Z128mr, TB_FOLDED_STORE | TB_ALIGN_16 }, - { X86::VMOVUPDZ128rr, X86::VMOVUPDZ128mr, TB_FOLDED_STORE }, - { X86::VMOVUPSZ128rr, X86::VMOVUPSZ128mr, 
TB_FOLDED_STORE }, - { X86::VMOVDQU8Z128rr, X86::VMOVDQU8Z128mr, TB_FOLDED_STORE }, - { X86::VMOVDQU16Z128rr, X86::VMOVDQU16Z128mr, TB_FOLDED_STORE }, - { X86::VMOVDQU32Z128rr, X86::VMOVDQU32Z128mr, TB_FOLDED_STORE }, - { X86::VMOVDQU64Z128rr, X86::VMOVDQU64Z128mr, TB_FOLDED_STORE }, - - // F16C foldable instructions - { X86::VCVTPS2PHrr, X86::VCVTPS2PHmr, TB_FOLDED_STORE }, - { X86::VCVTPS2PHYrr, X86::VCVTPS2PHYmr, TB_FOLDED_STORE } - }; - for (X86MemoryFoldTableEntry Entry : MemoryFoldTable0) { AddTableEntry(RegOp2MemOpTable0, MemOp2RegOpTable, Entry.RegOp, Entry.MemOp, TB_INDEX_0 | Entry.Flags); } - static const X86MemoryFoldTableEntry MemoryFoldTable1[] = { - { X86::BSF16rr, X86::BSF16rm, 0 }, - { X86::BSF32rr, X86::BSF32rm, 0 }, - { X86::BSF64rr, X86::BSF64rm, 0 }, - { X86::BSR16rr, X86::BSR16rm, 0 }, - { X86::BSR32rr, X86::BSR32rm, 0 }, - { X86::BSR64rr, X86::BSR64rm, 0 }, - { X86::CMP16rr, X86::CMP16rm, 0 }, - { X86::CMP32rr, X86::CMP32rm, 0 }, - { X86::CMP64rr, X86::CMP64rm, 0 }, - { X86::CMP8rr, X86::CMP8rm, 0 }, - { X86::CVTSD2SSrr, X86::CVTSD2SSrm, 0 }, - { X86::CVTSI2SD64rr, X86::CVTSI2SD64rm, 0 }, - { X86::CVTSI2SDrr, X86::CVTSI2SDrm, 0 }, - { X86::CVTSI2SS64rr, X86::CVTSI2SS64rm, 0 }, - { X86::CVTSI2SSrr, X86::CVTSI2SSrm, 0 }, - { X86::CVTSS2SDrr, X86::CVTSS2SDrm, 0 }, - { X86::CVTTSD2SI64rr, X86::CVTTSD2SI64rm, 0 }, - { X86::CVTTSD2SIrr, X86::CVTTSD2SIrm, 0 }, - { X86::CVTTSS2SI64rr, X86::CVTTSS2SI64rm, 0 }, - { X86::CVTTSS2SIrr, X86::CVTTSS2SIrm, 0 }, - { X86::IMUL16rri, X86::IMUL16rmi, 0 }, - { X86::IMUL16rri8, X86::IMUL16rmi8, 0 }, - { X86::IMUL32rri, X86::IMUL32rmi, 0 }, - { X86::IMUL32rri8, X86::IMUL32rmi8, 0 }, - { X86::IMUL64rri32, X86::IMUL64rmi32, 0 }, - { X86::IMUL64rri8, X86::IMUL64rmi8, 0 }, - { X86::Int_COMISDrr, X86::Int_COMISDrm, TB_NO_REVERSE }, - { X86::Int_COMISSrr, X86::Int_COMISSrm, TB_NO_REVERSE }, - { X86::CVTSD2SI64rr, X86::CVTSD2SI64rm, TB_NO_REVERSE }, - { X86::CVTSD2SIrr, X86::CVTSD2SIrm, TB_NO_REVERSE }, - { X86::CVTSS2SI64rr, X86::CVTSS2SI64rm, TB_NO_REVERSE }, - { X86::CVTSS2SIrr, X86::CVTSS2SIrm, TB_NO_REVERSE }, - { X86::CVTDQ2PDrr, X86::CVTDQ2PDrm, TB_NO_REVERSE }, - { X86::CVTDQ2PSrr, X86::CVTDQ2PSrm, TB_ALIGN_16 }, - { X86::CVTPD2DQrr, X86::CVTPD2DQrm, TB_ALIGN_16 }, - { X86::CVTPD2PSrr, X86::CVTPD2PSrm, TB_ALIGN_16 }, - { X86::CVTPS2DQrr, X86::CVTPS2DQrm, TB_ALIGN_16 }, - { X86::CVTPS2PDrr, X86::CVTPS2PDrm, TB_NO_REVERSE }, - { X86::CVTTPD2DQrr, X86::CVTTPD2DQrm, TB_ALIGN_16 }, - { X86::CVTTPS2DQrr, X86::CVTTPS2DQrm, TB_ALIGN_16 }, - { X86::Int_CVTTSD2SI64rr,X86::Int_CVTTSD2SI64rm, TB_NO_REVERSE }, - { X86::Int_CVTTSD2SIrr, X86::Int_CVTTSD2SIrm, TB_NO_REVERSE }, - { X86::Int_CVTTSS2SI64rr,X86::Int_CVTTSS2SI64rm, TB_NO_REVERSE }, - { X86::Int_CVTTSS2SIrr, X86::Int_CVTTSS2SIrm, TB_NO_REVERSE }, - { X86::Int_UCOMISDrr, X86::Int_UCOMISDrm, TB_NO_REVERSE }, - { X86::Int_UCOMISSrr, X86::Int_UCOMISSrm, TB_NO_REVERSE }, - { X86::MOV16rr, X86::MOV16rm, 0 }, - { X86::MOV32rr, X86::MOV32rm, 0 }, - { X86::MOV64rr, X86::MOV64rm, 0 }, - { X86::MOV64toPQIrr, X86::MOVQI2PQIrm, 0 }, - { X86::MOV64toSDrr, X86::MOV64toSDrm, 0 }, - { X86::MOV8rr, X86::MOV8rm, 0 }, - { X86::MOVAPDrr, X86::MOVAPDrm, TB_ALIGN_16 }, - { X86::MOVAPSrr, X86::MOVAPSrm, TB_ALIGN_16 }, - { X86::MOVDDUPrr, X86::MOVDDUPrm, TB_NO_REVERSE }, - { X86::MOVDI2PDIrr, X86::MOVDI2PDIrm, 0 }, - { X86::MOVDI2SSrr, X86::MOVDI2SSrm, 0 }, - { X86::MOVDQArr, X86::MOVDQArm, TB_ALIGN_16 }, - { X86::MOVDQUrr, X86::MOVDQUrm, 0 }, - { X86::MOVSHDUPrr, X86::MOVSHDUPrm, TB_ALIGN_16 }, - { X86::MOVSLDUPrr, 
X86::MOVSLDUPrm, TB_ALIGN_16 }, - { X86::MOVSX16rr8, X86::MOVSX16rm8, 0 }, - { X86::MOVSX32rr16, X86::MOVSX32rm16, 0 }, - { X86::MOVSX32rr8, X86::MOVSX32rm8, 0 }, - { X86::MOVSX64rr16, X86::MOVSX64rm16, 0 }, - { X86::MOVSX64rr32, X86::MOVSX64rm32, 0 }, - { X86::MOVSX64rr8, X86::MOVSX64rm8, 0 }, - { X86::MOVUPDrr, X86::MOVUPDrm, 0 }, - { X86::MOVUPSrr, X86::MOVUPSrm, 0 }, - { X86::MOVZPQILo2PQIrr, X86::MOVQI2PQIrm, TB_NO_REVERSE }, - { X86::MOVZX16rr8, X86::MOVZX16rm8, 0 }, - { X86::MOVZX32rr16, X86::MOVZX32rm16, 0 }, - { X86::MOVZX32_NOREXrr8, X86::MOVZX32_NOREXrm8, 0 }, - { X86::MOVZX32rr8, X86::MOVZX32rm8, 0 }, - { X86::PABSBrr, X86::PABSBrm, TB_ALIGN_16 }, - { X86::PABSDrr, X86::PABSDrm, TB_ALIGN_16 }, - { X86::PABSWrr, X86::PABSWrm, TB_ALIGN_16 }, - { X86::PCMPESTRIrr, X86::PCMPESTRIrm, TB_ALIGN_16 }, - { X86::PCMPESTRM128rr, X86::PCMPESTRM128rm, TB_ALIGN_16 }, - { X86::PCMPISTRIrr, X86::PCMPISTRIrm, TB_ALIGN_16 }, - { X86::PCMPISTRM128rr, X86::PCMPISTRM128rm, TB_ALIGN_16 }, - { X86::PHMINPOSUWrr128, X86::PHMINPOSUWrm128, TB_ALIGN_16 }, - { X86::PMOVSXBDrr, X86::PMOVSXBDrm, TB_NO_REVERSE }, - { X86::PMOVSXBQrr, X86::PMOVSXBQrm, TB_NO_REVERSE }, - { X86::PMOVSXBWrr, X86::PMOVSXBWrm, TB_NO_REVERSE }, - { X86::PMOVSXDQrr, X86::PMOVSXDQrm, TB_NO_REVERSE }, - { X86::PMOVSXWDrr, X86::PMOVSXWDrm, TB_NO_REVERSE }, - { X86::PMOVSXWQrr, X86::PMOVSXWQrm, TB_NO_REVERSE }, - { X86::PMOVZXBDrr, X86::PMOVZXBDrm, TB_NO_REVERSE }, - { X86::PMOVZXBQrr, X86::PMOVZXBQrm, TB_NO_REVERSE }, - { X86::PMOVZXBWrr, X86::PMOVZXBWrm, TB_NO_REVERSE }, - { X86::PMOVZXDQrr, X86::PMOVZXDQrm, TB_NO_REVERSE }, - { X86::PMOVZXWDrr, X86::PMOVZXWDrm, TB_NO_REVERSE }, - { X86::PMOVZXWQrr, X86::PMOVZXWQrm, TB_NO_REVERSE }, - { X86::PSHUFDri, X86::PSHUFDmi, TB_ALIGN_16 }, - { X86::PSHUFHWri, X86::PSHUFHWmi, TB_ALIGN_16 }, - { X86::PSHUFLWri, X86::PSHUFLWmi, TB_ALIGN_16 }, - { X86::PTESTrr, X86::PTESTrm, TB_ALIGN_16 }, - { X86::RCPPSr, X86::RCPPSm, TB_ALIGN_16 }, - { X86::RCPSSr, X86::RCPSSm, 0 }, - { X86::RCPSSr_Int, X86::RCPSSm_Int, TB_NO_REVERSE }, - { X86::ROUNDPDr, X86::ROUNDPDm, TB_ALIGN_16 }, - { X86::ROUNDPSr, X86::ROUNDPSm, TB_ALIGN_16 }, - { X86::ROUNDSDr, X86::ROUNDSDm, 0 }, - { X86::ROUNDSSr, X86::ROUNDSSm, 0 }, - { X86::RSQRTPSr, X86::RSQRTPSm, TB_ALIGN_16 }, - { X86::RSQRTSSr, X86::RSQRTSSm, 0 }, - { X86::RSQRTSSr_Int, X86::RSQRTSSm_Int, TB_NO_REVERSE }, - { X86::SQRTPDr, X86::SQRTPDm, TB_ALIGN_16 }, - { X86::SQRTPSr, X86::SQRTPSm, TB_ALIGN_16 }, - { X86::SQRTSDr, X86::SQRTSDm, 0 }, - { X86::SQRTSDr_Int, X86::SQRTSDm_Int, TB_NO_REVERSE }, - { X86::SQRTSSr, X86::SQRTSSm, 0 }, - { X86::SQRTSSr_Int, X86::SQRTSSm_Int, TB_NO_REVERSE }, - { X86::TEST16rr, X86::TEST16rm, 0 }, - { X86::TEST32rr, X86::TEST32rm, 0 }, - { X86::TEST64rr, X86::TEST64rm, 0 }, - { X86::TEST8rr, X86::TEST8rm, 0 }, - // FIXME: TEST*rr EAX,EAX ---> CMP [mem], 0 - { X86::UCOMISDrr, X86::UCOMISDrm, 0 }, - { X86::UCOMISSrr, X86::UCOMISSrm, 0 }, - - // MMX version of foldable instructions - { X86::MMX_CVTPD2PIirr, X86::MMX_CVTPD2PIirm, 0 }, - { X86::MMX_CVTPI2PDirr, X86::MMX_CVTPI2PDirm, 0 }, - { X86::MMX_CVTPS2PIirr, X86::MMX_CVTPS2PIirm, 0 }, - { X86::MMX_CVTTPD2PIirr, X86::MMX_CVTTPD2PIirm, 0 }, - { X86::MMX_CVTTPS2PIirr, X86::MMX_CVTTPS2PIirm, 0 }, - { X86::MMX_MOVD64to64rr, X86::MMX_MOVQ64rm, 0 }, - { X86::MMX_PABSBrr64, X86::MMX_PABSBrm64, 0 }, - { X86::MMX_PABSDrr64, X86::MMX_PABSDrm64, 0 }, - { X86::MMX_PABSWrr64, X86::MMX_PABSWrm64, 0 }, - { X86::MMX_PSHUFWri, X86::MMX_PSHUFWmi, 0 }, - - // 3DNow! 
version of foldable instructions - { X86::PF2IDrr, X86::PF2IDrm, 0 }, - { X86::PF2IWrr, X86::PF2IWrm, 0 }, - { X86::PFRCPrr, X86::PFRCPrm, 0 }, - { X86::PFRSQRTrr, X86::PFRSQRTrm, 0 }, - { X86::PI2FDrr, X86::PI2FDrm, 0 }, - { X86::PI2FWrr, X86::PI2FWrm, 0 }, - { X86::PSWAPDrr, X86::PSWAPDrm, 0 }, - - // AVX 128-bit versions of foldable instructions - { X86::Int_VCOMISDrr, X86::Int_VCOMISDrm, TB_NO_REVERSE }, - { X86::Int_VCOMISSrr, X86::Int_VCOMISSrm, TB_NO_REVERSE }, - { X86::Int_VUCOMISDrr, X86::Int_VUCOMISDrm, TB_NO_REVERSE }, - { X86::Int_VUCOMISSrr, X86::Int_VUCOMISSrm, TB_NO_REVERSE }, - { X86::VCVTTSD2SI64rr, X86::VCVTTSD2SI64rm, 0 }, - { X86::Int_VCVTTSD2SI64rr,X86::Int_VCVTTSD2SI64rm,TB_NO_REVERSE }, - { X86::VCVTTSD2SIrr, X86::VCVTTSD2SIrm, 0 }, - { X86::Int_VCVTTSD2SIrr,X86::Int_VCVTTSD2SIrm, TB_NO_REVERSE }, - { X86::VCVTTSS2SI64rr, X86::VCVTTSS2SI64rm, 0 }, - { X86::Int_VCVTTSS2SI64rr,X86::Int_VCVTTSS2SI64rm,TB_NO_REVERSE }, - { X86::VCVTTSS2SIrr, X86::VCVTTSS2SIrm, 0 }, - { X86::Int_VCVTTSS2SIrr,X86::Int_VCVTTSS2SIrm, TB_NO_REVERSE }, - { X86::VCVTSD2SI64rr, X86::VCVTSD2SI64rm, TB_NO_REVERSE }, - { X86::VCVTSD2SIrr, X86::VCVTSD2SIrm, TB_NO_REVERSE }, - { X86::VCVTSS2SI64rr, X86::VCVTSS2SI64rm, TB_NO_REVERSE }, - { X86::VCVTSS2SIrr, X86::VCVTSS2SIrm, TB_NO_REVERSE }, - { X86::VCVTDQ2PDrr, X86::VCVTDQ2PDrm, TB_NO_REVERSE }, - { X86::VCVTDQ2PSrr, X86::VCVTDQ2PSrm, 0 }, - { X86::VCVTPD2DQrr, X86::VCVTPD2DQrm, 0 }, - { X86::VCVTPD2PSrr, X86::VCVTPD2PSrm, 0 }, - { X86::VCVTPS2DQrr, X86::VCVTPS2DQrm, 0 }, - { X86::VCVTPS2PDrr, X86::VCVTPS2PDrm, TB_NO_REVERSE }, - { X86::VCVTTPD2DQrr, X86::VCVTTPD2DQrm, 0 }, - { X86::VCVTTPS2DQrr, X86::VCVTTPS2DQrm, 0 }, - { X86::VMOV64toPQIrr, X86::VMOVQI2PQIrm, 0 }, - { X86::VMOV64toSDrr, X86::VMOV64toSDrm, 0 }, - { X86::VMOVAPDrr, X86::VMOVAPDrm, TB_ALIGN_16 }, - { X86::VMOVAPSrr, X86::VMOVAPSrm, TB_ALIGN_16 }, - { X86::VMOVDDUPrr, X86::VMOVDDUPrm, TB_NO_REVERSE }, - { X86::VMOVDI2PDIrr, X86::VMOVDI2PDIrm, 0 }, - { X86::VMOVDI2SSrr, X86::VMOVDI2SSrm, 0 }, - { X86::VMOVDQArr, X86::VMOVDQArm, TB_ALIGN_16 }, - { X86::VMOVDQUrr, X86::VMOVDQUrm, 0 }, - { X86::VMOVSLDUPrr, X86::VMOVSLDUPrm, 0 }, - { X86::VMOVSHDUPrr, X86::VMOVSHDUPrm, 0 }, - { X86::VMOVUPDrr, X86::VMOVUPDrm, 0 }, - { X86::VMOVUPSrr, X86::VMOVUPSrm, 0 }, - { X86::VMOVZPQILo2PQIrr,X86::VMOVQI2PQIrm, TB_NO_REVERSE }, - { X86::VPABSBrr, X86::VPABSBrm, 0 }, - { X86::VPABSDrr, X86::VPABSDrm, 0 }, - { X86::VPABSWrr, X86::VPABSWrm, 0 }, - { X86::VPCMPESTRIrr, X86::VPCMPESTRIrm, 0 }, - { X86::VPCMPESTRM128rr, X86::VPCMPESTRM128rm, 0 }, - { X86::VPCMPISTRIrr, X86::VPCMPISTRIrm, 0 }, - { X86::VPCMPISTRM128rr, X86::VPCMPISTRM128rm, 0 }, - { X86::VPHMINPOSUWrr128, X86::VPHMINPOSUWrm128, 0 }, - { X86::VPERMILPDri, X86::VPERMILPDmi, 0 }, - { X86::VPERMILPSri, X86::VPERMILPSmi, 0 }, - { X86::VPMOVSXBDrr, X86::VPMOVSXBDrm, TB_NO_REVERSE }, - { X86::VPMOVSXBQrr, X86::VPMOVSXBQrm, TB_NO_REVERSE }, - { X86::VPMOVSXBWrr, X86::VPMOVSXBWrm, TB_NO_REVERSE }, - { X86::VPMOVSXDQrr, X86::VPMOVSXDQrm, TB_NO_REVERSE }, - { X86::VPMOVSXWDrr, X86::VPMOVSXWDrm, TB_NO_REVERSE }, - { X86::VPMOVSXWQrr, X86::VPMOVSXWQrm, TB_NO_REVERSE }, - { X86::VPMOVZXBDrr, X86::VPMOVZXBDrm, TB_NO_REVERSE }, - { X86::VPMOVZXBQrr, X86::VPMOVZXBQrm, TB_NO_REVERSE }, - { X86::VPMOVZXBWrr, X86::VPMOVZXBWrm, TB_NO_REVERSE }, - { X86::VPMOVZXDQrr, X86::VPMOVZXDQrm, TB_NO_REVERSE }, - { X86::VPMOVZXWDrr, X86::VPMOVZXWDrm, TB_NO_REVERSE }, - { X86::VPMOVZXWQrr, X86::VPMOVZXWQrm, TB_NO_REVERSE }, - { X86::VPSHUFDri, X86::VPSHUFDmi, 0 }, - { 
X86::VPSHUFHWri, X86::VPSHUFHWmi, 0 }, - { X86::VPSHUFLWri, X86::VPSHUFLWmi, 0 }, - { X86::VPTESTrr, X86::VPTESTrm, 0 }, - { X86::VRCPPSr, X86::VRCPPSm, 0 }, - { X86::VROUNDPDr, X86::VROUNDPDm, 0 }, - { X86::VROUNDPSr, X86::VROUNDPSm, 0 }, - { X86::VRSQRTPSr, X86::VRSQRTPSm, 0 }, - { X86::VSQRTPDr, X86::VSQRTPDm, 0 }, - { X86::VSQRTPSr, X86::VSQRTPSm, 0 }, - { X86::VTESTPDrr, X86::VTESTPDrm, 0 }, - { X86::VTESTPSrr, X86::VTESTPSrm, 0 }, - { X86::VUCOMISDrr, X86::VUCOMISDrm, 0 }, - { X86::VUCOMISSrr, X86::VUCOMISSrm, 0 }, - - // AVX 256-bit foldable instructions - { X86::VCVTDQ2PDYrr, X86::VCVTDQ2PDYrm, TB_NO_REVERSE }, - { X86::VCVTDQ2PSYrr, X86::VCVTDQ2PSYrm, 0 }, - { X86::VCVTPD2DQYrr, X86::VCVTPD2DQYrm, 0 }, - { X86::VCVTPD2PSYrr, X86::VCVTPD2PSYrm, 0 }, - { X86::VCVTPS2DQYrr, X86::VCVTPS2DQYrm, 0 }, - { X86::VCVTPS2PDYrr, X86::VCVTPS2PDYrm, TB_NO_REVERSE }, - { X86::VCVTTPD2DQYrr, X86::VCVTTPD2DQYrm, 0 }, - { X86::VCVTTPS2DQYrr, X86::VCVTTPS2DQYrm, 0 }, - { X86::VMOVAPDYrr, X86::VMOVAPDYrm, TB_ALIGN_32 }, - { X86::VMOVAPSYrr, X86::VMOVAPSYrm, TB_ALIGN_32 }, - { X86::VMOVDDUPYrr, X86::VMOVDDUPYrm, 0 }, - { X86::VMOVDQAYrr, X86::VMOVDQAYrm, TB_ALIGN_32 }, - { X86::VMOVDQUYrr, X86::VMOVDQUYrm, 0 }, - { X86::VMOVSLDUPYrr, X86::VMOVSLDUPYrm, 0 }, - { X86::VMOVSHDUPYrr, X86::VMOVSHDUPYrm, 0 }, - { X86::VMOVUPDYrr, X86::VMOVUPDYrm, 0 }, - { X86::VMOVUPSYrr, X86::VMOVUPSYrm, 0 }, - { X86::VPERMILPDYri, X86::VPERMILPDYmi, 0 }, - { X86::VPERMILPSYri, X86::VPERMILPSYmi, 0 }, - { X86::VPTESTYrr, X86::VPTESTYrm, 0 }, - { X86::VRCPPSYr, X86::VRCPPSYm, 0 }, - { X86::VROUNDYPDr, X86::VROUNDYPDm, 0 }, - { X86::VROUNDYPSr, X86::VROUNDYPSm, 0 }, - { X86::VRSQRTPSYr, X86::VRSQRTPSYm, 0 }, - { X86::VSQRTPDYr, X86::VSQRTPDYm, 0 }, - { X86::VSQRTPSYr, X86::VSQRTPSYm, 0 }, - { X86::VTESTPDYrr, X86::VTESTPDYrm, 0 }, - { X86::VTESTPSYrr, X86::VTESTPSYrm, 0 }, - - // AVX2 foldable instructions - - // VBROADCASTS{SD}rr register instructions were an AVX2 addition while the - // VBROADCASTS{SD}rm memory instructions were available from AVX1. - // TB_NO_REVERSE prevents unfolding from introducing an illegal instruction - // on AVX1 targets. The VPBROADCAST instructions are all AVX2 instructions - // so they don't need an equivalent limitation. 
- { X86::VBROADCASTSSrr, X86::VBROADCASTSSrm, TB_NO_REVERSE }, - { X86::VBROADCASTSSYrr, X86::VBROADCASTSSYrm, TB_NO_REVERSE }, - { X86::VBROADCASTSDYrr, X86::VBROADCASTSDYrm, TB_NO_REVERSE }, - { X86::VPABSBYrr, X86::VPABSBYrm, 0 }, - { X86::VPABSDYrr, X86::VPABSDYrm, 0 }, - { X86::VPABSWYrr, X86::VPABSWYrm, 0 }, - { X86::VPBROADCASTBrr, X86::VPBROADCASTBrm, TB_NO_REVERSE }, - { X86::VPBROADCASTBYrr, X86::VPBROADCASTBYrm, TB_NO_REVERSE }, - { X86::VPBROADCASTDrr, X86::VPBROADCASTDrm, TB_NO_REVERSE }, - { X86::VPBROADCASTDYrr, X86::VPBROADCASTDYrm, TB_NO_REVERSE }, - { X86::VPBROADCASTQrr, X86::VPBROADCASTQrm, TB_NO_REVERSE }, - { X86::VPBROADCASTQYrr, X86::VPBROADCASTQYrm, TB_NO_REVERSE }, - { X86::VPBROADCASTWrr, X86::VPBROADCASTWrm, TB_NO_REVERSE }, - { X86::VPBROADCASTWYrr, X86::VPBROADCASTWYrm, TB_NO_REVERSE }, - { X86::VPERMPDYri, X86::VPERMPDYmi, 0 }, - { X86::VPERMQYri, X86::VPERMQYmi, 0 }, - { X86::VPMOVSXBDYrr, X86::VPMOVSXBDYrm, TB_NO_REVERSE }, - { X86::VPMOVSXBQYrr, X86::VPMOVSXBQYrm, TB_NO_REVERSE }, - { X86::VPMOVSXBWYrr, X86::VPMOVSXBWYrm, 0 }, - { X86::VPMOVSXDQYrr, X86::VPMOVSXDQYrm, 0 }, - { X86::VPMOVSXWDYrr, X86::VPMOVSXWDYrm, 0 }, - { X86::VPMOVSXWQYrr, X86::VPMOVSXWQYrm, TB_NO_REVERSE }, - { X86::VPMOVZXBDYrr, X86::VPMOVZXBDYrm, TB_NO_REVERSE }, - { X86::VPMOVZXBQYrr, X86::VPMOVZXBQYrm, TB_NO_REVERSE }, - { X86::VPMOVZXBWYrr, X86::VPMOVZXBWYrm, 0 }, - { X86::VPMOVZXDQYrr, X86::VPMOVZXDQYrm, 0 }, - { X86::VPMOVZXWDYrr, X86::VPMOVZXWDYrm, 0 }, - { X86::VPMOVZXWQYrr, X86::VPMOVZXWQYrm, TB_NO_REVERSE }, - { X86::VPSHUFDYri, X86::VPSHUFDYmi, 0 }, - { X86::VPSHUFHWYri, X86::VPSHUFHWYmi, 0 }, - { X86::VPSHUFLWYri, X86::VPSHUFLWYmi, 0 }, - - // XOP foldable instructions - { X86::VFRCZPDrr, X86::VFRCZPDrm, 0 }, - { X86::VFRCZPDrrY, X86::VFRCZPDrmY, 0 }, - { X86::VFRCZPSrr, X86::VFRCZPSrm, 0 }, - { X86::VFRCZPSrrY, X86::VFRCZPSrmY, 0 }, - { X86::VFRCZSDrr, X86::VFRCZSDrm, 0 }, - { X86::VFRCZSSrr, X86::VFRCZSSrm, 0 }, - { X86::VPHADDBDrr, X86::VPHADDBDrm, 0 }, - { X86::VPHADDBQrr, X86::VPHADDBQrm, 0 }, - { X86::VPHADDBWrr, X86::VPHADDBWrm, 0 }, - { X86::VPHADDDQrr, X86::VPHADDDQrm, 0 }, - { X86::VPHADDWDrr, X86::VPHADDWDrm, 0 }, - { X86::VPHADDWQrr, X86::VPHADDWQrm, 0 }, - { X86::VPHADDUBDrr, X86::VPHADDUBDrm, 0 }, - { X86::VPHADDUBQrr, X86::VPHADDUBQrm, 0 }, - { X86::VPHADDUBWrr, X86::VPHADDUBWrm, 0 }, - { X86::VPHADDUDQrr, X86::VPHADDUDQrm, 0 }, - { X86::VPHADDUWDrr, X86::VPHADDUWDrm, 0 }, - { X86::VPHADDUWQrr, X86::VPHADDUWQrm, 0 }, - { X86::VPHSUBBWrr, X86::VPHSUBBWrm, 0 }, - { X86::VPHSUBDQrr, X86::VPHSUBDQrm, 0 }, - { X86::VPHSUBWDrr, X86::VPHSUBWDrm, 0 }, - { X86::VPROTBri, X86::VPROTBmi, 0 }, - { X86::VPROTBrr, X86::VPROTBmr, 0 }, - { X86::VPROTDri, X86::VPROTDmi, 0 }, - { X86::VPROTDrr, X86::VPROTDmr, 0 }, - { X86::VPROTQri, X86::VPROTQmi, 0 }, - { X86::VPROTQrr, X86::VPROTQmr, 0 }, - { X86::VPROTWri, X86::VPROTWmi, 0 }, - { X86::VPROTWrr, X86::VPROTWmr, 0 }, - { X86::VPSHABrr, X86::VPSHABmr, 0 }, - { X86::VPSHADrr, X86::VPSHADmr, 0 }, - { X86::VPSHAQrr, X86::VPSHAQmr, 0 }, - { X86::VPSHAWrr, X86::VPSHAWmr, 0 }, - { X86::VPSHLBrr, X86::VPSHLBmr, 0 }, - { X86::VPSHLDrr, X86::VPSHLDmr, 0 }, - { X86::VPSHLQrr, X86::VPSHLQmr, 0 }, - { X86::VPSHLWrr, X86::VPSHLWmr, 0 }, - - // LWP foldable instructions - { X86::LWPINS32rri, X86::LWPINS32rmi, 0 }, - { X86::LWPINS64rri, X86::LWPINS64rmi, 0 }, - { X86::LWPVAL32rri, X86::LWPVAL32rmi, 0 }, - { X86::LWPVAL64rri, X86::LWPVAL64rmi, 0 }, - - // BMI/BMI2/LZCNT/POPCNT/TBM foldable instructions - { X86::BEXTR32rr, X86::BEXTR32rm, 0 }, 
- { X86::BEXTR64rr, X86::BEXTR64rm, 0 }, - { X86::BEXTRI32ri, X86::BEXTRI32mi, 0 }, - { X86::BEXTRI64ri, X86::BEXTRI64mi, 0 }, - { X86::BLCFILL32rr, X86::BLCFILL32rm, 0 }, - { X86::BLCFILL64rr, X86::BLCFILL64rm, 0 }, - { X86::BLCI32rr, X86::BLCI32rm, 0 }, - { X86::BLCI64rr, X86::BLCI64rm, 0 }, - { X86::BLCIC32rr, X86::BLCIC32rm, 0 }, - { X86::BLCIC64rr, X86::BLCIC64rm, 0 }, - { X86::BLCMSK32rr, X86::BLCMSK32rm, 0 }, - { X86::BLCMSK64rr, X86::BLCMSK64rm, 0 }, - { X86::BLCS32rr, X86::BLCS32rm, 0 }, - { X86::BLCS64rr, X86::BLCS64rm, 0 }, - { X86::BLSFILL32rr, X86::BLSFILL32rm, 0 }, - { X86::BLSFILL64rr, X86::BLSFILL64rm, 0 }, - { X86::BLSI32rr, X86::BLSI32rm, 0 }, - { X86::BLSI64rr, X86::BLSI64rm, 0 }, - { X86::BLSIC32rr, X86::BLSIC32rm, 0 }, - { X86::BLSIC64rr, X86::BLSIC64rm, 0 }, - { X86::BLSMSK32rr, X86::BLSMSK32rm, 0 }, - { X86::BLSMSK64rr, X86::BLSMSK64rm, 0 }, - { X86::BLSR32rr, X86::BLSR32rm, 0 }, - { X86::BLSR64rr, X86::BLSR64rm, 0 }, - { X86::BZHI32rr, X86::BZHI32rm, 0 }, - { X86::BZHI64rr, X86::BZHI64rm, 0 }, - { X86::LZCNT16rr, X86::LZCNT16rm, 0 }, - { X86::LZCNT32rr, X86::LZCNT32rm, 0 }, - { X86::LZCNT64rr, X86::LZCNT64rm, 0 }, - { X86::POPCNT16rr, X86::POPCNT16rm, 0 }, - { X86::POPCNT32rr, X86::POPCNT32rm, 0 }, - { X86::POPCNT64rr, X86::POPCNT64rm, 0 }, - { X86::RORX32ri, X86::RORX32mi, 0 }, - { X86::RORX64ri, X86::RORX64mi, 0 }, - { X86::SARX32rr, X86::SARX32rm, 0 }, - { X86::SARX64rr, X86::SARX64rm, 0 }, - { X86::SHRX32rr, X86::SHRX32rm, 0 }, - { X86::SHRX64rr, X86::SHRX64rm, 0 }, - { X86::SHLX32rr, X86::SHLX32rm, 0 }, - { X86::SHLX64rr, X86::SHLX64rm, 0 }, - { X86::T1MSKC32rr, X86::T1MSKC32rm, 0 }, - { X86::T1MSKC64rr, X86::T1MSKC64rm, 0 }, - { X86::TZCNT16rr, X86::TZCNT16rm, 0 }, - { X86::TZCNT32rr, X86::TZCNT32rm, 0 }, - { X86::TZCNT64rr, X86::TZCNT64rm, 0 }, - { X86::TZMSK32rr, X86::TZMSK32rm, 0 }, - { X86::TZMSK64rr, X86::TZMSK64rm, 0 }, - - // AVX-512 foldable instructions - { X86::VBROADCASTSSZr, X86::VBROADCASTSSZm, TB_NO_REVERSE }, - { X86::VBROADCASTSDZr, X86::VBROADCASTSDZm, TB_NO_REVERSE }, - { X86::VMOV64toPQIZrr, X86::VMOVQI2PQIZrm, 0 }, - { X86::VMOV64toSDZrr, X86::VMOV64toSDZrm, 0 }, - { X86::VMOVDI2PDIZrr, X86::VMOVDI2PDIZrm, 0 }, - { X86::VMOVDI2SSZrr, X86::VMOVDI2SSZrm, 0 }, - { X86::VMOVAPDZrr, X86::VMOVAPDZrm, TB_ALIGN_64 }, - { X86::VMOVAPSZrr, X86::VMOVAPSZrm, TB_ALIGN_64 }, - { X86::VMOVDQA32Zrr, X86::VMOVDQA32Zrm, TB_ALIGN_64 }, - { X86::VMOVDQA64Zrr, X86::VMOVDQA64Zrm, TB_ALIGN_64 }, - { X86::VMOVDQU8Zrr, X86::VMOVDQU8Zrm, 0 }, - { X86::VMOVDQU16Zrr, X86::VMOVDQU16Zrm, 0 }, - { X86::VMOVDQU32Zrr, X86::VMOVDQU32Zrm, 0 }, - { X86::VMOVDQU64Zrr, X86::VMOVDQU64Zrm, 0 }, - { X86::VMOVUPDZrr, X86::VMOVUPDZrm, 0 }, - { X86::VMOVUPSZrr, X86::VMOVUPSZrm, 0 }, - { X86::VMOVZPQILo2PQIZrr,X86::VMOVQI2PQIZrm, TB_NO_REVERSE }, - { X86::VPABSBZrr, X86::VPABSBZrm, 0 }, - { X86::VPABSDZrr, X86::VPABSDZrm, 0 }, - { X86::VPABSQZrr, X86::VPABSQZrm, 0 }, - { X86::VPABSWZrr, X86::VPABSWZrm, 0 }, - { X86::VPERMILPDZri, X86::VPERMILPDZmi, 0 }, - { X86::VPERMILPSZri, X86::VPERMILPSZmi, 0 }, - { X86::VPERMPDZri, X86::VPERMPDZmi, 0 }, - { X86::VPERMQZri, X86::VPERMQZmi, 0 }, - { X86::VPMOVSXBDZrr, X86::VPMOVSXBDZrm, 0 }, - { X86::VPMOVSXBQZrr, X86::VPMOVSXBQZrm, TB_NO_REVERSE }, - { X86::VPMOVSXBWZrr, X86::VPMOVSXBWZrm, 0 }, - { X86::VPMOVSXDQZrr, X86::VPMOVSXDQZrm, 0 }, - { X86::VPMOVSXWDZrr, X86::VPMOVSXWDZrm, 0 }, - { X86::VPMOVSXWQZrr, X86::VPMOVSXWQZrm, 0 }, - { X86::VPMOVZXBDZrr, X86::VPMOVZXBDZrm, 0 }, - { X86::VPMOVZXBQZrr, X86::VPMOVZXBQZrm, TB_NO_REVERSE }, - { 
X86::VPMOVZXBWZrr, X86::VPMOVZXBWZrm, 0 }, - { X86::VPMOVZXDQZrr, X86::VPMOVZXDQZrm, 0 }, - { X86::VPMOVZXWDZrr, X86::VPMOVZXWDZrm, 0 }, - { X86::VPMOVZXWQZrr, X86::VPMOVZXWQZrm, 0 }, - { X86::VPSHUFDZri, X86::VPSHUFDZmi, 0 }, - { X86::VPSHUFHWZri, X86::VPSHUFHWZmi, 0 }, - { X86::VPSHUFLWZri, X86::VPSHUFLWZmi, 0 }, - { X86::VPSLLDQZ512rr, X86::VPSLLDQZ512rm, 0 }, - { X86::VPSLLDZri, X86::VPSLLDZmi, 0 }, - { X86::VPSLLQZri, X86::VPSLLQZmi, 0 }, - { X86::VPSLLWZri, X86::VPSLLWZmi, 0 }, - { X86::VPSRADZri, X86::VPSRADZmi, 0 }, - { X86::VPSRAQZri, X86::VPSRAQZmi, 0 }, - { X86::VPSRAWZri, X86::VPSRAWZmi, 0 }, - { X86::VPSRLDQZ512rr, X86::VPSRLDQZ512rm, 0 }, - { X86::VPSRLDZri, X86::VPSRLDZmi, 0 }, - { X86::VPSRLQZri, X86::VPSRLQZmi, 0 }, - { X86::VPSRLWZri, X86::VPSRLWZmi, 0 }, - - // AVX-512 foldable instructions (256-bit versions) - { X86::VBROADCASTSSZ256r, X86::VBROADCASTSSZ256m, TB_NO_REVERSE }, - { X86::VBROADCASTSDZ256r, X86::VBROADCASTSDZ256m, TB_NO_REVERSE }, - { X86::VMOVAPDZ256rr, X86::VMOVAPDZ256rm, TB_ALIGN_32 }, - { X86::VMOVAPSZ256rr, X86::VMOVAPSZ256rm, TB_ALIGN_32 }, - { X86::VMOVDQA32Z256rr, X86::VMOVDQA32Z256rm, TB_ALIGN_32 }, - { X86::VMOVDQA64Z256rr, X86::VMOVDQA64Z256rm, TB_ALIGN_32 }, - { X86::VMOVDQU8Z256rr, X86::VMOVDQU8Z256rm, 0 }, - { X86::VMOVDQU16Z256rr, X86::VMOVDQU16Z256rm, 0 }, - { X86::VMOVDQU32Z256rr, X86::VMOVDQU32Z256rm, 0 }, - { X86::VMOVDQU64Z256rr, X86::VMOVDQU64Z256rm, 0 }, - { X86::VMOVUPDZ256rr, X86::VMOVUPDZ256rm, 0 }, - { X86::VMOVUPSZ256rr, X86::VMOVUPSZ256rm, 0 }, - { X86::VPABSBZ256rr, X86::VPABSBZ256rm, 0 }, - { X86::VPABSDZ256rr, X86::VPABSDZ256rm, 0 }, - { X86::VPABSQZ256rr, X86::VPABSQZ256rm, 0 }, - { X86::VPABSWZ256rr, X86::VPABSWZ256rm, 0 }, - { X86::VPERMILPDZ256ri, X86::VPERMILPDZ256mi, 0 }, - { X86::VPERMILPSZ256ri, X86::VPERMILPSZ256mi, 0 }, - { X86::VPERMPDZ256ri, X86::VPERMPDZ256mi, 0 }, - { X86::VPERMQZ256ri, X86::VPERMQZ256mi, 0 }, - { X86::VPMOVSXBDZ256rr, X86::VPMOVSXBDZ256rm, TB_NO_REVERSE }, - { X86::VPMOVSXBQZ256rr, X86::VPMOVSXBQZ256rm, TB_NO_REVERSE }, - { X86::VPMOVSXBWZ256rr, X86::VPMOVSXBWZ256rm, 0 }, - { X86::VPMOVSXDQZ256rr, X86::VPMOVSXDQZ256rm, 0 }, - { X86::VPMOVSXWDZ256rr, X86::VPMOVSXWDZ256rm, 0 }, - { X86::VPMOVSXWQZ256rr, X86::VPMOVSXWQZ256rm, TB_NO_REVERSE }, - { X86::VPMOVZXBDZ256rr, X86::VPMOVZXBDZ256rm, TB_NO_REVERSE }, - { X86::VPMOVZXBQZ256rr, X86::VPMOVZXBQZ256rm, TB_NO_REVERSE }, - { X86::VPMOVZXBWZ256rr, X86::VPMOVZXBWZ256rm, 0 }, - { X86::VPMOVZXDQZ256rr, X86::VPMOVZXDQZ256rm, 0 }, - { X86::VPMOVZXWDZ256rr, X86::VPMOVZXWDZ256rm, 0 }, - { X86::VPMOVZXWQZ256rr, X86::VPMOVZXWQZ256rm, TB_NO_REVERSE }, - { X86::VPSHUFDZ256ri, X86::VPSHUFDZ256mi, 0 }, - { X86::VPSHUFHWZ256ri, X86::VPSHUFHWZ256mi, 0 }, - { X86::VPSHUFLWZ256ri, X86::VPSHUFLWZ256mi, 0 }, - { X86::VPSLLDQZ256rr, X86::VPSLLDQZ256rm, 0 }, - { X86::VPSLLDZ256ri, X86::VPSLLDZ256mi, 0 }, - { X86::VPSLLQZ256ri, X86::VPSLLQZ256mi, 0 }, - { X86::VPSLLWZ256ri, X86::VPSLLWZ256mi, 0 }, - { X86::VPSRADZ256ri, X86::VPSRADZ256mi, 0 }, - { X86::VPSRAQZ256ri, X86::VPSRAQZ256mi, 0 }, - { X86::VPSRAWZ256ri, X86::VPSRAWZ256mi, 0 }, - { X86::VPSRLDQZ256rr, X86::VPSRLDQZ256rm, 0 }, - { X86::VPSRLDZ256ri, X86::VPSRLDZ256mi, 0 }, - { X86::VPSRLQZ256ri, X86::VPSRLQZ256mi, 0 }, - { X86::VPSRLWZ256ri, X86::VPSRLWZ256mi, 0 }, - - // AVX-512 foldable instructions (128-bit versions) - { X86::VBROADCASTSSZ128r, X86::VBROADCASTSSZ128m, TB_NO_REVERSE }, - { X86::VMOVAPDZ128rr, X86::VMOVAPDZ128rm, TB_ALIGN_16 }, - { X86::VMOVAPSZ128rr, X86::VMOVAPSZ128rm, TB_ALIGN_16 }, - { 
X86::VMOVDQA32Z128rr, X86::VMOVDQA32Z128rm, TB_ALIGN_16 }, - { X86::VMOVDQA64Z128rr, X86::VMOVDQA64Z128rm, TB_ALIGN_16 }, - { X86::VMOVDQU8Z128rr, X86::VMOVDQU8Z128rm, 0 }, - { X86::VMOVDQU16Z128rr, X86::VMOVDQU16Z128rm, 0 }, - { X86::VMOVDQU32Z128rr, X86::VMOVDQU32Z128rm, 0 }, - { X86::VMOVDQU64Z128rr, X86::VMOVDQU64Z128rm, 0 }, - { X86::VMOVUPDZ128rr, X86::VMOVUPDZ128rm, 0 }, - { X86::VMOVUPSZ128rr, X86::VMOVUPSZ128rm, 0 }, - { X86::VPABSBZ128rr, X86::VPABSBZ128rm, 0 }, - { X86::VPABSDZ128rr, X86::VPABSDZ128rm, 0 }, - { X86::VPABSQZ128rr, X86::VPABSQZ128rm, 0 }, - { X86::VPABSWZ128rr, X86::VPABSWZ128rm, 0 }, - { X86::VPERMILPDZ128ri, X86::VPERMILPDZ128mi, 0 }, - { X86::VPERMILPSZ128ri, X86::VPERMILPSZ128mi, 0 }, - { X86::VPMOVSXBDZ128rr, X86::VPMOVSXBDZ128rm, TB_NO_REVERSE }, - { X86::VPMOVSXBQZ128rr, X86::VPMOVSXBQZ128rm, TB_NO_REVERSE }, - { X86::VPMOVSXBWZ128rr, X86::VPMOVSXBWZ128rm, TB_NO_REVERSE }, - { X86::VPMOVSXDQZ128rr, X86::VPMOVSXDQZ128rm, TB_NO_REVERSE }, - { X86::VPMOVSXWDZ128rr, X86::VPMOVSXWDZ128rm, TB_NO_REVERSE }, - { X86::VPMOVSXWQZ128rr, X86::VPMOVSXWQZ128rm, TB_NO_REVERSE }, - { X86::VPMOVZXBDZ128rr, X86::VPMOVZXBDZ128rm, TB_NO_REVERSE }, - { X86::VPMOVZXBQZ128rr, X86::VPMOVZXBQZ128rm, TB_NO_REVERSE }, - { X86::VPMOVZXBWZ128rr, X86::VPMOVZXBWZ128rm, TB_NO_REVERSE }, - { X86::VPMOVZXDQZ128rr, X86::VPMOVZXDQZ128rm, TB_NO_REVERSE }, - { X86::VPMOVZXWDZ128rr, X86::VPMOVZXWDZ128rm, TB_NO_REVERSE }, - { X86::VPMOVZXWQZ128rr, X86::VPMOVZXWQZ128rm, TB_NO_REVERSE }, - { X86::VPSHUFDZ128ri, X86::VPSHUFDZ128mi, 0 }, - { X86::VPSHUFHWZ128ri, X86::VPSHUFHWZ128mi, 0 }, - { X86::VPSHUFLWZ128ri, X86::VPSHUFLWZ128mi, 0 }, - { X86::VPSLLDQZ128rr, X86::VPSLLDQZ128rm, 0 }, - { X86::VPSLLDZ128ri, X86::VPSLLDZ128mi, 0 }, - { X86::VPSLLQZ128ri, X86::VPSLLQZ128mi, 0 }, - { X86::VPSLLWZ128ri, X86::VPSLLWZ128mi, 0 }, - { X86::VPSRADZ128ri, X86::VPSRADZ128mi, 0 }, - { X86::VPSRAQZ128ri, X86::VPSRAQZ128mi, 0 }, - { X86::VPSRAWZ128ri, X86::VPSRAWZ128mi, 0 }, - { X86::VPSRLDQZ128rr, X86::VPSRLDQZ128rm, 0 }, - { X86::VPSRLDZ128ri, X86::VPSRLDZ128mi, 0 }, - { X86::VPSRLQZ128ri, X86::VPSRLQZ128mi, 0 }, - { X86::VPSRLWZ128ri, X86::VPSRLWZ128mi, 0 }, - - // F16C foldable instructions - { X86::VCVTPH2PSrr, X86::VCVTPH2PSrm, 0 }, - { X86::VCVTPH2PSYrr, X86::VCVTPH2PSYrm, 0 }, - - // AES foldable instructions - { X86::AESIMCrr, X86::AESIMCrm, TB_ALIGN_16 }, - { X86::AESKEYGENASSIST128rr, X86::AESKEYGENASSIST128rm, TB_ALIGN_16 }, - { X86::VAESIMCrr, X86::VAESIMCrm, 0 }, - { X86::VAESKEYGENASSIST128rr, X86::VAESKEYGENASSIST128rm, 0 } - }; - for (X86MemoryFoldTableEntry Entry : MemoryFoldTable1) { AddTableEntry(RegOp2MemOpTable1, MemOp2RegOpTable, Entry.RegOp, Entry.MemOp, @@ -1040,1394 +143,6 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) Entry.Flags | TB_INDEX_1 | TB_FOLDED_LOAD); } - static const X86MemoryFoldTableEntry MemoryFoldTable2[] = { - { X86::ADC32rr, X86::ADC32rm, 0 }, - { X86::ADC64rr, X86::ADC64rm, 0 }, - { X86::ADD16rr, X86::ADD16rm, 0 }, - { X86::ADD16rr_DB, X86::ADD16rm, TB_NO_REVERSE }, - { X86::ADD32rr, X86::ADD32rm, 0 }, - { X86::ADD32rr_DB, X86::ADD32rm, TB_NO_REVERSE }, - { X86::ADD64rr, X86::ADD64rm, 0 }, - { X86::ADD64rr_DB, X86::ADD64rm, TB_NO_REVERSE }, - { X86::ADD8rr, X86::ADD8rm, 0 }, - { X86::ADDPDrr, X86::ADDPDrm, TB_ALIGN_16 }, - { X86::ADDPSrr, X86::ADDPSrm, TB_ALIGN_16 }, - { X86::ADDSDrr, X86::ADDSDrm, 0 }, - { X86::ADDSDrr_Int, X86::ADDSDrm_Int, TB_NO_REVERSE }, - { X86::ADDSSrr, X86::ADDSSrm, 0 }, - { X86::ADDSSrr_Int, X86::ADDSSrm_Int, TB_NO_REVERSE }, - { 
X86::ADDSUBPDrr, X86::ADDSUBPDrm, TB_ALIGN_16 }, - { X86::ADDSUBPSrr, X86::ADDSUBPSrm, TB_ALIGN_16 }, - { X86::AND16rr, X86::AND16rm, 0 }, - { X86::AND32rr, X86::AND32rm, 0 }, - { X86::AND64rr, X86::AND64rm, 0 }, - { X86::AND8rr, X86::AND8rm, 0 }, - { X86::ANDNPDrr, X86::ANDNPDrm, TB_ALIGN_16 }, - { X86::ANDNPSrr, X86::ANDNPSrm, TB_ALIGN_16 }, - { X86::ANDPDrr, X86::ANDPDrm, TB_ALIGN_16 }, - { X86::ANDPSrr, X86::ANDPSrm, TB_ALIGN_16 }, - { X86::BLENDPDrri, X86::BLENDPDrmi, TB_ALIGN_16 }, - { X86::BLENDPSrri, X86::BLENDPSrmi, TB_ALIGN_16 }, - { X86::BLENDVPDrr0, X86::BLENDVPDrm0, TB_ALIGN_16 }, - { X86::BLENDVPSrr0, X86::BLENDVPSrm0, TB_ALIGN_16 }, - { X86::CMOVA16rr, X86::CMOVA16rm, 0 }, - { X86::CMOVA32rr, X86::CMOVA32rm, 0 }, - { X86::CMOVA64rr, X86::CMOVA64rm, 0 }, - { X86::CMOVAE16rr, X86::CMOVAE16rm, 0 }, - { X86::CMOVAE32rr, X86::CMOVAE32rm, 0 }, - { X86::CMOVAE64rr, X86::CMOVAE64rm, 0 }, - { X86::CMOVB16rr, X86::CMOVB16rm, 0 }, - { X86::CMOVB32rr, X86::CMOVB32rm, 0 }, - { X86::CMOVB64rr, X86::CMOVB64rm, 0 }, - { X86::CMOVBE16rr, X86::CMOVBE16rm, 0 }, - { X86::CMOVBE32rr, X86::CMOVBE32rm, 0 }, - { X86::CMOVBE64rr, X86::CMOVBE64rm, 0 }, - { X86::CMOVE16rr, X86::CMOVE16rm, 0 }, - { X86::CMOVE32rr, X86::CMOVE32rm, 0 }, - { X86::CMOVE64rr, X86::CMOVE64rm, 0 }, - { X86::CMOVG16rr, X86::CMOVG16rm, 0 }, - { X86::CMOVG32rr, X86::CMOVG32rm, 0 }, - { X86::CMOVG64rr, X86::CMOVG64rm, 0 }, - { X86::CMOVGE16rr, X86::CMOVGE16rm, 0 }, - { X86::CMOVGE32rr, X86::CMOVGE32rm, 0 }, - { X86::CMOVGE64rr, X86::CMOVGE64rm, 0 }, - { X86::CMOVL16rr, X86::CMOVL16rm, 0 }, - { X86::CMOVL32rr, X86::CMOVL32rm, 0 }, - { X86::CMOVL64rr, X86::CMOVL64rm, 0 }, - { X86::CMOVLE16rr, X86::CMOVLE16rm, 0 }, - { X86::CMOVLE32rr, X86::CMOVLE32rm, 0 }, - { X86::CMOVLE64rr, X86::CMOVLE64rm, 0 }, - { X86::CMOVNE16rr, X86::CMOVNE16rm, 0 }, - { X86::CMOVNE32rr, X86::CMOVNE32rm, 0 }, - { X86::CMOVNE64rr, X86::CMOVNE64rm, 0 }, - { X86::CMOVNO16rr, X86::CMOVNO16rm, 0 }, - { X86::CMOVNO32rr, X86::CMOVNO32rm, 0 }, - { X86::CMOVNO64rr, X86::CMOVNO64rm, 0 }, - { X86::CMOVNP16rr, X86::CMOVNP16rm, 0 }, - { X86::CMOVNP32rr, X86::CMOVNP32rm, 0 }, - { X86::CMOVNP64rr, X86::CMOVNP64rm, 0 }, - { X86::CMOVNS16rr, X86::CMOVNS16rm, 0 }, - { X86::CMOVNS32rr, X86::CMOVNS32rm, 0 }, - { X86::CMOVNS64rr, X86::CMOVNS64rm, 0 }, - { X86::CMOVO16rr, X86::CMOVO16rm, 0 }, - { X86::CMOVO32rr, X86::CMOVO32rm, 0 }, - { X86::CMOVO64rr, X86::CMOVO64rm, 0 }, - { X86::CMOVP16rr, X86::CMOVP16rm, 0 }, - { X86::CMOVP32rr, X86::CMOVP32rm, 0 }, - { X86::CMOVP64rr, X86::CMOVP64rm, 0 }, - { X86::CMOVS16rr, X86::CMOVS16rm, 0 }, - { X86::CMOVS32rr, X86::CMOVS32rm, 0 }, - { X86::CMOVS64rr, X86::CMOVS64rm, 0 }, - { X86::CMPPDrri, X86::CMPPDrmi, TB_ALIGN_16 }, - { X86::CMPPSrri, X86::CMPPSrmi, TB_ALIGN_16 }, - { X86::CMPSDrr, X86::CMPSDrm, 0 }, - { X86::CMPSSrr, X86::CMPSSrm, 0 }, - { X86::CRC32r32r32, X86::CRC32r32m32, 0 }, - { X86::CRC32r64r64, X86::CRC32r64m64, 0 }, - { X86::DIVPDrr, X86::DIVPDrm, TB_ALIGN_16 }, - { X86::DIVPSrr, X86::DIVPSrm, TB_ALIGN_16 }, - { X86::DIVSDrr, X86::DIVSDrm, 0 }, - { X86::DIVSDrr_Int, X86::DIVSDrm_Int, TB_NO_REVERSE }, - { X86::DIVSSrr, X86::DIVSSrm, 0 }, - { X86::DIVSSrr_Int, X86::DIVSSrm_Int, TB_NO_REVERSE }, - { X86::DPPDrri, X86::DPPDrmi, TB_ALIGN_16 }, - { X86::DPPSrri, X86::DPPSrmi, TB_ALIGN_16 }, - { X86::HADDPDrr, X86::HADDPDrm, TB_ALIGN_16 }, - { X86::HADDPSrr, X86::HADDPSrm, TB_ALIGN_16 }, - { X86::HSUBPDrr, X86::HSUBPDrm, TB_ALIGN_16 }, - { X86::HSUBPSrr, X86::HSUBPSrm, TB_ALIGN_16 }, - { X86::IMUL16rr, X86::IMUL16rm, 0 }, - { 
X86::IMUL32rr, X86::IMUL32rm, 0 }, - { X86::IMUL64rr, X86::IMUL64rm, 0 }, - { X86::Int_CMPSDrr, X86::Int_CMPSDrm, TB_NO_REVERSE }, - { X86::Int_CMPSSrr, X86::Int_CMPSSrm, TB_NO_REVERSE }, - { X86::Int_CVTSD2SSrr, X86::Int_CVTSD2SSrm, TB_NO_REVERSE }, - { X86::Int_CVTSI2SD64rr,X86::Int_CVTSI2SD64rm, 0 }, - { X86::Int_CVTSI2SDrr, X86::Int_CVTSI2SDrm, 0 }, - { X86::Int_CVTSI2SS64rr,X86::Int_CVTSI2SS64rm, 0 }, - { X86::Int_CVTSI2SSrr, X86::Int_CVTSI2SSrm, 0 }, - { X86::Int_CVTSS2SDrr, X86::Int_CVTSS2SDrm, TB_NO_REVERSE }, - { X86::MAXPDrr, X86::MAXPDrm, TB_ALIGN_16 }, - { X86::MAXCPDrr, X86::MAXCPDrm, TB_ALIGN_16 }, - { X86::MAXPSrr, X86::MAXPSrm, TB_ALIGN_16 }, - { X86::MAXCPSrr, X86::MAXCPSrm, TB_ALIGN_16 }, - { X86::MAXSDrr, X86::MAXSDrm, 0 }, - { X86::MAXCSDrr, X86::MAXCSDrm, 0 }, - { X86::MAXSDrr_Int, X86::MAXSDrm_Int, TB_NO_REVERSE }, - { X86::MAXSSrr, X86::MAXSSrm, 0 }, - { X86::MAXCSSrr, X86::MAXCSSrm, 0 }, - { X86::MAXSSrr_Int, X86::MAXSSrm_Int, TB_NO_REVERSE }, - { X86::MINPDrr, X86::MINPDrm, TB_ALIGN_16 }, - { X86::MINCPDrr, X86::MINCPDrm, TB_ALIGN_16 }, - { X86::MINPSrr, X86::MINPSrm, TB_ALIGN_16 }, - { X86::MINCPSrr, X86::MINCPSrm, TB_ALIGN_16 }, - { X86::MINSDrr, X86::MINSDrm, 0 }, - { X86::MINCSDrr, X86::MINCSDrm, 0 }, - { X86::MINSDrr_Int, X86::MINSDrm_Int, TB_NO_REVERSE }, - { X86::MINSSrr, X86::MINSSrm, 0 }, - { X86::MINCSSrr, X86::MINCSSrm, 0 }, - { X86::MINSSrr_Int, X86::MINSSrm_Int, TB_NO_REVERSE }, - { X86::MOVLHPSrr, X86::MOVHPSrm, TB_NO_REVERSE }, - { X86::MPSADBWrri, X86::MPSADBWrmi, TB_ALIGN_16 }, - { X86::MULPDrr, X86::MULPDrm, TB_ALIGN_16 }, - { X86::MULPSrr, X86::MULPSrm, TB_ALIGN_16 }, - { X86::MULSDrr, X86::MULSDrm, 0 }, - { X86::MULSDrr_Int, X86::MULSDrm_Int, TB_NO_REVERSE }, - { X86::MULSSrr, X86::MULSSrm, 0 }, - { X86::MULSSrr_Int, X86::MULSSrm_Int, TB_NO_REVERSE }, - { X86::OR16rr, X86::OR16rm, 0 }, - { X86::OR32rr, X86::OR32rm, 0 }, - { X86::OR64rr, X86::OR64rm, 0 }, - { X86::OR8rr, X86::OR8rm, 0 }, - { X86::ORPDrr, X86::ORPDrm, TB_ALIGN_16 }, - { X86::ORPSrr, X86::ORPSrm, TB_ALIGN_16 }, - { X86::PACKSSDWrr, X86::PACKSSDWrm, TB_ALIGN_16 }, - { X86::PACKSSWBrr, X86::PACKSSWBrm, TB_ALIGN_16 }, - { X86::PACKUSDWrr, X86::PACKUSDWrm, TB_ALIGN_16 }, - { X86::PACKUSWBrr, X86::PACKUSWBrm, TB_ALIGN_16 }, - { X86::PADDBrr, X86::PADDBrm, TB_ALIGN_16 }, - { X86::PADDDrr, X86::PADDDrm, TB_ALIGN_16 }, - { X86::PADDQrr, X86::PADDQrm, TB_ALIGN_16 }, - { X86::PADDSBrr, X86::PADDSBrm, TB_ALIGN_16 }, - { X86::PADDSWrr, X86::PADDSWrm, TB_ALIGN_16 }, - { X86::PADDUSBrr, X86::PADDUSBrm, TB_ALIGN_16 }, - { X86::PADDUSWrr, X86::PADDUSWrm, TB_ALIGN_16 }, - { X86::PADDWrr, X86::PADDWrm, TB_ALIGN_16 }, - { X86::PALIGNRrri, X86::PALIGNRrmi, TB_ALIGN_16 }, - { X86::PANDNrr, X86::PANDNrm, TB_ALIGN_16 }, - { X86::PANDrr, X86::PANDrm, TB_ALIGN_16 }, - { X86::PAVGBrr, X86::PAVGBrm, TB_ALIGN_16 }, - { X86::PAVGWrr, X86::PAVGWrm, TB_ALIGN_16 }, - { X86::PBLENDVBrr0, X86::PBLENDVBrm0, TB_ALIGN_16 }, - { X86::PBLENDWrri, X86::PBLENDWrmi, TB_ALIGN_16 }, - { X86::PCLMULQDQrr, X86::PCLMULQDQrm, TB_ALIGN_16 }, - { X86::PCMPEQBrr, X86::PCMPEQBrm, TB_ALIGN_16 }, - { X86::PCMPEQDrr, X86::PCMPEQDrm, TB_ALIGN_16 }, - { X86::PCMPEQQrr, X86::PCMPEQQrm, TB_ALIGN_16 }, - { X86::PCMPEQWrr, X86::PCMPEQWrm, TB_ALIGN_16 }, - { X86::PCMPGTBrr, X86::PCMPGTBrm, TB_ALIGN_16 }, - { X86::PCMPGTDrr, X86::PCMPGTDrm, TB_ALIGN_16 }, - { X86::PCMPGTQrr, X86::PCMPGTQrm, TB_ALIGN_16 }, - { X86::PCMPGTWrr, X86::PCMPGTWrm, TB_ALIGN_16 }, - { X86::PHADDDrr, X86::PHADDDrm, TB_ALIGN_16 }, - { X86::PHADDWrr, X86::PHADDWrm, 
TB_ALIGN_16 }, - { X86::PHADDSWrr128, X86::PHADDSWrm128, TB_ALIGN_16 }, - { X86::PHSUBDrr, X86::PHSUBDrm, TB_ALIGN_16 }, - { X86::PHSUBSWrr128, X86::PHSUBSWrm128, TB_ALIGN_16 }, - { X86::PHSUBWrr, X86::PHSUBWrm, TB_ALIGN_16 }, - { X86::PINSRBrr, X86::PINSRBrm, 0 }, - { X86::PINSRDrr, X86::PINSRDrm, 0 }, - { X86::PINSRQrr, X86::PINSRQrm, 0 }, - { X86::PINSRWrri, X86::PINSRWrmi, 0 }, - { X86::PMADDUBSWrr, X86::PMADDUBSWrm, TB_ALIGN_16 }, - { X86::PMADDWDrr, X86::PMADDWDrm, TB_ALIGN_16 }, - { X86::PMAXSBrr, X86::PMAXSBrm, TB_ALIGN_16 }, - { X86::PMAXSDrr, X86::PMAXSDrm, TB_ALIGN_16 }, - { X86::PMAXSWrr, X86::PMAXSWrm, TB_ALIGN_16 }, - { X86::PMAXUBrr, X86::PMAXUBrm, TB_ALIGN_16 }, - { X86::PMAXUDrr, X86::PMAXUDrm, TB_ALIGN_16 }, - { X86::PMAXUWrr, X86::PMAXUWrm, TB_ALIGN_16 }, - { X86::PMINSBrr, X86::PMINSBrm, TB_ALIGN_16 }, - { X86::PMINSDrr, X86::PMINSDrm, TB_ALIGN_16 }, - { X86::PMINSWrr, X86::PMINSWrm, TB_ALIGN_16 }, - { X86::PMINUBrr, X86::PMINUBrm, TB_ALIGN_16 }, - { X86::PMINUDrr, X86::PMINUDrm, TB_ALIGN_16 }, - { X86::PMINUWrr, X86::PMINUWrm, TB_ALIGN_16 }, - { X86::PMULDQrr, X86::PMULDQrm, TB_ALIGN_16 }, - { X86::PMULHRSWrr, X86::PMULHRSWrm, TB_ALIGN_16 }, - { X86::PMULHUWrr, X86::PMULHUWrm, TB_ALIGN_16 }, - { X86::PMULHWrr, X86::PMULHWrm, TB_ALIGN_16 }, - { X86::PMULLDrr, X86::PMULLDrm, TB_ALIGN_16 }, - { X86::PMULLWrr, X86::PMULLWrm, TB_ALIGN_16 }, - { X86::PMULUDQrr, X86::PMULUDQrm, TB_ALIGN_16 }, - { X86::PORrr, X86::PORrm, TB_ALIGN_16 }, - { X86::PSADBWrr, X86::PSADBWrm, TB_ALIGN_16 }, - { X86::PSHUFBrr, X86::PSHUFBrm, TB_ALIGN_16 }, - { X86::PSIGNBrr128, X86::PSIGNBrm128, TB_ALIGN_16 }, - { X86::PSIGNWrr128, X86::PSIGNWrm128, TB_ALIGN_16 }, - { X86::PSIGNDrr128, X86::PSIGNDrm128, TB_ALIGN_16 }, - { X86::PSLLDrr, X86::PSLLDrm, TB_ALIGN_16 }, - { X86::PSLLQrr, X86::PSLLQrm, TB_ALIGN_16 }, - { X86::PSLLWrr, X86::PSLLWrm, TB_ALIGN_16 }, - { X86::PSRADrr, X86::PSRADrm, TB_ALIGN_16 }, - { X86::PSRAWrr, X86::PSRAWrm, TB_ALIGN_16 }, - { X86::PSRLDrr, X86::PSRLDrm, TB_ALIGN_16 }, - { X86::PSRLQrr, X86::PSRLQrm, TB_ALIGN_16 }, - { X86::PSRLWrr, X86::PSRLWrm, TB_ALIGN_16 }, - { X86::PSUBBrr, X86::PSUBBrm, TB_ALIGN_16 }, - { X86::PSUBDrr, X86::PSUBDrm, TB_ALIGN_16 }, - { X86::PSUBQrr, X86::PSUBQrm, TB_ALIGN_16 }, - { X86::PSUBSBrr, X86::PSUBSBrm, TB_ALIGN_16 }, - { X86::PSUBSWrr, X86::PSUBSWrm, TB_ALIGN_16 }, - { X86::PSUBUSBrr, X86::PSUBUSBrm, TB_ALIGN_16 }, - { X86::PSUBUSWrr, X86::PSUBUSWrm, TB_ALIGN_16 }, - { X86::PSUBWrr, X86::PSUBWrm, TB_ALIGN_16 }, - { X86::PUNPCKHBWrr, X86::PUNPCKHBWrm, TB_ALIGN_16 }, - { X86::PUNPCKHDQrr, X86::PUNPCKHDQrm, TB_ALIGN_16 }, - { X86::PUNPCKHQDQrr, X86::PUNPCKHQDQrm, TB_ALIGN_16 }, - { X86::PUNPCKHWDrr, X86::PUNPCKHWDrm, TB_ALIGN_16 }, - { X86::PUNPCKLBWrr, X86::PUNPCKLBWrm, TB_ALIGN_16 }, - { X86::PUNPCKLDQrr, X86::PUNPCKLDQrm, TB_ALIGN_16 }, - { X86::PUNPCKLQDQrr, X86::PUNPCKLQDQrm, TB_ALIGN_16 }, - { X86::PUNPCKLWDrr, X86::PUNPCKLWDrm, TB_ALIGN_16 }, - { X86::PXORrr, X86::PXORrm, TB_ALIGN_16 }, - { X86::ROUNDSDr_Int, X86::ROUNDSDm_Int, TB_NO_REVERSE }, - { X86::ROUNDSSr_Int, X86::ROUNDSSm_Int, TB_NO_REVERSE }, - { X86::SBB32rr, X86::SBB32rm, 0 }, - { X86::SBB64rr, X86::SBB64rm, 0 }, - { X86::SHUFPDrri, X86::SHUFPDrmi, TB_ALIGN_16 }, - { X86::SHUFPSrri, X86::SHUFPSrmi, TB_ALIGN_16 }, - { X86::SUB16rr, X86::SUB16rm, 0 }, - { X86::SUB32rr, X86::SUB32rm, 0 }, - { X86::SUB64rr, X86::SUB64rm, 0 }, - { X86::SUB8rr, X86::SUB8rm, 0 }, - { X86::SUBPDrr, X86::SUBPDrm, TB_ALIGN_16 }, - { X86::SUBPSrr, X86::SUBPSrm, TB_ALIGN_16 }, - { X86::SUBSDrr, X86::SUBSDrm, 
0 }, - { X86::SUBSDrr_Int, X86::SUBSDrm_Int, TB_NO_REVERSE }, - { X86::SUBSSrr, X86::SUBSSrm, 0 }, - { X86::SUBSSrr_Int, X86::SUBSSrm_Int, TB_NO_REVERSE }, - // FIXME: TEST*rr -> swapped operand of TEST*mr. - { X86::UNPCKHPDrr, X86::UNPCKHPDrm, TB_ALIGN_16 }, - { X86::UNPCKHPSrr, X86::UNPCKHPSrm, TB_ALIGN_16 }, - { X86::UNPCKLPDrr, X86::UNPCKLPDrm, TB_ALIGN_16 }, - { X86::UNPCKLPSrr, X86::UNPCKLPSrm, TB_ALIGN_16 }, - { X86::XOR16rr, X86::XOR16rm, 0 }, - { X86::XOR32rr, X86::XOR32rm, 0 }, - { X86::XOR64rr, X86::XOR64rm, 0 }, - { X86::XOR8rr, X86::XOR8rm, 0 }, - { X86::XORPDrr, X86::XORPDrm, TB_ALIGN_16 }, - { X86::XORPSrr, X86::XORPSrm, TB_ALIGN_16 }, - - // MMX version of foldable instructions - { X86::MMX_CVTPI2PSirr, X86::MMX_CVTPI2PSirm, 0 }, - { X86::MMX_PACKSSDWirr, X86::MMX_PACKSSDWirm, 0 }, - { X86::MMX_PACKSSWBirr, X86::MMX_PACKSSWBirm, 0 }, - { X86::MMX_PACKUSWBirr, X86::MMX_PACKUSWBirm, 0 }, - { X86::MMX_PADDBirr, X86::MMX_PADDBirm, 0 }, - { X86::MMX_PADDDirr, X86::MMX_PADDDirm, 0 }, - { X86::MMX_PADDQirr, X86::MMX_PADDQirm, 0 }, - { X86::MMX_PADDSBirr, X86::MMX_PADDSBirm, 0 }, - { X86::MMX_PADDSWirr, X86::MMX_PADDSWirm, 0 }, - { X86::MMX_PADDUSBirr, X86::MMX_PADDUSBirm, 0 }, - { X86::MMX_PADDUSWirr, X86::MMX_PADDUSWirm, 0 }, - { X86::MMX_PADDWirr, X86::MMX_PADDWirm, 0 }, - { X86::MMX_PALIGNR64irr, X86::MMX_PALIGNR64irm, 0 }, - { X86::MMX_PANDNirr, X86::MMX_PANDNirm, 0 }, - { X86::MMX_PANDirr, X86::MMX_PANDirm, 0 }, - { X86::MMX_PAVGBirr, X86::MMX_PAVGBirm, 0 }, - { X86::MMX_PAVGWirr, X86::MMX_PAVGWirm, 0 }, - { X86::MMX_PCMPEQBirr, X86::MMX_PCMPEQBirm, 0 }, - { X86::MMX_PCMPEQDirr, X86::MMX_PCMPEQDirm, 0 }, - { X86::MMX_PCMPEQWirr, X86::MMX_PCMPEQWirm, 0 }, - { X86::MMX_PCMPGTBirr, X86::MMX_PCMPGTBirm, 0 }, - { X86::MMX_PCMPGTDirr, X86::MMX_PCMPGTDirm, 0 }, - { X86::MMX_PCMPGTWirr, X86::MMX_PCMPGTWirm, 0 }, - { X86::MMX_PHADDSWrr64, X86::MMX_PHADDSWrm64, 0 }, - { X86::MMX_PHADDWrr64, X86::MMX_PHADDWrm64, 0 }, - { X86::MMX_PHADDrr64, X86::MMX_PHADDrm64, 0 }, - { X86::MMX_PHSUBDrr64, X86::MMX_PHSUBDrm64, 0 }, - { X86::MMX_PHSUBSWrr64, X86::MMX_PHSUBSWrm64, 0 }, - { X86::MMX_PHSUBWrr64, X86::MMX_PHSUBWrm64, 0 }, - { X86::MMX_PINSRWirri, X86::MMX_PINSRWirmi, 0 }, - { X86::MMX_PMADDUBSWrr64, X86::MMX_PMADDUBSWrm64, 0 }, - { X86::MMX_PMADDWDirr, X86::MMX_PMADDWDirm, 0 }, - { X86::MMX_PMAXSWirr, X86::MMX_PMAXSWirm, 0 }, - { X86::MMX_PMAXUBirr, X86::MMX_PMAXUBirm, 0 }, - { X86::MMX_PMINSWirr, X86::MMX_PMINSWirm, 0 }, - { X86::MMX_PMINUBirr, X86::MMX_PMINUBirm, 0 }, - { X86::MMX_PMULHRSWrr64, X86::MMX_PMULHRSWrm64, 0 }, - { X86::MMX_PMULHUWirr, X86::MMX_PMULHUWirm, 0 }, - { X86::MMX_PMULHWirr, X86::MMX_PMULHWirm, 0 }, - { X86::MMX_PMULLWirr, X86::MMX_PMULLWirm, 0 }, - { X86::MMX_PMULUDQirr, X86::MMX_PMULUDQirm, 0 }, - { X86::MMX_PORirr, X86::MMX_PORirm, 0 }, - { X86::MMX_PSADBWirr, X86::MMX_PSADBWirm, 0 }, - { X86::MMX_PSHUFBrr64, X86::MMX_PSHUFBrm64, 0 }, - { X86::MMX_PSIGNBrr64, X86::MMX_PSIGNBrm64, 0 }, - { X86::MMX_PSIGNDrr64, X86::MMX_PSIGNDrm64, 0 }, - { X86::MMX_PSIGNWrr64, X86::MMX_PSIGNWrm64, 0 }, - { X86::MMX_PSLLDrr, X86::MMX_PSLLDrm, 0 }, - { X86::MMX_PSLLQrr, X86::MMX_PSLLQrm, 0 }, - { X86::MMX_PSLLWrr, X86::MMX_PSLLWrm, 0 }, - { X86::MMX_PSRADrr, X86::MMX_PSRADrm, 0 }, - { X86::MMX_PSRAWrr, X86::MMX_PSRAWrm, 0 }, - { X86::MMX_PSRLDrr, X86::MMX_PSRLDrm, 0 }, - { X86::MMX_PSRLQrr, X86::MMX_PSRLQrm, 0 }, - { X86::MMX_PSRLWrr, X86::MMX_PSRLWrm, 0 }, - { X86::MMX_PSUBBirr, X86::MMX_PSUBBirm, 0 }, - { X86::MMX_PSUBDirr, X86::MMX_PSUBDirm, 0 }, - { X86::MMX_PSUBQirr, 
X86::MMX_PSUBQirm, 0 }, - { X86::MMX_PSUBSBirr, X86::MMX_PSUBSBirm, 0 }, - { X86::MMX_PSUBSWirr, X86::MMX_PSUBSWirm, 0 }, - { X86::MMX_PSUBUSBirr, X86::MMX_PSUBUSBirm, 0 }, - { X86::MMX_PSUBUSWirr, X86::MMX_PSUBUSWirm, 0 }, - { X86::MMX_PSUBWirr, X86::MMX_PSUBWirm, 0 }, - { X86::MMX_PUNPCKHBWirr, X86::MMX_PUNPCKHBWirm, 0 }, - { X86::MMX_PUNPCKHDQirr, X86::MMX_PUNPCKHDQirm, 0 }, - { X86::MMX_PUNPCKHWDirr, X86::MMX_PUNPCKHWDirm, 0 }, - { X86::MMX_PUNPCKLBWirr, X86::MMX_PUNPCKLBWirm, 0 }, - { X86::MMX_PUNPCKLDQirr, X86::MMX_PUNPCKLDQirm, 0 }, - { X86::MMX_PUNPCKLWDirr, X86::MMX_PUNPCKLWDirm, 0 }, - { X86::MMX_PXORirr, X86::MMX_PXORirm, 0 }, - - // 3DNow! version of foldable instructions - { X86::PAVGUSBrr, X86::PAVGUSBrm, 0 }, - { X86::PFACCrr, X86::PFACCrm, 0 }, - { X86::PFADDrr, X86::PFADDrm, 0 }, - { X86::PFCMPEQrr, X86::PFCMPEQrm, 0 }, - { X86::PFCMPGErr, X86::PFCMPGErm, 0 }, - { X86::PFCMPGTrr, X86::PFCMPGTrm, 0 }, - { X86::PFMAXrr, X86::PFMAXrm, 0 }, - { X86::PFMINrr, X86::PFMINrm, 0 }, - { X86::PFMULrr, X86::PFMULrm, 0 }, - { X86::PFNACCrr, X86::PFNACCrm, 0 }, - { X86::PFPNACCrr, X86::PFPNACCrm, 0 }, - { X86::PFRCPIT1rr, X86::PFRCPIT1rm, 0 }, - { X86::PFRCPIT2rr, X86::PFRCPIT2rm, 0 }, - { X86::PFRSQIT1rr, X86::PFRSQIT1rm, 0 }, - { X86::PFSUBrr, X86::PFSUBrm, 0 }, - { X86::PFSUBRrr, X86::PFSUBRrm, 0 }, - { X86::PMULHRWrr, X86::PMULHRWrm, 0 }, - - // AVX 128-bit versions of foldable instructions - { X86::VCVTSI2SD64rr, X86::VCVTSI2SD64rm, 0 }, - { X86::Int_VCVTSI2SD64rr, X86::Int_VCVTSI2SD64rm, 0 }, - { X86::VCVTSI2SDrr, X86::VCVTSI2SDrm, 0 }, - { X86::Int_VCVTSI2SDrr, X86::Int_VCVTSI2SDrm, 0 }, - { X86::VCVTSI2SS64rr, X86::VCVTSI2SS64rm, 0 }, - { X86::Int_VCVTSI2SS64rr, X86::Int_VCVTSI2SS64rm, 0 }, - { X86::VCVTSI2SSrr, X86::VCVTSI2SSrm, 0 }, - { X86::Int_VCVTSI2SSrr, X86::Int_VCVTSI2SSrm, 0 }, - { X86::VADDPDrr, X86::VADDPDrm, 0 }, - { X86::VADDPSrr, X86::VADDPSrm, 0 }, - { X86::VADDSDrr, X86::VADDSDrm, 0 }, - { X86::VADDSDrr_Int, X86::VADDSDrm_Int, TB_NO_REVERSE }, - { X86::VADDSSrr, X86::VADDSSrm, 0 }, - { X86::VADDSSrr_Int, X86::VADDSSrm_Int, TB_NO_REVERSE }, - { X86::VADDSUBPDrr, X86::VADDSUBPDrm, 0 }, - { X86::VADDSUBPSrr, X86::VADDSUBPSrm, 0 }, - { X86::VANDNPDrr, X86::VANDNPDrm, 0 }, - { X86::VANDNPSrr, X86::VANDNPSrm, 0 }, - { X86::VANDPDrr, X86::VANDPDrm, 0 }, - { X86::VANDPSrr, X86::VANDPSrm, 0 }, - { X86::VBLENDPDrri, X86::VBLENDPDrmi, 0 }, - { X86::VBLENDPSrri, X86::VBLENDPSrmi, 0 }, - { X86::VBLENDVPDrr, X86::VBLENDVPDrm, 0 }, - { X86::VBLENDVPSrr, X86::VBLENDVPSrm, 0 }, - { X86::VCMPPDrri, X86::VCMPPDrmi, 0 }, - { X86::VCMPPSrri, X86::VCMPPSrmi, 0 }, - { X86::VCMPSDrr, X86::VCMPSDrm, 0 }, - { X86::VCMPSSrr, X86::VCMPSSrm, 0 }, - { X86::VDIVPDrr, X86::VDIVPDrm, 0 }, - { X86::VDIVPSrr, X86::VDIVPSrm, 0 }, - { X86::VDIVSDrr, X86::VDIVSDrm, 0 }, - { X86::VDIVSDrr_Int, X86::VDIVSDrm_Int, TB_NO_REVERSE }, - { X86::VDIVSSrr, X86::VDIVSSrm, 0 }, - { X86::VDIVSSrr_Int, X86::VDIVSSrm_Int, TB_NO_REVERSE }, - { X86::VDPPDrri, X86::VDPPDrmi, 0 }, - { X86::VDPPSrri, X86::VDPPSrmi, 0 }, - { X86::VHADDPDrr, X86::VHADDPDrm, 0 }, - { X86::VHADDPSrr, X86::VHADDPSrm, 0 }, - { X86::VHSUBPDrr, X86::VHSUBPDrm, 0 }, - { X86::VHSUBPSrr, X86::VHSUBPSrm, 0 }, - { X86::Int_VCMPSDrr, X86::Int_VCMPSDrm, TB_NO_REVERSE }, - { X86::Int_VCMPSSrr, X86::Int_VCMPSSrm, TB_NO_REVERSE }, - { X86::VMAXCPDrr, X86::VMAXCPDrm, 0 }, - { X86::VMAXCPSrr, X86::VMAXCPSrm, 0 }, - { X86::VMAXCSDrr, X86::VMAXCSDrm, 0 }, - { X86::VMAXCSSrr, X86::VMAXCSSrm, 0 }, - { X86::VMAXPDrr, X86::VMAXPDrm, 0 }, - { X86::VMAXPSrr, 
X86::VMAXPSrm, 0 }, - { X86::VMAXSDrr, X86::VMAXSDrm, 0 }, - { X86::VMAXSDrr_Int, X86::VMAXSDrm_Int, TB_NO_REVERSE }, - { X86::VMAXSSrr, X86::VMAXSSrm, 0 }, - { X86::VMAXSSrr_Int, X86::VMAXSSrm_Int, TB_NO_REVERSE }, - { X86::VMINCPDrr, X86::VMINCPDrm, 0 }, - { X86::VMINCPSrr, X86::VMINCPSrm, 0 }, - { X86::VMINCSDrr, X86::VMINCSDrm, 0 }, - { X86::VMINCSSrr, X86::VMINCSSrm, 0 }, - { X86::VMINPDrr, X86::VMINPDrm, 0 }, - { X86::VMINPSrr, X86::VMINPSrm, 0 }, - { X86::VMINSDrr, X86::VMINSDrm, 0 }, - { X86::VMINSDrr_Int, X86::VMINSDrm_Int, TB_NO_REVERSE }, - { X86::VMINSSrr, X86::VMINSSrm, 0 }, - { X86::VMINSSrr_Int, X86::VMINSSrm_Int, TB_NO_REVERSE }, - { X86::VMOVLHPSrr, X86::VMOVHPSrm, TB_NO_REVERSE }, - { X86::VMPSADBWrri, X86::VMPSADBWrmi, 0 }, - { X86::VMULPDrr, X86::VMULPDrm, 0 }, - { X86::VMULPSrr, X86::VMULPSrm, 0 }, - { X86::VMULSDrr, X86::VMULSDrm, 0 }, - { X86::VMULSDrr_Int, X86::VMULSDrm_Int, TB_NO_REVERSE }, - { X86::VMULSSrr, X86::VMULSSrm, 0 }, - { X86::VMULSSrr_Int, X86::VMULSSrm_Int, TB_NO_REVERSE }, - { X86::VORPDrr, X86::VORPDrm, 0 }, - { X86::VORPSrr, X86::VORPSrm, 0 }, - { X86::VPACKSSDWrr, X86::VPACKSSDWrm, 0 }, - { X86::VPACKSSWBrr, X86::VPACKSSWBrm, 0 }, - { X86::VPACKUSDWrr, X86::VPACKUSDWrm, 0 }, - { X86::VPACKUSWBrr, X86::VPACKUSWBrm, 0 }, - { X86::VPADDBrr, X86::VPADDBrm, 0 }, - { X86::VPADDDrr, X86::VPADDDrm, 0 }, - { X86::VPADDQrr, X86::VPADDQrm, 0 }, - { X86::VPADDSBrr, X86::VPADDSBrm, 0 }, - { X86::VPADDSWrr, X86::VPADDSWrm, 0 }, - { X86::VPADDUSBrr, X86::VPADDUSBrm, 0 }, - { X86::VPADDUSWrr, X86::VPADDUSWrm, 0 }, - { X86::VPADDWrr, X86::VPADDWrm, 0 }, - { X86::VPALIGNRrri, X86::VPALIGNRrmi, 0 }, - { X86::VPANDNrr, X86::VPANDNrm, 0 }, - { X86::VPANDrr, X86::VPANDrm, 0 }, - { X86::VPAVGBrr, X86::VPAVGBrm, 0 }, - { X86::VPAVGWrr, X86::VPAVGWrm, 0 }, - { X86::VPBLENDVBrr, X86::VPBLENDVBrm, 0 }, - { X86::VPBLENDWrri, X86::VPBLENDWrmi, 0 }, - { X86::VPCLMULQDQrr, X86::VPCLMULQDQrm, 0 }, - { X86::VPCMPEQBrr, X86::VPCMPEQBrm, 0 }, - { X86::VPCMPEQDrr, X86::VPCMPEQDrm, 0 }, - { X86::VPCMPEQQrr, X86::VPCMPEQQrm, 0 }, - { X86::VPCMPEQWrr, X86::VPCMPEQWrm, 0 }, - { X86::VPCMPGTBrr, X86::VPCMPGTBrm, 0 }, - { X86::VPCMPGTDrr, X86::VPCMPGTDrm, 0 }, - { X86::VPCMPGTQrr, X86::VPCMPGTQrm, 0 }, - { X86::VPCMPGTWrr, X86::VPCMPGTWrm, 0 }, - { X86::VPHADDDrr, X86::VPHADDDrm, 0 }, - { X86::VPHADDSWrr128, X86::VPHADDSWrm128, 0 }, - { X86::VPHADDWrr, X86::VPHADDWrm, 0 }, - { X86::VPHSUBDrr, X86::VPHSUBDrm, 0 }, - { X86::VPHSUBSWrr128, X86::VPHSUBSWrm128, 0 }, - { X86::VPHSUBWrr, X86::VPHSUBWrm, 0 }, - { X86::VPERMILPDrr, X86::VPERMILPDrm, 0 }, - { X86::VPERMILPSrr, X86::VPERMILPSrm, 0 }, - { X86::VPINSRBrr, X86::VPINSRBrm, 0 }, - { X86::VPINSRDrr, X86::VPINSRDrm, 0 }, - { X86::VPINSRQrr, X86::VPINSRQrm, 0 }, - { X86::VPINSRWrri, X86::VPINSRWrmi, 0 }, - { X86::VPMADDUBSWrr, X86::VPMADDUBSWrm, 0 }, - { X86::VPMADDWDrr, X86::VPMADDWDrm, 0 }, - { X86::VPMAXSBrr, X86::VPMAXSBrm, 0 }, - { X86::VPMAXSDrr, X86::VPMAXSDrm, 0 }, - { X86::VPMAXSWrr, X86::VPMAXSWrm, 0 }, - { X86::VPMAXUBrr, X86::VPMAXUBrm, 0 }, - { X86::VPMAXUDrr, X86::VPMAXUDrm, 0 }, - { X86::VPMAXUWrr, X86::VPMAXUWrm, 0 }, - { X86::VPMINSBrr, X86::VPMINSBrm, 0 }, - { X86::VPMINSDrr, X86::VPMINSDrm, 0 }, - { X86::VPMINSWrr, X86::VPMINSWrm, 0 }, - { X86::VPMINUBrr, X86::VPMINUBrm, 0 }, - { X86::VPMINUDrr, X86::VPMINUDrm, 0 }, - { X86::VPMINUWrr, X86::VPMINUWrm, 0 }, - { X86::VPMULDQrr, X86::VPMULDQrm, 0 }, - { X86::VPMULHRSWrr, X86::VPMULHRSWrm, 0 }, - { X86::VPMULHUWrr, X86::VPMULHUWrm, 0 }, - { X86::VPMULHWrr, X86::VPMULHWrm, 0 }, 
- { X86::VPMULLDrr, X86::VPMULLDrm, 0 }, - { X86::VPMULLWrr, X86::VPMULLWrm, 0 }, - { X86::VPMULUDQrr, X86::VPMULUDQrm, 0 }, - { X86::VPORrr, X86::VPORrm, 0 }, - { X86::VPSADBWrr, X86::VPSADBWrm, 0 }, - { X86::VPSHUFBrr, X86::VPSHUFBrm, 0 }, - { X86::VPSIGNBrr128, X86::VPSIGNBrm128, 0 }, - { X86::VPSIGNWrr128, X86::VPSIGNWrm128, 0 }, - { X86::VPSIGNDrr128, X86::VPSIGNDrm128, 0 }, - { X86::VPSLLDrr, X86::VPSLLDrm, 0 }, - { X86::VPSLLQrr, X86::VPSLLQrm, 0 }, - { X86::VPSLLWrr, X86::VPSLLWrm, 0 }, - { X86::VPSRADrr, X86::VPSRADrm, 0 }, - { X86::VPSRAWrr, X86::VPSRAWrm, 0 }, - { X86::VPSRLDrr, X86::VPSRLDrm, 0 }, - { X86::VPSRLQrr, X86::VPSRLQrm, 0 }, - { X86::VPSRLWrr, X86::VPSRLWrm, 0 }, - { X86::VPSUBBrr, X86::VPSUBBrm, 0 }, - { X86::VPSUBDrr, X86::VPSUBDrm, 0 }, - { X86::VPSUBQrr, X86::VPSUBQrm, 0 }, - { X86::VPSUBSBrr, X86::VPSUBSBrm, 0 }, - { X86::VPSUBSWrr, X86::VPSUBSWrm, 0 }, - { X86::VPSUBUSBrr, X86::VPSUBUSBrm, 0 }, - { X86::VPSUBUSWrr, X86::VPSUBUSWrm, 0 }, - { X86::VPSUBWrr, X86::VPSUBWrm, 0 }, - { X86::VPUNPCKHBWrr, X86::VPUNPCKHBWrm, 0 }, - { X86::VPUNPCKHDQrr, X86::VPUNPCKHDQrm, 0 }, - { X86::VPUNPCKHQDQrr, X86::VPUNPCKHQDQrm, 0 }, - { X86::VPUNPCKHWDrr, X86::VPUNPCKHWDrm, 0 }, - { X86::VPUNPCKLBWrr, X86::VPUNPCKLBWrm, 0 }, - { X86::VPUNPCKLDQrr, X86::VPUNPCKLDQrm, 0 }, - { X86::VPUNPCKLQDQrr, X86::VPUNPCKLQDQrm, 0 }, - { X86::VPUNPCKLWDrr, X86::VPUNPCKLWDrm, 0 }, - { X86::VPXORrr, X86::VPXORrm, 0 }, - { X86::VRCPSSr, X86::VRCPSSm, 0 }, - { X86::VRCPSSr_Int, X86::VRCPSSm_Int, TB_NO_REVERSE }, - { X86::VRSQRTSSr, X86::VRSQRTSSm, 0 }, - { X86::VRSQRTSSr_Int, X86::VRSQRTSSm_Int, TB_NO_REVERSE }, - { X86::VROUNDSDr, X86::VROUNDSDm, 0 }, - { X86::VROUNDSDr_Int, X86::VROUNDSDm_Int, TB_NO_REVERSE }, - { X86::VROUNDSSr, X86::VROUNDSSm, 0 }, - { X86::VROUNDSSr_Int, X86::VROUNDSSm_Int, TB_NO_REVERSE }, - { X86::VSHUFPDrri, X86::VSHUFPDrmi, 0 }, - { X86::VSHUFPSrri, X86::VSHUFPSrmi, 0 }, - { X86::VSQRTSDr, X86::VSQRTSDm, 0 }, - { X86::VSQRTSDr_Int, X86::VSQRTSDm_Int, TB_NO_REVERSE }, - { X86::VSQRTSSr, X86::VSQRTSSm, 0 }, - { X86::VSQRTSSr_Int, X86::VSQRTSSm_Int, TB_NO_REVERSE }, - { X86::VSUBPDrr, X86::VSUBPDrm, 0 }, - { X86::VSUBPSrr, X86::VSUBPSrm, 0 }, - { X86::VSUBSDrr, X86::VSUBSDrm, 0 }, - { X86::VSUBSDrr_Int, X86::VSUBSDrm_Int, TB_NO_REVERSE }, - { X86::VSUBSSrr, X86::VSUBSSrm, 0 }, - { X86::VSUBSSrr_Int, X86::VSUBSSrm_Int, TB_NO_REVERSE }, - { X86::VUNPCKHPDrr, X86::VUNPCKHPDrm, 0 }, - { X86::VUNPCKHPSrr, X86::VUNPCKHPSrm, 0 }, - { X86::VUNPCKLPDrr, X86::VUNPCKLPDrm, 0 }, - { X86::VUNPCKLPSrr, X86::VUNPCKLPSrm, 0 }, - { X86::VXORPDrr, X86::VXORPDrm, 0 }, - { X86::VXORPSrr, X86::VXORPSrm, 0 }, - - // AVX 256-bit foldable instructions - { X86::VADDPDYrr, X86::VADDPDYrm, 0 }, - { X86::VADDPSYrr, X86::VADDPSYrm, 0 }, - { X86::VADDSUBPDYrr, X86::VADDSUBPDYrm, 0 }, - { X86::VADDSUBPSYrr, X86::VADDSUBPSYrm, 0 }, - { X86::VANDNPDYrr, X86::VANDNPDYrm, 0 }, - { X86::VANDNPSYrr, X86::VANDNPSYrm, 0 }, - { X86::VANDPDYrr, X86::VANDPDYrm, 0 }, - { X86::VANDPSYrr, X86::VANDPSYrm, 0 }, - { X86::VBLENDPDYrri, X86::VBLENDPDYrmi, 0 }, - { X86::VBLENDPSYrri, X86::VBLENDPSYrmi, 0 }, - { X86::VBLENDVPDYrr, X86::VBLENDVPDYrm, 0 }, - { X86::VBLENDVPSYrr, X86::VBLENDVPSYrm, 0 }, - { X86::VCMPPDYrri, X86::VCMPPDYrmi, 0 }, - { X86::VCMPPSYrri, X86::VCMPPSYrmi, 0 }, - { X86::VDIVPDYrr, X86::VDIVPDYrm, 0 }, - { X86::VDIVPSYrr, X86::VDIVPSYrm, 0 }, - { X86::VDPPSYrri, X86::VDPPSYrmi, 0 }, - { X86::VHADDPDYrr, X86::VHADDPDYrm, 0 }, - { X86::VHADDPSYrr, X86::VHADDPSYrm, 0 }, - { X86::VHSUBPDYrr, 
X86::VHSUBPDYrm, 0 }, - { X86::VHSUBPSYrr, X86::VHSUBPSYrm, 0 }, - { X86::VINSERTF128rr, X86::VINSERTF128rm, 0 }, - { X86::VMAXCPDYrr, X86::VMAXCPDYrm, 0 }, - { X86::VMAXCPSYrr, X86::VMAXCPSYrm, 0 }, - { X86::VMAXPDYrr, X86::VMAXPDYrm, 0 }, - { X86::VMAXPSYrr, X86::VMAXPSYrm, 0 }, - { X86::VMINCPDYrr, X86::VMINCPDYrm, 0 }, - { X86::VMINCPSYrr, X86::VMINCPSYrm, 0 }, - { X86::VMINPDYrr, X86::VMINPDYrm, 0 }, - { X86::VMINPSYrr, X86::VMINPSYrm, 0 }, - { X86::VMULPDYrr, X86::VMULPDYrm, 0 }, - { X86::VMULPSYrr, X86::VMULPSYrm, 0 }, - { X86::VORPDYrr, X86::VORPDYrm, 0 }, - { X86::VORPSYrr, X86::VORPSYrm, 0 }, - { X86::VPERM2F128rr, X86::VPERM2F128rm, 0 }, - { X86::VPERMILPDYrr, X86::VPERMILPDYrm, 0 }, - { X86::VPERMILPSYrr, X86::VPERMILPSYrm, 0 }, - { X86::VSHUFPDYrri, X86::VSHUFPDYrmi, 0 }, - { X86::VSHUFPSYrri, X86::VSHUFPSYrmi, 0 }, - { X86::VSUBPDYrr, X86::VSUBPDYrm, 0 }, - { X86::VSUBPSYrr, X86::VSUBPSYrm, 0 }, - { X86::VUNPCKHPDYrr, X86::VUNPCKHPDYrm, 0 }, - { X86::VUNPCKHPSYrr, X86::VUNPCKHPSYrm, 0 }, - { X86::VUNPCKLPDYrr, X86::VUNPCKLPDYrm, 0 }, - { X86::VUNPCKLPSYrr, X86::VUNPCKLPSYrm, 0 }, - { X86::VXORPDYrr, X86::VXORPDYrm, 0 }, - { X86::VXORPSYrr, X86::VXORPSYrm, 0 }, - - // AVX2 foldable instructions - { X86::VINSERTI128rr, X86::VINSERTI128rm, 0 }, - { X86::VPACKSSDWYrr, X86::VPACKSSDWYrm, 0 }, - { X86::VPACKSSWBYrr, X86::VPACKSSWBYrm, 0 }, - { X86::VPACKUSDWYrr, X86::VPACKUSDWYrm, 0 }, - { X86::VPACKUSWBYrr, X86::VPACKUSWBYrm, 0 }, - { X86::VPADDBYrr, X86::VPADDBYrm, 0 }, - { X86::VPADDDYrr, X86::VPADDDYrm, 0 }, - { X86::VPADDQYrr, X86::VPADDQYrm, 0 }, - { X86::VPADDSBYrr, X86::VPADDSBYrm, 0 }, - { X86::VPADDSWYrr, X86::VPADDSWYrm, 0 }, - { X86::VPADDUSBYrr, X86::VPADDUSBYrm, 0 }, - { X86::VPADDUSWYrr, X86::VPADDUSWYrm, 0 }, - { X86::VPADDWYrr, X86::VPADDWYrm, 0 }, - { X86::VPALIGNRYrri, X86::VPALIGNRYrmi, 0 }, - { X86::VPANDNYrr, X86::VPANDNYrm, 0 }, - { X86::VPANDYrr, X86::VPANDYrm, 0 }, - { X86::VPAVGBYrr, X86::VPAVGBYrm, 0 }, - { X86::VPAVGWYrr, X86::VPAVGWYrm, 0 }, - { X86::VPBLENDDrri, X86::VPBLENDDrmi, 0 }, - { X86::VPBLENDDYrri, X86::VPBLENDDYrmi, 0 }, - { X86::VPBLENDVBYrr, X86::VPBLENDVBYrm, 0 }, - { X86::VPBLENDWYrri, X86::VPBLENDWYrmi, 0 }, - { X86::VPCMPEQBYrr, X86::VPCMPEQBYrm, 0 }, - { X86::VPCMPEQDYrr, X86::VPCMPEQDYrm, 0 }, - { X86::VPCMPEQQYrr, X86::VPCMPEQQYrm, 0 }, - { X86::VPCMPEQWYrr, X86::VPCMPEQWYrm, 0 }, - { X86::VPCMPGTBYrr, X86::VPCMPGTBYrm, 0 }, - { X86::VPCMPGTDYrr, X86::VPCMPGTDYrm, 0 }, - { X86::VPCMPGTQYrr, X86::VPCMPGTQYrm, 0 }, - { X86::VPCMPGTWYrr, X86::VPCMPGTWYrm, 0 }, - { X86::VPERM2I128rr, X86::VPERM2I128rm, 0 }, - { X86::VPERMDYrr, X86::VPERMDYrm, 0 }, - { X86::VPERMPSYrr, X86::VPERMPSYrm, 0 }, - { X86::VPHADDDYrr, X86::VPHADDDYrm, 0 }, - { X86::VPHADDSWrr256, X86::VPHADDSWrm256, 0 }, - { X86::VPHADDWYrr, X86::VPHADDWYrm, 0 }, - { X86::VPHSUBDYrr, X86::VPHSUBDYrm, 0 }, - { X86::VPHSUBSWrr256, X86::VPHSUBSWrm256, 0 }, - { X86::VPHSUBWYrr, X86::VPHSUBWYrm, 0 }, - { X86::VPMADDUBSWYrr, X86::VPMADDUBSWYrm, 0 }, - { X86::VPMADDWDYrr, X86::VPMADDWDYrm, 0 }, - { X86::VPMAXSBYrr, X86::VPMAXSBYrm, 0 }, - { X86::VPMAXSDYrr, X86::VPMAXSDYrm, 0 }, - { X86::VPMAXSWYrr, X86::VPMAXSWYrm, 0 }, - { X86::VPMAXUBYrr, X86::VPMAXUBYrm, 0 }, - { X86::VPMAXUDYrr, X86::VPMAXUDYrm, 0 }, - { X86::VPMAXUWYrr, X86::VPMAXUWYrm, 0 }, - { X86::VPMINSBYrr, X86::VPMINSBYrm, 0 }, - { X86::VPMINSDYrr, X86::VPMINSDYrm, 0 }, - { X86::VPMINSWYrr, X86::VPMINSWYrm, 0 }, - { X86::VPMINUBYrr, X86::VPMINUBYrm, 0 }, - { X86::VPMINUDYrr, X86::VPMINUDYrm, 0 }, - { X86::VPMINUWYrr, 
X86::VPMINUWYrm, 0 }, - { X86::VMPSADBWYrri, X86::VMPSADBWYrmi, 0 }, - { X86::VPMULDQYrr, X86::VPMULDQYrm, 0 }, - { X86::VPMULHRSWYrr, X86::VPMULHRSWYrm, 0 }, - { X86::VPMULHUWYrr, X86::VPMULHUWYrm, 0 }, - { X86::VPMULHWYrr, X86::VPMULHWYrm, 0 }, - { X86::VPMULLDYrr, X86::VPMULLDYrm, 0 }, - { X86::VPMULLWYrr, X86::VPMULLWYrm, 0 }, - { X86::VPMULUDQYrr, X86::VPMULUDQYrm, 0 }, - { X86::VPORYrr, X86::VPORYrm, 0 }, - { X86::VPSADBWYrr, X86::VPSADBWYrm, 0 }, - { X86::VPSHUFBYrr, X86::VPSHUFBYrm, 0 }, - { X86::VPSIGNBYrr256, X86::VPSIGNBYrm256, 0 }, - { X86::VPSIGNWYrr256, X86::VPSIGNWYrm256, 0 }, - { X86::VPSIGNDYrr256, X86::VPSIGNDYrm256, 0 }, - { X86::VPSLLDYrr, X86::VPSLLDYrm, 0 }, - { X86::VPSLLQYrr, X86::VPSLLQYrm, 0 }, - { X86::VPSLLWYrr, X86::VPSLLWYrm, 0 }, - { X86::VPSLLVDrr, X86::VPSLLVDrm, 0 }, - { X86::VPSLLVDYrr, X86::VPSLLVDYrm, 0 }, - { X86::VPSLLVQrr, X86::VPSLLVQrm, 0 }, - { X86::VPSLLVQYrr, X86::VPSLLVQYrm, 0 }, - { X86::VPSRADYrr, X86::VPSRADYrm, 0 }, - { X86::VPSRAWYrr, X86::VPSRAWYrm, 0 }, - { X86::VPSRAVDrr, X86::VPSRAVDrm, 0 }, - { X86::VPSRAVDYrr, X86::VPSRAVDYrm, 0 }, - { X86::VPSRLDYrr, X86::VPSRLDYrm, 0 }, - { X86::VPSRLQYrr, X86::VPSRLQYrm, 0 }, - { X86::VPSRLWYrr, X86::VPSRLWYrm, 0 }, - { X86::VPSRLVDrr, X86::VPSRLVDrm, 0 }, - { X86::VPSRLVDYrr, X86::VPSRLVDYrm, 0 }, - { X86::VPSRLVQrr, X86::VPSRLVQrm, 0 }, - { X86::VPSRLVQYrr, X86::VPSRLVQYrm, 0 }, - { X86::VPSUBBYrr, X86::VPSUBBYrm, 0 }, - { X86::VPSUBDYrr, X86::VPSUBDYrm, 0 }, - { X86::VPSUBQYrr, X86::VPSUBQYrm, 0 }, - { X86::VPSUBSBYrr, X86::VPSUBSBYrm, 0 }, - { X86::VPSUBSWYrr, X86::VPSUBSWYrm, 0 }, - { X86::VPSUBUSBYrr, X86::VPSUBUSBYrm, 0 }, - { X86::VPSUBUSWYrr, X86::VPSUBUSWYrm, 0 }, - { X86::VPSUBWYrr, X86::VPSUBWYrm, 0 }, - { X86::VPUNPCKHBWYrr, X86::VPUNPCKHBWYrm, 0 }, - { X86::VPUNPCKHDQYrr, X86::VPUNPCKHDQYrm, 0 }, - { X86::VPUNPCKHQDQYrr, X86::VPUNPCKHQDQYrm, 0 }, - { X86::VPUNPCKHWDYrr, X86::VPUNPCKHWDYrm, 0 }, - { X86::VPUNPCKLBWYrr, X86::VPUNPCKLBWYrm, 0 }, - { X86::VPUNPCKLDQYrr, X86::VPUNPCKLDQYrm, 0 }, - { X86::VPUNPCKLQDQYrr, X86::VPUNPCKLQDQYrm, 0 }, - { X86::VPUNPCKLWDYrr, X86::VPUNPCKLWDYrm, 0 }, - { X86::VPXORYrr, X86::VPXORYrm, 0 }, - - // FMA4 foldable patterns - { X86::VFMADDSS4rr, X86::VFMADDSS4mr, TB_ALIGN_NONE }, - { X86::VFMADDSS4rr_Int, X86::VFMADDSS4mr_Int, TB_NO_REVERSE }, - { X86::VFMADDSD4rr, X86::VFMADDSD4mr, TB_ALIGN_NONE }, - { X86::VFMADDSD4rr_Int, X86::VFMADDSD4mr_Int, TB_NO_REVERSE }, - { X86::VFMADDPS4rr, X86::VFMADDPS4mr, TB_ALIGN_NONE }, - { X86::VFMADDPD4rr, X86::VFMADDPD4mr, TB_ALIGN_NONE }, - { X86::VFMADDPS4Yrr, X86::VFMADDPS4Ymr, TB_ALIGN_NONE }, - { X86::VFMADDPD4Yrr, X86::VFMADDPD4Ymr, TB_ALIGN_NONE }, - { X86::VFNMADDSS4rr, X86::VFNMADDSS4mr, TB_ALIGN_NONE }, - { X86::VFNMADDSS4rr_Int, X86::VFNMADDSS4mr_Int, TB_NO_REVERSE }, - { X86::VFNMADDSD4rr, X86::VFNMADDSD4mr, TB_ALIGN_NONE }, - { X86::VFNMADDSD4rr_Int, X86::VFNMADDSD4mr_Int, TB_NO_REVERSE }, - { X86::VFNMADDPS4rr, X86::VFNMADDPS4mr, TB_ALIGN_NONE }, - { X86::VFNMADDPD4rr, X86::VFNMADDPD4mr, TB_ALIGN_NONE }, - { X86::VFNMADDPS4Yrr, X86::VFNMADDPS4Ymr, TB_ALIGN_NONE }, - { X86::VFNMADDPD4Yrr, X86::VFNMADDPD4Ymr, TB_ALIGN_NONE }, - { X86::VFMSUBSS4rr, X86::VFMSUBSS4mr, TB_ALIGN_NONE }, - { X86::VFMSUBSS4rr_Int, X86::VFMSUBSS4mr_Int, TB_NO_REVERSE }, - { X86::VFMSUBSD4rr, X86::VFMSUBSD4mr, TB_ALIGN_NONE }, - { X86::VFMSUBSD4rr_Int, X86::VFMSUBSD4mr_Int, TB_NO_REVERSE }, - { X86::VFMSUBPS4rr, X86::VFMSUBPS4mr, TB_ALIGN_NONE }, - { X86::VFMSUBPD4rr, X86::VFMSUBPD4mr, TB_ALIGN_NONE }, - { X86::VFMSUBPS4Yrr, 
X86::VFMSUBPS4Ymr, TB_ALIGN_NONE }, - { X86::VFMSUBPD4Yrr, X86::VFMSUBPD4Ymr, TB_ALIGN_NONE }, - { X86::VFNMSUBSS4rr, X86::VFNMSUBSS4mr, TB_ALIGN_NONE }, - { X86::VFNMSUBSS4rr_Int, X86::VFNMSUBSS4mr_Int, TB_NO_REVERSE }, - { X86::VFNMSUBSD4rr, X86::VFNMSUBSD4mr, TB_ALIGN_NONE }, - { X86::VFNMSUBSD4rr_Int, X86::VFNMSUBSD4mr_Int, TB_NO_REVERSE }, - { X86::VFNMSUBPS4rr, X86::VFNMSUBPS4mr, TB_ALIGN_NONE }, - { X86::VFNMSUBPD4rr, X86::VFNMSUBPD4mr, TB_ALIGN_NONE }, - { X86::VFNMSUBPS4Yrr, X86::VFNMSUBPS4Ymr, TB_ALIGN_NONE }, - { X86::VFNMSUBPD4Yrr, X86::VFNMSUBPD4Ymr, TB_ALIGN_NONE }, - { X86::VFMADDSUBPS4rr, X86::VFMADDSUBPS4mr, TB_ALIGN_NONE }, - { X86::VFMADDSUBPD4rr, X86::VFMADDSUBPD4mr, TB_ALIGN_NONE }, - { X86::VFMADDSUBPS4Yrr, X86::VFMADDSUBPS4Ymr, TB_ALIGN_NONE }, - { X86::VFMADDSUBPD4Yrr, X86::VFMADDSUBPD4Ymr, TB_ALIGN_NONE }, - { X86::VFMSUBADDPS4rr, X86::VFMSUBADDPS4mr, TB_ALIGN_NONE }, - { X86::VFMSUBADDPD4rr, X86::VFMSUBADDPD4mr, TB_ALIGN_NONE }, - { X86::VFMSUBADDPS4Yrr, X86::VFMSUBADDPS4Ymr, TB_ALIGN_NONE }, - { X86::VFMSUBADDPD4Yrr, X86::VFMSUBADDPD4Ymr, TB_ALIGN_NONE }, - - // XOP foldable instructions - { X86::VPCMOVrrr, X86::VPCMOVrmr, 0 }, - { X86::VPCMOVYrrr, X86::VPCMOVYrmr, 0 }, - { X86::VPCOMBri, X86::VPCOMBmi, 0 }, - { X86::VPCOMDri, X86::VPCOMDmi, 0 }, - { X86::VPCOMQri, X86::VPCOMQmi, 0 }, - { X86::VPCOMWri, X86::VPCOMWmi, 0 }, - { X86::VPCOMUBri, X86::VPCOMUBmi, 0 }, - { X86::VPCOMUDri, X86::VPCOMUDmi, 0 }, - { X86::VPCOMUQri, X86::VPCOMUQmi, 0 }, - { X86::VPCOMUWri, X86::VPCOMUWmi, 0 }, - { X86::VPERMIL2PDrr, X86::VPERMIL2PDmr, 0 }, - { X86::VPERMIL2PDYrr, X86::VPERMIL2PDYmr, 0 }, - { X86::VPERMIL2PSrr, X86::VPERMIL2PSmr, 0 }, - { X86::VPERMIL2PSYrr, X86::VPERMIL2PSYmr, 0 }, - { X86::VPMACSDDrr, X86::VPMACSDDrm, 0 }, - { X86::VPMACSDQHrr, X86::VPMACSDQHrm, 0 }, - { X86::VPMACSDQLrr, X86::VPMACSDQLrm, 0 }, - { X86::VPMACSSDDrr, X86::VPMACSSDDrm, 0 }, - { X86::VPMACSSDQHrr, X86::VPMACSSDQHrm, 0 }, - { X86::VPMACSSDQLrr, X86::VPMACSSDQLrm, 0 }, - { X86::VPMACSSWDrr, X86::VPMACSSWDrm, 0 }, - { X86::VPMACSSWWrr, X86::VPMACSSWWrm, 0 }, - { X86::VPMACSWDrr, X86::VPMACSWDrm, 0 }, - { X86::VPMACSWWrr, X86::VPMACSWWrm, 0 }, - { X86::VPMADCSSWDrr, X86::VPMADCSSWDrm, 0 }, - { X86::VPMADCSWDrr, X86::VPMADCSWDrm, 0 }, - { X86::VPPERMrrr, X86::VPPERMrmr, 0 }, - { X86::VPROTBrr, X86::VPROTBrm, 0 }, - { X86::VPROTDrr, X86::VPROTDrm, 0 }, - { X86::VPROTQrr, X86::VPROTQrm, 0 }, - { X86::VPROTWrr, X86::VPROTWrm, 0 }, - { X86::VPSHABrr, X86::VPSHABrm, 0 }, - { X86::VPSHADrr, X86::VPSHADrm, 0 }, - { X86::VPSHAQrr, X86::VPSHAQrm, 0 }, - { X86::VPSHAWrr, X86::VPSHAWrm, 0 }, - { X86::VPSHLBrr, X86::VPSHLBrm, 0 }, - { X86::VPSHLDrr, X86::VPSHLDrm, 0 }, - { X86::VPSHLQrr, X86::VPSHLQrm, 0 }, - { X86::VPSHLWrr, X86::VPSHLWrm, 0 }, - - // BMI/BMI2 foldable instructions - { X86::ANDN32rr, X86::ANDN32rm, 0 }, - { X86::ANDN64rr, X86::ANDN64rm, 0 }, - { X86::MULX32rr, X86::MULX32rm, 0 }, - { X86::MULX64rr, X86::MULX64rm, 0 }, - { X86::PDEP32rr, X86::PDEP32rm, 0 }, - { X86::PDEP64rr, X86::PDEP64rm, 0 }, - { X86::PEXT32rr, X86::PEXT32rm, 0 }, - { X86::PEXT64rr, X86::PEXT64rm, 0 }, - - // ADX foldable instructions - { X86::ADCX32rr, X86::ADCX32rm, 0 }, - { X86::ADCX64rr, X86::ADCX64rm, 0 }, - { X86::ADOX32rr, X86::ADOX32rm, 0 }, - { X86::ADOX64rr, X86::ADOX64rm, 0 }, - - // AVX-512 foldable instructions - { X86::VADDPDZrr, X86::VADDPDZrm, 0 }, - { X86::VADDPSZrr, X86::VADDPSZrm, 0 }, - { X86::VADDSDZrr, X86::VADDSDZrm, 0 }, - { X86::VADDSDZrr_Int, X86::VADDSDZrm_Int, TB_NO_REVERSE }, - { 
X86::VADDSSZrr, X86::VADDSSZrm, 0 }, - { X86::VADDSSZrr_Int, X86::VADDSSZrm_Int, TB_NO_REVERSE }, - { X86::VALIGNDZrri, X86::VALIGNDZrmi, 0 }, - { X86::VALIGNQZrri, X86::VALIGNQZrmi, 0 }, - { X86::VANDNPDZrr, X86::VANDNPDZrm, 0 }, - { X86::VANDNPSZrr, X86::VANDNPSZrm, 0 }, - { X86::VANDPDZrr, X86::VANDPDZrm, 0 }, - { X86::VANDPSZrr, X86::VANDPSZrm, 0 }, - { X86::VCMPPDZrri, X86::VCMPPDZrmi, 0 }, - { X86::VCMPPSZrri, X86::VCMPPSZrmi, 0 }, - { X86::VCMPSDZrr, X86::VCMPSDZrm, 0 }, - { X86::VCMPSDZrr_Int, X86::VCMPSDZrm_Int, TB_NO_REVERSE }, - { X86::VCMPSSZrr, X86::VCMPSSZrm, 0 }, - { X86::VCMPSSZrr_Int, X86::VCMPSSZrm_Int, TB_NO_REVERSE }, - { X86::VDIVPDZrr, X86::VDIVPDZrm, 0 }, - { X86::VDIVPSZrr, X86::VDIVPSZrm, 0 }, - { X86::VDIVSDZrr, X86::VDIVSDZrm, 0 }, - { X86::VDIVSDZrr_Int, X86::VDIVSDZrm_Int, TB_NO_REVERSE }, - { X86::VDIVSSZrr, X86::VDIVSSZrm, 0 }, - { X86::VDIVSSZrr_Int, X86::VDIVSSZrm_Int, TB_NO_REVERSE }, - { X86::VINSERTF32x4Zrr, X86::VINSERTF32x4Zrm, 0 }, - { X86::VINSERTF32x8Zrr, X86::VINSERTF32x8Zrm, 0 }, - { X86::VINSERTF64x2Zrr, X86::VINSERTF64x2Zrm, 0 }, - { X86::VINSERTF64x4Zrr, X86::VINSERTF64x4Zrm, 0 }, - { X86::VINSERTI32x4Zrr, X86::VINSERTI32x4Zrm, 0 }, - { X86::VINSERTI32x8Zrr, X86::VINSERTI32x8Zrm, 0 }, - { X86::VINSERTI64x2Zrr, X86::VINSERTI64x2Zrm, 0 }, - { X86::VINSERTI64x4Zrr, X86::VINSERTI64x4Zrm, 0 }, - { X86::VMAXCPDZrr, X86::VMAXCPDZrm, 0 }, - { X86::VMAXCPSZrr, X86::VMAXCPSZrm, 0 }, - { X86::VMAXCSDZrr, X86::VMAXCSDZrm, 0 }, - { X86::VMAXCSSZrr, X86::VMAXCSSZrm, 0 }, - { X86::VMAXPDZrr, X86::VMAXPDZrm, 0 }, - { X86::VMAXPSZrr, X86::VMAXPSZrm, 0 }, - { X86::VMAXSDZrr, X86::VMAXSDZrm, 0 }, - { X86::VMAXSDZrr_Int, X86::VMAXSDZrm_Int, TB_NO_REVERSE }, - { X86::VMAXSSZrr, X86::VMAXSSZrm, 0 }, - { X86::VMAXSSZrr_Int, X86::VMAXSSZrm_Int, TB_NO_REVERSE }, - { X86::VMINCPDZrr, X86::VMINCPDZrm, 0 }, - { X86::VMINCPSZrr, X86::VMINCPSZrm, 0 }, - { X86::VMINCSDZrr, X86::VMINCSDZrm, 0 }, - { X86::VMINCSSZrr, X86::VMINCSSZrm, 0 }, - { X86::VMINPDZrr, X86::VMINPDZrm, 0 }, - { X86::VMINPSZrr, X86::VMINPSZrm, 0 }, - { X86::VMINSDZrr, X86::VMINSDZrm, 0 }, - { X86::VMINSDZrr_Int, X86::VMINSDZrm_Int, TB_NO_REVERSE }, - { X86::VMINSSZrr, X86::VMINSSZrm, 0 }, - { X86::VMINSSZrr_Int, X86::VMINSSZrm_Int, TB_NO_REVERSE }, - { X86::VMOVLHPSZrr, X86::VMOVHPSZ128rm, TB_NO_REVERSE }, - { X86::VMULPDZrr, X86::VMULPDZrm, 0 }, - { X86::VMULPSZrr, X86::VMULPSZrm, 0 }, - { X86::VMULSDZrr, X86::VMULSDZrm, 0 }, - { X86::VMULSDZrr_Int, X86::VMULSDZrm_Int, TB_NO_REVERSE }, - { X86::VMULSSZrr, X86::VMULSSZrm, 0 }, - { X86::VMULSSZrr_Int, X86::VMULSSZrm_Int, TB_NO_REVERSE }, - { X86::VORPDZrr, X86::VORPDZrm, 0 }, - { X86::VORPSZrr, X86::VORPSZrm, 0 }, - { X86::VPACKSSDWZrr, X86::VPACKSSDWZrm, 0 }, - { X86::VPACKSSWBZrr, X86::VPACKSSWBZrm, 0 }, - { X86::VPACKUSDWZrr, X86::VPACKUSDWZrm, 0 }, - { X86::VPACKUSWBZrr, X86::VPACKUSWBZrm, 0 }, - { X86::VPADDBZrr, X86::VPADDBZrm, 0 }, - { X86::VPADDDZrr, X86::VPADDDZrm, 0 }, - { X86::VPADDQZrr, X86::VPADDQZrm, 0 }, - { X86::VPADDSBZrr, X86::VPADDSBZrm, 0 }, - { X86::VPADDSWZrr, X86::VPADDSWZrm, 0 }, - { X86::VPADDUSBZrr, X86::VPADDUSBZrm, 0 }, - { X86::VPADDUSWZrr, X86::VPADDUSWZrm, 0 }, - { X86::VPADDWZrr, X86::VPADDWZrm, 0 }, - { X86::VPALIGNRZrri, X86::VPALIGNRZrmi, 0 }, - { X86::VPANDDZrr, X86::VPANDDZrm, 0 }, - { X86::VPANDNDZrr, X86::VPANDNDZrm, 0 }, - { X86::VPANDNQZrr, X86::VPANDNQZrm, 0 }, - { X86::VPANDQZrr, X86::VPANDQZrm, 0 }, - { X86::VPAVGBZrr, X86::VPAVGBZrm, 0 }, - { X86::VPAVGWZrr, X86::VPAVGWZrm, 0 }, - { X86::VPCMPBZrri, 
X86::VPCMPBZrmi, 0 }, - { X86::VPCMPDZrri, X86::VPCMPDZrmi, 0 }, - { X86::VPCMPEQBZrr, X86::VPCMPEQBZrm, 0 }, - { X86::VPCMPEQDZrr, X86::VPCMPEQDZrm, 0 }, - { X86::VPCMPEQQZrr, X86::VPCMPEQQZrm, 0 }, - { X86::VPCMPEQWZrr, X86::VPCMPEQWZrm, 0 }, - { X86::VPCMPGTBZrr, X86::VPCMPGTBZrm, 0 }, - { X86::VPCMPGTDZrr, X86::VPCMPGTDZrm, 0 }, - { X86::VPCMPGTQZrr, X86::VPCMPGTQZrm, 0 }, - { X86::VPCMPGTWZrr, X86::VPCMPGTWZrm, 0 }, - { X86::VPCMPQZrri, X86::VPCMPQZrmi, 0 }, - { X86::VPCMPUBZrri, X86::VPCMPUBZrmi, 0 }, - { X86::VPCMPUDZrri, X86::VPCMPUDZrmi, 0 }, - { X86::VPCMPUQZrri, X86::VPCMPUQZrmi, 0 }, - { X86::VPCMPUWZrri, X86::VPCMPUWZrmi, 0 }, - { X86::VPCMPWZrri, X86::VPCMPWZrmi, 0 }, - { X86::VPERMBZrr, X86::VPERMBZrm, 0 }, - { X86::VPERMDZrr, X86::VPERMDZrm, 0 }, - { X86::VPERMILPDZrr, X86::VPERMILPDZrm, 0 }, - { X86::VPERMILPSZrr, X86::VPERMILPSZrm, 0 }, - { X86::VPERMPDZrr, X86::VPERMPDZrm, 0 }, - { X86::VPERMPSZrr, X86::VPERMPSZrm, 0 }, - { X86::VPERMQZrr, X86::VPERMQZrm, 0 }, - { X86::VPERMWZrr, X86::VPERMWZrm, 0 }, - { X86::VPINSRBZrr, X86::VPINSRBZrm, 0 }, - { X86::VPINSRDZrr, X86::VPINSRDZrm, 0 }, - { X86::VPINSRQZrr, X86::VPINSRQZrm, 0 }, - { X86::VPINSRWZrr, X86::VPINSRWZrm, 0 }, - { X86::VPMADDUBSWZrr, X86::VPMADDUBSWZrm, 0 }, - { X86::VPMADDWDZrr, X86::VPMADDWDZrm, 0 }, - { X86::VPMAXSBZrr, X86::VPMAXSBZrm, 0 }, - { X86::VPMAXSDZrr, X86::VPMAXSDZrm, 0 }, - { X86::VPMAXSQZrr, X86::VPMAXSQZrm, 0 }, - { X86::VPMAXSWZrr, X86::VPMAXSWZrm, 0 }, - { X86::VPMAXUBZrr, X86::VPMAXUBZrm, 0 }, - { X86::VPMAXUDZrr, X86::VPMAXUDZrm, 0 }, - { X86::VPMAXUQZrr, X86::VPMAXUQZrm, 0 }, - { X86::VPMAXUWZrr, X86::VPMAXUWZrm, 0 }, - { X86::VPMINSBZrr, X86::VPMINSBZrm, 0 }, - { X86::VPMINSDZrr, X86::VPMINSDZrm, 0 }, - { X86::VPMINSQZrr, X86::VPMINSQZrm, 0 }, - { X86::VPMINSWZrr, X86::VPMINSWZrm, 0 }, - { X86::VPMINUBZrr, X86::VPMINUBZrm, 0 }, - { X86::VPMINUDZrr, X86::VPMINUDZrm, 0 }, - { X86::VPMINUQZrr, X86::VPMINUQZrm, 0 }, - { X86::VPMINUWZrr, X86::VPMINUWZrm, 0 }, - { X86::VPMULDQZrr, X86::VPMULDQZrm, 0 }, - { X86::VPMULLDZrr, X86::VPMULLDZrm, 0 }, - { X86::VPMULLQZrr, X86::VPMULLQZrm, 0 }, - { X86::VPMULLWZrr, X86::VPMULLWZrm, 0 }, - { X86::VPMULUDQZrr, X86::VPMULUDQZrm, 0 }, - { X86::VPORDZrr, X86::VPORDZrm, 0 }, - { X86::VPORQZrr, X86::VPORQZrm, 0 }, - { X86::VPSADBWZ512rr, X86::VPSADBWZ512rm, 0 }, - { X86::VPSHUFBZrr, X86::VPSHUFBZrm, 0 }, - { X86::VPSLLDZrr, X86::VPSLLDZrm, 0 }, - { X86::VPSLLQZrr, X86::VPSLLQZrm, 0 }, - { X86::VPSLLVDZrr, X86::VPSLLVDZrm, 0 }, - { X86::VPSLLVQZrr, X86::VPSLLVQZrm, 0 }, - { X86::VPSLLVWZrr, X86::VPSLLVWZrm, 0 }, - { X86::VPSLLWZrr, X86::VPSLLWZrm, 0 }, - { X86::VPSRADZrr, X86::VPSRADZrm, 0 }, - { X86::VPSRAQZrr, X86::VPSRAQZrm, 0 }, - { X86::VPSRAVDZrr, X86::VPSRAVDZrm, 0 }, - { X86::VPSRAVQZrr, X86::VPSRAVQZrm, 0 }, - { X86::VPSRAVWZrr, X86::VPSRAVWZrm, 0 }, - { X86::VPSRAWZrr, X86::VPSRAWZrm, 0 }, - { X86::VPSRLDZrr, X86::VPSRLDZrm, 0 }, - { X86::VPSRLQZrr, X86::VPSRLQZrm, 0 }, - { X86::VPSRLVDZrr, X86::VPSRLVDZrm, 0 }, - { X86::VPSRLVQZrr, X86::VPSRLVQZrm, 0 }, - { X86::VPSRLVWZrr, X86::VPSRLVWZrm, 0 }, - { X86::VPSRLWZrr, X86::VPSRLWZrm, 0 }, - { X86::VPSUBBZrr, X86::VPSUBBZrm, 0 }, - { X86::VPSUBDZrr, X86::VPSUBDZrm, 0 }, - { X86::VPSUBQZrr, X86::VPSUBQZrm, 0 }, - { X86::VPSUBSBZrr, X86::VPSUBSBZrm, 0 }, - { X86::VPSUBSWZrr, X86::VPSUBSWZrm, 0 }, - { X86::VPSUBUSBZrr, X86::VPSUBUSBZrm, 0 }, - { X86::VPSUBUSWZrr, X86::VPSUBUSWZrm, 0 }, - { X86::VPSUBWZrr, X86::VPSUBWZrm, 0 }, - { X86::VPUNPCKHBWZrr, X86::VPUNPCKHBWZrm, 0 }, - { X86::VPUNPCKHDQZrr, 
X86::VPUNPCKHDQZrm, 0 }, - { X86::VPUNPCKHQDQZrr, X86::VPUNPCKHQDQZrm, 0 }, - { X86::VPUNPCKHWDZrr, X86::VPUNPCKHWDZrm, 0 }, - { X86::VPUNPCKLBWZrr, X86::VPUNPCKLBWZrm, 0 }, - { X86::VPUNPCKLDQZrr, X86::VPUNPCKLDQZrm, 0 }, - { X86::VPUNPCKLQDQZrr, X86::VPUNPCKLQDQZrm, 0 }, - { X86::VPUNPCKLWDZrr, X86::VPUNPCKLWDZrm, 0 }, - { X86::VPXORDZrr, X86::VPXORDZrm, 0 }, - { X86::VPXORQZrr, X86::VPXORQZrm, 0 }, - { X86::VSHUFPDZrri, X86::VSHUFPDZrmi, 0 }, - { X86::VSHUFPSZrri, X86::VSHUFPSZrmi, 0 }, - { X86::VSUBPDZrr, X86::VSUBPDZrm, 0 }, - { X86::VSUBPSZrr, X86::VSUBPSZrm, 0 }, - { X86::VSUBSDZrr, X86::VSUBSDZrm, 0 }, - { X86::VSUBSDZrr_Int, X86::VSUBSDZrm_Int, TB_NO_REVERSE }, - { X86::VSUBSSZrr, X86::VSUBSSZrm, 0 }, - { X86::VSUBSSZrr_Int, X86::VSUBSSZrm_Int, TB_NO_REVERSE }, - { X86::VUNPCKHPDZrr, X86::VUNPCKHPDZrm, 0 }, - { X86::VUNPCKHPSZrr, X86::VUNPCKHPSZrm, 0 }, - { X86::VUNPCKLPDZrr, X86::VUNPCKLPDZrm, 0 }, - { X86::VUNPCKLPSZrr, X86::VUNPCKLPSZrm, 0 }, - { X86::VXORPDZrr, X86::VXORPDZrm, 0 }, - { X86::VXORPSZrr, X86::VXORPSZrm, 0 }, - - // AVX-512{F,VL} foldable instructions - { X86::VADDPDZ128rr, X86::VADDPDZ128rm, 0 }, - { X86::VADDPDZ256rr, X86::VADDPDZ256rm, 0 }, - { X86::VADDPSZ128rr, X86::VADDPSZ128rm, 0 }, - { X86::VADDPSZ256rr, X86::VADDPSZ256rm, 0 }, - { X86::VALIGNDZ128rri, X86::VALIGNDZ128rmi, 0 }, - { X86::VALIGNDZ256rri, X86::VALIGNDZ256rmi, 0 }, - { X86::VALIGNQZ128rri, X86::VALIGNQZ128rmi, 0 }, - { X86::VALIGNQZ256rri, X86::VALIGNQZ256rmi, 0 }, - { X86::VANDNPDZ128rr, X86::VANDNPDZ128rm, 0 }, - { X86::VANDNPDZ256rr, X86::VANDNPDZ256rm, 0 }, - { X86::VANDNPSZ128rr, X86::VANDNPSZ128rm, 0 }, - { X86::VANDNPSZ256rr, X86::VANDNPSZ256rm, 0 }, - { X86::VANDPDZ128rr, X86::VANDPDZ128rm, 0 }, - { X86::VANDPDZ256rr, X86::VANDPDZ256rm, 0 }, - { X86::VANDPSZ128rr, X86::VANDPSZ128rm, 0 }, - { X86::VANDPSZ256rr, X86::VANDPSZ256rm, 0 }, - { X86::VCMPPDZ128rri, X86::VCMPPDZ128rmi, 0 }, - { X86::VCMPPDZ256rri, X86::VCMPPDZ256rmi, 0 }, - { X86::VCMPPSZ128rri, X86::VCMPPSZ128rmi, 0 }, - { X86::VCMPPSZ256rri, X86::VCMPPSZ256rmi, 0 }, - { X86::VDIVPDZ128rr, X86::VDIVPDZ128rm, 0 }, - { X86::VDIVPDZ256rr, X86::VDIVPDZ256rm, 0 }, - { X86::VDIVPSZ128rr, X86::VDIVPSZ128rm, 0 }, - { X86::VDIVPSZ256rr, X86::VDIVPSZ256rm, 0 }, - { X86::VINSERTF32x4Z256rr,X86::VINSERTF32x4Z256rm, 0 }, - { X86::VINSERTF64x2Z256rr,X86::VINSERTF64x2Z256rm, 0 }, - { X86::VINSERTI32x4Z256rr,X86::VINSERTI32x4Z256rm, 0 }, - { X86::VINSERTI64x2Z256rr,X86::VINSERTI64x2Z256rm, 0 }, - { X86::VMAXCPDZ128rr, X86::VMAXCPDZ128rm, 0 }, - { X86::VMAXCPDZ256rr, X86::VMAXCPDZ256rm, 0 }, - { X86::VMAXCPSZ128rr, X86::VMAXCPSZ128rm, 0 }, - { X86::VMAXCPSZ256rr, X86::VMAXCPSZ256rm, 0 }, - { X86::VMAXPDZ128rr, X86::VMAXPDZ128rm, 0 }, - { X86::VMAXPDZ256rr, X86::VMAXPDZ256rm, 0 }, - { X86::VMAXPSZ128rr, X86::VMAXPSZ128rm, 0 }, - { X86::VMAXPSZ256rr, X86::VMAXPSZ256rm, 0 }, - { X86::VMINCPDZ128rr, X86::VMINCPDZ128rm, 0 }, - { X86::VMINCPDZ256rr, X86::VMINCPDZ256rm, 0 }, - { X86::VMINCPSZ128rr, X86::VMINCPSZ128rm, 0 }, - { X86::VMINCPSZ256rr, X86::VMINCPSZ256rm, 0 }, - { X86::VMINPDZ128rr, X86::VMINPDZ128rm, 0 }, - { X86::VMINPDZ256rr, X86::VMINPDZ256rm, 0 }, - { X86::VMINPSZ128rr, X86::VMINPSZ128rm, 0 }, - { X86::VMINPSZ256rr, X86::VMINPSZ256rm, 0 }, - { X86::VMULPDZ128rr, X86::VMULPDZ128rm, 0 }, - { X86::VMULPDZ256rr, X86::VMULPDZ256rm, 0 }, - { X86::VMULPSZ128rr, X86::VMULPSZ128rm, 0 }, - { X86::VMULPSZ256rr, X86::VMULPSZ256rm, 0 }, - { X86::VORPDZ128rr, X86::VORPDZ128rm, 0 }, - { X86::VORPDZ256rr, X86::VORPDZ256rm, 0 }, - { X86::VORPSZ128rr, 
X86::VORPSZ128rm, 0 }, - { X86::VORPSZ256rr, X86::VORPSZ256rm, 0 }, - { X86::VPACKSSDWZ256rr, X86::VPACKSSDWZ256rm, 0 }, - { X86::VPACKSSDWZ128rr, X86::VPACKSSDWZ128rm, 0 }, - { X86::VPACKSSWBZ256rr, X86::VPACKSSWBZ256rm, 0 }, - { X86::VPACKSSWBZ128rr, X86::VPACKSSWBZ128rm, 0 }, - { X86::VPACKUSDWZ256rr, X86::VPACKUSDWZ256rm, 0 }, - { X86::VPACKUSDWZ128rr, X86::VPACKUSDWZ128rm, 0 }, - { X86::VPACKUSWBZ256rr, X86::VPACKUSWBZ256rm, 0 }, - { X86::VPACKUSWBZ128rr, X86::VPACKUSWBZ128rm, 0 }, - { X86::VPADDBZ128rr, X86::VPADDBZ128rm, 0 }, - { X86::VPADDBZ256rr, X86::VPADDBZ256rm, 0 }, - { X86::VPADDDZ128rr, X86::VPADDDZ128rm, 0 }, - { X86::VPADDDZ256rr, X86::VPADDDZ256rm, 0 }, - { X86::VPADDQZ128rr, X86::VPADDQZ128rm, 0 }, - { X86::VPADDQZ256rr, X86::VPADDQZ256rm, 0 }, - { X86::VPADDSBZ128rr, X86::VPADDSBZ128rm, 0 }, - { X86::VPADDSBZ256rr, X86::VPADDSBZ256rm, 0 }, - { X86::VPADDSWZ128rr, X86::VPADDSWZ128rm, 0 }, - { X86::VPADDSWZ256rr, X86::VPADDSWZ256rm, 0 }, - { X86::VPADDUSBZ128rr, X86::VPADDUSBZ128rm, 0 }, - { X86::VPADDUSBZ256rr, X86::VPADDUSBZ256rm, 0 }, - { X86::VPADDUSWZ128rr, X86::VPADDUSWZ128rm, 0 }, - { X86::VPADDUSWZ256rr, X86::VPADDUSWZ256rm, 0 }, - { X86::VPADDWZ128rr, X86::VPADDWZ128rm, 0 }, - { X86::VPADDWZ256rr, X86::VPADDWZ256rm, 0 }, - { X86::VPALIGNRZ128rri, X86::VPALIGNRZ128rmi, 0 }, - { X86::VPALIGNRZ256rri, X86::VPALIGNRZ256rmi, 0 }, - { X86::VPANDDZ128rr, X86::VPANDDZ128rm, 0 }, - { X86::VPANDDZ256rr, X86::VPANDDZ256rm, 0 }, - { X86::VPANDNDZ128rr, X86::VPANDNDZ128rm, 0 }, - { X86::VPANDNDZ256rr, X86::VPANDNDZ256rm, 0 }, - { X86::VPANDNQZ128rr, X86::VPANDNQZ128rm, 0 }, - { X86::VPANDNQZ256rr, X86::VPANDNQZ256rm, 0 }, - { X86::VPANDQZ128rr, X86::VPANDQZ128rm, 0 }, - { X86::VPANDQZ256rr, X86::VPANDQZ256rm, 0 }, - { X86::VPAVGBZ128rr, X86::VPAVGBZ128rm, 0 }, - { X86::VPAVGBZ256rr, X86::VPAVGBZ256rm, 0 }, - { X86::VPAVGWZ128rr, X86::VPAVGWZ128rm, 0 }, - { X86::VPAVGWZ256rr, X86::VPAVGWZ256rm, 0 }, - { X86::VPCMPBZ128rri, X86::VPCMPBZ128rmi, 0 }, - { X86::VPCMPBZ256rri, X86::VPCMPBZ256rmi, 0 }, - { X86::VPCMPDZ128rri, X86::VPCMPDZ128rmi, 0 }, - { X86::VPCMPDZ256rri, X86::VPCMPDZ256rmi, 0 }, - { X86::VPCMPEQBZ128rr, X86::VPCMPEQBZ128rm, 0 }, - { X86::VPCMPEQBZ256rr, X86::VPCMPEQBZ256rm, 0 }, - { X86::VPCMPEQDZ128rr, X86::VPCMPEQDZ128rm, 0 }, - { X86::VPCMPEQDZ256rr, X86::VPCMPEQDZ256rm, 0 }, - { X86::VPCMPEQQZ128rr, X86::VPCMPEQQZ128rm, 0 }, - { X86::VPCMPEQQZ256rr, X86::VPCMPEQQZ256rm, 0 }, - { X86::VPCMPEQWZ128rr, X86::VPCMPEQWZ128rm, 0 }, - { X86::VPCMPEQWZ256rr, X86::VPCMPEQWZ256rm, 0 }, - { X86::VPCMPGTBZ128rr, X86::VPCMPGTBZ128rm, 0 }, - { X86::VPCMPGTBZ256rr, X86::VPCMPGTBZ256rm, 0 }, - { X86::VPCMPGTDZ128rr, X86::VPCMPGTDZ128rm, 0 }, - { X86::VPCMPGTDZ256rr, X86::VPCMPGTDZ256rm, 0 }, - { X86::VPCMPGTQZ128rr, X86::VPCMPGTQZ128rm, 0 }, - { X86::VPCMPGTQZ256rr, X86::VPCMPGTQZ256rm, 0 }, - { X86::VPCMPGTWZ128rr, X86::VPCMPGTWZ128rm, 0 }, - { X86::VPCMPGTWZ256rr, X86::VPCMPGTWZ256rm, 0 }, - { X86::VPCMPQZ128rri, X86::VPCMPQZ128rmi, 0 }, - { X86::VPCMPQZ256rri, X86::VPCMPQZ256rmi, 0 }, - { X86::VPCMPUBZ128rri, X86::VPCMPUBZ128rmi, 0 }, - { X86::VPCMPUBZ256rri, X86::VPCMPUBZ256rmi, 0 }, - { X86::VPCMPUDZ128rri, X86::VPCMPUDZ128rmi, 0 }, - { X86::VPCMPUDZ256rri, X86::VPCMPUDZ256rmi, 0 }, - { X86::VPCMPUQZ128rri, X86::VPCMPUQZ128rmi, 0 }, - { X86::VPCMPUQZ256rri, X86::VPCMPUQZ256rmi, 0 }, - { X86::VPCMPUWZ128rri, X86::VPCMPUWZ128rmi, 0 }, - { X86::VPCMPUWZ256rri, X86::VPCMPUWZ256rmi, 0 }, - { X86::VPCMPWZ128rri, X86::VPCMPWZ128rmi, 0 }, - { X86::VPCMPWZ256rri, 
X86::VPCMPWZ256rmi, 0 }, - { X86::VPERMBZ128rr, X86::VPERMBZ128rm, 0 }, - { X86::VPERMBZ256rr, X86::VPERMBZ256rm, 0 }, - { X86::VPERMDZ256rr, X86::VPERMDZ256rm, 0 }, - { X86::VPERMILPDZ128rr, X86::VPERMILPDZ128rm, 0 }, - { X86::VPERMILPDZ256rr, X86::VPERMILPDZ256rm, 0 }, - { X86::VPERMILPSZ128rr, X86::VPERMILPSZ128rm, 0 }, - { X86::VPERMILPSZ256rr, X86::VPERMILPSZ256rm, 0 }, - { X86::VPERMPDZ256rr, X86::VPERMPDZ256rm, 0 }, - { X86::VPERMPSZ256rr, X86::VPERMPSZ256rm, 0 }, - { X86::VPERMQZ256rr, X86::VPERMQZ256rm, 0 }, - { X86::VPERMWZ128rr, X86::VPERMWZ128rm, 0 }, - { X86::VPERMWZ256rr, X86::VPERMWZ256rm, 0 }, - { X86::VPMADDUBSWZ128rr, X86::VPMADDUBSWZ128rm, 0 }, - { X86::VPMADDUBSWZ256rr, X86::VPMADDUBSWZ256rm, 0 }, - { X86::VPMADDWDZ128rr, X86::VPMADDWDZ128rm, 0 }, - { X86::VPMADDWDZ256rr, X86::VPMADDWDZ256rm, 0 }, - { X86::VPMAXSBZ128rr, X86::VPMAXSBZ128rm, 0 }, - { X86::VPMAXSBZ256rr, X86::VPMAXSBZ256rm, 0 }, - { X86::VPMAXSDZ128rr, X86::VPMAXSDZ128rm, 0 }, - { X86::VPMAXSDZ256rr, X86::VPMAXSDZ256rm, 0 }, - { X86::VPMAXSQZ128rr, X86::VPMAXSQZ128rm, 0 }, - { X86::VPMAXSQZ256rr, X86::VPMAXSQZ256rm, 0 }, - { X86::VPMAXSWZ128rr, X86::VPMAXSWZ128rm, 0 }, - { X86::VPMAXSWZ256rr, X86::VPMAXSWZ256rm, 0 }, - { X86::VPMAXUBZ128rr, X86::VPMAXUBZ128rm, 0 }, - { X86::VPMAXUBZ256rr, X86::VPMAXUBZ256rm, 0 }, - { X86::VPMAXUDZ128rr, X86::VPMAXUDZ128rm, 0 }, - { X86::VPMAXUDZ256rr, X86::VPMAXUDZ256rm, 0 }, - { X86::VPMAXUQZ128rr, X86::VPMAXUQZ128rm, 0 }, - { X86::VPMAXUQZ256rr, X86::VPMAXUQZ256rm, 0 }, - { X86::VPMAXUWZ128rr, X86::VPMAXUWZ128rm, 0 }, - { X86::VPMAXUWZ256rr, X86::VPMAXUWZ256rm, 0 }, - { X86::VPMINSBZ128rr, X86::VPMINSBZ128rm, 0 }, - { X86::VPMINSBZ256rr, X86::VPMINSBZ256rm, 0 }, - { X86::VPMINSDZ128rr, X86::VPMINSDZ128rm, 0 }, - { X86::VPMINSDZ256rr, X86::VPMINSDZ256rm, 0 }, - { X86::VPMINSQZ128rr, X86::VPMINSQZ128rm, 0 }, - { X86::VPMINSQZ256rr, X86::VPMINSQZ256rm, 0 }, - { X86::VPMINSWZ128rr, X86::VPMINSWZ128rm, 0 }, - { X86::VPMINSWZ256rr, X86::VPMINSWZ256rm, 0 }, - { X86::VPMINUBZ128rr, X86::VPMINUBZ128rm, 0 }, - { X86::VPMINUBZ256rr, X86::VPMINUBZ256rm, 0 }, - { X86::VPMINUDZ128rr, X86::VPMINUDZ128rm, 0 }, - { X86::VPMINUDZ256rr, X86::VPMINUDZ256rm, 0 }, - { X86::VPMINUQZ128rr, X86::VPMINUQZ128rm, 0 }, - { X86::VPMINUQZ256rr, X86::VPMINUQZ256rm, 0 }, - { X86::VPMINUWZ128rr, X86::VPMINUWZ128rm, 0 }, - { X86::VPMINUWZ256rr, X86::VPMINUWZ256rm, 0 }, - { X86::VPMULDQZ128rr, X86::VPMULDQZ128rm, 0 }, - { X86::VPMULDQZ256rr, X86::VPMULDQZ256rm, 0 }, - { X86::VPMULLDZ128rr, X86::VPMULLDZ128rm, 0 }, - { X86::VPMULLDZ256rr, X86::VPMULLDZ256rm, 0 }, - { X86::VPMULLQZ128rr, X86::VPMULLQZ128rm, 0 }, - { X86::VPMULLQZ256rr, X86::VPMULLQZ256rm, 0 }, - { X86::VPMULLWZ128rr, X86::VPMULLWZ128rm, 0 }, - { X86::VPMULLWZ256rr, X86::VPMULLWZ256rm, 0 }, - { X86::VPMULUDQZ128rr, X86::VPMULUDQZ128rm, 0 }, - { X86::VPMULUDQZ256rr, X86::VPMULUDQZ256rm, 0 }, - { X86::VPORDZ128rr, X86::VPORDZ128rm, 0 }, - { X86::VPORDZ256rr, X86::VPORDZ256rm, 0 }, - { X86::VPORQZ128rr, X86::VPORQZ128rm, 0 }, - { X86::VPORQZ256rr, X86::VPORQZ256rm, 0 }, - { X86::VPSADBWZ128rr, X86::VPSADBWZ128rm, 0 }, - { X86::VPSADBWZ256rr, X86::VPSADBWZ256rm, 0 }, - { X86::VPSHUFBZ128rr, X86::VPSHUFBZ128rm, 0 }, - { X86::VPSHUFBZ256rr, X86::VPSHUFBZ256rm, 0 }, - { X86::VPSLLDZ128rr, X86::VPSLLDZ128rm, 0 }, - { X86::VPSLLDZ256rr, X86::VPSLLDZ256rm, 0 }, - { X86::VPSLLQZ128rr, X86::VPSLLQZ128rm, 0 }, - { X86::VPSLLQZ256rr, X86::VPSLLQZ256rm, 0 }, - { X86::VPSLLVDZ128rr, X86::VPSLLVDZ128rm, 0 }, - { X86::VPSLLVDZ256rr, X86::VPSLLVDZ256rm, 0 }, - 
{ X86::VPSLLVQZ128rr, X86::VPSLLVQZ128rm, 0 }, - { X86::VPSLLVQZ256rr, X86::VPSLLVQZ256rm, 0 }, - { X86::VPSLLVWZ128rr, X86::VPSLLVWZ128rm, 0 }, - { X86::VPSLLVWZ256rr, X86::VPSLLVWZ256rm, 0 }, - { X86::VPSLLWZ128rr, X86::VPSLLWZ128rm, 0 }, - { X86::VPSLLWZ256rr, X86::VPSLLWZ256rm, 0 }, - { X86::VPSRADZ128rr, X86::VPSRADZ128rm, 0 }, - { X86::VPSRADZ256rr, X86::VPSRADZ256rm, 0 }, - { X86::VPSRAQZ128rr, X86::VPSRAQZ128rm, 0 }, - { X86::VPSRAQZ256rr, X86::VPSRAQZ256rm, 0 }, - { X86::VPSRAVDZ128rr, X86::VPSRAVDZ128rm, 0 }, - { X86::VPSRAVDZ256rr, X86::VPSRAVDZ256rm, 0 }, - { X86::VPSRAVQZ128rr, X86::VPSRAVQZ128rm, 0 }, - { X86::VPSRAVQZ256rr, X86::VPSRAVQZ256rm, 0 }, - { X86::VPSRAVWZ128rr, X86::VPSRAVWZ128rm, 0 }, - { X86::VPSRAVWZ256rr, X86::VPSRAVWZ256rm, 0 }, - { X86::VPSRAWZ128rr, X86::VPSRAWZ128rm, 0 }, - { X86::VPSRAWZ256rr, X86::VPSRAWZ256rm, 0 }, - { X86::VPSRLDZ128rr, X86::VPSRLDZ128rm, 0 }, - { X86::VPSRLDZ256rr, X86::VPSRLDZ256rm, 0 }, - { X86::VPSRLQZ128rr, X86::VPSRLQZ128rm, 0 }, - { X86::VPSRLQZ256rr, X86::VPSRLQZ256rm, 0 }, - { X86::VPSRLVDZ128rr, X86::VPSRLVDZ128rm, 0 }, - { X86::VPSRLVDZ256rr, X86::VPSRLVDZ256rm, 0 }, - { X86::VPSRLVQZ128rr, X86::VPSRLVQZ128rm, 0 }, - { X86::VPSRLVQZ256rr, X86::VPSRLVQZ256rm, 0 }, - { X86::VPSRLVWZ128rr, X86::VPSRLVWZ128rm, 0 }, - { X86::VPSRLVWZ256rr, X86::VPSRLVWZ256rm, 0 }, - { X86::VPSRLWZ128rr, X86::VPSRLWZ128rm, 0 }, - { X86::VPSRLWZ256rr, X86::VPSRLWZ256rm, 0 }, - { X86::VPSUBBZ128rr, X86::VPSUBBZ128rm, 0 }, - { X86::VPSUBBZ256rr, X86::VPSUBBZ256rm, 0 }, - { X86::VPSUBDZ128rr, X86::VPSUBDZ128rm, 0 }, - { X86::VPSUBDZ256rr, X86::VPSUBDZ256rm, 0 }, - { X86::VPSUBQZ128rr, X86::VPSUBQZ128rm, 0 }, - { X86::VPSUBQZ256rr, X86::VPSUBQZ256rm, 0 }, - { X86::VPSUBSBZ128rr, X86::VPSUBSBZ128rm, 0 }, - { X86::VPSUBSBZ256rr, X86::VPSUBSBZ256rm, 0 }, - { X86::VPSUBSWZ128rr, X86::VPSUBSWZ128rm, 0 }, - { X86::VPSUBSWZ256rr, X86::VPSUBSWZ256rm, 0 }, - { X86::VPSUBUSBZ128rr, X86::VPSUBUSBZ128rm, 0 }, - { X86::VPSUBUSBZ256rr, X86::VPSUBUSBZ256rm, 0 }, - { X86::VPSUBUSWZ128rr, X86::VPSUBUSWZ128rm, 0 }, - { X86::VPSUBUSWZ256rr, X86::VPSUBUSWZ256rm, 0 }, - { X86::VPSUBWZ128rr, X86::VPSUBWZ128rm, 0 }, - { X86::VPSUBWZ256rr, X86::VPSUBWZ256rm, 0 }, - { X86::VPUNPCKHBWZ128rr, X86::VPUNPCKHBWZ128rm, 0 }, - { X86::VPUNPCKHBWZ256rr, X86::VPUNPCKHBWZ256rm, 0 }, - { X86::VPUNPCKHDQZ128rr, X86::VPUNPCKHDQZ128rm, 0 }, - { X86::VPUNPCKHDQZ256rr, X86::VPUNPCKHDQZ256rm, 0 }, - { X86::VPUNPCKHQDQZ128rr, X86::VPUNPCKHQDQZ128rm, 0 }, - { X86::VPUNPCKHQDQZ256rr, X86::VPUNPCKHQDQZ256rm, 0 }, - { X86::VPUNPCKHWDZ128rr, X86::VPUNPCKHWDZ128rm, 0 }, - { X86::VPUNPCKHWDZ256rr, X86::VPUNPCKHWDZ256rm, 0 }, - { X86::VPUNPCKLBWZ128rr, X86::VPUNPCKLBWZ128rm, 0 }, - { X86::VPUNPCKLBWZ256rr, X86::VPUNPCKLBWZ256rm, 0 }, - { X86::VPUNPCKLDQZ128rr, X86::VPUNPCKLDQZ128rm, 0 }, - { X86::VPUNPCKLDQZ256rr, X86::VPUNPCKLDQZ256rm, 0 }, - { X86::VPUNPCKLQDQZ128rr, X86::VPUNPCKLQDQZ128rm, 0 }, - { X86::VPUNPCKLQDQZ256rr, X86::VPUNPCKLQDQZ256rm, 0 }, - { X86::VPUNPCKLWDZ128rr, X86::VPUNPCKLWDZ128rm, 0 }, - { X86::VPUNPCKLWDZ256rr, X86::VPUNPCKLWDZ256rm, 0 }, - { X86::VPXORDZ128rr, X86::VPXORDZ128rm, 0 }, - { X86::VPXORDZ256rr, X86::VPXORDZ256rm, 0 }, - { X86::VPXORQZ128rr, X86::VPXORQZ128rm, 0 }, - { X86::VPXORQZ256rr, X86::VPXORQZ256rm, 0 }, - { X86::VSHUFPDZ128rri, X86::VSHUFPDZ128rmi, 0 }, - { X86::VSHUFPDZ256rri, X86::VSHUFPDZ256rmi, 0 }, - { X86::VSHUFPSZ128rri, X86::VSHUFPSZ128rmi, 0 }, - { X86::VSHUFPSZ256rri, X86::VSHUFPSZ256rmi, 0 }, - { X86::VSUBPDZ128rr, X86::VSUBPDZ128rm, 0 }, - { 
X86::VSUBPDZ256rr, X86::VSUBPDZ256rm, 0 }, - { X86::VSUBPSZ128rr, X86::VSUBPSZ128rm, 0 }, - { X86::VSUBPSZ256rr, X86::VSUBPSZ256rm, 0 }, - { X86::VUNPCKHPDZ128rr, X86::VUNPCKHPDZ128rm, 0 }, - { X86::VUNPCKHPDZ256rr, X86::VUNPCKHPDZ256rm, 0 }, - { X86::VUNPCKHPSZ128rr, X86::VUNPCKHPSZ128rm, 0 }, - { X86::VUNPCKHPSZ256rr, X86::VUNPCKHPSZ256rm, 0 }, - { X86::VUNPCKLPDZ128rr, X86::VUNPCKLPDZ128rm, 0 }, - { X86::VUNPCKLPDZ256rr, X86::VUNPCKLPDZ256rm, 0 }, - { X86::VUNPCKLPSZ128rr, X86::VUNPCKLPSZ128rm, 0 }, - { X86::VUNPCKLPSZ256rr, X86::VUNPCKLPSZ256rm, 0 }, - { X86::VXORPDZ128rr, X86::VXORPDZ128rm, 0 }, - { X86::VXORPDZ256rr, X86::VXORPDZ256rm, 0 }, - { X86::VXORPSZ128rr, X86::VXORPSZ128rm, 0 }, - { X86::VXORPSZ256rr, X86::VXORPSZ256rm, 0 }, - - // AVX-512 masked foldable instructions - { X86::VBROADCASTSSZrkz, X86::VBROADCASTSSZmkz, TB_NO_REVERSE }, - { X86::VBROADCASTSDZrkz, X86::VBROADCASTSDZmkz, TB_NO_REVERSE }, - { X86::VPABSBZrrkz, X86::VPABSBZrmkz, 0 }, - { X86::VPABSDZrrkz, X86::VPABSDZrmkz, 0 }, - { X86::VPABSQZrrkz, X86::VPABSQZrmkz, 0 }, - { X86::VPABSWZrrkz, X86::VPABSWZrmkz, 0 }, - { X86::VPERMILPDZrikz, X86::VPERMILPDZmikz, 0 }, - { X86::VPERMILPSZrikz, X86::VPERMILPSZmikz, 0 }, - { X86::VPERMPDZrikz, X86::VPERMPDZmikz, 0 }, - { X86::VPERMQZrikz, X86::VPERMQZmikz, 0 }, - { X86::VPMOVSXBDZrrkz, X86::VPMOVSXBDZrmkz, 0 }, - { X86::VPMOVSXBQZrrkz, X86::VPMOVSXBQZrmkz, TB_NO_REVERSE }, - { X86::VPMOVSXBWZrrkz, X86::VPMOVSXBWZrmkz, 0 }, - { X86::VPMOVSXDQZrrkz, X86::VPMOVSXDQZrmkz, 0 }, - { X86::VPMOVSXWDZrrkz, X86::VPMOVSXWDZrmkz, 0 }, - { X86::VPMOVSXWQZrrkz, X86::VPMOVSXWQZrmkz, 0 }, - { X86::VPMOVZXBDZrrkz, X86::VPMOVZXBDZrmkz, 0 }, - { X86::VPMOVZXBQZrrkz, X86::VPMOVZXBQZrmkz, TB_NO_REVERSE }, - { X86::VPMOVZXBWZrrkz, X86::VPMOVZXBWZrmkz, 0 }, - { X86::VPMOVZXDQZrrkz, X86::VPMOVZXDQZrmkz, 0 }, - { X86::VPMOVZXWDZrrkz, X86::VPMOVZXWDZrmkz, 0 }, - { X86::VPMOVZXWQZrrkz, X86::VPMOVZXWQZrmkz, 0 }, - { X86::VPSHUFDZrikz, X86::VPSHUFDZmikz, 0 }, - { X86::VPSHUFHWZrikz, X86::VPSHUFHWZmikz, 0 }, - { X86::VPSHUFLWZrikz, X86::VPSHUFLWZmikz, 0 }, - { X86::VPSLLDZrikz, X86::VPSLLDZmikz, 0 }, - { X86::VPSLLQZrikz, X86::VPSLLQZmikz, 0 }, - { X86::VPSLLWZrikz, X86::VPSLLWZmikz, 0 }, - { X86::VPSRADZrikz, X86::VPSRADZmikz, 0 }, - { X86::VPSRAQZrikz, X86::VPSRAQZmikz, 0 }, - { X86::VPSRAWZrikz, X86::VPSRAWZmikz, 0 }, - { X86::VPSRLDZrikz, X86::VPSRLDZmikz, 0 }, - { X86::VPSRLQZrikz, X86::VPSRLQZmikz, 0 }, - { X86::VPSRLWZrikz, X86::VPSRLWZmikz, 0 }, - - // AVX-512VL 256-bit masked foldable instructions - { X86::VBROADCASTSDZ256rkz, X86::VBROADCASTSDZ256mkz, TB_NO_REVERSE }, - { X86::VBROADCASTSSZ256rkz, X86::VBROADCASTSSZ256mkz, TB_NO_REVERSE }, - { X86::VPABSBZ256rrkz, X86::VPABSBZ256rmkz, 0 }, - { X86::VPABSDZ256rrkz, X86::VPABSDZ256rmkz, 0 }, - { X86::VPABSQZ256rrkz, X86::VPABSQZ256rmkz, 0 }, - { X86::VPABSWZ256rrkz, X86::VPABSWZ256rmkz, 0 }, - { X86::VPERMILPDZ256rikz, X86::VPERMILPDZ256mikz, 0 }, - { X86::VPERMILPSZ256rikz, X86::VPERMILPSZ256mikz, 0 }, - { X86::VPERMPDZ256rikz, X86::VPERMPDZ256mikz, 0 }, - { X86::VPERMQZ256rikz, X86::VPERMQZ256mikz, 0 }, - { X86::VPMOVSXBDZ256rrkz, X86::VPMOVSXBDZ256rmkz, TB_NO_REVERSE }, - { X86::VPMOVSXBQZ256rrkz, X86::VPMOVSXBQZ256rmkz, TB_NO_REVERSE }, - { X86::VPMOVSXBWZ256rrkz, X86::VPMOVSXBWZ256rmkz, 0 }, - { X86::VPMOVSXDQZ256rrkz, X86::VPMOVSXDQZ256rmkz, 0 }, - { X86::VPMOVSXWDZ256rrkz, X86::VPMOVSXWDZ256rmkz, 0 }, - { X86::VPMOVSXWQZ256rrkz, X86::VPMOVSXWQZ256rmkz, TB_NO_REVERSE }, - { X86::VPMOVZXBDZ256rrkz, X86::VPMOVZXBDZ256rmkz, TB_NO_REVERSE 
}, - { X86::VPMOVZXBQZ256rrkz, X86::VPMOVZXBQZ256rmkz, TB_NO_REVERSE }, - { X86::VPMOVZXBWZ256rrkz, X86::VPMOVZXBWZ256rmkz, 0 }, - { X86::VPMOVZXDQZ256rrkz, X86::VPMOVZXDQZ256rmkz, 0 }, - { X86::VPMOVZXWDZ256rrkz, X86::VPMOVZXWDZ256rmkz, 0 }, - { X86::VPMOVZXWQZ256rrkz, X86::VPMOVZXWQZ256rmkz, TB_NO_REVERSE }, - { X86::VPSHUFDZ256rikz, X86::VPSHUFDZ256mikz, 0 }, - { X86::VPSHUFHWZ256rikz, X86::VPSHUFHWZ256mikz, 0 }, - { X86::VPSHUFLWZ256rikz, X86::VPSHUFLWZ256mikz, 0 }, - { X86::VPSLLDZ256rikz, X86::VPSLLDZ256mikz, 0 }, - { X86::VPSLLQZ256rikz, X86::VPSLLQZ256mikz, 0 }, - { X86::VPSLLWZ256rikz, X86::VPSLLWZ256mikz, 0 }, - { X86::VPSRADZ256rikz, X86::VPSRADZ256mikz, 0 }, - { X86::VPSRAQZ256rikz, X86::VPSRAQZ256mikz, 0 }, - { X86::VPSRAWZ256rikz, X86::VPSRAWZ256mikz, 0 }, - { X86::VPSRLDZ256rikz, X86::VPSRLDZ256mikz, 0 }, - { X86::VPSRLQZ256rikz, X86::VPSRLQZ256mikz, 0 }, - { X86::VPSRLWZ256rikz, X86::VPSRLWZ256mikz, 0 }, - - // AVX-512VL 128-bit masked foldable instructions - { X86::VBROADCASTSSZ128rkz, X86::VBROADCASTSSZ128mkz, TB_NO_REVERSE }, - { X86::VPABSBZ128rrkz, X86::VPABSBZ128rmkz, 0 }, - { X86::VPABSDZ128rrkz, X86::VPABSDZ128rmkz, 0 }, - { X86::VPABSQZ128rrkz, X86::VPABSQZ128rmkz, 0 }, - { X86::VPABSWZ128rrkz, X86::VPABSWZ128rmkz, 0 }, - { X86::VPERMILPDZ128rikz, X86::VPERMILPDZ128mikz, 0 }, - { X86::VPERMILPSZ128rikz, X86::VPERMILPSZ128mikz, 0 }, - { X86::VPMOVSXBDZ128rrkz, X86::VPMOVSXBDZ128rmkz, TB_NO_REVERSE }, - { X86::VPMOVSXBQZ128rrkz, X86::VPMOVSXBQZ128rmkz, TB_NO_REVERSE }, - { X86::VPMOVSXBWZ128rrkz, X86::VPMOVSXBWZ128rmkz, TB_NO_REVERSE }, - { X86::VPMOVSXDQZ128rrkz, X86::VPMOVSXDQZ128rmkz, TB_NO_REVERSE }, - { X86::VPMOVSXWDZ128rrkz, X86::VPMOVSXWDZ128rmkz, TB_NO_REVERSE }, - { X86::VPMOVSXWQZ128rrkz, X86::VPMOVSXWQZ128rmkz, TB_NO_REVERSE }, - { X86::VPMOVZXBDZ128rrkz, X86::VPMOVZXBDZ128rmkz, TB_NO_REVERSE }, - { X86::VPMOVZXBQZ128rrkz, X86::VPMOVZXBQZ128rmkz, TB_NO_REVERSE }, - { X86::VPMOVZXBWZ128rrkz, X86::VPMOVZXBWZ128rmkz, TB_NO_REVERSE }, - { X86::VPMOVZXDQZ128rrkz, X86::VPMOVZXDQZ128rmkz, TB_NO_REVERSE }, - { X86::VPMOVZXWDZ128rrkz, X86::VPMOVZXWDZ128rmkz, TB_NO_REVERSE }, - { X86::VPMOVZXWQZ128rrkz, X86::VPMOVZXWQZ128rmkz, TB_NO_REVERSE }, - { X86::VPSHUFDZ128rikz, X86::VPSHUFDZ128mikz, 0 }, - { X86::VPSHUFHWZ128rikz, X86::VPSHUFHWZ128mikz, 0 }, - { X86::VPSHUFLWZ128rikz, X86::VPSHUFLWZ128mikz, 0 }, - { X86::VPSLLDZ128rikz, X86::VPSLLDZ128mikz, 0 }, - { X86::VPSLLQZ128rikz, X86::VPSLLQZ128mikz, 0 }, - { X86::VPSLLWZ128rikz, X86::VPSLLWZ128mikz, 0 }, - { X86::VPSRADZ128rikz, X86::VPSRADZ128mikz, 0 }, - { X86::VPSRAQZ128rikz, X86::VPSRAQZ128mikz, 0 }, - { X86::VPSRAWZ128rikz, X86::VPSRAWZ128mikz, 0 }, - { X86::VPSRLDZ128rikz, X86::VPSRLDZ128mikz, 0 }, - { X86::VPSRLQZ128rikz, X86::VPSRLQZ128mikz, 0 }, - { X86::VPSRLWZ128rikz, X86::VPSRLWZ128mikz, 0 }, - - // AES foldable instructions - { X86::AESDECLASTrr, X86::AESDECLASTrm, TB_ALIGN_16 }, - { X86::AESDECrr, X86::AESDECrm, TB_ALIGN_16 }, - { X86::AESENCLASTrr, X86::AESENCLASTrm, TB_ALIGN_16 }, - { X86::AESENCrr, X86::AESENCrm, TB_ALIGN_16 }, - { X86::VAESDECLASTrr, X86::VAESDECLASTrm, 0 }, - { X86::VAESDECrr, X86::VAESDECrm, 0 }, - { X86::VAESENCLASTrr, X86::VAESENCLASTrm, 0 }, - { X86::VAESENCrr, X86::VAESENCrm, 0 }, - - // SHA foldable instructions - { X86::SHA1MSG1rr, X86::SHA1MSG1rm, TB_ALIGN_16 }, - { X86::SHA1MSG2rr, X86::SHA1MSG2rm, TB_ALIGN_16 }, - { X86::SHA1NEXTErr, X86::SHA1NEXTErm, TB_ALIGN_16 }, - { X86::SHA1RNDS4rri, X86::SHA1RNDS4rmi, TB_ALIGN_16 }, - { X86::SHA256MSG1rr, X86::SHA256MSG1rm, 
TB_ALIGN_16 }, - { X86::SHA256MSG2rr, X86::SHA256MSG2rm, TB_ALIGN_16 }, - { X86::SHA256RNDS2rr, X86::SHA256RNDS2rm, TB_ALIGN_16 } - }; - for (X86MemoryFoldTableEntry Entry : MemoryFoldTable2) { AddTableEntry(RegOp2MemOpTable2, MemOp2RegOpTable, Entry.RegOp, Entry.MemOp, @@ -2435,1103 +150,12 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) Entry.Flags | TB_INDEX_2 | TB_FOLDED_LOAD); } - static const X86MemoryFoldTableEntry MemoryFoldTable3[] = { - // FMA4 foldable patterns - { X86::VFMADDSS4rr, X86::VFMADDSS4rm, TB_ALIGN_NONE }, - { X86::VFMADDSS4rr_Int, X86::VFMADDSS4rm_Int, TB_NO_REVERSE }, - { X86::VFMADDSD4rr, X86::VFMADDSD4rm, TB_ALIGN_NONE }, - { X86::VFMADDSD4rr_Int, X86::VFMADDSD4rm_Int, TB_NO_REVERSE }, - { X86::VFMADDPS4rr, X86::VFMADDPS4rm, TB_ALIGN_NONE }, - { X86::VFMADDPD4rr, X86::VFMADDPD4rm, TB_ALIGN_NONE }, - { X86::VFMADDPS4Yrr, X86::VFMADDPS4Yrm, TB_ALIGN_NONE }, - { X86::VFMADDPD4Yrr, X86::VFMADDPD4Yrm, TB_ALIGN_NONE }, - { X86::VFNMADDSS4rr, X86::VFNMADDSS4rm, TB_ALIGN_NONE }, - { X86::VFNMADDSS4rr_Int, X86::VFNMADDSS4rm_Int, TB_NO_REVERSE }, - { X86::VFNMADDSD4rr, X86::VFNMADDSD4rm, TB_ALIGN_NONE }, - { X86::VFNMADDSD4rr_Int, X86::VFNMADDSD4rm_Int, TB_NO_REVERSE }, - { X86::VFNMADDPS4rr, X86::VFNMADDPS4rm, TB_ALIGN_NONE }, - { X86::VFNMADDPD4rr, X86::VFNMADDPD4rm, TB_ALIGN_NONE }, - { X86::VFNMADDPS4Yrr, X86::VFNMADDPS4Yrm, TB_ALIGN_NONE }, - { X86::VFNMADDPD4Yrr, X86::VFNMADDPD4Yrm, TB_ALIGN_NONE }, - { X86::VFMSUBSS4rr, X86::VFMSUBSS4rm, TB_ALIGN_NONE }, - { X86::VFMSUBSS4rr_Int, X86::VFMSUBSS4rm_Int, TB_NO_REVERSE }, - { X86::VFMSUBSD4rr, X86::VFMSUBSD4rm, TB_ALIGN_NONE }, - { X86::VFMSUBSD4rr_Int, X86::VFMSUBSD4rm_Int, TB_NO_REVERSE }, - { X86::VFMSUBPS4rr, X86::VFMSUBPS4rm, TB_ALIGN_NONE }, - { X86::VFMSUBPD4rr, X86::VFMSUBPD4rm, TB_ALIGN_NONE }, - { X86::VFMSUBPS4Yrr, X86::VFMSUBPS4Yrm, TB_ALIGN_NONE }, - { X86::VFMSUBPD4Yrr, X86::VFMSUBPD4Yrm, TB_ALIGN_NONE }, - { X86::VFNMSUBSS4rr, X86::VFNMSUBSS4rm, TB_ALIGN_NONE }, - { X86::VFNMSUBSS4rr_Int, X86::VFNMSUBSS4rm_Int, TB_NO_REVERSE }, - { X86::VFNMSUBSD4rr, X86::VFNMSUBSD4rm, TB_ALIGN_NONE }, - { X86::VFNMSUBSD4rr_Int, X86::VFNMSUBSD4rm_Int, TB_NO_REVERSE }, - { X86::VFNMSUBPS4rr, X86::VFNMSUBPS4rm, TB_ALIGN_NONE }, - { X86::VFNMSUBPD4rr, X86::VFNMSUBPD4rm, TB_ALIGN_NONE }, - { X86::VFNMSUBPS4Yrr, X86::VFNMSUBPS4Yrm, TB_ALIGN_NONE }, - { X86::VFNMSUBPD4Yrr, X86::VFNMSUBPD4Yrm, TB_ALIGN_NONE }, - { X86::VFMADDSUBPS4rr, X86::VFMADDSUBPS4rm, TB_ALIGN_NONE }, - { X86::VFMADDSUBPD4rr, X86::VFMADDSUBPD4rm, TB_ALIGN_NONE }, - { X86::VFMADDSUBPS4Yrr, X86::VFMADDSUBPS4Yrm, TB_ALIGN_NONE }, - { X86::VFMADDSUBPD4Yrr, X86::VFMADDSUBPD4Yrm, TB_ALIGN_NONE }, - { X86::VFMSUBADDPS4rr, X86::VFMSUBADDPS4rm, TB_ALIGN_NONE }, - { X86::VFMSUBADDPD4rr, X86::VFMSUBADDPD4rm, TB_ALIGN_NONE }, - { X86::VFMSUBADDPS4Yrr, X86::VFMSUBADDPS4Yrm, TB_ALIGN_NONE }, - { X86::VFMSUBADDPD4Yrr, X86::VFMSUBADDPD4Yrm, TB_ALIGN_NONE }, - - // XOP foldable instructions - { X86::VPCMOVrrr, X86::VPCMOVrrm, 0 }, - { X86::VPCMOVYrrr, X86::VPCMOVYrrm, 0 }, - { X86::VPERMIL2PDrr, X86::VPERMIL2PDrm, 0 }, - { X86::VPERMIL2PDYrr, X86::VPERMIL2PDYrm, 0 }, - { X86::VPERMIL2PSrr, X86::VPERMIL2PSrm, 0 }, - { X86::VPERMIL2PSYrr, X86::VPERMIL2PSYrm, 0 }, - { X86::VPPERMrrr, X86::VPPERMrrm, 0 }, - - // AVX-512 instructions with 3 source operands. 
- { X86::VPERMI2Brr, X86::VPERMI2Brm, 0 }, - { X86::VPERMI2Drr, X86::VPERMI2Drm, 0 }, - { X86::VPERMI2PSrr, X86::VPERMI2PSrm, 0 }, - { X86::VPERMI2PDrr, X86::VPERMI2PDrm, 0 }, - { X86::VPERMI2Qrr, X86::VPERMI2Qrm, 0 }, - { X86::VPERMI2Wrr, X86::VPERMI2Wrm, 0 }, - { X86::VPERMT2Brr, X86::VPERMT2Brm, 0 }, - { X86::VPERMT2Drr, X86::VPERMT2Drm, 0 }, - { X86::VPERMT2PSrr, X86::VPERMT2PSrm, 0 }, - { X86::VPERMT2PDrr, X86::VPERMT2PDrm, 0 }, - { X86::VPERMT2Qrr, X86::VPERMT2Qrm, 0 }, - { X86::VPERMT2Wrr, X86::VPERMT2Wrm, 0 }, - { X86::VPTERNLOGDZrri, X86::VPTERNLOGDZrmi, 0 }, - { X86::VPTERNLOGQZrri, X86::VPTERNLOGQZrmi, 0 }, - - // AVX-512VL 256-bit instructions with 3 source operands. - { X86::VPERMI2B256rr, X86::VPERMI2B256rm, 0 }, - { X86::VPERMI2D256rr, X86::VPERMI2D256rm, 0 }, - { X86::VPERMI2PD256rr, X86::VPERMI2PD256rm, 0 }, - { X86::VPERMI2PS256rr, X86::VPERMI2PS256rm, 0 }, - { X86::VPERMI2Q256rr, X86::VPERMI2Q256rm, 0 }, - { X86::VPERMI2W256rr, X86::VPERMI2W256rm, 0 }, - { X86::VPERMT2B256rr, X86::VPERMT2B256rm, 0 }, - { X86::VPERMT2D256rr, X86::VPERMT2D256rm, 0 }, - { X86::VPERMT2PD256rr, X86::VPERMT2PD256rm, 0 }, - { X86::VPERMT2PS256rr, X86::VPERMT2PS256rm, 0 }, - { X86::VPERMT2Q256rr, X86::VPERMT2Q256rm, 0 }, - { X86::VPERMT2W256rr, X86::VPERMT2W256rm, 0 }, - { X86::VPTERNLOGDZ256rri, X86::VPTERNLOGDZ256rmi, 0 }, - { X86::VPTERNLOGQZ256rri, X86::VPTERNLOGQZ256rmi, 0 }, - - // AVX-512VL 128-bit instructions with 3 source operands. - { X86::VPERMI2B128rr, X86::VPERMI2B128rm, 0 }, - { X86::VPERMI2D128rr, X86::VPERMI2D128rm, 0 }, - { X86::VPERMI2PD128rr, X86::VPERMI2PD128rm, 0 }, - { X86::VPERMI2PS128rr, X86::VPERMI2PS128rm, 0 }, - { X86::VPERMI2Q128rr, X86::VPERMI2Q128rm, 0 }, - { X86::VPERMI2W128rr, X86::VPERMI2W128rm, 0 }, - { X86::VPERMT2B128rr, X86::VPERMT2B128rm, 0 }, - { X86::VPERMT2D128rr, X86::VPERMT2D128rm, 0 }, - { X86::VPERMT2PD128rr, X86::VPERMT2PD128rm, 0 }, - { X86::VPERMT2PS128rr, X86::VPERMT2PS128rm, 0 }, - { X86::VPERMT2Q128rr, X86::VPERMT2Q128rm, 0 }, - { X86::VPERMT2W128rr, X86::VPERMT2W128rm, 0 }, - { X86::VPTERNLOGDZ128rri, X86::VPTERNLOGDZ128rmi, 0 }, - { X86::VPTERNLOGQZ128rri, X86::VPTERNLOGQZ128rmi, 0 }, - - // AVX-512 masked instructions - { X86::VADDPDZrrkz, X86::VADDPDZrmkz, 0 }, - { X86::VADDPSZrrkz, X86::VADDPSZrmkz, 0 }, - { X86::VADDSDZrr_Intkz, X86::VADDSDZrm_Intkz, TB_NO_REVERSE }, - { X86::VADDSSZrr_Intkz, X86::VADDSSZrm_Intkz, TB_NO_REVERSE }, - { X86::VALIGNDZrrikz, X86::VALIGNDZrmikz, 0 }, - { X86::VALIGNQZrrikz, X86::VALIGNQZrmikz, 0 }, - { X86::VANDNPDZrrkz, X86::VANDNPDZrmkz, 0 }, - { X86::VANDNPSZrrkz, X86::VANDNPSZrmkz, 0 }, - { X86::VANDPDZrrkz, X86::VANDPDZrmkz, 0 }, - { X86::VANDPSZrrkz, X86::VANDPSZrmkz, 0 }, - { X86::VDIVPDZrrkz, X86::VDIVPDZrmkz, 0 }, - { X86::VDIVPSZrrkz, X86::VDIVPSZrmkz, 0 }, - { X86::VDIVSDZrr_Intkz, X86::VDIVSDZrm_Intkz, TB_NO_REVERSE }, - { X86::VDIVSSZrr_Intkz, X86::VDIVSSZrm_Intkz, TB_NO_REVERSE }, - { X86::VINSERTF32x4Zrrkz, X86::VINSERTF32x4Zrmkz, 0 }, - { X86::VINSERTF32x8Zrrkz, X86::VINSERTF32x8Zrmkz, 0 }, - { X86::VINSERTF64x2Zrrkz, X86::VINSERTF64x2Zrmkz, 0 }, - { X86::VINSERTF64x4Zrrkz, X86::VINSERTF64x4Zrmkz, 0 }, - { X86::VINSERTI32x4Zrrkz, X86::VINSERTI32x4Zrmkz, 0 }, - { X86::VINSERTI32x8Zrrkz, X86::VINSERTI32x8Zrmkz, 0 }, - { X86::VINSERTI64x2Zrrkz, X86::VINSERTI64x2Zrmkz, 0 }, - { X86::VINSERTI64x4Zrrkz, X86::VINSERTI64x4Zrmkz, 0 }, - { X86::VMAXCPDZrrkz, X86::VMAXCPDZrmkz, 0 }, - { X86::VMAXCPSZrrkz, X86::VMAXCPSZrmkz, 0 }, - { X86::VMAXPDZrrkz, X86::VMAXPDZrmkz, 0 }, - { X86::VMAXPSZrrkz, 
X86::VMAXPSZrmkz, 0 }, - { X86::VMAXSDZrr_Intkz, X86::VMAXSDZrm_Intkz, 0 }, - { X86::VMAXSSZrr_Intkz, X86::VMAXSSZrm_Intkz, 0 }, - { X86::VMINCPDZrrkz, X86::VMINCPDZrmkz, 0 }, - { X86::VMINCPSZrrkz, X86::VMINCPSZrmkz, 0 }, - { X86::VMINPDZrrkz, X86::VMINPDZrmkz, 0 }, - { X86::VMINPSZrrkz, X86::VMINPSZrmkz, 0 }, - { X86::VMINSDZrr_Intkz, X86::VMINSDZrm_Intkz, 0 }, - { X86::VMINSSZrr_Intkz, X86::VMINSSZrm_Intkz, 0 }, - { X86::VMULPDZrrkz, X86::VMULPDZrmkz, 0 }, - { X86::VMULPSZrrkz, X86::VMULPSZrmkz, 0 }, - { X86::VMULSDZrr_Intkz, X86::VMULSDZrm_Intkz, TB_NO_REVERSE }, - { X86::VMULSSZrr_Intkz, X86::VMULSSZrm_Intkz, TB_NO_REVERSE }, - { X86::VORPDZrrkz, X86::VORPDZrmkz, 0 }, - { X86::VORPSZrrkz, X86::VORPSZrmkz, 0 }, - { X86::VPACKSSDWZrrkz, X86::VPACKSSDWZrmkz, 0 }, - { X86::VPACKSSWBZrrkz, X86::VPACKSSWBZrmkz, 0 }, - { X86::VPACKUSDWZrrkz, X86::VPACKUSDWZrmkz, 0 }, - { X86::VPACKUSWBZrrkz, X86::VPACKUSWBZrmkz, 0 }, - { X86::VPADDBZrrkz, X86::VPADDBZrmkz, 0 }, - { X86::VPADDDZrrkz, X86::VPADDDZrmkz, 0 }, - { X86::VPADDQZrrkz, X86::VPADDQZrmkz, 0 }, - { X86::VPADDSBZrrkz, X86::VPADDSBZrmkz, 0 }, - { X86::VPADDSWZrrkz, X86::VPADDSWZrmkz, 0 }, - { X86::VPADDUSBZrrkz, X86::VPADDUSBZrmkz, 0 }, - { X86::VPADDUSWZrrkz, X86::VPADDUSWZrmkz, 0 }, - { X86::VPADDWZrrkz, X86::VPADDWZrmkz, 0 }, - { X86::VPALIGNRZrrikz, X86::VPALIGNRZrmikz, 0 }, - { X86::VPANDDZrrkz, X86::VPANDDZrmkz, 0 }, - { X86::VPANDNDZrrkz, X86::VPANDNDZrmkz, 0 }, - { X86::VPANDNQZrrkz, X86::VPANDNQZrmkz, 0 }, - { X86::VPANDQZrrkz, X86::VPANDQZrmkz, 0 }, - { X86::VPAVGBZrrkz, X86::VPAVGBZrmkz, 0 }, - { X86::VPAVGWZrrkz, X86::VPAVGWZrmkz, 0 }, - { X86::VPERMBZrrkz, X86::VPERMBZrmkz, 0 }, - { X86::VPERMDZrrkz, X86::VPERMDZrmkz, 0 }, - { X86::VPERMILPDZrrkz, X86::VPERMILPDZrmkz, 0 }, - { X86::VPERMILPSZrrkz, X86::VPERMILPSZrmkz, 0 }, - { X86::VPERMPDZrrkz, X86::VPERMPDZrmkz, 0 }, - { X86::VPERMPSZrrkz, X86::VPERMPSZrmkz, 0 }, - { X86::VPERMQZrrkz, X86::VPERMQZrmkz, 0 }, - { X86::VPERMWZrrkz, X86::VPERMWZrmkz, 0 }, - { X86::VPMADDUBSWZrrkz, X86::VPMADDUBSWZrmkz, 0 }, - { X86::VPMADDWDZrrkz, X86::VPMADDWDZrmkz, 0 }, - { X86::VPMAXSBZrrkz, X86::VPMAXSBZrmkz, 0 }, - { X86::VPMAXSDZrrkz, X86::VPMAXSDZrmkz, 0 }, - { X86::VPMAXSQZrrkz, X86::VPMAXSQZrmkz, 0 }, - { X86::VPMAXSWZrrkz, X86::VPMAXSWZrmkz, 0 }, - { X86::VPMAXUBZrrkz, X86::VPMAXUBZrmkz, 0 }, - { X86::VPMAXUDZrrkz, X86::VPMAXUDZrmkz, 0 }, - { X86::VPMAXUQZrrkz, X86::VPMAXUQZrmkz, 0 }, - { X86::VPMAXUWZrrkz, X86::VPMAXUWZrmkz, 0 }, - { X86::VPMINSBZrrkz, X86::VPMINSBZrmkz, 0 }, - { X86::VPMINSDZrrkz, X86::VPMINSDZrmkz, 0 }, - { X86::VPMINSQZrrkz, X86::VPMINSQZrmkz, 0 }, - { X86::VPMINSWZrrkz, X86::VPMINSWZrmkz, 0 }, - { X86::VPMINUBZrrkz, X86::VPMINUBZrmkz, 0 }, - { X86::VPMINUDZrrkz, X86::VPMINUDZrmkz, 0 }, - { X86::VPMINUQZrrkz, X86::VPMINUQZrmkz, 0 }, - { X86::VPMINUWZrrkz, X86::VPMINUWZrmkz, 0 }, - { X86::VPMULLDZrrkz, X86::VPMULLDZrmkz, 0 }, - { X86::VPMULLQZrrkz, X86::VPMULLQZrmkz, 0 }, - { X86::VPMULLWZrrkz, X86::VPMULLWZrmkz, 0 }, - { X86::VPMULDQZrrkz, X86::VPMULDQZrmkz, 0 }, - { X86::VPMULUDQZrrkz, X86::VPMULUDQZrmkz, 0 }, - { X86::VPORDZrrkz, X86::VPORDZrmkz, 0 }, - { X86::VPORQZrrkz, X86::VPORQZrmkz, 0 }, - { X86::VPSHUFBZrrkz, X86::VPSHUFBZrmkz, 0 }, - { X86::VPSLLDZrrkz, X86::VPSLLDZrmkz, 0 }, - { X86::VPSLLQZrrkz, X86::VPSLLQZrmkz, 0 }, - { X86::VPSLLVDZrrkz, X86::VPSLLVDZrmkz, 0 }, - { X86::VPSLLVQZrrkz, X86::VPSLLVQZrmkz, 0 }, - { X86::VPSLLVWZrrkz, X86::VPSLLVWZrmkz, 0 }, - { X86::VPSLLWZrrkz, X86::VPSLLWZrmkz, 0 }, - { X86::VPSRADZrrkz, X86::VPSRADZrmkz, 0 }, - { 
X86::VPSRAQZrrkz, X86::VPSRAQZrmkz, 0 }, - { X86::VPSRAVDZrrkz, X86::VPSRAVDZrmkz, 0 }, - { X86::VPSRAVQZrrkz, X86::VPSRAVQZrmkz, 0 }, - { X86::VPSRAVWZrrkz, X86::VPSRAVWZrmkz, 0 }, - { X86::VPSRAWZrrkz, X86::VPSRAWZrmkz, 0 }, - { X86::VPSRLDZrrkz, X86::VPSRLDZrmkz, 0 }, - { X86::VPSRLQZrrkz, X86::VPSRLQZrmkz, 0 }, - { X86::VPSRLVDZrrkz, X86::VPSRLVDZrmkz, 0 }, - { X86::VPSRLVQZrrkz, X86::VPSRLVQZrmkz, 0 }, - { X86::VPSRLVWZrrkz, X86::VPSRLVWZrmkz, 0 }, - { X86::VPSRLWZrrkz, X86::VPSRLWZrmkz, 0 }, - { X86::VPSUBBZrrkz, X86::VPSUBBZrmkz, 0 }, - { X86::VPSUBDZrrkz, X86::VPSUBDZrmkz, 0 }, - { X86::VPSUBQZrrkz, X86::VPSUBQZrmkz, 0 }, - { X86::VPSUBSBZrrkz, X86::VPSUBSBZrmkz, 0 }, - { X86::VPSUBSWZrrkz, X86::VPSUBSWZrmkz, 0 }, - { X86::VPSUBUSBZrrkz, X86::VPSUBUSBZrmkz, 0 }, - { X86::VPSUBUSWZrrkz, X86::VPSUBUSWZrmkz, 0 }, - { X86::VPSUBWZrrkz, X86::VPSUBWZrmkz, 0 }, - { X86::VPUNPCKHBWZrrkz, X86::VPUNPCKHBWZrmkz, 0 }, - { X86::VPUNPCKHDQZrrkz, X86::VPUNPCKHDQZrmkz, 0 }, - { X86::VPUNPCKHQDQZrrkz, X86::VPUNPCKHQDQZrmkz, 0 }, - { X86::VPUNPCKHWDZrrkz, X86::VPUNPCKHWDZrmkz, 0 }, - { X86::VPUNPCKLBWZrrkz, X86::VPUNPCKLBWZrmkz, 0 }, - { X86::VPUNPCKLDQZrrkz, X86::VPUNPCKLDQZrmkz, 0 }, - { X86::VPUNPCKLQDQZrrkz, X86::VPUNPCKLQDQZrmkz, 0 }, - { X86::VPUNPCKLWDZrrkz, X86::VPUNPCKLWDZrmkz, 0 }, - { X86::VPXORDZrrkz, X86::VPXORDZrmkz, 0 }, - { X86::VPXORQZrrkz, X86::VPXORQZrmkz, 0 }, - { X86::VSHUFPDZrrikz, X86::VSHUFPDZrmikz, 0 }, - { X86::VSHUFPSZrrikz, X86::VSHUFPSZrmikz, 0 }, - { X86::VSUBPDZrrkz, X86::VSUBPDZrmkz, 0 }, - { X86::VSUBPSZrrkz, X86::VSUBPSZrmkz, 0 }, - { X86::VSUBSDZrr_Intkz, X86::VSUBSDZrm_Intkz, TB_NO_REVERSE }, - { X86::VSUBSSZrr_Intkz, X86::VSUBSSZrm_Intkz, TB_NO_REVERSE }, - { X86::VUNPCKHPDZrrkz, X86::VUNPCKHPDZrmkz, 0 }, - { X86::VUNPCKHPSZrrkz, X86::VUNPCKHPSZrmkz, 0 }, - { X86::VUNPCKLPDZrrkz, X86::VUNPCKLPDZrmkz, 0 }, - { X86::VUNPCKLPSZrrkz, X86::VUNPCKLPSZrmkz, 0 }, - { X86::VXORPDZrrkz, X86::VXORPDZrmkz, 0 }, - { X86::VXORPSZrrkz, X86::VXORPSZrmkz, 0 }, - - // AVX-512{F,VL} masked arithmetic instructions 256-bit - { X86::VADDPDZ256rrkz, X86::VADDPDZ256rmkz, 0 }, - { X86::VADDPSZ256rrkz, X86::VADDPSZ256rmkz, 0 }, - { X86::VALIGNDZ256rrikz, X86::VALIGNDZ256rmikz, 0 }, - { X86::VALIGNQZ256rrikz, X86::VALIGNQZ256rmikz, 0 }, - { X86::VANDNPDZ256rrkz, X86::VANDNPDZ256rmkz, 0 }, - { X86::VANDNPSZ256rrkz, X86::VANDNPSZ256rmkz, 0 }, - { X86::VANDPDZ256rrkz, X86::VANDPDZ256rmkz, 0 }, - { X86::VANDPSZ256rrkz, X86::VANDPSZ256rmkz, 0 }, - { X86::VDIVPDZ256rrkz, X86::VDIVPDZ256rmkz, 0 }, - { X86::VDIVPSZ256rrkz, X86::VDIVPSZ256rmkz, 0 }, - { X86::VINSERTF32x4Z256rrkz, X86::VINSERTF32x4Z256rmkz, 0 }, - { X86::VINSERTF64x2Z256rrkz, X86::VINSERTF64x2Z256rmkz, 0 }, - { X86::VINSERTI32x4Z256rrkz, X86::VINSERTI32x4Z256rmkz, 0 }, - { X86::VINSERTI64x2Z256rrkz, X86::VINSERTI64x2Z256rmkz, 0 }, - { X86::VMAXCPDZ256rrkz, X86::VMAXCPDZ256rmkz, 0 }, - { X86::VMAXCPSZ256rrkz, X86::VMAXCPSZ256rmkz, 0 }, - { X86::VMAXPDZ256rrkz, X86::VMAXPDZ256rmkz, 0 }, - { X86::VMAXPSZ256rrkz, X86::VMAXPSZ256rmkz, 0 }, - { X86::VMINCPDZ256rrkz, X86::VMINCPDZ256rmkz, 0 }, - { X86::VMINCPSZ256rrkz, X86::VMINCPSZ256rmkz, 0 }, - { X86::VMINPDZ256rrkz, X86::VMINPDZ256rmkz, 0 }, - { X86::VMINPSZ256rrkz, X86::VMINPSZ256rmkz, 0 }, - { X86::VMULPDZ256rrkz, X86::VMULPDZ256rmkz, 0 }, - { X86::VMULPSZ256rrkz, X86::VMULPSZ256rmkz, 0 }, - { X86::VORPDZ256rrkz, X86::VORPDZ256rmkz, 0 }, - { X86::VORPSZ256rrkz, X86::VORPSZ256rmkz, 0 }, - { X86::VPACKSSDWZ256rrkz, X86::VPACKSSDWZ256rmkz, 0 }, - { X86::VPACKSSWBZ256rrkz, 
X86::VPACKSSWBZ256rmkz, 0 }, - { X86::VPACKUSDWZ256rrkz, X86::VPACKUSDWZ256rmkz, 0 }, - { X86::VPACKUSWBZ256rrkz, X86::VPACKUSWBZ256rmkz, 0 }, - { X86::VPADDBZ256rrkz, X86::VPADDBZ256rmkz, 0 }, - { X86::VPADDDZ256rrkz, X86::VPADDDZ256rmkz, 0 }, - { X86::VPADDQZ256rrkz, X86::VPADDQZ256rmkz, 0 }, - { X86::VPADDSBZ256rrkz, X86::VPADDSBZ256rmkz, 0 }, - { X86::VPADDSWZ256rrkz, X86::VPADDSWZ256rmkz, 0 }, - { X86::VPADDUSBZ256rrkz, X86::VPADDUSBZ256rmkz, 0 }, - { X86::VPADDUSWZ256rrkz, X86::VPADDUSWZ256rmkz, 0 }, - { X86::VPADDWZ256rrkz, X86::VPADDWZ256rmkz, 0 }, - { X86::VPALIGNRZ256rrikz, X86::VPALIGNRZ256rmikz, 0 }, - { X86::VPANDDZ256rrkz, X86::VPANDDZ256rmkz, 0 }, - { X86::VPANDNDZ256rrkz, X86::VPANDNDZ256rmkz, 0 }, - { X86::VPANDNQZ256rrkz, X86::VPANDNQZ256rmkz, 0 }, - { X86::VPANDQZ256rrkz, X86::VPANDQZ256rmkz, 0 }, - { X86::VPAVGBZ256rrkz, X86::VPAVGBZ256rmkz, 0 }, - { X86::VPAVGWZ256rrkz, X86::VPAVGWZ256rmkz, 0 }, - { X86::VPERMBZ256rrkz, X86::VPERMBZ256rmkz, 0 }, - { X86::VPERMDZ256rrkz, X86::VPERMDZ256rmkz, 0 }, - { X86::VPERMILPDZ256rrkz, X86::VPERMILPDZ256rmkz, 0 }, - { X86::VPERMILPSZ256rrkz, X86::VPERMILPSZ256rmkz, 0 }, - { X86::VPERMPDZ256rrkz, X86::VPERMPDZ256rmkz, 0 }, - { X86::VPERMPSZ256rrkz, X86::VPERMPSZ256rmkz, 0 }, - { X86::VPERMQZ256rrkz, X86::VPERMQZ256rmkz, 0 }, - { X86::VPERMWZ256rrkz, X86::VPERMWZ256rmkz, 0 }, - { X86::VPMADDUBSWZ256rrkz, X86::VPMADDUBSWZ256rmkz, 0 }, - { X86::VPMADDWDZ256rrkz, X86::VPMADDWDZ256rmkz, 0 }, - { X86::VPMAXSBZ256rrkz, X86::VPMAXSBZ256rmkz, 0 }, - { X86::VPMAXSDZ256rrkz, X86::VPMAXSDZ256rmkz, 0 }, - { X86::VPMAXSQZ256rrkz, X86::VPMAXSQZ256rmkz, 0 }, - { X86::VPMAXSWZ256rrkz, X86::VPMAXSWZ256rmkz, 0 }, - { X86::VPMAXUBZ256rrkz, X86::VPMAXUBZ256rmkz, 0 }, - { X86::VPMAXUDZ256rrkz, X86::VPMAXUDZ256rmkz, 0 }, - { X86::VPMAXUQZ256rrkz, X86::VPMAXUQZ256rmkz, 0 }, - { X86::VPMAXUWZ256rrkz, X86::VPMAXUWZ256rmkz, 0 }, - { X86::VPMINSBZ256rrkz, X86::VPMINSBZ256rmkz, 0 }, - { X86::VPMINSDZ256rrkz, X86::VPMINSDZ256rmkz, 0 }, - { X86::VPMINSQZ256rrkz, X86::VPMINSQZ256rmkz, 0 }, - { X86::VPMINSWZ256rrkz, X86::VPMINSWZ256rmkz, 0 }, - { X86::VPMINUBZ256rrkz, X86::VPMINUBZ256rmkz, 0 }, - { X86::VPMINUDZ256rrkz, X86::VPMINUDZ256rmkz, 0 }, - { X86::VPMINUQZ256rrkz, X86::VPMINUQZ256rmkz, 0 }, - { X86::VPMINUWZ256rrkz, X86::VPMINUWZ256rmkz, 0 }, - { X86::VPMULDQZ256rrkz, X86::VPMULDQZ256rmkz, 0 }, - { X86::VPMULLDZ256rrkz, X86::VPMULLDZ256rmkz, 0 }, - { X86::VPMULLQZ256rrkz, X86::VPMULLQZ256rmkz, 0 }, - { X86::VPMULLWZ256rrkz, X86::VPMULLWZ256rmkz, 0 }, - { X86::VPMULUDQZ256rrkz, X86::VPMULUDQZ256rmkz, 0 }, - { X86::VPORDZ256rrkz, X86::VPORDZ256rmkz, 0 }, - { X86::VPORQZ256rrkz, X86::VPORQZ256rmkz, 0 }, - { X86::VPSHUFBZ256rrkz, X86::VPSHUFBZ256rmkz, 0 }, - { X86::VPSLLDZ256rrkz, X86::VPSLLDZ256rmkz, 0 }, - { X86::VPSLLQZ256rrkz, X86::VPSLLQZ256rmkz, 0 }, - { X86::VPSLLVDZ256rrkz, X86::VPSLLVDZ256rmkz, 0 }, - { X86::VPSLLVQZ256rrkz, X86::VPSLLVQZ256rmkz, 0 }, - { X86::VPSLLVWZ256rrkz, X86::VPSLLVWZ256rmkz, 0 }, - { X86::VPSLLWZ256rrkz, X86::VPSLLWZ256rmkz, 0 }, - { X86::VPSRADZ256rrkz, X86::VPSRADZ256rmkz, 0 }, - { X86::VPSRAQZ256rrkz, X86::VPSRAQZ256rmkz, 0 }, - { X86::VPSRAVDZ256rrkz, X86::VPSRAVDZ256rmkz, 0 }, - { X86::VPSRAVQZ256rrkz, X86::VPSRAVQZ256rmkz, 0 }, - { X86::VPSRAVWZ256rrkz, X86::VPSRAVWZ256rmkz, 0 }, - { X86::VPSRAWZ256rrkz, X86::VPSRAWZ256rmkz, 0 }, - { X86::VPSRLDZ256rrkz, X86::VPSRLDZ256rmkz, 0 }, - { X86::VPSRLQZ256rrkz, X86::VPSRLQZ256rmkz, 0 }, - { X86::VPSRLVDZ256rrkz, X86::VPSRLVDZ256rmkz, 0 }, - { X86::VPSRLVQZ256rrkz, 
X86::VPSRLVQZ256rmkz, 0 }, - { X86::VPSRLVWZ256rrkz, X86::VPSRLVWZ256rmkz, 0 }, - { X86::VPSRLWZ256rrkz, X86::VPSRLWZ256rmkz, 0 }, - { X86::VPSUBBZ256rrkz, X86::VPSUBBZ256rmkz, 0 }, - { X86::VPSUBDZ256rrkz, X86::VPSUBDZ256rmkz, 0 }, - { X86::VPSUBQZ256rrkz, X86::VPSUBQZ256rmkz, 0 }, - { X86::VPSUBSBZ256rrkz, X86::VPSUBSBZ256rmkz, 0 }, - { X86::VPSUBSWZ256rrkz, X86::VPSUBSWZ256rmkz, 0 }, - { X86::VPSUBUSBZ256rrkz, X86::VPSUBUSBZ256rmkz, 0 }, - { X86::VPSUBUSWZ256rrkz, X86::VPSUBUSWZ256rmkz, 0 }, - { X86::VPSUBWZ256rrkz, X86::VPSUBWZ256rmkz, 0 }, - { X86::VPUNPCKHBWZ256rrkz, X86::VPUNPCKHBWZ256rmkz, 0 }, - { X86::VPUNPCKHDQZ256rrkz, X86::VPUNPCKHDQZ256rmkz, 0 }, - { X86::VPUNPCKHQDQZ256rrkz, X86::VPUNPCKHQDQZ256rmkz, 0 }, - { X86::VPUNPCKHWDZ256rrkz, X86::VPUNPCKHWDZ256rmkz, 0 }, - { X86::VPUNPCKLBWZ256rrkz, X86::VPUNPCKLBWZ256rmkz, 0 }, - { X86::VPUNPCKLDQZ256rrkz, X86::VPUNPCKLDQZ256rmkz, 0 }, - { X86::VPUNPCKLQDQZ256rrkz, X86::VPUNPCKLQDQZ256rmkz, 0 }, - { X86::VPUNPCKLWDZ256rrkz, X86::VPUNPCKLWDZ256rmkz, 0 }, - { X86::VPXORDZ256rrkz, X86::VPXORDZ256rmkz, 0 }, - { X86::VPXORQZ256rrkz, X86::VPXORQZ256rmkz, 0 }, - { X86::VSHUFPDZ256rrikz, X86::VSHUFPDZ256rmikz, 0 }, - { X86::VSHUFPSZ256rrikz, X86::VSHUFPSZ256rmikz, 0 }, - { X86::VSUBPDZ256rrkz, X86::VSUBPDZ256rmkz, 0 }, - { X86::VSUBPSZ256rrkz, X86::VSUBPSZ256rmkz, 0 }, - { X86::VUNPCKHPDZ256rrkz, X86::VUNPCKHPDZ256rmkz, 0 }, - { X86::VUNPCKHPSZ256rrkz, X86::VUNPCKHPSZ256rmkz, 0 }, - { X86::VUNPCKLPDZ256rrkz, X86::VUNPCKLPDZ256rmkz, 0 }, - { X86::VUNPCKLPSZ256rrkz, X86::VUNPCKLPSZ256rmkz, 0 }, - { X86::VXORPDZ256rrkz, X86::VXORPDZ256rmkz, 0 }, - { X86::VXORPSZ256rrkz, X86::VXORPSZ256rmkz, 0 }, - - // AVX-512{F,VL} masked arithmetic instructions 128-bit - { X86::VADDPDZ128rrkz, X86::VADDPDZ128rmkz, 0 }, - { X86::VADDPSZ128rrkz, X86::VADDPSZ128rmkz, 0 }, - { X86::VALIGNDZ128rrikz, X86::VALIGNDZ128rmikz, 0 }, - { X86::VALIGNQZ128rrikz, X86::VALIGNQZ128rmikz, 0 }, - { X86::VANDNPDZ128rrkz, X86::VANDNPDZ128rmkz, 0 }, - { X86::VANDNPSZ128rrkz, X86::VANDNPSZ128rmkz, 0 }, - { X86::VANDPDZ128rrkz, X86::VANDPDZ128rmkz, 0 }, - { X86::VANDPSZ128rrkz, X86::VANDPSZ128rmkz, 0 }, - { X86::VDIVPDZ128rrkz, X86::VDIVPDZ128rmkz, 0 }, - { X86::VDIVPSZ128rrkz, X86::VDIVPSZ128rmkz, 0 }, - { X86::VMAXCPDZ128rrkz, X86::VMAXCPDZ128rmkz, 0 }, - { X86::VMAXCPSZ128rrkz, X86::VMAXCPSZ128rmkz, 0 }, - { X86::VMAXPDZ128rrkz, X86::VMAXPDZ128rmkz, 0 }, - { X86::VMAXPSZ128rrkz, X86::VMAXPSZ128rmkz, 0 }, - { X86::VMINCPDZ128rrkz, X86::VMINCPDZ128rmkz, 0 }, - { X86::VMINCPSZ128rrkz, X86::VMINCPSZ128rmkz, 0 }, - { X86::VMINPDZ128rrkz, X86::VMINPDZ128rmkz, 0 }, - { X86::VMINPSZ128rrkz, X86::VMINPSZ128rmkz, 0 }, - { X86::VMULPDZ128rrkz, X86::VMULPDZ128rmkz, 0 }, - { X86::VMULPSZ128rrkz, X86::VMULPSZ128rmkz, 0 }, - { X86::VORPDZ128rrkz, X86::VORPDZ128rmkz, 0 }, - { X86::VORPSZ128rrkz, X86::VORPSZ128rmkz, 0 }, - { X86::VPACKSSDWZ128rrkz, X86::VPACKSSDWZ128rmkz, 0 }, - { X86::VPACKSSWBZ128rrkz, X86::VPACKSSWBZ128rmkz, 0 }, - { X86::VPACKUSDWZ128rrkz, X86::VPACKUSDWZ128rmkz, 0 }, - { X86::VPACKUSWBZ128rrkz, X86::VPACKUSWBZ128rmkz, 0 }, - { X86::VPADDBZ128rrkz, X86::VPADDBZ128rmkz, 0 }, - { X86::VPADDDZ128rrkz, X86::VPADDDZ128rmkz, 0 }, - { X86::VPADDQZ128rrkz, X86::VPADDQZ128rmkz, 0 }, - { X86::VPADDSBZ128rrkz, X86::VPADDSBZ128rmkz, 0 }, - { X86::VPADDSWZ128rrkz, X86::VPADDSWZ128rmkz, 0 }, - { X86::VPADDUSBZ128rrkz, X86::VPADDUSBZ128rmkz, 0 }, - { X86::VPADDUSWZ128rrkz, X86::VPADDUSWZ128rmkz, 0 }, - { X86::VPADDWZ128rrkz, X86::VPADDWZ128rmkz, 0 }, - { X86::VPALIGNRZ128rrikz, 
X86::VPALIGNRZ128rmikz, 0 }, - { X86::VPANDDZ128rrkz, X86::VPANDDZ128rmkz, 0 }, - { X86::VPANDNDZ128rrkz, X86::VPANDNDZ128rmkz, 0 }, - { X86::VPANDNQZ128rrkz, X86::VPANDNQZ128rmkz, 0 }, - { X86::VPANDQZ128rrkz, X86::VPANDQZ128rmkz, 0 }, - { X86::VPAVGBZ128rrkz, X86::VPAVGBZ128rmkz, 0 }, - { X86::VPAVGWZ128rrkz, X86::VPAVGWZ128rmkz, 0 }, - { X86::VPERMBZ128rrkz, X86::VPERMBZ128rmkz, 0 }, - { X86::VPERMILPDZ128rrkz, X86::VPERMILPDZ128rmkz, 0 }, - { X86::VPERMILPSZ128rrkz, X86::VPERMILPSZ128rmkz, 0 }, - { X86::VPERMWZ128rrkz, X86::VPERMWZ128rmkz, 0 }, - { X86::VPMADDUBSWZ128rrkz, X86::VPMADDUBSWZ128rmkz, 0 }, - { X86::VPMADDWDZ128rrkz, X86::VPMADDWDZ128rmkz, 0 }, - { X86::VPMAXSBZ128rrkz, X86::VPMAXSBZ128rmkz, 0 }, - { X86::VPMAXSDZ128rrkz, X86::VPMAXSDZ128rmkz, 0 }, - { X86::VPMAXSQZ128rrkz, X86::VPMAXSQZ128rmkz, 0 }, - { X86::VPMAXSWZ128rrkz, X86::VPMAXSWZ128rmkz, 0 }, - { X86::VPMAXUBZ128rrkz, X86::VPMAXUBZ128rmkz, 0 }, - { X86::VPMAXUDZ128rrkz, X86::VPMAXUDZ128rmkz, 0 }, - { X86::VPMAXUQZ128rrkz, X86::VPMAXUQZ128rmkz, 0 }, - { X86::VPMAXUWZ128rrkz, X86::VPMAXUWZ128rmkz, 0 }, - { X86::VPMINSBZ128rrkz, X86::VPMINSBZ128rmkz, 0 }, - { X86::VPMINSDZ128rrkz, X86::VPMINSDZ128rmkz, 0 }, - { X86::VPMINSQZ128rrkz, X86::VPMINSQZ128rmkz, 0 }, - { X86::VPMINSWZ128rrkz, X86::VPMINSWZ128rmkz, 0 }, - { X86::VPMINUBZ128rrkz, X86::VPMINUBZ128rmkz, 0 }, - { X86::VPMINUDZ128rrkz, X86::VPMINUDZ128rmkz, 0 }, - { X86::VPMINUQZ128rrkz, X86::VPMINUQZ128rmkz, 0 }, - { X86::VPMINUWZ128rrkz, X86::VPMINUWZ128rmkz, 0 }, - { X86::VPMULDQZ128rrkz, X86::VPMULDQZ128rmkz, 0 }, - { X86::VPMULLDZ128rrkz, X86::VPMULLDZ128rmkz, 0 }, - { X86::VPMULLQZ128rrkz, X86::VPMULLQZ128rmkz, 0 }, - { X86::VPMULLWZ128rrkz, X86::VPMULLWZ128rmkz, 0 }, - { X86::VPMULUDQZ128rrkz, X86::VPMULUDQZ128rmkz, 0 }, - { X86::VPORDZ128rrkz, X86::VPORDZ128rmkz, 0 }, - { X86::VPORQZ128rrkz, X86::VPORQZ128rmkz, 0 }, - { X86::VPSHUFBZ128rrkz, X86::VPSHUFBZ128rmkz, 0 }, - { X86::VPSLLDZ128rrkz, X86::VPSLLDZ128rmkz, 0 }, - { X86::VPSLLQZ128rrkz, X86::VPSLLQZ128rmkz, 0 }, - { X86::VPSLLVDZ128rrkz, X86::VPSLLVDZ128rmkz, 0 }, - { X86::VPSLLVQZ128rrkz, X86::VPSLLVQZ128rmkz, 0 }, - { X86::VPSLLVWZ128rrkz, X86::VPSLLVWZ128rmkz, 0 }, - { X86::VPSLLWZ128rrkz, X86::VPSLLWZ128rmkz, 0 }, - { X86::VPSRADZ128rrkz, X86::VPSRADZ128rmkz, 0 }, - { X86::VPSRAQZ128rrkz, X86::VPSRAQZ128rmkz, 0 }, - { X86::VPSRAVDZ128rrkz, X86::VPSRAVDZ128rmkz, 0 }, - { X86::VPSRAVQZ128rrkz, X86::VPSRAVQZ128rmkz, 0 }, - { X86::VPSRAVWZ128rrkz, X86::VPSRAVWZ128rmkz, 0 }, - { X86::VPSRAWZ128rrkz, X86::VPSRAWZ128rmkz, 0 }, - { X86::VPSRLDZ128rrkz, X86::VPSRLDZ128rmkz, 0 }, - { X86::VPSRLQZ128rrkz, X86::VPSRLQZ128rmkz, 0 }, - { X86::VPSRLVDZ128rrkz, X86::VPSRLVDZ128rmkz, 0 }, - { X86::VPSRLVQZ128rrkz, X86::VPSRLVQZ128rmkz, 0 }, - { X86::VPSRLVWZ128rrkz, X86::VPSRLVWZ128rmkz, 0 }, - { X86::VPSRLWZ128rrkz, X86::VPSRLWZ128rmkz, 0 }, - { X86::VPSUBBZ128rrkz, X86::VPSUBBZ128rmkz, 0 }, - { X86::VPSUBDZ128rrkz, X86::VPSUBDZ128rmkz, 0 }, - { X86::VPSUBQZ128rrkz, X86::VPSUBQZ128rmkz, 0 }, - { X86::VPSUBSBZ128rrkz, X86::VPSUBSBZ128rmkz, 0 }, - { X86::VPSUBSWZ128rrkz, X86::VPSUBSWZ128rmkz, 0 }, - { X86::VPSUBUSBZ128rrkz, X86::VPSUBUSBZ128rmkz, 0 }, - { X86::VPSUBUSWZ128rrkz, X86::VPSUBUSWZ128rmkz, 0 }, - { X86::VPSUBWZ128rrkz, X86::VPSUBWZ128rmkz, 0 }, - { X86::VPUNPCKHBWZ128rrkz, X86::VPUNPCKHBWZ128rmkz, 0 }, - { X86::VPUNPCKHDQZ128rrkz, X86::VPUNPCKHDQZ128rmkz, 0 }, - { X86::VPUNPCKHQDQZ128rrkz, X86::VPUNPCKHQDQZ128rmkz, 0 }, - { X86::VPUNPCKHWDZ128rrkz, X86::VPUNPCKHWDZ128rmkz, 0 }, - { 
X86::VPUNPCKLBWZ128rrkz, X86::VPUNPCKLBWZ128rmkz, 0 }, - { X86::VPUNPCKLDQZ128rrkz, X86::VPUNPCKLDQZ128rmkz, 0 }, - { X86::VPUNPCKLQDQZ128rrkz, X86::VPUNPCKLQDQZ128rmkz, 0 }, - { X86::VPUNPCKLWDZ128rrkz, X86::VPUNPCKLWDZ128rmkz, 0 }, - { X86::VPXORDZ128rrkz, X86::VPXORDZ128rmkz, 0 }, - { X86::VPXORQZ128rrkz, X86::VPXORQZ128rmkz, 0 }, - { X86::VSHUFPDZ128rrikz, X86::VSHUFPDZ128rmikz, 0 }, - { X86::VSHUFPSZ128rrikz, X86::VSHUFPSZ128rmikz, 0 }, - { X86::VSUBPDZ128rrkz, X86::VSUBPDZ128rmkz, 0 }, - { X86::VSUBPSZ128rrkz, X86::VSUBPSZ128rmkz, 0 }, - { X86::VUNPCKHPDZ128rrkz, X86::VUNPCKHPDZ128rmkz, 0 }, - { X86::VUNPCKHPSZ128rrkz, X86::VUNPCKHPSZ128rmkz, 0 }, - { X86::VUNPCKLPDZ128rrkz, X86::VUNPCKLPDZ128rmkz, 0 }, - { X86::VUNPCKLPSZ128rrkz, X86::VUNPCKLPSZ128rmkz, 0 }, - { X86::VXORPDZ128rrkz, X86::VXORPDZ128rmkz, 0 }, - { X86::VXORPSZ128rrkz, X86::VXORPSZ128rmkz, 0 }, - - // AVX-512 masked foldable instructions - { X86::VBROADCASTSSZrk, X86::VBROADCASTSSZmk, TB_NO_REVERSE }, - { X86::VBROADCASTSDZrk, X86::VBROADCASTSDZmk, TB_NO_REVERSE }, - { X86::VPABSBZrrk, X86::VPABSBZrmk, 0 }, - { X86::VPABSDZrrk, X86::VPABSDZrmk, 0 }, - { X86::VPABSQZrrk, X86::VPABSQZrmk, 0 }, - { X86::VPABSWZrrk, X86::VPABSWZrmk, 0 }, - { X86::VPERMILPDZrik, X86::VPERMILPDZmik, 0 }, - { X86::VPERMILPSZrik, X86::VPERMILPSZmik, 0 }, - { X86::VPERMPDZrik, X86::VPERMPDZmik, 0 }, - { X86::VPERMQZrik, X86::VPERMQZmik, 0 }, - { X86::VPMOVSXBDZrrk, X86::VPMOVSXBDZrmk, 0 }, - { X86::VPMOVSXBQZrrk, X86::VPMOVSXBQZrmk, TB_NO_REVERSE }, - { X86::VPMOVSXBWZrrk, X86::VPMOVSXBWZrmk, 0 }, - { X86::VPMOVSXDQZrrk, X86::VPMOVSXDQZrmk, 0 }, - { X86::VPMOVSXWDZrrk, X86::VPMOVSXWDZrmk, 0 }, - { X86::VPMOVSXWQZrrk, X86::VPMOVSXWQZrmk, 0 }, - { X86::VPMOVZXBDZrrk, X86::VPMOVZXBDZrmk, 0 }, - { X86::VPMOVZXBQZrrk, X86::VPMOVZXBQZrmk, TB_NO_REVERSE }, - { X86::VPMOVZXBWZrrk, X86::VPMOVZXBWZrmk, 0 }, - { X86::VPMOVZXDQZrrk, X86::VPMOVZXDQZrmk, 0 }, - { X86::VPMOVZXWDZrrk, X86::VPMOVZXWDZrmk, 0 }, - { X86::VPMOVZXWQZrrk, X86::VPMOVZXWQZrmk, 0 }, - { X86::VPSHUFDZrik, X86::VPSHUFDZmik, 0 }, - { X86::VPSHUFHWZrik, X86::VPSHUFHWZmik, 0 }, - { X86::VPSHUFLWZrik, X86::VPSHUFLWZmik, 0 }, - { X86::VPSLLDZrik, X86::VPSLLDZmik, 0 }, - { X86::VPSLLQZrik, X86::VPSLLQZmik, 0 }, - { X86::VPSLLWZrik, X86::VPSLLWZmik, 0 }, - { X86::VPSRADZrik, X86::VPSRADZmik, 0 }, - { X86::VPSRAQZrik, X86::VPSRAQZmik, 0 }, - { X86::VPSRAWZrik, X86::VPSRAWZmik, 0 }, - { X86::VPSRLDZrik, X86::VPSRLDZmik, 0 }, - { X86::VPSRLQZrik, X86::VPSRLQZmik, 0 }, - { X86::VPSRLWZrik, X86::VPSRLWZmik, 0 }, - - // AVX-512VL 256-bit masked foldable instructions - { X86::VBROADCASTSSZ256rk, X86::VBROADCASTSSZ256mk, TB_NO_REVERSE }, - { X86::VBROADCASTSDZ256rk, X86::VBROADCASTSDZ256mk, TB_NO_REVERSE }, - { X86::VPABSBZ256rrk, X86::VPABSBZ256rmk, 0 }, - { X86::VPABSDZ256rrk, X86::VPABSDZ256rmk, 0 }, - { X86::VPABSQZ256rrk, X86::VPABSQZ256rmk, 0 }, - { X86::VPABSWZ256rrk, X86::VPABSWZ256rmk, 0 }, - { X86::VPERMILPDZ256rik, X86::VPERMILPDZ256mik, 0 }, - { X86::VPERMILPSZ256rik, X86::VPERMILPSZ256mik, 0 }, - { X86::VPERMPDZ256rik, X86::VPERMPDZ256mik, 0 }, - { X86::VPERMQZ256rik, X86::VPERMQZ256mik, 0 }, - { X86::VPMOVSXBDZ256rrk, X86::VPMOVSXBDZ256rmk, TB_NO_REVERSE }, - { X86::VPMOVSXBQZ256rrk, X86::VPMOVSXBQZ256rmk, TB_NO_REVERSE }, - { X86::VPMOVSXBWZ256rrk, X86::VPMOVSXBWZ256rmk, 0 }, - { X86::VPMOVSXDQZ256rrk, X86::VPMOVSXDQZ256rmk, 0 }, - { X86::VPMOVSXWDZ256rrk, X86::VPMOVSXWDZ256rmk, 0 }, - { X86::VPMOVSXWQZ256rrk, X86::VPMOVSXWQZ256rmk, TB_NO_REVERSE }, - { X86::VPMOVZXBDZ256rrk, 
X86::VPMOVZXBDZ256rmk, TB_NO_REVERSE }, - { X86::VPMOVZXBQZ256rrk, X86::VPMOVZXBQZ256rmk, TB_NO_REVERSE }, - { X86::VPMOVZXBWZ256rrk, X86::VPMOVZXBWZ256rmk, 0 }, - { X86::VPMOVZXDQZ256rrk, X86::VPMOVZXDQZ256rmk, 0 }, - { X86::VPMOVZXWDZ256rrk, X86::VPMOVZXWDZ256rmk, 0 }, - { X86::VPMOVZXWQZ256rrk, X86::VPMOVZXWQZ256rmk, TB_NO_REVERSE }, - { X86::VPSHUFDZ256rik, X86::VPSHUFDZ256mik, 0 }, - { X86::VPSHUFHWZ256rik, X86::VPSHUFHWZ256mik, 0 }, - { X86::VPSHUFLWZ256rik, X86::VPSHUFLWZ256mik, 0 }, - { X86::VPSLLDZ256rik, X86::VPSLLDZ256mik, 0 }, - { X86::VPSLLQZ256rik, X86::VPSLLQZ256mik, 0 }, - { X86::VPSLLWZ256rik, X86::VPSLLWZ256mik, 0 }, - { X86::VPSRADZ256rik, X86::VPSRADZ256mik, 0 }, - { X86::VPSRAQZ256rik, X86::VPSRAQZ256mik, 0 }, - { X86::VPSRAWZ256rik, X86::VPSRAWZ256mik, 0 }, - { X86::VPSRLDZ256rik, X86::VPSRLDZ256mik, 0 }, - { X86::VPSRLQZ256rik, X86::VPSRLQZ256mik, 0 }, - { X86::VPSRLWZ256rik, X86::VPSRLWZ256mik, 0 }, - - // AVX-512VL 128-bit masked foldable instructions - { X86::VBROADCASTSSZ128rk, X86::VBROADCASTSSZ128mk, TB_NO_REVERSE }, - { X86::VPABSBZ128rrk, X86::VPABSBZ128rmk, 0 }, - { X86::VPABSDZ128rrk, X86::VPABSDZ128rmk, 0 }, - { X86::VPABSQZ128rrk, X86::VPABSQZ128rmk, 0 }, - { X86::VPABSWZ128rrk, X86::VPABSWZ128rmk, 0 }, - { X86::VPERMILPDZ128rik, X86::VPERMILPDZ128mik, 0 }, - { X86::VPERMILPSZ128rik, X86::VPERMILPSZ128mik, 0 }, - { X86::VPMOVSXBDZ128rrk, X86::VPMOVSXBDZ128rmk, TB_NO_REVERSE }, - { X86::VPMOVSXBQZ128rrk, X86::VPMOVSXBQZ128rmk, TB_NO_REVERSE }, - { X86::VPMOVSXBWZ128rrk, X86::VPMOVSXBWZ128rmk, TB_NO_REVERSE }, - { X86::VPMOVSXDQZ128rrk, X86::VPMOVSXDQZ128rmk, TB_NO_REVERSE }, - { X86::VPMOVSXWDZ128rrk, X86::VPMOVSXWDZ128rmk, TB_NO_REVERSE }, - { X86::VPMOVSXWQZ128rrk, X86::VPMOVSXWQZ128rmk, TB_NO_REVERSE }, - { X86::VPMOVZXBDZ128rrk, X86::VPMOVZXBDZ128rmk, TB_NO_REVERSE }, - { X86::VPMOVZXBQZ128rrk, X86::VPMOVZXBQZ128rmk, TB_NO_REVERSE }, - { X86::VPMOVZXBWZ128rrk, X86::VPMOVZXBWZ128rmk, TB_NO_REVERSE }, - { X86::VPMOVZXDQZ128rrk, X86::VPMOVZXDQZ128rmk, TB_NO_REVERSE }, - { X86::VPMOVZXWDZ128rrk, X86::VPMOVZXWDZ128rmk, TB_NO_REVERSE }, - { X86::VPMOVZXWQZ128rrk, X86::VPMOVZXWQZ128rmk, TB_NO_REVERSE }, - { X86::VPSHUFDZ128rik, X86::VPSHUFDZ128mik, 0 }, - { X86::VPSHUFHWZ128rik, X86::VPSHUFHWZ128mik, 0 }, - { X86::VPSHUFLWZ128rik, X86::VPSHUFLWZ128mik, 0 }, - { X86::VPSLLDZ128rik, X86::VPSLLDZ128mik, 0 }, - { X86::VPSLLQZ128rik, X86::VPSLLQZ128mik, 0 }, - { X86::VPSLLWZ128rik, X86::VPSLLWZ128mik, 0 }, - { X86::VPSRADZ128rik, X86::VPSRADZ128mik, 0 }, - { X86::VPSRAQZ128rik, X86::VPSRAQZ128mik, 0 }, - { X86::VPSRAWZ128rik, X86::VPSRAWZ128mik, 0 }, - { X86::VPSRLDZ128rik, X86::VPSRLDZ128mik, 0 }, - { X86::VPSRLQZ128rik, X86::VPSRLQZ128mik, 0 }, - { X86::VPSRLWZ128rik, X86::VPSRLWZ128mik, 0 }, - }; - for (X86MemoryFoldTableEntry Entry : MemoryFoldTable3) { AddTableEntry(RegOp2MemOpTable3, MemOp2RegOpTable, Entry.RegOp, Entry.MemOp, // Index 3, folded load Entry.Flags | TB_INDEX_3 | TB_FOLDED_LOAD); } - auto I = X86InstrFMA3Info::rm_begin(); - auto E = X86InstrFMA3Info::rm_end(); - for (; I != E; ++I) { - if (!I.getGroup()->isKMasked()) { - // Intrinsic forms need to pass TB_NO_REVERSE. 
- if (I.getGroup()->isIntrinsic()) { - AddTableEntry(RegOp2MemOpTable3, MemOp2RegOpTable, - I.getRegOpcode(), I.getMemOpcode(), - TB_ALIGN_NONE | TB_INDEX_3 | TB_FOLDED_LOAD | TB_NO_REVERSE); - } else { - AddTableEntry(RegOp2MemOpTable3, MemOp2RegOpTable, - I.getRegOpcode(), I.getMemOpcode(), - TB_ALIGN_NONE | TB_INDEX_3 | TB_FOLDED_LOAD); - } - } - } - - static const X86MemoryFoldTableEntry MemoryFoldTable4[] = { - // AVX-512 foldable masked instructions - { X86::VADDPDZrrk, X86::VADDPDZrmk, 0 }, - { X86::VADDPSZrrk, X86::VADDPSZrmk, 0 }, - { X86::VADDSDZrr_Intk, X86::VADDSDZrm_Intk, TB_NO_REVERSE }, - { X86::VADDSSZrr_Intk, X86::VADDSSZrm_Intk, TB_NO_REVERSE }, - { X86::VALIGNDZrrik, X86::VALIGNDZrmik, 0 }, - { X86::VALIGNQZrrik, X86::VALIGNQZrmik, 0 }, - { X86::VANDNPDZrrk, X86::VANDNPDZrmk, 0 }, - { X86::VANDNPSZrrk, X86::VANDNPSZrmk, 0 }, - { X86::VANDPDZrrk, X86::VANDPDZrmk, 0 }, - { X86::VANDPSZrrk, X86::VANDPSZrmk, 0 }, - { X86::VDIVPDZrrk, X86::VDIVPDZrmk, 0 }, - { X86::VDIVPSZrrk, X86::VDIVPSZrmk, 0 }, - { X86::VDIVSDZrr_Intk, X86::VDIVSDZrm_Intk, TB_NO_REVERSE }, - { X86::VDIVSSZrr_Intk, X86::VDIVSSZrm_Intk, TB_NO_REVERSE }, - { X86::VINSERTF32x4Zrrk, X86::VINSERTF32x4Zrmk, 0 }, - { X86::VINSERTF32x8Zrrk, X86::VINSERTF32x8Zrmk, 0 }, - { X86::VINSERTF64x2Zrrk, X86::VINSERTF64x2Zrmk, 0 }, - { X86::VINSERTF64x4Zrrk, X86::VINSERTF64x4Zrmk, 0 }, - { X86::VINSERTI32x4Zrrk, X86::VINSERTI32x4Zrmk, 0 }, - { X86::VINSERTI32x8Zrrk, X86::VINSERTI32x8Zrmk, 0 }, - { X86::VINSERTI64x2Zrrk, X86::VINSERTI64x2Zrmk, 0 }, - { X86::VINSERTI64x4Zrrk, X86::VINSERTI64x4Zrmk, 0 }, - { X86::VMAXCPDZrrk, X86::VMAXCPDZrmk, 0 }, - { X86::VMAXCPSZrrk, X86::VMAXCPSZrmk, 0 }, - { X86::VMAXPDZrrk, X86::VMAXPDZrmk, 0 }, - { X86::VMAXPSZrrk, X86::VMAXPSZrmk, 0 }, - { X86::VMAXSDZrr_Intk, X86::VMAXSDZrm_Intk, 0 }, - { X86::VMAXSSZrr_Intk, X86::VMAXSSZrm_Intk, 0 }, - { X86::VMINCPDZrrk, X86::VMINCPDZrmk, 0 }, - { X86::VMINCPSZrrk, X86::VMINCPSZrmk, 0 }, - { X86::VMINPDZrrk, X86::VMINPDZrmk, 0 }, - { X86::VMINPSZrrk, X86::VMINPSZrmk, 0 }, - { X86::VMINSDZrr_Intk, X86::VMINSDZrm_Intk, 0 }, - { X86::VMINSSZrr_Intk, X86::VMINSSZrm_Intk, 0 }, - { X86::VMULPDZrrk, X86::VMULPDZrmk, 0 }, - { X86::VMULPSZrrk, X86::VMULPSZrmk, 0 }, - { X86::VMULSDZrr_Intk, X86::VMULSDZrm_Intk, TB_NO_REVERSE }, - { X86::VMULSSZrr_Intk, X86::VMULSSZrm_Intk, TB_NO_REVERSE }, - { X86::VORPDZrrk, X86::VORPDZrmk, 0 }, - { X86::VORPSZrrk, X86::VORPSZrmk, 0 }, - { X86::VPACKSSDWZrrk, X86::VPACKSSDWZrmk, 0 }, - { X86::VPACKSSWBZrrk, X86::VPACKSSWBZrmk, 0 }, - { X86::VPACKUSDWZrrk, X86::VPACKUSDWZrmk, 0 }, - { X86::VPACKUSWBZrrk, X86::VPACKUSWBZrmk, 0 }, - { X86::VPADDBZrrk, X86::VPADDBZrmk, 0 }, - { X86::VPADDDZrrk, X86::VPADDDZrmk, 0 }, - { X86::VPADDQZrrk, X86::VPADDQZrmk, 0 }, - { X86::VPADDSBZrrk, X86::VPADDSBZrmk, 0 }, - { X86::VPADDSWZrrk, X86::VPADDSWZrmk, 0 }, - { X86::VPADDUSBZrrk, X86::VPADDUSBZrmk, 0 }, - { X86::VPADDUSWZrrk, X86::VPADDUSWZrmk, 0 }, - { X86::VPADDWZrrk, X86::VPADDWZrmk, 0 }, - { X86::VPALIGNRZrrik, X86::VPALIGNRZrmik, 0 }, - { X86::VPANDDZrrk, X86::VPANDDZrmk, 0 }, - { X86::VPANDNDZrrk, X86::VPANDNDZrmk, 0 }, - { X86::VPANDNQZrrk, X86::VPANDNQZrmk, 0 }, - { X86::VPANDQZrrk, X86::VPANDQZrmk, 0 }, - { X86::VPAVGBZrrk, X86::VPAVGBZrmk, 0 }, - { X86::VPAVGWZrrk, X86::VPAVGWZrmk, 0 }, - { X86::VPERMBZrrk, X86::VPERMBZrmk, 0 }, - { X86::VPERMDZrrk, X86::VPERMDZrmk, 0 }, - { X86::VPERMI2Brrk, X86::VPERMI2Brmk, 0 }, - { X86::VPERMI2Drrk, X86::VPERMI2Drmk, 0 }, - { X86::VPERMI2PSrrk, X86::VPERMI2PSrmk, 0 }, - { X86::VPERMI2PDrrk, 
X86::VPERMI2PDrmk, 0 }, - { X86::VPERMI2Qrrk, X86::VPERMI2Qrmk, 0 }, - { X86::VPERMI2Wrrk, X86::VPERMI2Wrmk, 0 }, - { X86::VPERMILPDZrrk, X86::VPERMILPDZrmk, 0 }, - { X86::VPERMILPSZrrk, X86::VPERMILPSZrmk, 0 }, - { X86::VPERMPDZrrk, X86::VPERMPDZrmk, 0 }, - { X86::VPERMPSZrrk, X86::VPERMPSZrmk, 0 }, - { X86::VPERMQZrrk, X86::VPERMQZrmk, 0 }, - { X86::VPERMT2Brrk, X86::VPERMT2Brmk, 0 }, - { X86::VPERMT2Drrk, X86::VPERMT2Drmk, 0 }, - { X86::VPERMT2PSrrk, X86::VPERMT2PSrmk, 0 }, - { X86::VPERMT2PDrrk, X86::VPERMT2PDrmk, 0 }, - { X86::VPERMT2Qrrk, X86::VPERMT2Qrmk, 0 }, - { X86::VPERMT2Wrrk, X86::VPERMT2Wrmk, 0 }, - { X86::VPERMWZrrk, X86::VPERMWZrmk, 0 }, - { X86::VPMADDUBSWZrrk, X86::VPMADDUBSWZrmk, 0 }, - { X86::VPMADDWDZrrk, X86::VPMADDWDZrmk, 0 }, - { X86::VPMAXSBZrrk, X86::VPMAXSBZrmk, 0 }, - { X86::VPMAXSDZrrk, X86::VPMAXSDZrmk, 0 }, - { X86::VPMAXSQZrrk, X86::VPMAXSQZrmk, 0 }, - { X86::VPMAXSWZrrk, X86::VPMAXSWZrmk, 0 }, - { X86::VPMAXUBZrrk, X86::VPMAXUBZrmk, 0 }, - { X86::VPMAXUDZrrk, X86::VPMAXUDZrmk, 0 }, - { X86::VPMAXUQZrrk, X86::VPMAXUQZrmk, 0 }, - { X86::VPMAXUWZrrk, X86::VPMAXUWZrmk, 0 }, - { X86::VPMINSBZrrk, X86::VPMINSBZrmk, 0 }, - { X86::VPMINSDZrrk, X86::VPMINSDZrmk, 0 }, - { X86::VPMINSQZrrk, X86::VPMINSQZrmk, 0 }, - { X86::VPMINSWZrrk, X86::VPMINSWZrmk, 0 }, - { X86::VPMINUBZrrk, X86::VPMINUBZrmk, 0 }, - { X86::VPMINUDZrrk, X86::VPMINUDZrmk, 0 }, - { X86::VPMINUQZrrk, X86::VPMINUQZrmk, 0 }, - { X86::VPMINUWZrrk, X86::VPMINUWZrmk, 0 }, - { X86::VPMULDQZrrk, X86::VPMULDQZrmk, 0 }, - { X86::VPMULLDZrrk, X86::VPMULLDZrmk, 0 }, - { X86::VPMULLQZrrk, X86::VPMULLQZrmk, 0 }, - { X86::VPMULLWZrrk, X86::VPMULLWZrmk, 0 }, - { X86::VPMULUDQZrrk, X86::VPMULUDQZrmk, 0 }, - { X86::VPORDZrrk, X86::VPORDZrmk, 0 }, - { X86::VPORQZrrk, X86::VPORQZrmk, 0 }, - { X86::VPSHUFBZrrk, X86::VPSHUFBZrmk, 0 }, - { X86::VPSLLDZrrk, X86::VPSLLDZrmk, 0 }, - { X86::VPSLLQZrrk, X86::VPSLLQZrmk, 0 }, - { X86::VPSLLVDZrrk, X86::VPSLLVDZrmk, 0 }, - { X86::VPSLLVQZrrk, X86::VPSLLVQZrmk, 0 }, - { X86::VPSLLVWZrrk, X86::VPSLLVWZrmk, 0 }, - { X86::VPSLLWZrrk, X86::VPSLLWZrmk, 0 }, - { X86::VPSRADZrrk, X86::VPSRADZrmk, 0 }, - { X86::VPSRAQZrrk, X86::VPSRAQZrmk, 0 }, - { X86::VPSRAVDZrrk, X86::VPSRAVDZrmk, 0 }, - { X86::VPSRAVQZrrk, X86::VPSRAVQZrmk, 0 }, - { X86::VPSRAVWZrrk, X86::VPSRAVWZrmk, 0 }, - { X86::VPSRAWZrrk, X86::VPSRAWZrmk, 0 }, - { X86::VPSRLDZrrk, X86::VPSRLDZrmk, 0 }, - { X86::VPSRLQZrrk, X86::VPSRLQZrmk, 0 }, - { X86::VPSRLVDZrrk, X86::VPSRLVDZrmk, 0 }, - { X86::VPSRLVQZrrk, X86::VPSRLVQZrmk, 0 }, - { X86::VPSRLVWZrrk, X86::VPSRLVWZrmk, 0 }, - { X86::VPSRLWZrrk, X86::VPSRLWZrmk, 0 }, - { X86::VPSUBBZrrk, X86::VPSUBBZrmk, 0 }, - { X86::VPSUBDZrrk, X86::VPSUBDZrmk, 0 }, - { X86::VPSUBQZrrk, X86::VPSUBQZrmk, 0 }, - { X86::VPSUBSBZrrk, X86::VPSUBSBZrmk, 0 }, - { X86::VPSUBSWZrrk, X86::VPSUBSWZrmk, 0 }, - { X86::VPSUBUSBZrrk, X86::VPSUBUSBZrmk, 0 }, - { X86::VPSUBUSWZrrk, X86::VPSUBUSWZrmk, 0 }, - { X86::VPTERNLOGDZrrik, X86::VPTERNLOGDZrmik, 0 }, - { X86::VPTERNLOGQZrrik, X86::VPTERNLOGQZrmik, 0 }, - { X86::VPUNPCKHBWZrrk, X86::VPUNPCKHBWZrmk, 0 }, - { X86::VPUNPCKHDQZrrk, X86::VPUNPCKHDQZrmk, 0 }, - { X86::VPUNPCKHQDQZrrk, X86::VPUNPCKHQDQZrmk, 0 }, - { X86::VPUNPCKHWDZrrk, X86::VPUNPCKHWDZrmk, 0 }, - { X86::VPUNPCKLBWZrrk, X86::VPUNPCKLBWZrmk, 0 }, - { X86::VPUNPCKLDQZrrk, X86::VPUNPCKLDQZrmk, 0 }, - { X86::VPUNPCKLQDQZrrk, X86::VPUNPCKLQDQZrmk, 0 }, - { X86::VPUNPCKLWDZrrk, X86::VPUNPCKLWDZrmk, 0 }, - { X86::VPXORDZrrk, X86::VPXORDZrmk, 0 }, - { X86::VPXORQZrrk, X86::VPXORQZrmk, 0 }, - { 
X86::VSHUFPDZrrik, X86::VSHUFPDZrmik, 0 }, - { X86::VSHUFPSZrrik, X86::VSHUFPSZrmik, 0 }, - { X86::VSUBPDZrrk, X86::VSUBPDZrmk, 0 }, - { X86::VSUBPSZrrk, X86::VSUBPSZrmk, 0 }, - { X86::VSUBSDZrr_Intk, X86::VSUBSDZrm_Intk, TB_NO_REVERSE }, - { X86::VSUBSSZrr_Intk, X86::VSUBSSZrm_Intk, TB_NO_REVERSE }, - { X86::VUNPCKHPDZrrk, X86::VUNPCKHPDZrmk, 0 }, - { X86::VUNPCKHPSZrrk, X86::VUNPCKHPSZrmk, 0 }, - { X86::VUNPCKLPDZrrk, X86::VUNPCKLPDZrmk, 0 }, - { X86::VUNPCKLPSZrrk, X86::VUNPCKLPSZrmk, 0 }, - { X86::VXORPDZrrk, X86::VXORPDZrmk, 0 }, - { X86::VXORPSZrrk, X86::VXORPSZrmk, 0 }, - - // AVX-512{F,VL} foldable masked instructions 256-bit - { X86::VADDPDZ256rrk, X86::VADDPDZ256rmk, 0 }, - { X86::VADDPSZ256rrk, X86::VADDPSZ256rmk, 0 }, - { X86::VALIGNDZ256rrik, X86::VALIGNDZ256rmik, 0 }, - { X86::VALIGNQZ256rrik, X86::VALIGNQZ256rmik, 0 }, - { X86::VANDNPDZ256rrk, X86::VANDNPDZ256rmk, 0 }, - { X86::VANDNPSZ256rrk, X86::VANDNPSZ256rmk, 0 }, - { X86::VANDPDZ256rrk, X86::VANDPDZ256rmk, 0 }, - { X86::VANDPSZ256rrk, X86::VANDPSZ256rmk, 0 }, - { X86::VDIVPDZ256rrk, X86::VDIVPDZ256rmk, 0 }, - { X86::VDIVPSZ256rrk, X86::VDIVPSZ256rmk, 0 }, - { X86::VINSERTF32x4Z256rrk,X86::VINSERTF32x4Z256rmk, 0 }, - { X86::VINSERTF64x2Z256rrk,X86::VINSERTF64x2Z256rmk, 0 }, - { X86::VINSERTI32x4Z256rrk,X86::VINSERTI32x4Z256rmk, 0 }, - { X86::VINSERTI64x2Z256rrk,X86::VINSERTI64x2Z256rmk, 0 }, - { X86::VMAXCPDZ256rrk, X86::VMAXCPDZ256rmk, 0 }, - { X86::VMAXCPSZ256rrk, X86::VMAXCPSZ256rmk, 0 }, - { X86::VMAXPDZ256rrk, X86::VMAXPDZ256rmk, 0 }, - { X86::VMAXPSZ256rrk, X86::VMAXPSZ256rmk, 0 }, - { X86::VMINCPDZ256rrk, X86::VMINCPDZ256rmk, 0 }, - { X86::VMINCPSZ256rrk, X86::VMINCPSZ256rmk, 0 }, - { X86::VMINPDZ256rrk, X86::VMINPDZ256rmk, 0 }, - { X86::VMINPSZ256rrk, X86::VMINPSZ256rmk, 0 }, - { X86::VMULPDZ256rrk, X86::VMULPDZ256rmk, 0 }, - { X86::VMULPSZ256rrk, X86::VMULPSZ256rmk, 0 }, - { X86::VORPDZ256rrk, X86::VORPDZ256rmk, 0 }, - { X86::VORPSZ256rrk, X86::VORPSZ256rmk, 0 }, - { X86::VPACKSSDWZ256rrk, X86::VPACKSSDWZ256rmk, 0 }, - { X86::VPACKSSWBZ256rrk, X86::VPACKSSWBZ256rmk, 0 }, - { X86::VPACKUSDWZ256rrk, X86::VPACKUSDWZ256rmk, 0 }, - { X86::VPACKUSWBZ256rrk, X86::VPACKUSWBZ256rmk, 0 }, - { X86::VPADDBZ256rrk, X86::VPADDBZ256rmk, 0 }, - { X86::VPADDDZ256rrk, X86::VPADDDZ256rmk, 0 }, - { X86::VPADDQZ256rrk, X86::VPADDQZ256rmk, 0 }, - { X86::VPADDSBZ256rrk, X86::VPADDSBZ256rmk, 0 }, - { X86::VPADDSWZ256rrk, X86::VPADDSWZ256rmk, 0 }, - { X86::VPADDUSBZ256rrk, X86::VPADDUSBZ256rmk, 0 }, - { X86::VPADDUSWZ256rrk, X86::VPADDUSWZ256rmk, 0 }, - { X86::VPADDWZ256rrk, X86::VPADDWZ256rmk, 0 }, - { X86::VPALIGNRZ256rrik, X86::VPALIGNRZ256rmik, 0 }, - { X86::VPANDDZ256rrk, X86::VPANDDZ256rmk, 0 }, - { X86::VPANDNDZ256rrk, X86::VPANDNDZ256rmk, 0 }, - { X86::VPANDNQZ256rrk, X86::VPANDNQZ256rmk, 0 }, - { X86::VPANDQZ256rrk, X86::VPANDQZ256rmk, 0 }, - { X86::VPAVGBZ256rrk, X86::VPAVGBZ256rmk, 0 }, - { X86::VPAVGWZ256rrk, X86::VPAVGWZ256rmk, 0 }, - { X86::VPERMBZ256rrk, X86::VPERMBZ256rmk, 0 }, - { X86::VPERMDZ256rrk, X86::VPERMDZ256rmk, 0 }, - { X86::VPERMI2B256rrk, X86::VPERMI2B256rmk, 0 }, - { X86::VPERMI2D256rrk, X86::VPERMI2D256rmk, 0 }, - { X86::VPERMI2PD256rrk, X86::VPERMI2PD256rmk, 0 }, - { X86::VPERMI2PS256rrk, X86::VPERMI2PS256rmk, 0 }, - { X86::VPERMI2Q256rrk, X86::VPERMI2Q256rmk, 0 }, - { X86::VPERMI2W256rrk, X86::VPERMI2W256rmk, 0 }, - { X86::VPERMILPDZ256rrk, X86::VPERMILPDZ256rmk, 0 }, - { X86::VPERMILPSZ256rrk, X86::VPERMILPSZ256rmk, 0 }, - { X86::VPERMPDZ256rrk, X86::VPERMPDZ256rmk, 0 }, - { X86::VPERMPSZ256rrk, 
X86::VPERMPSZ256rmk, 0 }, - { X86::VPERMQZ256rrk, X86::VPERMQZ256rmk, 0 }, - { X86::VPERMT2B256rrk, X86::VPERMT2B256rmk, 0 }, - { X86::VPERMT2D256rrk, X86::VPERMT2D256rmk, 0 }, - { X86::VPERMT2PD256rrk, X86::VPERMT2PD256rmk, 0 }, - { X86::VPERMT2PS256rrk, X86::VPERMT2PS256rmk, 0 }, - { X86::VPERMT2Q256rrk, X86::VPERMT2Q256rmk, 0 }, - { X86::VPERMT2W256rrk, X86::VPERMT2W256rmk, 0 }, - { X86::VPERMWZ256rrk, X86::VPERMWZ256rmk, 0 }, - { X86::VPMADDUBSWZ256rrk, X86::VPMADDUBSWZ256rmk, 0 }, - { X86::VPMADDWDZ256rrk, X86::VPMADDWDZ256rmk, 0 }, - { X86::VPMAXSBZ256rrk, X86::VPMAXSBZ256rmk, 0 }, - { X86::VPMAXSDZ256rrk, X86::VPMAXSDZ256rmk, 0 }, - { X86::VPMAXSQZ256rrk, X86::VPMAXSQZ256rmk, 0 }, - { X86::VPMAXSWZ256rrk, X86::VPMAXSWZ256rmk, 0 }, - { X86::VPMAXUBZ256rrk, X86::VPMAXUBZ256rmk, 0 }, - { X86::VPMAXUDZ256rrk, X86::VPMAXUDZ256rmk, 0 }, - { X86::VPMAXUQZ256rrk, X86::VPMAXUQZ256rmk, 0 }, - { X86::VPMAXUWZ256rrk, X86::VPMAXUWZ256rmk, 0 }, - { X86::VPMINSBZ256rrk, X86::VPMINSBZ256rmk, 0 }, - { X86::VPMINSDZ256rrk, X86::VPMINSDZ256rmk, 0 }, - { X86::VPMINSQZ256rrk, X86::VPMINSQZ256rmk, 0 }, - { X86::VPMINSWZ256rrk, X86::VPMINSWZ256rmk, 0 }, - { X86::VPMINUBZ256rrk, X86::VPMINUBZ256rmk, 0 }, - { X86::VPMINUDZ256rrk, X86::VPMINUDZ256rmk, 0 }, - { X86::VPMINUQZ256rrk, X86::VPMINUQZ256rmk, 0 }, - { X86::VPMINUWZ256rrk, X86::VPMINUWZ256rmk, 0 }, - { X86::VPMULDQZ256rrk, X86::VPMULDQZ256rmk, 0 }, - { X86::VPMULLDZ256rrk, X86::VPMULLDZ256rmk, 0 }, - { X86::VPMULLQZ256rrk, X86::VPMULLQZ256rmk, 0 }, - { X86::VPMULLWZ256rrk, X86::VPMULLWZ256rmk, 0 }, - { X86::VPMULUDQZ256rrk, X86::VPMULUDQZ256rmk, 0 }, - { X86::VPORDZ256rrk, X86::VPORDZ256rmk, 0 }, - { X86::VPORQZ256rrk, X86::VPORQZ256rmk, 0 }, - { X86::VPSHUFBZ256rrk, X86::VPSHUFBZ256rmk, 0 }, - { X86::VPSLLDZ256rrk, X86::VPSLLDZ256rmk, 0 }, - { X86::VPSLLQZ256rrk, X86::VPSLLQZ256rmk, 0 }, - { X86::VPSLLVDZ256rrk, X86::VPSLLVDZ256rmk, 0 }, - { X86::VPSLLVQZ256rrk, X86::VPSLLVQZ256rmk, 0 }, - { X86::VPSLLVWZ256rrk, X86::VPSLLVWZ256rmk, 0 }, - { X86::VPSLLWZ256rrk, X86::VPSLLWZ256rmk, 0 }, - { X86::VPSRADZ256rrk, X86::VPSRADZ256rmk, 0 }, - { X86::VPSRAQZ256rrk, X86::VPSRAQZ256rmk, 0 }, - { X86::VPSRAVDZ256rrk, X86::VPSRAVDZ256rmk, 0 }, - { X86::VPSRAVQZ256rrk, X86::VPSRAVQZ256rmk, 0 }, - { X86::VPSRAVWZ256rrk, X86::VPSRAVWZ256rmk, 0 }, - { X86::VPSRAWZ256rrk, X86::VPSRAWZ256rmk, 0 }, - { X86::VPSRLDZ256rrk, X86::VPSRLDZ256rmk, 0 }, - { X86::VPSRLQZ256rrk, X86::VPSRLQZ256rmk, 0 }, - { X86::VPSRLVDZ256rrk, X86::VPSRLVDZ256rmk, 0 }, - { X86::VPSRLVQZ256rrk, X86::VPSRLVQZ256rmk, 0 }, - { X86::VPSRLVWZ256rrk, X86::VPSRLVWZ256rmk, 0 }, - { X86::VPSRLWZ256rrk, X86::VPSRLWZ256rmk, 0 }, - { X86::VPSUBBZ256rrk, X86::VPSUBBZ256rmk, 0 }, - { X86::VPSUBDZ256rrk, X86::VPSUBDZ256rmk, 0 }, - { X86::VPSUBQZ256rrk, X86::VPSUBQZ256rmk, 0 }, - { X86::VPSUBSBZ256rrk, X86::VPSUBSBZ256rmk, 0 }, - { X86::VPSUBSWZ256rrk, X86::VPSUBSWZ256rmk, 0 }, - { X86::VPSUBUSBZ256rrk, X86::VPSUBUSBZ256rmk, 0 }, - { X86::VPSUBUSWZ256rrk, X86::VPSUBUSWZ256rmk, 0 }, - { X86::VPSUBWZ256rrk, X86::VPSUBWZ256rmk, 0 }, - { X86::VPTERNLOGDZ256rrik, X86::VPTERNLOGDZ256rmik, 0 }, - { X86::VPTERNLOGQZ256rrik, X86::VPTERNLOGQZ256rmik, 0 }, - { X86::VPUNPCKHBWZ256rrk, X86::VPUNPCKHBWZ256rmk, 0 }, - { X86::VPUNPCKHDQZ256rrk, X86::VPUNPCKHDQZ256rmk, 0 }, - { X86::VPUNPCKHQDQZ256rrk, X86::VPUNPCKHQDQZ256rmk, 0 }, - { X86::VPUNPCKHWDZ256rrk, X86::VPUNPCKHWDZ256rmk, 0 }, - { X86::VPUNPCKLBWZ256rrk, X86::VPUNPCKLBWZ256rmk, 0 }, - { X86::VPUNPCKLDQZ256rrk, X86::VPUNPCKLDQZ256rmk, 0 }, - { 
X86::VPUNPCKLQDQZ256rrk, X86::VPUNPCKLQDQZ256rmk, 0 }, - { X86::VPUNPCKLWDZ256rrk, X86::VPUNPCKLWDZ256rmk, 0 }, - { X86::VPXORDZ256rrk, X86::VPXORDZ256rmk, 0 }, - { X86::VPXORQZ256rrk, X86::VPXORQZ256rmk, 0 }, - { X86::VSHUFPDZ256rrik, X86::VSHUFPDZ256rmik, 0 }, - { X86::VSHUFPSZ256rrik, X86::VSHUFPSZ256rmik, 0 }, - { X86::VSUBPDZ256rrk, X86::VSUBPDZ256rmk, 0 }, - { X86::VSUBPSZ256rrk, X86::VSUBPSZ256rmk, 0 }, - { X86::VUNPCKHPDZ256rrk, X86::VUNPCKHPDZ256rmk, 0 }, - { X86::VUNPCKHPSZ256rrk, X86::VUNPCKHPSZ256rmk, 0 }, - { X86::VUNPCKLPDZ256rrk, X86::VUNPCKLPDZ256rmk, 0 }, - { X86::VUNPCKLPSZ256rrk, X86::VUNPCKLPSZ256rmk, 0 }, - { X86::VXORPDZ256rrk, X86::VXORPDZ256rmk, 0 }, - { X86::VXORPSZ256rrk, X86::VXORPSZ256rmk, 0 }, - - // AVX-512{F,VL} foldable instructions 128-bit - { X86::VADDPDZ128rrk, X86::VADDPDZ128rmk, 0 }, - { X86::VADDPSZ128rrk, X86::VADDPSZ128rmk, 0 }, - { X86::VALIGNDZ128rrik, X86::VALIGNDZ128rmik, 0 }, - { X86::VALIGNQZ128rrik, X86::VALIGNQZ128rmik, 0 }, - { X86::VANDNPDZ128rrk, X86::VANDNPDZ128rmk, 0 }, - { X86::VANDNPSZ128rrk, X86::VANDNPSZ128rmk, 0 }, - { X86::VANDPDZ128rrk, X86::VANDPDZ128rmk, 0 }, - { X86::VANDPSZ128rrk, X86::VANDPSZ128rmk, 0 }, - { X86::VDIVPDZ128rrk, X86::VDIVPDZ128rmk, 0 }, - { X86::VDIVPSZ128rrk, X86::VDIVPSZ128rmk, 0 }, - { X86::VMAXCPDZ128rrk, X86::VMAXCPDZ128rmk, 0 }, - { X86::VMAXCPSZ128rrk, X86::VMAXCPSZ128rmk, 0 }, - { X86::VMAXPDZ128rrk, X86::VMAXPDZ128rmk, 0 }, - { X86::VMAXPSZ128rrk, X86::VMAXPSZ128rmk, 0 }, - { X86::VMINCPDZ128rrk, X86::VMINCPDZ128rmk, 0 }, - { X86::VMINCPSZ128rrk, X86::VMINCPSZ128rmk, 0 }, - { X86::VMINPDZ128rrk, X86::VMINPDZ128rmk, 0 }, - { X86::VMINPSZ128rrk, X86::VMINPSZ128rmk, 0 }, - { X86::VMULPDZ128rrk, X86::VMULPDZ128rmk, 0 }, - { X86::VMULPSZ128rrk, X86::VMULPSZ128rmk, 0 }, - { X86::VORPDZ128rrk, X86::VORPDZ128rmk, 0 }, - { X86::VORPSZ128rrk, X86::VORPSZ128rmk, 0 }, - { X86::VPACKSSDWZ128rrk, X86::VPACKSSDWZ128rmk, 0 }, - { X86::VPACKSSWBZ128rrk, X86::VPACKSSWBZ128rmk, 0 }, - { X86::VPACKUSDWZ128rrk, X86::VPACKUSDWZ128rmk, 0 }, - { X86::VPACKUSWBZ128rrk, X86::VPACKUSWBZ128rmk, 0 }, - { X86::VPADDBZ128rrk, X86::VPADDBZ128rmk, 0 }, - { X86::VPADDDZ128rrk, X86::VPADDDZ128rmk, 0 }, - { X86::VPADDQZ128rrk, X86::VPADDQZ128rmk, 0 }, - { X86::VPADDSBZ128rrk, X86::VPADDSBZ128rmk, 0 }, - { X86::VPADDSWZ128rrk, X86::VPADDSWZ128rmk, 0 }, - { X86::VPADDUSBZ128rrk, X86::VPADDUSBZ128rmk, 0 }, - { X86::VPADDUSWZ128rrk, X86::VPADDUSWZ128rmk, 0 }, - { X86::VPADDWZ128rrk, X86::VPADDWZ128rmk, 0 }, - { X86::VPALIGNRZ128rrik, X86::VPALIGNRZ128rmik, 0 }, - { X86::VPANDDZ128rrk, X86::VPANDDZ128rmk, 0 }, - { X86::VPANDNDZ128rrk, X86::VPANDNDZ128rmk, 0 }, - { X86::VPANDNQZ128rrk, X86::VPANDNQZ128rmk, 0 }, - { X86::VPANDQZ128rrk, X86::VPANDQZ128rmk, 0 }, - { X86::VPAVGBZ128rrk, X86::VPAVGBZ128rmk, 0 }, - { X86::VPAVGWZ128rrk, X86::VPAVGWZ128rmk, 0 }, - { X86::VPERMBZ128rrk, X86::VPERMBZ128rmk, 0 }, - { X86::VPERMI2B128rrk, X86::VPERMI2B128rmk, 0 }, - { X86::VPERMI2D128rrk, X86::VPERMI2D128rmk, 0 }, - { X86::VPERMI2PD128rrk, X86::VPERMI2PD128rmk, 0 }, - { X86::VPERMI2PS128rrk, X86::VPERMI2PS128rmk, 0 }, - { X86::VPERMI2Q128rrk, X86::VPERMI2Q128rmk, 0 }, - { X86::VPERMI2W128rrk, X86::VPERMI2W128rmk, 0 }, - { X86::VPERMILPDZ128rrk, X86::VPERMILPDZ128rmk, 0 }, - { X86::VPERMILPSZ128rrk, X86::VPERMILPSZ128rmk, 0 }, - { X86::VPERMT2B128rrk, X86::VPERMT2B128rmk, 0 }, - { X86::VPERMT2D128rrk, X86::VPERMT2D128rmk, 0 }, - { X86::VPERMT2PD128rrk, X86::VPERMT2PD128rmk, 0 }, - { X86::VPERMT2PS128rrk, X86::VPERMT2PS128rmk, 0 }, - { 
X86::VPERMT2Q128rrk, X86::VPERMT2Q128rmk, 0 }, - { X86::VPERMT2W128rrk, X86::VPERMT2W128rmk, 0 }, - { X86::VPERMWZ128rrk, X86::VPERMWZ128rmk, 0 }, - { X86::VPMADDUBSWZ128rrk, X86::VPMADDUBSWZ128rmk, 0 }, - { X86::VPMADDWDZ128rrk, X86::VPMADDWDZ128rmk, 0 }, - { X86::VPMAXSBZ128rrk, X86::VPMAXSBZ128rmk, 0 }, - { X86::VPMAXSDZ128rrk, X86::VPMAXSDZ128rmk, 0 }, - { X86::VPMAXSQZ128rrk, X86::VPMAXSQZ128rmk, 0 }, - { X86::VPMAXSWZ128rrk, X86::VPMAXSWZ128rmk, 0 }, - { X86::VPMAXUBZ128rrk, X86::VPMAXUBZ128rmk, 0 }, - { X86::VPMAXUDZ128rrk, X86::VPMAXUDZ128rmk, 0 }, - { X86::VPMAXUQZ128rrk, X86::VPMAXUQZ128rmk, 0 }, - { X86::VPMAXUWZ128rrk, X86::VPMAXUWZ128rmk, 0 }, - { X86::VPMINSBZ128rrk, X86::VPMINSBZ128rmk, 0 }, - { X86::VPMINSDZ128rrk, X86::VPMINSDZ128rmk, 0 }, - { X86::VPMINSQZ128rrk, X86::VPMINSQZ128rmk, 0 }, - { X86::VPMINSWZ128rrk, X86::VPMINSWZ128rmk, 0 }, - { X86::VPMINUBZ128rrk, X86::VPMINUBZ128rmk, 0 }, - { X86::VPMINUDZ128rrk, X86::VPMINUDZ128rmk, 0 }, - { X86::VPMINUQZ128rrk, X86::VPMINUQZ128rmk, 0 }, - { X86::VPMINUWZ128rrk, X86::VPMINUWZ128rmk, 0 }, - { X86::VPMULDQZ128rrk, X86::VPMULDQZ128rmk, 0 }, - { X86::VPMULLDZ128rrk, X86::VPMULLDZ128rmk, 0 }, - { X86::VPMULLQZ128rrk, X86::VPMULLQZ128rmk, 0 }, - { X86::VPMULLWZ128rrk, X86::VPMULLWZ128rmk, 0 }, - { X86::VPMULUDQZ128rrk, X86::VPMULUDQZ128rmk, 0 }, - { X86::VPORDZ128rrk, X86::VPORDZ128rmk, 0 }, - { X86::VPORQZ128rrk, X86::VPORQZ128rmk, 0 }, - { X86::VPSHUFBZ128rrk, X86::VPSHUFBZ128rmk, 0 }, - { X86::VPSLLDZ128rrk, X86::VPSLLDZ128rmk, 0 }, - { X86::VPSLLQZ128rrk, X86::VPSLLQZ128rmk, 0 }, - { X86::VPSLLVDZ128rrk, X86::VPSLLVDZ128rmk, 0 }, - { X86::VPSLLVQZ128rrk, X86::VPSLLVQZ128rmk, 0 }, - { X86::VPSLLVWZ128rrk, X86::VPSLLVWZ128rmk, 0 }, - { X86::VPSLLWZ128rrk, X86::VPSLLWZ128rmk, 0 }, - { X86::VPSRADZ128rrk, X86::VPSRADZ128rmk, 0 }, - { X86::VPSRAQZ128rrk, X86::VPSRAQZ128rmk, 0 }, - { X86::VPSRAVDZ128rrk, X86::VPSRAVDZ128rmk, 0 }, - { X86::VPSRAVQZ128rrk, X86::VPSRAVQZ128rmk, 0 }, - { X86::VPSRAVWZ128rrk, X86::VPSRAVWZ128rmk, 0 }, - { X86::VPSRAWZ128rrk, X86::VPSRAWZ128rmk, 0 }, - { X86::VPSRLDZ128rrk, X86::VPSRLDZ128rmk, 0 }, - { X86::VPSRLQZ128rrk, X86::VPSRLQZ128rmk, 0 }, - { X86::VPSRLVDZ128rrk, X86::VPSRLVDZ128rmk, 0 }, - { X86::VPSRLVQZ128rrk, X86::VPSRLVQZ128rmk, 0 }, - { X86::VPSRLVWZ128rrk, X86::VPSRLVWZ128rmk, 0 }, - { X86::VPSRLWZ128rrk, X86::VPSRLWZ128rmk, 0 }, - { X86::VPSUBBZ128rrk, X86::VPSUBBZ128rmk, 0 }, - { X86::VPSUBDZ128rrk, X86::VPSUBDZ128rmk, 0 }, - { X86::VPSUBQZ128rrk, X86::VPSUBQZ128rmk, 0 }, - { X86::VPSUBSBZ128rrk, X86::VPSUBSBZ128rmk, 0 }, - { X86::VPSUBSWZ128rrk, X86::VPSUBSWZ128rmk, 0 }, - { X86::VPSUBUSBZ128rrk, X86::VPSUBUSBZ128rmk, 0 }, - { X86::VPSUBUSWZ128rrk, X86::VPSUBUSWZ128rmk, 0 }, - { X86::VPSUBWZ128rrk, X86::VPSUBWZ128rmk, 0 }, - { X86::VPTERNLOGDZ128rrik, X86::VPTERNLOGDZ128rmik, 0 }, - { X86::VPTERNLOGQZ128rrik, X86::VPTERNLOGQZ128rmik, 0 }, - { X86::VPUNPCKHBWZ128rrk, X86::VPUNPCKHBWZ128rmk, 0 }, - { X86::VPUNPCKHDQZ128rrk, X86::VPUNPCKHDQZ128rmk, 0 }, - { X86::VPUNPCKHQDQZ128rrk, X86::VPUNPCKHQDQZ128rmk, 0 }, - { X86::VPUNPCKHWDZ128rrk, X86::VPUNPCKHWDZ128rmk, 0 }, - { X86::VPUNPCKLBWZ128rrk, X86::VPUNPCKLBWZ128rmk, 0 }, - { X86::VPUNPCKLDQZ128rrk, X86::VPUNPCKLDQZ128rmk, 0 }, - { X86::VPUNPCKLQDQZ128rrk, X86::VPUNPCKLQDQZ128rmk, 0 }, - { X86::VPUNPCKLWDZ128rrk, X86::VPUNPCKLWDZ128rmk, 0 }, - { X86::VPXORDZ128rrk, X86::VPXORDZ128rmk, 0 }, - { X86::VPXORQZ128rrk, X86::VPXORQZ128rmk, 0 }, - { X86::VSHUFPDZ128rrik, X86::VSHUFPDZ128rmik, 0 }, - { X86::VSHUFPSZ128rrik, 
X86::VSHUFPSZ128rmik, 0 }, - { X86::VSUBPDZ128rrk, X86::VSUBPDZ128rmk, 0 }, - { X86::VSUBPSZ128rrk, X86::VSUBPSZ128rmk, 0 }, - { X86::VUNPCKHPDZ128rrk, X86::VUNPCKHPDZ128rmk, 0 }, - { X86::VUNPCKHPSZ128rrk, X86::VUNPCKHPSZ128rmk, 0 }, - { X86::VUNPCKLPDZ128rrk, X86::VUNPCKLPDZ128rmk, 0 }, - { X86::VUNPCKLPSZ128rrk, X86::VUNPCKLPSZ128rmk, 0 }, - { X86::VXORPDZ128rrk, X86::VXORPDZ128rmk, 0 }, - { X86::VXORPSZ128rrk, X86::VXORPSZ128rmk, 0 }, - - // 512-bit three source instructions with zero masking. - { X86::VPERMI2Brrkz, X86::VPERMI2Brmkz, 0 }, - { X86::VPERMI2Drrkz, X86::VPERMI2Drmkz, 0 }, - { X86::VPERMI2PSrrkz, X86::VPERMI2PSrmkz, 0 }, - { X86::VPERMI2PDrrkz, X86::VPERMI2PDrmkz, 0 }, - { X86::VPERMI2Qrrkz, X86::VPERMI2Qrmkz, 0 }, - { X86::VPERMI2Wrrkz, X86::VPERMI2Wrmkz, 0 }, - { X86::VPERMT2Brrkz, X86::VPERMT2Brmkz, 0 }, - { X86::VPERMT2Drrkz, X86::VPERMT2Drmkz, 0 }, - { X86::VPERMT2PSrrkz, X86::VPERMT2PSrmkz, 0 }, - { X86::VPERMT2PDrrkz, X86::VPERMT2PDrmkz, 0 }, - { X86::VPERMT2Qrrkz, X86::VPERMT2Qrmkz, 0 }, - { X86::VPERMT2Wrrkz, X86::VPERMT2Wrmkz, 0 }, - { X86::VPTERNLOGDZrrikz, X86::VPTERNLOGDZrmikz, 0 }, - { X86::VPTERNLOGQZrrikz, X86::VPTERNLOGQZrmikz, 0 }, - - // 256-bit three source instructions with zero masking. - { X86::VPERMI2B256rrkz, X86::VPERMI2B256rmkz, 0 }, - { X86::VPERMI2D256rrkz, X86::VPERMI2D256rmkz, 0 }, - { X86::VPERMI2PD256rrkz, X86::VPERMI2PD256rmkz, 0 }, - { X86::VPERMI2PS256rrkz, X86::VPERMI2PS256rmkz, 0 }, - { X86::VPERMI2Q256rrkz, X86::VPERMI2Q256rmkz, 0 }, - { X86::VPERMI2W256rrkz, X86::VPERMI2W256rmkz, 0 }, - { X86::VPERMT2B256rrkz, X86::VPERMT2B256rmkz, 0 }, - { X86::VPERMT2D256rrkz, X86::VPERMT2D256rmkz, 0 }, - { X86::VPERMT2PD256rrkz, X86::VPERMT2PD256rmkz, 0 }, - { X86::VPERMT2PS256rrkz, X86::VPERMT2PS256rmkz, 0 }, - { X86::VPERMT2Q256rrkz, X86::VPERMT2Q256rmkz, 0 }, - { X86::VPERMT2W256rrkz, X86::VPERMT2W256rmkz, 0 }, - { X86::VPTERNLOGDZ256rrikz,X86::VPTERNLOGDZ256rmikz, 0 }, - { X86::VPTERNLOGQZ256rrikz,X86::VPTERNLOGQZ256rmikz, 0 }, - - // 128-bit three source instructions with zero masking. - { X86::VPERMI2B128rrkz, X86::VPERMI2B128rmkz, 0 }, - { X86::VPERMI2D128rrkz, X86::VPERMI2D128rmkz, 0 }, - { X86::VPERMI2PD128rrkz, X86::VPERMI2PD128rmkz, 0 }, - { X86::VPERMI2PS128rrkz, X86::VPERMI2PS128rmkz, 0 }, - { X86::VPERMI2Q128rrkz, X86::VPERMI2Q128rmkz, 0 }, - { X86::VPERMI2W128rrkz, X86::VPERMI2W128rmkz, 0 }, - { X86::VPERMT2B128rrkz, X86::VPERMT2B128rmkz, 0 }, - { X86::VPERMT2D128rrkz, X86::VPERMT2D128rmkz, 0 }, - { X86::VPERMT2PD128rrkz, X86::VPERMT2PD128rmkz, 0 }, - { X86::VPERMT2PS128rrkz, X86::VPERMT2PS128rmkz, 0 }, - { X86::VPERMT2Q128rrkz, X86::VPERMT2Q128rmkz, 0 }, - { X86::VPERMT2W128rrkz, X86::VPERMT2W128rmkz, 0 }, - { X86::VPTERNLOGDZ128rrikz,X86::VPTERNLOGDZ128rmikz, 0 }, - { X86::VPTERNLOGQZ128rrikz,X86::VPTERNLOGQZ128rmikz, 0 }, - }; for (X86MemoryFoldTableEntry Entry : MemoryFoldTable4) { AddTableEntry(RegOp2MemOpTable4, MemOp2RegOpTable, @@ -3539,20 +163,6 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) // Index 4, folded load Entry.Flags | TB_INDEX_4 | TB_FOLDED_LOAD); } - for (I = X86InstrFMA3Info::rm_begin(); I != E; ++I) { - if (I.getGroup()->isKMasked()) { - // Intrinsics need to pass TB_NO_REVERSE. 
- if (I.getGroup()->isIntrinsic()) { - AddTableEntry(RegOp2MemOpTable4, MemOp2RegOpTable, - I.getRegOpcode(), I.getMemOpcode(), - TB_ALIGN_NONE | TB_INDEX_4 | TB_FOLDED_LOAD | TB_NO_REVERSE); - } else { - AddTableEntry(RegOp2MemOpTable4, MemOp2RegOpTable, - I.getRegOpcode(), I.getMemOpcode(), - TB_ALIGN_NONE | TB_INDEX_4 | TB_FOLDED_LOAD); - } - } - } } void @@ -5930,7 +2540,7 @@ void X86InstrInfo::replaceBranchWithTailCall( // Add implicit uses and defs of all live regs potentially clobbered by the // call. This way they still appear live across the call. - LivePhysRegs LiveRegs(&getRegisterInfo()); + LivePhysRegs LiveRegs(getRegisterInfo()); LiveRegs.addLiveOuts(MBB); SmallVector<std::pair<unsigned, const MachineOperand *>, 8> Clobbers; LiveRegs.stepForward(*MIB, Clobbers); @@ -6545,9 +3155,9 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB, // first frame index. // See X86ISelLowering.cpp - X86::hasCopyImplyingStackAdjustment. - const TargetRegisterInfo *TRI = &getRegisterInfo(); + const TargetRegisterInfo &TRI = getRegisterInfo(); MachineBasicBlock::LivenessQueryResult LQR = - MBB.computeRegisterLiveness(TRI, AX, MI); + MBB.computeRegisterLiveness(&TRI, AX, MI); // We do not want to save and restore AX if we do not have to. // Moreover, if we do so whereas AX is dead, we would need to set // an undef flag on the use of AX, otherwise the verifier will @@ -6564,7 +3174,7 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB, } // AX contains the top most register in the aliasing hierarchy. // It may not be live, but one of its aliases may be. - for (MCRegAliasIterator AI(AX, TRI, true); + for (MCRegAliasIterator AI(AX, &TRI, true); AI.isValid() && LQR != MachineBasicBlock::LQR_Live; ++AI) LQR = LPR.contains(*AI) ? MachineBasicBlock::LQR_Live : MachineBasicBlock::LQR_Dead; @@ -8374,7 +4984,7 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI, unsigned Opc = LoadMI.getOpcode(); unsigned UserOpc = UserMI.getOpcode(); const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); - const TargetRegisterClass *RC = + const TargetRegisterClass *RC = MF.getRegInfo().getRegClass(LoadMI.getOperand(0).getReg()); unsigned RegSize = TRI.getRegSizeInBits(*RC); @@ -10473,7 +7083,7 @@ X86InstrInfo::getOutliningType(MachineInstr &MI) const { // catch it. if (MI.modifiesRegister(X86::RSP, &RI) || MI.readsRegister(X86::RSP, &RI) || MI.getDesc().hasImplicitUseOfPhysReg(X86::RSP) || - MI.getDesc().hasImplicitDefOfPhysReg(X86::RSP)) + MI.getDesc().hasImplicitDefOfPhysReg(X86::RSP)) return MachineOutlinerInstrType::Illegal; // Outlined calls change the instruction pointer, so don't read from it. diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td index 01df07e1715f..fab70e918b8a 100644 --- a/lib/Target/X86/X86InstrInfo.td +++ b/lib/Target/X86/X86InstrInfo.td @@ -813,6 +813,8 @@ def UseAVX2 : Predicate<"Subtarget->hasAVX2() && !Subtarget->hasAVX512()">; def NoAVX512 : Predicate<"!Subtarget->hasAVX512()">; def HasCDI : Predicate<"Subtarget->hasCDI()">, AssemblerPredicate<"FeatureCDI", "AVX-512 CD ISA">; +def HasVPOPCNTDQ : Predicate<"Subtarget->hasVPOPCNTDQ()">, + AssemblerPredicate<"FeatureVPOPCNTDQ", "AVX-512 VPOPCNTDQ ISA">; def HasPFI : Predicate<"Subtarget->hasPFI()">, AssemblerPredicate<"FeaturePFI", "AVX-512 PF ISA">; def HasERI : Predicate<"Subtarget->hasERI()">, @@ -1436,11 +1438,14 @@ def MOV64ri : RIi64<0xB8, AddRegFrm, (outs GR64:$dst), (ins i64imm:$src), // Longer forms that use a ModR/M byte. 
Needed for disassembler let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in { def MOV8ri_alt : Ii8 <0xC6, MRM0r, (outs GR8 :$dst), (ins i8imm :$src), - "mov{b}\t{$src, $dst|$dst, $src}", [], IIC_MOV>; + "mov{b}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, + FoldGenData<"MOV8ri">; def MOV16ri_alt : Ii16<0xC7, MRM0r, (outs GR16:$dst), (ins i16imm:$src), - "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize16; + "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize16, + FoldGenData<"MOV16ri">; def MOV32ri_alt : Ii32<0xC7, MRM0r, (outs GR32:$dst), (ins i32imm:$src), - "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize32; + "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize32, + FoldGenData<"MOV32ri">; } } // SchedRW @@ -1563,13 +1568,17 @@ def MOV64o64a : RIi64<0xA3, RawFrmMemOffs, (outs), (ins offset64_64:$dst), let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, SchedRW = [WriteMove] in { def MOV8rr_REV : I<0x8A, MRMSrcReg, (outs GR8:$dst), (ins GR8:$src), - "mov{b}\t{$src, $dst|$dst, $src}", [], IIC_MOV>; + "mov{b}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, + FoldGenData<"MOV8rr">; def MOV16rr_REV : I<0x8B, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), - "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize16; + "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize16, + FoldGenData<"MOV16rr">; def MOV32rr_REV : I<0x8B, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), - "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize32; + "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize32, + FoldGenData<"MOV32rr">; def MOV64rr_REV : RI<0x8B, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), - "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV>; + "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, + FoldGenData<"MOV64rr">; } let canFoldAsLoad = 1, isReMaterializable = 1, SchedRW = [WriteLoad] in { diff --git a/lib/Target/X86/X86InstrMMX.td b/lib/Target/X86/X86InstrMMX.td index dc3800ce381b..2c047722db24 100644 --- a/lib/Target/X86/X86InstrMMX.td +++ b/lib/Target/X86/X86InstrMMX.td @@ -248,7 +248,8 @@ def MMX_MOVD64grr : MMXI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR64:$src), "movd\t{$src, $dst|$dst, $src}", [(set GR32:$dst, (MMX_X86movd2w (x86mmx VR64:$src)))], - IIC_MMX_MOV_REG_MM>, Sched<[WriteMove]>; + IIC_MMX_MOV_REG_MM>, Sched<[WriteMove]>, + FoldGenData<"MMX_MOVD64rr">; let isBitcast = 1 in def MMX_MOVD64to64rr : MMXRI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR64:$src), @@ -277,7 +278,7 @@ def MMX_MOVQ64rr : MMXI<0x6F, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src), let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in { def MMX_MOVQ64rr_REV : MMXI<0x7F, MRMDestReg, (outs VR64:$dst), (ins VR64:$src), "movq\t{$src, $dst|$dst, $src}", [], - IIC_MMX_MOVQ_RR>; + IIC_MMX_MOVQ_RR>, FoldGenData<"MMX_MOVQ64rr">; } } // SchedRW diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index f73d85e7e01b..a3e677209305 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -507,7 +507,8 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, multiclass sse12_move_rr<RegisterClass RC, SDNode OpNode, ValueType vt, X86MemOperand x86memop, string base_opc, - string asm_opr, Domain d = GenericDomain> { + string asm_opr, Domain d = GenericDomain, + string Name> { let isCommutable = 1 in def rr : SI<0x10, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, RC:$src2), @@ -521,15 +522,17 @@ multiclass sse12_move_rr<RegisterClass RC, SDNode OpNode, ValueType vt, def rr_REV : SI<0x11, MRMDestReg, (outs 
                  VR128:$dst),
                  (ins VR128:$src1, RC:$src2), !strconcat(base_opc, asm_opr),
-                 [], IIC_SSE_MOV_S_RR>, Sched<[WriteFShuffle]>;
+                 [], IIC_SSE_MOV_S_RR>, Sched<[WriteFShuffle]>,
+                 FoldGenData<Name#rr>;
 }
 
 multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt,
                       X86MemOperand x86memop, string OpcodeStr,
-                      Domain d = GenericDomain> {
+                      Domain d = GenericDomain, string Name> {
   // AVX
   defm V#NAME : sse12_move_rr<RC, OpNode, vt, x86memop, OpcodeStr,
-                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}", d>,
+                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}", d,
+                              "V"#Name>,
                               VEX_4V, VEX_LIG, VEX_WIG;
 
   def V#NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
@@ -539,7 +542,7 @@ multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt,
   // SSE1 & 2
   let Constraints = "$src1 = $dst" in {
     defm NAME : sse12_move_rr<RC, OpNode, vt, x86memop, OpcodeStr,
-                              "\t{$src2, $dst|$dst, $src2}", d>;
+                              "\t{$src2, $dst|$dst, $src2}", d, Name>;
   }
 
   def NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
@@ -563,9 +566,9 @@ multiclass sse12_move_rm<RegisterClass RC, X86MemOperand x86memop,
 }
 
 defm MOVSS : sse12_move<FR32, X86Movss, v4f32, f32mem, "movss",
-                        SSEPackedSingle>, XS;
+                        SSEPackedSingle, "MOVSS">, XS;
 defm MOVSD : sse12_move<FR64, X86Movsd, v2f64, f64mem, "movsd",
-                        SSEPackedDouble>, XD;
+                        SSEPackedDouble, "MOVSD">, XD;
 
 let canFoldAsLoad = 1, isReMaterializable = 1 in {
   defm MOVSS : sse12_move_rm<FR32, f32mem, loadf32, "movss",
@@ -864,35 +867,43 @@ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
 def VMOVAPSrr_REV : VPSI<0x29, MRMDestReg, (outs VR128:$dst),
                          (ins VR128:$src),
                          "movaps\t{$src, $dst|$dst, $src}", [],
-                         IIC_SSE_MOVA_P_RR>, VEX, VEX_WIG;
+                         IIC_SSE_MOVA_P_RR>, VEX, VEX_WIG,
+                         FoldGenData<"VMOVAPSrr">;
 def VMOVAPDrr_REV : VPDI<0x29, MRMDestReg, (outs VR128:$dst),
                          (ins VR128:$src),
                          "movapd\t{$src, $dst|$dst, $src}", [],
-                         IIC_SSE_MOVA_P_RR>, VEX, VEX_WIG;
+                         IIC_SSE_MOVA_P_RR>, VEX, VEX_WIG,
+                         FoldGenData<"VMOVAPDrr">;
 def VMOVUPSrr_REV : VPSI<0x11, MRMDestReg, (outs VR128:$dst),
                          (ins VR128:$src),
                          "movups\t{$src, $dst|$dst, $src}", [],
-                         IIC_SSE_MOVU_P_RR>, VEX, VEX_WIG;
+                         IIC_SSE_MOVU_P_RR>, VEX, VEX_WIG,
+                         FoldGenData<"VMOVUPSrr">;
 def VMOVUPDrr_REV : VPDI<0x11, MRMDestReg, (outs VR128:$dst),
                          (ins VR128:$src),
                          "movupd\t{$src, $dst|$dst, $src}", [],
-                         IIC_SSE_MOVU_P_RR>, VEX, VEX_WIG;
+                         IIC_SSE_MOVU_P_RR>, VEX, VEX_WIG,
+                         FoldGenData<"VMOVUPDrr">;
 def VMOVAPSYrr_REV : VPSI<0x29, MRMDestReg, (outs VR256:$dst),
                           (ins VR256:$src),
                           "movaps\t{$src, $dst|$dst, $src}", [],
-                          IIC_SSE_MOVA_P_RR>, VEX, VEX_L, VEX_WIG;
+                          IIC_SSE_MOVA_P_RR>, VEX, VEX_L, VEX_WIG,
+                          FoldGenData<"VMOVAPSYrr">;
 def VMOVAPDYrr_REV : VPDI<0x29, MRMDestReg, (outs VR256:$dst),
                           (ins VR256:$src),
                           "movapd\t{$src, $dst|$dst, $src}", [],
-                          IIC_SSE_MOVA_P_RR>, VEX, VEX_L, VEX_WIG;
+                          IIC_SSE_MOVA_P_RR>, VEX, VEX_L, VEX_WIG,
+                          FoldGenData<"VMOVAPDYrr">;
 def VMOVUPSYrr_REV : VPSI<0x11, MRMDestReg, (outs VR256:$dst),
                           (ins VR256:$src),
                           "movups\t{$src, $dst|$dst, $src}", [],
-                          IIC_SSE_MOVU_P_RR>, VEX, VEX_L, VEX_WIG;
+                          IIC_SSE_MOVU_P_RR>, VEX, VEX_L, VEX_WIG,
+                          FoldGenData<"VMOVUPSYrr">;
 def VMOVUPDYrr_REV : VPDI<0x11, MRMDestReg, (outs VR256:$dst),
                           (ins VR256:$src),
                           "movupd\t{$src, $dst|$dst, $src}", [],
-                          IIC_SSE_MOVU_P_RR>, VEX, VEX_L, VEX_WIG;
+                          IIC_SSE_MOVU_P_RR>, VEX, VEX_L, VEX_WIG,
+                          FoldGenData<"VMOVUPDYrr">;
 }
 
 // Aliases to help the assembler pick two byte VEX encodings by swapping the
@@ -938,16 +949,16 @@ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
     SchedRW = [WriteFShuffle] in {
 def MOVAPSrr_REV : PSI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                        "movaps\t{$src, $dst|$dst, $src}", [],
-                       IIC_SSE_MOVA_P_RR>;
+                       IIC_SSE_MOVA_P_RR>, FoldGenData<"MOVAPSrr">;
 def MOVAPDrr_REV : PDI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                        "movapd\t{$src, $dst|$dst, $src}", [],
-                       IIC_SSE_MOVA_P_RR>;
+                       IIC_SSE_MOVA_P_RR>, FoldGenData<"MOVAPDrr">;
 def MOVUPSrr_REV : PSI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                        "movups\t{$src, $dst|$dst, $src}", [],
-                       IIC_SSE_MOVU_P_RR>;
+                       IIC_SSE_MOVU_P_RR>, FoldGenData<"MOVUPSrr">;
 def MOVUPDrr_REV : PDI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                        "movupd\t{$src, $dst|$dst, $src}", [],
-                       IIC_SSE_MOVU_P_RR>;
+                       IIC_SSE_MOVU_P_RR>, FoldGenData<"MOVUPDrr">;
 }
 
 let Predicates = [HasAVX, NoVLX] in {
@@ -3752,17 +3763,19 @@ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
 def VMOVDQArr_REV : VPDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                          "movdqa\t{$src, $dst|$dst, $src}", [],
                          IIC_SSE_MOVA_P_RR>,
-                         VEX, VEX_WIG;
+                         VEX, VEX_WIG, FoldGenData<"VMOVDQArr">;
 def VMOVDQAYrr_REV : VPDI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
                           "movdqa\t{$src, $dst|$dst, $src}", [],
-                          IIC_SSE_MOVA_P_RR>, VEX, VEX_L, VEX_WIG;
+                          IIC_SSE_MOVA_P_RR>, VEX, VEX_L, VEX_WIG,
+                          FoldGenData<"VMOVDQAYrr">;
 def VMOVDQUrr_REV : VSSI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                          "movdqu\t{$src, $dst|$dst, $src}", [],
                          IIC_SSE_MOVU_P_RR>,
-                         VEX, VEX_WIG;
+                         VEX, VEX_WIG, FoldGenData<"VMOVDQUrr">;
 def VMOVDQUYrr_REV : VSSI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
                           "movdqu\t{$src, $dst|$dst, $src}", [],
-                          IIC_SSE_MOVU_P_RR>, VEX, VEX_L, VEX_WIG;
+                          IIC_SSE_MOVU_P_RR>, VEX, VEX_L, VEX_WIG,
+                          FoldGenData<"VMOVDQUYrr">;
 }
 
 let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1,
@@ -3820,11 +3833,12 @@ def MOVDQUrr : I<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
 def MOVDQArr_REV : PDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                        "movdqa\t{$src, $dst|$dst, $src}", [],
-                       IIC_SSE_MOVA_P_RR>;
+                       IIC_SSE_MOVA_P_RR>, FoldGenData<"MOVDQArr">;
 def MOVDQUrr_REV : I<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                      "movdqu\t{$src, $dst|$dst, $src}",
-                     [], IIC_SSE_MOVU_P_RR>, XS, Requires<[UseSSE2]>;
+                     [], IIC_SSE_MOVU_P_RR>, XS, Requires<[UseSSE2]>,
+                     FoldGenData<"MOVDQUrr">;
 }
 } // SchedRW
 
@@ -5915,7 +5929,7 @@ multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> {
                  (ins VR128:$src1, u8imm:$src2),
                  !strconcat(OpcodeStr,
                   "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
-                 []>, Sched<[WriteShuffle]>;
+                 []>, Sched<[WriteShuffle]>, FoldGenData<NAME#ri>;
 
 let hasSideEffects = 0, mayStore = 1,
     SchedRW = [WriteShuffleLd, WriteRMW] in
diff --git a/lib/Target/X86/X86InstrXOP.td b/lib/Target/X86/X86InstrXOP.td
index 53224431c0e9..5dde2d07babe 100644
--- a/lib/Target/X86/X86InstrXOP.td
+++ b/lib/Target/X86/X86InstrXOP.td
@@ -111,7 +111,7 @@ multiclass xop3op<bits<8> opc, string OpcodeStr, SDNode OpNode,
             (ins VR128:$src1, VR128:$src2),
             !strconcat(OpcodeStr,
             "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>,
-            XOP_4V, VEX_W, Sched<[WriteVarVecShift]>;
+            XOP_4V, VEX_W, Sched<[WriteVarVecShift]>, FoldGenData<NAME#rr>;
 }
 
 let ExeDomain = SSEPackedInt in {
@@ -282,7 +282,7 @@ multiclass xop4op<bits<8> opc, string OpcodeStr, SDNode OpNode,
             (ins VR128:$src1, VR128:$src2, VR128:$src3),
             !strconcat(OpcodeStr,
             "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
-            []>, XOP_4V, VEX_W;
+            []>, XOP_4V, VEX_W, FoldGenData<NAME#rrr>;
 }
 
 let ExeDomain = SSEPackedInt in {
@@ -318,7 +318,7 @@ multiclass xop4op_int<bits<8> opc, string OpcodeStr, RegisterClass RC,
             (ins RC:$src1, RC:$src2, RC:$src3),
             !strconcat(OpcodeStr,
             "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
-            []>, XOP_4V, VEX_W;
+            []>, XOP_4V, VEX_W, FoldGenData<NAME#rrr>;
 }
 
 let ExeDomain = SSEPackedInt in {
@@ -357,7 +357,7 @@ multiclass xop_vpermil2<bits<8> Opc, string OpcodeStr, RegisterClass RC,
             (ins RC:$src1, RC:$src2, RC:$src3, u8imm:$src4),
             !strconcat(OpcodeStr,
             "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
-            []>, VEX_W;
+            []>, VEX_W, FoldGenData<NAME#rr>;
 }
 
 let ExeDomain = SSEPackedDouble in {
diff --git a/lib/Target/X86/X86InstructionSelector.cpp b/lib/Target/X86/X86InstructionSelector.cpp
index 61956f741820..77dead8d2413 100644
--- a/lib/Target/X86/X86InstructionSelector.cpp
+++ b/lib/Target/X86/X86InstructionSelector.cpp
@@ -302,6 +302,26 @@ unsigned X86InstructionSelector::getLoadStoreOp(LLT &Ty, const RegisterBank &RB,
                        : HasAVX512 ? X86::VMOVUPSZ128mr_NOVLX
                        : HasAVX ? X86::VMOVUPSmr
                        : X86::MOVUPSmr);
+  } else if (Ty.isVector() && Ty.getSizeInBits() == 256) {
+    if (Alignment >= 32)
+      return Isload ? (HasVLX ? X86::VMOVAPSZ256rm
+                              : HasAVX512 ? X86::VMOVAPSZ256rm_NOVLX
+                                          : X86::VMOVAPSYrm)
+                    : (HasVLX ? X86::VMOVAPSZ256mr
+                              : HasAVX512 ? X86::VMOVAPSZ256mr_NOVLX
+                                          : X86::VMOVAPSYmr);
+    else
+      return Isload ? (HasVLX ? X86::VMOVUPSZ256rm
+                              : HasAVX512 ? X86::VMOVUPSZ256rm_NOVLX
+                                          : X86::VMOVUPSYrm)
+                    : (HasVLX ? X86::VMOVUPSZ256mr
+                              : HasAVX512 ? X86::VMOVUPSZ256mr_NOVLX
+                                          : X86::VMOVUPSYmr);
+  } else if (Ty.isVector() && Ty.getSizeInBits() == 512) {
+    if (Alignment >= 64)
+      return Isload ? X86::VMOVAPSZrm : X86::VMOVAPSZmr;
+    else
+      return Isload ? X86::VMOVUPSZrm : X86::VMOVUPSZmr;
   }
   return Opc;
 }
diff --git a/lib/Target/X86/X86LegalizerInfo.cpp b/lib/Target/X86/X86LegalizerInfo.cpp
index da724f5d8989..979aaee110aa 100644
--- a/lib/Target/X86/X86LegalizerInfo.cpp
+++ b/lib/Target/X86/X86LegalizerInfo.cpp
@@ -35,6 +35,7 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI,
   setLegalizerInfoSSE1();
   setLegalizerInfoSSE2();
   setLegalizerInfoSSE41();
+  setLegalizerInfoAVX();
   setLegalizerInfoAVX2();
   setLegalizerInfoAVX512();
   setLegalizerInfoAVX512DQ();
@@ -209,6 +210,18 @@ void X86LegalizerInfo::setLegalizerInfoSSE41() {
   setAction({G_MUL, v4s32}, Legal);
 }
 
+void X86LegalizerInfo::setLegalizerInfoAVX() {
+  if (!Subtarget.hasAVX())
+    return;
+
+  const LLT v8s32 = LLT::vector(8, 32);
+  const LLT v4s64 = LLT::vector(4, 64);
+
+  for (unsigned MemOp : {G_LOAD, G_STORE})
+    for (auto Ty : {v8s32, v4s64})
+      setAction({MemOp, Ty}, Legal);
+}
+
 void X86LegalizerInfo::setLegalizerInfoAVX2() {
   if (!Subtarget.hasAVX2())
     return;
@@ -239,6 +252,10 @@ void X86LegalizerInfo::setLegalizerInfoAVX512() {
 
   setAction({G_MUL, v16s32}, Legal);
 
+  for (unsigned MemOp : {G_LOAD, G_STORE})
+    for (auto Ty : {v16s32, v8s64})
+      setAction({MemOp, Ty}, Legal);
+
   /************ VLX *******************/
   if (!Subtarget.hasVLX())
     return;
diff --git a/lib/Target/X86/X86LegalizerInfo.h b/lib/Target/X86/X86LegalizerInfo.h
index ab5405a70427..135950a95f84 100644
--- a/lib/Target/X86/X86LegalizerInfo.h
+++ b/lib/Target/X86/X86LegalizerInfo.h
@@ -39,6 +39,7 @@ private:
   void setLegalizerInfoSSE1();
   void setLegalizerInfoSSE2();
   void setLegalizerInfoSSE41();
+  void setLegalizerInfoAVX();
   void setLegalizerInfoAVX2();
   void setLegalizerInfoAVX512();
   void setLegalizerInfoAVX512DQ();
diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp
index 2b1f43bffd71..84ec98484f8e 100644
--- a/lib/Target/X86/X86Subtarget.cpp
+++ b/lib/Target/X86/X86Subtarget.cpp
@@ -286,6 +286,7 @@ void X86Subtarget::initializeEnvironment() {
   HasCDI = false;
   HasPFI = false;
   HasDQI = false;
+  HasVPOPCNTDQ = false;
   HasBWI = false;
   HasVLX = false;
   HasADX = false;
diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h
index a9f3a2aee1be..550e95c39ab5 100644
--- a/lib/Target/X86/X86Subtarget.h
+++ b/lib/Target/X86/X86Subtarget.h
@@ -270,6 +270,9 @@ protected:
   /// Processor has AVX-512 Conflict Detection Instructions
   bool HasCDI;
 
+  /// Processor has AVX-512 population count Instructions
+  bool HasVPOPCNTDQ;
+
   /// Processor has AVX-512 Doubleword and Quadword instructions
   bool HasDQI;
 
@@ -494,6 +497,7 @@ public:
   bool slow3OpsLEA() const { return Slow3OpsLEA; }
   bool slowIncDec() const { return SlowIncDec; }
   bool hasCDI() const { return HasCDI; }
+  bool hasVPOPCNTDQ() const { return HasVPOPCNTDQ; }
   bool hasPFI() const { return HasPFI; }
   bool hasERI() const { return HasERI; }
   bool hasDQI() const { return HasDQI; }