Diffstat (limited to 'contrib/llvm/lib/Target')
146 files changed, 2190 insertions, 1024 deletions
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64FastISel.cpp b/contrib/llvm/lib/Target/AArch64/AArch64FastISel.cpp index 43a3ae77a170..572d1c22feea 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64FastISel.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64FastISel.cpp @@ -3774,7 +3774,7 @@ bool AArch64FastISel::selectRet(const Instruction *I) { if (Ret->getNumOperands() > 0) { CallingConv::ID CC = F.getCallingConv(); SmallVector<ISD::OutputArg, 4> Outs; - GetReturnInfo(F.getReturnType(), F.getAttributes(), Outs, TLI, DL); + GetReturnInfo(CC, F.getReturnType(), F.getAttributes(), Outs, TLI, DL); // Analyze operands of the call, assigning locations to each operand. SmallVector<CCValAssign, 16> ValLocs; diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 0c72f2ebee18..de762a7bb1d4 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -8580,7 +8580,7 @@ static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG, SDValue AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, - std::vector<SDNode *> *Created) const { + SmallVectorImpl<SDNode *> &Created) const { AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes(); if (isIntDivCheap(N->getValueType(0), Attr)) return SDValue(N,0); // Lower SDIV as SDIV @@ -8603,11 +8603,9 @@ AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor, SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Pow2MinusOne); SDValue CSel = DAG.getNode(AArch64ISD::CSEL, DL, VT, Add, N0, CCVal, Cmp); - if (Created) { - Created->push_back(Cmp.getNode()); - Created->push_back(Add.getNode()); - Created->push_back(CSel.getNode()); - } + Created.push_back(Cmp.getNode()); + Created.push_back(Add.getNode()); + Created.push_back(CSel.getNode()); // Divide by pow2. SDValue SRA = @@ -8618,8 +8616,7 @@ AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor, if (Divisor.isNonNegative()) return SRA; - if (Created) - Created->push_back(SRA.getNode()); + Created.push_back(SRA.getNode()); return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), SRA); } diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 592845640a44..d783c8a6048c 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -644,7 +644,7 @@ private: SelectionDAG &DAG) const; SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, - std::vector<SDNode *> *Created) const override; + SmallVectorImpl<SDNode *> &Created) const override; SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &ExtraSteps, bool &UseOneConst, bool Reciprocal) const override; diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/contrib/llvm/lib/Target/AArch64/AArch64InstrFormats.td index 1060c64f7b5d..15d61cd1ad26 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ b/contrib/llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -57,6 +57,14 @@ class EncodedI<string cstr, list<dag> pattern> : AArch64Inst<NormalFrm, cstr> { let Size = 4; } +// Enum describing whether an instruction is +// destructive in its first source operand. 
+class DestructiveInstTypeEnum<bits<1> val> { + bits<1> Value = val; +} +def NotDestructive : DestructiveInstTypeEnum<0>; +def Destructive : DestructiveInstTypeEnum<1>; + // Normal instructions class I<dag oops, dag iops, string asm, string operands, string cstr, list<dag> pattern> @@ -64,6 +72,13 @@ class I<dag oops, dag iops, string asm, string operands, string cstr, dag OutOperandList = oops; dag InOperandList = iops; let AsmString = !strconcat(asm, operands); + + // Destructive operations (SVE) + DestructiveInstTypeEnum DestructiveInstType = NotDestructive; + ElementSizeEnum ElementSize = ElementSizeB; + + let TSFlags{3} = DestructiveInstType.Value; + let TSFlags{2-0} = ElementSize.Value; } class TriOpFrag<dag res> : PatFrag<(ops node:$LHS, node:$MHS, node:$RHS), res>; diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index 230480cf1cea..032d53d19620 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -4851,75 +4851,92 @@ AArch64InstrInfo::getSerializableMachineMemOperandTargetFlags() const { return makeArrayRef(TargetFlags); } - /// Constants defining how certain sequences should be outlined. - /// This encompasses how an outlined function should be called, and what kind of - /// frame should be emitted for that outlined function. - /// - /// \p MachineOutlinerDefault implies that the function should be called with - /// a save and restore of LR to the stack. - /// - /// That is, - /// - /// I1 Save LR OUTLINED_FUNCTION: - /// I2 --> BL OUTLINED_FUNCTION I1 - /// I3 Restore LR I2 - /// I3 - /// RET - /// - /// * Call construction overhead: 3 (save + BL + restore) - /// * Frame construction overhead: 1 (ret) - /// * Requires stack fixups? Yes - /// - /// \p MachineOutlinerTailCall implies that the function is being created from - /// a sequence of instructions ending in a return. - /// - /// That is, - /// - /// I1 OUTLINED_FUNCTION: - /// I2 --> B OUTLINED_FUNCTION I1 - /// RET I2 - /// RET - /// - /// * Call construction overhead: 1 (B) - /// * Frame construction overhead: 0 (Return included in sequence) - /// * Requires stack fixups? No - /// - /// \p MachineOutlinerNoLRSave implies that the function should be called using - /// a BL instruction, but doesn't require LR to be saved and restored. This - /// happens when LR is known to be dead. - /// - /// That is, - /// - /// I1 OUTLINED_FUNCTION: - /// I2 --> BL OUTLINED_FUNCTION I1 - /// I3 I2 - /// I3 - /// RET - /// - /// * Call construction overhead: 1 (BL) - /// * Frame construction overhead: 1 (RET) - /// * Requires stack fixups? No - /// - /// \p MachineOutlinerThunk implies that the function is being created from - /// a sequence of instructions ending in a call. The outlined function is - /// called with a BL instruction, and the outlined function tail-calls the - /// original call destination. - /// - /// That is, - /// - /// I1 OUTLINED_FUNCTION: - /// I2 --> BL OUTLINED_FUNCTION I1 - /// BL f I2 - /// B f - /// * Call construction overhead: 1 (BL) - /// * Frame construction overhead: 0 - /// * Requires stack fixups? No - /// +/// Constants defining how certain sequences should be outlined. +/// This encompasses how an outlined function should be called, and what kind of +/// frame should be emitted for that outlined function. +/// +/// \p MachineOutlinerDefault implies that the function should be called with +/// a save and restore of LR to the stack. 
+/// +/// That is, +/// +/// I1 Save LR OUTLINED_FUNCTION: +/// I2 --> BL OUTLINED_FUNCTION I1 +/// I3 Restore LR I2 +/// I3 +/// RET +/// +/// * Call construction overhead: 3 (save + BL + restore) +/// * Frame construction overhead: 1 (ret) +/// * Requires stack fixups? Yes +/// +/// \p MachineOutlinerTailCall implies that the function is being created from +/// a sequence of instructions ending in a return. +/// +/// That is, +/// +/// I1 OUTLINED_FUNCTION: +/// I2 --> B OUTLINED_FUNCTION I1 +/// RET I2 +/// RET +/// +/// * Call construction overhead: 1 (B) +/// * Frame construction overhead: 0 (Return included in sequence) +/// * Requires stack fixups? No +/// +/// \p MachineOutlinerNoLRSave implies that the function should be called using +/// a BL instruction, but doesn't require LR to be saved and restored. This +/// happens when LR is known to be dead. +/// +/// That is, +/// +/// I1 OUTLINED_FUNCTION: +/// I2 --> BL OUTLINED_FUNCTION I1 +/// I3 I2 +/// I3 +/// RET +/// +/// * Call construction overhead: 1 (BL) +/// * Frame construction overhead: 1 (RET) +/// * Requires stack fixups? No +/// +/// \p MachineOutlinerThunk implies that the function is being created from +/// a sequence of instructions ending in a call. The outlined function is +/// called with a BL instruction, and the outlined function tail-calls the +/// original call destination. +/// +/// That is, +/// +/// I1 OUTLINED_FUNCTION: +/// I2 --> BL OUTLINED_FUNCTION I1 +/// BL f I2 +/// B f +/// * Call construction overhead: 1 (BL) +/// * Frame construction overhead: 0 +/// * Requires stack fixups? No +/// +/// \p MachineOutlinerRegSave implies that the function should be called with a +/// save and restore of LR to an available register. This allows us to avoid +/// stack fixups. Note that this outlining variant is compatible with the +/// NoLRSave case. +/// +/// That is, +/// +/// I1 Save LR OUTLINED_FUNCTION: +/// I2 --> BL OUTLINED_FUNCTION I1 +/// I3 Restore LR I2 +/// I3 +/// RET +/// +/// * Call construction overhead: 3 (save + BL + restore) +/// * Frame construction overhead: 1 (ret) +/// * Requires stack fixups? No enum MachineOutlinerClass { MachineOutlinerDefault, /// Emit a save, restore, call, and return. MachineOutlinerTailCall, /// Only emit a branch. MachineOutlinerNoLRSave, /// Emit a call and return. MachineOutlinerThunk, /// Emit a call and tail-call. + MachineOutlinerRegSave /// Same as default, but save to a register. }; enum MachineOutlinerMBBFlags { @@ -4927,6 +4944,27 @@ enum MachineOutlinerMBBFlags { HasCalls = 0x4 }; +unsigned +AArch64InstrInfo::findRegisterToSaveLRTo(const outliner::Candidate &C) const { + MachineFunction *MF = C.getMF(); + const AArch64RegisterInfo *ARI = static_cast<const AArch64RegisterInfo *>( + MF->getSubtarget().getRegisterInfo()); + + // Check if there is an available register across the sequence that we can + // use. + for (unsigned Reg : AArch64::GPR64RegClass) { + if (!ARI->isReservedReg(*MF, Reg) && + Reg != AArch64::LR && // LR is not reserved, but don't use it. + Reg != AArch64::X16 && // X16 is not guaranteed to be preserved. + Reg != AArch64::X17 && // Ditto for X17. + C.LRU.available(Reg) && C.UsedInSequence.available(Reg)) + return Reg; + } + + // No suitable register. Return 0. 
+ return 0u; +} + outliner::OutlinedFunction AArch64InstrInfo::getOutliningCandidateInfo( std::vector<outliner::Candidate> &RepeatedSequenceLocs) const { @@ -5015,11 +5053,27 @@ AArch64InstrInfo::getOutliningCandidateInfo( SetCandidateCallInfo(MachineOutlinerNoLRSave, 4); } - // LR is live, so we need to save it to the stack. + // LR is live, so we need to save it. Decide whether it should be saved to + // the stack, or if it can be saved to a register. else { - FrameID = MachineOutlinerDefault; - NumBytesToCreateFrame = 4; - SetCandidateCallInfo(MachineOutlinerDefault, 12); + if (std::all_of(RepeatedSequenceLocs.begin(), RepeatedSequenceLocs.end(), + [this](outliner::Candidate &C) { + return findRegisterToSaveLRTo(C); + })) { + // Every candidate has an available callee-saved register for the save. + // We can save LR to a register. + FrameID = MachineOutlinerRegSave; + NumBytesToCreateFrame = 4; + SetCandidateCallInfo(MachineOutlinerRegSave, 12); + } + + else { + // At least one candidate does not have an available callee-saved + // register. We must save LR to the stack. + FrameID = MachineOutlinerDefault; + NumBytesToCreateFrame = 4; + SetCandidateCallInfo(MachineOutlinerDefault, 12); + } } // Check if the range contains a call. These require a save + restore of the @@ -5088,7 +5142,7 @@ AArch64InstrInfo::getMachineOutlinerMBBFlags(MachineBasicBlock &MBB) const { MBB.rend(), [&LRU](MachineInstr &MI) { LRU.accumulate(MI); }); - if (!LRU.available(AArch64::LR)) + if (!LRU.available(AArch64::LR)) Flags |= MachineOutlinerMBBFlags::LRUnavailableSomewhere; return Flags; @@ -5114,14 +5168,14 @@ AArch64InstrInfo::getOutliningType(MachineBasicBlock::iterator &MIT, // ahead and skip over them. if (MI.isKill()) return outliner::InstrType::Invisible; - + // Is this a terminator for a basic block? if (MI.isTerminator()) { // Is this the end of a function? if (MI.getParent()->succ_empty()) return outliner::InstrType::Legal; - + // It's not, so don't outline it. return outliner::InstrType::Illegal; } @@ -5424,7 +5478,7 @@ void AArch64InstrInfo::buildOutlinedFrame( MBB.insert(MBB.end(), ret); // Did we have to modify the stack by saving the link register? - if (OF.FrameConstructionID == MachineOutlinerNoLRSave) + if (OF.FrameConstructionID != MachineOutlinerDefault) return; // We modified the stack. @@ -5457,13 +5511,41 @@ MachineBasicBlock::iterator AArch64InstrInfo::insertOutlinedCall( // We want to return the spot where we inserted the call. MachineBasicBlock::iterator CallPt; - // We have a default call. Save the link register. - MachineInstr *STRXpre = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre)) - .addReg(AArch64::SP, RegState::Define) - .addReg(AArch64::LR) - .addReg(AArch64::SP) - .addImm(-16); - It = MBB.insert(It, STRXpre); + // Instructions for saving and restoring LR around the call instruction we're + // going to insert. + MachineInstr *Save; + MachineInstr *Restore; + // Can we save to a register? + if (C.CallConstructionID == MachineOutlinerRegSave) { + // FIXME: This logic should be sunk into a target-specific interface so that + // we don't have to recompute the register. + unsigned Reg = findRegisterToSaveLRTo(C); + assert(Reg != 0 && "No callee-saved register available?"); + + // Save and restore LR from that register. 
+ Save = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), Reg) + .addReg(AArch64::XZR) + .addReg(AArch64::LR) + .addImm(0); + Restore = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), AArch64::LR) + .addReg(AArch64::XZR) + .addReg(Reg) + .addImm(0); + } else { + // We have the default case. Save and restore from SP. + Save = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre)) + .addReg(AArch64::SP, RegState::Define) + .addReg(AArch64::LR) + .addReg(AArch64::SP) + .addImm(-16); + Restore = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost)) + .addReg(AArch64::SP, RegState::Define) + .addReg(AArch64::LR, RegState::Define) + .addReg(AArch64::SP) + .addImm(16); + } + + It = MBB.insert(It, Save); It++; // Insert the call. @@ -5472,13 +5554,11 @@ MachineBasicBlock::iterator AArch64InstrInfo::insertOutlinedCall( CallPt = It; It++; - // Restore the link register. - MachineInstr *LDRXpost = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost)) - .addReg(AArch64::SP, RegState::Define) - .addReg(AArch64::LR, RegState::Define) - .addReg(AArch64::SP) - .addImm(16); - It = MBB.insert(It, LDRXpost); - + It = MBB.insert(It, Restore); return CallPt; } + +bool AArch64InstrInfo::shouldOutlineFromFunctionByDefault( + MachineFunction &MF) const { + return MF.getFunction().optForMinSize(); +} diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.h index 0e5953f6216d..11882e238b70 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.h +++ b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.h @@ -249,6 +249,7 @@ public: insertOutlinedCall(Module &M, MachineBasicBlock &MBB, MachineBasicBlock::iterator &It, MachineFunction &MF, const outliner::Candidate &C) const override; + bool shouldOutlineFromFunctionByDefault(MachineFunction &MF) const override; /// Returns true if the instruction sets to an immediate value that can be /// executed more efficiently. bool isExynosResetFast(const MachineInstr &MI) const; @@ -271,6 +272,10 @@ private: ArrayRef<MachineOperand> Cond) const; bool substituteCmpToZero(MachineInstr &CmpInstr, unsigned SrcReg, const MachineRegisterInfo *MRI) const; + + /// Returns an unused general-purpose register which can be used for + /// constructing an outlined call if one exists. Returns 0 otherwise. 
+ unsigned findRegisterToSaveLRTo(const outliner::Candidate &C) const; }; /// emitFrameOffset - Emit instructions as needed to set DestReg to SrcReg @@ -339,6 +344,32 @@ static inline bool isIndirectBranchOpcode(int Opc) { return Opc == AArch64::BR; } +// struct TSFlags { +#define TSFLAG_ELEMENT_SIZE_TYPE(X) (X) // 3-bits +#define TSFLAG_DESTRUCTIVE_INST_TYPE(X) ((X) << 3) // 1-bit +// } + +namespace AArch64 { + +enum ElementSizeType { + ElementSizeMask = TSFLAG_ELEMENT_SIZE_TYPE(0x7), + ElementSizeNone = TSFLAG_ELEMENT_SIZE_TYPE(0x0), + ElementSizeB = TSFLAG_ELEMENT_SIZE_TYPE(0x1), + ElementSizeH = TSFLAG_ELEMENT_SIZE_TYPE(0x2), + ElementSizeS = TSFLAG_ELEMENT_SIZE_TYPE(0x3), + ElementSizeD = TSFLAG_ELEMENT_SIZE_TYPE(0x4), +}; + +enum DestructiveInstType { + DestructiveInstTypeMask = TSFLAG_DESTRUCTIVE_INST_TYPE(0x1), + NotDestructive = TSFLAG_DESTRUCTIVE_INST_TYPE(0x0), + Destructive = TSFLAG_DESTRUCTIVE_INST_TYPE(0x1), +}; + +#undef TSFLAG_ELEMENT_SIZE_TYPE +#undef TSFLAG_DESTRUCTIVE_INST_TYPE +} + } // end namespace llvm #endif diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp b/contrib/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp index 4d7ca2349ed1..b2b500320b5c 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp @@ -21,6 +21,7 @@ #include "MCTargetDesc/AArch64AddressingModes.h" #include "llvm/CodeGen/GlobalISel/InstructionSelector.h" #include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h" +#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" #include "llvm/CodeGen/GlobalISel/Utils.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunction.h" @@ -94,6 +95,10 @@ private: void renderTruncImm(MachineInstrBuilder &MIB, const MachineInstr &MI) const; + // Materialize a GlobalValue or BlockAddress using a movz+movk sequence. + void materializeLargeCMVal(MachineInstr &I, const Value *V, + unsigned char OpFlags) const; + const AArch64TargetMachine &TM; const AArch64Subtarget &STI; const AArch64InstrInfo &TII; @@ -655,6 +660,45 @@ bool AArch64InstructionSelector::selectVaStartDarwin( return true; } +void AArch64InstructionSelector::materializeLargeCMVal( + MachineInstr &I, const Value *V, unsigned char OpFlags) const { + MachineBasicBlock &MBB = *I.getParent(); + MachineFunction &MF = *MBB.getParent(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + MachineIRBuilder MIB(I); + + auto MovZ = MIB.buildInstr(AArch64::MOVZXi, &AArch64::GPR64RegClass); + MovZ->addOperand(MF, I.getOperand(1)); + MovZ->getOperand(1).setTargetFlags(OpFlags | AArch64II::MO_G0 | + AArch64II::MO_NC); + MovZ->addOperand(MF, MachineOperand::CreateImm(0)); + constrainSelectedInstRegOperands(*MovZ, TII, TRI, RBI); + + auto BuildMovK = [&](unsigned SrcReg, unsigned char Flags, unsigned Offset, + unsigned ForceDstReg) { + unsigned DstReg = ForceDstReg + ? 
ForceDstReg + : MRI.createVirtualRegister(&AArch64::GPR64RegClass); + auto MovI = MIB.buildInstr(AArch64::MOVKXi).addDef(DstReg).addUse(SrcReg); + if (auto *GV = dyn_cast<GlobalValue>(V)) { + MovI->addOperand(MF, MachineOperand::CreateGA( + GV, MovZ->getOperand(1).getOffset(), Flags)); + } else { + MovI->addOperand( + MF, MachineOperand::CreateBA(cast<BlockAddress>(V), + MovZ->getOperand(1).getOffset(), Flags)); + } + MovI->addOperand(MF, MachineOperand::CreateImm(Offset)); + constrainSelectedInstRegOperands(*MovI, TII, TRI, RBI); + return DstReg; + }; + unsigned DstReg = BuildMovK(MovZ->getOperand(0).getReg(), + AArch64II::MO_G1 | AArch64II::MO_NC, 16, 0); + DstReg = BuildMovK(DstReg, AArch64II::MO_G2 | AArch64II::MO_NC, 32, 0); + BuildMovK(DstReg, AArch64II::MO_G3, 48, I.getOperand(0).getReg()); + return; +} + bool AArch64InstructionSelector::select(MachineInstr &I, CodeGenCoverage &CoverageInfo) const { assert(I.getParent() && "Instruction should be in a basic block!"); @@ -936,36 +980,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I, I.getOperand(1).setTargetFlags(OpFlags); } else if (TM.getCodeModel() == CodeModel::Large) { // Materialize the global using movz/movk instructions. - unsigned MovZDstReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass); - auto InsertPt = std::next(I.getIterator()); - auto MovZ = - BuildMI(MBB, InsertPt, I.getDebugLoc(), TII.get(AArch64::MOVZXi)) - .addDef(MovZDstReg); - MovZ->addOperand(MF, I.getOperand(1)); - MovZ->getOperand(1).setTargetFlags(OpFlags | AArch64II::MO_G0 | - AArch64II::MO_NC); - MovZ->addOperand(MF, MachineOperand::CreateImm(0)); - constrainSelectedInstRegOperands(*MovZ, TII, TRI, RBI); - - auto BuildMovK = [&](unsigned SrcReg, unsigned char Flags, - unsigned Offset, unsigned ForceDstReg) { - unsigned DstReg = - ForceDstReg ? 
ForceDstReg - : MRI.createVirtualRegister(&AArch64::GPR64RegClass); - auto MovI = BuildMI(MBB, InsertPt, MovZ->getDebugLoc(), - TII.get(AArch64::MOVKXi)) - .addDef(DstReg) - .addReg(SrcReg); - MovI->addOperand(MF, MachineOperand::CreateGA( - GV, MovZ->getOperand(1).getOffset(), Flags)); - MovI->addOperand(MF, MachineOperand::CreateImm(Offset)); - constrainSelectedInstRegOperands(*MovI, TII, TRI, RBI); - return DstReg; - }; - unsigned DstReg = BuildMovK(MovZ->getOperand(0).getReg(), - AArch64II::MO_G1 | AArch64II::MO_NC, 16, 0); - DstReg = BuildMovK(DstReg, AArch64II::MO_G2 | AArch64II::MO_NC, 32, 0); - BuildMovK(DstReg, AArch64II::MO_G3, 48, I.getOperand(0).getReg()); + materializeLargeCMVal(I, GV, OpFlags); I.eraseFromParent(); return true; } else { @@ -1482,7 +1497,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I, .addImm(1); I.eraseFromParent(); return true; - case TargetOpcode::G_IMPLICIT_DEF: + case TargetOpcode::G_IMPLICIT_DEF: { I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF)); const LLT DstTy = MRI.getType(I.getOperand(0).getReg()); const unsigned DstReg = I.getOperand(0).getReg(); @@ -1492,6 +1507,25 @@ bool AArch64InstructionSelector::select(MachineInstr &I, RBI.constrainGenericRegister(DstReg, *DstRC, MRI); return true; } + case TargetOpcode::G_BLOCK_ADDR: { + if (TM.getCodeModel() == CodeModel::Large) { + materializeLargeCMVal(I, I.getOperand(1).getBlockAddress(), 0); + I.eraseFromParent(); + return true; + } else { + I.setDesc(TII.get(AArch64::MOVaddrBA)); + auto MovMI = BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::MOVaddrBA), + I.getOperand(0).getReg()) + .addBlockAddress(I.getOperand(1).getBlockAddress(), + /* Offset */ 0, AArch64II::MO_PAGE) + .addBlockAddress( + I.getOperand(1).getBlockAddress(), /* Offset */ 0, + AArch64II::MO_NC | AArch64II::MO_PAGEOFF); + I.eraseFromParent(); + return constrainSelectedInstRegOperands(*MovMI, TII, TRI, RBI); + } + } + } return false; } diff --git a/contrib/llvm/lib/Target/AArch64/AArch64LegalizerInfo.cpp b/contrib/llvm/lib/Target/AArch64/AArch64LegalizerInfo.cpp index 9b8c0a34efba..327c758a7f8e 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64LegalizerInfo.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64LegalizerInfo.cpp @@ -293,6 +293,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) { atomicOrderingAtLeastOrStrongerThan(0, AtomicOrdering::Monotonic))); } + getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({p0}); + // Merge/Unmerge for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) { unsigned BigTyIdx = Op == G_MERGE_VALUES ? 
0 : 1;
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/contrib/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h index 798340f8fed8..e42214d15699 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ b/contrib/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -146,7 +146,7 @@ public:
   Optional<bool> hasRedZone() const { return HasRedZone; }
   void setHasRedZone(bool s) { HasRedZone = s; }
-
+
   int getVarArgsStackIndex() const { return VarArgsStackIndex; }
   void setVarArgsStackIndex(int Index) { VarArgsStackIndex = Index; }
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.td b/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.td index 7a653e117fd1..bbf401b474ca 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
+++ b/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
@@ -764,18 +764,35 @@ def Z30 : AArch64Reg<30, "z30", [Q30, Z30_HI]>, DwarfRegNum<[126]>;
 def Z31 : AArch64Reg<31, "z31", [Q31, Z31_HI]>, DwarfRegNum<[127]>;
 }
+// Enum describing the element size for destructive
+// operations.
+class ElementSizeEnum<bits<3> val> {
+  bits<3> Value = val;
+}
+
+def ElementSizeNone : ElementSizeEnum<0>;
+def ElementSizeB : ElementSizeEnum<1>;
+def ElementSizeH : ElementSizeEnum<2>;
+def ElementSizeS : ElementSizeEnum<3>;
+def ElementSizeD : ElementSizeEnum<4>;
+def ElementSizeQ : ElementSizeEnum<5>; // Unused
+
 class SVERegOp <string Suffix, AsmOperandClass C,
+                ElementSizeEnum Size,
                 RegisterClass RC> : RegisterOperand<RC> {
+  ElementSizeEnum ElementSize;
+
+  let ElementSize = Size;
   let PrintMethod = !if(!eq(Suffix, ""), "printSVERegOp<>", "printSVERegOp<'" # Suffix # "'>");
   let ParserMatchClass = C;
 }
-class PPRRegOp <string Suffix, AsmOperandClass C,
-                RegisterClass RC> : SVERegOp<Suffix, C, RC> {}
-class ZPRRegOp <string Suffix, AsmOperandClass C,
-                RegisterClass RC> : SVERegOp<Suffix, C, RC> {}
+class PPRRegOp <string Suffix, AsmOperandClass C, ElementSizeEnum Size,
+                RegisterClass RC> : SVERegOp<Suffix, C, Size, RC> {}
+class ZPRRegOp <string Suffix, AsmOperandClass C, ElementSizeEnum Size,
+                RegisterClass RC> : SVERegOp<Suffix, C, Size, RC> {}
 
//******************************************************************************
@@ -805,11 +822,11 @@ def PPRAsmOp16 : PPRAsmOperand<"PredicateH", "PPR", 16>;
 def PPRAsmOp32 : PPRAsmOperand<"PredicateS", "PPR", 32>;
 def PPRAsmOp64 : PPRAsmOperand<"PredicateD", "PPR", 64>;
-def PPRAny : PPRRegOp<"", PPRAsmOpAny, PPR>;
-def PPR8 : PPRRegOp<"b", PPRAsmOp8, PPR>;
-def PPR16 : PPRRegOp<"h", PPRAsmOp16, PPR>;
-def PPR32 : PPRRegOp<"s", PPRAsmOp32, PPR>;
-def PPR64 : PPRRegOp<"d", PPRAsmOp64, PPR>;
+def PPRAny : PPRRegOp<"", PPRAsmOpAny, ElementSizeNone, PPR>;
+def PPR8 : PPRRegOp<"b", PPRAsmOp8, ElementSizeB, PPR>;
+def PPR16 : PPRRegOp<"h", PPRAsmOp16, ElementSizeH, PPR>;
+def PPR32 : PPRRegOp<"s", PPRAsmOp32, ElementSizeS, PPR>;
+def PPR64 : PPRRegOp<"d", PPRAsmOp64, ElementSizeD, PPR>;
 
 def PPRAsmOp3bAny : PPRAsmOperand<"Predicate3bAny", "PPR_3b", 0>;
 def PPRAsmOp3b8 : PPRAsmOperand<"Predicate3bB", "PPR_3b", 8>;
@@ -817,11 +834,11 @@ def PPRAsmOp3b16 : PPRAsmOperand<"Predicate3bH", "PPR_3b", 16>;
 def PPRAsmOp3b32 : PPRAsmOperand<"Predicate3bS", "PPR_3b", 32>;
 def PPRAsmOp3b64 : PPRAsmOperand<"Predicate3bD", "PPR_3b", 64>;
-def PPR3bAny : PPRRegOp<"", PPRAsmOp3bAny, PPR_3b>;
-def PPR3b8 : PPRRegOp<"b", PPRAsmOp3b8, PPR_3b>;
-def PPR3b16 : PPRRegOp<"h", PPRAsmOp3b16, PPR_3b>;
-def PPR3b32 : PPRRegOp<"s", PPRAsmOp3b32, PPR_3b>;
-def PPR3b64 : PPRRegOp<"d", 
PPRAsmOp3b64, PPR_3b>; +def PPR3bAny : PPRRegOp<"", PPRAsmOp3bAny, ElementSizeNone, PPR_3b>; +def PPR3b8 : PPRRegOp<"b", PPRAsmOp3b8, ElementSizeB, PPR_3b>; +def PPR3b16 : PPRRegOp<"h", PPRAsmOp3b16, ElementSizeH, PPR_3b>; +def PPR3b32 : PPRRegOp<"s", PPRAsmOp3b32, ElementSizeS, PPR_3b>; +def PPR3b64 : PPRRegOp<"d", PPRAsmOp3b64, ElementSizeD, PPR_3b>; //****************************************************************************** @@ -874,28 +891,28 @@ def ZPRAsmOp32 : ZPRAsmOperand<"VectorS", 32>; def ZPRAsmOp64 : ZPRAsmOperand<"VectorD", 64>; def ZPRAsmOp128 : ZPRAsmOperand<"VectorQ", 128>; -def ZPRAny : ZPRRegOp<"", ZPRAsmOpAny, ZPR>; -def ZPR8 : ZPRRegOp<"b", ZPRAsmOp8, ZPR>; -def ZPR16 : ZPRRegOp<"h", ZPRAsmOp16, ZPR>; -def ZPR32 : ZPRRegOp<"s", ZPRAsmOp32, ZPR>; -def ZPR64 : ZPRRegOp<"d", ZPRAsmOp64, ZPR>; -def ZPR128 : ZPRRegOp<"q", ZPRAsmOp128, ZPR>; +def ZPRAny : ZPRRegOp<"", ZPRAsmOpAny, ElementSizeNone, ZPR>; +def ZPR8 : ZPRRegOp<"b", ZPRAsmOp8, ElementSizeB, ZPR>; +def ZPR16 : ZPRRegOp<"h", ZPRAsmOp16, ElementSizeH, ZPR>; +def ZPR32 : ZPRRegOp<"s", ZPRAsmOp32, ElementSizeS, ZPR>; +def ZPR64 : ZPRRegOp<"d", ZPRAsmOp64, ElementSizeD, ZPR>; +def ZPR128 : ZPRRegOp<"q", ZPRAsmOp128, ElementSizeQ, ZPR>; def ZPRAsmOp3b8 : ZPRAsmOperand<"Vector3bB", 8, "_3b">; def ZPRAsmOp3b16 : ZPRAsmOperand<"Vector3bH", 16, "_3b">; def ZPRAsmOp3b32 : ZPRAsmOperand<"Vector3bS", 32, "_3b">; -def ZPR3b8 : ZPRRegOp<"b", ZPRAsmOp3b8, ZPR_3b>; -def ZPR3b16 : ZPRRegOp<"h", ZPRAsmOp3b16, ZPR_3b>; -def ZPR3b32 : ZPRRegOp<"s", ZPRAsmOp3b32, ZPR_3b>; +def ZPR3b8 : ZPRRegOp<"b", ZPRAsmOp3b8, ElementSizeB, ZPR_3b>; +def ZPR3b16 : ZPRRegOp<"h", ZPRAsmOp3b16, ElementSizeH, ZPR_3b>; +def ZPR3b32 : ZPRRegOp<"s", ZPRAsmOp3b32, ElementSizeS, ZPR_3b>; def ZPRAsmOp4b16 : ZPRAsmOperand<"Vector4bH", 16, "_4b">; def ZPRAsmOp4b32 : ZPRAsmOperand<"Vector4bS", 32, "_4b">; def ZPRAsmOp4b64 : ZPRAsmOperand<"Vector4bD", 64, "_4b">; -def ZPR4b16 : ZPRRegOp<"h", ZPRAsmOp4b16, ZPR_4b>; -def ZPR4b32 : ZPRRegOp<"s", ZPRAsmOp4b32, ZPR_4b>; -def ZPR4b64 : ZPRRegOp<"d", ZPRAsmOp4b64, ZPR_4b>; +def ZPR4b16 : ZPRRegOp<"h", ZPRAsmOp4b16, ElementSizeH, ZPR_4b>; +def ZPR4b32 : ZPRRegOp<"s", ZPRAsmOp4b32, ElementSizeS, ZPR_4b>; +def ZPR4b64 : ZPRRegOp<"d", ZPRAsmOp4b64, ElementSizeD, ZPR_4b>; class FPRasZPR<int Width> : AsmOperandClass{ let Name = "FPR" # Width # "asZPR"; diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/contrib/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index 16e6ddda6398..0fde68011e86 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/contrib/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -220,10 +220,33 @@ let Predicates = [HasSVE] in { def PUNPKLO_PP : sve_int_perm_punpk<0b0, "punpklo">; def PUNPKHI_PP : sve_int_perm_punpk<0b1, "punpkhi">; + defm MOVPRFX_ZPzZ : sve_int_movprfx_pred_zero<0b000, "movprfx">; + defm MOVPRFX_ZPmZ : sve_int_movprfx_pred_merge<0b001, "movprfx">; + def MOVPRFX_ZZ : sve_int_bin_cons_misc_0_c<0b00000001, "movprfx", ZPRAny>; def FEXPA_ZZ_H : sve_int_bin_cons_misc_0_c<0b01000000, "fexpa", ZPR16>; def FEXPA_ZZ_S : sve_int_bin_cons_misc_0_c<0b10000000, "fexpa", ZPR32>; def FEXPA_ZZ_D : sve_int_bin_cons_misc_0_c<0b11000000, "fexpa", ZPR64>; + def BRKPA_PPzPP : sve_int_brkp<0b00, "brkpa">; + def BRKPAS_PPzPP : sve_int_brkp<0b10, "brkpas">; + def BRKPB_PPzPP : sve_int_brkp<0b01, "brkpb">; + def BRKPBS_PPzPP : sve_int_brkp<0b11, "brkpbs">; + + def BRKN_PPzP : sve_int_brkn<0b0, "brkn">; + def BRKNS_PPzP : sve_int_brkn<0b1, "brkns">; + + defm BRKA_PPzP : 
sve_int_break_z<0b000, "brka">; + defm BRKA_PPmP : sve_int_break_m<0b001, "brka">; + defm BRKAS_PPzP : sve_int_break_z<0b010, "brkas">; + defm BRKB_PPzP : sve_int_break_z<0b100, "brkb">; + defm BRKB_PPmP : sve_int_break_m<0b101, "brkb">; + defm BRKBS_PPzP : sve_int_break_z<0b110, "brkbs">; + + def PTEST_PP : sve_int_ptest<0b010000, "ptest">; + def PFALSE : sve_int_pfalse<0b000000, "pfalse">; + defm PFIRST : sve_int_pfirst<0b00000, "pfirst">; + defm PNEXT : sve_int_pnext<0b00110, "pnext">; + def AND_PPzPP : sve_int_pred_log<0b0000, "and">; def BIC_PPzPP : sve_int_pred_log<0b0001, "bic">; def EOR_PPzPP : sve_int_pred_log<0b0010, "eor">; @@ -731,6 +754,21 @@ let Predicates = [HasSVE] in { defm FCMEQ_PPzZ0 : sve_fp_2op_p_pd<0b100, "fcmeq">; defm FCMNE_PPzZ0 : sve_fp_2op_p_pd<0b110, "fcmne">; + defm WHILELT_PWW : sve_int_while4_rr<0b010, "whilelt">; + defm WHILELE_PWW : sve_int_while4_rr<0b011, "whilele">; + defm WHILELO_PWW : sve_int_while4_rr<0b110, "whilelo">; + defm WHILELS_PWW : sve_int_while4_rr<0b111, "whilels">; + + defm WHILELT_PXX : sve_int_while8_rr<0b010, "whilelt">; + defm WHILELE_PXX : sve_int_while8_rr<0b011, "whilele">; + defm WHILELO_PXX : sve_int_while8_rr<0b110, "whilelo">; + defm WHILELS_PXX : sve_int_while8_rr<0b111, "whilels">; + + def CTERMEQ_WW : sve_int_cterm<0b0, 0b0, "ctermeq", GPR32>; + def CTERMNE_WW : sve_int_cterm<0b0, 0b1, "ctermne", GPR32>; + def CTERMEQ_XX : sve_int_cterm<0b1, 0b0, "ctermeq", GPR64>; + def CTERMNE_XX : sve_int_cterm<0b1, 0b1, "ctermne", GPR64>; + def RDVLI_XI : sve_int_read_vl_a<0b0, 0b11111, "rdvl">; def ADDVL_XXI : sve_int_arith_vl<0b0, "addvl">; def ADDPL_XXI : sve_int_arith_vl<0b1, "addpl">; @@ -854,40 +892,40 @@ let Predicates = [HasSVE] in { defm LSR_WIDE_ZPmZ : sve_int_bin_pred_shift_wide<0b001, "lsr">; defm LSL_WIDE_ZPmZ : sve_int_bin_pred_shift_wide<0b011, "lsl">; - def FCVT_ZPmZ_StoH : sve_fp_2op_p_zd<0b1001000, "fcvt", ZPR32, ZPR16>; - def FCVT_ZPmZ_HtoS : sve_fp_2op_p_zd<0b1001001, "fcvt", ZPR16, ZPR32>; - def SCVTF_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0110010, "scvtf", ZPR16, ZPR16>; - def SCVTF_ZPmZ_StoS : sve_fp_2op_p_zd<0b1010100, "scvtf", ZPR32, ZPR32>; - def UCVTF_ZPmZ_StoS : sve_fp_2op_p_zd<0b1010101, "ucvtf", ZPR32, ZPR32>; - def UCVTF_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0110011, "ucvtf", ZPR16, ZPR16>; - def FCVTZS_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0111010, "fcvtzs", ZPR16, ZPR16>; - def FCVTZS_ZPmZ_StoS : sve_fp_2op_p_zd<0b1011100, "fcvtzs", ZPR32, ZPR32>; - def FCVTZU_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0111011, "fcvtzu", ZPR16, ZPR16>; - def FCVTZU_ZPmZ_StoS : sve_fp_2op_p_zd<0b1011101, "fcvtzu", ZPR32, ZPR32>; - def FCVT_ZPmZ_DtoH : sve_fp_2op_p_zd<0b1101000, "fcvt", ZPR64, ZPR16>; - def FCVT_ZPmZ_HtoD : sve_fp_2op_p_zd<0b1101001, "fcvt", ZPR16, ZPR64>; - def FCVT_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1101010, "fcvt", ZPR64, ZPR32>; - def FCVT_ZPmZ_StoD : sve_fp_2op_p_zd<0b1101011, "fcvt", ZPR32, ZPR64>; - def SCVTF_ZPmZ_StoD : sve_fp_2op_p_zd<0b1110000, "scvtf", ZPR32, ZPR64>; - def UCVTF_ZPmZ_StoD : sve_fp_2op_p_zd<0b1110001, "ucvtf", ZPR32, ZPR64>; - def UCVTF_ZPmZ_StoH : sve_fp_2op_p_zd<0b0110101, "ucvtf", ZPR32, ZPR16>; - def SCVTF_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1110100, "scvtf", ZPR64, ZPR32>; - def SCVTF_ZPmZ_StoH : sve_fp_2op_p_zd<0b0110100, "scvtf", ZPR32, ZPR16>; - def SCVTF_ZPmZ_DtoH : sve_fp_2op_p_zd<0b0110110, "scvtf", ZPR64, ZPR16>; - def UCVTF_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1110101, "ucvtf", ZPR64, ZPR32>; - def UCVTF_ZPmZ_DtoH : sve_fp_2op_p_zd<0b0110111, "ucvtf", ZPR64, ZPR16>; - def SCVTF_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1110110, 
"scvtf", ZPR64, ZPR64>; - def UCVTF_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1110111, "ucvtf", ZPR64, ZPR64>; - def FCVTZS_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1111000, "fcvtzs", ZPR64, ZPR32>; - def FCVTZU_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1111001, "fcvtzu", ZPR64, ZPR32>; - def FCVTZS_ZPmZ_StoD : sve_fp_2op_p_zd<0b1111100, "fcvtzs", ZPR32, ZPR64>; - def FCVTZS_ZPmZ_HtoS : sve_fp_2op_p_zd<0b0111100, "fcvtzs", ZPR16, ZPR32>; - def FCVTZS_ZPmZ_HtoD : sve_fp_2op_p_zd<0b0111110, "fcvtzs", ZPR16, ZPR64>; - def FCVTZU_ZPmZ_HtoS : sve_fp_2op_p_zd<0b0111101, "fcvtzu", ZPR16, ZPR32>; - def FCVTZU_ZPmZ_HtoD : sve_fp_2op_p_zd<0b0111111, "fcvtzu", ZPR16, ZPR64>; - def FCVTZU_ZPmZ_StoD : sve_fp_2op_p_zd<0b1111101, "fcvtzu", ZPR32, ZPR64>; - def FCVTZS_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1111110, "fcvtzs", ZPR64, ZPR64>; - def FCVTZU_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1111111, "fcvtzu", ZPR64, ZPR64>; + def FCVT_ZPmZ_StoH : sve_fp_2op_p_zd<0b1001000, "fcvt", ZPR32, ZPR16, ElementSizeS>; + def FCVT_ZPmZ_HtoS : sve_fp_2op_p_zd<0b1001001, "fcvt", ZPR16, ZPR32, ElementSizeS>; + def SCVTF_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0110010, "scvtf", ZPR16, ZPR16, ElementSizeH>; + def SCVTF_ZPmZ_StoS : sve_fp_2op_p_zd<0b1010100, "scvtf", ZPR32, ZPR32, ElementSizeS>; + def UCVTF_ZPmZ_StoS : sve_fp_2op_p_zd<0b1010101, "ucvtf", ZPR32, ZPR32, ElementSizeS>; + def UCVTF_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0110011, "ucvtf", ZPR16, ZPR16, ElementSizeH>; + def FCVTZS_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0111010, "fcvtzs", ZPR16, ZPR16, ElementSizeH>; + def FCVTZS_ZPmZ_StoS : sve_fp_2op_p_zd<0b1011100, "fcvtzs", ZPR32, ZPR32, ElementSizeS>; + def FCVTZU_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0111011, "fcvtzu", ZPR16, ZPR16, ElementSizeH>; + def FCVTZU_ZPmZ_StoS : sve_fp_2op_p_zd<0b1011101, "fcvtzu", ZPR32, ZPR32, ElementSizeS>; + def FCVT_ZPmZ_DtoH : sve_fp_2op_p_zd<0b1101000, "fcvt", ZPR64, ZPR16, ElementSizeD>; + def FCVT_ZPmZ_HtoD : sve_fp_2op_p_zd<0b1101001, "fcvt", ZPR16, ZPR64, ElementSizeD>; + def FCVT_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1101010, "fcvt", ZPR64, ZPR32, ElementSizeD>; + def FCVT_ZPmZ_StoD : sve_fp_2op_p_zd<0b1101011, "fcvt", ZPR32, ZPR64, ElementSizeD>; + def SCVTF_ZPmZ_StoD : sve_fp_2op_p_zd<0b1110000, "scvtf", ZPR32, ZPR64, ElementSizeD>; + def UCVTF_ZPmZ_StoD : sve_fp_2op_p_zd<0b1110001, "ucvtf", ZPR32, ZPR64, ElementSizeD>; + def UCVTF_ZPmZ_StoH : sve_fp_2op_p_zd<0b0110101, "ucvtf", ZPR32, ZPR16, ElementSizeS>; + def SCVTF_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1110100, "scvtf", ZPR64, ZPR32, ElementSizeD>; + def SCVTF_ZPmZ_StoH : sve_fp_2op_p_zd<0b0110100, "scvtf", ZPR32, ZPR16, ElementSizeS>; + def SCVTF_ZPmZ_DtoH : sve_fp_2op_p_zd<0b0110110, "scvtf", ZPR64, ZPR16, ElementSizeD>; + def UCVTF_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1110101, "ucvtf", ZPR64, ZPR32, ElementSizeD>; + def UCVTF_ZPmZ_DtoH : sve_fp_2op_p_zd<0b0110111, "ucvtf", ZPR64, ZPR16, ElementSizeD>; + def SCVTF_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1110110, "scvtf", ZPR64, ZPR64, ElementSizeD>; + def UCVTF_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1110111, "ucvtf", ZPR64, ZPR64, ElementSizeD>; + def FCVTZS_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1111000, "fcvtzs", ZPR64, ZPR32, ElementSizeD>; + def FCVTZU_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1111001, "fcvtzu", ZPR64, ZPR32, ElementSizeD>; + def FCVTZS_ZPmZ_StoD : sve_fp_2op_p_zd<0b1111100, "fcvtzs", ZPR32, ZPR64, ElementSizeD>; + def FCVTZS_ZPmZ_HtoS : sve_fp_2op_p_zd<0b0111100, "fcvtzs", ZPR16, ZPR32, ElementSizeS>; + def FCVTZS_ZPmZ_HtoD : sve_fp_2op_p_zd<0b0111110, "fcvtzs", ZPR16, ZPR64, ElementSizeD>; + def FCVTZU_ZPmZ_HtoS : sve_fp_2op_p_zd<0b0111101, "fcvtzu", ZPR16, ZPR32, ElementSizeS>; + def 
FCVTZU_ZPmZ_HtoD : sve_fp_2op_p_zd<0b0111111, "fcvtzu", ZPR16, ZPR64, ElementSizeD>; + def FCVTZU_ZPmZ_StoD : sve_fp_2op_p_zd<0b1111101, "fcvtzu", ZPR32, ZPR64, ElementSizeD>; + def FCVTZS_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1111110, "fcvtzs", ZPR64, ZPR64, ElementSizeD>; + def FCVTZU_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1111111, "fcvtzu", ZPR64, ZPR64, ElementSizeD>; defm FRINTN_ZPmZ : sve_fp_2op_p_zd_HSD<0b00000, "frintn">; defm FRINTP_ZPmZ : sve_fp_2op_p_zd_HSD<0b00001, "frintp">; diff --git a/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp index 01a997e5aed7..120d71381c67 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -255,6 +255,9 @@ AArch64TargetMachine::AArch64TargetMachine(const Target &T, const Triple &TT, // AArch64 supports the MachineOutliner. setMachineOutliner(true); + + // AArch64 supports default outlining behaviour. + setSupportsDefaultOutlining(true); } AArch64TargetMachine::~AArch64TargetMachine() = default; diff --git a/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index d75fef7b0171..96e751e86971 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -577,7 +577,7 @@ int AArch64TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE, unsigned NumVectorInstToHideOverhead = 10; int MaxMergeDistance = 64; - if (Ty->isVectorTy() && SE && + if (Ty->isVectorTy() && SE && !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1)) return NumVectorInstToHideOverhead; diff --git a/contrib/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/contrib/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index a51c41d70915..30a9a08f2346 100644 --- a/contrib/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/contrib/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -11,6 +11,7 @@ #include "MCTargetDesc/AArch64MCExpr.h" #include "MCTargetDesc/AArch64MCTargetDesc.h" #include "MCTargetDesc/AArch64TargetStreamer.h" +#include "AArch64InstrInfo.h" #include "Utils/AArch64BaseInfo.h" #include "llvm/ADT/APFloat.h" #include "llvm/ADT/APInt.h" @@ -79,6 +80,67 @@ private: // Map of register aliases registers via the .req directive. 
StringMap<std::pair<RegKind, unsigned>> RegisterReqs; + class PrefixInfo { + public: + static PrefixInfo CreateFromInst(const MCInst &Inst, uint64_t TSFlags) { + PrefixInfo Prefix; + switch (Inst.getOpcode()) { + case AArch64::MOVPRFX_ZZ: + Prefix.Active = true; + Prefix.Dst = Inst.getOperand(0).getReg(); + break; + case AArch64::MOVPRFX_ZPmZ_B: + case AArch64::MOVPRFX_ZPmZ_H: + case AArch64::MOVPRFX_ZPmZ_S: + case AArch64::MOVPRFX_ZPmZ_D: + Prefix.Active = true; + Prefix.Predicated = true; + Prefix.ElementSize = TSFlags & AArch64::ElementSizeMask; + assert(Prefix.ElementSize != AArch64::ElementSizeNone && + "No destructive element size set for movprfx"); + Prefix.Dst = Inst.getOperand(0).getReg(); + Prefix.Pg = Inst.getOperand(2).getReg(); + break; + case AArch64::MOVPRFX_ZPzZ_B: + case AArch64::MOVPRFX_ZPzZ_H: + case AArch64::MOVPRFX_ZPzZ_S: + case AArch64::MOVPRFX_ZPzZ_D: + Prefix.Active = true; + Prefix.Predicated = true; + Prefix.ElementSize = TSFlags & AArch64::ElementSizeMask; + assert(Prefix.ElementSize != AArch64::ElementSizeNone && + "No destructive element size set for movprfx"); + Prefix.Dst = Inst.getOperand(0).getReg(); + Prefix.Pg = Inst.getOperand(1).getReg(); + break; + default: + break; + } + + return Prefix; + } + + PrefixInfo() : Active(false), Predicated(false) {} + bool isActive() const { return Active; } + bool isPredicated() const { return Predicated; } + unsigned getElementSize() const { + assert(Predicated); + return ElementSize; + } + unsigned getDstReg() const { return Dst; } + unsigned getPgReg() const { + assert(Predicated); + return Pg; + } + + private: + bool Active; + bool Predicated; + unsigned ElementSize; + unsigned Dst; + unsigned Pg; + } NextPrefix; + AArch64TargetStreamer &getTargetStreamer() { MCTargetStreamer &TS = *getParser().getStreamer().getTargetStreamer(); return static_cast<AArch64TargetStreamer &>(TS); @@ -113,7 +175,8 @@ private: bool parseDirectiveReq(StringRef Name, SMLoc L); bool parseDirectiveUnreq(SMLoc L); - bool validateInstruction(MCInst &Inst, SmallVectorImpl<SMLoc> &Loc); + bool validateInstruction(MCInst &Inst, SMLoc &IDLoc, + SmallVectorImpl<SMLoc> &Loc); bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, OperandVector &Operands, MCStreamer &Out, uint64_t &ErrorInfo, @@ -3665,12 +3728,89 @@ bool AArch64AsmParser::ParseInstruction(ParseInstructionInfo &Info, return false; } +static inline bool isMatchingOrAlias(unsigned ZReg, unsigned Reg) { + assert((ZReg >= AArch64::Z0) && (ZReg <= AArch64::Z31)); + return (ZReg == ((Reg - AArch64::B0) + AArch64::Z0)) || + (ZReg == ((Reg - AArch64::H0) + AArch64::Z0)) || + (ZReg == ((Reg - AArch64::S0) + AArch64::Z0)) || + (ZReg == ((Reg - AArch64::D0) + AArch64::Z0)) || + (ZReg == ((Reg - AArch64::Q0) + AArch64::Z0)) || + (ZReg == ((Reg - AArch64::Z0) + AArch64::Z0)); +} + // FIXME: This entire function is a giant hack to provide us with decent // operand range validation/diagnostics until TableGen/MC can be extended // to support autogeneration of this kind of validation. -bool AArch64AsmParser::validateInstruction(MCInst &Inst, - SmallVectorImpl<SMLoc> &Loc) { +bool AArch64AsmParser::validateInstruction(MCInst &Inst, SMLoc &IDLoc, + SmallVectorImpl<SMLoc> &Loc) { const MCRegisterInfo *RI = getContext().getRegisterInfo(); + const MCInstrDesc &MCID = MII.get(Inst.getOpcode()); + + // A prefix only applies to the instruction following it. 
Here we extract
+  // prefix information for the next instruction before validating the current
+  // one so that in the case of failure we don't erroneously continue using the
+  // current prefix.
+  PrefixInfo Prefix = NextPrefix;
+  NextPrefix = PrefixInfo::CreateFromInst(Inst, MCID.TSFlags);
+
+  // Before validating the instruction in isolation we run through the rules
+  // applicable when it follows a prefix instruction.
+  // NOTE: brk & hlt can be prefixed but require no additional validation.
+  if (Prefix.isActive() &&
+      (Inst.getOpcode() != AArch64::BRK) &&
+      (Inst.getOpcode() != AArch64::HLT)) {
+
+    // Prefixed instructions must have a destructive operand.
+    if ((MCID.TSFlags & AArch64::DestructiveInstTypeMask) ==
+        AArch64::NotDestructive)
+      return Error(IDLoc, "instruction is unpredictable when following a"
+                          " movprfx, suggest replacing movprfx with mov");
+
+    // Destination operands must match.
+    if (Inst.getOperand(0).getReg() != Prefix.getDstReg())
+      return Error(Loc[0], "instruction is unpredictable when following a"
+                           " movprfx writing to a different destination");
+
+    // Destination operand must not be used in any other location.
+    for (unsigned i = 1; i < Inst.getNumOperands(); ++i) {
+      if (Inst.getOperand(i).isReg() &&
+          (MCID.getOperandConstraint(i, MCOI::TIED_TO) == -1) &&
+          isMatchingOrAlias(Prefix.getDstReg(), Inst.getOperand(i).getReg()))
+        return Error(Loc[0], "instruction is unpredictable when following a"
+                             " movprfx and destination also used as non-destructive"
+                             " source");
+    }
+
+    auto PPRRegClass = AArch64MCRegisterClasses[AArch64::PPRRegClassID];
+    if (Prefix.isPredicated()) {
+      int PgIdx = -1;
+
+      // Find the instruction's general predicate.
+      for (unsigned i = 1; i < Inst.getNumOperands(); ++i)
+        if (Inst.getOperand(i).isReg() &&
+            PPRRegClass.contains(Inst.getOperand(i).getReg())) {
+          PgIdx = i;
+          break;
+        }
+
+      // Instruction must be predicated if the movprfx is predicated.
+      if (PgIdx == -1 ||
+          (MCID.TSFlags & AArch64::ElementSizeMask) == AArch64::ElementSizeNone)
+        return Error(IDLoc, "instruction is unpredictable when following a"
+                            " predicated movprfx, suggest using unpredicated movprfx");
+
+      // Instruction must use same general predicate as the movprfx.
+      if (Inst.getOperand(PgIdx).getReg() != Prefix.getPgReg())
+        return Error(IDLoc, "instruction is unpredictable when following a"
+                            " predicated movprfx using a different general predicate");
+
+      // Instruction element type must match the movprfx.
+      if ((MCID.TSFlags & AArch64::ElementSizeMask) != Prefix.getElementSize())
+        return Error(IDLoc, "instruction is unpredictable when following a"
+                            " predicated movprfx with a different element size");
+    }
+  }
+
 // Check for indexed addressing modes w/ the base register being the
 // same as a destination/source register or pair load where
 // the Rt == Rt2. All of those are undefined behaviour.
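[Editor's note: the movprfx checks above all key off the two TSFlags fields this patch adds in AArch64InstrInfo.h: the element size in bits 2-0 and the destructive flag in bit 3, per the TSFLAG_ELEMENT_SIZE_TYPE/TSFLAG_DESTRUCTIVE_INST_TYPE macros. The following is a minimal standalone C++ sketch of that encoding and of the destructive/element-size rules; the helper names (makeTSFlags, okAfterMovprfx) are ours for illustration, not part of the patch.]

#include <cassert>
#include <cstdint>

// Field layout mirrors TSFLAG_ELEMENT_SIZE_TYPE(X) (X) and
// TSFLAG_DESTRUCTIVE_INST_TYPE(X) ((X) << 3) from the patch.
enum : uint64_t {
  kElementSizeMask = 0x7,      // TSFlags{2-0}, from ElementSizeEnum
  kDestructiveMask = 0x1 << 3, // TSFlags{3}, from DestructiveInstTypeEnum
};

// Pack an element size (0-5) and a destructive bit into a TSFlags value.
constexpr uint64_t makeTSFlags(uint64_t ElemSize, bool Destructive) {
  return (ElemSize & kElementSizeMask) |
         (static_cast<uint64_t>(Destructive) << 3);
}

// Sketch of the first movprfx rules in validateInstruction: the prefixed
// instruction must be destructive, and a predicated movprfx additionally
// requires a matching destructive element size.
bool okAfterMovprfx(uint64_t TSFlags, bool PrefixPredicated,
                    uint64_t PrefixElemSize) {
  if ((TSFlags & kDestructiveMask) == 0)
    return false; // unpredictable: not a destructive operation
  if (PrefixPredicated)
    return (TSFlags & kElementSizeMask) == PrefixElemSize;
  return true;
}

int main() {
  // E.g. a .S-element destructive form: element size S (3), destructive.
  const uint64_t Flags = makeTSFlags(3, true);
  assert(okAfterMovprfx(Flags, /*PrefixPredicated=*/true, 3));
  assert(!okAfterMovprfx(Flags, /*PrefixPredicated=*/true, 4));
  return 0;
}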
@@ -4516,7 +4656,7 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, NumOperands = Operands.size(); for (unsigned i = 1; i < NumOperands; ++i) OperandLocs.push_back(Operands[i]->getStartLoc()); - if (validateInstruction(Inst, OperandLocs)) + if (validateInstruction(Inst, IDLoc, OperandLocs)) return true; Inst.setLoc(IDLoc); @@ -4719,7 +4859,6 @@ bool AArch64AsmParser::ParseDirective(AsmToken DirectiveID) { const MCObjectFileInfo::Environment Format = getContext().getObjectFileInfo()->getObjectFileType(); bool IsMachO = Format == MCObjectFileInfo::IsMachO; - bool IsCOFF = Format == MCObjectFileInfo::IsCOFF; StringRef IDVal = DirectiveID.getIdentifier(); SMLoc Loc = DirectiveID.getLoc(); @@ -4733,14 +4872,14 @@ bool AArch64AsmParser::ParseDirective(AsmToken DirectiveID) { parseDirectiveLtorg(Loc); else if (IDVal == ".unreq") parseDirectiveUnreq(Loc); - else if (!IsMachO && !IsCOFF) { - if (IDVal == ".inst") - parseDirectiveInst(Loc); + else if (IDVal == ".inst") + parseDirectiveInst(Loc); + else if (IsMachO) { + if (IDVal == MCLOHDirectiveName()) + parseDirectiveLOH(IDVal, Loc); else return true; - } else if (IDVal == MCLOHDirectiveName()) - parseDirectiveLOH(IDVal, Loc); - else + } else return true; return false; } diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp index 1b949b54590c..dee964df2635 100644 --- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp +++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp @@ -39,4 +39,16 @@ void AArch64TargetStreamer::emitCurrentConstantPool() { // finish() - write out any non-empty assembler constant pools. void AArch64TargetStreamer::finish() { ConstantPools->emitAll(Streamer); } -void AArch64TargetStreamer::emitInst(uint32_t Inst) {} +void AArch64TargetStreamer::emitInst(uint32_t Inst) { + char Buffer[4]; + + // We can't just use EmitIntValue here, as that will swap the + // endianness on big-endian systems (instructions are always + // little-endian). 
+ for (unsigned I = 0; I < 4; ++I) { + Buffer[I] = uint8_t(Inst); + Inst >>= 8; + } + + getStreamer().EmitBytes(StringRef(Buffer, 4)); +} diff --git a/contrib/llvm/lib/Target/AArch64/SVEInstrFormats.td b/contrib/llvm/lib/Target/AArch64/SVEInstrFormats.td index 17b3f6041279..7a8dd8bc5aee 100644 --- a/contrib/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/contrib/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -282,6 +282,79 @@ let Predicates = [HasSVE] in { //===----------------------------------------------------------------------===// +// SVE Predicate Misc Group +//===----------------------------------------------------------------------===// + +class sve_int_pfalse<bits<6> opc, string asm> +: I<(outs PPR8:$Pd), (ins), + asm, "\t$Pd", + "", + []>, Sched<[]> { + bits<4> Pd; + let Inst{31-24} = 0b00100101; + let Inst{23-22} = opc{5-4}; + let Inst{21-19} = 0b011; + let Inst{18-16} = opc{3-1}; + let Inst{15-10} = 0b111001; + let Inst{9} = opc{0}; + let Inst{8-4} = 0b00000; + let Inst{3-0} = Pd; +} + +class sve_int_ptest<bits<6> opc, string asm> +: I<(outs), (ins PPRAny:$Pg, PPR8:$Pn), + asm, "\t$Pg, $Pn", + "", + []>, Sched<[]> { + bits<4> Pg; + bits<4> Pn; + let Inst{31-24} = 0b00100101; + let Inst{23-22} = opc{5-4}; + let Inst{21-19} = 0b010; + let Inst{18-16} = opc{3-1}; + let Inst{15-14} = 0b11; + let Inst{13-10} = Pg; + let Inst{9} = opc{0}; + let Inst{8-5} = Pn; + let Inst{4-0} = 0b00000; + + let Defs = [NZCV]; +} + +class sve_int_pfirst_next<bits<2> sz8_64, bits<5> opc, string asm, + PPRRegOp pprty> +: I<(outs pprty:$Pdn), (ins PPRAny:$Pg, pprty:$_Pdn), + asm, "\t$Pdn, $Pg, $_Pdn", + "", + []>, Sched<[]> { + bits<4> Pdn; + bits<4> Pg; + let Inst{31-24} = 0b00100101; + let Inst{23-22} = sz8_64; + let Inst{21-19} = 0b011; + let Inst{18-16} = opc{4-2}; + let Inst{15-11} = 0b11000; + let Inst{10-9} = opc{1-0}; + let Inst{8-5} = Pg; + let Inst{4} = 0; + let Inst{3-0} = Pdn; + + let Constraints = "$Pdn = $_Pdn"; + let Defs = [NZCV]; +} + +multiclass sve_int_pfirst<bits<5> opc, string asm> { + def : sve_int_pfirst_next<0b01, opc, asm, PPR8>; +} + +multiclass sve_int_pnext<bits<5> opc, string asm> { + def _B : sve_int_pfirst_next<0b00, opc, asm, PPR8>; + def _H : sve_int_pfirst_next<0b01, opc, asm, PPR16>; + def _S : sve_int_pfirst_next<0b10, opc, asm, PPR32>; + def _D : sve_int_pfirst_next<0b11, opc, asm, PPR64>; +} + +//===----------------------------------------------------------------------===// // SVE Predicate Count Group //===----------------------------------------------------------------------===// @@ -348,6 +421,8 @@ class sve_int_count_v<bits<2> sz8_64, bits<5> opc, string asm, let Inst{4-0} = Zdn; let Constraints = "$Zdn = $_Zdn"; + let DestructiveInstType = Destructive; + let ElementSize = ElementSizeNone; } multiclass sve_int_count_v<bits<5> opc, string asm> { @@ -433,6 +508,8 @@ class sve_int_countvlv<bits<5> opc, string asm, ZPRRegOp zprty> let Inst{4-0} = Zdn; let Constraints = "$Zdn = $_Zdn"; + let DestructiveInstType = Destructive; + let ElementSize = ElementSizeNone; } multiclass sve_int_countvlv<bits<5> opc, string asm, ZPRRegOp zprty> { @@ -738,6 +815,8 @@ class sve_int_perm_insrs<bits<2> sz8_64, string asm, ZPRRegOp zprty, let Inst{4-0} = Zdn; let Constraints = "$Zdn = $_Zdn"; + let DestructiveInstType = Destructive; + let ElementSize = ElementSizeNone; } multiclass sve_int_perm_insrs<string asm> { @@ -762,6 +841,8 @@ class sve_int_perm_insrv<bits<2> sz8_64, string asm, ZPRRegOp zprty, let Inst{4-0} = Zdn; let Constraints = "$Zdn = $_Zdn"; + let DestructiveInstType = 
Destructive; + let ElementSize = ElementSizeNone; } multiclass sve_int_perm_insrv<string asm> { @@ -790,6 +871,8 @@ class sve_int_perm_extract_i<string asm> let Inst{4-0} = Zdn; let Constraints = "$Zdn = $_Zdn"; + let DestructiveInstType = Destructive; + let ElementSize = ElementSizeNone; } //===----------------------------------------------------------------------===// @@ -883,6 +966,8 @@ class sve_int_log_imm<bits<2> opc, string asm> let Constraints = "$Zdn = $_Zdn"; let DecoderMethod = "DecodeSVELogicalImmInstruction"; + let DestructiveInstType = Destructive; + let ElementSize = ElementSizeNone; } multiclass sve_int_log_imm<bits<2> opc, string asm, string alias> { @@ -993,6 +1078,8 @@ class sve_fp_2op_i_p_zds<bits<2> sz, bits<3> opc, string asm, let Inst{4-0} = Zdn; let Constraints = "$Zdn = $_Zdn"; + let DestructiveInstType = Destructive; + let ElementSize = zprty.ElementSize; } multiclass sve_fp_2op_i_p_zds<bits<3> opc, string asm, Operand imm_ty> { @@ -1020,6 +1107,8 @@ class sve_fp_2op_p_zds<bits<2> sz, bits<4> opc, string asm, let Inst{4-0} = Zdn; let Constraints = "$Zdn = $_Zdn"; + let DestructiveInstType = Destructive; + let ElementSize = zprty.ElementSize; } multiclass sve_fp_2op_p_zds<bits<4> opc, string asm> { @@ -1045,6 +1134,8 @@ class sve_fp_ftmad<bits<2> sz, string asm, ZPRRegOp zprty> let Inst{4-0} = Zdn; let Constraints = "$Zdn = $_Zdn"; + let DestructiveInstType = Destructive; + let ElementSize = ElementSizeNone; } multiclass sve_fp_ftmad<string asm> { @@ -1106,6 +1197,8 @@ class sve_fp_3op_p_zds_a<bits<2> sz, bits<2> opc, string asm, ZPRRegOp zprty> let Inst{4-0} = Zda; let Constraints = "$Zda = $_Zda"; + let DestructiveInstType = Destructive; + let ElementSize = zprty.ElementSize; } multiclass sve_fp_3op_p_zds_a<bits<2> opc, string asm> { @@ -1135,6 +1228,8 @@ class sve_fp_3op_p_zds_b<bits<2> sz, bits<2> opc, string asm, let Inst{4-0} = Zdn; let Constraints = "$Zdn = $_Zdn"; + let DestructiveInstType = Destructive; + let ElementSize = zprty.ElementSize; } multiclass sve_fp_3op_p_zds_b<bits<2> opc, string asm> { @@ -1163,6 +1258,8 @@ class sve_fp_fma_by_indexed_elem<bits<2> sz, bit opc, string asm, let Inst{4-0} = Zda; let Constraints = "$Zda = $_Zda"; + let DestructiveInstType = Destructive; + let ElementSize = ElementSizeNone; } multiclass sve_fp_fma_by_indexed_elem<bit opc, string asm> { @@ -1253,6 +1350,8 @@ class sve_fp_fcmla<bits<2> sz, string asm, ZPRRegOp zprty> let Inst{4-0} = Zda; let Constraints = "$Zda = $_Zda"; + let DestructiveInstType = Destructive; + let ElementSize = zprty.ElementSize; } multiclass sve_fp_fcmla<string asm> { @@ -1284,6 +1383,8 @@ class sve_fp_fcmla_by_indexed_elem<bits<2> sz, string asm, let Inst{4-0} = Zda; let Constraints = "$Zda = $_Zda"; + let DestructiveInstType = Destructive; + let ElementSize = ElementSizeNone; } multiclass sve_fp_fcmla_by_indexed_elem<string asm> { @@ -1325,6 +1426,8 @@ class sve_fp_fcadd<bits<2> sz, string asm, ZPRRegOp zprty> let Inst{4-0} = Zdn; let Constraints = "$Zdn = $_Zdn"; + let DestructiveInstType = Destructive; + let ElementSize = zprty.ElementSize; } multiclass sve_fp_fcadd<string asm> { @@ -1405,7 +1508,7 @@ multiclass sve_int_perm_bin_perm_zz<bits<3> opc, string asm> { //===----------------------------------------------------------------------===// class sve_fp_2op_p_zd<bits<7> opc, string asm, RegisterOperand i_zprtype, - RegisterOperand o_zprtype> + RegisterOperand o_zprtype, ElementSizeEnum size> : I<(outs o_zprtype:$Zd), (ins i_zprtype:$_Zd, PPR3bAny:$Pg, i_zprtype:$Zn), asm, "\t$Zd, $Pg/m, 
$Zn", "", @@ -1423,12 +1526,14 @@ class sve_fp_2op_p_zd<bits<7> opc, string asm, RegisterOperand i_zprtype, let Inst{4-0} = Zd; let Constraints = "$Zd = $_Zd"; + let DestructiveInstType = Destructive; + let ElementSize = size; } multiclass sve_fp_2op_p_zd_HSD<bits<5> opc, string asm> { - def _H : sve_fp_2op_p_zd<{ 0b01, opc }, asm, ZPR16, ZPR16>; - def _S : sve_fp_2op_p_zd<{ 0b10, opc }, asm, ZPR32, ZPR32>; - def _D : sve_fp_2op_p_zd<{ 0b11, opc }, asm, ZPR64, ZPR64>; + def _H : sve_fp_2op_p_zd<{ 0b01, opc }, asm, ZPR16, ZPR16, ElementSizeH>; + def _S : sve_fp_2op_p_zd<{ 0b10, opc }, asm, ZPR32, ZPR32, ElementSizeS>; + def _D : sve_fp_2op_p_zd<{ 0b11, opc }, asm, ZPR64, ZPR64, ElementSizeD>; } //===----------------------------------------------------------------------===// @@ -1480,6 +1585,8 @@ class sve_int_bin_pred_arit_log<bits<2> sz8_64, bits<2> fmt, bits<3> opc, let Inst{4-0} = Zdn; let Constraints = "$Zdn = $_Zdn"; + let DestructiveInstType = Destructive; + let ElementSize = zprty.ElementSize; } multiclass sve_int_bin_pred_log<bits<3> opc, string asm> { @@ -1541,6 +1648,8 @@ class sve_int_mladdsub_vvv_pred<bits<2> sz8_64, bits<1> opc, string asm, let Inst{4-0} = Zdn; let Constraints = "$Zdn = $_Zdn"; + let DestructiveInstType = Destructive; + let ElementSize = zprty.ElementSize; } multiclass sve_int_mladdsub_vvv_pred<bits<1> opc, string asm> { @@ -1571,6 +1680,8 @@ class sve_int_mlas_vvv_pred<bits<2> sz8_64, bits<1> opc, string asm, let Inst{4-0} = Zda; let Constraints = "$Zda = $_Zda"; + let DestructiveInstType = Destructive; + let ElementSize = zprty.ElementSize; } multiclass sve_int_mlas_vvv_pred<bits<1> opc, string asm> { @@ -1601,6 +1712,8 @@ class sve_intx_dot<bit sz, bit U, string asm, ZPRRegOp zprty1, let Inst{4-0} = Zda; let Constraints = "$Zda = $_Zda"; + let DestructiveInstType = Destructive; + let ElementSize = zprty1.ElementSize; } multiclass sve_intx_dot<bit opc, string asm> { @@ -1629,6 +1742,8 @@ class sve_intx_dot_by_indexed_elem<bit sz, bit U, string asm, let Inst{4-0} = Zda; let Constraints = "$Zda = $_Zda"; + let DestructiveInstType = Destructive; + let ElementSize = ElementSizeNone; } multiclass sve_intx_dot_by_indexed_elem<bit opc, string asm> { @@ -1670,6 +1785,8 @@ class sve_int_un_pred_arit<bits<2> sz8_64, bits<4> opc, let Inst{4-0} = Zd; let Constraints = "$Zd = $_Zd"; + let DestructiveInstType = Destructive; + let ElementSize = zprty.ElementSize; } multiclass sve_int_un_pred_arit_0<bits<3> opc, string asm> { @@ -1800,6 +1917,8 @@ class sve_int_arith_imm0<bits<2> sz8_64, bits<3> opc, string asm, let Inst{4-0} = Zdn; let Constraints = "$Zdn = $_Zdn"; + let DestructiveInstType = Destructive; + let ElementSize = ElementSizeNone; } multiclass sve_int_arith_imm0<bits<3> opc, string asm> { @@ -1825,6 +1944,8 @@ class sve_int_arith_imm<bits<2> sz8_64, bits<6> opc, string asm, let Inst{4-0} = Zdn; let Constraints = "$Zdn = $_Zdn"; + let DestructiveInstType = Destructive; + let ElementSize = ElementSizeNone; } multiclass sve_int_arith_imm1<bits<2> opc, string asm, Operand immtype> { @@ -1885,6 +2006,8 @@ class sve_int_dup_fpimm_pred<bits<2> sz, Operand fpimmtype, let Inst{4-0} = Zd; let Constraints = "$Zd = $_Zd"; + let DestructiveInstType = Destructive; + let ElementSize = zprty.ElementSize; } multiclass sve_int_dup_fpimm_pred<string asm> { @@ -1917,6 +2040,9 @@ class sve_int_dup_imm_pred<bits<2> sz8_64, bit m, string asm, let Inst{13} = imm{8}; // sh let Inst{12-5} = imm{7-0}; // imm8 let Inst{4-0} = Zd; + + let DestructiveInstType = Destructive; + let ElementSize 
= zprty.ElementSize; } multiclass sve_int_dup_imm_pred_merge<string asm> { @@ -2083,6 +2209,65 @@ multiclass sve_int_ucmp_vi<bits<2> opc, string asm> { //===----------------------------------------------------------------------===// +// SVE Integer Compare - Scalars Group +//===----------------------------------------------------------------------===// + +class sve_int_cterm<bit sz, bit opc, string asm, RegisterClass rt> +: I<(outs), (ins rt:$Rn, rt:$Rm), + asm, "\t$Rn, $Rm", + "", + []>, Sched<[]> { + bits<5> Rm; + bits<5> Rn; + let Inst{31-23} = 0b001001011; + let Inst{22} = sz; + let Inst{21} = 0b1; + let Inst{20-16} = Rm; + let Inst{15-10} = 0b001000; + let Inst{9-5} = Rn; + let Inst{4} = opc; + let Inst{3-0} = 0b0000; + + let Defs = [NZCV]; +} + +class sve_int_while_rr<bits<2> sz8_64, bits<4> opc, string asm, + RegisterClass gprty, PPRRegOp pprty> +: I<(outs pprty:$Pd), (ins gprty:$Rn, gprty:$Rm), + asm, "\t$Pd, $Rn, $Rm", + "", []>, Sched<[]> { + bits<4> Pd; + bits<5> Rm; + bits<5> Rn; + let Inst{31-24} = 0b00100101; + let Inst{23-22} = sz8_64; + let Inst{21} = 0b1; + let Inst{20-16} = Rm; + let Inst{15-13} = 0b000; + let Inst{12-10} = opc{3-1}; + let Inst{9-5} = Rn; + let Inst{4} = opc{0}; + let Inst{3-0} = Pd; + + let Defs = [NZCV]; +} + +multiclass sve_int_while4_rr<bits<3> opc, string asm> { + def _B : sve_int_while_rr<0b00, { 0, opc }, asm, GPR32, PPR8>; + def _H : sve_int_while_rr<0b01, { 0, opc }, asm, GPR32, PPR16>; + def _S : sve_int_while_rr<0b10, { 0, opc }, asm, GPR32, PPR32>; + def _D : sve_int_while_rr<0b11, { 0, opc }, asm, GPR32, PPR64>; +} + +multiclass sve_int_while8_rr<bits<3> opc, string asm> { + def _B : sve_int_while_rr<0b00, { 1, opc }, asm, GPR64, PPR8>; + def _H : sve_int_while_rr<0b01, { 1, opc }, asm, GPR64, PPR16>; + def _S : sve_int_while_rr<0b10, { 1, opc }, asm, GPR64, PPR32>; + def _D : sve_int_while_rr<0b11, { 1, opc }, asm, GPR64, PPR64>; +} + + +//===----------------------------------------------------------------------===// // SVE Floating Point Fast Reduction Group //===----------------------------------------------------------------------===// @@ -2312,9 +2497,9 @@ multiclass sve_int_index_rr<string asm> { //===----------------------------------------------------------------------===// // SVE Bitwise Shift - Predicated Group //===----------------------------------------------------------------------===// - class sve_int_bin_pred_shift_imm<bits<4> tsz8_64, bits<3> opc, string asm, - ZPRRegOp zprty, Operand immtype> + ZPRRegOp zprty, Operand immtype, + ElementSizeEnum size> : I<(outs zprty:$Zdn), (ins PPR3bAny:$Pg, zprty:$_Zdn, immtype:$imm), asm, "\t$Zdn, $Pg/m, $_Zdn, $imm", "", @@ -2333,31 +2518,41 @@ class sve_int_bin_pred_shift_imm<bits<4> tsz8_64, bits<3> opc, string asm, let Inst{4-0} = Zdn; let Constraints = "$Zdn = $_Zdn"; + let DestructiveInstType = Destructive; + let ElementSize = size; } multiclass sve_int_bin_pred_shift_imm_left<bits<3> opc, string asm> { - def _B : sve_int_bin_pred_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftL8>; - def _H : sve_int_bin_pred_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftL16> { + def _B : sve_int_bin_pred_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftL8, + ElementSizeB>; + def _H : sve_int_bin_pred_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftL16, + ElementSizeH> { let Inst{8} = imm{3}; } - def _S : sve_int_bin_pred_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftL32> { + def _S : sve_int_bin_pred_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftL32, + ElementSizeS> { let Inst{9-8} = imm{4-3}; } - def _D : 
sve_int_bin_pred_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftL64> { + def _D : sve_int_bin_pred_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftL64, + ElementSizeD> { let Inst{22} = imm{5}; let Inst{9-8} = imm{4-3}; } } multiclass sve_int_bin_pred_shift_imm_right<bits<3> opc, string asm> { - def _B : sve_int_bin_pred_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftR8>; - def _H : sve_int_bin_pred_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftR16> { + def _B : sve_int_bin_pred_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftR8, + ElementSizeB>; + def _H : sve_int_bin_pred_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftR16, + ElementSizeH> { let Inst{8} = imm{3}; } - def _S : sve_int_bin_pred_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftR32> { + def _S : sve_int_bin_pred_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftR32, + ElementSizeS> { let Inst{9-8} = imm{4-3}; } - def _D : sve_int_bin_pred_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftR64> { + def _D : sve_int_bin_pred_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftR64, + ElementSizeD> { let Inst{22} = imm{5}; let Inst{9-8} = imm{4-3}; } @@ -2383,6 +2578,8 @@ class sve_int_bin_pred_shift<bits<2> sz8_64, bit wide, bits<3> opc, let Inst{4-0} = Zdn; let Constraints = "$Zdn = $_Zdn"; + let DestructiveInstType = Destructive; + let ElementSize = zprty.ElementSize; } multiclass sve_int_bin_pred_shift<bits<3> opc, string asm> { @@ -3017,6 +3214,8 @@ class sve_int_perm_clast_zz<bits<2> sz8_64, bit ab, string asm, let Inst{4-0} = Zdn; let Constraints = "$Zdn = $_Zdn"; + let DestructiveInstType = Destructive; + let ElementSize = ElementSizeNone; } multiclass sve_int_perm_clast_zz<bit ab, string asm> { @@ -3094,6 +3293,8 @@ class sve_int_perm_splice<bits<2> sz8_64, string asm, ZPRRegOp zprty> let Inst{4-0} = Zdn; let Constraints = "$Zdn = $_Zdn"; + let DestructiveInstType = Destructive; + let ElementSize = ElementSizeNone; } multiclass sve_int_perm_splice<string asm> { @@ -3122,6 +3323,8 @@ class sve_int_perm_rev<bits<2> sz8_64, bits<2> opc, string asm, let Inst{4-0} = Zd; let Constraints = "$Zd = $_Zd"; + let DestructiveInstType = Destructive; + let ElementSize = zprty.ElementSize; } multiclass sve_int_perm_rev_rbit<string asm> { @@ -3163,6 +3366,8 @@ class sve_int_perm_cpy_r<bits<2> sz8_64, string asm, ZPRRegOp zprty, let Inst{4-0} = Zd; let Constraints = "$Zd = $_Zd"; + let DestructiveInstType = Destructive; + let ElementSize = zprty.ElementSize; } multiclass sve_int_perm_cpy_r<string asm> { @@ -3198,6 +3403,8 @@ class sve_int_perm_cpy_v<bits<2> sz8_64, string asm, ZPRRegOp zprty, let Inst{4-0} = Zd; let Constraints = "$Zd = $_Zd"; + let DestructiveInstType = Destructive; + let ElementSize = zprty.ElementSize; } multiclass sve_int_perm_cpy_v<string asm> { @@ -4117,3 +4324,133 @@ multiclass sve_int_reduce_2<bits<3> opc, string asm> { def _S : sve_int_reduce<0b10, 0b11, opc, asm, ZPR32, FPR32>; def _D : sve_int_reduce<0b11, 0b11, opc, asm, ZPR64, FPR64>; } + +class sve_int_movprfx_pred<bits<2> sz8_32, bits<3> opc, string asm, + ZPRRegOp zprty, string pg_suffix, dag iops> +: I<(outs zprty:$Zd), iops, + asm, "\t$Zd, $Pg"#pg_suffix#", $Zn", + "", + []>, Sched<[]> { + bits<3> Pg; + bits<5> Zd; + bits<5> Zn; + let Inst{31-24} = 0b00000100; + let Inst{23-22} = sz8_32; + let Inst{21-19} = 0b010; + let Inst{18-16} = opc; + let Inst{15-13} = 0b001; + let Inst{12-10} = Pg; + let Inst{9-5} = Zn; + let Inst{4-0} = Zd; + + let ElementSize = zprty.ElementSize; +} + +multiclass sve_int_movprfx_pred_merge<bits<3> opc, string asm> { +let Constraints = "$Zd = $_Zd" in { + 
def _B : sve_int_movprfx_pred<0b00, opc, asm, ZPR8, "/m", + (ins ZPR8:$_Zd, PPR3bAny:$Pg, ZPR8:$Zn)>; + def _H : sve_int_movprfx_pred<0b01, opc, asm, ZPR16, "/m", + (ins ZPR16:$_Zd, PPR3bAny:$Pg, ZPR16:$Zn)>; + def _S : sve_int_movprfx_pred<0b10, opc, asm, ZPR32, "/m", + (ins ZPR32:$_Zd, PPR3bAny:$Pg, ZPR32:$Zn)>; + def _D : sve_int_movprfx_pred<0b11, opc, asm, ZPR64, "/m", + (ins ZPR64:$_Zd, PPR3bAny:$Pg, ZPR64:$Zn)>; +} +} + +multiclass sve_int_movprfx_pred_zero<bits<3> opc, string asm> { + def _B : sve_int_movprfx_pred<0b00, opc, asm, ZPR8, "/z", + (ins PPR3bAny:$Pg, ZPR8:$Zn)>; + def _H : sve_int_movprfx_pred<0b01, opc, asm, ZPR16, "/z", + (ins PPR3bAny:$Pg, ZPR16:$Zn)>; + def _S : sve_int_movprfx_pred<0b10, opc, asm, ZPR32, "/z", + (ins PPR3bAny:$Pg, ZPR32:$Zn)>; + def _D : sve_int_movprfx_pred<0b11, opc, asm, ZPR64, "/z", + (ins PPR3bAny:$Pg, ZPR64:$Zn)>; +} + +//===----------------------------------------------------------------------===// +// SVE Propagate Break Group +//===----------------------------------------------------------------------===// + +class sve_int_brkp<bits<2> opc, string asm> +: I<(outs PPR8:$Pd), (ins PPRAny:$Pg, PPR8:$Pn, PPR8:$Pm), + asm, "\t$Pd, $Pg/z, $Pn, $Pm", + "", + []>, Sched<[]> { + bits<4> Pd; + bits<4> Pg; + bits<4> Pm; + bits<4> Pn; + let Inst{31-24} = 0b00100101; + let Inst{23} = 0b0; + let Inst{22} = opc{1}; + let Inst{21-20} = 0b00; + let Inst{19-16} = Pm; + let Inst{15-14} = 0b11; + let Inst{13-10} = Pg; + let Inst{9} = 0b0; + let Inst{8-5} = Pn; + let Inst{4} = opc{0}; + let Inst{3-0} = Pd; + + let Defs = !if(!eq (opc{1}, 1), [NZCV], []); +} + + +//===----------------------------------------------------------------------===// +// SVE Partition Break Group +//===----------------------------------------------------------------------===// + +class sve_int_brkn<bit S, string asm> +: I<(outs PPR8:$Pdm), (ins PPRAny:$Pg, PPR8:$Pn, PPR8:$_Pdm), + asm, "\t$Pdm, $Pg/z, $Pn, $_Pdm", + "", + []>, Sched<[]> { + bits<4> Pdm; + bits<4> Pg; + bits<4> Pn; + let Inst{31-23} = 0b001001010; + let Inst{22} = S; + let Inst{21-14} = 0b01100001; + let Inst{13-10} = Pg; + let Inst{9} = 0b0; + let Inst{8-5} = Pn; + let Inst{4} = 0b0; + let Inst{3-0} = Pdm; + + let Constraints = "$Pdm = $_Pdm"; + let Defs = !if(!eq (S, 0b1), [NZCV], []); +} + +class sve_int_break<bits<3> opc, string asm, string suffix, dag iops> +: I<(outs PPR8:$Pd), iops, + asm, "\t$Pd, $Pg"#suffix#", $Pn", + "", + []>, Sched<[]> { + bits<4> Pd; + bits<4> Pg; + bits<4> Pn; + let Inst{31-24} = 0b00100101; + let Inst{23-22} = opc{2-1}; + let Inst{21-14} = 0b01000001; + let Inst{13-10} = Pg; + let Inst{9} = 0b0; + let Inst{8-5} = Pn; + let Inst{4} = opc{0}; + let Inst{3-0} = Pd; + + let Constraints = !if(!eq (opc{0}, 1), "$Pd = $_Pd", ""); + let Defs = !if(!eq (opc{1}, 1), [NZCV], []); + +} + +multiclass sve_int_break_m<bits<3> opc, string asm> { + def NAME : sve_int_break<opc, asm, "/m", (ins PPR8:$_Pd, PPRAny:$Pg, PPR8:$Pn)>; +} + +multiclass sve_int_break_z<bits<3> opc, string asm> { + def NAME : sve_int_break<opc, asm, "/z", (ins PPRAny:$Pg, PPR8:$Pn)>; +} + diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index b201126c593b..21e44e9589d3 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -554,6 +554,7 @@ static bool fnegFoldsIntoOp(unsigned Opc) { case ISD::FTRUNC: case ISD::FRINT: case ISD::FNEARBYINT: + case ISD::FCANONICALIZE: case AMDGPUISD::RCP: 
case AMDGPUISD::RCP_LEGACY: case AMDGPUISD::RCP_IFLAG: @@ -907,6 +908,7 @@ void AMDGPUTargetLowering::analyzeFormalArgumentsCompute( LLVMContext &Ctx = Fn.getParent()->getContext(); const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF); const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset(Fn); + CallingConv::ID CC = Fn.getCallingConv(); unsigned MaxAlign = 1; uint64_t ExplicitArgOffset = 0; @@ -940,16 +942,10 @@ void AMDGPUTargetLowering::analyzeFormalArgumentsCompute( EVT ArgVT = ValueVTs[Value]; EVT MemVT = ArgVT; - MVT RegisterVT = - getRegisterTypeForCallingConv(Ctx, ArgVT); - unsigned NumRegs = - getNumRegistersForCallingConv(Ctx, ArgVT); - - if (!Subtarget->isAmdHsaOS() && - (ArgVT == MVT::i16 || ArgVT == MVT::i8 || ArgVT == MVT::f16)) { - // The ABI says the caller will extend these values to 32-bits. - MemVT = ArgVT.isInteger() ? MVT::i32 : MVT::f32; - } else if (NumRegs == 1) { + MVT RegisterVT = getRegisterTypeForCallingConv(Ctx, CC, ArgVT); + unsigned NumRegs = getNumRegistersForCallingConv(Ctx, CC, ArgVT); + + if (NumRegs == 1) { // This argument is not split, so the IR type is the memory type. if (ArgVT.isExtended()) { // We have an extended type, like i24, so we should just use the @@ -3600,6 +3596,7 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N, case ISD::FRINT: case ISD::FNEARBYINT: // XXX - Should fround be handled? case ISD::FSIN: + case ISD::FCANONICALIZE: case AMDGPUISD::RCP: case AMDGPUISD::RCP_LEGACY: case AMDGPUISD::RCP_IFLAG: diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td index 96b7568eec1f..7442a59e594f 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td @@ -342,8 +342,9 @@ def AMDGPUumed3 : SDNode<"AMDGPUISD::UMED3", AMDGPUDTIntTernaryOp, def AMDGPUfmed3 : SDNode<"AMDGPUISD::FMED3", SDTFPTernaryOp, []>; def AMDGPUfdot2 : SDNode<"AMDGPUISD::FDOT2", - SDTypeProfile<1, 3, [SDTCisSameAs<0, 3>, SDTCisSameAs<1, 2>, - SDTCisFP<0>, SDTCisVec<1>]>, + SDTypeProfile<1, 4, [SDTCisSameAs<0, 3>, SDTCisSameAs<1, 2>, + SDTCisFP<0>, SDTCisVec<1>, + SDTCisInt<4>]>, []>; def AMDGPUperm : SDNode<"AMDGPUISD::PERM", AMDGPUDTIntTernaryOp, []>; diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td index 9426df399597..c9c932ef2f5f 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -567,6 +567,7 @@ int PI = 0x40490fdb; int TWO_PI_INV = 0x3e22f983; int FP_UINT_MAX_PLUS_1 = 0x4f800000; // 1 << 32 in floating point encoding int FP16_ONE = 0x3C00; +int FP16_NEG_ONE = 0xBC00; int V2FP16_ONE = 0x3C003C00; int FP32_ONE = 0x3f800000; int FP32_NEG_ONE = 0xbf800000; diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp index 8cc7e38f7b29..c147830e12ed 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp @@ -100,16 +100,6 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) { unsigned Size = DL.getTypeSizeInBits(ArgTy); unsigned AllocSize = DL.getTypeAllocSize(ArgTy); - - // Clover seems to always pad i8/i16 to i32, but doesn't properly align - // them? - // Make sure the struct elements have correct size and alignment for ext - // args. These seem to be padded up to 4-bytes but not correctly aligned. 
- bool IsExtArg = AllocSize < 32 && (Arg.hasZExtAttr() || Arg.hasSExtAttr()) && - !ST.isAmdHsaOS(); - if (IsExtArg) - AllocSize = 4; - uint64_t EltOffset = alignTo(ExplicitArgOffset, Align) + BaseOffset; ExplicitArgOffset = alignTo(ExplicitArgOffset, Align) + AllocSize; @@ -164,8 +154,6 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) { ArgPtr->getName() + ".cast"); } - assert((!IsExtArg || !IsV3) && "incompatible situation"); - if (IsV3 && Size >= 32) { V4Ty = VectorType::get(VT->getVectorElementType(), 4); // Use the hack that clang uses to avoid SelectionDAG ruining v3 loads @@ -212,20 +200,6 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) { // TODO: Convert noalias arg to !noalias if (Size < 32 && !ArgTy->isAggregateType()) { - if (IsExtArg && OffsetDiff == 0) { - Type *I32Ty = Builder.getInt32Ty(); - bool IsSext = Arg.hasSExtAttr(); - Metadata *LowAndHigh[] = { - ConstantAsMetadata::get( - ConstantInt::get(I32Ty, IsSext ? minIntN(Size) : 0)), - ConstantAsMetadata::get( - ConstantInt::get(I32Ty, - IsSext ? maxIntN(Size) + 1 : maxUIntN(Size) + 1)) - }; - - Load->setMetadata(LLVMContext::MD_range, MDNode::get(Ctx, LowAndHigh)); - } - Value *ExtractBits = OffsetDiff == 0 ? Load : Builder.CreateLShr(Load, OffsetDiff * 8); diff --git a/contrib/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/contrib/llvm/lib/Target/AMDGPU/MIMGInstructions.td index 1e0bc62c45a6..44c2d366e461 100644 --- a/contrib/llvm/lib/Target/AMDGPU/MIMGInstructions.td +++ b/contrib/llvm/lib/Target/AMDGPU/MIMGInstructions.td @@ -66,6 +66,22 @@ def MIMGDimInfoTable : GenericTable { let PrimaryKeyName = "getMIMGDimInfo"; } +class MIMGLZMapping<MIMGBaseOpcode l, MIMGBaseOpcode lz> { + MIMGBaseOpcode L = l; + MIMGBaseOpcode LZ = lz; +} + +def MIMGLZMappingTable : GenericTable { + let FilterClass = "MIMGLZMapping"; + let CppTypeName = "MIMGLZMappingInfo"; + let Fields = ["L", "LZ"]; + GenericEnum TypeOf_L = MIMGBaseOpcode; + GenericEnum TypeOf_LZ = MIMGBaseOpcode; + + let PrimaryKey = ["L"]; + let PrimaryKeyName = "getMIMGLZMappingInfo"; +} + class mimg <bits<7> si, bits<7> vi = si> { field bits<7> SI = si; field bits<7> VI = vi; @@ -547,3 +563,13 @@ foreach intr = !listconcat(AMDGPUImageDimIntrinsics, AMDGPUImageDimAtomicIntrinsics) in { def : ImageDimIntrinsicInfo<intr>; } + +// L to LZ Optimization Mapping +def : MIMGLZMapping<IMAGE_SAMPLE_L, IMAGE_SAMPLE_LZ>; +def : MIMGLZMapping<IMAGE_SAMPLE_C_L, IMAGE_SAMPLE_C_LZ>; +def : MIMGLZMapping<IMAGE_SAMPLE_L_O, IMAGE_SAMPLE_LZ_O>; +def : MIMGLZMapping<IMAGE_SAMPLE_C_L_O, IMAGE_SAMPLE_C_LZ_O>; +def : MIMGLZMapping<IMAGE_GATHER4_L, IMAGE_GATHER4_LZ>; +def : MIMGLZMapping<IMAGE_GATHER4_C_L, IMAGE_GATHER4_C_LZ>; +def : MIMGLZMapping<IMAGE_GATHER4_L_O, IMAGE_GATHER4_LZ_O>; +def : MIMGLZMapping<IMAGE_GATHER4_C_L_O, IMAGE_GATHER4_C_LZ_O>; diff --git a/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 5b7fc2656a20..25007861fd15 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -694,6 +694,87 @@ bool SITargetLowering::isShuffleMaskLegal(ArrayRef<int>, EVT) const { return false; } +MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context, + CallingConv::ID CC, + EVT VT) const { + // TODO: Consider splitting all arguments into 32-bit pieces. 
+ if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) { + EVT ScalarVT = VT.getScalarType(); + unsigned Size = ScalarVT.getSizeInBits(); + if (Size == 32) + return ScalarVT.getSimpleVT(); + + if (Size == 64) + return MVT::i32; + + if (Size == 16 && + Subtarget->has16BitInsts() && + isPowerOf2_32(VT.getVectorNumElements())) + return VT.isInteger() ? MVT::v2i16 : MVT::v2f16; + } + + return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT); +} + +unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context, + CallingConv::ID CC, + EVT VT) const { + if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) { + unsigned NumElts = VT.getVectorNumElements(); + EVT ScalarVT = VT.getScalarType(); + unsigned Size = ScalarVT.getSizeInBits(); + + if (Size == 32) + return NumElts; + + if (Size == 64) + return 2 * NumElts; + + // FIXME: Fails to break down as we want with v3. + if (Size == 16 && Subtarget->has16BitInsts() && isPowerOf2_32(NumElts)) + return VT.getVectorNumElements() / 2; + } + + return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT); +} + +unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv( + LLVMContext &Context, CallingConv::ID CC, + EVT VT, EVT &IntermediateVT, + unsigned &NumIntermediates, MVT &RegisterVT) const { + if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) { + unsigned NumElts = VT.getVectorNumElements(); + EVT ScalarVT = VT.getScalarType(); + unsigned Size = ScalarVT.getSizeInBits(); + if (Size == 32) { + RegisterVT = ScalarVT.getSimpleVT(); + IntermediateVT = RegisterVT; + NumIntermediates = NumElts; + return NumIntermediates; + } + + if (Size == 64) { + RegisterVT = MVT::i32; + IntermediateVT = RegisterVT; + NumIntermediates = 2 * NumElts; + return NumIntermediates; + } + + // FIXME: We should fix the ABI to be the same on targets without 16-bit + // support, but unless we can properly handle 3-vectors, it will be still be + // inconsistent. + if (Size == 16 && Subtarget->has16BitInsts() && isPowerOf2_32(NumElts)) { + RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16; + IntermediateVT = RegisterVT; + NumIntermediates = NumElts / 2; + return NumIntermediates; + } + } + + return TargetLowering::getVectorTypeBreakdownForCallingConv( + Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT); +} + bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &CI, MachineFunction &MF, @@ -1268,6 +1349,8 @@ static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits, for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) { const ISD::InputArg *Arg = &Ins[I]; + assert(!Arg->VT.isVector() && "vector type argument should have been split"); + // First check if it's a PS input addr. if (CallConv == CallingConv::AMDGPU_PS && !Arg->Flags.isInReg() && !Arg->Flags.isByVal() && PSInputNum <= 15) { @@ -1301,25 +1384,7 @@ static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits, ++PSInputNum; } - // Second split vertices into their elements. - if (Arg->VT.isVector()) { - ISD::InputArg NewArg = *Arg; - NewArg.Flags.setSplit(); - NewArg.VT = Arg->VT.getVectorElementType(); - - // We REALLY want the ORIGINAL number of vertex elements here, e.g. a - // three or five element vertex only needs three or five registers, - // NOT four or eight. 
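The three getRegisterTypeForCallingConv / getNumRegistersForCallingConv / getVectorTypeBreakdownForCallingConv overrides above all encode one rule for vector arguments of non-kernel functions: 32-bit elements get one register each, 64-bit elements split into two i32 pieces, and 16-bit elements pack into v2i16/v2f16 pairs when the subtarget has 16-bit instructions. A rough standalone model of that rule (plain structs and strings stand in for EVT/MVT; this is a sketch, not LLVM code), which also shows why the manual per-element splitting being deleted around this hunk becomes redundant:

    #include <cassert>
    #include <cstdio>

    // Standalone model of the calling-convention breakdown for vector
    // arguments of non-kernel AMDGPU functions (paraphrased from the hunk).
    struct Breakdown {
      const char *RegisterVT; // register type each piece lives in
      unsigned NumRegs;       // number of such registers
    };

    Breakdown breakDownVector(unsigned ScalarBits, unsigned NumElts, bool IsFP,
                              bool Has16BitInsts) {
      bool IsPow2 = (NumElts & (NumElts - 1)) == 0;
      if (ScalarBits == 32)            // one 32-bit register per element
        return {IsFP ? "f32" : "i32", NumElts};
      if (ScalarBits == 64)            // each element split into two i32 halves
        return {"i32", 2 * NumElts};
      if (ScalarBits == 16 && Has16BitInsts && IsPow2) // pack pairs of halves
        return {IsFP ? "v2f16" : "v2i16", NumElts / 2};
      return {"default", 0};           // fall back to the generic lowering
    }

    int main() {
      Breakdown B = breakDownVector(32, 3, true, true);  // v3f32
      std::printf("v3f32 -> %u x %s\n", B.NumRegs, B.RegisterVT); // 3 x f32
      B = breakDownVector(16, 4, true, true);            // v4f16
      std::printf("v4f16 -> %u x %s\n", B.NumRegs, B.RegisterVT); // 2 x v2f16
      B = breakDownVector(64, 2, false, true);           // v2i64
      std::printf("v2i64 -> %u x %s\n", B.NumRegs, B.RegisterVT); // 4 x i32
      return 0;
    }

Note how a three-element vector of 32-bit values gets exactly three registers, which is the property the removed comment below insists on.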
- Type *ParamType = FType->getParamType(Arg->getOrigArgIndex()); - unsigned NumElements = ParamType->getVectorNumElements(); - - for (unsigned J = 0; J != NumElements; ++J) { - Splits.push_back(NewArg); - NewArg.PartOffset += NewArg.VT.getStoreSize(); - } - } else { - Splits.push_back(*Arg); - } + Splits.push_back(*Arg); } } @@ -4490,6 +4555,9 @@ SDValue SITargetLowering::lowerImage(SDValue Op, const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim); + const AMDGPU::MIMGLZMappingInfo *LZMappingInfo = + AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode); + unsigned IntrOpcode = Intr->BaseOpcode; SmallVector<EVT, 2> ResultTypes(Op->value_begin(), Op->value_end()); bool IsD16 = false; @@ -4575,6 +4643,18 @@ SDValue SITargetLowering::lowerImage(SDValue Op, SmallVector<SDValue, 4> VAddrs; for (unsigned i = 0; i < NumVAddrs; ++i) VAddrs.push_back(Op.getOperand(AddrIdx + i)); + + // Optimize _L to _LZ when _L is zero + if (LZMappingInfo) { + if (auto ConstantLod = + dyn_cast<ConstantFPSDNode>(VAddrs[NumVAddrs-1].getNode())) { + if (ConstantLod->isZero() || ConstantLod->isNegative()) { + IntrOpcode = LZMappingInfo->LZ; // set new opcode to _lz variant of _l + VAddrs.pop_back(); // remove 'lod' + } + } + } + SDValue VAddr = getBuildDwordsVector(DAG, DL, VAddrs); SDValue True = DAG.getTargetConstant(1, DL, MVT::i1); @@ -4634,10 +4714,10 @@ SDValue SITargetLowering::lowerImage(SDValue Op, int Opcode = -1; if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) - Opcode = AMDGPU::getMIMGOpcode(Intr->BaseOpcode, AMDGPU::MIMGEncGfx8, + Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8, NumVDataDwords, NumVAddrDwords); if (Opcode == -1) - Opcode = AMDGPU::getMIMGOpcode(Intr->BaseOpcode, AMDGPU::MIMGEncGfx6, + Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6, NumVDataDwords, NumVAddrDwords); assert(Opcode != -1); @@ -4945,7 +5025,8 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); case Intrinsic::amdgcn_fdot2: return DAG.getNode(AMDGPUISD::FDOT2, DL, VT, - Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + Op.getOperand(1), Op.getOperand(2), Op.getOperand(3), + Op.getOperand(4)); case Intrinsic::amdgcn_fmul_legacy: return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT, Op.getOperand(1), Op.getOperand(2)); @@ -6754,10 +6835,6 @@ static bool isCanonicalized(SelectionDAG &DAG, SDValue Op, return Op.getOperand(0).getValueType().getScalarType() != MVT::f16 || ST->hasFP16Denormals(); - case ISD::FP16_TO_FP: - case ISD::FP_TO_FP16: - return ST->hasFP16Denormals(); - // It can/will be lowered or combined as a bit operation. // Need to check their input recursively to handle. 
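The lowerImage hunk above rewrites image_sample_*_l and image_gather4_*_l to their _lz forms whenever the trailing LOD operand is a constant that is zero or negative, dropping that operand so the address vector shrinks by one dword. A compact standalone sketch of the same decision; the mapping table and double-valued operands here are simplified stand-ins for the generated MIMG tables and ConstantFPSDNode, not the real API:

    #include <cstdio>
    #include <map>
    #include <optional>
    #include <string>
    #include <vector>

    // Simplified stand-in for the TableGen-generated L -> LZ mapping table.
    static const std::map<std::string, std::string> LZMap = {
        {"image_sample_l", "image_sample_lz"},
        {"image_gather4_c_l_o", "image_gather4_c_lz_o"},
    };

    // If the trailing LOD operand is a known constant <= 0.0, rewrite the
    // opcode to the _lz form and drop the operand (mirrors the hunk above).
    void foldLToLZ(std::string &Opcode, std::vector<double> &VAddrs,
                   std::optional<double> ConstantLod) {
      auto It = LZMap.find(Opcode);
      if (It == LZMap.end() || !ConstantLod)
        return;
      if (*ConstantLod == 0.0 || *ConstantLod < 0.0) {
        Opcode = It->second; // _lz variant samples the base level implicitly
        VAddrs.pop_back();   // the explicit 'lod' address operand goes away
      }
    }

    int main() {
      std::string Op = "image_sample_l";
      std::vector<double> VAddrs = {0.5, 0.25, 0.0}; // s, t, lod
      foldLToLZ(Op, VAddrs, 0.0);
      std::printf("%s with %zu vaddr dwords\n", Op.c_str(), VAddrs.size());
      return 0;
    }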
case ISD::FNEG: @@ -6799,8 +6876,16 @@ SDValue SITargetLowering::performFCanonicalizeCombine( SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; - ConstantFPSDNode *CFP = isConstOrConstSplatFP(N->getOperand(0)); + SDValue N0 = N->getOperand(0); + // fcanonicalize undef -> qnan + if (N0.isUndef()) { + EVT VT = N->getValueType(0); + APFloat QNaN = APFloat::getQNaN(SelectionDAG::EVTToAPFloatSemantics(VT)); + return DAG.getConstantFP(QNaN, SDLoc(N), VT); + } + + ConstantFPSDNode *CFP = isConstOrConstSplatFP(N0); if (!CFP) { SDValue N0 = N->getOperand(0); EVT VT = N0.getValueType().getScalarType(); @@ -6853,7 +6938,7 @@ SDValue SITargetLowering::performFCanonicalizeCombine( return DAG.getConstantFP(CanonicalQNaN, SDLoc(N), VT); } - return N->getOperand(0); + return N0; } static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) { @@ -7544,8 +7629,10 @@ SDValue SITargetLowering::performFMACombine(SDNode *N, return SDValue(); if ((Vec1 == Vec3 && Vec2 == Vec4) || - (Vec1 == Vec4 && Vec2 == Vec3)) - return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc); + (Vec1 == Vec4 && Vec2 == Vec3)) { + return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc, + DAG.getTargetConstant(0, SL, MVT::i1)); + } } return SDValue(); } diff --git a/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.h b/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.h index ad049f2a71c3..5b3d49b3d8e3 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -25,6 +25,19 @@ class SITargetLowering final : public AMDGPUTargetLowering { private: const GCNSubtarget *Subtarget; +public: + MVT getRegisterTypeForCallingConv(LLVMContext &Context, + CallingConv::ID CC, + EVT VT) const override; + unsigned getNumRegistersForCallingConv(LLVMContext &Context, + CallingConv::ID CC, + EVT VT) const override; + + unsigned getVectorTypeBreakdownForCallingConv( + LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, + unsigned &NumIntermediates, MVT &RegisterVT) const override; + +private: SDValue lowerKernArgParameterPtr(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, uint64_t Offset) const; SDValue getImplicitArgPtr(SelectionDAG &DAG, const SDLoc &SL) const; diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp b/contrib/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp index 61c8f359e168..dc9397cf7b85 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp @@ -133,28 +133,10 @@ bool SIInsertSkips::shouldSkip(const MachineBasicBlock &From, I->getOpcode() == AMDGPU::S_CBRANCH_VCCZ) return true; - // V_READFIRSTLANE/V_READLANE destination register may be used as operand - // by some SALU instruction. If exec mask is zero vector instruction - // defining the register that is used by the scalar one is not executed - // and scalar instruction will operate on undefined data. For - // V_READFIRSTLANE/V_READLANE we should avoid predicated execution. - if ((I->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) || - (I->getOpcode() == AMDGPU::V_READLANE_B32)) { + if (TII->hasUnwantedEffectsWhenEXECEmpty(*I)) return true; - } - - if (I->isInlineAsm()) { - const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo(); - const char *AsmStr = I->getOperand(0).getSymbolName(); - - // inlineasm length estimate is number of bytes assuming the longest - // instruction. 
- uint64_t MaxAsmSize = TII->getInlineAsmLength(AsmStr, *MAI); - NumInstr += MaxAsmSize / MAI->getMaxInstLength(); - } else { - ++NumInstr; - } + ++NumInstr; if (NumInstr >= SkipThreshold) return true; } diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 6c85c92454c3..f3745382a6f4 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -2332,6 +2332,36 @@ bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI, changesVGPRIndexingMode(MI); } +bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const { + unsigned Opcode = MI.getOpcode(); + + if (MI.mayStore() && isSMRD(MI)) + return true; // scalar store or atomic + + // These instructions cause shader I/O that may cause hardware lockups + // when executed with an empty EXEC mask. + // + // Note: exp with VM = DONE = 0 is automatically skipped by hardware when + // EXEC = 0, but checking for that case here seems not worth it + // given the typical code patterns. + if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT || + Opcode == AMDGPU::EXP || Opcode == AMDGPU::EXP_DONE) + return true; + + if (MI.isInlineAsm()) + return true; // conservative assumption + + // These are like SALU instructions in terms of effects, so it's questionable + // whether we should return true for those. + // + // However, executing them with EXEC = 0 causes them to operate on undefined + // data, which we avoid by returning true here. + if (Opcode == AMDGPU::V_READFIRSTLANE_B32 || Opcode == AMDGPU::V_READLANE_B32) + return true; + + return false; +} + bool SIInstrInfo::isInlineConstant(const APInt &Imm) const { switch (Imm.getBitWidth()) { case 32: diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.h index 0a735257d34e..d681b926504e 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -597,6 +597,9 @@ public: return !RI.isSGPRReg(MRI, Dest); } + /// Whether we must prevent this instruction from executing with EXEC = 0. 
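hasUnwantedEffectsWhenEXECEmpty gathers the cases SIInsertSkips used to test inline: scalar stores, message and export instructions, inline asm, and the readlane family. A standalone sketch of how the simplified shouldSkip loop consumes such a predicate; the Inst record and the loop shape are invented for illustration:

    #include <cstdio>
    #include <vector>

    // Toy instruction record; the real check inspects opcodes and flags on
    // MachineInstr (SMRD stores, s_sendmsg, exp, inline asm, readlanes...).
    struct Inst {
      bool UnsafeWhenExecZero; // result of hasUnwantedEffectsWhenEXECEmpty()
      bool IsDebug;            // debug values don't count against the budget
    };

    // A skip branch is required if any instruction must not run with
    // EXEC = 0, or if falling through would execute at least SkipThreshold
    // real instructions.
    bool shouldSkip(const std::vector<Inst> &Block, unsigned SkipThreshold) {
      unsigned NumInstr = 0;
      for (const Inst &I : Block) {
        if (I.IsDebug)
          continue;
        if (I.UnsafeWhenExecZero)
          return true;
        if (++NumInstr >= SkipThreshold)
          return true;
      }
      return false;
    }

    int main() {
      std::vector<Inst> Block = {{false, false}, {true, false}};
      std::printf("needs skip: %d\n", shouldSkip(Block, 12)); // 1: unsafe inst
      return 0;
    }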
+ bool hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const; + bool isInlineConstant(const APInt &Imm) const; bool isInlineConstant(const MachineOperand &MO, uint8_t OperandType) const; diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td b/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td index c3f8bfb53ef4..5c10646161b3 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -1387,6 +1387,11 @@ def : GCNPat< >; def : GCNPat< + (fcanonicalize (f16 (fneg (VOP3Mods f16:$src, i32:$src_mods)))), + (V_MUL_F16_e64 0, (i32 CONST.FP16_NEG_ONE), $src_mods, $src, 0, 0) +>; + +def : GCNPat< (fcanonicalize (v2f16 (VOP3PMods v2f16:$src, i32:$src_mods))), (V_PK_MUL_F16 0, (i32 CONST.V2FP16_ONE), $src_mods, $src, DSTCLAMP.NONE) >; @@ -1411,6 +1416,11 @@ def : GCNPat< (fcanonicalize (f32 (VOP3Mods f32:$src, i32:$src_mods))), (V_MUL_F32_e64 0, (i32 CONST.FP32_ONE), $src_mods, $src, 0, 0) >; + +def : GCNPat< + (fcanonicalize (f32 (fneg (VOP3Mods f32:$src, i32:$src_mods)))), + (V_MUL_F32_e64 0, (i32 CONST.FP32_NEG_ONE), $src_mods, $src, 0, 0) +>; } let OtherPredicates = [FP32Denormals] in { diff --git a/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 3fd3c75874a3..4eba19382315 100644 --- a/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -110,6 +110,7 @@ struct MIMGInfo { #define GET_MIMGBaseOpcodesTable_IMPL #define GET_MIMGDimInfoTable_IMPL #define GET_MIMGInfoTable_IMPL +#define GET_MIMGLZMappingTable_IMPL #include "AMDGPUGenSearchableTables.inc" int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, diff --git a/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 70681c271697..5b7af8268cda 100644 --- a/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -42,6 +42,7 @@ namespace AMDGPU { #define GET_MIMGBaseOpcode_DECL #define GET_MIMGDim_DECL #define GET_MIMGEncoding_DECL +#define GET_MIMGLZMapping_DECL #include "AMDGPUGenSearchableTables.inc" namespace IsaInfo { @@ -211,6 +212,14 @@ struct MIMGDimInfo { LLVM_READONLY const MIMGDimInfo *getMIMGDimInfo(unsigned Dim); +struct MIMGLZMappingInfo { + MIMGBaseOpcode L; + MIMGBaseOpcode LZ; +}; + +LLVM_READONLY +const MIMGLZMappingInfo *getMIMGLZMappingInfo(unsigned L); + LLVM_READONLY int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords); diff --git a/contrib/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/contrib/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index 5c78ada3211e..b51828b54679 100644 --- a/contrib/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/contrib/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -167,13 +167,30 @@ defm : MadFmaMixPats<fma, V_FMA_MIX_F32, V_FMA_MIXLO_F16, V_FMA_MIXHI_F16>; let SubtargetPredicate = HasDLInsts in { -def V_DOT2_F32_F16 : VOP3PInst<"v_dot2_f32_f16", VOP3_Profile<VOP_F32_V2F16_V2F16_F32>, AMDGPUfdot2>; -def V_DOT2_I32_I16 : VOP3PInst<"v_dot2_i32_i16", VOP3_Profile<VOP_I32_V2I16_V2I16_I32>, int_amdgcn_sdot2>; -def V_DOT2_U32_U16 : VOP3PInst<"v_dot2_u32_u16", VOP3_Profile<VOP_I32_V2I16_V2I16_I32>, int_amdgcn_udot2>; -def V_DOT4_I32_I8 : VOP3PInst<"v_dot4_i32_i8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_sdot4>; -def V_DOT4_U32_U8 : VOP3PInst<"v_dot4_u32_u8", VOP3_Profile<VOP_I32_I32_I32_I32, 
VOP3_PACKED>, int_amdgcn_udot4>; -def V_DOT8_I32_I4 : VOP3PInst<"v_dot8_i32_i4", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_sdot8>; -def V_DOT8_U32_U4 : VOP3PInst<"v_dot8_u32_u4", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_udot8>; +def V_DOT2_F32_F16 : VOP3PInst<"v_dot2_f32_f16", VOP3_Profile<VOP_F32_V2F16_V2F16_F32>>; +def V_DOT2_I32_I16 : VOP3PInst<"v_dot2_i32_i16", VOP3_Profile<VOP_I32_V2I16_V2I16_I32>>; +def V_DOT2_U32_U16 : VOP3PInst<"v_dot2_u32_u16", VOP3_Profile<VOP_I32_V2I16_V2I16_I32>>; +def V_DOT4_I32_I8 : VOP3PInst<"v_dot4_i32_i8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>>; +def V_DOT4_U32_U8 : VOP3PInst<"v_dot4_u32_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>>; +def V_DOT8_I32_I4 : VOP3PInst<"v_dot8_i32_i4", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>>; +def V_DOT8_U32_U4 : VOP3PInst<"v_dot8_u32_u4", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>>; + +multiclass DotPats<SDPatternOperator dot_op, + VOP3PInst dot_inst> { + def : GCNPat < + (dot_op (dot_inst.Pfl.Src0VT (VOP3PMods0 dot_inst.Pfl.Src0VT:$src0, i32:$src0_modifiers)), + (dot_inst.Pfl.Src1VT (VOP3PMods dot_inst.Pfl.Src1VT:$src1, i32:$src1_modifiers)), + (dot_inst.Pfl.Src2VT (VOP3PMods dot_inst.Pfl.Src2VT:$src2, i32:$src2_modifiers)), i1:$clamp), + (dot_inst $src0_modifiers, $src0, $src1_modifiers, $src1, $src2_modifiers, $src2, (as_i1imm $clamp))>; +} + +defm : DotPats<AMDGPUfdot2, V_DOT2_F32_F16>; +defm : DotPats<int_amdgcn_sdot2, V_DOT2_I32_I16>; +defm : DotPats<int_amdgcn_udot2, V_DOT2_U32_U16>; +defm : DotPats<int_amdgcn_sdot4, V_DOT4_I32_I8>; +defm : DotPats<int_amdgcn_udot4, V_DOT4_U32_U8>; +defm : DotPats<int_amdgcn_sdot8, V_DOT8_I32_I4>; +defm : DotPats<int_amdgcn_udot8, V_DOT8_U32_U4>; } // End SubtargetPredicate = HasDLInsts diff --git a/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.cpp b/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.cpp index 2196f9b47f3b..b227eaed8d61 100644 --- a/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.cpp @@ -117,7 +117,7 @@ bool ARMAsmPrinter::runOnMachineFunction(MachineFunction &MF) { // globals from all functions in PromotedGlobals. for (auto *GV : AFI->getGlobalsPromotedToConstantPool()) PromotedGlobals.insert(GV); - + // Calculate this function's optimization goal. 
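The PrintAsmOperand fix above deserves a note: the old code ignored the return value of InlineAsm::hasRegClassConstraint, so RC could be read uninitialized when no constraint was encoded, and it compared the class ID directly instead of accepting subclasses of GPRPair. A minimal illustration of the bug pattern and the checked form (toy functions and IDs, not the LLVM API):

    #include <cstdio>
    #include <optional>

    // Toy stand-in: returns the register-class constraint, if one is encoded.
    std::optional<unsigned> regClassConstraint(unsigned Flags) {
      if (Flags & 0x8000)       // hypothetical "has constraint" bit
        return Flags & 0x7fff;  // hypothetical class-id field
      return std::nullopt;
    }

    constexpr unsigned GPRPairID = 42; // hypothetical class id

    bool isGPRPairOperand(unsigned Flags) {
      // Buggy shape: unsigned RC; hasRegClassConstraint(Flags, RC);
      // if (RC == GPRPairID) ...   <- RC is uninitialized when the call
      // reports no constraint.
      //
      // Fixed shape: only look at the class when the constraint is present,
      // and (in the real patch) accept subclasses via hasSubClassEq().
      if (auto RC = regClassConstraint(Flags))
        return *RC == GPRPairID;
      return false;
    }

    int main() {
      std::printf("%d %d\n", isGPRPairOperand(0x8000 | 42),
                  isGPRPairOperand(0));
      return 0;
    }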
unsigned OptimizationGoal; if (F.hasFnAttribute(Attribute::OptimizeNone)) @@ -367,8 +367,9 @@ bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, unsigned NumVals = InlineAsm::getNumOperandRegisters(Flags); unsigned RC; - InlineAsm::hasRegClassConstraint(Flags, RC); - if (RC == ARM::GPRPairRegClassID) { + const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); + if (InlineAsm::hasRegClassConstraint(Flags, RC) && + ARM::GPRPairRegClass.hasSubClassEq(TRI->getRegClass(RC))) { if (NumVals != 1) return true; const MachineOperand &MO = MI->getOperand(OpNum); @@ -990,7 +991,7 @@ void ARMAsmPrinter::EmitJumpTableTBInst(const MachineInstr *MI, if (Subtarget->isThumb1Only()) EmitAlignment(2); - + MCSymbol *JTISymbol = GetARMJTIPICJumpTableLabel(JTI); OutStreamer->EmitLabel(JTISymbol); diff --git a/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp index 43e8b7d66c62..5342e6e2cd13 100644 --- a/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp @@ -584,7 +584,7 @@ needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const { // don't know for sure yet whether we'll need that, so we guess based // on whether there are any local variables that would trigger it. unsigned StackAlign = TFI->getStackAlignment(); - if (TFI->hasFP(MF) && + if (TFI->hasFP(MF) && !((MFI.getLocalFrameMaxAlign() > StackAlign) && canRealignStack(MF))) { if (isFrameOffsetLegal(MI, getFrameRegister(MF), FPOffset)) return false; diff --git a/contrib/llvm/lib/Target/ARM/ARMCallingConv.h b/contrib/llvm/lib/Target/ARM/ARMCallingConv.h index 63bf48abb7ac..543165de38d0 100644 --- a/contrib/llvm/lib/Target/ARM/ARMCallingConv.h +++ b/contrib/llvm/lib/Target/ARM/ARMCallingConv.h @@ -269,14 +269,15 @@ static bool CC_ARM_AAPCS_Custom_Aggregate(unsigned &ValNo, MVT &ValVT, for (auto Reg : RegList) State.AllocateReg(Reg); + // After the first item has been allocated, the rest are packed as tightly as + // possible. (E.g. an incoming i64 would have starting Align of 8, but we'll + // be allocating a bunch of i32 slots). + unsigned RestAlign = std::min(Align, Size); + for (auto &It : PendingMembers) { It.convertToMem(State.AllocateStack(Size, Align)); State.addLoc(It); - - // After the first item has been allocated, the rest are packed as tightly - // as possible. (E.g. an incoming i64 would have starting Align of 8, but - // we'll be allocating a bunch of i32 slots). 
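The RestAlign change above fixes the in-loop assignment Align = Size: computing std::min(Align, Size) once before the loop means later aggregate members can only be packed more tightly than the first, never padded out further. A standalone model of the allocation loop; the allocator and the numbers are illustrative, chosen so the old and new behaviour actually differ:

    #include <algorithm>
    #include <cstdio>

    // Minimal stack allocator: round Offset up to Align, return the slot.
    unsigned allocateStack(unsigned &Offset, unsigned Size, unsigned Align) {
      Offset = (Offset + Align - 1) & ~(Align - 1);
      unsigned Slot = Offset;
      Offset += Size;
      return Slot;
    }

    int main() {
      // Three pending 8-byte members whose starting alignment is 4, with 4
      // bytes already on the stack.
      unsigned Size = 8, Align = 4, Offset = 4;

      unsigned RestAlign = std::min(Align, Size); // the fix: never raise Align
      for (int I = 0; I != 3; ++I) {
        std::printf("member %d at offset %u\n", I,
                    allocateStack(Offset, Size, Align));
        // Old code did 'Align = Size' here, which for Size > Align would pad
        // later members to 8 bytes (offsets 4, 16, 24) instead of packing
        // them tightly (offsets 4, 12, 20).
        Align = RestAlign;
      }
      return 0;
    }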
- Align = Size; + Align = RestAlign; } // All pending members have now been allocated diff --git a/contrib/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp b/contrib/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp index de08eb8c6985..2c4738d3cb74 100644 --- a/contrib/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp @@ -2128,7 +2128,7 @@ bool ARMConstantIslands::optimizeThumb2JumpTables() { unsigned DeadSize = 0; bool CanDeleteLEA = false; bool BaseRegKill = false; - + unsigned IdxReg = ~0U; bool IdxRegKill = true; if (isThumb2) { diff --git a/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.h b/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.h index 5139a18f9263..55194ed94532 100644 --- a/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.h +++ b/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.h @@ -113,7 +113,7 @@ public: bool isLSDA() const { return Kind == ARMCP::CPLSDA; } bool isMachineBasicBlock() const{ return Kind == ARMCP::CPMachineBasicBlock; } bool isPromotedGlobal() const{ return Kind == ARMCP::CPPromotedGlobal; } - + int getExistingMachineCPValue(MachineConstantPool *CP, unsigned Alignment) override; diff --git a/contrib/llvm/lib/Target/ARM/ARMFastISel.cpp b/contrib/llvm/lib/Target/ARM/ARMFastISel.cpp index 26d4aaa12acf..a66cd7053c0a 100644 --- a/contrib/llvm/lib/Target/ARM/ARMFastISel.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMFastISel.cpp @@ -2116,7 +2116,7 @@ bool ARMFastISel::SelectRet(const Instruction *I) { CallingConv::ID CC = F.getCallingConv(); if (Ret->getNumOperands() > 0) { SmallVector<ISD::OutputArg, 4> Outs; - GetReturnInfo(F.getReturnType(), F.getAttributes(), Outs, TLI, DL); + GetReturnInfo(CC, F.getReturnType(), F.getAttributes(), Outs, TLI, DL); // Analyze operands of the call, assigning locations to each operand. SmallVector<CCValAssign, 16> ValLocs; diff --git a/contrib/llvm/lib/Target/ARM/ARMFrameLowering.cpp b/contrib/llvm/lib/Target/ARM/ARMFrameLowering.cpp index af983ce2606a..a8c75702d7b5 100644 --- a/contrib/llvm/lib/Target/ARM/ARMFrameLowering.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMFrameLowering.cpp @@ -372,7 +372,7 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF, // Debug location must be unknown since the first debug location is used // to determine the end of the prologue. DebugLoc dl; - + unsigned FramePtr = RegInfo->getFrameRegister(MF); // Determine the sizes of each callee-save spill areas and record which frame diff --git a/contrib/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/contrib/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp index 081d4ff033bd..9592dd53c347 100644 --- a/contrib/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -2539,7 +2539,7 @@ void ARMDAGToDAGISel::SelectCMPZ(SDNode *N, bool &SwitchEQNEToPLMI) { return CurDAG->getMachineNode(Opc, dl, MVT::i32, Ops); } }; - + if (Range->second == 0) { // 1. 
Mask includes the LSB -> Simply shift the top N bits off NewN = EmitShift(ARM::tLSLri, X, 31 - Range->first); @@ -2633,7 +2633,7 @@ void ARMDAGToDAGISel::Select(SDNode *N) { MachineMemOperand::MOLoad, 4, 4); cast<MachineSDNode>(ResNode)->setMemRefs(MemOp, MemOp+1); - + ReplaceNode(N, ResNode); return; } @@ -2920,7 +2920,7 @@ void ARMDAGToDAGISel::Select(SDNode *N) { assert(N3.getOpcode() == ISD::Register); unsigned CC = (unsigned) cast<ConstantSDNode>(N2)->getZExtValue(); - + if (InFlag.getOpcode() == ARMISD::CMPZ) { bool SwitchEQNEToPLMI; SelectCMPZ(InFlag.getNode(), SwitchEQNEToPLMI); @@ -3023,7 +3023,7 @@ void ARMDAGToDAGISel::Select(SDNode *N) { // Other cases are autogenerated. break; } - + case ARMISD::VZIP: { unsigned Opc = 0; EVT VT = N->getValueType(0); diff --git a/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp b/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp index 47222a66f798..ede276dd91bb 100644 --- a/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -3096,7 +3096,7 @@ static SDValue promoteToConstantPool(const GlobalValue *GV, SelectionDAG &DAG, // need to be duplicated) or duplicating the constant wouldn't increase code // size (implying the constant is no larger than 4 bytes). const Function &F = DAG.getMachineFunction().getFunction(); - + // We rely on this decision to inline being idemopotent and unrelated to the // use-site. We know that if we inline a variable at one use site, we'll // inline it elsewhere too (and reuse the constant pool entry). Fast-isel @@ -5162,7 +5162,7 @@ static SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG, return SDValue(); // SoftFP: read half-precision arguments: // - // t2: i32,ch = ... + // t2: i32,ch = ... // t7: i16 = truncate t2 <~~~~ Op // t8: f16 = bitcast t7 <~~~~ N // @@ -5173,7 +5173,7 @@ static SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG, return SDValue(); } - // Half-precision return values + // Half-precision return values if (SrcVT == MVT::f16 && DstVT == MVT::i16) { if (!HasFullFP16) return SDValue(); @@ -13461,13 +13461,13 @@ bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op, auto *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1)); if (!RHS || RHS->getZExtValue() != 4) return false; - + Offset = Op->getOperand(1); Base = Op->getOperand(0); AM = ISD::POST_INC; return true; } - + bool isInc; bool isLegal = false; if (Subtarget->isThumb2()) diff --git a/contrib/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/contrib/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp index 901138dbdfd5..db5f28480e90 100644 --- a/contrib/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp @@ -1275,7 +1275,7 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineInstr *MI) { // we're minimizing code size. if (!MBB.getParent()->getFunction().optForMinSize() || !BaseKill) return false; - + bool HighRegsUsed = false; for (unsigned i = 2, e = MI->getNumOperands(); i != e; ++i) if (MI->getOperand(i).getReg() >= ARM::R8) { diff --git a/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h b/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h index 816116772995..91310e81e398 100644 --- a/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h +++ b/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h @@ -126,7 +126,7 @@ class ARMFunctionInfo : public MachineFunctionInfo { /// The amount the literal pool has been increasedby due to promoted globals. 
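The SelectCMPZ comment above describes the mask-includes-LSB case: when the AND mask covers the contiguous bit range [0, First], comparing (x & mask) against zero is equivalent to comparing x shifted left by (31 - First) against zero, since the shift discards exactly the bits above the mask. A tiny standalone check of that identity (the range bookkeeping here is an assumption drawn from the surrounding hunk):

    #include <cstdint>
    #include <cstdio>

    // For a mask whose set bits are the contiguous range [0, First],
    // (x & mask) == 0 is the same as (x << (31 - First)) == 0.
    bool cmpzViaShift(uint32_t X, unsigned First) {
      return (X << (31 - First)) == 0;
    }

    int main() {
      uint32_t Mask = 0x000000ff; // bits [7:0], so First = 7
      for (uint32_t X : {0x100u, 0x1ffu, 0x0u})
        std::printf("x=%#x: and=%d shift=%d\n", X,
                    (X & Mask) == 0, (int)cmpzViaShift(X, 7));
      return 0;
    }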
int PromotedGlobalsIncrease = 0; - + public: ARMFunctionInfo() = default; diff --git a/contrib/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp index d4fbf76f299f..4d685158e258 100644 --- a/contrib/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp @@ -49,7 +49,7 @@ SDValue ARMSelectionDAGInfo::EmitSpecializedLibcall( case RTLIB::MEMMOVE: AEABILibcall = AEABI_MEMMOVE; break; - case RTLIB::MEMSET: + case RTLIB::MEMSET: AEABILibcall = AEABI_MEMSET; if (ConstantSDNode *ConstantSrc = dyn_cast<ConstantSDNode>(Src)) if (ConstantSrc->getZExtValue() == 0) @@ -93,14 +93,14 @@ SDValue ARMSelectionDAGInfo::EmitSpecializedLibcall( else if (Src.getValueType().bitsLT(MVT::i32)) Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Src); - Entry.Node = Src; + Entry.Node = Src; Entry.Ty = Type::getInt32Ty(*DAG.getContext()); Entry.IsSExt = false; Args.push_back(Entry); } else { Entry.Node = Src; Args.push_back(Entry); - + Entry.Node = Size; Args.push_back(Entry); } @@ -121,7 +121,7 @@ SDValue ARMSelectionDAGInfo::EmitSpecializedLibcall( std::move(Args)) .setDiscardResult(); std::pair<SDValue,SDValue> CallResult = TLI->LowerCallTo(CLI); - + return CallResult.second; } diff --git a/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp index f8cae31641ff..94f9cefe429c 100644 --- a/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -389,7 +389,7 @@ int ARMTTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE, unsigned NumVectorInstToHideOverhead = 10; int MaxMergeDistance = 64; - if (Ty->isVectorTy() && SE && + if (Ty->isVectorTy() && SE && !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1)) return NumVectorInstToHideOverhead; diff --git a/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.h index cd9fa0709020..e0cd2d8e26a6 100644 --- a/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.h +++ b/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.h @@ -153,7 +153,7 @@ public: int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index); - int getAddressComputationCost(Type *Val, ScalarEvolution *SE, + int getAddressComputationCost(Type *Val, ScalarEvolution *SE, const SCEV *Ptr); int getArithmeticInstrCost( diff --git a/contrib/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/contrib/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp index 807d62547337..a5fbbbf26be9 100644 --- a/contrib/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp +++ b/contrib/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp @@ -969,7 +969,7 @@ public: // checks whether this operand is a memory operand computed as an offset // applied to PC. the offset may have 8 bits of magnitude and is represented - // with two bits of shift. textually it may be either [pc, #imm], #imm or + // with two bits of shift. textually it may be either [pc, #imm], #imm or // relocable expression... bool isThumbMemPC() const { int64_t Val = 0; @@ -2284,7 +2284,7 @@ public: } const MCSymbolRefExpr *SR = dyn_cast<MCSymbolRefExpr>(Imm.Val); - + assert(SR && "Unknown value type!"); Inst.addOperand(MCOperand::createExpr(SR)); return; @@ -2326,7 +2326,7 @@ public: assert(isImm() && "Not an immediate!"); // If we have an immediate that's not a constant, treat it as a label - // reference needing a fixup. + // reference needing a fixup. 
if (!isa<MCConstantExpr>(getImm())) { Inst.addOperand(MCOperand::createExpr(getImm())); return; @@ -3419,7 +3419,7 @@ int ARMAsmParser::tryParseShiftRegister(OperandVector &Operands) { SMLoc S = Parser.getTok().getLoc(); const AsmToken &Tok = Parser.getTok(); if (Tok.isNot(AsmToken::Identifier)) - return -1; + return -1; std::string lowerCase = Tok.getString().lower(); ARM_AM::ShiftOpc ShiftTy = StringSwitch<ARM_AM::ShiftOpc>(lowerCase) @@ -4311,7 +4311,7 @@ ARMAsmParser::parseProcIFlagsOperand(OperandVector &Operands) { MCAsmParser &Parser = getParser(); SMLoc S = Parser.getTok().getLoc(); const AsmToken &Tok = Parser.getTok(); - if (!Tok.is(AsmToken::Identifier)) + if (!Tok.is(AsmToken::Identifier)) return MatchOperand_NoMatch; StringRef IFlagsStr = Tok.getString(); @@ -4353,7 +4353,7 @@ ARMAsmParser::parseMSRMaskOperand(OperandVector &Operands) { return MatchOperand_NoMatch; } unsigned SYSmvalue = Val & 0xFF; - Parser.Lex(); + Parser.Lex(); Operands.push_back(ARMOperand::CreateMSRMask(SYSmvalue, S)); return MatchOperand_Success; } @@ -4996,7 +4996,7 @@ void ARMAsmParser::cvtThumbBranches(MCInst &Inst, // first decide whether or not the branch should be conditional // by looking at it's location relative to an IT block if(inITBlock()) { - // inside an IT block we cannot have any conditional branches. any + // inside an IT block we cannot have any conditional branches. any // such instructions needs to be converted to unconditional form switch(Inst.getOpcode()) { case ARM::tBcc: Inst.setOpcode(ARM::tB); break; @@ -5008,11 +5008,11 @@ void ARMAsmParser::cvtThumbBranches(MCInst &Inst, unsigned Cond = static_cast<ARMOperand &>(*Operands[CondOp]).getCondCode(); switch(Inst.getOpcode()) { case ARM::tB: - case ARM::tBcc: - Inst.setOpcode(Cond == ARMCC::AL ? ARM::tB : ARM::tBcc); + case ARM::tBcc: + Inst.setOpcode(Cond == ARMCC::AL ? ARM::tB : ARM::tBcc); break; case ARM::t2B: - case ARM::t2Bcc: + case ARM::t2Bcc: Inst.setOpcode(Cond == ARMCC::AL ? ARM::t2B : ARM::t2Bcc); break; } @@ -8882,7 +8882,7 @@ bool ARMAsmParser::processInstruction(MCInst &Inst, case ARM::MOVsi: { ARM_AM::ShiftOpc SOpc = ARM_AM::getSORegShOp(Inst.getOperand(2).getImm()); // rrx shifts and asr/lsr of #32 is encoded as 0 - if (SOpc == ARM_AM::rrx || SOpc == ARM_AM::asr || SOpc == ARM_AM::lsr) + if (SOpc == ARM_AM::rrx || SOpc == ARM_AM::asr || SOpc == ARM_AM::lsr) return false; if (ARM_AM::getSORegOffset(Inst.getOperand(2).getImm()) == 0) { // Shifting by zero is accepted as a vanilla 'MOVr' @@ -9371,6 +9371,12 @@ bool ARMAsmParser::ParseDirective(AsmToken DirectiveID) { return parseDirectiveAlign(DirectiveID.getLoc()); // Use Generic on failure. 
else if (IDVal == ".thumb_set") parseDirectiveThumbSet(DirectiveID.getLoc()); + else if (IDVal == ".inst") + parseDirectiveInst(DirectiveID.getLoc()); + else if (IDVal == ".inst.n") + parseDirectiveInst(DirectiveID.getLoc(), 'n'); + else if (IDVal == ".inst.w") + parseDirectiveInst(DirectiveID.getLoc(), 'w'); else if (!IsMachO && !IsCOFF) { if (IDVal == ".arch") parseDirectiveArch(DirectiveID.getLoc()); @@ -9382,12 +9388,6 @@ bool ARMAsmParser::ParseDirective(AsmToken DirectiveID) { parseDirectiveFPU(DirectiveID.getLoc()); else if (IDVal == ".fnstart") parseDirectiveFnStart(DirectiveID.getLoc()); - else if (IDVal == ".inst") - parseDirectiveInst(DirectiveID.getLoc()); - else if (IDVal == ".inst.n") - parseDirectiveInst(DirectiveID.getLoc(), 'n'); - else if (IDVal == ".inst.w") - parseDirectiveInst(DirectiveID.getLoc(), 'w'); else if (IDVal == ".object_arch") parseDirectiveObjectArch(DirectiveID.getLoc()); else if (IDVal == ".tlsdescseq") @@ -10012,8 +10012,8 @@ bool ARMAsmParser::parseDirectiveInst(SMLoc Loc, char Suffix) { case 'w': break; default: - return Error(Loc, "cannot determine Thumb instruction size, " - "use inst.n/inst.w instead"); + Width = 0; + break; } } else { if (Suffix) @@ -10029,6 +10029,7 @@ bool ARMAsmParser::parseDirectiveInst(SMLoc Loc, char Suffix) { return Error(Loc, "expected constant expression"); } + char CurSuffix = Suffix; switch (Width) { case 2: if (Value->getValue() > 0xffff) @@ -10039,11 +10040,21 @@ bool ARMAsmParser::parseDirectiveInst(SMLoc Loc, char Suffix) { return Error(Loc, StringRef(Suffix ? "inst.w" : "inst") + " operand is too big"); break; + case 0: + // Thumb mode, no width indicated. Guess from the opcode, if possible. + if (Value->getValue() < 0xe800) + CurSuffix = 'n'; + else if (Value->getValue() >= 0xe8000000) + CurSuffix = 'w'; + else + return Error(Loc, "cannot determine Thumb instruction size, " + "use inst.n/inst.w instead"); + break; default: llvm_unreachable("only supported widths are 2 and 4"); } - getTargetStreamer().emitInst(Value->getValue(), Suffix); + getTargetStreamer().emitInst(Value->getValue(), CurSuffix); return false; }; diff --git a/contrib/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/contrib/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp index 4733cf49827e..61bec04678dd 100644 --- a/contrib/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp +++ b/contrib/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp @@ -620,7 +620,7 @@ ThumbDisassembler::AddThumbPredicate(MCInst &MI) const { // assume a predicate of AL. 
unsigned CC; CC = ITBlock.getITCC(); - if (CC == 0xF) + if (CC == 0xF) CC = ARMCC::AL; if (ITBlock.instrInITBlock()) ITBlock.advanceITState(); @@ -888,7 +888,7 @@ DecodeGPRnopcRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder) { DecodeStatus S = MCDisassembler::Success; - if (RegNo == 15) + if (RegNo == 15) S = MCDisassembler::SoftFail; Check(S, DecodeGPRRegisterClass(Inst, RegNo, Address, Decoder)); @@ -2171,7 +2171,7 @@ static DecodeStatus DecodeSETPANInstruction(MCInst &Inst, unsigned Insn, const MCDisassembler *Dis = static_cast<const MCDisassembler*>(Decoder); const FeatureBitset &FeatureBits = Dis->getSubtargetInfo().getFeatureBits(); - if (!FeatureBits[ARM::HasV8_1aOps] || + if (!FeatureBits[ARM::HasV8_1aOps] || !FeatureBits[ARM::HasV8Ops]) return MCDisassembler::Fail; @@ -4467,7 +4467,7 @@ static DecodeStatus DecodeVST1LN(MCInst &Inst, unsigned Insn, index = fieldFromInstruction(Insn, 7, 1); switch (fieldFromInstruction(Insn, 4, 2)) { - case 0: + case 0: align = 0; break; case 3: align = 4; break; @@ -5279,7 +5279,7 @@ static DecodeStatus DecodeLDR(MCInst &Inst, unsigned Val, return MCDisassembler::Fail; if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rn, Address, Decoder))) return MCDisassembler::Fail; - if (!Check(S, DecodeAddrMode7Operand(Inst, Rn, Address, Decoder))) + if (!Check(S, DecodeAddrMode7Operand(Inst, Rn, Address, Decoder))) return MCDisassembler::Fail; if (!Check(S, DecodePostIdxReg(Inst, Rm, Address, Decoder))) return MCDisassembler::Fail; diff --git a/contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp b/contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp index 75ed40c18fa2..bfc32073ba18 100644 --- a/contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp +++ b/contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp @@ -834,7 +834,7 @@ void ARMInstPrinter::printMSRMaskOperand(const MCInst *MI, unsigned OpNum, return; } - O << SYSm; + O << SYSm; return; } diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp index dfa339091a7b..7d04c73fb3f2 100644 --- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp +++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp @@ -64,7 +64,7 @@ bool ARMELFObjectWriter::needsRelocateWithSymbol(const MCSymbol &Sym, } } -// Need to examine the Fixup when determining whether to +// Need to examine the Fixup when determining whether to // emit the relocation as an explicit symbol or as a section relative // offset unsigned ARMELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target, diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp index 0dab789505d5..b37b8073548f 100644 --- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp +++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp @@ -740,7 +740,7 @@ getARMBLTargetOpValue(const MCInst &MI, unsigned OpIdx, const MCOperand MO = MI.getOperand(OpIdx); if (MO.isExpr()) { if (HasConditionalBranch(MI)) - return ::getBranchTargetOpValue(MI, OpIdx, + return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_condbl, Fixups, STI); return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_uncondbl, Fixups, STI); } @@ -766,10 +766,10 @@ uint32_t ARMMCCodeEmitter::getThumbBranchTargetOpValue( const MCSubtargetInfo &STI) const { unsigned Val = 0; const MCOperand MO = MI.getOperand(OpIdx); - + if(MO.isExpr()) return 
::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_t2_uncondbranch, Fixups, STI); - else + else Val = MO.getImm() >> 1; bool I = (Val & 0x800000); diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp index 42371736fef4..63aa9735e8a4 100644 --- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp +++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp @@ -13,6 +13,8 @@ #include "ARMTargetMachine.h" #include "llvm/MC/ConstantPools.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSubtargetInfo.h" @@ -47,6 +49,41 @@ void ARMTargetStreamer::finish() { ConstantPools->emitAll(Streamer); } // reset() - Reset any state void ARMTargetStreamer::reset() {} +void ARMTargetStreamer::emitInst(uint32_t Inst, char Suffix) { + unsigned Size; + char Buffer[4]; + const bool LittleEndian = getStreamer().getContext().getAsmInfo()->isLittleEndian(); + + switch (Suffix) { + case '\0': + Size = 4; + + for (unsigned II = 0, IE = Size; II != IE; II++) { + const unsigned I = LittleEndian ? (Size - II - 1) : II; + Buffer[Size - II - 1] = uint8_t(Inst >> I * CHAR_BIT); + } + + break; + case 'n': + case 'w': + Size = (Suffix == 'n' ? 2 : 4); + + // Thumb wide instructions are emitted as a pair of 16-bit words of the + // appropriate endianness. + for (unsigned II = 0, IE = Size; II != IE; II = II + 2) { + const unsigned I0 = LittleEndian ? II + 0 : II + 1; + const unsigned I1 = LittleEndian ? II + 1 : II + 0; + Buffer[Size - II - 2] = uint8_t(Inst >> I0 * CHAR_BIT); + Buffer[Size - II - 1] = uint8_t(Inst >> I1 * CHAR_BIT); + } + + break; + default: + llvm_unreachable("Invalid Suffix"); + } + getStreamer().EmitBytes(StringRef(Buffer, Size)); +} + // The remaining callbacks should be handled separately by each // streamer. void ARMTargetStreamer::emitFnStart() {} @@ -76,7 +113,6 @@ void ARMTargetStreamer::emitArchExtension(unsigned ArchExt) {} void ARMTargetStreamer::emitObjectArch(ARM::ArchKind Arch) {} void ARMTargetStreamer::emitFPU(unsigned FPU) {} void ARMTargetStreamer::finishAttributeSection() {} -void ARMTargetStreamer::emitInst(uint32_t Inst, char Suffix) {} void ARMTargetStreamer::AnnotateTLSDescriptorSequence(const MCSymbolRefExpr *SRE) {} void ARMTargetStreamer::emitThumbSet(MCSymbol *Symbol, const MCExpr *Value) {} diff --git a/contrib/llvm/lib/Target/ARM/MLxExpansionPass.cpp b/contrib/llvm/lib/Target/ARM/MLxExpansionPass.cpp index 637e4a44c428..7f03e1463c1d 100644 --- a/contrib/llvm/lib/Target/ARM/MLxExpansionPass.cpp +++ b/contrib/llvm/lib/Target/ARM/MLxExpansionPass.cpp @@ -233,7 +233,7 @@ bool MLxExpansion::FindMLxHazard(MachineInstr *MI) { // On Swift, we mostly care about hazards from multiplication instructions // writing the accumulator and the pipelining of loop iterations by out-of- - // order execution. + // order execution. if (isSwift) return isFpMulInstruction(DefMI->getOpcode()) || hasLoopHazard(MI); diff --git a/contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp b/contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp index a65e22fd86e8..5c745e112b2e 100644 --- a/contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp +++ b/contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp @@ -127,7 +127,7 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF, // Debug location must be unknown since the first debug location is used // to determine the end of the prologue. 
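The byte shuffling in the new ARMTargetStreamer::emitInst above encodes two distinct rules: an ARM instruction is emitted as a single 32-bit word in the target endianness, while a wide Thumb instruction is emitted as two 16-bit words, most-significant halfword first, each halfword in the target endianness. A standalone reimplementation with a couple of checked examples (buffer handling simplified):

    #include <cstdint>
    #include <cstdio>

    // Emit an instruction the way the streamer does: ARM as one 32-bit word
    // in target endianness; wide Thumb as two 16-bit words, most significant
    // halfword first, each halfword in target endianness.
    void emitInst(uint32_t Inst, bool IsThumbWide, bool LittleEndian,
                  uint8_t Out[4]) {
      auto byte = [&](unsigned I) { return uint8_t(Inst >> (I * 8)); };
      if (!IsThumbWide) {
        for (unsigned I = 0; I != 4; ++I)
          Out[I] = LittleEndian ? byte(I) : byte(3 - I);
      } else {
        // Halfword order is fixed (bits 31:16 first); only the bytes inside
        // each halfword swap with endianness.
        unsigned HW[2] = {2, 0}; // byte index of each halfword's low byte
        for (unsigned W = 0; W != 2; ++W) {
          Out[2 * W + 0] = LittleEndian ? byte(HW[W]) : byte(HW[W] + 1);
          Out[2 * W + 1] = LittleEndian ? byte(HW[W] + 1) : byte(HW[W]);
        }
      }
    }

    int main() {
      uint8_t B[4];
      emitInst(0xe1a00000, false, true, B); // ARM 'nop', little-endian
      std::printf("%02x %02x %02x %02x\n", B[0], B[1], B[2], B[3]); // 00 00 a0 e1
      emitInst(0xf3af8000, true, true, B);  // Thumb2 'nop.w', little-endian
      std::printf("%02x %02x %02x %02x\n", B[0], B[1], B[2], B[3]); // af f3 00 80
      return 0;
    }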
DebugLoc dl; - + unsigned FramePtr = RegInfo->getFrameRegister(MF); unsigned BasePtr = RegInfo->getBaseRegister(); int CFAOffset = 0; diff --git a/contrib/llvm/lib/Target/AVR/AVRISelLowering.cpp b/contrib/llvm/lib/Target/AVR/AVRISelLowering.cpp index c1515571aae5..1b412a9c6813 100644 --- a/contrib/llvm/lib/Target/AVR/AVRISelLowering.cpp +++ b/contrib/llvm/lib/Target/AVR/AVRISelLowering.cpp @@ -63,6 +63,13 @@ AVRTargetLowering::AVRTargetLowering(AVRTargetMachine &tm) setTruncStoreAction(MVT::i16, MVT::i8, Expand); + for (MVT VT : MVT::integer_valuetypes()) { + setOperationAction(ISD::ADDC, VT, Legal); + setOperationAction(ISD::SUBC, VT, Legal); + setOperationAction(ISD::ADDE, VT, Legal); + setOperationAction(ISD::SUBE, VT, Legal); + } + // sub (x, imm) gets canonicalized to add (x, -imm), so for illegal types // revert into a sub since we don't have an add with immediate instruction. setOperationAction(ISD::ADD, MVT::i32, Custom); diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp index 4791b067aa8d..ba255d30fede 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp @@ -1777,6 +1777,7 @@ namespace { const BitTracker::RegisterCell &RC); bool simplifyExtractLow(MachineInstr *MI, BitTracker::RegisterRef RD, const BitTracker::RegisterCell &RC, const RegisterSet &AVs); + bool simplifyRCmp0(MachineInstr *MI, BitTracker::RegisterRef RD); // Cache of created instructions to avoid creating duplicates. // XXX Currently only used by genBitSplit. @@ -2567,6 +2568,127 @@ bool BitSimplification::simplifyExtractLow(MachineInstr *MI, return Changed; } +bool BitSimplification::simplifyRCmp0(MachineInstr *MI, + BitTracker::RegisterRef RD) { + unsigned Opc = MI->getOpcode(); + if (Opc != Hexagon::A4_rcmpeqi && Opc != Hexagon::A4_rcmpneqi) + return false; + MachineOperand &CmpOp = MI->getOperand(2); + if (!CmpOp.isImm() || CmpOp.getImm() != 0) + return false; + + const TargetRegisterClass *FRC = HBS::getFinalVRegClass(RD, MRI); + if (FRC != &Hexagon::IntRegsRegClass && FRC != &Hexagon::DoubleRegsRegClass) + return false; + assert(RD.Sub == 0); + + MachineBasicBlock &B = *MI->getParent(); + const DebugLoc &DL = MI->getDebugLoc(); + auto At = MI->isPHI() ? 
B.getFirstNonPHI() + : MachineBasicBlock::iterator(MI); + bool KnownZ = true; + bool KnownNZ = false; + + BitTracker::RegisterRef SR = MI->getOperand(1); + if (!BT.has(SR.Reg)) + return false; + const BitTracker::RegisterCell &SC = BT.lookup(SR.Reg); + unsigned F, W; + if (!HBS::getSubregMask(SR, F, W, MRI)) + return false; + + for (uint16_t I = F; I != F+W; ++I) { + const BitTracker::BitValue &V = SC[I]; + if (!V.is(0)) + KnownZ = false; + if (V.is(1)) + KnownNZ = true; + } + + auto ReplaceWithConst = [&] (int C) { + unsigned NewR = MRI.createVirtualRegister(FRC); + BuildMI(B, At, DL, HII.get(Hexagon::A2_tfrsi), NewR) + .addImm(C); + HBS::replaceReg(RD.Reg, NewR, MRI); + BitTracker::RegisterCell NewRC(W); + for (uint16_t I = 0; I != W; ++I) { + NewRC[I] = BitTracker::BitValue(C & 1); + C = unsigned(C) >> 1; + } + BT.put(BitTracker::RegisterRef(NewR), NewRC); + return true; + }; + + auto IsNonZero = [] (const MachineOperand &Op) { + if (Op.isGlobal() || Op.isBlockAddress()) + return true; + if (Op.isImm()) + return Op.getImm() != 0; + if (Op.isCImm()) + return !Op.getCImm()->isZero(); + if (Op.isFPImm()) + return !Op.getFPImm()->isZero(); + return false; + }; + + auto IsZero = [] (const MachineOperand &Op) { + if (Op.isGlobal() || Op.isBlockAddress()) + return false; + if (Op.isImm()) + return Op.getImm() == 0; + if (Op.isCImm()) + return Op.getCImm()->isZero(); + if (Op.isFPImm()) + return Op.getFPImm()->isZero(); + return false; + }; + + // If the source register is known to be 0 or non-0, the comparison can + // be folded to a load of a constant. + if (KnownZ || KnownNZ) { + assert(KnownZ != KnownNZ && "Register cannot be both 0 and non-0"); + return ReplaceWithConst(KnownZ == (Opc == Hexagon::A4_rcmpeqi)); + } + + // Special case: if the compare comes from a C2_muxii, then we know the + // two possible constants that can be the source value. + MachineInstr *InpDef = MRI.getVRegDef(SR.Reg); + if (!InpDef) + return false; + if (SR.Sub == 0 && InpDef->getOpcode() == Hexagon::C2_muxii) { + MachineOperand &Src1 = InpDef->getOperand(2); + MachineOperand &Src2 = InpDef->getOperand(3); + // Check if both are non-zero. + bool KnownNZ1 = IsNonZero(Src1), KnownNZ2 = IsNonZero(Src2); + if (KnownNZ1 && KnownNZ2) + return ReplaceWithConst(Opc == Hexagon::A4_rcmpneqi); + // Check if both are zero. + bool KnownZ1 = IsZero(Src1), KnownZ2 = IsZero(Src2); + if (KnownZ1 && KnownZ2) + return ReplaceWithConst(Opc == Hexagon::A4_rcmpeqi); + + // If for both operands we know that they are either 0 or non-0, + // replace the comparison with a C2_muxii, using the same predicate + // register, but with operands substituted with 0/1 accordingly. + if ((KnownZ1 || KnownNZ1) && (KnownZ2 || KnownNZ2)) { + unsigned NewR = MRI.createVirtualRegister(FRC); + BuildMI(B, At, DL, HII.get(Hexagon::C2_muxii), NewR) + .addReg(InpDef->getOperand(1).getReg()) + .addImm(KnownZ1 == (Opc == Hexagon::A4_rcmpeqi)) + .addImm(KnownZ2 == (Opc == Hexagon::A4_rcmpeqi)); + HBS::replaceReg(RD.Reg, NewR, MRI); + // Create a new cell with only the least significant bit unknown. 
+ BitTracker::RegisterCell NewRC(W); + NewRC[0] = BitTracker::BitValue::self(); + NewRC.fill(1, W, BitTracker::BitValue::Zero); + BT.put(BitTracker::RegisterRef(NewR), NewRC); + return true; + } + } + + return false; +} + bool BitSimplification::processBlock(MachineBasicBlock &B, const RegisterSet &AVs) { if (!BT.reached(&B)) @@ -2615,6 +2737,7 @@ bool BitSimplification::processBlock(MachineBasicBlock &B, T = T || genExtractHalf(MI, RD, RC); T = T || genCombineHalf(MI, RD, RC); T = T || genExtractLow(MI, RD, RC); + T = T || simplifyRCmp0(MI, RD); Changed |= T; continue; } diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonBitTracker.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonBitTracker.cpp index e13cfd3f655a..94aacbed6af6 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonBitTracker.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonBitTracker.cpp @@ -347,9 +347,11 @@ bool HexagonEvaluator::evaluate(const MachineInstr &MI, return rr0(RC, Outputs); } case C2_tfrrp: { - RegisterCell RC = RegisterCell::self(Reg[0].Reg, W0); - W0 = 8; // XXX Pred size - return rr0(eINS(RC, eXTR(rc(1), 0, W0), 0), Outputs); + uint16_t RW = W0; + uint16_t PW = 8; // XXX Pred size: getRegBitWidth(Reg[1]); + RegisterCell RC = RegisterCell::self(Reg[0].Reg, RW); + RC.fill(PW, RW, BT::BitValue::Zero); + return rr0(eINS(RC, eXTR(rc(1), 0, PW), 0), Outputs); } // Arithmetic: @@ -950,6 +952,19 @@ bool HexagonEvaluator::evaluate(const MachineInstr &MI, } default: + // For instructions that define a single predicate register, store + the low 8 bits of the register only. + if (unsigned DefR = getUniqueDefVReg(MI)) { + if (MRI.getRegClass(DefR) == &Hexagon::PredRegsRegClass) { + BT::RegisterRef PD(DefR, 0); + uint16_t RW = getRegBitWidth(PD); + uint16_t PW = 8; // XXX Pred size: getRegBitWidth(Reg[1]); + RegisterCell RC = RegisterCell::self(DefR, RW); + RC.fill(PW, RW, BT::BitValue::Zero); + putCell(PD, RC, Outputs); + return true; + } + } return MachineEvaluator::evaluate(MI, Inputs, Outputs); } #undef im @@ -1016,6 +1031,21 @@ bool HexagonEvaluator::evaluate(const MachineInstr &BI, return true; } +unsigned HexagonEvaluator::getUniqueDefVReg(const MachineInstr &MI) const { + unsigned DefReg = 0; + for (const MachineOperand &Op : MI.operands()) { + if (!Op.isReg() || !Op.isDef()) + continue; + unsigned R = Op.getReg(); + if (!TargetRegisterInfo::isVirtualRegister(R)) + continue; + if (DefReg != 0) + return 0; + DefReg = R; + } + return DefReg; +} + bool HexagonEvaluator::evaluateLoad(const MachineInstr &MI, const CellMapType &Inputs, CellMapType &Outputs) const { diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonBitTracker.h b/contrib/llvm/lib/Target/Hexagon/HexagonBitTracker.h index d9dd04e1b088..f0b7c9d91950 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonBitTracker.h +++ b/contrib/llvm/lib/Target/Hexagon/HexagonBitTracker.h @@ -49,6 +49,7 @@ struct HexagonEvaluator : public BitTracker::MachineEvaluator { const HexagonInstrInfo &TII; private: + unsigned getUniqueDefVReg(const MachineInstr &MI) const; bool evaluateLoad(const MachineInstr &MI, const CellMapType &Inputs, CellMapType &Outputs) const; bool evaluateFormalCopy(const MachineInstr &MI, const CellMapType &Inputs, diff --git a/contrib/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.h b/contrib/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.h index 183dee36a047..de486ec4b7bd 100644 --- a/contrib/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.h +++ b/contrib/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.h @@ -2,7 +2,7 @@ // // The LLVM
Compiler Infrastructure // -// This file is distributed under the University of Illinois Open Source +// This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// diff --git a/contrib/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/contrib/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp index 2acf701b43cb..ce7db657f5e9 100644 --- a/contrib/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp +++ b/contrib/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp @@ -7371,7 +7371,7 @@ bool MipsAsmParser::parseDirectiveGpWord() { getParser().getStreamer().EmitGPRel32Value(Value); if (getLexer().isNot(AsmToken::EndOfStatement)) - return Error(getLexer().getLoc(), + return Error(getLexer().getLoc(), "unexpected token, expected end of statement"); Parser.Lex(); // Eat EndOfStatement token. return false; @@ -7506,7 +7506,7 @@ bool MipsAsmParser::parseDirectiveOption() { } // Unknown option. - Warning(Parser.getTok().getLoc(), + Warning(Parser.getTok().getLoc(), "unknown option, expected 'pic0' or 'pic2'"); Parser.eatToEndOfStatement(); return false; @@ -8193,7 +8193,7 @@ bool MipsAsmParser::ParseDirective(AsmToken DirectiveID) { if (IDVal == ".abicalls") { getTargetStreamer().emitDirectiveAbiCalls(); if (Parser.getTok().isNot(AsmToken::EndOfStatement)) { - Error(Parser.getTok().getLoc(), + Error(Parser.getTok().getLoc(), "unexpected token, expected end of statement"); } return false; diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h index fdb560f3c72f..d7f6cf91db73 100644 --- a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h +++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h @@ -114,7 +114,7 @@ namespace Mips { // resulting in - R_MIPS_GOT_DISP fixup_Mips_GOT_DISP, - // resulting in - R_MIPS_HIGHER/R_MICROMIPS_HIGHER + // resulting in - R_MIPS_HIGHER/R_MICROMIPS_HIGHER fixup_Mips_HIGHER, fixup_MICROMIPS_HIGHER, diff --git a/contrib/llvm/lib/Target/Mips/MipsAsmPrinter.cpp b/contrib/llvm/lib/Target/Mips/MipsAsmPrinter.cpp index 8ffc0731abcb..2e0c25de2bc8 100644 --- a/contrib/llvm/lib/Target/Mips/MipsAsmPrinter.cpp +++ b/contrib/llvm/lib/Target/Mips/MipsAsmPrinter.cpp @@ -1094,7 +1094,7 @@ void MipsAsmPrinter::EmitSled(const MachineInstr &MI, SledKind Kind) { // ALIGN // B .tmpN // 11 NOP instructions (44 bytes) - // ADDIU T9, T9, 52 + // ADDIU T9, T9, 52 // .tmpN // // We need the 44 bytes (11 instructions) because at runtime, we'd diff --git a/contrib/llvm/lib/Target/Mips/MipsCallLowering.cpp b/contrib/llvm/lib/Target/Mips/MipsCallLowering.cpp index e82f62260b3f..a705ebb6b193 100644 --- a/contrib/llvm/lib/Target/Mips/MipsCallLowering.cpp +++ b/contrib/llvm/lib/Target/Mips/MipsCallLowering.cpp @@ -418,7 +418,8 @@ void MipsCallLowering::subTargetRegTypeForCallingConv( for (auto &Arg : Args) { EVT VT = TLI.getValueType(DL, Arg.Ty); - MVT RegisterVT = TLI.getRegisterTypeForCallingConv(F.getContext(), VT); + MVT RegisterVT = TLI.getRegisterTypeForCallingConv(F.getContext(), + F.getCallingConv(), VT); ISD::ArgFlagsTy Flags = Arg.Flags; Flags.setOrigAlign(TLI.getABIAlignmentForCallingConv(Arg.Ty, DL)); diff --git a/contrib/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp b/contrib/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp index 9eb13a68e561..744523cc6cb9 100644 --- a/contrib/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp +++ 
b/contrib/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// // // This pass is used to make Pc relative loads of constants. -// For now, only Mips16 will use this. +// For now, only Mips16 will use this. // // Loading constants inline is expensive on Mips16 and it's in general better // to place the constant nearby in code space and then it can be loaded with a @@ -1171,7 +1171,7 @@ static inline unsigned getUnconditionalBrDisp(int Opc) { /// findAvailableWater - Look for an existing entry in the WaterList in which /// we can place the CPE referenced from U so it's within range of U's MI. /// Returns true if found, false if not. If it returns true, WaterIter -/// is set to the WaterList entry. +/// is set to the WaterList entry. /// To ensure that this pass /// terminates, the CPE location for a particular CPUser is only allowed to /// move to a lower address, so search backward from the end of the list and @@ -1231,7 +1231,7 @@ void MipsConstantIslands::createNewWater(unsigned CPUserIndex, const BasicBlockInfo &UserBBI = BBInfo[UserMBB->getNumber()]; // If the block does not end in an unconditional branch already, and if the - // end of the block is within range, make new water there. + // end of the block is within range, make new water there. if (BBHasFallthrough(UserMBB)) { // Size of branch to insert. unsigned Delta = 2; @@ -1258,7 +1258,7 @@ void MipsConstantIslands::createNewWater(unsigned CPUserIndex, } } - // What a big block. Find a place within the block to split it. + // What a big block. Find a place within the block to split it. // Try to split the block so it's fully aligned. Compute the latest split // point where we can add a 4-byte branch instruction, and then align to @@ -1582,7 +1582,7 @@ MipsConstantIslands::fixupConditionalBr(ImmBranch &Br) { MachineInstr *BMI = &MBB->back(); bool NeedSplit = (BMI != MI) || !BBHasFallthrough(MBB); unsigned OppositeBranchOpcode = TII->getOppositeBranchOpc(Opcode); - + ++NumCBrFixed; if (BMI != MI) { if (std::next(MachineBasicBlock::iterator(MI)) == std::prev(MBB->end()) && @@ -1595,7 +1595,7 @@ MipsConstantIslands::fixupConditionalBr(ImmBranch &Br) { // bnez L2 // b L1 unsigned BMITargetOperand = branchTargetOperand(BMI); - MachineBasicBlock *NewDest = + MachineBasicBlock *NewDest = BMI->getOperand(BMITargetOperand).getMBB(); if (isBBInRange(MI, NewDest, Br.MaxDisp)) { LLVM_DEBUG( diff --git a/contrib/llvm/lib/Target/Mips/MipsFastISel.cpp b/contrib/llvm/lib/Target/Mips/MipsFastISel.cpp index 7b39507812ed..19b30a44e86a 100644 --- a/contrib/llvm/lib/Target/Mips/MipsFastISel.cpp +++ b/contrib/llvm/lib/Target/Mips/MipsFastISel.cpp @@ -1662,7 +1662,7 @@ bool MipsFastISel::selectRet(const Instruction *I) { return false; SmallVector<ISD::OutputArg, 4> Outs; - GetReturnInfo(F.getReturnType(), F.getAttributes(), Outs, TLI, DL); + GetReturnInfo(CC, F.getReturnType(), F.getAttributes(), Outs, TLI, DL); // Analyze operands of the call, assigning locations to each operand. SmallVector<CCValAssign, 16> ValLocs; diff --git a/contrib/llvm/lib/Target/Mips/MipsISelLowering.cpp b/contrib/llvm/lib/Target/Mips/MipsISelLowering.cpp index 9ffc38356b76..0677d378a115 100644 --- a/contrib/llvm/lib/Target/Mips/MipsISelLowering.cpp +++ b/contrib/llvm/lib/Target/Mips/MipsISelLowering.cpp @@ -111,6 +111,7 @@ static bool isShiftedMask(uint64_t I, uint64_t &Pos, uint64_t &Size) { // The MIPS MSA ABI passes vector arguments in the integer register set. 
// The number of integer registers used is dependent on the ABI used. MVT MipsTargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context, + CallingConv::ID CC, EVT VT) const { if (VT.isVector()) { if (Subtarget.isABI_O32()) { @@ -123,6 +124,7 @@ MVT MipsTargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context, } unsigned MipsTargetLowering::getNumRegistersForCallingConv(LLVMContext &Context, + CallingConv::ID CC, EVT VT) const { if (VT.isVector()) return std::max((VT.getSizeInBits() / (Subtarget.isABI_O32() ? 32 : 64)), @@ -131,10 +133,10 @@ unsigned MipsTargetLowering::getNumRegistersForCallingConv(LLVMContext &Context, } unsigned MipsTargetLowering::getVectorTypeBreakdownForCallingConv( - LLVMContext &Context, EVT VT, EVT &IntermediateVT, + LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const { // Break down vector types to either 2 i64s or 4 i32s. - RegisterVT = getRegisterTypeForCallingConv(Context, VT) ; + RegisterVT = getRegisterTypeForCallingConv(Context, CC, VT); IntermediateVT = RegisterVT; NumIntermediates = VT.getSizeInBits() < RegisterVT.getSizeInBits() ? VT.getVectorNumElements() diff --git a/contrib/llvm/lib/Target/Mips/MipsISelLowering.h b/contrib/llvm/lib/Target/Mips/MipsISelLowering.h index b58d92c370d8..5a0de45c44f3 100644 --- a/contrib/llvm/lib/Target/Mips/MipsISelLowering.h +++ b/contrib/llvm/lib/Target/Mips/MipsISelLowering.h @@ -288,17 +288,18 @@ class TargetRegisterClass; /// Return the register type for a given MVT, ensuring vectors are treated /// as a series of gpr sized integers. - MVT getRegisterTypeForCallingConv(LLVMContext &Context, + MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override; /// Return the number of registers for a given MVT, ensuring vectors are /// treated as a series of gpr sized integers. unsigned getNumRegistersForCallingConv(LLVMContext &Context, + CallingConv::ID CC, EVT VT) const override; /// Break down vectors to the correct number of gpr sized integers. unsigned getVectorTypeBreakdownForCallingConv( - LLVMContext &Context, EVT VT, EVT &IntermediateVT, + LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override; /// Return the correct alignment for the current calling convention.
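The breakdown above is plain integer arithmetic: a vector argument is split across general-purpose registers of the ABI's register width. A minimal standalone sketch of that arithmetic, with an illustrative function name that is not part of the patch:

#include <algorithm>

// Number of GPRs needed to pass a vector of VectorBits bits: 32-bit GPRs
// under O32, 64-bit GPRs otherwise, and never fewer than one register.
// Mirrors the getNumRegistersForCallingConv logic shown above.
unsigned numGPRsForVector(unsigned VectorBits, bool IsO32) {
  return std::max(VectorBits / (IsO32 ? 32u : 64u), 1u);
}

// e.g. a 128-bit MSA vector: numGPRsForVector(128, true) == 4 (4 x i32),
// while numGPRsForVector(128, false) == 2 (2 x i64).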
diff --git a/contrib/llvm/lib/Target/Mips/MipsInstructionSelector.cpp b/contrib/llvm/lib/Target/Mips/MipsInstructionSelector.cpp index af0ac006bc9e..6c5b83021f74 100644 --- a/contrib/llvm/lib/Target/Mips/MipsInstructionSelector.cpp +++ b/contrib/llvm/lib/Target/Mips/MipsInstructionSelector.cpp @@ -166,6 +166,33 @@ bool MipsInstructionSelector::select(MachineInstr &I, I.eraseFromParent(); return true; } + case G_GLOBAL_VALUE: { + if (MF.getTarget().isPositionIndependent()) + return false; + + const llvm::GlobalValue *GVal = I.getOperand(1).getGlobal(); + unsigned LUiReg = MRI.createVirtualRegister(&Mips::GPR32RegClass); + MachineInstr *LUi, *ADDiu; + + LUi = BuildMI(MBB, I, I.getDebugLoc(), TII.get(Mips::LUi)) + .addDef(LUiReg) + .addGlobalAddress(GVal); + LUi->getOperand(1).setTargetFlags(MipsII::MO_ABS_HI); + + ADDiu = BuildMI(MBB, I, I.getDebugLoc(), TII.get(Mips::ADDiu)) + .addDef(I.getOperand(0).getReg()) + .addUse(LUiReg) + .addGlobalAddress(GVal); + ADDiu->getOperand(2).setTargetFlags(MipsII::MO_ABS_LO); + + if (!constrainSelectedInstRegOperands(*LUi, TII, TRI, RBI)) + return false; + if (!constrainSelectedInstRegOperands(*ADDiu, TII, TRI, RBI)) + return false; + + I.eraseFromParent(); + return true; + } default: return false; diff --git a/contrib/llvm/lib/Target/Mips/MipsLegalizerInfo.cpp b/contrib/llvm/lib/Target/Mips/MipsLegalizerInfo.cpp index da6f9dabdaaf..fb259516be09 100644 --- a/contrib/llvm/lib/Target/Mips/MipsLegalizerInfo.cpp +++ b/contrib/llvm/lib/Target/Mips/MipsLegalizerInfo.cpp @@ -36,6 +36,9 @@ MipsLegalizerInfo::MipsLegalizerInfo(const MipsSubtarget &ST) { getActionDefinitionsBuilder(G_FRAME_INDEX) .legalFor({p0}); + getActionDefinitionsBuilder(G_GLOBAL_VALUE) + .legalFor({p0}); + computeTables(); verify(*ST.getInstrInfo()); } diff --git a/contrib/llvm/lib/Target/Mips/MipsRegisterBankInfo.cpp b/contrib/llvm/lib/Target/Mips/MipsRegisterBankInfo.cpp index cef21f447205..351135079217 100644 --- a/contrib/llvm/lib/Target/Mips/MipsRegisterBankInfo.cpp +++ b/contrib/llvm/lib/Target/Mips/MipsRegisterBankInfo.cpp @@ -88,6 +88,7 @@ MipsRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { break; case G_CONSTANT: case G_FRAME_INDEX: + case G_GLOBAL_VALUE: OperandsMapping = getOperandsMapping({&Mips::ValueMappings[Mips::GPRIdx], nullptr}); break; diff --git a/contrib/llvm/lib/Target/Mips/MipsSubtarget.h b/contrib/llvm/lib/Target/Mips/MipsSubtarget.h index 676d702ba63e..896dd0eb0a5e 100644 --- a/contrib/llvm/lib/Target/Mips/MipsSubtarget.h +++ b/contrib/llvm/lib/Target/Mips/MipsSubtarget.h @@ -163,7 +163,7 @@ class MipsSubtarget : public MipsGenSubtargetInfo { // HasEVA -- supports EVA ASE. bool HasEVA; - + // nomadd4 - disables generation of 4-operand madd.s, madd.d and // related instructions. bool DisableMadd4; diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h b/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h index 3b042c74b26c..efe98003b1c8 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h @@ -248,7 +248,7 @@ protected: private: bool GlobalsEmitted; - + // This is specific per MachineFunction. 
const MachineRegisterInfo *MRI; // The contents are specific for each diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXImageOptimizer.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXImageOptimizer.cpp index f12ed81b6d9f..ad1d7cbb52fc 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXImageOptimizer.cpp +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXImageOptimizer.cpp @@ -2,7 +2,7 @@ // // The LLVM Compiler Infrastructure // -// This file is distributed under the University of Illinois Open Source +// This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXMachineFunctionInfo.h b/contrib/llvm/lib/Target/NVPTX/NVPTXMachineFunctionInfo.h index 10f1135ad841..5a9115f6f7f1 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXMachineFunctionInfo.h +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXMachineFunctionInfo.h @@ -2,7 +2,7 @@ // // The LLVM Compiler Infrastructure // -// This file is distributed under the University of Illinois Open Source +// This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// diff --git a/contrib/llvm/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp b/contrib/llvm/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp index ea709a73ebf2..fd7f81591426 100644 --- a/contrib/llvm/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp +++ b/contrib/llvm/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp @@ -175,7 +175,7 @@ void PPCInstPrinter::printInst(const MCInst *MI, raw_ostream &O, void PPCInstPrinter::printPredicateOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O, + raw_ostream &O, const char *Modifier) { unsigned Code = MI->getOperand(OpNo).getImm(); diff --git a/contrib/llvm/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h b/contrib/llvm/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h index f000fbb98110..351ccefa2da2 100644 --- a/contrib/llvm/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h +++ b/contrib/llvm/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h @@ -35,11 +35,11 @@ public: void printRegName(raw_ostream &OS, unsigned RegNo) const override; void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot, const MCSubtargetInfo &STI) override; - + // Autogenerated by tblgen. void printInstruction(const MCInst *MI, raw_ostream &O); static const char *getRegisterName(unsigned RegNo); - + bool printAliasInstr(const MCInst *MI, raw_ostream &OS); void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx, unsigned PrintMethodIdx, diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp index 8ac461b96b88..fb7bf23509c7 100644 --- a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp +++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp @@ -61,7 +61,7 @@ PPCELFMCAsmInfo::PPCELFMCAsmInfo(bool is64Bit, const Triple& T) { CommentString = "#"; // Uses '.section' before '.bss' directive - UsesELFSectionDirectiveForBSS = true; + UsesELFSectionDirectiveForBSS = true; // Debug Information SupportsDebugInformation = true; @@ -73,7 +73,7 @@ PPCELFMCAsmInfo::PPCELFMCAsmInfo(bool is64Bit, const Triple& T) { // Exceptions handling ExceptionsType = ExceptionHandling::DwarfCFI; - + ZeroDirective = "\t.space\t"; Data64bitsDirective = is64Bit ? 
"\t.quad\t" : nullptr; AssemblerDialect = 1; // New-Style mnemonics. diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp index 2b948ca60028..57bda1403c62 100644 --- a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp +++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp @@ -102,7 +102,7 @@ public: unsigned getMachineOpValue(const MCInst &MI,const MCOperand &MO, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const; - + // getBinaryCodeForInstr - TableGen'erated function for getting the // binary encoding for an instruction. uint64_t getBinaryCodeForInstr(const MCInst &MI, @@ -138,7 +138,7 @@ public: default: llvm_unreachable("Invalid instruction size"); } - + ++MCNumEmitted; // Keep track of the # of mi's emitted. } @@ -147,7 +147,7 @@ private: void verifyInstructionPredicates(const MCInst &MI, uint64_t AvailableFeatures) const; }; - + } // end anonymous namespace MCCodeEmitter *llvm::createPPCMCCodeEmitter(const MCInstrInfo &MCII, @@ -162,7 +162,7 @@ getDirectBrEncoding(const MCInst &MI, unsigned OpNo, const MCSubtargetInfo &STI) const { const MCOperand &MO = MI.getOperand(OpNo); if (MO.isReg() || MO.isImm()) return getMachineOpValue(MI, MO, Fixups, STI); - + // Add a fixup for the branch target. Fixups.push_back(MCFixup::create(0, MO.getExpr(), (MCFixupKind)PPC::fixup_ppc_br24)); @@ -212,7 +212,7 @@ unsigned PPCMCCodeEmitter::getImm16Encoding(const MCInst &MI, unsigned OpNo, const MCSubtargetInfo &STI) const { const MCOperand &MO = MI.getOperand(OpNo); if (MO.isReg() || MO.isImm()) return getMachineOpValue(MI, MO, Fixups, STI); - + // Add a fixup for the immediate field. Fixups.push_back(MCFixup::create(IsLittleEndian? 0 : 2, MO.getExpr(), (MCFixupKind)PPC::fixup_ppc_half16)); @@ -226,11 +226,11 @@ unsigned PPCMCCodeEmitter::getMemRIEncoding(const MCInst &MI, unsigned OpNo, // displacement and the next 5 bits as the register #. assert(MI.getOperand(OpNo+1).isReg()); unsigned RegBits = getMachineOpValue(MI, MI.getOperand(OpNo+1), Fixups, STI) << 16; - + const MCOperand &MO = MI.getOperand(OpNo); if (MO.isImm()) return (getMachineOpValue(MI, MO, Fixups, STI) & 0xFFFF) | RegBits; - + // Add a fixup for the displacement field. Fixups.push_back(MCFixup::create(IsLittleEndian? 0 : 2, MO.getExpr(), (MCFixupKind)PPC::fixup_ppc_half16)); @@ -244,11 +244,11 @@ unsigned PPCMCCodeEmitter::getMemRIXEncoding(const MCInst &MI, unsigned OpNo, // displacement and the next 5 bits as the register #. assert(MI.getOperand(OpNo+1).isReg()); unsigned RegBits = getMachineOpValue(MI, MI.getOperand(OpNo+1), Fixups, STI) << 14; - + const MCOperand &MO = MI.getOperand(OpNo); if (MO.isImm()) return ((getMachineOpValue(MI, MO, Fixups, STI) >> 2) & 0x3FFF) | RegBits; - + // Add a fixup for the displacement field. Fixups.push_back(MCFixup::create(IsLittleEndian? 0 : 2, MO.getExpr(), (MCFixupKind)PPC::fixup_ppc_half16ds)); @@ -320,7 +320,7 @@ unsigned PPCMCCodeEmitter::getTLSRegEncoding(const MCInst &MI, unsigned OpNo, const MCSubtargetInfo &STI) const { const MCOperand &MO = MI.getOperand(OpNo); if (MO.isReg()) return getMachineOpValue(MI, MO, Fixups, STI); - + // Add a fixup for the TLS register, which simply provides a relocation // hint to the linker that this statement is part of a relocation sequence. // Return the thread-pointer register's encoding. 
@@ -373,7 +373,7 @@ getMachineOpValue(const MCInst &MI, const MCOperand &MO, return Encode; } - + assert(MO.isImm() && "Relocation required in an instruction that we cannot encode!"); return MO.getImm(); diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h index fe7e7aeeb182..481ba3f09cc7 100644 --- a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h +++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h @@ -58,7 +58,7 @@ namespace PPC { PRED_BIT_SET = 1024, PRED_BIT_UNSET = 1025 }; - + // Bit for branch taken (plus) or not-taken (minus) hint enum BranchHintBit { BR_NO_HINT = 0x0, diff --git a/contrib/llvm/lib/Target/PowerPC/PPC.h b/contrib/llvm/lib/Target/PowerPC/PPC.h index dfdec246e868..bfc613af3dc0 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPC.h +++ b/contrib/llvm/lib/Target/PowerPC/PPC.h @@ -66,7 +66,7 @@ namespace llvm { extern char &PPCVSXFMAMutateID; namespace PPCII { - + /// Target Operand Flag enum. enum TOF { //===------------------------------------------------------------------===// @@ -111,7 +111,7 @@ namespace llvm { MO_TLS = 8 << 4 }; } // end namespace PPCII - + } // end namespace llvm; #endif diff --git a/contrib/llvm/lib/Target/PowerPC/PPCBranchSelector.cpp b/contrib/llvm/lib/Target/PowerPC/PPCBranchSelector.cpp index 64b8f1168beb..0d1bb9297bcb 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCBranchSelector.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCBranchSelector.cpp @@ -130,7 +130,7 @@ bool PPCBSel::runOnMachineFunction(MachineFunction &Fn) { BlockSizes[MBB->getNumber()].first = BlockSize; FuncSize += BlockSize; } - + // If the entire function is smaller than the displacement of a branch field, // we know we don't need to shrink any branches in this function. This is a // common case. @@ -138,7 +138,7 @@ bool PPCBSel::runOnMachineFunction(MachineFunction &Fn) { BlockSizes.clear(); return false; } - + // For each conditional branch, if the offset to its destination is larger // than the offset field allows, transform it into a long branch sequence // like this: @@ -153,7 +153,7 @@ bool PPCBSel::runOnMachineFunction(MachineFunction &Fn) { while (MadeChange) { // Iteratively expand branches until we reach a fixed point. MadeChange = false; - + for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E; ++MFI) { MachineBasicBlock &MBB = *MFI; @@ -175,7 +175,7 @@ bool PPCBSel::runOnMachineFunction(MachineFunction &Fn) { MBBStartOffset += TII->getInstSizeInBytes(*I); continue; } - + // Determine the offset from the current branch to the destination // block. int BranchSize; @@ -184,7 +184,7 @@ bool PPCBSel::runOnMachineFunction(MachineFunction &Fn) { // start of this block to this branch, plus the sizes of all blocks // from this block to the dest. BranchSize = MBBStartOffset; - + for (unsigned i = Dest->getNumber(), e = MBB.getNumber(); i != e; ++i) BranchSize += BlockSizes[i].first; } else { @@ -213,7 +213,7 @@ bool PPCBSel::runOnMachineFunction(MachineFunction &Fn) { // 2. Target MBB PPC::Predicate Pred = (PPC::Predicate)I->getOperand(0).getImm(); unsigned CRReg = I->getOperand(1).getReg(); - + // Jump over the uncond branch inst (i.e. $PC+8) on opposite condition. BuildMI(MBB, I, dl, TII->get(PPC::BCC)) .addImm(PPC::InvertPredicate(Pred)).addReg(CRReg).addImm(2); @@ -234,7 +234,7 @@ bool PPCBSel::runOnMachineFunction(MachineFunction &Fn) { } else { llvm_unreachable("Unhandled branch type!"); } - + // Uncond branch to the real destination. 
I = BuildMI(MBB, I, dl, TII->get(PPC::B)).addMBB(Dest); @@ -277,7 +277,7 @@ bool PPCBSel::runOnMachineFunction(MachineFunction &Fn) { EverMadeChange |= MadeChange; } - + BlockSizes.clear(); return true; } diff --git a/contrib/llvm/lib/Target/PowerPC/PPCEarlyReturn.cpp b/contrib/llvm/lib/Target/PowerPC/PPCEarlyReturn.cpp index ed5e496b32fd..ac931f7d0ec0 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCEarlyReturn.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCEarlyReturn.cpp @@ -73,7 +73,7 @@ protected: if ((*PI)->empty()) continue; - + for (MachineBasicBlock::iterator J = (*PI)->getLastNonDebugInstr();;) { if (J == (*PI)->end()) break; diff --git a/contrib/llvm/lib/Target/PowerPC/PPCFastISel.cpp b/contrib/llvm/lib/Target/PowerPC/PPCFastISel.cpp index b00655b50229..f212894035db 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCFastISel.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCFastISel.cpp @@ -1697,7 +1697,7 @@ bool PPCFastISel::SelectRet(const Instruction *I) { if (Ret->getNumOperands() > 0) { SmallVector<ISD::OutputArg, 4> Outs; - GetReturnInfo(F.getReturnType(), F.getAttributes(), Outs, TLI, DL); + GetReturnInfo(CC, F.getReturnType(), F.getAttributes(), Outs, TLI, DL); // Analyze operands of the call, assigning locations to each operand. SmallVector<CCValAssign, 16> ValLocs; diff --git a/contrib/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp b/contrib/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp index f0000c5bafd7..84dacf396462 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp @@ -174,7 +174,7 @@ const PPCFrameLowering::SpillSlot *PPCFrameLowering::getCalleeSavedSpillSlots( {PPC::V22, -160}, {PPC::V21, -176}, {PPC::V20, -192}, - + // SPE register save area (overlaps Vector save area). 
{PPC::S31, -8}, {PPC::S30, -16}, @@ -1229,7 +1229,7 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF, if (MBBI != MBB.end()) dl = MBBI->getDebugLoc(); - + const PPCInstrInfo &TII = *Subtarget.getInstrInfo(); const PPCRegisterInfo *RegInfo = Subtarget.getRegisterInfo(); @@ -1315,7 +1315,7 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF, } bool IsReturnBlock = (MBBI != MBB.end() && MBBI->isReturn()); - + if (IsReturnBlock) { unsigned RetOpcode = MBBI->getOpcode(); bool UsesTCRet = RetOpcode == PPC::TCRETURNri || diff --git a/contrib/llvm/lib/Target/PowerPC/PPCHazardRecognizers.cpp b/contrib/llvm/lib/Target/PowerPC/PPCHazardRecognizers.cpp index 551220466901..793a4dd7f624 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCHazardRecognizers.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCHazardRecognizers.cpp @@ -50,7 +50,7 @@ bool PPCDispatchGroupSBHazardRecognizer::isLoadAfterStore(SUnit *SU) { return true; } - return false; + return false; } bool PPCDispatchGroupSBHazardRecognizer::isBCTRAfterSet(SUnit *SU) { @@ -76,7 +76,7 @@ bool PPCDispatchGroupSBHazardRecognizer::isBCTRAfterSet(SUnit *SU) { return true; } - return false; + return false; } // FIXME: Remove this when we don't need this: diff --git a/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index 1e3e14c71144..51ff8a5cf77e 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -1224,6 +1224,7 @@ unsigned PPCTargetLowering::getByValTypeAlignment(Type *Ty, } unsigned PPCTargetLowering::getNumRegistersForCallingConv(LLVMContext &Context, + CallingConv::ID CC, EVT VT) const { if (Subtarget.hasSPE() && VT == MVT::f64) return 2; @@ -1231,6 +1232,7 @@ unsigned PPCTargetLowering::getNumRegistersForCallingConv(LLVMContext &Context, } MVT PPCTargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context, + CallingConv::ID CC, EVT VT) const { if (Subtarget.hasSPE() && VT == MVT::f64) return MVT::i32; @@ -13102,8 +13104,8 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, SDValue PPCTargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor, - SelectionDAG &DAG, - std::vector<SDNode *> *Created) const { + SelectionDAG &DAG, + SmallVectorImpl<SDNode *> &Created) const { // fold (sdiv X, pow2) EVT VT = N->getValueType(0); if (VT == MVT::i64 && !Subtarget.isPPC64()) @@ -13120,13 +13122,11 @@ PPCTargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor, SDValue ShiftAmt = DAG.getConstant(Lg2, DL, VT); SDValue Op = DAG.getNode(PPCISD::SRA_ADDZE, DL, VT, N0, ShiftAmt); - if (Created) - Created->push_back(Op.getNode()); + Created.push_back(Op.getNode()); if (IsNegPow2) { Op = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Op); - if (Created) - Created->push_back(Op.getNode()); + Created.push_back(Op.getNode()); } return Op; diff --git a/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.h b/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.h index 9b8d6435515b..f174943a8004 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.h +++ b/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.h @@ -665,7 +665,7 @@ namespace llvm { SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, - std::vector<SDNode *> *Created) const override; + SmallVectorImpl<SDNode *> &Created) const override; unsigned getRegisterByName(const char* RegName, EVT VT, SelectionDAG &DAG) const override; @@ -872,9 +872,11
@@ namespace llvm { MCContext &Ctx) const override; unsigned getNumRegistersForCallingConv(LLVMContext &Context, + CallingConv::ID CC, EVT VT) const override; MVT getRegisterTypeForCallingConv(LLVMContext &Context, + CallingConv::ID CC, EVT VT) const override; private: @@ -1141,7 +1143,7 @@ namespace llvm { ISD::ArgFlagsTy &ArgFlags, CCState &State); - bool + bool CC_PPC32_SVR4_Custom_SkipLastArgRegsPPCF128(unsigned &ValNo, MVT &ValVT, MVT &LocVT, CCValAssign::LocInfo &LocInfo, diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp index 4669719744bc..0930f7d3b8d7 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -316,11 +316,11 @@ unsigned PPCInstrInfo::isLoadFromStackSlot(const MachineInstr &MI, } // For opcodes with the ReMaterializable flag set, this function is called to -// verify the instruction is really rematable. +// verify the instruction is really rematable. bool PPCInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI, AliasAnalysis *AA) const { switch (MI.getOpcode()) { - default: + default: // This function should only be called for opcodes with the ReMaterializable // flag set. llvm_unreachable("Unknown rematerializable operation!"); diff --git a/contrib/llvm/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp b/contrib/llvm/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp index 2217fa4693ce..0b57dd9b618d 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp @@ -360,7 +360,7 @@ bool PPCLoopPreIncPrep::runOnLoop(Loop *L) { // generate direct offsets from both the pre-incremented and // post-incremented pointer values. Thus, we'll pick the first non-prefetch // instruction in each bucket, and adjust the recurrence and other offsets - // accordingly. + // accordingly. for (int j = 0, je = Buckets[i].Elements.size(); j != je; ++j) { if (auto *II = dyn_cast<IntrinsicInst>(Buckets[i].Elements[j].Instr)) if (II->getIntrinsicID() == Intrinsic::prefetch) diff --git a/contrib/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp b/contrib/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp index 62a612feb55c..e731c0bc0c23 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp @@ -75,7 +75,7 @@ static MCSymbol *GetSymbolFromOperand(const MachineOperand &MO, } return Sym; } - + return Sym; } @@ -130,7 +130,7 @@ static MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol, // Subtract off the PIC base if required.
if (MO.getTargetFlags() & PPCII::MO_PIC_FLAG) { const MachineFunction *MF = MO.getParent()->getParent()->getParent(); - + const MCExpr *PB = MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx); Expr = MCBinaryExpr::createSub(Expr, PB, Ctx); } @@ -151,7 +151,7 @@ static MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol, void llvm::LowerPPCMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI, AsmPrinter &AP, bool isDarwin) { OutMI.setOpcode(MI->getOpcode()); - + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { MCOperand MCOp; if (LowerPPCMachineOperandToMCOperand(MI->getOperand(i), MCOp, AP, diff --git a/contrib/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp b/contrib/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp index dbe1fe37ddf8..0068df19f0c8 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp @@ -891,7 +891,7 @@ static bool eligibleForCompareElimination(MachineBasicBlock &MBB, auto BII = BB.getFirstInstrTerminator(); // We optimize BBs ending with a conditional branch. // We check only for BCC here, not BCCLR, because BCCLR - // will be formed only later in the pipeline. + // will be formed only later in the pipeline. if (BB.succ_size() == 2 && BII != BB.instr_end() && (*BII).getOpcode() == PPC::BCC && diff --git a/contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.h b/contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.h index b14bbad2039a..8a3f50aa9565 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.h +++ b/contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.h @@ -29,7 +29,7 @@ class PPCFunctionInfo : public MachineFunctionInfo { /// stored. Also used as an anchor for instructions that need to be altered /// when using frame pointers (dyna_add, dyna_sub.) int FramePointerSaveIndex = 0; - + /// ReturnAddrSaveIndex - Frame index of where the return address is stored. /// int ReturnAddrSaveIndex = 0; @@ -128,7 +128,7 @@ public: int getFramePointerSaveIndex() const { return FramePointerSaveIndex; } void setFramePointerSaveIndex(int Idx) { FramePointerSaveIndex = Idx; } - + int getReturnAddrSaveIndex() const { return ReturnAddrSaveIndex; } void setReturnAddrSaveIndex(int idx) { ReturnAddrSaveIndex = idx; } diff --git a/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp b/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp index 6647ceace5eb..96923a97a82c 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp @@ -979,7 +979,7 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, SReg = MF.getRegInfo().createVirtualRegister(RC); // Insert a set of rA with the full offset value before the ld, st, or add - if (isInt<16>(Offset)) + if (isInt<16>(Offset)) BuildMI(MBB, II, dl, TII.get(is64Bit ? 
PPC::LI8 : PPC::LI), SReg) .addImm(Offset); else { diff --git a/contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp index 226c75f704f4..b0da9b5a6d70 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp @@ -201,7 +201,7 @@ unsigned PPCTTIImpl::getUserCost(const User *U, std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, U->getType()); return LT.first * BaseT::getUserCost(U, Operands); } - + return BaseT::getUserCost(U, Operands); } diff --git a/contrib/llvm/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp b/contrib/llvm/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp index 1e8a1750ec3b..1be193e08c01 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp @@ -443,7 +443,7 @@ bool PPCVSXSwapRemoval::gatherVectorInstructions() { // We can handle STXSDX and STXSSPX similarly to LXSDX and LXSSPX, // by adding special handling for narrowing copies as well as // widening ones. However, I've experimented with this, and in - // practice we currently do not appear to use STXSDX fed by + // practice we currently do not appear to use STXSDX fed by // a narrowing copy from a full vector register. Since I can't // generate any useful test cases, I've left this alone for now. case PPC::STXSDX: diff --git a/contrib/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp b/contrib/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp index c7a5a1e8e6ee..35f52f7d279b 100644 --- a/contrib/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp +++ b/contrib/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp @@ -190,7 +190,7 @@ public: Sparc::C8_C9, Sparc::C10_C11, Sparc::C12_C13, Sparc::C14_C15, Sparc::C16_C17, Sparc::C18_C19, Sparc::C20_C21, Sparc::C22_C23, Sparc::C24_C25, Sparc::C26_C27, Sparc::C28_C29, Sparc::C30_C31}; - + namespace { /// SparcOperand - Instances of this class represent a parsed Sparc machine @@ -459,7 +459,7 @@ public: Op.Reg.Kind = rk_CoprocPairReg; return true; } - + static std::unique_ptr<SparcOperand> MorphToMEMrr(unsigned Base, std::unique_ptr<SparcOperand> Op) { unsigned offsetReg = Op->getReg(); @@ -1000,7 +1000,7 @@ bool SparcAsmParser::matchRegisterName(const AsmToken &Tok, unsigned &RegNo, RegKind = SparcOperand::rk_Special; return true; } - + if (name.equals("wim")) { RegNo = Sparc::WIM; RegKind = SparcOperand::rk_Special; @@ -1093,7 +1093,7 @@ bool SparcAsmParser::matchRegisterName(const AsmToken &Tok, unsigned &RegNo, RegKind = SparcOperand::rk_CoprocReg; return true; } - + if (name.equals("tpc")) { RegNo = Sparc::TPC; RegKind = SparcOperand::rk_Special; diff --git a/contrib/llvm/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp b/contrib/llvm/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp index 8e298e8316da..3e30dae1537f 100644 --- a/contrib/llvm/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp +++ b/contrib/llvm/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp @@ -350,18 +350,18 @@ DecodeStatus SparcDisassembler::getInstruction(MCInst &Instr, uint64_t &Size, return MCDisassembler::Fail; // Calling the auto-generated decoder function. 
- + if (STI.getFeatureBits()[Sparc::FeatureV9]) { Result = decodeInstruction(DecoderTableSparcV932, Instr, Insn, Address, this, STI); } else { - Result = decodeInstruction(DecoderTableSparcV832, Instr, Insn, Address, this, STI); + Result = decodeInstruction(DecoderTableSparcV832, Instr, Insn, Address, this, STI); } if (Result != MCDisassembler::Fail) return Result; - + Result = decodeInstruction(DecoderTableSparc32, Instr, Insn, Address, this, STI); @@ -662,7 +662,7 @@ static DecodeStatus DecodeTRAP(MCInst &MI, unsigned insn, uint64_t Address, if (status != MCDisassembler::Success) return status; } - + // Decode CC MI.addOperand(MCOperand::createImm(cc)); diff --git a/contrib/llvm/lib/Target/Sparc/InstPrinter/SparcInstPrinter.cpp b/contrib/llvm/lib/Target/Sparc/InstPrinter/SparcInstPrinter.cpp index 4981deae6af6..c1512cbdc44f 100644 --- a/contrib/llvm/lib/Target/Sparc/InstPrinter/SparcInstPrinter.cpp +++ b/contrib/llvm/lib/Target/Sparc/InstPrinter/SparcInstPrinter.cpp @@ -118,9 +118,9 @@ void SparcInstPrinter::printOperand(const MCInst *MI, int opNum, if (MO.isImm()) { switch (MI->getOpcode()) { default: - O << (int)MO.getImm(); + O << (int)MO.getImm(); return; - + case SP::TICCri: // Fall through case SP::TICCrr: // Fall through case SP::TRAPri: // Fall through @@ -128,7 +128,7 @@ void SparcInstPrinter::printOperand(const MCInst *MI, int opNum, case SP::TXCCri: // Fall through case SP::TXCCrr: // Fall through // Only seven-bit values up to 127. - O << ((int) MO.getImm() & 0x7f); + O << ((int) MO.getImm() & 0x7f); return; } } diff --git a/contrib/llvm/lib/Target/Sparc/Sparc.h b/contrib/llvm/lib/Target/Sparc/Sparc.h index 4135e4e1b61d..0cea53b359eb 100644 --- a/contrib/llvm/lib/Target/Sparc/Sparc.h +++ b/contrib/llvm/lib/Target/Sparc/Sparc.h @@ -73,7 +73,7 @@ namespace llvm { FCC_LE = 13+16, // Less or Equal FCC_ULE = 14+16, // Unordered or Less or Equal FCC_O = 15+16, // Ordered - + CPCC_A = 8+32, // Always CPCC_N = 0+32, // Never CPCC_3 = 7+32, diff --git a/contrib/llvm/lib/Target/Sparc/SparcISelLowering.h b/contrib/llvm/lib/Target/Sparc/SparcISelLowering.h index bf700d6a99d8..0cbbda787881 100644 --- a/contrib/llvm/lib/Target/Sparc/SparcISelLowering.h +++ b/contrib/llvm/lib/Target/Sparc/SparcISelLowering.h @@ -59,9 +59,9 @@ namespace llvm { public: SparcTargetLowering(const TargetMachine &TM, const SparcSubtarget &STI); SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; - + bool useSoftFloat() const override; - + /// computeKnownBitsForTargetNode - Determine which of the bits specified /// in Mask are known to be either zero or one and return them in the /// KnownZero/KnownOne bitsets. 
diff --git a/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.cpp b/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.cpp index 6750763d8ee5..47b42444b94d 100644 --- a/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.cpp +++ b/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.cpp @@ -115,7 +115,7 @@ static SPCC::CondCodes GetOppositeBranchCondition(SPCC::CondCodes CC) case SPCC::FCC_UE: return SPCC::FCC_LG; case SPCC::FCC_NE: return SPCC::FCC_E; case SPCC::FCC_E: return SPCC::FCC_NE; - + case SPCC::CPCC_A: return SPCC::CPCC_N; case SPCC::CPCC_N: return SPCC::CPCC_A; case SPCC::CPCC_3: LLVM_FALLTHROUGH; diff --git a/contrib/llvm/lib/Target/Sparc/SparcTargetMachine.cpp b/contrib/llvm/lib/Target/Sparc/SparcTargetMachine.cpp index a0d40653fd9b..07f9e7250bd9 100644 --- a/contrib/llvm/lib/Target/Sparc/SparcTargetMachine.cpp +++ b/contrib/llvm/lib/Target/Sparc/SparcTargetMachine.cpp @@ -100,7 +100,7 @@ SparcTargetMachine::SparcTargetMachine( SparcTargetMachine::~SparcTargetMachine() {} -const SparcSubtarget * +const SparcSubtarget * SparcTargetMachine::getSubtargetImpl(const Function &F) const { Attribute CPUAttr = F.getFnAttribute("target-cpu"); Attribute FSAttr = F.getFnAttribute("target-features"); @@ -119,7 +119,7 @@ SparcTargetMachine::getSubtargetImpl(const Function &F) const { F.hasFnAttribute("use-soft-float") && F.getFnAttribute("use-soft-float").getValueAsString() == "true"; - if (softFloat) + if (softFloat) FS += FS.empty() ? "+soft-float" : ",+soft-float"; auto &I = SubtargetMap[CPU + FS]; diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZHazardRecognizer.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZHazardRecognizer.cpp index d300d1d88abc..b9e5788cf018 100644 --- a/contrib/llvm/lib/Target/SystemZ/SystemZHazardRecognizer.cpp +++ b/contrib/llvm/lib/Target/SystemZ/SystemZHazardRecognizer.cpp @@ -55,7 +55,7 @@ getNumDecoderSlots(SUnit *SU) const { else return 3; // Expanded/group-alone instruction } - + return 1; // Normal instruction } @@ -81,6 +81,7 @@ getHazardType(SUnit *m, int Stalls) { void SystemZHazardRecognizer::Reset() { CurrGroupSize = 0; + CurrGroupHas4RegOps = false; clearProcResCounters(); GrpCount = 0; LastFPdOpCycleIdx = UINT_MAX; @@ -99,6 +100,12 @@ SystemZHazardRecognizer::fitsIntoCurrentGroup(SUnit *SU) const { if (SC->BeginGroup) return (CurrGroupSize == 0); + // An instruction with 4 register operands will not fit in last slot. + assert ((CurrGroupSize < 2 || !CurrGroupHas4RegOps) && + "Current decoder group is already full!"); + if (CurrGroupSize == 2 && has4RegOps(SU->getInstr())) + return false; + // Since a full group is handled immediately in EmitInstruction(), // SU should fit into current group. NumSlots should be 1 or 0, // since it is not a cracked or expanded instruction. 
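The new CurrGroupHas4RegOps flag feeds a simple grouping rule: a decoder group holds up to three instructions, but an instruction with four register operands cannot be decoded in the last slot. A simplified model of the check, assuming illustrative names rather than the patch's exact interface:

// z13 decoder groups hold at most three instructions; an instruction with
// four register operands must not land in the third (last) slot.
bool fitsGroup(unsigned CurrGroupSize, unsigned NumSlots, bool Has4RegOps) {
  if (CurrGroupSize == 2 && Has4RegOps)
    return false;                 // would be decoded in the last slot
  return CurrGroupSize + NumSlots <= 3;
}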
@@ -108,6 +115,23 @@ SystemZHazardRecognizer::fitsIntoCurrentGroup(SUnit *SU) const { return true; } +bool SystemZHazardRecognizer::has4RegOps(const MachineInstr *MI) const { + const MachineFunction &MF = *MI->getParent()->getParent(); + const TargetRegisterInfo *TRI = &TII->getRegisterInfo(); + const MCInstrDesc &MID = MI->getDesc(); + unsigned Count = 0; + for (unsigned OpIdx = 0; OpIdx < MID.getNumOperands(); OpIdx++) { + const TargetRegisterClass *RC = TII->getRegClass(MID, OpIdx, TRI, MF); + if (RC == nullptr) + continue; + if (OpIdx >= MID.getNumDefs() && + MID.getOperandConstraint(OpIdx, MCOI::TIED_TO) != -1) + continue; + Count++; + } + return Count >= 4; +} + void SystemZHazardRecognizer::nextGroup() { if (CurrGroupSize == 0) return; @@ -119,6 +143,7 @@ void SystemZHazardRecognizer::nextGroup() { // Reset counter for next group. CurrGroupSize = 0; + CurrGroupHas4RegOps = false; // Decrease counters for execution units by one. for (unsigned i = 0; i < SchedModel->getNumProcResourceKinds(); ++i) @@ -142,7 +167,7 @@ void SystemZHazardRecognizer::dumpSU(SUnit *SU, raw_ostream &OS) const { const MCSchedClassDesc *SC = getSchedClass(SU); if (!SC->isValid()) return; - + for (TargetSchedModel::ProcResIter PI = SchedModel->getWriteProcResBegin(SC), PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) { @@ -172,6 +197,8 @@ void SystemZHazardRecognizer::dumpSU(SUnit *SU, raw_ostream &OS) const { OS << "/EndsGroup"; if (SU->isUnbuffered) OS << "/Unbuffered"; + if (has4RegOps(SU->getInstr())) + OS << "/4RegOps"; } void SystemZHazardRecognizer::dumpCurrGroup(std::string Msg) const { @@ -184,6 +211,7 @@ void SystemZHazardRecognizer::dumpCurrGroup(std::string Msg) const { dbgs() << "{ " << CurGroupDbg << " }"; dbgs() << " (" << CurrGroupSize << " decoder slot" << (CurrGroupSize > 1 ? "s":"") + << (CurrGroupHas4RegOps ? ", 4RegOps" : "") << ")\n"; } } @@ -294,11 +322,14 @@ EmitInstruction(SUnit *SU) { // Insert SU into current group by increasing number of slots used // in current group. CurrGroupSize += getNumDecoderSlots(SU); - assert (CurrGroupSize <= 3); + CurrGroupHas4RegOps |= has4RegOps(SU->getInstr()); + unsigned GroupLim = + ((CurrGroupHas4RegOps && getNumDecoderSlots(SU) < 3) ? 2 : 3); + assert (CurrGroupSize <= GroupLim && "SU does not fit into decoder group!"); // Check if current group is now full/ended. If so, move on to next // group to be ready to evaluate more candidates. - if (CurrGroupSize == 3 || SC->EndGroup) + if (CurrGroupSize == GroupLim || SC->EndGroup) nextGroup(); } @@ -306,7 +337,7 @@ int SystemZHazardRecognizer::groupingCost(SUnit *SU) const { const MCSchedClassDesc *SC = getSchedClass(SU); if (!SC->isValid()) return 0; - + // If SU begins new group, it can either break a current group early // or fit naturally if current group is empty (negative cost). if (SC->BeginGroup) { @@ -325,6 +356,10 @@ int SystemZHazardRecognizer::groupingCost(SUnit *SU) const { return -1; } + // An instruction with 4 register operands will not fit in last slot. + if (CurrGroupSize == 2 && has4RegOps(SU->getInstr())) + return 1; + // Most instructions can be placed in any decoder slot. 
return 0; } diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZHazardRecognizer.h b/contrib/llvm/lib/Target/SystemZ/SystemZHazardRecognizer.h index 40cb3acc7009..6292feefbfea 100644 --- a/contrib/llvm/lib/Target/SystemZ/SystemZHazardRecognizer.h +++ b/contrib/llvm/lib/Target/SystemZ/SystemZHazardRecognizer.h @@ -45,15 +45,17 @@ namespace llvm { /// SystemZHazardRecognizer maintains the state for one MBB during scheduling. class SystemZHazardRecognizer : public ScheduleHazardRecognizer { -#ifndef NDEBUG const SystemZInstrInfo *TII; -#endif const TargetSchedModel *SchedModel; /// Keep track of the number of decoder slots used in the current /// decoder group. unsigned CurrGroupSize; + /// True if an instruction with four reg operands has been scheduled into + the current decoder group. + bool CurrGroupHas4RegOps; + /// The tracking of resources here is quite similar to the common /// code use of a critical resource. However, z13 differs in the way /// that it has two processor sides which may be interesting to @@ -73,6 +75,9 @@ class SystemZHazardRecognizer : public ScheduleHazardRecognizer { /// Return true if MI fits into current decoder group. bool fitsIntoCurrentGroup(SUnit *SU) const; + /// Return true if this instruction has four register operands. + bool has4RegOps(const MachineInstr *MI) const; + /// Two decoder groups per cycle are formed (for z13), meaning 2x3 /// instructions. This function returns a number between 0 and 5, /// representing the current decoder slot of the current cycle. If an SU @@ -105,11 +110,7 @@ class SystemZHazardRecognizer : public ScheduleHazardRecognizer { public: SystemZHazardRecognizer(const SystemZInstrInfo *tii, const TargetSchedModel *SM) - : -#ifndef NDEBUG - TII(tii), -#endif - SchedModel(SM) { + : TII(tii), SchedModel(SM) { Reset(); } @@ -134,7 +135,7 @@ public: /// new decoder group, this is negative if this fits the schedule or /// positive if it would mean ending a group prematurely. For normal /// instructions this returns 0. - int groupingCost(SUnit *SU) const; + int groupingCost(SUnit *SU) const; /// Return the cost of SU in regards to processor resources usage. /// A positive value means it would be better to wait with SU, while diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp index 302c7883f97b..e76fa71dacd7 100644 --- a/contrib/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/contrib/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -527,10 +527,6 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); setTargetDAGCombine(ISD::FP_ROUND); setTargetDAGCombine(ISD::BSWAP); - setTargetDAGCombine(ISD::SHL); - setTargetDAGCombine(ISD::SRA); - setTargetDAGCombine(ISD::SRL); - setTargetDAGCombine(ISD::ROTL); // Handle intrinsics. setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); @@ -609,7 +605,7 @@ struct AddressingMode { // True if use of index register is supported. bool IndexReg; - + AddressingMode(bool LongDispl, bool IdxReg) : LongDisplacement(LongDispl), IndexReg(IdxReg) {} }; @@ -5524,76 +5520,6 @@ SDValue SystemZTargetLowering::combineBSWAP( return SDValue(); } -SDValue SystemZTargetLowering::combineSHIFTROT( - SDNode *N, DAGCombinerInfo &DCI) const { - - SelectionDAG &DAG = DCI.DAG; - - // Shift/rotate instructions only use the last 6 bits of the second operand - // register.
If the second operand is the result of an AND with an immediate - // value that has its last 6 bits set, we can safely remove the AND operation. - // - // If the AND operation doesn't have the last 6 bits set, we can't remove it - // entirely, but we can still truncate it to a 16-bit value. This prevents - // us from ending up with a NILL with a signed operand, which will cause the - // instruction printer to abort. - SDValue N1 = N->getOperand(1); - if (N1.getOpcode() == ISD::AND) { - SDValue AndMaskOp = N1->getOperand(1); - auto *AndMask = dyn_cast<ConstantSDNode>(AndMaskOp); - - // The AND mask is constant - if (AndMask) { - auto AmtVal = AndMask->getZExtValue(); - - // Bottom 6 bits are set - if ((AmtVal & 0x3f) == 0x3f) { - SDValue AndOp = N1->getOperand(0); - - // This is the only use, so remove the node - if (N1.hasOneUse()) { - // Combine the AND away - DCI.CombineTo(N1.getNode(), AndOp); - - // Return N so it isn't rechecked - return SDValue(N, 0); - - // The node will be reused, so create a new node for this one use - } else { - SDValue Replace = DAG.getNode(N->getOpcode(), SDLoc(N), - N->getValueType(0), N->getOperand(0), - AndOp); - DCI.AddToWorklist(Replace.getNode()); - - return Replace; - } - - // We can't remove the AND, but we can use NILL here (normally we would - // use NILF). Only keep the last 16 bits of the mask. The actual - // transformation will be handled by .td definitions. - } else if (AmtVal >> 16 != 0) { - SDValue AndOp = N1->getOperand(0); - - auto NewMask = DAG.getConstant(AndMask->getZExtValue() & 0x0000ffff, - SDLoc(AndMaskOp), - AndMaskOp.getValueType()); - - auto NewAnd = DAG.getNode(N1.getOpcode(), SDLoc(N1), N1.getValueType(), - AndOp, NewMask); - - SDValue Replace = DAG.getNode(N->getOpcode(), SDLoc(N), - N->getValueType(0), N->getOperand(0), - NewAnd); - DCI.AddToWorklist(Replace.getNode()); - - return Replace; - } - } - } - - return SDValue(); -} - static bool combineCCMask(SDValue &CCReg, int &CCValid, int &CCMask) { // We have a SELECT_CCMASK or BR_CCMASK comparing the condition code // set by the CCReg instruction using the CCValid / CCMask masks, @@ -5752,10 +5678,6 @@ SDValue SystemZTargetLowering::PerformDAGCombine(SDNode *N, case SystemZISD::JOIN_DWORDS: return combineJOIN_DWORDS(N, DCI); case ISD::FP_ROUND: return combineFP_ROUND(N, DCI); case ISD::BSWAP: return combineBSWAP(N, DCI); - case ISD::SHL: - case ISD::SRA: - case ISD::SRL: - case ISD::ROTL: return combineSHIFTROT(N, DCI); case SystemZISD::BR_CCMASK: return combineBR_CCMASK(N, DCI); case SystemZISD::SELECT_CCMASK: return combineSELECT_CCMASK(N, DCI); case SystemZISD::GET_CCMASK: return combineGET_CCMASK(N, DCI); diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZISelLowering.h b/contrib/llvm/lib/Target/SystemZ/SystemZISelLowering.h index 0ca93a38a016..267e31a85216 100644 --- a/contrib/llvm/lib/Target/SystemZ/SystemZISelLowering.h +++ b/contrib/llvm/lib/Target/SystemZ/SystemZISelLowering.h @@ -602,7 +602,6 @@ private: SDValue combineJOIN_DWORDS(SDNode *N, DAGCombinerInfo &DCI) const; SDValue combineFP_ROUND(SDNode *N, DAGCombinerInfo &DCI) const; SDValue combineBSWAP(SDNode *N, DAGCombinerInfo &DCI) const; - SDValue combineSHIFTROT(SDNode *N, DAGCombinerInfo &DCI) const; SDValue combineBR_CCMASK(SDNode *N, DAGCombinerInfo &DCI) const; SDValue combineSELECT_CCMASK(SDNode *N, DAGCombinerInfo &DCI) const; SDValue combineGET_CCMASK(SDNode *N, DAGCombinerInfo &DCI) const; diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.td 
b/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.td index 9d7312269957..bb5b7aae883b 100644 --- a/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.td +++ b/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.td @@ -1352,8 +1352,8 @@ def : Pat<(z_udivrem GR64:$src1, (i64 (load bdxaddr20only:$src2))), //===----------------------------------------------------------------------===// // Logical shift left. -defm SLL : BinaryRSAndK<"sll", 0x89, 0xEBDF, shl, GR32>; -def SLLG : BinaryRSY<"sllg", 0xEB0D, shl, GR64>; +defm SLL : BinaryRSAndK<"sll", 0x89, 0xEBDF, shiftop<shl>, GR32>; +def SLLG : BinaryRSY<"sllg", 0xEB0D, shiftop<shl>, GR64>; def SLDL : BinaryRS<"sldl", 0x8D, null_frag, GR128>; // Arithmetic shift left. @@ -1364,20 +1364,20 @@ let Defs = [CC] in { } // Logical shift right. -defm SRL : BinaryRSAndK<"srl", 0x88, 0xEBDE, srl, GR32>; -def SRLG : BinaryRSY<"srlg", 0xEB0C, srl, GR64>; +defm SRL : BinaryRSAndK<"srl", 0x88, 0xEBDE, shiftop<srl>, GR32>; +def SRLG : BinaryRSY<"srlg", 0xEB0C, shiftop<srl>, GR64>; def SRDL : BinaryRS<"srdl", 0x8C, null_frag, GR128>; // Arithmetic shift right. let Defs = [CC], CCValues = 0xE, CompareZeroCCMask = 0xE in { - defm SRA : BinaryRSAndK<"sra", 0x8A, 0xEBDC, sra, GR32>; - def SRAG : BinaryRSY<"srag", 0xEB0A, sra, GR64>; + defm SRA : BinaryRSAndK<"sra", 0x8A, 0xEBDC, shiftop<sra>, GR32>; + def SRAG : BinaryRSY<"srag", 0xEB0A, shiftop<sra>, GR64>; def SRDA : BinaryRS<"srda", 0x8E, null_frag, GR128>; } // Rotate left. -def RLL : BinaryRSY<"rll", 0xEB1D, rotl, GR32>; -def RLLG : BinaryRSY<"rllg", 0xEB1C, rotl, GR64>; +def RLL : BinaryRSY<"rll", 0xEB1D, shiftop<rotl>, GR32>; +def RLLG : BinaryRSY<"rllg", 0xEB1C, shiftop<rotl>, GR64>; // Rotate second operand left and inserted selected bits into first operand. // These can act like 32-bit operands provided that the constant start and @@ -2162,29 +2162,29 @@ def : Pat<(and (xor GR64:$x, (i64 -1)), GR64:$y), // Complexity is added so that we match this before we match NILF on the AND // operation alone. 
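
The patterns that follow exist because the machine shift and rotate instructions ignore all but the low 6 bits of the count register; with that fact, the AND is semantically dead whenever its mask keeps those 6 bits intact, and only the low 16 bits of the mask ever matter to NILL. A standalone check of that claim (hwShl is a stand-in for the hardware semantics, not an LLVM function):

    #include <cassert>
    #include <cstdint>

    // Model of SLL/SLLG and friends: only the low 6 bits of the count are read.
    uint64_t hwShl(uint64_t Val, uint32_t Count) { return Val << (Count & 63); }

    int main() {
      const uint32_t Counts[] = {0, 1, 63, 64, 100, 0x12345};
      for (uint32_t C : Counts) {
        assert(hwShl(5, C) == hwShl(5, C & 63));    // only 6 bits are observed
        assert(hwShl(5, C) == hwShl(5, C & 0xfff)); // mask with low 6 bits set
      }
      // A mask that clears any of the low 6 bits is not removable:
      assert(hwShl(5, 7 & 0x30) != hwShl(5, 7));
    }

This is also why truncating the mask to 16 bits for NILL is safe: the shifter never observes the mask's upper bits anyway.
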
let AddedComplexity = 4 in { - def : Pat<(shl GR32:$val, (and GR32:$shift, uimm32:$imm)), - (SLL GR32:$val, (NILL GR32:$shift, uimm32:$imm), 0)>; + def : Pat<(shl GR32:$val, (and GR32:$shift, imm32zx16trunc:$imm)), + (SLL GR32:$val, (NILL GR32:$shift, imm32zx16trunc:$imm), 0)>; - def : Pat<(sra GR32:$val, (and GR32:$shift, uimm32:$imm)), - (SRA GR32:$val, (NILL GR32:$shift, uimm32:$imm), 0)>; + def : Pat<(sra GR32:$val, (and GR32:$shift, imm32zx16trunc:$imm)), + (SRA GR32:$val, (NILL GR32:$shift, imm32zx16trunc:$imm), 0)>; - def : Pat<(srl GR32:$val, (and GR32:$shift, uimm32:$imm)), - (SRL GR32:$val, (NILL GR32:$shift, uimm32:$imm), 0)>; + def : Pat<(srl GR32:$val, (and GR32:$shift, imm32zx16trunc:$imm)), + (SRL GR32:$val, (NILL GR32:$shift, imm32zx16trunc:$imm), 0)>; - def : Pat<(shl GR64:$val, (and GR32:$shift, uimm32:$imm)), - (SLLG GR64:$val, (NILL GR32:$shift, uimm32:$imm), 0)>; + def : Pat<(shl GR64:$val, (and GR32:$shift, imm32zx16trunc:$imm)), + (SLLG GR64:$val, (NILL GR32:$shift, imm32zx16trunc:$imm), 0)>; - def : Pat<(sra GR64:$val, (and GR32:$shift, uimm32:$imm)), - (SRAG GR64:$val, (NILL GR32:$shift, uimm32:$imm), 0)>; + def : Pat<(sra GR64:$val, (and GR32:$shift, imm32zx16trunc:$imm)), + (SRAG GR64:$val, (NILL GR32:$shift, imm32zx16trunc:$imm), 0)>; - def : Pat<(srl GR64:$val, (and GR32:$shift, uimm32:$imm)), - (SRLG GR64:$val, (NILL GR32:$shift, uimm32:$imm), 0)>; + def : Pat<(srl GR64:$val, (and GR32:$shift, imm32zx16trunc:$imm)), + (SRLG GR64:$val, (NILL GR32:$shift, imm32zx16trunc:$imm), 0)>; - def : Pat<(rotl GR32:$val, (and GR32:$shift, uimm32:$imm)), - (RLL GR32:$val, (NILL GR32:$shift, uimm32:$imm), 0)>; + def : Pat<(rotl GR32:$val, (and GR32:$shift, imm32zx16trunc:$imm)), + (RLL GR32:$val, (NILL GR32:$shift, imm32zx16trunc:$imm), 0)>; - def : Pat<(rotl GR64:$val, (and GR32:$shift, uimm32:$imm)), - (RLLG GR64:$val, (NILL GR32:$shift, uimm32:$imm), 0)>; + def : Pat<(rotl GR64:$val, (and GR32:$shift, imm32zx16trunc:$imm)), + (RLLG GR64:$val, (NILL GR32:$shift, imm32zx16trunc:$imm), 0)>; } // Peepholes for turning scalar operations into block operations. diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZMachineScheduler.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZMachineScheduler.cpp index fcbf4c4b5fe4..98e761ef87fe 100644 --- a/contrib/llvm/lib/Target/SystemZ/SystemZMachineScheduler.cpp +++ b/contrib/llvm/lib/Target/SystemZ/SystemZMachineScheduler.cpp @@ -129,7 +129,7 @@ SystemZPostRASchedStrategy:: SystemZPostRASchedStrategy(const MachineSchedContext *C) : MLI(C->MLI), TII(static_cast<const SystemZInstrInfo *> - (C->MF->getSubtarget().getInstrInfo())), + (C->MF->getSubtarget().getInstrInfo())), MBB(nullptr), HazardRec(nullptr) { const TargetSubtargetInfo *ST = &C->MF->getSubtarget(); SchedModel.init(ST); @@ -169,8 +169,7 @@ SUnit *SystemZPostRASchedStrategy::pickNode(bool &IsTopNode) { return *Available.begin(); } - // All nodes that are possible to schedule are stored by in the - // Available set. + // All nodes that are possible to schedule are stored in the Available set. 
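
A hypothetical reduction of the pick loop that follows: among the Available candidates, take the one with the lowest grouping cost and break ties on resource cost. Cand and pickNode here are invented for illustration; the real pass consults its HazardRecognizer for both costs.

    #include <cassert>
    #include <vector>

    struct Cand { int GroupingCost; int ResourcesCost; unsigned Id; };

    // Prefer the lowest grouping cost; break ties on resource cost.
    unsigned pickNode(const std::vector<Cand> &Available) {
      const Cand *Best = nullptr;
      for (const Cand &C : Available)
        if (!Best || C.GroupingCost < Best->GroupingCost ||
            (C.GroupingCost == Best->GroupingCost &&
             C.ResourcesCost < Best->ResourcesCost))
          Best = &C;
      return Best->Id;
    }

    int main() {
      assert(pickNode({{0, 2, 1}, {-1, 5, 2}, {-1, 3, 3}}) == 3);
    }
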
LLVM_DEBUG(dbgs() << "** Available: "; Available.dump(*HazardRec);); Candidate Best; diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZMachineScheduler.h b/contrib/llvm/lib/Target/SystemZ/SystemZMachineScheduler.h index cb0304825966..ab820e5d3e63 100644 --- a/contrib/llvm/lib/Target/SystemZ/SystemZMachineScheduler.h +++ b/contrib/llvm/lib/Target/SystemZ/SystemZMachineScheduler.h @@ -26,7 +26,7 @@ using namespace llvm; namespace llvm { - + /// A MachineSchedStrategy implementation for SystemZ post RA scheduling. class SystemZPostRASchedStrategy : public MachineSchedStrategy { @@ -37,7 +37,7 @@ class SystemZPostRASchedStrategy : public MachineSchedStrategy { // non-scheduled instructions, so it would not always be possible to call // DAG->getSchedClass(SU). TargetSchedModel SchedModel; - + /// A candidate during instruction evaluation. struct Candidate { SUnit *SU = nullptr; diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZOperands.td b/contrib/llvm/lib/Target/SystemZ/SystemZOperands.td index da682cb4e5ab..7bf32bf19a4a 100644 --- a/contrib/llvm/lib/Target/SystemZ/SystemZOperands.td +++ b/contrib/llvm/lib/Target/SystemZ/SystemZOperands.td @@ -357,6 +357,7 @@ def imm32zx16 : Immediate<i32, [{ }], UIMM16, "U16Imm">; def imm32sx16trunc : Immediate<i32, [{}], SIMM16, "S16Imm">; +def imm32zx16trunc : Immediate<i32, [{}], UIMM16, "U16Imm">; // Full 32-bit immediates. we need both signed and unsigned versions // because the assembler is picky. E.g. AFI requires signed operands diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZOperators.td b/contrib/llvm/lib/Target/SystemZ/SystemZOperators.td index 3cfe23aec417..5103867e2d9a 100644 --- a/contrib/llvm/lib/Target/SystemZ/SystemZOperators.td +++ b/contrib/llvm/lib/Target/SystemZ/SystemZOperators.td @@ -697,6 +697,16 @@ class storei<SDPatternOperator operator, SDPatternOperator store = store> : PatFrag<(ops node:$addr), (store (operator), node:$addr)>; +// Create a shift operator that optionally ignores an AND of the +// shift count with an immediate if the bottom 6 bits are all set. +def imm32bottom6set : PatLeaf<(i32 imm), [{ + return (N->getZExtValue() & 0x3f) == 0x3f; +}]>; +class shiftop<SDPatternOperator operator> + : PatFrags<(ops node:$val, node:$count), + [(operator node:$val, node:$count), + (operator node:$val, (and node:$count, imm32bottom6set))]>; + // Vector representation of all-zeros and all-ones. def z_vzero : PatFrag<(ops), (bitconvert (v16i8 (z_byte_mask (i32 0))))>; def z_vones : PatFrag<(ops), (bitconvert (v16i8 (z_byte_mask (i32 65535))))>; diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp index e2a3efda5c5e..c5cdc22f2099 100644 --- a/contrib/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp +++ b/contrib/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp @@ -329,7 +329,7 @@ bool SystemZTTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) { } int SystemZTTIImpl::getArithmeticInstrCost( - unsigned Opcode, Type *Ty, + unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo, TTI::OperandValueProperties Opd2PropInfo, @@ -469,7 +469,7 @@ int SystemZTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, assert (Tp->isVectorTy()); assert (ST->hasVector() && "getShuffleCost() called."); unsigned NumVectors = getNumberOfParts(Tp); - + // TODO: Since fp32 is expanded, the shuffle cost should always be 0. 
// FP128 values are always in scalar registers, so there is no work @@ -647,7 +647,7 @@ int SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, return Cost; } } - + if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP || Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI) { // TODO: Fix base implementation which could simplify things a bit here @@ -704,7 +704,7 @@ int SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP) return (SrcScalarBits >= 32 ? 1 : 2 /*i8/i16 extend*/); - + if ((Opcode == Instruction::ZExt || Opcode == Instruction::SExt) && Src->isIntegerTy(1)) { // This should be extension of a compare i1 result, which is done with diff --git a/contrib/llvm/lib/Target/Target.cpp b/contrib/llvm/lib/Target/Target.cpp index 42d92622d6c8..f23ea72eb513 100644 --- a/contrib/llvm/lib/Target/Target.cpp +++ b/contrib/llvm/lib/Target/Target.cpp @@ -7,7 +7,7 @@ // //===----------------------------------------------------------------------===// // -// This file implements the common infrastructure (including C bindings) for +// This file implements the common infrastructure (including C bindings) for // libLLVMTarget.a, which implements target information. // //===----------------------------------------------------------------------===// diff --git a/contrib/llvm/lib/Target/TargetLoweringObjectFile.cpp b/contrib/llvm/lib/Target/TargetLoweringObjectFile.cpp index 907ecf46e8ff..6bcf60fafc3e 100644 --- a/contrib/llvm/lib/Target/TargetLoweringObjectFile.cpp +++ b/contrib/llvm/lib/Target/TargetLoweringObjectFile.cpp @@ -92,10 +92,10 @@ static bool IsNullTerminatedString(const Constant *C) { if (const ConstantDataSequential *CDS = dyn_cast<ConstantDataSequential>(C)) { unsigned NumElts = CDS->getNumElements(); assert(NumElts != 0 && "Can't have an empty CDS"); - + if (CDS->getElementAsInteger(NumElts-1) != 0) return false; // Not null terminated. - + // Verify that the null doesn't occur anywhere else in the string. 
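
Restated over a plain byte buffer so it can be exercised in isolation (std::string here is just a test stand-in for the ConstantDataSequential element stream), the whole check is:

    #include <cassert>
    #include <string>

    // Null terminated means: last byte is NUL and no NUL occurs before it.
    bool isNullTerminatedString(const std::string &Bytes) {
      if (Bytes.empty() || Bytes.back() != '\0')
        return false;                              // not null terminated
      return Bytes.find('\0') == Bytes.size() - 1; // no interior NULs
    }

    int main() {
      assert(isNullTerminatedString(std::string("abc\0", 4)));
      assert(!isNullTerminatedString(std::string("a\0b\0", 4)));
    }
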
for (unsigned i = 0; i != NumElts-1; ++i) if (CDS->getElementAsInteger(i) == 0) diff --git a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp index b84c2d31a63e..fafbed0bd935 100644 --- a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -2603,11 +2603,11 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, bool HadVerifyError = false; // Append default arguments to "ins[bwld]" - if (Name.startswith("ins") && + if (Name.startswith("ins") && (Operands.size() == 1 || Operands.size() == 3) && (Name == "insb" || Name == "insw" || Name == "insl" || Name == "insd" || Name == "ins")) { - + AddDefaultSrcDestOperands(TmpOperands, X86Operand::CreateReg(X86::DX, NameLoc, NameLoc), DefaultMemDIOperand(NameLoc)); @@ -2615,7 +2615,7 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, } // Append default arguments to "outs[bwld]" - if (Name.startswith("outs") && + if (Name.startswith("outs") && (Operands.size() == 1 || Operands.size() == 3) && (Name == "outsb" || Name == "outsw" || Name == "outsl" || Name == "outsd" || Name == "outs")) { diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp b/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp index 82e82fe1efd9..0e861d5ddbc9 100644 --- a/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp +++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp @@ -92,7 +92,7 @@ void X86ATTInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, // the hex value of the immediate operand when it isn't in the range // [-256,255]. if (CommentStream && !HasCustomInstComment && (Imm > 255 || Imm < -256)) { - // Don't print unnecessary hex sign bits. + // Don't print unnecessary hex sign bits. if (Imm == (int16_t)(Imm)) *CommentStream << format("imm = 0x%" PRIX16 "\n", (uint16_t)Imm); else if (Imm == (int32_t)(Imm)) diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp index d030f26d98de..f1d15e66918b 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp @@ -307,10 +307,84 @@ class X86MCInstrAnalysis : public MCInstrAnalysis { public: X86MCInstrAnalysis(const MCInstrInfo *MCII) : MCInstrAnalysis(MCII) {} + bool isDependencyBreaking(const MCSubtargetInfo &STI, + const MCInst &Inst) const override; bool clearsSuperRegisters(const MCRegisterInfo &MRI, const MCInst &Inst, APInt &Mask) const override; }; +bool X86MCInstrAnalysis::isDependencyBreaking(const MCSubtargetInfo &STI, + const MCInst &Inst) const { + if (STI.getCPU() == "btver2") { + // Reference: Agner Fog's microarchitecture.pdf - Section 20 "AMD Bobcat and + // Jaguar pipeline", subsection 8 "Dependency-breaking instructions". 
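
The opcode switch that follows encodes that list; the underlying rule is that a zero/all-ones idiom whose two sources are the same register produces a value independent of that register, so the core need not wait for it. A toy model under that assumption (Op and Inst are invented types, not the MC layer):

    #include <cassert>

    enum Op { XORrr, SUBrr, CMPrr, ADDrr };
    struct Inst { Op Opcode; unsigned Dst, Src1, Src2; };

    bool isDependencyBreaking(const Inst &I) {
      switch (I.Opcode) {
      case XORrr:
      case SUBrr:
        return I.Src1 == I.Src2; // result is always zero
      case CMPrr:
        return I.Src1 == I.Src2; // flags are fixed; btver2 exploits this
      default:
        return false;
      }
    }

    int main() {
      assert(isDependencyBreaking({XORrr, 0, 1, 1}));  // xor r1, r1
      assert(!isDependencyBreaking({ADDrr, 0, 1, 1})); // add still depends on r1
    }
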
+ switch (Inst.getOpcode()) { + default: + return false; + case X86::SUB32rr: + case X86::SUB64rr: + case X86::SBB32rr: + case X86::SBB64rr: + case X86::XOR32rr: + case X86::XOR64rr: + case X86::XORPSrr: + case X86::XORPDrr: + case X86::VXORPSrr: + case X86::VXORPDrr: + case X86::ANDNPSrr: + case X86::VANDNPSrr: + case X86::ANDNPDrr: + case X86::VANDNPDrr: + case X86::PXORrr: + case X86::VPXORrr: + case X86::PANDNrr: + case X86::VPANDNrr: + case X86::PSUBBrr: + case X86::PSUBWrr: + case X86::PSUBDrr: + case X86::PSUBQrr: + case X86::VPSUBBrr: + case X86::VPSUBWrr: + case X86::VPSUBDrr: + case X86::VPSUBQrr: + case X86::PCMPEQBrr: + case X86::PCMPEQWrr: + case X86::PCMPEQDrr: + case X86::PCMPEQQrr: + case X86::VPCMPEQBrr: + case X86::VPCMPEQWrr: + case X86::VPCMPEQDrr: + case X86::VPCMPEQQrr: + case X86::PCMPGTBrr: + case X86::PCMPGTWrr: + case X86::PCMPGTDrr: + case X86::PCMPGTQrr: + case X86::VPCMPGTBrr: + case X86::VPCMPGTWrr: + case X86::VPCMPGTDrr: + case X86::VPCMPGTQrr: + case X86::MMX_PXORirr: + case X86::MMX_PANDNirr: + case X86::MMX_PSUBBirr: + case X86::MMX_PSUBDirr: + case X86::MMX_PSUBQirr: + case X86::MMX_PSUBWirr: + case X86::MMX_PCMPGTBirr: + case X86::MMX_PCMPGTDirr: + case X86::MMX_PCMPGTWirr: + case X86::MMX_PCMPEQBirr: + case X86::MMX_PCMPEQDirr: + case X86::MMX_PCMPEQWirr: + return Inst.getOperand(1).getReg() == Inst.getOperand(2).getReg(); + case X86::CMP32rr: + case X86::CMP64rr: + return Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg(); + } + } + + return false; +} + bool X86MCInstrAnalysis::clearsSuperRegisters(const MCRegisterInfo &MRI, const MCInst &Inst, APInt &Mask) const { diff --git a/contrib/llvm/lib/Target/X86/X86CallingConv.h b/contrib/llvm/lib/Target/X86/X86CallingConv.h index c49a6838fa44..d0fcbd313312 100644 --- a/contrib/llvm/lib/Target/X86/X86CallingConv.h +++ b/contrib/llvm/lib/Target/X86/X86CallingConv.h @@ -66,7 +66,7 @@ inline bool CC_X86_32_MCUInReg(unsigned &ValNo, MVT &ValVT, // not to split i64 and double between a register and stack static const MCPhysReg RegList[] = {X86::EAX, X86::EDX, X86::ECX}; static const unsigned NumRegs = sizeof(RegList)/sizeof(RegList[0]); - + SmallVectorImpl<CCValAssign> &PendingMembers = State.getPendingLocs(); // If this is the first part of an double/i64/i128, or if we're already diff --git a/contrib/llvm/lib/Target/X86/X86CmovConversion.cpp b/contrib/llvm/lib/Target/X86/X86CmovConversion.cpp index f73455cc31b8..1c5f110d8c60 100644 --- a/contrib/llvm/lib/Target/X86/X86CmovConversion.cpp +++ b/contrib/llvm/lib/Target/X86/X86CmovConversion.cpp @@ -622,7 +622,7 @@ void X86CmovConverterPass::convertCmovInstsToBranches( // If the CMOV group is not packed, e.g., there are debug instructions between // first CMOV and last CMOV, then pack the group and make the CMOV instruction - // consecutive by moving the debug instructions to after the last CMOV. + // consecutive by moving the debug instructions to after the last CMOV. 
packCmovGroup(Group.front(), Group.back()); // To convert a CMOVcc instruction, we actually have to insert the diamond diff --git a/contrib/llvm/lib/Target/X86/X86FastISel.cpp b/contrib/llvm/lib/Target/X86/X86FastISel.cpp index de8b40f28a86..35a15577fe09 100644 --- a/contrib/llvm/lib/Target/X86/X86FastISel.cpp +++ b/contrib/llvm/lib/Target/X86/X86FastISel.cpp @@ -1195,7 +1195,7 @@ bool X86FastISel::X86SelectRet(const Instruction *I) { if (Ret->getNumOperands() > 0) { SmallVector<ISD::OutputArg, 4> Outs; - GetReturnInfo(F.getReturnType(), F.getAttributes(), Outs, TLI, DL); + GetReturnInfo(CC, F.getReturnType(), F.getAttributes(), Outs, TLI, DL); // Analyze operands of the call, assigning locations to each operand. SmallVector<CCValAssign, 16> ValLocs; @@ -2649,7 +2649,7 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::VMOVPDI2DIrr), ResultReg) .addReg(InputReg, RegState::Kill); - + // The result value is in the lower 16-bits of ResultReg. unsigned RegIdx = X86::sub_16bit; ResultReg = fastEmitInst_extractsubreg(MVT::i16, ResultReg, true, RegIdx); @@ -3687,7 +3687,7 @@ X86FastISel::fastSelectInstruction(const Instruction *I) { unsigned Reg = getRegForValue(I->getOperand(0)); if (Reg == 0) return false; - + // No instruction is needed for conversion. Reuse the register used by // the first operand. updateValueMap(I, Reg); diff --git a/contrib/llvm/lib/Target/X86/X86FixupLEAs.cpp b/contrib/llvm/lib/Target/X86/X86FixupLEAs.cpp index d85389a0a7f1..f3f7f6a37360 100644 --- a/contrib/llvm/lib/Target/X86/X86FixupLEAs.cpp +++ b/contrib/llvm/lib/Target/X86/X86FixupLEAs.cpp @@ -578,7 +578,7 @@ bool FixupLEAPass::processBasicBlock(MachineFunction &MF, continue; if (OptLEA) { - if (MF.getSubtarget<X86Subtarget>().isSLM()) + if (MF.getSubtarget<X86Subtarget>().slowLEA()) processInstructionForSLM(I, MFI); else { diff --git a/contrib/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp b/contrib/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp index 1ba08d39c595..c17c51a7aeac 100644 --- a/contrib/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp +++ b/contrib/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp @@ -730,9 +730,12 @@ CondRegArray X86FlagsCopyLoweringPass::collectCondsInRegs( for (MachineInstr &MI : llvm::reverse(llvm::make_range(MBB.begin(), TestPos))) { X86::CondCode Cond = X86::getCondFromSETOpc(MI.getOpcode()); - if (Cond != X86::COND_INVALID && MI.getOperand(0).isReg() && - TRI->isVirtualRegister(MI.getOperand(0).getReg())) + if (Cond != X86::COND_INVALID && !MI.mayStore() && MI.getOperand(0).isReg() && + TRI->isVirtualRegister(MI.getOperand(0).getReg())) { + assert(MI.getOperand(0).isDef() && + "A non-storing SETcc should always define a register!"); CondRegs[Cond] = MI.getOperand(0).getReg(); + } // Stop scanning when we see the first definition of the EFLAGS as prior to // this we would potentially capture the wrong flag state.
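
The effect of the mayStore guard added above can be seen in a simplified model of the backward scan (Inst and the integer condition codes are stand-ins for the Machine IR types):

    #include <cassert>
    #include <map>
    #include <vector>

    // Toy instruction record standing in for MachineInstr.
    struct Inst {
      int SetccCond;   // condition produced by a SETcc, or -1
      bool MayStore;   // a SETcc to memory defines no register
      bool DefsEFLAGS; // instruction redefines the flags
      int DefReg;      // virtual register written by a register-form SETcc
    };

    // Walk backwards from the test position, recording which condition codes
    // already live in registers; memory-form SETcc must be skipped.
    std::map<int, int> collectCondsInRegs(const std::vector<Inst> &Block) {
      std::map<int, int> CondRegs;
      for (auto I = Block.rbegin(); I != Block.rend(); ++I) {
        if (I->SetccCond != -1 && !I->MayStore)
          CondRegs[I->SetccCond] = I->DefReg;
        if (I->DefsEFLAGS)
          break; // anything earlier reflects a different flag state
      }
      return CondRegs;
    }

    int main() {
      // setne to memory (skipped), then sete %v5; only cond 0 is collected.
      std::vector<Inst> B = {{1, true, false, 0}, {0, false, false, 5}};
      auto M = collectCondsInRegs(B);
      assert(M.size() == 1 && M.at(0) == 5);
    }
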
diff --git a/contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp b/contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp index ae748901164a..f330acff61a1 100644 --- a/contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp +++ b/contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp @@ -347,12 +347,12 @@ bool FPS::runOnMachineFunction(MachineFunction &MF) { LiveBundle &Bundle = LiveBundles[Bundles->getBundle(Entry->getNumber(), false)]; - + // In regcall convention, some FP registers may not be passed through // the stack, so they will need to be assigned to the stack first if ((Entry->getParent()->getFunction().getCallingConv() == CallingConv::X86_RegCall) && (Bundle.Mask && !Bundle.FixCount)) { - // In the register calling convention, up to one FP argument could be + // In the register calling convention, up to one FP argument could be // saved in the first FP register. // If bundle.mask is non-zero and Bundle.FixCount is zero, it means // that the FP registers contain arguments. @@ -991,7 +991,7 @@ void FPS::handleCall(MachineBasicBlock::iterator &I) { assert(STReturns == 0 || (isMask_32(STReturns) && N <= 2)); // Reset the FP Stack - It is required because of possible leftovers from - // passed arguments. The caller should assume that the FP stack is + // passed arguments. The caller should assume that the FP stack is // returned empty (unless the callee returns values on FP stack). while (StackTop > 0) popReg(); diff --git a/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp b/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp index a257ec41f75b..e207c343fac8 100644 --- a/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp +++ b/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp @@ -68,7 +68,7 @@ X86FrameLowering::canSimplifyCallFramePseudos(const MachineFunction &MF) const { // needsFrameIndexResolution - Do we need to perform FI resolution for // this function. Normally, this is required only when the function // has any stack objects. However, FI resolution actually has another job, -// not apparent from the title - it resolves callframesetup/destroy +// not apparent from the title - it resolves callframesetup/destroy // that were not simplified earlier. // So, this is required for x86 functions that have push sequences even // when there are no stack objects. @@ -607,8 +607,7 @@ void X86FrameLowering::emitStackProbeInline(MachineFunction &MF, int64_t RCXShadowSlot = 0; int64_t RDXShadowSlot = 0; - // If inlining in the prolog, save RCX and RDX. - // Future optimization: don't save or restore if not live in. + // If inlining in the prolog, save RCX and RDX. if (InProlog) { // Compute the offsets. We need to account for things already // pushed onto the stack at this point: return address, frame @@ -616,15 +615,30 @@ void X86FrameLowering::emitStackProbeInline(MachineFunction &MF, X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); const int64_t CalleeSaveSize = X86FI->getCalleeSavedFrameSize(); const bool HasFP = hasFP(MF); - RCXShadowSlot = 8 + CalleeSaveSize + (HasFP ? 8 : 0); - RDXShadowSlot = RCXShadowSlot + 8; - // Emit the saves. - addRegOffset(BuildMI(&MBB, DL, TII.get(X86::MOV64mr)), X86::RSP, false, - RCXShadowSlot) - .addReg(X86::RCX); - addRegOffset(BuildMI(&MBB, DL, TII.get(X86::MOV64mr)), X86::RSP, false, - RDXShadowSlot) - .addReg(X86::RDX); + + // Check if we need to spill RCX and/or RDX. + // Here we assume that no earlier prologue instruction changes RCX and/or + // RDX, so checking the block live-ins is enough. 
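
The shadow-slot assignment implemented just below is easy to check in isolation; a sketch with plain integers (assignShadowSlots is a hypothetical helper mirroring the hunk):

    #include <cassert>
    #include <cstdint>

    struct Slots { int64_t RCX = 0, RDX = 0; }; // 0 means "not spilled"

    // Both live registers start at the same offset past the return address,
    // callee saves and (optionally) the frame pointer; RDX moves up by 8
    // only when both must be spilled.
    Slots assignShadowSlots(bool RCXLive, bool RDXLive, int64_t CalleeSaveSize,
                            bool HasFP) {
      Slots S;
      int64_t InitSlot = 8 /*return addr*/ + CalleeSaveSize + (HasFP ? 8 : 0);
      if (RCXLive) S.RCX = InitSlot;
      if (RDXLive) S.RDX = InitSlot;
      if (RCXLive && RDXLive) S.RDX += 8;
      return S;
    }

    int main() {
      Slots S = assignShadowSlots(true, true, 16, true);
      assert(S.RCX == 32 && S.RDX == 40);
      S = assignShadowSlots(false, true, 0, false);
      assert(S.RCX == 0 && S.RDX == 8);
    }

Note that InitSlot is always at least 8, which is what lets the restore code later in this hunk treat a zero shadow slot as "was not spilled".
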
+ const bool IsRCXLiveIn = MBB.isLiveIn(X86::RCX); + const bool IsRDXLiveIn = MBB.isLiveIn(X86::RDX); + int64_t InitSlot = 8 + CalleeSaveSize + (HasFP ? 8 : 0); + // Assign the initial slot to both registers, then change RDX's slot if both + // need to be spilled. + if (IsRCXLiveIn) + RCXShadowSlot = InitSlot; + if (IsRDXLiveIn) + RDXShadowSlot = InitSlot; + if (IsRDXLiveIn && IsRCXLiveIn) + RDXShadowSlot += 8; + // Emit the saves if needed. + if (IsRCXLiveIn) + addRegOffset(BuildMI(&MBB, DL, TII.get(X86::MOV64mr)), X86::RSP, false, + RCXShadowSlot) + .addReg(X86::RCX); + if (IsRDXLiveIn) + addRegOffset(BuildMI(&MBB, DL, TII.get(X86::MOV64mr)), X86::RSP, false, + RDXShadowSlot) + .addReg(X86::RDX); } else { // Not in the prolog. Copy RAX to a virtual reg. BuildMI(&MBB, DL, TII.get(X86::MOV64rr), SizeReg).addReg(X86::RAX); @@ -661,6 +675,7 @@ void X86FrameLowering::emitStackProbeInline(MachineFunction &MF, BuildMI(&MBB, DL, TII.get(X86::JAE_1)).addMBB(ContinueMBB); // Add code to roundMBB to round the final stack pointer to a page boundary. + RoundMBB->addLiveIn(FinalReg); BuildMI(RoundMBB, DL, TII.get(X86::AND64ri32), RoundedReg) .addReg(FinalReg) .addImm(PageMask); @@ -677,6 +692,7 @@ void X86FrameLowering::emitStackProbeInline(MachineFunction &MF, .addMBB(LoopMBB); } + LoopMBB->addLiveIn(JoinReg); addRegOffset(BuildMI(LoopMBB, DL, TII.get(X86::LEA64r), ProbeReg), JoinReg, false, -PageSize); @@ -688,6 +704,8 @@ void X86FrameLowering::emitStackProbeInline(MachineFunction &MF, .addImm(0) .addReg(0) .addImm(0); + + LoopMBB->addLiveIn(RoundedReg); BuildMI(LoopMBB, DL, TII.get(X86::CMP64rr)) .addReg(RoundedReg) .addReg(ProbeReg); @@ -697,16 +715,19 @@ void X86FrameLowering::emitStackProbeInline(MachineFunction &MF, // If in prolog, restore RDX and RCX. if (InProlog) { - addRegOffset(BuildMI(*ContinueMBB, ContinueMBBI, DL, TII.get(X86::MOV64rm), - X86::RCX), - X86::RSP, false, RCXShadowSlot); - addRegOffset(BuildMI(*ContinueMBB, ContinueMBBI, DL, TII.get(X86::MOV64rm), - X86::RDX), - X86::RSP, false, RDXShadowSlot); + if (RCXShadowSlot) // It means we spilled RCX in the prologue. + addRegOffset(BuildMI(*ContinueMBB, ContinueMBBI, DL, + TII.get(X86::MOV64rm), X86::RCX), + X86::RSP, false, RCXShadowSlot); + if (RDXShadowSlot) // It means we spilled RDX in the prologue. + addRegOffset(BuildMI(*ContinueMBB, ContinueMBBI, DL, + TII.get(X86::MOV64rm), X86::RDX), + X86::RSP, false, RDXShadowSlot); } // Now that the probing is done, add code to continueMBB to update // the stack pointer for real. + ContinueMBB->addLiveIn(SizeReg); BuildMI(*ContinueMBB, ContinueMBBI, DL, TII.get(X86::SUB64rr), X86::RSP) .addReg(X86::RSP) .addReg(SizeReg); @@ -734,8 +755,6 @@ void X86FrameLowering::emitStackProbeInline(MachineFunction &MF, CMBBI->setFlag(MachineInstr::FrameSetup); } } - - // Possible TODO: physreg liveness for InProlog case. } void X86FrameLowering::emitStackProbeCall(MachineFunction &MF, @@ -2694,7 +2713,7 @@ bool X86FrameLowering::adjustStackWithPops(MachineBasicBlock &MBB, Regs[FoundRegs++] = Regs[0]; for (int i = 0; i < NumPops; ++i) - BuildMI(MBB, MBBI, DL, + BuildMI(MBB, MBBI, DL, TII.get(STI.is64Bit() ? X86::POP64r : X86::POP32r), Regs[i]); return true; @@ -2984,7 +3003,7 @@ struct X86FrameSortingComparator { // in general. Something to keep in mind, though. if (DensityAScaled == DensityBScaled) return A.ObjectAlignment < B.ObjectAlignment; - + return DensityAScaled < DensityBScaled; } }; @@ -3020,7 +3039,7 @@ void X86FrameLowering::orderFrameObjects( if (ObjectSize == 0) // Variable size. 
Just use 4. SortingObjects[Obj].ObjectSize = 4; - else + else SortingObjects[Obj].ObjectSize = ObjectSize; } diff --git a/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp b/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp index 7dcdb7967058..2820004cfc6d 100644 --- a/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -1800,17 +1800,19 @@ X86TargetLowering::getPreferredVectorAction(EVT VT) const { } MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context, + CallingConv::ID CC, EVT VT) const { if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI()) return MVT::v32i8; - return TargetLowering::getRegisterTypeForCallingConv(Context, VT); + return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT); } unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context, + CallingConv::ID CC, EVT VT) const { if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI()) return 1; - return TargetLowering::getNumRegistersForCallingConv(Context, VT); + return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT); } EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL, @@ -23366,7 +23368,7 @@ static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl, return DAG.getBuildVector(VT, dl, Elts); } - // If the target doesn't support variable shifts, use either FP conversion + // If the target doesn't support variable shifts, use either FP conversion // or integer multiplication to avoid shifting each element individually. if (VT == MVT::v4i32) { Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT)); @@ -23509,6 +23511,24 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget, if (SDValue Scale = convertShiftLeftToScale(Amt, dl, Subtarget, DAG)) return DAG.getNode(ISD::MUL, dl, VT, R, Scale); + // Constant ISD::SRL can be performed efficiently on vXi8/vXi16 vectors as we + // can replace with ISD::MULHU, creating scale factor from (NumEltBits - Amt). + // TODO: Improve support for the shift by zero special case. + if (Op.getOpcode() == ISD::SRL && ConstantAmt && + ((Subtarget.hasSSE41() && VT == MVT::v8i16) || + DAG.isKnownNeverZero(Amt)) && + (VT == MVT::v16i8 || VT == MVT::v8i16 || + ((VT == MVT::v32i8 || VT == MVT::v16i16) && Subtarget.hasInt256()))) { + SDValue EltBits = DAG.getConstant(VT.getScalarSizeInBits(), dl, VT); + SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt); + if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) { + SDValue Zero = DAG.getConstant(0, dl, VT); + SDValue ZAmt = DAG.getSetCC(dl, VT, Amt, Zero, ISD::SETEQ); + SDValue Res = DAG.getNode(ISD::MULHU, dl, VT, R, Scale); + return DAG.getSelect(dl, VT, ZAmt, R, Res); + } + } + // v4i32 Non Uniform Shifts. // If the shift amount is constant we can shift each lane using the SSE2 // immediate shifts, else we need to zero-extend each lane to the lower i64 @@ -33425,33 +33445,32 @@ static SDValue combineCMov(SDNode *N, SelectionDAG &DAG, } } - // Handle (CMOV C-1, (ADD (CTTZ X), C), (X != 0)) -> - // (ADD (CMOV (CTTZ X), -1, (X != 0)), C) or - // (CMOV (ADD (CTTZ X), C), C-1, (X == 0)) -> - // (ADD (CMOV C-1, (CTTZ X), (X == 0)), C) - if (CC == X86::COND_NE || CC == X86::COND_E) { - auto *Cnst = CC == X86::COND_E ? dyn_cast<ConstantSDNode>(TrueOp) - : dyn_cast<ConstantSDNode>(FalseOp); - SDValue Add = CC == X86::COND_E ? 
FalseOp : TrueOp; - - if (Cnst && Add.getOpcode() == ISD::ADD && Add.hasOneUse()) { - auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1)); - SDValue AddOp2 = Add.getOperand(0); - if (AddOp1 && (AddOp2.getOpcode() == ISD::CTTZ_ZERO_UNDEF || - AddOp2.getOpcode() == ISD::CTTZ)) { - APInt Diff = Cnst->getAPIntValue() - AddOp1->getAPIntValue(); - if (CC == X86::COND_E) { - Add = DAG.getNode(X86ISD::CMOV, DL, Add.getValueType(), AddOp2, - DAG.getConstant(Diff, DL, Add.getValueType()), - DAG.getConstant(CC, DL, MVT::i8), Cond); - } else { - Add = DAG.getNode(X86ISD::CMOV, DL, Add.getValueType(), - DAG.getConstant(Diff, DL, Add.getValueType()), - AddOp2, DAG.getConstant(CC, DL, MVT::i8), Cond); - } - return DAG.getNode(X86ISD::ADD, DL, Add.getValueType(), Add, - SDValue(AddOp1, 0)); - } + // Fold (CMOV C1, (ADD (CTTZ X), C2), (X != 0)) -> + // (ADD (CMOV C1-C2, (CTTZ X), (X != 0)), C2) + // Or (CMOV (ADD (CTTZ X), C2), C1, (X == 0)) -> + // (ADD (CMOV (CTTZ X), C1-C2, (X == 0)), C2) + if ((CC == X86::COND_NE || CC == X86::COND_E) && + Cond.getOpcode() == X86ISD::CMP && isNullConstant(Cond.getOperand(1))) { + SDValue Add = TrueOp; + SDValue Const = FalseOp; + // Canonicalize the condition code for easier matching and output. + if (CC == X86::COND_E) { + std::swap(Add, Const); + CC = X86::COND_NE; + } + + // Ok, now make sure that Add is (add (cttz X), C2) and Const is a constant. + if (isa<ConstantSDNode>(Const) && Add.getOpcode() == ISD::ADD && + Add.hasOneUse() && isa<ConstantSDNode>(Add.getOperand(1)) && + (Add.getOperand(0).getOpcode() == ISD::CTTZ_ZERO_UNDEF || + Add.getOperand(0).getOpcode() == ISD::CTTZ) && + Add.getOperand(0).getOperand(0) == Cond.getOperand(0)) { + EVT VT = N->getValueType(0); + // This should constant fold. + SDValue Diff = DAG.getNode(ISD::SUB, DL, VT, Const, Add.getOperand(1)); + SDValue CMov = DAG.getNode(X86ISD::CMOV, DL, VT, Diff, Add.getOperand(0), + DAG.getConstant(CC, DL, MVT::i8), Cond); + return DAG.getNode(ISD::ADD, DL, VT, CMov, Add.getOperand(1)); } } @@ -33873,31 +33892,42 @@ static SDValue combineMul(SDNode *N, SelectionDAG &DAG, ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1)); if (!C) return SDValue(); - uint64_t MulAmt = C->getZExtValue(); - if (isPowerOf2_64(MulAmt)) + if (isPowerOf2_64(C->getZExtValue())) return SDValue(); + int64_t SignMulAmt = C->getSExtValue(); + assert(SignMulAmt != INT64_MIN && "Int min should have been handled!"); + uint64_t AbsMulAmt = SignMulAmt < 0 ? -SignMulAmt : SignMulAmt; + SDLoc DL(N); - if (MulAmt == 3 || MulAmt == 5 || MulAmt == 9) - return DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0), - N->getOperand(1)); + if (AbsMulAmt == 3 || AbsMulAmt == 5 || AbsMulAmt == 9) { + SDValue NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0), + DAG.getConstant(AbsMulAmt, DL, VT)); + if (SignMulAmt < 0) + NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), + NewMul); + + return NewMul; + } uint64_t MulAmt1 = 0; uint64_t MulAmt2 = 0; - if ((MulAmt % 9) == 0) { + if ((AbsMulAmt % 9) == 0) { MulAmt1 = 9; - MulAmt2 = MulAmt / 9; - } else if ((MulAmt % 5) == 0) { + MulAmt2 = AbsMulAmt / 9; + } else if ((AbsMulAmt % 5) == 0) { MulAmt1 = 5; - MulAmt2 = MulAmt / 5; - } else if ((MulAmt % 3) == 0) { + MulAmt2 = AbsMulAmt / 5; + } else if ((AbsMulAmt % 3) == 0) { MulAmt1 = 3; - MulAmt2 = MulAmt / 3; + MulAmt2 = AbsMulAmt / 3; } SDValue NewMul; + // For negative multiply amounts, only allow MulAmt2 to be a power of 2. 
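
The factor search this comment introduces (completed by the condition just below) decomposes the absolute multiplier into a 3/5/9 LEA factor times a second factor, restricting negative amounts to a power-of-two second factor because the negation is materialized as a separate SUB. A standalone sketch, ignoring the single-LEA and overflow fast paths handled elsewhere (decompose is an invented name):

    #include <cassert>
    #include <cstdint>

    // C = +/-(M1 * M2), M1 in {9,5,3} (one LEA); M2 a power of two (shift) or,
    // for positive C only, another LEA factor.
    bool decompose(int64_t C, uint64_t &M1, uint64_t &M2, bool &Neg) {
      Neg = C < 0;
      uint64_t A = Neg ? -(uint64_t)C : (uint64_t)C;
      for (uint64_t F : {9, 5, 3})
        if (A % F == 0) {
          M1 = F;
          M2 = A / F;
          bool Pow2 = M2 && !(M2 & (M2 - 1));
          return Pow2 || (!Neg && (M2 == 3 || M2 == 5 || M2 == 9));
        }
      return false;
    }

    int main() {
      uint64_t M1, M2; bool Neg;
      assert(decompose(45, M1, M2, Neg) && M1 == 9 && M2 == 5 && !Neg);
      assert(decompose(-40, M1, M2, Neg) && M1 == 5 && M2 == 8 && Neg);
      assert(!decompose(-45, M1, M2, Neg)); // -9*5 would need two LEAs + SUB
    }
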
if (MulAmt2 && - (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){ + (isPowerOf2_64(MulAmt2) || + (SignMulAmt >= 0 && (MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)))) { if (isPowerOf2_64(MulAmt2) && !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD)) @@ -33919,17 +33949,19 @@ static SDValue combineMul(SDNode *N, SelectionDAG &DAG, else NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul, DAG.getConstant(MulAmt2, DL, VT)); + + // Negate the result. + if (SignMulAmt < 0) + NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), + NewMul); } else if (!Subtarget.slowLEA()) - NewMul = combineMulSpecial(MulAmt, N, DAG, VT, DL); + NewMul = combineMulSpecial(C->getZExtValue(), N, DAG, VT, DL); if (!NewMul) { - assert(MulAmt != 0 && - MulAmt != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) && + assert(C->getZExtValue() != 0 && + C->getZExtValue() != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) && "Both cases that could cause potential overflows should have " "already been handled."); - int64_t SignMulAmt = C->getSExtValue(); - assert(SignMulAmt != INT64_MIN && "Int min should have been handled!"); - uint64_t AbsMulAmt = SignMulAmt < 0 ? -SignMulAmt : SignMulAmt; if (isPowerOf2_64(AbsMulAmt - 1)) { // (mul x, 2^N + 1) => (add (shl x, N), x) NewMul = DAG.getNode( @@ -36738,6 +36770,145 @@ static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL, return DAG.getNode(Opc, DL, VT, LHS, RHS); } +// Attempt to match PMADDUBSW, which multiplies corresponding unsigned bytes +// from one vector with signed bytes from another vector, adds together +// adjacent pairs of 16-bit products, and saturates the result before +// truncating to 16-bits. +// +// Which looks something like this: +// (i16 (ssat (add (mul (zext (even elts (i8 A))), (sext (even elts (i8 B)))), +// (mul (zext (odd elts (i8 A)), (sext (odd elts (i8 B)))))))) +static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG, + const X86Subtarget &Subtarget, + const SDLoc &DL) { + if (!VT.isVector() || !Subtarget.hasSSSE3()) + return SDValue(); + + unsigned NumElems = VT.getVectorNumElements(); + EVT ScalarVT = VT.getVectorElementType(); + if (ScalarVT != MVT::i16 || NumElems < 8 || !isPowerOf2_32(NumElems)) + return SDValue(); + + SDValue SSatVal = detectSSatPattern(In, VT); + if (!SSatVal || SSatVal.getOpcode() != ISD::ADD) + return SDValue(); + + // Ok this is a signed saturation of an ADD. See if this ADD is adding pairs + // of multiplies from even/odd elements. + SDValue N0 = SSatVal.getOperand(0); + SDValue N1 = SSatVal.getOperand(1); + + if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL) + return SDValue(); + + SDValue N00 = N0.getOperand(0); + SDValue N01 = N0.getOperand(1); + SDValue N10 = N1.getOperand(0); + SDValue N11 = N1.getOperand(1); + + // TODO: Handle constant vectors and use knownbits/computenumsignbits? + // Canonicalize zero_extend to LHS. + if (N01.getOpcode() == ISD::ZERO_EXTEND) + std::swap(N00, N01); + if (N11.getOpcode() == ISD::ZERO_EXTEND) + std::swap(N10, N11); + + // Ensure we have a zero_extend and a sign_extend. + if (N00.getOpcode() != ISD::ZERO_EXTEND || + N01.getOpcode() != ISD::SIGN_EXTEND || + N10.getOpcode() != ISD::ZERO_EXTEND || + N11.getOpcode() != ISD::SIGN_EXTEND) + return SDValue(); + + // Peek through the extends. + N00 = N00.getOperand(0); + N01 = N01.getOperand(0); + N10 = N10.getOperand(0); + N11 = N11.getOperand(0); + + // Ensure the extend is from vXi8. 
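
For orientation while reading the remaining structural checks in this matcher: the instruction being formed computes, for each result lane, u8*i8 products of adjacent byte pairs summed with signed saturation to i16. A scalar reference implementation (assumes C++17 for std::clamp; this is the semantics, not the DAG code):

    #include <algorithm>
    #include <cassert>
    #include <cstdint>
    #include <vector>

    int16_t ssat16(int32_t V) { return (int16_t)std::clamp(V, -32768, 32767); }

    // PMADDUBSW: unsigned bytes of A times signed bytes of B, adjacent pairs
    // summed and signed-saturated to i16.
    std::vector<int16_t> pmaddubsw(const std::vector<uint8_t> &A,
                                   const std::vector<int8_t> &B) {
      std::vector<int16_t> R(A.size() / 2);
      for (size_t i = 0; i < R.size(); ++i)
        R[i] = ssat16((int32_t)A[2 * i] * B[2 * i] +
                      (int32_t)A[2 * i + 1] * B[2 * i + 1]);
      return R;
    }

    int main() {
      assert(pmaddubsw({255, 255}, {127, 127})[0] == 32767); // 64770 saturates
      assert(pmaddubsw({2, 3}, {-1, 4})[0] == 10);           // 2*-1 + 3*4
    }
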
+ if (N00.getValueType().getVectorElementType() != MVT::i8 || + N01.getValueType().getVectorElementType() != MVT::i8 || + N10.getValueType().getVectorElementType() != MVT::i8 || + N11.getValueType().getVectorElementType() != MVT::i8) + return SDValue(); + + // All inputs should be build_vectors. + if (N00.getOpcode() != ISD::BUILD_VECTOR || + N01.getOpcode() != ISD::BUILD_VECTOR || + N10.getOpcode() != ISD::BUILD_VECTOR || + N11.getOpcode() != ISD::BUILD_VECTOR) + return SDValue(); + + // N00/N10 are zero extended. N01/N11 are sign extended. + + // For each element, we need to ensure we have an odd element from one vector + // multiplied by the odd element of another vector and the even element from + // one of the same vectors being multiplied by the even element from the + // other vector. So we need to make sure for each element i, this operator + // is being performed: + // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1] + SDValue ZExtIn, SExtIn; + for (unsigned i = 0; i != NumElems; ++i) { + SDValue N00Elt = N00.getOperand(i); + SDValue N01Elt = N01.getOperand(i); + SDValue N10Elt = N10.getOperand(i); + SDValue N11Elt = N11.getOperand(i); + // TODO: Be more tolerant to undefs. + if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT || + N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT || + N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT || + N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT) + return SDValue(); + auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1)); + auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1)); + auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1)); + auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1)); + if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt) + return SDValue(); + unsigned IdxN00 = ConstN00Elt->getZExtValue(); + unsigned IdxN01 = ConstN01Elt->getZExtValue(); + unsigned IdxN10 = ConstN10Elt->getZExtValue(); + unsigned IdxN11 = ConstN11Elt->getZExtValue(); + // Add is commutative so indices can be reordered. + if (IdxN00 > IdxN10) { + std::swap(IdxN00, IdxN10); + std::swap(IdxN01, IdxN11); + } + // N0 indices be the even element. N1 indices must be the next odd element. + if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 || + IdxN01 != 2 * i || IdxN11 != 2 * i + 1) + return SDValue(); + SDValue N00In = N00Elt.getOperand(0); + SDValue N01In = N01Elt.getOperand(0); + SDValue N10In = N10Elt.getOperand(0); + SDValue N11In = N11Elt.getOperand(0); + // First time we find an input capture it. + if (!ZExtIn) { + ZExtIn = N00In; + SExtIn = N01In; + } + if (ZExtIn != N00In || SExtIn != N01In || + ZExtIn != N10In || SExtIn != N11In) + return SDValue(); + } + + auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL, + ArrayRef<SDValue> Ops) { + // Shrink by adding truncate nodes and let DAGCombine fold with the + // sources. 
+ EVT InVT = Ops[0].getValueType(); + assert(InVT.getScalarType() == MVT::i8 && + "Unexpected scalar element type"); + assert(InVT == Ops[1].getValueType() && "Operands' types mismatch"); + EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, + InVT.getVectorNumElements() / 2); + return DAG.getNode(X86ISD::VPMADDUBSW, DL, ResVT, Ops[0], Ops[1]); + }; + return SplitOpsAndApply(DAG, Subtarget, DL, VT, { ZExtIn, SExtIn }, + PMADDBuilder); +} + static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { EVT VT = N->getValueType(0); @@ -36752,6 +36923,10 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG, if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL)) return Avg; + // Try to detect PMADD + if (SDValue PMAdd = detectPMADDUBSW(Src, VT, DAG, Subtarget, DL)) + return PMAdd; + // Try to combine truncation with signed/unsigned saturation. if (SDValue Val = combineTruncateWithSat(Src, VT, DL, DAG, Subtarget)) return Val; @@ -36793,38 +36968,14 @@ static SDValue isFNEG(SDNode *N) { if (!Op1.getValueType().isFloatingPoint()) return SDValue(); - SDValue Op0 = peekThroughBitcasts(Op.getOperand(0)); - - unsigned EltBits = Op1.getScalarValueSizeInBits(); - auto isSignMask = [&](const ConstantFP *C) { - return C->getValueAPF().bitcastToAPInt() == APInt::getSignMask(EltBits); - }; - - // There is more than one way to represent the same constant on - // the different X86 targets. The type of the node may also depend on size. - // - load scalar value and broadcast - // - BUILD_VECTOR node - // - load from a constant pool. - // We check all variants here. - if (Op1.getOpcode() == X86ISD::VBROADCAST) { - if (auto *C = getTargetConstantFromNode(Op1.getOperand(0))) - if (isSignMask(cast<ConstantFP>(C))) - return Op0; - - } else if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1)) { - if (ConstantFPSDNode *CN = BV->getConstantFPSplatNode()) - if (isSignMask(CN->getConstantFPValue())) - return Op0; + // Extract constant bits and see if they are all sign bit masks. + APInt UndefElts; + SmallVector<APInt, 16> EltBits; + if (getTargetConstantBitsFromNode(Op1, Op1.getScalarValueSizeInBits(), + UndefElts, EltBits, false, false)) + if (llvm::all_of(EltBits, [](APInt &I) { return I.isSignMask(); })) + return peekThroughBitcasts(Op.getOperand(0)); - } else if (auto *C = getTargetConstantFromNode(Op1)) { - if (C->getType()->isVectorTy()) { - if (auto *SplatV = C->getSplatValue()) - if (isSignMask(cast<ConstantFP>(SplatV))) - return Op0; - } else if (auto *FPConst = dyn_cast<ConstantFP>(C)) - if (isSignMask(FPConst)) - return Op0; - } return SDValue(); } @@ -37777,8 +37928,7 @@ static SDValue combineFMA(SDNode *N, SelectionDAG &DAG, // Look through extract_vector_elts. If it comes from an FNEG, create a // new extract from the FNEG input. if (V.getOpcode() == ISD::EXTRACT_VECTOR_ELT && - isa<ConstantSDNode>(V.getOperand(1)) && - cast<ConstantSDNode>(V.getOperand(1))->getZExtValue() == 0) { + isNullConstant(V.getOperand(1))) { if (SDValue NegVal = isFNEG(V.getOperand(0).getNode())) { NegVal = DAG.getBitcast(V.getOperand(0).getValueType(), NegVal); V = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(V), V.getValueType(), @@ -38896,7 +39046,7 @@ static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1, std::swap(IdxN00, IdxN10); std::swap(IdxN01, IdxN11); } - // N0 indices be the even elemtn. N1 indices must be the next odd element. + // N0 indices be the even element. N1 indices must be the next odd element. 
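
That even/odd index discipline, the same one enforced in detectPMADDUBSW above, can be tested on its own; lanesPairUp is a hypothetical extraction of the check:

    #include <cassert>
    #include <utility>

    // For result element i, the first mul must read lane 2*i and the second
    // lane 2*i+1, after canonicalizing the commutative add.
    bool lanesPairUp(unsigned i, unsigned IdxN00, unsigned IdxN10,
                     unsigned IdxN01, unsigned IdxN11) {
      if (IdxN00 > IdxN10) {
        std::swap(IdxN00, IdxN10);
        std::swap(IdxN01, IdxN11);
      }
      return IdxN00 == 2 * i && IdxN10 == 2 * i + 1 &&
             IdxN01 == 2 * i && IdxN11 == 2 * i + 1;
    }

    int main() {
      assert(lanesPairUp(3, 6, 7, 6, 7));
      assert(lanesPairUp(3, 7, 6, 7, 6)); // add is commutative
      assert(!lanesPairUp(3, 6, 8, 6, 8));
    }
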
if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 || IdxN01 != 2 * i || IdxN11 != 2 * i + 1) return SDValue(); @@ -39322,8 +39472,7 @@ static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG, if ((IdxVal == OpVT.getVectorNumElements() / 2) && Vec.getOpcode() == ISD::INSERT_SUBVECTOR && OpVT.getSizeInBits() == SubVecVT.getSizeInBits() * 2) { - auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2)); - if (Idx2 && Idx2->getZExtValue() == 0) { + if (isNullConstant(Vec.getOperand(2))) { SDValue SubVec2 = Vec.getOperand(1); // If needed, look through bitcasts to get to the load. if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(SubVec2))) { diff --git a/contrib/llvm/lib/Target/X86/X86ISelLowering.h b/contrib/llvm/lib/Target/X86/X86ISelLowering.h index 32215b170a8c..ff5006d208e5 100644 --- a/contrib/llvm/lib/Target/X86/X86ISelLowering.h +++ b/contrib/llvm/lib/Target/X86/X86ISelLowering.h @@ -1097,10 +1097,11 @@ namespace llvm { /// Customize the preferred legalization strategy for certain types. LegalizeTypeAction getPreferredVectorAction(EVT VT) const override; - MVT getRegisterTypeForCallingConv(LLVMContext &Context, + MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override; unsigned getNumRegistersForCallingConv(LLVMContext &Context, + CallingConv::ID CC, EVT VT) const override; bool isIntDivCheap(EVT VT, AttributeList Attr) const override; @@ -1125,8 +1126,8 @@ namespace llvm { bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI, unsigned Factor) const override; - SDValue expandIndirectJTBranch(const SDLoc& dl, SDValue Value, - SDValue Addr, SelectionDAG &DAG) + SDValue expandIndirectJTBranch(const SDLoc& dl, SDValue Value, + SDValue Addr, SelectionDAG &DAG) const override; protected: diff --git a/contrib/llvm/lib/Target/X86/X86InstrFoldTables.cpp b/contrib/llvm/lib/Target/X86/X86InstrFoldTables.cpp index 5d8400595bfa..7d31cfab4137 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrFoldTables.cpp +++ b/contrib/llvm/lib/Target/X86/X86InstrFoldTables.cpp @@ -1576,7 +1576,7 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = { { X86::SUBSDrr_Int, X86::SUBSDrm_Int, TB_NO_REVERSE }, { X86::SUBSSrr, X86::SUBSSrm, 0 }, { X86::SUBSSrr_Int, X86::SUBSSrm_Int, TB_NO_REVERSE }, - // FIXME: TEST*rr -> swapped operand of TEST *mr. + // FIXME: TEST*rr -> swapped operand of TEST *mr. { X86::UNPCKHPDrr, X86::UNPCKHPDrm, TB_ALIGN_16 }, { X86::UNPCKHPSrr, X86::UNPCKHPSrm, TB_ALIGN_16 }, { X86::UNPCKLPDrr, X86::UNPCKLPDrm, TB_ALIGN_16 }, diff --git a/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp b/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp index 1b61accfb42b..96db8b4e7585 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -7725,7 +7725,7 @@ X86InstrInfo::insertOutlinedCall(Module &M, MachineBasicBlock &MBB, if (C.CallConstructionID == MachineOutlinerTailCall) { // Yes, just insert a JMP. It = MBB.insert(It, - BuildMI(MF, DebugLoc(), get(X86::JMP_1)) + BuildMI(MF, DebugLoc(), get(X86::TAILJMPd64)) .addGlobalAddress(M.getNamedValue(MF.getName()))); } else { // No, insert a call. diff --git a/contrib/llvm/lib/Target/X86/X86InstrInfo.td b/contrib/llvm/lib/Target/X86/X86InstrInfo.td index 7509b312c100..bc7afd32d494 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrInfo.td +++ b/contrib/llvm/lib/Target/X86/X86InstrInfo.td @@ -1750,7 +1750,7 @@ def LAHF : I<0x9F, RawFrm, (outs), (ins), "lahf", []>, // AH = flags // Bit tests instructions: BT, BTS, BTR, BTC. 
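
For context on the scheduling reclassification below: all four bit-test instructions compute the same bit select into CF and differ only in how (or whether) the destination is rewritten, which is why they can share one WriteBitTest class. Register-form semantics, sketched (the real instructions also take memory operands):

    #include <cassert>
    #include <cstdint>

    struct BTResult { uint64_t Dst; bool CF; };

    // BT only reads; BTS/BTR/BTC additionally set/clear/toggle the bit.
    BTResult bt (uint64_t V, unsigned I) { return {V, ((V >> (I & 63)) & 1) != 0}; }
    BTResult bts(uint64_t V, unsigned I) { auto R = bt(V, I); R.Dst |=  (1ULL << (I & 63)); return R; }
    BTResult btr(uint64_t V, unsigned I) { auto R = bt(V, I); R.Dst &= ~(1ULL << (I & 63)); return R; }
    BTResult btc(uint64_t V, unsigned I) { auto R = bt(V, I); R.Dst ^=  (1ULL << (I & 63)); return R; }

    int main() {
      assert(bts(0, 5).Dst == 32 && !bts(0, 5).CF);
      assert(btr(32, 5).Dst == 0 && btr(32, 5).CF);
      assert(btc(32, 5).Dst == 0 && btc(32, 5).CF);
    }
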
let Defs = [EFLAGS] in { -let SchedRW = [WriteALU] in { +let SchedRW = [WriteBitTest] in { def BT16rr : I<0xA3, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2), "bt{w}\t{$src2, $src1|$src1, $src2}", [(set EFLAGS, (X86bt GR16:$src1, GR16:$src2))]>, @@ -1783,7 +1783,7 @@ let mayLoad = 1, hasSideEffects = 0, SchedRW = [WriteALULd] in { []>, TB, NotMemoryFoldable; } -let SchedRW = [WriteALU] in { +let SchedRW = [WriteBitTest] in { def BT16ri8 : Ii8<0xBA, MRM4r, (outs), (ins GR16:$src1, i16i8imm:$src2), "bt{w}\t{$src2, $src1|$src1, $src2}", [(set EFLAGS, (X86bt GR16:$src1, i16immSExt8:$src2))]>, @@ -1818,7 +1818,7 @@ def BT64mi8 : RIi8<0xBA, MRM4m, (outs), (ins i64mem:$src1, i64i8imm:$src2), } // SchedRW let hasSideEffects = 0 in { -let SchedRW = [WriteALU], Constraints = "$src1 = $dst" in { +let SchedRW = [WriteBitTest], Constraints = "$src1 = $dst" in { def BTC16rr : I<0xBB, MRMDestReg, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2), "btc{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize16, TB, NotMemoryFoldable; @@ -1842,7 +1842,7 @@ def BTC64mr : RI<0xBB, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2), NotMemoryFoldable; } -let SchedRW = [WriteALU], Constraints = "$src1 = $dst" in { +let SchedRW = [WriteBitTest], Constraints = "$src1 = $dst" in { def BTC16ri8 : Ii8<0xBA, MRM7r, (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2), "btc{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize16, TB; def BTC32ri8 : Ii8<0xBA, MRM7r, (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2), @@ -1861,7 +1861,7 @@ def BTC64mi8 : RIi8<0xBA, MRM7m, (outs), (ins i64mem:$src1, i64i8imm:$src2), Requires<[In64BitMode]>; } -let SchedRW = [WriteALU], Constraints = "$src1 = $dst" in { +let SchedRW = [WriteBitTest], Constraints = "$src1 = $dst" in { def BTR16rr : I<0xB3, MRMDestReg, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2), "btr{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize16, TB, NotMemoryFoldable; @@ -1885,7 +1885,7 @@ def BTR64mr : RI<0xB3, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2), NotMemoryFoldable; } -let SchedRW = [WriteALU], Constraints = "$src1 = $dst" in { +let SchedRW = [WriteBitTest], Constraints = "$src1 = $dst" in { def BTR16ri8 : Ii8<0xBA, MRM6r, (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2), "btr{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize16, TB; @@ -1908,7 +1908,7 @@ def BTR64mi8 : RIi8<0xBA, MRM6m, (outs), (ins i64mem:$src1, i64i8imm:$src2), Requires<[In64BitMode]>; } -let SchedRW = [WriteALU], Constraints = "$src1 = $dst" in { +let SchedRW = [WriteBitTest], Constraints = "$src1 = $dst" in { def BTS16rr : I<0xAB, MRMDestReg, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2), "bts{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize16, TB, NotMemoryFoldable; @@ -1932,7 +1932,7 @@ def BTS64mr : RI<0xAB, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2), NotMemoryFoldable; } -let SchedRW = [WriteALU], Constraints = "$src1 = $dst" in { +let SchedRW = [WriteBitTest], Constraints = "$src1 = $dst" in { def BTS16ri8 : Ii8<0xBA, MRM5r, (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2), "bts{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize16, TB; def BTS32ri8 : Ii8<0xBA, MRM5r, (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2), diff --git a/contrib/llvm/lib/Target/X86/X86InstrShiftRotate.td b/contrib/llvm/lib/Target/X86/X86InstrShiftRotate.td index ee3b01159174..023137634df1 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrShiftRotate.td +++ b/contrib/llvm/lib/Target/X86/X86InstrShiftRotate.td @@ -650,9 +650,9 @@ def ROR64m1 : RI<0xD1, MRM1m, (outs), (ins i64mem:$dst), // Double shift instructions 
(generalizations of rotate) //===----------------------------------------------------------------------===// -let Constraints = "$src1 = $dst", SchedRW = [WriteShiftDouble] in { +let Constraints = "$src1 = $dst" in { -let Uses = [CL] in { +let Uses = [CL], SchedRW = [WriteSHDrrcl] in { def SHLD16rrCL : I<0xA5, MRMDestReg, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2), "shld{w}\t{%cl, $src2, $dst|$dst, $src2, cl}", @@ -683,9 +683,9 @@ def SHRD64rrCL : RI<0xAD, MRMDestReg, (outs GR64:$dst), "shrd{q}\t{%cl, $src2, $dst|$dst, $src2, cl}", [(set GR64:$dst, (X86shrd GR64:$src1, GR64:$src2, CL))]>, TB; -} +} // SchedRW -let isCommutable = 1 in { // These instructions commute to each other. +let isCommutable = 1, SchedRW = [WriteSHDrri] in { // These instructions commute to each other. def SHLD16rri8 : Ii8<0xA4, MRMDestReg, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2, u8imm:$src3), @@ -728,11 +728,10 @@ def SHRD64rri8 : RIi8<0xAC, MRMDestReg, [(set GR64:$dst, (X86shrd GR64:$src1, GR64:$src2, (i8 imm:$src3)))]>, TB; -} -} // Constraints = "$src = $dst", SchedRW +} // SchedRW +} // Constraints = "$src = $dst" -let SchedRW = [WriteShiftDoubleLd, WriteRMW] in { -let Uses = [CL] in { +let Uses = [CL], SchedRW = [WriteSHDmrcl] in { def SHLD16mrCL : I<0xA5, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2), "shld{w}\t{%cl, $src2, $dst|$dst, $src2, cl}", [(store (X86shld (loadi16 addr:$dst), GR16:$src2, CL), @@ -759,8 +758,9 @@ def SHRD64mrCL : RI<0xAD, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2), "shrd{q}\t{%cl, $src2, $dst|$dst, $src2, cl}", [(store (X86shrd (loadi64 addr:$dst), GR64:$src2, CL), addr:$dst)]>, TB; -} +} // SchedRW +let SchedRW = [WriteSHDmri] in { def SHLD16mri8 : Ii8<0xA4, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2, u8imm:$src3), "shld{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}", diff --git a/contrib/llvm/lib/Target/X86/X86SchedBroadwell.td b/contrib/llvm/lib/Target/X86/X86SchedBroadwell.td index c7713fea70fa..6334d9e89a60 100755 --- a/contrib/llvm/lib/Target/X86/X86SchedBroadwell.td +++ b/contrib/llvm/lib/Target/X86/X86SchedBroadwell.td @@ -119,8 +119,8 @@ defm : BWWriteResPair<WriteIDiv16, [BWPort0, BWDivider], 25, [1, 10]>; defm : BWWriteResPair<WriteIDiv32, [BWPort0, BWDivider], 25, [1, 10]>; defm : BWWriteResPair<WriteIDiv64, [BWPort0, BWDivider], 25, [1, 10]>; -defm : BWWriteResPair<WriteBSWAP32,[BWPort15], 1>; // -defm : BWWriteResPair<WriteBSWAP64,[BWPort06, BWPort15], 2, [1, 1], 2>; // +defm : X86WriteRes<WriteBSWAP32, [BWPort15], 1, [1], 1>; +defm : X86WriteRes<WriteBSWAP64, [BWPort06, BWPort15], 2, [1, 1], 2>; defm : BWWriteResPair<WriteCRC32, [BWPort1], 3>; def : WriteRes<WriteIMulH, []> { let Latency = 3; } // Integer multiplication, high part. @@ -137,6 +137,7 @@ def : WriteRes<WriteSETCCStore, [BWPort06,BWPort4,BWPort237]> { let NumMicroOps = 3; } def : WriteRes<WriteLAHFSAHF, [BWPort06]>; +def : WriteRes<WriteBitTest,[BWPort06]>; // Bit Test instrs // Bit counts. defm : BWWriteResPair<WriteBSF, [BWPort1], 3>; @@ -148,8 +149,11 @@ defm : BWWriteResPair<WritePOPCNT, [BWPort1], 3>; // Integer shifts and rotates. defm : BWWriteResPair<WriteShift, [BWPort06], 1>; -// Double shift instructions. -defm : BWWriteResPair<WriteShiftDouble, [BWPort06], 1>; +// SHLD/SHRD. 
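
As a reminder of what the new WriteSHD classes cover: a double shift funnels bits from a second register into the destination. The 32-bit register forms behave as below (a sketch; the zero-count guard avoids the undefined Src >> 32, and the 16-bit and memory forms differ in detail):

    #include <cassert>
    #include <cstdint>

    uint32_t shld(uint32_t Dst, uint32_t Src, unsigned Cnt) {
      Cnt &= 31; // hardware masks the count
      return Cnt ? (Dst << Cnt) | (Src >> (32 - Cnt)) : Dst;
    }

    uint32_t shrd(uint32_t Dst, uint32_t Src, unsigned Cnt) {
      Cnt &= 31;
      return Cnt ? (Dst >> Cnt) | (Src << (32 - Cnt)) : Dst;
    }

    int main() {
      assert(shld(0x80000000u, 0xFFFFFFFFu, 4) == 0x0000000Fu);
      assert(shrd(0x00000001u, 0xFFFFFFFFu, 4) == 0xF0000000u);
    }
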
+defm : X86WriteRes<WriteSHDrri, [BWPort1], 3, [1], 1>; +defm : X86WriteRes<WriteSHDrrcl,[BWPort1,BWPort06,BWPort0156], 6, [1, 1, 2], 4>; +defm : X86WriteRes<WriteSHDmri, [BWPort1,BWPort23,BWPort237,BWPort0156], 9, [1, 1, 1, 1], 4>; +defm : X86WriteRes<WriteSHDmrcl,[BWPort1,BWPort23,BWPort237,BWPort06,BWPort0156], 11, [1, 1, 1, 1, 2], 6>; // BMI1 BEXTR, BMI2 BZHI defm : BWWriteResPair<WriteBEXTR, [BWPort06,BWPort15], 2, [1,1], 2>; @@ -600,14 +604,6 @@ def BWWriteResGroup6 : SchedWriteRes<[BWPort06]> { let ResourceCycles = [1]; } def: InstRW<[BWWriteResGroup6], (instrs CDQ, CQO)>; -def: InstRW<[BWWriteResGroup6], (instregex "BT(16|32|64)ri8", - "BT(16|32|64)rr", - "BTC(16|32|64)ri8", - "BTC(16|32|64)rr", - "BTR(16|32|64)ri8", - "BTR(16|32|64)rr", - "BTS(16|32|64)ri8", - "BTS(16|32|64)rr")>; def BWWriteResGroup7 : SchedWriteRes<[BWPort15]> { let Latency = 1; @@ -746,8 +742,6 @@ def BWWriteResGroup27 : SchedWriteRes<[BWPort1]> { def: InstRW<[BWWriteResGroup27], (instregex "MMX_CVTPI2PSirr", "PDEP(32|64)rr", "PEXT(32|64)rr", - "SHLD(16|32|64)rri8", - "SHRD(16|32|64)rri8", "(V?)CVTDQ2PS(Y?)rr")>; def BWWriteResGroup27_16 : SchedWriteRes<[BWPort1, BWPort0156]> { @@ -1055,14 +1049,6 @@ def BWWriteResGroup66 : SchedWriteRes<[BWPort23,BWPort0156]> { def: InstRW<[BWWriteResGroup66], (instrs POP16r, POP32r, POP64r)>; def: InstRW<[BWWriteResGroup66], (instregex "POP(16|32|64)rmr")>; -def BWWriteResGroup67 : SchedWriteRes<[BWPort1,BWPort06,BWPort0156]> { - let Latency = 6; - let NumMicroOps = 4; - let ResourceCycles = [1,1,2]; -} -def: InstRW<[BWWriteResGroup67], (instregex "SHLD(16|32|64)rrCL", - "SHRD(16|32|64)rrCL")>; - def BWWriteResGroup68 : SchedWriteRes<[BWPort1,BWPort6,BWPort06,BWPort0156]> { let Latency = 6; let NumMicroOps = 4; @@ -1307,14 +1293,6 @@ def BWWriteResGroup108 : SchedWriteRes<[BWPort5,BWPort23,BWPort015]> { def: InstRW<[BWWriteResGroup108], (instregex "VPBROADCASTB(Y?)rm", "VPBROADCASTW(Y?)rm")>; -def BWWriteResGroup111 : SchedWriteRes<[BWPort1,BWPort23,BWPort237,BWPort0156]> { - let Latency = 9; - let NumMicroOps = 4; - let ResourceCycles = [1,1,1,1]; -} -def: InstRW<[BWWriteResGroup111], (instregex "SHLD(16|32|64)mri8", - "SHRD(16|32|64)mri8")>; - def BWWriteResGroup112 : SchedWriteRes<[BWPort23,BWPort06,BWPort0156]> { let Latency = 9; let NumMicroOps = 5; @@ -1380,14 +1358,6 @@ def BWWriteResGroup128 : SchedWriteRes<[BWPort1,BWPort5,BWPort23]> { } def: InstRW<[BWWriteResGroup128], (instregex "VCVTDQ2PDYrm")>; -def BWWriteResGroup130 : SchedWriteRes<[BWPort1,BWPort23,BWPort237,BWPort06,BWPort0156]> { - let Latency = 11; - let NumMicroOps = 6; - let ResourceCycles = [1,1,1,1,2]; -} -def: InstRW<[BWWriteResGroup130], (instregex "SHLD(16|32|64)mrCL", - "SHRD(16|32|64)mrCL")>; - def BWWriteResGroup131 : SchedWriteRes<[BWPort1,BWPort06,BWPort0156]> { let Latency = 11; let NumMicroOps = 7; diff --git a/contrib/llvm/lib/Target/X86/X86SchedHaswell.td b/contrib/llvm/lib/Target/X86/X86SchedHaswell.td index 189dd4183839..876c3e4162cf 100644 --- a/contrib/llvm/lib/Target/X86/X86SchedHaswell.td +++ b/contrib/llvm/lib/Target/X86/X86SchedHaswell.td @@ -118,17 +118,26 @@ defm : X86WriteRes<WriteLoad, [HWPort23], 5, [1], 1>; defm : X86WriteRes<WriteMove, [HWPort0156], 1, [1], 1>; def : WriteRes<WriteZero, []>; +// Arithmetic. 
defm : HWWriteResPair<WriteALU, [HWPort0156], 1>; -defm : HWWriteResPair<WriteADC, [HWPort06,HWPort0156], 2, [1,1], 2>; +defm : HWWriteResPair<WriteADC, [HWPort06, HWPort0156], 2, [1,1], 2>; defm : HWWriteResPair<WriteIMul, [HWPort1], 3>; defm : HWWriteResPair<WriteIMul64, [HWPort1], 3>; -defm : HWWriteResPair<WriteBSWAP32,[HWPort15], 1>; -defm : HWWriteResPair<WriteBSWAP64,[HWPort06, HWPort15], 2, [1,1], 2>; +defm : X86WriteRes<WriteBSWAP32, [HWPort15], 1, [1], 1>; +defm : X86WriteRes<WriteBSWAP64, [HWPort06, HWPort15], 2, [1,1], 2>; def : WriteRes<WriteIMulH, []> { let Latency = 3; } + +// Integer shifts and rotates. defm : HWWriteResPair<WriteShift, [HWPort06], 1>; -defm : HWWriteResPair<WriteShiftDouble, [HWPort06], 1>; + +// SHLD/SHRD. +defm : X86WriteRes<WriteSHDrri, [HWPort1], 3, [1], 1>; +defm : X86WriteRes<WriteSHDrrcl,[HWPort1, HWPort06, HWPort0156], 6, [1, 1, 2], 4>; +defm : X86WriteRes<WriteSHDmri, [HWPort1, HWPort23, HWPort237, HWPort0156], 10, [1, 1, 1, 1], 4>; +defm : X86WriteRes<WriteSHDmrcl,[HWPort1, HWPort23, HWPort237, HWPort06, HWPort0156], 12, [1, 1, 1, 1, 2], 6>; + defm : HWWriteResPair<WriteJump, [HWPort06], 1>; defm : HWWriteResPair<WriteCRC32, [HWPort1], 3>; @@ -141,6 +150,7 @@ def : WriteRes<WriteSETCCStore, [HWPort06,HWPort4,HWPort237]> { let NumMicroOps = 3; } def : WriteRes<WriteLAHFSAHF, [HWPort06]>; +def : WriteRes<WriteBitTest,[HWPort06]>; // This is for simple LEAs with one or two input operands. // The complex ones can only execute on port 1, and they require two cycles on @@ -886,14 +896,6 @@ def HWWriteResGroup7 : SchedWriteRes<[HWPort06]> { let ResourceCycles = [1]; } def: InstRW<[HWWriteResGroup7], (instrs CDQ, CQO)>; -def: InstRW<[HWWriteResGroup7], (instregex "BT(16|32|64)ri8", - "BT(16|32|64)rr", - "BTC(16|32|64)ri8", - "BTC(16|32|64)rr", - "BTR(16|32|64)ri8", - "BTR(16|32|64)rr", - "BTS(16|32|64)ri8", - "BTS(16|32|64)rr")>; def HWWriteResGroup8 : SchedWriteRes<[HWPort15]> { let Latency = 1; @@ -1240,8 +1242,6 @@ def HWWriteResGroup50 : SchedWriteRes<[HWPort1]> { def: InstRW<[HWWriteResGroup50], (instregex "MMX_CVTPI2PSirr", "PDEP(32|64)rr", "PEXT(32|64)rr", - "SHLD(16|32|64)rri8", - "SHRD(16|32|64)rri8", "(V?)CVTDQ2PS(Y?)rr")>; def HWWriteResGroup50_16i : SchedWriteRes<[HWPort1, HWPort0156]> { @@ -1513,14 +1513,6 @@ def HWWriteResGroup83 : SchedWriteRes<[HWPort1,HWPort6,HWPort0156]> { } def: InstRW<[HWWriteResGroup83], (instregex "LAR(16|32|64)rr")>; -def HWWriteResGroup86 : SchedWriteRes<[HWPort1,HWPort23,HWPort237,HWPort0156]> { - let Latency = 10; - let NumMicroOps = 4; - let ResourceCycles = [1,1,1,1]; -} -def: InstRW<[HWWriteResGroup86], (instregex "SHLD(16|32|64)mri8", - "SHRD(16|32|64)mri8")>; - def HWWriteResGroup87 : SchedWriteRes<[HWPort1,HWPort6,HWPort23,HWPort0156]> { let Latency = 9; let NumMicroOps = 5; @@ -1638,14 +1630,6 @@ def HWWriteResGroup104 : SchedWriteRes<[HWPort1,HWPort5,HWPort23]> { } def: InstRW<[HWWriteResGroup104], (instregex "VCVTDQ2PDYrm")>; -def HWWriteResGroup105 : SchedWriteRes<[HWPort1,HWPort06,HWPort0156]> { - let Latency = 6; - let NumMicroOps = 4; - let ResourceCycles = [1,1,2]; -} -def: InstRW<[HWWriteResGroup105], (instregex "SHLD(16|32|64)rrCL", - "SHRD(16|32|64)rrCL")>; - def HWWriteResGroup107 : SchedWriteRes<[HWPort1,HWPort6,HWPort06,HWPort0156]> { let Latency = 6; let NumMicroOps = 4; @@ -1660,14 +1644,6 @@ def HWWriteResGroup108 : SchedWriteRes<[HWPort6,HWPort0156]> { } def: InstRW<[HWWriteResGroup108], (instrs STD)>; -def HWWriteResGroup109 : 
SchedWriteRes<[HWPort1,HWPort23,HWPort237,HWPort06,HWPort0156]> { - let Latency = 12; - let NumMicroOps = 6; - let ResourceCycles = [1,1,1,1,2]; -} -def: InstRW<[HWWriteResGroup109], (instregex "SHLD(16|32|64)mrCL", - "SHRD(16|32|64)mrCL")>; - def HWWriteResGroup114 : SchedWriteRes<[HWPort6,HWPort06,HWPort15,HWPort0156]> { let Latency = 7; let NumMicroOps = 7; diff --git a/contrib/llvm/lib/Target/X86/X86SchedSandyBridge.td b/contrib/llvm/lib/Target/X86/X86SchedSandyBridge.td index 3b543c680ef4..6b7bbdea860a 100644 --- a/contrib/llvm/lib/Target/X86/X86SchedSandyBridge.td +++ b/contrib/llvm/lib/Target/X86/X86SchedSandyBridge.td @@ -106,13 +106,14 @@ def : WriteRes<WriteLoad, [SBPort23]> { let Latency = 5; } def : WriteRes<WriteMove, [SBPort015]>; def : WriteRes<WriteZero, []>; +// Arithmetic. defm : SBWriteResPair<WriteALU, [SBPort015], 1>; defm : SBWriteResPair<WriteADC, [SBPort05,SBPort015], 2, [1,1], 2>; defm : SBWriteResPair<WriteIMul, [SBPort1], 3>; defm : SBWriteResPair<WriteIMul64, [SBPort1], 3>; -defm : SBWriteResPair<WriteBSWAP32,[SBPort1], 1>; -defm : SBWriteResPair<WriteBSWAP64,[SBPort1,SBPort05], 2, [1,1], 2>; +defm : X86WriteRes<WriteBSWAP32, [SBPort1], 1, [1], 1>; +defm : X86WriteRes<WriteBSWAP64, [SBPort1,SBPort05], 2, [1,1], 2>; defm : SBWriteResPair<WriteDiv8, [SBPort0, SBDivider], 25, [1, 10]>; defm : SBWriteResPair<WriteDiv16, [SBPort0, SBDivider], 25, [1, 10]>; @@ -125,8 +126,13 @@ defm : SBWriteResPair<WriteIDiv64, [SBPort0, SBDivider], 25, [1, 10]>; def : WriteRes<WriteIMulH, []> { let Latency = 3; } +// SHLD/SHRD. +defm : X86WriteRes<WriteSHDrri, [SBPort05, SBPort015], 2, [1, 1], 2>; +defm : X86WriteRes<WriteSHDrrcl,[SBPort05, SBPort015], 4, [3, 1], 4>; +defm : X86WriteRes<WriteSHDmri, [SBPort4,SBPort23,SBPort05,SBPort015], 8, [1, 2, 1, 1], 5>; +defm : X86WriteRes<WriteSHDmrcl,[SBPort4,SBPort23,SBPort05,SBPort015], 10, [1, 2, 3, 1], 7>; + defm : SBWriteResPair<WriteShift, [SBPort05], 1>; -defm : SBWriteResPair<WriteShiftDouble, [SBPort05], 1>; defm : SBWriteResPair<WriteJump, [SBPort5], 1>; defm : SBWriteResPair<WriteCRC32, [SBPort1], 3, [1], 1, 5>; @@ -139,6 +145,7 @@ def : WriteRes<WriteSETCCStore, [SBPort05,SBPort4,SBPort23]> { let NumMicroOps = 3; } def : WriteRes<WriteLAHFSAHF, [SBPort05]>; +def : WriteRes<WriteBitTest,[SBPort05]>; // This is for simple LEAs with one or two input operands. 
// The complex ones can only execute on port 1, and they require two cycles on @@ -564,14 +571,6 @@ def SBWriteResGroup4 : SchedWriteRes<[SBPort05]> { let ResourceCycles = [1]; } def: InstRW<[SBWriteResGroup4], (instrs CDQ, CQO)>; -def: InstRW<[SBWriteResGroup4], (instregex "BT(16|32|64)ri8", - "BT(16|32|64)rr", - "BTC(16|32|64)ri8", - "BTC(16|32|64)rr", - "BTR(16|32|64)ri8", - "BTR(16|32|64)rr", - "BTS(16|32|64)ri8", - "BTS(16|32|64)rr")>; def SBWriteResGroup5 : SchedWriteRes<[SBPort15]> { let Latency = 1; @@ -630,14 +629,6 @@ def SBWriteResGroup18 : SchedWriteRes<[SBPort5,SBPort015]> { def: InstRW<[SBWriteResGroup18], (instrs JCXZ, JECXZ, JRCXZ)>; def: InstRW<[SBWriteResGroup18], (instregex "MMX_MOVDQ2Qrr")>; -def SBWriteResGroup19 : SchedWriteRes<[SBPort05,SBPort015]> { - let Latency = 2; - let NumMicroOps = 2; - let ResourceCycles = [1,1]; -} -def: InstRW<[SBWriteResGroup19], (instregex "SHLD(16|32|64)rri8", - "SHRD(16|32|64)rri8")>; - def SBWriteResGroup21 : SchedWriteRes<[SBPort1]> { let Latency = 3; let NumMicroOps = 1; @@ -728,14 +719,6 @@ def SBWriteResGroup29_2 : SchedWriteRes<[SBPort5,SBPort015]> { } def: InstRW<[SBWriteResGroup29_2], (instrs PAUSE)>; -def SBWriteResGroup29_3 : SchedWriteRes<[SBPort05,SBPort015]> { - let Latency = 4; - let NumMicroOps = 4; - let ResourceCycles = [3,1]; -} -def: InstRW<[SBWriteResGroup29_3], (instregex "SHLD(16|32|64)rrCL", - "SHRD(16|32|64)rrCL")>; - def SBWriteResGroup30 : SchedWriteRes<[SBPort0]> { let Latency = 5; let NumMicroOps = 1; @@ -1027,14 +1010,6 @@ def SBWriteResGroup87 : SchedWriteRes<[SBPort4,SBPort5,SBPort01,SBPort23]> { } def: InstRW<[SBWriteResGroup87], (instrs FARCALL64)>; -def SBWriteResGroup88 : SchedWriteRes<[SBPort4,SBPort23,SBPort05,SBPort015]> { - let Latency = 8; - let NumMicroOps = 5; - let ResourceCycles = [1,2,1,1]; -} -def: InstRW<[SBWriteResGroup88], (instregex "SHLD(16|32|64)mri8", - "SHRD(16|32|64)mri8")>; - def SBWriteResGroup93 : SchedWriteRes<[SBPort0,SBPort1,SBPort23]> { let Latency = 9; let NumMicroOps = 3; @@ -1130,14 +1105,6 @@ def SBWriteResGroup101 : SchedWriteRes<[SBPort1,SBPort23]> { def: InstRW<[SBWriteResGroup101], (instregex "(ADD|SUB|SUBR)_F(32|64)m", "ILD_F(16|32|64)m")>; -def SBWriteResGroup103_2 : SchedWriteRes<[SBPort4,SBPort23,SBPort05,SBPort015]> { - let Latency = 10; - let NumMicroOps = 7; - let ResourceCycles = [1,2,3,1]; -} -def: InstRW<[SBWriteResGroup103_2], (instregex "SHLD(16|32|64)mrCL", - "SHRD(16|32|64)mrCL")>; - def SBWriteResGroup104 : SchedWriteRes<[SBPort0,SBPort23]> { let Latency = 11; let NumMicroOps = 2; diff --git a/contrib/llvm/lib/Target/X86/X86SchedSkylakeClient.td b/contrib/llvm/lib/Target/X86/X86SchedSkylakeClient.td index 1417799d76be..bda088e1512f 100644 --- a/contrib/llvm/lib/Target/X86/X86SchedSkylakeClient.td +++ b/contrib/llvm/lib/Target/X86/X86SchedSkylakeClient.td @@ -110,8 +110,8 @@ defm : SKLWriteResPair<WriteADC, [SKLPort06], 1>; // Integer ALU + flags op defm : SKLWriteResPair<WriteIMul, [SKLPort1], 3>; // Integer multiplication. defm : SKLWriteResPair<WriteIMul64, [SKLPort1], 3>; // Integer 64-bit multiplication. 
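// For reference while reading these scheduler hunks: X86WriteRes is the
// multiclass that binds one SchedWrite to explicit ports, latency, resource
// cycles and uop count. A minimal sketch of its expansion, assuming the shape
// declared in X86Schedule.td at this revision (see that file's hunk below):
//
//   multiclass X86WriteRes<SchedWrite SchedRW, list<ProcResourceKind> ExePorts,
//                          int Lat, list<int> Res, int UOps> {
//     def : WriteRes<SchedRW, ExePorts> {
//       let Latency = Lat;
//       let ResourceCycles = Res;
//       let NumMicroOps = UOps;
//     }
//   }
//
// So defm : X86WriteRes<WriteSHDrri, [SBPort05, SBPort015], 2, [1, 1], 2>;
// above declares a 2-cycle, 2-uop write that costs one cycle on each port.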
-defm : SKLWriteResPair<WriteBSWAP32,[SKLPort15], 1>; // -defm : SKLWriteResPair<WriteBSWAP64,[SKLPort06, SKLPort15], 2, [1,1], 2>; // +defm : X86WriteRes<WriteBSWAP32, [SKLPort15], 1, [1], 1>; +defm : X86WriteRes<WriteBSWAP64, [SKLPort06, SKLPort15], 2, [1,1], 2>; defm : SKLWriteResPair<WriteDiv8, [SKLPort0, SKLDivider], 25, [1,10], 1, 4>; defm : SKLWriteResPair<WriteDiv16, [SKLPort0, SKLDivider], 25, [1,10], 1, 4>; @@ -136,6 +136,7 @@ def : WriteRes<WriteSETCCStore, [SKLPort06,SKLPort4,SKLPort237]> { let NumMicroOps = 3; } def : WriteRes<WriteLAHFSAHF, [SKLPort06]>; +def : WriteRes<WriteBitTest,[SKLPort06]>; // // Bit counts. defm : SKLWriteResPair<WriteBSF, [SKLPort1], 3>; @@ -147,8 +148,11 @@ defm : SKLWriteResPair<WritePOPCNT, [SKLPort1], 3>; // Integer shifts and rotates. defm : SKLWriteResPair<WriteShift, [SKLPort06], 1>; -// Double shift instructions. -defm : SKLWriteResPair<WriteShiftDouble, [SKLPort06], 1>; +// SHLD/SHRD. +defm : X86WriteRes<WriteSHDrri, [SKLPort1], 3, [1], 1>; +defm : X86WriteRes<WriteSHDrrcl,[SKLPort1,SKLPort06,SKLPort0156], 6, [1, 2, 1], 4>; +defm : X86WriteRes<WriteSHDmri, [SKLPort1,SKLPort23,SKLPort237,SKLPort0156], 9, [1, 1, 1, 1], 4>; +defm : X86WriteRes<WriteSHDmrcl,[SKLPort1,SKLPort23,SKLPort237,SKLPort06,SKLPort0156], 11, [1, 1, 1, 2, 1], 6>; // BMI1 BEXTR, BMI2 BZHI defm : SKLWriteResPair<WriteBEXTR, [SKLPort06,SKLPort15], 2, [1,1], 2>; @@ -602,14 +606,6 @@ def SKLWriteResGroup7 : SchedWriteRes<[SKLPort06]> { let ResourceCycles = [1]; } def: InstRW<[SKLWriteResGroup7], (instrs CDQ, CQO, CLAC, STAC)>; -def: InstRW<[SKLWriteResGroup7], (instregex "BT(16|32|64)ri8", - "BT(16|32|64)rr", - "BTC(16|32|64)ri8", - "BTC(16|32|64)rr", - "BTR(16|32|64)ri8", - "BTR(16|32|64)rr", - "BTS(16|32|64)ri8", - "BTS(16|32|64)rr")>; def SKLWriteResGroup8 : SchedWriteRes<[SKLPort15]> { let Latency = 1; @@ -743,9 +739,7 @@ def SKLWriteResGroup29 : SchedWriteRes<[SKLPort1]> { let ResourceCycles = [1]; } def: InstRW<[SKLWriteResGroup29], (instregex "PDEP(32|64)rr", - "PEXT(32|64)rr", - "SHLD(16|32|64)rri8", - "SHRD(16|32|64)rri8")>; + "PEXT(32|64)rr")>; def SKLWriteResGroup29_16i : SchedWriteRes<[SKLPort1, SKLPort0156]> { let Latency = 4; @@ -1096,14 +1090,6 @@ def SKLWriteResGroup78 : SchedWriteRes<[SKLPort5,SKLPort01]> { } def: InstRW<[SKLWriteResGroup78], (instregex "(V?)CVTSI642SSrr")>; -def SKLWriteResGroup79 : SchedWriteRes<[SKLPort1,SKLPort06,SKLPort0156]> { - let Latency = 6; - let NumMicroOps = 4; - let ResourceCycles = [1,2,1]; -} -def: InstRW<[SKLWriteResGroup79], (instregex "SHLD(16|32|64)rrCL", - "SHRD(16|32|64)rrCL")>; - def SKLWriteResGroup80 : SchedWriteRes<[SKLPort1,SKLPort6,SKLPort06,SKLPort0156]> { let Latency = 6; let NumMicroOps = 4; @@ -1392,14 +1378,6 @@ def SKLWriteResGroup128 : SchedWriteRes<[SKLPort5,SKLPort01,SKLPort23]> { def: InstRW<[SKLWriteResGroup128], (instregex "(V?)PHADDSWrm", "(V?)PHSUBSWrm")>; -def SKLWriteResGroup130 : SchedWriteRes<[SKLPort1,SKLPort23,SKLPort237,SKLPort0156]> { - let Latency = 9; - let NumMicroOps = 4; - let ResourceCycles = [1,1,1,1]; -} -def: InstRW<[SKLWriteResGroup130], (instregex "SHLD(16|32|64)mri8", - "SHRD(16|32|64)mri8")>; - def SKLWriteResGroup131 : SchedWriteRes<[SKLPort1,SKLPort6,SKLPort23,SKLPort0156]> { let Latency = 9; let NumMicroOps = 5; @@ -1519,14 +1497,6 @@ def: InstRW<[SKLWriteResGroup152], (instregex "CVTPD2PSrm", "CVT(T?)PD2DQrm", "MMX_CVT(T?)PD2PIirm")>; -def SKLWriteResGroup153 : SchedWriteRes<[SKLPort1,SKLPort23,SKLPort237,SKLPort06,SKLPort0156]> { - let Latency = 11; - let NumMicroOps = 6; - let 
ResourceCycles = [1,1,1,2,1]; -} -def: InstRW<[SKLWriteResGroup153], (instregex "SHLD(16|32|64)mrCL", - "SHRD(16|32|64)mrCL")>; - def SKLWriteResGroup154 : SchedWriteRes<[SKLPort1,SKLPort06,SKLPort0156]> { let Latency = 11; let NumMicroOps = 7; diff --git a/contrib/llvm/lib/Target/X86/X86SchedSkylakeServer.td b/contrib/llvm/lib/Target/X86/X86SchedSkylakeServer.td index 7095ec081bd9..9d5f8555c505 100755 --- a/contrib/llvm/lib/Target/X86/X86SchedSkylakeServer.td +++ b/contrib/llvm/lib/Target/X86/X86SchedSkylakeServer.td @@ -110,8 +110,8 @@ defm : SKXWriteResPair<WriteADC, [SKXPort06], 1>; // Integer ALU + flags op defm : SKXWriteResPair<WriteIMul, [SKXPort1], 3>; // Integer multiplication. defm : SKXWriteResPair<WriteIMul64, [SKXPort1], 3>; // Integer 64-bit multiplication. -defm : SKXWriteResPair<WriteBSWAP32,[SKXPort15], 1>; // -defm : SKXWriteResPair<WriteBSWAP64,[SKXPort06, SKXPort15], 2, [1,1], 2>; // +defm : X86WriteRes<WriteBSWAP32, [SKXPort15], 1, [1], 1>; +defm : X86WriteRes<WriteBSWAP64, [SKXPort06, SKXPort15], 2, [1,1], 2>; defm : SKXWriteResPair<WriteDiv8, [SKXPort0, SKXDivider], 25, [1,10], 1, 4>; defm : SKXWriteResPair<WriteDiv16, [SKXPort0, SKXDivider], 25, [1,10], 1, 4>; @@ -136,12 +136,16 @@ def : WriteRes<WriteSETCCStore, [SKXPort06,SKXPort4,SKXPort237]> { let NumMicroOps = 3; } def : WriteRes<WriteLAHFSAHF, [SKXPort06]>; +def : WriteRes<WriteBitTest,[SKXPort06]>; // // Integer shifts and rotates. defm : SKXWriteResPair<WriteShift, [SKXPort06], 1>; -// Double shift instructions. -defm : SKXWriteResPair<WriteShiftDouble, [SKXPort06], 1>; +// SHLD/SHRD. +defm : X86WriteRes<WriteSHDrri, [SKXPort1], 3, [1], 1>; +defm : X86WriteRes<WriteSHDrrcl,[SKXPort1,SKXPort06,SKXPort0156], 6, [1, 2, 1], 4>; +defm : X86WriteRes<WriteSHDmri, [SKXPort1,SKXPort23,SKXPort237,SKXPort0156], 9, [1, 1, 1, 1], 4>; +defm : X86WriteRes<WriteSHDmrcl,[SKXPort1,SKXPort23,SKXPort237,SKXPort06,SKXPort0156], 11, [1, 1, 1, 2, 1], 6>; // Bit counts. 
defm : SKXWriteResPair<WriteBSF, [SKXPort1], 3>; @@ -615,14 +619,6 @@ def SKXWriteResGroup7 : SchedWriteRes<[SKXPort06]> { let ResourceCycles = [1]; } def: InstRW<[SKXWriteResGroup7], (instrs CDQ, CQO, CLAC, STAC)>; -def: InstRW<[SKXWriteResGroup7], (instregex "BT(16|32|64)ri8", - "BT(16|32|64)rr", - "BTC(16|32|64)ri8", - "BTC(16|32|64)rr", - "BTR(16|32|64)ri8", - "BTR(16|32|64)rr", - "BTS(16|32|64)ri8", - "BTS(16|32|64)rr")>; def SKXWriteResGroup8 : SchedWriteRes<[SKXPort15]> { let Latency = 1; @@ -783,9 +779,7 @@ def SKXWriteResGroup31 : SchedWriteRes<[SKXPort1]> { let ResourceCycles = [1]; } def: InstRW<[SKXWriteResGroup31], (instregex "PDEP(32|64)rr", - "PEXT(32|64)rr", - "SHLD(16|32|64)rri8", - "SHRD(16|32|64)rri8")>; + "PEXT(32|64)rr")>; def SKXWriteResGroup31_16i : SchedWriteRes<[SKXPort1, SKXPort0156]> { let Latency = 4; @@ -1270,14 +1264,6 @@ def: InstRW<[SKXWriteResGroup82], (instregex "(V?)CVTSI642SSrr", "VCVTSI642SSZrr", "VCVTUSI642SSZrr")>; -def SKXWriteResGroup83 : SchedWriteRes<[SKXPort1,SKXPort06,SKXPort0156]> { - let Latency = 6; - let NumMicroOps = 4; - let ResourceCycles = [1,2,1]; -} -def: InstRW<[SKXWriteResGroup83], (instregex "SHLD(16|32|64)rrCL", - "SHRD(16|32|64)rrCL")>; - def SKXWriteResGroup84 : SchedWriteRes<[SKXPort1,SKXPort6,SKXPort06,SKXPort0156]> { let Latency = 6; let NumMicroOps = 4; @@ -1830,14 +1816,6 @@ def SKXWriteResGroup143 : SchedWriteRes<[SKXPort5,SKXPort01,SKXPort23]> { def: InstRW<[SKXWriteResGroup143], (instregex "(V?)PHADDSWrm", "(V?)PHSUBSWrm")>; -def SKXWriteResGroup145 : SchedWriteRes<[SKXPort1,SKXPort23,SKXPort237,SKXPort0156]> { - let Latency = 9; - let NumMicroOps = 4; - let ResourceCycles = [1,1,1,1]; -} -def: InstRW<[SKXWriteResGroup145], (instregex "SHLD(16|32|64)mri8", - "SHRD(16|32|64)mri8")>; - def SKXWriteResGroup146 : SchedWriteRes<[SKXPort1,SKXPort6,SKXPort23,SKXPort0156]> { let Latency = 9; let NumMicroOps = 5; @@ -2033,14 +2011,6 @@ def SKXWriteResGroup167 : SchedWriteRes<[SKXPort5,SKXPort23,SKXPort015]> { } def: InstRW<[SKXWriteResGroup167], (instregex "VPCONFLICTQZ128rm(b?)")>; -def SKXWriteResGroup168 : SchedWriteRes<[SKXPort1,SKXPort23,SKXPort237,SKXPort06,SKXPort0156]> { - let Latency = 11; - let NumMicroOps = 6; - let ResourceCycles = [1,1,1,2,1]; -} -def: InstRW<[SKXWriteResGroup168], (instregex "SHLD(16|32|64)mrCL", - "SHRD(16|32|64)mrCL")>; - def SKXWriteResGroup169 : SchedWriteRes<[SKXPort1,SKXPort06,SKXPort0156]> { let Latency = 11; let NumMicroOps = 7; diff --git a/contrib/llvm/lib/Target/X86/X86Schedule.td b/contrib/llvm/lib/Target/X86/X86Schedule.td index d0167753ccd4..ef9ce94706df 100644 --- a/contrib/llvm/lib/Target/X86/X86Schedule.td +++ b/contrib/llvm/lib/Target/X86/X86Schedule.td @@ -118,8 +118,8 @@ defm WriteIMul64 : X86SchedWritePair; // Integer 64-bit multiplication. def WriteIMulH : SchedWrite; // Integer multiplication, high part. def WriteLEA : SchedWrite; // LEA instructions can't fold loads. -defm WriteBSWAP32: X86SchedWritePair; // Byte Order (Endiannes) Swap -defm WriteBSWAP64: X86SchedWritePair; // Byte Order (Endiannes) Swap +def WriteBSWAP32 : SchedWrite; // Byte Order (Endianness) 32-bit Swap. +def WriteBSWAP64 : SchedWrite; // Byte Order (Endianness) 64-bit Swap. // Integer division. defm WriteDiv8 : X86SchedWritePair; @@ -142,11 +142,15 @@ def WriteFCMOV : SchedWrite; // X87 conditional move. def WriteSETCC : SchedWrite; // Set register based on condition code. def WriteSETCCStore : SchedWrite; def WriteLAHFSAHF : SchedWrite; // Load/Store flags in AH. 
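// Note on the WriteBSWAP conversions above: an X86SchedWritePair declares two
// linked writes, a register form plus a folded-load form. A rough sketch of
// the multiclass defined earlier in this file (paraphrased, not verbatim):
//
//   multiclass X86SchedWritePair {
//     def Ld : SchedWrite;                  // reg-mem (folded load) form
//     def NAME : X86FoldableSchedWrite {    // reg-reg form
//       let Folded = !cast<SchedWrite>(NAME # "Ld");
//     }
//   }
//
// BSWAP only ever operates on a register, so the load-folded half was dead
// weight; a plain SchedWrite is enough. The same holds for WriteBitTest just
// below, whose memory folding is deferred per the TODO.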
+def WriteBitTest : SchedWrite; // Bit Test - TODO add memory folding support // Integer shifts and rotates. defm WriteShift : X86SchedWritePair; // Double shift instructions. -defm WriteShiftDouble : X86SchedWritePair; +def WriteSHDrri : SchedWrite; +def WriteSHDrrcl : SchedWrite; +def WriteSHDmri : SchedWrite; +def WriteSHDmrcl : SchedWrite; // BMI1 BEXTR, BMI2 BZHI defm WriteBEXTR : X86SchedWritePair; diff --git a/contrib/llvm/lib/Target/X86/X86ScheduleAtom.td b/contrib/llvm/lib/Target/X86/X86ScheduleAtom.td index d1e902e6c43f..a7f461c456bd 100644 --- a/contrib/llvm/lib/Target/X86/X86ScheduleAtom.td +++ b/contrib/llvm/lib/Target/X86/X86ScheduleAtom.td @@ -81,8 +81,8 @@ defm : AtomWriteResPair<WriteADC, [AtomPort01], [AtomPort0]>; defm : AtomWriteResPair<WriteIMul, [AtomPort01], [AtomPort01], 7, 7, [7], [7]>; defm : AtomWriteResPair<WriteIMul64, [AtomPort01], [AtomPort01], 12, 12, [12], [12]>; -defm : AtomWriteResPair<WriteBSWAP32, [AtomPort0], [AtomPort0]>; -defm : AtomWriteResPair<WriteBSWAP64, [AtomPort0], [AtomPort0]>; +defm : X86WriteRes<WriteBSWAP32, [AtomPort0], 1, [1], 1>; +defm : X86WriteRes<WriteBSWAP64, [AtomPort0], 1, [1], 1>; defm : AtomWriteResPair<WriteDiv8, [AtomPort01], [AtomPort01], 50, 68, [50], [68]>; defm : AtomWriteResPair<WriteDiv16, [AtomPort01], [AtomPort01], 50, 50, [50], [50]>; @@ -108,6 +108,7 @@ def : WriteRes<WriteLAHFSAHF, [AtomPort01]> { let Latency = 2; let ResourceCycles = [2]; } +def : WriteRes<WriteBitTest,[AtomPort01]>; defm : X86WriteResUnsupported<WriteIMulH>; @@ -150,11 +151,10 @@ defm : X86WriteResPairUnsupported<WriteBZHI>; defm : AtomWriteResPair<WriteShift, [AtomPort0], [AtomPort0]>; -//////////////////////////////////////////////////////////////////////////////// -// Double shift instructions. -//////////////////////////////////////////////////////////////////////////////// - -defm : AtomWriteResPair<WriteShiftDouble, [AtomPort0], [AtomPort0]>; +defm : X86WriteRes<WriteSHDrri, [AtomPort01], 2, [2], 1>; +defm : X86WriteRes<WriteSHDrrcl,[AtomPort01], 2, [2], 1>; +defm : X86WriteRes<WriteSHDmri, [AtomPort01], 4, [4], 1>; +defm : X86WriteRes<WriteSHDmrcl,[AtomPort01], 4, [4], 1>; //////////////////////////////////////////////////////////////////////////////// // Loads, stores, and moves, not folded with other operations. 
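// A quick map of the four SHLD/SHRD writes that replace WriteShiftDouble in
// the hunks above, split by operand form (suffixes follow the usual X86
// instruction naming):
//   WriteSHDrri  - reg, reg, imm8
//   WriteSHDrrcl - reg, reg, CL
//   WriteSHDmri  - mem, reg, imm8
//   WriteSHDmrcl - mem, reg, CL
// Instruction definitions select the matching write directly, as in the
// X86InstrShiftRotate.td hunk earlier, e.g.:
//
//   let Uses = [CL], SchedRW = [WriteSHDrrcl] in
//   def SHLD32rrCL : I<0xA5, MRMDestReg, (outs GR32:$dst),
//                      (ins GR32:$src1, GR32:$src2), ...>;
//
// so each model can supply its own numbers without per-opcode overrides.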
@@ -562,9 +562,7 @@ def AtomWrite01_2 : SchedWriteRes<[AtomPort01]> { def : InstRW<[AtomWrite01_2], (instrs LEAVE, LEAVE64, POP16r, PUSH16rmm, PUSH32rmm, PUSH64rmm, LODSB, LODSL, LODSQ, LODSW, - SCASB, SCASL, SCASQ, SCASW, - SHLD32rrCL, SHRD32rrCL, - SHLD32rri8, SHRD32rri8)>; + SCASB, SCASL, SCASQ, SCASW)>; def : InstRW<[AtomWrite01_2], (instregex "BT(C|R|S)(16|32|64)mi8", "PUSH(CS|DS|ES|FS|GS|SS)(16|32|64)", "XADD(8|16|32|64)rr", @@ -598,8 +596,6 @@ def AtomWrite01_4 : SchedWriteRes<[AtomPort01]> { } def : InstRW<[AtomWrite01_4], (instrs CBW, CWD, CWDE, CDQ, CDQE, CQO, JCXZ, JECXZ, JRCXZ, - SHLD32mrCL, SHRD32mrCL, - SHLD32mri8, SHRD32mri8, LD_F80m)>; def : InstRW<[AtomWrite01_4], (instregex "PH(ADD|SUB)Drm", "(MMX_)?PEXTRWrr(_REV)?")>; diff --git a/contrib/llvm/lib/Target/X86/X86ScheduleBtVer2.td b/contrib/llvm/lib/Target/X86/X86ScheduleBtVer2.td index d78c343ebd5c..719e71cd25e5 100644 --- a/contrib/llvm/lib/Target/X86/X86ScheduleBtVer2.td +++ b/contrib/llvm/lib/Target/X86/X86ScheduleBtVer2.td @@ -168,8 +168,8 @@ defm : JWriteResIntPair<WriteIMul, [JALU1, JMul], 3, [1, 1], 2>; // i8/i16/i32 defm : JWriteResIntPair<WriteIMul64, [JALU1, JMul], 6, [1, 4], 2>; // i64 multiplication defm : X86WriteRes<WriteIMulH, [JALU1], 6, [4], 1>; -defm : JWriteResIntPair<WriteBSWAP32,[JALU01], 1>; -defm : JWriteResIntPair<WriteBSWAP64,[JALU01], 1>; +defm : X86WriteRes<WriteBSWAP32, [JALU01], 1, [1], 1>; +defm : X86WriteRes<WriteBSWAP64, [JALU01], 1, [1], 1>; defm : JWriteResIntPair<WriteDiv8, [JALU1, JDiv], 12, [1, 12], 1>; defm : JWriteResIntPair<WriteDiv16, [JALU1, JDiv], 17, [1, 17], 2>; @@ -188,6 +188,7 @@ defm : X86WriteRes<WriteFCMOV, [JFPU0, JFPA], 3, [1,1], 1>; // x87 conditional m def : WriteRes<WriteSETCC, [JALU01]>; // Setcc. def : WriteRes<WriteSETCCStore, [JALU01,JSAGU]>; def : WriteRes<WriteLAHFSAHF, [JALU01]>; +def : WriteRes<WriteBitTest,[JALU01]>; // This is for simple LEAs with one or two input operands. def : WriteRes<WriteLEA, [JALU01]>; @@ -209,33 +210,11 @@ defm : X86WriteResPairUnsupported<WriteBZHI>; defm : JWriteResIntPair<WriteShift, [JALU01], 1>; -defm : JWriteResIntPair<WriteShiftDouble, [JALU01], 1>; - -def JWriteSHLDrri : SchedWriteRes<[JALU01]> { - let Latency = 3; - let ResourceCycles = [6]; - let NumMicroOps = 6; -} -def: InstRW<[JWriteSHLDrri], (instrs SHLD16rri8, SHLD32rri8, SHLD64rri8, - SHRD16rri8, SHRD32rri8, SHRD64rri8)>; - -def JWriteSHLDrrCL : SchedWriteRes<[JALU01]> { - let Latency = 4; - let ResourceCycles = [8]; - let NumMicroOps = 7; -} -def: InstRW<[JWriteSHLDrrCL], (instrs SHLD16rrCL, SHLD32rrCL, SHLD64rrCL, - SHRD16rrCL, SHRD32rrCL, SHRD64rrCL)>; - -def JWriteSHLDm : SchedWriteRes<[JLAGU, JALU01]> { - let Latency = 9; - let ResourceCycles = [1, 22]; - let NumMicroOps = 8; -} -def: InstRW<[JWriteSHLDm],(instrs SHLD16mri8, SHLD32mri8, SHLD64mri8, - SHLD16mrCL, SHLD32mrCL, SHLD64mrCL, - SHRD16mri8, SHRD32mri8, SHRD64mri8, - SHRD16mrCL, SHRD32mrCL, SHRD64mrCL)>; +// SHLD/SHRD. +defm : X86WriteRes<WriteSHDrri, [JALU01], 3, [6], 6>; +defm : X86WriteRes<WriteSHDrrcl,[JALU01], 4, [8], 7>; +defm : X86WriteRes<WriteSHDmri, [JLAGU, JALU01], 9, [1, 22], 8>; +defm : X86WriteRes<WriteSHDmrcl,[JLAGU, JALU01], 9, [1, 22], 8>; //////////////////////////////////////////////////////////////////////////////// // Loads, stores, and moves, not folded with other operations. 
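// For contrast with the JWriteSHLD* deletions above: InstRW is the per-model
// override hook. A SchedWriteRes fixes ports/latency/uops, and InstRW pins
// concrete opcodes to it (by list or regex), bypassing the instruction's own
// SchedRW. A minimal sketch of the pattern being retired (hypothetical name):
//
//   def JWriteExample : SchedWriteRes<[JALU01]> {
//     let Latency = 3;
//     let ResourceCycles = [6];
//     let NumMicroOps = 6;
//   }
//   def : InstRW<[JWriteExample], (instregex "SHLD(16|32|64)rri8")>;
//
// Folding these numbers into the shared WriteSHD* writes keeps each model's
// data in one table and drops the fragile opcode regexes.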
diff --git a/contrib/llvm/lib/Target/X86/X86ScheduleSLM.td b/contrib/llvm/lib/Target/X86/X86ScheduleSLM.td index c938a4a8939e..b1e843013707 100644 --- a/contrib/llvm/lib/Target/X86/X86ScheduleSLM.td +++ b/contrib/llvm/lib/Target/X86/X86ScheduleSLM.td @@ -98,11 +98,16 @@ defm : SLMWriteResPair<WriteADC, [SLM_IEC_RSV01], 1>; defm : SLMWriteResPair<WriteIMul, [SLM_IEC_RSV1], 3>; defm : SLMWriteResPair<WriteIMul64, [SLM_IEC_RSV1], 3>; -defm : SLMWriteResPair<WriteBSWAP32,[SLM_IEC_RSV01], 1>; -defm : SLMWriteResPair<WriteBSWAP64,[SLM_IEC_RSV01], 1>; +defm : X86WriteRes<WriteBSWAP32, [SLM_IEC_RSV01], 1, [1], 1>; +defm : X86WriteRes<WriteBSWAP64, [SLM_IEC_RSV01], 1, [1], 1>; defm : SLMWriteResPair<WriteShift, [SLM_IEC_RSV0], 1>; -defm : SLMWriteResPair<WriteShiftDouble, [SLM_IEC_RSV0], 1>; + +defm : X86WriteRes<WriteSHDrri, [SLM_IEC_RSV0], 1, [1], 1>; +defm : X86WriteRes<WriteSHDrrcl,[SLM_IEC_RSV0], 1, [1], 1>; +defm : X86WriteRes<WriteSHDmri, [SLM_MEC_RSV, SLM_IEC_RSV0], 4, [2, 1], 2>; +defm : X86WriteRes<WriteSHDmrcl,[SLM_MEC_RSV, SLM_IEC_RSV0], 4, [2, 1], 2>; + defm : SLMWriteResPair<WriteJump, [SLM_IEC_RSV1], 1>; defm : SLMWriteResPair<WriteCRC32, [SLM_IEC_RSV1], 3>; @@ -115,6 +120,7 @@ def : WriteRes<WriteSETCCStore, [SLM_IEC_RSV01, SLM_MEC_RSV]> { let ResourceCycles = [2,1]; } def : WriteRes<WriteLAHFSAHF, [SLM_IEC_RSV01]>; +def : WriteRes<WriteBitTest,[SLM_IEC_RSV01]>; // This is for simple LEAs with one or two input operands. // The complex ones can only execute on port 1, and they require two cycles on diff --git a/contrib/llvm/lib/Target/X86/X86ScheduleZnver1.td b/contrib/llvm/lib/Target/X86/X86ScheduleZnver1.td index d28d58580752..7184b850a195 100644 --- a/contrib/llvm/lib/Target/X86/X86ScheduleZnver1.td +++ b/contrib/llvm/lib/Target/X86/X86ScheduleZnver1.td @@ -180,11 +180,16 @@ defm : ZnWriteResPair<WriteADC, [ZnALU], 1>; defm : ZnWriteResPair<WriteIMul, [ZnALU1, ZnMultiplier], 4>; defm : ZnWriteResPair<WriteIMul64, [ZnALU1, ZnMultiplier], 4, [1,1], 2>; -defm : ZnWriteResPair<WriteBSWAP32,[ZnALU], 1, [4]>; -defm : ZnWriteResPair<WriteBSWAP64,[ZnALU], 1, [4]>; +defm : X86WriteRes<WriteBSWAP32, [ZnALU], 1, [4], 1>; +defm : X86WriteRes<WriteBSWAP64, [ZnALU], 1, [4], 1>; defm : ZnWriteResPair<WriteShift, [ZnALU], 1>; -defm : ZnWriteResPair<WriteShiftDouble, [ZnALU], 1>; + +defm : X86WriteRes<WriteSHDrri, [ZnALU], 1, [1], 1>; +defm : X86WriteResUnsupported<WriteSHDrrcl>; +defm : X86WriteResUnsupported<WriteSHDmri>; +defm : X86WriteResUnsupported<WriteSHDmrcl>; + defm : ZnWriteResPair<WriteJump, [ZnALU], 1>; defm : ZnWriteResFpuPair<WriteCRC32, [ZnFPU0], 3>; @@ -193,6 +198,7 @@ defm : ZnWriteResPair<WriteCMOV2, [ZnALU], 1>; def : WriteRes<WriteSETCC, [ZnALU]>; def : WriteRes<WriteSETCCStore, [ZnALU, ZnAGU]>; defm : X86WriteRes<WriteLAHFSAHF, [ZnALU], 2, [1], 2>; +def : WriteRes<WriteBitTest,[ZnALU]>; // Bit counts. 
defm : ZnWriteResPair<WriteBSF, [ZnALU], 3>; diff --git a/contrib/llvm/lib/Target/X86/X86Subtarget.h b/contrib/llvm/lib/Target/X86/X86Subtarget.h index fedb13f89e19..85e8256a6e94 100644 --- a/contrib/llvm/lib/Target/X86/X86Subtarget.h +++ b/contrib/llvm/lib/Target/X86/X86Subtarget.h @@ -51,7 +51,7 @@ enum Style { } // end namespace PICStyles class X86Subtarget final : public X86GenSubtargetInfo { -public: +public: enum X86ProcFamilyEnum { Others, IntelAtom, diff --git a/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index bae2ef80c365..865462622627 100644 --- a/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -2274,8 +2274,8 @@ int X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) { // Sign-extend all constants to a multiple of 64-bit. APInt ImmVal = Imm; - if (BitSize & 0x3f) - ImmVal = Imm.sext((BitSize + 63) & ~0x3fU); + if (BitSize % 64 != 0) + ImmVal = Imm.sext(alignTo(BitSize, 64)); // Split the constant into 64-bit chunks and calculate the cost for each // chunk. @@ -2332,9 +2332,15 @@ int X86TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, // immediates here as the normal path expects bit 31 to be sign extended. if (Idx == 1 && Imm.getBitWidth() == 64 && isUInt<32>(Imm.getZExtValue())) return TTI::TCC_Free; - LLVM_FALLTHROUGH; + ImmIdx = 1; + break; case Instruction::Add: case Instruction::Sub: + // For add/sub, we can use the opposite instruction for INT32_MIN. + if (Idx == 1 && Imm.getBitWidth() == 64 && Imm.getZExtValue() == 0x80000000) + return TTI::TCC_Free; + ImmIdx = 1; + break; case Instruction::Mul: case Instruction::UDiv: case Instruction::SDiv: @@ -2366,7 +2372,7 @@ int X86TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, } if (Idx == ImmIdx) { - int NumConstants = (BitSize + 63) / 64; + int NumConstants = divideCeil(BitSize, 64); int Cost = X86TTIImpl::getIntImmCost(Imm, Ty); return (Cost <= NumConstants * TTI::TCC_Basic) ? static_cast<int>(TTI::TCC_Free) diff --git a/contrib/llvm/lib/Target/XCore/XCoreAsmPrinter.cpp b/contrib/llvm/lib/Target/XCore/XCoreAsmPrinter.cpp index 8f7c8a82380a..916bca6392de 100644 --- a/contrib/llvm/lib/Target/XCore/XCoreAsmPrinter.cpp +++ b/contrib/llvm/lib/Target/XCore/XCoreAsmPrinter.cpp @@ -146,7 +146,7 @@ void XCoreAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) { } EmitAlignment(Align > 2 ? Align : 2, GV); - + if (GV->isThreadLocal()) { report_fatal_error("TLS is not supported by this target!"); } @@ -162,7 +162,7 @@ void XCoreAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) { // are padded to 32 bits. if (Size < 4) OutStreamer->EmitZeros(4 - Size); - + // Mark the end of the global getTargetStreamer().emitCCBottomData(GVSym->getName()); } @@ -295,6 +295,6 @@ void XCoreAsmPrinter::EmitInstruction(const MachineInstr *MI) { } // Force static initialization. 
-extern "C" void LLVMInitializeXCoreAsmPrinter() { +extern "C" void LLVMInitializeXCoreAsmPrinter() { RegisterAsmPrinter<XCoreAsmPrinter> X(getTheXCoreTarget()); } diff --git a/contrib/llvm/lib/Target/XCore/XCoreInstrInfo.cpp b/contrib/llvm/lib/Target/XCore/XCoreInstrInfo.cpp index d5e276788f71..b0de048672df 100644 --- a/contrib/llvm/lib/Target/XCore/XCoreInstrInfo.cpp +++ b/contrib/llvm/lib/Target/XCore/XCoreInstrInfo.cpp @@ -63,7 +63,7 @@ static bool isZeroImm(const MachineOperand &op) { unsigned XCoreInstrInfo::isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const { int Opcode = MI.getOpcode(); - if (Opcode == XCore::LDWFI) + if (Opcode == XCore::LDWFI) { if ((MI.getOperand(1).isFI()) && // is a stack slot (MI.getOperand(2).isImm()) && // the imm is zero @@ -74,7 +74,7 @@ unsigned XCoreInstrInfo::isLoadFromStackSlot(const MachineInstr &MI, } return 0; } - + /// isStoreToStackSlot - If the specified machine instruction is a direct /// store to a stack slot, return the virtual or physical register number of /// the source reg along with the FrameIndex of the loaded stack slot. If @@ -129,9 +129,9 @@ static inline bool IsBR_JT(unsigned BrOpc) { || BrOpc == XCore::BR_JT32; } -/// GetCondFromBranchOpc - Return the XCore CC that matches +/// GetCondFromBranchOpc - Return the XCore CC that matches /// the correspondent Branch instruction opcode. -static XCore::CondCode GetCondFromBranchOpc(unsigned BrOpc) +static XCore::CondCode GetCondFromBranchOpc(unsigned BrOpc) { if (IsBRT(BrOpc)) { return XCore::COND_TRUE; @@ -144,7 +144,7 @@ static XCore::CondCode GetCondFromBranchOpc(unsigned BrOpc) /// GetCondBranchFromCond - Return the Branch instruction /// opcode that matches the cc. -static inline unsigned GetCondBranchFromCond(XCore::CondCode CC) +static inline unsigned GetCondBranchFromCond(XCore::CondCode CC) { switch (CC) { default: llvm_unreachable("Illegal condition code!"); @@ -153,7 +153,7 @@ static inline unsigned GetCondBranchFromCond(XCore::CondCode CC) } } -/// GetOppositeBranchCondition - Return the inverse of the specified +/// GetOppositeBranchCondition - Return the inverse of the specified /// condition, e.g. turning COND_E to COND_NE. static inline XCore::CondCode GetOppositeBranchCondition(XCore::CondCode CC) { @@ -209,11 +209,11 @@ bool XCoreInstrInfo::analyzeBranch(MachineBasicBlock &MBB, TBB = LastInst->getOperand(0).getMBB(); return false; } - + XCore::CondCode BranchCode = GetCondFromBranchOpc(LastInst->getOpcode()); if (BranchCode == XCore::COND_INVALID) return true; // Can't handle indirect branch. - + // Conditional branch // Block ends with fall-through condbranch. @@ -222,17 +222,17 @@ bool XCoreInstrInfo::analyzeBranch(MachineBasicBlock &MBB, Cond.push_back(LastInst->getOperand(0)); return false; } - + // Get the instruction before it if it's a terminator. MachineInstr *SecondLastInst = &*I; // If there are three terminators, we don't know what sort of block this is. if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(*--I)) return true; - + unsigned SecondLastOpc = SecondLastInst->getOpcode(); XCore::CondCode BranchCode = GetCondFromBranchOpc(SecondLastOpc); - + // If the block ends with conditional branch followed by unconditional, // handle it. if (BranchCode != XCore::COND_INVALID @@ -245,10 +245,10 @@ bool XCoreInstrInfo::analyzeBranch(MachineBasicBlock &MBB, FBB = LastInst->getOperand(0).getMBB(); return false; } - + // If the block ends with two unconditional branches, handle it. The second // one is not executed, so remove it. 
- if (IsBRU(SecondLastInst->getOpcode()) && + if (IsBRU(SecondLastInst->getOpcode()) && IsBRU(LastInst->getOpcode())) { TBB = SecondLastInst->getOperand(0).getMBB(); I = LastInst; @@ -293,7 +293,7 @@ unsigned XCoreInstrInfo::insertBranch(MachineBasicBlock &MBB, } return 1; } - + // Two-way Conditional branch. assert(Cond.size() == 2 && "Unexpected number of components!"); unsigned Opc = GetCondBranchFromCond((XCore::CondCode)Cond[0].getImm()); @@ -313,17 +313,17 @@ XCoreInstrInfo::removeBranch(MachineBasicBlock &MBB, int *BytesRemoved) const { if (!IsBRU(I->getOpcode()) && !IsCondBranch(I->getOpcode())) return 0; - + // Remove the branch. I->eraseFromParent(); - + I = MBB.end(); if (I == MBB.begin()) return 1; --I; if (!IsCondBranch(I->getOpcode())) return 1; - + // Remove the branch. I->eraseFromParent(); return 2; @@ -342,7 +342,7 @@ void XCoreInstrInfo::copyPhysReg(MachineBasicBlock &MBB, .addImm(0); return; } - + if (GRDest && SrcReg == XCore::SP) { BuildMI(MBB, I, DL, get(XCore::LDAWSP_ru6), DestReg).addImm(0); return; diff --git a/contrib/llvm/lib/Target/XCore/XCoreMachineFunctionInfo.h b/contrib/llvm/lib/Target/XCore/XCoreMachineFunctionInfo.h index cf469ec3cf1a..6c05ab3f10df 100644 --- a/contrib/llvm/lib/Target/XCore/XCoreMachineFunctionInfo.h +++ b/contrib/llvm/lib/Target/XCore/XCoreMachineFunctionInfo.h @@ -43,11 +43,11 @@ class XCoreFunctionInfo : public MachineFunctionInfo { public: XCoreFunctionInfo() = default; - + explicit XCoreFunctionInfo(MachineFunction &MF) {} - + ~XCoreFunctionInfo() override = default; - + void setVarArgsFrameIndex(int off) { VarArgsFrameIndex = off; } int getVarArgsFrameIndex() const { return VarArgsFrameIndex; } diff --git a/contrib/llvm/lib/Target/XCore/XCoreRegisterInfo.cpp b/contrib/llvm/lib/Target/XCore/XCoreRegisterInfo.cpp index 1915aaedc35d..e119d9555f9d 100644 --- a/contrib/llvm/lib/Target/XCore/XCoreRegisterInfo.cpp +++ b/contrib/llvm/lib/Target/XCore/XCoreRegisterInfo.cpp @@ -296,12 +296,12 @@ XCoreRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, // fold constant into offset. 
Offset += MI.getOperand(FIOperandNum + 1).getImm();
MI.getOperand(FIOperandNum + 1).ChangeToImmediate(0);
-
+
assert(Offset%4 == 0 && "Misaligned stack offset");
LLVM_DEBUG(errs() << "Offset : " << Offset << "\n"
<< "<--------->\n");
Offset/=4;
-
+
unsigned Reg = MI.getOperand(0).getReg();
assert(XCore::GRRegsRegClass.contains(Reg) &&
"Unexpected register operand");
diff --git a/contrib/llvm/lib/Target/XCore/XCoreRegisterInfo.h b/contrib/llvm/lib/Target/XCore/XCoreRegisterInfo.h
index c31f5d5a7c44..9451a05d8d58 100644
--- a/contrib/llvm/lib/Target/XCore/XCoreRegisterInfo.h
+++ b/contrib/llvm/lib/Target/XCore/XCoreRegisterInfo.h
@@ -32,7 +32,7 @@ public:
  const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const
  override;
  BitVector getReservedRegs(const MachineFunction &MF) const override;
-
+
  bool enableMultipleCopyHints() const override { return true; }

  bool requiresRegisterScavenging(const MachineFunction &MF) const override;
diff --git a/contrib/llvm/lib/Target/XCore/XCoreSubtarget.h b/contrib/llvm/lib/Target/XCore/XCoreSubtarget.h
index 140ddba68aab..ed9936ebf2b8 100644
--- a/contrib/llvm/lib/Target/XCore/XCoreSubtarget.h
+++ b/contrib/llvm/lib/Target/XCore/XCoreSubtarget.h
@@ -43,7 +43,7 @@ public:
  XCoreSubtarget(const Triple &TT, const std::string &CPU,
                 const std::string &FS, const TargetMachine &TM);

-  /// ParseSubtargetFeatures - Parses features string setting specified
+  /// ParseSubtargetFeatures - Parses features string setting specified
  /// subtarget options. Definition of function is auto generated by tblgen.
  void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
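A note on the X86TargetTransformInfo.cpp hunk above: the open-coded
"(BitSize + 63) & ~0x3fU" and "(BitSize + 63) / 64" are replaced by the named
llvm/Support/MathExtras.h helpers. A minimal sketch of their semantics, not
the LLVM implementation:

    #include <cstdint>

    // Round v up to the next multiple of a (a > 0).
    static uint64_t alignTo(uint64_t v, uint64_t a) {
      return (v + a - 1) / a * a;
    }

    // Number of d-sized chunks needed to cover n, rounding up.
    static uint64_t divideCeil(uint64_t n, uint64_t d) {
      return (n + d - 1) / d;
    }

    // e.g. alignTo(17, 64) == 64, so Imm.sext(alignTo(BitSize, 64)) widens a
    // 17-bit immediate to one 64-bit chunk; divideCeil(65, 64) == 2 chunks.

These two substitutions are behavior-preserving; the Add/Sub change in the
same hunk (treating a 64-bit 0x80000000 immediate as free, since the opposite
instruction can be used for INT32_MIN) is a separate functional tweak.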