Diffstat (limited to 'lib/Target')
116 files changed, 1845 insertions, 863 deletions
diff --git a/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp index d0c0956b87ca..629ad5c61b78 100644 --- a/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ b/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -942,6 +942,7 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, AArch64::XZR, NextMBBI); case AArch64::CMP_SWAP_128: return expandCMP_SWAP_128(MBB, MBBI, NextMBBI); + } return false; } diff --git a/lib/Target/AArch64/AArch64FrameLowering.cpp b/lib/Target/AArch64/AArch64FrameLowering.cpp index 550174b22a89..dc916c034661 100644 --- a/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -1125,7 +1125,7 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, if (RegInfo->hasBasePointer(MF)) BasePointerReg = RegInfo->getBaseRegister(); - bool ExtraCSSpill = false; + unsigned ExtraCSSpill = 0; const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF); // Figure out which callee-saved registers to save/restore. for (unsigned i = 0; CSRegs[i]; ++i) { @@ -1153,7 +1153,7 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, SavedRegs.set(PairedReg); if (AArch64::GPR64RegClass.contains(PairedReg) && !RegInfo->isReservedReg(MF, PairedReg)) - ExtraCSSpill = true; + ExtraCSSpill = PairedReg; } } @@ -1186,8 +1186,8 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, // register scavenging. If we already spilled an extra callee-saved register // above to keep the number of spills even, we don't need to do anything else // here. - if (BigStack && !ExtraCSSpill) { - if (UnspilledCSGPR != AArch64::NoRegister) { + if (BigStack) { + if (!ExtraCSSpill && UnspilledCSGPR != AArch64::NoRegister) { DEBUG(dbgs() << "Spilling " << PrintReg(UnspilledCSGPR, RegInfo) << " to get a scratch register.\n"); SavedRegs.set(UnspilledCSGPR); @@ -1196,15 +1196,18 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, // store the pair. if (produceCompactUnwindFrame(MF)) SavedRegs.set(UnspilledCSGPRPaired); - ExtraCSSpill = true; + ExtraCSSpill = UnspilledCSGPRPaired; NumRegsSpilled = SavedRegs.count(); } // If we didn't find an extra callee-saved register to spill, create // an emergency spill slot. 
- if (!ExtraCSSpill) { - const TargetRegisterClass *RC = &AArch64::GPR64RegClass; - int FI = MFI.CreateStackObject(RC->getSize(), RC->getAlignment(), false); + if (!ExtraCSSpill || MF.getRegInfo().isPhysRegUsed(ExtraCSSpill)) { + const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); + const TargetRegisterClass &RC = AArch64::GPR64RegClass; + unsigned Size = TRI->getSpillSize(RC); + unsigned Align = TRI->getSpillAlignment(RC); + int FI = MFI.CreateStackObject(Size, Align, false); RS->addScavengingFrameIndex(FI); DEBUG(dbgs() << "No available CS registers, allocated fi#" << FI << " as the emergency spill slot.\n"); diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp index 4ddc95199d4c..a7c98fbb425f 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -91,6 +91,7 @@ using namespace llvm; STATISTIC(NumTailCalls, "Number of tail calls"); STATISTIC(NumShiftInserts, "Number of vector shift inserts"); +STATISTIC(NumOptimizedImms, "Number of times immediates were optimized"); static cl::opt<bool> EnableAArch64SlrGeneration("aarch64-shift-insert-generation", cl::Hidden, @@ -105,6 +106,12 @@ cl::opt<bool> EnableAArch64ELFLocalDynamicTLSGeneration( cl::desc("Allow AArch64 Local Dynamic TLS code generation"), cl::init(false)); +static cl::opt<bool> +EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden, + cl::desc("Enable AArch64 logical imm instruction " + "optimization"), + cl::init(true)); + /// Value type used for condition codes. static const MVT MVT_CC = MVT::i32; @@ -787,6 +794,140 @@ EVT AArch64TargetLowering::getSetCCResultType(const DataLayout &, LLVMContext &, return VT.changeVectorElementTypeToInteger(); } +static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm, + const APInt &Demanded, + TargetLowering::TargetLoweringOpt &TLO, + unsigned NewOpc) { + uint64_t OldImm = Imm, NewImm, Enc; + uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask; + + // Return if the immediate is already all zeros, all ones, a bimm32 or a + // bimm64. + if (Imm == 0 || Imm == Mask || + AArch64_AM::isLogicalImmediate(Imm & Mask, Size)) + return false; + + unsigned EltSize = Size; + uint64_t DemandedBits = Demanded.getZExtValue(); + + // Clear bits that are not demanded. + Imm &= DemandedBits; + + while (true) { + // The goal here is to set the non-demanded bits in a way that minimizes + // the number of transitions between 0 and 1. In order to achieve this goal, + // we set the non-demanded bits to the value of the preceding demanded bits. + // For example, if we have an immediate 0bx10xx0x1 ('x' indicates a + // non-demanded bit), we copy bit0 (1) to the least significant 'x', + // bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'. + // The final result is 0b11000011. + uint64_t NonDemandedBits = ~DemandedBits; + uint64_t InvertedImm = ~Imm & DemandedBits; + uint64_t RotatedImm = + ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) & + NonDemandedBits; + uint64_t Sum = RotatedImm + NonDemandedBits; + bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1)); + uint64_t Ones = (Sum + Carry) & NonDemandedBits; + NewImm = (Imm | Ones) & Mask; + + // If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate + // or all-ones or all-zeros, in which case we can stop searching. Otherwise, + // we halve the element size and continue the search.
+ if (isShiftedMask_64(NewImm) || isShiftedMask_64(~(NewImm | ~Mask))) + break; + + // We cannot shrink the element size any further if it is 2-bits. + if (EltSize == 2) + return false; + + EltSize /= 2; + Mask >>= EltSize; + uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize; + + // Return if there is mismatch in any of the demanded bits of Imm and Hi. + if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0) + return false; + + // Merge the upper and lower halves of Imm and DemandedBits. + Imm |= Hi; + DemandedBits |= DemandedBitsHi; + } + + ++NumOptimizedImms; + + // Replicate the element across the register width. + while (EltSize < Size) { + NewImm |= NewImm << EltSize; + EltSize *= 2; + } + + (void)OldImm; + assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 && + "demanded bits should never be altered"); + assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm"); + + // Create the new constant immediate node. + EVT VT = Op.getValueType(); + SDLoc DL(Op); + + // If the new constant immediate is all-zeros or all-ones, let the target + // independent DAG combine optimize this node. + if (NewImm == 0 || NewImm == OrigMask) + return TLO.CombineTo(Op.getOperand(1), TLO.DAG.getConstant(NewImm, DL, VT)); + + // Otherwise, create a machine node so that target independent DAG combine + // doesn't undo this optimization. + Enc = AArch64_AM::encodeLogicalImmediate(NewImm, Size); + SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT); + SDValue New( + TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0); + + return TLO.CombineTo(Op, New); +} + +bool AArch64TargetLowering::targetShrinkDemandedConstant( + SDValue Op, const APInt &Demanded, TargetLoweringOpt &TLO) const { + // Delay this optimization to as late as possible. + if (!TLO.LegalOps) + return false; + + if (!EnableOptimizeLogicalImm) + return false; + + EVT VT = Op.getValueType(); + if (VT.isVector()) + return false; + + unsigned Size = VT.getSizeInBits(); + assert((Size == 32 || Size == 64) && + "i32 or i64 is expected after legalization."); + + // Exit early if we demand all bits. + if (Demanded.countPopulation() == Size) + return false; + + unsigned NewOpc; + switch (Op.getOpcode()) { + default: + return false; + case ISD::AND: + NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri; + break; + case ISD::OR: + NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri; + break; + case ISD::XOR: + NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri; + break; + } + ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1)); + if (!C) + return false; + uint64_t Imm = C->getZExtValue(); + return optimizeLogicalImm(Op, Size, Imm, Demanded, TLO, NewOpc); +} + /// computeKnownBitsForTargetNode - Determine which of the bits specified in /// Mask are known to be either zero or one and return them in the /// KnownZero/KnownOne bitsets. 
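The loop body in optimizeLogicalImm above is dense bit manipulation. As a sanity check, here is a minimal standalone C++ sketch (the helper and driver names are illustrative, not LLVM code) that runs one iteration of that body on the worked example from the comment:

  #include <cstdint>
  #include <cstdio>

  // Fill each non-demanded bit with the value of the preceding (lower)
  // demanded bit, minimizing 0/1 transitions so the result is more likely
  // to be encodable as a bitmask immediate. Mirrors the loop body above.
  static uint64_t fillNonDemandedBits(uint64_t Imm, uint64_t DemandedBits,
                                      unsigned EltSize, uint64_t Mask) {
    Imm &= DemandedBits;                        // clear non-demanded bits
    uint64_t NonDemandedBits = ~DemandedBits;
    uint64_t InvertedImm = ~Imm & DemandedBits; // demanded bits that are 0
    // Rotate left by one so each 'x' position sees the demanded bit below it.
    uint64_t RotatedImm =
        ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) &
        NonDemandedBits;
    // The add sweeps a carry through each run of 'x' bits that follows a
    // demanded 0, clearing that run; runs following a demanded 1 stay set.
    uint64_t Sum = RotatedImm + NonDemandedBits;
    bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1));
    uint64_t Ones = (Sum + Carry) & NonDemandedBits;
    return (Imm | Ones) & Mask;
  }

  int main() {
    // The comment's example 0bx10xx0x1: demanded bits 0, 2, 5, 6 hold the
    // values 1, 0, 0, 1; bits 1, 3, 4, 7 are the non-demanded 'x' bits.
    uint64_t NewImm = fillNonDemandedBits(/*Imm=*/0x41, /*DemandedBits=*/0x65,
                                          /*EltSize=*/8, /*Mask=*/0xFF);
    printf("0x%llx\n", (unsigned long long)NewImm); // 0xc3, i.e. 0b11000011
    return 0;
  }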
@@ -3418,11 +3559,75 @@ AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, // Other Lowering Code //===----------------------------------------------------------------------===// +SDValue AArch64TargetLowering::getTargetNode(GlobalAddressSDNode *N, EVT Ty, + SelectionDAG &DAG, + unsigned Flag) const { + return DAG.getTargetGlobalAddress(N->getGlobal(), SDLoc(N), Ty, 0, Flag); +} + +SDValue AArch64TargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty, + SelectionDAG &DAG, + unsigned Flag) const { + return DAG.getTargetJumpTable(N->getIndex(), Ty, Flag); +} + +SDValue AArch64TargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty, + SelectionDAG &DAG, + unsigned Flag) const { + return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlignment(), + N->getOffset(), Flag); +} + +SDValue AArch64TargetLowering::getTargetNode(BlockAddressSDNode* N, EVT Ty, + SelectionDAG &DAG, + unsigned Flag) const { + return DAG.getTargetBlockAddress(N->getBlockAddress(), Ty, 0, Flag); +} + +// (loadGOT sym) +template <class NodeTy> +SDValue AArch64TargetLowering::getGOT(NodeTy *N, SelectionDAG &DAG) const { + DEBUG(dbgs() << "AArch64TargetLowering::getGOT\n"); + SDLoc DL(N); + EVT Ty = getPointerTy(DAG.getDataLayout()); + SDValue GotAddr = getTargetNode(N, Ty, DAG, AArch64II::MO_GOT); + // FIXME: Once remat is capable of dealing with instructions with register + // operands, expand this into two nodes instead of using a wrapper node. + return DAG.getNode(AArch64ISD::LOADgot, DL, Ty, GotAddr); +} + +// (wrapper %highest(sym), %higher(sym), %hi(sym), %lo(sym)) +template <class NodeTy> +SDValue AArch64TargetLowering::getAddrLarge(NodeTy *N, SelectionDAG &DAG) + const { + DEBUG(dbgs() << "AArch64TargetLowering::getAddrLarge\n"); + SDLoc DL(N); + EVT Ty = getPointerTy(DAG.getDataLayout()); + const unsigned char MO_NC = AArch64II::MO_NC; + return DAG.getNode( + AArch64ISD::WrapperLarge, DL, Ty, + getTargetNode(N, Ty, DAG, AArch64II::MO_G3), + getTargetNode(N, Ty, DAG, AArch64II::MO_G2 | MO_NC), + getTargetNode(N, Ty, DAG, AArch64II::MO_G1 | MO_NC), + getTargetNode(N, Ty, DAG, AArch64II::MO_G0 | MO_NC)); +} + +// (addlow (adrp %hi(sym)) %lo(sym)) +template <class NodeTy> +SDValue AArch64TargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG) const { + DEBUG(dbgs() << "AArch64TargetLowering::getAddr\n"); + SDLoc DL(N); + EVT Ty = getPointerTy(DAG.getDataLayout()); + SDValue Hi = getTargetNode(N, Ty, DAG, AArch64II::MO_PAGE); + SDValue Lo = getTargetNode(N, Ty, DAG, + AArch64II::MO_PAGEOFF | AArch64II::MO_NC); + SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, Ty, Hi); + return DAG.getNode(AArch64ISD::ADDlow, DL, Ty, ADRP, Lo); +} + SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { - EVT PtrVT = getPointerTy(DAG.getDataLayout()); - SDLoc DL(Op); - const GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op); + GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op); const GlobalValue *GV = GN->getGlobal(); unsigned char OpFlags = Subtarget->ClassifyGlobalReference(GV, getTargetMachine()); @@ -3430,32 +3635,15 @@ SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op, assert(cast<GlobalAddressSDNode>(Op)->getOffset() == 0 && "unexpected offset in global node"); - // This also catched the large code model case for Darwin. + // This also catches the large code model case for Darwin. 
if ((OpFlags & AArch64II::MO_GOT) != 0) { - SDValue GotAddr = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags); - // FIXME: Once remat is capable of dealing with instructions with register - // operands, expand this into two nodes instead of using a wrapper node. - return DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, GotAddr); + return getGOT(GN, DAG); } if (getTargetMachine().getCodeModel() == CodeModel::Large) { - const unsigned char MO_NC = AArch64II::MO_NC; - return DAG.getNode( - AArch64ISD::WrapperLarge, DL, PtrVT, - DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G3), - DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G2 | MO_NC), - DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G1 | MO_NC), - DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G0 | MO_NC)); + return getAddrLarge(GN, DAG); } else { - // Use ADRP/ADD or ADRP/LDR for everything else: the small model on ELF and - // the only correct model on Darwin. - SDValue Hi = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, - OpFlags | AArch64II::MO_PAGE); - unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC; - SDValue Lo = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, LoFlags); - - SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi); - return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo); + return getAddr(GN, DAG); } } @@ -4232,90 +4420,37 @@ SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op, // Jump table entries as PC relative offsets. No additional tweaking // is necessary here. Just get the address of the jump table. JumpTableSDNode *JT = cast<JumpTableSDNode>(Op); - EVT PtrVT = getPointerTy(DAG.getDataLayout()); - SDLoc DL(Op); if (getTargetMachine().getCodeModel() == CodeModel::Large && !Subtarget->isTargetMachO()) { - const unsigned char MO_NC = AArch64II::MO_NC; - return DAG.getNode( - AArch64ISD::WrapperLarge, DL, PtrVT, - DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_G3), - DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_G2 | MO_NC), - DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_G1 | MO_NC), - DAG.getTargetJumpTable(JT->getIndex(), PtrVT, - AArch64II::MO_G0 | MO_NC)); + return getAddrLarge(JT, DAG); } - - SDValue Hi = - DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_PAGE); - SDValue Lo = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, - AArch64II::MO_PAGEOFF | AArch64II::MO_NC); - SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi); - return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo); + return getAddr(JT, DAG); } SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const { ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op); - EVT PtrVT = getPointerTy(DAG.getDataLayout()); - SDLoc DL(Op); if (getTargetMachine().getCodeModel() == CodeModel::Large) { // Use the GOT for the large code model on iOS. 
if (Subtarget->isTargetMachO()) { - SDValue GotAddr = DAG.getTargetConstantPool( - CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), - AArch64II::MO_GOT); - return DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, GotAddr); + return getGOT(CP, DAG); } - - const unsigned char MO_NC = AArch64II::MO_NC; - return DAG.getNode( - AArch64ISD::WrapperLarge, DL, PtrVT, - DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), - CP->getOffset(), AArch64II::MO_G3), - DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), - CP->getOffset(), AArch64II::MO_G2 | MO_NC), - DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), - CP->getOffset(), AArch64II::MO_G1 | MO_NC), - DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), - CP->getOffset(), AArch64II::MO_G0 | MO_NC)); + return getAddrLarge(CP, DAG); } else { - // Use ADRP/ADD or ADRP/LDR for everything else: the small memory model on - // ELF, the only valid one on Darwin. - SDValue Hi = - DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), - CP->getOffset(), AArch64II::MO_PAGE); - SDValue Lo = DAG.getTargetConstantPool( - CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), - AArch64II::MO_PAGEOFF | AArch64II::MO_NC); - - SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi); - return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo); + return getAddr(CP, DAG); } } SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { - const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress(); - EVT PtrVT = getPointerTy(DAG.getDataLayout()); - SDLoc DL(Op); + BlockAddressSDNode *BA = cast<BlockAddressSDNode>(Op); if (getTargetMachine().getCodeModel() == CodeModel::Large && !Subtarget->isTargetMachO()) { - const unsigned char MO_NC = AArch64II::MO_NC; - return DAG.getNode( - AArch64ISD::WrapperLarge, DL, PtrVT, - DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G3), - DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G2 | MO_NC), - DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G1 | MO_NC), - DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G0 | MO_NC)); + return getAddrLarge(BA, DAG); } else { - SDValue Hi = DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_PAGE); - SDValue Lo = DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_PAGEOFF | - AArch64II::MO_NC); - SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi); - return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo); + return getAddr(BA, DAG); } } diff --git a/lib/Target/AArch64/AArch64ISelLowering.h b/lib/Target/AArch64/AArch64ISelLowering.h index a023b4373835..6081b07479b9 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.h +++ b/lib/Target/AArch64/AArch64ISelLowering.h @@ -255,6 +255,9 @@ public: const SelectionDAG &DAG, unsigned Depth = 0) const override; + bool targetShrinkDemandedConstant(SDValue Op, const APInt &Demanded, + TargetLoweringOpt &TLO) const override; + MVT getScalarShiftAmountTy(const DataLayout &DL, EVT) const override; /// Returns true if the target allows unaligned memory accesses of the @@ -508,6 +511,18 @@ private: const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override; + SDValue getTargetNode(GlobalAddressSDNode *N, EVT Ty, SelectionDAG &DAG, + unsigned Flag) const; + SDValue getTargetNode(JumpTableSDNode *N, EVT Ty, SelectionDAG &DAG, + unsigned Flag) const; + SDValue getTargetNode(ConstantPoolSDNode *N, EVT Ty, SelectionDAG &DAG, + unsigned Flag) 
const; + SDValue getTargetNode(BlockAddressSDNode *N, EVT Ty, SelectionDAG &DAG, + unsigned Flag) const; + template <class NodeTy> SDValue getGOT(NodeTy *N, SelectionDAG &DAG) const; + template <class NodeTy> + SDValue getAddrLarge(NodeTy *N, SelectionDAG &DAG) const; + template <class NodeTy> SDValue getAddr(NodeTy *N, SelectionDAG &DAG) const; SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDarwinGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; diff --git a/lib/Target/AArch64/AArch64InstrAtomics.td b/lib/Target/AArch64/AArch64InstrAtomics.td index 867074c3c374..71826bec6b11 100644 --- a/lib/Target/AArch64/AArch64InstrAtomics.td +++ b/lib/Target/AArch64/AArch64InstrAtomics.td @@ -14,6 +14,9 @@ //===---------------------------------- // Atomic fences //===---------------------------------- +let AddedComplexity = 15, Size = 0 in +def CompilerBarrier : Pseudo<(outs), (ins i32imm:$ordering), + [(atomic_fence imm:$ordering, 0)]>, Sched<[]>; def : Pat<(atomic_fence (i64 4), (imm)), (DMB (i32 0x9))>; def : Pat<(atomic_fence (imm), (imm)), (DMB (i32 0xb))>; diff --git a/lib/Target/AArch64/AArch64InstrFormats.td b/lib/Target/AArch64/AArch64InstrFormats.td index 16be4432b160..c44daf306ea9 100644 --- a/lib/Target/AArch64/AArch64InstrFormats.td +++ b/lib/Target/AArch64/AArch64InstrFormats.td @@ -693,11 +693,11 @@ def addsub_shifted_imm32_neg : addsub_shifted_imm_neg<i32>; def addsub_shifted_imm64_neg : addsub_shifted_imm_neg<i64>; def gi_addsub_shifted_imm32 : - GIComplexOperandMatcher<s32, (ops i32imm, i32imm), "selectArithImmed">, + GIComplexOperandMatcher<s32, "selectArithImmed">, GIComplexPatternEquiv<addsub_shifted_imm32>; def gi_addsub_shifted_imm64 : - GIComplexOperandMatcher<s64, (ops i32imm, i32imm), "selectArithImmed">, + GIComplexOperandMatcher<s64, "selectArithImmed">, GIComplexPatternEquiv<addsub_shifted_imm64>; class neg_addsub_shifted_imm<ValueType Ty> diff --git a/lib/Target/AArch64/AArch64InstrInfo.cpp b/lib/Target/AArch64/AArch64InstrInfo.cpp index 41fc8eceab5c..cb268828455e 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -2320,7 +2320,7 @@ void AArch64InstrInfo::storeRegToStackSlot( PtrInfo, MachineMemOperand::MOStore, MFI.getObjectSize(FI), Align); unsigned Opc = 0; bool Offset = true; - switch (RC->getSize()) { + switch (TRI->getSpillSize(*RC)) { case 1: if (AArch64::FPR8RegClass.hasSubClassEq(RC)) Opc = AArch64::STRBui; @@ -2424,7 +2424,7 @@ void AArch64InstrInfo::loadRegFromStackSlot( unsigned Opc = 0; bool Offset = true; - switch (RC->getSize()) { + switch (TRI->getSpillSize(*RC)) { case 1: if (AArch64::FPR8RegClass.hasSubClassEq(RC)) Opc = AArch64::LDRBui; @@ -2649,7 +2649,8 @@ MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl( }; if (DstMO.getSubReg() == 0 && SrcMO.getSubReg() == 0) { - assert(getRegClass(DstReg)->getSize() == getRegClass(SrcReg)->getSize() && + assert(TRI.getRegSizeInBits(*getRegClass(DstReg)) == + TRI.getRegSizeInBits(*getRegClass(SrcReg)) && "Mismatched register size in non subreg COPY"); if (IsSpill) storeRegToStackSlot(MBB, InsertPt, SrcReg, SrcMO.isKill(), FrameIndex, @@ -2735,7 +2736,8 @@ MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl( } if (FillRC) { - assert(getRegClass(SrcReg)->getSize() == FillRC->getSize() && + assert(TRI.getRegSizeInBits(*getRegClass(SrcReg)) == + TRI.getRegSizeInBits(*FillRC) && "Mismatched regclass size on folded subreg COPY"); loadRegFromStackSlot(MBB, 
InsertPt, DstReg, FrameIndex, FillRC, &TRI); MachineInstr &LoadMI = *--InsertPt; @@ -3025,7 +3027,7 @@ bool llvm::rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, return false; } -void AArch64InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const { +void AArch64InstrInfo::getNoop(MCInst &NopInst) const { NopInst.setOpcode(AArch64::HINT); NopInst.addOperand(MCOperand::createImm(0)); } diff --git a/lib/Target/AArch64/AArch64InstrInfo.h b/lib/Target/AArch64/AArch64InstrInfo.h index bacce441f6c5..4cd14db633b9 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.h +++ b/lib/Target/AArch64/AArch64InstrInfo.h @@ -205,7 +205,7 @@ public: const DebugLoc &DL, unsigned DstReg, ArrayRef<MachineOperand> Cond, unsigned TrueReg, unsigned FalseReg) const override; - void getNoopForMachoTarget(MCInst &NopInst) const override; + void getNoop(MCInst &NopInst) const override; /// analyzeCompare - For a comparison instruction, return the source registers /// in SrcReg and SrcReg2, and the value it compares against in CmpValue. diff --git a/lib/Target/AArch64/AArch64InstructionSelector.cpp b/lib/Target/AArch64/AArch64InstructionSelector.cpp index 5e01b6cd2b46..b0e0e3eb4ba7 100644 --- a/lib/Target/AArch64/AArch64InstructionSelector.cpp +++ b/lib/Target/AArch64/AArch64InstructionSelector.cpp @@ -41,12 +41,17 @@ using namespace llvm; namespace { +#define GET_GLOBALISEL_PREDICATE_BITSET +#include "AArch64GenGlobalISel.inc" +#undef GET_GLOBALISEL_PREDICATE_BITSET + class AArch64InstructionSelector : public InstructionSelector { public: AArch64InstructionSelector(const AArch64TargetMachine &TM, const AArch64Subtarget &STI, const AArch64RegisterBankInfo &RBI); + void beginFunction(const MachineFunction &MF) override; bool select(MachineInstr &I) const override; private: @@ -62,14 +67,19 @@ private: bool selectCompareBranch(MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const; - bool selectArithImmed(MachineOperand &Root, MachineOperand &Result1, - MachineOperand &Result2) const; + ComplexRendererFn selectArithImmed(MachineOperand &Root) const; const AArch64TargetMachine &TM; const AArch64Subtarget &STI; const AArch64InstrInfo &TII; const AArch64RegisterInfo &TRI; const AArch64RegisterBankInfo &RBI; + bool ForCodeSize; + + PredicateBitset AvailableFeatures; + PredicateBitset + computeAvailableFeatures(const MachineFunction *MF, + const AArch64Subtarget *Subtarget) const; // We declare the temporaries used by selectImpl() in the class to minimize the // cost of constructing placeholder values. 
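The selectArithImmed change in this file swaps the old out-parameter interface for GlobalISel's renderer-closure idiom: a complex-operand matcher now returns nullptr on failure, or a callable that appends the matched operands when the selector later builds the instruction. A self-contained sketch of the idiom, using illustrative stand-in types rather than the LLVM API:

  #include <cstdint>
  #include <functional>
  #include <vector>

  // Stand-in for InstructionSelector::ComplexRendererFn: a nullable
  // callable that knows how to render the matched operands (modeled here
  // as a plain operand vector instead of a MachineInstrBuilder).
  using RendererFn = std::function<void(std::vector<int64_t> &Ops)>;

  // Match a value encodable as a 12-bit unsigned immediate, optionally
  // shifted left by 12 (the AArch64 add/sub immediate form).
  RendererFn matchArithImmed(uint64_t Immed) {
    unsigned ShiftAmt;
    if (Immed >> 12 == 0)
      ShiftAmt = 0;
    else if ((Immed & 0xfff) == 0 && Immed >> 24 == 0) {
      ShiftAmt = 12;
      Immed >>= 12;
    } else
      return nullptr; // no match; the caller tries the next pattern
    // Capture the results by value; the selector invokes the closure
    // later, once it has decided which instruction to build.
    return [=](std::vector<int64_t> &Ops) {
      Ops.push_back(static_cast<int64_t>(Immed));
      Ops.push_back(ShiftAmt);
    };
  }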
@@ -88,7 +98,7 @@ AArch64InstructionSelector::AArch64InstructionSelector( const AArch64TargetMachine &TM, const AArch64Subtarget &STI, const AArch64RegisterBankInfo &RBI) : InstructionSelector(), TM(TM), STI(STI), TII(*STI.getInstrInfo()), - TRI(*STI.getRegisterInfo()), RBI(RBI) + TRI(*STI.getRegisterInfo()), RBI(RBI), ForCodeSize(), AvailableFeatures() #define GET_GLOBALISEL_TEMPORARIES_INIT #include "AArch64GenGlobalISel.inc" #undef GET_GLOBALISEL_TEMPORARIES_INIT @@ -567,6 +577,12 @@ bool AArch64InstructionSelector::selectVaStartDarwin( return true; } +void AArch64InstructionSelector::beginFunction( + const MachineFunction &MF) { + ForCodeSize = MF.getFunction()->optForSize(); + AvailableFeatures = computeAvailableFeatures(&MF, &STI); +} + bool AArch64InstructionSelector::select(MachineInstr &I) const { assert(I.getParent() && "Instruction should be in a basic block!"); assert(I.getParent()->getParent() && "Instruction should be in a function!"); @@ -1312,9 +1328,8 @@ bool AArch64InstructionSelector::select(MachineInstr &I) const { /// SelectArithImmed - Select an immediate value that can be represented as /// a 12-bit value shifted left by either 0 or 12. If so, return true with /// Val set to the 12-bit value and Shift set to the shifter operand. -bool AArch64InstructionSelector::selectArithImmed( - MachineOperand &Root, MachineOperand &Result1, - MachineOperand &Result2) const { +InstructionSelector::ComplexRendererFn +AArch64InstructionSelector::selectArithImmed(MachineOperand &Root) const { MachineInstr &MI = *Root.getParent(); MachineBasicBlock &MBB = *MI.getParent(); MachineFunction &MF = *MBB.getParent(); @@ -1333,13 +1348,13 @@ bool AArch64InstructionSelector::selectArithImmed( else if (Root.isReg()) { MachineInstr *Def = MRI.getVRegDef(Root.getReg()); if (Def->getOpcode() != TargetOpcode::G_CONSTANT) - return false; + return nullptr; MachineOperand &Op1 = Def->getOperand(1); if (!Op1.isCImm() || Op1.getCImm()->getBitWidth() > 64) - return false; + return nullptr; Immed = Op1.getCImm()->getZExtValue(); } else - return false; + return nullptr; unsigned ShiftAmt; @@ -1349,14 +1364,10 @@ bool AArch64InstructionSelector::selectArithImmed( ShiftAmt = 12; Immed = Immed >> 12; } else - return false; + return nullptr; unsigned ShVal = AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt); - Result1.ChangeToImmediate(Immed); - Result1.clearParent(); - Result2.ChangeToImmediate(ShVal); - Result2.clearParent(); - return true; + return [=](MachineInstrBuilder &MIB) { MIB.addImm(Immed).addImm(ShVal); }; } namespace llvm { diff --git a/lib/Target/AArch64/AArch64SchedFalkor.td b/lib/Target/AArch64/AArch64SchedFalkor.td index eec089087fe0..cf1c0b66db58 100644 --- a/lib/Target/AArch64/AArch64SchedFalkor.td +++ b/lib/Target/AArch64/AArch64SchedFalkor.td @@ -79,14 +79,14 @@ def : WriteRes<WriteIM64, [FalkorUnitX]> { let Latency = 5; } def : WriteRes<WriteBr, [FalkorUnitB]> { let Latency = 1; } def : WriteRes<WriteBrReg, [FalkorUnitB]> { let Latency = 1; } def : WriteRes<WriteLD, [FalkorUnitLD]> { let Latency = 3; } -def : WriteRes<WriteST, [FalkorUnitLD, FalkorUnitST, FalkorUnitSD]> - { let Latency = 3; let NumMicroOps = 3; } +def : WriteRes<WriteST, [FalkorUnitST, FalkorUnitSD]> + { let Latency = 0; let NumMicroOps = 2; } def : WriteRes<WriteSTP, [FalkorUnitST, FalkorUnitSD]> { let Latency = 0; let NumMicroOps = 2; } -def : WriteRes<WriteAdr, [FalkorUnitXYZ]> { let Latency = 5; } +def : WriteRes<WriteAdr, [FalkorUnitXYZ]> { let Latency = 1; } def : WriteRes<WriteLDIdx, [FalkorUnitLD]> { let 
Latency = 5; } -def : WriteRes<WriteSTIdx, [FalkorUnitLD, FalkorUnitST, FalkorUnitSD]> - { let Latency = 4; let NumMicroOps = 3; } +def : WriteRes<WriteSTIdx, [FalkorUnitST, FalkorUnitSD]> + { let Latency = 0; let NumMicroOps = 2; } def : WriteRes<WriteF, [FalkorUnitVXVY, FalkorUnitVXVY]> { let Latency = 3; let NumMicroOps = 2; } def : WriteRes<WriteFCmp, [FalkorUnitVXVY]> { let Latency = 2; } diff --git a/lib/Target/AArch64/AArch64SchedFalkorDetails.td b/lib/Target/AArch64/AArch64SchedFalkorDetails.td index 4bd77d344488..8f8eeef8a6cf 100644 --- a/lib/Target/AArch64/AArch64SchedFalkorDetails.td +++ b/lib/Target/AArch64/AArch64SchedFalkorDetails.td @@ -326,6 +326,10 @@ def : InstRW<[FalkorWr_5VXVY_7cyc], (instregex "^TBX(v8i8Four|v16i8Four)$")>; // SIMD Store Instructions // ----------------------------------------------------------------------------- +def : InstRW<[WriteVST], (instregex "^STP(D|S)(i)$")>; +def : InstRW<[WriteVST, WriteAdr], (instregex "^STP(D|S)(post|pre)$")>; +def : InstRW<[FalkorWr_2XYZ_2ST_2VSD_0cyc], (instregex "^STRQro(W|X)$")>; + def : InstRW<[WriteVST], (instregex "^ST1(One(v8b|v4h|v2s|v1d)(_POST)?|(i8|i16|i32|i64)(_POST)?|One(v16b|v8h|v4s|v2d)|Two(v8b|v4h|v2s|v1d))$")>; def : InstRW<[WriteVST], (instregex "^ST2(Two(v8b|v4h|v2s|v1d)|(i8|i16|i32|i64))$")>; def : InstRW<[WriteVST, WriteAdr], (instregex "^ST1(One(v16b|v8h|v4s|v2d)|Two(v8b|v4h|v2s|v1d))_POST$")>; @@ -421,6 +425,7 @@ def : InstRW<[FalkorWr_1VX_1VY_2cyc], (instregex "^FSQRT(H|S|D)r$")>; def : InstRW<[FalkorWr_1VXVY_5cyc, FalkorReadFMA],(instregex "^F(N)?M(ADD|SUB)(H|S)rrr$")>; def : InstRW<[FalkorWr_1VXVY_6cyc, FalkorReadFMA],(instregex "^F(N)?M(ADD|SUB)Drrr$")>; + // FP Miscellaneous Instructions // ----------------------------------------------------------------------------- def : InstRW<[FalkorWr_FMOV], (instregex "^FMOV(HW|HX|SW|DX|DXHigh)r$")>; @@ -433,7 +438,6 @@ def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^(S|U)CVTF(v1i16|v1i32|v2i32|v def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^(S|U)CVTF(v2i64|v4i32|v8i16|v2f64|v4f32|v8f16)(_shift)?")>; - // Load Instructions // ----------------------------------------------------------------------------- def : InstRW<[FalkorWr_1ST_0cyc], (instrs PRFMui, PRFMl)>; @@ -461,6 +465,7 @@ def : InstRW<[FalkorWr_1LD_4cyc, WriteAdr],(instregex "^LDRS(BW|BX|HW|HX|W)(post def : InstRW<[WriteLD, WriteLDHi, WriteAdr],(instregex "^LDP(W|X)(post|pre)$")>; def : InstRW<[FalkorWr_1LD_4cyc, WriteLDHi],(instrs LDPSWi)>; def : InstRW<[FalkorWr_1LD_4cyc, WriteLDHi, WriteAdr],(instregex "^LDPSW(post|pre)$")>; + // Miscellaneous Data-Processing Instructions // ----------------------------------------------------------------------------- def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^(S|U)?BFM(W|X)ri$")>; @@ -502,28 +507,30 @@ def : InstRW<[FalkorWr_1LD_1Z_3cyc], (instrs DRPS)>; def : InstRW<[FalkorWr_1SD_1ST_0cyc], (instrs MSR)>; def : InstRW<[WriteVST], (instrs STNPDi, STNPSi)>; -def : InstRW<[WriteSTP], (instrs STNPWi, STNPXi)>; +def : InstRW<[WriteSTP], (instrs STNPWi, STNPXi)>; def : InstRW<[FalkorWr_2LD_1Z_3cyc], (instrs ERET)>; -def : InstRW<[WriteST], (instregex "^LDC.*$")>; -def : InstRW<[WriteST], (instregex "^STLR(B|H|W|X)$")>; -def : InstRW<[WriteST], (instregex "^STXP(W|X)$")>; -def : InstRW<[WriteST], (instregex "^STXR(B|H|W|X)$")>; +def : InstRW<[FalkorWr_1ST_1SD_1LD_3cyc], (instregex "^LDC.*$")>; +def : InstRW<[FalkorWr_1ST_1SD_1LD_0cyc], (instregex "^STLR(B|H|W|X)$")>; +def : InstRW<[FalkorWr_1ST_1SD_1LD_0cyc], (instregex "^STXP(W|X)$")>; +def : 
InstRW<[FalkorWr_1ST_1SD_1LD_0cyc], (instregex "^STXR(B|H|W|X)$")>; -def : InstRW<[WriteSTX], (instregex "^STLXP(W|X)$")>; -def : InstRW<[WriteSTX], (instregex "^STLXR(B|H|W|X)$")>; +def : InstRW<[FalkorWr_2LD_1ST_1SD_3cyc], (instregex "^STLXP(W|X)$")>; +def : InstRW<[FalkorWr_2LD_1ST_1SD_3cyc], (instregex "^STLXR(B|H|W|X)$")>; def : InstRW<[WriteVST, WriteVST], (instrs STNPQi)>; // Store Instructions // ----------------------------------------------------------------------------- -def : InstRW<[WriteVST], (instregex "^STP(D|S)(i|post|pre)$")>; -def : InstRW<[WriteST], (instregex "^STP(W|X)(i|post|pre)$")>; +def : InstRW<[WriteST], (instregex "^STP(W|X)i$")>; +def : InstRW<[WriteST, WriteAdr], (instregex "^STP(W|X)(post|pre)$")>; def : InstRW<[WriteST], (instregex "^STR(Q|D|S|BB|HH)ui$")>; def : InstRW<[WriteST], (instregex "^STUR(Q|D|S|BB|HH)i$")>; -def : InstRW<[WriteST], (instregex "^STR(B|H|W|X)(post|pre|ui)$")>; +def : InstRW<[WriteST], (instregex "^STR(B|H|W|X)ui$")>; +def : InstRW<[WriteST, WriteAdr], (instregex "^STR(B|H|W|X)(post|pre)$")>; def : InstRW<[WriteST], (instregex "^STTR(B|H|W|X)i$")>; def : InstRW<[WriteST], (instregex "^STUR(B|H|W|X)i$")>; def : InstRW<[WriteST, WriteAdr], (instregex "^STR(B|H|W|X)ro(W|X)$")>; -def : InstRW<[WriteVST, WriteVST], (instregex "^STPQ(i|post|pre)$")>; +def : InstRW<[WriteVST, WriteVST], (instregex "^STPQi$")>; +def : InstRW<[WriteVST, WriteVST, WriteAdr], (instregex "^STPQ(post|pre)$")>; diff --git a/lib/Target/AArch64/AArch64SchedFalkorWriteRes.td b/lib/Target/AArch64/AArch64SchedFalkorWriteRes.td index 9cdb4be4246b..e64b2c441a19 100644 --- a/lib/Target/AArch64/AArch64SchedFalkorWriteRes.td +++ b/lib/Target/AArch64/AArch64SchedFalkorWriteRes.td @@ -28,7 +28,6 @@ //===----------------------------------------------------------------------===// // Define 1 micro-op types - def FalkorWr_1X_2cyc : SchedWriteRes<[FalkorUnitX]> { let Latency = 2; } def FalkorWr_1X_4cyc : SchedWriteRes<[FalkorUnitX]> { let Latency = 4; } def FalkorWr_1X_5cyc : SchedWriteRes<[FalkorUnitX]> { let Latency = 5; } @@ -175,18 +174,33 @@ def FalkorWr_1SD_1ST_0cyc: SchedWriteRes<[FalkorUnitSD, FalkorUnitST]> { //===----------------------------------------------------------------------===// // Define 3 micro-op types +def FalkorWr_1ST_1SD_1LD_0cyc : SchedWriteRes<[FalkorUnitST, FalkorUnitSD, + FalkorUnitLD]> { + let Latency = 0; + let NumMicroOps = 3; +} + +def FalkorWr_1ST_1SD_1LD_3cyc : SchedWriteRes<[FalkorUnitST, FalkorUnitSD, + FalkorUnitLD]> { + let Latency = 3; + let NumMicroOps = 3; +} + def FalkorWr_3VXVY_3cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { let Latency = 3; let NumMicroOps = 3; } + def FalkorWr_3VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { let Latency = 4; let NumMicroOps = 3; } + def FalkorWr_3VXVY_5cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { let Latency = 5; let NumMicroOps = 3; } + def FalkorWr_3VXVY_6cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { let Latency = 6; let NumMicroOps = 3; @@ -196,10 +210,12 @@ def FalkorWr_1LD_2VXVY_4cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitVXVY]> { let Latency = 4; let NumMicroOps = 3; } + def FalkorWr_2LD_1none_3cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitLD]> { let Latency = 3; let NumMicroOps = 3; } + def FalkorWr_3LD_3cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitLD, FalkorUnitLD]> { let Latency = 3; @@ -259,6 +275,12 @@ def FalkorWr_2LD_2none_3cyc: SchedWriteRes<[FalkorUnitLD, FalkorUnitLD]> { let NumMicroOps = 4; } +def FalkorWr_2LD_1ST_1SD_3cyc: 
SchedWriteRes<[FalkorUnitLD, FalkorUnitST, + FalkorUnitSD, FalkorUnitLD]> { + let Latency = 3; + let NumMicroOps = 4; +} + //===----------------------------------------------------------------------===// // Define 5 micro-op types @@ -289,6 +311,13 @@ def FalkorWr_2LD_2VXVY_2none_4cyc: SchedWriteRes<[FalkorUnitLD, FalkorUnitLD, let NumMicroOps = 6; } +def FalkorWr_2XYZ_2ST_2VSD_0cyc: SchedWriteRes<[FalkorUnitXYZ, FalkorUnitST, + FalkorUnitVSD, FalkorUnitXYZ, + FalkorUnitST, FalkorUnitVSD]> { + let Latency = 0; + let NumMicroOps = 6; +} + //===----------------------------------------------------------------------===// // Define 8 micro-op types diff --git a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index d7bbc2bcd22c..4dbcc9581a84 100644 --- a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -2473,16 +2473,14 @@ AArch64AsmParser::tryParseBarrierOperand(OperandVector &Operands) { return MatchOperand_ParseFail; } - auto DB = AArch64DB::lookupDBByName(Tok.getString()); - if (!DB) { - TokError("invalid barrier option name"); - return MatchOperand_ParseFail; - } - // The only valid named option for ISB is 'sy' - if (Mnemonic == "isb" && DB->Encoding != AArch64DB::sy) { + auto DB = AArch64DB::lookupDBByName(Tok.getString()); + if (Mnemonic == "isb" && (!DB || DB->Encoding != AArch64DB::sy)) { TokError("'sy' or #imm operand expected"); return MatchOperand_ParseFail; + } else if (!DB) { + TokError("invalid barrier option name"); + return MatchOperand_ParseFail; } Operands.push_back(AArch64Operand::CreateBarrier( diff --git a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp index 41ae70f85e58..fc89657bffd3 100644 --- a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp +++ b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp @@ -17,6 +17,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" +#include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCRegisterInfo.h" @@ -275,6 +276,12 @@ void AArch64InstPrinter::printInst(const MCInst *MI, raw_ostream &O, } } + if (Opcode == AArch64::CompilerBarrier) { + O << '\t' << MAI.getCommentString() << " COMPILER BARRIER"; + printAnnotation(O, Annot); + return; + } + if (!printAliasInstr(MI, STI, O)) printInstruction(MI, STI, O); diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp index 62dfa59483eb..33698d2b8c38 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp @@ -565,6 +565,9 @@ void AArch64MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, MCFixupKind Fixup = MCFixupKind(AArch64::fixup_aarch64_tlsdesc_call); Fixups.push_back(MCFixup::create(0, MI.getOperand(0).getExpr(), Fixup)); return; + } else if (MI.getOpcode() == AArch64::CompilerBarrier) { + // This just prevents the compiler from reordering accesses, no actual code. 
+ return; } uint64_t Binary = getBinaryCodeForInstr(MI, Fixups, STI); diff --git a/lib/Target/AMDGPU/AMDGPU.td b/lib/Target/AMDGPU/AMDGPU.td index 2c7a2d8962d0..0f331486d0f8 100644 --- a/lib/Target/AMDGPU/AMDGPU.td +++ b/lib/Target/AMDGPU/AMDGPU.td @@ -406,7 +406,8 @@ def FeatureGFX9 : SubtargetFeatureGeneration<"GFX9", FeatureWavefrontSize64, FeatureFlatAddressSpace, FeatureGCN, FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts, FeatureSMemRealTime, FeatureScalarStores, FeatureInv2PiInlineImm, - FeatureApertureRegs, FeatureGFX9Insts, FeatureVOP3P, FeatureVGPRIndexMode + FeatureApertureRegs, FeatureGFX9Insts, FeatureVOP3P, FeatureVGPRIndexMode, + FeatureFastFMAF32 ] >; diff --git a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 318de7f2e3d2..f5110857da84 100644 --- a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -116,8 +116,11 @@ private: bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, SDValue &VAddr, SDValue &SOffset, SDValue &Offset, SDValue &SLC) const; - bool SelectMUBUFScratch(SDValue Addr, SDValue &RSrc, SDValue &VAddr, - SDValue &SOffset, SDValue &ImmOffset) const; + bool SelectMUBUFScratchOffen(SDValue Addr, SDValue &RSrc, SDValue &VAddr, + SDValue &SOffset, SDValue &ImmOffset) const; + bool SelectMUBUFScratchOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset, + SDValue &Offset) const; + bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &SOffset, SDValue &Offset, SDValue &GLC, SDValue &SLC, SDValue &TFE) const; @@ -150,14 +153,12 @@ private: bool SelectVOP3Mods_NNaN(SDValue In, SDValue &Src, SDValue &SrcMods) const; bool SelectVOP3Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const; - bool SelectVOP3NoMods(SDValue In, SDValue &Src, SDValue &SrcMods) const; + bool SelectVOP3NoMods(SDValue In, SDValue &Src) const; bool SelectVOP3Mods0(SDValue In, SDValue &Src, SDValue &SrcMods, SDValue &Clamp, SDValue &Omod) const; bool SelectVOP3NoMods0(SDValue In, SDValue &Src, SDValue &SrcMods, SDValue &Clamp, SDValue &Omod) const; - bool SelectVOP3Mods0Clamp(SDValue In, SDValue &Src, SDValue &SrcMods, - SDValue &Omod) const; bool SelectVOP3Mods0Clamp0OMod(SDValue In, SDValue &Src, SDValue &SrcMods, SDValue &Clamp, SDValue &Omod) const; @@ -953,8 +954,12 @@ bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base, return true; } +static bool isLegalMUBUFImmOffset(unsigned Imm) { + return isUInt<12>(Imm); +} + static bool isLegalMUBUFImmOffset(const ConstantSDNode *Imm) { - return isUInt<12>(Imm->getZExtValue()); + return isLegalMUBUFImmOffset(Imm->getZExtValue()); } bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, @@ -1076,9 +1081,9 @@ SDValue AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const { return N; } -bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc, - SDValue &VAddr, SDValue &SOffset, - SDValue &ImmOffset) const { +bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDValue Addr, SDValue &Rsrc, + SDValue &VAddr, SDValue &SOffset, + SDValue &ImmOffset) const { SDLoc DL(Addr); MachineFunction &MF = CurDAG->getMachineFunction(); @@ -1087,8 +1092,22 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc, Rsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32); SOffset = CurDAG->getRegister(Info->getScratchWaveOffsetReg(), MVT::i32); - // (add n0, c1) + if (ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) { + unsigned Imm = CAddr->getZExtValue(); + assert(!isLegalMUBUFImmOffset(Imm) && + 
"should have been selected by other pattern"); + + SDValue HighBits = CurDAG->getTargetConstant(Imm & ~4095, DL, MVT::i32); + MachineSDNode *MovHighBits = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, + DL, MVT::i32, HighBits); + VAddr = SDValue(MovHighBits, 0); + ImmOffset = CurDAG->getTargetConstant(Imm & 4095, DL, MVT::i16); + return true; + } + if (CurDAG->isBaseWithConstantOffset(Addr)) { + // (add n0, c1) + SDValue N0 = Addr.getOperand(0); SDValue N1 = Addr.getOperand(1); @@ -1107,6 +1126,24 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc, return true; } +bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDValue Addr, + SDValue &SRsrc, + SDValue &SOffset, + SDValue &Offset) const { + ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr); + if (!CAddr || !isLegalMUBUFImmOffset(CAddr)) + return false; + + SDLoc DL(Addr); + MachineFunction &MF = CurDAG->getMachineFunction(); + const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); + + SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32); + SOffset = CurDAG->getRegister(Info->getScratchWaveOffsetReg(), MVT::i32); + Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16); + return true; +} + bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &SOffset, SDValue &Offset, SDValue &GLC, SDValue &SLC, @@ -1628,38 +1665,20 @@ bool AMDGPUDAGToDAGISel::SelectVOP3Mods_NNaN(SDValue In, SDValue &Src, return isNoNanSrc(Src); } -bool AMDGPUDAGToDAGISel::SelectVOP3NoMods(SDValue In, SDValue &Src, - SDValue &SrcMods) const { - bool Res = SelectVOP3Mods(In, Src, SrcMods); - return Res && cast<ConstantSDNode>(SrcMods)->isNullValue(); +bool AMDGPUDAGToDAGISel::SelectVOP3NoMods(SDValue In, SDValue &Src) const { + if (In.getOpcode() == ISD::FABS || In.getOpcode() == ISD::FNEG) + return false; + + Src = In; + return true; } bool AMDGPUDAGToDAGISel::SelectVOP3Mods0(SDValue In, SDValue &Src, SDValue &SrcMods, SDValue &Clamp, SDValue &Omod) const { SDLoc DL(In); - // FIXME: Handle Clamp and Omod - Clamp = CurDAG->getTargetConstant(0, DL, MVT::i32); - Omod = CurDAG->getTargetConstant(0, DL, MVT::i32); - - return SelectVOP3Mods(In, Src, SrcMods); -} - -bool AMDGPUDAGToDAGISel::SelectVOP3NoMods0(SDValue In, SDValue &Src, - SDValue &SrcMods, SDValue &Clamp, - SDValue &Omod) const { - bool Res = SelectVOP3Mods0(In, Src, SrcMods, Clamp, Omod); - - return Res && cast<ConstantSDNode>(SrcMods)->isNullValue() && - cast<ConstantSDNode>(Clamp)->isNullValue() && - cast<ConstantSDNode>(Omod)->isNullValue(); -} - -bool AMDGPUDAGToDAGISel::SelectVOP3Mods0Clamp(SDValue In, SDValue &Src, - SDValue &SrcMods, - SDValue &Omod) const { - // FIXME: Handle Omod - Omod = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32); + Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1); + Omod = CurDAG->getTargetConstant(0, DL, MVT::i1); return SelectVOP3Mods(In, Src, SrcMods); } @@ -1677,9 +1696,8 @@ bool AMDGPUDAGToDAGISel::SelectVOP3OMods(SDValue In, SDValue &Src, Src = In; SDLoc DL(In); - // FIXME: Handle Clamp and Omod - Clamp = CurDAG->getTargetConstant(0, DL, MVT::i32); - Omod = CurDAG->getTargetConstant(0, DL, MVT::i32); + Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1); + Omod = CurDAG->getTargetConstant(0, DL, MVT::i1); return true; } diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index c0f336e082bd..e21775e61dd4 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -2315,12 +2315,13 @@ 
static bool simplifyI24(SDNode *Node24, unsigned OpIdx, SelectionDAG &DAG = DCI.DAG; SDValue Op = Node24->getOperand(OpIdx); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); EVT VT = Op.getValueType(); APInt Demanded = APInt::getLowBitsSet(VT.getSizeInBits(), 24); APInt KnownZero, KnownOne; TargetLowering::TargetLoweringOpt TLO(DAG, true, true); - if (TLO.SimplifyDemandedBits(Node24, OpIdx, Demanded, DCI)) + if (TLI.SimplifyDemandedBits(Node24, OpIdx, Demanded, DCI, TLO)) return true; return false; @@ -3361,7 +3362,7 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), !DCI.isBeforeLegalizeOps()); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - if (TLO.ShrinkDemandedConstant(BitsFrom, Demanded) || + if (TLI.ShrinkDemandedConstant(BitsFrom, Demanded, TLO) || TLI.SimplifyDemandedBits(BitsFrom, Demanded, KnownZero, KnownOne, TLO)) { DCI.CommitTargetLoweringOpt(TLO); @@ -3436,6 +3437,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(ELSE) NODE_NAME_CASE(LOOP) NODE_NAME_CASE(CALL) + NODE_NAME_CASE(TRAP) NODE_NAME_CASE(RET_FLAG) NODE_NAME_CASE(RETURN_TO_EPILOG) NODE_NAME_CASE(ENDPGM) diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h index d6aa0ba92bf7..13cbfe267932 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -231,6 +231,10 @@ public: AMDGPUAS getAMDGPUAS() const { return AMDGPUASI; } + + MVT getFenceOperandTy(const DataLayout &DL) const override { + return MVT::i32; + } }; namespace AMDGPUISD { @@ -244,6 +248,7 @@ enum NodeType : unsigned { // Function call. CALL, + TRAP, // Masked control flow nodes. IF, diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/lib/Target/AMDGPU/AMDGPUInstrInfo.td index 56f060984f08..c1706d12a2ea 100644 --- a/lib/Target/AMDGPU/AMDGPUInstrInfo.td +++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.td @@ -78,6 +78,11 @@ def AMDGPUif : SDNode<"AMDGPUISD::IF", AMDGPUIfOp, [SDNPHasChain]>; def AMDGPUelse : SDNode<"AMDGPUISD::ELSE", AMDGPUElseOp, [SDNPHasChain]>; def AMDGPUloop : SDNode<"AMDGPUISD::LOOP", AMDGPULoopOp, [SDNPHasChain]>; +def AMDGPUtrap : SDNode<"AMDGPUISD::TRAP", + SDTypeProfile<0, -1, [SDTCisVT<0, i16>]>, + [SDNPHasChain, SDNPVariadic, SDNPSideEffect, SDNPInGlue] +>; + def AMDGPUconstdata_ptr : SDNode< "AMDGPUISD::CONST_DATA_PTR", SDTypeProfile <1, 1, [SDTCisVT<0, iPTR>, SDTCisVT<0, iPTR>]> diff --git a/lib/Target/AMDGPU/AMDGPUInstructions.td b/lib/Target/AMDGPU/AMDGPUInstructions.td index b8d681298dee..4e688ab0b105 100644 --- a/lib/Target/AMDGPU/AMDGPUInstructions.td +++ b/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -50,6 +50,16 @@ def UnsafeFPMath : Predicate<"TM.Options.UnsafeFPMath">; def InstFlag : OperandWithDefaultOps <i32, (ops (i32 0))>; def ADDRIndirect : ComplexPattern<iPTR, 2, "SelectADDRIndirect", [], []>; +def u16ImmTarget : AsmOperandClass { + let Name = "U16Imm"; + let RenderMethod = "addImmOperands"; +} + +def s16ImmTarget : AsmOperandClass { + let Name = "S16Imm"; + let RenderMethod = "addImmOperands"; +} + let OperandType = "OPERAND_IMMEDIATE" in { def u32imm : Operand<i32> { @@ -58,6 +68,12 @@ def u32imm : Operand<i32> { def u16imm : Operand<i16> { let PrintMethod = "printU16ImmOperand"; + let ParserMatchClass = u16ImmTarget; +} + +def s16imm : Operand<i16> { + let PrintMethod = "printU16ImmOperand"; + let ParserMatchClass = s16ImmTarget; } def u8imm : Operand<i8> { diff --git 
a/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp index 14ee1c81f8fa..da247fea7de6 100644 --- a/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp +++ b/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp @@ -225,6 +225,12 @@ void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) { return; } + if (MI->getOpcode() == AMDGPU::SI_MASKED_UNREACHABLE) { + if (isVerbose()) + OutStreamer->emitRawComment(" divergent unreachable"); + return; + } + MCInst TmpInst; MCInstLowering.lower(MI, TmpInst); EmitToStreamer(*OutStreamer, TmpInst); diff --git a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 961f7186f373..70c848f3c7bd 100644 --- a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -479,6 +479,8 @@ public: bool isSMRDLiteralOffset() const; bool isDPPCtrl() const; bool isGPRIdxMode() const; + bool isS16Imm() const; + bool isU16Imm() const; StringRef getExpressionAsToken() const { assert(isExpr()); @@ -2836,6 +2838,28 @@ void AMDGPUAsmParser::cvtExp(MCInst &Inst, const OperandVector &Operands) { // s_waitcnt //===----------------------------------------------------------------------===// +static bool +encodeCnt( + const AMDGPU::IsaInfo::IsaVersion ISA, + int64_t &IntVal, + int64_t CntVal, + bool Saturate, + unsigned (*encode)(const IsaInfo::IsaVersion &Version, unsigned, unsigned), + unsigned (*decode)(const IsaInfo::IsaVersion &Version, unsigned)) +{ + bool Failed = false; + + IntVal = encode(ISA, IntVal, CntVal); + if (CntVal != decode(ISA, IntVal)) { + if (Saturate) { + IntVal = encode(ISA, IntVal, -1); + } else { + Failed = true; + } + } + return Failed; +} + bool AMDGPUAsmParser::parseCnt(int64_t &IntVal) { StringRef CntName = Parser.getTok().getString(); int64_t CntVal; @@ -2851,25 +2875,35 @@ bool AMDGPUAsmParser::parseCnt(int64_t &IntVal) { if (getParser().parseAbsoluteExpression(CntVal)) return true; - if (getLexer().isNot(AsmToken::RParen)) - return true; - - Parser.Lex(); - if (getLexer().is(AsmToken::Amp) || getLexer().is(AsmToken::Comma)) - Parser.Lex(); - AMDGPU::IsaInfo::IsaVersion ISA = AMDGPU::IsaInfo::getIsaVersion(getFeatureBits()); - if (CntName == "vmcnt") - IntVal = encodeVmcnt(ISA, IntVal, CntVal); - else if (CntName == "expcnt") - IntVal = encodeExpcnt(ISA, IntVal, CntVal); - else if (CntName == "lgkmcnt") - IntVal = encodeLgkmcnt(ISA, IntVal, CntVal); - else - return true; - return false; + bool Failed = true; + bool Sat = CntName.endswith("_sat"); + + if (CntName == "vmcnt" || CntName == "vmcnt_sat") { + Failed = encodeCnt(ISA, IntVal, CntVal, Sat, encodeVmcnt, decodeVmcnt); + } else if (CntName == "expcnt" || CntName == "expcnt_sat") { + Failed = encodeCnt(ISA, IntVal, CntVal, Sat, encodeExpcnt, decodeExpcnt); + } else if (CntName == "lgkmcnt" || CntName == "lgkmcnt_sat") { + Failed = encodeCnt(ISA, IntVal, CntVal, Sat, encodeLgkmcnt, decodeLgkmcnt); + } + + // To improve diagnostics, do not skip delimiters on errors + if (!Failed) { + if (getLexer().isNot(AsmToken::RParen)) { + return true; + } + Parser.Lex(); + if (getLexer().is(AsmToken::Amp) || getLexer().is(AsmToken::Comma)) { + const AsmToken NextToken = getLexer().peekTok(); + if (NextToken.is(AsmToken::Identifier)) { + Parser.Lex(); + } + } + } + + return Failed; } OperandMatchResultTy @@ -3858,6 +3892,14 @@ bool AMDGPUOperand::isGPRIdxMode() const { return isImm() && isUInt<4>(getImm()); } +bool AMDGPUOperand::isS16Imm() const { + return isImm() && (isInt<16>(getImm()) || 
isUInt<16>(getImm())); +} + +bool AMDGPUOperand::isU16Imm() const { + return isImm() && isUInt<16>(getImm()); +} + OperandMatchResultTy AMDGPUAsmParser::parseDPPCtrl(OperandVector &Operands) { SMLoc S = Parser.getTok().getLoc(); diff --git a/lib/Target/AMDGPU/BUFInstructions.td b/lib/Target/AMDGPU/BUFInstructions.td index a6609f0725ab..89eddb9ce961 100644 --- a/lib/Target/AMDGPU/BUFInstructions.td +++ b/lib/Target/AMDGPU/BUFInstructions.td @@ -11,7 +11,9 @@ def MUBUFAddr32 : ComplexPattern<i64, 9, "SelectMUBUFAddr32">; def MUBUFAddr64 : ComplexPattern<i64, 7, "SelectMUBUFAddr64">; def MUBUFAddr64Atomic : ComplexPattern<i64, 5, "SelectMUBUFAddr64">; -def MUBUFScratch : ComplexPattern<i64, 4, "SelectMUBUFScratch">; +def MUBUFScratchOffen : ComplexPattern<i64, 4, "SelectMUBUFScratchOffen">; +def MUBUFScratchOffset : ComplexPattern<i64, 3, "SelectMUBUFScratchOffset", [], [], 20>; + def MUBUFOffset : ComplexPattern<i64, 6, "SelectMUBUFOffset">; def MUBUFOffsetNoGLC : ComplexPattern<i64, 3, "SelectMUBUFOffset">; def MUBUFOffsetAtomic : ComplexPattern<i64, 4, "SelectMUBUFOffset">; @@ -958,21 +960,30 @@ defm : MUBUFLoad_Pattern <BUFFER_LOAD_UBYTE_OFFSET, i16, mubuf_az_extloadi8>; } // End Predicates = [Has16BitInsts] -class MUBUFScratchLoadPat <MUBUF_Pseudo Instr, ValueType vt, PatFrag ld> : Pat < - (vt (ld (MUBUFScratch v4i32:$srsrc, i32:$vaddr, - i32:$soffset, u16imm:$offset))), - (Instr $vaddr, $srsrc, $soffset, $offset, 0, 0, 0) ->; +multiclass MUBUFScratchLoadPat <MUBUF_Pseudo InstrOffen, + MUBUF_Pseudo InstrOffset, + ValueType vt, PatFrag ld> { + def : Pat < + (vt (ld (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr, + i32:$soffset, u16imm:$offset))), + (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0) + >; + + def : Pat < + (vt (ld (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset))), + (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0) + >; +} -def : MUBUFScratchLoadPat <BUFFER_LOAD_SBYTE_OFFEN, i32, sextloadi8_private>; -def : MUBUFScratchLoadPat <BUFFER_LOAD_UBYTE_OFFEN, i32, extloadi8_private>; -def : MUBUFScratchLoadPat <BUFFER_LOAD_SBYTE_OFFEN, i16, sextloadi8_private>; -def : MUBUFScratchLoadPat <BUFFER_LOAD_UBYTE_OFFEN, i16, extloadi8_private>; -def : MUBUFScratchLoadPat <BUFFER_LOAD_SSHORT_OFFEN, i32, sextloadi16_private>; -def : MUBUFScratchLoadPat <BUFFER_LOAD_USHORT_OFFEN, i32, extloadi16_private>; -def : MUBUFScratchLoadPat <BUFFER_LOAD_DWORD_OFFEN, i32, load_private>; -def : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX2_OFFEN, v2i32, load_private>; -def : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX4_OFFEN, v4i32, load_private>; +defm : MUBUFScratchLoadPat <BUFFER_LOAD_SBYTE_OFFEN, BUFFER_LOAD_SBYTE_OFFSET, i32, sextloadi8_private>; +defm : MUBUFScratchLoadPat <BUFFER_LOAD_UBYTE_OFFEN, BUFFER_LOAD_UBYTE_OFFSET, i32, extloadi8_private>; +defm : MUBUFScratchLoadPat <BUFFER_LOAD_SBYTE_OFFEN, BUFFER_LOAD_SBYTE_OFFSET, i16, sextloadi8_private>; +defm : MUBUFScratchLoadPat <BUFFER_LOAD_UBYTE_OFFEN, BUFFER_LOAD_UBYTE_OFFSET, i16, extloadi8_private>; +defm : MUBUFScratchLoadPat <BUFFER_LOAD_SSHORT_OFFEN, BUFFER_LOAD_SSHORT_OFFSET, i32, sextloadi16_private>; +defm : MUBUFScratchLoadPat <BUFFER_LOAD_USHORT_OFFEN, BUFFER_LOAD_USHORT_OFFSET, i32, extloadi16_private>; +defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORD_OFFEN, BUFFER_LOAD_DWORD_OFFSET, i32, load_private>; +defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX2_OFFEN, BUFFER_LOAD_DWORDX2_OFFSET, v2i32, load_private>; +defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX4_OFFEN, BUFFER_LOAD_DWORDX4_OFFSET, v4i32, load_private>; // 
BUFFER_LOAD_DWORD*, addr64=0 multiclass MUBUF_Load_Dword <ValueType vt, @@ -1054,19 +1065,29 @@ multiclass MUBUFStore_Pattern <MUBUF_Pseudo Instr_OFFSET, ValueType vt, defm : MUBUFStore_Pattern <BUFFER_STORE_BYTE_OFFSET, i16, truncstorei8_global>; defm : MUBUFStore_Pattern <BUFFER_STORE_SHORT_OFFSET, i16, global_store>; -class MUBUFScratchStorePat <MUBUF_Pseudo Instr, ValueType vt, PatFrag st> : Pat < - (st vt:$value, (MUBUFScratch v4i32:$srsrc, i32:$vaddr, i32:$soffset, - u16imm:$offset)), - (Instr $value, $vaddr, $srsrc, $soffset, $offset, 0, 0, 0) ->; +multiclass MUBUFScratchStorePat <MUBUF_Pseudo InstrOffen, + MUBUF_Pseudo InstrOffset, + ValueType vt, PatFrag st> { + def : Pat < + (st vt:$value, (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr, + i32:$soffset, u16imm:$offset)), + (InstrOffen $value, $vaddr, $srsrc, $soffset, $offset, 0, 0, 0) + >; + + def : Pat < + (st vt:$value, (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, + u16imm:$offset)), + (InstrOffset $value, $srsrc, $soffset, $offset, 0, 0, 0) + >; +} -def : MUBUFScratchStorePat <BUFFER_STORE_BYTE_OFFEN, i32, truncstorei8_private>; -def : MUBUFScratchStorePat <BUFFER_STORE_SHORT_OFFEN, i32, truncstorei16_private>; -def : MUBUFScratchStorePat <BUFFER_STORE_BYTE_OFFEN, i16, truncstorei8_private>; -def : MUBUFScratchStorePat <BUFFER_STORE_SHORT_OFFEN, i16, store_private>; -def : MUBUFScratchStorePat <BUFFER_STORE_DWORD_OFFEN, i32, store_private>; -def : MUBUFScratchStorePat <BUFFER_STORE_DWORDX2_OFFEN, v2i32, store_private>; -def : MUBUFScratchStorePat <BUFFER_STORE_DWORDX4_OFFEN, v4i32, store_private>; +defm : MUBUFScratchStorePat <BUFFER_STORE_BYTE_OFFEN, BUFFER_STORE_BYTE_OFFSET, i32, truncstorei8_private>; +defm : MUBUFScratchStorePat <BUFFER_STORE_SHORT_OFFEN, BUFFER_STORE_SHORT_OFFSET, i32, truncstorei16_private>; +defm : MUBUFScratchStorePat <BUFFER_STORE_BYTE_OFFEN, BUFFER_STORE_BYTE_OFFSET, i16, truncstorei8_private>; +defm : MUBUFScratchStorePat <BUFFER_STORE_SHORT_OFFEN, BUFFER_STORE_SHORT_OFFSET, i16, store_private>; +defm : MUBUFScratchStorePat <BUFFER_STORE_DWORD_OFFEN, BUFFER_STORE_DWORD_OFFSET, i32, store_private>; +defm : MUBUFScratchStorePat <BUFFER_STORE_DWORDX2_OFFEN, BUFFER_STORE_DWORDX2_OFFSET, v2i32, store_private>; +defm : MUBUFScratchStorePat <BUFFER_STORE_DWORDX4_OFFEN, BUFFER_STORE_DWORDX4_OFFSET, v4i32, store_private>; //===----------------------------------------------------------------------===// // MTBUF Patterns diff --git a/lib/Target/AMDGPU/GCNRegPressure.cpp b/lib/Target/AMDGPU/GCNRegPressure.cpp index 4ecfa118fb27..bf16a8216001 100644 --- a/lib/Target/AMDGPU/GCNRegPressure.cpp +++ b/lib/Target/AMDGPU/GCNRegPressure.cpp @@ -83,8 +83,8 @@ unsigned GCNRegPressure::getRegKind(unsigned Reg, const auto RC = MRI.getRegClass(Reg); auto STI = static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo()); return STI->isSGPRClass(RC) ? - (RC->getSize() == 4 ? SGPR32 : SGPR_TUPLE) : - (RC->getSize() == 4 ? VGPR32 : VGPR_TUPLE); + (STI->getRegSizeInBits(*RC) == 32 ? SGPR32 : SGPR_TUPLE) : + (STI->getRegSizeInBits(*RC) == 32 ? 
VGPR32 : VGPR_TUPLE); } void GCNRegPressure::inc(unsigned Reg, diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUCodeObjectMetadataStreamer.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUCodeObjectMetadataStreamer.cpp index 29a6ab9fbe93..647017d5061d 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUCodeObjectMetadataStreamer.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUCodeObjectMetadataStreamer.cpp @@ -286,20 +286,20 @@ ValueKind MetadataStreamer::getValueKind(Type *Ty, StringRef TypeQual, return ValueKind::Pipe; return StringSwitch<ValueKind>(BaseTypeName) + .Case("image1d_t", ValueKind::Image) + .Case("image1d_array_t", ValueKind::Image) + .Case("image1d_buffer_t", ValueKind::Image) + .Case("image2d_t", ValueKind::Image) + .Case("image2d_array_t", ValueKind::Image) + .Case("image2d_array_depth_t", ValueKind::Image) + .Case("image2d_array_msaa_t", ValueKind::Image) + .Case("image2d_array_msaa_depth_t", ValueKind::Image) + .Case("image2d_depth_t", ValueKind::Image) + .Case("image2d_msaa_t", ValueKind::Image) + .Case("image2d_msaa_depth_t", ValueKind::Image) + .Case("image3d_t", ValueKind::Image) .Case("sampler_t", ValueKind::Sampler) .Case("queue_t", ValueKind::Queue) - .Cases("image1d_t", - "image1d_array_t", - "image1d_buffer_t", - "image2d_t" , - "image2d_array_t", - "image2d_array_depth_t", - "image2d_array_msaa_t" - "image2d_array_msaa_depth_t" - "image2d_depth_t", - "image2d_msaa_t", - "image2d_msaa_depth_t", - "image3d_t", ValueKind::Image) .Default(isa<PointerType>(Ty) ? (Ty->getPointerAddressSpace() == AMDGPUASI.LOCAL_ADDRESS ? diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp index 6c61fb1f2d6b..2364e7b7b5fb 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp @@ -15,6 +15,7 @@ using namespace llvm; AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(const Triple &TT) : MCAsmInfoELF() { CodePointerSize = (TT.getArch() == Triple::amdgcn) ? 
8 : 4; + StackGrowsUp = true; HasSingleParameterDotFile = false; //===------------------------------------------------------------------===// MinInstAlignment = 4; diff --git a/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/lib/Target/AMDGPU/SIFixSGPRCopies.cpp index f9d258f44a62..b0f0bf04a891 100644 --- a/lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ b/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -81,6 +81,11 @@ using namespace llvm; #define DEBUG_TYPE "si-fix-sgpr-copies" +static cl::opt<bool> EnableM0Merge( + "amdgpu-enable-merge-m0", + cl::desc("Merge and hoist M0 initializations"), + cl::init(false)); + namespace { class SIFixSGPRCopies : public MachineFunctionPass { @@ -108,7 +113,7 @@ public: INITIALIZE_PASS_BEGIN(SIFixSGPRCopies, DEBUG_TYPE, "SI Fix SGPR copies", false, false) -INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) INITIALIZE_PASS_END(SIFixSGPRCopies, DEBUG_TYPE, "SI Fix SGPR copies", false, false) @@ -332,27 +337,186 @@ static bool isSafeToFoldImmIntoCopy(const MachineInstr *Copy, return true; } -static bool predsHasDivergentTerminator(MachineBasicBlock *MBB, - const TargetRegisterInfo *TRI) { - DenseSet<MachineBasicBlock*> Visited; +template <class UnaryPredicate> +bool searchPredecessors(const MachineBasicBlock *MBB, + const MachineBasicBlock *CutOff, + UnaryPredicate Predicate) { + + if (MBB == CutOff) + return false; + + DenseSet<const MachineBasicBlock*> Visited; SmallVector<MachineBasicBlock*, 4> Worklist(MBB->pred_begin(), MBB->pred_end()); while (!Worklist.empty()) { - MachineBasicBlock *mbb = Worklist.back(); - Worklist.pop_back(); + MachineBasicBlock *MBB = Worklist.pop_back_val(); - if (!Visited.insert(mbb).second) + if (!Visited.insert(MBB).second) continue; - if (hasTerminatorThatModifiesExec(*mbb, *TRI)) + if (MBB == CutOff) + continue; + if (Predicate(MBB)) return true; - Worklist.insert(Worklist.end(), mbb->pred_begin(), mbb->pred_end()); + Worklist.append(MBB->pred_begin(), MBB->pred_end()); } return false; } +static bool predsHasDivergentTerminator(MachineBasicBlock *MBB, + const TargetRegisterInfo *TRI) { + return searchPredecessors(MBB, nullptr, [TRI](MachineBasicBlock *MBB) { + return hasTerminatorThatModifiesExec(*MBB, *TRI); }); +} + +// Checks if there is a potential path from the From instruction to the To +// instruction. If CutOff is specified and lies on that path, the portion of +// the path above CutOff is ignored and From is reported as not reachable. +static bool isReachable(const MachineInstr *From, + const MachineInstr *To, + const MachineBasicBlock *CutOff, + MachineDominatorTree &MDT) { + // True if the block of From dominates the block of To, or if both + // instructions are in the same block and From comes first. + if (MDT.dominates(From, To)) + return true; + + const MachineBasicBlock *MBBFrom = From->getParent(); + const MachineBasicBlock *MBBTo = To->getParent(); + if (MBBFrom == MBBTo) + return false; + + // Instructions are in different blocks, do predecessor search. + // We should almost never get here since we do not usually produce M0 stores + // other than -1. + return searchPredecessors(MBBTo, CutOff, [MBBFrom] + (const MachineBasicBlock *MBB) { return MBB == MBBFrom; }); +} +
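(Aside: the new searchPredecessors helper above is a generic, cut-off-bounded worklist walk over transitive CFG predecessors. A minimal standalone sketch of the same shape, with illustrative names rather than code from this patch, assuming the usual llvm/ADT includes:

template <class UnaryPredicate>
static bool anyTransitivePredecessor(const MachineBasicBlock *Root,
                                     const MachineBasicBlock *CutOff,
                                     UnaryPredicate Pred) {
  DenseSet<const MachineBasicBlock *> Visited;
  SmallVector<const MachineBasicBlock *, 4> Worklist(Root->pred_begin(),
                                                     Root->pred_end());
  while (!Worklist.empty()) {
    const MachineBasicBlock *MBB = Worklist.pop_back_val();
    // Skip blocks we have already seen, and never walk above the cut-off.
    if (!Visited.insert(MBB).second || MBB == CutOff)
      continue;
    if (Pred(MBB))
      return true;
    Worklist.append(MBB->pred_begin(), MBB->pred_end());
  }
  return false;
}

The same traversal serves both callers: predsHasDivergentTerminator passes no cut-off, while isReachable uses the cut-off to bound the search at a dominating block.)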
+// Hoist and merge identical SGPR initializations into a common predecessor. +// This is intended to combine M0 initializations, but can work with any +// SGPR. A VGPR cannot be processed since we cannot guarantee vector +// execution. +static bool hoistAndMergeSGPRInits(unsigned Reg, + const MachineRegisterInfo &MRI, + MachineDominatorTree &MDT) { + // List of inits by immediate value. + typedef std::map<unsigned, std::list<MachineInstr*>> InitListMap; + InitListMap Inits; + // List of clobbering instructions. + SmallVector<MachineInstr*, 8> Clobbers; + bool Changed = false; + + for (auto &MI : MRI.def_instructions(Reg)) { + MachineOperand *Imm = nullptr; + for (auto &MO: MI.operands()) { + if ((MO.isReg() && ((MO.isDef() && MO.getReg() != Reg) || !MO.isDef())) || + (!MO.isImm() && !MO.isReg()) || (MO.isImm() && Imm)) { + Imm = nullptr; + break; + } else if (MO.isImm()) + Imm = &MO; + } + if (Imm) + Inits[Imm->getImm()].push_front(&MI); + else + Clobbers.push_back(&MI); + } + + for (auto &Init : Inits) { + auto &Defs = Init.second; + + for (auto I1 = Defs.begin(), E = Defs.end(); I1 != E; ) { + MachineInstr *MI1 = *I1; + + for (auto I2 = std::next(I1); I2 != E; ) { + MachineInstr *MI2 = *I2; + + // Check for any possible interference. + auto interferes = [&](MachineBasicBlock::iterator From, + MachineBasicBlock::iterator To) -> bool { + + assert(MDT.dominates(&*To, &*From)); + + auto clobberInterferes = [&MDT, From, To](MachineInstr* &Clobber) -> bool { + const MachineBasicBlock *MBBFrom = From->getParent(); + const MachineBasicBlock *MBBTo = To->getParent(); + bool MayClobberFrom = isReachable(Clobber, &*From, MBBTo, MDT); + bool MayClobberTo = isReachable(Clobber, &*To, MBBTo, MDT); + if (!MayClobberFrom && !MayClobberTo) + return false; + if ((MayClobberFrom && !MayClobberTo) || + (!MayClobberFrom && MayClobberTo)) + return true; + // Both can clobber. This is not an interference only if both are + // dominated by Clobber and belong to the same block, or if Clobber + // properly dominates To: since To >> From, Clobber then dominates + // both and is located in a common dominator.
+ return !((MBBFrom == MBBTo && + MDT.dominates(Clobber, &*From) && + MDT.dominates(Clobber, &*To)) || + MDT.properlyDominates(Clobber->getParent(), MBBTo)); + }; + + return (any_of(Clobbers, clobberInterferes)) || + (any_of(Inits, [&](InitListMap::value_type &C) { + return C.first != Init.first && any_of(C.second, clobberInterferes); + })); + }; + + if (MDT.dominates(MI1, MI2)) { + if (!interferes(MI2, MI1)) { + DEBUG(dbgs() << "Erasing from BB#" << MI2->getParent()->getNumber() + << " " << *MI2); + MI2->eraseFromParent(); + Defs.erase(I2++); + Changed = true; + continue; + } + } else if (MDT.dominates(MI2, MI1)) { + if (!interferes(MI1, MI2)) { + DEBUG(dbgs() << "Erasing from BB#" << MI1->getParent()->getNumber() + << " " << *MI1); + MI1->eraseFromParent(); + Defs.erase(I1++); + Changed = true; + break; + } + } else { + auto *MBB = MDT.findNearestCommonDominator(MI1->getParent(), + MI2->getParent()); + if (!MBB) { + ++I2; + continue; + } + + MachineBasicBlock::iterator I = MBB->getFirstNonPHI(); + if (!interferes(MI1, I) && !interferes(MI2, I)) { + DEBUG(dbgs() << "Erasing from BB#" << MI1->getParent()->getNumber() + << " " << *MI1 << "and moving from BB#" + << MI2->getParent()->getNumber() << " to BB#" + << I->getParent()->getNumber() << " " << *MI2); + I->getParent()->splice(I, MI2->getParent(), MI2); + MI1->eraseFromParent(); + Defs.erase(I1++); + Changed = true; + break; + } + } + ++I2; + } + ++I1; + } + } + + if (Changed) + MRI.clearKillFlags(Reg); + + return Changed; +} + bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); MachineRegisterInfo &MRI = MF.getRegInfo(); @@ -485,5 +649,8 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { } } + if (MF.getTarget().getOptLevel() > CodeGenOpt::None && EnableM0Merge) + hoistAndMergeSGPRInits(AMDGPU::M0, MRI, *MDT); + return true; }
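To make hoistAndMergeSGPRInits concrete, here is a rough before/after for a diamond whose arms initialize M0 to the same immediate (hand-written MIR-style pseudocode, assuming no clobber of M0 is reachable from the hoist point):

// Before:
//   bb.0:
//     S_CBRANCH_SCC1 %bb.2
//   bb.1:
//     $m0 = S_MOV_B32 -1
//     DS_READ_B32 ...
//   bb.2:
//     $m0 = S_MOV_B32 -1
//     DS_WRITE_B32 ...
//
// After (one init spliced to the nearest common dominator, the other erased):
//   bb.0:
//     $m0 = S_MOV_B32 -1
//     S_CBRANCH_SCC1 %bb.2
//   bb.1:
//     DS_READ_B32 ...
//   bb.2:
//     DS_WRITE_B32 ...

Neither init dominates the other, so this is the else-branch above: one copy moves to the first non-PHI point of the nearest common dominator, provided the interference check passes for both original sites.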
diff --git a/lib/Target/AMDGPU/SIFrameLowering.cpp b/lib/Target/AMDGPU/SIFrameLowering.cpp index abe6af9a6d3f..86e3b37b09e9 100644 --- a/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -101,10 +101,12 @@ unsigned SIFrameLowering::getReservedPrivateSegmentBufferReg( const SIRegisterInfo *TRI, SIMachineFunctionInfo *MFI, MachineFunction &MF) const { + MachineRegisterInfo &MRI = MF.getRegInfo(); // We need to insert initialization of the scratch resource descriptor. unsigned ScratchRsrcReg = MFI->getScratchRSrcReg(); - if (ScratchRsrcReg == AMDGPU::NoRegister) + if (ScratchRsrcReg == AMDGPU::NoRegister || + !MRI.isPhysRegUsed(ScratchRsrcReg)) return AMDGPU::NoRegister; if (ST.hasSGPRInitBug() || @@ -122,8 +124,6 @@ unsigned SIFrameLowering::getReservedPrivateSegmentBufferReg( // We find the resource first because it has an alignment requirement. - MachineRegisterInfo &MRI = MF.getRegInfo(); - unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 3) / 4; ArrayRef<MCPhysReg> AllSGPR128s = getAllSGPR128(ST, MF); AllSGPR128s = AllSGPR128s.slice(std::min(static_cast<unsigned>(AllSGPR128s.size()), NumPreloaded)); @@ -143,24 +143,34 @@ unsigned SIFrameLowering::getReservedPrivateSegmentBufferReg( return ScratchRsrcReg; } -unsigned SIFrameLowering::getReservedPrivateSegmentWaveByteOffsetReg( +// Shift down registers reserved for the scratch wave offset and stack pointer +// SGPRs. +std::pair<unsigned, unsigned> +SIFrameLowering::getReservedPrivateSegmentWaveByteOffsetReg( const SISubtarget &ST, const SIInstrInfo *TII, const SIRegisterInfo *TRI, SIMachineFunctionInfo *MFI, MachineFunction &MF) const { + MachineRegisterInfo &MRI = MF.getRegInfo(); unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg(); - if (ST.hasSGPRInitBug() || - ScratchWaveOffsetReg != TRI->reservedPrivateSegmentWaveByteOffsetReg(MF)) - return ScratchWaveOffsetReg; - unsigned ScratchRsrcReg = MFI->getScratchRSrcReg(); - MachineRegisterInfo &MRI = MF.getRegInfo(); + // No replacement necessary. + if (ScratchWaveOffsetReg == AMDGPU::NoRegister || + !MRI.isPhysRegUsed(ScratchWaveOffsetReg)) { + assert(MFI->getStackPtrOffsetReg() == AMDGPU::NoRegister); + return std::make_pair(AMDGPU::NoRegister, AMDGPU::NoRegister); + } + + unsigned SPReg = MFI->getStackPtrOffsetReg(); + if (ST.hasSGPRInitBug()) + return std::make_pair(ScratchWaveOffsetReg, SPReg); + unsigned NumPreloaded = MFI->getNumPreloadedSGPRs(); ArrayRef<MCPhysReg> AllSGPRs = getAllSGPRs(ST, MF); if (NumPreloaded > AllSGPRs.size()) - return ScratchWaveOffsetReg; + return std::make_pair(ScratchWaveOffsetReg, SPReg); AllSGPRs = AllSGPRs.slice(NumPreloaded); @@ -175,26 +185,42 @@ unsigned SIFrameLowering::getReservedPrivateSegmentWaveByteOffsetReg( // register from the list to consider, it means that when this // register is being used for the scratch wave offset and there // are no other free SGPRs, then the value will stay in this register. + // + 1 if stack pointer is used. // ---- - // 13 - if (AllSGPRs.size() < 13) - return ScratchWaveOffsetReg; + // 13 (+1) + unsigned ReservedRegCount = 13; + if (SPReg != AMDGPU::NoRegister) + ++ReservedRegCount; - for (MCPhysReg Reg : AllSGPRs.drop_back(13)) { + if (AllSGPRs.size() < ReservedRegCount) + return std::make_pair(ScratchWaveOffsetReg, SPReg); + + bool HandledScratchWaveOffsetReg = + ScratchWaveOffsetReg != TRI->reservedPrivateSegmentWaveByteOffsetReg(MF); + + for (MCPhysReg Reg : AllSGPRs.drop_back(ReservedRegCount)) { // Pick the first unallocated SGPR. Be careful not to pick an alias of the // scratch descriptor, since we haven't added its uses yet.
- if (!MRI.isPhysRegUsed(Reg)) { - if (!MRI.isAllocatable(Reg) || - TRI->isSubRegisterEq(ScratchRsrcReg, Reg)) - continue; + if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg)) { + if (!HandledScratchWaveOffsetReg) { + HandledScratchWaveOffsetReg = true; - MRI.replaceRegWith(ScratchWaveOffsetReg, Reg); - MFI->setScratchWaveOffsetReg(Reg); - return Reg; + MRI.replaceRegWith(ScratchWaveOffsetReg, Reg); + MFI->setScratchWaveOffsetReg(Reg); + ScratchWaveOffsetReg = Reg; + } else { + if (SPReg == AMDGPU::NoRegister) + break; + + MRI.replaceRegWith(SPReg, Reg); + MFI->setStackPtrOffsetReg(Reg); + SPReg = Reg; + break; + } } } - return ScratchWaveOffsetReg; + return std::make_pair(ScratchWaveOffsetReg, SPReg); } void SIFrameLowering::emitPrologue(MachineFunction &MF, @@ -220,18 +246,6 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF, const SIRegisterInfo *TRI = &TII->getRegisterInfo(); MachineRegisterInfo &MRI = MF.getRegInfo(); - unsigned ScratchRsrcReg - = getReservedPrivateSegmentBufferReg(ST, TII, TRI, MFI, MF); - unsigned ScratchWaveOffsetReg - = getReservedPrivateSegmentWaveByteOffsetReg(ST, TII, TRI, MFI, MF); - - if (ScratchRsrcReg == AMDGPU::NoRegister) { - assert(ScratchWaveOffsetReg == AMDGPU::NoRegister); - return; - } - - assert(!TRI->isSubRegister(ScratchRsrcReg, ScratchWaveOffsetReg)); - // We need to do the replacement of the private segment buffer and wave offset // register even if there are no stack objects. There could be stores to undef // or a constant without an associated object. @@ -244,19 +258,49 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF, if (MF.getFrameInfo().hasStackObjects() && MFI->hasFlatScratchInit()) emitFlatScratchInit(ST, MF, MBB); + unsigned SPReg = MFI->getStackPtrOffsetReg(); + if (SPReg != AMDGPU::NoRegister) { + DebugLoc DL; + int64_t StackSize = MF.getFrameInfo().getStackSize(); + + if (StackSize == 0) { + BuildMI(MBB, MBB.begin(), DL, TII->get(AMDGPU::COPY), SPReg) + .addReg(MFI->getScratchWaveOffsetReg()); + } else { + BuildMI(MBB, MBB.begin(), DL, TII->get(AMDGPU::S_ADD_U32), SPReg) + .addReg(MFI->getScratchWaveOffsetReg()) + .addImm(StackSize * ST.getWavefrontSize()); + } + } + + unsigned ScratchRsrcReg + = getReservedPrivateSegmentBufferReg(ST, TII, TRI, MFI, MF); + + unsigned ScratchWaveOffsetReg; + std::tie(ScratchWaveOffsetReg, SPReg) + = getReservedPrivateSegmentWaveByteOffsetReg(ST, TII, TRI, MFI, MF); + + // It's possible to have uses of only ScratchWaveOffsetReg without + // ScratchRsrcReg if it's only used for the initialization of flat_scratch, + // but the inverse is not true. + if (ScratchWaveOffsetReg == AMDGPU::NoRegister) { + assert(ScratchRsrcReg == AMDGPU::NoRegister); + return; + } + // We need to insert initialization of the scratch resource descriptor. unsigned PreloadedScratchWaveOffsetReg = TRI->getPreloadedValue( MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); - unsigned PreloadedPrivateBufferReg = AMDGPU::NoRegister; if (ST.isAmdCodeObjectV2(MF) || ST.isMesaGfxShader(MF)) { PreloadedPrivateBufferReg = TRI->getPreloadedValue( MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER); } - bool OffsetRegUsed = !MRI.use_empty(ScratchWaveOffsetReg); - bool ResourceRegUsed = !MRI.use_empty(ScratchRsrcReg); + bool OffsetRegUsed = MRI.isPhysRegUsed(ScratchWaveOffsetReg); + bool ResourceRegUsed = ScratchRsrcReg != AMDGPU::NoRegister && + MRI.isPhysRegUsed(ScratchRsrcReg); // We added live-ins during argument lowering, but since they were not used // they were deleted. We're adding the uses now, so add them back. 
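One prologue detail above deserves a note: the stack pointer SGPR holds a byte offset for the whole wave, while MachineFrameInfo reports a per-lane frame size, hence the multiply by the wavefront size. A sketch of the arithmetic, using the same names as the patch (the concrete numbers are only an example):

// Per-lane frame size in bytes, as laid out by frame lowering.
int64_t StackSize = MF.getFrameInfo().getStackSize();
// Scratch is allocated for every lane of the wave, so the wave-level SP
// advances by StackSize for each lane, e.g. 16 bytes/lane * 64 lanes = 1024.
int64_t WaveBump = StackSize * ST.getWavefrontSize();
// Emitted as: s_add_u32 SPReg, ScratchWaveOffsetReg, WaveBump
// (or a plain COPY from ScratchWaveOffsetReg when StackSize == 0).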
@@ -469,7 +513,7 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized( // this also ensures we shouldn't need a register for the offset when // emergency scavenging. int ScavengeFI = MFI.CreateFixedObject( - AMDGPU::SGPR_32RegClass.getSize(), 0, false); + TRI.getSpillSize(AMDGPU::SGPR_32RegClass), 0, false); RS->addScavengingFrameIndex(ScavengeFI); } } diff --git a/lib/Target/AMDGPU/SIFrameLowering.h b/lib/Target/AMDGPU/SIFrameLowering.h index 1bfc08093da2..7ccd02b3c86a 100644 --- a/lib/Target/AMDGPU/SIFrameLowering.h +++ b/lib/Target/AMDGPU/SIFrameLowering.h @@ -49,7 +49,7 @@ private: SIMachineFunctionInfo *MFI, MachineFunction &MF) const; - unsigned getReservedPrivateSegmentWaveByteOffsetReg( + std::pair<unsigned, unsigned> getReservedPrivateSegmentWaveByteOffsetReg( const SISubtarget &ST, const SIInstrInfo *TII, const SIRegisterInfo *TRI, diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp index dd867b15b4c7..ce74a7cd8b04 100644 --- a/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/lib/Target/AMDGPU/SIISelLowering.cpp @@ -287,8 +287,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, // This is s_memtime on SI and s_memrealtime on VI. setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal); - setOperationAction(ISD::TRAP, MVT::Other, Legal); - setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal); + setOperationAction(ISD::TRAP, MVT::Other, Custom); + setOperationAction(ISD::DEBUGTRAP, MVT::Other, Custom); setOperationAction(ISD::FMINNUM, MVT::f64, Legal); setOperationAction(ISD::FMAXNUM, MVT::f64, Legal); @@ -1644,7 +1644,7 @@ computeIndirectRegAndOffset(const SIRegisterInfo &TRI, const TargetRegisterClass *SuperRC, unsigned VecReg, int Offset) { - int NumElts = SuperRC->getSize() / 4; + int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32; // Skip out of bounds offsets, or else we would end up using an undefined // register.
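Note the trap change above: ISD::TRAP and ISD::DEBUGTRAP go from Legal to Custom, replacing the S_TRAP_PSEUDO custom inserter (deleted below) with DAG-level lowering in the new lowerTRAP. Roughly, and hedging on the exact trap ID values, the generated code is:

// With the HSA trap handler enabled, lowerTRAP (added below) materializes the
// queue pointer in SGPR0_SGPR1, which the handler ABI expects, then traps:
//   s_mov_b64 s[0:1], <queue_ptr>
//   s_trap 2            ; TrapIDLLVMTrap (TrapIDLLVMDebugTrap for debugtrap)
// Without a trap handler, llvm.trap lowers to s_endpgm, and llvm.debugtrap
// emits a "debugtrap handler not supported" diagnostic and lowers to nothing.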
@@ -1793,17 +1793,18 @@ static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI, return LoopBB; } -static unsigned getMOVRELDPseudo(const TargetRegisterClass *VecRC) { - switch (VecRC->getSize()) { - case 4: +static unsigned getMOVRELDPseudo(const SIRegisterInfo &TRI, + const TargetRegisterClass *VecRC) { + switch (TRI.getRegSizeInBits(*VecRC)) { + case 32: // 4 bytes return AMDGPU::V_MOVRELD_B32_V1; - case 8: + case 64: // 8 bytes return AMDGPU::V_MOVRELD_B32_V2; - case 16: + case 128: // 16 bytes return AMDGPU::V_MOVRELD_B32_V4; - case 32: + case 256: // 32 bytes return AMDGPU::V_MOVRELD_B32_V8; - case 64: + case 512: // 64 bytes return AMDGPU::V_MOVRELD_B32_V16; default: llvm_unreachable("unsupported size for MOVRELD pseudos"); @@ -1863,7 +1864,7 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI, BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF)); } else { - const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(VecRC)); + const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(TRI, VecRC)); BuildMI(MBB, I, DL, MovRelDesc) .addReg(Dst, RegState::Define) @@ -1907,7 +1908,7 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI, .addReg(PhiReg, RegState::Implicit) .addReg(AMDGPU::M0, RegState::Implicit); } else { - const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(VecRC)); + const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(TRI, VecRC)); BuildMI(*LoopBB, InsPt, DL, MovRelDesc) .addReg(Dst, RegState::Define) @@ -1948,50 +1949,6 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( } switch (MI.getOpcode()) { - case AMDGPU::S_TRAP_PSEUDO: { - const DebugLoc &DL = MI.getDebugLoc(); - const int TrapType = MI.getOperand(0).getImm(); - - if (Subtarget->getTrapHandlerAbi() == SISubtarget::TrapHandlerAbiHsa && - Subtarget->isTrapHandlerEnabled()) { - - MachineFunction *MF = BB->getParent(); - SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>(); - unsigned UserSGPR = Info->getQueuePtrUserSGPR(); - assert(UserSGPR != AMDGPU::NoRegister); - - if (!BB->isLiveIn(UserSGPR)) - BB->addLiveIn(UserSGPR); - - BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), AMDGPU::SGPR0_SGPR1) - .addReg(UserSGPR); - BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_TRAP)) - .addImm(TrapType) - .addReg(AMDGPU::SGPR0_SGPR1, RegState::Implicit); - } else { - switch (TrapType) { - case SISubtarget::TrapIDLLVMTrap: - BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_ENDPGM)); - break; - case SISubtarget::TrapIDLLVMDebugTrap: { - DiagnosticInfoUnsupported NoTrap(*MF->getFunction(), - "debugtrap handler not supported", - DL, - DS_Warning); - LLVMContext &C = MF->getFunction()->getContext(); - C.diagnose(NoTrap); - BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_NOP)) - .addImm(0); - break; - } - default: - llvm_unreachable("unsupported trap handler type!"); - } - } - - MI.eraseFromParent(); - return BB; - } case AMDGPU::SI_INIT_M0: BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(), TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0) @@ -2163,6 +2120,10 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { return lowerEXTRACT_VECTOR_ELT(Op, DAG); case ISD::FP_ROUND: return lowerFP_ROUND(Op, DAG); + + case ISD::TRAP: + case ISD::DEBUGTRAP: + return lowerTRAP(Op, DAG); } return SDValue(); } @@ -2431,6 +2392,57 @@ SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc); } +SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const { + SDLoc SL(Op); + MachineFunction &MF = 
DAG.getMachineFunction(); + SDValue Chain = Op.getOperand(0); + + unsigned TrapID = Op.getOpcode() == ISD::DEBUGTRAP ? + SISubtarget::TrapIDLLVMDebugTrap : SISubtarget::TrapIDLLVMTrap; + + if (Subtarget->getTrapHandlerAbi() == SISubtarget::TrapHandlerAbiHsa && + Subtarget->isTrapHandlerEnabled()) { + SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); + unsigned UserSGPR = Info->getQueuePtrUserSGPR(); + assert(UserSGPR != AMDGPU::NoRegister); + + SDValue QueuePtr = CreateLiveInRegister( + DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64); + + SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64); + + SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01, + QueuePtr, SDValue()); + + SDValue Ops[] = { + ToReg, + DAG.getTargetConstant(TrapID, SL, MVT::i16), + SGPR01, + ToReg.getValue(1) + }; + + return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops); + } + + switch (TrapID) { + case SISubtarget::TrapIDLLVMTrap: + return DAG.getNode(AMDGPUISD::ENDPGM, SL, MVT::Other, Chain); + case SISubtarget::TrapIDLLVMDebugTrap: { + DiagnosticInfoUnsupported NoTrap(*MF.getFunction(), + "debugtrap handler not supported", + Op.getDebugLoc(), + DS_Warning); + LLVMContext &Ctx = MF.getFunction()->getContext(); + Ctx.diagnose(NoTrap); + return Chain; + } + default: + llvm_unreachable("unsupported trap handler type!"); + } + + return Chain; +} + SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL, SelectionDAG &DAG) const { // FIXME: Use inline constants (src_{shared, private}_base) instead. @@ -3410,9 +3422,11 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op, EVT VT = Op.getValueType(); bool Unsafe = DAG.getTarget().Options.UnsafeFPMath; + if (!Unsafe && VT == MVT::f32 && Subtarget->hasFP32Denormals()) + return SDValue(); + if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) { - if (Unsafe || (VT == MVT::f32 && !Subtarget->hasFP32Denormals()) || - VT == MVT::f16) { + if (Unsafe || VT == MVT::f32 || VT == MVT::f16) { if (CLHS->isExactlyValue(1.0)) { // v_rcp_f32 and v_rsq_f32 do not support denormals and, according to // the CI documentation, have a worst-case error of 1 ulp. @@ -4696,7 +4710,7 @@ SDValue SITargetLowering::performCvtF32UByteNCombine(SDNode *N, TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), !DCI.isBeforeLegalizeOps()); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - if (TLO.ShrinkDemandedConstant(Src, Demanded) || + if (TLI.ShrinkDemandedConstant(Src, Demanded, TLO) || TLI.SimplifyDemandedBits(Src, Demanded, KnownZero, KnownOne, TLO)) { DCI.CommitTargetLoweringOpt(TLO); } diff --git a/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index c2a3e62aa827..9122cd72d323 100644 --- a/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -428,8 +428,8 @@ RegInterval BlockWaitcntBrackets::getRegInterval(const MachineInstr *MI, const MachineInstr &MIA = *MI; const TargetRegisterClass *RC = TII->getOpRegClass(MIA, OpNo); - unsigned Size = RC->getSize(); - Result.second = Result.first + (Size / 4); + unsigned Size = TRI->getRegSizeInBits(*RC); + Result.second = Result.first + (Size / 32); return Result; }
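A quick reference for the size-query migration running through these AMDGPU files (and the ARM ones further down): TargetRegisterClass::getSize()/getAlignment(), which worked in bytes, are replaced by TargetRegisterInfo queries with bits as the canonical unit. The conversions used throughout, as a sketch (RC is any TargetRegisterClass, TRI the TargetRegisterInfo):

unsigned Bits   = TRI->getRegSizeInBits(*RC); // register width in bits
unsigned Bytes  = Bits / 8;                   // the old RC->getSize()
unsigned DWords = Bits / 32;                  // 32-bit lanes; used for wait
                                              // counts and subregister counts
unsigned SpillSize  = TRI->getSpillSize(*RC);      // stack slot size in bytes
unsigned SpillAlign = TRI->getSpillAlignment(*RC); // stack slot alignment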
diff --git a/lib/Target/AMDGPU/SIInsertWaits.cpp b/lib/Target/AMDGPU/SIInsertWaits.cpp index 47257ce16ceb..9f32ecfa52ff 100644 --- a/lib/Target/AMDGPU/SIInsertWaits.cpp +++ b/lib/Target/AMDGPU/SIInsertWaits.cpp @@ -216,8 +216,8 @@ Counters SIInsertWaits::getHwCounts(MachineInstr &MI) { // XXX - What if this is a write into a super register? const TargetRegisterClass *RC = TII->getOpRegClass(MI, 0); - unsigned Size = RC->getSize(); - Result.Named.LGKM = Size > 4 ? 2 : 1; + unsigned Size = TRI->getRegSizeInBits(*RC); + Result.Named.LGKM = Size > 32 ? 2 : 1; } else { // s_dcache_inv etc. do not have a destination register. Assume we // want a wait on these. @@ -289,12 +289,12 @@ bool SIInsertWaits::isOpRelevant(MachineOperand &Op) { RegInterval SIInsertWaits::getRegInterval(const TargetRegisterClass *RC, const MachineOperand &Reg) const { - unsigned Size = RC->getSize(); - assert(Size >= 4); + unsigned Size = TRI->getRegSizeInBits(*RC); + assert(Size >= 32); RegInterval Result; Result.first = TRI->getEncodingValue(Reg.getReg()); - Result.second = Result.first + Size / 4; + Result.second = Result.first + Size / 32; return Result; } diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp index 05ac67d26620..92e452a3d6a0 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -138,6 +138,11 @@ bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, } if (isSMRD(Opc0) && isSMRD(Opc1)) { + // Skip time and cache invalidation instructions. + if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::sbase) == -1 || + AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::sbase) == -1) + return false; + assert(getNumOperandsNoGlue(Load0) == getNumOperandsNoGlue(Load1)); // Check base reg. @@ -245,11 +250,11 @@ bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg, unsigned EltSize; if (LdSt.mayLoad()) - EltSize = getOpRegClass(LdSt, 0)->getSize() / 2; + EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16; else { assert(LdSt.mayStore()); int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0); - EltSize = getOpRegClass(LdSt, Data0Idx)->getSize(); + EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8; } if (isStride64(Opc)) @@ -345,7 +350,7 @@ bool SIInstrInfo::shouldClusterMemOps(MachineInstr &FirstLdSt, FirstLdSt.getParent()->getParent()->getRegInfo(); const TargetRegisterClass *DstRC = MRI.getRegClass(FirstDst->getReg()); - return (NumLoads * DstRC->getSize()) <= LoadClusterThreshold; + return (NumLoads * (RI.getRegSizeInBits(*DstRC) / 8)) <= LoadClusterThreshold; } static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB, @@ -433,7 +438,7 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, unsigned EltSize = 4; unsigned Opcode = AMDGPU::V_MOV_B32_e32; if (RI.isSGPRClass(RC)) { - if (RC->getSize() > 4) { + if (RI.getRegSizeInBits(*RC) > 32) { Opcode = AMDGPU::S_MOV_B64; EltSize = 8; } else { @@ -493,11 +498,11 @@ int SIInstrInfo::commuteOpcode(unsigned Opcode) const { unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const { - if (DstRC->getSize() == 4) { + if (RI.getRegSizeInBits(*DstRC) == 32) { return RI.isSGPRClass(DstRC) ? 
AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32; - } else if (DstRC->getSize() == 8 && RI.isSGPRClass(DstRC)) { + } else if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC)) { return AMDGPU::S_MOV_B64; - } else if (DstRC->getSize() == 8 && !RI.isSGPRClass(DstRC)) { + } else if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC)) { return AMDGPU::V_MOV_B64_PSEUDO; } return AMDGPU::COPY; @@ -557,17 +562,18 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, MachineMemOperand *MMO = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore, Size, Align); + unsigned SpillSize = TRI->getSpillSize(*RC); if (RI.isSGPRClass(RC)) { MFI->setHasSpilledSGPRs(); // We are only allowed to create one new instruction when spilling // registers, so we need to use pseudo instruction for spilling SGPRs. - const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(RC->getSize())); + const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize)); // The SGPR spill/restore instructions only work on numbered SGPRs, so we need // to make sure we are using the correct register class. - if (TargetRegisterInfo::isVirtualRegister(SrcReg) && RC->getSize() == 4) { + if (TargetRegisterInfo::isVirtualRegister(SrcReg) && SpillSize == 4) { MachineRegisterInfo &MRI = MF->getRegInfo(); MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0RegClass); } @@ -602,7 +608,7 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected"); - unsigned Opcode = getVGPRSpillSaveOpcode(RC->getSize()); + unsigned Opcode = getVGPRSpillSaveOpcode(SpillSize); MFI->setHasSpilledVGPRs(); BuildMI(MBB, MI, DL, get(Opcode)) .addReg(SrcReg, getKillRegState(isKill)) // data @@ -660,6 +666,7 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, DebugLoc DL = MBB.findDebugLoc(MI); unsigned Align = FrameInfo.getObjectAlignment(FrameIndex); unsigned Size = FrameInfo.getObjectSize(FrameIndex); + unsigned SpillSize = TRI->getSpillSize(*RC); MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(*MF, FrameIndex); @@ -670,8 +677,8 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, if (RI.isSGPRClass(RC)) { // FIXME: Maybe this should not include a memoperand because it will be // lowered to non-memory instructions. - const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(RC->getSize())); - if (TargetRegisterInfo::isVirtualRegister(DestReg) && RC->getSize() == 4) { + const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize)); + if (TargetRegisterInfo::isVirtualRegister(DestReg) && SpillSize == 4) { MachineRegisterInfo &MRI = MF->getRegInfo(); MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0RegClass); } @@ -701,7 +708,7 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected"); - unsigned Opcode = getVGPRSpillRestoreOpcode(RC->getSize()); + unsigned Opcode = getVGPRSpillRestoreOpcode(SpillSize); BuildMI(MBB, MI, DL, get(Opcode), DestReg) .addFrameIndex(FrameIndex) // vaddr .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
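For context on the getSGPRSpillSaveOpcode(SpillSize) calls above: the helper itself is not in this diff; from the surrounding tree it is, roughly, a byte-size switch like the sketch below, which is why feeding it TRI->getSpillSize(*RC) keeps the existing behavior (a sketch from memory, not verbatim; the restore and VGPR variants follow the same shape):

static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
  switch (Size) {
  case 4:  return AMDGPU::SI_SPILL_S32_SAVE;
  case 8:  return AMDGPU::SI_SPILL_S64_SAVE;
  case 16: return AMDGPU::SI_SPILL_S128_SAVE;
  case 32: return AMDGPU::SI_SPILL_S256_SAVE;
  case 64: return AMDGPU::SI_SPILL_S512_SAVE;
  default: llvm_unreachable("unknown register size");
  }
}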
@@ -1440,9 +1447,9 @@ void SIInstrInfo::insertSelect(MachineBasicBlock &MBB, MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg); - unsigned DstSize = DstRC->getSize(); + unsigned DstSize = RI.getRegSizeInBits(*DstRC); - if (DstSize == 4) { + if (DstSize == 32) { unsigned SelOp = Pred == SCC_TRUE ? AMDGPU::S_CSELECT_B32 : AMDGPU::V_CNDMASK_B32_e32; @@ -1456,7 +1463,7 @@ void SIInstrInfo::insertSelect(MachineBasicBlock &MBB, return; } - if (DstSize == 8 && Pred == SCC_TRUE) { + if (DstSize == 64 && Pred == SCC_TRUE) { MachineInstr *Select = BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg) .addReg(FalseReg) @@ -1483,7 +1490,7 @@ void SIInstrInfo::insertSelect(MachineBasicBlock &MBB, unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32; const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass; const int16_t *SubIndices = Sub0_15; - int NElts = DstSize / 4; + int NElts = DstSize / 32; // 64-bit select is only available for SALU. if (Pred == SCC_TRUE) { @@ -2635,6 +2642,19 @@ void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI, if (isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src1)) return; + // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for + // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane + // select is uniform. + if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() && + RI.isVGPR(MRI, Src1.getReg())) { + unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + const DebugLoc &DL = MI.getDebugLoc(); + BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg) + .add(Src1); + Src1.ChangeToRegister(Reg, false); + return; + } + // We do not use commuteInstruction here because it is too aggressive and will // commute if it is possible. We only want to commute here if it improves // legality. This can be called a fairly large number of times so don't waste @@ -2729,7 +2749,7 @@ unsigned SIInstrInfo::readlaneVGPRToSGPR(unsigned SrcReg, MachineInstr &UseMI, const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg); const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC); unsigned DstReg = MRI.createVirtualRegister(SRC); - unsigned SubRegs = VRC->getSize() / 4; + unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32; SmallVector<unsigned, 8> SRegs; for (unsigned i = 0; i < SubRegs; ++i) { @@ -3595,7 +3615,7 @@ void SIInstrInfo::movePackToVALU(SmallVectorImpl<MachineInstr *> &Worklist, .addImm(16) .add(Src0); BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg) - .addImm(0xffff); + .addImm(0xffff0000); BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32), ResultReg) .add(Src1) .addReg(ImmReg, RegState::Kill) diff --git a/lib/Target/AMDGPU/SIInstrInfo.h b/lib/Target/AMDGPU/SIInstrInfo.h index 659473ca6a47..03a5ef74b179 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.h +++ b/lib/Target/AMDGPU/SIInstrInfo.h @@ -626,13 +626,13 @@ public: return 4; } - return RI.getRegClass(OpInfo.RegClass)->getSize(); + return RI.getRegSizeInBits(*RI.getRegClass(OpInfo.RegClass)) / 8; } /// \brief This form should usually be preferred since it handles operands /// with unknown register classes. 
unsigned getOpSize(const MachineInstr &MI, unsigned OpNo) const { - return getOpRegClass(MI, OpNo)->getSize(); + return RI.getRegSizeInBits(*getOpRegClass(MI, OpNo)) / 8; } /// \returns true if it is legal for the operand at index \p OpNo diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td index c6daf743f3ac..7b052844f177 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.td +++ b/lib/Target/AMDGPU/SIInstrInfo.td @@ -646,11 +646,10 @@ def DS64Bit4ByteAligned : ComplexPattern<i32, 3, "SelectDS64Bit4ByteAligned">; def MOVRELOffset : ComplexPattern<i32, 2, "SelectMOVRELOffset">; def VOP3Mods0 : ComplexPattern<untyped, 4, "SelectVOP3Mods0">; -def VOP3NoMods0 : ComplexPattern<untyped, 4, "SelectVOP3NoMods0">; def VOP3Mods0Clamp : ComplexPattern<untyped, 3, "SelectVOP3Mods0Clamp">; def VOP3Mods0Clamp0OMod : ComplexPattern<untyped, 4, "SelectVOP3Mods0Clamp0OMod">; def VOP3Mods : ComplexPattern<untyped, 2, "SelectVOP3Mods">; -def VOP3NoMods : ComplexPattern<untyped, 2, "SelectVOP3NoMods">; +def VOP3NoMods : ComplexPattern<untyped, 1, "SelectVOP3NoMods">; // VOP3Mods, but the input source is known to never be NaN. def VOP3Mods_nnan : ComplexPattern<fAny, 2, "SelectVOP3Mods_NNaN">; diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td index 2f89503e129a..3f6ddec70479 100644 --- a/lib/Target/AMDGPU/SIInstructions.td +++ b/lib/Target/AMDGPU/SIInstructions.td @@ -94,6 +94,12 @@ defm V_INTERP_MOV_F32 : VINTRP_m < //===----------------------------------------------------------------------===// // Pseudo Instructions //===----------------------------------------------------------------------===// +def ATOMIC_FENCE : SPseudoInstSI< + (outs), (ins i32imm:$ordering, i32imm:$scope), + [(atomic_fence (i32 imm:$ordering), (i32 imm:$scope))], + "ATOMIC_FENCE $ordering, $scope"> { + let hasSideEffects = 1; +} let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in { @@ -111,12 +117,6 @@ def V_MOV_B64_PSEUDO : VPseudoInstSI <(outs VReg_64:$vdst), (ins VSrc_b64:$src0)>; } // End let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] -def S_TRAP_PSEUDO : SPseudoInstSI <(outs), (ins i16imm:$simm16)> { - let hasSideEffects = 1; - let SALU = 1; - let usesCustomInserter = 1; -} - let usesCustomInserter = 1, SALU = 1 in { def GET_GROUPSTATICSIZE : PseudoInstSI <(outs SReg_32:$sdst), (ins), [(set SReg_32:$sdst, (int_amdgcn_groupstaticsize))]>; @@ -400,13 +400,8 @@ def SI_PC_ADD_REL_OFFSET : SPseudoInstSI < let Predicates = [isGCN] in { def : Pat< - (trap), - (S_TRAP_PSEUDO TRAPID.LLVM_TRAP) ->; - -def : Pat< - (debugtrap), - (S_TRAP_PSEUDO TRAPID.LLVM_DEBUG_TRAP) + (AMDGPUtrap timm:$trapid), + (S_TRAP $trapid) >; def : Pat< @@ -477,8 +472,8 @@ def : Pat < // fp_to_fp16 patterns def : Pat < - (i32 (AMDGPUfp_to_f16 (f32 (VOP3Mods0 f32:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)))), - (V_CVT_F16_F32_e64 $src0_modifiers, f32:$src0, $clamp, $omod) + (i32 (AMDGPUfp_to_f16 (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))), + (V_CVT_F16_F32_e64 $src0_modifiers, f32:$src0, DSTCLAMP.NONE, DSTOMOD.NONE) >; def : Pat < @@ -507,11 +502,11 @@ def : Pat < multiclass FMADPat <ValueType vt, Instruction inst> { def : Pat < - (vt (fmad (VOP3NoMods0 vt:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod), - (VOP3NoMods vt:$src1, i32:$src1_modifiers), - (VOP3NoMods vt:$src2, i32:$src2_modifiers))), - (inst $src0_modifiers, $src0, $src1_modifiers, $src1, - $src2_modifiers, $src2, $clamp, $omod) + (vt (fmad (VOP3NoMods vt:$src0), + (VOP3NoMods vt:$src1), + 
(VOP3NoMods vt:$src2))), + (inst SRCMODS.NONE, $src0, SRCMODS.NONE, $src1, + SRCMODS.NONE, $src2, DSTCLAMP.NONE, DSTOMOD.NONE) >; } @@ -681,10 +676,9 @@ def : BitConvert <v16f32, v16i32, VReg_512>; // If denormals are not enabled, it only impacts the compare of the // inputs. The output result is not flushed. class ClampPat<Instruction inst, ValueType vt> : Pat < - (vt (AMDGPUclamp - (VOP3Mods0Clamp vt:$src0, i32:$src0_modifiers, i32:$omod))), + (vt (AMDGPUclamp (VOP3Mods vt:$src0, i32:$src0_modifiers))), (inst i32:$src0_modifiers, vt:$src0, - i32:$src0_modifiers, vt:$src0, DSTCLAMP.ENABLE, $omod) + i32:$src0_modifiers, vt:$src0, DSTCLAMP.ENABLE, DSTOMOD.NONE) >; def : ClampPat<V_MAX_F32_e64, f32>; diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index 8e612d2ddfda..b6a982aee6be 100644 --- a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -25,6 +25,8 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) TIDReg(AMDGPU::NoRegister), ScratchRSrcReg(AMDGPU::NoRegister), ScratchWaveOffsetReg(AMDGPU::NoRegister), + FrameOffsetReg(AMDGPU::NoRegister), + StackPtrOffsetReg(AMDGPU::NoRegister), PrivateSegmentBufferUserSGPR(AMDGPU::NoRegister), DispatchPtrUserSGPR(AMDGPU::NoRegister), QueuePtrUserSGPR(AMDGPU::NoRegister), diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/lib/Target/AMDGPU/SIMachineFunctionInfo.h index 810fb05984c4..dc9f509e60ae 100644 --- a/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -88,6 +88,14 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction { unsigned ScratchRSrcReg; unsigned ScratchWaveOffsetReg; + // Register holding the offset of the current function's frame, derived by + // incrementing the kernel's scratch wave offset register. For an entry + // function, this is exactly the same as the ScratchWaveOffsetReg. + unsigned FrameOffsetReg; + + // Top of the stack SGPR offset derived from the ScratchWaveOffsetReg. + unsigned StackPtrOffsetReg; + // Input registers for non-HSA ABI unsigned PrivateMemoryPtrUserSGPR; @@ -364,9 +372,25 @@ public: return ScratchWaveOffsetReg; } + unsigned getFrameOffsetReg() const { + return FrameOffsetReg; + } + + void setStackPtrOffsetReg(unsigned Reg) { + assert(Reg != AMDGPU::NoRegister && "Should never be unset"); + StackPtrOffsetReg = Reg; + } + + unsigned getStackPtrOffsetReg() const { + return StackPtrOffsetReg; + } + void setScratchWaveOffsetReg(unsigned Reg) { assert(Reg != AMDGPU::NoRegister && "Should never be unset"); ScratchWaveOffsetReg = Reg; + + // FIXME: Only for entry functions. + FrameOffsetReg = ScratchWaveOffsetReg; } unsigned getQueuePtrUserSGPR() const { diff --git a/lib/Target/AMDGPU/SIRegisterInfo.cpp b/lib/Target/AMDGPU/SIRegisterInfo.cpp index 098c67252dd8..8820e294562b 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -146,6 +146,9 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { reserveRegisterTuples(Reserved, AMDGPU::EXEC); reserveRegisterTuples(Reserved, AMDGPU::FLAT_SCR); + // M0 has to be reserved so that llvm accepts it as a live-in into a block. + reserveRegisterTuples(Reserved, AMDGPU::M0); + // Reserve the memory aperture registers. 
reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_BASE); reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_LIMIT); @@ -615,7 +618,8 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, if (SpillToSMEM && isSGPRClass(RC)) { // XXX - if private_element_size is larger than 4 it might be useful to be // able to spill wider vmem spills. - std::tie(EltSize, ScalarStoreOp) = getSpillEltSize(RC->getSize(), true); + std::tie(EltSize, ScalarStoreOp) = + getSpillEltSize(getRegSizeInBits(*RC) / 8, true); } ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize); @@ -775,7 +779,8 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, if (SpillToSMEM && isSGPRClass(RC)) { // XXX - if private_element_size is larger than 4 it might be useful to be // able to spill wider vmem spills. - std::tie(EltSize, ScalarLoadOp) = getSpillEltSize(RC->getSize(), false); + std::tie(EltSize, ScalarLoadOp) = + getSpillEltSize(getRegSizeInBits(*RC) / 8, false); } ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize); @@ -1038,20 +1043,21 @@ const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const { // TODO: It might be helpful to have some target specific flags in // TargetRegisterClass to mark which classes are VGPRs to make this trivial. bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const { - switch (RC->getSize()) { - case 0: return false; - case 1: return false; - case 4: + unsigned Size = getRegSizeInBits(*RC); + if (Size < 32) + return false; + switch (Size) { + case 32: return getCommonSubClass(&AMDGPU::VGPR_32RegClass, RC) != nullptr; - case 8: + case 64: return getCommonSubClass(&AMDGPU::VReg_64RegClass, RC) != nullptr; - case 12: + case 96: return getCommonSubClass(&AMDGPU::VReg_96RegClass, RC) != nullptr; - case 16: + case 128: return getCommonSubClass(&AMDGPU::VReg_128RegClass, RC) != nullptr; - case 32: + case 256: return getCommonSubClass(&AMDGPU::VReg_256RegClass, RC) != nullptr; - case 64: + case 512: return getCommonSubClass(&AMDGPU::VReg_512RegClass, RC) != nullptr; default: llvm_unreachable("Invalid register class size"); @@ -1060,18 +1066,18 @@ bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const { const TargetRegisterClass *SIRegisterInfo::getEquivalentVGPRClass( const TargetRegisterClass *SRC) const { - switch (SRC->getSize()) { - case 4: + switch (getRegSizeInBits(*SRC)) { + case 32: return &AMDGPU::VGPR_32RegClass; - case 8: + case 64: return &AMDGPU::VReg_64RegClass; - case 12: + case 96: return &AMDGPU::VReg_96RegClass; - case 16: + case 128: return &AMDGPU::VReg_128RegClass; - case 32: + case 256: return &AMDGPU::VReg_256RegClass; - case 64: + case 512: return &AMDGPU::VReg_512RegClass; default: llvm_unreachable("Invalid register class size"); @@ -1080,16 +1086,16 @@ const TargetRegisterClass *SIRegisterInfo::getEquivalentVGPRClass( const TargetRegisterClass *SIRegisterInfo::getEquivalentSGPRClass( const TargetRegisterClass *VRC) const { - switch (VRC->getSize()) { - case 4: + switch (getRegSizeInBits(*VRC)) { + case 32: return &AMDGPU::SGPR_32RegClass; - case 8: + case 64: return &AMDGPU::SReg_64RegClass; - case 16: + case 128: return &AMDGPU::SReg_128RegClass; - case 32: + case 256: return &AMDGPU::SReg_256RegClass; - case 64: + case 512: return &AMDGPU::SReg_512RegClass; default: llvm_unreachable("Invalid register class size"); @@ -1354,15 +1360,15 @@ bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI, const TargetRegisterClass *DstRC, unsigned DstSubReg, const TargetRegisterClass *NewRC) const { - 
unsigned SrcSize = SrcRC->getSize(); - unsigned DstSize = DstRC->getSize(); - unsigned NewSize = NewRC->getSize(); + unsigned SrcSize = getRegSizeInBits(*SrcRC); + unsigned DstSize = getRegSizeInBits(*DstRC); + unsigned NewSize = getRegSizeInBits(*NewRC); // Do not increase size of registers beyond dword, we would need to allocate // adjacent registers and constrain regalloc more than needed. // Always allow dword coalescing. - if (SrcSize <= 4 || DstSize <= 4) + if (SrcSize <= 32 || DstSize <= 32) return true; return NewSize <= DstSize || NewSize <= SrcSize; diff --git a/lib/Target/AMDGPU/SOPInstructions.td b/lib/Target/AMDGPU/SOPInstructions.td index b4adbdd1df07..593439c2a3cd 100644 --- a/lib/Target/AMDGPU/SOPInstructions.td +++ b/lib/Target/AMDGPU/SOPInstructions.td @@ -530,14 +530,16 @@ class SOPKInstTable <bit is_sopk, string cmpOp = ""> { class SOPK_32 <string opName, list<dag> pattern=[]> : SOPK_Pseudo < opName, (outs SReg_32:$sdst), - (ins u16imm:$simm16), + (ins s16imm:$simm16), "$sdst, $simm16", pattern>; -class SOPK_SCC <string opName, string base_op = ""> : SOPK_Pseudo < +class SOPK_SCC <string opName, string base_op, bit isSignExt> : SOPK_Pseudo < opName, (outs), - (ins SReg_32:$sdst, u16imm:$simm16), + !if(isSignExt, + (ins SReg_32:$sdst, s16imm:$simm16), + (ins SReg_32:$sdst, u16imm:$simm16)), "$sdst, $simm16", []>, SOPKInstTable<1, base_op>{ let Defs = [SCC]; @@ -546,7 +548,7 @@ class SOPK_SCC <string opName, string base_op = ""> : SOPK_Pseudo < class SOPK_32TIE <string opName, list<dag> pattern=[]> : SOPK_Pseudo < opName, (outs SReg_32:$sdst), - (ins SReg_32:$src0, u16imm:$simm16), + (ins SReg_32:$src0, s16imm:$simm16), "$sdst, $simm16", pattern >; @@ -575,20 +577,20 @@ let isCompare = 1 in { // [(set i1:$dst, (setcc i32:$src0, imm:$src1, SETEQ))] // >; -def S_CMPK_EQ_I32 : SOPK_SCC <"s_cmpk_eq_i32", "s_cmp_eq_i32">; -def S_CMPK_LG_I32 : SOPK_SCC <"s_cmpk_lg_i32", "s_cmp_lg_i32">; -def S_CMPK_GT_I32 : SOPK_SCC <"s_cmpk_gt_i32", "s_cmp_gt_i32">; -def S_CMPK_GE_I32 : SOPK_SCC <"s_cmpk_ge_i32", "s_cmp_ge_i32">; -def S_CMPK_LT_I32 : SOPK_SCC <"s_cmpk_lt_i32", "s_cmp_lt_i32">; -def S_CMPK_LE_I32 : SOPK_SCC <"s_cmpk_le_i32", "s_cmp_le_i32">; +def S_CMPK_EQ_I32 : SOPK_SCC <"s_cmpk_eq_i32", "s_cmp_eq_i32", 1>; +def S_CMPK_LG_I32 : SOPK_SCC <"s_cmpk_lg_i32", "s_cmp_lg_i32", 1>; +def S_CMPK_GT_I32 : SOPK_SCC <"s_cmpk_gt_i32", "s_cmp_gt_i32", 1>; +def S_CMPK_GE_I32 : SOPK_SCC <"s_cmpk_ge_i32", "s_cmp_ge_i32", 1>; +def S_CMPK_LT_I32 : SOPK_SCC <"s_cmpk_lt_i32", "s_cmp_lt_i32", 1>; +def S_CMPK_LE_I32 : SOPK_SCC <"s_cmpk_le_i32", "s_cmp_le_i32", 1>; let SOPKZext = 1 in { -def S_CMPK_EQ_U32 : SOPK_SCC <"s_cmpk_eq_u32", "s_cmp_eq_u32">; -def S_CMPK_LG_U32 : SOPK_SCC <"s_cmpk_lg_u32", "s_cmp_lg_u32">; -def S_CMPK_GT_U32 : SOPK_SCC <"s_cmpk_gt_u32", "s_cmp_gt_u32">; -def S_CMPK_GE_U32 : SOPK_SCC <"s_cmpk_ge_u32", "s_cmp_ge_u32">; -def S_CMPK_LT_U32 : SOPK_SCC <"s_cmpk_lt_u32", "s_cmp_lt_u32">; -def S_CMPK_LE_U32 : SOPK_SCC <"s_cmpk_le_u32", "s_cmp_le_u32">; +def S_CMPK_EQ_U32 : SOPK_SCC <"s_cmpk_eq_u32", "s_cmp_eq_u32", 0>; +def S_CMPK_LG_U32 : SOPK_SCC <"s_cmpk_lg_u32", "s_cmp_lg_u32", 0>; +def S_CMPK_GT_U32 : SOPK_SCC <"s_cmpk_gt_u32", "s_cmp_gt_u32", 0>; +def S_CMPK_GE_U32 : SOPK_SCC <"s_cmpk_ge_u32", "s_cmp_ge_u32", 0>; +def S_CMPK_LT_U32 : SOPK_SCC <"s_cmpk_lt_u32", "s_cmp_lt_u32", 0>; +def S_CMPK_LE_U32 : SOPK_SCC <"s_cmpk_le_u32", "s_cmp_le_u32", 0>; } // End SOPKZext = 1 } // End isCompare = 1 @@ -600,7 +602,7 @@ let Defs = [SCC], isCommutable = 1, DisableEncoding = "$src0", def 
S_CBRANCH_I_FORK : SOPK_Pseudo < "s_cbranch_i_fork", - (outs), (ins SReg_64:$sdst, u16imm:$simm16), + (outs), (ins SReg_64:$sdst, s16imm:$simm16), "$sdst, $simm16" >; diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 86095a8e1142..5a3242bed1d0 100644 --- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -93,6 +93,12 @@ unsigned getVmcntBitWidthHi() { return 2; } } // end namespace anonymous namespace llvm { + +static cl::opt<bool> EnablePackedInlinableLiterals( + "enable-packed-inlinable-literals", + cl::desc("Enable packed inlinable literals (v2f16, v2i16)"), + cl::init(false)); + namespace AMDGPU { namespace IsaInfo { @@ -703,6 +709,9 @@ bool isInlinableLiteral16(int16_t Literal, bool HasInv2Pi) { bool isInlinableLiteralV216(int32_t Literal, bool HasInv2Pi) { assert(HasInv2Pi); + if (!EnablePackedInlinableLiterals) + return false; + int16_t Lo16 = static_cast<int16_t>(Literal); int16_t Hi16 = static_cast<int16_t>(Literal >> 16); return Lo16 == Hi16 && isInlinableLiteral16(Lo16, HasInv2Pi); diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp index 4f5711ca9a79..5c9d589e2625 100644 --- a/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -905,7 +905,7 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOStore, MFI.getObjectSize(FI), Align); - switch (RC->getSize()) { + switch (TRI->getSpillSize(*RC)) { case 4: if (ARM::GPRRegClass.hasSubClassEq(RC)) { BuildMI(MBB, I, DL, get(ARM::STRi12)) @@ -1103,7 +1103,7 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOLoad, MFI.getObjectSize(FI), Align); - switch (RC->getSize()) { + switch (TRI->getSpillSize(*RC)) { case 4: if (ARM::GPRRegClass.hasSubClassEq(RC)) { BuildMI(MBB, I, DL, get(ARM::LDRi12), DestReg) diff --git a/lib/Target/ARM/ARMBaseInstrInfo.h b/lib/Target/ARM/ARMBaseInstrInfo.h index faf1c631a3a7..28c407f74125 100644 --- a/lib/Target/ARM/ARMBaseInstrInfo.h +++ b/lib/Target/ARM/ARMBaseInstrInfo.h @@ -105,10 +105,6 @@ public: // Return whether the target has an explicit NOP encoding. bool hasNOP() const; - virtual void getNoopForElfTarget(MCInst &NopInst) const { - getNoopForMachoTarget(NopInst); - } - // Return the non-pre/post incrementing version of 'Opc'. Return 0 // if there is not such an opcode. virtual unsigned getUnindexedOpcode(unsigned Opc) const = 0; diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/lib/Target/ARM/ARMBaseRegisterInfo.cpp index 70a44eaaceb8..a20887564f44 100644 --- a/lib/Target/ARM/ARMBaseRegisterInfo.cpp +++ b/lib/Target/ARM/ARMBaseRegisterInfo.cpp @@ -806,7 +806,8 @@ bool ARMBaseRegisterInfo::shouldCoalesce(MachineInstr *MI, if (!DstSubReg) return true; // Small registers don't frequently cause a problem, so we can coalesce them. 
- if (NewRC->getSize() < 32 && DstRC->getSize() < 32 && SrcRC->getSize() < 32) + if (getRegSizeInBits(*NewRC) < 256 && getRegSizeInBits(*DstRC) < 256 && + getRegSizeInBits(*SrcRC) < 256) return true; auto NewRCWeight = diff --git a/lib/Target/ARM/ARMCallLowering.cpp b/lib/Target/ARM/ARMCallLowering.cpp index 94b317a8f986..13fb30767c9f 100644 --- a/lib/Target/ARM/ARMCallLowering.cpp +++ b/lib/Target/ARM/ARMCallLowering.cpp @@ -35,7 +35,8 @@ ARMCallLowering::ARMCallLowering(const ARMTargetLowering &TLI) static bool isSupportedType(const DataLayout &DL, const ARMTargetLowering &TLI, Type *T) { EVT VT = TLI.getValueType(DL, T, true); - if (!VT.isSimple() || VT.isVector()) + if (!VT.isSimple() || VT.isVector() || + !(VT.isInteger() || VT.isFloatingPoint())) return false; unsigned VTSize = VT.getSimpleVT().getSizeInBits(); diff --git a/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/lib/Target/ARM/ARMExpandPseudoInsts.cpp index e0aecff2633b..78a9144bd321 100644 --- a/lib/Target/ARM/ARMExpandPseudoInsts.cpp +++ b/lib/Target/ARM/ARMExpandPseudoInsts.cpp @@ -661,7 +661,6 @@ static bool IsAnAddressOperand(const MachineOperand &MO) { return false; case MachineOperand::MO_IntrinsicID: case MachineOperand::MO_Predicate: - case MachineOperand::MO_Placeholder: llvm_unreachable("should not exist post-isel"); } llvm_unreachable("unhandled machine operand type"); diff --git a/lib/Target/ARM/ARMFrameLowering.cpp b/lib/Target/ARM/ARMFrameLowering.cpp index 70dbe1bc5b95..4f7a0ab4e220 100644 --- a/lib/Target/ARM/ARMFrameLowering.cpp +++ b/lib/Target/ARM/ARMFrameLowering.cpp @@ -1960,10 +1960,10 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF, // note: Thumb1 functions spill to R12, not the stack. Reserve a slot // closest to SP or frame pointer. assert(RS && "Register scavenging not provided"); - const TargetRegisterClass *RC = &ARM::GPRRegClass; - RS->addScavengingFrameIndex(MFI.CreateStackObject(RC->getSize(), - RC->getAlignment(), - false)); + const TargetRegisterClass &RC = ARM::GPRRegClass; + unsigned Size = TRI->getSpillSize(RC); + unsigned Align = TRI->getSpillAlignment(RC); + RS->addScavengingFrameIndex(MFI.CreateStackObject(Size, Align, false)); } } } diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index 165e9b7378c7..382f881f7741 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -3358,8 +3358,12 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget) { - // FIXME: handle "fence singlethread" more efficiently. SDLoc dl(Op); + ConstantSDNode *ScopeN = cast<ConstantSDNode>(Op.getOperand(2)); + auto Scope = static_cast<SynchronizationScope>(ScopeN->getZExtValue()); + if (Scope == SynchronizationScope::SingleThread) + return Op; + if (!Subtarget->hasDataBarrier()) { // Some ARMv6 cpus can support data barriers with an mcr instruction. // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get @@ -9476,8 +9480,11 @@ AddCombineBUILD_VECTORToVPADDL(SDNode *N, SDValue N0, SDValue N1, return SDValue(); } - // Don't generate vpaddl+vmovn; we'll match it to vpadd later. - if (Vec.getValueType().getVectorElementType() == VT.getVectorElementType()) + // Don't generate vpaddl+vmovn; we'll match it to vpadd later. Also make sure + // we're using the entire input vector, otherwise there's a size/legality + // mismatch somewhere. 
+ if (nextIndex != Vec.getValueType().getVectorNumElements() || + Vec.getValueType().getVectorElementType() == VT.getVectorElementType()) return SDValue(); // Create VPADDL node. diff --git a/lib/Target/ARM/ARMInstrInfo.cpp b/lib/Target/ARM/ARMInstrInfo.cpp index 3b3606ef462a..a0e2ac4cbc6f 100644 --- a/lib/Target/ARM/ARMInstrInfo.cpp +++ b/lib/Target/ARM/ARMInstrInfo.cpp @@ -32,8 +32,8 @@ using namespace llvm; ARMInstrInfo::ARMInstrInfo(const ARMSubtarget &STI) : ARMBaseInstrInfo(STI), RI() {} -/// getNoopForMachoTarget - Return the noop instruction to use for a noop. -void ARMInstrInfo::getNoopForMachoTarget(MCInst &NopInst) const { +/// Return the noop instruction to use for a noop. +void ARMInstrInfo::getNoop(MCInst &NopInst) const { if (hasNOP()) { NopInst.setOpcode(ARM::HINT); NopInst.addOperand(MCOperand::createImm(0)); diff --git a/lib/Target/ARM/ARMInstrInfo.h b/lib/Target/ARM/ARMInstrInfo.h index 4b1b7097b18d..c87fb97448c9 100644 --- a/lib/Target/ARM/ARMInstrInfo.h +++ b/lib/Target/ARM/ARMInstrInfo.h @@ -25,8 +25,8 @@ class ARMInstrInfo : public ARMBaseInstrInfo { public: explicit ARMInstrInfo(const ARMSubtarget &STI); - /// getNoopForMachoTarget - Return the noop instruction to use for a noop. - void getNoopForMachoTarget(MCInst &NopInst) const override; + /// Return the noop instruction to use for a noop. + void getNoop(MCInst &NopInst) const override; // Return the non-pre/post incrementing version of 'Opc'. Return 0 // if there is not such an opcode. diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td index 703e8071b177..9d8ee5c3f9dc 100644 --- a/lib/Target/ARM/ARMInstrInfo.td +++ b/lib/Target/ARM/ARMInstrInfo.td @@ -5975,3 +5975,10 @@ def CMP_SWAP_64 : PseudoInst<(outs GPRPair:$Rd, GPR:$status), (ins GPR:$addr, GPRPair:$desired, GPRPair:$new), NoItinerary, []>, Sched<[]>; } + +def CompilerBarrier : PseudoInst<(outs), (ins i32imm:$ordering), NoItinerary, + [(atomic_fence imm:$ordering, 0)]> { + let hasSideEffects = 1; + let Size = 0; + let AsmString = "@ COMPILER BARRIER"; +} diff --git a/lib/Target/ARM/ARMInstrThumb.td b/lib/Target/ARM/ARMInstrThumb.td index f2f426e86701..8048c758e998 100644 --- a/lib/Target/ARM/ARMInstrThumb.td +++ b/lib/Target/ARM/ARMInstrThumb.td @@ -953,7 +953,7 @@ let isAdd = 1 in { /// These opcodes will be converted to the real non-S opcodes by /// AdjustInstrPostInstrSelection after giving them an optional CPSR operand. let hasPostISelHook = 1, Defs = [CPSR] in { - let isCommutable = 1 in + let isCommutable = 1, Uses = [CPSR] in def tADCS : tPseudoInst<(outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm), 2, IIC_iALUr, [(set tGPR:$Rdn, CPSR, (ARMadde tGPR:$Rn, tGPR:$Rm, @@ -1292,6 +1292,7 @@ def tSUBrr : // A8.6.212 /// These opcodes will be converted to the real non-S opcodes by /// AdjustInstrPostInstrSelection after giving them an optional CPSR operand. 
let hasPostISelHook = 1, Defs = [CPSR] in { + let Uses = [CPSR] in def tSBCS : tPseudoInst<(outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm), 2, IIC_iALUr, [(set tGPR:$Rdn, CPSR, (ARMsube tGPR:$Rn, tGPR:$Rm, diff --git a/lib/Target/ARM/ARMInstructionSelector.cpp b/lib/Target/ARM/ARMInstructionSelector.cpp index 816596b85721..1c13d51a468e 100644 --- a/lib/Target/ARM/ARMInstructionSelector.cpp +++ b/lib/Target/ARM/ARMInstructionSelector.cpp @@ -47,12 +47,9 @@ static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII, unsigned SrcReg = I.getOperand(1).getReg(); const unsigned SrcSize = RBI.getSizeInBits(SrcReg, MRI, TRI); (void)SrcSize; - assert((DstSize == SrcSize || - // Copies are a means to setup initial types, the number of - // bits may not exactly match. - (TargetRegisterInfo::isPhysicalRegister(SrcReg) && - DstSize <= SrcSize)) && - "Copy with different width?!"); + // We use copies for trunc, so it's ok for the size of the destination to be + // smaller (the higher bits will just be undefined). + assert(DstSize <= SrcSize && "Copy with different width?!"); assert((RegBank->getID() == ARM::GPRRegBankID || RegBank->getID() == ARM::FPRRegBankID) && @@ -294,6 +291,28 @@ bool ARMInstructionSelector::select(MachineInstr &I) const { } break; } + case G_TRUNC: { + // The high bits are undefined, so there's nothing special to do, just + // treat it as a copy. + auto SrcReg = I.getOperand(1).getReg(); + auto DstReg = I.getOperand(0).getReg(); + + const auto &SrcRegBank = *RBI.getRegBank(SrcReg, MRI, TRI); + const auto &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI); + + if (SrcRegBank.getID() != DstRegBank.getID()) { + DEBUG(dbgs() << "G_TRUNC operands on different register banks\n"); + return false; + } + + if (SrcRegBank.getID() != ARM::GPRRegBankID) { + DEBUG(dbgs() << "G_TRUNC on non-GPR not supported yet\n"); + return false; + } + + I.setDesc(TII.get(COPY)); + return selectCopy(I, TII, MRI, TRI, RBI); + } case G_ADD: case G_GEP: I.setDesc(TII.get(ARM::ADDrr)); @@ -313,6 +332,16 @@ bool ARMInstructionSelector::select(MachineInstr &I) const { } MIB.add(predOps(ARMCC::AL)).add(condCodeOp()); break; + case G_SDIV: + assert(TII.getSubtarget().hasDivideInARMMode() && "Unsupported operation"); + I.setDesc(TII.get(ARM::SDIV)); + MIB.add(predOps(ARMCC::AL)); + break; + case G_UDIV: + assert(TII.getSubtarget().hasDivideInARMMode() && "Unsupported operation"); + I.setDesc(TII.get(ARM::UDIV)); + MIB.add(predOps(ARMCC::AL)); + break; case G_FADD: if (!selectFAdd(MIB, TII, MRI)) return false; @@ -332,6 +361,18 @@ bool ARMInstructionSelector::select(MachineInstr &I) const { "Expected constant to live in a GPR"); I.setDesc(TII.get(ARM::MOVi)); MIB.add(predOps(ARMCC::AL)).add(condCodeOp()); + + auto &Val = I.getOperand(1); + if (Val.isCImm()) { + if (Val.getCImm()->getBitWidth() > 32) + return false; + Val.ChangeToImmediate(Val.getCImm()->getZExtValue()); + } + + if (!Val.isImm()) { + return false; + } + break; } case G_STORE: diff --git a/lib/Target/ARM/ARMLegalizerInfo.cpp b/lib/Target/ARM/ARMLegalizerInfo.cpp index fe9681439e6b..9b86030fdd29 100644 --- a/lib/Target/ARM/ARMLegalizerInfo.cpp +++ b/lib/Target/ARM/ARMLegalizerInfo.cpp @@ -13,6 +13,8 @@ #include "ARMLegalizerInfo.h" #include "ARMSubtarget.h" +#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/ValueTypes.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Type.h" @@ -47,6 +49,18 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) { for (auto Ty : {s1, s8, 
s16, s32}) setAction({Op, Ty}, Legal); + for (unsigned Op : {G_SDIV, G_UDIV}) { + for (auto Ty : {s8, s16}) + // FIXME: We need WidenScalar here, but in the case of targets with + // software division we'll also need Libcall afterwards. Treat as Custom + // until we have better support for chaining legalization actions. + setAction({Op, Ty}, Custom); + if (ST.hasDivideInARMMode()) + setAction({Op, s32}, Legal); + else + setAction({Op, s32}, Libcall); + } + for (unsigned Op : {G_SEXT, G_ZEXT}) { setAction({Op, s32}, Legal); for (auto Ty : {s1, s8, s16}) @@ -75,3 +89,48 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) { computeTables(); } + +bool ARMLegalizerInfo::legalizeCustom(MachineInstr &MI, + MachineRegisterInfo &MRI, + MachineIRBuilder &MIRBuilder) const { + using namespace TargetOpcode; + + switch (MI.getOpcode()) { + default: + return false; + case G_SDIV: + case G_UDIV: { + LLT Ty = MRI.getType(MI.getOperand(0).getReg()); + if (Ty != LLT::scalar(16) && Ty != LLT::scalar(8)) + return false; + + // We need to widen to 32 bits and then maybe, if the target requires, + // transform into a libcall. + LegalizerHelper Helper(MIRBuilder.getMF()); + + MachineInstr *NewMI = nullptr; + Helper.MIRBuilder.recordInsertions([&](MachineInstr *MI) { + // Store the new, 32-bit div instruction. + if (MI->getOpcode() == G_SDIV || MI->getOpcode() == G_UDIV) + NewMI = MI; + }); + + auto Result = Helper.widenScalar(MI, 0, LLT::scalar(32)); + Helper.MIRBuilder.stopRecordingInsertions(); + if (Result == LegalizerHelper::UnableToLegalize) { + return false; + } + assert(NewMI && "Couldn't find widened instruction"); + assert((NewMI->getOpcode() == G_SDIV || NewMI->getOpcode() == G_UDIV) && + "Unexpected widened instruction"); + assert(MRI.getType(NewMI->getOperand(0).getReg()).getSizeInBits() == 32 && + "Unexpected type for the widened instruction"); + + Result = Helper.legalizeInstrStep(*NewMI); + if (Result == LegalizerHelper::UnableToLegalize) { + return false; + } + return true; + } + } +} diff --git a/lib/Target/ARM/ARMLegalizerInfo.h b/lib/Target/ARM/ARMLegalizerInfo.h index 0b8a608a6bde..a9bdd367737e 100644 --- a/lib/Target/ARM/ARMLegalizerInfo.h +++ b/lib/Target/ARM/ARMLegalizerInfo.h @@ -24,6 +24,9 @@ class ARMSubtarget; class ARMLegalizerInfo : public LegalizerInfo { public: ARMLegalizerInfo(const ARMSubtarget &ST); + + bool legalizeCustom(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &MIRBuilder) const override; }; } // End llvm namespace. 
#endif diff --git a/lib/Target/ARM/ARMMCInstLower.cpp b/lib/Target/ARM/ARMMCInstLower.cpp index 0fd98268723a..9e9c1ba6c114 100644 --- a/lib/Target/ARM/ARMMCInstLower.cpp +++ b/lib/Target/ARM/ARMMCInstLower.cpp @@ -211,11 +211,9 @@ void ARMAsmPrinter::EmitSled(const MachineInstr &MI, SledKind Kind) .addImm(ARMCC::AL).addReg(0)); MCInst Noop; - Subtarget->getInstrInfo()->getNoopForElfTarget(Noop); + Subtarget->getInstrInfo()->getNoop(Noop); for (int8_t I = 0; I < NoopsInSledCount; I++) - { OutStreamer->EmitInstruction(Noop, getSubtargetInfo()); - } OutStreamer->EmitLabel(Target); recordSled(CurSled, MI, Kind); diff --git a/lib/Target/ARM/ARMRegisterBankInfo.cpp b/lib/Target/ARM/ARMRegisterBankInfo.cpp index e47bd3a8963e..7325817d446b 100644 --- a/lib/Target/ARM/ARMRegisterBankInfo.cpp +++ b/lib/Target/ARM/ARMRegisterBankInfo.cpp @@ -221,8 +221,11 @@ ARMRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case G_ADD: case G_SUB: case G_MUL: + case G_SDIV: + case G_UDIV: case G_SEXT: case G_ZEXT: + case G_TRUNC: case G_GEP: // FIXME: We're abusing the fact that everything lives in a GPR for now; in // the real world we would use different mappings. diff --git a/lib/Target/ARM/Thumb1InstrInfo.cpp b/lib/Target/ARM/Thumb1InstrInfo.cpp index 27bff4d75acf..0ebf55924647 100644 --- a/lib/Target/ARM/Thumb1InstrInfo.cpp +++ b/lib/Target/ARM/Thumb1InstrInfo.cpp @@ -24,8 +24,8 @@ using namespace llvm; Thumb1InstrInfo::Thumb1InstrInfo(const ARMSubtarget &STI) : ARMBaseInstrInfo(STI), RI() {} -/// getNoopForMachoTarget - Return the noop instruction to use for a noop. -void Thumb1InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const { +/// Return the noop instruction to use for a noop. +void Thumb1InstrInfo::getNoop(MCInst &NopInst) const { NopInst.setOpcode(ARM::tMOVr); NopInst.addOperand(MCOperand::createReg(ARM::R8)); NopInst.addOperand(MCOperand::createReg(ARM::R8)); diff --git a/lib/Target/ARM/Thumb1InstrInfo.h b/lib/Target/ARM/Thumb1InstrInfo.h index 931914ad2799..e8d9a9c4ff14 100644 --- a/lib/Target/ARM/Thumb1InstrInfo.h +++ b/lib/Target/ARM/Thumb1InstrInfo.h @@ -25,8 +25,8 @@ class Thumb1InstrInfo : public ARMBaseInstrInfo { public: explicit Thumb1InstrInfo(const ARMSubtarget &STI); - /// getNoopForMachoTarget - Return the noop instruction to use for a noop. - void getNoopForMachoTarget(MCInst &NopInst) const override; + /// Return the noop instruction to use for a noop. + void getNoop(MCInst &NopInst) const override; // Return the non-pre/post incrementing version of 'Opc'. Return 0 // if there is no such opcode. diff --git a/lib/Target/ARM/Thumb2InstrInfo.cpp b/lib/Target/ARM/Thumb2InstrInfo.cpp index 818ba85c7d40..2e2dfe035e26 100644 --- a/lib/Target/ARM/Thumb2InstrInfo.cpp +++ b/lib/Target/ARM/Thumb2InstrInfo.cpp @@ -32,8 +32,8 @@ OldT2IfCvt("old-thumb2-ifcvt", cl::Hidden, Thumb2InstrInfo::Thumb2InstrInfo(const ARMSubtarget &STI) : ARMBaseInstrInfo(STI), RI() {} -/// getNoopForMachoTarget - Return the noop instruction to use for a noop. -void Thumb2InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const { +/// Return the noop instruction to use for a noop.
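+/// (Thumb2 has an architected NOP, hint #0 via tHINT; Thumb1 above has to
+/// emulate one with a "mov r8, r8".)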
+void Thumb2InstrInfo::getNoop(MCInst &NopInst) const { NopInst.setOpcode(ARM::tHINT); NopInst.addOperand(MCOperand::createImm(0)); NopInst.addOperand(MCOperand::createImm(ARMCC::AL)); diff --git a/lib/Target/ARM/Thumb2InstrInfo.h b/lib/Target/ARM/Thumb2InstrInfo.h index 15d63300b6a2..c834ba73bfea 100644 --- a/lib/Target/ARM/Thumb2InstrInfo.h +++ b/lib/Target/ARM/Thumb2InstrInfo.h @@ -26,8 +26,8 @@ class Thumb2InstrInfo : public ARMBaseInstrInfo { public: explicit Thumb2InstrInfo(const ARMSubtarget &STI); - /// getNoopForMachoTarget - Return the noop instruction to use for a noop. - void getNoopForMachoTarget(MCInst &NopInst) const override; + /// Return the noop instruction to use for a noop. + void getNoop(MCInst &NopInst) const override; // Return the non-pre/post incrementing version of 'Opc'. Return 0 // if there is no such opcode. diff --git a/lib/Target/AVR/AVRAsmPrinter.cpp b/lib/Target/AVR/AVRAsmPrinter.cpp index 50bb50b44f27..d6491ce5c3bf 100644 --- a/lib/Target/AVR/AVRAsmPrinter.cpp +++ b/lib/Target/AVR/AVRAsmPrinter.cpp @@ -112,7 +112,8 @@ bool AVRAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, const AVRSubtarget &STI = MF->getSubtarget<AVRSubtarget>(); const TargetRegisterInfo &TRI = *STI.getRegisterInfo(); - unsigned BytesPerReg = TRI.getMinimalPhysRegClass(Reg)->getSize(); + const TargetRegisterClass *RC = TRI.getMinimalPhysRegClass(Reg); + unsigned BytesPerReg = TRI.getRegSizeInBits(*RC) / 8; assert(BytesPerReg <= 2 && "Only 8 and 16 bit regs are supported."); unsigned RegIdx = ByteNumber / BytesPerReg; diff --git a/lib/Target/AVR/AVRExpandPseudoInsts.cpp b/lib/Target/AVR/AVRExpandPseudoInsts.cpp index 13080a5d72f0..540e05a92997 100644 --- a/lib/Target/AVR/AVRExpandPseudoInsts.cpp +++ b/lib/Target/AVR/AVRExpandPseudoInsts.cpp @@ -88,6 +88,9 @@ private: unsigned ArithOpcode, Block &MBB, BlockIt MBBI); + + /// Scavenges a free GPR8 register for use. + unsigned scavengeGPR8(MachineInstr &MI); }; char AVRExpandPseudo::ID = 0; @@ -577,24 +580,43 @@ bool AVRExpandPseudo::expand<AVR::LDWRdPtr>(Block &MBB, BlockIt MBBI) { MachineInstr &MI = *MBBI; unsigned OpLo, OpHi, DstLoReg, DstHiReg; unsigned DstReg = MI.getOperand(0).getReg(); + unsigned TmpReg = 0; // 0 for no temporary register unsigned SrcReg = MI.getOperand(1).getReg(); - bool DstIsDead = MI.getOperand(0).isDead(); bool SrcIsKill = MI.getOperand(1).isKill(); OpLo = AVR::LDRdPtr; OpHi = AVR::LDDRdPtrQ; TRI->splitReg(DstReg, DstLoReg, DstHiReg); - assert(DstReg != SrcReg && "SrcReg and DstReg cannot be the same"); + // Use a temporary register if src and dst registers are the same. + if (DstReg == SrcReg) + TmpReg = scavengeGPR8(MI); + + unsigned CurDstLoReg = (DstReg == SrcReg) ? TmpReg : DstLoReg; + unsigned CurDstHiReg = (DstReg == SrcReg) ? TmpReg : DstHiReg; + + // Load low byte. auto MIBLO = buildMI(MBB, MBBI, OpLo) - .addReg(DstLoReg, RegState::Define | getDeadRegState(DstIsDead)) + .addReg(CurDstLoReg, RegState::Define) .addReg(SrcReg); + // Push low byte onto stack if necessary. + if (TmpReg) + buildMI(MBB, MBBI, AVR::PUSHRr).addReg(TmpReg); + + // Load high byte. auto MIBHI = buildMI(MBB, MBBI, OpHi) - .addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead)) + .addReg(CurDstHiReg, RegState::Define) .addReg(SrcReg, getKillRegState(SrcIsKill)) .addImm(1); + if (TmpReg) { + // Move the high byte into the final destination. + buildMI(MBB, MBBI, AVR::MOVRdRr).addReg(DstHiReg).addReg(TmpReg); + + // Move the low byte from the scratch space into the final destination.
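+ // (This pops the byte pushed before the high-byte load above; both loads
+ // reuse the same scavenged scratch register.)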
+ buildMI(MBB, MBBI, AVR::POPRd).addReg(DstLoReg); + } + MIBLO->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); MIBHI->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); @@ -669,9 +691,9 @@ bool AVRExpandPseudo::expand<AVR::LDDWRdPtrQ>(Block &MBB, BlockIt MBBI) { MachineInstr &MI = *MBBI; unsigned OpLo, OpHi, DstLoReg, DstHiReg; unsigned DstReg = MI.getOperand(0).getReg(); + unsigned TmpReg = 0; // 0 for no temporary register unsigned SrcReg = MI.getOperand(1).getReg(); unsigned Imm = MI.getOperand(2).getImm(); - bool DstIsDead = MI.getOperand(0).isDead(); bool SrcIsKill = MI.getOperand(1).isKill(); OpLo = AVR::LDDRdPtrQ; OpHi = AVR::LDDRdPtrQ; @@ -679,60 +701,35 @@ bool AVRExpandPseudo::expand<AVR::LDDWRdPtrQ>(Block &MBB, BlockIt MBBI) { assert(Imm <= 63 && "Offset is out of range"); - MachineInstr *MIBLO, *MIBHI; - - // HACK: We shouldn't have instances of this instruction - // where src==dest because the instruction itself is - // marked earlyclobber. We do however get this instruction when - // loading from stack slots where the earlyclobber isn't useful. - // - // In this case, just use a temporary register. - if (DstReg == SrcReg) { - RegScavenger RS; - - RS.enterBasicBlock(MBB); - RS.forward(MBBI); - - BitVector Candidates = - TRI->getAllocatableSet - (*MBB.getParent(), &AVR::GPR8RegClass); - - // Exclude all the registers being used by the instruction. - for (MachineOperand &MO : MI.operands()) { - if (MO.isReg() && MO.getReg() != 0 && !MO.isDef() && - !TargetRegisterInfo::isVirtualRegister(MO.getReg())) - Candidates.reset(MO.getReg()); - } - - BitVector Available = RS.getRegsAvailable(&AVR::GPR8RegClass); - Available &= Candidates; + // Use a temporary register if src and dst registers are the same. + if (DstReg == SrcReg) + TmpReg = scavengeGPR8(MI); - signed TmpReg = Available.find_first(); - assert(TmpReg != -1 && "ran out of registers"); + unsigned CurDstLoReg = (DstReg == SrcReg) ? TmpReg : DstLoReg; + unsigned CurDstHiReg = (DstReg == SrcReg) ? TmpReg : DstHiReg; - MIBLO = buildMI(MBB, MBBI, OpLo) - .addReg(TmpReg, RegState::Define) - .addReg(SrcReg) - .addImm(Imm); + // Load low byte. + auto MIBLO = buildMI(MBB, MBBI, OpLo) + .addReg(CurDstLoReg, RegState::Define) + .addReg(SrcReg) + .addImm(Imm); - buildMI(MBB, MBBI, AVR::MOVRdRr).addReg(DstLoReg).addReg(TmpReg); + // Push low byte onto stack if necessary. + if (TmpReg) + buildMI(MBB, MBBI, AVR::PUSHRr).addReg(TmpReg); - MIBHI = buildMI(MBB, MBBI, OpHi) - .addReg(TmpReg, RegState::Define) - .addReg(SrcReg, getKillRegState(SrcIsKill)) - .addImm(Imm + 1); + // Load high byte. + auto MIBHI = buildMI(MBB, MBBI, OpHi) + .addReg(CurDstHiReg, RegState::Define) + .addReg(SrcReg, getKillRegState(SrcIsKill)) + .addImm(Imm + 1); + if (TmpReg) { + // Move the high byte into the final destination. buildMI(MBB, MBBI, AVR::MOVRdRr).addReg(DstHiReg).addReg(TmpReg); - } else { - MIBLO = buildMI(MBB, MBBI, OpLo) - .addReg(DstLoReg, RegState::Define | getDeadRegState(DstIsDead)) - .addReg(SrcReg) - .addImm(Imm); - MIBHI = buildMI(MBB, MBBI, OpHi) - .addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead)) - .addReg(SrcReg, getKillRegState(SrcIsKill)) - .addImm(Imm + 1); + // Move the low byte from the scratch space into the final destination. 
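+ // (Same push/pop scheme as in LDWRdPtr above: the scratch register now
+ // holds the high byte, so the low byte comes back off the stack.)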
+ buildMI(MBB, MBBI, AVR::POPRd).addReg(DstLoReg); } MIBLO->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); @@ -819,6 +816,32 @@ bool AVRExpandPseudo::expandAtomicArithmeticOp(unsigned Width, }); } +unsigned AVRExpandPseudo::scavengeGPR8(MachineInstr &MI) { + MachineBasicBlock &MBB = *MI.getParent(); + RegScavenger RS; + + RS.enterBasicBlock(MBB); + RS.forward(MI); + + BitVector Candidates = + TRI->getAllocatableSet + (*MBB.getParent(), &AVR::GPR8RegClass); + + // Exclude all the registers being used by the instruction. + for (MachineOperand &MO : MI.operands()) { + if (MO.isReg() && MO.getReg() != 0 && !MO.isDef() && + !TargetRegisterInfo::isVirtualRegister(MO.getReg())) + Candidates.reset(MO.getReg()); + } + + BitVector Available = RS.getRegsAvailable(&AVR::GPR8RegClass); + Available &= Candidates; + + signed Reg = Available.find_first(); + assert(Reg != -1 && "ran out of registers"); + return Reg; +} + template<> bool AVRExpandPseudo::expand<AVR::AtomicLoad8>(Block &MBB, BlockIt MBBI) { return expandAtomicBinaryOp(AVR::LDRdPtr, MBB, MBBI); @@ -948,7 +971,6 @@ bool AVRExpandPseudo::expand<AVR::STWPtrRr>(Block &MBB, BlockIt MBBI) { unsigned OpLo, OpHi, SrcLoReg, SrcHiReg; unsigned DstReg = MI.getOperand(0).getReg(); unsigned SrcReg = MI.getOperand(1).getReg(); - bool DstIsKill = MI.getOperand(0).isKill(); bool SrcIsKill = MI.getOperand(1).isKill(); OpLo = AVR::STPtrRr; OpHi = AVR::STDPtrQRr; @@ -960,7 +982,7 @@ bool AVRExpandPseudo::expand<AVR::STWPtrRr>(Block &MBB, BlockIt MBBI) { .addReg(SrcLoReg, getKillRegState(SrcIsKill)); auto MIBHI = buildMI(MBB, MBBI, OpHi) - .addReg(DstReg, getKillRegState(DstIsKill)) + .addReg(DstReg) .addImm(1) .addReg(SrcHiReg, getKillRegState(SrcIsKill)); diff --git a/lib/Target/AVR/AVRFrameLowering.cpp b/lib/Target/AVR/AVRFrameLowering.cpp index b8cb2215ddb4..ab42a7aa9901 100644 --- a/lib/Target/AVR/AVRFrameLowering.cpp +++ b/lib/Target/AVR/AVRFrameLowering.cpp @@ -239,7 +239,7 @@ bool AVRFrameLowering::spillCalleeSavedRegisters( unsigned Reg = CSI[i - 1].getReg(); bool IsNotLiveIn = !MBB.isLiveIn(Reg); - assert(TRI->getMinimalPhysRegClass(Reg)->getSize() == 1 && + assert(TRI->getRegSizeInBits(*TRI->getMinimalPhysRegClass(Reg)) == 8 && "Invalid register size"); // Add the callee-saved register as live-in only if it is not already a @@ -277,7 +277,7 @@ bool AVRFrameLowering::restoreCalleeSavedRegisters( for (const CalleeSavedInfo &CCSI : CSI) { unsigned Reg = CCSI.getReg(); - assert(TRI->getMinimalPhysRegClass(Reg)->getSize() == 1 && + assert(TRI->getRegSizeInBits(*TRI->getMinimalPhysRegClass(Reg)) == 8 && "Invalid register size"); BuildMI(MBB, MI, DL, TII.get(AVR::POPRd), Reg); diff --git a/lib/Target/AVR/AVRInstrInfo.cpp b/lib/Target/AVR/AVRInstrInfo.cpp index 88f889260cce..afba66b2e69b 100644 --- a/lib/Target/AVR/AVRInstrInfo.cpp +++ b/lib/Target/AVR/AVRInstrInfo.cpp @@ -142,9 +142,9 @@ void AVRInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, MFI.getObjectAlignment(FrameIndex)); unsigned Opcode = 0; - if (RC->hasType(MVT::i8)) { + if (TRI->isTypeLegalForClass(*RC, MVT::i8)) { Opcode = AVR::STDPtrQRr; - } else if (RC->hasType(MVT::i16)) { + } else if (TRI->isTypeLegalForClass(*RC, MVT::i16)) { Opcode = AVR::STDWPtrQRr; } else { llvm_unreachable("Cannot store this register into a stack slot!"); @@ -176,9 +176,9 @@ void AVRInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, MFI.getObjectAlignment(FrameIndex)); unsigned Opcode = 0; - if (RC->hasType(MVT::i8)) { + if (TRI->isTypeLegalForClass(*RC, MVT::i8)) { Opcode = AVR::LDDRdPtrQ; - } 
else if (RC->hasType(MVT::i16)) { + } else if (TRI->isTypeLegalForClass(*RC, MVT::i16)) { // Opcode = AVR::LDDWRdPtrQ; //:FIXME: remove this once PR13375 gets fixed Opcode = AVR::LDDWRdYQ; diff --git a/lib/Target/AVR/AVRRegisterInfo.cpp b/lib/Target/AVR/AVRRegisterInfo.cpp index 48798bd4a1da..5cc7eaf8add3 100644 --- a/lib/Target/AVR/AVRRegisterInfo.cpp +++ b/lib/Target/AVR/AVRRegisterInfo.cpp @@ -78,11 +78,12 @@ BitVector AVRRegisterInfo::getReservedRegs(const MachineFunction &MF) const { const TargetRegisterClass * AVRRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC, const MachineFunction &MF) const { - if (RC->hasType(MVT::i16)) { + const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); + if (TRI->isTypeLegalForClass(*RC, MVT::i16)) { return &AVR::DREGSRegClass; } - if (RC->hasType(MVT::i8)) { + if (TRI->isTypeLegalForClass(*RC, MVT::i8)) { return &AVR::GPR8RegClass; } diff --git a/lib/Target/Hexagon/BitTracker.cpp b/lib/Target/Hexagon/BitTracker.cpp index cb3049bf1500..07767d1037a9 100644 --- a/lib/Target/Hexagon/BitTracker.cpp +++ b/lib/Target/Hexagon/BitTracker.cpp @@ -347,7 +347,7 @@ uint16_t BT::MachineEvaluator::getRegBitWidth(const RegisterRef &RR) const { unsigned PhysS = (RR.Sub == 0) ? PhysR : TRI.getSubReg(PhysR, RR.Sub); const TargetRegisterClass *RC = TRI.getMinimalPhysRegClass(PhysS); - uint16_t BW = RC->getSize()*8; + uint16_t BW = TRI.getRegSizeInBits(*RC); return BW; } diff --git a/lib/Target/Hexagon/HexagonAsmPrinter.cpp b/lib/Target/Hexagon/HexagonAsmPrinter.cpp index fda23f8f6b05..c8483f7e6e76 100644 --- a/lib/Target/Hexagon/HexagonAsmPrinter.cpp +++ b/lib/Target/Hexagon/HexagonAsmPrinter.cpp @@ -286,9 +286,9 @@ void HexagonAsmPrinter::HexagonProcessInstruction(MCInst &Inst, const MCRegisterInfo *RI = OutStreamer->getContext().getRegisterInfo(); const MachineFunction &MF = *MI.getParent()->getParent(); const auto &HST = MF.getSubtarget<HexagonSubtarget>(); - unsigned VectorSize = HST.useHVXSglOps() - ? Hexagon::VectorRegsRegClass.getSize() - : Hexagon::VectorRegs128BRegClass.getSize(); + const auto &VecRC = HST.useHVXSglOps() ? 
Hexagon::VectorRegsRegClass + : Hexagon::VectorRegs128BRegClass; + unsigned VectorSize = HST.getRegisterInfo()->getSpillSize(VecRC); switch (Inst.getOpcode()) { default: return; diff --git a/lib/Target/Hexagon/HexagonBitSimplify.cpp b/lib/Target/Hexagon/HexagonBitSimplify.cpp index 61f290ca98d7..8502bf24c02f 100644 --- a/lib/Target/Hexagon/HexagonBitSimplify.cpp +++ b/lib/Target/Hexagon/HexagonBitSimplify.cpp @@ -407,7 +407,7 @@ bool HexagonBitSimplify::getSubregMask(const BitTracker::RegisterRef &RR, const TargetRegisterClass *RC = MRI.getRegClass(RR.Reg); if (RR.Sub == 0) { Begin = 0; - Width = RC->getSize()*8; + Width = MRI.getTargetRegisterInfo()->getRegSizeInBits(*RC); return true; } @@ -417,7 +417,7 @@ bool HexagonBitSimplify::getSubregMask(const BitTracker::RegisterRef &RR, case Hexagon::DoubleRegsRegClassID: case Hexagon::VecDblRegsRegClassID: case Hexagon::VecDblRegs128BRegClassID: - Width = RC->getSize()*8 / 2; + Width = MRI.getTargetRegisterInfo()->getRegSizeInBits(*RC) / 2; if (RR.Sub == Hexagon::isub_hi || RR.Sub == Hexagon::vsub_hi) Begin = Width; break; @@ -1054,8 +1054,8 @@ namespace { class RedundantInstrElimination : public Transformation { public: RedundantInstrElimination(BitTracker &bt, const HexagonInstrInfo &hii, - MachineRegisterInfo &mri) - : Transformation(true), HII(hii), MRI(mri), BT(bt) {} + const HexagonRegisterInfo &hri, MachineRegisterInfo &mri) + : Transformation(true), HII(hii), HRI(hri), MRI(mri), BT(bt) {} bool processBlock(MachineBasicBlock &B, const RegisterSet &AVs) override; @@ -1070,6 +1070,7 @@ namespace { bool usedBitsEqual(BitTracker::RegisterRef RD, BitTracker::RegisterRef RS); const HexagonInstrInfo &HII; + const HexagonRegisterInfo &HRI; MachineRegisterInfo &MRI; BitTracker &BT; }; @@ -1262,7 +1263,7 @@ bool RedundantInstrElimination::computeUsedBits(const MachineInstr &MI, assert(MI.getOperand(OpN).isReg()); BitTracker::RegisterRef RR = MI.getOperand(OpN); const TargetRegisterClass *RC = HBS::getFinalVRegClass(RR, MRI); - uint16_t Width = RC->getSize()*8; + uint16_t Width = HRI.getRegSizeInBits(*RC); if (!GotBits) T.set(Begin, Begin+Width); @@ -2173,8 +2174,10 @@ bool BitSimplification::genBitSplit(MachineInstr *MI, const RegisterSet &AVs) { if (!GenBitSplit) return false; - if (CountBitSplit >= MaxBitSplit) - return false; + if (MaxBitSplit.getNumOccurrences()) { + if (CountBitSplit >= MaxBitSplit) + return false; + } unsigned Opc = MI->getOpcode(); switch (Opc) { @@ -2253,7 +2256,8 @@ bool BitSimplification::genBitSplit(MachineInstr *MI, continue; // Generate bitsplit where S is defined. - CountBitSplit++; + if (MaxBitSplit.getNumOccurrences()) + CountBitSplit++; MachineInstr *DefS = MRI.getVRegDef(S); assert(DefS != nullptr); DebugLoc DL = DefS->getDebugLoc(); @@ -2379,9 +2383,11 @@ bool BitSimplification::simplifyExtractLow(MachineInstr *MI, const RegisterSet &AVs) { if (!GenExtract) return false; - if (CountExtract >= MaxExtract) - return false; - CountExtract++; + if (MaxExtract.getNumOccurrences()) { + if (CountExtract >= MaxExtract) + return false; + CountExtract++; + } unsigned W = RC.width(); unsigned RW = W; @@ -2651,7 +2657,7 @@ bool HexagonBitSimplify::runOnMachineFunction(MachineFunction &MF) { Changed |= visitBlock(Entry, ImmG, AIG); RegisterSet ARE; // Available registers for RIE. 
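+ // (RIE now also receives HRI so computeUsedBits can query register widths
+ // through getRegSizeInBits instead of TargetRegisterClass::getSize.)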
- RedundantInstrElimination RIE(BT, HII, MRI); + RedundantInstrElimination RIE(BT, HII, HRI, MRI); bool Ried = visitBlock(Entry, RIE, ARE); if (Ried) { Changed = true; diff --git a/lib/Target/Hexagon/HexagonExpandCondsets.cpp b/lib/Target/Hexagon/HexagonExpandCondsets.cpp index d8ba5dcd35ad..9f8c9ded8127 100644 --- a/lib/Target/Hexagon/HexagonExpandCondsets.cpp +++ b/lib/Target/Hexagon/HexagonExpandCondsets.cpp @@ -559,10 +559,10 @@ unsigned HexagonExpandCondsets::getCondTfrOpcode(const MachineOperand &SO, } unsigned PhysS = (RS.Sub == 0) ? PhysR : TRI->getSubReg(PhysR, RS.Sub); const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(PhysS); - switch (RC->getSize()) { - case 4: + switch (TRI->getRegSizeInBits(*RC)) { + case 32: return IfTrue ? A2_tfrt : A2_tfrf; - case 8: + case 64: return IfTrue ? A2_tfrpt : A2_tfrpf; } llvm_unreachable("Invalid register operand"); diff --git a/lib/Target/Hexagon/HexagonFrameLowering.cpp b/lib/Target/Hexagon/HexagonFrameLowering.cpp index 0e2380f4316a..a04aca4afa0f 100644 --- a/lib/Target/Hexagon/HexagonFrameLowering.cpp +++ b/lib/Target/Hexagon/HexagonFrameLowering.cpp @@ -1425,7 +1425,7 @@ bool HexagonFrameLowering::assignCalleeSavedSpillSlots(MachineFunction &MF, if (!SRegs[S->Reg]) continue; const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(S->Reg); - int FI = MFI.CreateFixedSpillStackObject(RC->getSize(), S->Offset); + int FI = MFI.CreateFixedSpillStackObject(TRI->getSpillSize(*RC), S->Offset); MinOffset = std::min(MinOffset, S->Offset); CSI.push_back(CalleeSavedInfo(S->Reg, FI)); SRegs[S->Reg] = false; @@ -1437,11 +1437,12 @@ bool HexagonFrameLowering::assignCalleeSavedSpillSlots(MachineFunction &MF, for (int x = SRegs.find_first(); x >= 0; x = SRegs.find_next(x)) { unsigned R = x; const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(R); - int Off = MinOffset - RC->getSize(); - unsigned Align = std::min(RC->getAlignment(), getStackAlignment()); + unsigned Size = TRI->getSpillSize(*RC); + int Off = MinOffset - Size; + unsigned Align = std::min(TRI->getSpillAlignment(*RC), getStackAlignment()); assert(isPowerOf2_32(Align)); Off &= -Align; - int FI = MFI.CreateFixedSpillStackObject(RC->getSize(), Off); + int FI = MFI.CreateFixedSpillStackObject(Size, Off); MinOffset = std::min(MinOffset, Off); CSI.push_back(CalleeSavedInfo(R, FI)); SRegs[R] = false; @@ -1677,10 +1678,10 @@ bool HexagonFrameLowering::expandStoreVec2(MachineBasicBlock &B, int FI = MI->getOperand(0).getIndex(); bool Is128B = HST.useHVXDblOps(); - auto *RC = !Is128B ? &Hexagon::VectorRegsRegClass - : &Hexagon::VectorRegs128BRegClass; - unsigned Size = RC->getSize(); - unsigned NeedAlign = RC->getAlignment(); + const auto &RC = !Is128B ? Hexagon::VectorRegsRegClass + : Hexagon::VectorRegs128BRegClass; + unsigned Size = HRI.getSpillSize(RC); + unsigned NeedAlign = HRI.getSpillAlignment(RC); unsigned HasAlign = MFI.getObjectAlignment(FI); unsigned StoreOpc; @@ -1734,10 +1735,10 @@ bool HexagonFrameLowering::expandLoadVec2(MachineBasicBlock &B, int FI = MI->getOperand(1).getIndex(); bool Is128B = HST.useHVXDblOps(); - auto *RC = !Is128B ? &Hexagon::VectorRegsRegClass - : &Hexagon::VectorRegs128BRegClass; - unsigned Size = RC->getSize(); - unsigned NeedAlign = RC->getAlignment(); + const auto &RC = !Is128B ? 
Hexagon::VectorRegsRegClass + : Hexagon::VectorRegs128BRegClass; + unsigned Size = HRI.getSpillSize(RC); + unsigned NeedAlign = HRI.getSpillAlignment(RC); unsigned HasAlign = MFI.getObjectAlignment(FI); unsigned LoadOpc; @@ -1777,16 +1778,16 @@ bool HexagonFrameLowering::expandStoreVec(MachineBasicBlock &B, if (!MI->getOperand(0).isFI()) return false; + auto &HRI = *HST.getRegisterInfo(); DebugLoc DL = MI->getDebugLoc(); unsigned SrcR = MI->getOperand(2).getReg(); bool IsKill = MI->getOperand(2).isKill(); int FI = MI->getOperand(0).getIndex(); bool Is128B = HST.useHVXDblOps(); - auto *RC = !Is128B ? &Hexagon::VectorRegsRegClass - : &Hexagon::VectorRegs128BRegClass; - - unsigned NeedAlign = RC->getAlignment(); + const auto &RC = !Is128B ? Hexagon::VectorRegsRegClass + : Hexagon::VectorRegs128BRegClass; + unsigned NeedAlign = HRI.getSpillAlignment(RC); unsigned HasAlign = MFI.getObjectAlignment(FI); unsigned StoreOpc; @@ -1815,15 +1816,15 @@ bool HexagonFrameLowering::expandLoadVec(MachineBasicBlock &B, if (!MI->getOperand(1).isFI()) return false; + auto &HRI = *HST.getRegisterInfo(); DebugLoc DL = MI->getDebugLoc(); unsigned DstR = MI->getOperand(0).getReg(); int FI = MI->getOperand(1).getIndex(); bool Is128B = HST.useHVXDblOps(); - auto *RC = !Is128B ? &Hexagon::VectorRegsRegClass - : &Hexagon::VectorRegs128BRegClass; - - unsigned NeedAlign = RC->getAlignment(); + const auto &RC = !Is128B ? Hexagon::VectorRegsRegClass + : Hexagon::VectorRegs128BRegClass; + unsigned NeedAlign = HRI.getSpillAlignment(RC); unsigned HasAlign = MFI.getObjectAlignment(FI); unsigned LoadOpc; @@ -1932,7 +1933,7 @@ void HexagonFrameLowering::determineCalleeSaves(MachineFunction &MF, if (!needToReserveScavengingSpillSlots(MF, HRI, RC)) continue; unsigned Num = RC == &Hexagon::IntRegsRegClass ? 
NumberScavengerSlots : 1; - unsigned S = RC->getSize(), A = RC->getAlignment(); + unsigned S = HRI.getSpillSize(*RC), A = HRI.getSpillAlignment(*RC); for (unsigned i = 0; i < Num; i++) { int NewFI = MFI.CreateSpillStackObject(S, A); RS->addScavengingFrameIndex(NewFI); diff --git a/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp b/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp index b5948475e1f7..1829c5da02a6 100644 --- a/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp +++ b/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp @@ -26,6 +26,7 @@ #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/KnownBits.h" #include "llvm/Support/raw_ostream.h" #include <algorithm> @@ -1206,10 +1207,9 @@ bool PolynomialMultiplyRecognize::highBitsAreZero(Value *V, if (!T) return false; - unsigned BW = T->getBitWidth(); - APInt K0(BW, 0), K1(BW, 0); - computeKnownBits(V, K0, K1, DL); - return K0.countLeadingOnes() >= IterCount; + KnownBits Known(T->getBitWidth()); + computeKnownBits(V, Known, DL); + return Known.Zero.countLeadingOnes() >= IterCount; } diff --git a/lib/Target/MSP430/MSP430InstrInfo.td b/lib/Target/MSP430/MSP430InstrInfo.td index c0c29b992238..22fc2474fae6 100644 --- a/lib/Target/MSP430/MSP430InstrInfo.td +++ b/lib/Target/MSP430/MSP430InstrInfo.td @@ -122,6 +122,7 @@ def ADJCALLSTACKUP : Pseudo<(outs), (ins i16imm:$amt1, i16imm:$amt2), } let usesCustomInserter = 1 in { + let Uses = [SR] in { def Select8 : Pseudo<(outs GR8:$dst), (ins GR8:$src, GR8:$src2, i8imm:$cc), "# Select8 PSEUDO", [(set GR8:$dst, @@ -130,6 +131,7 @@ let usesCustomInserter = 1 in { "# Select16 PSEUDO", [(set GR16:$dst, (MSP430selectcc GR16:$src, GR16:$src2, imm:$cc))]>; + } let Defs = [SR] in { def Shl8 : Pseudo<(outs GR8:$dst), (ins GR8:$src, GR8:$cnt), "# Shl8 PSEUDO", diff --git a/lib/Target/Mips/MipsAsmPrinter.cpp b/lib/Target/Mips/MipsAsmPrinter.cpp index 2a9d96205eb9..134f7ac3aea3 100644 --- a/lib/Target/Mips/MipsAsmPrinter.cpp +++ b/lib/Target/Mips/MipsAsmPrinter.cpp @@ -273,9 +273,9 @@ void MipsAsmPrinter::printSavedRegsBitmask() { const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo(); // size of stack area to which FP callee-saved regs are saved. - unsigned CPURegSize = Mips::GPR32RegClass.getSize(); - unsigned FGR32RegSize = Mips::FGR32RegClass.getSize(); - unsigned AFGR64RegSize = Mips::AFGR64RegClass.getSize(); + unsigned CPURegSize = TRI->getRegSizeInBits(Mips::GPR32RegClass) / 8; + unsigned FGR32RegSize = TRI->getRegSizeInBits(Mips::FGR32RegClass) / 8; + unsigned AFGR64RegSize = TRI->getRegSizeInBits(Mips::AFGR64RegClass) / 8; bool HasAFGR64Reg = false; unsigned CSFPRegsSize = 0; diff --git a/lib/Target/Mips/MipsCCState.cpp b/lib/Target/Mips/MipsCCState.cpp index 7af988c1f64d..cb9f676c237a 100644 --- a/lib/Target/Mips/MipsCCState.cpp +++ b/lib/Target/Mips/MipsCCState.cpp @@ -38,7 +38,7 @@ static bool isF128SoftLibCall(const char *CallSym) { /// This function returns true if Ty is fp128, {f128} or i128 which was /// originally a fp128. 
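+/// (For example, an i128 argument to a long double emulation routine such as
+/// "__addtf3" is recognized here as originally being an f128 value.)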
-static bool originalTypeIsF128(Type *Ty, const SDNode *CallNode) { +static bool originalTypeIsF128(const Type *Ty, const char *Func) { if (Ty->isFP128Ty()) return true; @@ -46,12 +46,9 @@ static bool originalTypeIsF128(Type *Ty, const SDNode *CallNode) { Ty->getStructElementType(0)->isFP128Ty()) return true; - const ExternalSymbolSDNode *ES = - dyn_cast_or_null<const ExternalSymbolSDNode>(CallNode); - // If the Ty is i128 and the function being called is a long double emulation // routine, then the original type is f128. - return (ES && Ty->isIntegerTy(128) && isF128SoftLibCall(ES->getSymbol())); + return (Func && Ty->isIntegerTy(128) && isF128SoftLibCall(Func)); } MipsCCState::SpecialCallingConvType @@ -73,11 +70,11 @@ MipsCCState::getSpecialCallingConvForCallee(const SDNode *Callee, void MipsCCState::PreAnalyzeCallResultForF128( const SmallVectorImpl<ISD::InputArg> &Ins, - const TargetLowering::CallLoweringInfo &CLI) { + const Type *RetTy, const char *Call) { for (unsigned i = 0; i < Ins.size(); ++i) { OriginalArgWasF128.push_back( - originalTypeIsF128(CLI.RetTy, CLI.Callee.getNode())); - OriginalArgWasFloat.push_back(CLI.RetTy->isFloatingPointTy()); + originalTypeIsF128(RetTy, Call)); + OriginalArgWasFloat.push_back(RetTy->isFloatingPointTy()); } } @@ -99,10 +96,10 @@ void MipsCCState::PreAnalyzeReturnForF128( void MipsCCState::PreAnalyzeCallOperands( const SmallVectorImpl<ISD::OutputArg> &Outs, std::vector<TargetLowering::ArgListEntry> &FuncArgs, - const SDNode *CallNode) { + const char *Func) { for (unsigned i = 0; i < Outs.size(); ++i) { OriginalArgWasF128.push_back( - originalTypeIsF128(FuncArgs[Outs[i].OrigArgIndex].Ty, CallNode)); + originalTypeIsF128(FuncArgs[Outs[i].OrigArgIndex].Ty, Func)); OriginalArgWasFloat.push_back( FuncArgs[Outs[i].OrigArgIndex].Ty->isFloatingPointTy()); CallOperandIsFixed.push_back(Outs[i].IsFixed); diff --git a/lib/Target/Mips/MipsCCState.h b/lib/Target/Mips/MipsCCState.h index 081c393a09be..77ecc65b2eee 100644 --- a/lib/Target/Mips/MipsCCState.h +++ b/lib/Target/Mips/MipsCCState.h @@ -31,7 +31,7 @@ private: /// Identify lowered values that originated from f128 arguments and record /// this for use by RetCC_MipsN. void PreAnalyzeCallResultForF128(const SmallVectorImpl<ISD::InputArg> &Ins, - const TargetLowering::CallLoweringInfo &CLI); + const Type *RetTy, const char * Func); /// Identify lowered values that originated from f128 arguments and record /// this for use by RetCC_MipsN. @@ -42,7 +42,7 @@ private: void PreAnalyzeCallOperands(const SmallVectorImpl<ISD::OutputArg> &Outs, std::vector<TargetLowering::ArgListEntry> &FuncArgs, - const SDNode *CallNode); + const char *Func); /// Identify lowered values that originated from f128 arguments and record /// this. 
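+/// (Taking a plain symbol name rather than an SDNode lets MipsFastISel,
+/// which has no DAG call node, share this bookkeeping; see the
+/// MipsFastISel.cpp hunk below.)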
@@ -73,8 +73,8 @@ public: AnalyzeCallOperands(const SmallVectorImpl<ISD::OutputArg> &Outs, CCAssignFn Fn, std::vector<TargetLowering::ArgListEntry> &FuncArgs, - const SDNode *CallNode) { - PreAnalyzeCallOperands(Outs, FuncArgs, CallNode); + const char *Func) { + PreAnalyzeCallOperands(Outs, FuncArgs, Func); CCState::AnalyzeCallOperands(Outs, Fn); OriginalArgWasF128.clear(); OriginalArgWasFloat.clear(); @@ -99,9 +99,9 @@ public: } void AnalyzeCallResult(const SmallVectorImpl<ISD::InputArg> &Ins, - CCAssignFn Fn, - const TargetLowering::CallLoweringInfo &CLI) { - PreAnalyzeCallResultForF128(Ins, CLI); + CCAssignFn Fn, const Type *RetTy, + const char *Func) { + PreAnalyzeCallResultForF128(Ins, RetTy, Func); CCState::AnalyzeCallResult(Ins, Fn); OriginalArgWasFloat.clear(); OriginalArgWasF128.clear(); diff --git a/lib/Target/Mips/MipsFastISel.cpp b/lib/Target/Mips/MipsFastISel.cpp index c060cf06099d..a5c7bf7699ea 100644 --- a/lib/Target/Mips/MipsFastISel.cpp +++ b/lib/Target/Mips/MipsFastISel.cpp @@ -1260,8 +1260,10 @@ bool MipsFastISel::finishCall(CallLoweringInfo &CLI, MVT RetVT, emitInst(Mips::ADJCALLSTACKUP).addImm(16).addImm(0); if (RetVT != MVT::isVoid) { SmallVector<CCValAssign, 16> RVLocs; - CCState CCInfo(CC, false, *FuncInfo.MF, RVLocs, *Context); - CCInfo.AnalyzeCallResult(RetVT, RetCC_Mips); + MipsCCState CCInfo(CC, false, *FuncInfo.MF, RVLocs, *Context); + + CCInfo.AnalyzeCallResult(CLI.Ins, RetCC_Mips, CLI.RetTy, + CLI.Symbol->getName().data()); // Only handle a single return value. if (RVLocs.size() != 1) diff --git a/lib/Target/Mips/MipsFrameLowering.cpp b/lib/Target/Mips/MipsFrameLowering.cpp index b2cf03976f81..ef05166503b2 100644 --- a/lib/Target/Mips/MipsFrameLowering.cpp +++ b/lib/Target/Mips/MipsFrameLowering.cpp @@ -119,7 +119,7 @@ uint64_t MipsFrameLowering::estimateStackSize(const MachineFunction &MF) const { // Conservatively assume all callee-saved registers will be saved. for (const MCPhysReg *R = TRI.getCalleeSavedRegs(&MF); *R; ++R) { - unsigned Size = TRI.getMinimalPhysRegClass(*R)->getSize(); + unsigned Size = TRI.getSpillSize(*TRI.getMinimalPhysRegClass(*R)); Offset = alignTo(Offset + Size, Size); } diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp index 93c5f496ce97..8f39ebd42a5c 100644 --- a/lib/Target/Mips/MipsISelLowering.cpp +++ b/lib/Target/Mips/MipsISelLowering.cpp @@ -2750,7 +2750,10 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // caller side but removing it breaks the frame size calculation. CCInfo.AllocateStack(ABI.GetCalleeAllocdArgSizeInBytes(CallConv), 1); - CCInfo.AnalyzeCallOperands(Outs, CC_Mips, CLI.getArgs(), Callee.getNode()); + const ExternalSymbolSDNode *ES = + dyn_cast_or_null<const ExternalSymbolSDNode>(Callee.getNode()); + CCInfo.AnalyzeCallOperands(Outs, CC_Mips, CLI.getArgs(), + ES ? ES->getSymbol() : nullptr); // Get a count of how many bytes are to be pushed on the stack. unsigned NextStackOffset = CCInfo.getNextStackOffset(); @@ -2985,7 +2988,11 @@ SDValue MipsTargetLowering::LowerCallResult( SmallVector<CCValAssign, 16> RVLocs; MipsCCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs, *DAG.getContext()); - CCInfo.AnalyzeCallResult(Ins, RetCC_Mips, CLI); + + const ExternalSymbolSDNode *ES = + dyn_cast_or_null<const ExternalSymbolSDNode>(CLI.Callee.getNode()); + CCInfo.AnalyzeCallResult(Ins, RetCC_Mips, CLI.RetTy, + ES ? ES->getSymbol() : nullptr); // Copy all of the result registers out of their specified physreg. 
for (unsigned i = 0; i != RVLocs.size(); ++i) { diff --git a/lib/Target/Mips/MipsMachineFunction.cpp b/lib/Target/Mips/MipsMachineFunction.cpp index 5bf4c958c7b9..63034ecab93b 100644 --- a/lib/Target/Mips/MipsMachineFunction.cpp +++ b/lib/Target/Mips/MipsMachineFunction.cpp @@ -40,11 +40,7 @@ unsigned MipsFunctionInfo::getGlobalBaseReg() { const TargetRegisterClass *RC = STI.inMips16Mode() ? &Mips::CPU16RegsRegClass - : STI.inMicroMipsMode() - ? STI.hasMips64() - ? &Mips::GPRMM16_64RegClass - : &Mips::GPRMM16RegClass - : static_cast<const MipsTargetMachine &>(MF.getTarget()) + : static_cast<const MipsTargetMachine &>(MF.getTarget()) .getABI() .IsN64() ? &Mips::GPR64RegClass @@ -53,14 +49,15 @@ unsigned MipsFunctionInfo::getGlobalBaseReg() { } void MipsFunctionInfo::createEhDataRegsFI() { + const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); for (int I = 0; I < 4; ++I) { - const TargetRegisterClass *RC = + const TargetRegisterClass &RC = static_cast<const MipsTargetMachine &>(MF.getTarget()).getABI().IsN64() - ? &Mips::GPR64RegClass - : &Mips::GPR32RegClass; + ? Mips::GPR64RegClass + : Mips::GPR32RegClass; - EhDataRegFI[I] = MF.getFrameInfo().CreateStackObject(RC->getSize(), - RC->getAlignment(), false); + EhDataRegFI[I] = MF.getFrameInfo().CreateStackObject(TRI.getSpillSize(RC), + TRI.getSpillAlignment(RC), false); } } @@ -69,11 +66,12 @@ void MipsFunctionInfo::createISRRegFI() { // The current implementation only supports Mips32r2+ not Mips64rX. Status // is always 32 bits, ErrorPC is 32 or 64 bits dependent on architecture, // however Mips32r2+ is the supported architecture. - const TargetRegisterClass *RC = &Mips::GPR32RegClass; + const TargetRegisterClass &RC = Mips::GPR32RegClass; + const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); for (int I = 0; I < 2; ++I) ISRDataRegFI[I] = MF.getFrameInfo().CreateStackObject( - RC->getSize(), RC->getAlignment(), false); + TRI.getSpillSize(RC), TRI.getSpillAlignment(RC), false); } bool MipsFunctionInfo::isEhDataRegFI(int FI) const { @@ -93,9 +91,10 @@ MachinePointerInfo MipsFunctionInfo::callPtrInfo(const GlobalValue *GV) { } int MipsFunctionInfo::getMoveF64ViaSpillFI(const TargetRegisterClass *RC) { + const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); if (MoveF64ViaSpillFI == -1) { MoveF64ViaSpillFI = MF.getFrameInfo().CreateStackObject( - RC->getSize(), RC->getAlignment(), false); + TRI.getSpillSize(*RC), TRI.getSpillAlignment(*RC), false); } return MoveF64ViaSpillFI; } diff --git a/lib/Target/Mips/MipsOptimizePICCall.cpp b/lib/Target/Mips/MipsOptimizePICCall.cpp index f33857fe628f..68dcbdfb4211 100644 --- a/lib/Target/Mips/MipsOptimizePICCall.cpp +++ b/lib/Target/Mips/MipsOptimizePICCall.cpp @@ -116,9 +116,10 @@ static MachineOperand *getCallTargetRegOpnd(MachineInstr &MI) { /// Return type of register Reg. 
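+/// (Assumes the register class carries exactly one legal type; the assert
+/// below verifies that via the new legalclasstypes iterators.)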
static MVT::SimpleValueType getRegTy(unsigned Reg, MachineFunction &MF) { + const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); const TargetRegisterClass *RC = MF.getRegInfo().getRegClass(Reg); - assert(RC->vt_end() - RC->vt_begin() == 1); - return *RC->vt_begin(); + assert(TRI.legalclasstypes_end(*RC) - TRI.legalclasstypes_begin(*RC) == 1); + return *TRI.legalclasstypes_begin(*RC); } /// Do the following transformation: diff --git a/lib/Target/Mips/MipsSEFrameLowering.cpp b/lib/Target/Mips/MipsSEFrameLowering.cpp index ef8d18c6deb1..e765b4625206 100644 --- a/lib/Target/Mips/MipsSEFrameLowering.cpp +++ b/lib/Target/Mips/MipsSEFrameLowering.cpp @@ -260,7 +260,8 @@ bool ExpandPseudo::expandCopyACC(MachineBasicBlock &MBB, Iter I, // copy dst_hi, $vr1 unsigned Dst = I->getOperand(0).getReg(), Src = I->getOperand(1).getReg(); - unsigned VRegSize = RegInfo.getMinimalPhysRegClass(Dst)->getSize() / 2; + const TargetRegisterClass *DstRC = RegInfo.getMinimalPhysRegClass(Dst); + unsigned VRegSize = RegInfo.getRegSizeInBits(*DstRC) / 16; const TargetRegisterClass *RC = RegInfo.intRegClass(VRegSize); unsigned VR0 = MRI.createVirtualRegister(RC); unsigned VR1 = MRI.createVirtualRegister(RC); @@ -858,6 +859,7 @@ void MipsSEFrameLowering::determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs, RegScavenger *RS) const { TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS); + const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>(); MipsABIInfo ABI = STI.getABI(); unsigned FP = ABI.GetFramePtr(); @@ -883,10 +885,11 @@ void MipsSEFrameLowering::determineCalleeSaves(MachineFunction &MF, if (ExpandPseudo(MF).expand()) { // The spill slot should be half the size of the accumulator. If target is // mips64, it should be 64-bit, otherwise it should be 32-bit. - const TargetRegisterClass *RC = STI.hasMips64() ? - &Mips::GPR64RegClass : &Mips::GPR32RegClass; - int FI = MF.getFrameInfo().CreateStackObject(RC->getSize(), - RC->getAlignment(), false); + const TargetRegisterClass &RC = STI.hasMips64() ? + Mips::GPR64RegClass : Mips::GPR32RegClass; + int FI = MF.getFrameInfo().CreateStackObject(TRI->getSpillSize(RC), + TRI->getSpillAlignment(RC), + false); RS->addScavengingFrameIndex(FI); } @@ -897,10 +900,11 @@ void MipsSEFrameLowering::determineCalleeSaves(MachineFunction &MF, if (isInt<16>(MaxSPOffset)) return; - const TargetRegisterClass *RC = - ABI.ArePtrs64bit() ? &Mips::GPR64RegClass : &Mips::GPR32RegClass; - int FI = MF.getFrameInfo().CreateStackObject(RC->getSize(), - RC->getAlignment(), false); + const TargetRegisterClass &RC = + ABI.ArePtrs64bit() ?
Mips::GPR64RegClass : Mips::GPR32RegClass; + int FI = MF.getFrameInfo().CreateStackObject(TRI->getSpillSize(RC), + TRI->getSpillAlignment(RC), + false); RS->addScavengingFrameIndex(FI); } diff --git a/lib/Target/Mips/MipsSEInstrInfo.cpp b/lib/Target/Mips/MipsSEInstrInfo.cpp index 91e712a7a54e..ee074798563d 100644 --- a/lib/Target/Mips/MipsSEInstrInfo.cpp +++ b/lib/Target/Mips/MipsSEInstrInfo.cpp @@ -207,13 +207,16 @@ storeRegToStack(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Opc = Mips::SDC1; else if (Mips::FGR64RegClass.hasSubClassEq(RC)) Opc = Mips::SDC164; - else if (RC->hasType(MVT::v16i8)) + else if (TRI->isTypeLegalForClass(*RC, MVT::v16i8)) Opc = Mips::ST_B; - else if (RC->hasType(MVT::v8i16) || RC->hasType(MVT::v8f16)) + else if (TRI->isTypeLegalForClass(*RC, MVT::v8i16) || + TRI->isTypeLegalForClass(*RC, MVT::v8f16)) Opc = Mips::ST_H; - else if (RC->hasType(MVT::v4i32) || RC->hasType(MVT::v4f32)) + else if (TRI->isTypeLegalForClass(*RC, MVT::v4i32) || + TRI->isTypeLegalForClass(*RC, MVT::v4f32)) Opc = Mips::ST_W; - else if (RC->hasType(MVT::v2i64) || RC->hasType(MVT::v2f64)) + else if (TRI->isTypeLegalForClass(*RC, MVT::v2i64) || + TRI->isTypeLegalForClass(*RC, MVT::v2f64)) Opc = Mips::ST_D; else if (Mips::LO32RegClass.hasSubClassEq(RC)) Opc = Mips::SW; @@ -280,13 +283,16 @@ loadRegFromStack(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Opc = Mips::LDC1; else if (Mips::FGR64RegClass.hasSubClassEq(RC)) Opc = Mips::LDC164; - else if (RC->hasType(MVT::v16i8)) + else if (TRI->isTypeLegalForClass(*RC, MVT::v16i8)) Opc = Mips::LD_B; - else if (RC->hasType(MVT::v8i16) || RC->hasType(MVT::v8f16)) + else if (TRI->isTypeLegalForClass(*RC, MVT::v8i16) || + TRI->isTypeLegalForClass(*RC, MVT::v8f16)) Opc = Mips::LD_H; - else if (RC->hasType(MVT::v4i32) || RC->hasType(MVT::v4f32)) + else if (TRI->isTypeLegalForClass(*RC, MVT::v4i32) || + TRI->isTypeLegalForClass(*RC, MVT::v4f32)) Opc = Mips::LD_W; - else if (RC->hasType(MVT::v2i64) || RC->hasType(MVT::v2f64)) + else if (TRI->isTypeLegalForClass(*RC, MVT::v2i64) || + TRI->isTypeLegalForClass(*RC, MVT::v2f64)) Opc = Mips::LD_D; else if (Mips::HI32RegClass.hasSubClassEq(RC)) Opc = Mips::LW; @@ -567,8 +573,8 @@ MipsSEInstrInfo::compareOpndSize(unsigned Opc, const MCInstrDesc &Desc = get(Opc); assert(Desc.NumOperands == 2 && "Unary instruction expected."); const MipsRegisterInfo *RI = &getRegisterInfo(); - unsigned DstRegSize = getRegClass(Desc, 0, RI, MF)->getSize(); - unsigned SrcRegSize = getRegClass(Desc, 1, RI, MF)->getSize(); + unsigned DstRegSize = RI->getRegSizeInBits(*getRegClass(Desc, 0, RI, MF)); + unsigned SrcRegSize = RI->getRegSizeInBits(*getRegClass(Desc, 1, RI, MF)); return std::make_pair(DstRegSize > SrcRegSize, DstRegSize < SrcRegSize); } diff --git a/lib/Target/Mips/Relocation.txt b/lib/Target/Mips/Relocation.txt new file mode 100644 index 000000000000..f1a6fd8645f6 --- /dev/null +++ b/lib/Target/Mips/Relocation.txt @@ -0,0 +1,125 @@ +MIPS Relocation Principles + +In LLVM, there are several elements of the llvm::ISD::NodeType enum +that deal with addresses and/or relocations. These are defined in +include/llvm/Target/TargetSelectionDAG.td, namely: + GlobalAddress, GlobalTLSAddress, JumpTable, ConstantPool, + ExternalSymbol, BlockAddress +The MIPS backend uses several principles to handle these. + +1. 
Code for lowering address references to machine-dependent code is +factored into common code for generating different address forms and +is called by the relocation-model-specific lowering function, using +templated functions. For example: + + // lib/Target/Mips/MipsISelLowering.cpp + SDValue MipsTargetLowering:: + lowerJumpTable(SDValue Op, SelectionDAG &DAG) const + +calls + + template <class NodeTy> // lib/Target/Mips/MipsISelLowering.h + SDValue getAddrLocal(NodeTy *N, const SDLoc &DL, EVT Ty, + SelectionDAG &DAG, bool IsN32OrN64) const + +which calls the overloaded function: + + // lib/Target/Mips/MipsISelLowering.h + SDValue getTargetNode(JumpTableSDNode *N, EVT Ty, SelectionDAG &DAG, + unsigned Flag) const; + +2. Generic address nodes are lowered to some combination of target +independent and machine specific SDNodes (for example: +MipsISD::{Highest, Higher, Hi, Lo}) depending upon relocation model, +ABI, and compilation options. + +The choice of specific instructions that are to be used is delegated +to ISel which in turn relies on TableGen patterns to choose subtarget +specific instructions. For example, in getAddrLocal, the pseudo-code +generated is: + + (add (load (wrapper $gp, %got(sym))), %lo(sym)) + +where "%lo" represents an instance of an SDNode with opcode +"MipsISD::Lo", "wrapper" indicates one with opcode "MipsISD::Wrapper", +and "%got" the global table pointer "getGlobalReg(...)". The "add" is +"ISD::ADD", not a target-dependent one. + +3. A TableGen multiclass pattern "MipsHiLoRelocs" is used to define a +template pattern parameterized over the load upper immediate +instruction, the add operation, the zero register, and the register class. +Here the instantiation of MipsHiLoRelocs in MipsInstrInfo.td is used +for MIPS32 to compute addresses for the static relocation model. + + // lib/Target/Mips/MipsInstrInfo.td + multiclass MipsHiLoRelocs<Instruction Lui, Instruction Addiu, + Register ZeroReg, RegisterOperand GPROpnd> { + def : MipsPat<(MipsHi tglobaladdr:$in), (Lui tglobaladdr:$in)>; + ... + def : MipsPat<(MipsLo tglobaladdr:$in), (Addiu ZeroReg, tglobaladdr:$in)>; + ... + def : MipsPat<(add GPROpnd:$hi, (MipsLo tglobaladdr:$lo)), + (Addiu GPROpnd:$hi, tglobaladdr:$lo)>; + ... + } + defm : MipsHiLoRelocs<LUi, ADDiu, ZERO, GPR32Opnd>; + + // lib/Target/Mips/Mips64InstrInfo.td + defm : MipsHiLoRelocs<LUi64, DADDiu, ZERO_64, GPR64Opnd>, SYM_32; + +The instantiation in Mips64InstrInfo.td is used for MIPS64 in ILP32 +mode, as guarded by the predicate "SYM_32", and also for a submode of +LP64 where symbols are assumed to be 32 bits wide. A similar +multiclass for MIPS64 in LP64 mode is also defined: + + // lib/Target/Mips/Mips64InstrInfo.td + multiclass MipsHighestHigherHiLoRelocs<Instruction Lui, + Instruction Daddiu> { + ... + def : MipsPat<(MipsHighest (i64 tglobaladdr:$in)), + (Lui tglobaladdr:$in)>; + ... + def : MipsPat<(MipsHigher (i64 tglobaladdr:$in)), + (Daddiu ZERO_64, tglobaladdr:$in)>; + ... + def : MipsPat<(add GPR64:$hi, (MipsHigher (i64 tglobaladdr:$lo))), + (Daddiu GPR64:$hi, tglobaladdr:$lo)>; + ... + def : MipsPat<(add GPR64:$hi, (MipsHi (i64 tglobaladdr:$lo))), + (Daddiu GPR64:$hi, tglobaladdr:$lo)>; + ...
+ def : MipsPat<(add GPR64:$hi, (MipsLo (i64 tglobaladdr:$lo))), + (Daddiu GPR64:$hi, tglobaladdr:$lo)>; + } + +and it is instantiated twice: + + // lib/Target/Mips/Mips64InstrInfo.td + defm : MipsHighestHigherHiLoRelocs<LUi64, DADDiu>, SYM_64; + // lib/Target/Mips/MicroMips64r6InstrInfo.td + defm : MipsHighestHigherHiLoRelocs<LUi64, DADDIU_MM64R6>, SYM_64, + ISA_MICROMIPS64R6; + +These patterns are used during instruction selection to match +MipsISD::{Highest, Higher, Hi, Lo} to a specific machine instruction +and operands. + +More details on how multiclasses in TableGen work can be found in the +section "Multiclass definitions and instances" in the document +"TableGen Language Introduction". + +4. Instruction definitions are multiply defined to cover the different +register classes. In some cases, such as LW/LW64, this also accounts +for the difference in the results of instruction execution. On MIPS32, +"lw" loads a 32-bit value from memory. On MIPS64, "lw" loads a 32-bit +value from memory and sign-extends the value to 64 bits. + + // lib/Target/Mips/MipsInstrInfo.td + def LUi : MMRel, LoadUpper<"lui", GPR32Opnd, uimm16_relaxed>, LUI_FM; + // lib/Target/Mips/Mips64InstrInfo.td + def LUi64 : LoadUpper<"lui", GPR64Opnd, uimm16_64_relaxed>, LUI_FM; + +defines two names "LUi" and "LUi64" with two different register +classes, but with the same encoding---"LUI_FM". These instructions load a +16-bit immediate into bits 31-16 and clear the lower 16 bits. On MIPS64, +the result is sign-extended to 64 bits. diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.cpp b/lib/Target/NVPTX/NVPTXInstrInfo.cpp index 3026f0be242d..0f6c2e53e60a 100644 --- a/lib/Target/NVPTX/NVPTXInstrInfo.cpp +++ b/lib/Target/NVPTX/NVPTXInstrInfo.cpp @@ -38,7 +38,7 @@ void NVPTXInstrInfo::copyPhysReg(MachineBasicBlock &MBB, const TargetRegisterClass *DestRC = MRI.getRegClass(DestReg); const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg); - if (DestRC->getSize() != SrcRC->getSize()) + if (RegInfo.getRegSizeInBits(*DestRC) != RegInfo.getRegSizeInBits(*SrcRC)) report_fatal_error("Copy one register into another with a different width"); unsigned Op; diff --git a/lib/Target/PowerPC/PPCFrameLowering.cpp b/lib/Target/PowerPC/PPCFrameLowering.cpp index 4c9430a2eca0..2a402deccbca 100644 --- a/lib/Target/PowerPC/PPCFrameLowering.cpp +++ b/lib/Target/PowerPC/PPCFrameLowering.cpp @@ -1898,12 +1898,13 @@ PPCFrameLowering::addScavengingSpillSlot(MachineFunction &MF, MachineFrameInfo &MFI = MF.getFrameInfo(); if (MFI.hasVarSizedObjects() || spillsCR(MF) || spillsVRSAVE(MF) || hasNonRISpills(MF) || (hasSpills(MF) && !isInt<16>(StackSize))) { - const TargetRegisterClass *GPRC = &PPC::GPRCRegClass; - const TargetRegisterClass *G8RC = &PPC::G8RCRegClass; - const TargetRegisterClass *RC = Subtarget.isPPC64() ? G8RC : GPRC; - RS->addScavengingFrameIndex(MFI.CreateStackObject(RC->getSize(), - RC->getAlignment(), - false)); + const TargetRegisterClass &GPRC = PPC::GPRCRegClass; + const TargetRegisterClass &G8RC = PPC::G8RCRegClass; + const TargetRegisterClass &RC = Subtarget.isPPC64() ? G8RC : GPRC; + const TargetRegisterInfo &TRI = *Subtarget.getRegisterInfo(); + unsigned Size = TRI.getSpillSize(RC); + unsigned Align = TRI.getSpillAlignment(RC); + RS->addScavengingFrameIndex(MFI.CreateStackObject(Size, Align, false)); // Might we have over-aligned allocas? bool HasAlVars = MFI.hasVarSizedObjects() && @@ -1911,9 +1912,7 @@ PPCFrameLowering::addScavengingSpillSlot(MachineFunction &MF, // These kinds of spills might need two registers.
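+ // (A CR or VRSAVE spill is staged through a GPR, and over-aligned allocas
+ // can occupy another register for frame addressing, hence the second slot.)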
if (spillsCR(MF) || spillsVRSAVE(MF) || HasAlVars) - RS->addScavengingFrameIndex(MFI.CreateStackObject(RC->getSize(), - RC->getAlignment(), - false)); + RS->addScavengingFrameIndex(MFI.CreateStackObject(Size, Align, false)); } } diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp index f7663d8e5185..4659a2ea8032 100644 --- a/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/lib/Target/PowerPC/PPCISelLowering.cpp @@ -9057,6 +9057,7 @@ PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr &MI, MachineBasicBlock *MBB) const { DebugLoc DL = MI.getDebugLoc(); const TargetInstrInfo *TII = Subtarget.getInstrInfo(); + const PPCRegisterInfo *TRI = Subtarget.getRegisterInfo(); MachineFunction *MF = MBB->getParent(); MachineRegisterInfo &MRI = MF->getRegInfo(); @@ -9070,7 +9071,7 @@ PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr &MI, unsigned DstReg = MI.getOperand(0).getReg(); const TargetRegisterClass *RC = MRI.getRegClass(DstReg); - assert(RC->hasType(MVT::i32) && "Invalid destination!"); + assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!"); unsigned mainDstReg = MRI.createVirtualRegister(RC); unsigned restoreDstReg = MRI.createVirtualRegister(RC); @@ -9153,7 +9154,6 @@ PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr &MI, // Setup MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::BCLalways)).addMBB(mainMBB); - const PPCRegisterInfo *TRI = Subtarget.getRegisterInfo(); MIB.addRegMask(TRI->getNoPreservedMask()); BuildMI(*thisMBB, MI, DL, TII->get(PPC::LI), restoreDstReg).addImm(1); diff --git a/lib/Target/PowerPC/PPCInstrInfo.cpp b/lib/Target/PowerPC/PPCInstrInfo.cpp index 8e159f47ea2e..790a8902b3d2 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -440,8 +440,8 @@ void PPCInstrInfo::insertNoop(MachineBasicBlock &MBB, BuildMI(MBB, MI, DL, get(Opcode)); } -/// getNoopForMachoTarget - Return the noop instruction to use for a noop. -void PPCInstrInfo::getNoopForMachoTarget(MCInst &NopInst) const { +/// Return the noop instruction to use for a noop. 
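+/// (PPC::NOP is the canonical "ori 0, 0, 0" encoding.)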
+void PPCInstrInfo::getNoop(MCInst &NopInst) const { NopInst.setOpcode(PPC::NOP); } diff --git a/lib/Target/PowerPC/PPCInstrInfo.h b/lib/Target/PowerPC/PPCInstrInfo.h index f11aed8fa268..b30d09e03ec4 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.h +++ b/lib/Target/PowerPC/PPCInstrInfo.h @@ -269,7 +269,7 @@ public: /// unsigned getInstSizeInBytes(const MachineInstr &MI) const override; - void getNoopForMachoTarget(MCInst &NopInst) const override; + void getNoop(MCInst &NopInst) const override; std::pair<unsigned, unsigned> decomposeMachineOperandsTargetFlags(unsigned TF) const override; diff --git a/lib/Target/Sparc/SparcISelLowering.cpp b/lib/Target/Sparc/SparcISelLowering.cpp index 455d1ee1564a..f120a98e9457 100644 --- a/lib/Target/Sparc/SparcISelLowering.cpp +++ b/lib/Target/Sparc/SparcISelLowering.cpp @@ -3234,6 +3234,7 @@ SparcTargetLowering::emitEHSjLjSetJmp(MachineInstr &MI, MachineBasicBlock *MBB) const { DebugLoc DL = MI.getDebugLoc(); const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo(); MachineFunction *MF = MBB->getParent(); MachineRegisterInfo &MRI = MF->getRegInfo(); @@ -3245,7 +3246,8 @@ SparcTargetLowering::emitEHSjLjSetJmp(MachineInstr &MI, unsigned DstReg = MI.getOperand(0).getReg(); const TargetRegisterClass *RC = MRI.getRegClass(DstReg); - assert(RC->hasType(MVT::i32) && "Invalid destination!"); + assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!"); + (void)TRI; unsigned mainDstReg = MRI.createVirtualRegister(RC); unsigned restoreDstReg = MRI.createVirtualRegister(RC); diff --git a/lib/Target/SystemZ/SystemZInstrInfo.cpp b/lib/Target/SystemZ/SystemZInstrInfo.cpp index c8ff9558cc88..fee008b9572a 100644 --- a/lib/Target/SystemZ/SystemZInstrInfo.cpp +++ b/lib/Target/SystemZ/SystemZInstrInfo.cpp @@ -104,8 +104,9 @@ void SystemZInstrInfo::splitMove(MachineBasicBlock::iterator MI, MachineOperand &LowOffsetOp = MI->getOperand(2); LowOffsetOp.setImm(LowOffsetOp.getImm() + 8); - // Clear the kill flags for the base and index registers in the first - // instruction. + // Clear the kill flags on the registers in the first instruction. 
+ if (EarlierMI->getOperand(0).isReg() && EarlierMI->getOperand(0).isUse()) + EarlierMI->getOperand(0).setIsKill(false); EarlierMI->getOperand(1).setIsKill(false); EarlierMI->getOperand(3).setIsKill(false); @@ -1114,10 +1115,9 @@ MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl( return nullptr; unsigned OpNum = Ops[0]; - assert(Size == - MF.getRegInfo() - .getRegClass(MI.getOperand(OpNum).getReg()) - ->getSize() && + assert(Size * 8 == + TRI->getRegSizeInBits(*MF.getRegInfo() + .getRegClass(MI.getOperand(OpNum).getReg())) && "Invalid size combination"); if ((Opcode == SystemZ::AHI || Opcode == SystemZ::AGHI) && OpNum == 0 && diff --git a/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp b/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp index d9c2dba5bace..4178ec0b28f0 100644 --- a/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp +++ b/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp @@ -45,10 +45,11 @@ using namespace llvm; //===----------------------------------------------------------------------===// MVT WebAssemblyAsmPrinter::getRegType(unsigned RegNo) const { + const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo(); const TargetRegisterClass *TRC = MRI->getRegClass(RegNo); for (MVT T : {MVT::i32, MVT::i64, MVT::f32, MVT::f64, MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v4f32}) - if (TRC->hasType(T)) + if (TRI->isTypeLegalForClass(*TRC, T)) return T; DEBUG(errs() << "Unknown type for register number: " << RegNo); llvm_unreachable("Unknown register type"); diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp index 324da650e74e..c1cfc82b4a81 100644 --- a/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -3094,6 +3094,7 @@ bool X86AsmParser::ParseDirective(AsmToken DirectiveID) { else if (IDVal.startswith(".code")) return ParseDirectiveCode(IDVal, DirectiveID.getLoc()); else if (IDVal.startswith(".att_syntax")) { + getParser().setParsingInlineAsm(false); if (getLexer().isNot(AsmToken::EndOfStatement)) { if (Parser.getTok().getString() == "prefix") Parser.Lex(); @@ -3106,6 +3107,7 @@ bool X86AsmParser::ParseDirective(AsmToken DirectiveID) { return false; } else if (IDVal.startswith(".intel_syntax")) { getParser().setAssemblerDialect(1); + getParser().setParsingInlineAsm(true); if (getLexer().isNot(AsmToken::EndOfStatement)) { if (Parser.getTok().getString() == "noprefix") Parser.Lex(); diff --git a/lib/Target/X86/X86.h b/lib/Target/X86/X86.h index fdcc7e1ab7b0..19c93cfff0fe 100644 --- a/lib/Target/X86/X86.h +++ b/lib/Target/X86/X86.h @@ -95,7 +95,8 @@ void initializeFixupBWInstPassPass(PassRegistry &); /// encoding when possible in order to reduce code size. FunctionPass *createX86EvexToVexInsts(); -InstructionSelector *createX86InstructionSelector(X86Subtarget &, +InstructionSelector *createX86InstructionSelector(const X86TargetMachine &TM, + X86Subtarget &, X86RegisterBankInfo &); void initializeEvexToVexInstPassPass(PassRegistry &); diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td index 8fcc8e31d5d4..d2f650cf8f47 100644 --- a/lib/Target/X86/X86.td +++ b/lib/Target/X86/X86.td @@ -273,6 +273,16 @@ def FeatureFastSHLDRotate "fast-shld-rotate", "HasFastSHLDRotate", "true", "SHLD can be used as a faster rotate">; +// Ivy Bridge and newer processors have enhanced REP MOVSB and STOSB (aka +// "string operations"). See "REP String Enhancement" in the Intel Software +// Development Manual. 
This feature essentially means that REP MOVSB will copy +// using the largest available size instead of copying bytes one by one, making +// it at least as fast as REPMOVS{W,D,Q}. +def FeatureERMSB + : SubtargetFeature< + "ermsb", "HasERMSB", "true", + "REP MOVS/STOS are fast">; + //===----------------------------------------------------------------------===// // X86 processors supported. //===----------------------------------------------------------------------===// @@ -498,6 +508,7 @@ def HSWFeatures : ProcessorFeatures<IVBFeatures.Value, [ FeatureAVX2, FeatureBMI, FeatureBMI2, + FeatureERMSB, FeatureFMA, FeatureLZCNT, FeatureMOVBE, diff --git a/lib/Target/X86/X86CallLowering.cpp b/lib/Target/X86/X86CallLowering.cpp index 137ef166aaeb..161bfa7b5474 100644 --- a/lib/Target/X86/X86CallLowering.cpp +++ b/lib/Target/X86/X86CallLowering.cpp @@ -53,7 +53,6 @@ void X86CallLowering::splitToValueTypes(const ArgInfo &OrigArg, return; } - SmallVector<uint64_t, 4> BitOffsets; SmallVector<unsigned, 8> SplitRegs; EVT PartVT = TLI.getRegisterType(Context, VT); @@ -64,8 +63,10 @@ void X86CallLowering::splitToValueTypes(const ArgInfo &OrigArg, ArgInfo{MRI.createGenericVirtualRegister(getLLTForType(*PartTy, DL)), PartTy, OrigArg.Flags}; SplitArgs.push_back(Info); - PerformArgSplit(Info.Reg, PartVT.getSizeInBits() * i); + SplitRegs.push_back(Info.Reg); } + + PerformArgSplit(SplitRegs); } namespace { @@ -112,10 +113,9 @@ bool X86CallLowering::lowerReturn(MachineIRBuilder &MIRBuilder, setArgFlags(OrigArg, AttributeList::ReturnIndex, DL, F); SmallVector<ArgInfo, 8> SplitArgs; - splitToValueTypes(OrigArg, SplitArgs, DL, MRI, - [&](unsigned Reg, uint64_t Offset) { - MIRBuilder.buildExtract(Reg, VReg, Offset); - }); + splitToValueTypes( + OrigArg, SplitArgs, DL, MRI, + [&](ArrayRef<unsigned> Regs) { MIRBuilder.buildUnmerge(Regs, VReg); }); FuncReturnHandler Handler(MIRBuilder, MRI, MIB, RetCC_X86); if (!handleAssignments(MIRBuilder, SplitArgs, Handler)) @@ -183,22 +183,10 @@ bool X86CallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, for (auto &Arg : F.args()) { ArgInfo OrigArg(VRegs[Idx], Arg.getType()); setArgFlags(OrigArg, Idx + 1, DL, F); - LLT Ty = MRI.getType(VRegs[Idx]); - unsigned Dst = VRegs[Idx]; - bool Split = false; splitToValueTypes(OrigArg, SplitArgs, DL, MRI, - [&](unsigned Reg, uint64_t Offset) { - if (!Split) { - Split = true; - Dst = MRI.createGenericVirtualRegister(Ty); - MIRBuilder.buildUndef(Dst); - } - unsigned Tmp = MRI.createGenericVirtualRegister(Ty); - MIRBuilder.buildInsert(Tmp, Dst, Reg, Offset); - Dst = Tmp; + [&](ArrayRef<unsigned> Regs) { + MIRBuilder.buildMerge(VRegs[Idx], Regs); }); - if (Dst != VRegs[Idx]) - MIRBuilder.buildCopy(VRegs[Idx], Dst); Idx++; } diff --git a/lib/Target/X86/X86CallLowering.h b/lib/Target/X86/X86CallLowering.h index 204e6974c702..8a8afb568298 100644 --- a/lib/Target/X86/X86CallLowering.h +++ b/lib/Target/X86/X86CallLowering.h @@ -34,14 +34,15 @@ public: bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F, ArrayRef<unsigned> VRegs) const override; + private: /// A function of this type is used to perform value split action. 
- typedef std::function<void(unsigned, uint64_t)> SplitArgTy; + typedef std::function<void(ArrayRef<unsigned>)> SplitArgTy; void splitToValueTypes(const ArgInfo &OrigArgInfo, SmallVectorImpl<ArgInfo> &SplitArgs, const DataLayout &DL, MachineRegisterInfo &MRI, SplitArgTy SplitArg) const; }; -} // End of namespace llvm; +} // namespace llvm #endif diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp index 036f5d2610e4..b8477810b4c9 100644 --- a/lib/Target/X86/X86FastISel.cpp +++ b/lib/Target/X86/X86FastISel.cpp @@ -2149,7 +2149,8 @@ bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) { if (!LHSReg || !RHSReg) return false; - unsigned Opc = X86::getCMovFromCond(CC, RC->getSize()); + const TargetRegisterInfo &TRI = *Subtarget->getRegisterInfo(); + unsigned Opc = X86::getCMovFromCond(CC, TRI.getRegSizeInBits(*RC)/8); unsigned ResultReg = fastEmitInst_rr(Opc, RC, RHSReg, RHSIsKill, LHSReg, LHSIsKill); updateValueMap(I, ResultReg); diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp index 8678a13b95d0..a94045cd536d 100644 --- a/lib/Target/X86/X86FrameLowering.cpp +++ b/lib/Target/X86/X86FrameLowering.cpp @@ -1783,6 +1783,14 @@ int X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, return Offset + FPDelta; } +int X86FrameLowering::getFrameIndexReferenceSP(const MachineFunction &MF, + int FI, unsigned &FrameReg, + int Adjustment) const { + const MachineFrameInfo &MFI = MF.getFrameInfo(); + FrameReg = TRI->getStackRegister(); + return MFI.getObjectOffset(FI) - getOffsetOfLocalArea() + Adjustment; +} + int X86FrameLowering::getFrameIndexReferencePreferSP(const MachineFunction &MF, int FI, unsigned &FrameReg, @@ -1839,9 +1847,6 @@ X86FrameLowering::getFrameIndexReferencePreferSP(const MachineFunction &MF, assert(MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta() >= 0 && "we don't handle this case!"); - // Fill in FrameReg output argument. - FrameReg = TRI->getStackRegister(); - // This is how the math works out: // // %rsp grows (i.e. gets lower) left to right. 
Each box below is @@ -1866,12 +1871,8 @@ X86FrameLowering::getFrameIndexReferencePreferSP(const MachineFunction &MF, // (C - E) == (C - A) - (B - A) + (B - E) // { Using [1], [2] and [3] above } // == getObjectOffset - LocalAreaOffset + StackSize - // - - // Get the Offset from the StackPointer - int Offset = MFI.getObjectOffset(FI) - getOffsetOfLocalArea(); - return Offset + StackSize; + return getFrameIndexReferenceSP(MF, FI, FrameReg, StackSize); } bool X86FrameLowering::assignCalleeSavedSpillSlots( @@ -1923,14 +1924,15 @@ bool X86FrameLowering::assignCalleeSavedSpillSlots( continue; const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); + unsigned Size = TRI->getSpillSize(*RC); + unsigned Align = TRI->getSpillAlignment(*RC); // ensure alignment - SpillSlotOffset -= std::abs(SpillSlotOffset) % RC->getAlignment(); + SpillSlotOffset -= std::abs(SpillSlotOffset) % Align; // spill into slot - SpillSlotOffset -= RC->getSize(); - int SlotIndex = - MFI.CreateFixedSpillStackObject(RC->getSize(), SpillSlotOffset); + SpillSlotOffset -= Size; + int SlotIndex = MFI.CreateFixedSpillStackObject(Size, SpillSlotOffset); CSI[i - 1].setFrameIdx(SlotIndex); - MFI.ensureMaxAlignment(RC->getAlignment()); + MFI.ensureMaxAlignment(Align); } return true; diff --git a/lib/Target/X86/X86FrameLowering.h b/lib/Target/X86/X86FrameLowering.h index 863dc8b22968..7d214cabad53 100644 --- a/lib/Target/X86/X86FrameLowering.h +++ b/lib/Target/X86/X86FrameLowering.h @@ -100,6 +100,8 @@ public: int getFrameIndexReference(const MachineFunction &MF, int FI, unsigned &FrameReg) const override; + int getFrameIndexReferenceSP(const MachineFunction &MF, + int FI, unsigned &SPReg, int Adjustment) const; int getFrameIndexReferencePreferSP(const MachineFunction &MF, int FI, unsigned &FrameReg, bool IgnoreSPUpdates) const override; diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index b5f29fb400ef..ada46643a5fe 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -5060,8 +5060,8 @@ static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, // If the input is a buildvector just emit a smaller one. if (Vec.getOpcode() == ISD::BUILD_VECTOR) - return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT, - makeArrayRef(Vec->op_begin() + IdxVal, ElemsPerChunk)); + return DAG.getBuildVector( + ResultVT, dl, makeArrayRef(Vec->op_begin() + IdxVal, ElemsPerChunk)); SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx); @@ -14424,8 +14424,8 @@ static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget, // If the input is a buildvector just emit a smaller one. unsigned ElemsPerChunk = ResVT.getVectorNumElements(); if (In.getOpcode() == ISD::BUILD_VECTOR) - return DAG.getNode(ISD::BUILD_VECTOR, dl, ResVT, - makeArrayRef(In->op_begin() + IdxVal, ElemsPerChunk)); + return DAG.getBuildVector( + ResVT, dl, makeArrayRef(In->op_begin() + IdxVal, ElemsPerChunk)); // Everything else is legal. 
return Op; @@ -25944,6 +25944,7 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI, DebugLoc DL = MI.getDebugLoc(); MachineFunction *MF = MBB->getParent(); const TargetInstrInfo *TII = Subtarget.getInstrInfo(); + const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo(); MachineRegisterInfo &MRI = MF->getRegInfo(); const BasicBlock *BB = MBB->getBasicBlock(); @@ -25960,7 +25961,8 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI, DstReg = MI.getOperand(CurOp++).getReg(); const TargetRegisterClass *RC = MRI.getRegClass(DstReg); - assert(RC->hasType(MVT::i32) && "Invalid destination!"); + assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!"); + (void)TRI; unsigned mainDstReg = MRI.createVirtualRegister(RC); unsigned restoreDstReg = MRI.createVirtualRegister(RC); @@ -30207,7 +30209,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, APInt KnownZero, KnownOne; TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(), DCI.isBeforeLegalizeOps()); - if (TLO.ShrinkDemandedConstant(Cond, DemandedMask) || + if (TLI.ShrinkDemandedConstant(Cond, DemandedMask, TLO) || TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne, TLO)) { // If we changed the computation somewhere in the DAG, this change will @@ -33777,7 +33779,7 @@ static SDValue combineBT(SDNode *N, SelectionDAG &DAG, TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), !DCI.isBeforeLegalizeOps()); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) || + if (TLI.ShrinkDemandedConstant(Op1, DemandedMask, TLO) || TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO)) DCI.CommitTargetLoweringOpt(TLO); } @@ -35937,7 +35939,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to // turn into {ax},{dx}. // MVT::Other is used to specify clobber names. - if (Res.second->hasType(VT) || VT == MVT::Other) + if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other) return Res; // Correct type already, nothing to do. // Get a matching integer of the correct size. i.e. 
"ax" with MVT::32 should @@ -35975,11 +35977,11 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, Res.second = &X86::FR32RegClass; else if (VT == MVT::f64 || VT == MVT::i64) Res.second = &X86::FR64RegClass; - else if (X86::VR128RegClass.hasType(VT)) + else if (TRI->isTypeLegalForClass(X86::VR128RegClass, VT)) Res.second = &X86::VR128RegClass; - else if (X86::VR256RegClass.hasType(VT)) + else if (TRI->isTypeLegalForClass(X86::VR256RegClass, VT)) Res.second = &X86::VR256RegClass; - else if (X86::VR512RegClass.hasType(VT)) + else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT)) Res.second = &X86::VR512RegClass; else { // Type mismatch and not a clobber: Return an error; diff --git a/lib/Target/X86/X86InstrArithmetic.td b/lib/Target/X86/X86InstrArithmetic.td index bfd21c062aa2..66382014f6e8 100644 --- a/lib/Target/X86/X86InstrArithmetic.td +++ b/lib/Target/X86/X86InstrArithmetic.td @@ -989,10 +989,12 @@ multiclass ArithBinOp_RF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, } } // Constraints = "$src1 = $dst" - def NAME#8mr : BinOpMR_RMW<BaseOpc, mnemonic, Xi8 , opnode>; - def NAME#16mr : BinOpMR_RMW<BaseOpc, mnemonic, Xi16, opnode>; - def NAME#32mr : BinOpMR_RMW<BaseOpc, mnemonic, Xi32, opnode>; - def NAME#64mr : BinOpMR_RMW<BaseOpc, mnemonic, Xi64, opnode>; + let mayLoad = 1, mayStore = 1 in { + def NAME#8mr : BinOpMR_RMW<BaseOpc, mnemonic, Xi8 , opnode>; + def NAME#16mr : BinOpMR_RMW<BaseOpc, mnemonic, Xi16, opnode>; + def NAME#32mr : BinOpMR_RMW<BaseOpc, mnemonic, Xi32, opnode>; + def NAME#64mr : BinOpMR_RMW<BaseOpc, mnemonic, Xi64, opnode>; + } // NOTE: These are order specific, we want the mi8 forms to be listed // first so that they are slightly preferred to the mi forms. diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index 7b456fd68343..26444dd1f619 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -6284,9 +6284,11 @@ void X86InstrInfo::insertSelect(MachineBasicBlock &MBB, ArrayRef<MachineOperand> Cond, unsigned TrueReg, unsigned FalseReg) const { MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo(); + const TargetRegisterClass &RC = *MRI.getRegClass(DstReg); assert(Cond.size() == 1 && "Invalid Cond array"); unsigned Opc = getCMovFromCond((X86::CondCode)Cond[0].getImm(), - MRI.getRegClass(DstReg)->getSize(), + TRI.getRegSizeInBits(RC) / 8, false /*HasMemoryOperand*/); BuildMI(MBB, I, DL, get(Opc), DstReg).addReg(FalseReg).addReg(TrueReg); } @@ -6557,7 +6559,7 @@ static unsigned getLoadStoreRegOpcode(unsigned Reg, bool HasAVX512 = STI.hasAVX512(); bool HasVLX = STI.hasVLX(); - switch (RC->getSize()) { + switch (STI.getRegisterInfo()->getSpillSize(*RC)) { default: llvm_unreachable("Unknown spill size"); case 1: @@ -6603,28 +6605,36 @@ static unsigned getLoadStoreRegOpcode(unsigned Reg, assert(X86::RFP80RegClass.hasSubClassEq(RC) && "Unknown 10-byte regclass"); return load ? X86::LD_Fp80m : X86::ST_FpP80m; case 16: { - assert(X86::VR128XRegClass.hasSubClassEq(RC) && "Unknown 16-byte regclass"); - // If stack is realigned we can use aligned stores. - if (isStackAligned) - return load ? - (HasVLX ? X86::VMOVAPSZ128rm : - HasAVX512 ? X86::VMOVAPSZ128rm_NOVLX : - HasAVX ? X86::VMOVAPSrm : - X86::MOVAPSrm): - (HasVLX ? X86::VMOVAPSZ128mr : - HasAVX512 ? X86::VMOVAPSZ128mr_NOVLX : - HasAVX ? X86::VMOVAPSmr : - X86::MOVAPSmr); - else - return load ? - (HasVLX ? X86::VMOVUPSZ128rm : - HasAVX512 ? 
X86::VMOVUPSZ128rm_NOVLX : - HasAVX ? X86::VMOVUPSrm : - X86::MOVUPSrm): - (HasVLX ? X86::VMOVUPSZ128mr : - HasAVX512 ? X86::VMOVUPSZ128mr_NOVLX : - HasAVX ? X86::VMOVUPSmr : - X86::MOVUPSmr); + if (X86::VR128XRegClass.hasSubClassEq(RC)) { + // If stack is realigned we can use aligned stores. + if (isStackAligned) + return load ? + (HasVLX ? X86::VMOVAPSZ128rm : + HasAVX512 ? X86::VMOVAPSZ128rm_NOVLX : + HasAVX ? X86::VMOVAPSrm : + X86::MOVAPSrm): + (HasVLX ? X86::VMOVAPSZ128mr : + HasAVX512 ? X86::VMOVAPSZ128mr_NOVLX : + HasAVX ? X86::VMOVAPSmr : + X86::MOVAPSmr); + else + return load ? + (HasVLX ? X86::VMOVUPSZ128rm : + HasAVX512 ? X86::VMOVUPSZ128rm_NOVLX : + HasAVX ? X86::VMOVUPSrm : + X86::MOVUPSrm): + (HasVLX ? X86::VMOVUPSZ128mr : + HasAVX512 ? X86::VMOVUPSZ128mr_NOVLX : + HasAVX ? X86::VMOVUPSmr : + X86::MOVUPSmr); + } + if (X86::BNDRRegClass.hasSubClassEq(RC)) { + if (STI.is64Bit()) + return load ? X86::BNDMOVRM64rm : X86::BNDMOVMR64mr; + else + return load ? X86::BNDMOVRM32rm : X86::BNDMOVMR32mr; + } + llvm_unreachable("Unknown 16-byte regclass"); } case 32: assert(X86::VR256XRegClass.hasSubClassEq(RC) && "Unknown 32-byte regclass"); @@ -6709,9 +6719,9 @@ void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const { const MachineFunction &MF = *MBB.getParent(); - assert(MF.getFrameInfo().getObjectSize(FrameIdx) >= RC->getSize() && + assert(MF.getFrameInfo().getObjectSize(FrameIdx) >= TRI->getSpillSize(*RC) && "Stack slot too small for store"); - unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16); + unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16); bool isAligned = (Subtarget.getFrameLowering()->getStackAlignment() >= Alignment) || RI.canRealignStack(MF); @@ -6728,7 +6738,8 @@ void X86InstrInfo::storeRegToAddr(MachineFunction &MF, unsigned SrcReg, MachineInstr::mmo_iterator MMOBegin, MachineInstr::mmo_iterator MMOEnd, SmallVectorImpl<MachineInstr*> &NewMIs) const { - unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16); + const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); + unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*RC), 16); bool isAligned = MMOBegin != MMOEnd && (*MMOBegin)->getAlignment() >= Alignment; unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, Subtarget); @@ -6748,7 +6759,7 @@ void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const { const MachineFunction &MF = *MBB.getParent(); - unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16); + unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16); bool isAligned = (Subtarget.getFrameLowering()->getStackAlignment() >= Alignment) || RI.canRealignStack(MF); @@ -6763,7 +6774,8 @@ void X86InstrInfo::loadRegFromAddr(MachineFunction &MF, unsigned DestReg, MachineInstr::mmo_iterator MMOBegin, MachineInstr::mmo_iterator MMOEnd, SmallVectorImpl<MachineInstr*> &NewMIs) const { - unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16); + const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); + unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*RC), 16); bool isAligned = MMOBegin != MMOEnd && (*MMOBegin)->getAlignment() >= Alignment; unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, Subtarget); @@ -7222,7 +7234,8 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg, NewOpc = getSETFromCond(NewCC, HasMemoryOperand); else { unsigned DstReg = 
Instr.getOperand(0).getReg(); - NewOpc = getCMovFromCond(NewCC, MRI->getRegClass(DstReg)->getSize(), + const TargetRegisterClass *DstRC = MRI->getRegClass(DstReg); + NewOpc = getCMovFromCond(NewCC, TRI->getRegSizeInBits(*DstRC)/8, HasMemoryOperand); } @@ -7750,7 +7763,9 @@ MachineInstr *X86InstrInfo::foldMemoryOperandCustom( unsigned DstIdx = (Imm >> 4) & 3; unsigned SrcIdx = (Imm >> 6) & 3; - unsigned RCSize = getRegClass(MI.getDesc(), OpNum, &RI, MF)->getSize(); + const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); + const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI, MF); + unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8; if (Size <= RCSize && 4 <= Align) { int PtrOffset = SrcIdx * 4; unsigned NewImm = (DstIdx << 4) | ZMask; @@ -7772,7 +7787,9 @@ MachineInstr *X86InstrInfo::foldMemoryOperandCustom( // To fold the load, adjust the pointer to the upper and use (V)MOVLPS. // TODO: In most cases AVX doesn't have a 8-byte alignment requirement. if (OpNum == 2) { - unsigned RCSize = getRegClass(MI.getDesc(), OpNum, &RI, MF)->getSize(); + const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); + const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI, MF); + unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8; if (Size <= RCSize && 8 <= Align) { unsigned NewOpCode = (MI.getOpcode() == X86::VMOVHLPSZrr) ? X86::VMOVLPSZ128rm : @@ -7861,7 +7878,10 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( return nullptr; bool NarrowToMOV32rm = false; if (Size) { - unsigned RCSize = getRegClass(MI.getDesc(), OpNum, &RI, MF)->getSize(); + const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); + const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, + &RI, MF); + unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8; if (Size < RCSize) { // Check if it's safe to fold the load. If the size of the object is // narrower than the load width, then it's not. @@ -8302,11 +8322,13 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI, const MachineFunction &MF) { unsigned Opc = LoadMI.getOpcode(); unsigned UserOpc = UserMI.getOpcode(); - unsigned RegSize = - MF.getRegInfo().getRegClass(LoadMI.getOperand(0).getReg())->getSize(); + const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); + const TargetRegisterClass *RC = + MF.getRegInfo().getRegClass(LoadMI.getOperand(0).getReg()); + unsigned RegSize = TRI.getRegSizeInBits(*RC); if ((Opc == X86::MOVSSrm || Opc == X86::VMOVSSrm || Opc == X86::VMOVSSZrm) && - RegSize > 4) { + RegSize > 32) { // These instructions only load 32 bits, we can't fold them if the // destination register is wider than 32 bits (4 bytes), and its user // instruction isn't scalar (SS). @@ -8357,7 +8379,7 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI, } if ((Opc == X86::MOVSDrm || Opc == X86::VMOVSDrm || Opc == X86::VMOVSDZrm) && - RegSize > 8) { + RegSize > 64) { // These instructions only load 64 bits, we can't fold them if the // destination register is wider than 64 bits (8 bytes), and its user // instruction isn't scalar (SD). 
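The two thresholds in the hunks above change from 4 and 8 to 32 and 64 because isNonFoldablePartialRegisterLoad now measures register width in bits, via TRI.getRegSizeInBits(*RC), where the old RC->getSize() answered in bytes. A minimal standalone sketch of that unit change follows; the constant names are illustrative stand-ins, not part of the patch:

    #include <cassert>

    // Stand-ins: OldXmmSizeInBytes models RC->getSize(), and the derived
    // value in bits models TRI.getRegSizeInBits(*RC).
    constexpr unsigned OldXmmSizeInBytes = 16; // an XMM register
    constexpr unsigned NewXmmSizeInBits = OldXmmSizeInBytes * 8;

    int main() {
      // Old predicate: destination wider than a 32-bit MOVSS load, in bytes.
      assert(OldXmmSizeInBytes > 4);
      // New predicate: the same comparison, expressed in bits.
      assert(NewXmmSizeInBits > 32);
      return 0;
    }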
@@ -8702,6 +8724,7 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, bool FoldedStore = I->second.second & TB_FOLDED_STORE; const MCInstrDesc &MCID = get(Opc); MachineFunction &MF = DAG.getMachineFunction(); + const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI, MF); unsigned NumDefs = MCID.NumDefs; std::vector<SDValue> AddrOps; @@ -8724,7 +8747,7 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, // Emit the load instruction. SDNode *Load = nullptr; if (FoldedLoad) { - EVT VT = *RC->vt_begin(); + EVT VT = *TRI.legalclasstypes_begin(*RC); std::pair<MachineInstr::mmo_iterator, MachineInstr::mmo_iterator> MMOs = MF.extractLoadMemRefs(cast<MachineSDNode>(N)->memoperands_begin(), @@ -8736,7 +8759,7 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, return false; // FIXME: If a VR128 can have size 32, we should be checking if a 32-byte // memory access is slow above. - unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16); + unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*RC), 16); bool isAligned = (*MMOs.first) && (*MMOs.first)->getAlignment() >= Alignment; Load = DAG.getMachineNode(getLoadRegOpcode(0, RC, isAligned, Subtarget), dl, @@ -8752,7 +8775,7 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, const TargetRegisterClass *DstRC = nullptr; if (MCID.getNumDefs() > 0) { DstRC = getRegClass(MCID, 0, &RI, MF); - VTs.push_back(*DstRC->vt_begin()); + VTs.push_back(*TRI.legalclasstypes_begin(*DstRC)); } for (unsigned i = 0, e = N->getNumValues(); i != e; ++i) { EVT VT = N->getValueType(i); @@ -8781,7 +8804,7 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, return false; // FIXME: If a VR128 can have size 32, we should be checking if a 32-byte // memory access is slow above. - unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16); + unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*RC), 16); bool isAligned = (*MMOs.first) && (*MMOs.first)->getAlignment() >= Alignment; SDNode *Store = @@ -9514,7 +9537,7 @@ void X86InstrInfo::setExecutionDomain(MachineInstr &MI, unsigned Domain) const { } /// Return the noop instruction to use for a noop. 
-void X86InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const { +void X86InstrInfo::getNoop(MCInst &NopInst) const { NopInst.setOpcode(X86::NOOP); } diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h index 2fee48570ce1..38567831b3a4 100644 --- a/lib/Target/X86/X86InstrInfo.h +++ b/lib/Target/X86/X86InstrInfo.h @@ -457,7 +457,7 @@ public: int64_t Offset1, int64_t Offset2, unsigned NumLoads) const override; - void getNoopForMachoTarget(MCInst &NopInst) const override; + void getNoop(MCInst &NopInst) const override; bool reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override; diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td index e31d2769047b..c3def461afdc 100644 --- a/lib/Target/X86/X86InstrInfo.td +++ b/lib/Target/X86/X86InstrInfo.td @@ -897,6 +897,7 @@ def NotSlowIncDec : Predicate<"!Subtarget->slowIncDec()">; def HasFastMem32 : Predicate<"!Subtarget->isUnalignedMem32Slow()">; def HasFastLZCNT : Predicate<"Subtarget->hasFastLZCNT()">; def HasFastSHLDRotate : Predicate<"Subtarget->hasFastSHLDRotate()">; +def HasERMSB : Predicate<"Subtarget->hasERMSB()">; def HasMFence : Predicate<"Subtarget->hasMFence()">; //===----------------------------------------------------------------------===// diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index e1bf28cbf612..f22a50200c9a 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -4602,17 +4602,17 @@ def MOVDI2PDIrm : S2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src), (v4i32 (scalar_to_vector (loadi32 addr:$src))))], IIC_SSE_MOVDQ>, Sched<[WriteLoad]>; def MOV64toPQIrr : RS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src), - "mov{d|q}\t{$src, $dst|$dst, $src}", + "movq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (v2i64 (scalar_to_vector GR64:$src)))], IIC_SSE_MOVDQ>, Sched<[WriteMove]>; let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in def MOV64toPQIrm : RS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), - "mov{d|q}\t{$src, $dst|$dst, $src}", + "movq\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVDQ>, Sched<[WriteLoad]>; let isCodeGenOnly = 1 in def MOV64toSDrr : RS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src), - "mov{d|q}\t{$src, $dst|$dst, $src}", + "movq\t{$src, $dst|$dst, $src}", [(set FR64:$dst, (bitconvert GR64:$src))], IIC_SSE_MOVDQ>, Sched<[WriteMove]>; } // ExeDomain = SSEPackedInt @@ -4681,7 +4681,7 @@ def VMOVPQIto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src), VEX; def MOVPQIto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src), - "mov{d|q}\t{$src, $dst|$dst, $src}", + "movq\t{$src, $dst|$dst, $src}", [(set GR64:$dst, (extractelt (v2i64 VR128:$src), (iPTR 0)))], IIC_SSE_MOVD_ToGP>; @@ -4694,7 +4694,7 @@ def VMOVPQIto64mr : VRS2I<0x7E, MRMDestMem, (outs), [], IIC_SSE_MOVDQ>, VEX, Sched<[WriteStore]>; let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in def MOVPQIto64mr : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), - "mov{d|q}\t{$src, $dst|$dst, $src}", + "movq\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVDQ>, Sched<[WriteStore]>; } // ExeDomain = SSEPackedInt @@ -4721,7 +4721,7 @@ let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in { [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))], IIC_SSE_MOVDQ>, Sched<[WriteLoad]>; def MOVSDto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src), - "mov{d|q}\t{$src, $dst|$dst, $src}", + "movq\t{$src, $dst|$dst, $src}", [(set GR64:$dst, 
(bitconvert FR64:$src))], IIC_SSE_MOVD_ToGP>, Sched<[WriteMove]>; def MOVSDto64mr : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src), @@ -4811,12 +4811,12 @@ let Predicates = [UseSSE2] in { } } -// These are the correct encodings of the instructions so that we know how to -// read correct assembly, even though we continue to emit the wrong ones for -// compatibility with Darwin's buggy assembler. -def : InstAlias<"movq\t{$src, $dst|$dst, $src}", +// Before the MC layer of LLVM existed, clang emitted "movd" assembly instead of +// "movq" due to a MacOS parsing limitation. In order to parse old assembly, we add +// these aliases. +def : InstAlias<"movd\t{$src, $dst|$dst, $src}", (MOV64toPQIrr VR128:$dst, GR64:$src), 0>; -def : InstAlias<"movq\t{$src, $dst|$dst, $src}", +def : InstAlias<"movd\t{$src, $dst|$dst, $src}", (MOVPQIto64rr GR64:$dst, VR128:$src), 0>; // Allow "vmovd" but print "vmovq" since we don't need compatibility for AVX. def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}", @@ -7144,33 +7144,37 @@ let Predicates = [UseSSE41] in { /// SS42I_binop_rm - Simple SSE 4.2 binary operator multiclass SS42I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, ValueType OpVT, RegisterClass RC, PatFrag memop_frag, - X86MemOperand x86memop, bit Is2Addr = 1> { + X86MemOperand x86memop, OpndItins itins, + bit Is2Addr = 1> { def rr : SS428I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>; + [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>, Sched<[itins.Sched]>; def rm : SS428I<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), [(set RC:$dst, - (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>; + (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; } let Predicates = [HasAVX] in defm VPCMPGTQ : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v2i64, VR128, - loadv2i64, i128mem, 0>, VEX_4V, VEX_WIG; + loadv2i64, i128mem, SSE_INTALU_ITINS_P, 0>, + VEX_4V, VEX_WIG; let Predicates = [HasAVX2] in defm VPCMPGTQY : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v4i64, VR256, - loadv4i64, i256mem, 0>, VEX_4V, VEX_L, VEX_WIG; + loadv4i64, i256mem, SSE_INTALU_ITINS_P, 0>, + VEX_4V, VEX_L, VEX_WIG; let Constraints = "$src1 = $dst" in defm PCMPGTQ : SS42I_binop_rm<0x37, "pcmpgtq", X86pcmpgt, v2i64, VR128, - memopv2i64, i128mem>; + memopv2i64, i128mem, SSE_INTALU_ITINS_P>; //===----------------------------------------------------------------------===// // SSE4.2 - String/text Processing Instructions diff --git a/lib/Target/X86/X86InstructionSelector.cpp b/lib/Target/X86/X86InstructionSelector.cpp index fb9315792892..d0f1b7091da9 100644 --- a/lib/Target/X86/X86InstructionSelector.cpp +++ b/lib/Target/X86/X86InstructionSelector.cpp @@ -39,11 +39,16 @@ using namespace llvm; namespace { +#define GET_GLOBALISEL_PREDICATE_BITSET +#include "X86GenGlobalISel.inc" +#undef GET_GLOBALISEL_PREDICATE_BITSET + class X86InstructionSelector : public InstructionSelector { public: - X86InstructionSelector(const X86Subtarget &STI, + X86InstructionSelector(const X86TargetMachine &TM, const X86Subtarget &STI, const X86RegisterBankInfo &RBI); + void beginFunction(const MachineFunction &MF) override; bool select(MachineInstr &I) const
override; private: @@ -70,10 +75,17 @@ bool selectTrunc(MachineInstr &I, MachineRegisterInfo &MRI, MachineFunction &MF) const; + const X86TargetMachine &TM; const X86Subtarget &STI; const X86InstrInfo &TII; const X86RegisterInfo &TRI; const X86RegisterBankInfo &RBI; + bool OptForSize; + bool OptForMinSize; + + PredicateBitset AvailableFeatures; + PredicateBitset computeAvailableFeatures(const MachineFunction *MF, + const X86Subtarget *Subtarget) const; #define GET_GLOBALISEL_TEMPORARIES_DECL #include "X86GenGlobalISel.inc" @@ -86,10 +98,12 @@ private: #include "X86GenGlobalISel.inc" #undef GET_GLOBALISEL_IMPL -X86InstructionSelector::X86InstructionSelector(const X86Subtarget &STI, +X86InstructionSelector::X86InstructionSelector(const X86TargetMachine &TM, + const X86Subtarget &STI, const X86RegisterBankInfo &RBI) - : InstructionSelector(), STI(STI), TII(*STI.getInstrInfo()), - TRI(*STI.getRegisterInfo()), RBI(RBI) + : InstructionSelector(), TM(TM), STI(STI), TII(*STI.getInstrInfo()), + TRI(*STI.getRegisterInfo()), RBI(RBI), OptForSize(false), + OptForMinSize(false), AvailableFeatures() #define GET_GLOBALISEL_TEMPORARIES_INIT #include "X86GenGlobalISel.inc" #undef GET_GLOBALISEL_TEMPORARIES_INIT @@ -181,6 +195,12 @@ static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII, return true; } +void X86InstructionSelector::beginFunction(const MachineFunction &MF) { + OptForSize = MF.getFunction()->optForSize(); + OptForMinSize = MF.getFunction()->optForMinSize(); + AvailableFeatures = computeAvailableFeatures(&MF, &STI); +} + bool X86InstructionSelector::select(MachineInstr &I) const { assert(I.getParent() && "Instruction should be in a basic block!"); assert(I.getParent()->getParent() && "Instruction should be in a function!"); @@ -571,7 +591,8 @@ bool X86InstructionSelector::selectTrunc(MachineInstr &I, } InstructionSelector * -llvm::createX86InstructionSelector(X86Subtarget &Subtarget, +llvm::createX86InstructionSelector(const X86TargetMachine &TM, + X86Subtarget &Subtarget, X86RegisterBankInfo &RBI) { - return new X86InstructionSelector(Subtarget, RBI); + return new X86InstructionSelector(TM, Subtarget, RBI); } diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp index 9bab9a4cf3ba..1f16f3c9a14d 100644 --- a/lib/Target/X86/X86RegisterInfo.cpp +++ b/lib/Target/X86/X86RegisterInfo.cpp @@ -137,25 +137,29 @@ X86RegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC, case X86::FR32RegClassID: case X86::FR64RegClassID: // If AVX-512 isn't supported we should only inflate to these classes. - if (!Subtarget.hasAVX512() && Super->getSize() == RC->getSize()) + if (!Subtarget.hasAVX512() && + getRegSizeInBits(*Super) == getRegSizeInBits(*RC)) return Super; break; case X86::VR128RegClassID: case X86::VR256RegClassID: // If VLX isn't supported we should only inflate to these classes. - if (!Subtarget.hasVLX() && Super->getSize() == RC->getSize()) + if (!Subtarget.hasVLX() && + getRegSizeInBits(*Super) == getRegSizeInBits(*RC)) return Super; break; case X86::VR128XRegClassID: case X86::VR256XRegClassID: // If VLX isn't supported we shouldn't inflate to these classes. - if (Subtarget.hasVLX() && Super->getSize() == RC->getSize()) + if (Subtarget.hasVLX() && + getRegSizeInBits(*Super) == getRegSizeInBits(*RC)) return Super; break; case X86::FR32XRegClassID: case X86::FR64XRegClassID: // If AVX-512 isn't supported we shouldn't inflate to these classes.
- if (Subtarget.hasAVX512() && Super->getSize() == RC->getSize()) + if (Subtarget.hasAVX512() && + getRegSizeInBits(*Super) == getRegSizeInBits(*RC)) return Super; break; case X86::GR8RegClassID: @@ -168,7 +172,7 @@ X86RegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC, case X86::VR512RegClassID: // Don't return a super-class that would shrink the spill size. // That can happen with the vector and float classes. - if (Super->getSize() == RC->getSize()) + if (getRegSizeInBits(*Super) == getRegSizeInBits(*RC)) return Super; } Super = *I++; @@ -669,32 +673,28 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, MachineFunction &MF = *MI.getParent()->getParent(); const X86FrameLowering *TFI = getFrameLowering(MF); int FrameIndex = MI.getOperand(FIOperandNum).getIndex(); - unsigned BasePtr; - unsigned Opc = MI.getOpcode(); - bool AfterFPPop = Opc == X86::TAILJMPm64 || Opc == X86::TAILJMPm || - Opc == X86::TCRETURNmi || Opc == X86::TCRETURNmi64; - - if (hasBasePointer(MF)) - BasePtr = (FrameIndex < 0 ? FramePtr : getBaseRegister()); - else if (needsStackRealignment(MF)) - BasePtr = (FrameIndex < 0 ? FramePtr : StackPtr); - else if (AfterFPPop) - BasePtr = StackPtr; - else - BasePtr = (TFI->hasFP(MF) ? FramePtr : StackPtr); + // Determine base register and offset. + int FIOffset; + unsigned BasePtr; + if (MI.isReturn()) { + assert((!needsStackRealignment(MF) || + MF.getFrameInfo().isFixedObjectIndex(FrameIndex)) && + "Return instruction can only reference SP relative frame objects"); + FIOffset = TFI->getFrameIndexReferenceSP(MF, FrameIndex, BasePtr, 0); + } else { + FIOffset = TFI->getFrameIndexReference(MF, FrameIndex, BasePtr); + } // LOCAL_ESCAPE uses a single offset, with no register. It only works in the // simple FP case, and doesn't work with stack realignment. On 32-bit, the // offset is from the traditional base pointer location. On 64-bit, the // offset is from the SP at the end of the prologue, not the FP location. This // matches the behavior of llvm.frameaddress. - unsigned IgnoredFrameReg; + unsigned Opc = MI.getOpcode(); if (Opc == TargetOpcode::LOCAL_ESCAPE) { MachineOperand &FI = MI.getOperand(FIOperandNum); - int Offset; - Offset = TFI->getFrameIndexReference(MF, FrameIndex, IgnoredFrameReg); - FI.ChangeToImmediate(Offset); + FI.ChangeToImmediate(FIOffset); return; } @@ -710,15 +710,6 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, // FrameIndex with base register. Add an offset to the offset. MI.getOperand(FIOperandNum).ChangeToRegister(MachineBasePtr, false); - // Now add the frame object offset to the offset from EBP. - int FIOffset; - if (AfterFPPop) { - // Tail call jmp happens after FP is popped. 
- const MachineFrameInfo &MFI = MF.getFrameInfo(); - FIOffset = MFI.getObjectOffset(FrameIndex) - TFI->getOffsetOfLocalArea(); - } else - FIOffset = TFI->getFrameIndexReference(MF, FrameIndex, IgnoredFrameReg); - if (BasePtr == StackPtr) FIOffset += SPAdj; diff --git a/lib/Target/X86/X86SelectionDAGInfo.cpp b/lib/Target/X86/X86SelectionDAGInfo.cpp index 9da8a18965ea..1a72a0ba3a64 100644 --- a/lib/Target/X86/X86SelectionDAGInfo.cpp +++ b/lib/Target/X86/X86SelectionDAGInfo.cpp @@ -106,7 +106,6 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset( SDValue Count; ConstantSDNode *ValC = dyn_cast<ConstantSDNode>(Src); unsigned BytesLeft = 0; - bool TwoRepStos = false; if (ValC) { unsigned ValReg; uint64_t Val = ValC->getZExtValue() & 255; @@ -163,20 +162,7 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset( SDValue Ops[] = { Chain, DAG.getValueType(AVT), InFlag }; Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops); - if (TwoRepStos) { - InFlag = Chain.getValue(1); - Count = Size; - EVT CVT = Count.getValueType(); - SDValue Left = DAG.getNode(ISD::AND, dl, CVT, Count, - DAG.getConstant((AVT == MVT::i64) ? 7 : 3, dl, - CVT)); - Chain = DAG.getCopyToReg(Chain, dl, (CVT == MVT::i64) ? X86::RCX : X86::ECX, - Left, InFlag); - InFlag = Chain.getValue(1); - Tys = DAG.getVTList(MVT::Other, MVT::Glue); - SDValue Ops[] = { Chain, DAG.getValueType(MVT::i8), InFlag }; - Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops); - } else if (BytesLeft) { + if (BytesLeft) { // Handle the last 1 - 7 bytes. unsigned Offset = SizeVal - BytesLeft; EVT AddrVT = Dst.getValueType(); @@ -195,6 +181,24 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset( return Chain; } +namespace { + +// Represents a cover of a buffer of SizeVal bytes with blocks of size +// AVT, as well as how many bytes remain (BytesLeft is always smaller than +// the block size). +struct RepMovsRepeats { + RepMovsRepeats(const uint64_t SizeVal, const MVT& AVT) { + const unsigned UBytes = AVT.getSizeInBits() / 8; + Count = SizeVal / UBytes; + BytesLeft = SizeVal % UBytes; + } + + unsigned Count; + unsigned BytesLeft; +}; + +} // namespace + SDValue X86SelectionDAGInfo::EmitTargetCodeForMemcpy( SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src, SDValue Size, unsigned Align, bool isVolatile, bool AlwaysInline, @@ -229,7 +233,12 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemcpy( return SDValue(); MVT AVT; - if (Align & 1) + if (Subtarget.hasERMSB()) + // If the target has enhanced REP MOVSB, then it's at least as fast to use + // REP MOVSB instead of REP MOVS{W,D,Q}, and it avoids having to handle + // BytesLeft. + AVT = MVT::i8; + else if (Align & 1) AVT = MVT::i8; else if (Align & 2) AVT = MVT::i16; @@ -240,14 +249,18 @@ // QWORD aligned AVT = Subtarget.is64Bit() ? MVT::i64 : MVT::i32; - unsigned UBytes = AVT.getSizeInBits() / 8; - unsigned CountVal = SizeVal / UBytes; - SDValue Count = DAG.getIntPtrConstant(CountVal, dl); - unsigned BytesLeft = SizeVal % UBytes; + RepMovsRepeats Repeats(SizeVal, AVT); + if (Repeats.BytesLeft > 0 && + DAG.getMachineFunction().getFunction()->optForMinSize()) { + // When aggressively optimizing for size, avoid generating the code to handle + // BytesLeft. + AVT = MVT::i8; + Repeats = RepMovsRepeats(SizeVal, AVT); + } SDValue InFlag; Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ?
X86::RCX : X86::ECX, - Count, InFlag); + DAG.getIntPtrConstant(Repeats.Count, dl), InFlag); InFlag = Chain.getValue(1); Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RDI : X86::EDI, Dst, InFlag); @@ -262,9 +275,9 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemcpy( SmallVector<SDValue, 4> Results; Results.push_back(RepMovs); - if (BytesLeft) { + if (Repeats.BytesLeft) { // Handle the last 1 - 7 bytes. - unsigned Offset = SizeVal - BytesLeft; + unsigned Offset = SizeVal - Repeats.BytesLeft; EVT DstVT = Dst.getValueType(); EVT SrcVT = Src.getValueType(); EVT SizeVT = Size.getValueType(); @@ -275,7 +288,8 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemcpy( DAG.getNode(ISD::ADD, dl, SrcVT, Src, DAG.getConstant(Offset, dl, SrcVT)), - DAG.getConstant(BytesLeft, dl, SizeVT), + DAG.getConstant(Repeats.BytesLeft, dl, + SizeVT), Align, isVolatile, AlwaysInline, false, DstPtrInfo.getWithOffset(Offset), SrcPtrInfo.getWithOffset(Offset))); diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp index 92a68759195c..4154530d04e7 100644 --- a/lib/Target/X86/X86Subtarget.cpp +++ b/lib/Target/X86/X86Subtarget.cpp @@ -303,6 +303,7 @@ void X86Subtarget::initializeEnvironment() { HasFastVectorFSQRT = false; HasFastLZCNT = false; HasFastSHLDRotate = false; + HasERMSB = false; HasSlowDivide32 = false; HasSlowDivide64 = false; PadShortFunctions = false; diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h index d0d88d326949..fd057f36c890 100644 --- a/lib/Target/X86/X86Subtarget.h +++ b/lib/Target/X86/X86Subtarget.h @@ -232,6 +232,9 @@ protected: /// True if SHLD based rotate is fast. bool HasFastSHLDRotate; + /// True if the processor has enhanced REP MOVSB/STOSB. + bool HasERMSB; + /// True if the short functions should be padded to prevent /// a stall when returning too early. 
bool PadShortFunctions; @@ -472,6 +475,7 @@ public: bool hasFastVectorFSQRT() const { return HasFastVectorFSQRT; } bool hasFastLZCNT() const { return HasFastLZCNT; } bool hasFastSHLDRotate() const { return HasFastSHLDRotate; } + bool hasERMSB() const { return HasERMSB; } bool hasSlowDivide32() const { return HasSlowDivide32; } bool hasSlowDivide64() const { return HasSlowDivide64; } bool padShortFunctions() const { return PadShortFunctions; } diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp index 03a1958121ab..623cf38aa951 100644 --- a/lib/Target/X86/X86TargetMachine.cpp +++ b/lib/Target/X86/X86TargetMachine.cpp @@ -286,7 +286,7 @@ X86TargetMachine::getSubtargetImpl(const Function &F) const { auto *RBI = new X86RegisterBankInfo(*I->getRegisterInfo()); GISel->RegBankInfo.reset(RBI); - GISel->InstSelector.reset(createX86InstructionSelector(*I, *RBI)); + GISel->InstSelector.reset(createX86InstructionSelector(*this, *I, *RBI)); #endif I->setGISelAccessor(*GISel); } diff --git a/lib/Target/XCore/XCoreFrameLowering.cpp b/lib/Target/XCore/XCoreFrameLowering.cpp index a752357400b3..784612038c09 100644 --- a/lib/Target/XCore/XCoreFrameLowering.cpp +++ b/lib/Target/XCore/XCoreFrameLowering.cpp @@ -575,18 +575,17 @@ processFunctionBeforeFrameFinalized(MachineFunction &MF, RegScavenger *RS) const { assert(RS && "requiresRegisterScavenging failed"); MachineFrameInfo &MFI = MF.getFrameInfo(); - const TargetRegisterClass *RC = &XCore::GRRegsRegClass; + const TargetRegisterClass &RC = XCore::GRRegsRegClass; + const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); XCoreFunctionInfo *XFI = MF.getInfo<XCoreFunctionInfo>(); // Reserve slots close to SP or frame pointer for Scavenging spills. // When using SP for small frames, we don't need any scratch registers. // When using SP for large frames, we may need 2 scratch registers. // When using FP, for large or small frames, we may need 1 scratch register. 
+ unsigned Size = TRI.getSpillSize(RC); + unsigned Align = TRI.getSpillAlignment(RC); if (XFI->isLargeFrame(MF) || hasFP(MF)) - RS->addScavengingFrameIndex(MFI.CreateStackObject(RC->getSize(), - RC->getAlignment(), - false)); + RS->addScavengingFrameIndex(MFI.CreateStackObject(Size, Align, false)); if (XFI->isLargeFrame(MF) && !hasFP(MF)) - RS->addScavengingFrameIndex(MFI.CreateStackObject(RC->getSize(), - RC->getAlignment(), - false)); + RS->addScavengingFrameIndex(MFI.CreateStackObject(Size, Align, false)); } diff --git a/lib/Target/XCore/XCoreISelLowering.cpp b/lib/Target/XCore/XCoreISelLowering.cpp index 45437815fa37..2efcd46cd8d4 100644 --- a/lib/Target/XCore/XCoreISelLowering.cpp +++ b/lib/Target/XCore/XCoreISelLowering.cpp @@ -1605,7 +1605,7 @@ SDValue XCoreTargetLowering::PerformDAGCombine(SDNode *N, TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), !DCI.isBeforeLegalizeOps()); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - if (TLO.ShrinkDemandedConstant(OutVal, DemandedMask) || + if (TLI.ShrinkDemandedConstant(OutVal, DemandedMask, TLO) || TLI.SimplifyDemandedBits(OutVal, DemandedMask, KnownZero, KnownOne, TLO)) DCI.CommitTargetLoweringOpt(TLO); @@ -1622,7 +1622,7 @@ SDValue XCoreTargetLowering::PerformDAGCombine(SDNode *N, TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), !DCI.isBeforeLegalizeOps()); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - if (TLO.ShrinkDemandedConstant(Time, DemandedMask) || + if (TLI.ShrinkDemandedConstant(Time, DemandedMask, TLO) || TLI.SimplifyDemandedBits(Time, DemandedMask, KnownZero, KnownOne, TLO)) DCI.CommitTargetLoweringOpt(TLO); diff --git a/lib/Target/XCore/XCoreMachineFunctionInfo.cpp b/lib/Target/XCore/XCoreMachineFunctionInfo.cpp index e91536ca1e83..75af0e97dfb5 100644 --- a/lib/Target/XCore/XCoreMachineFunctionInfo.cpp +++ b/lib/Target/XCore/XCoreMachineFunctionInfo.cpp @@ -10,6 +10,7 @@ #include "XCoreMachineFunctionInfo.h" #include "XCoreInstrInfo.h" #include "llvm/IR/Function.h" +#include "llvm/Target/TargetSubtargetInfo.h" using namespace llvm; @@ -35,13 +36,15 @@ int XCoreFunctionInfo::createLRSpillSlot(MachineFunction &MF) { if (LRSpillSlotSet) { return LRSpillSlot; } - const TargetRegisterClass *RC = &XCore::GRRegsRegClass; + const TargetRegisterClass &RC = XCore::GRRegsRegClass; + const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); MachineFrameInfo &MFI = MF.getFrameInfo(); if (! MF.getFunction()->isVarArg()) { // A fixed offset of 0 allows us to save / restore LR using entsp / retsp. 
- LRSpillSlot = MFI.CreateFixedObject(RC->getSize(), 0, true); + LRSpillSlot = MFI.CreateFixedObject(TRI.getSpillSize(RC), 0, true); } else { - LRSpillSlot = MFI.CreateStackObject(RC->getSize(), RC->getAlignment(), true); + LRSpillSlot = MFI.CreateStackObject(TRI.getSpillSize(RC), + TRI.getSpillAlignment(RC), true); } LRSpillSlotSet = true; return LRSpillSlot; @@ -51,9 +54,11 @@ int XCoreFunctionInfo::createFPSpillSlot(MachineFunction &MF) { if (FPSpillSlotSet) { return FPSpillSlot; } - const TargetRegisterClass *RC = &XCore::GRRegsRegClass; + const TargetRegisterClass &RC = XCore::GRRegsRegClass; + const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); MachineFrameInfo &MFI = MF.getFrameInfo(); - FPSpillSlot = MFI.CreateStackObject(RC->getSize(), RC->getAlignment(), true); + FPSpillSlot = MFI.CreateStackObject(TRI.getSpillSize(RC), + TRI.getSpillAlignment(RC), true); FPSpillSlotSet = true; return FPSpillSlot; } @@ -62,10 +67,13 @@ const int* XCoreFunctionInfo::createEHSpillSlot(MachineFunction &MF) { if (EHSpillSlotSet) { return EHSpillSlot; } - const TargetRegisterClass *RC = &XCore::GRRegsRegClass; + const TargetRegisterClass &RC = XCore::GRRegsRegClass; + const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); MachineFrameInfo &MFI = MF.getFrameInfo(); - EHSpillSlot[0] = MFI.CreateStackObject(RC->getSize(), RC->getAlignment(), true); - EHSpillSlot[1] = MFI.CreateStackObject(RC->getSize(), RC->getAlignment(), true); + unsigned Size = TRI.getSpillSize(RC); + unsigned Align = TRI.getSpillAlignment(RC); + EHSpillSlot[0] = MFI.CreateStackObject(Size, Align, true); + EHSpillSlot[1] = MFI.CreateStackObject(Size, Align, true); EHSpillSlotSet = true; return EHSpillSlot; }
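The same three-line pattern recurs across the AArch64, PowerPC, XCore and X86 hunks in this patch: spill size and alignment are now queried from TargetRegisterInfo rather than from TargetRegisterClass before a stack object is created. A condensed standalone sketch of that pattern, with stub types standing in for the LLVM classes and fixed illustrative values:

    #include <cstdio>

    // Stub stand-ins for the LLVM types; only the members the pattern
    // touches are modeled here.
    struct TargetRegisterClass {};

    struct TargetRegisterInfo {
      // In LLVM these consult the spill size/alignment tables generated
      // from the target's .td files; fixed values stand in below.
      unsigned getSpillSize(const TargetRegisterClass &) const { return 8; }
      unsigned getSpillAlignment(const TargetRegisterClass &) const { return 8; }
    };

    struct MachineFrameInfo {
      int NextFI = 0;
      int CreateStackObject(unsigned Size, unsigned Align, bool /*IsSpillSlot*/) {
        std::printf("fi#%d: size=%u align=%u\n", NextFI, Size, Align);
        return NextFI++;
      }
    };

    int main() {
      TargetRegisterInfo TRI;
      TargetRegisterClass GPR64; // stands in for e.g. XCore::GRRegsRegClass
      MachineFrameInfo MFI;

      // The pattern this patch installs wherever a spill slot is created:
      unsigned Size = TRI.getSpillSize(GPR64);
      unsigned Align = TRI.getSpillAlignment(GPR64);
      int FI = MFI.CreateStackObject(Size, Align, false);
      (void)FI;
      return 0;
    }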