Diffstat (limited to 'llvm/lib/Target/PowerPC/PPCInstrInfo.cpp')
-rw-r--r-- | llvm/lib/Target/PowerPC/PPCInstrInfo.cpp | 1073
1 file changed, 948 insertions, 125 deletions
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp index 11c97210ead9..9e3c6c569bd7 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -21,12 +21,15 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/CodeGen/LiveIntervals.h" +#include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/PseudoSourceValue.h" +#include "llvm/CodeGen/RegisterClassInfo.h" +#include "llvm/CodeGen/RegisterPressure.h" #include "llvm/CodeGen/ScheduleDAG.h" #include "llvm/CodeGen/SlotIndexes.h" #include "llvm/CodeGen/StackMaps.h" @@ -73,6 +76,14 @@ static cl::opt<bool> UseOldLatencyCalc("ppc-old-latency-calc", cl::Hidden, cl::desc("Use the old (incorrect) instruction latency calculation")); +static cl::opt<float> + FMARPFactor("ppc-fma-rp-factor", cl::Hidden, cl::init(1.5), + cl::desc("register pressure factor for the transformations.")); + +static cl::opt<bool> EnableFMARegPressureReduction( + "ppc-fma-rp-reduction", cl::Hidden, cl::init(true), + cl::desc("enable register pressure reduce in machine combiner pass.")); + // Pin the vtable to this file. void PPCInstrInfo::anchor() {} @@ -259,14 +270,6 @@ bool PPCInstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst) const { case PPC::XVMULDP: case PPC::XVMULSP: case PPC::XSMULSP: - // QPX Add: - case PPC::QVFADD: - case PPC::QVFADDS: - case PPC::QVFADDSs: - // QPX Multiply: - case PPC::QVFMUL: - case PPC::QVFMULS: - case PPC::QVFMULSs: return Inst.getFlag(MachineInstr::MIFlag::FmReassoc) && Inst.getFlag(MachineInstr::MIFlag::FmNsz); // Fixed point: @@ -286,23 +289,23 @@ bool PPCInstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst) const { #define InfoArrayIdxFMULInst 2 #define InfoArrayIdxAddOpIdx 3 #define InfoArrayIdxMULOpIdx 4 +#define InfoArrayIdxFSubInst 5 // Array keeps info for FMA instructions: // Index 0(InfoArrayIdxFMAInst): FMA instruction; -// Index 1(InfoArrayIdxFAddInst): ADD instruction assoaicted with FMA; -// Index 2(InfoArrayIdxFMULInst): MUL instruction assoaicted with FMA; +// Index 1(InfoArrayIdxFAddInst): ADD instruction associated with FMA; +// Index 2(InfoArrayIdxFMULInst): MUL instruction associated with FMA; // Index 3(InfoArrayIdxAddOpIdx): ADD operand index in FMA operands; // Index 4(InfoArrayIdxMULOpIdx): first MUL operand index in FMA operands; -// second MUL operand index is plus 1. -static const uint16_t FMAOpIdxInfo[][5] = { +// second MUL operand index is plus 1; +// Index 5(InfoArrayIdxFSubInst): SUB instruction associated with FMA. +static const uint16_t FMAOpIdxInfo[][6] = { // FIXME: Add more FMA instructions like XSNMADDADP and so on. 
- {PPC::XSMADDADP, PPC::XSADDDP, PPC::XSMULDP, 1, 2}, - {PPC::XSMADDASP, PPC::XSADDSP, PPC::XSMULSP, 1, 2}, - {PPC::XVMADDADP, PPC::XVADDDP, PPC::XVMULDP, 1, 2}, - {PPC::XVMADDASP, PPC::XVADDSP, PPC::XVMULSP, 1, 2}, - {PPC::FMADD, PPC::FADD, PPC::FMUL, 3, 1}, - {PPC::FMADDS, PPC::FADDS, PPC::FMULS, 3, 1}, - {PPC::QVFMADDSs, PPC::QVFADDSs, PPC::QVFMULSs, 3, 1}, - {PPC::QVFMADD, PPC::QVFADD, PPC::QVFMUL, 3, 1}}; + {PPC::XSMADDADP, PPC::XSADDDP, PPC::XSMULDP, 1, 2, PPC::XSSUBDP}, + {PPC::XSMADDASP, PPC::XSADDSP, PPC::XSMULSP, 1, 2, PPC::XSSUBSP}, + {PPC::XVMADDADP, PPC::XVADDDP, PPC::XVMULDP, 1, 2, PPC::XVSUBDP}, + {PPC::XVMADDASP, PPC::XVADDSP, PPC::XVMULSP, 1, 2, PPC::XVSUBSP}, + {PPC::FMADD, PPC::FADD, PPC::FMUL, 3, 1, PPC::FSUB}, + {PPC::FMADDS, PPC::FADDS, PPC::FMULS, 3, 1, PPC::FSUBS}}; // Check if an opcode is a FMA instruction. If it is, return the index in array // FMAOpIdxInfo. Otherwise, return -1. @@ -313,6 +316,8 @@ int16_t PPCInstrInfo::getFMAOpIdxInfo(unsigned Opcode) const { return -1; } +// On PowerPC target, we have two kinds of patterns related to FMA: +// 1: Improve ILP. // Try to reassociate FMA chains like below: // // Pattern 1: @@ -336,11 +341,35 @@ int16_t PPCInstrInfo::getFMAOpIdxInfo(unsigned Opcode) const { // // breaking the dependency between A and B, allowing FMA to be executed in // parallel (or back-to-back in a pipeline) instead of depending on each other. +// +// 2: Reduce register pressure. +// Try to reassociate FMA with FSUB and a constant like below: +// C is a floating point constant. +// +// Pattern 1: +// A = FSUB X, Y (Leaf) +// D = FMA B, C, A (Root) +// --> +// A = FMA B, Y, -C +// D = FMA A, X, C +// +// Pattern 2: +// A = FSUB X, Y (Leaf) +// D = FMA B, A, C (Root) +// --> +// A = FMA B, Y, -C +// D = FMA A, X, C +// +// Before the transformation, A must be assigned a different hardware +// register than D. After the transformation, A and D must be assigned the +// same hardware register due to the TIE attribute of FMA instructions. +// bool PPCInstrInfo::getFMAPatterns( - MachineInstr &Root, - SmallVectorImpl<MachineCombinerPattern> &Patterns) const { + MachineInstr &Root, SmallVectorImpl<MachineCombinerPattern> &Patterns, + bool DoRegPressureReduce) const { MachineBasicBlock *MBB = Root.getParent(); - const MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); + const MachineRegisterInfo *MRI = &MBB->getParent()->getRegInfo(); + const TargetRegisterInfo *TRI = &getRegisterInfo(); auto IsAllOpsVirtualReg = [](const MachineInstr &Instr) { for (const auto &MO : Instr.explicit_operands()) @@ -349,16 +378,35 @@ bool PPCInstrInfo::getFMAPatterns( return true; }; - auto IsReassociable = [&](const MachineInstr &Instr, int16_t &AddOpIdx, - bool IsLeaf, bool IsAdd) { - int16_t Idx = -1; - if (!IsAdd) { - Idx = getFMAOpIdxInfo(Instr.getOpcode()); - if (Idx < 0) - return false; - } else if (Instr.getOpcode() != - FMAOpIdxInfo[getFMAOpIdxInfo(Root.getOpcode())] - [InfoArrayIdxFAddInst]) + auto IsReassociableAddOrSub = [&](const MachineInstr &Instr, + unsigned OpType) { + if (Instr.getOpcode() != + FMAOpIdxInfo[getFMAOpIdxInfo(Root.getOpcode())][OpType]) + return false; + + // Instruction can be reassociated. + // Fast math flags may prohibit reassociation. + if (!(Instr.getFlag(MachineInstr::MIFlag::FmReassoc) && + Instr.getFlag(MachineInstr::MIFlag::FmNsz))) + return false; + + // Instruction operands are virtual registers for reassociation.
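A minimal numeric sketch of the register-pressure rewrite described in the comment above, assuming the FMA(addend, mul1, mul2) operand order used there; variable names and values are illustrative only. The two forms agree exactly here, but in general they can differ by rounding, which is why the FmReassoc/FmNsz flags are required:

#include <cmath>
#include <cstdio>

int main() {
  double X = 3.0, Y = 5.0, B = 7.0;
  const double C = 2.0;              // stands for the constant-pool value
  // Original:   A = X - Y;  D = B + C * A        (FSUB leaf + FMA root)
  double A0 = X - Y;
  double D0 = std::fma(C, A0, B);
  // Rewritten:  A = B + (-C) * Y;  D = A + C * X (two FMAs, -C from a new pool entry)
  double A1 = std::fma(-C, Y, B);
  double D1 = std::fma(C, X, A1);
  std::printf("%f %f\n", D0, D1);    // prints the same value twice
  return 0;
}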
+ if (!IsAllOpsVirtualReg(Instr)) + return false; + + // For register pressure reassociation, the FSub must have only one use as + // we want to delete the sub to save its def. + if (OpType == InfoArrayIdxFSubInst && + !MRI->hasOneNonDBGUse(Instr.getOperand(0).getReg())) + return false; + + return true; + }; + + auto IsReassociableFMA = [&](const MachineInstr &Instr, int16_t &AddOpIdx, + int16_t &MulOpIdx, bool IsLeaf) { + int16_t Idx = getFMAOpIdxInfo(Instr.getOpcode()); + if (Idx < 0) return false; // Instruction can be reassociated. @@ -371,65 +419,356 @@ bool PPCInstrInfo::getFMAPatterns( if (!IsAllOpsVirtualReg(Instr)) return false; - if (IsAdd && IsLeaf) + MulOpIdx = FMAOpIdxInfo[Idx][InfoArrayIdxMULOpIdx]; + if (IsLeaf) return true; AddOpIdx = FMAOpIdxInfo[Idx][InfoArrayIdxAddOpIdx]; const MachineOperand &OpAdd = Instr.getOperand(AddOpIdx); - MachineInstr *MIAdd = MRI.getUniqueVRegDef(OpAdd.getReg()); + MachineInstr *MIAdd = MRI->getUniqueVRegDef(OpAdd.getReg()); // If 'add' operand's def is not in current block, don't do ILP related opt. if (!MIAdd || MIAdd->getParent() != MBB) return false; // If this is not Leaf FMA Instr, its 'add' operand should only have one use // as this fma will be changed later. - return IsLeaf ? true : MRI.hasOneNonDBGUse(OpAdd.getReg()); + return IsLeaf ? true : MRI->hasOneNonDBGUse(OpAdd.getReg()); }; int16_t AddOpIdx = -1; + int16_t MulOpIdx = -1; + + bool IsUsedOnceL = false; + bool IsUsedOnceR = false; + MachineInstr *MULInstrL = nullptr; + MachineInstr *MULInstrR = nullptr; + + auto IsRPReductionCandidate = [&]() { + // Currently, we only support float and double. + // FIXME: add support for other types. + unsigned Opcode = Root.getOpcode(); + if (Opcode != PPC::XSMADDASP && Opcode != PPC::XSMADDADP) + return false; + + // Root must be a valid FMA like instruction. + // Treat it as leaf as we don't care its add operand. + if (IsReassociableFMA(Root, AddOpIdx, MulOpIdx, true)) { + assert((MulOpIdx >= 0) && "mul operand index not right!"); + Register MULRegL = TRI->lookThruSingleUseCopyChain( + Root.getOperand(MulOpIdx).getReg(), MRI); + Register MULRegR = TRI->lookThruSingleUseCopyChain( + Root.getOperand(MulOpIdx + 1).getReg(), MRI); + if (!MULRegL && !MULRegR) + return false; + + if (MULRegL && !MULRegR) { + MULRegR = + TRI->lookThruCopyLike(Root.getOperand(MulOpIdx + 1).getReg(), MRI); + IsUsedOnceL = true; + } else if (!MULRegL && MULRegR) { + MULRegL = + TRI->lookThruCopyLike(Root.getOperand(MulOpIdx).getReg(), MRI); + IsUsedOnceR = true; + } else { + IsUsedOnceL = true; + IsUsedOnceR = true; + } + + if (!Register::isVirtualRegister(MULRegL) || + !Register::isVirtualRegister(MULRegR)) + return false; + + MULInstrL = MRI->getVRegDef(MULRegL); + MULInstrR = MRI->getVRegDef(MULRegR); + return true; + } + return false; + }; + + // Register pressure fma reassociation patterns. 
+ if (DoRegPressureReduce && IsRPReductionCandidate()) { + assert((MULInstrL && MULInstrR) && "wrong register preduction candidate!"); + // Register pressure pattern 1 + if (isLoadFromConstantPool(MULInstrL) && IsUsedOnceR && + IsReassociableAddOrSub(*MULInstrR, InfoArrayIdxFSubInst)) { + LLVM_DEBUG(dbgs() << "add pattern REASSOC_XY_BCA\n"); + Patterns.push_back(MachineCombinerPattern::REASSOC_XY_BCA); + return true; + } + + // Register pressure pattern 2 + if ((isLoadFromConstantPool(MULInstrR) && IsUsedOnceL && + IsReassociableAddOrSub(*MULInstrL, InfoArrayIdxFSubInst))) { + LLVM_DEBUG(dbgs() << "add pattern REASSOC_XY_BAC\n"); + Patterns.push_back(MachineCombinerPattern::REASSOC_XY_BAC); + return true; + } + } + + // ILP fma reassociation patterns. // Root must be a valid FMA like instruction. - if (!IsReassociable(Root, AddOpIdx, false, false)) + AddOpIdx = -1; + if (!IsReassociableFMA(Root, AddOpIdx, MulOpIdx, false)) return false; assert((AddOpIdx >= 0) && "add operand index not right!"); Register RegB = Root.getOperand(AddOpIdx).getReg(); - MachineInstr *Prev = MRI.getUniqueVRegDef(RegB); + MachineInstr *Prev = MRI->getUniqueVRegDef(RegB); // Prev must be a valid FMA like instruction. AddOpIdx = -1; - if (!IsReassociable(*Prev, AddOpIdx, false, false)) + if (!IsReassociableFMA(*Prev, AddOpIdx, MulOpIdx, false)) return false; assert((AddOpIdx >= 0) && "add operand index not right!"); Register RegA = Prev->getOperand(AddOpIdx).getReg(); - MachineInstr *Leaf = MRI.getUniqueVRegDef(RegA); + MachineInstr *Leaf = MRI->getUniqueVRegDef(RegA); AddOpIdx = -1; - if (IsReassociable(*Leaf, AddOpIdx, true, false)) { + if (IsReassociableFMA(*Leaf, AddOpIdx, MulOpIdx, true)) { Patterns.push_back(MachineCombinerPattern::REASSOC_XMM_AMM_BMM); + LLVM_DEBUG(dbgs() << "add pattern REASSOC_XMM_AMM_BMM\n"); return true; } - if (IsReassociable(*Leaf, AddOpIdx, true, true)) { + if (IsReassociableAddOrSub(*Leaf, InfoArrayIdxFAddInst)) { Patterns.push_back(MachineCombinerPattern::REASSOC_XY_AMM_BMM); + LLVM_DEBUG(dbgs() << "add pattern REASSOC_XY_AMM_BMM\n"); return true; } return false; } +void PPCInstrInfo::finalizeInsInstrs( + MachineInstr &Root, MachineCombinerPattern &P, + SmallVectorImpl<MachineInstr *> &InsInstrs) const { + assert(!InsInstrs.empty() && "Instructions set to be inserted is empty!"); + + MachineFunction *MF = Root.getMF(); + MachineRegisterInfo *MRI = &MF->getRegInfo(); + const TargetRegisterInfo *TRI = &getRegisterInfo(); + MachineConstantPool *MCP = MF->getConstantPool(); + + int16_t Idx = getFMAOpIdxInfo(Root.getOpcode()); + if (Idx < 0) + return; + + uint16_t FirstMulOpIdx = FMAOpIdxInfo[Idx][InfoArrayIdxMULOpIdx]; + + // For now we only need to fix up placeholder for register pressure reduce + // patterns. + Register ConstReg = 0; + switch (P) { + case MachineCombinerPattern::REASSOC_XY_BCA: + ConstReg = + TRI->lookThruCopyLike(Root.getOperand(FirstMulOpIdx).getReg(), MRI); + break; + case MachineCombinerPattern::REASSOC_XY_BAC: + ConstReg = + TRI->lookThruCopyLike(Root.getOperand(FirstMulOpIdx + 1).getReg(), MRI); + break; + default: + // Not register pressure reduce patterns. + return; + } + + MachineInstr *ConstDefInstr = MRI->getVRegDef(ConstReg); + // Get const value from const pool. + const Constant *C = getConstantFromConstantPool(ConstDefInstr); + assert(isa<llvm::ConstantFP>(C) && "not a valid constant!"); + + // Get negative fp const. 
+ APFloat F1((dyn_cast<ConstantFP>(C))->getValueAPF()); + F1.changeSign(); + Constant *NegC = ConstantFP::get(dyn_cast<ConstantFP>(C)->getContext(), F1); + Align Alignment = MF->getDataLayout().getPrefTypeAlign(C->getType()); + + // Put negative fp const into constant pool. + unsigned ConstPoolIdx = MCP->getConstantPoolIndex(NegC, Alignment); + + MachineOperand *Placeholder = nullptr; + // Record the placeholder PPC::ZERO8 we add in reassociateFMA. + for (auto *Inst : InsInstrs) { + for (MachineOperand &Operand : Inst->explicit_operands()) { + assert(Operand.isReg() && "Invalid instruction in InsInstrs!"); + if (Operand.getReg() == PPC::ZERO8) { + Placeholder = &Operand; + break; + } + } + } + + assert(Placeholder && "Placeholder does not exist!"); + + // Generate instructions to load the const fp from constant pool. + // We only support PPC64 and medium code model. + Register LoadNewConst = + generateLoadForNewConst(ConstPoolIdx, &Root, C->getType(), InsInstrs); + + // Fill the placeholder with the new load from constant pool. + Placeholder->setReg(LoadNewConst); +} + +bool PPCInstrInfo::shouldReduceRegisterPressure( + MachineBasicBlock *MBB, RegisterClassInfo *RegClassInfo) const { + + if (!EnableFMARegPressureReduction) + return false; + + // Currently, we only enable register pressure reducing in machine combiner + // for: 1: PPC64; 2: Code Model is Medium; 3: Power9 which also has vector + // support. + // + // So we need following instructions to access a TOC entry: + // + // %6:g8rc_and_g8rc_nox0 = ADDIStocHA8 $x2, %const.0 + // %7:vssrc = DFLOADf32 target-flags(ppc-toc-lo) %const.0, + // killed %6:g8rc_and_g8rc_nox0, implicit $x2 :: (load 4 from constant-pool) + // + // FIXME: add more supported targets, like Small and Large code model, PPC32, + // AIX. + if (!(Subtarget.isPPC64() && Subtarget.hasP9Vector() && + Subtarget.getTargetMachine().getCodeModel() == CodeModel::Medium)) + return false; + + const TargetRegisterInfo *TRI = &getRegisterInfo(); + MachineFunction *MF = MBB->getParent(); + MachineRegisterInfo *MRI = &MF->getRegInfo(); + + auto GetMBBPressure = [&](MachineBasicBlock *MBB) -> std::vector<unsigned> { + RegionPressure Pressure; + RegPressureTracker RPTracker(Pressure); + + // Initialize the register pressure tracker. + RPTracker.init(MBB->getParent(), RegClassInfo, nullptr, MBB, MBB->end(), + /*TrackLaneMasks*/ false, /*TrackUntiedDefs=*/true); + + for (MachineBasicBlock::iterator MII = MBB->instr_end(), + MIE = MBB->instr_begin(); + MII != MIE; --MII) { + MachineInstr &MI = *std::prev(MII); + if (MI.isDebugValue() || MI.isDebugLabel()) + continue; + RegisterOperands RegOpers; + RegOpers.collect(MI, *TRI, *MRI, false, false); + RPTracker.recedeSkipDebugValues(); + assert(&*RPTracker.getPos() == &MI && "RPTracker sync error!"); + RPTracker.recede(RegOpers); + } + + // Close the RPTracker to finalize live ins. + RPTracker.closeRegion(); + + return RPTracker.getPressure().MaxSetPressure; + }; + + // For now we only care about float and double type fma. + unsigned VSSRCLimit = TRI->getRegPressureSetLimit( + *MBB->getParent(), PPC::RegisterPressureSets::VSSRC); + + // Only reduce register pressure when pressure is high. + return GetMBBPressure(MBB)[PPC::RegisterPressureSets::VSSRC] > + (float)VSSRCLimit * FMARPFactor; +} + +bool PPCInstrInfo::isLoadFromConstantPool(MachineInstr *I) const { + // I has only one memory operand which is load from constant pool. 
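A hedged sketch of the trigger condition computed in shouldReduceRegisterPressure above: the rewrite is only attempted when the block's maximum VSSRC set pressure exceeds the set limit scaled by ppc-fma-rp-factor (default 1.5). The limit and pressure values below are made up for illustration:

#include <cstdio>

int main() {
  unsigned VSSRCLimit = 32;   // hypothetical getRegPressureSetLimit() result
  float Factor = 1.5f;        // default value of -ppc-fma-rp-factor
  unsigned MaxPressure = 50;  // hypothetical per-block max VSSRC pressure
  bool Reduce = MaxPressure > (float)VSSRCLimit * Factor;
  std::printf("reduce register pressure: %s\n", Reduce ? "yes" : "no");
  return 0;
}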
+ if (!I->hasOneMemOperand()) + return false; + + MachineMemOperand *Op = I->memoperands()[0]; + return Op->isLoad() && Op->getPseudoValue() && + Op->getPseudoValue()->kind() == PseudoSourceValue::ConstantPool; +} + +Register PPCInstrInfo::generateLoadForNewConst( + unsigned Idx, MachineInstr *MI, Type *Ty, + SmallVectorImpl<MachineInstr *> &InsInstrs) const { + // Now we only support PPC64, Medium code model and P9 with vector. + // We have immutable pattern to access const pool. See function + // shouldReduceRegisterPressure. + assert((Subtarget.isPPC64() && Subtarget.hasP9Vector() && + Subtarget.getTargetMachine().getCodeModel() == CodeModel::Medium) && + "Target not supported!\n"); + + MachineFunction *MF = MI->getMF(); + MachineRegisterInfo *MRI = &MF->getRegInfo(); + + // Generate ADDIStocHA8 + Register VReg1 = MRI->createVirtualRegister(&PPC::G8RC_and_G8RC_NOX0RegClass); + MachineInstrBuilder TOCOffset = + BuildMI(*MF, MI->getDebugLoc(), get(PPC::ADDIStocHA8), VReg1) + .addReg(PPC::X2) + .addConstantPoolIndex(Idx); + + assert((Ty->isFloatTy() || Ty->isDoubleTy()) && + "Only float and double are supported!"); + + unsigned LoadOpcode; + // Should be float type or double type. + if (Ty->isFloatTy()) + LoadOpcode = PPC::DFLOADf32; + else + LoadOpcode = PPC::DFLOADf64; + + const TargetRegisterClass *RC = MRI->getRegClass(MI->getOperand(0).getReg()); + Register VReg2 = MRI->createVirtualRegister(RC); + MachineMemOperand *MMO = MF->getMachineMemOperand( + MachinePointerInfo::getConstantPool(*MF), MachineMemOperand::MOLoad, + Ty->getScalarSizeInBits() / 8, MF->getDataLayout().getPrefTypeAlign(Ty)); + + // Generate Load from constant pool. + MachineInstrBuilder Load = + BuildMI(*MF, MI->getDebugLoc(), get(LoadOpcode), VReg2) + .addConstantPoolIndex(Idx) + .addReg(VReg1, getKillRegState(true)) + .addMemOperand(MMO); + + Load->getOperand(1).setTargetFlags(PPCII::MO_TOC_LO); + + // Insert the toc load instructions into InsInstrs. + InsInstrs.insert(InsInstrs.begin(), Load); + InsInstrs.insert(InsInstrs.begin(), TOCOffset); + return VReg2; +} + +// This function returns the const value in constant pool if the \p I is a load +// from constant pool. +const Constant * +PPCInstrInfo::getConstantFromConstantPool(MachineInstr *I) const { + MachineFunction *MF = I->getMF(); + MachineRegisterInfo *MRI = &MF->getRegInfo(); + MachineConstantPool *MCP = MF->getConstantPool(); + assert(I->mayLoad() && "Should be a load instruction.\n"); + for (auto MO : I->uses()) { + if (!MO.isReg()) + continue; + Register Reg = MO.getReg(); + if (Reg == 0 || !Register::isVirtualRegister(Reg)) + continue; + // Find the toc address. + MachineInstr *DefMI = MRI->getVRegDef(Reg); + for (auto MO2 : DefMI->uses()) + if (MO2.isCPI()) + return (MCP->getConstants())[MO2.getIndex()].Val.ConstVal; + } + return nullptr; +} + bool PPCInstrInfo::getMachineCombinerPatterns( - MachineInstr &Root, - SmallVectorImpl<MachineCombinerPattern> &Patterns) const { + MachineInstr &Root, SmallVectorImpl<MachineCombinerPattern> &Patterns, + bool DoRegPressureReduce) const { // Using the machine combiner in this way is potentially expensive, so // restrict to when aggressive optimizations are desired. 
if (Subtarget.getTargetMachine().getOptLevel() != CodeGenOpt::Aggressive) return false; - if (getFMAPatterns(Root, Patterns)) + if (getFMAPatterns(Root, Patterns, DoRegPressureReduce)) return true; - return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns); + return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns, + DoRegPressureReduce); } void PPCInstrInfo::genAlternativeCodeSequence( @@ -440,6 +779,8 @@ void PPCInstrInfo::genAlternativeCodeSequence( switch (Pattern) { case MachineCombinerPattern::REASSOC_XY_AMM_BMM: case MachineCombinerPattern::REASSOC_XMM_AMM_BMM: + case MachineCombinerPattern::REASSOC_XY_BCA: + case MachineCombinerPattern::REASSOC_XY_BAC: reassociateFMA(Root, Pattern, InsInstrs, DelInstrs, InstrIdxForVirtReg); break; default: @@ -450,8 +791,6 @@ void PPCInstrInfo::genAlternativeCodeSequence( } } -// Currently, only handle two patterns REASSOC_XY_AMM_BMM and -// REASSOC_XMM_AMM_BMM. See comments for getFMAPatterns. void PPCInstrInfo::reassociateFMA( MachineInstr &Root, MachineCombinerPattern Pattern, SmallVectorImpl<MachineInstr *> &InsInstrs, @@ -459,6 +798,7 @@ void PPCInstrInfo::reassociateFMA( DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const { MachineFunction *MF = Root.getMF(); MachineRegisterInfo &MRI = MF->getRegInfo(); + const TargetRegisterInfo *TRI = &getRegisterInfo(); MachineOperand &OpC = Root.getOperand(0); Register RegC = OpC.getReg(); const TargetRegisterClass *RC = MRI.getRegClass(RegC); @@ -468,13 +808,42 @@ void PPCInstrInfo::reassociateFMA( int16_t Idx = getFMAOpIdxInfo(FmaOp); assert(Idx >= 0 && "Root must be a FMA instruction"); + bool IsILPReassociate = + (Pattern == MachineCombinerPattern::REASSOC_XY_AMM_BMM) || + (Pattern == MachineCombinerPattern::REASSOC_XMM_AMM_BMM); + uint16_t AddOpIdx = FMAOpIdxInfo[Idx][InfoArrayIdxAddOpIdx]; uint16_t FirstMulOpIdx = FMAOpIdxInfo[Idx][InfoArrayIdxMULOpIdx]; - MachineInstr *Prev = MRI.getUniqueVRegDef(Root.getOperand(AddOpIdx).getReg()); - MachineInstr *Leaf = - MRI.getUniqueVRegDef(Prev->getOperand(AddOpIdx).getReg()); - uint16_t IntersectedFlags = - Root.getFlags() & Prev->getFlags() & Leaf->getFlags(); + + MachineInstr *Prev = nullptr; + MachineInstr *Leaf = nullptr; + switch (Pattern) { + default: + llvm_unreachable("not recognized pattern!"); + case MachineCombinerPattern::REASSOC_XY_AMM_BMM: + case MachineCombinerPattern::REASSOC_XMM_AMM_BMM: + Prev = MRI.getUniqueVRegDef(Root.getOperand(AddOpIdx).getReg()); + Leaf = MRI.getUniqueVRegDef(Prev->getOperand(AddOpIdx).getReg()); + break; + case MachineCombinerPattern::REASSOC_XY_BAC: { + Register MULReg = + TRI->lookThruCopyLike(Root.getOperand(FirstMulOpIdx).getReg(), &MRI); + Leaf = MRI.getVRegDef(MULReg); + break; + } + case MachineCombinerPattern::REASSOC_XY_BCA: { + Register MULReg = TRI->lookThruCopyLike( + Root.getOperand(FirstMulOpIdx + 1).getReg(), &MRI); + Leaf = MRI.getVRegDef(MULReg); + break; + } + } + + uint16_t IntersectedFlags = 0; + if (IsILPReassociate) + IntersectedFlags = Root.getFlags() & Prev->getFlags() & Leaf->getFlags(); + else + IntersectedFlags = Root.getFlags() & Leaf->getFlags(); auto GetOperandInfo = [&](const MachineOperand &Operand, Register &Reg, bool &KillFlag) { @@ -484,36 +853,51 @@ void PPCInstrInfo::reassociateFMA( }; auto GetFMAInstrInfo = [&](const MachineInstr &Instr, Register &MulOp1, - Register &MulOp2, bool &MulOp1KillFlag, - bool &MulOp2KillFlag) { + Register &MulOp2, Register &AddOp, + bool &MulOp1KillFlag, bool &MulOp2KillFlag, + bool &AddOpKillFlag) { 
GetOperandInfo(Instr.getOperand(FirstMulOpIdx), MulOp1, MulOp1KillFlag); GetOperandInfo(Instr.getOperand(FirstMulOpIdx + 1), MulOp2, MulOp2KillFlag); + GetOperandInfo(Instr.getOperand(AddOpIdx), AddOp, AddOpKillFlag); }; - Register RegM11, RegM12, RegX, RegY, RegM21, RegM22, RegM31, RegM32; + Register RegM11, RegM12, RegX, RegY, RegM21, RegM22, RegM31, RegM32, RegA11, + RegA21, RegB; bool KillX = false, KillY = false, KillM11 = false, KillM12 = false, - KillM21 = false, KillM22 = false, KillM31 = false, KillM32 = false; + KillM21 = false, KillM22 = false, KillM31 = false, KillM32 = false, + KillA11 = false, KillA21 = false, KillB = false; - GetFMAInstrInfo(Root, RegM31, RegM32, KillM31, KillM32); - GetFMAInstrInfo(*Prev, RegM21, RegM22, KillM21, KillM22); + GetFMAInstrInfo(Root, RegM31, RegM32, RegB, KillM31, KillM32, KillB); + + if (IsILPReassociate) + GetFMAInstrInfo(*Prev, RegM21, RegM22, RegA21, KillM21, KillM22, KillA21); if (Pattern == MachineCombinerPattern::REASSOC_XMM_AMM_BMM) { - GetFMAInstrInfo(*Leaf, RegM11, RegM12, KillM11, KillM12); + GetFMAInstrInfo(*Leaf, RegM11, RegM12, RegA11, KillM11, KillM12, KillA11); GetOperandInfo(Leaf->getOperand(AddOpIdx), RegX, KillX); } else if (Pattern == MachineCombinerPattern::REASSOC_XY_AMM_BMM) { GetOperandInfo(Leaf->getOperand(1), RegX, KillX); GetOperandInfo(Leaf->getOperand(2), RegY, KillY); + } else { + // Get FSUB instruction info. + GetOperandInfo(Leaf->getOperand(1), RegX, KillX); + GetOperandInfo(Leaf->getOperand(2), RegY, KillY); } // Create new virtual registers for the new results instead of // recycling legacy ones because the MachineCombiner's computation of the // critical path requires a new register definition rather than an existing // one. + // For register pressure reassociation, we only need create one virtual + // register for the new fma. Register NewVRA = MRI.createVirtualRegister(RC); InstrIdxForVirtReg.insert(std::make_pair(NewVRA, 0)); - Register NewVRB = MRI.createVirtualRegister(RC); - InstrIdxForVirtReg.insert(std::make_pair(NewVRB, 1)); + Register NewVRB = 0; + if (IsILPReassociate) { + NewVRB = MRI.createVirtualRegister(RC); + InstrIdxForVirtReg.insert(std::make_pair(NewVRB, 1)); + } Register NewVRD = 0; if (Pattern == MachineCombinerPattern::REASSOC_XMM_AMM_BMM) { @@ -532,7 +916,11 @@ void PPCInstrInfo::reassociateFMA( MI->getOperand(FirstMulOpIdx + 1).setIsKill(KillRegMul2); }; - if (Pattern == MachineCombinerPattern::REASSOC_XY_AMM_BMM) { + MachineInstrBuilder NewARegPressure, NewCRegPressure; + switch (Pattern) { + default: + llvm_unreachable("not recognized pattern!"); + case MachineCombinerPattern::REASSOC_XY_AMM_BMM: { // Create new instructions for insertion. MachineInstrBuilder MINewB = BuildMI(*MF, Prev->getDebugLoc(), get(FmaOp), NewVRB) @@ -565,7 +953,9 @@ void PPCInstrInfo::reassociateFMA( InsInstrs.push_back(MINewA); InsInstrs.push_back(MINewB); InsInstrs.push_back(MINewC); - } else if (Pattern == MachineCombinerPattern::REASSOC_XMM_AMM_BMM) { + break; + } + case MachineCombinerPattern::REASSOC_XMM_AMM_BMM: { assert(NewVRD && "new FMA register not created!"); // Create new instructions for insertion. 
MachineInstrBuilder MINewA = @@ -607,6 +997,47 @@ void PPCInstrInfo::reassociateFMA( InsInstrs.push_back(MINewB); InsInstrs.push_back(MINewD); InsInstrs.push_back(MINewC); + break; + } + case MachineCombinerPattern::REASSOC_XY_BAC: + case MachineCombinerPattern::REASSOC_XY_BCA: { + Register VarReg; + bool KillVarReg = false; + if (Pattern == MachineCombinerPattern::REASSOC_XY_BCA) { + VarReg = RegM31; + KillVarReg = KillM31; + } else { + VarReg = RegM32; + KillVarReg = KillM32; + } + // We don't want to get negative const from memory pool too early, as the + // created entry will not be deleted even if it has no users. Since all + // operand of Leaf and Root are virtual register, we use zero register + // here as a placeholder. When the InsInstrs is selected in + // MachineCombiner, we call finalizeInsInstrs to replace the zero register + // with a virtual register which is a load from constant pool. + NewARegPressure = BuildMI(*MF, Root.getDebugLoc(), get(FmaOp), NewVRA) + .addReg(RegB, getKillRegState(RegB)) + .addReg(RegY, getKillRegState(KillY)) + .addReg(PPC::ZERO8); + NewCRegPressure = BuildMI(*MF, Root.getDebugLoc(), get(FmaOp), RegC) + .addReg(NewVRA, getKillRegState(true)) + .addReg(RegX, getKillRegState(KillX)) + .addReg(VarReg, getKillRegState(KillVarReg)); + // For now, we only support xsmaddadp/xsmaddasp, their add operand are + // both at index 1, no need to adjust. + // FIXME: when add more fma instructions support, like fma/fmas, adjust + // the operand index here. + break; + } + } + + if (!IsILPReassociate) { + setSpecialOperandAttr(*NewARegPressure, IntersectedFlags); + setSpecialOperandAttr(*NewCRegPressure, IntersectedFlags); + + InsInstrs.push_back(NewARegPressure); + InsInstrs.push_back(NewCRegPressure); } assert(!InsInstrs.empty() && @@ -614,7 +1045,8 @@ void PPCInstrInfo::reassociateFMA( // Record old instructions for deletion. DelInstrs.push_back(Leaf); - DelInstrs.push_back(Prev); + if (IsILPReassociate) + DelInstrs.push_back(Prev); DelInstrs.push_back(&Root); } @@ -666,7 +1098,6 @@ bool PPCInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI, case PPC::LI8: case PPC::LIS: case PPC::LIS8: - case PPC::QVGPCI: case PPC::ADDIStocHA: case PPC::ADDIStocHA8: case PPC::ADDItocL: @@ -683,6 +1114,7 @@ bool PPCInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI, case PPC::V_SETALLONES: case PPC::CRSET: case PPC::CRUNSET: + case PPC::XXSETACCZ: return true; } return false; @@ -1283,14 +1715,22 @@ void PPCInstrInfo::copyPhysReg(MachineBasicBlock &MBB, .addImm(31); return; } else if (PPC::CRRCRegClass.contains(SrcReg) && - PPC::G8RCRegClass.contains(DestReg)) { - BuildMI(MBB, I, DL, get(PPC::MFOCRF8), DestReg).addReg(SrcReg); - getKillRegState(KillSrc); - return; - } else if (PPC::CRRCRegClass.contains(SrcReg) && - PPC::GPRCRegClass.contains(DestReg)) { - BuildMI(MBB, I, DL, get(PPC::MFOCRF), DestReg).addReg(SrcReg); + (PPC::G8RCRegClass.contains(DestReg) || + PPC::GPRCRegClass.contains(DestReg))) { + bool Is64Bit = PPC::G8RCRegClass.contains(DestReg); + unsigned MvCode = Is64Bit ? PPC::MFOCRF8 : PPC::MFOCRF; + unsigned ShCode = Is64Bit ? PPC::RLWINM8 : PPC::RLWINM; + unsigned CRNum = TRI->getEncodingValue(SrcReg); + BuildMI(MBB, I, DL, get(MvCode), DestReg).addReg(SrcReg); getKillRegState(KillSrc); + if (CRNum == 7) + return; + // Shift the CR bits to make the CR field in the lowest 4 bits of GRC. 
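A small self-contained check of the rotation amount used by the RLWINM built just below, assuming MFOCRF leaves CR field CRNum in bits 4*CRNum..4*CRNum+3 (IBM bit numbering, bit 0 = MSB); rotating left by CRNum*4+4 then moves the field into the low four bits, which the 28..31 mask keeps (CR7 needs no rotation, matching the early return above):

#include <cstdint>
#include <cstdio>

static uint32_t rotl32(uint32_t V, unsigned N) {
  N %= 32;
  return N ? (V << N) | (V >> (32 - N)) : V;
}

int main() {
  for (unsigned CRNum = 0; CRNum < 8; ++CRNum) {
    uint32_t Field = 0xAu << (28 - 4 * CRNum);       // plant pattern 0b1010 in field CRNum
    uint32_t Low = rotl32(Field, (CRNum * 4 + 4) % 32) & 0xF;
    std::printf("CR%u -> 0x%X\n", CRNum, Low);       // prints 0xA for every field
  }
  return 0;
}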
+ BuildMI(MBB, I, DL, get(ShCode), DestReg) + .addReg(DestReg, RegState::Kill) + .addImm(CRNum * 4 + 4) + .addImm(28) + .addImm(31); return; } else if (PPC::G8RCRegClass.contains(SrcReg) && PPC::VSFRCRegClass.contains(DestReg)) { @@ -1343,17 +1783,53 @@ void PPCInstrInfo::copyPhysReg(MachineBasicBlock &MBB, else if (PPC::VSFRCRegClass.contains(DestReg, SrcReg) || PPC::VSSRCRegClass.contains(DestReg, SrcReg)) Opc = (Subtarget.hasP9Vector()) ? PPC::XSCPSGNDP : PPC::XXLORf; - else if (PPC::QFRCRegClass.contains(DestReg, SrcReg)) - Opc = PPC::QVFMR; - else if (PPC::QSRCRegClass.contains(DestReg, SrcReg)) - Opc = PPC::QVFMRs; - else if (PPC::QBRCRegClass.contains(DestReg, SrcReg)) - Opc = PPC::QVFMRb; + else if (Subtarget.pairedVectorMemops() && + PPC::VSRpRCRegClass.contains(DestReg, SrcReg)) { + if (SrcReg > PPC::VSRp15) + SrcReg = PPC::V0 + (SrcReg - PPC::VSRp16) * 2; + else + SrcReg = PPC::VSL0 + (SrcReg - PPC::VSRp0) * 2; + if (DestReg > PPC::VSRp15) + DestReg = PPC::V0 + (DestReg - PPC::VSRp16) * 2; + else + DestReg = PPC::VSL0 + (DestReg - PPC::VSRp0) * 2; + BuildMI(MBB, I, DL, get(PPC::XXLOR), DestReg). + addReg(SrcReg).addReg(SrcReg, getKillRegState(KillSrc)); + BuildMI(MBB, I, DL, get(PPC::XXLOR), DestReg + 1). + addReg(SrcReg + 1).addReg(SrcReg + 1, getKillRegState(KillSrc)); + return; + } else if (PPC::CRBITRCRegClass.contains(DestReg, SrcReg)) Opc = PPC::CROR; else if (PPC::SPERCRegClass.contains(DestReg, SrcReg)) Opc = PPC::EVOR; - else + else if ((PPC::ACCRCRegClass.contains(DestReg) || + PPC::UACCRCRegClass.contains(DestReg)) && + (PPC::ACCRCRegClass.contains(SrcReg) || + PPC::UACCRCRegClass.contains(SrcReg))) { + // If primed, de-prime the source register, copy the individual registers + // and prime the destination if needed. The vector subregisters are + // vs[(u)acc * 4] - vs[(u)acc * 4 + 3]. If the copy is not a kill and the + // source is primed, we need to re-prime it after the copy as well. + PPCRegisterInfo::emitAccCopyInfo(MBB, DestReg, SrcReg); + bool DestPrimed = PPC::ACCRCRegClass.contains(DestReg); + bool SrcPrimed = PPC::ACCRCRegClass.contains(SrcReg); + MCRegister VSLSrcReg = + PPC::VSL0 + (SrcReg - (SrcPrimed ? PPC::ACC0 : PPC::UACC0)) * 4; + MCRegister VSLDestReg = + PPC::VSL0 + (DestReg - (DestPrimed ? 
PPC::ACC0 : PPC::UACC0)) * 4; + if (SrcPrimed) + BuildMI(MBB, I, DL, get(PPC::XXMFACC), SrcReg).addReg(SrcReg); + for (unsigned Idx = 0; Idx < 4; Idx++) + BuildMI(MBB, I, DL, get(PPC::XXLOR), VSLDestReg + Idx) + .addReg(VSLSrcReg + Idx) + .addReg(VSLSrcReg + Idx, getKillRegState(KillSrc)); + if (DestPrimed) + BuildMI(MBB, I, DL, get(PPC::XXMTACC), DestReg).addReg(DestReg); + if (SrcPrimed && !KillSrc) + BuildMI(MBB, I, DL, get(PPC::XXMTACC), SrcReg).addReg(SrcReg); + return; + } else llvm_unreachable("Impossible reg-to-reg copy"); const MCInstrDesc &MCID = get(Opc); @@ -1364,7 +1840,7 @@ void PPCInstrInfo::copyPhysReg(MachineBasicBlock &MBB, BuildMI(MBB, I, DL, MCID, DestReg).addReg(SrcReg, getKillRegState(KillSrc)); } -static unsigned getSpillIndex(const TargetRegisterClass *RC) { +unsigned PPCInstrInfo::getSpillIndex(const TargetRegisterClass *RC) const { int OpcodeIndex = 0; if (PPC::GPRCRegClass.hasSubClassEq(RC) || @@ -1391,16 +1867,20 @@ static unsigned getSpillIndex(const TargetRegisterClass *RC) { OpcodeIndex = SOK_VectorFloat8Spill; } else if (PPC::VSSRCRegClass.hasSubClassEq(RC)) { OpcodeIndex = SOK_VectorFloat4Spill; - } else if (PPC::VRSAVERCRegClass.hasSubClassEq(RC)) { - OpcodeIndex = SOK_VRSaveSpill; - } else if (PPC::QFRCRegClass.hasSubClassEq(RC)) { - OpcodeIndex = SOK_QuadFloat8Spill; - } else if (PPC::QSRCRegClass.hasSubClassEq(RC)) { - OpcodeIndex = SOK_QuadFloat4Spill; - } else if (PPC::QBRCRegClass.hasSubClassEq(RC)) { - OpcodeIndex = SOK_QuadBitSpill; } else if (PPC::SPILLTOVSRRCRegClass.hasSubClassEq(RC)) { OpcodeIndex = SOK_SpillToVSR; + } else if (PPC::ACCRCRegClass.hasSubClassEq(RC)) { + assert(Subtarget.pairedVectorMemops() && + "Register unexpected when paired memops are disabled."); + OpcodeIndex = SOK_AccumulatorSpill; + } else if (PPC::UACCRCRegClass.hasSubClassEq(RC)) { + assert(Subtarget.pairedVectorMemops() && + "Register unexpected when paired memops are disabled."); + OpcodeIndex = SOK_UAccumulatorSpill; + } else if (PPC::VSRpRCRegClass.hasSubClassEq(RC)) { + assert(Subtarget.pairedVectorMemops() && + "Register unexpected when paired memops are disabled."); + OpcodeIndex = SOK_PairedVecSpill; } else { llvm_unreachable("Unknown regclass!"); } @@ -1437,9 +1917,6 @@ void PPCInstrInfo::StoreRegToStackSlot( PPC::CRBITRCRegClass.hasSubClassEq(RC)) FuncInfo->setSpillsCR(); - if (PPC::VRSAVERCRegClass.hasSubClassEq(RC)) - FuncInfo->setSpillsVRSAVE(); - if (isXFormMemOp(Opcode)) FuncInfo->setHasNonRISpills(); } @@ -1495,9 +1972,6 @@ void PPCInstrInfo::LoadRegFromStackSlot(MachineFunction &MF, const DebugLoc &DL, PPC::CRBITRCRegClass.hasSubClassEq(RC)) FuncInfo->setSpillsCR(); - if (PPC::VRSAVERCRegClass.hasSubClassEq(RC)) - FuncInfo->setSpillsVRSAVE(); - if (isXFormMemOp(Opcode)) FuncInfo->setHasNonRISpills(); } @@ -1667,6 +2141,17 @@ bool PPCInstrInfo::isPredicated(const MachineInstr &MI) const { return false; } +bool PPCInstrInfo::isSchedulingBoundary(const MachineInstr &MI, + const MachineBasicBlock *MBB, + const MachineFunction &MF) const { + // Set MFFS and MTFSF as scheduling boundary to avoid unexpected code motion + // across them, since some FP operations may change content of FPSCR. 
+ // TODO: Model FPSCR in PPC instruction definitions and remove the workaround + if (MI.getOpcode() == PPC::MFFS || MI.getOpcode() == PPC::MTFSF) + return true; + return TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF); +} + bool PPCInstrInfo::PredicateInstruction(MachineInstr &MI, ArrayRef<MachineOperand> Pred) const { unsigned OpC = MI.getOpcode(); @@ -1675,6 +2160,10 @@ bool PPCInstrInfo::PredicateInstruction(MachineInstr &MI, bool isPPC64 = Subtarget.isPPC64(); MI.setDesc(get(Pred[0].getImm() ? (isPPC64 ? PPC::BDNZLR8 : PPC::BDNZLR) : (isPPC64 ? PPC::BDZLR8 : PPC::BDZLR))); + // Need add Def and Use for CTR implicit operand. + MachineInstrBuilder(*MI.getParent()->getParent(), MI) + .addReg(Pred[1].getReg(), RegState::Implicit) + .addReg(Pred[1].getReg(), RegState::ImplicitDefine); } else if (Pred[0].getImm() == PPC::PRED_BIT_SET) { MI.setDesc(get(PPC::BCLR)); MachineInstrBuilder(*MI.getParent()->getParent(), MI).add(Pred[1]); @@ -1694,6 +2183,10 @@ bool PPCInstrInfo::PredicateInstruction(MachineInstr &MI, bool isPPC64 = Subtarget.isPPC64(); MI.setDesc(get(Pred[0].getImm() ? (isPPC64 ? PPC::BDNZ8 : PPC::BDNZ) : (isPPC64 ? PPC::BDZ8 : PPC::BDZ))); + // Need add Def and Use for CTR implicit operand. + MachineInstrBuilder(*MI.getParent()->getParent(), MI) + .addReg(Pred[1].getReg(), RegState::Implicit) + .addReg(Pred[1].getReg(), RegState::ImplicitDefine); } else if (Pred[0].getImm() == PPC::PRED_BIT_SET) { MachineBasicBlock *MBB = MI.getOperand(0).getMBB(); MI.RemoveOperand(0); @@ -1734,19 +2227,24 @@ bool PPCInstrInfo::PredicateInstruction(MachineInstr &MI, MI.setDesc(get(isPPC64 ? (setLR ? PPC::BCCTRL8 : PPC::BCCTR8) : (setLR ? PPC::BCCTRL : PPC::BCCTR))); MachineInstrBuilder(*MI.getParent()->getParent(), MI).add(Pred[1]); - return true; } else if (Pred[0].getImm() == PPC::PRED_BIT_UNSET) { MI.setDesc(get(isPPC64 ? (setLR ? PPC::BCCTRL8n : PPC::BCCTR8n) : (setLR ? PPC::BCCTRLn : PPC::BCCTRn))); MachineInstrBuilder(*MI.getParent()->getParent(), MI).add(Pred[1]); - return true; + } else { + MI.setDesc(get(isPPC64 ? (setLR ? PPC::BCCCTRL8 : PPC::BCCCTR8) + : (setLR ? PPC::BCCCTRL : PPC::BCCCTR))); + MachineInstrBuilder(*MI.getParent()->getParent(), MI) + .addImm(Pred[0].getImm()) + .add(Pred[1]); } - MI.setDesc(get(isPPC64 ? (setLR ? PPC::BCCCTRL8 : PPC::BCCCTR8) - : (setLR ? PPC::BCCCTRL : PPC::BCCCTR))); - MachineInstrBuilder(*MI.getParent()->getParent(), MI) - .addImm(Pred[0].getImm()) - .add(Pred[1]); + // Need add Def and Use for LR implicit operand. + if (setLR) + MachineInstrBuilder(*MI.getParent()->getParent(), MI) + .addReg(isPPC64 ? PPC::LR8 : PPC::LR, RegState::Implicit) + .addReg(isPPC64 ? PPC::LR8 : PPC::LR, RegState::ImplicitDefine); + return true; } @@ -1784,8 +2282,9 @@ bool PPCInstrInfo::SubsumesPredicate(ArrayRef<MachineOperand> Pred1, return false; } -bool PPCInstrInfo::DefinesPredicate(MachineInstr &MI, - std::vector<MachineOperand> &Pred) const { +bool PPCInstrInfo::ClobbersPredicate(MachineInstr &MI, + std::vector<MachineOperand> &Pred, + bool SkipDead) const { // Note: At the present time, the contents of Pred from this function is // unused by IfConversion. This implementation follows ARM by pushing the // CR-defining operand. Because the 'DZ' and 'DNZ' count as types of @@ -2071,6 +2570,14 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, if (NewOpC == -1) return false; + // This transformation should not be performed if `nsw` is missing and is not + // `equalityOnly` comparison. 
Since if there is overflow, sub_lt, sub_gt in + // CRReg do not reflect correct order. If `equalityOnly` is true, sub_eq in + // CRReg can reflect if compared values are equal, this optz is still valid. + if (!equalityOnly && (NewOpC == PPC::SUBF_rec || NewOpC == PPC::SUBF8_rec) && + Sub && !Sub->getFlag(MachineInstr::NoSWrap)) + return false; + // If we have SUB(r1, r2) and CMP(r2, r1), the condition code based on CMP // needs to be updated to be based on SUB. Push the condition code // operands to OperandsToUpdate. If it is safe to remove CmpInstr, the @@ -2221,6 +2728,112 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, return true; } +bool PPCInstrInfo::getMemOperandsWithOffsetWidth( + const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps, + int64_t &Offset, bool &OffsetIsScalable, unsigned &Width, + const TargetRegisterInfo *TRI) const { + const MachineOperand *BaseOp; + OffsetIsScalable = false; + if (!getMemOperandWithOffsetWidth(LdSt, BaseOp, Offset, Width, TRI)) + return false; + BaseOps.push_back(BaseOp); + return true; +} + +static bool isLdStSafeToCluster(const MachineInstr &LdSt, + const TargetRegisterInfo *TRI) { + // If this is a volatile load/store, don't mess with it. + if (LdSt.hasOrderedMemoryRef() || LdSt.getNumExplicitOperands() != 3) + return false; + + if (LdSt.getOperand(2).isFI()) + return true; + + assert(LdSt.getOperand(2).isReg() && "Expected a reg operand."); + // Can't cluster if the instruction modifies the base register + // or it is update form. e.g. ld r2,3(r2) + if (LdSt.modifiesRegister(LdSt.getOperand(2).getReg(), TRI)) + return false; + + return true; +} + +// Only cluster instruction pair that have the same opcode, and they are +// clusterable according to PowerPC specification. +static bool isClusterableLdStOpcPair(unsigned FirstOpc, unsigned SecondOpc, + const PPCSubtarget &Subtarget) { + switch (FirstOpc) { + default: + return false; + case PPC::STD: + case PPC::STFD: + case PPC::STXSD: + case PPC::DFSTOREf64: + return FirstOpc == SecondOpc; + // PowerPC backend has opcode STW/STW8 for instruction "stw" to deal with + // 32bit and 64bit instruction selection. They are clusterable pair though + // they are different opcode. + case PPC::STW: + case PPC::STW8: + return SecondOpc == PPC::STW || SecondOpc == PPC::STW8; + } +} + +bool PPCInstrInfo::shouldClusterMemOps( + ArrayRef<const MachineOperand *> BaseOps1, + ArrayRef<const MachineOperand *> BaseOps2, unsigned NumLoads, + unsigned NumBytes) const { + + assert(BaseOps1.size() == 1 && BaseOps2.size() == 1); + const MachineOperand &BaseOp1 = *BaseOps1.front(); + const MachineOperand &BaseOp2 = *BaseOps2.front(); + assert((BaseOp1.isReg() || BaseOp1.isFI()) && + "Only base registers and frame indices are supported."); + + // The NumLoads means the number of loads that has been clustered. + // Don't cluster memory op if there are already two ops clustered at least. + if (NumLoads > 2) + return false; + + // Cluster the load/store only when they have the same base + // register or FI. + if ((BaseOp1.isReg() != BaseOp2.isReg()) || + (BaseOp1.isReg() && BaseOp1.getReg() != BaseOp2.getReg()) || + (BaseOp1.isFI() && BaseOp1.getIndex() != BaseOp2.getIndex())) + return false; + + // Check if the load/store are clusterable according to the PowerPC + // specification. 
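For context, a hedged illustration of the adjacency test that shouldClusterMemOps applies after the checks below: two same-width accesses off the same base are clustered only if the first ends exactly where the second begins. The helper and the values are illustrative, not part of the patch:

#include <cstdint>
#include <cstdio>

// Caller guarantees Off1 <= Off2 and equal access widths.
static bool contiguous(int64_t Off1, unsigned Width1, int64_t Off2) {
  return Off1 + Width1 == Off2;
}

int main() {
  std::printf("%d\n", (int)contiguous(16, 8, 24)); // 1: e.g. std at 16(r1) and 24(r1) cluster
  std::printf("%d\n", (int)contiguous(16, 8, 32)); // 0: 8-byte gap, no clustering
  return 0;
}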
+ const MachineInstr &FirstLdSt = *BaseOp1.getParent(); + const MachineInstr &SecondLdSt = *BaseOp2.getParent(); + unsigned FirstOpc = FirstLdSt.getOpcode(); + unsigned SecondOpc = SecondLdSt.getOpcode(); + const TargetRegisterInfo *TRI = &getRegisterInfo(); + // Cluster the load/store only when they have the same opcode, and they are + // clusterable opcode according to PowerPC specification. + if (!isClusterableLdStOpcPair(FirstOpc, SecondOpc, Subtarget)) + return false; + + // Can't cluster load/store that have ordered or volatile memory reference. + if (!isLdStSafeToCluster(FirstLdSt, TRI) || + !isLdStSafeToCluster(SecondLdSt, TRI)) + return false; + + int64_t Offset1 = 0, Offset2 = 0; + unsigned Width1 = 0, Width2 = 0; + const MachineOperand *Base1 = nullptr, *Base2 = nullptr; + if (!getMemOperandWithOffsetWidth(FirstLdSt, Base1, Offset1, Width1, TRI) || + !getMemOperandWithOffsetWidth(SecondLdSt, Base2, Offset2, Width2, TRI) || + Width1 != Width2) + return false; + + assert(Base1 == &BaseOp1 && Base2 == &BaseOp2 && + "getMemOperandWithOffsetWidth return incorrect base op"); + // The caller should already have ordered FirstMemOp/SecondMemOp by offset. + assert(Offset1 <= Offset2 && "Caller should have ordered offsets."); + return Offset1 + Width1 == Offset2; +} + /// GetInstSize - Return the number of bytes of code the specified /// instruction may be. This returns the maximum number of bytes. /// @@ -2270,7 +2883,14 @@ PPCInstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const { {MO_PLT, "ppc-plt"}, {MO_PIC_FLAG, "ppc-pic"}, {MO_PCREL_FLAG, "ppc-pcrel"}, - {MO_GOT_FLAG, "ppc-got"}}; + {MO_GOT_FLAG, "ppc-got"}, + {MO_PCREL_OPT_FLAG, "ppc-opt-pcrel"}, + {MO_TLSGD_FLAG, "ppc-tlsgd"}, + {MO_TLSLD_FLAG, "ppc-tlsld"}, + {MO_TPREL_FLAG, "ppc-tprel"}, + {MO_GOT_TLSGD_PCREL_FLAG, "ppc-got-tlsgd-pcrel"}, + {MO_GOT_TLSLD_PCREL_FLAG, "ppc-got-tlsld-pcrel"}, + {MO_GOT_TPREL_PCREL_FLAG, "ppc-got-tprel-pcrel"}}; return makeArrayRef(TargetFlags); } @@ -2351,6 +2971,31 @@ bool PPCInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { auto DL = MI.getDebugLoc(); switch (MI.getOpcode()) { + case PPC::BUILD_UACC: { + MCRegister ACC = MI.getOperand(0).getReg(); + MCRegister UACC = MI.getOperand(1).getReg(); + if (ACC - PPC::ACC0 != UACC - PPC::UACC0) { + MCRegister SrcVSR = PPC::VSL0 + (UACC - PPC::UACC0) * 4; + MCRegister DstVSR = PPC::VSL0 + (ACC - PPC::ACC0) * 4; + // FIXME: This can easily be improved to look up to the top of the MBB + // to see if the inputs are XXLOR's. If they are and SrcReg is killed, + // we can just re-target any such XXLOR's to DstVSR + offset. + for (int VecNo = 0; VecNo < 4; VecNo++) + BuildMI(MBB, MI, DL, get(PPC::XXLOR), DstVSR + VecNo) + .addReg(SrcVSR + VecNo) + .addReg(SrcVSR + VecNo); + } + // BUILD_UACC is expanded to 4 copies of the underlying vsx regisers. + // So after building the 4 copies, we can replace the BUILD_UACC instruction + // with a NOP. + LLVM_FALLTHROUGH; + } + case PPC::KILL_PAIR: { + MI.setDesc(get(PPC::UNENCODED_NOP)); + MI.RemoveOperand(1); + MI.RemoveOperand(0); + return true; + } case TargetOpcode::LOAD_STACK_GUARD: { assert(Subtarget.isTargetLinux() && "Only Linux target is expected to contain LOAD_STACK_GUARD"); @@ -2642,7 +3287,10 @@ MachineInstr *PPCInstrInfo::getForwardingDefMI( } unsigned PPCInstrInfo::getSpillTarget() const { - return Subtarget.hasP9Vector() ? 1 : 0; + // With P10, we may need to spill paired vector registers or accumulator + // registers. MMA implies paired vectors, so we can just check that. 
+ bool IsP10Variant = Subtarget.isISA3_1() || Subtarget.pairedVectorMemops(); + return IsP10Variant ? 2 : Subtarget.hasP9Vector() ? 1 : 0; } const unsigned *PPCInstrInfo::getStoreOpcodesForSpillArray() const { @@ -2653,13 +3301,35 @@ const unsigned *PPCInstrInfo::getLoadOpcodesForSpillArray() const { return LoadSpillOpcodesArray[getSpillTarget()]; } -void PPCInstrInfo::fixupIsDeadOrKill(MachineInstr &StartMI, MachineInstr &EndMI, +void PPCInstrInfo::fixupIsDeadOrKill(MachineInstr *StartMI, MachineInstr *EndMI, unsigned RegNo) const { + // Conservatively clear kill flag for the register if the instructions are in + // different basic blocks and in SSA form, because the kill flag may no longer + // be right. There is no need to bother with dead flags since defs with no + // uses will be handled by DCE. + MachineRegisterInfo &MRI = StartMI->getParent()->getParent()->getRegInfo(); + if (MRI.isSSA() && (StartMI->getParent() != EndMI->getParent())) { + MRI.clearKillFlags(RegNo); + return; + } // Instructions between [StartMI, EndMI] should be in same basic block. - assert((StartMI.getParent() == EndMI.getParent()) && + assert((StartMI->getParent() == EndMI->getParent()) && "Instructions are not in same basic block"); + // If before RA, StartMI may be def through COPY, we need to adjust it to the + // real def. See function getForwardingDefMI. + if (MRI.isSSA()) { + bool Reads, Writes; + std::tie(Reads, Writes) = StartMI->readsWritesVirtualRegister(RegNo); + if (!Reads && !Writes) { + assert(Register::isVirtualRegister(RegNo) && + "Must be a virtual register"); + // Get real def and ignore copies. + StartMI = MRI.getVRegDef(RegNo); + } + } + bool IsKillSet = false; auto clearOperandKillInfo = [=] (MachineInstr &MI, unsigned Index) { @@ -2672,21 +3342,21 @@ void PPCInstrInfo::fixupIsDeadOrKill(MachineInstr &StartMI, MachineInstr &EndMI, // Set killed flag for EndMI. // No need to do anything if EndMI defines RegNo. int UseIndex = - EndMI.findRegisterUseOperandIdx(RegNo, false, &getRegisterInfo()); + EndMI->findRegisterUseOperandIdx(RegNo, false, &getRegisterInfo()); if (UseIndex != -1) { - EndMI.getOperand(UseIndex).setIsKill(true); + EndMI->getOperand(UseIndex).setIsKill(true); IsKillSet = true; // Clear killed flag for other EndMI operands related to RegNo. In some // upexpected cases, killed may be set multiple times for same register // operand in same MI. - for (int i = 0, e = EndMI.getNumOperands(); i != e; ++i) + for (int i = 0, e = EndMI->getNumOperands(); i != e; ++i) if (i != UseIndex) - clearOperandKillInfo(EndMI, i); + clearOperandKillInfo(*EndMI, i); } // Walking the inst in reverse order (EndMI -> StartMI]. - MachineBasicBlock::reverse_iterator It = EndMI; - MachineBasicBlock::reverse_iterator E = EndMI.getParent()->rend(); + MachineBasicBlock::reverse_iterator It = *EndMI; + MachineBasicBlock::reverse_iterator E = EndMI->getParent()->rend(); // EndMI has been handled above, skip it here. It++; MachineOperand *MO = nullptr; @@ -2712,13 +3382,13 @@ void PPCInstrInfo::fixupIsDeadOrKill(MachineInstr &StartMI, MachineInstr &EndMI, } else if ((MO = It->findRegisterDefOperand(RegNo, false, true, &getRegisterInfo()))) { // No use found, set dead for its def. - assert(&*It == &StartMI && "No new def between StartMI and EndMI."); + assert(&*It == StartMI && "No new def between StartMI and EndMI."); MO->setIsDead(true); break; } } - if ((&*It) == &StartMI) + if ((&*It) == StartMI) break; } // Ensure RegMo liveness is killed after EndMI. 
@@ -3011,6 +3681,143 @@ bool PPCInstrInfo::convertToImmediateForm(MachineInstr &MI, return false; } +bool PPCInstrInfo::combineRLWINM(MachineInstr &MI, + MachineInstr **ToErase) const { + MachineRegisterInfo *MRI = &MI.getParent()->getParent()->getRegInfo(); + unsigned FoldingReg = MI.getOperand(1).getReg(); + if (!Register::isVirtualRegister(FoldingReg)) + return false; + MachineInstr *SrcMI = MRI->getVRegDef(FoldingReg); + if (SrcMI->getOpcode() != PPC::RLWINM && + SrcMI->getOpcode() != PPC::RLWINM_rec && + SrcMI->getOpcode() != PPC::RLWINM8 && + SrcMI->getOpcode() != PPC::RLWINM8_rec) + return false; + assert((MI.getOperand(2).isImm() && MI.getOperand(3).isImm() && + MI.getOperand(4).isImm() && SrcMI->getOperand(2).isImm() && + SrcMI->getOperand(3).isImm() && SrcMI->getOperand(4).isImm()) && + "Invalid PPC::RLWINM Instruction!"); + uint64_t SHSrc = SrcMI->getOperand(2).getImm(); + uint64_t SHMI = MI.getOperand(2).getImm(); + uint64_t MBSrc = SrcMI->getOperand(3).getImm(); + uint64_t MBMI = MI.getOperand(3).getImm(); + uint64_t MESrc = SrcMI->getOperand(4).getImm(); + uint64_t MEMI = MI.getOperand(4).getImm(); + + assert((MEMI < 32 && MESrc < 32 && MBMI < 32 && MBSrc < 32) && + "Invalid PPC::RLWINM Instruction!"); + // If MBMI is bigger than MEMI, we always can not get run of ones. + // RotatedSrcMask non-wrap: + // 0........31|32........63 + // RotatedSrcMask: B---E B---E + // MaskMI: -----------|--E B------ + // Result: ----- --- (Bad candidate) + // + // RotatedSrcMask wrap: + // 0........31|32........63 + // RotatedSrcMask: --E B----|--E B---- + // MaskMI: -----------|--E B------ + // Result: --- -----|--- ----- (Bad candidate) + // + // One special case is RotatedSrcMask is a full set mask. + // RotatedSrcMask full: + // 0........31|32........63 + // RotatedSrcMask: ------EB---|-------EB--- + // MaskMI: -----------|--E B------ + // Result: -----------|--- ------- (Good candidate) + + // Mark special case. + bool SrcMaskFull = (MBSrc - MESrc == 1) || (MBSrc == 0 && MESrc == 31); + + // For other MBMI > MEMI cases, just return. + if ((MBMI > MEMI) && !SrcMaskFull) + return false; + + // Handle MBMI <= MEMI cases. + APInt MaskMI = APInt::getBitsSetWithWrap(32, 32 - MEMI - 1, 32 - MBMI); + // In MI, we only need low 32 bits of SrcMI, just consider about low 32 + // bit of SrcMI mask. Note that in APInt, lowerest bit is at index 0, + // while in PowerPC ISA, lowerest bit is at index 63. + APInt MaskSrc = APInt::getBitsSetWithWrap(32, 32 - MESrc - 1, 32 - MBSrc); + + APInt RotatedSrcMask = MaskSrc.rotl(SHMI); + APInt FinalMask = RotatedSrcMask & MaskMI; + uint32_t NewMB, NewME; + bool Simplified = false; + + // If final mask is 0, MI result should be 0 too. + if (FinalMask.isNullValue()) { + bool Is64Bit = + (MI.getOpcode() == PPC::RLWINM8 || MI.getOpcode() == PPC::RLWINM8_rec); + Simplified = true; + LLVM_DEBUG(dbgs() << "Replace Instr: "); + LLVM_DEBUG(MI.dump()); + + if (MI.getOpcode() == PPC::RLWINM || MI.getOpcode() == PPC::RLWINM8) { + // Replace MI with "LI 0" + MI.RemoveOperand(4); + MI.RemoveOperand(3); + MI.RemoveOperand(2); + MI.getOperand(1).ChangeToImmediate(0); + MI.setDesc(get(Is64Bit ? PPC::LI8 : PPC::LI)); + } else { + // Replace MI with "ANDI_rec reg, 0" + MI.RemoveOperand(4); + MI.RemoveOperand(3); + MI.getOperand(2).setImm(0); + MI.setDesc(get(Is64Bit ? 
PPC::ANDI8_rec : PPC::ANDI_rec)); + MI.getOperand(1).setReg(SrcMI->getOperand(1).getReg()); + if (SrcMI->getOperand(1).isKill()) { + MI.getOperand(1).setIsKill(true); + SrcMI->getOperand(1).setIsKill(false); + } else + // About to replace MI.getOperand(1), clear its kill flag. + MI.getOperand(1).setIsKill(false); + } + + LLVM_DEBUG(dbgs() << "With: "); + LLVM_DEBUG(MI.dump()); + + } else if ((isRunOfOnes((unsigned)(FinalMask.getZExtValue()), NewMB, NewME) && + NewMB <= NewME) || + SrcMaskFull) { + // Here we only handle MBMI <= MEMI case, so NewMB must be no bigger + // than NewME. Otherwise we get a 64 bit value after folding, but MI + // return a 32 bit value. + Simplified = true; + LLVM_DEBUG(dbgs() << "Converting Instr: "); + LLVM_DEBUG(MI.dump()); + + uint16_t NewSH = (SHSrc + SHMI) % 32; + MI.getOperand(2).setImm(NewSH); + // If SrcMI mask is full, no need to update MBMI and MEMI. + if (!SrcMaskFull) { + MI.getOperand(3).setImm(NewMB); + MI.getOperand(4).setImm(NewME); + } + MI.getOperand(1).setReg(SrcMI->getOperand(1).getReg()); + if (SrcMI->getOperand(1).isKill()) { + MI.getOperand(1).setIsKill(true); + SrcMI->getOperand(1).setIsKill(false); + } else + // About to replace MI.getOperand(1), clear its kill flag. + MI.getOperand(1).setIsKill(false); + + LLVM_DEBUG(dbgs() << "To: "); + LLVM_DEBUG(MI.dump()); + } + if (Simplified & MRI->use_nodbg_empty(FoldingReg) && + !SrcMI->hasImplicitDef()) { + // If FoldingReg has no non-debug use and it has no implicit def (it + // is not RLWINMO or RLWINM8o), it's safe to delete its def SrcMI. + // Otherwise keep it. + *ToErase = SrcMI; + LLVM_DEBUG(dbgs() << "Delete dead instruction: "); + LLVM_DEBUG(SrcMI->dump()); + } + return Simplified; +} + bool PPCInstrInfo::instrHasImmForm(unsigned Opc, bool IsVFReg, ImmInstrInfo &III, bool PostRA) const { // The vast majority of the instructions would need their operand 2 replaced @@ -3732,6 +4539,20 @@ bool PPCInstrInfo::simplifyToLI(MachineInstr &MI, MachineInstr &DefMI, } return false; } + case PPC::SUBFIC: + case PPC::SUBFIC8: { + // Only transform this if the CARRY implicit operand is dead. 
+ if (MI.getNumOperands() > 3 && !MI.getOperand(3).isDead()) + return false; + int64_t Minuend = MI.getOperand(2).getImm(); + if (isInt<16>(Minuend - SExtImm)) { + ReplaceWithLI = true; + Is64BitLI = Opc == PPC::SUBFIC8; + NewImm = Minuend - SExtImm; + break; + } + return false; + } case PPC::RLDICL: case PPC::RLDICL_rec: case PPC::RLDICL_32: @@ -3849,7 +4670,7 @@ bool PPCInstrInfo::simplifyToLI(MachineInstr &MI, MachineInstr &DefMI, // ForwardingOperandReg = LI imm1 // y = op2 imm2, ForwardingOperandReg(killed) if (IsForwardingOperandKilled) - fixupIsDeadOrKill(DefMI, MI, ForwardingOperandReg); + fixupIsDeadOrKill(&DefMI, &MI, ForwardingOperandReg); LLVM_DEBUG(dbgs() << "With:\n"); LLVM_DEBUG(MI.dump()); @@ -3941,9 +4762,9 @@ bool PPCInstrInfo::transformToNewImmFormFedByAdd( // Update kill flag if (RegMO->isKill() || IsKilledFor(RegMO->getReg())) - fixupIsDeadOrKill(DefMI, MI, RegMO->getReg()); + fixupIsDeadOrKill(&DefMI, &MI, RegMO->getReg()); if (ForwardKilledOperandReg != ~0U) - fixupIsDeadOrKill(DefMI, MI, ForwardKilledOperandReg); + fixupIsDeadOrKill(&DefMI, &MI, ForwardKilledOperandReg); } LLVM_DEBUG(dbgs() << "With:\n"); @@ -4054,12 +4875,12 @@ bool PPCInstrInfo::transformToImmFormFedByAdd( // x = ADD reg(killed), imm // y = XOP 0, x if (IsFwdFeederRegKilled || RegMO->isKill()) - fixupIsDeadOrKill(DefMI, MI, RegMO->getReg()); + fixupIsDeadOrKill(&DefMI, &MI, RegMO->getReg()); // Pattern 3: // ForwardKilledOperandReg = ADD reg, imm // y = XOP 0, ForwardKilledOperandReg(killed) if (ForwardKilledOperandReg != ~0U) - fixupIsDeadOrKill(DefMI, MI, ForwardKilledOperandReg); + fixupIsDeadOrKill(&DefMI, &MI, ForwardKilledOperandReg); LLVM_DEBUG(dbgs() << "With:\n"); LLVM_DEBUG(MI.dump()); @@ -4215,7 +5036,7 @@ bool PPCInstrInfo::transformToImmFormFedByLI(MachineInstr &MI, // ForwardKilledOperandReg = LI imm // y = XOP reg, ForwardKilledOperandReg(killed) if (ForwardKilledOperandReg != ~0U) - fixupIsDeadOrKill(DefMI, MI, ForwardKilledOperandReg); + fixupIsDeadOrKill(&DefMI, &MI, ForwardKilledOperandReg); return true; } @@ -4618,13 +5439,15 @@ MachineInstr *PPCInstrInfo::findLoopInstr( bool PPCInstrInfo::getMemOperandWithOffsetWidth( const MachineInstr &LdSt, const MachineOperand *&BaseReg, int64_t &Offset, unsigned &Width, const TargetRegisterInfo *TRI) const { - if (!LdSt.mayLoadOrStore()) + if (!LdSt.mayLoadOrStore() || LdSt.getNumExplicitOperands() != 3) return false; // Handle only loads/stores with base register followed by immediate offset. - if (LdSt.getNumExplicitOperands() != 3) + if (!LdSt.getOperand(1).isImm() || + (!LdSt.getOperand(2).isReg() && !LdSt.getOperand(2).isFI())) return false; - if (!LdSt.getOperand(1).isImm() || !LdSt.getOperand(2).isReg()) + if (!LdSt.getOperand(1).isImm() || + (!LdSt.getOperand(2).isReg() && !LdSt.getOperand(2).isFI())) return false; if (!LdSt.hasOneMemOperand()) |
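A worked example of the mask algebra used by combineRLWINM above, with made-up shift/mask values: composing rlwinm(SH=4, MB=0, ME=27) followed by rlwinm(SH=8, MB=0, ME=23) rotates the source mask by the second shift, intersects it with the second mask, and adds the shift amounts modulo 32, yielding rlwinm(SH=12, MB=0, ME=19):

#include <cstdint>
#include <cstdio>

static uint32_t rotl32(uint32_t V, unsigned N) {
  N %= 32;
  return N ? (V << N) | (V >> (32 - N)) : V;
}

// Build a PowerPC MB..ME mask (bit 0 is the most significant bit, wrap allowed).
static uint32_t ppcMask(unsigned MB, unsigned ME) {
  uint32_t M = 0;
  for (unsigned I = MB;; I = (I + 1) % 32) {
    M |= 0x80000000u >> I;
    if (I == ME)
      break;
  }
  return M;
}

int main() {
  unsigned SHSrc = 4, MBSrc = 0, MESrc = 27;       // first rlwinm
  unsigned SHMI = 8, MBMI = 0, MEMI = 23;          // second rlwinm
  uint32_t MaskSrc = ppcMask(MBSrc, MESrc);        // 0xFFFFFFF0
  uint32_t MaskMI = ppcMask(MBMI, MEMI);           // 0xFFFFFF00
  uint32_t Final = rotl32(MaskSrc, SHMI) & MaskMI; // 0xFFFFF000, a run of ones (MB=0, ME=19)
  unsigned NewSH = (SHSrc + SHMI) % 32;            // 12
  std::printf("final mask 0x%08X, new SH %u\n", Final, NewSH);
  return 0;
}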