Diffstat (limited to 'llvm/lib/Target/AMDGPU')
46 files changed, 1224 insertions, 530 deletions
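The centerpiece of this diff is VOPD support for GFX11: a post-RA scheduler mutation (GCNVOPDUtils.cpp) pairs dual-issue candidates back-to-back, and a new pre-emit pass (GCNCreateVOPD.cpp) fuses each legal pair of wave32 VALU instructions into one VOPD instruction. Legality hinges on checkVOPDRegConstraints: at most one unique literal, at most two scalar-bus operands, no shared VGPR bank between corresponding source ports, and destination registers of opposite parity. The sketch below is a minimal standalone rendering of that bank rule, assuming the pass's four-bank model and its convention that a register number of 0 means "operand absent"; it is illustrative, not LLVM API.

#include <cstdint>

constexpr unsigned NumVGPRBanks = 4;

// One VOPD component's VGPR operands, as gathered by the pass; 0 = absent.
struct ComponentRegs {
  uint32_t Dst = 0, Reg0 = 0, Reg1 = 0, Reg2 = 0;
};

// Mirrors the port checks in checkVOPDRegConstraints below: src0/src1 of
// the two components must read different VGPR banks; src2 and the two
// destinations must differ in the low bit (opposite bank parity).
bool vgprBanksCompatible(const ComponentRegs &X, const ComponentRegs &Y) {
  if (X.Reg0 && Y.Reg0 && X.Reg0 % NumVGPRBanks == Y.Reg0 % NumVGPRBanks)
    return false; // port 0 conflict
  if (X.Reg1 && Y.Reg1 && X.Reg1 % NumVGPRBanks == Y.Reg1 % NumVGPRBanks)
    return false; // port 1 conflict
  if (X.Reg2 && Y.Reg2 && ((X.Reg2 ^ Y.Reg2) & 1) == 0)
    return false; // port 2 conflict
  return ((X.Dst ^ Y.Dst) & 1) != 0; // dst parity must differ
}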
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index c4680cbedadf..91dc611fb265 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -317,6 +317,9 @@ extern char &SIFormMemoryClausesID; void initializeSIPostRABundlerPass(PassRegistry&); extern char &SIPostRABundlerID; +void initializeGCNCreateVOPDPass(PassRegistry &); +extern char &GCNCreateVOPDID; + void initializeAMDGPUUnifyDivergentExitNodesPass(PassRegistry&); extern char &AMDGPUUnifyDivergentExitNodesID; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp index 94d7844e8a32..a8108b1d637b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -626,13 +626,13 @@ bool AMDGPUCodeGenPrepare::foldBinOpIntoSelect(BinaryOperator &BO) const { Constant *FoldedT = SelOpNo ? ConstantFoldBinaryOpOperands(BO.getOpcode(), CBO, CT, *DL) : ConstantFoldBinaryOpOperands(BO.getOpcode(), CT, CBO, *DL); - if (isa<ConstantExpr>(FoldedT)) + if (!FoldedT || isa<ConstantExpr>(FoldedT)) return false; Constant *FoldedF = SelOpNo ? ConstantFoldBinaryOpOperands(BO.getOpcode(), CBO, CF, *DL) : ConstantFoldBinaryOpOperands(BO.getOpcode(), CF, CBO, *DL); - if (isa<ConstantExpr>(FoldedF)) + if (!FoldedF || isa<ConstantExpr>(FoldedF)) return false; IRBuilder<> Builder(&BO); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index b00df27f5fd3..589992c7a7ec 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -1883,20 +1883,24 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr, return true; } +// Match an immediate (if Imm is true) or an SGPR (if Imm is false) +// offset. If Imm32Only is true, match only 32-bit immediate offsets +// available on CI. 
bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode, - SDValue &Offset, bool &Imm) const { + SDValue &Offset, bool Imm, + bool Imm32Only) const { ConstantSDNode *C = dyn_cast<ConstantSDNode>(ByteOffsetNode); if (!C) { + if (Imm) + return false; if (ByteOffsetNode.getValueType().isScalarInteger() && ByteOffsetNode.getValueType().getSizeInBits() == 32) { Offset = ByteOffsetNode; - Imm = false; return true; } if (ByteOffsetNode.getOpcode() == ISD::ZERO_EXTEND) { if (ByteOffsetNode.getOperand(0).getValueType().getSizeInBits() == 32) { Offset = ByteOffsetNode.getOperand(0); - Imm = false; return true; } } @@ -1908,9 +1912,8 @@ bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode, int64_t ByteOffset = C->getSExtValue(); Optional<int64_t> EncodedOffset = AMDGPU::getSMRDEncodedOffset(*Subtarget, ByteOffset, false); - if (EncodedOffset) { + if (EncodedOffset && Imm && !Imm32Only) { Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32); - Imm = true; return true; } @@ -1919,7 +1922,7 @@ bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode, return false; EncodedOffset = AMDGPU::getSMRDEncodedLiteralOffset32(*Subtarget, ByteOffset); - if (EncodedOffset) { + if (EncodedOffset && Imm32Only) { Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32); return true; } @@ -1927,11 +1930,14 @@ bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode, if (!isUInt<32>(ByteOffset) && !isInt<32>(ByteOffset)) return false; - SDValue C32Bit = CurDAG->getTargetConstant(ByteOffset, SL, MVT::i32); - Offset = SDValue( - CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, C32Bit), 0); + if (!Imm) { + SDValue C32Bit = CurDAG->getTargetConstant(ByteOffset, SL, MVT::i32); + Offset = SDValue( + CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, C32Bit), 0); + return true; + } - return true; + return false; } SDValue AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr) const { @@ -1959,8 +1965,12 @@ SDValue AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr) const { Ops), 0); } +// Match a base and an immediate (if Imm is true) or an SGPR +// (if Imm is false) offset. If Imm32Only is true, match only 32-bit +// immediate offsets available on CI. 
bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase, - SDValue &Offset, bool &Imm) const { + SDValue &Offset, bool Imm, + bool Imm32Only) const { SDLoc SL(Addr); // A 32-bit (address + offset) should not cause unsigned 32-bit integer @@ -1977,41 +1987,34 @@ bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase, assert(N0 && N1 && isa<ConstantSDNode>(N1)); } if (N0 && N1) { - if (SelectSMRDOffset(N1, Offset, Imm)) { + if (SelectSMRDOffset(N1, Offset, Imm, Imm32Only)) { SBase = Expand32BitAddress(N0); return true; } } + return false; } + if (!Imm) + return false; SBase = Expand32BitAddress(Addr); Offset = CurDAG->getTargetConstant(0, SL, MVT::i32); - Imm = true; return true; } bool AMDGPUDAGToDAGISel::SelectSMRDImm(SDValue Addr, SDValue &SBase, SDValue &Offset) const { - bool Imm = false; - return SelectSMRD(Addr, SBase, Offset, Imm) && Imm; + return SelectSMRD(Addr, SBase, Offset, /* Imm */ true); } bool AMDGPUDAGToDAGISel::SelectSMRDImm32(SDValue Addr, SDValue &SBase, SDValue &Offset) const { - assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS); - - bool Imm = false; - if (!SelectSMRD(Addr, SBase, Offset, Imm)) - return false; - - return !Imm && isa<ConstantSDNode>(Offset); + return SelectSMRD(Addr, SBase, Offset, /* Imm */ true, /* Imm32Only */ true); } bool AMDGPUDAGToDAGISel::SelectSMRDSgpr(SDValue Addr, SDValue &SBase, SDValue &Offset) const { - bool Imm = false; - return SelectSMRD(Addr, SBase, Offset, Imm) && !Imm && - !isa<ConstantSDNode>(Offset); + return SelectSMRD(Addr, SBase, Offset, /* Imm */ false); } bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm(SDValue Addr, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h index 862be9dc5568..7894b8eb5b67 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h @@ -193,11 +193,11 @@ private: bool SelectScratchSVAddr(SDNode *N, SDValue Addr, SDValue &VAddr, SDValue &SAddr, SDValue &Offset) const; - bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue &Offset, - bool &Imm) const; + bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue &Offset, bool Imm, + bool Imm32Only) const; SDValue Expand32BitAddress(SDValue Addr) const; - bool SelectSMRD(SDValue Addr, SDValue &SBase, SDValue &Offset, - bool &Imm) const; + bool SelectSMRD(SDValue Addr, SDValue &SBase, SDValue &Offset, bool Imm, + bool Imm32Only = false) const; bool SelectSMRDImm(SDValue Addr, SDValue &SBase, SDValue &Offset) const; bool SelectSMRDImm32(SDValue Addr, SDValue &SBase, SDValue &Offset) const; bool SelectSMRDSgpr(SDValue Addr, SDValue &SBase, SDValue &Offset) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index ef7929012597..bf520a560404 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -4803,6 +4803,8 @@ AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { case AtomicRMWInst::Nand: case AtomicRMWInst::FAdd: case AtomicRMWInst::FSub: + case AtomicRMWInst::FMax: + case AtomicRMWInst::FMin: return AtomicExpansionKind::CmpXChg; default: return AtomicExpansionKind::None; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 3f242fdb6d8e..70fae9d784a2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -1180,7 +1180,7 @@ bool 
AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const { getIConstantVRegValWithLookThrough(I.getOperand(2).getReg(), *MRI); if (Arg) { - const int64_t Value = Arg.getValue().Value.getSExtValue(); + const int64_t Value = Arg.value().Value.getSExtValue(); if (Value == 0) { unsigned Opcode = Is64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32; BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0); @@ -3235,7 +3235,7 @@ static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg) { // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0) const MachineInstr *Def = getDefIgnoringCopies(Reg, MRI); if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES) - return false; + return Register(); if (mi_match(Def->getOperand(2).getReg(), MRI, m_ZeroInt())) { return Def->getOperand(1).getReg(); @@ -3851,27 +3851,36 @@ AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const { getAddrModeInfo(*MI, *MRI, AddrInfo); // FIXME: We should shrink the GEP if the offset is known to be <= 32-bits, - // then we can select all ptr + 32-bit offsets not just immediate offsets. - if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1) + // then we can select all ptr + 32-bit offsets. + if (AddrInfo.empty()) return None; const GEPInfo &GEPInfo = AddrInfo[0]; + Register PtrReg = GEPInfo.SgprParts[0]; + // SGPR offset is unsigned. - if (!GEPInfo.Imm || GEPInfo.Imm < 0 || !isUInt<32>(GEPInfo.Imm)) - return None; + if (AddrInfo[0].SgprParts.size() == 1 && isUInt<32>(GEPInfo.Imm) && + GEPInfo.Imm != 0) { + // If we make it this far we have a load with an 32-bit immediate offset. + // It is OK to select this using a sgpr offset, because we have already + // failed trying to select this load into one of the _IMM variants since + // the _IMM Patterns are considered before the _SGPR patterns. + Register OffsetReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); + BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), OffsetReg) + .addImm(GEPInfo.Imm); + return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); }, + [=](MachineInstrBuilder &MIB) { MIB.addReg(OffsetReg); }}}; + } - // If we make it this far we have a load with an 32-bit immediate offset. - // It is OK to select this using a sgpr offset, because we have already - // failed trying to select this load into one of the _IMM variants since - // the _IMM Patterns are considered before the _SGPR patterns. 
- Register PtrReg = GEPInfo.SgprParts[0]; - Register OffsetReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); - BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), OffsetReg) - .addImm(GEPInfo.Imm); - return {{ - [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); }, - [=](MachineInstrBuilder &MIB) { MIB.addReg(OffsetReg); } - }}; + if (AddrInfo[0].SgprParts.size() == 2 && GEPInfo.Imm == 0) { + if (Register OffsetReg = + matchZeroExtendFromS32(*MRI, GEPInfo.SgprParts[1])) { + return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); }, + [=](MachineInstrBuilder &MIB) { MIB.addReg(OffsetReg); }}}; + } + } + + return None; } std::pair<Register, int> @@ -4231,7 +4240,7 @@ AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const { }, [=](MachineInstrBuilder &MIB) { // vaddr if (FI) - MIB.addFrameIndex(FI.getValue()); + MIB.addFrameIndex(FI.value()); else MIB.addReg(VAddr); }, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td index 31012915457b..26e6b9a10688 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -542,63 +542,37 @@ def atomic_store_64_#as : PatFrag<(ops node:$ptr, node:$val), } } // End foreach as -// TODO: Add GISelPredicateCode for the ret and noret PatFrags once -// GlobalISelEmitter allows pattern matches where src and dst def count -// mismatch. - -multiclass ret_noret_op { - let PredicateCode = [{ return !(SDValue(N, 0).use_empty()); }], - GISelPredicateCode = [{ return true; }] in { - def "_ret" : PatFrag<(ops node:$ptr, node:$data), - (!cast<SDPatternOperator>(NAME) node:$ptr, node:$data)>; - } - - let PredicateCode = [{ return (SDValue(N, 0).use_empty()); }], - GISelPredicateCode = [{ return false; }] in { - def "_noret" : PatFrag<(ops node:$ptr, node:$data), - (!cast<SDPatternOperator>(NAME) node:$ptr, node:$data)>; - } +multiclass noret_op { + let HasNoUse = true in + def "_noret" : PatFrag<(ops node:$ptr, node:$data), + (!cast<SDPatternOperator>(NAME) node:$ptr, node:$data)>; } -defm int_amdgcn_flat_atomic_fadd : ret_noret_op; -defm int_amdgcn_flat_atomic_fadd_v2bf16 : ret_noret_op; -defm int_amdgcn_flat_atomic_fmin : ret_noret_op; -defm int_amdgcn_flat_atomic_fmax : ret_noret_op; -defm int_amdgcn_global_atomic_fadd : ret_noret_op; -defm int_amdgcn_global_atomic_fadd_v2bf16 : ret_noret_op; -defm int_amdgcn_global_atomic_fmin : ret_noret_op; -defm int_amdgcn_global_atomic_fmax : ret_noret_op; -defm int_amdgcn_ds_fadd_v2bf16 : ret_noret_op; - -multiclass ret_noret_binary_atomic_op<SDNode atomic_op, bit IsInt = 1> { - let PredicateCode = [{ return (SDValue(N, 0).use_empty()); }], - GISelPredicateCode = [{ return false; }] in { - defm "_noret" : binary_atomic_op<atomic_op, IsInt>; - } +defm int_amdgcn_flat_atomic_fadd : noret_op; +defm int_amdgcn_flat_atomic_fadd_v2bf16 : noret_op; +defm int_amdgcn_flat_atomic_fmin : noret_op; +defm int_amdgcn_flat_atomic_fmax : noret_op; +defm int_amdgcn_global_atomic_fadd : noret_op; +defm int_amdgcn_global_atomic_fadd_v2bf16 : noret_op; +defm int_amdgcn_global_atomic_fmin : noret_op; +defm int_amdgcn_global_atomic_fmax : noret_op; +defm int_amdgcn_ds_fadd_v2bf16 : noret_op; - let PredicateCode = [{ return !(SDValue(N, 0).use_empty()); }], - GISelPredicateCode = [{ return true; }] in { - defm "_ret" : binary_atomic_op<atomic_op, IsInt>; - } +multiclass noret_binary_atomic_op<SDNode atomic_op, bit IsInt = 1> { + let HasNoUse = true in + defm "_noret" : binary_atomic_op<atomic_op, 
IsInt>; } -multiclass ret_noret_ternary_atomic_op<SDNode atomic_op> { - let PredicateCode = [{ return (SDValue(N, 0).use_empty()); }], - GISelPredicateCode = [{ return false; }] in { - defm "_noret" : ternary_atomic_op<atomic_op>; - } - - let PredicateCode = [{ return !(SDValue(N, 0).use_empty()); }], - GISelPredicateCode = [{ return true; }] in { - defm "_ret" : ternary_atomic_op<atomic_op>; - } +multiclass noret_ternary_atomic_op<SDNode atomic_op> { + let HasNoUse = true in + defm "_noret" : ternary_atomic_op<atomic_op>; } multiclass binary_atomic_op_all_as<SDNode atomic_op, bit IsInt = 1> { foreach as = [ "global", "flat", "constant", "local", "private", "region" ] in { let AddressSpaces = !cast<AddressSpaceList>("LoadAddress_"#as).AddrSpaces in { defm "_"#as : binary_atomic_op<atomic_op, IsInt>; - defm "_"#as : ret_noret_binary_atomic_op<atomic_op, IsInt>; + defm "_"#as : noret_binary_atomic_op<atomic_op, IsInt>; } } } @@ -640,13 +614,15 @@ def store_align16_local: PatFrag<(ops node:$val, node:$ptr), let AddressSpaces = StoreAddress_local.AddrSpaces in { defm atomic_cmp_swap_local : ternary_atomic_op<atomic_cmp_swap>; -defm atomic_cmp_swap_local : ret_noret_ternary_atomic_op<atomic_cmp_swap>; -defm atomic_cmp_swap_local_m0 : ret_noret_ternary_atomic_op<atomic_cmp_swap_glue>; +defm atomic_cmp_swap_local : noret_ternary_atomic_op<atomic_cmp_swap>; +defm atomic_cmp_swap_local_m0 : noret_ternary_atomic_op<atomic_cmp_swap_glue>; +defm atomic_cmp_swap_local_m0 : ternary_atomic_op<atomic_cmp_swap_glue>; } let AddressSpaces = StoreAddress_region.AddrSpaces in { -defm atomic_cmp_swap_region : ret_noret_ternary_atomic_op<atomic_cmp_swap>; -defm atomic_cmp_swap_region_m0 : ret_noret_ternary_atomic_op<atomic_cmp_swap_glue>; +defm atomic_cmp_swap_region : noret_ternary_atomic_op<atomic_cmp_swap>; +defm atomic_cmp_swap_region_m0 : noret_ternary_atomic_op<atomic_cmp_swap_glue>; +defm atomic_cmp_swap_region_m0 : ternary_atomic_op<atomic_cmp_swap_glue>; } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp index ed6ddbf426fd..38e04dedd9fc 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp @@ -171,6 +171,10 @@ const MCExpr *AMDGPUAsmPrinter::lowerConstant(const Constant *CV) { } void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) { + // FIXME: Enable feature predicate checks once all the tests pass. 
+ // AMDGPU_MC::verifyInstructionPredicates(MI->getOpcode(), + // getSubtargetInfo().getFeatureBits()); + if (emitPseudoExpansionLowering(*OutStreamer, MI)) return; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h index 1b513c456307..745734aac2b4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h @@ -131,8 +131,8 @@ public: bool IsAOneAddressSpace = isOneAddressSpace(A); bool IsBOneAddressSpace = isOneAddressSpace(B); - return AIO.getValue() >= BIO.getValue() && - (IsAOneAddressSpace == IsBOneAddressSpace || !IsAOneAddressSpace); + return AIO.value() >= BIO.value() && + (IsAOneAddressSpace == IsBOneAddressSpace || !IsAOneAddressSpace); } }; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index 77816a783630..6bd906439ee8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -40,9 +40,9 @@ using namespace llvm; #include "AMDGPUGenSubtargetInfo.inc" #undef AMDGPUSubtarget -static cl::opt<bool> DisablePowerSched( - "amdgpu-disable-power-sched", - cl::desc("Disable scheduling to minimize mAI power bursts"), +static cl::opt<bool> EnablePowerSched( + "amdgpu-enable-power-sched", + cl::desc("Enable scheduling to minimize mAI power bursts"), cl::init(false)); static cl::opt<bool> EnableVGPRIndexMode( @@ -916,7 +916,7 @@ struct FillMFMAShadowMutation : ScheduleDAGMutation { void apply(ScheduleDAGInstrs *DAGInstrs) override { const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>(); - if (!ST.hasMAIInsts() || DisablePowerSched) + if (!ST.hasMAIInsts()) return; DAG = static_cast<ScheduleDAGMI*>(DAGInstrs); const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel(); @@ -966,7 +966,8 @@ void GCNSubtarget::getPostRAMutations( std::unique_ptr<ScheduleDAGMutation> GCNSubtarget::createFillMFMAShadowMutation(const TargetInstrInfo *TII) const { - return std::make_unique<FillMFMAShadowMutation>(&InstrInfo); + return EnablePowerSched ? std::make_unique<FillMFMAShadowMutation>(&InstrInfo) + : nullptr; } const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 1c6b9d35695a..971e44723758 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -22,11 +22,13 @@ #include "AMDGPUTargetTransformInfo.h" #include "GCNIterativeScheduler.h" #include "GCNSchedStrategy.h" +#include "GCNVOPDUtils.h" #include "R600.h" #include "R600TargetMachine.h" #include "SIMachineFunctionInfo.h" #include "SIMachineScheduler.h" #include "TargetInfo/AMDGPUTargetInfo.h" +#include "Utils/AMDGPUBaseInfo.h" #include "llvm/Analysis/CGSCCPassManager.h" #include "llvm/CodeGen/GlobalISel/CSEInfo.h" #include "llvm/CodeGen/GlobalISel/IRTranslator.h" @@ -278,6 +280,12 @@ static cl::opt<bool> cl::desc("Enable s_delay_alu insertion"), cl::init(true), cl::Hidden); +// Enable GFX11+ VOPD +static cl::opt<bool> + EnableVOPD("amdgpu-enable-vopd", + cl::desc("Enable VOPD, dual issue of VALU in wave32"), + cl::init(true), cl::Hidden); + // Option is used in lit tests to prevent deadcoding of patterns inspected. 
static cl::opt<bool> EnableDCEInRA("amdgpu-dce-in-ra", @@ -383,6 +391,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeSIPreAllocateWWMRegsPass(*PR); initializeSIFormMemoryClausesPass(*PR); initializeSIPostRABundlerPass(*PR); + initializeGCNCreateVOPDPass(*PR); initializeAMDGPUUnifyDivergentExitNodesPass(*PR); initializeAMDGPUAAWrapperPassPass(*PR); initializeAMDGPUExternalAAWrapperPass(*PR); @@ -920,6 +929,8 @@ public: DAG->addMutation(ST.createFillMFMAShadowMutation(DAG->TII)); DAG->addMutation(createIGroupLPDAGMutation()); DAG->addMutation(createSchedBarrierDAGMutation()); + if (isPassEnabled(EnableVOPD, CodeGenOpt::Less)) + DAG->addMutation(createVOPDPairingMutation()); return DAG; } @@ -1399,6 +1410,8 @@ void GCNPassConfig::addPreSched2() { } void GCNPassConfig::addPreEmitPass() { + if (isPassEnabled(EnableVOPD, CodeGenOpt::Less)) + addPass(&GCNCreateVOPDID); addPass(createSIMemoryLegalizerPass()); addPass(createSIInsertWaitcntsPass()); diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td index a087323e5de7..04dd3e938a15 100644 --- a/llvm/lib/Target/AMDGPU/BUFInstructions.td +++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td @@ -1412,10 +1412,12 @@ defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_short, i32, "BUFFER_STORE_SHORT">; multiclass BufferAtomicPat<string OpPrefix, ValueType vt, string Inst, bit isIntr = 0> { foreach RtnMode = ["ret", "noret"] in { - defvar Op = !cast<SDPatternOperator>(OpPrefix # "_" # RtnMode + defvar Op = !cast<SDPatternOperator>(OpPrefix + # !if(!eq(RtnMode, "ret"), "", "_noret") # !if(isIntr, "", "_" # vt.Size)); defvar InstSuffix = !if(!eq(RtnMode, "ret"), "_RTN", ""); + let AddedComplexity = !if(!eq(RtnMode, "ret"), 0, 1) in { def : GCNPat< (vt (Op (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset), vt:$vdata_in)), (!cast<MUBUF_Pseudo>(Inst # "_OFFSET" # InstSuffix) getVregSrcForVT<vt>.ret:$vdata_in, @@ -1428,6 +1430,7 @@ multiclass BufferAtomicPat<string OpPrefix, ValueType vt, string Inst, bit isInt (!cast<MUBUF_Pseudo>(Inst # "_ADDR64" # InstSuffix) getVregSrcForVT<vt>.ret:$vdata_in, VReg_64:$vaddr, SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset) >; + } // end let AddedComplexity } // end foreach RtnMode } @@ -1439,10 +1442,12 @@ multiclass BufferAtomicIntrPat<string OpPrefix, ValueType vt, string Inst> { multiclass BufferAtomicCmpSwapPat<ValueType vt, ValueType data_vt, string Inst> { foreach RtnMode = ["ret", "noret"] in { - defvar Op = !cast<SDPatternOperator>("AMDGPUatomic_cmp_swap_global_" # RtnMode + defvar Op = !cast<SDPatternOperator>("AMDGPUatomic_cmp_swap_global" + # !if(!eq(RtnMode, "ret"), "", "_noret") # "_" # vt.Size); defvar InstSuffix = !if(!eq(RtnMode, "ret"), "_RTN", ""); + let AddedComplexity = !if(!eq(RtnMode, "ret"), 0, 1) in { defvar OffsetResDag = (!cast<MUBUF_Pseudo>(Inst # "_OFFSET" # InstSuffix) getVregSrcForVT<data_vt>.ret:$vdata_in, SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset); @@ -1465,6 +1470,7 @@ multiclass BufferAtomicCmpSwapPat<ValueType vt, ValueType data_vt, string Inst> !if(!eq(vt, i32), sub0, sub0_sub1)), Addr64ResDag) >; + } // end let AddedComplexity } // end foreach RtnMode } @@ -1495,13 +1501,14 @@ multiclass SIBufferAtomicPat<string OpPrefix, ValueType vt, string Inst, list<string> RtnModes = ["ret", "noret"]> { foreach RtnMode = RtnModes in { - defvar Op = !cast<SDPatternOperator>(!if(!eq(RtnMode, "none"), - OpPrefix, OpPrefix # "_" # RtnMode)); - defvar InstSuffix = !if(!or(!eq(RtnMode, "none"), !eq(RtnMode, 
"ret")), - "_RTN", ""); - defvar CachePolicy = !if(!or(!eq(RtnMode, "none"), !eq(RtnMode, "ret")), + defvar Op = !cast<SDPatternOperator>(OpPrefix + # !if(!eq(RtnMode, "ret"), "", "_noret")); + + defvar InstSuffix = !if(!eq(RtnMode, "ret"), "_RTN", ""); + defvar CachePolicy = !if(!eq(RtnMode, "ret"), (set_glc $cachepolicy), (timm:$cachepolicy)); + let AddedComplexity = !if(!eq(RtnMode, "ret"), 0, 1) in { def : GCNPat< (vt (Op vt:$vdata_in, v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset, timm:$cachepolicy, 0)), @@ -1534,6 +1541,7 @@ multiclass SIBufferAtomicPat<string OpPrefix, ValueType vt, string Inst, (REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1), SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), CachePolicy) >; + } // end let AddedComplexity } // end foreach RtnMode } @@ -1551,7 +1559,7 @@ defm : SIBufferAtomicPat<"SIbuffer_atomic_or", i32, "BUFFER_ATOMIC_OR">; defm : SIBufferAtomicPat<"SIbuffer_atomic_xor", i32, "BUFFER_ATOMIC_XOR">; defm : SIBufferAtomicPat<"SIbuffer_atomic_inc", i32, "BUFFER_ATOMIC_INC">; defm : SIBufferAtomicPat<"SIbuffer_atomic_dec", i32, "BUFFER_ATOMIC_DEC">; -defm : SIBufferAtomicPat<"SIbuffer_atomic_csub", i32, "BUFFER_ATOMIC_CSUB", ["none"]>; +defm : SIBufferAtomicPat<"SIbuffer_atomic_csub", i32, "BUFFER_ATOMIC_CSUB", ["ret"]>; defm : SIBufferAtomicPat<"SIbuffer_atomic_swap", i64, "BUFFER_ATOMIC_SWAP_X2">; defm : SIBufferAtomicPat<"SIbuffer_atomic_add", i64, "BUFFER_ATOMIC_ADD_X2">; defm : SIBufferAtomicPat<"SIbuffer_atomic_sub", i64, "BUFFER_ATOMIC_SUB_X2">; @@ -1643,7 +1651,8 @@ let SubtargetPredicate = isGFX90APlus in { foreach RtnMode = ["ret", "noret"] in { -defvar Op = !cast<SDPatternOperator>(SIbuffer_atomic_cmpswap # "_" # RtnMode); +defvar Op = !cast<SDPatternOperator>(SIbuffer_atomic_cmpswap + # !if(!eq(RtnMode, "ret"), "", "_noret")); defvar InstSuffix = !if(!eq(RtnMode, "ret"), "_RTN", ""); defvar CachePolicy = !if(!eq(RtnMode, "ret"), (set_glc $cachepolicy), (timm:$cachepolicy)); diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td index 27b723875aa4..d8387bf6f1ae 100644 --- a/llvm/lib/Target/AMDGPU/DSInstructions.td +++ b/llvm/lib/Target/AMDGPU/DSInstructions.td @@ -950,10 +950,11 @@ defm : DSWritePat_mc <DS_WRITE_B128, vt, "store_align_less_than_4_local">; } // End AddedComplexity = 100 -class DSAtomicRetPat<DS_Pseudo inst, ValueType vt, PatFrag frag, bit gds=0> : GCNPat < - (frag (DS1Addr1Offset i32:$ptr, i16:$offset), vt:$value), - (inst $ptr, getVregSrcForVT<vt>.ret:$value, offset:$offset, (i1 gds)) ->; +class DSAtomicRetPat<DS_Pseudo inst, ValueType vt, PatFrag frag, int complexity = 0, + bit gds=0> : GCNPat <(frag (DS1Addr1Offset i32:$ptr, i16:$offset), vt:$value), + (inst $ptr, getVregSrcForVT<vt>.ret:$value, offset:$offset, (i1 gds))> { + let AddedComplexity = complexity; +} multiclass DSAtomicRetPat_mc<DS_Pseudo inst, ValueType vt, string frag> { let OtherPredicates = [LDSRequiresM0Init] in { @@ -965,75 +966,88 @@ multiclass DSAtomicRetPat_mc<DS_Pseudo inst, ValueType vt, string frag> { !cast<PatFrag>(frag#"_local_"#vt.Size)>; } - def : DSAtomicRetPat<inst, vt, !cast<PatFrag>(frag#"_region_m0_"#vt.Size), 1>; + def : DSAtomicRetPat<inst, vt, !cast<PatFrag>(frag#"_region_m0_"#vt.Size), + /* complexity */ 0, /* gds */ 1>; } multiclass DSAtomicRetNoRetPat_mc<DS_Pseudo inst, DS_Pseudo noRetInst, ValueType vt, string frag> { let OtherPredicates = [LDSRequiresM0Init] in { def : DSAtomicRetPat<inst, vt, - !cast<PatFrag>(frag#"_local_m0_ret_"#vt.Size)>; + 
!cast<PatFrag>(frag#"_local_m0_"#vt.Size)>; def : DSAtomicRetPat<noRetInst, vt, - !cast<PatFrag>(frag#"_local_m0_noret_"#vt.Size)>; + !cast<PatFrag>(frag#"_local_m0_noret_"#vt.Size), /* complexity */ 1>; } let OtherPredicates = [NotLDSRequiresM0Init] in { def : DSAtomicRetPat<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt, - !cast<PatFrag>(frag#"_local_ret_"#vt.Size)>; + !cast<PatFrag>(frag#"_local_"#vt.Size)>; def : DSAtomicRetPat<!cast<DS_Pseudo>(!cast<string>(noRetInst)#"_gfx9"), vt, - !cast<PatFrag>(frag#"_local_noret_"#vt.Size)>; + !cast<PatFrag>(frag#"_local_noret_"#vt.Size), /* complexity */ 1>; } def : DSAtomicRetPat<inst, vt, - !cast<PatFrag>(frag#"_region_m0_ret_"#vt.Size), 1>; + !cast<PatFrag>(frag#"_region_m0_"#vt.Size), + /* complexity */ 0, /* gds */ 1>; def : DSAtomicRetPat<noRetInst, vt, - !cast<PatFrag>(frag#"_region_m0_noret_"#vt.Size), 1>; + !cast<PatFrag>(frag#"_region_m0_noret_"#vt.Size), + /* complexity */ 1, /* gds */ 1>; } let SubtargetPredicate = isGFX6GFX7GFX8GFX9GFX10 in { // Caution, the order of src and cmp is the *opposite* of the BUFFER_ATOMIC_CMPSWAP opcode. -class DSAtomicCmpXChgSwapped<DS_Pseudo inst, ValueType vt, PatFrag frag, bit gds=0> : GCNPat < +class DSAtomicCmpXChgSwapped<DS_Pseudo inst, ValueType vt, PatFrag frag, + int complexity = 0, bit gds=0> : GCNPat< (frag (DS1Addr1Offset i32:$ptr, i16:$offset), vt:$cmp, vt:$swap), - (inst $ptr, getVregSrcForVT<vt>.ret:$cmp, getVregSrcForVT<vt>.ret:$swap, offset:$offset, (i1 gds)) ->; + (inst $ptr, getVregSrcForVT<vt>.ret:$cmp, getVregSrcForVT<vt>.ret:$swap, offset:$offset, (i1 gds))> { + let AddedComplexity = complexity; +} multiclass DSAtomicCmpXChgSwapped_mc<DS_Pseudo inst, DS_Pseudo noRetInst, ValueType vt, string frag> { let OtherPredicates = [LDSRequiresM0Init] in { - def : DSAtomicCmpXChgSwapped<inst, vt, !cast<PatFrag>(frag#"_local_m0_ret_"#vt.Size)>; - def : DSAtomicCmpXChgSwapped<noRetInst, vt, !cast<PatFrag>(frag#"_local_m0_noret_"#vt.Size)>; + def : DSAtomicCmpXChgSwapped<inst, vt, !cast<PatFrag>(frag#"_local_m0_"#vt.Size)>; + def : DSAtomicCmpXChgSwapped<noRetInst, vt, !cast<PatFrag>(frag#"_local_m0_noret_"#vt.Size), + /* complexity */ 1>; } let OtherPredicates = [NotLDSRequiresM0Init] in { def : DSAtomicCmpXChgSwapped<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt, - !cast<PatFrag>(frag#"_local_ret_"#vt.Size)>; + !cast<PatFrag>(frag#"_local_"#vt.Size)>; def : DSAtomicCmpXChgSwapped<!cast<DS_Pseudo>(!cast<string>(noRetInst)#"_gfx9"), vt, - !cast<PatFrag>(frag#"_local_noret_"#vt.Size)>; + !cast<PatFrag>(frag#"_local_noret_"#vt.Size), + /* complexity */ 1>; } - def : DSAtomicCmpXChgSwapped<inst, vt, !cast<PatFrag>(frag#"_region_m0_ret_"#vt.Size), 1>; - def : DSAtomicCmpXChgSwapped<noRetInst, vt, !cast<PatFrag>(frag#"_region_m0_noret_"#vt.Size), 1>; + def : DSAtomicCmpXChgSwapped<inst, vt, !cast<PatFrag>(frag#"_region_m0_"#vt.Size), + /* complexity */ 0, /* gds */ 1>; + def : DSAtomicCmpXChgSwapped<noRetInst, vt, !cast<PatFrag>(frag#"_region_m0_noret_"#vt.Size), + /* complexity */ 1, /* gds */ 1>; } } // End SubtargetPredicate = isGFX6GFX7GFX8GFX9GFX10 let SubtargetPredicate = isGFX11Plus in { // The order of src and cmp agrees with the BUFFER_ATOMIC_CMPSWAP opcode. 
-class DSAtomicCmpXChg<DS_Pseudo inst, ValueType vt, PatFrag frag, bit gds=0> : GCNPat < +class DSAtomicCmpXChg<DS_Pseudo inst, ValueType vt, PatFrag frag, + int complexity = 0, bit gds=0> : GCNPat< (frag (DS1Addr1Offset i32:$ptr, i16:$offset), vt:$cmp, vt:$swap), - (inst $ptr, getVregSrcForVT<vt>.ret:$swap, getVregSrcForVT<vt>.ret:$cmp, offset:$offset, (i1 gds)) ->; + (inst $ptr, getVregSrcForVT<vt>.ret:$swap, getVregSrcForVT<vt>.ret:$cmp, offset:$offset, (i1 gds))> { + let AddedComplexity = complexity; +} multiclass DSAtomicCmpXChg_mc<DS_Pseudo inst, DS_Pseudo noRetInst, ValueType vt, string frag> { def : DSAtomicCmpXChg<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt, - !cast<PatFrag>(frag#"_local_ret_"#vt.Size)>; + !cast<PatFrag>(frag#"_local_"#vt.Size)>; def : DSAtomicCmpXChg<!cast<DS_Pseudo>(!cast<string>(noRetInst)#"_gfx9"), vt, - !cast<PatFrag>(frag#"_local_noret_"#vt.Size)>; + !cast<PatFrag>(frag#"_local_noret_"#vt.Size), /* complexity */ 1>; - def : DSAtomicCmpXChg<inst, vt, !cast<PatFrag>(frag#"_region_m0_ret_"#vt.Size), 1>; - def : DSAtomicCmpXChg<noRetInst, vt, !cast<PatFrag>(frag#"_region_m0_noret_"#vt.Size), 1>; + def : DSAtomicCmpXChg<inst, vt, !cast<PatFrag>(frag#"_region_m0_"#vt.Size), + /* complexity */ 0, /* gds */ 1>; + def : DSAtomicCmpXChg<noRetInst, vt, !cast<PatFrag>(frag#"_region_m0_noret_"#vt.Size), + /* complexity */ 1, /* gds */ 1>; } } // End SubtargetPredicate = isGFX11Plus @@ -1090,17 +1104,20 @@ defm : DSAtomicCmpXChg_mc<DS_CMPSTORE_RTN_B64, DS_CMPSTORE_B64, i64, "atomic_cmp } // End SubtargetPredicate = isGFX11Plus let SubtargetPredicate = isGFX90APlus in { -def : DSAtomicRetPat<DS_ADD_RTN_F64, f64, atomic_load_fadd_local_ret_64>; +def : DSAtomicRetPat<DS_ADD_RTN_F64, f64, atomic_load_fadd_local_64>; +let AddedComplexity = 1 in def : DSAtomicRetPat<DS_ADD_F64, f64, atomic_load_fadd_local_noret_64>; } let SubtargetPredicate = isGFX940Plus in { -def : DSAtomicRetPat<DS_PK_ADD_RTN_F16, v2f16, atomic_load_fadd_v2f16_local_ret_32>; +def : DSAtomicRetPat<DS_PK_ADD_RTN_F16, v2f16, atomic_load_fadd_v2f16_local_32>; +let AddedComplexity = 1 in def : DSAtomicRetPat<DS_PK_ADD_F16, v2f16, atomic_load_fadd_v2f16_local_noret_32>; def : GCNPat < - (v2i16 (int_amdgcn_ds_fadd_v2bf16_ret i32:$ptr, v2i16:$src)), + (v2i16 (int_amdgcn_ds_fadd_v2bf16 i32:$ptr, v2i16:$src)), (DS_PK_ADD_RTN_BF16 VGPR_32:$ptr, VGPR_32:$src, 0, 0) >; +let AddedComplexity = 1 in def : GCNPat < (v2i16 (int_amdgcn_ds_fadd_v2bf16_noret i32:$ptr, v2i16:$src)), (DS_PK_ADD_BF16 VGPR_32:$ptr, VGPR_32:$src, 0, 0) diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index cb2822818549..c634e15945ad 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -1015,31 +1015,35 @@ class FlatAtomicPatNoRtn <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt multiclass FlatAtomicPat <string inst, string node, ValueType vt, ValueType data_vt = vt> { - defvar rtnNode = !cast<PatFrags>(node#"_ret_"#vt.Size); + defvar rtnNode = !cast<PatFrags>(node#"_"#vt.Size); defvar noRtnNode = !cast<PatFrags>(node#"_noret_"#vt.Size); def : GCNPat <(vt (rtnNode (FlatOffset i64:$vaddr, i16:$offset), data_vt:$data)), (!cast<FLAT_Pseudo>(inst#"_RTN") VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)>; + let AddedComplexity = 1 in def : GCNPat <(vt (noRtnNode (FlatOffset i64:$vaddr, i16:$offset), data_vt:$data)), (!cast<FLAT_Pseudo>(inst) VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)>; } multiclass 
FlatSignedAtomicPat <string inst, string node, ValueType vt, - ValueType data_vt = vt, bit isIntr = 0> { - defvar rtnNode = !cast<PatFrags>(node # "_ret" # !if(isIntr, "", "_" # vt.Size)); + ValueType data_vt = vt, int complexity = 0, + bit isIntr = 0> { + defvar rtnNode = !cast<SDPatternOperator>(node # !if(isIntr, "", "_" # vt.Size)); defvar noRtnNode = !cast<PatFrags>(node # "_noret" # !if(isIntr, "", "_" # vt.Size)); + let AddedComplexity = complexity in def : GCNPat <(vt (rtnNode (GlobalOffset i64:$vaddr, i16:$offset), data_vt:$data)), (!cast<FLAT_Pseudo>(inst#"_RTN") VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)>; + let AddedComplexity = !add(complexity, 1) in def : GCNPat <(vt (noRtnNode (GlobalOffset i64:$vaddr, i16:$offset), data_vt:$data)), (!cast<FLAT_Pseudo>(inst) VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)>; } multiclass FlatSignedAtomicIntrPat <string inst, string node, ValueType vt, ValueType data_vt = vt> { - defm : FlatSignedAtomicPat<inst, node, vt, data_vt, /* isIntr */ 1>; + defm : FlatSignedAtomicPat<inst, node, vt, data_vt, /* complexity */ 0, /* isIntr */ 1>; } class FlatSignedAtomicPatNoRtn <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < @@ -1260,17 +1264,16 @@ multiclass GlobalFLATAtomicPatsRtn<string nortn_inst_name, SDPatternOperator nod multiclass GlobalFLATAtomicPats<string inst, string node, ValueType vt, ValueType data_vt = vt, bit isIntr = 0> { - defvar rtnNode = !cast<PatFrags>(node # "_ret" # !if(isIntr, "", "_" # vt.Size)); + defvar rtnNode = !cast<SDPatternOperator>(node # !if(isIntr, "", "_" # vt.Size)); defvar noRtnNode = !cast<PatFrags>(node # "_noret" # !if(isIntr, "", "_" # vt.Size)); - let AddedComplexity = 10 in { - defm : FlatSignedAtomicPat <inst, node, vt, data_vt, isIntr>; - } + defm : FlatSignedAtomicPat <inst, node, vt, data_vt, /* complexity */ 10, isIntr>; - let AddedComplexity = 11 in { - def : GlobalAtomicSaddrPat<!cast<FLAT_Pseudo>(inst#"_SADDR"), noRtnNode, vt, data_vt>; - def : GlobalAtomicSaddrPat<!cast<FLAT_Pseudo>(inst#"_SADDR_RTN"), rtnNode, vt, data_vt>; - } + let AddedComplexity = 13 in + def : GlobalAtomicSaddrPat<!cast<FLAT_Pseudo>(inst#"_SADDR"), noRtnNode, vt, data_vt>; + + let AddedComplexity = 12 in + def : GlobalAtomicSaddrPat<!cast<FLAT_Pseudo>(inst#"_SADDR_RTN"), rtnNode, vt, data_vt>; } multiclass GlobalFLATAtomicIntrPats<string inst, string node, ValueType vt, diff --git a/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp b/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp new file mode 100644 index 000000000000..83dc3bebf4d3 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp @@ -0,0 +1,175 @@ +//===- GCNCreateVOPD.cpp - Create VOPD Instructions ----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// Combine VALU pairs into VOPD instructions +/// Only works on wave32 +/// Has register requirements, we reject creating VOPD if the requirements are +/// not met. 
+/// shouldCombineVOPD mutator in postRA machine scheduler puts candidate +/// instructions for VOPD back-to-back +/// +// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "GCNSubtarget.h" +#include "GCNVOPDUtils.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "SIInstrInfo.h" +#include "Utils/AMDGPUBaseInfo.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/Debug.h" +#include <utility> + +#define DEBUG_TYPE "gcn-create-vopd" +STATISTIC(NumVOPDCreated, "Number of VOPD Insts Created."); + +using namespace llvm; + +namespace { + +class GCNCreateVOPD : public MachineFunctionPass { +private: +public: + static char ID; + const GCNSubtarget *ST = nullptr; + + GCNCreateVOPD() : MachineFunctionPass(ID) {} + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + StringRef getPassName() const override { + return "GCN Create VOPD Instructions"; + } + + bool doReplace(const SIInstrInfo *SII, + std::pair<MachineInstr *, MachineInstr *> &Pair) { + auto *FirstMI = Pair.first; + auto *SecondMI = Pair.second; + unsigned Opc1 = FirstMI->getOpcode(); + unsigned Opc2 = SecondMI->getOpcode(); + int NewOpcode = AMDGPU::getVOPDFull(AMDGPU::getVOPDOpcode(Opc1), + AMDGPU::getVOPDOpcode(Opc2)); + assert(NewOpcode != -1 && + "Should have previously determined this as a possible VOPD\n"); + + auto VOPDInst = BuildMI(*FirstMI->getParent(), FirstMI, + FirstMI->getDebugLoc(), SII->get(NewOpcode)) + .setMIFlags(FirstMI->getFlags() | SecondMI->getFlags()); + VOPDInst.add(FirstMI->getOperand(0)) + .add(SecondMI->getOperand(0)) + .add(FirstMI->getOperand(1)); + + switch (Opc1) { + case AMDGPU::V_MOV_B32_e32: + break; + case AMDGPU::V_FMAMK_F32: + case AMDGPU::V_FMAAK_F32: + VOPDInst.add(FirstMI->getOperand(2)); + VOPDInst.add(FirstMI->getOperand(3)); + break; + default: + VOPDInst.add(FirstMI->getOperand(2)); + break; + } + + VOPDInst.add(SecondMI->getOperand(1)); + + switch (Opc2) { + case AMDGPU::V_MOV_B32_e32: + break; + case AMDGPU::V_FMAMK_F32: + case AMDGPU::V_FMAAK_F32: + VOPDInst.add(SecondMI->getOperand(2)); + VOPDInst.add(SecondMI->getOperand(3)); + break; + default: + VOPDInst.add(SecondMI->getOperand(2)); + break; + } + + VOPDInst.copyImplicitOps(*FirstMI); + VOPDInst.copyImplicitOps(*SecondMI); + + LLVM_DEBUG(dbgs() << "VOPD Fused: " << *VOPDInst << " from\tX: " + << *Pair.first << "\tY: " << *Pair.second << "\n"); + FirstMI->eraseFromParent(); + SecondMI->eraseFromParent(); + ++NumVOPDCreated; + return true; + } + + bool runOnMachineFunction(MachineFunction &MF) override { + if (skipFunction(MF.getFunction())) + return false; + ST = &MF.getSubtarget<GCNSubtarget>(); + if (!AMDGPU::hasVOPD(*ST) || !ST->isWave32()) + return false; + LLVM_DEBUG(dbgs() << "CreateVOPD Pass:\n"); + + const SIInstrInfo *SII = ST->getInstrInfo(); + bool Changed = false; + + SmallVector<std::pair<MachineInstr *, MachineInstr *>> ReplaceCandidates; + + for (auto &MBB : MF) { + auto MII = MBB.begin(), E = MBB.end(); + while (MII != E) { + auto *FirstMI = &*MII; + MII = next_nodbg(MII, MBB.end()); + if (MII == MBB.end()) + break; + if (FirstMI->isDebugInstr()) + continue; + auto *SecondMI = &*MII; + unsigned Opc = FirstMI->getOpcode(); + unsigned 
Opc2 = SecondMI->getOpcode(); + llvm::AMDGPU::CanBeVOPD FirstCanBeVOPD = AMDGPU::getCanBeVOPD(Opc); + llvm::AMDGPU::CanBeVOPD SecondCanBeVOPD = AMDGPU::getCanBeVOPD(Opc2); + std::pair<MachineInstr *, MachineInstr *> Pair; + + if (FirstCanBeVOPD.X && SecondCanBeVOPD.Y) + Pair = {FirstMI, SecondMI}; + else if (FirstCanBeVOPD.Y && SecondCanBeVOPD.X) + Pair = {SecondMI, FirstMI}; + else + continue; + // checkVOPDRegConstraints cares about program order, but doReplace + // cares about X-Y order in the constituted VOPD + if (llvm::checkVOPDRegConstraints(*SII, *FirstMI, *SecondMI)) { + ReplaceCandidates.push_back(Pair); + ++MII; + } + } + } + for (auto &Pair : ReplaceCandidates) { + Changed |= doReplace(SII, Pair); + } + + return Changed; + } +}; + +} // namespace + +char GCNCreateVOPD::ID = 0; + +char &llvm::GCNCreateVOPDID = GCNCreateVOPD::ID; + +INITIALIZE_PASS(GCNCreateVOPD, DEBUG_TYPE, "GCN Create VOPD Instructions", + false, false) diff --git a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp index 1cd880eaa48e..5d254518c67a 100644 --- a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp +++ b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp @@ -143,13 +143,20 @@ bool GCNDPPCombine::isShrinkable(MachineInstr &MI) const { } int GCNDPPCombine::getDPPOp(unsigned Op, bool IsShrinkable) const { - auto DPP32 = AMDGPU::getDPPOp32(Op); + int DPP32 = AMDGPU::getDPPOp32(Op); if (IsShrinkable) { assert(DPP32 == -1); - auto E32 = AMDGPU::getVOPe32(Op); + int E32 = AMDGPU::getVOPe32(Op); DPP32 = (E32 == -1) ? -1 : AMDGPU::getDPPOp32(E32); } - return (DPP32 == -1 || TII->pseudoToMCOpcode(DPP32) == -1) ? -1 : DPP32; + if (DPP32 != -1 && TII->pseudoToMCOpcode(DPP32) != -1) + return DPP32; + int DPP64 = -1; + if (ST->hasVOP3DPP()) + DPP64 = AMDGPU::getDPPOp64(Op); + if (DPP64 != -1 && TII->pseudoToMCOpcode(DPP64) != -1) + return DPP64; + return -1; } // tracks the register operand definition and returns: @@ -188,6 +195,7 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI, MovMI.getOpcode() == AMDGPU::V_MOV_B64_dpp || MovMI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO); + bool HasVOP3DPP = ST->hasVOP3DPP(); auto OrigOp = OrigMI.getOpcode(); auto DPPOp = getDPPOp(OrigOp, IsShrinkable); if (DPPOp == -1) { @@ -201,10 +209,18 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI, bool Fail = false; do { - auto *Dst = TII->getNamedOperand(OrigMI, AMDGPU::OpName::vdst); - assert(Dst); - DPPInst.add(*Dst); - int NumOperands = 1; + int NumOperands = 0; + if (auto *Dst = TII->getNamedOperand(OrigMI, AMDGPU::OpName::vdst)) { + DPPInst.add(*Dst); + ++NumOperands; + } + if (auto *SDst = TII->getNamedOperand(OrigMI, AMDGPU::OpName::sdst)) { + if (TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, SDst)) { + DPPInst.add(*SDst); + ++NumOperands; + } + // If we shrunk a 64bit vop3b to 32bits, just ignore the sdst + } const int OldIdx = AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::old); if (OldIdx != -1) { @@ -230,7 +246,8 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI, AMDGPU::OpName::src0_modifiers)) { assert(NumOperands == AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::src0_modifiers)); - assert(0LL == (Mod0->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG))); + assert(HasVOP3DPP || + (0LL == (Mod0->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG)))); DPPInst.addImm(Mod0->getImm()); ++NumOperands; } else if (AMDGPU::getNamedOperandIdx(DPPOp, @@ -253,7 +270,8 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI, 
AMDGPU::OpName::src1_modifiers)) { assert(NumOperands == AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::src1_modifiers)); - assert(0LL == (Mod1->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG))); + assert(HasVOP3DPP || + (0LL == (Mod1->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG)))); DPPInst.addImm(Mod1->getImm()); ++NumOperands; } else if (AMDGPU::getNamedOperandIdx(DPPOp, @@ -261,7 +279,8 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI, DPPInst.addImm(0); ++NumOperands; } - if (auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1)) { + auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1); + if (Src1) { if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src1)) { LLVM_DEBUG(dbgs() << " failed: src1 is illegal\n"); Fail = true; @@ -270,8 +289,17 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI, DPPInst.add(*Src1); ++NumOperands; } - - if (auto *Src2 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src2)) { + if (auto *Mod2 = + TII->getNamedOperand(OrigMI, AMDGPU::OpName::src2_modifiers)) { + assert(NumOperands == + AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::src2_modifiers)); + assert(HasVOP3DPP || + (0LL == (Mod2->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG)))); + DPPInst.addImm(Mod2->getImm()); + ++NumOperands; + } + auto *Src2 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src2); + if (Src2) { if (!TII->getNamedOperand(*DPPInst.getInstr(), AMDGPU::OpName::src2) || !TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src2)) { LLVM_DEBUG(dbgs() << " failed: src2 is illegal\n"); @@ -279,8 +307,62 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI, break; } DPPInst.add(*Src2); + ++NumOperands; + } + if (HasVOP3DPP) { + auto *ClampOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::clamp); + if (ClampOpr && + AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::clamp) != -1) { + DPPInst.addImm(ClampOpr->getImm()); + } + auto *VdstInOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::vdst_in); + if (VdstInOpr && + AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::vdst_in) != -1) { + DPPInst.add(*VdstInOpr); + } + auto *OmodOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::omod); + if (OmodOpr && + AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::omod) != -1) { + DPPInst.addImm(OmodOpr->getImm()); + } + // Validate OP_SEL has to be set to all 0 and OP_SEL_HI has to be set to + // all 1. 
+ if (auto *OpSelOpr = + TII->getNamedOperand(OrigMI, AMDGPU::OpName::op_sel)) { + auto OpSel = OpSelOpr->getImm(); + if (OpSel != 0) { + LLVM_DEBUG(dbgs() << " failed: op_sel must be zero\n"); + Fail = true; + break; + } + if (AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::op_sel) != -1) + DPPInst.addImm(OpSel); + } + if (auto *OpSelHiOpr = + TII->getNamedOperand(OrigMI, AMDGPU::OpName::op_sel_hi)) { + auto OpSelHi = OpSelHiOpr->getImm(); + // Only vop3p has op_sel_hi, and all vop3p have 3 operands, so check + // the bitmask for 3 op_sel_hi bits set + assert(Src2 && "Expected vop3p with 3 operands"); + if (OpSelHi != 7) { + LLVM_DEBUG(dbgs() << " failed: op_sel_hi must be all set to one\n"); + Fail = true; + break; + } + if (AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::op_sel_hi) != -1) + DPPInst.addImm(OpSelHi); + } + auto *NegOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::neg_lo); + if (NegOpr && + AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::neg_lo) != -1) { + DPPInst.addImm(NegOpr->getImm()); + } + auto *NegHiOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::neg_hi); + if (NegHiOpr && + AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::neg_hi) != -1) { + DPPInst.addImm(NegHiOpr->getImm()); + } } - DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::dpp_ctrl)); DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask)); DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::bank_mask)); @@ -531,8 +613,16 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const { } bool IsShrinkable = isShrinkable(OrigMI); - if (!(IsShrinkable || TII->isVOP1(OrigOp) || TII->isVOP2(OrigOp))) { - LLVM_DEBUG(dbgs() << " failed: not VOP1/2/3\n"); + if (!(IsShrinkable || + ((TII->isVOP3P(OrigOp) || TII->isVOPC(OrigOp) || + TII->isVOP3(OrigOp)) && + ST->hasVOP3DPP()) || + TII->isVOP1(OrigOp) || TII->isVOP2(OrigOp))) { + LLVM_DEBUG(dbgs() << " failed: not VOP1/2/3/3P/C\n"); + break; + } + if (OrigMI.modifiesRegister(AMDGPU::EXEC, ST->getRegisterInfo())) { + LLVM_DEBUG(dbgs() << " failed: can't combine v_cmpx\n"); break; } @@ -543,9 +633,12 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const { break; } + auto *Src2 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src2); assert(Src0 && "Src1 without Src0?"); - if (Src1 && Src1->isIdenticalTo(*Src0)) { - assert(Src1->isReg()); + if ((Use == Src0 && ((Src1 && Src1->isIdenticalTo(*Src0)) || + (Src2 && Src2->isIdenticalTo(*Src0)))) || + (Use == Src1 && (Src1->isIdenticalTo(*Src0) || + (Src2 && Src2->isIdenticalTo(*Src1))))) { LLVM_DEBUG( dbgs() << " " << OrigMI diff --git a/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp b/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp new file mode 100644 index 000000000000..a5008e39d91a --- /dev/null +++ b/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp @@ -0,0 +1,212 @@ +//===- GCNVOPDUtils.cpp - GCN VOPD Utils ------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file This file contains the AMDGPU DAG scheduling +/// mutation to pair VOPD instructions back to back. 
It also contains +// subroutines useful in the creation of VOPD instructions +// +//===----------------------------------------------------------------------===// + +#include "GCNVOPDUtils.h" +#include "AMDGPUSubtarget.h" +#include "GCNSubtarget.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "SIInstrInfo.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MacroFusion.h" +#include "llvm/CodeGen/ScheduleDAG.h" +#include "llvm/CodeGen/ScheduleDAGMutation.h" +#include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/MC/MCInst.h" + +using namespace llvm; + +#define DEBUG_TYPE "gcn-vopd-utils" + +bool llvm::checkVOPDRegConstraints(const SIInstrInfo &TII, + const MachineInstr &FirstMI, + const MachineInstr &SecondMI) { + const MachineFunction *MF = FirstMI.getMF(); + const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); + const SIRegisterInfo *TRI = dyn_cast<SIRegisterInfo>(ST.getRegisterInfo()); + const MachineRegisterInfo &MRI = MF->getRegInfo(); + const unsigned NumVGPRBanks = 4; + // Literals also count against scalar bus limit + SmallVector<const MachineOperand *> UniqueLiterals; + auto addLiteral = [&](const MachineOperand &Op) { + for (auto &Literal : UniqueLiterals) { + if (Literal->isIdenticalTo(Op)) + return; + } + UniqueLiterals.push_back(&Op); + }; + SmallVector<Register> UniqueScalarRegs; + assert([&]() -> bool { + for (auto MII = MachineBasicBlock::const_iterator(&FirstMI); + MII != FirstMI.getParent()->instr_end(); ++MII) { + if (&*MII == &SecondMI) + return true; + } + return false; + }() && "Expected FirstMI to precede SecondMI"); + // Cannot pair dependent instructions + for (const auto &Use : SecondMI.uses()) + if (Use.isReg() && FirstMI.modifiesRegister(Use.getReg())) + return false; + + struct ComponentInfo { + ComponentInfo(const MachineInstr &MI) : MI(MI) {} + Register Dst, Reg0, Reg1, Reg2; + const MachineInstr &MI; + }; + ComponentInfo CInfo[] = {ComponentInfo(FirstMI), ComponentInfo(SecondMI)}; + + for (ComponentInfo &Comp : CInfo) { + switch (Comp.MI.getOpcode()) { + case AMDGPU::V_FMAMK_F32: + // cannot inline the fixed literal in fmamk + addLiteral(Comp.MI.getOperand(2)); + Comp.Reg2 = Comp.MI.getOperand(3).getReg(); + break; + case AMDGPU::V_FMAAK_F32: + // cannot inline the fixed literal in fmaak + addLiteral(Comp.MI.getOperand(3)); + Comp.Reg1 = Comp.MI.getOperand(2).getReg(); + break; + case AMDGPU::V_FMAC_F32_e32: + case AMDGPU::V_DOT2_F32_F16: + case AMDGPU::V_DOT2_F32_BF16: + Comp.Reg1 = Comp.MI.getOperand(2).getReg(); + Comp.Reg2 = Comp.MI.getOperand(0).getReg(); + break; + case AMDGPU::V_CNDMASK_B32_e32: + UniqueScalarRegs.push_back(AMDGPU::VCC_LO); + Comp.Reg1 = Comp.MI.getOperand(2).getReg(); + break; + case AMDGPU::V_MOV_B32_e32: + break; + default: + Comp.Reg1 = Comp.MI.getOperand(2).getReg(); + break; + } + + Comp.Dst = Comp.MI.getOperand(0).getReg(); + + const MachineOperand &Op0 = Comp.MI.getOperand(1); + if (Op0.isReg()) { + if (!TRI->isVectorRegister(MRI, Op0.getReg())) { + if (!is_contained(UniqueScalarRegs, Op0.getReg())) + UniqueScalarRegs.push_back(Op0.getReg()); + } else + Comp.Reg0 = Op0.getReg(); + } else { + if (!TII.isInlineConstant(Comp.MI, 1)) + addLiteral(Op0); + } + } + + if (UniqueLiterals.size() > 1) + return false; + if ((UniqueLiterals.size() + UniqueScalarRegs.size()) > 2) + return false; + + // check 
port 0 + if (CInfo[0].Reg0 && CInfo[1].Reg0 && + CInfo[0].Reg0 % NumVGPRBanks == CInfo[1].Reg0 % NumVGPRBanks) + return false; + // check port 1 + if (CInfo[0].Reg1 && CInfo[1].Reg1 && + CInfo[0].Reg1 % NumVGPRBanks == CInfo[1].Reg1 % NumVGPRBanks) + return false; + // check port 2 + if (CInfo[0].Reg2 && CInfo[1].Reg2 && + !((CInfo[0].Reg2 ^ CInfo[1].Reg2) & 0x1)) + return false; + if (!((CInfo[0].Dst ^ CInfo[1].Dst) & 0x1)) + return false; + + LLVM_DEBUG(dbgs() << "VOPD Reg Constraints Passed\n\tX: " << FirstMI + << "\n\tY: " << SecondMI << "\n"); + return true; +} + +/// Check if the instr pair, FirstMI and SecondMI, should be scheduled +/// together. Given SecondMI, when FirstMI is unspecified, then check if +/// SecondMI may be part of a fused pair at all. +static bool shouldScheduleVOPDAdjacent(const TargetInstrInfo &TII, + const TargetSubtargetInfo &TSI, + const MachineInstr *FirstMI, + const MachineInstr &SecondMI) { + const SIInstrInfo &STII = static_cast<const SIInstrInfo &>(TII); + unsigned Opc2 = SecondMI.getOpcode(); + auto SecondCanBeVOPD = AMDGPU::getCanBeVOPD(Opc2); + + // One instruction case + if (!FirstMI) + return SecondCanBeVOPD.Y; + + unsigned Opc = FirstMI->getOpcode(); + auto FirstCanBeVOPD = AMDGPU::getCanBeVOPD(Opc); + + if (!((FirstCanBeVOPD.X && SecondCanBeVOPD.Y) || + (FirstCanBeVOPD.Y && SecondCanBeVOPD.X))) + return false; + + return checkVOPDRegConstraints(STII, *FirstMI, SecondMI); +} + +/// Adapts design from MacroFusion +/// Puts valid candidate instructions back-to-back so they can easily +/// be turned into VOPD instructions +/// Greedily pairs instruction candidates. O(n^2) algorithm. +struct VOPDPairingMutation : ScheduleDAGMutation { + ShouldSchedulePredTy shouldScheduleAdjacent; // NOLINT: function pointer + + VOPDPairingMutation( + ShouldSchedulePredTy shouldScheduleAdjacent) // NOLINT: function pointer + : shouldScheduleAdjacent(shouldScheduleAdjacent) {} + + void apply(ScheduleDAGInstrs *DAG) override { + const TargetInstrInfo &TII = *DAG->TII; + const GCNSubtarget &ST = DAG->MF.getSubtarget<GCNSubtarget>(); + if (!AMDGPU::hasVOPD(ST) || !ST.isWave32()) { + LLVM_DEBUG(dbgs() << "Target does not support VOPDPairingMutation\n"); + return; + } + + std::vector<SUnit>::iterator ISUI, JSUI; + for (ISUI = DAG->SUnits.begin(); ISUI != DAG->SUnits.end(); ++ISUI) { + const MachineInstr *IMI = ISUI->getInstr(); + if (!shouldScheduleAdjacent(TII, ST, nullptr, *IMI)) + continue; + if (!hasLessThanNumFused(*ISUI, 2)) + continue; + + for (JSUI = ISUI + 1; JSUI != DAG->SUnits.end(); ++JSUI) { + if (JSUI->isBoundaryNode()) + continue; + const MachineInstr *JMI = JSUI->getInstr(); + if (!hasLessThanNumFused(*JSUI, 2) || + !shouldScheduleAdjacent(TII, ST, IMI, *JMI)) + continue; + if (fuseInstructionPair(*DAG, *ISUI, *JSUI)) + break; + } + } + LLVM_DEBUG(dbgs() << "Completed VOPDPairingMutation\n"); + } +}; + +std::unique_ptr<ScheduleDAGMutation> llvm::createVOPDPairingMutation() { + return std::make_unique<VOPDPairingMutation>(shouldScheduleVOPDAdjacent); +} diff --git a/llvm/lib/Target/AMDGPU/GCNVOPDUtils.h b/llvm/lib/Target/AMDGPU/GCNVOPDUtils.h new file mode 100644 index 000000000000..22361b9a1a07 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/GCNVOPDUtils.h @@ -0,0 +1,32 @@ +//===- GCNVOPDUtils.h - GCN VOPD Utils ------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file This file contains the AMDGPU DAG scheduling +/// mutation to pair VOPD instructions back to back. It also contains +// subroutines useful in the creation of VOPD instructions +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_VOPDUTILS_H +#define LLVM_LIB_TARGET_AMDGPU_VOPDUTILS_H + +#include "llvm/CodeGen/MachineScheduler.h" + +namespace llvm { + +class SIInstrInfo; + +bool checkVOPDRegConstraints(const SIInstrInfo &TII, + const MachineInstr &FirstMI, + const MachineInstr &SecondMI); + +std::unique_ptr<ScheduleDAGMutation> createVOPDPairingMutation(); + +} // namespace llvm + +#endif // LLVM_LIB_TARGET_AMDGPU_VOPDUTILS_H diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h index 02c213f90f89..228963ff2a20 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h @@ -62,12 +62,6 @@ public: virtual void getAVOperandEncoding(const MCInst &MI, unsigned OpNo, APInt &Op, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const = 0; - -protected: - FeatureBitset computeAvailableFeatures(const FeatureBitset &FB) const; - void - verifyInstructionPredicates(const MCInst &MI, - const FeatureBitset &AvailableFeatures) const; }; } // End namespace llvm diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp index 11fe3f9ef058..fba4b1a3db66 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp @@ -36,6 +36,7 @@ using namespace llvm; #define GET_INSTRINFO_MC_DESC +#define ENABLE_INSTR_PREDICATE_VERIFIER #include "AMDGPUGenInstrInfo.inc" #define GET_SUBTARGETINFO_MC_DESC diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h index 060d4b660632..c2e2563c3989 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h @@ -50,6 +50,7 @@ createAMDGPUELFObjectWriter(bool Is64Bit, uint8_t OSABI, #define GET_INSTRINFO_ENUM #define GET_INSTRINFO_OPERAND_ENUM +#define GET_INSTRINFO_MC_HELPER_DECLS #include "AMDGPUGenInstrInfo.inc" #define GET_SUBTARGETINFO_ENUM diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp index 78eb304fe84f..3d926e52c368 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp @@ -58,11 +58,6 @@ private: uint64_t getBinaryCodeForInstr(const MCInst &MI, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const; - FeatureBitset computeAvailableFeatures(const FeatureBitset &FB) const; - void - verifyInstructionPredicates(const MCInst &MI, - const FeatureBitset &AvailableFeatures) const; - }; } // end anonymous namespace @@ -90,11 +85,8 @@ MCCodeEmitter *llvm::createR600MCCodeEmitter(const MCInstrInfo &MCII, } void R600MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const { - verifyInstructionPredicates(MI, - 
computeAvailableFeatures(STI.getFeatureBits())); - + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { const MCInstrDesc &Desc = MCII.get(MI.getOpcode()); if (MI.getOpcode() == R600::RETURN || MI.getOpcode() == R600::FETCH_CLAUSE || @@ -187,5 +179,4 @@ uint64_t R600MCCodeEmitter::getMachineOpValue(const MCInst &MI, return MO.getImm(); } -#define ENABLE_INSTR_PREDICATE_VERIFIER #include "R600GenMCCodeEmitter.inc" diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.cpp index 269209a12175..b9ff195e0ddc 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.cpp @@ -13,10 +13,12 @@ #include "R600MCTargetDesc.h" #include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/SubtargetFeature.h" using namespace llvm; #define GET_INSTRINFO_MC_DESC +#define ENABLE_INSTR_PREDICATE_VERIFIER #include "R600GenInstrInfo.inc" MCInstrInfo *llvm::createR600MCInstrInfo() { diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.h index 605ae851378d..b4ce748532f8 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.h +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.h @@ -35,6 +35,7 @@ MCInstrInfo *createR600MCInstrInfo(); #define GET_INSTRINFO_ENUM #define GET_INSTRINFO_OPERAND_ENUM #define GET_INSTRINFO_SCHED_ENUM +#define GET_INSTRINFO_MC_HELPER_DECLS #include "R600GenInstrInfo.inc" #define GET_SUBTARGETINFO_ENUM diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp index 5e67fb5ec876..e093d78b2cc6 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp @@ -310,11 +310,8 @@ uint64_t SIMCCodeEmitter::getImplicitOpSelHiEncoding(int Opcode) const { } void SIMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const { - verifyInstructionPredicates(MI, - computeAvailableFeatures(STI.getFeatureBits())); - + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { int Opcode = MI.getOpcode(); APInt Encoding, Scratch; getBinaryCodeForInstr(MI, Fixups, Encoding, Scratch, STI); @@ -574,5 +571,4 @@ void SIMCCodeEmitter::getMachineOpValueCommon( llvm_unreachable("Encoding of this operand type is not supported yet."); } -#define ENABLE_INSTR_PREDICATE_VERIFIER #include "AMDGPUGenMCCodeEmitter.inc" diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp index bf52f7830ad7..5199a37a0519 100644 --- a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp @@ -1623,7 +1623,7 @@ static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry, NewBldVec); } -SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector, SDValue Swz[4], +SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector, SDValue Swz[], SelectionDAG &DAG, const SDLoc &DL) const { // Old -> New swizzle values diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.h b/llvm/lib/Target/AMDGPU/R600ISelLowering.h index 1e75a0432ec3..e7706fa0ef5c 100644 --- a/llvm/lib/Target/AMDGPU/R600ISelLowering.h +++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.h @@ -74,8 +74,8 @@ private: void lowerImplicitParameter(MachineInstr *MI, MachineBasicBlock &BB, MachineRegisterInfo & MRI, unsigned 
dword_offset) const; - SDValue OptimizeSwizzle(SDValue BuildVector, SDValue Swz[], SelectionDAG &DAG, - const SDLoc &DL) const; + SDValue OptimizeSwizzle(SDValue BuildVector, SDValue Swz[], + SelectionDAG &DAG, const SDLoc &DL) const; SDValue vectorToVerticalVector(SelectionDAG &DAG, SDValue Vector) const; SDValue lowerFrameIndex(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/AMDGPU/R600MCInstLower.cpp b/llvm/lib/Target/AMDGPU/R600MCInstLower.cpp index 8f7807a2b472..f81f5122bbc9 100644 --- a/llvm/lib/Target/AMDGPU/R600MCInstLower.cpp +++ b/llvm/lib/Target/AMDGPU/R600MCInstLower.cpp @@ -13,6 +13,7 @@ // #include "AMDGPUMCInstLower.h" +#include "MCTargetDesc/R600MCTargetDesc.h" #include "R600AsmPrinter.h" #include "R600Subtarget.h" #include "llvm/CodeGen/MachineOperand.h" @@ -42,6 +43,9 @@ void R600MCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const { } void R600AsmPrinter::emitInstruction(const MachineInstr *MI) { + R600_MC::verifyInstructionPredicates(MI->getOpcode(), + getSubtargetInfo().getFeatureBits()); + const R600Subtarget &STI = MF->getSubtarget<R600Subtarget>(); R600MCInstLower MCInstLowering(OutContext, STI, *this); diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 094d5cd58673..d16da2a8b86b 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -352,7 +352,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, // TODO: Generalize to more vector types. setOperationAction({ISD::EXTRACT_VECTOR_ELT, ISD::INSERT_VECTOR_ELT}, {MVT::v2i16, MVT::v2f16, MVT::v2i8, MVT::v4i8, MVT::v8i8, - MVT::v4i16, MVT::v4f16, MVT::v16i16, MVT::v16f16}, + MVT::v4i16, MVT::v4f16}, Custom); // Deal with vec3 vector operations when widened to vec4. diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 814a7c446889..799d34e32d27 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -3335,15 +3335,18 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, (ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() || !RI.isSGPRReg(MBB.getParent()->getRegInfo(), Src0->getReg()))) { MachineInstr *DefMI; - const auto killDef = [&DefMI, &MBB, this]() -> void { + const auto killDef = [&]() -> void { const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); // The only user is the instruction which will be killed. - if (!MRI.hasOneNonDBGUse(DefMI->getOperand(0).getReg())) + Register DefReg = DefMI->getOperand(0).getReg(); + if (!MRI.hasOneNonDBGUse(DefReg)) return; // We cannot just remove the DefMI here, calling pass will crash. DefMI->setDesc(get(AMDGPU::IMPLICIT_DEF)); for (unsigned I = DefMI->getNumOperands() - 1; I != 0; --I) DefMI->removeOperand(I); + if (LV) + LV->getVarInfo(DefReg).AliveBlocks.clear(); }; int64_t Imm; @@ -3982,6 +3985,14 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0); int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1); int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2); + int Src3Idx = -1; + if (Src0Idx == -1) { + // VOPD V_DUAL_* instructions use different operand names. 
+ Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0X); + Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1X); + Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0Y); + Src3Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1Y); + } // Make sure the number of operands is correct. const MCInstrDesc &Desc = get(Opcode); @@ -4255,9 +4266,9 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, // Only look at the true operands. Only a real operand can use the constant // bus, and we don't want to check pseudo-operands like the source modifier // flags. - for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx}) { + for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx, Src3Idx}) { if (OpIdx == -1) - break; + continue; const MachineOperand &MO = MI.getOperand(OpIdx); if (usesConstantBus(MRI, MO, MI.getDesc().OpInfo[OpIdx])) { if (MO.isReg()) { diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index 311f9f68e675..1b411eb83eb3 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -1242,6 +1242,9 @@ namespace AMDGPU { int getDPPOp32(uint16_t Opcode); LLVM_READONLY + int getDPPOp64(uint16_t Opcode); + + LLVM_READONLY int getBasicFromSDWAOp(uint16_t Opcode); LLVM_READONLY diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 29ee9f12b12d..23afd6556bc9 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -193,43 +193,32 @@ def SIbuffer_atomic_fadd : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FADD">; def SIbuffer_atomic_fmin : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FMIN">; def SIbuffer_atomic_fmax : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FMAX">; -multiclass SDBufferAtomicRetNoRet { - def "_ret" : PatFrag< - (ops node:$vdata_in, node:$rsrc, node:$vindex, node:$voffset, node:$soffset, - node:$offset, node:$cachepolicy, node:$idxen), - (!cast<SDNode>(NAME) node:$vdata_in, node:$rsrc, node:$vindex, - node:$voffset, node:$soffset, node:$offset, node:$cachepolicy, - node:$idxen)> { - let PredicateCode = [{ return !(SDValue(N, 0).use_empty()); }]; - let GISelPredicateCode = [{ return true; }]; - } - +multiclass SDBufferAtomicNoRet { def "_noret" : PatFrag< (ops node:$vdata_in, node:$rsrc, node:$vindex, node:$voffset, node:$soffset, node:$offset, node:$cachepolicy, node:$idxen), (!cast<SDNode>(NAME) node:$vdata_in, node:$rsrc, node:$vindex, node:$voffset, node:$soffset, node:$offset, node:$cachepolicy, node:$idxen)> { - let PredicateCode = [{ return SDValue(N, 0).use_empty(); }]; - let GISelPredicateCode = [{ return false; }]; + let HasNoUse = true; } } -defm SIbuffer_atomic_swap : SDBufferAtomicRetNoRet; -defm SIbuffer_atomic_add : SDBufferAtomicRetNoRet; -defm SIbuffer_atomic_sub : SDBufferAtomicRetNoRet; -defm SIbuffer_atomic_smin : SDBufferAtomicRetNoRet; -defm SIbuffer_atomic_umin : SDBufferAtomicRetNoRet; -defm SIbuffer_atomic_smax : SDBufferAtomicRetNoRet; -defm SIbuffer_atomic_umax : SDBufferAtomicRetNoRet; -defm SIbuffer_atomic_and : SDBufferAtomicRetNoRet; -defm SIbuffer_atomic_or : SDBufferAtomicRetNoRet; -defm SIbuffer_atomic_xor : SDBufferAtomicRetNoRet; -defm SIbuffer_atomic_inc : SDBufferAtomicRetNoRet; -defm SIbuffer_atomic_dec : SDBufferAtomicRetNoRet; -defm SIbuffer_atomic_fadd : SDBufferAtomicRetNoRet; -defm SIbuffer_atomic_fmin : SDBufferAtomicRetNoRet; -defm SIbuffer_atomic_fmax : SDBufferAtomicRetNoRet; +defm SIbuffer_atomic_swap : SDBufferAtomicNoRet; +defm 
SIbuffer_atomic_add : SDBufferAtomicNoRet; +defm SIbuffer_atomic_sub : SDBufferAtomicNoRet; +defm SIbuffer_atomic_smin : SDBufferAtomicNoRet; +defm SIbuffer_atomic_umin : SDBufferAtomicNoRet; +defm SIbuffer_atomic_smax : SDBufferAtomicNoRet; +defm SIbuffer_atomic_umax : SDBufferAtomicNoRet; +defm SIbuffer_atomic_and : SDBufferAtomicNoRet; +defm SIbuffer_atomic_or : SDBufferAtomicNoRet; +defm SIbuffer_atomic_xor : SDBufferAtomicNoRet; +defm SIbuffer_atomic_inc : SDBufferAtomicNoRet; +defm SIbuffer_atomic_dec : SDBufferAtomicNoRet; +defm SIbuffer_atomic_fadd : SDBufferAtomicNoRet; +defm SIbuffer_atomic_fmin : SDBufferAtomicNoRet; +defm SIbuffer_atomic_fmax : SDBufferAtomicNoRet; def SIbuffer_atomic_cmpswap : SDNode <"AMDGPUISD::BUFFER_ATOMIC_CMPSWAP", SDTypeProfile<1, 9, @@ -246,24 +235,13 @@ def SIbuffer_atomic_cmpswap : SDNode <"AMDGPUISD::BUFFER_ATOMIC_CMPSWAP", [SDNPMemOperand, SDNPHasChain, SDNPMayLoad, SDNPMayStore] >; -def SIbuffer_atomic_cmpswap_ret : PatFrag< - (ops node:$src, node:$cmp, node:$rsrc, node:$vindex, node:$voffset, - node:$soffset, node:$offset, node:$cachepolicy, node:$idxen), - (SIbuffer_atomic_cmpswap node:$src, node:$cmp, node:$rsrc, node:$vindex, - node:$voffset, node:$soffset, node:$offset, node:$cachepolicy, - node:$idxen)> { - let PredicateCode = [{ return !(SDValue(N, 0).use_empty()); }]; - let GISelPredicateCode = [{ return true; }]; -} - def SIbuffer_atomic_cmpswap_noret : PatFrag< (ops node:$src, node:$cmp, node:$rsrc, node:$vindex, node:$voffset, node:$soffset, node:$offset, node:$cachepolicy, node:$idxen), (SIbuffer_atomic_cmpswap node:$src, node:$cmp, node:$rsrc, node:$vindex, node:$voffset, node:$soffset, node:$offset, node:$cachepolicy, node:$idxen)> { - let PredicateCode = [{ return SDValue(N, 0).use_empty(); }]; - let GISelPredicateCode = [{ return false; }]; + let HasNoUse = true; } class SDGlobalAtomicNoRtn<string opcode, ValueType ty> : SDNode <opcode, @@ -774,13 +752,13 @@ multiclass SIAtomicM0Glue2 <string op_name, bit is_amdgpu = 0, let AddressSpaces = StoreAddress_local.AddrSpaces in { defm _local_m0 : binary_atomic_op <!cast<SDNode>(NAME#"_glue"), IsInt>; - defm _local_m0 : ret_noret_binary_atomic_op <!cast<SDNode>(NAME#"_glue"), + defm _local_m0 : noret_binary_atomic_op <!cast<SDNode>(NAME#"_glue"), IsInt>; } let AddressSpaces = StoreAddress_region.AddrSpaces in { defm _region_m0 : binary_atomic_op <!cast<SDNode>(NAME#"_glue"), IsInt>; - defm _region_m0 : ret_noret_binary_atomic_op <!cast<SDNode>(NAME#"_glue"), + defm _region_m0 : noret_binary_atomic_op <!cast<SDNode>(NAME#"_glue"), IsInt>; } } @@ -2194,21 +2172,21 @@ class getAsmVOP3DPPBase <int NumSrcArgs, bit HasDst, bit HasClamp, "$sdst", "$vdst"), ""); // use $sdst for VOPC - string isrc0 = !if(!eq(NumSrcArgs, 1), "$src0", "$src0,"); - string isrc1 = !if(!eq(NumSrcArgs, 1), "", - !if(!eq(NumSrcArgs, 2), " $src1", - " $src1,")); - string isrc2 = !if(!eq(NumSrcArgs, 3), " $src2", ""); - - string fsrc0 = !if(!eq(NumSrcArgs, 1), "$src0_modifiers", "$src0_modifiers,"); - string fsrc1 = !if(!eq(NumSrcArgs, 1), "", - !if(!eq(NumSrcArgs, 2), " $src1_modifiers", - " $src1_modifiers,")); - string fsrc2 = !if(!eq(NumSrcArgs, 3), " $src2_modifiers", ""); - - string src0 = !if(Src0HasMods, fsrc0, isrc0); - string src1 = !if(Src1HasMods, fsrc1, isrc1); - string src2 = !if(Src2HasMods, fsrc2, isrc2); + string src0nomods = !if(!eq(NumSrcArgs, 1), "$src0", "$src0,"); + string src1nomods = !if(!eq(NumSrcArgs, 1), "", + !if(!eq(NumSrcArgs, 2), " $src1", + " $src1,")); + string src2nomods = 
!if(!eq(NumSrcArgs, 3), " $src2", ""); + + string src0mods = !if(!eq(NumSrcArgs, 1), "$src0_modifiers", "$src0_modifiers,"); + string src1mods = !if(!eq(NumSrcArgs, 1), "", + !if(!eq(NumSrcArgs, 2), " $src1_modifiers", + " $src1_modifiers,")); + string src2mods = !if(!eq(NumSrcArgs, 3), " $src2_modifiers", ""); + + string src0 = !if(Src0HasMods, src0mods, src0nomods); + string src1 = !if(Src1HasMods, src1mods, src1nomods); + string src2 = !if(Src2HasMods, src2mods, src2nomods); string opsel = !if(HasOpSel, "$op_sel", ""); string 3PMods = !if(IsVOP3P, !if(HasOpSel, "$op_sel_hi", "") @@ -2559,8 +2537,8 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableF32SrcMods = 0, // the asm operand name via this HasModifiers flag field string AsmDPP8 = getAsmDPP8<HasDst, NumSrcArgs, 0 /*HasModifiers*/, DstVT>.ret; field string AsmVOP3DPPBase = getAsmVOP3DPPBase<NumSrcArgs, HasDst, HasClamp, - HasOpSel, HasOMod, IsVOP3P, HasModifiers, HasSrc0FloatMods, HasSrc1FloatMods, - HasSrc2FloatMods, DstVT >.ret; + HasOpSel, HasOMod, IsVOP3P, HasModifiers, HasModifiers, HasModifiers, + HasModifiers, DstVT>.ret; field string AsmVOP3DPP = getAsmVOP3DPP<AsmVOP3DPPBase>.ret; field string AsmVOP3DPP16 = getAsmVOP3DPP16<AsmVOP3DPPBase>.ret; field string AsmVOP3DPP8 = getAsmVOP3DPP8<AsmVOP3DPPBase>.ret; @@ -2800,6 +2778,14 @@ def getDPPOp32 : InstrMapping { let ValueCols = [["DPP"]]; } +def getDPPOp64 : InstrMapping { + let FilterClass = "VOP"; + let RowFields = ["OpName"]; + let ColFields = ["AsmVariantName"]; + let KeyCol = ["VOP3"]; + let ValueCols = [["VOP3_DPP"]]; +} + // Maps a commuted opcode to its original version def getCommuteOrig : InstrMapping { let FilterClass = "Commutable_REV"; @@ -2961,6 +2947,27 @@ def getVCMPXOpFromVCMP : InstrMapping { let ValueCols = [["1"]]; } +def VOPDComponentTable : GenericTable { + let FilterClass = "VOPD_Component"; + let CppTypeName = "VOPDComponentInfo"; + let Fields = ["BaseVOP", "VOPDOp", "CanBeVOPDX"]; + let PrimaryKey = ["BaseVOP"]; + let PrimaryKeyName = "getVOPDComponentHelper"; +} + +def VOPDPairs : GenericTable { + let FilterClass = "VOPD_Base"; + let CppTypeName = "VOPDInfo"; + let Fields = ["Opcode", "OpX", "OpY"]; + let PrimaryKey = ["Opcode"]; + let PrimaryKeyName = "getVOPDOpcodeHelper"; +} + +def getVOPDInfoFromComponentOpcodes : SearchIndex { + let Table = VOPDPairs; + let Key = ["OpX", "OpY"]; +} + include "SIInstructions.td" include "DSInstructions.td" diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 829669157893..ce8c03bb8d64 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -1449,6 +1449,14 @@ def : BitConvert <v8i32, v16f16, VReg_256>; def : BitConvert <v8i32, v16i16, VReg_256>; def : BitConvert <v8f32, v16f16, VReg_256>; def : BitConvert <v8f32, v16i16, VReg_256>; +def : BitConvert <v16f16, v4i64, VReg_256>; +def : BitConvert <v16i16, v4i64, VReg_256>; +def : BitConvert <v16f16, v4f64, VReg_256>; +def : BitConvert <v16i16, v4f64, VReg_256>; +def : BitConvert <v4i64, v16f16, VReg_256>; +def : BitConvert <v4i64, v16i16, VReg_256>; +def : BitConvert <v4f64, v16f16, VReg_256>; +def : BitConvert <v4f64, v16i16, VReg_256>; // 512-bit bitcast def : BitConvert <v16i32, v16f32, VReg_512>; @@ -3012,6 +3020,35 @@ multiclass Int16Med3Pat<Instruction med3Inst, def : FPMed3Pat<f32, V_MED3_F32_e64>; +class +IntMinMaxPat<Instruction minmaxInst, SDPatternOperator min_or_max, + SDPatternOperator max_or_min_oneuse> : AMDGPUPat < + (DivergentBinFrag<min_or_max>
(max_or_min_oneuse i32:$src0, i32:$src1), + i32:$src2), + (minmaxInst VSrc_b32:$src0, VSrc_b32:$src1, VSrc_b32:$src2) +>; + +class +FPMinMaxPat<Instruction minmaxInst, ValueType vt, SDPatternOperator min_or_max, + SDPatternOperator max_or_min_oneuse> : GCNPat < + (min_or_max (max_or_min_oneuse (VOP3Mods vt:$src0, i32:$src0_mods), + (VOP3Mods vt:$src1, i32:$src1_mods)), + (vt (VOP3Mods vt:$src2, i32:$src2_mods))), + (minmaxInst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2, + DSTCLAMP.NONE, DSTOMOD.NONE) +>; + +let OtherPredicates = [isGFX11Plus] in { +def : IntMinMaxPat<V_MAXMIN_I32_e64, smin, smax_oneuse>; +def : IntMinMaxPat<V_MINMAX_I32_e64, smax, smin_oneuse>; +def : IntMinMaxPat<V_MAXMIN_U32_e64, umin, umax_oneuse>; +def : IntMinMaxPat<V_MINMAX_U32_e64, umax, umin_oneuse>; +def : FPMinMaxPat<V_MINMAX_F32_e64, f32, fmaxnum_like, fminnum_like_oneuse>; +def : FPMinMaxPat<V_MAXMIN_F32_e64, f32, fminnum_like, fmaxnum_like_oneuse>; +def : FPMinMaxPat<V_MINMAX_F16_e64, f16, fmaxnum_like, fminnum_like_oneuse>; +def : FPMinMaxPat<V_MAXMIN_F16_e64, f16, fminnum_like, fmaxnum_like_oneuse>; +} + let OtherPredicates = [isGFX9Plus] in { def : FP16Med3Pat<f16, V_MED3_F16_e64>; defm : Int16Med3Pat<V_MED3_I16_e64, smin, smax, smax_oneuse, smin_oneuse>; diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp index 607383ab8cde..67077a2eaa6b 100644 --- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp @@ -148,6 +148,7 @@ public: } void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addUsedIfAvailable<LiveIntervals>(); // Should preserve the same set that TwoAddressInstructions does. AU.addPreserved<MachineDominatorTree>(); AU.addPreserved<SlotIndexes>(); diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp index dd881ec42d53..786b6b61cb23 100644 --- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp @@ -72,7 +72,7 @@ INITIALIZE_PASS_END(SILowerSGPRSpills, DEBUG_TYPE, char &llvm::SILowerSGPRSpillsID = SILowerSGPRSpills::ID; -/// Insert restore code for the callee-saved registers used in the function. +/// Insert spill code for the callee-saved registers used in the function. 
static void insertCSRSaves(MachineBasicBlock &SaveBlock, ArrayRef<CalleeSavedInfo> CSI, LiveIntervals *LIS) { diff --git a/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp b/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp index e426e938b856..ff5587fbb0ca 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp @@ -1883,7 +1883,13 @@ void SIScheduleDAGMI::schedule() LLVM_DEBUG(dbgs() << "Preparing Scheduling\n"); buildDAGWithRegPressure(); + postprocessDAG(); + LLVM_DEBUG(dump()); + if (PrintDAGs) + dump(); + if (ViewMISchedDAGs) + viewGraph(); topologicalSort(); findRootsAndBiasEdges(TopRoots, BotRoots); diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index 8a66213931ff..6b93769949bc 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -2329,13 +2329,13 @@ bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) { continue; if (const auto &MOI = MOA.getLoadInfo(MI)) - Changed |= expandLoad(MOI.getValue(), MI); + Changed |= expandLoad(MOI.value(), MI); else if (const auto &MOI = MOA.getStoreInfo(MI)) - Changed |= expandStore(MOI.getValue(), MI); + Changed |= expandStore(MOI.value(), MI); else if (const auto &MOI = MOA.getAtomicFenceInfo(MI)) - Changed |= expandAtomicFence(MOI.getValue(), MI); + Changed |= expandAtomicFence(MOI.value(), MI); else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI)) - Changed |= expandAtomicCmpxchgOrRmw(MOI.getValue(), MI); + Changed |= expandAtomicCmpxchgOrRmw(MOI.value(), MI); } } diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp index 5215397d5936..66bc46aaefea 100644 --- a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp +++ b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp @@ -9,6 +9,7 @@ #include "AMDGPU.h" #include "GCNSubtarget.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "SIRegisterInfo.h" #include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/InitializePasses.h" @@ -20,10 +21,40 @@ using namespace llvm; namespace { class SIOptimizeExecMasking : public MachineFunctionPass { + MachineFunction *MF = nullptr; + const GCNSubtarget *ST = nullptr; + const SIRegisterInfo *TRI = nullptr; + const SIInstrInfo *TII = nullptr; + const MachineRegisterInfo *MRI = nullptr; + + Register isCopyFromExec(const MachineInstr &MI) const; + Register isCopyToExec(const MachineInstr &MI) const; + bool removeTerminatorBit(MachineInstr &MI) const; + MachineBasicBlock::reverse_iterator + fixTerminators(MachineBasicBlock &MBB) const; + MachineBasicBlock::reverse_iterator + findExecCopy(MachineBasicBlock &MBB, MachineBasicBlock::reverse_iterator I, + unsigned CopyToExec) const; + + bool isRegisterInUseBetween(MachineInstr &Stop, MachineInstr &Start, + MCRegister Reg, bool UseLiveOuts = false, + bool IgnoreStart = false) const; + bool isRegisterInUseAfter(MachineInstr &Stop, MCRegister Reg) const; + MachineInstr *findInstrBackwards(MachineInstr &Origin, + std::function<bool(MachineInstr *)> Pred, + ArrayRef<MCRegister> NonModifiableRegs, + unsigned MaxInstructions = 20) const; + MachineInstr *findPossibleVCMPVCMPXOptimization(MachineInstr &SaveExec, + MCRegister Exec) const; + bool optimizeExecSequence() const; + bool optimizeVCmpxAndSaveexecSequence() const; + bool optimizeSingleVCMPSaveExecSequence(MachineInstr &SaveExecInstr, + MachineInstr &VCmp, + MCRegister Exec) const; + public: 
static char ID; -public: SIOptimizeExecMasking() : MachineFunctionPass(ID) { initializeSIOptimizeExecMaskingPass(*PassRegistry::getPassRegistry()); } @@ -53,7 +84,7 @@ char SIOptimizeExecMasking::ID = 0; char &llvm::SIOptimizeExecMaskingID = SIOptimizeExecMasking::ID; /// If \p MI is a copy from exec, return the register copied to. -static Register isCopyFromExec(const MachineInstr &MI, const GCNSubtarget &ST) { +Register SIOptimizeExecMasking::isCopyFromExec(const MachineInstr &MI) const { switch (MI.getOpcode()) { case AMDGPU::COPY: case AMDGPU::S_MOV_B64: @@ -61,8 +92,7 @@ static Register isCopyFromExec(const MachineInstr &MI, const GCNSubtarget &ST) { case AMDGPU::S_MOV_B32: case AMDGPU::S_MOV_B32_term: { const MachineOperand &Src = MI.getOperand(1); - if (Src.isReg() && - Src.getReg() == (ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC)) + if (Src.isReg() && Src.getReg() == TRI->getExec()) return MI.getOperand(0).getReg(); } } @@ -71,14 +101,13 @@ static Register isCopyFromExec(const MachineInstr &MI, const GCNSubtarget &ST) { } /// If \p MI is a copy to exec, return the register copied from. -static Register isCopyToExec(const MachineInstr &MI, const GCNSubtarget &ST) { +Register SIOptimizeExecMasking::isCopyToExec(const MachineInstr &MI) const { switch (MI.getOpcode()) { case AMDGPU::COPY: case AMDGPU::S_MOV_B64: case AMDGPU::S_MOV_B32: { const MachineOperand &Dst = MI.getOperand(0); - if (Dst.isReg() && - Dst.getReg() == (ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC) && + if (Dst.isReg() && Dst.getReg() == TRI->getExec() && MI.getOperand(1).isReg()) return MI.getOperand(1).getReg(); break; @@ -173,64 +202,64 @@ static unsigned getSaveExecOp(unsigned Opc) { // These are only terminators to get correct spill code placement during // register allocation, so turn them back into normal instructions. -static bool removeTerminatorBit(const SIInstrInfo &TII, MachineInstr &MI) { +bool SIOptimizeExecMasking::removeTerminatorBit(MachineInstr &MI) const { switch (MI.getOpcode()) { case AMDGPU::S_MOV_B32_term: { bool RegSrc = MI.getOperand(1).isReg(); - MI.setDesc(TII.get(RegSrc ? AMDGPU::COPY : AMDGPU::S_MOV_B32)); + MI.setDesc(TII->get(RegSrc ? AMDGPU::COPY : AMDGPU::S_MOV_B32)); return true; } case AMDGPU::S_MOV_B64_term: { bool RegSrc = MI.getOperand(1).isReg(); - MI.setDesc(TII.get(RegSrc ? AMDGPU::COPY : AMDGPU::S_MOV_B64)); + MI.setDesc(TII->get(RegSrc ? AMDGPU::COPY : AMDGPU::S_MOV_B64)); return true; } case AMDGPU::S_XOR_B64_term: { // This is only a terminator to get the correct spill code placement during // register allocation. - MI.setDesc(TII.get(AMDGPU::S_XOR_B64)); + MI.setDesc(TII->get(AMDGPU::S_XOR_B64)); return true; } case AMDGPU::S_XOR_B32_term: { // This is only a terminator to get the correct spill code placement during // register allocation. - MI.setDesc(TII.get(AMDGPU::S_XOR_B32)); + MI.setDesc(TII->get(AMDGPU::S_XOR_B32)); return true; } case AMDGPU::S_OR_B64_term: { // This is only a terminator to get the correct spill code placement during // register allocation. - MI.setDesc(TII.get(AMDGPU::S_OR_B64)); + MI.setDesc(TII->get(AMDGPU::S_OR_B64)); return true; } case AMDGPU::S_OR_B32_term: { // This is only a terminator to get the correct spill code placement during // register allocation. - MI.setDesc(TII.get(AMDGPU::S_OR_B32)); + MI.setDesc(TII->get(AMDGPU::S_OR_B32)); return true; } case AMDGPU::S_ANDN2_B64_term: { // This is only a terminator to get the correct spill code placement during // register allocation. 
- MI.setDesc(TII.get(AMDGPU::S_ANDN2_B64)); + MI.setDesc(TII->get(AMDGPU::S_ANDN2_B64)); return true; } case AMDGPU::S_ANDN2_B32_term: { // This is only a terminator to get the correct spill code placement during // register allocation. - MI.setDesc(TII.get(AMDGPU::S_ANDN2_B32)); + MI.setDesc(TII->get(AMDGPU::S_ANDN2_B32)); return true; } case AMDGPU::S_AND_B64_term: { // This is only a terminator to get the correct spill code placement during // register allocation. - MI.setDesc(TII.get(AMDGPU::S_AND_B64)); + MI.setDesc(TII->get(AMDGPU::S_AND_B64)); return true; } case AMDGPU::S_AND_B32_term: { // This is only a terminator to get the correct spill code placement during // register allocation. - MI.setDesc(TII.get(AMDGPU::S_AND_B32)); + MI.setDesc(TII->get(AMDGPU::S_AND_B32)); return true; } default: @@ -241,9 +270,8 @@ static bool removeTerminatorBit(const SIInstrInfo &TII, MachineInstr &MI) { // Turn all pseudoterminators in the block into their equivalent non-terminator // instructions. Returns the reverse iterator to the first non-terminator // instruction in the block. -static MachineBasicBlock::reverse_iterator fixTerminators( - const SIInstrInfo &TII, - MachineBasicBlock &MBB) { +MachineBasicBlock::reverse_iterator +SIOptimizeExecMasking::fixTerminators(MachineBasicBlock &MBB) const { MachineBasicBlock::reverse_iterator I = MBB.rbegin(), E = MBB.rend(); bool Seen = false; @@ -252,7 +280,7 @@ static MachineBasicBlock::reverse_iterator fixTerminators( if (!I->isTerminator()) return Seen ? FirstNonTerm : I; - if (removeTerminatorBit(TII, *I)) { + if (removeTerminatorBit(*I)) { if (!Seen) { FirstNonTerm = I; Seen = true; @@ -263,17 +291,15 @@ static MachineBasicBlock::reverse_iterator fixTerminators( return FirstNonTerm; } -static MachineBasicBlock::reverse_iterator findExecCopy( - const SIInstrInfo &TII, - const GCNSubtarget &ST, - MachineBasicBlock &MBB, - MachineBasicBlock::reverse_iterator I, - unsigned CopyToExec) { +MachineBasicBlock::reverse_iterator +SIOptimizeExecMasking::findExecCopy(MachineBasicBlock &MBB, + MachineBasicBlock::reverse_iterator I, + unsigned CopyToExec) const { const unsigned InstLimit = 25; auto E = MBB.rend(); for (unsigned N = 0; N <= InstLimit && I != E; ++I, ++N) { - Register CopyFromExec = isCopyFromExec(*I, ST); + Register CopyFromExec = isCopyFromExec(*I); if (CopyFromExec.isValid()) return I; } @@ -298,11 +324,9 @@ static bool isLiveOut(const MachineBasicBlock &MBB, unsigned Reg) { // an arbitrary condition based on the current MachineInstr, for instance a // target instruction. Breaks prematurely by returning nullptr if one of the // registers given in NonModifiableRegs is modified by the current instruction.
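Condensed sketch of the call shape this helper supports, distilled from its use later in this patch (names such as SaveExec and SaveExecSrc0Reg are illustrative):

// Walk upward from SaveExec for at most the default 20 instructions,
// looking for a v_cmp that has a v_cmpx equivalent; give up early if
// EXEC or the saveexec source register is clobbered on the way.
MachineInstr *VCmp = findInstrBackwards(
    SaveExec,
    [&](MachineInstr *Check) {
      return AMDGPU::getVCMPXOpFromVCMP(Check->getOpcode()) != -1;
    },
    {Exec, SaveExecSrc0Reg});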
-static MachineInstr * -findInstrBackwards(MachineInstr &Origin, - std::function<bool(MachineInstr *)> Pred, - ArrayRef<MCRegister> NonModifiableRegs, - const SIRegisterInfo *TRI, unsigned MaxInstructions = 20) { +MachineInstr *SIOptimizeExecMasking::findInstrBackwards( + MachineInstr &Origin, std::function<bool(MachineInstr *)> Pred, + ArrayRef<MCRegister> NonModifiableRegs, unsigned MaxInstructions) const { MachineBasicBlock::reverse_iterator A = Origin.getReverseIterator(), E = Origin.getParent()->rend(); unsigned CurrentIteration = 0; @@ -310,7 +334,7 @@ findInstrBackwards(MachineInstr &Origin, for (++A; CurrentIteration < MaxInstructions && A != E; ++A) { if (A->isDebugInstr()) continue; - + if (Pred(&*A)) return &*A; @@ -318,209 +342,64 @@ findInstrBackwards(MachineInstr &Origin, if (A->modifiesRegister(Reg, TRI)) return nullptr; } - + ++CurrentIteration; } return nullptr; } - // Determine if a register Reg is not re-defined and still in use // in the range (Stop..Start]. // It does so by calculating liveness backwards from the end of the BB until // either Stop or the beginning of the BB is reached. // After liveness is calculated, we can determine if Reg is still in use and not // defined in between the instructions. -static bool isRegisterInUseBetween(MachineInstr &Stop, MachineInstr &Start, - MCRegister Reg, const SIRegisterInfo *TRI, - MachineRegisterInfo &MRI, - bool useLiveOuts = false, - bool ignoreStart = false) { +bool SIOptimizeExecMasking::isRegisterInUseBetween(MachineInstr &Stop, + MachineInstr &Start, + MCRegister Reg, + bool UseLiveOuts, + bool IgnoreStart) const { LivePhysRegs LR(*TRI); - if (useLiveOuts) + if (UseLiveOuts) LR.addLiveOuts(*Stop.getParent()); MachineBasicBlock::reverse_iterator A(Start); MachineBasicBlock::reverse_iterator E(Stop); - if (ignoreStart) + if (IgnoreStart) ++A; for (; A != Stop.getParent()->rend() && A != Stop; ++A) { LR.stepBackward(*A); } - return !LR.available(MRI, Reg); + return !LR.available(*MRI, Reg); } // Determine if a register Reg is not re-defined and still in use // in the range (Stop..BB.end]. -static bool isRegisterInUseAfter(MachineInstr &Stop, MCRegister Reg, - const SIRegisterInfo *TRI, - MachineRegisterInfo &MRI) { - return isRegisterInUseBetween(Stop, *Stop.getParent()->rbegin(), Reg, TRI, - MRI, true); +bool SIOptimizeExecMasking::isRegisterInUseAfter(MachineInstr &Stop, + MCRegister Reg) const { + return isRegisterInUseBetween(Stop, *Stop.getParent()->rbegin(), Reg, true); } -// Tries to find a possibility to optimize a v_cmp ..., s_and_saveexec sequence -// by looking at an instance of a s_and_saveexec instruction. Returns a pointer -// to the v_cmp instruction if it is safe to replace the sequence (see the -// conditions in the function body). This is after register allocation, so some -// checks on operand dependencies need to be considered. -static MachineInstr *findPossibleVCMPVCMPXOptimization( - MachineInstr &SaveExec, MCRegister Exec, const SIRegisterInfo *TRI, - const SIInstrInfo *TII, MachineRegisterInfo &MRI) { - - MachineInstr *VCmp = nullptr; - - Register SaveExecDest = SaveExec.getOperand(0).getReg(); - if (!TRI->isSGPRReg(MRI, SaveExecDest)) - return nullptr; - - MachineOperand *SaveExecSrc0 = - TII->getNamedOperand(SaveExec, AMDGPU::OpName::src0); - if (!SaveExecSrc0->isReg()) - return nullptr; - - // Try to find the last v_cmp instruction that defs the saveexec input - // operand without any write to Exec or the saveexec input operand inbetween.
- VCmp = findInstrBackwards( - SaveExec, - [&](MachineInstr *Check) { - return AMDGPU::getVCMPXOpFromVCMP(Check->getOpcode()) != -1 && - Check->modifiesRegister(SaveExecSrc0->getReg(), TRI); - }, - {Exec, SaveExecSrc0->getReg()}, TRI); - - if (!VCmp) - return nullptr; - - MachineOperand *VCmpDest = TII->getNamedOperand(*VCmp, AMDGPU::OpName::sdst); - assert(VCmpDest && "Should have an sdst operand!"); - - // Check if any of the v_cmp source operands is written by the saveexec. - MachineOperand *Src0 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src0); - if (Src0->isReg() && TRI->isSGPRReg(MRI, Src0->getReg()) && - SaveExec.modifiesRegister(Src0->getReg(), TRI)) - return nullptr; - - MachineOperand *Src1 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src1); - if (Src1->isReg() && TRI->isSGPRReg(MRI, Src1->getReg()) && - SaveExec.modifiesRegister(Src1->getReg(), TRI)) - return nullptr; - - // Don't do the transformation if the destination operand is included in - // it's MBB Live-outs, meaning it's used in any of it's successors, leading - // to incorrect code if the v_cmp and therefore the def of - // the dest operand is removed. - if (isLiveOut(*VCmp->getParent(), VCmpDest->getReg())) - return nullptr; - - // If the v_cmp target is in use between v_cmp and s_and_saveexec or after the - // s_and_saveexec, skip the optimization. - if (isRegisterInUseBetween(*VCmp, SaveExec, VCmpDest->getReg(), TRI, MRI, - false, true) || - isRegisterInUseAfter(SaveExec, VCmpDest->getReg(), TRI, MRI)) - return nullptr; - - // Try to determine if there is a write to any of the VCmp - // operands between the saveexec and the vcmp. - // If yes, additional VGPR spilling might need to be inserted. In this case, - // it's not worth replacing the instruction sequence. - SmallVector<MCRegister, 2> NonDefRegs; - if (Src0->isReg()) - NonDefRegs.push_back(Src0->getReg()); - - if (Src1->isReg()) - NonDefRegs.push_back(Src1->getReg()); - - if (!findInstrBackwards( - SaveExec, [&](MachineInstr *Check) { return Check == VCmp; }, - NonDefRegs, TRI)) - return nullptr; - - return VCmp; -} - -// Inserts the optimized s_mov_b32 / v_cmpx sequence based on the -// operands extracted from a v_cmp ..., s_and_saveexec pattern. -static bool optimizeVCMPSaveExecSequence(MachineInstr &SaveExecInstr, - MachineInstr &VCmp, MCRegister Exec, - const SIInstrInfo *TII, - const SIRegisterInfo *TRI, - MachineRegisterInfo &MRI) { - const int NewOpcode = AMDGPU::getVCMPXOpFromVCMP(VCmp.getOpcode()); - - if (NewOpcode == -1) - return false; - - MachineOperand *Src0 = TII->getNamedOperand(VCmp, AMDGPU::OpName::src0); - MachineOperand *Src1 = TII->getNamedOperand(VCmp, AMDGPU::OpName::src1); - - Register MoveDest = SaveExecInstr.getOperand(0).getReg(); - - MachineBasicBlock::instr_iterator InsertPosIt = SaveExecInstr.getIterator(); - if (!SaveExecInstr.uses().empty()) { - bool isSGPR32 = TRI->getRegSizeInBits(MoveDest, MRI) == 32; - unsigned MovOpcode = isSGPR32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; - BuildMI(*SaveExecInstr.getParent(), InsertPosIt, - SaveExecInstr.getDebugLoc(), TII->get(MovOpcode), MoveDest) - .addReg(Exec); - } - - // Omit dst as V_CMPX is implicitly writing to EXEC. - // Add dummy src and clamp modifiers, if needed. 
- auto Builder = BuildMI(*VCmp.getParent(), std::next(InsertPosIt), - VCmp.getDebugLoc(), TII->get(NewOpcode)); - - auto TryAddImmediateValueFromNamedOperand = - [&](unsigned OperandName) -> void { - if (auto *Mod = TII->getNamedOperand(VCmp, OperandName)) - Builder.addImm(Mod->getImm()); - }; - - TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::src0_modifiers); - Builder.add(*Src0); - - TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::src1_modifiers); - Builder.add(*Src1); - - TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::clamp); - - // The kill flags may no longer be correct. - if (Src0->isReg()) - MRI.clearKillFlags(Src0->getReg()); - if (Src1->isReg()) - MRI.clearKillFlags(Src1->getReg()); - - return true; -} - -bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) { - if (skipFunction(MF.getFunction())) - return false; - - const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); - const SIRegisterInfo *TRI = ST.getRegisterInfo(); - const SIInstrInfo *TII = ST.getInstrInfo(); - MachineRegisterInfo *MRI = &MF.getRegInfo(); - MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; - - // Optimize sequences emitted for control flow lowering. They are originally - // emitted as the separate operations because spill code may need to be - // inserted for the saved copy of exec. - // - // x = copy exec - // z = s_<op>_b64 x, y - // exec = copy z - // => - // x = s_<op>_saveexec_b64 y - // +// Optimize sequences emitted for control flow lowering. They are originally +// emitted as the separate operations because spill code may need to be +// inserted for the saved copy of exec. +// +// x = copy exec +// z = s_<op>_b64 x, y +// exec = copy z +// => +// x = s_<op>_saveexec_b64 y +// +bool SIOptimizeExecMasking::optimizeExecSequence() const { + MCRegister Exec = TRI->getExec(); bool Changed = false; - for (MachineBasicBlock &MBB : MF) { - MachineBasicBlock::reverse_iterator I = fixTerminators(*TII, MBB); + for (MachineBasicBlock &MBB : *MF) { + MachineBasicBlock::reverse_iterator I = fixTerminators(MBB); MachineBasicBlock::reverse_iterator E = MBB.rend(); if (I == E) continue; @@ -532,7 +411,7 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) { unsigned SearchCount = 0; const unsigned SearchLimit = 5; while (I != E && SearchCount++ < SearchLimit) { - CopyToExec = isCopyToExec(*I, ST); + CopyToExec = isCopyToExec(*I); if (CopyToExec) break; ++I; @@ -542,8 +421,8 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) { continue; // Scan backwards to find the def. 
- auto CopyToExecInst = &*I; - auto CopyFromExecInst = findExecCopy(*TII, ST, MBB, I, CopyToExec); + auto *CopyToExecInst = &*I; + auto CopyFromExecInst = findExecCopy(MBB, I, CopyToExec); if (CopyFromExecInst == E) { auto PrepareExecInst = std::next(I); if (PrepareExecInst == E) @@ -574,8 +453,9 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) { MachineInstr *SaveExecInst = nullptr; SmallVector<MachineInstr *, 4> OtherUseInsts; - for (MachineBasicBlock::iterator J - = std::next(CopyFromExecInst->getIterator()), JE = I->getIterator(); + for (MachineBasicBlock::iterator + J = std::next(CopyFromExecInst->getIterator()), + JE = I->getIterator(); J != JE; ++J) { if (SaveExecInst && J->readsRegister(Exec, TRI)) { LLVM_DEBUG(dbgs() << "exec read prevents saveexec: " << *J << '\n'); @@ -655,58 +535,210 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) { BuildMI(MBB, InsPt, DL, TII->get(getSaveExecOp(SaveExecInst->getOpcode())), CopyFromExec) - .addReg(OtherOp->getReg()); + .addReg(OtherOp->getReg()); SaveExecInst->eraseFromParent(); CopyToExecInst->eraseFromParent(); for (MachineInstr *OtherInst : OtherUseInsts) { - OtherInst->substituteRegister(CopyToExec, Exec, - AMDGPU::NoSubRegister, *TRI); + OtherInst->substituteRegister(CopyToExec, Exec, AMDGPU::NoSubRegister, + *TRI); } Changed = true; } - // After all s_op_saveexec instructions are inserted, - // replace (on GFX10.3 and later) - // v_cmp_* SGPR, IMM, VGPR - // s_and_saveexec_b32 EXEC_SGPR_DEST, SGPR - // with - // s_mov_b32 EXEC_SGPR_DEST, exec_lo - // v_cmpx_* IMM, VGPR - // to reduce pipeline stalls. - if (ST.hasGFX10_3Insts()) { - DenseMap<MachineInstr *, MachineInstr *> SaveExecVCmpMapping; - const unsigned AndSaveExecOpcode = - ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64; - - for (MachineBasicBlock &MBB : MF) { - for (MachineInstr &MI : MBB) { - // Record relevant v_cmp / s_and_saveexec instruction pairs for - // replacement. - if (MI.getOpcode() != AndSaveExecOpcode) - continue; + return Changed; +} - if (MachineInstr *VCmp = - findPossibleVCMPVCMPXOptimization(MI, Exec, TRI, TII, *MRI)) - SaveExecVCmpMapping[&MI] = VCmp; - } +// Tries to find a possibility to optimize a v_cmp ..., s_and_saveexec sequence +// by looking at an instance of a s_and_saveexec instruction. Returns a pointer +// to the v_cmp instruction if it is safe to replace the sequence (see the +// conditions in the function body). This is after register allocation, so some +// checks on operand dependencies need to be considered. +MachineInstr *SIOptimizeExecMasking::findPossibleVCMPVCMPXOptimization( + MachineInstr &SaveExec, MCRegister Exec) const { + + MachineInstr *VCmp = nullptr; + + Register SaveExecDest = SaveExec.getOperand(0).getReg(); + if (!TRI->isSGPRReg(*MRI, SaveExecDest)) + return nullptr; + + MachineOperand *SaveExecSrc0 = + TII->getNamedOperand(SaveExec, AMDGPU::OpName::src0); + if (!SaveExecSrc0->isReg()) + return nullptr; + + // Try to find the last v_cmp instruction that defs the saveexec input + // operand without any write to Exec or the saveexec input operand inbetween. 
+ VCmp = findInstrBackwards( + SaveExec, + [&](MachineInstr *Check) { + return AMDGPU::getVCMPXOpFromVCMP(Check->getOpcode()) != -1 && + Check->modifiesRegister(SaveExecSrc0->getReg(), TRI); + }, + {Exec, SaveExecSrc0->getReg()}); + + if (!VCmp) + return nullptr; + + MachineOperand *VCmpDest = TII->getNamedOperand(*VCmp, AMDGPU::OpName::sdst); + assert(VCmpDest && "Should have an sdst operand!"); + + // Check if any of the v_cmp source operands is written by the saveexec. + MachineOperand *Src0 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src0); + if (Src0->isReg() && TRI->isSGPRReg(*MRI, Src0->getReg()) && + SaveExec.modifiesRegister(Src0->getReg(), TRI)) + return nullptr; + + MachineOperand *Src1 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src1); + if (Src1->isReg() && TRI->isSGPRReg(*MRI, Src1->getReg()) && + SaveExec.modifiesRegister(Src1->getReg(), TRI)) + return nullptr; + + // Don't do the transformation if the destination operand is included in + // its MBB Live-outs, meaning it's used in any of its successors, leading + // to incorrect code if the v_cmp and therefore the def of + // the dest operand is removed. + if (isLiveOut(*VCmp->getParent(), VCmpDest->getReg())) + return nullptr; + + // If the v_cmp target is in use between v_cmp and s_and_saveexec or after the + // s_and_saveexec, skip the optimization. + if (isRegisterInUseBetween(*VCmp, SaveExec, VCmpDest->getReg(), false, + true) || + isRegisterInUseAfter(SaveExec, VCmpDest->getReg())) + return nullptr; + + // Try to determine if there is a write to any of the VCmp + // operands between the saveexec and the vcmp. + // If yes, additional VGPR spilling might need to be inserted. In this case, + // it's not worth replacing the instruction sequence. + SmallVector<MCRegister, 2> NonDefRegs; + if (Src0->isReg()) + NonDefRegs.push_back(Src0->getReg()); + + if (Src1->isReg()) + NonDefRegs.push_back(Src1->getReg()); + + if (!findInstrBackwards( + SaveExec, [&](MachineInstr *Check) { return Check == VCmp; }, + NonDefRegs)) + return nullptr; + + return VCmp; +} + +// Inserts the optimized s_mov_b32 / v_cmpx sequence based on the +// operands extracted from a v_cmp ..., s_and_saveexec pattern. +bool SIOptimizeExecMasking::optimizeSingleVCMPSaveExecSequence( + MachineInstr &SaveExecInstr, MachineInstr &VCmp, MCRegister Exec) const { + const int NewOpcode = AMDGPU::getVCMPXOpFromVCMP(VCmp.getOpcode()); + + if (NewOpcode == -1) + return false; + + MachineOperand *Src0 = TII->getNamedOperand(VCmp, AMDGPU::OpName::src0); + MachineOperand *Src1 = TII->getNamedOperand(VCmp, AMDGPU::OpName::src1); + + Register MoveDest = SaveExecInstr.getOperand(0).getReg(); + + MachineBasicBlock::instr_iterator InsertPosIt = SaveExecInstr.getIterator(); + if (!SaveExecInstr.uses().empty()) { + bool IsSGPR32 = TRI->getRegSizeInBits(MoveDest, *MRI) == 32; + unsigned MovOpcode = IsSGPR32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; + BuildMI(*SaveExecInstr.getParent(), InsertPosIt, + SaveExecInstr.getDebugLoc(), TII->get(MovOpcode), MoveDest) + .addReg(Exec); + } + + // Omit dst as V_CMPX is implicitly writing to EXEC. + // Add dummy src and clamp modifiers, if needed.
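// For illustration only (the compare and registers below are made up, not
// taken from this patch): on a wave32 target the pair
//   v_cmp_lt_u32_e64 s0, 15, v1
//   s_and_saveexec_b32 s2, s0
// becomes
//   s_mov_b32 s2, exec_lo
//   v_cmpx_lt_u32_e64 15, v1
// where the s_mov_b32 is emitted only when the saveexec destination is
// actually used, per the uses() check above.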
+ auto Builder = BuildMI(*VCmp.getParent(), std::next(InsertPosIt), + VCmp.getDebugLoc(), TII->get(NewOpcode)); + + auto TryAddImmediateValueFromNamedOperand = + [&](unsigned OperandName) -> void { + if (auto *Mod = TII->getNamedOperand(VCmp, OperandName)) + Builder.addImm(Mod->getImm()); + }; + + TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::src0_modifiers); + Builder.add(*Src0); + + TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::src1_modifiers); + Builder.add(*Src1); + + TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::clamp); + + // The kill flags may no longer be correct. + if (Src0->isReg()) + MRI->clearKillFlags(Src0->getReg()); + if (Src1->isReg()) + MRI->clearKillFlags(Src1->getReg()); + + return true; +} + +// After all s_op_saveexec instructions are inserted, +// replace (on GFX10.3 and later) +// v_cmp_* SGPR, IMM, VGPR +// s_and_saveexec_b32 EXEC_SGPR_DEST, SGPR +// with +// s_mov_b32 EXEC_SGPR_DEST, exec_lo +// v_cmpx_* IMM, VGPR +// to reduce pipeline stalls. +bool SIOptimizeExecMasking::optimizeVCmpxAndSaveexecSequence() const { + if (!ST->hasGFX10_3Insts()) + return false; + + bool Changed = false; + + DenseMap<MachineInstr *, MachineInstr *> SaveExecVCmpMapping; + MCRegister Exec = TRI->getExec(); + const unsigned AndSaveExecOpcode = + ST->isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64; + + for (MachineBasicBlock &MBB : *MF) { + for (MachineInstr &MI : MBB) { + // Record relevant v_cmp / s_and_saveexec instruction pairs for + // replacement. + if (MI.getOpcode() != AndSaveExecOpcode) + continue; + + if (MachineInstr *VCmp = findPossibleVCMPVCMPXOptimization(MI, Exec)) + SaveExecVCmpMapping[&MI] = VCmp; } + } - for (const auto &Entry : SaveExecVCmpMapping) { - MachineInstr *SaveExecInstr = Entry.getFirst(); - MachineInstr *VCmpInstr = Entry.getSecond(); + for (const auto &Entry : SaveExecVCmpMapping) { + MachineInstr *SaveExecInstr = Entry.getFirst(); + MachineInstr *VCmpInstr = Entry.getSecond(); - if (optimizeVCMPSaveExecSequence(*SaveExecInstr, *VCmpInstr, Exec, TII, - TRI, *MRI)) { - SaveExecInstr->eraseFromParent(); - VCmpInstr->eraseFromParent(); + if (optimizeSingleVCMPSaveExecSequence(*SaveExecInstr, *VCmpInstr, Exec)) { + SaveExecInstr->eraseFromParent(); + VCmpInstr->eraseFromParent(); - Changed = true; - } + Changed = true; } } return Changed; } + +bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) { + if (skipFunction(MF.getFunction())) + return false; + + this->MF = &MF; + ST = &MF.getSubtarget<GCNSubtarget>(); + TRI = ST->getRegisterInfo(); + TII = ST->getInstrInfo(); + MRI = &MF.getRegInfo(); + + bool Changed = optimizeExecSequence(); + Changed |= optimizeVCmpxAndSaveexecSequence(); + + return Changed; +} diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp index e5e65a8dbbf1..57dbad468de8 100644 --- a/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp +++ b/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp @@ -159,6 +159,9 @@ bool SIOptimizeExecMaskingPreRA::optimizeVcndVcmpPair(MachineBasicBlock &MBB) { return false; Register SelReg = Op1->getReg(); + if (SelReg.isPhysical()) + return false; + auto *Sel = TRI->findReachingDef(SelReg, Op1->getSubReg(), *Cmp, *MRI, LIS); if (!Sel || Sel->getOpcode() != AMDGPU::V_CNDMASK_B32_e64) return false; @@ -264,13 +267,11 @@ bool SIOptimizeExecMaskingPreRA::optimizeVcndVcmpPair(MachineBasicBlock &MBB) { // Try to remove v_cndmask_b32. 
if (SelLI) { - bool CanRemoveSel = SelLI->Query(CmpIdx.getRegSlot()).isKill(); - if (!CanRemoveSel) { - // Try to shrink the live interval and check for dead def instead. - LIS->shrinkToUses(SelLI, nullptr); - CanRemoveSel = SelLI->Query(SelIdx.getRegSlot()).isDeadDef(); - } - if (CanRemoveSel) { + // Kill status must be checked before shrinking the live range. + bool IsKill = SelLI->Query(CmpIdx.getRegSlot()).isKill(); + LIS->shrinkToUses(SelLI); + bool IsDead = SelLI->Query(SelIdx.getRegSlot()).isDeadDef(); + if (MRI->use_nodbg_empty(SelReg) && (IsKill || IsDead)) { LLVM_DEBUG(dbgs() << "Erasing: " << *Sel << '\n'); LIS->removeVRegDefAt(*SelLI, SelIdx.getRegSlot()); diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index ad1455ed20fd..b32d5bb04d5b 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -2933,6 +2933,10 @@ MCRegister SIRegisterInfo::getVCC() const { return isWave32 ? AMDGPU::VCC_LO : AMDGPU::VCC; } +MCRegister SIRegisterInfo::getExec() const { + return isWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC; +} + const TargetRegisterClass *SIRegisterInfo::getVGPR64Class() const { // VGPR tuples have an alignment requirement on gfx90a variants. return ST.needsAlignedVGPRs() ? &AMDGPU::VReg_64_Align2RegClass diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h index 9bfbc253410b..6024158be181 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h @@ -344,6 +344,8 @@ public: MCRegister getVCC() const; + MCRegister getExec() const; + const TargetRegisterClass *getRegClass(unsigned RCID) const; // Find reaching register definition diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index e4ab72f1095b..2f334e211181 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -277,6 +277,18 @@ struct VOPC64DPPInfo { uint16_t Opcode; }; +struct VOPDComponentInfo { + uint16_t BaseVOP; + uint16_t VOPDOp; + bool CanBeVOPDX; +}; + +struct VOPDInfo { + uint16_t Opcode; + uint16_t OpX; + uint16_t OpY; +}; + #define GET_MTBUFInfoTable_DECL #define GET_MTBUFInfoTable_IMPL #define GET_MUBUFInfoTable_DECL @@ -293,6 +305,10 @@ struct VOPC64DPPInfo { #define GET_VOPC64DPPTable_IMPL #define GET_VOPC64DPP8Table_DECL #define GET_VOPC64DPP8Table_IMPL +#define GET_VOPDComponentTable_DECL +#define GET_VOPDComponentTable_IMPL +#define GET_VOPDPairs_DECL +#define GET_VOPDPairs_IMPL #define GET_WMMAOpcode2AddrMappingTable_DECL #define GET_WMMAOpcode2AddrMappingTable_IMPL #define GET_WMMAOpcode3AddrMappingTable_DECL @@ -398,6 +414,19 @@ bool getMAIIsGFX940XDL(unsigned Opc) { return Info ? Info->is_gfx940_xdl : false; } +CanBeVOPD getCanBeVOPD(unsigned Opc) { + const VOPDComponentInfo *Info = getVOPDComponentHelper(Opc); + if (Info) + return {Info->CanBeVOPDX, 1}; + else + return {0, 0}; +} + +unsigned getVOPDOpcode(unsigned Opc) { + const VOPDComponentInfo *Info = getVOPDComponentHelper(Opc); + return Info ? Info->VOPDOp : ~0u; +} + unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc) { const WMMAOpcodeMappingInfo *Info = getWMMAMappingInfoFrom2AddrOpcode(Opc); return Info ? 
Info->Opcode3Addr : ~0u; @@ -415,6 +444,11 @@ int getMCOpcode(uint16_t Opcode, unsigned Gen) { return getMCOpcodeGen(Opcode, static_cast<Subtarget>(Gen)); } +int getVOPDFull(unsigned OpX, unsigned OpY) { + const VOPDInfo *Info = getVOPDInfoFromComponentOpcodes(OpX, OpY); + return Info ? Info->Opcode : -1; +} + namespace IsaInfo { AMDGPUTargetID::AMDGPUTargetID(const MCSubtargetInfo &STI) diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index dffeec10a14a..51cf1678207c 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -470,6 +470,14 @@ bool getMAIIsDGEMM(unsigned Opc); LLVM_READONLY bool getMAIIsGFX940XDL(unsigned Opc); +struct CanBeVOPD { + bool X; + bool Y; +}; + +LLVM_READONLY +CanBeVOPD getCanBeVOPD(unsigned Opc); + LLVM_READONLY const GcnBufferFormatInfo *getGcnBufferFormatInfo(uint8_t BitsPerComp, uint8_t NumComponents, @@ -483,6 +491,12 @@ LLVM_READONLY int getMCOpcode(uint16_t Opcode, unsigned Gen); LLVM_READONLY +unsigned getVOPDOpcode(unsigned Opc); + +LLVM_READONLY +int getVOPDFull(unsigned OpX, unsigned OpY); + +LLVM_READONLY unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc); LLVM_READONLY diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td index 1485a1e63129..b24857edb59a 100644 --- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -495,9 +495,9 @@ def VOP2b_I32_I1_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1], 0, /*EnableClamp=* bank_mask:$bank_mask, bound_ctrl:$bound_ctrl); let InsDPP16 = !con(InsDPP, (ins FI:$fi)); let InsDPP8 = (ins DstRCDPP:$old, - Src0DPP:$src0, - Src1DPP:$src1, - dpp8:$dpp8, FI:$fi); + Src0DPP:$src0, + Src1DPP:$src1, + dpp8:$dpp8, FI:$fi); let HasExt = 1; let HasExtDPP = 1; diff --git a/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/llvm/lib/Target/AMDGPU/VOPCInstructions.td index eb6c54a45263..33d3441e94c2 100644 --- a/llvm/lib/Target/AMDGPU/VOPCInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPCInstructions.td @@ -1108,7 +1108,6 @@ class VOPC64_DPP_Base<bits<10> op, string OpName, VOPProfile P> // Inst{87-84} ignored by hw let Inst{91-88} = bank_mask; let Inst{95-92} = row_mask; - } class VOPC64_DPP16<bits<10> op, VOP_DPP_Pseudo ps, string opName = ps.OpName> @@ -1148,7 +1147,6 @@ class VOPC64_DPP8_Base<bits<10> op, string OpName, VOPProfile P> let Inst{40-32} = fi; let Inst{71-64} = !if(P.HasSrc0, src0{7-0}, 0); let Inst{95-72} = dpp8{23-0}; - } class VOPC64_DPP8<bits<10> op, VOP_Pseudo ps, string opName = ps.OpName> diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td index 8cd3d2fe2c47..187485ffa3ae 100644 --- a/llvm/lib/Target/AMDGPU/VOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -1215,7 +1215,9 @@ class VOP3_Profile_Base<VOPProfile P, VOP3Features Features = VOP3_REGULAR> : VO let IsMAI = !if(Features.IsMAI, 1, P.IsMAI); let IsPacked = !if(Features.IsPacked, 1, P.IsPacked); - let HasModifiers = !if(Features.IsMAI, 0, !or(Features.IsPacked, P.HasModifiers)); + let HasModifiers = + !if (Features.IsMAI, 0, + !or(Features.IsPacked, Features.HasOpSel, P.HasModifiers)); } class VOP3_Profile<VOPProfile P, VOP3Features Features = VOP3_REGULAR> : VOP3_Profile_Base<P, Features> { @@ -1414,7 +1416,7 @@ multiclass VOP3_Realtriple_with_name_gfx11<bits<10> op, string opName, VOP3_Real_dpp8_with_name_gfx11<op, opName, asmName>; multiclass VOP3Only_Realtriple_with_name_gfx11<bits<10> op, 
string opName, - string asmName> : + string asmName> : VOP3_Realtriple_with_name_gfx11<op, opName, asmName, 1>; multiclass VOP3be_Realtriple_gfx11<
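To tie the new VOPD query helpers together, here is a hedged sketch of how a pairing client can combine them (variable names are illustrative; the helpers are the ones declared in AMDGPUBaseInfo above):

// Given two candidate machine opcodes, check that they can fill the X and Y
// slots of a VOPD pair, then look up the fused V_DUAL_* opcode.
unsigned OpX = FirstMI.getOpcode();
unsigned OpY = SecondMI.getOpcode();
AMDGPU::CanBeVOPD CanX = AMDGPU::getCanBeVOPD(OpX);
AMDGPU::CanBeVOPD CanY = AMDGPU::getCanBeVOPD(OpY);
if (CanX.X && CanY.Y) {
  // getVOPDOpcode maps each component to its VOPD encoding; getVOPDFull
  // searches the VOPDPairs table and returns -1 if no pairing exists.
  int FusedOpc = AMDGPU::getVOPDFull(AMDGPU::getVOPDOpcode(OpX),
                                     AMDGPU::getVOPDOpcode(OpY));
  if (FusedOpc != -1) {
    // FusedOpc is the V_DUAL_* instruction that encodes both components.
  }
}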