Diffstat (limited to 'llvm/lib/Target/AMDGPU')
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPU.h | 3
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp | 4
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 53
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h | 8
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 2
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp | 47
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUInstructions.td | 76
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp | 4
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h | 4
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp | 11
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 13
-rw-r--r--  llvm/lib/Target/AMDGPU/BUFInstructions.td | 27
-rw-r--r--  llvm/lib/Target/AMDGPU/DSInstructions.td | 77
-rw-r--r--  llvm/lib/Target/AMDGPU/FLATInstructions.td | 27
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp | 175
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp | 127
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp | 212
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNVOPDUtils.h | 32
-rw-r--r--  llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h | 6
-rw-r--r--  llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp | 1
-rw-r--r--  llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h | 1
-rw-r--r--  llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp | 13
-rw-r--r--  llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.cpp | 2
-rw-r--r--  llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.h | 1
-rw-r--r--  llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp | 8
-rw-r--r--  llvm/lib/Target/AMDGPU/R600ISelLowering.cpp | 2
-rw-r--r--  llvm/lib/Target/AMDGPU/R600ISelLowering.h | 4
-rw-r--r--  llvm/lib/Target/AMDGPU/R600MCInstLower.cpp | 4
-rw-r--r--  llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 2
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 19
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.h | 3
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.td | 125
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstructions.td | 37
-rw-r--r--  llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp | 1
-rw-r--r--  llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp | 2
-rw-r--r--  llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp | 6
-rw-r--r--  llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp | 8
-rw-r--r--  llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp | 524
-rw-r--r--  llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp | 15
-rw-r--r--  llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 4
-rw-r--r--  llvm/lib/Target/AMDGPU/SIRegisterInfo.h | 2
-rw-r--r--  llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 34
-rw-r--r--  llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 14
-rw-r--r--  llvm/lib/Target/AMDGPU/VOP2Instructions.td | 6
-rw-r--r--  llvm/lib/Target/AMDGPU/VOPCInstructions.td | 2
-rw-r--r--  llvm/lib/Target/AMDGPU/VOPInstructions.td | 6
46 files changed, 1224 insertions, 530 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index c4680cbedadf..91dc611fb265 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -317,6 +317,9 @@ extern char &SIFormMemoryClausesID;
void initializeSIPostRABundlerPass(PassRegistry&);
extern char &SIPostRABundlerID;
+void initializeGCNCreateVOPDPass(PassRegistry &);
+extern char &GCNCreateVOPDID;
+
void initializeAMDGPUUnifyDivergentExitNodesPass(PassRegistry&);
extern char &AMDGPUUnifyDivergentExitNodesID;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 94d7844e8a32..a8108b1d637b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -626,13 +626,13 @@ bool AMDGPUCodeGenPrepare::foldBinOpIntoSelect(BinaryOperator &BO) const {
Constant *FoldedT = SelOpNo ?
ConstantFoldBinaryOpOperands(BO.getOpcode(), CBO, CT, *DL) :
ConstantFoldBinaryOpOperands(BO.getOpcode(), CT, CBO, *DL);
- if (isa<ConstantExpr>(FoldedT))
+ if (!FoldedT || isa<ConstantExpr>(FoldedT))
return false;
Constant *FoldedF = SelOpNo ?
ConstantFoldBinaryOpOperands(BO.getOpcode(), CBO, CF, *DL) :
ConstantFoldBinaryOpOperands(BO.getOpcode(), CF, CBO, *DL);
- if (isa<ConstantExpr>(FoldedF))
+ if (!FoldedF || isa<ConstantExpr>(FoldedF))
return false;
IRBuilder<> Builder(&BO);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index b00df27f5fd3..589992c7a7ec 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1883,20 +1883,24 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
return true;
}
+// Match an immediate (if Imm is true) or an SGPR (if Imm is false)
+// offset. If Imm32Only is true, match only 32-bit immediate offsets
+// available on CI.
bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
- SDValue &Offset, bool &Imm) const {
+ SDValue &Offset, bool Imm,
+ bool Imm32Only) const {
ConstantSDNode *C = dyn_cast<ConstantSDNode>(ByteOffsetNode);
if (!C) {
+ if (Imm)
+ return false;
if (ByteOffsetNode.getValueType().isScalarInteger() &&
ByteOffsetNode.getValueType().getSizeInBits() == 32) {
Offset = ByteOffsetNode;
- Imm = false;
return true;
}
if (ByteOffsetNode.getOpcode() == ISD::ZERO_EXTEND) {
if (ByteOffsetNode.getOperand(0).getValueType().getSizeInBits() == 32) {
Offset = ByteOffsetNode.getOperand(0);
- Imm = false;
return true;
}
}
@@ -1908,9 +1912,8 @@ bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
int64_t ByteOffset = C->getSExtValue();
Optional<int64_t> EncodedOffset =
AMDGPU::getSMRDEncodedOffset(*Subtarget, ByteOffset, false);
- if (EncodedOffset) {
+ if (EncodedOffset && Imm && !Imm32Only) {
Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32);
- Imm = true;
return true;
}
@@ -1919,7 +1922,7 @@ bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
return false;
EncodedOffset = AMDGPU::getSMRDEncodedLiteralOffset32(*Subtarget, ByteOffset);
- if (EncodedOffset) {
+ if (EncodedOffset && Imm32Only) {
Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32);
return true;
}
@@ -1927,11 +1930,14 @@ bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
if (!isUInt<32>(ByteOffset) && !isInt<32>(ByteOffset))
return false;
- SDValue C32Bit = CurDAG->getTargetConstant(ByteOffset, SL, MVT::i32);
- Offset = SDValue(
- CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, C32Bit), 0);
+ if (!Imm) {
+ SDValue C32Bit = CurDAG->getTargetConstant(ByteOffset, SL, MVT::i32);
+ Offset = SDValue(
+ CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, C32Bit), 0);
+ return true;
+ }
- return true;
+ return false;
}
SDValue AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr) const {
@@ -1959,8 +1965,12 @@ SDValue AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr) const {
Ops), 0);
}
+// Match a base and an immediate (if Imm is true) or an SGPR
+// (if Imm is false) offset. If Imm32Only is true, match only 32-bit
+// immediate offsets available on CI.
bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase,
- SDValue &Offset, bool &Imm) const {
+ SDValue &Offset, bool Imm,
+ bool Imm32Only) const {
SDLoc SL(Addr);
// A 32-bit (address + offset) should not cause unsigned 32-bit integer
@@ -1977,41 +1987,34 @@ bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase,
assert(N0 && N1 && isa<ConstantSDNode>(N1));
}
if (N0 && N1) {
- if (SelectSMRDOffset(N1, Offset, Imm)) {
+ if (SelectSMRDOffset(N1, Offset, Imm, Imm32Only)) {
SBase = Expand32BitAddress(N0);
return true;
}
}
+ return false;
}
+ if (!Imm)
+ return false;
SBase = Expand32BitAddress(Addr);
Offset = CurDAG->getTargetConstant(0, SL, MVT::i32);
- Imm = true;
return true;
}
bool AMDGPUDAGToDAGISel::SelectSMRDImm(SDValue Addr, SDValue &SBase,
SDValue &Offset) const {
- bool Imm = false;
- return SelectSMRD(Addr, SBase, Offset, Imm) && Imm;
+ return SelectSMRD(Addr, SBase, Offset, /* Imm */ true);
}
bool AMDGPUDAGToDAGISel::SelectSMRDImm32(SDValue Addr, SDValue &SBase,
SDValue &Offset) const {
-
assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
-
- bool Imm = false;
- if (!SelectSMRD(Addr, SBase, Offset, Imm))
- return false;
-
- return !Imm && isa<ConstantSDNode>(Offset);
+ return SelectSMRD(Addr, SBase, Offset, /* Imm */ true, /* Imm32Only */ true);
}
bool AMDGPUDAGToDAGISel::SelectSMRDSgpr(SDValue Addr, SDValue &SBase,
SDValue &Offset) const {
- bool Imm = false;
- return SelectSMRD(Addr, SBase, Offset, Imm) && !Imm &&
- !isa<ConstantSDNode>(Offset);
+ return SelectSMRD(Addr, SBase, Offset, /* Imm */ false);
}
bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm(SDValue Addr,
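
Note: as a quick reference for the new flag scheme, here is an illustrative standalone sketch (not code from this patch) of the offset classification SelectSMRDOffset/SelectSMRD now implement. The encodable-range checks are simplified placeholders; the real code queries AMDGPU::getSMRDEncodedOffset and getSMRDEncodedLiteralOffset32 for the subtarget.

#include <cstdint>
#include <optional>

enum class SMRDOffsetKind { EncodedImm, Imm32Literal, SgprOrMov, NoMatch };

// ConstOff holds the byte offset when it is a compile-time constant and is
// empty when the offset lives in a register.
SMRDOffsetKind classifySMRDOffset(std::optional<int64_t> ConstOff, bool Imm,
                                  bool Imm32Only) {
  if (!ConstOff)                                  // register offset
    return Imm ? SMRDOffsetKind::NoMatch : SMRDOffsetKind::SgprOrMov;
  if (Imm && !Imm32Only && *ConstOff >= 0 && *ConstOff < (1 << 20))
    return SMRDOffsetKind::EncodedImm;            // fits the encoded field
  if (Imm && Imm32Only && *ConstOff >= 0 && *ConstOff <= UINT32_MAX)
    return SMRDOffsetKind::Imm32Literal;          // CI 32-bit literal offset
  if (!Imm && *ConstOff >= INT32_MIN && *ConstOff <= UINT32_MAX)
    return SMRDOffsetKind::SgprOrMov;             // materialized via S_MOV_B32
  return SMRDOffsetKind::NoMatch;
}
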
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index 862be9dc5568..7894b8eb5b67 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -193,11 +193,11 @@ private:
bool SelectScratchSVAddr(SDNode *N, SDValue Addr, SDValue &VAddr,
SDValue &SAddr, SDValue &Offset) const;
- bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue &Offset,
- bool &Imm) const;
+ bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue &Offset, bool Imm,
+ bool Imm32Only) const;
SDValue Expand32BitAddress(SDValue Addr) const;
- bool SelectSMRD(SDValue Addr, SDValue &SBase, SDValue &Offset,
- bool &Imm) const;
+ bool SelectSMRD(SDValue Addr, SDValue &SBase, SDValue &Offset, bool Imm,
+ bool Imm32Only = false) const;
bool SelectSMRDImm(SDValue Addr, SDValue &SBase, SDValue &Offset) const;
bool SelectSMRDImm32(SDValue Addr, SDValue &SBase, SDValue &Offset) const;
bool SelectSMRDSgpr(SDValue Addr, SDValue &SBase, SDValue &Offset) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index ef7929012597..bf520a560404 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -4803,6 +4803,8 @@ AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
case AtomicRMWInst::Nand:
case AtomicRMWInst::FAdd:
case AtomicRMWInst::FSub:
+ case AtomicRMWInst::FMax:
+ case AtomicRMWInst::FMin:
return AtomicExpansionKind::CmpXChg;
default:
return AtomicExpansionKind::None;
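
Note: for context, AtomicExpansionKind::CmpXChg means the newly covered FMax/FMin cases are expanded into a compare-exchange loop, the same strategy already used for FAdd/FSub here. Below is an illustrative standalone sketch of that loop (not code from this patch), written with std::atomic and with NaN handling simplified relative to the IR fmax semantics.

#include <algorithm>
#include <atomic>

// Returns the previous value, as atomicrmw does.
float atomicFMax(std::atomic<float> &Addr, float Val) {
  float Old = Addr.load();
  // Retry until no other thread changed *Addr between the read and the CAS;
  // on failure, Old is refreshed with the current value.
  while (!Addr.compare_exchange_weak(Old, std::max(Old, Val))) {
  }
  return Old;
}
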
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 3f242fdb6d8e..70fae9d784a2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -1180,7 +1180,7 @@ bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
getIConstantVRegValWithLookThrough(I.getOperand(2).getReg(), *MRI);
if (Arg) {
- const int64_t Value = Arg.getValue().Value.getSExtValue();
+ const int64_t Value = Arg.value().Value.getSExtValue();
if (Value == 0) {
unsigned Opcode = Is64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0);
@@ -3235,7 +3235,7 @@ static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg) {
// Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0)
const MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
- return false;
+ return Register();
if (mi_match(Def->getOperand(2).getReg(), MRI, m_ZeroInt())) {
return Def->getOperand(1).getReg();
@@ -3851,27 +3851,36 @@ AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
getAddrModeInfo(*MI, *MRI, AddrInfo);
// FIXME: We should shrink the GEP if the offset is known to be <= 32-bits,
- // then we can select all ptr + 32-bit offsets not just immediate offsets.
- if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
+ // then we can select all ptr + 32-bit offsets.
+ if (AddrInfo.empty())
return None;
const GEPInfo &GEPInfo = AddrInfo[0];
+ Register PtrReg = GEPInfo.SgprParts[0];
+
// SGPR offset is unsigned.
- if (!GEPInfo.Imm || GEPInfo.Imm < 0 || !isUInt<32>(GEPInfo.Imm))
- return None;
+ if (AddrInfo[0].SgprParts.size() == 1 && isUInt<32>(GEPInfo.Imm) &&
+ GEPInfo.Imm != 0) {
+ // If we make it this far we have a load with an 32-bit immediate offset.
+ // It is OK to select this using a sgpr offset, because we have already
+ // failed trying to select this load into one of the _IMM variants since
+ // the _IMM Patterns are considered before the _SGPR patterns.
+ Register OffsetReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), OffsetReg)
+ .addImm(GEPInfo.Imm);
+ return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(OffsetReg); }}};
+ }
- // If we make it this far we have a load with an 32-bit immediate offset.
- // It is OK to select this using a sgpr offset, because we have already
- // failed trying to select this load into one of the _IMM variants since
- // the _IMM Patterns are considered before the _SGPR patterns.
- Register PtrReg = GEPInfo.SgprParts[0];
- Register OffsetReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
- BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), OffsetReg)
- .addImm(GEPInfo.Imm);
- return {{
- [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
- [=](MachineInstrBuilder &MIB) { MIB.addReg(OffsetReg); }
- }};
+ if (AddrInfo[0].SgprParts.size() == 2 && GEPInfo.Imm == 0) {
+ if (Register OffsetReg =
+ matchZeroExtendFromS32(*MRI, GEPInfo.SgprParts[1])) {
+ return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(OffsetReg); }}};
+ }
+ }
+
+ return None;
}
std::pair<Register, int>
@@ -4231,7 +4240,7 @@ AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
},
[=](MachineInstrBuilder &MIB) { // vaddr
if (FI)
- MIB.addFrameIndex(FI.getValue());
+ MIB.addFrameIndex(FI.value());
else
MIB.addReg(VAddr);
},
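
Note: to summarize the new selectSmrdSgpr behaviour, here is an illustrative sketch (not code from this patch) over a toy address record standing in for GEPInfo. The _SGPR form is now used either for a non-zero 32-bit immediate moved into an SGPR, or for a base plus a zero-extended 32-bit SGPR offset with no residual constant.

#include <cstdint>

struct ToyAddr {            // hypothetical stand-in for GEPInfo
  int NumSgprParts;         // 1 = base only, 2 = base + SGPR offset
  int64_t Imm;              // residual constant offset
  bool OffsetIsZext32;      // second part proven to be a 32-bit zero-extend
};

bool canUseSgprOffsetForm(const ToyAddr &A) {
  if (A.NumSgprParts == 1 && A.Imm != 0 && A.Imm >= 0 && A.Imm <= UINT32_MAX)
    return true;            // immediate materialized with S_MOV_B32
  if (A.NumSgprParts == 2 && A.Imm == 0 && A.OffsetIsZext32)
    return true;            // zero-extended 32-bit SGPR used directly
  return false;
}
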
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
index 31012915457b..26e6b9a10688 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -542,63 +542,37 @@ def atomic_store_64_#as : PatFrag<(ops node:$ptr, node:$val),
}
} // End foreach as
-// TODO: Add GISelPredicateCode for the ret and noret PatFrags once
-// GlobalISelEmitter allows pattern matches where src and dst def count
-// mismatch.
-
-multiclass ret_noret_op {
- let PredicateCode = [{ return !(SDValue(N, 0).use_empty()); }],
- GISelPredicateCode = [{ return true; }] in {
- def "_ret" : PatFrag<(ops node:$ptr, node:$data),
- (!cast<SDPatternOperator>(NAME) node:$ptr, node:$data)>;
- }
-
- let PredicateCode = [{ return (SDValue(N, 0).use_empty()); }],
- GISelPredicateCode = [{ return false; }] in {
- def "_noret" : PatFrag<(ops node:$ptr, node:$data),
- (!cast<SDPatternOperator>(NAME) node:$ptr, node:$data)>;
- }
+multiclass noret_op {
+ let HasNoUse = true in
+ def "_noret" : PatFrag<(ops node:$ptr, node:$data),
+ (!cast<SDPatternOperator>(NAME) node:$ptr, node:$data)>;
}
-defm int_amdgcn_flat_atomic_fadd : ret_noret_op;
-defm int_amdgcn_flat_atomic_fadd_v2bf16 : ret_noret_op;
-defm int_amdgcn_flat_atomic_fmin : ret_noret_op;
-defm int_amdgcn_flat_atomic_fmax : ret_noret_op;
-defm int_amdgcn_global_atomic_fadd : ret_noret_op;
-defm int_amdgcn_global_atomic_fadd_v2bf16 : ret_noret_op;
-defm int_amdgcn_global_atomic_fmin : ret_noret_op;
-defm int_amdgcn_global_atomic_fmax : ret_noret_op;
-defm int_amdgcn_ds_fadd_v2bf16 : ret_noret_op;
-
-multiclass ret_noret_binary_atomic_op<SDNode atomic_op, bit IsInt = 1> {
- let PredicateCode = [{ return (SDValue(N, 0).use_empty()); }],
- GISelPredicateCode = [{ return false; }] in {
- defm "_noret" : binary_atomic_op<atomic_op, IsInt>;
- }
+defm int_amdgcn_flat_atomic_fadd : noret_op;
+defm int_amdgcn_flat_atomic_fadd_v2bf16 : noret_op;
+defm int_amdgcn_flat_atomic_fmin : noret_op;
+defm int_amdgcn_flat_atomic_fmax : noret_op;
+defm int_amdgcn_global_atomic_fadd : noret_op;
+defm int_amdgcn_global_atomic_fadd_v2bf16 : noret_op;
+defm int_amdgcn_global_atomic_fmin : noret_op;
+defm int_amdgcn_global_atomic_fmax : noret_op;
+defm int_amdgcn_ds_fadd_v2bf16 : noret_op;
- let PredicateCode = [{ return !(SDValue(N, 0).use_empty()); }],
- GISelPredicateCode = [{ return true; }] in {
- defm "_ret" : binary_atomic_op<atomic_op, IsInt>;
- }
+multiclass noret_binary_atomic_op<SDNode atomic_op, bit IsInt = 1> {
+ let HasNoUse = true in
+ defm "_noret" : binary_atomic_op<atomic_op, IsInt>;
}
-multiclass ret_noret_ternary_atomic_op<SDNode atomic_op> {
- let PredicateCode = [{ return (SDValue(N, 0).use_empty()); }],
- GISelPredicateCode = [{ return false; }] in {
- defm "_noret" : ternary_atomic_op<atomic_op>;
- }
-
- let PredicateCode = [{ return !(SDValue(N, 0).use_empty()); }],
- GISelPredicateCode = [{ return true; }] in {
- defm "_ret" : ternary_atomic_op<atomic_op>;
- }
+multiclass noret_ternary_atomic_op<SDNode atomic_op> {
+ let HasNoUse = true in
+ defm "_noret" : ternary_atomic_op<atomic_op>;
}
multiclass binary_atomic_op_all_as<SDNode atomic_op, bit IsInt = 1> {
foreach as = [ "global", "flat", "constant", "local", "private", "region" ] in {
let AddressSpaces = !cast<AddressSpaceList>("LoadAddress_"#as).AddrSpaces in {
defm "_"#as : binary_atomic_op<atomic_op, IsInt>;
- defm "_"#as : ret_noret_binary_atomic_op<atomic_op, IsInt>;
+ defm "_"#as : noret_binary_atomic_op<atomic_op, IsInt>;
}
}
}
@@ -640,13 +614,15 @@ def store_align16_local: PatFrag<(ops node:$val, node:$ptr),
let AddressSpaces = StoreAddress_local.AddrSpaces in {
defm atomic_cmp_swap_local : ternary_atomic_op<atomic_cmp_swap>;
-defm atomic_cmp_swap_local : ret_noret_ternary_atomic_op<atomic_cmp_swap>;
-defm atomic_cmp_swap_local_m0 : ret_noret_ternary_atomic_op<atomic_cmp_swap_glue>;
+defm atomic_cmp_swap_local : noret_ternary_atomic_op<atomic_cmp_swap>;
+defm atomic_cmp_swap_local_m0 : noret_ternary_atomic_op<atomic_cmp_swap_glue>;
+defm atomic_cmp_swap_local_m0 : ternary_atomic_op<atomic_cmp_swap_glue>;
}
let AddressSpaces = StoreAddress_region.AddrSpaces in {
-defm atomic_cmp_swap_region : ret_noret_ternary_atomic_op<atomic_cmp_swap>;
-defm atomic_cmp_swap_region_m0 : ret_noret_ternary_atomic_op<atomic_cmp_swap_glue>;
+defm atomic_cmp_swap_region : noret_ternary_atomic_op<atomic_cmp_swap>;
+defm atomic_cmp_swap_region_m0 : noret_ternary_atomic_op<atomic_cmp_swap_glue>;
+defm atomic_cmp_swap_region_m0 : ternary_atomic_op<atomic_cmp_swap_glue>;
}
//===----------------------------------------------------------------------===//
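
Note: for readers unfamiliar with the new PatFrag field, HasNoUse = true asks TableGen to emit, for both SelectionDAG and GlobalISel, a "result is dead" predicate equivalent to the hand-written checks removed above. An illustrative sketch of the condition over a toy node type (not the generated code itself):

struct ToyAtomicNode {
  int NumValueUses;  // uses of the loaded/returned data result
};

// A _noret fragment may only match when the data result is dead; the plain
// fragment stays unconditional and loses to _noret via higher AddedComplexity.
bool matchesNoRetFragment(const ToyAtomicNode &N) {
  return N.NumValueUses == 0;  // old SDAG predicate: SDValue(N, 0).use_empty()
}
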
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
index ed6ddbf426fd..38e04dedd9fc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
@@ -171,6 +171,10 @@ const MCExpr *AMDGPUAsmPrinter::lowerConstant(const Constant *CV) {
}
void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) {
+ // FIXME: Enable feature predicate checks once all the test pass.
+ // AMDGPU_MC::verifyInstructionPredicates(MI->getOpcode(),
+ // getSubtargetInfo().getFeatureBits());
+
if (emitPseudoExpansionLowering(*OutStreamer, MI))
return;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h
index 1b513c456307..745734aac2b4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h
@@ -131,8 +131,8 @@ public:
bool IsAOneAddressSpace = isOneAddressSpace(A);
bool IsBOneAddressSpace = isOneAddressSpace(B);
- return AIO.getValue() >= BIO.getValue() &&
- (IsAOneAddressSpace == IsBOneAddressSpace || !IsAOneAddressSpace);
+ return AIO.value() >= BIO.value() &&
+ (IsAOneAddressSpace == IsBOneAddressSpace || !IsAOneAddressSpace);
}
};
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 77816a783630..6bd906439ee8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -40,9 +40,9 @@ using namespace llvm;
#include "AMDGPUGenSubtargetInfo.inc"
#undef AMDGPUSubtarget
-static cl::opt<bool> DisablePowerSched(
- "amdgpu-disable-power-sched",
- cl::desc("Disable scheduling to minimize mAI power bursts"),
+static cl::opt<bool> EnablePowerSched(
+ "amdgpu-enable-power-sched",
+ cl::desc("Enable scheduling to minimize mAI power bursts"),
cl::init(false));
static cl::opt<bool> EnableVGPRIndexMode(
@@ -916,7 +916,7 @@ struct FillMFMAShadowMutation : ScheduleDAGMutation {
void apply(ScheduleDAGInstrs *DAGInstrs) override {
const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
- if (!ST.hasMAIInsts() || DisablePowerSched)
+ if (!ST.hasMAIInsts())
return;
DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
@@ -966,7 +966,8 @@ void GCNSubtarget::getPostRAMutations(
std::unique_ptr<ScheduleDAGMutation>
GCNSubtarget::createFillMFMAShadowMutation(const TargetInstrInfo *TII) const {
- return std::make_unique<FillMFMAShadowMutation>(&InstrInfo);
+ return EnablePowerSched ? std::make_unique<FillMFMAShadowMutation>(&InstrInfo)
+ : nullptr;
}
const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 1c6b9d35695a..971e44723758 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -22,11 +22,13 @@
#include "AMDGPUTargetTransformInfo.h"
#include "GCNIterativeScheduler.h"
#include "GCNSchedStrategy.h"
+#include "GCNVOPDUtils.h"
#include "R600.h"
#include "R600TargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "SIMachineScheduler.h"
#include "TargetInfo/AMDGPUTargetInfo.h"
+#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/Analysis/CGSCCPassManager.h"
#include "llvm/CodeGen/GlobalISel/CSEInfo.h"
#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
@@ -278,6 +280,12 @@ static cl::opt<bool>
cl::desc("Enable s_delay_alu insertion"),
cl::init(true), cl::Hidden);
+// Enable GFX11+ VOPD
+static cl::opt<bool>
+ EnableVOPD("amdgpu-enable-vopd",
+ cl::desc("Enable VOPD, dual issue of VALU in wave32"),
+ cl::init(true), cl::Hidden);
+
// Option is used in lit tests to prevent deadcoding of patterns inspected.
static cl::opt<bool>
EnableDCEInRA("amdgpu-dce-in-ra",
@@ -383,6 +391,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeSIPreAllocateWWMRegsPass(*PR);
initializeSIFormMemoryClausesPass(*PR);
initializeSIPostRABundlerPass(*PR);
+ initializeGCNCreateVOPDPass(*PR);
initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
initializeAMDGPUAAWrapperPassPass(*PR);
initializeAMDGPUExternalAAWrapperPass(*PR);
@@ -920,6 +929,8 @@ public:
DAG->addMutation(ST.createFillMFMAShadowMutation(DAG->TII));
DAG->addMutation(createIGroupLPDAGMutation());
DAG->addMutation(createSchedBarrierDAGMutation());
+ if (isPassEnabled(EnableVOPD, CodeGenOpt::Less))
+ DAG->addMutation(createVOPDPairingMutation());
return DAG;
}
@@ -1399,6 +1410,8 @@ void GCNPassConfig::addPreSched2() {
}
void GCNPassConfig::addPreEmitPass() {
+ if (isPassEnabled(EnableVOPD, CodeGenOpt::Less))
+ addPass(&GCNCreateVOPDID);
addPass(createSIMemoryLegalizerPass());
addPass(createSIInsertWaitcntsPass());
diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index a087323e5de7..04dd3e938a15 100644
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -1412,10 +1412,12 @@ defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_short, i32, "BUFFER_STORE_SHORT">;
multiclass BufferAtomicPat<string OpPrefix, ValueType vt, string Inst, bit isIntr = 0> {
foreach RtnMode = ["ret", "noret"] in {
- defvar Op = !cast<SDPatternOperator>(OpPrefix # "_" # RtnMode
+ defvar Op = !cast<SDPatternOperator>(OpPrefix
+ # !if(!eq(RtnMode, "ret"), "", "_noret")
# !if(isIntr, "", "_" # vt.Size));
defvar InstSuffix = !if(!eq(RtnMode, "ret"), "_RTN", "");
+ let AddedComplexity = !if(!eq(RtnMode, "ret"), 0, 1) in {
def : GCNPat<
(vt (Op (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset), vt:$vdata_in)),
(!cast<MUBUF_Pseudo>(Inst # "_OFFSET" # InstSuffix) getVregSrcForVT<vt>.ret:$vdata_in,
@@ -1428,6 +1430,7 @@ multiclass BufferAtomicPat<string OpPrefix, ValueType vt, string Inst, bit isInt
(!cast<MUBUF_Pseudo>(Inst # "_ADDR64" # InstSuffix) getVregSrcForVT<vt>.ret:$vdata_in,
VReg_64:$vaddr, SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset)
>;
+ } // end let AddedComplexity
} // end foreach RtnMode
}
@@ -1439,10 +1442,12 @@ multiclass BufferAtomicIntrPat<string OpPrefix, ValueType vt, string Inst> {
multiclass BufferAtomicCmpSwapPat<ValueType vt, ValueType data_vt, string Inst> {
foreach RtnMode = ["ret", "noret"] in {
- defvar Op = !cast<SDPatternOperator>("AMDGPUatomic_cmp_swap_global_" # RtnMode
+ defvar Op = !cast<SDPatternOperator>("AMDGPUatomic_cmp_swap_global"
+ # !if(!eq(RtnMode, "ret"), "", "_noret")
# "_" # vt.Size);
defvar InstSuffix = !if(!eq(RtnMode, "ret"), "_RTN", "");
+ let AddedComplexity = !if(!eq(RtnMode, "ret"), 0, 1) in {
defvar OffsetResDag = (!cast<MUBUF_Pseudo>(Inst # "_OFFSET" # InstSuffix)
getVregSrcForVT<data_vt>.ret:$vdata_in, SReg_128:$srsrc, SCSrc_b32:$soffset,
offset:$offset);
@@ -1465,6 +1470,7 @@ multiclass BufferAtomicCmpSwapPat<ValueType vt, ValueType data_vt, string Inst>
!if(!eq(vt, i32), sub0, sub0_sub1)),
Addr64ResDag)
>;
+ } // end let AddedComplexity
} // end foreach RtnMode
}
@@ -1495,13 +1501,14 @@ multiclass SIBufferAtomicPat<string OpPrefix, ValueType vt, string Inst,
list<string> RtnModes = ["ret", "noret"]> {
foreach RtnMode = RtnModes in {
- defvar Op = !cast<SDPatternOperator>(!if(!eq(RtnMode, "none"),
- OpPrefix, OpPrefix # "_" # RtnMode));
- defvar InstSuffix = !if(!or(!eq(RtnMode, "none"), !eq(RtnMode, "ret")),
- "_RTN", "");
- defvar CachePolicy = !if(!or(!eq(RtnMode, "none"), !eq(RtnMode, "ret")),
+ defvar Op = !cast<SDPatternOperator>(OpPrefix
+ # !if(!eq(RtnMode, "ret"), "", "_noret"));
+
+ defvar InstSuffix = !if(!eq(RtnMode, "ret"), "_RTN", "");
+ defvar CachePolicy = !if(!eq(RtnMode, "ret"),
(set_glc $cachepolicy), (timm:$cachepolicy));
+ let AddedComplexity = !if(!eq(RtnMode, "ret"), 0, 1) in {
def : GCNPat<
(vt (Op vt:$vdata_in, v4i32:$rsrc, 0, 0, i32:$soffset,
timm:$offset, timm:$cachepolicy, 0)),
@@ -1534,6 +1541,7 @@ multiclass SIBufferAtomicPat<string OpPrefix, ValueType vt, string Inst,
(REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1),
SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), CachePolicy)
>;
+ } // end let AddedComplexity
} // end foreach RtnMode
}
@@ -1551,7 +1559,7 @@ defm : SIBufferAtomicPat<"SIbuffer_atomic_or", i32, "BUFFER_ATOMIC_OR">;
defm : SIBufferAtomicPat<"SIbuffer_atomic_xor", i32, "BUFFER_ATOMIC_XOR">;
defm : SIBufferAtomicPat<"SIbuffer_atomic_inc", i32, "BUFFER_ATOMIC_INC">;
defm : SIBufferAtomicPat<"SIbuffer_atomic_dec", i32, "BUFFER_ATOMIC_DEC">;
-defm : SIBufferAtomicPat<"SIbuffer_atomic_csub", i32, "BUFFER_ATOMIC_CSUB", ["none"]>;
+defm : SIBufferAtomicPat<"SIbuffer_atomic_csub", i32, "BUFFER_ATOMIC_CSUB", ["ret"]>;
defm : SIBufferAtomicPat<"SIbuffer_atomic_swap", i64, "BUFFER_ATOMIC_SWAP_X2">;
defm : SIBufferAtomicPat<"SIbuffer_atomic_add", i64, "BUFFER_ATOMIC_ADD_X2">;
defm : SIBufferAtomicPat<"SIbuffer_atomic_sub", i64, "BUFFER_ATOMIC_SUB_X2">;
@@ -1643,7 +1651,8 @@ let SubtargetPredicate = isGFX90APlus in {
foreach RtnMode = ["ret", "noret"] in {
-defvar Op = !cast<SDPatternOperator>(SIbuffer_atomic_cmpswap # "_" # RtnMode);
+defvar Op = !cast<SDPatternOperator>(SIbuffer_atomic_cmpswap
+ # !if(!eq(RtnMode, "ret"), "", "_noret"));
defvar InstSuffix = !if(!eq(RtnMode, "ret"), "_RTN", "");
defvar CachePolicy = !if(!eq(RtnMode, "ret"), (set_glc $cachepolicy),
(timm:$cachepolicy));
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
index 27b723875aa4..d8387bf6f1ae 100644
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -950,10 +950,11 @@ defm : DSWritePat_mc <DS_WRITE_B128, vt, "store_align_less_than_4_local">;
} // End AddedComplexity = 100
-class DSAtomicRetPat<DS_Pseudo inst, ValueType vt, PatFrag frag, bit gds=0> : GCNPat <
- (frag (DS1Addr1Offset i32:$ptr, i16:$offset), vt:$value),
- (inst $ptr, getVregSrcForVT<vt>.ret:$value, offset:$offset, (i1 gds))
->;
+class DSAtomicRetPat<DS_Pseudo inst, ValueType vt, PatFrag frag, int complexity = 0,
+ bit gds=0> : GCNPat <(frag (DS1Addr1Offset i32:$ptr, i16:$offset), vt:$value),
+ (inst $ptr, getVregSrcForVT<vt>.ret:$value, offset:$offset, (i1 gds))> {
+ let AddedComplexity = complexity;
+}
multiclass DSAtomicRetPat_mc<DS_Pseudo inst, ValueType vt, string frag> {
let OtherPredicates = [LDSRequiresM0Init] in {
@@ -965,75 +966,88 @@ multiclass DSAtomicRetPat_mc<DS_Pseudo inst, ValueType vt, string frag> {
!cast<PatFrag>(frag#"_local_"#vt.Size)>;
}
- def : DSAtomicRetPat<inst, vt, !cast<PatFrag>(frag#"_region_m0_"#vt.Size), 1>;
+ def : DSAtomicRetPat<inst, vt, !cast<PatFrag>(frag#"_region_m0_"#vt.Size),
+ /* complexity */ 0, /* gds */ 1>;
}
multiclass DSAtomicRetNoRetPat_mc<DS_Pseudo inst, DS_Pseudo noRetInst,
ValueType vt, string frag> {
let OtherPredicates = [LDSRequiresM0Init] in {
def : DSAtomicRetPat<inst, vt,
- !cast<PatFrag>(frag#"_local_m0_ret_"#vt.Size)>;
+ !cast<PatFrag>(frag#"_local_m0_"#vt.Size)>;
def : DSAtomicRetPat<noRetInst, vt,
- !cast<PatFrag>(frag#"_local_m0_noret_"#vt.Size)>;
+ !cast<PatFrag>(frag#"_local_m0_noret_"#vt.Size), /* complexity */ 1>;
}
let OtherPredicates = [NotLDSRequiresM0Init] in {
def : DSAtomicRetPat<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt,
- !cast<PatFrag>(frag#"_local_ret_"#vt.Size)>;
+ !cast<PatFrag>(frag#"_local_"#vt.Size)>;
def : DSAtomicRetPat<!cast<DS_Pseudo>(!cast<string>(noRetInst)#"_gfx9"), vt,
- !cast<PatFrag>(frag#"_local_noret_"#vt.Size)>;
+ !cast<PatFrag>(frag#"_local_noret_"#vt.Size), /* complexity */ 1>;
}
def : DSAtomicRetPat<inst, vt,
- !cast<PatFrag>(frag#"_region_m0_ret_"#vt.Size), 1>;
+ !cast<PatFrag>(frag#"_region_m0_"#vt.Size),
+ /* complexity */ 0, /* gds */ 1>;
def : DSAtomicRetPat<noRetInst, vt,
- !cast<PatFrag>(frag#"_region_m0_noret_"#vt.Size), 1>;
+ !cast<PatFrag>(frag#"_region_m0_noret_"#vt.Size),
+ /* complexity */ 1, /* gds */ 1>;
}
let SubtargetPredicate = isGFX6GFX7GFX8GFX9GFX10 in {
// Caution, the order of src and cmp is the *opposite* of the BUFFER_ATOMIC_CMPSWAP opcode.
-class DSAtomicCmpXChgSwapped<DS_Pseudo inst, ValueType vt, PatFrag frag, bit gds=0> : GCNPat <
+class DSAtomicCmpXChgSwapped<DS_Pseudo inst, ValueType vt, PatFrag frag,
+ int complexity = 0, bit gds=0> : GCNPat<
(frag (DS1Addr1Offset i32:$ptr, i16:$offset), vt:$cmp, vt:$swap),
- (inst $ptr, getVregSrcForVT<vt>.ret:$cmp, getVregSrcForVT<vt>.ret:$swap, offset:$offset, (i1 gds))
->;
+ (inst $ptr, getVregSrcForVT<vt>.ret:$cmp, getVregSrcForVT<vt>.ret:$swap, offset:$offset, (i1 gds))> {
+ let AddedComplexity = complexity;
+}
multiclass DSAtomicCmpXChgSwapped_mc<DS_Pseudo inst, DS_Pseudo noRetInst, ValueType vt,
string frag> {
let OtherPredicates = [LDSRequiresM0Init] in {
- def : DSAtomicCmpXChgSwapped<inst, vt, !cast<PatFrag>(frag#"_local_m0_ret_"#vt.Size)>;
- def : DSAtomicCmpXChgSwapped<noRetInst, vt, !cast<PatFrag>(frag#"_local_m0_noret_"#vt.Size)>;
+ def : DSAtomicCmpXChgSwapped<inst, vt, !cast<PatFrag>(frag#"_local_m0_"#vt.Size)>;
+ def : DSAtomicCmpXChgSwapped<noRetInst, vt, !cast<PatFrag>(frag#"_local_m0_noret_"#vt.Size),
+ /* complexity */ 1>;
}
let OtherPredicates = [NotLDSRequiresM0Init] in {
def : DSAtomicCmpXChgSwapped<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt,
- !cast<PatFrag>(frag#"_local_ret_"#vt.Size)>;
+ !cast<PatFrag>(frag#"_local_"#vt.Size)>;
def : DSAtomicCmpXChgSwapped<!cast<DS_Pseudo>(!cast<string>(noRetInst)#"_gfx9"), vt,
- !cast<PatFrag>(frag#"_local_noret_"#vt.Size)>;
+ !cast<PatFrag>(frag#"_local_noret_"#vt.Size),
+ /* complexity */ 1>;
}
- def : DSAtomicCmpXChgSwapped<inst, vt, !cast<PatFrag>(frag#"_region_m0_ret_"#vt.Size), 1>;
- def : DSAtomicCmpXChgSwapped<noRetInst, vt, !cast<PatFrag>(frag#"_region_m0_noret_"#vt.Size), 1>;
+ def : DSAtomicCmpXChgSwapped<inst, vt, !cast<PatFrag>(frag#"_region_m0_"#vt.Size),
+ /* complexity */ 0, /* gds */ 1>;
+ def : DSAtomicCmpXChgSwapped<noRetInst, vt, !cast<PatFrag>(frag#"_region_m0_noret_"#vt.Size),
+ /* complexity */ 1, /* gds */ 1>;
}
} // End SubtargetPredicate = isGFX6GFX7GFX8GFX9GFX10
let SubtargetPredicate = isGFX11Plus in {
// The order of src and cmp agrees with the BUFFER_ATOMIC_CMPSWAP opcode.
-class DSAtomicCmpXChg<DS_Pseudo inst, ValueType vt, PatFrag frag, bit gds=0> : GCNPat <
+class DSAtomicCmpXChg<DS_Pseudo inst, ValueType vt, PatFrag frag,
+ int complexity = 0, bit gds=0> : GCNPat<
(frag (DS1Addr1Offset i32:$ptr, i16:$offset), vt:$cmp, vt:$swap),
- (inst $ptr, getVregSrcForVT<vt>.ret:$swap, getVregSrcForVT<vt>.ret:$cmp, offset:$offset, (i1 gds))
->;
+ (inst $ptr, getVregSrcForVT<vt>.ret:$swap, getVregSrcForVT<vt>.ret:$cmp, offset:$offset, (i1 gds))> {
+ let AddedComplexity = complexity;
+}
multiclass DSAtomicCmpXChg_mc<DS_Pseudo inst, DS_Pseudo noRetInst, ValueType vt, string frag> {
def : DSAtomicCmpXChg<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt,
- !cast<PatFrag>(frag#"_local_ret_"#vt.Size)>;
+ !cast<PatFrag>(frag#"_local_"#vt.Size)>;
def : DSAtomicCmpXChg<!cast<DS_Pseudo>(!cast<string>(noRetInst)#"_gfx9"), vt,
- !cast<PatFrag>(frag#"_local_noret_"#vt.Size)>;
+ !cast<PatFrag>(frag#"_local_noret_"#vt.Size), /* complexity */ 1>;
- def : DSAtomicCmpXChg<inst, vt, !cast<PatFrag>(frag#"_region_m0_ret_"#vt.Size), 1>;
- def : DSAtomicCmpXChg<noRetInst, vt, !cast<PatFrag>(frag#"_region_m0_noret_"#vt.Size), 1>;
+ def : DSAtomicCmpXChg<inst, vt, !cast<PatFrag>(frag#"_region_m0_"#vt.Size),
+ /* complexity */ 0, /* gds */ 1>;
+ def : DSAtomicCmpXChg<noRetInst, vt, !cast<PatFrag>(frag#"_region_m0_noret_"#vt.Size),
+ /* complexity */ 1, /* gds */ 1>;
}
} // End SubtargetPredicate = isGFX11Plus
@@ -1090,17 +1104,20 @@ defm : DSAtomicCmpXChg_mc<DS_CMPSTORE_RTN_B64, DS_CMPSTORE_B64, i64, "atomic_cmp
} // End SubtargetPredicate = isGFX11Plus
let SubtargetPredicate = isGFX90APlus in {
-def : DSAtomicRetPat<DS_ADD_RTN_F64, f64, atomic_load_fadd_local_ret_64>;
+def : DSAtomicRetPat<DS_ADD_RTN_F64, f64, atomic_load_fadd_local_64>;
+let AddedComplexity = 1 in
def : DSAtomicRetPat<DS_ADD_F64, f64, atomic_load_fadd_local_noret_64>;
}
let SubtargetPredicate = isGFX940Plus in {
-def : DSAtomicRetPat<DS_PK_ADD_RTN_F16, v2f16, atomic_load_fadd_v2f16_local_ret_32>;
+def : DSAtomicRetPat<DS_PK_ADD_RTN_F16, v2f16, atomic_load_fadd_v2f16_local_32>;
+let AddedComplexity = 1 in
def : DSAtomicRetPat<DS_PK_ADD_F16, v2f16, atomic_load_fadd_v2f16_local_noret_32>;
def : GCNPat <
- (v2i16 (int_amdgcn_ds_fadd_v2bf16_ret i32:$ptr, v2i16:$src)),
+ (v2i16 (int_amdgcn_ds_fadd_v2bf16 i32:$ptr, v2i16:$src)),
(DS_PK_ADD_RTN_BF16 VGPR_32:$ptr, VGPR_32:$src, 0, 0)
>;
+let AddedComplexity = 1 in
def : GCNPat <
(v2i16 (int_amdgcn_ds_fadd_v2bf16_noret i32:$ptr, v2i16:$src)),
(DS_PK_ADD_BF16 VGPR_32:$ptr, VGPR_32:$src, 0, 0)
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index cb2822818549..c634e15945ad 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -1015,31 +1015,35 @@ class FlatAtomicPatNoRtn <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt
multiclass FlatAtomicPat <string inst, string node, ValueType vt,
ValueType data_vt = vt> {
- defvar rtnNode = !cast<PatFrags>(node#"_ret_"#vt.Size);
+ defvar rtnNode = !cast<PatFrags>(node#"_"#vt.Size);
defvar noRtnNode = !cast<PatFrags>(node#"_noret_"#vt.Size);
def : GCNPat <(vt (rtnNode (FlatOffset i64:$vaddr, i16:$offset), data_vt:$data)),
(!cast<FLAT_Pseudo>(inst#"_RTN") VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)>;
+ let AddedComplexity = 1 in
def : GCNPat <(vt (noRtnNode (FlatOffset i64:$vaddr, i16:$offset), data_vt:$data)),
(!cast<FLAT_Pseudo>(inst) VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)>;
}
multiclass FlatSignedAtomicPat <string inst, string node, ValueType vt,
- ValueType data_vt = vt, bit isIntr = 0> {
- defvar rtnNode = !cast<PatFrags>(node # "_ret" # !if(isIntr, "", "_" # vt.Size));
+ ValueType data_vt = vt, int complexity = 0,
+ bit isIntr = 0> {
+ defvar rtnNode = !cast<SDPatternOperator>(node # !if(isIntr, "", "_" # vt.Size));
defvar noRtnNode = !cast<PatFrags>(node # "_noret" # !if(isIntr, "", "_" # vt.Size));
+ let AddedComplexity = complexity in
def : GCNPat <(vt (rtnNode (GlobalOffset i64:$vaddr, i16:$offset), data_vt:$data)),
(!cast<FLAT_Pseudo>(inst#"_RTN") VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)>;
+ let AddedComplexity = !add(complexity, 1) in
def : GCNPat <(vt (noRtnNode (GlobalOffset i64:$vaddr, i16:$offset), data_vt:$data)),
(!cast<FLAT_Pseudo>(inst) VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)>;
}
multiclass FlatSignedAtomicIntrPat <string inst, string node, ValueType vt,
ValueType data_vt = vt> {
- defm : FlatSignedAtomicPat<inst, node, vt, data_vt, /* isIntr */ 1>;
+ defm : FlatSignedAtomicPat<inst, node, vt, data_vt, /* complexity */ 0, /* isIntr */ 1>;
}
class FlatSignedAtomicPatNoRtn <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
@@ -1260,17 +1264,16 @@ multiclass GlobalFLATAtomicPatsRtn<string nortn_inst_name, SDPatternOperator nod
multiclass GlobalFLATAtomicPats<string inst, string node, ValueType vt,
ValueType data_vt = vt, bit isIntr = 0> {
- defvar rtnNode = !cast<PatFrags>(node # "_ret" # !if(isIntr, "", "_" # vt.Size));
+ defvar rtnNode = !cast<SDPatternOperator>(node # !if(isIntr, "", "_" # vt.Size));
defvar noRtnNode = !cast<PatFrags>(node # "_noret" # !if(isIntr, "", "_" # vt.Size));
- let AddedComplexity = 10 in {
- defm : FlatSignedAtomicPat <inst, node, vt, data_vt, isIntr>;
- }
+ defm : FlatSignedAtomicPat <inst, node, vt, data_vt, /* complexity */ 10, isIntr>;
- let AddedComplexity = 11 in {
- def : GlobalAtomicSaddrPat<!cast<FLAT_Pseudo>(inst#"_SADDR"), noRtnNode, vt, data_vt>;
- def : GlobalAtomicSaddrPat<!cast<FLAT_Pseudo>(inst#"_SADDR_RTN"), rtnNode, vt, data_vt>;
- }
+ let AddedComplexity = 13 in
+ def : GlobalAtomicSaddrPat<!cast<FLAT_Pseudo>(inst#"_SADDR"), noRtnNode, vt, data_vt>;
+
+ let AddedComplexity = 12 in
+ def : GlobalAtomicSaddrPat<!cast<FLAT_Pseudo>(inst#"_SADDR_RTN"), rtnNode, vt, data_vt>;
}
multiclass GlobalFLATAtomicIntrPats<string inst, string node, ValueType vt,
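
Note: for reference, these are the relative priorities the global atomic patterns end up with after this change, expressed as an illustrative enum (higher AddedComplexity wins during instruction selection):

// Relative AddedComplexity of the four global-atomic pattern flavours.
enum GlobalAtomicPatternPriority {
  FlatRtn    = 10,  // VGPR address, result used
  FlatNoRtn  = 11,  // VGPR address, result unused
  SaddrRtn   = 12,  // SGPR base address, result used
  SaddrNoRtn = 13   // SGPR base address, result unused (preferred when legal)
};
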
diff --git a/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp b/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp
new file mode 100644
index 000000000000..83dc3bebf4d3
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp
@@ -0,0 +1,175 @@
+//===- GCNCreateVOPD.cpp - Create VOPD Instructions ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Combine VALU pairs into VOPD instructions
+/// Only works on wave32
+/// Has register requirements, we reject creating VOPD if the requirements are
+/// not met.
+/// shouldCombineVOPD mutator in postRA machine scheduler puts candidate
+/// instructions for VOPD back-to-back
+///
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "GCNVOPDUtils.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIInstrInfo.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
+#include <utility>
+
+#define DEBUG_TYPE "gcn-create-vopd"
+STATISTIC(NumVOPDCreated, "Number of VOPD Insts Created.");
+
+using namespace llvm;
+
+namespace {
+
+class GCNCreateVOPD : public MachineFunctionPass {
+private:
+public:
+ static char ID;
+ const GCNSubtarget *ST = nullptr;
+
+ GCNCreateVOPD() : MachineFunctionPass(ID) {}
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ StringRef getPassName() const override {
+ return "GCN Create VOPD Instructions";
+ }
+
+ bool doReplace(const SIInstrInfo *SII,
+ std::pair<MachineInstr *, MachineInstr *> &Pair) {
+ auto *FirstMI = Pair.first;
+ auto *SecondMI = Pair.second;
+ unsigned Opc1 = FirstMI->getOpcode();
+ unsigned Opc2 = SecondMI->getOpcode();
+ int NewOpcode = AMDGPU::getVOPDFull(AMDGPU::getVOPDOpcode(Opc1),
+ AMDGPU::getVOPDOpcode(Opc2));
+ assert(NewOpcode != -1 &&
+ "Should have previously determined this as a possible VOPD\n");
+
+ auto VOPDInst = BuildMI(*FirstMI->getParent(), FirstMI,
+ FirstMI->getDebugLoc(), SII->get(NewOpcode))
+ .setMIFlags(FirstMI->getFlags() | SecondMI->getFlags());
+ VOPDInst.add(FirstMI->getOperand(0))
+ .add(SecondMI->getOperand(0))
+ .add(FirstMI->getOperand(1));
+
+ switch (Opc1) {
+ case AMDGPU::V_MOV_B32_e32:
+ break;
+ case AMDGPU::V_FMAMK_F32:
+ case AMDGPU::V_FMAAK_F32:
+ VOPDInst.add(FirstMI->getOperand(2));
+ VOPDInst.add(FirstMI->getOperand(3));
+ break;
+ default:
+ VOPDInst.add(FirstMI->getOperand(2));
+ break;
+ }
+
+ VOPDInst.add(SecondMI->getOperand(1));
+
+ switch (Opc2) {
+ case AMDGPU::V_MOV_B32_e32:
+ break;
+ case AMDGPU::V_FMAMK_F32:
+ case AMDGPU::V_FMAAK_F32:
+ VOPDInst.add(SecondMI->getOperand(2));
+ VOPDInst.add(SecondMI->getOperand(3));
+ break;
+ default:
+ VOPDInst.add(SecondMI->getOperand(2));
+ break;
+ }
+
+ VOPDInst.copyImplicitOps(*FirstMI);
+ VOPDInst.copyImplicitOps(*SecondMI);
+
+ LLVM_DEBUG(dbgs() << "VOPD Fused: " << *VOPDInst << " from\tX: "
+ << *Pair.first << "\tY: " << *Pair.second << "\n");
+ FirstMI->eraseFromParent();
+ SecondMI->eraseFromParent();
+ ++NumVOPDCreated;
+ return true;
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override {
+ if (skipFunction(MF.getFunction()))
+ return false;
+ ST = &MF.getSubtarget<GCNSubtarget>();
+ if (!AMDGPU::hasVOPD(*ST) || !ST->isWave32())
+ return false;
+ LLVM_DEBUG(dbgs() << "CreateVOPD Pass:\n");
+
+ const SIInstrInfo *SII = ST->getInstrInfo();
+ bool Changed = false;
+
+ SmallVector<std::pair<MachineInstr *, MachineInstr *>> ReplaceCandidates;
+
+ for (auto &MBB : MF) {
+ auto MII = MBB.begin(), E = MBB.end();
+ while (MII != E) {
+ auto *FirstMI = &*MII;
+ MII = next_nodbg(MII, MBB.end());
+ if (MII == MBB.end())
+ break;
+ if (FirstMI->isDebugInstr())
+ continue;
+ auto *SecondMI = &*MII;
+ unsigned Opc = FirstMI->getOpcode();
+ unsigned Opc2 = SecondMI->getOpcode();
+ llvm::AMDGPU::CanBeVOPD FirstCanBeVOPD = AMDGPU::getCanBeVOPD(Opc);
+ llvm::AMDGPU::CanBeVOPD SecondCanBeVOPD = AMDGPU::getCanBeVOPD(Opc2);
+ std::pair<MachineInstr *, MachineInstr *> Pair;
+
+ if (FirstCanBeVOPD.X && SecondCanBeVOPD.Y)
+ Pair = {FirstMI, SecondMI};
+ else if (FirstCanBeVOPD.Y && SecondCanBeVOPD.X)
+ Pair = {SecondMI, FirstMI};
+ else
+ continue;
+ // checkVOPDRegConstraints cares about program order, but doReplace
+ // cares about X-Y order in the constituted VOPD
+ if (llvm::checkVOPDRegConstraints(*SII, *FirstMI, *SecondMI)) {
+ ReplaceCandidates.push_back(Pair);
+ ++MII;
+ }
+ }
+ }
+ for (auto &Pair : ReplaceCandidates) {
+ Changed |= doReplace(SII, Pair);
+ }
+
+ return Changed;
+ }
+};
+
+} // namespace
+
+char GCNCreateVOPD::ID = 0;
+
+char &llvm::GCNCreateVOPDID = GCNCreateVOPD::ID;
+
+INITIALIZE_PASS(GCNCreateVOPD, DEBUG_TYPE, "GCN Create VOPD Instructions",
+ false, false)
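
Note: an illustrative standalone sketch (not code from this pass) of the ordering decision the scan loop above makes when two adjacent instructions are both VOPD candidates:

#include <optional>
#include <utility>

struct CanBeVOPD { bool X, Y; };   // toy stand-in for AMDGPU::CanBeVOPD

// Returns which instruction becomes the X half and which the Y half
// (0 = first in program order, 1 = second), or nullopt if no order works.
std::optional<std::pair<int, int>> chooseVOPDOrder(CanBeVOPD First,
                                                   CanBeVOPD Second) {
  if (First.X && Second.Y)
    return std::make_pair(0, 1);   // keep program order
  if (First.Y && Second.X)
    return std::make_pair(1, 0);   // swap: second is X, first is Y
  return std::nullopt;
}
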
diff --git a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
index 1cd880eaa48e..5d254518c67a 100644
--- a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
@@ -143,13 +143,20 @@ bool GCNDPPCombine::isShrinkable(MachineInstr &MI) const {
}
int GCNDPPCombine::getDPPOp(unsigned Op, bool IsShrinkable) const {
- auto DPP32 = AMDGPU::getDPPOp32(Op);
+ int DPP32 = AMDGPU::getDPPOp32(Op);
if (IsShrinkable) {
assert(DPP32 == -1);
- auto E32 = AMDGPU::getVOPe32(Op);
+ int E32 = AMDGPU::getVOPe32(Op);
DPP32 = (E32 == -1) ? -1 : AMDGPU::getDPPOp32(E32);
}
- return (DPP32 == -1 || TII->pseudoToMCOpcode(DPP32) == -1) ? -1 : DPP32;
+ if (DPP32 != -1 && TII->pseudoToMCOpcode(DPP32) != -1)
+ return DPP32;
+ int DPP64 = -1;
+ if (ST->hasVOP3DPP())
+ DPP64 = AMDGPU::getDPPOp64(Op);
+ if (DPP64 != -1 && TII->pseudoToMCOpcode(DPP64) != -1)
+ return DPP64;
+ return -1;
}
// tracks the register operand definition and returns:
@@ -188,6 +195,7 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
MovMI.getOpcode() == AMDGPU::V_MOV_B64_dpp ||
MovMI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
+ bool HasVOP3DPP = ST->hasVOP3DPP();
auto OrigOp = OrigMI.getOpcode();
auto DPPOp = getDPPOp(OrigOp, IsShrinkable);
if (DPPOp == -1) {
@@ -201,10 +209,18 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
bool Fail = false;
do {
- auto *Dst = TII->getNamedOperand(OrigMI, AMDGPU::OpName::vdst);
- assert(Dst);
- DPPInst.add(*Dst);
- int NumOperands = 1;
+ int NumOperands = 0;
+ if (auto *Dst = TII->getNamedOperand(OrigMI, AMDGPU::OpName::vdst)) {
+ DPPInst.add(*Dst);
+ ++NumOperands;
+ }
+ if (auto *SDst = TII->getNamedOperand(OrigMI, AMDGPU::OpName::sdst)) {
+ if (TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, SDst)) {
+ DPPInst.add(*SDst);
+ ++NumOperands;
+ }
+ // If we shrunk a 64bit vop3b to 32bits, just ignore the sdst
+ }
const int OldIdx = AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::old);
if (OldIdx != -1) {
@@ -230,7 +246,8 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
AMDGPU::OpName::src0_modifiers)) {
assert(NumOperands == AMDGPU::getNamedOperandIdx(DPPOp,
AMDGPU::OpName::src0_modifiers));
- assert(0LL == (Mod0->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG)));
+ assert(HasVOP3DPP ||
+ (0LL == (Mod0->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG))));
DPPInst.addImm(Mod0->getImm());
++NumOperands;
} else if (AMDGPU::getNamedOperandIdx(DPPOp,
@@ -253,7 +270,8 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
AMDGPU::OpName::src1_modifiers)) {
assert(NumOperands == AMDGPU::getNamedOperandIdx(DPPOp,
AMDGPU::OpName::src1_modifiers));
- assert(0LL == (Mod1->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG)));
+ assert(HasVOP3DPP ||
+ (0LL == (Mod1->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG))));
DPPInst.addImm(Mod1->getImm());
++NumOperands;
} else if (AMDGPU::getNamedOperandIdx(DPPOp,
@@ -261,7 +279,8 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
DPPInst.addImm(0);
++NumOperands;
}
- if (auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1)) {
+ auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1);
+ if (Src1) {
if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src1)) {
LLVM_DEBUG(dbgs() << " failed: src1 is illegal\n");
Fail = true;
@@ -270,8 +289,17 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
DPPInst.add(*Src1);
++NumOperands;
}
-
- if (auto *Src2 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src2)) {
+ if (auto *Mod2 =
+ TII->getNamedOperand(OrigMI, AMDGPU::OpName::src2_modifiers)) {
+ assert(NumOperands ==
+ AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::src2_modifiers));
+ assert(HasVOP3DPP ||
+ (0LL == (Mod2->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG))));
+ DPPInst.addImm(Mod2->getImm());
+ ++NumOperands;
+ }
+ auto *Src2 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src2);
+ if (Src2) {
if (!TII->getNamedOperand(*DPPInst.getInstr(), AMDGPU::OpName::src2) ||
!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src2)) {
LLVM_DEBUG(dbgs() << " failed: src2 is illegal\n");
@@ -279,8 +307,62 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
break;
}
DPPInst.add(*Src2);
+ ++NumOperands;
+ }
+ if (HasVOP3DPP) {
+ auto *ClampOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::clamp);
+ if (ClampOpr &&
+ AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::clamp) != -1) {
+ DPPInst.addImm(ClampOpr->getImm());
+ }
+ auto *VdstInOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::vdst_in);
+ if (VdstInOpr &&
+ AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::vdst_in) != -1) {
+ DPPInst.add(*VdstInOpr);
+ }
+ auto *OmodOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::omod);
+ if (OmodOpr &&
+ AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::omod) != -1) {
+ DPPInst.addImm(OmodOpr->getImm());
+ }
+ // Validate OP_SEL has to be set to all 0 and OP_SEL_HI has to be set to
+ // all 1.
+ if (auto *OpSelOpr =
+ TII->getNamedOperand(OrigMI, AMDGPU::OpName::op_sel)) {
+ auto OpSel = OpSelOpr->getImm();
+ if (OpSel != 0) {
+ LLVM_DEBUG(dbgs() << " failed: op_sel must be zero\n");
+ Fail = true;
+ break;
+ }
+ if (AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::op_sel) != -1)
+ DPPInst.addImm(OpSel);
+ }
+ if (auto *OpSelHiOpr =
+ TII->getNamedOperand(OrigMI, AMDGPU::OpName::op_sel_hi)) {
+ auto OpSelHi = OpSelHiOpr->getImm();
+ // Only vop3p has op_sel_hi, and all vop3p have 3 operands, so check
+ // the bitmask for 3 op_sel_hi bits set
+ assert(Src2 && "Expected vop3p with 3 operands");
+ if (OpSelHi != 7) {
+ LLVM_DEBUG(dbgs() << " failed: op_sel_hi must be all set to one\n");
+ Fail = true;
+ break;
+ }
+ if (AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::op_sel_hi) != -1)
+ DPPInst.addImm(OpSelHi);
+ }
+ auto *NegOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::neg_lo);
+ if (NegOpr &&
+ AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::neg_lo) != -1) {
+ DPPInst.addImm(NegOpr->getImm());
+ }
+ auto *NegHiOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::neg_hi);
+ if (NegHiOpr &&
+ AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::neg_hi) != -1) {
+ DPPInst.addImm(NegHiOpr->getImm());
+ }
}
-
DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::dpp_ctrl));
DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask));
DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::bank_mask));
@@ -531,8 +613,16 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const {
}
bool IsShrinkable = isShrinkable(OrigMI);
- if (!(IsShrinkable || TII->isVOP1(OrigOp) || TII->isVOP2(OrigOp))) {
- LLVM_DEBUG(dbgs() << " failed: not VOP1/2/3\n");
+ if (!(IsShrinkable ||
+ ((TII->isVOP3P(OrigOp) || TII->isVOPC(OrigOp) ||
+ TII->isVOP3(OrigOp)) &&
+ ST->hasVOP3DPP()) ||
+ TII->isVOP1(OrigOp) || TII->isVOP2(OrigOp))) {
+ LLVM_DEBUG(dbgs() << " failed: not VOP1/2/3/3P/C\n");
+ break;
+ }
+ if (OrigMI.modifiesRegister(AMDGPU::EXEC, ST->getRegisterInfo())) {
+ LLVM_DEBUG(dbgs() << " failed: can't combine v_cmpx\n");
break;
}
@@ -543,9 +633,12 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const {
break;
}
+ auto *Src2 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src2);
assert(Src0 && "Src1 without Src0?");
- if (Src1 && Src1->isIdenticalTo(*Src0)) {
- assert(Src1->isReg());
+ if ((Use == Src0 && ((Src1 && Src1->isIdenticalTo(*Src0)) ||
+ (Src2 && Src2->isIdenticalTo(*Src0)))) ||
+ (Use == Src1 && (Src1->isIdenticalTo(*Src0) ||
+ (Src2 && Src2->isIdenticalTo(*Src1))))) {
LLVM_DEBUG(
dbgs()
<< " " << OrigMI
diff --git a/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp b/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp
new file mode 100644
index 000000000000..a5008e39d91a
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp
@@ -0,0 +1,212 @@
+//===- GCNVOPDUtils.cpp - GCN VOPD Utils ------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This file contains the AMDGPU DAG scheduling
+/// mutation to pair VOPD instructions back to back. It also contains
+// subroutines useful in the creation of VOPD instructions
+//
+//===----------------------------------------------------------------------===//
+
+#include "GCNVOPDUtils.h"
+#include "AMDGPUSubtarget.h"
+#include "GCNSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIInstrInfo.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MacroFusion.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/CodeGen/ScheduleDAGMutation.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/MC/MCInst.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "gcn-vopd-utils"
+
+bool llvm::checkVOPDRegConstraints(const SIInstrInfo &TII,
+ const MachineInstr &FirstMI,
+ const MachineInstr &SecondMI) {
+ const MachineFunction *MF = FirstMI.getMF();
+ const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
+ const SIRegisterInfo *TRI = dyn_cast<SIRegisterInfo>(ST.getRegisterInfo());
+ const MachineRegisterInfo &MRI = MF->getRegInfo();
+ const unsigned NumVGPRBanks = 4;
+ // Literals also count against scalar bus limit
+ SmallVector<const MachineOperand *> UniqueLiterals;
+ auto addLiteral = [&](const MachineOperand &Op) {
+ for (auto &Literal : UniqueLiterals) {
+ if (Literal->isIdenticalTo(Op))
+ return;
+ }
+ UniqueLiterals.push_back(&Op);
+ };
+ SmallVector<Register> UniqueScalarRegs;
+ assert([&]() -> bool {
+ for (auto MII = MachineBasicBlock::const_iterator(&FirstMI);
+ MII != FirstMI.getParent()->instr_end(); ++MII) {
+ if (&*MII == &SecondMI)
+ return true;
+ }
+ return false;
+ }() && "Expected FirstMI to precede SecondMI");
+ // Cannot pair dependent instructions
+ for (const auto &Use : SecondMI.uses())
+ if (Use.isReg() && FirstMI.modifiesRegister(Use.getReg()))
+ return false;
+
+ struct ComponentInfo {
+ ComponentInfo(const MachineInstr &MI) : MI(MI) {}
+ Register Dst, Reg0, Reg1, Reg2;
+ const MachineInstr &MI;
+ };
+ ComponentInfo CInfo[] = {ComponentInfo(FirstMI), ComponentInfo(SecondMI)};
+
+ for (ComponentInfo &Comp : CInfo) {
+ switch (Comp.MI.getOpcode()) {
+ case AMDGPU::V_FMAMK_F32:
+ // cannot inline the fixed literal in fmamk
+ addLiteral(Comp.MI.getOperand(2));
+ Comp.Reg2 = Comp.MI.getOperand(3).getReg();
+ break;
+ case AMDGPU::V_FMAAK_F32:
+ // cannot inline the fixed literal in fmaak
+ addLiteral(Comp.MI.getOperand(3));
+ Comp.Reg1 = Comp.MI.getOperand(2).getReg();
+ break;
+ case AMDGPU::V_FMAC_F32_e32:
+ case AMDGPU::V_DOT2_F32_F16:
+ case AMDGPU::V_DOT2_F32_BF16:
+ Comp.Reg1 = Comp.MI.getOperand(2).getReg();
+ Comp.Reg2 = Comp.MI.getOperand(0).getReg();
+ break;
+ case AMDGPU::V_CNDMASK_B32_e32:
+ UniqueScalarRegs.push_back(AMDGPU::VCC_LO);
+ Comp.Reg1 = Comp.MI.getOperand(2).getReg();
+ break;
+ case AMDGPU::V_MOV_B32_e32:
+ break;
+ default:
+ Comp.Reg1 = Comp.MI.getOperand(2).getReg();
+ break;
+ }
+
+ Comp.Dst = Comp.MI.getOperand(0).getReg();
+
+ const MachineOperand &Op0 = Comp.MI.getOperand(1);
+ if (Op0.isReg()) {
+ if (!TRI->isVectorRegister(MRI, Op0.getReg())) {
+ if (!is_contained(UniqueScalarRegs, Op0.getReg()))
+ UniqueScalarRegs.push_back(Op0.getReg());
+ } else
+ Comp.Reg0 = Op0.getReg();
+ } else {
+ if (!TII.isInlineConstant(Comp.MI, 1))
+ addLiteral(Op0);
+ }
+ }
+
+ if (UniqueLiterals.size() > 1)
+ return false;
+ if ((UniqueLiterals.size() + UniqueScalarRegs.size()) > 2)
+ return false;
+
+ // check port 0
+ if (CInfo[0].Reg0 && CInfo[1].Reg0 &&
+ CInfo[0].Reg0 % NumVGPRBanks == CInfo[1].Reg0 % NumVGPRBanks)
+ return false;
+ // check port 1
+ if (CInfo[0].Reg1 && CInfo[1].Reg1 &&
+ CInfo[0].Reg1 % NumVGPRBanks == CInfo[1].Reg1 % NumVGPRBanks)
+ return false;
+ // check port 2
+ if (CInfo[0].Reg2 && CInfo[1].Reg2 &&
+ !((CInfo[0].Reg2 ^ CInfo[1].Reg2) & 0x1))
+ return false;
+ if (!((CInfo[0].Dst ^ CInfo[1].Dst) & 0x1))
+ return false;
+
+ LLVM_DEBUG(dbgs() << "VOPD Reg Constraints Passed\n\tX: " << FirstMI
+ << "\n\tY: " << SecondMI << "\n");
+ return true;
+}
+
+/// Check if the instr pair, FirstMI and SecondMI, should be scheduled
+/// together. Given SecondMI, when FirstMI is unspecified, then check if
+/// SecondMI may be part of a fused pair at all.
+static bool shouldScheduleVOPDAdjacent(const TargetInstrInfo &TII,
+ const TargetSubtargetInfo &TSI,
+ const MachineInstr *FirstMI,
+ const MachineInstr &SecondMI) {
+ const SIInstrInfo &STII = static_cast<const SIInstrInfo &>(TII);
+ unsigned Opc2 = SecondMI.getOpcode();
+ auto SecondCanBeVOPD = AMDGPU::getCanBeVOPD(Opc2);
+
+ // One instruction case
+ if (!FirstMI)
+ return SecondCanBeVOPD.Y;
+
+ unsigned Opc = FirstMI->getOpcode();
+ auto FirstCanBeVOPD = AMDGPU::getCanBeVOPD(Opc);
+
+ if (!((FirstCanBeVOPD.X && SecondCanBeVOPD.Y) ||
+ (FirstCanBeVOPD.Y && SecondCanBeVOPD.X)))
+ return false;
+
+ return checkVOPDRegConstraints(STII, *FirstMI, SecondMI);
+}
+
+/// Adapts the design from MacroFusion.
+/// Puts valid candidate instructions back-to-back so they can easily
+/// be turned into VOPD instructions.
+/// Greedily pairs instruction candidates; O(n^2) algorithm.
+struct VOPDPairingMutation : ScheduleDAGMutation {
+ ShouldSchedulePredTy shouldScheduleAdjacent; // NOLINT: function pointer
+
+ VOPDPairingMutation(
+ ShouldSchedulePredTy shouldScheduleAdjacent) // NOLINT: function pointer
+ : shouldScheduleAdjacent(shouldScheduleAdjacent) {}
+
+ void apply(ScheduleDAGInstrs *DAG) override {
+ const TargetInstrInfo &TII = *DAG->TII;
+ const GCNSubtarget &ST = DAG->MF.getSubtarget<GCNSubtarget>();
+ if (!AMDGPU::hasVOPD(ST) || !ST.isWave32()) {
+ LLVM_DEBUG(dbgs() << "Target does not support VOPDPairingMutation\n");
+ return;
+ }
+
+ std::vector<SUnit>::iterator ISUI, JSUI;
+ for (ISUI = DAG->SUnits.begin(); ISUI != DAG->SUnits.end(); ++ISUI) {
+ const MachineInstr *IMI = ISUI->getInstr();
+ if (!shouldScheduleAdjacent(TII, ST, nullptr, *IMI))
+ continue;
+ if (!hasLessThanNumFused(*ISUI, 2))
+ continue;
+
+ for (JSUI = ISUI + 1; JSUI != DAG->SUnits.end(); ++JSUI) {
+ if (JSUI->isBoundaryNode())
+ continue;
+ const MachineInstr *JMI = JSUI->getInstr();
+ if (!hasLessThanNumFused(*JSUI, 2) ||
+ !shouldScheduleAdjacent(TII, ST, IMI, *JMI))
+ continue;
+ if (fuseInstructionPair(*DAG, *ISUI, *JSUI))
+ break;
+ }
+ }
+ LLVM_DEBUG(dbgs() << "Completed VOPDPairingMutation\n");
+ }
+};
+
+std::unique_ptr<ScheduleDAGMutation> llvm::createVOPDPairingMutation() {
+ return std::make_unique<VOPDPairingMutation>(shouldScheduleVOPDAdjacent);
+}
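
Note for readers of the patch: checkVOPDRegConstraints above encodes the VOPD dual-issue operand rules. The X and Y components may not read their port-0 or port-1 VGPR source from the same of the four banks, their port-2 sources and their destination VGPRs must differ in even/odd parity, and the pair as a whole is allowed at most one unique literal and at most two scalar-bus operands. Below is a minimal standalone sketch of just the bank arithmetic; it is an illustration, not part of the patch, and uses plain unsigned register numbers with 0 meaning "port unused", mirroring how an invalid Register compares false above.

// Illustrative sketch only: the bank checks used by checkVOPDRegConstraints,
// written against plain integers (0 == port unused).
#include <cstdint>

constexpr unsigned NumVGPRBanks = 4;

static bool vopdBanksCompatible(uint32_t XReg0, uint32_t XReg1, uint32_t XReg2,
                                uint32_t XDst, uint32_t YReg0, uint32_t YReg1,
                                uint32_t YReg2, uint32_t YDst) {
  // Ports 0 and 1: the two components must read from different VGPR banks.
  if (XReg0 && YReg0 && XReg0 % NumVGPRBanks == YReg0 % NumVGPRBanks)
    return false;
  if (XReg1 && YReg1 && XReg1 % NumVGPRBanks == YReg1 % NumVGPRBanks)
    return false;
  // Port 2 and the destinations: only the even/odd parity has to differ.
  if (XReg2 && YReg2 && !((XReg2 ^ YReg2) & 0x1))
    return false;
  if (!((XDst ^ YDst) & 0x1))
    return false;
  return true;
}
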
diff --git a/llvm/lib/Target/AMDGPU/GCNVOPDUtils.h b/llvm/lib/Target/AMDGPU/GCNVOPDUtils.h
new file mode 100644
index 000000000000..22361b9a1a07
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/GCNVOPDUtils.h
@@ -0,0 +1,32 @@
+//===- GCNVOPDUtils.h - GCN VOPD Utils ------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This file contains the AMDGPU DAG scheduling
+/// mutation to pair VOPD instructions back to back. It also contains
+/// subroutines useful in the creation of VOPD instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_VOPDUTILS_H
+#define LLVM_LIB_TARGET_AMDGPU_VOPDUTILS_H
+
+#include "llvm/CodeGen/MachineScheduler.h"
+
+namespace llvm {
+
+class SIInstrInfo;
+
+bool checkVOPDRegConstraints(const SIInstrInfo &TII,
+ const MachineInstr &FirstMI,
+ const MachineInstr &SecondMI);
+
+std::unique_ptr<ScheduleDAGMutation> createVOPDPairingMutation();
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_VOPDUTILS_H
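
The header's only entry points are the register-constraint check and the mutation factory. A hedged sketch of how a target would hook the mutation into its MachineScheduler is below; the wrapper function is hypothetical, and only DAG.addMutation() and createVOPDPairingMutation() are real interfaces.

// Hypothetical helper; where AMDGPU actually registers the mutation is not
// shown in this section, so treat the function and its caller as assumptions.
#include "GCNVOPDUtils.h"
#include "llvm/CodeGen/MachineScheduler.h"

static void addVOPDPairing(llvm::ScheduleDAGMI &DAG) {
  // Applied during postprocessDAG(): pairs VOPD candidates back-to-back so a
  // later pass can rewrite them into V_DUAL_* instructions.
  DAG.addMutation(llvm::createVOPDPairingMutation());
}
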
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h
index 02c213f90f89..228963ff2a20 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h
@@ -62,12 +62,6 @@ public:
virtual void getAVOperandEncoding(const MCInst &MI, unsigned OpNo, APInt &Op,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const = 0;
-
-protected:
- FeatureBitset computeAvailableFeatures(const FeatureBitset &FB) const;
- void
- verifyInstructionPredicates(const MCInst &MI,
- const FeatureBitset &AvailableFeatures) const;
};
} // End namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
index 11fe3f9ef058..fba4b1a3db66 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
@@ -36,6 +36,7 @@
using namespace llvm;
#define GET_INSTRINFO_MC_DESC
+#define ENABLE_INSTR_PREDICATE_VERIFIER
#include "AMDGPUGenInstrInfo.inc"
#define GET_SUBTARGETINFO_MC_DESC
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h
index 060d4b660632..c2e2563c3989 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h
@@ -50,6 +50,7 @@ createAMDGPUELFObjectWriter(bool Is64Bit, uint8_t OSABI,
#define GET_INSTRINFO_ENUM
#define GET_INSTRINFO_OPERAND_ENUM
+#define GET_INSTRINFO_MC_HELPER_DECLS
#include "AMDGPUGenInstrInfo.inc"
#define GET_SUBTARGETINFO_ENUM
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp
index 78eb304fe84f..3d926e52c368 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp
@@ -58,11 +58,6 @@ private:
uint64_t getBinaryCodeForInstr(const MCInst &MI,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const;
- FeatureBitset computeAvailableFeatures(const FeatureBitset &FB) const;
- void
- verifyInstructionPredicates(const MCInst &MI,
- const FeatureBitset &AvailableFeatures) const;
-
};
} // end anonymous namespace
@@ -90,11 +85,8 @@ MCCodeEmitter *llvm::createR600MCCodeEmitter(const MCInstrInfo &MCII,
}
void R600MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const {
- verifyInstructionPredicates(MI,
- computeAvailableFeatures(STI.getFeatureBits()));
-
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
const MCInstrDesc &Desc = MCII.get(MI.getOpcode());
if (MI.getOpcode() == R600::RETURN ||
MI.getOpcode() == R600::FETCH_CLAUSE ||
@@ -187,5 +179,4 @@ uint64_t R600MCCodeEmitter::getMachineOpValue(const MCInst &MI,
return MO.getImm();
}
-#define ENABLE_INSTR_PREDICATE_VERIFIER
#include "R600GenMCCodeEmitter.inc"
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.cpp
index 269209a12175..b9ff195e0ddc 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.cpp
@@ -13,10 +13,12 @@
#include "R600MCTargetDesc.h"
#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/SubtargetFeature.h"
using namespace llvm;
#define GET_INSTRINFO_MC_DESC
+#define ENABLE_INSTR_PREDICATE_VERIFIER
#include "R600GenInstrInfo.inc"
MCInstrInfo *llvm::createR600MCInstrInfo() {
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.h
index 605ae851378d..b4ce748532f8 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.h
@@ -35,6 +35,7 @@ MCInstrInfo *createR600MCInstrInfo();
#define GET_INSTRINFO_ENUM
#define GET_INSTRINFO_OPERAND_ENUM
#define GET_INSTRINFO_SCHED_ENUM
+#define GET_INSTRINFO_MC_HELPER_DECLS
#include "R600GenInstrInfo.inc"
#define GET_SUBTARGETINFO_ENUM
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
index 5e67fb5ec876..e093d78b2cc6 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
@@ -310,11 +310,8 @@ uint64_t SIMCCodeEmitter::getImplicitOpSelHiEncoding(int Opcode) const {
}
void SIMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const {
- verifyInstructionPredicates(MI,
- computeAvailableFeatures(STI.getFeatureBits()));
-
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
int Opcode = MI.getOpcode();
APInt Encoding, Scratch;
getBinaryCodeForInstr(MI, Fixups, Encoding, Scratch, STI);
@@ -574,5 +571,4 @@ void SIMCCodeEmitter::getMachineOpValueCommon(
llvm_unreachable("Encoding of this operand type is not supported yet.");
}
-#define ENABLE_INSTR_PREDICATE_VERIFIER
#include "AMDGPUGenMCCodeEmitter.inc"
diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
index bf52f7830ad7..5199a37a0519 100644
--- a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
@@ -1623,7 +1623,7 @@ static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
NewBldVec);
}
-SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector, SDValue Swz[4],
+SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector, SDValue Swz[],
SelectionDAG &DAG,
const SDLoc &DL) const {
// Old -> New swizzle values
diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.h b/llvm/lib/Target/AMDGPU/R600ISelLowering.h
index 1e75a0432ec3..e7706fa0ef5c 100644
--- a/llvm/lib/Target/AMDGPU/R600ISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.h
@@ -74,8 +74,8 @@ private:
void lowerImplicitParameter(MachineInstr *MI, MachineBasicBlock &BB,
MachineRegisterInfo & MRI, unsigned dword_offset) const;
- SDValue OptimizeSwizzle(SDValue BuildVector, SDValue Swz[], SelectionDAG &DAG,
- const SDLoc &DL) const;
+ SDValue OptimizeSwizzle(SDValue BuildVector, SDValue Swz[],
+ SelectionDAG &DAG, const SDLoc &DL) const;
SDValue vectorToVerticalVector(SelectionDAG &DAG, SDValue Vector) const;
SDValue lowerFrameIndex(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/AMDGPU/R600MCInstLower.cpp b/llvm/lib/Target/AMDGPU/R600MCInstLower.cpp
index 8f7807a2b472..f81f5122bbc9 100644
--- a/llvm/lib/Target/AMDGPU/R600MCInstLower.cpp
+++ b/llvm/lib/Target/AMDGPU/R600MCInstLower.cpp
@@ -13,6 +13,7 @@
//
#include "AMDGPUMCInstLower.h"
+#include "MCTargetDesc/R600MCTargetDesc.h"
#include "R600AsmPrinter.h"
#include "R600Subtarget.h"
#include "llvm/CodeGen/MachineOperand.h"
@@ -42,6 +43,9 @@ void R600MCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
}
void R600AsmPrinter::emitInstruction(const MachineInstr *MI) {
+ R600_MC::verifyInstructionPredicates(MI->getOpcode(),
+ getSubtargetInfo().getFeatureBits());
+
const R600Subtarget &STI = MF->getSubtarget<R600Subtarget>();
R600MCInstLower MCInstLowering(OutContext, STI, *this);
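
The MCTargetDesc hunks above change where instruction predicates are verified: instead of each MCCodeEmitter calling a locally generated checker, ENABLE_INSTR_PREDICATE_VERIFIER is now defined next to GET_INSTRINFO_MC_DESC so TableGen emits the checker once into a <Target>_MC namespace, GET_INSTRINFO_MC_HELPER_DECLS declares it, and the AsmPrinter calls it per instruction, as the R600 hunk directly above shows. A condensed sketch of the pattern, reusing the R600 names that appear in this diff (the two halves live in different files in practice):

// Half 1, in the MCTargetDesc .cpp: generate the table-driven checker.
#define GET_INSTRINFO_MC_DESC
#define ENABLE_INSTR_PREDICATE_VERIFIER
#include "R600GenInstrInfo.inc"

// Half 2, in a client such as the AsmPrinter: declare it and call it.
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/MC/MCSubtargetInfo.h"
#define GET_INSTRINFO_ENUM
#define GET_INSTRINFO_MC_HELPER_DECLS
#include "R600GenInstrInfo.inc"

void emitOneInstruction(const llvm::MachineInstr &MI,
                        const llvm::MCSubtargetInfo &STI) {
  R600_MC::verifyInstructionPredicates(MI.getOpcode(), STI.getFeatureBits());
  // ... lower MI to an MCInst and emit it as before ...
}
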
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 094d5cd58673..d16da2a8b86b 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -352,7 +352,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
// TODO: Generalize to more vector types.
setOperationAction({ISD::EXTRACT_VECTOR_ELT, ISD::INSERT_VECTOR_ELT},
{MVT::v2i16, MVT::v2f16, MVT::v2i8, MVT::v4i8, MVT::v8i8,
- MVT::v4i16, MVT::v4f16, MVT::v16i16, MVT::v16f16},
+ MVT::v4i16, MVT::v4f16},
Custom);
// Deal with vec3 vector operations when widened to vec4.
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 814a7c446889..799d34e32d27 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -3335,15 +3335,18 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
(ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() ||
!RI.isSGPRReg(MBB.getParent()->getRegInfo(), Src0->getReg()))) {
MachineInstr *DefMI;
- const auto killDef = [&DefMI, &MBB, this]() -> void {
+ const auto killDef = [&]() -> void {
const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
// The only user is the instruction which will be killed.
- if (!MRI.hasOneNonDBGUse(DefMI->getOperand(0).getReg()))
+ Register DefReg = DefMI->getOperand(0).getReg();
+ if (!MRI.hasOneNonDBGUse(DefReg))
return;
// We cannot just remove the DefMI here, calling pass will crash.
DefMI->setDesc(get(AMDGPU::IMPLICIT_DEF));
for (unsigned I = DefMI->getNumOperands() - 1; I != 0; --I)
DefMI->removeOperand(I);
+ if (LV)
+ LV->getVarInfo(DefReg).AliveBlocks.clear();
};
int64_t Imm;
@@ -3982,6 +3985,14 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
+ int Src3Idx = -1;
+ if (Src0Idx == -1) {
+ // VOPD V_DUAL_* instructions use different operand names.
+ Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0X);
+ Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1X);
+ Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0Y);
+ Src3Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1Y);
+ }
// Make sure the number of operands is correct.
const MCInstrDesc &Desc = get(Opcode);
@@ -4255,9 +4266,9 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
// Only look at the true operands. Only a real operand can use the constant
// bus, and we don't want to check pseudo-operands like the source modifier
// flags.
- for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx}) {
+ for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx, Src3Idx}) {
if (OpIdx == -1)
- break;
+ continue;
const MachineOperand &MO = MI.getOperand(OpIdx);
if (usesConstantBus(MRI, MO, MI.getDesc().OpInfo[OpIdx])) {
if (MO.isReg()) {
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 311f9f68e675..1b411eb83eb3 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1242,6 +1242,9 @@ namespace AMDGPU {
int getDPPOp32(uint16_t Opcode);
LLVM_READONLY
+ int getDPPOp64(uint16_t Opcode);
+
+ LLVM_READONLY
int getBasicFromSDWAOp(uint16_t Opcode);
LLVM_READONLY
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 29ee9f12b12d..23afd6556bc9 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -193,43 +193,32 @@ def SIbuffer_atomic_fadd : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FADD">;
def SIbuffer_atomic_fmin : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FMIN">;
def SIbuffer_atomic_fmax : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FMAX">;
-multiclass SDBufferAtomicRetNoRet {
- def "_ret" : PatFrag<
- (ops node:$vdata_in, node:$rsrc, node:$vindex, node:$voffset, node:$soffset,
- node:$offset, node:$cachepolicy, node:$idxen),
- (!cast<SDNode>(NAME) node:$vdata_in, node:$rsrc, node:$vindex,
- node:$voffset, node:$soffset, node:$offset, node:$cachepolicy,
- node:$idxen)> {
- let PredicateCode = [{ return !(SDValue(N, 0).use_empty()); }];
- let GISelPredicateCode = [{ return true; }];
- }
-
+multiclass SDBufferAtomicNoRet {
def "_noret" : PatFrag<
(ops node:$vdata_in, node:$rsrc, node:$vindex, node:$voffset, node:$soffset,
node:$offset, node:$cachepolicy, node:$idxen),
(!cast<SDNode>(NAME) node:$vdata_in, node:$rsrc, node:$vindex,
node:$voffset, node:$soffset, node:$offset, node:$cachepolicy,
node:$idxen)> {
- let PredicateCode = [{ return SDValue(N, 0).use_empty(); }];
- let GISelPredicateCode = [{ return false; }];
+ let HasNoUse = true;
}
}
-defm SIbuffer_atomic_swap : SDBufferAtomicRetNoRet;
-defm SIbuffer_atomic_add : SDBufferAtomicRetNoRet;
-defm SIbuffer_atomic_sub : SDBufferAtomicRetNoRet;
-defm SIbuffer_atomic_smin : SDBufferAtomicRetNoRet;
-defm SIbuffer_atomic_umin : SDBufferAtomicRetNoRet;
-defm SIbuffer_atomic_smax : SDBufferAtomicRetNoRet;
-defm SIbuffer_atomic_umax : SDBufferAtomicRetNoRet;
-defm SIbuffer_atomic_and : SDBufferAtomicRetNoRet;
-defm SIbuffer_atomic_or : SDBufferAtomicRetNoRet;
-defm SIbuffer_atomic_xor : SDBufferAtomicRetNoRet;
-defm SIbuffer_atomic_inc : SDBufferAtomicRetNoRet;
-defm SIbuffer_atomic_dec : SDBufferAtomicRetNoRet;
-defm SIbuffer_atomic_fadd : SDBufferAtomicRetNoRet;
-defm SIbuffer_atomic_fmin : SDBufferAtomicRetNoRet;
-defm SIbuffer_atomic_fmax : SDBufferAtomicRetNoRet;
+defm SIbuffer_atomic_swap : SDBufferAtomicNoRet;
+defm SIbuffer_atomic_add : SDBufferAtomicNoRet;
+defm SIbuffer_atomic_sub : SDBufferAtomicNoRet;
+defm SIbuffer_atomic_smin : SDBufferAtomicNoRet;
+defm SIbuffer_atomic_umin : SDBufferAtomicNoRet;
+defm SIbuffer_atomic_smax : SDBufferAtomicNoRet;
+defm SIbuffer_atomic_umax : SDBufferAtomicNoRet;
+defm SIbuffer_atomic_and : SDBufferAtomicNoRet;
+defm SIbuffer_atomic_or : SDBufferAtomicNoRet;
+defm SIbuffer_atomic_xor : SDBufferAtomicNoRet;
+defm SIbuffer_atomic_inc : SDBufferAtomicNoRet;
+defm SIbuffer_atomic_dec : SDBufferAtomicNoRet;
+defm SIbuffer_atomic_fadd : SDBufferAtomicNoRet;
+defm SIbuffer_atomic_fmin : SDBufferAtomicNoRet;
+defm SIbuffer_atomic_fmax : SDBufferAtomicNoRet;
def SIbuffer_atomic_cmpswap : SDNode <"AMDGPUISD::BUFFER_ATOMIC_CMPSWAP",
SDTypeProfile<1, 9,
@@ -246,24 +235,13 @@ def SIbuffer_atomic_cmpswap : SDNode <"AMDGPUISD::BUFFER_ATOMIC_CMPSWAP",
[SDNPMemOperand, SDNPHasChain, SDNPMayLoad, SDNPMayStore]
>;
-def SIbuffer_atomic_cmpswap_ret : PatFrag<
- (ops node:$src, node:$cmp, node:$rsrc, node:$vindex, node:$voffset,
- node:$soffset, node:$offset, node:$cachepolicy, node:$idxen),
- (SIbuffer_atomic_cmpswap node:$src, node:$cmp, node:$rsrc, node:$vindex,
- node:$voffset, node:$soffset, node:$offset, node:$cachepolicy,
- node:$idxen)> {
- let PredicateCode = [{ return !(SDValue(N, 0).use_empty()); }];
- let GISelPredicateCode = [{ return true; }];
-}
-
def SIbuffer_atomic_cmpswap_noret : PatFrag<
(ops node:$src, node:$cmp, node:$rsrc, node:$vindex, node:$voffset,
node:$soffset, node:$offset, node:$cachepolicy, node:$idxen),
(SIbuffer_atomic_cmpswap node:$src, node:$cmp, node:$rsrc, node:$vindex,
node:$voffset, node:$soffset, node:$offset, node:$cachepolicy,
node:$idxen)> {
- let PredicateCode = [{ return SDValue(N, 0).use_empty(); }];
- let GISelPredicateCode = [{ return false; }];
+ let HasNoUse = true;
}
class SDGlobalAtomicNoRtn<string opcode, ValueType ty> : SDNode <opcode,
@@ -774,13 +752,13 @@ multiclass SIAtomicM0Glue2 <string op_name, bit is_amdgpu = 0,
let AddressSpaces = StoreAddress_local.AddrSpaces in {
defm _local_m0 : binary_atomic_op <!cast<SDNode>(NAME#"_glue"), IsInt>;
- defm _local_m0 : ret_noret_binary_atomic_op <!cast<SDNode>(NAME#"_glue"),
+ defm _local_m0 : noret_binary_atomic_op <!cast<SDNode>(NAME#"_glue"),
IsInt>;
}
let AddressSpaces = StoreAddress_region.AddrSpaces in {
defm _region_m0 : binary_atomic_op <!cast<SDNode>(NAME#"_glue"), IsInt>;
- defm _region_m0 : ret_noret_binary_atomic_op <!cast<SDNode>(NAME#"_glue"),
+ defm _region_m0 : noret_binary_atomic_op <!cast<SDNode>(NAME#"_glue"),
IsInt>;
}
}
@@ -2194,21 +2172,21 @@ class getAsmVOP3DPPBase <int NumSrcArgs, bit HasDst, bit HasClamp,
"$sdst",
"$vdst"),
""); // use $sdst for VOPC
- string isrc0 = !if(!eq(NumSrcArgs, 1), "$src0", "$src0,");
- string isrc1 = !if(!eq(NumSrcArgs, 1), "",
- !if(!eq(NumSrcArgs, 2), " $src1",
- " $src1,"));
- string isrc2 = !if(!eq(NumSrcArgs, 3), " $src2", "");
-
- string fsrc0 = !if(!eq(NumSrcArgs, 1), "$src0_modifiers", "$src0_modifiers,");
- string fsrc1 = !if(!eq(NumSrcArgs, 1), "",
- !if(!eq(NumSrcArgs, 2), " $src1_modifiers",
- " $src1_modifiers,"));
- string fsrc2 = !if(!eq(NumSrcArgs, 3), " $src2_modifiers", "");
-
- string src0 = !if(Src0HasMods, fsrc0, isrc0);
- string src1 = !if(Src1HasMods, fsrc1, isrc1);
- string src2 = !if(Src2HasMods, fsrc2, isrc2);
+ string src0nomods = !if(!eq(NumSrcArgs, 1), "$src0", "$src0,");
+ string src1nomods = !if(!eq(NumSrcArgs, 1), "",
+ !if(!eq(NumSrcArgs, 2), " $src1",
+ " $src1,"));
+ string src2nomods = !if(!eq(NumSrcArgs, 3), " $src2", "");
+
+ string src0mods = !if(!eq(NumSrcArgs, 1), "$src0_modifiers", "$src0_modifiers,");
+ string src1mods = !if(!eq(NumSrcArgs, 1), "",
+ !if(!eq(NumSrcArgs, 2), " $src1_modifiers",
+ " $src1_modifiers,"));
+ string src2mods = !if(!eq(NumSrcArgs, 3), " $src2_modifiers", "");
+
+ string src0 = !if(Src0HasMods, src0mods, src0nomods);
+ string src1 = !if(Src1HasMods, src1mods, src1nomods);
+ string src2 = !if(Src2HasMods, src2mods, src2nomods);
string opsel = !if(HasOpSel, "$op_sel", "");
string 3PMods = !if(IsVOP3P,
!if(HasOpSel, "$op_sel_hi", "")
@@ -2559,8 +2537,8 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableF32SrcMods = 0,
// the asm operand name via this HasModifiers flag
field string AsmDPP8 = getAsmDPP8<HasDst, NumSrcArgs, 0 /*HasModifiers*/, DstVT>.ret;
field string AsmVOP3DPPBase = getAsmVOP3DPPBase<NumSrcArgs, HasDst, HasClamp,
- HasOpSel, HasOMod, IsVOP3P, HasModifiers, HasSrc0FloatMods, HasSrc1FloatMods,
- HasSrc2FloatMods, DstVT >.ret;
+ HasOpSel, HasOMod, IsVOP3P, HasModifiers, HasModifiers, HasModifiers,
+ HasModifiers, DstVT>.ret;
field string AsmVOP3DPP = getAsmVOP3DPP<AsmVOP3DPPBase>.ret;
field string AsmVOP3DPP16 = getAsmVOP3DPP16<AsmVOP3DPPBase>.ret;
field string AsmVOP3DPP8 = getAsmVOP3DPP8<AsmVOP3DPPBase>.ret;
@@ -2800,6 +2778,14 @@ def getDPPOp32 : InstrMapping {
let ValueCols = [["DPP"]];
}
+def getDPPOp64 : InstrMapping {
+ let FilterClass = "VOP";
+ let RowFields = ["OpName"];
+ let ColFields = ["AsmVariantName"];
+ let KeyCol = ["VOP3"];
+ let ValueCols = [["VOP3_DPP"]];
+}
+
// Maps an commuted opcode to its original version
def getCommuteOrig : InstrMapping {
let FilterClass = "Commutable_REV";
@@ -2961,6 +2947,27 @@ def getVCMPXOpFromVCMP : InstrMapping {
let ValueCols = [["1"]];
}
+def VOPDComponentTable : GenericTable {
+ let FilterClass = "VOPD_Component";
+ let CppTypeName = "VOPDComponentInfo";
+ let Fields = ["BaseVOP", "VOPDOp", "CanBeVOPDX"];
+ let PrimaryKey = ["BaseVOP"];
+ let PrimaryKeyName = "getVOPDComponentHelper";
+}
+
+def VOPDPairs : GenericTable {
+ let FilterClass = "VOPD_Base";
+ let CppTypeName = "VOPDInfo";
+ let Fields = ["Opcode", "OpX", "OpY"];
+ let PrimaryKey = ["Opcode"];
+ let PrimaryKeyName = "getVOPDOpcodeHelper";
+}
+
+def getVOPDInfoFromComponentOpcodes : SearchIndex {
+ let Table = VOPDPairs;
+ let Key = ["OpX", "OpY"];
+}
+
include "SIInstructions.td"
include "DSInstructions.td"
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 829669157893..ce8c03bb8d64 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1449,6 +1449,14 @@ def : BitConvert <v8i32, v16f16, VReg_256>;
def : BitConvert <v8i32, v16i16, VReg_256>;
def : BitConvert <v8f32, v16f16, VReg_256>;
def : BitConvert <v8f32, v16i16, VReg_256>;
+def : BitConvert <v16f16, v4i64, VReg_256>;
+def : BitConvert <v16i16, v4i64, VReg_256>;
+def : BitConvert <v16f16, v4f64, VReg_256>;
+def : BitConvert <v16i16, v4f64, VReg_256>;
+def : BitConvert <v4i64, v16f16, VReg_256>;
+def : BitConvert <v4i64, v16i16, VReg_256>;
+def : BitConvert <v4f64, v16f16, VReg_256>;
+def : BitConvert <v4f64, v16i16, VReg_256>;
// 512-bit bitcast
def : BitConvert <v16i32, v16f32, VReg_512>;
@@ -3012,6 +3020,35 @@ multiclass Int16Med3Pat<Instruction med3Inst,
def : FPMed3Pat<f32, V_MED3_F32_e64>;
+class
+IntMinMaxPat<Instruction minmaxInst, SDPatternOperator min_or_max,
+ SDPatternOperator max_or_min_oneuse> : AMDGPUPat <
+ (DivergentBinFrag<min_or_max> (max_or_min_oneuse i32:$src0, i32:$src1),
+ i32:$src2),
+ (minmaxInst VSrc_b32:$src0, VSrc_b32:$src1, VSrc_b32:$src2)
+>;
+
+class
+FPMinMaxPat<Instruction minmaxInst, ValueType vt, SDPatternOperator min_or_max,
+ SDPatternOperator max_or_min_oneuse> : GCNPat <
+ (min_or_max (max_or_min_oneuse (VOP3Mods vt:$src0, i32:$src0_mods),
+ (VOP3Mods vt:$src1, i32:$src1_mods)),
+ (vt (VOP3Mods vt:$src2, i32:$src2_mods))),
+ (minmaxInst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2,
+ DSTCLAMP.NONE, DSTOMOD.NONE)
+>;
+
+let OtherPredicates = [isGFX11Plus] in {
+def : IntMinMaxPat<V_MAXMIN_I32_e64, smin, smax_oneuse>;
+def : IntMinMaxPat<V_MINMAX_I32_e64, smax, smin_oneuse>;
+def : IntMinMaxPat<V_MAXMIN_U32_e64, umin, umax_oneuse>;
+def : IntMinMaxPat<V_MINMAX_U32_e64, umax, umin_oneuse>;
+def : FPMinMaxPat<V_MINMAX_F32_e64, f32, fmaxnum_like, fminnum_like_oneuse>;
+def : FPMinMaxPat<V_MAXMIN_F32_e64, f32, fminnum_like, fmaxnum_like_oneuse>;
+def : FPMinMaxPat<V_MINMAX_F16_e64, f16, fmaxnum_like, fminnum_like_oneuse>;
+def : FPMinMaxPat<V_MAXMIN_F16_e64, f16, fminnum_like, fmaxnum_like_oneuse>;
+}
+
let OtherPredicates = [isGFX9Plus] in {
def : FP16Med3Pat<f16, V_MED3_F16_e64>;
defm : Int16Med3Pat<V_MED3_I16_e64, smin, smax, smax_oneuse, smin_oneuse>;
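
The new GFX11-only patterns fold a min whose operand is a single-use max (and the dual cases) into the three-operand V_MAXMIN/V_MINMAX instructions. That is the usual clamp idiom; a rough source-level illustration of the shape being matched follows, assuming divergent i32 values (the integer patterns use DivergentBinFrag) and a single use of the inner operation.

// Clamp idiom targeted by IntMinMaxPat above: smin(smax(x, lo), hi) with a
// single-use smax should select one V_MAXMIN_I32 instead of a max + min pair.
#include <algorithm>
#include <cstdint>

int32_t clampToRange(int32_t X, int32_t Lo, int32_t Hi) {
  return std::min(std::max(X, Lo), Hi);
}
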
diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
index 607383ab8cde..67077a2eaa6b 100644
--- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -148,6 +148,7 @@ public:
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addUsedIfAvailable<LiveIntervals>();
// Should preserve the same set that TwoAddressInstructions does.
AU.addPreserved<MachineDominatorTree>();
AU.addPreserved<SlotIndexes>();
diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
index dd881ec42d53..786b6b61cb23 100644
--- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
@@ -72,7 +72,7 @@ INITIALIZE_PASS_END(SILowerSGPRSpills, DEBUG_TYPE,
char &llvm::SILowerSGPRSpillsID = SILowerSGPRSpills::ID;
-/// Insert restore code for the callee-saved registers used in the function.
+/// Insert spill code for the callee-saved registers used in the function.
static void insertCSRSaves(MachineBasicBlock &SaveBlock,
ArrayRef<CalleeSavedInfo> CSI,
LiveIntervals *LIS) {
diff --git a/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp b/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp
index e426e938b856..ff5587fbb0ca 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp
@@ -1883,7 +1883,13 @@ void SIScheduleDAGMI::schedule()
LLVM_DEBUG(dbgs() << "Preparing Scheduling\n");
buildDAGWithRegPressure();
+ postprocessDAG();
+
LLVM_DEBUG(dump());
+ if (PrintDAGs)
+ dump();
+ if (ViewMISchedDAGs)
+ viewGraph();
topologicalSort();
findRootsAndBiasEdges(TopRoots, BotRoots);
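
SIScheduleDAGMI previously never ran the registered DAG mutations; adding the postprocessDAG() call (plus the PrintDAGs/ViewMISchedDAGs debug hooks) brings it in line with the generic scheduler, which matters now that mutations such as the VOPD pairing one exist. Conceptually, postprocessDAG() applies every registered mutation in order; a toy sketch of that idea, not the actual LLVM implementation:

// Toy model of the mutation mechanism that postprocessDAG() drives.
#include <memory>
#include <vector>

struct ScheduleDAG;                        // stand-in for ScheduleDAGInstrs
struct Mutation {
  virtual ~Mutation() = default;
  virtual void apply(ScheduleDAG *DAG) = 0;
};

struct MiniScheduler {
  std::vector<std::unique_ptr<Mutation>> Mutations;
  ScheduleDAG *DAG = nullptr;

  void postprocessDAG() {
    for (auto &M : Mutations)              // run every mutation, in order
      M->apply(DAG);
  }
};
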
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index 8a66213931ff..6b93769949bc 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -2329,13 +2329,13 @@ bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
continue;
if (const auto &MOI = MOA.getLoadInfo(MI))
- Changed |= expandLoad(MOI.getValue(), MI);
+ Changed |= expandLoad(MOI.value(), MI);
else if (const auto &MOI = MOA.getStoreInfo(MI))
- Changed |= expandStore(MOI.getValue(), MI);
+ Changed |= expandStore(MOI.value(), MI);
else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
- Changed |= expandAtomicFence(MOI.getValue(), MI);
+ Changed |= expandAtomicFence(MOI.value(), MI);
else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
- Changed |= expandAtomicCmpxchgOrRmw(MOI.getValue(), MI);
+ Changed |= expandAtomicCmpxchgOrRmw(MOI.value(), MI);
}
}
diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
index 5215397d5936..66bc46aaefea 100644
--- a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
+++ b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
@@ -9,6 +9,7 @@
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIRegisterInfo.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/InitializePasses.h"
@@ -20,10 +21,40 @@ using namespace llvm;
namespace {
class SIOptimizeExecMasking : public MachineFunctionPass {
+ MachineFunction *MF = nullptr;
+ const GCNSubtarget *ST = nullptr;
+ const SIRegisterInfo *TRI = nullptr;
+ const SIInstrInfo *TII = nullptr;
+ const MachineRegisterInfo *MRI = nullptr;
+
+ Register isCopyFromExec(const MachineInstr &MI) const;
+ Register isCopyToExec(const MachineInstr &MI) const;
+ bool removeTerminatorBit(MachineInstr &MI) const;
+ MachineBasicBlock::reverse_iterator
+ fixTerminators(MachineBasicBlock &MBB) const;
+ MachineBasicBlock::reverse_iterator
+ findExecCopy(MachineBasicBlock &MBB, MachineBasicBlock::reverse_iterator I,
+ unsigned CopyToExec) const;
+
+ bool isRegisterInUseBetween(MachineInstr &Stop, MachineInstr &Start,
+ MCRegister Reg, bool UseLiveOuts = false,
+ bool IgnoreStart = false) const;
+ bool isRegisterInUseAfter(MachineInstr &Stop, MCRegister Reg) const;
+ MachineInstr *findInstrBackwards(MachineInstr &Origin,
+ std::function<bool(MachineInstr *)> Pred,
+ ArrayRef<MCRegister> NonModifiableRegs,
+ unsigned MaxInstructions = 20) const;
+ MachineInstr *findPossibleVCMPVCMPXOptimization(MachineInstr &SaveExec,
+ MCRegister Exec) const;
+ bool optimizeExecSequence() const;
+ bool optimizeVCmpxAndSaveexecSequence() const;
+ bool optimizeSingleVCMPSaveExecSequence(MachineInstr &SaveExecInstr,
+ MachineInstr &VCmp,
+ MCRegister Exec) const;
+
public:
static char ID;
-public:
SIOptimizeExecMasking() : MachineFunctionPass(ID) {
initializeSIOptimizeExecMaskingPass(*PassRegistry::getPassRegistry());
}
@@ -53,7 +84,7 @@ char SIOptimizeExecMasking::ID = 0;
char &llvm::SIOptimizeExecMaskingID = SIOptimizeExecMasking::ID;
/// If \p MI is a copy from exec, return the register copied to.
-static Register isCopyFromExec(const MachineInstr &MI, const GCNSubtarget &ST) {
+Register SIOptimizeExecMasking::isCopyFromExec(const MachineInstr &MI) const {
switch (MI.getOpcode()) {
case AMDGPU::COPY:
case AMDGPU::S_MOV_B64:
@@ -61,8 +92,7 @@ static Register isCopyFromExec(const MachineInstr &MI, const GCNSubtarget &ST) {
case AMDGPU::S_MOV_B32:
case AMDGPU::S_MOV_B32_term: {
const MachineOperand &Src = MI.getOperand(1);
- if (Src.isReg() &&
- Src.getReg() == (ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC))
+ if (Src.isReg() && Src.getReg() == TRI->getExec())
return MI.getOperand(0).getReg();
}
}
@@ -71,14 +101,13 @@ static Register isCopyFromExec(const MachineInstr &MI, const GCNSubtarget &ST) {
}
/// If \p MI is a copy to exec, return the register copied from.
-static Register isCopyToExec(const MachineInstr &MI, const GCNSubtarget &ST) {
+Register SIOptimizeExecMasking::isCopyToExec(const MachineInstr &MI) const {
switch (MI.getOpcode()) {
case AMDGPU::COPY:
case AMDGPU::S_MOV_B64:
case AMDGPU::S_MOV_B32: {
const MachineOperand &Dst = MI.getOperand(0);
- if (Dst.isReg() &&
- Dst.getReg() == (ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC) &&
+ if (Dst.isReg() && Dst.getReg() == TRI->getExec() &&
MI.getOperand(1).isReg())
return MI.getOperand(1).getReg();
break;
@@ -173,64 +202,64 @@ static unsigned getSaveExecOp(unsigned Opc) {
// These are only terminators to get correct spill code placement during
// register allocation, so turn them back into normal instructions.
-static bool removeTerminatorBit(const SIInstrInfo &TII, MachineInstr &MI) {
+bool SIOptimizeExecMasking::removeTerminatorBit(MachineInstr &MI) const {
switch (MI.getOpcode()) {
case AMDGPU::S_MOV_B32_term: {
bool RegSrc = MI.getOperand(1).isReg();
- MI.setDesc(TII.get(RegSrc ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
+ MI.setDesc(TII->get(RegSrc ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
return true;
}
case AMDGPU::S_MOV_B64_term: {
bool RegSrc = MI.getOperand(1).isReg();
- MI.setDesc(TII.get(RegSrc ? AMDGPU::COPY : AMDGPU::S_MOV_B64));
+ MI.setDesc(TII->get(RegSrc ? AMDGPU::COPY : AMDGPU::S_MOV_B64));
return true;
}
case AMDGPU::S_XOR_B64_term: {
// This is only a terminator to get the correct spill code placement during
// register allocation.
- MI.setDesc(TII.get(AMDGPU::S_XOR_B64));
+ MI.setDesc(TII->get(AMDGPU::S_XOR_B64));
return true;
}
case AMDGPU::S_XOR_B32_term: {
// This is only a terminator to get the correct spill code placement during
// register allocation.
- MI.setDesc(TII.get(AMDGPU::S_XOR_B32));
+ MI.setDesc(TII->get(AMDGPU::S_XOR_B32));
return true;
}
case AMDGPU::S_OR_B64_term: {
// This is only a terminator to get the correct spill code placement during
// register allocation.
- MI.setDesc(TII.get(AMDGPU::S_OR_B64));
+ MI.setDesc(TII->get(AMDGPU::S_OR_B64));
return true;
}
case AMDGPU::S_OR_B32_term: {
// This is only a terminator to get the correct spill code placement during
// register allocation.
- MI.setDesc(TII.get(AMDGPU::S_OR_B32));
+ MI.setDesc(TII->get(AMDGPU::S_OR_B32));
return true;
}
case AMDGPU::S_ANDN2_B64_term: {
// This is only a terminator to get the correct spill code placement during
// register allocation.
- MI.setDesc(TII.get(AMDGPU::S_ANDN2_B64));
+ MI.setDesc(TII->get(AMDGPU::S_ANDN2_B64));
return true;
}
case AMDGPU::S_ANDN2_B32_term: {
// This is only a terminator to get the correct spill code placement during
// register allocation.
- MI.setDesc(TII.get(AMDGPU::S_ANDN2_B32));
+ MI.setDesc(TII->get(AMDGPU::S_ANDN2_B32));
return true;
}
case AMDGPU::S_AND_B64_term: {
// This is only a terminator to get the correct spill code placement during
// register allocation.
- MI.setDesc(TII.get(AMDGPU::S_AND_B64));
+ MI.setDesc(TII->get(AMDGPU::S_AND_B64));
return true;
}
case AMDGPU::S_AND_B32_term: {
// This is only a terminator to get the correct spill code placement during
// register allocation.
- MI.setDesc(TII.get(AMDGPU::S_AND_B32));
+ MI.setDesc(TII->get(AMDGPU::S_AND_B32));
return true;
}
default:
@@ -241,9 +270,8 @@ static bool removeTerminatorBit(const SIInstrInfo &TII, MachineInstr &MI) {
// Turn all pseudoterminators in the block into their equivalent non-terminator
// instructions. Returns the reverse iterator to the first non-terminator
// instruction in the block.
-static MachineBasicBlock::reverse_iterator fixTerminators(
- const SIInstrInfo &TII,
- MachineBasicBlock &MBB) {
+MachineBasicBlock::reverse_iterator
+SIOptimizeExecMasking::fixTerminators(MachineBasicBlock &MBB) const {
MachineBasicBlock::reverse_iterator I = MBB.rbegin(), E = MBB.rend();
bool Seen = false;
@@ -252,7 +280,7 @@ static MachineBasicBlock::reverse_iterator fixTerminators(
if (!I->isTerminator())
return Seen ? FirstNonTerm : I;
- if (removeTerminatorBit(TII, *I)) {
+ if (removeTerminatorBit(*I)) {
if (!Seen) {
FirstNonTerm = I;
Seen = true;
@@ -263,17 +291,15 @@ static MachineBasicBlock::reverse_iterator fixTerminators(
return FirstNonTerm;
}
-static MachineBasicBlock::reverse_iterator findExecCopy(
- const SIInstrInfo &TII,
- const GCNSubtarget &ST,
- MachineBasicBlock &MBB,
- MachineBasicBlock::reverse_iterator I,
- unsigned CopyToExec) {
+MachineBasicBlock::reverse_iterator
+SIOptimizeExecMasking::findExecCopy(MachineBasicBlock &MBB,
+ MachineBasicBlock::reverse_iterator I,
+ unsigned CopyToExec) const {
const unsigned InstLimit = 25;
auto E = MBB.rend();
for (unsigned N = 0; N <= InstLimit && I != E; ++I, ++N) {
- Register CopyFromExec = isCopyFromExec(*I, ST);
+ Register CopyFromExec = isCopyFromExec(*I);
if (CopyFromExec.isValid())
return I;
}
@@ -298,11 +324,9 @@ static bool isLiveOut(const MachineBasicBlock &MBB, unsigned Reg) {
// an arbitrary condition based on the current MachineInstr, for instance an
// target instruction. Breaks prematurely by returning nullptr if one of the
// registers given in NonModifiableRegs is modified by the current instruction.
-static MachineInstr *
-findInstrBackwards(MachineInstr &Origin,
- std::function<bool(MachineInstr *)> Pred,
- ArrayRef<MCRegister> NonModifiableRegs,
- const SIRegisterInfo *TRI, unsigned MaxInstructions = 20) {
+MachineInstr *SIOptimizeExecMasking::findInstrBackwards(
+ MachineInstr &Origin, std::function<bool(MachineInstr *)> Pred,
+ ArrayRef<MCRegister> NonModifiableRegs, unsigned MaxInstructions) const {
MachineBasicBlock::reverse_iterator A = Origin.getReverseIterator(),
E = Origin.getParent()->rend();
unsigned CurrentIteration = 0;
@@ -310,7 +334,7 @@ findInstrBackwards(MachineInstr &Origin,
for (++A; CurrentIteration < MaxInstructions && A != E; ++A) {
if (A->isDebugInstr())
continue;
-
+
if (Pred(&*A))
return &*A;
@@ -318,209 +342,64 @@ findInstrBackwards(MachineInstr &Origin,
if (A->modifiesRegister(Reg, TRI))
return nullptr;
}
-
+
++CurrentIteration;
}
return nullptr;
}
-
// Determine if a register Reg is not re-defined and still in use
// in the range (Stop..Start].
// It does so by backwards calculating liveness from the end of the BB until
// either Stop or the beginning of the BB is reached.
// After liveness is calculated, we can determine if Reg is still in use and not
// defined inbetween the instructions.
-static bool isRegisterInUseBetween(MachineInstr &Stop, MachineInstr &Start,
- MCRegister Reg, const SIRegisterInfo *TRI,
- MachineRegisterInfo &MRI,
- bool useLiveOuts = false,
- bool ignoreStart = false) {
+bool SIOptimizeExecMasking::isRegisterInUseBetween(MachineInstr &Stop,
+ MachineInstr &Start,
+ MCRegister Reg,
+ bool UseLiveOuts,
+ bool IgnoreStart) const {
LivePhysRegs LR(*TRI);
- if (useLiveOuts)
+ if (UseLiveOuts)
LR.addLiveOuts(*Stop.getParent());
MachineBasicBlock::reverse_iterator A(Start);
MachineBasicBlock::reverse_iterator E(Stop);
- if (ignoreStart)
+ if (IgnoreStart)
++A;
for (; A != Stop.getParent()->rend() && A != Stop; ++A) {
LR.stepBackward(*A);
}
- return !LR.available(MRI, Reg);
+ return !LR.available(*MRI, Reg);
}
// Determine if a register Reg is not re-defined and still in use
// in the range (Stop..BB.end].
-static bool isRegisterInUseAfter(MachineInstr &Stop, MCRegister Reg,
- const SIRegisterInfo *TRI,
- MachineRegisterInfo &MRI) {
- return isRegisterInUseBetween(Stop, *Stop.getParent()->rbegin(), Reg, TRI,
- MRI, true);
+bool SIOptimizeExecMasking::isRegisterInUseAfter(MachineInstr &Stop,
+ MCRegister Reg) const {
+ return isRegisterInUseBetween(Stop, *Stop.getParent()->rbegin(), Reg, true);
}
-// Tries to find a possibility to optimize a v_cmp ..., s_and_saveexec sequence
-// by looking at an instance of a s_and_saveexec instruction. Returns a pointer
-// to the v_cmp instruction if it is safe to replace the sequence (see the
-// conditions in the function body). This is after register allocation, so some
-// checks on operand dependencies need to be considered.
-static MachineInstr *findPossibleVCMPVCMPXOptimization(
- MachineInstr &SaveExec, MCRegister Exec, const SIRegisterInfo *TRI,
- const SIInstrInfo *TII, MachineRegisterInfo &MRI) {
-
- MachineInstr *VCmp = nullptr;
-
- Register SaveExecDest = SaveExec.getOperand(0).getReg();
- if (!TRI->isSGPRReg(MRI, SaveExecDest))
- return nullptr;
-
- MachineOperand *SaveExecSrc0 =
- TII->getNamedOperand(SaveExec, AMDGPU::OpName::src0);
- if (!SaveExecSrc0->isReg())
- return nullptr;
-
- // Try to find the last v_cmp instruction that defs the saveexec input
- // operand without any write to Exec or the saveexec input operand inbetween.
- VCmp = findInstrBackwards(
- SaveExec,
- [&](MachineInstr *Check) {
- return AMDGPU::getVCMPXOpFromVCMP(Check->getOpcode()) != -1 &&
- Check->modifiesRegister(SaveExecSrc0->getReg(), TRI);
- },
- {Exec, SaveExecSrc0->getReg()}, TRI);
-
- if (!VCmp)
- return nullptr;
-
- MachineOperand *VCmpDest = TII->getNamedOperand(*VCmp, AMDGPU::OpName::sdst);
- assert(VCmpDest && "Should have an sdst operand!");
-
- // Check if any of the v_cmp source operands is written by the saveexec.
- MachineOperand *Src0 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src0);
- if (Src0->isReg() && TRI->isSGPRReg(MRI, Src0->getReg()) &&
- SaveExec.modifiesRegister(Src0->getReg(), TRI))
- return nullptr;
-
- MachineOperand *Src1 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src1);
- if (Src1->isReg() && TRI->isSGPRReg(MRI, Src1->getReg()) &&
- SaveExec.modifiesRegister(Src1->getReg(), TRI))
- return nullptr;
-
- // Don't do the transformation if the destination operand is included in
- // it's MBB Live-outs, meaning it's used in any of it's successors, leading
- // to incorrect code if the v_cmp and therefore the def of
- // the dest operand is removed.
- if (isLiveOut(*VCmp->getParent(), VCmpDest->getReg()))
- return nullptr;
-
- // If the v_cmp target is in use between v_cmp and s_and_saveexec or after the
- // s_and_saveexec, skip the optimization.
- if (isRegisterInUseBetween(*VCmp, SaveExec, VCmpDest->getReg(), TRI, MRI,
- false, true) ||
- isRegisterInUseAfter(SaveExec, VCmpDest->getReg(), TRI, MRI))
- return nullptr;
-
- // Try to determine if there is a write to any of the VCmp
- // operands between the saveexec and the vcmp.
- // If yes, additional VGPR spilling might need to be inserted. In this case,
- // it's not worth replacing the instruction sequence.
- SmallVector<MCRegister, 2> NonDefRegs;
- if (Src0->isReg())
- NonDefRegs.push_back(Src0->getReg());
-
- if (Src1->isReg())
- NonDefRegs.push_back(Src1->getReg());
-
- if (!findInstrBackwards(
- SaveExec, [&](MachineInstr *Check) { return Check == VCmp; },
- NonDefRegs, TRI))
- return nullptr;
-
- return VCmp;
-}
-
-// Inserts the optimized s_mov_b32 / v_cmpx sequence based on the
-// operands extracted from a v_cmp ..., s_and_saveexec pattern.
-static bool optimizeVCMPSaveExecSequence(MachineInstr &SaveExecInstr,
- MachineInstr &VCmp, MCRegister Exec,
- const SIInstrInfo *TII,
- const SIRegisterInfo *TRI,
- MachineRegisterInfo &MRI) {
- const int NewOpcode = AMDGPU::getVCMPXOpFromVCMP(VCmp.getOpcode());
-
- if (NewOpcode == -1)
- return false;
-
- MachineOperand *Src0 = TII->getNamedOperand(VCmp, AMDGPU::OpName::src0);
- MachineOperand *Src1 = TII->getNamedOperand(VCmp, AMDGPU::OpName::src1);
-
- Register MoveDest = SaveExecInstr.getOperand(0).getReg();
-
- MachineBasicBlock::instr_iterator InsertPosIt = SaveExecInstr.getIterator();
- if (!SaveExecInstr.uses().empty()) {
- bool isSGPR32 = TRI->getRegSizeInBits(MoveDest, MRI) == 32;
- unsigned MovOpcode = isSGPR32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
- BuildMI(*SaveExecInstr.getParent(), InsertPosIt,
- SaveExecInstr.getDebugLoc(), TII->get(MovOpcode), MoveDest)
- .addReg(Exec);
- }
-
- // Omit dst as V_CMPX is implicitly writing to EXEC.
- // Add dummy src and clamp modifiers, if needed.
- auto Builder = BuildMI(*VCmp.getParent(), std::next(InsertPosIt),
- VCmp.getDebugLoc(), TII->get(NewOpcode));
-
- auto TryAddImmediateValueFromNamedOperand =
- [&](unsigned OperandName) -> void {
- if (auto *Mod = TII->getNamedOperand(VCmp, OperandName))
- Builder.addImm(Mod->getImm());
- };
-
- TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::src0_modifiers);
- Builder.add(*Src0);
-
- TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::src1_modifiers);
- Builder.add(*Src1);
-
- TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::clamp);
-
- // The kill flags may no longer be correct.
- if (Src0->isReg())
- MRI.clearKillFlags(Src0->getReg());
- if (Src1->isReg())
- MRI.clearKillFlags(Src1->getReg());
-
- return true;
-}
-
-bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
- if (skipFunction(MF.getFunction()))
- return false;
-
- const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
- const SIRegisterInfo *TRI = ST.getRegisterInfo();
- const SIInstrInfo *TII = ST.getInstrInfo();
- MachineRegisterInfo *MRI = &MF.getRegInfo();
- MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
-
- // Optimize sequences emitted for control flow lowering. They are originally
- // emitted as the separate operations because spill code may need to be
- // inserted for the saved copy of exec.
- //
- // x = copy exec
- // z = s_<op>_b64 x, y
- // exec = copy z
- // =>
- // x = s_<op>_saveexec_b64 y
- //
+// Optimize sequences emitted for control flow lowering. They are originally
+// emitted as separate operations because spill code may need to be
+// inserted for the saved copy of exec.
+//
+// x = copy exec
+// z = s_<op>_b64 x, y
+// exec = copy z
+// =>
+// x = s_<op>_saveexec_b64 y
+//
+bool SIOptimizeExecMasking::optimizeExecSequence() const {
+ MCRegister Exec = TRI->getExec();
bool Changed = false;
- for (MachineBasicBlock &MBB : MF) {
- MachineBasicBlock::reverse_iterator I = fixTerminators(*TII, MBB);
+ for (MachineBasicBlock &MBB : *MF) {
+ MachineBasicBlock::reverse_iterator I = fixTerminators(MBB);
MachineBasicBlock::reverse_iterator E = MBB.rend();
if (I == E)
continue;
@@ -532,7 +411,7 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
unsigned SearchCount = 0;
const unsigned SearchLimit = 5;
while (I != E && SearchCount++ < SearchLimit) {
- CopyToExec = isCopyToExec(*I, ST);
+ CopyToExec = isCopyToExec(*I);
if (CopyToExec)
break;
++I;
@@ -542,8 +421,8 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
continue;
// Scan backwards to find the def.
- auto CopyToExecInst = &*I;
- auto CopyFromExecInst = findExecCopy(*TII, ST, MBB, I, CopyToExec);
+ auto *CopyToExecInst = &*I;
+ auto CopyFromExecInst = findExecCopy(MBB, I, CopyToExec);
if (CopyFromExecInst == E) {
auto PrepareExecInst = std::next(I);
if (PrepareExecInst == E)
@@ -574,8 +453,9 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
MachineInstr *SaveExecInst = nullptr;
SmallVector<MachineInstr *, 4> OtherUseInsts;
- for (MachineBasicBlock::iterator J
- = std::next(CopyFromExecInst->getIterator()), JE = I->getIterator();
+ for (MachineBasicBlock::iterator
+ J = std::next(CopyFromExecInst->getIterator()),
+ JE = I->getIterator();
J != JE; ++J) {
if (SaveExecInst && J->readsRegister(Exec, TRI)) {
LLVM_DEBUG(dbgs() << "exec read prevents saveexec: " << *J << '\n');
@@ -655,58 +535,210 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
BuildMI(MBB, InsPt, DL, TII->get(getSaveExecOp(SaveExecInst->getOpcode())),
CopyFromExec)
- .addReg(OtherOp->getReg());
+ .addReg(OtherOp->getReg());
SaveExecInst->eraseFromParent();
CopyToExecInst->eraseFromParent();
for (MachineInstr *OtherInst : OtherUseInsts) {
- OtherInst->substituteRegister(CopyToExec, Exec,
- AMDGPU::NoSubRegister, *TRI);
+ OtherInst->substituteRegister(CopyToExec, Exec, AMDGPU::NoSubRegister,
+ *TRI);
}
Changed = true;
}
- // After all s_op_saveexec instructions are inserted,
- // replace (on GFX10.3 and later)
- // v_cmp_* SGPR, IMM, VGPR
- // s_and_saveexec_b32 EXEC_SGPR_DEST, SGPR
- // with
- // s_mov_b32 EXEC_SGPR_DEST, exec_lo
- // v_cmpx_* IMM, VGPR
- // to reduce pipeline stalls.
- if (ST.hasGFX10_3Insts()) {
- DenseMap<MachineInstr *, MachineInstr *> SaveExecVCmpMapping;
- const unsigned AndSaveExecOpcode =
- ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
-
- for (MachineBasicBlock &MBB : MF) {
- for (MachineInstr &MI : MBB) {
- // Record relevant v_cmp / s_and_saveexec instruction pairs for
- // replacement.
- if (MI.getOpcode() != AndSaveExecOpcode)
- continue;
+ return Changed;
+}
- if (MachineInstr *VCmp =
- findPossibleVCMPVCMPXOptimization(MI, Exec, TRI, TII, *MRI))
- SaveExecVCmpMapping[&MI] = VCmp;
- }
+// Tries to find an opportunity to optimize a v_cmp ..., s_and_saveexec
+// sequence by looking at an instance of an s_and_saveexec instruction.
+// Returns a pointer to the v_cmp instruction if it is safe to replace the
+// sequence (see the conditions in the function body). This runs after
+// register allocation, so some checks on operand dependencies are needed.
+MachineInstr *SIOptimizeExecMasking::findPossibleVCMPVCMPXOptimization(
+ MachineInstr &SaveExec, MCRegister Exec) const {
+
+ MachineInstr *VCmp = nullptr;
+
+ Register SaveExecDest = SaveExec.getOperand(0).getReg();
+ if (!TRI->isSGPRReg(*MRI, SaveExecDest))
+ return nullptr;
+
+ MachineOperand *SaveExecSrc0 =
+ TII->getNamedOperand(SaveExec, AMDGPU::OpName::src0);
+ if (!SaveExecSrc0->isReg())
+ return nullptr;
+
+ // Try to find the last v_cmp instruction that defs the saveexec input
+  // operand without any write to Exec or the saveexec input operand in between.
+ VCmp = findInstrBackwards(
+ SaveExec,
+ [&](MachineInstr *Check) {
+ return AMDGPU::getVCMPXOpFromVCMP(Check->getOpcode()) != -1 &&
+ Check->modifiesRegister(SaveExecSrc0->getReg(), TRI);
+ },
+ {Exec, SaveExecSrc0->getReg()});
+
+ if (!VCmp)
+ return nullptr;
+
+ MachineOperand *VCmpDest = TII->getNamedOperand(*VCmp, AMDGPU::OpName::sdst);
+ assert(VCmpDest && "Should have an sdst operand!");
+
+ // Check if any of the v_cmp source operands is written by the saveexec.
+ MachineOperand *Src0 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src0);
+ if (Src0->isReg() && TRI->isSGPRReg(*MRI, Src0->getReg()) &&
+ SaveExec.modifiesRegister(Src0->getReg(), TRI))
+ return nullptr;
+
+ MachineOperand *Src1 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src1);
+ if (Src1->isReg() && TRI->isSGPRReg(*MRI, Src1->getReg()) &&
+ SaveExec.modifiesRegister(Src1->getReg(), TRI))
+ return nullptr;
+
+ // Don't do the transformation if the destination operand is included in
+  // its MBB Live-outs, meaning it's used in any of its successors, leading
+ // to incorrect code if the v_cmp and therefore the def of
+ // the dest operand is removed.
+ if (isLiveOut(*VCmp->getParent(), VCmpDest->getReg()))
+ return nullptr;
+
+ // If the v_cmp target is in use between v_cmp and s_and_saveexec or after the
+ // s_and_saveexec, skip the optimization.
+ if (isRegisterInUseBetween(*VCmp, SaveExec, VCmpDest->getReg(), false,
+ true) ||
+ isRegisterInUseAfter(SaveExec, VCmpDest->getReg()))
+ return nullptr;
+
+ // Try to determine if there is a write to any of the VCmp
+ // operands between the saveexec and the vcmp.
+ // If yes, additional VGPR spilling might need to be inserted. In this case,
+ // it's not worth replacing the instruction sequence.
+ SmallVector<MCRegister, 2> NonDefRegs;
+ if (Src0->isReg())
+ NonDefRegs.push_back(Src0->getReg());
+
+ if (Src1->isReg())
+ NonDefRegs.push_back(Src1->getReg());
+
+ if (!findInstrBackwards(
+ SaveExec, [&](MachineInstr *Check) { return Check == VCmp; },
+ NonDefRegs))
+ return nullptr;
+
+ return VCmp;
+}
+
+// Inserts the optimized s_mov_b32 / v_cmpx sequence based on the
+// operands extracted from a v_cmp ..., s_and_saveexec pattern.
+bool SIOptimizeExecMasking::optimizeSingleVCMPSaveExecSequence(
+ MachineInstr &SaveExecInstr, MachineInstr &VCmp, MCRegister Exec) const {
+ const int NewOpcode = AMDGPU::getVCMPXOpFromVCMP(VCmp.getOpcode());
+
+ if (NewOpcode == -1)
+ return false;
+
+ MachineOperand *Src0 = TII->getNamedOperand(VCmp, AMDGPU::OpName::src0);
+ MachineOperand *Src1 = TII->getNamedOperand(VCmp, AMDGPU::OpName::src1);
+
+ Register MoveDest = SaveExecInstr.getOperand(0).getReg();
+
+ MachineBasicBlock::instr_iterator InsertPosIt = SaveExecInstr.getIterator();
+ if (!SaveExecInstr.uses().empty()) {
+ bool IsSGPR32 = TRI->getRegSizeInBits(MoveDest, *MRI) == 32;
+ unsigned MovOpcode = IsSGPR32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+ BuildMI(*SaveExecInstr.getParent(), InsertPosIt,
+ SaveExecInstr.getDebugLoc(), TII->get(MovOpcode), MoveDest)
+ .addReg(Exec);
+ }
+
+ // Omit dst as V_CMPX is implicitly writing to EXEC.
+ // Add dummy src and clamp modifiers, if needed.
+ auto Builder = BuildMI(*VCmp.getParent(), std::next(InsertPosIt),
+ VCmp.getDebugLoc(), TII->get(NewOpcode));
+
+ auto TryAddImmediateValueFromNamedOperand =
+ [&](unsigned OperandName) -> void {
+ if (auto *Mod = TII->getNamedOperand(VCmp, OperandName))
+ Builder.addImm(Mod->getImm());
+ };
+
+ TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::src0_modifiers);
+ Builder.add(*Src0);
+
+ TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::src1_modifiers);
+ Builder.add(*Src1);
+
+ TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::clamp);
+
+ // The kill flags may no longer be correct.
+ if (Src0->isReg())
+ MRI->clearKillFlags(Src0->getReg());
+ if (Src1->isReg())
+ MRI->clearKillFlags(Src1->getReg());
+
+ return true;
+}
+
+// After all s_op_saveexec instructions are inserted,
+// replace (on GFX10.3 and later)
+// v_cmp_* SGPR, IMM, VGPR
+// s_and_saveexec_b32 EXEC_SGPR_DEST, SGPR
+// with
+// s_mov_b32 EXEC_SGPR_DEST, exec_lo
+// v_cmpx_* IMM, VGPR
+// to reduce pipeline stalls.
+bool SIOptimizeExecMasking::optimizeVCmpxAndSaveexecSequence() const {
+ if (!ST->hasGFX10_3Insts())
+ return false;
+
+ bool Changed = false;
+
+ DenseMap<MachineInstr *, MachineInstr *> SaveExecVCmpMapping;
+ MCRegister Exec = TRI->getExec();
+ const unsigned AndSaveExecOpcode =
+ ST->isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
+
+ for (MachineBasicBlock &MBB : *MF) {
+ for (MachineInstr &MI : MBB) {
+ // Record relevant v_cmp / s_and_saveexec instruction pairs for
+ // replacement.
+ if (MI.getOpcode() != AndSaveExecOpcode)
+ continue;
+
+ if (MachineInstr *VCmp = findPossibleVCMPVCMPXOptimization(MI, Exec))
+ SaveExecVCmpMapping[&MI] = VCmp;
}
+ }
- for (const auto &Entry : SaveExecVCmpMapping) {
- MachineInstr *SaveExecInstr = Entry.getFirst();
- MachineInstr *VCmpInstr = Entry.getSecond();
+ for (const auto &Entry : SaveExecVCmpMapping) {
+ MachineInstr *SaveExecInstr = Entry.getFirst();
+ MachineInstr *VCmpInstr = Entry.getSecond();
- if (optimizeVCMPSaveExecSequence(*SaveExecInstr, *VCmpInstr, Exec, TII,
- TRI, *MRI)) {
- SaveExecInstr->eraseFromParent();
- VCmpInstr->eraseFromParent();
+ if (optimizeSingleVCMPSaveExecSequence(*SaveExecInstr, *VCmpInstr, Exec)) {
+ SaveExecInstr->eraseFromParent();
+ VCmpInstr->eraseFromParent();
- Changed = true;
- }
+ Changed = true;
}
}
return Changed;
}
+
+bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(MF.getFunction()))
+ return false;
+
+ this->MF = &MF;
+ ST = &MF.getSubtarget<GCNSubtarget>();
+ TRI = ST->getRegisterInfo();
+ TII = ST->getInstrInfo();
+ MRI = &MF.getRegInfo();
+
+ bool Changed = optimizeExecSequence();
+ Changed |= optimizeVCmpxAndSaveexecSequence();
+
+ return Changed;
+}
diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
index e5e65a8dbbf1..57dbad468de8 100644
--- a/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
@@ -159,6 +159,9 @@ bool SIOptimizeExecMaskingPreRA::optimizeVcndVcmpPair(MachineBasicBlock &MBB) {
return false;
Register SelReg = Op1->getReg();
+ if (SelReg.isPhysical())
+ return false;
+
auto *Sel = TRI->findReachingDef(SelReg, Op1->getSubReg(), *Cmp, *MRI, LIS);
if (!Sel || Sel->getOpcode() != AMDGPU::V_CNDMASK_B32_e64)
return false;
@@ -264,13 +267,11 @@ bool SIOptimizeExecMaskingPreRA::optimizeVcndVcmpPair(MachineBasicBlock &MBB) {
// Try to remove v_cndmask_b32.
if (SelLI) {
- bool CanRemoveSel = SelLI->Query(CmpIdx.getRegSlot()).isKill();
- if (!CanRemoveSel) {
- // Try to shrink the live interval and check for dead def instead.
- LIS->shrinkToUses(SelLI, nullptr);
- CanRemoveSel = SelLI->Query(SelIdx.getRegSlot()).isDeadDef();
- }
- if (CanRemoveSel) {
+ // Kill status must be checked before shrinking the live range.
+ bool IsKill = SelLI->Query(CmpIdx.getRegSlot()).isKill();
+ LIS->shrinkToUses(SelLI);
+ bool IsDead = SelLI->Query(SelIdx.getRegSlot()).isDeadDef();
+ if (MRI->use_nodbg_empty(SelReg) && (IsKill || IsDead)) {
LLVM_DEBUG(dbgs() << "Erasing: " << *Sel << '\n');
LIS->removeVRegDefAt(*SelLI, SelIdx.getRegSlot());
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index ad1455ed20fd..b32d5bb04d5b 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -2933,6 +2933,10 @@ MCRegister SIRegisterInfo::getVCC() const {
return isWave32 ? AMDGPU::VCC_LO : AMDGPU::VCC;
}
+MCRegister SIRegisterInfo::getExec() const {
+ return isWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+}
+
const TargetRegisterClass *SIRegisterInfo::getVGPR64Class() const {
// VGPR tuples have an alignment requirement on gfx90a variants.
return ST.needsAlignedVGPRs() ? &AMDGPU::VReg_64_Align2RegClass
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
index 9bfbc253410b..6024158be181 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -344,6 +344,8 @@ public:
MCRegister getVCC() const;
+ MCRegister getExec() const;
+
const TargetRegisterClass *getRegClass(unsigned RCID) const;
// Find reaching register definition
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index e4ab72f1095b..2f334e211181 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -277,6 +277,18 @@ struct VOPC64DPPInfo {
uint16_t Opcode;
};
+struct VOPDComponentInfo {
+ uint16_t BaseVOP;
+ uint16_t VOPDOp;
+ bool CanBeVOPDX;
+};
+
+struct VOPDInfo {
+ uint16_t Opcode;
+ uint16_t OpX;
+ uint16_t OpY;
+};
+
#define GET_MTBUFInfoTable_DECL
#define GET_MTBUFInfoTable_IMPL
#define GET_MUBUFInfoTable_DECL
@@ -293,6 +305,10 @@ struct VOPC64DPPInfo {
#define GET_VOPC64DPPTable_IMPL
#define GET_VOPC64DPP8Table_DECL
#define GET_VOPC64DPP8Table_IMPL
+#define GET_VOPDComponentTable_DECL
+#define GET_VOPDComponentTable_IMPL
+#define GET_VOPDPairs_DECL
+#define GET_VOPDPairs_IMPL
#define GET_WMMAOpcode2AddrMappingTable_DECL
#define GET_WMMAOpcode2AddrMappingTable_IMPL
#define GET_WMMAOpcode3AddrMappingTable_DECL
@@ -398,6 +414,19 @@ bool getMAIIsGFX940XDL(unsigned Opc) {
return Info ? Info->is_gfx940_xdl : false;
}
+CanBeVOPD getCanBeVOPD(unsigned Opc) {
+ const VOPDComponentInfo *Info = getVOPDComponentHelper(Opc);
+ if (Info)
+ return {Info->CanBeVOPDX, 1};
+ else
+ return {0, 0};
+}
+
+unsigned getVOPDOpcode(unsigned Opc) {
+ const VOPDComponentInfo *Info = getVOPDComponentHelper(Opc);
+ return Info ? Info->VOPDOp : ~0u;
+}
+
unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc) {
const WMMAOpcodeMappingInfo *Info = getWMMAMappingInfoFrom2AddrOpcode(Opc);
return Info ? Info->Opcode3Addr : ~0u;
@@ -415,6 +444,11 @@ int getMCOpcode(uint16_t Opcode, unsigned Gen) {
return getMCOpcodeGen(Opcode, static_cast<Subtarget>(Gen));
}
+int getVOPDFull(unsigned OpX, unsigned OpY) {
+ const VOPDInfo *Info = getVOPDInfoFromComponentOpcodes(OpX, OpY);
+ return Info ? Info->Opcode : -1;
+}
+
namespace IsaInfo {
AMDGPUTargetID::AMDGPUTargetID(const MCSubtargetInfo &STI)
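
The three new queries are meant to compose: getCanBeVOPD() says whether an opcode may serve as the X and/or Y half of a dual-issue pair, getVOPDOpcode() maps it to its VOPD component encoding, and getVOPDFull() looks up the combined opcode, returning -1 when no pairing is defined. A rough sketch of how a client that forms VOPD pairs might chain them (illustrative only; real pairing code must also check operand and register-bank legality, which these tables do not encode):

  #include "Utils/AMDGPUBaseInfo.h"

  using namespace llvm;

  // Return the fused VOPD opcode for two candidate VALU opcodes, or -1 if the
  // generated tables define no X/Y pairing for them.
  static int getFusedVOPDOpcode(unsigned OpcA, unsigned OpcB) {
    AMDGPU::CanBeVOPD A = AMDGPU::getCanBeVOPD(OpcA);
    AMDGPU::CanBeVOPD B = AMDGPU::getCanBeVOPD(OpcB);
    // One candidate must be usable as OpX and the other as OpY.
    if (!(A.X && B.Y) && !(B.X && A.Y))
      return -1;
    unsigned OpX = (A.X && B.Y) ? OpcA : OpcB;
    unsigned OpY = (OpX == OpcA) ? OpcB : OpcA;
    return AMDGPU::getVOPDFull(AMDGPU::getVOPDOpcode(OpX),
                               AMDGPU::getVOPDOpcode(OpY));
  }
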
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index dffeec10a14a..51cf1678207c 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -470,6 +470,14 @@ bool getMAIIsDGEMM(unsigned Opc);
LLVM_READONLY
bool getMAIIsGFX940XDL(unsigned Opc);
+struct CanBeVOPD {
+ bool X;
+ bool Y;
+};
+
+LLVM_READONLY
+CanBeVOPD getCanBeVOPD(unsigned Opc);
+
LLVM_READONLY
const GcnBufferFormatInfo *getGcnBufferFormatInfo(uint8_t BitsPerComp,
uint8_t NumComponents,
@@ -483,6 +491,12 @@ LLVM_READONLY
int getMCOpcode(uint16_t Opcode, unsigned Gen);
LLVM_READONLY
+unsigned getVOPDOpcode(unsigned Opc);
+
+LLVM_READONLY
+int getVOPDFull(unsigned OpX, unsigned OpY);
+
+LLVM_READONLY
unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc);
LLVM_READONLY
diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
index 1485a1e63129..b24857edb59a 100644
--- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -495,9 +495,9 @@ def VOP2b_I32_I1_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1], 0, /*EnableClamp=*
bank_mask:$bank_mask, bound_ctrl:$bound_ctrl);
let InsDPP16 = !con(InsDPP, (ins FI:$fi));
let InsDPP8 = (ins DstRCDPP:$old,
- Src0DPP:$src0,
- Src1DPP:$src1,
- dpp8:$dpp8, FI:$fi);
+ Src0DPP:$src0,
+ Src1DPP:$src1,
+ dpp8:$dpp8, FI:$fi);
let HasExt = 1;
let HasExtDPP = 1;
diff --git a/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/llvm/lib/Target/AMDGPU/VOPCInstructions.td
index eb6c54a45263..33d3441e94c2 100644
--- a/llvm/lib/Target/AMDGPU/VOPCInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPCInstructions.td
@@ -1108,7 +1108,6 @@ class VOPC64_DPP_Base<bits<10> op, string OpName, VOPProfile P>
// Inst{87-84} ignored by hw
let Inst{91-88} = bank_mask;
let Inst{95-92} = row_mask;
-
}
class VOPC64_DPP16<bits<10> op, VOP_DPP_Pseudo ps, string opName = ps.OpName>
@@ -1148,7 +1147,6 @@ class VOPC64_DPP8_Base<bits<10> op, string OpName, VOPProfile P>
let Inst{40-32} = fi;
let Inst{71-64} = !if(P.HasSrc0, src0{7-0}, 0);
let Inst{95-72} = dpp8{23-0};
-
}
class VOPC64_DPP8<bits<10> op, VOP_Pseudo ps, string opName = ps.OpName>
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index 8cd3d2fe2c47..187485ffa3ae 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -1215,7 +1215,9 @@ class VOP3_Profile_Base<VOPProfile P, VOP3Features Features = VOP3_REGULAR> : VO
let IsMAI = !if(Features.IsMAI, 1, P.IsMAI);
let IsPacked = !if(Features.IsPacked, 1, P.IsPacked);
- let HasModifiers = !if(Features.IsMAI, 0, !or(Features.IsPacked, P.HasModifiers));
+ let HasModifiers =
+ !if (Features.IsMAI, 0,
+ !or(Features.IsPacked, Features.HasOpSel, P.HasModifiers));
}
class VOP3_Profile<VOPProfile P, VOP3Features Features = VOP3_REGULAR> : VOP3_Profile_Base<P, Features> {
@@ -1414,7 +1416,7 @@ multiclass VOP3_Realtriple_with_name_gfx11<bits<10> op, string opName,
VOP3_Real_dpp8_with_name_gfx11<op, opName, asmName>;
multiclass VOP3Only_Realtriple_with_name_gfx11<bits<10> op, string opName,
- string asmName> :
+ string asmName> :
VOP3_Realtriple_with_name_gfx11<op, opName, asmName, 1>;
multiclass VOP3be_Realtriple_gfx11<