aboutsummaryrefslogtreecommitdiff
path: root/lib/Target/AMDGPU
diff options
context:
space:
mode:
Diffstat (limited to 'lib/Target/AMDGPU')
-rw-r--r--lib/Target/AMDGPU/AMDGPUISelLowering.cpp236
-rw-r--r--lib/Target/AMDGPU/AMDGPUISelLowering.h1
-rw-r--r--lib/Target/AMDGPU/AMDGPUInstructions.td3
-rw-r--r--lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp2
-rw-r--r--lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h3
-rw-r--r--lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp116
-rw-r--r--lib/Target/AMDGPU/EvergreenInstructions.td4
-rw-r--r--lib/Target/AMDGPU/SIFoldOperands.cpp399
-rw-r--r--lib/Target/AMDGPU/SIInstrInfo.td42
-rw-r--r--lib/Target/AMDGPU/SIInstructions.td5
-rw-r--r--lib/Target/AMDGPU/SIShrinkInstructions.cpp36
-rw-r--r--lib/Target/AMDGPU/VOP1Instructions.td2
-rw-r--r--lib/Target/AMDGPU/VOP2Instructions.td8
-rw-r--r--lib/Target/AMDGPU/VOPCInstructions.td4
14 files changed, 672 insertions, 189 deletions
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 730bcdcf7afa..e48c1943cb01 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -434,6 +434,13 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setSchedulingPreference(Sched::RegPressure);
setJumpIsExpensive(true);
+
+ // FIXME: This is only partially true. If we have to do vector compares, any
+ // SGPR pair can be a condition register. If we have a uniform condition, we
+ // are better off doing SALU operations, where there is only one SCC. For now,
+ // we don't have a way of knowing during instruction selection if a condition
+ // will be uniform and we always use vector compares. Assume we are using
+ // vector compares until that is fixed.
setHasMultipleConditionRegisters(true);
// SI at least has hardware support for floating point exceptions, but no way
@@ -470,12 +477,31 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::STORE);
setTargetDAGCombine(ISD::FADD);
setTargetDAGCombine(ISD::FSUB);
+ setTargetDAGCombine(ISD::FNEG);
}
//===----------------------------------------------------------------------===//
// Target Information
//===----------------------------------------------------------------------===//
+static bool fnegFoldsIntoOp(unsigned Opc) {
+ switch (Opc) {
+ case ISD::FADD:
+ case ISD::FSUB:
+ case ISD::FMUL:
+ case ISD::FMA:
+ case ISD::FMAD:
+ case ISD::FSIN:
+ case AMDGPUISD::RCP:
+ case AMDGPUISD::RCP_LEGACY:
+ case AMDGPUISD::SIN_HW:
+ case AMDGPUISD::FMUL_LEGACY:
+ return true;
+ default:
+ return false;
+ }
+}
+
MVT AMDGPUTargetLowering::getVectorIdxTy(const DataLayout &) const {
return MVT::i32;
}
@@ -2679,8 +2705,93 @@ SDValue AMDGPUTargetLowering::performCtlzCombine(const SDLoc &SL, SDValue Cond,
return SDValue();
}
+static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI,
+ unsigned Op,
+ const SDLoc &SL,
+ SDValue Cond,
+ SDValue N1,
+ SDValue N2) {
+ SelectionDAG &DAG = DCI.DAG;
+ EVT VT = N1.getValueType();
+
+ SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT, Cond,
+ N1.getOperand(0), N2.getOperand(0));
+ DCI.AddToWorklist(NewSelect.getNode());
+ return DAG.getNode(Op, SL, VT, NewSelect);
+}
+
+// Pull a free FP operation out of a select so it may fold into uses.
+//
+// select c, (fneg x), (fneg y) -> fneg (select c, x, y)
+// select c, (fneg x), k -> fneg (select c, x, (fneg k))
+//
+// select c, (fabs x), (fabs y) -> fabs (select c, x, y)
+// select c, (fabs x), +k -> fabs (select c, x, k)
+static SDValue foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
+ SDValue N) {
+ SelectionDAG &DAG = DCI.DAG;
+ SDValue Cond = N.getOperand(0);
+ SDValue LHS = N.getOperand(1);
+ SDValue RHS = N.getOperand(2);
+
+ EVT VT = N.getValueType();
+ if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) ||
+ (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) {
+ return distributeOpThroughSelect(DCI, LHS.getOpcode(),
+ SDLoc(N), Cond, LHS, RHS);
+ }
+
+ bool Inv = false;
+ if (RHS.getOpcode() == ISD::FABS || RHS.getOpcode() == ISD::FNEG) {
+ std::swap(LHS, RHS);
+ Inv = true;
+ }
+
+ // TODO: Support vector constants.
+ ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
+ if ((LHS.getOpcode() == ISD::FNEG || LHS.getOpcode() == ISD::FABS) && CRHS) {
+ SDLoc SL(N);
+ // If one side is an fneg/fabs and the other is a constant, we can push the
+ // fneg/fabs down. If it's an fabs, the constant needs to be non-negative.
+ SDValue NewLHS = LHS.getOperand(0);
+ SDValue NewRHS = RHS;
+
+ // Careful: if the neg can be folded up, don't try to pull it back down.
+ bool ShouldFoldNeg = true;
+
+ if (NewLHS.hasOneUse()) {
+ unsigned Opc = NewLHS.getOpcode();
+ if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(Opc))
+ ShouldFoldNeg = false;
+ if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL)
+ ShouldFoldNeg = false;
+ }
+
+ if (ShouldFoldNeg) {
+ if (LHS.getOpcode() == ISD::FNEG)
+ NewRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
+ else if (CRHS->isNegative())
+ return SDValue();
+
+ if (Inv)
+ std::swap(NewLHS, NewRHS);
+
+ SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT,
+ Cond, NewLHS, NewRHS);
+ DCI.AddToWorklist(NewSelect.getNode());
+ return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect);
+ }
+ }
+
+ return SDValue();
+}
+
+
SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
+ if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0)))
+ return Folded;
+
SDValue Cond = N->getOperand(0);
if (Cond.getOpcode() != ISD::SETCC)
return SDValue();
@@ -2724,6 +2835,129 @@ SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
return performCtlzCombine(SDLoc(N), Cond, True, False, DCI);
}
+SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+
+ unsigned Opc = N0.getOpcode();
+
+ // If the input has multiple uses and we can either fold the negate down, or
+ // the other uses cannot, give up. This both prevents unprofitable
+ // transformations and infinite loops: we won't repeatedly try to fold around
+ // a negate that has no 'good' form.
+ //
+ // TODO: Check users can fold
+ if (fnegFoldsIntoOp(Opc) && !N0.hasOneUse())
+ return SDValue();
+
+ SDLoc SL(N);
+ switch (Opc) {
+ case ISD::FADD: {
+ // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y))
+ SDValue LHS = N0.getOperand(0);
+ SDValue RHS = N0.getOperand(1);
+
+ if (LHS.getOpcode() != ISD::FNEG)
+ LHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
+ else
+ LHS = LHS.getOperand(0);
+
+ if (RHS.getOpcode() != ISD::FNEG)
+ RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
+ else
+ RHS = RHS.getOperand(0);
+
+ SDValue Res = DAG.getNode(ISD::FADD, SL, VT, LHS, RHS);
+ if (!N0.hasOneUse())
+ DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
+ return Res;
+ }
+ case ISD::FMUL:
+ case AMDGPUISD::FMUL_LEGACY: {
+ // (fneg (fmul x, y)) -> (fmul x, (fneg y))
+ // (fneg (fmul_legacy x, y)) -> (fmul_legacy x, (fneg y))
+ SDValue LHS = N0.getOperand(0);
+ SDValue RHS = N0.getOperand(1);
+
+ if (LHS.getOpcode() == ISD::FNEG)
+ LHS = LHS.getOperand(0);
+ else if (RHS.getOpcode() == ISD::FNEG)
+ RHS = RHS.getOperand(0);
+ else
+ RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
+
+ SDValue Res = DAG.getNode(Opc, SL, VT, LHS, RHS);
+ if (!N0.hasOneUse())
+ DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
+ return Res;
+ }
+ case ISD::FMA:
+ case ISD::FMAD: {
+ // (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z))
+ SDValue LHS = N0.getOperand(0);
+ SDValue MHS = N0.getOperand(1);
+ SDValue RHS = N0.getOperand(2);
+
+ if (LHS.getOpcode() == ISD::FNEG)
+ LHS = LHS.getOperand(0);
+ else if (MHS.getOpcode() == ISD::FNEG)
+ MHS = MHS.getOperand(0);
+ else
+ MHS = DAG.getNode(ISD::FNEG, SL, VT, MHS);
+
+ if (RHS.getOpcode() != ISD::FNEG)
+ RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
+ else
+ RHS = RHS.getOperand(0);
+
+ SDValue Res = DAG.getNode(Opc, SL, VT, LHS, MHS, RHS);
+ if (!N0.hasOneUse())
+ DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
+ return Res;
+ }
+ case ISD::FP_EXTEND:
+ case AMDGPUISD::RCP:
+ case AMDGPUISD::RCP_LEGACY:
+ case ISD::FSIN:
+ case AMDGPUISD::SIN_HW: {
+ SDValue CvtSrc = N0.getOperand(0);
+ if (CvtSrc.getOpcode() == ISD::FNEG) {
+ // (fneg (fp_extend (fneg x))) -> (fp_extend x)
+ // (fneg (rcp (fneg x))) -> (rcp x)
+ return DAG.getNode(Opc, SL, VT, CvtSrc.getOperand(0));
+ }
+
+ if (!N0.hasOneUse())
+ return SDValue();
+
+ // (fneg (fp_extend x)) -> (fp_extend (fneg x))
+ // (fneg (rcp x)) -> (rcp (fneg x))
+ SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
+ return DAG.getNode(Opc, SL, VT, Neg);
+ }
+ case ISD::FP_ROUND: {
+ SDValue CvtSrc = N0.getOperand(0);
+
+ if (CvtSrc.getOpcode() == ISD::FNEG) {
+ // (fneg (fp_round (fneg x))) -> (fp_round x)
+ return DAG.getNode(ISD::FP_ROUND, SL, VT,
+ CvtSrc.getOperand(0), N0.getOperand(1));
+ }
+
+ if (!N0.hasOneUse())
+ return SDValue();
+
+ // (fneg (fp_round x)) -> (fp_round (fneg x))
+ SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
+ return DAG.getNode(ISD::FP_ROUND, SL, VT, Neg, N0.getOperand(1));
+ }
+ default:
+ return SDValue();
+ }
+}
+
SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
@@ -2829,6 +3063,8 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
return performMulLoHi24Combine(N, DCI);
case ISD::SELECT:
return performSelectCombine(N, DCI);
+ case ISD::FNEG:
+ return performFNegCombine(N, DCI);
case AMDGPUISD::BFE_I32:
case AMDGPUISD::BFE_U32: {
assert(!N->getValueType(0).isVector() &&
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 745c9923de2e..69567aa5f713 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -84,6 +84,7 @@ protected:
SDValue performCtlzCombine(const SDLoc &SL, SDValue Cond, SDValue LHS,
SDValue RHS, DAGCombinerInfo &DCI) const;
SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performFNegCombine(SDNode *N, DAGCombinerInfo &DCI) const;
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT);
diff --git a/lib/Target/AMDGPU/AMDGPUInstructions.td b/lib/Target/AMDGPU/AMDGPUInstructions.td
index 513df3a9cdf3..59cba636c586 100644
--- a/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -629,9 +629,10 @@ def smax_oneuse : HasOneUseBinOp<smax>;
def smin_oneuse : HasOneUseBinOp<smin>;
def umax_oneuse : HasOneUseBinOp<umax>;
def umin_oneuse : HasOneUseBinOp<umin>;
-def sub_oneuse : HasOneUseBinOp<sub>;
} // Properties = [SDNPCommutative, SDNPAssociative]
+def sub_oneuse : HasOneUseBinOp<sub>;
+
def select_oneuse : HasOneUseTernaryOp<select>;
// Special conversion patterns
diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index a1a352642242..e90487065992 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -110,7 +110,7 @@ unsigned AMDGPUTTIImpl::getMaxInterleaveFactor(unsigned VF) {
int AMDGPUTTIImpl::getArithmeticInstrCost(
unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info,
TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
- TTI::OperandValueProperties Opd2PropInfo) {
+ TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args ) {
EVT OrigTy = TLI->getValueType(DL, Ty);
if (!OrigTy.isSimple()) {
diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index 1177007644ff..0d83b2a585bf 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -83,7 +83,8 @@ public:
TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
- TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None);
+ TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
+ ArrayRef<const Value *> Args = ArrayRef<const Value *>());
unsigned getCFInstrCost(unsigned Opcode);
diff --git a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index da9d009c542b..3cf9a1d92469 100644
--- a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -214,7 +214,7 @@ public:
}
bool isReg() const override {
- return isRegKind() && !Reg.Mods.hasModifiers();
+ return isRegKind() && !hasModifiers();
}
bool isRegOrImmWithInputMods(MVT type) const {
@@ -245,6 +245,15 @@ public:
return isRegOrImmWithInputMods(MVT::f64);
}
+ bool isVReg() const {
+ return isRegClass(AMDGPU::VGPR_32RegClassID) ||
+ isRegClass(AMDGPU::VReg_64RegClassID) ||
+ isRegClass(AMDGPU::VReg_96RegClassID) ||
+ isRegClass(AMDGPU::VReg_128RegClassID) ||
+ isRegClass(AMDGPU::VReg_256RegClassID) ||
+ isRegClass(AMDGPU::VReg_512RegClassID);
+ }
+
bool isVReg32OrOff() const {
return isOff() || isRegClass(AMDGPU::VGPR_32RegClassID);
}
@@ -299,28 +308,32 @@ public:
bool isRegClass(unsigned RCID) const;
+ bool isRegOrInlineNoMods(unsigned RCID, MVT type) const {
+ return (isRegClass(RCID) || isInlinableImm(type)) && !hasModifiers();
+ }
+
bool isSCSrcB16() const {
- return isRegClass(AMDGPU::SReg_32RegClassID) || isInlinableImm(MVT::i16);
+ return isRegOrInlineNoMods(AMDGPU::SReg_32RegClassID, MVT::i16);
}
bool isSCSrcB32() const {
- return isRegClass(AMDGPU::SReg_32RegClassID) || isInlinableImm(MVT::i32);
+ return isRegOrInlineNoMods(AMDGPU::SReg_32RegClassID, MVT::i32);
}
bool isSCSrcB64() const {
- return isRegClass(AMDGPU::SReg_64RegClassID) || isInlinableImm(MVT::i64);
+ return isRegOrInlineNoMods(AMDGPU::SReg_64RegClassID, MVT::i64);
}
bool isSCSrcF16() const {
- return isRegClass(AMDGPU::SReg_32RegClassID) || isInlinableImm(MVT::f16);
+ return isRegOrInlineNoMods(AMDGPU::SReg_32RegClassID, MVT::f16);
}
bool isSCSrcF32() const {
- return isRegClass(AMDGPU::SReg_32RegClassID) || isInlinableImm(MVT::f32);
+ return isRegOrInlineNoMods(AMDGPU::SReg_32RegClassID, MVT::f32);
}
bool isSCSrcF64() const {
- return isRegClass(AMDGPU::SReg_64RegClassID) || isInlinableImm(MVT::f64);
+ return isRegOrInlineNoMods(AMDGPU::SReg_64RegClassID, MVT::f64);
}
bool isSSrcB32() const {
@@ -350,27 +363,27 @@ public:
}
bool isVCSrcB32() const {
- return isRegClass(AMDGPU::VS_32RegClassID) || isInlinableImm(MVT::i32);
+ return isRegOrInlineNoMods(AMDGPU::VS_32RegClassID, MVT::i32);
}
bool isVCSrcB64() const {
- return isRegClass(AMDGPU::VS_64RegClassID) || isInlinableImm(MVT::i64);
+ return isRegOrInlineNoMods(AMDGPU::VS_64RegClassID, MVT::i64);
}
bool isVCSrcB16() const {
- return isRegClass(AMDGPU::VS_32RegClassID) || isInlinableImm(MVT::i16);
+ return isRegOrInlineNoMods(AMDGPU::VS_32RegClassID, MVT::i16);
}
bool isVCSrcF32() const {
- return isRegClass(AMDGPU::VS_32RegClassID) || isInlinableImm(MVT::f32);
+ return isRegOrInlineNoMods(AMDGPU::VS_32RegClassID, MVT::f32);
}
bool isVCSrcF64() const {
- return isRegClass(AMDGPU::VS_64RegClassID) || isInlinableImm(MVT::f64);
+ return isRegOrInlineNoMods(AMDGPU::VS_64RegClassID, MVT::f64);
}
bool isVCSrcF16() const {
- return isRegClass(AMDGPU::VS_32RegClassID) || isInlinableImm(MVT::f16);
+ return isRegOrInlineNoMods(AMDGPU::VS_32RegClassID, MVT::f16);
}
bool isVSrcB32() const {
@@ -534,6 +547,23 @@ public:
addRegOrImmWithInputModsOperands(Inst, N);
}
+ void addRegWithInputModsOperands(MCInst &Inst, unsigned N) const {
+ Modifiers Mods = getModifiers();
+ Inst.addOperand(MCOperand::createImm(Mods.getModifiersOperand()));
+ assert(isRegKind());
+ addRegOperands(Inst, N);
+ }
+
+ void addRegWithFPInputModsOperands(MCInst &Inst, unsigned N) const {
+ assert(!hasIntModifiers());
+ addRegWithInputModsOperands(Inst, N);
+ }
+
+ void addRegWithIntInputModsOperands(MCInst &Inst, unsigned N) const {
+ assert(!hasFPModifiers());
+ addRegWithInputModsOperands(Inst, N);
+ }
+
void addSoppBrTargetOperands(MCInst &Inst, unsigned N) const {
if (isImm())
addImmOperands(Inst, N);
@@ -852,9 +882,12 @@ public:
StringRef &Value);
OperandMatchResultTy parseImm(OperandVector &Operands);
+ OperandMatchResultTy parseReg(OperandVector &Operands);
OperandMatchResultTy parseRegOrImm(OperandVector &Operands);
- OperandMatchResultTy parseRegOrImmWithFPInputMods(OperandVector &Operands);
- OperandMatchResultTy parseRegOrImmWithIntInputMods(OperandVector &Operands);
+ OperandMatchResultTy parseRegOrImmWithFPInputMods(OperandVector &Operands, bool AllowImm = true);
+ OperandMatchResultTy parseRegOrImmWithIntInputMods(OperandVector &Operands, bool AllowImm = true);
+ OperandMatchResultTy parseRegWithFPInputMods(OperandVector &Operands);
+ OperandMatchResultTy parseRegWithIntInputMods(OperandVector &Operands);
OperandMatchResultTy parseVReg32OrOff(OperandVector &Operands);
void cvtDSOffset01(MCInst &Inst, const OperandVector &Operands);
@@ -1057,7 +1090,7 @@ bool AMDGPUOperand::isLiteralImm(MVT type) const {
}
bool AMDGPUOperand::isRegClass(unsigned RCID) const {
- return isReg() && AsmParser->getMRI()->getRegClass(RCID).contains(getReg());
+ return isRegKind() && AsmParser->getMRI()->getRegClass(RCID).contains(getReg());
}
void AMDGPUOperand::addImmOperands(MCInst &Inst, unsigned N, bool ApplyModifiers) const {
@@ -1468,23 +1501,28 @@ AMDGPUAsmParser::parseImm(OperandVector &Operands) {
}
OperandMatchResultTy
-AMDGPUAsmParser::parseRegOrImm(OperandVector &Operands) {
- auto res = parseImm(Operands);
- if (res != MatchOperand_NoMatch) {
- return res;
- }
-
+AMDGPUAsmParser::parseReg(OperandVector &Operands) {
if (auto R = parseRegister()) {
assert(R->isReg());
R->Reg.IsForcedVOP3 = isForcedVOP3();
Operands.push_back(std::move(R));
return MatchOperand_Success;
}
- return MatchOperand_ParseFail;
+ return MatchOperand_NoMatch;
}
OperandMatchResultTy
-AMDGPUAsmParser::parseRegOrImmWithFPInputMods(OperandVector &Operands) {
+AMDGPUAsmParser::parseRegOrImm(OperandVector &Operands) {
+ auto res = parseImm(Operands);
+ if (res != MatchOperand_NoMatch) {
+ return res;
+ }
+
+ return parseReg(Operands);
+}
+
+OperandMatchResultTy
+AMDGPUAsmParser::parseRegOrImmWithFPInputMods(OperandVector &Operands, bool AllowImm) {
// XXX: During parsing we can't determine if minus sign means
// negate-modifier or negative immediate value.
// By default we suppose it is modifier.
@@ -1514,7 +1552,12 @@ AMDGPUAsmParser::parseRegOrImmWithFPInputMods(OperandVector &Operands) {
Abs = true;
}
- auto Res = parseRegOrImm(Operands);
+ OperandMatchResultTy Res;
+ if (AllowImm) {
+ Res = parseRegOrImm(Operands);
+ } else {
+ Res = parseReg(Operands);
+ }
if (Res != MatchOperand_Success) {
return Res;
}
@@ -1548,7 +1591,7 @@ AMDGPUAsmParser::parseRegOrImmWithFPInputMods(OperandVector &Operands) {
}
OperandMatchResultTy
-AMDGPUAsmParser::parseRegOrImmWithIntInputMods(OperandVector &Operands) {
+AMDGPUAsmParser::parseRegOrImmWithIntInputMods(OperandVector &Operands, bool AllowImm) {
bool Sext = false;
if (getLexer().getKind() == AsmToken::Identifier && Parser.getTok().getString() == "sext") {
@@ -1561,7 +1604,12 @@ AMDGPUAsmParser::parseRegOrImmWithIntInputMods(OperandVector &Operands) {
Parser.Lex();
}
- auto Res = parseRegOrImm(Operands);
+ OperandMatchResultTy Res;
+ if (AllowImm) {
+ Res = parseRegOrImm(Operands);
+ } else {
+ Res = parseReg(Operands);
+ }
if (Res != MatchOperand_Success) {
return Res;
}
@@ -1584,6 +1632,16 @@ AMDGPUAsmParser::parseRegOrImmWithIntInputMods(OperandVector &Operands) {
return MatchOperand_Success;
}
+OperandMatchResultTy
+AMDGPUAsmParser::parseRegWithFPInputMods(OperandVector &Operands) {
+ return parseRegOrImmWithFPInputMods(Operands, false);
+}
+
+OperandMatchResultTy
+AMDGPUAsmParser::parseRegWithIntInputMods(OperandVector &Operands) {
+ return parseRegOrImmWithIntInputMods(Operands, false);
+}
+
OperandMatchResultTy AMDGPUAsmParser::parseVReg32OrOff(OperandVector &Operands) {
std::unique_ptr<AMDGPUOperand> Reg = parseRegister();
if (Reg) {
@@ -3382,7 +3440,7 @@ void AMDGPUAsmParser::cvtDPP(MCInst &Inst, const OperandVector &Operands) {
// Skip it.
continue;
} if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) {
- Op.addRegOrImmWithFPInputModsOperands(Inst, 2);
+ Op.addRegWithFPInputModsOperands(Inst, 2);
} else if (Op.isDPPCtrl()) {
Op.addImmOperands(Inst, 1);
} else if (Op.isImm()) {
@@ -3508,7 +3566,7 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands,
// Skip it.
continue;
} else if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) {
- Op.addRegOrImmWithInputModsOperands(Inst, 2);
+ Op.addRegWithInputModsOperands(Inst, 2);
} else if (Op.isImm()) {
// Handle optional arguments
OptionalIdx[Op.getImmTy()] = I;
diff --git a/lib/Target/AMDGPU/EvergreenInstructions.td b/lib/Target/AMDGPU/EvergreenInstructions.td
index 4112ad100584..48c6592ca5b2 100644
--- a/lib/Target/AMDGPU/EvergreenInstructions.td
+++ b/lib/Target/AMDGPU/EvergreenInstructions.td
@@ -333,11 +333,13 @@ def MUL_UINT24_eg : R600_2OP <0xB5, "MUL_UINT24",
def DOT4_eg : DOT4_Common<0xBE>;
defm CUBE_eg : CUBE_Common<0xC0>;
-def BCNT_INT : R600_1OP_Helper <0xAA, "BCNT_INT", ctpop, VecALU>;
def ADDC_UINT : R600_2OP_Helper <0x52, "ADDC_UINT", AMDGPUcarry>;
def SUBB_UINT : R600_2OP_Helper <0x53, "SUBB_UINT", AMDGPUborrow>;
+def FLT32_TO_FLT16 : R600_1OP_Helper <0xA2, "FLT32_TO_FLT16", fp_to_f16, VecALU>;
+def FLT16_TO_FLT32 : R600_1OP_Helper <0xA3, "FLT16_TO_FLT32", f16_to_fp, VecALU>;
+def BCNT_INT : R600_1OP_Helper <0xAA, "BCNT_INT", ctpop, VecALU>;
def FFBH_UINT : R600_1OP_Helper <0xAB, "FFBH_UINT", AMDGPUffbh_u32, VecALU>;
def FFBL_INT : R600_1OP_Helper <0xAC, "FFBL_INT", cttz_zero_undef, VecALU>;
diff --git a/lib/Target/AMDGPU/SIFoldOperands.cpp b/lib/Target/AMDGPU/SIFoldOperands.cpp
index 831ac5948a68..a5c0d4923d6b 100644
--- a/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -25,25 +25,6 @@ using namespace llvm;
namespace {
-class SIFoldOperands : public MachineFunctionPass {
-public:
- static char ID;
-
-public:
- SIFoldOperands() : MachineFunctionPass(ID) {
- initializeSIFoldOperandsPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnMachineFunction(MachineFunction &MF) override;
-
- StringRef getPassName() const override { return "SI Fold Operands"; }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- MachineFunctionPass::getAnalysisUsage(AU);
- }
-};
-
struct FoldCandidate {
MachineInstr *UseMI;
union {
@@ -79,6 +60,36 @@ struct FoldCandidate {
}
};
+class SIFoldOperands : public MachineFunctionPass {
+public:
+ static char ID;
+ MachineRegisterInfo *MRI;
+ const SIInstrInfo *TII;
+ const SIRegisterInfo *TRI;
+
+ void foldOperand(MachineOperand &OpToFold,
+ MachineInstr *UseMI,
+ unsigned UseOpIdx,
+ SmallVectorImpl<FoldCandidate> &FoldList,
+ SmallVectorImpl<MachineInstr *> &CopiesToReplace) const;
+
+ void foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const;
+
+public:
+ SIFoldOperands() : MachineFunctionPass(ID) {
+ initializeSIFoldOperandsPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ StringRef getPassName() const override { return "SI Fold Operands"; }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+};
+
} // End anonymous namespace.
INITIALIZE_PASS(SIFoldOperands, DEBUG_TYPE,
@@ -88,6 +99,34 @@ char SIFoldOperands::ID = 0;
char &llvm::SIFoldOperandsID = SIFoldOperands::ID;
+// Wrapper around isInlineConstant that understands special cases when
+// instruction types are replaced during operand folding.
+static bool isInlineConstantIfFolded(const SIInstrInfo *TII,
+ const MachineInstr &UseMI,
+ unsigned OpNo,
+ const MachineOperand &OpToFold) {
+ if (TII->isInlineConstant(UseMI, OpNo, OpToFold))
+ return true;
+
+ unsigned Opc = UseMI.getOpcode();
+ switch (Opc) {
+ case AMDGPU::V_MAC_F32_e64:
+ case AMDGPU::V_MAC_F16_e64: {
+ // Special case for mac. Since this is replaced with mad when folded into
+ // src2, we need to check the legality for the final instruction.
+ int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
+ if (static_cast<int>(OpNo) == Src2Idx) {
+ bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64;
+ const MCInstrDesc &MadDesc
+ = TII->get(IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16);
+ return TII->isInlineConstant(OpToFold, MadDesc.OpInfo[OpNo].OperandType);
+ }
+ }
+ default:
+ return false;
+ }
+}
+
FunctionPass *llvm::createSIFoldOperandsPass() {
return new SIFoldOperands();
}
@@ -141,7 +180,7 @@ static bool updateOperand(FoldCandidate &Fold,
return false;
}
-static bool isUseMIInFoldList(const std::vector<FoldCandidate> &FoldList,
+static bool isUseMIInFoldList(ArrayRef<FoldCandidate> FoldList,
const MachineInstr *MI) {
for (auto Candidate : FoldList) {
if (Candidate.UseMI == MI)
@@ -150,7 +189,7 @@ static bool isUseMIInFoldList(const std::vector<FoldCandidate> &FoldList,
return false;
}
-static bool tryAddToFoldList(std::vector<FoldCandidate> &FoldList,
+static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
MachineInstr *MI, unsigned OpNo,
MachineOperand *OpToFold,
const SIInstrInfo *TII) {
@@ -160,7 +199,7 @@ static bool tryAddToFoldList(std::vector<FoldCandidate> &FoldList,
unsigned Opc = MI->getOpcode();
if ((Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64) &&
(int)OpNo == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)) {
- bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64;
+ bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64;
// Check if changing this to a v_mad_{f16, f32} instruction will allow us
// to fold the operand.
@@ -227,12 +266,12 @@ static bool isUseSafeToFold(const MachineInstr &MI,
//return !MI.hasRegisterImplicitUseOperand(UseMO.getReg());
}
-static void foldOperand(MachineOperand &OpToFold, MachineInstr *UseMI,
- unsigned UseOpIdx,
- std::vector<FoldCandidate> &FoldList,
- SmallVectorImpl<MachineInstr *> &CopiesToReplace,
- const SIInstrInfo *TII, const SIRegisterInfo &TRI,
- MachineRegisterInfo &MRI) {
+void SIFoldOperands::foldOperand(
+ MachineOperand &OpToFold,
+ MachineInstr *UseMI,
+ unsigned UseOpIdx,
+ SmallVectorImpl<FoldCandidate> &FoldList,
+ SmallVectorImpl<MachineInstr *> &CopiesToReplace) const {
const MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);
if (!isUseSafeToFold(*UseMI, UseOp))
@@ -264,7 +303,7 @@ static void foldOperand(MachineOperand &OpToFold, MachineInstr *UseMI,
unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm();
for (MachineRegisterInfo::use_iterator
- RSUse = MRI.use_begin(RegSeqDstReg), RSE = MRI.use_end();
+ RSUse = MRI->use_begin(RegSeqDstReg), RSE = MRI->use_end();
RSUse != RSE; ++RSUse) {
MachineInstr *RSUseMI = RSUse->getParent();
@@ -272,7 +311,7 @@ static void foldOperand(MachineOperand &OpToFold, MachineInstr *UseMI,
continue;
foldOperand(OpToFold, RSUseMI, RSUse.getOperandNo(), FoldList,
- CopiesToReplace, TII, TRI, MRI);
+ CopiesToReplace);
}
return;
@@ -287,8 +326,8 @@ static void foldOperand(MachineOperand &OpToFold, MachineInstr *UseMI,
unsigned DestReg = UseMI->getOperand(0).getReg();
const TargetRegisterClass *DestRC
= TargetRegisterInfo::isVirtualRegister(DestReg) ?
- MRI.getRegClass(DestReg) :
- TRI.getPhysRegClass(DestReg);
+ MRI->getRegClass(DestReg) :
+ TRI->getPhysRegClass(DestReg);
unsigned MovOp = TII->getMovOpcode(DestRC);
if (MovOp == AMDGPU::COPY)
@@ -318,7 +357,7 @@ static void foldOperand(MachineOperand &OpToFold, MachineInstr *UseMI,
const MCInstrDesc &FoldDesc = OpToFold.getParent()->getDesc();
const TargetRegisterClass *FoldRC =
- TRI.getRegClass(FoldDesc.OpInfo[0].RegClass);
+ TRI->getRegClass(FoldDesc.OpInfo[0].RegClass);
APInt Imm(TII->operandBitWidth(FoldDesc.OpInfo[1].OperandType),
OpToFold.getImm());
@@ -328,8 +367,8 @@ static void foldOperand(MachineOperand &OpToFold, MachineInstr *UseMI,
unsigned UseReg = UseOp.getReg();
const TargetRegisterClass *UseRC
= TargetRegisterInfo::isVirtualRegister(UseReg) ?
- MRI.getRegClass(UseReg) :
- TRI.getPhysRegClass(UseReg);
+ MRI->getRegClass(UseReg) :
+ TRI->getPhysRegClass(UseReg);
assert(Imm.getBitWidth() == 64);
@@ -349,20 +388,51 @@ static void foldOperand(MachineOperand &OpToFold, MachineInstr *UseMI,
}
static bool evalBinaryInstruction(unsigned Opcode, int32_t &Result,
- int32_t LHS, int32_t RHS) {
+ uint32_t LHS, uint32_t RHS) {
switch (Opcode) {
case AMDGPU::V_AND_B32_e64:
+ case AMDGPU::V_AND_B32_e32:
case AMDGPU::S_AND_B32:
Result = LHS & RHS;
return true;
case AMDGPU::V_OR_B32_e64:
+ case AMDGPU::V_OR_B32_e32:
case AMDGPU::S_OR_B32:
Result = LHS | RHS;
return true;
case AMDGPU::V_XOR_B32_e64:
+ case AMDGPU::V_XOR_B32_e32:
case AMDGPU::S_XOR_B32:
Result = LHS ^ RHS;
return true;
+ case AMDGPU::V_LSHL_B32_e64:
+ case AMDGPU::V_LSHL_B32_e32:
+ case AMDGPU::S_LSHL_B32:
+ // The instruction ignores the high bits for out of bounds shifts.
+ Result = LHS << (RHS & 31);
+ return true;
+ case AMDGPU::V_LSHLREV_B32_e64:
+ case AMDGPU::V_LSHLREV_B32_e32:
+ Result = RHS << (LHS & 31);
+ return true;
+ case AMDGPU::V_LSHR_B32_e64:
+ case AMDGPU::V_LSHR_B32_e32:
+ case AMDGPU::S_LSHR_B32:
+ Result = LHS >> (RHS & 31);
+ return true;
+ case AMDGPU::V_LSHRREV_B32_e64:
+ case AMDGPU::V_LSHRREV_B32_e32:
+ Result = RHS >> (LHS & 31);
+ return true;
+ case AMDGPU::V_ASHR_I32_e64:
+ case AMDGPU::V_ASHR_I32_e32:
+ case AMDGPU::S_ASHR_I32:
+ Result = static_cast<int32_t>(LHS) >> (RHS & 31);
+ return true;
+ case AMDGPU::V_ASHRREV_I32_e64:
+ case AMDGPU::V_ASHRREV_I32_e32:
+ Result = static_cast<int32_t>(RHS) >> (LHS & 31);
+ return true;
default:
return false;
}
@@ -390,33 +460,47 @@ static void mutateCopyOp(MachineInstr &MI, const MCInstrDesc &NewDesc) {
stripExtraCopyOperands(MI);
}
+static MachineOperand *getImmOrMaterializedImm(MachineRegisterInfo &MRI,
+ MachineOperand &Op) {
+ if (Op.isReg()) {
+ // If this has a subregister, it obviously is a register source.
+ if (Op.getSubReg() != AMDGPU::NoSubRegister)
+ return &Op;
+
+ MachineInstr *Def = MRI.getVRegDef(Op.getReg());
+ if (Def->isMoveImmediate()) {
+ MachineOperand &ImmSrc = Def->getOperand(1);
+ if (ImmSrc.isImm())
+ return &ImmSrc;
+ }
+ }
+
+ return &Op;
+}
+
// Try to simplify operations with a constant that may appear after instruction
// selection.
+// TODO: See if a frame index with a fixed offset can fold.
static bool tryConstantFoldOp(MachineRegisterInfo &MRI,
const SIInstrInfo *TII,
- MachineInstr *MI) {
+ MachineInstr *MI,
+ MachineOperand *ImmOp) {
unsigned Opc = MI->getOpcode();
-
if (Opc == AMDGPU::V_NOT_B32_e64 || Opc == AMDGPU::V_NOT_B32_e32 ||
Opc == AMDGPU::S_NOT_B32) {
- MachineOperand &Src0 = MI->getOperand(1);
- if (Src0.isImm()) {
- Src0.setImm(~Src0.getImm());
- mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_NOT_B32)));
- return true;
- }
-
- return false;
+ MI->getOperand(1).ChangeToImmediate(~ImmOp->getImm());
+ mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_NOT_B32)));
+ return true;
}
- if (!MI->isCommutable())
+ int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
+ if (Src1Idx == -1)
return false;
int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
- int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
+ MachineOperand *Src0 = getImmOrMaterializedImm(MRI, MI->getOperand(Src0Idx));
+ MachineOperand *Src1 = getImmOrMaterializedImm(MRI, MI->getOperand(Src1Idx));
- MachineOperand *Src0 = &MI->getOperand(Src0Idx);
- MachineOperand *Src1 = &MI->getOperand(Src1Idx);
if (!Src0->isImm() && !Src1->isImm())
return false;
@@ -431,19 +515,26 @@ static bool tryConstantFoldOp(MachineRegisterInfo &MRI,
const SIRegisterInfo &TRI = TII->getRegisterInfo();
bool IsSGPR = TRI.isSGPRReg(MRI, MI->getOperand(0).getReg());
- Src0->setImm(NewImm);
+ // Be careful to change the right operand, src0 may belong to a different
+ // instruction.
+ MI->getOperand(Src0Idx).ChangeToImmediate(NewImm);
MI->RemoveOperand(Src1Idx);
mutateCopyOp(*MI, TII->get(getMovOpc(IsSGPR)));
return true;
}
+ if (!MI->isCommutable())
+ return false;
+
if (Src0->isImm() && !Src1->isImm()) {
std::swap(Src0, Src1);
std::swap(Src0Idx, Src1Idx);
}
int32_t Src1Val = static_cast<int32_t>(Src1->getImm());
- if (Opc == AMDGPU::V_OR_B32_e64 || Opc == AMDGPU::S_OR_B32) {
+ if (Opc == AMDGPU::V_OR_B32_e64 ||
+ Opc == AMDGPU::V_OR_B32_e32 ||
+ Opc == AMDGPU::S_OR_B32) {
if (Src1Val == 0) {
// y = or x, 0 => y = copy x
MI->RemoveOperand(Src1Idx);
@@ -459,6 +550,7 @@ static bool tryConstantFoldOp(MachineRegisterInfo &MRI,
}
if (MI->getOpcode() == AMDGPU::V_AND_B32_e64 ||
+ MI->getOpcode() == AMDGPU::V_AND_B32_e32 ||
MI->getOpcode() == AMDGPU::S_AND_B32) {
if (Src1Val == 0) {
// y = and x, 0 => y = v_mov_b32 0
@@ -476,29 +568,136 @@ static bool tryConstantFoldOp(MachineRegisterInfo &MRI,
}
if (MI->getOpcode() == AMDGPU::V_XOR_B32_e64 ||
+ MI->getOpcode() == AMDGPU::V_XOR_B32_e32 ||
MI->getOpcode() == AMDGPU::S_XOR_B32) {
if (Src1Val == 0) {
// y = xor x, 0 => y = copy x
MI->RemoveOperand(Src1Idx);
mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
+ return true;
}
}
return false;
}
+void SIFoldOperands::foldInstOperand(MachineInstr &MI,
+ MachineOperand &OpToFold) const {
+ // We need mutate the operands of new mov instructions to add implicit
+ // uses of EXEC, but adding them invalidates the use_iterator, so defer
+ // this.
+ SmallVector<MachineInstr *, 4> CopiesToReplace;
+ SmallVector<FoldCandidate, 4> FoldList;
+ MachineOperand &Dst = MI.getOperand(0);
+
+ bool FoldingImm = OpToFold.isImm() || OpToFold.isFI();
+ if (FoldingImm) {
+ unsigned NumLiteralUses = 0;
+ MachineOperand *NonInlineUse = nullptr;
+ int NonInlineUseOpNo = -1;
+
+ MachineRegisterInfo::use_iterator NextUse, NextInstUse;
+ for (MachineRegisterInfo::use_iterator
+ Use = MRI->use_begin(Dst.getReg()), E = MRI->use_end();
+ Use != E; Use = NextUse) {
+ NextUse = std::next(Use);
+ MachineInstr *UseMI = Use->getParent();
+ unsigned OpNo = Use.getOperandNo();
+
+ // Folding the immediate may reveal operations that can be constant
+ // folded or replaced with a copy. This can happen for example after
+ // frame indices are lowered to constants or from splitting 64-bit
+ // constants.
+ //
+ // We may also encounter cases where one or both operands are
+ // immediates materialized into a register, which would ordinarily not
+ // be folded due to multiple uses or operand constraints.
+
+ if (OpToFold.isImm() && tryConstantFoldOp(*MRI, TII, UseMI, &OpToFold)) {
+ DEBUG(dbgs() << "Constant folded " << *UseMI <<'\n');
+
+ // Some constant folding cases change the same immediate's use to a new
+ // instruction, e.g. and x, 0 -> 0. Make sure we re-visit the user
+ // again. The same constant folded instruction could also have a second
+ // use operand.
+ NextUse = MRI->use_begin(Dst.getReg());
+ continue;
+ }
+
+ // Try to fold any inline immediate uses, and then only fold other
+ // constants if they have one use.
+ //
+ // The legality of the inline immediate must be checked based on the use
+ // operand, not the defining instruction, because 32-bit instructions
+ // with 32-bit inline immediate sources may be used to materialize
+ // constants used in 16-bit operands.
+ //
+ // e.g. it is unsafe to fold:
+ // s_mov_b32 s0, 1.0 // materializes 0x3f800000
+ // v_add_f16 v0, v1, s0 // 1.0 f16 inline immediate sees 0x00003c00
+
+ // Folding immediates with more than one use will increase program size.
+ // FIXME: This will also reduce register usage, which may be better
+ // in some cases. A better heuristic is needed.
+ if (isInlineConstantIfFolded(TII, *UseMI, OpNo, OpToFold)) {
+ foldOperand(OpToFold, UseMI, OpNo, FoldList, CopiesToReplace);
+ } else {
+ if (++NumLiteralUses == 1) {
+ NonInlineUse = &*Use;
+ NonInlineUseOpNo = OpNo;
+ }
+ }
+ }
+
+ if (NumLiteralUses == 1) {
+ MachineInstr *UseMI = NonInlineUse->getParent();
+ foldOperand(OpToFold, UseMI, NonInlineUseOpNo, FoldList, CopiesToReplace);
+ }
+ } else {
+ // Folding register.
+ for (MachineRegisterInfo::use_iterator
+ Use = MRI->use_begin(Dst.getReg()), E = MRI->use_end();
+ Use != E; ++Use) {
+ MachineInstr *UseMI = Use->getParent();
+
+ foldOperand(OpToFold, UseMI, Use.getOperandNo(),
+ FoldList, CopiesToReplace);
+ }
+ }
+
+ MachineFunction *MF = MI.getParent()->getParent();
+ // Make sure we add EXEC uses to any new v_mov instructions created.
+ for (MachineInstr *Copy : CopiesToReplace)
+ Copy->addImplicitDefUseOperands(*MF);
+
+ for (FoldCandidate &Fold : FoldList) {
+ if (updateOperand(Fold, *TRI)) {
+ // Clear kill flags.
+ if (Fold.isReg()) {
+ assert(Fold.OpToFold && Fold.OpToFold->isReg());
+ // FIXME: Probably shouldn't bother trying to fold if not an
+ // SGPR. PeepholeOptimizer can eliminate redundant VGPR->VGPR
+ // copies.
+ MRI->clearKillFlags(Fold.OpToFold->getReg());
+ }
+ DEBUG(dbgs() << "Folded source from " << MI << " into OpNo " <<
+ static_cast<int>(Fold.UseOpNo) << " of " << *Fold.UseMI << '\n');
+ }
+ }
+}
+
bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
if (skipFunction(*MF.getFunction()))
return false;
const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
- MachineRegisterInfo &MRI = MF.getRegInfo();
- const SIInstrInfo *TII = ST.getInstrInfo();
- const SIRegisterInfo &TRI = TII->getRegisterInfo();
+ MRI = &MF.getRegInfo();
+ TII = ST.getInstrInfo();
+ TRI = &TII->getRegisterInfo();
for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
- BI != BE; ++BI) {
+ BI != BE; ++BI) {
MachineBasicBlock &MBB = *BI;
MachineBasicBlock::iterator I, Next;
@@ -512,8 +711,7 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
MachineOperand &OpToFold = MI.getOperand(1);
bool FoldingImm = OpToFold.isImm() || OpToFold.isFI();
- // FIXME: We could also be folding things like FrameIndexes and
- // TargetIndexes.
+ // FIXME: We could also be folding things like TargetIndexes.
if (!FoldingImm && !OpToFold.isReg())
continue;
@@ -532,90 +730,7 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
!TargetRegisterInfo::isVirtualRegister(Dst.getReg()))
continue;
- // We need mutate the operands of new mov instructions to add implicit
- // uses of EXEC, but adding them invalidates the use_iterator, so defer
- // this.
- SmallVector<MachineInstr *, 4> CopiesToReplace;
-
- std::vector<FoldCandidate> FoldList;
- if (FoldingImm) {
- unsigned NumLiteralUses = 0;
- MachineOperand *NonInlineUse = nullptr;
- int NonInlineUseOpNo = -1;
-
- // Try to fold any inline immediate uses, and then only fold other
- // constants if they have one use.
- //
- // The legality of the inline immediate must be checked based on the use
- // operand, not the defining instruction, because 32-bit instructions
- // with 32-bit inline immediate sources may be used to materialize
- // constants used in 16-bit operands.
- //
- // e.g. it is unsafe to fold:
- // s_mov_b32 s0, 1.0 // materializes 0x3f800000
- // v_add_f16 v0, v1, s0 // 1.0 f16 inline immediate sees 0x00003c00
-
- // Folding immediates with more than one use will increase program size.
- // FIXME: This will also reduce register usage, which may be better
- // in some cases. A better heuristic is needed.
- for (MachineRegisterInfo::use_iterator
- Use = MRI.use_begin(Dst.getReg()), E = MRI.use_end();
- Use != E; ++Use) {
- MachineInstr *UseMI = Use->getParent();
- unsigned OpNo = Use.getOperandNo();
-
- if (TII->isInlineConstant(*UseMI, OpNo, OpToFold)) {
- foldOperand(OpToFold, UseMI, OpNo, FoldList,
- CopiesToReplace, TII, TRI, MRI);
- } else {
- if (++NumLiteralUses == 1) {
- NonInlineUse = &*Use;
- NonInlineUseOpNo = OpNo;
- }
- }
- }
-
- if (NumLiteralUses == 1) {
- MachineInstr *UseMI = NonInlineUse->getParent();
- foldOperand(OpToFold, UseMI, NonInlineUseOpNo, FoldList,
- CopiesToReplace, TII, TRI, MRI);
- }
- } else {
- // Folding register.
- for (MachineRegisterInfo::use_iterator
- Use = MRI.use_begin(Dst.getReg()), E = MRI.use_end();
- Use != E; ++Use) {
- MachineInstr *UseMI = Use->getParent();
-
- foldOperand(OpToFold, UseMI, Use.getOperandNo(), FoldList,
- CopiesToReplace, TII, TRI, MRI);
- }
- }
-
- // Make sure we add EXEC uses to any new v_mov instructions created.
- for (MachineInstr *Copy : CopiesToReplace)
- Copy->addImplicitDefUseOperands(MF);
-
- for (FoldCandidate &Fold : FoldList) {
- if (updateOperand(Fold, TRI)) {
- // Clear kill flags.
- if (Fold.isReg()) {
- assert(Fold.OpToFold && Fold.OpToFold->isReg());
- // FIXME: Probably shouldn't bother trying to fold if not an
- // SGPR. PeepholeOptimizer can eliminate redundant VGPR->VGPR
- // copies.
- MRI.clearKillFlags(Fold.OpToFold->getReg());
- }
- DEBUG(dbgs() << "Folded source from " << MI << " into OpNo " <<
- static_cast<int>(Fold.UseOpNo) << " of " << *Fold.UseMI << '\n');
-
- // Folding the immediate may reveal operations that can be constant
- // folded or replaced with a copy. This can happen for example after
- // frame indices are lowered to constants or from splitting 64-bit
- // constants.
- tryConstantFoldOp(MRI, TII, Fold.UseMI);
- }
- }
+ foldInstOperand(MI, OpToFold);
}
}
return false;
diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td
index 34096e158039..ebaefae3bfef 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/lib/Target/AMDGPU/SIInstrInfo.td
@@ -557,6 +557,27 @@ class IntInputMods <IntInputModsMatchClass matchClass> : InputMods <matchClass>
def Int32InputMods : IntInputMods<Int32InputModsMatchClass>;
def Int64InputMods : IntInputMods<Int64InputModsMatchClass>;
+def FPVRegInputModsMatchClass : AsmOperandClass {
+ let Name = "VRegWithFPInputMods";
+ let ParserMethod = "parseRegWithFPInputMods";
+ let PredicateMethod = "isVReg";
+}
+
+def FPVRegInputMods : InputMods <FPVRegInputModsMatchClass> {
+ let PrintMethod = "printOperandAndFPInputMods";
+}
+
+def IntVRegInputModsMatchClass : AsmOperandClass {
+ let Name = "VRegWithIntInputMods";
+ let ParserMethod = "parseRegWithIntInputMods";
+ let PredicateMethod = "isVReg";
+}
+
+def IntVRegInputMods : InputMods <IntVRegInputModsMatchClass> {
+ let PrintMethod = "printOperandAndIntInputMods";
+}
+
+
//===----------------------------------------------------------------------===//
// Complex patterns
//===----------------------------------------------------------------------===//
@@ -761,6 +782,15 @@ class getSrcMod <ValueType VT> {
);
}
+// Return type of input modifiers operand specified input operand for SDWA/DPP
+class getSrcModExt <ValueType VT> {
+ bit isFP = !if(!eq(VT.Value, f16.Value), 1,
+ !if(!eq(VT.Value, f32.Value), 1,
+ !if(!eq(VT.Value, f64.Value), 1,
+ 0)));
+ Operand ret = !if(isFP, FPVRegInputMods, IntVRegInputMods);
+}
+
// Returns the input arguments for VOP[12C] instructions for the given SrcVT.
class getIns32 <RegisterOperand Src0RC, RegisterClass Src1RC, int NumSrcArgs> {
dag ret = !if(!eq(NumSrcArgs, 1), (ins Src0RC:$src0), // VOP1
@@ -1001,6 +1031,11 @@ class VOPProfile <list<ValueType> _ArgVT> {
field Operand Src0Mod = getSrcMod<Src0VT>.ret;
field Operand Src1Mod = getSrcMod<Src1VT>.ret;
field Operand Src2Mod = getSrcMod<Src2VT>.ret;
+ field Operand Src0ModDPP = getSrcModExt<Src0VT>.ret;
+ field Operand Src1ModDPP = getSrcModExt<Src1VT>.ret;
+ field Operand Src0ModSDWA = getSrcModExt<Src0VT>.ret;
+ field Operand Src1ModSDWA = getSrcModExt<Src1VT>.ret;
+
field bit HasDst = !if(!eq(DstVT.Value, untyped.Value), 0, 1);
field bit HasDst32 = HasDst;
@@ -1038,15 +1073,16 @@ class VOPProfile <list<ValueType> _ArgVT> {
field dag Outs32 = Outs;
field dag Outs64 = Outs;
field dag OutsDPP = getOutsExt<HasDst, DstVT, DstRCDPP>.ret;
- field dag OutsSDWA = getOutsExt<HasDst, DstVT, DstRCDPP>.ret;
+ field dag OutsSDWA = getOutsExt<HasDst, DstVT, DstRCSDWA>.ret;
field dag Ins32 = getIns32<Src0RC32, Src1RC32, NumSrcArgs>.ret;
field dag Ins64 = getIns64<Src0RC64, Src1RC64, Src2RC64, NumSrcArgs,
HasModifiers, Src0Mod, Src1Mod, Src2Mod>.ret;
field dag InsDPP = getInsDPP<Src0DPP, Src1DPP, NumSrcArgs,
- HasModifiers, Src0Mod, Src1Mod>.ret;
+ HasModifiers, Src0ModDPP, Src1ModDPP>.ret;
field dag InsSDWA = getInsSDWA<Src0SDWA, Src1SDWA, NumSrcArgs,
- HasModifiers, Src0Mod, Src1Mod, DstVT>.ret;
+ HasModifiers, Src0ModSDWA, Src1ModSDWA,
+ DstVT>.ret;
field string Asm32 = getAsm32<HasDst, NumSrcArgs, DstVT>.ret;
field string Asm64 = getAsm64<HasDst, NumSrcArgs, HasModifiers, DstVT>.ret;
diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td
index bc35c2edc8d3..b86c04191189 100644
--- a/lib/Target/AMDGPU/SIInstructions.td
+++ b/lib/Target/AMDGPU/SIInstructions.td
@@ -871,6 +871,11 @@ def : Pat <
>;
def : Pat <
+ (i16 (sext_inreg i16:$src, i1)),
+ (S_BFE_I32 $src, (i32 0x00010000)) // 0 | 1 << 16
+>;
+
+def : Pat <
(i16 (sext_inreg i16:$src, i8)),
(S_BFE_I32 $src, (i32 0x80000)) // 0 | 8 << 16
>;
diff --git a/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index b27d7c691032..dd31dc690840 100644
--- a/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -84,12 +84,17 @@ static bool canShrink(MachineInstr &MI, const SIInstrInfo *TII,
// FIXME: v_cndmask_b32 has 3 operands and is shrinkable, but we need to add
// a special case for it. It can only be shrunk if the third operand
// is vcc. We should handle this the same way we handle vopc, by addding
- // a register allocation hint pre-regalloc and then do the shrining
+ // a register allocation hint pre-regalloc and then do the shrinking
// post-regalloc.
if (Src2) {
switch (MI.getOpcode()) {
default: return false;
+ case AMDGPU::V_ADDC_U32_e64:
+ case AMDGPU::V_SUBB_U32_e64:
+ // Additional verification is needed for sdst/src2.
+ return true;
+
case AMDGPU::V_MAC_F32_e64:
case AMDGPU::V_MAC_F16_e64:
if (!isVGPR(Src2, TRI, MRI) ||
@@ -174,7 +179,7 @@ static void copyFlagsToImplicitVCC(MachineInstr &MI,
const MachineOperand &Orig) {
for (MachineOperand &Use : MI.implicit_operands()) {
- if (Use.getReg() == AMDGPU::VCC) {
+ if (Use.isUse() && Use.getReg() == AMDGPU::VCC) {
Use.setIsUndef(Orig.isUndef());
Use.setIsKill(Orig.isKill());
return;
@@ -456,6 +461,31 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
continue;
}
+ // Check for the bool flag output for instructions like V_ADD_I32_e64.
+ const MachineOperand *SDst = TII->getNamedOperand(MI,
+ AMDGPU::OpName::sdst);
+
+ // Check the carry-in operand for v_addc_u32_e64.
+ const MachineOperand *Src2 = TII->getNamedOperand(MI,
+ AMDGPU::OpName::src2);
+
+ if (SDst) {
+ if (SDst->getReg() != AMDGPU::VCC) {
+ if (TargetRegisterInfo::isVirtualRegister(SDst->getReg()))
+ MRI.setRegAllocationHint(SDst->getReg(), 0, AMDGPU::VCC);
+ continue;
+ }
+
+ // All of the instructions with carry outs also have an SGPR input in
+ // src2.
+ if (Src2 && Src2->getReg() != AMDGPU::VCC) {
+ if (TargetRegisterInfo::isVirtualRegister(Src2->getReg()))
+ MRI.setRegAllocationHint(Src2->getReg(), 0, AMDGPU::VCC);
+
+ continue;
+ }
+ }
+
// We can shrink this instruction
DEBUG(dbgs() << "Shrinking " << MI);
@@ -481,8 +511,6 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
if (Src1)
Inst32.addOperand(*Src1);
- const MachineOperand *Src2 =
- TII->getNamedOperand(MI, AMDGPU::OpName::src2);
if (Src2) {
int Op32Src2Idx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2);
if (Op32Src2Idx != -1) {
diff --git a/lib/Target/AMDGPU/VOP1Instructions.td b/lib/Target/AMDGPU/VOP1Instructions.td
index bff706cdc1dc..a15b9ceff2f4 100644
--- a/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/lib/Target/AMDGPU/VOP1Instructions.td
@@ -232,7 +232,7 @@ def VOP_MOVRELD : VOPProfile<[untyped, i32, untyped, untyped]> {
let Ins64 = (ins Src0RC64:$vdst, VSrc_b32:$src0);
let InsDPP = (ins Src0RC32:$vdst, Src0RC32:$src0, dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
bank_mask:$bank_mask, bound_ctrl:$bound_ctrl);
- let InsSDWA = (ins Src0RC32:$vdst, Int32InputMods:$src0_modifiers, VCSrc_b32:$src0,
+ let InsSDWA = (ins Src0RC32:$vdst, Src0ModSDWA:$src0_modifiers, VCSrc_b32:$src0,
clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused,
src0_sel:$src0_sel);
diff --git a/lib/Target/AMDGPU/VOP2Instructions.td b/lib/Target/AMDGPU/VOP2Instructions.td
index 20fb7f7bcab7..00e5ab3db0b7 100644
--- a/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/lib/Target/AMDGPU/VOP2Instructions.td
@@ -183,13 +183,13 @@ class VOP_MAC <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> {
let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, VGPR_32:$src2);
let Ins64 = getIns64<Src0RC64, Src1RC64, RegisterOperand<VGPR_32>, 3,
HasModifiers, Src0Mod, Src1Mod, Src2Mod>.ret;
- let InsDPP = (ins FP32InputMods:$src0_modifiers, Src0DPP:$src0,
- FP32InputMods:$src1_modifiers, Src1DPP:$src1,
+ let InsDPP = (ins Src0ModDPP:$src0_modifiers, Src0DPP:$src0,
+ Src1ModDPP:$src1_modifiers, Src1DPP:$src1,
VGPR_32:$src2, // stub argument
dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
bank_mask:$bank_mask, bound_ctrl:$bound_ctrl);
- let InsSDWA = (ins FP32InputMods:$src0_modifiers, Src0SDWA:$src0,
- FP32InputMods:$src1_modifiers, Src1SDWA:$src1,
+ let InsSDWA = (ins Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0,
+ Src1ModSDWA:$src1_modifiers, Src1SDWA:$src1,
VGPR_32:$src2, // stub argument
clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused,
src0_sel:$src0_sel, src1_sel:$src1_sel);
diff --git a/lib/Target/AMDGPU/VOPCInstructions.td b/lib/Target/AMDGPU/VOPCInstructions.td
index c431d9db801e..16a456da3c67 100644
--- a/lib/Target/AMDGPU/VOPCInstructions.td
+++ b/lib/Target/AMDGPU/VOPCInstructions.td
@@ -517,8 +517,8 @@ class VOPC_Class_Profile<list<SchedReadWrite> sched, ValueType vt> :
VOPC_Profile<sched, vt, i32> {
let Ins64 = (ins Src0Mod:$src0_modifiers, Src0RC64:$src0, Src1RC64:$src1);
let Asm64 = "$sdst, $src0_modifiers, $src1";
- let InsSDWA = (ins Src0Mod:$src0_modifiers, Src0RC64:$src0,
- Int32InputMods:$src1_modifiers, Src1RC64:$src1,
+ let InsSDWA = (ins Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0,
+ Src1ModSDWA:$src1_modifiers, Src1SDWA:$src1,
clampmod:$clamp, src0_sel:$src0_sel, src1_sel:$src1_sel);
let AsmSDWA = " vcc, $src0_modifiers, $src1_modifiers$clamp $src0_sel $src1_sel";
let HasSrc1Mods = 0;