author    Dimitry Andric <dim@FreeBSD.org>    2018-07-28 10:51:19 +0000
committer Dimitry Andric <dim@FreeBSD.org>    2018-07-28 10:51:19 +0000
commit    eb11fae6d08f479c0799db45860a98af528fa6e7 (patch)
tree      44d492a50c8c1a7eb8e2d17ea3360ec4d066f042 /lib/Target/AMDGPU/AMDGPUISelLowering.cpp
parent    b8a2042aa938069e862750553db0e4d82d25822c (diff)
Vendor import of llvm trunk r338150 (tag: vendor/llvm/llvm-trunk-r338150)

Notes:
    svn path=/vendor/llvm/dist/; revision=336809
    svn path=/vendor/llvm/llvm-trunk-r338150/; revision=336814; tag=vendor/llvm/llvm-trunk-r338150
Diffstat (limited to 'lib/Target/AMDGPU/AMDGPUISelLowering.cpp')
-rw-r--r--  lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 549
1 file changed, 357 insertions(+), 192 deletions(-)
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 49929441ef21..b201126c593b 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief This is the parent TargetLowering class for hardware code gen
+/// This is the parent TargetLowering class for hardware code gen
/// targets.
//
//===----------------------------------------------------------------------===//
@@ -25,9 +25,12 @@
#include "AMDGPURegisterInfo.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
+#include "Utils/AMDGPUBaseInfo.h"
#include "R600MachineFunctionInfo.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -38,18 +41,6 @@
#include "llvm/Support/KnownBits.h"
using namespace llvm;
-static bool allocateKernArg(unsigned ValNo, MVT ValVT, MVT LocVT,
- CCValAssign::LocInfo LocInfo,
- ISD::ArgFlagsTy ArgFlags, CCState &State) {
- MachineFunction &MF = State.getMachineFunction();
- AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
-
- uint64_t Offset = MFI->allocateKernArg(LocVT.getStoreSize(),
- ArgFlags.getOrigAlign());
- State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT, Offset, LocVT, LocInfo));
- return true;
-}
-
static bool allocateCCRegs(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo,
ISD::ArgFlagsTy ArgFlags, CCState &State,
@@ -71,7 +62,9 @@ static bool allocateSGPRTuple(unsigned ValNo, MVT ValVT, MVT LocVT,
case MVT::i64:
case MVT::f64:
case MVT::v2i32:
- case MVT::v2f32: {
+ case MVT::v2f32:
+ case MVT::v4i16:
+ case MVT::v4f16: {
// Up to SGPR0-SGPR39
return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
&AMDGPU::SGPR_64RegClass, 20);
@@ -92,7 +85,9 @@ static bool allocateVGPRTuple(unsigned ValNo, MVT ValVT, MVT LocVT,
case MVT::i64:
case MVT::f64:
case MVT::v2i32:
- case MVT::v2f32: {
+ case MVT::v2f32:
+ case MVT::v4i16:
+ case MVT::v4f16: {
return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
&AMDGPU::VReg_64RegClass, 31);
}
@@ -324,10 +319,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FLOG, MVT::f32, Custom);
setOperationAction(ISD::FLOG10, MVT::f32, Custom);
- if (Subtarget->has16BitInsts()) {
- setOperationAction(ISD::FLOG, MVT::f16, Custom);
- setOperationAction(ISD::FLOG10, MVT::f16, Custom);
- }
setOperationAction(ISD::FNEARBYINT, MVT::f32, Custom);
setOperationAction(ISD::FNEARBYINT, MVT::f64, Custom);
@@ -335,10 +326,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FREM, MVT::f32, Custom);
setOperationAction(ISD::FREM, MVT::f64, Custom);
- // v_mad_f32 does not support denormals according to some sources.
- if (!Subtarget->hasFP32Denormals())
- setOperationAction(ISD::FMAD, MVT::f32, Legal);
-
// Expand to fneg + fadd.
setOperationAction(ISD::FSUB, MVT::f64, Expand);
@@ -353,19 +340,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i32, Custom);
- if (Subtarget->getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) {
- setOperationAction(ISD::FCEIL, MVT::f64, Custom);
- setOperationAction(ISD::FTRUNC, MVT::f64, Custom);
- setOperationAction(ISD::FRINT, MVT::f64, Custom);
- setOperationAction(ISD::FFLOOR, MVT::f64, Custom);
- }
-
- if (!Subtarget->hasBFI()) {
- // fcopysign can be done in a single instruction with BFI.
- setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
- setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
- }
-
setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
setOperationAction(ISD::FP_TO_FP16, MVT::f64, Custom);
setOperationAction(ISD::FP_TO_FP16, MVT::f32, Custom);
@@ -389,13 +363,13 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::BSWAP, VT, Expand);
setOperationAction(ISD::CTTZ, VT, Expand);
setOperationAction(ISD::CTLZ, VT, Expand);
- }
-
- if (!Subtarget->hasBCNT(32))
- setOperationAction(ISD::CTPOP, MVT::i32, Expand);
- if (!Subtarget->hasBCNT(64))
- setOperationAction(ISD::CTPOP, MVT::i64, Expand);
+ // AMDGPU uses ADDC/SUBC/ADDE/SUBE
+ setOperationAction(ISD::ADDC, VT, Legal);
+ setOperationAction(ISD::SUBC, VT, Legal);
+ setOperationAction(ISD::ADDE, VT, Legal);
+ setOperationAction(ISD::SUBE, VT, Legal);
+ }
// The hardware supports 32-bit ROTR, but not ROTL.
setOperationAction(ISD::ROTL, MVT::i32, Expand);
@@ -416,28 +390,11 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SMAX, MVT::i32, Legal);
setOperationAction(ISD::UMAX, MVT::i32, Legal);
- if (Subtarget->hasFFBH())
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom);
-
- if (Subtarget->hasFFBL())
- setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Custom);
-
setOperationAction(ISD::CTTZ, MVT::i64, Custom);
setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Custom);
setOperationAction(ISD::CTLZ, MVT::i64, Custom);
setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
- // We only really have 32-bit BFE instructions (and 16-bit on VI).
- //
- // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
- // effort to match them now. We want this to be false for i64 cases when the
- // extraction isn't restricted to the upper or lower half. Ideally we would
- // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
- // span the midpoint are probably relatively rare, so don't worry about them
- // for now.
- if (Subtarget->hasBFE())
- setHasExtractBitsInsn(true);
-
static const MVT::SimpleValueType VectorIntTypes[] = {
MVT::v2i32, MVT::v4i32
};
@@ -468,10 +425,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::UMUL_LOHI, VT, Expand);
setOperationAction(ISD::SDIVREM, VT, Custom);
setOperationAction(ISD::UDIVREM, VT, Expand);
- setOperationAction(ISD::ADDC, VT, Expand);
- setOperationAction(ISD::SUBC, VT, Expand);
- setOperationAction(ISD::ADDE, VT, Expand);
- setOperationAction(ISD::SUBE, VT, Expand);
setOperationAction(ISD::SELECT, VT, Expand);
setOperationAction(ISD::VSELECT, VT, Expand);
setOperationAction(ISD::SELECT_CC, VT, Expand);
@@ -546,11 +499,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
// vector compares until that is fixed.
setHasMultipleConditionRegisters(true);
- // SI at least has hardware support for floating point exceptions, but no way
- // of using or handling them is implemented. They are also optional in OpenCL
- // (Section 7.3)
- setHasFloatingPointExceptions(Subtarget->hasFPExceptions());
-
PredictableSelectIsExpensive = false;
// We want to find all load dependencies for long chains of stores to enable
@@ -573,6 +521,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::SHL);
setTargetDAGCombine(ISD::SRA);
setTargetDAGCombine(ISD::SRL);
+ setTargetDAGCombine(ISD::TRUNCATE);
setTargetDAGCombine(ISD::MUL);
setTargetDAGCombine(ISD::MULHU);
setTargetDAGCombine(ISD::MULHS);
@@ -607,6 +556,7 @@ static bool fnegFoldsIntoOp(unsigned Opc) {
case ISD::FNEARBYINT:
case AMDGPUISD::RCP:
case AMDGPUISD::RCP_LEGACY:
+ case AMDGPUISD::RCP_IFLAG:
case AMDGPUISD::SIN_HW:
case AMDGPUISD::FMUL_LEGACY:
case AMDGPUISD::FMIN_LEGACY:
@@ -748,6 +698,37 @@ bool AMDGPUTargetLowering::isCheapToSpeculateCtlz() const {
return true;
}
+bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const SDNode * N) const {
+ switch (N->getOpcode()) {
+ default:
+ return false;
+ case ISD::EntryToken:
+ case ISD::TokenFactor:
+ return true;
+ case ISD::INTRINSIC_WO_CHAIN:
+ {
+ unsigned IntrID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
+ switch (IntrID) {
+ default:
+ return false;
+ case Intrinsic::amdgcn_readfirstlane:
+ case Intrinsic::amdgcn_readlane:
+ return true;
+ }
+ }
+ break;
+ case ISD::LOAD:
+ {
+ const LoadSDNode * L = dyn_cast<LoadSDNode>(N);
+ if (L->getMemOperand()->getAddrSpace()
+ == AMDGPUASI.CONSTANT_ADDRESS_32BIT)
+ return true;
+ return false;
+ }
+ break;
+ }
+}
+
//===---------------------------------------------------------------------===//
// Target Properties
//===---------------------------------------------------------------------===//
@@ -832,17 +813,6 @@ bool AMDGPUTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
return isZExtFree(Val.getValueType(), VT2);
}
-// v_mad_mix* support a conversion from f16 to f32.
-//
-// There is only one special case when denormals are enabled we don't currently,
-// where this is OK to use.
-bool AMDGPUTargetLowering::isFPExtFoldable(unsigned Opcode,
- EVT DestVT, EVT SrcVT) const {
- return Opcode == ISD::FMAD && Subtarget->hasMadMixInsts() &&
- DestVT.getScalarType() == MVT::f32 && !Subtarget->hasFP32Denormals() &&
- SrcVT.getScalarType() == MVT::f16;
-}
-
bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
// There aren't really 64-bit registers, but pairs of 32-bit ones and only a
// limited number of native 64-bit operations. Shrinking an operation to fit
@@ -862,7 +832,7 @@ CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC,
switch (CC) {
case CallingConv::AMDGPU_KERNEL:
case CallingConv::SPIR_KERNEL:
- return CC_AMDGPU_Kernel;
+ llvm_unreachable("kernels should not be handled here");
case CallingConv::AMDGPU_VS:
case CallingConv::AMDGPU_GS:
case CallingConv::AMDGPU_PS:
@@ -885,7 +855,7 @@ CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
switch (CC) {
case CallingConv::AMDGPU_KERNEL:
case CallingConv::SPIR_KERNEL:
- return CC_AMDGPU_Kernel;
+ llvm_unreachable("kernels should not be handled here");
case CallingConv::AMDGPU_VS:
case CallingConv::AMDGPU_GS:
case CallingConv::AMDGPU_PS:
@@ -929,74 +899,118 @@ CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
/// for each individual part is i8. We pass the memory type as LocVT to the
/// calling convention analysis function and the register type (Ins[x].VT) as
/// the ValVT.
-void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(CCState &State,
- const SmallVectorImpl<ISD::InputArg> &Ins) const {
- for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
- const ISD::InputArg &In = Ins[i];
- EVT MemVT;
-
- unsigned NumRegs = getNumRegisters(State.getContext(), In.ArgVT);
-
- if (!Subtarget->isAmdHsaOS() &&
- (In.ArgVT == MVT::i16 || In.ArgVT == MVT::i8 || In.ArgVT == MVT::f16)) {
- // The ABI says the caller will extend these values to 32-bits.
- MemVT = In.ArgVT.isInteger() ? MVT::i32 : MVT::f32;
- } else if (NumRegs == 1) {
- // This argument is not split, so the IR type is the memory type.
- assert(!In.Flags.isSplit());
- if (In.ArgVT.isExtended()) {
- // We have an extended type, like i24, so we should just use the register type
- MemVT = In.VT;
- } else {
- MemVT = In.ArgVT;
- }
- } else if (In.ArgVT.isVector() && In.VT.isVector() &&
- In.ArgVT.getScalarType() == In.VT.getScalarType()) {
- assert(In.ArgVT.getVectorNumElements() > In.VT.getVectorNumElements());
- // We have a vector value which has been split into a vector with
- // the same scalar type, but fewer elements. This should handle
- // all the floating-point vector types.
- MemVT = In.VT;
- } else if (In.ArgVT.isVector() &&
- In.ArgVT.getVectorNumElements() == NumRegs) {
- // This arg has been split so that each element is stored in a separate
- // register.
- MemVT = In.ArgVT.getScalarType();
- } else if (In.ArgVT.isExtended()) {
- // We have an extended type, like i65.
- MemVT = In.VT;
- } else {
- unsigned MemoryBits = In.ArgVT.getStoreSizeInBits() / NumRegs;
- assert(In.ArgVT.getStoreSizeInBits() % NumRegs == 0);
- if (In.VT.isInteger()) {
- MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
- } else if (In.VT.isVector()) {
- assert(!In.VT.getScalarType().isFloatingPoint());
- unsigned NumElements = In.VT.getVectorNumElements();
- assert(MemoryBits % NumElements == 0);
- // This vector type has been split into another vector type with
- // a different elements size.
- EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
- MemoryBits / NumElements);
- MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
+void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
+ CCState &State,
+ const SmallVectorImpl<ISD::InputArg> &Ins) const {
+ const MachineFunction &MF = State.getMachineFunction();
+ const Function &Fn = MF.getFunction();
+ LLVMContext &Ctx = Fn.getParent()->getContext();
+ const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF);
+ const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset(Fn);
+
+ unsigned MaxAlign = 1;
+ uint64_t ExplicitArgOffset = 0;
+ const DataLayout &DL = Fn.getParent()->getDataLayout();
+
+ unsigned InIndex = 0;
+
+ for (const Argument &Arg : Fn.args()) {
+ Type *BaseArgTy = Arg.getType();
+ unsigned Align = DL.getABITypeAlignment(BaseArgTy);
+ MaxAlign = std::max(Align, MaxAlign);
+ unsigned AllocSize = DL.getTypeAllocSize(BaseArgTy);
+
+ uint64_t ArgOffset = alignTo(ExplicitArgOffset, Align) + ExplicitOffset;
+ ExplicitArgOffset = alignTo(ExplicitArgOffset, Align) + AllocSize;
+
+ // We're basically throwing away everything passed into us and starting over
+ // to get accurate in-memory offsets. The "PartOffset" is completely useless
+ // to us as computed in Ins.
+ //
+ // We also need to figure out what type legalization is trying to do to get
+ // the correct memory offsets.
+
+ SmallVector<EVT, 16> ValueVTs;
+ SmallVector<uint64_t, 16> Offsets;
+ ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, &Offsets, ArgOffset);
+
+ for (unsigned Value = 0, NumValues = ValueVTs.size();
+ Value != NumValues; ++Value) {
+ uint64_t BasePartOffset = Offsets[Value];
+
+ EVT ArgVT = ValueVTs[Value];
+ EVT MemVT = ArgVT;
+ MVT RegisterVT =
+ getRegisterTypeForCallingConv(Ctx, ArgVT);
+ unsigned NumRegs =
+ getNumRegistersForCallingConv(Ctx, ArgVT);
+
+ if (!Subtarget->isAmdHsaOS() &&
+ (ArgVT == MVT::i16 || ArgVT == MVT::i8 || ArgVT == MVT::f16)) {
+ // The ABI says the caller will extend these values to 32-bits.
+ MemVT = ArgVT.isInteger() ? MVT::i32 : MVT::f32;
+ } else if (NumRegs == 1) {
+ // This argument is not split, so the IR type is the memory type.
+ if (ArgVT.isExtended()) {
+ // We have an extended type, like i24, so we should just use the
+ // register type.
+ MemVT = RegisterVT;
+ } else {
+ MemVT = ArgVT;
+ }
+ } else if (ArgVT.isVector() && RegisterVT.isVector() &&
+ ArgVT.getScalarType() == RegisterVT.getScalarType()) {
+ assert(ArgVT.getVectorNumElements() > RegisterVT.getVectorNumElements());
+ // We have a vector value which has been split into a vector with
+ // the same scalar type, but fewer elements. This should handle
+ // all the floating-point vector types.
+ MemVT = RegisterVT;
+ } else if (ArgVT.isVector() &&
+ ArgVT.getVectorNumElements() == NumRegs) {
+ // This arg has been split so that each element is stored in a separate
+ // register.
+ MemVT = ArgVT.getScalarType();
+ } else if (ArgVT.isExtended()) {
+ // We have an extended type, like i65.
+ MemVT = RegisterVT;
} else {
- llvm_unreachable("cannot deduce memory type.");
+ unsigned MemoryBits = ArgVT.getStoreSizeInBits() / NumRegs;
+ assert(ArgVT.getStoreSizeInBits() % NumRegs == 0);
+ if (RegisterVT.isInteger()) {
+ MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
+ } else if (RegisterVT.isVector()) {
+ assert(!RegisterVT.getScalarType().isFloatingPoint());
+ unsigned NumElements = RegisterVT.getVectorNumElements();
+ assert(MemoryBits % NumElements == 0);
+ // This vector type has been split into another vector type with
+ // a different elements size.
+ EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
+ MemoryBits / NumElements);
+ MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
+ } else {
+ llvm_unreachable("cannot deduce memory type.");
+ }
}
- }
- // Convert one element vectors to scalar.
- if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
- MemVT = MemVT.getScalarType();
+ // Convert one element vectors to scalar.
+ if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
+ MemVT = MemVT.getScalarType();
- if (MemVT.isExtended()) {
- // This should really only happen if we have vec3 arguments
- assert(MemVT.isVector() && MemVT.getVectorNumElements() == 3);
- MemVT = MemVT.getPow2VectorType(State.getContext());
- }
+ if (MemVT.isExtended()) {
+ // This should really only happen if we have vec3 arguments
+ assert(MemVT.isVector() && MemVT.getVectorNumElements() == 3);
+ MemVT = MemVT.getPow2VectorType(State.getContext());
+ }
- assert(MemVT.isSimple());
- allocateKernArg(i, In.VT, MemVT.getSimpleVT(), CCValAssign::Full, In.Flags,
- State);
+ unsigned PartOffset = 0;
+ for (unsigned i = 0; i != NumRegs; ++i) {
+ State.addLoc(CCValAssign::getCustomMem(InIndex++, RegisterVT,
+ BasePartOffset + PartOffset,
+ MemVT.getSimpleVT(),
+ CCValAssign::Full));
+ PartOffset += MemVT.getStoreSize();
+ }
+ }
}
}
@@ -1178,7 +1192,15 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
const GlobalValue *GV = G->getGlobal();
- if (G->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS) {
+ if (G->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS ||
+ G->getAddressSpace() == AMDGPUASI.REGION_ADDRESS) {
+ if (!MFI->isEntryFunction()) {
+ const Function &Fn = DAG.getMachineFunction().getFunction();
+ DiagnosticInfoUnsupported BadLDSDecl(
+ Fn, "local memory global used by non-kernel function", SDLoc(Op).getDebugLoc());
+ DAG.getContext()->diagnose(BadLDSDecl);
+ }
+
// XXX: What does the value of G->getOffset() mean?
assert(G->getOffset() == 0 &&
"Do not know what to do with an non-zero offset");
@@ -1201,6 +1223,16 @@ SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
SelectionDAG &DAG) const {
SmallVector<SDValue, 8> Args;
+ EVT VT = Op.getValueType();
+ if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ SDLoc SL(Op);
+ SDValue Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Op.getOperand(0));
+ SDValue Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Op.getOperand(1));
+
+ SDValue BV = DAG.getBuildVector(MVT::v2i32, SL, { Lo, Hi });
+ return DAG.getNode(ISD::BITCAST, SL, VT, BV);
+ }
+
for (const SDUse &U : Op->ops())
DAG.ExtractVectorElements(U.get(), Args);
@@ -1219,7 +1251,7 @@ SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
}
-/// \brief Generate Min/Max node
+/// Generate Min/Max node
SDValue AMDGPUTargetLowering::combineFMinMaxLegacy(const SDLoc &DL, EVT VT,
SDValue LHS, SDValue RHS,
SDValue True, SDValue False,
@@ -1985,7 +2017,7 @@ SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32);
SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);
- // Extend back to to 64-bits.
+ // Extend back to 64-bits.
SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit});
SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);
@@ -2806,28 +2838,6 @@ SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
SN->getBasePtr(), SN->getMemOperand());
}
-SDValue AMDGPUTargetLowering::performClampCombine(SDNode *N,
- DAGCombinerInfo &DCI) const {
- ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
- if (!CSrc)
- return SDValue();
-
- const APFloat &F = CSrc->getValueAPF();
- APFloat Zero = APFloat::getZero(F.getSemantics());
- APFloat::cmpResult Cmp0 = F.compare(Zero);
- if (Cmp0 == APFloat::cmpLessThan ||
- (Cmp0 == APFloat::cmpUnordered && Subtarget->enableDX10Clamp())) {
- return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
- }
-
- APFloat One(F.getSemantics(), "1.0");
- APFloat::cmpResult Cmp1 = F.compare(One);
- if (Cmp1 == APFloat::cmpGreaterThan)
- return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
-
- return SDValue(CSrc, 0);
-}
-
// FIXME: This should go in generic DAG combiner with an isTruncateFree check,
// but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU
// issues.
@@ -2903,7 +2913,7 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
SDValue X = LHS->getOperand(0);
if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 &&
- isTypeLegal(MVT::v2i16)) {
+ isOperationLegal(ISD::BUILD_VECTOR, MVT::v2i16)) {
// Prefer build_vector as the canonical form if packed types are legal.
// (shl ([asz]ext i16:x), 16 -> build_vector 0, x
SDValue Vec = DAG.getBuildVector(MVT::v2i16, SL,
@@ -3017,6 +3027,92 @@ SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildPair);
}
+SDValue AMDGPUTargetLowering::performTruncateCombine(
+ SDNode *N, DAGCombinerInfo &DCI) const {
+ SDLoc SL(N);
+ SelectionDAG &DAG = DCI.DAG;
+ EVT VT = N->getValueType(0);
+ SDValue Src = N->getOperand(0);
+
+ // vt1 (truncate (bitcast (build_vector vt0:x, ...))) -> vt1 (bitcast vt0:x)
+ if (Src.getOpcode() == ISD::BITCAST) {
+ SDValue Vec = Src.getOperand(0);
+ if (Vec.getOpcode() == ISD::BUILD_VECTOR) {
+ SDValue Elt0 = Vec.getOperand(0);
+ EVT EltVT = Elt0.getValueType();
+ if (VT.getSizeInBits() <= EltVT.getSizeInBits()) {
+ if (EltVT.isFloatingPoint()) {
+ Elt0 = DAG.getNode(ISD::BITCAST, SL,
+ EltVT.changeTypeToInteger(), Elt0);
+ }
+
+ return DAG.getNode(ISD::TRUNCATE, SL, VT, Elt0);
+ }
+ }
+ }
+
+ // Equivalent of above for accessing the high element of a vector as an
+ // integer operation.
+ // trunc (srl (bitcast (build_vector x, y))), 16 -> trunc (bitcast y)
+ if (Src.getOpcode() == ISD::SRL && !VT.isVector()) {
+ if (auto K = isConstOrConstSplat(Src.getOperand(1))) {
+ if (2 * K->getZExtValue() == Src.getValueType().getScalarSizeInBits()) {
+ SDValue BV = stripBitcast(Src.getOperand(0));
+ if (BV.getOpcode() == ISD::BUILD_VECTOR &&
+ BV.getValueType().getVectorNumElements() == 2) {
+ SDValue SrcElt = BV.getOperand(1);
+ EVT SrcEltVT = SrcElt.getValueType();
+ if (SrcEltVT.isFloatingPoint()) {
+ SrcElt = DAG.getNode(ISD::BITCAST, SL,
+ SrcEltVT.changeTypeToInteger(), SrcElt);
+ }
+
+ return DAG.getNode(ISD::TRUNCATE, SL, VT, SrcElt);
+ }
+ }
+ }
+ }
+
+ // Partially shrink 64-bit shifts to 32-bit if reduced to 16-bit.
+ //
+ // i16 (trunc (srl i64:x, K)), K <= 16 ->
+ // i16 (trunc (srl (i32 (trunc x), K)))
+ if (VT.getScalarSizeInBits() < 32) {
+ EVT SrcVT = Src.getValueType();
+ if (SrcVT.getScalarSizeInBits() > 32 &&
+ (Src.getOpcode() == ISD::SRL ||
+ Src.getOpcode() == ISD::SRA ||
+ Src.getOpcode() == ISD::SHL)) {
+ SDValue Amt = Src.getOperand(1);
+ KnownBits Known;
+ DAG.computeKnownBits(Amt, Known);
+ unsigned Size = VT.getScalarSizeInBits();
+ if ((Known.isConstant() && Known.getConstant().ule(Size)) ||
+ (Known.getBitWidth() - Known.countMinLeadingZeros() <= Log2_32(Size))) {
+ EVT MidVT = VT.isVector() ?
+ EVT::getVectorVT(*DAG.getContext(), MVT::i32,
+ VT.getVectorNumElements()) : MVT::i32;
+
+ EVT NewShiftVT = getShiftAmountTy(MidVT, DAG.getDataLayout());
+ SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MidVT,
+ Src.getOperand(0));
+ DCI.AddToWorklist(Trunc.getNode());
+
+ if (Amt.getValueType() != NewShiftVT) {
+ Amt = DAG.getZExtOrTrunc(Amt, SL, NewShiftVT);
+ DCI.AddToWorklist(Amt.getNode());
+ }
+
+ SDValue ShrunkShift = DAG.getNode(Src.getOpcode(), SL, MidVT,
+ Trunc, Amt);
+ return DAG.getNode(ISD::TRUNCATE, SL, VT, ShrunkShift);
+ }
+ }
+ }
+
+ return SDValue();
+}
+
// We need to specifically handle i64 mul here to avoid unnecessary conversion
// instructions. If we only match on the legalized i64 mul expansion,
// SimplifyDemandedBits will be unable to remove them because there will be
@@ -3058,6 +3154,17 @@ SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
+
+ // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
+ // in the source into any_extends if the result of the mul is truncated. Since
+ // we can assume the high bits are whatever we want, use the underlying value
+ // to avoid the unknown high bits from interfering.
+ if (N0.getOpcode() == ISD::ANY_EXTEND)
+ N0 = N0.getOperand(0);
+
+ if (N1.getOpcode() == ISD::ANY_EXTEND)
+ N1 = N1.getOperand(0);
+
SDValue Mul;
if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
@@ -3495,6 +3602,7 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
case ISD::FSIN:
case AMDGPUISD::RCP:
case AMDGPUISD::RCP_LEGACY:
+ case AMDGPUISD::RCP_IFLAG:
case AMDGPUISD::SIN_HW: {
SDValue CvtSrc = N0.getOperand(0);
if (CvtSrc.getOpcode() == ISD::FNEG) {
@@ -3571,6 +3679,18 @@ SDValue AMDGPUTargetLowering::performFAbsCombine(SDNode *N,
}
}
+SDValue AMDGPUTargetLowering::performRcpCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ const auto *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
+ if (!CFP)
+ return SDValue();
+
+ // XXX - Should this flush denormals?
+ const APFloat &Val = CFP->getValueAPF();
+ APFloat One(Val.getSemantics(), "1.0");
+ return DCI.DAG.getConstantFP(One / Val, SDLoc(N), N->getValueType(0));
+}
+
SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
@@ -3617,12 +3737,13 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
// TODO: Generalize and move to DAGCombiner
SDValue Src = N->getOperand(0);
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src)) {
- assert(Src.getValueType() == MVT::i64);
- SDLoc SL(N);
- uint64_t CVal = C->getZExtValue();
- return DAG.getNode(ISD::BUILD_VECTOR, SL, DestVT,
- DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
- DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
+ if (Src.getValueType() == MVT::i64) {
+ SDLoc SL(N);
+ uint64_t CVal = C->getZExtValue();
+ return DAG.getNode(ISD::BUILD_VECTOR, SL, DestVT,
+ DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
+ DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
+ }
}
if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Src)) {
@@ -3656,6 +3777,8 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
return performSraCombine(N, DCI);
}
+ case ISD::TRUNCATE:
+ return performTruncateCombine(N, DCI);
case ISD::MUL:
return performMulCombine(N, DCI);
case ISD::MULHS:
@@ -3768,18 +3891,9 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
return performLoadCombine(N, DCI);
case ISD::STORE:
return performStoreCombine(N, DCI);
- case AMDGPUISD::CLAMP:
- return performClampCombine(N, DCI);
- case AMDGPUISD::RCP: {
- if (const auto *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) {
- // XXX - Should this flush denormals?
- const APFloat &Val = CFP->getValueAPF();
- APFloat One(Val.getSemantics(), "1.0");
- return DAG.getConstantFP(One / Val, SDLoc(N), N->getValueType(0));
- }
-
- break;
- }
+ case AMDGPUISD::RCP:
+ case AMDGPUISD::RCP_IFLAG:
+ return performRcpCombine(N, DCI);
case ISD::AssertZext:
case ISD::AssertSext:
return performAssertSZExtCombine(N, DCI);
@@ -3856,9 +3970,14 @@ SDValue AMDGPUTargetLowering::loadInputValue(SelectionDAG &DAG,
}
uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
- const AMDGPUMachineFunction *MFI, const ImplicitParameter Param) const {
- unsigned Alignment = Subtarget->getAlignmentForImplicitArgPtr();
- uint64_t ArgOffset = alignTo(MFI->getABIArgOffset(), Alignment);
+ const MachineFunction &MF, const ImplicitParameter Param) const {
+ const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
+ const AMDGPUSubtarget &ST =
+ AMDGPUSubtarget::get(getTargetMachine(), MF.getFunction());
+ unsigned ExplicitArgOffset = ST.getExplicitKernelArgOffset(MF.getFunction());
+ unsigned Alignment = ST.getAlignmentForImplicitArgPtr();
+ uint64_t ArgOffset = alignTo(MFI->getExplicitKernArgSize(), Alignment) +
+ ExplicitArgOffset;
switch (Param) {
case GRID_DIM:
return ArgOffset;
@@ -3907,6 +4026,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(FMED3)
NODE_NAME_CASE(SMED3)
NODE_NAME_CASE(UMED3)
+ NODE_NAME_CASE(FDOT2)
NODE_NAME_CASE(URECIP)
NODE_NAME_CASE(DIV_SCALE)
NODE_NAME_CASE(DIV_FMAS)
@@ -3917,6 +4037,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(RSQ)
NODE_NAME_CASE(RCP_LEGACY)
NODE_NAME_CASE(RSQ_LEGACY)
+ NODE_NAME_CASE(RCP_IFLAG)
NODE_NAME_CASE(FMUL_LEGACY)
NODE_NAME_CASE(RSQ_CLAMP)
NODE_NAME_CASE(LDEXP)
@@ -3941,6 +4062,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(MAD_I24)
NODE_NAME_CASE(MAD_I64_I32)
NODE_NAME_CASE(MAD_U64_U32)
+ NODE_NAME_CASE(PERM)
NODE_NAME_CASE(TEXTURE_FETCH)
NODE_NAME_CASE(EXPORT)
NODE_NAME_CASE(EXPORT_DONE)
@@ -3957,6 +4079,10 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(CVT_F32_UBYTE2)
NODE_NAME_CASE(CVT_F32_UBYTE3)
NODE_NAME_CASE(CVT_PKRTZ_F16_F32)
+ NODE_NAME_CASE(CVT_PKNORM_I16_F32)
+ NODE_NAME_CASE(CVT_PKNORM_U16_F32)
+ NODE_NAME_CASE(CVT_PK_I16_I32)
+ NODE_NAME_CASE(CVT_PK_U16_U32)
NODE_NAME_CASE(FP_TO_FP16)
NODE_NAME_CASE(FP16_ZEXT)
NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
@@ -3976,14 +4102,21 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(LOAD_CONSTANT)
NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
NODE_NAME_CASE(TBUFFER_STORE_FORMAT_X3)
+ NODE_NAME_CASE(TBUFFER_STORE_FORMAT_D16)
NODE_NAME_CASE(TBUFFER_LOAD_FORMAT)
+ NODE_NAME_CASE(TBUFFER_LOAD_FORMAT_D16)
NODE_NAME_CASE(ATOMIC_CMP_SWAP)
NODE_NAME_CASE(ATOMIC_INC)
NODE_NAME_CASE(ATOMIC_DEC)
+ NODE_NAME_CASE(ATOMIC_LOAD_FADD)
+ NODE_NAME_CASE(ATOMIC_LOAD_FMIN)
+ NODE_NAME_CASE(ATOMIC_LOAD_FMAX)
NODE_NAME_CASE(BUFFER_LOAD)
NODE_NAME_CASE(BUFFER_LOAD_FORMAT)
+ NODE_NAME_CASE(BUFFER_LOAD_FORMAT_D16)
NODE_NAME_CASE(BUFFER_STORE)
NODE_NAME_CASE(BUFFER_STORE_FORMAT)
+ NODE_NAME_CASE(BUFFER_STORE_FORMAT_D16)
NODE_NAME_CASE(BUFFER_ATOMIC_SWAP)
NODE_NAME_CASE(BUFFER_ATOMIC_ADD)
NODE_NAME_CASE(BUFFER_ATOMIC_SUB)
@@ -3995,6 +4128,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(BUFFER_ATOMIC_OR)
NODE_NAME_CASE(BUFFER_ATOMIC_XOR)
NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP)
+
case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break;
}
return nullptr;
@@ -4108,14 +4242,45 @@ void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
Known.Zero.setHighBits(32 - MaxValBits);
break;
}
+ case AMDGPUISD::PERM: {
+ ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Op.getOperand(2));
+ if (!CMask)
+ return;
+
+ KnownBits LHSKnown, RHSKnown;
+ DAG.computeKnownBits(Op.getOperand(0), LHSKnown, Depth + 1);
+ DAG.computeKnownBits(Op.getOperand(1), RHSKnown, Depth + 1);
+ unsigned Sel = CMask->getZExtValue();
+
+ for (unsigned I = 0; I < 32; I += 8) {
+ unsigned SelBits = Sel & 0xff;
+ if (SelBits < 4) {
+ SelBits *= 8;
+ Known.One |= ((RHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
+ Known.Zero |= ((RHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
+ } else if (SelBits < 7) {
+ SelBits = (SelBits & 3) * 8;
+ Known.One |= ((LHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
+ Known.Zero |= ((LHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
+ } else if (SelBits == 0x0c) {
+ Known.Zero |= 0xff << I;
+ } else if (SelBits > 0x0c) {
+ Known.One |= 0xff << I;
+ }
+ Sel >>= 8;
+ }
+ break;
+ }
case ISD::INTRINSIC_WO_CHAIN: {
unsigned IID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
switch (IID) {
case Intrinsic::amdgcn_mbcnt_lo:
case Intrinsic::amdgcn_mbcnt_hi: {
+ const GCNSubtarget &ST =
+ DAG.getMachineFunction().getSubtarget<GCNSubtarget>();
// These return at most the wavefront size - 1.
unsigned Size = Op.getValueType().getSizeInBits();
- Known.Zero.setHighBits(Size - Subtarget->getWavefrontSizeLog2());
+ Known.Zero.setHighBits(Size - ST.getWavefrontSizeLog2());
break;
}
default: