author     Dimitry Andric <dim@FreeBSD.org>   2018-07-28 10:51:19 +0000
committer  Dimitry Andric <dim@FreeBSD.org>   2018-07-28 10:51:19 +0000
commit     eb11fae6d08f479c0799db45860a98af528fa6e7 (patch)
tree       44d492a50c8c1a7eb8e2d17ea3360ec4d066f042 /lib/Target/AMDGPU/AMDGPUISelLowering.cpp
parent     b8a2042aa938069e862750553db0e4d82d25822c (diff)
download   src-eb11fae6d08f479c0799db45860a98af528fa6e7.tar.gz
           src-eb11fae6d08f479c0799db45860a98af528fa6e7.zip
Vendor import of llvm trunk r338150 (tag: vendor/llvm/llvm-trunk-r338150)
Notes:
svn path=/vendor/llvm/dist/; revision=336809
svn path=/vendor/llvm/llvm-trunk-r338150/; revision=336814; tag=vendor/llvm/llvm-trunk-r338150
Diffstat (limited to 'lib/Target/AMDGPU/AMDGPUISelLowering.cpp')
-rw-r--r-- | lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 549 |
1 file changed, 357 insertions(+), 192 deletions(-)
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 49929441ef21..b201126c593b 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -8,7 +8,7 @@
 //===----------------------------------------------------------------------===//
 //
 /// \file
-/// \brief This is the parent TargetLowering class for hardware code gen
+/// This is the parent TargetLowering class for hardware code gen
 /// targets.
 //
 //===----------------------------------------------------------------------===//
@@ -25,9 +25,12 @@
 #include "AMDGPURegisterInfo.h"
 #include "AMDGPUSubtarget.h"
 #include "AMDGPUTargetMachine.h"
+#include "Utils/AMDGPUBaseInfo.h"
 #include "R600MachineFunctionInfo.h"
 #include "SIInstrInfo.h"
 #include "SIMachineFunctionInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/CodeGen/Analysis.h"
 #include "llvm/CodeGen/CallingConvLower.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -38,18 +41,6 @@
 #include "llvm/Support/KnownBits.h"
 using namespace llvm;
 
-static bool allocateKernArg(unsigned ValNo, MVT ValVT, MVT LocVT,
-                            CCValAssign::LocInfo LocInfo,
-                            ISD::ArgFlagsTy ArgFlags, CCState &State) {
-  MachineFunction &MF = State.getMachineFunction();
-  AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
-
-  uint64_t Offset = MFI->allocateKernArg(LocVT.getStoreSize(),
-                                         ArgFlags.getOrigAlign());
-  State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT, Offset, LocVT, LocInfo));
-  return true;
-}
-
 static bool allocateCCRegs(unsigned ValNo, MVT ValVT, MVT LocVT,
                            CCValAssign::LocInfo LocInfo,
                            ISD::ArgFlagsTy ArgFlags, CCState &State,
@@ -71,7 +62,9 @@ static bool allocateSGPRTuple(unsigned ValNo, MVT ValVT, MVT LocVT,
   case MVT::i64:
   case MVT::f64:
   case MVT::v2i32:
-  case MVT::v2f32: {
+  case MVT::v2f32:
+  case MVT::v4i16:
+  case MVT::v4f16: {
     // Up to SGPR0-SGPR39
     return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
                           &AMDGPU::SGPR_64RegClass, 20);
@@ -92,7 +85,9 @@ static bool allocateVGPRTuple(unsigned ValNo, MVT ValVT, MVT LocVT,
   case MVT::i64:
   case MVT::f64:
   case MVT::v2i32:
-  case MVT::v2f32: {
+  case MVT::v2f32:
+  case MVT::v4i16:
+  case MVT::v4f16: {
     return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
                           &AMDGPU::VReg_64RegClass, 31);
   }
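The v4i16/v4f16 cases added to allocateSGPRTuple/allocateVGPRTuple above work because four 16-bit lanes fill exactly one 64-bit register pair, the same bucket as i64, f64, v2i32 and v2f32. A minimal standalone sketch of that packing (illustrative only; the helper names are made up here, not taken from the patch):

```cpp
#include <cassert>
#include <cstdint>

// Pack four 16-bit lanes into a 64-bit register pair, low lane in the low half.
static void packV4I16(const uint16_t Lanes[4], uint32_t Pair[2]) {
  Pair[0] = uint32_t(Lanes[0]) | (uint32_t(Lanes[1]) << 16);
  Pair[1] = uint32_t(Lanes[2]) | (uint32_t(Lanes[3]) << 16);
}

// Recover lane I: two lanes per 32-bit register, 16 bits apart.
static uint16_t extractLane(const uint32_t Pair[2], unsigned I) {
  return uint16_t(Pair[I / 2] >> (16 * (I % 2)));
}

int main() {
  const uint16_t Lanes[4] = {0x1111, 0x2222, 0x3333, 0x4444};
  uint32_t Pair[2];
  packV4I16(Lanes, Pair);
  for (unsigned I = 0; I != 4; ++I)
    assert(extractLane(Pair, I) == Lanes[I]);
  return 0;
}
```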
@@ -324,10 +319,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::FLOG, MVT::f32, Custom);
   setOperationAction(ISD::FLOG10, MVT::f32, Custom);
 
-  if (Subtarget->has16BitInsts()) {
-    setOperationAction(ISD::FLOG, MVT::f16, Custom);
-    setOperationAction(ISD::FLOG10, MVT::f16, Custom);
-  }
 
   setOperationAction(ISD::FNEARBYINT, MVT::f32, Custom);
   setOperationAction(ISD::FNEARBYINT, MVT::f64, Custom);
@@ -335,10 +326,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::FREM, MVT::f32, Custom);
   setOperationAction(ISD::FREM, MVT::f64, Custom);
 
-  // v_mad_f32 does not support denormals according to some sources.
-  if (!Subtarget->hasFP32Denormals())
-    setOperationAction(ISD::FMAD, MVT::f32, Legal);
-
   // Expand to fneg + fadd.
   setOperationAction(ISD::FSUB, MVT::f64, Expand);
@@ -353,19 +340,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom);
   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i32, Custom);
 
-  if (Subtarget->getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) {
-    setOperationAction(ISD::FCEIL, MVT::f64, Custom);
-    setOperationAction(ISD::FTRUNC, MVT::f64, Custom);
-    setOperationAction(ISD::FRINT, MVT::f64, Custom);
-    setOperationAction(ISD::FFLOOR, MVT::f64, Custom);
-  }
-
-  if (!Subtarget->hasBFI()) {
-    // fcopysign can be done in a single instruction with BFI.
-    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
-    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
-  }
-
   setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
   setOperationAction(ISD::FP_TO_FP16, MVT::f64, Custom);
   setOperationAction(ISD::FP_TO_FP16, MVT::f32, Custom);
@@ -389,13 +363,13 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::BSWAP, VT, Expand);
     setOperationAction(ISD::CTTZ, VT, Expand);
     setOperationAction(ISD::CTLZ, VT, Expand);
-  }
 
-  if (!Subtarget->hasBCNT(32))
-    setOperationAction(ISD::CTPOP, MVT::i32, Expand);
-
-  if (!Subtarget->hasBCNT(64))
-    setOperationAction(ISD::CTPOP, MVT::i64, Expand);
+    // AMDGPU uses ADDC/SUBC/ADDE/SUBE
+    setOperationAction(ISD::ADDC, VT, Legal);
+    setOperationAction(ISD::SUBC, VT, Legal);
+    setOperationAction(ISD::ADDE, VT, Legal);
+    setOperationAction(ISD::SUBE, VT, Legal);
+  }
 
   // The hardware supports 32-bit ROTR, but not ROTL.
   setOperationAction(ISD::ROTL, MVT::i32, Expand);
@@ -416,28 +390,11 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::SMAX, MVT::i32, Legal);
   setOperationAction(ISD::UMAX, MVT::i32, Legal);
 
-  if (Subtarget->hasFFBH())
-    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom);
-
-  if (Subtarget->hasFFBL())
-    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Custom);
-
   setOperationAction(ISD::CTTZ, MVT::i64, Custom);
   setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Custom);
   setOperationAction(ISD::CTLZ, MVT::i64, Custom);
   setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
 
-  // We only really have 32-bit BFE instructions (and 16-bit on VI).
-  //
-  // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
-  // effort to match them now. We want this to be false for i64 cases when the
-  // extraction isn't restricted to the upper or lower half. Ideally we would
-  // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
-  // span the midpoint are probably relatively rare, so don't worry about them
-  // for now.
-  if (Subtarget->hasBFE())
-    setHasExtractBitsInsn(true);
-
   static const MVT::SimpleValueType VectorIntTypes[] = {
     MVT::v2i32, MVT::v4i32
   };
@@ -468,10 +425,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::UMUL_LOHI, VT, Expand);
     setOperationAction(ISD::SDIVREM, VT, Custom);
     setOperationAction(ISD::UDIVREM, VT, Expand);
-    setOperationAction(ISD::ADDC, VT, Expand);
-    setOperationAction(ISD::SUBC, VT, Expand);
-    setOperationAction(ISD::ADDE, VT, Expand);
-    setOperationAction(ISD::SUBE, VT, Expand);
     setOperationAction(ISD::SELECT, VT, Expand);
     setOperationAction(ISD::VSELECT, VT, Expand);
     setOperationAction(ISD::SELECT_CC, VT, Expand);
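The constructor change above makes ADDC/SUBC/ADDE/SUBE Legal for the 32-bit integer types (and drops the per-vector-type Expand lines), so wide adds can be legalized into carry chains. A minimal sketch of the idea in plain C++ (assuming the usual pairing of a carry-producing low add with a carry-consuming high add, as in GCN's V_ADD_I32/V_ADDC_U32; this is not code from the patch):

```cpp
#include <cassert>
#include <cstdint>

// 64-bit add built from two 32-bit adds linked by a carry bit.
static uint64_t add64(uint32_t ALo, uint32_t AHi, uint32_t BLo, uint32_t BHi) {
  uint32_t Lo = ALo + BLo;            // ADDC: low halves, produces a carry
  uint32_t Carry = Lo < ALo ? 1 : 0;  // carry-out of the low add
  uint32_t Hi = AHi + BHi + Carry;    // ADDE: high halves, consumes the carry
  return (uint64_t(Hi) << 32) | Lo;
}

int main() {
  uint64_t A = 0x00000001ffffffffULL, B = 0x0000000000000001ULL;
  assert(add64(uint32_t(A), uint32_t(A >> 32),
               uint32_t(B), uint32_t(B >> 32)) == A + B);
  return 0;
}
```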
@@ -546,11 +499,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
   // vector compares until that is fixed.
   setHasMultipleConditionRegisters(true);
 
-  // SI at least has hardware support for floating point exceptions, but no way
-  // of using or handling them is implemented. They are also optional in OpenCL
-  // (Section 7.3)
-  setHasFloatingPointExceptions(Subtarget->hasFPExceptions());
-
   PredictableSelectIsExpensive = false;
 
   // We want to find all load dependencies for long chains of stores to enable
@@ -573,6 +521,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
   setTargetDAGCombine(ISD::SHL);
   setTargetDAGCombine(ISD::SRA);
   setTargetDAGCombine(ISD::SRL);
+  setTargetDAGCombine(ISD::TRUNCATE);
   setTargetDAGCombine(ISD::MUL);
   setTargetDAGCombine(ISD::MULHU);
   setTargetDAGCombine(ISD::MULHS);
@@ -607,6 +556,7 @@ static bool fnegFoldsIntoOp(unsigned Opc) {
   case ISD::FNEARBYINT:
   case AMDGPUISD::RCP:
   case AMDGPUISD::RCP_LEGACY:
+  case AMDGPUISD::RCP_IFLAG:
   case AMDGPUISD::SIN_HW:
   case AMDGPUISD::FMUL_LEGACY:
   case AMDGPUISD::FMIN_LEGACY:
@@ -748,6 +698,37 @@ bool AMDGPUTargetLowering::isCheapToSpeculateCtlz() const {
   return true;
 }
 
+bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const SDNode * N) const {
+  switch (N->getOpcode()) {
+    default:
+      return false;
+    case ISD::EntryToken:
+    case ISD::TokenFactor:
+      return true;
+    case ISD::INTRINSIC_WO_CHAIN:
+    {
+      unsigned IntrID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
+      switch (IntrID) {
+        default:
+          return false;
+        case Intrinsic::amdgcn_readfirstlane:
+        case Intrinsic::amdgcn_readlane:
+          return true;
+      }
+    }
+    break;
+    case ISD::LOAD:
+    {
+      const LoadSDNode * L = dyn_cast<LoadSDNode>(N);
+      if (L->getMemOperand()->getAddrSpace()
+          == AMDGPUASI.CONSTANT_ADDRESS_32BIT)
+        return true;
+      return false;
+    }
+    break;
+  }
+}
+
 //===---------------------------------------------------------------------===//
 // Target Properties
 //===---------------------------------------------------------------------===//
@@ -832,17 +813,6 @@ bool AMDGPUTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
   return isZExtFree(Val.getValueType(), VT2);
 }
 
-// v_mad_mix* support a conversion from f16 to f32.
-//
-// There is only one special case when denormals are enabled we don't currently,
-// where this is OK to use.
-bool AMDGPUTargetLowering::isFPExtFoldable(unsigned Opcode,
-                                           EVT DestVT, EVT SrcVT) const {
-  return Opcode == ISD::FMAD && Subtarget->hasMadMixInsts() &&
-         DestVT.getScalarType() == MVT::f32 && !Subtarget->hasFP32Denormals() &&
-         SrcVT.getScalarType() == MVT::f16;
-}
-
 bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
   // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
   // limited number of native 64-bit operations. Shrinking an operation to fit
@@ -862,7 +832,7 @@ CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC,
   switch (CC) {
   case CallingConv::AMDGPU_KERNEL:
   case CallingConv::SPIR_KERNEL:
-    return CC_AMDGPU_Kernel;
+    llvm_unreachable("kernels should not be handled here");
   case CallingConv::AMDGPU_VS:
   case CallingConv::AMDGPU_GS:
   case CallingConv::AMDGPU_PS:
@@ -885,7 +855,7 @@ CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
   switch (CC) {
   case CallingConv::AMDGPU_KERNEL:
   case CallingConv::SPIR_KERNEL:
-    return CC_AMDGPU_Kernel;
+    llvm_unreachable("kernels should not be handled here");
   case CallingConv::AMDGPU_VS:
   case CallingConv::AMDGPU_GS:
   case CallingConv::AMDGPU_PS:
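isSDNodeAlwaysUniform treats amdgcn_readfirstlane and amdgcn_readlane as uniform because they broadcast a single lane's value to every lane of the wavefront. A toy model of that behavior (illustrative only; the real instruction reads the lowest active lane in hardware, and this sketch is not the intrinsic's implementation):

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

// Model: return the first active lane's value. Every lane that asks gets the
// same answer, which is why the result is wavefront-uniform.
static uint32_t readFirstLane(const std::vector<uint32_t> &LaneValues,
                              uint64_t ExecMask) {
  for (size_t Lane = 0; Lane < LaneValues.size(); ++Lane)
    if (ExecMask & (uint64_t(1) << Lane))
      return LaneValues[Lane];
  return 0; // no active lanes; undefined in practice
}

int main() {
  std::vector<uint32_t> Values = {7, 8, 9, 10};
  // Lane 0 inactive, lanes 1-3 active: every lane observes 8.
  assert(readFirstLane(Values, 0b1110) == 8);
  return 0;
}
```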
@@ -929,74 +899,118 @@ CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
 /// for each individual part is i8. We pass the memory type as LocVT to the
 /// calling convention analysis function and the register type (Ins[x].VT) as
 /// the ValVT.
-void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(CCState &State,
-                              const SmallVectorImpl<ISD::InputArg> &Ins) const {
-  for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
-    const ISD::InputArg &In = Ins[i];
-    EVT MemVT;
-
-    unsigned NumRegs = getNumRegisters(State.getContext(), In.ArgVT);
-
-    if (!Subtarget->isAmdHsaOS() &&
-        (In.ArgVT == MVT::i16 || In.ArgVT == MVT::i8 || In.ArgVT == MVT::f16)) {
-      // The ABI says the caller will extend these values to 32-bits.
-      MemVT = In.ArgVT.isInteger() ? MVT::i32 : MVT::f32;
-    } else if (NumRegs == 1) {
-      // This argument is not split, so the IR type is the memory type.
-      assert(!In.Flags.isSplit());
-      if (In.ArgVT.isExtended()) {
-        // We have an extended type, like i24, so we should just use the register type
-        MemVT = In.VT;
-      } else {
-        MemVT = In.ArgVT;
-      }
-    } else if (In.ArgVT.isVector() && In.VT.isVector() &&
-               In.ArgVT.getScalarType() == In.VT.getScalarType()) {
-      assert(In.ArgVT.getVectorNumElements() > In.VT.getVectorNumElements());
-      // We have a vector value which has been split into a vector with
-      // the same scalar type, but fewer elements. This should handle
-      // all the floating-point vector types.
-      MemVT = In.VT;
-    } else if (In.ArgVT.isVector() &&
-               In.ArgVT.getVectorNumElements() == NumRegs) {
-      // This arg has been split so that each element is stored in a separate
-      // register.
-      MemVT = In.ArgVT.getScalarType();
-    } else if (In.ArgVT.isExtended()) {
-      // We have an extended type, like i65.
-      MemVT = In.VT;
-    } else {
-      unsigned MemoryBits = In.ArgVT.getStoreSizeInBits() / NumRegs;
-      assert(In.ArgVT.getStoreSizeInBits() % NumRegs == 0);
-      if (In.VT.isInteger()) {
-        MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
-      } else if (In.VT.isVector()) {
-        assert(!In.VT.getScalarType().isFloatingPoint());
-        unsigned NumElements = In.VT.getVectorNumElements();
-        assert(MemoryBits % NumElements == 0);
-        // This vector type has been split into another vector type with
-        // a different elements size.
-        EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
-                                         MemoryBits / NumElements);
-        MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
-      } else {
-        llvm_unreachable("cannot deduce memory type.");
-      }
-    }
-
-    // Convert one element vectors to scalar.
-    if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
-      MemVT = MemVT.getScalarType();
-
-    if (MemVT.isExtended()) {
-      // This should really only happen if we have vec3 arguments
-      assert(MemVT.isVector() && MemVT.getVectorNumElements() == 3);
-      MemVT = MemVT.getPow2VectorType(State.getContext());
-    }
-
-    assert(MemVT.isSimple());
-    allocateKernArg(i, In.VT, MemVT.getSimpleVT(), CCValAssign::Full, In.Flags,
-                    State);
-  }
-}
+void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
+    CCState &State,
+    const SmallVectorImpl<ISD::InputArg> &Ins) const {
+  const MachineFunction &MF = State.getMachineFunction();
+  const Function &Fn = MF.getFunction();
+  LLVMContext &Ctx = Fn.getParent()->getContext();
+  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF);
+  const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset(Fn);
+
+  unsigned MaxAlign = 1;
+  uint64_t ExplicitArgOffset = 0;
+  const DataLayout &DL = Fn.getParent()->getDataLayout();
+
+  unsigned InIndex = 0;
+
+  for (const Argument &Arg : Fn.args()) {
+    Type *BaseArgTy = Arg.getType();
+    unsigned Align = DL.getABITypeAlignment(BaseArgTy);
+    MaxAlign = std::max(Align, MaxAlign);
+    unsigned AllocSize = DL.getTypeAllocSize(BaseArgTy);
+
+    uint64_t ArgOffset = alignTo(ExplicitArgOffset, Align) + ExplicitOffset;
+    ExplicitArgOffset = alignTo(ExplicitArgOffset, Align) + AllocSize;
+
+    // We're basically throwing away everything passed into us and starting over
+    // to get accurate in-memory offsets. The "PartOffset" is completely useless
+    // to us as computed in Ins.
+    //
+    // We also need to figure out what type legalization is trying to do to get
+    // the correct memory offsets.
+
+    SmallVector<EVT, 16> ValueVTs;
+    SmallVector<uint64_t, 16> Offsets;
+    ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, &Offsets, ArgOffset);
+
+    for (unsigned Value = 0, NumValues = ValueVTs.size();
+         Value != NumValues; ++Value) {
+      uint64_t BasePartOffset = Offsets[Value];
+
+      EVT ArgVT = ValueVTs[Value];
+      EVT MemVT = ArgVT;
+      MVT RegisterVT =
+          getRegisterTypeForCallingConv(Ctx, ArgVT);
+      unsigned NumRegs =
+          getNumRegistersForCallingConv(Ctx, ArgVT);
+
+      if (!Subtarget->isAmdHsaOS() &&
+          (ArgVT == MVT::i16 || ArgVT == MVT::i8 || ArgVT == MVT::f16)) {
+        // The ABI says the caller will extend these values to 32-bits.
+        MemVT = ArgVT.isInteger() ? MVT::i32 : MVT::f32;
+      } else if (NumRegs == 1) {
+        // This argument is not split, so the IR type is the memory type.
+        if (ArgVT.isExtended()) {
+          // We have an extended type, like i24, so we should just use the
+          // register type.
+          MemVT = RegisterVT;
+        } else {
+          MemVT = ArgVT;
+        }
+      } else if (ArgVT.isVector() && RegisterVT.isVector() &&
+                 ArgVT.getScalarType() == RegisterVT.getScalarType()) {
+        assert(ArgVT.getVectorNumElements() > RegisterVT.getVectorNumElements());
+        // We have a vector value which has been split into a vector with
+        // the same scalar type, but fewer elements. This should handle
+        // all the floating-point vector types.
+        MemVT = RegisterVT;
+      } else if (ArgVT.isVector() &&
+                 ArgVT.getVectorNumElements() == NumRegs) {
+        // This arg has been split so that each element is stored in a separate
+        // register.
+        MemVT = ArgVT.getScalarType();
+      } else if (ArgVT.isExtended()) {
+        // We have an extended type, like i65.
+        MemVT = RegisterVT;
+      } else {
+        unsigned MemoryBits = ArgVT.getStoreSizeInBits() / NumRegs;
+        assert(ArgVT.getStoreSizeInBits() % NumRegs == 0);
+        if (RegisterVT.isInteger()) {
+          MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
+        } else if (RegisterVT.isVector()) {
+          assert(!RegisterVT.getScalarType().isFloatingPoint());
+          unsigned NumElements = RegisterVT.getVectorNumElements();
+          assert(MemoryBits % NumElements == 0);
+          // This vector type has been split into another vector type with
+          // a different elements size.
+          EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
+                                           MemoryBits / NumElements);
+          MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
+        } else {
+          llvm_unreachable("cannot deduce memory type.");
+        }
+      }
+
+      // Convert one element vectors to scalar.
+      if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
+        MemVT = MemVT.getScalarType();
+
+      if (MemVT.isExtended()) {
+        // This should really only happen if we have vec3 arguments
+        assert(MemVT.isVector() && MemVT.getVectorNumElements() == 3);
+        MemVT = MemVT.getPow2VectorType(State.getContext());
+      }
+
+      unsigned PartOffset = 0;
+      for (unsigned i = 0; i != NumRegs; ++i) {
+        State.addLoc(CCValAssign::getCustomMem(InIndex++, RegisterVT,
+                                               BasePartOffset + PartOffset,
+                                               MemVT.getSimpleVT(),
+                                               CCValAssign::Full));
+        PartOffset += MemVT.getStoreSize();
+      }
+    }
+  }
+}
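The offset bookkeeping in the new analyzeFormalArgumentsCompute reduces to: round the running explicit-argument offset up to the argument's ABI alignment, take that (plus the target's explicit kernarg base, ExplicitOffset) as the argument's in-memory offset, then advance by the allocation size. A self-contained sketch with the base offset omitted for brevity (illustrative only; alignTo is re-derived here rather than LLVM's):

```cpp
#include <cassert>
#include <cstdint>

// Round Value up to the next multiple of Align (Align is a power of two here).
static uint64_t alignTo(uint64_t Value, uint64_t Align) {
  return (Value + Align - 1) / Align * Align;
}

int main() {
  // Kernel arguments (i8, i32, i64): sizes and ABI alignments in bytes.
  struct { uint64_t Size, Align; } Args[] = {{1, 1}, {4, 4}, {8, 8}};
  uint64_t Expected[] = {0, 4, 8}; // padding inserted before i32 and i64

  uint64_t Offset = 0;
  for (unsigned I = 0; I != 3; ++I) {
    uint64_t ArgOffset = alignTo(Offset, Args[I].Align); // this arg's offset
    assert(ArgOffset == Expected[I]);
    Offset = ArgOffset + Args[I].Size;                   // advance past it
  }
  return 0;
}
```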
@@ -1178,7 +1192,15 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
   GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
   const GlobalValue *GV = G->getGlobal();
 
-  if (G->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS) {
+  if (G->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS ||
+      G->getAddressSpace() == AMDGPUASI.REGION_ADDRESS) {
+    if (!MFI->isEntryFunction()) {
+      const Function &Fn = DAG.getMachineFunction().getFunction();
+      DiagnosticInfoUnsupported BadLDSDecl(
+        Fn, "local memory global used by non-kernel function", SDLoc(Op).getDebugLoc());
+      DAG.getContext()->diagnose(BadLDSDecl);
+    }
+
     // XXX: What does the value of G->getOffset() mean?
     assert(G->getOffset() == 0 &&
            "Do not know what to do with an non-zero offset");
@@ -1201,6 +1223,16 @@ SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
                                                   SelectionDAG &DAG) const {
   SmallVector<SDValue, 8> Args;
 
+  EVT VT = Op.getValueType();
+  if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+    SDLoc SL(Op);
+    SDValue Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Op.getOperand(0));
+    SDValue Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Op.getOperand(1));
+
+    SDValue BV = DAG.getBuildVector(MVT::v2i32, SL, { Lo, Hi });
+    return DAG.getNode(ISD::BITCAST, SL, VT, BV);
+  }
+
   for (const SDUse &U : Op->ops())
     DAG.ExtractVectorElements(U.get(), Args);
 
@@ -1219,7 +1251,7 @@ SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
   return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
 }
 
-/// \brief Generate Min/Max node
+/// Generate Min/Max node
 SDValue AMDGPUTargetLowering::combineFMinMaxLegacy(const SDLoc &DL, EVT VT,
                                                    SDValue LHS, SDValue RHS,
                                                    SDValue True, SDValue False,
@@ -1985,7 +2017,7 @@ SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
   const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32);
   SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);
 
-  // Extend back to to 64-bits.
+  // Extend back to 64-bits.
   SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit});
   SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);
@@ -2806,28 +2838,6 @@ SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
                      SN->getBasePtr(), SN->getMemOperand());
 }
 
-SDValue AMDGPUTargetLowering::performClampCombine(SDNode *N,
-                                                  DAGCombinerInfo &DCI) const {
-  ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
-  if (!CSrc)
-    return SDValue();
-
-  const APFloat &F = CSrc->getValueAPF();
-  APFloat Zero = APFloat::getZero(F.getSemantics());
-  APFloat::cmpResult Cmp0 = F.compare(Zero);
-  if (Cmp0 == APFloat::cmpLessThan ||
-      (Cmp0 == APFloat::cmpUnordered && Subtarget->enableDX10Clamp())) {
-    return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
-  }
-
-  APFloat One(F.getSemantics(), "1.0");
-  APFloat::cmpResult Cmp1 = F.compare(One);
-  if (Cmp1 == APFloat::cmpGreaterThan)
-    return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
-
-  return SDValue(CSrc, 0);
-}
-
 // FIXME: This should go in generic DAG combiner with an isTruncateFree check,
 // but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU
 // issues.
@@ -2903,7 +2913,7 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
   SDValue X = LHS->getOperand(0);
 
   if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 &&
-      isTypeLegal(MVT::v2i16)) {
+      isOperationLegal(ISD::BUILD_VECTOR, MVT::v2i16)) {
     // Prefer build_vector as the canonical form if packed types are legal.
     // (shl ([asz]ext i16:x), 16 -> build_vector 0, x
     SDValue Vec = DAG.getBuildVector(MVT::v2i16, SL,
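The performShlCombine hunk above prefers build_vector(0, x) over (shl ([asz]ext i16:x), 16) when a v2i16 build_vector is legal. With little-endian lane order, element 1 of a v2i16 occupies bits 16-31 of the containing i32, so the two forms denote the same 32-bit value. A quick check of that equivalence (illustrative only):

```cpp
#include <cassert>
#include <cstdint>

// Bitcast of a v2i16 to i32 on a little-endian target: element 0 is the low
// half, element 1 the high half.
static uint32_t packV2I16(uint16_t Elt0, uint16_t Elt1) {
  return uint32_t(Elt0) | (uint32_t(Elt1) << 16);
}

int main() {
  uint16_t X = 0xabcd;
  uint32_t Shifted = uint32_t(X) << 16; // (shl (zext i16:x), 16)
  assert(Shifted == packV2I16(0, X));   // build_vector 0, x
  return 0;
}
```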
@@ -3017,6 +3027,92 @@ SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
   return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildPair);
 }
 
+SDValue AMDGPUTargetLowering::performTruncateCombine(
+  SDNode *N, DAGCombinerInfo &DCI) const {
+  SDLoc SL(N);
+  SelectionDAG &DAG = DCI.DAG;
+  EVT VT = N->getValueType(0);
+  SDValue Src = N->getOperand(0);
+
+  // vt1 (truncate (bitcast (build_vector vt0:x, ...))) -> vt1 (bitcast vt0:x)
+  if (Src.getOpcode() == ISD::BITCAST) {
+    SDValue Vec = Src.getOperand(0);
+    if (Vec.getOpcode() == ISD::BUILD_VECTOR) {
+      SDValue Elt0 = Vec.getOperand(0);
+      EVT EltVT = Elt0.getValueType();
+      if (VT.getSizeInBits() <= EltVT.getSizeInBits()) {
+        if (EltVT.isFloatingPoint()) {
+          Elt0 = DAG.getNode(ISD::BITCAST, SL,
+                             EltVT.changeTypeToInteger(), Elt0);
+        }
+
+        return DAG.getNode(ISD::TRUNCATE, SL, VT, Elt0);
+      }
+    }
+  }
+
+  // Equivalent of above for accessing the high element of a vector as an
+  // integer operation.
+  // trunc (srl (bitcast (build_vector x, y))), 16 -> trunc (bitcast y)
+  if (Src.getOpcode() == ISD::SRL && !VT.isVector()) {
+    if (auto K = isConstOrConstSplat(Src.getOperand(1))) {
+      if (2 * K->getZExtValue() == Src.getValueType().getScalarSizeInBits()) {
+        SDValue BV = stripBitcast(Src.getOperand(0));
+        if (BV.getOpcode() == ISD::BUILD_VECTOR &&
+            BV.getValueType().getVectorNumElements() == 2) {
+          SDValue SrcElt = BV.getOperand(1);
+          EVT SrcEltVT = SrcElt.getValueType();
+          if (SrcEltVT.isFloatingPoint()) {
+            SrcElt = DAG.getNode(ISD::BITCAST, SL,
+                                 SrcEltVT.changeTypeToInteger(), SrcElt);
+          }
+
+          return DAG.getNode(ISD::TRUNCATE, SL, VT, SrcElt);
+        }
+      }
+    }
+  }
+
+  // Partially shrink 64-bit shifts to 32-bit if reduced to 16-bit.
+  //
+  // i16 (trunc (srl i64:x, K)), K <= 16 ->
+  //   i16 (trunc (srl (i32 (trunc x), K)))
+  if (VT.getScalarSizeInBits() < 32) {
+    EVT SrcVT = Src.getValueType();
+    if (SrcVT.getScalarSizeInBits() > 32 &&
+        (Src.getOpcode() == ISD::SRL ||
+         Src.getOpcode() == ISD::SRA ||
+         Src.getOpcode() == ISD::SHL)) {
+      SDValue Amt = Src.getOperand(1);
+      KnownBits Known;
+      DAG.computeKnownBits(Amt, Known);
+      unsigned Size = VT.getScalarSizeInBits();
+      if ((Known.isConstant() && Known.getConstant().ule(Size)) ||
+          (Known.getBitWidth() - Known.countMinLeadingZeros() <= Log2_32(Size))) {
+        EVT MidVT = VT.isVector() ?
+          EVT::getVectorVT(*DAG.getContext(), MVT::i32,
+                           VT.getVectorNumElements()) : MVT::i32;
+
+        EVT NewShiftVT = getShiftAmountTy(MidVT, DAG.getDataLayout());
+        SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MidVT,
+                                    Src.getOperand(0));
+        DCI.AddToWorklist(Trunc.getNode());
+
+        if (Amt.getValueType() != NewShiftVT) {
+          Amt = DAG.getZExtOrTrunc(Amt, SL, NewShiftVT);
+          DCI.AddToWorklist(Amt.getNode());
+        }
+
+        SDValue ShrunkShift = DAG.getNode(Src.getOpcode(), SL, MidVT,
+                                          Trunc, Amt);
+        return DAG.getNode(ISD::TRUNCATE, SL, VT, ShrunkShift);
+      }
+    }
+  }
+
+  return SDValue();
+}
+
 // We need to specifically handle i64 mul here to avoid unnecessary conversion
 // instructions. If we only match on the legalized i64 mul expansion,
 // SimplifyDemandedBits will be unable to remove them because there will be
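The last case of performTruncateCombine rests on a small arithmetic fact: if at most 16 result bits survive the truncate and the shift amount is at most 16, then bits 0-31 of the 64-bit source already determine the result, so the shift can be done in 32 bits. A spot check (illustrative only):

```cpp
#include <cassert>
#include <cstdint>

int main() {
  const uint64_t X = 0x123456789abcdef0ULL;
  for (unsigned K = 0; K <= 16; ++K) {
    // i16 (trunc (srl i64:x, K))
    uint16_t Wide = uint16_t(X >> K);
    // i16 (trunc (srl (i32 (trunc x)), K)): bits K..K+15 lie within bits 0..31
    uint16_t Narrow = uint16_t(uint32_t(X) >> K);
    assert(Wide == Narrow);
  }
  return 0;
}
```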
@@ -3058,6 +3154,17 @@ SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
 
+  // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
+  // in the source into any_extends if the result of the mul is truncated. Since
+  // we can assume the high bits are whatever we want, use the underlying value
+  // to avoid the unknown high bits from interfering.
+  if (N0.getOpcode() == ISD::ANY_EXTEND)
+    N0 = N0.getOperand(0);
+
+  if (N1.getOpcode() == ISD::ANY_EXTEND)
+    N1 = N1.getOperand(0);
+
   SDValue Mul;
 
   if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
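Stripping ANY_EXTEND from the multiply operands is safe in this context because the low bits of a product depend only on the low bits of its operands; whatever junk an any_extend leaves in the high bits cannot reach a result that is later truncated. For example (illustrative only):

```cpp
#include <cassert>
#include <cstdint>
#include <initializer_list>

int main() {
  uint32_t A = 0x9abcdef0u, B = 0x12345678u;
  for (uint64_t HiJunk : {0x0ULL, 0x1ULL, 0xdeadbeefULL}) {
    // Model any_extend: arbitrary garbage in the high 32 bits.
    uint64_t ExtA = (HiJunk << 32) | A;
    uint64_t ExtB = (HiJunk << 32) | B;
    // Truncated product is unaffected by the high bits.
    assert(uint32_t(ExtA * ExtB) == uint32_t(uint64_t(A) * uint64_t(B)));
  }
  return 0;
}
```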
@@ -3495,6 +3602,7 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
   case ISD::FSIN:
   case AMDGPUISD::RCP:
   case AMDGPUISD::RCP_LEGACY:
+  case AMDGPUISD::RCP_IFLAG:
   case AMDGPUISD::SIN_HW: {
     SDValue CvtSrc = N0.getOperand(0);
     if (CvtSrc.getOpcode() == ISD::FNEG) {
@@ -3571,6 +3679,18 @@ SDValue AMDGPUTargetLowering::performFAbsCombine(SDNode *N,
   }
 }
 
+SDValue AMDGPUTargetLowering::performRcpCombine(SDNode *N,
+                                                DAGCombinerInfo &DCI) const {
+  const auto *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
+  if (!CFP)
+    return SDValue();
+
+  // XXX - Should this flush denormals?
+  const APFloat &Val = CFP->getValueAPF();
+  APFloat One(Val.getSemantics(), "1.0");
+  return DCI.DAG.getConstantFP(One / Val, SDLoc(N), N->getValueType(0));
+}
+
 SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
                                                 DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
@@ -3617,12 +3737,13 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
     // TODO: Generalize and move to DAGCombiner
     SDValue Src = N->getOperand(0);
     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src)) {
-      assert(Src.getValueType() == MVT::i64);
-      SDLoc SL(N);
-      uint64_t CVal = C->getZExtValue();
-      return DAG.getNode(ISD::BUILD_VECTOR, SL, DestVT,
-                         DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
-                         DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
+      if (Src.getValueType() == MVT::i64) {
+        SDLoc SL(N);
+        uint64_t CVal = C->getZExtValue();
+        return DAG.getNode(ISD::BUILD_VECTOR, SL, DestVT,
+                           DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
+                           DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
+      }
     }
 
     if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Src)) {
@@ -3656,6 +3777,8 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
     return performSraCombine(N, DCI);
   }
+  case ISD::TRUNCATE:
+    return performTruncateCombine(N, DCI);
   case ISD::MUL:
     return performMulCombine(N, DCI);
   case ISD::MULHS:
@@ -3768,18 +3891,9 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
     return performLoadCombine(N, DCI);
   case ISD::STORE:
     return performStoreCombine(N, DCI);
-  case AMDGPUISD::CLAMP:
-    return performClampCombine(N, DCI);
-  case AMDGPUISD::RCP: {
-    if (const auto *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) {
-      // XXX - Should this flush denormals?
-      const APFloat &Val = CFP->getValueAPF();
-      APFloat One(Val.getSemantics(), "1.0");
-      return DAG.getConstantFP(One / Val, SDLoc(N), N->getValueType(0));
-    }
-
-    break;
-  }
+  case AMDGPUISD::RCP:
+  case AMDGPUISD::RCP_IFLAG:
+    return performRcpCombine(N, DCI);
   case ISD::AssertZext:
   case ISD::AssertSext:
     return performAssertSZExtCombine(N, DCI);
@@ -3856,9 +3970,14 @@ SDValue AMDGPUTargetLowering::loadInputValue(SelectionDAG &DAG,
 }
 
 uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
-    const AMDGPUMachineFunction *MFI, const ImplicitParameter Param) const {
-  unsigned Alignment = Subtarget->getAlignmentForImplicitArgPtr();
-  uint64_t ArgOffset = alignTo(MFI->getABIArgOffset(), Alignment);
+    const MachineFunction &MF, const ImplicitParameter Param) const {
+  const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
+  const AMDGPUSubtarget &ST =
+      AMDGPUSubtarget::get(getTargetMachine(), MF.getFunction());
+  unsigned ExplicitArgOffset = ST.getExplicitKernelArgOffset(MF.getFunction());
+  unsigned Alignment = ST.getAlignmentForImplicitArgPtr();
+  uint64_t ArgOffset = alignTo(MFI->getExplicitKernArgSize(), Alignment) +
+                       ExplicitArgOffset;
   switch (Param) {
   case GRID_DIM:
     return ArgOffset;
@@ -3907,6 +4026,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(FMED3)
   NODE_NAME_CASE(SMED3)
   NODE_NAME_CASE(UMED3)
+  NODE_NAME_CASE(FDOT2)
   NODE_NAME_CASE(URECIP)
   NODE_NAME_CASE(DIV_SCALE)
   NODE_NAME_CASE(DIV_FMAS)
@@ -3917,6 +4037,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(RSQ)
   NODE_NAME_CASE(RCP_LEGACY)
   NODE_NAME_CASE(RSQ_LEGACY)
+  NODE_NAME_CASE(RCP_IFLAG)
   NODE_NAME_CASE(FMUL_LEGACY)
   NODE_NAME_CASE(RSQ_CLAMP)
   NODE_NAME_CASE(LDEXP)
@@ -3941,6 +4062,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(MAD_I24)
   NODE_NAME_CASE(MAD_I64_I32)
   NODE_NAME_CASE(MAD_U64_U32)
+  NODE_NAME_CASE(PERM)
   NODE_NAME_CASE(TEXTURE_FETCH)
   NODE_NAME_CASE(EXPORT)
   NODE_NAME_CASE(EXPORT_DONE)
@@ -3957,6 +4079,10 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(CVT_F32_UBYTE2)
   NODE_NAME_CASE(CVT_F32_UBYTE3)
   NODE_NAME_CASE(CVT_PKRTZ_F16_F32)
+  NODE_NAME_CASE(CVT_PKNORM_I16_F32)
+  NODE_NAME_CASE(CVT_PKNORM_U16_F32)
+  NODE_NAME_CASE(CVT_PK_I16_I32)
+  NODE_NAME_CASE(CVT_PK_U16_U32)
   NODE_NAME_CASE(FP_TO_FP16)
   NODE_NAME_CASE(FP16_ZEXT)
   NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
@@ -3976,14 +4102,21 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(LOAD_CONSTANT)
   NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
   NODE_NAME_CASE(TBUFFER_STORE_FORMAT_X3)
+  NODE_NAME_CASE(TBUFFER_STORE_FORMAT_D16)
   NODE_NAME_CASE(TBUFFER_LOAD_FORMAT)
+  NODE_NAME_CASE(TBUFFER_LOAD_FORMAT_D16)
   NODE_NAME_CASE(ATOMIC_CMP_SWAP)
   NODE_NAME_CASE(ATOMIC_INC)
   NODE_NAME_CASE(ATOMIC_DEC)
+  NODE_NAME_CASE(ATOMIC_LOAD_FADD)
+  NODE_NAME_CASE(ATOMIC_LOAD_FMIN)
+  NODE_NAME_CASE(ATOMIC_LOAD_FMAX)
   NODE_NAME_CASE(BUFFER_LOAD)
   NODE_NAME_CASE(BUFFER_LOAD_FORMAT)
+  NODE_NAME_CASE(BUFFER_LOAD_FORMAT_D16)
   NODE_NAME_CASE(BUFFER_STORE)
   NODE_NAME_CASE(BUFFER_STORE_FORMAT)
+  NODE_NAME_CASE(BUFFER_STORE_FORMAT_D16)
   NODE_NAME_CASE(BUFFER_ATOMIC_SWAP)
   NODE_NAME_CASE(BUFFER_ATOMIC_ADD)
   NODE_NAME_CASE(BUFFER_ATOMIC_SUB)
@@ -3995,6 +4128,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(BUFFER_ATOMIC_OR)
   NODE_NAME_CASE(BUFFER_ATOMIC_XOR)
   NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP)
+  case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break;
   }
   return nullptr;
@@ -4108,14 +4242,45 @@ void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
     Known.Zero.setHighBits(32 - MaxValBits);
     break;
   }
+  case AMDGPUISD::PERM: {
+    ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Op.getOperand(2));
+    if (!CMask)
+      return;
+
+    KnownBits LHSKnown, RHSKnown;
+    DAG.computeKnownBits(Op.getOperand(0), LHSKnown, Depth + 1);
+    DAG.computeKnownBits(Op.getOperand(1), RHSKnown, Depth + 1);
+    unsigned Sel = CMask->getZExtValue();
+
+    for (unsigned I = 0; I < 32; I += 8) {
+      unsigned SelBits = Sel & 0xff;
+      if (SelBits < 4) {
+        SelBits *= 8;
+        Known.One |= ((RHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
+        Known.Zero |= ((RHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
+      } else if (SelBits < 7) {
+        SelBits = (SelBits & 3) * 8;
+        Known.One |= ((LHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
+        Known.Zero |= ((LHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
+      } else if (SelBits == 0x0c) {
+        Known.Zero |= 0xff << I;
+      } else if (SelBits > 0x0c) {
+        Known.One |= 0xff << I;
+      }
+      Sel >>= 8;
+    }
+    break;
+  }
   case ISD::INTRINSIC_WO_CHAIN: {
     unsigned IID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
     switch (IID) {
     case Intrinsic::amdgcn_mbcnt_lo:
     case Intrinsic::amdgcn_mbcnt_hi: {
+      const GCNSubtarget &ST =
+          DAG.getMachineFunction().getSubtarget<GCNSubtarget>();
       // These return at most the wavefront size - 1.
       unsigned Size = Op.getValueType().getSizeInBits();
-      Known.Zero.setHighBits(Size - Subtarget->getWavefrontSizeLog2());
+      Known.Zero.setHighBits(Size - ST.getWavefrontSizeLog2());
       break;
     }
 
     default:
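The new AMDGPUISD::PERM known-bits case models V_PERM_B32's byte selector: each selector byte below 4 picks a byte of the second source, 4 through 6 pick a byte of the first source, 0x0c produces 0x00, and anything above 0x0c produces 0xff. A toy evaluator for exactly the cases modeled above (illustrative only; the real instruction has further selector encodings, such as 0x07 through 0x0b, which the known-bits code deliberately leaves unmodeled):

```cpp
#include <cassert>
#include <cstdint>

static uint32_t permB32(uint32_t Src0, uint32_t Src1, uint32_t Sel) {
  uint32_t Result = 0;
  for (unsigned I = 0; I < 32; I += 8) {
    unsigned SelBits = Sel & 0xff;
    uint32_t Byte = 0;
    if (SelBits < 4)
      Byte = (Src1 >> (SelBits * 8)) & 0xff;       // 0-3: bytes of Src1
    else if (SelBits < 7)
      Byte = (Src0 >> ((SelBits & 3) * 8)) & 0xff; // 4-6: bytes of Src0
    else if (SelBits == 0x0c)
      Byte = 0x00;                                 // constant zero
    else if (SelBits > 0x0c)
      Byte = 0xff;                                 // constant all-ones
    // 0x07-0x0b: unmodeled here, as in the known-bits code.
    Result |= Byte << I;
    Sel >>= 8;
  }
  return Result;
}

int main() {
  // Selector 0x05040100: result bytes are Src1[0], Src1[1], Src0[0], Src0[1].
  assert(permB32(0xddccbbaa, 0x44332211, 0x05040100) == 0xbbaa2211);
  return 0;
}
```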