author     Dimitry Andric <dim@FreeBSD.org>   2018-07-28 10:51:19 +0000
committer  Dimitry Andric <dim@FreeBSD.org>   2018-07-28 10:51:19 +0000
commit     eb11fae6d08f479c0799db45860a98af528fa6e7 (patch)
tree       44d492a50c8c1a7eb8e2d17ea3360ec4d066f042 /lib/Target/AMDGPU/AMDGPUISelLowering.cpp
parent     b8a2042aa938069e862750553db0e4d82d25822c (diff)
download   src-eb11fae6d08f479c0799db45860a98af528fa6e7.tar.gz
           src-eb11fae6d08f479c0799db45860a98af528fa6e7.zip
Vendor import of llvm trunk r338150 (tag: vendor/llvm/llvm-trunk-r338150)
Notes:
svn path=/vendor/llvm/dist/; revision=336809
svn path=/vendor/llvm/llvm-trunk-r338150/; revision=336814; tag=vendor/llvm/llvm-trunk-r338150
Diffstat (limited to 'lib/Target/AMDGPU/AMDGPUISelLowering.cpp')
-rw-r--r-- | lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 549 |
1 file changed, 357 insertions(+), 192 deletions(-)
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 49929441ef21..b201126c593b 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -8,7 +8,7 @@
 //===----------------------------------------------------------------------===//
 //
 /// \file
-/// \brief This is the parent TargetLowering class for hardware code gen
+/// This is the parent TargetLowering class for hardware code gen
 /// targets.
 //
 //===----------------------------------------------------------------------===//
@@ -25,9 +25,12 @@
 #include "AMDGPURegisterInfo.h"
 #include "AMDGPUSubtarget.h"
 #include "AMDGPUTargetMachine.h"
+#include "Utils/AMDGPUBaseInfo.h"
 #include "R600MachineFunctionInfo.h"
 #include "SIInstrInfo.h"
 #include "SIMachineFunctionInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/CodeGen/Analysis.h"
 #include "llvm/CodeGen/CallingConvLower.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -38,18 +41,6 @@
 #include "llvm/Support/KnownBits.h"
 using namespace llvm;
 
-static bool allocateKernArg(unsigned ValNo, MVT ValVT, MVT LocVT,
-                            CCValAssign::LocInfo LocInfo,
-                            ISD::ArgFlagsTy ArgFlags, CCState &State) {
-  MachineFunction &MF = State.getMachineFunction();
-  AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
-
-  uint64_t Offset = MFI->allocateKernArg(LocVT.getStoreSize(),
-                                         ArgFlags.getOrigAlign());
-  State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT, Offset, LocVT, LocInfo));
-  return true;
-}
-
 static bool allocateCCRegs(unsigned ValNo, MVT ValVT, MVT LocVT,
                            CCValAssign::LocInfo LocInfo,
                            ISD::ArgFlagsTy ArgFlags, CCState &State,
@@ -71,7 +62,9 @@ static bool allocateSGPRTuple(unsigned ValNo, MVT ValVT, MVT LocVT,
   case MVT::i64:
   case MVT::f64:
   case MVT::v2i32:
-  case MVT::v2f32: {
+  case MVT::v2f32:
+  case MVT::v4i16:
+  case MVT::v4f16: {
     // Up to SGPR0-SGPR39
     return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
                           &AMDGPU::SGPR_64RegClass, 20);
@@ -92,7 +85,9 @@ static bool allocateVGPRTuple(unsigned ValNo, MVT ValVT, MVT LocVT,
   case MVT::i64:
   case MVT::f64:
   case MVT::v2i32:
-  case MVT::v2f32: {
+  case MVT::v2f32:
+  case MVT::v4i16:
+  case MVT::v4f16: {
     return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
                           &AMDGPU::VReg_64RegClass, 31);
   }
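The v4i16/v4f16 cases added to allocateSGPRTuple/allocateVGPRTuple above work because four 16-bit lanes fill exactly one 64-bit register pair, the same bucket as i64, f64, v2i32 and v2f32. A minimal standalone sketch of that packing (illustrative only; the helper names are made up here, not taken from the patch):

```cpp
#include <cassert>
#include <cstdint>

// Pack four 16-bit lanes into a 64-bit register pair, low lane in the low half.
static void packV4I16(const uint16_t Lanes[4], uint32_t Pair[2]) {
  Pair[0] = uint32_t(Lanes[0]) | (uint32_t(Lanes[1]) << 16);
  Pair[1] = uint32_t(Lanes[2]) | (uint32_t(Lanes[3]) << 16);
}

// Recover lane I: two lanes per 32-bit register, 16 bits apart.
static uint16_t extractLane(const uint32_t Pair[2], unsigned I) {
  return uint16_t(Pair[I / 2] >> (16 * (I % 2)));
}

int main() {
  const uint16_t Lanes[4] = {0x1111, 0x2222, 0x3333, 0x4444};
  uint32_t Pair[2];
  packV4I16(Lanes, Pair);
  for (unsigned I = 0; I != 4; ++I)
    assert(extractLane(Pair, I) == Lanes[I]);
  return 0;
}
```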
@@ -324,10 +319,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::FLOG, MVT::f32, Custom);
   setOperationAction(ISD::FLOG10, MVT::f32, Custom);
 
-  if (Subtarget->has16BitInsts()) {
-    setOperationAction(ISD::FLOG, MVT::f16, Custom);
-    setOperationAction(ISD::FLOG10, MVT::f16, Custom);
-  }
 
   setOperationAction(ISD::FNEARBYINT, MVT::f32, Custom);
   setOperationAction(ISD::FNEARBYINT, MVT::f64, Custom);
@@ -335,10 +326,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::FREM, MVT::f32, Custom);
   setOperationAction(ISD::FREM, MVT::f64, Custom);
 
-  // v_mad_f32 does not support denormals according to some sources.
-  if (!Subtarget->hasFP32Denormals())
-    setOperationAction(ISD::FMAD, MVT::f32, Legal);
-
   // Expand to fneg + fadd.
   setOperationAction(ISD::FSUB, MVT::f64, Expand);
@@ -353,19 +340,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom);
   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i32, Custom);
 
-  if (Subtarget->getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) {
-    setOperationAction(ISD::FCEIL, MVT::f64, Custom);
-    setOperationAction(ISD::FTRUNC, MVT::f64, Custom);
-    setOperationAction(ISD::FRINT, MVT::f64, Custom);
-    setOperationAction(ISD::FFLOOR, MVT::f64, Custom);
-  }
-
-  if (!Subtarget->hasBFI()) {
-    // fcopysign can be done in a single instruction with BFI.
-    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
-    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
-  }
-
   setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
   setOperationAction(ISD::FP_TO_FP16, MVT::f64, Custom);
   setOperationAction(ISD::FP_TO_FP16, MVT::f32, Custom);
@@ -389,13 +363,13 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::BSWAP, VT, Expand);
     setOperationAction(ISD::CTTZ, VT, Expand);
     setOperationAction(ISD::CTLZ, VT, Expand);
-  }
 
-  if (!Subtarget->hasBCNT(32))
-    setOperationAction(ISD::CTPOP, MVT::i32, Expand);
-
-  if (!Subtarget->hasBCNT(64))
-    setOperationAction(ISD::CTPOP, MVT::i64, Expand);
+    // AMDGPU uses ADDC/SUBC/ADDE/SUBE
+    setOperationAction(ISD::ADDC, VT, Legal);
+    setOperationAction(ISD::SUBC, VT, Legal);
+    setOperationAction(ISD::ADDE, VT, Legal);
+    setOperationAction(ISD::SUBE, VT, Legal);
+  }
 
   // The hardware supports 32-bit ROTR, but not ROTL.
   setOperationAction(ISD::ROTL, MVT::i32, Expand);
@@ -416,28 +390,11 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::SMAX, MVT::i32, Legal);
   setOperationAction(ISD::UMAX, MVT::i32, Legal);
 
-  if (Subtarget->hasFFBH())
-    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom);
-
-  if (Subtarget->hasFFBL())
-    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Custom);
-
   setOperationAction(ISD::CTTZ, MVT::i64, Custom);
   setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Custom);
   setOperationAction(ISD::CTLZ, MVT::i64, Custom);
   setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
 
-  // We only really have 32-bit BFE instructions (and 16-bit on VI).
-  //
-  // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
-  // effort to match them now. We want this to be false for i64 cases when the
-  // extraction isn't restricted to the upper or lower half. Ideally we would
-  // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
-  // span the midpoint are probably relatively rare, so don't worry about them
-  // for now.
-  if (Subtarget->hasBFE())
-    setHasExtractBitsInsn(true);
-
   static const MVT::SimpleValueType VectorIntTypes[] = {
     MVT::v2i32, MVT::v4i32
   };
@@ -468,10 +425,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::UMUL_LOHI, VT, Expand);
     setOperationAction(ISD::SDIVREM, VT, Custom);
     setOperationAction(ISD::UDIVREM, VT, Expand);
-    setOperationAction(ISD::ADDC, VT, Expand);
-    setOperationAction(ISD::SUBC, VT, Expand);
-    setOperationAction(ISD::ADDE, VT, Expand);
-    setOperationAction(ISD::SUBE, VT, Expand);
     setOperationAction(ISD::SELECT, VT, Expand);
     setOperationAction(ISD::VSELECT, VT, Expand);
     setOperationAction(ISD::SELECT_CC, VT, Expand);
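The constructor change above makes ADDC/SUBC/ADDE/SUBE Legal for the 32-bit integer types (and drops the per-vector-type Expand lines), so wide adds can be legalized into carry chains. A minimal sketch of the idea in plain C++ (assuming the usual pairing of a carry-producing low add with a carry-consuming high add, as in GCN's V_ADD_I32/V_ADDC_U32; this is not code from the patch):

```cpp
#include <cassert>
#include <cstdint>

// 64-bit add built from two 32-bit adds linked by a carry bit.
static uint64_t add64(uint32_t ALo, uint32_t AHi, uint32_t BLo, uint32_t BHi) {
  uint32_t Lo = ALo + BLo;            // ADDC: low halves, produces a carry
  uint32_t Carry = Lo < ALo ? 1 : 0;  // carry-out of the low add
  uint32_t Hi = AHi + BHi + Carry;    // ADDE: high halves, consumes the carry
  return (uint64_t(Hi) << 32) | Lo;
}

int main() {
  uint64_t A = 0x00000001ffffffffULL, B = 0x0000000000000001ULL;
  assert(add64(uint32_t(A), uint32_t(A >> 32),
               uint32_t(B), uint32_t(B >> 32)) == A + B);
  return 0;
}
```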
@@ -546,11 +499,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
   // vector compares until that is fixed.
   setHasMultipleConditionRegisters(true);
 
-  // SI at least has hardware support for floating point exceptions, but no way
-  // of using or handling them is implemented. They are also optional in OpenCL
-  // (Section 7.3)
-  setHasFloatingPointExceptions(Subtarget->hasFPExceptions());
-
   PredictableSelectIsExpensive = false;
 
   // We want to find all load dependencies for long chains of stores to enable
@@ -573,6 +521,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
   setTargetDAGCombine(ISD::SHL);
   setTargetDAGCombine(ISD::SRA);
   setTargetDAGCombine(ISD::SRL);
+  setTargetDAGCombine(ISD::TRUNCATE);
   setTargetDAGCombine(ISD::MUL);
   setTargetDAGCombine(ISD::MULHU);
   setTargetDAGCombine(ISD::MULHS);
@@ -607,6 +556,7 @@ static bool fnegFoldsIntoOp(unsigned Opc) {
   case ISD::FNEARBYINT:
   case AMDGPUISD::RCP:
   case AMDGPUISD::RCP_LEGACY:
+  case AMDGPUISD::RCP_IFLAG:
   case AMDGPUISD::SIN_HW:
   case AMDGPUISD::FMUL_LEGACY:
   case AMDGPUISD::FMIN_LEGACY:
@@ -748,6 +698,37 @@ bool AMDGPUTargetLowering::isCheapToSpeculateCtlz() const {
   return true;
 }
 
+bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const SDNode * N) const {
+  switch (N->getOpcode()) {
+    default:
+      return false;
+    case ISD::EntryToken:
+    case ISD::TokenFactor:
+      return true;
+    case ISD::INTRINSIC_WO_CHAIN:
+    {
+      unsigned IntrID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
+      switch (IntrID) {
+        default:
+          return false;
+        case Intrinsic::amdgcn_readfirstlane:
+        case Intrinsic::amdgcn_readlane:
+          return true;
+      }
+    }
+    break;
+    case ISD::LOAD:
+    {
+      const LoadSDNode * L = dyn_cast<LoadSDNode>(N);
+      if (L->getMemOperand()->getAddrSpace()
+          == AMDGPUASI.CONSTANT_ADDRESS_32BIT)
+        return true;
+      return false;
+    }
+    break;
+  }
+}
+
 //===---------------------------------------------------------------------===//
 // Target Properties
 //===---------------------------------------------------------------------===//
@@ -832,17 +813,6 @@ bool AMDGPUTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
   return isZExtFree(Val.getValueType(), VT2);
 }
 
-// v_mad_mix* support a conversion from f16 to f32.
-//
-// There is only one special case when denormals are enabled we don't currently,
-// where this is OK to use.
-bool AMDGPUTargetLowering::isFPExtFoldable(unsigned Opcode,
-                                           EVT DestVT, EVT SrcVT) const {
-  return Opcode == ISD::FMAD && Subtarget->hasMadMixInsts() &&
-         DestVT.getScalarType() == MVT::f32 && !Subtarget->hasFP32Denormals() &&
-         SrcVT.getScalarType() == MVT::f16;
-}
-
 bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
   // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
   // limited number of native 64-bit operations. Shrinking an operation to fit
@@ -862,7 +832,7 @@ CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC,
   switch (CC) {
   case CallingConv::AMDGPU_KERNEL:
   case CallingConv::SPIR_KERNEL:
-    return CC_AMDGPU_Kernel;
+    llvm_unreachable("kernels should not be handled here");
   case CallingConv::AMDGPU_VS:
   case CallingConv::AMDGPU_GS:
   case CallingConv::AMDGPU_PS:
@@ -885,7 +855,7 @@ CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
   switch (CC) {
   case CallingConv::AMDGPU_KERNEL:
   case CallingConv::SPIR_KERNEL:
-    return CC_AMDGPU_Kernel;
+    llvm_unreachable("kernels should not be handled here");
   case CallingConv::AMDGPU_VS:
   case CallingConv::AMDGPU_GS:
   case CallingConv::AMDGPU_PS:
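isSDNodeAlwaysUniform treats amdgcn_readfirstlane and amdgcn_readlane as uniform because they broadcast a single lane's value to every lane of the wavefront. A toy model of that behavior (illustrative only; the real instruction reads the lowest active lane in hardware, and this sketch is not the intrinsic's implementation):

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

// Model: return the first active lane's value. Every lane that asks gets the
// same answer, which is why the result is wavefront-uniform.
static uint32_t readFirstLane(const std::vector<uint32_t> &LaneValues,
                              uint64_t ExecMask) {
  for (size_t Lane = 0; Lane < LaneValues.size(); ++Lane)
    if (ExecMask & (uint64_t(1) << Lane))
      return LaneValues[Lane];
  return 0; // no active lanes; undefined in practice
}

int main() {
  std::vector<uint32_t> Values = {7, 8, 9, 10};
  // Lane 0 inactive, lanes 1-3 active: every lane observes 8.
  assert(readFirstLane(Values, 0b1110) == 8);
  return 0;
}
```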
@@ -929,74 +899,118 @@ CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
 /// for each individual part is i8. We pass the memory type as LocVT to the
 /// calling convention analysis function and the register type (Ins[x].VT) as
 /// the ValVT.
-void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(CCState &State,
-                              const SmallVectorImpl<ISD::InputArg> &Ins) const {
-  for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
-    const ISD::InputArg &In = Ins[i];
-    EVT MemVT;
-
-    unsigned NumRegs = getNumRegisters(State.getContext(), In.ArgVT);
-
-    if (!Subtarget->isAmdHsaOS() &&
-        (In.ArgVT == MVT::i16 || In.ArgVT == MVT::i8 || In.ArgVT == MVT::f16)) {
-      // The ABI says the caller will extend these values to 32-bits.
-      MemVT = In.ArgVT.isInteger() ? MVT::i32 : MVT::f32;
-    } else if (NumRegs == 1) {
-      // This argument is not split, so the IR type is the memory type.
-      assert(!In.Flags.isSplit());
-      if (In.ArgVT.isExtended()) {
-        // We have an extended type, like i24, so we should just use the register type
-        MemVT = In.VT;
-      } else {
-        MemVT = In.ArgVT;
-      }
-    } else if (In.ArgVT.isVector() && In.VT.isVector() &&
-               In.ArgVT.getScalarType() == In.VT.getScalarType()) {
-      assert(In.ArgVT.getVectorNumElements() > In.VT.getVectorNumElements());
-      // We have a vector value which has been split into a vector with
-      // the same scalar type, but fewer elements. This should handle
-      // all the floating-point vector types.
-      MemVT = In.VT;
-    } else if (In.ArgVT.isVector() &&
-               In.ArgVT.getVectorNumElements() == NumRegs) {
-      // This arg has been split so that each element is stored in a separate
-      // register.
-      MemVT = In.ArgVT.getScalarType();
-    } else if (In.ArgVT.isExtended()) {
-      // We have an extended type, like i65.
-      MemVT = In.VT;
-    } else {
-      unsigned MemoryBits = In.ArgVT.getStoreSizeInBits() / NumRegs;
-      assert(In.ArgVT.getStoreSizeInBits() % NumRegs == 0);
-      if (In.VT.isInteger()) {
-        MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
-      } else if (In.VT.isVector()) {
-        assert(!In.VT.getScalarType().isFloatingPoint());
-        unsigned NumElements = In.VT.getVectorNumElements();
-        assert(MemoryBits % NumElements == 0);
-        // This vector type has been split into another vector type with
-        // a different elements size.
-        EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
-                                         MemoryBits / NumElements);
-        MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
-      } else {
-        llvm_unreachable("cannot deduce memory type.");
-      }
-    }
-
-    // Convert one element vectors to scalar.
-    if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
-      MemVT = MemVT.getScalarType();
-
-    if (MemVT.isExtended()) {
-      // This should really only happen if we have vec3 arguments
-      assert(MemVT.isVector() && MemVT.getVectorNumElements() == 3);
-      MemVT = MemVT.getPow2VectorType(State.getContext());
-    }
-
-    assert(MemVT.isSimple());
-    allocateKernArg(i, In.VT, MemVT.getSimpleVT(), CCValAssign::Full, In.Flags,
-                    State);
-  }
-}
+void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
+    CCState &State,
+    const SmallVectorImpl<ISD::InputArg> &Ins) const {
+  const MachineFunction &MF = State.getMachineFunction();
+  const Function &Fn = MF.getFunction();
+  LLVMContext &Ctx = Fn.getParent()->getContext();
+  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF);
+  const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset(Fn);
+
+  unsigned MaxAlign = 1;
+  uint64_t ExplicitArgOffset = 0;
+  const DataLayout &DL = Fn.getParent()->getDataLayout();
+
+  unsigned InIndex = 0;
+
+  for (const Argument &Arg : Fn.args()) {
+    Type *BaseArgTy = Arg.getType();
+    unsigned Align = DL.getABITypeAlignment(BaseArgTy);
+    MaxAlign = std::max(Align, MaxAlign);
+    unsigned AllocSize = DL.getTypeAllocSize(BaseArgTy);
+
+    uint64_t ArgOffset = alignTo(ExplicitArgOffset, Align) + ExplicitOffset;
+    ExplicitArgOffset = alignTo(ExplicitArgOffset, Align) + AllocSize;
+
+    // We're basically throwing away everything passed into us and starting over
+    // to get accurate in-memory offsets. The "PartOffset" is completely useless
+    // to us as computed in Ins.
+    //
+    // We also need to figure out what type legalization is trying to do to get
+    // the correct memory offsets.
+
+    SmallVector<EVT, 16> ValueVTs;
+    SmallVector<uint64_t, 16> Offsets;
+    ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, &Offsets, ArgOffset);
+
+    for (unsigned Value = 0, NumValues = ValueVTs.size();
+         Value != NumValues; ++Value) {
+      uint64_t BasePartOffset = Offsets[Value];
+
+      EVT ArgVT = ValueVTs[Value];
+      EVT MemVT = ArgVT;
+      MVT RegisterVT =
+          getRegisterTypeForCallingConv(Ctx, ArgVT);
+      unsigned NumRegs =
+          getNumRegistersForCallingConv(Ctx, ArgVT);
+
+      if (!Subtarget->isAmdHsaOS() &&
+          (ArgVT == MVT::i16 || ArgVT == MVT::i8 || ArgVT == MVT::f16)) {
+        // The ABI says the caller will extend these values to 32-bits.
+        MemVT = ArgVT.isInteger() ? MVT::i32 : MVT::f32;
+      } else if (NumRegs == 1) {
+        // This argument is not split, so the IR type is the memory type.
+        if (ArgVT.isExtended()) {
+          // We have an extended type, like i24, so we should just use the
+          // register type.
+          MemVT = RegisterVT;
+        } else {
+          MemVT = ArgVT;
+        }
+      } else if (ArgVT.isVector() && RegisterVT.isVector() &&
+                 ArgVT.getScalarType() == RegisterVT.getScalarType()) {
+        assert(ArgVT.getVectorNumElements() > RegisterVT.getVectorNumElements());
+        // We have a vector value which has been split into a vector with
+        // the same scalar type, but fewer elements. This should handle
+        // all the floating-point vector types.
+        MemVT = RegisterVT;
+      } else if (ArgVT.isVector() &&
+                 ArgVT.getVectorNumElements() == NumRegs) {
+        // This arg has been split so that each element is stored in a separate
+        // register.
+        MemVT = ArgVT.getScalarType();
+      } else if (ArgVT.isExtended()) {
+        // We have an extended type, like i65.
+        MemVT = RegisterVT;
+      } else {
+        unsigned MemoryBits = ArgVT.getStoreSizeInBits() / NumRegs;
+        assert(ArgVT.getStoreSizeInBits() % NumRegs == 0);
+        if (RegisterVT.isInteger()) {
+          MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
+        } else if (RegisterVT.isVector()) {
+          assert(!RegisterVT.getScalarType().isFloatingPoint());
+          unsigned NumElements = RegisterVT.getVectorNumElements();
+          assert(MemoryBits % NumElements == 0);
+          // This vector type has been split into another vector type with
+          // a different elements size.
+          EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
+                                           MemoryBits / NumElements);
+          MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
+        } else {
+          llvm_unreachable("cannot deduce memory type.");
+        }
+      }
+
+      // Convert one element vectors to scalar.
+      if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
+        MemVT = MemVT.getScalarType();
+
+      if (MemVT.isExtended()) {
+        // This should really only happen if we have vec3 arguments
+        assert(MemVT.isVector() && MemVT.getVectorNumElements() == 3);
+        MemVT = MemVT.getPow2VectorType(State.getContext());
+      }
+
+      unsigned PartOffset = 0;
+      for (unsigned i = 0; i != NumRegs; ++i) {
+        State.addLoc(CCValAssign::getCustomMem(InIndex++, RegisterVT,
+                                               BasePartOffset + PartOffset,
+                                               MemVT.getSimpleVT(),
+                                               CCValAssign::Full));
+        PartOffset += MemVT.getStoreSize();
+      }
+    }
+  }
+}
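The offset bookkeeping in the new analyzeFormalArgumentsCompute reduces to: round the running explicit-argument offset up to the argument's ABI alignment, take that (plus the target's explicit kernarg base, ExplicitOffset) as the argument's in-memory offset, then advance by the allocation size. A self-contained sketch with the base offset omitted for brevity (illustrative only; alignTo is re-derived here rather than LLVM's):

```cpp
#include <cassert>
#include <cstdint>

// Round Value up to the next multiple of Align (Align is a power of two here).
static uint64_t alignTo(uint64_t Value, uint64_t Align) {
  return (Value + Align - 1) / Align * Align;
}

int main() {
  // Kernel arguments (i8, i32, i64): sizes and ABI alignments in bytes.
  struct { uint64_t Size, Align; } Args[] = {{1, 1}, {4, 4}, {8, 8}};
  uint64_t Expected[] = {0, 4, 8}; // padding inserted before i32 and i64

  uint64_t Offset = 0;
  for (unsigned I = 0; I != 3; ++I) {
    uint64_t ArgOffset = alignTo(Offset, Args[I].Align); // this arg's offset
    assert(ArgOffset == Expected[I]);
    Offset = ArgOffset + Args[I].Size;                   // advance past it
  }
  return 0;
}
```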
@@ -1178,7 +1192,15 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
   GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
   const GlobalValue *GV = G->getGlobal();
 
-  if (G->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS) {
+  if (G->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS ||
+      G->getAddressSpace() == AMDGPUASI.REGION_ADDRESS) {
+    if (!MFI->isEntryFunction()) {
+      const Function &Fn = DAG.getMachineFunction().getFunction();
+      DiagnosticInfoUnsupported BadLDSDecl(
+        Fn, "local memory global used by non-kernel function", SDLoc(Op).getDebugLoc());
+      DAG.getContext()->diagnose(BadLDSDecl);
+    }
+
     // XXX: What does the value of G->getOffset() mean?
     assert(G->getOffset() == 0 &&
            "Do not know what to do with an non-zero offset");
@@ -1201,6 +1223,16 @@ SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
                                                   SelectionDAG &DAG) const {
   SmallVector<SDValue, 8> Args;
 
+  EVT VT = Op.getValueType();
+  if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+    SDLoc SL(Op);
+    SDValue Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Op.getOperand(0));
+    SDValue Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Op.getOperand(1));
+
+    SDValue BV = DAG.getBuildVector(MVT::v2i32, SL, { Lo, Hi });
+    return DAG.getNode(ISD::BITCAST, SL, VT, BV);
+  }
+
   for (const SDUse &U : Op->ops())
     DAG.ExtractVectorElements(U.get(), Args);
 
@@ -1219,7 +1251,7 @@ SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
   return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
 }
 
-/// \brief Generate Min/Max node
+/// Generate Min/Max node
 SDValue AMDGPUTargetLowering::combineFMinMaxLegacy(const SDLoc &DL, EVT VT,
                                                    SDValue LHS, SDValue RHS,
                                                    SDValue True, SDValue False,
@@ -1985,7 +2017,7 @@ SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
   const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32);
   SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);
 
-  // Extend back to to 64-bits.
+  // Extend back to 64-bits.
   SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit});
   SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);
@@ -2806,28 +2838,6 @@ SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
                      SN->getBasePtr(), SN->getMemOperand());
 }
 
-SDValue AMDGPUTargetLowering::performClampCombine(SDNode *N,
-                                                  DAGCombinerInfo &DCI) const {
-  ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
-  if (!CSrc)
-    return SDValue();
-
-  const APFloat &F = CSrc->getValueAPF();
-  APFloat Zero = APFloat::getZero(F.getSemantics());
-  APFloat::cmpResult Cmp0 = F.compare(Zero);
-  if (Cmp0 == APFloat::cmpLessThan ||
-      (Cmp0 == APFloat::cmpUnordered && Subtarget->enableDX10Clamp())) {
-    return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
-  }
-
-  APFloat One(F.getSemantics(), "1.0");
-  APFloat::cmpResult Cmp1 = F.compare(One);
-  if (Cmp1 == APFloat::cmpGreaterThan)
-    return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
-
-  return SDValue(CSrc, 0);
-}
-
 // FIXME: This should go in generic DAG combiner with an isTruncateFree check,
 // but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU
 // issues.
@@ -2903,7 +2913,7 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
   SDValue X = LHS->getOperand(0);
 
   if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 &&
-      isTypeLegal(MVT::v2i16)) {
+      isOperationLegal(ISD::BUILD_VECTOR, MVT::v2i16)) {
     // Prefer build_vector as the canonical form if packed types are legal.
     // (shl ([asz]ext i16:x), 16 -> build_vector 0, x
     SDValue Vec = DAG.getBuildVector(MVT::v2i16, SL,
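The performShlCombine hunk above prefers build_vector(0, x) over (shl ([asz]ext i16:x), 16) when a v2i16 build_vector is legal. With little-endian lane order, element 1 of a v2i16 occupies bits 16-31 of the containing i32, so the two forms denote the same 32-bit value. A quick check of that equivalence (illustrative only):

```cpp
#include <cassert>
#include <cstdint>

// Bitcast of a v2i16 to i32 on a little-endian target: element 0 is the low
// half, element 1 the high half.
static uint32_t packV2I16(uint16_t Elt0, uint16_t Elt1) {
  return uint32_t(Elt0) | (uint32_t(Elt1) << 16);
}

int main() {
  uint16_t X = 0xabcd;
  uint32_t Shifted = uint32_t(X) << 16; // (shl (zext i16:x), 16)
  assert(Shifted == packV2I16(0, X));   // build_vector 0, x
  return 0;
}
```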
@@ -3017,6 +3027,92 @@ SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
   return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildPair);
 }
 
+SDValue AMDGPUTargetLowering::performTruncateCombine(
+  SDNode *N, DAGCombinerInfo &DCI) const {
+  SDLoc SL(N);
+  SelectionDAG &DAG = DCI.DAG;
+  EVT VT = N->getValueType(0);
+  SDValue Src = N->getOperand(0);
+
+  // vt1 (truncate (bitcast (build_vector vt0:x, ...))) -> vt1 (bitcast vt0:x)
+  if (Src.getOpcode() == ISD::BITCAST) {
+    SDValue Vec = Src.getOperand(0);
+    if (Vec.getOpcode() == ISD::BUILD_VECTOR) {
+      SDValue Elt0 = Vec.getOperand(0);
+      EVT EltVT = Elt0.getValueType();
+      if (VT.getSizeInBits() <= EltVT.getSizeInBits()) {
+        if (EltVT.isFloatingPoint()) {
+          Elt0 = DAG.getNode(ISD::BITCAST, SL,
+                             EltVT.changeTypeToInteger(), Elt0);
+        }
+
+        return DAG.getNode(ISD::TRUNCATE, SL, VT, Elt0);
+      }
+    }
+  }
+
+  // Equivalent of above for accessing the high element of a vector as an
+  // integer operation.
+  // trunc (srl (bitcast (build_vector x, y))), 16 -> trunc (bitcast y)
+  if (Src.getOpcode() == ISD::SRL && !VT.isVector()) {
+    if (auto K = isConstOrConstSplat(Src.getOperand(1))) {
+      if (2 * K->getZExtValue() == Src.getValueType().getScalarSizeInBits()) {
+        SDValue BV = stripBitcast(Src.getOperand(0));
+        if (BV.getOpcode() == ISD::BUILD_VECTOR &&
+            BV.getValueType().getVectorNumElements() == 2) {
+          SDValue SrcElt = BV.getOperand(1);
+          EVT SrcEltVT = SrcElt.getValueType();
+          if (SrcEltVT.isFloatingPoint()) {
+            SrcElt = DAG.getNode(ISD::BITCAST, SL,
+                                 SrcEltVT.changeTypeToInteger(), SrcElt);
+          }
+
+          return DAG.getNode(ISD::TRUNCATE, SL, VT, SrcElt);
+        }
+      }
+    }
+  }
+
+  // Partially shrink 64-bit shifts to 32-bit if reduced to 16-bit.
+  //
+  // i16 (trunc (srl i64:x, K)), K <= 16 ->
+  //   i16 (trunc (srl (i32 (trunc x), K)))
+  if (VT.getScalarSizeInBits() < 32) {
+    EVT SrcVT = Src.getValueType();
+    if (SrcVT.getScalarSizeInBits() > 32 &&
+        (Src.getOpcode() == ISD::SRL ||
+         Src.getOpcode() == ISD::SRA ||
+         Src.getOpcode() == ISD::SHL)) {
+      SDValue Amt = Src.getOperand(1);
+      KnownBits Known;
+      DAG.computeKnownBits(Amt, Known);
+      unsigned Size = VT.getScalarSizeInBits();
+      if ((Known.isConstant() && Known.getConstant().ule(Size)) ||
+          (Known.getBitWidth() - Known.countMinLeadingZeros() <= Log2_32(Size))) {
+        EVT MidVT = VT.isVector() ?
+          EVT::getVectorVT(*DAG.getContext(), MVT::i32,
+                           VT.getVectorNumElements()) : MVT::i32;
+
+        EVT NewShiftVT = getShiftAmountTy(MidVT, DAG.getDataLayout());
+        SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MidVT,
+                                    Src.getOperand(0));
+        DCI.AddToWorklist(Trunc.getNode());
+
+        if (Amt.getValueType() != NewShiftVT) {
+          Amt = DAG.getZExtOrTrunc(Amt, SL, NewShiftVT);
+          DCI.AddToWorklist(Amt.getNode());
+        }
+
+        SDValue ShrunkShift = DAG.getNode(Src.getOpcode(), SL, MidVT,
+                                          Trunc, Amt);
+        return DAG.getNode(ISD::TRUNCATE, SL, VT, ShrunkShift);
+      }
+    }
+  }
+
+  return SDValue();
+}
+
 // We need to specifically handle i64 mul here to avoid unnecessary conversion
 // instructions. If we only match on the legalized i64 mul expansion,
 // SimplifyDemandedBits will be unable to remove them because there will be
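The last case of performTruncateCombine rests on a small arithmetic fact: if at most 16 result bits survive the truncate and the shift amount is at most 16, then bits 0-31 of the 64-bit source already determine the result, so the shift can be done in 32 bits. A spot check (illustrative only):

```cpp
#include <cassert>
#include <cstdint>

int main() {
  const uint64_t X = 0x123456789abcdef0ULL;
  for (unsigned K = 0; K <= 16; ++K) {
    // i16 (trunc (srl i64:x, K))
    uint16_t Wide = uint16_t(X >> K);
    // i16 (trunc (srl (i32 (trunc x)), K)): bits K..K+15 lie within bits 0..31
    uint16_t Narrow = uint16_t(uint32_t(X) >> K);
    assert(Wide == Narrow);
  }
  return 0;
}
```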
@@ -3058,6 +3154,17 @@ SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
 
+  // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
+  // in the source into any_extends if the result of the mul is truncated. Since
+  // we can assume the high bits are whatever we want, use the underlying value
+  // to avoid the unknown high bits from interfering.
+  if (N0.getOpcode() == ISD::ANY_EXTEND)
+    N0 = N0.getOperand(0);
+
+  if (N1.getOpcode() == ISD::ANY_EXTEND)
+    N1 = N1.getOperand(0);
+
   SDValue Mul;
 
   if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
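Stripping ANY_EXTEND from the multiply operands is safe in this context because the low bits of a product depend only on the low bits of its operands; whatever junk an any_extend leaves in the high bits cannot reach a result that is later truncated. For example (illustrative only):

```cpp
#include <cassert>
#include <cstdint>
#include <initializer_list>

int main() {
  uint32_t A = 0x9abcdef0u, B = 0x12345678u;
  for (uint64_t HiJunk : {0x0ULL, 0x1ULL, 0xdeadbeefULL}) {
    // Model any_extend: arbitrary garbage in the high 32 bits.
    uint64_t ExtA = (HiJunk << 32) | A;
    uint64_t ExtB = (HiJunk << 32) | B;
    // Truncated product is unaffected by the high bits.
    assert(uint32_t(ExtA * ExtB) == uint32_t(uint64_t(A) * uint64_t(B)));
  }
  return 0;
}
```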
@@ -3495,6 +3602,7 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
   case ISD::FSIN:
   case AMDGPUISD::RCP:
   case AMDGPUISD::RCP_LEGACY:
+  case AMDGPUISD::RCP_IFLAG:
   case AMDGPUISD::SIN_HW: {
     SDValue CvtSrc = N0.getOperand(0);
     if (CvtSrc.getOpcode() == ISD::FNEG) {
@@ -3571,6 +3679,18 @@ SDValue AMDGPUTargetLowering::performFAbsCombine(SDNode *N,
   }
 }
 
+SDValue AMDGPUTargetLowering::performRcpCombine(SDNode *N,
+                                                DAGCombinerInfo &DCI) const {
+  const auto *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
+  if (!CFP)
+    return SDValue();
+
+  // XXX - Should this flush denormals?
+  const APFloat &Val = CFP->getValueAPF();
+  APFloat One(Val.getSemantics(), "1.0");
+  return DCI.DAG.getConstantFP(One / Val, SDLoc(N), N->getValueType(0));
+}
+
 SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
                                                 DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
@@ -3617,12 +3737,13 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
     // TODO: Generalize and move to DAGCombiner
     SDValue Src = N->getOperand(0);
     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src)) {
-      assert(Src.getValueType() == MVT::i64);
-      SDLoc SL(N);
-      uint64_t CVal = C->getZExtValue();
-      return DAG.getNode(ISD::BUILD_VECTOR, SL, DestVT,
-                         DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
-                         DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
+      if (Src.getValueType() == MVT::i64) {
+        SDLoc SL(N);
+        uint64_t CVal = C->getZExtValue();
+        return DAG.getNode(ISD::BUILD_VECTOR, SL, DestVT,
+                           DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
+                           DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
+      }
     }
 
     if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Src)) {
@@ -3656,6 +3777,8 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
     return performSraCombine(N, DCI);
   }
+  case ISD::TRUNCATE:
+    return performTruncateCombine(N, DCI);
   case ISD::MUL:
     return performMulCombine(N, DCI);
   case ISD::MULHS:
@@ -3768,18 +3891,9 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
     return performLoadCombine(N, DCI);
   case ISD::STORE:
     return performStoreCombine(N, DCI);
-  case AMDGPUISD::CLAMP:
-    return performClampCombine(N, DCI);
-  case AMDGPUISD::RCP: {
-    if (const auto *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) {
-      // XXX - Should this flush denormals?
-      const APFloat &Val = CFP->getValueAPF();
-      APFloat One(Val.getSemantics(), "1.0");
-      return DAG.getConstantFP(One / Val, SDLoc(N), N->getValueType(0));
-    }
-
-    break;
-  }
+  case AMDGPUISD::RCP:
+  case AMDGPUISD::RCP_IFLAG:
+    return performRcpCombine(N, DCI);
   case ISD::AssertZext:
   case ISD::AssertSext:
     return performAssertSZExtCombine(N, DCI);
@@ -3856,9 +3970,14 @@ SDValue AMDGPUTargetLowering::loadInputValue(SelectionDAG &DAG,
 }
 
 uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
-    const AMDGPUMachineFunction *MFI, const ImplicitParameter Param) const {
-  unsigned Alignment = Subtarget->getAlignmentForImplicitArgPtr();
-  uint64_t ArgOffset = alignTo(MFI->getABIArgOffset(), Alignment);
+    const MachineFunction &MF, const ImplicitParameter Param) const {
+  const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
+  const AMDGPUSubtarget &ST =
+      AMDGPUSubtarget::get(getTargetMachine(), MF.getFunction());
+  unsigned ExplicitArgOffset = ST.getExplicitKernelArgOffset(MF.getFunction());
+  unsigned Alignment = ST.getAlignmentForImplicitArgPtr();
+  uint64_t ArgOffset = alignTo(MFI->getExplicitKernArgSize(), Alignment) +
+                       ExplicitArgOffset;
   switch (Param) {
   case GRID_DIM:
     return ArgOffset;
@@ -3907,6 +4026,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(FMED3)
   NODE_NAME_CASE(SMED3)
   NODE_NAME_CASE(UMED3)
+  NODE_NAME_CASE(FDOT2)
   NODE_NAME_CASE(URECIP)
   NODE_NAME_CASE(DIV_SCALE)
   NODE_NAME_CASE(DIV_FMAS)
@@ -3917,6 +4037,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(RSQ)
   NODE_NAME_CASE(RCP_LEGACY)
   NODE_NAME_CASE(RSQ_LEGACY)
+  NODE_NAME_CASE(RCP_IFLAG)
   NODE_NAME_CASE(FMUL_LEGACY)
   NODE_NAME_CASE(RSQ_CLAMP)
   NODE_NAME_CASE(LDEXP)
@@ -3941,6 +4062,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(MAD_I24)
   NODE_NAME_CASE(MAD_I64_I32)
   NODE_NAME_CASE(MAD_U64_U32)
+  NODE_NAME_CASE(PERM)
   NODE_NAME_CASE(TEXTURE_FETCH)
   NODE_NAME_CASE(EXPORT)
   NODE_NAME_CASE(EXPORT_DONE)
@@ -3957,6 +4079,10 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(CVT_F32_UBYTE2)
   NODE_NAME_CASE(CVT_F32_UBYTE3)
   NODE_NAME_CASE(CVT_PKRTZ_F16_F32)
+  NODE_NAME_CASE(CVT_PKNORM_I16_F32)
+  NODE_NAME_CASE(CVT_PKNORM_U16_F32)
+  NODE_NAME_CASE(CVT_PK_I16_I32)
+  NODE_NAME_CASE(CVT_PK_U16_U32)
   NODE_NAME_CASE(FP_TO_FP16)
   NODE_NAME_CASE(FP16_ZEXT)
   NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
@@ -3976,14 +4102,21 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(LOAD_CONSTANT)
   NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
   NODE_NAME_CASE(TBUFFER_STORE_FORMAT_X3)
+  NODE_NAME_CASE(TBUFFER_STORE_FORMAT_D16)
   NODE_NAME_CASE(TBUFFER_LOAD_FORMAT)
+  NODE_NAME_CASE(TBUFFER_LOAD_FORMAT_D16)
   NODE_NAME_CASE(ATOMIC_CMP_SWAP)
   NODE_NAME_CASE(ATOMIC_INC)
   NODE_NAME_CASE(ATOMIC_DEC)
+  NODE_NAME_CASE(ATOMIC_LOAD_FADD)
+  NODE_NAME_CASE(ATOMIC_LOAD_FMIN)
+  NODE_NAME_CASE(ATOMIC_LOAD_FMAX)
   NODE_NAME_CASE(BUFFER_LOAD)
   NODE_NAME_CASE(BUFFER_LOAD_FORMAT)
+  NODE_NAME_CASE(BUFFER_LOAD_FORMAT_D16)
   NODE_NAME_CASE(BUFFER_STORE)
   NODE_NAME_CASE(BUFFER_STORE_FORMAT)
+  NODE_NAME_CASE(BUFFER_STORE_FORMAT_D16)
   NODE_NAME_CASE(BUFFER_ATOMIC_SWAP)
   NODE_NAME_CASE(BUFFER_ATOMIC_ADD)
   NODE_NAME_CASE(BUFFER_ATOMIC_SUB)
@@ -3995,6 +4128,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(BUFFER_ATOMIC_OR)
   NODE_NAME_CASE(BUFFER_ATOMIC_XOR)
   NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP)
+  case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break;
   }
   return nullptr;
@@ -4108,14 +4242,45 @@ void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
     Known.Zero.setHighBits(32 - MaxValBits);
     break;
   }
+  case AMDGPUISD::PERM: {
+    ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Op.getOperand(2));
+    if (!CMask)
+      return;
+
+    KnownBits LHSKnown, RHSKnown;
+    DAG.computeKnownBits(Op.getOperand(0), LHSKnown, Depth + 1);
+    DAG.computeKnownBits(Op.getOperand(1), RHSKnown, Depth + 1);
+    unsigned Sel = CMask->getZExtValue();
+
+    for (unsigned I = 0; I < 32; I += 8) {
+      unsigned SelBits = Sel & 0xff;
+      if (SelBits < 4) {
+        SelBits *= 8;
+        Known.One |= ((RHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
+        Known.Zero |= ((RHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
+      } else if (SelBits < 7) {
+        SelBits = (SelBits & 3) * 8;
+        Known.One |= ((LHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
+        Known.Zero |= ((LHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
+      } else if (SelBits == 0x0c) {
+        Known.Zero |= 0xff << I;
+      } else if (SelBits > 0x0c) {
+        Known.One |= 0xff << I;
+      }
+      Sel >>= 8;
+    }
+    break;
+  }
   case ISD::INTRINSIC_WO_CHAIN: {
     unsigned IID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
     switch (IID) {
     case Intrinsic::amdgcn_mbcnt_lo:
     case Intrinsic::amdgcn_mbcnt_hi: {
+      const GCNSubtarget &ST =
+          DAG.getMachineFunction().getSubtarget<GCNSubtarget>();
       // These return at most the wavefront size - 1.
       unsigned Size = Op.getValueType().getSizeInBits();
-      Known.Zero.setHighBits(Size - Subtarget->getWavefrontSizeLog2());
+      Known.Zero.setHighBits(Size - ST.getWavefrontSizeLog2());
       break;
     }
 
     default:
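The new AMDGPUISD::PERM known-bits case models V_PERM_B32's byte selector: each selector byte below 4 picks a byte of the second source, 4 through 6 pick a byte of the first source, 0x0c produces 0x00, and anything above 0x0c produces 0xff. A toy evaluator for exactly the cases modeled above (illustrative only; the real instruction has further selector encodings, such as 0x07 through 0x0b, which the known-bits code deliberately leaves unmodeled):

```cpp
#include <cassert>
#include <cstdint>

static uint32_t permB32(uint32_t Src0, uint32_t Src1, uint32_t Sel) {
  uint32_t Result = 0;
  for (unsigned I = 0; I < 32; I += 8) {
    unsigned SelBits = Sel & 0xff;
    uint32_t Byte = 0;
    if (SelBits < 4)
      Byte = (Src1 >> (SelBits * 8)) & 0xff;       // 0-3: bytes of Src1
    else if (SelBits < 7)
      Byte = (Src0 >> ((SelBits & 3) * 8)) & 0xff; // 4-6: bytes of Src0
    else if (SelBits == 0x0c)
      Byte = 0x00;                                 // constant zero
    else if (SelBits > 0x0c)
      Byte = 0xff;                                 // constant all-ones
    // 0x07-0x0b: unmodeled here, as in the known-bits code.
    Result |= Byte << I;
    Sel >>= 8;
  }
  return Result;
}

int main() {
  // Selector 0x05040100: result bytes are Src1[0], Src1[1], Src0[0], Src0[1].
  assert(permB32(0xddccbbaa, 0x44332211, 0x05040100) == 0xbbaa2211);
  return 0;
}
```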