Diffstat (limited to 'contrib/llvm/lib/Target/AMDGPU/SIISelLowering.cpp')
-rw-r--r-- | contrib/llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 2302
1 file changed, 1703 insertions, 599 deletions
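One change below that is easy to misread is in lowerKernargMemParameter: sub-dword kernel arguments with less than dword alignment are now loaded by fetching the naturally aligned 32-bit dword that contains them and shifting the wanted bytes into place, instead of emitting a narrow extending load (the diff's comment notes the wider load can hopefully be merged with the previous argument's load). Below is a minimal scalar model of that arithmetic, written as ordinary C++ rather than SelectionDAG code; the helper name and test values are illustrative, not from the commit.

#include <cassert>
#include <cstdint>
#include <cstring>

// Model of the aligned-dword trick: load the i32 containing the argument,
// shift the argument's bytes down, then truncate to the argument's width.
// Assumes the little-endian kernarg layout the lowering relies on.
static uint16_t loadI16KernArg(const uint8_t *KernArg, uint64_t Offset) {
  uint64_t AlignDownOffset = Offset & ~uint64_t(3); // alignDown(Offset, 4)
  uint64_t OffsetDiff = Offset - AlignDownOffset;   // 0..3 bytes into the dword

  uint32_t Dword;
  std::memcpy(&Dword, KernArg + AlignDownOffset, 4); // single dword-aligned load

  // SRL by OffsetDiff * 8, then TRUNCATE to i16.
  return static_cast<uint16_t>(Dword >> (OffsetDiff * 8));
}

int main() {
  const uint8_t Args[8] = {0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88};
  // A 16-bit argument at byte offset 6 lives in the dword at offset 4 and is
  // recovered by shifting right 16 bits: bytes 0x77, 0x88 -> 0x8877.
  assert(loadI16KernArg(Args, 6) == 0x8877);
  return 0;
}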
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 41ca7fe8bfaa..5b7fc2656a20 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief Custom DAG lowering for SI +/// Custom DAG lowering for SI // //===----------------------------------------------------------------------===// @@ -26,6 +26,7 @@ #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" #include "SIRegisterInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/APFloat.h" #include "llvm/ADT/APInt.h" @@ -49,7 +50,6 @@ #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/MachineValueType.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/SelectionDAGNodes.h" #include "llvm/CodeGen/TargetCallingConv.h" @@ -73,6 +73,7 @@ #include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/KnownBits.h" +#include "llvm/Support/MachineValueType.h" #include "llvm/Support/MathExtras.h" #include "llvm/Target/TargetOptions.h" #include <cassert> @@ -111,8 +112,9 @@ static unsigned findFirstFreeSGPR(CCState &CCInfo) { } SITargetLowering::SITargetLowering(const TargetMachine &TM, - const SISubtarget &STI) - : AMDGPUTargetLowering(TM, STI) { + const GCNSubtarget &STI) + : AMDGPUTargetLowering(TM, STI), + Subtarget(&STI) { addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass); addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass); @@ -138,14 +140,15 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, if (Subtarget->has16BitInsts()) { addRegisterClass(MVT::i16, &AMDGPU::SReg_32_XM0RegClass); addRegisterClass(MVT::f16, &AMDGPU::SReg_32_XM0RegClass); - } - if (Subtarget->hasVOP3PInsts()) { + // Unless there are also VOP3P operations, not operations are really legal. 
addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32_XM0RegClass); addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32_XM0RegClass); + addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass); + addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass); } - computeRegisterProperties(STI.getRegisterInfo()); + computeRegisterProperties(Subtarget->getRegisterInfo()); // We need to custom lower vector stores from local memory setOperationAction(ISD::LOAD, MVT::v2i32, Custom); @@ -173,7 +176,6 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); - setOperationAction(ISD::ConstantPool, MVT::v2i64, Expand); setOperationAction(ISD::SELECT, MVT::i1, Promote); setOperationAction(ISD::SELECT, MVT::i64, Custom); @@ -208,11 +210,14 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2i16, Custom); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f16, Custom); + setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2f16, Custom); + setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4f16, Custom); setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_VOID, MVT::v2i16, Custom); setOperationAction(ISD::INTRINSIC_VOID, MVT::v2f16, Custom); + setOperationAction(ISD::INTRINSIC_VOID, MVT::v4f16, Custom); setOperationAction(ISD::BRCOND, MVT::Other, Custom); setOperationAction(ISD::BR_CC, MVT::i1, Expand); @@ -232,13 +237,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::SUBCARRY, MVT::i64, Legal); #endif - //setOperationAction(ISD::ADDC, MVT::i64, Expand); - //setOperationAction(ISD::SUBC, MVT::i64, Expand); - // We only support LOAD/STORE and vector manipulation ops for vectors // with > 4 elements. for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32, - MVT::v2i64, MVT::v2f64}) { + MVT::v2i64, MVT::v2f64, MVT::v4i16, MVT::v4f16 }) { for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) { switch (Op) { case ISD::LOAD: @@ -261,6 +263,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, } } + setOperationAction(ISD::FP_EXTEND, MVT::v4f32, Expand); + // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that // is expanded to avoid having two separate loops in case the index is a VGPR. @@ -285,12 +289,30 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand); + setOperationAction(ISD::BUILD_VECTOR, MVT::v4f16, Custom); + setOperationAction(ISD::BUILD_VECTOR, MVT::v4i16, Custom); + // Avoid stack access for these. // TODO: Generalize to more vector types. 
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i16, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f16, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f16, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i8, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i8, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i8, Custom); + + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i8, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i8, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i8, Custom); + + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i16, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f16, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f16, Custom); // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, // and output demarshalling @@ -302,7 +324,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i32, Expand); setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i64, Expand); - if (getSubtarget()->hasFlatAddressSpace()) { + if (Subtarget->hasFlatAddressSpace()) { setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom); setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom); } @@ -315,13 +337,56 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::TRAP, MVT::Other, Custom); setOperationAction(ISD::DEBUGTRAP, MVT::Other, Custom); + if (Subtarget->has16BitInsts()) { + setOperationAction(ISD::FLOG, MVT::f16, Custom); + setOperationAction(ISD::FLOG10, MVT::f16, Custom); + } + + // v_mad_f32 does not support denormals according to some sources. + if (!Subtarget->hasFP32Denormals()) + setOperationAction(ISD::FMAD, MVT::f32, Legal); + + if (!Subtarget->hasBFI()) { + // fcopysign can be done in a single instruction with BFI. + setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); + setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); + } + + if (!Subtarget->hasBCNT(32)) + setOperationAction(ISD::CTPOP, MVT::i32, Expand); + + if (!Subtarget->hasBCNT(64)) + setOperationAction(ISD::CTPOP, MVT::i64, Expand); + + if (Subtarget->hasFFBH()) + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom); + + if (Subtarget->hasFFBL()) + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Custom); + + // We only really have 32-bit BFE instructions (and 16-bit on VI). + // + // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any + // effort to match them now. We want this to be false for i64 cases when the + // extraction isn't restricted to the upper or lower half. Ideally we would + // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that + // span the midpoint are probably relatively rare, so don't worry about them + // for now. 
+ if (Subtarget->hasBFE()) + setHasExtractBitsInsn(true); + setOperationAction(ISD::FMINNUM, MVT::f64, Legal); setOperationAction(ISD::FMAXNUM, MVT::f64, Legal); - if (Subtarget->getGeneration() >= SISubtarget::SEA_ISLANDS) { + if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { setOperationAction(ISD::FTRUNC, MVT::f64, Legal); setOperationAction(ISD::FCEIL, MVT::f64, Legal); setOperationAction(ISD::FRINT, MVT::f64, Legal); + } else { + setOperationAction(ISD::FCEIL, MVT::f64, Custom); + setOperationAction(ISD::FTRUNC, MVT::f64, Custom); + setOperationAction(ISD::FRINT, MVT::f64, Custom); + setOperationAction(ISD::FFLOOR, MVT::f64, Custom); } setOperationAction(ISD::FFLOOR, MVT::f64, Legal); @@ -408,10 +473,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::FMA, MVT::f16, Legal); if (!Subtarget->hasFP16Denormals()) setOperationAction(ISD::FMAD, MVT::f16, Legal); - } - if (Subtarget->hasVOP3PInsts()) { - for (MVT VT : {MVT::v2i16, MVT::v2f16}) { + for (MVT VT : {MVT::v2i16, MVT::v2f16, MVT::v4i16, MVT::v4f16}) { for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) { switch (Op) { case ISD::LOAD: @@ -438,6 +501,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::Constant, MVT::v2i16, Legal); setOperationAction(ISD::ConstantFP, MVT::v2f16, Legal); + setOperationAction(ISD::UNDEF, MVT::v2i16, Legal); + setOperationAction(ISD::UNDEF, MVT::v2f16, Legal); + setOperationAction(ISD::STORE, MVT::v2i16, Promote); AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32); setOperationAction(ISD::STORE, MVT::v2f16, Promote); @@ -454,11 +520,38 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32); setOperationAction(ISD::XOR, MVT::v2i16, Promote); AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32); - setOperationAction(ISD::SELECT, MVT::v2i16, Promote); - AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32); - setOperationAction(ISD::SELECT, MVT::v2f16, Promote); - AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32); + setOperationAction(ISD::LOAD, MVT::v4i16, Promote); + AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32); + setOperationAction(ISD::LOAD, MVT::v4f16, Promote); + AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32); + + setOperationAction(ISD::STORE, MVT::v4i16, Promote); + AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32); + setOperationAction(ISD::STORE, MVT::v4f16, Promote); + AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32); + + setOperationAction(ISD::ANY_EXTEND, MVT::v2i32, Expand); + setOperationAction(ISD::ZERO_EXTEND, MVT::v2i32, Expand); + setOperationAction(ISD::SIGN_EXTEND, MVT::v2i32, Expand); + setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand); + + setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Expand); + setOperationAction(ISD::ZERO_EXTEND, MVT::v4i32, Expand); + setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Expand); + + if (!Subtarget->hasVOP3PInsts()) { + setOperationAction(ISD::BUILD_VECTOR, MVT::v2i16, Custom); + setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom); + } + + setOperationAction(ISD::FNEG, MVT::v2f16, Legal); + // This isn't really legal, but this avoids the legalizer unrolling it (and + // allows matching fneg (fabs x) patterns) + setOperationAction(ISD::FABS, MVT::v2f16, Legal); + } + + if (Subtarget->hasVOP3PInsts()) { setOperationAction(ISD::ADD, MVT::v2i16, Legal); setOperationAction(ISD::SUB, MVT::v2i16, Legal); setOperationAction(ISD::MUL, MVT::v2i16, Legal); @@ -471,26 +564,51 @@ 
SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::UMAX, MVT::v2i16, Legal); setOperationAction(ISD::FADD, MVT::v2f16, Legal); - setOperationAction(ISD::FNEG, MVT::v2f16, Legal); setOperationAction(ISD::FMUL, MVT::v2f16, Legal); setOperationAction(ISD::FMA, MVT::v2f16, Legal); setOperationAction(ISD::FMINNUM, MVT::v2f16, Legal); setOperationAction(ISD::FMAXNUM, MVT::v2f16, Legal); - - // This isn't really legal, but this avoids the legalizer unrolling it (and - // allows matching fneg (fabs x) patterns) - setOperationAction(ISD::FABS, MVT::v2f16, Legal); + setOperationAction(ISD::FCANONICALIZE, MVT::v2f16, Legal); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom); - setOperationAction(ISD::ANY_EXTEND, MVT::v2i32, Expand); - setOperationAction(ISD::ZERO_EXTEND, MVT::v2i32, Expand); - setOperationAction(ISD::SIGN_EXTEND, MVT::v2i32, Expand); - setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand); + setOperationAction(ISD::SHL, MVT::v4i16, Custom); + setOperationAction(ISD::SRA, MVT::v4i16, Custom); + setOperationAction(ISD::SRL, MVT::v4i16, Custom); + setOperationAction(ISD::ADD, MVT::v4i16, Custom); + setOperationAction(ISD::SUB, MVT::v4i16, Custom); + setOperationAction(ISD::MUL, MVT::v4i16, Custom); + + setOperationAction(ISD::SMIN, MVT::v4i16, Custom); + setOperationAction(ISD::SMAX, MVT::v4i16, Custom); + setOperationAction(ISD::UMIN, MVT::v4i16, Custom); + setOperationAction(ISD::UMAX, MVT::v4i16, Custom); + + setOperationAction(ISD::FADD, MVT::v4f16, Custom); + setOperationAction(ISD::FMUL, MVT::v4f16, Custom); + setOperationAction(ISD::FMINNUM, MVT::v4f16, Custom); + setOperationAction(ISD::FMAXNUM, MVT::v4f16, Custom); + + setOperationAction(ISD::SELECT, MVT::v4i16, Custom); + setOperationAction(ISD::SELECT, MVT::v4f16, Custom); + } + + setOperationAction(ISD::FNEG, MVT::v4f16, Custom); + setOperationAction(ISD::FABS, MVT::v4f16, Custom); + + if (Subtarget->has16BitInsts()) { + setOperationAction(ISD::SELECT, MVT::v2i16, Promote); + AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32); + setOperationAction(ISD::SELECT, MVT::v2f16, Promote); + AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32); } else { + // Legalization hack. setOperationAction(ISD::SELECT, MVT::v2i16, Custom); setOperationAction(ISD::SELECT, MVT::v2f16, Custom); + + setOperationAction(ISD::FNEG, MVT::v2f16, Custom); + setOperationAction(ISD::FABS, MVT::v2f16, Custom); } for (MVT VT : { MVT::v4i16, MVT::v4f16, MVT::v2i8, MVT::v4i8, MVT::v8i8 }) { @@ -505,6 +623,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::FSUB); setTargetDAGCombine(ISD::FMINNUM); setTargetDAGCombine(ISD::FMAXNUM); + setTargetDAGCombine(ISD::FMA); setTargetDAGCombine(ISD::SMIN); setTargetDAGCombine(ISD::SMAX); setTargetDAGCombine(ISD::UMIN); @@ -542,16 +661,33 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::ATOMIC_LOAD_UMAX); setSchedulingPreference(Sched::RegPressure); + + // SI at least has hardware support for floating point exceptions, but no way + // of using or handling them is implemented. 
They are also optional in OpenCL + // (Section 7.3) + setHasFloatingPointExceptions(Subtarget->hasFPExceptions()); } -const SISubtarget *SITargetLowering::getSubtarget() const { - return static_cast<const SISubtarget *>(Subtarget); +const GCNSubtarget *SITargetLowering::getSubtarget() const { + return Subtarget; } //===----------------------------------------------------------------------===// // TargetLowering queries //===----------------------------------------------------------------------===// +// v_mad_mix* support a conversion from f16 to f32. +// +// There is only one special case when denormals are enabled we don't currently, +// where this is OK to use. +bool SITargetLowering::isFPExtFoldable(unsigned Opcode, + EVT DestVT, EVT SrcVT) const { + return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) || + (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) && + DestVT.getScalarType() == MVT::f32 && !Subtarget->hasFP32Denormals() && + SrcVT.getScalarType() == MVT::f16; +} + bool SITargetLowering::isShuffleMaskLegal(ArrayRef<int>, EVT) const { // SI has some legal vector types, but no legal vector operations. Say no // shuffles are legal in order to prefer scalarizing some vector operations. @@ -562,9 +698,55 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &CI, MachineFunction &MF, unsigned IntrID) const { + if (const AMDGPU::RsrcIntrinsic *RsrcIntr = + AMDGPU::lookupRsrcIntrinsic(IntrID)) { + AttributeList Attr = Intrinsic::getAttributes(CI.getContext(), + (Intrinsic::ID)IntrID); + if (Attr.hasFnAttribute(Attribute::ReadNone)) + return false; + + SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + + if (RsrcIntr->IsImage) { + Info.ptrVal = MFI->getImagePSV( + *MF.getSubtarget<GCNSubtarget>().getInstrInfo(), + CI.getArgOperand(RsrcIntr->RsrcArg)); + Info.align = 0; + } else { + Info.ptrVal = MFI->getBufferPSV( + *MF.getSubtarget<GCNSubtarget>().getInstrInfo(), + CI.getArgOperand(RsrcIntr->RsrcArg)); + } + + Info.flags = MachineMemOperand::MODereferenceable; + if (Attr.hasFnAttribute(Attribute::ReadOnly)) { + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.memVT = MVT::getVT(CI.getType()); + Info.flags |= MachineMemOperand::MOLoad; + } else if (Attr.hasFnAttribute(Attribute::WriteOnly)) { + Info.opc = ISD::INTRINSIC_VOID; + Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType()); + Info.flags |= MachineMemOperand::MOStore; + } else { + // Atomic + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.memVT = MVT::getVT(CI.getType()); + Info.flags = MachineMemOperand::MOLoad | + MachineMemOperand::MOStore | + MachineMemOperand::MODereferenceable; + + // XXX - Should this be volatile without known ordering? + Info.flags |= MachineMemOperand::MOVolatile; + } + return true; + } + switch (IntrID) { case Intrinsic::amdgcn_atomic_inc: - case Intrinsic::amdgcn_atomic_dec: { + case Intrinsic::amdgcn_atomic_dec: + case Intrinsic::amdgcn_ds_fadd: + case Intrinsic::amdgcn_ds_fmin: + case Intrinsic::amdgcn_ds_fmax: { Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = MVT::getVT(CI.getType()); Info.ptrVal = CI.getOperand(0); @@ -578,220 +760,6 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, return true; } - // Image load. - case Intrinsic::amdgcn_image_load: - case Intrinsic::amdgcn_image_load_mip: - - // Sample. 
- case Intrinsic::amdgcn_image_sample: - case Intrinsic::amdgcn_image_sample_cl: - case Intrinsic::amdgcn_image_sample_d: - case Intrinsic::amdgcn_image_sample_d_cl: - case Intrinsic::amdgcn_image_sample_l: - case Intrinsic::amdgcn_image_sample_b: - case Intrinsic::amdgcn_image_sample_b_cl: - case Intrinsic::amdgcn_image_sample_lz: - case Intrinsic::amdgcn_image_sample_cd: - case Intrinsic::amdgcn_image_sample_cd_cl: - - // Sample with comparison. - case Intrinsic::amdgcn_image_sample_c: - case Intrinsic::amdgcn_image_sample_c_cl: - case Intrinsic::amdgcn_image_sample_c_d: - case Intrinsic::amdgcn_image_sample_c_d_cl: - case Intrinsic::amdgcn_image_sample_c_l: - case Intrinsic::amdgcn_image_sample_c_b: - case Intrinsic::amdgcn_image_sample_c_b_cl: - case Intrinsic::amdgcn_image_sample_c_lz: - case Intrinsic::amdgcn_image_sample_c_cd: - case Intrinsic::amdgcn_image_sample_c_cd_cl: - - // Sample with offsets. - case Intrinsic::amdgcn_image_sample_o: - case Intrinsic::amdgcn_image_sample_cl_o: - case Intrinsic::amdgcn_image_sample_d_o: - case Intrinsic::amdgcn_image_sample_d_cl_o: - case Intrinsic::amdgcn_image_sample_l_o: - case Intrinsic::amdgcn_image_sample_b_o: - case Intrinsic::amdgcn_image_sample_b_cl_o: - case Intrinsic::amdgcn_image_sample_lz_o: - case Intrinsic::amdgcn_image_sample_cd_o: - case Intrinsic::amdgcn_image_sample_cd_cl_o: - - // Sample with comparison and offsets. - case Intrinsic::amdgcn_image_sample_c_o: - case Intrinsic::amdgcn_image_sample_c_cl_o: - case Intrinsic::amdgcn_image_sample_c_d_o: - case Intrinsic::amdgcn_image_sample_c_d_cl_o: - case Intrinsic::amdgcn_image_sample_c_l_o: - case Intrinsic::amdgcn_image_sample_c_b_o: - case Intrinsic::amdgcn_image_sample_c_b_cl_o: - case Intrinsic::amdgcn_image_sample_c_lz_o: - case Intrinsic::amdgcn_image_sample_c_cd_o: - case Intrinsic::amdgcn_image_sample_c_cd_cl_o: - - // Basic gather4 - case Intrinsic::amdgcn_image_gather4: - case Intrinsic::amdgcn_image_gather4_cl: - case Intrinsic::amdgcn_image_gather4_l: - case Intrinsic::amdgcn_image_gather4_b: - case Intrinsic::amdgcn_image_gather4_b_cl: - case Intrinsic::amdgcn_image_gather4_lz: - - // Gather4 with comparison - case Intrinsic::amdgcn_image_gather4_c: - case Intrinsic::amdgcn_image_gather4_c_cl: - case Intrinsic::amdgcn_image_gather4_c_l: - case Intrinsic::amdgcn_image_gather4_c_b: - case Intrinsic::amdgcn_image_gather4_c_b_cl: - case Intrinsic::amdgcn_image_gather4_c_lz: - - // Gather4 with offsets - case Intrinsic::amdgcn_image_gather4_o: - case Intrinsic::amdgcn_image_gather4_cl_o: - case Intrinsic::amdgcn_image_gather4_l_o: - case Intrinsic::amdgcn_image_gather4_b_o: - case Intrinsic::amdgcn_image_gather4_b_cl_o: - case Intrinsic::amdgcn_image_gather4_lz_o: - - // Gather4 with comparison and offsets - case Intrinsic::amdgcn_image_gather4_c_o: - case Intrinsic::amdgcn_image_gather4_c_cl_o: - case Intrinsic::amdgcn_image_gather4_c_l_o: - case Intrinsic::amdgcn_image_gather4_c_b_o: - case Intrinsic::amdgcn_image_gather4_c_b_cl_o: - case Intrinsic::amdgcn_image_gather4_c_lz_o: { - SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); - Info.opc = ISD::INTRINSIC_W_CHAIN; - Info.memVT = MVT::getVT(CI.getType()); - Info.ptrVal = MFI->getImagePSV( - *MF.getSubtarget<SISubtarget>().getInstrInfo(), - CI.getArgOperand(1)); - Info.align = 0; - Info.flags = MachineMemOperand::MOLoad | - MachineMemOperand::MODereferenceable; - return true; - } - case Intrinsic::amdgcn_image_store: - case Intrinsic::amdgcn_image_store_mip: { - SIMachineFunctionInfo *MFI = 
MF.getInfo<SIMachineFunctionInfo>(); - Info.opc = ISD::INTRINSIC_VOID; - Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType()); - Info.ptrVal = MFI->getImagePSV( - *MF.getSubtarget<SISubtarget>().getInstrInfo(), - CI.getArgOperand(2)); - Info.flags = MachineMemOperand::MOStore | - MachineMemOperand::MODereferenceable; - Info.align = 0; - return true; - } - case Intrinsic::amdgcn_image_atomic_swap: - case Intrinsic::amdgcn_image_atomic_add: - case Intrinsic::amdgcn_image_atomic_sub: - case Intrinsic::amdgcn_image_atomic_smin: - case Intrinsic::amdgcn_image_atomic_umin: - case Intrinsic::amdgcn_image_atomic_smax: - case Intrinsic::amdgcn_image_atomic_umax: - case Intrinsic::amdgcn_image_atomic_and: - case Intrinsic::amdgcn_image_atomic_or: - case Intrinsic::amdgcn_image_atomic_xor: - case Intrinsic::amdgcn_image_atomic_inc: - case Intrinsic::amdgcn_image_atomic_dec: { - SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); - Info.opc = ISD::INTRINSIC_W_CHAIN; - Info.memVT = MVT::getVT(CI.getType()); - Info.ptrVal = MFI->getImagePSV( - *MF.getSubtarget<SISubtarget>().getInstrInfo(), - CI.getArgOperand(2)); - - Info.flags = MachineMemOperand::MOLoad | - MachineMemOperand::MOStore | - MachineMemOperand::MODereferenceable; - - // XXX - Should this be volatile without known ordering? - Info.flags |= MachineMemOperand::MOVolatile; - return true; - } - case Intrinsic::amdgcn_image_atomic_cmpswap: { - SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); - Info.opc = ISD::INTRINSIC_W_CHAIN; - Info.memVT = MVT::getVT(CI.getType()); - Info.ptrVal = MFI->getImagePSV( - *MF.getSubtarget<SISubtarget>().getInstrInfo(), - CI.getArgOperand(3)); - - Info.flags = MachineMemOperand::MOLoad | - MachineMemOperand::MOStore | - MachineMemOperand::MODereferenceable; - - // XXX - Should this be volatile without known ordering? - Info.flags |= MachineMemOperand::MOVolatile; - return true; - } - case Intrinsic::amdgcn_tbuffer_load: - case Intrinsic::amdgcn_buffer_load: - case Intrinsic::amdgcn_buffer_load_format: { - SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); - Info.opc = ISD::INTRINSIC_W_CHAIN; - Info.ptrVal = MFI->getBufferPSV( - *MF.getSubtarget<SISubtarget>().getInstrInfo(), - CI.getArgOperand(0)); - Info.memVT = MVT::getVT(CI.getType()); - Info.flags = MachineMemOperand::MOLoad | - MachineMemOperand::MODereferenceable; - - // There is a constant offset component, but there are additional register - // offsets which could break AA if we set the offset to anything non-0. 
- return true; - } - case Intrinsic::amdgcn_tbuffer_store: - case Intrinsic::amdgcn_buffer_store: - case Intrinsic::amdgcn_buffer_store_format: { - SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); - Info.opc = ISD::INTRINSIC_VOID; - Info.ptrVal = MFI->getBufferPSV( - *MF.getSubtarget<SISubtarget>().getInstrInfo(), - CI.getArgOperand(1)); - Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType()); - Info.flags = MachineMemOperand::MOStore | - MachineMemOperand::MODereferenceable; - return true; - } - case Intrinsic::amdgcn_buffer_atomic_swap: - case Intrinsic::amdgcn_buffer_atomic_add: - case Intrinsic::amdgcn_buffer_atomic_sub: - case Intrinsic::amdgcn_buffer_atomic_smin: - case Intrinsic::amdgcn_buffer_atomic_umin: - case Intrinsic::amdgcn_buffer_atomic_smax: - case Intrinsic::amdgcn_buffer_atomic_umax: - case Intrinsic::amdgcn_buffer_atomic_and: - case Intrinsic::amdgcn_buffer_atomic_or: - case Intrinsic::amdgcn_buffer_atomic_xor: { - SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); - Info.opc = ISD::INTRINSIC_W_CHAIN; - Info.ptrVal = MFI->getBufferPSV( - *MF.getSubtarget<SISubtarget>().getInstrInfo(), - CI.getArgOperand(1)); - Info.memVT = MVT::getVT(CI.getType()); - Info.flags = MachineMemOperand::MOLoad | - MachineMemOperand::MOStore | - MachineMemOperand::MODereferenceable | - MachineMemOperand::MOVolatile; - return true; - } - case Intrinsic::amdgcn_buffer_atomic_cmpswap: { - SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); - Info.opc = ISD::INTRINSIC_W_CHAIN; - Info.ptrVal = MFI->getBufferPSV( - *MF.getSubtarget<SISubtarget>().getInstrInfo(), - CI.getArgOperand(2)); - Info.memVT = MVT::getVT(CI.getType()); - Info.flags = MachineMemOperand::MOLoad | - MachineMemOperand::MOStore | - MachineMemOperand::MODereferenceable | - MachineMemOperand::MOVolatile; - return true; - } default: return false; } @@ -802,7 +770,10 @@ bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II, Type *&AccessTy) const { switch (II->getIntrinsicID()) { case Intrinsic::amdgcn_atomic_inc: - case Intrinsic::amdgcn_atomic_dec: { + case Intrinsic::amdgcn_atomic_dec: + case Intrinsic::amdgcn_ds_fadd: + case Intrinsic::amdgcn_ds_fmin: + case Intrinsic::amdgcn_ds_fmax: { Value *Ptr = II->getArgOperand(0); AccessTy = II->getType(); Ops.push_back(Ptr); @@ -892,7 +863,8 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, if (AS == AMDGPUASI.GLOBAL_ADDRESS) return isLegalGlobalAddressingMode(AM); - if (AS == AMDGPUASI.CONSTANT_ADDRESS) { + if (AS == AMDGPUASI.CONSTANT_ADDRESS || + AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT) { // If the offset isn't a multiple of 4, it probably isn't going to be // correctly aligned. // FIXME: Can we get the real alignment here? @@ -903,19 +875,19 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, // will use a MUBUF load. // FIXME?: We also need to do this if unaligned, but we don't know the // alignment here. - if (DL.getTypeStoreSize(Ty) < 4) + if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4) return isLegalGlobalAddressingMode(AM); - if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS) { + if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) { // SMRD instructions have an 8-bit, dword offset on SI. if (!isUInt<8>(AM.BaseOffs / 4)) return false; - } else if (Subtarget->getGeneration() == SISubtarget::SEA_ISLANDS) { + } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) { // On CI+, this can also be a 32-bit literal constant offset. 
If it fits // in 8-bits, it can use a smaller encoding. if (!isUInt<32>(AM.BaseOffs / 4)) return false; - } else if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { + } else if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { // On VI, these use the SMEM format and the offset is 20-bit in bytes. if (!isUInt<20>(AM.BaseOffs)) return false; @@ -1015,7 +987,8 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT, // If we have an uniform constant load, it still requires using a slow // buffer instruction if unaligned. if (IsFast) { - *IsFast = (AddrSpace == AMDGPUASI.CONSTANT_ADDRESS) ? + *IsFast = (AddrSpace == AMDGPUASI.CONSTANT_ADDRESS || + AddrSpace == AMDGPUASI.CONSTANT_ADDRESS_32BIT) ? (Align % 4 == 0) : true; } @@ -1058,7 +1031,8 @@ EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign, static bool isFlatGlobalAddrSpace(unsigned AS, AMDGPUAS AMDGPUASI) { return AS == AMDGPUASI.GLOBAL_ADDRESS || AS == AMDGPUASI.FLAT_ADDRESS || - AS == AMDGPUASI.CONSTANT_ADDRESS; + AS == AMDGPUASI.CONSTANT_ADDRESS || + AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT; } bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS, @@ -1070,7 +1044,7 @@ bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS, bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const { const MemSDNode *MemNode = cast<MemSDNode>(N); const Value *Ptr = MemNode->getMemOperand()->getValue(); - const Instruction *I = dyn_cast<Instruction>(Ptr); + const Instruction *I = dyn_cast_or_null<Instruction>(Ptr); return I && I->getMetadata("amdgpu.noclobber"); } @@ -1149,14 +1123,13 @@ SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG, SDValue BasePtr = DAG.getCopyFromReg(Chain, SL, MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT); - return DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr, - DAG.getConstant(Offset, SL, PtrVT)); + return DAG.getObjectPtrOffset(SL, BasePtr, Offset); } SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG, const SDLoc &SL) const { - auto MFI = DAG.getMachineFunction().getInfo<SIMachineFunctionInfo>(); - uint64_t Offset = getImplicitParameterOffset(MFI, FIRST_IMPLICIT); + uint64_t Offset = getImplicitParameterOffset(DAG.getMachineFunction(), + FIRST_IMPLICIT); return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset); } @@ -1183,18 +1156,42 @@ SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT, SDValue SITargetLowering::lowerKernargMemParameter( SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain, - uint64_t Offset, bool Signed, + uint64_t Offset, unsigned Align, bool Signed, const ISD::InputArg *Arg) const { - const DataLayout &DL = DAG.getDataLayout(); Type *Ty = MemVT.getTypeForEVT(*DAG.getContext()); PointerType *PtrTy = PointerType::get(Ty, AMDGPUASI.CONSTANT_ADDRESS); MachinePointerInfo PtrInfo(UndefValue::get(PtrTy)); - unsigned Align = DL.getABITypeAlignment(Ty); + // Try to avoid using an extload by loading earlier than the argument address, + // and extracting the relevant bits. The load should hopefully be merged with + // the previous argument. + if (MemVT.getStoreSize() < 4 && Align < 4) { + // TODO: Handle align < 4 and size >= 4 (can happen with packed structs). + int64_t AlignDownOffset = alignDown(Offset, 4); + int64_t OffsetDiff = Offset - AlignDownOffset; + + EVT IntVT = MemVT.changeTypeToInteger(); + + // TODO: If we passed in the base kernel offset we could have a better + // alignment than 4, but we don't really need it. 
+ SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset); + SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, 4, + MachineMemOperand::MODereferenceable | + MachineMemOperand::MOInvariant); + + SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32); + SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt); + + SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract); + ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal); + ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg); + + + return DAG.getMergeValues({ ArgVal, Load.getValue(1) }, SL); + } SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset); SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Align, - MachineMemOperand::MONonTemporal | MachineMemOperand::MODereferenceable | MachineMemOperand::MOInvariant); @@ -1269,36 +1266,51 @@ static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits, FunctionType *FType, SIMachineFunctionInfo *Info) { for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) { - const ISD::InputArg &Arg = Ins[I]; + const ISD::InputArg *Arg = &Ins[I]; // First check if it's a PS input addr. - if (CallConv == CallingConv::AMDGPU_PS && !Arg.Flags.isInReg() && - !Arg.Flags.isByVal() && PSInputNum <= 15) { + if (CallConv == CallingConv::AMDGPU_PS && + !Arg->Flags.isInReg() && !Arg->Flags.isByVal() && PSInputNum <= 15) { + + bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum); + + // Inconveniently only the first part of the split is marked as isSplit, + // so skip to the end. We only want to increment PSInputNum once for the + // entire split argument. + if (Arg->Flags.isSplit()) { + while (!Arg->Flags.isSplitEnd()) { + assert(!Arg->VT.isVector() && + "unexpected vector split in ps argument type"); + if (!SkipArg) + Splits.push_back(*Arg); + Arg = &Ins[++I]; + } + } - if (!Arg.Used && !Info->isPSInputAllocated(PSInputNum)) { + if (SkipArg) { // We can safely skip PS inputs. - Skipped.set(I); + Skipped.set(Arg->getOrigArgIndex()); ++PSInputNum; continue; } Info->markPSInputAllocated(PSInputNum); - if (Arg.Used) + if (Arg->Used) Info->markPSInputEnabled(PSInputNum); ++PSInputNum; } // Second split vertices into their elements. - if (Arg.VT.isVector()) { - ISD::InputArg NewArg = Arg; + if (Arg->VT.isVector()) { + ISD::InputArg NewArg = *Arg; NewArg.Flags.setSplit(); - NewArg.VT = Arg.VT.getVectorElementType(); + NewArg.VT = Arg->VT.getVectorElementType(); // We REALLY want the ORIGINAL number of vertex elements here, e.g. a // three or five element vertex only needs three or five registers, // NOT four or eight. - Type *ParamType = FType->getParamType(Arg.getOrigArgIndex()); + Type *ParamType = FType->getParamType(Arg->getOrigArgIndex()); unsigned NumElements = ParamType->getVectorNumElements(); for (unsigned J = 0; J != NumElements; ++J) { @@ -1306,7 +1318,7 @@ static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits, NewArg.PartOffset += NewArg.VT.getStoreSize(); } } else { - Splits.push_back(Arg); + Splits.push_back(*Arg); } } } @@ -1564,8 +1576,8 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM, // the scratch registers to pass in. 
bool RequiresStackAccess = HasStackObjects || MFI.hasCalls(); - const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); - if (ST.isAmdCodeObjectV2(MF)) { + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); + if (ST.isAmdCodeObjectV2(MF.getFunction())) { if (RequiresStackAccess) { // If we have stack objects, we unquestionably need the private buffer // resource. For the Code Object V2 ABI, this will be the first 4 user @@ -1677,12 +1689,12 @@ SDValue SITargetLowering::LowerFormalArguments( const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo(); MachineFunction &MF = DAG.getMachineFunction(); + const Function &Fn = MF.getFunction(); FunctionType *FType = MF.getFunction().getFunctionType(); SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); - const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); if (Subtarget->isAmdHsaOS() && AMDGPU::isShader(CallConv)) { - const Function &Fn = MF.getFunction(); DiagnosticInfoUnsupported NoGraphicsHSA( Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc()); DAG.getContext()->diagnose(NoGraphicsHSA); @@ -1779,9 +1791,16 @@ SDValue SITargetLowering::LowerFormalArguments( SmallVector<SDValue, 16> Chains; - for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) { + // FIXME: This is the minimum kernel argument alignment. We should improve + // this to the maximum alignment of the arguments. + // + // FIXME: Alignment of explicit arguments totally broken with non-0 explicit + // kern arg offset. + const unsigned KernelArgBaseAlign = 16; + + for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) { const ISD::InputArg &Arg = Ins[i]; - if (Skipped[i]) { + if (Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) { InVals.push_back(DAG.getUNDEF(Arg.VT)); continue; } @@ -1793,19 +1812,16 @@ SDValue SITargetLowering::LowerFormalArguments( VT = Ins[i].VT; EVT MemVT = VA.getLocVT(); - const uint64_t Offset = Subtarget->getExplicitKernelArgOffset(MF) + - VA.getLocMemOffset(); - Info->setABIArgOffset(Offset + MemVT.getStoreSize()); + const uint64_t Offset = VA.getLocMemOffset(); + unsigned Align = MinAlign(KernelArgBaseAlign, Offset); - // The first 36 bytes of the input buffer contains information about - // thread group and global sizes. SDValue Arg = lowerKernargMemParameter( - DAG, VT, MemVT, DL, Chain, Offset, Ins[i].Flags.isSExt(), &Ins[i]); + DAG, VT, MemVT, DL, Chain, Offset, Align, Ins[i].Flags.isSExt(), &Ins[i]); Chains.push_back(Arg.getValue(1)); auto *ParamTy = dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex())); - if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS && + if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS && ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { // On SI local pointers are just offsets into LDS, so they are always // less than 16-bits. On CI and newer they could potentially be @@ -1913,7 +1929,7 @@ SDValue SITargetLowering::LowerFormalArguments( auto &ArgUsageInfo = DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>(); - ArgUsageInfo.setFuncArgInfo(MF.getFunction(), Info->getArgInfo()); + ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo()); unsigned StackArgSize = CCInfo.getNextStackOffset(); Info->setBytesInStackArgArea(StackArgSize); @@ -2058,8 +2074,7 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, // FIXME: Does sret work properly? 
if (!Info->isEntryFunction()) { - const SIRegisterInfo *TRI - = static_cast<const SISubtarget *>(Subtarget)->getRegisterInfo(); + const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); const MCPhysReg *I = TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction()); if (I) { @@ -2161,8 +2176,7 @@ void SITargetLowering::passSpecialInputs( SelectionDAG &DAG = CLI.DAG; const SDLoc &DL = CLI.DL; - const SISubtarget *ST = getSubtarget(); - const SIRegisterInfo *TRI = ST->getRegisterInfo(); + const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); auto &ArgUsageInfo = DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>(); @@ -2355,6 +2369,13 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, "unsupported required tail call to function "); } + if (AMDGPU::isShader(MF.getFunction().getCallingConv())) { + // Note the issue is with the CC of the calling function, not of the call + // itself. + return lowerUnhandledCall(CLI, InVals, + "unsupported call from graphics shader of function "); + } + // The first 4 bytes are reserved for the callee's emergency stack slot. const unsigned CalleeUsableStackOffset = 4; @@ -2600,7 +2621,7 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, // Add a register mask operand representing the call-preserved registers. - const AMDGPURegisterInfo *TRI = Subtarget->getRegisterInfo(); + auto *TRI = static_cast<const SIRegisterInfo*>(Subtarget->getRegisterInfo()); const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv); assert(Mask && "Missing call preserved mask for calling convention"); Ops.push_back(DAG.getRegisterMask(Mask)); @@ -2660,7 +2681,7 @@ unsigned SITargetLowering::getRegisterByName(const char* RegName, EVT VT, } - if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS && + if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS && Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) { report_fatal_error(Twine("invalid register \"" + StringRef(RegName) + "\" for subtarget.")); @@ -2734,7 +2755,8 @@ static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop( unsigned PhiReg, unsigned InitSaveExecReg, int Offset, - bool UseGPRIdxMode) { + bool UseGPRIdxMode, + bool IsIndirectSrc) { MachineBasicBlock::iterator I = LoopBB.begin(); unsigned PhiExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); @@ -2763,6 +2785,12 @@ static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop( .addReg(CurrentIdxReg) .addReg(IdxReg.getReg(), 0, IdxReg.getSubReg()); + // Update EXEC, save the original EXEC value to VCC. + BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), NewExec) + .addReg(CondReg, RegState::Kill); + + MRI.setSimpleHint(NewExec, CondReg); + if (UseGPRIdxMode) { unsigned IdxReg; if (Offset == 0) { @@ -2773,11 +2801,13 @@ static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop( .addReg(CurrentIdxReg, RegState::Kill) .addImm(Offset); } - - MachineInstr *SetIdx = - BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_IDX)) - .addReg(IdxReg, RegState::Kill); - SetIdx->getOperand(2).setIsUndef(); + unsigned IdxMode = IsIndirectSrc ? + VGPRIndexMode::SRC0_ENABLE : VGPRIndexMode::DST_ENABLE; + MachineInstr *SetOn = + BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON)) + .addReg(IdxReg, RegState::Kill) + .addImm(IdxMode); + SetOn->getOperand(3).setIsUndef(); } else { // Move index from VCC into M0 if (Offset == 0) { @@ -2790,12 +2820,6 @@ static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop( } } - // Update EXEC, save the original EXEC value to VCC. 
- BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), NewExec) - .addReg(CondReg, RegState::Kill); - - MRI.setSimpleHint(NewExec, CondReg); - // Update EXEC, switch all done bits to 0 and all todo bits to 1. MachineInstr *InsertPt = BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC) @@ -2823,7 +2847,8 @@ static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII, unsigned InitResultReg, unsigned PhiReg, int Offset, - bool UseGPRIdxMode) { + bool UseGPRIdxMode, + bool IsIndirectSrc) { MachineFunction *MF = MBB.getParent(); MachineRegisterInfo &MRI = MF->getRegInfo(); const DebugLoc &DL = MI.getDebugLoc(); @@ -2862,7 +2887,7 @@ static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII, auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx, InitResultReg, DstReg, PhiReg, TmpExec, - Offset, UseGPRIdxMode); + Offset, UseGPRIdxMode, IsIndirectSrc); MachineBasicBlock::iterator First = RemainderBB->begin(); BuildMI(*RemainderBB, First, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC) @@ -2947,7 +2972,7 @@ static bool setM0ToIndexFromSGPR(const SIInstrInfo *TII, // Control flow needs to be inserted if indexing with a VGPR. static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI, MachineBasicBlock &MBB, - const SISubtarget &ST) { + const GCNSubtarget &ST) { const SIInstrInfo *TII = ST.getInstrInfo(); const SIRegisterInfo &TRI = TII->getRegisterInfo(); MachineFunction *MF = MBB.getParent(); @@ -2997,17 +3022,8 @@ static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI, BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg); - if (UseGPRIdxMode) { - MachineInstr *SetOn = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON)) - .addImm(0) // Reset inside loop. - .addImm(VGPRIndexMode::SRC0_ENABLE); - SetOn->getOperand(3).setIsUndef(); - - // Disable again after the loop. - BuildMI(MBB, std::next(I), DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF)); - } - - auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset, UseGPRIdxMode); + auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, + Offset, UseGPRIdxMode, true); MachineBasicBlock *LoopBB = InsPt->getParent(); if (UseGPRIdxMode) { @@ -3015,6 +3031,7 @@ static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI, .addReg(SrcReg, RegState::Undef, SubReg) .addReg(SrcReg, RegState::Implicit) .addReg(AMDGPU::M0, RegState::Implicit); + BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF)); } else { BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst) .addReg(SrcReg, RegState::Undef, SubReg) @@ -3046,7 +3063,7 @@ static unsigned getMOVRELDPseudo(const SIRegisterInfo &TRI, static MachineBasicBlock *emitIndirectDst(MachineInstr &MI, MachineBasicBlock &MBB, - const SISubtarget &ST) { + const GCNSubtarget &ST) { const SIInstrInfo *TII = ST.getInstrInfo(); const SIRegisterInfo &TRI = TII->getRegisterInfo(); MachineFunction *MF = MBB.getParent(); @@ -3115,22 +3132,10 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI, const DebugLoc &DL = MI.getDebugLoc(); - if (UseGPRIdxMode) { - MachineBasicBlock::iterator I(&MI); - - MachineInstr *SetOn = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON)) - .addImm(0) // Reset inside loop. - .addImm(VGPRIndexMode::DST_ENABLE); - SetOn->getOperand(3).setIsUndef(); - - // Disable again after the loop. 
- BuildMI(MBB, std::next(I), DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF)); - } - unsigned PhiReg = MRI.createVirtualRegister(VecRC); auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg, - Offset, UseGPRIdxMode); + Offset, UseGPRIdxMode, false); MachineBasicBlock *LoopBB = InsPt->getParent(); if (UseGPRIdxMode) { @@ -3140,6 +3145,7 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI, .addReg(Dst, RegState::ImplicitDefine) .addReg(PhiReg, RegState::Implicit) .addReg(AMDGPU::M0, RegState::Implicit); + BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF)); } else { const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(TRI, VecRC)); @@ -3350,8 +3356,13 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( case AMDGPU::ADJCALLSTACKDOWN: { const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>(); MachineInstrBuilder MIB(*MF, &MI); + + // Add an implicit use of the frame offset reg to prevent the restore copy + // inserted after the call from being reorderd after stack operations in the + // the caller's frame. MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine) - .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit); + .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit) + .addReg(Info->getFrameOffsetReg(), RegState::Implicit); return BB; } case AMDGPU::SI_CALL_ISEL: @@ -3441,12 +3452,17 @@ bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { VT = VT.getScalarType(); switch (VT.getSimpleVT().SimpleTy) { - case MVT::f32: + case MVT::f32: { // This is as fast on some subtargets. However, we always have full rate f32 // mad available which returns the same result as the separate operations // which we should prefer over fma. We can't use this if we want to support // denormals, so only report this in these cases. - return Subtarget->hasFP32Denormals() && Subtarget->hasFastFMAF32(); + if (Subtarget->hasFP32Denormals()) + return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts(); + + // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32. + return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts(); + } case MVT::f64: return true; case MVT::f16: @@ -3462,6 +3478,49 @@ bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { // Custom DAG Lowering Operations //===----------------------------------------------------------------------===// +// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the +// wider vector type is legal. +SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op, + SelectionDAG &DAG) const { + unsigned Opc = Op.getOpcode(); + EVT VT = Op.getValueType(); + assert(VT == MVT::v4f16); + + SDValue Lo, Hi; + std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0); + + SDLoc SL(Op); + SDValue OpLo = DAG.getNode(Opc, SL, Lo.getValueType(), Lo, + Op->getFlags()); + SDValue OpHi = DAG.getNode(Opc, SL, Hi.getValueType(), Hi, + Op->getFlags()); + + return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi); +} + +// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the +// wider vector type is legal. 
+SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op, + SelectionDAG &DAG) const { + unsigned Opc = Op.getOpcode(); + EVT VT = Op.getValueType(); + assert(VT == MVT::v4i16 || VT == MVT::v4f16); + + SDValue Lo0, Hi0; + std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0); + SDValue Lo1, Hi1; + std::tie(Lo1, Hi1) = DAG.SplitVectorOperand(Op.getNode(), 1); + + SDLoc SL(Op); + + SDValue OpLo = DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1, + Op->getFlags()); + SDValue OpHi = DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1, + Op->getFlags()); + + return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi); +} + SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); @@ -3494,15 +3553,105 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { return lowerINSERT_VECTOR_ELT(Op, DAG); case ISD::EXTRACT_VECTOR_ELT: return lowerEXTRACT_VECTOR_ELT(Op, DAG); + case ISD::BUILD_VECTOR: + return lowerBUILD_VECTOR(Op, DAG); case ISD::FP_ROUND: return lowerFP_ROUND(Op, DAG); case ISD::TRAP: - case ISD::DEBUGTRAP: return lowerTRAP(Op, DAG); + case ISD::DEBUGTRAP: + return lowerDEBUGTRAP(Op, DAG); + case ISD::FABS: + case ISD::FNEG: + return splitUnaryVectorOp(Op, DAG); + case ISD::SHL: + case ISD::SRA: + case ISD::SRL: + case ISD::ADD: + case ISD::SUB: + case ISD::MUL: + case ISD::SMIN: + case ISD::SMAX: + case ISD::UMIN: + case ISD::UMAX: + case ISD::FMINNUM: + case ISD::FMAXNUM: + case ISD::FADD: + case ISD::FMUL: + return splitBinaryVectorOp(Op, DAG); } return SDValue(); } +static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT, + const SDLoc &DL, + SelectionDAG &DAG, bool Unpacked) { + if (!LoadVT.isVector()) + return Result; + + if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16. + // Truncate to v2i16/v4i16. + EVT IntLoadVT = LoadVT.changeTypeToInteger(); + + // Workaround legalizer not scalarizing truncate after vector op + // legalization byt not creating intermediate vector trunc. + SmallVector<SDValue, 4> Elts; + DAG.ExtractVectorElements(Result, Elts); + for (SDValue &Elt : Elts) + Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt); + + Result = DAG.getBuildVector(IntLoadVT, DL, Elts); + + // Bitcast to original type (v2f16/v4f16). + return DAG.getNode(ISD::BITCAST, DL, LoadVT, Result); + } + + // Cast back to the original packed type. + return DAG.getNode(ISD::BITCAST, DL, LoadVT, Result); +} + +SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode, + MemSDNode *M, + SelectionDAG &DAG, + bool IsIntrinsic) const { + SDLoc DL(M); + SmallVector<SDValue, 10> Ops; + Ops.reserve(M->getNumOperands()); + + Ops.push_back(M->getOperand(0)); + if (IsIntrinsic) + Ops.push_back(DAG.getConstant(Opcode, DL, MVT::i32)); + + // Skip 1, as it is the intrinsic ID. + for (unsigned I = 2, E = M->getNumOperands(); I != E; ++I) + Ops.push_back(M->getOperand(I)); + + bool Unpacked = Subtarget->hasUnpackedD16VMem(); + EVT LoadVT = M->getValueType(0); + + EVT EquivLoadVT = LoadVT; + if (Unpacked && LoadVT.isVector()) { + EquivLoadVT = LoadVT.isVector() ? + EVT::getVectorVT(*DAG.getContext(), MVT::i32, + LoadVT.getVectorNumElements()) : LoadVT; + } + + // Change from v4f16/v2f16 to EquivLoadVT. + SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other); + + SDValue Load + = DAG.getMemIntrinsicNode( + IsIntrinsic ? 
(unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL, + VTList, Ops, M->getMemoryVT(), + M->getMemOperand()); + if (!Unpacked) // Just adjusted the opcode. + return Load; + + SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked); + + return DAG.getMergeValues({ Adjusted, Load.getValue(1) }, DL); +} + void SITargetLowering::ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const { @@ -3554,6 +3703,15 @@ void SITargetLowering::ReplaceNodeResults(SDNode *N, } break; } + case ISD::INTRINSIC_W_CHAIN: { + if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) { + Results.push_back(Res); + Results.push_back(Res.getValue(1)); + return; + } + + break; + } case ISD::SELECT: { SDLoc SL(N); EVT VT = N->getValueType(0); @@ -3576,12 +3734,38 @@ void SITargetLowering::ReplaceNodeResults(SDNode *N, Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect)); return; } + case ISD::FNEG: { + if (N->getValueType(0) != MVT::v2f16) + break; + + SDLoc SL(N); + SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0)); + + SDValue Op = DAG.getNode(ISD::XOR, SL, MVT::i32, + BC, + DAG.getConstant(0x80008000, SL, MVT::i32)); + Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op)); + return; + } + case ISD::FABS: { + if (N->getValueType(0) != MVT::v2f16) + break; + + SDLoc SL(N); + SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0)); + + SDValue Op = DAG.getNode(ISD::AND, SL, MVT::i32, + BC, + DAG.getConstant(0x7fff7fff, SL, MVT::i32)); + Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op)); + return; + } default: break; } } -/// \brief Helper function for LowerBRCOND +/// Helper function for LowerBRCOND static SDNode *findUser(SDValue Value, unsigned Opcode) { SDNode *Parent = Value.getNode(); @@ -3646,13 +3830,15 @@ void SITargetLowering::createDebuggerPrologueStackObjects( bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const { const Triple &TT = getTargetMachine().getTargetTriple(); - return GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS && + return (GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS || + GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) && AMDGPU::shouldEmitConstantsToTextSection(TT); } bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const { return (GV->getType()->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS || - GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS) && + GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS || + GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) && !shouldEmitFixup(GV) && !getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV); } @@ -3789,40 +3975,37 @@ SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const { SDLoc SL(Op); - MachineFunction &MF = DAG.getMachineFunction(); SDValue Chain = Op.getOperand(0); - unsigned TrapID = Op.getOpcode() == ISD::DEBUGTRAP ? 
- SISubtarget::TrapIDLLVMDebugTrap : SISubtarget::TrapIDLLVMTrap; - - if (Subtarget->getTrapHandlerAbi() == SISubtarget::TrapHandlerAbiHsa && - Subtarget->isTrapHandlerEnabled()) { - SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); - unsigned UserSGPR = Info->getQueuePtrUserSGPR(); - assert(UserSGPR != AMDGPU::NoRegister); - - SDValue QueuePtr = CreateLiveInRegister( - DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64); - - SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64); - - SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01, - QueuePtr, SDValue()); + if (Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa || + !Subtarget->isTrapHandlerEnabled()) + return DAG.getNode(AMDGPUISD::ENDPGM, SL, MVT::Other, Chain); - SDValue Ops[] = { - ToReg, - DAG.getTargetConstant(TrapID, SL, MVT::i16), - SGPR01, - ToReg.getValue(1) - }; + MachineFunction &MF = DAG.getMachineFunction(); + SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); + unsigned UserSGPR = Info->getQueuePtrUserSGPR(); + assert(UserSGPR != AMDGPU::NoRegister); + SDValue QueuePtr = CreateLiveInRegister( + DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64); + SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64); + SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01, + QueuePtr, SDValue()); + SDValue Ops[] = { + ToReg, + DAG.getTargetConstant(GCNSubtarget::TrapIDLLVMTrap, SL, MVT::i16), + SGPR01, + ToReg.getValue(1) + }; + return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops); +} - return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops); - } +SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const { + SDLoc SL(Op); + SDValue Chain = Op.getOperand(0); + MachineFunction &MF = DAG.getMachineFunction(); - switch (TrapID) { - case SISubtarget::TrapIDLLVMTrap: - return DAG.getNode(AMDGPUISD::ENDPGM, SL, MVT::Other, Chain); - case SISubtarget::TrapIDLLVMDebugTrap: { + if (Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa || + !Subtarget->isTrapHandlerEnabled()) { DiagnosticInfoUnsupported NoTrap(MF.getFunction(), "debugtrap handler not supported", Op.getDebugLoc(), @@ -3831,11 +4014,12 @@ SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const { Ctx.diagnose(NoTrap); return Chain; } - default: - llvm_unreachable("unsupported trap handler type!"); - } - return Chain; + SDValue Ops[] = { + Chain, + DAG.getTargetConstant(GCNSubtarget::TrapIDLLVMDebugTrap, SL, MVT::i16) + }; + return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops); } SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL, @@ -3948,34 +4132,78 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op, SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { + SDValue Vec = Op.getOperand(0); + SDValue InsVal = Op.getOperand(1); SDValue Idx = Op.getOperand(2); + EVT VecVT = Vec.getValueType(); + EVT EltVT = VecVT.getVectorElementType(); + unsigned VecSize = VecVT.getSizeInBits(); + unsigned EltSize = EltVT.getSizeInBits(); + + + assert(VecSize <= 64); + + unsigned NumElts = VecVT.getVectorNumElements(); + SDLoc SL(Op); + auto KIdx = dyn_cast<ConstantSDNode>(Idx); + + if (NumElts == 4 && EltSize == 16 && KIdx) { + SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec); + + SDValue LoHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec, + DAG.getConstant(0, SL, MVT::i32)); + SDValue HiHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec, + DAG.getConstant(1, SL, MVT::i32)); + + 
SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf); + SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf); + + unsigned Idx = KIdx->getZExtValue(); + bool InsertLo = Idx < 2; + SDValue InsHalf = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, MVT::v2i16, + InsertLo ? LoVec : HiVec, + DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal), + DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32)); + + InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf); + + SDValue Concat = InsertLo ? + DAG.getBuildVector(MVT::v2i32, SL, { InsHalf, HiHalf }) : + DAG.getBuildVector(MVT::v2i32, SL, { LoHalf, InsHalf }); + + return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat); + } + if (isa<ConstantSDNode>(Idx)) return SDValue(); + MVT IntVT = MVT::getIntegerVT(VecSize); + // Avoid stack access for dynamic indexing. - SDLoc SL(Op); - SDValue Vec = Op.getOperand(0); - SDValue Val = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Op.getOperand(1)); + SDValue Val = InsVal; + if (InsVal.getValueType() == MVT::f16) + Val = DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal); // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec - SDValue ExtVal = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Val); + SDValue ExtVal = DAG.getNode(ISD::ZERO_EXTEND, SL, IntVT, Val); - // Convert vector index to bit-index. - SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, - DAG.getConstant(16, SL, MVT::i32)); + assert(isPowerOf2_32(EltSize)); + SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32); - SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec); + // Convert vector index to bit-index. + SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor); - SDValue BFM = DAG.getNode(ISD::SHL, SL, MVT::i32, - DAG.getConstant(0xffff, SL, MVT::i32), + SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec); + SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT, + DAG.getConstant(0xffff, SL, IntVT), ScaledIdx); - SDValue LHS = DAG.getNode(ISD::AND, SL, MVT::i32, BFM, ExtVal); - SDValue RHS = DAG.getNode(ISD::AND, SL, MVT::i32, - DAG.getNOT(SL, BFM, MVT::i32), BCVec); + SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal); + SDValue RHS = DAG.getNode(ISD::AND, SL, IntVT, + DAG.getNOT(SL, BFM, IntVT), BCVec); - SDValue BFI = DAG.getNode(ISD::OR, SL, MVT::i32, LHS, RHS); - return DAG.getNode(ISD::BITCAST, SL, Op.getValueType(), BFI); + SDValue BFI = DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS); + return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI); } SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op, @@ -3985,51 +4213,87 @@ SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op, EVT ResultVT = Op.getValueType(); SDValue Vec = Op.getOperand(0); SDValue Idx = Op.getOperand(1); + EVT VecVT = Vec.getValueType(); + unsigned VecSize = VecVT.getSizeInBits(); + EVT EltVT = VecVT.getVectorElementType(); + assert(VecSize <= 64); DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr); - // Make sure we we do any optimizations that will make it easier to fold + // Make sure we do any optimizations that will make it easier to fold // source modifiers before obscuring it with bit operations. // XXX - Why doesn't this get called when vector_shuffle is expanded? 
if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI)) return Combined; - if (const ConstantSDNode *CIdx = dyn_cast<ConstantSDNode>(Idx)) { - SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec); + unsigned EltSize = EltVT.getSizeInBits(); + assert(isPowerOf2_32(EltSize)); - if (CIdx->getZExtValue() == 1) { - Result = DAG.getNode(ISD::SRL, SL, MVT::i32, Result, - DAG.getConstant(16, SL, MVT::i32)); - } else { - assert(CIdx->getZExtValue() == 0); - } + MVT IntVT = MVT::getIntegerVT(VecSize); + SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32); + + // Convert vector index to bit-index (* EltSize) + SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor); - if (ResultVT.bitsLT(MVT::i32)) - Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Result); + SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec); + SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx); + + if (ResultVT == MVT::f16) { + SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt); return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result); } - SDValue Sixteen = DAG.getConstant(16, SL, MVT::i32); + return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT); +} - // Convert vector index to bit-index. - SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, Sixteen); +SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op, + SelectionDAG &DAG) const { + SDLoc SL(Op); + EVT VT = Op.getValueType(); + + if (VT == MVT::v4i16 || VT == MVT::v4f16) { + EVT HalfVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), 2); - SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec); - SDValue Elt = DAG.getNode(ISD::SRL, SL, MVT::i32, BC, ScaledIdx); + // Turn into pair of packed build_vectors. + // TODO: Special case for constants that can be materialized with s_mov_b64. + SDValue Lo = DAG.getBuildVector(HalfVT, SL, + { Op.getOperand(0), Op.getOperand(1) }); + SDValue Hi = DAG.getBuildVector(HalfVT, SL, + { Op.getOperand(2), Op.getOperand(3) }); - SDValue Result = Elt; - if (ResultVT.bitsLT(MVT::i32)) - Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Result); + SDValue CastLo = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Lo); + SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Hi); - return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result); + SDValue Blend = DAG.getBuildVector(MVT::v2i32, SL, { CastLo, CastHi }); + return DAG.getNode(ISD::BITCAST, SL, VT, Blend); + } + + assert(VT == MVT::v2f16 || VT == MVT::v2i16); + + SDValue Lo = Op.getOperand(0); + SDValue Hi = Op.getOperand(1); + + Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo); + Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi); + + Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo); + Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi); + + SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi, + DAG.getConstant(16, SL, MVT::i32)); + + SDValue Or = DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi); + + return DAG.getNode(ISD::BITCAST, SL, VT, Or); } bool SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const { // We can fold offsets for anything that doesn't require a GOT relocation. 
return (GA->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS || - GA->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS) && + GA->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS || + GA->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) && !shouldEmitGOTReloc(GA->getGlobal()); } @@ -4082,6 +4346,7 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI, const GlobalValue *GV = GSD->getGlobal(); if (GSD->getAddressSpace() != AMDGPUASI.CONSTANT_ADDRESS && + GSD->getAddressSpace() != AMDGPUASI.CONSTANT_ADDRESS_32BIT && GSD->getAddressSpace() != AMDGPUASI.GLOBAL_ADDRESS && // FIXME: It isn't correct to rely on the type of the pointer. This should // be removed when address space 0 is 64-bit. @@ -4134,7 +4399,7 @@ SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG, unsigned Offset) const { SDLoc SL(Op); SDValue Param = lowerKernargMemParameter(DAG, MVT::i32, MVT::i32, SL, - DAG.getEntryNode(), Offset, false); + DAG.getEntryNode(), Offset, 4, false); // The local size values will have the hi 16-bits as zero. return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param, DAG.getValueType(VT)); @@ -4158,6 +4423,245 @@ static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, return DAG.getUNDEF(VT); } +static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, + ArrayRef<SDValue> Elts) { + assert(!Elts.empty()); + MVT Type; + unsigned NumElts; + + if (Elts.size() == 1) { + Type = MVT::f32; + NumElts = 1; + } else if (Elts.size() == 2) { + Type = MVT::v2f32; + NumElts = 2; + } else if (Elts.size() <= 4) { + Type = MVT::v4f32; + NumElts = 4; + } else if (Elts.size() <= 8) { + Type = MVT::v8f32; + NumElts = 8; + } else { + assert(Elts.size() <= 16); + Type = MVT::v16f32; + NumElts = 16; + } + + SmallVector<SDValue, 16> VecElts(NumElts); + for (unsigned i = 0; i < Elts.size(); ++i) { + SDValue Elt = Elts[i]; + if (Elt.getValueType() != MVT::f32) + Elt = DAG.getBitcast(MVT::f32, Elt); + VecElts[i] = Elt; + } + for (unsigned i = Elts.size(); i < NumElts; ++i) + VecElts[i] = DAG.getUNDEF(MVT::f32); + + if (NumElts == 1) + return VecElts[0]; + return DAG.getBuildVector(Type, DL, VecElts); +} + +static bool parseCachePolicy(SDValue CachePolicy, SelectionDAG &DAG, + SDValue *GLC, SDValue *SLC) { + auto CachePolicyConst = dyn_cast<ConstantSDNode>(CachePolicy.getNode()); + if (!CachePolicyConst) + return false; + + uint64_t Value = CachePolicyConst->getZExtValue(); + SDLoc DL(CachePolicy); + if (GLC) { + *GLC = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32); + Value &= ~(uint64_t)0x1; + } + if (SLC) { + *SLC = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32); + Value &= ~(uint64_t)0x2; + } + + return Value == 0; +} + +SDValue SITargetLowering::lowerImage(SDValue Op, + const AMDGPU::ImageDimIntrinsicInfo *Intr, + SelectionDAG &DAG) const { + SDLoc DL(Op); + MachineFunction &MF = DAG.getMachineFunction(); + const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = + AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); + const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim); + + SmallVector<EVT, 2> ResultTypes(Op->value_begin(), Op->value_end()); + bool IsD16 = false; + SDValue VData; + int NumVDataDwords; + unsigned AddrIdx; // Index of first address argument + unsigned DMask; + + if (BaseOpcode->Atomic) { + VData = Op.getOperand(2); + + bool Is64Bit = VData.getValueType() == MVT::i64; + if (BaseOpcode->AtomicX2) { + SDValue VData2 = Op.getOperand(3); + VData = DAG.getBuildVector(Is64Bit ? 
MVT::v2i64 : MVT::v2i32, DL, + {VData, VData2}); + if (Is64Bit) + VData = DAG.getBitcast(MVT::v4i32, VData); + + ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32; + DMask = Is64Bit ? 0xf : 0x3; + NumVDataDwords = Is64Bit ? 4 : 2; + AddrIdx = 4; + } else { + DMask = Is64Bit ? 0x3 : 0x1; + NumVDataDwords = Is64Bit ? 2 : 1; + AddrIdx = 3; + } + } else { + unsigned DMaskIdx; + + if (BaseOpcode->Store) { + VData = Op.getOperand(2); + + MVT StoreVT = VData.getSimpleValueType(); + if (StoreVT.getScalarType() == MVT::f16) { + if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS || + !BaseOpcode->HasD16) + return Op; // D16 is unsupported for this instruction + + IsD16 = true; + VData = handleD16VData(VData, DAG); + } + + NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32; + DMaskIdx = 3; + } else { + MVT LoadVT = Op.getSimpleValueType(); + if (LoadVT.getScalarType() == MVT::f16) { + if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS || + !BaseOpcode->HasD16) + return Op; // D16 is unsupported for this instruction + + IsD16 = true; + if (LoadVT.isVector() && Subtarget->hasUnpackedD16VMem()) + ResultTypes[0] = (LoadVT == MVT::v2f16) ? MVT::v2i32 : MVT::v4i32; + } + + NumVDataDwords = (ResultTypes[0].getSizeInBits() + 31) / 32; + DMaskIdx = isa<MemSDNode>(Op) ? 2 : 1; + } + + auto DMaskConst = dyn_cast<ConstantSDNode>(Op.getOperand(DMaskIdx)); + if (!DMaskConst) + return Op; + + AddrIdx = DMaskIdx + 1; + DMask = DMaskConst->getZExtValue(); + if (!DMask && !BaseOpcode->Store) { + // Eliminate no-op loads. Stores with dmask == 0 are *not* no-op: they + // store the channels' default values. + SDValue Undef = DAG.getUNDEF(Op.getValueType()); + if (isa<MemSDNode>(Op)) + return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL); + return Undef; + } + } + + unsigned NumVAddrs = BaseOpcode->NumExtraArgs + + (BaseOpcode->Gradients ? DimInfo->NumGradients : 0) + + (BaseOpcode->Coordinates ? DimInfo->NumCoords : 0) + + (BaseOpcode->LodOrClampOrMip ? 1 : 0); + SmallVector<SDValue, 4> VAddrs; + for (unsigned i = 0; i < NumVAddrs; ++i) + VAddrs.push_back(Op.getOperand(AddrIdx + i)); + SDValue VAddr = getBuildDwordsVector(DAG, DL, VAddrs); + + SDValue True = DAG.getTargetConstant(1, DL, MVT::i1); + SDValue False = DAG.getTargetConstant(0, DL, MVT::i1); + unsigned CtrlIdx; // Index of texfailctrl argument + SDValue Unorm; + if (!BaseOpcode->Sampler) { + Unorm = True; + CtrlIdx = AddrIdx + NumVAddrs + 1; + } else { + auto UnormConst = + dyn_cast<ConstantSDNode>(Op.getOperand(AddrIdx + NumVAddrs + 2)); + if (!UnormConst) + return Op; + + Unorm = UnormConst->getZExtValue() ? 
True : False; + CtrlIdx = AddrIdx + NumVAddrs + 3; + } + + SDValue TexFail = Op.getOperand(CtrlIdx); + auto TexFailConst = dyn_cast<ConstantSDNode>(TexFail.getNode()); + if (!TexFailConst || TexFailConst->getZExtValue() != 0) + return Op; + + SDValue GLC; + SDValue SLC; + if (BaseOpcode->Atomic) { + GLC = True; // TODO no-return optimization + if (!parseCachePolicy(Op.getOperand(CtrlIdx + 1), DAG, nullptr, &SLC)) + return Op; + } else { + if (!parseCachePolicy(Op.getOperand(CtrlIdx + 1), DAG, &GLC, &SLC)) + return Op; + } + + SmallVector<SDValue, 14> Ops; + if (BaseOpcode->Store || BaseOpcode->Atomic) + Ops.push_back(VData); // vdata + Ops.push_back(VAddr); + Ops.push_back(Op.getOperand(AddrIdx + NumVAddrs)); // rsrc + if (BaseOpcode->Sampler) + Ops.push_back(Op.getOperand(AddrIdx + NumVAddrs + 1)); // sampler + Ops.push_back(DAG.getTargetConstant(DMask, DL, MVT::i32)); + Ops.push_back(Unorm); + Ops.push_back(GLC); + Ops.push_back(SLC); + Ops.push_back(False); // r128 + Ops.push_back(False); // tfe + Ops.push_back(False); // lwe + Ops.push_back(DimInfo->DA ? True : False); + if (BaseOpcode->HasD16) + Ops.push_back(IsD16 ? True : False); + if (isa<MemSDNode>(Op)) + Ops.push_back(Op.getOperand(0)); // chain + + int NumVAddrDwords = VAddr.getValueType().getSizeInBits() / 32; + int Opcode = -1; + + if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) + Opcode = AMDGPU::getMIMGOpcode(Intr->BaseOpcode, AMDGPU::MIMGEncGfx8, + NumVDataDwords, NumVAddrDwords); + if (Opcode == -1) + Opcode = AMDGPU::getMIMGOpcode(Intr->BaseOpcode, AMDGPU::MIMGEncGfx6, + NumVDataDwords, NumVAddrDwords); + assert(Opcode != -1); + + MachineSDNode *NewNode = DAG.getMachineNode(Opcode, DL, ResultTypes, Ops); + if (auto MemOp = dyn_cast<MemSDNode>(Op)) { + MachineInstr::mmo_iterator MemRefs = MF.allocateMemRefsArray(1); + *MemRefs = MemOp->getMemOperand(); + NewNode->setMemRefs(MemRefs, MemRefs + 1); + } + + if (BaseOpcode->AtomicX2) { + SmallVector<SDValue, 1> Elt; + DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1); + return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL); + } else if (IsD16 && !BaseOpcode->Store) { + MVT LoadVT = Op.getSimpleValueType(); + SDValue Adjusted = adjustLoadValueTypeImpl( + SDValue(NewNode, 0), LoadVT, DL, DAG, Subtarget->hasUnpackedD16VMem()); + return DAG.getMergeValues({Adjusted, SDValue(NewNode, 1)}, DL); + } + + return SDValue(NewNode, 0); +} + SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); @@ -4171,14 +4675,14 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, switch (IntrinsicID) { case Intrinsic::amdgcn_implicit_buffer_ptr: { - if (getSubtarget()->isAmdCodeObjectV2(MF)) + if (getSubtarget()->isAmdCodeObjectV2(MF.getFunction())) return emitNonHSAIntrinsicError(DAG, DL, VT); return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); } case Intrinsic::amdgcn_dispatch_ptr: case Intrinsic::amdgcn_queue_ptr: { - if (!Subtarget->isAmdCodeObjectV2(MF)) { + if (!Subtarget->isAmdCodeObjectV2(MF.getFunction())) { DiagnosticInfoUnsupported BadIntrin( MF.getFunction(), "unsupported hsa intrinsic without hsa target", DL.getDebugLoc()); @@ -4208,16 +4712,16 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::amdgcn_rsq: return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1)); case Intrinsic::amdgcn_rsq_legacy: - if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) + if (Subtarget->getGeneration() 
>= AMDGPUSubtarget::VOLCANIC_ISLANDS) return emitRemovedIntrinsicError(DAG, DL, VT); return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1)); case Intrinsic::amdgcn_rcp_legacy: - if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) + if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) return emitRemovedIntrinsicError(DAG, DL, VT); return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1)); case Intrinsic::amdgcn_rsq_clamp: { - if (Subtarget->getGeneration() < SISubtarget::VOLCANIC_ISLANDS) + if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1)); Type *Type = VT.getTypeForEVT(*DAG.getContext()); @@ -4235,37 +4739,37 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return emitNonHSAIntrinsicError(DAG, DL, VT); return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::NGROUPS_X, false); + SI::KernelInputOffsets::NGROUPS_X, 4, false); case Intrinsic::r600_read_ngroups_y: if (Subtarget->isAmdHsaOS()) return emitNonHSAIntrinsicError(DAG, DL, VT); return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::NGROUPS_Y, false); + SI::KernelInputOffsets::NGROUPS_Y, 4, false); case Intrinsic::r600_read_ngroups_z: if (Subtarget->isAmdHsaOS()) return emitNonHSAIntrinsicError(DAG, DL, VT); return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::NGROUPS_Z, false); + SI::KernelInputOffsets::NGROUPS_Z, 4, false); case Intrinsic::r600_read_global_size_x: if (Subtarget->isAmdHsaOS()) return emitNonHSAIntrinsicError(DAG, DL, VT); return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::GLOBAL_SIZE_X, false); + SI::KernelInputOffsets::GLOBAL_SIZE_X, 4, false); case Intrinsic::r600_read_global_size_y: if (Subtarget->isAmdHsaOS()) return emitNonHSAIntrinsicError(DAG, DL, VT); return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::GLOBAL_SIZE_Y, false); + SI::KernelInputOffsets::GLOBAL_SIZE_Y, 4, false); case Intrinsic::r600_read_global_size_z: if (Subtarget->isAmdHsaOS()) return emitNonHSAIntrinsicError(DAG, DL, VT); return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::GLOBAL_SIZE_Z, false); + SI::KernelInputOffsets::GLOBAL_SIZE_Z, 4, false); case Intrinsic::r600_read_local_size_x: if (Subtarget->isAmdHsaOS()) return emitNonHSAIntrinsicError(DAG, DL, VT); @@ -4354,7 +4858,7 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1)); case Intrinsic::amdgcn_log_clamp: { - if (Subtarget->getGeneration() < SISubtarget::VOLCANIC_ISLANDS) + if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) return SDValue(); DiagnosticInfoUnsupported BadIntrin( @@ -4439,6 +4943,9 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::amdgcn_fmed3: return DAG.getNode(AMDGPUISD::FMED3, DL, VT, Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + case Intrinsic::amdgcn_fdot2: + return DAG.getNode(AMDGPUISD::FDOT2, DL, VT, + Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); case Intrinsic::amdgcn_fmul_legacy: return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT, Op.getOperand(1), Op.getOperand(2)); @@ -4484,17 +4991,14 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return SDValue(DAG.getMachineNode(AMDGPU::WWM, DL, Src.getValueType(), 
Src), 0); } - case Intrinsic::amdgcn_image_getlod: - case Intrinsic::amdgcn_image_getresinfo: { - unsigned Idx = (IntrinsicID == Intrinsic::amdgcn_image_getresinfo) ? 3 : 4; - - // Replace dmask with everything disabled with undef. - const ConstantSDNode *DMask = dyn_cast<ConstantSDNode>(Op.getOperand(Idx)); - if (!DMask || DMask->isNullValue()) - return DAG.getUNDEF(Op.getValueType()); - return SDValue(); - } + case Intrinsic::amdgcn_fmad_ftz: + return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1), + Op.getOperand(2), Op.getOperand(3)); default: + if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = + AMDGPU::getImageDimIntrinsicInfo(IntrinsicID)) + return lowerImage(Op, ImageDimIntr, DAG); + return Op; } } @@ -4506,10 +5010,31 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, switch (IntrID) { case Intrinsic::amdgcn_atomic_inc: - case Intrinsic::amdgcn_atomic_dec: { + case Intrinsic::amdgcn_atomic_dec: + case Intrinsic::amdgcn_ds_fadd: + case Intrinsic::amdgcn_ds_fmin: + case Intrinsic::amdgcn_ds_fmax: { MemSDNode *M = cast<MemSDNode>(Op); - unsigned Opc = (IntrID == Intrinsic::amdgcn_atomic_inc) ? - AMDGPUISD::ATOMIC_INC : AMDGPUISD::ATOMIC_DEC; + unsigned Opc; + switch (IntrID) { + case Intrinsic::amdgcn_atomic_inc: + Opc = AMDGPUISD::ATOMIC_INC; + break; + case Intrinsic::amdgcn_atomic_dec: + Opc = AMDGPUISD::ATOMIC_DEC; + break; + case Intrinsic::amdgcn_ds_fadd: + Opc = AMDGPUISD::ATOMIC_LOAD_FADD; + break; + case Intrinsic::amdgcn_ds_fmin: + Opc = AMDGPUISD::ATOMIC_LOAD_FMIN; + break; + case Intrinsic::amdgcn_ds_fmax: + Opc = AMDGPUISD::ATOMIC_LOAD_FMAX; + break; + default: + llvm_unreachable("Unknown intrinsic!"); + } SDValue Ops[] = { M->getOperand(0), // Chain M->getOperand(2), // Ptr @@ -4534,13 +5059,23 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT; EVT VT = Op.getValueType(); EVT IntVT = VT.changeTypeToInteger(); - auto *M = cast<MemSDNode>(Op); + EVT LoadVT = Op.getValueType(); + bool IsD16 = LoadVT.getScalarType() == MVT::f16; + if (IsD16) + return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG); + return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT, M->getMemOperand()); } case Intrinsic::amdgcn_tbuffer_load: { MemSDNode *M = cast<MemSDNode>(Op); + EVT LoadVT = Op.getValueType(); + bool IsD16 = LoadVT.getScalarType() == MVT::f16; + if (IsD16) { + return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG); + } + SDValue Ops[] = { Op.getOperand(0), // Chain Op.getOperand(2), // rsrc @@ -4554,10 +5089,9 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, Op.getOperand(10) // slc }; - EVT VT = Op.getValueType(); - return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL, - Op->getVTList(), Ops, VT, M->getMemOperand()); + Op->getVTList(), Ops, LoadVT, + M->getMemOperand()); } case Intrinsic::amdgcn_buffer_atomic_swap: case Intrinsic::amdgcn_buffer_atomic_add: @@ -4638,65 +5172,39 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, Op->getVTList(), Ops, VT, M->getMemOperand()); } - // Basic sample. 
- case Intrinsic::amdgcn_image_sample: - case Intrinsic::amdgcn_image_sample_cl: - case Intrinsic::amdgcn_image_sample_d: - case Intrinsic::amdgcn_image_sample_d_cl: - case Intrinsic::amdgcn_image_sample_l: - case Intrinsic::amdgcn_image_sample_b: - case Intrinsic::amdgcn_image_sample_b_cl: - case Intrinsic::amdgcn_image_sample_lz: - case Intrinsic::amdgcn_image_sample_cd: - case Intrinsic::amdgcn_image_sample_cd_cl: - - // Sample with comparison. - case Intrinsic::amdgcn_image_sample_c: - case Intrinsic::amdgcn_image_sample_c_cl: - case Intrinsic::amdgcn_image_sample_c_d: - case Intrinsic::amdgcn_image_sample_c_d_cl: - case Intrinsic::amdgcn_image_sample_c_l: - case Intrinsic::amdgcn_image_sample_c_b: - case Intrinsic::amdgcn_image_sample_c_b_cl: - case Intrinsic::amdgcn_image_sample_c_lz: - case Intrinsic::amdgcn_image_sample_c_cd: - case Intrinsic::amdgcn_image_sample_c_cd_cl: - - // Sample with offsets. - case Intrinsic::amdgcn_image_sample_o: - case Intrinsic::amdgcn_image_sample_cl_o: - case Intrinsic::amdgcn_image_sample_d_o: - case Intrinsic::amdgcn_image_sample_d_cl_o: - case Intrinsic::amdgcn_image_sample_l_o: - case Intrinsic::amdgcn_image_sample_b_o: - case Intrinsic::amdgcn_image_sample_b_cl_o: - case Intrinsic::amdgcn_image_sample_lz_o: - case Intrinsic::amdgcn_image_sample_cd_o: - case Intrinsic::amdgcn_image_sample_cd_cl_o: - - // Sample with comparison and offsets. - case Intrinsic::amdgcn_image_sample_c_o: - case Intrinsic::amdgcn_image_sample_c_cl_o: - case Intrinsic::amdgcn_image_sample_c_d_o: - case Intrinsic::amdgcn_image_sample_c_d_cl_o: - case Intrinsic::amdgcn_image_sample_c_l_o: - case Intrinsic::amdgcn_image_sample_c_b_o: - case Intrinsic::amdgcn_image_sample_c_b_cl_o: - case Intrinsic::amdgcn_image_sample_c_lz_o: - case Intrinsic::amdgcn_image_sample_c_cd_o: - case Intrinsic::amdgcn_image_sample_c_cd_cl_o: { - // Replace dmask with everything disabled with undef. - const ConstantSDNode *DMask = dyn_cast<ConstantSDNode>(Op.getOperand(5)); - if (!DMask || DMask->isNullValue()) { - SDValue Undef = DAG.getUNDEF(Op.getValueType()); - return DAG.getMergeValues({ Undef, Op.getOperand(0) }, SDLoc(Op)); - } + default: + if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = + AMDGPU::getImageDimIntrinsicInfo(IntrID)) + return lowerImage(Op, ImageDimIntr, DAG); return SDValue(); } - default: - return SDValue(); +} + +SDValue SITargetLowering::handleD16VData(SDValue VData, + SelectionDAG &DAG) const { + EVT StoreVT = VData.getValueType(); + + // No change for f16 and legal vector D16 types. + if (!StoreVT.isVector()) + return VData; + + SDLoc DL(VData); + assert((StoreVT.getVectorNumElements() != 3) && "Handle v3f16"); + + if (Subtarget->hasUnpackedD16VMem()) { + // We need to unpack the packed data to store. 
+ EVT IntStoreVT = StoreVT.changeTypeToInteger(); + SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData); + + EVT EquivStoreVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, + StoreVT.getVectorNumElements()); + SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData); + return DAG.UnrollVectorOp(ZExt.getNode()); } + + assert(isTypeLegal(StoreVT)); + return VData; } SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, @@ -4786,7 +5294,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, } case Intrinsic::amdgcn_s_barrier: { if (getTargetMachine().getOptLevel() > CodeGenOpt::None) { - const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); unsigned WGSize = ST.getFlatWorkGroupSizes(MF.getFunction()).second; if (WGSize <= ST.getWavefrontSize()) return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL, MVT::Other, @@ -4841,9 +5349,13 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, } case Intrinsic::amdgcn_tbuffer_store: { + SDValue VData = Op.getOperand(2); + bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16); + if (IsD16) + VData = handleD16VData(VData, DAG); SDValue Ops[] = { Chain, - Op.getOperand(2), // vdata + VData, // vdata Op.getOperand(3), // rsrc Op.getOperand(4), // vindex Op.getOperand(5), // voffset @@ -4854,42 +5366,133 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, Op.getOperand(10), // glc Op.getOperand(11) // slc }; - EVT VT = Op.getOperand(3).getValueType(); - MachineMemOperand *MMO = MF.getMachineMemOperand( - MachinePointerInfo(), - MachineMemOperand::MOStore, - VT.getStoreSize(), 4); - return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT, DL, - Op->getVTList(), Ops, VT, MMO); + unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 : + AMDGPUISD::TBUFFER_STORE_FORMAT; + MemSDNode *M = cast<MemSDNode>(Op); + return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, + M->getMemoryVT(), M->getMemOperand()); } case Intrinsic::amdgcn_buffer_store: case Intrinsic::amdgcn_buffer_store_format: { + SDValue VData = Op.getOperand(2); + bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16); + if (IsD16) + VData = handleD16VData(VData, DAG); SDValue Ops[] = { Chain, - Op.getOperand(2), // vdata + VData, // vdata Op.getOperand(3), // rsrc Op.getOperand(4), // vindex Op.getOperand(5), // offset Op.getOperand(6), // glc Op.getOperand(7) // slc }; - EVT VT = Op.getOperand(3).getValueType(); - MachineMemOperand *MMO = MF.getMachineMemOperand( - MachinePointerInfo(), - MachineMemOperand::MOStore | - MachineMemOperand::MODereferenceable, - VT.getStoreSize(), 4); + unsigned Opc = IntrinsicID == Intrinsic::amdgcn_buffer_store ? + AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT; + Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc; + MemSDNode *M = cast<MemSDNode>(Op); + return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, + M->getMemoryVT(), M->getMemOperand()); + } + default: { + if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = + AMDGPU::getImageDimIntrinsicInfo(IntrinsicID)) + return lowerImage(Op, ImageDimIntr, DAG); - unsigned Opcode = IntrinsicID == Intrinsic::amdgcn_buffer_store ? 
- AMDGPUISD::BUFFER_STORE : - AMDGPUISD::BUFFER_STORE_FORMAT; - return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT, MMO); + return Op; } + } +} - default: +static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, + ISD::LoadExtType ExtType, SDValue Op, + const SDLoc &SL, EVT VT) { + if (VT.bitsLT(Op.getValueType())) + return DAG.getNode(ISD::TRUNCATE, SL, VT, Op); + + switch (ExtType) { + case ISD::SEXTLOAD: + return DAG.getNode(ISD::SIGN_EXTEND, SL, VT, Op); + case ISD::ZEXTLOAD: + return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, Op); + case ISD::EXTLOAD: + return DAG.getNode(ISD::ANY_EXTEND, SL, VT, Op); + case ISD::NON_EXTLOAD: return Op; } + + llvm_unreachable("invalid ext type"); +} + +SDValue SITargetLowering::widenLoad(LoadSDNode *Ld, DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + if (Ld->getAlignment() < 4 || Ld->isDivergent()) + return SDValue(); + + // FIXME: Constant loads should all be marked invariant. + unsigned AS = Ld->getAddressSpace(); + if (AS != AMDGPUASI.CONSTANT_ADDRESS && + AS != AMDGPUASI.CONSTANT_ADDRESS_32BIT && + (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant())) + return SDValue(); + + // Don't do this early, since it may interfere with adjacent load merging for + // illegal types. We can avoid losing alignment information for exotic types + // pre-legalize. + EVT MemVT = Ld->getMemoryVT(); + if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) || + MemVT.getSizeInBits() >= 32) + return SDValue(); + + SDLoc SL(Ld); + + assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) && + "unexpected vector extload"); + + // TODO: Drop only high part of range. + SDValue Ptr = Ld->getBasePtr(); + SDValue NewLoad = DAG.getLoad(ISD::UNINDEXED, ISD::NON_EXTLOAD, + MVT::i32, SL, Ld->getChain(), Ptr, + Ld->getOffset(), + Ld->getPointerInfo(), MVT::i32, + Ld->getAlignment(), + Ld->getMemOperand()->getFlags(), + Ld->getAAInfo(), + nullptr); // Drop ranges + + EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()); + if (MemVT.isFloatingPoint()) { + assert(Ld->getExtensionType() == ISD::NON_EXTLOAD && + "unexpected fp extload"); + TruncVT = MemVT.changeTypeToInteger(); + } + + SDValue Cvt = NewLoad; + if (Ld->getExtensionType() == ISD::SEXTLOAD) { + Cvt = DAG.getNode(ISD::SIGN_EXTEND_INREG, SL, MVT::i32, NewLoad, + DAG.getValueType(TruncVT)); + } else if (Ld->getExtensionType() == ISD::ZEXTLOAD || + Ld->getExtensionType() == ISD::NON_EXTLOAD) { + Cvt = DAG.getZeroExtendInReg(NewLoad, SL, TruncVT); + } else { + assert(Ld->getExtensionType() == ISD::EXTLOAD); + } + + EVT VT = Ld->getValueType(0); + EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits()); + + DCI.AddToWorklist(Cvt.getNode()); + + // We may need to handle exotic cases, such as i16->i64 extloads, so insert + // the appropriate extension from the 32-bit load. + Cvt = getLoadExtOrTrunc(DAG, Ld->getExtensionType(), Cvt, SL, IntVT); + DCI.AddToWorklist(Cvt.getNode()); + + // Handle conversion back to floating point if necessary. 
+ Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt); + + return DAG.getMergeValues({ Cvt, NewLoad.getValue(1) }, SL); } SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { @@ -4928,9 +5531,10 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { assert(Op.getValueType().getVectorElementType() == MVT::i32 && "Custom lowering for non-i32 vectors hasn't been implemented."); + unsigned Alignment = Load->getAlignment(); unsigned AS = Load->getAddressSpace(); if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT, - AS, Load->getAlignment())) { + AS, Alignment)) { SDValue Ops[2]; std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG); return DAG.getMergeValues(Ops, DL); @@ -4945,24 +5549,32 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { AMDGPUASI.PRIVATE_ADDRESS : AMDGPUASI.GLOBAL_ADDRESS; unsigned NumElements = MemVT.getVectorNumElements(); - if (AS == AMDGPUASI.CONSTANT_ADDRESS) { - if (isMemOpUniform(Load)) + + if (AS == AMDGPUASI.CONSTANT_ADDRESS || + AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT) { + if (!Op->isDivergent() && Alignment >= 4) return SDValue(); // Non-uniform loads will be selected to MUBUF instructions, so they // have the same legalization requirements as global and private // loads. // } - if (AS == AMDGPUASI.CONSTANT_ADDRESS || AS == AMDGPUASI.GLOBAL_ADDRESS) { - if (Subtarget->getScalarizeGlobalBehavior() && isMemOpUniform(Load) && - !Load->isVolatile() && isMemOpHasNoClobberedMemOperand(Load)) + + if (AS == AMDGPUASI.CONSTANT_ADDRESS || + AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT || + AS == AMDGPUASI.GLOBAL_ADDRESS) { + if (Subtarget->getScalarizeGlobalBehavior() && !Op->isDivergent() && + !Load->isVolatile() && isMemOpHasNoClobberedMemOperand(Load) && + Alignment >= 4) return SDValue(); // Non-uniform loads will be selected to MUBUF instructions, so they // have the same legalization requirements as global and private // loads. // } - if (AS == AMDGPUASI.CONSTANT_ADDRESS || AS == AMDGPUASI.GLOBAL_ADDRESS || + if (AS == AMDGPUASI.CONSTANT_ADDRESS || + AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT || + AS == AMDGPUASI.GLOBAL_ADDRESS || AS == AMDGPUASI.FLAT_ADDRESS) { if (NumElements > 4) return SplitVectorLoad(Op, DAG); @@ -4989,21 +5601,20 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { llvm_unreachable("unsupported private_element_size"); } } else if (AS == AMDGPUASI.LOCAL_ADDRESS) { - if (NumElements > 2) - return SplitVectorLoad(Op, DAG); - - if (NumElements == 2) + // Use ds_read_b128 if possible. + if (Subtarget->useDS128() && Load->getAlignment() >= 16 && + MemVT.getStoreSize() == 16) return SDValue(); - // If properly aligned, if we split we might be able to use ds_read_b64. 
- return SplitVectorLoad(Op, DAG); + if (NumElements > 2) + return SplitVectorLoad(Op, DAG); } return SDValue(); } SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { - if (Op.getValueType() != MVT::i64) - return SDValue(); + EVT VT = Op.getValueType(); + assert(VT.getSizeInBits() == 64); SDLoc DL(Op); SDValue Cond = Op.getOperand(0); @@ -5025,7 +5636,7 @@ SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1); SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi}); - return DAG.getNode(ISD::BITCAST, DL, MVT::i64, Res); + return DAG.getNode(ISD::BITCAST, DL, VT, Res); } // Catch division cases where we can use shortcuts with rcp and rsq @@ -5037,8 +5648,7 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op, SDValue RHS = Op.getOperand(1); EVT VT = Op.getValueType(); const SDNodeFlags Flags = Op->getFlags(); - bool Unsafe = DAG.getTarget().Options.UnsafeFPMath || - Flags.hasUnsafeAlgebra() || Flags.hasAllowReciprocal(); + bool Unsafe = DAG.getTarget().Options.UnsafeFPMath || Flags.hasAllowReciprocal(); if (!Unsafe && VT == MVT::f32 && Subtarget->hasFP32Denormals()) return SDValue(); @@ -5295,7 +5905,7 @@ SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const { SDValue Scale; - if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS) { + if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) { // Workaround a hardware bug on SI where the condition output from div_scale // is not usable. @@ -5393,14 +6003,14 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { llvm_unreachable("unsupported private_element_size"); } } else if (AS == AMDGPUASI.LOCAL_ADDRESS) { + // Use ds_write_b128 if possible. + if (Subtarget->useDS128() && Store->getAlignment() >= 16 && + VT.getStoreSize() == 16) + return SDValue(); + if (NumElements > 2) return SplitVectorStore(Op, DAG); - - if (NumElements == 2) - return Op; - - // If properly aligned, if we split we might be able to use ds_write_b64. - return SplitVectorStore(Op, DAG); + return SDValue(); } else { llvm_unreachable("unhandled address space"); } @@ -5474,7 +6084,7 @@ SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N, // easier if i8 vectors weren't promoted to i32 vectors, particularly after // types are legalized. v4i8 -> v4f32 is probably the only case to worry // about in practice. - if (DCI.isAfterLegalizeVectorOps() && SrcVT == MVT::i32) { + if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) { if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) { SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Src); DCI.AddToWorklist(Cvt.getNode()); @@ -5617,6 +6227,71 @@ static bool isBoolSGPR(SDValue V) { return false; } +// If a constant has all zeroes or all ones within each byte return it. +// Otherwise return 0. +static uint32_t getConstantPermuteMask(uint32_t C) { + // 0xff for any zero byte in the mask + uint32_t ZeroByteMask = 0; + if (!(C & 0x000000ff)) ZeroByteMask |= 0x000000ff; + if (!(C & 0x0000ff00)) ZeroByteMask |= 0x0000ff00; + if (!(C & 0x00ff0000)) ZeroByteMask |= 0x00ff0000; + if (!(C & 0xff000000)) ZeroByteMask |= 0xff000000; + uint32_t NonZeroByteMask = ~ZeroByteMask; // 0xff for any non-zero byte + if ((NonZeroByteMask & C) != NonZeroByteMask) + return 0; // Partial bytes selected. + return C; +} + +// Check if a node selects whole bytes from its operand 0 starting at a byte +// boundary while masking the rest. 
Returns select mask as in the v_perm_b32 +// or -1 if not succeeded. +// Note byte select encoding: +// value 0-3 selects corresponding source byte; +// value 0xc selects zero; +// value 0xff selects 0xff. +static uint32_t getPermuteMask(SelectionDAG &DAG, SDValue V) { + assert(V.getValueSizeInBits() == 32); + + if (V.getNumOperands() != 2) + return ~0; + + ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(V.getOperand(1)); + if (!N1) + return ~0; + + uint32_t C = N1->getZExtValue(); + + switch (V.getOpcode()) { + default: + break; + case ISD::AND: + if (uint32_t ConstMask = getConstantPermuteMask(C)) { + return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask); + } + break; + + case ISD::OR: + if (uint32_t ConstMask = getConstantPermuteMask(C)) { + return (0x03020100 & ~ConstMask) | ConstMask; + } + break; + + case ISD::SHL: + if (C % 8) + return ~0; + + return uint32_t((0x030201000c0c0c0cull << C) >> 32); + + case ISD::SRL: + if (C % 8) + return ~0; + + return uint32_t(0x0c0c0c0c03020100ull >> C); + } + + return ~0; +} + SDValue SITargetLowering::performAndCombine(SDNode *N, DAGCombinerInfo &DCI) const { if (DCI.isBeforeLegalize()) @@ -5663,6 +6338,20 @@ SDValue SITargetLowering::performAndCombine(SDNode *N, } } } + + // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2) + if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM && + isa<ConstantSDNode>(LHS.getOperand(2))) { + uint32_t Sel = getConstantPermuteMask(Mask); + if (!Sel) + return SDValue(); + + // Select 0xc for all zero bytes + Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c); + SDLoc DL(N); + return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0), + LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32)); + } } // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) -> @@ -5715,6 +6404,54 @@ SDValue SITargetLowering::performAndCombine(SDNode *N, LHS, DAG.getConstant(0, SDLoc(N), MVT::i32)); } + // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2) + const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); + if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() && + N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32) != -1) { + uint32_t LHSMask = getPermuteMask(DAG, LHS); + uint32_t RHSMask = getPermuteMask(DAG, RHS); + if (LHSMask != ~0u && RHSMask != ~0u) { + // Canonicalize the expression in an attempt to have fewer unique masks + // and therefore fewer registers used to hold the masks. + if (LHSMask > RHSMask) { + std::swap(LHSMask, RHSMask); + std::swap(LHS, RHS); + } + + // Select 0xc for each lane used from source operand. Zero has 0xc mask + // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range. + uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c; + uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c; + + // Check of we need to combine values from two sources within a byte. + if (!(LHSUsedLanes & RHSUsedLanes) && + // If we select high and lower word keep it for SDWA. + // TODO: teach SDWA to work with v_perm_b32 and remove the check. + !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) { + // Each byte in each mask is either selector mask 0-3, or has higher + // bits set in either of masks, which can be 0xff for 0xff or 0x0c for + // zero. If 0x0c is in either mask it shall always be 0x0c. Otherwise + // mask which is not 0xff wins. By anding both masks we have a correct + // result except that 0x0c shall be corrected to give 0x0c only. 
+ uint32_t Mask = LHSMask & RHSMask; + for (unsigned I = 0; I < 32; I += 8) { + uint32_t ByteSel = 0xff << I; + if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c) + Mask &= (0x0c << I) & 0xffffffff; + } + + // Add 4 to each active LHS lane. It will not affect any existing 0xff + // or 0x0c. + uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404); + SDLoc DL(N); + + return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, + LHS.getOperand(0), RHS.getOperand(0), + DAG.getConstant(Sel, DL, MVT::i32)); + } + } + } + return SDValue(); } @@ -5750,6 +6487,60 @@ SDValue SITargetLowering::performOrCombine(SDNode *N, return SDValue(); } + // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2) + if (isa<ConstantSDNode>(RHS) && LHS.hasOneUse() && + LHS.getOpcode() == AMDGPUISD::PERM && + isa<ConstantSDNode>(LHS.getOperand(2))) { + uint32_t Sel = getConstantPermuteMask(N->getConstantOperandVal(1)); + if (!Sel) + return SDValue(); + + Sel |= LHS.getConstantOperandVal(2); + SDLoc DL(N); + return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0), + LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32)); + } + + // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2) + const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); + if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() && + N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32) != -1) { + uint32_t LHSMask = getPermuteMask(DAG, LHS); + uint32_t RHSMask = getPermuteMask(DAG, RHS); + if (LHSMask != ~0u && RHSMask != ~0u) { + // Canonicalize the expression in an attempt to have fewer unique masks + // and therefore fewer registers used to hold the masks. + if (LHSMask > RHSMask) { + std::swap(LHSMask, RHSMask); + std::swap(LHS, RHS); + } + + // Select 0xc for each lane used from source operand. Zero has 0xc mask + // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range. + uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c; + uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c; + + // Check of we need to combine values from two sources within a byte. + if (!(LHSUsedLanes & RHSUsedLanes) && + // If we select high and lower word keep it for SDWA. + // TODO: teach SDWA to work with v_perm_b32 and remove the check. + !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) { + // Kill zero bytes selected by other mask. Zero value is 0xc. 
+ LHSMask &= ~RHSUsedLanes; + RHSMask &= ~LHSUsedLanes; + // Add 4 to each active LHS lane + LHSMask |= LHSUsedLanes & 0x04040404; + // Combine masks + uint32_t Sel = LHSMask | RHSMask; + SDLoc DL(N); + + return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, + LHS.getOperand(0), RHS.getOperand(0), + DAG.getConstant(Sel, DL, MVT::i32)); + } + } + } + if (VT != MVT::i64) return SDValue(); @@ -5856,6 +6647,7 @@ static bool fp16SrcZerosHighBits(unsigned Opc) { case AMDGPUISD::FMAD_FTZ: case AMDGPUISD::RCP: case AMDGPUISD::RSQ: + case AMDGPUISD::RCP_IFLAG: case AMDGPUISD::LDEXP: return true; default: @@ -5908,6 +6700,23 @@ SDValue SITargetLowering::performClassCombine(SDNode *N, return SDValue(); } +SDValue SITargetLowering::performRcpCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + EVT VT = N->getValueType(0); + SDValue N0 = N->getOperand(0); + + if (N0.isUndef()) + return N0; + + if (VT == MVT::f32 && (N0.getOpcode() == ISD::UINT_TO_FP || + N0.getOpcode() == ISD::SINT_TO_FP)) { + return DCI.DAG.getNode(AMDGPUISD::RCP_IFLAG, SDLoc(N), VT, N0, + N->getFlags()); + } + + return AMDGPUTargetLowering::performRcpCombine(N, DCI); +} + static bool isKnownNeverSNan(SelectionDAG &DAG, SDValue Op) { if (!DAG.getTargetLoweringInfo().hasFloatingPointExceptions()) return true; @@ -5916,7 +6725,7 @@ static bool isKnownNeverSNan(SelectionDAG &DAG, SDValue Op) { } static bool isCanonicalized(SelectionDAG &DAG, SDValue Op, - const SISubtarget *ST, unsigned MaxDepth=5) { + const GCNSubtarget *ST, unsigned MaxDepth=5) { // If source is a result of another standard FP operation it is already in // canonical form. @@ -6174,7 +6983,7 @@ SDValue SITargetLowering::performMinMaxCombine(SDNode *N, if (Opc != AMDGPUISD::FMIN_LEGACY && Opc != AMDGPUISD::FMAX_LEGACY && - VT != MVT::f64 && + !VT.isVector() && VT != MVT::f64 && ((VT != MVT::f16 && VT != MVT::i16) || Subtarget->hasMin3Max3_16())) { // max(max(a, b), c) -> max3(a, b, c) // min(min(a, b), c) -> min3(a, b, c) @@ -6294,15 +7103,87 @@ SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N, SDValue SITargetLowering::performExtractVectorEltCombine( SDNode *N, DAGCombinerInfo &DCI) const { SDValue Vec = N->getOperand(0); - SelectionDAG &DAG = DCI.DAG; - if (Vec.getOpcode() == ISD::FNEG && allUsesHaveSourceMods(N)) { + + EVT VecVT = Vec.getValueType(); + EVT EltVT = VecVT.getVectorElementType(); + + if ((Vec.getOpcode() == ISD::FNEG || + Vec.getOpcode() == ISD::FABS) && allUsesHaveSourceMods(N)) { SDLoc SL(N); EVT EltVT = N->getValueType(0); SDValue Idx = N->getOperand(1); SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec.getOperand(0), Idx); - return DAG.getNode(ISD::FNEG, SL, EltVT, Elt); + return DAG.getNode(Vec.getOpcode(), SL, EltVT, Elt); + } + + // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx) + // => + // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx) + // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx) + // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt + if (Vec.hasOneUse() && DCI.isBeforeLegalize()) { + SDLoc SL(N); + EVT EltVT = N->getValueType(0); + SDValue Idx = N->getOperand(1); + unsigned Opc = Vec.getOpcode(); + + switch(Opc) { + default: + return SDValue(); + // TODO: Support other binary operations. 
+ case ISD::FADD: + case ISD::ADD: + case ISD::UMIN: + case ISD::UMAX: + case ISD::SMIN: + case ISD::SMAX: + case ISD::FMAXNUM: + case ISD::FMINNUM: + return DAG.getNode(Opc, SL, EltVT, + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, + Vec.getOperand(0), Idx), + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, + Vec.getOperand(1), Idx)); + } + } + + if (!DCI.isBeforeLegalize()) + return SDValue(); + + unsigned VecSize = VecVT.getSizeInBits(); + unsigned EltSize = EltVT.getSizeInBits(); + + // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit + // elements. This exposes more load reduction opportunities by replacing + // multiple small extract_vector_elements with a single 32-bit extract. + auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1)); + if (EltSize <= 16 && + EltVT.isByteSized() && + VecSize > 32 && + VecSize % 32 == 0 && + Idx) { + EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT); + + unsigned BitIndex = Idx->getZExtValue() * EltSize; + unsigned EltIdx = BitIndex / 32; + unsigned LeftoverBitIdx = BitIndex % 32; + SDLoc SL(N); + + SDValue Cast = DAG.getNode(ISD::BITCAST, SL, NewVT, Vec); + DCI.AddToWorklist(Cast.getNode()); + + SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast, + DAG.getConstant(EltIdx, SL, MVT::i32)); + DCI.AddToWorklist(Elt.getNode()); + SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt, + DAG.getConstant(LeftoverBitIdx, SL, MVT::i32)); + DCI.AddToWorklist(Srl.getNode()); + + SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, EltVT.changeTypeToInteger(), Srl); + DCI.AddToWorklist(Trunc.getNode()); + return DAG.getNode(ISD::BITCAST, SL, EltVT, Trunc); } return SDValue(); @@ -6363,8 +7244,8 @@ unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG, const TargetOptions &Options = DAG.getTarget().Options; if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath || - (N0->getFlags().hasUnsafeAlgebra() && - N1->getFlags().hasUnsafeAlgebra())) && + (N0->getFlags().hasAllowContract() && + N1->getFlags().hasAllowContract())) && isFMAFasterThanFMulAndFAdd(VT)) { return ISD::FMA; } @@ -6420,7 +7301,7 @@ SDValue SITargetLowering::performAddCombine(SDNode *N, return SDValue(); } - if (VT != MVT::i32) + if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG()) return SDValue(); // add x, zext (setcc) => addcarry x, 0, setcc @@ -6596,6 +7477,79 @@ SDValue SITargetLowering::performFSubCombine(SDNode *N, return SDValue(); } +SDValue SITargetLowering::performFMACombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + EVT VT = N->getValueType(0); + SDLoc SL(N); + + if (!Subtarget->hasDLInsts() || VT != MVT::f32) + return SDValue(); + + // FMA((F32)S0.x, (F32)S1. x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) -> + // FDOT2((V2F16)S0, (V2F16)S1, (F32)z)) + SDValue Op1 = N->getOperand(0); + SDValue Op2 = N->getOperand(1); + SDValue FMA = N->getOperand(2); + + if (FMA.getOpcode() != ISD::FMA || + Op1.getOpcode() != ISD::FP_EXTEND || + Op2.getOpcode() != ISD::FP_EXTEND) + return SDValue(); + + // fdot2_f32_f16 always flushes fp32 denormal operand and output to zero, + // regardless of the denorm mode setting. Therefore, unsafe-fp-math/fp-contract + // is sufficient to allow generaing fdot2. 
+ const TargetOptions &Options = DAG.getTarget().Options; + if (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath || + (N->getFlags().hasAllowContract() && + FMA->getFlags().hasAllowContract())) { + Op1 = Op1.getOperand(0); + Op2 = Op2.getOperand(0); + if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT || + Op2.getOpcode() != ISD::EXTRACT_VECTOR_ELT) + return SDValue(); + + SDValue Vec1 = Op1.getOperand(0); + SDValue Idx1 = Op1.getOperand(1); + SDValue Vec2 = Op2.getOperand(0); + + SDValue FMAOp1 = FMA.getOperand(0); + SDValue FMAOp2 = FMA.getOperand(1); + SDValue FMAAcc = FMA.getOperand(2); + + if (FMAOp1.getOpcode() != ISD::FP_EXTEND || + FMAOp2.getOpcode() != ISD::FP_EXTEND) + return SDValue(); + + FMAOp1 = FMAOp1.getOperand(0); + FMAOp2 = FMAOp2.getOperand(0); + if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT || + FMAOp2.getOpcode() != ISD::EXTRACT_VECTOR_ELT) + return SDValue(); + + SDValue Vec3 = FMAOp1.getOperand(0); + SDValue Vec4 = FMAOp2.getOperand(0); + SDValue Idx2 = FMAOp1.getOperand(1); + + if (Idx1 != Op2.getOperand(1) || Idx2 != FMAOp2.getOperand(1) || + // Idx1 and Idx2 cannot be the same. + Idx1 == Idx2) + return SDValue(); + + if (Vec1 == Vec2 || Vec3 == Vec4) + return SDValue(); + + if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16) + return SDValue(); + + if ((Vec1 == Vec3 && Vec2 == Vec4) || + (Vec1 == Vec4 && Vec2 == Vec3)) + return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc); + } + return SDValue(); +} + SDValue SITargetLowering::performSetCCCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -6615,23 +7569,49 @@ SDValue SITargetLowering::performSetCCCombine(SDNode *N, } } - if (CRHS && VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND && - isBoolSGPR(LHS.getOperand(0))) { - // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1 - // setcc (sext from i1 cc), -1, eq|sle|uge) => cc - // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1 - // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc - if ((CRHS->isAllOnesValue() && - (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) || - (CRHS->isNullValue() && - (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE))) - return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0), - DAG.getConstant(-1, SL, MVT::i1)); - if ((CRHS->isAllOnesValue() && - (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) || - (CRHS->isNullValue() && - (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT))) - return LHS.getOperand(0); + if (CRHS) { + if (VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND && + isBoolSGPR(LHS.getOperand(0))) { + // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1 + // setcc (sext from i1 cc), -1, eq|sle|uge) => cc + // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1 + // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc + if ((CRHS->isAllOnesValue() && + (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) || + (CRHS->isNullValue() && + (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE))) + return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0), + DAG.getConstant(-1, SL, MVT::i1)); + if ((CRHS->isAllOnesValue() && + (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) || + (CRHS->isNullValue() && + (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT))) + return LHS.getOperand(0); + } + + uint64_t CRHSVal = CRHS->getZExtValue(); + if ((CC == ISD::SETEQ || CC == ISD::SETNE) && + LHS.getOpcode() == 
ISD::SELECT && + isa<ConstantSDNode>(LHS.getOperand(1)) && + isa<ConstantSDNode>(LHS.getOperand(2)) && + LHS.getConstantOperandVal(1) != LHS.getConstantOperandVal(2) && + isBoolSGPR(LHS.getOperand(0))) { + // Given CT != FT: + // setcc (select cc, CT, CF), CF, eq => xor cc, -1 + // setcc (select cc, CT, CF), CF, ne => cc + // setcc (select cc, CT, CF), CT, ne => xor cc, -1 + // setcc (select cc, CT, CF), CT, eq => cc + uint64_t CT = LHS.getConstantOperandVal(1); + uint64_t CF = LHS.getConstantOperandVal(2); + + if ((CF == CRHSVal && CC == ISD::SETEQ) || + (CT == CRHSVal && CC == ISD::SETNE)) + return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0), + DAG.getConstant(-1, SL, MVT::i1)); + if ((CF == CRHSVal && CC == ISD::SETNE) || + (CT == CRHSVal && CC == ISD::SETEQ)) + return LHS.getOperand(0); + } } if (VT != MVT::f32 && VT != MVT::f64 && (Subtarget->has16BitInsts() && @@ -6700,6 +7680,29 @@ SDValue SITargetLowering::performCvtF32UByteNCombine(SDNode *N, return SDValue(); } +SDValue SITargetLowering::performClampCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0)); + if (!CSrc) + return SDValue(); + + const APFloat &F = CSrc->getValueAPF(); + APFloat Zero = APFloat::getZero(F.getSemantics()); + APFloat::cmpResult Cmp0 = F.compare(Zero); + if (Cmp0 == APFloat::cmpLessThan || + (Cmp0 == APFloat::cmpUnordered && Subtarget->enableDX10Clamp())) { + return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0)); + } + + APFloat One(F.getSemantics(), "1.0"); + APFloat::cmpResult Cmp1 = F.compare(One); + if (Cmp1 == APFloat::cmpGreaterThan) + return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0)); + + return SDValue(CSrc, 0); +} + + SDValue SITargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { switch (N->getOpcode()) { @@ -6731,7 +7734,13 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, return performMinMaxCombine(N, DCI); break; } - case ISD::LOAD: + case ISD::FMA: + return performFMACombine(N, DCI); + case ISD::LOAD: { + if (SDValue Widended = widenLoad(cast<LoadSDNode>(N), DCI)) + return Widended; + LLVM_FALLTHROUGH; + } case ISD::STORE: case ISD::ATOMIC_LOAD: case ISD::ATOMIC_STORE: @@ -6749,7 +7758,10 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, case ISD::ATOMIC_LOAD_UMIN: case ISD::ATOMIC_LOAD_UMAX: case AMDGPUISD::ATOMIC_INC: - case AMDGPUISD::ATOMIC_DEC: // TODO: Target mem intrinsics. + case AMDGPUISD::ATOMIC_DEC: + case AMDGPUISD::ATOMIC_LOAD_FADD: + case AMDGPUISD::ATOMIC_LOAD_FMIN: + case AMDGPUISD::ATOMIC_LOAD_FMAX: // TODO: Target mem intrinsics. 
if (DCI.isBeforeLegalize()) break; return performMemSDNodeCombine(cast<MemSDNode>(N), DCI); @@ -6765,11 +7777,13 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, return performClassCombine(N, DCI); case ISD::FCANONICALIZE: return performFCanonicalizeCombine(N, DCI); - case AMDGPUISD::FRACT: case AMDGPUISD::RCP: + return performRcpCombine(N, DCI); + case AMDGPUISD::FRACT: case AMDGPUISD::RSQ: case AMDGPUISD::RCP_LEGACY: case AMDGPUISD::RSQ_LEGACY: + case AMDGPUISD::RCP_IFLAG: case AMDGPUISD::RSQ_CLAMP: case AMDGPUISD::LDEXP: { SDValue Src = N->getOperand(0); @@ -6789,6 +7803,8 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, return performFMed3Combine(N, DCI); case AMDGPUISD::CVT_PKRTZ_F16_F32: return performCvtPkRTZCombine(N, DCI); + case AMDGPUISD::CLAMP: + return performClampCombine(N, DCI); case ISD::SCALAR_TO_VECTOR: { SelectionDAG &DAG = DCI.DAG; EVT VT = N->getValueType(0); @@ -6815,7 +7831,7 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); } -/// \brief Helper function for adjustWritemask +/// Helper function for adjustWritemask static unsigned SubIdx2Lane(unsigned Idx) { switch (Idx) { default: return 0; @@ -6826,12 +7842,19 @@ static unsigned SubIdx2Lane(unsigned Idx) { } } -/// \brief Adjust the writemask of MIMG instructions +/// Adjust the writemask of MIMG instructions SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node, SelectionDAG &DAG) const { + unsigned Opcode = Node->getMachineOpcode(); + + // Subtract 1 because the vdata output is not a MachineSDNode operand. + int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1; + if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx)) + return Node; // not implemented for D16 + SDNode *Users[4] = { nullptr }; unsigned Lane = 0; - unsigned DmaskIdx = (Node->getNumOperands() - Node->getNumValues() == 9) ? 2 : 3; + unsigned DmaskIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1; unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx); unsigned NewDmask = 0; bool HasChain = Node->getNumValues() > 1; @@ -6881,9 +7904,7 @@ SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node, unsigned BitsSet = countPopulation(NewDmask); - const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); - int NewOpcode = AMDGPU::getMaskedMIMGOp(*TII, - Node->getMachineOpcode(), BitsSet); + int NewOpcode = AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), BitsSet); assert(NewOpcode != -1 && NewOpcode != static_cast<int>(Node->getMachineOpcode()) && "failed to find equivalent MIMG op"); @@ -6948,7 +7969,7 @@ static bool isFrameIndexOp(SDValue Op) { return isa<FrameIndexSDNode>(Op); } -/// \brief Legalize target independent instructions (e.g. INSERT_SUBREG) +/// Legalize target independent instructions (e.g. INSERT_SUBREG) /// with frame index operands. /// LLVM assumes that inputs are to these instructions are registers. SDNode *SITargetLowering::legalizeTargetIndependentNode(SDNode *Node, @@ -6995,7 +8016,7 @@ SDNode *SITargetLowering::legalizeTargetIndependentNode(SDNode *Node, return DAG.UpdateNodeOperands(Node, Ops); } -/// \brief Fold the instructions after selecting them. +/// Fold the instructions after selecting them. /// Returns null if users were already updated. 
SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node, SelectionDAG &DAG) const { @@ -7069,7 +8090,7 @@ SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node, return Node; } -/// \brief Assign the register class depending on the number of +/// Assign the register class depending on the number of /// bits set in the writemask void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const { @@ -7156,7 +8177,7 @@ MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG, return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1); } -/// \brief Return a resource descriptor with the 'Add TID' bit enabled +/// Return a resource descriptor with the 'Add TID' bit enabled /// The TID (Thread ID) is multiplied by the stride value (bits [61:48] /// of the resource descriptor) to create an offset, which is added to /// the resource pointer. @@ -7198,11 +8219,11 @@ std::pair<unsigned, const TargetRegisterClass *> SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const { - if (!isTypeLegal(VT)) - return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); - + const TargetRegisterClass *RC = nullptr; if (Constraint.size() == 1) { switch (Constraint[0]) { + default: + return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); case 's': case 'r': switch (VT.getSizeInBits()) { @@ -7210,40 +8231,56 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, return std::make_pair(0U, nullptr); case 32: case 16: - return std::make_pair(0U, &AMDGPU::SReg_32_XM0RegClass); + RC = &AMDGPU::SReg_32_XM0RegClass; + break; case 64: - return std::make_pair(0U, &AMDGPU::SGPR_64RegClass); + RC = &AMDGPU::SGPR_64RegClass; + break; case 128: - return std::make_pair(0U, &AMDGPU::SReg_128RegClass); + RC = &AMDGPU::SReg_128RegClass; + break; case 256: - return std::make_pair(0U, &AMDGPU::SReg_256RegClass); + RC = &AMDGPU::SReg_256RegClass; + break; case 512: - return std::make_pair(0U, &AMDGPU::SReg_512RegClass); + RC = &AMDGPU::SReg_512RegClass; + break; } - + break; case 'v': switch (VT.getSizeInBits()) { default: return std::make_pair(0U, nullptr); case 32: case 16: - return std::make_pair(0U, &AMDGPU::VGPR_32RegClass); + RC = &AMDGPU::VGPR_32RegClass; + break; case 64: - return std::make_pair(0U, &AMDGPU::VReg_64RegClass); + RC = &AMDGPU::VReg_64RegClass; + break; case 96: - return std::make_pair(0U, &AMDGPU::VReg_96RegClass); + RC = &AMDGPU::VReg_96RegClass; + break; case 128: - return std::make_pair(0U, &AMDGPU::VReg_128RegClass); + RC = &AMDGPU::VReg_128RegClass; + break; case 256: - return std::make_pair(0U, &AMDGPU::VReg_256RegClass); + RC = &AMDGPU::VReg_256RegClass; + break; case 512: - return std::make_pair(0U, &AMDGPU::VReg_512RegClass); + RC = &AMDGPU::VReg_512RegClass; + break; } + break; } + // We actually support i128, i16 and f16 as inline parameters + // even if they are not reported as legal + if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 || + VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16)) + return std::make_pair(0U, RC); } if (Constraint.size() > 1) { - const TargetRegisterClass *RC = nullptr; if (Constraint[1] == 'v') { RC = &AMDGPU::VGPR_32RegClass; } else if (Constraint[1] == 's') { @@ -7280,8 +8317,7 @@ void SITargetLowering::finalizeLowering(MachineFunction &MF) const { MachineRegisterInfo &MRI = MF.getRegInfo(); SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); const MachineFrameInfo &MFI = 
MF.getFrameInfo(); - const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); - const SIRegisterInfo *TRI = ST.getRegisterInfo(); + const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); if (Info->isEntryFunction()) { // Callable functions have fixed registers used for stack access. @@ -7311,6 +8347,8 @@ void SITargetLowering::finalizeLowering(MachineFunction &MF) const { MRI.replaceRegWith(AMDGPU::SCRATCH_WAVE_OFFSET_REG, Info->getScratchWaveOffsetReg()); + Info->limitOccupancy(MF); + TargetLoweringBase::finalizeLowering(MF); } @@ -7331,3 +8369,69 @@ void SITargetLowering::computeKnownBitsForFrameIndex(const SDValue Op, // calculation won't overflow, so assume the sign bit is never set. Known.Zero.setHighBits(AssumeFrameIndexHighZeroBits); } + +bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode * N, + FunctionLoweringInfo * FLI, DivergenceAnalysis * DA) const +{ + switch (N->getOpcode()) { + case ISD::Register: + case ISD::CopyFromReg: + { + const RegisterSDNode *R = nullptr; + if (N->getOpcode() == ISD::Register) { + R = dyn_cast<RegisterSDNode>(N); + } + else { + R = dyn_cast<RegisterSDNode>(N->getOperand(1)); + } + if (R) + { + const MachineFunction * MF = FLI->MF; + const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); + const MachineRegisterInfo &MRI = MF->getRegInfo(); + const SIRegisterInfo &TRI = ST.getInstrInfo()->getRegisterInfo(); + unsigned Reg = R->getReg(); + if (TRI.isPhysicalRegister(Reg)) + return TRI.isVGPR(MRI, Reg); + + if (MRI.isLiveIn(Reg)) { + // workitem.id.x workitem.id.y workitem.id.z + // Any VGPR formal argument is also considered divergent + if (TRI.isVGPR(MRI, Reg)) + return true; + // Formal arguments of non-entry functions + // are conservatively considered divergent + else if (!AMDGPU::isEntryFunctionCC(FLI->Fn->getCallingConv())) + return true; + } + return !DA || DA->isDivergent(FLI->getValueFromVirtualReg(Reg)); + } + } + break; + case ISD::LOAD: { + const LoadSDNode *L = dyn_cast<LoadSDNode>(N); + if (L->getMemOperand()->getAddrSpace() == + Subtarget->getAMDGPUAS().PRIVATE_ADDRESS) + return true; + } break; + case ISD::CALLSEQ_END: + return true; + break; + case ISD::INTRINSIC_WO_CHAIN: + { + + } + return AMDGPU::isIntrinsicSourceOfDivergence( + cast<ConstantSDNode>(N->getOperand(0))->getZExtValue()); + case ISD::INTRINSIC_W_CHAIN: + return AMDGPU::isIntrinsicSourceOfDivergence( + cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()); + // In some cases intrinsics that are a source of divergence have been + // lowered to AMDGPUISD so we also need to check those too. + case AMDGPUISD::INTERP_MOV: + case AMDGPUISD::INTERP_P1: + case AMDGPUISD::INTERP_P2: + return true; + } + return false; +}
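
Note on the setcc-of-select fold in the combine hunk above: since CT != CF, comparing the select result against either constant uniquely recovers the selector, so the setcc collapses to cc or to xor cc, -1 (logical negation on i1). The following standalone C++ check of that identity is illustrative only and not part of the patch; selectVal and the constant values are invented names.

    // Sketch only: verifies the four rewrites listed in the comment block,
    // using plain integers and bools in place of SDNodes.
    #include <cassert>
    #include <cstdint>
    #include <initializer_list>

    static uint64_t selectVal(bool cc, uint64_t CT, uint64_t CF) {
      return cc ? CT : CF;                  // models (select cc, CT, CF)
    }

    int main() {
      const uint64_t CT = 7, CF = 3;        // any pair with CT != CF works
      for (bool cc : {false, true}) {
        const uint64_t v = selectVal(cc, CT, CF);
        assert((v == CF) == !cc);           // setcc ..., CF, eq => xor cc, -1
        assert((v != CF) ==  cc);           // setcc ..., CF, ne => cc
        assert((v == CT) ==  cc);           // setcc ..., CT, eq => cc
        assert((v != CT) == !cc);           // setcc ..., CT, ne => xor cc, -1
      }
      return 0;
    }

On i1 values, xor with -1 is logical negation, which is why the eq-against-CF and ne-against-CT cases become xor cc, -1 rather than a plain copy of cc.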
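
Note on the added performClampCombine: it only fires when the clamp operand is already a floating-point constant and folds it to 0.0, 1.0, or the constant itself. A minimal sketch of that decision, assuming plain float stands in for APFloat and a dx10Clamp flag stands in for Subtarget->enableDX10Clamp(); foldClampOfConstant is an invented name.

    #include <cmath>

    // Sketch of how a clamp of a constant folds; not the LLVM implementation.
    static float foldClampOfConstant(float f, bool dx10Clamp) {
      if (std::isnan(f))               // compare against 0.0 is unordered
        return dx10Clamp ? 0.0f : f;   // DX10 clamp sends NaN to 0.0; otherwise
                                       // the NaN constant is kept unchanged
      if (f < 0.0f)                    // cmpLessThan against zero
        return 0.0f;
      if (f > 1.0f)                    // cmpGreaterThan against one
        return 1.0f;
      return f;                        // already in [0, 1]: clamp(c) == c
    }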
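
Note on adjustWritemask: the hunk above only changes how the dmask and d16 operand indices are found (named-operand lookups, with D16 images left alone), but the narrowing idea itself may be easier to see outside the DAG: result lane N corresponds to the (N+1)-th set bit of the old dmask, only the bits whose lanes actually have users are kept, and the surviving lanes are repacked from zero. A loose, self-contained restatement under those assumptions; narrowDmask, LaneUsed and NewLane are invented names, and the real code additionally rewrites the users and picks a new MIMG opcode via getMaskedMIMGOp from the popcount.

    #include <bitset>

    // Old dmask bit feeding result lane `Lane`: the (Lane+1)-th set bit.
    static unsigned dmaskBitForLane(unsigned OldDmask, unsigned Lane) {
      unsigned Bits = OldDmask, Bit = 0;
      for (unsigned i = 0; i <= Lane; ++i) {
        Bit = Bits & ~(Bits - 1);       // lowest set bit still remaining
        Bits &= ~Bit;
      }
      return Bit;                       // 0 if the lane has no dmask bit
    }

    // Keep only the dmask bits whose lanes are used; NewLane[i] is the
    // repacked index of old lane i, or -1 if that lane is dropped.
    static unsigned narrowDmask(unsigned OldDmask, const bool LaneUsed[4],
                                int NewLane[4]) {
      unsigned NewDmask = 0;
      for (unsigned Lane = 0; Lane < 4; ++Lane) {
        NewLane[Lane] = -1;
        if (!LaneUsed[Lane])
          continue;
        NewLane[Lane] = static_cast<int>(std::bitset<32>(NewDmask).count());
        NewDmask |= dmaskBitForLane(OldDmask, Lane);
      }
      return NewDmask;
    }

For example, OldDmask = 0xf with only lanes 0 and 2 used gives NewDmask = 0x5 and a two-component result, which is the BitsSet value fed to the opcode lookup.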
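
Note on the reworked getRegForInlineAsmConstraint: the register class is now chosen first and the type check (extended to also accept i128, i16 and f16) is applied afterwards. The class selection itself is just a (constraint letter, bit width) table; a hypothetical summary follows, with regClassForConstraint an invented helper and the classes written as plain strings.

    #include <string>

    // 's'/'r' pick scalar (SGPR) classes, 'v' picks vector (VGPR) classes.
    static std::string regClassForConstraint(char C, unsigned SizeInBits) {
      if (C == 's' || C == 'r') {
        switch (SizeInBits) {
        case 16: case 32: return "SReg_32_XM0";
        case 64:          return "SGPR_64";
        case 128:         return "SReg_128";
        case 256:         return "SReg_256";
        case 512:         return "SReg_512";
        }
      } else if (C == 'v') {
        switch (SizeInBits) {
        case 16: case 32: return "VGPR_32";
        case 64:          return "VReg_64";
        case 96:          return "VReg_96";
        case 128:         return "VReg_128";
        case 256:         return "VReg_256";
        case 512:         return "VReg_512";
        }
      }
      return "";  // unknown letter or width: defer to the generic handling
    }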
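
Note on the new isSDNodeSourceOfDivergence hook: it encodes a short list of rules for which selected values must be treated as divergent. A rough summary of those rules as a table-style function; the enum and all names below are invented for illustration, and the real code inspects SDNodes, register classes and the DivergenceAnalysis result rather than a pre-classified kind.

    // Illustration only: value kinds the hook reports as divergent.
    enum class ValueKind {
      PhysicalVGPR,        // Register/CopyFromReg of a physical VGPR
      VGPRLiveIn,          // live-in virtual reg mapped to a VGPR (workitem.id.*)
      NonEntryFormalArg,   // formal argument of a callable (non-entry) function
      PrivateLoad,         // load from the private (scratch) address space
      CallResult,          // CALLSEQ_END
      DivergentIntrinsic,  // per AMDGPU::isIntrinsicSourceOfDivergence
      InterpNode,          // AMDGPUISD::INTERP_MOV / INTERP_P1 / INTERP_P2
      Other                // everything else: uniform unless the analysis says so
    };

    static bool isSourceOfDivergence(ValueKind K) {
      switch (K) {
      case ValueKind::PhysicalVGPR:
      case ValueKind::VGPRLiveIn:
      case ValueKind::NonEntryFormalArg:
      case ValueKind::PrivateLoad:
      case ValueKind::CallResult:
      case ValueKind::DivergentIntrinsic:
      case ValueKind::InterpNode:
        return true;
      case ValueKind::Other:
        return false;
      }
      return false;
    }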