Diffstat (limited to 'llvm/lib/Target/AMDGPU')
70 files changed, 1481 insertions, 916 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index e606f0e8fc3c..806c0b18637a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -610,12 +610,6 @@ def FeatureDsSrc2Insts : SubtargetFeature<"ds-src2-insts", "Has ds_*_src2 instructions" >; -def FeatureRegisterBanking : SubtargetFeature<"register-banking", - "HasRegisterBanking", - "true", - "Has register banking" ->; - def FeatureVOP3Literal : SubtargetFeature<"vop3-literal", "HasVOP3Literal", "true", @@ -826,7 +820,7 @@ def FeatureGFX10 : GCNSubtargetFeatureGeneration<"GFX10", FeatureSDWA, FeatureSDWAOmod, FeatureSDWAScalar, FeatureSDWASdst, FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts, FeatureAddNoCarryInsts, FeatureFmaMixInsts, FeatureGFX8Insts, - FeatureNoSdstCMPX, FeatureVscnt, FeatureRegisterBanking, + FeatureNoSdstCMPX, FeatureVscnt, FeatureVOP3Literal, FeatureDPP8, FeatureExtendedImageInsts, FeatureNoDataDepHazard, FeaturePkFmacF16Inst, FeatureGFX10A16, FeatureSMemTimeInst, FeatureFastDenormalF32, FeatureG16, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h b/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h index 22be014813b0..5ba9b2cd187e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h @@ -26,7 +26,7 @@ class AMDGPUAAResult : public AAResultBase<AMDGPUAAResult> { const DataLayout &DL; public: - explicit AMDGPUAAResult(const DataLayout &DL) : AAResultBase(), DL(DL) {} + explicit AMDGPUAAResult(const DataLayout &DL) : DL(DL) {} AMDGPUAAResult(AMDGPUAAResult &&Arg) : AAResultBase(std::move(Arg)), DL(Arg.DL) {} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp index 2f1e7823f65c..cd084fd5440a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp @@ -192,8 +192,20 @@ struct AMDGPUOutgoingArgHandler : public AMDGPUOutgoingValueHandler { const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); - if (!SPReg) - SPReg = MIRBuilder.buildCopy(PtrTy, MFI->getStackPtrOffsetReg()).getReg(0); + if (!SPReg) { + const GCNSubtarget &ST = MIRBuilder.getMF().getSubtarget<GCNSubtarget>(); + if (ST.enableFlatScratch()) { + // The stack is accessed unswizzled, so we can use a regular copy. + SPReg = MIRBuilder.buildCopy(PtrTy, + MFI->getStackPtrOffsetReg()).getReg(0); + } else { + // The address we produce here, without knowing the use context, is going + // to be interpreted as a vector address, so we need to convert to a + // swizzled address. 
+ SPReg = MIRBuilder.buildInstr(AMDGPU::G_AMDGPU_WAVE_ADDRESS, {PtrTy}, + {MFI->getStackPtrOffsetReg()}).getReg(0); + } + } auto OffsetReg = MIRBuilder.buildConstant(S32, Offset); @@ -615,6 +627,13 @@ bool AMDGPUCallLowering::lowerFormalArguments( CCInfo.AllocateReg(ImplicitBufferPtrReg); } + // FIXME: This probably isn't defined for mesa + if (Info->hasFlatScratchInit() && !Subtarget.isAmdPalOS()) { + Register FlatScratchInitReg = Info->addFlatScratchInit(*TRI); + MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass); + CCInfo.AllocateReg(FlatScratchInitReg); + } + SmallVector<ArgInfo, 32> SplitArgs; unsigned Idx = 0; unsigned PSInputNum = 0; @@ -879,13 +898,17 @@ bool AMDGPUCallLowering::passSpecialInputs(MachineIRBuilder &MIRBuilder, Register InputReg; if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX && NeedWorkItemIDX) { - InputReg = MRI.createGenericVirtualRegister(S32); - LI->loadInputValue(InputReg, MIRBuilder, IncomingArgX, - std::get<1>(WorkitemIDX), std::get<2>(WorkitemIDX)); + if (ST.getMaxWorkitemID(MF.getFunction(), 0) != 0) { + InputReg = MRI.createGenericVirtualRegister(S32); + LI->loadInputValue(InputReg, MIRBuilder, IncomingArgX, + std::get<1>(WorkitemIDX), std::get<2>(WorkitemIDX)); + } else { + InputReg = MIRBuilder.buildConstant(S32, 0).getReg(0); + } } if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY && - NeedWorkItemIDY) { + NeedWorkItemIDY && ST.getMaxWorkitemID(MF.getFunction(), 1) != 0) { Register Y = MRI.createGenericVirtualRegister(S32); LI->loadInputValue(Y, MIRBuilder, IncomingArgY, std::get<1>(WorkitemIDY), std::get<2>(WorkitemIDY)); @@ -895,7 +918,7 @@ bool AMDGPUCallLowering::passSpecialInputs(MachineIRBuilder &MIRBuilder, } if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ && - NeedWorkItemIDZ) { + NeedWorkItemIDZ && ST.getMaxWorkitemID(MF.getFunction(), 2) != 0) { Register Z = MRI.createGenericVirtualRegister(S32); LI->loadInputValue(Z, MIRBuilder, IncomingArgZ, std::get<1>(WorkitemIDZ), std::get<2>(WorkitemIDZ)); @@ -904,16 +927,24 @@ bool AMDGPUCallLowering::passSpecialInputs(MachineIRBuilder &MIRBuilder, InputReg = InputReg ? MIRBuilder.buildOr(S32, InputReg, Z).getReg(0) : Z; } - if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) { + if (!InputReg && + (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) { InputReg = MRI.createGenericVirtualRegister(S32); - - // Workitem ids are already packed, any of present incoming arguments will - // carry all required fields. - ArgDescriptor IncomingArg = ArgDescriptor::createArg( - IncomingArgX ? *IncomingArgX : + if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) { + // We're in a situation where the outgoing function requires the workitem + // ID, but the calling function does not have it (e.g a graphics function + // calling a C calling convention function). This is illegal, but we need + // to produce something. + MIRBuilder.buildUndef(InputReg); + } else { + // Workitem ids are already packed, any of present incoming arguments will + // carry all required fields. + ArgDescriptor IncomingArg = ArgDescriptor::createArg( + IncomingArgX ? *IncomingArgX : IncomingArgY ? 
*IncomingArgY : *IncomingArgZ, ~0u); - LI->loadInputValue(InputReg, MIRBuilder, &IncomingArg, - &AMDGPU::VGPR_32RegClass, S32); + LI->loadInputValue(InputReg, MIRBuilder, &IncomingArg, + &AMDGPU::VGPR_32RegClass, S32); + } } if (OutgoingArg->isRegister()) { @@ -1314,6 +1345,7 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, return false; } + Info.IsTailCall = CanTailCallOpt; if (CanTailCallOpt) return lowerTailCall(MIRBuilder, Info, OutArgs); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp index a55729586b8d..1920684d8f1f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -150,13 +150,13 @@ class AMDGPUCodeGenPrepare : public FunctionPass, /// \returns The minimum number of bits needed to store the value of \Op as an /// unsigned integer. Truncating to this size and then zero-extending to - /// ScalarSize will not change the value. - unsigned numBitsUnsigned(Value *Op, unsigned ScalarSize) const; + /// the original will not change the value. + unsigned numBitsUnsigned(Value *Op) const; /// \returns The minimum number of bits needed to store the value of \Op as a /// signed integer. Truncating to this size and then sign-extending to - /// ScalarSize will not change the value. - unsigned numBitsSigned(Value *Op, unsigned ScalarSize) const; + /// the original size will not change the value. + unsigned numBitsSigned(Value *Op) const; /// Replace mul instructions with llvm.amdgcn.mul.u24 or llvm.amdgcn.mul.s24. /// SelectionDAG has an issue where an and asserting the bits are known @@ -445,17 +445,12 @@ bool AMDGPUCodeGenPrepare::promoteUniformBitreverseToI32( return true; } -unsigned AMDGPUCodeGenPrepare::numBitsUnsigned(Value *Op, - unsigned ScalarSize) const { - KnownBits Known = computeKnownBits(Op, *DL, 0, AC); - return ScalarSize - Known.countMinLeadingZeros(); +unsigned AMDGPUCodeGenPrepare::numBitsUnsigned(Value *Op) const { + return computeKnownBits(Op, *DL, 0, AC).countMaxActiveBits(); } -unsigned AMDGPUCodeGenPrepare::numBitsSigned(Value *Op, - unsigned ScalarSize) const { - // In order for this to be a signed 24-bit value, bit 23, must - // be a sign bit. 
- return ScalarSize - ComputeNumSignBits(Op, *DL, 0, AC) + 1; +unsigned AMDGPUCodeGenPrepare::numBitsSigned(Value *Op) const { + return ComputeMaxSignificantBits(Op, *DL, 0, AC); } static void extractValues(IRBuilder<> &Builder, @@ -532,12 +527,12 @@ bool AMDGPUCodeGenPrepare::replaceMulWithMul24(BinaryOperator &I) const { unsigned LHSBits = 0, RHSBits = 0; bool IsSigned = false; - if (ST->hasMulU24() && (LHSBits = numBitsUnsigned(LHS, Size)) <= 24 && - (RHSBits = numBitsUnsigned(RHS, Size)) <= 24) { + if (ST->hasMulU24() && (LHSBits = numBitsUnsigned(LHS)) <= 24 && + (RHSBits = numBitsUnsigned(RHS)) <= 24) { IsSigned = false; - } else if (ST->hasMulI24() && (LHSBits = numBitsSigned(LHS, Size)) <= 24 && - (RHSBits = numBitsSigned(RHS, Size)) <= 24) { + } else if (ST->hasMulI24() && (LHSBits = numBitsSigned(LHS)) <= 24 && + (RHSBits = numBitsSigned(RHS)) <= 24) { IsSigned = true; } else diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp index 699c6c479455..3ac7c45b3275 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp @@ -331,8 +331,7 @@ void MetadataStreamerV2::emitKernelArg(const Argument &Arg) { if (auto PtrTy = dyn_cast<PointerType>(Arg.getType())) { if (PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { // FIXME: Should report this for all address spaces - PointeeAlign = DL.getValueOrABITypeAlignment(Arg.getParamAlign(), - PtrTy->getElementType()); + PointeeAlign = Arg.getParamAlign().valueOrOne(); } } @@ -731,10 +730,8 @@ void MetadataStreamerV3::emitKernelArg(const Argument &Arg, unsigned &Offset, // FIXME: Need to distinguish in memory alignment from pointer alignment. if (auto PtrTy = dyn_cast<PointerType>(Ty)) { - if (PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { - PointeeAlign = DL.getValueOrABITypeAlignment(Arg.getParamAlign(), - PtrTy->getElementType()); - } + if (PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) + PointeeAlign = Arg.getParamAlign().valueOrOne(); } // There's no distinction between byval aggregates and raw aggregates. diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 54177564afbc..b9d0655feef7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -51,7 +51,7 @@ unsigned AMDGPUTargetLowering::numBitsUnsigned(SDValue Op, SelectionDAG &DAG) { unsigned AMDGPUTargetLowering::numBitsSigned(SDValue Op, SelectionDAG &DAG) { // In order for this to be a signed 24-bit value, bit 23, must // be a sign bit. 
- return DAG.ComputeMinSignedBits(Op); + return DAG.ComputeMaxSignificantBits(Op); } AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, @@ -360,6 +360,8 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f32, Custom); setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f16, Custom); setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i16, Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f16, Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i16, Custom); setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f32, Custom); setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i32, Custom); setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v3f32, Custom); @@ -1408,6 +1410,11 @@ SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, Start != 1) return Op; + if (((SrcVT == MVT::v8f16 && VT == MVT::v4f16) || + (SrcVT == MVT::v8i16 && VT == MVT::v4i16)) && + (Start == 0 || Start == 4)) + return Op; + DAG.ExtractVectorElements(Op.getOperand(0), Args, Start, VT.getVectorNumElements()); @@ -4626,11 +4633,12 @@ void AMDGPUTargetLowering::computeKnownBitsForTargetNode( RHSKnown = RHSKnown.trunc(24); if (Opc == AMDGPUISD::MUL_I24) { - unsigned LHSValBits = 24 - LHSKnown.countMinSignBits(); - unsigned RHSValBits = 24 - RHSKnown.countMinSignBits(); - unsigned MaxValBits = std::min(LHSValBits + RHSValBits, 32u); - if (MaxValBits >= 32) + unsigned LHSValBits = LHSKnown.countMaxSignificantBits(); + unsigned RHSValBits = RHSKnown.countMaxSignificantBits(); + unsigned MaxValBits = LHSValBits + RHSValBits; + if (MaxValBits > 32) break; + unsigned SignBits = 32 - MaxValBits + 1; bool LHSNegative = LHSKnown.isNegative(); bool LHSNonNegative = LHSKnown.isNonNegative(); bool LHSPositive = LHSKnown.isStrictlyPositive(); @@ -4639,16 +4647,16 @@ void AMDGPUTargetLowering::computeKnownBitsForTargetNode( bool RHSPositive = RHSKnown.isStrictlyPositive(); if ((LHSNonNegative && RHSNonNegative) || (LHSNegative && RHSNegative)) - Known.Zero.setHighBits(32 - MaxValBits); + Known.Zero.setHighBits(SignBits); else if ((LHSNegative && RHSPositive) || (LHSPositive && RHSNegative)) - Known.One.setHighBits(32 - MaxValBits); + Known.One.setHighBits(SignBits); } else { - unsigned LHSValBits = 24 - LHSKnown.countMinLeadingZeros(); - unsigned RHSValBits = 24 - RHSKnown.countMinLeadingZeros(); - unsigned MaxValBits = std::min(LHSValBits + RHSValBits, 32u); + unsigned LHSValBits = LHSKnown.countMaxActiveBits(); + unsigned RHSValBits = RHSKnown.countMaxActiveBits(); + unsigned MaxValBits = LHSValBits + RHSValBits; if (MaxValBits >= 32) break; - Known.Zero.setHighBits(32 - MaxValBits); + Known.Zero.setBitsFrom(MaxValBits); } break; } @@ -4904,7 +4912,8 @@ AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { } } -bool AMDGPUTargetLowering::isConstantUnsignedBitfieldExtactLegal( +bool AMDGPUTargetLowering::isConstantUnsignedBitfieldExtractLegal( unsigned Opc, LLT Ty1, LLT Ty2) const { - return Ty1 == Ty2 && (Ty1 == LLT::scalar(32) || Ty1 == LLT::scalar(64)); + return (Ty1 == LLT::scalar(32) || Ty1 == LLT::scalar(64)) && + Ty2 == LLT::scalar(32); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h index daaca8737c5d..b41506157b68 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -335,8 +335,8 @@ public: AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override; - bool 
isConstantUnsignedBitfieldExtactLegal(unsigned Opc, LLT Ty1, - LLT Ty2) const override; + bool isConstantUnsignedBitfieldExtractLegal(unsigned Opc, LLT Ty1, + LLT Ty2) const override; }; namespace AMDGPUISD { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp index db84b8766924..4f1d700bcd84 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp @@ -58,24 +58,37 @@ static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1, // Check if a value can be converted to a 16-bit value without losing // precision. -static bool canSafelyConvertTo16Bit(Value &V) { +// The value is expected to be either a float (IsFloat = true) or an unsigned +// integer (IsFloat = false). +static bool canSafelyConvertTo16Bit(Value &V, bool IsFloat) { Type *VTy = V.getType(); if (VTy->isHalfTy() || VTy->isIntegerTy(16)) { // The value is already 16-bit, so we don't want to convert to 16-bit again! return false; } - if (ConstantFP *ConstFloat = dyn_cast<ConstantFP>(&V)) { - // We need to check that if we cast the index down to a half, we do not lose - // precision. - APFloat FloatValue(ConstFloat->getValueAPF()); - bool LosesInfo = true; - FloatValue.convert(APFloat::IEEEhalf(), APFloat::rmTowardZero, &LosesInfo); - return !LosesInfo; + if (IsFloat) { + if (ConstantFP *ConstFloat = dyn_cast<ConstantFP>(&V)) { + // We need to check that if we cast the index down to a half, we do not + // lose precision. + APFloat FloatValue(ConstFloat->getValueAPF()); + bool LosesInfo = true; + FloatValue.convert(APFloat::IEEEhalf(), APFloat::rmTowardZero, + &LosesInfo); + return !LosesInfo; + } + } else { + if (ConstantInt *ConstInt = dyn_cast<ConstantInt>(&V)) { + // We need to check that if we cast the index down to an i16, we do not + // lose precision. + APInt IntValue(ConstInt->getValue()); + return IntValue.getActiveBits() <= 16; + } } + Value *CastSrc; - if (match(&V, m_FPExt(PatternMatch::m_Value(CastSrc))) || - match(&V, m_SExt(PatternMatch::m_Value(CastSrc))) || - match(&V, m_ZExt(PatternMatch::m_Value(CastSrc)))) { + bool IsExt = IsFloat ? match(&V, m_FPExt(PatternMatch::m_Value(CastSrc))) + : match(&V, m_ZExt(PatternMatch::m_Value(CastSrc))); + if (IsExt) { Type *CastSrcTy = CastSrc->getType(); if (CastSrcTy->isHalfTy() || CastSrcTy->isIntegerTy(16)) return true; @@ -97,13 +110,116 @@ static Value *convertTo16Bit(Value &V, InstCombiner::BuilderTy &Builder) { llvm_unreachable("Should never be called!"); } +/// Applies Function(II.Args, II.ArgTys) and replaces the intrinsic call with +/// the modified arguments. 
+static Optional<Instruction *> modifyIntrinsicCall( + IntrinsicInst &II, unsigned NewIntr, InstCombiner &IC, + std::function<void(SmallVectorImpl<Value *> &, SmallVectorImpl<Type *> &)> + Func) { + SmallVector<Type *, 4> ArgTys; + if (!Intrinsic::getIntrinsicSignature(II.getCalledFunction(), ArgTys)) + return None; + + SmallVector<Value *, 8> Args(II.args()); + + // Modify arguments and types + Func(Args, ArgTys); + + Function *I = Intrinsic::getDeclaration(II.getModule(), NewIntr, ArgTys); + + CallInst *NewCall = IC.Builder.CreateCall(I, Args); + NewCall->takeName(&II); + NewCall->copyMetadata(II); + if (isa<FPMathOperator>(NewCall)) + NewCall->copyFastMathFlags(&II); + + // Erase and replace uses + if (!II.getType()->isVoidTy()) + IC.replaceInstUsesWith(II, NewCall); + return IC.eraseInstFromFunction(II); +} + static Optional<Instruction *> simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST, const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr, IntrinsicInst &II, InstCombiner &IC) { + // Optimize _L to _LZ when _L is zero + if (const auto *LZMappingInfo = + AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) { + if (auto *ConstantLod = + dyn_cast<ConstantFP>(II.getOperand(ImageDimIntr->LodIndex))) { + if (ConstantLod->isZero() || ConstantLod->isNegative()) { + const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr = + AMDGPU::getImageDimIntrinsicByBaseOpcode(LZMappingInfo->LZ, + ImageDimIntr->Dim); + return modifyIntrinsicCall( + II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) { + Args.erase(Args.begin() + ImageDimIntr->LodIndex); + }); + } + } + } + + // Optimize _mip away, when 'lod' is zero + if (const auto *MIPMappingInfo = + AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) { + if (auto *ConstantMip = + dyn_cast<ConstantInt>(II.getOperand(ImageDimIntr->MipIndex))) { + if (ConstantMip->isZero()) { + const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr = + AMDGPU::getImageDimIntrinsicByBaseOpcode(MIPMappingInfo->NONMIP, + ImageDimIntr->Dim); + return modifyIntrinsicCall( + II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) { + Args.erase(Args.begin() + ImageDimIntr->MipIndex); + }); + } + } + } + + // Optimize _bias away when 'bias' is zero + if (const auto *BiasMappingInfo = + AMDGPU::getMIMGBiasMappingInfo(ImageDimIntr->BaseOpcode)) { + if (auto *ConstantBias = + dyn_cast<ConstantFP>(II.getOperand(ImageDimIntr->BiasIndex))) { + if (ConstantBias->isZero()) { + const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr = + AMDGPU::getImageDimIntrinsicByBaseOpcode(BiasMappingInfo->NoBias, + ImageDimIntr->Dim); + return modifyIntrinsicCall( + II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) { + Args.erase(Args.begin() + ImageDimIntr->BiasIndex); + ArgTys.erase(ArgTys.begin() + ImageDimIntr->BiasTyArg); + }); + } + } + } + + // Optimize _offset away when 'offset' is zero + if (const auto *OffsetMappingInfo = + AMDGPU::getMIMGOffsetMappingInfo(ImageDimIntr->BaseOpcode)) { + if (auto *ConstantOffset = + dyn_cast<ConstantInt>(II.getOperand(ImageDimIntr->OffsetIndex))) { + if (ConstantOffset->isZero()) { + const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr = + AMDGPU::getImageDimIntrinsicByBaseOpcode( + OffsetMappingInfo->NoOffset, ImageDimIntr->Dim); + return modifyIntrinsicCall( + II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) { + Args.erase(Args.begin() + ImageDimIntr->OffsetIndex); + }); + } + } + } + + // Try to use A16 or G16 if (!ST->hasA16() && !ST->hasG16()) return None; + // Address is interpreted as float if the instruction has 
a sampler or as + // unsigned int if there is no sampler. + bool HasSampler = + AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode)->Sampler; bool FloatCoord = false; // true means derivatives can be converted to 16 bit, coordinates not bool OnlyDerivatives = false; @@ -112,7 +228,7 @@ simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST, OperandIndex < ImageDimIntr->VAddrEnd; OperandIndex++) { Value *Coord = II.getOperand(OperandIndex); // If the values are not derived from 16-bit values, we cannot optimize. - if (!canSafelyConvertTo16Bit(*Coord)) { + if (!canSafelyConvertTo16Bit(*Coord, HasSampler)) { if (OperandIndex < ImageDimIntr->CoordStart || ImageDimIntr->GradientStart == ImageDimIntr->CoordStart) { return None; @@ -127,43 +243,50 @@ simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST, FloatCoord = Coord->getType()->isFloatingPointTy(); } - if (OnlyDerivatives) { - if (!ST->hasG16()) - return None; - } else { - if (!ST->hasA16()) - OnlyDerivatives = true; // Only supports G16 + if (!OnlyDerivatives && !ST->hasA16()) + OnlyDerivatives = true; // Only supports G16 + + // Check if there is a bias parameter and if it can be converted to f16 + if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) { + Value *Bias = II.getOperand(ImageDimIntr->BiasIndex); + assert(HasSampler && + "Only image instructions with a sampler can have a bias"); + if (!canSafelyConvertTo16Bit(*Bias, HasSampler)) + OnlyDerivatives = true; } + if (OnlyDerivatives && (!ST->hasG16() || ImageDimIntr->GradientStart == + ImageDimIntr->CoordStart)) + return None; + Type *CoordType = FloatCoord ? Type::getHalfTy(II.getContext()) : Type::getInt16Ty(II.getContext()); - SmallVector<Type *, 4> ArgTys; - if (!Intrinsic::getIntrinsicSignature(II.getCalledFunction(), ArgTys)) - return None; - - ArgTys[ImageDimIntr->GradientTyArg] = CoordType; - if (!OnlyDerivatives) - ArgTys[ImageDimIntr->CoordTyArg] = CoordType; - Function *I = - Intrinsic::getDeclaration(II.getModule(), II.getIntrinsicID(), ArgTys); + return modifyIntrinsicCall( + II, II.getIntrinsicID(), IC, [&](auto &Args, auto &ArgTys) { + ArgTys[ImageDimIntr->GradientTyArg] = CoordType; + if (!OnlyDerivatives) { + ArgTys[ImageDimIntr->CoordTyArg] = CoordType; - SmallVector<Value *, 8> Args(II.args()); + // Change the bias type + if (ImageDimIntr->NumBiasArgs != 0) + ArgTys[ImageDimIntr->BiasTyArg] = Type::getHalfTy(II.getContext()); + } - unsigned EndIndex = - OnlyDerivatives ? ImageDimIntr->CoordStart : ImageDimIntr->VAddrEnd; - for (unsigned OperandIndex = ImageDimIntr->GradientStart; - OperandIndex < EndIndex; OperandIndex++) { - Args[OperandIndex] = - convertTo16Bit(*II.getOperand(OperandIndex), IC.Builder); - } + unsigned EndIndex = + OnlyDerivatives ? 
ImageDimIntr->CoordStart : ImageDimIntr->VAddrEnd; + for (unsigned OperandIndex = ImageDimIntr->GradientStart; + OperandIndex < EndIndex; OperandIndex++) { + Args[OperandIndex] = + convertTo16Bit(*II.getOperand(OperandIndex), IC.Builder); + } - CallInst *NewCall = IC.Builder.CreateCall(I, Args); - NewCall->takeName(&II); - NewCall->copyMetadata(II); - if (isa<FPMathOperator>(NewCall)) - NewCall->copyFastMathFlags(&II); - return IC.replaceInstUsesWith(II, NewCall); + // Convert the bias + if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) { + Value *Bias = II.getOperand(ImageDimIntr->BiasIndex); + Args[ImageDimIntr->BiasIndex] = convertTo16Bit(*Bias, IC.Builder); + } + }); } bool GCNTTIImpl::canSimplifyLegacyMulToMul(const Value *Op0, const Value *Op1, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h index b1263618c5db..e7ee36447682 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h @@ -20,9 +20,6 @@ namespace llvm { class GCNSubtarget; -class MachineFunction; -class MachineInstr; -class MachineInstrBuilder; class MachineMemOperand; class AMDGPUInstrInfo { @@ -52,6 +49,9 @@ struct ImageDimIntrinsicInfo { unsigned BaseOpcode; MIMGDim Dim; + uint8_t NumOffsetArgs; + uint8_t NumBiasArgs; + uint8_t NumZCompareArgs; uint8_t NumGradients; uint8_t NumDmask; uint8_t NumData; @@ -60,6 +60,9 @@ struct ImageDimIntrinsicInfo { uint8_t DMaskIndex; uint8_t VAddrStart; + uint8_t OffsetIndex; + uint8_t BiasIndex; + uint8_t ZCompareIndex; uint8_t GradientStart; uint8_t CoordStart; uint8_t LodIndex; @@ -71,6 +74,7 @@ struct ImageDimIntrinsicInfo { uint8_t TexFailCtrlIndex; uint8_t CachePolicyIndex; + uint8_t BiasTyArg; uint8_t GradientTyArg; uint8_t CoordTyArg; }; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index e16bead81b65..b7d0f0580cda 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -46,8 +46,7 @@ static cl::opt<bool> AllowRiskySelect( AMDGPUInstructionSelector::AMDGPUInstructionSelector( const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI, const AMDGPUTargetMachine &TM) - : InstructionSelector(), TII(*STI.getInstrInfo()), - TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM), + : TII(*STI.getInstrInfo()), TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM), STI(STI), EnableLateStructurizeCFG(AMDGPUTargetMachine::EnableLateStructurizeCFG), #define GET_GLOBALISEL_PREDICATES_INIT @@ -1103,7 +1102,18 @@ bool AMDGPUInstructionSelector::selectIntrinsicIcmp(MachineInstr &I) const { const DebugLoc &DL = I.getDebugLoc(); Register SrcReg = I.getOperand(2).getReg(); unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI); + auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(4).getImm()); + if (!ICmpInst::isIntPredicate(static_cast<ICmpInst::Predicate>(Pred))) { + MachineInstr *ICmp = + BuildMI(*BB, &I, DL, TII.get(AMDGPU::IMPLICIT_DEF), Dst); + + if (!RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(), + *TRI.getBoolRC(), *MRI)) + return false; + I.eraseFromParent(); + return true; + } int Opcode = getV_CMPOpcode(Pred, Size); if (Opcode == -1) @@ -1234,7 +1244,7 @@ bool AMDGPUInstructionSelector::selectReturnAddress(MachineInstr &I) const { // Get the return address reg and mark it as an implicit live-in Register ReturnAddrReg = TRI.getReturnAddressReg(MF); Register LiveIn = getFunctionLiveInPhysReg(MF, TII, ReturnAddrReg, - AMDGPU::SReg_64RegClass); + 
AMDGPU::SReg_64RegClass, DL); BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), DstReg) .addReg(LiveIn); I.eraseFromParent(); @@ -1494,9 +1504,9 @@ static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE, if (TexFailCtrl) IsTexFail = true; - TFE = (TexFailCtrl & 0x1) ? 1 : 0; + TFE = (TexFailCtrl & 0x1) ? true : false; TexFailCtrl &= ~(uint64_t)0x1; - LWE = (TexFailCtrl & 0x2) ? 1 : 0; + LWE = (TexFailCtrl & 0x2) ? true : false; TexFailCtrl &= ~(uint64_t)0x2; return TexFailCtrl == 0; @@ -1511,10 +1521,6 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic( AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim); - const AMDGPU::MIMGLZMappingInfo *LZMappingInfo = - AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode); - const AMDGPU::MIMGMIPMappingInfo *MIPMappingInfo = - AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode); unsigned IntrOpcode = Intr->BaseOpcode; const bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI); @@ -1523,7 +1529,8 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic( Register VDataIn, VDataOut; LLT VDataTy; int NumVDataDwords = -1; - bool IsD16 = false; + bool IsD16 = MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 || + MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16; bool Unorm; if (!BaseOpcode->Sampler) @@ -1572,16 +1579,6 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic( DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm(); DMaskLanes = BaseOpcode->Gather4 ? 4 : countPopulation(DMask); - // One memoperand is mandatory, except for getresinfo. - // FIXME: Check this in verifier. - if (!MI.memoperands_empty()) { - const MachineMemOperand *MMO = *MI.memoperands_begin(); - - // Infer d16 from the memory size, as the register type will be mangled by - // unpacked subtargets, or by TFE. - IsD16 = ((8 * MMO->getSize()) / DMaskLanes) < 32; - } - if (BaseOpcode->Store) { VDataIn = MI.getOperand(1).getReg(); VDataTy = MRI->getType(VDataIn); @@ -1596,26 +1593,6 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic( } } - // Optimize _L to _LZ when _L is zero - if (LZMappingInfo) { - // The legalizer replaced the register with an immediate 0 if we need to - // change the opcode. - const MachineOperand &Lod = MI.getOperand(ArgOffset + Intr->LodIndex); - if (Lod.isImm()) { - assert(Lod.getImm() == 0); - IntrOpcode = LZMappingInfo->LZ; // set new opcode to _lz variant of _l - } - } - - // Optimize _mip away, when 'lod' is zero - if (MIPMappingInfo) { - const MachineOperand &Lod = MI.getOperand(ArgOffset + Intr->MipIndex); - if (Lod.isImm()) { - assert(Lod.getImm() == 0); - IntrOpcode = MIPMappingInfo->NONMIP; // set new opcode to variant without _mip - } - } - // Set G16 opcode if (IsG16 && !IsA16) { const AMDGPU::MIMGG16MappingInfo *G16MappingInfo = @@ -2562,6 +2539,8 @@ bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const { Register MaskReg = I.getOperand(2).getReg(); LLT Ty = MRI->getType(DstReg); LLT MaskTy = MRI->getType(MaskReg); + MachineBasicBlock *BB = I.getParent(); + const DebugLoc &DL = I.getDebugLoc(); const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI); @@ -2570,6 +2549,24 @@ bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const { if (DstRB != SrcRB) // Should only happen for hand written MIR. return false; + // Try to avoid emitting a bit operation when we only need to touch half of + // the 64-bit pointer. 
+ APInt MaskOnes = KnownBits->getKnownOnes(MaskReg).zextOrSelf(64); + const APInt MaskHi32 = APInt::getHighBitsSet(64, 32); + const APInt MaskLo32 = APInt::getLowBitsSet(64, 32); + + const bool CanCopyLow32 = (MaskOnes & MaskLo32) == MaskLo32; + const bool CanCopyHi32 = (MaskOnes & MaskHi32) == MaskHi32; + + if (!IsVGPR && Ty.getSizeInBits() == 64 && + !CanCopyLow32 && !CanCopyHi32) { + auto MIB = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_AND_B64), DstReg) + .addReg(SrcReg) + .addReg(MaskReg); + I.eraseFromParent(); + return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); + } + unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32; const TargetRegisterClass &RegRC = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass; @@ -2586,8 +2583,6 @@ bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const { !RBI.constrainGenericRegister(MaskReg, *MaskRC, *MRI)) return false; - MachineBasicBlock *BB = I.getParent(); - const DebugLoc &DL = I.getDebugLoc(); if (Ty.getSizeInBits() == 32) { assert(MaskTy.getSizeInBits() == 32 && "ptrmask should have been narrowed during legalize"); @@ -2610,13 +2605,7 @@ bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const { Register MaskedLo, MaskedHi; - // Try to avoid emitting a bit operation when we only need to touch half of - // the 64-bit pointer. - APInt MaskOnes = KnownBits->getKnownOnes(MaskReg).zextOrSelf(64); - - const APInt MaskHi32 = APInt::getHighBitsSet(64, 32); - const APInt MaskLo32 = APInt::getLowBitsSet(64, 32); - if ((MaskOnes & MaskLo32) == MaskLo32) { + if (CanCopyLow32) { // If all the bits in the low half are 1, we only need a copy for it. MaskedLo = LoReg; } else { @@ -2631,7 +2620,7 @@ bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const { .addReg(MaskLo); } - if ((MaskOnes & MaskHi32) == MaskHi32) { + if (CanCopyHi32) { // If all the bits in the high half are 1, we only need a copy for it. MaskedHi = HiReg; } else { @@ -3123,6 +3112,33 @@ bool AMDGPUInstructionSelector::selectBVHIntrinsic(MachineInstr &MI) const{ return true; } +bool AMDGPUInstructionSelector::selectWaveAddress(MachineInstr &MI) const { + Register DstReg = MI.getOperand(0).getReg(); + Register SrcReg = MI.getOperand(1).getReg(); + const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); + const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID; + MachineBasicBlock *MBB = MI.getParent(); + const DebugLoc &DL = MI.getDebugLoc(); + + if (IsVALU) { + BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHRREV_B32_e64), DstReg) + .addImm(Subtarget->getWavefrontSizeLog2()) + .addReg(SrcReg); + } else { + BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), DstReg) + .addReg(SrcReg) + .addImm(Subtarget->getWavefrontSizeLog2()); + } + + const TargetRegisterClass &RC = + IsVALU ? 
AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass; + if (!RBI.constrainGenericRegister(DstReg, RC, *MRI)) + return false; + + MI.eraseFromParent(); + return true; +} + bool AMDGPUInstructionSelector::select(MachineInstr &I) { if (I.isPHI()) return selectPHI(I); @@ -3236,7 +3252,9 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) { case TargetOpcode::G_SHUFFLE_VECTOR: return selectG_SHUFFLE_VECTOR(I); case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD: - case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: { + case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16: + case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: + case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: { const AMDGPU::ImageDimIntrinsicInfo *Intr = AMDGPU::getImageDimIntrinsicInfo(I.getIntrinsicID()); assert(Intr && "not an image intrinsic with image pseudo"); @@ -3252,6 +3270,8 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) { case AMDGPU::G_SI_CALL: I.setDesc(TII.get(AMDGPU::SI_CALL)); return true; + case AMDGPU::G_AMDGPU_WAVE_ADDRESS: + return selectWaveAddress(I); default: return selectImpl(I, *CoverageInfo); } @@ -3896,20 +3916,59 @@ bool AMDGPUInstructionSelector::isUnneededShiftMask(const MachineInstr &MI, return (LHSKnownZeros | *RHS).countTrailingOnes() >= ShAmtBits; } +// Return the wave level SGPR base address if this is a wave address. +static Register getWaveAddress(const MachineInstr *Def) { + return Def->getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS + ? Def->getOperand(1).getReg() + : Register(); +} + InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectMUBUFScratchOffset( MachineOperand &Root) const { - MachineInstr *MI = Root.getParent(); - MachineBasicBlock *MBB = MI->getParent(); + Register Reg = Root.getReg(); + const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>(); + + const MachineInstr *Def = MRI->getVRegDef(Reg); + if (Register WaveBase = getWaveAddress(Def)) { + return {{ + [=](MachineInstrBuilder &MIB) { // rsrc + MIB.addReg(Info->getScratchRSrcReg()); + }, + [=](MachineInstrBuilder &MIB) { // soffset + MIB.addReg(WaveBase); + }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // offset + }}; + } int64_t Offset = 0; + + // FIXME: Copy check is a hack + Register BasePtr; + if (mi_match(Reg, *MRI, m_GPtrAdd(m_Reg(BasePtr), m_Copy(m_ICst(Offset))))) { + if (!SIInstrInfo::isLegalMUBUFImmOffset(Offset)) + return {}; + const MachineInstr *BasePtrDef = MRI->getVRegDef(BasePtr); + Register WaveBase = getWaveAddress(BasePtrDef); + if (!WaveBase) + return {}; + + return {{ + [=](MachineInstrBuilder &MIB) { // rsrc + MIB.addReg(Info->getScratchRSrcReg()); + }, + [=](MachineInstrBuilder &MIB) { // soffset + MIB.addReg(WaveBase); + }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset + }}; + } + if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) || !SIInstrInfo::isLegalMUBUFImmOffset(Offset)) return {}; - const MachineFunction *MF = MBB->getParent(); - const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>(); - return {{ [=](MachineInstrBuilder &MIB) { // rsrc MIB.addReg(Info->getScratchRSrcReg()); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index 26996e42af53..42095332d11a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -30,7 +30,6 @@ namespace AMDGPU { struct ImageDimIntrinsicInfo; } -class AMDGPUInstrInfo; class AMDGPURegisterBankInfo; class AMDGPUTargetMachine; class BlockFrequencyInfo; @@ -42,7 +41,6 @@ class 
MachineOperand; class MachineRegisterInfo; class RegisterBank; class SIInstrInfo; -class SIMachineFunctionInfo; class SIRegisterInfo; class TargetRegisterClass; @@ -147,6 +145,7 @@ private: bool selectGlobalAtomicFadd(MachineInstr &I, MachineOperand &AddrOp, MachineOperand &DataOp) const; bool selectBVHIntrinsic(MachineInstr &I) const; + bool selectWaveAddress(MachineInstr &I) const; std::pair<Register, unsigned> selectVOP3ModsImpl(MachineOperand &Root, bool AllowAbs = true) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td index 0528b552f475..7d3dbfd7e851 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -18,6 +18,7 @@ class AddressSpacesImpl { int Local = 3; int Constant = 4; int Private = 5; + int Constant32Bit = 6; } def AddrSpaces : AddressSpacesImpl; @@ -405,18 +406,23 @@ class Aligned<int Bytes> { int MinAlignment = Bytes; } -class StoreHi16<SDPatternOperator op> : PatFrag < +class StoreHi16<SDPatternOperator op, ValueType vt> : PatFrag < (ops node:$value, node:$ptr), (op (srl node:$value, (i32 16)), node:$ptr)> { let IsStore = 1; + let MemoryVT = vt; } -def LoadAddress_constant : AddressSpaceList<[ AddrSpaces.Constant ]>; -def LoadAddress_global : AddressSpaceList<[ AddrSpaces.Global, AddrSpaces.Constant ]>; +def LoadAddress_constant : AddressSpaceList<[ AddrSpaces.Constant, + AddrSpaces.Constant32Bit ]>; +def LoadAddress_global : AddressSpaceList<[ AddrSpaces.Global, + AddrSpaces.Constant, + AddrSpaces.Constant32Bit ]>; def StoreAddress_global : AddressSpaceList<[ AddrSpaces.Global ]>; -def LoadAddress_flat : AddressSpaceList<[ AddrSpaces.Flat, - AddrSpaces.Global, - AddrSpaces.Constant ]>; +def LoadAddress_flat : AddressSpaceList<[ AddrSpaces.Flat, + AddrSpaces.Global, + AddrSpaces.Constant, + AddrSpaces.Constant32Bit ]>; def StoreAddress_flat : AddressSpaceList<[ AddrSpaces.Flat, AddrSpaces.Global ]>; def LoadAddress_private : AddressSpaceList<[ AddrSpaces.Private ]>; @@ -522,9 +528,9 @@ def truncstorei16_#as : PatFrag<(ops node:$val, node:$ptr), let MemoryVT = i16; } -def store_hi16_#as : StoreHi16 <truncstorei16>; -def truncstorei8_hi16_#as : StoreHi16<truncstorei8>; -def truncstorei16_hi16_#as : StoreHi16<truncstorei16>; +def store_hi16_#as : StoreHi16 <truncstorei16, i16>; +def truncstorei8_hi16_#as : StoreHi16<truncstorei8, i8>; +def truncstorei16_hi16_#as : StoreHi16<truncstorei16, i16>; defm atomic_store_#as : binary_atomic_op<atomic_store>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 5046daaed977..04c6f67ed339 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -272,8 +272,8 @@ static bool isLoadStoreSizeLegal(const GCNSubtarget &ST, const bool IsLoad = Query.Opcode != AMDGPU::G_STORE; unsigned RegSize = Ty.getSizeInBits(); - unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); - unsigned AlignBits = Query.MMODescrs[0].AlignInBits; + uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); + uint64_t AlignBits = Query.MMODescrs[0].AlignInBits; unsigned AS = Query.Types[1].getAddressSpace(); // All of these need to be custom lowered to cast the pointer operand. @@ -380,7 +380,7 @@ static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty, /// access up to the alignment. Note this case when the memory access itself /// changes, not the size of the result register. 
static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy, - unsigned AlignInBits, unsigned AddrSpace, + uint64_t AlignInBits, unsigned AddrSpace, unsigned Opcode) { unsigned SizeInBits = MemoryTy.getSizeInBits(); // We don't want to widen cases that are naturally legal. @@ -929,10 +929,11 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, getActionDefinitionsBuilder(G_CTPOP) .legalFor({{S32, S32}, {S32, S64}}) .clampScalar(0, S32, S32) + .widenScalarToNextPow2(1, 32) .clampScalar(1, S32, S64) .scalarize(0) - .widenScalarToNextPow2(0, 32) - .widenScalarToNextPow2(1, 32); + .widenScalarToNextPow2(0, 32); + // The hardware instructions return a different result on 0 than the generic // instructions expect. The hardware produces -1, but these produce the @@ -1172,7 +1173,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, if (MemSize > MaxSize) return std::make_pair(0, LLT::scalar(MaxSize)); - unsigned Align = Query.MMODescrs[0].AlignInBits; + uint64_t Align = Query.MMODescrs[0].AlignInBits; return std::make_pair(0, LLT::scalar(Align)); }) .fewerElementsIf( @@ -1295,6 +1296,18 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, if (ST.hasAtomicFaddInsts()) Atomic.legalFor({{S32, GlobalPtr}}); + if (ST.hasGFX90AInsts()) { + // These are legal with some caveats, and should have undergone expansion in + // the IR in most situations + // TODO: Move atomic expansion into legalizer + // TODO: Also supports <2 x f16> + Atomic.legalFor({ + {S32, GlobalPtr}, + {S64, GlobalPtr}, + {S64, FlatPtr} + }); + } + // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output // demarshalling getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) @@ -1345,8 +1358,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, }, changeTo(1, S16)); Shifts.maxScalarIf(typeIs(0, S16), 1, S16); Shifts.clampScalar(1, S32, S32); - Shifts.clampScalar(0, S16, S64); Shifts.widenScalarToNextPow2(0, 16); + Shifts.clampScalar(0, S16, S64); getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) .minScalar(0, S16) @@ -1357,8 +1370,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, // expansion for the shifted type will produce much worse code if it hasn't // been truncated already. Shifts.clampScalar(1, S32, S32); - Shifts.clampScalar(0, S32, S64); Shifts.widenScalarToNextPow2(0, 32); + Shifts.clampScalar(0, S32, S64); getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) .minScalar(0, S32) @@ -1812,6 +1825,27 @@ Register AMDGPULegalizerInfo::getSegmentAperture( return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); } +/// Return true if the value is a known valid address, such that a null check is +/// not necessary. 
+static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, + const AMDGPUTargetMachine &TM, unsigned AddrSpace) { + MachineInstr *Def = MRI.getVRegDef(Val); + switch (Def->getOpcode()) { + case AMDGPU::G_FRAME_INDEX: + case AMDGPU::G_GLOBAL_VALUE: + case AMDGPU::G_BLOCK_ADDR: + return true; + case AMDGPU::G_CONSTANT: { + const ConstantInt *CI = Def->getOperand(1).getCImm(); + return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace); + } + default: + return false; + } + + return false; +} + bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { @@ -1862,6 +1896,14 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || DestAS == AMDGPUAS::PRIVATE_ADDRESS); + + if (isKnownNonNull(Src, MRI, TM, SrcAS)) { + // Extract low 32-bits of the pointer. + B.buildExtract(Dst, Src, 0); + MI.eraseFromParent(); + return true; + } + unsigned NullVal = TM.getNullPointerValue(DestAS); auto SegmentNull = B.buildConstant(DstTy, NullVal); @@ -1884,24 +1926,29 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( if (!ST.hasFlatAddressSpace()) return false; - auto SegmentNull = - B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); - auto FlatNull = - B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); - Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); if (!ApertureReg.isValid()) return false; - auto CmpRes = - B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0)); - // Coerce the type of the low half of the result so we can use merge_values. Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); // TODO: Should we allow mismatched types but matching sizes in merges to // avoid the ptrtoint? auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); + + if (isKnownNonNull(Src, MRI, TM, SrcAS)) { + B.buildCopy(Dst, BuildPtr); + MI.eraseFromParent(); + return true; + } + + auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); + auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); + + auto CmpRes = + B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0)); + B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); MI.eraseFromParent(); @@ -1959,6 +2006,7 @@ bool AMDGPULegalizerInfo::legalizeFceil( // TODO: Should this propagate fast-math-flags? 
B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); + MI.eraseFromParent(); return true; } @@ -2213,10 +2261,12 @@ bool AMDGPULegalizerInfo::legalizeExtractVectorElt( LLT EltTy = VecTy.getElementType(); assert(EltTy == MRI.getType(Dst)); - if (IdxVal < VecTy.getNumElements()) - B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits()); - else + if (IdxVal < VecTy.getNumElements()) { + auto Unmerge = B.buildUnmerge(EltTy, Vec); + B.buildCopy(Dst, Unmerge.getReg(IdxVal)); + } else { B.buildUndef(Dst); + } MI.eraseFromParent(); return true; @@ -2245,11 +2295,20 @@ bool AMDGPULegalizerInfo::legalizeInsertVectorElt( LLT VecTy = MRI.getType(Vec); LLT EltTy = VecTy.getElementType(); assert(EltTy == MRI.getType(Ins)); + (void)Ins; - if (IdxVal < VecTy.getNumElements()) - B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits()); - else + unsigned NumElts = VecTy.getNumElements(); + if (IdxVal < NumElts) { + SmallVector<Register, 8> SrcRegs; + for (unsigned i = 0; i < NumElts; ++i) + SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy)); + B.buildUnmerge(SrcRegs, Vec); + + SrcRegs[IdxVal] = MI.getOperand(2).getReg(); + B.buildMerge(Dst, SrcRegs); + } else { B.buildUndef(Dst); + } MI.eraseFromParent(); return true; @@ -2502,7 +2561,7 @@ bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper, const LLT MemTy = MMO->getMemoryType(); const Align MemAlign = MMO->getAlign(); const unsigned MemSize = MemTy.getSizeInBits(); - const unsigned AlignInBits = 8 * MemAlign.value(); + const uint64_t AlignInBits = 8 * MemAlign.value(); // Widen non-power-of-2 loads to the alignment if needed if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) { @@ -2832,8 +2891,8 @@ bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected"); assert(DstReg.isVirtual() && "Virtual register expected"); - Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC, - ArgTy); + Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, + *ArgRC, B.getDebugLoc(), ArgTy); if (Arg->isMasked()) { // TODO: Should we try to emit this once in the entry block? const LLT S32 = LLT::scalar(32); @@ -2842,6 +2901,8 @@ bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, Register AndMaskSrc = LiveIn; + // TODO: Avoid clearing the high bits if we know workitem id y/z are always + // 0. if (Shift != 0) { auto ShiftAmt = B.buildConstant(S32, Shift); AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); @@ -4106,7 +4167,6 @@ static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; - case Intrinsic::amdgcn_buffer_atomic_fadd: case Intrinsic::amdgcn_raw_buffer_atomic_fadd: case Intrinsic::amdgcn_struct_buffer_atomic_fadd: return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD; @@ -4213,15 +4273,18 @@ static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, if ((I < Intr->GradientStart) || (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) || (I >= Intr->CoordStart && !IsA16)) { - // Handle any gradient or coordinate operands that should not be packed if ((I < Intr->GradientStart) && IsA16 && (B.getMRI()->getType(AddrReg) == S16)) { + assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument"); // Special handling of bias when A16 is on. 
Bias is of type half but // occupies full 32-bit. PackedAddrs.push_back( B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) .getReg(0)); } else { + assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) && + "Bias needs to be converted to 16 bit in A16 mode"); + // Handle any gradient or coordinate operands that should not be packed AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); PackedAddrs.push_back(AddrReg); } @@ -4320,6 +4383,8 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic( const LLT V2S16 = LLT::fixed_vector(2, 16); unsigned DMask = 0; + Register VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg(); + LLT Ty = MRI->getType(VData); // Check for 16 bit addresses and pack if true. LLT GradTy = @@ -4328,6 +4393,7 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic( MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg()); const bool IsG16 = GradTy == S16; const bool IsA16 = AddrTy == S16; + const bool IsD16 = Ty.getScalarType() == S16; int DMaskLanes = 0; if (!BaseOpcode->Atomic) { @@ -4347,8 +4413,11 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic( Observer.changingInstr(MI); auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); }); - unsigned NewOpcode = NumDefs == 0 ? - AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD; + const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16 + : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE; + const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 + : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD; + unsigned NewOpcode = NumDefs == 0 ? StoreOpcode : LoadOpcode; // Track that we legalized this MI.setDesc(B.getTII().get(NewOpcode)); @@ -4381,44 +4450,6 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic( unsigned CorrectedNumVAddrs = Intr->NumVAddrs; - // Optimize _L to _LZ when _L is zero - if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo = - AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) { - const ConstantFP *ConstantLod; - - if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI, - m_GFCst(ConstantLod))) { - if (ConstantLod->isZero() || ConstantLod->isNegative()) { - // Set new opcode to _lz variant of _l, and change the intrinsic ID. - const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr = - AMDGPU::getImageDimIntrinsicByBaseOpcode(LZMappingInfo->LZ, - Intr->Dim); - - // The starting indexes should remain in the same place. - --CorrectedNumVAddrs; - - MI.getOperand(MI.getNumExplicitDefs()) - .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr)); - MI.RemoveOperand(ArgOffset + Intr->LodIndex); - Intr = NewImageDimIntr; - } - } - } - - // Optimize _mip away, when 'lod' is zero - if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) { - int64_t ConstantLod; - if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI, - m_ICst(ConstantLod))) { - if (ConstantLod == 0) { - // TODO: Change intrinsic opcode and remove operand instead or replacing - // it with 0, as the _L to _LZ handling is done above. - MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0); - --CorrectedNumVAddrs; - } - } - } - // Rewrite the addressing register layout before doing anything else. if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) { // 16 bit gradients are supported, but are tied to the A16 control @@ -4494,9 +4525,7 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic( if (BaseOpcode->Store) { // No TFE for stores? 
// TODO: Handle dmask trim - Register VData = MI.getOperand(1).getReg(); - LLT Ty = MRI->getType(VData); - if (!Ty.isVector() || Ty.getElementType() != S16) + if (!Ty.isVector() || !IsD16) return true; Register RepackedReg = handleD16VData(B, *MRI, VData, true); @@ -4508,9 +4537,7 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic( } Register DstReg = MI.getOperand(0).getReg(); - LLT Ty = MRI->getType(DstReg); const LLT EltTy = Ty.getScalarType(); - const bool IsD16 = Ty.getScalarType() == S16; const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1; // Confirm that the return type is large enough for the dmask specified @@ -4918,6 +4945,12 @@ bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI, return true; } +static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI, int64_t C) { + B.buildConstant(MI.getOperand(0).getReg(), C); + MI.eraseFromParent(); + return true; +} + bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, MachineInstr &MI) const { MachineIRBuilder &B = Helper.MIRBuilder; @@ -5021,12 +5054,20 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, case Intrinsic::amdgcn_implicitarg_ptr: return legalizeImplicitArgPtr(MI, MRI, B); case Intrinsic::amdgcn_workitem_id_x: + if (ST.getMaxWorkitemID(B.getMF().getFunction(), 0) == 0) + return replaceWithConstant(B, MI, 0); return legalizePreloadedArgIntrin(MI, MRI, B, AMDGPUFunctionArgInfo::WORKITEM_ID_X); case Intrinsic::amdgcn_workitem_id_y: + if (ST.getMaxWorkitemID(B.getMF().getFunction(), 1) == 0) + return replaceWithConstant(B, MI, 0); + return legalizePreloadedArgIntrin(MI, MRI, B, AMDGPUFunctionArgInfo::WORKITEM_ID_Y); case Intrinsic::amdgcn_workitem_id_z: + if (ST.getMaxWorkitemID(B.getMF().getFunction(), 2) == 0) + return replaceWithConstant(B, MI, 0); + return legalizePreloadedArgIntrin(MI, MRI, B, AMDGPUFunctionArgInfo::WORKITEM_ID_Z); case Intrinsic::amdgcn_workgroup_id_x: @@ -5105,16 +5146,29 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, case Intrinsic::amdgcn_struct_buffer_atomic_inc: case Intrinsic::amdgcn_raw_buffer_atomic_dec: case Intrinsic::amdgcn_struct_buffer_atomic_dec: - case Intrinsic::amdgcn_raw_buffer_atomic_fadd: - case Intrinsic::amdgcn_struct_buffer_atomic_fadd: case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: - case Intrinsic::amdgcn_buffer_atomic_fadd: case Intrinsic::amdgcn_raw_buffer_atomic_fmin: case Intrinsic::amdgcn_struct_buffer_atomic_fmin: case Intrinsic::amdgcn_raw_buffer_atomic_fmax: case Intrinsic::amdgcn_struct_buffer_atomic_fmax: return legalizeBufferAtomic(MI, B, IntrID); + case Intrinsic::amdgcn_raw_buffer_atomic_fadd: + case Intrinsic::amdgcn_struct_buffer_atomic_fadd: { + Register DstReg = MI.getOperand(0).getReg(); + if (!MRI.use_empty(DstReg) && !ST.hasGFX90AInsts()) { + Function &F = B.getMF().getFunction(); + DiagnosticInfoUnsupported NoFpRet( + F, "return versions of fp atomics not supported", B.getDebugLoc(), + DS_Error); + F.getContext().diagnose(NoFpRet); + B.buildUndef(DstReg); + MI.eraseFromParent(); + return true; + } + + return legalizeBufferAtomic(MI, B, IntrID); + } case Intrinsic::amdgcn_atomic_inc: return legalizeAtomicIncDec(MI, B, true); case Intrinsic::amdgcn_atomic_dec: diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h index 7faf0436f995..964a41d3d740 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h @@ -21,7 
+21,6 @@ namespace llvm { class GCNTargetMachine; -class LLVMContext; class GCNSubtarget; class MachineIRBuilder; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp index 49cf6db5197f..c28427758ac7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp @@ -58,9 +58,6 @@ private: // "FuncName" exists. It may create a new function prototype in pre-link mode. FunctionCallee getFunction(Module *M, const FuncInfo &fInfo); - // Replace a normal function with its native version. - bool replaceWithNative(CallInst *CI, const FuncInfo &FInfo); - bool parseFunctionName(const StringRef &FMangledName, FuncInfo &FInfo); bool TDOFold(CallInst *CI, const FuncInfo &FInfo); @@ -90,24 +87,6 @@ private: double& Res1, Constant *copr0, Constant *copr1, Constant *copr2); bool evaluateCall(CallInst *aCI, const FuncInfo &FInfo); - // exp - bool fold_exp(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo); - - // exp2 - bool fold_exp2(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo); - - // exp10 - bool fold_exp10(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo); - - // log - bool fold_log(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo); - - // log2 - bool fold_log2(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo); - - // log10 - bool fold_log10(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo); - // sqrt bool fold_sqrt(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo); @@ -623,7 +602,8 @@ bool AMDGPULibCalls::fold(CallInst *CI, AliasAnalysis *AA) { Function *Callee = CI->getCalledFunction(); // Ignore indirect calls. - if (Callee == 0) return false; + if (Callee == nullptr) + return false; BasicBlock *BB = CI->getParent(); LLVMContext &Context = CI->getParent()->getContext(); @@ -778,27 +758,6 @@ bool AMDGPULibCalls::TDOFold(CallInst *CI, const FuncInfo &FInfo) { return false; } -bool AMDGPULibCalls::replaceWithNative(CallInst *CI, const FuncInfo &FInfo) { - Module *M = CI->getModule(); - if (getArgType(FInfo) != AMDGPULibFunc::F32 || - FInfo.getPrefix() != AMDGPULibFunc::NOPFX || - !HasNative(FInfo.getId())) - return false; - - AMDGPULibFunc nf = FInfo; - nf.setPrefix(AMDGPULibFunc::NATIVE); - if (FunctionCallee FPExpr = getFunction(M, nf)) { - LLVM_DEBUG(dbgs() << "AMDIC: " << *CI << " ---> "); - - CI->setCalledFunction(FPExpr); - - LLVM_DEBUG(dbgs() << *CI << '\n'); - - return true; - } - return false; -} - // [native_]half_recip(c) ==> 1.0/c bool AMDGPULibCalls::fold_recip(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo) { @@ -1402,8 +1361,8 @@ AllocaInst* AMDGPULibCalls::insertAlloca(CallInst *UI, IRBuilder<> &B, Function *UCallee = UI->getCalledFunction(); Type *RetType = UCallee->getReturnType(); B.SetInsertPoint(&*ItNew); - AllocaInst *Alloc = B.CreateAlloca(RetType, 0, - std::string(prefix) + UI->getName()); + AllocaInst *Alloc = + B.CreateAlloca(RetType, nullptr, std::string(prefix) + UI->getName()); Alloc->setAlignment( Align(UCallee->getParent()->getDataLayout().getTypeAllocSize(RetType))); return Alloc; @@ -1724,7 +1683,8 @@ bool AMDGPUSimplifyLibCalls::runOnFunction(Function &F) { // Ignore indirect calls. Function *Callee = CI->getCalledFunction(); - if (Callee == 0) continue; + if (Callee == nullptr) + continue; LLVM_DEBUG(dbgs() << "AMDIC: try folding " << *CI << "\n"; dbgs().flush()); @@ -1757,7 +1717,7 @@ PreservedAnalyses AMDGPUSimplifyLibCallsPass::run(Function &F, // Ignore indirect calls. 
Function *Callee = CI->getCalledFunction(); - if (Callee == 0) + if (Callee == nullptr) continue; LLVM_DEBUG(dbgs() << "AMDIC: try folding " << *CI << "\n"; @@ -1783,9 +1743,10 @@ bool AMDGPUUseNativeCalls::runOnFunction(Function &F) { // Ignore indirect calls. Function *Callee = CI->getCalledFunction(); - if (Callee == 0) continue; + if (Callee == nullptr) + continue; - if(Simplifier.useNative(CI)) + if (Simplifier.useNative(CI)) Changed = true; } } @@ -1811,7 +1772,7 @@ PreservedAnalyses AMDGPUUseNativeCallsPass::run(Function &F, // Ignore indirect calls. Function *Callee = CI->getCalledFunction(); - if (Callee == 0) + if (Callee == nullptr) continue; if (Simplifier.useNative(CI)) diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibFunc.h b/llvm/lib/Target/AMDGPU/AMDGPULibFunc.h index c97223b047e8..dc0ac72016f3 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULibFunc.h +++ b/llvm/lib/Target/AMDGPU/AMDGPULibFunc.h @@ -10,6 +10,7 @@ #define _AMDGPU_LIBFUNC_H_ #include "llvm/ADT/StringRef.h" +#include <memory> namespace llvm { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp index 0c743a77092c..593388a4d819 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp @@ -15,9 +15,8 @@ using namespace llvm; AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) - : MachineFunctionInfo(), Mode(MF.getFunction()), - IsEntryFunction( - AMDGPU::isEntryFunctionCC(MF.getFunction().getCallingConv())), + : Mode(MF.getFunction()), IsEntryFunction(AMDGPU::isEntryFunctionCC( + MF.getFunction().getCallingConv())), IsModuleEntryFunction( AMDGPU::isModuleEntryFunctionCC(MF.getFunction().getCallingConv())), NoSignedZerosFPMath(MF.getTarget().Options.NoSignedZerosFPMath) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h index 10ff50040c6a..48cf46b5f871 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h @@ -15,8 +15,6 @@ namespace llvm { -class GCNSubtarget; - class AMDGPUMachineFunction : public MachineFunctionInfo { /// A map to keep track of local memory objects and their offsets within the /// local memory space. 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPTNote.h b/llvm/lib/Target/AMDGPU/AMDGPUPTNote.h index 8af7979dba8b..5cefc83e25e0 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPTNote.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUPTNote.h @@ -29,4 +29,4 @@ const char NoteNameV3[] = "AMDGPU"; } // End namespace ElfNote } // End namespace AMDGPU } // End namespace llvm -#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUNOTETYPE_H +#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUPTNOTE_H diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp index 7c4eb71882c7..f91f31508ad2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp @@ -463,7 +463,7 @@ bool AMDGPUPrintfRuntimeBindingImpl::lowerPrintfForGpu(Module &M) { WhatToStore.push_back(Arg); } } else if (isa<FixedVectorType>(ArgType)) { - Type *IType = NULL; + Type *IType = nullptr; uint32_t EleCount = cast<FixedVectorType>(ArgType)->getNumElements(); uint32_t EleSize = ArgType->getScalarSizeInBits(); uint32_t TotalSize = EleCount * EleSize; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index f9a9fe403ff6..2d8126a49327 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -789,6 +789,17 @@ bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) { Align Alignment = DL.getValueOrABITypeAlignment(GV->getAlign(), GV->getValueType()); uint64_t AllocSize = DL.getTypeAllocSize(GV->getValueType()); + + // HIP uses an extern unsized array in local address space for dynamically + // allocated shared memory. In that case, we have to disable the promotion. + if (GV->hasExternalLinkage() && AllocSize == 0) { + LocalMemLimit = 0; + LLVM_DEBUG(dbgs() << "Function has a reference to externally allocated " + "local memory. Promoting to local memory " + "disabled.\n"); + return false; + } + AllocatedSizes.emplace_back(AllocSize, Alignment); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp index 3ce67a733c10..0df6f4d45b06 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp @@ -36,6 +36,7 @@ protected: MachineIRBuilder &B; MachineFunction &MF; MachineRegisterInfo &MRI; + const GCNSubtarget &Subtarget; const RegisterBankInfo &RBI; const TargetRegisterInfo &TRI; const SIInstrInfo &TII; @@ -44,9 +45,9 @@ protected: public: AMDGPURegBankCombinerHelper(MachineIRBuilder &B, CombinerHelper &Helper) : B(B), MF(B.getMF()), MRI(*B.getMRI()), - RBI(*MF.getSubtarget().getRegBankInfo()), - TRI(*MF.getSubtarget().getRegisterInfo()), - TII(*MF.getSubtarget<GCNSubtarget>().getInstrInfo()), Helper(Helper){}; + Subtarget(MF.getSubtarget<GCNSubtarget>()), + RBI(*Subtarget.getRegBankInfo()), TRI(*Subtarget.getRegisterInfo()), + TII(*Subtarget.getInstrInfo()), Helper(Helper){}; bool isVgprRegBank(Register Reg); Register getAsVgpr(Register Reg); @@ -193,7 +194,10 @@ bool AMDGPURegBankCombinerHelper::matchFPMinMaxToMed3( MachineInstr &MI, Med3MatchInfo &MatchInfo) { Register Dst = MI.getOperand(0).getReg(); LLT Ty = MRI.getType(Dst); - if (Ty != LLT::scalar(16) && Ty != LLT::scalar(32)) + + // med3 for f16 is only available on gfx9+, and not available for v2f16. 
+ if ((Ty != LLT::scalar(16) || !Subtarget.hasMed3_16()) && + Ty != LLT::scalar(32)) return false; auto OpcodeTriple = getMinMaxPair(MI.getOpcode()); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index c60012bcfe2e..de2dccef804a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -718,8 +718,11 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop( const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass(); const unsigned WaveAndOpc = Subtarget.isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64; - const unsigned MovTermOpc = Subtarget.isWave32() ? - AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term; + const unsigned MovExecOpc = + Subtarget.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; + const unsigned MovExecTermOpc = + Subtarget.isWave32() ? AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term; + const unsigned XorTermOpc = Subtarget.isWave32() ? AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term; const unsigned AndSaveExecOpc = Subtarget.isWave32() ? @@ -996,12 +999,12 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop( B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB); // Save the EXEC mask before the loop. - BuildMI(MBB, MBB.end(), DL, TII->get(MovTermOpc), SaveExecReg) + BuildMI(MBB, MBB.end(), DL, TII->get(MovExecOpc), SaveExecReg) .addReg(ExecReg); // Restore the EXEC mask after the loop. B.setMBB(*RestoreExecBB); - B.buildInstr(MovTermOpc) + B.buildInstr(MovExecTermOpc) .addDef(ExecReg) .addReg(SaveExecReg); @@ -2953,7 +2956,9 @@ void AMDGPURegisterBankInfo::applyMappingImpl( break; } case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD: - case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: { + case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16: + case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: + case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: { const AMDGPU::RsrcIntrinsic *RSrcIntrin = AMDGPU::lookupRsrcIntrinsic(MI.getIntrinsicID()); assert(RSrcIntrin && RSrcIntrin->IsImage); @@ -3691,6 +3696,16 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, 32); break; } + case AMDGPU::G_AMDGPU_WAVE_ADDRESS: { + // This case is weird because we expect a physical register in the source, + // but need to set a bank anyway. + // + // We could select the result to SGPR or VGPR, but for the one current use + // it's more practical to always use VGPR. 
+ OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); + OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); + break; + } case AMDGPU::G_INSERT: { unsigned BankID = getMappingType(MRI, MI); unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); @@ -4078,7 +4093,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_mqsad_pk_u16_u8: case Intrinsic::amdgcn_mqsad_u32_u8: case Intrinsic::amdgcn_cvt_pk_u8_f32: - case Intrinsic::amdgcn_alignbit: case Intrinsic::amdgcn_alignbyte: case Intrinsic::amdgcn_perm: case Intrinsic::amdgcn_fdot2: @@ -4276,7 +4290,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { break; } case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD: - case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: { + case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16: + case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: + case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: { auto IntrID = MI.getIntrinsicID(); const AMDGPU::RsrcIntrinsic *RSrcIntrin = AMDGPU::lookupRsrcIntrinsic(IntrID); assert(RSrcIntrin && "missing RsrcIntrinsic for image intrinsic"); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp index 45f7c2f369bd..1c6c63dd5b25 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp @@ -353,7 +353,7 @@ bool AMDGPURewriteOutArguments::runOnFunction(Function &F) { // off any return attributes, e.g. zeroext doesn't make sense with a struct. NewFunc->stealArgumentListFrom(F); - AttrBuilder RetAttrs; + AttributeMask RetAttrs; RetAttrs.addAttribute(Attribute::SExt); RetAttrs.addAttribute(Attribute::ZExt); RetAttrs.addAttribute(Attribute::NoAlias); @@ -433,7 +433,7 @@ bool AMDGPURewriteOutArguments::runOnFunction(Function &F) { PointerType *ArgType = cast<PointerType>(Arg.getType()); - auto *EltTy = ArgType->getElementType(); + auto *EltTy = ArgType->getPointerElementType(); const auto Align = DL->getValueOrABITypeAlignment(Arg.getParamAlign(), EltTy); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index cd05797fdbdb..e82f9232b114 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -269,7 +269,6 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, HasGetWaveIdInst(false), HasSMemTimeInst(false), HasShaderCyclesRegister(false), - HasRegisterBanking(false), HasVOP3Literal(false), HasNoDataDepHazard(false), FlatAddressSpace(false), @@ -772,11 +771,11 @@ unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const { } unsigned -GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratchInit) const { +GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratch) const { if (getGeneration() >= AMDGPUSubtarget::GFX10) return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs. - if (HasFlatScratchInit || HasArchitectedFlatScratch) { + if (HasFlatScratch || HasArchitectedFlatScratch) { if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) return 6; // FLAT_SCRATCH, XNACK, VCC (in that order). 
if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) @@ -794,20 +793,11 @@ unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const { } unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const { - // The logic to detect if the function has - // flat scratch init is slightly different than how - // SIMachineFunctionInfo constructor derives. - // We don't use amdgpu-calls, amdgpu-stack-objects - // attributes and isAmdHsaOrMesa here as it doesn't really matter. - // TODO: Outline this derivation logic and have just - // one common function in the backend to avoid duplication. - bool isEntry = AMDGPU::isEntryFunctionCC(F.getCallingConv()); - bool FunctionHasFlatScratchInit = false; - if (hasFlatAddressSpace() && isEntry && !flatScratchIsArchitected() && - enableFlatScratch()) { - FunctionHasFlatScratchInit = true; - } - return getBaseReservedNumSGPRs(FunctionHasFlatScratchInit); + // In principle we do not need to reserve SGPR pair used for flat_scratch if + // we know flat instructions do not access the stack anywhere in the + // program. For now assume it's needed if we have flat instructions. + const bool KernelUsesFlatScratch = hasFlatAddressSpace(); + return getBaseReservedNumSGPRs(KernelUsesFlatScratch); } unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h index 88ed4b2b7a24..7f1b94be4ffe 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -212,7 +212,19 @@ public: /// Returns the offset in bytes from the start of the input buffer /// of the first explicit kernel argument. unsigned getExplicitKernelArgOffset(const Function &F) const { - return isAmdHsaOrMesa(F) ? 0 : 36; + switch (TargetTriple.getOS()) { + case Triple::AMDHSA: + case Triple::AMDPAL: + case Triple::Mesa3D: + return 0; + case Triple::UnknownOS: + default: + // For legacy reasons unknown/other is treated as a different version of + // mesa. 
+ return 36; + } + + llvm_unreachable("invalid triple OS"); } /// \returns Maximum number of work groups per compute unit supported by the diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h index 226646a96953..dd3676f3b707 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h @@ -21,8 +21,6 @@ namespace llvm { -class ScheduleDAGMILive; - //===----------------------------------------------------------------------===// // AMDGPU Target Machine (R600+) //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index 09c5eb192e1f..a8df7789c8a1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -844,15 +844,8 @@ bool GCNTTIImpl::isInlineAsmSourceOfDivergence( TLI->ComputeConstraintToUse(TC, SDValue()); - Register AssignedReg; - const TargetRegisterClass *RC; - std::tie(AssignedReg, RC) = TLI->getRegForInlineAsmConstraint( - TRI, TC.ConstraintCode, TC.ConstraintVT); - if (AssignedReg) { - // FIXME: This is a workaround for getRegForInlineAsmConstraint - // returning VS_32 - RC = TRI->getPhysRegClass(AssignedReg); - } + const TargetRegisterClass *RC = TLI->getRegForInlineAsmConstraint( + TRI, TC.ConstraintCode, TC.ConstraintVT).second; // For AGPR constraints null is returned on subtargets without AGPRs, so // assume divergent for null. diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 2bb59086f391..c1c88d9a7462 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -62,7 +62,7 @@ class AMDGPUOperand : public MCParsedAsmOperand { public: AMDGPUOperand(KindTy Kind_, const AMDGPUAsmParser *AsmParser_) - : MCParsedAsmOperand(), Kind(Kind_), AsmParser(AsmParser_) {} + : Kind(Kind_), AsmParser(AsmParser_) {} using Ptr = std::unique_ptr<AMDGPUOperand>; @@ -1548,6 +1548,7 @@ private: bool validateVccOperand(unsigned Reg) const; bool validateVOPLiteral(const MCInst &Inst, const OperandVector &Operands); bool validateMAIAccWrite(const MCInst &Inst, const OperandVector &Operands); + bool validateMFMA(const MCInst &Inst, const OperandVector &Operands); bool validateAGPRLdSt(const MCInst &Inst) const; bool validateVGPRAlign(const MCInst &Inst) const; bool validateGWS(const MCInst &Inst, const OperandVector &Operands); @@ -3613,6 +3614,40 @@ bool AMDGPUAsmParser::validateMAIAccWrite(const MCInst &Inst, return true; } +bool AMDGPUAsmParser::validateMFMA(const MCInst &Inst, + const OperandVector &Operands) { + const unsigned Opc = Inst.getOpcode(); + const MCInstrDesc &Desc = MII.get(Opc); + + if ((Desc.TSFlags & SIInstrFlags::IsMAI) == 0) + return true; + + const int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2); + if (Src2Idx == -1) + return true; + + const MCOperand &Src2 = Inst.getOperand(Src2Idx); + if (!Src2.isReg()) + return true; + + MCRegister Src2Reg = Src2.getReg(); + MCRegister DstReg = Inst.getOperand(0).getReg(); + if (Src2Reg == DstReg) + return true; + + const MCRegisterInfo *TRI = getContext().getRegisterInfo(); + if (TRI->getRegClass(Desc.OpInfo[0].RegClass).getSizeInBits() <= 128) + return true; + + if (isRegIntersect(Src2Reg, DstReg, TRI)) { + Error(getRegLoc(mc2PseudoReg(Src2Reg), Operands), + "source 2 
operand must not partially overlap with dst"); + return false; + } + + return true; +} + bool AMDGPUAsmParser::validateDivScale(const MCInst &Inst) { switch (Inst.getOpcode()) { default: @@ -4297,6 +4332,9 @@ bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst, if (!validateMAIAccWrite(Inst, Operands)) { return false; } + if (!validateMFMA(Inst, Operands)) { + return false; + } if (!validateCoherencyBits(Inst, Operands, IDLoc)) { return false; } @@ -4568,7 +4606,13 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { uint64_t AccumOffset = 0; SMRange SGPRRange; uint64_t NextFreeSGPR = 0; - unsigned UserSGPRCount = 0; + + // Count the number of user SGPRs implied from the enabled feature bits. + unsigned ImpliedUserSGPRCount = 0; + + // Track if the asm explicitly contains the directive for the user SGPR + // count. + Optional<unsigned> ExplicitUserSGPRCount; bool ReserveVCC = true; bool ReserveFlatScr = true; Optional<bool> EnableWavefrontSize32; @@ -4617,6 +4661,8 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { if (!isUInt<sizeof(KD.kernarg_size) * CHAR_BIT>(Val)) return OutOfRangeError(ValRange); KD.kernarg_size = Val; + } else if (ID == ".amdhsa_user_sgpr_count") { + ExplicitUserSGPRCount = Val; } else if (ID == ".amdhsa_user_sgpr_private_segment_buffer") { if (hasArchitectedFlatScratch()) return Error(IDRange.Start, @@ -4626,31 +4672,31 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER, Val, ValRange); if (Val) - UserSGPRCount += 4; + ImpliedUserSGPRCount += 4; } else if (ID == ".amdhsa_user_sgpr_dispatch_ptr") { PARSE_BITS_ENTRY(KD.kernel_code_properties, KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR, Val, ValRange); if (Val) - UserSGPRCount += 2; + ImpliedUserSGPRCount += 2; } else if (ID == ".amdhsa_user_sgpr_queue_ptr") { PARSE_BITS_ENTRY(KD.kernel_code_properties, KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR, Val, ValRange); if (Val) - UserSGPRCount += 2; + ImpliedUserSGPRCount += 2; } else if (ID == ".amdhsa_user_sgpr_kernarg_segment_ptr") { PARSE_BITS_ENTRY(KD.kernel_code_properties, KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR, Val, ValRange); if (Val) - UserSGPRCount += 2; + ImpliedUserSGPRCount += 2; } else if (ID == ".amdhsa_user_sgpr_dispatch_id") { PARSE_BITS_ENTRY(KD.kernel_code_properties, KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID, Val, ValRange); if (Val) - UserSGPRCount += 2; + ImpliedUserSGPRCount += 2; } else if (ID == ".amdhsa_user_sgpr_flat_scratch_init") { if (hasArchitectedFlatScratch()) return Error(IDRange.Start, @@ -4660,13 +4706,13 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT, Val, ValRange); if (Val) - UserSGPRCount += 2; + ImpliedUserSGPRCount += 2; } else if (ID == ".amdhsa_user_sgpr_private_segment_size") { PARSE_BITS_ENTRY(KD.kernel_code_properties, KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE, Val, ValRange); if (Val) - UserSGPRCount += 1; + ImpliedUserSGPRCount += 1; } else if (ID == ".amdhsa_wavefront_size32") { if (IVersion.Major < 10) return Error(IDRange.Start, "directive requires gfx10+", IDRange); @@ -4850,6 +4896,13 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { COMPUTE_PGM_RSRC1_GRANULATED_WAVEFRONT_SGPR_COUNT, SGPRBlocks); + if (ExplicitUserSGPRCount && ImpliedUserSGPRCount > *ExplicitUserSGPRCount) + return TokError("amdgpu_user_sgpr_count smaller than than implied by " + "enabled user SGPRs"); + + unsigned UserSGPRCount = + ExplicitUserSGPRCount ? 
*ExplicitUserSGPRCount : ImpliedUserSGPRCount; + if (!isUInt<COMPUTE_PGM_RSRC2_USER_SGPR_COUNT_WIDTH>(UserSGPRCount)) return TokError("too many user SGPRs enabled"); AMDHSA_BITS_SET(KD.compute_pgm_rsrc2, COMPUTE_PGM_RSRC2_USER_SGPR_COUNT, diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td index 104b5160b985..c4043177b618 100644 --- a/llvm/lib/Target/AMDGPU/DSInstructions.td +++ b/llvm/lib/Target/AMDGPU/DSInstructions.td @@ -89,7 +89,6 @@ class DS_Real <DS_Pseudo ps> : !if(!or(ps.has_data0, ps.has_gws_data0), data0{9}, 0)); } - // DS Pseudo instructions class DS_0A1D_NORET<string opName, RegisterClass rc = VGPR_32> diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index c7ec5308e6d0..c530d3cb49f0 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -915,7 +915,7 @@ class FlatSignedAtomicPatNoRtn <FLAT_Pseudo inst, SDPatternOperator node, ValueT class FlatSignedAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt, ValueType data_vt = vt> : GCNPat < (vt (node (GlobalOffset i64:$vaddr, i16:$offset), data_vt:$data)), - (inst $vaddr, $data, $offset) + (inst VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset) >; class ScratchLoadSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index 0f8dd0b3bf58..c0592f6f3c7a 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -95,7 +95,9 @@ static bool isDGEMM(unsigned Opcode) { return Opcode == AMDGPU::V_MFMA_F64_4X4X4F64_e64 || Opcode == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64 || Opcode == AMDGPU::V_MFMA_F64_16X16X4F64_e64 || - Opcode == AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64; + Opcode == AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64 || + Opcode == AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64 || + Opcode == AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64; } static bool isXDL(const GCNSubtarget &ST, const MachineInstr &MI) { @@ -1438,7 +1440,7 @@ int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) { if (!Use.isReg()) continue; - unsigned Reg = Use.getReg(); + Register Reg = Use.getReg(); bool FullReg; const MachineInstr *MI1; @@ -1477,6 +1479,8 @@ int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) { switch (Opc1) { case AMDGPU::V_MFMA_F64_16X16X4F64_e64: case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64: + case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64: + case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64: if (!isXDL(ST, *MI)) NeedWaitStates = DMFMA16x16WritesVGPROverlappedSrcCWaitStates; break; @@ -1509,6 +1513,8 @@ int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) { switch (Opc1) { case AMDGPU::V_MFMA_F64_16X16X4F64_e64: case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64: + case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64: + case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64: NeedWaitStates = DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates; break; case AMDGPU::V_MFMA_F64_4X4X4F64_e64: diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h index 162121c2c525..716bc027a894 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h @@ -25,7 +25,6 @@ class MachineFunction; class MachineInstr; class MachineOperand; class MachineRegisterInfo; -class ScheduleDAG; class SIInstrInfo; class 
SIRegisterInfo; class GCNSubtarget; diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp index 82c09378acac..fb106d98c162 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp @@ -27,7 +27,7 @@ void llvm::printLivesAt(SlotIndex SI, << *LIS.getInstructionFromIndex(SI); unsigned Num = 0; for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) { - const unsigned Reg = Register::index2VirtReg(I); + const Register Reg = Register::index2VirtReg(I); if (!LIS.hasInterval(Reg)) continue; const auto &LI = LIS.getInterval(Reg); @@ -487,7 +487,7 @@ void GCNRPTracker::printLiveRegs(raw_ostream &OS, const LiveRegSet& LiveRegs, const MachineRegisterInfo &MRI) { const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo(); for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) { - unsigned Reg = Register::index2VirtReg(I); + Register Reg = Register::index2VirtReg(I); auto It = LiveRegs.find(Reg); if (It != LiveRegs.end() && It->second.any()) OS << ' ' << printVRegOrUnit(Reg, TRI) << ':' diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h index 53d6ff0aa731..a6e42ad3dfca 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h @@ -140,4 +140,4 @@ public: } // End namespace llvm -#endif // GCNSCHEDSTRATEGY_H +#endif // LLVM_LIB_TARGET_AMDGPU_GCNSCHEDSTRATEGY_H diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index d8bc0b2df2bd..0cd2cfa2f0e7 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -153,7 +153,6 @@ protected: bool HasGetWaveIdInst; bool HasSMemTimeInst; bool HasShaderCyclesRegister; - bool HasRegisterBanking; bool HasVOP3Literal; bool HasNoDataDepHazard; bool FlatAddressSpace; @@ -723,10 +722,6 @@ public: return HasShaderCyclesRegister; } - bool hasRegisterBanking() const { - return HasRegisterBanking; - } - bool hasVOP3Literal() const { return HasVOP3Literal; } @@ -1029,7 +1024,7 @@ public: /// \returns Reserved number of SGPRs. This is common /// utility function called by MachineFunction and /// Function variants of getReservedNumSGPRs. - unsigned getBaseReservedNumSGPRs(const bool HasFlatScratchInit) const; + unsigned getBaseReservedNumSGPRs(const bool HasFlatScratch) const; /// \returns Reserved number of SGPRs for given machine function \p MF. 
unsigned getReservedNumSGPRs(const MachineFunction &MF) const; diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp index b68b4b12e750..76663b563150 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp @@ -1397,21 +1397,26 @@ void AMDGPUInstPrinter::printWaitFlag(const MCInst *MI, unsigned OpNo, unsigned Vmcnt, Expcnt, Lgkmcnt; decodeWaitcnt(ISA, SImm16, Vmcnt, Expcnt, Lgkmcnt); + bool IsDefaultVmcnt = Vmcnt == getVmcntBitMask(ISA); + bool IsDefaultExpcnt = Expcnt == getExpcntBitMask(ISA); + bool IsDefaultLgkmcnt = Lgkmcnt == getLgkmcntBitMask(ISA); + bool PrintAll = IsDefaultVmcnt && IsDefaultExpcnt && IsDefaultLgkmcnt; + bool NeedSpace = false; - if (Vmcnt != getVmcntBitMask(ISA)) { + if (!IsDefaultVmcnt || PrintAll) { O << "vmcnt(" << Vmcnt << ')'; NeedSpace = true; } - if (Expcnt != getExpcntBitMask(ISA)) { + if (!IsDefaultExpcnt || PrintAll) { if (NeedSpace) O << ' '; O << "expcnt(" << Expcnt << ')'; NeedSpace = true; } - if (Lgkmcnt != getLgkmcntBitMask(ISA)) { + if (!IsDefaultLgkmcnt || PrintAll) { if (NeedSpace) O << ' '; O << "lgkmcnt(" << Lgkmcnt << ')'; diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp index 7708579a4491..ded3fb7ab8d9 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp @@ -15,8 +15,7 @@ using namespace llvm; AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(const Triple &TT, - const MCTargetOptions &Options) - : MCAsmInfoELF() { + const MCTargetOptions &Options) { CodePointerSize = (TT.getArch() == Triple::amdgcn) ? 8 : 4; StackGrowsUp = true; HasSingleParameterDotFile = false; diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp index 9a9a2c973f44..9578bdb0bad0 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp @@ -319,6 +319,10 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor( << KD.private_segment_fixed_size << '\n'; OS << "\t\t.amdhsa_kernarg_size " << KD.kernarg_size << '\n'; + PRINT_FIELD(OS, ".amdhsa_user_sgpr_count", KD, + compute_pgm_rsrc2, + amdhsa::COMPUTE_PGM_RSRC2_USER_SGPR_COUNT); + if (!hasArchitectedFlatScratch(STI)) PRINT_FIELD( OS, ".amdhsa_user_sgpr_private_segment_buffer", KD, diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td index 6dd886367302..cf03fd682143 100644 --- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td +++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td @@ -131,6 +131,38 @@ def MIMGMIPMappingTable : GenericTable { let PrimaryKeyName = "getMIMGMIPMappingInfo"; } +class MIMGBiasMapping<MIMGBaseOpcode bias, MIMGBaseOpcode nobias> { + MIMGBaseOpcode Bias = bias; + MIMGBaseOpcode NoBias = nobias; +} + +def MIMGBiasMappingTable : GenericTable { + let FilterClass = "MIMGBiasMapping"; + let CppTypeName = "MIMGBiasMappingInfo"; + let Fields = ["Bias", "NoBias"]; + string TypeOf_Bias = "MIMGBaseOpcode"; + string TypeOf_NoBias = "MIMGBaseOpcode"; + + let PrimaryKey = ["Bias"]; + let PrimaryKeyName = "getMIMGBiasMappingInfo"; +} + +class MIMGOffsetMapping<MIMGBaseOpcode offset, MIMGBaseOpcode nooffset> { + MIMGBaseOpcode Offset = offset; + MIMGBaseOpcode NoOffset = nooffset; +} + +def MIMGOffsetMappingTable : GenericTable 
{ + let FilterClass = "MIMGOffsetMapping"; + let CppTypeName = "MIMGOffsetMappingInfo"; + let Fields = ["Offset", "NoOffset"]; + string TypeOf_Offset = "MIMGBaseOpcode"; + string TypeOf_NoOffset = "MIMGBaseOpcode"; + + let PrimaryKey = ["Offset"]; + let PrimaryKeyName = "getMIMGOffsetMappingInfo"; +} + class MIMGG16Mapping<MIMGBaseOpcode g, MIMGBaseOpcode g16> { MIMGBaseOpcode G = g; MIMGBaseOpcode G16 = g16; @@ -1070,6 +1102,9 @@ class ImageDimIntrinsicInfo<AMDGPUImageDimIntrinsic I> { AMDGPUDimProps Dim = I.P.Dim; AMDGPUImageDimIntrinsicEval DimEval = AMDGPUImageDimIntrinsicEval<I.P>; + bits<8> NumOffsetArgs = DimEval.NumOffsetArgs; + bits<8> NumBiasArgs = DimEval.NumBiasArgs; + bits<8> NumZCompareArgs = DimEval.NumZCompareArgs; bits<8> NumGradients = DimEval.NumGradientArgs; bits<8> NumDmask = DimEval.NumDmaskArgs; bits<8> NumData = DimEval.NumDataArgs; @@ -1078,6 +1113,9 @@ class ImageDimIntrinsicInfo<AMDGPUImageDimIntrinsic I> { bits<8> DMaskIndex = DimEval.DmaskArgIndex; bits<8> VAddrStart = DimEval.VAddrArgIndex; + bits<8> OffsetIndex = DimEval.OffsetArgIndex; + bits<8> BiasIndex = DimEval.BiasArgIndex; + bits<8> ZCompareIndex = DimEval.ZCompareArgIndex; bits<8> GradientStart = DimEval.GradientArgIndex; bits<8> CoordStart = DimEval.CoordArgIndex; bits<8> LodIndex = DimEval.LodArgIndex; @@ -1089,6 +1127,8 @@ class ImageDimIntrinsicInfo<AMDGPUImageDimIntrinsic I> { bits<8> TexFailCtrlIndex = DimEval.TexFailCtrlArgIndex; bits<8> CachePolicyIndex = DimEval.CachePolicyArgIndex; + bits<8> BiasTyArg = !add(I.P.NumRetAndDataAnyTypes, + !if(!eq(NumOffsetArgs, 0), 0, I.P.ExtraAddrArgs[0].Type.isAny)); bits<8> GradientTyArg = !add(I.P.NumRetAndDataAnyTypes, !foldl(0, I.P.ExtraAddrArgs, cnt, arg, !add(cnt, arg.Type.isAny))); bits<8> CoordTyArg = !add(GradientTyArg, !if(I.P.Gradients, 1, 0)); @@ -1096,10 +1136,10 @@ class ImageDimIntrinsicInfo<AMDGPUImageDimIntrinsic I> { def ImageDimIntrinsicTable : GenericTable { let FilterClass = "ImageDimIntrinsicInfo"; - let Fields = ["Intr", "BaseOpcode", "Dim", "NumGradients", "NumDmask", "NumData", "NumVAddrs", "NumArgs", - "DMaskIndex", "VAddrStart", "GradientStart", "CoordStart", "LodIndex", "MipIndex", "VAddrEnd", + let Fields = ["Intr", "BaseOpcode", "Dim", "NumOffsetArgs", "NumBiasArgs", "NumZCompareArgs", "NumGradients", "NumDmask", "NumData", "NumVAddrs", "NumArgs", + "DMaskIndex", "VAddrStart", "OffsetIndex", "BiasIndex", "ZCompareIndex", "GradientStart", "CoordStart", "LodIndex", "MipIndex", "VAddrEnd", "RsrcIndex", "SampIndex", "UnormIndex", "TexFailCtrlIndex", "CachePolicyIndex", - "GradientTyArg", "CoordTyArg"]; + "BiasTyArg", "GradientTyArg", "CoordTyArg"]; string TypeOf_BaseOpcode = "MIMGBaseOpcode"; string TypeOf_Dim = "MIMGDim"; @@ -1132,6 +1172,66 @@ def : MIMGLZMapping<IMAGE_GATHER4_C_L_O, IMAGE_GATHER4_C_LZ_O>; def : MIMGMIPMapping<IMAGE_LOAD_MIP, IMAGE_LOAD>; def : MIMGMIPMapping<IMAGE_STORE_MIP, IMAGE_STORE>; +// Bias to NoBias Optimization Mapping +def : MIMGBiasMapping<IMAGE_SAMPLE_B, IMAGE_SAMPLE>; +def : MIMGBiasMapping<IMAGE_SAMPLE_B_CL, IMAGE_SAMPLE_CL>; +def : MIMGBiasMapping<IMAGE_SAMPLE_C_B, IMAGE_SAMPLE_C>; +def : MIMGBiasMapping<IMAGE_SAMPLE_C_B_CL, IMAGE_SAMPLE_C_CL>; +def : MIMGBiasMapping<IMAGE_SAMPLE_B_O, IMAGE_SAMPLE_O>; +def : MIMGBiasMapping<IMAGE_SAMPLE_B_CL_O, IMAGE_SAMPLE_CL_O>; +def : MIMGBiasMapping<IMAGE_SAMPLE_C_B_O, IMAGE_SAMPLE_C_O>; +def : MIMGBiasMapping<IMAGE_SAMPLE_C_B_CL_O, IMAGE_SAMPLE_C_CL_O>; +def : MIMGBiasMapping<IMAGE_GATHER4_B, IMAGE_GATHER4>; +def : MIMGBiasMapping<IMAGE_GATHER4_B_CL, 
IMAGE_GATHER4_CL>; +def : MIMGBiasMapping<IMAGE_GATHER4_C_B, IMAGE_GATHER4_C>; +def : MIMGBiasMapping<IMAGE_GATHER4_C_B_CL, IMAGE_GATHER4_C_CL>; +def : MIMGBiasMapping<IMAGE_GATHER4_B_O, IMAGE_GATHER4_O>; +def : MIMGBiasMapping<IMAGE_GATHER4_B_CL_O, IMAGE_GATHER4_CL_O>; +def : MIMGBiasMapping<IMAGE_GATHER4_C_B_O, IMAGE_GATHER4_C_O>; +def : MIMGBiasMapping<IMAGE_GATHER4_C_B_CL_O, IMAGE_GATHER4_C_CL_O>; + +// Offset to NoOffset Optimization Mapping +def : MIMGOffsetMapping<IMAGE_SAMPLE_O, IMAGE_SAMPLE>; +def : MIMGOffsetMapping<IMAGE_SAMPLE_CL_O, IMAGE_SAMPLE_CL>; +def : MIMGOffsetMapping<IMAGE_SAMPLE_D_O, IMAGE_SAMPLE_D>; +def : MIMGOffsetMapping<IMAGE_SAMPLE_D_CL_O, IMAGE_SAMPLE_D_CL>; +def : MIMGOffsetMapping<IMAGE_SAMPLE_D_O_G16, IMAGE_SAMPLE_D_G16>; +def : MIMGOffsetMapping<IMAGE_SAMPLE_D_CL_O_G16, IMAGE_SAMPLE_D_CL_G16>; +def : MIMGOffsetMapping<IMAGE_SAMPLE_L_O, IMAGE_SAMPLE_L>; +def : MIMGOffsetMapping<IMAGE_SAMPLE_B_O, IMAGE_SAMPLE_B>; +def : MIMGOffsetMapping<IMAGE_SAMPLE_B_CL_O, IMAGE_SAMPLE_B_CL>; +def : MIMGOffsetMapping<IMAGE_SAMPLE_LZ_O, IMAGE_SAMPLE_LZ>; +def : MIMGOffsetMapping<IMAGE_SAMPLE_C_O, IMAGE_SAMPLE_C>; +def : MIMGOffsetMapping<IMAGE_SAMPLE_C_CL_O, IMAGE_SAMPLE_C_CL>; +def : MIMGOffsetMapping<IMAGE_SAMPLE_C_D_O, IMAGE_SAMPLE_C_D>; +def : MIMGOffsetMapping<IMAGE_SAMPLE_C_D_CL_O, IMAGE_SAMPLE_C_D_CL>; +def : MIMGOffsetMapping<IMAGE_SAMPLE_C_D_O_G16, IMAGE_SAMPLE_C_D_G16>; +def : MIMGOffsetMapping<IMAGE_SAMPLE_C_D_CL_O_G16, IMAGE_SAMPLE_C_D_CL_G16>; +def : MIMGOffsetMapping<IMAGE_SAMPLE_C_L_O, IMAGE_SAMPLE_C_L>; +def : MIMGOffsetMapping<IMAGE_SAMPLE_C_B_CL_O, IMAGE_SAMPLE_C_B_CL>; +def : MIMGOffsetMapping<IMAGE_SAMPLE_C_B_O, IMAGE_SAMPLE_C_B>; +def : MIMGOffsetMapping<IMAGE_SAMPLE_C_LZ_O, IMAGE_SAMPLE_C_LZ>; +def : MIMGOffsetMapping<IMAGE_GATHER4_O, IMAGE_GATHER4>; +def : MIMGOffsetMapping<IMAGE_GATHER4_CL_O, IMAGE_GATHER4_CL>; +def : MIMGOffsetMapping<IMAGE_GATHER4_L_O, IMAGE_GATHER4_L>; +def : MIMGOffsetMapping<IMAGE_GATHER4_B_O, IMAGE_GATHER4_B>; +def : MIMGOffsetMapping<IMAGE_GATHER4_B_CL_O, IMAGE_GATHER4_B_CL>; +def : MIMGOffsetMapping<IMAGE_GATHER4_LZ_O, IMAGE_GATHER4_LZ>; +def : MIMGOffsetMapping<IMAGE_GATHER4_C_O, IMAGE_GATHER4_C>; +def : MIMGOffsetMapping<IMAGE_GATHER4_C_CL_O, IMAGE_GATHER4_C_CL>; +def : MIMGOffsetMapping<IMAGE_GATHER4_C_L_O, IMAGE_GATHER4_C_L>; +def : MIMGOffsetMapping<IMAGE_GATHER4_C_B_O, IMAGE_GATHER4_C_B>; +def : MIMGOffsetMapping<IMAGE_GATHER4_C_B_CL_O, IMAGE_GATHER4_C_B_CL>; +def : MIMGOffsetMapping<IMAGE_GATHER4_C_LZ_O, IMAGE_GATHER4_C_LZ>; +def : MIMGOffsetMapping<IMAGE_SAMPLE_CD_O, IMAGE_SAMPLE_CD>; +def : MIMGOffsetMapping<IMAGE_SAMPLE_CD_CL_O, IMAGE_SAMPLE_CD_CL>; +def : MIMGOffsetMapping<IMAGE_SAMPLE_C_CD_O, IMAGE_SAMPLE_C_CD>; +def : MIMGOffsetMapping<IMAGE_SAMPLE_C_CD_CL_O, IMAGE_SAMPLE_C_CD_CL>; +def : MIMGOffsetMapping<IMAGE_SAMPLE_CD_O_G16, IMAGE_SAMPLE_CD_G16>; +def : MIMGOffsetMapping<IMAGE_SAMPLE_CD_CL_O_G16, IMAGE_SAMPLE_CD_CL_G16>; +def : MIMGOffsetMapping<IMAGE_SAMPLE_C_CD_O_G16, IMAGE_SAMPLE_C_CD_G16>; +def : MIMGOffsetMapping<IMAGE_SAMPLE_C_CD_CL_O_G16, IMAGE_SAMPLE_C_CD_CL_G16>; + // G to G16 Optimization Mapping def : MIMGG16Mapping<IMAGE_SAMPLE_D, IMAGE_SAMPLE_D_G16>; def : MIMGG16Mapping<IMAGE_SAMPLE_D_CL, IMAGE_SAMPLE_D_CL_G16>; diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.h b/llvm/lib/Target/AMDGPU/R600ISelLowering.h index f9a9a6127322..1e75a0432ec3 100644 --- a/llvm/lib/Target/AMDGPU/R600ISelLowering.h +++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.h @@ -19,7 +19,6 @@ namespace llvm { -class 
R600InstrInfo; class R600Subtarget; class R600TargetLowering final : public AMDGPUTargetLowering { diff --git a/llvm/lib/Target/AMDGPU/R600InstrInfo.h b/llvm/lib/Target/AMDGPU/R600InstrInfo.h index fc567f1a1fca..bc8a4786df77 100644 --- a/llvm/lib/Target/AMDGPU/R600InstrInfo.h +++ b/llvm/lib/Target/AMDGPU/R600InstrInfo.h @@ -29,7 +29,6 @@ enum : uint64_t { }; } -class AMDGPUTargetMachine; class DFAPacketizer; class MachineFunction; class MachineInstr; diff --git a/llvm/lib/Target/AMDGPU/R600Subtarget.h b/llvm/lib/Target/AMDGPU/R600Subtarget.h index 94403b88f21a..92d559b1f8e6 100644 --- a/llvm/lib/Target/AMDGPU/R600Subtarget.h +++ b/llvm/lib/Target/AMDGPU/R600Subtarget.h @@ -21,12 +21,6 @@ #include "Utils/AMDGPUBaseInfo.h" #include "llvm/CodeGen/SelectionDAGTargetInfo.h" -namespace llvm { - -class MCInstrInfo; - -} // namespace llvm - #define GET_SUBTARGETINFO_HEADER #include "R600GenSubtargetInfo.inc" diff --git a/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp b/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp index 397b2f873515..b81fac36fc95 100644 --- a/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp +++ b/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp @@ -245,6 +245,12 @@ Value *SIAnnotateControlFlow::handleLoopCondition( return CallInst::Create(IfBreak, Args, "", Insert); } + if (isa<Argument>(Cond)) { + Instruction *Insert = L->getHeader()->getFirstNonPHIOrDbgOrLifetime(); + Value *Args[] = { Cond, Broken }; + return CallInst::Create(IfBreak, Args, "", Insert); + } + llvm_unreachable("Unhandled loop condition!"); } diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h index 580e4bc417a4..107ee5ed5532 100644 --- a/llvm/lib/Target/AMDGPU/SIDefines.h +++ b/llvm/lib/Target/AMDGPU/SIDefines.h @@ -379,6 +379,8 @@ enum Id { // HwRegCode, (6) [5:0] ID_FLAT_SCR_LO = 20, ID_FLAT_SCR_HI = 21, ID_XNACK_MASK = 22, + ID_HW_ID1 = 23, + ID_HW_ID2 = 24, ID_POPS_PACKER = 25, ID_SHADER_CYCLES = 29, ID_SYMBOLIC_FIRST_GFX1030_ = ID_SHADER_CYCLES, diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index 1f93284fc7ee..33954e11d6c6 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -300,6 +300,13 @@ static bool updateOperand(FoldCandidate &Fold, assert(!Fold.needsShrink() && "not handled"); if (Fold.isImm()) { + if (Old.isTied()) { + int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(MI->getOpcode()); + if (NewMFMAOpc == -1) + return false; + MI->setDesc(TII.get(NewMFMAOpc)); + MI->untieRegOperand(0); + } Old.ChangeToImmediate(Fold.ImmToFold); return true; } diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp index d4fe74ecb96e..6078f4a0577a 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -1195,7 +1195,8 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized( } } else if (TII->isStoreToStackSlot(MI, FrameIndex) || TII->isLoadFromStackSlot(MI, FrameIndex)) - NonVGPRSpillFIs.set(FrameIndex); + if (!MFI.isFixedObjectIndex(FrameIndex)) + NonVGPRSpillFIs.set(FrameIndex); } } @@ -1320,16 +1321,14 @@ void SIFrameLowering::determineCalleeSavesSGPR(MachineFunction &MF, const BitVector AllSavedRegs = SavedRegs; SavedRegs.clearBitsInMask(TRI->getAllVectorRegMask()); - // If clearing VGPRs changed the mask, we will have some CSR VGPR spills. 
- const bool HaveAnyCSRVGPR = SavedRegs != AllSavedRegs; - // We have to anticipate introducing CSR VGPR spills or spill of caller // save VGPR reserved for SGPR spills as we now always create stack entry - // for it, if we don't have any stack objects already, since we require - // an FP if there is a call and stack. + // for it, if we don't have any stack objects already, since we require a FP + // if there is a call and stack. We will allocate a VGPR for SGPR spills if + // there are any SGPR spills. Whether they are CSR spills or otherwise. MachineFrameInfo &FrameInfo = MF.getFrameInfo(); const bool WillHaveFP = - FrameInfo.hasCalls() && (HaveAnyCSRVGPR || MFI->VGPRReservedForSGPRSpill); + FrameInfo.hasCalls() && (AllSavedRegs.any() || MFI->hasSpilledSGPRs()); // FP will be specially managed like SP. if (WillHaveFP || hasFP(MF)) diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.h b/llvm/lib/Target/AMDGPU/SIFrameLowering.h index 56fbb875ffd9..7949dcfa6632 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.h +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.h @@ -13,11 +13,6 @@ namespace llvm { -class SIInstrInfo; -class SIMachineFunctionInfo; -class SIRegisterInfo; -class GCNSubtarget; - class SIFrameLowering final : public AMDGPUFrameLowering { public: SIFrameLowering(StackDirection D, Align StackAl, int LAO, diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 9f138136e6e9..561866b5a398 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -45,10 +45,6 @@ static cl::opt<bool> DisableLoopAlignment( cl::desc("Do not align and prefetch loops"), cl::init(false)); -static cl::opt<bool> VGPRReserveforSGPRSpill( - "amdgpu-reserve-vgpr-for-sgpr-spill", - cl::desc("Allocates one VGPR for future SGPR Spill"), cl::init(true)); - static cl::opt<bool> UseDivergentRegisterIndexing( "amdgpu-use-divergent-register-indexing", cl::Hidden, @@ -138,6 +134,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass); addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass); addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass); + addRegisterClass(MVT::v8i16, &AMDGPU::SGPR_128RegClass); + addRegisterClass(MVT::v8f16, &AMDGPU::SGPR_128RegClass); } addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass); @@ -273,7 +271,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, MVT::v2i64, MVT::v2f64, MVT::v4i16, MVT::v4f16, MVT::v3i64, MVT::v3f64, MVT::v6i32, MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64, - MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32 }) { + MVT::v8i16, MVT::v8f16, MVT::v16i64, MVT::v16f64, + MVT::v32i32, MVT::v32f32 }) { for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) { switch (Op) { case ISD::LOAD: @@ -615,7 +614,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, if (STI.hasMadF16()) setOperationAction(ISD::FMAD, MVT::f16, Legal); - for (MVT VT : {MVT::v2i16, MVT::v2f16, MVT::v4i16, MVT::v4f16}) { + for (MVT VT : {MVT::v2i16, MVT::v2f16, MVT::v4i16, MVT::v4f16, MVT::v8i16, + MVT::v8f16}) { for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) { switch (Op) { case ISD::LOAD: @@ -677,6 +677,21 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::STORE, MVT::v4f16, Promote); AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32); + setOperationAction(ISD::LOAD, MVT::v8i16, Promote); + AddPromotedToType(ISD::LOAD, MVT::v8i16, MVT::v4i32); + 
setOperationAction(ISD::LOAD, MVT::v8f16, Promote); + AddPromotedToType(ISD::LOAD, MVT::v8f16, MVT::v4i32); + + setOperationAction(ISD::STORE, MVT::v4i16, Promote); + AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32); + setOperationAction(ISD::STORE, MVT::v4f16, Promote); + AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32); + + setOperationAction(ISD::STORE, MVT::v8i16, Promote); + AddPromotedToType(ISD::STORE, MVT::v8i16, MVT::v4i32); + setOperationAction(ISD::STORE, MVT::v8f16, Promote); + AddPromotedToType(ISD::STORE, MVT::v8f16, MVT::v4i32); + setOperationAction(ISD::ANY_EXTEND, MVT::v2i32, Expand); setOperationAction(ISD::ZERO_EXTEND, MVT::v2i32, Expand); setOperationAction(ISD::SIGN_EXTEND, MVT::v2i32, Expand); @@ -686,6 +701,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::ZERO_EXTEND, MVT::v4i32, Expand); setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Expand); + setOperationAction(ISD::ANY_EXTEND, MVT::v8i32, Expand); + setOperationAction(ISD::ZERO_EXTEND, MVT::v8i32, Expand); + setOperationAction(ISD::SIGN_EXTEND, MVT::v8i32, Expand); + if (!Subtarget->hasVOP3PInsts()) { setOperationAction(ISD::BUILD_VECTOR, MVT::v2i16, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom); @@ -703,9 +722,20 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::FMINNUM_IEEE, MVT::v4f16, Custom); setOperationAction(ISD::FMAXNUM_IEEE, MVT::v4f16, Custom); + setOperationAction(ISD::FMINNUM_IEEE, MVT::v8f16, Custom); + setOperationAction(ISD::FMAXNUM_IEEE, MVT::v8f16, Custom); setOperationAction(ISD::FMINNUM, MVT::v4f16, Expand); setOperationAction(ISD::FMAXNUM, MVT::v4f16, Expand); + setOperationAction(ISD::FMINNUM, MVT::v8f16, Expand); + setOperationAction(ISD::FMAXNUM, MVT::v8f16, Expand); + + for (MVT Vec16 : { MVT::v8i16, MVT::v8f16 }) { + setOperationAction(ISD::BUILD_VECTOR, Vec16, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec16, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, Vec16, Expand); + setOperationAction(ISD::SCALAR_TO_VECTOR, Vec16, Expand); + } } if (Subtarget->hasVOP3PInsts()) { @@ -739,34 +769,42 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f16, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i16, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f16, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i16, Custom); - setOperationAction(ISD::SHL, MVT::v4i16, Custom); - setOperationAction(ISD::SRA, MVT::v4i16, Custom); - setOperationAction(ISD::SRL, MVT::v4i16, Custom); - setOperationAction(ISD::ADD, MVT::v4i16, Custom); - setOperationAction(ISD::SUB, MVT::v4i16, Custom); - setOperationAction(ISD::MUL, MVT::v4i16, Custom); + for (MVT VT : { MVT::v4i16, MVT::v8i16 }) { + // Split vector operations. 
+ setOperationAction(ISD::SHL, VT, Custom); + setOperationAction(ISD::SRA, VT, Custom); + setOperationAction(ISD::SRL, VT, Custom); + setOperationAction(ISD::ADD, VT, Custom); + setOperationAction(ISD::SUB, VT, Custom); + setOperationAction(ISD::MUL, VT, Custom); - setOperationAction(ISD::SMIN, MVT::v4i16, Custom); - setOperationAction(ISD::SMAX, MVT::v4i16, Custom); - setOperationAction(ISD::UMIN, MVT::v4i16, Custom); - setOperationAction(ISD::UMAX, MVT::v4i16, Custom); + setOperationAction(ISD::SMIN, VT, Custom); + setOperationAction(ISD::SMAX, VT, Custom); + setOperationAction(ISD::UMIN, VT, Custom); + setOperationAction(ISD::UMAX, VT, Custom); - setOperationAction(ISD::UADDSAT, MVT::v4i16, Custom); - setOperationAction(ISD::SADDSAT, MVT::v4i16, Custom); - setOperationAction(ISD::USUBSAT, MVT::v4i16, Custom); - setOperationAction(ISD::SSUBSAT, MVT::v4i16, Custom); + setOperationAction(ISD::UADDSAT, VT, Custom); + setOperationAction(ISD::SADDSAT, VT, Custom); + setOperationAction(ISD::USUBSAT, VT, Custom); + setOperationAction(ISD::SSUBSAT, VT, Custom); + } - setOperationAction(ISD::FADD, MVT::v4f16, Custom); - setOperationAction(ISD::FMUL, MVT::v4f16, Custom); - setOperationAction(ISD::FMA, MVT::v4f16, Custom); + for (MVT VT : { MVT::v4f16, MVT::v8f16 }) { + // Split vector operations. + setOperationAction(ISD::FADD, VT, Custom); + setOperationAction(ISD::FMUL, VT, Custom); + setOperationAction(ISD::FMA, VT, Custom); + setOperationAction(ISD::FCANONICALIZE, VT, Custom); + } setOperationAction(ISD::FMAXNUM, MVT::v2f16, Custom); setOperationAction(ISD::FMINNUM, MVT::v2f16, Custom); setOperationAction(ISD::FMINNUM, MVT::v4f16, Custom); setOperationAction(ISD::FMAXNUM, MVT::v4f16, Custom); - setOperationAction(ISD::FCANONICALIZE, MVT::v4f16, Custom); setOperationAction(ISD::FEXP, MVT::v2f16, Custom); setOperationAction(ISD::SELECT, MVT::v4i16, Custom); @@ -803,7 +841,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::FABS, MVT::v2f16, Custom); } - for (MVT VT : { MVT::v4i16, MVT::v4f16, MVT::v2i8, MVT::v4i8, MVT::v8i8 }) { + for (MVT VT : { MVT::v4i16, MVT::v4f16, MVT::v2i8, MVT::v4i8, MVT::v8i8, + MVT::v8i16, MVT::v8f16 }) { setOperationAction(ISD::SELECT, VT, Custom); } @@ -2776,6 +2815,7 @@ void SITargetLowering::passSpecialInputs( SelectionDAG &DAG = CLI.DAG; const SDLoc &DL = CLI.DL; + const Function &F = DAG.getMachineFunction().getFunction(); const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo(); @@ -2887,11 +2927,16 @@ void SITargetLowering::passSpecialInputs( // If incoming ids are not packed we need to pack them. 
if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX && - NeedWorkItemIDX) - InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX); + NeedWorkItemIDX) { + if (Subtarget->getMaxWorkitemID(F, 0) != 0) { + InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX); + } else { + InputReg = DAG.getConstant(0, DL, MVT::i32); + } + } if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY && - NeedWorkItemIDY) { + NeedWorkItemIDY && Subtarget->getMaxWorkitemID(F, 1) != 0) { SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY); Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y, DAG.getShiftAmountConstant(10, MVT::i32, SL)); @@ -2900,7 +2945,7 @@ void SITargetLowering::passSpecialInputs( } if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ && - NeedWorkItemIDZ) { + NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(F, 2) != 0) { SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ); Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z, DAG.getShiftAmountConstant(20, MVT::i32, SL)); @@ -2909,13 +2954,21 @@ void SITargetLowering::passSpecialInputs( } if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) { - // Workitem ids are already packed, any of present incoming arguments - // will carry all required fields. - ArgDescriptor IncomingArg = ArgDescriptor::createArg( - IncomingArgX ? *IncomingArgX : - IncomingArgY ? *IncomingArgY : - *IncomingArgZ, ~0u); - InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, IncomingArg); + if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) { + // We're in a situation where the outgoing function requires the workitem + // ID, but the calling function does not have it (e.g a graphics function + // calling a C calling convention function). This is illegal, but we need + // to produce something. + InputReg = DAG.getUNDEF(MVT::i32); + } else { + // Workitem ids are already packed, any of present incoming arguments + // will carry all required fields. + ArgDescriptor IncomingArg = ArgDescriptor::createArg( + IncomingArgX ? *IncomingArgX : + IncomingArgY ? *IncomingArgY : + *IncomingArgZ, ~0u); + InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, IncomingArg); + } } if (OutgoingArg->isRegister()) { @@ -4600,7 +4653,8 @@ SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op, unsigned Opc = Op.getOpcode(); EVT VT = Op.getValueType(); assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 || - VT == MVT::v8f32 || VT == MVT::v16f32 || VT == MVT::v32f32); + VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8f32 || + VT == MVT::v16f32 || VT == MVT::v32f32); SDValue Lo0, Hi0; std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0); @@ -4621,21 +4675,26 @@ SDValue SITargetLowering::splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const { unsigned Opc = Op.getOpcode(); EVT VT = Op.getValueType(); - assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 || - VT == MVT::v8f32 || VT == MVT::v16f32 || VT == MVT::v32f32); + assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 || + VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v8f32 || + VT == MVT::v16f32 || VT == MVT::v32f32); SDValue Lo0, Hi0; - std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0); + SDValue Op0 = Op.getOperand(0); + std::tie(Lo0, Hi0) = Op0.getValueType().isVector() + ? 
DAG.SplitVectorOperand(Op.getNode(), 0) + : std::make_pair(Op0, Op0); SDValue Lo1, Hi1; std::tie(Lo1, Hi1) = DAG.SplitVectorOperand(Op.getNode(), 1); SDValue Lo2, Hi2; std::tie(Lo2, Hi2) = DAG.SplitVectorOperand(Op.getNode(), 2); SDLoc SL(Op); + auto ResVT = DAG.GetSplitDestVTs(VT); - SDValue OpLo = DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1, Lo2, + SDValue OpLo = DAG.getNode(Opc, SL, ResVT.first, Lo0, Lo1, Lo2, Op->getFlags()); - SDValue OpHi = DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1, Hi2, + SDValue OpHi = DAG.getNode(Opc, SL, ResVT.second, Hi0, Hi1, Hi2, Op->getFlags()); return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi); @@ -5297,7 +5356,7 @@ SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op, if (IsIEEEMode) return expandFMINNUM_FMAXNUM(Op.getNode(), DAG); - if (VT == MVT::v4f16) + if (VT == MVT::v4f16 || VT == MVT::v8f16) return splitBinaryVectorOp(Op, DAG); return Op; } @@ -5501,6 +5560,22 @@ SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL, MachineMemOperand::MOInvariant); } +/// Return true if the value is a known valid address, such that a null check is +/// not necessary. +static bool isKnownNonNull(SDValue Val, SelectionDAG &DAG, + const AMDGPUTargetMachine &TM, unsigned AddrSpace) { + if (isa<FrameIndexSDNode>(Val) || isa<GlobalAddressSDNode>(Val) || + isa<BasicBlockSDNode>(Val)) + return true; + + if (auto *ConstVal = dyn_cast<ConstantSDNode>(Val)) + return ConstVal->getSExtValue() != TM.getNullPointerValue(AddrSpace); + + // TODO: Search through arithmetic, handle arguments and loads + // marked nonnull. + return false; +} + SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) const { SDLoc SL(Op); @@ -5508,48 +5583,64 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op, SDValue Src = ASC->getOperand(0); SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64); + unsigned SrcAS = ASC->getSrcAddressSpace(); const AMDGPUTargetMachine &TM = static_cast<const AMDGPUTargetMachine &>(getTargetMachine()); // flat -> local/private - if (ASC->getSrcAddressSpace() == AMDGPUAS::FLAT_ADDRESS) { + if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { unsigned DestAS = ASC->getDestAddressSpace(); if (DestAS == AMDGPUAS::LOCAL_ADDRESS || DestAS == AMDGPUAS::PRIVATE_ADDRESS) { + SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src); + + if (isKnownNonNull(Src, DAG, TM, SrcAS)) + return Ptr; + unsigned NullVal = TM.getNullPointerValue(DestAS); SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32); SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE); - SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src); - return DAG.getNode(ISD::SELECT, SL, MVT::i32, - NonNull, Ptr, SegmentNullPtr); + return DAG.getNode(ISD::SELECT, SL, MVT::i32, NonNull, Ptr, + SegmentNullPtr); } } // local/private -> flat if (ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) { - unsigned SrcAS = ASC->getSrcAddressSpace(); - if (SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS) { + + SDValue Aperture = getSegmentAperture(ASC->getSrcAddressSpace(), SL, DAG); + SDValue CvtPtr = + DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture); + CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr); + + if (isKnownNonNull(Src, DAG, TM, SrcAS)) + return CvtPtr; + unsigned NullVal = TM.getNullPointerValue(SrcAS); SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32); SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE); - SDValue Aperture = 
getSegmentAperture(ASC->getSrcAddressSpace(), SL, DAG); - SDValue CvtPtr - = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture); - - return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull, - DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr), + return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull, CvtPtr, FlatNullPtr); } } + if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT && + Op.getValueType() == MVT::i64) { + const SIMachineFunctionInfo *Info = + DAG.getMachineFunction().getInfo<SIMachineFunctionInfo>(); + SDValue Hi = DAG.getConstant(Info->get32BitAddressHighBits(), SL, MVT::i32); + SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Hi); + return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec); + } + if (ASC->getDestAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT && Src.getValueType() == MVT::i64) return DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src); @@ -5676,7 +5767,6 @@ SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op, EVT VecVT = Vec.getValueType(); unsigned VecSize = VecVT.getSizeInBits(); EVT EltVT = VecVT.getVectorElementType(); - assert(VecSize <= 64); DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr); @@ -5687,6 +5777,28 @@ SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op, if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI)) return Combined; + if (VecSize == 128) { + SDValue Lo, Hi; + EVT LoVT, HiVT; + SDValue V2 = DAG.getBitcast(MVT::v2i64, Vec); + std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VecVT); + Lo = + DAG.getBitcast(LoVT, DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, + V2, DAG.getConstant(0, SL, MVT::i32))); + Hi = + DAG.getBitcast(HiVT, DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, + V2, DAG.getConstant(1, SL, MVT::i32))); + EVT IdxVT = Idx.getValueType(); + unsigned NElem = VecVT.getVectorNumElements(); + assert(isPowerOf2_32(NElem)); + SDValue IdxMask = DAG.getConstant(NElem / 2 - 1, SL, IdxVT); + SDValue NewIdx = DAG.getNode(ISD::AND, SL, IdxVT, Idx, IdxMask); + SDValue Half = DAG.getSelectCC(SL, Idx, IdxMask, Hi, Lo, ISD::SETUGT); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Half, NewIdx); + } + + assert(VecSize <= 64); + unsigned EltSize = EltVT.getSizeInBits(); assert(isPowerOf2_32(EltSize)); @@ -5769,20 +5881,27 @@ SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op, SDLoc SL(Op); EVT VT = Op.getValueType(); - if (VT == MVT::v4i16 || VT == MVT::v4f16) { - EVT HalfVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), 2); + if (VT == MVT::v4i16 || VT == MVT::v4f16 || + VT == MVT::v8i16 || VT == MVT::v8f16) { + EVT HalfVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), + VT.getVectorNumElements() / 2); + MVT HalfIntVT = MVT::getIntegerVT(HalfVT.getSizeInBits()); // Turn into pair of packed build_vectors. // TODO: Special case for constants that can be materialized with s_mov_b64. 
- SDValue Lo = DAG.getBuildVector(HalfVT, SL, - { Op.getOperand(0), Op.getOperand(1) }); - SDValue Hi = DAG.getBuildVector(HalfVT, SL, - { Op.getOperand(2), Op.getOperand(3) }); + SmallVector<SDValue, 4> LoOps, HiOps; + for (unsigned I = 0, E = VT.getVectorNumElements() / 2; I != E; ++I) { + LoOps.push_back(Op.getOperand(I)); + HiOps.push_back(Op.getOperand(I + E)); + } + SDValue Lo = DAG.getBuildVector(HalfVT, SL, LoOps); + SDValue Hi = DAG.getBuildVector(HalfVT, SL, HiOps); - SDValue CastLo = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Lo); - SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Hi); + SDValue CastLo = DAG.getNode(ISD::BITCAST, SL, HalfIntVT, Lo); + SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, HalfIntVT, Hi); - SDValue Blend = DAG.getBuildVector(MVT::v2i32, SL, { CastLo, CastHi }); + SDValue Blend = DAG.getBuildVector(MVT::getVectorVT(HalfIntVT, 2), SL, + { CastLo, CastHi }); return DAG.getNode(ISD::BITCAST, SL, VT, Blend); } @@ -6155,10 +6274,6 @@ SDValue SITargetLowering::lowerImage(SDValue Op, const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim); - const AMDGPU::MIMGLZMappingInfo *LZMappingInfo = - AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode); - const AMDGPU::MIMGMIPMappingInfo *MIPMappingInfo = - AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode); unsigned IntrOpcode = Intr->BaseOpcode; bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget); @@ -6246,28 +6361,6 @@ SDValue SITargetLowering::lowerImage(SDValue Op, unsigned VAddrEnd = ArgOffset + Intr->VAddrEnd; SmallVector<SDValue, 4> VAddrs; - // Optimize _L to _LZ when _L is zero - if (LZMappingInfo) { - if (auto *ConstantLod = dyn_cast<ConstantFPSDNode>( - Op.getOperand(ArgOffset + Intr->LodIndex))) { - if (ConstantLod->isZero() || ConstantLod->isNegative()) { - IntrOpcode = LZMappingInfo->LZ; // set new opcode to _lz variant of _l - VAddrEnd--; // remove 'lod' - } - } - } - - // Optimize _mip away, when 'lod' is zero - if (MIPMappingInfo) { - if (auto *ConstantLod = dyn_cast<ConstantSDNode>( - Op.getOperand(ArgOffset + Intr->MipIndex))) { - if (ConstantLod->isZero()) { - IntrOpcode = MIPMappingInfo->NONMIP; // set new opcode to variant without _mip - VAddrEnd--; // remove 'mip' - } - } - } - // Check for 16 bit addresses or derivatives and pack if true. MVT VAddrVT = Op.getOperand(ArgOffset + Intr->GradientStart).getSimpleValueType(); @@ -6283,12 +6376,18 @@ SDValue SITargetLowering::lowerImage(SDValue Op, // Push back extra arguments. for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) { if (IsA16 && (Op.getOperand(ArgOffset + I).getValueType() == MVT::f16)) { + assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument"); // Special handling of bias when A16 is on. Bias is of type half but // occupies full 32-bit. 
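To make the packed-halves strategy in lowerBUILD_VECTOR above concrete, here is a standalone round-trip sketch (plain C++, little-endian host assumed, not LLVM code): an 8 x 16-bit vector is assembled as two 4-element halves, each half is viewed as one 64-bit integer, and the pair of integers is viewed as the whole 128-bit vector again.

#include <array>
#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  std::array<uint16_t, 8> Elts{1, 2, 3, 4, 5, 6, 7, 8};

  uint64_t Lo, Hi;                      // the two "packed" halves
  std::memcpy(&Lo, Elts.data(), 8);     // elements 0..3
  std::memcpy(&Hi, Elts.data() + 4, 8); // elements 4..7

  std::array<uint16_t, 8> Rebuilt;
  std::memcpy(Rebuilt.data(), &Lo, 8);
  std::memcpy(Rebuilt.data() + 4, &Hi, 8);
  assert(Rebuilt == Elts); // the split/concat round-trips bit-for-bit
  return 0;
}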
- SDValue bias = DAG.getBuildVector( MVT::v2f16, DL, {Op.getOperand(ArgOffset + I), DAG.getUNDEF(MVT::f16)}); - VAddrs.push_back(bias); - } else + SDValue Bias = DAG.getBuildVector( + MVT::v2f16, DL, + {Op.getOperand(ArgOffset + I), DAG.getUNDEF(MVT::f16)}); + VAddrs.push_back(Bias); + } else { + assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) && + "Bias needs to be converted to 16 bit in A16 mode"); VAddrs.push_back(Op.getOperand(ArgOffset + I)); + } } if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) { @@ -6731,14 +6830,23 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); case Intrinsic::amdgcn_workitem_id_x: + if (Subtarget->getMaxWorkitemID(MF.getFunction(), 0) == 0) + return DAG.getConstant(0, DL, MVT::i32); + return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32, SDLoc(DAG.getEntryNode()), MFI->getArgInfo().WorkItemIDX); case Intrinsic::amdgcn_workitem_id_y: + if (Subtarget->getMaxWorkitemID(MF.getFunction(), 1) == 0) + return DAG.getConstant(0, DL, MVT::i32); + return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32, SDLoc(DAG.getEntryNode()), MFI->getArgInfo().WorkItemIDY); case Intrinsic::amdgcn_workitem_id_z: + if (Subtarget->getMaxWorkitemID(MF.getFunction(), 2) == 0) + return DAG.getConstant(0, DL, MVT::i32); + return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32, SDLoc(DAG.getEntryNode()), MFI->getArgInfo().WorkItemIDZ); @@ -6899,9 +7007,6 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, DAG.getConstant(1, SL, MVT::i32)); return DAG.getSetCC(SL, MVT::i1, SrcHi, Aperture, ISD::SETEQ); } - case Intrinsic::amdgcn_alignbit: - return DAG.getNode(ISD::FSHR, DL, VT, - Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); case Intrinsic::amdgcn_perm: return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); @@ -8408,21 +8513,14 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); + if (VT.getSizeInBits() == 128) + return splitTernaryVectorOp(Op, DAG); + assert(VT.getSizeInBits() == 64); SDLoc DL(Op); SDValue Cond = Op.getOperand(0); - if (Subtarget->hasScalarCompareEq64() && Op->getOperand(0)->hasOneUse() && - !Op->isDivergent()) { - if (VT == MVT::i64) - return Op; - SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::i64, Op.getOperand(1)); - SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::i64, Op.getOperand(2)); - return DAG.getNode(ISD::BITCAST, DL, VT, - DAG.getSelect(DL, MVT::i64, Cond, LHS, RHS)); - } - SDValue Zero = DAG.getConstant(0, DL, MVT::i32); SDValue One = DAG.getConstant(1, DL, MVT::i32); @@ -9550,6 +9648,9 @@ SDValue SITargetLowering::performOrCombine(SDNode *N, SDValue SITargetLowering::performXorCombine(SDNode *N, DAGCombinerInfo &DCI) const { + if (SDValue RV = reassociateScalarOps(N, DCI.DAG)) + return RV; + EVT VT = N->getValueType(0); if (VT != MVT::i64) return SDValue(); @@ -10462,6 +10563,9 @@ SDValue SITargetLowering::reassociateScalarOps(SDNode *N, if (VT != MVT::i32 && VT != MVT::i64) return SDValue(); + if (DAG.isBaseWithConstantOffset(SDValue(N, 0))) + return SDValue(); + unsigned Opc = N->getOpcode(); SDValue Op0 = N->getOperand(0); SDValue Op1 = N->getOperand(1); @@ -10483,12 +10587,6 @@ SDValue SITargetLowering::reassociateScalarOps(SDNode *N, if (Op1->isDivergent()) std::swap(Op1, Op2); - // If either operand is 
constant this will conflict with - // DAGCombiner::ReassociateOps(). - if (DAG.isConstantIntBuildVectorOrConstantInt(Op0) || - DAG.isConstantIntBuildVectorOrConstantInt(Op1)) - return SDValue(); - SDLoc SL(N); SDValue Add1 = DAG.getNode(Opc, SL, VT, Op0, Op1); return DAG.getNode(Opc, SL, VT, Add1, Op2); @@ -11130,7 +11228,9 @@ SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node, unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1; unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1; bool UsesTFC = ((int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) || - Node->getConstantOperandVal(LWEIdx)) ? 1 : 0; + Node->getConstantOperandVal(LWEIdx)) + ? true + : false; unsigned TFCLane = 0; bool HasChain = Node->getNumValues() > 1; @@ -11719,25 +11819,51 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_, return std::make_pair(0U, RC); } - if (Constraint.size() > 1) { - if (Constraint[1] == 'v') { + if (Constraint.startswith("{") && Constraint.endswith("}")) { + StringRef RegName(Constraint.data() + 1, Constraint.size() - 2); + if (RegName.consume_front("v")) { RC = &AMDGPU::VGPR_32RegClass; - } else if (Constraint[1] == 's') { + } else if (RegName.consume_front("s")) { RC = &AMDGPU::SGPR_32RegClass; - } else if (Constraint[1] == 'a') { + } else if (RegName.consume_front("a")) { RC = &AMDGPU::AGPR_32RegClass; } if (RC) { uint32_t Idx; - bool Failed = Constraint.substr(2).getAsInteger(10, Idx); - if (!Failed && Idx < RC->getNumRegs()) - return std::make_pair(RC->getRegister(Idx), RC); + if (RegName.consume_front("[")) { + uint32_t End; + bool Failed = RegName.consumeInteger(10, Idx); + Failed |= !RegName.consume_front(":"); + Failed |= RegName.consumeInteger(10, End); + Failed |= !RegName.consume_back("]"); + if (!Failed) { + uint32_t Width = (End - Idx + 1) * 32; + MCRegister Reg = RC->getRegister(Idx); + if (SIRegisterInfo::isVGPRClass(RC)) + RC = TRI->getVGPRClassForBitWidth(Width); + else if (SIRegisterInfo::isSGPRClass(RC)) + RC = TRI->getSGPRClassForBitWidth(Width); + else if (SIRegisterInfo::isAGPRClass(RC)) + RC = TRI->getAGPRClassForBitWidth(Width); + if (RC) { + Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC); + return std::make_pair(Reg, RC); + } + } + } else { + bool Failed = RegName.getAsInteger(10, Idx); + if (!Failed && Idx < RC->getNumRegs()) + return std::make_pair(RC->getRegister(Idx), RC); + } } } - // FIXME: Returns VS_32 for physical SGPR constraints - return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); + auto Ret = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); + if (Ret.first) + Ret.second = TRI->getPhysRegClass(Ret.first); + + return Ret; } static bool isImmConstraint(StringRef Constraint) { @@ -11975,13 +12101,6 @@ void SITargetLowering::finalizeLowering(MachineFunction &MF) const { } TargetLoweringBase::finalizeLowering(MF); - - // Allocate a VGPR for future SGPR Spill if - // "amdgpu-reserve-vgpr-for-sgpr-spill" option is used - // FIXME: We won't need this hack if we split SGPR allocation from VGPR - if (VGPRReserveforSGPRSpill && TRI->spillSGPRToVGPR() && - !Info->VGPRReservedForSGPRSpill && !Info->isEntryFunction()) - Info->reserveVGPRforSGPRSpills(MF); } void SITargetLowering::computeKnownBitsForFrameIndex( @@ -12441,17 +12560,10 @@ bool SITargetLowering::requiresUniformRegister(MachineFunction &MF, for (auto &TC : TargetConstraints) { if (TC.Type == InlineAsm::isOutput) { ComputeConstraintToUse(TC, SDValue()); - unsigned 
AssignedReg; - const TargetRegisterClass *RC; - std::tie(AssignedReg, RC) = getRegForInlineAsmConstraint( - SIRI, TC.ConstraintCode, TC.ConstraintVT); - if (RC) { - MachineRegisterInfo &MRI = MF.getRegInfo(); - if (AssignedReg != 0 && SIRI->isSGPRReg(MRI, AssignedReg)) - return true; - else if (SIRI->isSGPRClass(RC)) - return true; - } + const TargetRegisterClass *RC = getRegForInlineAsmConstraint( + SIRI, TC.ConstraintCode, TC.ConstraintVT).second; + if (RC && SIRI->isSGPRClass(RC)) + return true; } } } @@ -12475,3 +12587,27 @@ SITargetLowering::getTypeLegalizationCost(const DataLayout &DL, Cost.first += (Size + 255) / 256; return Cost; } + +bool SITargetLowering::hasMemSDNodeUser(SDNode *N) const { + SDNode::use_iterator I = N->use_begin(), E = N->use_end(); + for (; I != E; ++I) { + if (MemSDNode *M = dyn_cast<MemSDNode>(*I)) { + if (getBasePtrIndex(M) == I.getOperandNo()) + return true; + } + } + return false; +} + +bool SITargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0, + SDValue N1) const { + if (!N0.hasOneUse()) + return false; + // Take care of the oportunity to keep N0 uniform + if (N0->isDivergent() || !N1->isDivergent()) + return true; + // Check if we have a good chance to form the memory access pattern with the + // base and offset + return (DAG.isBaseWithConstantOffset(N0) && + hasMemSDNodeUser(*N0->use_begin())); +} diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h index 1315cc15dd02..bf81e082b478 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -449,6 +449,11 @@ public: bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, LegacyDivergenceAnalysis *DA) const override; + bool hasMemSDNodeUser(SDNode *N) const; + + bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, + SDValue N1) const override; + bool isCanonicalized(SelectionDAG &DAG, SDValue Op, unsigned MaxDepth = 5) const; bool isCanonicalized(Register Reg, MachineFunction &MF, diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 6fbe5d45ce0a..f8a10bc8ef6f 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -863,7 +863,7 @@ bool SIInsertWaitcnts::applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets, Wait.ExpCnt = ~0u; LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n" - << "Old Instr: " << MI << "New Instr: " << *WaitcntInstr + << "Old Instr: " << *MI << "New Instr: " << *WaitcntInstr << '\n'); } else { WaitcntInstr->eraseFromParent(); @@ -886,7 +886,7 @@ bool SIInsertWaitcnts::applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets, Wait.VsCnt = ~0u; LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n" - << "Old Instr: " << MI + << "Old Instr: " << *MI << "New Instr: " << *WaitcntVsCntInstr << '\n'); } else { WaitcntVsCntInstr->eraseFromParent(); @@ -1382,7 +1382,6 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) { for (auto T : inst_counter_types()) { // Merge event flags for this counter - const bool OldOutOfOrder = counterOutOfOrder(T); const unsigned OldEvents = PendingEvents & WaitEventMaskForInst[T]; const unsigned OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T]; if (OtherEvents & ~OldEvents) @@ -1425,7 +1424,7 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) { } } - if (RegStrictDom && !OldOutOfOrder) + if (RegStrictDom) StrictDom = true; } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp 
b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 1755b93538ce..0a2f9381e71f 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -130,10 +130,24 @@ bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI, return false; } +static bool readsExecAsData(const MachineInstr &MI) { + if (MI.isCompare()) + return true; + + switch (MI.getOpcode()) { + default: + break; + case AMDGPU::V_READFIRSTLANE_B32: + return true; + } + + return false; +} + bool SIInstrInfo::isIgnorableUse(const MachineOperand &MO) const { // Any implicit use of exec by VALU is not a real register read. return MO.getReg() == AMDGPU::EXEC && MO.isImplicit() && - isVALU(*MO.getParent()); + isVALU(*MO.getParent()) && !readsExecAsData(*MO.getParent()); } bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, @@ -3184,10 +3198,14 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 || Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64; bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64; + int NewMFMAOpc = -1; switch (Opc) { default: - return nullptr; + NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(Opc); + if (NewMFMAOpc == -1) + return nullptr; + break; case AMDGPU::V_MAC_F16_e64: case AMDGPU::V_FMAC_F16_e64: IsF16 = true; @@ -3216,6 +3234,19 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, } } + MachineInstrBuilder MIB; + MachineBasicBlock &MBB = *MI.getParent(); + + if (NewMFMAOpc != -1) { + MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewMFMAOpc)); + for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) + MIB.add(MI.getOperand(I)); + updateLiveVariables(LV, MI, *MIB); + if (LIS) + LIS->ReplaceMachineInstrInMaps(MI, *MIB); + return MIB; + } + const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst); const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0); const MachineOperand *Src0Mods = @@ -3226,8 +3257,6 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2); const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp); const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod); - MachineInstrBuilder MIB; - MachineBasicBlock &MBB = *MI.getParent(); if (!Src0Mods && !Src1Mods && !Clamp && !Omod && !IsF64 && // If we have an SGPR input, we will violate the constant bus restriction. 
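The convertToThreeAddress and pseudoToMCOpcode changes above both lean on a generated opcode-to-opcode lookup, getMFMAEarlyClobberOp, which maps a MAC-form MFMA to its earlyclobber twin. A standalone sketch of that kind of table lookup, with made-up opcode names standing in for the real TableGen-generated mapping:

#include <cstdint>
#include <cstdio>
#include <unordered_map>

// Hypothetical opcode numbers; the real values come from the generated tables.
enum Opcode : uint16_t {
  V_MFMA_F32_MAC = 100,          // MAC form (accumulator tied to dst)
  V_MFMA_F32_EARLYCLOBBER = 101, // three-address form with earlyclobber dst
  V_ADD_U32 = 200                // unrelated opcode, has no mapping
};

// Returns the earlyclobber variant of a MAC-form MFMA, or -1 if there is none.
static int getMFMAEarlyClobberOp(uint16_t Opc) {
  static const std::unordered_map<uint16_t, uint16_t> Map = {
      {V_MFMA_F32_MAC, V_MFMA_F32_EARLYCLOBBER}};
  auto It = Map.find(Opc);
  return It == Map.end() ? -1 : It->second;
}

int main() {
  std::printf("%d\n", getMFMAEarlyClobberOp(V_MFMA_F32_MAC)); // 101
  std::printf("%d\n", getMFMAEarlyClobberOp(V_ADD_U32));      // -1
}

The real mapping is declared a little further down in SIInstrInfo.h and produced by the getMFMAEarlyClobberOp InstrMapping added to SIInstrInfo.td.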
@@ -4520,6 +4549,14 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, } } + if (Desc.getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS) { + const MachineOperand &SrcOp = MI.getOperand(1); + if (!SrcOp.isReg() || SrcOp.getReg().isVirtual()) { + ErrInfo = "pseudo expects only physical SGPRs"; + return false; + } + } + return true; } @@ -6122,11 +6159,8 @@ MachineBasicBlock *SIInstrInfo::moveToVALU(MachineInstr &TopInst, continue; case AMDGPU::S_CSELECT_B32: - lowerSelect32(Worklist, Inst, MDT); - Inst.eraseFromParent(); - continue; case AMDGPU::S_CSELECT_B64: - splitSelect64(Worklist, Inst, MDT); + lowerSelect(Worklist, Inst, MDT); Inst.eraseFromParent(); continue; case AMDGPU::S_CMP_EQ_I32: @@ -6304,8 +6338,8 @@ SIInstrInfo::moveScalarAddSub(SetVectorType &Worklist, MachineInstr &Inst, return std::make_pair(false, nullptr); } -void SIInstrInfo::lowerSelect32(SetVectorType &Worklist, MachineInstr &Inst, - MachineDominatorTree *MDT) const { +void SIInstrInfo::lowerSelect(SetVectorType &Worklist, MachineInstr &Inst, + MachineDominatorTree *MDT) const { MachineBasicBlock &MBB = *Inst.getParent(); MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); @@ -6380,95 +6414,6 @@ void SIInstrInfo::lowerSelect32(SetVectorType &Worklist, MachineInstr &Inst, addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); } -void SIInstrInfo::splitSelect64(SetVectorType &Worklist, MachineInstr &Inst, - MachineDominatorTree *MDT) const { - // Split S_CSELECT_B64 into a pair of S_CSELECT_B32 and lower them - // further. - const DebugLoc &DL = Inst.getDebugLoc(); - MachineBasicBlock::iterator MII = Inst; - MachineBasicBlock &MBB = *Inst.getParent(); - MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); - - // Get the original operands. - MachineOperand &Dest = Inst.getOperand(0); - MachineOperand &Src0 = Inst.getOperand(1); - MachineOperand &Src1 = Inst.getOperand(2); - MachineOperand &Cond = Inst.getOperand(3); - - Register SCCSource = Cond.getReg(); - bool IsSCC = (SCCSource == AMDGPU::SCC); - - // If this is a trivial select where the condition is effectively not SCC - // (SCCSource is a source of copy to SCC), then the select is semantically - // equivalent to copying SCCSource. Hence, there is no need to create - // V_CNDMASK, we can just use that and bail out. - if (!IsSCC && (Src0.isImm() && Src0.getImm() == -1) && - (Src1.isImm() && Src1.getImm() == 0)) { - MRI.replaceRegWith(Dest.getReg(), SCCSource); - return; - } - - // Prepare the split destination. - Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); - Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - - // Split the source operands. - const TargetRegisterClass *Src0RC = nullptr; - const TargetRegisterClass *Src0SubRC = nullptr; - if (Src0.isReg()) { - Src0RC = MRI.getRegClass(Src0.getReg()); - Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0); - } - const TargetRegisterClass *Src1RC = nullptr; - const TargetRegisterClass *Src1SubRC = nullptr; - if (Src1.isReg()) { - Src1RC = MRI.getRegClass(Src1.getReg()); - Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0); - } - // Split lo. - MachineOperand SrcReg0Sub0 = - buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC); - MachineOperand SrcReg1Sub0 = - buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC); - // Split hi. 
- MachineOperand SrcReg0Sub1 = - buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC); - MachineOperand SrcReg1Sub1 = - buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC); - // Select the lo part. - MachineInstr *LoHalf = - BuildMI(MBB, MII, DL, get(AMDGPU::S_CSELECT_B32), DestSub0) - .add(SrcReg0Sub0) - .add(SrcReg1Sub0); - // Replace the condition operand with the original one. - LoHalf->getOperand(3).setReg(SCCSource); - Worklist.insert(LoHalf); - // Select the hi part. - MachineInstr *HiHalf = - BuildMI(MBB, MII, DL, get(AMDGPU::S_CSELECT_B32), DestSub1) - .add(SrcReg0Sub1) - .add(SrcReg1Sub1); - // Replace the condition operand with the original one. - HiHalf->getOperand(3).setReg(SCCSource); - Worklist.insert(HiHalf); - // Merge them back to the original 64-bit one. - BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) - .addReg(DestSub0) - .addImm(AMDGPU::sub0) - .addReg(DestSub1) - .addImm(AMDGPU::sub1); - MRI.replaceRegWith(Dest.getReg(), FullDestReg); - - // Try to legalize the operands in case we need to swap the order to keep - // it valid. - legalizeOperands(*LoHalf, MDT); - legalizeOperands(*HiHalf, MDT); - - // Move all users of this moved value. - addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist); -} - void SIInstrInfo::lowerScalarAbs(SetVectorType &Worklist, MachineInstr &Inst) const { MachineBasicBlock &MBB = *Inst.getParent(); @@ -7820,6 +7765,12 @@ int SIInstrInfo::pseudoToMCOpcode(int Opcode) const { } } + if (isMAI(Opcode)) { + int MFMAOp = AMDGPU::getMFMAEarlyClobberOp(Opcode); + if (MFMAOp != -1) + Opcode = MFMAOp; + } + int MCOp = AMDGPU::getMCOpcode(Opcode, Gen); // -1 means that Opcode is already a native instruction. diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index dd9ea2b53ca2..e551d6c7223f 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -78,11 +78,8 @@ private: moveScalarAddSub(SetVectorType &Worklist, MachineInstr &Inst, MachineDominatorTree *MDT = nullptr) const; - void lowerSelect32(SetVectorType &Worklist, MachineInstr &Inst, - MachineDominatorTree *MDT = nullptr) const; - - void splitSelect64(SetVectorType &Worklist, MachineInstr &Inst, - MachineDominatorTree *MDT = nullptr) const; + void lowerSelect(SetVectorType &Worklist, MachineInstr &Inst, + MachineDominatorTree *MDT = nullptr) const; void lowerScalarAbs(SetVectorType &Worklist, MachineInstr &Inst) const; @@ -1249,6 +1246,10 @@ namespace AMDGPU { LLVM_READONLY int getFlatScratchInstSVfromSS(uint16_t Opcode); + /// \returns earlyclobber version of a MAC MFMA if it exists.
+ LLVM_READONLY + int getMFMAEarlyClobberOp(uint16_t Opcode); + const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL; const uint64_t RSRC_ELEMENT_SIZE_SHIFT = (32 + 19); const uint64_t RSRC_INDEX_STRIDE_SHIFT = (32 + 21); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index dda92d3d25ff..713a08907e99 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -2588,6 +2588,14 @@ def getFlatScratchInstSVfromSS : InstrMapping { let ValueCols = [["SV"]]; } +def getMFMAEarlyClobberOp : InstrMapping { + let FilterClass = "MFMATable"; + let RowFields = ["FMAOp"]; + let ColFields = ["IsMac"]; + let KeyCol = ["1"]; + let ValueCols = [["0"]]; +} + include "SIInstructions.td" include "DSInstructions.td" diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 636337ede000..7be63ae6964b 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -1011,7 +1011,7 @@ def : GCNPat < } def : GCNPat < - (i32 (ctpop i32:$popcnt)), + (i32 (DivergentUnaryFrag<ctpop> i32:$popcnt)), (V_BCNT_U32_B32_e64 VSrc_b32:$popcnt, (i32 0)) >; @@ -1020,6 +1020,14 @@ def : GCNPat < (V_BCNT_U32_B32_e64 $popcnt, $val) >; +def : GCNPat < + (i64 (DivergentUnaryFrag<ctpop> i64:$src)), + (REG_SEQUENCE VReg_64, + (V_BCNT_U32_B32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub1)), + (i32 (V_BCNT_U32_B32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub0)), (i32 0)))), sub0, + (i32 (V_MOV_B32_e32 (i32 0))), sub1) +>; + /********** ============================================ **********/ /********** Extraction, Insertion, Building and Casting **********/ /********** ============================================ **********/ @@ -1184,6 +1192,26 @@ def : Pat < (v2f16 (EXTRACT_SUBREG v4f16:$vec, sub1)) >; +def : Pat < + (extract_subvector v8i16:$vec, (i32 0)), + (v4i16 (EXTRACT_SUBREG v8i16:$vec, sub0_sub1)) +>; + +def : Pat < + (extract_subvector v8i16:$vec, (i32 4)), + (v4i16 (EXTRACT_SUBREG v8i16:$vec, sub2_sub3)) +>; + +def : Pat < + (extract_subvector v8f16:$vec, (i32 0)), + (v4f16 (EXTRACT_SUBREG v8f16:$vec, sub0_sub1)) +>; + +def : Pat < + (extract_subvector v8f16:$vec, (i32 4)), + (v4f16 (EXTRACT_SUBREG v8f16:$vec, sub2_sub3)) +>; + foreach Index = 0-31 in { def Extract_Element_v32i32_#Index : Extract_Element < i32, v32i32, Index, !cast<SubRegIndex>(sub#Index) @@ -1279,6 +1307,26 @@ def : BitConvert <v2i64, v2f64, VReg_128>; def : BitConvert <v2f64, v2i64, VReg_128>; def : BitConvert <v4f32, v2i64, VReg_128>; def : BitConvert <v2i64, v4f32, VReg_128>; +def : BitConvert <v8i16, v4i32, SReg_128>; +def : BitConvert <v4i32, v8i16, SReg_128>; +def : BitConvert <v8f16, v4f32, VReg_128>; +def : BitConvert <v8f16, v4i32, VReg_128>; +def : BitConvert <v4f32, v8f16, VReg_128>; +def : BitConvert <v4i32, v8f16, VReg_128>; +def : BitConvert <v8i16, v8f16, VReg_128>; +def : BitConvert <v8f16, v8i16, VReg_128>; +def : BitConvert <v4f32, v8i16, VReg_128>; +def : BitConvert <v8i16, v4f32, VReg_128>; +def : BitConvert <v8i16, v8f16, SReg_128>; +def : BitConvert <v8i16, v2i64, SReg_128>; +def : BitConvert <v8i16, v2f64, SReg_128>; +def : BitConvert <v8f16, v2i64, SReg_128>; +def : BitConvert <v8f16, v2f64, SReg_128>; +def : BitConvert <v8f16, v8i16, SReg_128>; +def : BitConvert <v2i64, v8i16, SReg_128>; +def : BitConvert <v2f64, v8i16, SReg_128>; +def : BitConvert <v2i64, v8f16, SReg_128>; +def : BitConvert <v2f64, v8f16, SReg_128>; // 160-bit bitcast def : BitConvert <v5i32, v5f32, SReg_160>; @@ 
-1762,44 +1810,44 @@ def BFIImm32 : PatFrag< // (y & x) | (z & ~x) def : AMDGPUPat < (DivergentBinFrag<or> (and i32:$y, i32:$x), (and i32:$z, (not i32:$x))), - (V_BFI_B32_e64 $x, $y, $z) + (V_BFI_B32_e64 VSrc_b32:$x, VSrc_b32:$y, VSrc_b32:$z) >; // (y & C) | (z & ~C) def : AMDGPUPat < (BFIImm32 i32:$x, i32:$y, i32:$z), - (V_BFI_B32_e64 $x, $y, $z) + (V_BFI_B32_e64 VSrc_b32:$x, VSrc_b32:$y, VSrc_b32:$z) >; // 64-bit version def : AMDGPUPat < (DivergentBinFrag<or> (and i64:$y, i64:$x), (and i64:$z, (not i64:$x))), - (REG_SEQUENCE SReg_64, - (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG SReg_64:$x, sub0)), - (i32 (EXTRACT_SUBREG SReg_64:$y, sub0)), - (i32 (EXTRACT_SUBREG SReg_64:$z, sub0))), sub0, - (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG SReg_64:$x, sub1)), - (i32 (EXTRACT_SUBREG SReg_64:$y, sub1)), - (i32 (EXTRACT_SUBREG SReg_64:$z, sub1))), sub1) + (REG_SEQUENCE VReg_64, + (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub0)), + (i32 (EXTRACT_SUBREG VReg_64:$y, sub0)), + (i32 (EXTRACT_SUBREG VReg_64:$z, sub0))), sub0, + (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub1)), + (i32 (EXTRACT_SUBREG VReg_64:$y, sub1)), + (i32 (EXTRACT_SUBREG VReg_64:$z, sub1))), sub1) >; // SHA-256 Ch function // z ^ (x & (y ^ z)) def : AMDGPUPat < (DivergentBinFrag<xor> i32:$z, (and i32:$x, (xor i32:$y, i32:$z))), - (V_BFI_B32_e64 $x, $y, $z) + (V_BFI_B32_e64 VSrc_b32:$x, VSrc_b32:$y, VSrc_b32:$z) >; // 64-bit version def : AMDGPUPat < (DivergentBinFrag<xor> i64:$z, (and i64:$x, (xor i64:$y, i64:$z))), - (REG_SEQUENCE SReg_64, - (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG SReg_64:$x, sub0)), - (i32 (EXTRACT_SUBREG SReg_64:$y, sub0)), - (i32 (EXTRACT_SUBREG SReg_64:$z, sub0))), sub0, - (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG SReg_64:$x, sub1)), - (i32 (EXTRACT_SUBREG SReg_64:$y, sub1)), - (i32 (EXTRACT_SUBREG SReg_64:$z, sub1))), sub1) + (REG_SEQUENCE VReg_64, + (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub0)), + (i32 (EXTRACT_SUBREG VReg_64:$y, sub0)), + (i32 (EXTRACT_SUBREG VReg_64:$z, sub0))), sub0, + (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub1)), + (i32 (EXTRACT_SUBREG VReg_64:$y, sub1)), + (i32 (EXTRACT_SUBREG VReg_64:$z, sub1))), sub1) >; def : AMDGPUPat < @@ -2725,21 +2773,21 @@ def : AMDGPUPat < def : AMDGPUPat < (DivergentBinFrag<or> (and i32:$x, i32:$z), (and i32:$y, (or i32:$x, i32:$z))), - (V_BFI_B32_e64 (V_XOR_B32_e64 i32:$x, i32:$y), i32:$z, i32:$y) + (V_BFI_B32_e64 (V_XOR_B32_e64 VSrc_b32:$x, VSrc_b32:$y), VSrc_b32:$z, VSrc_b32:$y) >; def : AMDGPUPat < (DivergentBinFrag<or> (and i64:$x, i64:$z), (and i64:$y, (or i64:$x, i64:$z))), - (REG_SEQUENCE SReg_64, - (V_BFI_B32_e64 (V_XOR_B32_e64 (i32 (EXTRACT_SUBREG SReg_64:$x, sub0)), - (i32 (EXTRACT_SUBREG SReg_64:$y, sub0))), - (i32 (EXTRACT_SUBREG SReg_64:$z, sub0)), - (i32 (EXTRACT_SUBREG SReg_64:$y, sub0))), sub0, - (V_BFI_B32_e64 (V_XOR_B32_e64 (i32 (EXTRACT_SUBREG SReg_64:$x, sub1)), - (i32 (EXTRACT_SUBREG SReg_64:$y, sub1))), - (i32 (EXTRACT_SUBREG SReg_64:$z, sub1)), - (i32 (EXTRACT_SUBREG SReg_64:$y, sub1))), sub1) + (REG_SEQUENCE VReg_64, + (V_BFI_B32_e64 (V_XOR_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub0)), + (i32 (EXTRACT_SUBREG VReg_64:$y, sub0))), + (i32 (EXTRACT_SUBREG VReg_64:$z, sub0)), + (i32 (EXTRACT_SUBREG VReg_64:$y, sub0))), sub0, + (V_BFI_B32_e64 (V_XOR_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub1)), + (i32 (EXTRACT_SUBREG VReg_64:$y, sub1))), + (i32 (EXTRACT_SUBREG VReg_64:$z, sub1)), + (i32 (EXTRACT_SUBREG VReg_64:$y, sub1))), sub1) >; multiclass IntMed3Pat<Instruction med3Inst, @@ -2825,6 +2873,15 @@ class AMDGPUGenericInstruction 
: GenericInstruction { let Namespace = "AMDGPU"; } +// Convert a wave address to a swizzled vector address (i.e. this is +// for copying the stack pointer to a vector address appropriate to +// use in the offset field of mubuf instructions). +def G_AMDGPU_WAVE_ADDRESS : AMDGPUGenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src); + let hasSideEffects = 0; +} + // Returns -1 if the input is zero. def G_AMDGPU_FFBH_U32 : AMDGPUGenericInstruction { let OutOperandList = (outs type0:$dst); @@ -3027,6 +3084,16 @@ def G_AMDGPU_INTRIN_IMAGE_LOAD : AMDGPUGenericInstruction { let mayStore = 1; } +def G_AMDGPU_INTRIN_IMAGE_LOAD_D16 : AMDGPUGenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins unknown:$intrin, variable_ops); + let hasSideEffects = 0; + let mayLoad = 1; + + // FIXME: Use separate opcode for atomics. + let mayStore = 1; +} + // This is equivalent to the G_INTRINSIC*, but the operands may have // been legalized depending on the subtarget requirements. def G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPUGenericInstruction { @@ -3036,6 +3103,13 @@ def G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPUGenericInstruction { let mayStore = 1; } +def G_AMDGPU_INTRIN_IMAGE_STORE_D16 : AMDGPUGenericInstruction { + let OutOperandList = (outs); + let InOperandList = (ins unknown:$intrin, variable_ops); + let hasSideEffects = 0; + let mayStore = 1; +} + def G_AMDGPU_INTRIN_BVH_INTERSECT_RAY : AMDGPUGenericInstruction { let OutOperandList = (outs type0:$dst); let InOperandList = (ins unknown:$intrin, variable_ops); diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp index f4d9002e930e..c18637bdbc43 100644 --- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -105,6 +105,7 @@ class SILoadStoreOptimizer : public MachineFunctionPass { unsigned DMask; InstClassEnum InstClass; unsigned CPol = 0; + bool IsAGPR; bool UseST64; int AddrIdx[MaxAddressRegs]; const MachineOperand *AddrReg[MaxAddressRegs]; @@ -158,8 +159,7 @@ class SILoadStoreOptimizer : public MachineFunctionPass { return true; } - void setMI(MachineBasicBlock::iterator MI, const SIInstrInfo &TII, - const GCNSubtarget &STM); + void setMI(MachineBasicBlock::iterator MI, const SILoadStoreOptimizer &LSO); }; struct BaseRegisters { @@ -484,15 +484,16 @@ static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) { } void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI, - const SIInstrInfo &TII, - const GCNSubtarget &STM) { + const SILoadStoreOptimizer &LSO) { I = MI; unsigned Opc = MI->getOpcode(); - InstClass = getInstClass(Opc, TII); + InstClass = getInstClass(Opc, *LSO.TII); if (InstClass == UNKNOWN) return; + IsAGPR = LSO.TRI->hasAGPRs(LSO.getDataRegClass(*MI)); + switch (InstClass) { case DS_READ: EltSize = @@ -505,7 +506,7 @@ void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI, : 4; break; case S_BUFFER_LOAD_IMM: - EltSize = AMDGPU::convertSMRDOffsetUnits(STM, 4); + EltSize = AMDGPU::convertSMRDOffsetUnits(*LSO.STM, 4); break; default: EltSize = 4; @@ -513,7 +514,7 @@ void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI, } if (InstClass == MIMG) { - DMask = TII.getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm(); + DMask = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm(); // Offset is not considered for MIMG instructions. 
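For the G_AMDGPU_WAVE_ADDRESS pseudo introduced above, a rough standalone sketch of the address relationship it exists to bridge, under the assumption (not stated in the patch itself) that the scalar stack pointer holds an unswizzled wave-level byte offset and that this equals the per-lane swizzled offset scaled by the wavefront size:

#include <cassert>
#include <cstdint>
#include <cstdio>

// Assumed relation: wave-level (unswizzled) bytes = per-lane (swizzled)
// bytes * wavefront size, so the conversion is a right shift by log2(wave size).
static uint32_t waveToSwizzled(uint32_t WaveByteOffset, unsigned WavefrontSizeLog2) {
  assert((WaveByteOffset & ((1u << WavefrontSizeLog2) - 1)) == 0 &&
         "wave offset assumed to be wave-size aligned");
  return WaveByteOffset >> WavefrontSizeLog2;
}

int main() {
  // With 64 lanes, 256 bytes of wave scratch correspond to 4 bytes per lane.
  std::printf("%u\n", waveToSwizzled(256, 6)); // prints 4
}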
Offset = 0; } else { @@ -522,17 +523,17 @@ void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI, } if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE) - Format = TII.getNamedOperand(*I, AMDGPU::OpName::format)->getImm(); + Format = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::format)->getImm(); - Width = getOpcodeWidth(*I, TII); + Width = getOpcodeWidth(*I, *LSO.TII); if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) { Offset &= 0xffff; } else if (InstClass != MIMG) { - CPol = TII.getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm(); + CPol = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm(); } - AddressRegs Regs = getRegs(Opc, TII); + AddressRegs Regs = getRegs(Opc, *LSO.TII); NumAddresses = 0; for (unsigned J = 0; J < Regs.NumVAddrs; J++) @@ -910,19 +911,10 @@ bool SILoadStoreOptimizer::checkAndPrepareMerge( } const unsigned InstSubclass = getInstSubclass(Opc, *TII); - // Do not merge VMEM buffer instructions with "swizzled" bit set. - int Swizzled = - AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::swz); - if (Swizzled != -1 && CI.I->getOperand(Swizzled).getImm()) - return false; - DenseSet<Register> RegDefsToMove; DenseSet<Register> PhysRegUsesToMove; addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove); - const TargetRegisterClass *DataRC = getDataRegClass(*CI.I); - bool IsAGPR = TRI->hasAGPRs(DataRC); - MachineBasicBlock::iterator E = std::next(Paired.I); MachineBasicBlock::iterator MBBI = std::next(CI.I); MachineBasicBlock::iterator MBBE = CI.I->getParent()->end(); @@ -971,15 +963,6 @@ bool SILoadStoreOptimizer::checkAndPrepareMerge( continue; } - // Don't merge volatiles. - if (MBBI->hasOrderedMemoryRef()) - return false; - - int Swizzled = - AMDGPU::getNamedOperandIdx(MBBI->getOpcode(), AMDGPU::OpName::swz); - if (Swizzled != -1 && MBBI->getOperand(Swizzled).getImm()) - return false; - // Handle a case like // DS_WRITE_B32 addr, v, idx0 // w = DS_READ_B32 addr, idx0 @@ -991,17 +974,6 @@ bool SILoadStoreOptimizer::checkAndPrepareMerge( continue; if (&*MBBI == &*Paired.I) { - if (TRI->hasAGPRs(getDataRegClass(*MBBI)) != IsAGPR) - return false; - // FIXME: nothing is illegal in a ds_write2 opcode with two AGPR data - // operands. However we are reporting that ds_write2 shall have - // only VGPR data so that machine copy propagation does not - // create an illegal instruction with a VGPR and AGPR sources. - // Consequenctially if we create such instruction the verifier - // will complain. - if (IsAGPR && CI.InstClass == DS_WRITE) - return false; - // We need to go through the list of instructions that we plan to // move and make sure they are all safe to move down past the merged // instruction. 
@@ -1542,49 +1514,36 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI, std::pair<unsigned, unsigned> SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI, const CombineInfo &Paired) { - - assert(CI.Width != 0 && Paired.Width != 0 && "Width cannot be zero"); - bool ReverseOrder; if (CI.InstClass == MIMG) { assert( (countPopulation(CI.DMask | Paired.DMask) == CI.Width + Paired.Width) && "No overlaps"); ReverseOrder = CI.DMask > Paired.DMask; - } else + } else { ReverseOrder = CI.Offset > Paired.Offset; + } unsigned Idx0; unsigned Idx1; - if (CI.Width + Paired.Width > 4) { - assert(CI.Width == 4 && Paired.Width == 4); + static const unsigned Idxs[5][4] = { + {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3}, + {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3_sub4}, + {AMDGPU::sub2, AMDGPU::sub2_sub3, AMDGPU::sub2_sub3_sub4, AMDGPU::sub2_sub3_sub4_sub5}, + {AMDGPU::sub3, AMDGPU::sub3_sub4, AMDGPU::sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6}, + {AMDGPU::sub4, AMDGPU::sub4_sub5, AMDGPU::sub4_sub5_sub6, AMDGPU::sub4_sub5_sub6_sub7}, + }; - if (ReverseOrder) { - Idx1 = AMDGPU::sub0_sub1_sub2_sub3; - Idx0 = AMDGPU::sub4_sub5_sub6_sub7; - } else { - Idx0 = AMDGPU::sub0_sub1_sub2_sub3; - Idx1 = AMDGPU::sub4_sub5_sub6_sub7; - } + assert(CI.Width >= 1 && CI.Width <= 4); + assert(Paired.Width >= 1 && Paired.Width <= 4); + + if (ReverseOrder) { + Idx1 = Idxs[0][Paired.Width - 1]; + Idx0 = Idxs[Paired.Width][CI.Width - 1]; } else { - static const unsigned Idxs[4][4] = { - {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3}, - {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, 0}, - {AMDGPU::sub2, AMDGPU::sub2_sub3, 0, 0}, - {AMDGPU::sub3, 0, 0, 0}, - }; - - assert(CI.Width >= 1 && CI.Width <= 3); - assert(Paired.Width >= 1 && Paired.Width <= 3); - - if (ReverseOrder) { - Idx1 = Idxs[0][Paired.Width - 1]; - Idx0 = Idxs[Paired.Width][CI.Width - 1]; - } else { - Idx0 = Idxs[0][CI.Width - 1]; - Idx1 = Idxs[CI.Width][Paired.Width - 1]; - } + Idx0 = Idxs[0][CI.Width - 1]; + Idx1 = Idxs[CI.Width][Paired.Width - 1]; } return std::make_pair(Idx0, Idx1); @@ -1847,7 +1806,8 @@ bool SILoadStoreOptimizer::promoteConstantOffsetToImm( if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0) return false; - if (MI.mayLoad() && TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != NULL) + if (MI.mayLoad() && + TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != nullptr) return false; if (AnchorList.count(&MI)) @@ -1988,6 +1948,7 @@ void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI, std::list<std::list<CombineInfo> > &MergeableInsts) const { for (std::list<CombineInfo> &AddrList : MergeableInsts) { if (AddrList.front().InstClass == CI.InstClass && + AddrList.front().IsAGPR == CI.IsAGPR && AddrList.front().hasSameBaseAddress(*CI.I)) { AddrList.emplace_back(CI); return; @@ -2030,13 +1991,29 @@ SILoadStoreOptimizer::collectMergeableInsts( if (InstClass == UNKNOWN) continue; + // Do not merge VMEM buffer instructions with "swizzled" bit set. + int Swizzled = + AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::swz); + if (Swizzled != -1 && MI.getOperand(Swizzled).getImm()) + continue; + CombineInfo CI; - CI.setMI(MI, *TII, *STM); + CI.setMI(MI, *this); CI.Order = Order++; if (!CI.hasMergeableAddress(*MRI)) continue; + if (CI.InstClass == DS_WRITE && CI.IsAGPR) { + // FIXME: nothing is illegal in a ds_write2 opcode with two AGPR data + // operands. 
However we are reporting that ds_write2 shall have + // only VGPR data so that machine copy propagation does not + // create an illegal instruction with a VGPR and AGPR sources. + // Consequenctially if we create such instruction the verifier + // will complain. + continue; + } + LLVM_DEBUG(dbgs() << "Mergeable: " << MI); addInstToMergeableList(CI, MergeableInsts); @@ -2144,54 +2121,54 @@ SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr( case DS_READ: { MachineBasicBlock::iterator NewMI = mergeRead2Pair(CI, Paired, InstsToMove); - CI.setMI(NewMI, *TII, *STM); + CI.setMI(NewMI, *this); break; } case DS_WRITE: { MachineBasicBlock::iterator NewMI = mergeWrite2Pair(CI, Paired, InstsToMove); - CI.setMI(NewMI, *TII, *STM); + CI.setMI(NewMI, *this); break; } case S_BUFFER_LOAD_IMM: { MachineBasicBlock::iterator NewMI = mergeSBufferLoadImmPair(CI, Paired, InstsToMove); - CI.setMI(NewMI, *TII, *STM); + CI.setMI(NewMI, *this); OptimizeListAgain |= (CI.Width + Paired.Width) < 8; break; } case BUFFER_LOAD: { MachineBasicBlock::iterator NewMI = mergeBufferLoadPair(CI, Paired, InstsToMove); - CI.setMI(NewMI, *TII, *STM); + CI.setMI(NewMI, *this); OptimizeListAgain |= (CI.Width + Paired.Width) < 4; break; } case BUFFER_STORE: { MachineBasicBlock::iterator NewMI = mergeBufferStorePair(CI, Paired, InstsToMove); - CI.setMI(NewMI, *TII, *STM); + CI.setMI(NewMI, *this); OptimizeListAgain |= (CI.Width + Paired.Width) < 4; break; } case MIMG: { MachineBasicBlock::iterator NewMI = mergeImagePair(CI, Paired, InstsToMove); - CI.setMI(NewMI, *TII, *STM); + CI.setMI(NewMI, *this); OptimizeListAgain |= (CI.Width + Paired.Width) < 4; break; } case TBUFFER_LOAD: { MachineBasicBlock::iterator NewMI = mergeTBufferLoadPair(CI, Paired, InstsToMove); - CI.setMI(NewMI, *TII, *STM); + CI.setMI(NewMI, *this); OptimizeListAgain |= (CI.Width + Paired.Width) < 4; break; } case TBUFFER_STORE: { MachineBasicBlock::iterator NewMI = mergeTBufferStorePair(CI, Paired, InstsToMove); - CI.setMI(NewMI, *TII, *STM); + CI.setMI(NewMI, *this); OptimizeListAgain |= (CI.Width + Paired.Width) < 4; break; } diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp index 3168bcd53eda..e1018bdfde46 100644 --- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp @@ -56,6 +56,7 @@ #include "llvm/CodeGen/LiveVariables.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/Target/TargetMachine.h" using namespace llvm; @@ -90,6 +91,8 @@ private: unsigned OrSaveExecOpc; unsigned Exec; + bool EnableOptimizeEndCf = false; + bool hasKill(const MachineBasicBlock *Begin, const MachineBasicBlock *End); void emitIf(MachineInstr &MI); @@ -579,10 +582,10 @@ void SILowerControlFlow::combineMasks(MachineInstr &MI) { void SILowerControlFlow::optimizeEndCf() { // If the only instruction immediately following this END_CF is an another // END_CF in the only successor we can avoid emitting exec mask restore here. 
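As a toy model of the OptimizeListAgain handling above (plain C++, not the pass itself): pairs keep getting merged across passes while the combined width is still below the widest available access, here capped at 4 dwords, so four 1-dword accesses become two 2-dword accesses and finally a single 4-dword access.

#include <cstddef>
#include <cstdio>
#include <vector>

int main() {
  std::vector<unsigned> Widths{1, 1, 1, 1}; // dwords per access
  bool Again = true;
  while (Again && Widths.size() > 1) {
    Again = false;
    std::vector<unsigned> Merged;
    for (std::size_t I = 0; I < Widths.size(); I += 2) {
      if (I + 1 < Widths.size() && Widths[I] + Widths[I + 1] <= 4) {
        Merged.push_back(Widths[I] + Widths[I + 1]);
        Again |= Merged.back() < 4; // still room to grow on the next pass
      } else {
        Merged.push_back(Widths[I]);
        if (I + 1 < Widths.size())
          Merged.push_back(Widths[I + 1]);
      }
    }
    Widths = Merged;
  }
  for (unsigned W : Widths)
    std::printf("%u-dword access\n", W); // prints: 4-dword access
}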
- if (!RemoveRedundantEndcf) + if (!EnableOptimizeEndCf) return; - for (MachineInstr *MI : LoweredEndCf) { + for (MachineInstr *MI : reverse(LoweredEndCf)) { MachineBasicBlock &MBB = *MI->getParent(); auto Next = skipIgnoreExecInstsTrivialSucc(MBB, std::next(MI->getIterator())); @@ -807,6 +810,8 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) { const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); TII = ST.getInstrInfo(); TRI = &TII->getRegisterInfo(); + EnableOptimizeEndCf = + RemoveRedundantEndcf && MF.getTarget().getOptLevel() > CodeGenOpt::None; // This doesn't actually need LiveIntervals, but we can preserve them. LIS = getAnalysisIfAvailable<LiveIntervals>(); diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp index 55196fe334e6..0fbdbef6fcce 100644 --- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp @@ -127,7 +127,7 @@ static void insertCSRRestores(MachineBasicBlock &RestoreBlock, // FIXME: Just emit the readlane/writelane directly if (!TFI->restoreCalleeSavedRegisters(RestoreBlock, I, CSI, TRI)) { for (const CalleeSavedInfo &CI : reverse(CSI)) { - unsigned Reg = CI.getReg(); + Register Reg = CI.getReg(); const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, MVT::i32); @@ -239,50 +239,6 @@ bool SILowerSGPRSpills::spillCalleeSavedRegs(MachineFunction &MF) { return false; } -// Find lowest available VGPR and use it as VGPR reserved for SGPR spills. -static bool lowerShiftReservedVGPR(MachineFunction &MF, - const GCNSubtarget &ST) { - SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); - const Register PreReservedVGPR = FuncInfo->VGPRReservedForSGPRSpill; - // Early out if pre-reservation of a VGPR for SGPR spilling is disabled. - if (!PreReservedVGPR) - return false; - - // If there are no free lower VGPRs available, default to using the - // pre-reserved register instead. - const SIRegisterInfo *TRI = ST.getRegisterInfo(); - Register LowestAvailableVGPR = - TRI->findUnusedRegister(MF.getRegInfo(), &AMDGPU::VGPR_32RegClass, MF); - if (!LowestAvailableVGPR) - LowestAvailableVGPR = PreReservedVGPR; - - MachineFrameInfo &FrameInfo = MF.getFrameInfo(); - // Create a stack object for a possible spill in the function prologue. - // Note Non-CSR VGPR also need this as we may overwrite inactive lanes. - Optional<int> FI = FrameInfo.CreateSpillStackObject(4, Align(4)); - - // Find saved info about the pre-reserved register. 
- const auto *ReservedVGPRInfoItr = - llvm::find_if(FuncInfo->getSGPRSpillVGPRs(), - [PreReservedVGPR](const auto &SpillRegInfo) { - return SpillRegInfo.VGPR == PreReservedVGPR; - }); - - assert(ReservedVGPRInfoItr != FuncInfo->getSGPRSpillVGPRs().end()); - auto Index = - std::distance(FuncInfo->getSGPRSpillVGPRs().begin(), ReservedVGPRInfoItr); - - FuncInfo->setSGPRSpillVGPRs(LowestAvailableVGPR, FI, Index); - - for (MachineBasicBlock &MBB : MF) { - assert(LowestAvailableVGPR.isValid() && "Did not find an available VGPR"); - MBB.addLiveIn(LowestAvailableVGPR); - MBB.sortUniqueLiveIns(); - } - - return true; -} - bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) { const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); TII = ST.getInstrInfo(); @@ -304,11 +260,6 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) { if (!MFI.hasStackObjects() && !HasCSRs) { SaveBlocks.clear(); RestoreBlocks.clear(); - if (FuncInfo->VGPRReservedForSGPRSpill) { - // Free the reserved VGPR for later possible use by frame lowering. - FuncInfo->removeVGPRForSGPRSpill(FuncInfo->VGPRReservedForSGPRSpill, MF); - MRI.freezeReservedRegs(MF); - } return false; } @@ -326,8 +277,6 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) { // This operates under the assumption that only other SGPR spills are users // of the frame index. - lowerShiftReservedVGPR(MF, ST); - // To track the spill frame indices handled in this pass. BitVector SpillFIs(MFI.getObjectIndexEnd(), false); @@ -375,8 +324,6 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) { FuncInfo->removeDeadFrameIndices(MFI); MadeChange = true; - } else if (FuncInfo->VGPRReservedForSGPRSpill) { - FuncInfo->removeVGPRForSGPRSpill(FuncInfo->VGPRReservedForSGPRSpill, MF); } SaveBlocks.clear(); diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index 3ce368ef4db9..cca8565c9ff9 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -118,10 +118,12 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) if (IsKernel || !F.hasFnAttribute("amdgpu-no-workitem-id-x")) WorkItemIDX = true; - if (!F.hasFnAttribute("amdgpu-no-workitem-id-y")) + if (!F.hasFnAttribute("amdgpu-no-workitem-id-y") && + ST.getMaxWorkitemID(F, 1) != 0) WorkItemIDY = true; - if (!F.hasFnAttribute("amdgpu-no-workitem-id-z")) + if (!F.hasFnAttribute("amdgpu-no-workitem-id-z") && + ST.getMaxWorkitemID(F, 2) != 0) WorkItemIDZ = true; if (!F.hasFnAttribute("amdgpu-no-dispatch-ptr")) @@ -274,7 +276,6 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF, MachineFrameInfo &FrameInfo = MF.getFrameInfo(); MachineRegisterInfo &MRI = MF.getRegInfo(); unsigned WaveSize = ST.getWavefrontSize(); - SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); unsigned Size = FrameInfo.getObjectSize(FI); unsigned NumLanes = Size / 4; @@ -291,16 +292,7 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF, Register LaneVGPR; unsigned VGPRIndex = (NumVGPRSpillLanes % WaveSize); - // Reserve a VGPR (when NumVGPRSpillLanes = 0, WaveSize, 2*WaveSize, ..) and - // when one of the two conditions is true: - // 1. One reserved VGPR being tracked by VGPRReservedForSGPRSpill is not yet - // reserved. - // 2. All spill lanes of reserved VGPR(s) are full and another spill lane is - // required. 
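For reference on the allocateSGPRSpillToVGPR bookkeeping above: each spill VGPR supplies one 32-bit lane per wave lane, the running NumVGPRSpillLanes counter picks the sublane as counter % wave size, and a fresh VGPR is needed whenever that index wraps to zero. A standalone sketch of the arithmetic (plain C++, a 64-lane wave and made-up spill sizes assumed):

#include <cstdio>
#include <initializer_list>

int main() {
  const unsigned WaveSize = 64;   // spill lanes provided by one VGPR
  unsigned NumVGPRSpillLanes = 0; // running counter, as in the pass
  unsigned NumSpillVGPRs = 0;

  // Spill three SGPR frame objects of 4, 8 and 256 bytes (1, 2 and 64 lanes).
  for (unsigned Size : {4u, 8u, 256u}) {
    for (unsigned I = 0, NumLanes = Size / 4; I != NumLanes;
         ++I, ++NumVGPRSpillLanes) {
      unsigned VGPRIndex = NumVGPRSpillLanes % WaveSize;
      if (VGPRIndex == 0) { // lane 0 of a new VGPR: allocate another one
        ++NumSpillVGPRs;
        std::printf("allocating spill VGPR #%u at lane %u\n",
                    NumSpillVGPRs - 1, NumVGPRSpillLanes);
      }
    }
  }
  // 67 spilled lanes need 2 VGPRs: lanes 0..63 and lanes 64..66.
  std::printf("%u lanes -> %u spill VGPRs\n", NumVGPRSpillLanes, NumSpillVGPRs);
}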
- if (FuncInfo->VGPRReservedForSGPRSpill && NumVGPRSpillLanes < WaveSize) { - assert(FuncInfo->VGPRReservedForSGPRSpill == SpillVGPRs.back().VGPR); - LaneVGPR = FuncInfo->VGPRReservedForSGPRSpill; - } else if (VGPRIndex == 0) { + if (VGPRIndex == 0) { LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF); if (LaneVGPR == AMDGPU::NoRegister) { // We have no VGPRs left for spilling SGPRs. Reset because we will not @@ -308,6 +300,8 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF, SGPRToVGPRSpills.erase(FI); NumVGPRSpillLanes -= I; + // FIXME: We can run out of free registers with split allocation if + // IPRA is enabled and a called function already uses every VGPR. #if 0 DiagnosticInfoResourceLimit DiagOutOfRegs(MF.getFunction(), "VGPRs for SGPR spilling", @@ -340,21 +334,6 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF, return true; } -/// Reserve a VGPR for spilling of SGPRs -bool SIMachineFunctionInfo::reserveVGPRforSGPRSpills(MachineFunction &MF) { - const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); - const SIRegisterInfo *TRI = ST.getRegisterInfo(); - SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); - - Register LaneVGPR = TRI->findUnusedRegister( - MF.getRegInfo(), &AMDGPU::VGPR_32RegClass, MF, true); - if (LaneVGPR == Register()) - return false; - SpillVGPRs.push_back(SGPRSpillVGPR(LaneVGPR, None)); - FuncInfo->VGPRReservedForSGPRSpill = LaneVGPR; - return true; -} - /// Reserve AGPRs or VGPRs to support spilling for FrameIndex \p FI. /// Either AGPR is spilled to VGPR to vice versa. /// Returns true if a \p FI can be eliminated completely. @@ -616,24 +595,6 @@ bool SIMachineFunctionInfo::initializeBaseYamlFields( return false; } -// Remove VGPR which was reserved for SGPR spills if there are no spilled SGPRs -bool SIMachineFunctionInfo::removeVGPRForSGPRSpill(Register ReservedVGPR, - MachineFunction &MF) { - for (auto *i = SpillVGPRs.begin(); i < SpillVGPRs.end(); i++) { - if (i->VGPR == ReservedVGPR) { - SpillVGPRs.erase(i); - - for (MachineBasicBlock &MBB : MF) { - MBB.removeLiveIn(ReservedVGPR); - MBB.sortUniqueLiveIns(); - } - this->VGPRReservedForSGPRSpill = AMDGPU::NoRegister; - return true; - } - } - return false; -} - bool SIMachineFunctionInfo::usesAGPRs(const MachineFunction &MF) const { if (UsesAGPRs) return *UsesAGPRs; diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h index 8accbf611c5f..8e821274bb77 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -502,7 +502,6 @@ public: // FIXME Register SGPRForBPSaveRestoreCopy; Optional<int> BasePointerSaveIndex; - Register VGPRReservedForSGPRSpill; bool isCalleeSavedReg(const MCPhysReg *CSRegs, MCPhysReg Reg); public: @@ -528,7 +527,6 @@ public: void setSGPRSpillVGPRs(Register NewVGPR, Optional<int> newFI, int Index) { SpillVGPRs[Index].VGPR = NewVGPR; SpillVGPRs[Index].FI = newFI; - VGPRReservedForSGPRSpill = NewVGPR; } bool removeVGPRForSGPRSpill(Register ReservedVGPR, MachineFunction &MF); @@ -556,7 +554,6 @@ public: bool haveFreeLanesForSGPRSpill(const MachineFunction &MF, unsigned NumLane) const; bool allocateSGPRSpillToVGPR(MachineFunction &MF, int FI); - bool reserveVGPRforSGPRSpills(MachineFunction &MF); bool allocateVGPRSpillToAGPR(MachineFunction &MF, int FI, bool isAGPRtoVGPR); void removeDeadFrameIndices(MachineFrameInfo &MFI); diff --git a/llvm/lib/Target/AMDGPU/SIModeRegister.cpp 
b/llvm/lib/Target/AMDGPU/SIModeRegister.cpp index 69eab762f05c..24a8879b5684 100644 --- a/llvm/lib/Target/AMDGPU/SIModeRegister.cpp +++ b/llvm/lib/Target/AMDGPU/SIModeRegister.cpp @@ -188,7 +188,7 @@ void SIModeRegister::insertSetreg(MachineBasicBlock &MBB, MachineInstr *MI, unsigned Offset = countTrailingZeros<unsigned>(InstrMode.Mask); unsigned Width = countTrailingOnes<unsigned>(InstrMode.Mask >> Offset); unsigned Value = (InstrMode.Mode >> Offset) & ((1 << Width) - 1); - BuildMI(MBB, MI, 0, TII->get(AMDGPU::S_SETREG_IMM32_B32)) + BuildMI(MBB, MI, nullptr, TII->get(AMDGPU::S_SETREG_IMM32_B32)) .addImm(Value) .addImm(((Width - 1) << AMDGPU::Hwreg::WIDTH_M1_SHIFT_) | (Offset << AMDGPU::Hwreg::OFFSET_SHIFT_) | diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp index 6bf6c45d8cf6..e13e33ed5457 100644 --- a/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp +++ b/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp @@ -155,6 +155,11 @@ public: return MachineFunctionProperties().set( MachineFunctionProperties::Property::IsSSA); } + + MachineFunctionProperties getClearedProperties() const override { + return MachineFunctionProperties().set( + MachineFunctionProperties::Property::NoPHIs); + } }; } // end anonymous namespace @@ -366,47 +371,42 @@ void SIOptimizeVGPRLiveRange::collectWaterfallCandidateRegisters( // Re-calculate the liveness of \p Reg in the THEN-region void SIOptimizeVGPRLiveRange::updateLiveRangeInThenRegion( Register Reg, MachineBasicBlock *If, MachineBasicBlock *Flow) const { - - SmallPtrSet<MachineBasicBlock *, 16> PHIIncoming; - - MachineBasicBlock *ThenEntry = nullptr; - for (auto *Succ : If->successors()) { - if (Succ != Flow) { - ThenEntry = Succ; - break; + SetVector<MachineBasicBlock *> Blocks; + SmallVector<MachineBasicBlock *> WorkList({If}); + + // Collect all successors until we see the flow block, where we should + // reconverge. + while (!WorkList.empty()) { + auto *MBB = WorkList.pop_back_val(); + for (auto *Succ : MBB->successors()) { + if (Succ != Flow && !Blocks.contains(Succ)) { + WorkList.push_back(Succ); + Blocks.insert(Succ); + } } } - assert(ThenEntry && "No successor in Then region?"); LiveVariables::VarInfo &OldVarInfo = LV->getVarInfo(Reg); - df_iterator_default_set<MachineBasicBlock *, 16> Visited; - - for (MachineBasicBlock *MBB : depth_first_ext(ThenEntry, Visited)) { - if (MBB == Flow) - break; - + for (MachineBasicBlock *MBB : Blocks) { // Clear Live bit, as we will recalculate afterwards LLVM_DEBUG(dbgs() << "Clear AliveBlock " << printMBBReference(*MBB) << '\n'); OldVarInfo.AliveBlocks.reset(MBB->getNumber()); } + SmallPtrSet<MachineBasicBlock *, 4> PHIIncoming; + // Get the blocks the Reg should be alive through for (auto I = MRI->use_nodbg_begin(Reg), E = MRI->use_nodbg_end(); I != E; ++I) { auto *UseMI = I->getParent(); if (UseMI->isPHI() && I->readsReg()) { - if (Visited.contains(UseMI->getParent())) + if (Blocks.contains(UseMI->getParent())) PHIIncoming.insert(UseMI->getOperand(I.getOperandNo() + 1).getMBB()); } } - Visited.clear(); - - for (MachineBasicBlock *MBB : depth_first_ext(ThenEntry, Visited)) { - if (MBB == Flow) - break; - + for (MachineBasicBlock *MBB : Blocks) { SmallVector<MachineInstr *> Uses; // PHI instructions has been processed before. findNonPHIUsesInBlock(Reg, MBB, Uses); @@ -433,7 +433,7 @@ void SIOptimizeVGPRLiveRange::updateLiveRangeInThenRegion( // Set the isKilled flag if we get new Kills in the THEN region. 
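The updateLiveRangeInThenRegion rewrite above swaps the depth-first walk from a single then-entry for a worklist that simply collects every block reachable from the If block without passing through the Flow block. A minimal standalone sketch of that bounded collection over an adjacency list (plain C++, not LLVM code; block numbering is made up):

#include <cstdio>
#include <set>
#include <vector>

// Collect all blocks reachable from `If` via successor edges, stopping at
// (and excluding) `Flow`, mirroring the worklist loop in the pass.
static std::set<int> collectThenBlocks(const std::vector<std::vector<int>> &Succs,
                                       int If, int Flow) {
  std::set<int> Blocks;
  std::vector<int> WorkList{If};
  while (!WorkList.empty()) {
    int MBB = WorkList.back();
    WorkList.pop_back();
    for (int Succ : Succs[MBB]) {
      if (Succ != Flow && !Blocks.count(Succ)) {
        Blocks.insert(Succ);
        WorkList.push_back(Succ);
      }
    }
  }
  return Blocks;
}

int main() {
  // 0 = If, 3 = Flow; 1 and 2 form the "then" region, 4 follows Flow.
  std::vector<std::vector<int>> Succs{{1, 3}, {2}, {3}, {4}, {}};
  for (int B : collectThenBlocks(Succs, 0, 3))
    std::printf("then block %d\n", B); // prints 1 and 2
}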
for (auto *MI : OldVarInfo.Kills) { - if (Visited.contains(MI->getParent())) + if (Blocks.contains(MI->getParent())) MI->addRegisterKilled(Reg, TRI); } } diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index 340e2b48e5cd..eb9452f4b85e 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -617,7 +617,7 @@ def Pseudo_SReg_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16 let HasSGPR = 1; } -def Pseudo_SReg_128 : SIRegisterClass<"AMDGPU", [v4i32, v2i64, v2f64], 32, +def Pseudo_SReg_128 : SIRegisterClass<"AMDGPU", [v4i32, v2i64, v2f64, v8i16, v8f16], 32, (add PRIVATE_RSRC_REG)> { let isAllocatable = 0; let CopyCost = -1; @@ -784,7 +784,7 @@ multiclass SRegClass<int numRegs, int priority, } defm "" : SRegClass<3, 14, [v3i32, v3f32], SGPR_96Regs, TTMP_96Regs>; -defm "" : SRegClass<4, 15, [v4i32, v4f32, v2i64], SGPR_128Regs, TTMP_128Regs>; +defm "" : SRegClass<4, 15, [v4i32, v4f32, v2i64, v2f64, v8i16, v8f16], SGPR_128Regs, TTMP_128Regs>; defm "" : SRegClass<5, 16, [v5i32, v5f32], SGPR_160Regs, TTMP_160Regs>; defm "" : SRegClass<6, 17, [v6i32, v6f32, v3i64, v3f64], SGPR_192Regs, TTMP_192Regs>; defm "" : SRegClass<7, 18, [v7i32, v7f32], SGPR_224Regs, TTMP_224Regs>; @@ -824,7 +824,7 @@ multiclass VRegClass<int numRegs, list<ValueType> regTypes, dag regList> { defm VReg_64 : VRegClass<2, [i64, f64, v2i32, v2f32, v4f16, v4i16, p0, p1, p4], (add VGPR_64)>; defm VReg_96 : VRegClass<3, [v3i32, v3f32], (add VGPR_96)>; -defm VReg_128 : VRegClass<4, [v4i32, v4f32, v2i64, v2f64], (add VGPR_128)>; +defm VReg_128 : VRegClass<4, [v4i32, v4f32, v2i64, v2f64, v8i16, v8f16], (add VGPR_128)>; defm VReg_160 : VRegClass<5, [v5i32, v5f32], (add VGPR_160)>; defm VReg_192 : VRegClass<6, [v6i32, v6f32, v3i64, v3f64], (add VGPR_192)>; @@ -846,7 +846,7 @@ multiclass ARegClass<int numRegs, list<ValueType> regTypes, dag regList> { defm AReg_64 : ARegClass<2, [i64, f64, v2i32, v2f32, v4f16, v4i16], (add AGPR_64)>; defm AReg_96 : ARegClass<3, [v3i32, v3f32], (add AGPR_96)>; -defm AReg_128 : ARegClass<4, [v4i32, v4f32, v2i64, v2f64], (add AGPR_128)>; +defm AReg_128 : ARegClass<4, [v4i32, v4f32, v2i64, v2f64, v8i16, v8f16], (add AGPR_128)>; defm AReg_160 : ARegClass<5, [v5i32, v5f32], (add AGPR_160)>; defm AReg_192 : ARegClass<6, [v6i32, v6f32, v3i64, v3f64], (add AGPR_192)>; defm AReg_224 : ARegClass<7, [v7i32, v7f32], (add AGPR_224)>; diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp index 77ee3c0ff0e4..46efb3c605c6 100644 --- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -861,12 +861,16 @@ MachineInstr *SIWholeQuadMode::lowerKillF32(MachineBasicBlock &MBB, MachineInstr *VcmpMI; const MachineOperand &Op0 = MI.getOperand(0); const MachineOperand &Op1 = MI.getOperand(1); + + // VCC represents lanes killed. + Register VCC = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC; + if (TRI->isVGPR(*MRI, Op0.getReg())) { Opcode = AMDGPU::getVOPe32(Opcode); VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode)).add(Op1).add(Op0); } else { VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode)) - .addReg(AMDGPU::VCC, RegState::Define) + .addReg(VCC, RegState::Define) .addImm(0) // src0 modifiers .add(Op1) .addImm(0) // src1 modifiers @@ -874,9 +878,6 @@ MachineInstr *SIWholeQuadMode::lowerKillF32(MachineBasicBlock &MBB, .addImm(0); // omod } - // VCC represents lanes killed. - Register VCC = ST->isWave32() ? 
AMDGPU::VCC_LO : AMDGPU::VCC; - MachineInstr *MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg) .addReg(LiveMaskReg) diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td index 1713586dcf5b..3f7837f7dbf1 100644 --- a/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -246,10 +246,10 @@ let Defs = [SCC] in { def S_BCNT0_I32_B32 : SOP1_32 <"s_bcnt0_i32_b32">; def S_BCNT0_I32_B64 : SOP1_32_64 <"s_bcnt0_i32_b64">; def S_BCNT1_I32_B32 : SOP1_32 <"s_bcnt1_i32_b32", - [(set i32:$sdst, (ctpop i32:$src0))] + [(set i32:$sdst, (UniformUnaryFrag<ctpop> i32:$src0))] >; def S_BCNT1_I32_B64 : SOP1_32_64 <"s_bcnt1_i32_b64", - [(set i32:$sdst, (ctpop i64:$src0))] + [(set i32:$sdst, (UniformUnaryFrag<ctpop> i64:$src0))] >; } // End Defs = [SCC] @@ -518,10 +518,9 @@ let Uses = [SCC] in { def S_CSELECT_B32 : SOP2_32 <"s_cselect_b32", [(set i32:$sdst, (SelectPat<select> i32:$src0, i32:$src1))] >; - def S_CSELECT_B64 : SOP2_64 <"s_cselect_b64", - [(set i64:$sdst, (SelectPat<select> i64:$src0, i64:$src1))] - >; } + + def S_CSELECT_B64 : SOP2_64 <"s_cselect_b64">; } // End Uses = [SCC] let Defs = [SCC] in { @@ -551,11 +550,11 @@ def S_XOR_B64 : SOP2_64 <"s_xor_b64", >; def S_XNOR_B32 : SOP2_32 <"s_xnor_b32", - [(set i32:$sdst, (not (xor_oneuse i32:$src0, i32:$src1)))] + [(set i32:$sdst, (UniformUnaryFrag<not> (xor_oneuse i32:$src0, i32:$src1)))] >; def S_XNOR_B64 : SOP2_64 <"s_xnor_b64", - [(set i64:$sdst, (not (xor_oneuse i64:$src0, i64:$src1)))] + [(set i64:$sdst, (UniformUnaryFrag<not> (xor_oneuse i64:$src0, i64:$src1)))] >; def S_NAND_B32 : SOP2_32 <"s_nand_b32", @@ -1371,7 +1370,7 @@ def : GCNPat < >; def : GCNPat < - (i64 (ctpop i64:$src)), + (i64 (UniformUnaryFrag<ctpop> i64:$src)), (i64 (REG_SEQUENCE SReg_64, (i32 (COPY_TO_REGCLASS (S_BCNT1_I32_B64 $src), SReg_32)), sub0, (S_MOV_B32 (i32 0)), sub1)) diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp index 0bee9022975e..18c348d1cf89 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp @@ -79,8 +79,8 @@ const char* const IdSymbolic[] = { "HW_REG_FLAT_SCR_LO", "HW_REG_FLAT_SCR_HI", "HW_REG_XNACK_MASK", - nullptr, // HW_ID1, no predictable values - nullptr, // HW_ID2, no predictable values + "HW_REG_HW_ID1", + "HW_REG_HW_ID2", "HW_REG_POPS_PACKER", nullptr, nullptr, diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index d20eaaaa65e8..1e96266eb06c 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -132,6 +132,8 @@ bool isHsaAbiVersion3Or4(const MCSubtargetInfo *STI) { #define GET_MIMGInfoTable_IMPL #define GET_MIMGLZMappingTable_IMPL #define GET_MIMGMIPMappingTable_IMPL +#define GET_MIMGBiasMappingTable_IMPL +#define GET_MIMGOffsetMappingTable_IMPL #define GET_MIMGG16MappingTable_IMPL #include "AMDGPUGenSearchableTables.inc" @@ -410,7 +412,7 @@ void AMDGPUTargetID::setTargetIDFromTargetIDStream(StringRef TargetID) { } std::string AMDGPUTargetID::toString() const { - std::string StringRep = ""; + std::string StringRep; raw_string_ostream StreamRep(StringRep); auto TargetTriple = STI.getTargetTriple(); @@ -421,7 +423,7 @@ std::string AMDGPUTargetID::toString() const { << TargetTriple.getOSName() << '-' << TargetTriple.getEnvironmentName() << '-'; - std::string Processor = ""; + std::string Processor; // TODO: 
Following else statement is present here because we used various // alias names for GPUs up until GFX9 (e.g. 'fiji' is same as 'gfx803'). // Remove once all aliases are removed from GCNProcessors.td. @@ -432,7 +434,7 @@ std::string AMDGPUTargetID::toString() const { Twine(Version.Stepping)) .str(); - std::string Features = ""; + std::string Features; if (Optional<uint8_t> HsaAbiVersion = getHsaAbiVersion(&STI)) { switch (*HsaAbiVersion) { case ELF::ELFABIVERSION_AMDGPU_HSA_V2: @@ -1018,9 +1020,18 @@ static unsigned getLastSymbolicHwreg(const MCSubtargetInfo &STI) { } bool isValidHwreg(int64_t Id, const MCSubtargetInfo &STI) { - return - ID_SYMBOLIC_FIRST_ <= Id && Id < getLastSymbolicHwreg(STI) && - IdSymbolic[Id] && (Id != ID_XNACK_MASK || !AMDGPU::isGFX10_BEncoding(STI)); + switch (Id) { + case ID_HW_ID: + return isSI(STI) || isCI(STI) || isVI(STI) || isGFX9(STI); + case ID_HW_ID1: + case ID_HW_ID2: + return isGFX10Plus(STI); + case ID_XNACK_MASK: + return isGFX10(STI) && !AMDGPU::isGFX10_BEncoding(STI); + default: + return ID_SYMBOLIC_FIRST_ <= Id && Id < getLastSymbolicHwreg(STI) && + IdSymbolic[Id]; + } } bool isValidHwreg(int64_t Id) { diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 061c74c0ace6..89f928eb8b92 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -64,6 +64,7 @@ struct GcnBufferFormatInfo { #define GET_MIMGEncoding_DECL #define GET_MIMGLZMapping_DECL #define GET_MIMGMIPMapping_DECL +#define GET_MIMGBiASMapping_DECL #include "AMDGPUGenSearchableTables.inc" namespace IsaInfo { @@ -330,6 +331,16 @@ struct MIMGMIPMappingInfo { MIMGBaseOpcode NONMIP; }; +struct MIMGBiasMappingInfo { + MIMGBaseOpcode Bias; + MIMGBaseOpcode NoBias; +}; + +struct MIMGOffsetMappingInfo { + MIMGBaseOpcode Offset; + MIMGBaseOpcode NoOffset; +}; + struct MIMGG16MappingInfo { MIMGBaseOpcode G; MIMGBaseOpcode G16; @@ -342,6 +353,12 @@ LLVM_READONLY const MIMGMIPMappingInfo *getMIMGMIPMappingInfo(unsigned MIP); LLVM_READONLY +const MIMGBiasMappingInfo *getMIMGBiasMappingInfo(unsigned Bias); + +LLVM_READONLY +const MIMGOffsetMappingInfo *getMIMGOffsetMappingInfo(unsigned Offset); + +LLVM_READONLY const MIMGG16MappingInfo *getMIMGG16MappingInfo(unsigned G); LLVM_READONLY diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td index 8d232ffe4114..b9ff814a4dc5 100644 --- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -637,9 +637,9 @@ class divergent_i64_BinOp <SDPatternOperator Op, Instruction Inst> : ) >; -def : divergent_i64_BinOp <and, V_AND_B32_e32>; -def : divergent_i64_BinOp <or, V_OR_B32_e32>; -def : divergent_i64_BinOp <xor, V_XOR_B32_e32>; +def : divergent_i64_BinOp <and, V_AND_B32_e64>; +def : divergent_i64_BinOp <or, V_OR_B32_e64>; +def : divergent_i64_BinOp <xor, V_XOR_B32_e64>; let SubtargetPredicate = Has16BitInsts in { @@ -688,6 +688,36 @@ let SubtargetPredicate = HasDLInsts in { let isReMaterializable = 1 in defm V_XNOR_B32 : VOP2Inst <"v_xnor_b32", VOP_I32_I32_I32, xnor>; +def : GCNPat< + (i32 (DivergentUnaryFrag<not> (xor_oneuse i32:$src0, i32:$src1))), + (i32 (V_XNOR_B32_e64 $src0, $src1)) +>; + +def : GCNPat< + (i32 (DivergentBinFrag<xor_oneuse> (not i32:$src0), i32:$src1)), + (i32 (V_XNOR_B32_e64 $src0, $src1)) +>; + +def : GCNPat< + (i64 (DivergentUnaryFrag<not> (xor_oneuse i64:$src0, i64:$src1))), + (REG_SEQUENCE VReg_64, (i32 (V_XNOR_B32_e64 + (i32 (EXTRACT_SUBREG 
$src0, sub0)), + (i32 (EXTRACT_SUBREG $src1, sub0)))), sub0, + (i32 (V_XNOR_B32_e64 + (i32 (EXTRACT_SUBREG $src0, sub1)), + (i32 (EXTRACT_SUBREG $src1, sub1)))), sub1) +>; + +def : GCNPat< + (i64 (DivergentBinFrag<xor_oneuse> (not i64:$src0), i64:$src1)), + (REG_SEQUENCE VReg_64, (i32 (V_XNOR_B32_e64 + (i32 (EXTRACT_SUBREG $src0, sub0)), + (i32 (EXTRACT_SUBREG $src1, sub0)))), sub0, + (i32 (V_XNOR_B32_e64 + (i32 (EXTRACT_SUBREG $src0, sub1)), + (i32 (EXTRACT_SUBREG $src1, sub1)))), sub1) +>; + let Constraints = "$vdst = $src2", DisableEncoding = "$src2", isConvertibleToThreeAddress = 1, diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index 32222b3eb93c..707475ceccee 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -388,6 +388,12 @@ class VOPProfileMAI<VOPProfile P, RegisterOperand _SrcRC, RegisterOperand _DstRC let HasModifiers = 0; let Asm64 = "$vdst, $src0, $src1, $src2$cbsz$abid$blgp"; let Ins64 = (ins Src0RC64:$src0, Src1RC64:$src1, Src2RC64:$src2, cbsz:$cbsz, abid:$abid, blgp:$blgp); + // Dst and SrcC cannot partially overlap if SrcC/Dst is bigger than 4 VGPRs. + // We then create two versions of the instruction: with tied dst and src2 + // and with the earlyclobber flag on the dst. This is stricter than the + // actual HW restriction. In particular earlyclobber also affects src0 and + // src1 allocation which is not required. + bit NoDstOverlap = !gt(DstVT.Size, 128); } def VOPProfileMAI_F32_F32_X4 : VOPProfileMAI<VOP_V4F32_F32_F32_V4F32, AISrc_128_f32, ADst_128>; @@ -426,6 +432,11 @@ def VOPProfileMAI_F32_V4I16_X32_VCD : VOPProfileMAI<VOP_V32F32_V4I16_V4I16_V32F def VOPProfileMAI_F64_16X16X4F64_VCD : VOPProfileMAI<VOP_V4F64_F64_F64_V4F64, VISrc_256_f64, VDst_256, AVSrc_64>; def VOPProfileMAI_F64_4X4X4F64_VCD : VOPProfileMAI<VOP_F64_F64_F64_F64, VISrc_64_f64, VDst_64, AVSrc_64>; +class MFMATable <bit is_mac, string Name> { + bit IsMac = is_mac; + string FMAOp = Name; +} + let Predicates = [HasMAIInsts] in { let isAsCheapAsAMove = 1, isReMaterializable = 1 in { @@ -435,13 +446,31 @@ let isAsCheapAsAMove = 1, isReMaterializable = 1 in { } // End isMoveImm = 1 } // End isAsCheapAsAMove = 1, isReMaterializable = 1 -multiclass MAIInst<string OpName, string P, SDPatternOperator node> { +multiclass MAIInst<string OpName, string P, SDPatternOperator node, + bit NoDstOverlap = !cast<VOPProfileMAI>("VOPProfileMAI_" # P).NoDstOverlap> { let isConvergent = 1, mayRaiseFPException = 0, ReadsModeReg = 1 in { // FP32 denorm mode is respected, rounding mode is not. Exceptions are not supported.
- defm "" : VOP3Inst<OpName, !cast<VOPProfileMAI>("VOPProfileMAI_" # P), node>; - - let SubtargetPredicate = isGFX90APlus, Mnemonic = OpName in - defm _vgprcd : VOP3Inst<OpName # "_vgprcd", !cast<VOPProfileMAI>("VOPProfileMAI_" # P # "_VCD")>; + let Constraints = !if(NoDstOverlap, "@earlyclobber $vdst", "") in { + defm "" : VOP3Inst<OpName, !cast<VOPProfileMAI>("VOPProfileMAI_" # P), !if(NoDstOverlap, null_frag, node)>, + MFMATable<0, NAME # "_e64">; + + let SubtargetPredicate = isGFX90APlus, Mnemonic = OpName in + defm _vgprcd : VOP3Inst<OpName # "_vgprcd", !cast<VOPProfileMAI>("VOPProfileMAI_" # P # "_VCD")>, + MFMATable<0, NAME # "_vgprcd_e64">; + } + + foreach _ = BoolToList<NoDstOverlap>.ret in { + let Constraints = !if(NoDstOverlap, "$vdst = $src2", ""), + isConvertibleToThreeAddress = NoDstOverlap, + Mnemonic = OpName in { + defm "_mac" : VOP3Inst<OpName # "_mac", !cast<VOPProfileMAI>("VOPProfileMAI_" # P), node>, + MFMATable<1, NAME # "_e64">; + + let SubtargetPredicate = isGFX90APlus in + defm _mac_vgprcd : VOP3Inst<OpName # "_mac_vgprcd", !cast<VOPProfileMAI>("VOPProfileMAI_" # P # "_VCD")>, + MFMATable<1, NAME # "_vgprcd_e64">; + } + } } // End isConvergent = 1, mayRaiseFPException = 0, ReadsModeReg = 1 } @@ -517,6 +546,7 @@ multiclass VOP3P_Real_MAI<bits<7> op> { } } +let Constraints = "" in { multiclass VOP3P_Real_MFMA_gfx90a<bits<7> op> { let SubtargetPredicate = isGFX90AOnly, AssemblerPredicate = isGFX90AOnly, DecoderNamespace = "GFX90A" in { @@ -536,6 +566,7 @@ multiclass VOP3P_Real_MFMA<bits<7> op> : let DecoderNamespace = "GFX8"; } } +} defm V_PK_MAD_I16 : VOP3P_Real_vi <0x00>; defm V_PK_MUL_LO_U16 : VOP3P_Real_vi <0x01>; |