diff options
author | Dimitry Andric <dim@FreeBSD.org> | 2015-01-18 16:17:27 +0000 |
---|---|---|
committer | Dimitry Andric <dim@FreeBSD.org> | 2015-01-18 16:17:27 +0000 |
commit | 67c32a98315f785a9ec9d531c1f571a0196c7463 (patch) | |
tree | 4abb9cbeecc7901726dd0b4a37369596c852e9ef /lib/Target/R600 | |
parent | 9f61947910e6ab40de38e6b4034751ef1513200f (diff) | |
download | src-67c32a98315f785a9ec9d531c1f571a0196c7463.tar.gz src-67c32a98315f785a9ec9d531c1f571a0196c7463.zip |
Vendor import of llvm RELEASE_360/rc1 tag r226102 (effectively, 3.6.0 RC1):vendor/llvm/llvm-release_360-r226102
Notes
Notes:
svn path=/vendor/llvm/dist/; revision=277323
svn path=/vendor/llvm/llvm-release_360-r226102/; revision=277324; tag=vendor/llvm/llvm-release_360-r226102
Diffstat (limited to 'lib/Target/R600')
95 files changed, 11789 insertions, 4294 deletions
diff --git a/lib/Target/R600/AMDGPU.h b/lib/Target/R600/AMDGPU.h index d7e94f75e123..fcf9eca80e96 100644 --- a/lib/Target/R600/AMDGPU.h +++ b/lib/Target/R600/AMDGPU.h @@ -8,8 +8,8 @@ /// \file //===----------------------------------------------------------------------===// -#ifndef AMDGPU_H -#define AMDGPU_H +#ifndef LLVM_LIB_TARGET_R600_AMDGPU_H +#define LLVM_LIB_TARGET_R600_AMDGPU_H #include "llvm/Support/TargetRegistry.h" #include "llvm/Target/TargetMachine.h" @@ -38,21 +38,31 @@ FunctionPass *createAMDGPUCFGStructurizerPass(); // SI Passes FunctionPass *createSITypeRewriter(); FunctionPass *createSIAnnotateControlFlowPass(); +FunctionPass *createSIFoldOperandsPass(); FunctionPass *createSILowerI1CopiesPass(); FunctionPass *createSIShrinkInstructionsPass(); +FunctionPass *createSILoadStoreOptimizerPass(TargetMachine &tm); FunctionPass *createSILowerControlFlowPass(TargetMachine &tm); FunctionPass *createSIFixSGPRCopiesPass(TargetMachine &tm); FunctionPass *createSIFixSGPRLiveRangesPass(); FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS); FunctionPass *createSIInsertWaits(TargetMachine &tm); +FunctionPass *createSIPrepareScratchRegs(); + +void initializeSIFoldOperandsPass(PassRegistry &); +extern char &SIFoldOperandsID; void initializeSILowerI1CopiesPass(PassRegistry &); extern char &SILowerI1CopiesID; +void initializeSILoadStoreOptimizerPass(PassRegistry &); +extern char &SILoadStoreOptimizerID; + // Passes common to R600 and SI FunctionPass *createAMDGPUPromoteAlloca(const AMDGPUSubtarget &ST); Pass *createAMDGPUStructurizeCFGPass(); FunctionPass *createAMDGPUISelDag(TargetMachine &tm); +ModulePass *createAMDGPUAlwaysInlinePass(); /// \brief Creates an AMDGPU-specific Target Transformation Info pass. 
ImmutablePass * @@ -63,6 +73,7 @@ extern char &SIFixSGPRLiveRangesID; extern Target TheAMDGPUTarget; +extern Target TheGCNTarget; namespace AMDGPU { enum TargetIndex { @@ -127,4 +138,4 @@ enum AddressSpaces { } // namespace AMDGPUAS -#endif // AMDGPU_H +#endif diff --git a/lib/Target/R600/AMDGPU.td b/lib/Target/R600/AMDGPU.td index 5645f1a2322e..8a5ca613dc80 100644 --- a/lib/Target/R600/AMDGPU.td +++ b/lib/Target/R600/AMDGPU.td @@ -81,6 +81,17 @@ def FeatureCFALUBug : SubtargetFeature<"cfalubug", "true", "GPU has CF_ALU bug">; +// XXX - This should probably be removed once enabled by default +def FeatureEnableLoadStoreOpt : SubtargetFeature <"load-store-opt", + "EnableLoadStoreOpt", + "true", + "Enable SI load/store optimizer pass">; + +def FeatureFlatAddressSpace : SubtargetFeature<"flat-address-space", + "FlatAddressSpace", + "true", + "Support flat address space">; + class SubtargetFeatureFetchLimit <string Value> : SubtargetFeature <"fetch"#Value, "TexVTXClauseSize", @@ -135,16 +146,28 @@ def FeatureSouthernIslands : SubtargetFeatureGeneration<"SOUTHERN_ISLANDS", def FeatureSeaIslands : SubtargetFeatureGeneration<"SEA_ISLANDS", [Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize65536, - FeatureWavefrontSize64]>; + FeatureWavefrontSize64, FeatureFlatAddressSpace]>; + +def FeatureVolcanicIslands : SubtargetFeatureGeneration<"VOLCANIC_ISLANDS", + [Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize65536, + FeatureWavefrontSize64, FeatureFlatAddressSpace]>; + //===----------------------------------------------------------------------===// def AMDGPUInstrInfo : InstrInfo { let guessInstructionProperties = 1; } +def AMDGPUAsmParser : AsmParser { + // Some of the R600 registers have the same name, so this crashes. + // For example T0_XYZW and T0_XY both have the asm name T0. 
+ let ShouldEmitMatchRegisterName = 0; +} + def AMDGPU : Target { // Pull in Instruction Info: let InstructionSet = AMDGPUInstrInfo; + let AssemblyParsers = [AMDGPUAsmParser]; } // Dummy Instruction itineraries for pseudo instructions diff --git a/lib/Target/R600/AMDGPUAlwaysInlinePass.cpp b/lib/Target/R600/AMDGPUAlwaysInlinePass.cpp new file mode 100644 index 000000000000..b545b456161f --- /dev/null +++ b/lib/Target/R600/AMDGPUAlwaysInlinePass.cpp @@ -0,0 +1,66 @@ +//===-- AMDGPUAlwaysInlinePass.cpp - Promote Allocas ----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// This pass marks all internal functions as always_inline and creates +/// duplicates of all other functions a marks the duplicates as always_inline. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "llvm/IR/Module.h" +#include "llvm/Transforms/Utils/Cloning.h" + +using namespace llvm; + +namespace { + +class AMDGPUAlwaysInline : public ModulePass { + + static char ID; + +public: + AMDGPUAlwaysInline() : ModulePass(ID) { } + bool runOnModule(Module &M) override; + const char *getPassName() const override { return "AMDGPU Always Inline Pass"; } +}; + +} // End anonymous namespace + +char AMDGPUAlwaysInline::ID = 0; + +bool AMDGPUAlwaysInline::runOnModule(Module &M) { + + std::vector<Function*> FuncsToClone; + for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) { + Function &F = *I; + if (!F.hasLocalLinkage() && !F.isDeclaration() && !F.use_empty()) + FuncsToClone.push_back(&F); + } + + for (Function *F : FuncsToClone) { + ValueToValueMapTy VMap; + Function *NewFunc = CloneFunction(F, VMap, false); + NewFunc->setLinkage(GlobalValue::InternalLinkage); + 
F->getParent()->getFunctionList().push_back(NewFunc); + F->replaceAllUsesWith(NewFunc); + } + + for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) { + Function &F = *I; + if (F.hasLocalLinkage()) { + F.addFnAttr(Attribute::AlwaysInline); + } + } + return false; +} + +ModulePass *llvm::createAMDGPUAlwaysInlinePass() { + return new AMDGPUAlwaysInline(); +} diff --git a/lib/Target/R600/AMDGPUAsmPrinter.cpp b/lib/Target/R600/AMDGPUAsmPrinter.cpp index 73faaa183581..624f3919b409 100644 --- a/lib/Target/R600/AMDGPUAsmPrinter.cpp +++ b/lib/Target/R600/AMDGPUAsmPrinter.cpp @@ -18,6 +18,7 @@ #include "AMDGPUAsmPrinter.h" #include "AMDGPU.h" +#include "AMDKernelCodeT.h" #include "AMDGPUSubtarget.h" #include "R600Defines.h" #include "R600MachineFunctionInfo.h" @@ -79,6 +80,7 @@ static AsmPrinter *createAMDGPUAsmPrinterPass(TargetMachine &tm, extern "C" void LLVMInitializeR600AsmPrinter() { TargetRegistry::RegisterAsmPrinter(TheAMDGPUTarget, createAMDGPUAsmPrinterPass); + TargetRegistry::RegisterAsmPrinter(TheGCNTarget, createAMDGPUAsmPrinterPass); } AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM, MCStreamer &Streamer) @@ -97,9 +99,13 @@ void AMDGPUAsmPrinter::EmitEndOfAsmFile(Module &M) { } bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { + + // The starting address of all shader programs must be 256 bytes aligned. 
+ MF.setAlignment(8); + SetupMachineFunction(MF); - OutStreamer.emitRawComment(Twine('@') + MF.getName() + Twine(':')); + EmitFunctionHeader(); MCContext &Context = getObjFileLowering().getContext(); const MCSectionELF *ConfigSection = Context.getELFSection(".AMDGPU.config", @@ -109,7 +115,12 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { const AMDGPUSubtarget &STM = TM.getSubtarget<AMDGPUSubtarget>(); SIProgramInfo KernelInfo; - if (STM.getGeneration() > AMDGPUSubtarget::NORTHERN_ISLANDS) { + if (STM.isAmdHsaOS()) { + OutStreamer.SwitchSection(getObjFileLowering().getTextSection()); + getSIProgramInfo(KernelInfo, MF); + EmitAmdKernelCodeT(MF, KernelInfo); + OutStreamer.EmitCodeAlignment(2 << (MF.getAlignment() - 1)); + } else if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) { getSIProgramInfo(KernelInfo, MF); EmitProgramInfoSI(MF, KernelInfo); } else { @@ -151,23 +162,18 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { } } - if (STM.dumpCode()) { -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) - MF.dump(); -#endif + if (STM.dumpCode() && DisasmEnabled) { - if (DisasmEnabled) { - OutStreamer.SwitchSection(Context.getELFSection(".AMDGPU.disasm", - ELF::SHT_NOTE, 0, - SectionKind::getReadOnly())); + OutStreamer.SwitchSection(Context.getELFSection(".AMDGPU.disasm", + ELF::SHT_NOTE, 0, + SectionKind::getReadOnly())); - for (size_t i = 0; i < DisasmLines.size(); ++i) { - std::string Comment(DisasmLineMaxLen - DisasmLines[i].size(), ' '); - Comment += " ; " + HexLines[i] + "\n"; + for (size_t i = 0; i < DisasmLines.size(); ++i) { + std::string Comment(DisasmLineMaxLen - DisasmLines[i].size(), ' '); + Comment += " ; " + HexLines[i] + "\n"; - OutStreamer.EmitBytes(StringRef(DisasmLines[i])); - OutStreamer.EmitBytes(StringRef(Comment)); - } + OutStreamer.EmitBytes(StringRef(DisasmLines[i])); + OutStreamer.EmitBytes(StringRef(Comment)); } } @@ -177,8 +183,8 @@ bool 
AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { void AMDGPUAsmPrinter::EmitProgramInfoR600(const MachineFunction &MF) { unsigned MaxGPR = 0; bool killPixel = false; - const R600RegisterInfo *RI - = static_cast<const R600RegisterInfo*>(TM.getRegisterInfo()); + const R600RegisterInfo *RI = static_cast<const R600RegisterInfo *>( + TM.getSubtargetImpl()->getRegisterInfo()); const R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>(); const AMDGPUSubtarget &STM = TM.getSubtarget<AMDGPUSubtarget>(); @@ -236,12 +242,15 @@ void AMDGPUAsmPrinter::EmitProgramInfoR600(const MachineFunction &MF) { void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, const MachineFunction &MF) const { + const AMDGPUSubtarget &STM = TM.getSubtarget<AMDGPUSubtarget>(); + const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); uint64_t CodeSize = 0; unsigned MaxSGPR = 0; unsigned MaxVGPR = 0; bool VCCUsed = false; - const SIRegisterInfo *RI - = static_cast<const SIRegisterInfo*>(TM.getRegisterInfo()); + bool FlatUsed = false; + const SIRegisterInfo *RI = static_cast<const SIRegisterInfo *>( + TM.getSubtargetImpl()->getRegisterInfo()); for (const MachineBasicBlock &MBB : MF) { for (const MachineInstr &MI : MBB) { @@ -262,6 +271,11 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, reg == AMDGPU::VCC_HI) { VCCUsed = true; continue; + } else if (reg == AMDGPU::FLAT_SCR || + reg == AMDGPU::FLAT_SCR_LO || + reg == AMDGPU::FLAT_SCR_HI) { + FlatUsed = true; + continue; } switch (reg) { @@ -275,7 +289,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, if (AMDGPU::SReg_32RegClass.contains(reg)) { isSGPR = true; width = 1; - } else if (AMDGPU::VReg_32RegClass.contains(reg)) { + } else if (AMDGPU::VGPR_32RegClass.contains(reg)) { isSGPR = false; width = 1; } else if (AMDGPU::SReg_64RegClass.contains(reg)) { @@ -322,9 +336,16 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, if (VCCUsed) MaxSGPR += 
2; - ProgInfo.NumVGPR = MaxVGPR; - ProgInfo.NumSGPR = MaxSGPR; + if (FlatUsed) + MaxSGPR += 2; + + // We found the maximum register index. They start at 0, so add one to get the + // number of registers. + ProgInfo.NumVGPR = MaxVGPR + 1; + ProgInfo.NumSGPR = MaxSGPR + 1; + ProgInfo.VGPRBlocks = (ProgInfo.NumVGPR - 1) / 4; + ProgInfo.SGPRBlocks = (ProgInfo.NumSGPR - 1) / 8; // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode // register. ProgInfo.FloatMode = getFPMode(MF); @@ -338,22 +359,9 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, const MachineFrameInfo *FrameInfo = MF.getFrameInfo(); ProgInfo.ScratchSize = FrameInfo->estimateStackSize(MF); + ProgInfo.FlatUsed = FlatUsed; + ProgInfo.VCCUsed = VCCUsed; ProgInfo.CodeLen = CodeSize; -} - -void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF, - const SIProgramInfo &KernelInfo) { - const AMDGPUSubtarget &STM = TM.getSubtarget<AMDGPUSubtarget>(); - const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); - - unsigned RsrcReg; - switch (MFI->getShaderType()) { - default: // Fall through - case ShaderType::COMPUTE: RsrcReg = R_00B848_COMPUTE_PGM_RSRC1; break; - case ShaderType::GEOMETRY: RsrcReg = R_00B228_SPI_SHADER_PGM_RSRC1_GS; break; - case ShaderType::PIXEL: RsrcReg = R_00B028_SPI_SHADER_PGM_RSRC1_PS; break; - case ShaderType::VERTEX: RsrcReg = R_00B128_SPI_SHADER_PGM_RSRC1_VS; break; - } unsigned LDSAlignShift; if (STM.getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) { @@ -364,52 +372,188 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF, LDSAlignShift = 9; } - unsigned LDSBlocks = - RoundUpToAlignment(MFI->LDSSize, 1 << LDSAlignShift) >> LDSAlignShift; + unsigned LDSSpillSize = MFI->LDSWaveSpillSize * + MFI->getMaximumWorkGroupSize(MF); + + ProgInfo.LDSSize = MFI->LDSSize + LDSSpillSize; + ProgInfo.LDSBlocks = + RoundUpToAlignment(ProgInfo.LDSSize, 1 << LDSAlignShift) >> LDSAlignShift; // Scratch is allocated in 256 dword 
blocks. unsigned ScratchAlignShift = 10; // We need to program the hardware with the amount of scratch memory that - // is used by the entire wave. KernelInfo.ScratchSize is the amount of + // is used by the entire wave. ProgInfo.ScratchSize is the amount of // scratch memory used per thread. - unsigned ScratchBlocks = - RoundUpToAlignment(KernelInfo.ScratchSize * STM.getWavefrontSize(), + ProgInfo.ScratchBlocks = + RoundUpToAlignment(ProgInfo.ScratchSize * STM.getWavefrontSize(), 1 << ScratchAlignShift) >> ScratchAlignShift; + ProgInfo.ComputePGMRSrc1 = + S_00B848_VGPRS(ProgInfo.VGPRBlocks) | + S_00B848_SGPRS(ProgInfo.SGPRBlocks) | + S_00B848_PRIORITY(ProgInfo.Priority) | + S_00B848_FLOAT_MODE(ProgInfo.FloatMode) | + S_00B848_PRIV(ProgInfo.Priv) | + S_00B848_DX10_CLAMP(ProgInfo.DX10Clamp) | + S_00B848_IEEE_MODE(ProgInfo.DebugMode) | + S_00B848_IEEE_MODE(ProgInfo.IEEEMode); + + ProgInfo.ComputePGMRSrc2 = + S_00B84C_SCRATCH_EN(ProgInfo.ScratchBlocks > 0) | + S_00B84C_USER_SGPR(MFI->NumUserSGPRs) | + S_00B84C_TGID_X_EN(1) | + S_00B84C_TGID_Y_EN(1) | + S_00B84C_TGID_Z_EN(1) | + S_00B84C_TG_SIZE_EN(1) | + S_00B84C_TIDIG_COMP_CNT(2) | + S_00B84C_LDS_SIZE(ProgInfo.LDSBlocks); +} + +static unsigned getRsrcReg(unsigned ShaderType) { + switch (ShaderType) { + default: // Fall through + case ShaderType::COMPUTE: return R_00B848_COMPUTE_PGM_RSRC1; + case ShaderType::GEOMETRY: return R_00B228_SPI_SHADER_PGM_RSRC1_GS; + case ShaderType::PIXEL: return R_00B028_SPI_SHADER_PGM_RSRC1_PS; + case ShaderType::VERTEX: return R_00B128_SPI_SHADER_PGM_RSRC1_VS; + } +} + +void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF, + const SIProgramInfo &KernelInfo) { + const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + unsigned RsrcReg = getRsrcReg(MFI->getShaderType()); + if (MFI->getShaderType() == ShaderType::COMPUTE) { OutStreamer.EmitIntValue(R_00B848_COMPUTE_PGM_RSRC1, 4); - const uint32_t ComputePGMRSrc1 = - S_00B848_VGPRS(KernelInfo.NumVGPR / 4) | - 
S_00B848_SGPRS(KernelInfo.NumSGPR / 8) | - S_00B848_PRIORITY(KernelInfo.Priority) | - S_00B848_FLOAT_MODE(KernelInfo.FloatMode) | - S_00B848_PRIV(KernelInfo.Priv) | - S_00B848_DX10_CLAMP(KernelInfo.DX10Clamp) | - S_00B848_IEEE_MODE(KernelInfo.DebugMode) | - S_00B848_IEEE_MODE(KernelInfo.IEEEMode); - - OutStreamer.EmitIntValue(ComputePGMRSrc1, 4); + OutStreamer.EmitIntValue(KernelInfo.ComputePGMRSrc1, 4); OutStreamer.EmitIntValue(R_00B84C_COMPUTE_PGM_RSRC2, 4); - const uint32_t ComputePGMRSrc2 = - S_00B84C_LDS_SIZE(LDSBlocks) | - S_00B02C_SCRATCH_EN(ScratchBlocks > 0); - - OutStreamer.EmitIntValue(ComputePGMRSrc2, 4); + OutStreamer.EmitIntValue(KernelInfo.ComputePGMRSrc2, 4); OutStreamer.EmitIntValue(R_00B860_COMPUTE_TMPRING_SIZE, 4); - OutStreamer.EmitIntValue(S_00B860_WAVESIZE(ScratchBlocks), 4); + OutStreamer.EmitIntValue(S_00B860_WAVESIZE(KernelInfo.ScratchBlocks), 4); + + // TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 = + // 0" comment but I don't see a corresponding field in the register spec. 
} else { OutStreamer.EmitIntValue(RsrcReg, 4); - OutStreamer.EmitIntValue(S_00B028_VGPRS(KernelInfo.NumVGPR / 4) | - S_00B028_SGPRS(KernelInfo.NumSGPR / 8), 4); + OutStreamer.EmitIntValue(S_00B028_VGPRS(KernelInfo.VGPRBlocks) | + S_00B028_SGPRS(KernelInfo.SGPRBlocks), 4); } if (MFI->getShaderType() == ShaderType::PIXEL) { OutStreamer.EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4); - OutStreamer.EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(LDSBlocks), 4); + OutStreamer.EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(KernelInfo.LDSBlocks), 4); OutStreamer.EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4); OutStreamer.EmitIntValue(MFI->PSInputAddr, 4); } } + +void AMDGPUAsmPrinter::EmitAmdKernelCodeT(const MachineFunction &MF, + const SIProgramInfo &KernelInfo) const { + const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + const AMDGPUSubtarget &STM = TM.getSubtarget<AMDGPUSubtarget>(); + amd_kernel_code_t header; + + memset(&header, 0, sizeof(header)); + + header.amd_code_version_major = AMD_CODE_VERSION_MAJOR; + header.amd_code_version_minor = AMD_CODE_VERSION_MINOR; + + header.struct_byte_size = sizeof(amd_kernel_code_t); + + header.target_chip = STM.getAmdKernelCodeChipID(); + + header.kernel_code_entry_byte_offset = (1ULL << MF.getAlignment()); + + header.compute_pgm_resource_registers = + KernelInfo.ComputePGMRSrc1 | + (KernelInfo.ComputePGMRSrc2 << 32); + + // Code Properties: + header.code_properties = AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR | + AMD_CODE_PROPERTY_IS_PTR64; + + if (KernelInfo.FlatUsed) + header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT; + + if (KernelInfo.ScratchBlocks) + header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE; + + header.workitem_private_segment_byte_size = KernelInfo.ScratchSize; + header.workgroup_group_segment_byte_size = KernelInfo.LDSSize; + + // MFI->ABIArgOffset is the number of bytes for the kernel arguments + // plus 36. 
36 is the number of bytes reserved at the begining of the + // input buffer to store work-group size information. + // FIXME: We should be adding the size of the implicit arguments + // to this value. + header.kernarg_segment_byte_size = MFI->ABIArgOffset; + + header.wavefront_sgpr_count = KernelInfo.NumSGPR; + header.workitem_vgpr_count = KernelInfo.NumVGPR; + + // FIXME: What values do I put for these alignments + header.kernarg_segment_alignment = 0; + header.group_segment_alignment = 0; + header.private_segment_alignment = 0; + + header.code_type = 1; // HSA_EXT_CODE_KERNEL + + header.wavefront_size = STM.getWavefrontSize(); + + if (isVerbose()) { + OutStreamer.emitRawComment("amd_code_version_major = " + + Twine(header.amd_code_version_major), false); + OutStreamer.emitRawComment("amd_code_version_minor = " + + Twine(header.amd_code_version_minor), false); + OutStreamer.emitRawComment("struct_byte_size = " + + Twine(header.struct_byte_size), false); + OutStreamer.emitRawComment("target_chip = " + + Twine(header.target_chip), false); + OutStreamer.emitRawComment(" compute_pgm_rsrc1: " + + Twine::utohexstr(KernelInfo.ComputePGMRSrc1), false); + OutStreamer.emitRawComment(" compute_pgm_rsrc2: " + + Twine::utohexstr(KernelInfo.ComputePGMRSrc2), false); + OutStreamer.emitRawComment("enable_sgpr_private_segment_buffer = " + + Twine((bool)(header.code_properties & + AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE)), false); + OutStreamer.emitRawComment("enable_sgpr_kernarg_segment_ptr = " + + Twine((bool)(header.code_properties & + AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR)), false); + OutStreamer.emitRawComment("private_element_size = 2 ", false); + OutStreamer.emitRawComment("is_ptr64 = " + + Twine((bool)(header.code_properties & AMD_CODE_PROPERTY_IS_PTR64)), false); + OutStreamer.emitRawComment("workitem_private_segment_byte_size = " + + Twine(header.workitem_private_segment_byte_size), + false); + 
OutStreamer.emitRawComment("workgroup_group_segment_byte_size = " + + Twine(header.workgroup_group_segment_byte_size), + false); + OutStreamer.emitRawComment("gds_segment_byte_size = " + + Twine(header.gds_segment_byte_size), false); + OutStreamer.emitRawComment("kernarg_segment_byte_size = " + + Twine(header.kernarg_segment_byte_size), false); + OutStreamer.emitRawComment("wavefront_sgpr_count = " + + Twine(header.wavefront_sgpr_count), false); + OutStreamer.emitRawComment("workitem_vgpr_count = " + + Twine(header.workitem_vgpr_count), false); + OutStreamer.emitRawComment("code_type = " + Twine(header.code_type), false); + OutStreamer.emitRawComment("wavefront_size = " + + Twine((int)header.wavefront_size), false); + OutStreamer.emitRawComment("optimization_level = " + + Twine(header.optimization_level), false); + OutStreamer.emitRawComment("hsail_profile = " + + Twine(header.hsail_profile), false); + OutStreamer.emitRawComment("hsail_machine_model = " + + Twine(header.hsail_machine_model), false); + OutStreamer.emitRawComment("hsail_version_major = " + + Twine(header.hsail_version_major), false); + OutStreamer.emitRawComment("hsail_version_minor = " + + Twine(header.hsail_version_minor), false); + } + + OutStreamer.EmitBytes(StringRef((char*)&header, sizeof(header))); +} diff --git a/lib/Target/R600/AMDGPUAsmPrinter.h b/lib/Target/R600/AMDGPUAsmPrinter.h index 19907cfd013e..b360ae88f1e6 100644 --- a/lib/Target/R600/AMDGPUAsmPrinter.h +++ b/lib/Target/R600/AMDGPUAsmPrinter.h @@ -12,8 +12,8 @@ // //===----------------------------------------------------------------------===// -#ifndef AMDGPU_ASMPRINTER_H -#define AMDGPU_ASMPRINTER_H +#ifndef LLVM_LIB_TARGET_R600_AMDGPUASMPRINTER_H +#define LLVM_LIB_TARGET_R600_AMDGPUASMPRINTER_H #include "llvm/CodeGen/AsmPrinter.h" #include <vector> @@ -24,8 +24,8 @@ class AMDGPUAsmPrinter : public AsmPrinter { private: struct SIProgramInfo { SIProgramInfo() : - NumVGPR(0), - NumSGPR(0), + VGPRBlocks(0), + SGPRBlocks(0), 
Priority(0), FloatMode(0), Priv(0), @@ -33,11 +33,19 @@ private: DebugMode(0), IEEEMode(0), ScratchSize(0), + ComputePGMRSrc1(0), + LDSBlocks(0), + ScratchBlocks(0), + ComputePGMRSrc2(0), + NumVGPR(0), + NumSGPR(0), + FlatUsed(false), + VCCUsed(false), CodeLen(0) {} // Fields set in PGM_RSRC1 pm4 packet. - uint32_t NumVGPR; - uint32_t NumSGPR; + uint32_t VGPRBlocks; + uint32_t SGPRBlocks; uint32_t Priority; uint32_t FloatMode; uint32_t Priv; @@ -46,7 +54,21 @@ private: uint32_t IEEEMode; uint32_t ScratchSize; + uint64_t ComputePGMRSrc1; + + // Fields set in PGM_RSRC2 pm4 packet. + uint32_t LDSBlocks; + uint32_t ScratchBlocks; + + uint64_t ComputePGMRSrc2; + + uint32_t NumVGPR; + uint32_t NumSGPR; + uint32_t LDSSize; + bool FlatUsed; + // Bonus information for debugging. + bool VCCUsed; uint64_t CodeLen; }; @@ -59,6 +81,8 @@ private: /// can correctly setup the GPU state. void EmitProgramInfoR600(const MachineFunction &MF); void EmitProgramInfoSI(const MachineFunction &MF, const SIProgramInfo &KernelInfo); + void EmitAmdKernelCodeT(const MachineFunction &MF, + const SIProgramInfo &KernelInfo) const; public: explicit AMDGPUAsmPrinter(TargetMachine &TM, MCStreamer &Streamer); @@ -82,4 +106,4 @@ protected: } // End anonymous llvm -#endif //AMDGPU_ASMPRINTER_H +#endif diff --git a/lib/Target/R600/AMDGPUCallingConv.td b/lib/Target/R600/AMDGPUCallingConv.td index 3586c8826908..6ffa7a083583 100644 --- a/lib/Target/R600/AMDGPUCallingConv.td +++ b/lib/Target/R600/AMDGPUCallingConv.td @@ -59,16 +59,24 @@ def CC_AMDGPU_Kernel : CallingConv<[ ]>; def CC_AMDGPU : CallingConv<[ - CCIf<"State.getTarget().getSubtarget<AMDGPUSubtarget>().getGeneration() >= " - "AMDGPUSubtarget::SOUTHERN_ISLANDS && " - "State.getMachineFunction().getInfo<SIMachineFunctionInfo>()->"# - "getShaderType() == ShaderType::COMPUTE", CCDelegateTo<CC_AMDGPU_Kernel>>, - CCIf<"State.getTarget().getSubtarget<AMDGPUSubtarget>().getGeneration() < " - "AMDGPUSubtarget::SOUTHERN_ISLANDS && " - 
"State.getMachineFunction().getInfo<R600MachineFunctionInfo>()->" - "getShaderType() == ShaderType::COMPUTE", CCDelegateTo<CC_AMDGPU_Kernel>>, - CCIf<"State.getTarget().getSubtarget<AMDGPUSubtarget>()"# - ".getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS", CCDelegateTo<CC_SI>>, - CCIf<"State.getTarget().getSubtarget<AMDGPUSubtarget>()"# - ".getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS", CCDelegateTo<CC_R600>> + CCIf<"static_cast<const AMDGPUSubtarget&>" + "(State.getMachineFunction().getSubtarget()).getGeneration() >=" + "AMDGPUSubtarget::SOUTHERN_ISLANDS && " + "State.getMachineFunction().getInfo<SIMachineFunctionInfo>()" + "->getShaderType() == ShaderType::COMPUTE", + CCDelegateTo<CC_AMDGPU_Kernel>>, + CCIf<"static_cast<const AMDGPUSubtarget&>" + "(State.getMachineFunction().getSubtarget()).getGeneration() < " + "AMDGPUSubtarget::SOUTHERN_ISLANDS && " + "State.getMachineFunction().getInfo<R600MachineFunctionInfo>()" + "->getShaderType() == ShaderType::COMPUTE", + CCDelegateTo<CC_AMDGPU_Kernel>>, + CCIf<"static_cast<const AMDGPUSubtarget&>" + "(State.getMachineFunction().getSubtarget()).getGeneration() >= " + "AMDGPUSubtarget::SOUTHERN_ISLANDS", + CCDelegateTo<CC_SI>>, + CCIf<"static_cast<const AMDGPUSubtarget&>" + "(State.getMachineFunction().getSubtarget()).getGeneration() < " + "AMDGPUSubtarget::SOUTHERN_ISLANDS", + CCDelegateTo<CC_R600>> ]>; diff --git a/lib/Target/R600/AMDGPUFrameLowering.h b/lib/Target/R600/AMDGPUFrameLowering.h index d18ede5004e1..15a6636a1aea 100644 --- a/lib/Target/R600/AMDGPUFrameLowering.h +++ b/lib/Target/R600/AMDGPUFrameLowering.h @@ -12,8 +12,8 @@ /// machine. 
// //===----------------------------------------------------------------------===// -#ifndef AMDILFRAME_LOWERING_H -#define AMDILFRAME_LOWERING_H +#ifndef LLVM_LIB_TARGET_R600_AMDGPUFRAMELOWERING_H +#define LLVM_LIB_TARGET_R600_AMDGPUFRAMELOWERING_H #include "llvm/CodeGen/MachineFunction.h" #include "llvm/Target/TargetFrameLowering.h" @@ -42,4 +42,4 @@ public: bool hasFP(const MachineFunction &MF) const override; }; } // namespace llvm -#endif // AMDILFRAME_LOWERING_H +#endif diff --git a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp index cc17b7ec6183..eaa506db96c3 100644 --- a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp +++ b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp @@ -65,6 +65,7 @@ private: static bool checkPrivateAddress(const MachineMemOperand *Op); static bool isGlobalStore(const StoreSDNode *N); + static bool isFlatStore(const StoreSDNode *N); static bool isPrivateStore(const StoreSDNode *N); static bool isLocalStore(const StoreSDNode *N); static bool isRegionStore(const StoreSDNode *N); @@ -72,30 +73,49 @@ private: bool isCPLoad(const LoadSDNode *N) const; bool isConstantLoad(const LoadSDNode *N, int cbID) const; bool isGlobalLoad(const LoadSDNode *N) const; + bool isFlatLoad(const LoadSDNode *N) const; bool isParamLoad(const LoadSDNode *N) const; bool isPrivateLoad(const LoadSDNode *N) const; bool isLocalLoad(const LoadSDNode *N) const; bool isRegionLoad(const LoadSDNode *N) const; - /// \returns True if the current basic block being selected is at control - /// flow depth 0. Meaning that the current block dominates the - // exit block. 
- bool isCFDepth0() const; - const TargetRegisterClass *getOperandRegClass(SDNode *N, unsigned OpNo) const; bool SelectGlobalValueConstantOffset(SDValue Addr, SDValue& IntPtr); bool SelectGlobalValueVariableOffset(SDValue Addr, SDValue &BaseReg, SDValue& Offset); bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset); bool SelectADDRIndirect(SDValue Addr, SDValue &Base, SDValue &Offset); - bool SelectMUBUFAddr64(SDValue Addr, SDValue &Ptr, SDValue &Offset, - SDValue &ImmOffset) const; + bool isDSOffsetLegal(const SDValue &Base, unsigned Offset, + unsigned OffsetBits) const; + bool SelectDS1Addr1Offset(SDValue Ptr, SDValue &Base, SDValue &Offset) const; + bool SelectDS64Bit4ByteAligned(SDValue Ptr, SDValue &Base, SDValue &Offset0, + SDValue &Offset1) const; + void SelectMUBUF(SDValue Addr, SDValue &SRsrc, SDValue &VAddr, + SDValue &SOffset, SDValue &Offset, SDValue &Offen, + SDValue &Idxen, SDValue &Addr64, SDValue &GLC, SDValue &SLC, + SDValue &TFE) const; + bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, SDValue &VAddr, + SDValue &Offset) const; + bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, + SDValue &VAddr, SDValue &Offset, + SDValue &SLC) const; bool SelectMUBUFScratch(SDValue Addr, SDValue &RSrc, SDValue &VAddr, SDValue &SOffset, SDValue &ImmOffset) const; - bool SelectMUBUFAddr32(SDValue Addr, SDValue &SRsrc, SDValue &VAddr, - SDValue &SOffset, SDValue &Offset, SDValue &Offen, - SDValue &Idxen, SDValue &GLC, SDValue &SLC, + bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &SOffset, + SDValue &Offset, SDValue &GLC, SDValue &SLC, SDValue &TFE) const; + bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset, + SDValue &Offset, SDValue &GLC) const; + SDNode *SelectAddrSpaceCast(SDNode *N); + bool SelectVOP3Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const; + bool SelectVOP3Mods0(SDValue In, SDValue &Src, SDValue &SrcMods, + SDValue &Clamp, SDValue &Omod) const; + + bool SelectVOP3Mods0Clamp(SDValue In, 
SDValue &Src, SDValue &SrcMods, + SDValue &Omod) const; + bool SelectVOP3Mods0Clamp0OMod(SDValue In, SDValue &Src, SDValue &SrcMods, + SDValue &Clamp, + SDValue &Omod) const; SDNode *SelectADD_SUB_I64(SDNode *N); SDNode *SelectDIV_SCALE(SDNode *N); @@ -135,7 +155,8 @@ const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N, switch (N->getMachineOpcode()) { default: { - const MCInstrDesc &Desc = TM.getInstrInfo()->get(N->getMachineOpcode()); + const MCInstrDesc &Desc = + TM.getSubtargetImpl()->getInstrInfo()->get(N->getMachineOpcode()); unsigned OpIdx = Desc.getNumDefs() + OpNo; if (OpIdx >= Desc.getNumOperands()) return nullptr; @@ -143,15 +164,17 @@ const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N, if (RegClass == -1) return nullptr; - return TM.getRegisterInfo()->getRegClass(RegClass); + return TM.getSubtargetImpl()->getRegisterInfo()->getRegClass(RegClass); } case AMDGPU::REG_SEQUENCE: { unsigned RCID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); - const TargetRegisterClass *SuperRC = TM.getRegisterInfo()->getRegClass(RCID); + const TargetRegisterClass *SuperRC = + TM.getSubtargetImpl()->getRegisterInfo()->getRegClass(RCID); SDValue SubRegOp = N->getOperand(OpNo + 1); unsigned SubRegIdx = cast<ConstantSDNode>(SubRegOp)->getZExtValue(); - return TM.getRegisterInfo()->getSubClassWithSubReg(SuperRC, SubRegIdx); + return TM.getSubtargetImpl()->getRegisterInfo()->getSubClassWithSubReg( + SuperRC, SubRegIdx); } } } @@ -239,10 +262,10 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { case AMDGPUISD::BUILD_VERTICAL_VECTOR: case ISD::BUILD_VECTOR: { unsigned RegClassID; - const AMDGPURegisterInfo *TRI = - static_cast<const AMDGPURegisterInfo*>(TM.getRegisterInfo()); - const SIRegisterInfo *SIRI = - static_cast<const SIRegisterInfo*>(TM.getRegisterInfo()); + const AMDGPURegisterInfo *TRI = static_cast<const AMDGPURegisterInfo *>( + TM.getSubtargetImpl()->getRegisterInfo()); + const SIRegisterInfo *SIRI = 
static_cast<const SIRegisterInfo *>( + TM.getSubtargetImpl()->getRegisterInfo()); EVT VT = N->getValueType(0); unsigned NumVectorElts = VT.getVectorNumElements(); EVT EltVT = VT.getVectorElementType(); @@ -263,7 +286,7 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { } } switch(NumVectorElts) { - case 1: RegClassID = UseVReg ? AMDGPU::VReg_32RegClassID : + case 1: RegClassID = UseVReg ? AMDGPU::VGPR_32RegClassID : AMDGPU::SReg_32RegClassID; break; case 2: RegClassID = UseVReg ? AMDGPU::VReg_64RegClassID : @@ -470,7 +493,16 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { case AMDGPUISD::DIV_SCALE: { return SelectDIV_SCALE(N); } + case ISD::CopyToReg: { + const SITargetLowering& Lowering = + *static_cast<const SITargetLowering*>(getTargetLowering()); + Lowering.legalizeTargetIndependentNode(N, *CurDAG); + break; + } + case ISD::ADDRSPACECAST: + return SelectAddrSpaceCast(N); } + return SelectCode(N); } @@ -508,6 +540,10 @@ bool AMDGPUDAGToDAGISel::isLocalStore(const StoreSDNode *N) { return checkType(N->getMemOperand()->getValue(), AMDGPUAS::LOCAL_ADDRESS); } +bool AMDGPUDAGToDAGISel::isFlatStore(const StoreSDNode *N) { + return checkType(N->getMemOperand()->getValue(), AMDGPUAS::FLAT_ADDRESS); +} + bool AMDGPUDAGToDAGISel::isRegionStore(const StoreSDNode *N) { return checkType(N->getMemOperand()->getValue(), AMDGPUAS::REGION_ADDRESS); } @@ -539,6 +575,10 @@ bool AMDGPUDAGToDAGISel::isLocalLoad(const LoadSDNode *N) const { return checkType(N->getMemOperand()->getValue(), AMDGPUAS::LOCAL_ADDRESS); } +bool AMDGPUDAGToDAGISel::isFlatLoad(const LoadSDNode *N) const { + return checkType(N->getMemOperand()->getValue(), AMDGPUAS::FLAT_ADDRESS); +} + bool AMDGPUDAGToDAGISel::isRegionLoad(const LoadSDNode *N) const { return checkType(N->getMemOperand()->getValue(), AMDGPUAS::REGION_ADDRESS); } @@ -568,23 +608,16 @@ bool AMDGPUDAGToDAGISel::isPrivateLoad(const LoadSDNode *N) const { const Value *MemVal = N->getMemOperand()->getValue(); if (!checkType(MemVal, 
AMDGPUAS::LOCAL_ADDRESS) && !checkType(MemVal, AMDGPUAS::GLOBAL_ADDRESS) && + !checkType(MemVal, AMDGPUAS::FLAT_ADDRESS) && !checkType(MemVal, AMDGPUAS::REGION_ADDRESS) && !checkType(MemVal, AMDGPUAS::CONSTANT_ADDRESS) && !checkType(MemVal, AMDGPUAS::PARAM_D_ADDRESS) && - !checkType(MemVal, AMDGPUAS::PARAM_I_ADDRESS)){ + !checkType(MemVal, AMDGPUAS::PARAM_I_ADDRESS)) { return true; } return false; } -bool AMDGPUDAGToDAGISel::isCFDepth0() const { - // FIXME: Figure out a way to use DominatorTree analysis here. - const BasicBlock *CurBlock = FuncInfo->MBB->getBasicBlock(); - const Function *Fn = FuncInfo->Fn; - return &Fn->front() == CurBlock || &Fn->back() == CurBlock; -} - - const char *AMDGPUDAGToDAGISel::getPassName() const { return "AMDGPU DAG->DAG Pattern Instruction Selection"; } @@ -687,14 +720,9 @@ SDNode *AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) { SDValue AddLoArgs[] = { SDValue(Lo0, 0), SDValue(Lo1, 0) }; - unsigned Opc = IsAdd ? AMDGPU::S_ADD_I32 : AMDGPU::S_SUB_I32; + unsigned Opc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32; unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32; - if (!isCFDepth0()) { - Opc = IsAdd ? AMDGPU::V_ADD_I32_e32 : AMDGPU::V_SUB_I32_e32; - CarryOpc = IsAdd ? AMDGPU::V_ADDC_U32_e32 : AMDGPU::V_SUBB_U32_e32; - } - SDNode *AddLo = CurDAG->getMachineNode( Opc, DL, VTList, AddLoArgs); SDValue Carry(AddLo, 1); SDNode *AddHi @@ -721,34 +749,134 @@ SDNode *AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) { = (VT == MVT::f64) ? 
AMDGPU::V_DIV_SCALE_F64 : AMDGPU::V_DIV_SCALE_F32; const SDValue Zero = CurDAG->getTargetConstant(0, MVT::i32); - + const SDValue False = CurDAG->getTargetConstant(0, MVT::i1); SDValue Ops[] = { - N->getOperand(0), - N->getOperand(1), - N->getOperand(2), - Zero, - Zero, - Zero, - Zero + Zero, // src0_modifiers + N->getOperand(0), // src0 + Zero, // src1_modifiers + N->getOperand(1), // src1 + Zero, // src2_modifiers + N->getOperand(2), // src2 + False, // clamp + Zero // omod }; return CurDAG->SelectNodeTo(N, Opc, VT, MVT::i1, Ops); } -static SDValue wrapAddr64Rsrc(SelectionDAG *DAG, SDLoc DL, SDValue Ptr) { - return SDValue(DAG->getMachineNode(AMDGPU::SI_ADDR64_RSRC, DL, MVT::v4i32, - Ptr), 0); +bool AMDGPUDAGToDAGISel::isDSOffsetLegal(const SDValue &Base, unsigned Offset, + unsigned OffsetBits) const { + const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>(); + if ((OffsetBits == 16 && !isUInt<16>(Offset)) || + (OffsetBits == 8 && !isUInt<8>(Offset))) + return false; + + if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) + return true; + + // On Southern Islands instruction with a negative base value and an offset + // don't seem to work. + return CurDAG->SignBitIsZero(Base); +} + +bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base, + SDValue &Offset) const { + if (CurDAG->isBaseWithConstantOffset(Addr)) { + SDValue N0 = Addr.getOperand(0); + SDValue N1 = Addr.getOperand(1); + ConstantSDNode *C1 = cast<ConstantSDNode>(N1); + if (isDSOffsetLegal(N0, C1->getSExtValue(), 16)) { + // (add n0, c0) + Base = N0; + Offset = N1; + return true; + } + } + + // If we have a constant address, prefer to put the constant into the + // offset. This can save moves to load the constant address since multiple + // operations can share the zero base address register, and enables merging + // into read2 / write2 instructions. 
+ if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) { + if (isUInt<16>(CAddr->getZExtValue())) { + SDValue Zero = CurDAG->getTargetConstant(0, MVT::i32); + MachineSDNode *MovZero = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, + SDLoc(Addr), MVT::i32, Zero); + Base = SDValue(MovZero, 0); + Offset = Addr; + return true; + } + } + + // default case + Base = Addr; + Offset = CurDAG->getTargetConstant(0, MVT::i16); + return true; +} + +bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base, + SDValue &Offset0, + SDValue &Offset1) const { + if (CurDAG->isBaseWithConstantOffset(Addr)) { + SDValue N0 = Addr.getOperand(0); + SDValue N1 = Addr.getOperand(1); + ConstantSDNode *C1 = cast<ConstantSDNode>(N1); + unsigned DWordOffset0 = C1->getZExtValue() / 4; + unsigned DWordOffset1 = DWordOffset0 + 1; + // (add n0, c0) + if (isDSOffsetLegal(N0, DWordOffset1, 8)) { + Base = N0; + Offset0 = CurDAG->getTargetConstant(DWordOffset0, MVT::i8); + Offset1 = CurDAG->getTargetConstant(DWordOffset1, MVT::i8); + return true; + } + } + + if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) { + unsigned DWordOffset0 = CAddr->getZExtValue() / 4; + unsigned DWordOffset1 = DWordOffset0 + 1; + assert(4 * DWordOffset0 == CAddr->getZExtValue()); + + if (isUInt<8>(DWordOffset0) && isUInt<8>(DWordOffset1)) { + SDValue Zero = CurDAG->getTargetConstant(0, MVT::i32); + MachineSDNode *MovZero + = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, + SDLoc(Addr), MVT::i32, Zero); + Base = SDValue(MovZero, 0); + Offset0 = CurDAG->getTargetConstant(DWordOffset0, MVT::i8); + Offset1 = CurDAG->getTargetConstant(DWordOffset1, MVT::i8); + return true; + } + } + + // default case + Base = Addr; + Offset0 = CurDAG->getTargetConstant(0, MVT::i8); + Offset1 = CurDAG->getTargetConstant(1, MVT::i8); + return true; } static bool isLegalMUBUFImmOffset(const ConstantSDNode *Imm) { return isUInt<12>(Imm->getZExtValue()); } -bool 
AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &Ptr, - SDValue &Offset, - SDValue &ImmOffset) const { +void AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, + SDValue &VAddr, SDValue &SOffset, + SDValue &Offset, SDValue &Offen, + SDValue &Idxen, SDValue &Addr64, + SDValue &GLC, SDValue &SLC, + SDValue &TFE) const { SDLoc DL(Addr); + GLC = CurDAG->getTargetConstant(0, MVT::i1); + SLC = CurDAG->getTargetConstant(0, MVT::i1); + TFE = CurDAG->getTargetConstant(0, MVT::i1); + + Idxen = CurDAG->getTargetConstant(0, MVT::i1); + Offen = CurDAG->getTargetConstant(0, MVT::i1); + Addr64 = CurDAG->getTargetConstant(0, MVT::i1); + SOffset = CurDAG->getTargetConstant(0, MVT::i32); + if (CurDAG->isBaseWithConstantOffset(Addr)) { SDValue N0 = Addr.getOperand(0); SDValue N1 = Addr.getOperand(1); @@ -757,57 +885,69 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &Ptr, if (isLegalMUBUFImmOffset(C1)) { if (N0.getOpcode() == ISD::ADD) { - // (add (add N2, N3), C1) + // (add (add N2, N3), C1) -> addr64 SDValue N2 = N0.getOperand(0); SDValue N3 = N0.getOperand(1); - Ptr = wrapAddr64Rsrc(CurDAG, DL, N2); - Offset = N3; - ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), MVT::i16); - return true; + Addr64 = CurDAG->getTargetConstant(1, MVT::i1); + Ptr = N2; + VAddr = N3; + Offset = CurDAG->getTargetConstant(C1->getZExtValue(), MVT::i16); + return; } - // (add N0, C1) - Ptr = wrapAddr64Rsrc(CurDAG, DL, CurDAG->getTargetConstant(0, MVT::i64));; - Offset = N0; - ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), MVT::i16); - return true; + // (add N0, C1) -> offset + VAddr = CurDAG->getTargetConstant(0, MVT::i32); + Ptr = N0; + Offset = CurDAG->getTargetConstant(C1->getZExtValue(), MVT::i16); + return; } } if (Addr.getOpcode() == ISD::ADD) { - // (add N0, N1) + // (add N0, N1) -> addr64 SDValue N0 = Addr.getOperand(0); SDValue N1 = Addr.getOperand(1); - Ptr = wrapAddr64Rsrc(CurDAG, DL, N0); - Offset = N1; - ImmOffset = 
CurDAG->getTargetConstant(0, MVT::i16); - return true; + Addr64 = CurDAG->getTargetConstant(1, MVT::i1); + Ptr = N0; + VAddr = N1; + Offset = CurDAG->getTargetConstant(0, MVT::i16); + return; } - // default case - Ptr = wrapAddr64Rsrc(CurDAG, DL, CurDAG->getConstant(0, MVT::i64)); - Offset = Addr; - ImmOffset = CurDAG->getTargetConstant(0, MVT::i16); - return true; + // default case -> offset + VAddr = CurDAG->getTargetConstant(0, MVT::i32); + Ptr = Addr; + Offset = CurDAG->getTargetConstant(0, MVT::i16); + } -/// \brief Return a resource descriptor with the 'Add TID' bit enabled -/// The TID (Thread ID) is multipled by the stride value (bits [61:48] -/// of the resource descriptor) to create an offset, which is added to the -/// resource ponter. -static SDValue buildScratchRSRC(SelectionDAG *DAG, SDLoc DL, SDValue Ptr) { +bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, + SDValue &VAddr, + SDValue &Offset) const { + SDValue Ptr, SOffset, Offen, Idxen, Addr64, GLC, SLC, TFE; + + SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64, + GLC, SLC, TFE); + + ConstantSDNode *C = cast<ConstantSDNode>(Addr64); + if (C->getSExtValue()) { + SDLoc DL(Addr); + + const SITargetLowering& Lowering = + *static_cast<const SITargetLowering*>(getTargetLowering()); + + SRsrc = SDValue(Lowering.wrapAddr64Rsrc(*CurDAG, DL, Ptr), 0); + return true; + } - uint64_t Rsrc = AMDGPU::RSRC_DATA_FORMAT | AMDGPU::RSRC_TID_ENABLE | - 0xffffffff; + return false; +} - SDValue PtrLo = DAG->getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr); - SDValue PtrHi = DAG->getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr); - SDValue DataLo = DAG->getTargetConstant( - Rsrc & APInt::getAllOnesValue(32).getZExtValue(), MVT::i32); - SDValue DataHi = DAG->getTargetConstant(Rsrc >> 32, MVT::i32); +bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, + SDValue &VAddr, SDValue &Offset, + SDValue &SLC) const { + SLC = CurDAG->getTargetConstant(0, 
MVT::i1); - const SDValue Ops[] = { PtrLo, PtrHi, DataLo, DataHi }; - return SDValue(DAG->getMachineNode(AMDGPU::SI_BUFFER_RSRC, DL, - MVT::v4i32, Ops), 0); + return SelectMUBUFAddr64(Addr, SRsrc, VAddr, Offset); } bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc, @@ -816,16 +956,23 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc, SDLoc DL(Addr); MachineFunction &MF = CurDAG->getMachineFunction(); - const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo*>(MF.getTarget().getRegisterInfo()); + const SIRegisterInfo *TRI = + static_cast<const SIRegisterInfo *>(MF.getSubtarget().getRegisterInfo()); MachineRegisterInfo &MRI = MF.getRegInfo(); - + const SITargetLowering& Lowering = + *static_cast<const SITargetLowering*>(getTargetLowering()); unsigned ScratchPtrReg = TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_PTR); unsigned ScratchOffsetReg = TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_WAVE_OFFSET); + Lowering.CreateLiveInRegister(*CurDAG, &AMDGPU::SReg_32RegClass, + ScratchOffsetReg, MVT::i32); - Rsrc = buildScratchRSRC(CurDAG, DL, CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL, MRI.getLiveInVirtReg(ScratchPtrReg), MVT::i64)); + SDValue ScratchPtr = + CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL, + MRI.getLiveInVirtReg(ScratchPtrReg), MVT::i64); + Rsrc = SDValue(Lowering.buildScratchRSRC(*CurDAG, DL, ScratchPtr), 0); SOffset = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL, MRI.getLiveInVirtReg(ScratchOffsetReg), MVT::i32); @@ -863,20 +1010,150 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc, return true; } -bool AMDGPUDAGToDAGISel::SelectMUBUFAddr32(SDValue Addr, SDValue &SRsrc, - SDValue &VAddr, SDValue &SOffset, - SDValue &Offset, SDValue &Offen, - SDValue &Idxen, SDValue &GLC, - SDValue &SLC, SDValue &TFE) const { +bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, + SDValue &SOffset, SDValue &Offset, + SDValue &GLC, SDValue &SLC, + 
SDValue &TFE) const { + SDValue Ptr, VAddr, Offen, Idxen, Addr64; + const SIInstrInfo *TII = + static_cast<const SIInstrInfo *>(Subtarget.getInstrInfo()); - GLC = CurDAG->getTargetConstant(0, MVT::i1); - SLC = CurDAG->getTargetConstant(0, MVT::i1); - TFE = CurDAG->getTargetConstant(0, MVT::i1); + SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64, + GLC, SLC, TFE); - Idxen = CurDAG->getTargetConstant(0, MVT::i1); - Offen = CurDAG->getTargetConstant(1, MVT::i1); + if (!cast<ConstantSDNode>(Offen)->getSExtValue() && + !cast<ConstantSDNode>(Idxen)->getSExtValue() && + !cast<ConstantSDNode>(Addr64)->getSExtValue()) { + uint64_t Rsrc = TII->getDefaultRsrcDataFormat() | + APInt::getAllOnesValue(32).getZExtValue(); // Size + SDLoc DL(Addr); + + const SITargetLowering& Lowering = + *static_cast<const SITargetLowering*>(getTargetLowering()); + + SRsrc = SDValue(Lowering.buildRSRC(*CurDAG, DL, Ptr, 0, Rsrc), 0); + return true; + } + return false; +} + +bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, + SDValue &Soffset, SDValue &Offset, + SDValue &GLC) const { + SDValue SLC, TFE; + + return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE); +} + +// FIXME: This is incorrect and only enough to be able to compile. 
+SDNode *AMDGPUDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) { + AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(N); + SDLoc DL(N); + + assert(Subtarget.hasFlatAddressSpace() && + "addrspacecast only supported with flat address space!"); + + assert((ASC->getSrcAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS && + ASC->getDestAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS) && + "Cannot cast address space to / from constant address!"); + + assert((ASC->getSrcAddressSpace() == AMDGPUAS::FLAT_ADDRESS || + ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) && + "Can only cast to / from flat address space!"); + + // The flat instructions read the address as the index of the VGPR holding the + // address, so casting should just be reinterpreting the base VGPR, so just + // insert trunc / bitcast / zext. + + SDValue Src = ASC->getOperand(0); + EVT DestVT = ASC->getValueType(0); + EVT SrcVT = Src.getValueType(); + + unsigned SrcSize = SrcVT.getSizeInBits(); + unsigned DestSize = DestVT.getSizeInBits(); + + if (SrcSize > DestSize) { + assert(SrcSize == 64 && DestSize == 32); + return CurDAG->getMachineNode( + TargetOpcode::EXTRACT_SUBREG, + DL, + DestVT, + Src, + CurDAG->getTargetConstant(AMDGPU::sub0, MVT::i32)); + } + + + if (DestSize > SrcSize) { + assert(SrcSize == 32 && DestSize == 64); + + // FIXME: This is probably wrong, we should never be defining + // a register class with both VGPRs and SGPRs + SDValue RC = CurDAG->getTargetConstant(AMDGPU::VS_64RegClassID, MVT::i32); + + const SDValue Ops[] = { + RC, + Src, + CurDAG->getTargetConstant(AMDGPU::sub0, MVT::i32), + SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SDLoc(N), MVT::i32, + CurDAG->getConstant(0, MVT::i32)), 0), + CurDAG->getTargetConstant(AMDGPU::sub1, MVT::i32) + }; + + return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, + SDLoc(N), N->getValueType(0), Ops); + } + + assert(SrcSize == 64 && DestSize == 64); + return CurDAG->getNode(ISD::BITCAST, DL, DestVT, Src).getNode(); +} + +bool 
AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src, + SDValue &SrcMods) const { + + unsigned Mods = 0; + + Src = In; + + if (Src.getOpcode() == ISD::FNEG) { + Mods |= SISrcMods::NEG; + Src = Src.getOperand(0); + } + + if (Src.getOpcode() == ISD::FABS) { + Mods |= SISrcMods::ABS; + Src = Src.getOperand(0); + } + + SrcMods = CurDAG->getTargetConstant(Mods, MVT::i32); + + return true; +} + +bool AMDGPUDAGToDAGISel::SelectVOP3Mods0(SDValue In, SDValue &Src, + SDValue &SrcMods, SDValue &Clamp, + SDValue &Omod) const { + // FIXME: Handle Clamp and Omod + Clamp = CurDAG->getTargetConstant(0, MVT::i32); + Omod = CurDAG->getTargetConstant(0, MVT::i32); + + return SelectVOP3Mods(In, Src, SrcMods); +} + +bool AMDGPUDAGToDAGISel::SelectVOP3Mods0Clamp(SDValue In, SDValue &Src, + SDValue &SrcMods, + SDValue &Omod) const { + // FIXME: Handle Omod + Omod = CurDAG->getTargetConstant(0, MVT::i32); + + return SelectVOP3Mods(In, Src, SrcMods); +} - return SelectMUBUFScratch(Addr, SRsrc, VAddr, SOffset, Offset); +bool AMDGPUDAGToDAGISel::SelectVOP3Mods0Clamp0OMod(SDValue In, SDValue &Src, + SDValue &SrcMods, + SDValue &Clamp, + SDValue &Omod) const { + Clamp = Omod = CurDAG->getTargetConstant(0, MVT::i32); + return SelectVOP3Mods(In, Src, SrcMods); } void AMDGPUDAGToDAGISel::PostprocessISelDAG() { diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp index 5a46297b6032..206050d54a02 100644 --- a/lib/Target/R600/AMDGPUISelLowering.cpp +++ b/lib/Target/R600/AMDGPUISelLowering.cpp @@ -103,7 +103,7 @@ EVT AMDGPUTargetLowering::getEquivalentLoadRegType(LLVMContext &Ctx, EVT VT) { } AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : - TargetLowering(TM, new TargetLoweringObjectFileELF()) { + TargetLowering(TM) { Subtarget = &TM.getSubtarget<AMDGPUSubtarget>(); @@ -130,6 +130,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : setOperationAction(ISD::FROUND, MVT::f32, Legal); setOperationAction(ISD::FTRUNC, 
MVT::f32, Legal); + setOperationAction(ISD::FREM, MVT::f32, Custom); + setOperationAction(ISD::FREM, MVT::f64, Custom); + // Lower floating point store/load to integer store/load to reduce the number // of patterns in tablegen. setOperationAction(ISD::STORE, MVT::f32, Promote); @@ -213,18 +216,28 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom); setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i32, Custom); - setLoadExtAction(ISD::EXTLOAD, MVT::v2i8, Expand); - setLoadExtAction(ISD::SEXTLOAD, MVT::v2i8, Expand); - setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i8, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::v4i8, Expand); - setLoadExtAction(ISD::SEXTLOAD, MVT::v4i8, Expand); - setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i8, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::v2i16, Expand); - setLoadExtAction(ISD::SEXTLOAD, MVT::v2i16, Expand); - setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i16, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, Expand); - setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, Expand); - setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i16, Expand); + // There are no 64-bit extloads. These should be done as a 32-bit extload and + // an extension to 64-bit. 
+ for (MVT VT : MVT::integer_valuetypes()) { + setLoadExtAction(ISD::EXTLOAD, MVT::i64, VT, Expand); + setLoadExtAction(ISD::SEXTLOAD, MVT::i64, VT, Expand); + setLoadExtAction(ISD::ZEXTLOAD, MVT::i64, VT, Expand); + } + + for (MVT VT : MVT::integer_vector_valuetypes()) { + setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Expand); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Expand); + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i8, Expand); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Expand); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Expand); + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i8, Expand); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Expand); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Expand); + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i16, Expand); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Expand); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Expand); + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i16, Expand); + } setOperationAction(ISD::BR_CC, MVT::i1, Expand); @@ -243,7 +256,8 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::f16, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand); setTruncStoreAction(MVT::f32, MVT::f16, Expand); setTruncStoreAction(MVT::f64, MVT::f16, Expand); @@ -282,6 +296,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : setOperationAction(ISD::UDIV, MVT::i32, Expand); setOperationAction(ISD::UREM, MVT::i32, Expand); setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom); + setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom); + setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom); + setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom); setOperationAction(ISD::SELECT_CC, MVT::i64, Expand); if (!Subtarget->hasFFBH()) @@ -310,7 +327,6 @@ 
AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : setOperationAction(ISD::SUB, VT, Expand); setOperationAction(ISD::SINT_TO_FP, VT, Expand); setOperationAction(ISD::UINT_TO_FP, VT, Expand); - // TODO: Implement custom UREM / SREM routines. setOperationAction(ISD::SDIV, VT, Expand); setOperationAction(ISD::UDIV, VT, Expand); setOperationAction(ISD::SREM, VT, Expand); @@ -342,12 +358,15 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : for (MVT VT : FloatVectorTypes) { setOperationAction(ISD::FABS, VT, Expand); + setOperationAction(ISD::FMINNUM, VT, Expand); + setOperationAction(ISD::FMAXNUM, VT, Expand); setOperationAction(ISD::FADD, VT, Expand); setOperationAction(ISD::FCEIL, VT, Expand); setOperationAction(ISD::FCOS, VT, Expand); setOperationAction(ISD::FDIV, VT, Expand); setOperationAction(ISD::FEXP2, VT, Expand); setOperationAction(ISD::FLOG2, VT, Expand); + setOperationAction(ISD::FREM, VT, Expand); setOperationAction(ISD::FPOW, VT, Expand); setOperationAction(ISD::FFLOOR, VT, Expand); setOperationAction(ISD::FTRUNC, VT, Expand); @@ -370,22 +389,29 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : setOperationAction(ISD::FNEARBYINT, MVT::f64, Custom); setTargetDAGCombine(ISD::MUL); + setTargetDAGCombine(ISD::SELECT); setTargetDAGCombine(ISD::SELECT_CC); setTargetDAGCombine(ISD::STORE); + setBooleanContents(ZeroOrNegativeOneBooleanContent); + setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); + setSchedulingPreference(Sched::RegPressure); setJumpIsExpensive(true); + // SI at least has hardware support for floating point exceptions, but no way + // of using or handling them is implemented. They are also optional in OpenCL + // (Section 7.3) + setHasFloatingPointExceptions(false); + setSelectIsExpensive(false); PredictableSelectIsExpensive = false; // There are no integer divide instructions, and these expand to a pretty // large sequence of instructions. 
setIntDivIsCheap(false); - setPow2DivIsCheap(false); - - // TODO: Investigate this when 64-bit divides are implemented. - addBypassSlowDiv(64, 32); + setPow2SDivIsCheap(false); + setFsqrtIsCheap(true); // FIXME: Need to really handle these. MaxStoresPerMemcpy = 4096; @@ -418,6 +444,29 @@ bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const { return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64); } +bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N, + ISD::LoadExtType, + EVT NewVT) const { + + unsigned NewSize = NewVT.getStoreSizeInBits(); + + // If we are reducing to a 32-bit load, this is always better. + if (NewSize == 32) + return true; + + EVT OldVT = N->getValueType(0); + unsigned OldSize = OldVT.getStoreSizeInBits(); + + // Don't produce extloads from sub 32-bit types. SI doesn't have scalar + // extloads, so doing one requires using a buffer_load. In cases where we + // still couldn't use a scalar load, using the wider load shouldn't really + // hurt anything. + + // If the old size already had to be an extload, there's no harm in continuing + // to reduce the width. + return (OldSize < 32); +} + bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy, EVT CastTy) const { if (LoadTy.getSizeInBits() != CastTy.getSizeInBits()) @@ -431,18 +480,30 @@ bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy, (LScalarSize < 32)); } +// SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also +// profitable with the expansion for 64-bit since it's generally good to +// speculate things. +// FIXME: These should really have the size as a parameter. 
+bool AMDGPUTargetLowering::isCheapToSpeculateCttz() const { + return true; +} + +bool AMDGPUTargetLowering::isCheapToSpeculateCtlz() const { + return true; +} + //===---------------------------------------------------------------------===// // Target Properties //===---------------------------------------------------------------------===// bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const { assert(VT.isFloatingPoint()); - return VT == MVT::f32; + return VT == MVT::f32 || VT == MVT::f64; } bool AMDGPUTargetLowering::isFNegFree(EVT VT) const { assert(VT.isFloatingPoint()); - return VT == MVT::f32; + return VT == MVT::f32 || VT == MVT::f64; } bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const { @@ -542,16 +603,18 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG); case ISD::FrameIndex: return LowerFrameIndex(Op, DAG); case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); - case ISD::SDIV: return LowerSDIV(Op, DAG); - case ISD::SREM: return LowerSREM(Op, DAG); case ISD::UDIVREM: return LowerUDIVREM(Op, DAG); case ISD::SDIVREM: return LowerSDIVREM(Op, DAG); + case ISD::FREM: return LowerFREM(Op, DAG); case ISD::FCEIL: return LowerFCEIL(Op, DAG); case ISD::FTRUNC: return LowerFTRUNC(Op, DAG); case ISD::FRINT: return LowerFRINT(Op, DAG); case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG); case ISD::FFLOOR: return LowerFFLOOR(Op, DAG); + case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); + case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG); + case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG); } return Op; } @@ -606,7 +669,7 @@ SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init, const SDValue &InitPtr, SDValue Chain, SelectionDAG &DAG) const { - const DataLayout *TD = getTargetMachine().getDataLayout(); + const DataLayout *TD = 
getTargetMachine().getSubtargetImpl()->getDataLayout(); SDLoc DL(InitPtr); Type *InitTy = Init->getType(); @@ -679,22 +742,35 @@ SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init, llvm_unreachable("Unhandled constant initializer"); } +static bool hasDefinedInitializer(const GlobalValue *GV) { + const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV); + if (!GVar || !GVar->hasInitializer()) + return false; + + if (isa<UndefValue>(GVar->getInitializer())) + return false; + + return true; +} + SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI, SDValue Op, SelectionDAG &DAG) const { - const DataLayout *TD = getTargetMachine().getDataLayout(); + const DataLayout *TD = getTargetMachine().getSubtargetImpl()->getDataLayout(); GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op); const GlobalValue *GV = G->getGlobal(); switch (G->getAddressSpace()) { - default: llvm_unreachable("Global Address lowering not implemented for this " - "address space"); case AMDGPUAS::LOCAL_ADDRESS: { // XXX: What does the value of G->getOffset() mean? assert(G->getOffset() == 0 && "Do not know what to do with an non-zero offset"); + // TODO: We could emit code to handle the initialization somewhere. 
+ if (hasDefinedInitializer(GV)) + break; + unsigned Offset; if (MFI->LocalMemoryObjects.count(GV) == 0) { uint64_t Size = TD->getTypeAllocSize(GV->getType()->getElementType()); @@ -706,7 +782,7 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI, Offset = MFI->LocalMemoryObjects[GV]; } - return DAG.getConstant(Offset, getPointerTy(G->getAddressSpace())); + return DAG.getConstant(Offset, getPointerTy(AMDGPUAS::LOCAL_ADDRESS)); } case AMDGPUAS::CONSTANT_ADDRESS: { MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo(); @@ -748,6 +824,12 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI, return DAG.getZExtOrTrunc(InitPtr, SDLoc(Op), ConstPtrVT); } } + + const Function &Fn = *DAG.getMachineFunction().getFunction(); + DiagnosticInfoUnsupported BadInit(Fn, + "initializer for address space"); + DAG.getContext()->diagnose(BadInit); + return SDValue(); } SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op, @@ -778,8 +860,8 @@ SDValue AMDGPUTargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); - const AMDGPUFrameLowering *TFL = - static_cast<const AMDGPUFrameLowering*>(getTargetMachine().getFrameLowering()); + const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering *>( + getTargetMachine().getSubtargetImpl()->getFrameLowering()); FrameIndexSDNode *FIN = cast<FrameIndexSDNode>(Op); @@ -821,13 +903,21 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, // first parameter must be the same as the first instruction. SDValue Numerator = Op.getOperand(1); SDValue Denominator = Op.getOperand(2); + + // Note this order is opposite of the machine instruction's operations, + // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The + // intrinsic has the numerator as the first operand to match a normal + // division operation. + SDValue Src0 = Param->isAllOnesValue() ? 
Numerator : Denominator; - return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, VT, - Src0, Denominator, Numerator); + return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0, + Denominator, Numerator); } case Intrinsic::AMDGPU_div_fmas: + // FIXME: Dropping bool parameter. Work is needed to support the implicit + // read from VCC. return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT, Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); @@ -849,7 +939,23 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1)); case Intrinsic::AMDGPU_rsq_clamped: - return DAG.getNode(AMDGPUISD::RSQ_CLAMPED, DL, VT, Op.getOperand(1)); + if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + Type *Type = VT.getTypeForEVT(*DAG.getContext()); + APFloat Max = APFloat::getLargest(Type->getFltSemantics()); + APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true); + + SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1)); + SDValue Tmp = DAG.getNode(ISD::FMINNUM, DL, VT, Rsq, + DAG.getConstantFP(Max, VT)); + return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp, + DAG.getConstantFP(Min, VT)); + } else { + return DAG.getNode(AMDGPUISD::RSQ_CLAMPED, DL, VT, Op.getOperand(1)); + } + + case Intrinsic::AMDGPU_ldexp: + return DAG.getNode(AMDGPUISD::LDEXP, DL, VT, Op.getOperand(1), + Op.getOperand(2)); case AMDGPUIntrinsic::AMDGPU_imax: return DAG.getNode(AMDGPUISD::SMAX, DL, VT, Op.getOperand(1), @@ -918,6 +1024,10 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, case AMDGPUIntrinsic::AMDGPU_brev: return DAG.getNode(AMDGPUISD::BREV, DL, VT, Op.getOperand(1)); + case Intrinsic::AMDGPU_class: + return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT, + Op.getOperand(1), Op.getOperand(2)); + case AMDGPUIntrinsic::AMDIL_exp: // Legacy name. 
return DAG.getNode(ISD::FEXP2, DL, VT, Op.getOperand(1)); @@ -956,22 +1066,21 @@ SDValue AMDGPUTargetLowering::LowerIntrinsicLRP(SDValue Op, } /// \brief Generate Min/Max node -SDValue AMDGPUTargetLowering::CombineMinMax(SDNode *N, - SelectionDAG &DAG) const { - SDLoc DL(N); - EVT VT = N->getValueType(0); - - SDValue LHS = N->getOperand(0); - SDValue RHS = N->getOperand(1); - SDValue True = N->getOperand(2); - SDValue False = N->getOperand(3); - SDValue CC = N->getOperand(4); +SDValue AMDGPUTargetLowering::CombineFMinMaxLegacy(SDLoc DL, + EVT VT, + SDValue LHS, + SDValue RHS, + SDValue True, + SDValue False, + SDValue CC, + DAGCombinerInfo &DCI) const { + if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) + return SDValue(); - if (VT != MVT::f32 || - !((LHS == True && RHS == False) || (LHS == False && RHS == True))) { + if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True)) return SDValue(); - } + SelectionDAG &DAG = DCI.DAG; ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get(); switch (CCOpcode) { case ISD::SETOEQ: @@ -986,24 +1095,49 @@ SDValue AMDGPUTargetLowering::CombineMinMax(SDNode *N, case ISD::SETTRUE2: case ISD::SETUO: case ISD::SETO: - llvm_unreachable("Operation should already be optimised!"); + break; case ISD::SETULE: - case ISD::SETULT: + case ISD::SETULT: { + if (LHS == True) + return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS); + return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS); + } case ISD::SETOLE: case ISD::SETOLT: case ISD::SETLE: case ISD::SETLT: { - unsigned Opc = (LHS == True) ? AMDGPUISD::FMIN : AMDGPUISD::FMAX; - return DAG.getNode(Opc, DL, VT, LHS, RHS); + // Ordered. Assume ordered for undefined. + + // Only do this after legalization to avoid interfering with other combines + // which might occur. + if (DCI.getDAGCombineLevel() < AfterLegalizeDAG && + !DCI.isCalledByLegalizer()) + return SDValue(); + + // We need to permute the operands to get the correct NaN behavior. 
The + // selected operand is the second one based on the failing compare with NaN, + // so permute it based on the compare type the hardware uses. + if (LHS == True) + return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS); + return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS); + } + case ISD::SETUGE: + case ISD::SETUGT: { + if (LHS == True) + return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS); + return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS); } case ISD::SETGT: case ISD::SETGE: - case ISD::SETUGE: case ISD::SETOGE: - case ISD::SETUGT: case ISD::SETOGT: { - unsigned Opc = (LHS == True) ? AMDGPUISD::FMAX : AMDGPUISD::FMIN; - return DAG.getNode(Opc, DL, VT, LHS, RHS); + if (DCI.getDAGCombineLevel() < AfterLegalizeDAG && + !DCI.isCalledByLegalizer()) + return SDValue(); + + if (LHS == True) + return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS); + return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS); } case ISD::SETCC_INVALID: llvm_unreachable("Invalid setcc condcode!"); @@ -1011,12 +1145,53 @@ SDValue AMDGPUTargetLowering::CombineMinMax(SDNode *N, return SDValue(); } -SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue &Op, - SelectionDAG &DAG) const { - LoadSDNode *Load = dyn_cast<LoadSDNode>(Op); - EVT MemEltVT = Load->getMemoryVT().getVectorElementType(); +/// \brief Generate Min/Max node +SDValue AMDGPUTargetLowering::CombineIMinMax(SDLoc DL, + EVT VT, + SDValue LHS, + SDValue RHS, + SDValue True, + SDValue False, + SDValue CC, + SelectionDAG &DAG) const { + if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True)) + return SDValue(); + + ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get(); + switch (CCOpcode) { + case ISD::SETULE: + case ISD::SETULT: { + unsigned Opc = (LHS == True) ? AMDGPUISD::UMIN : AMDGPUISD::UMAX; + return DAG.getNode(Opc, DL, VT, LHS, RHS); + } + case ISD::SETLE: + case ISD::SETLT: { + unsigned Opc = (LHS == True) ? 
AMDGPUISD::SMIN : AMDGPUISD::SMAX; + return DAG.getNode(Opc, DL, VT, LHS, RHS); + } + case ISD::SETGT: + case ISD::SETGE: { + unsigned Opc = (LHS == True) ? AMDGPUISD::SMAX : AMDGPUISD::SMIN; + return DAG.getNode(Opc, DL, VT, LHS, RHS); + } + case ISD::SETUGE: + case ISD::SETUGT: { + unsigned Opc = (LHS == True) ? AMDGPUISD::UMAX : AMDGPUISD::UMIN; + return DAG.getNode(Opc, DL, VT, LHS, RHS); + } + default: + return SDValue(); + } +} + +SDValue AMDGPUTargetLowering::ScalarizeVectorLoad(const SDValue Op, + SelectionDAG &DAG) const { + LoadSDNode *Load = cast<LoadSDNode>(Op); + EVT MemVT = Load->getMemoryVT(); + EVT MemEltVT = MemVT.getVectorElementType(); + EVT LoadVT = Op.getValueType(); - EVT EltVT = Op.getValueType().getVectorElementType(); + EVT EltVT = LoadVT.getVectorElementType(); EVT PtrVT = Load->getBasePtr().getValueType(); unsigned NumElts = Load->getMemoryVT().getVectorNumElements(); @@ -1024,17 +1199,19 @@ SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue &Op, SmallVector<SDValue, 8> Chains; SDLoc SL(Op); + unsigned MemEltSize = MemEltVT.getStoreSize(); + MachinePointerInfo SrcValue(Load->getMemOperand()->getValue()); - for (unsigned i = 0, e = NumElts; i != e; ++i) { + for (unsigned i = 0; i < NumElts; ++i) { SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, Load->getBasePtr(), - DAG.getConstant(i * (MemEltVT.getSizeInBits() / 8), PtrVT)); + DAG.getConstant(i * MemEltSize, PtrVT)); SDValue NewLoad = DAG.getExtLoad(Load->getExtensionType(), SL, EltVT, Load->getChain(), Ptr, - MachinePointerInfo(Load->getMemOperand()->getValue()), + SrcValue.getWithOffset(i * MemEltSize), MemEltVT, Load->isVolatile(), Load->isNonTemporal(), - Load->getAlignment()); + Load->isInvariant(), Load->getAlignment()); Loads.push_back(NewLoad.getValue(0)); Chains.push_back(NewLoad.getValue(1)); } @@ -1047,6 +1224,55 @@ SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue &Op, return DAG.getMergeValues(Ops, SL); } +SDValue AMDGPUTargetLowering::SplitVectorLoad(const 
SDValue Op, + SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + + // If this is a 2 element vector, we really want to scalarize and not create + // weird 1 element vectors. + if (VT.getVectorNumElements() == 2) + return ScalarizeVectorLoad(Op, DAG); + + LoadSDNode *Load = cast<LoadSDNode>(Op); + SDValue BasePtr = Load->getBasePtr(); + EVT PtrVT = BasePtr.getValueType(); + EVT MemVT = Load->getMemoryVT(); + SDLoc SL(Op); + MachinePointerInfo SrcValue(Load->getMemOperand()->getValue()); + + EVT LoVT, HiVT; + EVT LoMemVT, HiMemVT; + SDValue Lo, Hi; + + std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT); + std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemVT); + std::tie(Lo, Hi) = DAG.SplitVector(Op, SL, LoVT, HiVT); + SDValue LoLoad + = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT, + Load->getChain(), BasePtr, + SrcValue, + LoMemVT, Load->isVolatile(), Load->isNonTemporal(), + Load->isInvariant(), Load->getAlignment()); + + SDValue HiPtr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr, + DAG.getConstant(LoMemVT.getStoreSize(), PtrVT)); + + SDValue HiLoad + = DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, + Load->getChain(), HiPtr, + SrcValue.getWithOffset(LoMemVT.getStoreSize()), + HiMemVT, Load->isVolatile(), Load->isNonTemporal(), + Load->isInvariant(), Load->getAlignment()); + + SDValue Ops[] = { + DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad), + DAG.getNode(ISD::TokenFactor, SL, MVT::Other, + LoLoad.getValue(1), HiLoad.getValue(1)) + }; + + return DAG.getMergeValues(Ops, SL); +} + SDValue AMDGPUTargetLowering::MergeVectorStore(const SDValue &Op, SelectionDAG &DAG) const { StoreSDNode *Store = cast<StoreSDNode>(Op); @@ -1105,8 +1331,8 @@ SDValue AMDGPUTargetLowering::MergeVectorStore(const SDValue &Op, Store->getAlignment()); } -SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op, - SelectionDAG &DAG) const { +SDValue AMDGPUTargetLowering::ScalarizeVectorStore(SDValue Op, + SelectionDAG &DAG) const { StoreSDNode *Store = 
cast<StoreSDNode>(Op); EVT MemEltVT = Store->getMemoryVT().getVectorElementType(); EVT EltVT = Store->getValue().getValueType().getVectorElementType(); @@ -1116,21 +1342,77 @@ SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op, SmallVector<SDValue, 8> Chains; + unsigned EltSize = MemEltVT.getStoreSize(); + MachinePointerInfo SrcValue(Store->getMemOperand()->getValue()); + for (unsigned i = 0, e = NumElts; i != e; ++i) { SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, - Store->getValue(), DAG.getConstant(i, MVT::i32)); - SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, - Store->getBasePtr(), - DAG.getConstant(i * (MemEltVT.getSizeInBits() / 8), - PtrVT)); - Chains.push_back(DAG.getTruncStore(Store->getChain(), SL, Val, Ptr, - MachinePointerInfo(Store->getMemOperand()->getValue()), - MemEltVT, Store->isVolatile(), Store->isNonTemporal(), - Store->getAlignment())); + Store->getValue(), + DAG.getConstant(i, MVT::i32)); + + SDValue Offset = DAG.getConstant(i * MemEltVT.getStoreSize(), PtrVT); + SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, Store->getBasePtr(), Offset); + SDValue NewStore = + DAG.getTruncStore(Store->getChain(), SL, Val, Ptr, + SrcValue.getWithOffset(i * EltSize), + MemEltVT, Store->isNonTemporal(), Store->isVolatile(), + Store->getAlignment()); + Chains.push_back(NewStore); } + return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, Chains); } +SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op, + SelectionDAG &DAG) const { + StoreSDNode *Store = cast<StoreSDNode>(Op); + SDValue Val = Store->getValue(); + EVT VT = Val.getValueType(); + + // If this is a 2 element vector, we really want to scalarize and not create + // weird 1 element vectors. 
+ if (VT.getVectorNumElements() == 2) + return ScalarizeVectorStore(Op, DAG); + + EVT MemVT = Store->getMemoryVT(); + SDValue Chain = Store->getChain(); + SDValue BasePtr = Store->getBasePtr(); + SDLoc SL(Op); + + EVT LoVT, HiVT; + EVT LoMemVT, HiMemVT; + SDValue Lo, Hi; + + std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT); + std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemVT); + std::tie(Lo, Hi) = DAG.SplitVector(Val, SL, LoVT, HiVT); + + EVT PtrVT = BasePtr.getValueType(); + SDValue HiPtr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr, + DAG.getConstant(LoMemVT.getStoreSize(), PtrVT)); + + MachinePointerInfo SrcValue(Store->getMemOperand()->getValue()); + SDValue LoStore + = DAG.getTruncStore(Chain, SL, Lo, + BasePtr, + SrcValue, + LoMemVT, + Store->isNonTemporal(), + Store->isVolatile(), + Store->getAlignment()); + SDValue HiStore + = DAG.getTruncStore(Chain, SL, Hi, + HiPtr, + SrcValue.getWithOffset(LoMemVT.getStoreSize()), + HiMemVT, + Store->isNonTemporal(), + Store->isVolatile(), + Store->getAlignment()); + + return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore); +} + + SDValue AMDGPUTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); LoadSDNode *Load = cast<LoadSDNode>(Op); @@ -1138,24 +1420,6 @@ SDValue AMDGPUTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); EVT MemVT = Load->getMemoryVT(); - if (ExtType != ISD::NON_EXTLOAD && !VT.isVector() && VT.getSizeInBits() > 32) { - // We can do the extload to 32-bits, and then need to separately extend to - // 64-bits. 
- - SDValue ExtLoad32 = DAG.getExtLoad(ExtType, DL, MVT::i32, - Load->getChain(), - Load->getBasePtr(), - MemVT, - Load->getMemOperand()); - - SDValue Ops[] = { - DAG.getNode(ISD::getExtForLoadExtType(ExtType), DL, VT, ExtLoad32), - ExtLoad32.getValue(1) - }; - - return DAG.getMergeValues(Ops, DL); - } - if (ExtType == ISD::NON_EXTLOAD && VT.getSizeInBits() < 32) { assert(VT == MVT::i1 && "Only i1 non-extloads expected"); // FIXME: Copied from PPC @@ -1228,7 +1492,7 @@ SDValue AMDGPUTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { if ((Store->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS || Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) && Store->getValue().getValueType().isVector()) { - return SplitVectorStore(Op, DAG); + return ScalarizeVectorStore(Op, DAG); } EVT MemVT = Store->getMemoryVT(); @@ -1273,249 +1537,179 @@ SDValue AMDGPUTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { return SDValue(); } -SDValue AMDGPUTargetLowering::LowerSDIV24(SDValue Op, SelectionDAG &DAG) const { +// This is a shortcut for integer division because we have fast i32<->f32 +// conversions, and fast f32 reciprocal instructions. The fractional part of a +// float is enough to accurately represent up to a 24-bit integer. 
+SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool sign) const { SDLoc DL(Op); - EVT OVT = Op.getValueType(); + EVT VT = Op.getValueType(); SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); - MVT INTTY; - MVT FLTTY; - if (!OVT.isVector()) { - INTTY = MVT::i32; - FLTTY = MVT::f32; - } else if (OVT.getVectorNumElements() == 2) { - INTTY = MVT::v2i32; - FLTTY = MVT::v2f32; - } else if (OVT.getVectorNumElements() == 4) { - INTTY = MVT::v4i32; - FLTTY = MVT::v4f32; - } - unsigned bitsize = OVT.getScalarType().getSizeInBits(); - // char|short jq = ia ^ ib; - SDValue jq = DAG.getNode(ISD::XOR, DL, OVT, LHS, RHS); - - // jq = jq >> (bitsize - 2) - jq = DAG.getNode(ISD::SRA, DL, OVT, jq, DAG.getConstant(bitsize - 2, OVT)); - - // jq = jq | 0x1 - jq = DAG.getNode(ISD::OR, DL, OVT, jq, DAG.getConstant(1, OVT)); - - // jq = (int)jq - jq = DAG.getSExtOrTrunc(jq, DL, INTTY); + MVT IntVT = MVT::i32; + MVT FltVT = MVT::f32; + + ISD::NodeType ToFp = sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP; + ISD::NodeType ToInt = sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT; + + if (VT.isVector()) { + unsigned NElts = VT.getVectorNumElements(); + IntVT = MVT::getVectorVT(MVT::i32, NElts); + FltVT = MVT::getVectorVT(MVT::f32, NElts); + } + + unsigned BitSize = VT.getScalarType().getSizeInBits(); + + SDValue jq = DAG.getConstant(1, IntVT); + + if (sign) { + // char|short jq = ia ^ ib; + jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS); + + // jq = jq >> (bitsize - 2) + jq = DAG.getNode(ISD::SRA, DL, VT, jq, DAG.getConstant(BitSize - 2, VT)); + + // jq = jq | 0x1 + jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, VT)); + + // jq = (int)jq + jq = DAG.getSExtOrTrunc(jq, DL, IntVT); + } // int ia = (int)LHS; - SDValue ia = DAG.getSExtOrTrunc(LHS, DL, INTTY); + SDValue ia = sign ? + DAG.getSExtOrTrunc(LHS, DL, IntVT) : DAG.getZExtOrTrunc(LHS, DL, IntVT); // int ib, (int)RHS; - SDValue ib = DAG.getSExtOrTrunc(RHS, DL, INTTY); + SDValue ib = sign ? 
+ DAG.getSExtOrTrunc(RHS, DL, IntVT) : DAG.getZExtOrTrunc(RHS, DL, IntVT); // float fa = (float)ia; - SDValue fa = DAG.getNode(ISD::SINT_TO_FP, DL, FLTTY, ia); + SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia); // float fb = (float)ib; - SDValue fb = DAG.getNode(ISD::SINT_TO_FP, DL, FLTTY, ib); + SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib); // float fq = native_divide(fa, fb); - SDValue fq = DAG.getNode(ISD::FMUL, DL, FLTTY, - fa, DAG.getNode(AMDGPUISD::RCP, DL, FLTTY, fb)); + SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT, + fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb)); // fq = trunc(fq); - fq = DAG.getNode(ISD::FTRUNC, DL, FLTTY, fq); + fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq); // float fqneg = -fq; - SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FLTTY, fq); + SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq); // float fr = mad(fqneg, fb, fa); - SDValue fr = DAG.getNode(ISD::FADD, DL, FLTTY, - DAG.getNode(ISD::MUL, DL, FLTTY, fqneg, fb), fa); + SDValue fr = DAG.getNode(ISD::FADD, DL, FltVT, + DAG.getNode(ISD::FMUL, DL, FltVT, fqneg, fb), fa); // int iq = (int)fq; - SDValue iq = DAG.getNode(ISD::FP_TO_SINT, DL, INTTY, fq); + SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq); // fr = fabs(fr); - fr = DAG.getNode(ISD::FABS, DL, FLTTY, fr); + fr = DAG.getNode(ISD::FABS, DL, FltVT, fr); // fb = fabs(fb); - fb = DAG.getNode(ISD::FABS, DL, FLTTY, fb); - - // int cv = fr >= fb; - SDValue cv; - if (INTTY == MVT::i32) { - cv = DAG.getSetCC(DL, INTTY, fr, fb, ISD::SETOGE); - } else { - cv = DAG.getSetCC(DL, INTTY, fr, fb, ISD::SETOGE); - } - // jq = (cv ? 
jq : 0); - jq = DAG.getNode(ISD::SELECT, DL, OVT, cv, jq, - DAG.getConstant(0, OVT)); - // dst = iq + jq; - iq = DAG.getSExtOrTrunc(iq, DL, OVT); - iq = DAG.getNode(ISD::ADD, DL, OVT, iq, jq); - return iq; -} - -SDValue AMDGPUTargetLowering::LowerSDIV32(SDValue Op, SelectionDAG &DAG) const { - SDLoc DL(Op); - EVT OVT = Op.getValueType(); - SDValue LHS = Op.getOperand(0); - SDValue RHS = Op.getOperand(1); - // The LowerSDIV32 function generates equivalent to the following IL. - // mov r0, LHS - // mov r1, RHS - // ilt r10, r0, 0 - // ilt r11, r1, 0 - // iadd r0, r0, r10 - // iadd r1, r1, r11 - // ixor r0, r0, r10 - // ixor r1, r1, r11 - // udiv r0, r0, r1 - // ixor r10, r10, r11 - // iadd r0, r0, r10 - // ixor DST, r0, r10 - - // mov r0, LHS - SDValue r0 = LHS; - - // mov r1, RHS - SDValue r1 = RHS; - - // ilt r10, r0, 0 - SDValue r10 = DAG.getSelectCC(DL, - r0, DAG.getConstant(0, OVT), - DAG.getConstant(-1, OVT), - DAG.getConstant(0, OVT), - ISD::SETLT); - - // ilt r11, r1, 0 - SDValue r11 = DAG.getSelectCC(DL, - r1, DAG.getConstant(0, OVT), - DAG.getConstant(-1, OVT), - DAG.getConstant(0, OVT), - ISD::SETLT); + fb = DAG.getNode(ISD::FABS, DL, FltVT, fb); - // iadd r0, r0, r10 - r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10); + EVT SetCCVT = getSetCCResultType(*DAG.getContext(), VT); - // iadd r1, r1, r11 - r1 = DAG.getNode(ISD::ADD, DL, OVT, r1, r11); - - // ixor r0, r0, r10 - r0 = DAG.getNode(ISD::XOR, DL, OVT, r0, r10); - - // ixor r1, r1, r11 - r1 = DAG.getNode(ISD::XOR, DL, OVT, r1, r11); + // int cv = fr >= fb; + SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE); - // udiv r0, r0, r1 - r0 = DAG.getNode(ISD::UDIV, DL, OVT, r0, r1); + // jq = (cv ? jq : 0); + jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, VT)); - // ixor r10, r10, r11 - r10 = DAG.getNode(ISD::XOR, DL, OVT, r10, r11); + // dst = trunc/extend to legal type + iq = sign ? 
DAG.getSExtOrTrunc(iq, DL, VT) : DAG.getZExtOrTrunc(iq, DL, VT); - // iadd r0, r0, r10 - r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10); + // dst = iq + jq; + SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq); - // ixor DST, r0, r10 - SDValue DST = DAG.getNode(ISD::XOR, DL, OVT, r0, r10); - return DST; -} + // Rem needs compensation, it's easier to recompute it + SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS); + Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem); -SDValue AMDGPUTargetLowering::LowerSDIV64(SDValue Op, SelectionDAG &DAG) const { - return SDValue(Op.getNode(), 0); + SDValue Res[2] = { + Div, + Rem + }; + return DAG.getMergeValues(Res, DL); } -SDValue AMDGPUTargetLowering::LowerSDIV(SDValue Op, SelectionDAG &DAG) const { - EVT OVT = Op.getValueType().getScalarType(); - - if (OVT == MVT::i64) - return LowerSDIV64(Op, DAG); - - if (OVT.getScalarType() == MVT::i32) - return LowerSDIV32(Op, DAG); +void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op, + SelectionDAG &DAG, + SmallVectorImpl<SDValue> &Results) const { + assert(Op.getValueType() == MVT::i64); - if (OVT == MVT::i16 || OVT == MVT::i8) { - // FIXME: We should be checking for the masked bits. This isn't reached - // because i8 and i16 are not legal types. - return LowerSDIV24(Op, DAG); - } - - return SDValue(Op.getNode(), 0); -} - -SDValue AMDGPUTargetLowering::LowerSREM32(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); - EVT OVT = Op.getValueType(); - SDValue LHS = Op.getOperand(0); - SDValue RHS = Op.getOperand(1); - // The LowerSREM32 function generates equivalent to the following IL. 
- // mov r0, LHS - // mov r1, RHS - // ilt r10, r0, 0 - // ilt r11, r1, 0 - // iadd r0, r0, r10 - // iadd r1, r1, r11 - // ixor r0, r0, r10 - // ixor r1, r1, r11 - // udiv r20, r0, r1 - // umul r20, r20, r1 - // sub r0, r0, r20 - // iadd r0, r0, r10 - // ixor DST, r0, r10 + EVT VT = Op.getValueType(); + EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext()); - // mov r0, LHS - SDValue r0 = LHS; + SDValue one = DAG.getConstant(1, HalfVT); + SDValue zero = DAG.getConstant(0, HalfVT); - // mov r1, RHS - SDValue r1 = RHS; + //HiLo split + SDValue LHS = Op.getOperand(0); + SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, zero); + SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, one); - // ilt r10, r0, 0 - SDValue r10 = DAG.getSetCC(DL, OVT, r0, DAG.getConstant(0, OVT), ISD::SETLT); + SDValue RHS = Op.getOperand(1); + SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, zero); + SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, one); - // ilt r11, r1, 0 - SDValue r11 = DAG.getSetCC(DL, OVT, r1, DAG.getConstant(0, OVT), ISD::SETLT); + // Get Speculative values + SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo); + SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo); - // iadd r0, r0, r10 - r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10); + SDValue REM_Hi = zero; + SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, zero, REM_Part, LHS_Hi, ISD::SETEQ); - // iadd r1, r1, r11 - r1 = DAG.getNode(ISD::ADD, DL, OVT, r1, r11); + SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, zero, DIV_Part, zero, ISD::SETEQ); + SDValue DIV_Lo = zero; - // ixor r0, r0, r10 - r0 = DAG.getNode(ISD::XOR, DL, OVT, r0, r10); + const unsigned halfBitWidth = HalfVT.getSizeInBits(); - // ixor r1, r1, r11 - r1 = DAG.getNode(ISD::XOR, DL, OVT, r1, r11); + for (unsigned i = 0; i < halfBitWidth; ++i) { + SDValue POS = DAG.getConstant(halfBitWidth - i - 1, HalfVT); + // Get Value of high bit + SDValue HBit; + if 
(halfBitWidth == 32 && Subtarget->hasBFE()) { + HBit = DAG.getNode(AMDGPUISD::BFE_U32, DL, HalfVT, LHS_Lo, POS, one); + } else { + HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS); + HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, one); + } - // udiv r20, r0, r1 - SDValue r20 = DAG.getNode(ISD::UREM, DL, OVT, r0, r1); + SDValue Carry = DAG.getNode(ISD::SRL, DL, HalfVT, REM_Lo, + DAG.getConstant(halfBitWidth - 1, HalfVT)); + REM_Hi = DAG.getNode(ISD::SHL, DL, HalfVT, REM_Hi, one); + REM_Hi = DAG.getNode(ISD::OR, DL, HalfVT, REM_Hi, Carry); - // umul r20, r20, r1 - r20 = DAG.getNode(AMDGPUISD::UMUL, DL, OVT, r20, r1); + REM_Lo = DAG.getNode(ISD::SHL, DL, HalfVT, REM_Lo, one); + REM_Lo = DAG.getNode(ISD::OR, DL, HalfVT, REM_Lo, HBit); - // sub r0, r0, r20 - r0 = DAG.getNode(ISD::SUB, DL, OVT, r0, r20); - // iadd r0, r0, r10 - r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10); + SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, REM_Hi); - // ixor DST, r0, r10 - SDValue DST = DAG.getNode(ISD::XOR, DL, OVT, r0, r10); - return DST; -} + SDValue BIT = DAG.getConstant(1 << (halfBitWidth - i - 1), HalfVT); + SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, zero, ISD::SETUGE); -SDValue AMDGPUTargetLowering::LowerSREM64(SDValue Op, SelectionDAG &DAG) const { - return SDValue(Op.getNode(), 0); -} + DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT); -SDValue AMDGPUTargetLowering::LowerSREM(SDValue Op, SelectionDAG &DAG) const { - EVT OVT = Op.getValueType(); + // Update REM - if (OVT.getScalarType() == MVT::i64) - return LowerSREM64(Op, DAG); + SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS); - if (OVT.getScalarType() == MVT::i32) - return LowerSREM32(Op, DAG); + REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE); + REM_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, REM, zero); + REM_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, REM, one); + } - return SDValue(Op.getNode(), 0); + SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, 
VT, REM_Lo, REM_Hi); + SDValue DIV = DAG.getNode(ISD::BUILD_PAIR, DL, VT, DIV_Lo, DIV_Hi); + Results.push_back(DIV); + Results.push_back(REM); } SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op, @@ -1523,15 +1717,31 @@ SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op, SDLoc DL(Op); EVT VT = Op.getValueType(); + if (VT == MVT::i64) { + SmallVector<SDValue, 2> Results; + LowerUDIVREM64(Op, DAG, Results); + return DAG.getMergeValues(Results, DL); + } + SDValue Num = Op.getOperand(0); SDValue Den = Op.getOperand(1); + if (VT == MVT::i32) { + if (DAG.MaskedValueIsZero(Op.getOperand(0), APInt(32, 0xff << 24)) && + DAG.MaskedValueIsZero(Op.getOperand(1), APInt(32, 0xff << 24))) { + // TODO: We technically could do this for i64, but shouldn't that just be + // handled by something generally reducing 64-bit division on 32-bit + // values to 32-bit? + return LowerDIVREM24(Op, DAG, false); + } + } + // RCP = URECIP(Den) = 2^32 / Den + e // e is rounding error. SDValue RCP = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Den); - // RCP_LO = umulo(RCP, Den) */ - SDValue RCP_LO = DAG.getNode(ISD::UMULO, DL, VT, RCP, Den); + // RCP_LO = mul(RCP, Den) */ + SDValue RCP_LO = DAG.getNode(ISD::MUL, DL, VT, RCP, Den); // RCP_HI = mulhu (RCP, Den) */ SDValue RCP_HI = DAG.getNode(ISD::MULHU, DL, VT, RCP, Den); @@ -1562,7 +1772,7 @@ SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op, SDValue Quotient = DAG.getNode(ISD::MULHU, DL, VT, Tmp0, Num); // Num_S_Remainder = Quotient * Den - SDValue Num_S_Remainder = DAG.getNode(ISD::UMULO, DL, VT, Quotient, Den); + SDValue Num_S_Remainder = DAG.getNode(ISD::MUL, DL, VT, Quotient, Den); // Remainder = Num - Num_S_Remainder SDValue Remainder = DAG.getNode(ISD::SUB, DL, VT, Num, Num_S_Remainder); @@ -1627,12 +1837,22 @@ SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op, SDLoc DL(Op); EVT VT = Op.getValueType(); - SDValue Zero = DAG.getConstant(0, VT); - SDValue NegOne = DAG.getConstant(-1, VT); - SDValue LHS = Op.getOperand(0); SDValue RHS 
= Op.getOperand(1); + if (VT == MVT::i32) { + if (DAG.ComputeNumSignBits(Op.getOperand(0)) > 8 && + DAG.ComputeNumSignBits(Op.getOperand(1)) > 8) { + // TODO: We technically could do this for i64, but shouldn't that just be + // handled by something generally reducing 64-bit division on 32-bit + // values to 32-bit? + return LowerDIVREM24(Op, DAG, true); + } + } + + SDValue Zero = DAG.getConstant(0, VT); + SDValue NegOne = DAG.getConstant(-1, VT); + SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT); SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT); SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign); @@ -1660,6 +1880,20 @@ SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op, return DAG.getMergeValues(Res, DL); } +// (frem x, y) -> (fsub x, (fmul (ftrunc (fdiv x, y)), y)) +SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const { + SDLoc SL(Op); + EVT VT = Op.getValueType(); + SDValue X = Op.getOperand(0); + SDValue Y = Op.getOperand(1); + + SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y); + SDValue Floor = DAG.getNode(ISD::FTRUNC, SL, VT, Div); + SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, Floor, Y); + + return DAG.getNode(ISD::FSUB, SL, VT, X, Mul); +} + SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const { SDLoc SL(Op); SDValue Src = Op.getOperand(0); @@ -1702,7 +1936,7 @@ SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const { const unsigned ExpBits = 11; // Extract the exponent. 
- SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_I32, SL, MVT::i32, + SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32, Hi, DAG.getConstant(FractBits - 32, MVT::i32), DAG.getConstant(ExpBits, MVT::i32)); @@ -1793,13 +2027,43 @@ SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add); } +SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, + bool Signed) const { + SDLoc SL(Op); + SDValue Src = Op.getOperand(0); + + SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src); + + SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC, + DAG.getConstant(0, MVT::i32)); + SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC, + DAG.getConstant(1, MVT::i32)); + + SDValue CvtHi = DAG.getNode(Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP, + SL, MVT::f64, Hi); + + SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo); + + SDValue LdExp = DAG.getNode(AMDGPUISD::LDEXP, SL, MVT::f64, CvtHi, + DAG.getConstant(32, MVT::i32)); + + return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo); +} + SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { SDValue S0 = Op.getOperand(0); - SDLoc DL(Op); - if (Op.getValueType() != MVT::f32 || S0.getValueType() != MVT::i64) + if (S0.getValueType() != MVT::i64) return SDValue(); + EVT DestVT = Op.getValueType(); + if (DestVT == MVT::f64) + return LowerINT_TO_FP64(Op, DAG, false); + + assert(DestVT == MVT::f32); + + SDLoc DL(Op); + // f32 uint_to_fp i64 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, S0, DAG.getConstant(0, MVT::i32)); @@ -1812,16 +2076,62 @@ SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op, return DAG.getNode(ISD::FADD, DL, MVT::f32, FloatLo, FloatHi); } -SDValue AMDGPUTargetLowering::ExpandSIGN_EXTEND_INREG(SDValue Op, - unsigned BitsDiff, - SelectionDAG &DAG) const { - MVT VT = Op.getSimpleValueType(); - SDLoc DL(Op); - SDValue 
Shift = DAG.getConstant(BitsDiff, VT); - // Shift left by 'Shift' bits. - SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, Op.getOperand(0), Shift); - // Signed shift Right by 'Shift' bits. - return DAG.getNode(ISD::SRA, DL, VT, Shl, Shift); +SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op, + SelectionDAG &DAG) const { + SDValue Src = Op.getOperand(0); + if (Src.getValueType() == MVT::i64 && Op.getValueType() == MVT::f64) + return LowerINT_TO_FP64(Op, DAG, true); + + return SDValue(); +} + +SDValue AMDGPUTargetLowering::LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG, + bool Signed) const { + SDLoc SL(Op); + + SDValue Src = Op.getOperand(0); + + SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src); + + SDValue K0 + = DAG.getConstantFP(BitsToDouble(UINT64_C(0x3df0000000000000)), MVT::f64); + SDValue K1 + = DAG.getConstantFP(BitsToDouble(UINT64_C(0xc1f0000000000000)), MVT::f64); + + SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, Trunc, K0); + + SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, MVT::f64, Mul); + + + SDValue Fma = DAG.getNode(ISD::FMA, SL, MVT::f64, FloorMul, K1, Trunc); + + SDValue Hi = DAG.getNode(Signed ? 
ISD::FP_TO_SINT : ISD::FP_TO_UINT, SL, + MVT::i32, FloorMul); + SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma); + + SDValue Result = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Lo, Hi); + + return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Result); +} + +SDValue AMDGPUTargetLowering::LowerFP_TO_SINT(SDValue Op, + SelectionDAG &DAG) const { + SDValue Src = Op.getOperand(0); + + if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64) + return LowerFP64_TO_INT(Op, DAG, true); + + return SDValue(); +} + +SDValue AMDGPUTargetLowering::LowerFP_TO_UINT(SDValue Op, + SelectionDAG &DAG) const { + SDValue Src = Op.getOperand(0); + + if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64) + return LowerFP64_TO_INT(Op, DAG, false); + + return SDValue(); } SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, @@ -1887,7 +2197,8 @@ template <typename IntTy> static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset, uint32_t Width) { if (Width + Offset < 32) { - IntTy Result = (Src0 << (32 - Offset - Width)) >> (32 - Width); + uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width); + IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width); return DAG.getConstant(Result, MVT::i32); } @@ -1916,7 +2227,8 @@ SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N, SDValue Value = SN->getValue(); EVT VT = Value.getValueType(); - if (isTypeLegal(VT) || SN->isVolatile() || !ISD::isNormalLoad(Value.getNode())) + if (isTypeLegal(VT) || SN->isVolatile() || + !ISD::isNormalLoad(Value.getNode()) || VT.getSizeInBits() < 8) return SDValue(); LoadSDNode *LoadVal = cast<LoadSDNode>(Value); @@ -1992,9 +2304,30 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, simplifyI24(N1, DCI); return SDValue(); } - case ISD::SELECT_CC: { - return CombineMinMax(N, DAG); + case ISD::SELECT: { + SDValue Cond = N->getOperand(0); + if (Cond.getOpcode() == ISD::SETCC && Cond.hasOneUse()) { + SDLoc DL(N); + EVT VT = 
N->getValueType(0); + SDValue LHS = Cond.getOperand(0); + SDValue RHS = Cond.getOperand(1); + SDValue CC = Cond.getOperand(2); + + SDValue True = N->getOperand(1); + SDValue False = N->getOperand(2); + + if (VT == MVT::f32) + return CombineFMinMaxLegacy(DL, VT, LHS, RHS, True, False, CC, DCI); + + // TODO: Implement min / max Evergreen instructions. + if (VT == MVT::i32 && + Subtarget->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) { + return CombineIMinMax(DL, VT, LHS, RHS, True, False, CC, DAG); + } } + + break; + } case AMDGPUISD::BFE_I32: case AMDGPUISD::BFE_U32: { assert(!N->getValueType(0).isVector() && @@ -2039,37 +2372,40 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT); } - if (ConstantSDNode *Val = dyn_cast<ConstantSDNode>(N->getOperand(0))) { + if (ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(BitsFrom)) { if (Signed) { return constantFoldBFE<int32_t>(DAG, - Val->getSExtValue(), + CVal->getSExtValue(), OffsetVal, WidthVal); } return constantFoldBFE<uint32_t>(DAG, - Val->getZExtValue(), + CVal->getZExtValue(), OffsetVal, WidthVal); } - APInt Demanded = APInt::getBitsSet(32, - OffsetVal, - OffsetVal + WidthVal); - if ((OffsetVal + WidthVal) >= 32) { SDValue ShiftVal = DAG.getConstant(OffsetVal, MVT::i32); return DAG.getNode(Signed ? 
ISD::SRA : ISD::SRL, DL, MVT::i32, BitsFrom, ShiftVal); } - APInt KnownZero, KnownOne; - TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), - !DCI.isBeforeLegalizeOps()); - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - if (TLO.ShrinkDemandedConstant(BitsFrom, Demanded) || - TLI.SimplifyDemandedBits(BitsFrom, Demanded, KnownZero, KnownOne, TLO)) { - DCI.CommitTargetLoweringOpt(TLO); + if (BitsFrom.hasOneUse()) { + APInt Demanded = APInt::getBitsSet(32, + OffsetVal, + OffsetVal + WidthVal); + + APInt KnownZero, KnownOne; + TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), + !DCI.isBeforeLegalizeOps()); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (TLO.ShrinkDemandedConstant(BitsFrom, Demanded) || + TLI.SimplifyDemandedBits(BitsFrom, Demanded, + KnownZero, KnownOne, TLO)) { + DCI.CommitTargetLoweringOpt(TLO); + } } break; @@ -2167,12 +2503,19 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(DWORDADDR) NODE_NAME_CASE(FRACT) NODE_NAME_CASE(CLAMP) - NODE_NAME_CASE(FMAX) + NODE_NAME_CASE(MAD) + NODE_NAME_CASE(FMAX_LEGACY) NODE_NAME_CASE(SMAX) NODE_NAME_CASE(UMAX) - NODE_NAME_CASE(FMIN) + NODE_NAME_CASE(FMIN_LEGACY) NODE_NAME_CASE(SMIN) NODE_NAME_CASE(UMIN) + NODE_NAME_CASE(FMAX3) + NODE_NAME_CASE(SMAX3) + NODE_NAME_CASE(UMAX3) + NODE_NAME_CASE(FMIN3) + NODE_NAME_CASE(SMIN3) + NODE_NAME_CASE(UMIN3) NODE_NAME_CASE(URECIP) NODE_NAME_CASE(DIV_SCALE) NODE_NAME_CASE(DIV_FMAS) @@ -2182,6 +2525,8 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(RSQ) NODE_NAME_CASE(RSQ_LEGACY) NODE_NAME_CASE(RSQ_CLAMPED) + NODE_NAME_CASE(LDEXP) + NODE_NAME_CASE(FP_CLASS) NODE_NAME_CASE(DOT4) NODE_NAME_CASE(BFE_U32) NODE_NAME_CASE(BFE_I32) @@ -2213,6 +2558,46 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { } } +SDValue AMDGPUTargetLowering::getRsqrtEstimate(SDValue Operand, + DAGCombinerInfo &DCI, + unsigned 
&RefinementSteps, + bool &UseOneConstNR) const { + SelectionDAG &DAG = DCI.DAG; + EVT VT = Operand.getValueType(); + + if (VT == MVT::f32) { + RefinementSteps = 0; + return DAG.getNode(AMDGPUISD::RSQ, SDLoc(Operand), VT, Operand); + } + + // TODO: There is also f64 rsq instruction, but the documentation is less + // clear on its precision. + + return SDValue(); +} + +SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand, + DAGCombinerInfo &DCI, + unsigned &RefinementSteps) const { + SelectionDAG &DAG = DCI.DAG; + EVT VT = Operand.getValueType(); + + if (VT == MVT::f32) { + // Reciprocal, < 1 ulp error. + // + // This reciprocal approximation converges to < 0.5 ulp error with one + // Newton-Raphson iteration performed with two fused multiply-adds (FMAs). + + RefinementSteps = 0; + return DAG.getNode(AMDGPUISD::RCP, SDLoc(Operand), VT, Operand); + } + + // TODO: There is also f64 rcp instruction, but the documentation is less + // clear on its precision. + + return SDValue(); +} + static void computeKnownBitsForMinMax(const SDValue Op0, const SDValue Op1, APInt &KnownZero, @@ -2276,17 +2661,8 @@ void AMDGPUTargetLowering::computeKnownBitsForTargetNode( unsigned BitWidth = 32; uint32_t Width = CWidth->getZExtValue() & 0x1f; - if (Width == 0) { - KnownZero = APInt::getAllOnesValue(BitWidth); - KnownOne = APInt::getNullValue(BitWidth); - return; - } - // FIXME: This could do a lot more. If offset is 0, should be the same as - // sign_extend_inreg implementation, but that involves duplicating it. 
- if (Opc == AMDGPUISD::BFE_I32) - KnownOne = APInt::getHighBitsSet(BitWidth, BitWidth - Width); - else + if (Opc == AMDGPUISD::BFE_U32) KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - Width); break; diff --git a/lib/Target/R600/AMDGPUISelLowering.h b/lib/Target/R600/AMDGPUISelLowering.h index 624d4e0c1967..15f529c40a31 100644 --- a/lib/Target/R600/AMDGPUISelLowering.h +++ b/lib/Target/R600/AMDGPUISelLowering.h @@ -13,8 +13,8 @@ // //===----------------------------------------------------------------------===// -#ifndef AMDGPUISELLOWERING_H -#define AMDGPUISELLOWERING_H +#ifndef LLVM_LIB_TARGET_R600_AMDGPUISELLOWERING_H +#define LLVM_LIB_TARGET_R600_AMDGPUISELLOWERING_H #include "llvm/Target/TargetLowering.h" @@ -43,25 +43,22 @@ private: /// \brief Split a vector store into multiple scalar stores. /// \returns The resulting chain. - SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSDIV24(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSDIV32(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSDIV64(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSREM(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSREM32(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSREM64(SDValue Op, SelectionDAG &DAG) const; SDValue LowerUDIVREM(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFREM(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFCEIL(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFRINT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, bool Signed) const; SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; + + SDValue LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG, bool Signed) const; + SDValue LowerFP_TO_UINT(SDValue Op, 
SelectionDAG &DAG) const; + SDValue LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const; - SDValue ExpandSIGN_EXTEND_INREG(SDValue Op, - unsigned BitsDiff, - SelectionDAG &DAG) const; SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const; SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const; @@ -73,12 +70,25 @@ protected: virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const; - /// \brief Split a vector load into multiple scalar loads. - SDValue SplitVectorLoad(const SDValue &Op, SelectionDAG &DAG) const; + + /// \brief Split a vector load into a scalar load of each component. + SDValue ScalarizeVectorLoad(SDValue Op, SelectionDAG &DAG) const; + + /// \brief Split a vector load into 2 loads of half the vector. + SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const; + + /// \brief Split a vector store into a scalar store of each component. + SDValue ScalarizeVectorStore(SDValue Op, SelectionDAG &DAG) const; + + /// \brief Split a vector store into 2 stores of half the vector. 
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSDIVREM(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool sign) const; + void LowerUDIVREM64(SDValue Op, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &Results) const; bool isHWTrueValue(SDValue Op) const; bool isHWFalseValue(SDValue Op) const; @@ -114,8 +124,14 @@ public: bool isFPImmLegal(const APFloat &Imm, EVT VT) const override; bool ShouldShrinkFPConstant(EVT VT) const override; + bool shouldReduceLoadWidth(SDNode *Load, + ISD::LoadExtType ExtType, + EVT ExtVT) const override; bool isLoadBitCastBeneficial(EVT, EVT) const override; + bool isCheapToSpeculateCttz() const override; + bool isCheapToSpeculateCtlz() const override; + SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, @@ -132,9 +148,33 @@ public: SDValue LowerIntrinsicIABS(SDValue Op, SelectionDAG &DAG) const; SDValue LowerIntrinsicLRP(SDValue Op, SelectionDAG &DAG) const; - SDValue CombineMinMax(SDNode *N, SelectionDAG &DAG) const; + SDValue CombineFMinMaxLegacy(SDLoc DL, + EVT VT, + SDValue LHS, + SDValue RHS, + SDValue True, + SDValue False, + SDValue CC, + DAGCombinerInfo &DCI) const; + SDValue CombineIMinMax(SDLoc DL, + EVT VT, + SDValue LHS, + SDValue RHS, + SDValue True, + SDValue False, + SDValue CC, + SelectionDAG &DAG) const; + const char* getTargetNodeName(unsigned Opcode) const override; + SDValue getRsqrtEstimate(SDValue Operand, + DAGCombinerInfo &DCI, + unsigned &RefinementSteps, + bool &UseOneConstNR) const override; + SDValue getRecipEstimate(SDValue Operand, + DAGCombinerInfo &DCI, + unsigned &RefinementSteps) const override; + virtual SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const { return N; @@ -149,10 +189,8 @@ public: const SelectionDAG &DAG, unsigned 
Depth = 0) const override; - virtual unsigned ComputeNumSignBitsForTargetNode( - SDValue Op, - const SelectionDAG &DAG, - unsigned Depth = 0) const override; + unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const SelectionDAG &DAG, + unsigned Depth = 0) const override; /// \brief Helper function that adds Reg to the LiveIn list of the DAG's /// MachineFunction. @@ -176,17 +214,24 @@ enum { DWORDADDR, FRACT, CLAMP, + MAD, // Multiply + add with same result as the separate operations. // SIN_HW, COS_HW - f32 for SI, 1 ULP max error, valid from -100 pi to 100 pi. // Denormals handled on some parts. COS_HW, SIN_HW, - FMAX, + FMAX_LEGACY, SMAX, UMAX, - FMIN, + FMIN_LEGACY, SMIN, UMIN, + FMAX3, + SMAX3, + UMAX3, + FMIN3, + SMIN3, + UMIN3, URECIP, DIV_SCALE, DIV_FMAS, @@ -199,6 +244,8 @@ enum { RSQ, RSQ_LEGACY, RSQ_CLAMPED, + LDEXP, + FP_CLASS, DOT4, BFE_U32, // Extract range of bits with zero extension to 32-bits. BFE_I32, // Extract range of bits with sign extension to 32-bits. @@ -248,4 +295,4 @@ enum { } // End namespace llvm -#endif // AMDGPUISELLOWERING_H +#endif diff --git a/lib/Target/R600/AMDGPUInstrInfo.cpp b/lib/Target/R600/AMDGPUInstrInfo.cpp index fef5b8cac5ba..5beaa6841c94 100644 --- a/lib/Target/R600/AMDGPUInstrInfo.cpp +++ b/lib/Target/R600/AMDGPUInstrInfo.cpp @@ -86,21 +86,6 @@ AMDGPUInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, // TODO: Implement this function return nullptr; } -bool AMDGPUInstrInfo::getNextBranchInstr(MachineBasicBlock::iterator &iter, - MachineBasicBlock &MBB) const { - while (iter != MBB.end()) { - switch (iter->getOpcode()) { - default: - break; - case AMDGPU::BRANCH_COND_i32: - case AMDGPU::BRANCH_COND_f32: - case AMDGPU::BRANCH: - return true; - }; - ++iter; - } - return false; -} void AMDGPUInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, @@ -147,7 +132,6 @@ bool AMDGPUInstrInfo::expandPostRAPseudo (MachineBasicBlock::iterator MI) const } else if (isRegisterStore(*MI)) { int ValOpIdx = 
AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::val); - AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst); unsigned RegIndex = MI->getOperand(RegOpIdx).getImm(); unsigned Channel = MI->getOperand(ChanOpIdx).getImm(); unsigned Address = calculateIndirectAddress(RegIndex, Channel); @@ -215,15 +199,30 @@ AMDGPUInstrInfo::getOpcodeAfterMemoryUnfold(unsigned Opc, return 0; } -bool AMDGPUInstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2, - int64_t Offset1, int64_t Offset2, - unsigned NumLoads) const { - assert(Offset2 > Offset1 - && "Second offset should be larger than first offset!"); - // If we have less than 16 loads in a row, and the offsets are within 16, - // then schedule together. - // TODO: Make the loads schedule near if it fits in a cacheline - return (NumLoads < 16 && (Offset2 - Offset1) < 16); +bool AMDGPUInstrInfo::enableClusterLoads() const { + return true; +} + +// FIXME: This behaves strangely. If, for example, you have 32 load + stores, +// the first 16 loads will be interleaved with the stores, and the next 16 will +// be clustered as expected. It should really split into 2 16 store batches. +// +// Loads are clustered until this returns false, rather than trying to schedule +// groups of stores. This also means we have to deal with saying different +// address space loads should be clustered, and ones which might cause bank +// conflicts. +// +// This might be deprecated so it might not be worth that much effort to fix. +bool AMDGPUInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1, + int64_t Offset0, int64_t Offset1, + unsigned NumLoads) const { + assert(Offset1 > Offset0 && + "Second offset should be larger than first offset!"); + // If we have less than 16 loads in a row, and the offsets are within 64 + // bytes, then schedule together. + + // A cacheline is 64 bytes (for global memory). 
+ return (NumLoads <= 16 && (Offset1 - Offset0) < 64); } bool @@ -320,7 +319,10 @@ int AMDGPUInstrInfo::getIndirectIndexEnd(const MachineFunction &MF) const { return -1; } - Offset = MF.getTarget().getFrameLowering()->getFrameIndexOffset(MF, -1); + Offset = MF.getTarget() + .getSubtargetImpl() + ->getFrameLowering() + ->getFrameIndexOffset(MF, -1); return getIndirectIndexBegin(MF) + Offset; } @@ -335,12 +337,12 @@ int AMDGPUInstrInfo::getMaskedMIMGOp(uint16_t Opcode, unsigned Channels) const { } // Wrapper for Tablegen'd function. enum Subtarget is not defined in any -// header files, so we need to wrap it in a function that takes unsigned +// header files, so we need to wrap it in a function that takes unsigned // instead. namespace llvm { namespace AMDGPU { int getMCOpcode(uint16_t Opcode, unsigned Gen) { - return getMCOpcode(Opcode); + return getMCOpcodeGen(Opcode, (enum Subtarget)Gen); } } } diff --git a/lib/Target/R600/AMDGPUInstrInfo.h b/lib/Target/R600/AMDGPUInstrInfo.h index d5041f558163..da9833d25a52 100644 --- a/lib/Target/R600/AMDGPUInstrInfo.h +++ b/lib/Target/R600/AMDGPUInstrInfo.h @@ -13,10 +13,9 @@ // //===----------------------------------------------------------------------===// -#ifndef AMDGPUINSTRUCTIONINFO_H -#define AMDGPUINSTRUCTIONINFO_H +#ifndef LLVM_LIB_TARGET_R600_AMDGPUINSTRINFO_H +#define LLVM_LIB_TARGET_R600_AMDGPUINSTRINFO_H -#include "AMDGPUInstrInfo.h" #include "AMDGPURegisterInfo.h" #include "llvm/Target/TargetInstrInfo.h" #include <map> @@ -41,8 +40,6 @@ class MachineInstrBuilder; class AMDGPUInstrInfo : public AMDGPUGenInstrInfo { private: const AMDGPURegisterInfo RI; - bool getNextBranchInstr(MachineBasicBlock::iterator &iter, - MachineBasicBlock &MBB) const; virtual void anchor(); protected: const AMDGPUSubtarget &ST; @@ -74,11 +71,6 @@ public: LiveVariables *LV) const override; - virtual void copyPhysReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, DebugLoc DL, - unsigned DestReg, unsigned SrcReg, - bool KillSrc) 
const = 0; - bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override; void storeRegToStackSlot(MachineBasicBlock &MBB, @@ -101,6 +93,7 @@ protected: MachineInstr *MI, const SmallVectorImpl<unsigned> &Ops, MachineInstr *LoadMI) const override; +public: /// \returns the smallest register index that will be accessed by an indirect /// read or write or -1 if indirect addressing is not used by this program. int getIndirectIndexBegin(const MachineFunction &MF) const; @@ -109,7 +102,6 @@ protected: /// read or write or -1 if indirect addressing is not used by this program. int getIndirectIndexEnd(const MachineFunction &MF) const; -public: bool canFoldMemoryOperand(const MachineInstr *MI, const SmallVectorImpl<unsigned> &Ops) const override; bool unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI, @@ -120,6 +112,9 @@ public: unsigned getOpcodeAfterMemoryUnfold(unsigned Opc, bool UnfoldLoad, bool UnfoldStore, unsigned *LoadRegIndex = nullptr) const override; + + bool enableClusterLoads() const override; + bool shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2, int64_t Offset1, int64_t Offset2, unsigned NumLoads) const override; @@ -196,4 +191,4 @@ namespace AMDGPU { #define AMDGPU_FLAG_REGISTER_LOAD (UINT64_C(1) << 63) #define AMDGPU_FLAG_REGISTER_STORE (UINT64_C(1) << 62) -#endif // AMDGPUINSTRINFO_H +#endif diff --git a/lib/Target/R600/AMDGPUInstrInfo.td b/lib/Target/R600/AMDGPUInstrInfo.td index 820f1a80d75e..0e34392bd50d 100644 --- a/lib/Target/R600/AMDGPUInstrInfo.td +++ b/lib/Target/R600/AMDGPUInstrInfo.td @@ -23,6 +23,14 @@ def AMDGPUTrigPreOp : SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>, SDTCisFP<0>, SDTCisInt<2>] >; +def AMDGPULdExpOp : SDTypeProfile<1, 2, + [SDTCisSameAs<0, 1>, SDTCisFP<0>, SDTCisInt<2>] +>; + +def AMDGPUFPClassOp : SDTypeProfile<1, 2, + [SDTCisInt<0>, SDTCisFP<1>, SDTCisInt<2>] +>; + def AMDGPUDivScaleOp : SDTypeProfile<2, 3, [SDTCisFP<0>, SDTCisInt<1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisSameAs<0, 4>] >; @@ -52,12 
+60,20 @@ def AMDGPUrsq_legacy : SDNode<"AMDGPUISD::RSQ_LEGACY", SDTFPUnaryOp>; // out = 1.0 / sqrt(a) result clamped to +/- max_float. def AMDGPUrsq_clamped : SDNode<"AMDGPUISD::RSQ_CLAMPED", SDTFPUnaryOp>; -// out = max(a, b) a and b are floats -def AMDGPUfmax : SDNode<"AMDGPUISD::FMAX", SDTFPBinOp, - [SDNPCommutative, SDNPAssociative] +def AMDGPUldexp : SDNode<"AMDGPUISD::LDEXP", AMDGPULdExpOp>; + +def AMDGPUfp_class : SDNode<"AMDGPUISD::FP_CLASS", AMDGPUFPClassOp>; + +// out = max(a, b) a and b are floats, where a nan comparison fails. +// This is not commutative because this gives the second operand: +// x < nan ? x : nan -> nan +// nan < x ? nan : x -> x +def AMDGPUfmax_legacy : SDNode<"AMDGPUISD::FMAX_LEGACY", SDTFPBinOp, + [] >; def AMDGPUclamp : SDNode<"AMDGPUISD::CLAMP", SDTFPTernaryOp, []>; +def AMDGPUmad : SDNode<"AMDGPUISD::MAD", SDTFPTernaryOp, []>; // out = max(a, b) a and b are signed ints def AMDGPUsmax : SDNode<"AMDGPUISD::SMAX", SDTIntBinOp, @@ -69,12 +85,12 @@ def AMDGPUumax : SDNode<"AMDGPUISD::UMAX", SDTIntBinOp, [SDNPCommutative, SDNPAssociative] >; -// out = min(a, b) a and b are floats -def AMDGPUfmin : SDNode<"AMDGPUISD::FMIN", SDTFPBinOp, - [SDNPCommutative, SDNPAssociative] +// out = min(a, b) a and b are floats, where a nan comparison fails. +def AMDGPUfmin_legacy : SDNode<"AMDGPUISD::FMIN_LEGACY", SDTFPBinOp, + [] >; -// out = min(a, b) a snd b are signed ints +// out = min(a, b) a and b are signed ints def AMDGPUsmin : SDNode<"AMDGPUISD::SMIN", SDTIntBinOp, [SDNPCommutative, SDNPAssociative] >; @@ -84,6 +100,37 @@ def AMDGPUumin : SDNode<"AMDGPUISD::UMIN", SDTIntBinOp, [SDNPCommutative, SDNPAssociative] >; +// FIXME: TableGen doesn't like commutative instructions with more +// than 2 operands. 
+// out = max(a, b, c) a, b and c are floats +def AMDGPUfmax3 : SDNode<"AMDGPUISD::FMAX3", SDTFPTernaryOp, + [/*SDNPCommutative, SDNPAssociative*/] +>; + +// out = max(a, b, c) a, b, and c are signed ints +def AMDGPUsmax3 : SDNode<"AMDGPUISD::SMAX3", AMDGPUDTIntTernaryOp, + [/*SDNPCommutative, SDNPAssociative*/] +>; + +// out = max(a, b, c) a, b and c are unsigned ints +def AMDGPUumax3 : SDNode<"AMDGPUISD::UMAX3", AMDGPUDTIntTernaryOp, + [/*SDNPCommutative, SDNPAssociative*/] +>; + +// out = min(a, b, c) a, b and c are floats +def AMDGPUfmin3 : SDNode<"AMDGPUISD::FMIN3", SDTFPTernaryOp, + [/*SDNPCommutative, SDNPAssociative*/] +>; + +// out = min(a, b, c) a, b and c are signed ints +def AMDGPUsmin3 : SDNode<"AMDGPUISD::SMIN3", AMDGPUDTIntTernaryOp, + [/*SDNPCommutative, SDNPAssociative*/] +>; + +// out = min(a, b, c) a, b and c are unsigned ints +def AMDGPUumin3 : SDNode<"AMDGPUISD::UMIN3", AMDGPUDTIntTernaryOp, + [/*SDNPCommutative, SDNPAssociative*/] +>; def AMDGPUcvt_f32_ubyte0 : SDNode<"AMDGPUISD::CVT_F32_UBYTE0", SDTIntToFPOp, []>; @@ -130,7 +177,7 @@ def AMDGPUregister_store : SDNode<"AMDGPUISD::REGISTER_STORE", // MSKOR(dst, mask, src) MEM[dst] = ((MEM[dst] & ~mask) | src) // // src0: vec4(src, 0, 0, mask) -// src1: dst - rat offset (aka pointer) in dwords +// src1: dst - rat offset (aka pointer) in dwords def AMDGPUstore_mskor : SDNode<"AMDGPUISD::STORE_MSKOR", SDTypeProfile<0, 2, []>, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; diff --git a/lib/Target/R600/AMDGPUInstructions.td b/lib/Target/R600/AMDGPUInstructions.td index cd3560378e57..4e536c37b0bd 100644 --- a/lib/Target/R600/AMDGPUInstructions.td +++ b/lib/Target/R600/AMDGPUInstructions.td @@ -23,6 +23,8 @@ class AMDGPUInst <dag outs, dag ins, string asm, list<dag> pattern> : Instructio let Pattern = pattern; let Itinerary = NullALU; + let isCodeGenOnly = 1; + let TSFlags{63} = isRegisterLoad; let TSFlags{62} = isRegisterStore; } @@ -71,6 +73,11 @@ def COND_OEQ : PatLeaf < [{return N->get() == ISD::SETOEQ 
|| N->get() == ISD::SETEQ;}] >; +def COND_ONE : PatLeaf < + (cond), + [{return N->get() == ISD::SETONE || N->get() == ISD::SETNE;}] +>; + def COND_OGT : PatLeaf < (cond), [{return N->get() == ISD::SETOGT || N->get() == ISD::SETGT;}] @@ -91,23 +98,28 @@ def COND_OLE : PatLeaf < [{return N->get() == ISD::SETOLE || N->get() == ISD::SETLE;}] >; -def COND_UNE : PatLeaf < - (cond), - [{return N->get() == ISD::SETUNE || N->get() == ISD::SETNE;}] ->; def COND_O : PatLeaf <(cond), [{return N->get() == ISD::SETO;}]>; def COND_UO : PatLeaf <(cond), [{return N->get() == ISD::SETUO;}]>; //===----------------------------------------------------------------------===// -// PatLeafs for unsigned comparisons +// PatLeafs for unsigned / unordered comparisons //===----------------------------------------------------------------------===// +def COND_UEQ : PatLeaf <(cond), [{return N->get() == ISD::SETUEQ;}]>; +def COND_UNE : PatLeaf <(cond), [{return N->get() == ISD::SETUNE;}]>; def COND_UGT : PatLeaf <(cond), [{return N->get() == ISD::SETUGT;}]>; def COND_UGE : PatLeaf <(cond), [{return N->get() == ISD::SETUGE;}]>; def COND_ULT : PatLeaf <(cond), [{return N->get() == ISD::SETULT;}]>; def COND_ULE : PatLeaf <(cond), [{return N->get() == ISD::SETULE;}]>; +// XXX - For some reason R600 version is preferring to use unordered +// for setne? 
+def COND_UNE_NE : PatLeaf < + (cond), + [{return N->get() == ISD::SETUNE || N->get() == ISD::SETNE;}] +>; + //===----------------------------------------------------------------------===// // PatLeafs for signed comparisons //===----------------------------------------------------------------------===// @@ -133,7 +145,7 @@ def COND_NE : PatLeaf < def COND_NULL : PatLeaf < (cond), - [{return false;}] + [{(void)N; return false;}] >; //===----------------------------------------------------------------------===// @@ -195,6 +207,14 @@ def sextloadi8_global : PatFrag<(ops node:$ptr), (sextloadi8 node:$ptr), [{ return isGlobalLoad(dyn_cast<LoadSDNode>(N)); }]>; +def az_extloadi8_flat : PatFrag<(ops node:$ptr), (az_extloadi8 node:$ptr), [{ + return isFlatLoad(dyn_cast<LoadSDNode>(N)); +}]>; + +def sextloadi8_flat : PatFrag<(ops node:$ptr), (sextloadi8 node:$ptr), [{ + return isFlatLoad(dyn_cast<LoadSDNode>(N)); +}]>; + def az_extloadi8_constant : PatFrag<(ops node:$ptr), (az_extloadi8 node:$ptr), [{ return isConstantLoad(dyn_cast<LoadSDNode>(N), -1); }]>; @@ -223,6 +243,14 @@ def sextloadi16_global : PatFrag<(ops node:$ptr), (sextloadi16 node:$ptr), [{ return isGlobalLoad(dyn_cast<LoadSDNode>(N)); }]>; +def az_extloadi16_flat : PatFrag<(ops node:$ptr), (az_extloadi16 node:$ptr), [{ + return isFlatLoad(dyn_cast<LoadSDNode>(N)); +}]>; + +def sextloadi16_flat : PatFrag<(ops node:$ptr), (sextloadi16 node:$ptr), [{ + return isFlatLoad(dyn_cast<LoadSDNode>(N)); +}]>; + def az_extloadi16_constant : PatFrag<(ops node:$ptr), (az_extloadi16 node:$ptr), [{ return isConstantLoad(dyn_cast<LoadSDNode>(N), -1); }]>; @@ -248,6 +276,11 @@ def az_extloadi32_global : PatFrag<(ops node:$ptr), return isGlobalLoad(dyn_cast<LoadSDNode>(N)); }]>; +def az_extloadi32_flat : PatFrag<(ops node:$ptr), + (az_extloadi32 node:$ptr), [{ + return isFlatLoad(dyn_cast<LoadSDNode>(N)); +}]>; + def az_extloadi32_constant : PatFrag<(ops node:$ptr), (az_extloadi32 node:$ptr), [{ return 
isConstantLoad(dyn_cast<LoadSDNode>(N), -1); @@ -263,6 +296,16 @@ def truncstorei16_global : PatFrag<(ops node:$val, node:$ptr), return isGlobalStore(dyn_cast<StoreSDNode>(N)); }]>; +def truncstorei8_flat : PatFrag<(ops node:$val, node:$ptr), + (truncstorei8 node:$val, node:$ptr), [{ + return isFlatStore(dyn_cast<StoreSDNode>(N)); +}]>; + +def truncstorei16_flat : PatFrag<(ops node:$val, node:$ptr), + (truncstorei16 node:$val, node:$ptr), [{ + return isFlatStore(dyn_cast<StoreSDNode>(N)); +}]>; + def local_store : PatFrag<(ops node:$val, node:$ptr), (store node:$val, node:$ptr), [{ return isLocalStore(dyn_cast<StoreSDNode>(N)); @@ -282,6 +325,17 @@ def local_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{ return isLocalLoad(dyn_cast<LoadSDNode>(N)); }]>; +class Aligned8Bytes <dag ops, dag frag> : PatFrag <ops, frag, [{ + return cast<MemSDNode>(N)->getAlignment() % 8 == 0; +}]>; + +def local_load_aligned8bytes : Aligned8Bytes < + (ops node:$ptr), (local_load node:$ptr) +>; + +def local_store_aligned8bytes : Aligned8Bytes < + (ops node:$val, node:$ptr), (local_store node:$val, node:$ptr) +>; class local_binary_atomic_op<SDNode atomic_op> : PatFrag<(ops node:$ptr, node:$value), @@ -307,6 +361,7 @@ def mskor_global : PatFrag<(ops node:$val, node:$ptr), return dyn_cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS; }]>; + def atomic_cmp_swap_32_local : PatFrag<(ops node:$ptr, node:$cmp, node:$swap), (atomic_cmp_swap node:$ptr, node:$cmp, node:$swap), [{ @@ -323,6 +378,45 @@ def atomic_cmp_swap_64_local : AN->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS; }]>; +def flat_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{ + return isFlatLoad(dyn_cast<LoadSDNode>(N)); +}]>; + +def flat_store : PatFrag<(ops node:$val, node:$ptr), + (store node:$val, node:$ptr), [{ + return isFlatStore(dyn_cast<StoreSDNode>(N)); +}]>; + +def mskor_flat : PatFrag<(ops node:$val, node:$ptr), + (AMDGPUstore_mskor node:$val, node:$ptr), [{ + return 
dyn_cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::FLAT_ADDRESS; +}]>; + +class global_binary_atomic_op<SDNode atomic_op> : PatFrag< + (ops node:$ptr, node:$value), + (atomic_op node:$ptr, node:$value), + [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;}] +>; + +def atomic_swap_global : global_binary_atomic_op<atomic_swap>; +def atomic_add_global : global_binary_atomic_op<atomic_load_add>; +def atomic_and_global : global_binary_atomic_op<atomic_load_and>; +def atomic_max_global : global_binary_atomic_op<atomic_load_max>; +def atomic_min_global : global_binary_atomic_op<atomic_load_min>; +def atomic_or_global : global_binary_atomic_op<atomic_load_or>; +def atomic_sub_global : global_binary_atomic_op<atomic_load_sub>; +def atomic_umax_global : global_binary_atomic_op<atomic_load_umax>; +def atomic_umin_global : global_binary_atomic_op<atomic_load_umin>; +def atomic_xor_global : global_binary_atomic_op<atomic_load_xor>; + +//===----------------------------------------------------------------------===// +// Misc Pattern Fragments +//===----------------------------------------------------------------------===// + +def fmad : PatFrag < + (ops node:$src0, node:$src1, node:$src2), + (fadd (fmul node:$src0, node:$src1), node:$src2) +>; class Constants { int TWO_PI = 0x40c90fdb; @@ -442,8 +536,9 @@ class DwordAddrPat<ValueType vt, RegisterClass rc> : Pat < // BFI_INT patterns -multiclass BFIPatterns <Instruction BFI_INT, Instruction LoadImm32> { - +multiclass BFIPatterns <Instruction BFI_INT, + Instruction LoadImm32, + RegisterClass RC64> { // Definition from ISA doc: // (y & x) | (z & ~x) def : Pat < @@ -465,8 +560,8 @@ multiclass BFIPatterns <Instruction BFI_INT, Instruction LoadImm32> { def : Pat < (f64 (fcopysign f64:$src0, f64:$src1)), - (INSERT_SUBREG (INSERT_SUBREG (f64 (IMPLICIT_DEF)), - (i32 (EXTRACT_SUBREG $src0, sub0)), sub0), + (REG_SEQUENCE RC64, + (i32 (EXTRACT_SUBREG $src0, sub0)), sub0, (BFI_INT (LoadImm32 0x7fffffff), (i32 
(EXTRACT_SUBREG $src0, sub1)), (i32 (EXTRACT_SUBREG $src1, sub1))), sub1) diff --git a/lib/Target/R600/AMDGPUIntrinsicInfo.cpp b/lib/Target/R600/AMDGPUIntrinsicInfo.cpp index 58916a995496..e94bb6013d83 100644 --- a/lib/Target/R600/AMDGPUIntrinsicInfo.cpp +++ b/lib/Target/R600/AMDGPUIntrinsicInfo.cpp @@ -24,7 +24,7 @@ using namespace llvm; #include "AMDGPUGenIntrinsics.inc" #undef GET_LLVM_INTRINSIC_FOR_GCC_BUILTIN -AMDGPUIntrinsicInfo::AMDGPUIntrinsicInfo(TargetMachine *tm) +AMDGPUIntrinsicInfo::AMDGPUIntrinsicInfo() : TargetIntrinsicInfo() {} std::string AMDGPUIntrinsicInfo::getName(unsigned IntrID, Type **Tys, diff --git a/lib/Target/R600/AMDGPUIntrinsicInfo.h b/lib/Target/R600/AMDGPUIntrinsicInfo.h index 5be68a217da5..4c95b5ec0974 100644 --- a/lib/Target/R600/AMDGPUIntrinsicInfo.h +++ b/lib/Target/R600/AMDGPUIntrinsicInfo.h @@ -11,8 +11,8 @@ /// \brief Interface for the AMDGPU Implementation of the Intrinsic Info class. // //===-----------------------------------------------------------------------===// -#ifndef AMDGPU_INTRINSICINFO_H -#define AMDGPU_INTRINSICINFO_H +#ifndef LLVM_LIB_TARGET_R600_AMDGPUINTRINSICINFO_H +#define LLVM_LIB_TARGET_R600_AMDGPUINTRINSICINFO_H #include "llvm/IR/Intrinsics.h" #include "llvm/Target/TargetIntrinsicInfo.h" @@ -33,7 +33,7 @@ enum ID { class AMDGPUIntrinsicInfo : public TargetIntrinsicInfo { public: - AMDGPUIntrinsicInfo(TargetMachine *tm); + AMDGPUIntrinsicInfo(); std::string getName(unsigned IntrId, Type **Tys = nullptr, unsigned numTys = 0) const override; unsigned lookupName(const char *Name, unsigned Len) const override; @@ -45,4 +45,4 @@ public: } // end namespace llvm -#endif // AMDGPU_INTRINSICINFO_H +#endif diff --git a/lib/Target/R600/AMDGPUMCInstLower.cpp b/lib/Target/R600/AMDGPUMCInstLower.cpp index ce5c41ceb267..1995ef2b0c9e 100644 --- a/lib/Target/R600/AMDGPUMCInstLower.cpp +++ b/lib/Target/R600/AMDGPUMCInstLower.cpp @@ -40,8 +40,13 @@ AMDGPUMCInstLower::AMDGPUMCInstLower(MCContext &ctx, const AMDGPUSubtarget 
&st): { } enum AMDGPUMCInstLower::SISubtarget -AMDGPUMCInstLower::AMDGPUSubtargetToSISubtarget(unsigned) const { - return AMDGPUMCInstLower::SI; +AMDGPUMCInstLower::AMDGPUSubtargetToSISubtarget(unsigned Gen) const { + switch (Gen) { + default: + return AMDGPUMCInstLower::SI; + case AMDGPUSubtarget::VOLCANIC_ISLANDS: + return AMDGPUMCInstLower::VI; + } } unsigned AMDGPUMCInstLower::getMCOpcode(unsigned MIOpcode) const { @@ -63,13 +68,6 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const { switch (MO.getType()) { default: llvm_unreachable("unknown operand type"); - case MachineOperand::MO_FPImmediate: { - const APFloat &FloatValue = MO.getFPImm()->getValueAPF(); - assert(&FloatValue.getSemantics() == &APFloat::IEEEsingle && - "Only floating point immediates are supported at the moment."); - MCOp = MCOperand::CreateFPImm(FloatValue.convertToFloat()); - break; - } case MachineOperand::MO_Immediate: MCOp = MCOperand::CreateImm(MO.getImm()); break; @@ -104,7 +102,7 @@ void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) { #ifdef _DEBUG StringRef Err; - if (!TM.getInstrInfo()->verifyInstruction(MI, Err)) { + if (!TM.getSubtargetImpl()->getInstrInfo()->verifyInstruction(MI, Err)) { errs() << "Warning: Illegal instruction detected: " << Err << "\n"; MI->dump(); } @@ -128,8 +126,9 @@ void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) { std::string &DisasmLine = DisasmLines.back(); raw_string_ostream DisasmStream(DisasmLine); - AMDGPUInstPrinter InstPrinter(*TM.getMCAsmInfo(), *TM.getInstrInfo(), - *TM.getRegisterInfo()); + AMDGPUInstPrinter InstPrinter(*TM.getMCAsmInfo(), + *TM.getSubtargetImpl()->getInstrInfo(), + *TM.getSubtargetImpl()->getRegisterInfo()); InstPrinter.printInst(&TmpInst, DisasmStream, StringRef()); // Disassemble instruction/operands to hex representation. 
diff --git a/lib/Target/R600/AMDGPUMCInstLower.h b/lib/Target/R600/AMDGPUMCInstLower.h index 58fe34d32d31..0ae4d11bf1d1 100644 --- a/lib/Target/R600/AMDGPUMCInstLower.h +++ b/lib/Target/R600/AMDGPUMCInstLower.h @@ -8,8 +8,8 @@ /// \file //===----------------------------------------------------------------------===// -#ifndef AMDGPU_MCINSTLOWER_H -#define AMDGPU_MCINSTLOWER_H +#ifndef LLVM_LIB_TARGET_R600_AMDGPUMCINSTLOWER_H +#define LLVM_LIB_TARGET_R600_AMDGPUMCINSTLOWER_H namespace llvm { @@ -22,7 +22,8 @@ class AMDGPUMCInstLower { // This must be kept in sync with the SISubtarget class in SIInstrInfo.td enum SISubtarget { - SI = 0 + SI = 0, + VI = 1 }; MCContext &Ctx; @@ -45,4 +46,4 @@ public: } // End namespace llvm -#endif //AMDGPU_MCINSTLOWER_H +#endif diff --git a/lib/Target/R600/AMDGPUMachineFunction.cpp b/lib/Target/R600/AMDGPUMachineFunction.cpp index 90af80113ece..0f3f9e26528b 100644 --- a/lib/Target/R600/AMDGPUMachineFunction.cpp +++ b/lib/Target/R600/AMDGPUMachineFunction.cpp @@ -12,7 +12,9 @@ void AMDGPUMachineFunction::anchor() {} AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) : MachineFunctionInfo(), ShaderType(ShaderType::COMPUTE), - LDSSize(0) { + LDSSize(0), + ScratchSize(0), + IsKernel(true) { AttributeSet Set = MF.getFunction()->getAttributes(); Attribute A = Set.getAttribute(AttributeSet::FunctionIndex, ShaderTypeAttribute); diff --git a/lib/Target/R600/AMDGPUMachineFunction.h b/lib/Target/R600/AMDGPUMachineFunction.h index 0854d588eeba..f5e4694e76f6 100644 --- a/lib/Target/R600/AMDGPUMachineFunction.h +++ b/lib/Target/R600/AMDGPUMachineFunction.h @@ -10,8 +10,8 @@ /// \file //===----------------------------------------------------------------------===// -#ifndef AMDGPUMACHINEFUNCTION_H -#define AMDGPUMACHINEFUNCTION_H +#ifndef LLVM_LIB_TARGET_R600_AMDGPUMACHINEFUNCTION_H +#define LLVM_LIB_TARGET_R600_AMDGPUMACHINEFUNCTION_H #include "llvm/CodeGen/MachineFunction.h" #include <map> @@ -30,10 +30,16 @@ public: /// Number 
of bytes in the LDS that are being used. unsigned LDSSize; + /// Start of implicit kernel args + unsigned ABIArgOffset; + unsigned getShaderType() const { return ShaderType; } + + unsigned ScratchSize; + bool IsKernel; }; } -#endif // AMDGPUMACHINEFUNCTION_H +#endif diff --git a/lib/Target/R600/AMDGPUPromoteAlloca.cpp b/lib/Target/R600/AMDGPUPromoteAlloca.cpp index 218750d445e6..b81fef47d55a 100644 --- a/lib/Target/R600/AMDGPUPromoteAlloca.cpp +++ b/lib/Target/R600/AMDGPUPromoteAlloca.cpp @@ -36,11 +36,9 @@ class AMDGPUPromoteAlloca : public FunctionPass, public: AMDGPUPromoteAlloca(const AMDGPUSubtarget &st) : FunctionPass(ID), ST(st), LocalMemAvailable(0) { } - virtual bool doInitialization(Module &M); - virtual bool runOnFunction(Function &F); - virtual const char *getPassName() const { - return "AMDGPU Promote Alloca"; - } + bool doInitialization(Module &M) override; + bool runOnFunction(Function &F) override; + const char *getPassName() const override { return "AMDGPU Promote Alloca"; } void visitAlloca(AllocaInst &I); }; @@ -107,14 +105,16 @@ static VectorType *arrayTypeToVecType(const Type *ArrayTy) { ArrayTy->getArrayNumElements()); } -static Value* calculateVectorIndex(Value *Ptr, - std::map<GetElementPtrInst*, Value*> GEPIdx) { +static Value * +calculateVectorIndex(Value *Ptr, + const std::map<GetElementPtrInst *, Value *> &GEPIdx) { if (isa<AllocaInst>(Ptr)) return Constant::getNullValue(Type::getInt32Ty(Ptr->getContext())); GetElementPtrInst *GEP = cast<GetElementPtrInst>(Ptr); - return GEPIdx[GEP]; + auto I = GEPIdx.find(GEP); + return I == GEPIdx.end() ? 
nullptr : I->second; } static Value* GEPToVectorIndex(GetElementPtrInst *GEP) { @@ -234,7 +234,8 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca) { return true; } -static void collectUsesWithPtrTypes(Value *Val, std::vector<Value*> &WorkList) { +static bool collectUsesWithPtrTypes(Value *Val, std::vector<Value*> &WorkList) { + bool Success = true; for (User *User : Val->users()) { if(std::find(WorkList.begin(), WorkList.end(), User) != WorkList.end()) continue; @@ -242,11 +243,20 @@ static void collectUsesWithPtrTypes(Value *Val, std::vector<Value*> &WorkList) { WorkList.push_back(User); continue; } + + // FIXME: Correctly handle ptrtoint instructions. + Instruction *UseInst = dyn_cast<Instruction>(User); + if (UseInst && UseInst->getOpcode() == Instruction::PtrToInt) + return false; + if (!User->getType()->isPointerTy()) continue; + WorkList.push_back(User); - collectUsesWithPtrTypes(User, WorkList); + + Success &= collectUsesWithPtrTypes(User, WorkList); } + return Success; } void AMDGPUPromoteAlloca::visitAlloca(AllocaInst &I) { @@ -274,6 +284,13 @@ void AMDGPUPromoteAlloca::visitAlloca(AllocaInst &I) { return; } + std::vector<Value*> WorkList; + + if (!collectUsesWithPtrTypes(&I, WorkList)) { + DEBUG(dbgs() << " Do not know how to convert all uses\n"); + return; + } + DEBUG(dbgs() << "Promoting alloca to local memory\n"); LocalMemAvailable -= AllocaSize; @@ -320,10 +337,6 @@ void AMDGPUPromoteAlloca::visitAlloca(AllocaInst &I) { I.replaceAllUsesWith(Offset); I.eraseFromParent(); - std::vector<Value*> WorkList; - - collectUsesWithPtrTypes(Offset, WorkList); - for (std::vector<Value*>::iterator i = WorkList.begin(), e = WorkList.end(); i != e; ++i) { Value *V = *i; @@ -331,6 +344,13 @@ void AMDGPUPromoteAlloca::visitAlloca(AllocaInst &I) { if (!Call) { Type *EltTy = V->getType()->getPointerElementType(); PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS); + + // The operand's value should be corrected on its own. 
+ if (isa<AddrSpaceCastInst>(V)) + continue; + + // FIXME: It doesn't really make sense to try to do this for all + // instructions. V->mutateType(NewTy); continue; } diff --git a/lib/Target/R600/AMDGPURegisterInfo.cpp b/lib/Target/R600/AMDGPURegisterInfo.cpp index 34332808f865..57b054bc2a61 100644 --- a/lib/Target/R600/AMDGPURegisterInfo.cpp +++ b/lib/Target/R600/AMDGPURegisterInfo.cpp @@ -42,8 +42,7 @@ void AMDGPURegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, } unsigned AMDGPURegisterInfo::getFrameRegister(const MachineFunction &MF) const { - assert(!"Subroutines not supported yet"); - return 0; + return AMDGPU::NoRegister; } unsigned AMDGPURegisterInfo::getSubRegFromChannel(unsigned Channel) const { diff --git a/lib/Target/R600/AMDGPURegisterInfo.h b/lib/Target/R600/AMDGPURegisterInfo.h index 46aa7a17dfca..f27576ab9736 100644 --- a/lib/Target/R600/AMDGPURegisterInfo.h +++ b/lib/Target/R600/AMDGPURegisterInfo.h @@ -13,8 +13,8 @@ // //===----------------------------------------------------------------------===// -#ifndef AMDGPUREGISTERINFO_H -#define AMDGPUREGISTERINFO_H +#ifndef LLVM_LIB_TARGET_R600_AMDGPUREGISTERINFO_H +#define LLVM_LIB_TARGET_R600_AMDGPUREGISTERINFO_H #include "llvm/ADT/BitVector.h" #include "llvm/Target/TargetRegisterInfo.h" @@ -51,7 +51,7 @@ struct AMDGPURegisterInfo : public AMDGPUGenRegisterInfo { unsigned getSubRegFromChannel(unsigned Channel) const; const MCPhysReg* getCalleeSavedRegs(const MachineFunction *MF) const override; - virtual void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj, + void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj, unsigned FIOperandNum, RegScavenger *RS) const override; unsigned getFrameRegister(const MachineFunction &MF) const override; @@ -62,4 +62,4 @@ struct AMDGPURegisterInfo : public AMDGPUGenRegisterInfo { } // End namespace llvm -#endif // AMDIDSAREGISTERINFO_H +#endif diff --git a/lib/Target/R600/AMDGPUSubtarget.cpp b/lib/Target/R600/AMDGPUSubtarget.cpp 
index e3c2a50ab828..597e558e6634 100644 --- a/lib/Target/R600/AMDGPUSubtarget.cpp +++ b/lib/Target/R600/AMDGPUSubtarget.cpp @@ -13,12 +13,13 @@ //===----------------------------------------------------------------------===// #include "AMDGPUSubtarget.h" +#include "R600ISelLowering.h" #include "R600InstrInfo.h" +#include "R600MachineScheduler.h" +#include "SIISelLowering.h" #include "SIInstrInfo.h" #include "llvm/ADT/SmallString.h" -#include "llvm/ADT/SmallString.h" - using namespace llvm; #define DEBUG_TYPE "amdgpu-subtarget" @@ -28,26 +29,23 @@ using namespace llvm; #define GET_SUBTARGETINFO_CTOR #include "AMDGPUGenSubtargetInfo.inc" -AMDGPUSubtarget::AMDGPUSubtarget(StringRef TT, StringRef GPU, StringRef FS) : - AMDGPUGenSubtargetInfo(TT, GPU, FS), - DevName(GPU), - Is64bit(false), - DumpCode(false), - R600ALUInst(false), - HasVertexCache(false), - TexVTXClauseSize(0), - Gen(AMDGPUSubtarget::R600), - FP64(false), - FP64Denormals(false), - FP32Denormals(false), - CaymanISA(false), - EnableIRStructurizer(true), - EnablePromoteAlloca(false), - EnableIfCvt(true), - WavefrontSize(0), - CFALUBug(false), - LocalMemorySize(0), - InstrItins(getInstrItineraryForCPU(GPU)) { +static std::string computeDataLayout(const AMDGPUSubtarget &ST) { + std::string Ret = "e-p:32:32"; + + if (ST.is64bit()) { + // 32-bit private, local, and region pointers. 64-bit global and constant. + Ret += "-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64"; + } + + Ret += "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256" + "-v512:512-v1024:1024-v2048:2048-n32:64"; + + return Ret; +} + +AMDGPUSubtarget & +AMDGPUSubtarget::initializeSubtargetDependencies(StringRef GPU, StringRef FS) { + // Determine default and user-specified characteristics // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be // enabled, but some instructions do not respect them and they run at the // double precision rate, so don't enable by default. 
@@ -61,16 +59,37 @@ AMDGPUSubtarget::AMDGPUSubtarget(StringRef TT, StringRef GPU, StringRef FS) : ParseSubtargetFeatures(GPU, FullFS); + // FIXME: I don't think Evergreen has any useful support for + // denormals, but should be checked. Should we issue a warning somewhere + // if someone tries to enable these? if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) { - InstrInfo.reset(new R600InstrInfo(*this)); - - // FIXME: I don't think think Evergreen has any useful support for - // denormals, but should be checked. Should we issue a warning somewhere if - // someone tries to enable these? FP32Denormals = false; FP64Denormals = false; + } + return *this; +} + +AMDGPUSubtarget::AMDGPUSubtarget(StringRef TT, StringRef GPU, StringRef FS, + TargetMachine &TM) + : AMDGPUGenSubtargetInfo(TT, GPU, FS), DevName(GPU), Is64bit(false), + DumpCode(false), R600ALUInst(false), HasVertexCache(false), + TexVTXClauseSize(0), Gen(AMDGPUSubtarget::R600), FP64(false), + FP64Denormals(false), FP32Denormals(false), CaymanISA(false), + FlatAddressSpace(false), EnableIRStructurizer(true), + EnablePromoteAlloca(false), EnableIfCvt(true), + EnableLoadStoreOpt(false), WavefrontSize(0), CFALUBug(false), LocalMemorySize(0), + DL(computeDataLayout(initializeSubtargetDependencies(GPU, FS))), + FrameLowering(TargetFrameLowering::StackGrowsUp, + 64 * 16, // Maximum stack alignment (long16) + 0), + InstrItins(getInstrItineraryForCPU(GPU)), + TargetTriple(TT) { + if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) { + InstrInfo.reset(new R600InstrInfo(*this)); + TLInfo.reset(new R600TargetLowering(TM)); } else { InstrInfo.reset(new SIInstrInfo(*this)); + TLInfo.reset(new SITargetLowering(TM)); } } @@ -87,3 +106,10 @@ unsigned AMDGPUSubtarget::getStackEntrySize() const { llvm_unreachable("Illegal wavefront size."); } } + +unsigned AMDGPUSubtarget::getAmdKernelCodeChipID() const { + switch(getGeneration()) { + default: llvm_unreachable("ChipID unknown"); + case SEA_ISLANDS: return 12; 
+ } +} diff --git a/lib/Target/R600/AMDGPUSubtarget.h b/lib/Target/R600/AMDGPUSubtarget.h index a844b37b6be5..90179d79d25d 100644 --- a/lib/Target/R600/AMDGPUSubtarget.h +++ b/lib/Target/R600/AMDGPUSubtarget.h @@ -12,25 +12,26 @@ // //===----------------------------------------------------------------------===// -#ifndef AMDGPUSUBTARGET_H -#define AMDGPUSUBTARGET_H +#ifndef LLVM_LIB_TARGET_R600_AMDGPUSUBTARGET_H +#define LLVM_LIB_TARGET_R600_AMDGPUSUBTARGET_H #include "AMDGPU.h" +#include "AMDGPUFrameLowering.h" #include "AMDGPUInstrInfo.h" +#include "AMDGPUIntrinsicInfo.h" +#include "AMDGPUSubtarget.h" +#include "R600ISelLowering.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" +#include "llvm/IR/DataLayout.h" #include "llvm/Target/TargetSubtargetInfo.h" #define GET_SUBTARGETINFO_HEADER #include "AMDGPUGenSubtargetInfo.inc" -#define MAX_CB_SIZE (1 << 16) - namespace llvm { class AMDGPUSubtarget : public AMDGPUGenSubtargetInfo { - std::unique_ptr<AMDGPUInstrInfo> InstrInfo; - public: enum Generation { R600 = 0, @@ -38,7 +39,8 @@ public: EVERGREEN, NORTHERN_ISLANDS, SOUTHERN_ISLANDS, - SEA_ISLANDS + SEA_ISLANDS, + VOLCANIC_ISLANDS, }; private: @@ -53,24 +55,41 @@ private: bool FP64Denormals; bool FP32Denormals; bool CaymanISA; + bool FlatAddressSpace; bool EnableIRStructurizer; bool EnablePromoteAlloca; bool EnableIfCvt; + bool EnableLoadStoreOpt; unsigned WavefrontSize; bool CFALUBug; int LocalMemorySize; + const DataLayout DL; + AMDGPUFrameLowering FrameLowering; + std::unique_ptr<AMDGPUTargetLowering> TLInfo; + std::unique_ptr<AMDGPUInstrInfo> InstrInfo; InstrItineraryData InstrItins; + Triple TargetTriple; public: - AMDGPUSubtarget(StringRef TT, StringRef CPU, StringRef FS); + AMDGPUSubtarget(StringRef TT, StringRef CPU, StringRef FS, TargetMachine &TM); + AMDGPUSubtarget &initializeSubtargetDependencies(StringRef GPU, StringRef FS); - const AMDGPUInstrInfo *getInstrInfo() const { + const AMDGPUFrameLowering *getFrameLowering() const 
override { + return &FrameLowering; + } + const AMDGPUInstrInfo *getInstrInfo() const override { return InstrInfo.get(); } - - const InstrItineraryData &getInstrItineraryData() const { - return InstrItins; + const AMDGPURegisterInfo *getRegisterInfo() const override { + return &InstrInfo->getRegisterInfo(); + } + AMDGPUTargetLowering *getTargetLowering() const override { + return TLInfo.get(); + } + const DataLayout *getDataLayout() const override { return &DL; } + const InstrItineraryData *getInstrItineraryData() const override { + return &InstrItins; } void ParseSubtargetFeatures(StringRef CPU, StringRef FS); @@ -107,6 +126,10 @@ public: return FP64Denormals; } + bool hasFlatAddressSpace() const { + return FlatAddressSpace; + } + bool hasBFE() const { return (getGeneration() >= EVERGREEN); } @@ -158,6 +181,10 @@ public: return EnableIfCvt; } + bool loadStoreOptEnabled() const { + return EnableLoadStoreOpt; + } + unsigned getWavefrontSize() const { return WavefrontSize; } @@ -173,6 +200,8 @@ public: return LocalMemorySize; } + unsigned getAmdKernelCodeChipID() const; + bool enableMachineScheduler() const override { return getGeneration() <= NORTHERN_ISLANDS; } @@ -192,8 +221,11 @@ public: bool r600ALUEncoding() const { return R600ALUInst; } + bool isAmdHsaOS() const { + return TargetTriple.getOS() == Triple::AMDHSA; + } }; } // End namespace llvm -#endif // AMDGPUSUBTARGET_H +#endif diff --git a/lib/Target/R600/AMDGPUTargetMachine.cpp b/lib/Target/R600/AMDGPUTargetMachine.cpp index 56ba719e6863..a1da7172d53f 100644 --- a/lib/Target/R600/AMDGPUTargetMachine.cpp +++ b/lib/Target/R600/AMDGPUTargetMachine.cpp @@ -22,6 +22,7 @@ #include "SIInstrInfo.h" #include "llvm/Analysis/Passes.h" #include "llvm/CodeGen/MachineFunctionAnalysis.h" +#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/IR/Verifier.h" @@ -38,6 +39,7 @@ using namespace llvm; extern "C" void 
LLVMInitializeR600Target() { // Register the target RegisterTargetMachine<AMDGPUTargetMachine> X(TheAMDGPUTarget); + RegisterTargetMachine<GCNTargetMachine> Y(TheGCNTarget); } static ScheduleDAGInstrs *createR600MachineScheduler(MachineSchedContext *C) { @@ -48,46 +50,20 @@ static MachineSchedRegistry SchedCustomRegistry("r600", "Run R600's custom scheduler", createR600MachineScheduler); -static std::string computeDataLayout(const AMDGPUSubtarget &ST) { - std::string Ret = "e-p:32:32"; - - if (ST.is64bit()) { - // 32-bit local, and region pointers. 64-bit private, global, and constant. - Ret += "-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64"; - } - - Ret += "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256" - "-v512:512-v1024:1024-v2048:2048-n32:64"; - - return Ret; -} - AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, - TargetOptions Options, - Reloc::Model RM, CodeModel::Model CM, - CodeGenOpt::Level OptLevel -) -: - LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OptLevel), - Subtarget(TT, CPU, FS), - Layout(computeDataLayout(Subtarget)), - FrameLowering(TargetFrameLowering::StackGrowsUp, - 64 * 16 // Maximum stack alignment (long16) - , 0), - IntrinsicInfo(this), - InstrItins(&Subtarget.getInstrItineraryData()) { - // TLInfo uses InstrInfo so it must be initialized after. 
- if (Subtarget.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) { - TLInfo.reset(new R600TargetLowering(*this)); - } else { - TLInfo.reset(new SITargetLowering(*this)); - } + StringRef CPU, StringRef FS, + TargetOptions Options, Reloc::Model RM, + CodeModel::Model CM, + CodeGenOpt::Level OptLevel) + : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OptLevel), + TLOF(new TargetLoweringObjectFileELF()), + Subtarget(TT, CPU, FS, *this), IntrinsicInfo() { setRequiresStructuredCFG(true); initAsmInfo(); } AMDGPUTargetMachine::~AMDGPUTargetMachine() { + delete TLOF; } namespace { @@ -108,13 +84,14 @@ public: return nullptr; } - virtual void addCodeGenPrepare(); + void addIRPasses() override; + void addCodeGenPrepare() override; bool addPreISel() override; bool addInstSelector() override; - bool addPreRegAlloc() override; - bool addPostRegAlloc() override; - bool addPreSched2() override; - bool addPreEmitPass() override; + void addPreRegAlloc() override; + void addPostRegAlloc() override; + void addPreSched2() override; + void addPreEmitPass() override; }; } // End of anonymous namespace @@ -134,6 +111,19 @@ void AMDGPUTargetMachine::addAnalysisPasses(PassManagerBase &PM) { PM.add(createAMDGPUTargetTransformInfoPass(this)); } +void AMDGPUPassConfig::addIRPasses() { + // Function calls are not supported, so make sure we inline everything. + addPass(createAMDGPUAlwaysInlinePass()); + addPass(createAlwaysInlinerPass()); + // We need to add the barrier noop pass, otherwise adding the function + // inlining pass will cause all of the PassConfigs passes to be run + // one function at a time, which means if we have a nodule with two + // functions, then we will generate code for the first function + // without ever running any passes on the second. 
+ addPass(createBarrierNoopPass()); + TargetPassConfig::addIRPasses(); +} + void AMDGPUPassConfig::addCodeGenPrepare() { const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(); if (ST.isPromoteAllocaEnabled()) { @@ -161,61 +151,82 @@ AMDGPUPassConfig::addPreISel() { } bool AMDGPUPassConfig::addInstSelector() { + const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(); + addPass(createAMDGPUISelDag(getAMDGPUTargetMachine())); - addPass(createSILowerI1CopiesPass()); + + if (ST.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) { + addPass(createSILowerI1CopiesPass()); + addPass(createSIFixSGPRCopiesPass(*TM)); + addPass(createSIFoldOperandsPass()); + } + return false; } -bool AMDGPUPassConfig::addPreRegAlloc() { +void AMDGPUPassConfig::addPreRegAlloc() { const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(); if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) { addPass(createR600VectorRegMerger(*TM)); } else { - addPass(createSIFixSGPRCopiesPass(*TM)); - // SIFixSGPRCopies can generate a lot of duplicate instructions, - // so we need to run MachineCSE afterwards. - addPass(&MachineCSEID); - addPass(createSIShrinkInstructionsPass()); - initializeSIFixSGPRLiveRangesPass(*PassRegistry::getPassRegistry()); - insertPass(&RegisterCoalescerID, &SIFixSGPRLiveRangesID); + if (getOptLevel() > CodeGenOpt::None && ST.loadStoreOptEnabled()) { + // Don't do this with no optimizations since it throws away debug info by + // merging nonadjacent loads. + + // This should be run after scheduling, but before register allocation. It + // also need extra copies to the address operand to be eliminated. 
+ initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry()); + insertPass(&MachineSchedulerID, &SILoadStoreOptimizerID); + } + + addPass(createSIShrinkInstructionsPass(), false); + addPass(createSIFixSGPRLiveRangesPass(), false); } - return false; } -bool AMDGPUPassConfig::addPostRegAlloc() { +void AMDGPUPassConfig::addPostRegAlloc() { const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(); - addPass(createSIShrinkInstructionsPass()); if (ST.getGeneration() > AMDGPUSubtarget::NORTHERN_ISLANDS) { - addPass(createSIInsertWaits(*TM)); + addPass(createSIPrepareScratchRegs(), false); + addPass(createSIShrinkInstructionsPass(), false); } - return false; } -bool AMDGPUPassConfig::addPreSched2() { +void AMDGPUPassConfig::addPreSched2() { const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(); if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) - addPass(createR600EmitClauseMarkers()); + addPass(createR600EmitClauseMarkers(), false); if (ST.isIfCvtEnabled()) - addPass(&IfConverterID); + addPass(&IfConverterID, false); if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) - addPass(createR600ClauseMergePass(*TM)); - return false; + addPass(createR600ClauseMergePass(*TM), false); + if (ST.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) { + addPass(createSIInsertWaits(*TM), false); + } } -bool AMDGPUPassConfig::addPreEmitPass() { +void AMDGPUPassConfig::addPreEmitPass() { const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(); if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) { - addPass(createAMDGPUCFGStructurizerPass()); - addPass(createR600ExpandSpecialInstrsPass(*TM)); - addPass(&FinalizeMachineBundlesID); - addPass(createR600Packetizer(*TM)); - addPass(createR600ControlFlowFinalizer(*TM)); + addPass(createAMDGPUCFGStructurizerPass(), false); + addPass(createR600ExpandSpecialInstrsPass(*TM), false); + addPass(&FinalizeMachineBundlesID, false); + addPass(createR600Packetizer(*TM), false); + 
addPass(createR600ControlFlowFinalizer(*TM), false); } else { - addPass(createSILowerControlFlowPass(*TM)); + addPass(createSILowerControlFlowPass(*TM), false); } - - return false; } + + +//===----------------------------------------------------------------------===// +// GCN Target Machine (SI+) +//===----------------------------------------------------------------------===// + +GCNTargetMachine::GCNTargetMachine(const Target &T, StringRef TT, StringRef FS, + StringRef CPU, TargetOptions Options, Reloc::Model RM, + CodeModel::Model CM, CodeGenOpt::Level OL) : + AMDGPUTargetMachine(T, TT, FS, CPU, Options, RM, CM, OL) { } diff --git a/lib/Target/R600/AMDGPUTargetMachine.h b/lib/Target/R600/AMDGPUTargetMachine.h index 3bb15beb6bf1..66b30700d883 100644 --- a/lib/Target/R600/AMDGPUTargetMachine.h +++ b/lib/Target/R600/AMDGPUTargetMachine.h @@ -12,8 +12,8 @@ // //===----------------------------------------------------------------------===// -#ifndef AMDGPU_TARGET_MACHINE_H -#define AMDGPU_TARGET_MACHINE_H +#ifndef LLVM_LIB_TARGET_R600_AMDGPUTARGETMACHINE_H +#define LLVM_LIB_TARGET_R600_AMDGPUTARGETMACHINE_H #include "AMDGPUFrameLowering.h" #include "AMDGPUInstrInfo.h" @@ -24,48 +24,48 @@ namespace llvm { -class AMDGPUTargetMachine : public LLVMTargetMachine { +//===----------------------------------------------------------------------===// +// AMDGPU Target Machine (R600+) +//===----------------------------------------------------------------------===// +class AMDGPUTargetMachine : public LLVMTargetMachine { +protected: + TargetLoweringObjectFile *TLOF; AMDGPUSubtarget Subtarget; - const DataLayout Layout; - AMDGPUFrameLowering FrameLowering; AMDGPUIntrinsicInfo IntrinsicInfo; - std::unique_ptr<AMDGPUTargetLowering> TLInfo; - const InstrItineraryData *InstrItins; public: AMDGPUTargetMachine(const Target &T, StringRef TT, StringRef FS, StringRef CPU, TargetOptions Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL); ~AMDGPUTargetMachine(); - const 
AMDGPUFrameLowering *getFrameLowering() const override { - return &FrameLowering; - } - const AMDGPUIntrinsicInfo *getIntrinsicInfo() const override { - return &IntrinsicInfo; - } - const AMDGPUInstrInfo *getInstrInfo() const override { - return getSubtargetImpl()->getInstrInfo(); - } const AMDGPUSubtarget *getSubtargetImpl() const override { return &Subtarget; } - const AMDGPURegisterInfo *getRegisterInfo() const override { - return &getInstrInfo()->getRegisterInfo(); - } - AMDGPUTargetLowering *getTargetLowering() const override { - return TLInfo.get(); - } - const InstrItineraryData *getInstrItineraryData() const override { - return InstrItins; + const AMDGPUIntrinsicInfo *getIntrinsicInfo() const override { + return &IntrinsicInfo; } - const DataLayout *getDataLayout() const override { return &Layout; } TargetPassConfig *createPassConfig(PassManagerBase &PM) override; /// \brief Register R600 analysis passes with a pass manager. void addAnalysisPasses(PassManagerBase &PM) override; + TargetLoweringObjectFile *getObjFileLowering() const override { + return TLOF; + } +}; + +//===----------------------------------------------------------------------===// +// GCN Target Machine (SI+) +//===----------------------------------------------------------------------===// + +class GCNTargetMachine : public AMDGPUTargetMachine { + +public: + GCNTargetMachine(const Target &T, StringRef TT, StringRef FS, + StringRef CPU, TargetOptions Options, Reloc::Model RM, + CodeModel::Model CM, CodeGenOpt::Level OL); }; } // End namespace llvm -#endif // AMDGPU_TARGET_MACHINE_H +#endif diff --git a/lib/Target/R600/AMDGPUTargetTransformInfo.cpp b/lib/Target/R600/AMDGPUTargetTransformInfo.cpp index 88934b65876e..e7bc00635f75 100644 --- a/lib/Target/R600/AMDGPUTargetTransformInfo.cpp +++ b/lib/Target/R600/AMDGPUTargetTransformInfo.cpp @@ -52,7 +52,7 @@ public: AMDGPUTTI(const AMDGPUTargetMachine *TM) : ImmutablePass(ID), TM(TM), ST(TM->getSubtargetImpl()), - TLI(TM->getTargetLowering()) { + 
TLI(TM->getSubtargetImpl()->getTargetLowering()) { initializeAMDGPUTTIPass(*PassRegistry::getPassRegistry()); } @@ -74,16 +74,14 @@ public: bool hasBranchDivergence() const override; - void getUnrollingPreferences(Loop *L, + void getUnrollingPreferences(const Function *F, Loop *L, UnrollingPreferences &UP) const override; PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) const override; unsigned getNumberOfRegisters(bool Vector) const override; unsigned getRegisterBitWidth(bool Vector) const override; - unsigned getMaximumUnrollFactor() const override; - - /// @} + unsigned getMaxInterleaveFactor() const override; }; } // end anonymous namespace @@ -99,8 +97,14 @@ llvm::createAMDGPUTargetTransformInfoPass(const AMDGPUTargetMachine *TM) { bool AMDGPUTTI::hasBranchDivergence() const { return true; } -void AMDGPUTTI::getUnrollingPreferences(Loop *L, +void AMDGPUTTI::getUnrollingPreferences(const Function *, Loop *L, UnrollingPreferences &UP) const { + UP.Threshold = 300; // Twice the default. + UP.Count = UINT_MAX; + UP.Partial = true; + + // TODO: Do we want runtime unrolling? + for (const BasicBlock *BB : L->getBlocks()) { for (const Instruction &I : *BB) { const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I); @@ -120,7 +124,7 @@ void AMDGPUTTI::getUnrollingPreferences(Loop *L, // // Don't use the maximum allowed value here as it will make some // programs way too big. - UP.Threshold = 500; + UP.Threshold = 800; } } } @@ -147,7 +151,7 @@ unsigned AMDGPUTTI::getRegisterBitWidth(bool) const { return 32; } -unsigned AMDGPUTTI::getMaximumUnrollFactor() const { +unsigned AMDGPUTTI::getMaxInterleaveFactor() const { // Semi-arbitrary large amount. 
return 64; } diff --git a/lib/Target/R600/AMDILCFGStructurizer.cpp b/lib/Target/R600/AMDILCFGStructurizer.cpp index f3a03914391c..ee6e8ecfb29d 100644 --- a/lib/Target/R600/AMDILCFGStructurizer.cpp +++ b/lib/Target/R600/AMDILCFGStructurizer.cpp @@ -11,6 +11,7 @@ #include "AMDGPU.h" #include "AMDGPUInstrInfo.h" #include "R600InstrInfo.h" +#include "AMDGPUSubtarget.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/SCCIterator.h" #include "llvm/ADT/SmallVector.h" @@ -160,7 +161,7 @@ public: bool prepare(); bool runOnMachineFunction(MachineFunction &MF) override { - TII = static_cast<const R600InstrInfo *>(MF.getTarget().getInstrInfo()); + TII = static_cast<const R600InstrInfo *>(MF.getSubtarget().getInstrInfo()); TRI = &TII->getRegisterInfo(); DEBUG(MF.dump();); OrderedBlks.clear(); @@ -337,7 +338,7 @@ protected: void setLoopLandBlock(MachineLoop *LoopRep, MachineBasicBlock *MBB = nullptr); MachineBasicBlock *findNearestCommonPostDom(std::set<MachineBasicBlock *>&); - /// This is work around solution for findNearestCommonDominator not avaiable + /// This is work around solution for findNearestCommonDominator not available /// to post dom a proper fix should go to Dominators.h. MachineBasicBlock *findNearestCommonPostDom(MachineBasicBlock *MBB1, MachineBasicBlock *MBB2); diff --git a/lib/Target/R600/AMDKernelCodeT.h b/lib/Target/R600/AMDKernelCodeT.h new file mode 100644 index 000000000000..4d3041ff3db8 --- /dev/null +++ b/lib/Target/R600/AMDKernelCodeT.h @@ -0,0 +1,704 @@ +//===-- AMDGPUKernelCodeT.h - Print AMDGPU assembly code ---------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +/// \file AMDKernelCodeT.h +//===----------------------------------------------------------------------===// + +#ifndef AMDKERNELCODET_H +#define AMDKERNELCODET_H + +#include <cstddef> +#include <cstdint> + +//---------------------------------------------------------------------------// +// AMD Kernel Code, and its dependencies // +//---------------------------------------------------------------------------// + +typedef uint8_t hsa_powertwo8_t; +typedef uint32_t hsa_ext_code_kind_t; +typedef uint8_t hsa_ext_brig_profile8_t; +typedef uint8_t hsa_ext_brig_machine_model8_t; +typedef uint64_t hsa_ext_control_directive_present64_t; +typedef uint16_t hsa_ext_exception_kind16_t; +typedef uint32_t hsa_ext_code_kind32_t; + +typedef struct hsa_dim3_s { + uint32_t x; + uint32_t y; + uint32_t z; +} hsa_dim3_t; + +/// The version of the amd_*_code_t struct. Minor versions must be +/// backward compatible. +typedef uint32_t amd_code_version32_t; +enum amd_code_version_t { + AMD_CODE_VERSION_MAJOR = 0, + AMD_CODE_VERSION_MINOR = 1 +}; + +/// The values used to define the number of bytes to use for the +/// swizzle element size. +enum amd_element_byte_size_t { + AMD_ELEMENT_2_BYTES = 0, + AMD_ELEMENT_4_BYTES = 1, + AMD_ELEMENT_8_BYTES = 2, + AMD_ELEMENT_16_BYTES = 3 +}; + +/// Shader program settings for CS. Contains COMPUTE_PGM_RSRC1 and +/// COMPUTE_PGM_RSRC2 registers. +typedef uint64_t amd_compute_pgm_resource_register64_t; + +/// Every amd_*_code_t has the following properties, which are composed of +/// a number of bit fields. Every bit field has a mask (AMD_CODE_PROPERTY_*), +/// bit width (AMD_CODE_PROPERTY_*_WIDTH, and bit shift amount +/// (AMD_CODE_PROPERTY_*_SHIFT) for convenient access. Unused bits must be 0. 
+/// +/// (Note that bit fields cannot be used as their layout is +/// implementation defined in the C standard and so cannot be used to +/// specify an ABI) +typedef uint32_t amd_code_property32_t; +enum amd_code_property_mask_t { + + /// Enable the setup of the SGPR user data registers + /// (AMD_CODE_PROPERTY_ENABLE_SGPR_*), see documentation of amd_kernel_code_t + /// for initial register state. + /// + /// The total number of SGPR user data registers requested must not + /// exceed 16. Any requests beyond 16 will be ignored. + /// + /// Used to set COMPUTE_PGM_RSRC2.USER_SGPR (set to total count of + /// SGPR user data registers enabled up to 16). + + AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_SHIFT = 0, + AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_WIDTH = 1, + AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_SHIFT, + + AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_SHIFT = 1, + AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_WIDTH = 1, + AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_SHIFT, + + AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_SHIFT = 2, + AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_WIDTH = 1, + AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_SHIFT, + + AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_SHIFT = 3, + AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_WIDTH = 1, + AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_SHIFT, + + AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_SHIFT = 4, + AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_WIDTH = 1, + AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID = ((1 << 
AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_SHIFT, + + AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_SHIFT = 5, + AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_WIDTH = 1, + AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_SHIFT, + + AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_SHIFT = 6, + AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_WIDTH = 1, + AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_SHIFT, + + AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_SHIFT = 7, + AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_WIDTH = 1, + AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_SHIFT, + + AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_SHIFT = 8, + AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_WIDTH = 1, + AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_SHIFT, + + AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_SHIFT = 9, + AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_WIDTH = 1, + AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_SHIFT, + + /// Control wave ID base counter for GDS ordered-append. Used to set + /// COMPUTE_DISPATCH_INITIATOR.ORDERED_APPEND_ENBL. 
(Not sure if + /// ORDERED_APPEND_MODE also needs to be settable) + AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_SHIFT = 10, + AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_WIDTH = 1, + AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS = ((1 << AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_SHIFT, + + /// The interleave (swizzle) element size in bytes required by the + /// code for private memory. This must be 2, 4, 8 or 16. This value + /// is provided to the finalizer when it is invoked and is recorded + /// here. The hardware will interleave the memory requests of each + /// lane of a wavefront by this element size to ensure each + /// work-item gets a distinct memory location. Therefore, the + /// finalizer ensures that all load and store operations done to + /// private memory do not exceed this size. For example, if the + /// element size is 4 (32-bits or dword) and a 64-bit value must be + /// loaded, the finalizer will generate two 32-bit loads. This + /// ensures that the interleaving will get the work-item + /// specific dword for both halves of the 64-bit value. If it just + /// did a 64-bit load then it would get one dword which belonged to + /// its own work-item, but the second dword would belong to the + /// adjacent lane work-item since the interleaving is in dwords. + /// + /// The value used must match the value that the runtime configures + /// the GPU flat scratch (SH_STATIC_MEM_CONFIG.ELEMENT_SIZE). This + /// is generally DWORD. + /// + /// Use values from the amd_element_byte_size_t enum. + AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_SHIFT = 11, + AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_WIDTH = 2, + AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE = ((1 << AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_WIDTH) - 1) << AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_SHIFT, + + /// Are global memory addresses 64 bits. Must match + /// amd_kernel_code_t.hsail_machine_model == + /// HSA_MACHINE_LARGE. 
Must also match + /// SH_MEM_CONFIG.PTR32 (GFX6 (SI)/GFX7 (CI)), + /// SH_MEM_CONFIG.ADDRESS_MODE (GFX8 (VI)+). + AMD_CODE_PROPERTY_IS_PTR64_SHIFT = 13, + AMD_CODE_PROPERTY_IS_PTR64_WIDTH = 1, + AMD_CODE_PROPERTY_IS_PTR64 = ((1 << AMD_CODE_PROPERTY_IS_PTR64_WIDTH) - 1) << AMD_CODE_PROPERTY_IS_PTR64_SHIFT, + + /// Indicate if the generated ISA is using a dynamically sized call + /// stack. This can happen if calls are implemented using a call + /// stack and recursion, alloca or calls to indirect functions are + /// present. In these cases the Finalizer cannot compute the total + /// private segment size at compile time. In this case the + /// workitem_private_segment_byte_size only specifies the statically + /// known private segment size, and additional space must be added + /// for the call stack. + AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_SHIFT = 14, + AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_WIDTH = 1, + AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK = ((1 << AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_WIDTH) - 1) << AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_SHIFT, + + /// Indicate if code generated has support for debugging. + AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_SHIFT = 15, + AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_WIDTH = 1, + AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED = ((1 << AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_WIDTH) - 1) << AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_SHIFT +}; + +/// @brief The hsa_ext_control_directives_t specifies the values for the HSAIL +/// control directives. These control how the finalizer generates code. This +/// struct is used both as an argument to hsaFinalizeKernel to specify values for +/// the control directives, and is used in HsaKernelCode to record the values of +/// the control directives that the finalizer used when generating the code which +/// either came from the finalizer argument or explicit HSAIL control +/// directives.
See the definition of the control directives in HSA Programmer's +/// Reference Manual which also defines how the values specified as finalizer +/// arguments have to agree with the control directives in the HSAIL code. +typedef struct hsa_ext_control_directives_s { + /// This is a bit set indicating which control directives have been + /// specified. If the value is 0 then there are no control directives specified + /// and the rest of the fields can be ignored. The bits are accessed using the + /// hsa_ext_control_directives_present_mask_t. Any control directive that is not + /// enabled in this bit set must have the value of all 0s. + hsa_ext_control_directive_present64_t enabled_control_directives; + + /// If enableBreakExceptions is not enabled then must be 0, otherwise must be + /// non-0 and specifies the set of HSAIL exceptions that must have the BREAK + /// policy enabled. If this set is not empty then the generated code may have + /// lower performance than if the set is empty. If the kernel being finalized + /// has any enablebreakexceptions control directives, then the values specified + /// by this argument are unioned with the values in these control + /// directives. If any of the functions the kernel calls have an + /// enablebreakexceptions control directive, then they must be equal or a + /// subset of, this union. + hsa_ext_exception_kind16_t enable_break_exceptions; + + /// If enableDetectExceptions is not enabled then must be 0, otherwise must be + /// non-0 and specifies the set of HSAIL exceptions that must have the DETECT + /// policy enabled. If this set is not empty then the generated code may have + /// lower performance than if the set is empty. However, an implementation + /// should endeavour to make the performance impact small. If the kernel being + /// finalized has any enabledetectexceptions control directives, then the + /// values specified by this argument are unioned with the values in these + /// control directives. 
If any of the functions the kernel calls have an + /// enabledetectexceptions control directive, then they must be equal or a + /// subset of, this union. + hsa_ext_exception_kind16_t enable_detect_exceptions; + + /// If maxDynamicGroupSize is not enabled then must be 0, and any amount of + /// dynamic group segment can be allocated for a dispatch, otherwise the value + /// specifies the maximum number of bytes of dynamic group segment that can be + /// allocated for a dispatch. If the kernel being finalized has any + /// maxdynamicsize control directives, then the values must be the same, and + /// must be the same as this argument if it is enabled. This value can be used + /// by the finalizer to determine the maximum number of bytes of group memory + /// used by each work-group by adding this value to the group memory required + /// for all group segment variables used by the kernel and all functions it + /// calls, and group memory used to implement other HSAIL features such as + /// fbarriers and the detect exception operations. This can allow the finalizer + /// to determine the expected number of work-groups that can be executed by a + /// compute unit and allow more resources to be allocated to the work-items if + /// it is known that fewer work-groups can be executed due to group memory + /// limitations. + uint32_t max_dynamic_group_size; + + /// If maxFlatGridSize is not enabled then must be 0, otherwise must be greater + /// than 0. See HSA Programmer's Reference Manual description of + /// maxflatgridsize control directive. + uint32_t max_flat_grid_size; + + /// If maxFlatWorkgroupSize is not enabled then must be 0, otherwise must be + /// greater than 0. See HSA Programmer's Reference Manual description of + /// maxflatworkgroupsize control directive. 
+ uint32_t max_flat_workgroup_size; + + /// If requestedWorkgroupsPerCu is not enabled then must be 0, and the + /// finalizer is free to generate ISA that may result in any number of + /// work-groups executing on a single compute unit. Otherwise, the finalizer + /// should attempt to generate ISA that will allow the specified number of + /// work-groups to execute on a single compute unit. This is only a hint and + /// can be ignored by the finalizer. If the kernel being finalized, or any of + /// the functions it calls, has a requested control directive, then the values + /// must be the same. This can be used to determine the number of resources + /// that should be allocated to a single work-group and work-item. For example, + /// a low value may allow more resources to be allocated, resulting in higher + /// per work-item performance, as it is known there will never be more than the + /// specified number of work-groups actually executing on the compute + /// unit. Conversely, a high value may allocate fewer resources, resulting in + /// lower per work-item performance, which is offset by the fact it allows more + /// work-groups to actually execute on the compute unit. + uint32_t requested_workgroups_per_cu; + + /// If not enabled then all elements for Dim3 must be 0, otherwise every + /// element must be greater than 0. See HSA Programmer's Reference Manual + /// description of requiredgridsize control directive. + hsa_dim3_t required_grid_size; + + /// If requiredWorkgroupSize is not enabled then all elements for Dim3 must be + /// 0, and the produced code can be dispatched with any legal work-group range + /// consistent with the dispatch dimensions. Otherwise, the code produced must + /// always be dispatched with the specified work-group range. No element of the + /// specified range must be 0. It must be consistent with required_dimensions + /// and max_flat_workgroup_size. 
If the kernel being finalized, or any of the + /// functions it calls, has a requiredworkgroupsize control directive, then the + /// values must be the same. Specifying a value can allow the finalizer to + /// optimize work-group id operations, and if the number of work-items in the + /// work-group is less than the WAVESIZE then barrier operations can be + /// optimized to just a memory fence. + hsa_dim3_t required_workgroup_size; + + /// If requiredDim is not enabled then must be 0 and the produced kernel code + /// can be dispatched with 1, 2 or 3 dimensions. If enabled then the value is + /// 1..3 and the code produced must only be dispatched with a dimension that + /// matches. Other values are illegal. If the kernel being finalized, or any of + /// the functions it calls, has a requireddimsize control directive, then the + /// values must be the same. This can be used to optimize the code generated to + /// compute the absolute and flat work-group and work-item id, and the dim + /// HSAIL operations. + uint8_t required_dim; + + /// Reserved. Must be 0. + uint8_t reserved[75]; +} hsa_ext_control_directives_t; + +/// AMD Kernel Code Object (amd_kernel_code_t). GPU CP uses the AMD Kernel +/// Code Object to set up the hardware to execute the kernel dispatch. +/// +/// Initial Kernel Register State. +/// +/// Initial kernel register state will be set up by CP/SPI prior to the start +/// of execution of every wavefront. This is limited by the constraints of the +/// current hardware. +/// +/// The order of the SGPR registers is defined, but the Finalizer can specify +/// which ones are actually setup in the amd_kernel_code_t object using the +/// enable_sgpr_* bit fields. The register numbers used for enabled registers +/// are dense starting at SGPR0: the first enabled register is SGPR0, the next +/// enabled register is SGPR1 etc.; disabled registers do not have an SGPR +/// number. 
+/// +/// The initial SGPRs comprise up to 16 User SGPRs that are set up by CP and +/// apply to all waves of the grid. It is possible to specify more than 16 User +/// SGPRs using the enable_sgpr_* bit fields, in which case only the first 16 +/// are actually initialized. These are then immediately followed by the System +/// SGPRs that are set up by ADC/SPI and can have different values for each wave +/// of the grid dispatch. +/// +/// SGPR register initial state is defined as follows: +/// +/// Private Segment Buffer (enable_sgpr_private_segment_buffer): +/// Number of User SGPR registers: 4. V# that can be used, together with +/// Scratch Wave Offset as an offset, to access the Private/Spill/Arg +/// segments using a segment address. It must be set as follows: +/// - Base address: of the scratch memory area used by the dispatch. It +/// does not include the scratch wave offset. It will be the per process +/// SH_HIDDEN_PRIVATE_BASE_VMID plus any offset from this dispatch (for +/// example there may be a per pipe offset, or per AQL Queue offset). +/// - Stride + data_format: Element Size * Index Stride (???) +/// - Cache swizzle: ??? +/// - Swizzle enable: SH_STATIC_MEM_CONFIG.SWIZZLE_ENABLE (must be 1 for +/// scratch) +/// - Num records: Flat Scratch Work Item Size / Element Size (???) +/// - Dst_sel_*: ??? +/// - Num_format: ??? +/// - Element_size: SH_STATIC_MEM_CONFIG.ELEMENT_SIZE (will be DWORD, must +/// agree with amd_kernel_code_t.privateElementSize) +/// - Index_stride: SH_STATIC_MEM_CONFIG.INDEX_STRIDE (will be 64 as must +/// be number of wavefront lanes for scratch, must agree with +/// amd_kernel_code_t.wavefrontSize) +/// - Add tid enable: 1 +/// - ATC: from SH_MEM_CONFIG.PRIVATE_ATC, +/// - Hash_enable: ??? +/// - Heap: ??? +/// - Mtype: from SH_STATIC_MEM_CONFIG.PRIVATE_MTYPE +/// - Type: 0 (a buffer) (???) +/// +/// Dispatch Ptr (enable_sgpr_dispatch_ptr): +/// Number of User SGPR registers: 2.
64 bit address of AQL dispatch packet +/// for kernel actually executing. +/// +/// Queue Ptr (enable_sgpr_queue_ptr): +/// Number of User SGPR registers: 2. 64 bit address of AmdQueue object for +/// AQL queue on which the dispatch packet was queued. +/// +/// Kernarg Segment Ptr (enable_sgpr_kernarg_segment_ptr): +/// Number of User SGPR registers: 2. 64 bit address of Kernarg segment. This +/// is directly copied from the kernargPtr in the dispatch packet. Having CP +/// load it once avoids loading it at the beginning of every wavefront. +/// +/// Dispatch Id (enable_sgpr_dispatch_id): +/// Number of User SGPR registers: 2. 64 bit Dispatch ID of the dispatch +/// packet being executed. +/// +/// Flat Scratch Init (enable_sgpr_flat_scratch_init): +/// Number of User SGPR registers: 2. This is 2 SGPRs. +/// +/// For CI/VI: +/// The first SGPR is a 32 bit byte offset from SH_MEM_HIDDEN_PRIVATE_BASE +/// to base of memory for scratch for this dispatch. This is the same offset +/// used in computing the Scratch Segment Buffer base address. The value of +/// Scratch Wave Offset must be added by the kernel code and moved to +/// SGPRn-4 for use as the FLAT SCRATCH BASE in flat memory instructions. +/// +/// The second SGPR is 32 bit byte size of a single work-item’s scratch +/// memory usage. This is directly loaded from the dispatch packet Private +/// Segment Byte Size and rounded up to a multiple of DWORD. +/// +/// \todo [Does CP need to round this to >4 byte alignment?] +/// +/// The kernel code must move it to SGPRn-3 for use as the FLAT SCRATCH SIZE in +/// flat memory instructions. Having CP load it once avoids loading it at +/// the beginning of every wavefront. +/// +/// For PI: +/// This is the 64 bit base address of the scratch backing memory +/// allocated by CP for this dispatch. +/// +/// Private Segment Size (enable_sgpr_private_segment_size): +/// Number of User SGPR registers: 1.
The 32 bit byte size of a single +/// work-item’s scratch memory allocation. This is the value from the dispatch +/// packet Private Segment Byte Size rounded up by CP to a multiple of DWORD. +/// +/// \todo [Does CP need to round this to >4 byte alignment?] +/// +/// Having CP load it once avoids loading it at the beginning of every +/// wavefront. +/// +/// \todo [This will not be used for CI/VI since it is the same value as +/// the second SGPR of Flat Scratch Init. However, it is needed for PI which +/// changes the meaning of Flat Scratch Init.] +/// +/// Grid Work-Group Count X (enable_sgpr_grid_workgroup_count_x): +/// Number of User SGPR registers: 1. 32 bit count of the number of +/// work-groups in the X dimension for the grid being executed. Computed from +/// the fields in the HsaDispatchPacket as +/// ((gridSize.x+workgroupSize.x-1)/workgroupSize.x). +/// +/// Grid Work-Group Count Y (enable_sgpr_grid_workgroup_count_y): +/// Number of User SGPR registers: 1. 32 bit count of the number of +/// work-groups in the Y dimension for the grid being executed. Computed from +/// the fields in the HsaDispatchPacket as +/// ((gridSize.y+workgroupSize.y-1)/workgroupSize.y). +/// +/// Only initialized if <16 previous SGPRs initialized. +/// +/// Grid Work-Group Count Z (enable_sgpr_grid_workgroup_count_z): +/// Number of User SGPR registers: 1. 32 bit count of the number of +/// work-groups in the Z dimension for the grid being executed. Computed +/// from the fields in the HsaDispatchPacket as +/// ((gridSize.z+workgroupSize.z-1)/workgroupSize.z). +/// +/// Only initialized if <16 previous SGPRs initialized. +/// +/// Work-Group Id X (enable_sgpr_workgroup_id_x): +/// Number of System SGPR registers: 1. 32 bit work group id in X dimension +/// of grid for wavefront. Always present. +/// +/// Work-Group Id Y (enable_sgpr_workgroup_id_y): +/// Number of System SGPR registers: 1. 32 bit work group id in Y dimension +/// of grid for wavefront.
+/// +/// Work-Group Id Z (enable_sgpr_workgroup_id_z): +/// Number of System SGPR registers: 1. 32 bit work group id in Z dimension +/// of grid for wavefront. If present then Work-group Id Y will also be +/// present. +/// +/// Work-Group Info (enable_sgpr_workgroup_info): +/// Number of System SGPR registers: 1. {first_wave, 14’b0000, +/// ordered_append_term[10:0], threadgroup_size_in_waves[5:0]} +/// +/// Private Segment Wave Byte Offset +/// (enable_sgpr_private_segment_wave_byte_offset): +/// Number of System SGPR registers: 1. 32 bit byte offset from base of +/// dispatch scratch base. Must be used as an offset with Private/Spill/Arg +/// segment address when using Scratch Segment Buffer. It must be added to +/// Flat Scratch Offset if setting up FLAT SCRATCH for flat addressing. +/// +/// +/// The order of the VGPR registers is defined, but the Finalizer can specify +/// which ones are actually setup in the amd_kernel_code_t object using the +/// enable_vgpr_* bit fields. The register numbers used for enabled registers +/// are dense starting at VGPR0: the first enabled register is VGPR0, the next +/// enabled register is VGPR1 etc.; disabled registers do not have a VGPR +/// number. +/// +/// VGPR register initial state is defined as follows: +/// +/// Work-Item Id X (always initialized): +/// Number of registers: 1. 32 bit work item id in X dimension of work-group +/// for wavefront lane. +/// +/// Work-Item Id Y (enable_vgpr_workitem_id > 0): +/// Number of registers: 1. 32 bit work item id in Y dimension of work-group +/// for wavefront lane. +/// +/// Work-Item Id Z (enable_vgpr_workitem_id > 0): +/// Number of registers: 1. 32 bit work item id in Z dimension of work-group +/// for wavefront lane. +/// +/// +/// The setting of registers is being done by existing GPU hardware as follows: +/// 1) SGPRs before the Work-Group Ids are set by CP using the 16 User Data +/// registers.
+/// 2) Work-group Id registers X, Y, Z are set by SPI which supports any +/// combination including none. +/// 3) Scratch Wave Offset is also set by SPI which is why its value cannot +/// be added into the value Flat Scratch Offset which would avoid the +/// Finalizer generated prolog having to do the add. +/// 4) The VGPRs are set by SPI which only supports specifying either (X), +/// (X, Y) or (X, Y, Z). +/// +/// Flat Scratch Dispatch Offset and Flat Scratch Size are adjacent SGPRs so +/// they can be moved as a 64 bit value to the hardware required SGPRn-3 and +/// SGPRn-4 respectively using the Finalizer ?FLAT_SCRATCH? Register. +/// +/// The global segment can be accessed either using flat operations or buffer +/// operations. If buffer operations are used then the Global Buffer used to +/// access HSAIL Global/Readonly/Kernarg (which are combined) segments using a +/// segment address is not passed into the kernel code by CP since its base +/// address is always 0. Instead the Finalizer generates prolog code to +/// initialize 4 SGPRs with a V# that has the following properties, and then +/// uses that in the buffer instructions: +/// - base address of 0 +/// - no swizzle +/// - ATC=1 +/// - MTYPE set to support memory coherence specified in +/// amd_kernel_code_t.globalMemoryCoherence +/// +/// When the Global Buffer is used to access the Kernarg segment, must add the +/// dispatch packet kernArgPtr to a kernarg segment address before using this V#. +/// Alternatively scalar loads can be used if the kernarg offset is uniform, as +/// the kernarg segment is constant for the duration of the kernel execution. +/// +typedef struct amd_kernel_code_s { + /// The AMD major version of the Code Object. Must be the value + /// AMD_CODE_VERSION_MAJOR. + amd_code_version32_t amd_code_version_major; + + /// The AMD minor version of the Code Object. Minor versions must be + /// backward compatible. Must be the value + /// AMD_CODE_VERSION_MINOR.
+ amd_code_version32_t amd_code_version_minor; + + /// The byte size of this struct. Must be set to + /// sizeof(amd_kernel_code_t). Used for backward + /// compatibility. + uint32_t struct_byte_size; + + /// The target chip instruction set for which code has been + /// generated. Values are from the E_SC_INSTRUCTION_SET enumeration + /// in sc/Interface/SCCommon.h. + uint32_t target_chip; + + /// Byte offset (possibly negative) from start of amd_kernel_code_t + /// object to kernel's entry point instruction. The actual code for + /// the kernel is required to be 256 byte aligned to match hardware + /// requirements (SQ cache line is 16). The code must be position + /// independent code (PIC) for AMD devices to give runtime the + /// option of copying code to discrete GPU memory or APU L2 + /// cache. The Finalizer should endeavour to allocate all kernel + /// machine code in contiguous memory pages so that a device + /// pre-fetcher will tend to only pre-fetch Kernel Code objects, + /// improving cache performance. + int64_t kernel_code_entry_byte_offset; + + /// Range of bytes to consider prefetching expressed as an offset + /// and size. The offset is from the start (possibly negative) of + /// amd_kernel_code_t object. Set both to 0 if no prefetch + /// information is available. + /// + /// \todo ttye 11/15/2013 Is the prefetch definition we want? Did + /// not make the size a uint64_t as prefetching more than 4GiB seems + /// excessive. + int64_t kernel_code_prefetch_byte_offset; + uint64_t kernel_code_prefetch_byte_size; + + /// Number of bytes of scratch backing memory required for full + /// occupancy of target chip. This takes into account the number of + /// bytes of scratch per work-item, the wavefront size, the maximum + /// number of wavefronts per CU, and the number of CUs. This is an + /// upper limit on scratch. If the grid being dispatched is small it + /// may only need less than this. 
If the kernel uses no scratch, or + /// the Finalizer has not computed this value, it must be 0. + uint64_t max_scratch_backing_memory_byte_size; + + /// Shader program settings for CS. Contains COMPUTE_PGM_RSRC1 and + /// COMPUTE_PGM_RSRC2 registers. + amd_compute_pgm_resource_register64_t compute_pgm_resource_registers; + + /// Code properties. See amd_code_property_mask_t for a full list of + /// properties. + amd_code_property32_t code_properties; + + /// The amount of memory required for the combined private, spill + /// and arg segments for a work-item in bytes. If + /// is_dynamic_callstack is 1 then additional space must be added to + /// this value for the call stack. + uint32_t workitem_private_segment_byte_size; + + /// The amount of group segment memory required by a work-group in + /// bytes. This does not include any dynamically allocated group + /// segment memory that may be added when the kernel is + /// dispatched. + uint32_t workgroup_group_segment_byte_size; + + /// Number of bytes of GDS required by kernel dispatch. Must be 0 if + /// not using GDS. + uint32_t gds_segment_byte_size; + + /// The size in bytes of the kernarg segment that holds the values + /// of the arguments to the kernel. This could be used by CP to + /// prefetch the kernarg segment pointed to by the dispatch packet. + uint64_t kernarg_segment_byte_size; + + /// Number of fbarriers used in the kernel and all functions it + /// calls. If the implementation uses group memory to allocate the + /// fbarriers then that amount must already be included in the + /// workgroup_group_segment_byte_size total. + uint32_t workgroup_fbarrier_count; + + /// Number of scalar registers used by a wavefront. This includes + /// the special SGPRs for VCC, Flat Scratch Base, Flat Scratch Size + /// and XNACK (for GFX8 (VI)). It does not include the 16 SGPRs added if a + /// trap handler is enabled. Used to set COMPUTE_PGM_RSRC1.SGPRS.
+ uint16_t wavefront_sgpr_count; + + /// Number of vector registers used by each work-item. Used to set + /// COMPUTE_PGM_RSRC1.VGPRS. + uint16_t workitem_vgpr_count; + + /// If reserved_vgpr_count is 0 then must be 0. Otherwise, this is the + /// first fixed VGPR number reserved. + uint16_t reserved_vgpr_first; + + /// The number of consecutive VGPRs reserved by the client. If + /// is_debug_supported then this count includes VGPRs reserved + /// for debugger use. + uint16_t reserved_vgpr_count; + + /// If reserved_sgpr_count is 0 then must be 0. Otherwise, this is the + /// first fixed SGPR number reserved. + uint16_t reserved_sgpr_first; + + /// The number of consecutive SGPRs reserved by the client. If + /// is_debug_supported then this count includes SGPRs reserved + /// for debugger use. + uint16_t reserved_sgpr_count; + + /// If is_debug_supported is 0 then must be 0. Otherwise, this is the + /// fixed SGPR number used to hold the wave scratch offset for the + /// entire kernel execution, or uint16_t(-1) if the register is not + /// used or not known. + uint16_t debug_wavefront_private_segment_offset_sgpr; + + /// If is_debug_supported is 0 then must be 0. Otherwise, this is the + /// fixed SGPR number of the first of 4 SGPRs used to hold the + /// scratch V# used for the entire kernel execution, or uint16_t(-1) + /// if the registers are not used or not known. + uint16_t debug_private_segment_buffer_sgpr; + + /// The maximum byte alignment of variables used by the kernel in + /// the specified memory segment. Expressed as a power of two. Must + /// be at least HSA_POWERTWO_16. + hsa_powertwo8_t kernarg_segment_alignment; + hsa_powertwo8_t group_segment_alignment; + hsa_powertwo8_t private_segment_alignment; + + uint8_t reserved3; + + /// Type of code object. + hsa_ext_code_kind32_t code_type; + + /// Reserved for code properties if any are defined in the future. + /// There are currently no code properties so this field must be 0. 
+ uint32_t reserved4; + + /// Wavefront size expressed as a power of two. Must be a power of 2 + /// in range 1..64 inclusive. Used to support runtime query that + /// obtains wavefront size, which may be used by application to + /// allocate dynamic group memory and set the dispatch work-group + /// size. + hsa_powertwo8_t wavefront_size; + + /// The optimization level specified when the kernel was + /// finalized. + uint8_t optimization_level; + + /// The HSAIL profile defines which features are used. This + /// information is from the HSAIL version directive. If this + /// amd_kernel_code_t is not generated from an HSAIL compilation + /// unit then must be 0. + hsa_ext_brig_profile8_t hsail_profile; + + /// The HSAIL machine model gives the address sizes used by the + /// code. This information is from the HSAIL version directive. If + /// not generated from an HSAIL compilation unit then must still + /// indicate for what machine model the code is generated. + hsa_ext_brig_machine_model8_t hsail_machine_model; + + /// The HSAIL major version. This information is from the HSAIL + /// version directive. If this amd_kernel_code_t is not + /// generated from an HSAIL compilation unit then must be 0. + uint32_t hsail_version_major; + + /// The HSAIL minor version. This information is from the HSAIL + /// version directive. If this amd_kernel_code_t is not + /// generated from an HSAIL compilation unit then must be 0. + uint32_t hsail_version_minor; + + /// Reserved for HSAIL target options if any are defined in the + /// future. There are currently no target options so this field + /// must be 0. + uint16_t reserved5; + + /// Reserved. Must be 0. + uint16_t reserved6; + + /// The values should be the actual values used by the finalizer + /// in generating the code. This may be the union of values + /// specified as finalizer arguments and explicit HSAIL control + /// directives.
If the finalizer chooses to ignore a control + /// directive, and not generate constrained code, then the control + /// directive should not be marked as enabled even though it was + /// present in the HSAIL or finalizer argument. The values are + /// intended to reflect the constraints that the code actually + /// requires to correctly execute, not the values that were + /// actually specified at finalize time. + hsa_ext_control_directives_t control_directive; + + /// The code can immediately follow the amd_kernel_code_t, or can + /// come after subsequent amd_kernel_code_t structs when there are + /// multiple kernels in the compilation unit. + +} amd_kernel_code_t; + +#endif // AMDKERNELCODET_H diff --git a/lib/Target/R600/AsmParser/AMDGPUAsmParser.cpp b/lib/Target/R600/AsmParser/AMDGPUAsmParser.cpp new file mode 100644 index 000000000000..3b4ba1a8e8e9 --- /dev/null +++ b/lib/Target/R600/AsmParser/AMDGPUAsmParser.cpp @@ -0,0 +1,320 @@ +//===-- AMDGPUAsmParser.cpp - Parse SI asm to MCInst instructions ----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCParser/MCAsmLexer.h"
+#include "llvm/MC/MCParser/MCAsmParser.h"
+#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCTargetAsmParser.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+namespace {
+
+/// Target assembly parser for AMDGPU. It connects the matcher declared in
+/// AMDGPUGenAsmMatcher.inc to the MCAsmParser it is constructed with.
+///
+/// NOTE(review): ParseRegister and ParseDirective below return true
+/// unconditionally, and ParseInstruction parses at most one operand after the
+/// mnemonic.
+class AMDGPUAsmParser : public MCTargetAsmParser {
+  MCSubtargetInfo &STI;
+  MCAsmParser &Parser;
+
+
+  /// @name Auto-generated Match Functions
+  /// {
+
+#define GET_ASSEMBLER_HEADER
+#include "AMDGPUGenAsmMatcher.inc"
+
+  /// }
+
+public:
+  /// Stores \p _STI and \p _Parser and derives the available feature set from
+  /// the subtarget feature bits. \p _MII and \p Options are not used here.
+  AMDGPUAsmParser(MCSubtargetInfo &_STI, MCAsmParser &_Parser,
+               const MCInstrInfo &_MII,
+               const MCTargetOptions &Options)
+      : MCTargetAsmParser(), STI(_STI), Parser(_Parser) {
+    setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
+  }
+  bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override;
+  bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+                               OperandVector &Operands, MCStreamer &Out,
+                               uint64_t &ErrorInfo,
+                               bool MatchingInlineAsm) override;
+  bool ParseDirective(AsmToken DirectiveID) override;
+  OperandMatchResultTy parseOperand(OperandVector &Operands, StringRef Mnemonic);
+  bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
+                        SMLoc NameLoc, OperandVector &Operands) override;
+
+  bool parseCnt(int64_t &IntVal);
+  OperandMatchResultTy parseSWaitCntOps(OperandVector &Operands);
+};
+
+/// A parsed operand: either a token (pointer and length into the source text)
+/// or a 64-bit immediate. Register and memory operands are not represented;
+/// isReg() and isMem() always return false.
+class AMDGPUOperand : public MCParsedAsmOperand {
+  enum KindTy {
+    Token,
+    Immediate
+  } Kind;
+
+public:
+  AMDGPUOperand(enum KindTy K) : MCParsedAsmOperand(), Kind(K) {}
+
+  struct TokOp {
+    const char *Data;
+    unsigned Length;
+  };
+
+  struct ImmOp {
+    int64_t Val;
+  };
+
+  // Payload selected by Kind: Tok for Token, Imm for Immediate.
+  union {
+    TokOp Tok;
+    ImmOp Imm;
+  };
+
+  /// Appends the immediate value to \p Inst. \p N is not used.
+  void addImmOperands(MCInst &Inst, unsigned N) const {
+    Inst.addOperand(MCOperand::CreateImm(getImm()));
+  }
+  /// Register operands are never produced by this parser, so reaching this is
+  /// treated as unreachable.
+  void addRegOperands(MCInst &Inst, unsigned N) const {
+    llvm_unreachable("addRegOperands");
+  }
+  StringRef getToken() const {
+    return StringRef(Tok.Data, Tok.Length);
+  }
+  bool isToken() const override {
+    return Kind == Token;
+  }
+
+  bool isImm() const override {
+    return Kind == Immediate;
+  }
+
+  int64_t getImm() const {
+    return Imm.Val;
+  }
+
+  bool isReg() const override {
+    return false;
+  }
+
+  unsigned getReg() const override {
+    return 0;
+  }
+
+  bool isMem() const override {
+    return false;
+  }
+
+  // NOTE(review): no source location is stored in the operand, so both
+  // location accessors return a default-constructed SMLoc.
+  SMLoc getStartLoc() const override {
+    return SMLoc();
+  }
+
+  SMLoc getEndLoc() const override {
+    return SMLoc();
+  }
+
+  void print(raw_ostream &OS) const override { }
+
+  /// Creates an Immediate operand holding \p Val.
+  static std::unique_ptr<AMDGPUOperand> CreateImm(int64_t Val) {
+    auto Op = llvm::make_unique<AMDGPUOperand>(Immediate);
+    Op->Imm.Val = Val;
+    return Op;
+  }
+
+  /// Creates a Token operand that refers to \p Str without copying it; the
+  /// operand keeps the pointer returned by Str.data().
+  /// NOTE(review): \p Loc is accepted but not stored.
+  static std::unique_ptr<AMDGPUOperand> CreateToken(StringRef Str, SMLoc Loc) {
+    auto Res = llvm::make_unique<AMDGPUOperand>(Token);
+    Res->Tok.Data = Str.data();
+    Res->Tok.Length = Str.size();
+    return Res;
+  }
+
+  bool isSWaitCnt() const;
+};
+
+}
+
+/// Always returns true and leaves \p RegNo, \p StartLoc and \p EndLoc
+/// untouched.
+/// NOTE(review): presumably true means "no register parsed" under the MC
+/// parser convention - confirm.
+bool AMDGPUAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) {
+  return true;
+}
+
+
+/// Runs MatchInstructionImpl over \p Operands. On Match_Success the instruction
+/// gets \p IDLoc as its location, is emitted to \p Out, and false is returned.
+/// Every other match result is reported through Error() at \p IDLoc and the
+/// value of that call is returned. \p Opcode is not written here.
+bool AMDGPUAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+                                              OperandVector &Operands,
+                                              MCStreamer &Out,
+                                              uint64_t &ErrorInfo,
+                                              bool MatchingInlineAsm) {
+  MCInst Inst;
+
+  switch (MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm)) {
+    case Match_Success:
+      Inst.setLoc(IDLoc);
+      Out.EmitInstruction(Inst, STI);
+      return false;
+    case Match_MissingFeature:
+      return Error(IDLoc, "instruction use requires an option to be enabled");
+    case Match_MnemonicFail:
+      return Error(IDLoc, "unrecognized instruction mnemonic");
+    case Match_InvalidOperand: {
+      // ErrorInfo is compared against the operand count: an index past the
+      // end is reported as a missing operand.
+      if (ErrorInfo != ~0ULL) {
+        if (ErrorInfo >= Operands.size())
+          return Error(IDLoc, "too few operands for instruction");
+
+      }
+      return Error(IDLoc, "invalid operand for instruction");
+    }
+  }
+  llvm_unreachable("Implement any new match types added!");
+}
+
+/// Always returns true; \p DirectiveID is not inspected.
+/// NOTE(review): presumably true means "directive not handled here" -
+/// confirm against the MCTargetAsmParser contract.
+bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) {
+  return true;
+}
+
+/// Parses a single operand for \p Mnemonic and appends it to \p Operands.
+///
+/// MatchOperandParserImpl is tried first. If it reports neither success nor a
+/// parse failure, an Integer token is parsed as an absolute expression and
+/// pushed as an immediate; any other token kind yields MatchOperand_NoMatch.
+AMDGPUAsmParser::OperandMatchResultTy
+AMDGPUAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) {
+
+  // Try to parse with a custom parser
+  OperandMatchResultTy ResTy = MatchOperandParserImpl(Operands, Mnemonic);
+
+  // If we successfully parsed the operand or if there was an error parsing,
+  // we are done.
+  if (ResTy == MatchOperand_Success || ResTy == MatchOperand_ParseFail)
+    return ResTy;
+
+  switch(getLexer().getKind()) {
+    case AsmToken::Integer: {
+      int64_t IntVal;
+      if (getParser().parseAbsoluteExpression(IntVal))
+        return MatchOperand_ParseFail;
+      Operands.push_back(AMDGPUOperand::CreateImm(IntVal));
+      return MatchOperand_Success;
+    }
+    default:
+      return MatchOperand_NoMatch;
+  }
+}
+
+/// Pushes the mnemonic \p Name as a token operand and, unless the statement
+/// ends right after it, parses one operand with parseOperand().
+///
+/// Returns false when the statement is just the mnemonic or the operand parsed
+/// successfully; otherwise returns the result of Error() at \p NameLoc.
+/// \p Info is not used.
+bool AMDGPUAsmParser::ParseInstruction(ParseInstructionInfo &Info,
+                                       StringRef Name,
+                                       SMLoc NameLoc, OperandVector &Operands) {
+  // Add the instruction mnemonic
+  Operands.push_back(AMDGPUOperand::CreateToken(Name, NameLoc));
+
+  if (getLexer().is(AsmToken::EndOfStatement))
+    return false;
+
+  AMDGPUAsmParser::OperandMatchResultTy Res = parseOperand(Operands, Name);
+  switch (Res) {
+    case MatchOperand_Success: return false;
+    case MatchOperand_ParseFail: return Error(NameLoc,
+                                              "Failed parsing operand");
+    case MatchOperand_NoMatch: return Error(NameLoc, "Not a valid operand");
+  }
+  return true;
+}
+
+//===----------------------------------------------------------------------===//
+// s_waitcnt
+//===----------------------------------------------------------------------===//
+
+/// Parse one "<counter>(<value>)" term of an s_waitcnt operand and merge it
+/// into \p IntVal.
+///
+/// On entry the current token is the counter name; "vmcnt", "expcnt" and
+/// "lgkmcnt" are recognized. A single '&' or ',' following the closing
+/// parenthesis is consumed as a separator.
+///
+/// \param IntVal the immediate being assembled; only the named counter's bits
+///        are replaced.
+/// \returns false on success; true if the term is malformed, names an unknown
+///          counter, or carries a value that does not fit the counter's field.
+bool AMDGPUAsmParser::parseCnt(int64_t &IntVal) {
+  StringRef CntName = Parser.getTok().getString();
+  int64_t CntVal;
+
+  Parser.Lex();
+  if (getLexer().isNot(AsmToken::LParen))
+    return true;
+
+  Parser.Lex();
+  if (getLexer().isNot(AsmToken::Integer))
+    return true;
+
+  if (getParser().parseAbsoluteExpression(CntVal))
+    return true;
+
+  if (getLexer().isNot(AsmToken::RParen))
+    return true;
+
+  Parser.Lex();
+  if (getLexer().is(AsmToken::Amp) || getLexer().is(AsmToken::Comma))
+    Parser.Lex();
+
+  int CntShift;
+  int CntMask;
+
+  if (CntName == "vmcnt") {
+    CntMask = 0xf;
+    CntShift = 0;
+  } else if (CntName == "expcnt") {
+    CntMask = 0x7;
+    CntShift = 4;
+  } else if (CntName == "lgkmcnt") {
+    CntMask = 0x7;
+    CntShift = 8;
+  } else {
+    return true;
+  }
+
+  // Reject values that do not fit the field. Shifting them in unchecked would
+  // set bits outside this counter's field, e.g. vmcnt(16) would land in the
+  // expcnt bits, and a negative value would set every higher bit.
+  // NOTE(review): the field widths are the ones this function already used;
+  // printWaitFlag extracts lgkmcnt with a 4-bit mask, so the 3-bit lgkmcnt
+  // width should be confirmed against the ISA manual.
+  if (CntVal < 0 || CntVal > CntMask)
+    return true;
+
+  IntVal &= ~(CntMask << CntShift);
+  IntVal |= (CntVal << CntShift);
+  return false;
+}
+
+/// Parse the operand of s_waitcnt and push it onto \p Operands as a single
+/// immediate.
+///
+/// The operand is either an integer expression, taken as the raw immediate,
+/// or a sequence of counter terms handled by parseCnt() until the end of the
+/// statement. Counters that are not named keep their value from 0x77f.
+///
+/// \returns MatchOperand_Success, or MatchOperand_ParseFail if the first token
+///          is neither an integer nor an identifier or any term fails to parse.
+AMDGPUAsmParser::OperandMatchResultTy
+AMDGPUAsmParser::parseSWaitCntOps(OperandVector &Operands) {
+  // Disable all counters by default.
+  // vmcnt [3:0]
+  // expcnt [6:4]
+  // lgkmcnt [10:8]
+  int64_t CntVal = 0x77f;
+
+  switch(getLexer().getKind()) {
+    default: return MatchOperand_ParseFail;
+    case AsmToken::Integer:
+      // The operand can be an integer value.
+      if (getParser().parseAbsoluteExpression(CntVal))
+        return MatchOperand_ParseFail;
+      break;
+
+    case AsmToken::Identifier:
+      do {
+        if (parseCnt(CntVal))
+          return MatchOperand_ParseFail;
+      } while(getLexer().isNot(AsmToken::EndOfStatement));
+      break;
+  }
+  Operands.push_back(AMDGPUOperand::CreateImm(CntVal));
+  return MatchOperand_Success;
+}
+
+/// Any immediate operand is accepted as an s_waitcnt operand.
+bool AMDGPUOperand::isSWaitCnt() const {
+  return isImm();
+}
+
+/// Force static initialization.
+extern "C" void LLVMInitializeR600AsmParser() {
+  // Register the same parser class for both target entries.
+  RegisterMCAsmParser<AMDGPUAsmParser> A(TheAMDGPUTarget);
+  RegisterMCAsmParser<AMDGPUAsmParser> B(TheGCNTarget);
+}
+
+// Pull in the register-matcher and matcher-implementation sections of
+// AMDGPUGenAsmMatcher.inc.
+#define GET_REGISTER_MATCHER
+#define GET_MATCHER_IMPLEMENTATION
+#include "AMDGPUGenAsmMatcher.inc"
+
diff --git a/lib/Target/R600/AsmParser/CMakeLists.txt b/lib/Target/R600/AsmParser/CMakeLists.txt
new file mode 100644
index 000000000000..1b42af73740e
--- /dev/null
+++ b/lib/Target/R600/AsmParser/CMakeLists.txt
@@ -0,0 +1,3 @@
+add_llvm_library(LLVMR600AsmParser
+  AMDGPUAsmParser.cpp
+  )
diff --git a/lib/Target/R600/AsmParser/LLVMBuild.txt b/lib/Target/R600/AsmParser/LLVMBuild.txt
new file mode 100644
index 000000000000..940e4cee6dfd
--- /dev/null
+++ b/lib/Target/R600/AsmParser/LLVMBuild.txt
@@ -0,0 +1,23 @@
+;===- ./lib/Target/R600/AsmParser/LLVMBuild.txt -------------*- Conf -*--===;
+;
+; The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+; http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Library
+name = R600AsmParser
+parent = R600
+required_libraries = MC MCParser R600Desc R600Info Support
+add_to_library_groups = R600
diff --git a/lib/Target/R600/AsmParser/Makefile b/lib/Target/R600/AsmParser/Makefile
new file mode 100644
index 000000000000..e6689b54b6ba
--- /dev/null
+++ b/lib/Target/R600/AsmParser/Makefile
@@ -0,0 +1,15 @@
+##===- lib/Target/R600/AsmParser/Makefile ----------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+LEVEL = ../../../..
+LIBRARYNAME = LLVMR600AsmParser
+
+# Hack: we need to include 'main' R600 target directory to grab private headers
+CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/R600/CIInstructions.td b/lib/Target/R600/CIInstructions.td
new file mode 100644
index 000000000000..fdb58bbebd54
--- /dev/null
+++ b/lib/Target/R600/CIInstructions.td
@@ -0,0 +1,42 @@
+//===-- CIInstructions.td - CI Instruction Definitions --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// Instruction definitions for CI and newer.
+//===----------------------------------------------------------------------===//
+
+
+// Holds when the subtarget generation is SEA_ISLANDS or VOLCANIC_ISLANDS.
+def isCIVI : Predicate <
+  "Subtarget.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS || "
+  "Subtarget.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS"
+>;
+
+//===----------------------------------------------------------------------===//
+// VOP1 Instructions
+//===----------------------------------------------------------------------===//
+
+let SubtargetPredicate = isCIVI in {
+
+defm V_TRUNC_F64 : VOP1Inst <vop1<0x17>, "v_trunc_f64",
+  VOP_F64_F64, ftrunc
+>;
+defm V_CEIL_F64 : VOP1Inst <vop1<0x18>, "v_ceil_f64",
+  VOP_F64_F64, fceil
+>;
+defm V_FLOOR_F64 : VOP1Inst <vop1<0x1A>, "v_floor_f64",
+  VOP_F64_F64, ffloor
+>;
+defm V_RNDNE_F64 : VOP1Inst <vop1<0x19>, "v_rndne_f64",
+  VOP_F64_F64, frint
+>;
+// The two legacy ops below are defined without a selection pattern operand.
+defm V_LOG_LEGACY_F32 : VOP1Inst <vop1<0x45, 0x4c>, "v_log_legacy_f32",
+  VOP_F32_F32
+>;
+defm V_EXP_LEGACY_F32 : VOP1Inst <vop1<0x46, 0x4b>, "v_exp_legacy_f32",
+  VOP_F32_F32
+>;
+} // End SubtargetPredicate = isCIVI
diff --git a/lib/Target/R600/CMakeLists.txt
b/lib/Target/R600/CMakeLists.txt index 49a7f8aa18c8..5a4bae2f93cc 100644 --- a/lib/Target/R600/CMakeLists.txt +++ b/lib/Target/R600/CMakeLists.txt @@ -6,13 +6,15 @@ tablegen(LLVM AMDGPUGenDAGISel.inc -gen-dag-isel) tablegen(LLVM AMDGPUGenCallingConv.inc -gen-callingconv) tablegen(LLVM AMDGPUGenSubtargetInfo.inc -gen-subtarget) tablegen(LLVM AMDGPUGenIntrinsics.inc -gen-tgt-intrinsic) -tablegen(LLVM AMDGPUGenMCCodeEmitter.inc -gen-emitter -mc-emitter) +tablegen(LLVM AMDGPUGenMCCodeEmitter.inc -gen-emitter) tablegen(LLVM AMDGPUGenDFAPacketizer.inc -gen-dfa-packetizer) tablegen(LLVM AMDGPUGenAsmWriter.inc -gen-asm-writer) +tablegen(LLVM AMDGPUGenAsmMatcher.inc -gen-asm-matcher) add_public_tablegen_target(AMDGPUCommonTableGen) add_llvm_target(R600CodeGen AMDILCFGStructurizer.cpp + AMDGPUAlwaysInlinePass.cpp AMDGPUAsmPrinter.cpp AMDGPUFrameLowering.cpp AMDGPUIntrinsicInfo.cpp @@ -41,17 +43,21 @@ add_llvm_target(R600CodeGen SIAnnotateControlFlow.cpp SIFixSGPRCopies.cpp SIFixSGPRLiveRanges.cpp + SIFoldOperands.cpp SIInsertWaits.cpp SIInstrInfo.cpp SIISelLowering.cpp + SILoadStoreOptimizer.cpp SILowerControlFlow.cpp SILowerI1Copies.cpp SIMachineFunctionInfo.cpp + SIPrepareScratchRegs.cpp SIRegisterInfo.cpp SIShrinkInstructions.cpp SITypeRewriter.cpp ) +add_subdirectory(AsmParser) add_subdirectory(InstPrinter) add_subdirectory(TargetInfo) add_subdirectory(MCTargetDesc) diff --git a/lib/Target/R600/CaymanInstructions.td b/lib/Target/R600/CaymanInstructions.td index 26303452c101..58b5ce24b4a3 100644 --- a/lib/Target/R600/CaymanInstructions.td +++ b/lib/Target/R600/CaymanInstructions.td @@ -46,6 +46,8 @@ def SIN_cm : SIN_Common<0x8D>; def COS_cm : COS_Common<0x8E>; } // End isVector = 1 +defm : RsqPat<RECIPSQRT_IEEE_cm, f32>; + def : POW_Common <LOG_IEEE_cm, EXP_IEEE_cm, MUL>; defm DIV_cm : DIV_Common<RECIP_IEEE_cm>; diff --git a/lib/Target/R600/EvergreenInstructions.td b/lib/Target/R600/EvergreenInstructions.td index 484e52250d1b..f24f76b7fe16 100644 --- 
a/lib/Target/R600/EvergreenInstructions.td +++ b/lib/Target/R600/EvergreenInstructions.td @@ -69,6 +69,7 @@ def EXP_IEEE_eg : EXP_IEEE_Common<0x81>; def LOG_IEEE_eg : LOG_IEEE_Common<0x83>; def RECIP_CLAMPED_eg : RECIP_CLAMPED_Common<0x84>; def RECIPSQRT_IEEE_eg : RECIPSQRT_IEEE_Common<0x89>; +defm : RsqPat<RECIPSQRT_IEEE_eg, f32>; def SIN_eg : SIN_Common<0x8D>; def COS_eg : COS_Common<0x8E>; @@ -256,6 +257,12 @@ def VTX_READ_GLOBAL_128_eg : VTX_READ_128_eg <1, let Predicates = [isEGorCayman] in { +// Should be predicated on FeatureFP64 +// def FMA_64 : R600_3OP < +// 0xA, "FMA_64", +// [(set f64:$dst, (fma f64:$src0, f64:$src1, f64:$src2))] +// >; + // BFE_UINT - bit_extract, an optimization for mask and shift // Src0 = Input // Src1 = Offset @@ -295,7 +302,7 @@ def : Pat<(i32 (sext_inreg i32:$src, i8)), def : Pat<(i32 (sext_inreg i32:$src, i16)), (BFE_INT_eg i32:$src, (i32 ZERO), (MOV_IMM_I32 16))>; -defm : BFIPatterns <BFI_INT_eg, MOV_IMM_I32>; +defm : BFIPatterns <BFI_INT_eg, MOV_IMM_I32, R600_Reg64>; def BFM_INT_eg : R600_2OP <0xA0, "BFM_INT", [(set i32:$dst, (AMDGPUbfm i32:$src0, i32:$src1))], @@ -312,6 +319,7 @@ def BIT_ALIGN_INT_eg : R600_3OP <0xC, "BIT_ALIGN_INT", [], VecALU>; def : ROTRPattern <BIT_ALIGN_INT_eg>; def MULADD_eg : MULADD_Common<0x14>; def MULADD_IEEE_eg : MULADD_IEEE_Common<0x18>; +def FMA_eg : FMA_Common<0x7>; def ASHR_eg : ASHR_Common<0x15>; def LSHR_eg : LSHR_Common<0x16>; def LSHL_eg : LSHL_Common<0x17>; @@ -466,21 +474,47 @@ class R600_LDS_1A1D_RET <bits<6> lds_op, string name, list<dag> pattern> : let DisableEncoding = "$dst"; } -class R600_LDS_1A2D <bits<6> lds_op, string name, list<dag> pattern> : +class R600_LDS_1A2D <bits<6> lds_op, dag outs, string name, list<dag> pattern, + string dst =""> : R600_LDS < - lds_op, - (outs), + lds_op, outs, (ins R600_Reg32:$src0, REL:$src0_rel, SEL:$src0_sel, R600_Reg32:$src1, REL:$src1_rel, SEL:$src1_sel, R600_Reg32:$src2, REL:$src2_rel, SEL:$src2_sel, LAST:$last, R600_Pred:$pred_sel, 
BANK_SWIZZLE:$bank_swizzle), - " "#name# "$last $src0$src0_rel, $src1$src1_rel, $src2$src2_rel, $pred_sel", + " "#name# "$last "#dst#"$src0$src0_rel, $src1$src1_rel, $src2$src2_rel, $pred_sel", pattern> { + + field string BaseOp; + + let LDS_1A1D = 0; let LDS_1A2D = 1; } +class R600_LDS_1A2D_NORET <bits<6> lds_op, string name, list<dag> pattern> : + R600_LDS_1A2D <lds_op, (outs), name, pattern> { + let BaseOp = name; +} + +class R600_LDS_1A2D_RET <bits<6> lds_op, string name, list<dag> pattern> : + R600_LDS_1A2D <lds_op, (outs R600_Reg32:$dst), name, pattern> { + + let BaseOp = name; + let usesCustomInserter = 1; + let DisableEncoding = "$dst"; +} + def LDS_ADD : R600_LDS_1A1D_NORET <0x0, "LDS_ADD", [] >; def LDS_SUB : R600_LDS_1A1D_NORET <0x1, "LDS_SUB", [] >; +def LDS_AND : R600_LDS_1A1D_NORET <0x9, "LDS_AND", [] >; +def LDS_OR : R600_LDS_1A1D_NORET <0xa, "LDS_OR", [] >; +def LDS_XOR : R600_LDS_1A1D_NORET <0xb, "LDS_XOR", [] >; +def LDS_WRXCHG: R600_LDS_1A1D_NORET <0xd, "LDS_WRXCHG", [] >; +def LDS_CMPST: R600_LDS_1A2D_NORET <0x10, "LDS_CMPST", [] >; +def LDS_MIN_INT : R600_LDS_1A1D_NORET <0x5, "LDS_MIN_INT", [] >; +def LDS_MAX_INT : R600_LDS_1A1D_NORET <0x6, "LDS_MAX_INT", [] >; +def LDS_MIN_UINT : R600_LDS_1A1D_NORET <0x7, "LDS_MIN_UINT", [] >; +def LDS_MAX_UINT : R600_LDS_1A1D_NORET <0x8, "LDS_MAX_UINT", [] >; def LDS_WRITE : R600_LDS_1A1D_NORET <0xD, "LDS_WRITE", [(local_store (i32 R600_Reg32:$src1), R600_Reg32:$src0)] >; @@ -496,6 +530,33 @@ def LDS_ADD_RET : R600_LDS_1A1D_RET <0x20, "LDS_ADD", def LDS_SUB_RET : R600_LDS_1A1D_RET <0x21, "LDS_SUB", [(set i32:$dst, (atomic_load_sub_local i32:$src0, i32:$src1))] >; +def LDS_AND_RET : R600_LDS_1A1D_RET <0x29, "LDS_AND", + [(set i32:$dst, (atomic_load_and_local i32:$src0, i32:$src1))] +>; +def LDS_OR_RET : R600_LDS_1A1D_RET <0x2a, "LDS_OR", + [(set i32:$dst, (atomic_load_or_local i32:$src0, i32:$src1))] +>; +def LDS_XOR_RET : R600_LDS_1A1D_RET <0x2b, "LDS_XOR", + [(set i32:$dst, (atomic_load_xor_local i32:$src0, 
i32:$src1))] +>; +def LDS_MIN_INT_RET : R600_LDS_1A1D_RET <0x25, "LDS_MIN_INT", + [(set i32:$dst, (atomic_load_min_local i32:$src0, i32:$src1))] +>; +def LDS_MAX_INT_RET : R600_LDS_1A1D_RET <0x26, "LDS_MAX_INT", + [(set i32:$dst, (atomic_load_max_local i32:$src0, i32:$src1))] +>; +def LDS_MIN_UINT_RET : R600_LDS_1A1D_RET <0x27, "LDS_MIN_UINT", + [(set i32:$dst, (atomic_load_umin_local i32:$src0, i32:$src1))] +>; +def LDS_MAX_UINT_RET : R600_LDS_1A1D_RET <0x28, "LDS_MAX_UINT", + [(set i32:$dst, (atomic_load_umax_local i32:$src0, i32:$src1))] +>; +def LDS_WRXCHG_RET : R600_LDS_1A1D_RET <0x2d, "LDS_WRXCHG", + [(set i32:$dst, (atomic_swap_local i32:$src0, i32:$src1))] +>; +def LDS_CMPST_RET : R600_LDS_1A2D_RET <0x30, "LDS_CMPST", + [(set i32:$dst, (atomic_cmp_swap_32_local i32:$src0, i32:$src1, i32:$src2))] +>; def LDS_READ_RET : R600_LDS_1A <0x32, "LDS_READ_RET", [(set (i32 R600_Reg32:$dst), (local_load R600_Reg32:$src0))] >; @@ -529,7 +590,7 @@ def : Pat<(fp_to_uint f32:$src0), (FLT_TO_UINT_eg (TRUNC $src0))>; // SHA-256 Patterns def : SHA256MaPattern <BFI_INT_eg, XOR_INT>; -def : FROUNDPat <CNDGE_eg>; +def : FROUNDPat <CNDGE_eg, CNDGT_eg>; def EG_ExportSwz : ExportSwzInst { let Word1{19-16} = 0; // BURST_COUNT diff --git a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp index 0927040cb5bc..8271c6f45fb9 100644 --- a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp +++ b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp @@ -10,8 +10,10 @@ #include "AMDGPUInstPrinter.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "SIDefines.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/Support/MathExtras.h" @@ -40,6 +42,81 @@ void AMDGPUInstPrinter::printU32ImmOperand(const MCInst *MI, unsigned OpNo, O << formatHex(MI->getOperand(OpNo).getImm() & 0xffffffff); } +void AMDGPUInstPrinter::printU8ImmDecOperand(const MCInst *MI, 
unsigned OpNo, + raw_ostream &O) { + O << formatDec(MI->getOperand(OpNo).getImm() & 0xff); +} + +void AMDGPUInstPrinter::printU16ImmDecOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + O << formatDec(MI->getOperand(OpNo).getImm() & 0xffff); +} + +void AMDGPUInstPrinter::printOffen(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (MI->getOperand(OpNo).getImm()) + O << " offen"; +} + +void AMDGPUInstPrinter::printIdxen(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (MI->getOperand(OpNo).getImm()) + O << " idxen"; +} + +void AMDGPUInstPrinter::printAddr64(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (MI->getOperand(OpNo).getImm()) + O << " addr64"; +} + +void AMDGPUInstPrinter::printMBUFOffset(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (MI->getOperand(OpNo).getImm()) { + O << " offset:"; + printU16ImmDecOperand(MI, OpNo, O); + } +} + +void AMDGPUInstPrinter::printDSOffset(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + uint16_t Imm = MI->getOperand(OpNo).getImm(); + if (Imm != 0) { + O << " offset:"; + printU16ImmDecOperand(MI, OpNo, O); + } +} + +void AMDGPUInstPrinter::printDSOffset0(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + O << " offset0:"; + printU8ImmDecOperand(MI, OpNo, O); +} + +void AMDGPUInstPrinter::printDSOffset1(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + O << " offset1:"; + printU8ImmDecOperand(MI, OpNo, O); +} + +void AMDGPUInstPrinter::printGLC(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (MI->getOperand(OpNo).getImm()) + O << " glc"; +} + +void AMDGPUInstPrinter::printSLC(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (MI->getOperand(OpNo).getImm()) + O << " slc"; +} + +void AMDGPUInstPrinter::printTFE(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (MI->getOperand(OpNo).getImm()) + O << " tfe"; +} + void AMDGPUInstPrinter::printRegOperand(unsigned reg, raw_ostream &O) { switch (reg) { case AMDGPU::VCC: @@ -54,6 +131,27 @@ void 
AMDGPUInstPrinter::printRegOperand(unsigned reg, raw_ostream &O) { case AMDGPU::M0: O << "m0"; return; + case AMDGPU::FLAT_SCR: + O << "flat_scratch"; + return; + case AMDGPU::VCC_LO: + O << "vcc_lo"; + return; + case AMDGPU::VCC_HI: + O << "vcc_hi"; + return; + case AMDGPU::EXEC_LO: + O << "exec_lo"; + return; + case AMDGPU::EXEC_HI: + O << "exec_hi"; + return; + case AMDGPU::FLAT_SCR_LO: + O << "flat_scratch_lo"; + return; + case AMDGPU::FLAT_SCR_HI: + O << "flat_scratch_hi"; + return; default: break; } @@ -110,26 +208,62 @@ void AMDGPUInstPrinter::printRegOperand(unsigned reg, raw_ostream &O) { O << Type << '[' << RegIdx << ':' << (RegIdx + NumRegs - 1) << ']'; } -void AMDGPUInstPrinter::printImmediate(uint32_t Imm, raw_ostream &O) { +void AMDGPUInstPrinter::printImmediate32(uint32_t Imm, raw_ostream &O) { int32_t SImm = static_cast<int32_t>(Imm); if (SImm >= -16 && SImm <= 64) { O << SImm; return; } - if (Imm == FloatToBits(1.0f) || - Imm == FloatToBits(-1.0f) || - Imm == FloatToBits(0.5f) || - Imm == FloatToBits(-0.5f) || - Imm == FloatToBits(2.0f) || - Imm == FloatToBits(-2.0f) || - Imm == FloatToBits(4.0f) || - Imm == FloatToBits(-4.0f)) { - O << BitsToFloat(Imm); + if (Imm == FloatToBits(0.0f)) + O << "0.0"; + else if (Imm == FloatToBits(1.0f)) + O << "1.0"; + else if (Imm == FloatToBits(-1.0f)) + O << "-1.0"; + else if (Imm == FloatToBits(0.5f)) + O << "0.5"; + else if (Imm == FloatToBits(-0.5f)) + O << "-0.5"; + else if (Imm == FloatToBits(2.0f)) + O << "2.0"; + else if (Imm == FloatToBits(-2.0f)) + O << "-2.0"; + else if (Imm == FloatToBits(4.0f)) + O << "4.0"; + else if (Imm == FloatToBits(-4.0f)) + O << "-4.0"; + else + O << formatHex(static_cast<uint64_t>(Imm)); +} + +void AMDGPUInstPrinter::printImmediate64(uint64_t Imm, raw_ostream &O) { + int64_t SImm = static_cast<int64_t>(Imm); + if (SImm >= -16 && SImm <= 64) { + O << SImm; return; } - O << formatHex(static_cast<uint64_t>(Imm)); + if (Imm == DoubleToBits(0.0)) + O << "0.0"; + else if (Imm == 
DoubleToBits(1.0)) + O << "1.0"; + else if (Imm == DoubleToBits(-1.0)) + O << "-1.0"; + else if (Imm == DoubleToBits(0.5)) + O << "0.5"; + else if (Imm == DoubleToBits(-0.5)) + O << "-0.5"; + else if (Imm == DoubleToBits(2.0)) + O << "2.0"; + else if (Imm == DoubleToBits(-2.0)) + O << "-2.0"; + else if (Imm == DoubleToBits(4.0)) + O << "4.0"; + else if (Imm == DoubleToBits(-4.0)) + O << "-4.0"; + else + llvm_unreachable("64-bit literal constants not supported"); } void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, @@ -147,27 +281,55 @@ void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, break; } } else if (Op.isImm()) { - printImmediate(Op.getImm(), O); + const MCInstrDesc &Desc = MII.get(MI->getOpcode()); + int RCID = Desc.OpInfo[OpNo].RegClass; + if (RCID != -1) { + const MCRegisterClass &ImmRC = MRI.getRegClass(RCID); + if (ImmRC.getSize() == 4) + printImmediate32(Op.getImm(), O); + else if (ImmRC.getSize() == 8) + printImmediate64(Op.getImm(), O); + else + llvm_unreachable("Invalid register class size"); + } else { + // We hit this for the immediate instruction bits that don't yet have a + // custom printer. + // TODO: Eventually this should be unnecessary. + O << formatDec(Op.getImm()); + } } else if (Op.isFPImm()) { - O << Op.getFPImm(); + // We special case 0.0 because otherwise it will be printed as an integer. 
+ if (Op.getFPImm() == 0.0) + O << "0.0"; + else { + const MCInstrDesc &Desc = MII.get(MI->getOpcode()); + const MCRegisterClass &ImmRC = MRI.getRegClass(Desc.OpInfo[OpNo].RegClass); + + if (ImmRC.getSize() == 4) + printImmediate32(FloatToBits(Op.getFPImm()), O); + else if (ImmRC.getSize() == 8) + printImmediate64(DoubleToBits(Op.getFPImm()), O); + else + llvm_unreachable("Invalid register class size"); + } } else if (Op.isExpr()) { const MCExpr *Exp = Op.getExpr(); Exp->print(O); } else { - assert(!"unknown operand type in printOperand"); + llvm_unreachable("unknown operand type in printOperand"); } } void AMDGPUInstPrinter::printOperandAndMods(const MCInst *MI, unsigned OpNo, raw_ostream &O) { unsigned InputModifiers = MI->getOperand(OpNo).getImm(); - if (InputModifiers & 0x1) - O << "-"; - if (InputModifiers & 0x2) - O << "|"; + if (InputModifiers & SISrcMods::NEG) + O << '-'; + if (InputModifiers & SISrcMods::ABS) + O << '|'; printOperand(MI, OpNo + 1, O); - if (InputModifiers & 0x2) - O << "|"; + if (InputModifiers & SISrcMods::ABS) + O << '|'; } void AMDGPUInstPrinter::printInterpSlot(const MCInst *MI, unsigned OpNum, @@ -181,7 +343,7 @@ void AMDGPUInstPrinter::printInterpSlot(const MCInst *MI, unsigned OpNum, } else if (Imm == 0) { O << "P10"; } else { - assert(!"Invalid interpolation parameter slot"); + llvm_unreachable("Invalid interpolation parameter slot"); } } @@ -214,6 +376,23 @@ void AMDGPUInstPrinter::printClamp(const MCInst *MI, unsigned OpNo, printIfSet(MI, OpNo, O, "_SAT"); } +void AMDGPUInstPrinter::printClampSI(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (MI->getOperand(OpNo).getImm()) + O << " clamp"; +} + +void AMDGPUInstPrinter::printOModSI(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + int Imm = MI->getOperand(OpNo).getImm(); + if (Imm == SIOutMods::MUL2) + O << " mul:2"; + else if (Imm == SIOutMods::MUL4) + O << " mul:4"; + else if (Imm == SIOutMods::DIV2) + O << " div:2"; +} + void 
AMDGPUInstPrinter::printLiteral(const MCInst *MI, unsigned OpNo, raw_ostream &O) { int32_t Imm = MI->getOperand(OpNo).getImm(); @@ -281,7 +460,7 @@ void AMDGPUInstPrinter::printSel(const MCInst *MI, unsigned OpNo, sel -= 512; int cb = sel >> 12; sel &= 4095; - O << cb << "[" << sel << "]"; + O << cb << '[' << sel << ']'; } else if (sel >= 448) { sel -= 448; O << sel; @@ -290,7 +469,7 @@ void AMDGPUInstPrinter::printSel(const MCInst *MI, unsigned OpNo, } if (sel >= 0) - O << "." << chans[chan]; + O << '.' << chans[chan]; } void AMDGPUInstPrinter::printBankSwizzle(const MCInst *MI, unsigned OpNo, @@ -323,25 +502,25 @@ void AMDGPUInstPrinter::printRSel(const MCInst *MI, unsigned OpNo, unsigned Sel = MI->getOperand(OpNo).getImm(); switch (Sel) { case 0: - O << "X"; + O << 'X'; break; case 1: - O << "Y"; + O << 'Y'; break; case 2: - O << "Z"; + O << 'Z'; break; case 3: - O << "W"; + O << 'W'; break; case 4: - O << "0"; + O << '0'; break; case 5: - O << "1"; + O << '1'; break; case 7: - O << "_"; + O << '_'; break; default: break; @@ -353,10 +532,10 @@ void AMDGPUInstPrinter::printCT(const MCInst *MI, unsigned OpNo, unsigned CT = MI->getOperand(OpNo).getImm(); switch (CT) { case 0: - O << "U"; + O << 'U'; break; case 1: - O << "N"; + O << 'N'; break; default: break; @@ -368,10 +547,10 @@ void AMDGPUInstPrinter::printKCache(const MCInst *MI, unsigned OpNo, int KCacheMode = MI->getOperand(OpNo).getImm(); if (KCacheMode > 0) { int KCacheBank = MI->getOperand(OpNo - 2).getImm(); - O << "CB" << KCacheBank <<":"; + O << "CB" << KCacheBank << ':'; int KCacheAddr = MI->getOperand(OpNo + 2).getImm(); - int LineSize = (KCacheMode == 1)?16:32; - O << KCacheAddr * 16 << "-" << KCacheAddr * 16 + LineSize; + int LineSize = (KCacheMode == 1) ? 
16 : 32; + O << KCacheAddr * 16 << '-' << KCacheAddr * 16 + LineSize; } } @@ -415,12 +594,26 @@ void AMDGPUInstPrinter::printWaitFlag(const MCInst *MI, unsigned OpNo, unsigned Vmcnt = SImm16 & 0xF; unsigned Expcnt = (SImm16 >> 4) & 0xF; unsigned Lgkmcnt = (SImm16 >> 8) & 0xF; - if (Vmcnt != 0xF) - O << "vmcnt(" << Vmcnt << ") "; - if (Expcnt != 0x7) - O << "expcnt(" << Expcnt << ") "; - if (Lgkmcnt != 0x7) - O << "lgkmcnt(" << Lgkmcnt << ")"; + + bool NeedSpace = false; + + if (Vmcnt != 0xF) { + O << "vmcnt(" << Vmcnt << ')'; + NeedSpace = true; + } + + if (Expcnt != 0x7) { + if (NeedSpace) + O << ' '; + O << "expcnt(" << Expcnt << ')'; + NeedSpace = true; + } + + if (Lgkmcnt != 0x7) { + if (NeedSpace) + O << ' '; + O << "lgkmcnt(" << Lgkmcnt << ')'; + } } #include "AMDGPUGenAsmWriter.inc" diff --git a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h index 6ca717076cde..1d43c7acbe74 100644 --- a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h +++ b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h @@ -10,8 +10,8 @@ /// \file //===----------------------------------------------------------------------===// -#ifndef AMDGPUINSTPRINTER_H -#define AMDGPUINSTPRINTER_H +#ifndef LLVM_LIB_TARGET_R600_INSTPRINTER_AMDGPUINSTPRINTER_H +#define LLVM_LIB_TARGET_R600_INSTPRINTER_AMDGPUINSTPRINTER_H #include "llvm/ADT/StringRef.h" #include "llvm/MC/MCInstPrinter.h" @@ -34,9 +34,22 @@ public: private: void printU8ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printU16ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printU8ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printU16ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printU32ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printOffen(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printIdxen(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printAddr64(const MCInst 
*MI, unsigned OpNo, raw_ostream &O); + void printMBUFOffset(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printDSOffset(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printDSOffset0(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printDSOffset1(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printGLC(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printSLC(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printTFE(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printRegOperand(unsigned RegNo, raw_ostream &O); - void printImmediate(uint32_t Imm, raw_ostream &O); + void printImmediate32(uint32_t I, raw_ostream &O); + void printImmediate64(uint64_t I, raw_ostream &O); void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printOperandAndMods(const MCInst *MI, unsigned OpNo, raw_ostream &O); static void printInterpSlot(const MCInst *MI, unsigned OpNum, raw_ostream &O); @@ -45,6 +58,8 @@ private: StringRef Asm, StringRef Default = ""); static void printAbs(const MCInst *MI, unsigned OpNo, raw_ostream &O); static void printClamp(const MCInst *MI, unsigned OpNo, raw_ostream &O); + static void printClampSI(const MCInst *MI, unsigned OpNo, raw_ostream &O); + static void printOModSI(const MCInst *MI, unsigned OpNo, raw_ostream &O); static void printLiteral(const MCInst *MI, unsigned OpNo, raw_ostream &O); static void printLast(const MCInst *MI, unsigned OpNo, raw_ostream &O); static void printNeg(const MCInst *MI, unsigned OpNo, raw_ostream &O); @@ -65,4 +80,4 @@ private: } // End namespace llvm -#endif // AMDGPUINSTRPRINTER_H +#endif diff --git a/lib/Target/R600/LLVMBuild.txt b/lib/Target/R600/LLVMBuild.txt index 908872b55cd2..f3f254fdcbad 100644 --- a/lib/Target/R600/LLVMBuild.txt +++ b/lib/Target/R600/LLVMBuild.txt @@ -16,17 +16,18 @@ ;===------------------------------------------------------------------------===; [common] -subdirectories = InstPrinter MCTargetDesc TargetInfo +subdirectories = 
AsmParser InstPrinter MCTargetDesc TargetInfo [component_0] type = TargetGroup name = R600 parent = Target +has_asmparser = 1 has_asmprinter = 1 [component_1] type = Library name = R600CodeGen parent = R600 -required_libraries = Analysis AsmPrinter CodeGen Core MC R600AsmPrinter R600Desc R600Info Scalar SelectionDAG Support Target +required_libraries = Analysis AsmPrinter CodeGen Core IPO MC R600AsmParser R600AsmPrinter R600Desc R600Info Scalar SelectionDAG Support Target TransformUtils add_to_library_groups = R600 diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp b/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp index d55f27b04554..d0c634fb7e42 100644 --- a/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp +++ b/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp @@ -29,7 +29,7 @@ public: const MCAsmLayout &Layout) override { //XXX: Implement if necessary. } - void RecordRelocation(const MCAssembler &Asm, const MCAsmLayout &Layout, + void RecordRelocation(MCAssembler &Asm, const MCAsmLayout &Layout, const MCFragment *Fragment, const MCFixup &Fixup, MCValue Target, bool &IsPCRel, uint64_t &FixedValue) override { @@ -57,9 +57,7 @@ public: assert(!"Not implemented"); } bool mayNeedRelaxation(const MCInst &Inst) const override { return false; } - bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override { - return true; - } + bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override; const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override; }; @@ -116,6 +114,13 @@ const MCFixupKindInfo &AMDGPUAsmBackend::getFixupKindInfo( return Infos[Kind - FirstTargetFixupKind]; } +bool AMDGPUAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const { + for (unsigned i = 0; i < Count; ++i) + OW->Write8(0); + + return true; +} + //===----------------------------------------------------------------------===// // ELFAMDGPUAsmBackend class //===----------------------------------------------------------------------===// diff 
--git a/lib/Target/R600/MCTargetDesc/AMDGPUFixupKinds.h b/lib/Target/R600/MCTargetDesc/AMDGPUFixupKinds.h index 4b12e548a56f..01021d67ffd9 100644 --- a/lib/Target/R600/MCTargetDesc/AMDGPUFixupKinds.h +++ b/lib/Target/R600/MCTargetDesc/AMDGPUFixupKinds.h @@ -7,8 +7,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_AMDGPUFIXUPKINDS_H -#define LLVM_AMDGPUFIXUPKINDS_H +#ifndef LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUFIXUPKINDS_H +#define LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUFIXUPKINDS_H #include "llvm/MC/MCFixup.h" @@ -31,4 +31,4 @@ enum Fixups { } } -#endif // LLVM_AMDGPUFIXUPKINDS_H +#endif diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp b/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp index 78bbe0a163c9..19d89fb27caa 100644 --- a/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp +++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp @@ -11,21 +11,15 @@ #include "AMDGPUMCAsmInfo.h" using namespace llvm; -AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(StringRef &TT) : MCAsmInfo() { +AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(StringRef &TT) : MCAsmInfoELF() { HasSingleParameterDotFile = false; //===------------------------------------------------------------------===// - HasSubsectionsViaSymbols = true; - HasMachoZeroFillDirective = false; - HasMachoTBSSDirective = false; - HasStaticCtorDtorReferenceInStaticMode = false; - LinkerRequiresNonEmptyDwarfLines = true; MaxInstLength = 16; SeparatorString = "\n"; CommentString = ";"; - LabelSuffix = ":"; + PrivateLabelPrefix = ""; InlineAsmStart = ";#ASMSTART"; InlineAsmEnd = ";#ASMEND"; - AssemblerDialect = 0; //===--- Data Emission Directives -------------------------------------===// ZeroDirective = ".zero"; @@ -35,28 +29,15 @@ AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(StringRef &TT) : MCAsmInfo() { Data16bitsDirective = ".short\t"; Data32bitsDirective = ".long\t"; Data64bitsDirective = ".quad\t"; - GPRel32Directive = nullptr; SunStyleELFSectionSwitchSyntax = true; 
UsesELFSectionDirectiveForBSS = true; - //===--- Alignment Information ----------------------------------------===// - AlignmentIsInBytes = true; - TextAlignFillValue = 0; - //===--- Global Variable Emission Directives --------------------------===// - GlobalDirective = ".global"; - HasSetDirective = false; HasAggressiveSymbolFolding = true; COMMDirectiveAlignmentIsInBytes = false; HasDotTypeDotSizeDirective = false; HasNoDeadStrip = true; WeakRefDirective = ".weakref\t"; //===--- Dwarf Emission Directives -----------------------------------===// - HasLEB128 = true; SupportsDebugInformation = true; } - -const MCSection* -AMDGPUMCAsmInfo::getNonexecutableStackSection(MCContext &CTX) const { - return nullptr; -} diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.h b/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.h index 59aebece540a..8f75c76c4257 100644 --- a/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.h +++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.h @@ -1,4 +1,4 @@ -//===-- MCTargetDesc/AMDGPUMCAsmInfo.h - AMDGPU MCAsm Interface ----------===// +//===-- MCTargetDesc/AMDGPUMCAsmInfo.h - AMDGPU MCAsm Interface -*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -11,18 +11,22 @@ // //===----------------------------------------------------------------------===// -#ifndef AMDGPUMCASMINFO_H -#define AMDGPUMCASMINFO_H +#ifndef LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUMCASMINFO_H +#define LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUMCASMINFO_H -#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCAsmInfoELF.h" namespace llvm { class StringRef; -class AMDGPUMCAsmInfo : public MCAsmInfo { +// If you need to create another MCAsmInfo class, which inherits from MCAsmInfo, +// you will need to make sure your new class sets PrivateGlobalPrefix to +// a prefix that won't appeary in a fuction name. The default value +// for PrivateGlobalPrefix is 'L', so it will consider any function starting +// with 'L' as a local symbol. 
+class AMDGPUMCAsmInfo : public MCAsmInfoELF { public: explicit AMDGPUMCAsmInfo(StringRef &TT); - const MCSection* getNonexecutableStackSection(MCContext &CTX) const override; }; } // namespace llvm -#endif // AMDGPUMCASMINFO_H +#endif diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.h b/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.h index d5e432de564c..c95742762233 100644 --- a/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.h +++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.h @@ -12,8 +12,8 @@ // //===----------------------------------------------------------------------===// -#ifndef AMDGPUCODEEMITTER_H -#define AMDGPUCODEEMITTER_H +#ifndef LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUMCCODEEMITTER_H +#define LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUMCCODEEMITTER_H #include "llvm/MC/MCCodeEmitter.h" #include "llvm/Support/raw_ostream.h" @@ -47,4 +47,4 @@ public: } // End namespace llvm -#endif // AMDGPUCODEEMITTER_H +#endif diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp index 38a295659f96..83403ba04870 100644 --- a/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp +++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp @@ -15,6 +15,7 @@ #include "AMDGPUMCTargetDesc.h" #include "AMDGPUMCAsmInfo.h" #include "InstPrinter/AMDGPUInstPrinter.h" +#include "SIDefines.h" #include "llvm/MC/MCCodeGenInfo.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" @@ -84,31 +85,37 @@ static MCCodeEmitter *createAMDGPUMCCodeEmitter(const MCInstrInfo &MCII, static MCStreamer *createMCStreamer(const Target &T, StringRef TT, MCContext &Ctx, MCAsmBackend &MAB, - raw_ostream &_OS, - MCCodeEmitter *_Emitter, - const MCSubtargetInfo &STI, - bool RelaxAll, - bool NoExecStack) { - return createELFStreamer(Ctx, MAB, _OS, _Emitter, false, false); + raw_ostream &_OS, MCCodeEmitter *_Emitter, + const MCSubtargetInfo &STI, bool RelaxAll) { + return createELFStreamer(Ctx, MAB, _OS, 
_Emitter, false); } extern "C" void LLVMInitializeR600TargetMC() { RegisterMCAsmInfo<AMDGPUMCAsmInfo> Y(TheAMDGPUTarget); + RegisterMCAsmInfo<AMDGPUMCAsmInfo> Z(TheGCNTarget); TargetRegistry::RegisterMCCodeGenInfo(TheAMDGPUTarget, createAMDGPUMCCodeGenInfo); + TargetRegistry::RegisterMCCodeGenInfo(TheGCNTarget, createAMDGPUMCCodeGenInfo); TargetRegistry::RegisterMCInstrInfo(TheAMDGPUTarget, createAMDGPUMCInstrInfo); + TargetRegistry::RegisterMCInstrInfo(TheGCNTarget, createAMDGPUMCInstrInfo); TargetRegistry::RegisterMCRegInfo(TheAMDGPUTarget, createAMDGPUMCRegisterInfo); + TargetRegistry::RegisterMCRegInfo(TheGCNTarget, createAMDGPUMCRegisterInfo); TargetRegistry::RegisterMCSubtargetInfo(TheAMDGPUTarget, createAMDGPUMCSubtargetInfo); + TargetRegistry::RegisterMCSubtargetInfo(TheGCNTarget, createAMDGPUMCSubtargetInfo); TargetRegistry::RegisterMCInstPrinter(TheAMDGPUTarget, createAMDGPUMCInstPrinter); + TargetRegistry::RegisterMCInstPrinter(TheGCNTarget, createAMDGPUMCInstPrinter); TargetRegistry::RegisterMCCodeEmitter(TheAMDGPUTarget, createAMDGPUMCCodeEmitter); + TargetRegistry::RegisterMCCodeEmitter(TheGCNTarget, createAMDGPUMCCodeEmitter); TargetRegistry::RegisterMCAsmBackend(TheAMDGPUTarget, createAMDGPUAsmBackend); + TargetRegistry::RegisterMCAsmBackend(TheGCNTarget, createAMDGPUAsmBackend); TargetRegistry::RegisterMCObjectStreamer(TheAMDGPUTarget, createMCStreamer); + TargetRegistry::RegisterMCObjectStreamer(TheGCNTarget, createMCStreamer); } diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h index f6b3376da32c..bc8cd53d84b4 100644 --- a/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h +++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h @@ -13,8 +13,8 @@ //===----------------------------------------------------------------------===// // -#ifndef AMDGPUMCTARGETDESC_H -#define AMDGPUMCTARGETDESC_H +#ifndef LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUMCTARGETDESC_H +#define 
LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUMCTARGETDESC_H #include "llvm/ADT/StringRef.h" @@ -30,6 +30,7 @@ class Target; class raw_ostream; extern Target TheAMDGPUTarget; +extern Target TheGCNTarget; MCCodeEmitter *createR600MCCodeEmitter(const MCInstrInfo &MCII, const MCRegisterInfo &MRI, @@ -55,4 +56,4 @@ MCObjectWriter *createAMDGPUELFObjectWriter(raw_ostream &OS); #define GET_SUBTARGETINFO_ENUM #include "AMDGPUGenSubtargetInfo.inc" -#endif // AMDGPUMCTARGETDESC_H +#endif diff --git a/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp b/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp index 78776c11d75d..640de3f9fc84 100644 --- a/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp +++ b/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp @@ -14,9 +14,10 @@ //===----------------------------------------------------------------------===// #include "AMDGPU.h" -#include "MCTargetDesc/AMDGPUMCTargetDesc.h" -#include "MCTargetDesc/AMDGPUMCCodeEmitter.h" #include "MCTargetDesc/AMDGPUFixupKinds.h" +#include "MCTargetDesc/AMDGPUMCCodeEmitter.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "SIDefines.h" #include "llvm/MC/MCCodeEmitter.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCFixup.h" @@ -84,12 +85,10 @@ MCCodeEmitter *llvm::createSIMCCodeEmitter(const MCInstrInfo &MCII, bool SIMCCodeEmitter::isSrcOperand(const MCInstrDesc &Desc, unsigned OpNo) const { + unsigned OpType = Desc.OpInfo[OpNo].OperandType; - unsigned RegClass = Desc.OpInfo[OpNo].RegClass; - return (AMDGPU::SSrc_32RegClassID == RegClass) || - (AMDGPU::SSrc_64RegClassID == RegClass) || - (AMDGPU::VSrc_32RegClassID == RegClass) || - (AMDGPU::VSrc_64RegClassID == RegClass); + return OpType == AMDGPU::OPERAND_REG_IMM32 || + OpType == AMDGPU::OPERAND_REG_INLINE_C; } uint32_t SIMCCodeEmitter::getLitEncoding(const MCOperand &MO) const { diff --git a/lib/Target/R600/Makefile b/lib/Target/R600/Makefile index 1b3ebbe8c8f3..64a7c8c045c5 100644 --- a/lib/Target/R600/Makefile +++ b/lib/Target/R600/Makefile @@ 
-16,8 +16,8 @@ BUILT_SOURCES = AMDGPUGenRegisterInfo.inc AMDGPUGenInstrInfo.inc \ AMDGPUGenDAGISel.inc AMDGPUGenSubtargetInfo.inc \ AMDGPUGenMCCodeEmitter.inc AMDGPUGenCallingConv.inc \ AMDGPUGenIntrinsics.inc AMDGPUGenDFAPacketizer.inc \ - AMDGPUGenAsmWriter.inc + AMDGPUGenAsmWriter.inc AMDGPUGenAsmMatcher.inc -DIRS = InstPrinter TargetInfo MCTargetDesc +DIRS = AsmParser InstPrinter TargetInfo MCTargetDesc include $(LEVEL)/Makefile.common diff --git a/lib/Target/R600/Processors.td b/lib/Target/R600/Processors.td index ce17d7cb7f13..cff97cdb3beb 100644 --- a/lib/Target/R600/Processors.td +++ b/lib/Target/R600/Processors.td @@ -83,28 +83,38 @@ def : Proc<"cayman", R600_VLIW4_Itin, // Southern Islands //===----------------------------------------------------------------------===// -def : Proc<"SI", SI_Itin, [FeatureSouthernIslands]>; +def : ProcessorModel<"SI", SIFullSpeedModel, [FeatureSouthernIslands]>; -def : Proc<"tahiti", SI_Itin, [FeatureSouthernIslands]>; +def : ProcessorModel<"tahiti", SIFullSpeedModel, [FeatureSouthernIslands]>; -def : Proc<"pitcairn", SI_Itin, [FeatureSouthernIslands]>; +def : ProcessorModel<"pitcairn", SIQuarterSpeedModel, [FeatureSouthernIslands]>; -def : Proc<"verde", SI_Itin, [FeatureSouthernIslands]>; +def : ProcessorModel<"verde", SIQuarterSpeedModel, [FeatureSouthernIslands]>; -def : Proc<"oland", SI_Itin, [FeatureSouthernIslands]>; +def : ProcessorModel<"oland", SIQuarterSpeedModel, [FeatureSouthernIslands]>; -def : Proc<"hainan", SI_Itin, [FeatureSouthernIslands]>; +def : ProcessorModel<"hainan", SIQuarterSpeedModel, [FeatureSouthernIslands]>; //===----------------------------------------------------------------------===// // Sea Islands //===----------------------------------------------------------------------===// -def : Proc<"bonaire", SI_Itin, [FeatureSeaIslands]>; +def : ProcessorModel<"bonaire", SIQuarterSpeedModel, [FeatureSeaIslands]>; -def : Proc<"kabini", SI_Itin, [FeatureSeaIslands]>; +def : ProcessorModel<"kabini", 
SIQuarterSpeedModel, [FeatureSeaIslands]>; -def : Proc<"kaveri", SI_Itin, [FeatureSeaIslands]>; +def : ProcessorModel<"kaveri", SIQuarterSpeedModel, [FeatureSeaIslands]>; -def : Proc<"hawaii", SI_Itin, [FeatureSeaIslands]>; +def : ProcessorModel<"hawaii", SIFullSpeedModel, [FeatureSeaIslands]>; -def : Proc<"mullins", SI_Itin, [FeatureSeaIslands]>; +def : ProcessorModel<"mullins", SIQuarterSpeedModel, [FeatureSeaIslands]>; + +//===----------------------------------------------------------------------===// +// Volcanic Islands +//===----------------------------------------------------------------------===// + +def : ProcessorModel<"tonga", SIFullSpeedModel, [FeatureVolcanicIslands]>; + +def : ProcessorModel<"iceland", SIQuarterSpeedModel, [FeatureVolcanicIslands]>; + +def : ProcessorModel<"carrizo", SIQuarterSpeedModel, [FeatureVolcanicIslands]>; diff --git a/lib/Target/R600/R600ClauseMergePass.cpp b/lib/Target/R600/R600ClauseMergePass.cpp index 92bf0df96254..f07be0001fb8 100644 --- a/lib/Target/R600/R600ClauseMergePass.cpp +++ b/lib/Target/R600/R600ClauseMergePass.cpp @@ -18,6 +18,7 @@ #include "R600InstrInfo.h" #include "R600MachineFunctionInfo.h" #include "R600RegisterInfo.h" +#include "AMDGPUSubtarget.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" @@ -167,7 +168,7 @@ bool R600ClauseMergePass::mergeIfPossible(MachineInstr *RootCFAlu, } bool R600ClauseMergePass::runOnMachineFunction(MachineFunction &MF) { - TII = static_cast<const R600InstrInfo *>(MF.getTarget().getInstrInfo()); + TII = static_cast<const R600InstrInfo *>(MF.getSubtarget().getInstrInfo()); for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end(); BB != BB_E; ++BB) { MachineBasicBlock &MBB = *BB; diff --git a/lib/Target/R600/R600ControlFlowFinalizer.cpp b/lib/Target/R600/R600ControlFlowFinalizer.cpp index e37767a0719d..edaf27841ca7 100644 --- a/lib/Target/R600/R600ControlFlowFinalizer.cpp +++ 
b/lib/Target/R600/R600ControlFlowFinalizer.cpp @@ -336,7 +336,7 @@ private: getHWInstrDesc(IsTex?CF_TC:CF_VC)) .addImm(0) // ADDR .addImm(AluInstCount - 1); // COUNT - return ClauseFile(MIb, ClauseContent); + return ClauseFile(MIb, std::move(ClauseContent)); } void getLiteral(MachineInstr *MI, std::vector<int64_t> &Lits) const { @@ -426,7 +426,7 @@ private: } assert(ClauseContent.size() < 128 && "ALU clause is too big"); ClauseHead->getOperand(7).setImm(ClauseContent.size() - 1); - return ClauseFile(ClauseHead, ClauseContent); + return ClauseFile(ClauseHead, std::move(ClauseContent)); } void @@ -459,11 +459,9 @@ private: void CounterPropagateAddr(MachineInstr *MI, unsigned Addr) const { MI->getOperand(0).setImm(Addr + MI->getOperand(0).getImm()); } - void CounterPropagateAddr(std::set<MachineInstr *> MIs, unsigned Addr) - const { - for (std::set<MachineInstr *>::iterator It = MIs.begin(), E = MIs.end(); - It != E; ++It) { - MachineInstr *MI = *It; + void CounterPropagateAddr(const std::set<MachineInstr *> &MIs, + unsigned Addr) const { + for (MachineInstr *MI : MIs) { CounterPropagateAddr(MI, Addr); } } @@ -477,8 +475,9 @@ public: } bool runOnMachineFunction(MachineFunction &MF) override { - TII=static_cast<const R600InstrInfo *>(MF.getTarget().getInstrInfo()); - TRI=static_cast<const R600RegisterInfo *>(MF.getTarget().getRegisterInfo()); + TII = static_cast<const R600InstrInfo *>(MF.getSubtarget().getInstrInfo()); + TRI = static_cast<const R600RegisterInfo *>( + MF.getSubtarget().getRegisterInfo()); R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>(); CFStack CFStack(ST, MFI->getShaderType()); @@ -542,7 +541,7 @@ public: std::pair<unsigned, std::set<MachineInstr *> > Pair(CfCount, std::set<MachineInstr *>()); Pair.second.insert(MIb); - LoopStack.push_back(Pair); + LoopStack.push_back(std::move(Pair)); MI->eraseFromParent(); CfCount++; break; @@ -550,7 +549,7 @@ public: case AMDGPU::ENDLOOP: { CFStack.popLoop(); std::pair<unsigned, 
std::set<MachineInstr *> > Pair = - LoopStack.back(); + std::move(LoopStack.back()); LoopStack.pop_back(); CounterPropagateAddr(Pair.second, CfCount); BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_END_LOOP)) diff --git a/lib/Target/R600/R600Defines.h b/lib/Target/R600/R600Defines.h index f2f28fe469b5..51d87eda31d1 100644 --- a/lib/Target/R600/R600Defines.h +++ b/lib/Target/R600/R600Defines.h @@ -8,8 +8,8 @@ /// \file //===----------------------------------------------------------------------===// -#ifndef R600DEFINES_H_ -#define R600DEFINES_H_ +#ifndef LLVM_LIB_TARGET_R600_R600DEFINES_H +#define LLVM_LIB_TARGET_R600_R600DEFINES_H #include "llvm/MC/MCRegisterInfo.h" @@ -168,4 +168,4 @@ namespace OpName { #define R_0288E8_SQ_LDS_ALLOC 0x0288E8 -#endif // R600DEFINES_H_ +#endif diff --git a/lib/Target/R600/R600EmitClauseMarkers.cpp b/lib/Target/R600/R600EmitClauseMarkers.cpp index 38afebef400e..fdc20302f4a3 100644 --- a/lib/Target/R600/R600EmitClauseMarkers.cpp +++ b/lib/Target/R600/R600EmitClauseMarkers.cpp @@ -19,6 +19,7 @@ #include "R600InstrInfo.h" #include "R600MachineFunctionInfo.h" #include "R600RegisterInfo.h" +#include "AMDGPUSubtarget.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" @@ -297,7 +298,7 @@ public: } bool runOnMachineFunction(MachineFunction &MF) override { - TII = static_cast<const R600InstrInfo *>(MF.getTarget().getInstrInfo()); + TII = static_cast<const R600InstrInfo *>(MF.getSubtarget().getInstrInfo()); for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end(); BB != BB_E; ++BB) { diff --git a/lib/Target/R600/R600ExpandSpecialInstrs.cpp b/lib/Target/R600/R600ExpandSpecialInstrs.cpp index 732b06dc15c7..211d392e8fcc 100644 --- a/lib/Target/R600/R600ExpandSpecialInstrs.cpp +++ b/lib/Target/R600/R600ExpandSpecialInstrs.cpp @@ -19,6 +19,7 @@ #include "R600InstrInfo.h" #include "R600MachineFunctionInfo.h" #include "R600RegisterInfo.h" 
+#include "AMDGPUSubtarget.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" @@ -65,7 +66,7 @@ void R600ExpandSpecialInstrsPass::SetFlagInNewMI(MachineInstr *NewMI, } bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) { - TII = static_cast<const R600InstrInfo *>(MF.getTarget().getInstrInfo()); + TII = static_cast<const R600InstrInfo *>(MF.getSubtarget().getInstrInfo()); const R600RegisterInfo &TRI = TII->getRegisterInfo(); diff --git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp index 52315bf0f338..595f69884544 100644 --- a/lib/Target/R600/R600ISelLowering.cpp +++ b/lib/Target/R600/R600ISelLowering.cpp @@ -122,12 +122,19 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) : // EXTLOAD should be the same as ZEXTLOAD. It is legal for some address // spaces, so it is custom lowered to handle those where it isn't. - setLoadExtAction(ISD::SEXTLOAD, MVT::i8, Custom); - setLoadExtAction(ISD::SEXTLOAD, MVT::i16, Custom); - setLoadExtAction(ISD::ZEXTLOAD, MVT::i8, Custom); - setLoadExtAction(ISD::ZEXTLOAD, MVT::i16, Custom); - setLoadExtAction(ISD::EXTLOAD, MVT::i8, Custom); - setLoadExtAction(ISD::EXTLOAD, MVT::i16, Custom); + for (MVT VT : MVT::integer_valuetypes()) { + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Custom); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Custom); + + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote); + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Custom); + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Custom); + + setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Custom); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Custom); + } setOperationAction(ISD::STORE, MVT::i8, Custom); setOperationAction(ISD::STORE, MVT::i32, Custom); @@ -181,8 +188,6 @@ 
R600TargetLowering::R600TargetLowering(TargetMachine &TM) : setOperationAction(ISD::SUBE, VT, Expand); } - setBooleanContents(ZeroOrNegativeOneBooleanContent); - setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); setSchedulingPreference(Sched::Source); } @@ -192,7 +197,7 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter( MachineRegisterInfo &MRI = MF->getRegInfo(); MachineBasicBlock::iterator I = *MI; const R600InstrInfo *TII = - static_cast<const R600InstrInfo*>(MF->getTarget().getInstrInfo()); + static_cast<const R600InstrInfo *>(MF->getSubtarget().getInstrInfo()); switch (MI->getOpcode()) { default: @@ -202,7 +207,10 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter( int DstIdx = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst); assert(DstIdx != -1); MachineInstrBuilder NewMI; - if (!MRI.use_empty(MI->getOperand(DstIdx).getReg())) + // FIXME: getLDSNoRetOp method only handles LDS_1A1D LDS ops. Add + // LDS_1A2D support and remove this special case. 
+ if (!MRI.use_empty(MI->getOperand(DstIdx).getReg()) || + MI->getOpcode() == AMDGPU::LDS_CMPST_RET) return BB; NewMI = BuildMI(*BB, I, BB->findDebugLoc(I), @@ -645,8 +653,8 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const MachineSDNode *interp; if (ijb < 0) { const MachineFunction &MF = DAG.getMachineFunction(); - const R600InstrInfo *TII = - static_cast<const R600InstrInfo*>(MF.getTarget().getInstrInfo()); + const R600InstrInfo *TII = static_cast<const R600InstrInfo *>( + MF.getSubtarget().getInstrInfo()); interp = DAG.getMachineNode(AMDGPU::INTERP_VEC_LOAD, DL, MVT::v4f32, DAG.getTargetConstant(slot / 4 , MVT::i32)); return DAG.getTargetExtractSubreg( @@ -806,6 +814,9 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const case Intrinsic::r600_read_local_size_z: return LowerImplicitParameter(DAG, VT, DL, 8); + case Intrinsic::AMDGPU_read_workdim: + return LowerImplicitParameter(DAG, VT, DL, MFI->ABIArgOffset / 4); + case Intrinsic::r600_read_tgid_x: return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, AMDGPU::T1_X, VT); @@ -901,74 +912,7 @@ void R600TargetLowering::ReplaceNodeResults(SDNode *N, } case ISD::UDIVREM: { SDValue Op = SDValue(N, 0); - SDLoc DL(Op); - EVT VT = Op.getValueType(); - EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext()); - - SDValue one = DAG.getConstant(1, HalfVT); - SDValue zero = DAG.getConstant(0, HalfVT); - - //HiLo split - SDValue LHS = N->getOperand(0); - SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, zero); - SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, one); - - SDValue RHS = N->getOperand(1); - SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, zero); - SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, one); - - // Get Speculative values - SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo); - SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, 
RHS_Lo); - - SDValue REM_Hi = zero; - SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, zero, REM_Part, LHS_Hi, ISD::SETEQ); - - SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, zero, DIV_Part, zero, ISD::SETEQ); - SDValue DIV_Lo = zero; - - const unsigned halfBitWidth = HalfVT.getSizeInBits(); - - for (unsigned i = 0; i < halfBitWidth; ++i) { - SDValue POS = DAG.getConstant(halfBitWidth - i - 1, HalfVT); - // Get Value of high bit - SDValue HBit; - if (halfBitWidth == 32 && Subtarget->hasBFE()) { - HBit = DAG.getNode(AMDGPUISD::BFE_U32, DL, HalfVT, LHS_Lo, POS, one); - } else { - HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS); - HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, one); - } - - SDValue Carry = DAG.getNode(ISD::SRL, DL, HalfVT, REM_Lo, - DAG.getConstant(halfBitWidth - 1, HalfVT)); - REM_Hi = DAG.getNode(ISD::SHL, DL, HalfVT, REM_Hi, one); - REM_Hi = DAG.getNode(ISD::OR, DL, HalfVT, REM_Hi, Carry); - - REM_Lo = DAG.getNode(ISD::SHL, DL, HalfVT, REM_Lo, one); - REM_Lo = DAG.getNode(ISD::OR, DL, HalfVT, REM_Lo, HBit); - - - SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, REM_Hi); - - SDValue BIT = DAG.getConstant(1 << (halfBitWidth - i - 1), HalfVT); - SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, zero, ISD::SETGE); - - DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT); - - // Update REM - - SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS); - - REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETGE); - REM_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, REM, zero); - REM_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, REM, one); - } - - SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, REM_Hi); - SDValue DIV = DAG.getNode(ISD::BUILD_PAIR, DL, VT, DIV_Lo, DIV_Hi); - Results.push_back(DIV); - Results.push_back(REM); + LowerUDIVREM64(Op, DAG, Results); break; } } @@ -1176,6 +1120,13 @@ SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const SDValue CC = Op.getOperand(4); SDValue 
Temp; + if (VT == MVT::f32) { + DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr); + SDValue MinMax = CombineFMinMaxLegacy(DL, VT, LHS, RHS, True, False, CC, DCI); + if (MinMax) + return MinMax; + } + // LHS and RHS are guaranteed to be the same value type EVT CompareVT = LHS.getValueType(); @@ -1430,8 +1381,8 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { // Lowering for indirect addressing const MachineFunction &MF = DAG.getMachineFunction(); - const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>( - getTargetMachine().getFrameLowering()); + const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering *>( + getTargetMachine().getSubtargetImpl()->getFrameLowering()); unsigned StackWidth = TFL->getStackWidth(MF); Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG); @@ -1543,7 +1494,7 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const if (LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && VT.isVector()) { SDValue MergedValues[2] = { - SplitVectorLoad(Op, DAG), + ScalarizeVectorLoad(Op, DAG), Chain }; return DAG.getMergeValues(MergedValues, DL); @@ -1613,6 +1564,7 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const LoadNode->getPointerInfo(), MemVT, LoadNode->isVolatile(), LoadNode->isNonTemporal(), + LoadNode->isInvariant(), LoadNode->getAlignment()); SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, NewLoad, ShiftAmount); SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, Shl, ShiftAmount); @@ -1627,8 +1579,8 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const // Lowering for indirect addressing const MachineFunction &MF = DAG.getMachineFunction(); - const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>( - getTargetMachine().getFrameLowering()); + const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering *>( + getTargetMachine().getSubtargetImpl()->getFrameLowering()); unsigned StackWidth = 
TFL->getStackWidth(MF); Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG); @@ -1691,10 +1643,10 @@ SDValue R600TargetLowering::LowerFormalArguments( SDLoc DL, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { SmallVector<CCValAssign, 16> ArgLocs; - CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), ArgLocs, *DAG.getContext()); + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, + *DAG.getContext()); MachineFunction &MF = DAG.getMachineFunction(); - unsigned ShaderType = MF.getInfo<R600MachineFunctionInfo>()->getShaderType(); + R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>(); SmallVector<ISD::InputArg, 8> LocalIns; @@ -1704,10 +1656,15 @@ SDValue R600TargetLowering::LowerFormalArguments( for (unsigned i = 0, e = Ins.size(); i < e; ++i) { CCValAssign &VA = ArgLocs[i]; - EVT VT = Ins[i].VT; - EVT MemVT = LocalIns[i].VT; + const ISD::InputArg &In = Ins[i]; + EVT VT = In.VT; + EVT MemVT = VA.getLocVT(); + if (!VT.isVector() && MemVT.isVector()) { + // Get load source type if scalarized. + MemVT = MemVT.getVectorElementType(); + } - if (ShaderType != ShaderType::COMPUTE) { + if (MFI->getShaderType() != ShaderType::COMPUTE) { unsigned Reg = MF.addLiveIn(VA.getLocReg(), &AMDGPU::R600_Reg128RegClass); SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT); InVals.push_back(Register); @@ -1715,7 +1672,7 @@ SDValue R600TargetLowering::LowerFormalArguments( } PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()), - AMDGPUAS::CONSTANT_BUFFER_0); + AMDGPUAS::CONSTANT_BUFFER_0); // i64 isn't a legal type, so the register type used ends up as i32, which // isn't expected here. It attempts to create this sextload, but it ends up @@ -1724,18 +1681,33 @@ SDValue R600TargetLowering::LowerFormalArguments( // The first 36 bytes of the input buffer contains information about // thread group and global sizes. 
+ ISD::LoadExtType Ext = ISD::NON_EXTLOAD; + if (MemVT.getScalarSizeInBits() != VT.getScalarSizeInBits()) { + // FIXME: This should really check the extload type, but the handling of + // extload vector parameters seems to be broken. + + // Ext = In.Flags.isSExt() ? ISD::SEXTLOAD : ISD::ZEXTLOAD; + Ext = ISD::SEXTLOAD; + } + + // Compute the offset from the value. + // XXX - I think PartOffset should give you this, but it seems to give the + // size of the register which isn't useful. + + unsigned ValBase = ArgLocs[In.OrigArgIndex].getLocMemOffset(); + unsigned PartOffset = VA.getLocMemOffset(); + unsigned Offset = 36 + VA.getLocMemOffset(); - // FIXME: This should really check the extload type, but the handling of - // extload vecto parameters seems to be broken. - //ISD::LoadExtType Ext = Ins[i].Flags.isSExt() ? ISD::SEXTLOAD : ISD::ZEXTLOAD; - ISD::LoadExtType Ext = ISD::SEXTLOAD; - SDValue Arg = DAG.getExtLoad(Ext, DL, VT, Chain, - DAG.getConstant(36 + VA.getLocMemOffset(), MVT::i32), - MachinePointerInfo(UndefValue::get(PtrTy)), - MemVT, false, false, 4); + MachinePointerInfo PtrInfo(UndefValue::get(PtrTy), PartOffset - ValBase); + SDValue Arg = DAG.getLoad(ISD::UNINDEXED, Ext, VT, DL, Chain, + DAG.getConstant(Offset, MVT::i32), + DAG.getUNDEF(MVT::i32), + PtrInfo, + MemVT, false, true, true, 4); // 4 is the preferred alignment for the CONSTANT memory space. 
InVals.push_back(Arg); + MFI->ABIArgOffset = Offset + MemVT.getStoreSize(); } return Chain; } @@ -2081,7 +2053,7 @@ static bool FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src, SDValue &Neg, SDValue &Abs, SDValue &Sel, SDValue &Imm, SelectionDAG &DAG) { const R600InstrInfo *TII = - static_cast<const R600InstrInfo *>(DAG.getTarget().getInstrInfo()); + static_cast<const R600InstrInfo *>(DAG.getSubtarget().getInstrInfo()); if (!Src.isMachineOpcode()) return false; switch (Src.getMachineOpcode()) { @@ -2206,7 +2178,7 @@ FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src, SDValue &Neg, SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node, SelectionDAG &DAG) const { const R600InstrInfo *TII = - static_cast<const R600InstrInfo *>(DAG.getTarget().getInstrInfo()); + static_cast<const R600InstrInfo *>(DAG.getSubtarget().getInstrInfo()); if (!Node->isMachineOpcode()) return Node; unsigned Opcode = Node->getMachineOpcode(); diff --git a/lib/Target/R600/R600ISelLowering.h b/lib/Target/R600/R600ISelLowering.h index d22c8c98a542..10ebc10ccdba 100644 --- a/lib/Target/R600/R600ISelLowering.h +++ b/lib/Target/R600/R600ISelLowering.h @@ -12,8 +12,8 @@ // //===----------------------------------------------------------------------===// -#ifndef R600ISELLOWERING_H -#define R600ISELLOWERING_H +#ifndef LLVM_LIB_TARGET_R600_R600ISELLOWERING_H +#define LLVM_LIB_TARGET_R600_R600ISELLOWERING_H #include "AMDGPUISelLowering.h" @@ -74,4 +74,4 @@ private: } // End namespace llvm; -#endif // R600ISELLOWERING_H +#endif diff --git a/lib/Target/R600/R600InstrFormats.td b/lib/Target/R600/R600InstrFormats.td index 9428babcbefd..0ffd485476ec 100644 --- a/lib/Target/R600/R600InstrFormats.td +++ b/lib/Target/R600/R600InstrFormats.td @@ -38,6 +38,9 @@ class InstR600 <dag outs, dag ins, string asm, list<dag> pattern, let Pattern = pattern; let Itinerary = itin; + // No AsmMatcher support. 
+ let isCodeGenOnly = 1; + let TSFlags{4} = Trig; let TSFlags{5} = Op3; diff --git a/lib/Target/R600/R600InstrInfo.cpp b/lib/Target/R600/R600InstrInfo.cpp index 99920b7761a7..653fd0d52757 100644 --- a/lib/Target/R600/R600InstrInfo.cpp +++ b/lib/Target/R600/R600InstrInfo.cpp @@ -571,7 +571,7 @@ R600InstrInfo::fitsReadPortLimitations(const std::vector<MachineInstr *> &IG, if (!isLastAluTrans) return FindSwizzleForVectorSlot(IGSrcs, ValidSwizzle, TransOps, TransBS); - TransOps = IGSrcs.back(); + TransOps = std::move(IGSrcs.back()); IGSrcs.pop_back(); ValidSwizzle.pop_back(); @@ -654,10 +654,10 @@ R600InstrInfo::fitsConstReadLimitations(const std::vector<MachineInstr *> &MIs) return fitsConstReadLimitations(Consts); } -DFAPacketizer *R600InstrInfo::CreateTargetScheduleState(const TargetMachine *TM, - const ScheduleDAG *DAG) const { - const InstrItineraryData *II = TM->getInstrItineraryData(); - return TM->getSubtarget<AMDGPUSubtarget>().createDFAPacketizer(II); +DFAPacketizer * +R600InstrInfo::CreateTargetScheduleState(const TargetSubtargetInfo &STI) const { + const InstrItineraryData *II = STI.getInstrItineraryData(); + return static_cast<const AMDGPUSubtarget &>(STI).createDFAPacketizer(II); } static bool @@ -1082,9 +1082,8 @@ bool R600InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { void R600InstrInfo::reserveIndirectRegisters(BitVector &Reserved, const MachineFunction &MF) const { - const AMDGPUFrameLowering *TFL = - static_cast<const AMDGPUFrameLowering*>( - MF.getTarget().getFrameLowering()); + const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering *>( + MF.getSubtarget().getFrameLowering()); unsigned StackWidth = TFL->getStackWidth(MF); int End = getIndirectIndexEnd(MF); diff --git a/lib/Target/R600/R600InstrInfo.h b/lib/Target/R600/R600InstrInfo.h index 1c3cb637a178..d3dc0e58daa1 100644 --- a/lib/Target/R600/R600InstrInfo.h +++ b/lib/Target/R600/R600InstrInfo.h @@ -12,8 +12,8 @@ // 
//===----------------------------------------------------------------------===// -#ifndef R600INSTRUCTIONINFO_H_ -#define R600INSTRUCTIONINFO_H_ +#ifndef LLVM_LIB_TARGET_R600_R600INSTRINFO_H +#define LLVM_LIB_TARGET_R600_R600INSTRINFO_H #include "AMDGPUInstrInfo.h" #include "R600Defines.h" @@ -154,8 +154,8 @@ namespace llvm { bool isMov(unsigned Opcode) const override; - DFAPacketizer *CreateTargetScheduleState(const TargetMachine *TM, - const ScheduleDAG *DAG) const override; + DFAPacketizer * + CreateTargetScheduleState(const TargetSubtargetInfo &) const override; bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override; @@ -206,7 +206,7 @@ namespace llvm { int getInstrLatency(const InstrItineraryData *ItinData, SDNode *Node) const override { return 1;} - virtual bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const; + bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override; /// \brief Reserve the registers that may be accesed using indirect addressing. 
void reserveIndirectRegisters(BitVector &Reserved, @@ -298,4 +298,4 @@ int getLDSNoRetOp(uint16_t Opcode); } // End llvm namespace -#endif // R600INSTRINFO_H_ +#endif diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td index 704507d368ec..b1d3ce276eee 100644 --- a/lib/Target/R600/R600Instructions.td +++ b/lib/Target/R600/R600Instructions.td @@ -475,13 +475,13 @@ class ExportBufWord1 { multiclass ExportPattern<Instruction ExportInst, bits<8> cf_inst> { def : Pat<(int_R600_store_pixel_depth R600_Reg32:$reg), (ExportInst - (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), R600_Reg32:$reg, sub0), + (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), $reg, sub0), 0, 61, 0, 7, 7, 7, cf_inst, 0) >; def : Pat<(int_R600_store_pixel_stencil R600_Reg32:$reg), (ExportInst - (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), R600_Reg32:$reg, sub0), + (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), $reg, sub0), 0, 61, 7, 0, 7, 7, cf_inst, 0) >; @@ -513,17 +513,17 @@ multiclass SteamOutputExportPattern<Instruction ExportInst, // Stream1 def : Pat<(int_R600_store_stream_output (v4f32 R600_Reg128:$src), (i32 imm:$arraybase), (i32 1), (i32 imm:$mask)), - (ExportInst R600_Reg128:$src, 0, imm:$arraybase, + (ExportInst $src, 0, imm:$arraybase, 4095, imm:$mask, buf1inst, 0)>; // Stream2 def : Pat<(int_R600_store_stream_output (v4f32 R600_Reg128:$src), (i32 imm:$arraybase), (i32 2), (i32 imm:$mask)), - (ExportInst R600_Reg128:$src, 0, imm:$arraybase, + (ExportInst $src, 0, imm:$arraybase, 4095, imm:$mask, buf2inst, 0)>; // Stream3 def : Pat<(int_R600_store_stream_output (v4f32 R600_Reg128:$src), (i32 imm:$arraybase), (i32 3), (i32 imm:$mask)), - (ExportInst R600_Reg128:$src, 0, imm:$arraybase, + (ExportInst $src, 0, imm:$arraybase, 4095, imm:$mask, buf3inst, 0)>; } @@ -674,8 +674,9 @@ def ADD : R600_2OP_Helper <0x0, "ADD", fadd>; // Non-IEEE MUL: 0 * anything = 0 def MUL : R600_2OP_Helper <0x1, "MUL NON-IEEE", int_AMDGPU_mul>; def MUL_IEEE : R600_2OP_Helper <0x2, "MUL_IEEE", fmul>; -def MAX : 
R600_2OP_Helper <0x3, "MAX", AMDGPUfmax>; -def MIN : R600_2OP_Helper <0x4, "MIN", AMDGPUfmin>; +// TODO: Do these actually match the regular fmin/fmax behavior? +def MAX : R600_2OP_Helper <0x3, "MAX", AMDGPUfmax_legacy>; +def MIN : R600_2OP_Helper <0x4, "MIN", AMDGPUfmin_legacy>; // For the SET* instructions there is a naming conflict in TargetSelectionDAG.td, // so some of the instruction names don't match the asm string. @@ -697,7 +698,7 @@ def SGE : R600_2OP < def SNE : R600_2OP < 0xB, "SETNE", - [(set f32:$dst, (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_UNE))] + [(set f32:$dst, (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_UNE_NE))] >; def SETE_DX10 : R600_2OP < @@ -715,9 +716,10 @@ def SETGE_DX10 : R600_2OP < [(set i32:$dst, (selectcc f32:$src0, f32:$src1, -1, 0, COND_OGE))] >; +// FIXME: This should probably be COND_ONE def SETNE_DX10 : R600_2OP < 0xF, "SETNE_DX10", - [(set i32:$dst, (selectcc f32:$src0, f32:$src1, -1, 0, COND_UNE))] + [(set i32:$dst, (selectcc f32:$src0, f32:$src1, -1, 0, COND_UNE_NE))] >; def FRACT : R600_1OP_Helper <0x10, "FRACT", AMDGPUfract>; @@ -915,6 +917,11 @@ class MULADD_IEEE_Common <bits<5> inst> : R600_3OP < [(set f32:$dst, (fadd (fmul f32:$src0, f32:$src1), f32:$src2))] >; +class FMA_Common <bits<5> inst> : R600_3OP < + inst, "FMA", + [(set f32:$dst, (fma f32:$src0, f32:$src1, f32:$src2))], VecALU +>; + class CNDE_Common <bits<5> inst> : R600_3OP < inst, "CNDE", [(set f32:$dst, (selectcc f32:$src0, FP_ZERO, f32:$src1, f32:$src2, COND_OEQ))] @@ -1068,7 +1075,7 @@ class RECIP_CLAMPED_Common <bits<11> inst> : R600_1OP < } class RECIP_IEEE_Common <bits<11> inst> : R600_1OP < - inst, "RECIP_IEEE", [(set f32:$dst, (fdiv FP_ONE, f32:$src0))] + inst, "RECIP_IEEE", [(set f32:$dst, (AMDGPUrcp f32:$src0))] > { let Itinerary = TransALU; } @@ -1114,6 +1121,7 @@ def FNEG_R600 : FNEG<R600_Reg32>; // Helper patterns for complex intrinsics //===----------------------------------------------------------------------===// +// 
FIXME: Should be predicated on unsafe fp math. multiclass DIV_Common <InstR600 recip_ieee> { def : Pat< (int_AMDGPU_div f32:$src0, f32:$src1), @@ -1124,6 +1132,8 @@ def : Pat< (fdiv f32:$src0, f32:$src1), (MUL_IEEE $src0, (recip_ieee $src1)) >; + +def : RcpPat<recip_ieee, f32>; } class TGSI_LIT_Z_Common <InstR600 mul_lit, InstR600 log_clamped, InstR600 exp_ieee> @@ -1133,9 +1143,12 @@ class TGSI_LIT_Z_Common <InstR600 mul_lit, InstR600 log_clamped, InstR600 exp_ie >; // FROUND pattern -class FROUNDPat<Instruction CNDGE> : Pat < +class FROUNDPat<Instruction CNDGE, Instruction CNDGT> : Pat < (AMDGPUround f32:$x), - (CNDGE (ADD (FNEG_R600 (f32 HALF)), (FRACT $x)), (CEIL $x), (FLOOR $x)) + (CNDGE $x, + (CNDGE (ADD (FNEG_R600 (f32 HALF)), (FRACT $x)), (CEIL $x), (FLOOR $x)), + (CNDGT (ADD (FNEG_R600 (f32 HALF)), (FRACT $x)), (CEIL $x), (FLOOR $x)) + ) >; @@ -1180,7 +1193,9 @@ let Predicates = [isR600] in { def TGSI_LIT_Z_r600 : TGSI_LIT_Z_Common<MUL_LIT_r600, LOG_CLAMPED_r600, EXP_IEEE_r600>; def : Pat<(fsqrt f32:$src), (MUL $src, (RECIPSQRT_CLAMPED_r600 $src))>; - def : FROUNDPat <CNDGE_r600>; + defm : RsqPat<RECIPSQRT_IEEE_r600, f32>; + + def : FROUNDPat <CNDGE_r600, CNDGT_r600>; def R600_ExportSwz : ExportSwzInst { let Word1{20-17} = 0; // BURST_COUNT @@ -1350,7 +1365,7 @@ def CONST_COPY : Instruction { let Pattern = [(set R600_Reg32:$dst, (CONST_ADDRESS ADDRGA_CONST_OFFSET:$src))]; let AsmString = "CONST_COPY"; - let neverHasSideEffects = 1; + let hasSideEffects = 0; let isAsCheapAsAMove = 1; let Itinerary = NullALU; } @@ -1482,6 +1497,7 @@ class ILFormat<dag outs, dag ins, string asmstr, list<dag> pattern> let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; + let isCodeGenOnly = 1; } multiclass BranchConditional<SDNode Op, RegisterClass rci, RegisterClass rcf> { diff --git a/lib/Target/R600/R600MachineFunctionInfo.h b/lib/Target/R600/R600MachineFunctionInfo.h index b0ae22e806a9..263561edd30d 100644 --- a/lib/Target/R600/R600MachineFunctionInfo.h +++ 
b/lib/Target/R600/R600MachineFunctionInfo.h @@ -10,8 +10,8 @@ /// \file //===----------------------------------------------------------------------===// -#ifndef R600MACHINEFUNCTIONINFO_H -#define R600MACHINEFUNCTIONINFO_H +#ifndef LLVM_LIB_TARGET_R600_R600MACHINEFUNCTIONINFO_H +#define LLVM_LIB_TARGET_R600_R600MACHINEFUNCTIONINFO_H #include "AMDGPUMachineFunction.h" #include "llvm/ADT/BitVector.h" @@ -31,4 +31,4 @@ public: } // End llvm namespace -#endif //R600MACHINEFUNCTIONINFO_H +#endif diff --git a/lib/Target/R600/R600MachineScheduler.cpp b/lib/Target/R600/R600MachineScheduler.cpp index 7ea654cb14cd..d782713cab65 100644 --- a/lib/Target/R600/R600MachineScheduler.cpp +++ b/lib/Target/R600/R600MachineScheduler.cpp @@ -14,7 +14,6 @@ #include "R600MachineScheduler.h" #include "AMDGPUSubtarget.h" -#include "llvm/CodeGen/LiveIntervalAnalysis.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/Pass.h" #include "llvm/PassManager.h" @@ -76,21 +75,25 @@ SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) { float ALUFetchRationEstimate = (AluInstCount + AvailablesAluCount() + Pending[IDAlu].size()) / (FetchInstCount + Available[IDFetch].size()); - unsigned NeededWF = 62.5f / ALUFetchRationEstimate; - DEBUG( dbgs() << NeededWF << " approx. Wavefronts Required\n" ); - // We assume the local GPR requirements to be "dominated" by the requirement - // of the TEX clause (which consumes 128 bits regs) ; ALU inst before and - // after TEX are indeed likely to consume or generate values from/for the - // TEX clause. - // Available[IDFetch].size() * 2 : GPRs required in the Fetch clause - // We assume that fetch instructions are either TnXYZW = TEX TnXYZW (need - // one GPR) or TmXYZW = TnXYZW (need 2 GPR). - // (TODO : use RegisterPressure) - // If we are going too use too many GPR, we flush Fetch instruction to lower - // register pressure on 128 bits regs. 
- unsigned NearRegisterRequirement = 2 * Available[IDFetch].size(); - if (NeededWF > getWFCountLimitedByGPR(NearRegisterRequirement)) + if (ALUFetchRationEstimate == 0) { AllowSwitchFromAlu = true; + } else { + unsigned NeededWF = 62.5f / ALUFetchRationEstimate; + DEBUG( dbgs() << NeededWF << " approx. Wavefronts Required\n" ); + // We assume the local GPR requirements to be "dominated" by the requirement + // of the TEX clause (which consumes 128 bits regs) ; ALU inst before and + // after TEX are indeed likely to consume or generate values from/for the + // TEX clause. + // Available[IDFetch].size() * 2 : GPRs required in the Fetch clause + // We assume that fetch instructions are either TnXYZW = TEX TnXYZW (need + // one GPR) or TmXYZW = TnXYZW (need 2 GPR). + // (TODO : use RegisterPressure) + // If we are going too use too many GPR, we flush Fetch instruction to lower + // register pressure on 128 bits regs. + unsigned NearRegisterRequirement = 2 * Available[IDFetch].size(); + if (NeededWF > getWFCountLimitedByGPR(NearRegisterRequirement)) + AllowSwitchFromAlu = true; + } } if (!SU && ((AllowSwitchToAlu && CurInstKind != IDAlu) || diff --git a/lib/Target/R600/R600MachineScheduler.h b/lib/Target/R600/R600MachineScheduler.h index fd475af2bf8f..fc5b95c28e71 100644 --- a/lib/Target/R600/R600MachineScheduler.h +++ b/lib/Target/R600/R600MachineScheduler.h @@ -12,8 +12,8 @@ // //===----------------------------------------------------------------------===// -#ifndef R600MACHINESCHEDULER_H_ -#define R600MACHINESCHEDULER_H_ +#ifndef LLVM_LIB_TARGET_R600_R600MACHINESCHEDULER_H +#define LLVM_LIB_TARGET_R600_R600MACHINESCHEDULER_H #include "R600InstrInfo.h" #include "llvm/ADT/PriorityQueue.h" diff --git a/lib/Target/R600/R600OptimizeVectorRegisters.cpp b/lib/Target/R600/R600OptimizeVectorRegisters.cpp index 2314136f2227..742c0e0451cb 100644 --- a/lib/Target/R600/R600OptimizeVectorRegisters.cpp +++ b/lib/Target/R600/R600OptimizeVectorRegisters.cpp @@ -30,6 +30,7 @@ #include 
"llvm/Support/Debug.h" #include "AMDGPU.h" #include "R600InstrInfo.h" +#include "AMDGPUSubtarget.h" #include "llvm/CodeGen/DFAPacketizer.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunctionPass.h" @@ -279,9 +280,8 @@ bool R600VectorRegMerger::tryMergeUsingCommonSlot(RegSeqInfo &RSI, continue; if (PreviousRegSeqByReg[MOp->getReg()].empty()) continue; - std::vector<MachineInstr *> MIs = PreviousRegSeqByReg[MOp->getReg()]; - for (unsigned i = 0, e = MIs.size(); i < e; i++) { - CompatibleRSI = PreviousRegSeq[MIs[i]]; + for (MachineInstr *MI : PreviousRegSeqByReg[MOp->getReg()]) { + CompatibleRSI = PreviousRegSeq[MI]; if (RSI == CompatibleRSI) continue; if (tryMergeVector(&CompatibleRSI, &RSI, RemapChan)) @@ -314,7 +314,7 @@ void R600VectorRegMerger::trackRSI(const RegSeqInfo &RSI) { } bool R600VectorRegMerger::runOnMachineFunction(MachineFunction &Fn) { - TII = static_cast<const R600InstrInfo *>(Fn.getTarget().getInstrInfo()); + TII = static_cast<const R600InstrInfo *>(Fn.getSubtarget().getInstrInfo()); MRI = &(Fn.getRegInfo()); for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end(); MBB != MBBe; ++MBB) { diff --git a/lib/Target/R600/R600Packetizer.cpp b/lib/Target/R600/R600Packetizer.cpp index 74cf30974d58..ddf68c91cdf3 100644 --- a/lib/Target/R600/R600Packetizer.cpp +++ b/lib/Target/R600/R600Packetizer.cpp @@ -148,11 +148,11 @@ private: } public: // Ctor. 
- R600PacketizerList(MachineFunction &MF, MachineLoopInfo &MLI, - MachineDominatorTree &MDT) - : VLIWPacketizerList(MF, MLI, MDT, true), - TII (static_cast<const R600InstrInfo *>(MF.getTarget().getInstrInfo())), - TRI(TII->getRegisterInfo()) { + R600PacketizerList(MachineFunction &MF, MachineLoopInfo &MLI) + : VLIWPacketizerList(MF, MLI, true), + TII(static_cast<const R600InstrInfo *>( + MF.getSubtarget().getInstrInfo())), + TRI(TII->getRegisterInfo()) { VLIW5 = !MF.getTarget().getSubtarget<AMDGPUSubtarget>().hasCaymanISA(); } @@ -328,12 +328,11 @@ public: }; bool R600Packetizer::runOnMachineFunction(MachineFunction &Fn) { - const TargetInstrInfo *TII = Fn.getTarget().getInstrInfo(); + const TargetInstrInfo *TII = Fn.getSubtarget().getInstrInfo(); MachineLoopInfo &MLI = getAnalysis<MachineLoopInfo>(); - MachineDominatorTree &MDT = getAnalysis<MachineDominatorTree>(); // Instantiate the packetizer. - R600PacketizerList Packetizer(Fn, MLI, MDT); + R600PacketizerList Packetizer(Fn, MLI); // DFA state table should not be empty. 
assert(Packetizer.getResourceTracker() && "Empty DFA table!"); diff --git a/lib/Target/R600/R600RegisterInfo.h b/lib/Target/R600/R600RegisterInfo.h index 247808b6e7b6..f1a8a41b9a5d 100644 --- a/lib/Target/R600/R600RegisterInfo.h +++ b/lib/Target/R600/R600RegisterInfo.h @@ -12,8 +12,8 @@ // //===----------------------------------------------------------------------===// -#ifndef R600REGISTERINFO_H_ -#define R600REGISTERINFO_H_ +#ifndef LLVM_LIB_TARGET_R600_R600REGISTERINFO_H +#define LLVM_LIB_TARGET_R600_R600REGISTERINFO_H #include "AMDGPURegisterInfo.h" @@ -46,4 +46,4 @@ struct R600RegisterInfo : public AMDGPURegisterInfo { } // End namespace llvm -#endif // AMDIDSAREGISTERINFO_H_ +#endif diff --git a/lib/Target/R600/SIDefines.h b/lib/Target/R600/SIDefines.h index b7e7a2d000b3..73a9c73d8e7b 100644 --- a/lib/Target/R600/SIDefines.h +++ b/lib/Target/R600/SIDefines.h @@ -8,21 +8,88 @@ /// \file //===----------------------------------------------------------------------===// -#ifndef SIDEFINES_H_ -#define SIDEFINES_H_ +#include "llvm/MC/MCInstrDesc.h" + +#ifndef LLVM_LIB_TARGET_R600_SIDEFINES_H +#define LLVM_LIB_TARGET_R600_SIDEFINES_H namespace SIInstrFlags { +// This needs to be kept in sync with the field bits in InstSI. 
enum { - MIMG = 1 << 3, - SMRD = 1 << 4, - VOP1 = 1 << 5, - VOP2 = 1 << 6, - VOP3 = 1 << 7, - VOPC = 1 << 8, - SALU = 1 << 9 + SALU = 1 << 3, + VALU = 1 << 4, + + SOP1 = 1 << 5, + SOP2 = 1 << 6, + SOPC = 1 << 7, + SOPK = 1 << 8, + SOPP = 1 << 9, + + VOP1 = 1 << 10, + VOP2 = 1 << 11, + VOP3 = 1 << 12, + VOPC = 1 << 13, + + MUBUF = 1 << 14, + MTBUF = 1 << 15, + SMRD = 1 << 16, + DS = 1 << 17, + MIMG = 1 << 18, + FLAT = 1 << 19 }; } +namespace llvm { +namespace AMDGPU { + enum OperandType { + /// Operand with register or 32-bit immediate + OPERAND_REG_IMM32 = llvm::MCOI::OPERAND_FIRST_TARGET, + /// Operand with register or inline constant + OPERAND_REG_INLINE_C + }; +} +} + +namespace SIInstrFlags { + enum Flags { + // First 4 bits are the instruction encoding + VM_CNT = 1 << 0, + EXP_CNT = 1 << 1, + LGKM_CNT = 1 << 2 + }; + + // v_cmp_class_* etc. use a 10-bit mask for what operation is checked. + // The result is true if any of these tests are true. + enum ClassFlags { + S_NAN = 1 << 0, // Signaling NaN + Q_NAN = 1 << 1, // Quiet NaN + N_INFINITY = 1 << 2, // Negative infinity + N_NORMAL = 1 << 3, // Negative normal + N_SUBNORMAL = 1 << 4, // Negative subnormal + N_ZERO = 1 << 5, // Negative zero + P_ZERO = 1 << 6, // Positive zero + P_SUBNORMAL = 1 << 7, // Positive subnormal + P_NORMAL = 1 << 8, // Positive normal + P_INFINITY = 1 << 9 // Positive infinity + }; +} + +namespace SISrcMods { + enum { + NEG = 1 << 0, + ABS = 1 << 1 + }; +} + +namespace SIOutMods { + enum { + NONE = 0, + MUL2 = 1, + MUL4 = 2, + DIV2 = 3 + }; +} + #define R_00B028_SPI_SHADER_PGM_RSRC1_PS 0x00B028 #define R_00B02C_SPI_SHADER_PGM_RSRC2_PS 0x00B02C #define S_00B02C_EXTRA_LDS_SIZE(x) (((x) & 0xFF) << 8) @@ -32,7 +99,14 @@ enum { #define S_00B028_VGPRS(x) (((x) & 0x3F) << 0) #define S_00B028_SGPRS(x) (((x) & 0x0F) << 6) #define R_00B84C_COMPUTE_PGM_RSRC2 0x00B84C -#define S_00B02C_SCRATCH_EN(x) (((x) & 0x1) << 0) +#define S_00B84C_SCRATCH_EN(x) (((x) & 0x1) << 0) +#define 
S_00B84C_USER_SGPR(x) (((x) & 0x1F) << 1) +#define S_00B84C_TGID_X_EN(x) (((x) & 0x1) << 7) +#define S_00B84C_TGID_Y_EN(x) (((x) & 0x1) << 8) +#define S_00B84C_TGID_Z_EN(x) (((x) & 0x1) << 9) +#define S_00B84C_TG_SIZE_EN(x) (((x) & 0x1) << 10) +#define S_00B84C_TIDIG_COMP_CNT(x) (((x) & 0x03) << 11) + #define S_00B84C_LDS_SIZE(x) (((x) & 0x1FF) << 15) #define R_0286CC_SPI_PS_INPUT_ENA 0x0286CC @@ -89,4 +163,4 @@ enum { #define R_00B860_COMPUTE_TMPRING_SIZE 0x00B860 #define S_00B860_WAVESIZE(x) (((x) & 0x1FFF) << 12) -#endif // SIDEFINES_H_ +#endif diff --git a/lib/Target/R600/SIFixSGPRCopies.cpp b/lib/Target/R600/SIFixSGPRCopies.cpp index 5f714535abeb..cd1b3acc5c87 100644 --- a/lib/Target/R600/SIFixSGPRCopies.cpp +++ b/lib/Target/R600/SIFixSGPRCopies.cpp @@ -66,6 +66,7 @@ //===----------------------------------------------------------------------===// #include "AMDGPU.h" +#include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -135,12 +136,12 @@ const TargetRegisterClass *SIFixSGPRCopies::inferRegClassFromUses( const MachineRegisterInfo &MRI, unsigned Reg, unsigned SubReg) const { - // The Reg parameter to the function must always be defined by either a PHI - // or a COPY, therefore it cannot be a physical register. - assert(TargetRegisterInfo::isVirtualRegister(Reg) && - "Reg cannot be a physical register"); - const TargetRegisterClass *RC = MRI.getRegClass(Reg); + const TargetRegisterClass *RC + = TargetRegisterInfo::isVirtualRegister(Reg) ? 
+ MRI.getRegClass(Reg) : + TRI->getRegClass(Reg); + RC = TRI->getSubRegClass(RC, SubReg); for (MachineRegisterInfo::use_instr_iterator I = MRI.use_instr_begin(Reg), E = MRI.use_instr_end(); I != E; ++I) { @@ -181,7 +182,12 @@ bool SIFixSGPRCopies::isVGPRToSGPRCopy(const MachineInstr &Copy, unsigned DstReg = Copy.getOperand(0).getReg(); unsigned SrcReg = Copy.getOperand(1).getReg(); unsigned SrcSubReg = Copy.getOperand(1).getSubReg(); - const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg); + + const TargetRegisterClass *DstRC + = TargetRegisterInfo::isVirtualRegister(DstReg) ? + MRI.getRegClass(DstReg) : + TRI->getRegClass(DstReg); + const TargetRegisterClass *SrcRC; if (!TargetRegisterInfo::isVirtualRegister(SrcReg) || @@ -195,10 +201,10 @@ bool SIFixSGPRCopies::isVGPRToSGPRCopy(const MachineInstr &Copy, bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { MachineRegisterInfo &MRI = MF.getRegInfo(); - const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>( - MF.getTarget().getRegisterInfo()); - const SIInstrInfo *TII = static_cast<const SIInstrInfo *>( - MF.getTarget().getInstrInfo()); + const SIRegisterInfo *TRI = + static_cast<const SIRegisterInfo *>(MF.getSubtarget().getRegisterInfo()); + const SIInstrInfo *TII = + static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo()); for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE; ++BI) { @@ -216,20 +222,21 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { switch (MI.getOpcode()) { default: continue; case AMDGPU::PHI: { - DEBUG(dbgs() << " Fixing PHI:\n"); - DEBUG(MI.print(dbgs())); + DEBUG(dbgs() << "Fixing PHI: " << MI); - for (unsigned i = 1; i < MI.getNumOperands(); i+=2) { - unsigned Reg = MI.getOperand(i).getReg(); - const TargetRegisterClass *RC = inferRegClassFromDef(TRI, MRI, Reg, - MI.getOperand(0).getSubReg()); - MRI.constrainRegClass(Reg, RC); + for (unsigned i = 1; i < MI.getNumOperands(); i += 2) { + const MachineOperand &Op = 
MI.getOperand(i); + unsigned Reg = Op.getReg(); + const TargetRegisterClass *RC + = inferRegClassFromDef(TRI, MRI, Reg, Op.getSubReg()); + + MRI.constrainRegClass(Op.getReg(), RC); } unsigned Reg = MI.getOperand(0).getReg(); const TargetRegisterClass *RC = inferRegClassFromUses(TRI, MRI, Reg, MI.getOperand(0).getSubReg()); - if (TRI->getCommonSubClass(RC, &AMDGPU::VReg_32RegClass)) { - MRI.constrainRegClass(Reg, &AMDGPU::VReg_32RegClass); + if (TRI->getCommonSubClass(RC, &AMDGPU::VGPR_32RegClass)) { + MRI.constrainRegClass(Reg, &AMDGPU::VGPR_32RegClass); } if (!TRI->isSGPRClass(MRI.getRegClass(Reg))) @@ -237,14 +244,66 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { // If a PHI node defines an SGPR and any of its operands are VGPRs, // then we need to move it to the VALU. + // + // Also, if a PHI node defines an SGPR and has all SGPR operands + // we must move it to the VALU, because the SGPR operands will + // all end up being assigned the same register, which means + // there is a potential for a conflict if different threads take + // different control flow paths. + // + // For Example: + // + // sgpr0 = def; + // ... + // sgpr1 = def; + // ... + // sgpr2 = PHI sgpr0, sgpr1 + // use sgpr2; + // + // Will Become: + // + // sgpr2 = def; + // ... + // sgpr2 = def; + // ... + // use sgpr2 + // + // FIXME: This is OK if the branching decision is made based on an + // SGPR value. + bool SGPRBranch = false; + + // The one exception to this rule is when one of the operands + // is defined by a SI_BREAK, SI_IF_BREAK, or SI_ELSE_BREAK + // instruction. In this case, there we know the program will + // never enter the second block (the loop) without entering + // the first block (where the condition is computed), so there + // is no chance for values to be over-written. 
+ + bool HasBreakDef = false; for (unsigned i = 1; i < MI.getNumOperands(); i+=2) { unsigned Reg = MI.getOperand(i).getReg(); if (TRI->hasVGPRs(MRI.getRegClass(Reg))) { TII->moveToVALU(MI); break; } + MachineInstr *DefInstr = MRI.getUniqueVRegDef(Reg); + assert(DefInstr); + switch(DefInstr->getOpcode()) { + + case AMDGPU::SI_BREAK: + case AMDGPU::SI_IF_BREAK: + case AMDGPU::SI_ELSE_BREAK: + // If we see a PHI instruction that defines an SGPR, then that PHI + // instruction has already been considered and should have + // a *_BREAK as an operand. + case AMDGPU::PHI: + HasBreakDef = true; + break; + } } + if (!SGPRBranch && !HasBreakDef) + TII->moveToVALU(MI); break; } case AMDGPU::REG_SEQUENCE: { @@ -252,8 +311,7 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { !hasVGPROperands(MI, TRI)) continue; - DEBUG(dbgs() << "Fixing REG_SEQUENCE:\n"); - DEBUG(MI.print(dbgs())); + DEBUG(dbgs() << "Fixing REG_SEQUENCE: " << MI); TII->moveToVALU(MI); break; @@ -265,8 +323,7 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { Src1RC = MRI.getRegClass(MI.getOperand(2).getReg()); if (TRI->isSGPRClass(DstRC) && (TRI->hasVGPRs(Src0RC) || TRI->hasVGPRs(Src1RC))) { - DEBUG(dbgs() << " Fixing INSERT_SUBREG:\n"); - DEBUG(MI.print(dbgs())); + DEBUG(dbgs() << " Fixing INSERT_SUBREG: " << MI); TII->moveToVALU(MI); } break; @@ -274,5 +331,6 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { } } } - return false; + + return true; } diff --git a/lib/Target/R600/SIFixSGPRLiveRanges.cpp b/lib/Target/R600/SIFixSGPRLiveRanges.cpp index 7d116eef396c..f34c37580432 100644 --- a/lib/Target/R600/SIFixSGPRLiveRanges.cpp +++ b/lib/Target/R600/SIFixSGPRLiveRanges.cpp @@ -9,18 +9,49 @@ // /// \file /// SALU instructions ignore control flow, so we need to modify the live ranges -/// of the registers they define. +/// of the registers they define in some cases. 
/// -/// The strategy is to view the entire program as if it were a single basic -/// block and calculate the intervals accordingly. We implement this -/// by walking this list of segments for each LiveRange and setting the -/// end of each segment equal to the start of the segment that immediately -/// follows it. +/// The main case we need to handle is when a def is used in one side of a +/// branch and not another. For example: +/// +/// %def +/// IF +/// ... +/// ... +/// ELSE +/// %use +/// ... +/// ENDIF +/// +/// Here we need the register allocator to avoid assigning any of the defs +/// inside of the IF to the same register as %def. In traditional live +/// interval analysis %def is not live inside the IF branch, however, since +/// SALU instructions inside of IF will be executed even if the branch is not +/// taken, there is the chance that one of the instructions will overwrite the +/// value of %def, so the use in ELSE will see the wrong value. +/// +/// The strategy we use for solving this is to add an extra use after the ENDIF: +/// +/// %def +/// IF +/// ... +/// ... +/// ELSE +/// %use +/// ... +/// ENDIF +/// %use +/// +/// Adding this use will make the def live thoughout the IF branch, which is +/// what we want. 
#include "AMDGPU.h" +#include "SIInstrInfo.h" #include "SIRegisterInfo.h" #include "llvm/CodeGen/LiveIntervalAnalysis.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachinePostDominators.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/Support/Debug.h" #include "llvm/Target/TargetMachine.h" @@ -40,16 +71,15 @@ public: initializeSIFixSGPRLiveRangesPass(*PassRegistry::getPassRegistry()); } - virtual bool runOnMachineFunction(MachineFunction &MF) override; + bool runOnMachineFunction(MachineFunction &MF) override; - virtual const char *getPassName() const override { + const char *getPassName() const override { return "SI Fix SGPR live ranges"; } - virtual void getAnalysisUsage(AnalysisUsage &AU) const override { + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<LiveIntervals>(); - AU.addPreserved<LiveIntervals>(); - AU.addPreserved<SlotIndexes>(); + AU.addRequired<MachinePostDominatorTree>(); AU.setPreservesCFG(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -60,6 +90,7 @@ public: INITIALIZE_PASS_BEGIN(SIFixSGPRLiveRanges, DEBUG_TYPE, "SI Fix SGPR Live Ranges", false, false) INITIALIZE_PASS_DEPENDENCY(LiveIntervals) +INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) INITIALIZE_PASS_END(SIFixSGPRLiveRanges, DEBUG_TYPE, "SI Fix SGPR Live Ranges", false, false) @@ -73,36 +104,86 @@ FunctionPass *llvm::createSIFixSGPRLiveRangesPass() { bool SIFixSGPRLiveRanges::runOnMachineFunction(MachineFunction &MF) { MachineRegisterInfo &MRI = MF.getRegInfo(); + const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>( - MF.getTarget().getRegisterInfo()); + MF.getSubtarget().getRegisterInfo()); LiveIntervals *LIS = &getAnalysis<LiveIntervals>(); + MachinePostDominatorTree *PDT = &getAnalysis<MachinePostDominatorTree>(); + std::vector<std::pair<unsigned, LiveRange *>> SGPRLiveRanges; + + // First 
pass, collect all live intervals for SGPRs + for (const MachineBasicBlock &MBB : MF) { + for (const MachineInstr &MI : MBB) { + for (const MachineOperand &MO : MI.defs()) { + if (MO.isImplicit()) + continue; + unsigned Def = MO.getReg(); + if (TargetRegisterInfo::isVirtualRegister(Def)) { + if (TRI->isSGPRClass(MRI.getRegClass(Def))) + SGPRLiveRanges.push_back( + std::make_pair(Def, &LIS->getInterval(Def))); + } else if (TRI->isSGPRClass(TRI->getPhysRegClass(Def))) { + SGPRLiveRanges.push_back( + std::make_pair(Def, &LIS->getRegUnit(Def))); + } + } + } + } + // Second pass fix the intervals for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE; ++BI) { - MachineBasicBlock &MBB = *BI; - for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); - I != E; ++I) { - MachineInstr &MI = *I; - MachineOperand *ExecUse = MI.findRegisterUseOperand(AMDGPU::EXEC); - if (ExecUse) + if (MBB.succ_size() < 2) + continue; + + // We have structured control flow, so number of succesors should be two. + assert(MBB.succ_size() == 2); + MachineBasicBlock *SuccA = *MBB.succ_begin(); + MachineBasicBlock *SuccB = *(++MBB.succ_begin()); + MachineBasicBlock *NCD = PDT->findNearestCommonDominator(SuccA, SuccB); + + if (!NCD) + continue; + + MachineBasicBlock::iterator NCDTerm = NCD->getFirstTerminator(); + + if (NCDTerm != NCD->end() && NCDTerm->getOpcode() == AMDGPU::SI_ELSE) { + assert(NCD->succ_size() == 2); + // We want to make sure we insert the Use after the ENDIF, not after + // the ELSE. + NCD = PDT->findNearestCommonDominator(*NCD->succ_begin(), + *(++NCD->succ_begin())); + } + assert(SuccA && SuccB); + for (std::pair<unsigned, LiveRange*> RegLR : SGPRLiveRanges) { + unsigned Reg = RegLR.first; + LiveRange *LR = RegLR.second; + + // FIXME: We could be smarter here. If the register is Live-In to + // one block, but the other doesn't have any SGPR defs, then there + // won't be a conflict. 
Also, if the branch decision is based on + // a value in an SGPR, then there will be no conflict. + bool LiveInToA = LIS->isLiveInToMBB(*LR, SuccA); + bool LiveInToB = LIS->isLiveInToMBB(*LR, SuccB); + + if ((!LiveInToA && !LiveInToB) || + (LiveInToA && LiveInToB)) continue; - for (const MachineOperand &Def : MI.operands()) { - if (!Def.isReg() || !Def.isDef() ||!TargetRegisterInfo::isVirtualRegister(Def.getReg())) - continue; - - const TargetRegisterClass *RC = MRI.getRegClass(Def.getReg()); - - if (!TRI->isSGPRClass(RC)) - continue; - LiveInterval &LI = LIS->getInterval(Def.getReg()); - for (unsigned i = 0, e = LI.size() - 1; i != e; ++i) { - LiveRange::Segment &Seg = LI.segments[i]; - LiveRange::Segment &Next = LI.segments[i + 1]; - Seg.end = Next.start; - } - } + // This interval is live in to one successor, but not the other, so + // we need to update its range so it is live in to both. + DEBUG(dbgs() << "Possible SGPR conflict detected " << " in " << *LR << + " BB#" << SuccA->getNumber() << ", BB#" << + SuccB->getNumber() << + " with NCD = " << NCD->getNumber() << '\n'); + + // FIXME: Need to figure out how to update LiveRange here so this pass + // will be able to preserve LiveInterval analysis. + BuildMI(*NCD, NCD->getFirstNonPHI(), DebugLoc(), + TII->get(AMDGPU::SGPR_USE)) + .addReg(Reg, RegState::Implicit); + DEBUG(NCD->getFirstNonPHI()->dump()); } } diff --git a/lib/Target/R600/SIFoldOperands.cpp b/lib/Target/R600/SIFoldOperands.cpp new file mode 100644 index 000000000000..d8ffa4f75505 --- /dev/null +++ b/lib/Target/R600/SIFoldOperands.cpp @@ -0,0 +1,275 @@ +//===-- SIFoldOperands.cpp - Fold operands --- ----------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +/// \file +//===----------------------------------------------------------------------===// +// + +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "SIInstrInfo.h" +#include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Function.h" +#include "llvm/Support/Debug.h" +#include "llvm/Target/TargetMachine.h" + +#define DEBUG_TYPE "si-fold-operands" +using namespace llvm; + +namespace { + +class SIFoldOperands : public MachineFunctionPass { +public: + static char ID; + +public: + SIFoldOperands() : MachineFunctionPass(ID) { + initializeSIFoldOperandsPass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + const char *getPassName() const override { + return "SI Fold Operands"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<MachineDominatorTree>(); + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; + +struct FoldCandidate { + MachineInstr *UseMI; + unsigned UseOpNo; + MachineOperand *OpToFold; + uint64_t ImmToFold; + + FoldCandidate(MachineInstr *MI, unsigned OpNo, MachineOperand *FoldOp) : + UseMI(MI), UseOpNo(OpNo) { + + if (FoldOp->isImm()) { + OpToFold = nullptr; + ImmToFold = FoldOp->getImm(); + } else { + assert(FoldOp->isReg()); + OpToFold = FoldOp; + } + } + + bool isImm() const { + return !OpToFold; + } +}; + +} // End anonymous namespace. 
+ +INITIALIZE_PASS_BEGIN(SIFoldOperands, DEBUG_TYPE, + "SI Fold Operands", false, false) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_END(SIFoldOperands, DEBUG_TYPE, + "SI Fold Operands", false, false) + +char SIFoldOperands::ID = 0; + +char &llvm::SIFoldOperandsID = SIFoldOperands::ID; + +FunctionPass *llvm::createSIFoldOperandsPass() { + return new SIFoldOperands(); +} + +static bool isSafeToFold(unsigned Opcode) { + switch(Opcode) { + case AMDGPU::V_MOV_B32_e32: + case AMDGPU::V_MOV_B32_e64: + case AMDGPU::V_MOV_B64_PSEUDO: + case AMDGPU::S_MOV_B32: + case AMDGPU::S_MOV_B64: + case AMDGPU::COPY: + return true; + default: + return false; + } +} + +static bool updateOperand(FoldCandidate &Fold, + const TargetRegisterInfo &TRI) { + MachineInstr *MI = Fold.UseMI; + MachineOperand &Old = MI->getOperand(Fold.UseOpNo); + assert(Old.isReg()); + + if (Fold.isImm()) { + Old.ChangeToImmediate(Fold.ImmToFold); + return true; + } + + MachineOperand *New = Fold.OpToFold; + if (TargetRegisterInfo::isVirtualRegister(Old.getReg()) && + TargetRegisterInfo::isVirtualRegister(New->getReg())) { + Old.substVirtReg(New->getReg(), New->getSubReg(), TRI); + return true; + } + + // FIXME: Handle physical registers. + + return false; +} + +static bool tryAddToFoldList(std::vector<FoldCandidate> &FoldList, + MachineInstr *MI, unsigned OpNo, + MachineOperand *OpToFold, + const SIInstrInfo *TII) { + if (!TII->isOperandLegal(MI, OpNo, OpToFold)) { + // Operand is not legal, so try to commute the instruction to + // see if this makes it possible to fold. 
+ unsigned CommuteIdx0; + unsigned CommuteIdx1; + bool CanCommute = TII->findCommutedOpIndices(MI, CommuteIdx0, CommuteIdx1); + + if (CanCommute) { + if (CommuteIdx0 == OpNo) + OpNo = CommuteIdx1; + else if (CommuteIdx1 == OpNo) + OpNo = CommuteIdx0; + } + + if (!CanCommute || !TII->commuteInstruction(MI)) + return false; + + if (!TII->isOperandLegal(MI, OpNo, OpToFold)) + return false; + } + + FoldList.push_back(FoldCandidate(MI, OpNo, OpToFold)); + return true; +} + +bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { + MachineRegisterInfo &MRI = MF.getRegInfo(); + const SIInstrInfo *TII = + static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo()); + const SIRegisterInfo &TRI = TII->getRegisterInfo(); + + for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); + BI != BE; ++BI) { + + MachineBasicBlock &MBB = *BI; + MachineBasicBlock::iterator I, Next; + for (I = MBB.begin(); I != MBB.end(); I = Next) { + Next = std::next(I); + MachineInstr &MI = *I; + + if (!isSafeToFold(MI.getOpcode())) + continue; + + MachineOperand &OpToFold = MI.getOperand(1); + bool FoldingImm = OpToFold.isImm(); + + // FIXME: We could also be folding things like FrameIndexes and + // TargetIndexes. + if (!FoldingImm && !OpToFold.isReg()) + continue; + + // Folding immediates with more than one use will increase program size. + // FIXME: This will also reduce register usage, which may be better + // in some cases. A better heuristic is needed. + if (FoldingImm && !TII->isInlineConstant(OpToFold) && + !MRI.hasOneUse(MI.getOperand(0).getReg())) + continue; + + // FIXME: Fold operands with subregs. 
+ if (OpToFold.isReg() && + (!TargetRegisterInfo::isVirtualRegister(OpToFold.getReg()) || + OpToFold.getSubReg())) + continue; + + std::vector<FoldCandidate> FoldList; + for (MachineRegisterInfo::use_iterator + Use = MRI.use_begin(MI.getOperand(0).getReg()), E = MRI.use_end(); + Use != E; ++Use) { + + MachineInstr *UseMI = Use->getParent(); + const MachineOperand &UseOp = UseMI->getOperand(Use.getOperandNo()); + + // FIXME: Fold operands with subregs. + if (UseOp.isReg() && UseOp.getSubReg() && OpToFold.isReg()) { + continue; + } + + APInt Imm; + + if (FoldingImm) { + const TargetRegisterClass *UseRC = MRI.getRegClass(UseOp.getReg()); + Imm = APInt(64, OpToFold.getImm()); + + // Split 64-bit constants into 32-bits for folding. + if (UseOp.getSubReg()) { + if (UseRC->getSize() != 8) + continue; + + if (UseOp.getSubReg() == AMDGPU::sub0) { + Imm = Imm.getLoBits(32); + } else { + assert(UseOp.getSubReg() == AMDGPU::sub1); + Imm = Imm.getHiBits(32); + } + } + + // In order to fold immediates into copies, we need to change the + // copy to a MOV. + if (UseMI->getOpcode() == AMDGPU::COPY) { + unsigned MovOp = TII->getMovOpcode( + MRI.getRegClass(UseMI->getOperand(0).getReg())); + if (MovOp == AMDGPU::COPY) + continue; + + UseMI->setDesc(TII->get(MovOp)); + } + } + + const MCInstrDesc &UseDesc = UseMI->getDesc(); + + // Don't fold into target independent nodes. Target independent opcodes + // don't have defined register classes. + if (UseDesc.isVariadic() || + UseDesc.OpInfo[Use.getOperandNo()].RegClass == -1) + continue; + + if (FoldingImm) { + MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue()); + tryAddToFoldList(FoldList, UseMI, Use.getOperandNo(), &ImmOp, TII); + continue; + } + + tryAddToFoldList(FoldList, UseMI, Use.getOperandNo(), &OpToFold, TII); + + // FIXME: We could try to change the instruction from 64-bit to 32-bit + // to enable more folding opportunities. The shrink operands pass + // already does this. 
+ } + + for (FoldCandidate &Fold : FoldList) { + if (updateOperand(Fold, TRI)) { + // Clear kill flags. + if (!Fold.isImm()) { + assert(Fold.OpToFold && Fold.OpToFold->isReg()); + Fold.OpToFold->setIsKill(false); + } + DEBUG(dbgs() << "Folded source from " << MI << " into OpNo " << + Fold.UseOpNo << " of " << *Fold.UseMI << '\n'); + } + } + } + } + return false; +} diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp index 5a148a24810a..0a3fa2f930d7 100644 --- a/lib/Target/R600/SIISelLowering.cpp +++ b/lib/Target/R600/SIISelLowering.cpp @@ -25,6 +25,7 @@ #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" #include "SIRegisterInfo.h" +#include "llvm/ADT/BitVector.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" @@ -43,7 +44,7 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) : addRegisterClass(MVT::v64i8, &AMDGPU::SReg_512RegClass); addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass); - addRegisterClass(MVT::f32, &AMDGPU::VReg_32RegClass); + addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass); addRegisterClass(MVT::f64, &AMDGPU::VReg_64RegClass); addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass); @@ -52,29 +53,14 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) : addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass); addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass); - addRegisterClass(MVT::v8i32, &AMDGPU::VReg_256RegClass); + addRegisterClass(MVT::v8i32, &AMDGPU::SReg_256RegClass); addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass); - addRegisterClass(MVT::v16i32, &AMDGPU::VReg_512RegClass); + addRegisterClass(MVT::v16i32, &AMDGPU::SReg_512RegClass); addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass); computeRegisterProperties(); - // Condition Codes - setCondCodeAction(ISD::SETONE, MVT::f32, Expand); - setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand); - setCondCodeAction(ISD::SETUGE, MVT::f32, 
Expand); - setCondCodeAction(ISD::SETUGT, MVT::f32, Expand); - setCondCodeAction(ISD::SETULE, MVT::f32, Expand); - setCondCodeAction(ISD::SETULT, MVT::f32, Expand); - - setCondCodeAction(ISD::SETONE, MVT::f64, Expand); - setCondCodeAction(ISD::SETUEQ, MVT::f64, Expand); - setCondCodeAction(ISD::SETUGE, MVT::f64, Expand); - setCondCodeAction(ISD::SETUGT, MVT::f64, Expand); - setCondCodeAction(ISD::SETULE, MVT::f64, Expand); - setCondCodeAction(ISD::SETULT, MVT::f64, Expand); - setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand); @@ -89,6 +75,11 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) : setOperationAction(ISD::FSIN, MVT::f32, Custom); setOperationAction(ISD::FCOS, MVT::f32, Custom); + setOperationAction(ISD::FMINNUM, MVT::f32, Legal); + setOperationAction(ISD::FMAXNUM, MVT::f32, Legal); + setOperationAction(ISD::FMINNUM, MVT::f64, Legal); + setOperationAction(ISD::FMAXNUM, MVT::f64, Legal); + // We need to custom lower vector stores from local memory setOperationAction(ISD::LOAD, MVT::v4i32, Custom); setOperationAction(ISD::LOAD, MVT::v8i32, Custom); @@ -102,8 +93,6 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) : setOperationAction(ISD::STORE, MVT::v2i32, Custom); setOperationAction(ISD::STORE, MVT::v4i32, Custom); - setOperationAction(ISD::SELECT, MVT::f32, Promote); - AddPromotedToType(ISD::SELECT, MVT::f32, MVT::i32); setOperationAction(ISD::SELECT, MVT::i64, Custom); setOperationAction(ISD::SELECT, MVT::f64, Promote); AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64); @@ -116,6 +105,8 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) : setOperationAction(ISD::SETCC, MVT::v2i1, Expand); setOperationAction(ISD::SETCC, MVT::v4i1, Expand); + setOperationAction(ISD::BSWAP, MVT::i32, Legal); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, 
Custom); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom); @@ -128,8 +119,7 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) : setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Custom); - + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); @@ -140,23 +130,33 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) : setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); setOperationAction(ISD::BRCOND, MVT::Other, Custom); - setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote); - setLoadExtAction(ISD::SEXTLOAD, MVT::i8, Custom); - setLoadExtAction(ISD::SEXTLOAD, MVT::i16, Custom); - setLoadExtAction(ISD::SEXTLOAD, MVT::i32, Expand); - setLoadExtAction(ISD::SEXTLOAD, MVT::v8i16, Expand); - setLoadExtAction(ISD::SEXTLOAD, MVT::v16i16, Expand); + for (MVT VT : MVT::integer_valuetypes()) { + if (VT == MVT::i64) + continue; + + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Legal); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Legal); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i32, Expand); - setLoadExtAction(ISD::ZEXTLOAD, MVT::i1, Promote); - setLoadExtAction(ISD::ZEXTLOAD, MVT::i8, Custom); - setLoadExtAction(ISD::ZEXTLOAD, MVT::i16, Custom); - setLoadExtAction(ISD::ZEXTLOAD, MVT::i32, Expand); + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote); + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Legal); + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Legal); + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i32, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::i1, Promote); - setLoadExtAction(ISD::EXTLOAD, MVT::i8, Custom); - setLoadExtAction(ISD::EXTLOAD, MVT::i16, Custom); - setLoadExtAction(ISD::EXTLOAD, MVT::i32, Expand); - 
setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Legal); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Legal); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::i32, Expand); + } + + for (MVT VT : MVT::integer_vector_valuetypes()) { + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i16, Expand); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v16i16, Expand); + } + + for (MVT VT : MVT::fp_valuetypes()) + setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand); setTruncStoreAction(MVT::i32, MVT::i8, Custom); setTruncStoreAction(MVT::i32, MVT::i16, Custom); @@ -167,9 +167,6 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) : setOperationAction(ISD::LOAD, MVT::i1, Custom); - setOperationAction(ISD::FP_TO_SINT, MVT::i64, Expand); - setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand); - setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); setOperationAction(ISD::FrameIndex, MVT::i32, Custom); @@ -196,10 +193,12 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) : case ISD::BITCAST: case ISD::EXTRACT_VECTOR_ELT: case ISD::INSERT_VECTOR_ELT: - case ISD::CONCAT_VECTORS: case ISD::INSERT_SUBVECTOR: case ISD::EXTRACT_SUBVECTOR: break; + case ISD::CONCAT_VECTORS: + setOperationAction(Op, VT, Custom); + break; default: setOperationAction(Op, VT, Expand); break; @@ -207,13 +206,6 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) : } } - for (int I = MVT::v1f64; I <= MVT::v8f64; ++I) { - MVT::SimpleValueType VT = static_cast<MVT::SimpleValueType>(I); - setOperationAction(ISD::FTRUNC, VT, Expand); - setOperationAction(ISD::FCEIL, VT, Expand); - setOperationAction(ISD::FFLOOR, VT, Expand); - } - if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { setOperationAction(ISD::FTRUNC, MVT::f64, Legal); setOperationAction(ISD::FCEIL, MVT::f64, Legal); @@ -221,18 +213,38 @@ 
SITargetLowering::SITargetLowering(TargetMachine &TM) : setOperationAction(ISD::FRINT, MVT::f64, Legal); } - // FIXME: These should be removed and handled the same was as f32 fneg. Source - // modifiers also work for the double instructions. - setOperationAction(ISD::FNEG, MVT::f64, Expand); - setOperationAction(ISD::FABS, MVT::f64, Expand); - setOperationAction(ISD::FDIV, MVT::f32, Custom); + setTargetDAGCombine(ISD::FADD); + setTargetDAGCombine(ISD::FSUB); + setTargetDAGCombine(ISD::FMINNUM); + setTargetDAGCombine(ISD::FMAXNUM); setTargetDAGCombine(ISD::SELECT_CC); setTargetDAGCombine(ISD::SETCC); - + setTargetDAGCombine(ISD::AND); + setTargetDAGCombine(ISD::OR); setTargetDAGCombine(ISD::UINT_TO_FP); + // All memory operations. Some folding on the pointer operand is done to help + // matching the constant offsets in the addressing modes. + setTargetDAGCombine(ISD::LOAD); + setTargetDAGCombine(ISD::STORE); + setTargetDAGCombine(ISD::ATOMIC_LOAD); + setTargetDAGCombine(ISD::ATOMIC_STORE); + setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP); + setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS); + setTargetDAGCombine(ISD::ATOMIC_SWAP); + setTargetDAGCombine(ISD::ATOMIC_LOAD_ADD); + setTargetDAGCombine(ISD::ATOMIC_LOAD_SUB); + setTargetDAGCombine(ISD::ATOMIC_LOAD_AND); + setTargetDAGCombine(ISD::ATOMIC_LOAD_OR); + setTargetDAGCombine(ISD::ATOMIC_LOAD_XOR); + setTargetDAGCombine(ISD::ATOMIC_LOAD_NAND); + setTargetDAGCombine(ISD::ATOMIC_LOAD_MIN); + setTargetDAGCombine(ISD::ATOMIC_LOAD_MAX); + setTargetDAGCombine(ISD::ATOMIC_LOAD_UMIN); + setTargetDAGCombine(ISD::ATOMIC_LOAD_UMAX); + setSchedulingPreference(Sched::RegPressure); } @@ -240,15 +252,63 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) : // TargetLowering queries //===----------------------------------------------------------------------===// -bool SITargetLowering::allowsUnalignedMemoryAccesses(EVT VT, - unsigned AddrSpace, - bool *IsFast) const { +bool SITargetLowering::isShuffleMaskLegal(const 
SmallVectorImpl<int> &, + EVT) const { + // SI has some legal vector types, but no legal vector operations. Say no + // shuffles are legal in order to prefer scalarizing some vector operations. + return false; +} + +// FIXME: This really needs an address space argument. The immediate offset +// size is different for different sets of memory instruction sets. + +// The single offset DS instructions have a 16-bit unsigned byte offset. +// +// MUBUF / MTBUF have a 12-bit unsigned byte offset, and additionally can do r + +// r + i with addr64. 32-bit has more addressing mode options. Depending on the +// resource constant, it can also do (i64 r0) + (i32 r1) * (i14 i). +// +// SMRD instructions have an 8-bit, dword offset. +// +bool SITargetLowering::isLegalAddressingMode(const AddrMode &AM, + Type *Ty) const { + // No global is ever allowed as a base. + if (AM.BaseGV) + return false; + + // Allow a 16-bit unsigned immediate field, since this is what DS instructions + // use. + if (!isUInt<16>(AM.BaseOffs)) + return false; + + // Only support r+r, + switch (AM.Scale) { + case 0: // "r+i" or just "i", depending on HasBaseReg. + break; + case 1: + if (AM.HasBaseReg && AM.BaseOffs) // "r+r+i" is not allowed. + return false; + // Otherwise we have r+r or r+i. + break; + case 2: + if (AM.HasBaseReg || AM.BaseOffs) // 2*r+r or 2*r+i is not allowed. + return false; + // Allow 2*r as r+r. + break; + default: // Don't allow n * r + return false; + } + + return true; +} + +bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT, + unsigned AddrSpace, + unsigned Align, + bool *IsFast) const { if (IsFast) *IsFast = false; - // XXX: This depends on the address space and also we may want to revist - // the alignment values we specify in the DataLayout. - // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96, // which isn't a simple VT. 
if (!VT.isSimple() || VT == MVT::Other) @@ -261,8 +321,12 @@ bool SITargetLowering::allowsUnalignedMemoryAccesses(EVT VT, // XXX - The only mention I see of this in the ISA manual is for LDS direct // reads the "byte address and must be dword aligned". Is it also true for the // normal loads and stores? - if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS) - return false; + if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS) { + // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte + // aligned, 8 byte access in a single operation using ds_read2/write2_b32 + // with adjacent offsets. + return Align % 4 == 0; + } // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the // byte-address are ignored, thus forcing Dword alignment. @@ -272,6 +336,26 @@ bool SITargetLowering::allowsUnalignedMemoryAccesses(EVT VT, return VT.bitsGT(MVT::i32); } +EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign, + unsigned SrcAlign, bool IsMemset, + bool ZeroMemset, + bool MemcpyStrSrc, + MachineFunction &MF) const { + // FIXME: Should account for address space here. + + // The default fallback uses the private pointer size as a guess for a type to + // use. Make sure we switch these to 64-bit accesses. + + if (Size >= 16 && DstAlign >= 4) // XXX: Should only do for global + return MVT::v4i32; + + if (Size >= 8 && DstAlign >= 4) + return MVT::v2i32; + + // Use the default. 
+ return MVT::Other; +} + TargetLoweringBase::LegalizeTypeAction SITargetLowering::getPreferredVectorAction(EVT VT) const { if (VT.getVectorNumElements() != 1 && VT.getScalarType().bitsLE(MVT::i16)) @@ -282,25 +366,37 @@ SITargetLowering::getPreferredVectorAction(EVT VT) const { bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const { - const SIInstrInfo *TII = - static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo()); + const SIInstrInfo *TII = static_cast<const SIInstrInfo *>( + getTargetMachine().getSubtargetImpl()->getInstrInfo()); return TII->isInlineConstant(Imm); } SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, - SDLoc DL, SDValue Chain, + SDLoc SL, SDValue Chain, unsigned Offset, bool Signed) const { + const DataLayout *DL = getDataLayout(); + MachineFunction &MF = DAG.getMachineFunction(); + const SIRegisterInfo *TRI = + static_cast<const SIRegisterInfo*>(Subtarget->getRegisterInfo()); + unsigned InputPtrReg = TRI->getPreloadedValue(MF, SIRegisterInfo::INPUT_PTR); + + Type *Ty = VT.getTypeForEVT(*DAG.getContext()); + MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); - PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()), - AMDGPUAS::CONSTANT_ADDRESS); - SDValue BasePtr = DAG.getCopyFromReg(Chain, DL, - MRI.getLiveInVirtReg(AMDGPU::SGPR0_SGPR1), MVT::i64); - SDValue Ptr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, + PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS); + SDValue BasePtr = DAG.getCopyFromReg(Chain, SL, + MRI.getLiveInVirtReg(InputPtrReg), MVT::i64); + SDValue Ptr = DAG.getNode(ISD::ADD, SL, MVT::i64, BasePtr, DAG.getConstant(Offset, MVT::i64)); - return DAG.getExtLoad(Signed ? 
ISD::SEXTLOAD : ISD::ZEXTLOAD, DL, VT, Chain, Ptr, - MachinePointerInfo(UndefValue::get(PtrTy)), MemVT, - false, false, MemVT.getSizeInBits() >> 3); - + SDValue PtrOffset = DAG.getUNDEF(getPointerTy(AMDGPUAS::CONSTANT_ADDRESS)); + MachinePointerInfo PtrInfo(UndefValue::get(PtrTy)); + + return DAG.getLoad(ISD::UNINDEXED, Signed ? ISD::SEXTLOAD : ISD::ZEXTLOAD, + VT, SL, Chain, Ptr, PtrOffset, PtrInfo, MemVT, + false, // isVolatile + true, // isNonTemporal + true, // isInvariant + DL->getABITypeAlignment(Ty)); // Alignment } SDValue SITargetLowering::LowerFormalArguments( @@ -311,7 +407,9 @@ SDValue SITargetLowering::LowerFormalArguments( SDLoc DL, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { - const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo(); + const TargetMachine &TM = getTargetMachine(); + const SIRegisterInfo *TRI = + static_cast<const SIRegisterInfo*>(TM.getSubtargetImpl()->getRegisterInfo()); MachineFunction &MF = DAG.getMachineFunction(); FunctionType *FType = MF.getFunction()->getFunctionType(); @@ -320,7 +418,7 @@ SDValue SITargetLowering::LowerFormalArguments( assert(CallConv == CallingConv::C); SmallVector<ISD::InputArg, 16> Splits; - uint32_t Skipped = 0; + BitVector Skipped(Ins.size()); for (unsigned i = 0, e = Ins.size(), PSInputNum = 0; i != e; ++i) { const ISD::InputArg &Arg = Ins[i]; @@ -333,7 +431,7 @@ SDValue SITargetLowering::LowerFormalArguments( if (!Arg.Used) { // We can savely skip PS inputs - Skipped |= 1 << i; + Skipped.set(i); ++PSInputNum; continue; } @@ -364,8 +462,8 @@ SDValue SITargetLowering::LowerFormalArguments( } SmallVector<CCValAssign, 16> ArgLocs; - CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), ArgLocs, *DAG.getContext()); + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, + *DAG.getContext()); // At least one interpolation mode must be enabled or else the GPU will hang. 
if (Info->getShaderType() == ShaderType::PIXEL && @@ -378,13 +476,31 @@ SDValue SITargetLowering::LowerFormalArguments( // The pointer to the list of arguments is stored in SGPR0, SGPR1 // The pointer to the scratch buffer is stored in SGPR2, SGPR3 if (Info->getShaderType() == ShaderType::COMPUTE) { - Info->NumUserSGPRs = 4; - CCInfo.AllocateReg(AMDGPU::SGPR0); - CCInfo.AllocateReg(AMDGPU::SGPR1); - CCInfo.AllocateReg(AMDGPU::SGPR2); - CCInfo.AllocateReg(AMDGPU::SGPR3); - MF.addLiveIn(AMDGPU::SGPR0_SGPR1, &AMDGPU::SReg_64RegClass); - MF.addLiveIn(AMDGPU::SGPR2_SGPR3, &AMDGPU::SReg_64RegClass); + if (Subtarget->isAmdHsaOS()) + Info->NumUserSGPRs = 2; // FIXME: Need to support scratch buffers. + else + Info->NumUserSGPRs = 4; + + unsigned InputPtrReg = + TRI->getPreloadedValue(MF, SIRegisterInfo::INPUT_PTR); + unsigned InputPtrRegLo = + TRI->getPhysRegSubReg(InputPtrReg, &AMDGPU::SReg_32RegClass, 0); + unsigned InputPtrRegHi = + TRI->getPhysRegSubReg(InputPtrReg, &AMDGPU::SReg_32RegClass, 1); + + unsigned ScratchPtrReg = + TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_PTR); + unsigned ScratchPtrRegLo = + TRI->getPhysRegSubReg(ScratchPtrReg, &AMDGPU::SReg_32RegClass, 0); + unsigned ScratchPtrRegHi = + TRI->getPhysRegSubReg(ScratchPtrReg, &AMDGPU::SReg_32RegClass, 1); + + CCInfo.AllocateReg(InputPtrRegLo); + CCInfo.AllocateReg(InputPtrRegHi); + CCInfo.AllocateReg(ScratchPtrRegLo); + CCInfo.AllocateReg(ScratchPtrRegHi); + MF.addLiveIn(InputPtrReg, &AMDGPU::SReg_64RegClass); + MF.addLiveIn(ScratchPtrReg, &AMDGPU::SReg_64RegClass); } if (Info->getShaderType() == ShaderType::COMPUTE) { @@ -397,23 +513,36 @@ SDValue SITargetLowering::LowerFormalArguments( for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) { const ISD::InputArg &Arg = Ins[i]; - if (Skipped & (1 << i)) { + if (Skipped[i]) { InVals.push_back(DAG.getUNDEF(Arg.VT)); continue; } CCValAssign &VA = ArgLocs[ArgIdx++]; - EVT VT = VA.getLocVT(); + MVT VT = VA.getLocVT(); if (VA.isMemLoc()) { VT = 
Ins[i].VT; EVT MemVT = Splits[i].VT; + const unsigned Offset = 36 + VA.getLocMemOffset(); // The first 36 bytes of the input buffer contains information about // thread group and global sizes. SDValue Arg = LowerParameter(DAG, VT, MemVT, DL, DAG.getRoot(), - 36 + VA.getLocMemOffset(), - Ins[i].Flags.isSExt()); + Offset, Ins[i].Flags.isSExt()); + + const PointerType *ParamTy = + dyn_cast<PointerType>(FType->getParamType(Ins[i].OrigArgIndex)); + if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS && + ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { + // On SI local pointers are just offsets into LDS, so they are always + // less than 16-bits. On CI and newer they could potentially be + // real pointers, so we can't guarantee their size. + Arg = DAG.getNode(ISD::AssertZext, DL, Arg.getValueType(), Arg, + DAG.getValueType(MVT::i16)); + } + InVals.push_back(Arg); + Info->ABIArgOffset = Offset + MemVT.getStoreSize(); continue; } assert(VA.isRegLoc() && "Parameter must be in a register!"); @@ -466,69 +595,13 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter( MachineInstr * MI, MachineBasicBlock * BB) const { MachineBasicBlock::iterator I = *MI; - const SIInstrInfo *TII = - static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo()); - MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); + const SIInstrInfo *TII = static_cast<const SIInstrInfo *>( + getTargetMachine().getSubtargetImpl()->getInstrInfo()); switch (MI->getOpcode()) { default: return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); case AMDGPU::BRANCH: return BB; - case AMDGPU::SI_ADDR64_RSRC: { - unsigned SuperReg = MI->getOperand(0).getReg(); - unsigned SubRegLo = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass); - unsigned SubRegHi = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass); - unsigned SubRegHiHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); - unsigned SubRegHiLo = 
MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); - BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B64), SubRegLo) - .addOperand(MI->getOperand(1)); - BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32), SubRegHiLo) - .addImm(0); - BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32), SubRegHiHi) - .addImm(AMDGPU::RSRC_DATA_FORMAT >> 32); - BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::REG_SEQUENCE), SubRegHi) - .addReg(SubRegHiLo) - .addImm(AMDGPU::sub0) - .addReg(SubRegHiHi) - .addImm(AMDGPU::sub1); - BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::REG_SEQUENCE), SuperReg) - .addReg(SubRegLo) - .addImm(AMDGPU::sub0_sub1) - .addReg(SubRegHi) - .addImm(AMDGPU::sub2_sub3); - MI->eraseFromParent(); - break; - } - case AMDGPU::SI_BUFFER_RSRC: { - unsigned SuperReg = MI->getOperand(0).getReg(); - unsigned Args[4]; - for (unsigned i = 0, e = 4; i < e; ++i) { - MachineOperand &Arg = MI->getOperand(i + 1); - - if (Arg.isReg()) { - Args[i] = Arg.getReg(); - continue; - } - - assert(Arg.isImm()); - unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); - BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32), Reg) - .addImm(Arg.getImm()); - Args[i] = Reg; - } - BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::REG_SEQUENCE), - SuperReg) - .addReg(Args[0]) - .addImm(AMDGPU::sub0) - .addReg(Args[1]) - .addImm(AMDGPU::sub1) - .addReg(Args[2]) - .addImm(AMDGPU::sub2) - .addReg(Args[3]) - .addImm(AMDGPU::sub3); - MI->eraseFromParent(); - break; - } case AMDGPU::V_SUB_F64: { unsigned DestReg = MI->getOperand(0).getReg(); BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_ADD_F64), DestReg) @@ -536,8 +609,6 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter( .addReg(MI->getOperand(1).getReg()) .addImm(1) // SRC1 modifiers .addReg(MI->getOperand(2).getReg()) - .addImm(0) // SRC2 modifiers - .addImm(0) // src2 .addImm(0) // CLAMP .addImm(0); // OMOD MI->eraseFromParent(); @@ -555,58 +626,15 @@ 
MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter( MI->eraseFromParent(); break; } - case AMDGPU::FABS_SI: { - MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); - const SIInstrInfo *TII = - static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo()); - unsigned Reg = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass); - BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32), - Reg) - .addImm(0x7fffffff); - BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_AND_B32_e32), - MI->getOperand(0).getReg()) - .addReg(MI->getOperand(1).getReg()) - .addReg(Reg); - MI->eraseFromParent(); - break; - } - case AMDGPU::FNEG_SI: { - MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); - const SIInstrInfo *TII = - static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo()); - unsigned Reg = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass); - BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32), - Reg) - .addImm(0x80000000); - BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_XOR_B32_e32), - MI->getOperand(0).getReg()) - .addReg(MI->getOperand(1).getReg()) - .addReg(Reg); - MI->eraseFromParent(); - break; - } - case AMDGPU::FCLAMP_SI: { - const SIInstrInfo *TII = - static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo()); - BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_ADD_F32_e64), - MI->getOperand(0).getReg()) - .addImm(0) // SRC0 modifiers - .addOperand(MI->getOperand(1)) - .addImm(0) // SRC1 modifiers - .addImm(0) // SRC1 - .addImm(1) // CLAMP - .addImm(0); // OMOD - MI->eraseFromParent(); - } } return BB; } -EVT SITargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const { +EVT SITargetLowering::getSetCCResultType(LLVMContext &Ctx, EVT VT) const { if (!VT.isVector()) { return MVT::i1; } - return MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); + return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements()); } MVT SITargetLowering::getScalarShiftAmountTy(EVT VT) const 
{ @@ -636,8 +664,6 @@ bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { //===----------------------------------------------------------------------===// SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { - MachineFunction &MF = DAG.getMachineFunction(); - SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); switch (Op.getOpcode()) { default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); case ISD::FrameIndex: return LowerFrameIndex(Op, DAG); @@ -656,114 +682,13 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::SELECT: return LowerSELECT(Op, DAG); case ISD::FDIV: return LowerFDIV(Op, DAG); case ISD::STORE: return LowerSTORE(Op, DAG); - case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG); - case ISD::INTRINSIC_WO_CHAIN: { - unsigned IntrinsicID = - cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); - EVT VT = Op.getValueType(); - SDLoc DL(Op); - switch (IntrinsicID) { - default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); - case Intrinsic::r600_read_ngroups_x: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 0, false); - case Intrinsic::r600_read_ngroups_y: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 4, false); - case Intrinsic::r600_read_ngroups_z: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 8, false); - case Intrinsic::r600_read_global_size_x: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 12, false); - case Intrinsic::r600_read_global_size_y: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 16, false); - case Intrinsic::r600_read_global_size_z: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 20, false); - case Intrinsic::r600_read_local_size_x: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 24, false); - case Intrinsic::r600_read_local_size_y: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 28, false); - case 
Intrinsic::r600_read_local_size_z: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 32, false); - case Intrinsic::r600_read_tgid_x: - return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, - AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 0), VT); - case Intrinsic::r600_read_tgid_y: - return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, - AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 1), VT); - case Intrinsic::r600_read_tgid_z: - return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, - AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 2), VT); - case Intrinsic::r600_read_tidig_x: - return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass, - AMDGPU::VGPR0, VT); - case Intrinsic::r600_read_tidig_y: - return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass, - AMDGPU::VGPR1, VT); - case Intrinsic::r600_read_tidig_z: - return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass, - AMDGPU::VGPR2, VT); - case AMDGPUIntrinsic::SI_load_const: { - SDValue Ops [] = { - Op.getOperand(1), - Op.getOperand(2) - }; - - MachineMemOperand *MMO = MF.getMachineMemOperand( - MachinePointerInfo(), - MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, - VT.getSizeInBits() / 8, 4); - return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL, - Op->getVTList(), Ops, VT, MMO); - } - case AMDGPUIntrinsic::SI_sample: - return LowerSampleIntrinsic(AMDGPUISD::SAMPLE, Op, DAG); - case AMDGPUIntrinsic::SI_sampleb: - return LowerSampleIntrinsic(AMDGPUISD::SAMPLEB, Op, DAG); - case AMDGPUIntrinsic::SI_sampled: - return LowerSampleIntrinsic(AMDGPUISD::SAMPLED, Op, DAG); - case AMDGPUIntrinsic::SI_samplel: - return LowerSampleIntrinsic(AMDGPUISD::SAMPLEL, Op, DAG); - case AMDGPUIntrinsic::SI_vs_load_input: - return DAG.getNode(AMDGPUISD::LOAD_INPUT, DL, VT, - Op.getOperand(1), - Op.getOperand(2), - Op.getOperand(3)); - } + case ISD::GlobalAddress: { + MachineFunction &MF = DAG.getMachineFunction(); + SIMachineFunctionInfo *MFI = 
MF.getInfo<SIMachineFunctionInfo>(); + return LowerGlobalAddress(MFI, Op, DAG); } - - case ISD::INTRINSIC_VOID: - SDValue Chain = Op.getOperand(0); - unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); - - switch (IntrinsicID) { - case AMDGPUIntrinsic::SI_tbuffer_store: { - SDLoc DL(Op); - SDValue Ops [] = { - Chain, - Op.getOperand(2), - Op.getOperand(3), - Op.getOperand(4), - Op.getOperand(5), - Op.getOperand(6), - Op.getOperand(7), - Op.getOperand(8), - Op.getOperand(9), - Op.getOperand(10), - Op.getOperand(11), - Op.getOperand(12), - Op.getOperand(13), - Op.getOperand(14) - }; - EVT VT = Op.getOperand(3).getValueType(); - - MachineMemOperand *MMO = MF.getMachineMemOperand( - MachinePointerInfo(), - MachineMemOperand::MOStore, - VT.getSizeInBits() / 8, 4); - return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT, DL, - Op->getVTList(), Ops, VT, MMO); - } - default: - break; - } + case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); + case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG); } return SDValue(); } @@ -786,16 +711,9 @@ static SDNode *findUser(SDValue Value, unsigned Opcode) { SDValue SITargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const { - MachineFunction &MF = DAG.getMachineFunction(); - const SIInstrInfo *TII = - static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo()); - const SIRegisterInfo &TRI = TII->getRegisterInfo(); FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Op); unsigned FrameIndex = FINode->getIndex(); - CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, - TRI.getPreloadedValue(MF, SIRegisterInfo::SCRATCH_WAVE_OFFSET), MVT::i32); - return DAG.getTargetFrameIndex(FrameIndex, MVT::i32); } @@ -849,7 +767,9 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, BR->getOperand(0), BRCOND.getOperand(2) }; - DAG.MorphNodeTo(BR, ISD::BR, BR->getVTList(), Ops); + SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops); + 
DAG.ReplaceAllUsesWith(BR, NewBR.getNode()); + BR = NewBR.getNode(); } SDValue Chain = SDValue(Result, Result->getNumValues() - 1); @@ -905,6 +825,139 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI, return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Lo, Hi); } +SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, + SelectionDAG &DAG) const { + MachineFunction &MF = DAG.getMachineFunction(); + const SIRegisterInfo *TRI = + static_cast<const SIRegisterInfo*>(MF.getSubtarget().getRegisterInfo()); + + EVT VT = Op.getValueType(); + SDLoc DL(Op); + unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); + + switch (IntrinsicID) { + case Intrinsic::r600_read_ngroups_x: + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), + SI::KernelInputOffsets::NGROUPS_X, false); + case Intrinsic::r600_read_ngroups_y: + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), + SI::KernelInputOffsets::NGROUPS_Y, false); + case Intrinsic::r600_read_ngroups_z: + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), + SI::KernelInputOffsets::NGROUPS_Z, false); + case Intrinsic::r600_read_global_size_x: + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), + SI::KernelInputOffsets::GLOBAL_SIZE_X, false); + case Intrinsic::r600_read_global_size_y: + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), + SI::KernelInputOffsets::GLOBAL_SIZE_Y, false); + case Intrinsic::r600_read_global_size_z: + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), + SI::KernelInputOffsets::GLOBAL_SIZE_Z, false); + case Intrinsic::r600_read_local_size_x: + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), + SI::KernelInputOffsets::LOCAL_SIZE_X, false); + case Intrinsic::r600_read_local_size_y: + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), + SI::KernelInputOffsets::LOCAL_SIZE_Y, false); + case Intrinsic::r600_read_local_size_z: + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), + 
SI::KernelInputOffsets::LOCAL_SIZE_Z, false); + + case Intrinsic::AMDGPU_read_workdim: + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), + MF.getInfo<SIMachineFunctionInfo>()->ABIArgOffset, + false); + + case Intrinsic::r600_read_tgid_x: + return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, + TRI->getPreloadedValue(MF, SIRegisterInfo::TGID_X), VT); + case Intrinsic::r600_read_tgid_y: + return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, + TRI->getPreloadedValue(MF, SIRegisterInfo::TGID_Y), VT); + case Intrinsic::r600_read_tgid_z: + return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, + TRI->getPreloadedValue(MF, SIRegisterInfo::TGID_Z), VT); + case Intrinsic::r600_read_tidig_x: + return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, + TRI->getPreloadedValue(MF, SIRegisterInfo::TIDIG_X), VT); + case Intrinsic::r600_read_tidig_y: + return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, + TRI->getPreloadedValue(MF, SIRegisterInfo::TIDIG_Y), VT); + case Intrinsic::r600_read_tidig_z: + return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, + TRI->getPreloadedValue(MF, SIRegisterInfo::TIDIG_Z), VT); + case AMDGPUIntrinsic::SI_load_const: { + SDValue Ops[] = { + Op.getOperand(1), + Op.getOperand(2) + }; + + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo(), + MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, + VT.getStoreSize(), 4); + return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL, + Op->getVTList(), Ops, VT, MMO); + } + case AMDGPUIntrinsic::SI_sample: + return LowerSampleIntrinsic(AMDGPUISD::SAMPLE, Op, DAG); + case AMDGPUIntrinsic::SI_sampleb: + return LowerSampleIntrinsic(AMDGPUISD::SAMPLEB, Op, DAG); + case AMDGPUIntrinsic::SI_sampled: + return LowerSampleIntrinsic(AMDGPUISD::SAMPLED, Op, DAG); + case AMDGPUIntrinsic::SI_samplel: + return LowerSampleIntrinsic(AMDGPUISD::SAMPLEL, Op, DAG); + case AMDGPUIntrinsic::SI_vs_load_input: + return DAG.getNode(AMDGPUISD::LOAD_INPUT, DL, 
VT, + Op.getOperand(1), + Op.getOperand(2), + Op.getOperand(3)); + default: + return AMDGPUTargetLowering::LowerOperation(Op, DAG); + } +} + +SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, + SelectionDAG &DAG) const { + MachineFunction &MF = DAG.getMachineFunction(); + SDValue Chain = Op.getOperand(0); + unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); + + switch (IntrinsicID) { + case AMDGPUIntrinsic::SI_tbuffer_store: { + SDLoc DL(Op); + SDValue Ops[] = { + Chain, + Op.getOperand(2), + Op.getOperand(3), + Op.getOperand(4), + Op.getOperand(5), + Op.getOperand(6), + Op.getOperand(7), + Op.getOperand(8), + Op.getOperand(9), + Op.getOperand(10), + Op.getOperand(11), + Op.getOperand(12), + Op.getOperand(13), + Op.getOperand(14) + }; + + EVT VT = Op.getOperand(3).getValueType(); + + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo(), + MachineMemOperand::MOStore, + VT.getStoreSize(), 4); + return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT, DL, + Op->getVTList(), Ops, VT, MMO); + } + default: + return SDValue(); + } +} + SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); LoadSDNode *Load = cast<LoadSDNode>(Op); @@ -923,7 +976,7 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { break; // fall-through case AMDGPUAS::LOCAL_ADDRESS: - return SplitVectorLoad(Op, DAG); + return ScalarizeVectorLoad(Op, DAG); } } @@ -1027,7 +1080,7 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const { const APFloat K1Val(BitsToFloat(0x2f800000)); const SDValue K1 = DAG.getConstantFP(K1Val, MVT::f32); - const SDValue One = DAG.getTargetConstantFP(1.0, MVT::f32); + const SDValue One = DAG.getConstantFP(1.0, MVT::f32); EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f32); @@ -1073,7 +1126,7 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { if (Store->getAddressSpace() == 
AMDGPUAS::PRIVATE_ADDRESS) { if (VT.isVector() && VT.getVectorNumElements() > 4) - return SplitVectorStore(Op, DAG); + return ScalarizeVectorStore(Op, DAG); return SDValue(); } @@ -1082,7 +1135,7 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { return Ret; if (VT.isVector() && VT.getVectorNumElements() >= 8) - return SplitVectorStore(Op, DAG); + return ScalarizeVectorStore(Op, DAG); if (VT == MVT::i1) return DAG.getTruncStore(Store->getChain(), DL, @@ -1114,7 +1167,7 @@ SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const { //===----------------------------------------------------------------------===// SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N, - DAGCombinerInfo &DCI) { + DAGCombinerInfo &DCI) const { EVT VT = N->getValueType(0); EVT ScalarVT = VT.getScalarType(); if (ScalarVT != MVT::f32) @@ -1162,8 +1215,21 @@ SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N, EVT LoadVT = getEquivalentMemType(*DAG.getContext(), SrcVT); EVT RegVT = getEquivalentLoadRegType(*DAG.getContext(), SrcVT); EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32, NElts); - LoadSDNode *Load = cast<LoadSDNode>(Src); + + unsigned AS = Load->getAddressSpace(); + unsigned Align = Load->getAlignment(); + Type *Ty = LoadVT.getTypeForEVT(*DAG.getContext()); + unsigned ABIAlignment = getDataLayout()->getABITypeAlignment(Ty); + + // Don't try to replace the load if we have to expand it due to alignment + // problems. Otherwise we will end up scalarizing the load, and trying to + // repack into the vector for no real reason. 
+ if (Align < ABIAlignment && + !allowsMisalignedMemoryAccesses(LoadVT, AS, Align, nullptr)) { + return SDValue(); + } + SDValue NewLoad = DAG.getExtLoad(ISD::ZEXTLOAD, DL, RegVT, Load->getChain(), Load->getBasePtr(), @@ -1203,33 +1269,259 @@ SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N, return SDValue(); } +// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2) + +// This is a variant of +// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2), +// +// The normal DAG combiner will do this, but only if the add has one use since +// that would increase the number of instructions. +// +// This prevents us from seeing a constant offset that can be folded into a +// memory instruction's addressing mode. If we know the resulting add offset of +// a pointer can be folded into an addressing offset, we can replace the pointer +// operand with the add of new constant offset. This eliminates one of the uses, +// and may allow the remaining use to also be simplified. +// +SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, + unsigned AddrSpace, + DAGCombinerInfo &DCI) const { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + + if (N0.getOpcode() != ISD::ADD) + return SDValue(); + + const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1); + if (!CN1) + return SDValue(); + + const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1)); + if (!CAdd) + return SDValue(); + + const SIInstrInfo *TII = static_cast<const SIInstrInfo *>( + getTargetMachine().getSubtargetImpl()->getInstrInfo()); + + // If the resulting offset is too large, we can't fold it into the addressing + // mode offset. 
+ APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue(); + if (!TII->canFoldOffset(Offset.getZExtValue(), AddrSpace)) + return SDValue(); + + SelectionDAG &DAG = DCI.DAG; + SDLoc SL(N); + EVT VT = N->getValueType(0); + + SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1); + SDValue COffset = DAG.getConstant(Offset, MVT::i32); + + return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset); +} + +SDValue SITargetLowering::performAndCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + if (DCI.isBeforeLegalize()) + return SDValue(); + + SelectionDAG &DAG = DCI.DAG; + + // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) -> + // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity) + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + + if (LHS.getOpcode() == ISD::SETCC && + RHS.getOpcode() == ISD::SETCC) { + ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get(); + ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get(); + + SDValue X = LHS.getOperand(0); + SDValue Y = RHS.getOperand(0); + if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X) + return SDValue(); + + if (LCC == ISD::SETO) { + if (X != LHS.getOperand(1)) + return SDValue(); + + if (RCC == ISD::SETUNE) { + const ConstantFPSDNode *C1 = dyn_cast<ConstantFPSDNode>(RHS.getOperand(1)); + if (!C1 || !C1->isInfinity() || C1->isNegative()) + return SDValue(); + + const uint32_t Mask = SIInstrFlags::N_NORMAL | + SIInstrFlags::N_SUBNORMAL | + SIInstrFlags::N_ZERO | + SIInstrFlags::P_ZERO | + SIInstrFlags::P_SUBNORMAL | + SIInstrFlags::P_NORMAL; + + static_assert(((~(SIInstrFlags::S_NAN | + SIInstrFlags::Q_NAN | + SIInstrFlags::N_INFINITY | + SIInstrFlags::P_INFINITY)) & 0x3ff) == Mask, + "mask not equal"); + + return DAG.getNode(AMDGPUISD::FP_CLASS, SDLoc(N), MVT::i1, + X, DAG.getConstant(Mask, MVT::i32)); + } + } + } + + return SDValue(); +} + +SDValue SITargetLowering::performOrCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = 
DCI.DAG; + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + + // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2) + if (LHS.getOpcode() == AMDGPUISD::FP_CLASS && + RHS.getOpcode() == AMDGPUISD::FP_CLASS) { + SDValue Src = LHS.getOperand(0); + if (Src != RHS.getOperand(0)) + return SDValue(); + + const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1)); + const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1)); + if (!CLHS || !CRHS) + return SDValue(); + + // Only 10 bits are used. + static const uint32_t MaxMask = 0x3ff; + + uint32_t NewMask = (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask; + return DAG.getNode(AMDGPUISD::FP_CLASS, SDLoc(N), MVT::i1, + Src, DAG.getConstant(NewMask, MVT::i32)); + } + + return SDValue(); +} + +SDValue SITargetLowering::performClassCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + SDValue Mask = N->getOperand(1); + + // fp_class x, 0 -> false + if (const ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Mask)) { + if (CMask->isNullValue()) + return DAG.getConstant(0, MVT::i1); + } + + return SDValue(); +} + +static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) { + switch (Opc) { + case ISD::FMAXNUM: + return AMDGPUISD::FMAX3; + case AMDGPUISD::SMAX: + return AMDGPUISD::SMAX3; + case AMDGPUISD::UMAX: + return AMDGPUISD::UMAX3; + case ISD::FMINNUM: + return AMDGPUISD::FMIN3; + case AMDGPUISD::SMIN: + return AMDGPUISD::SMIN3; + case AMDGPUISD::UMIN: + return AMDGPUISD::UMIN3; + default: + llvm_unreachable("Not a min/max opcode"); + } +} + +SDValue SITargetLowering::performMin3Max3Combine(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + + unsigned Opc = N->getOpcode(); + SDValue Op0 = N->getOperand(0); + SDValue Op1 = N->getOperand(1); + + // Only do this if the inner op has one use since this will just increases + // register pressure for no benefit. 
+ + // max(max(a, b), c) + if (Op0.getOpcode() == Opc && Op0.hasOneUse()) { + SDLoc DL(N); + return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), + DL, + N->getValueType(0), + Op0.getOperand(0), + Op0.getOperand(1), + Op1); + } + + // max(a, max(b, c)) + if (Op1.getOpcode() == Opc && Op1.hasOneUse()) { + SDLoc DL(N); + return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), + DL, + N->getValueType(0), + Op0, + Op1.getOperand(0), + Op1.getOperand(1)); + } + + return SDValue(); +} + +SDValue SITargetLowering::performSetCCCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + SDLoc SL(N); + + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + EVT VT = LHS.getValueType(); + + if (VT != MVT::f32 && VT != MVT::f64) + return SDValue(); + + // Match isinf pattern + // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity)) + ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get(); + if (CC == ISD::SETOEQ && LHS.getOpcode() == ISD::FABS) { + const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS); + if (!CRHS) + return SDValue(); + + const APFloat &APF = CRHS->getValueAPF(); + if (APF.isInfinity() && !APF.isNegative()) { + unsigned Mask = SIInstrFlags::P_INFINITY | SIInstrFlags::N_INFINITY; + return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, + LHS.getOperand(0), DAG.getConstant(Mask, MVT::i32)); + } + } + + return SDValue(); +} + SDValue SITargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; SDLoc DL(N); - EVT VT = N->getValueType(0); switch (N->getOpcode()) { - default: return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); - case ISD::SETCC: { - SDValue Arg0 = N->getOperand(0); - SDValue Arg1 = N->getOperand(1); - SDValue CC = N->getOperand(2); - ConstantSDNode * C = nullptr; - ISD::CondCode CCOp = dyn_cast<CondCodeSDNode>(CC)->get(); - - // i1 setcc (sext(i1), 0, setne) -> i1 setcc(i1, 0, setne) - if (VT == MVT::i1 - && Arg0.getOpcode() == ISD::SIGN_EXTEND 
- && Arg0.getOperand(0).getValueType() == MVT::i1 - && (C = dyn_cast<ConstantSDNode>(Arg1)) - && C->isNullValue() - && CCOp == ISD::SETNE) { - return SimplifySetCC(VT, Arg0.getOperand(0), - DAG.getConstant(0, MVT::i1), CCOp, true, DCI, DL); - } - break; - } + default: + return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); + case ISD::SETCC: + return performSetCCCombine(N, DCI); + case ISD::FMAXNUM: // TODO: What about fmax_legacy? + case ISD::FMINNUM: + case AMDGPUISD::SMAX: + case AMDGPUISD::SMIN: + case AMDGPUISD::UMAX: + case AMDGPUISD::UMIN: { + if (DCI.getDAGCombineLevel() >= AfterLegalizeDAG && + getTargetMachine().getOptLevel() > CodeGenOpt::None) + return performMin3Max3Combine(N, DCI); + break; + } case AMDGPUISD::CVT_F32_UBYTE0: case AMDGPUISD::CVT_F32_UBYTE1: @@ -1254,22 +1546,155 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, case ISD::UINT_TO_FP: { return performUCharToFloatCombine(N, DCI); + + case ISD::FADD: { + if (DCI.getDAGCombineLevel() < AfterLegalizeDAG) + break; + + EVT VT = N->getValueType(0); + if (VT != MVT::f32) + break; + + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + + // These should really be instruction patterns, but writing patterns with + // source modiifiers is a pain. + + // fadd (fadd (a, a), b) -> mad 2.0, a, b + if (LHS.getOpcode() == ISD::FADD) { + SDValue A = LHS.getOperand(0); + if (A == LHS.getOperand(1)) { + const SDValue Two = DAG.getConstantFP(2.0, MVT::f32); + return DAG.getNode(AMDGPUISD::MAD, DL, VT, Two, A, RHS); + } + } + + // fadd (b, fadd (a, a)) -> mad 2.0, a, b + if (RHS.getOpcode() == ISD::FADD) { + SDValue A = RHS.getOperand(0); + if (A == RHS.getOperand(1)) { + const SDValue Two = DAG.getConstantFP(2.0, MVT::f32); + return DAG.getNode(AMDGPUISD::MAD, DL, VT, Two, A, LHS); + } + } + + break; + } + case ISD::FSUB: { + if (DCI.getDAGCombineLevel() < AfterLegalizeDAG) + break; + + EVT VT = N->getValueType(0); + + // Try to get the fneg to fold into the source modifier. 
This undoes generic + // DAG combines and folds them into the mad. + if (VT == MVT::f32) { + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + + if (LHS.getOpcode() == ISD::FMUL) { + // (fsub (fmul a, b), c) -> mad a, b, (fneg c) + + SDValue A = LHS.getOperand(0); + SDValue B = LHS.getOperand(1); + SDValue C = DAG.getNode(ISD::FNEG, DL, VT, RHS); + + return DAG.getNode(AMDGPUISD::MAD, DL, VT, A, B, C); + } + + if (RHS.getOpcode() == ISD::FMUL) { + // (fsub c, (fmul a, b)) -> mad (fneg a), b, c + + SDValue A = DAG.getNode(ISD::FNEG, DL, VT, RHS.getOperand(0)); + SDValue B = RHS.getOperand(1); + SDValue C = LHS; + + return DAG.getNode(AMDGPUISD::MAD, DL, VT, A, B, C); + } + + if (LHS.getOpcode() == ISD::FADD) { + // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c) + + SDValue A = LHS.getOperand(0); + if (A == LHS.getOperand(1)) { + const SDValue Two = DAG.getConstantFP(2.0, MVT::f32); + SDValue NegRHS = DAG.getNode(ISD::FNEG, DL, VT, RHS); + + return DAG.getNode(AMDGPUISD::MAD, DL, VT, Two, A, NegRHS); + } + } + + if (RHS.getOpcode() == ISD::FADD) { + // (fsub c, (fadd a, a)) -> mad -2.0, a, c + + SDValue A = RHS.getOperand(0); + if (A == RHS.getOperand(1)) { + const SDValue NegTwo = DAG.getConstantFP(-2.0, MVT::f32); + return DAG.getNode(AMDGPUISD::MAD, DL, VT, NegTwo, A, LHS); + } + } + } + + break; } } + case ISD::LOAD: + case ISD::STORE: + case ISD::ATOMIC_LOAD: + case ISD::ATOMIC_STORE: + case ISD::ATOMIC_CMP_SWAP: + case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: + case ISD::ATOMIC_SWAP: + case ISD::ATOMIC_LOAD_ADD: + case ISD::ATOMIC_LOAD_SUB: + case ISD::ATOMIC_LOAD_AND: + case ISD::ATOMIC_LOAD_OR: + case ISD::ATOMIC_LOAD_XOR: + case ISD::ATOMIC_LOAD_NAND: + case ISD::ATOMIC_LOAD_MIN: + case ISD::ATOMIC_LOAD_MAX: + case ISD::ATOMIC_LOAD_UMIN: + case ISD::ATOMIC_LOAD_UMAX: { // TODO: Target mem intrinsics. 
+ if (DCI.isBeforeLegalize()) + break; + + MemSDNode *MemNode = cast<MemSDNode>(N); + SDValue Ptr = MemNode->getBasePtr(); + // TODO: We could also do this for multiplies. + unsigned AS = MemNode->getAddressSpace(); + if (Ptr.getOpcode() == ISD::SHL && AS != AMDGPUAS::PRIVATE_ADDRESS) { + SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), AS, DCI); + if (NewPtr) { + SmallVector<SDValue, 8> NewOps; + for (unsigned I = 0, E = MemNode->getNumOperands(); I != E; ++I) + NewOps.push_back(MemNode->getOperand(I)); + + NewOps[N->getOpcode() == ISD::STORE ? 2 : 1] = NewPtr; + return SDValue(DAG.UpdateNodeOperands(MemNode, NewOps), 0); + } + } + break; + } + case ISD::AND: + return performAndCombine(N, DCI); + case ISD::OR: + return performOrCombine(N, DCI); + case AMDGPUISD::FP_CLASS: + return performClassCombine(N, DCI); + } return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); } /// \brief Test if RegClass is one of the VSrc classes static bool isVSrc(unsigned RegClass) { - return AMDGPU::VSrc_32RegClassID == RegClass || - AMDGPU::VSrc_64RegClassID == RegClass; -} - -/// \brief Test if RegClass is one of the SSrc classes -static bool isSSrc(unsigned RegClass) { - return AMDGPU::SSrc_32RegClassID == RegClass || - AMDGPU::SSrc_64RegClassID == RegClass; + switch(RegClass) { + default: return false; + case AMDGPU::VS_32RegClassID: + case AMDGPU::VS_64RegClassID: + return true; + } } /// \brief Analyze the possible immediate value Op @@ -1278,75 +1703,36 @@ static bool isSSrc(unsigned RegClass) { /// and the immediate value if it's a literal immediate int32_t SITargetLowering::analyzeImmediate(const SDNode *N) const { - union { - int32_t I; - float F; - } Imm; + const SIInstrInfo *TII = static_cast<const SIInstrInfo *>( + getTargetMachine().getSubtargetImpl()->getInstrInfo()); if (const ConstantSDNode *Node = dyn_cast<ConstantSDNode>(N)) { - if (Node->getZExtValue() >> 32) { - return -1; - } - Imm.I = Node->getSExtValue(); - } else if (const ConstantFPSDNode *Node = 
dyn_cast<ConstantFPSDNode>(N)) { - if (N->getValueType(0) != MVT::f32) + if (Node->getZExtValue() >> 32) return -1; - Imm.F = Node->getValueAPF().convertToFloat(); - } else - return -1; // It isn't an immediate - - if ((Imm.I >= -16 && Imm.I <= 64) || - Imm.F == 0.5f || Imm.F == -0.5f || - Imm.F == 1.0f || Imm.F == -1.0f || - Imm.F == 2.0f || Imm.F == -2.0f || - Imm.F == 4.0f || Imm.F == -4.0f) - return 0; // It's an inline immediate - - return Imm.I; // It's a literal immediate -} - -/// \brief Try to fold an immediate directly into an instruction -bool SITargetLowering::foldImm(SDValue &Operand, int32_t &Immediate, - bool &ScalarSlotUsed) const { - - MachineSDNode *Mov = dyn_cast<MachineSDNode>(Operand); - const SIInstrInfo *TII = - static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo()); - if (!Mov || !TII->isMov(Mov->getMachineOpcode())) - return false; - const SDValue &Op = Mov->getOperand(0); - int32_t Value = analyzeImmediate(Op.getNode()); - if (Value == -1) { - // Not an immediate at all - return false; + if (TII->isInlineConstant(Node->getAPIntValue())) + return 0; - } else if (Value == 0) { - // Inline immediates can always be fold - Operand = Op; - return true; + return Node->getZExtValue(); + } - } else if (Value == Immediate) { - // Already fold literal immediate - Operand = Op; - return true; + if (const ConstantFPSDNode *Node = dyn_cast<ConstantFPSDNode>(N)) { + if (TII->isInlineConstant(Node->getValueAPF().bitcastToAPInt())) + return 0; - } else if (!ScalarSlotUsed && !Immediate) { - // Fold this literal immediate - ScalarSlotUsed = true; - Immediate = Value; - Operand = Op; - return true; + if (Node->getValueType(0) == MVT::f32) + return FloatToBits(Node->getValueAPF().convertToFloat()); + return -1; } - return false; + return -1; } const TargetRegisterClass *SITargetLowering::getRegClassForNode( SelectionDAG &DAG, const SDValue &Op) const { - const SIInstrInfo *TII = - static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo()); 
+ const SIInstrInfo *TII = static_cast<const SIInstrInfo *>( + getTargetMachine().getSubtargetImpl()->getInstrInfo()); const SIRegisterInfo &TRI = TII->getRegisterInfo(); if (!Op->isMachineOpcode()) { @@ -1375,10 +1761,9 @@ const TargetRegisterClass *SITargetLowering::getRegClassForNode( // If the COPY_TO_REGCLASS instruction is copying to a VSrc register // class, then the register class for the value could be either a // VReg or and SReg. In order to get a more accurate - if (OpClassID == AMDGPU::VSrc_32RegClassID || - OpClassID == AMDGPU::VSrc_64RegClassID) { + if (isVSrc(OpClassID)) return getRegClassForNode(DAG, Op.getOperand(0)); - } + return TRI.getRegClass(OpClassID); case AMDGPU::EXTRACT_SUBREG: { int SubIdx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); @@ -1398,7 +1783,8 @@ const TargetRegisterClass *SITargetLowering::getRegClassForNode( /// \brief Does "Op" fit into register class "RegClass" ? bool SITargetLowering::fitsRegClass(SelectionDAG &DAG, const SDValue &Op, unsigned RegClass) const { - const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo(); + const TargetRegisterInfo *TRI = + getTargetMachine().getSubtargetImpl()->getRegisterInfo(); const TargetRegisterClass *RC = getRegClassForNode(DAG, Op); if (!RC) { return false; @@ -1406,242 +1792,6 @@ bool SITargetLowering::fitsRegClass(SelectionDAG &DAG, const SDValue &Op, return TRI->getRegClass(RegClass)->hasSubClassEq(RC); } -/// \brief Make sure that we don't exeed the number of allowed scalars -void SITargetLowering::ensureSRegLimit(SelectionDAG &DAG, SDValue &Operand, - unsigned RegClass, - bool &ScalarSlotUsed) const { - - // First map the operands register class to a destination class - if (RegClass == AMDGPU::VSrc_32RegClassID) - RegClass = AMDGPU::VReg_32RegClassID; - else if (RegClass == AMDGPU::VSrc_64RegClassID) - RegClass = AMDGPU::VReg_64RegClassID; - else - return; - - // Nothing to do if they fit naturally - if (fitsRegClass(DAG, Operand, RegClass)) - return; - - 
// If the scalar slot isn't used yet use it now - if (!ScalarSlotUsed) { - ScalarSlotUsed = true; - return; - } - - // This is a conservative aproach. It is possible that we can't determine the - // correct register class and copy too often, but better safe than sorry. - - SDNode *Node; - // We can't use COPY_TO_REGCLASS with FrameIndex arguments. - if (isa<FrameIndexSDNode>(Operand)) { - unsigned Opcode = Operand.getValueType() == MVT::i32 ? - AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; - Node = DAG.getMachineNode(Opcode, SDLoc(), Operand.getValueType(), - Operand); - } else { - SDValue RC = DAG.getTargetConstant(RegClass, MVT::i32); - Node = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS, SDLoc(), - Operand.getValueType(), Operand, RC); - } - Operand = SDValue(Node, 0); -} - -/// \returns true if \p Node's operands are different from the SDValue list -/// \p Ops -static bool isNodeChanged(const SDNode *Node, const std::vector<SDValue> &Ops) { - for (unsigned i = 0, e = Node->getNumOperands(); i < e; ++i) { - if (Ops[i].getNode() != Node->getOperand(i).getNode()) { - return true; - } - } - return false; -} - -/// \brief Try to fold the Nodes operands into the Node -SDNode *SITargetLowering::foldOperands(MachineSDNode *Node, - SelectionDAG &DAG) const { - - // Original encoding (either e32 or e64) - int Opcode = Node->getMachineOpcode(); - const SIInstrInfo *TII = - static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo()); - const MCInstrDesc *Desc = &TII->get(Opcode); - - unsigned NumDefs = Desc->getNumDefs(); - unsigned NumOps = Desc->getNumOperands(); - - // Commuted opcode if available - int OpcodeRev = Desc->isCommutable() ? TII->commuteOpcode(Opcode) : -1; - const MCInstrDesc *DescRev = OpcodeRev == -1 ? 
nullptr : &TII->get(OpcodeRev); - - assert(!DescRev || DescRev->getNumDefs() == NumDefs); - assert(!DescRev || DescRev->getNumOperands() == NumOps); - - // e64 version if available, -1 otherwise - int OpcodeE64 = AMDGPU::getVOPe64(Opcode); - const MCInstrDesc *DescE64 = OpcodeE64 == -1 ? nullptr : &TII->get(OpcodeE64); - int InputModifiers[3] = {0}; - - assert(!DescE64 || DescE64->getNumDefs() == NumDefs); - - int32_t Immediate = Desc->getSize() == 4 ? 0 : -1; - bool HaveVSrc = false, HaveSSrc = false; - - // First figure out what we already have in this instruction. - for (unsigned i = 0, e = Node->getNumOperands(), Op = NumDefs; - i != e && Op < NumOps; ++i, ++Op) { - - unsigned RegClass = Desc->OpInfo[Op].RegClass; - if (isVSrc(RegClass)) - HaveVSrc = true; - else if (isSSrc(RegClass)) - HaveSSrc = true; - else - continue; - - int32_t Imm = analyzeImmediate(Node->getOperand(i).getNode()); - if (Imm != -1 && Imm != 0) { - // Literal immediate - Immediate = Imm; - } - } - - // If we neither have VSrc nor SSrc, it makes no sense to continue. - if (!HaveVSrc && !HaveSSrc) - return Node; - - // No scalar allowed when we have both VSrc and SSrc - bool ScalarSlotUsed = HaveVSrc && HaveSSrc; - - // Second go over the operands and try to fold them - std::vector<SDValue> Ops; - bool Promote2e64 = false; - for (unsigned i = 0, e = Node->getNumOperands(), Op = NumDefs; - i != e && Op < NumOps; ++i, ++Op) { - - const SDValue &Operand = Node->getOperand(i); - Ops.push_back(Operand); - - // Already folded immediate? - if (isa<ConstantSDNode>(Operand.getNode()) || - isa<ConstantFPSDNode>(Operand.getNode())) - continue; - - // Is this a VSrc or SSrc operand? - unsigned RegClass = Desc->OpInfo[Op].RegClass; - if (isVSrc(RegClass) || isSSrc(RegClass)) { - // Try to fold the immediates - if (!foldImm(Ops[i], Immediate, ScalarSlotUsed)) { - // Folding didn't work, make sure we don't hit the SReg limit. 
- ensureSRegLimit(DAG, Ops[i], RegClass, ScalarSlotUsed); - } - continue; - } else { - // If it's not a VSrc or SSrc operand check if we have a GlobalAddress. - // These will be lowered to immediates, so we will need to insert a MOV. - if (isa<GlobalAddressSDNode>(Ops[i])) { - SDNode *Node = DAG.getMachineNode(AMDGPU::V_MOV_B32_e32, SDLoc(), - Operand.getValueType(), Operand); - Ops[i] = SDValue(Node, 0); - } - } - - if (i == 1 && DescRev && fitsRegClass(DAG, Ops[0], RegClass)) { - - unsigned OtherRegClass = Desc->OpInfo[NumDefs].RegClass; - assert(isVSrc(OtherRegClass) || isSSrc(OtherRegClass)); - - // Test if it makes sense to swap operands - if (foldImm(Ops[1], Immediate, ScalarSlotUsed) || - (!fitsRegClass(DAG, Ops[1], RegClass) && - fitsRegClass(DAG, Ops[1], OtherRegClass))) { - - // Swap commutable operands - std::swap(Ops[0], Ops[1]); - - Desc = DescRev; - DescRev = nullptr; - continue; - } - } - - if (Immediate) - continue; - - if (DescE64) { - // Test if it makes sense to switch to e64 encoding - unsigned OtherRegClass = DescE64->OpInfo[Op].RegClass; - if (!isVSrc(OtherRegClass) && !isSSrc(OtherRegClass)) - continue; - - int32_t TmpImm = -1; - if (foldImm(Ops[i], TmpImm, ScalarSlotUsed) || - (!fitsRegClass(DAG, Ops[i], RegClass) && - fitsRegClass(DAG, Ops[1], OtherRegClass))) { - - // Switch to e64 encoding - Immediate = -1; - Promote2e64 = true; - Desc = DescE64; - DescE64 = nullptr; - } - } - - if (!DescE64 && !Promote2e64) - continue; - if (!Operand.isMachineOpcode()) - continue; - if (Operand.getMachineOpcode() == AMDGPU::FNEG_SI) { - Ops.pop_back(); - Ops.push_back(Operand.getOperand(0)); - InputModifiers[i] = 1; - Promote2e64 = true; - if (!DescE64) - continue; - Desc = DescE64; - DescE64 = nullptr; - } - else if (Operand.getMachineOpcode() == AMDGPU::FABS_SI) { - Ops.pop_back(); - Ops.push_back(Operand.getOperand(0)); - InputModifiers[i] = 2; - Promote2e64 = true; - if (!DescE64) - continue; - Desc = DescE64; - DescE64 = nullptr; - } - } - - if 
(Promote2e64) { - std::vector<SDValue> OldOps(Ops); - Ops.clear(); - for (unsigned i = 0; i < OldOps.size(); ++i) { - // src_modifier - Ops.push_back(DAG.getTargetConstant(InputModifiers[i], MVT::i32)); - Ops.push_back(OldOps[i]); - } - // Add the modifier flags while promoting - for (unsigned i = 0; i < 2; ++i) - Ops.push_back(DAG.getTargetConstant(0, MVT::i32)); - } - - // Add optional chain and glue - for (unsigned i = NumOps - NumDefs, e = Node->getNumOperands(); i < e; ++i) - Ops.push_back(Node->getOperand(i)); - - // Nodes that have a glue result are not CSE'd by getMachineNode(), so in - // this case a brand new node is always be created, even if the operands - // are the same as before. So, manually check if anything has been changed. - if (Desc->Opcode == Opcode && !isNodeChanged(Node, Ops)) { - return Node; - } - - // Create a complete new instruction - return DAG.getMachineNode(Desc->Opcode, SDLoc(Node), Node->getVTList(), Ops); -} - /// \brief Helper function for adjustWritemask static unsigned SubIdx2Lane(unsigned Idx) { switch (Idx) { @@ -1706,7 +1856,7 @@ void SITargetLowering::adjustWritemask(MachineSDNode *&Node, // If we only got one lane, replace it with a copy // (if NewDmask has only one bit set...) if (NewDmask && (NewDmask & (NewDmask-1)) == 0) { - SDValue RC = DAG.getTargetConstant(AMDGPU::VReg_32RegClassID, MVT::i32); + SDValue RC = DAG.getTargetConstant(AMDGPU::VGPR_32RegClassID, MVT::i32); SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS, SDLoc(), Users[Lane]->getValueType(0), SDValue(Node, 0), RC); @@ -1733,46 +1883,185 @@ void SITargetLowering::adjustWritemask(MachineSDNode *&Node, } } +/// \brief Legalize target independent instructions (e.g. INSERT_SUBREG) +/// with frame index operands. +/// LLVM assumes that inputs are to these instructions are registers. 
+void SITargetLowering::legalizeTargetIndependentNode(SDNode *Node, + SelectionDAG &DAG) const { + + SmallVector<SDValue, 8> Ops; + for (unsigned i = 0; i < Node->getNumOperands(); ++i) { + if (!isa<FrameIndexSDNode>(Node->getOperand(i))) { + Ops.push_back(Node->getOperand(i)); + continue; + } + + SDLoc DL(Node); + Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, + Node->getOperand(i).getValueType(), + Node->getOperand(i)), 0)); + } + + DAG.UpdateNodeOperands(Node, Ops); +} + /// \brief Fold the instructions after selecting them. SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node, SelectionDAG &DAG) const { - const SIInstrInfo *TII = - static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo()); + const SIInstrInfo *TII = static_cast<const SIInstrInfo *>( + getTargetMachine().getSubtargetImpl()->getInstrInfo()); Node = AdjustRegClass(Node, DAG); if (TII->isMIMG(Node->getMachineOpcode())) adjustWritemask(Node, DAG); - return foldOperands(Node, DAG); + if (Node->getMachineOpcode() == AMDGPU::INSERT_SUBREG || + Node->getMachineOpcode() == AMDGPU::REG_SEQUENCE) { + legalizeTargetIndependentNode(Node, DAG); + return Node; + } + return Node; } /// \brief Assign the register class depending on the number of /// bits set in the writemask void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI, SDNode *Node) const { - const SIInstrInfo *TII = - static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo()); - if (!TII->isMIMG(MI->getOpcode())) + const SIInstrInfo *TII = static_cast<const SIInstrInfo *>( + getTargetMachine().getSubtargetImpl()->getInstrInfo()); + + MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); + TII->legalizeOperands(MI); + + if (TII->isMIMG(MI->getOpcode())) { + unsigned VReg = MI->getOperand(0).getReg(); + unsigned Writemask = MI->getOperand(1).getImm(); + unsigned BitsSet = 0; + for (unsigned i = 0; i < 4; ++i) + BitsSet += Writemask & (1 << i) ? 
1 : 0; + + const TargetRegisterClass *RC; + switch (BitsSet) { + default: return; + case 1: RC = &AMDGPU::VGPR_32RegClass; break; + case 2: RC = &AMDGPU::VReg_64RegClass; break; + case 3: RC = &AMDGPU::VReg_96RegClass; break; + } + + unsigned NewOpcode = TII->getMaskedMIMGOp(MI->getOpcode(), BitsSet); + MI->setDesc(TII->get(NewOpcode)); + MRI.setRegClass(VReg, RC); return; + } - unsigned VReg = MI->getOperand(0).getReg(); - unsigned Writemask = MI->getOperand(1).getImm(); - unsigned BitsSet = 0; - for (unsigned i = 0; i < 4; ++i) - BitsSet += Writemask & (1 << i) ? 1 : 0; + // Replace unused atomics with the no return version. + int NoRetAtomicOp = AMDGPU::getAtomicNoRetOp(MI->getOpcode()); + if (NoRetAtomicOp != -1) { + if (!Node->hasAnyUseOfValue(0)) { + MI->setDesc(TII->get(NoRetAtomicOp)); + MI->RemoveOperand(0); + } - const TargetRegisterClass *RC; - switch (BitsSet) { - default: return; - case 1: RC = &AMDGPU::VReg_32RegClass; break; - case 2: RC = &AMDGPU::VReg_64RegClass; break; - case 3: RC = &AMDGPU::VReg_96RegClass; break; + return; } +} - unsigned NewOpcode = TII->getMaskedMIMGOp(MI->getOpcode(), BitsSet); - MI->setDesc(TII->get(NewOpcode)); - MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); - MRI.setRegClass(VReg, RC); +static SDValue buildSMovImm32(SelectionDAG &DAG, SDLoc DL, uint64_t Val) { + SDValue K = DAG.getTargetConstant(Val, MVT::i32); + return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0); +} + +MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG, + SDLoc DL, + SDValue Ptr) const { + const SIInstrInfo *TII = static_cast<const SIInstrInfo *>( + getTargetMachine().getSubtargetImpl()->getInstrInfo()); +#if 1 + // XXX - Workaround for moveToVALU not handling different register class + // inserts for REG_SEQUENCE. + + // Build the half of the subregister with the constants. 
+ const SDValue Ops0[] = { + DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, MVT::i32), + buildSMovImm32(DAG, DL, 0), + DAG.getTargetConstant(AMDGPU::sub0, MVT::i32), + buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32), + DAG.getTargetConstant(AMDGPU::sub1, MVT::i32) + }; + + SDValue SubRegHi = SDValue(DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, + MVT::v2i32, Ops0), 0); + + // Combine the constants and the pointer. + const SDValue Ops1[] = { + DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, MVT::i32), + Ptr, + DAG.getTargetConstant(AMDGPU::sub0_sub1, MVT::i32), + SubRegHi, + DAG.getTargetConstant(AMDGPU::sub2_sub3, MVT::i32) + }; + + return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1); +#else + const SDValue Ops[] = { + DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, MVT::i32), + Ptr, + DAG.getTargetConstant(AMDGPU::sub0_sub1, MVT::i32), + buildSMovImm32(DAG, DL, 0), + DAG.getTargetConstant(AMDGPU::sub2, MVT::i32), + buildSMovImm32(DAG, DL, TII->getDefaultRsrcFormat() >> 32), + DAG.getTargetConstant(AMDGPU::sub3, MVT::i32) + }; + + return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops); + +#endif +} + +/// \brief Return a resource descriptor with the 'Add TID' bit enabled +/// The TID (Thread ID) is multipled by the stride value (bits [61:48] +/// of the resource descriptor) to create an offset, which is added to the +/// resource ponter. 
+MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, + SDLoc DL, + SDValue Ptr, + uint32_t RsrcDword1, + uint64_t RsrcDword2And3) const { + SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr); + SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr); + if (RsrcDword1) { + PtrHi = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi, + DAG.getConstant(RsrcDword1, MVT::i32)), 0); + } + + SDValue DataLo = buildSMovImm32(DAG, DL, + RsrcDword2And3 & UINT64_C(0xFFFFFFFF)); + SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32); + + const SDValue Ops[] = { + DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, MVT::i32), + PtrLo, + DAG.getTargetConstant(AMDGPU::sub0, MVT::i32), + PtrHi, + DAG.getTargetConstant(AMDGPU::sub1, MVT::i32), + DataLo, + DAG.getTargetConstant(AMDGPU::sub2, MVT::i32), + DataHi, + DAG.getTargetConstant(AMDGPU::sub3, MVT::i32) + }; + + return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops); +} + +MachineSDNode *SITargetLowering::buildScratchRSRC(SelectionDAG &DAG, + SDLoc DL, + SDValue Ptr) const { + const SIInstrInfo *TII = static_cast<const SIInstrInfo *>( + getTargetMachine().getSubtargetImpl()->getInstrInfo()); + uint64_t Rsrc = TII->getDefaultRsrcDataFormat() | AMDGPU::RSRC_TID_ENABLE | + 0xffffffff; // Size + + return buildRSRC(DAG, DL, Ptr, 0, Rsrc); } MachineSDNode *SITargetLowering::AdjustRegClass(MachineSDNode *N, @@ -1800,12 +2089,26 @@ MachineSDNode *SITargetLowering::AdjustRegClass(MachineSDNode *N, return N; } ConstantSDNode *Offset = cast<ConstantSDNode>(N->getOperand(1)); - SDValue Ops[] = { - SDValue(DAG.getMachineNode(AMDGPU::SI_ADDR64_RSRC, DL, MVT::i128, - DAG.getConstant(0, MVT::i64)), 0), - N->getOperand(0), - DAG.getConstant(Offset->getSExtValue() << 2, MVT::i32) - }; + + const SDValue Zero64 = DAG.getTargetConstant(0, MVT::i64); + SDValue Ptr(DAG.getMachineNode(AMDGPU::S_MOV_B64, DL, MVT::i64, Zero64), 0); + MachineSDNode *RSrc = 
wrapAddr64Rsrc(DAG, DL, Ptr); + + SmallVector<SDValue, 8> Ops; + Ops.push_back(SDValue(RSrc, 0)); + Ops.push_back(N->getOperand(0)); + + // The immediate offset is in dwords on SI and in bytes on VI. + if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) + Ops.push_back(DAG.getTargetConstant(Offset->getSExtValue(), MVT::i32)); + else + Ops.push_back(DAG.getTargetConstant(Offset->getSExtValue() << 2, MVT::i32)); + + // Copy remaining operands so we keep any chain and glue nodes that follow + // the normal operands. + for (unsigned I = 2, E = N->getNumOperands(); I != E; ++I) + Ops.push_back(N->getOperand(I)); + return DAG.getMachineNode(NewOpcode, DL, N->getVTList(), Ops); } } diff --git a/lib/Target/R600/SIISelLowering.h b/lib/Target/R600/SIISelLowering.h index d106d4abb187..876fd8c9f369 100644 --- a/lib/Target/R600/SIISelLowering.h +++ b/lib/Target/R600/SIISelLowering.h @@ -12,8 +12,8 @@ // //===----------------------------------------------------------------------===// -#ifndef SIISELLOWERING_H -#define SIISELLOWERING_H +#ifndef LLVM_LIB_TARGET_R600_SIISELLOWERING_H +#define LLVM_LIB_TARGET_R600_SIISELLOWERING_H #include "AMDGPUISelLowering.h" #include "SIInstrInfo.h" @@ -27,6 +27,9 @@ class SITargetLowering : public AMDGPUTargetLowering { SelectionDAG &DAG) const; SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const override; + + SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const; SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const; @@ -34,30 +37,49 @@ class SITargetLowering : public AMDGPUTargetLowering { SDValue LowerFDIV32(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFDIV64(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFDIV(SDValue Op, SelectionDAG &DAG) const; + SDValue 
LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG, bool Signed) const; SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerTrig(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; - bool foldImm(SDValue &Operand, int32_t &Immediate, - bool &ScalarSlotUsed) const; const TargetRegisterClass *getRegClassForNode(SelectionDAG &DAG, const SDValue &Op) const; bool fitsRegClass(SelectionDAG &DAG, const SDValue &Op, unsigned RegClass) const; - void ensureSRegLimit(SelectionDAG &DAG, SDValue &Operand, - unsigned RegClass, bool &ScalarSlotUsed) const; - SDNode *foldOperands(MachineSDNode *N, SelectionDAG &DAG) const; void adjustWritemask(MachineSDNode *&N, SelectionDAG &DAG) const; MachineSDNode *AdjustRegClass(MachineSDNode *N, SelectionDAG &DAG) const; - static SDValue performUCharToFloatCombine(SDNode *N, - DAGCombinerInfo &DCI); + SDValue performUCharToFloatCombine(SDNode *N, + DAGCombinerInfo &DCI) const; + SDValue performSHLPtrCombine(SDNode *N, + unsigned AS, + DAGCombinerInfo &DCI) const; + SDValue performAndCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performOrCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performClassCombine(SDNode *N, DAGCombinerInfo &DCI) const; + + SDValue performMin3Max3Combine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performSetCCCombine(SDNode *N, DAGCombinerInfo &DCI) const; public: SITargetLowering(TargetMachine &tm); - bool allowsUnalignedMemoryAccesses(EVT VT, unsigned AS, - bool *IsFast) const override; + + bool isShuffleMaskLegal(const SmallVectorImpl<int> &/*Mask*/, + EVT /*VT*/) const override; + + bool isLegalAddressingMode(const AddrMode &AM, + Type *Ty) const override; + + bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS, + unsigned Align, + bool *IsFast) const override; + + EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign, + unsigned SrcAlign, bool IsMemset, + bool ZeroMemset, + bool MemcpyStrSrc, + MachineFunction &MF) const 
override; TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(EVT VT) const override; @@ -85,8 +107,19 @@ public: int32_t analyzeImmediate(const SDNode *N) const; SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, unsigned Reg, EVT VT) const override; + void legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const; + + MachineSDNode *wrapAddr64Rsrc(SelectionDAG &DAG, SDLoc DL, SDValue Ptr) const; + MachineSDNode *buildRSRC(SelectionDAG &DAG, + SDLoc DL, + SDValue Ptr, + uint32_t RsrcDword1, + uint64_t RsrcDword2And3) const; + MachineSDNode *buildScratchRSRC(SelectionDAG &DAG, + SDLoc DL, + SDValue Ptr) const; }; } // End namespace llvm -#endif //SIISELLOWERING_H +#endif diff --git a/lib/Target/R600/SIInsertWaits.cpp b/lib/Target/R600/SIInsertWaits.cpp index 7dfc31bdfa01..181b11643bf3 100644 --- a/lib/Target/R600/SIInsertWaits.cpp +++ b/lib/Target/R600/SIInsertWaits.cpp @@ -17,6 +17,8 @@ //===----------------------------------------------------------------------===// #include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "SIDefines.h" #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" #include "llvm/CodeGen/MachineFunction.h" @@ -39,6 +41,12 @@ typedef union { } Counters; +typedef enum { + OTHER, + SMEM, + VMEM +} InstType; + typedef Counters RegCounters[512]; typedef std::pair<unsigned, unsigned> RegInterval; @@ -71,6 +79,9 @@ private: /// \brief Different export instruction types seen since last wait. unsigned ExpInstrTypesSeen; + /// \brief Type of the last opcode. + InstType LastOpcodeType; + /// \brief Get increment/decrement amount for this instruction. 
Counters getHwCounts(MachineInstr &MI); @@ -81,7 +92,8 @@ private: RegInterval getRegInterval(MachineOperand &Op); /// \brief Handle instructions async components - void pushInstruction(MachineInstr &MI); + void pushInstruction(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I); /// \brief Insert the actual wait instruction bool insertWait(MachineBasicBlock &MBB, @@ -174,6 +186,29 @@ bool SIInsertWaits::isOpRelevant(MachineOperand &Op) { if (!MI.getDesc().mayStore()) return false; + // Check if this operand is the value being stored. + // Special case for DS instructions, since the address + // operand comes before the value operand and it may have + // multiple data operands. + + if (TII->isDS(MI.getOpcode())) { + MachineOperand *Data = TII->getNamedOperand(MI, AMDGPU::OpName::data); + if (Data && Op.isIdenticalTo(*Data)) + return true; + + MachineOperand *Data0 = TII->getNamedOperand(MI, AMDGPU::OpName::data0); + if (Data0 && Op.isIdenticalTo(*Data0)) + return true; + + MachineOperand *Data1 = TII->getNamedOperand(MI, AMDGPU::OpName::data1); + if (Data1 && Op.isIdenticalTo(*Data1)) + return true; + + return false; + } + + // NOTE: This assumes that the value operand is before the + // address operand, and that there is only one value operand. 
for (MachineInstr::mop_iterator I = MI.operands_begin(), E = MI.operands_end(); I != E; ++I) { @@ -201,10 +236,11 @@ RegInterval SIInsertWaits::getRegInterval(MachineOperand &Op) { return Result; } -void SIInsertWaits::pushInstruction(MachineInstr &MI) { +void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) { // Get the hardware counter increments and sum them up - Counters Increment = getHwCounts(MI); + Counters Increment = getHwCounts(*I); unsigned Sum = 0; for (unsigned i = 0; i < 3; ++i) { @@ -213,17 +249,42 @@ void SIInsertWaits::pushInstruction(MachineInstr &MI) { } // If we don't increase anything then that's it - if (Sum == 0) + if (Sum == 0) { + LastOpcodeType = OTHER; return; + } + + if (TRI->ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + // Any occurence of consecutive VMEM or SMEM instructions forms a VMEM + // or SMEM clause, respectively. + // + // The temporary workaround is to break the clauses with S_NOP. + // + // The proper solution would be to allocate registers such that all source + // and destination registers don't overlap, e.g. this is illegal: + // r0 = load r2 + // r2 = load r0 + if ((LastOpcodeType == SMEM && TII->isSMRD(I->getOpcode())) || + (LastOpcodeType == VMEM && Increment.Named.VM)) { + // Insert a NOP to break the clause. + BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_NOP)) + .addImm(0); + } + + if (TII->isSMRD(I->getOpcode())) + LastOpcodeType = SMEM; + else if (Increment.Named.VM) + LastOpcodeType = VMEM; + } // Remember which export instructions we have seen if (Increment.Named.EXP) { - ExpInstrTypesSeen |= MI.getOpcode() == AMDGPU::EXP ? 1 : 2; + ExpInstrTypesSeen |= I->getOpcode() == AMDGPU::EXP ? 
1 : 2; } - for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { + for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) { - MachineOperand &Op = MI.getOperand(i); + MachineOperand &Op = I->getOperand(i); if (!isOpRelevant(Op)) continue; @@ -300,6 +361,7 @@ bool SIInsertWaits::insertWait(MachineBasicBlock &MBB, ((Counts.Named.EXP & 0x7) << 4) | ((Counts.Named.LGKM & 0x7) << 8)); + LastOpcodeType = OTHER; return true; } @@ -346,13 +408,15 @@ Counters SIInsertWaits::handleOperands(MachineInstr &MI) { bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) { bool Changes = false; - TII = static_cast<const SIInstrInfo*>(MF.getTarget().getInstrInfo()); - TRI = static_cast<const SIRegisterInfo*>(MF.getTarget().getRegisterInfo()); + TII = static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo()); + TRI = + static_cast<const SIRegisterInfo *>(MF.getSubtarget().getRegisterInfo()); MRI = &MF.getRegInfo(); WaitedOn = ZeroCounts; LastIssued = ZeroCounts; + LastOpcodeType = OTHER; memset(&UsedRegs, 0, sizeof(UsedRegs)); memset(&DefinedRegs, 0, sizeof(DefinedRegs)); @@ -364,8 +428,12 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) { for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E; ++I) { - Changes |= insertWait(MBB, I, handleOperands(*I)); - pushInstruction(*I); + // Wait for everything before a barrier. 
+ if (I->getOpcode() == AMDGPU::S_BARRIER) + Changes |= insertWait(MBB, I, LastIssued); + else + Changes |= insertWait(MBB, I, handleOperands(*I)); + pushInstruction(MBB, I); } // Wait for everything at the end of the MBB diff --git a/lib/Target/R600/SIInstrFormats.td b/lib/Target/R600/SIInstrFormats.td index 00e69ddbeea4..99a1df36c1f4 100644 --- a/lib/Target/R600/SIInstrFormats.td +++ b/lib/Target/R600/SIInstrFormats.td @@ -17,24 +17,58 @@ class InstSI <dag outs, dag ins, string asm, list<dag> pattern> : field bits<1> VM_CNT = 0; field bits<1> EXP_CNT = 0; field bits<1> LGKM_CNT = 0; - field bits<1> MIMG = 0; - field bits<1> SMRD = 0; + + field bits<1> SALU = 0; + field bits<1> VALU = 0; + + field bits<1> SOP1 = 0; + field bits<1> SOP2 = 0; + field bits<1> SOPC = 0; + field bits<1> SOPK = 0; + field bits<1> SOPP = 0; + field bits<1> VOP1 = 0; field bits<1> VOP2 = 0; field bits<1> VOP3 = 0; field bits<1> VOPC = 0; - field bits<1> SALU = 0; + field bits<1> MUBUF = 0; + field bits<1> MTBUF = 0; + field bits<1> SMRD = 0; + field bits<1> DS = 0; + field bits<1> MIMG = 0; + field bits<1> FLAT = 0; + + // These need to be kept in sync with the enum in SIInstrFlags. let TSFlags{0} = VM_CNT; let TSFlags{1} = EXP_CNT; let TSFlags{2} = LGKM_CNT; - let TSFlags{3} = MIMG; - let TSFlags{4} = SMRD; - let TSFlags{5} = VOP1; - let TSFlags{6} = VOP2; - let TSFlags{7} = VOP3; - let TSFlags{8} = VOPC; - let TSFlags{9} = SALU; + + let TSFlags{3} = SALU; + let TSFlags{4} = VALU; + + let TSFlags{5} = SOP1; + let TSFlags{6} = SOP2; + let TSFlags{7} = SOPC; + let TSFlags{8} = SOPK; + let TSFlags{9} = SOPP; + + let TSFlags{10} = VOP1; + let TSFlags{11} = VOP2; + let TSFlags{12} = VOP3; + let TSFlags{13} = VOPC; + + let TSFlags{14} = MUBUF; + let TSFlags{15} = MTBUF; + let TSFlags{16} = SMRD; + let TSFlags{17} = DS; + let TSFlags{18} = MIMG; + let TSFlags{19} = FLAT; + + // Most instructions require adjustments after selection to satisfy + // operand requirements. 
+ let hasPostISelHook = 1; + let SchedRW = [Write32Bit]; } class Enc32 { @@ -49,6 +83,44 @@ class Enc64 { int Size = 8; } +let Uses = [EXEC] in { + +class VOPCCommon <dag ins, string asm, list<dag> pattern> : + InstSI <(outs VCCReg:$dst), ins, asm, pattern> { + + let DisableEncoding = "$dst"; + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; + let UseNamedOperandTable = 1; + let VOPC = 1; + let VALU = 1; + let Size = 4; +} + +class VOP1Common <dag outs, dag ins, string asm, list<dag> pattern> : + InstSI <outs, ins, asm, pattern> { + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; + let UseNamedOperandTable = 1; + let VOP1 = 1; + let VALU = 1; + let Size = 4; +} + +class VOP2Common <dag outs, dag ins, string asm, list<dag> pattern> : + InstSI <outs, ins, asm, pattern> { + + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; + let UseNamedOperandTable = 1; + let VOP2 = 1; + let VALU = 1; + let Size = 4; +} + class VOP3Common <dag outs, dag ins, string asm, list<dag> pattern> : InstSI <outs, ins, asm, pattern> { @@ -56,11 +128,20 @@ class VOP3Common <dag outs, dag ins, string asm, list<dag> pattern> : let mayStore = 0; let hasSideEffects = 0; let UseNamedOperandTable = 1; + // Using complex patterns gives VOP3 patterns a very high complexity rating, + // but standalone patterns are almost always prefered, so we need to adjust the + // priority lower. The goal is to use a high number to reduce complexity to + // zero (or less than zero). 
+ let AddedComplexity = -1000; + let VOP3 = 1; + let VALU = 1; int Size = 8; } +} // End Uses = [EXEC] + //===----------------------------------------------------------------------===// // Scalar operations //===----------------------------------------------------------------------===// @@ -134,22 +215,26 @@ class SMRDe <bits<5> op, bits<1> imm> : Enc32 { let Inst{31-27} = 0x18; //encoding } -class SOP1 <bits<8> op, dag outs, dag ins, string asm, list<dag> pattern> : - InstSI<outs, ins, asm, pattern>, SOP1e <op> { - +let SchedRW = [WriteSALU] in { +class SOP1 <dag outs, dag ins, string asm, list<dag> pattern> : + InstSI<outs, ins, asm, pattern> { let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; let SALU = 1; + let SOP1 = 1; } -class SOP2 <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> : - InstSI <outs, ins, asm, pattern>, SOP2e<op> { +class SOP2 <dag outs, dag ins, string asm, list<dag> pattern> : + InstSI <outs, ins, asm, pattern> { let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; let SALU = 1; + let SOP2 = 1; + + let UseNamedOperandTable = 1; } class SOPC <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> : @@ -160,31 +245,48 @@ class SOPC <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> : let mayStore = 0; let hasSideEffects = 0; let SALU = 1; + let SOPC = 1; + + let UseNamedOperandTable = 1; } -class SOPK <bits<5> op, dag outs, dag ins, string asm, list<dag> pattern> : - InstSI <outs, ins , asm, pattern>, SOPKe<op> { +class SOPK <dag outs, dag ins, string asm, list<dag> pattern> : + InstSI <outs, ins , asm, pattern> { let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; let SALU = 1; + let SOPK = 1; + + let UseNamedOperandTable = 1; } -class SOPP <bits<7> op, dag ins, string asm, list<dag> pattern> : +class SOPP <bits<7> op, dag ins, string asm, list<dag> pattern = []> : InstSI <(outs), ins, asm, pattern >, SOPPe <op> { let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; + let 
isCodeGenOnly = 0; let SALU = 1; + let SOPP = 1; + + let UseNamedOperandTable = 1; } -class SMRD <bits<5> op, bits<1> imm, dag outs, dag ins, string asm, - list<dag> pattern> : InstSI<outs, ins, asm, pattern>, SMRDe<op, imm> { +} // let SchedRW = [WriteSALU] + +class SMRD <dag outs, dag ins, string asm, list<dag> pattern> : + InstSI<outs, ins, asm, pattern> { let LGKM_CNT = 1; let SMRD = 1; + let mayStore = 0; + let mayLoad = 1; + let hasSideEffects = 0; + let UseNamedOperandTable = 1; + let SchedRW = [WriteSMEM]; } //===----------------------------------------------------------------------===// @@ -410,8 +512,27 @@ class MIMGe <bits<7> op> : Enc64 { let Inst{57-53} = SSAMP{6-2}; } -class EXPe : Enc64 { +class FLATe<bits<7> op> : Enc64 { + bits<8> addr; + bits<8> data; + bits<8> vdst; + bits<1> slc; + bits<1> glc; + bits<1> tfe; + // 15-0 is reserved. + let Inst{16} = glc; + let Inst{17} = slc; + let Inst{24-18} = op; + let Inst{31-26} = 0x37; // Encoding. + let Inst{39-32} = addr; + let Inst{47-40} = data; + // 54-48 is reserved. 
+ let Inst{55} = tfe; + let Inst{63-56} = vdst; +} + +class EXPe : Enc64 { bits<4> EN; bits<6> TGT; bits<1> COMPR; @@ -437,48 +558,23 @@ class EXPe : Enc64 { let Uses = [EXEC] in { class VOP1 <bits<8> op, dag outs, dag ins, string asm, list<dag> pattern> : - InstSI <outs, ins, asm, pattern>, VOP1e<op> { - - let mayLoad = 0; - let mayStore = 0; - let hasSideEffects = 0; - let UseNamedOperandTable = 1; - let VOP1 = 1; -} + VOP1Common <outs, ins, asm, pattern>, + VOP1e<op>; class VOP2 <bits<6> op, dag outs, dag ins, string asm, list<dag> pattern> : - InstSI <outs, ins, asm, pattern>, VOP2e<op> { - - let mayLoad = 0; - let mayStore = 0; - let hasSideEffects = 0; - let UseNamedOperandTable = 1; - let VOP2 = 1; -} - -class VOP3 <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern> : - VOP3Common <outs, ins, asm, pattern>, VOP3e<op>; + VOP2Common <outs, ins, asm, pattern>, VOP2e<op>; class VOP3b <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern> : VOP3Common <outs, ins, asm, pattern>, VOP3be<op>; class VOPC <bits<8> op, dag ins, string asm, list<dag> pattern> : - InstSI <(outs VCCReg:$dst), ins, asm, pattern>, VOPCe <op> { + VOPCCommon <ins, asm, pattern>, VOPCe <op>; - let DisableEncoding = "$dst"; - let mayLoad = 0; - let mayStore = 0; - let hasSideEffects = 0; - let UseNamedOperandTable = 1; - let VOPC = 1; -} - -class VINTRP <bits <2> op, dag outs, dag ins, string asm, list<dag> pattern> : - InstSI <outs, ins, asm, pattern>, VINTRPe<op> { - - let neverHasSideEffects = 1; +class VINTRPCommon <dag outs, dag ins, string asm, list<dag> pattern> : + InstSI <outs, ins, asm, pattern> { let mayLoad = 1; let mayStore = 0; + let hasSideEffects = 0; } } // End Uses = [EXEC] @@ -489,29 +585,56 @@ class VINTRP <bits <2> op, dag outs, dag ins, string asm, list<dag> pattern> : let Uses = [EXEC] in { -class DS <bits<8> op, dag outs, dag ins, string asm, list<dag> pattern> : - InstSI <outs, ins, asm, pattern> , DSe<op> { +class DS <dag outs, dag ins, string asm, 
list<dag> pattern> : + InstSI <outs, ins, asm, pattern> { let LGKM_CNT = 1; + let DS = 1; + let UseNamedOperandTable = 1; + let DisableEncoding = "$m0"; + let SchedRW = [WriteLDS]; } -class MUBUF <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> : - InstSI<outs, ins, asm, pattern>, MUBUFe <op> { +class DS_si <bits<8> op, dag outs, dag ins, string asm, list<dag> pattern> : + DS <outs, ins, asm, pattern>, DSe<op>; + +class MUBUF <dag outs, dag ins, string asm, list<dag> pattern> : + InstSI<outs, ins, asm, pattern> { let VM_CNT = 1; let EXP_CNT = 1; + let MUBUF = 1; - let neverHasSideEffects = 1; + let hasSideEffects = 0; let UseNamedOperandTable = 1; + let SchedRW = [WriteVMEM]; } -class MTBUF <bits<3> op, dag outs, dag ins, string asm, list<dag> pattern> : - InstSI<outs, ins, asm, pattern>, MTBUFe <op> { +class MTBUF <dag outs, dag ins, string asm, list<dag> pattern> : + InstSI<outs, ins, asm, pattern> { let VM_CNT = 1; let EXP_CNT = 1; + let MTBUF = 1; - let neverHasSideEffects = 1; + let hasSideEffects = 0; + let UseNamedOperandTable = 1; + let SchedRW = [WriteVMEM]; +} + +class FLAT <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> : + InstSI<outs, ins, asm, pattern>, FLATe <op> { + let FLAT = 1; + // Internally, FLAT instruction are executed as both an LDS and a + // Buffer instruction; so, they increment both VM_CNT and LGKM_CNT + // and are not considered done until both have been decremented. 
+ let VM_CNT = 1; + let LGKM_CNT = 1; + + let Uses = [EXEC, FLAT_SCR]; // M0 + + let UseNamedOperandTable = 1; + let hasSideEffects = 0; } class MIMG <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> : @@ -520,16 +643,9 @@ class MIMG <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> : let VM_CNT = 1; let EXP_CNT = 1; let MIMG = 1; -} -def EXP : InstSI< - (outs), - (ins i32imm:$en, i32imm:$tgt, i32imm:$compr, i32imm:$done, i32imm:$vm, - VReg_32:$src0, VReg_32:$src1, VReg_32:$src2, VReg_32:$src3), - "EXP $en, $tgt, $compr, $done, $vm, $src0, $src1, $src2, $src3", - [] >, EXPe { - - let EXP_CNT = 1; + let hasSideEffects = 0; // XXX ???? } + } // End Uses = [EXEC] diff --git a/lib/Target/R600/SIInstrInfo.cpp b/lib/Target/R600/SIInstrInfo.cpp index 51f453292da8..1a4c0d4e57b0 100644 --- a/lib/Target/R600/SIInstrInfo.cpp +++ b/lib/Target/R600/SIInstrInfo.cpp @@ -17,10 +17,13 @@ #include "AMDGPUTargetMachine.h" #include "SIDefines.h" #include "SIMachineFunctionInfo.h" +#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/IR/Function.h" +#include "llvm/CodeGen/RegisterScavenging.h" #include "llvm/MC/MCInstrDesc.h" +#include "llvm/Support/Debug.h" using namespace llvm; @@ -32,6 +35,259 @@ SIInstrInfo::SIInstrInfo(const AMDGPUSubtarget &st) // TargetInstrInfo callbacks //===----------------------------------------------------------------------===// +static unsigned getNumOperandsNoGlue(SDNode *Node) { + unsigned N = Node->getNumOperands(); + while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue) + --N; + return N; +} + +static SDValue findChainOperand(SDNode *Load) { + SDValue LastOp = Load->getOperand(getNumOperandsNoGlue(Load) - 1); + assert(LastOp.getValueType() == MVT::Other && "Chain missing from load node"); + return LastOp; +} + +/// \brief Returns true if both nodes have the same value for the given +/// operand \p Op, or if both nodes 
do not have this operand. +static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) { + unsigned Opc0 = N0->getMachineOpcode(); + unsigned Opc1 = N1->getMachineOpcode(); + + int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName); + int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName); + + if (Op0Idx == -1 && Op1Idx == -1) + return true; + + + if ((Op0Idx == -1 && Op1Idx != -1) || + (Op1Idx == -1 && Op0Idx != -1)) + return false; + + // getNamedOperandIdx returns the index for the MachineInstr's operands, + // which includes the result as the first operand. We are indexing into the + // MachineSDNode's operands, so we need to skip the result operand to get + // the real index. + --Op0Idx; + --Op1Idx; + + return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx); +} + +bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, + int64_t &Offset0, + int64_t &Offset1) const { + if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode()) + return false; + + unsigned Opc0 = Load0->getMachineOpcode(); + unsigned Opc1 = Load1->getMachineOpcode(); + + // Make sure both are actually loads. + if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad()) + return false; + + if (isDS(Opc0) && isDS(Opc1)) { + + // FIXME: Handle this case: + if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1)) + return false; + + // Check base reg. + if (Load0->getOperand(1) != Load1->getOperand(1)) + return false; + + // Check chain. + if (findChainOperand(Load0) != findChainOperand(Load1)) + return false; + + // Skip read2 / write2 variants for simplicity. + // TODO: We should report true if the used offsets are adjacent (excluded + // st64 versions). 
+ if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::data1) != -1 || + AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::data1) != -1) + return false; + + Offset0 = cast<ConstantSDNode>(Load0->getOperand(2))->getZExtValue(); + Offset1 = cast<ConstantSDNode>(Load1->getOperand(2))->getZExtValue(); + return true; + } + + if (isSMRD(Opc0) && isSMRD(Opc1)) { + assert(getNumOperandsNoGlue(Load0) == getNumOperandsNoGlue(Load1)); + + // Check base reg. + if (Load0->getOperand(0) != Load1->getOperand(0)) + return false; + + // Check chain. + if (findChainOperand(Load0) != findChainOperand(Load1)) + return false; + + Offset0 = cast<ConstantSDNode>(Load0->getOperand(1))->getZExtValue(); + Offset1 = cast<ConstantSDNode>(Load1->getOperand(1))->getZExtValue(); + return true; + } + + // MUBUF and MTBUF can access the same addresses. + if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) { + + // MUBUF and MTBUF have vaddr at different indices. + if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) || + findChainOperand(Load0) != findChainOperand(Load1) || + !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) || + !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc)) + return false; + + int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset); + int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset); + + if (OffIdx0 == -1 || OffIdx1 == -1) + return false; + + // getNamedOperandIdx returns the index for MachineInstrs. Since they + // inlcude the output in the operand list, but SDNodes don't, we need to + // subtract the index by one. + --OffIdx0; + --OffIdx1; + + SDValue Off0 = Load0->getOperand(OffIdx0); + SDValue Off1 = Load1->getOperand(OffIdx1); + + // The offset might be a FrameIndexSDNode. 
+ if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1)) + return false; + + Offset0 = cast<ConstantSDNode>(Off0)->getZExtValue(); + Offset1 = cast<ConstantSDNode>(Off1)->getZExtValue(); + return true; + } + + return false; +} + +static bool isStride64(unsigned Opc) { + switch (Opc) { + case AMDGPU::DS_READ2ST64_B32: + case AMDGPU::DS_READ2ST64_B64: + case AMDGPU::DS_WRITE2ST64_B32: + case AMDGPU::DS_WRITE2ST64_B64: + return true; + default: + return false; + } +} + +bool SIInstrInfo::getLdStBaseRegImmOfs(MachineInstr *LdSt, + unsigned &BaseReg, unsigned &Offset, + const TargetRegisterInfo *TRI) const { + unsigned Opc = LdSt->getOpcode(); + if (isDS(Opc)) { + const MachineOperand *OffsetImm = getNamedOperand(*LdSt, + AMDGPU::OpName::offset); + if (OffsetImm) { + // Normal, single offset LDS instruction. + const MachineOperand *AddrReg = getNamedOperand(*LdSt, + AMDGPU::OpName::addr); + + BaseReg = AddrReg->getReg(); + Offset = OffsetImm->getImm(); + return true; + } + + // The 2 offset instructions use offset0 and offset1 instead. We can treat + // these as a load with a single offset if the 2 offsets are consecutive. We + // will use this for some partially aligned loads. + const MachineOperand *Offset0Imm = getNamedOperand(*LdSt, + AMDGPU::OpName::offset0); + const MachineOperand *Offset1Imm = getNamedOperand(*LdSt, + AMDGPU::OpName::offset1); + + uint8_t Offset0 = Offset0Imm->getImm(); + uint8_t Offset1 = Offset1Imm->getImm(); + assert(Offset1 > Offset0); + + if (Offset1 - Offset0 == 1) { + // Each of these offsets is in element sized units, so we need to convert + // to bytes of the individual reads. 
+ + unsigned EltSize; + if (LdSt->mayLoad()) + EltSize = getOpRegClass(*LdSt, 0)->getSize() / 2; + else { + assert(LdSt->mayStore()); + int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0); + EltSize = getOpRegClass(*LdSt, Data0Idx)->getSize(); + } + + if (isStride64(Opc)) + EltSize *= 64; + + const MachineOperand *AddrReg = getNamedOperand(*LdSt, + AMDGPU::OpName::addr); + BaseReg = AddrReg->getReg(); + Offset = EltSize * Offset0; + return true; + } + + return false; + } + + if (isMUBUF(Opc) || isMTBUF(Opc)) { + if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset) != -1) + return false; + + const MachineOperand *AddrReg = getNamedOperand(*LdSt, + AMDGPU::OpName::vaddr); + if (!AddrReg) + return false; + + const MachineOperand *OffsetImm = getNamedOperand(*LdSt, + AMDGPU::OpName::offset); + BaseReg = AddrReg->getReg(); + Offset = OffsetImm->getImm(); + return true; + } + + if (isSMRD(Opc)) { + const MachineOperand *OffsetImm = getNamedOperand(*LdSt, + AMDGPU::OpName::offset); + if (!OffsetImm) + return false; + + const MachineOperand *SBaseReg = getNamedOperand(*LdSt, + AMDGPU::OpName::sbase); + BaseReg = SBaseReg->getReg(); + Offset = OffsetImm->getImm(); + return true; + } + + return false; +} + +bool SIInstrInfo::shouldClusterLoads(MachineInstr *FirstLdSt, + MachineInstr *SecondLdSt, + unsigned NumLoads) const { + unsigned Opc0 = FirstLdSt->getOpcode(); + unsigned Opc1 = SecondLdSt->getOpcode(); + + // TODO: This needs finer tuning + if (NumLoads > 4) + return false; + + if (isDS(Opc0) && isDS(Opc1)) + return true; + + if (isSMRD(Opc0) && isSMRD(Opc1)) + return true; + + if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) + return true; + + return false; +} + void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, DebugLoc DL, @@ -70,26 +326,6 @@ SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, unsigned Opcode; const int16_t *SubIndices; - if (AMDGPU::M0 == DestReg) { - // Check if M0 
isn't already set to this value - for (MachineBasicBlock::reverse_iterator E = MBB.rend(), - I = MachineBasicBlock::reverse_iterator(MI); I != E; ++I) { - - if (!I->definesRegister(AMDGPU::M0)) - continue; - - unsigned Opc = I->getOpcode(); - if (Opc != TargetOpcode::COPY && Opc != AMDGPU::S_MOV_B32) - break; - - if (!I->readsRegister(SrcReg)) - break; - - // The copy isn't necessary - return; - } - } - if (AMDGPU::SReg_32RegClass.contains(DestReg)) { assert(AMDGPU::SReg_32RegClass.contains(SrcReg)); BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg) @@ -117,8 +353,8 @@ SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, Opcode = AMDGPU::S_MOV_B32; SubIndices = Sub0_15; - } else if (AMDGPU::VReg_32RegClass.contains(DestReg)) { - assert(AMDGPU::VReg_32RegClass.contains(SrcReg) || + } else if (AMDGPU::VGPR_32RegClass.contains(DestReg)) { + assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) || AMDGPU::SReg_32RegClass.contains(SrcReg)); BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg) .addReg(SrcReg, getKillRegState(KillSrc)); @@ -182,6 +418,27 @@ unsigned SIInstrInfo::commuteOpcode(unsigned Opcode) const { return Opcode; } +unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const { + + if (DstRC->getSize() == 4) { + return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32; + } else if (DstRC->getSize() == 8 && RI.isSGPRClass(DstRC)) { + return AMDGPU::S_MOV_B64; + } else if (DstRC->getSize() == 8 && !RI.isSGPRClass(DstRC)) { + return AMDGPU::V_MOV_B64_PSEUDO; + } + return AMDGPU::COPY; +} + +static bool shouldTryToSpillVGPRs(MachineFunction *MF) { + + SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); + + // FIXME: Implement spilling for other shader types. 
+ return MFI->getShaderType() == ShaderType::COMPUTE; + +} + void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned SrcReg, bool isKill, @@ -190,49 +447,49 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, const TargetRegisterInfo *TRI) const { MachineFunction *MF = MBB.getParent(); SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); - MachineRegisterInfo &MRI = MF->getRegInfo(); + MachineFrameInfo *FrameInfo = MF->getFrameInfo(); DebugLoc DL = MBB.findDebugLoc(MI); - unsigned KillFlag = isKill ? RegState::Kill : 0; + int Opcode = -1; - if (RI.hasVGPRs(RC)) { - LLVMContext &Ctx = MF->getFunction()->getContext(); - Ctx.emitError("SIInstrInfo::storeRegToStackSlot - Can't spill VGPR!"); - BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), AMDGPU::VGPR0) - .addReg(SrcReg); - } else if (TRI->getCommonSubClass(RC, &AMDGPU::SGPR_32RegClass)) { - unsigned Lane = MFI->SpillTracker.reserveLanes(MRI, MF); - unsigned TgtReg = MFI->SpillTracker.LaneVGPR; - - BuildMI(MBB, MI, DL, get(AMDGPU::V_WRITELANE_B32), TgtReg) - .addReg(SrcReg, KillFlag) - .addImm(Lane); - MFI->SpillTracker.addSpilledReg(FrameIndex, TgtReg, Lane); - } else if (RI.isSGPRClass(RC)) { + if (RI.isSGPRClass(RC)) { // We are only allowed to create one new instruction when spilling - // registers, so we need to use pseudo instruction for vector - // registers. - // - // Reserve a spot in the spill tracker for each sub-register of - // the vector register. - unsigned NumSubRegs = RC->getSize() / 4; - unsigned FirstLane = MFI->SpillTracker.reserveLanes(MRI, MF, NumSubRegs); - MFI->SpillTracker.addSpilledReg(FrameIndex, MFI->SpillTracker.LaneVGPR, - FirstLane); - - unsigned Opcode; + // registers, so we need to use pseudo instruction for spilling + // SGPRs. 
switch (RC->getSize() * 8) { - case 64: Opcode = AMDGPU::SI_SPILL_S64_SAVE; break; - case 128: Opcode = AMDGPU::SI_SPILL_S128_SAVE; break; - case 256: Opcode = AMDGPU::SI_SPILL_S256_SAVE; break; - case 512: Opcode = AMDGPU::SI_SPILL_S512_SAVE; break; - default: llvm_unreachable("Cannot spill register class"); + case 32: Opcode = AMDGPU::SI_SPILL_S32_SAVE; break; + case 64: Opcode = AMDGPU::SI_SPILL_S64_SAVE; break; + case 128: Opcode = AMDGPU::SI_SPILL_S128_SAVE; break; + case 256: Opcode = AMDGPU::SI_SPILL_S256_SAVE; break; + case 512: Opcode = AMDGPU::SI_SPILL_S512_SAVE; break; } + } else if(shouldTryToSpillVGPRs(MF) && RI.hasVGPRs(RC)) { + MFI->setHasSpilledVGPRs(); - BuildMI(MBB, MI, DL, get(Opcode), MFI->SpillTracker.LaneVGPR) + switch(RC->getSize() * 8) { + case 32: Opcode = AMDGPU::SI_SPILL_V32_SAVE; break; + case 64: Opcode = AMDGPU::SI_SPILL_V64_SAVE; break; + case 96: Opcode = AMDGPU::SI_SPILL_V96_SAVE; break; + case 128: Opcode = AMDGPU::SI_SPILL_V128_SAVE; break; + case 256: Opcode = AMDGPU::SI_SPILL_V256_SAVE; break; + case 512: Opcode = AMDGPU::SI_SPILL_V512_SAVE; break; + } + } + + if (Opcode != -1) { + FrameInfo->setObjectAlignment(FrameIndex, 4); + BuildMI(MBB, MI, DL, get(Opcode)) .addReg(SrcReg) - .addImm(FrameIndex); + .addFrameIndex(FrameIndex) + // Place-holder registers, these will be filled in by + // SIPrepareScratchRegs. 
+ .addReg(AMDGPU::SGPR0_SGPR1, RegState::Undef) + .addReg(AMDGPU::SGPR0, RegState::Undef); } else { - llvm_unreachable("VGPR spilling not supported"); + LLVMContext &Ctx = MF->getFunction()->getContext(); + Ctx.emitError("SIInstrInfo::storeRegToStackSlot - Do not know how to" + " spill register"); + BuildMI(MBB, MI, DL, get(AMDGPU::KILL)) + .addReg(SrcReg); } } @@ -242,55 +499,142 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const { MachineFunction *MF = MBB.getParent(); - SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); + MachineFrameInfo *FrameInfo = MF->getFrameInfo(); DebugLoc DL = MBB.findDebugLoc(MI); + int Opcode = -1; - if (RI.hasVGPRs(RC)) { - LLVMContext &Ctx = MF->getFunction()->getContext(); - Ctx.emitError("SIInstrInfo::loadRegToStackSlot - Can't retrieve spilled VGPR!"); - BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg) - .addImm(0); - } else if (RI.isSGPRClass(RC)){ - unsigned Opcode; + if (RI.isSGPRClass(RC)){ switch(RC->getSize() * 8) { - case 32: Opcode = AMDGPU::SI_SPILL_S32_RESTORE; break; - case 64: Opcode = AMDGPU::SI_SPILL_S64_RESTORE; break; - case 128: Opcode = AMDGPU::SI_SPILL_S128_RESTORE; break; - case 256: Opcode = AMDGPU::SI_SPILL_S256_RESTORE; break; - case 512: Opcode = AMDGPU::SI_SPILL_S512_RESTORE; break; - default: llvm_unreachable("Cannot spill register class"); + case 32: Opcode = AMDGPU::SI_SPILL_S32_RESTORE; break; + case 64: Opcode = AMDGPU::SI_SPILL_S64_RESTORE; break; + case 128: Opcode = AMDGPU::SI_SPILL_S128_RESTORE; break; + case 256: Opcode = AMDGPU::SI_SPILL_S256_RESTORE; break; + case 512: Opcode = AMDGPU::SI_SPILL_S512_RESTORE; break; } + } else if(shouldTryToSpillVGPRs(MF) && RI.hasVGPRs(RC)) { + switch(RC->getSize() * 8) { + case 32: Opcode = AMDGPU::SI_SPILL_V32_RESTORE; break; + case 64: Opcode = AMDGPU::SI_SPILL_V64_RESTORE; break; + case 96: Opcode = AMDGPU::SI_SPILL_V96_RESTORE; break; + case 128: 
Opcode = AMDGPU::SI_SPILL_V128_RESTORE; break; + case 256: Opcode = AMDGPU::SI_SPILL_V256_RESTORE; break; + case 512: Opcode = AMDGPU::SI_SPILL_V512_RESTORE; break; + } + } - SIMachineFunctionInfo::SpilledReg Spill = - MFI->SpillTracker.getSpilledReg(FrameIndex); - + if (Opcode != -1) { + FrameInfo->setObjectAlignment(FrameIndex, 4); BuildMI(MBB, MI, DL, get(Opcode), DestReg) - .addReg(Spill.VGPR) - .addImm(FrameIndex); + .addFrameIndex(FrameIndex) + // Place-holder registers, these will be filled in by + // SIPrepareScratchRegs. + .addReg(AMDGPU::SGPR0_SGPR1, RegState::Undef) + .addReg(AMDGPU::SGPR0, RegState::Undef); + } else { - llvm_unreachable("VGPR spilling not supported"); + LLVMContext &Ctx = MF->getFunction()->getContext(); + Ctx.emitError("SIInstrInfo::loadRegFromStackSlot - Do not know how to" + " restore register"); + BuildMI(MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), DestReg); } } -static unsigned getNumSubRegsForSpillOp(unsigned Op) { - - switch (Op) { - case AMDGPU::SI_SPILL_S512_SAVE: - case AMDGPU::SI_SPILL_S512_RESTORE: - return 16; - case AMDGPU::SI_SPILL_S256_SAVE: - case AMDGPU::SI_SPILL_S256_RESTORE: - return 8; - case AMDGPU::SI_SPILL_S128_SAVE: - case AMDGPU::SI_SPILL_S128_RESTORE: - return 4; - case AMDGPU::SI_SPILL_S64_SAVE: - case AMDGPU::SI_SPILL_S64_RESTORE: - return 2; - case AMDGPU::SI_SPILL_S32_RESTORE: - return 1; - default: llvm_unreachable("Invalid spill opcode"); +/// \param @Offset Offset in bytes of the FrameIndex being spilled +unsigned SIInstrInfo::calculateLDSSpillAddress(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + RegScavenger *RS, unsigned TmpReg, + unsigned FrameOffset, + unsigned Size) const { + MachineFunction *MF = MBB.getParent(); + SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); + const AMDGPUSubtarget &ST = MF->getTarget().getSubtarget<AMDGPUSubtarget>(); + const SIRegisterInfo *TRI = + static_cast<const SIRegisterInfo*>(ST.getRegisterInfo()); + DebugLoc DL = MBB.findDebugLoc(MI); + 
unsigned WorkGroupSize = MFI->getMaximumWorkGroupSize(*MF); + unsigned WavefrontSize = ST.getWavefrontSize(); + + unsigned TIDReg = MFI->getTIDReg(); + if (!MFI->hasCalculatedTID()) { + MachineBasicBlock &Entry = MBB.getParent()->front(); + MachineBasicBlock::iterator Insert = Entry.front(); + DebugLoc DL = Insert->getDebugLoc(); + + TIDReg = RI.findUnusedRegister(MF->getRegInfo(), &AMDGPU::VGPR_32RegClass); + if (TIDReg == AMDGPU::NoRegister) + return TIDReg; + + + if (MFI->getShaderType() == ShaderType::COMPUTE && + WorkGroupSize > WavefrontSize) { + + unsigned TIDIGXReg = TRI->getPreloadedValue(*MF, SIRegisterInfo::TIDIG_X); + unsigned TIDIGYReg = TRI->getPreloadedValue(*MF, SIRegisterInfo::TIDIG_Y); + unsigned TIDIGZReg = TRI->getPreloadedValue(*MF, SIRegisterInfo::TIDIG_Z); + unsigned InputPtrReg = + TRI->getPreloadedValue(*MF, SIRegisterInfo::INPUT_PTR); + static const unsigned TIDIGRegs[3] = { + TIDIGXReg, TIDIGYReg, TIDIGZReg + }; + for (unsigned Reg : TIDIGRegs) { + if (!Entry.isLiveIn(Reg)) + Entry.addLiveIn(Reg); + } + + RS->enterBasicBlock(&Entry); + unsigned STmp0 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0); + unsigned STmp1 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0); + BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp0) + .addReg(InputPtrReg) + .addImm(SI::KernelInputOffsets::NGROUPS_Z); + BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp1) + .addReg(InputPtrReg) + .addImm(SI::KernelInputOffsets::NGROUPS_Y); + + // NGROUPS.X * NGROUPS.Y + BuildMI(Entry, Insert, DL, get(AMDGPU::S_MUL_I32), STmp1) + .addReg(STmp1) + .addReg(STmp0); + // (NGROUPS.X * NGROUPS.Y) * TIDIG.X + BuildMI(Entry, Insert, DL, get(AMDGPU::V_MUL_U32_U24_e32), TIDReg) + .addReg(STmp1) + .addReg(TIDIGXReg); + // NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROPUS.Y * TIDIG.X) + BuildMI(Entry, Insert, DL, get(AMDGPU::V_MAD_U32_U24), TIDReg) + .addReg(STmp0) + .addReg(TIDIGYReg) + .addReg(TIDReg); + // (NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROPUS.Y * 
TIDIG.X)) + TIDIG.Z + BuildMI(Entry, Insert, DL, get(AMDGPU::V_ADD_I32_e32), TIDReg) + .addReg(TIDReg) + .addReg(TIDIGZReg); + } else { + // Get the wave id + BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_LO_U32_B32_e64), + TIDReg) + .addImm(-1) + .addImm(0); + + BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_HI_U32_B32_e32), + TIDReg) + .addImm(-1) + .addReg(TIDReg); + } + + BuildMI(Entry, Insert, DL, get(AMDGPU::V_LSHLREV_B32_e32), + TIDReg) + .addImm(2) + .addReg(TIDReg); + MFI->setTIDReg(TIDReg); } + + // Add FrameIndex to LDS offset + unsigned LDSOffset = MFI->LDSSize + (FrameOffset * WorkGroupSize); + BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), TmpReg) + .addImm(LDSOffset) + .addReg(TIDReg); + + return TmpReg; } void SIInstrInfo::insertNOPs(MachineBasicBlock::iterator MI, @@ -308,59 +652,11 @@ void SIInstrInfo::insertNOPs(MachineBasicBlock::iterator MI, } bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { - SIMachineFunctionInfo *MFI = - MI->getParent()->getParent()->getInfo<SIMachineFunctionInfo>(); MachineBasicBlock &MBB = *MI->getParent(); DebugLoc DL = MBB.findDebugLoc(MI); switch (MI->getOpcode()) { default: return AMDGPUInstrInfo::expandPostRAPseudo(MI); - // SGPR register spill - case AMDGPU::SI_SPILL_S512_SAVE: - case AMDGPU::SI_SPILL_S256_SAVE: - case AMDGPU::SI_SPILL_S128_SAVE: - case AMDGPU::SI_SPILL_S64_SAVE: { - unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode()); - unsigned FrameIndex = MI->getOperand(2).getImm(); - - for (unsigned i = 0, e = NumSubRegs; i < e; ++i) { - SIMachineFunctionInfo::SpilledReg Spill; - unsigned SubReg = RI.getPhysRegSubReg(MI->getOperand(1).getReg(), - &AMDGPU::SGPR_32RegClass, i); - Spill = MFI->SpillTracker.getSpilledReg(FrameIndex); - - BuildMI(MBB, MI, DL, get(AMDGPU::V_WRITELANE_B32), - MI->getOperand(0).getReg()) - .addReg(SubReg) - .addImm(Spill.Lane + i); - } - MI->eraseFromParent(); - break; - } - - // SGPR register restore - case AMDGPU::SI_SPILL_S512_RESTORE: - case 
AMDGPU::SI_SPILL_S256_RESTORE: - case AMDGPU::SI_SPILL_S128_RESTORE: - case AMDGPU::SI_SPILL_S64_RESTORE: - case AMDGPU::SI_SPILL_S32_RESTORE: { - unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode()); - - for (unsigned i = 0, e = NumSubRegs; i < e; ++i) { - SIMachineFunctionInfo::SpilledReg Spill; - unsigned FrameIndex = MI->getOperand(2).getImm(); - unsigned SubReg = RI.getPhysRegSubReg(MI->getOperand(0).getReg(), - &AMDGPU::SGPR_32RegClass, i); - Spill = MFI->SpillTracker.getSpilledReg(FrameIndex); - - BuildMI(MBB, MI, DL, get(AMDGPU::V_READLANE_B32), SubReg) - .addReg(MI->getOperand(1).getReg()) - .addImm(Spill.Lane + i); - } - insertNOPs(MI, 3); - MI->eraseFromParent(); - break; - } case AMDGPU::SI_CONSTDATA_PTR: { unsigned Reg = MI->getOperand(0).getReg(); unsigned RegLo = RI.getSubReg(Reg, AMDGPU::sub0); @@ -369,7 +665,7 @@ bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { BuildMI(MBB, MI, DL, get(AMDGPU::S_GETPC_B64), Reg); // Add 32-bit offset from this instruction to the start of the constant data. - BuildMI(MBB, MI, DL, get(AMDGPU::S_ADD_I32), RegLo) + BuildMI(MBB, MI, DL, get(AMDGPU::S_ADD_U32), RegLo) .addReg(RegLo) .addTargetIndex(AMDGPU::TI_CONSTDATA_START) .addReg(AMDGPU::SCC, RegState::Define | RegState::Implicit); @@ -381,6 +677,39 @@ bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { MI->eraseFromParent(); break; } + case AMDGPU::SGPR_USE: + // This is just a placeholder for register allocation. + MI->eraseFromParent(); + break; + + case AMDGPU::V_MOV_B64_PSEUDO: { + unsigned Dst = MI->getOperand(0).getReg(); + unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0); + unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1); + + const MachineOperand &SrcOp = MI->getOperand(1); + // FIXME: Will this work for 64-bit floating point immediates? 
+ assert(!SrcOp.isFPImm()); + if (SrcOp.isImm()) { + APInt Imm(64, SrcOp.getImm()); + BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo) + .addImm(Imm.getLoBits(32).getZExtValue()) + .addReg(Dst, RegState::Implicit); + BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi) + .addImm(Imm.getHiBits(32).getZExtValue()) + .addReg(Dst, RegState::Implicit); + } else { + assert(SrcOp.isReg()); + BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo) + .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0)) + .addReg(Dst, RegState::Implicit); + BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi) + .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1)) + .addReg(Dst, RegState::Implicit); + } + MI->eraseFromParent(); + break; + } } return true; } @@ -388,35 +717,66 @@ bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { MachineInstr *SIInstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const { - MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); - if (MI->getNumOperands() < 3 || !MI->getOperand(1).isReg()) + if (MI->getNumOperands() < 3) return nullptr; - // Cannot commute VOP2 if src0 is SGPR. - if (isVOP2(MI->getOpcode()) && MI->getOperand(1).isReg() && - RI.isSGPRClass(MRI.getRegClass(MI->getOperand(1).getReg()))) - return nullptr; + int Src0Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), + AMDGPU::OpName::src0); + assert(Src0Idx != -1 && "Should always have src0 operand"); - if (!MI->getOperand(2).isReg()) { - // XXX: Commute instructions with FPImm operands - if (NewMI || MI->getOperand(2).isFPImm() || + MachineOperand &Src0 = MI->getOperand(Src0Idx); + if (!Src0.isReg()) + return nullptr; + + int Src1Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), + AMDGPU::OpName::src1); + if (Src1Idx == -1) + return nullptr; + + MachineOperand &Src1 = MI->getOperand(Src1Idx); + + // Make sure it's legal to commute operands for VOP2. 
+ if (isVOP2(MI->getOpcode()) && + (!isOperandLegal(MI, Src0Idx, &Src1) || + !isOperandLegal(MI, Src1Idx, &Src0))) { + return nullptr; + } + + if (!Src1.isReg()) { + // Allow commuting instructions with Imm operands. + if (NewMI || !Src1.isImm() || (!isVOP2(MI->getOpcode()) && !isVOP3(MI->getOpcode()))) { return nullptr; } - // XXX: Commute VOP3 instructions with abs and neg set. - if (isVOP3(MI->getOpcode()) && - (MI->getOperand(AMDGPU::getNamedOperandIdx(MI->getOpcode(), - AMDGPU::OpName::abs)).getImm() || - MI->getOperand(AMDGPU::getNamedOperandIdx(MI->getOpcode(), - AMDGPU::OpName::neg)).getImm())) - return nullptr; + // Be sure to copy the source modifiers to the right place. + if (MachineOperand *Src0Mods + = getNamedOperand(*MI, AMDGPU::OpName::src0_modifiers)) { + MachineOperand *Src1Mods + = getNamedOperand(*MI, AMDGPU::OpName::src1_modifiers); + + int Src0ModsVal = Src0Mods->getImm(); + if (!Src1Mods && Src0ModsVal != 0) + return nullptr; + + // XXX - This assert might be a lie. It might be useful to have a neg + // modifier with 0.0. 
+ int Src1ModsVal = Src1Mods->getImm(); + assert((Src1ModsVal == 0) && "Not expecting modifiers with immediates"); + + Src1Mods->setImm(Src0ModsVal); + Src0Mods->setImm(Src1ModsVal); + } + + unsigned Reg = Src0.getReg(); + unsigned SubReg = Src0.getSubReg(); + if (Src1.isImm()) + Src0.ChangeToImmediate(Src1.getImm()); + else + llvm_unreachable("Should only have immediates"); - unsigned Reg = MI->getOperand(1).getReg(); - unsigned SubReg = MI->getOperand(1).getSubReg(); - MI->getOperand(1).ChangeToImmediate(MI->getOperand(2).getImm()); - MI->getOperand(2).ChangeToRegister(Reg, false); - MI->getOperand(2).setSubReg(SubReg); + Src1.ChangeToRegister(Reg, false); + Src1.setSubReg(SubReg); } else { MI = TargetInstrInfo::commuteInstruction(MI, NewMI); } @@ -427,6 +787,44 @@ MachineInstr *SIInstrInfo::commuteInstruction(MachineInstr *MI, return MI; } +// This needs to be implemented because the source modifiers may be inserted +// between the true commutable operands, and the base +// TargetInstrInfo::commuteInstruction uses it. +bool SIInstrInfo::findCommutedOpIndices(MachineInstr *MI, + unsigned &SrcOpIdx1, + unsigned &SrcOpIdx2) const { + const MCInstrDesc &MCID = MI->getDesc(); + if (!MCID.isCommutable()) + return false; + + unsigned Opc = MI->getOpcode(); + int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); + if (Src0Idx == -1) + return false; + + // FIXME: Workaround TargetInstrInfo::commuteInstruction asserting on + // immediate. + if (!MI->getOperand(Src0Idx).isReg()) + return false; + + int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); + if (Src1Idx == -1) + return false; + + if (!MI->getOperand(Src1Idx).isReg()) + return false; + + // If any source modifiers are set, the generic instruction commuting won't + // understand how to copy the source modifiers. 
+ if (hasModifiersSet(*MI, AMDGPU::OpName::src0_modifiers) || + hasModifiersSet(*MI, AMDGPU::OpName::src1_modifiers)) + return false; + + SrcOpIdx1 = Src0Idx; + SrcOpIdx2 = Src1Idx; + return true; +} + MachineInstr *SIInstrInfo::buildMovInstr(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, unsigned DstReg, @@ -463,51 +861,106 @@ SIInstrInfo::isTriviallyReMaterializable(const MachineInstr *MI, } } -namespace llvm { -namespace AMDGPU { -// Helper function generated by tablegen. We are wrapping this with -// an SIInstrInfo function that reutrns bool rather than int. -int isDS(uint16_t Opcode); -} +static bool offsetsDoNotOverlap(int WidthA, int OffsetA, + int WidthB, int OffsetB) { + int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB; + int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA; + int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB; + return LowOffset + LowWidth <= HighOffset; } -bool SIInstrInfo::isDS(uint16_t Opcode) const { - return ::AMDGPU::isDS(Opcode) != -1; -} +bool SIInstrInfo::checkInstOffsetsDoNotOverlap(MachineInstr *MIa, + MachineInstr *MIb) const { + unsigned BaseReg0, Offset0; + unsigned BaseReg1, Offset1; + + if (getLdStBaseRegImmOfs(MIa, BaseReg0, Offset0, &RI) && + getLdStBaseRegImmOfs(MIb, BaseReg1, Offset1, &RI)) { + assert(MIa->hasOneMemOperand() && MIb->hasOneMemOperand() && + "read2 / write2 not expected here yet"); + unsigned Width0 = (*MIa->memoperands_begin())->getSize(); + unsigned Width1 = (*MIb->memoperands_begin())->getSize(); + if (BaseReg0 == BaseReg1 && + offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1)) { + return true; + } + } -int SIInstrInfo::isMIMG(uint16_t Opcode) const { - return get(Opcode).TSFlags & SIInstrFlags::MIMG; + return false; } -int SIInstrInfo::isSMRD(uint16_t Opcode) const { - return get(Opcode).TSFlags & SIInstrFlags::SMRD; -} +bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr *MIa, + MachineInstr *MIb, + AliasAnalysis *AA) const { + unsigned Opc0 = MIa->getOpcode(); + 
unsigned Opc1 = MIb->getOpcode(); -bool SIInstrInfo::isVOP1(uint16_t Opcode) const { - return get(Opcode).TSFlags & SIInstrFlags::VOP1; -} + assert(MIa && (MIa->mayLoad() || MIa->mayStore()) && + "MIa must load from or modify a memory location"); + assert(MIb && (MIb->mayLoad() || MIb->mayStore()) && + "MIb must load from or modify a memory location"); -bool SIInstrInfo::isVOP2(uint16_t Opcode) const { - return get(Opcode).TSFlags & SIInstrFlags::VOP2; -} + if (MIa->hasUnmodeledSideEffects() || MIb->hasUnmodeledSideEffects()) + return false; -bool SIInstrInfo::isVOP3(uint16_t Opcode) const { - return get(Opcode).TSFlags & SIInstrFlags::VOP3; -} + // XXX - Can we relax this between address spaces? + if (MIa->hasOrderedMemoryRef() || MIb->hasOrderedMemoryRef()) + return false; -bool SIInstrInfo::isVOPC(uint16_t Opcode) const { - return get(Opcode).TSFlags & SIInstrFlags::VOPC; -} + // TODO: Should we check the address space from the MachineMemOperand? That + // would allow us to distinguish objects we know don't alias based on the + // underlying addres space, even if it was lowered to a different one, + // e.g. private accesses lowered to use MUBUF instructions on a scratch + // buffer. 
+ if (isDS(Opc0)) { + if (isDS(Opc1)) + return checkInstOffsetsDoNotOverlap(MIa, MIb); + + return !isFLAT(Opc1); + } + + if (isMUBUF(Opc0) || isMTBUF(Opc0)) { + if (isMUBUF(Opc1) || isMTBUF(Opc1)) + return checkInstOffsetsDoNotOverlap(MIa, MIb); + + return !isFLAT(Opc1) && !isSMRD(Opc1); + } + + if (isSMRD(Opc0)) { + if (isSMRD(Opc1)) + return checkInstOffsetsDoNotOverlap(MIa, MIb); -bool SIInstrInfo::isSALUInstr(const MachineInstr &MI) const { - return get(MI.getOpcode()).TSFlags & SIInstrFlags::SALU; + return !isFLAT(Opc1) && !isMUBUF(Opc0) && !isMTBUF(Opc0); + } + + if (isFLAT(Opc0)) { + if (isFLAT(Opc1)) + return checkInstOffsetsDoNotOverlap(MIa, MIb); + + return false; + } + + return false; } bool SIInstrInfo::isInlineConstant(const APInt &Imm) const { - int32_t Val = Imm.getSExtValue(); - if (Val >= -16 && Val <= 64) + int64_t SVal = Imm.getSExtValue(); + if (SVal >= -16 && SVal <= 64) return true; + if (Imm.getBitWidth() == 64) { + uint64_t Val = Imm.getZExtValue(); + return (DoubleToBits(0.0) == Val) || + (DoubleToBits(1.0) == Val) || + (DoubleToBits(-1.0) == Val) || + (DoubleToBits(0.5) == Val) || + (DoubleToBits(-0.5) == Val) || + (DoubleToBits(2.0) == Val) || + (DoubleToBits(-2.0) == Val) || + (DoubleToBits(4.0) == Val) || + (DoubleToBits(-4.0) == Val); + } + // The actual type of the operand does not seem to matter as long // as the bits match one of the inline immediate values. For example: // @@ -516,32 +969,28 @@ bool SIInstrInfo::isInlineConstant(const APInt &Imm) const { // // 1065353216 has the hexadecimal encoding 0x3f800000 which is 1.0f in // floating-point, so it is a legal inline immediate. 
- - return (APInt::floatToBits(0.0f) == Imm) || - (APInt::floatToBits(1.0f) == Imm) || - (APInt::floatToBits(-1.0f) == Imm) || - (APInt::floatToBits(0.5f) == Imm) || - (APInt::floatToBits(-0.5f) == Imm) || - (APInt::floatToBits(2.0f) == Imm) || - (APInt::floatToBits(-2.0f) == Imm) || - (APInt::floatToBits(4.0f) == Imm) || - (APInt::floatToBits(-4.0f) == Imm); + uint32_t Val = Imm.getZExtValue(); + + return (FloatToBits(0.0f) == Val) || + (FloatToBits(1.0f) == Val) || + (FloatToBits(-1.0f) == Val) || + (FloatToBits(0.5f) == Val) || + (FloatToBits(-0.5f) == Val) || + (FloatToBits(2.0f) == Val) || + (FloatToBits(-2.0f) == Val) || + (FloatToBits(4.0f) == Val) || + (FloatToBits(-4.0f) == Val); } bool SIInstrInfo::isInlineConstant(const MachineOperand &MO) const { if (MO.isImm()) return isInlineConstant(APInt(32, MO.getImm(), true)); - if (MO.isFPImm()) { - APFloat FpImm = MO.getFPImm()->getValueAPF(); - return isInlineConstant(FpImm.bitcastToAPInt()); - } - return false; } bool SIInstrInfo::isLiteralConstant(const MachineOperand &MO) const { - return (MO.isImm() || MO.isFPImm()) && !isInlineConstant(MO); + return MO.isImm() && !isInlineConstant(MO); } static bool compareMachineOp(const MachineOperand &Op0, @@ -554,8 +1003,6 @@ static bool compareMachineOp(const MachineOperand &Op0, return Op0.getReg() == Op1.getReg(); case MachineOperand::MO_Immediate: return Op0.getImm() == Op1.getImm(); - case MachineOperand::MO_FPImmediate: - return Op0.getFPImm() == Op1.getFPImm(); default: llvm_unreachable("Didn't expect to be comparing these operand types"); } @@ -565,7 +1012,7 @@ bool SIInstrInfo::isImmOperandLegal(const MachineInstr *MI, unsigned OpNo, const MachineOperand &MO) const { const MCOperandInfo &OpInfo = get(MI->getOpcode()).OpInfo[OpNo]; - assert(MO.isImm() || MO.isFPImm()); + assert(MO.isImm() || MO.isTargetIndex() || MO.isFI()); if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE) return true; @@ -573,16 +1020,91 @@ bool SIInstrInfo::isImmOperandLegal(const 
MachineInstr *MI, unsigned OpNo, if (OpInfo.RegClass < 0) return false; - return RI.regClassCanUseImmediate(OpInfo.RegClass); + if (isLiteralConstant(MO)) + return RI.opCanUseLiteralConstant(OpInfo.OperandType); + + return RI.opCanUseInlineConstant(OpInfo.OperandType); +} + +bool SIInstrInfo::canFoldOffset(unsigned OffsetSize, unsigned AS) const { + switch (AS) { + case AMDGPUAS::GLOBAL_ADDRESS: { + // MUBUF instructions a 12-bit offset in bytes. + return isUInt<12>(OffsetSize); + } + case AMDGPUAS::CONSTANT_ADDRESS: { + // SMRD instructions have an 8-bit offset in dwords on SI and + // a 20-bit offset in bytes on VI. + if (RI.ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) + return isUInt<20>(OffsetSize); + else + return (OffsetSize % 4 == 0) && isUInt<8>(OffsetSize / 4); + } + case AMDGPUAS::LOCAL_ADDRESS: + case AMDGPUAS::REGION_ADDRESS: { + // The single offset versions have a 16-bit offset in bytes. + return isUInt<16>(OffsetSize); + } + case AMDGPUAS::PRIVATE_ADDRESS: + // Indirect register addressing does not use any offsets. + default: + return 0; + } } bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const { return AMDGPU::getVOPe32(Opcode) != -1; } +bool SIInstrInfo::hasModifiers(unsigned Opcode) const { + // The src0_modifier operand is present on all instructions + // that have modifiers. + + return AMDGPU::getNamedOperandIdx(Opcode, + AMDGPU::OpName::src0_modifiers) != -1; +} + +bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI, + unsigned OpName) const { + const MachineOperand *Mods = getNamedOperand(MI, OpName); + return Mods && Mods->getImm(); +} + +bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI, + const MachineOperand &MO) const { + // Literal constants use the constant bus. 
+ if (isLiteralConstant(MO)) + return true; + + if (!MO.isReg() || !MO.isUse()) + return false; + + if (TargetRegisterInfo::isVirtualRegister(MO.getReg())) + return RI.isSGPRClass(MRI.getRegClass(MO.getReg())); + + // FLAT_SCR is just an SGPR pair. + if (!MO.isImplicit() && (MO.getReg() == AMDGPU::FLAT_SCR)) + return true; + + // EXEC register uses the constant bus. + if (!MO.isImplicit() && MO.getReg() == AMDGPU::EXEC) + return true; + + // SGPRs use the constant bus + if (MO.getReg() == AMDGPU::M0 || MO.getReg() == AMDGPU::VCC || + (!MO.isImplicit() && + (AMDGPU::SGPR_32RegClass.contains(MO.getReg()) || + AMDGPU::SGPR_64RegClass.contains(MO.getReg())))) { + return true; + } + + return false; +} + bool SIInstrInfo::verifyInstruction(const MachineInstr *MI, StringRef &ErrInfo) const { uint16_t Opcode = MI->getOpcode(); + const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0); int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1); int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2); @@ -596,23 +1118,27 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI, } // Make sure the register classes are correct - for (unsigned i = 0, e = Desc.getNumOperands(); i != e; ++i) { + for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) { + if (MI->getOperand(i).isFPImm()) { + ErrInfo = "FPImm Machine Operands are not supported. 
ISel should bitcast " + "all fp values to integers."; + return false; + } + switch (Desc.OpInfo[i].OperandType) { case MCOI::OPERAND_REGISTER: { - int RegClass = Desc.OpInfo[i].RegClass; - if (!RI.regClassCanUseImmediate(RegClass) && - (MI->getOperand(i).isImm() || MI->getOperand(i).isFPImm())) { - ErrInfo = "Expected register, but got immediate"; - return false; + if (MI->getOperand(i).isImm() && + !isImmOperandLegal(MI, i, MI->getOperand(i))) { + ErrInfo = "Illegal immediate value for operand."; + return false; + } } - } break; case MCOI::OPERAND_IMMEDIATE: // Check if this operand is an immediate. // FrameIndex operands will be replaced by immediates, so they are // allowed. - if (!MI->getOperand(i).isImm() && !MI->getOperand(i).isFPImm() && - !MI->getOperand(i).isFI()) { + if (!MI->getOperand(i).isImm() && !MI->getOperand(i).isFI()) { ErrInfo = "Expected immediate, but got non-immediate"; return false; } @@ -641,31 +1167,27 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI, // Verify VOP* if (isVOP1(Opcode) || isVOP2(Opcode) || isVOP3(Opcode) || isVOPC(Opcode)) { + // Only look at the true operands. Only a real operand can use the constant + // bus, and we don't want to check pseudo-operands like the source modifier + // flags. + const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx }; + unsigned ConstantBusCount = 0; unsigned SGPRUsed = AMDGPU::NoRegister; - for (int i = 0, e = MI->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = MI->getOperand(i); - if (MO.isReg() && MO.isUse() && - !TargetRegisterInfo::isVirtualRegister(MO.getReg())) { - - // EXEC register uses the constant bus. 
- if (!MO.isImplicit() && MO.getReg() == AMDGPU::EXEC) - ++ConstantBusCount; + for (int OpIdx : OpIndices) { + if (OpIdx == -1) + break; - // SGPRs use the constant bus - if (MO.getReg() == AMDGPU::M0 || MO.getReg() == AMDGPU::VCC || - (!MO.isImplicit() && - (AMDGPU::SGPR_32RegClass.contains(MO.getReg()) || - AMDGPU::SGPR_64RegClass.contains(MO.getReg())))) { - if (SGPRUsed != MO.getReg()) { + const MachineOperand &MO = MI->getOperand(OpIdx); + if (usesConstantBus(MRI, MO)) { + if (MO.isReg()) { + if (MO.getReg() != SGPRUsed) ++ConstantBusCount; - SGPRUsed = MO.getReg(); - } + SGPRUsed = MO.getReg(); + } else { + ++ConstantBusCount; } } - // Literal constants use the constant bus. - if (isLiteralConstant(MO)) - ++ConstantBusCount; } if (ConstantBusCount > 1) { ErrInfo = "VOP* instruction uses the constant bus more than once"; @@ -676,7 +1198,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI, // Verify SRC1 for VOP2 and VOPC if (Src1Idx != -1 && (isVOP2(Opcode) || isVOPC(Opcode))) { const MachineOperand &Src1 = MI->getOperand(Src1Idx); - if (Src1.isImm() || Src1.isFPImm()) { + if (Src1.isImm()) { ErrInfo = "VOP[2C] src1 cannot be an immediate."; return false; } @@ -701,11 +1223,9 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI, // Verify misc. restrictions on specific instructions. 
if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32 || Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64) { - MI->dump(); - - const MachineOperand &Src0 = MI->getOperand(2); - const MachineOperand &Src1 = MI->getOperand(3); - const MachineOperand &Src2 = MI->getOperand(4); + const MachineOperand &Src0 = MI->getOperand(Src0Idx); + const MachineOperand &Src1 = MI->getOperand(Src1Idx); + const MachineOperand &Src2 = MI->getOperand(Src2Idx); if (Src0.isReg() && Src1.isReg() && Src2.isReg()) { if (!compareMachineOp(Src0, Src1) && !compareMachineOp(Src0, Src2)) { @@ -728,10 +1248,13 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) { case AMDGPU::S_MOV_B32: return MI.getOperand(1).isReg() ? AMDGPU::COPY : AMDGPU::V_MOV_B32_e32; - case AMDGPU::S_ADD_I32: return AMDGPU::V_ADD_I32_e32; + case AMDGPU::S_ADD_I32: + case AMDGPU::S_ADD_U32: return AMDGPU::V_ADD_I32_e32; case AMDGPU::S_ADDC_U32: return AMDGPU::V_ADDC_U32_e32; - case AMDGPU::S_SUB_I32: return AMDGPU::V_SUB_I32_e32; + case AMDGPU::S_SUB_I32: + case AMDGPU::S_SUB_U32: return AMDGPU::V_SUB_I32_e32; case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32; + case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_I32; case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e32; case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e32; case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e32; @@ -779,8 +1302,13 @@ const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI, const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); const MCInstrDesc &Desc = get(MI.getOpcode()); if (MI.isVariadic() || OpNo >= Desc.getNumOperands() || - Desc.OpInfo[OpNo].RegClass == -1) - return MRI.getRegClass(MI.getOperand(OpNo).getReg()); + Desc.OpInfo[OpNo].RegClass == -1) { + unsigned Reg = MI.getOperand(OpNo).getReg(); + + if (TargetRegisterInfo::isVirtualRegister(Reg)) + return MRI.getRegClass(Reg); + return RI.getRegClass(Reg); + } unsigned RCID = Desc.OpInfo[OpNo].RegClass; return RI.getRegClass(RCID); @@ -800,21 +1328,28 
@@ bool SIInstrInfo::canReadVGPR(const MachineInstr &MI, unsigned OpNo) const { void SIInstrInfo::legalizeOpWithMove(MachineInstr *MI, unsigned OpIdx) const { MachineBasicBlock::iterator I = MI; + MachineBasicBlock *MBB = MI->getParent(); MachineOperand &MO = MI->getOperand(OpIdx); - MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); + MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); unsigned RCID = get(MI->getOpcode()).OpInfo[OpIdx].RegClass; const TargetRegisterClass *RC = RI.getRegClass(RCID); unsigned Opcode = AMDGPU::V_MOV_B32_e32; - if (MO.isReg()) { + if (MO.isReg()) Opcode = AMDGPU::COPY; - } else if (RI.isSGPRClass(RC)) { + else if (RI.isSGPRClass(RC)) Opcode = AMDGPU::S_MOV_B32; - } + const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC); + if (RI.getCommonSubClass(&AMDGPU::VReg_64RegClass, VRC)) + VRC = &AMDGPU::VReg_64RegClass; + else + VRC = &AMDGPU::VGPR_32RegClass; + unsigned Reg = MRI.createVirtualRegister(VRC); - BuildMI(*MI->getParent(), I, MI->getParent()->findDebugLoc(I), get(Opcode), - Reg).addOperand(MO); + DebugLoc DL = MBB->findDebugLoc(I); + BuildMI(*MI->getParent(), I, DL, get(Opcode), Reg) + .addOperand(MO); MO.ChangeToRegister(Reg, false); } @@ -834,13 +1369,15 @@ unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI, // value so we don't need to worry about merging its subreg index with the // SubIdx passed to this function. The register coalescer should be able to // eliminate this extra copy. 
- BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), get(TargetOpcode::COPY), - NewSuperReg) - .addOperand(SuperReg); + MachineBasicBlock *MBB = MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); + + BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), NewSuperReg) + .addReg(SuperReg.getReg(), 0, SuperReg.getSubReg()); + + BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg) + .addReg(NewSuperReg, 0, SubIdx); - BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), get(TargetOpcode::COPY), - SubReg) - .addReg(NewSuperReg, 0, SubIdx); return SubReg; } @@ -896,8 +1433,68 @@ unsigned SIInstrInfo::split64BitImm(SmallVectorImpl<MachineInstr *> &Worklist, return Dst; } +// Change the order of operands from (0, 1, 2) to (0, 2, 1) +void SIInstrInfo::swapOperands(MachineBasicBlock::iterator Inst) const { + assert(Inst->getNumExplicitOperands() == 3); + MachineOperand Op1 = Inst->getOperand(1); + Inst->RemoveOperand(1); + Inst->addOperand(Op1); +} + +bool SIInstrInfo::isOperandLegal(const MachineInstr *MI, unsigned OpIdx, + const MachineOperand *MO) const { + const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); + const MCInstrDesc &InstDesc = get(MI->getOpcode()); + const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpIdx]; + const TargetRegisterClass *DefinedRC = + OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr; + if (!MO) + MO = &MI->getOperand(OpIdx); + + if (isVALU(InstDesc.Opcode) && usesConstantBus(MRI, *MO)) { + unsigned SGPRUsed = + MO->isReg() ? MO->getReg() : (unsigned)AMDGPU::NoRegister; + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + if (i == OpIdx) + continue; + if (usesConstantBus(MRI, MI->getOperand(i)) && + MI->getOperand(i).isReg() && MI->getOperand(i).getReg() != SGPRUsed) { + return false; + } + } + } + + if (MO->isReg()) { + assert(DefinedRC); + const TargetRegisterClass *RC = MRI.getRegClass(MO->getReg()); + + // In order to be legal, the common sub-class must be equal to the + // class of the current operand. 
For example: + // + // v_mov_b32 s0 ; Operand defined as vsrc_32 + // ; RI.getCommonSubClass(s0,vsrc_32) = sgpr ; LEGAL + // + // s_sendmsg 0, s0 ; Operand defined as m0reg + // ; RI.getCommonSubClass(s0,m0reg) = m0reg ; NOT LEGAL + + return RI.getCommonSubClass(RC, RI.getRegClass(OpInfo.RegClass)) == RC; + } + + + // Handle non-register types that are treated like immediates. + assert(MO->isImm() || MO->isTargetIndex() || MO->isFI()); + + if (!DefinedRC) { + // This operand expects an immediate. + return true; + } + + return isImmOperandLegal(MI, OpIdx, *MO); +} + void SIInstrInfo::legalizeOperands(MachineInstr *MI) const { MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); + int Src0Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::src0); int Src1Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), @@ -907,45 +1504,40 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const { // Legalize VOP2 if (isVOP2(MI->getOpcode()) && Src1Idx != -1) { - MachineOperand &Src0 = MI->getOperand(Src0Idx); - MachineOperand &Src1 = MI->getOperand(Src1Idx); - - // If the instruction implicitly reads VCC, we can't have any SGPR operands, - // so move any. - bool ReadsVCC = MI->readsRegister(AMDGPU::VCC, &RI); - if (ReadsVCC && Src0.isReg() && - RI.isSGPRClass(MRI.getRegClass(Src0.getReg()))) { + // Legalize src0 + if (!isOperandLegal(MI, Src0Idx)) legalizeOpWithMove(MI, Src0Idx); - return; - } - if (ReadsVCC && Src1.isReg() && - RI.isSGPRClass(MRI.getRegClass(Src1.getReg()))) { - legalizeOpWithMove(MI, Src1Idx); + // Legalize src1 + if (isOperandLegal(MI, Src1Idx)) return; - } - // Legalize VOP2 instructions where src1 is not a VGPR. An SGPR input must - // be the first operand, and there can only be one. 
- if (Src1.isImm() || Src1.isFPImm() || - (Src1.isReg() && RI.isSGPRClass(MRI.getRegClass(Src1.getReg())))) { - if (MI->isCommutable()) { - if (commuteInstruction(MI)) - return; - } - legalizeOpWithMove(MI, Src1Idx); + // Usually src0 of VOP2 instructions allow more types of inputs + // than src1, so try to commute the instruction to decrease our + // chances of having to insert a MOV instruction to legalize src1. + if (MI->isCommutable()) { + if (commuteInstruction(MI)) + // If we are successful in commuting, then we know MI is legal, so + // we are done. + return; } + + legalizeOpWithMove(MI, Src1Idx); + return; } // XXX - Do any VOP3 instructions read VCC? // Legalize VOP3 if (isVOP3(MI->getOpcode())) { - int VOP3Idx[3] = {Src0Idx, Src1Idx, Src2Idx}; - unsigned SGPRReg = AMDGPU::NoRegister; + int VOP3Idx[3] = { Src0Idx, Src1Idx, Src2Idx }; + + // Find the one SGPR operand we are allowed to use. + unsigned SGPRReg = findUsedSGPR(MI, VOP3Idx); + for (unsigned i = 0; i < 3; ++i) { int Idx = VOP3Idx[i]; if (Idx == -1) - continue; + break; MachineOperand &MO = MI->getOperand(Idx); if (MO.isReg()) { @@ -1045,106 +1637,216 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const { // Legalize MUBUF* instructions // FIXME: If we start using the non-addr64 instructions for compute, we // may need to legalize them here. + int SRsrcIdx = + AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::srsrc); + if (SRsrcIdx != -1) { + // We have an MUBUF instruction + MachineOperand *SRsrc = &MI->getOperand(SRsrcIdx); + unsigned SRsrcRC = get(MI->getOpcode()).OpInfo[SRsrcIdx].RegClass; + if (RI.getCommonSubClass(MRI.getRegClass(SRsrc->getReg()), + RI.getRegClass(SRsrcRC))) { + // The operands are legal. + // FIXME: We may need to legalize operands besided srsrc. 
+ return; + } - int SRsrcIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), - AMDGPU::OpName::srsrc); - int VAddrIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), - AMDGPU::OpName::vaddr); - if (SRsrcIdx != -1 && VAddrIdx != -1) { - const TargetRegisterClass *VAddrRC = - RI.getRegClass(get(MI->getOpcode()).OpInfo[VAddrIdx].RegClass); - - if(VAddrRC->getSize() == 8 && - MRI.getRegClass(MI->getOperand(SRsrcIdx).getReg()) != VAddrRC) { - // We have a MUBUF instruction that uses a 64-bit vaddr register and - // srsrc has the incorrect register class. In order to fix this, we - // need to extract the pointer from the resource descriptor (srsrc), - // add it to the value of vadd, then store the result in the vaddr - // operand. Then, we need to set the pointer field of the resource - // descriptor to zero. + MachineBasicBlock &MBB = *MI->getParent(); + // Extract the the ptr from the resource descriptor. - MachineBasicBlock &MBB = *MI->getParent(); - MachineOperand &SRsrcOp = MI->getOperand(SRsrcIdx); - MachineOperand &VAddrOp = MI->getOperand(VAddrIdx); - unsigned SRsrcPtrLo, SRsrcPtrHi, VAddrLo, VAddrHi; - unsigned NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass); - unsigned NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass); - unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); - unsigned Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); - unsigned SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); - unsigned SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); - unsigned NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass); - - // SRsrcPtrLo = srsrc:sub0 - SRsrcPtrLo = buildExtractSubReg(MI, MRI, SRsrcOp, - &AMDGPU::VReg_128RegClass, AMDGPU::sub0, &AMDGPU::VReg_32RegClass); - - // SRsrcPtrHi = srsrc:sub1 - SRsrcPtrHi = buildExtractSubReg(MI, MRI, SRsrcOp, - &AMDGPU::VReg_128RegClass, AMDGPU::sub1, &AMDGPU::VReg_32RegClass); - - // VAddrLo = vaddr:sub0 - VAddrLo 
= buildExtractSubReg(MI, MRI, VAddrOp, - &AMDGPU::VReg_64RegClass, AMDGPU::sub0, &AMDGPU::VReg_32RegClass); - - // VAddrHi = vaddr:sub1 - VAddrHi = buildExtractSubReg(MI, MRI, VAddrOp, - &AMDGPU::VReg_64RegClass, AMDGPU::sub1, &AMDGPU::VReg_32RegClass); - - // NewVaddrLo = SRsrcPtrLo + VAddrLo + // SRsrcPtrLo = srsrc:sub0 + unsigned SRsrcPtrLo = buildExtractSubReg(MI, MRI, *SRsrc, + &AMDGPU::VReg_128RegClass, AMDGPU::sub0, &AMDGPU::VGPR_32RegClass); + + // SRsrcPtrHi = srsrc:sub1 + unsigned SRsrcPtrHi = buildExtractSubReg(MI, MRI, *SRsrc, + &AMDGPU::VReg_128RegClass, AMDGPU::sub1, &AMDGPU::VGPR_32RegClass); + + // Create an empty resource descriptor + unsigned Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + unsigned SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + unsigned SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + unsigned NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass); + uint64_t RsrcDataFormat = getDefaultRsrcDataFormat(); + + // Zero64 = 0 + BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B64), + Zero64) + .addImm(0); + + // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0} + BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), + SRsrcFormatLo) + .addImm(RsrcDataFormat & 0xFFFFFFFF); + + // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32} + BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), + SRsrcFormatHi) + .addImm(RsrcDataFormat >> 32); + + // NewSRsrc = {Zero64, SRsrcFormat} + BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), + NewSRsrc) + .addReg(Zero64) + .addImm(AMDGPU::sub0_sub1) + .addReg(SRsrcFormatLo) + .addImm(AMDGPU::sub2) + .addReg(SRsrcFormatHi) + .addImm(AMDGPU::sub3); + + MachineOperand *VAddr = getNamedOperand(*MI, AMDGPU::OpName::vaddr); + unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); + unsigned NewVAddrLo; + unsigned NewVAddrHi; + if (VAddr) { + // This is already an ADDR64 instruction so we need to add the pointer + 
// extracted from the resource descriptor to the current value of VAddr. + NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + + // NewVaddrLo = SRsrcPtrLo + VAddr:sub0 BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::V_ADD_I32_e32), NewVAddrLo) .addReg(SRsrcPtrLo) - .addReg(VAddrLo) - .addReg(AMDGPU::VCC, RegState::Define | RegState::Implicit); + .addReg(VAddr->getReg(), 0, AMDGPU::sub0) + .addReg(AMDGPU::VCC, RegState::ImplicitDefine); - // NewVaddrHi = SRsrcPtrHi + VAddrHi + // NewVaddrHi = SRsrcPtrHi + VAddr:sub1 BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::V_ADDC_U32_e32), NewVAddrHi) .addReg(SRsrcPtrHi) - .addReg(VAddrHi) + .addReg(VAddr->getReg(), 0, AMDGPU::sub1) .addReg(AMDGPU::VCC, RegState::ImplicitDefine) .addReg(AMDGPU::VCC, RegState::Implicit); - // NewVaddr = {NewVaddrHi, NewVaddrLo} - BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), - NewVAddr) - .addReg(NewVAddrLo) - .addImm(AMDGPU::sub0) - .addReg(NewVAddrHi) - .addImm(AMDGPU::sub1); + } else { + // This instructions is the _OFFSET variant, so we need to convert it to + // ADDR64. + MachineOperand *VData = getNamedOperand(*MI, AMDGPU::OpName::vdata); + MachineOperand *Offset = getNamedOperand(*MI, AMDGPU::OpName::offset); + MachineOperand *SOffset = getNamedOperand(*MI, AMDGPU::OpName::soffset); + assert(SOffset->isImm() && SOffset->getImm() == 0 && "Legalizing MUBUF " + "with non-zero soffset is not implemented"); + (void)SOffset; + + // Create the new instruction. + unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI->getOpcode()); + MachineInstr *Addr64 = + BuildMI(MBB, MI, MI->getDebugLoc(), get(Addr64Opcode)) + .addOperand(*VData) + .addOperand(*SRsrc) + .addReg(AMDGPU::NoRegister) // Dummy value for vaddr. + // This will be replaced later + // with the new value of vaddr. 
+ .addOperand(*Offset); + + MI->removeFromParent(); + MI = Addr64; + + NewVAddrLo = SRsrcPtrLo; + NewVAddrHi = SRsrcPtrHi; + VAddr = getNamedOperand(*MI, AMDGPU::OpName::vaddr); + SRsrc = getNamedOperand(*MI, AMDGPU::OpName::srsrc); + } - // Zero64 = 0 - BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B64), - Zero64) - .addImm(0); + // NewVaddr = {NewVaddrHi, NewVaddrLo} + BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), + NewVAddr) + .addReg(NewVAddrLo) + .addImm(AMDGPU::sub0) + .addReg(NewVAddrHi) + .addImm(AMDGPU::sub1); - // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0} - BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), - SRsrcFormatLo) - .addImm(AMDGPU::RSRC_DATA_FORMAT & 0xFFFFFFFF); - - // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32} - BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), - SRsrcFormatHi) - .addImm(AMDGPU::RSRC_DATA_FORMAT >> 32); - - // NewSRsrc = {Zero64, SRsrcFormat} - BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), - NewSRsrc) - .addReg(Zero64) - .addImm(AMDGPU::sub0_sub1) - .addReg(SRsrcFormatLo) - .addImm(AMDGPU::sub2) - .addReg(SRsrcFormatHi) - .addImm(AMDGPU::sub3); - // Update the instruction to use NewVaddr - MI->getOperand(VAddrIdx).setReg(NewVAddr); - // Update the instruction to use NewSRsrc - MI->getOperand(SRsrcIdx).setReg(NewSRsrc); + // Update the instruction to use NewVaddr + VAddr->setReg(NewVAddr); + // Update the instruction to use NewSRsrc + SRsrc->setReg(NewSRsrc); + } +} + +void SIInstrInfo::splitSMRD(MachineInstr *MI, + const TargetRegisterClass *HalfRC, + unsigned HalfImmOp, unsigned HalfSGPROp, + MachineInstr *&Lo, MachineInstr *&Hi) const { + + DebugLoc DL = MI->getDebugLoc(); + MachineBasicBlock *MBB = MI->getParent(); + MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); + unsigned RegLo = MRI.createVirtualRegister(HalfRC); + unsigned RegHi = MRI.createVirtualRegister(HalfRC); + unsigned HalfSize = HalfRC->getSize(); + const MachineOperand *OffOp = + 
getNamedOperand(*MI, AMDGPU::OpName::offset); + const MachineOperand *SBase = getNamedOperand(*MI, AMDGPU::OpName::sbase); + + // The SMRD has an 8-bit offset in dwords on SI and a 20-bit offset in bytes + // on VI. + if (OffOp) { + bool isVI = RI.ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS; + unsigned OffScale = isVI ? 1 : 4; + // Handle the _IMM variant + unsigned LoOffset = OffOp->getImm() * OffScale; + unsigned HiOffset = LoOffset + HalfSize; + Lo = BuildMI(*MBB, MI, DL, get(HalfImmOp), RegLo) + .addOperand(*SBase) + .addImm(LoOffset / OffScale); + + if (!isUInt<20>(HiOffset) || (!isVI && !isUInt<8>(HiOffset / OffScale))) { + unsigned OffsetSGPR = + MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + BuildMI(*MBB, MI, DL, get(AMDGPU::S_MOV_B32), OffsetSGPR) + .addImm(HiOffset); // The offset in register is in bytes. + Hi = BuildMI(*MBB, MI, DL, get(HalfSGPROp), RegHi) + .addOperand(*SBase) + .addReg(OffsetSGPR); + } else { + Hi = BuildMI(*MBB, MI, DL, get(HalfImmOp), RegHi) + .addOperand(*SBase) + .addImm(HiOffset / OffScale); } + } else { + // Handle the _SGPR variant + MachineOperand *SOff = getNamedOperand(*MI, AMDGPU::OpName::soff); + Lo = BuildMI(*MBB, MI, DL, get(HalfSGPROp), RegLo) + .addOperand(*SBase) + .addOperand(*SOff); + unsigned OffsetSGPR = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + BuildMI(*MBB, MI, DL, get(AMDGPU::S_ADD_I32), OffsetSGPR) + .addOperand(*SOff) + .addImm(HalfSize); + Hi = BuildMI(*MBB, MI, DL, get(HalfSGPROp)) + .addOperand(*SBase) + .addReg(OffsetSGPR); + } + + unsigned SubLo, SubHi; + switch (HalfSize) { + case 4: + SubLo = AMDGPU::sub0; + SubHi = AMDGPU::sub1; + break; + case 8: + SubLo = AMDGPU::sub0_sub1; + SubHi = AMDGPU::sub2_sub3; + break; + case 16: + SubLo = AMDGPU::sub0_sub1_sub2_sub3; + SubHi = AMDGPU::sub4_sub5_sub6_sub7; + break; + case 32: + SubLo = AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7; + SubHi = AMDGPU::sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15; + break; + default: + 
llvm_unreachable("Unhandled HalfSize"); } + + BuildMI(*MBB, MI, DL, get(AMDGPU::REG_SEQUENCE)) + .addOperand(MI->getOperand(0)) + .addReg(RegLo) + .addImm(SubLo) + .addReg(RegHi) + .addImm(SubHi); } void SIInstrInfo::moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI) const { @@ -1155,7 +1857,7 @@ void SIInstrInfo::moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI) con case AMDGPU::S_LOAD_DWORDX2_IMM: case AMDGPU::S_LOAD_DWORDX2_SGPR: case AMDGPU::S_LOAD_DWORDX4_IMM: - case AMDGPU::S_LOAD_DWORDX4_SGPR: + case AMDGPU::S_LOAD_DWORDX4_SGPR: { unsigned NewOpcode = getVALUOp(*MI); unsigned RegOffset; unsigned ImmOffset; @@ -1165,10 +1867,13 @@ void SIInstrInfo::moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI) con ImmOffset = 0; } else { assert(MI->getOperand(2).isImm()); - // SMRD instructions take a dword offsets and MUBUF instructions - // take a byte offset. - ImmOffset = MI->getOperand(2).getImm() << 2; + // SMRD instructions take a dword offsets on SI and byte offset on VI + // and MUBUF instructions always take a byte offset. 
+ ImmOffset = MI->getOperand(2).getImm(); + if (RI.ST.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS) + ImmOffset <<= 2; RegOffset = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + if (isUInt<12>(ImmOffset)) { BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), RegOffset) @@ -1186,13 +1891,14 @@ void SIInstrInfo::moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI) con unsigned DWord1 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); unsigned DWord2 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); unsigned DWord3 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + uint64_t RsrcDataFormat = getDefaultRsrcDataFormat(); BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord1) .addImm(0); BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord2) - .addImm(AMDGPU::RSRC_DATA_FORMAT & 0xFFFFFFFF); + .addImm(RsrcDataFormat & 0xFFFFFFFF); BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord3) - .addImm(AMDGPU::RSRC_DATA_FORMAT >> 32); + .addImm(RsrcDataFormat >> 32); BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), SRsrc) .addReg(DWord0) .addImm(AMDGPU::sub0) @@ -1202,14 +1908,44 @@ void SIInstrInfo::moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI) con .addImm(AMDGPU::sub2) .addReg(DWord3) .addImm(AMDGPU::sub3); - MI->setDesc(get(NewOpcode)); - if (MI->getOperand(2).isReg()) { - MI->getOperand(2).setReg(MI->getOperand(1).getReg()); - } else { - MI->getOperand(2).ChangeToRegister(MI->getOperand(1).getReg(), false); - } - MI->getOperand(1).setReg(SRsrc); - MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(ImmOffset)); + MI->setDesc(get(NewOpcode)); + if (MI->getOperand(2).isReg()) { + MI->getOperand(2).setReg(MI->getOperand(1).getReg()); + } else { + MI->getOperand(2).ChangeToRegister(MI->getOperand(1).getReg(), false); + } + MI->getOperand(1).setReg(SRsrc); + MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(ImmOffset)); + + const TargetRegisterClass 
*NewDstRC = + RI.getRegClass(get(NewOpcode).OpInfo[0].RegClass); + + unsigned DstReg = MI->getOperand(0).getReg(); + unsigned NewDstReg = MRI.createVirtualRegister(NewDstRC); + MRI.replaceRegWith(DstReg, NewDstReg); + break; + } + case AMDGPU::S_LOAD_DWORDX8_IMM: + case AMDGPU::S_LOAD_DWORDX8_SGPR: { + MachineInstr *Lo, *Hi; + splitSMRD(MI, &AMDGPU::SReg_128RegClass, AMDGPU::S_LOAD_DWORDX4_IMM, + AMDGPU::S_LOAD_DWORDX4_SGPR, Lo, Hi); + MI->eraseFromParent(); + moveSMRDToVALU(Lo, MRI); + moveSMRDToVALU(Hi, MRI); + break; + } + + case AMDGPU::S_LOAD_DWORDX16_IMM: + case AMDGPU::S_LOAD_DWORDX16_SGPR: { + MachineInstr *Lo, *Hi; + splitSMRD(MI, &AMDGPU::SReg_256RegClass, AMDGPU::S_LOAD_DWORDX8_IMM, + AMDGPU::S_LOAD_DWORDX8_SGPR, Lo, Hi); + MI->eraseFromParent(); + moveSMRDToVALU(Lo, MRI); + moveSMRDToVALU(Hi, MRI); + break; + } } } @@ -1281,8 +2017,32 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { Inst->eraseFromParent(); continue; + case AMDGPU::S_BFE_I64: { + splitScalar64BitBFE(Worklist, Inst); + Inst->eraseFromParent(); + continue; + } + + case AMDGPU::S_LSHL_B32: + if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + NewOpcode = AMDGPU::V_LSHLREV_B32_e64; + swapOperands(Inst); + } + break; + case AMDGPU::S_ASHR_I32: + if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + NewOpcode = AMDGPU::V_ASHRREV_I32_e64; + swapOperands(Inst); + } + break; + case AMDGPU::S_LSHR_B32: + if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + NewOpcode = AMDGPU::V_LSHRREV_B32_e64; + swapOperands(Inst); + } + break; + case AMDGPU::S_BFE_U64: - case AMDGPU::S_BFE_I64: case AMDGPU::S_BFM_B64: llvm_unreachable("Moving this op to VALU not implemented"); } @@ -1311,17 +2071,9 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { // We are converting these to a BFE, so we need to add the missing // operands for the size and offset. unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 
8 : 16; - Inst->addOperand(Inst->getOperand(1)); - Inst->getOperand(1).ChangeToImmediate(0); - Inst->addOperand(MachineOperand::CreateImm(0)); - Inst->addOperand(MachineOperand::CreateImm(0)); Inst->addOperand(MachineOperand::CreateImm(0)); Inst->addOperand(MachineOperand::CreateImm(Size)); - // XXX - Other pointless operands. There are 4, but it seems you only need - // 3 to not hit an assertion later in MCInstLower. - Inst->addOperand(MachineOperand::CreateImm(0)); - Inst->addOperand(MachineOperand::CreateImm(0)); } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) { // The VALU version adds the second operand to the result, so insert an // extra 0 operand. @@ -1340,16 +2092,9 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { uint32_t Offset = Imm & 0x3f; // Extract bits [5:0]. uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16]. - Inst->RemoveOperand(2); // Remove old immediate. - Inst->addOperand(Inst->getOperand(1)); - Inst->getOperand(1).ChangeToImmediate(0); - Inst->addOperand(MachineOperand::CreateImm(0)); Inst->addOperand(MachineOperand::CreateImm(Offset)); - Inst->addOperand(MachineOperand::CreateImm(0)); Inst->addOperand(MachineOperand::CreateImm(BitWidth)); - Inst->addOperand(MachineOperand::CreateImm(0)); - Inst->addOperand(MachineOperand::CreateImm(0)); } // Update the destination register class. 
@@ -1403,7 +2148,7 @@ unsigned SIInstrInfo::calculateIndirectAddress(unsigned RegIndex, } const TargetRegisterClass *SIInstrInfo::getIndirectAddrRegClass() const { - return &AMDGPU::VReg_32RegClass; + return &AMDGPU::VGPR_32RegClass; } void SIInstrInfo::splitScalar64BitUnaryOp( @@ -1562,6 +2307,67 @@ void SIInstrInfo::splitScalar64BitBCNT(SmallVectorImpl<MachineInstr *> &Worklist Worklist.push_back(Second); } +void SIInstrInfo::splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist, + MachineInstr *Inst) const { + MachineBasicBlock &MBB = *Inst->getParent(); + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + MachineBasicBlock::iterator MII = Inst; + DebugLoc DL = Inst->getDebugLoc(); + + MachineOperand &Dest = Inst->getOperand(0); + uint32_t Imm = Inst->getOperand(2).getImm(); + uint32_t Offset = Imm & 0x3f; // Extract bits [5:0]. + uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16]. + + (void) Offset; + + // Only sext_inreg cases handled. + assert(Inst->getOpcode() == AMDGPU::S_BFE_I64 && + BitWidth <= 32 && + Offset == 0 && + "Not implemented"); + + if (BitWidth < 32) { + unsigned MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + unsigned MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); + + BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32), MidRegLo) + .addReg(Inst->getOperand(1).getReg(), 0, AMDGPU::sub0) + .addImm(0) + .addImm(BitWidth); + + BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi) + .addImm(31) + .addReg(MidRegLo); + + BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg) + .addReg(MidRegLo) + .addImm(AMDGPU::sub0) + .addReg(MidRegHi) + .addImm(AMDGPU::sub1); + + MRI.replaceRegWith(Dest.getReg(), ResultReg); + return; + } + + MachineOperand &Src = Inst->getOperand(1); + unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + unsigned ResultReg = 
MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); + + BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg) + .addImm(31) + .addReg(Src.getReg(), 0, AMDGPU::sub0); + + BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg) + .addReg(Src.getReg(), 0, AMDGPU::sub0) + .addImm(AMDGPU::sub0) + .addReg(TmpReg) + .addImm(AMDGPU::sub1); + + MRI.replaceRegWith(Dest.getReg(), ResultReg); +} + void SIInstrInfo::addDescImplicitUseDef(const MCInstrDesc &NewDesc, MachineInstr *Inst) const { // Add the implict and explicit register definitions. @@ -1580,13 +2386,81 @@ void SIInstrInfo::addDescImplicitUseDef(const MCInstrDesc &NewDesc, } } +unsigned SIInstrInfo::findUsedSGPR(const MachineInstr *MI, + int OpIndices[3]) const { + const MCInstrDesc &Desc = get(MI->getOpcode()); + + // Find the one SGPR operand we are allowed to use. + unsigned SGPRReg = AMDGPU::NoRegister; + + // First we need to consider the instruction's operand requirements before + // legalizing. Some operands are required to be SGPRs, such as implicit uses + // of VCC, but we are still bound by the constant bus requirement to only use + // one. + // + // If the operand's class is an SGPR, we can never move it. + + for (const MachineOperand &MO : MI->implicit_operands()) { + // We only care about reads. 
+ if (MO.isDef()) + continue; + + if (MO.getReg() == AMDGPU::VCC) + return AMDGPU::VCC; + + if (MO.getReg() == AMDGPU::FLAT_SCR) + return AMDGPU::FLAT_SCR; + } + + unsigned UsedSGPRs[3] = { AMDGPU::NoRegister }; + const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); + + for (unsigned i = 0; i < 3; ++i) { + int Idx = OpIndices[i]; + if (Idx == -1) + break; + + const MachineOperand &MO = MI->getOperand(Idx); + if (RI.isSGPRClassID(Desc.OpInfo[Idx].RegClass)) + SGPRReg = MO.getReg(); + + if (MO.isReg() && RI.isSGPRClass(MRI.getRegClass(MO.getReg()))) + UsedSGPRs[i] = MO.getReg(); + } + + if (SGPRReg != AMDGPU::NoRegister) + return SGPRReg; + + // We don't have a required SGPR operand, so we have a bit more freedom in + // selecting operands to move. + + // Try to select the most used SGPR. If an SGPR is equal to one of the + // others, we choose that. + // + // e.g. + // V_FMA_F32 v0, s0, s0, s0 -> No moves + // V_FMA_F32 v0, s0, s1, s0 -> Move s1 + + if (UsedSGPRs[0] != AMDGPU::NoRegister) { + if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2]) + SGPRReg = UsedSGPRs[0]; + } + + if (SGPRReg == AMDGPU::NoRegister && UsedSGPRs[1] != AMDGPU::NoRegister) { + if (UsedSGPRs[1] == UsedSGPRs[2]) + SGPRReg = UsedSGPRs[1]; + } + + return SGPRReg; +} + MachineInstrBuilder SIInstrInfo::buildIndirectWrite( MachineBasicBlock *MBB, MachineBasicBlock::iterator I, unsigned ValueReg, unsigned Address, unsigned OffsetReg) const { const DebugLoc &DL = MBB->findDebugLoc(I); - unsigned IndirectBaseReg = AMDGPU::VReg_32RegClass.getRegister( + unsigned IndirectBaseReg = AMDGPU::VGPR_32RegClass.getRegister( getIndirectIndexBegin(*MBB->getParent())); return BuildMI(*MBB, I, DL, get(AMDGPU::SI_INDIRECT_DST_V1)) @@ -1604,7 +2478,7 @@ MachineInstrBuilder SIInstrInfo::buildIndirectRead( unsigned ValueReg, unsigned Address, unsigned OffsetReg) const { const DebugLoc &DL = MBB->findDebugLoc(I); - unsigned IndirectBaseReg = AMDGPU::VReg_32RegClass.getRegister( + 
unsigned IndirectBaseReg = AMDGPU::VGPR_32RegClass.getRegister( getIndirectIndexBegin(*MBB->getParent())); return BuildMI(*MBB, I, DL, get(AMDGPU::SI_INDIRECT_SRC)) @@ -1626,7 +2500,7 @@ void SIInstrInfo::reserveIndirectRegisters(BitVector &Reserved, for (int Index = Begin; Index <= End; ++Index) - Reserved.set(AMDGPU::VReg_32RegClass.getRegister(Index)); + Reserved.set(AMDGPU::VGPR_32RegClass.getRegister(Index)); for (int Index = std::max(0, Begin - 1); Index <= End; ++Index) Reserved.set(AMDGPU::VReg_64RegClass.getRegister(Index)); @@ -1644,11 +2518,19 @@ void SIInstrInfo::reserveIndirectRegisters(BitVector &Reserved, Reserved.set(AMDGPU::VReg_512RegClass.getRegister(Index)); } -const MachineOperand *SIInstrInfo::getNamedOperand(const MachineInstr& MI, - unsigned OperandName) const { +MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI, + unsigned OperandName) const { int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName); if (Idx == -1) return nullptr; return &MI.getOperand(Idx); } + +uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const { + uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT; + if (ST.isAmdHsaOS()) + RsrcDataFormat |= (1ULL << 56); + + return RsrcDataFormat; +} diff --git a/lib/Target/R600/SIInstrInfo.h b/lib/Target/R600/SIInstrInfo.h index 4687539fdf58..f766dc85e86a 100644 --- a/lib/Target/R600/SIInstrInfo.h +++ b/lib/Target/R600/SIInstrInfo.h @@ -13,10 +13,11 @@ //===----------------------------------------------------------------------===// -#ifndef SIINSTRINFO_H -#define SIINSTRINFO_H +#ifndef LLVM_LIB_TARGET_R600_SIINSTRINFO_H +#define LLVM_LIB_TARGET_R600_SIINSTRINFO_H #include "AMDGPUInstrInfo.h" +#include "SIDefines.h" #include "SIRegisterInfo.h" namespace llvm { @@ -44,6 +45,8 @@ private: const TargetRegisterClass *RC, const MachineOperand &Op) const; + void swapOperands(MachineBasicBlock::iterator Inst) const; + void splitScalar64BitUnaryOp(SmallVectorImpl<MachineInstr *> &Worklist, MachineInstr *Inst, unsigned 
Opcode) const; @@ -52,9 +55,16 @@ private: void splitScalar64BitBCNT(SmallVectorImpl<MachineInstr *> &Worklist, MachineInstr *Inst) const; + void splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist, + MachineInstr *Inst) const; void addDescImplicitUseDef(const MCInstrDesc &Desc, MachineInstr *MI) const; + bool checkInstOffsetsDoNotOverlap(MachineInstr *MIa, + MachineInstr *MIb) const; + + unsigned findUsedSGPR(const MachineInstr *MI, int OpIndices[3]) const; + public: explicit SIInstrInfo(const AMDGPUSubtarget &st); @@ -62,11 +72,30 @@ public: return RI; } + bool areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, + int64_t &Offset1, + int64_t &Offset2) const override; + + bool getLdStBaseRegImmOfs(MachineInstr *LdSt, + unsigned &BaseReg, unsigned &Offset, + const TargetRegisterInfo *TRI) const final; + + bool shouldClusterLoads(MachineInstr *FirstLdSt, + MachineInstr *SecondLdSt, + unsigned NumLoads) const final; + void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, DebugLoc DL, unsigned DestReg, unsigned SrcReg, bool KillSrc) const override; + unsigned calculateLDSSpillAddress(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + RegScavenger *RS, + unsigned TmpReg, + unsigned Offset, + unsigned Size) const; + void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned SrcReg, bool isKill, int FrameIndex, @@ -79,29 +108,102 @@ public: const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const override; - virtual bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const; + bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override; + // \brief Returns an opcode that can be used to move a value to a \p DstRC + // register. If there is no hardware instruction that can store to \p + // DstRC, then AMDGPU::COPY is returned. 
+ unsigned getMovOpcode(const TargetRegisterClass *DstRC) const; unsigned commuteOpcode(unsigned Opcode) const; MachineInstr *commuteInstruction(MachineInstr *MI, - bool NewMI=false) const override; + bool NewMI = false) const override; + bool findCommutedOpIndices(MachineInstr *MI, + unsigned &SrcOpIdx1, + unsigned &SrcOpIdx2) const override; bool isTriviallyReMaterializable(const MachineInstr *MI, AliasAnalysis *AA = nullptr) const; + bool areMemAccessesTriviallyDisjoint( + MachineInstr *MIa, MachineInstr *MIb, + AliasAnalysis *AA = nullptr) const override; + MachineInstr *buildMovInstr(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, unsigned DstReg, unsigned SrcReg) const override; bool isMov(unsigned Opcode) const override; bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const override; - bool isDS(uint16_t Opcode) const; - int isMIMG(uint16_t Opcode) const; - int isSMRD(uint16_t Opcode) const; - bool isVOP1(uint16_t Opcode) const; - bool isVOP2(uint16_t Opcode) const; - bool isVOP3(uint16_t Opcode) const; - bool isVOPC(uint16_t Opcode) const; + + bool isSALU(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::SALU; + } + + bool isVALU(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::VALU; + } + + bool isSOP1(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::SOP1; + } + + bool isSOP2(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::SOP2; + } + + bool isSOPC(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::SOPC; + } + + bool isSOPK(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::SOPK; + } + + bool isSOPP(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::SOPP; + } + + bool isVOP1(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::VOP1; + } + + bool isVOP2(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::VOP2; + } + + bool isVOP3(uint16_t Opcode) const { + return 
get(Opcode).TSFlags & SIInstrFlags::VOP3; + } + + bool isVOPC(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::VOPC; + } + + bool isMUBUF(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::MUBUF; + } + + bool isMTBUF(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::MTBUF; + } + + bool isSMRD(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::SMRD; + } + + bool isDS(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::DS; + } + + bool isMIMG(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::MIMG; + } + + bool isFLAT(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::FLAT; + } + bool isInlineConstant(const APInt &Imm) const; bool isInlineConstant(const MachineOperand &MO) const; bool isLiteralConstant(const MachineOperand &MO) const; @@ -109,14 +211,28 @@ public: bool isImmOperandLegal(const MachineInstr *MI, unsigned OpNo, const MachineOperand &MO) const; + /// \brief Return true if the given offset Size in bytes can be folded into + /// the immediate offsets of a memory instruction for the given address space. + bool canFoldOffset(unsigned OffsetSize, unsigned AS) const; + /// \brief Return true if this 64-bit VALU instruction has a 32-bit encoding. /// This function will return false if you pass it a 32-bit instruction. bool hasVALU32BitEncoding(unsigned Opcode) const; + /// \brief Returns true if this operand uses the constant bus. + bool usesConstantBus(const MachineRegisterInfo &MRI, + const MachineOperand &MO) const; + + /// \brief Return true if this instruction has any modifiers. + /// e.g. src[012]_mod, omod, clamp. 
+ bool hasModifiers(unsigned Opcode) const; + + bool hasModifiersSet(const MachineInstr &MI, + unsigned OpName) const; + bool verifyInstruction(const MachineInstr *MI, StringRef &ErrInfo) const override; - bool isSALUInstr(const MachineInstr &MI) const; static unsigned getVALUOp(const MachineInstr &MI); bool isSALUOpSupportedOnVALU(const MachineInstr &MI) const; @@ -144,10 +260,21 @@ public: /// instead of MOV. void legalizeOpWithMove(MachineInstr *MI, unsigned OpIdx) const; + /// \brief Check if \p MO is a legal operand if it was the \p OpIdx Operand + /// for \p MI. + bool isOperandLegal(const MachineInstr *MI, unsigned OpIdx, + const MachineOperand *MO = nullptr) const; + /// \brief Legalize all operands in this instruction. This function may /// create new instruction and insert them before \p MI. void legalizeOperands(MachineInstr *MI) const; + /// \brief Split an SMRD instruction into two smaller loads of half the + // size storing the results in \p Lo and \p Hi. + void splitSMRD(MachineInstr *MI, const TargetRegisterClass *HalfRC, + unsigned HalfImmOp, unsigned HalfSGPROp, + MachineInstr *&Lo, MachineInstr *&Hi) const; + void moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI) const; /// \brief Replace this instruction's opcode with the equivalent VALU @@ -181,8 +308,15 @@ public: /// \brief Returns the operand named \p Op. If \p MI does not have an /// operand named \c Op, this function returns nullptr. 
- const MachineOperand *getNamedOperand(const MachineInstr& MI, - unsigned OperandName) const; + MachineOperand *getNamedOperand(MachineInstr &MI, unsigned OperandName) const; + + const MachineOperand *getNamedOperand(const MachineInstr &MI, + unsigned OpName) const { + return getNamedOperand(const_cast<MachineInstr &>(MI), OpName); + } + + uint64_t getDefaultRsrcDataFormat() const; + }; namespace AMDGPU { @@ -192,21 +326,34 @@ namespace AMDGPU { int getCommuteRev(uint16_t Opcode); int getCommuteOrig(uint16_t Opcode); int getMCOpcode(uint16_t Opcode, unsigned Gen); + int getAddr64Inst(uint16_t Opcode); + int getAtomicRetOp(uint16_t Opcode); + int getAtomicNoRetOp(uint16_t Opcode); const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL; const uint64_t RSRC_TID_ENABLE = 1LL << 55; } // End namespace AMDGPU -} // End namespace llvm +namespace SI { +namespace KernelInputOffsets { + +/// Offsets in bytes from the start of the input buffer +enum Offsets { + NGROUPS_X = 0, + NGROUPS_Y = 4, + NGROUPS_Z = 8, + GLOBAL_SIZE_X = 12, + GLOBAL_SIZE_Y = 16, + GLOBAL_SIZE_Z = 20, + LOCAL_SIZE_X = 24, + LOCAL_SIZE_Y = 28, + LOCAL_SIZE_Z = 32 +}; + +} // End namespace KernelInputOffsets +} // End namespace SI -namespace SIInstrFlags { - enum Flags { - // First 4 bits are the instruction encoding - VM_CNT = 1 << 0, - EXP_CNT = 1 << 1, - LGKM_CNT = 1 << 2 - }; -} +} // End namespace llvm -#endif //SIINSTRINFO_H +#endif diff --git a/lib/Target/R600/SIInstrInfo.td b/lib/Target/R600/SIInstrInfo.td index b0ac20f558d0..7cc9588c8e4b 100644 --- a/lib/Target/R600/SIInstrInfo.td +++ b/lib/Target/R600/SIInstrInfo.td @@ -7,11 +7,61 @@ // //===----------------------------------------------------------------------===// +class vop { + field bits<9> SI3; + field bits<10> VI3; +} + +class vopc <bits<8> si, bits<8> vi = !add(0x40, si)> : vop { + field bits<8> SI = si; + field bits<8> VI = vi; + + field bits<9> SI3 = {0, si{7-0}}; + field bits<10> VI3 = {0, 0, vi{7-0}}; +} + +class vop1 <bits<8> si, 
bits<8> vi = si> : vop { + field bits<8> SI = si; + field bits<8> VI = vi; + + field bits<9> SI3 = {1, 1, si{6-0}}; + field bits<10> VI3 = !add(0x140, vi); +} + +class vop2 <bits<6> si, bits<6> vi = si> : vop { + field bits<6> SI = si; + field bits<6> VI = vi; + + field bits<9> SI3 = {1, 0, 0, si{5-0}}; + field bits<10> VI3 = {0, 1, 0, 0, vi{5-0}}; +} + +class vop3 <bits<9> si, bits<10> vi = {0, si}> : vop { + let SI3 = si; + let VI3 = vi; +} + +class sop1 <bits<8> si, bits<8> vi = si> { + field bits<8> SI = si; + field bits<8> VI = vi; +} + +class sop2 <bits<7> si, bits<7> vi = si> { + field bits<7> SI = si; + field bits<7> VI = vi; +} + +class sopk <bits<5> si, bits<5> vi = si> { + field bits<5> SI = si; + field bits<5> VI = vi; +} + // Execpt for the NONE field, this must be kept in sync with the SISubtarget enum // in AMDGPUMCInstLower.h def SISubtarget { int NONE = -1; int SI = 0; + int VI = 1; } //===----------------------------------------------------------------------===// @@ -105,6 +155,22 @@ def as_i32imm: SDNodeXForm<imm, [{ return CurDAG->getTargetConstant(N->getSExtValue(), MVT::i32); }]>; +def as_i64imm: SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant(N->getSExtValue(), MVT::i64); +}]>; + +// Copied from the AArch64 backend: +def bitcast_fpimm_to_i32 : SDNodeXForm<fpimm, [{ +return CurDAG->getTargetConstant( + N->getValueAPF().bitcastToAPInt().getZExtValue(), MVT::i32); +}]>; + +// Copied from the AArch64 backend: +def bitcast_fpimm_to_i64 : SDNodeXForm<fpimm, [{ +return CurDAG->getTargetConstant( + N->getValueAPF().bitcastToAPInt().getZExtValue(), MVT::i64); +}]>; + def IMM8bit : PatLeaf <(imm), [{return isUInt<8>(N->getZExtValue());}] >; @@ -117,6 +183,10 @@ def IMM16bit : PatLeaf <(imm), [{return isUInt<16>(N->getZExtValue());}] >; +def IMM20bit : PatLeaf <(imm), + [{return isUInt<20>(N->getZExtValue());}] +>; + def IMM32bit : PatLeaf <(imm), [{return isUInt<32>(N->getZExtValue());}] >; @@ -130,13 +200,17 @@ class InlineImm <ValueType vt> : 
PatLeaf <(vt imm), [{ return isInlineImmediate(N); }]>; +class InlineFPImm <ValueType vt> : PatLeaf <(vt fpimm), [{ + return isInlineImmediate(N); +}]>; + class SGPRImm <dag frag> : PatLeaf<frag, [{ if (TM.getSubtarget<AMDGPUSubtarget>().getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) { return false; } const SIRegisterInfo *SIRI = - static_cast<const SIRegisterInfo*>(TM.getRegisterInfo()); + static_cast<const SIRegisterInfo*>(TM.getSubtargetImpl()->getRegisterInfo()); for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end(); U != E; ++U) { if (SIRI->isSGPRClass(getOperandRegClass(*U, U.getOperandNo()))) { @@ -159,13 +233,70 @@ def sopp_brtarget : Operand<OtherVT> { let OperandType = "OPERAND_PCREL"; } +include "SIInstrFormats.td" +include "VIInstrFormats.td" + +let OperandType = "OPERAND_IMMEDIATE" in { + +def offen : Operand<i1> { + let PrintMethod = "printOffen"; +} +def idxen : Operand<i1> { + let PrintMethod = "printIdxen"; +} +def addr64 : Operand<i1> { + let PrintMethod = "printAddr64"; +} +def mbuf_offset : Operand<i16> { + let PrintMethod = "printMBUFOffset"; +} +def ds_offset : Operand<i16> { + let PrintMethod = "printDSOffset"; +} +def ds_offset0 : Operand<i8> { + let PrintMethod = "printDSOffset0"; +} +def ds_offset1 : Operand<i8> { + let PrintMethod = "printDSOffset1"; +} +def glc : Operand <i1> { + let PrintMethod = "printGLC"; +} +def slc : Operand <i1> { + let PrintMethod = "printSLC"; +} +def tfe : Operand <i1> { + let PrintMethod = "printTFE"; +} + +def omod : Operand <i32> { + let PrintMethod = "printOModSI"; +} + +def ClampMod : Operand <i1> { + let PrintMethod = "printClampSI"; +} + +} // End OperandType = "OPERAND_IMMEDIATE" + //===----------------------------------------------------------------------===// // Complex patterns //===----------------------------------------------------------------------===// +def DS1Addr1Offset : ComplexPattern<i32, 2, "SelectDS1Addr1Offset">; +def DS64Bit4ByteAligned : ComplexPattern<i32, 3, 
"SelectDS64Bit4ByteAligned">; + def MUBUFAddr32 : ComplexPattern<i64, 9, "SelectMUBUFAddr32">; def MUBUFAddr64 : ComplexPattern<i64, 3, "SelectMUBUFAddr64">; +def MUBUFAddr64Atomic : ComplexPattern<i64, 4, "SelectMUBUFAddr64">; def MUBUFScratch : ComplexPattern<i64, 4, "SelectMUBUFScratch">; +def MUBUFOffset : ComplexPattern<i64, 6, "SelectMUBUFOffset">; +def MUBUFOffsetAtomic : ComplexPattern<i64, 4, "SelectMUBUFOffset">; + +def VOP3Mods0 : ComplexPattern<untyped, 4, "SelectVOP3Mods0">; +def VOP3Mods0Clamp : ComplexPattern<untyped, 3, "SelectVOP3Mods0Clamp">; +def VOP3Mods0Clamp0OMod : ComplexPattern<untyped, 4, "SelectVOP3Mods0Clamp0OMod">; +def VOP3Mods : ComplexPattern<untyped, 2, "SelectVOP3Mods">; //===----------------------------------------------------------------------===// // SI assembler operands @@ -174,9 +305,20 @@ def MUBUFScratch : ComplexPattern<i64, 4, "SelectMUBUFScratch">; def SIOperand { int ZERO = 0x80; int VCC = 0x6A; + int FLAT_SCR = 0x68; } -include "SIInstrFormats.td" +def SRCMODS { + int NONE = 0; +} + +def DSTCLAMP { + int NONE = 0; +} + +def DSTOMOD { + int NONE = 0; +} //===----------------------------------------------------------------------===// // @@ -194,43 +336,175 @@ include "SIInstrFormats.td" // //===----------------------------------------------------------------------===// +class SIMCInstr <string pseudo, int subtarget> { + string PseudoInstr = pseudo; + int Subtarget = subtarget; +} + +//===----------------------------------------------------------------------===// +// EXP classes +//===----------------------------------------------------------------------===// + +class EXPCommon : InstSI< + (outs), + (ins i32imm:$en, i32imm:$tgt, i32imm:$compr, i32imm:$done, i32imm:$vm, + VGPR_32:$src0, VGPR_32:$src1, VGPR_32:$src2, VGPR_32:$src3), + "exp $en, $tgt, $compr, $done, $vm, $src0, $src1, $src2, $src3", + [] > { + + let EXP_CNT = 1; + let Uses = [EXEC]; +} + +multiclass EXP_m { + + let isPseudo = 1 in { + def "" : EXPCommon, 
SIMCInstr <"exp", SISubtarget.NONE> ; + } + + def _si : EXPCommon, SIMCInstr <"exp", SISubtarget.SI>, EXPe; + + def _vi : EXPCommon, SIMCInstr <"exp", SISubtarget.VI>, EXPe_vi; +} + //===----------------------------------------------------------------------===// // Scalar classes //===----------------------------------------------------------------------===// -class SOP1_32 <bits<8> op, string opName, list<dag> pattern> : SOP1 < - op, (outs SReg_32:$dst), (ins SSrc_32:$src0), - opName#" $dst, $src0", pattern ->; +class SOP1_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> : + SOP1 <outs, ins, "", pattern>, + SIMCInstr<opName, SISubtarget.NONE> { + let isPseudo = 1; +} -class SOP1_64 <bits<8> op, string opName, list<dag> pattern> : SOP1 < - op, (outs SReg_64:$dst), (ins SSrc_64:$src0), - opName#" $dst, $src0", pattern ->; +class SOP1_Real_si <sop1 op, string opName, dag outs, dag ins, string asm, + list<dag> pattern> : + SOP1 <outs, ins, asm, pattern>, + SOP1e <op.SI>, + SIMCInstr<opName, SISubtarget.SI>; + +class SOP1_Real_vi <sop1 op, string opName, dag outs, dag ins, string asm, + list<dag> pattern> : + SOP1 <outs, ins, asm, pattern>, + SOP1e <op.VI>, + SIMCInstr<opName, SISubtarget.VI>; + +multiclass SOP1_32 <sop1 op, string opName, list<dag> pattern> { + def "" : SOP1_Pseudo <opName, (outs SReg_32:$dst), (ins SSrc_32:$src0), + pattern>; + + def _si : SOP1_Real_si <op, opName, (outs SReg_32:$dst), (ins SSrc_32:$src0), + opName#" $dst, $src0", pattern>; + + def _vi : SOP1_Real_vi <op, opName, (outs SReg_32:$dst), (ins SSrc_32:$src0), + opName#" $dst, $src0", pattern>; +} + +multiclass SOP1_64 <sop1 op, string opName, list<dag> pattern> { + def "" : SOP1_Pseudo <opName, (outs SReg_64:$dst), (ins SSrc_64:$src0), + pattern>; + + def _si : SOP1_Real_si <op, opName, (outs SReg_64:$dst), (ins SSrc_64:$src0), + opName#" $dst, $src0", pattern>; + + def _vi : SOP1_Real_vi <op, opName, (outs SReg_64:$dst), (ins SSrc_64:$src0), + opName#" $dst, $src0", pattern>; 
+} + +// no input, 64-bit output. +multiclass SOP1_64_0 <sop1 op, string opName, list<dag> pattern> { + def "" : SOP1_Pseudo <opName, (outs SReg_64:$dst), (ins), pattern>; + + def _si : SOP1_Real_si <op, opName, (outs SReg_64:$dst), (ins), + opName#" $dst", pattern> { + let SSRC0 = 0; + } + + def _vi : SOP1_Real_vi <op, opName, (outs SReg_64:$dst), (ins), + opName#" $dst", pattern> { + let SSRC0 = 0; + } +} // 64-bit input, 32-bit output. -class SOP1_32_64 <bits<8> op, string opName, list<dag> pattern> : SOP1 < - op, (outs SReg_32:$dst), (ins SSrc_64:$src0), - opName#" $dst, $src0", pattern ->; +multiclass SOP1_32_64 <sop1 op, string opName, list<dag> pattern> { + def "" : SOP1_Pseudo <opName, (outs SReg_32:$dst), (ins SSrc_64:$src0), + pattern>; -class SOP2_32 <bits<7> op, string opName, list<dag> pattern> : SOP2 < - op, (outs SReg_32:$dst), (ins SSrc_32:$src0, SSrc_32:$src1), - opName#" $dst, $src0, $src1", pattern ->; + def _si : SOP1_Real_si <op, opName, (outs SReg_32:$dst), (ins SSrc_64:$src0), + opName#" $dst, $src0", pattern>; -class SOP2_64 <bits<7> op, string opName, list<dag> pattern> : SOP2 < - op, (outs SReg_64:$dst), (ins SSrc_64:$src0, SSrc_64:$src1), - opName#" $dst, $src0, $src1", pattern ->; + def _vi : SOP1_Real_vi <op, opName, (outs SReg_32:$dst), (ins SSrc_64:$src0), + opName#" $dst, $src0", pattern>; +} -class SOP2_SHIFT_64 <bits<7> op, string opName, list<dag> pattern> : SOP2 < - op, (outs SReg_64:$dst), (ins SSrc_64:$src0, SSrc_32:$src1), - opName#" $dst, $src0, $src1", pattern ->; +class SOP2_Pseudo<string opName, dag outs, dag ins, list<dag> pattern> : + SOP2<outs, ins, "", pattern>, + SIMCInstr<opName, SISubtarget.NONE> { + let isPseudo = 1; + let Size = 4; +} + +class SOP2_Real_si<sop2 op, string opName, dag outs, dag ins, string asm, + list<dag> pattern> : + SOP2<outs, ins, asm, pattern>, + SOP2e<op.SI>, + SIMCInstr<opName, SISubtarget.SI>; + +class SOP2_Real_vi<sop2 op, string opName, dag outs, dag ins, string asm, + list<dag> pattern> 
: + SOP2<outs, ins, asm, pattern>, + SOP2e<op.VI>, + SIMCInstr<opName, SISubtarget.VI>; + +multiclass SOP2_SELECT_32 <sop2 op, string opName, list<dag> pattern> { + def "" : SOP2_Pseudo <opName, (outs SReg_32:$dst), + (ins SSrc_32:$src0, SSrc_32:$src1, SCCReg:$scc), pattern>; + + def _si : SOP2_Real_si <op, opName, (outs SReg_32:$dst), + (ins SSrc_32:$src0, SSrc_32:$src1, SCCReg:$scc), + opName#" $dst, $src0, $src1 [$scc]", pattern>; + + def _vi : SOP2_Real_vi <op, opName, (outs SReg_32:$dst), + (ins SSrc_32:$src0, SSrc_32:$src1, SCCReg:$scc), + opName#" $dst, $src0, $src1 [$scc]", pattern>; +} + +multiclass SOP2_32 <sop2 op, string opName, list<dag> pattern> { + def "" : SOP2_Pseudo <opName, (outs SReg_32:$dst), + (ins SSrc_32:$src0, SSrc_32:$src1), pattern>; + def _si : SOP2_Real_si <op, opName, (outs SReg_32:$dst), + (ins SSrc_32:$src0, SSrc_32:$src1), opName#" $dst, $src0, $src1", pattern>; -class SOPC_Helper <bits<7> op, RegisterClass rc, ValueType vt, + def _vi : SOP2_Real_vi <op, opName, (outs SReg_32:$dst), + (ins SSrc_32:$src0, SSrc_32:$src1), opName#" $dst, $src0, $src1", pattern>; +} + +multiclass SOP2_64 <sop2 op, string opName, list<dag> pattern> { + def "" : SOP2_Pseudo <opName, (outs SReg_64:$dst), + (ins SSrc_64:$src0, SSrc_64:$src1), pattern>; + + def _si : SOP2_Real_si <op, opName, (outs SReg_64:$dst), + (ins SSrc_64:$src0, SSrc_64:$src1), opName#" $dst, $src0, $src1", pattern>; + + def _vi : SOP2_Real_vi <op, opName, (outs SReg_64:$dst), + (ins SSrc_64:$src0, SSrc_64:$src1), opName#" $dst, $src0, $src1", pattern>; +} + +multiclass SOP2_64_32 <sop2 op, string opName, list<dag> pattern> { + def "" : SOP2_Pseudo <opName, (outs SReg_64:$dst), + (ins SSrc_64:$src0, SSrc_32:$src1), pattern>; + + def _si : SOP2_Real_si <op, opName, (outs SReg_64:$dst), + (ins SSrc_64:$src0, SSrc_32:$src1), opName#" $dst, $src0, $src1", pattern>; + + def _vi : SOP2_Real_vi <op, opName, (outs SReg_64:$dst), + (ins SSrc_64:$src0, SSrc_32:$src1), opName#" $dst, $src0, 
$src1", pattern>; +} + + +class SOPC_Helper <bits<7> op, RegisterOperand rc, ValueType vt, string opName, PatLeaf cond> : SOPC < op, (outs SCCReg:$dst), (ins rc:$src0, rc:$src1), opName#" $dst, $src0, $src1", []>; @@ -241,28 +515,90 @@ class SOPC_32<bits<7> op, string opName, PatLeaf cond = COND_NULL> class SOPC_64<bits<7> op, string opName, PatLeaf cond = COND_NULL> : SOPC_Helper<op, SSrc_64, i64, opName, cond>; -class SOPK_32 <bits<5> op, string opName, list<dag> pattern> : SOPK < - op, (outs SReg_32:$dst), (ins i16imm:$src0), - opName#" $dst, $src0", pattern ->; +class SOPK_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> : + SOPK <outs, ins, "", pattern>, + SIMCInstr<opName, SISubtarget.NONE> { + let isPseudo = 1; +} -class SOPK_64 <bits<5> op, string opName, list<dag> pattern> : SOPK < - op, (outs SReg_64:$dst), (ins i16imm:$src0), - opName#" $dst, $src0", pattern ->; +class SOPK_Real_si <sopk op, string opName, dag outs, dag ins, string asm, + list<dag> pattern> : + SOPK <outs, ins, asm, pattern>, + SOPKe <op.SI>, + SIMCInstr<opName, SISubtarget.SI>; + +class SOPK_Real_vi <sopk op, string opName, dag outs, dag ins, string asm, + list<dag> pattern> : + SOPK <outs, ins, asm, pattern>, + SOPKe <op.VI>, + SIMCInstr<opName, SISubtarget.VI>; + +multiclass SOPK_32 <sopk op, string opName, list<dag> pattern> { + def "" : SOPK_Pseudo <opName, (outs SReg_32:$dst), (ins u16imm:$src0), + pattern>; + + def _si : SOPK_Real_si <op, opName, (outs SReg_32:$dst), (ins u16imm:$src0), + opName#" $dst, $src0", pattern>; + + def _vi : SOPK_Real_vi <op, opName, (outs SReg_32:$dst), (ins u16imm:$src0), + opName#" $dst, $src0", pattern>; +} + +multiclass SOPK_SCC <sopk op, string opName, list<dag> pattern> { + def "" : SOPK_Pseudo <opName, (outs SCCReg:$dst), + (ins SReg_32:$src0, u16imm:$src1), pattern>; + + def _si : SOPK_Real_si <op, opName, (outs SCCReg:$dst), + (ins SReg_32:$src0, u16imm:$src1), opName#" $dst, $src0", pattern>; + + def _vi : SOPK_Real_vi <op, opName, 
(outs SCCReg:$dst), + (ins SReg_32:$src0, u16imm:$src1), opName#" $dst, $src0", pattern>; +} + +//===----------------------------------------------------------------------===// +// SMRD classes +//===----------------------------------------------------------------------===// + +class SMRD_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> : + SMRD <outs, ins, "", pattern>, + SIMCInstr<opName, SISubtarget.NONE> { + let isPseudo = 1; +} + +class SMRD_Real_si <bits<5> op, string opName, bit imm, dag outs, dag ins, + string asm> : + SMRD <outs, ins, asm, []>, + SMRDe <op, imm>, + SIMCInstr<opName, SISubtarget.SI>; -multiclass SMRD_Helper <bits<5> op, string asm, RegisterClass baseClass, +class SMRD_Real_vi <bits<8> op, string opName, bit imm, dag outs, dag ins, + string asm> : + SMRD <outs, ins, asm, []>, + SMEMe_vi <op, imm>, + SIMCInstr<opName, SISubtarget.VI>; + +multiclass SMRD_m <bits<5> op, string opName, bit imm, dag outs, dag ins, + string asm, list<dag> pattern> { + + def "" : SMRD_Pseudo <opName, outs, ins, pattern>; + + def _si : SMRD_Real_si <op, opName, imm, outs, ins, asm>; + + def _vi : SMRD_Real_vi <{0, 0, 0, op}, opName, imm, outs, ins, asm>; +} + +multiclass SMRD_Helper <bits<5> op, string opName, RegisterClass baseClass, RegisterClass dstClass> { - def _IMM : SMRD < - op, 1, (outs dstClass:$dst), + defm _IMM : SMRD_m < + op, opName#"_IMM", 1, (outs dstClass:$dst), (ins baseClass:$sbase, u32imm:$offset), - asm#" $dst, $sbase, $offset", [] + opName#" $dst, $sbase, $offset", [] >; - def _SGPR : SMRD < - op, 0, (outs dstClass:$dst), + defm _SGPR : SMRD_m < + op, opName#"_SGPR", 0, (outs dstClass:$dst), (ins baseClass:$sbase, SReg_32:$soff), - asm#" $dst, $sbase, $soff", [] + opName#" $dst, $sbase, $soff", [] >; } @@ -270,6 +606,210 @@ multiclass SMRD_Helper <bits<5> op, string asm, RegisterClass baseClass, // Vector ALU classes //===----------------------------------------------------------------------===// +// This must always be right before 
the operand being input modified. +def InputMods : OperandWithDefaultOps <i32, (ops (i32 0))> { + let PrintMethod = "printOperandAndMods"; +} +def InputModsNoDefault : Operand <i32> { + let PrintMethod = "printOperandAndMods"; +} + +class getNumSrcArgs<ValueType Src1, ValueType Src2> { + int ret = + !if (!eq(Src1.Value, untyped.Value), 1, // VOP1 + !if (!eq(Src2.Value, untyped.Value), 2, // VOP2 + 3)); // VOP3 +} + +// Returns the register class to use for the destination of VOP[123C] +// instructions for the given VT. +class getVALUDstForVT<ValueType VT> { + RegisterClass ret = !if(!eq(VT.Size, 32), VGPR_32, + !if(!eq(VT.Size, 64), VReg_64, + SReg_64)); // else VT == i1 +} + +// Returns the register class to use for source 0 of VOP[12C] +// instructions for the given VT. +class getVOPSrc0ForVT<ValueType VT> { + RegisterOperand ret = !if(!eq(VT.Size, 32), VSrc_32, VSrc_64); +} + +// Returns the register class to use for source 1 of VOP[12C] for the +// given VT. +class getVOPSrc1ForVT<ValueType VT> { + RegisterClass ret = !if(!eq(VT.Size, 32), VGPR_32, VReg_64); +} + +// Returns the register classes for the source arguments of a VOP[12C] +// instruction for the given SrcVTs. +class getInRC32 <list<ValueType> SrcVT> { + list<DAGOperand> ret = [ + getVOPSrc0ForVT<SrcVT[0]>.ret, + getVOPSrc1ForVT<SrcVT[1]>.ret + ]; +} + +// Returns the register class to use for sources of VOP3 instructions for the +// given VT. +class getVOP3SrcForVT<ValueType VT> { + RegisterOperand ret = !if(!eq(VT.Size, 32), VCSrc_32, VCSrc_64); +} + +// Returns the register classes for the source arguments of a VOP3 +// instruction for the given SrcVTs. +class getInRC64 <list<ValueType> SrcVT> { + list<DAGOperand> ret = [ + getVOP3SrcForVT<SrcVT[0]>.ret, + getVOP3SrcForVT<SrcVT[1]>.ret, + getVOP3SrcForVT<SrcVT[2]>.ret + ]; +} + +// Returns 1 if the source arguments have modifiers, 0 if they do not. 
+class hasModifiers<ValueType SrcVT> { + bit ret = !if(!eq(SrcVT.Value, f32.Value), 1, + !if(!eq(SrcVT.Value, f64.Value), 1, 0)); +} + +// Returns the input arguments for VOP[12C] instructions for the given SrcVT. +class getIns32 <RegisterOperand Src0RC, RegisterClass Src1RC, int NumSrcArgs> { + dag ret = !if(!eq(NumSrcArgs, 1), (ins Src0RC:$src0), // VOP1 + !if(!eq(NumSrcArgs, 2), (ins Src0RC:$src0, Src1RC:$src1), // VOP2 + (ins))); +} + +// Returns the input arguments for VOP3 instructions for the given SrcVT. +class getIns64 <RegisterOperand Src0RC, RegisterOperand Src1RC, + RegisterOperand Src2RC, int NumSrcArgs, + bit HasModifiers> { + + dag ret = + !if (!eq(NumSrcArgs, 1), + !if (!eq(HasModifiers, 1), + // VOP1 with modifiers + (ins InputModsNoDefault:$src0_modifiers, Src0RC:$src0, + ClampMod:$clamp, omod:$omod) + /* else */, + // VOP1 without modifiers + (ins Src0RC:$src0) + /* endif */ ), + !if (!eq(NumSrcArgs, 2), + !if (!eq(HasModifiers, 1), + // VOP 2 with modifiers + (ins InputModsNoDefault:$src0_modifiers, Src0RC:$src0, + InputModsNoDefault:$src1_modifiers, Src1RC:$src1, + ClampMod:$clamp, omod:$omod) + /* else */, + // VOP2 without modifiers + (ins Src0RC:$src0, Src1RC:$src1) + /* endif */ ) + /* NumSrcArgs == 3 */, + !if (!eq(HasModifiers, 1), + // VOP3 with modifiers + (ins InputModsNoDefault:$src0_modifiers, Src0RC:$src0, + InputModsNoDefault:$src1_modifiers, Src1RC:$src1, + InputModsNoDefault:$src2_modifiers, Src2RC:$src2, + ClampMod:$clamp, omod:$omod) + /* else */, + // VOP3 without modifiers + (ins Src0RC:$src0, Src1RC:$src1, Src2RC:$src2) + /* endif */ ))); +} + +// Returns the assembly string for the inputs and outputs of a VOP[12C] +// instruction. This does not add the _e32 suffix, so it can be reused +// by getAsm64. 
+class getAsm32 <int NumSrcArgs> { + string src1 = ", $src1"; + string src2 = ", $src2"; + string ret = " $dst, $src0"# + !if(!eq(NumSrcArgs, 1), "", src1)# + !if(!eq(NumSrcArgs, 3), src2, ""); +} + +// Returns the assembly string for the inputs and outputs of a VOP3 +// instruction. +class getAsm64 <int NumSrcArgs, bit HasModifiers> { + string src0 = "$src0_modifiers,"; + string src1 = !if(!eq(NumSrcArgs, 1), "", + !if(!eq(NumSrcArgs, 2), " $src1_modifiers", + " $src1_modifiers,")); + string src2 = !if(!eq(NumSrcArgs, 3), " $src2_modifiers", ""); + string ret = + !if(!eq(HasModifiers, 0), + getAsm32<NumSrcArgs>.ret, + " $dst, "#src0#src1#src2#"$clamp"#"$omod"); +} + + +class VOPProfile <list<ValueType> _ArgVT> { + + field list<ValueType> ArgVT = _ArgVT; + + field ValueType DstVT = ArgVT[0]; + field ValueType Src0VT = ArgVT[1]; + field ValueType Src1VT = ArgVT[2]; + field ValueType Src2VT = ArgVT[3]; + field RegisterClass DstRC = getVALUDstForVT<DstVT>.ret; + field RegisterOperand Src0RC32 = getVOPSrc0ForVT<Src0VT>.ret; + field RegisterClass Src1RC32 = getVOPSrc1ForVT<Src1VT>.ret; + field RegisterOperand Src0RC64 = getVOP3SrcForVT<Src0VT>.ret; + field RegisterOperand Src1RC64 = getVOP3SrcForVT<Src1VT>.ret; + field RegisterOperand Src2RC64 = getVOP3SrcForVT<Src2VT>.ret; + + field int NumSrcArgs = getNumSrcArgs<Src1VT, Src2VT>.ret; + field bit HasModifiers = hasModifiers<Src0VT>.ret; + + field dag Outs = (outs DstRC:$dst); + + field dag Ins32 = getIns32<Src0RC32, Src1RC32, NumSrcArgs>.ret; + field dag Ins64 = getIns64<Src0RC64, Src1RC64, Src2RC64, NumSrcArgs, + HasModifiers>.ret; + + field string Asm32 = "_e32"#getAsm32<NumSrcArgs>.ret; + field string Asm64 = getAsm64<NumSrcArgs, HasModifiers>.ret; +} + +def VOP_F32_F32 : VOPProfile <[f32, f32, untyped, untyped]>; +def VOP_F32_F64 : VOPProfile <[f32, f64, untyped, untyped]>; +def VOP_F32_I32 : VOPProfile <[f32, i32, untyped, untyped]>; +def VOP_F64_F32 : VOPProfile <[f64, f32, untyped, untyped]>; +def VOP_F64_F64 : 
VOPProfile <[f64, f64, untyped, untyped]>; +def VOP_F64_I32 : VOPProfile <[f64, i32, untyped, untyped]>; +def VOP_I32_F32 : VOPProfile <[i32, f32, untyped, untyped]>; +def VOP_I32_F64 : VOPProfile <[i32, f64, untyped, untyped]>; +def VOP_I32_I32 : VOPProfile <[i32, i32, untyped, untyped]>; + +def VOP_F32_F32_F32 : VOPProfile <[f32, f32, f32, untyped]>; +def VOP_F32_F32_I32 : VOPProfile <[f32, f32, i32, untyped]>; +def VOP_F64_F64_F64 : VOPProfile <[f64, f64, f64, untyped]>; +def VOP_F64_F64_I32 : VOPProfile <[f64, f64, i32, untyped]>; +def VOP_I32_F32_F32 : VOPProfile <[i32, f32, f32, untyped]>; +def VOP_I32_I32_I32 : VOPProfile <[i32, i32, i32, untyped]>; +def VOP_I32_I32_I32_VCC : VOPProfile <[i32, i32, i32, untyped]> { + let Src0RC32 = VCSrc_32; +} + +def VOP_I1_F32_I32 : VOPProfile <[i1, f32, i32, untyped]> { + let Ins64 = (ins InputModsNoDefault:$src0_modifiers, Src0RC64:$src0, Src1RC64:$src1); + let Asm64 = " $dst, $src0_modifiers, $src1"; +} + +def VOP_I1_F64_I32 : VOPProfile <[i1, f64, i32, untyped]> { + let Ins64 = (ins InputModsNoDefault:$src0_modifiers, Src0RC64:$src0, Src1RC64:$src1); + let Asm64 = " $dst, $src0_modifiers, $src1"; +} + +def VOP_I64_I64_I32 : VOPProfile <[i64, i64, i32, untyped]>; +def VOP_I64_I64_I64 : VOPProfile <[i64, i64, i64, untyped]>; + +def VOP_F32_F32_F32_F32 : VOPProfile <[f32, f32, f32, f32]>; +def VOP_F64_F64_F64_F64 : VOPProfile <[f64, f64, f64, f64]>; +def VOP_I32_I32_I32_I32 : VOPProfile <[i32, i32, i32, i32]>; +def VOP_I64_I32_I32_I64 : VOPProfile <[i64, i32, i32, i64]>; + + class VOP <string opName> { string OpName = opName; } @@ -279,399 +819,943 @@ class VOP2_REV <string revOp, bit isOrig> { bit IsOrig = isOrig; } -class SIMCInstr <string pseudo, int subtarget> { - string PseudoInstr = pseudo; - int Subtarget = subtarget; +class AtomicNoRet <string noRetOp, bit isRet> { + string NoRetOp = noRetOp; + bit IsRet = isRet; +} + +class VOP1_Pseudo <dag outs, dag ins, list<dag> pattern, string opName> : + VOP1Common <outs, 
ins, "", pattern>, + VOP <opName>, + SIMCInstr <opName#"_e32", SISubtarget.NONE> { + let isPseudo = 1; +} + +multiclass VOP1_m <vop1 op, dag outs, dag ins, string asm, list<dag> pattern, + string opName> { + def "" : VOP1_Pseudo <outs, ins, pattern, opName>; + + def _si : VOP1<op.SI, outs, ins, asm, []>, + SIMCInstr <opName#"_e32", SISubtarget.SI>; + def _vi : VOP1<op.VI, outs, ins, asm, []>, + SIMCInstr <opName#"_e32", SISubtarget.VI>; +} + +class VOP2_Pseudo <dag outs, dag ins, list<dag> pattern, string opName> : + VOP2Common <outs, ins, "", pattern>, + VOP <opName>, + SIMCInstr<opName#"_e32", SISubtarget.NONE> { + let isPseudo = 1; +} + +multiclass VOP2_m <vop2 op, dag outs, dag ins, string asm, list<dag> pattern, + string opName, string revOpSI, string revOpVI> { + def "" : VOP2_Pseudo <outs, ins, pattern, opName>, + VOP2_REV<revOpSI#"_e32", !eq(revOpSI, opName)>; + + def _si : VOP2 <op.SI, outs, ins, opName#asm, []>, + VOP2_REV<revOpSI#"_e32_si", !eq(revOpSI, opName)>, + SIMCInstr <opName#"_e32", SISubtarget.SI>; + def _vi : VOP2 <op.VI, outs, ins, opName#asm, []>, + VOP2_REV<revOpVI#"_e32_vi", !eq(revOpVI, opName)>, + SIMCInstr <opName#"_e32", SISubtarget.VI>; +} + +class VOP3DisableFields <bit HasSrc1, bit HasSrc2, bit HasModifiers> { + + bits<2> src0_modifiers = !if(HasModifiers, ?, 0); + bits<2> src1_modifiers = !if(HasModifiers, !if(HasSrc1, ?, 0), 0); + bits<2> src2_modifiers = !if(HasModifiers, !if(HasSrc2, ? 
,0) ,0); + bits<2> omod = !if(HasModifiers, ?, 0); + bits<1> clamp = !if(HasModifiers, ?, 0); + bits<9> src1 = !if(HasSrc1, ?, 0); + bits<9> src2 = !if(HasSrc2, ?, 0); } class VOP3_Pseudo <dag outs, dag ins, list<dag> pattern, string opName> : VOP3Common <outs, ins, "", pattern>, VOP <opName>, - SIMCInstr<opName, SISubtarget.NONE> { + SIMCInstr<opName#"_e64", SISubtarget.NONE> { let isPseudo = 1; } class VOP3_Real_si <bits<9> op, dag outs, dag ins, string asm, string opName> : - VOP3 <op, outs, ins, asm, []>, - SIMCInstr<opName, SISubtarget.SI>; + VOP3Common <outs, ins, asm, []>, + VOP3e <op>, + SIMCInstr<opName#"_e64", SISubtarget.SI>; + +class VOP3_Real_vi <bits<10> op, dag outs, dag ins, string asm, string opName> : + VOP3Common <outs, ins, asm, []>, + VOP3e_vi <op>, + SIMCInstr <opName#"_e64", SISubtarget.VI>; + +// VI only instruction +class VOP3_vi <bits<10> op, string opName, dag outs, dag ins, string asm, + list<dag> pattern, int NumSrcArgs, bit HasMods = 1> : + VOP3Common <outs, ins, asm, pattern>, + VOP <opName>, + VOP3e_vi <op>, + VOP3DisableFields<!if(!eq(NumSrcArgs, 1), 0, 1), + !if(!eq(NumSrcArgs, 2), 0, 1), + HasMods>; -multiclass VOP3_m <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern, - string opName> { +multiclass VOP3_m <vop op, dag outs, dag ins, string asm, list<dag> pattern, + string opName, int NumSrcArgs, bit HasMods = 1> { def "" : VOP3_Pseudo <outs, ins, pattern, opName>; - def _si : VOP3_Real_si <op, outs, ins, asm, opName>; - + def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName>, + VOP3DisableFields<!if(!eq(NumSrcArgs, 1), 0, 1), + !if(!eq(NumSrcArgs, 2), 0, 1), + HasMods>; + def _vi : VOP3_Real_vi <op.VI3, outs, ins, asm, opName>, + VOP3DisableFields<!if(!eq(NumSrcArgs, 1), 0, 1), + !if(!eq(NumSrcArgs, 2), 0, 1), + HasMods>; } -multiclass VOP3_1_m <bits<8> op, dag outs, dag ins, string asm, - list<dag> pattern, string opName> { +// VOP3_m without source modifiers +multiclass VOP3_m_nosrcmod <vop op, dag outs, dag ins, 
string asm, list<dag> pattern, + string opName, int NumSrcArgs, bit HasMods = 1> { def "" : VOP3_Pseudo <outs, ins, pattern, opName>; - let src1 = 0, src1_modifiers = 0, src2 = 0, src2_modifiers = 0 in { - - def _si : VOP3_Real_si < - {1, 1, op{6}, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}}, - outs, ins, asm, opName - >; - - } // src1 = 0, src1_modifiers = 0, src2 = 0, src2_modifiers = 0 + let src0_modifiers = 0, + src1_modifiers = 0, + src2_modifiers = 0 in { + def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName>; + def _vi : VOP3_Real_vi <op.VI3, outs, ins, asm, opName>; + } } -multiclass VOP3_2_m <bits<6> op, dag outs, dag ins, string asm, - list<dag> pattern, string opName, string revOp> { +multiclass VOP3_1_m <vop op, dag outs, dag ins, string asm, + list<dag> pattern, string opName, bit HasMods = 1> { def "" : VOP3_Pseudo <outs, ins, pattern, opName>; - let src2 = 0, src2_modifiers = 0 in { + def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName>, + VOP3DisableFields<0, 0, HasMods>; - def _si : VOP3_Real_si < - {1, 0, 0, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}}, - outs, ins, asm, opName>, - VOP2_REV<revOp#"_e64", !eq(revOp, opName)>; - - } // src2 = 0, src2_modifiers = 0 + def _vi : VOP3_Real_vi <op.VI3, outs, ins, asm, opName>, + VOP3DisableFields<0, 0, HasMods>; } -// This must always be right before the operand being input modified. 
-def InputMods : OperandWithDefaultOps <i32, (ops (i32 0))> { - let PrintMethod = "printOperandAndMods"; +multiclass VOP3_2_m <vop op, dag outs, dag ins, string asm, + list<dag> pattern, string opName, string revOpSI, string revOpVI, + bit HasMods = 1, bit UseFullOp = 0> { + + def "" : VOP3_Pseudo <outs, ins, pattern, opName>, + VOP2_REV<revOpSI#"_e64", !eq(revOpSI, opName)>; + + def _si : VOP3_Real_si <op.SI3, + outs, ins, asm, opName>, + VOP2_REV<revOpSI#"_e64_si", !eq(revOpSI, opName)>, + VOP3DisableFields<1, 0, HasMods>; + + def _vi : VOP3_Real_vi <op.VI3, + outs, ins, asm, opName>, + VOP2_REV<revOpVI#"_e64_vi", !eq(revOpVI, opName)>, + VOP3DisableFields<1, 0, HasMods>; } -multiclass VOP1_Helper <bits<8> op, RegisterClass drc, RegisterClass src, - string opName, list<dag> pattern> { - - def _e32 : VOP1 < - op, (outs drc:$dst), (ins src:$src0), - opName#"_e32 $dst, $src0", pattern - >, VOP <opName>; - - defm _e64 : VOP3_1_m < - op, - (outs drc:$dst), - (ins InputMods:$src0_modifiers, src:$src0, i32imm:$clamp, i32imm:$omod), - opName#"_e64 $dst, $src0_modifiers, $clamp, $omod", [], opName>; -} - -multiclass VOP1_32 <bits<8> op, string opName, list<dag> pattern> - : VOP1_Helper <op, VReg_32, VSrc_32, opName, pattern>; - -multiclass VOP1_64 <bits<8> op, string opName, list<dag> pattern> - : VOP1_Helper <op, VReg_64, VSrc_64, opName, pattern>; - -multiclass VOP1_32_64 <bits<8> op, string opName, list<dag> pattern> - : VOP1_Helper <op, VReg_32, VSrc_64, opName, pattern>; - -multiclass VOP1_64_32 <bits<8> op, string opName, list<dag> pattern> - : VOP1_Helper <op, VReg_64, VSrc_32, opName, pattern>; - -multiclass VOP2_Helper <bits<6> op, RegisterClass vrc, RegisterClass arc, - string opName, list<dag> pattern, string revOp> { - def _e32 : VOP2 < - op, (outs vrc:$dst), (ins arc:$src0, vrc:$src1), - opName#"_e32 $dst, $src0, $src1", pattern - >, VOP <opName>, VOP2_REV<revOp#"_e32", !eq(revOp, opName)>; - - defm _e64 : VOP3_2_m < - op, - (outs vrc:$dst), - (ins 
InputMods:$src0_modifiers, arc:$src0, - InputMods:$src1_modifiers, arc:$src1, - i32imm:$clamp, i32imm:$omod), - opName#"_e64 $dst, $src0_modifiers, $src1_modifiers, $clamp, $omod", [], - opName, revOp>; -} - -multiclass VOP2_32 <bits<6> op, string opName, list<dag> pattern, - string revOp = opName> - : VOP2_Helper <op, VReg_32, VSrc_32, opName, pattern, revOp>; - -multiclass VOP2_64 <bits<6> op, string opName, list<dag> pattern, - string revOp = opName> - : VOP2_Helper <op, VReg_64, VSrc_64, opName, pattern, revOp>; - -multiclass VOP2b_32 <bits<6> op, string opName, list<dag> pattern, - RegisterClass src0_rc, string revOp = opName> { - - def _e32 : VOP2 < - op, (outs VReg_32:$dst), (ins src0_rc:$src0, VReg_32:$src1), - opName#"_e32 $dst, $src0, $src1", pattern - >, VOP <opName>, VOP2_REV<revOp#"_e32", !eq(revOp, opName)>; - - def _e64 : VOP3b < - {1, 0, 0, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}}, - (outs VReg_32:$dst), - (ins InputMods: $src0_modifiers, VSrc_32:$src0, - InputMods:$src1_modifiers, VSrc_32:$src1, - i32imm:$clamp, i32imm:$omod), - opName#"_e64 $dst, $src0_modifiers, $src1_modifiers, $clamp, $omod", [] - >, VOP <opName>, VOP2_REV<revOp#"_e64", !eq(revOp, opName)> { - let src2 = 0; - let src2_modifiers = 0; - /* the VOP2 variant puts the carry out into VCC, the VOP3 variant - can write it into any SGPR. We currently don't use the carry out, - so for now hardcode it to VCC as well */ - let sdst = SIOperand.VCC; - } +multiclass VOP3b_2_m <vop op, dag outs, dag ins, string asm, + list<dag> pattern, string opName, string revOp, + bit HasMods = 1, bit UseFullOp = 0> { + def "" : VOP3_Pseudo <outs, ins, pattern, opName>, + VOP2_REV<revOp#"_e64", !eq(revOp, opName)>; + + // The VOP2 variant puts the carry out into VCC, the VOP3 variant + // can write it into any SGPR. We currently don't use the carry out, + // so for now hardcode it to VCC as well. 
+ let sdst = SIOperand.VCC, Defs = [VCC] in { + def _si : VOP3b <op.SI3, outs, ins, asm, pattern>, + VOP3DisableFields<1, 0, HasMods>, + SIMCInstr<opName#"_e64", SISubtarget.SI>, + VOP2_REV<revOp#"_e64_si", !eq(revOp, opName)>; + + // TODO: Do we need this VI variant here? + /*def _vi : VOP3b_vi <op.VI3, outs, ins, asm, pattern>, + VOP3DisableFields<1, 0, HasMods>, + SIMCInstr<opName#"_e64", SISubtarget.VI>, + VOP2_REV<revOp#"_e64_vi", !eq(revOp, opName)>;*/ + } // End sdst = SIOperand.VCC, Defs = [VCC] } -multiclass VOPC_Helper <bits<8> op, RegisterClass vrc, RegisterClass arc, - string opName, ValueType vt, PatLeaf cond, bit defExec = 0> { - def _e32 : VOPC < - op, (ins arc:$src0, vrc:$src1), - opName#"_e32 $dst, $src0, $src1", [] - >, VOP <opName> { +multiclass VOP3_C_m <vop op, dag outs, dag ins, string asm, + list<dag> pattern, string opName, + bit HasMods, bit defExec> { + + def "" : VOP3_Pseudo <outs, ins, pattern, opName>; + + def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName>, + VOP3DisableFields<1, 0, HasMods> { let Defs = !if(defExec, [EXEC], []); } - def _e64 : VOP3 < - {0, op{7}, op{6}, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}}, - (outs SReg_64:$dst), - (ins InputMods:$src0_modifiers, arc:$src0, - InputMods:$src1_modifiers, arc:$src1, - InstFlag:$clamp, InstFlag:$omod), - opName#"_e64 $dst, $src0_modifiers, $src1_modifiers, $clamp, $omod", - !if(!eq(!cast<string>(cond), "COND_NULL"), []<dag>, - [(set SReg_64:$dst, (i1 (setcc (vt arc:$src0), arc:$src1, cond)))] - ) - >, VOP <opName> { + def _vi : VOP3_Real_vi <op.VI3, outs, ins, asm, opName>, + VOP3DisableFields<1, 0, HasMods> { let Defs = !if(defExec, [EXEC], []); - let src2 = 0; - let src2_modifiers = 0; } } -multiclass VOPC_32 <bits<8> op, string opName, - ValueType vt = untyped, PatLeaf cond = COND_NULL> - : VOPC_Helper <op, VReg_32, VSrc_32, opName, vt, cond>; +multiclass VOP1_Helper <vop1 op, string opName, dag outs, + dag ins32, string asm32, list<dag> pat32, + dag ins64, string asm64, 
list<dag> pat64, + bit HasMods> { + + defm _e32 : VOP1_m <op, outs, ins32, opName#asm32, pat32, opName>; + + defm _e64 : VOP3_1_m <op, outs, ins64, opName#"_e64"#asm64, pat64, opName, HasMods>; +} + +multiclass VOP1Inst <vop1 op, string opName, VOPProfile P, + SDPatternOperator node = null_frag> : VOP1_Helper < + op, opName, P.Outs, + P.Ins32, P.Asm32, [], + P.Ins64, P.Asm64, + !if(P.HasModifiers, + [(set P.DstVT:$dst, (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, + i32:$src0_modifiers, i1:$clamp, i32:$omod))))], + [(set P.DstVT:$dst, (node P.Src0VT:$src0))]), + P.HasModifiers +>; + +multiclass VOP1InstSI <vop1 op, string opName, VOPProfile P, + SDPatternOperator node = null_frag> { + + def _e32 : VOP1 <op.SI, P.Outs, P.Ins32, opName#P.Asm32, []>, + VOP <opName>; + + def _e64 : VOP3Common <P.Outs, P.Ins64, opName#P.Asm64, + !if(P.HasModifiers, + [(set P.DstVT:$dst, (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, + i32:$src0_modifiers, i1:$clamp, i32:$omod))))], + [(set P.DstVT:$dst, (node P.Src0VT:$src0))])>, + VOP <opName>, + VOP3e <op.SI3>, + VOP3DisableFields<0, 0, P.HasModifiers>; +} + +multiclass VOP2_Helper <vop2 op, string opName, dag outs, + dag ins32, string asm32, list<dag> pat32, + dag ins64, string asm64, list<dag> pat64, + string revOpSI, string revOpVI, bit HasMods> { + defm _e32 : VOP2_m <op, outs, ins32, asm32, pat32, opName, revOpSI, revOpVI>; -multiclass VOPC_64 <bits<8> op, string opName, - ValueType vt = untyped, PatLeaf cond = COND_NULL> - : VOPC_Helper <op, VReg_64, VSrc_64, opName, vt, cond>; + defm _e64 : VOP3_2_m <op, + outs, ins64, opName#"_e64"#asm64, pat64, opName, revOpSI, revOpVI, HasMods + >; +} + +multiclass VOP2Inst <vop2 op, string opName, VOPProfile P, + SDPatternOperator node = null_frag, + string revOpSI = opName, string revOpVI = revOpSI> : VOP2_Helper < + op, opName, P.Outs, + P.Ins32, P.Asm32, [], + P.Ins64, P.Asm64, + !if(P.HasModifiers, + [(set P.DstVT:$dst, + (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, + 
i1:$clamp, i32:$omod)), + (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))], + [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1))]), + revOpSI, revOpVI, P.HasModifiers +>; -multiclass VOPCX_32 <bits<8> op, string opName, - ValueType vt = untyped, PatLeaf cond = COND_NULL> - : VOPC_Helper <op, VReg_32, VSrc_32, opName, vt, cond, 1>; +multiclass VOP2b_Helper <vop2 op, string opName, dag outs, + dag ins32, string asm32, list<dag> pat32, + dag ins64, string asm64, list<dag> pat64, + string revOp, bit HasMods> { -multiclass VOPCX_64 <bits<8> op, string opName, - ValueType vt = untyped, PatLeaf cond = COND_NULL> - : VOPC_Helper <op, VReg_64, VSrc_64, opName, vt, cond, 1>; + defm _e32 : VOP2_m <op, outs, ins32, asm32, pat32, opName, revOp, revOp>; + + defm _e64 : VOP3b_2_m <op, + outs, ins64, opName#"_e64"#asm64, pat64, opName, revOp, HasMods + >; +} -multiclass VOP3_32 <bits<9> op, string opName, list<dag> pattern> : VOP3_m < - op, (outs VReg_32:$dst), - (ins InputMods: $src0_modifiers, VSrc_32:$src0, InputMods:$src1_modifiers, - VSrc_32:$src1, InputMods:$src2_modifiers, VSrc_32:$src2, - InstFlag:$clamp, InstFlag:$omod), - opName#" $dst, $src0_modifiers, $src1, $src2, $clamp, $omod", pattern, opName +multiclass VOP2bInst <vop2 op, string opName, VOPProfile P, + SDPatternOperator node = null_frag, + string revOp = opName> : VOP2b_Helper < + op, opName, P.Outs, + P.Ins32, P.Asm32, [], + P.Ins64, P.Asm64, + !if(P.HasModifiers, + [(set P.DstVT:$dst, + (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, + i1:$clamp, i32:$omod)), + (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))], + [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1))]), + revOp, P.HasModifiers >; -class VOP3_64_32 <bits <9> op, string opName, list<dag> pattern> : VOP3 < - op, (outs VReg_64:$dst), - (ins VSrc_64:$src0, VSrc_32:$src1), - opName#" $dst, $src0, $src1", pattern ->, VOP <opName> { +class VOPC_Pseudo <dag outs, dag ins, list<dag> pattern, string 
opName> : + VOPCCommon <ins, "", pattern>, + VOP <opName>, + SIMCInstr<opName#"_e32", SISubtarget.NONE> { + let isPseudo = 1; +} + +multiclass VOPC_m <vopc op, dag outs, dag ins, string asm, list<dag> pattern, + string opName, bit DefExec> { + def "" : VOPC_Pseudo <outs, ins, pattern, opName>; + + def _si : VOPC<op.SI, ins, asm, []>, + SIMCInstr <opName#"_e32", SISubtarget.SI> { + let Defs = !if(DefExec, [EXEC], []); + } + + def _vi : VOPC<op.VI, ins, asm, []>, + SIMCInstr <opName#"_e32", SISubtarget.VI> { + let Defs = !if(DefExec, [EXEC], []); + } +} - let src2 = 0; - let src2_modifiers = 0; - let src0_modifiers = 0; - let clamp = 0; - let omod = 0; +multiclass VOPC_Helper <vopc op, string opName, + dag ins32, string asm32, list<dag> pat32, + dag out64, dag ins64, string asm64, list<dag> pat64, + bit HasMods, bit DefExec> { + defm _e32 : VOPC_m <op, (outs), ins32, opName#asm32, pat32, opName, DefExec>; + + defm _e64 : VOP3_C_m <op, out64, ins64, opName#"_e64"#asm64, pat64, + opName, HasMods, DefExec>; } -class VOP3_64 <bits<9> op, string opName, list<dag> pattern> : VOP3 < - op, (outs VReg_64:$dst), - (ins InputMods:$src0_modifiers, VSrc_64:$src0, - InputMods:$src1_modifiers, VSrc_64:$src1, - InputMods:$src2_modifiers, VSrc_64:$src2, - InstFlag:$clamp, InstFlag:$omod), - opName#" $dst, $src0_modifiers, $src1_modifiers, $src2_modifiers, $clamp, $omod", pattern ->, VOP <opName>; +multiclass VOPCInst <vopc op, string opName, + VOPProfile P, PatLeaf cond = COND_NULL, + bit DefExec = 0> : VOPC_Helper < + op, opName, + P.Ins32, P.Asm32, [], + (outs SReg_64:$dst), P.Ins64, P.Asm64, + !if(P.HasModifiers, + [(set i1:$dst, + (setcc (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, + i1:$clamp, i32:$omod)), + (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)), + cond))], + [(set i1:$dst, (setcc P.Src0VT:$src0, P.Src1VT:$src1, cond))]), + P.HasModifiers, DefExec +>; + +multiclass VOPCClassInst <vopc op, string opName, VOPProfile P, + bit DefExec = 0> : 
VOPC_Helper < + op, opName, + P.Ins32, P.Asm32, [], + (outs SReg_64:$dst), P.Ins64, P.Asm64, + !if(P.HasModifiers, + [(set i1:$dst, + (AMDGPUfp_class (P.Src0VT (VOP3Mods0Clamp0OMod P.Src0VT:$src0, i32:$src0_modifiers)), P.Src1VT:$src1))], + [(set i1:$dst, (AMDGPUfp_class P.Src0VT:$src0, P.Src1VT:$src1))]), + P.HasModifiers, DefExec +>; + + +multiclass VOPC_F32 <vopc op, string opName, PatLeaf cond = COND_NULL> : + VOPCInst <op, opName, VOP_F32_F32_F32, cond>; + +multiclass VOPC_F64 <vopc op, string opName, PatLeaf cond = COND_NULL> : + VOPCInst <op, opName, VOP_F64_F64_F64, cond>; + +multiclass VOPC_I32 <vopc op, string opName, PatLeaf cond = COND_NULL> : + VOPCInst <op, opName, VOP_I32_I32_I32, cond>; + +multiclass VOPC_I64 <vopc op, string opName, PatLeaf cond = COND_NULL> : + VOPCInst <op, opName, VOP_I64_I64_I64, cond>; + + +multiclass VOPCX <vopc op, string opName, VOPProfile P, + PatLeaf cond = COND_NULL> + : VOPCInst <op, opName, P, cond, 1>; + +multiclass VOPCX_F32 <vopc op, string opName, PatLeaf cond = COND_NULL> : + VOPCX <op, opName, VOP_F32_F32_F32, cond>; +multiclass VOPCX_F64 <vopc op, string opName, PatLeaf cond = COND_NULL> : + VOPCX <op, opName, VOP_F64_F64_F64, cond>; -class VOP3b_Helper <bits<9> op, RegisterClass vrc, RegisterClass arc, - string opName, list<dag> pattern> : VOP3 < - op, (outs vrc:$dst0, SReg_64:$dst1), - (ins arc:$src0, arc:$src1, arc:$src2, - InstFlag:$abs, InstFlag:$clamp, InstFlag:$omod, InstFlag:$neg), - opName#" $dst0, $dst1, $src0, $src1, $src2, $abs, $clamp, $omod, $neg", pattern ->, VOP <opName>; +multiclass VOPCX_I32 <vopc op, string opName, PatLeaf cond = COND_NULL> : + VOPCX <op, opName, VOP_I32_I32_I32, cond>; +multiclass VOPCX_I64 <vopc op, string opName, PatLeaf cond = COND_NULL> : + VOPCX <op, opName, VOP_I64_I64_I64, cond>; -class VOP3b_64 <bits<9> op, string opName, list<dag> pattern> : +multiclass VOP3_Helper <vop3 op, string opName, dag outs, dag ins, string asm, + list<dag> pat, int NumSrcArgs, bit HasMods> : 
VOP3_m < + op, outs, ins, opName#asm, pat, opName, NumSrcArgs, HasMods +>; + +multiclass VOPC_CLASS_F32 <vopc op, string opName> : + VOPCClassInst <op, opName, VOP_I1_F32_I32, 0>; + +multiclass VOPCX_CLASS_F32 <vopc op, string opName> : + VOPCClassInst <op, opName, VOP_I1_F32_I32, 1>; + +multiclass VOPC_CLASS_F64 <vopc op, string opName> : + VOPCClassInst <op, opName, VOP_I1_F64_I32, 0>; + +multiclass VOPCX_CLASS_F64 <vopc op, string opName> : + VOPCClassInst <op, opName, VOP_I1_F64_I32, 1>; + +multiclass VOP3Inst <vop3 op, string opName, VOPProfile P, + SDPatternOperator node = null_frag> : VOP3_Helper < + op, opName, P.Outs, P.Ins64, P.Asm64, + !if(!eq(P.NumSrcArgs, 3), + !if(P.HasModifiers, + [(set P.DstVT:$dst, + (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, + i1:$clamp, i32:$omod)), + (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)), + (P.Src2VT (VOP3Mods P.Src2VT:$src2, i32:$src2_modifiers))))], + [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1, + P.Src2VT:$src2))]), + !if(!eq(P.NumSrcArgs, 2), + !if(P.HasModifiers, + [(set P.DstVT:$dst, + (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, + i1:$clamp, i32:$omod)), + (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))], + [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1))]) + /* P.NumSrcArgs == 1 */, + !if(P.HasModifiers, + [(set P.DstVT:$dst, + (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, + i1:$clamp, i32:$omod))))], + [(set P.DstVT:$dst, (node P.Src0VT:$src0))]))), + P.NumSrcArgs, P.HasModifiers +>; + +class VOP3InstVI <bits<10> op, string opName, VOPProfile P, + SDPatternOperator node = null_frag> : VOP3_vi < + op, opName#"_vi", P.Outs, P.Ins64, opName#P.Asm64, + !if(!eq(P.NumSrcArgs, 3), + !if(P.HasModifiers, + [(set P.DstVT:$dst, + (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, + i1:$clamp, i32:$omod)), + (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)), + (P.Src2VT (VOP3Mods P.Src2VT:$src2, 
i32:$src2_modifiers))))], + [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1, + P.Src2VT:$src2))]), + !if(!eq(P.NumSrcArgs, 2), + !if(P.HasModifiers, + [(set P.DstVT:$dst, + (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, + i1:$clamp, i32:$omod)), + (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))], + [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1))]) + /* P.NumSrcArgs == 1 */, + !if(P.HasModifiers, + [(set P.DstVT:$dst, + (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, + i1:$clamp, i32:$omod))))], + [(set P.DstVT:$dst, (node P.Src0VT:$src0))]))), + P.NumSrcArgs, P.HasModifiers +>; + +multiclass VOP3b_Helper <vop op, RegisterClass vrc, RegisterOperand arc, + string opName, list<dag> pattern> : + VOP3b_2_m < + op, (outs vrc:$vdst, SReg_64:$sdst), + (ins InputModsNoDefault:$src0_modifiers, arc:$src0, + InputModsNoDefault:$src1_modifiers, arc:$src1, + InputModsNoDefault:$src2_modifiers, arc:$src2, + ClampMod:$clamp, omod:$omod), + opName#" $vdst, $sdst, $src0_modifiers, $src1_modifiers, $src2_modifiers"#"$clamp"#"$omod", pattern, + opName, opName, 1, 1 +>; + +multiclass VOP3b_64 <vop3 op, string opName, list<dag> pattern> : VOP3b_Helper <op, VReg_64, VSrc_64, opName, pattern>; -class VOP3b_32 <bits<9> op, string opName, list<dag> pattern> : - VOP3b_Helper <op, VReg_32, VSrc_32, opName, pattern>; +multiclass VOP3b_32 <vop3 op, string opName, list<dag> pattern> : + VOP3b_Helper <op, VGPR_32, VSrc_32, opName, pattern>; + + +class Vop3ModPat<Instruction Inst, VOPProfile P, SDPatternOperator node> : Pat< + (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)), + (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)), + (P.Src2VT (VOP3Mods P.Src2VT:$src2, i32:$src2_modifiers))), + (Inst i32:$src0_modifiers, P.Src0VT:$src0, + i32:$src1_modifiers, P.Src1VT:$src1, + i32:$src2_modifiers, P.Src2VT:$src2, + i1:$clamp, + i32:$omod)>; + 
+//===----------------------------------------------------------------------===// +// Interpolation opcodes +//===----------------------------------------------------------------------===// + +class VINTRP_Pseudo <string opName, dag outs, dag ins, string asm, + list<dag> pattern> : + VINTRPCommon <outs, ins, asm, pattern>, + SIMCInstr<opName, SISubtarget.NONE> { + let isPseudo = 1; +} + +class VINTRP_Real_si <bits <2> op, string opName, dag outs, dag ins, + string asm, list<dag> pattern> : + VINTRPCommon <outs, ins, asm, pattern>, + VINTRPe <op>, + SIMCInstr<opName, SISubtarget.SI>; + +class VINTRP_Real_vi <bits <2> op, string opName, dag outs, dag ins, + string asm, list<dag> pattern> : + VINTRPCommon <outs, ins, asm, pattern>, + VINTRPe_vi <op>, + SIMCInstr<opName, SISubtarget.VI>; + +multiclass VINTRP_m <bits <2> op, string opName, dag outs, dag ins, string asm, + string disableEncoding = "", string constraints = "", + list<dag> pattern = []> { + let DisableEncoding = disableEncoding, + Constraints = constraints in { + def "" : VINTRP_Pseudo <opName, outs, ins, asm, pattern>; + + def _si : VINTRP_Real_si <op, opName, outs, ins, asm, pattern>; + + def _vi : VINTRP_Real_vi <op, opName, outs, ins, asm, pattern>; + } +} //===----------------------------------------------------------------------===// // Vector I/O classes //===----------------------------------------------------------------------===// -class DS_1A <bits<8> op, dag outs, dag ins, string asm, list<dag> pat> : - DS <op, outs, ins, asm, pat> { +class DS_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> : + DS <outs, ins, "", pattern>, + SIMCInstr <opName, SISubtarget.NONE> { + let isPseudo = 1; +} + +class DS_Real_si <bits<8> op, string opName, dag outs, dag ins, string asm> : + DS <outs, ins, asm, []>, + DSe <op>, + SIMCInstr <opName, SISubtarget.SI>; + +class DS_Real_vi <bits<8> op, string opName, dag outs, dag ins, string asm> : + DS <outs, ins, asm, []>, + DSe_vi <op>, + SIMCInstr <opName, 
SISubtarget.VI>; + +class DS_1A_Real_si <bits<8> op, string opName, dag outs, dag ins, string asm> : + DS <outs, ins, asm, []>, + DSe <op>, + SIMCInstr <opName, SISubtarget.SI> { + + // Single load interpret the 2 i8imm operands as a single i16 offset. bits<16> offset; + let offset0 = offset{7-0}; + let offset1 = offset{15-8}; +} + +class DS_1A_Real_vi <bits<8> op, string opName, dag outs, dag ins, string asm> : + DS <outs, ins, asm, []>, + DSe_vi <op>, + SIMCInstr <opName, SISubtarget.VI> { // Single load interpret the 2 i8imm operands as a single i16 offset. + bits<16> offset; let offset0 = offset{7-0}; let offset1 = offset{15-8}; } -class DS_Load_Helper <bits<8> op, string asm, RegisterClass regClass> : DS_1A < +multiclass DS_1A_Load_m <bits<8> op, string opName, dag outs, dag ins, string asm, + list<dag> pat> { + let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in { + def "" : DS_Pseudo <opName, outs, ins, pat>; + + let data0 = 0, data1 = 0 in { + def _si : DS_1A_Real_si <op, opName, outs, ins, asm>; + def _vi : DS_1A_Real_vi <op, opName, outs, ins, asm>; + } + } +} + +multiclass DS_Load_Helper <bits<8> op, string asm, RegisterClass regClass> + : DS_1A_Load_m < op, + asm, (outs regClass:$vdst), - (ins i1imm:$gds, VReg_32:$addr, u16imm:$offset), - asm#" $vdst, $addr, $offset, [M0]", - []> { - let data0 = 0; - let data1 = 0; - let mayLoad = 1; - let mayStore = 0; + (ins i1imm:$gds, VGPR_32:$addr, ds_offset:$offset, M0Reg:$m0), + asm#" $vdst, $addr"#"$offset"#" [M0]", + []>; + +multiclass DS_Load2_m <bits<8> op, string opName, dag outs, dag ins, string asm, + list<dag> pat> { + let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in { + def "" : DS_Pseudo <opName, outs, ins, pat>; + + let data0 = 0, data1 = 0 in { + def _si : DS_Real_si <op, opName, outs, ins, asm>; + def _vi : DS_Real_vi <op, opName, outs, ins, asm>; + } + } } -class DS_Load2_Helper <bits<8> op, string asm, RegisterClass regClass> : DS < +multiclass DS_Load2_Helper <bits<8> op, string asm, 
RegisterClass regClass> + : DS_Load2_m < op, + asm, (outs regClass:$vdst), - (ins i1imm:$gds, VReg_32:$addr, u8imm:$offset0, u8imm:$offset1), - asm#" $gds, $vdst, $addr, $offset0, $offset1, [M0]", - []> { - let data0 = 0; - let data1 = 0; - let mayLoad = 1; - let mayStore = 0; + (ins i1imm:$gds, VGPR_32:$addr, ds_offset0:$offset0, ds_offset1:$offset1, + M0Reg:$m0), + asm#" $vdst, $addr"#"$offset0"#"$offset1 [M0]", + []>; + +multiclass DS_1A_Store_m <bits<8> op, string opName, dag outs, dag ins, + string asm, list<dag> pat> { + let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in { + def "" : DS_Pseudo <opName, outs, ins, pat>; + + let data1 = 0, vdst = 0 in { + def _si : DS_1A_Real_si <op, opName, outs, ins, asm>; + def _vi : DS_1A_Real_vi <op, opName, outs, ins, asm>; + } + } } -class DS_Store_Helper <bits<8> op, string asm, RegisterClass regClass> : DS_1A < +multiclass DS_Store_Helper <bits<8> op, string asm, RegisterClass regClass> + : DS_1A_Store_m < op, + asm, (outs), - (ins i1imm:$gds, VReg_32:$addr, regClass:$data0, u16imm:$offset), - asm#" $addr, $data0, $offset [M0]", - []> { - let data1 = 0; - let mayStore = 1; - let mayLoad = 0; - let vdst = 0; + (ins i1imm:$gds, VGPR_32:$addr, regClass:$data0, ds_offset:$offset, M0Reg:$m0), + asm#" $addr, $data0"#"$offset"#" [M0]", + []>; + +multiclass DS_Store_m <bits<8> op, string opName, dag outs, dag ins, + string asm, list<dag> pat> { + let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in { + def "" : DS_Pseudo <opName, outs, ins, pat>; + + let vdst = 0 in { + def _si : DS_Real_si <op, opName, outs, ins, asm>; + def _vi : DS_Real_vi <op, opName, outs, ins, asm>; + } + } } -class DS_Store2_Helper <bits<8> op, string asm, RegisterClass regClass> : DS_1A < +multiclass DS_Store2_Helper <bits<8> op, string asm, RegisterClass regClass> + : DS_Store_m < op, + asm, (outs), - (ins i1imm:$gds, VReg_32:$addr, regClass:$data0, u8imm:$offset0, u8imm:$offset1), - asm#" $addr, $data0, $data1, $offset0, $offset1 [M0]", - []> { - 
let mayStore = 1; - let mayLoad = 0; - let vdst = 0; + (ins i1imm:$gds, VGPR_32:$addr, regClass:$data0, regClass:$data1, + ds_offset0:$offset0, ds_offset1:$offset1, M0Reg:$m0), + asm#" $addr, $data0, $data1"#"$offset0"#"$offset1 [M0]", + []>; + +class DS_1A_si <bits<8> op, dag outs, dag ins, string asm, list<dag> pat> : + DS_si <op, outs, ins, asm, pat> { + bits<16> offset; + + // Single load interpret the 2 i8imm operands as a single i16 offset. + let offset0 = offset{7-0}; + let offset1 = offset{15-8}; + + let hasSideEffects = 0; } // 1 address, 1 data. -class DS_1A1D_RET <bits<8> op, string asm, RegisterClass rc> : DS_1A < +class DS_1A1D_RET <bits<8> op, string asm, RegisterClass rc, string noRetOp = ""> : DS_1A_si < op, (outs rc:$vdst), - (ins i1imm:$gds, VReg_32:$addr, rc:$data0, u16imm:$offset), - asm#" $vdst, $addr, $data0, $offset, [M0]", - []> { + (ins i1imm:$gds, VGPR_32:$addr, rc:$data0, ds_offset:$offset, M0Reg:$m0), + asm#" $vdst, $addr, $data0"#"$offset"#" [M0]", []>, + AtomicNoRet<noRetOp, 1> { let data1 = 0; let mayStore = 1; let mayLoad = 1; + + let hasPostISelHook = 1; // Adjusted to no return version. } // 1 address, 2 data. -class DS_1A2D_RET <bits<8> op, string asm, RegisterClass rc> : DS_1A < +class DS_1A2D_RET <bits<8> op, string asm, RegisterClass rc, string noRetOp = ""> : DS_1A_si < op, (outs rc:$vdst), - (ins i1imm:$gds, VReg_32:$addr, rc:$data0, rc:$data1, u16imm:$offset), - asm#" $vdst, $addr, $data0, $data1, $offset, [M0]", - []> { + (ins i1imm:$gds, VGPR_32:$addr, rc:$data0, rc:$data1, ds_offset:$offset, M0Reg:$m0), + asm#" $vdst, $addr, $data0, $data1"#"$offset"#" [M0]", + []>, + AtomicNoRet<noRetOp, 1> { let mayStore = 1; let mayLoad = 1; + let hasPostISelHook = 1; // Adjusted to no return version. } // 1 address, 2 data. 
-class DS_1A2D_NORET <bits<8> op, string asm, RegisterClass rc> : DS_1A < +class DS_1A2D_NORET <bits<8> op, string asm, RegisterClass rc, string noRetOp = asm> : DS_1A_si < op, (outs), - (ins i1imm:$gds, VReg_32:$addr, rc:$data0, rc:$data1, u16imm:$offset), - asm#" $addr, $data0, $data1, $offset, [M0]", - []> { + (ins i1imm:$gds, VGPR_32:$addr, rc:$data0, rc:$data1, ds_offset:$offset, M0Reg:$m0), + asm#" $addr, $data0, $data1"#"$offset"#" [M0]", + []>, + AtomicNoRet<noRetOp, 0> { let mayStore = 1; let mayLoad = 1; } // 1 address, 1 data. -class DS_1A1D_NORET <bits<8> op, string asm, RegisterClass rc> : DS_1A < +class DS_1A1D_NORET <bits<8> op, string asm, RegisterClass rc, string noRetOp = asm> : DS_1A_si < op, (outs), - (ins i1imm:$gds, VReg_32:$addr, rc:$data0, u16imm:$offset), - asm#" $addr, $data0, $offset, [M0]", - []> { + (ins i1imm:$gds, VGPR_32:$addr, rc:$data0, ds_offset:$offset, M0Reg:$m0), + asm#" $addr, $data0"#"$offset"#" [M0]", + []>, + AtomicNoRet<noRetOp, 0> { let data1 = 0; let mayStore = 1; let mayLoad = 1; } -class MTBUF_Store_Helper <bits<3> op, string asm, RegisterClass regClass> : MTBUF < - op, - (outs), +//===----------------------------------------------------------------------===// +// MTBUF classes +//===----------------------------------------------------------------------===// + +class MTBUF_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> : + MTBUF <outs, ins, "", pattern>, + SIMCInstr<opName, SISubtarget.NONE> { + let isPseudo = 1; +} + +class MTBUF_Real_si <bits<3> op, string opName, dag outs, dag ins, + string asm> : + MTBUF <outs, ins, asm, []>, + MTBUFe <op>, + SIMCInstr<opName, SISubtarget.SI>; + +class MTBUF_Real_vi <bits<4> op, string opName, dag outs, dag ins, string asm> : + MTBUF <outs, ins, asm, []>, + MTBUFe_vi <op>, + SIMCInstr <opName, SISubtarget.VI>; + +multiclass MTBUF_m <bits<3> op, string opName, dag outs, dag ins, string asm, + list<dag> pattern> { + + def "" : MTBUF_Pseudo <opName, outs, ins, pattern>; 
+ + def _si : MTBUF_Real_si <op, opName, outs, ins, asm>; + + def _vi : MTBUF_Real_vi <{0, op{2}, op{1}, op{0}}, opName, outs, ins, asm>; + +} + +let mayStore = 1, mayLoad = 0 in { + +multiclass MTBUF_Store_Helper <bits<3> op, string opName, + RegisterClass regClass> : MTBUF_m < + op, opName, (outs), (ins regClass:$vdata, u16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, - i1imm:$addr64, i8imm:$dfmt, i8imm:$nfmt, VReg_32:$vaddr, - SReg_128:$srsrc, i1imm:$slc, i1imm:$tfe, SSrc_32:$soffset), - asm#" $vdata, $offset, $offen, $idxen, $glc, $addr64, $dfmt," - #" $nfmt, $vaddr, $srsrc, $slc, $tfe, $soffset", - []> { - let mayStore = 1; - let mayLoad = 0; + i1imm:$addr64, i8imm:$dfmt, i8imm:$nfmt, VGPR_32:$vaddr, + SReg_128:$srsrc, i1imm:$slc, i1imm:$tfe, SCSrc_32:$soffset), + opName#" $vdata, $offset, $offen, $idxen, $glc, $addr64, $dfmt," + #" $nfmt, $vaddr, $srsrc, $slc, $tfe, $soffset", [] +>; + +} // mayStore = 1, mayLoad = 0 + +let mayLoad = 1, mayStore = 0 in { + +multiclass MTBUF_Load_Helper <bits<3> op, string opName, + RegisterClass regClass> : MTBUF_m < + op, opName, (outs regClass:$dst), + (ins u16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$addr64, + i8imm:$dfmt, i8imm:$nfmt, VGPR_32:$vaddr, SReg_128:$srsrc, + i1imm:$slc, i1imm:$tfe, SCSrc_32:$soffset), + opName#" $dst, $offset, $offen, $idxen, $glc, $addr64, $dfmt," + #" $nfmt, $vaddr, $srsrc, $slc, $tfe, $soffset", [] +>; + +} // mayLoad = 1, mayStore = 0 + +//===----------------------------------------------------------------------===// +// MUBUF classes +//===----------------------------------------------------------------------===// + +class MUBUF_si <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> : + MUBUF <outs, ins, asm, pattern>, MUBUFe <op> { + let lds = 0; +} + +class MUBUF_vi <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> : + MUBUF <outs, ins, asm, pattern>, MUBUFe_vi <op> { + let lds = 0; +} + +class MUBUFAddr64Table <bit is_addr64, string 
suffix = ""> { + + bit IsAddr64 = is_addr64; + string OpName = NAME # suffix; +} + +class MUBUFAtomicAddr64 <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> + : MUBUF_si <op, outs, ins, asm, pattern> { + + let offen = 0; + let idxen = 0; + let addr64 = 1; + let tfe = 0; + let lds = 0; + let soffset = 128; +} + +class MUBUFAtomicOffset <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> + : MUBUF_si <op, outs, ins, asm, pattern> { + + let offen = 0; + let idxen = 0; + let addr64 = 0; + let tfe = 0; + let lds = 0; + let vaddr = 0; +} + +multiclass MUBUF_Atomic <bits<7> op, string name, RegisterClass rc, + ValueType vt, SDPatternOperator atomic> { + + let mayStore = 1, mayLoad = 1, hasPostISelHook = 1 in { + + // No return variants + let glc = 0 in { + + def _ADDR64 : MUBUFAtomicAddr64 < + op, (outs), + (ins rc:$vdata, SReg_128:$srsrc, VReg_64:$vaddr, + mbuf_offset:$offset, slc:$slc), + name#" $vdata, $vaddr, $srsrc, 0 addr64"#"$offset"#"$slc", [] + >, MUBUFAddr64Table<1>, AtomicNoRet<NAME#"_ADDR64", 0>; + + def _OFFSET : MUBUFAtomicOffset < + op, (outs), + (ins rc:$vdata, SReg_128:$srsrc, mbuf_offset:$offset, + SCSrc_32:$soffset, slc:$slc), + name#" $vdata, $srsrc, $soffset"#"$offset"#"$slc", [] + >, MUBUFAddr64Table<0>, AtomicNoRet<NAME#"_OFFSET", 0>; + } // glc = 0 + + // Variant that return values + let glc = 1, Constraints = "$vdata = $vdata_in", + DisableEncoding = "$vdata_in" in { + + def _RTN_ADDR64 : MUBUFAtomicAddr64 < + op, (outs rc:$vdata), + (ins rc:$vdata_in, SReg_128:$srsrc, VReg_64:$vaddr, + mbuf_offset:$offset, slc:$slc), + name#" $vdata, $vaddr, $srsrc, 0 addr64"#"$offset"#" glc"#"$slc", + [(set vt:$vdata, + (atomic (MUBUFAddr64Atomic v4i32:$srsrc, i64:$vaddr, i16:$offset, + i1:$slc), vt:$vdata_in))] + >, MUBUFAddr64Table<1, "_RTN">, AtomicNoRet<NAME#"_ADDR64", 1>; + + def _RTN_OFFSET : MUBUFAtomicOffset < + op, (outs rc:$vdata), + (ins rc:$vdata_in, SReg_128:$srsrc, mbuf_offset:$offset, + SCSrc_32:$soffset, slc:$slc), + 
name#" $vdata, $srsrc, $soffset"#"$offset"#" glc $slc", + [(set vt:$vdata, + (atomic (MUBUFOffsetAtomic v4i32:$srsrc, i32:$soffset, i16:$offset, + i1:$slc), vt:$vdata_in))] + >, MUBUFAddr64Table<0, "_RTN">, AtomicNoRet<NAME#"_OFFSET", 1>; + + } // glc = 1 + + } // mayStore = 1, mayLoad = 1, hasPostISelHook = 1 } multiclass MUBUF_Load_Helper <bits<7> op, string asm, RegisterClass regClass, ValueType load_vt = i32, SDPatternOperator ld = null_frag> { - let lds = 0, mayLoad = 1 in { + let mayLoad = 1, mayStore = 0 in { let addr64 = 0 in { let offen = 0, idxen = 0, vaddr = 0 in { - def _OFFSET : MUBUF <op, (outs regClass:$vdata), + def _OFFSET : MUBUF_si <op, (outs regClass:$vdata), (ins SReg_128:$srsrc, - u16imm:$offset, SSrc_32:$soffset, i1imm:$glc, - i1imm:$slc, i1imm:$tfe), - asm#" $vdata, $srsrc + $offset + $soffset, glc=$glc, slc=$slc, tfe=$tfe", []>; + mbuf_offset:$offset, SCSrc_32:$soffset, glc:$glc, + slc:$slc, tfe:$tfe), + asm#" $vdata, $srsrc, $soffset"#"$offset"#"$glc"#"$slc"#"$tfe", + [(set load_vt:$vdata, (ld (MUBUFOffset v4i32:$srsrc, + i32:$soffset, i16:$offset, + i1:$glc, i1:$slc, i1:$tfe)))]>, + MUBUFAddr64Table<0>; } let offen = 1, idxen = 0 in { - def _OFFEN : MUBUF <op, (outs regClass:$vdata), - (ins SReg_128:$srsrc, VReg_32:$vaddr, - SSrc_32:$soffset, u16imm:$offset, i1imm:$glc, i1imm:$slc, - i1imm:$tfe), - asm#" $vdata, $srsrc + $vaddr + $soffset + $offset, glc=$glc, slc=$slc, tfe=$tfe", []>; + def _OFFEN : MUBUF_si <op, (outs regClass:$vdata), + (ins SReg_128:$srsrc, VGPR_32:$vaddr, + SCSrc_32:$soffset, mbuf_offset:$offset, glc:$glc, slc:$slc, + tfe:$tfe), + asm#" $vdata, $vaddr, $srsrc, $soffset offen"#"$offset"#"$glc"#"$slc"#"$tfe", []>; } let offen = 0, idxen = 1 in { - def _IDXEN : MUBUF <op, (outs regClass:$vdata), - (ins SReg_128:$srsrc, VReg_32:$vaddr, - u16imm:$offset, SSrc_32:$soffset, i1imm:$glc, - i1imm:$slc, i1imm:$tfe), - asm#" $vdata, $srsrc[$vaddr] + $offset + $soffset, glc=$glc, slc=$slc, tfe=$tfe", []>; + def _IDXEN : MUBUF_si 
<op, (outs regClass:$vdata), + (ins SReg_128:$srsrc, VGPR_32:$vaddr, + mbuf_offset:$offset, SCSrc_32:$soffset, glc:$glc, + slc:$slc, tfe:$tfe), + asm#" $vdata, $vaddr, $srsrc, $soffset idxen"#"$offset"#"$glc"#"$slc"#"$tfe", []>; } let offen = 1, idxen = 1 in { - def _BOTHEN : MUBUF <op, (outs regClass:$vdata), + def _BOTHEN : MUBUF_si <op, (outs regClass:$vdata), (ins SReg_128:$srsrc, VReg_64:$vaddr, - SSrc_32:$soffset, i1imm:$glc, - i1imm:$slc, i1imm:$tfe), - asm#" $vdata, $srsrc[$vaddr[0]] + $vaddr[1] + $soffset, glc=$glc, slc=$slc, tfe=$tfe", []>; + SCSrc_32:$soffset, glc:$glc, slc:$slc, tfe:$tfe), + asm#" $vdata, $vaddr, $srsrc, $soffset, idxen offen"#"$glc"#"$slc"#"$tfe", []>; } } let offen = 0, idxen = 0, addr64 = 1, glc = 0, slc = 0, tfe = 0, soffset = 128 /* ZERO */ in { - def _ADDR64 : MUBUF <op, (outs regClass:$vdata), - (ins SReg_128:$srsrc, VReg_64:$vaddr, u16imm:$offset), - asm#" $vdata, $srsrc + $vaddr + $offset", + def _ADDR64 : MUBUF_si <op, (outs regClass:$vdata), + (ins SReg_128:$srsrc, VReg_64:$vaddr, mbuf_offset:$offset), + asm#" $vdata, $vaddr, $srsrc, 0 addr64"#"$offset", [(set load_vt:$vdata, (ld (MUBUFAddr64 v4i32:$srsrc, - i64:$vaddr, u16imm:$offset)))]>; + i64:$vaddr, i16:$offset)))]>, MUBUFAddr64Table<1>; + } + } +} + +multiclass MUBUF_Load_Helper_vi <bits<7> op, string asm, RegisterClass regClass, + ValueType load_vt = i32, + SDPatternOperator ld = null_frag> { + + let lds = 0, mayLoad = 1 in { + let offen = 0, idxen = 0, vaddr = 0 in { + def _OFFSET : MUBUF_vi <op, (outs regClass:$vdata), + (ins SReg_128:$srsrc, + mbuf_offset:$offset, SCSrc_32:$soffset, glc:$glc, + slc:$slc, tfe:$tfe), + asm#" $vdata, $srsrc, $soffset"#"$offset"#"$glc"#"$slc"#"$tfe", + [(set load_vt:$vdata, (ld (MUBUFOffset v4i32:$srsrc, + i32:$soffset, i16:$offset, + i1:$glc, i1:$slc, i1:$tfe)))]>, + MUBUFAddr64Table<0>; + } + + let offen = 1, idxen = 0 in { + def _OFFEN : MUBUF_vi <op, (outs regClass:$vdata), + (ins SReg_128:$srsrc, VGPR_32:$vaddr, + 
SCSrc_32:$soffset, mbuf_offset:$offset, glc:$glc, slc:$slc, + tfe:$tfe), + asm#" $vdata, $vaddr, $srsrc, $soffset offen"#"$offset"#"$glc"#"$slc"#"$tfe", []>; + } + + let offen = 0, idxen = 1 in { + def _IDXEN : MUBUF_vi <op, (outs regClass:$vdata), + (ins SReg_128:$srsrc, VGPR_32:$vaddr, + mbuf_offset:$offset, SCSrc_32:$soffset, glc:$glc, + slc:$slc, tfe:$tfe), + asm#" $vdata, $vaddr, $srsrc, $soffset idxen"#"$offset"#"$glc"#"$slc"#"$tfe", []>; + } + + let offen = 1, idxen = 1 in { + def _BOTHEN : MUBUF_vi <op, (outs regClass:$vdata), + (ins SReg_128:$srsrc, VReg_64:$vaddr, + SCSrc_32:$soffset, glc:$glc, slc:$slc, tfe:$tfe), + asm#" $vdata, $vaddr, $srsrc, $soffset, idxen offen"#"$glc"#"$slc"#"$tfe", []>; } } } @@ -679,23 +1763,51 @@ multiclass MUBUF_Load_Helper <bits<7> op, string asm, RegisterClass regClass, multiclass MUBUF_Store_Helper <bits<7> op, string name, RegisterClass vdataClass, ValueType store_vt, SDPatternOperator st> { - def "" : MUBUF < - op, (outs), - (ins vdataClass:$vdata, SReg_128:$srsrc, VReg_32:$vaddr, SSrc_32:$soffset, - u16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$slc, - i1imm:$tfe), - name#" $vdata, $srsrc, $vaddr, $soffset, $offset $offen $idxen $glc $slc $tfe", - [] - > { - let addr64 = 0; - } + let mayLoad = 0, mayStore = 1 in { + let addr64 = 0 in { + + def "" : MUBUF_si < + op, (outs), + (ins vdataClass:$vdata, SReg_128:$srsrc, VGPR_32:$vaddr, SCSrc_32:$soffset, + mbuf_offset:$offset, offen:$offen, idxen:$idxen, glc:$glc, slc:$slc, + tfe:$tfe), + name#" $vdata, $vaddr, $srsrc, $soffset"#"$offen"#"$idxen"#"$offset"# + "$glc"#"$slc"#"$tfe", + [] + >; - def _ADDR64 : MUBUF < + let offen = 0, idxen = 0, vaddr = 0 in { + def _OFFSET : MUBUF_si < + op, (outs), + (ins vdataClass:$vdata, SReg_128:$srsrc, mbuf_offset:$offset, + SCSrc_32:$soffset, glc:$glc, slc:$slc, tfe:$tfe), + name#" $vdata, $srsrc, $soffset"#"$offset"#"$glc"#"$slc"#"$tfe", + [(st store_vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset, + i16:$offset, 
i1:$glc, i1:$slc, + i1:$tfe))] + >, MUBUFAddr64Table<0>; + } // offen = 0, idxen = 0, vaddr = 0 + + let offen = 1, idxen = 0 in { + def _OFFEN : MUBUF_si < + op, (outs), + (ins vdataClass:$vdata, SReg_128:$srsrc, VGPR_32:$vaddr, SCSrc_32:$soffset, + mbuf_offset:$offset, glc:$glc, slc:$slc, tfe:$tfe), + name#" $vdata, $vaddr, $srsrc, $soffset offen"#"$offset"# + "$glc"#"$slc"#"$tfe", + [] + >; + } // end offen = 1, idxen = 0 + + } // End addr64 = 0 + + def _ADDR64 : MUBUF_si < op, (outs), - (ins vdataClass:$vdata, SReg_128:$srsrc, VReg_64:$vaddr, u16imm:$offset), - name#" $vdata, $srsrc + $vaddr + $offset", + (ins vdataClass:$vdata, SReg_128:$srsrc, VReg_64:$vaddr, mbuf_offset:$offset), + name#" $vdata, $vaddr, $srsrc, 0 addr64"#"$offset", [(st store_vt:$vdata, - (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, u16imm:$offset))]> { + (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i16:$offset))]>, MUBUFAddr64Table<1> + { let mayLoad = 0; let mayStore = 1; @@ -705,24 +1817,35 @@ multiclass MUBUF_Store_Helper <bits<7> op, string name, RegisterClass vdataClass let idxen = 0; let glc = 0; let addr64 = 1; - let lds = 0; let slc = 0; let tfe = 0; let soffset = 128; // ZERO } + } // End mayLoad = 0, mayStore = 1 } -class MTBUF_Load_Helper <bits<3> op, string asm, RegisterClass regClass> : MTBUF < - op, - (outs regClass:$dst), - (ins u16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$addr64, - i8imm:$dfmt, i8imm:$nfmt, VReg_32:$vaddr, SReg_128:$srsrc, - i1imm:$slc, i1imm:$tfe, SSrc_32:$soffset), - asm#" $dst, $offset, $offen, $idxen, $glc, $addr64, $dfmt," - #" $nfmt, $vaddr, $srsrc, $slc, $tfe, $soffset", - []> { +class FLAT_Load_Helper <bits<7> op, string asm, RegisterClass regClass> : + FLAT <op, (outs regClass:$data), + (ins VReg_64:$addr), + asm#" $data, $addr, [M0, FLAT_SCRATCH]", []> { + let glc = 0; + let slc = 0; + let tfe = 0; let mayLoad = 1; - let mayStore = 0; +} + +class FLAT_Store_Helper <bits<7> op, string name, RegisterClass vdataClass> : + FLAT <op, (outs), (ins 
vdataClass:$data, VReg_64:$addr), + name#" $data, $addr, [M0, FLAT_SCRATCH]", + []> { + + let mayLoad = 0; + let mayStore = 1; + + // Encoding + let glc = 0; + let slc = 0; + let tfe = 0; } class MIMG_Mask <string op, int channels> { @@ -750,7 +1873,7 @@ class MIMG_NoSampler_Helper <bits<7> op, string asm, multiclass MIMG_NoSampler_Src_Helper <bits<7> op, string asm, RegisterClass dst_rc, int channels> { - def _V1 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_32>, + def _V1 : MIMG_NoSampler_Helper <op, asm, dst_rc, VGPR_32>, MIMG_Mask<asm#"_V1", channels>; def _V2 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_64>, MIMG_Mask<asm#"_V2", channels>; @@ -759,7 +1882,7 @@ multiclass MIMG_NoSampler_Src_Helper <bits<7> op, string asm, } multiclass MIMG_NoSampler <bits<7> op, string asm> { - defm _V1 : MIMG_NoSampler_Src_Helper <op, asm, VReg_32, 1>; + defm _V1 : MIMG_NoSampler_Src_Helper <op, asm, VGPR_32, 1>; defm _V2 : MIMG_NoSampler_Src_Helper <op, asm, VReg_64, 2>; defm _V3 : MIMG_NoSampler_Src_Helper <op, asm, VReg_96, 3>; defm _V4 : MIMG_NoSampler_Src_Helper <op, asm, VReg_128, 4>; @@ -784,7 +1907,7 @@ class MIMG_Sampler_Helper <bits<7> op, string asm, multiclass MIMG_Sampler_Src_Helper <bits<7> op, string asm, RegisterClass dst_rc, int channels> { - def _V1 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_32>, + def _V1 : MIMG_Sampler_Helper <op, asm, dst_rc, VGPR_32>, MIMG_Mask<asm#"_V1", channels>; def _V2 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_64>, MIMG_Mask<asm#"_V2", channels>; @@ -797,7 +1920,7 @@ multiclass MIMG_Sampler_Src_Helper <bits<7> op, string asm, } multiclass MIMG_Sampler <bits<7> op, string asm> { - defm _V1 : MIMG_Sampler_Src_Helper<op, asm, VReg_32, 1>; + defm _V1 : MIMG_Sampler_Src_Helper<op, asm, VGPR_32, 1>; defm _V2 : MIMG_Sampler_Src_Helper<op, asm, VReg_64, 2>; defm _V3 : MIMG_Sampler_Src_Helper<op, asm, VReg_96, 3>; defm _V4 : MIMG_Sampler_Src_Helper<op, asm, VReg_128, 4>; @@ -831,7 +1954,7 @@ class MIMG_Gather_Helper <bits<7> op, 
string asm, multiclass MIMG_Gather_Src_Helper <bits<7> op, string asm, RegisterClass dst_rc, int channels> { - def _V1 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_32>, + def _V1 : MIMG_Gather_Helper <op, asm, dst_rc, VGPR_32>, MIMG_Mask<asm#"_V1", channels>; def _V2 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_64>, MIMG_Mask<asm#"_V2", channels>; @@ -844,7 +1967,7 @@ multiclass MIMG_Gather_Src_Helper <bits<7> op, string asm, } multiclass MIMG_Gather <bits<7> op, string asm> { - defm _V1 : MIMG_Gather_Src_Helper<op, asm, VReg_32, 1>; + defm _V1 : MIMG_Gather_Src_Helper<op, asm, VGPR_32, 1>; defm _V2 : MIMG_Gather_Src_Helper<op, asm, VReg_64, 2>; defm _V3 : MIMG_Gather_Src_Helper<op, asm, VReg_96, 3>; defm _V4 : MIMG_Gather_Src_Helper<op, asm, VReg_128, 4>; @@ -898,20 +2021,40 @@ def getCommuteOrig : InstrMapping { let ValueCols = [["1"]]; } -def isDS : InstrMapping { - let FilterClass = "DS"; - let RowFields = ["Inst"]; - let ColFields = ["Size"]; - let KeyCol = ["8"]; - let ValueCols = [["8"]]; -} - -def getMCOpcode : InstrMapping { +def getMCOpcodeGen : InstrMapping { let FilterClass = "SIMCInstr"; let RowFields = ["PseudoInstr"]; let ColFields = ["Subtarget"]; let KeyCol = [!cast<string>(SISubtarget.NONE)]; - let ValueCols = [[!cast<string>(SISubtarget.SI)]]; + let ValueCols = [[!cast<string>(SISubtarget.SI)],[!cast<string>(SISubtarget.VI)]]; +} + +def getAddr64Inst : InstrMapping { + let FilterClass = "MUBUFAddr64Table"; + let RowFields = ["OpName"]; + let ColFields = ["IsAddr64"]; + let KeyCol = ["0"]; + let ValueCols = [["1"]]; +} + +// Maps an atomic opcode to its version with a return value. +def getAtomicRetOp : InstrMapping { + let FilterClass = "AtomicNoRet"; + let RowFields = ["NoRetOp"]; + let ColFields = ["IsRet"]; + let KeyCol = ["0"]; + let ValueCols = [["1"]]; +} + +// Maps an atomic opcode to its returnless version. 
+def getAtomicNoRetOp : InstrMapping { + let FilterClass = "AtomicNoRet"; + let RowFields = ["NoRetOp"]; + let ColFields = ["IsRet"]; + let KeyCol = ["1"]; + let ValueCols = [["0"]]; } include "SIInstructions.td" +include "CIInstructions.td" +include "VIInstructions.td" diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td index aecd847a2ba1..e05b6bb7d0f1 100644 --- a/lib/Target/R600/SIInstructions.td +++ b/lib/Target/R600/SIInstructions.td @@ -26,18 +26,37 @@ def SendMsgImm : Operand<i32> { let PrintMethod = "printSendMsg"; } -def isSI : Predicate<"Subtarget.getGeneration() " +def isGCN : Predicate<"Subtarget.getGeneration() " ">= AMDGPUSubtarget::SOUTHERN_ISLANDS">; - +def isSICI : Predicate< + "Subtarget.getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS ||" + "Subtarget.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS" +>; def isCI : Predicate<"Subtarget.getGeneration() " ">= AMDGPUSubtarget::SEA_ISLANDS">; +def isVI : Predicate < + "Subtarget.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS" +>; -def isCFDepth0 : Predicate<"isCFDepth0()">; +def HasFlatAddressSpace : Predicate<"Subtarget.hasFlatAddressSpace()">; -def WAIT_FLAG : InstFlag<"printWaitFlag">; +def SWaitMatchClass : AsmOperandClass { + let Name = "SWaitCnt"; + let RenderMethod = "addImmOperands"; + let ParserMethod = "parseSWaitCntOps"; +} -let SubtargetPredicate = isSI in { -let OtherPredicates = [isCFDepth0] in { +def WAIT_FLAG : InstFlag<"printWaitFlag"> { + let ParserMatchClass = SWaitMatchClass; +} + +let SubtargetPredicate = isGCN in { + +//===----------------------------------------------------------------------===// +// EXP Instructions +//===----------------------------------------------------------------------===// + +defm EXP : EXP_m; //===----------------------------------------------------------------------===// // SMRD Instructions @@ -48,129 +67,135 @@ let mayLoad = 1 in { // We are using the SGPR_32 and not the SReg_32 register class for 32-bit // 
SMRD instructions, because the SGPR_32 register class does not include M0 // and writing to M0 from an SMRD instruction will hang the GPU. -defm S_LOAD_DWORD : SMRD_Helper <0x00, "S_LOAD_DWORD", SReg_64, SGPR_32>; -defm S_LOAD_DWORDX2 : SMRD_Helper <0x01, "S_LOAD_DWORDX2", SReg_64, SReg_64>; -defm S_LOAD_DWORDX4 : SMRD_Helper <0x02, "S_LOAD_DWORDX4", SReg_64, SReg_128>; -defm S_LOAD_DWORDX8 : SMRD_Helper <0x03, "S_LOAD_DWORDX8", SReg_64, SReg_256>; -defm S_LOAD_DWORDX16 : SMRD_Helper <0x04, "S_LOAD_DWORDX16", SReg_64, SReg_512>; +defm S_LOAD_DWORD : SMRD_Helper <0x00, "s_load_dword", SReg_64, SGPR_32>; +defm S_LOAD_DWORDX2 : SMRD_Helper <0x01, "s_load_dwordx2", SReg_64, SReg_64>; +defm S_LOAD_DWORDX4 : SMRD_Helper <0x02, "s_load_dwordx4", SReg_64, SReg_128>; +defm S_LOAD_DWORDX8 : SMRD_Helper <0x03, "s_load_dwordx8", SReg_64, SReg_256>; +defm S_LOAD_DWORDX16 : SMRD_Helper <0x04, "s_load_dwordx16", SReg_64, SReg_512>; defm S_BUFFER_LOAD_DWORD : SMRD_Helper < - 0x08, "S_BUFFER_LOAD_DWORD", SReg_128, SGPR_32 + 0x08, "s_buffer_load_dword", SReg_128, SGPR_32 >; defm S_BUFFER_LOAD_DWORDX2 : SMRD_Helper < - 0x09, "S_BUFFER_LOAD_DWORDX2", SReg_128, SReg_64 + 0x09, "s_buffer_load_dwordx2", SReg_128, SReg_64 >; defm S_BUFFER_LOAD_DWORDX4 : SMRD_Helper < - 0x0a, "S_BUFFER_LOAD_DWORDX4", SReg_128, SReg_128 + 0x0a, "s_buffer_load_dwordx4", SReg_128, SReg_128 >; defm S_BUFFER_LOAD_DWORDX8 : SMRD_Helper < - 0x0b, "S_BUFFER_LOAD_DWORDX8", SReg_128, SReg_256 + 0x0b, "s_buffer_load_dwordx8", SReg_128, SReg_256 >; defm S_BUFFER_LOAD_DWORDX16 : SMRD_Helper < - 0x0c, "S_BUFFER_LOAD_DWORDX16", SReg_128, SReg_512 + 0x0c, "s_buffer_load_dwordx16", SReg_128, SReg_512 >; } // mayLoad = 1 -//def S_MEMTIME : SMRD_ <0x0000001e, "S_MEMTIME", []>; -//def S_DCACHE_INV : SMRD_ <0x0000001f, "S_DCACHE_INV", []>; +//def S_MEMTIME : SMRD_ <0x0000001e, "s_memtime", []>; +//def S_DCACHE_INV : SMRD_ <0x0000001f, "s_dcache_inv", []>; 
//===----------------------------------------------------------------------===// // SOP1 Instructions //===----------------------------------------------------------------------===// -let neverHasSideEffects = 1 in { - let isMoveImm = 1 in { -def S_MOV_B32 : SOP1_32 <0x00000003, "S_MOV_B32", []>; -def S_MOV_B64 : SOP1_64 <0x00000004, "S_MOV_B64", []>; -def S_CMOV_B32 : SOP1_32 <0x00000005, "S_CMOV_B32", []>; -def S_CMOV_B64 : SOP1_64 <0x00000006, "S_CMOV_B64", []>; + let isReMaterializable = 1 in { + defm S_MOV_B32 : SOP1_32 <sop1<0x03, 0x00>, "s_mov_b32", []>; + defm S_MOV_B64 : SOP1_64 <sop1<0x04, 0x01>, "s_mov_b64", []>; + } // let isRematerializeable = 1 + + let Uses = [SCC] in { + defm S_CMOV_B32 : SOP1_32 <sop1<0x05, 0x02>, "s_cmov_b32", []>; + defm S_CMOV_B64 : SOP1_64 <sop1<0x06, 0x03>, "s_cmov_b64", []>; + } // End Uses = [SCC] } // End isMoveImm = 1 -def S_NOT_B32 : SOP1_32 <0x00000007, "S_NOT_B32", - [(set i32:$dst, (not i32:$src0))] ->; +let Defs = [SCC] in { + defm S_NOT_B32 : SOP1_32 <sop1<0x07, 0x04>, "s_not_b32", + [(set i32:$dst, (not i32:$src0))] + >; -def S_NOT_B64 : SOP1_64 <0x00000008, "S_NOT_B64", - [(set i64:$dst, (not i64:$src0))] ->; -def S_WQM_B32 : SOP1_32 <0x00000009, "S_WQM_B32", []>; -def S_WQM_B64 : SOP1_64 <0x0000000a, "S_WQM_B64", []>; -def S_BREV_B32 : SOP1_32 <0x0000000b, "S_BREV_B32", + defm S_NOT_B64 : SOP1_64 <sop1<0x08, 0x05>, "s_not_b64", + [(set i64:$dst, (not i64:$src0))] + >; + defm S_WQM_B32 : SOP1_32 <sop1<0x09, 0x06>, "s_wqm_b32", []>; + defm S_WQM_B64 : SOP1_64 <sop1<0x0a, 0x07>, "s_wqm_b64", []>; +} // End Defs = [SCC] + + +defm S_BREV_B32 : SOP1_32 <sop1<0x0b, 0x08>, "s_brev_b32", [(set i32:$dst, (AMDGPUbrev i32:$src0))] >; -def S_BREV_B64 : SOP1_64 <0x0000000c, "S_BREV_B64", []>; -} // End neverHasSideEffects = 1 +defm S_BREV_B64 : SOP1_64 <sop1<0x0c, 0x09>, "s_brev_b64", []>; -////def S_BCNT0_I32_B32 : SOP1_BCNT0 <0x0000000d, "S_BCNT0_I32_B32", []>; -////def S_BCNT0_I32_B64 : SOP1_BCNT0 <0x0000000e, 
"S_BCNT0_I32_B64", []>; -def S_BCNT1_I32_B32 : SOP1_32 <0x0000000f, "S_BCNT1_I32_B32", - [(set i32:$dst, (ctpop i32:$src0))] ->; -def S_BCNT1_I32_B64 : SOP1_32_64 <0x00000010, "S_BCNT1_I32_B64", []>; +let Defs = [SCC] in { + //defm S_BCNT0_I32_B32 : SOP1_BCNT0 <sop1<0x0d, 0x0a>, "s_bcnt0_i32_b32", []>; + //defm S_BCNT0_I32_B64 : SOP1_BCNT0 <sop1<0x0e, 0x0b>, "s_bcnt0_i32_b64", []>; + defm S_BCNT1_I32_B32 : SOP1_32 <sop1<0x0f, 0x0c>, "s_bcnt1_i32_b32", + [(set i32:$dst, (ctpop i32:$src0))] + >; + defm S_BCNT1_I32_B64 : SOP1_32_64 <sop1<0x10, 0x0d>, "s_bcnt1_i32_b64", []>; +} // End Defs = [SCC] -////def S_FF0_I32_B32 : SOP1_32 <0x00000011, "S_FF0_I32_B32", []>; -////def S_FF0_I32_B64 : SOP1_FF0 <0x00000012, "S_FF0_I32_B64", []>; -def S_FF1_I32_B32 : SOP1_32 <0x00000013, "S_FF1_I32_B32", +//defm S_FF0_I32_B32 : SOP1_32 <sop1<0x11, 0x0e>, "s_ff0_i32_b32", []>; +//defm S_FF0_I32_B64 : SOP1_FF0 <sop1<0x12, 0x0f>, "s_ff0_i32_b64", []>; +defm S_FF1_I32_B32 : SOP1_32 <sop1<0x13, 0x10>, "s_ff1_i32_b32", [(set i32:$dst, (cttz_zero_undef i32:$src0))] >; -////def S_FF1_I32_B64 : SOP1_FF1 <0x00000014, "S_FF1_I32_B64", []>; +////defm S_FF1_I32_B64 : SOP1_FF1 <sop1<0x14, 0x11>, "s_ff1_i32_b64", []>; -def S_FLBIT_I32_B32 : SOP1_32 <0x00000015, "S_FLBIT_I32_B32", +defm S_FLBIT_I32_B32 : SOP1_32 <sop1<0x15, 0x12>, "s_flbit_i32_b32", [(set i32:$dst, (ctlz_zero_undef i32:$src0))] >; -//def S_FLBIT_I32_B64 : SOP1_32 <0x00000016, "S_FLBIT_I32_B64", []>; -def S_FLBIT_I32 : SOP1_32 <0x00000017, "S_FLBIT_I32", []>; -//def S_FLBIT_I32_I64 : SOP1_32 <0x00000018, "S_FLBIT_I32_I64", []>; -def S_SEXT_I32_I8 : SOP1_32 <0x00000019, "S_SEXT_I32_I8", +//defm S_FLBIT_I32_B64 : SOP1_32 <sop1<0x16, 0x13>, "s_flbit_i32_b64", []>; +defm S_FLBIT_I32 : SOP1_32 <sop1<0x17, 0x14>, "s_flbit_i32", []>; +//defm S_FLBIT_I32_I64 : SOP1_32 <sop1<0x18, 0x15>, "s_flbit_i32_i64", []>; +defm S_SEXT_I32_I8 : SOP1_32 <sop1<0x19, 0x16>, "s_sext_i32_i8", [(set i32:$dst, (sext_inreg i32:$src0, i8))] >; -def S_SEXT_I32_I16 
: SOP1_32 <0x0000001a, "S_SEXT_I32_I16", +defm S_SEXT_I32_I16 : SOP1_32 <sop1<0x1a, 0x17>, "s_sext_i32_i16", [(set i32:$dst, (sext_inreg i32:$src0, i16))] >; -////def S_BITSET0_B32 : SOP1_BITSET0 <0x0000001b, "S_BITSET0_B32", []>; -////def S_BITSET0_B64 : SOP1_BITSET0 <0x0000001c, "S_BITSET0_B64", []>; -////def S_BITSET1_B32 : SOP1_BITSET1 <0x0000001d, "S_BITSET1_B32", []>; -////def S_BITSET1_B64 : SOP1_BITSET1 <0x0000001e, "S_BITSET1_B64", []>; -def S_GETPC_B64 : SOP1 < - 0x0000001f, (outs SReg_64:$dst), (ins), "S_GETPC_B64 $dst", [] -> { - let SSRC0 = 0; -} -def S_SETPC_B64 : SOP1_64 <0x00000020, "S_SETPC_B64", []>; -def S_SWAPPC_B64 : SOP1_64 <0x00000021, "S_SWAPPC_B64", []>; -def S_RFE_B64 : SOP1_64 <0x00000022, "S_RFE_B64", []>; - -let hasSideEffects = 1, Uses = [EXEC], Defs = [EXEC] in { - -def S_AND_SAVEEXEC_B64 : SOP1_64 <0x00000024, "S_AND_SAVEEXEC_B64", []>; -def S_OR_SAVEEXEC_B64 : SOP1_64 <0x00000025, "S_OR_SAVEEXEC_B64", []>; -def S_XOR_SAVEEXEC_B64 : SOP1_64 <0x00000026, "S_XOR_SAVEEXEC_B64", []>; -def S_ANDN2_SAVEEXEC_B64 : SOP1_64 <0x00000027, "S_ANDN2_SAVEEXEC_B64", []>; -def S_ORN2_SAVEEXEC_B64 : SOP1_64 <0x00000028, "S_ORN2_SAVEEXEC_B64", []>; -def S_NAND_SAVEEXEC_B64 : SOP1_64 <0x00000029, "S_NAND_SAVEEXEC_B64", []>; -def S_NOR_SAVEEXEC_B64 : SOP1_64 <0x0000002a, "S_NOR_SAVEEXEC_B64", []>; -def S_XNOR_SAVEEXEC_B64 : SOP1_64 <0x0000002b, "S_XNOR_SAVEEXEC_B64", []>; - -} // End hasSideEffects = 1 - -def S_QUADMASK_B32 : SOP1_32 <0x0000002c, "S_QUADMASK_B32", []>; -def S_QUADMASK_B64 : SOP1_64 <0x0000002d, "S_QUADMASK_B64", []>; -def S_MOVRELS_B32 : SOP1_32 <0x0000002e, "S_MOVRELS_B32", []>; -def S_MOVRELS_B64 : SOP1_64 <0x0000002f, "S_MOVRELS_B64", []>; -def S_MOVRELD_B32 : SOP1_32 <0x00000030, "S_MOVRELD_B32", []>; -def S_MOVRELD_B64 : SOP1_64 <0x00000031, "S_MOVRELD_B64", []>; -//def S_CBRANCH_JOIN : SOP1_ <0x00000032, "S_CBRANCH_JOIN", []>; -def S_MOV_REGRD_B32 : SOP1_32 <0x00000033, "S_MOV_REGRD_B32", []>; -def S_ABS_I32 : SOP1_32 <0x00000034, 
"S_ABS_I32", []>; -def S_MOV_FED_B32 : SOP1_32 <0x00000035, "S_MOV_FED_B32", []>; +////defm S_BITSET0_B32 : SOP1_BITSET0 <sop1<0x1b, 0x18>, "s_bitset0_b32", []>; +////defm S_BITSET0_B64 : SOP1_BITSET0 <sop1<0x1c, 0x19>, "s_bitset0_b64", []>; +////defm S_BITSET1_B32 : SOP1_BITSET1 <sop1<0x1d, 0x1a>, "s_bitset1_b32", []>; +////defm S_BITSET1_B64 : SOP1_BITSET1 <sop1<0x1e, 0x1b>, "s_bitset1_b64", []>; +defm S_GETPC_B64 : SOP1_64_0 <sop1<0x1f, 0x1c>, "s_getpc_b64", []>; +defm S_SETPC_B64 : SOP1_64 <sop1<0x20, 0x1d>, "s_setpc_b64", []>; +defm S_SWAPPC_B64 : SOP1_64 <sop1<0x21, 0x1e>, "s_swappc_b64", []>; +defm S_RFE_B64 : SOP1_64 <sop1<0x22, 0x1f>, "s_rfe_b64", []>; + +let hasSideEffects = 1, Uses = [EXEC], Defs = [EXEC, SCC] in { + +defm S_AND_SAVEEXEC_B64 : SOP1_64 <sop1<0x24, 0x20>, "s_and_saveexec_b64", []>; +defm S_OR_SAVEEXEC_B64 : SOP1_64 <sop1<0x25, 0x21>, "s_or_saveexec_b64", []>; +defm S_XOR_SAVEEXEC_B64 : SOP1_64 <sop1<0x26, 0x22>, "s_xor_saveexec_b64", []>; +defm S_ANDN2_SAVEEXEC_B64 : SOP1_64 <sop1<0x27, 0x23>, "s_andn2_saveexec_b64", []>; +defm S_ORN2_SAVEEXEC_B64 : SOP1_64 <sop1<0x28, 0x24>, "s_orn2_saveexec_b64", []>; +defm S_NAND_SAVEEXEC_B64 : SOP1_64 <sop1<0x29, 0x25>, "s_nand_saveexec_b64", []>; +defm S_NOR_SAVEEXEC_B64 : SOP1_64 <sop1<0x2a, 0x26>, "s_nor_saveexec_b64", []>; +defm S_XNOR_SAVEEXEC_B64 : SOP1_64 <sop1<0x2b, 0x27>, "s_xnor_saveexec_b64", []>; + +} // End hasSideEffects = 1, Uses = [EXEC], Defs = [EXEC, SCC] + +defm S_QUADMASK_B32 : SOP1_32 <sop1<0x2c, 0x28>, "s_quadmask_b32", []>; +defm S_QUADMASK_B64 : SOP1_64 <sop1<0x2d, 0x29>, "s_quadmask_b64", []>; +defm S_MOVRELS_B32 : SOP1_32 <sop1<0x2e, 0x2a>, "s_movrels_b32", []>; +defm S_MOVRELS_B64 : SOP1_64 <sop1<0x2f, 0x2b>, "s_movrels_b64", []>; +defm S_MOVRELD_B32 : SOP1_32 <sop1<0x30, 0x2c>, "s_movreld_b32", []>; +defm S_MOVRELD_B64 : SOP1_64 <sop1<0x31, 0x2d>, "s_movreld_b64", []>; +//defm S_CBRANCH_JOIN : SOP1_ <sop1<0x32, 0x2e>, "s_cbranch_join", []>; +defm S_MOV_REGRD_B32 : SOP1_32 
<sop1<0x33, 0x2f>, "s_mov_regrd_b32", []>; +let Defs = [SCC] in { + defm S_ABS_I32 : SOP1_32 <sop1<0x34, 0x30>, "s_abs_i32", []>; +} // End Defs = [SCC] +defm S_MOV_FED_B32 : SOP1_32 <sop1<0x35, 0x31>, "s_mov_fed_b32", []>; //===----------------------------------------------------------------------===// // SOP2 Instructions @@ -178,145 +203,161 @@ def S_MOV_FED_B32 : SOP1_32 <0x00000035, "S_MOV_FED_B32", []>; let Defs = [SCC] in { // Carry out goes to SCC let isCommutable = 1 in { -def S_ADD_U32 : SOP2_32 <0x00000000, "S_ADD_U32", []>; -def S_ADD_I32 : SOP2_32 <0x00000002, "S_ADD_I32", +defm S_ADD_U32 : SOP2_32 <sop2<0x00>, "s_add_u32", []>; +defm S_ADD_I32 : SOP2_32 <sop2<0x02>, "s_add_i32", [(set i32:$dst, (add SSrc_32:$src0, SSrc_32:$src1))] >; } // End isCommutable = 1 -def S_SUB_U32 : SOP2_32 <0x00000001, "S_SUB_U32", []>; -def S_SUB_I32 : SOP2_32 <0x00000003, "S_SUB_I32", +defm S_SUB_U32 : SOP2_32 <sop2<0x01>, "s_sub_u32", []>; +defm S_SUB_I32 : SOP2_32 <sop2<0x03>, "s_sub_i32", [(set i32:$dst, (sub SSrc_32:$src0, SSrc_32:$src1))] >; let Uses = [SCC] in { // Carry in comes from SCC let isCommutable = 1 in { -def S_ADDC_U32 : SOP2_32 <0x00000004, "S_ADDC_U32", +defm S_ADDC_U32 : SOP2_32 <sop2<0x04>, "s_addc_u32", [(set i32:$dst, (adde (i32 SSrc_32:$src0), (i32 SSrc_32:$src1)))]>; } // End isCommutable = 1 -def S_SUBB_U32 : SOP2_32 <0x00000005, "S_SUBB_U32", +defm S_SUBB_U32 : SOP2_32 <sop2<0x05>, "s_subb_u32", [(set i32:$dst, (sube (i32 SSrc_32:$src0), (i32 SSrc_32:$src1)))]>; } // End Uses = [SCC] -} // End Defs = [SCC] -def S_MIN_I32 : SOP2_32 <0x00000006, "S_MIN_I32", +defm S_MIN_I32 : SOP2_32 <sop2<0x06>, "s_min_i32", [(set i32:$dst, (AMDGPUsmin i32:$src0, i32:$src1))] >; -def S_MIN_U32 : SOP2_32 <0x00000007, "S_MIN_U32", +defm S_MIN_U32 : SOP2_32 <sop2<0x07>, "s_min_u32", [(set i32:$dst, (AMDGPUumin i32:$src0, i32:$src1))] >; -def S_MAX_I32 : SOP2_32 <0x00000008, "S_MAX_I32", +defm S_MAX_I32 : SOP2_32 <sop2<0x08>, "s_max_i32", [(set i32:$dst, (AMDGPUsmax 
i32:$src0, i32:$src1))] >; -def S_MAX_U32 : SOP2_32 <0x00000009, "S_MAX_U32", +defm S_MAX_U32 : SOP2_32 <sop2<0x09>, "s_max_u32", [(set i32:$dst, (AMDGPUumax i32:$src0, i32:$src1))] >; +} // End Defs = [SCC] -def S_CSELECT_B32 : SOP2 < - 0x0000000a, (outs SReg_32:$dst), - (ins SReg_32:$src0, SReg_32:$src1, SCCReg:$scc), "S_CSELECT_B32", - [] ->; +defm S_CSELECT_B32 : SOP2_SELECT_32 <sop2<0x0a>, "s_cselect_b32", []>; -def S_CSELECT_B64 : SOP2_64 <0x0000000b, "S_CSELECT_B64", []>; +let Uses = [SCC] in { + defm S_CSELECT_B64 : SOP2_64 <sop2<0x0b>, "s_cselect_b64", []>; +} // End Uses = [SCC] -def S_AND_B32 : SOP2_32 <0x0000000e, "S_AND_B32", +let Defs = [SCC] in { +defm S_AND_B32 : SOP2_32 <sop2<0x0e, 0x0c>, "s_and_b32", [(set i32:$dst, (and i32:$src0, i32:$src1))] >; -def S_AND_B64 : SOP2_64 <0x0000000f, "S_AND_B64", +defm S_AND_B64 : SOP2_64 <sop2<0x0f, 0x0d>, "s_and_b64", [(set i64:$dst, (and i64:$src0, i64:$src1))] >; -def S_OR_B32 : SOP2_32 <0x00000010, "S_OR_B32", +defm S_OR_B32 : SOP2_32 <sop2<0x10, 0x0e>, "s_or_b32", [(set i32:$dst, (or i32:$src0, i32:$src1))] >; -def S_OR_B64 : SOP2_64 <0x00000011, "S_OR_B64", +defm S_OR_B64 : SOP2_64 <sop2<0x11, 0x0f>, "s_or_b64", [(set i64:$dst, (or i64:$src0, i64:$src1))] >; -def S_XOR_B32 : SOP2_32 <0x00000012, "S_XOR_B32", +defm S_XOR_B32 : SOP2_32 <sop2<0x12, 0x10>, "s_xor_b32", [(set i32:$dst, (xor i32:$src0, i32:$src1))] >; -def S_XOR_B64 : SOP2_64 <0x00000013, "S_XOR_B64", +defm S_XOR_B64 : SOP2_64 <sop2<0x13, 0x11>, "s_xor_b64", [(set i64:$dst, (xor i64:$src0, i64:$src1))] >; -def S_ANDN2_B32 : SOP2_32 <0x00000014, "S_ANDN2_B32", []>; -def S_ANDN2_B64 : SOP2_64 <0x00000015, "S_ANDN2_B64", []>; -def S_ORN2_B32 : SOP2_32 <0x00000016, "S_ORN2_B32", []>; -def S_ORN2_B64 : SOP2_64 <0x00000017, "S_ORN2_B64", []>; -def S_NAND_B32 : SOP2_32 <0x00000018, "S_NAND_B32", []>; -def S_NAND_B64 : SOP2_64 <0x00000019, "S_NAND_B64", []>; -def S_NOR_B32 : SOP2_32 <0x0000001a, "S_NOR_B32", []>; -def S_NOR_B64 : SOP2_64 <0x0000001b, 
"S_NOR_B64", []>; -def S_XNOR_B32 : SOP2_32 <0x0000001c, "S_XNOR_B32", []>; -def S_XNOR_B64 : SOP2_64 <0x0000001d, "S_XNOR_B64", []>; +defm S_ANDN2_B32 : SOP2_32 <sop2<0x14, 0x12>, "s_andn2_b32", []>; +defm S_ANDN2_B64 : SOP2_64 <sop2<0x15, 0x13>, "s_andn2_b64", []>; +defm S_ORN2_B32 : SOP2_32 <sop2<0x16, 0x14>, "s_orn2_b32", []>; +defm S_ORN2_B64 : SOP2_64 <sop2<0x17, 0x15>, "s_orn2_b64", []>; +defm S_NAND_B32 : SOP2_32 <sop2<0x18, 0x16>, "s_nand_b32", []>; +defm S_NAND_B64 : SOP2_64 <sop2<0x19, 0x17>, "s_nand_b64", []>; +defm S_NOR_B32 : SOP2_32 <sop2<0x1a, 0x18>, "s_nor_b32", []>; +defm S_NOR_B64 : SOP2_64 <sop2<0x1b, 0x19>, "s_nor_b64", []>; +defm S_XNOR_B32 : SOP2_32 <sop2<0x1c, 0x1a>, "s_xnor_b32", []>; +defm S_XNOR_B64 : SOP2_64 <sop2<0x1d, 0x1b>, "s_xnor_b64", []>; +} // End Defs = [SCC] // Use added complexity so these patterns are preferred to the VALU patterns. let AddedComplexity = 1 in { +let Defs = [SCC] in { -def S_LSHL_B32 : SOP2_32 <0x0000001e, "S_LSHL_B32", +defm S_LSHL_B32 : SOP2_32 <sop2<0x1e, 0x1c>, "s_lshl_b32", [(set i32:$dst, (shl i32:$src0, i32:$src1))] >; -def S_LSHL_B64 : SOP2_SHIFT_64 <0x0000001f, "S_LSHL_B64", +defm S_LSHL_B64 : SOP2_64_32 <sop2<0x1f, 0x1d>, "s_lshl_b64", [(set i64:$dst, (shl i64:$src0, i32:$src1))] >; -def S_LSHR_B32 : SOP2_32 <0x00000020, "S_LSHR_B32", +defm S_LSHR_B32 : SOP2_32 <sop2<0x20, 0x1e>, "s_lshr_b32", [(set i32:$dst, (srl i32:$src0, i32:$src1))] >; -def S_LSHR_B64 : SOP2_SHIFT_64 <0x00000021, "S_LSHR_B64", +defm S_LSHR_B64 : SOP2_64_32 <sop2<0x21, 0x1f>, "s_lshr_b64", [(set i64:$dst, (srl i64:$src0, i32:$src1))] >; -def S_ASHR_I32 : SOP2_32 <0x00000022, "S_ASHR_I32", +defm S_ASHR_I32 : SOP2_32 <sop2<0x22, 0x20>, "s_ashr_i32", [(set i32:$dst, (sra i32:$src0, i32:$src1))] >; -def S_ASHR_I64 : SOP2_SHIFT_64 <0x00000023, "S_ASHR_I64", +defm S_ASHR_I64 : SOP2_64_32 <sop2<0x23, 0x21>, "s_ashr_i64", [(set i64:$dst, (sra i64:$src0, i32:$src1))] >; +} // End Defs = [SCC] + +defm S_BFM_B32 : SOP2_32 <sop2<0x24, 0x22>, 
"s_bfm_b32", []>; +defm S_BFM_B64 : SOP2_64 <sop2<0x25, 0x23>, "s_bfm_b64", []>; +defm S_MUL_I32 : SOP2_32 <sop2<0x26, 0x24>, "s_mul_i32", + [(set i32:$dst, (mul i32:$src0, i32:$src1))] +>; } // End AddedComplexity = 1 -def S_BFM_B32 : SOP2_32 <0x00000024, "S_BFM_B32", []>; -def S_BFM_B64 : SOP2_64 <0x00000025, "S_BFM_B64", []>; -def S_MUL_I32 : SOP2_32 <0x00000026, "S_MUL_I32", []>; -def S_BFE_U32 : SOP2_32 <0x00000027, "S_BFE_U32", []>; -def S_BFE_I32 : SOP2_32 <0x00000028, "S_BFE_I32", []>; -def S_BFE_U64 : SOP2_64 <0x00000029, "S_BFE_U64", []>; -def S_BFE_I64 : SOP2_64 <0x0000002a, "S_BFE_I64", []>; -//def S_CBRANCH_G_FORK : SOP2_ <0x0000002b, "S_CBRANCH_G_FORK", []>; -def S_ABSDIFF_I32 : SOP2_32 <0x0000002c, "S_ABSDIFF_I32", []>; +let Defs = [SCC] in { +defm S_BFE_U32 : SOP2_32 <sop2<0x27, 0x25>, "s_bfe_u32", []>; +defm S_BFE_I32 : SOP2_32 <sop2<0x28, 0x26>, "s_bfe_i32", []>; +defm S_BFE_U64 : SOP2_64 <sop2<0x29, 0x27>, "s_bfe_u64", []>; +defm S_BFE_I64 : SOP2_64_32 <sop2<0x2a, 0x28>, "s_bfe_i64", []>; +} // End Defs = [SCC] + +//defm S_CBRANCH_G_FORK : SOP2_ <sop2<0x2b, 0x29>, "s_cbranch_g_fork", []>; +let Defs = [SCC] in { +defm S_ABSDIFF_I32 : SOP2_32 <sop2<0x2c, 0x2a>, "s_absdiff_i32", []>; +} // End Defs = [SCC] //===----------------------------------------------------------------------===// // SOPC Instructions //===----------------------------------------------------------------------===// -def S_CMP_EQ_I32 : SOPC_32 <0x00000000, "S_CMP_EQ_I32">; -def S_CMP_LG_I32 : SOPC_32 <0x00000001, "S_CMP_LG_I32">; -def S_CMP_GT_I32 : SOPC_32 <0x00000002, "S_CMP_GT_I32">; -def S_CMP_GE_I32 : SOPC_32 <0x00000003, "S_CMP_GE_I32">; -def S_CMP_LT_I32 : SOPC_32 <0x00000004, "S_CMP_LT_I32">; -def S_CMP_LE_I32 : SOPC_32 <0x00000005, "S_CMP_LE_I32">; -def S_CMP_EQ_U32 : SOPC_32 <0x00000006, "S_CMP_EQ_U32">; -def S_CMP_LG_U32 : SOPC_32 <0x00000007, "S_CMP_LG_U32">; -def S_CMP_GT_U32 : SOPC_32 <0x00000008, "S_CMP_GT_U32">; -def S_CMP_GE_U32 : SOPC_32 <0x00000009, 
"S_CMP_GE_U32">; -def S_CMP_LT_U32 : SOPC_32 <0x0000000a, "S_CMP_LT_U32">; -def S_CMP_LE_U32 : SOPC_32 <0x0000000b, "S_CMP_LE_U32">; -////def S_BITCMP0_B32 : SOPC_BITCMP0 <0x0000000c, "S_BITCMP0_B32", []>; -////def S_BITCMP1_B32 : SOPC_BITCMP1 <0x0000000d, "S_BITCMP1_B32", []>; -////def S_BITCMP0_B64 : SOPC_BITCMP0 <0x0000000e, "S_BITCMP0_B64", []>; -////def S_BITCMP1_B64 : SOPC_BITCMP1 <0x0000000f, "S_BITCMP1_B64", []>; -//def S_SETVSKIP : SOPC_ <0x00000010, "S_SETVSKIP", []>; +def S_CMP_EQ_I32 : SOPC_32 <0x00000000, "s_cmp_eq_i32">; +def S_CMP_LG_I32 : SOPC_32 <0x00000001, "s_cmp_lg_i32">; +def S_CMP_GT_I32 : SOPC_32 <0x00000002, "s_cmp_gt_i32">; +def S_CMP_GE_I32 : SOPC_32 <0x00000003, "s_cmp_ge_i32">; +def S_CMP_LT_I32 : SOPC_32 <0x00000004, "s_cmp_lt_i32">; +def S_CMP_LE_I32 : SOPC_32 <0x00000005, "s_cmp_le_i32">; +def S_CMP_EQ_U32 : SOPC_32 <0x00000006, "s_cmp_eq_u32">; +def S_CMP_LG_U32 : SOPC_32 <0x00000007, "s_cmp_lg_u32">; +def S_CMP_GT_U32 : SOPC_32 <0x00000008, "s_cmp_gt_u32">; +def S_CMP_GE_U32 : SOPC_32 <0x00000009, "s_cmp_ge_u32">; +def S_CMP_LT_U32 : SOPC_32 <0x0000000a, "s_cmp_lt_u32">; +def S_CMP_LE_U32 : SOPC_32 <0x0000000b, "s_cmp_le_u32">; +////def S_BITCMP0_B32 : SOPC_BITCMP0 <0x0000000c, "s_bitcmp0_b32", []>; +////def S_BITCMP1_B32 : SOPC_BITCMP1 <0x0000000d, "s_bitcmp1_b32", []>; +////def S_BITCMP0_B64 : SOPC_BITCMP0 <0x0000000e, "s_bitcmp0_b64", []>; +////def S_BITCMP1_B64 : SOPC_BITCMP1 <0x0000000f, "s_bitcmp1_b64", []>; +//def S_SETVSKIP : SOPC_ <0x00000010, "s_setvskip", []>; //===----------------------------------------------------------------------===// // SOPK Instructions //===----------------------------------------------------------------------===// -def S_MOVK_I32 : SOPK_32 <0x00000000, "S_MOVK_I32", []>; -def S_CMOVK_I32 : SOPK_32 <0x00000002, "S_CMOVK_I32", []>; +let isReMaterializable = 1 in { +defm S_MOVK_I32 : SOPK_32 <sopk<0x00>, "s_movk_i32", []>; +} // End isReMaterializable = 1 +let Uses = [SCC] in { + defm S_CMOVK_I32 : 
SOPK_32 <sopk<0x02, 0x01>, "s_cmovk_i32", []>; +} + +let isCompare = 1 in { /* This instruction is disabled for now until we can figure out how to teach @@ -330,50 +371,46 @@ SCC = S_CMPK_EQ_I32 SGPR0, imm VCC = COPY SCC VGPR0 = V_CNDMASK VCC, VGPR0, VGPR1 -def S_CMPK_EQ_I32 : SOPK < - 0x00000003, (outs SCCReg:$dst), (ins SReg_32:$src0, i32imm:$src1), - "S_CMPK_EQ_I32", +defm S_CMPK_EQ_I32 : SOPK_SCC <sopk<0x03, 0x02>, "s_cmpk_eq_i32", [(set i1:$dst, (setcc i32:$src0, imm:$src1, SETEQ))] >; */ -let isCompare = 1, Defs = [SCC] in { -def S_CMPK_LG_I32 : SOPK_32 <0x00000004, "S_CMPK_LG_I32", []>; -def S_CMPK_GT_I32 : SOPK_32 <0x00000005, "S_CMPK_GT_I32", []>; -def S_CMPK_GE_I32 : SOPK_32 <0x00000006, "S_CMPK_GE_I32", []>; -def S_CMPK_LT_I32 : SOPK_32 <0x00000007, "S_CMPK_LT_I32", []>; -def S_CMPK_LE_I32 : SOPK_32 <0x00000008, "S_CMPK_LE_I32", []>; -def S_CMPK_EQ_U32 : SOPK_32 <0x00000009, "S_CMPK_EQ_U32", []>; -def S_CMPK_LG_U32 : SOPK_32 <0x0000000a, "S_CMPK_LG_U32", []>; -def S_CMPK_GT_U32 : SOPK_32 <0x0000000b, "S_CMPK_GT_U32", []>; -def S_CMPK_GE_U32 : SOPK_32 <0x0000000c, "S_CMPK_GE_U32", []>; -def S_CMPK_LT_U32 : SOPK_32 <0x0000000d, "S_CMPK_LT_U32", []>; -def S_CMPK_LE_U32 : SOPK_32 <0x0000000e, "S_CMPK_LE_U32", []>; -} // End isCompare = 1, Defs = [SCC] - -let Defs = [SCC], isCommutable = 1 in { - def S_ADDK_I32 : SOPK_32 <0x0000000f, "S_ADDK_I32", []>; - def S_MULK_I32 : SOPK_32 <0x00000010, "S_MULK_I32", []>; -} +defm S_CMPK_LG_I32 : SOPK_SCC <sopk<0x04, 0x03>, "s_cmpk_lg_i32", []>; +defm S_CMPK_GT_I32 : SOPK_SCC <sopk<0x05, 0x04>, "s_cmpk_gt_i32", []>; +defm S_CMPK_GE_I32 : SOPK_SCC <sopk<0x06, 0x05>, "s_cmpk_ge_i32", []>; +defm S_CMPK_LT_I32 : SOPK_SCC <sopk<0x07, 0x06>, "s_cmpk_lt_i32", []>; +defm S_CMPK_LE_I32 : SOPK_SCC <sopk<0x08, 0x07>, "s_cmpk_le_i32", []>; +defm S_CMPK_EQ_U32 : SOPK_SCC <sopk<0x09, 0x08>, "s_cmpk_eq_u32", []>; +defm S_CMPK_LG_U32 : SOPK_SCC <sopk<0x0a, 0x09>, "s_cmpk_lg_u32", []>; +defm S_CMPK_GT_U32 : SOPK_SCC <sopk<0x0b, 0x0a>, 
"s_cmpk_gt_u32", []>; +defm S_CMPK_GE_U32 : SOPK_SCC <sopk<0x0c, 0x0b>, "s_cmpk_ge_u32", []>; +defm S_CMPK_LT_U32 : SOPK_SCC <sopk<0x0d, 0x0c>, "s_cmpk_lt_u32", []>; +defm S_CMPK_LE_U32 : SOPK_SCC <sopk<0x0e, 0x0d>, "s_cmpk_le_u32", []>; +} // End isCompare = 1 -//def S_CBRANCH_I_FORK : SOPK_ <0x00000011, "S_CBRANCH_I_FORK", []>; -def S_GETREG_B32 : SOPK_32 <0x00000012, "S_GETREG_B32", []>; -def S_SETREG_B32 : SOPK_32 <0x00000013, "S_SETREG_B32", []>; -def S_GETREG_REGRD_B32 : SOPK_32 <0x00000014, "S_GETREG_REGRD_B32", []>; -//def S_SETREG_IMM32_B32 : SOPK_32 <0x00000015, "S_SETREG_IMM32_B32", []>; -//def EXP : EXP_ <0x00000000, "EXP", []>; +let isCommutable = 1 in { + let Defs = [SCC], isCommutable = 1 in { + defm S_ADDK_I32 : SOPK_32 <sopk<0x0f, 0x0e>, "s_addk_i32", []>; + } + defm S_MULK_I32 : SOPK_32 <sopk<0x10, 0x0f>, "s_mulk_i32", []>; +} -} // End let OtherPredicates = [isCFDepth0] +//defm S_CBRANCH_I_FORK : SOPK_ <sopk<0x11, 0x10>, "s_cbranch_i_fork", []>; +defm S_GETREG_B32 : SOPK_32 <sopk<0x12, 0x11>, "s_getreg_b32", []>; +defm S_SETREG_B32 : SOPK_32 <sopk<0x13, 0x12>, "s_setreg_b32", []>; +defm S_GETREG_REGRD_B32 : SOPK_32 <sopk<0x14, 0x13>, "s_getreg_regrd_b32", []>; +//defm S_SETREG_IMM32_B32 : SOPK_32 <sopk<0x15, 0x14>, "s_setreg_imm32_b32", []>; //===----------------------------------------------------------------------===// // SOPP Instructions //===----------------------------------------------------------------------===// -def S_NOP : SOPP <0x00000000, (ins i16imm:$simm16), "S_NOP $simm16", []>; +def S_NOP : SOPP <0x00000000, (ins i16imm:$simm16), "s_nop $simm16">; let isTerminator = 1 in { -def S_ENDPGM : SOPP <0x00000001, (ins), "S_ENDPGM", +def S_ENDPGM : SOPP <0x00000001, (ins), "s_endpgm", [(IL_retflag)]> { let simm16 = 0; let isBarrier = 1; @@ -382,7 +419,7 @@ def S_ENDPGM : SOPP <0x00000001, (ins), "S_ENDPGM", let isBranch = 1 in { def S_BRANCH : SOPP < - 0x00000002, (ins sopp_brtarget:$simm16), "S_BRANCH $simm16", + 0x00000002, (ins 
sopp_brtarget:$simm16), "s_branch $simm16", [(br bb:$simm16)]> { let isBarrier = 1; } @@ -390,36 +427,31 @@ def S_BRANCH : SOPP < let DisableEncoding = "$scc" in { def S_CBRANCH_SCC0 : SOPP < 0x00000004, (ins sopp_brtarget:$simm16, SCCReg:$scc), - "S_CBRANCH_SCC0 $simm16", [] + "s_cbranch_scc0 $simm16" >; def S_CBRANCH_SCC1 : SOPP < 0x00000005, (ins sopp_brtarget:$simm16, SCCReg:$scc), - "S_CBRANCH_SCC1 $simm16", - [] + "s_cbranch_scc1 $simm16" >; } // End DisableEncoding = "$scc" def S_CBRANCH_VCCZ : SOPP < 0x00000006, (ins sopp_brtarget:$simm16, VCCReg:$vcc), - "S_CBRANCH_VCCZ $simm16", - [] + "s_cbranch_vccz $simm16" >; def S_CBRANCH_VCCNZ : SOPP < 0x00000007, (ins sopp_brtarget:$simm16, VCCReg:$vcc), - "S_CBRANCH_VCCNZ $simm16", - [] + "s_cbranch_vccnz $simm16" >; let DisableEncoding = "$exec" in { def S_CBRANCH_EXECZ : SOPP < 0x00000008, (ins sopp_brtarget:$simm16, EXECReg:$exec), - "S_CBRANCH_EXECZ $simm16", - [] + "s_cbranch_execz $simm16" >; def S_CBRANCH_EXECNZ : SOPP < 0x00000009, (ins sopp_brtarget:$simm16, EXECReg:$exec), - "S_CBRANCH_EXECNZ $simm16", - [] + "s_cbranch_execnz $simm16" >; } // End DisableEncoding = "$exec" @@ -428,7 +460,7 @@ def S_CBRANCH_EXECNZ : SOPP < } // End isTerminator = 1 let hasSideEffects = 1 in { -def S_BARRIER : SOPP <0x0000000a, (ins), "S_BARRIER", +def S_BARRIER : SOPP <0x0000000a, (ins), "s_barrier", [(int_AMDGPU_barrier_local)] > { let simm16 = 0; @@ -438,27 +470,29 @@ def S_BARRIER : SOPP <0x0000000a, (ins), "S_BARRIER", let mayStore = 1; } -def S_WAITCNT : SOPP <0x0000000c, (ins WAIT_FLAG:$simm16), "S_WAITCNT $simm16", - [] ->; -//def S_SETHALT : SOPP_ <0x0000000d, "S_SETHALT", []>; -//def S_SLEEP : SOPP_ <0x0000000e, "S_SLEEP", []>; -//def S_SETPRIO : SOPP_ <0x0000000f, "S_SETPRIO", []>; +def S_WAITCNT : SOPP <0x0000000c, (ins WAIT_FLAG:$simm16), "s_waitcnt $simm16">; +def S_SETHALT : SOPP <0x0000000d, (ins i16imm:$simm16), "s_sethalt $simm16">; +def S_SLEEP : SOPP <0x0000000e, (ins i16imm:$simm16), "s_sleep 
$simm16">; +def S_SETPRIO : SOPP <0x0000000f, (ins i16imm:$sim16), "s_setprio $sim16">; let Uses = [EXEC] in { - def S_SENDMSG : SOPP <0x00000010, (ins SendMsgImm:$simm16, M0Reg:$m0), "S_SENDMSG $simm16", + def S_SENDMSG : SOPP <0x00000010, (ins SendMsgImm:$simm16, M0Reg:$m0), "s_sendmsg $simm16", [(int_SI_sendmsg imm:$simm16, M0Reg:$m0)] > { let DisableEncoding = "$m0"; } } // End Uses = [EXEC] -//def S_SENDMSGHALT : SOPP_ <0x00000011, "S_SENDMSGHALT", []>; -//def S_TRAP : SOPP_ <0x00000012, "S_TRAP", []>; -//def S_ICACHE_INV : SOPP_ <0x00000013, "S_ICACHE_INV", []>; -//def S_INCPERFLEVEL : SOPP_ <0x00000014, "S_INCPERFLEVEL", []>; -//def S_DECPERFLEVEL : SOPP_ <0x00000015, "S_DECPERFLEVEL", []>; -//def S_TTRACEDATA : SOPP_ <0x00000016, "S_TTRACEDATA", []>; +def S_SENDMSGHALT : SOPP <0x00000011, (ins i16imm:$simm16), "s_sendmsghalt $simm16">; +def S_TRAP : SOPP <0x00000012, (ins i16imm:$simm16), "s_trap $simm16">; +def S_ICACHE_INV : SOPP <0x00000013, (ins), "s_icache_inv"> { + let simm16 = 0; +} +def S_INCPERFLEVEL : SOPP <0x00000014, (ins i16imm:$simm16), "s_incperflevel $simm16">; +def S_DECPERFLEVEL : SOPP <0x00000015, (ins i16imm:$simm16), "s_decperflevel $simm16">; +def S_TTRACEDATA : SOPP <0x00000016, (ins), "s_ttracedata"> { + let simm16 = 0; +} } // End hasSideEffects //===----------------------------------------------------------------------===// @@ -467,256 +501,260 @@ let Uses = [EXEC] in { let isCompare = 1 in { -defm V_CMP_F_F32 : VOPC_32 <0x00000000, "V_CMP_F_F32">; -defm V_CMP_LT_F32 : VOPC_32 <0x00000001, "V_CMP_LT_F32", f32, COND_OLT>; -defm V_CMP_EQ_F32 : VOPC_32 <0x00000002, "V_CMP_EQ_F32", f32, COND_OEQ>; -defm V_CMP_LE_F32 : VOPC_32 <0x00000003, "V_CMP_LE_F32", f32, COND_OLE>; -defm V_CMP_GT_F32 : VOPC_32 <0x00000004, "V_CMP_GT_F32", f32, COND_OGT>; -defm V_CMP_LG_F32 : VOPC_32 <0x00000005, "V_CMP_LG_F32">; -defm V_CMP_GE_F32 : VOPC_32 <0x00000006, "V_CMP_GE_F32", f32, COND_OGE>; -defm V_CMP_O_F32 : VOPC_32 <0x00000007, "V_CMP_O_F32", f32, 
COND_O>; -defm V_CMP_U_F32 : VOPC_32 <0x00000008, "V_CMP_U_F32", f32, COND_UO>; -defm V_CMP_NGE_F32 : VOPC_32 <0x00000009, "V_CMP_NGE_F32">; -defm V_CMP_NLG_F32 : VOPC_32 <0x0000000a, "V_CMP_NLG_F32">; -defm V_CMP_NGT_F32 : VOPC_32 <0x0000000b, "V_CMP_NGT_F32">; -defm V_CMP_NLE_F32 : VOPC_32 <0x0000000c, "V_CMP_NLE_F32">; -defm V_CMP_NEQ_F32 : VOPC_32 <0x0000000d, "V_CMP_NEQ_F32", f32, COND_UNE>; -defm V_CMP_NLT_F32 : VOPC_32 <0x0000000e, "V_CMP_NLT_F32">; -defm V_CMP_TRU_F32 : VOPC_32 <0x0000000f, "V_CMP_TRU_F32">; +defm V_CMP_F_F32 : VOPC_F32 <vopc<0x0, 0x40>, "v_cmp_f_f32">; +defm V_CMP_LT_F32 : VOPC_F32 <vopc<0x1, 0x41>, "v_cmp_lt_f32", COND_OLT>; +defm V_CMP_EQ_F32 : VOPC_F32 <vopc<0x2, 0x42>, "v_cmp_eq_f32", COND_OEQ>; +defm V_CMP_LE_F32 : VOPC_F32 <vopc<0x3, 0x43>, "v_cmp_le_f32", COND_OLE>; +defm V_CMP_GT_F32 : VOPC_F32 <vopc<0x4, 0x44>, "v_cmp_gt_f32", COND_OGT>; +defm V_CMP_LG_F32 : VOPC_F32 <vopc<0x5, 0x45>, "v_cmp_lg_f32", COND_ONE>; +defm V_CMP_GE_F32 : VOPC_F32 <vopc<0x6, 0x46>, "v_cmp_ge_f32", COND_OGE>; +defm V_CMP_O_F32 : VOPC_F32 <vopc<0x7, 0x47>, "v_cmp_o_f32", COND_O>; +defm V_CMP_U_F32 : VOPC_F32 <vopc<0x8, 0x48>, "v_cmp_u_f32", COND_UO>; +defm V_CMP_NGE_F32 : VOPC_F32 <vopc<0x9, 0x49>, "v_cmp_nge_f32", COND_ULT>; +defm V_CMP_NLG_F32 : VOPC_F32 <vopc<0xa, 0x4a>, "v_cmp_nlg_f32", COND_UEQ>; +defm V_CMP_NGT_F32 : VOPC_F32 <vopc<0xb, 0x4b>, "v_cmp_ngt_f32", COND_ULE>; +defm V_CMP_NLE_F32 : VOPC_F32 <vopc<0xc, 0x4c>, "v_cmp_nle_f32", COND_UGT>; +defm V_CMP_NEQ_F32 : VOPC_F32 <vopc<0xd, 0x4d>, "v_cmp_neq_f32", COND_UNE>; +defm V_CMP_NLT_F32 : VOPC_F32 <vopc<0xe, 0x4e>, "v_cmp_nlt_f32", COND_UGE>; +defm V_CMP_TRU_F32 : VOPC_F32 <vopc<0xf, 0x4f>, "v_cmp_tru_f32">; let hasSideEffects = 1 in { -defm V_CMPX_F_F32 : VOPCX_32 <0x00000010, "V_CMPX_F_F32">; -defm V_CMPX_LT_F32 : VOPCX_32 <0x00000011, "V_CMPX_LT_F32">; -defm V_CMPX_EQ_F32 : VOPCX_32 <0x00000012, "V_CMPX_EQ_F32">; -defm V_CMPX_LE_F32 : VOPCX_32 <0x00000013, "V_CMPX_LE_F32">; -defm 
V_CMPX_GT_F32 : VOPCX_32 <0x00000014, "V_CMPX_GT_F32">; -defm V_CMPX_LG_F32 : VOPCX_32 <0x00000015, "V_CMPX_LG_F32">; -defm V_CMPX_GE_F32 : VOPCX_32 <0x00000016, "V_CMPX_GE_F32">; -defm V_CMPX_O_F32 : VOPCX_32 <0x00000017, "V_CMPX_O_F32">; -defm V_CMPX_U_F32 : VOPCX_32 <0x00000018, "V_CMPX_U_F32">; -defm V_CMPX_NGE_F32 : VOPCX_32 <0x00000019, "V_CMPX_NGE_F32">; -defm V_CMPX_NLG_F32 : VOPCX_32 <0x0000001a, "V_CMPX_NLG_F32">; -defm V_CMPX_NGT_F32 : VOPCX_32 <0x0000001b, "V_CMPX_NGT_F32">; -defm V_CMPX_NLE_F32 : VOPCX_32 <0x0000001c, "V_CMPX_NLE_F32">; -defm V_CMPX_NEQ_F32 : VOPCX_32 <0x0000001d, "V_CMPX_NEQ_F32">; -defm V_CMPX_NLT_F32 : VOPCX_32 <0x0000001e, "V_CMPX_NLT_F32">; -defm V_CMPX_TRU_F32 : VOPCX_32 <0x0000001f, "V_CMPX_TRU_F32">; +defm V_CMPX_F_F32 : VOPCX_F32 <vopc<0x10, 0x50>, "v_cmpx_f_f32">; +defm V_CMPX_LT_F32 : VOPCX_F32 <vopc<0x11, 0x51>, "v_cmpx_lt_f32">; +defm V_CMPX_EQ_F32 : VOPCX_F32 <vopc<0x12, 0x52>, "v_cmpx_eq_f32">; +defm V_CMPX_LE_F32 : VOPCX_F32 <vopc<0x13, 0x53>, "v_cmpx_le_f32">; +defm V_CMPX_GT_F32 : VOPCX_F32 <vopc<0x14, 0x54>, "v_cmpx_gt_f32">; +defm V_CMPX_LG_F32 : VOPCX_F32 <vopc<0x15, 0x55>, "v_cmpx_lg_f32">; +defm V_CMPX_GE_F32 : VOPCX_F32 <vopc<0x16, 0x56>, "v_cmpx_ge_f32">; +defm V_CMPX_O_F32 : VOPCX_F32 <vopc<0x17, 0x57>, "v_cmpx_o_f32">; +defm V_CMPX_U_F32 : VOPCX_F32 <vopc<0x18, 0x58>, "v_cmpx_u_f32">; +defm V_CMPX_NGE_F32 : VOPCX_F32 <vopc<0x19, 0x59>, "v_cmpx_nge_f32">; +defm V_CMPX_NLG_F32 : VOPCX_F32 <vopc<0x1a, 0x5a>, "v_cmpx_nlg_f32">; +defm V_CMPX_NGT_F32 : VOPCX_F32 <vopc<0x1b, 0x5b>, "v_cmpx_ngt_f32">; +defm V_CMPX_NLE_F32 : VOPCX_F32 <vopc<0x1c, 0x5c>, "v_cmpx_nle_f32">; +defm V_CMPX_NEQ_F32 : VOPCX_F32 <vopc<0x1d, 0x5d>, "v_cmpx_neq_f32">; +defm V_CMPX_NLT_F32 : VOPCX_F32 <vopc<0x1e, 0x5e>, "v_cmpx_nlt_f32">; +defm V_CMPX_TRU_F32 : VOPCX_F32 <vopc<0x1f, 0x5f>, "v_cmpx_tru_f32">; } // End hasSideEffects = 1 -defm V_CMP_F_F64 : VOPC_64 <0x00000020, "V_CMP_F_F64">; -defm V_CMP_LT_F64 : VOPC_64 <0x00000021, 
"V_CMP_LT_F64", f64, COND_OLT>; -defm V_CMP_EQ_F64 : VOPC_64 <0x00000022, "V_CMP_EQ_F64", f64, COND_OEQ>; -defm V_CMP_LE_F64 : VOPC_64 <0x00000023, "V_CMP_LE_F64", f64, COND_OLE>; -defm V_CMP_GT_F64 : VOPC_64 <0x00000024, "V_CMP_GT_F64", f64, COND_OGT>; -defm V_CMP_LG_F64 : VOPC_64 <0x00000025, "V_CMP_LG_F64">; -defm V_CMP_GE_F64 : VOPC_64 <0x00000026, "V_CMP_GE_F64", f64, COND_OGE>; -defm V_CMP_O_F64 : VOPC_64 <0x00000027, "V_CMP_O_F64", f64, COND_O>; -defm V_CMP_U_F64 : VOPC_64 <0x00000028, "V_CMP_U_F64", f64, COND_UO>; -defm V_CMP_NGE_F64 : VOPC_64 <0x00000029, "V_CMP_NGE_F64">; -defm V_CMP_NLG_F64 : VOPC_64 <0x0000002a, "V_CMP_NLG_F64">; -defm V_CMP_NGT_F64 : VOPC_64 <0x0000002b, "V_CMP_NGT_F64">; -defm V_CMP_NLE_F64 : VOPC_64 <0x0000002c, "V_CMP_NLE_F64">; -defm V_CMP_NEQ_F64 : VOPC_64 <0x0000002d, "V_CMP_NEQ_F64", f64, COND_UNE>; -defm V_CMP_NLT_F64 : VOPC_64 <0x0000002e, "V_CMP_NLT_F64">; -defm V_CMP_TRU_F64 : VOPC_64 <0x0000002f, "V_CMP_TRU_F64">; +defm V_CMP_F_F64 : VOPC_F64 <vopc<0x20, 0x60>, "v_cmp_f_f64">; +defm V_CMP_LT_F64 : VOPC_F64 <vopc<0x21, 0x61>, "v_cmp_lt_f64", COND_OLT>; +defm V_CMP_EQ_F64 : VOPC_F64 <vopc<0x22, 0x62>, "v_cmp_eq_f64", COND_OEQ>; +defm V_CMP_LE_F64 : VOPC_F64 <vopc<0x23, 0x63>, "v_cmp_le_f64", COND_OLE>; +defm V_CMP_GT_F64 : VOPC_F64 <vopc<0x24, 0x64>, "v_cmp_gt_f64", COND_OGT>; +defm V_CMP_LG_F64 : VOPC_F64 <vopc<0x25, 0x65>, "v_cmp_lg_f64", COND_ONE>; +defm V_CMP_GE_F64 : VOPC_F64 <vopc<0x26, 0x66>, "v_cmp_ge_f64", COND_OGE>; +defm V_CMP_O_F64 : VOPC_F64 <vopc<0x27, 0x67>, "v_cmp_o_f64", COND_O>; +defm V_CMP_U_F64 : VOPC_F64 <vopc<0x28, 0x68>, "v_cmp_u_f64", COND_UO>; +defm V_CMP_NGE_F64 : VOPC_F64 <vopc<0x29, 0x69>, "v_cmp_nge_f64", COND_ULT>; +defm V_CMP_NLG_F64 : VOPC_F64 <vopc<0x2a, 0x6a>, "v_cmp_nlg_f64", COND_UEQ>; +defm V_CMP_NGT_F64 : VOPC_F64 <vopc<0x2b, 0x6b>, "v_cmp_ngt_f64", COND_ULE>; +defm V_CMP_NLE_F64 : VOPC_F64 <vopc<0x2c, 0x6c>, "v_cmp_nle_f64", COND_UGT>; +defm V_CMP_NEQ_F64 : VOPC_F64 <vopc<0x2d, 0x6d>, 
"v_cmp_neq_f64", COND_UNE>; +defm V_CMP_NLT_F64 : VOPC_F64 <vopc<0x2e, 0x6e>, "v_cmp_nlt_f64", COND_UGE>; +defm V_CMP_TRU_F64 : VOPC_F64 <vopc<0x2f, 0x6f>, "v_cmp_tru_f64">; let hasSideEffects = 1 in { -defm V_CMPX_F_F64 : VOPCX_64 <0x00000030, "V_CMPX_F_F64">; -defm V_CMPX_LT_F64 : VOPCX_64 <0x00000031, "V_CMPX_LT_F64">; -defm V_CMPX_EQ_F64 : VOPCX_64 <0x00000032, "V_CMPX_EQ_F64">; -defm V_CMPX_LE_F64 : VOPCX_64 <0x00000033, "V_CMPX_LE_F64">; -defm V_CMPX_GT_F64 : VOPCX_64 <0x00000034, "V_CMPX_GT_F64">; -defm V_CMPX_LG_F64 : VOPCX_64 <0x00000035, "V_CMPX_LG_F64">; -defm V_CMPX_GE_F64 : VOPCX_64 <0x00000036, "V_CMPX_GE_F64">; -defm V_CMPX_O_F64 : VOPCX_64 <0x00000037, "V_CMPX_O_F64">; -defm V_CMPX_U_F64 : VOPCX_64 <0x00000038, "V_CMPX_U_F64">; -defm V_CMPX_NGE_F64 : VOPCX_64 <0x00000039, "V_CMPX_NGE_F64">; -defm V_CMPX_NLG_F64 : VOPCX_64 <0x0000003a, "V_CMPX_NLG_F64">; -defm V_CMPX_NGT_F64 : VOPCX_64 <0x0000003b, "V_CMPX_NGT_F64">; -defm V_CMPX_NLE_F64 : VOPCX_64 <0x0000003c, "V_CMPX_NLE_F64">; -defm V_CMPX_NEQ_F64 : VOPCX_64 <0x0000003d, "V_CMPX_NEQ_F64">; -defm V_CMPX_NLT_F64 : VOPCX_64 <0x0000003e, "V_CMPX_NLT_F64">; -defm V_CMPX_TRU_F64 : VOPCX_64 <0x0000003f, "V_CMPX_TRU_F64">; +defm V_CMPX_F_F64 : VOPCX_F64 <vopc<0x30, 0x70>, "v_cmpx_f_f64">; +defm V_CMPX_LT_F64 : VOPCX_F64 <vopc<0x31, 0x71>, "v_cmpx_lt_f64">; +defm V_CMPX_EQ_F64 : VOPCX_F64 <vopc<0x32, 0x72>, "v_cmpx_eq_f64">; +defm V_CMPX_LE_F64 : VOPCX_F64 <vopc<0x33, 0x73>, "v_cmpx_le_f64">; +defm V_CMPX_GT_F64 : VOPCX_F64 <vopc<0x34, 0x74>, "v_cmpx_gt_f64">; +defm V_CMPX_LG_F64 : VOPCX_F64 <vopc<0x35, 0x75>, "v_cmpx_lg_f64">; +defm V_CMPX_GE_F64 : VOPCX_F64 <vopc<0x36, 0x76>, "v_cmpx_ge_f64">; +defm V_CMPX_O_F64 : VOPCX_F64 <vopc<0x37, 0x77>, "v_cmpx_o_f64">; +defm V_CMPX_U_F64 : VOPCX_F64 <vopc<0x38, 0x78>, "v_cmpx_u_f64">; +defm V_CMPX_NGE_F64 : VOPCX_F64 <vopc<0x39, 0x79>, "v_cmpx_nge_f64">; +defm V_CMPX_NLG_F64 : VOPCX_F64 <vopc<0x3a, 0x7a>, "v_cmpx_nlg_f64">; +defm V_CMPX_NGT_F64 : VOPCX_F64 
<vopc<0x3b, 0x7b>, "v_cmpx_ngt_f64">; +defm V_CMPX_NLE_F64 : VOPCX_F64 <vopc<0x3c, 0x7c>, "v_cmpx_nle_f64">; +defm V_CMPX_NEQ_F64 : VOPCX_F64 <vopc<0x3d, 0x7d>, "v_cmpx_neq_f64">; +defm V_CMPX_NLT_F64 : VOPCX_F64 <vopc<0x3e, 0x7e>, "v_cmpx_nlt_f64">; +defm V_CMPX_TRU_F64 : VOPCX_F64 <vopc<0x3f, 0x7f>, "v_cmpx_tru_f64">; } // End hasSideEffects = 1 -defm V_CMPS_F_F32 : VOPC_32 <0x00000040, "V_CMPS_F_F32">; -defm V_CMPS_LT_F32 : VOPC_32 <0x00000041, "V_CMPS_LT_F32">; -defm V_CMPS_EQ_F32 : VOPC_32 <0x00000042, "V_CMPS_EQ_F32">; -defm V_CMPS_LE_F32 : VOPC_32 <0x00000043, "V_CMPS_LE_F32">; -defm V_CMPS_GT_F32 : VOPC_32 <0x00000044, "V_CMPS_GT_F32">; -defm V_CMPS_LG_F32 : VOPC_32 <0x00000045, "V_CMPS_LG_F32">; -defm V_CMPS_GE_F32 : VOPC_32 <0x00000046, "V_CMPS_GE_F32">; -defm V_CMPS_O_F32 : VOPC_32 <0x00000047, "V_CMPS_O_F32">; -defm V_CMPS_U_F32 : VOPC_32 <0x00000048, "V_CMPS_U_F32">; -defm V_CMPS_NGE_F32 : VOPC_32 <0x00000049, "V_CMPS_NGE_F32">; -defm V_CMPS_NLG_F32 : VOPC_32 <0x0000004a, "V_CMPS_NLG_F32">; -defm V_CMPS_NGT_F32 : VOPC_32 <0x0000004b, "V_CMPS_NGT_F32">; -defm V_CMPS_NLE_F32 : VOPC_32 <0x0000004c, "V_CMPS_NLE_F32">; -defm V_CMPS_NEQ_F32 : VOPC_32 <0x0000004d, "V_CMPS_NEQ_F32">; -defm V_CMPS_NLT_F32 : VOPC_32 <0x0000004e, "V_CMPS_NLT_F32">; -defm V_CMPS_TRU_F32 : VOPC_32 <0x0000004f, "V_CMPS_TRU_F32">; +let SubtargetPredicate = isSICI in { + +defm V_CMPS_F_F32 : VOPC_F32 <vopc<0x40>, "v_cmps_f_f32">; +defm V_CMPS_LT_F32 : VOPC_F32 <vopc<0x41>, "v_cmps_lt_f32">; +defm V_CMPS_EQ_F32 : VOPC_F32 <vopc<0x42>, "v_cmps_eq_f32">; +defm V_CMPS_LE_F32 : VOPC_F32 <vopc<0x43>, "v_cmps_le_f32">; +defm V_CMPS_GT_F32 : VOPC_F32 <vopc<0x44>, "v_cmps_gt_f32">; +defm V_CMPS_LG_F32 : VOPC_F32 <vopc<0x45>, "v_cmps_lg_f32">; +defm V_CMPS_GE_F32 : VOPC_F32 <vopc<0x46>, "v_cmps_ge_f32">; +defm V_CMPS_O_F32 : VOPC_F32 <vopc<0x47>, "v_cmps_o_f32">; +defm V_CMPS_U_F32 : VOPC_F32 <vopc<0x48>, "v_cmps_u_f32">; +defm V_CMPS_NGE_F32 : VOPC_F32 <vopc<0x49>, "v_cmps_nge_f32">; +defm 
V_CMPS_NLG_F32 : VOPC_F32 <vopc<0x4a>, "v_cmps_nlg_f32">; +defm V_CMPS_NGT_F32 : VOPC_F32 <vopc<0x4b>, "v_cmps_ngt_f32">; +defm V_CMPS_NLE_F32 : VOPC_F32 <vopc<0x4c>, "v_cmps_nle_f32">; +defm V_CMPS_NEQ_F32 : VOPC_F32 <vopc<0x4d>, "v_cmps_neq_f32">; +defm V_CMPS_NLT_F32 : VOPC_F32 <vopc<0x4e>, "v_cmps_nlt_f32">; +defm V_CMPS_TRU_F32 : VOPC_F32 <vopc<0x4f>, "v_cmps_tru_f32">; let hasSideEffects = 1 in { -defm V_CMPSX_F_F32 : VOPCX_32 <0x00000050, "V_CMPSX_F_F32">; -defm V_CMPSX_LT_F32 : VOPCX_32 <0x00000051, "V_CMPSX_LT_F32">; -defm V_CMPSX_EQ_F32 : VOPCX_32 <0x00000052, "V_CMPSX_EQ_F32">; -defm V_CMPSX_LE_F32 : VOPCX_32 <0x00000053, "V_CMPSX_LE_F32">; -defm V_CMPSX_GT_F32 : VOPCX_32 <0x00000054, "V_CMPSX_GT_F32">; -defm V_CMPSX_LG_F32 : VOPCX_32 <0x00000055, "V_CMPSX_LG_F32">; -defm V_CMPSX_GE_F32 : VOPCX_32 <0x00000056, "V_CMPSX_GE_F32">; -defm V_CMPSX_O_F32 : VOPCX_32 <0x00000057, "V_CMPSX_O_F32">; -defm V_CMPSX_U_F32 : VOPCX_32 <0x00000058, "V_CMPSX_U_F32">; -defm V_CMPSX_NGE_F32 : VOPCX_32 <0x00000059, "V_CMPSX_NGE_F32">; -defm V_CMPSX_NLG_F32 : VOPCX_32 <0x0000005a, "V_CMPSX_NLG_F32">; -defm V_CMPSX_NGT_F32 : VOPCX_32 <0x0000005b, "V_CMPSX_NGT_F32">; -defm V_CMPSX_NLE_F32 : VOPCX_32 <0x0000005c, "V_CMPSX_NLE_F32">; -defm V_CMPSX_NEQ_F32 : VOPCX_32 <0x0000005d, "V_CMPSX_NEQ_F32">; -defm V_CMPSX_NLT_F32 : VOPCX_32 <0x0000005e, "V_CMPSX_NLT_F32">; -defm V_CMPSX_TRU_F32 : VOPCX_32 <0x0000005f, "V_CMPSX_TRU_F32">; +defm V_CMPSX_F_F32 : VOPCX_F32 <vopc<0x50>, "v_cmpsx_f_f32">; +defm V_CMPSX_LT_F32 : VOPCX_F32 <vopc<0x51>, "v_cmpsx_lt_f32">; +defm V_CMPSX_EQ_F32 : VOPCX_F32 <vopc<0x52>, "v_cmpsx_eq_f32">; +defm V_CMPSX_LE_F32 : VOPCX_F32 <vopc<0x53>, "v_cmpsx_le_f32">; +defm V_CMPSX_GT_F32 : VOPCX_F32 <vopc<0x54>, "v_cmpsx_gt_f32">; +defm V_CMPSX_LG_F32 : VOPCX_F32 <vopc<0x55>, "v_cmpsx_lg_f32">; +defm V_CMPSX_GE_F32 : VOPCX_F32 <vopc<0x56>, "v_cmpsx_ge_f32">; +defm V_CMPSX_O_F32 : VOPCX_F32 <vopc<0x57>, "v_cmpsx_o_f32">; +defm V_CMPSX_U_F32 : VOPCX_F32 <vopc<0x58>, 
"v_cmpsx_u_f32">; +defm V_CMPSX_NGE_F32 : VOPCX_F32 <vopc<0x59>, "v_cmpsx_nge_f32">; +defm V_CMPSX_NLG_F32 : VOPCX_F32 <vopc<0x5a>, "v_cmpsx_nlg_f32">; +defm V_CMPSX_NGT_F32 : VOPCX_F32 <vopc<0x5b>, "v_cmpsx_ngt_f32">; +defm V_CMPSX_NLE_F32 : VOPCX_F32 <vopc<0x5c>, "v_cmpsx_nle_f32">; +defm V_CMPSX_NEQ_F32 : VOPCX_F32 <vopc<0x5d>, "v_cmpsx_neq_f32">; +defm V_CMPSX_NLT_F32 : VOPCX_F32 <vopc<0x5e>, "v_cmpsx_nlt_f32">; +defm V_CMPSX_TRU_F32 : VOPCX_F32 <vopc<0x5f>, "v_cmpsx_tru_f32">; } // End hasSideEffects = 1 -defm V_CMPS_F_F64 : VOPC_64 <0x00000060, "V_CMPS_F_F64">; -defm V_CMPS_LT_F64 : VOPC_64 <0x00000061, "V_CMPS_LT_F64">; -defm V_CMPS_EQ_F64 : VOPC_64 <0x00000062, "V_CMPS_EQ_F64">; -defm V_CMPS_LE_F64 : VOPC_64 <0x00000063, "V_CMPS_LE_F64">; -defm V_CMPS_GT_F64 : VOPC_64 <0x00000064, "V_CMPS_GT_F64">; -defm V_CMPS_LG_F64 : VOPC_64 <0x00000065, "V_CMPS_LG_F64">; -defm V_CMPS_GE_F64 : VOPC_64 <0x00000066, "V_CMPS_GE_F64">; -defm V_CMPS_O_F64 : VOPC_64 <0x00000067, "V_CMPS_O_F64">; -defm V_CMPS_U_F64 : VOPC_64 <0x00000068, "V_CMPS_U_F64">; -defm V_CMPS_NGE_F64 : VOPC_64 <0x00000069, "V_CMPS_NGE_F64">; -defm V_CMPS_NLG_F64 : VOPC_64 <0x0000006a, "V_CMPS_NLG_F64">; -defm V_CMPS_NGT_F64 : VOPC_64 <0x0000006b, "V_CMPS_NGT_F64">; -defm V_CMPS_NLE_F64 : VOPC_64 <0x0000006c, "V_CMPS_NLE_F64">; -defm V_CMPS_NEQ_F64 : VOPC_64 <0x0000006d, "V_CMPS_NEQ_F64">; -defm V_CMPS_NLT_F64 : VOPC_64 <0x0000006e, "V_CMPS_NLT_F64">; -defm V_CMPS_TRU_F64 : VOPC_64 <0x0000006f, "V_CMPS_TRU_F64">; +defm V_CMPS_F_F64 : VOPC_F64 <vopc<0x60>, "v_cmps_f_f64">; +defm V_CMPS_LT_F64 : VOPC_F64 <vopc<0x61>, "v_cmps_lt_f64">; +defm V_CMPS_EQ_F64 : VOPC_F64 <vopc<0x62>, "v_cmps_eq_f64">; +defm V_CMPS_LE_F64 : VOPC_F64 <vopc<0x63>, "v_cmps_le_f64">; +defm V_CMPS_GT_F64 : VOPC_F64 <vopc<0x64>, "v_cmps_gt_f64">; +defm V_CMPS_LG_F64 : VOPC_F64 <vopc<0x65>, "v_cmps_lg_f64">; +defm V_CMPS_GE_F64 : VOPC_F64 <vopc<0x66>, "v_cmps_ge_f64">; +defm V_CMPS_O_F64 : VOPC_F64 <vopc<0x67>, "v_cmps_o_f64">; +defm 
V_CMPS_U_F64 : VOPC_F64 <vopc<0x68>, "v_cmps_u_f64">; +defm V_CMPS_NGE_F64 : VOPC_F64 <vopc<0x69>, "v_cmps_nge_f64">; +defm V_CMPS_NLG_F64 : VOPC_F64 <vopc<0x6a>, "v_cmps_nlg_f64">; +defm V_CMPS_NGT_F64 : VOPC_F64 <vopc<0x6b>, "v_cmps_ngt_f64">; +defm V_CMPS_NLE_F64 : VOPC_F64 <vopc<0x6c>, "v_cmps_nle_f64">; +defm V_CMPS_NEQ_F64 : VOPC_F64 <vopc<0x6d>, "v_cmps_neq_f64">; +defm V_CMPS_NLT_F64 : VOPC_F64 <vopc<0x6e>, "v_cmps_nlt_f64">; +defm V_CMPS_TRU_F64 : VOPC_F64 <vopc<0x6f>, "v_cmps_tru_f64">; let hasSideEffects = 1, Defs = [EXEC] in { -defm V_CMPSX_F_F64 : VOPC_64 <0x00000070, "V_CMPSX_F_F64">; -defm V_CMPSX_LT_F64 : VOPC_64 <0x00000071, "V_CMPSX_LT_F64">; -defm V_CMPSX_EQ_F64 : VOPC_64 <0x00000072, "V_CMPSX_EQ_F64">; -defm V_CMPSX_LE_F64 : VOPC_64 <0x00000073, "V_CMPSX_LE_F64">; -defm V_CMPSX_GT_F64 : VOPC_64 <0x00000074, "V_CMPSX_GT_F64">; -defm V_CMPSX_LG_F64 : VOPC_64 <0x00000075, "V_CMPSX_LG_F64">; -defm V_CMPSX_GE_F64 : VOPC_64 <0x00000076, "V_CMPSX_GE_F64">; -defm V_CMPSX_O_F64 : VOPC_64 <0x00000077, "V_CMPSX_O_F64">; -defm V_CMPSX_U_F64 : VOPC_64 <0x00000078, "V_CMPSX_U_F64">; -defm V_CMPSX_NGE_F64 : VOPC_64 <0x00000079, "V_CMPSX_NGE_F64">; -defm V_CMPSX_NLG_F64 : VOPC_64 <0x0000007a, "V_CMPSX_NLG_F64">; -defm V_CMPSX_NGT_F64 : VOPC_64 <0x0000007b, "V_CMPSX_NGT_F64">; -defm V_CMPSX_NLE_F64 : VOPC_64 <0x0000007c, "V_CMPSX_NLE_F64">; -defm V_CMPSX_NEQ_F64 : VOPC_64 <0x0000007d, "V_CMPSX_NEQ_F64">; -defm V_CMPSX_NLT_F64 : VOPC_64 <0x0000007e, "V_CMPSX_NLT_F64">; -defm V_CMPSX_TRU_F64 : VOPC_64 <0x0000007f, "V_CMPSX_TRU_F64">; +defm V_CMPSX_F_F64 : VOPC_F64 <vopc<0x70>, "v_cmpsx_f_f64">; +defm V_CMPSX_LT_F64 : VOPC_F64 <vopc<0x71>, "v_cmpsx_lt_f64">; +defm V_CMPSX_EQ_F64 : VOPC_F64 <vopc<0x72>, "v_cmpsx_eq_f64">; +defm V_CMPSX_LE_F64 : VOPC_F64 <vopc<0x73>, "v_cmpsx_le_f64">; +defm V_CMPSX_GT_F64 : VOPC_F64 <vopc<0x74>, "v_cmpsx_gt_f64">; +defm V_CMPSX_LG_F64 : VOPC_F64 <vopc<0x75>, "v_cmpsx_lg_f64">; +defm V_CMPSX_GE_F64 : VOPC_F64 <vopc<0x76>, 
"v_cmpsx_ge_f64">; +defm V_CMPSX_O_F64 : VOPC_F64 <vopc<0x77>, "v_cmpsx_o_f64">; +defm V_CMPSX_U_F64 : VOPC_F64 <vopc<0x78>, "v_cmpsx_u_f64">; +defm V_CMPSX_NGE_F64 : VOPC_F64 <vopc<0x79>, "v_cmpsx_nge_f64">; +defm V_CMPSX_NLG_F64 : VOPC_F64 <vopc<0x7a>, "v_cmpsx_nlg_f64">; +defm V_CMPSX_NGT_F64 : VOPC_F64 <vopc<0x7b>, "v_cmpsx_ngt_f64">; +defm V_CMPSX_NLE_F64 : VOPC_F64 <vopc<0x7c>, "v_cmpsx_nle_f64">; +defm V_CMPSX_NEQ_F64 : VOPC_F64 <vopc<0x7d>, "v_cmpsx_neq_f64">; +defm V_CMPSX_NLT_F64 : VOPC_F64 <vopc<0x7e>, "v_cmpsx_nlt_f64">; +defm V_CMPSX_TRU_F64 : VOPC_F64 <vopc<0x7f>, "v_cmpsx_tru_f64">; } // End hasSideEffects = 1, Defs = [EXEC] -defm V_CMP_F_I32 : VOPC_32 <0x00000080, "V_CMP_F_I32">; -defm V_CMP_LT_I32 : VOPC_32 <0x00000081, "V_CMP_LT_I32", i32, COND_SLT>; -defm V_CMP_EQ_I32 : VOPC_32 <0x00000082, "V_CMP_EQ_I32", i32, COND_EQ>; -defm V_CMP_LE_I32 : VOPC_32 <0x00000083, "V_CMP_LE_I32", i32, COND_SLE>; -defm V_CMP_GT_I32 : VOPC_32 <0x00000084, "V_CMP_GT_I32", i32, COND_SGT>; -defm V_CMP_NE_I32 : VOPC_32 <0x00000085, "V_CMP_NE_I32", i32, COND_NE>; -defm V_CMP_GE_I32 : VOPC_32 <0x00000086, "V_CMP_GE_I32", i32, COND_SGE>; -defm V_CMP_T_I32 : VOPC_32 <0x00000087, "V_CMP_T_I32">; +} // End SubtargetPredicate = isSICI + +defm V_CMP_F_I32 : VOPC_I32 <vopc<0x80, 0xc0>, "v_cmp_f_i32">; +defm V_CMP_LT_I32 : VOPC_I32 <vopc<0x81, 0xc1>, "v_cmp_lt_i32", COND_SLT>; +defm V_CMP_EQ_I32 : VOPC_I32 <vopc<0x82, 0xc2>, "v_cmp_eq_i32", COND_EQ>; +defm V_CMP_LE_I32 : VOPC_I32 <vopc<0x83, 0xc3>, "v_cmp_le_i32", COND_SLE>; +defm V_CMP_GT_I32 : VOPC_I32 <vopc<0x84, 0xc4>, "v_cmp_gt_i32", COND_SGT>; +defm V_CMP_NE_I32 : VOPC_I32 <vopc<0x85, 0xc5>, "v_cmp_ne_i32", COND_NE>; +defm V_CMP_GE_I32 : VOPC_I32 <vopc<0x86, 0xc6>, "v_cmp_ge_i32", COND_SGE>; +defm V_CMP_T_I32 : VOPC_I32 <vopc<0x87, 0xc7>, "v_cmp_t_i32">; let hasSideEffects = 1 in { -defm V_CMPX_F_I32 : VOPCX_32 <0x00000090, "V_CMPX_F_I32">; -defm V_CMPX_LT_I32 : VOPCX_32 <0x00000091, "V_CMPX_LT_I32">; -defm V_CMPX_EQ_I32 : 
VOPCX_32 <0x00000092, "V_CMPX_EQ_I32">; -defm V_CMPX_LE_I32 : VOPCX_32 <0x00000093, "V_CMPX_LE_I32">; -defm V_CMPX_GT_I32 : VOPCX_32 <0x00000094, "V_CMPX_GT_I32">; -defm V_CMPX_NE_I32 : VOPCX_32 <0x00000095, "V_CMPX_NE_I32">; -defm V_CMPX_GE_I32 : VOPCX_32 <0x00000096, "V_CMPX_GE_I32">; -defm V_CMPX_T_I32 : VOPCX_32 <0x00000097, "V_CMPX_T_I32">; +defm V_CMPX_F_I32 : VOPCX_I32 <vopc<0x90, 0xd0>, "v_cmpx_f_i32">; +defm V_CMPX_LT_I32 : VOPCX_I32 <vopc<0x91, 0xd1>, "v_cmpx_lt_i32">; +defm V_CMPX_EQ_I32 : VOPCX_I32 <vopc<0x92, 0xd2>, "v_cmpx_eq_i32">; +defm V_CMPX_LE_I32 : VOPCX_I32 <vopc<0x93, 0xd3>, "v_cmpx_le_i32">; +defm V_CMPX_GT_I32 : VOPCX_I32 <vopc<0x94, 0xd4>, "v_cmpx_gt_i32">; +defm V_CMPX_NE_I32 : VOPCX_I32 <vopc<0x95, 0xd5>, "v_cmpx_ne_i32">; +defm V_CMPX_GE_I32 : VOPCX_I32 <vopc<0x96, 0xd6>, "v_cmpx_ge_i32">; +defm V_CMPX_T_I32 : VOPCX_I32 <vopc<0x97, 0xd7>, "v_cmpx_t_i32">; } // End hasSideEffects = 1 -defm V_CMP_F_I64 : VOPC_64 <0x000000a0, "V_CMP_F_I64">; -defm V_CMP_LT_I64 : VOPC_64 <0x000000a1, "V_CMP_LT_I64", i64, COND_SLT>; -defm V_CMP_EQ_I64 : VOPC_64 <0x000000a2, "V_CMP_EQ_I64", i64, COND_EQ>; -defm V_CMP_LE_I64 : VOPC_64 <0x000000a3, "V_CMP_LE_I64", i64, COND_SLE>; -defm V_CMP_GT_I64 : VOPC_64 <0x000000a4, "V_CMP_GT_I64", i64, COND_SGT>; -defm V_CMP_NE_I64 : VOPC_64 <0x000000a5, "V_CMP_NE_I64", i64, COND_NE>; -defm V_CMP_GE_I64 : VOPC_64 <0x000000a6, "V_CMP_GE_I64", i64, COND_SGE>; -defm V_CMP_T_I64 : VOPC_64 <0x000000a7, "V_CMP_T_I64">; +defm V_CMP_F_I64 : VOPC_I64 <vopc<0xa0, 0xe0>, "v_cmp_f_i64">; +defm V_CMP_LT_I64 : VOPC_I64 <vopc<0xa1, 0xe1>, "v_cmp_lt_i64", COND_SLT>; +defm V_CMP_EQ_I64 : VOPC_I64 <vopc<0xa2, 0xe2>, "v_cmp_eq_i64", COND_EQ>; +defm V_CMP_LE_I64 : VOPC_I64 <vopc<0xa3, 0xe3>, "v_cmp_le_i64", COND_SLE>; +defm V_CMP_GT_I64 : VOPC_I64 <vopc<0xa4, 0xe4>, "v_cmp_gt_i64", COND_SGT>; +defm V_CMP_NE_I64 : VOPC_I64 <vopc<0xa5, 0xe5>, "v_cmp_ne_i64", COND_NE>; +defm V_CMP_GE_I64 : VOPC_I64 <vopc<0xa6, 0xe6>, "v_cmp_ge_i64", COND_SGE>; 
+defm V_CMP_T_I64 : VOPC_I64 <vopc<0xa7, 0xe7>, "v_cmp_t_i64">; let hasSideEffects = 1 in { -defm V_CMPX_F_I64 : VOPCX_64 <0x000000b0, "V_CMPX_F_I64">; -defm V_CMPX_LT_I64 : VOPCX_64 <0x000000b1, "V_CMPX_LT_I64">; -defm V_CMPX_EQ_I64 : VOPCX_64 <0x000000b2, "V_CMPX_EQ_I64">; -defm V_CMPX_LE_I64 : VOPCX_64 <0x000000b3, "V_CMPX_LE_I64">; -defm V_CMPX_GT_I64 : VOPCX_64 <0x000000b4, "V_CMPX_GT_I64">; -defm V_CMPX_NE_I64 : VOPCX_64 <0x000000b5, "V_CMPX_NE_I64">; -defm V_CMPX_GE_I64 : VOPCX_64 <0x000000b6, "V_CMPX_GE_I64">; -defm V_CMPX_T_I64 : VOPCX_64 <0x000000b7, "V_CMPX_T_I64">; +defm V_CMPX_F_I64 : VOPCX_I64 <vopc<0xb0, 0xf0>, "v_cmpx_f_i64">; +defm V_CMPX_LT_I64 : VOPCX_I64 <vopc<0xb1, 0xf1>, "v_cmpx_lt_i64">; +defm V_CMPX_EQ_I64 : VOPCX_I64 <vopc<0xb2, 0xf2>, "v_cmpx_eq_i64">; +defm V_CMPX_LE_I64 : VOPCX_I64 <vopc<0xb3, 0xf3>, "v_cmpx_le_i64">; +defm V_CMPX_GT_I64 : VOPCX_I64 <vopc<0xb4, 0xf4>, "v_cmpx_gt_i64">; +defm V_CMPX_NE_I64 : VOPCX_I64 <vopc<0xb5, 0xf5>, "v_cmpx_ne_i64">; +defm V_CMPX_GE_I64 : VOPCX_I64 <vopc<0xb6, 0xf6>, "v_cmpx_ge_i64">; +defm V_CMPX_T_I64 : VOPCX_I64 <vopc<0xb7, 0xf7>, "v_cmpx_t_i64">; } // End hasSideEffects = 1 -defm V_CMP_F_U32 : VOPC_32 <0x000000c0, "V_CMP_F_U32">; -defm V_CMP_LT_U32 : VOPC_32 <0x000000c1, "V_CMP_LT_U32", i32, COND_ULT>; -defm V_CMP_EQ_U32 : VOPC_32 <0x000000c2, "V_CMP_EQ_U32", i32, COND_EQ>; -defm V_CMP_LE_U32 : VOPC_32 <0x000000c3, "V_CMP_LE_U32", i32, COND_ULE>; -defm V_CMP_GT_U32 : VOPC_32 <0x000000c4, "V_CMP_GT_U32", i32, COND_UGT>; -defm V_CMP_NE_U32 : VOPC_32 <0x000000c5, "V_CMP_NE_U32", i32, COND_NE>; -defm V_CMP_GE_U32 : VOPC_32 <0x000000c6, "V_CMP_GE_U32", i32, COND_UGE>; -defm V_CMP_T_U32 : VOPC_32 <0x000000c7, "V_CMP_T_U32">; +defm V_CMP_F_U32 : VOPC_I32 <vopc<0xc0, 0xc8>, "v_cmp_f_u32">; +defm V_CMP_LT_U32 : VOPC_I32 <vopc<0xc1, 0xc9>, "v_cmp_lt_u32", COND_ULT>; +defm V_CMP_EQ_U32 : VOPC_I32 <vopc<0xc2, 0xca>, "v_cmp_eq_u32", COND_EQ>; +defm V_CMP_LE_U32 : VOPC_I32 <vopc<0xc3, 0xcb>, "v_cmp_le_u32", 
COND_ULE>; +defm V_CMP_GT_U32 : VOPC_I32 <vopc<0xc4, 0xcc>, "v_cmp_gt_u32", COND_UGT>; +defm V_CMP_NE_U32 : VOPC_I32 <vopc<0xc5, 0xcd>, "v_cmp_ne_u32", COND_NE>; +defm V_CMP_GE_U32 : VOPC_I32 <vopc<0xc6, 0xce>, "v_cmp_ge_u32", COND_UGE>; +defm V_CMP_T_U32 : VOPC_I32 <vopc<0xc7, 0xcf>, "v_cmp_t_u32">; let hasSideEffects = 1 in { -defm V_CMPX_F_U32 : VOPCX_32 <0x000000d0, "V_CMPX_F_U32">; -defm V_CMPX_LT_U32 : VOPCX_32 <0x000000d1, "V_CMPX_LT_U32">; -defm V_CMPX_EQ_U32 : VOPCX_32 <0x000000d2, "V_CMPX_EQ_U32">; -defm V_CMPX_LE_U32 : VOPCX_32 <0x000000d3, "V_CMPX_LE_U32">; -defm V_CMPX_GT_U32 : VOPCX_32 <0x000000d4, "V_CMPX_GT_U32">; -defm V_CMPX_NE_U32 : VOPCX_32 <0x000000d5, "V_CMPX_NE_U32">; -defm V_CMPX_GE_U32 : VOPCX_32 <0x000000d6, "V_CMPX_GE_U32">; -defm V_CMPX_T_U32 : VOPCX_32 <0x000000d7, "V_CMPX_T_U32">; +defm V_CMPX_F_U32 : VOPCX_I32 <vopc<0xd0, 0xd8>, "v_cmpx_f_u32">; +defm V_CMPX_LT_U32 : VOPCX_I32 <vopc<0xd1, 0xd9>, "v_cmpx_lt_u32">; +defm V_CMPX_EQ_U32 : VOPCX_I32 <vopc<0xd2, 0xda>, "v_cmpx_eq_u32">; +defm V_CMPX_LE_U32 : VOPCX_I32 <vopc<0xd3, 0xdb>, "v_cmpx_le_u32">; +defm V_CMPX_GT_U32 : VOPCX_I32 <vopc<0xd4, 0xdc>, "v_cmpx_gt_u32">; +defm V_CMPX_NE_U32 : VOPCX_I32 <vopc<0xd5, 0xdd>, "v_cmpx_ne_u32">; +defm V_CMPX_GE_U32 : VOPCX_I32 <vopc<0xd6, 0xde>, "v_cmpx_ge_u32">; +defm V_CMPX_T_U32 : VOPCX_I32 <vopc<0xd7, 0xdf>, "v_cmpx_t_u32">; } // End hasSideEffects = 1 -defm V_CMP_F_U64 : VOPC_64 <0x000000e0, "V_CMP_F_U64">; -defm V_CMP_LT_U64 : VOPC_64 <0x000000e1, "V_CMP_LT_U64", i64, COND_ULT>; -defm V_CMP_EQ_U64 : VOPC_64 <0x000000e2, "V_CMP_EQ_U64", i64, COND_EQ>; -defm V_CMP_LE_U64 : VOPC_64 <0x000000e3, "V_CMP_LE_U64", i64, COND_ULE>; -defm V_CMP_GT_U64 : VOPC_64 <0x000000e4, "V_CMP_GT_U64", i64, COND_UGT>; -defm V_CMP_NE_U64 : VOPC_64 <0x000000e5, "V_CMP_NE_U64", i64, COND_NE>; -defm V_CMP_GE_U64 : VOPC_64 <0x000000e6, "V_CMP_GE_U64", i64, COND_UGE>; -defm V_CMP_T_U64 : VOPC_64 <0x000000e7, "V_CMP_T_U64">; +defm V_CMP_F_U64 : VOPC_I64 <vopc<0xe0, 
0xe8>, "v_cmp_f_u64">; +defm V_CMP_LT_U64 : VOPC_I64 <vopc<0xe1, 0xe9>, "v_cmp_lt_u64", COND_ULT>; +defm V_CMP_EQ_U64 : VOPC_I64 <vopc<0xe2, 0xea>, "v_cmp_eq_u64", COND_EQ>; +defm V_CMP_LE_U64 : VOPC_I64 <vopc<0xe3, 0xeb>, "v_cmp_le_u64", COND_ULE>; +defm V_CMP_GT_U64 : VOPC_I64 <vopc<0xe4, 0xec>, "v_cmp_gt_u64", COND_UGT>; +defm V_CMP_NE_U64 : VOPC_I64 <vopc<0xe5, 0xed>, "v_cmp_ne_u64", COND_NE>; +defm V_CMP_GE_U64 : VOPC_I64 <vopc<0xe6, 0xee>, "v_cmp_ge_u64", COND_UGE>; +defm V_CMP_T_U64 : VOPC_I64 <vopc<0xe7, 0xef>, "v_cmp_t_u64">; let hasSideEffects = 1 in { -defm V_CMPX_F_U64 : VOPCX_64 <0x000000f0, "V_CMPX_F_U64">; -defm V_CMPX_LT_U64 : VOPCX_64 <0x000000f1, "V_CMPX_LT_U64">; -defm V_CMPX_EQ_U64 : VOPCX_64 <0x000000f2, "V_CMPX_EQ_U64">; -defm V_CMPX_LE_U64 : VOPCX_64 <0x000000f3, "V_CMPX_LE_U64">; -defm V_CMPX_GT_U64 : VOPCX_64 <0x000000f4, "V_CMPX_GT_U64">; -defm V_CMPX_NE_U64 : VOPCX_64 <0x000000f5, "V_CMPX_NE_U64">; -defm V_CMPX_GE_U64 : VOPCX_64 <0x000000f6, "V_CMPX_GE_U64">; -defm V_CMPX_T_U64 : VOPCX_64 <0x000000f7, "V_CMPX_T_U64">; +defm V_CMPX_F_U64 : VOPCX_I64 <vopc<0xf0, 0xf8>, "v_cmpx_f_u64">; +defm V_CMPX_LT_U64 : VOPCX_I64 <vopc<0xf1, 0xf9>, "v_cmpx_lt_u64">; +defm V_CMPX_EQ_U64 : VOPCX_I64 <vopc<0xf2, 0xfa>, "v_cmpx_eq_u64">; +defm V_CMPX_LE_U64 : VOPCX_I64 <vopc<0xf3, 0xfb>, "v_cmpx_le_u64">; +defm V_CMPX_GT_U64 : VOPCX_I64 <vopc<0xf4, 0xfc>, "v_cmpx_gt_u64">; +defm V_CMPX_NE_U64 : VOPCX_I64 <vopc<0xf5, 0xfd>, "v_cmpx_ne_u64">; +defm V_CMPX_GE_U64 : VOPCX_I64 <vopc<0xf6, 0xfe>, "v_cmpx_ge_u64">; +defm V_CMPX_T_U64 : VOPCX_I64 <vopc<0xf7, 0xff>, "v_cmpx_t_u64">; } // End hasSideEffects = 1 -defm V_CMP_CLASS_F32 : VOPC_32 <0x00000088, "V_CMP_CLASS_F32">; +defm V_CMP_CLASS_F32 : VOPC_CLASS_F32 <vopc<0x88, 0x10>, "v_cmp_class_f32">; let hasSideEffects = 1 in { -defm V_CMPX_CLASS_F32 : VOPCX_32 <0x00000098, "V_CMPX_CLASS_F32">; +defm V_CMPX_CLASS_F32 : VOPCX_CLASS_F32 <vopc<0x98, 0x11>, "v_cmpx_class_f32">; } // End hasSideEffects = 1 -defm 
V_CMP_CLASS_F64 : VOPC_64 <0x000000a8, "V_CMP_CLASS_F64">; +defm V_CMP_CLASS_F64 : VOPC_CLASS_F64 <vopc<0xa8, 0x12>, "v_cmp_class_f64">; let hasSideEffects = 1 in { -defm V_CMPX_CLASS_F64 : VOPCX_64 <0x000000b8, "V_CMPX_CLASS_F64">; +defm V_CMPX_CLASS_F64 : VOPCX_CLASS_F64 <vopc<0xb8, 0x13>, "v_cmpx_class_f64">; } // End hasSideEffects = 1 } // End isCompare = 1 @@ -726,88 +764,88 @@ defm V_CMPX_CLASS_F64 : VOPCX_64 <0x000000b8, "V_CMPX_CLASS_F64">; //===----------------------------------------------------------------------===// -def DS_ADD_U32 : DS_1A1D_NORET <0x0, "DS_ADD_U32", VReg_32>; -def DS_SUB_U32 : DS_1A1D_NORET <0x1, "DS_SUB_U32", VReg_32>; -def DS_RSUB_U32 : DS_1A1D_NORET <0x2, "DS_RSUB_U32", VReg_32>; -def DS_INC_U32 : DS_1A1D_NORET <0x3, "DS_INC_U32", VReg_32>; -def DS_DEC_U32 : DS_1A1D_NORET <0x4, "DS_DEC_U32", VReg_32>; -def DS_MIN_I32 : DS_1A1D_NORET <0x5, "DS_MIN_I32", VReg_32>; -def DS_MAX_I32 : DS_1A1D_NORET <0x6, "DS_MAX_I32", VReg_32>; -def DS_MIN_U32 : DS_1A1D_NORET <0x7, "DS_MIN_U32", VReg_32>; -def DS_MAX_U32 : DS_1A1D_NORET <0x8, "DS_MAX_U32", VReg_32>; -def DS_AND_B32 : DS_1A1D_NORET <0x9, "DS_AND_B32", VReg_32>; -def DS_OR_B32 : DS_1A1D_NORET <0xa, "DS_OR_B32", VReg_32>; -def DS_XOR_B32 : DS_1A1D_NORET <0xb, "DS_XOR_B32", VReg_32>; -def DS_MSKOR_B32 : DS_1A1D_NORET <0xc, "DS_MSKOR_B32", VReg_32>; -def DS_CMPST_B32 : DS_1A2D_NORET <0x10, "DS_CMPST_B32", VReg_32>; -def DS_CMPST_F32 : DS_1A2D_NORET <0x11, "DS_CMPST_F32", VReg_32>; -def DS_MIN_F32 : DS_1A1D_NORET <0x12, "DS_MIN_F32", VReg_32>; -def DS_MAX_F32 : DS_1A1D_NORET <0x13, "DS_MAX_F32", VReg_32>; - -def DS_ADD_RTN_U32 : DS_1A1D_RET <0x20, "DS_ADD_RTN_U32", VReg_32>; -def DS_SUB_RTN_U32 : DS_1A1D_RET <0x21, "DS_SUB_RTN_U32", VReg_32>; -def DS_RSUB_RTN_U32 : DS_1A1D_RET <0x22, "DS_RSUB_RTN_U32", VReg_32>; -def DS_INC_RTN_U32 : DS_1A1D_RET <0x23, "DS_INC_RTN_U32", VReg_32>; -def DS_DEC_RTN_U32 : DS_1A1D_RET <0x24, "DS_DEC_RTN_U32", VReg_32>; -def DS_MIN_RTN_I32 : DS_1A1D_RET <0x25, 
"DS_MIN_RTN_I32", VReg_32>; -def DS_MAX_RTN_I32 : DS_1A1D_RET <0x26, "DS_MAX_RTN_I32", VReg_32>; -def DS_MIN_RTN_U32 : DS_1A1D_RET <0x27, "DS_MIN_RTN_U32", VReg_32>; -def DS_MAX_RTN_U32 : DS_1A1D_RET <0x28, "DS_MAX_RTN_U32", VReg_32>; -def DS_AND_RTN_B32 : DS_1A1D_RET <0x29, "DS_AND_RTN_B32", VReg_32>; -def DS_OR_RTN_B32 : DS_1A1D_RET <0x2a, "DS_OR_RTN_B32", VReg_32>; -def DS_XOR_RTN_B32 : DS_1A1D_RET <0x2b, "DS_XOR_RTN_B32", VReg_32>; -def DS_MSKOR_RTN_B32 : DS_1A1D_RET <0x2c, "DS_MSKOR_RTN_B32", VReg_32>; -def DS_WRXCHG_RTN_B32 : DS_1A1D_RET <0x2d, "DS_WRXCHG_RTN_B32", VReg_32>; -//def DS_WRXCHG2_RTN_B32 : DS_2A0D_RET <0x2e, "DS_WRXCHG2_RTN_B32", VReg_32>; -//def DS_WRXCHG2ST64_RTN_B32 : DS_2A0D_RET <0x2f, "DS_WRXCHG2_RTN_B32", VReg_32>; -def DS_CMPST_RTN_B32 : DS_1A2D_RET <0x30, "DS_CMPST_RTN_B32", VReg_32>; -def DS_CMPST_RTN_F32 : DS_1A2D_RET <0x31, "DS_CMPST_RTN_F32", VReg_32>; -def DS_MIN_RTN_F32 : DS_1A1D_RET <0x32, "DS_MIN_RTN_F32", VReg_32>; -def DS_MAX_RTN_F32 : DS_1A1D_RET <0x33, "DS_MAX_RTN_F32", VReg_32>; +def DS_ADD_U32 : DS_1A1D_NORET <0x0, "ds_add_u32", VGPR_32>; +def DS_SUB_U32 : DS_1A1D_NORET <0x1, "ds_sub_u32", VGPR_32>; +def DS_RSUB_U32 : DS_1A1D_NORET <0x2, "ds_rsub_u32", VGPR_32>; +def DS_INC_U32 : DS_1A1D_NORET <0x3, "ds_inc_u32", VGPR_32>; +def DS_DEC_U32 : DS_1A1D_NORET <0x4, "ds_dec_u32", VGPR_32>; +def DS_MIN_I32 : DS_1A1D_NORET <0x5, "ds_min_i32", VGPR_32>; +def DS_MAX_I32 : DS_1A1D_NORET <0x6, "ds_max_i32", VGPR_32>; +def DS_MIN_U32 : DS_1A1D_NORET <0x7, "ds_min_u32", VGPR_32>; +def DS_MAX_U32 : DS_1A1D_NORET <0x8, "ds_max_u32", VGPR_32>; +def DS_AND_B32 : DS_1A1D_NORET <0x9, "ds_and_b32", VGPR_32>; +def DS_OR_B32 : DS_1A1D_NORET <0xa, "ds_or_b32", VGPR_32>; +def DS_XOR_B32 : DS_1A1D_NORET <0xb, "ds_xor_b32", VGPR_32>; +def DS_MSKOR_B32 : DS_1A1D_NORET <0xc, "ds_mskor_b32", VGPR_32>; +def DS_CMPST_B32 : DS_1A2D_NORET <0x10, "ds_cmpst_b32", VGPR_32>; +def DS_CMPST_F32 : DS_1A2D_NORET <0x11, "ds_cmpst_f32", VGPR_32>; +def DS_MIN_F32 : 
DS_1A1D_NORET <0x12, "ds_min_f32", VGPR_32>; +def DS_MAX_F32 : DS_1A1D_NORET <0x13, "ds_max_f32", VGPR_32>; + +def DS_ADD_RTN_U32 : DS_1A1D_RET <0x20, "ds_add_rtn_u32", VGPR_32, "ds_add_u32">; +def DS_SUB_RTN_U32 : DS_1A1D_RET <0x21, "ds_sub_rtn_u32", VGPR_32, "ds_sub_u32">; +def DS_RSUB_RTN_U32 : DS_1A1D_RET <0x22, "ds_rsub_rtn_u32", VGPR_32, "ds_rsub_u32">; +def DS_INC_RTN_U32 : DS_1A1D_RET <0x23, "ds_inc_rtn_u32", VGPR_32, "ds_inc_u32">; +def DS_DEC_RTN_U32 : DS_1A1D_RET <0x24, "ds_dec_rtn_u32", VGPR_32, "ds_dec_u32">; +def DS_MIN_RTN_I32 : DS_1A1D_RET <0x25, "ds_min_rtn_i32", VGPR_32, "ds_min_i32">; +def DS_MAX_RTN_I32 : DS_1A1D_RET <0x26, "ds_max_rtn_i32", VGPR_32, "ds_max_i32">; +def DS_MIN_RTN_U32 : DS_1A1D_RET <0x27, "ds_min_rtn_u32", VGPR_32, "ds_min_u32">; +def DS_MAX_RTN_U32 : DS_1A1D_RET <0x28, "ds_max_rtn_u32", VGPR_32, "ds_max_u32">; +def DS_AND_RTN_B32 : DS_1A1D_RET <0x29, "ds_and_rtn_b32", VGPR_32, "ds_and_b32">; +def DS_OR_RTN_B32 : DS_1A1D_RET <0x2a, "ds_or_rtn_b32", VGPR_32, "ds_or_b32">; +def DS_XOR_RTN_B32 : DS_1A1D_RET <0x2b, "ds_xor_rtn_b32", VGPR_32, "ds_xor_b32">; +def DS_MSKOR_RTN_B32 : DS_1A1D_RET <0x2c, "ds_mskor_rtn_b32", VGPR_32, "ds_mskor_b32">; +def DS_WRXCHG_RTN_B32 : DS_1A1D_RET <0x2d, "ds_wrxchg_rtn_b32", VGPR_32>; +//def DS_WRXCHG2_RTN_B32 : DS_2A0D_RET <0x2e, "ds_wrxchg2_rtn_b32", VGPR_32, "ds_wrxchg2_b32">; +//def DS_WRXCHG2ST64_RTN_B32 : DS_2A0D_RET <0x2f, "ds_wrxchg2_rtn_b32", VGPR_32, "ds_wrxchg2st64_b32">; +def DS_CMPST_RTN_B32 : DS_1A2D_RET <0x30, "ds_cmpst_rtn_b32", VGPR_32, "ds_cmpst_b32">; +def DS_CMPST_RTN_F32 : DS_1A2D_RET <0x31, "ds_cmpst_rtn_f32", VGPR_32, "ds_cmpst_f32">; +def DS_MIN_RTN_F32 : DS_1A1D_RET <0x32, "ds_min_rtn_f32", VGPR_32, "ds_min_f32">; +def DS_MAX_RTN_F32 : DS_1A1D_RET <0x33, "ds_max_rtn_f32", VGPR_32, "ds_max_f32">; let SubtargetPredicate = isCI in { -def DS_WRAP_RTN_F32 : DS_1A1D_RET <0x34, "DS_WRAP_RTN_F32", VReg_32>; +def DS_WRAP_RTN_F32 : DS_1A1D_RET <0x34, "ds_wrap_rtn_f32", VGPR_32, 
"ds_wrap_f32">; } // End isCI -def DS_ADD_U64 : DS_1A1D_NORET <0x40, "DS_ADD_U64", VReg_32>; -def DS_SUB_U64 : DS_1A1D_NORET <0x41, "DS_SUB_U64", VReg_32>; -def DS_RSUB_U64 : DS_1A1D_NORET <0x42, "DS_RSUB_U64", VReg_32>; -def DS_INC_U64 : DS_1A1D_NORET <0x43, "DS_INC_U64", VReg_32>; -def DS_DEC_U64 : DS_1A1D_NORET <0x44, "DS_DEC_U64", VReg_32>; -def DS_MIN_I64 : DS_1A1D_NORET <0x45, "DS_MIN_I64", VReg_64>; -def DS_MAX_I64 : DS_1A1D_NORET <0x46, "DS_MAX_I64", VReg_64>; -def DS_MIN_U64 : DS_1A1D_NORET <0x47, "DS_MIN_U64", VReg_64>; -def DS_MAX_U64 : DS_1A1D_NORET <0x48, "DS_MAX_U64", VReg_64>; -def DS_AND_B64 : DS_1A1D_NORET <0x49, "DS_AND_B64", VReg_64>; -def DS_OR_B64 : DS_1A1D_NORET <0x4a, "DS_OR_B64", VReg_64>; -def DS_XOR_B64 : DS_1A1D_NORET <0x4b, "DS_XOR_B64", VReg_64>; -def DS_MSKOR_B64 : DS_1A1D_NORET <0x4c, "DS_MSKOR_B64", VReg_64>; -def DS_CMPST_B64 : DS_1A2D_NORET <0x50, "DS_CMPST_B64", VReg_64>; -def DS_CMPST_F64 : DS_1A2D_NORET <0x51, "DS_CMPST_F64", VReg_64>; -def DS_MIN_F64 : DS_1A1D_NORET <0x52, "DS_MIN_F64", VReg_64>; -def DS_MAX_F64 : DS_1A1D_NORET <0x53, "DS_MAX_F64", VReg_64>; - -def DS_ADD_RTN_U64 : DS_1A1D_RET <0x60, "DS_ADD_RTN_U64", VReg_64>; -def DS_SUB_RTN_U64 : DS_1A1D_RET <0x61, "DS_SUB_RTN_U64", VReg_64>; -def DS_RSUB_RTN_U64 : DS_1A1D_RET <0x62, "DS_RSUB_RTN_U64", VReg_64>; -def DS_INC_RTN_U64 : DS_1A1D_RET <0x63, "DS_INC_RTN_U64", VReg_64>; -def DS_DEC_RTN_U64 : DS_1A1D_RET <0x64, "DS_DEC_RTN_U64", VReg_64>; -def DS_MIN_RTN_I64 : DS_1A1D_RET <0x65, "DS_MIN_RTN_I64", VReg_64>; -def DS_MAX_RTN_I64 : DS_1A1D_RET <0x66, "DS_MAX_RTN_I64", VReg_64>; -def DS_MIN_RTN_U64 : DS_1A1D_RET <0x67, "DS_MIN_RTN_U64", VReg_64>; -def DS_MAX_RTN_U64 : DS_1A1D_RET <0x68, "DS_MAX_RTN_U64", VReg_64>; -def DS_AND_RTN_B64 : DS_1A1D_RET <0x69, "DS_AND_RTN_B64", VReg_64>; -def DS_OR_RTN_B64 : DS_1A1D_RET <0x6a, "DS_OR_RTN_B64", VReg_64>; -def DS_XOR_RTN_B64 : DS_1A1D_RET <0x6b, "DS_XOR_RTN_B64", VReg_64>; -def DS_MSKOR_RTN_B64 : DS_1A1D_RET <0x6c, 
"DS_MSKOR_RTN_B64", VReg_64>; -def DS_WRXCHG_RTN_B64 : DS_1A1D_RET <0x6d, "DS_WRXCHG_RTN_B64", VReg_64>; -//def DS_WRXCHG2_RTN_B64 : DS_2A0D_RET <0x6e, "DS_WRXCHG2_RTN_B64", VReg_64>; -//def DS_WRXCHG2ST64_RTN_B64 : DS_2A0D_RET <0x6f, "DS_WRXCHG2_RTN_B64", VReg_64>; -def DS_CMPST_RTN_B64 : DS_1A2D_RET <0x70, "DS_CMPST_RTN_B64", VReg_64>; -def DS_CMPST_RTN_F64 : DS_1A2D_RET <0x71, "DS_CMPST_RTN_F64", VReg_64>; -def DS_MIN_RTN_F64 : DS_1A1D_RET <0x72, "DS_MIN_F64", VReg_64>; -def DS_MAX_RTN_F64 : DS_1A1D_RET <0x73, "DS_MAX_F64", VReg_64>; +def DS_ADD_U64 : DS_1A1D_NORET <0x40, "ds_add_u64", VReg_64>; +def DS_SUB_U64 : DS_1A1D_NORET <0x41, "ds_sub_u64", VReg_64>; +def DS_RSUB_U64 : DS_1A1D_NORET <0x42, "ds_rsub_u64", VReg_64>; +def DS_INC_U64 : DS_1A1D_NORET <0x43, "ds_inc_u64", VReg_64>; +def DS_DEC_U64 : DS_1A1D_NORET <0x44, "ds_dec_u64", VReg_64>; +def DS_MIN_I64 : DS_1A1D_NORET <0x45, "ds_min_i64", VReg_64>; +def DS_MAX_I64 : DS_1A1D_NORET <0x46, "ds_max_i64", VReg_64>; +def DS_MIN_U64 : DS_1A1D_NORET <0x47, "ds_min_u64", VReg_64>; +def DS_MAX_U64 : DS_1A1D_NORET <0x48, "ds_max_u64", VReg_64>; +def DS_AND_B64 : DS_1A1D_NORET <0x49, "ds_and_b64", VReg_64>; +def DS_OR_B64 : DS_1A1D_NORET <0x4a, "ds_or_b64", VReg_64>; +def DS_XOR_B64 : DS_1A1D_NORET <0x4b, "ds_xor_b64", VReg_64>; +def DS_MSKOR_B64 : DS_1A1D_NORET <0x4c, "ds_mskor_b64", VReg_64>; +def DS_CMPST_B64 : DS_1A2D_NORET <0x50, "ds_cmpst_b64", VReg_64>; +def DS_CMPST_F64 : DS_1A2D_NORET <0x51, "ds_cmpst_f64", VReg_64>; +def DS_MIN_F64 : DS_1A1D_NORET <0x52, "ds_min_f64", VReg_64>; +def DS_MAX_F64 : DS_1A1D_NORET <0x53, "ds_max_f64", VReg_64>; + +def DS_ADD_RTN_U64 : DS_1A1D_RET <0x60, "ds_add_rtn_u64", VReg_64, "ds_add_u64">; +def DS_SUB_RTN_U64 : DS_1A1D_RET <0x61, "ds_sub_rtn_u64", VReg_64, "ds_sub_u64">; +def DS_RSUB_RTN_U64 : DS_1A1D_RET <0x62, "ds_rsub_rtn_u64", VReg_64, "ds_rsub_u64">; +def DS_INC_RTN_U64 : DS_1A1D_RET <0x63, "ds_inc_rtn_u64", VReg_64, "ds_inc_u64">; +def DS_DEC_RTN_U64 : DS_1A1D_RET 
<0x64, "ds_dec_rtn_u64", VReg_64, "ds_dec_u64">; +def DS_MIN_RTN_I64 : DS_1A1D_RET <0x65, "ds_min_rtn_i64", VReg_64, "ds_min_i64">; +def DS_MAX_RTN_I64 : DS_1A1D_RET <0x66, "ds_max_rtn_i64", VReg_64, "ds_max_i64">; +def DS_MIN_RTN_U64 : DS_1A1D_RET <0x67, "ds_min_rtn_u64", VReg_64, "ds_min_u64">; +def DS_MAX_RTN_U64 : DS_1A1D_RET <0x68, "ds_max_rtn_u64", VReg_64, "ds_max_u64">; +def DS_AND_RTN_B64 : DS_1A1D_RET <0x69, "ds_and_rtn_b64", VReg_64, "ds_and_b64">; +def DS_OR_RTN_B64 : DS_1A1D_RET <0x6a, "ds_or_rtn_b64", VReg_64, "ds_or_b64">; +def DS_XOR_RTN_B64 : DS_1A1D_RET <0x6b, "ds_xor_rtn_b64", VReg_64, "ds_xor_b64">; +def DS_MSKOR_RTN_B64 : DS_1A1D_RET <0x6c, "ds_mskor_rtn_b64", VReg_64, "ds_mskor_b64">; +def DS_WRXCHG_RTN_B64 : DS_1A1D_RET <0x6d, "ds_wrxchg_rtn_b64", VReg_64, "ds_wrxchg_b64">; +//def DS_WRXCHG2_RTN_B64 : DS_2A0D_RET <0x6e, "ds_wrxchg2_rtn_b64", VReg_64, "ds_wrxchg2_b64">; +//def DS_WRXCHG2ST64_RTN_B64 : DS_2A0D_RET <0x6f, "ds_wrxchg2_rtn_b64", VReg_64, "ds_wrxchg2st64_b64">; +def DS_CMPST_RTN_B64 : DS_1A2D_RET <0x70, "ds_cmpst_rtn_b64", VReg_64, "ds_cmpst_b64">; +def DS_CMPST_RTN_F64 : DS_1A2D_RET <0x71, "ds_cmpst_rtn_f64", VReg_64, "ds_cmpst_f64">; +def DS_MIN_RTN_F64 : DS_1A1D_RET <0x72, "ds_min_f64", VReg_64, "ds_min_f64">; +def DS_MAX_RTN_F64 : DS_1A1D_RET <0x73, "ds_max_f64", VReg_64, "ds_max_f64">; //let SubtargetPredicate = isCI in { // DS_CONDXCHG32_RTN_B64 @@ -816,719 +854,951 @@ def DS_MAX_RTN_F64 : DS_1A1D_RET <0x73, "DS_MAX_F64", VReg_64>; // TODO: _SRC2_* forms -def DS_WRITE_B32 : DS_Store_Helper <0x0000000d, "DS_WRITE_B32", VReg_32>; -def DS_WRITE_B8 : DS_Store_Helper <0x00000001e, "DS_WRITE_B8", VReg_32>; -def DS_WRITE_B16 : DS_Store_Helper <0x00000001f, "DS_WRITE_B16", VReg_32>; -def DS_WRITE_B64 : DS_Store_Helper <0x00000004d, "DS_WRITE_B64", VReg_64>; +defm DS_WRITE_B32 : DS_Store_Helper <0x0000000d, "ds_write_b32", VGPR_32>; +defm DS_WRITE_B8 : DS_Store_Helper <0x00000001e, "ds_write_b8", VGPR_32>; +defm DS_WRITE_B16 : 
DS_Store_Helper <0x00000001f, "ds_write_b16", VGPR_32>; +defm DS_WRITE_B64 : DS_Store_Helper <0x00000004d, "ds_write_b64", VReg_64>; -def DS_READ_B32 : DS_Load_Helper <0x00000036, "DS_READ_B32", VReg_32>; -def DS_READ_I8 : DS_Load_Helper <0x00000039, "DS_READ_I8", VReg_32>; -def DS_READ_U8 : DS_Load_Helper <0x0000003a, "DS_READ_U8", VReg_32>; -def DS_READ_I16 : DS_Load_Helper <0x0000003b, "DS_READ_I16", VReg_32>; -def DS_READ_U16 : DS_Load_Helper <0x0000003c, "DS_READ_U16", VReg_32>; -def DS_READ_B64 : DS_Load_Helper <0x00000076, "DS_READ_B64", VReg_64>; +defm DS_READ_B32 : DS_Load_Helper <0x00000036, "ds_read_b32", VGPR_32>; +defm DS_READ_I8 : DS_Load_Helper <0x00000039, "ds_read_i8", VGPR_32>; +defm DS_READ_U8 : DS_Load_Helper <0x0000003a, "ds_read_u8", VGPR_32>; +defm DS_READ_I16 : DS_Load_Helper <0x0000003b, "ds_read_i16", VGPR_32>; +defm DS_READ_U16 : DS_Load_Helper <0x0000003c, "ds_read_u16", VGPR_32>; +defm DS_READ_B64 : DS_Load_Helper <0x00000076, "ds_read_b64", VReg_64>; // 2 forms. 
-def DS_WRITE2_B32 : DS_Load2_Helper <0x0000000E, "DS_WRITE2_B32", VReg_64>; -def DS_WRITE2_B64 : DS_Load2_Helper <0x0000004E, "DS_WRITE2_B64", VReg_128>; +defm DS_WRITE2_B32 : DS_Store2_Helper <0x0000000E, "ds_write2_b32", VGPR_32>; +defm DS_WRITE2ST64_B32 : DS_Store2_Helper <0x0000000F, "ds_write2st64_b32", VGPR_32>; +defm DS_WRITE2_B64 : DS_Store2_Helper <0x0000004E, "ds_write2_b64", VReg_64>; +defm DS_WRITE2ST64_B64 : DS_Store2_Helper <0x0000004F, "ds_write2st64_b64", VReg_64>; -def DS_READ2_B32 : DS_Load2_Helper <0x00000037, "DS_READ2_B32", VReg_64>; -def DS_READ2_B64 : DS_Load2_Helper <0x00000075, "DS_READ2_B64", VReg_128>; - -// TODO: DS_READ2ST64_B32, DS_READ2ST64_B64, -// DS_WRITE2ST64_B32, DS_WRITE2ST64_B64 +defm DS_READ2_B32 : DS_Load2_Helper <0x00000037, "ds_read2_b32", VReg_64>; +defm DS_READ2ST64_B32 : DS_Load2_Helper <0x00000038, "ds_read2st64_b32", VReg_64>; +defm DS_READ2_B64 : DS_Load2_Helper <0x00000075, "ds_read2_b64", VReg_128>; +defm DS_READ2ST64_B64 : DS_Load2_Helper <0x00000076, "ds_read2st64_b64", VReg_128>; //===----------------------------------------------------------------------===// // MUBUF Instructions //===----------------------------------------------------------------------===// -//def BUFFER_LOAD_FORMAT_X : MUBUF_ <0x00000000, "BUFFER_LOAD_FORMAT_X", []>; -//def BUFFER_LOAD_FORMAT_XY : MUBUF_ <0x00000001, "BUFFER_LOAD_FORMAT_XY", []>; -//def BUFFER_LOAD_FORMAT_XYZ : MUBUF_ <0x00000002, "BUFFER_LOAD_FORMAT_XYZ", []>; -defm BUFFER_LOAD_FORMAT_XYZW : MUBUF_Load_Helper <0x00000003, "BUFFER_LOAD_FORMAT_XYZW", VReg_128>; -//def BUFFER_STORE_FORMAT_X : MUBUF_ <0x00000004, "BUFFER_STORE_FORMAT_X", []>; -//def BUFFER_STORE_FORMAT_XY : MUBUF_ <0x00000005, "BUFFER_STORE_FORMAT_XY", []>; -//def BUFFER_STORE_FORMAT_XYZ : MUBUF_ <0x00000006, "BUFFER_STORE_FORMAT_XYZ", []>; -//def BUFFER_STORE_FORMAT_XYZW : MUBUF_ <0x00000007, "BUFFER_STORE_FORMAT_XYZW", []>; +let SubtargetPredicate = isSICI in { + +//def BUFFER_LOAD_FORMAT_X : MUBUF_ 
<0x00000000, "buffer_load_format_x", []>; +//def BUFFER_LOAD_FORMAT_XY : MUBUF_ <0x00000001, "buffer_load_format_xy", []>; +//def BUFFER_LOAD_FORMAT_XYZ : MUBUF_ <0x00000002, "buffer_load_format_xyz", []>; +defm BUFFER_LOAD_FORMAT_XYZW : MUBUF_Load_Helper <0x00000003, "buffer_load_format_xyzw", VReg_128>; +//def BUFFER_STORE_FORMAT_X : MUBUF_ <0x00000004, "buffer_store_format_x", []>; +//def BUFFER_STORE_FORMAT_XY : MUBUF_ <0x00000005, "buffer_store_format_xy", []>; +//def BUFFER_STORE_FORMAT_XYZ : MUBUF_ <0x00000006, "buffer_store_format_xyz", []>; +//def BUFFER_STORE_FORMAT_XYZW : MUBUF_ <0x00000007, "buffer_store_format_xyzw", []>; defm BUFFER_LOAD_UBYTE : MUBUF_Load_Helper < - 0x00000008, "BUFFER_LOAD_UBYTE", VReg_32, i32, az_extloadi8_global + 0x00000008, "buffer_load_ubyte", VGPR_32, i32, az_extloadi8_global >; defm BUFFER_LOAD_SBYTE : MUBUF_Load_Helper < - 0x00000009, "BUFFER_LOAD_SBYTE", VReg_32, i32, sextloadi8_global + 0x00000009, "buffer_load_sbyte", VGPR_32, i32, sextloadi8_global >; defm BUFFER_LOAD_USHORT : MUBUF_Load_Helper < - 0x0000000a, "BUFFER_LOAD_USHORT", VReg_32, i32, az_extloadi16_global + 0x0000000a, "buffer_load_ushort", VGPR_32, i32, az_extloadi16_global >; defm BUFFER_LOAD_SSHORT : MUBUF_Load_Helper < - 0x0000000b, "BUFFER_LOAD_SSHORT", VReg_32, i32, sextloadi16_global + 0x0000000b, "buffer_load_sshort", VGPR_32, i32, sextloadi16_global >; defm BUFFER_LOAD_DWORD : MUBUF_Load_Helper < - 0x0000000c, "BUFFER_LOAD_DWORD", VReg_32, i32, global_load + 0x0000000c, "buffer_load_dword", VGPR_32, i32, global_load >; defm BUFFER_LOAD_DWORDX2 : MUBUF_Load_Helper < - 0x0000000d, "BUFFER_LOAD_DWORDX2", VReg_64, v2i32, global_load + 0x0000000d, "buffer_load_dwordx2", VReg_64, v2i32, global_load >; defm BUFFER_LOAD_DWORDX4 : MUBUF_Load_Helper < - 0x0000000e, "BUFFER_LOAD_DWORDX4", VReg_128, v4i32, global_load + 0x0000000e, "buffer_load_dwordx4", VReg_128, v4i32, global_load >; defm BUFFER_STORE_BYTE : MUBUF_Store_Helper < - 0x00000018, 
"BUFFER_STORE_BYTE", VReg_32, i32, truncstorei8_global + 0x00000018, "buffer_store_byte", VGPR_32, i32, truncstorei8_global >; defm BUFFER_STORE_SHORT : MUBUF_Store_Helper < - 0x0000001a, "BUFFER_STORE_SHORT", VReg_32, i32, truncstorei16_global + 0x0000001a, "buffer_store_short", VGPR_32, i32, truncstorei16_global >; defm BUFFER_STORE_DWORD : MUBUF_Store_Helper < - 0x0000001c, "BUFFER_STORE_DWORD", VReg_32, i32, global_store + 0x0000001c, "buffer_store_dword", VGPR_32, i32, global_store >; defm BUFFER_STORE_DWORDX2 : MUBUF_Store_Helper < - 0x0000001d, "BUFFER_STORE_DWORDX2", VReg_64, v2i32, global_store + 0x0000001d, "buffer_store_dwordx2", VReg_64, v2i32, global_store >; defm BUFFER_STORE_DWORDX4 : MUBUF_Store_Helper < - 0x0000001e, "BUFFER_STORE_DWORDX4", VReg_128, v4i32, global_store ->; -//def BUFFER_ATOMIC_SWAP : MUBUF_ <0x00000030, "BUFFER_ATOMIC_SWAP", []>; -//def BUFFER_ATOMIC_CMPSWAP : MUBUF_ <0x00000031, "BUFFER_ATOMIC_CMPSWAP", []>; -//def BUFFER_ATOMIC_ADD : MUBUF_ <0x00000032, "BUFFER_ATOMIC_ADD", []>; -//def BUFFER_ATOMIC_SUB : MUBUF_ <0x00000033, "BUFFER_ATOMIC_SUB", []>; -//def BUFFER_ATOMIC_RSUB : MUBUF_ <0x00000034, "BUFFER_ATOMIC_RSUB", []>; -//def BUFFER_ATOMIC_SMIN : MUBUF_ <0x00000035, "BUFFER_ATOMIC_SMIN", []>; -//def BUFFER_ATOMIC_UMIN : MUBUF_ <0x00000036, "BUFFER_ATOMIC_UMIN", []>; -//def BUFFER_ATOMIC_SMAX : MUBUF_ <0x00000037, "BUFFER_ATOMIC_SMAX", []>; -//def BUFFER_ATOMIC_UMAX : MUBUF_ <0x00000038, "BUFFER_ATOMIC_UMAX", []>; -//def BUFFER_ATOMIC_AND : MUBUF_ <0x00000039, "BUFFER_ATOMIC_AND", []>; -//def BUFFER_ATOMIC_OR : MUBUF_ <0x0000003a, "BUFFER_ATOMIC_OR", []>; -//def BUFFER_ATOMIC_XOR : MUBUF_ <0x0000003b, "BUFFER_ATOMIC_XOR", []>; -//def BUFFER_ATOMIC_INC : MUBUF_ <0x0000003c, "BUFFER_ATOMIC_INC", []>; -//def BUFFER_ATOMIC_DEC : MUBUF_ <0x0000003d, "BUFFER_ATOMIC_DEC", []>; -//def BUFFER_ATOMIC_FCMPSWAP : MUBUF_ <0x0000003e, "BUFFER_ATOMIC_FCMPSWAP", []>; -//def BUFFER_ATOMIC_FMIN : MUBUF_ <0x0000003f, "BUFFER_ATOMIC_FMIN", []>; 
-//def BUFFER_ATOMIC_FMAX : MUBUF_ <0x00000040, "BUFFER_ATOMIC_FMAX", []>; -//def BUFFER_ATOMIC_SWAP_X2 : MUBUF_X2 <0x00000050, "BUFFER_ATOMIC_SWAP_X2", []>; -//def BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_X2 <0x00000051, "BUFFER_ATOMIC_CMPSWAP_X2", []>; -//def BUFFER_ATOMIC_ADD_X2 : MUBUF_X2 <0x00000052, "BUFFER_ATOMIC_ADD_X2", []>; -//def BUFFER_ATOMIC_SUB_X2 : MUBUF_X2 <0x00000053, "BUFFER_ATOMIC_SUB_X2", []>; -//def BUFFER_ATOMIC_RSUB_X2 : MUBUF_X2 <0x00000054, "BUFFER_ATOMIC_RSUB_X2", []>; -//def BUFFER_ATOMIC_SMIN_X2 : MUBUF_X2 <0x00000055, "BUFFER_ATOMIC_SMIN_X2", []>; -//def BUFFER_ATOMIC_UMIN_X2 : MUBUF_X2 <0x00000056, "BUFFER_ATOMIC_UMIN_X2", []>; -//def BUFFER_ATOMIC_SMAX_X2 : MUBUF_X2 <0x00000057, "BUFFER_ATOMIC_SMAX_X2", []>; -//def BUFFER_ATOMIC_UMAX_X2 : MUBUF_X2 <0x00000058, "BUFFER_ATOMIC_UMAX_X2", []>; -//def BUFFER_ATOMIC_AND_X2 : MUBUF_X2 <0x00000059, "BUFFER_ATOMIC_AND_X2", []>; -//def BUFFER_ATOMIC_OR_X2 : MUBUF_X2 <0x0000005a, "BUFFER_ATOMIC_OR_X2", []>; -//def BUFFER_ATOMIC_XOR_X2 : MUBUF_X2 <0x0000005b, "BUFFER_ATOMIC_XOR_X2", []>; -//def BUFFER_ATOMIC_INC_X2 : MUBUF_X2 <0x0000005c, "BUFFER_ATOMIC_INC_X2", []>; -//def BUFFER_ATOMIC_DEC_X2 : MUBUF_X2 <0x0000005d, "BUFFER_ATOMIC_DEC_X2", []>; -//def BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_X2 <0x0000005e, "BUFFER_ATOMIC_FCMPSWAP_X2", []>; -//def BUFFER_ATOMIC_FMIN_X2 : MUBUF_X2 <0x0000005f, "BUFFER_ATOMIC_FMIN_X2", []>; -//def BUFFER_ATOMIC_FMAX_X2 : MUBUF_X2 <0x00000060, "BUFFER_ATOMIC_FMAX_X2", []>; -//def BUFFER_WBINVL1_SC : MUBUF_WBINVL1 <0x00000070, "BUFFER_WBINVL1_SC", []>; -//def BUFFER_WBINVL1 : MUBUF_WBINVL1 <0x00000071, "BUFFER_WBINVL1", []>; + 0x0000001e, "buffer_store_dwordx4", VReg_128, v4i32, global_store +>; +//def BUFFER_ATOMIC_SWAP : MUBUF_ <0x00000030, "buffer_atomic_swap", []>; +defm BUFFER_ATOMIC_SWAP : MUBUF_Atomic < + 0x00000030, "buffer_atomic_swap", VGPR_32, i32, atomic_swap_global +>; +//def BUFFER_ATOMIC_CMPSWAP : MUBUF_ <0x00000031, "buffer_atomic_cmpswap", []>; +defm 
BUFFER_ATOMIC_ADD : MUBUF_Atomic < + 0x00000032, "buffer_atomic_add", VGPR_32, i32, atomic_add_global +>; +defm BUFFER_ATOMIC_SUB : MUBUF_Atomic < + 0x00000033, "buffer_atomic_sub", VGPR_32, i32, atomic_sub_global +>; +//def BUFFER_ATOMIC_RSUB : MUBUF_ <0x00000034, "buffer_atomic_rsub", []>; +defm BUFFER_ATOMIC_SMIN : MUBUF_Atomic < + 0x00000035, "buffer_atomic_smin", VGPR_32, i32, atomic_min_global +>; +defm BUFFER_ATOMIC_UMIN : MUBUF_Atomic < + 0x00000036, "buffer_atomic_umin", VGPR_32, i32, atomic_umin_global +>; +defm BUFFER_ATOMIC_SMAX : MUBUF_Atomic < + 0x00000037, "buffer_atomic_smax", VGPR_32, i32, atomic_max_global +>; +defm BUFFER_ATOMIC_UMAX : MUBUF_Atomic < + 0x00000038, "buffer_atomic_umax", VGPR_32, i32, atomic_umax_global +>; +defm BUFFER_ATOMIC_AND : MUBUF_Atomic < + 0x00000039, "buffer_atomic_and", VGPR_32, i32, atomic_and_global +>; +defm BUFFER_ATOMIC_OR : MUBUF_Atomic < + 0x0000003a, "buffer_atomic_or", VGPR_32, i32, atomic_or_global +>; +defm BUFFER_ATOMIC_XOR : MUBUF_Atomic < + 0x0000003b, "buffer_atomic_xor", VGPR_32, i32, atomic_xor_global +>; +//def BUFFER_ATOMIC_INC : MUBUF_ <0x0000003c, "buffer_atomic_inc", []>; +//def BUFFER_ATOMIC_DEC : MUBUF_ <0x0000003d, "buffer_atomic_dec", []>; +//def BUFFER_ATOMIC_FCMPSWAP : MUBUF_ <0x0000003e, "buffer_atomic_fcmpswap", []>; +//def BUFFER_ATOMIC_FMIN : MUBUF_ <0x0000003f, "buffer_atomic_fmin", []>; +//def BUFFER_ATOMIC_FMAX : MUBUF_ <0x00000040, "buffer_atomic_fmax", []>; +//def BUFFER_ATOMIC_SWAP_X2 : MUBUF_X2 <0x00000050, "buffer_atomic_swap_x2", []>; +//def BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_X2 <0x00000051, "buffer_atomic_cmpswap_x2", []>; +//def BUFFER_ATOMIC_ADD_X2 : MUBUF_X2 <0x00000052, "buffer_atomic_add_x2", []>; +//def BUFFER_ATOMIC_SUB_X2 : MUBUF_X2 <0x00000053, "buffer_atomic_sub_x2", []>; +//def BUFFER_ATOMIC_RSUB_X2 : MUBUF_X2 <0x00000054, "buffer_atomic_rsub_x2", []>; +//def BUFFER_ATOMIC_SMIN_X2 : MUBUF_X2 <0x00000055, "buffer_atomic_smin_x2", []>; +//def BUFFER_ATOMIC_UMIN_X2 : 
MUBUF_X2 <0x00000056, "buffer_atomic_umin_x2", []>; +//def BUFFER_ATOMIC_SMAX_X2 : MUBUF_X2 <0x00000057, "buffer_atomic_smax_x2", []>; +//def BUFFER_ATOMIC_UMAX_X2 : MUBUF_X2 <0x00000058, "buffer_atomic_umax_x2", []>; +//def BUFFER_ATOMIC_AND_X2 : MUBUF_X2 <0x00000059, "buffer_atomic_and_x2", []>; +//def BUFFER_ATOMIC_OR_X2 : MUBUF_X2 <0x0000005a, "buffer_atomic_or_x2", []>; +//def BUFFER_ATOMIC_XOR_X2 : MUBUF_X2 <0x0000005b, "buffer_atomic_xor_x2", []>; +//def BUFFER_ATOMIC_INC_X2 : MUBUF_X2 <0x0000005c, "buffer_atomic_inc_x2", []>; +//def BUFFER_ATOMIC_DEC_X2 : MUBUF_X2 <0x0000005d, "buffer_atomic_dec_x2", []>; +//def BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_X2 <0x0000005e, "buffer_atomic_fcmpswap_x2", []>; +//def BUFFER_ATOMIC_FMIN_X2 : MUBUF_X2 <0x0000005f, "buffer_atomic_fmin_x2", []>; +//def BUFFER_ATOMIC_FMAX_X2 : MUBUF_X2 <0x00000060, "buffer_atomic_fmax_x2", []>; +//def BUFFER_WBINVL1_SC : MUBUF_WBINVL1 <0x00000070, "buffer_wbinvl1_sc", []>; +//def BUFFER_WBINVL1 : MUBUF_WBINVL1 <0x00000071, "buffer_wbinvl1", []>; + +} // End SubtargetPredicate = isSICI //===----------------------------------------------------------------------===// // MTBUF Instructions //===----------------------------------------------------------------------===// -//def TBUFFER_LOAD_FORMAT_X : MTBUF_ <0x00000000, "TBUFFER_LOAD_FORMAT_X", []>; -//def TBUFFER_LOAD_FORMAT_XY : MTBUF_ <0x00000001, "TBUFFER_LOAD_FORMAT_XY", []>; -//def TBUFFER_LOAD_FORMAT_XYZ : MTBUF_ <0x00000002, "TBUFFER_LOAD_FORMAT_XYZ", []>; -def TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Load_Helper <0x00000003, "TBUFFER_LOAD_FORMAT_XYZW", VReg_128>; -def TBUFFER_STORE_FORMAT_X : MTBUF_Store_Helper <0x00000004, "TBUFFER_STORE_FORMAT_X", VReg_32>; -def TBUFFER_STORE_FORMAT_XY : MTBUF_Store_Helper <0x00000005, "TBUFFER_STORE_FORMAT_XY", VReg_64>; -def TBUFFER_STORE_FORMAT_XYZ : MTBUF_Store_Helper <0x00000006, "TBUFFER_STORE_FORMAT_XYZ", VReg_128>; -def TBUFFER_STORE_FORMAT_XYZW : MTBUF_Store_Helper <0x00000007, 
"TBUFFER_STORE_FORMAT_XYZW", VReg_128>; +//def TBUFFER_LOAD_FORMAT_X : MTBUF_ <0x00000000, "tbuffer_load_format_x", []>; +//def TBUFFER_LOAD_FORMAT_XY : MTBUF_ <0x00000001, "tbuffer_load_format_xy", []>; +//def TBUFFER_LOAD_FORMAT_XYZ : MTBUF_ <0x00000002, "tbuffer_load_format_xyz", []>; +defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Load_Helper <0x00000003, "tbuffer_load_format_xyzw", VReg_128>; +defm TBUFFER_STORE_FORMAT_X : MTBUF_Store_Helper <0x00000004, "tbuffer_store_format_x", VGPR_32>; +defm TBUFFER_STORE_FORMAT_XY : MTBUF_Store_Helper <0x00000005, "tbuffer_store_format_xy", VReg_64>; +defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Store_Helper <0x00000006, "tbuffer_store_format_xyz", VReg_128>; +defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Store_Helper <0x00000007, "tbuffer_store_format_xyzw", VReg_128>; //===----------------------------------------------------------------------===// // MIMG Instructions //===----------------------------------------------------------------------===// -defm IMAGE_LOAD : MIMG_NoSampler <0x00000000, "IMAGE_LOAD">; -defm IMAGE_LOAD_MIP : MIMG_NoSampler <0x00000001, "IMAGE_LOAD_MIP">; -//def IMAGE_LOAD_PCK : MIMG_NoPattern_ <"IMAGE_LOAD_PCK", 0x00000002>; -//def IMAGE_LOAD_PCK_SGN : MIMG_NoPattern_ <"IMAGE_LOAD_PCK_SGN", 0x00000003>; -//def IMAGE_LOAD_MIP_PCK : MIMG_NoPattern_ <"IMAGE_LOAD_MIP_PCK", 0x00000004>; -//def IMAGE_LOAD_MIP_PCK_SGN : MIMG_NoPattern_ <"IMAGE_LOAD_MIP_PCK_SGN", 0x00000005>; -//def IMAGE_STORE : MIMG_NoPattern_ <"IMAGE_STORE", 0x00000008>; -//def IMAGE_STORE_MIP : MIMG_NoPattern_ <"IMAGE_STORE_MIP", 0x00000009>; -//def IMAGE_STORE_PCK : MIMG_NoPattern_ <"IMAGE_STORE_PCK", 0x0000000a>; -//def IMAGE_STORE_MIP_PCK : MIMG_NoPattern_ <"IMAGE_STORE_MIP_PCK", 0x0000000b>; -defm IMAGE_GET_RESINFO : MIMG_NoSampler <0x0000000e, "IMAGE_GET_RESINFO">; -//def IMAGE_ATOMIC_SWAP : MIMG_NoPattern_ <"IMAGE_ATOMIC_SWAP", 0x0000000f>; -//def IMAGE_ATOMIC_CMPSWAP : MIMG_NoPattern_ <"IMAGE_ATOMIC_CMPSWAP", 0x00000010>; -//def IMAGE_ATOMIC_ADD : 
MIMG_NoPattern_ <"IMAGE_ATOMIC_ADD", 0x00000011>; -//def IMAGE_ATOMIC_SUB : MIMG_NoPattern_ <"IMAGE_ATOMIC_SUB", 0x00000012>; -//def IMAGE_ATOMIC_RSUB : MIMG_NoPattern_ <"IMAGE_ATOMIC_RSUB", 0x00000013>; -//def IMAGE_ATOMIC_SMIN : MIMG_NoPattern_ <"IMAGE_ATOMIC_SMIN", 0x00000014>; -//def IMAGE_ATOMIC_UMIN : MIMG_NoPattern_ <"IMAGE_ATOMIC_UMIN", 0x00000015>; -//def IMAGE_ATOMIC_SMAX : MIMG_NoPattern_ <"IMAGE_ATOMIC_SMAX", 0x00000016>; -//def IMAGE_ATOMIC_UMAX : MIMG_NoPattern_ <"IMAGE_ATOMIC_UMAX", 0x00000017>; -//def IMAGE_ATOMIC_AND : MIMG_NoPattern_ <"IMAGE_ATOMIC_AND", 0x00000018>; -//def IMAGE_ATOMIC_OR : MIMG_NoPattern_ <"IMAGE_ATOMIC_OR", 0x00000019>; -//def IMAGE_ATOMIC_XOR : MIMG_NoPattern_ <"IMAGE_ATOMIC_XOR", 0x0000001a>; -//def IMAGE_ATOMIC_INC : MIMG_NoPattern_ <"IMAGE_ATOMIC_INC", 0x0000001b>; -//def IMAGE_ATOMIC_DEC : MIMG_NoPattern_ <"IMAGE_ATOMIC_DEC", 0x0000001c>; -//def IMAGE_ATOMIC_FCMPSWAP : MIMG_NoPattern_ <"IMAGE_ATOMIC_FCMPSWAP", 0x0000001d>; -//def IMAGE_ATOMIC_FMIN : MIMG_NoPattern_ <"IMAGE_ATOMIC_FMIN", 0x0000001e>; -//def IMAGE_ATOMIC_FMAX : MIMG_NoPattern_ <"IMAGE_ATOMIC_FMAX", 0x0000001f>; -defm IMAGE_SAMPLE : MIMG_Sampler <0x00000020, "IMAGE_SAMPLE">; -defm IMAGE_SAMPLE_CL : MIMG_Sampler <0x00000021, "IMAGE_SAMPLE_CL">; -defm IMAGE_SAMPLE_D : MIMG_Sampler <0x00000022, "IMAGE_SAMPLE_D">; -defm IMAGE_SAMPLE_D_CL : MIMG_Sampler <0x00000023, "IMAGE_SAMPLE_D_CL">; -defm IMAGE_SAMPLE_L : MIMG_Sampler <0x00000024, "IMAGE_SAMPLE_L">; -defm IMAGE_SAMPLE_B : MIMG_Sampler <0x00000025, "IMAGE_SAMPLE_B">; -defm IMAGE_SAMPLE_B_CL : MIMG_Sampler <0x00000026, "IMAGE_SAMPLE_B_CL">; -defm IMAGE_SAMPLE_LZ : MIMG_Sampler <0x00000027, "IMAGE_SAMPLE_LZ">; -defm IMAGE_SAMPLE_C : MIMG_Sampler <0x00000028, "IMAGE_SAMPLE_C">; -defm IMAGE_SAMPLE_C_CL : MIMG_Sampler <0x00000029, "IMAGE_SAMPLE_C_CL">; -defm IMAGE_SAMPLE_C_D : MIMG_Sampler <0x0000002a, "IMAGE_SAMPLE_C_D">; -defm IMAGE_SAMPLE_C_D_CL : MIMG_Sampler <0x0000002b, "IMAGE_SAMPLE_C_D_CL">; -defm 
IMAGE_SAMPLE_C_L : MIMG_Sampler <0x0000002c, "IMAGE_SAMPLE_C_L">; -defm IMAGE_SAMPLE_C_B : MIMG_Sampler <0x0000002d, "IMAGE_SAMPLE_C_B">; -defm IMAGE_SAMPLE_C_B_CL : MIMG_Sampler <0x0000002e, "IMAGE_SAMPLE_C_B_CL">; -defm IMAGE_SAMPLE_C_LZ : MIMG_Sampler <0x0000002f, "IMAGE_SAMPLE_C_LZ">; -defm IMAGE_SAMPLE_O : MIMG_Sampler <0x00000030, "IMAGE_SAMPLE_O">; -defm IMAGE_SAMPLE_CL_O : MIMG_Sampler <0x00000031, "IMAGE_SAMPLE_CL_O">; -defm IMAGE_SAMPLE_D_O : MIMG_Sampler <0x00000032, "IMAGE_SAMPLE_D_O">; -defm IMAGE_SAMPLE_D_CL_O : MIMG_Sampler <0x00000033, "IMAGE_SAMPLE_D_CL_O">; -defm IMAGE_SAMPLE_L_O : MIMG_Sampler <0x00000034, "IMAGE_SAMPLE_L_O">; -defm IMAGE_SAMPLE_B_O : MIMG_Sampler <0x00000035, "IMAGE_SAMPLE_B_O">; -defm IMAGE_SAMPLE_B_CL_O : MIMG_Sampler <0x00000036, "IMAGE_SAMPLE_B_CL_O">; -defm IMAGE_SAMPLE_LZ_O : MIMG_Sampler <0x00000037, "IMAGE_SAMPLE_LZ_O">; -defm IMAGE_SAMPLE_C_O : MIMG_Sampler <0x00000038, "IMAGE_SAMPLE_C_O">; -defm IMAGE_SAMPLE_C_CL_O : MIMG_Sampler <0x00000039, "IMAGE_SAMPLE_C_CL_O">; -defm IMAGE_SAMPLE_C_D_O : MIMG_Sampler <0x0000003a, "IMAGE_SAMPLE_C_D_O">; -defm IMAGE_SAMPLE_C_D_CL_O : MIMG_Sampler <0x0000003b, "IMAGE_SAMPLE_C_D_CL_O">; -defm IMAGE_SAMPLE_C_L_O : MIMG_Sampler <0x0000003c, "IMAGE_SAMPLE_C_L_O">; -defm IMAGE_SAMPLE_C_B_O : MIMG_Sampler <0x0000003d, "IMAGE_SAMPLE_C_B_O">; -defm IMAGE_SAMPLE_C_B_CL_O : MIMG_Sampler <0x0000003e, "IMAGE_SAMPLE_C_B_CL_O">; -defm IMAGE_SAMPLE_C_LZ_O : MIMG_Sampler <0x0000003f, "IMAGE_SAMPLE_C_LZ_O">; -defm IMAGE_GATHER4 : MIMG_Gather <0x00000040, "IMAGE_GATHER4">; -defm IMAGE_GATHER4_CL : MIMG_Gather <0x00000041, "IMAGE_GATHER4_CL">; -defm IMAGE_GATHER4_L : MIMG_Gather <0x00000044, "IMAGE_GATHER4_L">; -defm IMAGE_GATHER4_B : MIMG_Gather <0x00000045, "IMAGE_GATHER4_B">; -defm IMAGE_GATHER4_B_CL : MIMG_Gather <0x00000046, "IMAGE_GATHER4_B_CL">; -defm IMAGE_GATHER4_LZ : MIMG_Gather <0x00000047, "IMAGE_GATHER4_LZ">; -defm IMAGE_GATHER4_C : MIMG_Gather <0x00000048, "IMAGE_GATHER4_C">; -defm 
IMAGE_GATHER4_C_CL : MIMG_Gather <0x00000049, "IMAGE_GATHER4_C_CL">; -defm IMAGE_GATHER4_C_L : MIMG_Gather <0x0000004c, "IMAGE_GATHER4_C_L">; -defm IMAGE_GATHER4_C_B : MIMG_Gather <0x0000004d, "IMAGE_GATHER4_C_B">; -defm IMAGE_GATHER4_C_B_CL : MIMG_Gather <0x0000004e, "IMAGE_GATHER4_C_B_CL">; -defm IMAGE_GATHER4_C_LZ : MIMG_Gather <0x0000004f, "IMAGE_GATHER4_C_LZ">; -defm IMAGE_GATHER4_O : MIMG_Gather <0x00000050, "IMAGE_GATHER4_O">; -defm IMAGE_GATHER4_CL_O : MIMG_Gather <0x00000051, "IMAGE_GATHER4_CL_O">; -defm IMAGE_GATHER4_L_O : MIMG_Gather <0x00000054, "IMAGE_GATHER4_L_O">; -defm IMAGE_GATHER4_B_O : MIMG_Gather <0x00000055, "IMAGE_GATHER4_B_O">; -defm IMAGE_GATHER4_B_CL_O : MIMG_Gather <0x00000056, "IMAGE_GATHER4_B_CL_O">; -defm IMAGE_GATHER4_LZ_O : MIMG_Gather <0x00000057, "IMAGE_GATHER4_LZ_O">; -defm IMAGE_GATHER4_C_O : MIMG_Gather <0x00000058, "IMAGE_GATHER4_C_O">; -defm IMAGE_GATHER4_C_CL_O : MIMG_Gather <0x00000059, "IMAGE_GATHER4_C_CL_O">; -defm IMAGE_GATHER4_C_L_O : MIMG_Gather <0x0000005c, "IMAGE_GATHER4_C_L_O">; -defm IMAGE_GATHER4_C_B_O : MIMG_Gather <0x0000005d, "IMAGE_GATHER4_C_B_O">; -defm IMAGE_GATHER4_C_B_CL_O : MIMG_Gather <0x0000005e, "IMAGE_GATHER4_C_B_CL_O">; -defm IMAGE_GATHER4_C_LZ_O : MIMG_Gather <0x0000005f, "IMAGE_GATHER4_C_LZ_O">; -defm IMAGE_GET_LOD : MIMG_Sampler <0x00000060, "IMAGE_GET_LOD">; -defm IMAGE_SAMPLE_CD : MIMG_Sampler <0x00000068, "IMAGE_SAMPLE_CD">; -defm IMAGE_SAMPLE_CD_CL : MIMG_Sampler <0x00000069, "IMAGE_SAMPLE_CD_CL">; -defm IMAGE_SAMPLE_C_CD : MIMG_Sampler <0x0000006a, "IMAGE_SAMPLE_C_CD">; -defm IMAGE_SAMPLE_C_CD_CL : MIMG_Sampler <0x0000006b, "IMAGE_SAMPLE_C_CD_CL">; -defm IMAGE_SAMPLE_CD_O : MIMG_Sampler <0x0000006c, "IMAGE_SAMPLE_CD_O">; -defm IMAGE_SAMPLE_CD_CL_O : MIMG_Sampler <0x0000006d, "IMAGE_SAMPLE_CD_CL_O">; -defm IMAGE_SAMPLE_C_CD_O : MIMG_Sampler <0x0000006e, "IMAGE_SAMPLE_C_CD_O">; -defm IMAGE_SAMPLE_C_CD_CL_O : MIMG_Sampler <0x0000006f, "IMAGE_SAMPLE_C_CD_CL_O">; -//def IMAGE_RSRC256 : 
MIMG_NoPattern_RSRC256 <"IMAGE_RSRC256", 0x0000007e>; -//def IMAGE_SAMPLER : MIMG_NoPattern_ <"IMAGE_SAMPLER", 0x0000007f>; +defm IMAGE_LOAD : MIMG_NoSampler <0x00000000, "image_load">; +defm IMAGE_LOAD_MIP : MIMG_NoSampler <0x00000001, "image_load_mip">; +//def IMAGE_LOAD_PCK : MIMG_NoPattern_ <"image_load_pck", 0x00000002>; +//def IMAGE_LOAD_PCK_SGN : MIMG_NoPattern_ <"image_load_pck_sgn", 0x00000003>; +//def IMAGE_LOAD_MIP_PCK : MIMG_NoPattern_ <"image_load_mip_pck", 0x00000004>; +//def IMAGE_LOAD_MIP_PCK_SGN : MIMG_NoPattern_ <"image_load_mip_pck_sgn", 0x00000005>; +//def IMAGE_STORE : MIMG_NoPattern_ <"image_store", 0x00000008>; +//def IMAGE_STORE_MIP : MIMG_NoPattern_ <"image_store_mip", 0x00000009>; +//def IMAGE_STORE_PCK : MIMG_NoPattern_ <"image_store_pck", 0x0000000a>; +//def IMAGE_STORE_MIP_PCK : MIMG_NoPattern_ <"image_store_mip_pck", 0x0000000b>; +defm IMAGE_GET_RESINFO : MIMG_NoSampler <0x0000000e, "image_get_resinfo">; +//def IMAGE_ATOMIC_SWAP : MIMG_NoPattern_ <"image_atomic_swap", 0x0000000f>; +//def IMAGE_ATOMIC_CMPSWAP : MIMG_NoPattern_ <"image_atomic_cmpswap", 0x00000010>; +//def IMAGE_ATOMIC_ADD : MIMG_NoPattern_ <"image_atomic_add", 0x00000011>; +//def IMAGE_ATOMIC_SUB : MIMG_NoPattern_ <"image_atomic_sub", 0x00000012>; +//def IMAGE_ATOMIC_RSUB : MIMG_NoPattern_ <"image_atomic_rsub", 0x00000013>; +//def IMAGE_ATOMIC_SMIN : MIMG_NoPattern_ <"image_atomic_smin", 0x00000014>; +//def IMAGE_ATOMIC_UMIN : MIMG_NoPattern_ <"image_atomic_umin", 0x00000015>; +//def IMAGE_ATOMIC_SMAX : MIMG_NoPattern_ <"image_atomic_smax", 0x00000016>; +//def IMAGE_ATOMIC_UMAX : MIMG_NoPattern_ <"image_atomic_umax", 0x00000017>; +//def IMAGE_ATOMIC_AND : MIMG_NoPattern_ <"image_atomic_and", 0x00000018>; +//def IMAGE_ATOMIC_OR : MIMG_NoPattern_ <"image_atomic_or", 0x00000019>; +//def IMAGE_ATOMIC_XOR : MIMG_NoPattern_ <"image_atomic_xor", 0x0000001a>; +//def IMAGE_ATOMIC_INC : MIMG_NoPattern_ <"image_atomic_inc", 0x0000001b>; +//def IMAGE_ATOMIC_DEC : MIMG_NoPattern_ 
<"image_atomic_dec", 0x0000001c>; +//def IMAGE_ATOMIC_FCMPSWAP : MIMG_NoPattern_ <"image_atomic_fcmpswap", 0x0000001d>; +//def IMAGE_ATOMIC_FMIN : MIMG_NoPattern_ <"image_atomic_fmin", 0x0000001e>; +//def IMAGE_ATOMIC_FMAX : MIMG_NoPattern_ <"image_atomic_fmax", 0x0000001f>; +defm IMAGE_SAMPLE : MIMG_Sampler <0x00000020, "image_sample">; +defm IMAGE_SAMPLE_CL : MIMG_Sampler <0x00000021, "image_sample_cl">; +defm IMAGE_SAMPLE_D : MIMG_Sampler <0x00000022, "image_sample_d">; +defm IMAGE_SAMPLE_D_CL : MIMG_Sampler <0x00000023, "image_sample_d_cl">; +defm IMAGE_SAMPLE_L : MIMG_Sampler <0x00000024, "image_sample_l">; +defm IMAGE_SAMPLE_B : MIMG_Sampler <0x00000025, "image_sample_b">; +defm IMAGE_SAMPLE_B_CL : MIMG_Sampler <0x00000026, "image_sample_b_cl">; +defm IMAGE_SAMPLE_LZ : MIMG_Sampler <0x00000027, "image_sample_lz">; +defm IMAGE_SAMPLE_C : MIMG_Sampler <0x00000028, "image_sample_c">; +defm IMAGE_SAMPLE_C_CL : MIMG_Sampler <0x00000029, "image_sample_c_cl">; +defm IMAGE_SAMPLE_C_D : MIMG_Sampler <0x0000002a, "image_sample_c_d">; +defm IMAGE_SAMPLE_C_D_CL : MIMG_Sampler <0x0000002b, "image_sample_c_d_cl">; +defm IMAGE_SAMPLE_C_L : MIMG_Sampler <0x0000002c, "image_sample_c_l">; +defm IMAGE_SAMPLE_C_B : MIMG_Sampler <0x0000002d, "image_sample_c_b">; +defm IMAGE_SAMPLE_C_B_CL : MIMG_Sampler <0x0000002e, "image_sample_c_b_cl">; +defm IMAGE_SAMPLE_C_LZ : MIMG_Sampler <0x0000002f, "image_sample_c_lz">; +defm IMAGE_SAMPLE_O : MIMG_Sampler <0x00000030, "image_sample_o">; +defm IMAGE_SAMPLE_CL_O : MIMG_Sampler <0x00000031, "image_sample_cl_o">; +defm IMAGE_SAMPLE_D_O : MIMG_Sampler <0x00000032, "image_sample_d_o">; +defm IMAGE_SAMPLE_D_CL_O : MIMG_Sampler <0x00000033, "image_sample_d_cl_o">; +defm IMAGE_SAMPLE_L_O : MIMG_Sampler <0x00000034, "image_sample_l_o">; +defm IMAGE_SAMPLE_B_O : MIMG_Sampler <0x00000035, "image_sample_b_o">; +defm IMAGE_SAMPLE_B_CL_O : MIMG_Sampler <0x00000036, "image_sample_b_cl_o">; +defm IMAGE_SAMPLE_LZ_O : MIMG_Sampler <0x00000037, 
"image_sample_lz_o">; +defm IMAGE_SAMPLE_C_O : MIMG_Sampler <0x00000038, "image_sample_c_o">; +defm IMAGE_SAMPLE_C_CL_O : MIMG_Sampler <0x00000039, "image_sample_c_cl_o">; +defm IMAGE_SAMPLE_C_D_O : MIMG_Sampler <0x0000003a, "image_sample_c_d_o">; +defm IMAGE_SAMPLE_C_D_CL_O : MIMG_Sampler <0x0000003b, "image_sample_c_d_cl_o">; +defm IMAGE_SAMPLE_C_L_O : MIMG_Sampler <0x0000003c, "image_sample_c_l_o">; +defm IMAGE_SAMPLE_C_B_O : MIMG_Sampler <0x0000003d, "image_sample_c_b_o">; +defm IMAGE_SAMPLE_C_B_CL_O : MIMG_Sampler <0x0000003e, "image_sample_c_b_cl_o">; +defm IMAGE_SAMPLE_C_LZ_O : MIMG_Sampler <0x0000003f, "image_sample_c_lz_o">; +defm IMAGE_GATHER4 : MIMG_Gather <0x00000040, "image_gather4">; +defm IMAGE_GATHER4_CL : MIMG_Gather <0x00000041, "image_gather4_cl">; +defm IMAGE_GATHER4_L : MIMG_Gather <0x00000044, "image_gather4_l">; +defm IMAGE_GATHER4_B : MIMG_Gather <0x00000045, "image_gather4_b">; +defm IMAGE_GATHER4_B_CL : MIMG_Gather <0x00000046, "image_gather4_b_cl">; +defm IMAGE_GATHER4_LZ : MIMG_Gather <0x00000047, "image_gather4_lz">; +defm IMAGE_GATHER4_C : MIMG_Gather <0x00000048, "image_gather4_c">; +defm IMAGE_GATHER4_C_CL : MIMG_Gather <0x00000049, "image_gather4_c_cl">; +defm IMAGE_GATHER4_C_L : MIMG_Gather <0x0000004c, "image_gather4_c_l">; +defm IMAGE_GATHER4_C_B : MIMG_Gather <0x0000004d, "image_gather4_c_b">; +defm IMAGE_GATHER4_C_B_CL : MIMG_Gather <0x0000004e, "image_gather4_c_b_cl">; +defm IMAGE_GATHER4_C_LZ : MIMG_Gather <0x0000004f, "image_gather4_c_lz">; +defm IMAGE_GATHER4_O : MIMG_Gather <0x00000050, "image_gather4_o">; +defm IMAGE_GATHER4_CL_O : MIMG_Gather <0x00000051, "image_gather4_cl_o">; +defm IMAGE_GATHER4_L_O : MIMG_Gather <0x00000054, "image_gather4_l_o">; +defm IMAGE_GATHER4_B_O : MIMG_Gather <0x00000055, "image_gather4_b_o">; +defm IMAGE_GATHER4_B_CL_O : MIMG_Gather <0x00000056, "image_gather4_b_cl_o">; +defm IMAGE_GATHER4_LZ_O : MIMG_Gather <0x00000057, "image_gather4_lz_o">; +defm IMAGE_GATHER4_C_O : MIMG_Gather 
<0x00000058, "image_gather4_c_o">; +defm IMAGE_GATHER4_C_CL_O : MIMG_Gather <0x00000059, "image_gather4_c_cl_o">; +defm IMAGE_GATHER4_C_L_O : MIMG_Gather <0x0000005c, "image_gather4_c_l_o">; +defm IMAGE_GATHER4_C_B_O : MIMG_Gather <0x0000005d, "image_gather4_c_b_o">; +defm IMAGE_GATHER4_C_B_CL_O : MIMG_Gather <0x0000005e, "image_gather4_c_b_cl_o">; +defm IMAGE_GATHER4_C_LZ_O : MIMG_Gather <0x0000005f, "image_gather4_c_lz_o">; +defm IMAGE_GET_LOD : MIMG_Sampler <0x00000060, "image_get_lod">; +defm IMAGE_SAMPLE_CD : MIMG_Sampler <0x00000068, "image_sample_cd">; +defm IMAGE_SAMPLE_CD_CL : MIMG_Sampler <0x00000069, "image_sample_cd_cl">; +defm IMAGE_SAMPLE_C_CD : MIMG_Sampler <0x0000006a, "image_sample_c_cd">; +defm IMAGE_SAMPLE_C_CD_CL : MIMG_Sampler <0x0000006b, "image_sample_c_cd_cl">; +defm IMAGE_SAMPLE_CD_O : MIMG_Sampler <0x0000006c, "image_sample_cd_o">; +defm IMAGE_SAMPLE_CD_CL_O : MIMG_Sampler <0x0000006d, "image_sample_cd_cl_o">; +defm IMAGE_SAMPLE_C_CD_O : MIMG_Sampler <0x0000006e, "image_sample_c_cd_o">; +defm IMAGE_SAMPLE_C_CD_CL_O : MIMG_Sampler <0x0000006f, "image_sample_c_cd_cl_o">; +//def IMAGE_RSRC256 : MIMG_NoPattern_RSRC256 <"image_rsrc256", 0x0000007e>; +//def IMAGE_SAMPLER : MIMG_NoPattern_ <"image_sampler", 0x0000007f>; + +//===----------------------------------------------------------------------===// +// Flat Instructions +//===----------------------------------------------------------------------===// +let Predicates = [HasFlatAddressSpace] in { +def FLAT_LOAD_UBYTE : FLAT_Load_Helper <0x00000008, "flat_load_ubyte", VGPR_32>; +def FLAT_LOAD_SBYTE : FLAT_Load_Helper <0x00000009, "flat_load_sbyte", VGPR_32>; +def FLAT_LOAD_USHORT : FLAT_Load_Helper <0x0000000a, "flat_load_ushort", VGPR_32>; +def FLAT_LOAD_SSHORT : FLAT_Load_Helper <0x0000000b, "flat_load_sshort", VGPR_32>; +def FLAT_LOAD_DWORD : FLAT_Load_Helper <0x0000000c, "flat_load_dword", VGPR_32>; +def FLAT_LOAD_DWORDX2 : FLAT_Load_Helper <0x0000000d, "flat_load_dwordx2", VReg_64>; +def 
FLAT_LOAD_DWORDX4 : FLAT_Load_Helper <0x0000000e, "flat_load_dwordx4", VReg_128>; +def FLAT_LOAD_DWORDX3 : FLAT_Load_Helper <0x00000010, "flat_load_dwordx3", VReg_96>; + +def FLAT_STORE_BYTE : FLAT_Store_Helper < + 0x00000018, "flat_store_byte", VGPR_32 +>; + +def FLAT_STORE_SHORT : FLAT_Store_Helper < + 0x0000001a, "flat_store_short", VGPR_32 +>; + +def FLAT_STORE_DWORD : FLAT_Store_Helper < + 0x0000001c, "flat_store_dword", VGPR_32 +>; + +def FLAT_STORE_DWORDX2 : FLAT_Store_Helper < + 0x0000001d, "flat_store_dwordx2", VReg_64 +>; + +def FLAT_STORE_DWORDX4 : FLAT_Store_Helper < + 0x0000001e, "flat_store_dwordx4", VReg_128 +>; + +def FLAT_STORE_DWORDX3 : FLAT_Store_Helper < + 0x0000001e, "flat_store_dwordx3", VReg_96 +>; + +//def FLAT_ATOMIC_SWAP : FLAT_ <0x00000030, "flat_atomic_swap", []>; +//def FLAT_ATOMIC_CMPSWAP : FLAT_ <0x00000031, "flat_atomic_cmpswap", []>; +//def FLAT_ATOMIC_ADD : FLAT_ <0x00000032, "flat_atomic_add", []>; +//def FLAT_ATOMIC_SUB : FLAT_ <0x00000033, "flat_atomic_sub", []>; +//def FLAT_ATOMIC_RSUB : FLAT_ <0x00000034, "flat_atomic_rsub", []>; +//def FLAT_ATOMIC_SMIN : FLAT_ <0x00000035, "flat_atomic_smin", []>; +//def FLAT_ATOMIC_UMIN : FLAT_ <0x00000036, "flat_atomic_umin", []>; +//def FLAT_ATOMIC_SMAX : FLAT_ <0x00000037, "flat_atomic_smax", []>; +//def FLAT_ATOMIC_UMAX : FLAT_ <0x00000038, "flat_atomic_umax", []>; +//def FLAT_ATOMIC_AND : FLAT_ <0x00000039, "flat_atomic_and", []>; +//def FLAT_ATOMIC_OR : FLAT_ <0x0000003a, "flat_atomic_or", []>; +//def FLAT_ATOMIC_XOR : FLAT_ <0x0000003b, "flat_atomic_xor", []>; +//def FLAT_ATOMIC_INC : FLAT_ <0x0000003c, "flat_atomic_inc", []>; +//def FLAT_ATOMIC_DEC : FLAT_ <0x0000003d, "flat_atomic_dec", []>; +//def FLAT_ATOMIC_FCMPSWAP : FLAT_ <0x0000003e, "flat_atomic_fcmpswap", []>; +//def FLAT_ATOMIC_FMIN : FLAT_ <0x0000003f, "flat_atomic_fmin", []>; +//def FLAT_ATOMIC_FMAX : FLAT_ <0x00000040, "flat_atomic_fmax", []>; +//def FLAT_ATOMIC_SWAP_X2 : FLAT_X2 <0x00000050, "flat_atomic_swap_x2", []>; 
+//def FLAT_ATOMIC_CMPSWAP_X2 : FLAT_X2 <0x00000051, "flat_atomic_cmpswap_x2", []>; +//def FLAT_ATOMIC_ADD_X2 : FLAT_X2 <0x00000052, "flat_atomic_add_x2", []>; +//def FLAT_ATOMIC_SUB_X2 : FLAT_X2 <0x00000053, "flat_atomic_sub_x2", []>; +//def FLAT_ATOMIC_RSUB_X2 : FLAT_X2 <0x00000054, "flat_atomic_rsub_x2", []>; +//def FLAT_ATOMIC_SMIN_X2 : FLAT_X2 <0x00000055, "flat_atomic_smin_x2", []>; +//def FLAT_ATOMIC_UMIN_X2 : FLAT_X2 <0x00000056, "flat_atomic_umin_x2", []>; +//def FLAT_ATOMIC_SMAX_X2 : FLAT_X2 <0x00000057, "flat_atomic_smax_x2", []>; +//def FLAT_ATOMIC_UMAX_X2 : FLAT_X2 <0x00000058, "flat_atomic_umax_x2", []>; +//def FLAT_ATOMIC_AND_X2 : FLAT_X2 <0x00000059, "flat_atomic_and_x2", []>; +//def FLAT_ATOMIC_OR_X2 : FLAT_X2 <0x0000005a, "flat_atomic_or_x2", []>; +//def FLAT_ATOMIC_XOR_X2 : FLAT_X2 <0x0000005b, "flat_atomic_xor_x2", []>; +//def FLAT_ATOMIC_INC_X2 : FLAT_X2 <0x0000005c, "flat_atomic_inc_x2", []>; +//def FLAT_ATOMIC_DEC_X2 : FLAT_X2 <0x0000005d, "flat_atomic_dec_x2", []>; +//def FLAT_ATOMIC_FCMPSWAP_X2 : FLAT_X2 <0x0000005e, "flat_atomic_fcmpswap_x2", []>; +//def FLAT_ATOMIC_FMIN_X2 : FLAT_X2 <0x0000005f, "flat_atomic_fmin_x2", []>; +//def FLAT_ATOMIC_FMAX_X2 : FLAT_X2 <0x00000060, "flat_atomic_fmax_x2", []>; + +} // End HasFlatAddressSpace predicate //===----------------------------------------------------------------------===// // VOP1 Instructions //===----------------------------------------------------------------------===// -//def V_NOP : VOP1_ <0x00000000, "V_NOP", []>; +//def V_NOP : VOP1_ <0x00000000, "v_nop", []>; -let neverHasSideEffects = 1, isMoveImm = 1 in { -defm V_MOV_B32 : VOP1_32 <0x00000001, "V_MOV_B32", []>; -} // End neverHasSideEffects = 1, isMoveImm = 1 +let isMoveImm = 1 in { +defm V_MOV_B32 : VOP1Inst <vop1<0x1>, "v_mov_b32", VOP_I32_I32>; +} // End isMoveImm = 1 let Uses = [EXEC] in { +// FIXME: Specify SchedRW for READFIRSTLANE_B32 + def V_READFIRSTLANE_B32 : VOP1 < 0x00000002, (outs SReg_32:$vdst), - (ins VReg_32:$src0), 
- "V_READFIRSTLANE_B32 $vdst, $src0", + (ins VGPR_32:$src0), + "v_readfirstlane_b32 $vdst, $src0", [] >; } -defm V_CVT_I32_F64 : VOP1_32_64 <0x00000003, "V_CVT_I32_F64", - [(set i32:$dst, (fp_to_sint f64:$src0))] +let SchedRW = [WriteQuarterRate32] in { + +defm V_CVT_I32_F64 : VOP1Inst <vop1<0x3>, "v_cvt_i32_f64", + VOP_I32_F64, fp_to_sint >; -defm V_CVT_F64_I32 : VOP1_64_32 <0x00000004, "V_CVT_F64_I32", - [(set f64:$dst, (sint_to_fp i32:$src0))] +defm V_CVT_F64_I32 : VOP1Inst <vop1<0x4>, "v_cvt_f64_i32", + VOP_F64_I32, sint_to_fp >; -defm V_CVT_F32_I32 : VOP1_32 <0x00000005, "V_CVT_F32_I32", - [(set f32:$dst, (sint_to_fp i32:$src0))] +defm V_CVT_F32_I32 : VOP1Inst <vop1<0x5>, "v_cvt_f32_i32", + VOP_F32_I32, sint_to_fp >; -defm V_CVT_F32_U32 : VOP1_32 <0x00000006, "V_CVT_F32_U32", - [(set f32:$dst, (uint_to_fp i32:$src0))] +defm V_CVT_F32_U32 : VOP1Inst <vop1<0x6>, "v_cvt_f32_u32", + VOP_F32_I32, uint_to_fp >; -defm V_CVT_U32_F32 : VOP1_32 <0x00000007, "V_CVT_U32_F32", - [(set i32:$dst, (fp_to_uint f32:$src0))] +defm V_CVT_U32_F32 : VOP1Inst <vop1<0x7>, "v_cvt_u32_f32", + VOP_I32_F32, fp_to_uint >; -defm V_CVT_I32_F32 : VOP1_32 <0x00000008, "V_CVT_I32_F32", - [(set i32:$dst, (fp_to_sint f32:$src0))] +defm V_CVT_I32_F32 : VOP1Inst <vop1<0x8>, "v_cvt_i32_f32", + VOP_I32_F32, fp_to_sint >; -defm V_MOV_FED_B32 : VOP1_32 <0x00000009, "V_MOV_FED_B32", []>; -defm V_CVT_F16_F32 : VOP1_32 <0x0000000a, "V_CVT_F16_F32", - [(set i32:$dst, (fp_to_f16 f32:$src0))] +defm V_MOV_FED_B32 : VOP1Inst <vop1<0x9>, "v_mov_fed_b32", VOP_I32_I32>; +defm V_CVT_F16_F32 : VOP1Inst <vop1<0xa>, "v_cvt_f16_f32", + VOP_I32_F32, fp_to_f16 >; -defm V_CVT_F32_F16 : VOP1_32 <0x0000000b, "V_CVT_F32_F16", - [(set f32:$dst, (f16_to_fp i32:$src0))] +defm V_CVT_F32_F16 : VOP1Inst <vop1<0xb>, "v_cvt_f32_f16", + VOP_F32_I32, f16_to_fp >; -//defm V_CVT_RPI_I32_F32 : VOP1_32 <0x0000000c, "V_CVT_RPI_I32_F32", []>; -//defm V_CVT_FLR_I32_F32 : VOP1_32 <0x0000000d, "V_CVT_FLR_I32_F32", []>; -//defm 
V_CVT_OFF_F32_I4 : VOP1_32 <0x0000000e, "V_CVT_OFF_F32_I4", []>; -defm V_CVT_F32_F64 : VOP1_32_64 <0x0000000f, "V_CVT_F32_F64", - [(set f32:$dst, (fround f64:$src0))] +//defm V_CVT_RPI_I32_F32 : VOP1_32 <0x0000000c, "v_cvt_rpi_i32_f32", []>; +//defm V_CVT_FLR_I32_F32 : VOP1_32 <0x0000000d, "v_cvt_flr_i32_f32", []>; +//defm V_CVT_OFF_F32_I4 : VOP1_32 <0x0000000e, "v_cvt_off_f32_i4", []>; +defm V_CVT_F32_F64 : VOP1Inst <vop1<0xf>, "v_cvt_f32_f64", + VOP_F32_F64, fround >; -defm V_CVT_F64_F32 : VOP1_64_32 <0x00000010, "V_CVT_F64_F32", - [(set f64:$dst, (fextend f32:$src0))] +defm V_CVT_F64_F32 : VOP1Inst <vop1<0x10>, "v_cvt_f64_f32", + VOP_F64_F32, fextend >; -defm V_CVT_F32_UBYTE0 : VOP1_32 <0x00000011, "V_CVT_F32_UBYTE0", - [(set f32:$dst, (AMDGPUcvt_f32_ubyte0 i32:$src0))] +defm V_CVT_F32_UBYTE0 : VOP1Inst <vop1<0x11>, "v_cvt_f32_ubyte0", + VOP_F32_I32, AMDGPUcvt_f32_ubyte0 >; -defm V_CVT_F32_UBYTE1 : VOP1_32 <0x00000012, "V_CVT_F32_UBYTE1", - [(set f32:$dst, (AMDGPUcvt_f32_ubyte1 i32:$src0))] +defm V_CVT_F32_UBYTE1 : VOP1Inst <vop1<0x12>, "v_cvt_f32_ubyte1", + VOP_F32_I32, AMDGPUcvt_f32_ubyte1 >; -defm V_CVT_F32_UBYTE2 : VOP1_32 <0x00000013, "V_CVT_F32_UBYTE2", - [(set f32:$dst, (AMDGPUcvt_f32_ubyte2 i32:$src0))] +defm V_CVT_F32_UBYTE2 : VOP1Inst <vop1<0x13>, "v_cvt_f32_ubyte2", + VOP_F32_I32, AMDGPUcvt_f32_ubyte2 >; -defm V_CVT_F32_UBYTE3 : VOP1_32 <0x00000014, "V_CVT_F32_UBYTE3", - [(set f32:$dst, (AMDGPUcvt_f32_ubyte3 i32:$src0))] +defm V_CVT_F32_UBYTE3 : VOP1Inst <vop1<0x14>, "v_cvt_f32_ubyte3", + VOP_F32_I32, AMDGPUcvt_f32_ubyte3 >; -defm V_CVT_U32_F64 : VOP1_32_64 <0x00000015, "V_CVT_U32_F64", - [(set i32:$dst, (fp_to_uint f64:$src0))] +defm V_CVT_U32_F64 : VOP1Inst <vop1<0x15>, "v_cvt_u32_f64", + VOP_I32_F64, fp_to_uint >; -defm V_CVT_F64_U32 : VOP1_64_32 <0x00000016, "V_CVT_F64_U32", - [(set f64:$dst, (uint_to_fp i32:$src0))] +defm V_CVT_F64_U32 : VOP1Inst <vop1<0x16>, "v_cvt_f64_u32", + VOP_F64_I32, uint_to_fp >; -defm V_FRACT_F32 : VOP1_32 <0x00000020, 
"V_FRACT_F32", - [(set f32:$dst, (AMDGPUfract f32:$src0))] ->; -defm V_TRUNC_F32 : VOP1_32 <0x00000021, "V_TRUNC_F32", - [(set f32:$dst, (ftrunc f32:$src0))] +} // let SchedRW = [WriteQuarterRate32] + +defm V_FRACT_F32 : VOP1Inst <vop1<0x20, 0x1b>, "v_fract_f32", + VOP_F32_F32, AMDGPUfract >; -defm V_CEIL_F32 : VOP1_32 <0x00000022, "V_CEIL_F32", - [(set f32:$dst, (fceil f32:$src0))] +defm V_TRUNC_F32 : VOP1Inst <vop1<0x21, 0x1c>, "v_trunc_f32", + VOP_F32_F32, ftrunc >; -defm V_RNDNE_F32 : VOP1_32 <0x00000023, "V_RNDNE_F32", - [(set f32:$dst, (frint f32:$src0))] +defm V_CEIL_F32 : VOP1Inst <vop1<0x22, 0x1d>, "v_ceil_f32", + VOP_F32_F32, fceil >; -defm V_FLOOR_F32 : VOP1_32 <0x00000024, "V_FLOOR_F32", - [(set f32:$dst, (ffloor f32:$src0))] +defm V_RNDNE_F32 : VOP1Inst <vop1<0x23, 0x1e>, "v_rndne_f32", + VOP_F32_F32, frint >; -defm V_EXP_F32 : VOP1_32 <0x00000025, "V_EXP_F32", - [(set f32:$dst, (fexp2 f32:$src0))] +defm V_FLOOR_F32 : VOP1Inst <vop1<0x24, 0x1f>, "v_floor_f32", + VOP_F32_F32, ffloor >; -defm V_LOG_CLAMP_F32 : VOP1_32 <0x00000026, "V_LOG_CLAMP_F32", []>; -defm V_LOG_F32 : VOP1_32 <0x00000027, "V_LOG_F32", - [(set f32:$dst, (flog2 f32:$src0))] +defm V_EXP_F32 : VOP1Inst <vop1<0x25, 0x20>, "v_exp_f32", + VOP_F32_F32, fexp2 >; -defm V_RCP_CLAMP_F32 : VOP1_32 <0x00000028, "V_RCP_CLAMP_F32", []>; -defm V_RCP_LEGACY_F32 : VOP1_32 <0x00000029, "V_RCP_LEGACY_F32", []>; -defm V_RCP_F32 : VOP1_32 <0x0000002a, "V_RCP_F32", - [(set f32:$dst, (AMDGPUrcp f32:$src0))] +let SchedRW = [WriteQuarterRate32] in { + +defm V_LOG_F32 : VOP1Inst <vop1<0x27, 0x21>, "v_log_f32", + VOP_F32_F32, flog2 >; -defm V_RCP_IFLAG_F32 : VOP1_32 <0x0000002b, "V_RCP_IFLAG_F32", []>; -defm V_RSQ_CLAMP_F32 : VOP1_32 <0x0000002c, "V_RSQ_CLAMP_F32", - [(set f32:$dst, (AMDGPUrsq_clamped f32:$src0))] +defm V_RCP_F32 : VOP1Inst <vop1<0x2a, 0x22>, "v_rcp_f32", + VOP_F32_F32, AMDGPUrcp >; -defm V_RSQ_LEGACY_F32 : VOP1_32 < - 0x0000002d, "V_RSQ_LEGACY_F32", - [(set f32:$dst, (AMDGPUrsq_legacy 
f32:$src0))] +defm V_RCP_IFLAG_F32 : VOP1Inst <vop1<0x2b, 0x23>, "v_rcp_iflag_f32", + VOP_F32_F32 >; -defm V_RSQ_F32 : VOP1_32 <0x0000002e, "V_RSQ_F32", - [(set f32:$dst, (AMDGPUrsq f32:$src0))] +defm V_RSQ_F32 : VOP1Inst <vop1<0x2e, 0x24>, "v_rsq_f32", + VOP_F32_F32, AMDGPUrsq >; -defm V_RCP_F64 : VOP1_64 <0x0000002f, "V_RCP_F64", - [(set f64:$dst, (AMDGPUrcp f64:$src0))] + +} //let SchedRW = [WriteQuarterRate32] + +let SchedRW = [WriteDouble] in { + +defm V_RCP_F64 : VOP1Inst <vop1<0x2f, 0x25>, "v_rcp_f64", + VOP_F64_F64, AMDGPUrcp >; -defm V_RCP_CLAMP_F64 : VOP1_64 <0x00000030, "V_RCP_CLAMP_F64", []>; -defm V_RSQ_F64 : VOP1_64 <0x00000031, "V_RSQ_F64", - [(set f64:$dst, (AMDGPUrsq f64:$src0))] +defm V_RSQ_F64 : VOP1Inst <vop1<0x31, 0x26>, "v_rsq_f64", + VOP_F64_F64, AMDGPUrsq >; -defm V_RSQ_CLAMP_F64 : VOP1_64 <0x00000032, "V_RSQ_CLAMP_F64", - [(set f64:$dst, (AMDGPUrsq_clamped f64:$src0))] + +} // let SchedRW = [WriteDouble]; + +defm V_SQRT_F32 : VOP1Inst <vop1<0x33, 0x27>, "v_sqrt_f32", + VOP_F32_F32, fsqrt >; -defm V_SQRT_F32 : VOP1_32 <0x00000033, "V_SQRT_F32", - [(set f32:$dst, (fsqrt f32:$src0))] + +let SchedRW = [WriteDouble] in { + +defm V_SQRT_F64 : VOP1Inst <vop1<0x34, 0x28>, "v_sqrt_f64", + VOP_F64_F64, fsqrt +>; + +} // let SchedRW = [WriteDouble] + +defm V_SIN_F32 : VOP1Inst <vop1<0x35, 0x29>, "v_sin_f32", + VOP_F32_F32, AMDGPUsin >; -defm V_SQRT_F64 : VOP1_64 <0x00000034, "V_SQRT_F64", - [(set f64:$dst, (fsqrt f64:$src0))] +defm V_COS_F32 : VOP1Inst <vop1<0x36, 0x2a>, "v_cos_f32", + VOP_F32_F32, AMDGPUcos >; -defm V_SIN_F32 : VOP1_32 <0x00000035, "V_SIN_F32", - [(set f32:$dst, (AMDGPUsin f32:$src0))] +defm V_NOT_B32 : VOP1Inst <vop1<0x37, 0x2b>, "v_not_b32", VOP_I32_I32>; +defm V_BFREV_B32 : VOP1Inst <vop1<0x38, 0x2c>, "v_bfrev_b32", VOP_I32_I32>; +defm V_FFBH_U32 : VOP1Inst <vop1<0x39, 0x2d>, "v_ffbh_u32", VOP_I32_I32>; +defm V_FFBL_B32 : VOP1Inst <vop1<0x3a, 0x2e>, "v_ffbl_b32", VOP_I32_I32>; +defm V_FFBH_I32 : VOP1Inst <vop1<0x3b, 0x2f>, 
"v_ffbh_i32", VOP_I32_I32>; +//defm V_FREXP_EXP_I32_F64 : VOPInst <0x0000003c, "v_frexp_exp_i32_f64", VOP_I32_F32>; +defm V_FREXP_MANT_F64 : VOP1Inst <vop1<0x3d, 0x31>, "v_frexp_mant_f64", + VOP_F64_F64 >; -defm V_COS_F32 : VOP1_32 <0x00000036, "V_COS_F32", - [(set f32:$dst, (AMDGPUcos f32:$src0))] +defm V_FRACT_F64 : VOP1Inst <vop1<0x3e, 0x32>, "v_fract_f64", VOP_F64_F64>; +//defm V_FREXP_EXP_I32_F32 : VOPInst <0x0000003f, "v_frexp_exp_i32_f32", VOP_I32_F32>; +defm V_FREXP_MANT_F32 : VOP1Inst <vop1<0x40, 0x34>, "v_frexp_mant_f32", + VOP_F32_F32 >; -defm V_NOT_B32 : VOP1_32 <0x00000037, "V_NOT_B32", []>; -defm V_BFREV_B32 : VOP1_32 <0x00000038, "V_BFREV_B32", []>; -defm V_FFBH_U32 : VOP1_32 <0x00000039, "V_FFBH_U32", []>; -defm V_FFBL_B32 : VOP1_32 <0x0000003a, "V_FFBL_B32", []>; -defm V_FFBH_I32 : VOP1_32 <0x0000003b, "V_FFBH_I32", []>; -//defm V_FREXP_EXP_I32_F64 : VOP1_32 <0x0000003c, "V_FREXP_EXP_I32_F64", []>; -defm V_FREXP_MANT_F64 : VOP1_64 <0x0000003d, "V_FREXP_MANT_F64", []>; -defm V_FRACT_F64 : VOP1_64 <0x0000003e, "V_FRACT_F64", []>; -//defm V_FREXP_EXP_I32_F32 : VOP1_32 <0x0000003f, "V_FREXP_EXP_I32_F32", []>; -defm V_FREXP_MANT_F32 : VOP1_32 <0x00000040, "V_FREXP_MANT_F32", []>; -//def V_CLREXCP : VOP1_ <0x00000041, "V_CLREXCP", []>; -defm V_MOVRELD_B32 : VOP1_32 <0x00000042, "V_MOVRELD_B32", []>; -defm V_MOVRELS_B32 : VOP1_32 <0x00000043, "V_MOVRELS_B32", []>; -defm V_MOVRELSD_B32 : VOP1_32 <0x00000044, "V_MOVRELSD_B32", []>; +//def V_CLREXCP : VOP1_ <0x00000041, "v_clrexcp", []>; +defm V_MOVRELD_B32 : VOP1Inst <vop1<0x42, 0x36>, "v_movreld_b32", VOP_I32_I32>; +defm V_MOVRELS_B32 : VOP1Inst <vop1<0x43, 0x37>, "v_movrels_b32", VOP_I32_I32>; +defm V_MOVRELSD_B32 : VOP1Inst <vop1<0x44, 0x38>, "v_movrelsd_b32", VOP_I32_I32>; +// These instruction only exist on SI and CI +let SubtargetPredicate = isSICI in { -//===----------------------------------------------------------------------===// -// VINTRP Instructions 
-//===----------------------------------------------------------------------===// +let SchedRW = [WriteQuarterRate32] in { -def V_INTERP_P1_F32 : VINTRP < - 0x00000000, - (outs VReg_32:$dst), - (ins VReg_32:$i, i32imm:$attr_chan, i32imm:$attr, M0Reg:$m0), - "V_INTERP_P1_F32 $dst, $i, $attr_chan, $attr, [$m0]", - []> { - let DisableEncoding = "$m0"; -} +defm V_LOG_CLAMP_F32 : VOP1InstSI <vop1<0x26>, "v_log_clamp_f32", VOP_F32_F32>; +defm V_RCP_CLAMP_F32 : VOP1InstSI <vop1<0x28>, "v_rcp_clamp_f32", VOP_F32_F32>; +defm V_RCP_LEGACY_F32 : VOP1InstSI <vop1<0x29>, "v_rcp_legacy_f32", VOP_F32_F32>; +defm V_RSQ_CLAMP_F32 : VOP1InstSI <vop1<0x2c>, "v_rsq_clamp_f32", + VOP_F32_F32, AMDGPUrsq_clamped +>; +defm V_RSQ_LEGACY_F32 : VOP1InstSI <vop1<0x2d>, "v_rsq_legacy_f32", + VOP_F32_F32, AMDGPUrsq_legacy +>; -def V_INTERP_P2_F32 : VINTRP < - 0x00000001, - (outs VReg_32:$dst), - (ins VReg_32:$src0, VReg_32:$j, i32imm:$attr_chan, i32imm:$attr, M0Reg:$m0), - "V_INTERP_P2_F32 $dst, [$src0], $j, $attr_chan, $attr, [$m0]", - []> { +} // End let SchedRW = [WriteQuarterRate32] - let Constraints = "$src0 = $dst"; - let DisableEncoding = "$src0,$m0"; +let SchedRW = [WriteDouble] in { -} +defm V_RCP_CLAMP_F64 : VOP1InstSI <vop1<0x30>, "v_rcp_clamp_f64", VOP_F64_F64>; +defm V_RSQ_CLAMP_F64 : VOP1InstSI <vop1<0x32>, "v_rsq_clamp_f64", + VOP_F64_F64, AMDGPUrsq_clamped +>; -def V_INTERP_MOV_F32 : VINTRP < - 0x00000002, - (outs VReg_32:$dst), +} // End SchedRW = [WriteDouble] + +} // End SubtargetPredicate = isSICI + +//===----------------------------------------------------------------------===// +// VINTRP Instructions +//===----------------------------------------------------------------------===// + +// FIXME: Specify SchedRW for VINTRP insturctions. 
+defm V_INTERP_P1_F32 : VINTRP_m < + 0x00000000, "v_interp_p1_f32", + (outs VGPR_32:$dst), + (ins VGPR_32:$i, i32imm:$attr_chan, i32imm:$attr, M0Reg:$m0), + "v_interp_p1_f32 $dst, $i, $attr_chan, $attr, [$m0]", + "$m0">; + +defm V_INTERP_P2_F32 : VINTRP_m < + 0x00000001, "v_interp_p2_f32", + (outs VGPR_32:$dst), + (ins VGPR_32:$src0, VGPR_32:$j, i32imm:$attr_chan, i32imm:$attr, M0Reg:$m0), + "v_interp_p2_f32 $dst, [$src0], $j, $attr_chan, $attr, [$m0]", + "$src0,$m0", + "$src0 = $dst">; + +defm V_INTERP_MOV_F32 : VINTRP_m < + 0x00000002, "v_interp_mov_f32", + (outs VGPR_32:$dst), (ins InterpSlot:$src0, i32imm:$attr_chan, i32imm:$attr, M0Reg:$m0), - "V_INTERP_MOV_F32 $dst, $src0, $attr_chan, $attr, [$m0]", - []> { - let DisableEncoding = "$m0"; -} + "v_interp_mov_f32 $dst, $src0, $attr_chan, $attr, [$m0]", + "$m0">; //===----------------------------------------------------------------------===// // VOP2 Instructions //===----------------------------------------------------------------------===// -def V_CNDMASK_B32_e32 : VOP2 <0x00000000, (outs VReg_32:$dst), - (ins VSrc_32:$src0, VReg_32:$src1, VCCReg:$vcc), - "V_CNDMASK_B32_e32 $dst, $src0, $src1, [$vcc]", - [] ->{ - let DisableEncoding = "$vcc"; -} +defm V_CNDMASK_B32_e64 : VOP3_m_nosrcmod <vop3<0x100>, (outs VGPR_32:$dst), + (ins VSrc_32:$src0, VSrc_32:$src1, SSrc_64:$src2), + "v_cndmask_b32_e64 $dst, $src0, $src1, $src2", + [(set i32:$dst, (select i1:$src2, i32:$src1, i32:$src0))], + "v_cndmask_b32_e64", 3 +>; -def V_CNDMASK_B32_e64 : VOP3 <0x00000100, (outs VReg_32:$dst), - (ins VSrc_32:$src0, VSrc_32:$src1, SSrc_64:$src2, - InstFlag:$abs, InstFlag:$clamp, InstFlag:$omod, InstFlag:$neg), - "V_CNDMASK_B32_e64 $dst, $src0, $src1, $src2, $abs, $clamp, $omod, $neg", - [(set i32:$dst, (select i1:$src2, i32:$src1, i32:$src0))] -> { - let src0_modifiers = 0; - let src1_modifiers = 0; - let src2_modifiers = 0; -} -def V_READLANE_B32 : VOP2 < - 0x00000001, - (outs SReg_32:$vdst), - (ins VReg_32:$src0, SSrc_32:$vsrc1), - 
"V_READLANE_B32 $vdst, $src0, $vsrc1", - [] +let isCommutable = 1 in { +defm V_ADD_F32 : VOP2Inst <vop2<0x3, 0x1>, "v_add_f32", + VOP_F32_F32_F32, fadd >; -def V_WRITELANE_B32 : VOP2 < - 0x00000002, - (outs VReg_32:$vdst), - (ins SReg_32:$src0, SSrc_32:$vsrc1), - "V_WRITELANE_B32 $vdst, $src0, $vsrc1", - [] +defm V_SUB_F32 : VOP2Inst <vop2<0x4, 0x2>, "v_sub_f32", VOP_F32_F32_F32, fsub>; +defm V_SUBREV_F32 : VOP2Inst <vop2<0x5, 0x3>, "v_subrev_f32", + VOP_F32_F32_F32, null_frag, "v_sub_f32" >; +} // End isCommutable = 1 let isCommutable = 1 in { -defm V_ADD_F32 : VOP2_32 <0x00000003, "V_ADD_F32", - [(set f32:$dst, (fadd f32:$src0, f32:$src1))] + +defm V_MUL_LEGACY_F32 : VOP2Inst <vop2<0x7, 0x4>, "v_mul_legacy_f32", + VOP_F32_F32_F32, int_AMDGPU_mul >; -defm V_SUB_F32 : VOP2_32 <0x00000004, "V_SUB_F32", - [(set f32:$dst, (fsub f32:$src0, f32:$src1))] +defm V_MUL_F32 : VOP2Inst <vop2<0x8, 0x5>, "v_mul_f32", + VOP_F32_F32_F32, fmul >; -defm V_SUBREV_F32 : VOP2_32 <0x00000005, "V_SUBREV_F32", [], "V_SUB_F32">; -} // End isCommutable = 1 -defm V_MAC_LEGACY_F32 : VOP2_32 <0x00000006, "V_MAC_LEGACY_F32", []>; +defm V_MUL_I32_I24 : VOP2Inst <vop2<0x9, 0x6>, "v_mul_i32_i24", + VOP_I32_I32_I32, AMDGPUmul_i24 +>; +//defm V_MUL_HI_I32_I24 : VOP2_32 <0x0000000a, "v_mul_hi_i32_i24", []>; +defm V_MUL_U32_U24 : VOP2Inst <vop2<0xb, 0x8>, "v_mul_u32_u24", + VOP_I32_I32_I32, AMDGPUmul_u24 +>; +//defm V_MUL_HI_U32_U24 : VOP2_32 <0x0000000c, "v_mul_hi_u32_u24", []>; -let isCommutable = 1 in { +defm V_MIN_F32 : VOP2Inst <vop2<0xf, 0xa>, "v_min_f32", VOP_F32_F32_F32, + fminnum>; +defm V_MAX_F32 : VOP2Inst <vop2<0x10, 0xb>, "v_max_f32", VOP_F32_F32_F32, + fmaxnum>; +defm V_MIN_I32 : VOP2Inst <vop2<0x11, 0xc>, "v_min_i32", VOP_I32_I32_I32, + AMDGPUsmin +>; +defm V_MAX_I32 : VOP2Inst <vop2<0x12, 0xd>, "v_max_i32", VOP_I32_I32_I32, + AMDGPUsmax +>; +defm V_MIN_U32 : VOP2Inst <vop2<0x13, 0xe>, "v_min_u32", VOP_I32_I32_I32, + AMDGPUumin +>; +defm V_MAX_U32 : VOP2Inst <vop2<0x14, 0xf>, 
"v_max_u32", VOP_I32_I32_I32, + AMDGPUumax +>; -defm V_MUL_LEGACY_F32 : VOP2_32 < - 0x00000007, "V_MUL_LEGACY_F32", - [(set f32:$dst, (int_AMDGPU_mul f32:$src0, f32:$src1))] +// No non-Rev Op on VI +defm V_LSHRREV_B32 : VOP2Inst < + vop2<0x16, 0x10>, "v_lshrrev_b32", VOP_I32_I32_I32, null_frag, + "v_lshr_b32", "v_lshrrev_b32" >; -defm V_MUL_F32 : VOP2_32 <0x00000008, "V_MUL_F32", - [(set f32:$dst, (fmul f32:$src0, f32:$src1))] +// No non-Rev OP on VI +defm V_ASHRREV_I32 : VOP2Inst < + vop2<0x18, 0x11>, "v_ashrrev_i32", VOP_I32_I32_I32, null_frag, + "v_ashr_i32", "v_ashrrev_i32" >; +// No non-Rev OP on VI +defm V_LSHLREV_B32 : VOP2Inst < + vop2<0x1a, 0x12>, "v_lshlrev_b32", VOP_I32_I32_I32, null_frag, + "v_lshl_b32", "v_lshlrev_b32" +>; -defm V_MUL_I32_I24 : VOP2_32 <0x00000009, "V_MUL_I32_I24", - [(set i32:$dst, (AMDGPUmul_i24 i32:$src0, i32:$src1))] +defm V_AND_B32 : VOP2Inst <vop2<0x1b, 0x13>, "v_and_b32", + VOP_I32_I32_I32, and>; +defm V_OR_B32 : VOP2Inst <vop2<0x1c, 0x14>, "v_or_b32", + VOP_I32_I32_I32, or >; -//defm V_MUL_HI_I32_I24 : VOP2_32 <0x0000000a, "V_MUL_HI_I32_I24", []>; -defm V_MUL_U32_U24 : VOP2_32 <0x0000000b, "V_MUL_U32_U24", - [(set i32:$dst, (AMDGPUmul_u24 i32:$src0, i32:$src1))] +defm V_XOR_B32 : VOP2Inst <vop2<0x1d, 0x15>, "v_xor_b32", + VOP_I32_I32_I32, xor >; -//defm V_MUL_HI_U32_U24 : VOP2_32 <0x0000000c, "V_MUL_HI_U32_U24", []>; +defm V_MAC_F32 : VOP2Inst <vop2<0x1f, 0x16>, "v_mac_f32", VOP_F32_F32_F32>; +} // End isCommutable = 1 + +defm V_MADMK_F32 : VOP2Inst <vop2<0x20, 0x17>, "v_madmk_f32", VOP_F32_F32_F32>; + +let isCommutable = 1 in { +defm V_MADAK_F32 : VOP2Inst <vop2<0x21, 0x18>, "v_madak_f32", VOP_F32_F32_F32>; +} // End isCommutable = 1 + +let isCommutable = 1, Defs = [VCC] in { // Carry-out goes to VCC +// No patterns so that the scalar instructions are always selected. +// The scalar versions will be replaced with vector when needed later. 
+ +// V_ADD_I32, V_SUB_I32, and V_SUBREV_I32 where renamed to *_U32 in VI, +// but the VI instructions behave the same as the SI versions. +defm V_ADD_I32 : VOP2bInst <vop2<0x25, 0x19>, "v_add_i32", + VOP_I32_I32_I32, add +>; +defm V_SUB_I32 : VOP2bInst <vop2<0x26, 0x1a>, "v_sub_i32", + VOP_I32_I32_I32, sub +>; -defm V_MIN_LEGACY_F32 : VOP2_32 <0x0000000d, "V_MIN_LEGACY_F32", - [(set f32:$dst, (AMDGPUfmin f32:$src0, f32:$src1))] +defm V_SUBREV_I32 : VOP2bInst <vop2<0x27, 0x1b>, "v_subrev_i32", + VOP_I32_I32_I32, null_frag, "v_sub_i32" >; -defm V_MAX_LEGACY_F32 : VOP2_32 <0x0000000e, "V_MAX_LEGACY_F32", - [(set f32:$dst, (AMDGPUfmax f32:$src0, f32:$src1))] +let Uses = [VCC] in { // Carry-in comes from VCC +defm V_ADDC_U32 : VOP2bInst <vop2<0x28, 0x1c>, "v_addc_u32", + VOP_I32_I32_I32_VCC, adde +>; +defm V_SUBB_U32 : VOP2bInst <vop2<0x29, 0x1d>, "v_subb_u32", + VOP_I32_I32_I32_VCC, sube +>; +defm V_SUBBREV_U32 : VOP2bInst <vop2<0x2a, 0x1e>, "v_subbrev_u32", + VOP_I32_I32_I32_VCC, null_frag, "v_subb_u32" >; -defm V_MIN_F32 : VOP2_32 <0x0000000f, "V_MIN_F32", []>; -defm V_MAX_F32 : VOP2_32 <0x00000010, "V_MAX_F32", []>; -defm V_MIN_I32 : VOP2_32 <0x00000011, "V_MIN_I32", - [(set i32:$dst, (AMDGPUsmin i32:$src0, i32:$src1))]>; -defm V_MAX_I32 : VOP2_32 <0x00000012, "V_MAX_I32", - [(set i32:$dst, (AMDGPUsmax i32:$src0, i32:$src1))]>; -defm V_MIN_U32 : VOP2_32 <0x00000013, "V_MIN_U32", - [(set i32:$dst, (AMDGPUumin i32:$src0, i32:$src1))]>; -defm V_MAX_U32 : VOP2_32 <0x00000014, "V_MAX_U32", - [(set i32:$dst, (AMDGPUumax i32:$src0, i32:$src1))]>; +} // End Uses = [VCC] +} // End isCommutable = 1, Defs = [VCC] -defm V_LSHR_B32 : VOP2_32 <0x00000015, "V_LSHR_B32", - [(set i32:$dst, (srl i32:$src0, i32:$src1))] +// These instructions only exist on SI and CI +let SubtargetPredicate = isSICI in { + +def V_READLANE_B32 : VOP2 < + 0x00000001, + (outs SReg_32:$vdst), + (ins VGPR_32:$src0, SSrc_32:$vsrc1), + "v_readlane_b32 $vdst, $src0, $vsrc1", + [] +>; + +def V_WRITELANE_B32 : 
VOP2 < + 0x00000002, + (outs VGPR_32:$vdst), + (ins SReg_32:$src0, SSrc_32:$vsrc1), + "v_writelane_b32 $vdst, $src0, $vsrc1", + [] >; -defm V_LSHRREV_B32 : VOP2_32 <0x00000016, "V_LSHRREV_B32", [], "V_LSHR_B32">; +let isCommutable = 1 in { +defm V_MAC_LEGACY_F32 : VOP2Inst <vop2<0x6>, "v_mac_legacy_f32", + VOP_F32_F32_F32 +>; +} // End isCommutable = 1 -defm V_ASHR_I32 : VOP2_32 <0x00000017, "V_ASHR_I32", - [(set i32:$dst, (sra i32:$src0, i32:$src1))] +defm V_MIN_LEGACY_F32 : VOP2Inst <vop2<0xd>, "v_min_legacy_f32", + VOP_F32_F32_F32, AMDGPUfmin_legacy +>; +defm V_MAX_LEGACY_F32 : VOP2Inst <vop2<0xe>, "v_max_legacy_f32", + VOP_F32_F32_F32, AMDGPUfmax_legacy +>; + +let isCommutable = 1 in { +defm V_LSHR_B32 : VOP2Inst <vop2<0x15>, "v_lshr_b32", VOP_I32_I32_I32, srl>; +defm V_ASHR_I32 : VOP2Inst <vop2<0x17>, "v_ashr_i32", + VOP_I32_I32_I32, sra >; -defm V_ASHRREV_I32 : VOP2_32 <0x00000018, "V_ASHRREV_I32", [], "V_ASHR_I32">; let hasPostISelHook = 1 in { +defm V_LSHL_B32 : VOP2Inst <vop2<0x19>, "v_lshl_b32", VOP_I32_I32_I32, shl>; +} -defm V_LSHL_B32 : VOP2_32 <0x00000019, "V_LSHL_B32", - [(set i32:$dst, (shl i32:$src0, i32:$src1))] +} // End isCommutable = 1 + +defm V_BFM_B32 : VOP2Inst <vop2<0x1e>, "v_bfm_b32", VOP_I32_I32_I32, + AMDGPUbfm>; +defm V_BCNT_U32_B32 : VOP2Inst <vop2<0x22>, "v_bcnt_u32_b32", VOP_I32_I32_I32>; +defm V_MBCNT_LO_U32_B32 : VOP2Inst <vop2<0x23>, "v_mbcnt_lo_u32_b32", + VOP_I32_I32_I32 +>; +defm V_MBCNT_HI_U32_B32 : VOP2Inst <vop2<0x24>, "v_mbcnt_hi_u32_b32", + VOP_I32_I32_I32 +>; +defm V_LDEXP_F32 : VOP2Inst <vop2<0x2b>, "v_ldexp_f32", + VOP_F32_F32_I32, AMDGPUldexp >; -} -defm V_LSHLREV_B32 : VOP2_32 <0x0000001a, "V_LSHLREV_B32", [], "V_LSHL_B32">; +////def V_CVT_PKACCUM_U8_F32 : VOP2_U8 <0x0000002c, "v_cvt_pkaccum_u8_f32", []>; +////def V_CVT_PKNORM_I16_F32 : VOP2_I16 <0x0000002d, "v_cvt_pknorm_i16_f32", []>; +////def V_CVT_PKNORM_U16_F32 : VOP2_U16 <0x0000002e, "v_cvt_pknorm_u16_f32", []>; +defm V_CVT_PKRTZ_F16_F32 : VOP2Inst <vop2<0x2f>, 
"v_cvt_pkrtz_f16_f32", + VOP_I32_F32_F32, int_SI_packf16 +>; +////def V_CVT_PK_U16_U32 : VOP2_U16 <0x00000030, "v_cvt_pk_u16_u32", []>; +////def V_CVT_PK_I16_I32 : VOP2_I16 <0x00000031, "v_cvt_pk_i16_i32", []>; -defm V_AND_B32 : VOP2_32 <0x0000001b, "V_AND_B32", - [(set i32:$dst, (and i32:$src0, i32:$src1))]>; -defm V_OR_B32 : VOP2_32 <0x0000001c, "V_OR_B32", - [(set i32:$dst, (or i32:$src0, i32:$src1))] +} // End let SubtargetPredicate = SICI +//===----------------------------------------------------------------------===// +// VOP3 Instructions +//===----------------------------------------------------------------------===// + +let isCommutable = 1 in { +defm V_MAD_LEGACY_F32 : VOP3Inst <vop3<0x140, 0x1c0>, "v_mad_legacy_f32", + VOP_F32_F32_F32_F32 >; -defm V_XOR_B32 : VOP2_32 <0x0000001d, "V_XOR_B32", - [(set i32:$dst, (xor i32:$src0, i32:$src1))] + +defm V_MAD_F32 : VOP3Inst <vop3<0x141, 0x1c1>, "v_mad_f32", + VOP_F32_F32_F32_F32, fmad >; +defm V_MAD_I32_I24 : VOP3Inst <vop3<0x142, 0x1c2>, "v_mad_i32_i24", + VOP_I32_I32_I32_I32, AMDGPUmad_i24 +>; +defm V_MAD_U32_U24 : VOP3Inst <vop3<0x143, 0x1c3>, "v_mad_u32_u24", + VOP_I32_I32_I32_I32, AMDGPUmad_u24 +>; } // End isCommutable = 1 -defm V_BFM_B32 : VOP2_32 <0x0000001e, "V_BFM_B32", - [(set i32:$dst, (AMDGPUbfm i32:$src0, i32:$src1))]>; -defm V_MAC_F32 : VOP2_32 <0x0000001f, "V_MAC_F32", []>; -defm V_MADMK_F32 : VOP2_32 <0x00000020, "V_MADMK_F32", []>; -defm V_MADAK_F32 : VOP2_32 <0x00000021, "V_MADAK_F32", []>; -defm V_BCNT_U32_B32 : VOP2_32 <0x00000022, "V_BCNT_U32_B32", []>; -defm V_MBCNT_LO_U32_B32 : VOP2_32 <0x00000023, "V_MBCNT_LO_U32_B32", []>; -defm V_MBCNT_HI_U32_B32 : VOP2_32 <0x00000024, "V_MBCNT_HI_U32_B32", []>; +defm V_CUBEID_F32 : VOP3Inst <vop3<0x144, 0x1c4>, "v_cubeid_f32", + VOP_F32_F32_F32_F32 +>; +defm V_CUBESC_F32 : VOP3Inst <vop3<0x145, 0x1c5>, "v_cubesc_f32", + VOP_F32_F32_F32_F32 +>; +defm V_CUBETC_F32 : VOP3Inst <vop3<0x146, 0x1c6>, "v_cubetc_f32", + VOP_F32_F32_F32_F32 +>; +defm 
V_CUBEMA_F32 : VOP3Inst <vop3<0x147, 0x1c7>, "v_cubema_f32", + VOP_F32_F32_F32_F32 +>; -let isCommutable = 1, Defs = [VCC] in { // Carry-out goes to VCC -// No patterns so that the scalar instructions are always selected. -// The scalar versions will be replaced with vector when needed later. -defm V_ADD_I32 : VOP2b_32 <0x00000025, "V_ADD_I32", - [(set i32:$dst, (add i32:$src0, i32:$src1))], VSrc_32>; -defm V_SUB_I32 : VOP2b_32 <0x00000026, "V_SUB_I32", - [(set i32:$dst, (sub i32:$src0, i32:$src1))], VSrc_32>; -defm V_SUBREV_I32 : VOP2b_32 <0x00000027, "V_SUBREV_I32", [], VSrc_32, - "V_SUB_I32">; +let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in { +defm V_BFE_U32 : VOP3Inst <vop3<0x148, 0x1c8>, "v_bfe_u32", + VOP_I32_I32_I32_I32, AMDGPUbfe_u32 +>; +defm V_BFE_I32 : VOP3Inst <vop3<0x149, 0x1c9>, "v_bfe_i32", + VOP_I32_I32_I32_I32, AMDGPUbfe_i32 +>; +} -let Uses = [VCC] in { // Carry-in comes from VCC -defm V_ADDC_U32 : VOP2b_32 <0x00000028, "V_ADDC_U32", - [(set i32:$dst, (adde i32:$src0, i32:$src1))], VReg_32>; -defm V_SUBB_U32 : VOP2b_32 <0x00000029, "V_SUBB_U32", - [(set i32:$dst, (sube i32:$src0, i32:$src1))], VReg_32>; -defm V_SUBBREV_U32 : VOP2b_32 <0x0000002a, "V_SUBBREV_U32", [], VReg_32, - "V_SUBB_U32">; -} // End Uses = [VCC] -} // End isCommutable = 1, Defs = [VCC] +defm V_BFI_B32 : VOP3Inst <vop3<0x14a, 0x1ca>, "v_bfi_b32", + VOP_I32_I32_I32_I32, AMDGPUbfi +>; -defm V_LDEXP_F32 : VOP2_32 <0x0000002b, "V_LDEXP_F32", []>; -////def V_CVT_PKACCUM_U8_F32 : VOP2_U8 <0x0000002c, "V_CVT_PKACCUM_U8_F32", []>; -////def V_CVT_PKNORM_I16_F32 : VOP2_I16 <0x0000002d, "V_CVT_PKNORM_I16_F32", []>; -////def V_CVT_PKNORM_U16_F32 : VOP2_U16 <0x0000002e, "V_CVT_PKNORM_U16_F32", []>; -defm V_CVT_PKRTZ_F16_F32 : VOP2_32 <0x0000002f, "V_CVT_PKRTZ_F16_F32", - [(set i32:$dst, (int_SI_packf16 f32:$src0, f32:$src1))] +let isCommutable = 1 in { +defm V_FMA_F32 : VOP3Inst <vop3<0x14b, 0x1cb>, "v_fma_f32", + VOP_F32_F32_F32_F32, fma +>; +defm V_FMA_F64 : VOP3Inst <vop3<0x14c, 
0x1cc>, "v_fma_f64", + VOP_F64_F64_F64_F64, fma >; -////def V_CVT_PK_U16_U32 : VOP2_U16 <0x00000030, "V_CVT_PK_U16_U32", []>; -////def V_CVT_PK_I16_I32 : VOP2_I16 <0x00000031, "V_CVT_PK_I16_I32", []>; +} // End isCommutable = 1 -//===----------------------------------------------------------------------===// -// VOP3 Instructions -//===----------------------------------------------------------------------===// +//def V_LERP_U8 : VOP3_U8 <0x0000014d, "v_lerp_u8", []>; +defm V_ALIGNBIT_B32 : VOP3Inst <vop3<0x14e, 0x1ce>, "v_alignbit_b32", + VOP_I32_I32_I32_I32 +>; +defm V_ALIGNBYTE_B32 : VOP3Inst <vop3<0x14f, 0x1cf>, "v_alignbyte_b32", + VOP_I32_I32_I32_I32 +>; -let neverHasSideEffects = 1 in { +// Only on SI +defm V_MULLIT_F32 : VOP3Inst <vop3<0x150>, "v_mullit_f32", + VOP_F32_F32_F32_F32>; +defm V_MIN3_F32 : VOP3Inst <vop3<0x151>, "v_min3_f32", + VOP_F32_F32_F32_F32, AMDGPUfmin3>; -defm V_MAD_LEGACY_F32 : VOP3_32 <0x00000140, "V_MAD_LEGACY_F32", []>; -defm V_MAD_F32 : VOP3_32 <0x00000141, "V_MAD_F32", - [(set f32:$dst, (fadd (fmul f32:$src0, f32:$src1), f32:$src2))] +defm V_MIN3_I32 : VOP3Inst <vop3<0x152>, "v_min3_i32", + VOP_I32_I32_I32_I32, AMDGPUsmin3 >; -defm V_MAD_I32_I24 : VOP3_32 <0x00000142, "V_MAD_I32_I24", - [(set i32:$dst, (AMDGPUmad_i24 i32:$src0, i32:$src1, i32:$src2))] +defm V_MIN3_U32 : VOP3Inst <vop3<0x153>, "v_min3_u32", + VOP_I32_I32_I32_I32, AMDGPUumin3 >; -defm V_MAD_U32_U24 : VOP3_32 <0x00000143, "V_MAD_U32_U24", - [(set i32:$dst, (AMDGPUmad_u24 i32:$src0, i32:$src1, i32:$src2))] +defm V_MAX3_F32 : VOP3Inst <vop3<0x154>, "v_max3_f32", + VOP_F32_F32_F32_F32, AMDGPUfmax3 +>; +defm V_MAX3_I32 : VOP3Inst <vop3<0x155>, "v_max3_i32", + VOP_I32_I32_I32_I32, AMDGPUsmax3 +>; +defm V_MAX3_U32 : VOP3Inst <vop3<0x156>, "v_max3_u32", + VOP_I32_I32_I32_I32, AMDGPUumax3 +>; +//def V_MED3_F32 : VOP3_MED3 <0x00000157, "v_med3_f32", []>; +//def V_MED3_I32 : VOP3_MED3 <0x00000158, "v_med3_i32", []>; +//def V_MED3_U32 : VOP3_MED3 <0x00000159, "v_med3_u32", []>; 
+//def V_SAD_U8 : VOP3_U8 <0x0000015a, "v_sad_u8", []>; +//def V_SAD_HI_U8 : VOP3_U8 <0x0000015b, "v_sad_hi_u8", []>; +//def V_SAD_U16 : VOP3_U16 <0x0000015c, "v_sad_u16", []>; +defm V_SAD_U32 : VOP3Inst <vop3<0x15d, 0x1dc>, "v_sad_u32", + VOP_I32_I32_I32_I32 +>; +////def V_CVT_PK_U8_F32 : VOP3_U8 <0x0000015e, "v_cvt_pk_u8_f32", []>; +defm V_DIV_FIXUP_F32 : VOP3Inst < + vop3<0x15f, 0x1de>, "v_div_fixup_f32", VOP_F32_F32_F32_F32, AMDGPUdiv_fixup >; -} // End neverHasSideEffects +let SchedRW = [WriteDouble] in { -defm V_CUBEID_F32 : VOP3_32 <0x00000144, "V_CUBEID_F32", []>; -defm V_CUBESC_F32 : VOP3_32 <0x00000145, "V_CUBESC_F32", []>; -defm V_CUBETC_F32 : VOP3_32 <0x00000146, "V_CUBETC_F32", []>; -defm V_CUBEMA_F32 : VOP3_32 <0x00000147, "V_CUBEMA_F32", []>; +defm V_DIV_FIXUP_F64 : VOP3Inst < + vop3<0x160, 0x1df>, "v_div_fixup_f64", VOP_F64_F64_F64_F64, AMDGPUdiv_fixup +>; -let neverHasSideEffects = 1, mayLoad = 0, mayStore = 0 in { -defm V_BFE_U32 : VOP3_32 <0x00000148, "V_BFE_U32", - [(set i32:$dst, (AMDGPUbfe_u32 i32:$src0, i32:$src1, i32:$src2))]>; -defm V_BFE_I32 : VOP3_32 <0x00000149, "V_BFE_I32", - [(set i32:$dst, (AMDGPUbfe_i32 i32:$src0, i32:$src1, i32:$src2))]>; -} +} // let SchedRW = [WriteDouble] -defm V_BFI_B32 : VOP3_32 <0x0000014a, "V_BFI_B32", - [(set i32:$dst, (AMDGPUbfi i32:$src0, i32:$src1, i32:$src2))]>; -defm V_FMA_F32 : VOP3_32 <0x0000014b, "V_FMA_F32", - [(set f32:$dst, (fma f32:$src0, f32:$src1, f32:$src2))] ->; -def V_FMA_F64 : VOP3_64 <0x0000014c, "V_FMA_F64", - [(set f64:$dst, (fma f64:$src0, f64:$src1, f64:$src2))] ->; -//def V_LERP_U8 : VOP3_U8 <0x0000014d, "V_LERP_U8", []>; -defm V_ALIGNBIT_B32 : VOP3_32 <0x0000014e, "V_ALIGNBIT_B32", []>; - -defm V_ALIGNBYTE_B32 : VOP3_32 <0x0000014f, "V_ALIGNBYTE_B32", []>; -defm V_MULLIT_F32 : VOP3_32 <0x00000150, "V_MULLIT_F32", []>; -////def V_MIN3_F32 : VOP3_MIN3 <0x00000151, "V_MIN3_F32", []>; -////def V_MIN3_I32 : VOP3_MIN3 <0x00000152, "V_MIN3_I32", []>; -////def V_MIN3_U32 : VOP3_MIN3 
<0x00000153, "V_MIN3_U32", []>; -////def V_MAX3_F32 : VOP3_MAX3 <0x00000154, "V_MAX3_F32", []>; -////def V_MAX3_I32 : VOP3_MAX3 <0x00000155, "V_MAX3_I32", []>; -////def V_MAX3_U32 : VOP3_MAX3 <0x00000156, "V_MAX3_U32", []>; -////def V_MED3_F32 : VOP3_MED3 <0x00000157, "V_MED3_F32", []>; -////def V_MED3_I32 : VOP3_MED3 <0x00000158, "V_MED3_I32", []>; -////def V_MED3_U32 : VOP3_MED3 <0x00000159, "V_MED3_U32", []>; -//def V_SAD_U8 : VOP3_U8 <0x0000015a, "V_SAD_U8", []>; -//def V_SAD_HI_U8 : VOP3_U8 <0x0000015b, "V_SAD_HI_U8", []>; -//def V_SAD_U16 : VOP3_U16 <0x0000015c, "V_SAD_U16", []>; -defm V_SAD_U32 : VOP3_32 <0x0000015d, "V_SAD_U32", []>; -////def V_CVT_PK_U8_F32 : VOP3_U8 <0x0000015e, "V_CVT_PK_U8_F32", []>; -defm V_DIV_FIXUP_F32 : VOP3_32 <0x0000015f, "V_DIV_FIXUP_F32", - [(set f32:$dst, (AMDGPUdiv_fixup f32:$src0, f32:$src1, f32:$src2))] ->; -def V_DIV_FIXUP_F64 : VOP3_64 <0x00000160, "V_DIV_FIXUP_F64", - [(set f64:$dst, (AMDGPUdiv_fixup f64:$src0, f64:$src1, f64:$src2))] ->; - -def V_LSHL_B64 : VOP3_64_32 <0x00000161, "V_LSHL_B64", - [(set i64:$dst, (shl i64:$src0, i32:$src1))] +defm V_LSHL_B64 : VOP3Inst <vop3<0x161>, "v_lshl_b64", + VOP_I64_I64_I32, shl >; -def V_LSHR_B64 : VOP3_64_32 <0x00000162, "V_LSHR_B64", - [(set i64:$dst, (srl i64:$src0, i32:$src1))] + +// Only on SI +defm V_LSHR_B64 : VOP3Inst <vop3<0x162>, "v_lshr_b64", + VOP_I64_I64_I32, srl >; -def V_ASHR_I64 : VOP3_64_32 <0x00000163, "V_ASHR_I64", - [(set i64:$dst, (sra i64:$src0, i32:$src1))] + +// Only on SI +defm V_ASHR_I64 : VOP3Inst <vop3<0x163>, "v_ashr_i64", + VOP_I64_I64_I32, sra >; +let SchedRW = [WriteDouble] in { let isCommutable = 1 in { -def V_ADD_F64 : VOP3_64 <0x00000164, "V_ADD_F64", []>; -def V_MUL_F64 : VOP3_64 <0x00000165, "V_MUL_F64", []>; -def V_MIN_F64 : VOP3_64 <0x00000166, "V_MIN_F64", []>; -def V_MAX_F64 : VOP3_64 <0x00000167, "V_MAX_F64", []>; +defm V_ADD_F64 : VOP3Inst <vop3<0x164, 0x280>, "v_add_f64", + VOP_F64_F64_F64, fadd +>; +defm V_MUL_F64 : VOP3Inst 
<vop3<0x165, 0x281>, "v_mul_f64", + VOP_F64_F64_F64, fmul +>; + +defm V_MIN_F64 : VOP3Inst <vop3<0x166, 0x282>, "v_min_f64", + VOP_F64_F64_F64, fminnum +>; +defm V_MAX_F64 : VOP3Inst <vop3<0x167, 0x283>, "v_max_f64", + VOP_F64_F64_F64, fmaxnum +>; } // isCommutable = 1 -def V_LDEXP_F64 : VOP3_64 <0x00000168, "V_LDEXP_F64", []>; +defm V_LDEXP_F64 : VOP3Inst <vop3<0x168, 0x284>, "v_ldexp_f64", + VOP_F64_F64_I32, AMDGPUldexp +>; -let isCommutable = 1 in { +} // let SchedRW = [WriteDouble] -defm V_MUL_LO_U32 : VOP3_32 <0x00000169, "V_MUL_LO_U32", []>; -defm V_MUL_HI_U32 : VOP3_32 <0x0000016a, "V_MUL_HI_U32", []>; -defm V_MUL_LO_I32 : VOP3_32 <0x0000016b, "V_MUL_LO_I32", []>; -defm V_MUL_HI_I32 : VOP3_32 <0x0000016c, "V_MUL_HI_I32", []>; +let isCommutable = 1, SchedRW = [WriteQuarterRate32] in { -} // isCommutable = 1 +defm V_MUL_LO_U32 : VOP3Inst <vop3<0x169, 0x285>, "v_mul_lo_u32", + VOP_I32_I32_I32 +>; +defm V_MUL_HI_U32 : VOP3Inst <vop3<0x16a, 0x286>, "v_mul_hi_u32", + VOP_I32_I32_I32 +>; + +defm V_MUL_LO_I32 : VOP3Inst <vop3<0x16b, 0x285>, "v_mul_lo_i32", + VOP_I32_I32_I32 +>; +defm V_MUL_HI_I32 : VOP3Inst <vop3<0x16c, 0x287>, "v_mul_hi_i32", + VOP_I32_I32_I32 +>; + +} // isCommutable = 1, SchedRW = [WriteQuarterRate32] -def V_DIV_SCALE_F32 : VOP3b_32 <0x0000016d, "V_DIV_SCALE_F32", []>; +defm V_DIV_SCALE_F32 : VOP3b_32 <vop3<0x16d, 0x1e0>, "v_div_scale_f32", []>; +let SchedRW = [WriteDouble] in { // Double precision division pre-scale. 
-def V_DIV_SCALE_F64 : VOP3b_64 <0x0000016e, "V_DIV_SCALE_F64", []>; +defm V_DIV_SCALE_F64 : VOP3b_64 <vop3<0x16e, 0x1e1>, "v_div_scale_f64", []>; +} // let SchedRW = [WriteDouble] -defm V_DIV_FMAS_F32 : VOP3_32 <0x0000016f, "V_DIV_FMAS_F32", - [(set f32:$dst, (AMDGPUdiv_fmas f32:$src0, f32:$src1, f32:$src2))] +let isCommutable = 1 in { +defm V_DIV_FMAS_F32 : VOP3Inst <vop3<0x16f, 0x1e2>, "v_div_fmas_f32", + VOP_F32_F32_F32_F32, AMDGPUdiv_fmas >; -def V_DIV_FMAS_F64 : VOP3_64 <0x00000170, "V_DIV_FMAS_F64", - [(set f64:$dst, (AMDGPUdiv_fmas f64:$src0, f64:$src1, f64:$src2))] +let SchedRW = [WriteDouble] in { +defm V_DIV_FMAS_F64 : VOP3Inst <vop3<0x170, 0x1e3>, "v_div_fmas_f64", + VOP_F64_F64_F64_F64, AMDGPUdiv_fmas >; -//def V_MSAD_U8 : VOP3_U8 <0x00000171, "V_MSAD_U8", []>; -//def V_QSAD_U8 : VOP3_U8 <0x00000172, "V_QSAD_U8", []>; -//def V_MQSAD_U8 : VOP3_U8 <0x00000173, "V_MQSAD_U8", []>; -def V_TRIG_PREOP_F64 : VOP3_64_32 <0x00000174, "V_TRIG_PREOP_F64", - [(set f64:$dst, (AMDGPUtrig_preop f64:$src0, i32:$src1))] +} // End SchedRW = [WriteDouble] +} // End isCommutable = 1 + +//def V_MSAD_U8 : VOP3_U8 <0x00000171, "v_msad_u8", []>; +//def V_QSAD_U8 : VOP3_U8 <0x00000172, "v_qsad_u8", []>; +//def V_MQSAD_U8 : VOP3_U8 <0x00000173, "v_mqsad_u8", []>; + +let SchedRW = [WriteDouble] in { +defm V_TRIG_PREOP_F64 : VOP3Inst < + vop3<0x174, 0x292>, "v_trig_preop_f64", VOP_F64_F64_I32, AMDGPUtrig_preop >; +} // let SchedRW = [WriteDouble] + //===----------------------------------------------------------------------===// // Pseudo Instructions //===----------------------------------------------------------------------===// - let isCodeGenOnly = 1, isPseudo = 1 in { -def V_MOV_I1 : InstSI < - (outs VReg_1:$dst), - (ins i1imm:$src), - "", [(set i1:$dst, (imm:$src))] ->; +let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in { +// 64-bit vector move instruction. This is mainly used by the SIFoldOperands +// pass to enable folding of inline immediates. 
+def V_MOV_B64_PSEUDO : InstSI <(outs VReg_64:$dst), (ins VSrc_64:$src0), "", []>; +} // end let hasSideEffects = 0, mayLoad = 0, mayStore = 0 -def V_AND_I1 : InstSI < - (outs VReg_1:$dst), (ins VReg_1:$src0, VReg_1:$src1), "", - [(set i1:$dst, (and i1:$src0, i1:$src1))] ->; - -def V_OR_I1 : InstSI < - (outs VReg_1:$dst), (ins VReg_1:$src0, VReg_1:$src1), "", - [(set i1:$dst, (or i1:$src0, i1:$src1))] ->; - -def V_XOR_I1 : InstSI < - (outs VReg_1:$dst), (ins VReg_1:$src0, VReg_1:$src1), "", - [(set i1:$dst, (xor i1:$src0, i1:$src1))] ->; +let hasSideEffects = 1 in { +def SGPR_USE : InstSI <(outs),(ins), "", []>; +} // SI pseudo instructions. These are used by the CFG structurizer pass // and should be lowered to ISA instructions prior to codegen. @@ -1557,7 +1827,7 @@ def SI_ELSE : InstSI < def SI_LOOP : InstSI < (outs), (ins SReg_64:$saved, brtarget:$target), - "SI_LOOP $saved, $target", + "si_loop $saved, $target", [(int_SI_loop i64:$saved, bb:$target)] >; @@ -1566,35 +1836,35 @@ def SI_LOOP : InstSI < def SI_BREAK : InstSI < (outs SReg_64:$dst), (ins SReg_64:$src), - "SI_ELSE $dst, $src", + "si_else $dst, $src", [(set i64:$dst, (int_SI_break i64:$src))] >; def SI_IF_BREAK : InstSI < (outs SReg_64:$dst), (ins SReg_64:$vcc, SReg_64:$src), - "SI_IF_BREAK $dst, $vcc, $src", + "si_if_break $dst, $vcc, $src", [(set i64:$dst, (int_SI_if_break i1:$vcc, i64:$src))] >; def SI_ELSE_BREAK : InstSI < (outs SReg_64:$dst), (ins SReg_64:$src0, SReg_64:$src1), - "SI_ELSE_BREAK $dst, $src0, $src1", + "si_else_break $dst, $src0, $src1", [(set i64:$dst, (int_SI_else_break i64:$src0, i64:$src1))] >; def SI_END_CF : InstSI < (outs), (ins SReg_64:$saved), - "SI_END_CF $saved", + "si_end_cf $saved", [(int_SI_end_cf i64:$saved)] >; def SI_KILL : InstSI < (outs), (ins VSrc_32:$src), - "SI_KILL $src", + "si_kill $src", [(int_AMDGPU_kill f32:$src)] >; @@ -1603,12 +1873,12 @@ def SI_KILL : InstSI < let Uses = [EXEC], Defs = [EXEC,VCC,M0] in { -//defm SI_ : RegisterLoadStore <VReg_32, 
FRAMEri, ADDRIndirect>; +//defm SI_ : RegisterLoadStore <VGPR_32, FRAMEri, ADDRIndirect>; let UseNamedOperandTable = 1 in { def SI_RegisterLoad : InstSI < - (outs VReg_32:$dst, SReg_64:$temp), + (outs VGPR_32:$dst, SReg_64:$temp), (ins FRAMEri32:$addr, i32imm:$chan), "", [] > { @@ -1618,7 +1888,7 @@ def SI_RegisterLoad : InstSI < class SIRegStore<dag outs> : InstSI < outs, - (ins VReg_32:$val, FRAMEri32:$addr, i32imm:$chan), + (ins VGPR_32:$val, FRAMEri32:$addr, i32imm:$chan), "", [] > { let isRegisterStore = 1; @@ -1634,22 +1904,22 @@ def SI_RegisterStore : SIRegStore<(outs SReg_64:$temp)>; } // End UseNamedOperandTable = 1 def SI_INDIRECT_SRC : InstSI < - (outs VReg_32:$dst, SReg_64:$temp), + (outs VGPR_32:$dst, SReg_64:$temp), (ins unknown:$src, VSrc_32:$idx, i32imm:$off), - "SI_INDIRECT_SRC $dst, $temp, $src, $idx, $off", + "si_indirect_src $dst, $temp, $src, $idx, $off", [] >; class SI_INDIRECT_DST<RegisterClass rc> : InstSI < (outs rc:$dst, SReg_64:$temp), - (ins unknown:$src, VSrc_32:$idx, i32imm:$off, VReg_32:$val), - "SI_INDIRECT_DST $dst, $temp, $src, $idx, $off, $val", + (ins unknown:$src, VSrc_32:$idx, i32imm:$off, VGPR_32:$val), + "si_indirect_dst $dst, $temp, $src, $idx, $off, $val", [] > { let Constraints = "$src = $dst"; } -def SI_INDIRECT_DST_V1 : SI_INDIRECT_DST<VReg_32>; +def SI_INDIRECT_DST_V1 : SI_INDIRECT_DST<VGPR_32>; def SI_INDIRECT_DST_V2 : SI_INDIRECT_DST<VReg_64>; def SI_INDIRECT_DST_V4 : SI_INDIRECT_DST<VReg_128>; def SI_INDIRECT_DST_V8 : SI_INDIRECT_DST<VReg_256>; @@ -1659,24 +1929,10 @@ def SI_INDIRECT_DST_V16 : SI_INDIRECT_DST<VReg_512>; let usesCustomInserter = 1 in { -// This pseudo instruction takes a pointer as input and outputs a resource -// constant that can be used with the ADDR64 MUBUF instructions. 
-def SI_ADDR64_RSRC : InstSI < - (outs SReg_128:$srsrc), - (ins SSrc_64:$ptr), - "", [] ->; - -def SI_BUFFER_RSRC : InstSI < - (outs SReg_128:$srsrc), - (ins SReg_32:$ptr_lo, SReg_32:$ptr_hi, SSrc_32:$data_lo, SSrc_32:$data_hi), - "", [] ->; - def V_SUB_F64 : InstSI < (outs VReg_64:$dst), (ins VReg_64:$src0, VReg_64:$src1), - "V_SUB_F64 $dst, $src0, $src1", + "v_sub_f64 $dst, $src0, $src1", [(set f64:$dst, (fsub f64:$src0, f64:$src1))] >; @@ -1684,18 +1940,20 @@ def V_SUB_F64 : InstSI < multiclass SI_SPILL_SGPR <RegisterClass sgpr_class> { - def _SAVE : InstSI < - (outs VReg_32:$dst), - (ins sgpr_class:$src, i32imm:$frame_idx), - "", [] - >; - - def _RESTORE : InstSI < - (outs sgpr_class:$dst), - (ins VReg_32:$src, i32imm:$frame_idx), - "", [] - >; - + let UseNamedOperandTable = 1 in { + def _SAVE : InstSI < + (outs), + (ins sgpr_class:$src, i32imm:$frame_idx, SReg_64:$scratch_ptr, + SReg_32:$scratch_offset), + "", [] + >; + + def _RESTORE : InstSI < + (outs sgpr_class:$dst), + (ins i32imm:$frame_idx, SReg_64:$scratch_ptr, SReg_32:$scratch_offset), + "", [] + >; + } // End UseNamedOperandTable = 1 } defm SI_SPILL_S32 : SI_SPILL_SGPR <SReg_32>; @@ -1704,6 +1962,30 @@ defm SI_SPILL_S128 : SI_SPILL_SGPR <SReg_128>; defm SI_SPILL_S256 : SI_SPILL_SGPR <SReg_256>; defm SI_SPILL_S512 : SI_SPILL_SGPR <SReg_512>; +multiclass SI_SPILL_VGPR <RegisterClass vgpr_class> { + let UseNamedOperandTable = 1 in { + def _SAVE : InstSI < + (outs), + (ins vgpr_class:$src, i32imm:$frame_idx, SReg_64:$scratch_ptr, + SReg_32:$scratch_offset), + "", [] + >; + + def _RESTORE : InstSI < + (outs vgpr_class:$dst), + (ins i32imm:$frame_idx, SReg_64:$scratch_ptr, SReg_32:$scratch_offset), + "", [] + >; + } // End UseNamedOperandTable = 1 +} + +defm SI_SPILL_V32 : SI_SPILL_VGPR <VGPR_32>; +defm SI_SPILL_V64 : SI_SPILL_VGPR <VReg_64>; +defm SI_SPILL_V96 : SI_SPILL_VGPR <VReg_96>; +defm SI_SPILL_V128 : SI_SPILL_VGPR <VReg_128>; +defm SI_SPILL_V256 : SI_SPILL_VGPR <VReg_256>; +defm SI_SPILL_V512 : 
SI_SPILL_VGPR <VReg_512>; + let Defs = [SCC] in { def SI_CONSTDATA_PTR : InstSI < @@ -1716,13 +1998,15 @@ def SI_CONSTDATA_PTR : InstSI < } // end IsCodeGenOnly, isPseudo -} // end SubtargetPredicate = SI +} // end SubtargetPredicate = isGCN -let Predicates = [isSI] in { +let Predicates = [isGCN] in { def : Pat< (int_AMDGPU_cndlt f32:$src0, f32:$src1, f32:$src2), - (V_CNDMASK_B32_e64 $src2, $src1, (V_CMP_GT_F32_e64 0, $src0)) + (V_CNDMASK_B32_e64 $src2, $src1, + (V_CMP_GT_F32_e64 SRCMODS.NONE, 0, SRCMODS.NONE, $src0, + DSTCLAMP.NONE, DSTOMOD.NONE)) >; def : Pat < @@ -1730,12 +2014,16 @@ def : Pat < (SI_KILL 0xbf800000) >; +let Predicates = [isSICI] in { + /* int_SI_vs_load_input */ def : Pat< (SIload_input v4i32:$tlst, imm:$attr_offset, i32:$buf_idx_vgpr), (BUFFER_LOAD_FORMAT_XYZW_IDXEN $tlst, $buf_idx_vgpr, imm:$attr_offset, 0, 0, 0, 0) >; +} // End Predicates = [isSICI] + /* int_SI_export */ def : Pat < (int_SI_export imm:$en, imm:$vm, imm:$done, imm:$tgt, imm:$compr, @@ -1750,7 +2038,7 @@ def : Pat < multiclass SMRD_Pattern <SMRD Instr_IMM, SMRD Instr_SGPR, ValueType vt> { - // 1. Offset as 8bit DWORD immediate + // 1. SI-CI: Offset as 8bit DWORD immediate def : Pat < (constant_load (add i64:$sbase, (i64 IMM8bitDWORD:$offset))), (vt (Instr_IMM $sbase, (as_dword_i32imm $offset))) @@ -1769,6 +2057,28 @@ multiclass SMRD_Pattern <SMRD Instr_IMM, SMRD Instr_SGPR, ValueType vt> { >; } +multiclass SMRD_Pattern_vi <SMRD Instr_IMM, SMRD Instr_SGPR, ValueType vt> { + + // 1. VI: Offset as 20bit immediate in bytes + def : Pat < + (constant_load (add i64:$sbase, (i64 IMM20bit:$offset))), + (vt (Instr_IMM $sbase, (as_i32imm $offset))) + >; + + // 2. Offset loaded in an 32bit SGPR + def : Pat < + (constant_load (add i64:$sbase, (i64 IMM32bit:$offset))), + (vt (Instr_SGPR $sbase, (S_MOV_B32 (i32 (as_i32imm $offset))))) + >; + + // 3. 
No offset at all + def : Pat < + (constant_load i64:$sbase), + (vt (Instr_IMM $sbase, 0)) + >; +} + +let Predicates = [isSICI] in { defm : SMRD_Pattern <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, f32>; defm : SMRD_Pattern <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, i32>; defm : SMRD_Pattern <S_LOAD_DWORDX2_IMM, S_LOAD_DWORDX2_SGPR, v2i32>; @@ -1776,6 +2086,19 @@ defm : SMRD_Pattern <S_LOAD_DWORDX4_IMM, S_LOAD_DWORDX4_SGPR, v4i32>; defm : SMRD_Pattern <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v32i8>; defm : SMRD_Pattern <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v8i32>; defm : SMRD_Pattern <S_LOAD_DWORDX16_IMM, S_LOAD_DWORDX16_SGPR, v16i32>; +} // End Predicates = [isSICI] + +let Predicates = [isVI] in { +defm : SMRD_Pattern_vi <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, f32>; +defm : SMRD_Pattern_vi <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, i32>; +defm : SMRD_Pattern_vi <S_LOAD_DWORDX2_IMM, S_LOAD_DWORDX2_SGPR, v2i32>; +defm : SMRD_Pattern_vi <S_LOAD_DWORDX4_IMM, S_LOAD_DWORDX4_SGPR, v4i32>; +defm : SMRD_Pattern_vi <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v32i8>; +defm : SMRD_Pattern_vi <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v8i32>; +defm : SMRD_Pattern_vi <S_LOAD_DWORDX16_IMM, S_LOAD_DWORDX16_SGPR, v16i32>; +} // End Predicates = [isVI] + +let Predicates = [isSICI] in { // 1. Offset as 8bit DWORD immediate def : Pat < @@ -1783,42 +2106,36 @@ def : Pat < (S_BUFFER_LOAD_DWORD_IMM $sbase, (as_dword_i32imm $offset)) >; +} // End Predicates = [isSICI] + // 2. 
Offset loaded in an 32bit SGPR def : Pat < (SIload_constant v4i32:$sbase, imm:$offset), (S_BUFFER_LOAD_DWORD_SGPR $sbase, (S_MOV_B32 imm:$offset)) >; -} // Predicates = [isSI] in { - //===----------------------------------------------------------------------===// // SOP1 Patterns //===----------------------------------------------------------------------===// -let Predicates = [isSI, isCFDepth0] in { - def : Pat < (i64 (ctpop i64:$src)), - (INSERT_SUBREG (INSERT_SUBREG (i64 (IMPLICIT_DEF)), - (S_BCNT1_I32_B64 $src), sub0), - (S_MOV_B32 0), sub1) + (i64 (REG_SEQUENCE SReg_64, + (S_BCNT1_I32_B64 $src), sub0, + (S_MOV_B32 0), sub1)) >; //===----------------------------------------------------------------------===// // SOP2 Patterns //===----------------------------------------------------------------------===// -// V_ADD_I32_e32/S_ADD_I32 produces carry in VCC/SCC. For the vector +// V_ADD_I32_e32/S_ADD_U32 produces carry in VCC/SCC. For the vector // case, the sgpr-copies pass will fix this to use the vector version. 
def : Pat < (i32 (addc i32:$src0, i32:$src1)), - (S_ADD_I32 $src0, $src1) + (S_ADD_U32 $src0, $src1) >; -} // Predicates = [isSI, isCFDepth0] - -let Predicates = [isSI] in { - //===----------------------------------------------------------------------===// // SOPP Patterns //===----------------------------------------------------------------------===// @@ -1842,49 +2159,9 @@ defm : RsqPat<V_RSQ_F32_e32, f32>; // VOP2 Patterns //===----------------------------------------------------------------------===// -class BinOp64Pat <SDNode node, Instruction inst> : Pat < - (node i64:$src0, i64:$src1), - (INSERT_SUBREG (INSERT_SUBREG (i64 (IMPLICIT_DEF)), - (inst (EXTRACT_SUBREG i64:$src0, sub0), - (EXTRACT_SUBREG i64:$src1, sub0)), sub0), - (inst (EXTRACT_SUBREG i64:$src0, sub1), - (EXTRACT_SUBREG i64:$src1, sub1)), sub1) ->; - -def : BinOp64Pat <or, V_OR_B32_e32>; -def : BinOp64Pat <xor, V_XOR_B32_e32>; - -class SextInReg <ValueType vt, int ShiftAmt> : Pat < - (sext_inreg i32:$src0, vt), - (V_ASHRREV_I32_e32 ShiftAmt, (V_LSHLREV_B32_e32 ShiftAmt, $src0)) ->; - -def : SextInReg <i8, 24>; -def : SextInReg <i16, 16>; - def : Pat < (i32 (add (i32 (ctpop i32:$popcnt)), i32:$val)), - (V_BCNT_U32_B32_e32 $popcnt, $val) ->; - -def : Pat < - (i32 (ctpop i32:$popcnt)), - (V_BCNT_U32_B32_e64 $popcnt, 0, 0, 0) ->; - -def : Pat < - (i64 (ctpop i64:$src)), - (INSERT_SUBREG - (INSERT_SUBREG (i64 (IMPLICIT_DEF)), - (V_BCNT_U32_B32_e32 (EXTRACT_SUBREG $src, sub1), - (V_BCNT_U32_B32_e64 (EXTRACT_SUBREG $src, sub0), 0, 0, 0)), - sub0), - (V_MOV_B32_e32 0), sub1) ->; - -def : Pat < - (addc i32:$src0, i32:$src1), - (V_ADD_I32_e32 $src0, $src1) + (V_BCNT_U32_B32_e64 $popcnt, $val) >; /********** ======================= **********/ @@ -2222,10 +2499,10 @@ foreach Index = 0-15 in { } def : BitConvert <i32, f32, SReg_32>; -def : BitConvert <i32, f32, VReg_32>; +def : BitConvert <i32, f32, VGPR_32>; def : BitConvert <f32, i32, SReg_32>; -def : BitConvert <f32, i32, VReg_32>; +def : BitConvert <f32, 
i32, VGPR_32>; def : BitConvert <i64, f64, VReg_64>; @@ -2258,62 +2535,63 @@ def : BitConvert <v16f32, v16i32, VReg_512>; /********** Src & Dst modifiers **********/ /********** =================== **********/ -def FCLAMP_SI : AMDGPUShaderInst < - (outs VReg_32:$dst), - (ins VSrc_32:$src0), - "FCLAMP_SI $dst, $src0", - [] -> { - let usesCustomInserter = 1; -} - def : Pat < - (AMDGPUclamp f32:$src, (f32 FP_ZERO), (f32 FP_ONE)), - (FCLAMP_SI f32:$src) + (AMDGPUclamp (VOP3Mods0Clamp f32:$src0, i32:$src0_modifiers, i32:$omod), + (f32 FP_ZERO), (f32 FP_ONE)), + (V_ADD_F32_e64 $src0_modifiers, $src0, 0, 0, 1, $omod) >; /********** ================================ **********/ /********** Floating point absolute/negative **********/ /********** ================================ **********/ -// Manipulate the sign bit directly, as e.g. using the source negation modifier -// in V_ADD_F32_e64 $src, 0, [...] does not result in -0.0 for $src == +0.0, -// breaking the piglit *s-floatBitsToInt-neg* tests - -// TODO: Look into not implementing isFNegFree/isFAbsFree for SI, and possibly -// removing these patterns +// Prevent expanding both fneg and fabs. +// FIXME: Should use S_OR_B32 def : Pat < (fneg (fabs f32:$src)), (V_OR_B32_e32 $src, (V_MOV_B32_e32 0x80000000)) /* Set sign bit */ >; -def FABS_SI : AMDGPUShaderInst < - (outs VReg_32:$dst), - (ins VSrc_32:$src0), - "FABS_SI $dst, $src0", - [] -> { - let usesCustomInserter = 1; -} +// FIXME: Should use S_OR_B32 +def : Pat < + (fneg (fabs f64:$src)), + (REG_SEQUENCE VReg_64, + (i32 (EXTRACT_SUBREG f64:$src, sub0)), + sub0, + (V_OR_B32_e32 (EXTRACT_SUBREG f64:$src, sub1), + (V_MOV_B32_e32 0x80000000)), // Set sign bit. 
+ sub1) +>; def : Pat < (fabs f32:$src), - (FABS_SI f32:$src) + (V_AND_B32_e32 $src, (V_MOV_B32_e32 0x7fffffff)) >; -def FNEG_SI : AMDGPUShaderInst < - (outs VReg_32:$dst), - (ins VSrc_32:$src0), - "FNEG_SI $dst, $src0", - [] -> { - let usesCustomInserter = 1; -} - def : Pat < (fneg f32:$src), - (FNEG_SI f32:$src) + (V_XOR_B32_e32 $src, (V_MOV_B32_e32 0x80000000)) +>; + +def : Pat < + (fabs f64:$src), + (REG_SEQUENCE VReg_64, + (i32 (EXTRACT_SUBREG f64:$src, sub0)), + sub0, + (V_AND_B32_e32 (EXTRACT_SUBREG f64:$src, sub1), + (V_MOV_B32_e32 0x7fffffff)), // Set sign bit. + sub1) +>; + +def : Pat < + (fneg f64:$src), + (REG_SEQUENCE VReg_64, + (i32 (EXTRACT_SUBREG f64:$src, sub0)), + sub0, + (V_XOR_B32_e32 (EXTRACT_SUBREG f64:$src, sub1), + (V_MOV_B32_e32 0x80000000)), + sub1) >; /********** ================== **********/ @@ -2327,7 +2605,7 @@ def : Pat < def : Pat < (SGPRImm<(f32 fpimm)>:$imm), - (S_MOV_B32 fpimm:$imm) + (S_MOV_B32 (f32 (bitcast_fpimm_to_i32 $imm))) >; def : Pat < @@ -2337,7 +2615,7 @@ def : Pat < def : Pat < (f32 fpimm:$imm), - (V_MOV_B32_e32 fpimm:$imm) + (V_MOV_B32_e32 (f32 (bitcast_fpimm_to_i32 $imm))) >; def : Pat < @@ -2345,21 +2623,38 @@ def : Pat < (S_MOV_B64 InlineImm<i64>:$imm) >; +// XXX - Should this use a s_cmp to set SCC? + +// Set to sign-extended 64-bit value (true = -1, false = 0) +def : Pat < + (i1 imm:$imm), + (S_MOV_B64 (i64 (as_i64imm $imm))) +>; + +def : Pat < + (f64 InlineFPImm<f64>:$imm), + (S_MOV_B64 (f64 (bitcast_fpimm_to_i64 InlineFPImm<f64>:$imm))) +>; + /********** ===================== **********/ /********** Interpolation Paterns **********/ /********** ===================== **********/ +// The value of $params is constant through out the entire kernel. +// We need to use S_MOV_B32 $params, because CSE ignores copies, so +// without it we end up with a lot of redundant moves. 
+ def : Pat < (int_SI_fs_constant imm:$attr_chan, imm:$attr, i32:$params), - (V_INTERP_MOV_F32 INTERP.P0, imm:$attr_chan, imm:$attr, $params) + (V_INTERP_MOV_F32 INTERP.P0, imm:$attr_chan, imm:$attr, (S_MOV_B32 $params)) >; def : Pat < - (int_SI_fs_interp imm:$attr_chan, imm:$attr, M0Reg:$params, v2i32:$ij), + (int_SI_fs_interp imm:$attr_chan, imm:$attr, i32:$params, v2i32:$ij), (V_INTERP_P2_F32 (V_INTERP_P1_F32 (EXTRACT_SUBREG v2i32:$ij, sub0), - imm:$attr_chan, imm:$attr, i32:$params), + imm:$attr_chan, imm:$attr, (S_MOV_B32 $params)), (EXTRACT_SUBREG $ij, sub1), - imm:$attr_chan, imm:$attr, $params) + imm:$attr_chan, imm:$attr, (S_MOV_B32 $params)) >; /********** ================== **********/ @@ -2376,28 +2671,30 @@ def : Pat < def : Pat< (fdiv f64:$src0, f64:$src1), - (V_MUL_F64 $src0, (V_RCP_F64_e32 $src1), (i64 0)) + (V_MUL_F64 0 /* src0_modifiers */, $src0, + 0 /* src1_modifiers */, (V_RCP_F64_e32 $src1), + 0 /* clamp */, 0 /* omod */) >; def : Pat < (int_AMDGPU_cube v4f32:$src), - (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), - (V_CUBETC_F32 (EXTRACT_SUBREG $src, sub0), - (EXTRACT_SUBREG $src, sub1), - (EXTRACT_SUBREG $src, sub2)), - sub0), - (V_CUBESC_F32 (EXTRACT_SUBREG $src, sub0), - (EXTRACT_SUBREG $src, sub1), - (EXTRACT_SUBREG $src, sub2)), - sub1), - (V_CUBEMA_F32 (EXTRACT_SUBREG $src, sub0), - (EXTRACT_SUBREG $src, sub1), - (EXTRACT_SUBREG $src, sub2)), - sub2), - (V_CUBEID_F32 (EXTRACT_SUBREG $src, sub0), - (EXTRACT_SUBREG $src, sub1), - (EXTRACT_SUBREG $src, sub2)), - sub3) + (REG_SEQUENCE VReg_128, + (V_CUBETC_F32 0 /* src0_modifiers */, (EXTRACT_SUBREG $src, sub0), + 0 /* src1_modifiers */, (EXTRACT_SUBREG $src, sub1), + 0 /* src2_modifiers */, (EXTRACT_SUBREG $src, sub2), + 0 /* clamp */, 0 /* omod */), sub0, + (V_CUBESC_F32 0 /* src0_modifiers */, (EXTRACT_SUBREG $src, sub0), + 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub1), + 0 /* src2_modifiers */,(EXTRACT_SUBREG $src, sub2), + 0 /* clamp */, 0 /* 
omod */), sub1, + (V_CUBEMA_F32 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub0), + 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub1), + 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub2), + 0 /* clamp */, 0 /* omod */), sub2, + (V_CUBEID_F32 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub0), + 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub1), + 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub2), + 0 /* clamp */, 0 /* omod */), sub3) >; def : Pat < @@ -2413,12 +2710,16 @@ class Ext32Pat <SDNode ext> : Pat < def : Ext32Pat <zext>; def : Ext32Pat <anyext>; +let Predicates = [isSICI] in { + // Offset in an 32Bit VGPR def : Pat < (SIload_constant v4i32:$sbase, i32:$voff), (BUFFER_LOAD_DWORD_OFFEN $sbase, $voff, 0, 0, 0, 0, 0) >; +} // End Predicates = [isSICI] + // The multiplication scales from [0,1] to the unsigned integer range def : Pat < (AMDGPUurecip i32:$src0), @@ -2427,12 +2728,16 @@ def : Pat < (V_RCP_IFLAG_F32_e32 (V_CVT_F32_U32_e32 $src0)))) >; +let Predicates = [isSICI] in { + def : Pat < (int_SI_tid), (V_MBCNT_HI_U32_B32_e32 0xffffffff, - (V_MBCNT_LO_U32_B32_e64 0xffffffff, 0, 0, 0)) + (V_MBCNT_LO_U32_B32_e64 0xffffffff, 0)) >; +} + //===----------------------------------------------------------------------===// // VOP3 Patterns //===----------------------------------------------------------------------===// @@ -2441,84 +2746,74 @@ def : IMad24Pat<V_MAD_I32_I24>; def : UMad24Pat<V_MAD_U32_U24>; def : Pat < - (fadd f64:$src0, f64:$src1), - (V_ADD_F64 $src0, $src1, (i64 0)) ->; - -def : Pat < - (fmul f64:$src0, f64:$src1), - (V_MUL_F64 $src0, $src1, (i64 0)) ->; - -def : Pat < - (mul i32:$src0, i32:$src1), - (V_MUL_LO_I32 $src0, $src1, (i32 0)) ->; - -def : Pat < (mulhu i32:$src0, i32:$src1), - (V_MUL_HI_U32 $src0, $src1, (i32 0)) + (V_MUL_HI_U32 $src0, $src1) >; def : Pat < (mulhs i32:$src0, i32:$src1), - (V_MUL_HI_I32 $src0, $src1, (i32 0)) + (V_MUL_HI_I32 $src0, $src1) >; -defm : BFIPatterns <V_BFI_B32, S_MOV_B32>; +def : Vop3ModPat<V_MAD_F32, 
VOP_F32_F32_F32_F32, AMDGPUmad>; + + +defm : BFIPatterns <V_BFI_B32, S_MOV_B32, SReg_64>; def : ROTRPattern <V_ALIGNBIT_B32>; /********** ======================= **********/ /********** Load/Store Patterns **********/ /********** ======================= **********/ -multiclass DSReadPat <DS inst, ValueType vt, PatFrag frag> { - def : Pat < - (vt (frag (add i32:$ptr, (i32 IMM16bit:$offset)))), - (inst (i1 0), $ptr, (as_i16imm $offset)) - >; +class DSReadPat <DS inst, ValueType vt, PatFrag frag> : Pat < + (vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset))), + (inst (i1 0), $ptr, (as_i16imm $offset), (S_MOV_B32 -1)) +>; - def : Pat < - (frag i32:$src0), - (vt (inst 0, $src0, 0)) - >; -} +def : DSReadPat <DS_READ_I8, i32, sextloadi8_local>; +def : DSReadPat <DS_READ_U8, i32, az_extloadi8_local>; +def : DSReadPat <DS_READ_I16, i32, sextloadi16_local>; +def : DSReadPat <DS_READ_U16, i32, az_extloadi16_local>; +def : DSReadPat <DS_READ_B32, i32, local_load>; -defm : DSReadPat <DS_READ_I8, i32, sextloadi8_local>; -defm : DSReadPat <DS_READ_U8, i32, az_extloadi8_local>; -defm : DSReadPat <DS_READ_I16, i32, sextloadi16_local>; -defm : DSReadPat <DS_READ_U16, i32, az_extloadi16_local>; -defm : DSReadPat <DS_READ_B32, i32, local_load>; -defm : DSReadPat <DS_READ_B64, v2i32, local_load>; +let AddedComplexity = 100 in { -multiclass DSWritePat <DS inst, ValueType vt, PatFrag frag> { - def : Pat < - (frag vt:$value, (add i32:$ptr, (i32 IMM16bit:$offset))), - (inst (i1 0), $ptr, $value, (as_i16imm $offset)) - >; +def : DSReadPat <DS_READ_B64, v2i32, local_load_aligned8bytes>; - def : Pat < - (frag vt:$val, i32:$ptr), - (inst 0, $ptr, $val, 0) - >; -} +} // End AddedComplexity = 100 -defm : DSWritePat <DS_WRITE_B8, i32, truncstorei8_local>; -defm : DSWritePat <DS_WRITE_B16, i32, truncstorei16_local>; -defm : DSWritePat <DS_WRITE_B32, i32, local_store>; -defm : DSWritePat <DS_WRITE_B64, v2i32, local_store>; +def : Pat < + (v2i32 (local_load (DS64Bit4ByteAligned i32:$ptr, i8:$offset0, 
+ i8:$offset1))), + (DS_READ2_B32 (i1 0), $ptr, $offset0, $offset1, (S_MOV_B32 -1)) +>; -multiclass DSAtomicRetPat<DS inst, ValueType vt, PatFrag frag> { - def : Pat < - (frag (add i32:$ptr, (i32 IMM16bit:$offset)), vt:$value), - (inst (i1 0), $ptr, $value, (as_i16imm $offset)) - >; +class DSWritePat <DS inst, ValueType vt, PatFrag frag> : Pat < + (frag vt:$value, (DS1Addr1Offset i32:$ptr, i32:$offset)), + (inst (i1 0), $ptr, $value, (as_i16imm $offset), (S_MOV_B32 -1)) +>; - def : Pat < - (frag i32:$ptr, vt:$val), - (inst 0, $ptr, $val, 0) - >; -} +def : DSWritePat <DS_WRITE_B8, i32, truncstorei8_local>; +def : DSWritePat <DS_WRITE_B16, i32, truncstorei16_local>; +def : DSWritePat <DS_WRITE_B32, i32, local_store>; + +let AddedComplexity = 100 in { + +def : DSWritePat <DS_WRITE_B64, v2i32, local_store_aligned8bytes>; +} // End AddedComplexity = 100 + +def : Pat < + (local_store v2i32:$value, (DS64Bit4ByteAligned i32:$ptr, i8:$offset0, + i8:$offset1)), + (DS_WRITE2_B32 (i1 0), $ptr, (EXTRACT_SUBREG $value, sub0), + (EXTRACT_SUBREG $value, sub1), $offset0, $offset1, + (S_MOV_B32 -1)) +>; + +class DSAtomicRetPat<DS inst, ValueType vt, PatFrag frag> : Pat < + (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$value), + (inst (i1 0), $ptr, $value, (as_i16imm $offset), (S_MOV_B32 -1)) +>; // Special case of DSAtomicRetPat for add / sub 1 -> inc / dec // @@ -2530,69 +2825,56 @@ multiclass DSAtomicRetPat<DS inst, ValueType vt, PatFrag frag> { // We also load this -1 with s_mov_b32 / s_mov_b64 even though this // needs to be a VGPR. The SGPR copy pass will fix this, and it's // easier since there is no v_mov_b64. 
-multiclass DSAtomicIncRetPat<DS inst, ValueType vt, - Instruction LoadImm, PatFrag frag> { - def : Pat < - (frag (add i32:$ptr, (i32 IMM16bit:$offset)), (vt 1)), - (inst (i1 0), $ptr, (LoadImm (vt -1)), (as_i16imm $offset)) - >; - - def : Pat < - (frag i32:$ptr, (vt 1)), - (inst 0, $ptr, (LoadImm (vt -1)), 0) - >; -} +class DSAtomicIncRetPat<DS inst, ValueType vt, + Instruction LoadImm, PatFrag frag> : Pat < + (frag (DS1Addr1Offset i32:$ptr, i32:$offset), (vt 1)), + (inst (i1 0), $ptr, (LoadImm (vt -1)), (as_i16imm $offset), (S_MOV_B32 -1)) +>; -multiclass DSAtomicCmpXChg <DS inst, ValueType vt, PatFrag frag> { - def : Pat < - (frag (add i32:$ptr, (i32 IMM16bit:$offset)), vt:$cmp, vt:$swap), - (inst (i1 0), $ptr, $cmp, $swap, (as_i16imm $offset)) - >; - def : Pat < - (frag i32:$ptr, vt:$cmp, vt:$swap), - (inst 0, $ptr, $cmp, $swap, 0) - >; -} +class DSAtomicCmpXChg <DS inst, ValueType vt, PatFrag frag> : Pat < + (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$cmp, vt:$swap), + (inst (i1 0), $ptr, $cmp, $swap, (as_i16imm $offset), (S_MOV_B32 -1)) +>; // 32-bit atomics. 
-defm : DSAtomicIncRetPat<DS_INC_RTN_U32, i32, - S_MOV_B32, atomic_load_add_local>; -defm : DSAtomicIncRetPat<DS_DEC_RTN_U32, i32, - S_MOV_B32, atomic_load_sub_local>; - -defm : DSAtomicRetPat<DS_WRXCHG_RTN_B32, i32, atomic_swap_local>; -defm : DSAtomicRetPat<DS_ADD_RTN_U32, i32, atomic_load_add_local>; -defm : DSAtomicRetPat<DS_SUB_RTN_U32, i32, atomic_load_sub_local>; -defm : DSAtomicRetPat<DS_AND_RTN_B32, i32, atomic_load_and_local>; -defm : DSAtomicRetPat<DS_OR_RTN_B32, i32, atomic_load_or_local>; -defm : DSAtomicRetPat<DS_XOR_RTN_B32, i32, atomic_load_xor_local>; -defm : DSAtomicRetPat<DS_MIN_RTN_I32, i32, atomic_load_min_local>; -defm : DSAtomicRetPat<DS_MAX_RTN_I32, i32, atomic_load_max_local>; -defm : DSAtomicRetPat<DS_MIN_RTN_U32, i32, atomic_load_umin_local>; -defm : DSAtomicRetPat<DS_MAX_RTN_U32, i32, atomic_load_umax_local>; - -defm : DSAtomicCmpXChg<DS_CMPST_RTN_B32, i32, atomic_cmp_swap_32_local>; +def : DSAtomicIncRetPat<DS_INC_RTN_U32, i32, + S_MOV_B32, atomic_load_add_local>; +def : DSAtomicIncRetPat<DS_DEC_RTN_U32, i32, + S_MOV_B32, atomic_load_sub_local>; + +def : DSAtomicRetPat<DS_WRXCHG_RTN_B32, i32, atomic_swap_local>; +def : DSAtomicRetPat<DS_ADD_RTN_U32, i32, atomic_load_add_local>; +def : DSAtomicRetPat<DS_SUB_RTN_U32, i32, atomic_load_sub_local>; +def : DSAtomicRetPat<DS_AND_RTN_B32, i32, atomic_load_and_local>; +def : DSAtomicRetPat<DS_OR_RTN_B32, i32, atomic_load_or_local>; +def : DSAtomicRetPat<DS_XOR_RTN_B32, i32, atomic_load_xor_local>; +def : DSAtomicRetPat<DS_MIN_RTN_I32, i32, atomic_load_min_local>; +def : DSAtomicRetPat<DS_MAX_RTN_I32, i32, atomic_load_max_local>; +def : DSAtomicRetPat<DS_MIN_RTN_U32, i32, atomic_load_umin_local>; +def : DSAtomicRetPat<DS_MAX_RTN_U32, i32, atomic_load_umax_local>; + +def : DSAtomicCmpXChg<DS_CMPST_RTN_B32, i32, atomic_cmp_swap_32_local>; // 64-bit atomics. 
-defm : DSAtomicIncRetPat<DS_INC_RTN_U64, i64, - S_MOV_B64, atomic_load_add_local>; -defm : DSAtomicIncRetPat<DS_DEC_RTN_U64, i64, - S_MOV_B64, atomic_load_sub_local>; +def : DSAtomicIncRetPat<DS_INC_RTN_U64, i64, + S_MOV_B64, atomic_load_add_local>; +def : DSAtomicIncRetPat<DS_DEC_RTN_U64, i64, + S_MOV_B64, atomic_load_sub_local>; -defm : DSAtomicRetPat<DS_WRXCHG_RTN_B64, i64, atomic_swap_local>; -defm : DSAtomicRetPat<DS_ADD_RTN_U64, i64, atomic_load_add_local>; -defm : DSAtomicRetPat<DS_SUB_RTN_U64, i64, atomic_load_sub_local>; -defm : DSAtomicRetPat<DS_AND_RTN_B64, i64, atomic_load_and_local>; -defm : DSAtomicRetPat<DS_OR_RTN_B64, i64, atomic_load_or_local>; -defm : DSAtomicRetPat<DS_XOR_RTN_B64, i64, atomic_load_xor_local>; -defm : DSAtomicRetPat<DS_MIN_RTN_I64, i64, atomic_load_min_local>; -defm : DSAtomicRetPat<DS_MAX_RTN_I64, i64, atomic_load_max_local>; -defm : DSAtomicRetPat<DS_MIN_RTN_U64, i64, atomic_load_umin_local>; -defm : DSAtomicRetPat<DS_MAX_RTN_U64, i64, atomic_load_umax_local>; +def : DSAtomicRetPat<DS_WRXCHG_RTN_B64, i64, atomic_swap_local>; +def : DSAtomicRetPat<DS_ADD_RTN_U64, i64, atomic_load_add_local>; +def : DSAtomicRetPat<DS_SUB_RTN_U64, i64, atomic_load_sub_local>; +def : DSAtomicRetPat<DS_AND_RTN_B64, i64, atomic_load_and_local>; +def : DSAtomicRetPat<DS_OR_RTN_B64, i64, atomic_load_or_local>; +def : DSAtomicRetPat<DS_XOR_RTN_B64, i64, atomic_load_xor_local>; +def : DSAtomicRetPat<DS_MIN_RTN_I64, i64, atomic_load_min_local>; +def : DSAtomicRetPat<DS_MAX_RTN_I64, i64, atomic_load_max_local>; +def : DSAtomicRetPat<DS_MIN_RTN_U64, i64, atomic_load_umin_local>; +def : DSAtomicRetPat<DS_MAX_RTN_U64, i64, atomic_load_umax_local>; -defm : DSAtomicCmpXChg<DS_CMPST_RTN_B64, i64, atomic_cmp_swap_64_local>; +def : DSAtomicCmpXChg<DS_CMPST_RTN_B64, i64, atomic_cmp_swap_64_local>; //===----------------------------------------------------------------------===// @@ -2602,12 +2884,12 @@ defm : DSAtomicCmpXChg<DS_CMPST_RTN_B64, i64, 
atomic_cmp_swap_64_local>; multiclass MUBUFLoad_Pattern <MUBUF Instr_ADDR64, ValueType vt, PatFrag constant_ld> { def : Pat < - (vt (constant_ld (add i64:$ptr, i64:$offset))), - (Instr_ADDR64 (SI_ADDR64_RSRC $ptr), $offset, 0) + (vt (constant_ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i16:$offset))), + (Instr_ADDR64 $srsrc, $vaddr, $offset) >; - } +let Predicates = [isSICI] in { defm : MUBUFLoad_Pattern <BUFFER_LOAD_SBYTE_ADDR64, i32, sextloadi8_constant>; defm : MUBUFLoad_Pattern <BUFFER_LOAD_UBYTE_ADDR64, i32, az_extloadi8_constant>; defm : MUBUFLoad_Pattern <BUFFER_LOAD_SSHORT_ADDR64, i32, sextloadi16_constant>; @@ -2615,6 +2897,7 @@ defm : MUBUFLoad_Pattern <BUFFER_LOAD_USHORT_ADDR64, i32, az_extloadi16_constant defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORD_ADDR64, i32, constant_load>; defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORDX2_ADDR64, v2i32, constant_load>; defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORDX4_ADDR64, v4i32, constant_load>; +} // End Predicates = [isSICI] class MUBUFScratchLoadPat <MUBUF Instr, ValueType vt, PatFrag ld> : Pat < (vt (ld (MUBUFScratch v4i32:$srsrc, i32:$vaddr, @@ -2622,6 +2905,7 @@ class MUBUFScratchLoadPat <MUBUF Instr, ValueType vt, PatFrag ld> : Pat < (Instr $srsrc, $vaddr, $soffset, $offset, 0, 0, 0) >; +let Predicates = [isSICI] in { def : MUBUFScratchLoadPat <BUFFER_LOAD_SBYTE_OFFEN, i32, sextloadi8_private>; def : MUBUFScratchLoadPat <BUFFER_LOAD_UBYTE_OFFEN, i32, extloadi8_private>; def : MUBUFScratchLoadPat <BUFFER_LOAD_SSHORT_OFFEN, i32, sextloadi16_private>; @@ -2629,6 +2913,7 @@ def : MUBUFScratchLoadPat <BUFFER_LOAD_USHORT_OFFEN, i32, extloadi16_private>; def : MUBUFScratchLoadPat <BUFFER_LOAD_DWORD_OFFEN, i32, load_private>; def : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX2_OFFEN, v2i32, load_private>; def : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX4_OFFEN, v4i32, load_private>; +} // End Predicates = [isSICI] // BUFFER_LOAD_DWORD*, addr64=0 multiclass MUBUF_Load_Dword <ValueType vt, MUBUF offset, MUBUF offen, MUBUF idxen, @@ 
-2667,26 +2952,28 @@ multiclass MUBUF_Load_Dword <ValueType vt, MUBUF offset, MUBUF offen, MUBUF idxe >; } +let Predicates = [isSICI] in { defm : MUBUF_Load_Dword <i32, BUFFER_LOAD_DWORD_OFFSET, BUFFER_LOAD_DWORD_OFFEN, BUFFER_LOAD_DWORD_IDXEN, BUFFER_LOAD_DWORD_BOTHEN>; defm : MUBUF_Load_Dword <v2i32, BUFFER_LOAD_DWORDX2_OFFSET, BUFFER_LOAD_DWORDX2_OFFEN, BUFFER_LOAD_DWORDX2_IDXEN, BUFFER_LOAD_DWORDX2_BOTHEN>; defm : MUBUF_Load_Dword <v4i32, BUFFER_LOAD_DWORDX4_OFFSET, BUFFER_LOAD_DWORDX4_OFFEN, BUFFER_LOAD_DWORDX4_IDXEN, BUFFER_LOAD_DWORDX4_BOTHEN>; +} // End Predicates = [isSICI] class MUBUFScratchStorePat <MUBUF Instr, ValueType vt, PatFrag st> : Pat < - (st vt:$value, (MUBUFAddr32 v4i32:$srsrc, i32:$vaddr, i32:$soffset, - u16imm:$offset, i1imm:$offen, i1imm:$idxen, - i1imm:$glc, i1imm:$slc, i1imm:$tfe)), - (Instr $value, $srsrc, $vaddr, $soffset, $offset, $offen, $idxen, - $glc, $slc, $tfe) + (st vt:$value, (MUBUFScratch v4i32:$srsrc, i32:$vaddr, i32:$soffset, + u16imm:$offset)), + (Instr $value, $srsrc, $vaddr, $soffset, $offset, 0, 0, 0) >; -def : MUBUFScratchStorePat <BUFFER_STORE_BYTE, i32, truncstorei8_private>; -def : MUBUFScratchStorePat <BUFFER_STORE_SHORT, i32, truncstorei16_private>; -def : MUBUFScratchStorePat <BUFFER_STORE_DWORD, i32, store_private>; -def : MUBUFScratchStorePat <BUFFER_STORE_DWORDX2, v2i32, store_private>; -def : MUBUFScratchStorePat <BUFFER_STORE_DWORDX4, v4i32, store_private>; +let Predicates = [isSICI] in { +def : MUBUFScratchStorePat <BUFFER_STORE_BYTE_OFFEN, i32, truncstorei8_private>; +def : MUBUFScratchStorePat <BUFFER_STORE_SHORT_OFFEN, i32, truncstorei16_private>; +def : MUBUFScratchStorePat <BUFFER_STORE_DWORD_OFFEN, i32, store_private>; +def : MUBUFScratchStorePat <BUFFER_STORE_DWORDX2_OFFEN, v2i32, store_private>; +def : MUBUFScratchStorePat <BUFFER_STORE_DWORDX4_OFFEN, v4i32, store_private>; +} // End Predicates = [isSICI] /* class MUBUFStore_Pattern <MUBUF Instr, ValueType vt, PatFrag st> : Pat < @@ -2694,11 +2981,13 
@@ class MUBUFStore_Pattern <MUBUF Instr, ValueType vt, PatFrag st> : Pat < (Instr $value, $srsrc, $vaddr, $offset) >; +let Predicates = [isSICI] in { def : MUBUFStore_Pattern <BUFFER_STORE_BYTE_ADDR64, i32, truncstorei8_private>; def : MUBUFStore_Pattern <BUFFER_STORE_SHORT_ADDR64, i32, truncstorei16_private>; def : MUBUFStore_Pattern <BUFFER_STORE_DWORD_ADDR64, i32, store_private>; def : MUBUFStore_Pattern <BUFFER_STORE_DWORDX2_ADDR64, v2i32, store_private>; def : MUBUFStore_Pattern <BUFFER_STORE_DWORDX4_ADDR64, v4i32, store_private>; +} // End Predicates = [isSICI] */ @@ -2725,29 +3014,26 @@ def : MTBUF_StoreResource <v4i32, 4, TBUFFER_STORE_FORMAT_XYZW>; let SubtargetPredicate = isCI in { -// Sea island new arithmetic instructinos -let neverHasSideEffects = 1 in { -defm V_TRUNC_F64 : VOP1_64 <0x00000017, "V_TRUNC_F64", - [(set f64:$dst, (ftrunc f64:$src0))] ->; -defm V_CEIL_F64 : VOP1_64 <0x00000018, "V_CEIL_F64", - [(set f64:$dst, (fceil f64:$src0))] +defm V_QSAD_PK_U16_U8 : VOP3Inst <vop3<0x173>, "v_qsad_pk_u16_u8", + VOP_I32_I32_I32 >; -defm V_FLOOR_F64 : VOP1_64 <0x0000001A, "V_FLOOR_F64", - [(set f64:$dst, (ffloor f64:$src0))] +defm V_MQSAD_U16_U8 : VOP3Inst <vop3<0x172>, "v_mqsad_u16_u8", + VOP_I32_I32_I32 >; -defm V_RNDNE_F64 : VOP1_64 <0x00000019, "V_RNDNE_F64", - [(set f64:$dst, (frint f64:$src0))] +defm V_MQSAD_U32_U8 : VOP3Inst <vop3<0x175>, "v_mqsad_u32_u8", + VOP_I32_I32_I32 >; -defm V_QSAD_PK_U16_U8 : VOP3_32 <0x00000173, "V_QSAD_PK_U16_U8", []>; -defm V_MQSAD_U16_U8 : VOP3_32 <0x000000172, "V_MQSAD_U16_U8", []>; -defm V_MQSAD_U32_U8 : VOP3_32 <0x00000175, "V_MQSAD_U32_U8", []>; -def V_MAD_U64_U32 : VOP3_64 <0x00000176, "V_MAD_U64_U32", []>; +let isCommutable = 1 in { +defm V_MAD_U64_U32 : VOP3Inst <vop3<0x176>, "v_mad_u64_u32", + VOP_I64_I32_I32_I64 +>; // XXX - Does this set VCC? 
-def V_MAD_I64_I32 : VOP3_64 <0x00000177, "V_MAD_I64_I32", []>; -} // End neverHasSideEffects = 1 +defm V_MAD_I64_I32 : VOP3Inst <vop3<0x177>, "v_mad_i64_i32", + VOP_I64_I32_I32_I64 +>; +} // End isCommutable = 1 // Remaining instructions: // FLAT_* @@ -2756,8 +3042,6 @@ def V_MAD_I64_I32 : VOP3_64 <0x00000177, "V_MAD_I64_I32", []>; // S_CBRANCH_CDBGSYS_OR_USER // S_CBRANCH_CDBGSYS_AND_USER // S_DCACHE_INV_VOL -// V_EXP_LEGACY_F32 -// V_LOG_LEGACY_F32 // DS_NOP // DS_GWS_SEMA_RELEASE_ALL // DS_WRAP_RTN_B32 @@ -2770,8 +3054,39 @@ def V_MAD_I64_I32 : VOP3_64 <0x00000177, "V_MAD_I64_I32", []>; // BUFFER_LOAD_DWORDX3 // BUFFER_STORE_DWORDX3 -} // End iSCI +} // End isCI + +//===----------------------------------------------------------------------===// +// Flat Patterns +//===----------------------------------------------------------------------===// + +class FLATLoad_Pattern <FLAT Instr_ADDR64, ValueType vt, + PatFrag flat_ld> : + Pat <(vt (flat_ld i64:$ptr)), + (Instr_ADDR64 $ptr) +>; + +def : FLATLoad_Pattern <FLAT_LOAD_SBYTE, i32, sextloadi8_flat>; +def : FLATLoad_Pattern <FLAT_LOAD_UBYTE, i32, az_extloadi8_flat>; +def : FLATLoad_Pattern <FLAT_LOAD_SSHORT, i32, sextloadi16_flat>; +def : FLATLoad_Pattern <FLAT_LOAD_USHORT, i32, az_extloadi16_flat>; +def : FLATLoad_Pattern <FLAT_LOAD_DWORD, i32, flat_load>; +def : FLATLoad_Pattern <FLAT_LOAD_DWORDX2, i64, flat_load>; +def : FLATLoad_Pattern <FLAT_LOAD_DWORDX2, i64, az_extloadi32_flat>; +def : FLATLoad_Pattern <FLAT_LOAD_DWORDX2, v2i32, flat_load>; +def : FLATLoad_Pattern <FLAT_LOAD_DWORDX4, v4i32, flat_load>; + +class FLATStore_Pattern <FLAT Instr, ValueType vt, PatFrag st> : + Pat <(st vt:$value, i64:$ptr), + (Instr $value, $ptr) + >; +def : FLATStore_Pattern <FLAT_STORE_BYTE, i32, truncstorei8_flat>; +def : FLATStore_Pattern <FLAT_STORE_SHORT, i32, truncstorei16_flat>; +def : FLATStore_Pattern <FLAT_STORE_DWORD, i32, flat_store>; +def : FLATStore_Pattern <FLAT_STORE_DWORDX2, i64, flat_store>; +def : 
FLATStore_Pattern <FLAT_STORE_DWORDX2, v2i32, flat_store>; +def : FLATStore_Pattern <FLAT_STORE_DWORDX4, v4i32, flat_store>; /********** ====================== **********/ /********** Indirect adressing **********/ @@ -2821,44 +3136,37 @@ defm : SI_INDIRECT_Pattern <v16i32, i32, SI_INDIRECT_DST_V16>; def : Pat<(i32 (sext_inreg i32:$src, i1)), (S_BFE_I32 i32:$src, 65536)>; // 0 | 1 << 16 -// TODO: Match 64-bit BFE. SI has a 64-bit BFE, but it's scalar only so it -// might not be worth the effort, and will need to expand to shifts when -// fixing SGPR copies. - // Handle sext_inreg in i64 def : Pat < (i64 (sext_inreg i64:$src, i1)), - (INSERT_SUBREG (INSERT_SUBREG (i64 (IMPLICIT_DEF)), - (S_BFE_I32 (EXTRACT_SUBREG i64:$src, sub0), 65536), sub0), // 0 | 1 << 16 - (S_MOV_B32 -1), sub1) + (S_BFE_I64 i64:$src, 0x10000) // 0 | 1 << 16 >; def : Pat < (i64 (sext_inreg i64:$src, i8)), - (INSERT_SUBREG (INSERT_SUBREG (i64 (IMPLICIT_DEF)), - (S_SEXT_I32_I8 (EXTRACT_SUBREG i64:$src, sub0)), sub0), - (S_MOV_B32 -1), sub1) + (S_BFE_I64 i64:$src, 0x80000) // 0 | 8 << 16 >; def : Pat < (i64 (sext_inreg i64:$src, i16)), - (INSERT_SUBREG (INSERT_SUBREG (i64 (IMPLICIT_DEF)), - (S_SEXT_I32_I16 (EXTRACT_SUBREG i64:$src, sub0)), sub0), - (S_MOV_B32 -1), sub1) + (S_BFE_I64 i64:$src, 0x100000) // 0 | 16 << 16 +>; + +def : Pat < + (i64 (sext_inreg i64:$src, i32)), + (S_BFE_I64 i64:$src, 0x200000) // 0 | 32 << 16 >; class ZExt_i64_i32_Pat <SDNode ext> : Pat < (i64 (ext i32:$src)), - (INSERT_SUBREG (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $src, sub0), - (S_MOV_B32 0), sub1) + (REG_SEQUENCE SReg_64, $src, sub0, (S_MOV_B32 0), sub1) >; class ZExt_i64_i1_Pat <SDNode ext> : Pat < (i64 (ext i1:$src)), - (INSERT_SUBREG - (INSERT_SUBREG (i64 (IMPLICIT_DEF)), - (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src), sub0), - (S_MOV_B32 0), sub1) + (REG_SEQUENCE VReg_64, + (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src), sub0, + (S_MOV_B32 0), sub1) >; @@ -2869,20 +3177,38 @@ def : ZExt_i64_i1_Pat<anyext>; def : Pat < 
(i64 (sext i32:$src)), - (INSERT_SUBREG - (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $src, sub0), - (S_ASHR_I32 $src, 31), sub1) + (REG_SEQUENCE SReg_64, $src, sub0, + (S_ASHR_I32 $src, 31), sub1) >; def : Pat < (i64 (sext i1:$src)), - (INSERT_SUBREG - (INSERT_SUBREG - (i64 (IMPLICIT_DEF)), - (V_CNDMASK_B32_e64 0, -1, $src), sub0), + (REG_SEQUENCE VReg_64, + (V_CNDMASK_B32_e64 0, -1, $src), sub0, (V_CNDMASK_B32_e64 0, -1, $src), sub1) >; +// If we need to perform a logical operation on i1 values, we need to +// use vector comparisons since there is only one SCC register. Vector +// comparisions still write to a pair of SGPRs, so treat these as +// 64-bit comparisons. When legalizing SGPR copies, instructions +// resulting in the copies from SCC to these instructions will be +// moved to the VALU. +def : Pat < + (i1 (and i1:$src0, i1:$src1)), + (S_AND_B64 $src0, $src1) +>; + +def : Pat < + (i1 (or i1:$src0, i1:$src1)), + (S_OR_B64 $src0, $src1) +>; + +def : Pat < + (i1 (xor i1:$src0, i1:$src1)), + (S_XOR_B64 $src0, $src1) +>; + def : Pat < (f32 (sint_to_fp i1:$src)), (V_CNDMASK_B32_e64 (i32 0), CONST.FP32_NEG_ONE, $src) @@ -2895,7 +3221,7 @@ def : Pat < def : Pat < (f64 (sint_to_fp i1:$src)), - (V_CVT_F64_I32_e32 (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src)) + (V_CVT_F64_I32_e32 (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src)) >; def : Pat < @@ -2914,13 +3240,25 @@ def : Pat < def : Pat < (i1 (trunc i32:$a)), - (V_CMP_EQ_I32_e64 (V_AND_B32_e32 (i32 1), $a), 1) + (V_CMP_EQ_I32_e64 (V_AND_B32_e64 (i32 1), $a), 1) +>; + +def : Pat < + (i32 (bswap i32:$a)), + (V_BFI_B32 (S_MOV_B32 0x00ff00ff), + (V_ALIGNBIT_B32 $a, $a, 24), + (V_ALIGNBIT_B32 $a, $a, 8)) +>; + +def : Pat < + (f32 (select i1:$src2, f32:$src1, f32:$src0)), + (V_CNDMASK_B32_e64 $src0, $src1, $src2) >; //============================================================================// // Miscellaneous Optimization Patterns //============================================================================// -def : 
SHA256MaPattern <V_BFI_B32, V_XOR_B32_e32>; +def : SHA256MaPattern <V_BFI_B32, V_XOR_B32_e64>; -} // End isSI predicate +} // End isGCN predicate diff --git a/lib/Target/R600/SILoadStoreOptimizer.cpp b/lib/Target/R600/SILoadStoreOptimizer.cpp new file mode 100644 index 000000000000..0cb67465012d --- /dev/null +++ b/lib/Target/R600/SILoadStoreOptimizer.cpp @@ -0,0 +1,434 @@ +//===-- SILoadStoreOptimizer.cpp ------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass tries to fuse DS instructions with close by immediate offsets. +// This will fuse operations such as +// ds_read_b32 v0, v2 offset:16 +// ds_read_b32 v1, v2 offset:32 +// ==> +// ds_read2_b32 v[0:1], v2, offset0:4 offset1:8 +// +// +// Future improvements: +// +// - This currently relies on the scheduler to place loads and stores next to +// each other, and then only merges adjacent pairs of instructions. It would +// be good to be more flexible with interleaved instructions, and possibly run +// before scheduling. It currently missing stores of constants because loading +// the constant into the data register is placed between the stores, although +// this is arguably a scheduling problem. +// +// - Live interval recomputing seems inefficient. This currently only matches +// one pair, and recomputes live intervals and moves on to the next pair. It +// would be better to compute a list of all merges that need to occur +// +// - With a list of instructions to process, we can also merge more. If a +// cluster of loads have offsets that are too large to fit in the 8-bit +// offsets, but are close enough to fit in the 8 bits, we can add to the base +// pointer and use the new reduced offsets. 
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "SIInstrInfo.h" +#include "SIRegisterInfo.h" +#include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/CodeGen/LiveVariables.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Support/Debug.h" +#include "llvm/Target/TargetMachine.h" + +using namespace llvm; + +#define DEBUG_TYPE "si-load-store-opt" + +namespace { + +class SILoadStoreOptimizer : public MachineFunctionPass { +private: + const TargetMachine *TM; + const SIInstrInfo *TII; + const SIRegisterInfo *TRI; + MachineRegisterInfo *MRI; + LiveIntervals *LIS; + + + static bool offsetsCanBeCombined(unsigned Offset0, + unsigned Offset1, + unsigned EltSize); + + MachineBasicBlock::iterator findMatchingDSInst(MachineBasicBlock::iterator I, + unsigned EltSize); + + void updateRegDefsUses(unsigned SrcReg, + unsigned DstReg, + unsigned SubIdx); + + MachineBasicBlock::iterator mergeRead2Pair( + MachineBasicBlock::iterator I, + MachineBasicBlock::iterator Paired, + unsigned EltSize); + + MachineBasicBlock::iterator mergeWrite2Pair( + MachineBasicBlock::iterator I, + MachineBasicBlock::iterator Paired, + unsigned EltSize); + +public: + static char ID; + + SILoadStoreOptimizer() : + MachineFunctionPass(ID), + TM(nullptr), + TII(nullptr), + TRI(nullptr), + MRI(nullptr), + LIS(nullptr) { + + } + + SILoadStoreOptimizer(const TargetMachine &TM_) : + MachineFunctionPass(ID), + TM(&TM_), + TII(static_cast<const SIInstrInfo*>(TM->getSubtargetImpl()->getInstrInfo())) { + initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry()); + } + + bool optimizeBlock(MachineBasicBlock &MBB); + + bool runOnMachineFunction(MachineFunction &MF) override; + + const char *getPassName() const override { + return "SI Load / Store Optimizer"; + } + + void 
getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addPreserved<SlotIndexes>(); + AU.addPreserved<LiveIntervals>(); + AU.addPreserved<LiveVariables>(); + AU.addRequired<LiveIntervals>(); + + MachineFunctionPass::getAnalysisUsage(AU); + } +}; + +} // End anonymous namespace. + +INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE, + "SI Load / Store Optimizer", false, false) +INITIALIZE_PASS_DEPENDENCY(LiveIntervals) +INITIALIZE_PASS_DEPENDENCY(LiveVariables) +INITIALIZE_PASS_DEPENDENCY(SlotIndexes) +INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, + "SI Load / Store Optimizer", false, false) + +char SILoadStoreOptimizer::ID = 0; + +char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID; + +FunctionPass *llvm::createSILoadStoreOptimizerPass(TargetMachine &TM) { + return new SILoadStoreOptimizer(TM); +} + +bool SILoadStoreOptimizer::offsetsCanBeCombined(unsigned Offset0, + unsigned Offset1, + unsigned Size) { + // XXX - Would the same offset be OK? Is there any reason this would happen or + // be useful? + if (Offset0 == Offset1) + return false; + + // This won't be valid if the offset isn't aligned. + if ((Offset0 % Size != 0) || (Offset1 % Size != 0)) + return false; + + unsigned EltOffset0 = Offset0 / Size; + unsigned EltOffset1 = Offset1 / Size; + + // Check if the new offsets fit in the reduced 8-bit range. + if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) + return true; + + // If the offset in elements doesn't fit in 8-bits, we might be able to use + // the stride 64 versions. 
+ if ((EltOffset0 % 64 != 0) || (EltOffset1 % 64) != 0) + return false; + + return isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64); +} + +MachineBasicBlock::iterator +SILoadStoreOptimizer::findMatchingDSInst(MachineBasicBlock::iterator I, + unsigned EltSize){ + MachineBasicBlock::iterator E = I->getParent()->end(); + MachineBasicBlock::iterator MBBI = I; + ++MBBI; + + if (MBBI->getOpcode() != I->getOpcode()) + return E; + + // Don't merge volatiles. + if (MBBI->hasOrderedMemoryRef()) + return E; + + int AddrIdx = AMDGPU::getNamedOperandIdx(I->getOpcode(), AMDGPU::OpName::addr); + const MachineOperand &AddrReg0 = I->getOperand(AddrIdx); + const MachineOperand &AddrReg1 = MBBI->getOperand(AddrIdx); + + // Check same base pointer. Be careful of subregisters, which can occur with + // vectors of pointers. + if (AddrReg0.getReg() == AddrReg1.getReg() && + AddrReg0.getSubReg() == AddrReg1.getSubReg()) { + int OffsetIdx = AMDGPU::getNamedOperandIdx(I->getOpcode(), + AMDGPU::OpName::offset); + unsigned Offset0 = I->getOperand(OffsetIdx).getImm() & 0xffff; + unsigned Offset1 = MBBI->getOperand(OffsetIdx).getImm() & 0xffff; + + // Check both offsets fit in the reduced range. + if (offsetsCanBeCombined(Offset0, Offset1, EltSize)) + return MBBI; + } + + return E; +} + +void SILoadStoreOptimizer::updateRegDefsUses(unsigned SrcReg, + unsigned DstReg, + unsigned SubIdx) { + for (MachineRegisterInfo::reg_iterator I = MRI->reg_begin(SrcReg), + E = MRI->reg_end(); I != E; ) { + MachineOperand &O = *I; + ++I; + O.substVirtReg(DstReg, SubIdx, *TRI); + } +} + +MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair( + MachineBasicBlock::iterator I, + MachineBasicBlock::iterator Paired, + unsigned EltSize) { + MachineBasicBlock *MBB = I->getParent(); + + // Be careful, since the addresses could be subregisters themselves in weird + // cases, like vectors of pointers. 
+ const MachineOperand *AddrReg = TII->getNamedOperand(*I, AMDGPU::OpName::addr); + const MachineOperand *M0Reg = TII->getNamedOperand(*I, AMDGPU::OpName::m0); + + unsigned DestReg0 = TII->getNamedOperand(*I, AMDGPU::OpName::vdst)->getReg(); + unsigned DestReg1 + = TII->getNamedOperand(*Paired, AMDGPU::OpName::vdst)->getReg(); + + unsigned Offset0 + = TII->getNamedOperand(*I, AMDGPU::OpName::offset)->getImm() & 0xffff; + unsigned Offset1 + = TII->getNamedOperand(*Paired, AMDGPU::OpName::offset)->getImm() & 0xffff; + + unsigned NewOffset0 = Offset0 / EltSize; + unsigned NewOffset1 = Offset1 / EltSize; + unsigned Opc = (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64; + + // Prefer the st64 form if we can use it, even if we can fit the offset in the + // non st64 version. I'm not sure if there's any real reason to do this. + bool UseST64 = (NewOffset0 % 64 == 0) && (NewOffset1 % 64 == 0); + if (UseST64) { + NewOffset0 /= 64; + NewOffset1 /= 64; + Opc = (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64; + } + + assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) && + (NewOffset0 != NewOffset1) && + "Computed offset doesn't fit"); + + const MCInstrDesc &Read2Desc = TII->get(Opc); + + const TargetRegisterClass *SuperRC + = (EltSize == 4) ? &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass; + unsigned DestReg = MRI->createVirtualRegister(SuperRC); + + DebugLoc DL = I->getDebugLoc(); + MachineInstrBuilder Read2 + = BuildMI(*MBB, I, DL, Read2Desc, DestReg) + .addImm(0) // gds + .addOperand(*AddrReg) // addr + .addImm(NewOffset0) // offset0 + .addImm(NewOffset1) // offset1 + .addOperand(*M0Reg) // M0 + .addMemOperand(*I->memoperands_begin()) + .addMemOperand(*Paired->memoperands_begin()); + + LIS->InsertMachineInstrInMaps(Read2); + + unsigned SubRegIdx0 = (EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1; + unsigned SubRegIdx1 = (EltSize == 4) ? 
AMDGPU::sub1 : AMDGPU::sub2_sub3; + updateRegDefsUses(DestReg0, DestReg, SubRegIdx0); + updateRegDefsUses(DestReg1, DestReg, SubRegIdx1); + + LIS->RemoveMachineInstrFromMaps(I); + LIS->RemoveMachineInstrFromMaps(Paired); + I->eraseFromParent(); + Paired->eraseFromParent(); + + LiveInterval &AddrRegLI = LIS->getInterval(AddrReg->getReg()); + LIS->shrinkToUses(&AddrRegLI); + + LiveInterval &M0RegLI = LIS->getInterval(M0Reg->getReg()); + LIS->shrinkToUses(&M0RegLI); + + // Currently m0 is treated as a register class with one member instead of an + // implicit physical register. We are using the virtual register for the first + // one, but we still need to update the live range of the now unused second m0 + // virtual register to avoid verifier errors. + const MachineOperand *PairedM0Reg + = TII->getNamedOperand(*Paired, AMDGPU::OpName::m0); + LiveInterval &PairedM0RegLI = LIS->getInterval(PairedM0Reg->getReg()); + LIS->shrinkToUses(&PairedM0RegLI); + + LIS->getInterval(DestReg); // Create new LI + + DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n'); + return Read2.getInstr(); +} + +MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair( + MachineBasicBlock::iterator I, + MachineBasicBlock::iterator Paired, + unsigned EltSize) { + MachineBasicBlock *MBB = I->getParent(); + + // Be sure to use .addOperand(), and not .addReg() with these. We want to be + // sure we preserve the subregister index and any register flags set on them. 
+ const MachineOperand *Addr = TII->getNamedOperand(*I, AMDGPU::OpName::addr); + const MachineOperand *M0Reg = TII->getNamedOperand(*I, AMDGPU::OpName::m0); + const MachineOperand *Data0 = TII->getNamedOperand(*I, AMDGPU::OpName::data0); + const MachineOperand *Data1 + = TII->getNamedOperand(*Paired, AMDGPU::OpName::data0); + + + unsigned Offset0 + = TII->getNamedOperand(*I, AMDGPU::OpName::offset)->getImm() & 0xffff; + unsigned Offset1 + = TII->getNamedOperand(*Paired, AMDGPU::OpName::offset)->getImm() & 0xffff; + + unsigned NewOffset0 = Offset0 / EltSize; + unsigned NewOffset1 = Offset1 / EltSize; + unsigned Opc = (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64; + + // Prefer the st64 form if we can use it, even if we can fit the offset in the + // non st64 version. I'm not sure if there's any real reason to do this. + bool UseST64 = (NewOffset0 % 64 == 0) && (NewOffset1 % 64 == 0); + if (UseST64) { + NewOffset0 /= 64; + NewOffset1 /= 64; + Opc = (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 : AMDGPU::DS_WRITE2ST64_B64; + } + + assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) && + (NewOffset0 != NewOffset1) && + "Computed offset doesn't fit"); + + const MCInstrDesc &Write2Desc = TII->get(Opc); + DebugLoc DL = I->getDebugLoc(); + + MachineInstrBuilder Write2 + = BuildMI(*MBB, I, DL, Write2Desc) + .addImm(0) // gds + .addOperand(*Addr) // addr + .addOperand(*Data0) // data0 + .addOperand(*Data1) // data1 + .addImm(NewOffset0) // offset0 + .addImm(NewOffset1) // offset1 + .addOperand(*M0Reg) // m0 + .addMemOperand(*I->memoperands_begin()) + .addMemOperand(*Paired->memoperands_begin()); + + // XXX - How do we express subregisters here? 
+ unsigned OrigRegs[] = { Data0->getReg(), Data1->getReg(), Addr->getReg(), + M0Reg->getReg()}; + + LIS->RemoveMachineInstrFromMaps(I); + LIS->RemoveMachineInstrFromMaps(Paired); + I->eraseFromParent(); + Paired->eraseFromParent(); + + LIS->repairIntervalsInRange(MBB, Write2, Write2, OrigRegs); + + DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n'); + return Write2.getInstr(); +} + +// Scan through looking for adjacent LDS operations with constant offsets from +// the same base register. We rely on the scheduler to do the hard work of +// clustering nearby loads, and assume these are all adjacent. +bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) { + bool Modified = false; + + for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) { + MachineInstr &MI = *I; + + // Don't combine if volatile. + if (MI.hasOrderedMemoryRef()) { + ++I; + continue; + } + + unsigned Opc = MI.getOpcode(); + if (Opc == AMDGPU::DS_READ_B32 || Opc == AMDGPU::DS_READ_B64) { + unsigned Size = (Opc == AMDGPU::DS_READ_B64) ? 8 : 4; + MachineBasicBlock::iterator Match = findMatchingDSInst(I, Size); + if (Match != E) { + Modified = true; + I = mergeRead2Pair(I, Match, Size); + } else { + ++I; + } + + continue; + } else if (Opc == AMDGPU::DS_WRITE_B32 || Opc == AMDGPU::DS_WRITE_B64) { + unsigned Size = (Opc == AMDGPU::DS_WRITE_B64) ? 
8 : 4; + MachineBasicBlock::iterator Match = findMatchingDSInst(I, Size); + if (Match != E) { + Modified = true; + I = mergeWrite2Pair(I, Match, Size); + } else { + ++I; + } + + continue; + } + + ++I; + } + + return Modified; +} + +bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) { + const TargetSubtargetInfo *STM = MF.getTarget().getSubtargetImpl(); + TRI = static_cast<const SIRegisterInfo*>(STM->getRegisterInfo()); + TII = static_cast<const SIInstrInfo*>(STM->getInstrInfo()); + MRI = &MF.getRegInfo(); + + LIS = &getAnalysis<LiveIntervals>(); + + DEBUG(dbgs() << "Running SILoadStoreOptimizer\n"); + + assert(!MRI->isSSA()); + + bool Modified = false; + + for (MachineBasicBlock &MBB : MF) + Modified |= optimizeBlock(MBB); + + return Modified; +} diff --git a/lib/Target/R600/SILowerControlFlow.cpp b/lib/Target/R600/SILowerControlFlow.cpp index 75b5a5e027ff..068b22f37704 100644 --- a/lib/Target/R600/SILowerControlFlow.cpp +++ b/lib/Target/R600/SILowerControlFlow.cpp @@ -49,8 +49,10 @@ //===----------------------------------------------------------------------===// #include "AMDGPU.h" +#include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" +#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -86,7 +88,6 @@ private: void Kill(MachineInstr &MI); void Branch(MachineInstr &MI); - void InitM0ForLDS(MachineBasicBlock::iterator MI); void LoadM0(MachineInstr &MI, MachineInstr *MovRel); void IndirectSrc(MachineInstr &MI); void IndirectDst(MachineInstr &MI); @@ -307,10 +308,9 @@ void SILowerControlFlowPass::Kill(MachineInstr &MI) { #endif // Clear this thread from the exec mask if the operand is negative - if ((Op.isImm() || Op.isFPImm())) { + if ((Op.isImm())) { // Constant operand: Set exec mask to 0 or do nothing - if (Op.isImm() ? 
(Op.getImm() & 0x80000000) : - Op.getFPImm()->isNegative()) { + if (Op.getImm() & 0x80000000) { BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC) .addImm(0); } @@ -323,14 +323,6 @@ void SILowerControlFlowPass::Kill(MachineInstr &MI) { MI.eraseFromParent(); } -/// The m0 register stores the maximum allowable address for LDS reads and -/// writes. Its value must be at least the size in bytes of LDS allocated by -/// the shader. For simplicity, we set it to the maximum possible value. -void SILowerControlFlowPass::InitM0ForLDS(MachineBasicBlock::iterator MI) { - BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32), - AMDGPU::M0).addImm(0xffffffff); -} - void SILowerControlFlowPass::LoadM0(MachineInstr &MI, MachineInstr *MovRel) { MachineBasicBlock &MBB = *MI.getParent(); @@ -347,7 +339,7 @@ void SILowerControlFlowPass::LoadM0(MachineInstr &MI, MachineInstr *MovRel) { } else { assert(AMDGPU::SReg_64RegClass.contains(Save)); - assert(AMDGPU::VReg_32RegClass.contains(Idx)); + assert(AMDGPU::VGPR_32RegClass.contains(Idx)); // Save the EXEC mask BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), Save) @@ -389,12 +381,6 @@ void SILowerControlFlowPass::LoadM0(MachineInstr &MI, MachineInstr *MovRel) { .addReg(Save); } - // FIXME: Are there any values other than the LDS address clamp that need to - // be stored in the m0 register and may be live for more than a few - // instructions? If so, we should save the m0 register at the beginning - // of this function and restore it here. - // FIXME: Add support for LDS direct loads. 
- InitM0ForLDS(&MI); MI.eraseFromParent(); } @@ -442,13 +428,14 @@ void SILowerControlFlowPass::IndirectDst(MachineInstr &MI) { } bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) { - TII = static_cast<const SIInstrInfo*>(MF.getTarget().getInstrInfo()); - TRI = static_cast<const SIRegisterInfo*>(MF.getTarget().getRegisterInfo()); + TII = static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo()); + TRI = + static_cast<const SIRegisterInfo *>(MF.getSubtarget().getRegisterInfo()); SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); bool HaveKill = false; - bool NeedM0 = false; bool NeedWQM = false; + bool NeedFlat = false; unsigned Depth = 0; for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); @@ -460,10 +447,12 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) { Next = std::next(I); MachineInstr &MI = *I; - if (TII->isDS(MI.getOpcode())) { - NeedM0 = true; + if (TII->isDS(MI.getOpcode())) NeedWQM = true; - } + + // Flat uses m0 in case it needs to access LDS. + if (TII->isFLAT(MI.getOpcode())) + NeedFlat = true; switch (MI.getOpcode()) { default: break; @@ -530,23 +519,54 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) { case AMDGPU::V_INTERP_MOV_F32: NeedWQM = true; break; - } } } - if (NeedM0) { - MachineBasicBlock &MBB = MF.front(); - // Initialize M0 to a value that won't cause LDS access to be discarded - // due to offset clamping - InitM0ForLDS(MBB.getFirstNonPHI()); - } - if (NeedWQM && MFI->getShaderType() == ShaderType::PIXEL) { MachineBasicBlock &MBB = MF.front(); BuildMI(MBB, MBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WQM_B64), AMDGPU::EXEC).addReg(AMDGPU::EXEC); } + // FIXME: This seems inappropriate to do here. + if (NeedFlat && MFI->IsKernel) { + // Insert the prologue initializing the SGPRs pointing to the scratch space + // for flat accesses. 
+ const MachineFrameInfo *FrameInfo = MF.getFrameInfo(); + + // TODO: What to use with function calls? + + // FIXME: This is reporting stack size that is used in a scratch buffer + // rather than registers as well. + uint64_t StackSizeBytes = FrameInfo->getStackSize(); + + int IndirectBegin + = static_cast<const AMDGPUInstrInfo*>(TII)->getIndirectIndexBegin(MF); + // Convert register index to 256-byte unit. + uint64_t StackOffset = IndirectBegin < 0 ? 0 : (4 * IndirectBegin / 256); + + assert((StackSizeBytes < 0xffff) && StackOffset < 0xffff && + "Stack limits should be smaller than 16-bits"); + + // Initialize the flat scratch register pair. + // TODO: Can we use one s_mov_b64 here? + + // Offset is in units of 256-bytes. + MachineBasicBlock &MBB = MF.front(); + DebugLoc NoDL; + MachineBasicBlock::iterator Start = MBB.getFirstNonPHI(); + const MCInstrDesc &SMovK = TII->get(AMDGPU::S_MOVK_I32); + + assert(isInt<16>(StackOffset) && isInt<16>(StackSizeBytes)); + + BuildMI(MBB, Start, NoDL, SMovK, AMDGPU::FLAT_SCR_LO) + .addImm(StackOffset); + + // Documentation says size is "per-thread scratch size in bytes" + BuildMI(MBB, Start, NoDL, SMovK, AMDGPU::FLAT_SCR_HI) + .addImm(StackSizeBytes); + } + return true; } diff --git a/lib/Target/R600/SILowerI1Copies.cpp b/lib/Target/R600/SILowerI1Copies.cpp index db19235995be..67421e231d8d 100644 --- a/lib/Target/R600/SILowerI1Copies.cpp +++ b/lib/Target/R600/SILowerI1Copies.cpp @@ -15,6 +15,7 @@ #define DEBUG_TYPE "si-i1-copies" #include "AMDGPU.h" +#include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" #include "llvm/CodeGen/LiveIntervalAnalysis.h" #include "llvm/CodeGen/MachineDominators.h" @@ -39,14 +40,14 @@ public: initializeSILowerI1CopiesPass(*PassRegistry::getPassRegistry()); } - virtual bool runOnMachineFunction(MachineFunction &MF) override; + bool runOnMachineFunction(MachineFunction &MF) override; - virtual const char *getPassName() const override { - return "SI Lower il Copies"; + const char *getPassName() const 
override { + return "SI Lower i1 Copies"; } - virtual void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<MachineDominatorTree>(); + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<MachineDominatorTree>(); AU.setPreservesCFG(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -55,10 +56,10 @@ public: } // End anonymous namespace. INITIALIZE_PASS_BEGIN(SILowerI1Copies, DEBUG_TYPE, - "SI Lower il Copies", false, false) + "SI Lower i1 Copies", false, false) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) INITIALIZE_PASS_END(SILowerI1Copies, DEBUG_TYPE, - "SI Lower il Copies", false, false) + "SI Lower i1 Copies", false, false) char SILowerI1Copies::ID = 0; @@ -70,9 +71,9 @@ FunctionPass *llvm::createSILowerI1CopiesPass() { bool SILowerI1Copies::runOnMachineFunction(MachineFunction &MF) { MachineRegisterInfo &MRI = MF.getRegInfo(); - const SIInstrInfo *TII = static_cast<const SIInstrInfo *>( - MF.getTarget().getInstrInfo()); - const TargetRegisterInfo *TRI = MF.getTarget().getRegisterInfo(); + const SIInstrInfo *TII = + static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo()); + const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); std::vector<unsigned> I1Defs; for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); @@ -84,71 +85,67 @@ bool SILowerI1Copies::runOnMachineFunction(MachineFunction &MF) { Next = std::next(I); MachineInstr &MI = *I; - if (MI.getOpcode() == AMDGPU::V_MOV_I1) { - I1Defs.push_back(MI.getOperand(0).getReg()); - MI.setDesc(TII->get(AMDGPU::V_MOV_B32_e32)); + if (MI.getOpcode() == AMDGPU::IMPLICIT_DEF) { + unsigned Reg = MI.getOperand(0).getReg(); + const TargetRegisterClass *RC = MRI.getRegClass(Reg); + if (RC == &AMDGPU::VReg_1RegClass) + MRI.setRegClass(Reg, &AMDGPU::SReg_64RegClass); continue; } - if (MI.getOpcode() == AMDGPU::V_AND_I1) { - I1Defs.push_back(MI.getOperand(0).getReg()); - MI.setDesc(TII->get(AMDGPU::V_AND_B32_e32)); + if (MI.getOpcode() != 
AMDGPU::COPY) continue; - } - if (MI.getOpcode() == AMDGPU::V_OR_I1) { - I1Defs.push_back(MI.getOperand(0).getReg()); - MI.setDesc(TII->get(AMDGPU::V_OR_B32_e32)); - continue; - } + const MachineOperand &Dst = MI.getOperand(0); + const MachineOperand &Src = MI.getOperand(1); - if (MI.getOpcode() == AMDGPU::V_XOR_I1) { - I1Defs.push_back(MI.getOperand(0).getReg()); - MI.setDesc(TII->get(AMDGPU::V_XOR_B32_e32)); + if (!TargetRegisterInfo::isVirtualRegister(Src.getReg()) || + !TargetRegisterInfo::isVirtualRegister(Dst.getReg())) continue; - } - - if (MI.getOpcode() != AMDGPU::COPY || - !TargetRegisterInfo::isVirtualRegister(MI.getOperand(0).getReg()) || - !TargetRegisterInfo::isVirtualRegister(MI.getOperand(1).getReg())) - continue; - - const TargetRegisterClass *DstRC = - MRI.getRegClass(MI.getOperand(0).getReg()); - const TargetRegisterClass *SrcRC = - MRI.getRegClass(MI.getOperand(1).getReg()); + const TargetRegisterClass *DstRC = MRI.getRegClass(Dst.getReg()); + const TargetRegisterClass *SrcRC = MRI.getRegClass(Src.getReg()); if (DstRC == &AMDGPU::VReg_1RegClass && TRI->getCommonSubClass(SrcRC, &AMDGPU::SGPR_64RegClass)) { - I1Defs.push_back(MI.getOperand(0).getReg()); - BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(AMDGPU::V_CNDMASK_B32_e64)) - .addOperand(MI.getOperand(0)) - .addImm(0) - .addImm(-1) - .addOperand(MI.getOperand(1)) - .addImm(0) - .addImm(0) - .addImm(0) - .addImm(0); + I1Defs.push_back(Dst.getReg()); + DebugLoc DL = MI.getDebugLoc(); + + MachineInstr *DefInst = MRI.getUniqueVRegDef(Src.getReg()); + if (DefInst->getOpcode() == AMDGPU::S_MOV_B64) { + if (DefInst->getOperand(1).isImm()) { + I1Defs.push_back(Dst.getReg()); + + int64_t Val = DefInst->getOperand(1).getImm(); + assert(Val == 0 || Val == -1); + + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_MOV_B32_e32)) + .addOperand(Dst) + .addImm(Val); + MI.eraseFromParent(); + continue; + } + } + + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64)) + .addOperand(Dst) + .addImm(0) + .addImm(-1) + 
.addOperand(Src); MI.eraseFromParent(); } else if (TRI->getCommonSubClass(DstRC, &AMDGPU::SGPR_64RegClass) && SrcRC == &AMDGPU::VReg_1RegClass) { BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(AMDGPU::V_CMP_NE_I32_e64)) - .addOperand(MI.getOperand(0)) - .addImm(0) - .addOperand(MI.getOperand(1)) - .addImm(0) - .addImm(0) - .addImm(0) - .addImm(0); + .addOperand(Dst) + .addOperand(Src) + .addImm(0); MI.eraseFromParent(); } } } for (unsigned Reg : I1Defs) - MRI.setRegClass(Reg, &AMDGPU::VReg_32RegClass); + MRI.setRegClass(Reg, &AMDGPU::VGPR_32RegClass); return false; } diff --git a/lib/Target/R600/SIMachineFunctionInfo.cpp b/lib/Target/R600/SIMachineFunctionInfo.cpp index c53a7e10d548..198dd568374e 100644 --- a/lib/Target/R600/SIMachineFunctionInfo.cpp +++ b/lib/Target/R600/SIMachineFunctionInfo.cpp @@ -10,8 +10,10 @@ #include "SIMachineFunctionInfo.h" +#include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" -#include "SIRegisterInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/IR/Function.h" #include "llvm/IR/LLVMContext.h" @@ -26,72 +28,50 @@ void SIMachineFunctionInfo::anchor() {} SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) : AMDGPUMachineFunction(MF), + TIDReg(AMDGPU::NoRegister), + HasSpilledVGPRs(false), PSInputAddr(0), - SpillTracker(), - NumUserSGPRs(0) { } + NumUserSGPRs(0), + LDSWaveSpillSize(0) { } -static unsigned createLaneVGPR(MachineRegisterInfo &MRI, MachineFunction *MF) { - unsigned VGPR = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass); +SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg( + MachineFunction *MF, + unsigned FrameIndex, + unsigned SubIdx) { + const MachineFrameInfo *FrameInfo = MF->getFrameInfo(); + const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo*>( + MF->getTarget().getSubtarget<AMDGPUSubtarget>().getRegisterInfo()); + MachineRegisterInfo &MRI = MF->getRegInfo(); + 
int64_t Offset = FrameInfo->getObjectOffset(FrameIndex); + Offset += SubIdx * 4; - // We need to add this register as live out for the function, in order to - // have the live range calculated directly. - // - // When register spilling begins, we have already calculated the live - // live intervals for all the registers. Since we are spilling SGPRs to - // VGPRs, we need to update the Lane VGPR's live interval every time we - // spill or restore a register. - // - // Unfortunately, there is no good way to update the live interval as - // the TargetInstrInfo callbacks for spilling and restoring don't give - // us access to the live interval information. - // - // We are lucky, though, because the InlineSpiller calls - // LiveRangeEdit::calculateRegClassAndHint() which iterates through - // all the new register that have been created when restoring a register - // and calls LiveIntervals::getInterval(), which creates and computes - // the live interval for the newly created register. However, once this - // live intervals is created, it doesn't change and since we usually reuse - // the Lane VGPR multiple times, this means any uses after the first aren't - // added to the live interval. - // - // To work around this, we add Lane VGPRs to the functions live out list, - // so that we can guarantee its live range will cover all of its uses. 
+ unsigned LaneVGPRIdx = Offset / (64 * 4); + unsigned Lane = (Offset / 4) % 64; - for (MachineBasicBlock &MBB : *MF) { - if (MBB.back().getOpcode() == AMDGPU::S_ENDPGM) { - MBB.back().addOperand(*MF, MachineOperand::CreateReg(VGPR, false, true)); - return VGPR; - } - } + struct SpilledReg Spill; - LLVMContext &Ctx = MF->getFunction()->getContext(); - Ctx.emitError("Could not find S_ENDPGM instruction."); + if (!LaneVGPRs.count(LaneVGPRIdx)) { + unsigned LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass); + LaneVGPRs[LaneVGPRIdx] = LaneVGPR; + MRI.setPhysRegUsed(LaneVGPR); - return VGPR; -} - -unsigned SIMachineFunctionInfo::RegSpillTracker::reserveLanes( - MachineRegisterInfo &MRI, MachineFunction *MF, unsigned NumRegs) { - unsigned StartLane = CurrentLane; - CurrentLane += NumRegs; - if (!LaneVGPR) { - LaneVGPR = createLaneVGPR(MRI, MF); - } else { - if (CurrentLane >= MAX_LANES) { - StartLane = CurrentLane = 0; - LaneVGPR = createLaneVGPR(MRI, MF); + // Add this register as live-in to all blocks to avoid machine verifer + // complaining about use of an undefined physical register. + for (MachineFunction::iterator BI = MF->begin(), BE = MF->end(); + BI != BE; ++BI) { + BI->addLiveIn(LaneVGPR); } } - return StartLane; -} -void SIMachineFunctionInfo::RegSpillTracker::addSpilledReg(unsigned FrameIndex, - unsigned Reg, - int Lane) { - SpilledRegisters[FrameIndex] = SpilledReg(Reg, Lane); + Spill.VGPR = LaneVGPRs[LaneVGPRIdx]; + Spill.Lane = Lane; + return Spill; } -const SIMachineFunctionInfo::SpilledReg& -SIMachineFunctionInfo::RegSpillTracker::getSpilledReg(unsigned FrameIndex) { - return SpilledRegisters[FrameIndex]; +unsigned SIMachineFunctionInfo::getMaximumWorkGroupSize( + const MachineFunction &MF) const { + const AMDGPUSubtarget &ST = MF.getTarget().getSubtarget<AMDGPUSubtarget>(); + // FIXME: We should get this information from kernel attributes if it + // is available. + return getShaderType() == ShaderType::COMPUTE ? 
256 : ST.getWavefrontSize(); } diff --git a/lib/Target/R600/SIMachineFunctionInfo.h b/lib/Target/R600/SIMachineFunctionInfo.h index 9684d285cec2..71852717d7e6 100644 --- a/lib/Target/R600/SIMachineFunctionInfo.h +++ b/lib/Target/R600/SIMachineFunctionInfo.h @@ -12,10 +12,11 @@ //===----------------------------------------------------------------------===// -#ifndef SIMACHINEFUNCTIONINFO_H_ -#define SIMACHINEFUNCTIONINFO_H_ +#ifndef LLVM_LIB_TARGET_R600_SIMACHINEFUNCTIONINFO_H +#define LLVM_LIB_TARGET_R600_SIMACHINEFUNCTIONINFO_H #include "AMDGPUMachineFunction.h" +#include "SIRegisterInfo.h" #include <map> namespace llvm { @@ -26,6 +27,10 @@ class MachineRegisterInfo; /// tells the hardware which interpolation parameters to load. class SIMachineFunctionInfo : public AMDGPUMachineFunction { void anchor() override; + + unsigned TIDReg; + bool HasSpilledVGPRs; + public: struct SpilledReg { @@ -36,33 +41,25 @@ public: bool hasLane() { return Lane != -1;} }; - struct RegSpillTracker { - private: - unsigned CurrentLane; - std::map<unsigned, SpilledReg> SpilledRegisters; - public: - unsigned LaneVGPR; - RegSpillTracker() : CurrentLane(0), SpilledRegisters(), LaneVGPR(0) { } - /// \p NumRegs The number of consecutive registers what need to be spilled. - /// This function will ensure that all registers are stored in - /// the same VGPR. - /// \returns The lane to be used for storing the first register. 
- unsigned reserveLanes(MachineRegisterInfo &MRI, MachineFunction *MF, - unsigned NumRegs = 1); - void addSpilledReg(unsigned FrameIndex, unsigned Reg, int Lane = -1); - const SpilledReg& getSpilledReg(unsigned FrameIndex); - bool programSpillsRegisters() { return !SpilledRegisters.empty(); } - }; - // SIMachineFunctionInfo definition SIMachineFunctionInfo(const MachineFunction &MF); + SpilledReg getSpilledReg(MachineFunction *MF, unsigned FrameIndex, + unsigned SubIdx); unsigned PSInputAddr; - struct RegSpillTracker SpillTracker; unsigned NumUserSGPRs; + std::map<unsigned, unsigned> LaneVGPRs; + unsigned LDSWaveSpillSize; + bool hasCalculatedTID() const { return TIDReg != AMDGPU::NoRegister; }; + unsigned getTIDReg() const { return TIDReg; }; + void setTIDReg(unsigned Reg) { TIDReg = Reg; } + bool hasSpilledVGPRs() const { return HasSpilledVGPRs; } + void setHasSpilledVGPRs(bool Spill = true) { HasSpilledVGPRs = Spill; } + + unsigned getMaximumWorkGroupSize(const MachineFunction &MF) const; }; } // End namespace llvm -#endif //_SIMACHINEFUNCTIONINFO_H_ +#endif diff --git a/lib/Target/R600/SIPrepareScratchRegs.cpp b/lib/Target/R600/SIPrepareScratchRegs.cpp new file mode 100644 index 000000000000..f0e7edec6b48 --- /dev/null +++ b/lib/Target/R600/SIPrepareScratchRegs.cpp @@ -0,0 +1,196 @@ +//===-- SIPrepareScratchRegs.cpp - Use predicates for control flow --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// +/// This pass loads scratch pointer and scratch offset into a register or a +/// frame index which can be used anywhere in the program. These values will +/// be used for spilling VGPRs. 
+/// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "SIDefines.h" +#include "SIInstrInfo.h" +#include "SIMachineFunctionInfo.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RegisterScavenging.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/LLVMContext.h" + +using namespace llvm; + +namespace { + +class SIPrepareScratchRegs : public MachineFunctionPass { + +private: + static char ID; + +public: + SIPrepareScratchRegs() : MachineFunctionPass(ID) { } + + bool runOnMachineFunction(MachineFunction &MF) override; + + const char *getPassName() const override { + return "SI prepare scratch registers"; + } + +}; + +} // End anonymous namespace + +char SIPrepareScratchRegs::ID = 0; + +FunctionPass *llvm::createSIPrepareScratchRegs() { + return new SIPrepareScratchRegs(); +} + +bool SIPrepareScratchRegs::runOnMachineFunction(MachineFunction &MF) { + SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + const SIInstrInfo *TII = + static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo()); + const SIRegisterInfo *TRI = &TII->getRegisterInfo(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + MachineFrameInfo *FrameInfo = MF.getFrameInfo(); + MachineBasicBlock *Entry = MF.begin(); + MachineBasicBlock::iterator I = Entry->begin(); + DebugLoc DL = I->getDebugLoc(); + + // FIXME: If we don't have enough VGPRs for SGPR spilling we will need to + // run this pass. 
+ if (!MFI->hasSpilledVGPRs()) + return false; + + unsigned ScratchPtrPreloadReg = + TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_PTR); + unsigned ScratchOffsetPreloadReg = + TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_WAVE_OFFSET); + + if (!Entry->isLiveIn(ScratchPtrPreloadReg)) + Entry->addLiveIn(ScratchPtrPreloadReg); + + if (!Entry->isLiveIn(ScratchOffsetPreloadReg)) + Entry->addLiveIn(ScratchOffsetPreloadReg); + + // Load the scratch pointer + unsigned ScratchPtrReg = + TRI->findUnusedRegister(MRI, &AMDGPU::SGPR_64RegClass); + int ScratchPtrFI = -1; + + if (ScratchPtrReg != AMDGPU::NoRegister) { + // Found an SGPR to use. + MRI.setPhysRegUsed(ScratchPtrReg); + BuildMI(*Entry, I, DL, TII->get(AMDGPU::S_MOV_B64), ScratchPtrReg) + .addReg(ScratchPtrPreloadReg); + } else { + // No SGPR is available, we must spill. + ScratchPtrFI = FrameInfo->CreateSpillStackObject(8, 4); + BuildMI(*Entry, I, DL, TII->get(AMDGPU::SI_SPILL_S64_SAVE)) + .addReg(ScratchPtrPreloadReg) + .addFrameIndex(ScratchPtrFI); + } + + // Load the scratch offset. + unsigned ScratchOffsetReg = + TRI->findUnusedRegister(MRI, &AMDGPU::SGPR_32RegClass); + int ScratchOffsetFI = ~0; + + if (ScratchOffsetReg != AMDGPU::NoRegister) { + // Found an SGPR to use + MRI.setPhysRegUsed(ScratchOffsetReg); + BuildMI(*Entry, I, DL, TII->get(AMDGPU::S_MOV_B32), ScratchOffsetReg) + .addReg(ScratchOffsetPreloadReg); + } else { + // No SGPR is available, we must spill. + ScratchOffsetFI = FrameInfo->CreateSpillStackObject(4,4); + BuildMI(*Entry, I, DL, TII->get(AMDGPU::SI_SPILL_S32_SAVE)) + .addReg(ScratchOffsetPreloadReg) + .addFrameIndex(ScratchOffsetFI); + } + + + // Now that we have the scratch pointer and offset values, we need to + // add them to all the SI_SPILL_V* instructions. 
+ + RegScavenger RS; + bool UseRegScavenger = + (ScratchPtrReg == AMDGPU::NoRegister || + ScratchOffsetReg == AMDGPU::NoRegister); + for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); + BI != BE; ++BI) { + + MachineBasicBlock &MBB = *BI; + if (UseRegScavenger) + RS.enterBasicBlock(&MBB); + + for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); + I != E; ++I) { + MachineInstr &MI = *I; + DebugLoc DL = MI.getDebugLoc(); + switch(MI.getOpcode()) { + default: break;; + case AMDGPU::SI_SPILL_V512_SAVE: + case AMDGPU::SI_SPILL_V256_SAVE: + case AMDGPU::SI_SPILL_V128_SAVE: + case AMDGPU::SI_SPILL_V96_SAVE: + case AMDGPU::SI_SPILL_V64_SAVE: + case AMDGPU::SI_SPILL_V32_SAVE: + case AMDGPU::SI_SPILL_V32_RESTORE: + case AMDGPU::SI_SPILL_V64_RESTORE: + case AMDGPU::SI_SPILL_V128_RESTORE: + case AMDGPU::SI_SPILL_V256_RESTORE: + case AMDGPU::SI_SPILL_V512_RESTORE: + + // Scratch Pointer + if (ScratchPtrReg == AMDGPU::NoRegister) { + ScratchPtrReg = RS.scavengeRegister(&AMDGPU::SGPR_64RegClass, 0); + BuildMI(MBB, I, DL, TII->get(AMDGPU::SI_SPILL_S64_RESTORE), + ScratchPtrReg) + .addFrameIndex(ScratchPtrFI) + .addReg(AMDGPU::NoRegister) + .addReg(AMDGPU::NoRegister); + } else if (!MBB.isLiveIn(ScratchPtrReg)) { + MBB.addLiveIn(ScratchPtrReg); + } + + if (ScratchOffsetReg == AMDGPU::NoRegister) { + ScratchOffsetReg = RS.scavengeRegister(&AMDGPU::SGPR_32RegClass, 0); + BuildMI(MBB, I, DL, TII->get(AMDGPU::SI_SPILL_S32_RESTORE), + ScratchOffsetReg) + .addFrameIndex(ScratchOffsetFI) + .addReg(AMDGPU::NoRegister) + .addReg(AMDGPU::NoRegister); + } else if (!MBB.isLiveIn(ScratchOffsetReg)) { + MBB.addLiveIn(ScratchOffsetReg); + } + + if (ScratchPtrReg == AMDGPU::NoRegister || + ScratchOffsetReg == AMDGPU::NoRegister) { + LLVMContext &Ctx = MF.getFunction()->getContext(); + Ctx.emitError("ran out of SGPRs for spilling VGPRs"); + ScratchPtrReg = AMDGPU::SGPR0; + ScratchOffsetReg = AMDGPU::SGPR0; + } + MI.getOperand(2).setReg(ScratchPtrReg); + 
MI.getOperand(3).setReg(ScratchOffsetReg); + + break; + } + if (UseRegScavenger) + RS.forward(); + } + } + return true; +} diff --git a/lib/Target/R600/SIRegisterInfo.cpp b/lib/Target/R600/SIRegisterInfo.cpp index 2a9a2ac5dd61..f9feea470f15 100644 --- a/lib/Target/R600/SIRegisterInfo.cpp +++ b/lib/Target/R600/SIRegisterInfo.cpp @@ -20,7 +20,10 @@ #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/RegisterScavenging.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/Support/Debug.h" using namespace llvm; SIRegisterInfo::SIRegisterInfo(const AMDGPUSubtarget &st) @@ -30,7 +33,21 @@ SIRegisterInfo::SIRegisterInfo(const AMDGPUSubtarget &st) BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { BitVector Reserved(getNumRegs()); Reserved.set(AMDGPU::EXEC); + + // EXEC_LO and EXEC_HI could be allocated and used as regular register, + // but this seems likely to result in bugs, so I'm marking them as reserved. 
+ Reserved.set(AMDGPU::EXEC_LO); + Reserved.set(AMDGPU::EXEC_HI); + Reserved.set(AMDGPU::INDIRECT_BASE_ADDR); + Reserved.set(AMDGPU::FLAT_SCR); + Reserved.set(AMDGPU::FLAT_SCR_LO); + Reserved.set(AMDGPU::FLAT_SCR_HI); + + // Reserve some VGPRs to use as temp registers in case we have to spill VGPRs + Reserved.set(AMDGPU::VGPR255); + Reserved.set(AMDGPU::VGPR254); + return Reserved; } @@ -43,23 +60,238 @@ bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const return Fn.getFrameInfo()->hasStackObjects(); } +static unsigned getNumSubRegsForSpillOp(unsigned Op) { + + switch (Op) { + case AMDGPU::SI_SPILL_S512_SAVE: + case AMDGPU::SI_SPILL_S512_RESTORE: + case AMDGPU::SI_SPILL_V512_SAVE: + case AMDGPU::SI_SPILL_V512_RESTORE: + return 16; + case AMDGPU::SI_SPILL_S256_SAVE: + case AMDGPU::SI_SPILL_S256_RESTORE: + case AMDGPU::SI_SPILL_V256_SAVE: + case AMDGPU::SI_SPILL_V256_RESTORE: + return 8; + case AMDGPU::SI_SPILL_S128_SAVE: + case AMDGPU::SI_SPILL_S128_RESTORE: + case AMDGPU::SI_SPILL_V128_SAVE: + case AMDGPU::SI_SPILL_V128_RESTORE: + return 4; + case AMDGPU::SI_SPILL_V96_SAVE: + case AMDGPU::SI_SPILL_V96_RESTORE: + return 3; + case AMDGPU::SI_SPILL_S64_SAVE: + case AMDGPU::SI_SPILL_S64_RESTORE: + case AMDGPU::SI_SPILL_V64_SAVE: + case AMDGPU::SI_SPILL_V64_RESTORE: + return 2; + case AMDGPU::SI_SPILL_S32_SAVE: + case AMDGPU::SI_SPILL_S32_RESTORE: + case AMDGPU::SI_SPILL_V32_SAVE: + case AMDGPU::SI_SPILL_V32_RESTORE: + return 1; + default: llvm_unreachable("Invalid spill opcode"); + } +} + +void SIRegisterInfo::buildScratchLoadStore(MachineBasicBlock::iterator MI, + unsigned LoadStoreOp, + unsigned Value, + unsigned ScratchPtr, + unsigned ScratchOffset, + int64_t Offset, + RegScavenger *RS) const { + + const SIInstrInfo *TII = static_cast<const SIInstrInfo*>(ST.getInstrInfo()); + MachineBasicBlock *MBB = MI->getParent(); + const MachineFunction *MF = MI->getParent()->getParent(); + LLVMContext &Ctx = MF->getFunction()->getContext(); + 
DebugLoc DL = MI->getDebugLoc(); + bool IsLoad = TII->get(LoadStoreOp).mayLoad(); + + bool RanOutOfSGPRs = false; + unsigned SOffset = ScratchOffset; + + unsigned RsrcReg = RS->scavengeRegister(&AMDGPU::SReg_128RegClass, MI, 0); + if (RsrcReg == AMDGPU::NoRegister) { + RanOutOfSGPRs = true; + RsrcReg = AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3; + } + + unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode()); + unsigned Size = NumSubRegs * 4; + + uint64_t Rsrc = AMDGPU::RSRC_DATA_FORMAT | AMDGPU::RSRC_TID_ENABLE | + 0xffffffff; // Size + + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B64), + getSubReg(RsrcReg, AMDGPU::sub0_sub1)) + .addReg(ScratchPtr) + .addReg(RsrcReg, RegState::ImplicitDefine); + + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), + getSubReg(RsrcReg, AMDGPU::sub2)) + .addImm(Rsrc & 0xffffffff) + .addReg(RsrcReg, RegState::ImplicitDefine); + + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), + getSubReg(RsrcReg, AMDGPU::sub3)) + .addImm(Rsrc >> 32) + .addReg(RsrcReg, RegState::ImplicitDefine); + + if (!isUInt<12>(Offset + Size)) { + SOffset = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, MI, 0); + if (SOffset == AMDGPU::NoRegister) { + RanOutOfSGPRs = true; + SOffset = AMDGPU::SGPR0; + } + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), SOffset) + .addReg(ScratchOffset) + .addImm(Offset); + Offset = 0; + } + + if (RanOutOfSGPRs) + Ctx.emitError("Ran out of SGPRs for spilling VGPRS"); + + for (unsigned i = 0, e = NumSubRegs; i != e; ++i, Offset += 4) { + unsigned SubReg = NumSubRegs > 1 ? 
+ getPhysRegSubReg(Value, &AMDGPU::VGPR_32RegClass, i) : + Value; + bool IsKill = (i == e - 1); + + BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp)) + .addReg(SubReg, getDefRegState(IsLoad)) + .addReg(RsrcReg, getKillRegState(IsKill)) + .addImm(Offset) + .addReg(SOffset, getKillRegState(IsKill)) + .addImm(0) // glc + .addImm(0) // slc + .addImm(0) // tfe + .addReg(Value, RegState::Implicit | getDefRegState(IsLoad)); + } +} + void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj, unsigned FIOperandNum, RegScavenger *RS) const { MachineFunction *MF = MI->getParent()->getParent(); + MachineBasicBlock *MBB = MI->getParent(); + SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); MachineFrameInfo *FrameInfo = MF->getFrameInfo(); const SIInstrInfo *TII = static_cast<const SIInstrInfo*>(ST.getInstrInfo()); + DebugLoc DL = MI->getDebugLoc(); + MachineOperand &FIOp = MI->getOperand(FIOperandNum); int Index = MI->getOperand(FIOperandNum).getIndex(); - int64_t Offset = FrameInfo->getObjectOffset(Index); - FIOp.ChangeToImmediate(Offset); - if (!TII->isImmOperandLegal(MI, FIOperandNum, FIOp)) { - unsigned TmpReg = RS->scavengeRegister(&AMDGPU::VReg_32RegClass, MI, SPAdj); - BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), - TII->get(AMDGPU::V_MOV_B32_e32), TmpReg) - .addImm(Offset); - FIOp.ChangeToRegister(TmpReg, false); + switch (MI->getOpcode()) { + // SGPR register spill + case AMDGPU::SI_SPILL_S512_SAVE: + case AMDGPU::SI_SPILL_S256_SAVE: + case AMDGPU::SI_SPILL_S128_SAVE: + case AMDGPU::SI_SPILL_S64_SAVE: + case AMDGPU::SI_SPILL_S32_SAVE: { + unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode()); + + for (unsigned i = 0, e = NumSubRegs; i < e; ++i) { + unsigned SubReg = getPhysRegSubReg(MI->getOperand(0).getReg(), + &AMDGPU::SGPR_32RegClass, i); + struct SIMachineFunctionInfo::SpilledReg Spill = + MFI->getSpilledReg(MF, Index, i); + + if (Spill.VGPR == AMDGPU::NoRegister) { + LLVMContext &Ctx = 
MF->getFunction()->getContext(); + Ctx.emitError("Ran out of VGPRs for spilling SGPR"); + } + + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_WRITELANE_B32), Spill.VGPR) + .addReg(SubReg) + .addImm(Spill.Lane); + + } + MI->eraseFromParent(); + break; + } + + // SGPR register restore + case AMDGPU::SI_SPILL_S512_RESTORE: + case AMDGPU::SI_SPILL_S256_RESTORE: + case AMDGPU::SI_SPILL_S128_RESTORE: + case AMDGPU::SI_SPILL_S64_RESTORE: + case AMDGPU::SI_SPILL_S32_RESTORE: { + unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode()); + + for (unsigned i = 0, e = NumSubRegs; i < e; ++i) { + unsigned SubReg = getPhysRegSubReg(MI->getOperand(0).getReg(), + &AMDGPU::SGPR_32RegClass, i); + bool isM0 = SubReg == AMDGPU::M0; + struct SIMachineFunctionInfo::SpilledReg Spill = + MFI->getSpilledReg(MF, Index, i); + + if (Spill.VGPR == AMDGPU::NoRegister) { + LLVMContext &Ctx = MF->getFunction()->getContext(); + Ctx.emitError("Ran out of VGPRs for spilling SGPR"); + } + + if (isM0) { + SubReg = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, MI, 0); + } + + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READLANE_B32), SubReg) + .addReg(Spill.VGPR) + .addImm(Spill.Lane) + .addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine); + if (isM0) { + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0) + .addReg(SubReg); + } + } + TII->insertNOPs(MI, 3); + MI->eraseFromParent(); + break; + } + + // VGPR register spill + case AMDGPU::SI_SPILL_V512_SAVE: + case AMDGPU::SI_SPILL_V256_SAVE: + case AMDGPU::SI_SPILL_V128_SAVE: + case AMDGPU::SI_SPILL_V96_SAVE: + case AMDGPU::SI_SPILL_V64_SAVE: + case AMDGPU::SI_SPILL_V32_SAVE: + buildScratchLoadStore(MI, AMDGPU::BUFFER_STORE_DWORD_OFFSET, + TII->getNamedOperand(*MI, AMDGPU::OpName::src)->getReg(), + TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_ptr)->getReg(), + TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_offset)->getReg(), + FrameInfo->getObjectOffset(Index), RS); + MI->eraseFromParent(); + break; + case 
AMDGPU::SI_SPILL_V32_RESTORE: + case AMDGPU::SI_SPILL_V64_RESTORE: + case AMDGPU::SI_SPILL_V128_RESTORE: + case AMDGPU::SI_SPILL_V256_RESTORE: + case AMDGPU::SI_SPILL_V512_RESTORE: { + buildScratchLoadStore(MI, AMDGPU::BUFFER_LOAD_DWORD_OFFSET, + TII->getNamedOperand(*MI, AMDGPU::OpName::dst)->getReg(), + TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_ptr)->getReg(), + TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_offset)->getReg(), + FrameInfo->getObjectOffset(Index), RS); + MI->eraseFromParent(); + break; + } + + default: { + int64_t Offset = FrameInfo->getObjectOffset(Index); + FIOp.ChangeToImmediate(Offset); + if (!TII->isImmOperandLegal(MI, FIOperandNum, FIOp)) { + unsigned TmpReg = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, SPAdj); + BuildMI(*MBB, MI, MI->getDebugLoc(), + TII->get(AMDGPU::V_MOV_B32_e32), TmpReg) + .addImm(Offset); + FIOp.ChangeToRegister(TmpReg, false); + } + } } } @@ -67,7 +299,7 @@ const TargetRegisterClass * SIRegisterInfo::getCFGStructurizerRegClass( MVT VT) const { switch(VT.SimpleTy) { default: - case MVT::i32: return &AMDGPU::VReg_32RegClass; + case MVT::i32: return &AMDGPU::VGPR_32RegClass; } } @@ -78,13 +310,17 @@ unsigned SIRegisterInfo::getHWRegIndex(unsigned Reg) const { const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const { assert(!TargetRegisterInfo::isVirtualRegister(Reg)); - const TargetRegisterClass *BaseClasses[] = { - &AMDGPU::VReg_32RegClass, + static const TargetRegisterClass *BaseClasses[] = { + &AMDGPU::VGPR_32RegClass, &AMDGPU::SReg_32RegClass, &AMDGPU::VReg_64RegClass, &AMDGPU::SReg_64RegClass, + &AMDGPU::VReg_96RegClass, + &AMDGPU::VReg_128RegClass, &AMDGPU::SReg_128RegClass, - &AMDGPU::SReg_256RegClass + &AMDGPU::VReg_256RegClass, + &AMDGPU::SReg_256RegClass, + &AMDGPU::VReg_512RegClass }; for (const TargetRegisterClass *BaseClass : BaseClasses) { @@ -95,15 +331,8 @@ const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const { return nullptr; } -bool 
SIRegisterInfo::isSGPRClass(const TargetRegisterClass *RC) const { - if (!RC) { - return false; - } - return !hasVGPRs(RC); -} - bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const { - return getCommonSubClass(&AMDGPU::VReg_32RegClass, RC) || + return getCommonSubClass(&AMDGPU::VGPR_32RegClass, RC) || getCommonSubClass(&AMDGPU::VReg_64RegClass, RC) || getCommonSubClass(&AMDGPU::VReg_96RegClass, RC) || getCommonSubClass(&AMDGPU::VReg_128RegClass, RC) || @@ -118,7 +347,7 @@ const TargetRegisterClass *SIRegisterInfo::getEquivalentVGPRClass( } else if (SRC == &AMDGPU::SCCRegRegClass) { return &AMDGPU::VCCRegRegClass; } else if (getCommonSubClass(SRC, &AMDGPU::SGPR_32RegClass)) { - return &AMDGPU::VReg_32RegClass; + return &AMDGPU::VGPR_32RegClass; } else if (getCommonSubClass(SRC, &AMDGPU::SGPR_64RegClass)) { return &AMDGPU::VReg_64RegClass; } else if (getCommonSubClass(SRC, &AMDGPU::SReg_128RegClass)) { @@ -148,24 +377,61 @@ const TargetRegisterClass *SIRegisterInfo::getSubRegClass( unsigned SIRegisterInfo::getPhysRegSubReg(unsigned Reg, const TargetRegisterClass *SubRC, unsigned Channel) const { + + switch (Reg) { + case AMDGPU::VCC: + switch(Channel) { + case 0: return AMDGPU::VCC_LO; + case 1: return AMDGPU::VCC_HI; + default: llvm_unreachable("Invalid SubIdx for VCC"); + } + + case AMDGPU::FLAT_SCR: + switch (Channel) { + case 0: + return AMDGPU::FLAT_SCR_LO; + case 1: + return AMDGPU::FLAT_SCR_HI; + default: + llvm_unreachable("Invalid SubIdx for FLAT_SCR"); + } + break; + + case AMDGPU::EXEC: + switch (Channel) { + case 0: + return AMDGPU::EXEC_LO; + case 1: + return AMDGPU::EXEC_HI; + default: + llvm_unreachable("Invalid SubIdx for EXEC"); + } + break; + } + + const TargetRegisterClass *RC = getPhysRegClass(Reg); + // 32-bit registers don't have sub-registers, so we can just return the + // Reg. 
We need to have this check here, because the calculation below + // using getHWRegIndex() will fail with special 32-bit registers like + // VCC_LO, VCC_HI, EXEC_LO, EXEC_HI and M0. + if (RC->getSize() == 4) { + assert(Channel == 0); + return Reg; + } + unsigned Index = getHWRegIndex(Reg); return SubRC->getRegister(Index + Channel); } -bool SIRegisterInfo::regClassCanUseImmediate(int RCID) const { - switch (RCID) { - default: return false; - case AMDGPU::SSrc_32RegClassID: - case AMDGPU::SSrc_64RegClassID: - case AMDGPU::VSrc_32RegClassID: - case AMDGPU::VSrc_64RegClassID: - return true; - } +bool SIRegisterInfo::opCanUseLiteralConstant(unsigned OpType) const { + return OpType == AMDGPU::OPERAND_REG_IMM32; } -bool SIRegisterInfo::regClassCanUseImmediate( - const TargetRegisterClass *RC) const { - return regClassCanUseImmediate(RC->getID()); +bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const { + if (opCanUseLiteralConstant(OpType)) + return true; + + return OpType == AMDGPU::OPERAND_REG_INLINE_C; } unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF, @@ -183,6 +449,29 @@ unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF, return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 4); case SIRegisterInfo::SCRATCH_PTR: return AMDGPU::SGPR2_SGPR3; + case SIRegisterInfo::INPUT_PTR: + return AMDGPU::SGPR0_SGPR1; + case SIRegisterInfo::TIDIG_X: + return AMDGPU::VGPR0; + case SIRegisterInfo::TIDIG_Y: + return AMDGPU::VGPR1; + case SIRegisterInfo::TIDIG_Z: + return AMDGPU::VGPR2; } llvm_unreachable("unexpected preloaded value type"); } + +/// \brief Returns a register that is not used at any point in the function. +/// If all registers are used, then this function will return +// AMDGPU::NoRegister. 
+unsigned SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI, + const TargetRegisterClass *RC) const { + + for (TargetRegisterClass::iterator I = RC->begin(), E = RC->end(); + I != E; ++I) { + if (!MRI.isPhysRegUsed(*I)) + return *I; + } + return AMDGPU::NoRegister; +} + diff --git a/lib/Target/R600/SIRegisterInfo.h b/lib/Target/R600/SIRegisterInfo.h index 5d0235c0f427..d14212c2b104 100644 --- a/lib/Target/R600/SIRegisterInfo.h +++ b/lib/Target/R600/SIRegisterInfo.h @@ -13,8 +13,8 @@ //===----------------------------------------------------------------------===// -#ifndef SIREGISTERINFO_H_ -#define SIREGISTERINFO_H_ +#ifndef LLVM_LIB_TARGET_R600_SIREGISTERINFO_H +#define LLVM_LIB_TARGET_R600_SIREGISTERINFO_H #include "AMDGPURegisterInfo.h" @@ -42,11 +42,24 @@ struct SIRegisterInfo : public AMDGPURegisterInfo { unsigned getHWRegIndex(unsigned Reg) const override; /// \brief Return the 'base' register class for this register. - /// e.g. SGPR0 => SReg_32, VGPR => VReg_32 SGPR0_SGPR1 -> SReg_32, etc. + /// e.g. SGPR0 => SReg_32, VGPR => VGPR_32 SGPR0_SGPR1 -> SReg_32, etc. const TargetRegisterClass *getPhysRegClass(unsigned Reg) const; /// \returns true if this class contains only SGPR registers - bool isSGPRClass(const TargetRegisterClass *RC) const; + bool isSGPRClass(const TargetRegisterClass *RC) const { + if (!RC) + return false; + + return !hasVGPRs(RC); + } + + /// \returns true if this class ID contains only SGPR registers + bool isSGPRClassID(unsigned RCID) const { + if (static_cast<int>(RCID) == -1) + return false; + + return isSGPRClass(getRegClass(RCID)); + } /// \returns true if this class contains VGPR registers. bool hasVGPRs(const TargetRegisterClass *RC) const; @@ -67,28 +80,41 @@ struct SIRegisterInfo : public AMDGPURegisterInfo { unsigned getPhysRegSubReg(unsigned Reg, const TargetRegisterClass *SubRC, unsigned Channel) const; - /// \returns True if operands defined with this register class can accept - /// inline immediates. 
- bool regClassCanUseImmediate(int RCID) const; + /// \returns True if operands defined with this operand type can accept + /// a literal constant (i.e. any 32-bit immediate). + bool opCanUseLiteralConstant(unsigned OpType) const; - /// \returns True if operands defined with this register class can accept - /// inline immediates. - bool regClassCanUseImmediate(const TargetRegisterClass *RC) const; + /// \returns True if operands defined with this operand type can accept + /// an inline constant. i.e. An integer value in the range (-16, 64) or + /// -4.0f, -2.0f, -1.0f, -0.5f, 0.0f, 0.5f, 1.0f, 2.0f, 4.0f. + bool opCanUseInlineConstant(unsigned OpType) const; enum PreloadedValue { TGID_X, TGID_Y, TGID_Z, SCRATCH_WAVE_OFFSET, - SCRATCH_PTR + SCRATCH_PTR, + INPUT_PTR, + TIDIG_X, + TIDIG_Y, + TIDIG_Z }; /// \brief Returns the physical register that \p Value is stored in. unsigned getPreloadedValue(const MachineFunction &MF, enum PreloadedValue Value) const; + unsigned findUnusedRegister(const MachineRegisterInfo &MRI, + const TargetRegisterClass *RC) const; + +private: + void buildScratchLoadStore(MachineBasicBlock::iterator MI, + unsigned LoadStoreOp, unsigned Value, + unsigned ScratchPtr, unsigned ScratchOffset, + int64_t Offset, RegScavenger *RS) const; }; } // End namespace llvm -#endif // SIREGISTERINFO_H_ +#endif diff --git a/lib/Target/R600/SIRegisterInfo.td b/lib/Target/R600/SIRegisterInfo.td index 8974b6300625..1a1efb0c89a9 100644 --- a/lib/Target/R600/SIRegisterInfo.td +++ b/lib/Target/R600/SIRegisterInfo.td @@ -27,10 +27,28 @@ def VCC : RegisterWithSubRegs<"VCC", [VCC_LO, VCC_HI]> { let HWEncoding = 106; } -def EXEC : SIReg<"EXEC", 126>; +def EXEC_LO : SIReg<"exec_lo", 126>; +def EXEC_HI : SIReg<"exec_hi", 127>; + +def EXEC : RegisterWithSubRegs<"EXEC", [EXEC_LO, EXEC_HI]> { + let Namespace = "AMDGPU"; + let SubRegIndices = [sub0, sub1]; + let HWEncoding = 126; +} + def SCC : SIReg<"SCC", 253>; def M0 : SIReg <"M0", 124>; +def FLAT_SCR_LO : 
SIReg<"flat_scr_lo", 104>; // Offset in units of 256-bytes. +def FLAT_SCR_HI : SIReg<"flat_scr_hi", 105>; // Size is the per-thread scratch size, in bytes. + +// Pair to indicate location of scratch space for flat accesses. +def FLAT_SCR : RegisterWithSubRegs <"FLAT_SCR", [FLAT_SCR_LO, FLAT_SCR_HI]> { + let Namespace = "AMDGPU"; + let SubRegIndices = [sub0, sub1]; + let HWEncoding = 104; +} + // SGPR registers foreach Index = 0-101 in { def SGPR#Index : SIReg <"SGPR"#Index, Index>; @@ -152,20 +170,24 @@ def VGPR_512 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7, //===----------------------------------------------------------------------===// // Special register classes for predicates and the M0 register -def SCCReg : RegisterClass<"AMDGPU", [i32, i1], 32, (add SCC)>; +def SCCReg : RegisterClass<"AMDGPU", [i32, i1], 32, (add SCC)> { + let CopyCost = -1; // Theoretically it is possible to read from SCC, + // but it should never be necessary. +} + def VCCReg : RegisterClass<"AMDGPU", [i64, i1], 64, (add VCC)>; def EXECReg : RegisterClass<"AMDGPU", [i64, i1], 64, (add EXEC)>; def M0Reg : RegisterClass<"AMDGPU", [i32], 32, (add M0)>; // Register class for all scalar registers (SGPRs + Special Registers) def SReg_32 : RegisterClass<"AMDGPU", [f32, i32], 32, - (add SGPR_32, M0Reg, VCC_LO) + (add SGPR_32, M0Reg, VCC_LO, VCC_HI, EXEC_LO, EXEC_HI, FLAT_SCR_LO, FLAT_SCR_HI) >; -def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64], 64, (add SGPR_64Regs)>; +def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64], 64, (add SGPR_64Regs)>; -def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, i1], 64, - (add SGPR_64Regs, VCCReg, EXECReg) +def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, i1], 64, + (add SGPR_64, VCCReg, EXECReg, FLAT_SCR) >; def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v16i8], 128, (add SGPR_128)>; @@ -175,8 +197,6 @@ def SReg_256 : RegisterClass<"AMDGPU", [v32i8, v8i32, v8f32], 256, (add SGPR_256 def SReg_512 : RegisterClass<"AMDGPU", 
[v64i8, v16i32], 512, (add SGPR_512)>; // Register class for all vector registers (VGPRs + Interploation Registers) -def VReg_32 : RegisterClass<"AMDGPU", [i32, f32, v1i32], 32, (add VGPR_32)>; - def VReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32], 64, (add VGPR_64)>; def VReg_96 : RegisterClass<"AMDGPU", [untyped], 96, (add VGPR_96)> { @@ -191,17 +211,49 @@ def VReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 512, (add VGPR_512)>; def VReg_1 : RegisterClass<"AMDGPU", [i1], 32, (add VGPR_32)>; +class RegImmOperand <RegisterClass rc> : RegisterOperand<rc> { + let OperandNamespace = "AMDGPU"; + let OperandType = "OPERAND_REG_IMM32"; +} + +class RegInlineOperand <RegisterClass rc> : RegisterOperand<rc> { + let OperandNamespace = "AMDGPU"; + let OperandType = "OPERAND_REG_INLINE_C"; +} + +//===----------------------------------------------------------------------===// +// SSrc_* Operands with an SGPR or a 32-bit immediate +//===----------------------------------------------------------------------===// + +def SSrc_32 : RegImmOperand<SReg_32>; + +def SSrc_64 : RegImmOperand<SReg_64>; + //===----------------------------------------------------------------------===// -// [SV]Src_(32|64) register classes, can have either an immediate or an register +// SCSrc_* Operands with an SGPR or a inline constant //===----------------------------------------------------------------------===// -def SSrc_32 : RegisterClass<"AMDGPU", [i32, f32], 32, (add SReg_32)>; +def SCSrc_32 : RegInlineOperand<SReg_32>; -def SSrc_64 : RegisterClass<"AMDGPU", [i64, f64, i1], 64, (add SReg_64)>; +//===----------------------------------------------------------------------===// +// VSrc_* Operands with an SGPR, VGPR or a 32-bit immediate +//===----------------------------------------------------------------------===// + +def VS_32 : RegisterClass<"AMDGPU", [i32, f32], 32, (add VGPR_32, SReg_32)>; + +def VS_64 : RegisterClass<"AMDGPU", [i64, f64], 64, (add VReg_64, SReg_64)>; + +def 
VSrc_32 : RegImmOperand<VS_32>; + +def VSrc_64 : RegImmOperand<VS_64>; + +//===----------------------------------------------------------------------===// +// VCSrc_* Operands with an SGPR, VGPR or an inline constant +//===----------------------------------------------------------------------===// -def VSrc_32 : RegisterClass<"AMDGPU", [i32, f32], 32, (add VReg_32, SReg_32)>; +def VCSrc_32 : RegInlineOperand<VS_32>; -def VSrc_64 : RegisterClass<"AMDGPU", [i64, f64], 64, (add VReg_64, SReg_64)>; +def VCSrc_64 : RegInlineOperand<VS_64>; //===----------------------------------------------------------------------===// // SGPR and VGPR register classes diff --git a/lib/Target/R600/SISchedule.td b/lib/Target/R600/SISchedule.td index 28b65b825855..9b1f676020bf 100644 --- a/lib/Target/R600/SISchedule.td +++ b/lib/Target/R600/SISchedule.td @@ -7,9 +7,85 @@ // //===----------------------------------------------------------------------===// // -// TODO: This is just a place holder for now. +// MachineModel definitions for Southern Islands (SI) // //===----------------------------------------------------------------------===// +def WriteBranch : SchedWrite; +def WriteExport : SchedWrite; +def WriteLDS : SchedWrite; +def WriteSALU : SchedWrite; +def WriteSMEM : SchedWrite; +def WriteVMEM : SchedWrite; -def SI_Itin : ProcessorItineraries <[], [], []>; +// Vector ALU instructions +def Write32Bit : SchedWrite; +def WriteQuarterRate32 : SchedWrite; + +def WriteFloatFMA : SchedWrite; + +def WriteDouble : SchedWrite; +def WriteDoubleAdd : SchedWrite; + +def SIFullSpeedModel : SchedMachineModel; +def SIQuarterSpeedModel : SchedMachineModel; + +// BufferSize = 0 means the processors are in-order. +let BufferSize = 0 in { + +// XXX: Are the resource counts correct? 
+def HWBranch : ProcResource<1>; +def HWExport : ProcResource<7>; // Taken from S_WAITCNT +def HWLGKM : ProcResource<31>; // Taken from S_WAITCNT +def HWSALU : ProcResource<1>; +def HWVMEM : ProcResource<15>; // Taken from S_WAITCNT +def HWVALU : ProcResource<1>; + +} + +class HWWriteRes<SchedWrite write, list<ProcResourceKind> resources, + int latency> : WriteRes<write, resources> { + let Latency = latency; +} + +class HWVALUWriteRes<SchedWrite write, int latency> : + HWWriteRes<write, [HWVALU], latency>; + + +// The latency numbers are taken from AMD Accelerated Parallel Processing +// guide. They may not be acurate. + +// The latency values are 1 / (operations / cycle) / 4. +multiclass SICommonWriteRes { + + def : HWWriteRes<WriteBranch, [HWBranch], 100>; // XXX: Guessed ??? + def : HWWriteRes<WriteExport, [HWExport], 100>; // XXX: Guessed ??? + def : HWWriteRes<WriteLDS, [HWLGKM], 32>; // 2 - 64 + def : HWWriteRes<WriteSALU, [HWSALU], 1>; + def : HWWriteRes<WriteSMEM, [HWLGKM], 10>; // XXX: Guessed ??? 
+ def : HWWriteRes<WriteVMEM, [HWVMEM], 450>; // 300 - 600 + + def : HWVALUWriteRes<Write32Bit, 1>; + def : HWVALUWriteRes<WriteQuarterRate32, 4>; +} + + +let SchedModel = SIFullSpeedModel in { + +defm : SICommonWriteRes; + +def : HWVALUWriteRes<WriteFloatFMA, 1>; +def : HWVALUWriteRes<WriteDouble, 4>; +def : HWVALUWriteRes<WriteDoubleAdd, 2>; + +} // End SchedModel = SIFullSpeedModel + +let SchedModel = SIQuarterSpeedModel in { + +defm : SICommonWriteRes; + +def : HWVALUWriteRes<WriteFloatFMA, 16>; +def : HWVALUWriteRes<WriteDouble, 16>; +def : HWVALUWriteRes<WriteDoubleAdd, 8>; + +} // End SchedModel = SIQuarterSpeedModel diff --git a/lib/Target/R600/SIShrinkInstructions.cpp b/lib/Target/R600/SIShrinkInstructions.cpp index 745c4b65644d..f91d1177bbae 100644 --- a/lib/Target/R600/SIShrinkInstructions.cpp +++ b/lib/Target/R600/SIShrinkInstructions.cpp @@ -10,11 +10,13 @@ // #include "AMDGPU.h" +#include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/Constants.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Function.h" #include "llvm/Support/Debug.h" @@ -24,6 +26,8 @@ STATISTIC(NumInstructionsShrunk, "Number of 64-bit instruction reduced to 32-bit."); +STATISTIC(NumLiteralConstantsFolded, + "Number of literal constants folded into 32-bit instructions."); namespace llvm { void initializeSIShrinkInstructionsPass(PassRegistry&); @@ -41,13 +45,13 @@ public: SIShrinkInstructions() : MachineFunctionPass(ID) { } - virtual bool runOnMachineFunction(MachineFunction &MF) override; + bool runOnMachineFunction(MachineFunction &MF) override; - virtual const char *getPassName() const override { + const char *getPassName() const override { return "SI Shrink Instructions"; } - virtual void getAnalysisUsage(AnalysisUsage &AU) const override { + void getAnalysisUsage(AnalysisUsage &AU) const 
override { AU.setPreservesCFG(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -90,29 +94,83 @@ static bool canShrink(MachineInstr &MI, const SIInstrInfo *TII, const MachineOperand *Src1Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers); - if (Src1 && (!isVGPR(Src1, TRI, MRI) || Src1Mod->getImm() != 0)) + if (Src1 && (!isVGPR(Src1, TRI, MRI) || (Src1Mod && Src1Mod->getImm() != 0))) return false; - // We don't need to check src0, all input types are legal, so just make - // sure src0 isn't using any modifiers. - const MachineOperand *Src0Mod = - TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers); - if (Src0Mod && Src0Mod->getImm() != 0) + // We don't need to check src0, all input types are legal, so just make sure + // src0 isn't using any modifiers. + if (TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers)) return false; // Check output modifiers - const MachineOperand *Omod = TII->getNamedOperand(MI, AMDGPU::OpName::omod); - if (Omod && Omod->getImm() != 0) + if (TII->hasModifiersSet(MI, AMDGPU::OpName::omod)) return false; - const MachineOperand *Clamp = TII->getNamedOperand(MI, AMDGPU::OpName::clamp); - return !Clamp || Clamp->getImm() == 0; + if (TII->hasModifiersSet(MI, AMDGPU::OpName::clamp)) + return false; + + return true; +} + +/// \brief This function checks \p MI for operands defined by a move immediate +/// instruction and then folds the literal constant into the instruction if it +/// can. This function assumes that \p MI is a VOP1, VOP2, or VOPC instruction +/// and will only fold literal constants if we are still in SSA. 
+static void foldImmediates(MachineInstr &MI, const SIInstrInfo *TII, + MachineRegisterInfo &MRI, bool TryToCommute = true) { + + if (!MRI.isSSA()) + return; + + assert(TII->isVOP1(MI.getOpcode()) || TII->isVOP2(MI.getOpcode()) || + TII->isVOPC(MI.getOpcode())); + + const SIRegisterInfo &TRI = TII->getRegisterInfo(); + MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0); + + // Only one literal constant is allowed per instruction, so if src0 is a + // literal constant then we can't do any folding. + if (Src0->isImm() && TII->isLiteralConstant(*Src0)) + return; + + + // Literal constants and SGPRs can only be used in Src0, so if Src0 is an + // SGPR, we cannot commute the instruction, so we can't fold any literal + // constants. + if (Src0->isReg() && !isVGPR(Src0, TRI, MRI)) + return; + + // Try to fold Src0 + if (Src0->isReg()) { + unsigned Reg = Src0->getReg(); + MachineInstr *Def = MRI.getUniqueVRegDef(Reg); + if (Def && Def->isMoveImmediate()) { + MachineOperand &MovSrc = Def->getOperand(1); + bool ConstantFolded = false; + + if (MovSrc.isImm() && isUInt<32>(MovSrc.getImm())) { + Src0->ChangeToImmediate(MovSrc.getImm()); + ConstantFolded = true; + } + if (ConstantFolded) { + if (MRI.use_empty(Reg)) + Def->eraseFromParent(); + ++NumLiteralConstantsFolded; + return; + } + } + } + + // We have failed to fold src0, so commute the instruction and try again. 
+ if (TryToCommute && MI.isCommutable() && TII->commuteInstruction(&MI)) + foldImmediates(MI, TII, MRI, false); + } bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { MachineRegisterInfo &MRI = MF.getRegInfo(); - const SIInstrInfo *TII = static_cast<const SIInstrInfo *>( - MF.getTarget().getInstrInfo()); + const SIInstrInfo *TII = + static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo()); const SIRegisterInfo &TRI = TII->getRegisterInfo(); std::vector<unsigned> I1Defs; @@ -125,11 +183,23 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { Next = std::next(I); MachineInstr &MI = *I; + // Try to use S_MOVK_I32, which will save 4 bytes for small immediates. + if (MI.getOpcode() == AMDGPU::S_MOV_B32) { + const MachineOperand &Src = MI.getOperand(1); + + if (Src.isImm()) { + if (isInt<16>(Src.getImm()) && !TII->isInlineConstant(Src)) + MI.setDesc(TII->get(AMDGPU::S_MOVK_I32)); + } + + continue; + } + if (!TII->hasVALU32BitEncoding(MI.getOpcode())) continue; if (!canShrink(MI, TII, TRI, MRI)) { - // Try commtuing the instruction and see if that enables us to shrink + // Try commuting the instruction and see if that enables us to shrink // it. if (!MI.isCommutable() || !TII->commuteInstruction(&MI) || !canShrink(MI, TII, TRI, MRI)) @@ -147,18 +217,17 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { unsigned DstReg = MI.getOperand(0).getReg(); if (TargetRegisterInfo::isVirtualRegister(DstReg)) { // VOPC instructions can only write to the VCC register. 
We can't - // force them to use VCC here, because the register allocator - // has trouble with sequences like this, which cause the allocator - // to run out of registes if vreg0 and vreg1 belong to the VCCReg - // register class: + // force them to use VCC here, because the register allocator has + // trouble with sequences like this, which cause the allocator to run + // out of registers if vreg0 and vreg1 belong to the VCCReg register + // class: // vreg0 = VOPC; // vreg1 = VOPC; // S_AND_B64 vreg0, vreg1 // - // So, instead of forcing the instruction to write to VCC, we provide a - // hint to the register allocator to use VCC and then we - // we will run this pass again after RA and shrink it if it outpus to - // VCC. + // So, instead of forcing the instruction to write to VCC, we provide + // a hint to the register allocator to use VCC and then we we will run + // this pass again after RA and shrink it if it outputs to VCC. MRI.setRegAllocationHint(MI.getOperand(0).getReg(), 0, AMDGPU::VCC); continue; } @@ -167,27 +236,28 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { } // We can shrink this instruction - DEBUG(dbgs() << "Shrinking "; MI.dump(); dbgs() << "\n";); + DEBUG(dbgs() << "Shrinking "; MI.dump(); dbgs() << '\n';); - MachineInstrBuilder MIB = + MachineInstrBuilder Inst32 = BuildMI(MBB, I, MI.getDebugLoc(), TII->get(Op32)); // dst - MIB.addOperand(MI.getOperand(0)); + Inst32.addOperand(MI.getOperand(0)); - MIB.addOperand(*TII->getNamedOperand(MI, AMDGPU::OpName::src0)); + Inst32.addOperand(*TII->getNamedOperand(MI, AMDGPU::OpName::src0)); const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); if (Src1) - MIB.addOperand(*Src1); - - for (const MachineOperand &MO : MI.implicit_operands()) - MIB.addOperand(MO); + Inst32.addOperand(*Src1); - DEBUG(dbgs() << "e32 MI = "; MI.dump(); dbgs() << "\n";); ++NumInstructionsShrunk; MI.eraseFromParent(); + + foldImmediates(*Inst32, TII, MRI); + DEBUG(dbgs() << "e32 MI = 
" << *Inst32 << '\n'); + + } } return false; diff --git a/lib/Target/R600/SITypeRewriter.cpp b/lib/Target/R600/SITypeRewriter.cpp index 367963aebb00..9318dc11d55d 100644 --- a/lib/Target/R600/SITypeRewriter.cpp +++ b/lib/Target/R600/SITypeRewriter.cpp @@ -87,7 +87,7 @@ void SITypeRewriter::visitLoadInst(LoadInst &I) { Value *BitCast = Builder.CreateBitCast(Ptr, PointerType::get(v4i32,PtrTy->getPointerAddressSpace())); LoadInst *Load = Builder.CreateLoad(BitCast); - SmallVector <std::pair<unsigned, MDNode*>, 8> MD; + SmallVector<std::pair<unsigned, MDNode *>, 8> MD; I.getAllMetadataOtherThanDebugLoc(MD); for (unsigned i = 0, e = MD.size(); i != e; ++i) { Load->setMetadata(MD[i].first, MD[i].second); diff --git a/lib/Target/R600/TargetInfo/AMDGPUTargetInfo.cpp b/lib/Target/R600/TargetInfo/AMDGPUTargetInfo.cpp index f437564f4b84..d723d6e3e8b7 100644 --- a/lib/Target/R600/TargetInfo/AMDGPUTargetInfo.cpp +++ b/lib/Target/R600/TargetInfo/AMDGPUTargetInfo.cpp @@ -16,11 +16,15 @@ using namespace llvm; -/// \brief The target for the AMDGPU backend +/// \brief The target which suports all AMD GPUs. This will eventually +/// be deprecated and there will be a R600 target and a GCN target. 
Target llvm::TheAMDGPUTarget; +/// \brief The target for GCN GPUs +Target llvm::TheGCNTarget; /// \brief Extern function to initialize the targets for the AMDGPU backend extern "C" void LLVMInitializeR600TargetInfo() { RegisterTarget<Triple::r600, false> R600(TheAMDGPUTarget, "r600", "AMD GPUs HD2XXX-HD6XXX"); + RegisterTarget<Triple::amdgcn, false> GCN(TheGCNTarget, "amdgcn", "AMD GCN GPUs"); } diff --git a/lib/Target/R600/VIInstrFormats.td b/lib/Target/R600/VIInstrFormats.td new file mode 100644 index 000000000000..5285d18ced46 --- /dev/null +++ b/lib/Target/R600/VIInstrFormats.td @@ -0,0 +1,145 @@ +//===-- VIInstrFormats.td - VI Instruction Encodings ----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// VI Instruction format definitions. +// +//===----------------------------------------------------------------------===// + +class DSe_vi <bits<8> op> : Enc64 { + + bits<8> vdst; + bits<1> gds; + bits<8> addr; + bits<8> data0; + bits<8> data1; + bits<8> offset0; + bits<8> offset1; + + let Inst{7-0} = offset0; + let Inst{15-8} = offset1; + let Inst{16} = gds; + let Inst{24-17} = op; + let Inst{31-26} = 0x36; //encoding + let Inst{39-32} = addr; + let Inst{47-40} = data0; + let Inst{55-48} = data1; + let Inst{63-56} = vdst; +} + +class MUBUFe_vi <bits<7> op> : Enc64 { + + bits<12> offset; + bits<1> offen; + bits<1> idxen; + bits<1> glc; + bits<1> lds; + bits<8> vaddr; + bits<8> vdata; + bits<7> srsrc; + bits<1> slc; + bits<1> tfe; + bits<8> soffset; + + let Inst{11-0} = offset; + let Inst{12} = offen; + let Inst{13} = idxen; + let Inst{14} = glc; + let Inst{16} = lds; + let Inst{17} = slc; + let Inst{24-18} = op; + let Inst{31-26} = 0x38; //encoding + let Inst{39-32} = vaddr; + let Inst{47-40} = vdata; + let Inst{52-48} = srsrc{6-2}; + let 
Inst{55} = tfe; + let Inst{63-56} = soffset; +} + +class MTBUFe_vi <bits<4> op> : Enc64 { + + bits<12> offset; + bits<1> offen; + bits<1> idxen; + bits<1> glc; + bits<4> dfmt; + bits<3> nfmt; + bits<8> vaddr; + bits<8> vdata; + bits<7> srsrc; + bits<1> slc; + bits<1> tfe; + bits<8> soffset; + + let Inst{11-0} = offset; + let Inst{12} = offen; + let Inst{13} = idxen; + let Inst{14} = glc; + let Inst{18-15} = op; + let Inst{22-19} = dfmt; + let Inst{25-23} = nfmt; + let Inst{31-26} = 0x3a; //encoding + let Inst{39-32} = vaddr; + let Inst{47-40} = vdata; + let Inst{52-48} = srsrc{6-2}; + let Inst{54} = slc; + let Inst{55} = tfe; + let Inst{63-56} = soffset; +} + +class SMEMe_vi <bits<8> op, bit imm> : Enc64 { + + bits<7> sbase; + bits<7> sdata; + bits<1> glc; + bits<20> offset; + + let Inst{5-0} = sbase{6-1}; + let Inst{12-6} = sdata; + let Inst{16} = glc; + let Inst{17} = imm; + let Inst{25-18} = op; + let Inst{31-26} = 0x30; //encoding + let Inst{51-32} = offset; +} + +class VOP3e_vi <bits<10> op> : Enc64 { + + bits<8> dst; + bits<2> src0_modifiers; + bits<9> src0; + bits<2> src1_modifiers; + bits<9> src1; + bits<2> src2_modifiers; + bits<9> src2; + bits<1> clamp; + bits<2> omod; + + let Inst{7-0} = dst; + let Inst{8} = src0_modifiers{1}; + let Inst{9} = src1_modifiers{1}; + let Inst{10} = src2_modifiers{1}; + let Inst{15} = clamp; + let Inst{25-16} = op; + let Inst{31-26} = 0x34; //encoding + let Inst{40-32} = src0; + let Inst{49-41} = src1; + let Inst{58-50} = src2; + let Inst{60-59} = omod; + let Inst{61} = src0_modifiers{0}; + let Inst{62} = src1_modifiers{0}; + let Inst{63} = src2_modifiers{0}; +} + +class EXPe_vi : EXPe { + let Inst{31-26} = 0x31; //encoding +} + +class VINTRPe_vi <bits<2> op> : VINTRPe <op> { + let Inst{31-26} = 0x35; // encoding +} diff --git a/lib/Target/R600/VIInstructions.td b/lib/Target/R600/VIInstructions.td new file mode 100644 index 000000000000..07cfa29ae12b --- /dev/null +++ b/lib/Target/R600/VIInstructions.td @@ -0,0 +1,89 @@ 
+//===-- VIInstructions.td - VI Instruction Defintions ---------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// Instruction definitions for VI and newer. +//===----------------------------------------------------------------------===// + +let SubtargetPredicate = isVI in { + +def V_LDEXP_F32 : VOP3InstVI <0x288, "v_ldexp_f32", VOP_F32_F32_I32, + AMDGPUldexp +>; +def V_BFM_B32 : VOP3InstVI <0x293, "v_bfm_b32", VOP_I32_I32_I32, AMDGPUbfm>; +def V_BCNT_U32_B32 : VOP3InstVI <0x28b, "v_bcnt_u32_b32", VOP_I32_I32_I32>; +def V_MBCNT_LO_U32_B32 : VOP3InstVI <0x28c, "v_mbcnt_lo_u32_b32", + VOP_I32_I32_I32 +>; +def V_MBCNT_HI_U32_B32 : VOP3InstVI <0x28d, "v_mbcnt_hi_u32_b32", + VOP_I32_I32_I32 +>; + +def V_CVT_PKRTZ_F16_F32 : VOP3InstVI <0x296, "v_cvt_pkrtz_f16_f32", + VOP_I32_F32_F32, int_SI_packf16 +>; + +defm BUFFER_LOAD_DWORD_VI : MUBUF_Load_Helper_vi < + 0x14, "buffer_load_dword", VGPR_32, i32, global_load +>; + +defm BUFFER_LOAD_FORMAT_XYZW_VI : MUBUF_Load_Helper_vi < + 0x03, "buffer_load_format_xyzw", VReg_128 +>; + +} // End SubtargetPredicate = isVI + +//===----------------------------------------------------------------------===// +// VOP2 Patterns +//===----------------------------------------------------------------------===// + +let Predicates = [isVI] in { + +def : Pat < + (int_SI_tid), + (V_MBCNT_HI_U32_B32 0xffffffff, + (V_MBCNT_LO_U32_B32 0xffffffff, 0)) +>; + +//===----------------------------------------------------------------------===// +// SMEM Patterns +//===----------------------------------------------------------------------===// + +// 1. 
Offset as 8bit DWORD immediate +def : Pat < + (SIload_constant v4i32:$sbase, IMM20bit:$offset), + (S_BUFFER_LOAD_DWORD_IMM $sbase, (as_i32imm $offset)) +>; + +//===----------------------------------------------------------------------===// +// MUBUF Patterns +//===----------------------------------------------------------------------===// + +// Offset in an 32Bit VGPR +def : Pat < + (SIload_constant v4i32:$sbase, i32:$voff), + (BUFFER_LOAD_DWORD_VI_OFFEN $sbase, $voff, 0, 0, 0, 0, 0) +>; + +// Offset in an 32Bit VGPR +def : Pat < + (SIload_constant v4i32:$sbase, i32:$voff), + (BUFFER_LOAD_DWORD_VI_OFFEN $sbase, $voff, 0, 0, 0, 0, 0) +>; + +/* int_SI_vs_load_input */ +def : Pat< + (SIload_input v4i32:$tlst, imm:$attr_offset, i32:$buf_idx_vgpr), + (BUFFER_LOAD_FORMAT_XYZW_VI_IDXEN $tlst, $buf_idx_vgpr, imm:$attr_offset, 0, 0, 0, 0) +>; + +defm : MUBUF_Load_Dword <i32, BUFFER_LOAD_DWORD_VI_OFFSET, + BUFFER_LOAD_DWORD_VI_OFFEN, + BUFFER_LOAD_DWORD_VI_IDXEN, + BUFFER_LOAD_DWORD_VI_BOTHEN>; + +} // End Predicates = [isVI] |