author     Dimitry Andric <dim@FreeBSD.org>   2015-01-18 16:17:27 +0000
committer  Dimitry Andric <dim@FreeBSD.org>   2015-01-18 16:17:27 +0000
commit     67c32a98315f785a9ec9d531c1f571a0196c7463 (patch)
tree       4abb9cbeecc7901726dd0b4a37369596c852e9ef /lib/Target/R600
parent     9f61947910e6ab40de38e6b4034751ef1513200f (diff)
download   src-67c32a98315f785a9ec9d531c1f571a0196c7463.tar.gz
           src-67c32a98315f785a9ec9d531c1f571a0196c7463.zip
Vendor import of llvm RELEASE_360/rc1 tag r226102 (effectively, 3.6.0 RC1):
tag: vendor/llvm/llvm-release_360-r226102
Notes:
    svn path=/vendor/llvm/dist/; revision=277323
    svn path=/vendor/llvm/llvm-release_360-r226102/; revision=277324; tag=vendor/llvm/llvm-release_360-r226102
Diffstat (limited to 'lib/Target/R600')
-rw-r--r--  lib/Target/R600/AMDGPU.h | 17
-rw-r--r--  lib/Target/R600/AMDGPU.td | 25
-rw-r--r--  lib/Target/R600/AMDGPUAlwaysInlinePass.cpp | 66
-rw-r--r--  lib/Target/R600/AMDGPUAsmPrinter.cpp | 270
-rw-r--r--  lib/Target/R600/AMDGPUAsmPrinter.h | 38
-rw-r--r--  lib/Target/R600/AMDGPUCallingConv.td | 32
-rw-r--r--  lib/Target/R600/AMDGPUFrameLowering.h | 6
-rw-r--r--  lib/Target/R600/AMDGPUISelDAGToDAG.cpp | 471
-rw-r--r--  lib/Target/R600/AMDGPUISelLowering.cpp | 1038
-rw-r--r--  lib/Target/R600/AMDGPUISelLowering.h | 91
-rw-r--r--  lib/Target/R600/AMDGPUInstrInfo.cpp | 58
-rw-r--r--  lib/Target/R600/AMDGPUInstrInfo.h | 19
-rw-r--r--  lib/Target/R600/AMDGPUInstrInfo.td | 63
-rw-r--r--  lib/Target/R600/AMDGPUInstructions.td | 115
-rw-r--r--  lib/Target/R600/AMDGPUIntrinsicInfo.cpp | 2
-rw-r--r--  lib/Target/R600/AMDGPUIntrinsicInfo.h | 8
-rw-r--r--  lib/Target/R600/AMDGPUMCInstLower.cpp | 23
-rw-r--r--  lib/Target/R600/AMDGPUMCInstLower.h | 9
-rw-r--r--  lib/Target/R600/AMDGPUMachineFunction.cpp | 4
-rw-r--r--  lib/Target/R600/AMDGPUMachineFunction.h | 12
-rw-r--r--  lib/Target/R600/AMDGPUPromoteAlloca.cpp | 48
-rw-r--r--  lib/Target/R600/AMDGPURegisterInfo.cpp | 3
-rw-r--r--  lib/Target/R600/AMDGPURegisterInfo.h | 8
-rw-r--r--  lib/Target/R600/AMDGPUSubtarget.cpp | 80
-rw-r--r--  lib/Target/R600/AMDGPUSubtarget.h | 58
-rw-r--r--  lib/Target/R600/AMDGPUTargetMachine.cpp | 145
-rw-r--r--  lib/Target/R600/AMDGPUTargetMachine.h | 52
-rw-r--r--  lib/Target/R600/AMDGPUTargetTransformInfo.cpp | 20
-rw-r--r--  lib/Target/R600/AMDILCFGStructurizer.cpp | 5
-rw-r--r--  lib/Target/R600/AMDKernelCodeT.h | 704
-rw-r--r--  lib/Target/R600/AsmParser/AMDGPUAsmParser.cpp | 320
-rw-r--r--  lib/Target/R600/AsmParser/CMakeLists.txt | 3
-rw-r--r--  lib/Target/R600/AsmParser/LLVMBuild.txt | 23
-rw-r--r--  lib/Target/R600/AsmParser/Makefile | 15
-rw-r--r--  lib/Target/R600/CIInstructions.td | 42
-rw-r--r--  lib/Target/R600/CMakeLists.txt | 8
-rw-r--r--  lib/Target/R600/CaymanInstructions.td | 2
-rw-r--r--  lib/Target/R600/EvergreenInstructions.td | 73
-rw-r--r--  lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp | 275
-rw-r--r--  lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h | 23
-rw-r--r--  lib/Target/R600/LLVMBuild.txt | 5
-rw-r--r--  lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp | 13
-rw-r--r--  lib/Target/R600/MCTargetDesc/AMDGPUFixupKinds.h | 6
-rw-r--r--  lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp | 23
-rw-r--r--  lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.h | 18
-rw-r--r--  lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.h | 6
-rw-r--r--  lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp | 19
-rw-r--r--  lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h | 7
-rw-r--r--  lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp | 13
-rw-r--r--  lib/Target/R600/Makefile | 4
-rw-r--r--  lib/Target/R600/Processors.td | 32
-rw-r--r--  lib/Target/R600/R600ClauseMergePass.cpp | 3
-rw-r--r--  lib/Target/R600/R600ControlFlowFinalizer.cpp | 21
-rw-r--r--  lib/Target/R600/R600Defines.h | 6
-rw-r--r--  lib/Target/R600/R600EmitClauseMarkers.cpp | 3
-rw-r--r--  lib/Target/R600/R600ExpandSpecialInstrs.cpp | 3
-rw-r--r--  lib/Target/R600/R600ISelLowering.cpp | 176
-rw-r--r--  lib/Target/R600/R600ISelLowering.h | 6
-rw-r--r--  lib/Target/R600/R600InstrFormats.td | 3
-rw-r--r--  lib/Target/R600/R600InstrInfo.cpp | 15
-rw-r--r--  lib/Target/R600/R600InstrInfo.h | 12
-rw-r--r--  lib/Target/R600/R600Instructions.td | 44
-rw-r--r--  lib/Target/R600/R600MachineFunctionInfo.h | 6
-rw-r--r--  lib/Target/R600/R600MachineScheduler.cpp | 33
-rw-r--r--  lib/Target/R600/R600MachineScheduler.h | 4
-rw-r--r--  lib/Target/R600/R600OptimizeVectorRegisters.cpp | 8
-rw-r--r--  lib/Target/R600/R600Packetizer.cpp | 15
-rw-r--r--  lib/Target/R600/R600RegisterInfo.h | 6
-rw-r--r--  lib/Target/R600/SIDefines.h | 96
-rw-r--r--  lib/Target/R600/SIFixSGPRCopies.cpp | 106
-rw-r--r--  lib/Target/R600/SIFixSGPRLiveRanges.cpp | 147
-rw-r--r--  lib/Target/R600/SIFoldOperands.cpp | 275
-rw-r--r--  lib/Target/R600/SIISelLowering.cpp | 1643
-rw-r--r--  lib/Target/R600/SIISelLowering.h | 57
-rw-r--r--  lib/Target/R600/SIInsertWaits.cpp | 90
-rw-r--r--  lib/Target/R600/SIInstrFormats.td | 256
-rw-r--r--  lib/Target/R600/SIInstrInfo.cpp | 1702
-rw-r--r--  lib/Target/R600/SIInstrInfo.h | 195
-rw-r--r--  lib/Target/R600/SIInstrInfo.td | 1817
-rw-r--r--  lib/Target/R600/SIInstructions.td | 3016
-rw-r--r--  lib/Target/R600/SILoadStoreOptimizer.cpp | 434
-rw-r--r--  lib/Target/R600/SILowerControlFlow.cpp | 86
-rw-r--r--  lib/Target/R600/SILowerI1Copies.cpp | 105
-rw-r--r--  lib/Target/R600/SIMachineFunctionInfo.cpp | 96
-rw-r--r--  lib/Target/R600/SIMachineFunctionInfo.h | 41
-rw-r--r--  lib/Target/R600/SIPrepareScratchRegs.cpp | 196
-rw-r--r--  lib/Target/R600/SIRegisterInfo.cpp | 355
-rw-r--r--  lib/Target/R600/SIRegisterInfo.h | 50
-rw-r--r--  lib/Target/R600/SIRegisterInfo.td | 78
-rw-r--r--  lib/Target/R600/SISchedule.td | 80
-rw-r--r--  lib/Target/R600/SIShrinkInstructions.cpp | 136
-rw-r--r--  lib/Target/R600/SITypeRewriter.cpp | 2
-rw-r--r--  lib/Target/R600/TargetInfo/AMDGPUTargetInfo.cpp | 6
-rw-r--r--  lib/Target/R600/VIInstrFormats.td | 145
-rw-r--r--  lib/Target/R600/VIInstructions.td | 89
95 files changed, 11789 insertions, 4294 deletions
diff --git a/lib/Target/R600/AMDGPU.h b/lib/Target/R600/AMDGPU.h
index d7e94f75e123..fcf9eca80e96 100644
--- a/lib/Target/R600/AMDGPU.h
+++ b/lib/Target/R600/AMDGPU.h
@@ -8,8 +8,8 @@
/// \file
//===----------------------------------------------------------------------===//
-#ifndef AMDGPU_H
-#define AMDGPU_H
+#ifndef LLVM_LIB_TARGET_R600_AMDGPU_H
+#define LLVM_LIB_TARGET_R600_AMDGPU_H
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/TargetMachine.h"
@@ -38,21 +38,31 @@ FunctionPass *createAMDGPUCFGStructurizerPass();
// SI Passes
FunctionPass *createSITypeRewriter();
FunctionPass *createSIAnnotateControlFlowPass();
+FunctionPass *createSIFoldOperandsPass();
FunctionPass *createSILowerI1CopiesPass();
FunctionPass *createSIShrinkInstructionsPass();
+FunctionPass *createSILoadStoreOptimizerPass(TargetMachine &tm);
FunctionPass *createSILowerControlFlowPass(TargetMachine &tm);
FunctionPass *createSIFixSGPRCopiesPass(TargetMachine &tm);
FunctionPass *createSIFixSGPRLiveRangesPass();
FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS);
FunctionPass *createSIInsertWaits(TargetMachine &tm);
+FunctionPass *createSIPrepareScratchRegs();
+
+void initializeSIFoldOperandsPass(PassRegistry &);
+extern char &SIFoldOperandsID;
void initializeSILowerI1CopiesPass(PassRegistry &);
extern char &SILowerI1CopiesID;
+void initializeSILoadStoreOptimizerPass(PassRegistry &);
+extern char &SILoadStoreOptimizerID;
+
// Passes common to R600 and SI
FunctionPass *createAMDGPUPromoteAlloca(const AMDGPUSubtarget &ST);
Pass *createAMDGPUStructurizeCFGPass();
FunctionPass *createAMDGPUISelDag(TargetMachine &tm);
+ModulePass *createAMDGPUAlwaysInlinePass();
/// \brief Creates an AMDGPU-specific Target Transformation Info pass.
ImmutablePass *
@@ -63,6 +73,7 @@ extern char &SIFixSGPRLiveRangesID;
extern Target TheAMDGPUTarget;
+extern Target TheGCNTarget;
namespace AMDGPU {
enum TargetIndex {
@@ -127,4 +138,4 @@ enum AddressSpaces {
} // namespace AMDGPUAS
-#endif // AMDGPU_H
+#endif
diff --git a/lib/Target/R600/AMDGPU.td b/lib/Target/R600/AMDGPU.td
index 5645f1a2322e..8a5ca613dc80 100644
--- a/lib/Target/R600/AMDGPU.td
+++ b/lib/Target/R600/AMDGPU.td
@@ -81,6 +81,17 @@ def FeatureCFALUBug : SubtargetFeature<"cfalubug",
"true",
"GPU has CF_ALU bug">;
+// XXX - This should probably be removed once enabled by default
+def FeatureEnableLoadStoreOpt : SubtargetFeature <"load-store-opt",
+ "EnableLoadStoreOpt",
+ "true",
+ "Enable SI load/store optimizer pass">;
+
+def FeatureFlatAddressSpace : SubtargetFeature<"flat-address-space",
+ "FlatAddressSpace",
+ "true",
+ "Support flat address space">;
+
class SubtargetFeatureFetchLimit <string Value> :
SubtargetFeature <"fetch"#Value,
"TexVTXClauseSize",
@@ -135,16 +146,28 @@ def FeatureSouthernIslands : SubtargetFeatureGeneration<"SOUTHERN_ISLANDS",
def FeatureSeaIslands : SubtargetFeatureGeneration<"SEA_ISLANDS",
[Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize65536,
- FeatureWavefrontSize64]>;
+ FeatureWavefrontSize64, FeatureFlatAddressSpace]>;
+
+def FeatureVolcanicIslands : SubtargetFeatureGeneration<"VOLCANIC_ISLANDS",
+ [Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize65536,
+ FeatureWavefrontSize64, FeatureFlatAddressSpace]>;
+
//===----------------------------------------------------------------------===//
def AMDGPUInstrInfo : InstrInfo {
let guessInstructionProperties = 1;
}
+def AMDGPUAsmParser : AsmParser {
+ // Some of the R600 registers have the same name, so this crashes.
+ // For example T0_XYZW and T0_XY both have the asm name T0.
+ let ShouldEmitMatchRegisterName = 0;
+}
+
def AMDGPU : Target {
// Pull in Instruction Info:
let InstructionSet = AMDGPUInstrInfo;
+ let AssemblyParsers = [AMDGPUAsmParser];
}
// Dummy Instruction itineraries for pseudo instructions
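The two features added above follow the usual SubtargetFeature convention: the first string ("load-store-opt", "flat-address-space") is the name used in a target feature string, and the second ("EnableLoadStoreOpt", "FlatAddressSpace") is the boolean field it toggles. A toy, self-contained sketch of that convention follows; the ToySubtarget class and its parser are purely illustrative stand-ins, not the TableGen-generated code.

```cpp
#include <iostream>
#include <sstream>
#include <string>

// Toy stand-in for a generated subtarget: two booleans mirroring the
// EnableLoadStoreOpt / FlatAddressSpace fields defined above.
struct ToySubtarget {
  bool EnableLoadStoreOpt = false;
  bool FlatAddressSpace = false;

  // Apply a comma-separated feature string; each token is expected to be
  // "+name" (enable) or "-name" (disable), in the -mattr style.
  void applyFeatures(const std::string &FS) {
    std::stringstream SS(FS);
    std::string Tok;
    while (std::getline(SS, Tok, ',')) {
      if (Tok.empty())
        continue;
      bool Enable = Tok[0] != '-';
      std::string Name = Tok.substr(1);
      if (Name == "load-store-opt")
        EnableLoadStoreOpt = Enable;
      if (Name == "flat-address-space")
        FlatAddressSpace = Enable;
    }
  }
};

int main() {
  ToySubtarget ST;
  ST.applyFeatures("+load-store-opt,+flat-address-space");
  std::cout << ST.EnableLoadStoreOpt << " " << ST.FlatAddressSpace << "\n"; // 1 1
  return 0;
}
```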
diff --git a/lib/Target/R600/AMDGPUAlwaysInlinePass.cpp b/lib/Target/R600/AMDGPUAlwaysInlinePass.cpp
new file mode 100644
index 000000000000..b545b456161f
--- /dev/null
+++ b/lib/Target/R600/AMDGPUAlwaysInlinePass.cpp
@@ -0,0 +1,66 @@
+//===-- AMDGPUAlwaysInlinePass.cpp - Promote Allocas ----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass marks all internal functions as always_inline and creates
+/// duplicates of all other functions and marks the duplicates as always_inline.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+
+using namespace llvm;
+
+namespace {
+
+class AMDGPUAlwaysInline : public ModulePass {
+
+ static char ID;
+
+public:
+ AMDGPUAlwaysInline() : ModulePass(ID) { }
+ bool runOnModule(Module &M) override;
+ const char *getPassName() const override { return "AMDGPU Always Inline Pass"; }
+};
+
+} // End anonymous namespace
+
+char AMDGPUAlwaysInline::ID = 0;
+
+bool AMDGPUAlwaysInline::runOnModule(Module &M) {
+
+ std::vector<Function*> FuncsToClone;
+ for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) {
+ Function &F = *I;
+ if (!F.hasLocalLinkage() && !F.isDeclaration() && !F.use_empty())
+ FuncsToClone.push_back(&F);
+ }
+
+ for (Function *F : FuncsToClone) {
+ ValueToValueMapTy VMap;
+ Function *NewFunc = CloneFunction(F, VMap, false);
+ NewFunc->setLinkage(GlobalValue::InternalLinkage);
+ F->getParent()->getFunctionList().push_back(NewFunc);
+ F->replaceAllUsesWith(NewFunc);
+ }
+
+ for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) {
+ Function &F = *I;
+ if (F.hasLocalLinkage()) {
+ F.addFnAttr(Attribute::AlwaysInline);
+ }
+ }
+ return false;
+}
+
+ModulePass *llvm::createAMDGPUAlwaysInlinePass() {
+ return new AMDGPUAlwaysInline();
+}
diff --git a/lib/Target/R600/AMDGPUAsmPrinter.cpp b/lib/Target/R600/AMDGPUAsmPrinter.cpp
index 73faaa183581..624f3919b409 100644
--- a/lib/Target/R600/AMDGPUAsmPrinter.cpp
+++ b/lib/Target/R600/AMDGPUAsmPrinter.cpp
@@ -18,6 +18,7 @@
#include "AMDGPUAsmPrinter.h"
#include "AMDGPU.h"
+#include "AMDKernelCodeT.h"
#include "AMDGPUSubtarget.h"
#include "R600Defines.h"
#include "R600MachineFunctionInfo.h"
@@ -79,6 +80,7 @@ static AsmPrinter *createAMDGPUAsmPrinterPass(TargetMachine &tm,
extern "C" void LLVMInitializeR600AsmPrinter() {
TargetRegistry::RegisterAsmPrinter(TheAMDGPUTarget, createAMDGPUAsmPrinterPass);
+ TargetRegistry::RegisterAsmPrinter(TheGCNTarget, createAMDGPUAsmPrinterPass);
}
AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
@@ -97,9 +99,13 @@ void AMDGPUAsmPrinter::EmitEndOfAsmFile(Module &M) {
}
bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
+
+ // The starting address of all shader programs must be 256 bytes aligned.
+ MF.setAlignment(8);
+
SetupMachineFunction(MF);
- OutStreamer.emitRawComment(Twine('@') + MF.getName() + Twine(':'));
+ EmitFunctionHeader();
MCContext &Context = getObjFileLowering().getContext();
const MCSectionELF *ConfigSection = Context.getELFSection(".AMDGPU.config",
@@ -109,7 +115,12 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
const AMDGPUSubtarget &STM = TM.getSubtarget<AMDGPUSubtarget>();
SIProgramInfo KernelInfo;
- if (STM.getGeneration() > AMDGPUSubtarget::NORTHERN_ISLANDS) {
+ if (STM.isAmdHsaOS()) {
+ OutStreamer.SwitchSection(getObjFileLowering().getTextSection());
+ getSIProgramInfo(KernelInfo, MF);
+ EmitAmdKernelCodeT(MF, KernelInfo);
+ OutStreamer.EmitCodeAlignment(2 << (MF.getAlignment() - 1));
+ } else if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
getSIProgramInfo(KernelInfo, MF);
EmitProgramInfoSI(MF, KernelInfo);
} else {
@@ -151,23 +162,18 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
}
}
- if (STM.dumpCode()) {
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
- MF.dump();
-#endif
+ if (STM.dumpCode() && DisasmEnabled) {
- if (DisasmEnabled) {
- OutStreamer.SwitchSection(Context.getELFSection(".AMDGPU.disasm",
- ELF::SHT_NOTE, 0,
- SectionKind::getReadOnly()));
+ OutStreamer.SwitchSection(Context.getELFSection(".AMDGPU.disasm",
+ ELF::SHT_NOTE, 0,
+ SectionKind::getReadOnly()));
- for (size_t i = 0; i < DisasmLines.size(); ++i) {
- std::string Comment(DisasmLineMaxLen - DisasmLines[i].size(), ' ');
- Comment += " ; " + HexLines[i] + "\n";
+ for (size_t i = 0; i < DisasmLines.size(); ++i) {
+ std::string Comment(DisasmLineMaxLen - DisasmLines[i].size(), ' ');
+ Comment += " ; " + HexLines[i] + "\n";
- OutStreamer.EmitBytes(StringRef(DisasmLines[i]));
- OutStreamer.EmitBytes(StringRef(Comment));
- }
+ OutStreamer.EmitBytes(StringRef(DisasmLines[i]));
+ OutStreamer.EmitBytes(StringRef(Comment));
}
}
@@ -177,8 +183,8 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
void AMDGPUAsmPrinter::EmitProgramInfoR600(const MachineFunction &MF) {
unsigned MaxGPR = 0;
bool killPixel = false;
- const R600RegisterInfo *RI
- = static_cast<const R600RegisterInfo*>(TM.getRegisterInfo());
+ const R600RegisterInfo *RI = static_cast<const R600RegisterInfo *>(
+ TM.getSubtargetImpl()->getRegisterInfo());
const R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
const AMDGPUSubtarget &STM = TM.getSubtarget<AMDGPUSubtarget>();
@@ -236,12 +242,15 @@ void AMDGPUAsmPrinter::EmitProgramInfoR600(const MachineFunction &MF) {
void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
const MachineFunction &MF) const {
+ const AMDGPUSubtarget &STM = TM.getSubtarget<AMDGPUSubtarget>();
+ const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
uint64_t CodeSize = 0;
unsigned MaxSGPR = 0;
unsigned MaxVGPR = 0;
bool VCCUsed = false;
- const SIRegisterInfo *RI
- = static_cast<const SIRegisterInfo*>(TM.getRegisterInfo());
+ bool FlatUsed = false;
+ const SIRegisterInfo *RI = static_cast<const SIRegisterInfo *>(
+ TM.getSubtargetImpl()->getRegisterInfo());
for (const MachineBasicBlock &MBB : MF) {
for (const MachineInstr &MI : MBB) {
@@ -262,6 +271,11 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
reg == AMDGPU::VCC_HI) {
VCCUsed = true;
continue;
+ } else if (reg == AMDGPU::FLAT_SCR ||
+ reg == AMDGPU::FLAT_SCR_LO ||
+ reg == AMDGPU::FLAT_SCR_HI) {
+ FlatUsed = true;
+ continue;
}
switch (reg) {
@@ -275,7 +289,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
if (AMDGPU::SReg_32RegClass.contains(reg)) {
isSGPR = true;
width = 1;
- } else if (AMDGPU::VReg_32RegClass.contains(reg)) {
+ } else if (AMDGPU::VGPR_32RegClass.contains(reg)) {
isSGPR = false;
width = 1;
} else if (AMDGPU::SReg_64RegClass.contains(reg)) {
@@ -322,9 +336,16 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
if (VCCUsed)
MaxSGPR += 2;
- ProgInfo.NumVGPR = MaxVGPR;
- ProgInfo.NumSGPR = MaxSGPR;
+ if (FlatUsed)
+ MaxSGPR += 2;
+
+ // We found the maximum register index. They start at 0, so add one to get the
+ // number of registers.
+ ProgInfo.NumVGPR = MaxVGPR + 1;
+ ProgInfo.NumSGPR = MaxSGPR + 1;
+ ProgInfo.VGPRBlocks = (ProgInfo.NumVGPR - 1) / 4;
+ ProgInfo.SGPRBlocks = (ProgInfo.NumSGPR - 1) / 8;
// Set the value to initialize FP_ROUND and FP_DENORM parts of the mode
// register.
ProgInfo.FloatMode = getFPMode(MF);
@@ -338,22 +359,9 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
const MachineFrameInfo *FrameInfo = MF.getFrameInfo();
ProgInfo.ScratchSize = FrameInfo->estimateStackSize(MF);
+ ProgInfo.FlatUsed = FlatUsed;
+ ProgInfo.VCCUsed = VCCUsed;
ProgInfo.CodeLen = CodeSize;
-}
-
-void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
- const SIProgramInfo &KernelInfo) {
- const AMDGPUSubtarget &STM = TM.getSubtarget<AMDGPUSubtarget>();
- const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
-
- unsigned RsrcReg;
- switch (MFI->getShaderType()) {
- default: // Fall through
- case ShaderType::COMPUTE: RsrcReg = R_00B848_COMPUTE_PGM_RSRC1; break;
- case ShaderType::GEOMETRY: RsrcReg = R_00B228_SPI_SHADER_PGM_RSRC1_GS; break;
- case ShaderType::PIXEL: RsrcReg = R_00B028_SPI_SHADER_PGM_RSRC1_PS; break;
- case ShaderType::VERTEX: RsrcReg = R_00B128_SPI_SHADER_PGM_RSRC1_VS; break;
- }
unsigned LDSAlignShift;
if (STM.getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) {
@@ -364,52 +372,188 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
LDSAlignShift = 9;
}
- unsigned LDSBlocks =
- RoundUpToAlignment(MFI->LDSSize, 1 << LDSAlignShift) >> LDSAlignShift;
+ unsigned LDSSpillSize = MFI->LDSWaveSpillSize *
+ MFI->getMaximumWorkGroupSize(MF);
+
+ ProgInfo.LDSSize = MFI->LDSSize + LDSSpillSize;
+ ProgInfo.LDSBlocks =
+ RoundUpToAlignment(ProgInfo.LDSSize, 1 << LDSAlignShift) >> LDSAlignShift;
// Scratch is allocated in 256 dword blocks.
unsigned ScratchAlignShift = 10;
// We need to program the hardware with the amount of scratch memory that
- // is used by the entire wave. KernelInfo.ScratchSize is the amount of
+ // is used by the entire wave. ProgInfo.ScratchSize is the amount of
// scratch memory used per thread.
- unsigned ScratchBlocks =
- RoundUpToAlignment(KernelInfo.ScratchSize * STM.getWavefrontSize(),
+ ProgInfo.ScratchBlocks =
+ RoundUpToAlignment(ProgInfo.ScratchSize * STM.getWavefrontSize(),
1 << ScratchAlignShift) >> ScratchAlignShift;
+ ProgInfo.ComputePGMRSrc1 =
+ S_00B848_VGPRS(ProgInfo.VGPRBlocks) |
+ S_00B848_SGPRS(ProgInfo.SGPRBlocks) |
+ S_00B848_PRIORITY(ProgInfo.Priority) |
+ S_00B848_FLOAT_MODE(ProgInfo.FloatMode) |
+ S_00B848_PRIV(ProgInfo.Priv) |
+ S_00B848_DX10_CLAMP(ProgInfo.DX10Clamp) |
+ S_00B848_IEEE_MODE(ProgInfo.DebugMode) |
+ S_00B848_IEEE_MODE(ProgInfo.IEEEMode);
+
+ ProgInfo.ComputePGMRSrc2 =
+ S_00B84C_SCRATCH_EN(ProgInfo.ScratchBlocks > 0) |
+ S_00B84C_USER_SGPR(MFI->NumUserSGPRs) |
+ S_00B84C_TGID_X_EN(1) |
+ S_00B84C_TGID_Y_EN(1) |
+ S_00B84C_TGID_Z_EN(1) |
+ S_00B84C_TG_SIZE_EN(1) |
+ S_00B84C_TIDIG_COMP_CNT(2) |
+ S_00B84C_LDS_SIZE(ProgInfo.LDSBlocks);
+}
+
+static unsigned getRsrcReg(unsigned ShaderType) {
+ switch (ShaderType) {
+ default: // Fall through
+ case ShaderType::COMPUTE: return R_00B848_COMPUTE_PGM_RSRC1;
+ case ShaderType::GEOMETRY: return R_00B228_SPI_SHADER_PGM_RSRC1_GS;
+ case ShaderType::PIXEL: return R_00B028_SPI_SHADER_PGM_RSRC1_PS;
+ case ShaderType::VERTEX: return R_00B128_SPI_SHADER_PGM_RSRC1_VS;
+ }
+}
+
+void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
+ const SIProgramInfo &KernelInfo) {
+ const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ unsigned RsrcReg = getRsrcReg(MFI->getShaderType());
+
if (MFI->getShaderType() == ShaderType::COMPUTE) {
OutStreamer.EmitIntValue(R_00B848_COMPUTE_PGM_RSRC1, 4);
- const uint32_t ComputePGMRSrc1 =
- S_00B848_VGPRS(KernelInfo.NumVGPR / 4) |
- S_00B848_SGPRS(KernelInfo.NumSGPR / 8) |
- S_00B848_PRIORITY(KernelInfo.Priority) |
- S_00B848_FLOAT_MODE(KernelInfo.FloatMode) |
- S_00B848_PRIV(KernelInfo.Priv) |
- S_00B848_DX10_CLAMP(KernelInfo.DX10Clamp) |
- S_00B848_IEEE_MODE(KernelInfo.DebugMode) |
- S_00B848_IEEE_MODE(KernelInfo.IEEEMode);
-
- OutStreamer.EmitIntValue(ComputePGMRSrc1, 4);
+ OutStreamer.EmitIntValue(KernelInfo.ComputePGMRSrc1, 4);
OutStreamer.EmitIntValue(R_00B84C_COMPUTE_PGM_RSRC2, 4);
- const uint32_t ComputePGMRSrc2 =
- S_00B84C_LDS_SIZE(LDSBlocks) |
- S_00B02C_SCRATCH_EN(ScratchBlocks > 0);
-
- OutStreamer.EmitIntValue(ComputePGMRSrc2, 4);
+ OutStreamer.EmitIntValue(KernelInfo.ComputePGMRSrc2, 4);
OutStreamer.EmitIntValue(R_00B860_COMPUTE_TMPRING_SIZE, 4);
- OutStreamer.EmitIntValue(S_00B860_WAVESIZE(ScratchBlocks), 4);
+ OutStreamer.EmitIntValue(S_00B860_WAVESIZE(KernelInfo.ScratchBlocks), 4);
+
+ // TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 =
+ // 0" comment but I don't see a corresponding field in the register spec.
} else {
OutStreamer.EmitIntValue(RsrcReg, 4);
- OutStreamer.EmitIntValue(S_00B028_VGPRS(KernelInfo.NumVGPR / 4) |
- S_00B028_SGPRS(KernelInfo.NumSGPR / 8), 4);
+ OutStreamer.EmitIntValue(S_00B028_VGPRS(KernelInfo.VGPRBlocks) |
+ S_00B028_SGPRS(KernelInfo.SGPRBlocks), 4);
}
if (MFI->getShaderType() == ShaderType::PIXEL) {
OutStreamer.EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4);
- OutStreamer.EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(LDSBlocks), 4);
+ OutStreamer.EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(KernelInfo.LDSBlocks), 4);
OutStreamer.EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4);
OutStreamer.EmitIntValue(MFI->PSInputAddr, 4);
}
}
+
+void AMDGPUAsmPrinter::EmitAmdKernelCodeT(const MachineFunction &MF,
+ const SIProgramInfo &KernelInfo) const {
+ const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ const AMDGPUSubtarget &STM = TM.getSubtarget<AMDGPUSubtarget>();
+ amd_kernel_code_t header;
+
+ memset(&header, 0, sizeof(header));
+
+ header.amd_code_version_major = AMD_CODE_VERSION_MAJOR;
+ header.amd_code_version_minor = AMD_CODE_VERSION_MINOR;
+
+ header.struct_byte_size = sizeof(amd_kernel_code_t);
+
+ header.target_chip = STM.getAmdKernelCodeChipID();
+
+ header.kernel_code_entry_byte_offset = (1ULL << MF.getAlignment());
+
+ header.compute_pgm_resource_registers =
+ KernelInfo.ComputePGMRSrc1 |
+ (KernelInfo.ComputePGMRSrc2 << 32);
+
+ // Code Properties:
+ header.code_properties = AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR |
+ AMD_CODE_PROPERTY_IS_PTR64;
+
+ if (KernelInfo.FlatUsed)
+ header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
+
+ if (KernelInfo.ScratchBlocks)
+ header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE;
+
+ header.workitem_private_segment_byte_size = KernelInfo.ScratchSize;
+ header.workgroup_group_segment_byte_size = KernelInfo.LDSSize;
+
+ // MFI->ABIArgOffset is the number of bytes for the kernel arguments
+ // plus 36. 36 is the number of bytes reserved at the beginning of the
+ // input buffer to store work-group size information.
+ // FIXME: We should be adding the size of the implicit arguments
+ // to this value.
+ header.kernarg_segment_byte_size = MFI->ABIArgOffset;
+
+ header.wavefront_sgpr_count = KernelInfo.NumSGPR;
+ header.workitem_vgpr_count = KernelInfo.NumVGPR;
+
+ // FIXME: What values do I put for these alignments
+ header.kernarg_segment_alignment = 0;
+ header.group_segment_alignment = 0;
+ header.private_segment_alignment = 0;
+
+ header.code_type = 1; // HSA_EXT_CODE_KERNEL
+
+ header.wavefront_size = STM.getWavefrontSize();
+
+ if (isVerbose()) {
+ OutStreamer.emitRawComment("amd_code_version_major = " +
+ Twine(header.amd_code_version_major), false);
+ OutStreamer.emitRawComment("amd_code_version_minor = " +
+ Twine(header.amd_code_version_minor), false);
+ OutStreamer.emitRawComment("struct_byte_size = " +
+ Twine(header.struct_byte_size), false);
+ OutStreamer.emitRawComment("target_chip = " +
+ Twine(header.target_chip), false);
+ OutStreamer.emitRawComment(" compute_pgm_rsrc1: " +
+ Twine::utohexstr(KernelInfo.ComputePGMRSrc1), false);
+ OutStreamer.emitRawComment(" compute_pgm_rsrc2: " +
+ Twine::utohexstr(KernelInfo.ComputePGMRSrc2), false);
+ OutStreamer.emitRawComment("enable_sgpr_private_segment_buffer = " +
+ Twine((bool)(header.code_properties &
+ AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE)), false);
+ OutStreamer.emitRawComment("enable_sgpr_kernarg_segment_ptr = " +
+ Twine((bool)(header.code_properties &
+ AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR)), false);
+ OutStreamer.emitRawComment("private_element_size = 2 ", false);
+ OutStreamer.emitRawComment("is_ptr64 = " +
+ Twine((bool)(header.code_properties & AMD_CODE_PROPERTY_IS_PTR64)), false);
+ OutStreamer.emitRawComment("workitem_private_segment_byte_size = " +
+ Twine(header.workitem_private_segment_byte_size),
+ false);
+ OutStreamer.emitRawComment("workgroup_group_segment_byte_size = " +
+ Twine(header.workgroup_group_segment_byte_size),
+ false);
+ OutStreamer.emitRawComment("gds_segment_byte_size = " +
+ Twine(header.gds_segment_byte_size), false);
+ OutStreamer.emitRawComment("kernarg_segment_byte_size = " +
+ Twine(header.kernarg_segment_byte_size), false);
+ OutStreamer.emitRawComment("wavefront_sgpr_count = " +
+ Twine(header.wavefront_sgpr_count), false);
+ OutStreamer.emitRawComment("workitem_vgpr_count = " +
+ Twine(header.workitem_vgpr_count), false);
+ OutStreamer.emitRawComment("code_type = " + Twine(header.code_type), false);
+ OutStreamer.emitRawComment("wavefront_size = " +
+ Twine((int)header.wavefront_size), false);
+ OutStreamer.emitRawComment("optimization_level = " +
+ Twine(header.optimization_level), false);
+ OutStreamer.emitRawComment("hsail_profile = " +
+ Twine(header.hsail_profile), false);
+ OutStreamer.emitRawComment("hsail_machine_model = " +
+ Twine(header.hsail_machine_model), false);
+ OutStreamer.emitRawComment("hsail_version_major = " +
+ Twine(header.hsail_version_major), false);
+ OutStreamer.emitRawComment("hsail_version_minor = " +
+ Twine(header.hsail_version_minor), false);
+ }
+
+ OutStreamer.EmitBytes(StringRef((char*)&header, sizeof(header)));
+}
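For reference, the block encodings computed above reduce to simple integer arithmetic: register blocks use a granularity of 4 VGPRs and 8 SGPRs, scratch is counted in 256-dword (1024-byte) wave-level blocks, and LDS in 512-byte blocks on SEA_ISLANDS and later (256-byte before). The standalone sketch below reproduces those formulas with made-up example values; roundUpToAlignment is a local stand-in for llvm::RoundUpToAlignment. Note also that the function alignment of 8 set in runOnMachineFunction is a log2 value, i.e. 1 << 8 = 256 bytes, matching kernel_code_entry_byte_offset above.

```cpp
#include <cstdint>
#include <cstdio>

// Local stand-in for llvm::RoundUpToAlignment.
static uint64_t roundUpToAlignment(uint64_t Value, uint64_t Align) {
  return (Value + Align - 1) / Align * Align;
}

int main() {
  // Register blocks for COMPUTE_PGM_RSRC1: granularity of 4 VGPRs / 8 SGPRs.
  unsigned MaxVGPR = 22, MaxSGPR = 13;      // highest register indices used
  unsigned NumVGPR = MaxVGPR + 1;           // indices start at 0
  unsigned NumSGPR = MaxSGPR + 1;
  unsigned VGPRBlocks = (NumVGPR - 1) / 4;  // 22 / 4 = 5
  unsigned SGPRBlocks = (NumSGPR - 1) / 8;  // 13 / 8 = 1

  // Scratch is allocated in 256-dword (1024-byte) blocks for the whole wave.
  unsigned ScratchSize = 36;                // bytes of scratch per thread
  unsigned WavefrontSize = 64;
  unsigned ScratchAlignShift = 10;
  unsigned ScratchBlocks =
      roundUpToAlignment(ScratchSize * WavefrontSize, 1u << ScratchAlignShift)
          >> ScratchAlignShift;             // 2304 -> 3072 -> 3

  // LDS is allocated in 512-byte blocks on SEA_ISLANDS+ (256-byte before).
  unsigned LDSSize = 1300;                  // bytes of LDS per work-group
  unsigned LDSAlignShift = 9;
  unsigned LDSBlocks =
      roundUpToAlignment(LDSSize, 1u << LDSAlignShift) >> LDSAlignShift; // 3

  std::printf("VGPRBlocks=%u SGPRBlocks=%u ScratchBlocks=%u LDSBlocks=%u\n",
              VGPRBlocks, SGPRBlocks, ScratchBlocks, LDSBlocks);
  return 0;
}
```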
diff --git a/lib/Target/R600/AMDGPUAsmPrinter.h b/lib/Target/R600/AMDGPUAsmPrinter.h
index 19907cfd013e..b360ae88f1e6 100644
--- a/lib/Target/R600/AMDGPUAsmPrinter.h
+++ b/lib/Target/R600/AMDGPUAsmPrinter.h
@@ -12,8 +12,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef AMDGPU_ASMPRINTER_H
-#define AMDGPU_ASMPRINTER_H
+#ifndef LLVM_LIB_TARGET_R600_AMDGPUASMPRINTER_H
+#define LLVM_LIB_TARGET_R600_AMDGPUASMPRINTER_H
#include "llvm/CodeGen/AsmPrinter.h"
#include <vector>
@@ -24,8 +24,8 @@ class AMDGPUAsmPrinter : public AsmPrinter {
private:
struct SIProgramInfo {
SIProgramInfo() :
- NumVGPR(0),
- NumSGPR(0),
+ VGPRBlocks(0),
+ SGPRBlocks(0),
Priority(0),
FloatMode(0),
Priv(0),
@@ -33,11 +33,19 @@ private:
DebugMode(0),
IEEEMode(0),
ScratchSize(0),
+ ComputePGMRSrc1(0),
+ LDSBlocks(0),
+ ScratchBlocks(0),
+ ComputePGMRSrc2(0),
+ NumVGPR(0),
+ NumSGPR(0),
+ FlatUsed(false),
+ VCCUsed(false),
CodeLen(0) {}
// Fields set in PGM_RSRC1 pm4 packet.
- uint32_t NumVGPR;
- uint32_t NumSGPR;
+ uint32_t VGPRBlocks;
+ uint32_t SGPRBlocks;
uint32_t Priority;
uint32_t FloatMode;
uint32_t Priv;
@@ -46,7 +54,21 @@ private:
uint32_t IEEEMode;
uint32_t ScratchSize;
+ uint64_t ComputePGMRSrc1;
+
+ // Fields set in PGM_RSRC2 pm4 packet.
+ uint32_t LDSBlocks;
+ uint32_t ScratchBlocks;
+
+ uint64_t ComputePGMRSrc2;
+
+ uint32_t NumVGPR;
+ uint32_t NumSGPR;
+ uint32_t LDSSize;
+ bool FlatUsed;
+
// Bonus information for debugging.
+ bool VCCUsed;
uint64_t CodeLen;
};
@@ -59,6 +81,8 @@ private:
/// can correctly setup the GPU state.
void EmitProgramInfoR600(const MachineFunction &MF);
void EmitProgramInfoSI(const MachineFunction &MF, const SIProgramInfo &KernelInfo);
+ void EmitAmdKernelCodeT(const MachineFunction &MF,
+ const SIProgramInfo &KernelInfo) const;
public:
explicit AMDGPUAsmPrinter(TargetMachine &TM, MCStreamer &Streamer);
@@ -82,4 +106,4 @@ protected:
} // End anonymous llvm
-#endif //AMDGPU_ASMPRINTER_H
+#endif
diff --git a/lib/Target/R600/AMDGPUCallingConv.td b/lib/Target/R600/AMDGPUCallingConv.td
index 3586c8826908..6ffa7a083583 100644
--- a/lib/Target/R600/AMDGPUCallingConv.td
+++ b/lib/Target/R600/AMDGPUCallingConv.td
@@ -59,16 +59,24 @@ def CC_AMDGPU_Kernel : CallingConv<[
]>;
def CC_AMDGPU : CallingConv<[
- CCIf<"State.getTarget().getSubtarget<AMDGPUSubtarget>().getGeneration() >= "
- "AMDGPUSubtarget::SOUTHERN_ISLANDS && "
- "State.getMachineFunction().getInfo<SIMachineFunctionInfo>()->"#
- "getShaderType() == ShaderType::COMPUTE", CCDelegateTo<CC_AMDGPU_Kernel>>,
- CCIf<"State.getTarget().getSubtarget<AMDGPUSubtarget>().getGeneration() < "
- "AMDGPUSubtarget::SOUTHERN_ISLANDS && "
- "State.getMachineFunction().getInfo<R600MachineFunctionInfo>()->"
- "getShaderType() == ShaderType::COMPUTE", CCDelegateTo<CC_AMDGPU_Kernel>>,
- CCIf<"State.getTarget().getSubtarget<AMDGPUSubtarget>()"#
- ".getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS", CCDelegateTo<CC_SI>>,
- CCIf<"State.getTarget().getSubtarget<AMDGPUSubtarget>()"#
- ".getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS", CCDelegateTo<CC_R600>>
+ CCIf<"static_cast<const AMDGPUSubtarget&>"
+ "(State.getMachineFunction().getSubtarget()).getGeneration() >="
+ "AMDGPUSubtarget::SOUTHERN_ISLANDS && "
+ "State.getMachineFunction().getInfo<SIMachineFunctionInfo>()"
+ "->getShaderType() == ShaderType::COMPUTE",
+ CCDelegateTo<CC_AMDGPU_Kernel>>,
+ CCIf<"static_cast<const AMDGPUSubtarget&>"
+ "(State.getMachineFunction().getSubtarget()).getGeneration() < "
+ "AMDGPUSubtarget::SOUTHERN_ISLANDS && "
+ "State.getMachineFunction().getInfo<R600MachineFunctionInfo>()"
+ "->getShaderType() == ShaderType::COMPUTE",
+ CCDelegateTo<CC_AMDGPU_Kernel>>,
+ CCIf<"static_cast<const AMDGPUSubtarget&>"
+ "(State.getMachineFunction().getSubtarget()).getGeneration() >= "
+ "AMDGPUSubtarget::SOUTHERN_ISLANDS",
+ CCDelegateTo<CC_SI>>,
+ CCIf<"static_cast<const AMDGPUSubtarget&>"
+ "(State.getMachineFunction().getSubtarget()).getGeneration() < "
+ "AMDGPUSubtarget::SOUTHERN_ISLANDS",
+ CCDelegateTo<CC_R600>>
]>;
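Restated outside TableGen, the delegation above boils down to a two-way split: compute shaders always get the kernel convention, and everything else is split by hardware generation. The sketch below is illustrative only; the enum values and returned strings are local stand-ins for the real subtarget and calling-convention definitions.

```cpp
#include <cstdio>

// Local stand-ins mirroring the conditions in CC_AMDGPU above.
enum Generation { R600_GEN, SOUTHERN_ISLANDS, SEA_ISLANDS, VOLCANIC_ISLANDS };
enum ShaderType { PIXEL, VERTEX, GEOMETRY, COMPUTE };

static const char *pickCallingConv(Generation Gen, ShaderType Shader) {
  // Compute kernels delegate to CC_AMDGPU_Kernel on every generation.
  if (Shader == COMPUTE)
    return "CC_AMDGPU_Kernel";
  // Graphics shaders split at SOUTHERN_ISLANDS: that generation and later
  // use CC_SI, older parts use CC_R600.
  return Gen >= SOUTHERN_ISLANDS ? "CC_SI" : "CC_R600";
}

int main() {
  std::printf("%s\n", pickCallingConv(SEA_ISLANDS, COMPUTE));     // CC_AMDGPU_Kernel
  std::printf("%s\n", pickCallingConv(R600_GEN, PIXEL));          // CC_R600
  std::printf("%s\n", pickCallingConv(VOLCANIC_ISLANDS, VERTEX)); // CC_SI
  return 0;
}
```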
diff --git a/lib/Target/R600/AMDGPUFrameLowering.h b/lib/Target/R600/AMDGPUFrameLowering.h
index d18ede5004e1..15a6636a1aea 100644
--- a/lib/Target/R600/AMDGPUFrameLowering.h
+++ b/lib/Target/R600/AMDGPUFrameLowering.h
@@ -12,8 +12,8 @@
/// machine.
//
//===----------------------------------------------------------------------===//
-#ifndef AMDILFRAME_LOWERING_H
-#define AMDILFRAME_LOWERING_H
+#ifndef LLVM_LIB_TARGET_R600_AMDGPUFRAMELOWERING_H
+#define LLVM_LIB_TARGET_R600_AMDGPUFRAMELOWERING_H
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/Target/TargetFrameLowering.h"
@@ -42,4 +42,4 @@ public:
bool hasFP(const MachineFunction &MF) const override;
};
} // namespace llvm
-#endif // AMDILFRAME_LOWERING_H
+#endif
diff --git a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
index cc17b7ec6183..eaa506db96c3 100644
--- a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
+++ b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
@@ -65,6 +65,7 @@ private:
static bool checkPrivateAddress(const MachineMemOperand *Op);
static bool isGlobalStore(const StoreSDNode *N);
+ static bool isFlatStore(const StoreSDNode *N);
static bool isPrivateStore(const StoreSDNode *N);
static bool isLocalStore(const StoreSDNode *N);
static bool isRegionStore(const StoreSDNode *N);
@@ -72,30 +73,49 @@ private:
bool isCPLoad(const LoadSDNode *N) const;
bool isConstantLoad(const LoadSDNode *N, int cbID) const;
bool isGlobalLoad(const LoadSDNode *N) const;
+ bool isFlatLoad(const LoadSDNode *N) const;
bool isParamLoad(const LoadSDNode *N) const;
bool isPrivateLoad(const LoadSDNode *N) const;
bool isLocalLoad(const LoadSDNode *N) const;
bool isRegionLoad(const LoadSDNode *N) const;
- /// \returns True if the current basic block being selected is at control
- /// flow depth 0. Meaning that the current block dominates the
- // exit block.
- bool isCFDepth0() const;
-
const TargetRegisterClass *getOperandRegClass(SDNode *N, unsigned OpNo) const;
bool SelectGlobalValueConstantOffset(SDValue Addr, SDValue& IntPtr);
bool SelectGlobalValueVariableOffset(SDValue Addr, SDValue &BaseReg,
SDValue& Offset);
bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset);
bool SelectADDRIndirect(SDValue Addr, SDValue &Base, SDValue &Offset);
- bool SelectMUBUFAddr64(SDValue Addr, SDValue &Ptr, SDValue &Offset,
- SDValue &ImmOffset) const;
+ bool isDSOffsetLegal(const SDValue &Base, unsigned Offset,
+ unsigned OffsetBits) const;
+ bool SelectDS1Addr1Offset(SDValue Ptr, SDValue &Base, SDValue &Offset) const;
+ bool SelectDS64Bit4ByteAligned(SDValue Ptr, SDValue &Base, SDValue &Offset0,
+ SDValue &Offset1) const;
+ void SelectMUBUF(SDValue Addr, SDValue &SRsrc, SDValue &VAddr,
+ SDValue &SOffset, SDValue &Offset, SDValue &Offen,
+ SDValue &Idxen, SDValue &Addr64, SDValue &GLC, SDValue &SLC,
+ SDValue &TFE) const;
+ bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, SDValue &VAddr,
+ SDValue &Offset) const;
+ bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
+ SDValue &VAddr, SDValue &Offset,
+ SDValue &SLC) const;
bool SelectMUBUFScratch(SDValue Addr, SDValue &RSrc, SDValue &VAddr,
SDValue &SOffset, SDValue &ImmOffset) const;
- bool SelectMUBUFAddr32(SDValue Addr, SDValue &SRsrc, SDValue &VAddr,
- SDValue &SOffset, SDValue &Offset, SDValue &Offen,
- SDValue &Idxen, SDValue &GLC, SDValue &SLC,
+ bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &SOffset,
+ SDValue &Offset, SDValue &GLC, SDValue &SLC,
SDValue &TFE) const;
+ bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset,
+ SDValue &Offset, SDValue &GLC) const;
+ SDNode *SelectAddrSpaceCast(SDNode *N);
+ bool SelectVOP3Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
+ bool SelectVOP3Mods0(SDValue In, SDValue &Src, SDValue &SrcMods,
+ SDValue &Clamp, SDValue &Omod) const;
+
+ bool SelectVOP3Mods0Clamp(SDValue In, SDValue &Src, SDValue &SrcMods,
+ SDValue &Omod) const;
+ bool SelectVOP3Mods0Clamp0OMod(SDValue In, SDValue &Src, SDValue &SrcMods,
+ SDValue &Clamp,
+ SDValue &Omod) const;
SDNode *SelectADD_SUB_I64(SDNode *N);
SDNode *SelectDIV_SCALE(SDNode *N);
@@ -135,7 +155,8 @@ const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
switch (N->getMachineOpcode()) {
default: {
- const MCInstrDesc &Desc = TM.getInstrInfo()->get(N->getMachineOpcode());
+ const MCInstrDesc &Desc =
+ TM.getSubtargetImpl()->getInstrInfo()->get(N->getMachineOpcode());
unsigned OpIdx = Desc.getNumDefs() + OpNo;
if (OpIdx >= Desc.getNumOperands())
return nullptr;
@@ -143,15 +164,17 @@ const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
if (RegClass == -1)
return nullptr;
- return TM.getRegisterInfo()->getRegClass(RegClass);
+ return TM.getSubtargetImpl()->getRegisterInfo()->getRegClass(RegClass);
}
case AMDGPU::REG_SEQUENCE: {
unsigned RCID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
- const TargetRegisterClass *SuperRC = TM.getRegisterInfo()->getRegClass(RCID);
+ const TargetRegisterClass *SuperRC =
+ TM.getSubtargetImpl()->getRegisterInfo()->getRegClass(RCID);
SDValue SubRegOp = N->getOperand(OpNo + 1);
unsigned SubRegIdx = cast<ConstantSDNode>(SubRegOp)->getZExtValue();
- return TM.getRegisterInfo()->getSubClassWithSubReg(SuperRC, SubRegIdx);
+ return TM.getSubtargetImpl()->getRegisterInfo()->getSubClassWithSubReg(
+ SuperRC, SubRegIdx);
}
}
}
@@ -239,10 +262,10 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
case AMDGPUISD::BUILD_VERTICAL_VECTOR:
case ISD::BUILD_VECTOR: {
unsigned RegClassID;
- const AMDGPURegisterInfo *TRI =
- static_cast<const AMDGPURegisterInfo*>(TM.getRegisterInfo());
- const SIRegisterInfo *SIRI =
- static_cast<const SIRegisterInfo*>(TM.getRegisterInfo());
+ const AMDGPURegisterInfo *TRI = static_cast<const AMDGPURegisterInfo *>(
+ TM.getSubtargetImpl()->getRegisterInfo());
+ const SIRegisterInfo *SIRI = static_cast<const SIRegisterInfo *>(
+ TM.getSubtargetImpl()->getRegisterInfo());
EVT VT = N->getValueType(0);
unsigned NumVectorElts = VT.getVectorNumElements();
EVT EltVT = VT.getVectorElementType();
@@ -263,7 +286,7 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
}
}
switch(NumVectorElts) {
- case 1: RegClassID = UseVReg ? AMDGPU::VReg_32RegClassID :
+ case 1: RegClassID = UseVReg ? AMDGPU::VGPR_32RegClassID :
AMDGPU::SReg_32RegClassID;
break;
case 2: RegClassID = UseVReg ? AMDGPU::VReg_64RegClassID :
@@ -470,7 +493,16 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
case AMDGPUISD::DIV_SCALE: {
return SelectDIV_SCALE(N);
}
+ case ISD::CopyToReg: {
+ const SITargetLowering& Lowering =
+ *static_cast<const SITargetLowering*>(getTargetLowering());
+ Lowering.legalizeTargetIndependentNode(N, *CurDAG);
+ break;
+ }
+ case ISD::ADDRSPACECAST:
+ return SelectAddrSpaceCast(N);
}
+
return SelectCode(N);
}
@@ -508,6 +540,10 @@ bool AMDGPUDAGToDAGISel::isLocalStore(const StoreSDNode *N) {
return checkType(N->getMemOperand()->getValue(), AMDGPUAS::LOCAL_ADDRESS);
}
+bool AMDGPUDAGToDAGISel::isFlatStore(const StoreSDNode *N) {
+ return checkType(N->getMemOperand()->getValue(), AMDGPUAS::FLAT_ADDRESS);
+}
+
bool AMDGPUDAGToDAGISel::isRegionStore(const StoreSDNode *N) {
return checkType(N->getMemOperand()->getValue(), AMDGPUAS::REGION_ADDRESS);
}
@@ -539,6 +575,10 @@ bool AMDGPUDAGToDAGISel::isLocalLoad(const LoadSDNode *N) const {
return checkType(N->getMemOperand()->getValue(), AMDGPUAS::LOCAL_ADDRESS);
}
+bool AMDGPUDAGToDAGISel::isFlatLoad(const LoadSDNode *N) const {
+ return checkType(N->getMemOperand()->getValue(), AMDGPUAS::FLAT_ADDRESS);
+}
+
bool AMDGPUDAGToDAGISel::isRegionLoad(const LoadSDNode *N) const {
return checkType(N->getMemOperand()->getValue(), AMDGPUAS::REGION_ADDRESS);
}
@@ -568,23 +608,16 @@ bool AMDGPUDAGToDAGISel::isPrivateLoad(const LoadSDNode *N) const {
const Value *MemVal = N->getMemOperand()->getValue();
if (!checkType(MemVal, AMDGPUAS::LOCAL_ADDRESS) &&
!checkType(MemVal, AMDGPUAS::GLOBAL_ADDRESS) &&
+ !checkType(MemVal, AMDGPUAS::FLAT_ADDRESS) &&
!checkType(MemVal, AMDGPUAS::REGION_ADDRESS) &&
!checkType(MemVal, AMDGPUAS::CONSTANT_ADDRESS) &&
!checkType(MemVal, AMDGPUAS::PARAM_D_ADDRESS) &&
- !checkType(MemVal, AMDGPUAS::PARAM_I_ADDRESS)){
+ !checkType(MemVal, AMDGPUAS::PARAM_I_ADDRESS)) {
return true;
}
return false;
}
-bool AMDGPUDAGToDAGISel::isCFDepth0() const {
- // FIXME: Figure out a way to use DominatorTree analysis here.
- const BasicBlock *CurBlock = FuncInfo->MBB->getBasicBlock();
- const Function *Fn = FuncInfo->Fn;
- return &Fn->front() == CurBlock || &Fn->back() == CurBlock;
-}
-
-
const char *AMDGPUDAGToDAGISel::getPassName() const {
return "AMDGPU DAG->DAG Pattern Instruction Selection";
}
@@ -687,14 +720,9 @@ SDNode *AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) {
SDValue AddLoArgs[] = { SDValue(Lo0, 0), SDValue(Lo1, 0) };
- unsigned Opc = IsAdd ? AMDGPU::S_ADD_I32 : AMDGPU::S_SUB_I32;
+ unsigned Opc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
- if (!isCFDepth0()) {
- Opc = IsAdd ? AMDGPU::V_ADD_I32_e32 : AMDGPU::V_SUB_I32_e32;
- CarryOpc = IsAdd ? AMDGPU::V_ADDC_U32_e32 : AMDGPU::V_SUBB_U32_e32;
- }
-
SDNode *AddLo = CurDAG->getMachineNode( Opc, DL, VTList, AddLoArgs);
SDValue Carry(AddLo, 1);
SDNode *AddHi
@@ -721,34 +749,134 @@ SDNode *AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
= (VT == MVT::f64) ? AMDGPU::V_DIV_SCALE_F64 : AMDGPU::V_DIV_SCALE_F32;
const SDValue Zero = CurDAG->getTargetConstant(0, MVT::i32);
-
+ const SDValue False = CurDAG->getTargetConstant(0, MVT::i1);
SDValue Ops[] = {
- N->getOperand(0),
- N->getOperand(1),
- N->getOperand(2),
- Zero,
- Zero,
- Zero,
- Zero
+ Zero, // src0_modifiers
+ N->getOperand(0), // src0
+ Zero, // src1_modifiers
+ N->getOperand(1), // src1
+ Zero, // src2_modifiers
+ N->getOperand(2), // src2
+ False, // clamp
+ Zero // omod
};
return CurDAG->SelectNodeTo(N, Opc, VT, MVT::i1, Ops);
}
-static SDValue wrapAddr64Rsrc(SelectionDAG *DAG, SDLoc DL, SDValue Ptr) {
- return SDValue(DAG->getMachineNode(AMDGPU::SI_ADDR64_RSRC, DL, MVT::v4i32,
- Ptr), 0);
+bool AMDGPUDAGToDAGISel::isDSOffsetLegal(const SDValue &Base, unsigned Offset,
+ unsigned OffsetBits) const {
+ const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
+ if ((OffsetBits == 16 && !isUInt<16>(Offset)) ||
+ (OffsetBits == 8 && !isUInt<8>(Offset)))
+ return false;
+
+ if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS)
+ return true;
+
+ // On Southern Islands, instructions with a negative base value and an offset
+ // don't seem to work.
+ return CurDAG->SignBitIsZero(Base);
+}
+
+bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
+ SDValue &Offset) const {
+ if (CurDAG->isBaseWithConstantOffset(Addr)) {
+ SDValue N0 = Addr.getOperand(0);
+ SDValue N1 = Addr.getOperand(1);
+ ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
+ if (isDSOffsetLegal(N0, C1->getSExtValue(), 16)) {
+ // (add n0, c0)
+ Base = N0;
+ Offset = N1;
+ return true;
+ }
+ }
+
+ // If we have a constant address, prefer to put the constant into the
+ // offset. This can save moves to load the constant address since multiple
+ // operations can share the zero base address register, and enables merging
+ // into read2 / write2 instructions.
+ if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
+ if (isUInt<16>(CAddr->getZExtValue())) {
+ SDValue Zero = CurDAG->getTargetConstant(0, MVT::i32);
+ MachineSDNode *MovZero = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
+ SDLoc(Addr), MVT::i32, Zero);
+ Base = SDValue(MovZero, 0);
+ Offset = Addr;
+ return true;
+ }
+ }
+
+ // default case
+ Base = Addr;
+ Offset = CurDAG->getTargetConstant(0, MVT::i16);
+ return true;
+}
+
+bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base,
+ SDValue &Offset0,
+ SDValue &Offset1) const {
+ if (CurDAG->isBaseWithConstantOffset(Addr)) {
+ SDValue N0 = Addr.getOperand(0);
+ SDValue N1 = Addr.getOperand(1);
+ ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
+ unsigned DWordOffset0 = C1->getZExtValue() / 4;
+ unsigned DWordOffset1 = DWordOffset0 + 1;
+ // (add n0, c0)
+ if (isDSOffsetLegal(N0, DWordOffset1, 8)) {
+ Base = N0;
+ Offset0 = CurDAG->getTargetConstant(DWordOffset0, MVT::i8);
+ Offset1 = CurDAG->getTargetConstant(DWordOffset1, MVT::i8);
+ return true;
+ }
+ }
+
+ if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
+ unsigned DWordOffset0 = CAddr->getZExtValue() / 4;
+ unsigned DWordOffset1 = DWordOffset0 + 1;
+ assert(4 * DWordOffset0 == CAddr->getZExtValue());
+
+ if (isUInt<8>(DWordOffset0) && isUInt<8>(DWordOffset1)) {
+ SDValue Zero = CurDAG->getTargetConstant(0, MVT::i32);
+ MachineSDNode *MovZero
+ = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
+ SDLoc(Addr), MVT::i32, Zero);
+ Base = SDValue(MovZero, 0);
+ Offset0 = CurDAG->getTargetConstant(DWordOffset0, MVT::i8);
+ Offset1 = CurDAG->getTargetConstant(DWordOffset1, MVT::i8);
+ return true;
+ }
+ }
+
+ // default case
+ Base = Addr;
+ Offset0 = CurDAG->getTargetConstant(0, MVT::i8);
+ Offset1 = CurDAG->getTargetConstant(1, MVT::i8);
+ return true;
}
static bool isLegalMUBUFImmOffset(const ConstantSDNode *Imm) {
return isUInt<12>(Imm->getZExtValue());
}
-bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &Ptr,
- SDValue &Offset,
- SDValue &ImmOffset) const {
+void AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr,
+ SDValue &VAddr, SDValue &SOffset,
+ SDValue &Offset, SDValue &Offen,
+ SDValue &Idxen, SDValue &Addr64,
+ SDValue &GLC, SDValue &SLC,
+ SDValue &TFE) const {
SDLoc DL(Addr);
+ GLC = CurDAG->getTargetConstant(0, MVT::i1);
+ SLC = CurDAG->getTargetConstant(0, MVT::i1);
+ TFE = CurDAG->getTargetConstant(0, MVT::i1);
+
+ Idxen = CurDAG->getTargetConstant(0, MVT::i1);
+ Offen = CurDAG->getTargetConstant(0, MVT::i1);
+ Addr64 = CurDAG->getTargetConstant(0, MVT::i1);
+ SOffset = CurDAG->getTargetConstant(0, MVT::i32);
+
if (CurDAG->isBaseWithConstantOffset(Addr)) {
SDValue N0 = Addr.getOperand(0);
SDValue N1 = Addr.getOperand(1);
@@ -757,57 +885,69 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &Ptr,
if (isLegalMUBUFImmOffset(C1)) {
if (N0.getOpcode() == ISD::ADD) {
- // (add (add N2, N3), C1)
+ // (add (add N2, N3), C1) -> addr64
SDValue N2 = N0.getOperand(0);
SDValue N3 = N0.getOperand(1);
- Ptr = wrapAddr64Rsrc(CurDAG, DL, N2);
- Offset = N3;
- ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), MVT::i16);
- return true;
+ Addr64 = CurDAG->getTargetConstant(1, MVT::i1);
+ Ptr = N2;
+ VAddr = N3;
+ Offset = CurDAG->getTargetConstant(C1->getZExtValue(), MVT::i16);
+ return;
}
- // (add N0, C1)
- Ptr = wrapAddr64Rsrc(CurDAG, DL, CurDAG->getTargetConstant(0, MVT::i64));;
- Offset = N0;
- ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), MVT::i16);
- return true;
+ // (add N0, C1) -> offset
+ VAddr = CurDAG->getTargetConstant(0, MVT::i32);
+ Ptr = N0;
+ Offset = CurDAG->getTargetConstant(C1->getZExtValue(), MVT::i16);
+ return;
}
}
if (Addr.getOpcode() == ISD::ADD) {
- // (add N0, N1)
+ // (add N0, N1) -> addr64
SDValue N0 = Addr.getOperand(0);
SDValue N1 = Addr.getOperand(1);
- Ptr = wrapAddr64Rsrc(CurDAG, DL, N0);
- Offset = N1;
- ImmOffset = CurDAG->getTargetConstant(0, MVT::i16);
- return true;
+ Addr64 = CurDAG->getTargetConstant(1, MVT::i1);
+ Ptr = N0;
+ VAddr = N1;
+ Offset = CurDAG->getTargetConstant(0, MVT::i16);
+ return;
}
- // default case
- Ptr = wrapAddr64Rsrc(CurDAG, DL, CurDAG->getConstant(0, MVT::i64));
- Offset = Addr;
- ImmOffset = CurDAG->getTargetConstant(0, MVT::i16);
- return true;
+ // default case -> offset
+ VAddr = CurDAG->getTargetConstant(0, MVT::i32);
+ Ptr = Addr;
+ Offset = CurDAG->getTargetConstant(0, MVT::i16);
+
}
-/// \brief Return a resource descriptor with the 'Add TID' bit enabled
-/// The TID (Thread ID) is multipled by the stride value (bits [61:48]
-/// of the resource descriptor) to create an offset, which is added to the
-/// resource ponter.
-static SDValue buildScratchRSRC(SelectionDAG *DAG, SDLoc DL, SDValue Ptr) {
+bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
+ SDValue &VAddr,
+ SDValue &Offset) const {
+ SDValue Ptr, SOffset, Offen, Idxen, Addr64, GLC, SLC, TFE;
+
+ SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64,
+ GLC, SLC, TFE);
+
+ ConstantSDNode *C = cast<ConstantSDNode>(Addr64);
+ if (C->getSExtValue()) {
+ SDLoc DL(Addr);
+
+ const SITargetLowering& Lowering =
+ *static_cast<const SITargetLowering*>(getTargetLowering());
+
+ SRsrc = SDValue(Lowering.wrapAddr64Rsrc(*CurDAG, DL, Ptr), 0);
+ return true;
+ }
- uint64_t Rsrc = AMDGPU::RSRC_DATA_FORMAT | AMDGPU::RSRC_TID_ENABLE |
- 0xffffffff;
+ return false;
+}
- SDValue PtrLo = DAG->getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
- SDValue PtrHi = DAG->getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
- SDValue DataLo = DAG->getTargetConstant(
- Rsrc & APInt::getAllOnesValue(32).getZExtValue(), MVT::i32);
- SDValue DataHi = DAG->getTargetConstant(Rsrc >> 32, MVT::i32);
+bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
+ SDValue &VAddr, SDValue &Offset,
+ SDValue &SLC) const {
+ SLC = CurDAG->getTargetConstant(0, MVT::i1);
- const SDValue Ops[] = { PtrLo, PtrHi, DataLo, DataHi };
- return SDValue(DAG->getMachineNode(AMDGPU::SI_BUFFER_RSRC, DL,
- MVT::v4i32, Ops), 0);
+ return SelectMUBUFAddr64(Addr, SRsrc, VAddr, Offset);
}
bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc,
@@ -816,16 +956,23 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc,
SDLoc DL(Addr);
MachineFunction &MF = CurDAG->getMachineFunction();
- const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo*>(MF.getTarget().getRegisterInfo());
+ const SIRegisterInfo *TRI =
+ static_cast<const SIRegisterInfo *>(MF.getSubtarget().getRegisterInfo());
MachineRegisterInfo &MRI = MF.getRegInfo();
-
+ const SITargetLowering& Lowering =
+ *static_cast<const SITargetLowering*>(getTargetLowering());
unsigned ScratchPtrReg =
TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_PTR);
unsigned ScratchOffsetReg =
TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_WAVE_OFFSET);
+ Lowering.CreateLiveInRegister(*CurDAG, &AMDGPU::SReg_32RegClass,
+ ScratchOffsetReg, MVT::i32);
- Rsrc = buildScratchRSRC(CurDAG, DL, CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL, MRI.getLiveInVirtReg(ScratchPtrReg), MVT::i64));
+ SDValue ScratchPtr =
+ CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL,
+ MRI.getLiveInVirtReg(ScratchPtrReg), MVT::i64);
+ Rsrc = SDValue(Lowering.buildScratchRSRC(*CurDAG, DL, ScratchPtr), 0);
SOffset = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL,
MRI.getLiveInVirtReg(ScratchOffsetReg), MVT::i32);
@@ -863,20 +1010,150 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc,
return true;
}
-bool AMDGPUDAGToDAGISel::SelectMUBUFAddr32(SDValue Addr, SDValue &SRsrc,
- SDValue &VAddr, SDValue &SOffset,
- SDValue &Offset, SDValue &Offen,
- SDValue &Idxen, SDValue &GLC,
- SDValue &SLC, SDValue &TFE) const {
+bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
+ SDValue &SOffset, SDValue &Offset,
+ SDValue &GLC, SDValue &SLC,
+ SDValue &TFE) const {
+ SDValue Ptr, VAddr, Offen, Idxen, Addr64;
+ const SIInstrInfo *TII =
+ static_cast<const SIInstrInfo *>(Subtarget.getInstrInfo());
- GLC = CurDAG->getTargetConstant(0, MVT::i1);
- SLC = CurDAG->getTargetConstant(0, MVT::i1);
- TFE = CurDAG->getTargetConstant(0, MVT::i1);
+ SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64,
+ GLC, SLC, TFE);
- Idxen = CurDAG->getTargetConstant(0, MVT::i1);
- Offen = CurDAG->getTargetConstant(1, MVT::i1);
+ if (!cast<ConstantSDNode>(Offen)->getSExtValue() &&
+ !cast<ConstantSDNode>(Idxen)->getSExtValue() &&
+ !cast<ConstantSDNode>(Addr64)->getSExtValue()) {
+ uint64_t Rsrc = TII->getDefaultRsrcDataFormat() |
+ APInt::getAllOnesValue(32).getZExtValue(); // Size
+ SDLoc DL(Addr);
+
+ const SITargetLowering& Lowering =
+ *static_cast<const SITargetLowering*>(getTargetLowering());
+
+ SRsrc = SDValue(Lowering.buildRSRC(*CurDAG, DL, Ptr, 0, Rsrc), 0);
+ return true;
+ }
+ return false;
+}
+
+bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
+ SDValue &Soffset, SDValue &Offset,
+ SDValue &GLC) const {
+ SDValue SLC, TFE;
+
+ return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE);
+}
+
+// FIXME: This is incorrect and only enough to be able to compile.
+SDNode *AMDGPUDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
+ AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(N);
+ SDLoc DL(N);
+
+ assert(Subtarget.hasFlatAddressSpace() &&
+ "addrspacecast only supported with flat address space!");
+
+ assert((ASC->getSrcAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS &&
+ ASC->getDestAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS) &&
+ "Cannot cast address space to / from constant address!");
+
+ assert((ASC->getSrcAddressSpace() == AMDGPUAS::FLAT_ADDRESS ||
+ ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) &&
+ "Can only cast to / from flat address space!");
+
+ // The flat instructions read the address as the index of the VGPR holding the
+ // address, so casting should just be reinterpreting the base VGPR, so just
+ // insert trunc / bitcast / zext.
+
+ SDValue Src = ASC->getOperand(0);
+ EVT DestVT = ASC->getValueType(0);
+ EVT SrcVT = Src.getValueType();
+
+ unsigned SrcSize = SrcVT.getSizeInBits();
+ unsigned DestSize = DestVT.getSizeInBits();
+
+ if (SrcSize > DestSize) {
+ assert(SrcSize == 64 && DestSize == 32);
+ return CurDAG->getMachineNode(
+ TargetOpcode::EXTRACT_SUBREG,
+ DL,
+ DestVT,
+ Src,
+ CurDAG->getTargetConstant(AMDGPU::sub0, MVT::i32));
+ }
+
+
+ if (DestSize > SrcSize) {
+ assert(SrcSize == 32 && DestSize == 64);
+
+ // FIXME: This is probably wrong, we should never be defining
+ // a register class with both VGPRs and SGPRs
+ SDValue RC = CurDAG->getTargetConstant(AMDGPU::VS_64RegClassID, MVT::i32);
+
+ const SDValue Ops[] = {
+ RC,
+ Src,
+ CurDAG->getTargetConstant(AMDGPU::sub0, MVT::i32),
+ SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SDLoc(N), MVT::i32,
+ CurDAG->getConstant(0, MVT::i32)), 0),
+ CurDAG->getTargetConstant(AMDGPU::sub1, MVT::i32)
+ };
+
+ return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE,
+ SDLoc(N), N->getValueType(0), Ops);
+ }
+
+ assert(SrcSize == 64 && DestSize == 64);
+ return CurDAG->getNode(ISD::BITCAST, DL, DestVT, Src).getNode();
+}
+
+bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src,
+ SDValue &SrcMods) const {
+
+ unsigned Mods = 0;
+
+ Src = In;
+
+ if (Src.getOpcode() == ISD::FNEG) {
+ Mods |= SISrcMods::NEG;
+ Src = Src.getOperand(0);
+ }
+
+ if (Src.getOpcode() == ISD::FABS) {
+ Mods |= SISrcMods::ABS;
+ Src = Src.getOperand(0);
+ }
+
+ SrcMods = CurDAG->getTargetConstant(Mods, MVT::i32);
+
+ return true;
+}
+
+bool AMDGPUDAGToDAGISel::SelectVOP3Mods0(SDValue In, SDValue &Src,
+ SDValue &SrcMods, SDValue &Clamp,
+ SDValue &Omod) const {
+ // FIXME: Handle Clamp and Omod
+ Clamp = CurDAG->getTargetConstant(0, MVT::i32);
+ Omod = CurDAG->getTargetConstant(0, MVT::i32);
+
+ return SelectVOP3Mods(In, Src, SrcMods);
+}
+
+bool AMDGPUDAGToDAGISel::SelectVOP3Mods0Clamp(SDValue In, SDValue &Src,
+ SDValue &SrcMods,
+ SDValue &Omod) const {
+ // FIXME: Handle Omod
+ Omod = CurDAG->getTargetConstant(0, MVT::i32);
+
+ return SelectVOP3Mods(In, Src, SrcMods);
+}
- return SelectMUBUFScratch(Addr, SRsrc, VAddr, SOffset, Offset);
+bool AMDGPUDAGToDAGISel::SelectVOP3Mods0Clamp0OMod(SDValue In, SDValue &Src,
+ SDValue &SrcMods,
+ SDValue &Clamp,
+ SDValue &Omod) const {
+ Clamp = Omod = CurDAG->getTargetConstant(0, MVT::i32);
+ return SelectVOP3Mods(In, Src, SrcMods);
}
void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
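The offset legality rules used by the new DS and MUBUF selection code above come down to a few bit-width checks: a single DS access takes a 16-bit unsigned byte offset, a read2/write2 pair takes two 8-bit dword offsets (byte offset / 4, and that value plus one), and the MUBUF immediate offset field is 12 bits. The standalone sketch below works through example numbers; isUIntN is a local stand-in for llvm::isUInt<N>.

```cpp
#include <cstdint>
#include <cstdio>

// Local stand-in for llvm::isUInt<N>(Value).
static bool isUIntN(unsigned Bits, uint64_t Value) {
  return Bits >= 64 || Value < (UINT64_C(1) << Bits);
}

int main() {
  // DS read2/write2 pair: a 4-byte-aligned byte offset is encoded as two
  // 8-bit dword offsets (offset/4 and offset/4 + 1).
  unsigned ByteOffset = 1016;
  unsigned DWordOffset0 = ByteOffset / 4;    // 254
  unsigned DWordOffset1 = DWordOffset0 + 1;  // 255
  bool PairLegal = isUIntN(8, DWordOffset0) && isUIntN(8, DWordOffset1);
  std::printf("read2 offsets %u/%u legal: %d\n",
              DWordOffset0, DWordOffset1, PairLegal);             // legal: 1

  // At byte offset 1024 the dword offsets are 256 and 257, which no longer
  // fit in 8 bits, so the pair form cannot be used.
  std::printf("read2 at 1024 legal: %d\n",
              isUIntN(8, 1024 / 4) && isUIntN(8, 1024 / 4 + 1));  // 0

  // Single DS access: 16-bit unsigned byte offset.
  std::printf("single DS offset %u legal: %d\n",
              ByteOffset, isUIntN(16, ByteOffset));               // 1

  // MUBUF immediate offset: 12 bits unsigned (0..4095).
  std::printf("MUBUF imm 4095 legal: %d, 4096 legal: %d\n",
              isUIntN(12, 4095), isUIntN(12, 4096));              // 1, 0
  return 0;
}
```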
diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp
index 5a46297b6032..206050d54a02 100644
--- a/lib/Target/R600/AMDGPUISelLowering.cpp
+++ b/lib/Target/R600/AMDGPUISelLowering.cpp
@@ -103,7 +103,7 @@ EVT AMDGPUTargetLowering::getEquivalentLoadRegType(LLVMContext &Ctx, EVT VT) {
}
AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
- TargetLowering(TM, new TargetLoweringObjectFileELF()) {
+ TargetLowering(TM) {
Subtarget = &TM.getSubtarget<AMDGPUSubtarget>();
@@ -130,6 +130,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
setOperationAction(ISD::FROUND, MVT::f32, Legal);
setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
+ setOperationAction(ISD::FREM, MVT::f32, Custom);
+ setOperationAction(ISD::FREM, MVT::f64, Custom);
+
// Lower floating point store/load to integer store/load to reduce the number
// of patterns in tablegen.
setOperationAction(ISD::STORE, MVT::f32, Promote);
@@ -213,18 +216,28 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i32, Custom);
- setLoadExtAction(ISD::EXTLOAD, MVT::v2i8, Expand);
- setLoadExtAction(ISD::SEXTLOAD, MVT::v2i8, Expand);
- setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i8, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v4i8, Expand);
- setLoadExtAction(ISD::SEXTLOAD, MVT::v4i8, Expand);
- setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i8, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v2i16, Expand);
- setLoadExtAction(ISD::SEXTLOAD, MVT::v2i16, Expand);
- setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, Expand);
- setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, Expand);
- setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i16, Expand);
+ // There are no 64-bit extloads. These should be done as a 32-bit extload and
+ // an extension to 64-bit.
+ for (MVT VT : MVT::integer_valuetypes()) {
+ setLoadExtAction(ISD::EXTLOAD, MVT::i64, VT, Expand);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::i64, VT, Expand);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::i64, VT, Expand);
+ }
+
+ for (MVT VT : MVT::integer_vector_valuetypes()) {
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Expand);
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Expand);
+ setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i8, Expand);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Expand);
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Expand);
+ setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i8, Expand);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Expand);
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Expand);
+ setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Expand);
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Expand);
+ setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i16, Expand);
+ }
setOperationAction(ISD::BR_CC, MVT::i1, Expand);
@@ -243,7 +256,8 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::f16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
setTruncStoreAction(MVT::f32, MVT::f16, Expand);
setTruncStoreAction(MVT::f64, MVT::f16, Expand);
@@ -282,6 +296,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
setOperationAction(ISD::UDIV, MVT::i32, Expand);
setOperationAction(ISD::UREM, MVT::i32, Expand);
setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
+ setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
+ setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
+ setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
if (!Subtarget->hasFFBH())
@@ -310,7 +327,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
setOperationAction(ISD::SUB, VT, Expand);
setOperationAction(ISD::SINT_TO_FP, VT, Expand);
setOperationAction(ISD::UINT_TO_FP, VT, Expand);
- // TODO: Implement custom UREM / SREM routines.
setOperationAction(ISD::SDIV, VT, Expand);
setOperationAction(ISD::UDIV, VT, Expand);
setOperationAction(ISD::SREM, VT, Expand);
@@ -342,12 +358,15 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
for (MVT VT : FloatVectorTypes) {
setOperationAction(ISD::FABS, VT, Expand);
+ setOperationAction(ISD::FMINNUM, VT, Expand);
+ setOperationAction(ISD::FMAXNUM, VT, Expand);
setOperationAction(ISD::FADD, VT, Expand);
setOperationAction(ISD::FCEIL, VT, Expand);
setOperationAction(ISD::FCOS, VT, Expand);
setOperationAction(ISD::FDIV, VT, Expand);
setOperationAction(ISD::FEXP2, VT, Expand);
setOperationAction(ISD::FLOG2, VT, Expand);
+ setOperationAction(ISD::FREM, VT, Expand);
setOperationAction(ISD::FPOW, VT, Expand);
setOperationAction(ISD::FFLOOR, VT, Expand);
setOperationAction(ISD::FTRUNC, VT, Expand);
@@ -370,22 +389,29 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
setOperationAction(ISD::FNEARBYINT, MVT::f64, Custom);
setTargetDAGCombine(ISD::MUL);
+ setTargetDAGCombine(ISD::SELECT);
setTargetDAGCombine(ISD::SELECT_CC);
setTargetDAGCombine(ISD::STORE);
+ setBooleanContents(ZeroOrNegativeOneBooleanContent);
+ setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
+
setSchedulingPreference(Sched::RegPressure);
setJumpIsExpensive(true);
+ // SI at least has hardware support for floating point exceptions, but no way
+ // of using or handling them is implemented. They are also optional in OpenCL
+ // (Section 7.3)
+ setHasFloatingPointExceptions(false);
+
setSelectIsExpensive(false);
PredictableSelectIsExpensive = false;
// There are no integer divide instructions, and these expand to a pretty
// large sequence of instructions.
setIntDivIsCheap(false);
- setPow2DivIsCheap(false);
-
- // TODO: Investigate this when 64-bit divides are implemented.
- addBypassSlowDiv(64, 32);
+ setPow2SDivIsCheap(false);
+ setFsqrtIsCheap(true);
// FIXME: Need to really handle these.
MaxStoresPerMemcpy = 4096;
@@ -418,6 +444,29 @@ bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const {
return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
}
+bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N,
+ ISD::LoadExtType,
+ EVT NewVT) const {
+
+ unsigned NewSize = NewVT.getStoreSizeInBits();
+
+ // If we are reducing to a 32-bit load, this is always better.
+ if (NewSize == 32)
+ return true;
+
+ EVT OldVT = N->getValueType(0);
+ unsigned OldSize = OldVT.getStoreSizeInBits();
+
+ // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
+ // extloads, so doing one requires using a buffer_load. In cases where we
+ // still couldn't use a scalar load, using the wider load shouldn't really
+ // hurt anything.
+
+ // If the old size already had to be an extload, there's no harm in continuing
+ // to reduce the width.
+ return (OldSize < 32);
+}
+
bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy,
EVT CastTy) const {
if (LoadTy.getSizeInBits() != CastTy.getSizeInBits())
@@ -431,18 +480,30 @@ bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy,
(LScalarSize < 32));
}
+// SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
+// profitable with the expansion for 64-bit since it's generally good to
+// speculate things.
+// FIXME: These should really have the size as a parameter.
+bool AMDGPUTargetLowering::isCheapToSpeculateCttz() const {
+ return true;
+}
+
+bool AMDGPUTargetLowering::isCheapToSpeculateCtlz() const {
+ return true;
+}
+
//===---------------------------------------------------------------------===//
// Target Properties
//===---------------------------------------------------------------------===//
bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const {
assert(VT.isFloatingPoint());
- return VT == MVT::f32;
+ return VT == MVT::f32 || VT == MVT::f64;
}
bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
assert(VT.isFloatingPoint());
- return VT == MVT::f32;
+ return VT == MVT::f32 || VT == MVT::f64;
}
bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const {
@@ -542,16 +603,18 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
case ISD::FrameIndex: return LowerFrameIndex(Op, DAG);
case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
- case ISD::SDIV: return LowerSDIV(Op, DAG);
- case ISD::SREM: return LowerSREM(Op, DAG);
case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
case ISD::SDIVREM: return LowerSDIVREM(Op, DAG);
+ case ISD::FREM: return LowerFREM(Op, DAG);
case ISD::FCEIL: return LowerFCEIL(Op, DAG);
case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
case ISD::FRINT: return LowerFRINT(Op, DAG);
case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
+ case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
+ case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG);
+ case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG);
}
return Op;
}
@@ -606,7 +669,7 @@ SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init,
const SDValue &InitPtr,
SDValue Chain,
SelectionDAG &DAG) const {
- const DataLayout *TD = getTargetMachine().getDataLayout();
+ const DataLayout *TD = getTargetMachine().getSubtargetImpl()->getDataLayout();
SDLoc DL(InitPtr);
Type *InitTy = Init->getType();
@@ -679,22 +742,35 @@ SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init,
llvm_unreachable("Unhandled constant initializer");
}
+static bool hasDefinedInitializer(const GlobalValue *GV) {
+ const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV);
+ if (!GVar || !GVar->hasInitializer())
+ return false;
+
+ if (isa<UndefValue>(GVar->getInitializer()))
+ return false;
+
+ return true;
+}
+
SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
SDValue Op,
SelectionDAG &DAG) const {
- const DataLayout *TD = getTargetMachine().getDataLayout();
+ const DataLayout *TD = getTargetMachine().getSubtargetImpl()->getDataLayout();
GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
const GlobalValue *GV = G->getGlobal();
switch (G->getAddressSpace()) {
- default: llvm_unreachable("Global Address lowering not implemented for this "
- "address space");
case AMDGPUAS::LOCAL_ADDRESS: {
// XXX: What does the value of G->getOffset() mean?
assert(G->getOffset() == 0 &&
"Do not know what to do with an non-zero offset");
+ // TODO: We could emit code to handle the initialization somewhere.
+ if (hasDefinedInitializer(GV))
+ break;
+
unsigned Offset;
if (MFI->LocalMemoryObjects.count(GV) == 0) {
uint64_t Size = TD->getTypeAllocSize(GV->getType()->getElementType());
@@ -706,7 +782,7 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
Offset = MFI->LocalMemoryObjects[GV];
}
- return DAG.getConstant(Offset, getPointerTy(G->getAddressSpace()));
+ return DAG.getConstant(Offset, getPointerTy(AMDGPUAS::LOCAL_ADDRESS));
}
case AMDGPUAS::CONSTANT_ADDRESS: {
MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo();
@@ -748,6 +824,12 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
return DAG.getZExtOrTrunc(InitPtr, SDLoc(Op), ConstPtrVT);
}
}
+
+ const Function &Fn = *DAG.getMachineFunction().getFunction();
+ DiagnosticInfoUnsupported BadInit(Fn,
+ "initializer for address space");
+ DAG.getContext()->diagnose(BadInit);
+ return SDValue();
}
SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
@@ -778,8 +860,8 @@ SDValue AMDGPUTargetLowering::LowerFrameIndex(SDValue Op,
SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
- const AMDGPUFrameLowering *TFL =
- static_cast<const AMDGPUFrameLowering*>(getTargetMachine().getFrameLowering());
+ const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering *>(
+ getTargetMachine().getSubtargetImpl()->getFrameLowering());
FrameIndexSDNode *FIN = cast<FrameIndexSDNode>(Op);
@@ -821,13 +903,21 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
// first parameter must be the same as the first instruction.
SDValue Numerator = Op.getOperand(1);
SDValue Denominator = Op.getOperand(2);
+
+ // Note this order is the opposite of the machine instruction's operand order,
+ // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
+ // intrinsic has the numerator as the first operand to match a normal
+ // division operation.
+
SDValue Src0 = Param->isAllOnesValue() ? Numerator : Denominator;
- return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, VT,
- Src0, Denominator, Numerator);
+ return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
+ Denominator, Numerator);
}
case Intrinsic::AMDGPU_div_fmas:
+ // FIXME: Dropping bool parameter. Work is needed to support the implicit
+ // read from VCC.
return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT,
Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
@@ -849,7 +939,23 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));
case Intrinsic::AMDGPU_rsq_clamped:
- return DAG.getNode(AMDGPUISD::RSQ_CLAMPED, DL, VT, Op.getOperand(1));
+ if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+ Type *Type = VT.getTypeForEVT(*DAG.getContext());
+ APFloat Max = APFloat::getLargest(Type->getFltSemantics());
+ APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true);
+
+ SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
+ SDValue Tmp = DAG.getNode(ISD::FMINNUM, DL, VT, Rsq,
+ DAG.getConstantFP(Max, VT));
+ return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
+ DAG.getConstantFP(Min, VT));
+ } else {
+ return DAG.getNode(AMDGPUISD::RSQ_CLAMPED, DL, VT, Op.getOperand(1));
+ }
+
+ case Intrinsic::AMDGPU_ldexp:
+ return DAG.getNode(AMDGPUISD::LDEXP, DL, VT, Op.getOperand(1),
+ Op.getOperand(2));
case AMDGPUIntrinsic::AMDGPU_imax:
return DAG.getNode(AMDGPUISD::SMAX, DL, VT, Op.getOperand(1),
@@ -918,6 +1024,10 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
case AMDGPUIntrinsic::AMDGPU_brev:
return DAG.getNode(AMDGPUISD::BREV, DL, VT, Op.getOperand(1));
+ case Intrinsic::AMDGPU_class:
+ return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT,
+ Op.getOperand(1), Op.getOperand(2));
+
case AMDGPUIntrinsic::AMDIL_exp: // Legacy name.
return DAG.getNode(ISD::FEXP2, DL, VT, Op.getOperand(1));
@@ -956,22 +1066,21 @@ SDValue AMDGPUTargetLowering::LowerIntrinsicLRP(SDValue Op,
}
/// \brief Generate Min/Max node
-SDValue AMDGPUTargetLowering::CombineMinMax(SDNode *N,
- SelectionDAG &DAG) const {
- SDLoc DL(N);
- EVT VT = N->getValueType(0);
-
- SDValue LHS = N->getOperand(0);
- SDValue RHS = N->getOperand(1);
- SDValue True = N->getOperand(2);
- SDValue False = N->getOperand(3);
- SDValue CC = N->getOperand(4);
+SDValue AMDGPUTargetLowering::CombineFMinMaxLegacy(SDLoc DL,
+ EVT VT,
+ SDValue LHS,
+ SDValue RHS,
+ SDValue True,
+ SDValue False,
+ SDValue CC,
+ DAGCombinerInfo &DCI) const {
+ if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
+ return SDValue();
- if (VT != MVT::f32 ||
- !((LHS == True && RHS == False) || (LHS == False && RHS == True))) {
+ if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True))
return SDValue();
- }
+ SelectionDAG &DAG = DCI.DAG;
ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
switch (CCOpcode) {
case ISD::SETOEQ:
@@ -986,24 +1095,49 @@ SDValue AMDGPUTargetLowering::CombineMinMax(SDNode *N,
case ISD::SETTRUE2:
case ISD::SETUO:
case ISD::SETO:
- llvm_unreachable("Operation should already be optimised!");
+ break;
case ISD::SETULE:
- case ISD::SETULT:
+ case ISD::SETULT: {
+ if (LHS == True)
+ return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
+ return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
+ }
case ISD::SETOLE:
case ISD::SETOLT:
case ISD::SETLE:
case ISD::SETLT: {
- unsigned Opc = (LHS == True) ? AMDGPUISD::FMIN : AMDGPUISD::FMAX;
- return DAG.getNode(Opc, DL, VT, LHS, RHS);
+ // Ordered. Assume ordered for undefined.
+
+ // Only do this after legalization to avoid interfering with other combines
+ // which might occur.
+ if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
+ !DCI.isCalledByLegalizer())
+ return SDValue();
+
+ // We need to permute the operands to get the correct NaN behavior. The
+ // selected operand is the second one based on the failing compare with NaN,
+ // so permute it based on the compare type the hardware uses.
+ if (LHS == True)
+ return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
+ return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
+ }
+ case ISD::SETUGE:
+ case ISD::SETUGT: {
+ if (LHS == True)
+ return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
+ return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
}
case ISD::SETGT:
case ISD::SETGE:
- case ISD::SETUGE:
case ISD::SETOGE:
- case ISD::SETUGT:
case ISD::SETOGT: {
- unsigned Opc = (LHS == True) ? AMDGPUISD::FMAX : AMDGPUISD::FMIN;
- return DAG.getNode(Opc, DL, VT, LHS, RHS);
+ if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
+ !DCI.isCalledByLegalizer())
+ return SDValue();
+
+ if (LHS == True)
+ return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
+ return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
}
case ISD::SETCC_INVALID:
llvm_unreachable("Invalid setcc condcode!");
@@ -1011,12 +1145,53 @@ SDValue AMDGPUTargetLowering::CombineMinMax(SDNode *N,
return SDValue();
}
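// --- Editor's sketch (not part of this patch): a scalar model of the operand
// permutation discussed in the comments above. It assumes FMIN_LEGACY /
// FMAX_LEGACY behave like a compare-and-select that returns the *second*
// operand whenever the compare fails, which is what happens when a NaN is
// involved.
static float fmin_legacy_model(float a, float b) {
  return (a < b) ? a : b;  // compare fails on NaN, so a NaN input yields b
}
// select (setolt x, y), x, y  maps to  fmin_legacy_model(x, y)  (NaN picks y);
// select (setult x, y), x, y  maps to  fmin_legacy_model(y, x)  (NaN picks x).
// That is why the ordered and unordered cases above swap the operands.
// --- end sketch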
-SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue &Op,
- SelectionDAG &DAG) const {
- LoadSDNode *Load = dyn_cast<LoadSDNode>(Op);
- EVT MemEltVT = Load->getMemoryVT().getVectorElementType();
+/// \brief Generate Min/Max node
+SDValue AMDGPUTargetLowering::CombineIMinMax(SDLoc DL,
+ EVT VT,
+ SDValue LHS,
+ SDValue RHS,
+ SDValue True,
+ SDValue False,
+ SDValue CC,
+ SelectionDAG &DAG) const {
+ if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True))
+ return SDValue();
+
+ ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
+ switch (CCOpcode) {
+ case ISD::SETULE:
+ case ISD::SETULT: {
+ unsigned Opc = (LHS == True) ? AMDGPUISD::UMIN : AMDGPUISD::UMAX;
+ return DAG.getNode(Opc, DL, VT, LHS, RHS);
+ }
+ case ISD::SETLE:
+ case ISD::SETLT: {
+ unsigned Opc = (LHS == True) ? AMDGPUISD::SMIN : AMDGPUISD::SMAX;
+ return DAG.getNode(Opc, DL, VT, LHS, RHS);
+ }
+ case ISD::SETGT:
+ case ISD::SETGE: {
+ unsigned Opc = (LHS == True) ? AMDGPUISD::SMAX : AMDGPUISD::SMIN;
+ return DAG.getNode(Opc, DL, VT, LHS, RHS);
+ }
+ case ISD::SETUGE:
+ case ISD::SETUGT: {
+ unsigned Opc = (LHS == True) ? AMDGPUISD::UMAX : AMDGPUISD::UMIN;
+ return DAG.getNode(Opc, DL, VT, LHS, RHS);
+ }
+ default:
+ return SDValue();
+ }
+}
+
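// --- Editor's sketch (not part of this patch): the scalar identities behind
// CombineIMinMax above; these helpers are illustrative only.
#include <cstdint>
static int32_t smin_from_select(int32_t a, int32_t b) {
  return (a < b) ? a : b;  // select (setlt a, b), a, b  ==>  SMIN a, b
}
static int32_t smax_from_select(int32_t a, int32_t b) {
  return (a < b) ? b : a;  // select (setlt a, b), b, a  ==>  SMAX a, b
}
// The unsigned conditions map to UMIN / UMAX the same way, matching the case
// split above.
// --- end sketch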
+SDValue AMDGPUTargetLowering::ScalarizeVectorLoad(const SDValue Op,
+ SelectionDAG &DAG) const {
+ LoadSDNode *Load = cast<LoadSDNode>(Op);
+ EVT MemVT = Load->getMemoryVT();
+ EVT MemEltVT = MemVT.getVectorElementType();
+
EVT LoadVT = Op.getValueType();
- EVT EltVT = Op.getValueType().getVectorElementType();
+ EVT EltVT = LoadVT.getVectorElementType();
EVT PtrVT = Load->getBasePtr().getValueType();
unsigned NumElts = Load->getMemoryVT().getVectorNumElements();
@@ -1024,17 +1199,19 @@ SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue &Op,
SmallVector<SDValue, 8> Chains;
SDLoc SL(Op);
+ unsigned MemEltSize = MemEltVT.getStoreSize();
+ MachinePointerInfo SrcValue(Load->getMemOperand()->getValue());
- for (unsigned i = 0, e = NumElts; i != e; ++i) {
+ for (unsigned i = 0; i < NumElts; ++i) {
SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, Load->getBasePtr(),
- DAG.getConstant(i * (MemEltVT.getSizeInBits() / 8), PtrVT));
+ DAG.getConstant(i * MemEltSize, PtrVT));
SDValue NewLoad
= DAG.getExtLoad(Load->getExtensionType(), SL, EltVT,
Load->getChain(), Ptr,
- MachinePointerInfo(Load->getMemOperand()->getValue()),
+ SrcValue.getWithOffset(i * MemEltSize),
MemEltVT, Load->isVolatile(), Load->isNonTemporal(),
- Load->getAlignment());
+ Load->isInvariant(), Load->getAlignment());
Loads.push_back(NewLoad.getValue(0));
Chains.push_back(NewLoad.getValue(1));
}
@@ -1047,6 +1224,55 @@ SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue &Op,
return DAG.getMergeValues(Ops, SL);
}
+SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
+ SelectionDAG &DAG) const {
+ EVT VT = Op.getValueType();
+
+ // If this is a 2 element vector, we really want to scalarize and not create
+ // weird 1 element vectors.
+ if (VT.getVectorNumElements() == 2)
+ return ScalarizeVectorLoad(Op, DAG);
+
+ LoadSDNode *Load = cast<LoadSDNode>(Op);
+ SDValue BasePtr = Load->getBasePtr();
+ EVT PtrVT = BasePtr.getValueType();
+ EVT MemVT = Load->getMemoryVT();
+ SDLoc SL(Op);
+ MachinePointerInfo SrcValue(Load->getMemOperand()->getValue());
+
+ EVT LoVT, HiVT;
+ EVT LoMemVT, HiMemVT;
+ SDValue Lo, Hi;
+
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
+ std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemVT);
+ std::tie(Lo, Hi) = DAG.SplitVector(Op, SL, LoVT, HiVT);
+ SDValue LoLoad
+ = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT,
+ Load->getChain(), BasePtr,
+ SrcValue,
+ LoMemVT, Load->isVolatile(), Load->isNonTemporal(),
+ Load->isInvariant(), Load->getAlignment());
+
+ SDValue HiPtr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr,
+ DAG.getConstant(LoMemVT.getStoreSize(), PtrVT));
+
+ SDValue HiLoad
+ = DAG.getExtLoad(Load->getExtensionType(), SL, HiVT,
+ Load->getChain(), HiPtr,
+ SrcValue.getWithOffset(LoMemVT.getStoreSize()),
+ HiMemVT, Load->isVolatile(), Load->isNonTemporal(),
+ Load->isInvariant(), Load->getAlignment());
+
+ SDValue Ops[] = {
+ DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad),
+ DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
+ LoLoad.getValue(1), HiLoad.getValue(1))
+ };
+
+ return DAG.getMergeValues(Ops, SL);
+}
+
SDValue AMDGPUTargetLowering::MergeVectorStore(const SDValue &Op,
SelectionDAG &DAG) const {
StoreSDNode *Store = cast<StoreSDNode>(Op);
@@ -1105,8 +1331,8 @@ SDValue AMDGPUTargetLowering::MergeVectorStore(const SDValue &Op,
Store->getAlignment());
}
-SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
- SelectionDAG &DAG) const {
+SDValue AMDGPUTargetLowering::ScalarizeVectorStore(SDValue Op,
+ SelectionDAG &DAG) const {
StoreSDNode *Store = cast<StoreSDNode>(Op);
EVT MemEltVT = Store->getMemoryVT().getVectorElementType();
EVT EltVT = Store->getValue().getValueType().getVectorElementType();
@@ -1116,21 +1342,77 @@ SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
SmallVector<SDValue, 8> Chains;
+ unsigned EltSize = MemEltVT.getStoreSize();
+ MachinePointerInfo SrcValue(Store->getMemOperand()->getValue());
+
for (unsigned i = 0, e = NumElts; i != e; ++i) {
SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
- Store->getValue(), DAG.getConstant(i, MVT::i32));
- SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT,
- Store->getBasePtr(),
- DAG.getConstant(i * (MemEltVT.getSizeInBits() / 8),
- PtrVT));
- Chains.push_back(DAG.getTruncStore(Store->getChain(), SL, Val, Ptr,
- MachinePointerInfo(Store->getMemOperand()->getValue()),
- MemEltVT, Store->isVolatile(), Store->isNonTemporal(),
- Store->getAlignment()));
+ Store->getValue(),
+ DAG.getConstant(i, MVT::i32));
+
+ SDValue Offset = DAG.getConstant(i * MemEltVT.getStoreSize(), PtrVT);
+ SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, Store->getBasePtr(), Offset);
+ SDValue NewStore =
+ DAG.getTruncStore(Store->getChain(), SL, Val, Ptr,
+ SrcValue.getWithOffset(i * EltSize),
+ MemEltVT, Store->isNonTemporal(), Store->isVolatile(),
+ Store->getAlignment());
+ Chains.push_back(NewStore);
}
+
return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, Chains);
}
+SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
+ SelectionDAG &DAG) const {
+ StoreSDNode *Store = cast<StoreSDNode>(Op);
+ SDValue Val = Store->getValue();
+ EVT VT = Val.getValueType();
+
+ // If this is a 2 element vector, we really want to scalarize and not create
+ // weird 1 element vectors.
+ if (VT.getVectorNumElements() == 2)
+ return ScalarizeVectorStore(Op, DAG);
+
+ EVT MemVT = Store->getMemoryVT();
+ SDValue Chain = Store->getChain();
+ SDValue BasePtr = Store->getBasePtr();
+ SDLoc SL(Op);
+
+ EVT LoVT, HiVT;
+ EVT LoMemVT, HiMemVT;
+ SDValue Lo, Hi;
+
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
+ std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemVT);
+ std::tie(Lo, Hi) = DAG.SplitVector(Val, SL, LoVT, HiVT);
+
+ EVT PtrVT = BasePtr.getValueType();
+ SDValue HiPtr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr,
+ DAG.getConstant(LoMemVT.getStoreSize(), PtrVT));
+
+ MachinePointerInfo SrcValue(Store->getMemOperand()->getValue());
+ SDValue LoStore
+ = DAG.getTruncStore(Chain, SL, Lo,
+ BasePtr,
+ SrcValue,
+ LoMemVT,
+ Store->isNonTemporal(),
+ Store->isVolatile(),
+ Store->getAlignment());
+ SDValue HiStore
+ = DAG.getTruncStore(Chain, SL, Hi,
+ HiPtr,
+ SrcValue.getWithOffset(LoMemVT.getStoreSize()),
+ HiMemVT,
+ Store->isNonTemporal(),
+ Store->isVolatile(),
+ Store->getAlignment());
+
+ return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore);
+}
+
+
SDValue AMDGPUTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
SDLoc DL(Op);
LoadSDNode *Load = cast<LoadSDNode>(Op);
@@ -1138,24 +1420,6 @@ SDValue AMDGPUTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
EVT MemVT = Load->getMemoryVT();
- if (ExtType != ISD::NON_EXTLOAD && !VT.isVector() && VT.getSizeInBits() > 32) {
- // We can do the extload to 32-bits, and then need to separately extend to
- // 64-bits.
-
- SDValue ExtLoad32 = DAG.getExtLoad(ExtType, DL, MVT::i32,
- Load->getChain(),
- Load->getBasePtr(),
- MemVT,
- Load->getMemOperand());
-
- SDValue Ops[] = {
- DAG.getNode(ISD::getExtForLoadExtType(ExtType), DL, VT, ExtLoad32),
- ExtLoad32.getValue(1)
- };
-
- return DAG.getMergeValues(Ops, DL);
- }
-
if (ExtType == ISD::NON_EXTLOAD && VT.getSizeInBits() < 32) {
assert(VT == MVT::i1 && "Only i1 non-extloads expected");
// FIXME: Copied from PPC
@@ -1228,7 +1492,7 @@ SDValue AMDGPUTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
if ((Store->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) &&
Store->getValue().getValueType().isVector()) {
- return SplitVectorStore(Op, DAG);
+ return ScalarizeVectorStore(Op, DAG);
}
EVT MemVT = Store->getMemoryVT();
@@ -1273,249 +1537,179 @@ SDValue AMDGPUTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
return SDValue();
}
-SDValue AMDGPUTargetLowering::LowerSDIV24(SDValue Op, SelectionDAG &DAG) const {
+// This is a shortcut for integer division because we have fast i32<->f32
+// conversions, and fast f32 reciprocal instructions. A float's 24-bit mantissa
+// is wide enough to represent up to a 24-bit integer exactly.
+SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool sign) const {
SDLoc DL(Op);
- EVT OVT = Op.getValueType();
+ EVT VT = Op.getValueType();
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
- MVT INTTY;
- MVT FLTTY;
- if (!OVT.isVector()) {
- INTTY = MVT::i32;
- FLTTY = MVT::f32;
- } else if (OVT.getVectorNumElements() == 2) {
- INTTY = MVT::v2i32;
- FLTTY = MVT::v2f32;
- } else if (OVT.getVectorNumElements() == 4) {
- INTTY = MVT::v4i32;
- FLTTY = MVT::v4f32;
- }
- unsigned bitsize = OVT.getScalarType().getSizeInBits();
- // char|short jq = ia ^ ib;
- SDValue jq = DAG.getNode(ISD::XOR, DL, OVT, LHS, RHS);
-
- // jq = jq >> (bitsize - 2)
- jq = DAG.getNode(ISD::SRA, DL, OVT, jq, DAG.getConstant(bitsize - 2, OVT));
-
- // jq = jq | 0x1
- jq = DAG.getNode(ISD::OR, DL, OVT, jq, DAG.getConstant(1, OVT));
-
- // jq = (int)jq
- jq = DAG.getSExtOrTrunc(jq, DL, INTTY);
+ MVT IntVT = MVT::i32;
+ MVT FltVT = MVT::f32;
+
+ ISD::NodeType ToFp = sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
+ ISD::NodeType ToInt = sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
+
+ if (VT.isVector()) {
+ unsigned NElts = VT.getVectorNumElements();
+ IntVT = MVT::getVectorVT(MVT::i32, NElts);
+ FltVT = MVT::getVectorVT(MVT::f32, NElts);
+ }
+
+ unsigned BitSize = VT.getScalarType().getSizeInBits();
+
+ SDValue jq = DAG.getConstant(1, IntVT);
+
+ if (sign) {
+ // char|short jq = ia ^ ib;
+ jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
+
+ // jq = jq >> (bitsize - 2)
+ jq = DAG.getNode(ISD::SRA, DL, VT, jq, DAG.getConstant(BitSize - 2, VT));
+
+ // jq = jq | 0x1
+ jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, VT));
+
+ // jq = (int)jq
+ jq = DAG.getSExtOrTrunc(jq, DL, IntVT);
+ }
// int ia = (int)LHS;
- SDValue ia = DAG.getSExtOrTrunc(LHS, DL, INTTY);
+ SDValue ia = sign ?
+ DAG.getSExtOrTrunc(LHS, DL, IntVT) : DAG.getZExtOrTrunc(LHS, DL, IntVT);
// int ib = (int)RHS;
- SDValue ib = DAG.getSExtOrTrunc(RHS, DL, INTTY);
+ SDValue ib = sign ?
+ DAG.getSExtOrTrunc(RHS, DL, IntVT) : DAG.getZExtOrTrunc(RHS, DL, IntVT);
// float fa = (float)ia;
- SDValue fa = DAG.getNode(ISD::SINT_TO_FP, DL, FLTTY, ia);
+ SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia);
// float fb = (float)ib;
- SDValue fb = DAG.getNode(ISD::SINT_TO_FP, DL, FLTTY, ib);
+ SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib);
// float fq = native_divide(fa, fb);
- SDValue fq = DAG.getNode(ISD::FMUL, DL, FLTTY,
- fa, DAG.getNode(AMDGPUISD::RCP, DL, FLTTY, fb));
+ SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT,
+ fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb));
// fq = trunc(fq);
- fq = DAG.getNode(ISD::FTRUNC, DL, FLTTY, fq);
+ fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq);
// float fqneg = -fq;
- SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FLTTY, fq);
+ SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq);
// float fr = mad(fqneg, fb, fa);
- SDValue fr = DAG.getNode(ISD::FADD, DL, FLTTY,
- DAG.getNode(ISD::MUL, DL, FLTTY, fqneg, fb), fa);
+ SDValue fr = DAG.getNode(ISD::FADD, DL, FltVT,
+ DAG.getNode(ISD::FMUL, DL, FltVT, fqneg, fb), fa);
// int iq = (int)fq;
- SDValue iq = DAG.getNode(ISD::FP_TO_SINT, DL, INTTY, fq);
+ SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq);
// fr = fabs(fr);
- fr = DAG.getNode(ISD::FABS, DL, FLTTY, fr);
+ fr = DAG.getNode(ISD::FABS, DL, FltVT, fr);
// fb = fabs(fb);
- fb = DAG.getNode(ISD::FABS, DL, FLTTY, fb);
-
- // int cv = fr >= fb;
- SDValue cv;
- if (INTTY == MVT::i32) {
- cv = DAG.getSetCC(DL, INTTY, fr, fb, ISD::SETOGE);
- } else {
- cv = DAG.getSetCC(DL, INTTY, fr, fb, ISD::SETOGE);
- }
- // jq = (cv ? jq : 0);
- jq = DAG.getNode(ISD::SELECT, DL, OVT, cv, jq,
- DAG.getConstant(0, OVT));
- // dst = iq + jq;
- iq = DAG.getSExtOrTrunc(iq, DL, OVT);
- iq = DAG.getNode(ISD::ADD, DL, OVT, iq, jq);
- return iq;
-}
-
-SDValue AMDGPUTargetLowering::LowerSDIV32(SDValue Op, SelectionDAG &DAG) const {
- SDLoc DL(Op);
- EVT OVT = Op.getValueType();
- SDValue LHS = Op.getOperand(0);
- SDValue RHS = Op.getOperand(1);
- // The LowerSDIV32 function generates equivalent to the following IL.
- // mov r0, LHS
- // mov r1, RHS
- // ilt r10, r0, 0
- // ilt r11, r1, 0
- // iadd r0, r0, r10
- // iadd r1, r1, r11
- // ixor r0, r0, r10
- // ixor r1, r1, r11
- // udiv r0, r0, r1
- // ixor r10, r10, r11
- // iadd r0, r0, r10
- // ixor DST, r0, r10
-
- // mov r0, LHS
- SDValue r0 = LHS;
-
- // mov r1, RHS
- SDValue r1 = RHS;
-
- // ilt r10, r0, 0
- SDValue r10 = DAG.getSelectCC(DL,
- r0, DAG.getConstant(0, OVT),
- DAG.getConstant(-1, OVT),
- DAG.getConstant(0, OVT),
- ISD::SETLT);
-
- // ilt r11, r1, 0
- SDValue r11 = DAG.getSelectCC(DL,
- r1, DAG.getConstant(0, OVT),
- DAG.getConstant(-1, OVT),
- DAG.getConstant(0, OVT),
- ISD::SETLT);
+ fb = DAG.getNode(ISD::FABS, DL, FltVT, fb);
- // iadd r0, r0, r10
- r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10);
+ EVT SetCCVT = getSetCCResultType(*DAG.getContext(), VT);
- // iadd r1, r1, r11
- r1 = DAG.getNode(ISD::ADD, DL, OVT, r1, r11);
-
- // ixor r0, r0, r10
- r0 = DAG.getNode(ISD::XOR, DL, OVT, r0, r10);
-
- // ixor r1, r1, r11
- r1 = DAG.getNode(ISD::XOR, DL, OVT, r1, r11);
+ // int cv = fr >= fb;
+ SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE);
- // udiv r0, r0, r1
- r0 = DAG.getNode(ISD::UDIV, DL, OVT, r0, r1);
+ // jq = (cv ? jq : 0);
+ jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, VT));
- // ixor r10, r10, r11
- r10 = DAG.getNode(ISD::XOR, DL, OVT, r10, r11);
+ // dst = trunc/extend to legal type
+ iq = sign ? DAG.getSExtOrTrunc(iq, DL, VT) : DAG.getZExtOrTrunc(iq, DL, VT);
- // iadd r0, r0, r10
- r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10);
+ // dst = iq + jq;
+ SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq);
- // ixor DST, r0, r10
- SDValue DST = DAG.getNode(ISD::XOR, DL, OVT, r0, r10);
- return DST;
-}
+ // Rem needs compensation, it's easier to recompute it
+ SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS);
+ Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem);
-SDValue AMDGPUTargetLowering::LowerSDIV64(SDValue Op, SelectionDAG &DAG) const {
- return SDValue(Op.getNode(), 0);
+ SDValue Res[2] = {
+ Div,
+ Rem
+ };
+ return DAG.getMergeValues(Res, DL);
}
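// --- Editor's sketch (not part of this patch): a host-side rendering of the
// unsigned DIVREM24 path above. A float's 24-bit significand holds the
// operands exactly, so one reciprocal multiply plus a single +1 correction
// gives the exact quotient. A correctly rounded 1.0f / fb stands in for the
// hardware RCP here; rhs != 0 and 24-bit inputs are assumed.
#include <cmath>
#include <cstdint>
static void udivrem24_ref(uint32_t lhs, uint32_t rhs,
                          uint32_t &div, uint32_t &rem) {
  float fa = static_cast<float>(lhs);
  float fb = static_cast<float>(rhs);
  float fq = std::trunc(fa * (1.0f / fb));        // fq = trunc(fa * rcp(fb))
  float fr = std::fabs(fa - fq * fb);             // |mad(-fq, fb, fa)|
  uint32_t iq = static_cast<uint32_t>(fq);
  uint32_t jq = (fr >= std::fabs(fb)) ? 1u : 0u;  // jq starts at 1 when unsigned
  div = iq + jq;
  rem = lhs - div * rhs;                          // remainder is recomputed
}
// --- end sketch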
-SDValue AMDGPUTargetLowering::LowerSDIV(SDValue Op, SelectionDAG &DAG) const {
- EVT OVT = Op.getValueType().getScalarType();
-
- if (OVT == MVT::i64)
- return LowerSDIV64(Op, DAG);
-
- if (OVT.getScalarType() == MVT::i32)
- return LowerSDIV32(Op, DAG);
+void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
+ SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &Results) const {
+ assert(Op.getValueType() == MVT::i64);
- if (OVT == MVT::i16 || OVT == MVT::i8) {
- // FIXME: We should be checking for the masked bits. This isn't reached
- // because i8 and i16 are not legal types.
- return LowerSDIV24(Op, DAG);
- }
-
- return SDValue(Op.getNode(), 0);
-}
-
-SDValue AMDGPUTargetLowering::LowerSREM32(SDValue Op, SelectionDAG &DAG) const {
SDLoc DL(Op);
- EVT OVT = Op.getValueType();
- SDValue LHS = Op.getOperand(0);
- SDValue RHS = Op.getOperand(1);
- // The LowerSREM32 function generates equivalent to the following IL.
- // mov r0, LHS
- // mov r1, RHS
- // ilt r10, r0, 0
- // ilt r11, r1, 0
- // iadd r0, r0, r10
- // iadd r1, r1, r11
- // ixor r0, r0, r10
- // ixor r1, r1, r11
- // udiv r20, r0, r1
- // umul r20, r20, r1
- // sub r0, r0, r20
- // iadd r0, r0, r10
- // ixor DST, r0, r10
+ EVT VT = Op.getValueType();
+ EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
- // mov r0, LHS
- SDValue r0 = LHS;
+ SDValue one = DAG.getConstant(1, HalfVT);
+ SDValue zero = DAG.getConstant(0, HalfVT);
- // mov r1, RHS
- SDValue r1 = RHS;
+ // Hi/Lo split
+ SDValue LHS = Op.getOperand(0);
+ SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, zero);
+ SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, one);
- // ilt r10, r0, 0
- SDValue r10 = DAG.getSetCC(DL, OVT, r0, DAG.getConstant(0, OVT), ISD::SETLT);
+ SDValue RHS = Op.getOperand(1);
+ SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, zero);
+ SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, one);
- // ilt r11, r1, 0
- SDValue r11 = DAG.getSetCC(DL, OVT, r1, DAG.getConstant(0, OVT), ISD::SETLT);
+ // Get Speculative values
+ SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
+ SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);
- // iadd r0, r0, r10
- r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10);
+ SDValue REM_Hi = zero;
+ SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, zero, REM_Part, LHS_Hi, ISD::SETEQ);
- // iadd r1, r1, r11
- r1 = DAG.getNode(ISD::ADD, DL, OVT, r1, r11);
+ SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, zero, DIV_Part, zero, ISD::SETEQ);
+ SDValue DIV_Lo = zero;
- // ixor r0, r0, r10
- r0 = DAG.getNode(ISD::XOR, DL, OVT, r0, r10);
+ const unsigned halfBitWidth = HalfVT.getSizeInBits();
- // ixor r1, r1, r11
- r1 = DAG.getNode(ISD::XOR, DL, OVT, r1, r11);
+ for (unsigned i = 0; i < halfBitWidth; ++i) {
+ SDValue POS = DAG.getConstant(halfBitWidth - i - 1, HalfVT);
+ // Get Value of high bit
+ SDValue HBit;
+ if (halfBitWidth == 32 && Subtarget->hasBFE()) {
+ HBit = DAG.getNode(AMDGPUISD::BFE_U32, DL, HalfVT, LHS_Lo, POS, one);
+ } else {
+ HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
+ HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, one);
+ }
- // udiv r20, r0, r1
- SDValue r20 = DAG.getNode(ISD::UREM, DL, OVT, r0, r1);
+ SDValue Carry = DAG.getNode(ISD::SRL, DL, HalfVT, REM_Lo,
+ DAG.getConstant(halfBitWidth - 1, HalfVT));
+ REM_Hi = DAG.getNode(ISD::SHL, DL, HalfVT, REM_Hi, one);
+ REM_Hi = DAG.getNode(ISD::OR, DL, HalfVT, REM_Hi, Carry);
- // umul r20, r20, r1
- r20 = DAG.getNode(AMDGPUISD::UMUL, DL, OVT, r20, r1);
+ REM_Lo = DAG.getNode(ISD::SHL, DL, HalfVT, REM_Lo, one);
+ REM_Lo = DAG.getNode(ISD::OR, DL, HalfVT, REM_Lo, HBit);
- // sub r0, r0, r20
- r0 = DAG.getNode(ISD::SUB, DL, OVT, r0, r20);
- // iadd r0, r0, r10
- r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10);
+ SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, REM_Hi);
- // ixor DST, r0, r10
- SDValue DST = DAG.getNode(ISD::XOR, DL, OVT, r0, r10);
- return DST;
-}
+ SDValue BIT = DAG.getConstant(1 << (halfBitWidth - i - 1), HalfVT);
+ SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, zero, ISD::SETUGE);
-SDValue AMDGPUTargetLowering::LowerSREM64(SDValue Op, SelectionDAG &DAG) const {
- return SDValue(Op.getNode(), 0);
-}
+ DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);
-SDValue AMDGPUTargetLowering::LowerSREM(SDValue Op, SelectionDAG &DAG) const {
- EVT OVT = Op.getValueType();
+ // Update REM
- if (OVT.getScalarType() == MVT::i64)
- return LowerSREM64(Op, DAG);
+ SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
- if (OVT.getScalarType() == MVT::i32)
- return LowerSREM32(Op, DAG);
+ REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE);
+ REM_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, REM, zero);
+ REM_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, REM, one);
+ }
- return SDValue(Op.getNode(), 0);
+ SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, REM_Hi);
+ SDValue DIV = DAG.getNode(ISD::BUILD_PAIR, DL, VT, DIV_Lo, DIV_Hi);
+ Results.push_back(DIV);
+ Results.push_back(REM);
}
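// --- Editor's sketch (not part of this patch): LowerUDIVREM64 above is a
// restoring long division, shifting one numerator bit into the remainder per
// iteration and subtracting the divisor whenever REM >= RHS. A plain 64-bit
// version of the same idea (assuming den != 0):
#include <cstdint>
static void udivrem64_ref(uint64_t num, uint64_t den,
                          uint64_t &quot, uint64_t &rem) {
  quot = 0;
  rem = 0;
  for (int i = 63; i >= 0; --i) {
    rem = (rem << 1) | ((num >> i) & 1);  // shift the next bit of num into rem
    if (rem >= den) {                     // the SETUGE select in the DAG loop
      rem -= den;
      quot |= UINT64_C(1) << i;
    }
  }
}
// The DAG version only iterates over the low 32 bits because the high half is
// divided speculatively first, but the per-bit step is the same.
// --- end sketch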
SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
@@ -1523,15 +1717,31 @@ SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
SDLoc DL(Op);
EVT VT = Op.getValueType();
+ if (VT == MVT::i64) {
+ SmallVector<SDValue, 2> Results;
+ LowerUDIVREM64(Op, DAG, Results);
+ return DAG.getMergeValues(Results, DL);
+ }
+
SDValue Num = Op.getOperand(0);
SDValue Den = Op.getOperand(1);
+ if (VT == MVT::i32) {
+ if (DAG.MaskedValueIsZero(Op.getOperand(0), APInt(32, 0xff << 24)) &&
+ DAG.MaskedValueIsZero(Op.getOperand(1), APInt(32, 0xff << 24))) {
+ // TODO: We technically could do this for i64, but shouldn't that just be
+ // handled by something generally reducing 64-bit division on 32-bit
+ // values to 32-bit?
+ return LowerDIVREM24(Op, DAG, false);
+ }
+ }
+
// RCP = URECIP(Den) = 2^32 / Den + e
// e is rounding error.
SDValue RCP = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Den);
- // RCP_LO = umulo(RCP, Den) */
- SDValue RCP_LO = DAG.getNode(ISD::UMULO, DL, VT, RCP, Den);
+ // RCP_LO = mul(RCP, Den)
+ SDValue RCP_LO = DAG.getNode(ISD::MUL, DL, VT, RCP, Den);
// RCP_HI = mulhu (RCP, Den) */
SDValue RCP_HI = DAG.getNode(ISD::MULHU, DL, VT, RCP, Den);
@@ -1562,7 +1772,7 @@ SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
SDValue Quotient = DAG.getNode(ISD::MULHU, DL, VT, Tmp0, Num);
// Num_S_Remainder = Quotient * Den
- SDValue Num_S_Remainder = DAG.getNode(ISD::UMULO, DL, VT, Quotient, Den);
+ SDValue Num_S_Remainder = DAG.getNode(ISD::MUL, DL, VT, Quotient, Den);
// Remainder = Num - Num_S_Remainder
SDValue Remainder = DAG.getNode(ISD::SUB, DL, VT, Num, Num_S_Remainder);
@@ -1627,12 +1837,22 @@ SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
SDLoc DL(Op);
EVT VT = Op.getValueType();
- SDValue Zero = DAG.getConstant(0, VT);
- SDValue NegOne = DAG.getConstant(-1, VT);
-
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
+ if (VT == MVT::i32) {
+ if (DAG.ComputeNumSignBits(Op.getOperand(0)) > 8 &&
+ DAG.ComputeNumSignBits(Op.getOperand(1)) > 8) {
+ // TODO: We technically could do this for i64, but shouldn't that just be
+ // handled by something generally reducing 64-bit division on 32-bit
+ // values to 32-bit?
+ return LowerDIVREM24(Op, DAG, true);
+ }
+ }
+
+ SDValue Zero = DAG.getConstant(0, VT);
+ SDValue NegOne = DAG.getConstant(-1, VT);
+
SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT);
SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT);
SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign);
@@ -1660,6 +1880,20 @@ SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
return DAG.getMergeValues(Res, DL);
}
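// --- Editor's sketch (not part of this patch): the SDIVREM expansion above is
// the usual "divide the magnitudes, then patch the signs" recipe. A scalar
// rendering, using plain unsigned division in place of the UDIVREM node and
// ignoring the INT_MIN / -1 corner case:
#include <cstdint>
static void sdivrem32_ref(int32_t lhs, int32_t rhs,
                          int32_t &quot, int32_t &rem) {
  uint32_t ua = lhs < 0 ? 0u - static_cast<uint32_t>(lhs)
                        : static_cast<uint32_t>(lhs);
  uint32_t ub = rhs < 0 ? 0u - static_cast<uint32_t>(rhs)
                        : static_cast<uint32_t>(rhs);
  uint32_t uq = ua / ub;
  uint32_t ur = ua % ub;
  // The quotient is negated when the operand signs differ (the XOR'd sign
  // masks); the remainder takes the sign of the dividend.
  quot = (lhs < 0) != (rhs < 0) ? -static_cast<int32_t>(uq)
                                : static_cast<int32_t>(uq);
  rem = lhs < 0 ? -static_cast<int32_t>(ur) : static_cast<int32_t>(ur);
}
// --- end sketch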
+// (frem x, y) -> (fsub x, (fmul (ftrunc (fdiv x, y)), y))
+SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc SL(Op);
+ EVT VT = Op.getValueType();
+ SDValue X = Op.getOperand(0);
+ SDValue Y = Op.getOperand(1);
+
+ SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y);
+ SDValue Floor = DAG.getNode(ISD::FTRUNC, SL, VT, Div);
+ SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, Floor, Y);
+
+ return DAG.getNode(ISD::FSUB, SL, VT, X, Mul);
+}
+
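// --- Editor's sketch (not part of this patch): the FREM lowering above is the
// identity frem(x, y) == x - trunc(x / y) * y, evaluated in the same floating
// point type. A scalar check of the shape; it matches fmod up to the rounding
// of the division, which is the precision this lowering accepts.
#include <cmath>
static double frem_lowered(double x, double y) {
  return x - std::trunc(x / y) * y;  // FSUB(x, FMUL(FTRUNC(FDIV(x, y)), y))
}
// e.g. frem_lowered(5.3, 2.0) ~= 1.3, the same result as std::fmod(5.3, 2.0).
// --- end sketch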
SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
SDLoc SL(Op);
SDValue Src = Op.getOperand(0);
@@ -1702,7 +1936,7 @@ SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
const unsigned ExpBits = 11;
// Extract the exponent.
- SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_I32, SL, MVT::i32,
+ SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
Hi,
DAG.getConstant(FractBits - 32, MVT::i32),
DAG.getConstant(ExpBits, MVT::i32));
@@ -1793,13 +2027,43 @@ SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
}
+SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG,
+ bool Signed) const {
+ SDLoc SL(Op);
+ SDValue Src = Op.getOperand(0);
+
+ SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);
+
+ SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC,
+ DAG.getConstant(0, MVT::i32));
+ SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC,
+ DAG.getConstant(1, MVT::i32));
+
+ SDValue CvtHi = DAG.getNode(Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP,
+ SL, MVT::f64, Hi);
+
+ SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo);
+
+ SDValue LdExp = DAG.getNode(AMDGPUISD::LDEXP, SL, MVT::f64, CvtHi,
+ DAG.getConstant(32, MVT::i32));
+
+ return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo);
+}
+
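// --- Editor's sketch (not part of this patch): the i64 -> f64 lowering above
// converts the two 32-bit halves separately and recombines them as
// hi * 2^32 + lo, with LDEXP doing the scaling. A scalar model of the
// unsigned case:
#include <cmath>
#include <cstdint>
static double u64_to_f64_ref(uint64_t v) {
  uint32_t lo = static_cast<uint32_t>(v);
  uint32_t hi = static_cast<uint32_t>(v >> 32);
  // std::ldexp(x, 32) == x * 2^32, standing in for AMDGPUISD::LDEXP.
  return std::ldexp(static_cast<double>(hi), 32) + static_cast<double>(lo);
}
// For the signed path only the hi half changes: the Signed flag above selects
// SINT_TO_FP for CvtHi while CvtLo stays UINT_TO_FP.
// --- end sketch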
SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
SelectionDAG &DAG) const {
SDValue S0 = Op.getOperand(0);
- SDLoc DL(Op);
- if (Op.getValueType() != MVT::f32 || S0.getValueType() != MVT::i64)
+ if (S0.getValueType() != MVT::i64)
return SDValue();
+ EVT DestVT = Op.getValueType();
+ if (DestVT == MVT::f64)
+ return LowerINT_TO_FP64(Op, DAG, false);
+
+ assert(DestVT == MVT::f32);
+
+ SDLoc DL(Op);
+
// f32 uint_to_fp i64
SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, S0,
DAG.getConstant(0, MVT::i32));
@@ -1812,16 +2076,62 @@ SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
return DAG.getNode(ISD::FADD, DL, MVT::f32, FloatLo, FloatHi);
}
-SDValue AMDGPUTargetLowering::ExpandSIGN_EXTEND_INREG(SDValue Op,
- unsigned BitsDiff,
- SelectionDAG &DAG) const {
- MVT VT = Op.getSimpleValueType();
- SDLoc DL(Op);
- SDValue Shift = DAG.getConstant(BitsDiff, VT);
- // Shift left by 'Shift' bits.
- SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, Op.getOperand(0), Shift);
- // Signed shift Right by 'Shift' bits.
- return DAG.getNode(ISD::SRA, DL, VT, Shl, Shift);
+SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDValue Src = Op.getOperand(0);
+ if (Src.getValueType() == MVT::i64 && Op.getValueType() == MVT::f64)
+ return LowerINT_TO_FP64(Op, DAG, true);
+
+ return SDValue();
+}
+
+SDValue AMDGPUTargetLowering::LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG,
+ bool Signed) const {
+ SDLoc SL(Op);
+
+ SDValue Src = Op.getOperand(0);
+
+ SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
+
+ SDValue K0
+ = DAG.getConstantFP(BitsToDouble(UINT64_C(0x3df0000000000000)), MVT::f64);
+ SDValue K1
+ = DAG.getConstantFP(BitsToDouble(UINT64_C(0xc1f0000000000000)), MVT::f64);
+
+ SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, Trunc, K0);
+
+ SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, MVT::f64, Mul);
+
+
+ SDValue Fma = DAG.getNode(ISD::FMA, SL, MVT::f64, FloorMul, K1, Trunc);
+
+ SDValue Hi = DAG.getNode(Signed ? ISD::FP_TO_SINT : ISD::FP_TO_UINT, SL,
+ MVT::i32, FloorMul);
+ SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma);
+
+ SDValue Result = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Lo, Hi);
+
+ return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Result);
+}
+
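// --- Editor's sketch (not part of this patch): a scalar model of the
// f64 -> i64 lowering above. K0 is 2^-32 and K1 is -2^32, so the high word is
// floor(trunc(x) * 2^-32) and the low word is the FMA-computed leftover.
// Shown for the unsigned case, assuming x is non-negative and in range.
#include <cmath>
#include <cstdint>
static uint64_t f64_to_u64_ref(double x) {
  double t   = std::trunc(x);                   // FTRUNC
  double hif = std::floor(t * 0x1p-32);         // FFLOOR(FMUL(t, K0))
  double lof = std::fma(hif, -0x1p32, t);       // FMA(hif, K1, t) == t - hif * 2^32
  uint32_t hi = static_cast<uint32_t>(hif);     // FP_TO_UINT (FP_TO_SINT if signed)
  uint32_t lo = static_cast<uint32_t>(lof);     // FP_TO_UINT
  return (static_cast<uint64_t>(hi) << 32) | lo;  // BUILD_VECTOR + BITCAST
}
// --- end sketch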
+SDValue AMDGPUTargetLowering::LowerFP_TO_SINT(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDValue Src = Op.getOperand(0);
+
+ if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64)
+ return LowerFP64_TO_INT(Op, DAG, true);
+
+ return SDValue();
+}
+
+SDValue AMDGPUTargetLowering::LowerFP_TO_UINT(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDValue Src = Op.getOperand(0);
+
+ if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64)
+ return LowerFP64_TO_INT(Op, DAG, false);
+
+ return SDValue();
}
SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
@@ -1887,7 +2197,8 @@ template <typename IntTy>
static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0,
uint32_t Offset, uint32_t Width) {
if (Width + Offset < 32) {
- IntTy Result = (Src0 << (32 - Offset - Width)) >> (32 - Width);
+ uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width);
+ IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width);
return DAG.getConstant(Result, MVT::i32);
}
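// --- Editor's sketch (not part of this patch): the fold above extracts Width
// bits starting at Offset with a left shift that discards the high bits,
// followed by a right shift that is arithmetic for BFE_I32 and logical for
// BFE_U32. Scalar form, assuming 0 < Width and Offset + Width < 32:
#include <cstdint>
template <typename IntTy>  // int32_t models BFE_I32, uint32_t models BFE_U32
static IntTy bfe_ref(IntTy src, uint32_t offset, uint32_t width) {
  uint32_t shl = static_cast<uint32_t>(src) << (32 - offset - width);
  return static_cast<IntTy>(shl) >> (32 - width);
}
// --- end sketch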
@@ -1916,7 +2227,8 @@ SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
SDValue Value = SN->getValue();
EVT VT = Value.getValueType();
- if (isTypeLegal(VT) || SN->isVolatile() || !ISD::isNormalLoad(Value.getNode()))
+ if (isTypeLegal(VT) || SN->isVolatile() ||
+ !ISD::isNormalLoad(Value.getNode()) || VT.getSizeInBits() < 8)
return SDValue();
LoadSDNode *LoadVal = cast<LoadSDNode>(Value);
@@ -1992,9 +2304,30 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
simplifyI24(N1, DCI);
return SDValue();
}
- case ISD::SELECT_CC: {
- return CombineMinMax(N, DAG);
+ case ISD::SELECT: {
+ SDValue Cond = N->getOperand(0);
+ if (Cond.getOpcode() == ISD::SETCC && Cond.hasOneUse()) {
+ SDLoc DL(N);
+ EVT VT = N->getValueType(0);
+ SDValue LHS = Cond.getOperand(0);
+ SDValue RHS = Cond.getOperand(1);
+ SDValue CC = Cond.getOperand(2);
+
+ SDValue True = N->getOperand(1);
+ SDValue False = N->getOperand(2);
+
+ if (VT == MVT::f32)
+ return CombineFMinMaxLegacy(DL, VT, LHS, RHS, True, False, CC, DCI);
+
+ // TODO: Implement min / max Evergreen instructions.
+ if (VT == MVT::i32 &&
+ Subtarget->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
+ return CombineIMinMax(DL, VT, LHS, RHS, True, False, CC, DAG);
+ }
}
+
+ break;
+ }
case AMDGPUISD::BFE_I32:
case AMDGPUISD::BFE_U32: {
assert(!N->getValueType(0).isVector() &&
@@ -2039,37 +2372,40 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT);
}
- if (ConstantSDNode *Val = dyn_cast<ConstantSDNode>(N->getOperand(0))) {
+ if (ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(BitsFrom)) {
if (Signed) {
return constantFoldBFE<int32_t>(DAG,
- Val->getSExtValue(),
+ CVal->getSExtValue(),
OffsetVal,
WidthVal);
}
return constantFoldBFE<uint32_t>(DAG,
- Val->getZExtValue(),
+ CVal->getZExtValue(),
OffsetVal,
WidthVal);
}
- APInt Demanded = APInt::getBitsSet(32,
- OffsetVal,
- OffsetVal + WidthVal);
-
if ((OffsetVal + WidthVal) >= 32) {
SDValue ShiftVal = DAG.getConstant(OffsetVal, MVT::i32);
return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32,
BitsFrom, ShiftVal);
}
- APInt KnownZero, KnownOne;
- TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
- !DCI.isBeforeLegalizeOps());
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- if (TLO.ShrinkDemandedConstant(BitsFrom, Demanded) ||
- TLI.SimplifyDemandedBits(BitsFrom, Demanded, KnownZero, KnownOne, TLO)) {
- DCI.CommitTargetLoweringOpt(TLO);
+ if (BitsFrom.hasOneUse()) {
+ APInt Demanded = APInt::getBitsSet(32,
+ OffsetVal,
+ OffsetVal + WidthVal);
+
+ APInt KnownZero, KnownOne;
+ TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
+ !DCI.isBeforeLegalizeOps());
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (TLO.ShrinkDemandedConstant(BitsFrom, Demanded) ||
+ TLI.SimplifyDemandedBits(BitsFrom, Demanded,
+ KnownZero, KnownOne, TLO)) {
+ DCI.CommitTargetLoweringOpt(TLO);
+ }
}
break;
@@ -2167,12 +2503,19 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(DWORDADDR)
NODE_NAME_CASE(FRACT)
NODE_NAME_CASE(CLAMP)
- NODE_NAME_CASE(FMAX)
+ NODE_NAME_CASE(MAD)
+ NODE_NAME_CASE(FMAX_LEGACY)
NODE_NAME_CASE(SMAX)
NODE_NAME_CASE(UMAX)
- NODE_NAME_CASE(FMIN)
+ NODE_NAME_CASE(FMIN_LEGACY)
NODE_NAME_CASE(SMIN)
NODE_NAME_CASE(UMIN)
+ NODE_NAME_CASE(FMAX3)
+ NODE_NAME_CASE(SMAX3)
+ NODE_NAME_CASE(UMAX3)
+ NODE_NAME_CASE(FMIN3)
+ NODE_NAME_CASE(SMIN3)
+ NODE_NAME_CASE(UMIN3)
NODE_NAME_CASE(URECIP)
NODE_NAME_CASE(DIV_SCALE)
NODE_NAME_CASE(DIV_FMAS)
@@ -2182,6 +2525,8 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(RSQ)
NODE_NAME_CASE(RSQ_LEGACY)
NODE_NAME_CASE(RSQ_CLAMPED)
+ NODE_NAME_CASE(LDEXP)
+ NODE_NAME_CASE(FP_CLASS)
NODE_NAME_CASE(DOT4)
NODE_NAME_CASE(BFE_U32)
NODE_NAME_CASE(BFE_I32)
@@ -2213,6 +2558,46 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
}
}
+SDValue AMDGPUTargetLowering::getRsqrtEstimate(SDValue Operand,
+ DAGCombinerInfo &DCI,
+ unsigned &RefinementSteps,
+ bool &UseOneConstNR) const {
+ SelectionDAG &DAG = DCI.DAG;
+ EVT VT = Operand.getValueType();
+
+ if (VT == MVT::f32) {
+ RefinementSteps = 0;
+ return DAG.getNode(AMDGPUISD::RSQ, SDLoc(Operand), VT, Operand);
+ }
+
+ // TODO: There is also an f64 rsq instruction, but the documentation is less
+ // clear on its precision.
+
+ return SDValue();
+}
+
+SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand,
+ DAGCombinerInfo &DCI,
+ unsigned &RefinementSteps) const {
+ SelectionDAG &DAG = DCI.DAG;
+ EVT VT = Operand.getValueType();
+
+ if (VT == MVT::f32) {
+ // Reciprocal, < 1 ulp error.
+ //
+ // This reciprocal approximation converges to < 0.5 ulp error with one
+ // Newton-Raphson iteration performed with two fused multiply-adds (FMAs).
+
+ RefinementSteps = 0;
+ return DAG.getNode(AMDGPUISD::RCP, SDLoc(Operand), VT, Operand);
+ }
+
+ // TODO: There is also an f64 rcp instruction, but the documentation is less
+ // clear on its precision.
+
+ return SDValue();
+}
+
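// --- Editor's sketch (not part of this patch): the refinement step the
// comment in getRecipEstimate above refers to, written out for the
// reciprocal. One Newton-Raphson iteration with two FMAs improves an estimate
// e of 1/d; the backend reports RefinementSteps = 0 because the hardware RCP
// is already accurate enough on its own.
#include <cmath>
static float refine_rcp(float d, float e /* initial estimate of 1/d */) {
  float err = std::fma(-d, e, 1.0f);  // 1 - d * e
  return std::fma(err, e, e);         // e + e * (1 - d * e)
}
// --- end sketch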
static void computeKnownBitsForMinMax(const SDValue Op0,
const SDValue Op1,
APInt &KnownZero,
@@ -2276,17 +2661,8 @@ void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
unsigned BitWidth = 32;
uint32_t Width = CWidth->getZExtValue() & 0x1f;
- if (Width == 0) {
- KnownZero = APInt::getAllOnesValue(BitWidth);
- KnownOne = APInt::getNullValue(BitWidth);
- return;
- }
- // FIXME: This could do a lot more. If offset is 0, should be the same as
- // sign_extend_inreg implementation, but that involves duplicating it.
- if (Opc == AMDGPUISD::BFE_I32)
- KnownOne = APInt::getHighBitsSet(BitWidth, BitWidth - Width);
- else
+ if (Opc == AMDGPUISD::BFE_U32)
KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - Width);
break;
diff --git a/lib/Target/R600/AMDGPUISelLowering.h b/lib/Target/R600/AMDGPUISelLowering.h
index 624d4e0c1967..15f529c40a31 100644
--- a/lib/Target/R600/AMDGPUISelLowering.h
+++ b/lib/Target/R600/AMDGPUISelLowering.h
@@ -13,8 +13,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef AMDGPUISELLOWERING_H
-#define AMDGPUISELLOWERING_H
+#ifndef LLVM_LIB_TARGET_R600_AMDGPUISELLOWERING_H
+#define LLVM_LIB_TARGET_R600_AMDGPUISELLOWERING_H
#include "llvm/Target/TargetLowering.h"
@@ -43,25 +43,22 @@ private:
/// \brief Split a vector store into multiple scalar stores.
/// \returns The resulting chain.
- SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerSDIV24(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerSDIV32(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerSDIV64(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerSREM(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerSREM32(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerSREM64(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerUDIVREM(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFREM(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFCEIL(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFRINT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, bool Signed) const;
SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
+
+ SDValue LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG, bool Signed) const;
+ SDValue LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const;
- SDValue ExpandSIGN_EXTEND_INREG(SDValue Op,
- unsigned BitsDiff,
- SelectionDAG &DAG) const;
SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const;
@@ -73,12 +70,25 @@ protected:
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op,
SelectionDAG &DAG) const;
- /// \brief Split a vector load into multiple scalar loads.
- SDValue SplitVectorLoad(const SDValue &Op, SelectionDAG &DAG) const;
+
+ /// \brief Split a vector load into a scalar load of each component.
+ SDValue ScalarizeVectorLoad(SDValue Op, SelectionDAG &DAG) const;
+
+ /// \brief Split a vector load into 2 loads of half the vector.
+ SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const;
+
+ /// \brief Split a vector store into a scalar store of each component.
+ SDValue ScalarizeVectorStore(SDValue Op, SelectionDAG &DAG) const;
+
+ /// \brief Split a vector store into 2 stores of half the vector.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const;
+
SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSDIVREM(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool sign) const;
+ void LowerUDIVREM64(SDValue Op, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &Results) const;
bool isHWTrueValue(SDValue Op) const;
bool isHWFalseValue(SDValue Op) const;
@@ -114,8 +124,14 @@ public:
bool isFPImmLegal(const APFloat &Imm, EVT VT) const override;
bool ShouldShrinkFPConstant(EVT VT) const override;
+ bool shouldReduceLoadWidth(SDNode *Load,
+ ISD::LoadExtType ExtType,
+ EVT ExtVT) const override;
bool isLoadBitCastBeneficial(EVT, EVT) const override;
+ bool isCheapToSpeculateCttz() const override;
+ bool isCheapToSpeculateCtlz() const override;
+
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv,
bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
@@ -132,9 +148,33 @@ public:
SDValue LowerIntrinsicIABS(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerIntrinsicLRP(SDValue Op, SelectionDAG &DAG) const;
- SDValue CombineMinMax(SDNode *N, SelectionDAG &DAG) const;
+ SDValue CombineFMinMaxLegacy(SDLoc DL,
+ EVT VT,
+ SDValue LHS,
+ SDValue RHS,
+ SDValue True,
+ SDValue False,
+ SDValue CC,
+ DAGCombinerInfo &DCI) const;
+ SDValue CombineIMinMax(SDLoc DL,
+ EVT VT,
+ SDValue LHS,
+ SDValue RHS,
+ SDValue True,
+ SDValue False,
+ SDValue CC,
+ SelectionDAG &DAG) const;
+
const char* getTargetNodeName(unsigned Opcode) const override;
+ SDValue getRsqrtEstimate(SDValue Operand,
+ DAGCombinerInfo &DCI,
+ unsigned &RefinementSteps,
+ bool &UseOneConstNR) const override;
+ SDValue getRecipEstimate(SDValue Operand,
+ DAGCombinerInfo &DCI,
+ unsigned &RefinementSteps) const override;
+
virtual SDNode *PostISelFolding(MachineSDNode *N,
SelectionDAG &DAG) const {
return N;
@@ -149,10 +189,8 @@ public:
const SelectionDAG &DAG,
unsigned Depth = 0) const override;
- virtual unsigned ComputeNumSignBitsForTargetNode(
- SDValue Op,
- const SelectionDAG &DAG,
- unsigned Depth = 0) const override;
+ unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const SelectionDAG &DAG,
+ unsigned Depth = 0) const override;
/// \brief Helper function that adds Reg to the LiveIn list of the DAG's
/// MachineFunction.
@@ -176,17 +214,24 @@ enum {
DWORDADDR,
FRACT,
CLAMP,
+ MAD, // Multiply + add with same result as the separate operations.
// SIN_HW, COS_HW - f32 for SI, 1 ULP max error, valid from -100 pi to 100 pi.
// Denormals handled on some parts.
COS_HW,
SIN_HW,
- FMAX,
+ FMAX_LEGACY,
SMAX,
UMAX,
- FMIN,
+ FMIN_LEGACY,
SMIN,
UMIN,
+ FMAX3,
+ SMAX3,
+ UMAX3,
+ FMIN3,
+ SMIN3,
+ UMIN3,
URECIP,
DIV_SCALE,
DIV_FMAS,
@@ -199,6 +244,8 @@ enum {
RSQ,
RSQ_LEGACY,
RSQ_CLAMPED,
+ LDEXP,
+ FP_CLASS,
DOT4,
BFE_U32, // Extract range of bits with zero extension to 32-bits.
BFE_I32, // Extract range of bits with sign extension to 32-bits.
@@ -248,4 +295,4 @@ enum {
} // End namespace llvm
-#endif // AMDGPUISELLOWERING_H
+#endif
diff --git a/lib/Target/R600/AMDGPUInstrInfo.cpp b/lib/Target/R600/AMDGPUInstrInfo.cpp
index fef5b8cac5ba..5beaa6841c94 100644
--- a/lib/Target/R600/AMDGPUInstrInfo.cpp
+++ b/lib/Target/R600/AMDGPUInstrInfo.cpp
@@ -86,21 +86,6 @@ AMDGPUInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
// TODO: Implement this function
return nullptr;
}
-bool AMDGPUInstrInfo::getNextBranchInstr(MachineBasicBlock::iterator &iter,
- MachineBasicBlock &MBB) const {
- while (iter != MBB.end()) {
- switch (iter->getOpcode()) {
- default:
- break;
- case AMDGPU::BRANCH_COND_i32:
- case AMDGPU::BRANCH_COND_f32:
- case AMDGPU::BRANCH:
- return true;
- };
- ++iter;
- }
- return false;
-}
void
AMDGPUInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
@@ -147,7 +132,6 @@ bool AMDGPUInstrInfo::expandPostRAPseudo (MachineBasicBlock::iterator MI) const
} else if (isRegisterStore(*MI)) {
int ValOpIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
AMDGPU::OpName::val);
- AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst);
unsigned RegIndex = MI->getOperand(RegOpIdx).getImm();
unsigned Channel = MI->getOperand(ChanOpIdx).getImm();
unsigned Address = calculateIndirectAddress(RegIndex, Channel);
@@ -215,15 +199,30 @@ AMDGPUInstrInfo::getOpcodeAfterMemoryUnfold(unsigned Opc,
return 0;
}
-bool AMDGPUInstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
- int64_t Offset1, int64_t Offset2,
- unsigned NumLoads) const {
- assert(Offset2 > Offset1
- && "Second offset should be larger than first offset!");
- // If we have less than 16 loads in a row, and the offsets are within 16,
- // then schedule together.
- // TODO: Make the loads schedule near if it fits in a cacheline
- return (NumLoads < 16 && (Offset2 - Offset1) < 16);
+bool AMDGPUInstrInfo::enableClusterLoads() const {
+ return true;
+}
+
+// FIXME: This behaves strangely. If, for example, you have 32 load + stores,
+// the first 16 loads will be interleaved with the stores, and the next 16 will
+// be clustered as expected. It should really split into two batches of 16.
+//
+// Loads are clustered until this returns false, rather than trying to schedule
+// groups of stores. This also means we have to decide whether loads from
+// different address spaces should be clustered, and how to handle loads that
+// might cause bank conflicts.
+//
+// This might be deprecated so it might not be worth that much effort to fix.
+bool AMDGPUInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1,
+ int64_t Offset0, int64_t Offset1,
+ unsigned NumLoads) const {
+ assert(Offset1 > Offset0 &&
+ "Second offset should be larger than first offset!");
+ // If we have less than 16 loads in a row, and the offsets are within 64
+ // bytes, then schedule together.
+
+ // A cacheline is 64 bytes (for global memory).
+ return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
}
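For reference, a minimal standalone restatement of the clustering heuristic above (illustrative only; the function name and constants are taken from the comment, not from any in-tree API):

#include <cassert>
#include <cstdint>

// Two loads are scheduled near each other only while the batch stays small
// and both offsets fall within one 64-byte global-memory cacheline.
static bool shouldClusterLoads(int64_t Offset0, int64_t Offset1,
                               unsigned NumLoads) {
  assert(Offset1 > Offset0 && "offsets must be ordered");
  const unsigned MaxLoadsPerCluster = 16; // batch limit from the code above
  const int64_t CachelineBytes = 64;      // global memory cacheline size
  return NumLoads <= MaxLoadsPerCluster &&
         (Offset1 - Offset0) < CachelineBytes;
}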
bool
@@ -320,7 +319,10 @@ int AMDGPUInstrInfo::getIndirectIndexEnd(const MachineFunction &MF) const {
return -1;
}
- Offset = MF.getTarget().getFrameLowering()->getFrameIndexOffset(MF, -1);
+ Offset = MF.getTarget()
+ .getSubtargetImpl()
+ ->getFrameLowering()
+ ->getFrameIndexOffset(MF, -1);
return getIndirectIndexBegin(MF) + Offset;
}
@@ -335,12 +337,12 @@ int AMDGPUInstrInfo::getMaskedMIMGOp(uint16_t Opcode, unsigned Channels) const {
}
// Wrapper for Tablegen'd function. enum Subtarget is not defined in any
-// header files, so we need to wrap it in a function that takes unsigned
+// header files, so we need to wrap it in a function that takes unsigned
// instead.
namespace llvm {
namespace AMDGPU {
int getMCOpcode(uint16_t Opcode, unsigned Gen) {
- return getMCOpcode(Opcode);
+ return getMCOpcodeGen(Opcode, (enum Subtarget)Gen);
}
}
}
diff --git a/lib/Target/R600/AMDGPUInstrInfo.h b/lib/Target/R600/AMDGPUInstrInfo.h
index d5041f558163..da9833d25a52 100644
--- a/lib/Target/R600/AMDGPUInstrInfo.h
+++ b/lib/Target/R600/AMDGPUInstrInfo.h
@@ -13,10 +13,9 @@
//
//===----------------------------------------------------------------------===//
-#ifndef AMDGPUINSTRUCTIONINFO_H
-#define AMDGPUINSTRUCTIONINFO_H
+#ifndef LLVM_LIB_TARGET_R600_AMDGPUINSTRINFO_H
+#define LLVM_LIB_TARGET_R600_AMDGPUINSTRINFO_H
-#include "AMDGPUInstrInfo.h"
#include "AMDGPURegisterInfo.h"
#include "llvm/Target/TargetInstrInfo.h"
#include <map>
@@ -41,8 +40,6 @@ class MachineInstrBuilder;
class AMDGPUInstrInfo : public AMDGPUGenInstrInfo {
private:
const AMDGPURegisterInfo RI;
- bool getNextBranchInstr(MachineBasicBlock::iterator &iter,
- MachineBasicBlock &MBB) const;
virtual void anchor();
protected:
const AMDGPUSubtarget &ST;
@@ -74,11 +71,6 @@ public:
LiveVariables *LV) const override;
- virtual void copyPhysReg(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI, DebugLoc DL,
- unsigned DestReg, unsigned SrcReg,
- bool KillSrc) const = 0;
-
bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override;
void storeRegToStackSlot(MachineBasicBlock &MBB,
@@ -101,6 +93,7 @@ protected:
MachineInstr *MI,
const SmallVectorImpl<unsigned> &Ops,
MachineInstr *LoadMI) const override;
+public:
/// \returns the smallest register index that will be accessed by an indirect
/// read or write or -1 if indirect addressing is not used by this program.
int getIndirectIndexBegin(const MachineFunction &MF) const;
@@ -109,7 +102,6 @@ protected:
/// read or write or -1 if indirect addressing is not used by this program.
int getIndirectIndexEnd(const MachineFunction &MF) const;
-public:
bool canFoldMemoryOperand(const MachineInstr *MI,
const SmallVectorImpl<unsigned> &Ops) const override;
bool unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI,
@@ -120,6 +112,9 @@ public:
unsigned getOpcodeAfterMemoryUnfold(unsigned Opc,
bool UnfoldLoad, bool UnfoldStore,
unsigned *LoadRegIndex = nullptr) const override;
+
+ bool enableClusterLoads() const override;
+
bool shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
int64_t Offset1, int64_t Offset2,
unsigned NumLoads) const override;
@@ -196,4 +191,4 @@ namespace AMDGPU {
#define AMDGPU_FLAG_REGISTER_LOAD (UINT64_C(1) << 63)
#define AMDGPU_FLAG_REGISTER_STORE (UINT64_C(1) << 62)
-#endif // AMDGPUINSTRINFO_H
+#endif
diff --git a/lib/Target/R600/AMDGPUInstrInfo.td b/lib/Target/R600/AMDGPUInstrInfo.td
index 820f1a80d75e..0e34392bd50d 100644
--- a/lib/Target/R600/AMDGPUInstrInfo.td
+++ b/lib/Target/R600/AMDGPUInstrInfo.td
@@ -23,6 +23,14 @@ def AMDGPUTrigPreOp : SDTypeProfile<1, 2,
[SDTCisSameAs<0, 1>, SDTCisFP<0>, SDTCisInt<2>]
>;
+def AMDGPULdExpOp : SDTypeProfile<1, 2,
+ [SDTCisSameAs<0, 1>, SDTCisFP<0>, SDTCisInt<2>]
+>;
+
+def AMDGPUFPClassOp : SDTypeProfile<1, 2,
+ [SDTCisInt<0>, SDTCisFP<1>, SDTCisInt<2>]
+>;
+
def AMDGPUDivScaleOp : SDTypeProfile<2, 3,
[SDTCisFP<0>, SDTCisInt<1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisSameAs<0, 4>]
>;
@@ -52,12 +60,20 @@ def AMDGPUrsq_legacy : SDNode<"AMDGPUISD::RSQ_LEGACY", SDTFPUnaryOp>;
// out = 1.0 / sqrt(a) result clamped to +/- max_float.
def AMDGPUrsq_clamped : SDNode<"AMDGPUISD::RSQ_CLAMPED", SDTFPUnaryOp>;
-// out = max(a, b) a and b are floats
-def AMDGPUfmax : SDNode<"AMDGPUISD::FMAX", SDTFPBinOp,
- [SDNPCommutative, SDNPAssociative]
+def AMDGPUldexp : SDNode<"AMDGPUISD::LDEXP", AMDGPULdExpOp>;
+
+def AMDGPUfp_class : SDNode<"AMDGPUISD::FP_CLASS", AMDGPUFPClassOp>;
+
+// out = max(a, b) a and b are floats, where a nan comparison fails.
+// This is not commutative because this gives the second operand:
+// x < nan ? x : nan -> nan
+// nan < x ? nan : x -> x
+def AMDGPUfmax_legacy : SDNode<"AMDGPUISD::FMAX_LEGACY", SDTFPBinOp,
+ []
>;
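Stated as plain code, the legacy-max semantics described in the comment amount to a compare-and-select that always yields the second operand when a NaN makes the ordered comparison fail, which is why the node cannot be marked commutative. A hypothetical scalar model (not the DAG node itself):

// fmax_legacy(x, y) behaves like the select it is lowered from.
// If either operand is NaN the comparison is false and the *second*
// operand is returned, so swapping the operands changes the result.
static float fmaxLegacy(float X, float Y) {
  return X > Y ? X : Y; // fmaxLegacy(x, NaN) -> NaN, fmaxLegacy(NaN, y) -> y
}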
def AMDGPUclamp : SDNode<"AMDGPUISD::CLAMP", SDTFPTernaryOp, []>;
+def AMDGPUmad : SDNode<"AMDGPUISD::MAD", SDTFPTernaryOp, []>;
// out = max(a, b) a and b are signed ints
def AMDGPUsmax : SDNode<"AMDGPUISD::SMAX", SDTIntBinOp,
@@ -69,12 +85,12 @@ def AMDGPUumax : SDNode<"AMDGPUISD::UMAX", SDTIntBinOp,
[SDNPCommutative, SDNPAssociative]
>;
-// out = min(a, b) a and b are floats
-def AMDGPUfmin : SDNode<"AMDGPUISD::FMIN", SDTFPBinOp,
- [SDNPCommutative, SDNPAssociative]
+// out = min(a, b) a and b are floats, where a nan comparison fails.
+def AMDGPUfmin_legacy : SDNode<"AMDGPUISD::FMIN_LEGACY", SDTFPBinOp,
+ []
>;
-// out = min(a, b) a snd b are signed ints
+// out = min(a, b) a and b are signed ints
def AMDGPUsmin : SDNode<"AMDGPUISD::SMIN", SDTIntBinOp,
[SDNPCommutative, SDNPAssociative]
>;
@@ -84,6 +100,37 @@ def AMDGPUumin : SDNode<"AMDGPUISD::UMIN", SDTIntBinOp,
[SDNPCommutative, SDNPAssociative]
>;
+// FIXME: TableGen doesn't like commutative instructions with more
+// than 2 operands.
+// out = max(a, b, c) a, b and c are floats
+def AMDGPUfmax3 : SDNode<"AMDGPUISD::FMAX3", SDTFPTernaryOp,
+ [/*SDNPCommutative, SDNPAssociative*/]
+>;
+
+// out = max(a, b, c) a, b, and c are signed ints
+def AMDGPUsmax3 : SDNode<"AMDGPUISD::SMAX3", AMDGPUDTIntTernaryOp,
+ [/*SDNPCommutative, SDNPAssociative*/]
+>;
+
+// out = max(a, b, c) a, b and c are unsigned ints
+def AMDGPUumax3 : SDNode<"AMDGPUISD::UMAX3", AMDGPUDTIntTernaryOp,
+ [/*SDNPCommutative, SDNPAssociative*/]
+>;
+
+// out = min(a, b, c) a, b and c are floats
+def AMDGPUfmin3 : SDNode<"AMDGPUISD::FMIN3", SDTFPTernaryOp,
+ [/*SDNPCommutative, SDNPAssociative*/]
+>;
+
+// out = min(a, b, c) a, b and c are signed ints
+def AMDGPUsmin3 : SDNode<"AMDGPUISD::SMIN3", AMDGPUDTIntTernaryOp,
+ [/*SDNPCommutative, SDNPAssociative*/]
+>;
+
+// out = min(a, b, c) a, b and c are unsigned ints
+def AMDGPUumin3 : SDNode<"AMDGPUISD::UMIN3", AMDGPUDTIntTernaryOp,
+ [/*SDNPCommutative, SDNPAssociative*/]
+>;
def AMDGPUcvt_f32_ubyte0 : SDNode<"AMDGPUISD::CVT_F32_UBYTE0",
SDTIntToFPOp, []>;
@@ -130,7 +177,7 @@ def AMDGPUregister_store : SDNode<"AMDGPUISD::REGISTER_STORE",
// MSKOR(dst, mask, src) MEM[dst] = ((MEM[dst] & ~mask) | src)
//
// src0: vec4(src, 0, 0, mask)
-// src1: dst - rat offset (aka pointer) in dwords
+// src1: dst - rat offset (aka pointer) in dwords
def AMDGPUstore_mskor : SDNode<"AMDGPUISD::STORE_MSKOR",
SDTypeProfile<0, 2, []>,
[SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
diff --git a/lib/Target/R600/AMDGPUInstructions.td b/lib/Target/R600/AMDGPUInstructions.td
index cd3560378e57..4e536c37b0bd 100644
--- a/lib/Target/R600/AMDGPUInstructions.td
+++ b/lib/Target/R600/AMDGPUInstructions.td
@@ -23,6 +23,8 @@ class AMDGPUInst <dag outs, dag ins, string asm, list<dag> pattern> : Instructio
let Pattern = pattern;
let Itinerary = NullALU;
+ let isCodeGenOnly = 1;
+
let TSFlags{63} = isRegisterLoad;
let TSFlags{62} = isRegisterStore;
}
@@ -71,6 +73,11 @@ def COND_OEQ : PatLeaf <
[{return N->get() == ISD::SETOEQ || N->get() == ISD::SETEQ;}]
>;
+def COND_ONE : PatLeaf <
+ (cond),
+ [{return N->get() == ISD::SETONE || N->get() == ISD::SETNE;}]
+>;
+
def COND_OGT : PatLeaf <
(cond),
[{return N->get() == ISD::SETOGT || N->get() == ISD::SETGT;}]
@@ -91,23 +98,28 @@ def COND_OLE : PatLeaf <
[{return N->get() == ISD::SETOLE || N->get() == ISD::SETLE;}]
>;
-def COND_UNE : PatLeaf <
- (cond),
- [{return N->get() == ISD::SETUNE || N->get() == ISD::SETNE;}]
->;
def COND_O : PatLeaf <(cond), [{return N->get() == ISD::SETO;}]>;
def COND_UO : PatLeaf <(cond), [{return N->get() == ISD::SETUO;}]>;
//===----------------------------------------------------------------------===//
-// PatLeafs for unsigned comparisons
+// PatLeafs for unsigned / unordered comparisons
//===----------------------------------------------------------------------===//
+def COND_UEQ : PatLeaf <(cond), [{return N->get() == ISD::SETUEQ;}]>;
+def COND_UNE : PatLeaf <(cond), [{return N->get() == ISD::SETUNE;}]>;
def COND_UGT : PatLeaf <(cond), [{return N->get() == ISD::SETUGT;}]>;
def COND_UGE : PatLeaf <(cond), [{return N->get() == ISD::SETUGE;}]>;
def COND_ULT : PatLeaf <(cond), [{return N->get() == ISD::SETULT;}]>;
def COND_ULE : PatLeaf <(cond), [{return N->get() == ISD::SETULE;}]>;
+// XXX - For some reason R600 version is preferring to use unordered
+// for setne?
+def COND_UNE_NE : PatLeaf <
+ (cond),
+ [{return N->get() == ISD::SETUNE || N->get() == ISD::SETNE;}]
+>;
+
//===----------------------------------------------------------------------===//
// PatLeafs for signed comparisons
//===----------------------------------------------------------------------===//
@@ -133,7 +145,7 @@ def COND_NE : PatLeaf <
def COND_NULL : PatLeaf <
(cond),
- [{return false;}]
+ [{(void)N; return false;}]
>;
//===----------------------------------------------------------------------===//
@@ -195,6 +207,14 @@ def sextloadi8_global : PatFrag<(ops node:$ptr), (sextloadi8 node:$ptr), [{
return isGlobalLoad(dyn_cast<LoadSDNode>(N));
}]>;
+def az_extloadi8_flat : PatFrag<(ops node:$ptr), (az_extloadi8 node:$ptr), [{
+ return isFlatLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+
+def sextloadi8_flat : PatFrag<(ops node:$ptr), (sextloadi8 node:$ptr), [{
+ return isFlatLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+
def az_extloadi8_constant : PatFrag<(ops node:$ptr), (az_extloadi8 node:$ptr), [{
return isConstantLoad(dyn_cast<LoadSDNode>(N), -1);
}]>;
@@ -223,6 +243,14 @@ def sextloadi16_global : PatFrag<(ops node:$ptr), (sextloadi16 node:$ptr), [{
return isGlobalLoad(dyn_cast<LoadSDNode>(N));
}]>;
+def az_extloadi16_flat : PatFrag<(ops node:$ptr), (az_extloadi16 node:$ptr), [{
+ return isFlatLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+
+def sextloadi16_flat : PatFrag<(ops node:$ptr), (sextloadi16 node:$ptr), [{
+ return isFlatLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+
def az_extloadi16_constant : PatFrag<(ops node:$ptr), (az_extloadi16 node:$ptr), [{
return isConstantLoad(dyn_cast<LoadSDNode>(N), -1);
}]>;
@@ -248,6 +276,11 @@ def az_extloadi32_global : PatFrag<(ops node:$ptr),
return isGlobalLoad(dyn_cast<LoadSDNode>(N));
}]>;
+def az_extloadi32_flat : PatFrag<(ops node:$ptr),
+ (az_extloadi32 node:$ptr), [{
+ return isFlatLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+
def az_extloadi32_constant : PatFrag<(ops node:$ptr),
(az_extloadi32 node:$ptr), [{
return isConstantLoad(dyn_cast<LoadSDNode>(N), -1);
@@ -263,6 +296,16 @@ def truncstorei16_global : PatFrag<(ops node:$val, node:$ptr),
return isGlobalStore(dyn_cast<StoreSDNode>(N));
}]>;
+def truncstorei8_flat : PatFrag<(ops node:$val, node:$ptr),
+ (truncstorei8 node:$val, node:$ptr), [{
+ return isFlatStore(dyn_cast<StoreSDNode>(N));
+}]>;
+
+def truncstorei16_flat : PatFrag<(ops node:$val, node:$ptr),
+ (truncstorei16 node:$val, node:$ptr), [{
+ return isFlatStore(dyn_cast<StoreSDNode>(N));
+}]>;
+
def local_store : PatFrag<(ops node:$val, node:$ptr),
(store node:$val, node:$ptr), [{
return isLocalStore(dyn_cast<StoreSDNode>(N));
@@ -282,6 +325,17 @@ def local_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{
return isLocalLoad(dyn_cast<LoadSDNode>(N));
}]>;
+class Aligned8Bytes <dag ops, dag frag> : PatFrag <ops, frag, [{
+ return cast<MemSDNode>(N)->getAlignment() % 8 == 0;
+}]>;
+
+def local_load_aligned8bytes : Aligned8Bytes <
+ (ops node:$ptr), (local_load node:$ptr)
+>;
+
+def local_store_aligned8bytes : Aligned8Bytes <
+ (ops node:$val, node:$ptr), (local_store node:$val, node:$ptr)
+>;
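A small illustration of the predicate: the fragment only asks that the access be 8-byte aligned, which is what permits selecting a wider 64-bit local-memory operation (the helper below is purely illustrative):

#include <cstdint>

// Mirrors the Aligned8Bytes check above: the access alignment must be a
// multiple of 8 bytes before a 64-bit local access is considered.
static bool isAligned8Bytes(uint64_t AlignmentInBytes) {
  return AlignmentInBytes % 8 == 0;
}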
class local_binary_atomic_op<SDNode atomic_op> :
PatFrag<(ops node:$ptr, node:$value),
@@ -307,6 +361,7 @@ def mskor_global : PatFrag<(ops node:$val, node:$ptr),
return dyn_cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;
}]>;
+
def atomic_cmp_swap_32_local :
PatFrag<(ops node:$ptr, node:$cmp, node:$swap),
(atomic_cmp_swap node:$ptr, node:$cmp, node:$swap), [{
@@ -323,6 +378,45 @@ def atomic_cmp_swap_64_local :
AN->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
}]>;
+def flat_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+ return isFlatLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+
+def flat_store : PatFrag<(ops node:$val, node:$ptr),
+ (store node:$val, node:$ptr), [{
+ return isFlatStore(dyn_cast<StoreSDNode>(N));
+}]>;
+
+def mskor_flat : PatFrag<(ops node:$val, node:$ptr),
+ (AMDGPUstore_mskor node:$val, node:$ptr), [{
+ return dyn_cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::FLAT_ADDRESS;
+}]>;
+
+class global_binary_atomic_op<SDNode atomic_op> : PatFrag<
+ (ops node:$ptr, node:$value),
+ (atomic_op node:$ptr, node:$value),
+ [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;}]
+>;
+
+def atomic_swap_global : global_binary_atomic_op<atomic_swap>;
+def atomic_add_global : global_binary_atomic_op<atomic_load_add>;
+def atomic_and_global : global_binary_atomic_op<atomic_load_and>;
+def atomic_max_global : global_binary_atomic_op<atomic_load_max>;
+def atomic_min_global : global_binary_atomic_op<atomic_load_min>;
+def atomic_or_global : global_binary_atomic_op<atomic_load_or>;
+def atomic_sub_global : global_binary_atomic_op<atomic_load_sub>;
+def atomic_umax_global : global_binary_atomic_op<atomic_load_umax>;
+def atomic_umin_global : global_binary_atomic_op<atomic_load_umin>;
+def atomic_xor_global : global_binary_atomic_op<atomic_load_xor>;
+
+//===----------------------------------------------------------------------===//
+// Misc Pattern Fragments
+//===----------------------------------------------------------------------===//
+
+def fmad : PatFrag <
+ (ops node:$src0, node:$src1, node:$src2),
+ (fadd (fmul node:$src0, node:$src1), node:$src2)
+>;
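As a scalar illustration, the fmad fragment matches a multiply feeding an add written as two separate operations; the C++ below is the unfused form, and whether a host compiler contracts it into an fma depends on its FP-contraction settings:

// fmad as matched by the pattern fragment: an fmul feeding an fadd,
// written here without any explicit fusion.
static float fmadLike(float A, float B, float C) {
  return A * B + C;
}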
class Constants {
int TWO_PI = 0x40c90fdb;
@@ -442,8 +536,9 @@ class DwordAddrPat<ValueType vt, RegisterClass rc> : Pat <
// BFI_INT patterns
-multiclass BFIPatterns <Instruction BFI_INT, Instruction LoadImm32> {
-
+multiclass BFIPatterns <Instruction BFI_INT,
+ Instruction LoadImm32,
+ RegisterClass RC64> {
// Definition from ISA doc:
// (y & x) | (z & ~x)
def : Pat <
@@ -465,8 +560,8 @@ multiclass BFIPatterns <Instruction BFI_INT, Instruction LoadImm32> {
def : Pat <
(f64 (fcopysign f64:$src0, f64:$src1)),
- (INSERT_SUBREG (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
- (i32 (EXTRACT_SUBREG $src0, sub0)), sub0),
+ (REG_SEQUENCE RC64,
+ (i32 (EXTRACT_SUBREG $src0, sub0)), sub0,
(BFI_INT (LoadImm32 0x7fffffff),
(i32 (EXTRACT_SUBREG $src0, sub1)),
(i32 (EXTRACT_SUBREG $src1, sub1))), sub1)
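The BFI_INT-based fcopysign selection reduces to a bit select with the mask 0x7fffffff: magnitude bits come from the first source and the sign bit from the second. A plain C++ restatement of that bit trick (illustrative, not the TableGen pattern itself):

#include <cstdint>
#include <cstring>

// BFI(mask, x, y) == (x & mask) | (y & ~mask), per the ISA-doc definition above.
static uint32_t bfi(uint32_t Mask, uint32_t X, uint32_t Y) {
  return (X & Mask) | (Y & ~Mask);
}

// copysign for f32: keep the 31 magnitude bits of Mag, take the sign bit
// from Sign, then reinterpret the bits back to float.
static float copysignViaBFI(float Mag, float Sign) {
  uint32_t M, S;
  std::memcpy(&M, &Mag, sizeof(M));
  std::memcpy(&S, &Sign, sizeof(S));
  uint32_t R = bfi(0x7fffffffu, M, S);
  float Out;
  std::memcpy(&Out, &R, sizeof(Out));
  return Out;
}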
diff --git a/lib/Target/R600/AMDGPUIntrinsicInfo.cpp b/lib/Target/R600/AMDGPUIntrinsicInfo.cpp
index 58916a995496..e94bb6013d83 100644
--- a/lib/Target/R600/AMDGPUIntrinsicInfo.cpp
+++ b/lib/Target/R600/AMDGPUIntrinsicInfo.cpp
@@ -24,7 +24,7 @@ using namespace llvm;
#include "AMDGPUGenIntrinsics.inc"
#undef GET_LLVM_INTRINSIC_FOR_GCC_BUILTIN
-AMDGPUIntrinsicInfo::AMDGPUIntrinsicInfo(TargetMachine *tm)
+AMDGPUIntrinsicInfo::AMDGPUIntrinsicInfo()
: TargetIntrinsicInfo() {}
std::string AMDGPUIntrinsicInfo::getName(unsigned IntrID, Type **Tys,
diff --git a/lib/Target/R600/AMDGPUIntrinsicInfo.h b/lib/Target/R600/AMDGPUIntrinsicInfo.h
index 5be68a217da5..4c95b5ec0974 100644
--- a/lib/Target/R600/AMDGPUIntrinsicInfo.h
+++ b/lib/Target/R600/AMDGPUIntrinsicInfo.h
@@ -11,8 +11,8 @@
/// \brief Interface for the AMDGPU Implementation of the Intrinsic Info class.
//
//===-----------------------------------------------------------------------===//
-#ifndef AMDGPU_INTRINSICINFO_H
-#define AMDGPU_INTRINSICINFO_H
+#ifndef LLVM_LIB_TARGET_R600_AMDGPUINTRINSICINFO_H
+#define LLVM_LIB_TARGET_R600_AMDGPUINTRINSICINFO_H
#include "llvm/IR/Intrinsics.h"
#include "llvm/Target/TargetIntrinsicInfo.h"
@@ -33,7 +33,7 @@ enum ID {
class AMDGPUIntrinsicInfo : public TargetIntrinsicInfo {
public:
- AMDGPUIntrinsicInfo(TargetMachine *tm);
+ AMDGPUIntrinsicInfo();
std::string getName(unsigned IntrId, Type **Tys = nullptr,
unsigned numTys = 0) const override;
unsigned lookupName(const char *Name, unsigned Len) const override;
@@ -45,4 +45,4 @@ public:
} // end namespace llvm
-#endif // AMDGPU_INTRINSICINFO_H
+#endif
diff --git a/lib/Target/R600/AMDGPUMCInstLower.cpp b/lib/Target/R600/AMDGPUMCInstLower.cpp
index ce5c41ceb267..1995ef2b0c9e 100644
--- a/lib/Target/R600/AMDGPUMCInstLower.cpp
+++ b/lib/Target/R600/AMDGPUMCInstLower.cpp
@@ -40,8 +40,13 @@ AMDGPUMCInstLower::AMDGPUMCInstLower(MCContext &ctx, const AMDGPUSubtarget &st):
{ }
enum AMDGPUMCInstLower::SISubtarget
-AMDGPUMCInstLower::AMDGPUSubtargetToSISubtarget(unsigned) const {
- return AMDGPUMCInstLower::SI;
+AMDGPUMCInstLower::AMDGPUSubtargetToSISubtarget(unsigned Gen) const {
+ switch (Gen) {
+ default:
+ return AMDGPUMCInstLower::SI;
+ case AMDGPUSubtarget::VOLCANIC_ISLANDS:
+ return AMDGPUMCInstLower::VI;
+ }
}
unsigned AMDGPUMCInstLower::getMCOpcode(unsigned MIOpcode) const {
@@ -63,13 +68,6 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
switch (MO.getType()) {
default:
llvm_unreachable("unknown operand type");
- case MachineOperand::MO_FPImmediate: {
- const APFloat &FloatValue = MO.getFPImm()->getValueAPF();
- assert(&FloatValue.getSemantics() == &APFloat::IEEEsingle &&
- "Only floating point immediates are supported at the moment.");
- MCOp = MCOperand::CreateFPImm(FloatValue.convertToFloat());
- break;
- }
case MachineOperand::MO_Immediate:
MCOp = MCOperand::CreateImm(MO.getImm());
break;
@@ -104,7 +102,7 @@ void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) {
#ifdef _DEBUG
StringRef Err;
- if (!TM.getInstrInfo()->verifyInstruction(MI, Err)) {
+ if (!TM.getSubtargetImpl()->getInstrInfo()->verifyInstruction(MI, Err)) {
errs() << "Warning: Illegal instruction detected: " << Err << "\n";
MI->dump();
}
@@ -128,8 +126,9 @@ void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) {
std::string &DisasmLine = DisasmLines.back();
raw_string_ostream DisasmStream(DisasmLine);
- AMDGPUInstPrinter InstPrinter(*TM.getMCAsmInfo(), *TM.getInstrInfo(),
- *TM.getRegisterInfo());
+ AMDGPUInstPrinter InstPrinter(*TM.getMCAsmInfo(),
+ *TM.getSubtargetImpl()->getInstrInfo(),
+ *TM.getSubtargetImpl()->getRegisterInfo());
InstPrinter.printInst(&TmpInst, DisasmStream, StringRef());
// Disassemble instruction/operands to hex representation.
diff --git a/lib/Target/R600/AMDGPUMCInstLower.h b/lib/Target/R600/AMDGPUMCInstLower.h
index 58fe34d32d31..0ae4d11bf1d1 100644
--- a/lib/Target/R600/AMDGPUMCInstLower.h
+++ b/lib/Target/R600/AMDGPUMCInstLower.h
@@ -8,8 +8,8 @@
/// \file
//===----------------------------------------------------------------------===//
-#ifndef AMDGPU_MCINSTLOWER_H
-#define AMDGPU_MCINSTLOWER_H
+#ifndef LLVM_LIB_TARGET_R600_AMDGPUMCINSTLOWER_H
+#define LLVM_LIB_TARGET_R600_AMDGPUMCINSTLOWER_H
namespace llvm {
@@ -22,7 +22,8 @@ class AMDGPUMCInstLower {
// This must be kept in sync with the SISubtarget class in SIInstrInfo.td
enum SISubtarget {
- SI = 0
+ SI = 0,
+ VI = 1
};
MCContext &Ctx;
@@ -45,4 +46,4 @@ public:
} // End namespace llvm
-#endif //AMDGPU_MCINSTLOWER_H
+#endif
diff --git a/lib/Target/R600/AMDGPUMachineFunction.cpp b/lib/Target/R600/AMDGPUMachineFunction.cpp
index 90af80113ece..0f3f9e26528b 100644
--- a/lib/Target/R600/AMDGPUMachineFunction.cpp
+++ b/lib/Target/R600/AMDGPUMachineFunction.cpp
@@ -12,7 +12,9 @@ void AMDGPUMachineFunction::anchor() {}
AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) :
MachineFunctionInfo(),
ShaderType(ShaderType::COMPUTE),
- LDSSize(0) {
+ LDSSize(0),
+ ScratchSize(0),
+ IsKernel(true) {
AttributeSet Set = MF.getFunction()->getAttributes();
Attribute A = Set.getAttribute(AttributeSet::FunctionIndex,
ShaderTypeAttribute);
diff --git a/lib/Target/R600/AMDGPUMachineFunction.h b/lib/Target/R600/AMDGPUMachineFunction.h
index 0854d588eeba..f5e4694e76f6 100644
--- a/lib/Target/R600/AMDGPUMachineFunction.h
+++ b/lib/Target/R600/AMDGPUMachineFunction.h
@@ -10,8 +10,8 @@
/// \file
//===----------------------------------------------------------------------===//
-#ifndef AMDGPUMACHINEFUNCTION_H
-#define AMDGPUMACHINEFUNCTION_H
+#ifndef LLVM_LIB_TARGET_R600_AMDGPUMACHINEFUNCTION_H
+#define LLVM_LIB_TARGET_R600_AMDGPUMACHINEFUNCTION_H
#include "llvm/CodeGen/MachineFunction.h"
#include <map>
@@ -30,10 +30,16 @@ public:
/// Number of bytes in the LDS that are being used.
unsigned LDSSize;
+ /// Start of implicit kernel args
+ unsigned ABIArgOffset;
+
unsigned getShaderType() const {
return ShaderType;
}
+
+ unsigned ScratchSize;
+ bool IsKernel;
};
}
-#endif // AMDGPUMACHINEFUNCTION_H
+#endif
diff --git a/lib/Target/R600/AMDGPUPromoteAlloca.cpp b/lib/Target/R600/AMDGPUPromoteAlloca.cpp
index 218750d445e6..b81fef47d55a 100644
--- a/lib/Target/R600/AMDGPUPromoteAlloca.cpp
+++ b/lib/Target/R600/AMDGPUPromoteAlloca.cpp
@@ -36,11 +36,9 @@ class AMDGPUPromoteAlloca : public FunctionPass,
public:
AMDGPUPromoteAlloca(const AMDGPUSubtarget &st) : FunctionPass(ID), ST(st),
LocalMemAvailable(0) { }
- virtual bool doInitialization(Module &M);
- virtual bool runOnFunction(Function &F);
- virtual const char *getPassName() const {
- return "AMDGPU Promote Alloca";
- }
+ bool doInitialization(Module &M) override;
+ bool runOnFunction(Function &F) override;
+ const char *getPassName() const override { return "AMDGPU Promote Alloca"; }
void visitAlloca(AllocaInst &I);
};
@@ -107,14 +105,16 @@ static VectorType *arrayTypeToVecType(const Type *ArrayTy) {
ArrayTy->getArrayNumElements());
}
-static Value* calculateVectorIndex(Value *Ptr,
- std::map<GetElementPtrInst*, Value*> GEPIdx) {
+static Value *
+calculateVectorIndex(Value *Ptr,
+ const std::map<GetElementPtrInst *, Value *> &GEPIdx) {
if (isa<AllocaInst>(Ptr))
return Constant::getNullValue(Type::getInt32Ty(Ptr->getContext()));
GetElementPtrInst *GEP = cast<GetElementPtrInst>(Ptr);
- return GEPIdx[GEP];
+ auto I = GEPIdx.find(GEP);
+ return I == GEPIdx.end() ? nullptr : I->second;
}
static Value* GEPToVectorIndex(GetElementPtrInst *GEP) {
@@ -234,7 +234,8 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
return true;
}
-static void collectUsesWithPtrTypes(Value *Val, std::vector<Value*> &WorkList) {
+static bool collectUsesWithPtrTypes(Value *Val, std::vector<Value*> &WorkList) {
+ bool Success = true;
for (User *User : Val->users()) {
if(std::find(WorkList.begin(), WorkList.end(), User) != WorkList.end())
continue;
@@ -242,11 +243,20 @@ static void collectUsesWithPtrTypes(Value *Val, std::vector<Value*> &WorkList) {
WorkList.push_back(User);
continue;
}
+
+ // FIXME: Correctly handle ptrtoint instructions.
+ Instruction *UseInst = dyn_cast<Instruction>(User);
+ if (UseInst && UseInst->getOpcode() == Instruction::PtrToInt)
+ return false;
+
if (!User->getType()->isPointerTy())
continue;
+
WorkList.push_back(User);
- collectUsesWithPtrTypes(User, WorkList);
+
+ Success &= collectUsesWithPtrTypes(User, WorkList);
}
+ return Success;
}
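The collector now returns a success flag: pointer-typed users are gathered transitively, and the promotion is abandoned outright when an unsupported use such as ptrtoint is found. A generic sketch of that walk over a toy use graph (the ToyValue type and all names are invented for illustration):

#include <algorithm>
#include <string>
#include <vector>

// Toy stand-in for an IR value: an opcode name plus its users.
struct ToyValue {
  std::string Kind; // e.g. "gep", "bitcast", "ptrtoint"
  std::vector<ToyValue *> Users;
};

// Collect all transitive users, skipping ones already seen, and fail the
// whole walk as soon as an unsupported use is reached.
static bool collectUsers(ToyValue *V, std::vector<ToyValue *> &WorkList) {
  bool Success = true;
  for (ToyValue *U : V->Users) {
    if (std::find(WorkList.begin(), WorkList.end(), U) != WorkList.end())
      continue; // already visited
    if (U->Kind == "ptrtoint")
      return false; // cannot rewrite this use: give up entirely
    WorkList.push_back(U);
    Success &= collectUsers(U, WorkList);
  }
  return Success;
}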
void AMDGPUPromoteAlloca::visitAlloca(AllocaInst &I) {
@@ -274,6 +284,13 @@ void AMDGPUPromoteAlloca::visitAlloca(AllocaInst &I) {
return;
}
+ std::vector<Value*> WorkList;
+
+ if (!collectUsesWithPtrTypes(&I, WorkList)) {
+ DEBUG(dbgs() << " Do not know how to convert all uses\n");
+ return;
+ }
+
DEBUG(dbgs() << "Promoting alloca to local memory\n");
LocalMemAvailable -= AllocaSize;
@@ -320,10 +337,6 @@ void AMDGPUPromoteAlloca::visitAlloca(AllocaInst &I) {
I.replaceAllUsesWith(Offset);
I.eraseFromParent();
- std::vector<Value*> WorkList;
-
- collectUsesWithPtrTypes(Offset, WorkList);
-
for (std::vector<Value*>::iterator i = WorkList.begin(),
e = WorkList.end(); i != e; ++i) {
Value *V = *i;
@@ -331,6 +344,13 @@ void AMDGPUPromoteAlloca::visitAlloca(AllocaInst &I) {
if (!Call) {
Type *EltTy = V->getType()->getPointerElementType();
PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS);
+
+ // The operand's value should be corrected on its own.
+ if (isa<AddrSpaceCastInst>(V))
+ continue;
+
+ // FIXME: It doesn't really make sense to try to do this for all
+ // instructions.
V->mutateType(NewTy);
continue;
}
diff --git a/lib/Target/R600/AMDGPURegisterInfo.cpp b/lib/Target/R600/AMDGPURegisterInfo.cpp
index 34332808f865..57b054bc2a61 100644
--- a/lib/Target/R600/AMDGPURegisterInfo.cpp
+++ b/lib/Target/R600/AMDGPURegisterInfo.cpp
@@ -42,8 +42,7 @@ void AMDGPURegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
}
unsigned AMDGPURegisterInfo::getFrameRegister(const MachineFunction &MF) const {
- assert(!"Subroutines not supported yet");
- return 0;
+ return AMDGPU::NoRegister;
}
unsigned AMDGPURegisterInfo::getSubRegFromChannel(unsigned Channel) const {
diff --git a/lib/Target/R600/AMDGPURegisterInfo.h b/lib/Target/R600/AMDGPURegisterInfo.h
index 46aa7a17dfca..f27576ab9736 100644
--- a/lib/Target/R600/AMDGPURegisterInfo.h
+++ b/lib/Target/R600/AMDGPURegisterInfo.h
@@ -13,8 +13,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef AMDGPUREGISTERINFO_H
-#define AMDGPUREGISTERINFO_H
+#ifndef LLVM_LIB_TARGET_R600_AMDGPUREGISTERINFO_H
+#define LLVM_LIB_TARGET_R600_AMDGPUREGISTERINFO_H
#include "llvm/ADT/BitVector.h"
#include "llvm/Target/TargetRegisterInfo.h"
@@ -51,7 +51,7 @@ struct AMDGPURegisterInfo : public AMDGPUGenRegisterInfo {
unsigned getSubRegFromChannel(unsigned Channel) const;
const MCPhysReg* getCalleeSavedRegs(const MachineFunction *MF) const override;
- virtual void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj,
+ void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj,
unsigned FIOperandNum,
RegScavenger *RS) const override;
unsigned getFrameRegister(const MachineFunction &MF) const override;
@@ -62,4 +62,4 @@ struct AMDGPURegisterInfo : public AMDGPUGenRegisterInfo {
} // End namespace llvm
-#endif // AMDIDSAREGISTERINFO_H
+#endif
diff --git a/lib/Target/R600/AMDGPUSubtarget.cpp b/lib/Target/R600/AMDGPUSubtarget.cpp
index e3c2a50ab828..597e558e6634 100644
--- a/lib/Target/R600/AMDGPUSubtarget.cpp
+++ b/lib/Target/R600/AMDGPUSubtarget.cpp
@@ -13,12 +13,13 @@
//===----------------------------------------------------------------------===//
#include "AMDGPUSubtarget.h"
+#include "R600ISelLowering.h"
#include "R600InstrInfo.h"
+#include "R600MachineScheduler.h"
+#include "SIISelLowering.h"
#include "SIInstrInfo.h"
#include "llvm/ADT/SmallString.h"
-#include "llvm/ADT/SmallString.h"
-
using namespace llvm;
#define DEBUG_TYPE "amdgpu-subtarget"
@@ -28,26 +29,23 @@ using namespace llvm;
#define GET_SUBTARGETINFO_CTOR
#include "AMDGPUGenSubtargetInfo.inc"
-AMDGPUSubtarget::AMDGPUSubtarget(StringRef TT, StringRef GPU, StringRef FS) :
- AMDGPUGenSubtargetInfo(TT, GPU, FS),
- DevName(GPU),
- Is64bit(false),
- DumpCode(false),
- R600ALUInst(false),
- HasVertexCache(false),
- TexVTXClauseSize(0),
- Gen(AMDGPUSubtarget::R600),
- FP64(false),
- FP64Denormals(false),
- FP32Denormals(false),
- CaymanISA(false),
- EnableIRStructurizer(true),
- EnablePromoteAlloca(false),
- EnableIfCvt(true),
- WavefrontSize(0),
- CFALUBug(false),
- LocalMemorySize(0),
- InstrItins(getInstrItineraryForCPU(GPU)) {
+static std::string computeDataLayout(const AMDGPUSubtarget &ST) {
+ std::string Ret = "e-p:32:32";
+
+ if (ST.is64bit()) {
+ // 32-bit private, local, and region pointers. 64-bit global and constant.
+ Ret += "-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64";
+ }
+
+ Ret += "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256"
+ "-v512:512-v1024:1024-v2048:2048-n32:64";
+
+ return Ret;
+}
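Assuming the snippet is linked against LLVM's Core and Support libraries, the string built above can be fed straight back into LLVM's DataLayout parser to confirm the pointer widths it encodes (32-bit default/private/local/region pointers, 64-bit global and constant pointers on 64-bit subtargets):

#include "llvm/IR/DataLayout.h"
#include "llvm/Support/raw_ostream.h"

int main() {
  // The 64-bit variant of the layout string produced by computeDataLayout.
  llvm::DataLayout DL("e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32"
                      "-p24:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
                      "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64");
  llvm::outs() << "default pointer bits: " << DL.getPointerSizeInBits(0) << '\n'
               << "address space 1 pointer bits: " << DL.getPointerSizeInBits(1)
               << '\n';
  return 0;
}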
+
+AMDGPUSubtarget &
+AMDGPUSubtarget::initializeSubtargetDependencies(StringRef GPU, StringRef FS) {
+ // Determine default and user-specified characteristics
// On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
// enabled, but some instructions do not respect them and they run at the
// double precision rate, so don't enable by default.
@@ -61,16 +59,37 @@ AMDGPUSubtarget::AMDGPUSubtarget(StringRef TT, StringRef GPU, StringRef FS) :
ParseSubtargetFeatures(GPU, FullFS);
+ // FIXME: I don't think Evergreen has any useful support for
+ // denormals, but should be checked. Should we issue a warning somewhere
+ // if someone tries to enable these?
if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
- InstrInfo.reset(new R600InstrInfo(*this));
-
- // FIXME: I don't think think Evergreen has any useful support for
- // denormals, but should be checked. Should we issue a warning somewhere if
- // someone tries to enable these?
FP32Denormals = false;
FP64Denormals = false;
+ }
+ return *this;
+}
+
+AMDGPUSubtarget::AMDGPUSubtarget(StringRef TT, StringRef GPU, StringRef FS,
+ TargetMachine &TM)
+ : AMDGPUGenSubtargetInfo(TT, GPU, FS), DevName(GPU), Is64bit(false),
+ DumpCode(false), R600ALUInst(false), HasVertexCache(false),
+ TexVTXClauseSize(0), Gen(AMDGPUSubtarget::R600), FP64(false),
+ FP64Denormals(false), FP32Denormals(false), CaymanISA(false),
+ FlatAddressSpace(false), EnableIRStructurizer(true),
+ EnablePromoteAlloca(false), EnableIfCvt(true),
+ EnableLoadStoreOpt(false), WavefrontSize(0), CFALUBug(false), LocalMemorySize(0),
+ DL(computeDataLayout(initializeSubtargetDependencies(GPU, FS))),
+ FrameLowering(TargetFrameLowering::StackGrowsUp,
+ 64 * 16, // Maximum stack alignment (long16)
+ 0),
+ InstrItins(getInstrItineraryForCPU(GPU)),
+ TargetTriple(TT) {
+ if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
+ InstrInfo.reset(new R600InstrInfo(*this));
+ TLInfo.reset(new R600TargetLowering(TM));
} else {
InstrInfo.reset(new SIInstrInfo(*this));
+ TLInfo.reset(new SITargetLowering(TM));
}
}
@@ -87,3 +106,10 @@ unsigned AMDGPUSubtarget::getStackEntrySize() const {
llvm_unreachable("Illegal wavefront size.");
}
}
+
+unsigned AMDGPUSubtarget::getAmdKernelCodeChipID() const {
+ switch(getGeneration()) {
+ default: llvm_unreachable("ChipID unknown");
+ case SEA_ISLANDS: return 12;
+ }
+}
diff --git a/lib/Target/R600/AMDGPUSubtarget.h b/lib/Target/R600/AMDGPUSubtarget.h
index a844b37b6be5..90179d79d25d 100644
--- a/lib/Target/R600/AMDGPUSubtarget.h
+++ b/lib/Target/R600/AMDGPUSubtarget.h
@@ -12,25 +12,26 @@
//
//===----------------------------------------------------------------------===//
-#ifndef AMDGPUSUBTARGET_H
-#define AMDGPUSUBTARGET_H
+#ifndef LLVM_LIB_TARGET_R600_AMDGPUSUBTARGET_H
+#define LLVM_LIB_TARGET_R600_AMDGPUSUBTARGET_H
#include "AMDGPU.h"
+#include "AMDGPUFrameLowering.h"
#include "AMDGPUInstrInfo.h"
+#include "AMDGPUIntrinsicInfo.h"
+#include "AMDGPUSubtarget.h"
+#include "R600ISelLowering.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
+#include "llvm/IR/DataLayout.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#define GET_SUBTARGETINFO_HEADER
#include "AMDGPUGenSubtargetInfo.inc"
-#define MAX_CB_SIZE (1 << 16)
-
namespace llvm {
class AMDGPUSubtarget : public AMDGPUGenSubtargetInfo {
- std::unique_ptr<AMDGPUInstrInfo> InstrInfo;
-
public:
enum Generation {
R600 = 0,
@@ -38,7 +39,8 @@ public:
EVERGREEN,
NORTHERN_ISLANDS,
SOUTHERN_ISLANDS,
- SEA_ISLANDS
+ SEA_ISLANDS,
+ VOLCANIC_ISLANDS,
};
private:
@@ -53,24 +55,41 @@ private:
bool FP64Denormals;
bool FP32Denormals;
bool CaymanISA;
+ bool FlatAddressSpace;
bool EnableIRStructurizer;
bool EnablePromoteAlloca;
bool EnableIfCvt;
+ bool EnableLoadStoreOpt;
unsigned WavefrontSize;
bool CFALUBug;
int LocalMemorySize;
+ const DataLayout DL;
+ AMDGPUFrameLowering FrameLowering;
+ std::unique_ptr<AMDGPUTargetLowering> TLInfo;
+ std::unique_ptr<AMDGPUInstrInfo> InstrInfo;
InstrItineraryData InstrItins;
+ Triple TargetTriple;
public:
- AMDGPUSubtarget(StringRef TT, StringRef CPU, StringRef FS);
+ AMDGPUSubtarget(StringRef TT, StringRef CPU, StringRef FS, TargetMachine &TM);
+ AMDGPUSubtarget &initializeSubtargetDependencies(StringRef GPU, StringRef FS);
- const AMDGPUInstrInfo *getInstrInfo() const {
+ const AMDGPUFrameLowering *getFrameLowering() const override {
+ return &FrameLowering;
+ }
+ const AMDGPUInstrInfo *getInstrInfo() const override {
return InstrInfo.get();
}
-
- const InstrItineraryData &getInstrItineraryData() const {
- return InstrItins;
+ const AMDGPURegisterInfo *getRegisterInfo() const override {
+ return &InstrInfo->getRegisterInfo();
+ }
+ AMDGPUTargetLowering *getTargetLowering() const override {
+ return TLInfo.get();
+ }
+ const DataLayout *getDataLayout() const override { return &DL; }
+ const InstrItineraryData *getInstrItineraryData() const override {
+ return &InstrItins;
}
void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
@@ -107,6 +126,10 @@ public:
return FP64Denormals;
}
+ bool hasFlatAddressSpace() const {
+ return FlatAddressSpace;
+ }
+
bool hasBFE() const {
return (getGeneration() >= EVERGREEN);
}
@@ -158,6 +181,10 @@ public:
return EnableIfCvt;
}
+ bool loadStoreOptEnabled() const {
+ return EnableLoadStoreOpt;
+ }
+
unsigned getWavefrontSize() const {
return WavefrontSize;
}
@@ -173,6 +200,8 @@ public:
return LocalMemorySize;
}
+ unsigned getAmdKernelCodeChipID() const;
+
bool enableMachineScheduler() const override {
return getGeneration() <= NORTHERN_ISLANDS;
}
@@ -192,8 +221,11 @@ public:
bool r600ALUEncoding() const {
return R600ALUInst;
}
+ bool isAmdHsaOS() const {
+ return TargetTriple.getOS() == Triple::AMDHSA;
+ }
};
} // End namespace llvm
-#endif // AMDGPUSUBTARGET_H
+#endif
diff --git a/lib/Target/R600/AMDGPUTargetMachine.cpp b/lib/Target/R600/AMDGPUTargetMachine.cpp
index 56ba719e6863..a1da7172d53f 100644
--- a/lib/Target/R600/AMDGPUTargetMachine.cpp
+++ b/lib/Target/R600/AMDGPUTargetMachine.cpp
@@ -22,6 +22,7 @@
#include "SIInstrInfo.h"
#include "llvm/Analysis/Passes.h"
#include "llvm/CodeGen/MachineFunctionAnalysis.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/Verifier.h"
@@ -38,6 +39,7 @@ using namespace llvm;
extern "C" void LLVMInitializeR600Target() {
// Register the target
RegisterTargetMachine<AMDGPUTargetMachine> X(TheAMDGPUTarget);
+ RegisterTargetMachine<GCNTargetMachine> Y(TheGCNTarget);
}
static ScheduleDAGInstrs *createR600MachineScheduler(MachineSchedContext *C) {
@@ -48,46 +50,20 @@ static MachineSchedRegistry
SchedCustomRegistry("r600", "Run R600's custom scheduler",
createR600MachineScheduler);
-static std::string computeDataLayout(const AMDGPUSubtarget &ST) {
- std::string Ret = "e-p:32:32";
-
- if (ST.is64bit()) {
- // 32-bit local, and region pointers. 64-bit private, global, and constant.
- Ret += "-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64";
- }
-
- Ret += "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256"
- "-v512:512-v1024:1024-v2048:2048-n32:64";
-
- return Ret;
-}
-
AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, StringRef TT,
- StringRef CPU, StringRef FS,
- TargetOptions Options,
- Reloc::Model RM, CodeModel::Model CM,
- CodeGenOpt::Level OptLevel
-)
-:
- LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OptLevel),
- Subtarget(TT, CPU, FS),
- Layout(computeDataLayout(Subtarget)),
- FrameLowering(TargetFrameLowering::StackGrowsUp,
- 64 * 16 // Maximum stack alignment (long16)
- , 0),
- IntrinsicInfo(this),
- InstrItins(&Subtarget.getInstrItineraryData()) {
- // TLInfo uses InstrInfo so it must be initialized after.
- if (Subtarget.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
- TLInfo.reset(new R600TargetLowering(*this));
- } else {
- TLInfo.reset(new SITargetLowering(*this));
- }
+ StringRef CPU, StringRef FS,
+ TargetOptions Options, Reloc::Model RM,
+ CodeModel::Model CM,
+ CodeGenOpt::Level OptLevel)
+ : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OptLevel),
+ TLOF(new TargetLoweringObjectFileELF()),
+ Subtarget(TT, CPU, FS, *this), IntrinsicInfo() {
setRequiresStructuredCFG(true);
initAsmInfo();
}
AMDGPUTargetMachine::~AMDGPUTargetMachine() {
+ delete TLOF;
}
namespace {
@@ -108,13 +84,14 @@ public:
return nullptr;
}
- virtual void addCodeGenPrepare();
+ void addIRPasses() override;
+ void addCodeGenPrepare() override;
bool addPreISel() override;
bool addInstSelector() override;
- bool addPreRegAlloc() override;
- bool addPostRegAlloc() override;
- bool addPreSched2() override;
- bool addPreEmitPass() override;
+ void addPreRegAlloc() override;
+ void addPostRegAlloc() override;
+ void addPreSched2() override;
+ void addPreEmitPass() override;
};
} // End of anonymous namespace
@@ -134,6 +111,19 @@ void AMDGPUTargetMachine::addAnalysisPasses(PassManagerBase &PM) {
PM.add(createAMDGPUTargetTransformInfoPass(this));
}
+void AMDGPUPassConfig::addIRPasses() {
+ // Function calls are not supported, so make sure we inline everything.
+ addPass(createAMDGPUAlwaysInlinePass());
+ addPass(createAlwaysInlinerPass());
+ // We need to add the barrier noop pass, otherwise adding the function
+ // inlining pass will cause all of the PassConfigs passes to be run
+ // one function at a time, which means if we have a module with two
+ // functions, then we will generate code for the first function
+ // without ever running any passes on the second.
+ addPass(createBarrierNoopPass());
+ TargetPassConfig::addIRPasses();
+}
+
void AMDGPUPassConfig::addCodeGenPrepare() {
const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
if (ST.isPromoteAllocaEnabled()) {
@@ -161,61 +151,82 @@ AMDGPUPassConfig::addPreISel() {
}
bool AMDGPUPassConfig::addInstSelector() {
+ const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
+
addPass(createAMDGPUISelDag(getAMDGPUTargetMachine()));
- addPass(createSILowerI1CopiesPass());
+
+ if (ST.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
+ addPass(createSILowerI1CopiesPass());
+ addPass(createSIFixSGPRCopiesPass(*TM));
+ addPass(createSIFoldOperandsPass());
+ }
+
return false;
}
-bool AMDGPUPassConfig::addPreRegAlloc() {
+void AMDGPUPassConfig::addPreRegAlloc() {
const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
addPass(createR600VectorRegMerger(*TM));
} else {
- addPass(createSIFixSGPRCopiesPass(*TM));
- // SIFixSGPRCopies can generate a lot of duplicate instructions,
- // so we need to run MachineCSE afterwards.
- addPass(&MachineCSEID);
- addPass(createSIShrinkInstructionsPass());
- initializeSIFixSGPRLiveRangesPass(*PassRegistry::getPassRegistry());
- insertPass(&RegisterCoalescerID, &SIFixSGPRLiveRangesID);
+ if (getOptLevel() > CodeGenOpt::None && ST.loadStoreOptEnabled()) {
+ // Don't do this with no optimizations since it throws away debug info by
+ // merging nonadjacent loads.
+
+ // This should be run after scheduling, but before register allocation. It
+ // also needs extra copies to the address operand to be eliminated.
+ initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
+ insertPass(&MachineSchedulerID, &SILoadStoreOptimizerID);
+ }
+
+ addPass(createSIShrinkInstructionsPass(), false);
+ addPass(createSIFixSGPRLiveRangesPass(), false);
}
- return false;
}
-bool AMDGPUPassConfig::addPostRegAlloc() {
+void AMDGPUPassConfig::addPostRegAlloc() {
const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
- addPass(createSIShrinkInstructionsPass());
if (ST.getGeneration() > AMDGPUSubtarget::NORTHERN_ISLANDS) {
- addPass(createSIInsertWaits(*TM));
+ addPass(createSIPrepareScratchRegs(), false);
+ addPass(createSIShrinkInstructionsPass(), false);
}
- return false;
}
-bool AMDGPUPassConfig::addPreSched2() {
+void AMDGPUPassConfig::addPreSched2() {
const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS)
- addPass(createR600EmitClauseMarkers());
+ addPass(createR600EmitClauseMarkers(), false);
if (ST.isIfCvtEnabled())
- addPass(&IfConverterID);
+ addPass(&IfConverterID, false);
if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS)
- addPass(createR600ClauseMergePass(*TM));
- return false;
+ addPass(createR600ClauseMergePass(*TM), false);
+ if (ST.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
+ addPass(createSIInsertWaits(*TM), false);
+ }
}
-bool AMDGPUPassConfig::addPreEmitPass() {
+void AMDGPUPassConfig::addPreEmitPass() {
const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
- addPass(createAMDGPUCFGStructurizerPass());
- addPass(createR600ExpandSpecialInstrsPass(*TM));
- addPass(&FinalizeMachineBundlesID);
- addPass(createR600Packetizer(*TM));
- addPass(createR600ControlFlowFinalizer(*TM));
+ addPass(createAMDGPUCFGStructurizerPass(), false);
+ addPass(createR600ExpandSpecialInstrsPass(*TM), false);
+ addPass(&FinalizeMachineBundlesID, false);
+ addPass(createR600Packetizer(*TM), false);
+ addPass(createR600ControlFlowFinalizer(*TM), false);
} else {
- addPass(createSILowerControlFlowPass(*TM));
+ addPass(createSILowerControlFlowPass(*TM), false);
}
-
- return false;
}
+
+
+//===----------------------------------------------------------------------===//
+// GCN Target Machine (SI+)
+//===----------------------------------------------------------------------===//
+
+GCNTargetMachine::GCNTargetMachine(const Target &T, StringRef TT, StringRef FS,
+ StringRef CPU, TargetOptions Options, Reloc::Model RM,
+ CodeModel::Model CM, CodeGenOpt::Level OL) :
+ AMDGPUTargetMachine(T, TT, FS, CPU, Options, RM, CM, OL) { }
diff --git a/lib/Target/R600/AMDGPUTargetMachine.h b/lib/Target/R600/AMDGPUTargetMachine.h
index 3bb15beb6bf1..66b30700d883 100644
--- a/lib/Target/R600/AMDGPUTargetMachine.h
+++ b/lib/Target/R600/AMDGPUTargetMachine.h
@@ -12,8 +12,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef AMDGPU_TARGET_MACHINE_H
-#define AMDGPU_TARGET_MACHINE_H
+#ifndef LLVM_LIB_TARGET_R600_AMDGPUTARGETMACHINE_H
+#define LLVM_LIB_TARGET_R600_AMDGPUTARGETMACHINE_H
#include "AMDGPUFrameLowering.h"
#include "AMDGPUInstrInfo.h"
@@ -24,48 +24,48 @@
namespace llvm {
-class AMDGPUTargetMachine : public LLVMTargetMachine {
+//===----------------------------------------------------------------------===//
+// AMDGPU Target Machine (R600+)
+//===----------------------------------------------------------------------===//
+class AMDGPUTargetMachine : public LLVMTargetMachine {
+protected:
+ TargetLoweringObjectFile *TLOF;
AMDGPUSubtarget Subtarget;
- const DataLayout Layout;
- AMDGPUFrameLowering FrameLowering;
AMDGPUIntrinsicInfo IntrinsicInfo;
- std::unique_ptr<AMDGPUTargetLowering> TLInfo;
- const InstrItineraryData *InstrItins;
public:
AMDGPUTargetMachine(const Target &T, StringRef TT, StringRef FS,
StringRef CPU, TargetOptions Options, Reloc::Model RM,
CodeModel::Model CM, CodeGenOpt::Level OL);
~AMDGPUTargetMachine();
- const AMDGPUFrameLowering *getFrameLowering() const override {
- return &FrameLowering;
- }
- const AMDGPUIntrinsicInfo *getIntrinsicInfo() const override {
- return &IntrinsicInfo;
- }
- const AMDGPUInstrInfo *getInstrInfo() const override {
- return getSubtargetImpl()->getInstrInfo();
- }
const AMDGPUSubtarget *getSubtargetImpl() const override {
return &Subtarget;
}
- const AMDGPURegisterInfo *getRegisterInfo() const override {
- return &getInstrInfo()->getRegisterInfo();
- }
- AMDGPUTargetLowering *getTargetLowering() const override {
- return TLInfo.get();
- }
- const InstrItineraryData *getInstrItineraryData() const override {
- return InstrItins;
+ const AMDGPUIntrinsicInfo *getIntrinsicInfo() const override {
+ return &IntrinsicInfo;
}
- const DataLayout *getDataLayout() const override { return &Layout; }
TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
/// \brief Register R600 analysis passes with a pass manager.
void addAnalysisPasses(PassManagerBase &PM) override;
+ TargetLoweringObjectFile *getObjFileLowering() const override {
+ return TLOF;
+ }
+};
+
+//===----------------------------------------------------------------------===//
+// GCN Target Machine (SI+)
+//===----------------------------------------------------------------------===//
+
+class GCNTargetMachine : public AMDGPUTargetMachine {
+
+public:
+ GCNTargetMachine(const Target &T, StringRef TT, StringRef FS,
+ StringRef CPU, TargetOptions Options, Reloc::Model RM,
+ CodeModel::Model CM, CodeGenOpt::Level OL);
};
} // End namespace llvm
-#endif // AMDGPU_TARGET_MACHINE_H
+#endif
diff --git a/lib/Target/R600/AMDGPUTargetTransformInfo.cpp b/lib/Target/R600/AMDGPUTargetTransformInfo.cpp
index 88934b65876e..e7bc00635f75 100644
--- a/lib/Target/R600/AMDGPUTargetTransformInfo.cpp
+++ b/lib/Target/R600/AMDGPUTargetTransformInfo.cpp
@@ -52,7 +52,7 @@ public:
AMDGPUTTI(const AMDGPUTargetMachine *TM)
: ImmutablePass(ID), TM(TM), ST(TM->getSubtargetImpl()),
- TLI(TM->getTargetLowering()) {
+ TLI(TM->getSubtargetImpl()->getTargetLowering()) {
initializeAMDGPUTTIPass(*PassRegistry::getPassRegistry());
}
@@ -74,16 +74,14 @@ public:
bool hasBranchDivergence() const override;
- void getUnrollingPreferences(Loop *L,
+ void getUnrollingPreferences(const Function *F, Loop *L,
UnrollingPreferences &UP) const override;
PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) const override;
unsigned getNumberOfRegisters(bool Vector) const override;
unsigned getRegisterBitWidth(bool Vector) const override;
- unsigned getMaximumUnrollFactor() const override;
-
- /// @}
+ unsigned getMaxInterleaveFactor() const override;
};
} // end anonymous namespace
@@ -99,8 +97,14 @@ llvm::createAMDGPUTargetTransformInfoPass(const AMDGPUTargetMachine *TM) {
bool AMDGPUTTI::hasBranchDivergence() const { return true; }
-void AMDGPUTTI::getUnrollingPreferences(Loop *L,
+void AMDGPUTTI::getUnrollingPreferences(const Function *, Loop *L,
UnrollingPreferences &UP) const {
+ UP.Threshold = 300; // Twice the default.
+ UP.Count = UINT_MAX;
+ UP.Partial = true;
+
+ // TODO: Do we want runtime unrolling?
+
for (const BasicBlock *BB : L->getBlocks()) {
for (const Instruction &I : *BB) {
const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I);
@@ -120,7 +124,7 @@ void AMDGPUTTI::getUnrollingPreferences(Loop *L,
//
// Don't use the maximum allowed value here as it will make some
// programs way too big.
- UP.Threshold = 500;
+ UP.Threshold = 800;
}
}
}
@@ -147,7 +151,7 @@ unsigned AMDGPUTTI::getRegisterBitWidth(bool) const {
return 32;
}
-unsigned AMDGPUTTI::getMaximumUnrollFactor() const {
+unsigned AMDGPUTTI::getMaxInterleaveFactor() const {
// Semi-arbitrary large amount.
return 64;
}
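The unrolling preferences above double the default threshold to 300 and raise it to 800 for loops whose GEPs index into private (alloca'd) arrays, since full unrolling turns the dynamic indices into constants and lets the array live in registers instead of scratch. A tiny, hypothetical example of the kind of loop that benefits:

// Buf is a dynamically indexed private array and may otherwise be placed
// in scratch memory. After full unrolling every index is a constant, so
// each element can be promoted to a register.
static float sumPrivateArray(const float *In) {
  float Buf[4];
  float Sum = 0.0f;
  for (int I = 0; I < 4; ++I)
    Buf[I] = In[I] * 2.0f;
  for (int I = 0; I < 4; ++I)
    Sum += Buf[I];
  return Sum;
}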
diff --git a/lib/Target/R600/AMDILCFGStructurizer.cpp b/lib/Target/R600/AMDILCFGStructurizer.cpp
index f3a03914391c..ee6e8ecfb29d 100644
--- a/lib/Target/R600/AMDILCFGStructurizer.cpp
+++ b/lib/Target/R600/AMDILCFGStructurizer.cpp
@@ -11,6 +11,7 @@
#include "AMDGPU.h"
#include "AMDGPUInstrInfo.h"
#include "R600InstrInfo.h"
+#include "AMDGPUSubtarget.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/SCCIterator.h"
#include "llvm/ADT/SmallVector.h"
@@ -160,7 +161,7 @@ public:
bool prepare();
bool runOnMachineFunction(MachineFunction &MF) override {
- TII = static_cast<const R600InstrInfo *>(MF.getTarget().getInstrInfo());
+ TII = static_cast<const R600InstrInfo *>(MF.getSubtarget().getInstrInfo());
TRI = &TII->getRegisterInfo();
DEBUG(MF.dump(););
OrderedBlks.clear();
@@ -337,7 +338,7 @@ protected:
void setLoopLandBlock(MachineLoop *LoopRep, MachineBasicBlock *MBB = nullptr);
MachineBasicBlock *findNearestCommonPostDom(std::set<MachineBasicBlock *>&);
- /// This is work around solution for findNearestCommonDominator not avaiable
+ /// This is work around solution for findNearestCommonDominator not available
/// to post dom a proper fix should go to Dominators.h.
MachineBasicBlock *findNearestCommonPostDom(MachineBasicBlock *MBB1,
MachineBasicBlock *MBB2);
diff --git a/lib/Target/R600/AMDKernelCodeT.h b/lib/Target/R600/AMDKernelCodeT.h
new file mode 100644
index 000000000000..4d3041ff3db8
--- /dev/null
+++ b/lib/Target/R600/AMDKernelCodeT.h
@@ -0,0 +1,704 @@
+//===-- AMDGPUKernelCodeT.h - Print AMDGPU assembly code ---------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file AMDKernelCodeT.h
+//===----------------------------------------------------------------------===//
+
+#ifndef AMDKERNELCODET_H
+#define AMDKERNELCODET_H
+
+#include <cstddef>
+#include <cstdint>
+
+//---------------------------------------------------------------------------//
+// AMD Kernel Code, and its dependencies //
+//---------------------------------------------------------------------------//
+
+typedef uint8_t hsa_powertwo8_t;
+typedef uint32_t hsa_ext_code_kind_t;
+typedef uint8_t hsa_ext_brig_profile8_t;
+typedef uint8_t hsa_ext_brig_machine_model8_t;
+typedef uint64_t hsa_ext_control_directive_present64_t;
+typedef uint16_t hsa_ext_exception_kind16_t;
+typedef uint32_t hsa_ext_code_kind32_t;
+
+typedef struct hsa_dim3_s {
+ uint32_t x;
+ uint32_t y;
+ uint32_t z;
+} hsa_dim3_t;
+
+/// The version of the amd_*_code_t struct. Minor versions must be
+/// backward compatible.
+typedef uint32_t amd_code_version32_t;
+enum amd_code_version_t {
+ AMD_CODE_VERSION_MAJOR = 0,
+ AMD_CODE_VERSION_MINOR = 1
+};
+
+/// The values used to define the number of bytes to use for the
+/// swizzle element size.
+enum amd_element_byte_size_t {
+ AMD_ELEMENT_2_BYTES = 0,
+ AMD_ELEMENT_4_BYTES = 1,
+ AMD_ELEMENT_8_BYTES = 2,
+ AMD_ELEMENT_16_BYTES = 3
+};
+
+/// Shader program settings for CS. Contains COMPUTE_PGM_RSRC1 and
+/// COMPUTE_PGM_RSRC2 registers.
+typedef uint64_t amd_compute_pgm_resource_register64_t;
+
+/// Every amd_*_code_t has the following properties, which are composed of
+/// a number of bit fields. Every bit field has a mask (AMD_CODE_PROPERTY_*),
+/// bit width (AMD_CODE_PROPERTY_*_WIDTH, and bit shift amount
+/// (AMD_CODE_PROPERTY_*_SHIFT) for convenient access. Unused bits must be 0.
+///
+/// (Note that bit fields cannot be used as their layout is
+/// implementation defined in the C standard and so cannot be used to
+/// specify an ABI)
+typedef uint32_t amd_code_property32_t;
+enum amd_code_property_mask_t {
+
+ /// Enable the setup of the SGPR user data registers
+ /// (AMD_CODE_PROPERTY_ENABLE_SGPR_*), see documentation of amd_kernel_code_t
+ /// for initial register state.
+ ///
+ /// The total number of SGPR user data registers requested must not
+ /// exceed 16. Any requests beyond 16 will be ignored.
+ ///
+ /// Used to set COMPUTE_PGM_RSRC2.USER_SGPR (set to total count of
+ /// SGPR user data registers enabled up to 16).
+
+ AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_SHIFT = 0,
+ AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_WIDTH = 1,
+ AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_SHIFT,
+
+ AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_SHIFT = 1,
+ AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_WIDTH = 1,
+ AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_SHIFT,
+
+ AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_SHIFT = 2,
+ AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_WIDTH = 1,
+ AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_SHIFT,
+
+ AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_SHIFT = 3,
+ AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_WIDTH = 1,
+ AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_SHIFT,
+
+ AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_SHIFT = 4,
+ AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_WIDTH = 1,
+ AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_SHIFT,
+
+ AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_SHIFT = 5,
+ AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_WIDTH = 1,
+ AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_SHIFT,
+
+ AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_SHIFT = 6,
+ AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_WIDTH = 1,
+ AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_SHIFT,
+
+ AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_SHIFT = 7,
+ AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_WIDTH = 1,
+ AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_SHIFT,
+
+ AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_SHIFT = 8,
+ AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_WIDTH = 1,
+ AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_SHIFT,
+
+ AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_SHIFT = 9,
+ AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_WIDTH = 1,
+ AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_SHIFT,
+
+ /// Control wave ID base counter for GDS ordered-append. Used to set
+ /// COMPUTE_DISPATCH_INITIATOR.ORDERED_APPEND_ENBL. (Not sure if
+ /// ORDERED_APPEND_MODE also needs to be settable)
+ AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_SHIFT = 10,
+ AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_WIDTH = 1,
+ AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS = ((1 << AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_SHIFT,
+
+ /// The interleave (swizzle) element size in bytes required by the
+ /// code for private memory. This must be 2, 4, 8 or 16. This value
+ /// is provided to the finalizer when it is invoked and is recorded
+ /// here. The hardware will interleave the memory requests of each
+ /// lane of a wavefront by this element size to ensure each
+ /// work-item gets a distinct memory location. Therefore, the
+ /// finalizer ensures that all load and store operations done to
+ /// private memory do not exceed this size. For example, if the
+ /// element size is 4 (32-bits or dword) and a 64-bit value must be
+ /// loaded, the finalizer will generate two 32-bit loads. This
+ /// ensures that the interleaving will get the work-item
+ /// specific dword for both halves of the 64-bit value. If it just
+ /// did a 64-bit load then it would get one dword which belonged to
+ /// its own work-item, but the second dword would belong to the
+ /// adjacent lane work-item since the interleaving is in dwords.
+ ///
+ /// The value used must match the value that the runtime configures
+ /// the GPU flat scratch (SH_STATIC_MEM_CONFIG.ELEMENT_SIZE). This
+ /// is generally DWORD.
+ ///
+ /// Use values from the amd_element_byte_size_t enum.
+ AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_SHIFT = 11,
+ AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_WIDTH = 2,
+ AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE = ((1 << AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_WIDTH) - 1) << AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_SHIFT,
+
+ /// Indicates whether global memory addresses are 64 bits. Must match
+ /// amd_kernel_code_t.hsail_machine_model ==
+ /// HSA_MACHINE_LARGE. Must also match
+ /// SH_MEM_CONFIG.PTR32 (GFX6 (SI)/GFX7 (CI)),
+ /// SH_MEM_CONFIG.ADDRESS_MODE (GFX8 (VI)+).
+ AMD_CODE_PROPERTY_IS_PTR64_SHIFT = 13,
+ AMD_CODE_PROPERTY_IS_PTR64_WIDTH = 1,
+ AMD_CODE_PROPERTY_IS_PTR64 = ((1 << AMD_CODE_PROPERTY_IS_PTR64_WIDTH) - 1) << AMD_CODE_PROPERTY_IS_PTR64_SHIFT,
+
+ /// Indicate if the generated ISA is using a dynamically sized call
+ /// stack. This can happen if calls are implemented using a call
+ /// stack and recursion, alloca or calls to indirect functions are
+ /// present. In these cases the Finalizer cannot compute the total
+ /// private segment size at compile time. In this case the
+ /// workitem_private_segment_byte_size only specifies the statically
+ /// known private segment size, and additional space must be added
+ /// for the call stack.
+ AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_SHIFT = 14,
+ AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_WIDTH = 1,
+ AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK = ((1 << AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_WIDTH) - 1) << AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_SHIFT,
+
+ /// Indicates whether the generated code has support for debugging.
+ AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_SHIFT = 15,
+ AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_WIDTH = 1,
+ AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED = ((1 << AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_WIDTH) - 1) << AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_SHIFT
+};
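+
+// Illustrative sketch, not part of the original header: the *_SHIFT/_WIDTH
+// constants and the derived masks above are meant to be combined when reading
+// an amd_code_property32_t word, e.g. for an amd_kernel_code_t object `code`
+// (defined later in this header):
+//
+//   uint32_t props = code.code_properties;
+//   bool is_ptr64 = (props & AMD_CODE_PROPERTY_IS_PTR64) != 0;
+//   /* 2-bit field; values come from the amd_element_byte_size_t enum. */
+//   uint32_t private_element_size =
+//       (props & AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE) >>
+//       AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_SHIFT;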
+
+/// @brief The hsa_ext_control_directives_t specifies the values for the HSAIL
+/// control directives. These control how the finalizer generates code. This
+/// struct is used both as an argument to hsaFinalizeKernel to specify values for
+/// the control directives, and is used in HsaKernelCode to record the values of
+/// the control directives that the finalizer used when generating the code, which
+/// either came from the finalizer argument or explicit HSAIL control
+/// directives. See the definition of the control directives in HSA Programmer's
+/// Reference Manual which also defines how the values specified as finalizer
+/// arguments have to agree with the control directives in the HSAIL code.
+typedef struct hsa_ext_control_directives_s {
+ /// This is a bit set indicating which control directives have been
+ /// specified. If the value is 0 then there are no control directives specified
+ /// and the rest of the fields can be ignored. The bits are accessed using the
+ /// hsa_ext_control_directives_present_mask_t. Any control directive that is not
+ /// enabled in this bit set must have the value of all 0s.
+ hsa_ext_control_directive_present64_t enabled_control_directives;
+
+ /// If enableBreakExceptions is not enabled then must be 0, otherwise must be
+ /// non-0 and specifies the set of HSAIL exceptions that must have the BREAK
+ /// policy enabled. If this set is not empty then the generated code may have
+ /// lower performance than if the set is empty. If the kernel being finalized
+ /// has any enablebreakexceptions control directives, then the values specified
+ /// by this argument are unioned with the values in these control
+ /// directives. If any of the functions the kernel calls have an
+ /// enablebreakexceptions control directive, then they must be equal to, or a
+ /// subset of, this union.
+ hsa_ext_exception_kind16_t enable_break_exceptions;
+
+ /// If enableDetectExceptions is not enabled then must be 0, otherwise must be
+ /// non-0 and specifies the set of HSAIL exceptions that must have the DETECT
+ /// policy enabled. If this set is not empty then the generated code may have
+ /// lower performance than if the set is empty. However, an implementation
+ /// should endeavour to make the performance impact small. If the kernel being
+ /// finalized has any enabledetectexceptions control directives, then the
+ /// values specified by this argument are unioned with the values in these
+ /// control directives. If any of the functions the kernel calls have an
+ /// enabledetectexceptions control directive, then they must be equal to, or a
+ /// subset of, this union.
+ hsa_ext_exception_kind16_t enable_detect_exceptions;
+
+ /// If maxDynamicGroupSize is not enabled then must be 0, and any amount of
+ /// dynamic group segment can be allocated for a dispatch, otherwise the value
+ /// specifies the maximum number of bytes of dynamic group segment that can be
+ /// allocated for a dispatch. If the kernel being finalized has any
+ /// maxdynamicsize control directives, then the values must be the same, and
+ /// must be the same as this argument if it is enabled. This value can be used
+ /// by the finalizer to determine the maximum number of bytes of group memory
+ /// used by each work-group by adding this value to the group memory required
+ /// for all group segment variables used by the kernel and all functions it
+ /// calls, and group memory used to implement other HSAIL features such as
+ /// fbarriers and the detect exception operations. This can allow the finalizer
+ /// to determine the expected number of work-groups that can be executed by a
+ /// compute unit and allow more resources to be allocated to the work-items if
+ /// it is known that fewer work-groups can be executed due to group memory
+ /// limitations.
+ uint32_t max_dynamic_group_size;
+
+ /// If maxFlatGridSize is not enabled then must be 0, otherwise must be greater
+ /// than 0. See HSA Programmer's Reference Manual description of
+ /// maxflatgridsize control directive.
+ uint32_t max_flat_grid_size;
+
+ /// If maxFlatWorkgroupSize is not enabled then must be 0, otherwise must be
+ /// greater than 0. See HSA Programmer's Reference Manual description of
+ /// maxflatworkgroupsize control directive.
+ uint32_t max_flat_workgroup_size;
+
+ /// If requestedWorkgroupsPerCu is not enabled then must be 0, and the
+ /// finalizer is free to generate ISA that may result in any number of
+ /// work-groups executing on a single compute unit. Otherwise, the finalizer
+ /// should attempt to generate ISA that will allow the specified number of
+ /// work-groups to execute on a single compute unit. This is only a hint and
+ /// can be ignored by the finalizer. If the kernel being finalized, or any of
+ /// the functions it calls, has a requested control directive, then the values
+ /// must be the same. This can be used to determine the number of resources
+ /// that should be allocated to a single work-group and work-item. For example,
+ /// a low value may allow more resources to be allocated, resulting in higher
+ /// per work-item performance, as it is known there will never be more than the
+ /// specified number of work-groups actually executing on the compute
+ /// unit. Conversely, a high value may allocate fewer resources, resulting in
+ /// lower per work-item performance, which is offset by the fact it allows more
+ /// work-groups to actually execute on the compute unit.
+ uint32_t requested_workgroups_per_cu;
+
+ /// If requiredGridSize is not enabled then all elements for Dim3 must be 0,
+ /// otherwise every element must be greater than 0. See HSA Programmer's
+ /// Reference Manual description of requiredgridsize control directive.
+ hsa_dim3_t required_grid_size;
+
+ /// If requiredWorkgroupSize is not enabled then all elements for Dim3 must be
+ /// 0, and the produced code can be dispatched with any legal work-group range
+ /// consistent with the dispatch dimensions. Otherwise, the code produced must
+ /// always be dispatched with the specified work-group range. No element of the
+ /// specified range must be 0. It must be consistent with required_dimensions
+ /// and max_flat_workgroup_size. If the kernel being finalized, or any of the
+ /// functions it calls, has a requiredworkgroupsize control directive, then the
+ /// values must be the same. Specifying a value can allow the finalizer to
+ /// optimize work-group id operations, and if the number of work-items in the
+ /// work-group is less than the WAVESIZE then barrier operations can be
+ /// optimized to just a memory fence.
+ hsa_dim3_t required_workgroup_size;
+
+ /// If requiredDim is not enabled then must be 0 and the produced kernel code
+ /// can be dispatched with 1, 2 or 3 dimensions. If enabled then the value is
+ /// 1..3 and the code produced must only be dispatched with a dimension that
+ /// matches. Other values are illegal. If the kernel being finalized, or any of
+ /// the functions it calls, has a requireddimsize control directive, then the
+ /// values must be the same. This can be used to optimize the code generated to
+ /// compute the absolute and flat work-group and work-item id, and the dim
+ /// HSAIL operations.
+ uint8_t required_dim;
+
+ /// Reserved. Must be 0.
+ uint8_t reserved[75];
+} hsa_ext_control_directives_t;
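+
+// Illustrative sketch, not part of the original header: per the description of
+// enabled_control_directives above, a zero-initialized structure means "no
+// control directives specified", and every other field must then be 0, e.g.
+//
+//   hsa_ext_control_directives_t directives;
+//   memset(&directives, 0, sizeof(directives));   /* requires <string.h> */
+//   /* directives can now be recorded in amd_kernel_code_t.control_directive */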
+
+/// AMD Kernel Code Object (amd_kernel_code_t). GPU CP uses the AMD Kernel
+/// Code Object to set up the hardware to execute the kernel dispatch.
+///
+/// Initial Kernel Register State.
+///
+/// Initial kernel register state will be set up by CP/SPI prior to the start
+/// of execution of every wavefront. This is limited by the constraints of the
+/// current hardware.
+///
+/// The order of the SGPR registers is defined, but the Finalizer can specify
+/// which ones are actually setup in the amd_kernel_code_t object using the
+/// enable_sgpr_* bit fields. The register numbers used for enabled registers
+/// are dense starting at SGPR0: the first enabled register is SGPR0, the next
+/// enabled register is SGPR1 etc.; disabled registers do not have an SGPR
+/// number.
+///
+/// The initial SGPRs comprise up to 16 User SGPRs that are set up by CP and
+/// apply to all waves of the grid. It is possible to specify more than 16 User
+/// SGPRs using the enable_sgpr_* bit fields, in which case only the first 16
+/// are actually initialized. These are then immediately followed by the System
+/// SGPRs that are set up by ADC/SPI and can have different values for each wave
+/// of the grid dispatch.
+///
+/// SGPR register initial state is defined as follows:
+///
+/// Private Segment Buffer (enable_sgpr_private_segment_buffer):
+/// Number of User SGPR registers: 4. V# that can be used, together with
+/// Scratch Wave Offset as an offset, to access the Private/Spill/Arg
+/// segments using a segment address. It must be set as follows:
+/// - Base address: of the scratch memory area used by the dispatch. It
+/// does not include the scratch wave offset. It will be the per process
+/// SH_HIDDEN_PRIVATE_BASE_VMID plus any offset from this dispatch (for
+/// example there may be a per pipe offset, or per AQL Queue offset).
+/// - Stride + data_format: Element Size * Index Stride (???)
+/// - Cache swizzle: ???
+/// - Swizzle enable: SH_STATIC_MEM_CONFIG.SWIZZLE_ENABLE (must be 1 for
+/// scratch)
+/// - Num records: Flat Scratch Work Item Size / Element Size (???)
+/// - Dst_sel_*: ???
+/// - Num_format: ???
+/// - Element_size: SH_STATIC_MEM_CONFIG.ELEMENT_SIZE (will be DWORD, must
+/// agree with amd_kernel_code_t.privateElementSize)
+/// - Index_stride: SH_STATIC_MEM_CONFIG.INDEX_STRIDE (will be 64 as must
+/// be number of wavefront lanes for scratch, must agree with
+/// amd_kernel_code_t.wavefrontSize)
+/// - Add tid enable: 1
+/// - ATC: from SH_MEM_CONFIG.PRIVATE_ATC,
+/// - Hash_enable: ???
+/// - Heap: ???
+/// - Mtype: from SH_STATIC_MEM_CONFIG.PRIVATE_MTYPE
+/// - Type: 0 (a buffer) (???)
+///
+/// Dispatch Ptr (enable_sgpr_dispatch_ptr):
+/// Number of User SGPR registers: 2. 64 bit address of AQL dispatch packet
+/// for kernel actually executing.
+///
+/// Queue Ptr (enable_sgpr_queue_ptr):
+/// Number of User SGPR registers: 2. 64 bit address of AmdQueue object for
+/// AQL queue on which the dispatch packet was queued.
+///
+/// Kernarg Segment Ptr (enable_sgpr_kernarg_segment_ptr):
+/// Number of User SGPR registers: 2. 64 bit address of Kernarg segment. This
+/// is directly copied from the kernargPtr in the dispatch packet. Having CP
+/// load it once avoids loading it at the beginning of every wavefront.
+///
+/// Dispatch Id (enable_sgpr_dispatch_id):
+/// Number of User SGPR registers: 2. 64 bit Dispatch ID of the dispatch
+/// packet being executed.
+///
+/// Flat Scratch Init (enable_sgpr_flat_scratch_init):
+/// Number of User SGPR registers: 2. This is 2 SGPRs.
+///
+/// For CI/VI:
+/// The first SGPR is a 32 bit byte offset from SH_MEM_HIDDEN_PRIVATE_BASE
+/// to base of memory for scratch for this dispatch. This is the same offset
+/// used in computing the Scratch Segment Buffer base address. The value of
+/// Scratch Wave Offset must be added by the kernel code and moved to
+/// SGPRn-4 for use as the FLAT SCRATCH BASE in flat memory instructions.
+///
+/// The second SGPR is 32 bit byte size of a single work-item’s scratch
+/// memory usage. This is directly loaded from the dispatch packet Private
+/// Segment Byte Size and rounded up to a multiple of DWORD.
+///
+/// \todo [Does CP need to round this to >4 byte alignment?]
+///
+/// The kernel code must move it to SGPRn-3 for use as the FLAT SCRATCH SIZE in
+/// flat memory instructions. Having CP load it once avoids loading it at
+/// the beginning of every wavefront.
+///
+/// For PI:
+/// This is the 64 bit base address of the scratch backing memory
+/// allocated by CP for this dispatch.
+///
+/// Private Segment Size (enable_sgpr_private_segment_size):
+/// Number of User SGPR registers: 1. The 32 bit byte size of a single
+/// work-item’s scratch memory allocation. This is the value from the dispatch
+/// packet. Private Segment Byte Size rounded up by CP to a multiple of DWORD.
+///
+/// \todo [Does CP need to round this to >4 byte alignment?]
+///
+/// Having CP load it once avoids loading it at the beginning of every
+/// wavefront.
+///
+/// \todo [This will not be used for CI/VI since it is the same value as
+/// the second SGPR of Flat Scratch Init. However, it is needed for PI, which
+/// changes the meaning of Flat Scratch Init.]
+///
+/// Grid Work-Group Count X (enable_sgpr_grid_workgroup_count_x):
+/// Number of User SGPR registers: 1. 32 bit count of the number of
+/// work-groups in the X dimension for the grid being executed. Computed from
+/// the fields in the HsaDispatchPacket as
+/// ((gridSize.x+workgroupSize.x-1)/workgroupSize.x).
+///
+/// Grid Work-Group Count Y (enable_sgpr_grid_workgroup_count_y):
+/// Number of User SGPR registers: 1. 32 bit count of the number of
+/// work-groups in the Y dimension for the grid being executed. Computed from
+/// the fields in the HsaDispatchPacket as
+/// ((gridSize.y+workgroupSize.y-1)/workgroupSize.y).
+///
+/// Only initialized if <16 previous SGPRs initialized.
+///
+/// Grid Work-Group Count Z (enable_sgpr_grid_workgroup_count_z):
+/// Number of User SGPR registers: 1. 32 bit count of the number of
+/// work-groups in the Z dimension for the grid being executed. Computed
+/// from the fields in the HsaDispatchPacket as
+/// ((gridSize.z+workgroupSize.z-1)/workgroupSize.z).
+///
+/// Only initialized if <16 previous SGPRs initialized.
+///
+/// Work-Group Id X (enable_sgpr_workgroup_id_x):
+/// Number of System SGPR registers: 1. 32 bit work group id in X dimension
+/// of grid for wavefront. Always present.
+///
+/// Work-Group Id Y (enable_sgpr_workgroup_id_y):
+/// Number of System SGPR registers: 1. 32 bit work group id in Y dimension
+/// of grid for wavefront.
+///
+/// Work-Group Id Z (enable_sgpr_workgroup_id_z):
+/// Number of System SGPR registers: 1. 32 bit work group id in Z dimension
+/// of grid for wavefront. If present then Work-group Id Y will also be
+/// present.
+///
+/// Work-Group Info (enable_sgpr_workgroup_info):
+/// Number of System SGPR registers: 1. {first_wave, 14’b0000,
+/// ordered_append_term[10:0], threadgroup_size_in_waves[5:0]}
+///
+/// Private Segment Wave Byte Offset
+/// (enable_sgpr_private_segment_wave_byte_offset):
+/// Number of System SGPR registers: 1. 32 bit byte offset from base of
+/// dispatch scratch base. Must be used as an offset with Private/Spill/Arg
+/// segment address when using Scratch Segment Buffer. It must be added to
+/// Flat Scratch Offset if setting up FLAT SCRATCH for flat addressing.
+///
+///
+/// The order of the VGPR registers is defined, but the Finalizer can specify
+/// which ones are actually setup in the amd_kernel_code_t object using the
+/// enableVgpr* bit fields. The register numbers used for enabled registers
+/// are dense starting at VGPR0: the first enabled register is VGPR0, the next
+/// enabled register is VGPR1 etc.; disabled registers do not have a VGPR
+/// number.
+///
+/// VGPR register initial state is defined as follows:
+///
+/// Work-Item Id X (always initialized):
+/// Number of registers: 1. 32 bit work item id in X dimension of work-group
+/// for wavefront lane.
+///
+/// Work-Item Id Y (enable_vgpr_workitem_id > 0):
+/// Number of registers: 1. 32 bit work item id in Y dimension of work-group
+/// for wavefront lane.
+///
+/// Work-Item Id Z (enable_vgpr_workitem_id > 1):
+/// Number of registers: 1. 32 bit work item id in Z dimension of work-group
+/// for wavefront lane.
+///
+///
+/// The setting of registers is done by existing GPU hardware as follows:
+/// 1) SGPRs before the Work-Group Ids are set by CP using the 16 User Data
+/// registers.
+/// 2) Work-group Id registers X, Y, Z are set by SPI which supports any
+/// combination including none.
+/// 3) Scratch Wave Offset is also set by SPI, which is why its value cannot
+/// be folded into the Flat Scratch Offset value; doing so would have saved
+/// the Finalizer-generated prolog from having to do the add.
+/// 4) The VGPRs are set by SPI which only supports specifying either (X),
+/// (X, Y) or (X, Y, Z).
+///
+/// Flat Scratch Dispatch Offset and Flat Scratch Size are adjacent SGPRs so
+/// they can be moved as a 64 bit value to the hardware required SGPRn-3 and
+/// SGPRn-4 respectively using the Finalizer ?FLAT_SCRATCH? Register.
+///
+/// The global segment can be accessed either using flat operations or buffer
+/// operations. If buffer operations are used then the Global Buffer used to
+/// access HSAIL Global/Readonly/Kernarg (which are combined) segments using a
+/// segment address is not passed into the kernel code by CP since its base
+/// address is always 0. Instead the Finalizer generates prolog code to
+/// initialize 4 SGPRs with a V# that has the following properties, and then
+/// uses that in the buffer instructions:
+/// - base address of 0
+/// - no swizzle
+/// - ATC=1
+/// - MTYPE set to support memory coherence specified in
+/// amd_kernel_code_t.globalMemoryCoherence
+///
+/// When the Global Buffer is used to access the Kernarg segment, the dispatch
+/// packet kernArgPtr must be added to a kernarg segment address before using
+/// this V#.
+/// Alternatively scalar loads can be used if the kernarg offset is uniform, as
+/// the kernarg segment is constant for the duration of the kernel execution.
+///
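+/// Illustrative worked example, not part of the original comment: using the
+/// grid work-group count formula above with gridSize.x = 1000 and
+/// workgroupSize.x = 256 gives (1000 + 256 - 1) / 256 = 1255 / 256 = 4
+/// work-groups in X (integer division), i.e. the grid rounded up to a whole
+/// number of work-groups.
+///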
+typedef struct amd_kernel_code_s {
+ /// The AMD major version of the Code Object. Must be the value
+ /// AMD_CODE_VERSION_MAJOR.
+ amd_code_version32_t amd_code_version_major;
+
+ /// The AMD minor version of the Code Object. Minor versions must be
+ /// backward compatible. Must be the value
+ /// AMD_CODE_VERSION_MINOR.
+ amd_code_version32_t amd_code_version_minor;
+
+ /// The byte size of this struct. Must be set to
+ /// sizeof(amd_kernel_code_t). Used for backward
+ /// compatibility.
+ uint32_t struct_byte_size;
+
+ /// The target chip instruction set for which code has been
+ /// generated. Values are from the E_SC_INSTRUCTION_SET enumeration
+ /// in sc/Interface/SCCommon.h.
+ uint32_t target_chip;
+
+ /// Byte offset (possibly negative) from start of amd_kernel_code_t
+ /// object to kernel's entry point instruction. The actual code for
+ /// the kernel is required to be 256 byte aligned to match hardware
+ /// requirements (SQ cache line is 16). The code must be position
+ /// independent code (PIC) for AMD devices to give runtime the
+ /// option of copying code to discrete GPU memory or APU L2
+ /// cache. The Finalizer should endeavour to allocate all kernel
+ /// machine code in contiguous memory pages so that a device
+ /// pre-fetcher will tend to only pre-fetch Kernel Code objects,
+ /// improving cache performance.
+ int64_t kernel_code_entry_byte_offset;
+
+ /// Range of bytes to consider prefetching, expressed as an offset
+ /// and size. The offset (possibly negative) is from the start of the
+ /// amd_kernel_code_t object. Set both to 0 if no prefetch
+ /// information is available.
+ ///
+ /// \todo ttye 11/15/2013 Is this the prefetch definition we want? Did
+ /// not make the size a uint64_t as prefetching more than 4GiB seems
+ /// excessive.
+ int64_t kernel_code_prefetch_byte_offset;
+ uint64_t kernel_code_prefetch_byte_size;
+
+ /// Number of bytes of scratch backing memory required for full
+ /// occupancy of target chip. This takes into account the number of
+ /// bytes of scratch per work-item, the wavefront size, the maximum
+ /// number of wavefronts per CU, and the number of CUs. This is an
+ /// upper limit on scratch. If the grid being dispatched is small it
+ /// may need less than this. If the kernel uses no scratch, or
+ /// the Finalizer has not computed this value, it must be 0.
+ uint64_t max_scratch_backing_memory_byte_size;
+
+ /// Shader program settings for CS. Contains COMPUTE_PGM_RSRC1 and
+ /// COMPUTE_PGM_RSRC2 registers.
+ amd_compute_pgm_resource_register64_t compute_pgm_resource_registers;
+
+ /// Code properties. See amd_code_property_mask_t for a full list of
+ /// properties.
+ amd_code_property32_t code_properties;
+
+ /// The amount of memory required for the combined private, spill
+ /// and arg segments for a work-item in bytes. If
+ /// is_dynamic_callstack is 1 then additional space must be added to
+ /// this value for the call stack.
+ uint32_t workitem_private_segment_byte_size;
+
+ /// The amount of group segment memory required by a work-group in
+ /// bytes. This does not include any dynamically allocated group
+ /// segment memory that may be added when the kernel is
+ /// dispatched.
+ uint32_t workgroup_group_segment_byte_size;
+
+ /// Number of bytes of GDS required by the kernel dispatch. Must be 0 if
+ /// not using GDS.
+ uint32_t gds_segment_byte_size;
+
+ /// The size in bytes of the kernarg segment that holds the values
+ /// of the arguments to the kernel. This could be used by CP to
+ /// prefetch the kernarg segment pointed to by the dispatch packet.
+ uint64_t kernarg_segment_byte_size;
+
+ /// Number of fbarrier's used in the kernel and all functions it
+ /// calls. If the implementation uses group memory to allocate the
+ /// fbarriers then that amount must already be included in the
+ /// workgroup_group_segment_byte_size total.
+ uint32_t workgroup_fbarrier_count;
+
+ /// Number of scalar registers used by a wavefront. This includes
+ /// the special SGPRs for VCC, Flat Scratch Base, Flat Scratch Size
+ /// and XNACK (for GFX8 (VI)). It does not include the 16 SGPRs added if a
+ /// trap handler is enabled. Used to set COMPUTE_PGM_RSRC1.SGPRS.
+ uint16_t wavefront_sgpr_count;
+
+ /// Number of vector registers used by each work-item. Used to set
+ /// COMPUTE_PGM_RSRC1.VGPRS.
+ uint16_t workitem_vgpr_count;
+
+ /// If reserved_vgpr_count is 0 then must be 0. Otherwise, this is the
+ /// first fixed VGPR number reserved.
+ uint16_t reserved_vgpr_first;
+
+ /// The number of consecutive VGPRs reserved by the client. If
+ /// is_debug_supported then this count includes VGPRs reserved
+ /// for debugger use.
+ uint16_t reserved_vgpr_count;
+
+ /// If reserved_sgpr_count is 0 then must be 0. Otherwise, this is the
+ /// first fixed SGPR number reserved.
+ uint16_t reserved_sgpr_first;
+
+ /// The number of consecutive SGPRs reserved by the client. If
+ /// is_debug_supported then this count includes SGPRs reserved
+ /// for debugger use.
+ uint16_t reserved_sgpr_count;
+
+ /// If is_debug_supported is 0 then must be 0. Otherwise, this is the
+ /// fixed SGPR number used to hold the wave scratch offset for the
+ /// entire kernel execution, or uint16_t(-1) if the register is not
+ /// used or not known.
+ uint16_t debug_wavefront_private_segment_offset_sgpr;
+
+ /// If is_debug_supported is 0 then must be 0. Otherwise, this is the
+ /// fixed SGPR number of the first of 4 SGPRs used to hold the
+ /// scratch V# used for the entire kernel execution, or uint16_t(-1)
+ /// if the registers are not used or not known.
+ uint16_t debug_private_segment_buffer_sgpr;
+
+ /// The maximum byte alignment of variables used by the kernel in
+ /// the specified memory segment. Expressed as a power of two. Must
+ /// be at least HSA_POWERTWO_16.
+ hsa_powertwo8_t kernarg_segment_alignment;
+ hsa_powertwo8_t group_segment_alignment;
+ hsa_powertwo8_t private_segment_alignment;
+
+ uint8_t reserved3;
+
+ /// Type of code object.
+ hsa_ext_code_kind32_t code_type;
+
+ /// Reserved for code properties if any are defined in the future.
+ /// There are currently no code properties so this field must be 0.
+ uint32_t reserved4;
+
+ /// Wavefront size expressed as a power of two. Must be a power of 2
+ /// in range 1..64 inclusive. Used to support runtime query that
+ /// obtains wavefront size, which may be used by application to
+ /// allocate dynamic group memory and set the dispatch work-group
+ /// size.
+ hsa_powertwo8_t wavefront_size;
+
+ /// The optimization level specified when the kernel was
+ /// finalized.
+ uint8_t optimization_level;
+
+ /// The HSAIL profile defines which features are used. This
+ /// information is from the HSAIL version directive. If this
+ /// amd_kernel_code_t is not generated from an HSAIL compilation
+ /// unit then must be 0.
+ hsa_ext_brig_profile8_t hsail_profile;
+
+ /// The HSAIL machine model gives the address sizes used by the
+ /// code. This information is from the HSAIL version directive. If
+ /// not generated from an HSAIL compilation unit then must still
+ /// indicate for what machine mode the code is generated.
+ hsa_ext_brig_machine_model8_t hsail_machine_model;
+
+ /// The HSAIL major version. This information is from the HSAIL
+ /// version directive. If this amd_kernel_code_t is not
+ /// generated from an HSAIL compilation unit then must be 0.
+ uint32_t hsail_version_major;
+
+ /// The HSAIL minor version. This information is from the HSAIL
+ /// version directive. If this amd_kernel_code_t is not
+ /// generated from an HSAIL compilation unit then must be 0.
+ uint32_t hsail_version_minor;
+
+ /// Reserved for HSAIL target options if any are defined in the
+ /// future. There are currently no target options so this field
+ /// must be 0.
+ uint16_t reserved5;
+
+ /// Reserved. Must be 0.
+ uint16_t reserved6;
+
+ /// The values should be the actual values used by the finalizer
+ /// in generating the code. This may be the union of values
+ /// specified as finalizer arguments and explicit HSAIL control
+ /// directives. If the finalizer chooses to ignore a control
+ /// directive, and not generate constrained code, then the control
+ /// directive should not be marked as enabled even though it was
+ /// present in the HSAIL or finalizer argument. The values are
+ /// intended to reflect the constraints that the code actually
+ /// requires to correctly execute, not the values that were
+ /// actually specified at finalize time.
+ hsa_ext_control_directives_t control_directive;
+
+ /// The code can immediately follow the amd_kernel_code_t, or can
+ /// come after subsequent amd_kernel_code_t structs when there are
+ /// multiple kernels in the compilation unit.
+
+} amd_kernel_code_t;
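+
+// Illustrative sketch, not part of the original header: given a pointer to an
+// amd_kernel_code_t object, the comment on kernel_code_entry_byte_offset above
+// implies the kernel entry point can be located as
+//
+//   const amd_kernel_code_t *kc = ...;  /* object produced by the finalizer */
+//   const char *entry =
+//       (const char *)kc + kc->kernel_code_entry_byte_offset;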
+
+#endif // AMDKERNELCODET_H
diff --git a/lib/Target/R600/AsmParser/AMDGPUAsmParser.cpp b/lib/Target/R600/AsmParser/AMDGPUAsmParser.cpp
new file mode 100644
index 000000000000..3b4ba1a8e8e9
--- /dev/null
+++ b/lib/Target/R600/AsmParser/AMDGPUAsmParser.cpp
@@ -0,0 +1,320 @@
+//===-- AMDGPUAsmParser.cpp - Parse SI asm to MCInst instructions ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCParser/MCAsmLexer.h"
+#include "llvm/MC/MCParser/MCAsmParser.h"
+#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCTargetAsmParser.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+namespace {
+
+class AMDGPUAsmParser : public MCTargetAsmParser {
+ MCSubtargetInfo &STI;
+ MCAsmParser &Parser;
+
+
+ /// @name Auto-generated Match Functions
+ /// {
+
+#define GET_ASSEMBLER_HEADER
+#include "AMDGPUGenAsmMatcher.inc"
+
+ /// }
+
+public:
+ AMDGPUAsmParser(MCSubtargetInfo &_STI, MCAsmParser &_Parser,
+ const MCInstrInfo &_MII,
+ const MCTargetOptions &Options)
+ : MCTargetAsmParser(), STI(_STI), Parser(_Parser) {
+ setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
+ }
+ bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override;
+ bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+ OperandVector &Operands, MCStreamer &Out,
+ uint64_t &ErrorInfo,
+ bool MatchingInlineAsm) override;
+ bool ParseDirective(AsmToken DirectiveID) override;
+ OperandMatchResultTy parseOperand(OperandVector &Operands, StringRef Mnemonic);
+ bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
+ SMLoc NameLoc, OperandVector &Operands) override;
+
+ bool parseCnt(int64_t &IntVal);
+ OperandMatchResultTy parseSWaitCntOps(OperandVector &Operands);
+};
+
+class AMDGPUOperand : public MCParsedAsmOperand {
+ enum KindTy {
+ Token,
+ Immediate
+ } Kind;
+
+public:
+ AMDGPUOperand(enum KindTy K) : MCParsedAsmOperand(), Kind(K) {}
+
+ struct TokOp {
+ const char *Data;
+ unsigned Length;
+ };
+
+ struct ImmOp {
+ int64_t Val;
+ };
+
+ union {
+ TokOp Tok;
+ ImmOp Imm;
+ };
+
+ void addImmOperands(MCInst &Inst, unsigned N) const {
+ Inst.addOperand(MCOperand::CreateImm(getImm()));
+ }
+ void addRegOperands(MCInst &Inst, unsigned N) const {
+ llvm_unreachable("addRegOperands");
+ }
+ StringRef getToken() const {
+ return StringRef(Tok.Data, Tok.Length);
+ }
+ bool isToken() const override {
+ return Kind == Token;
+ }
+
+ bool isImm() const override {
+ return Kind == Immediate;
+ }
+
+ int64_t getImm() const {
+ return Imm.Val;
+ }
+
+ bool isReg() const override {
+ return false;
+ }
+
+ unsigned getReg() const override {
+ return 0;
+ }
+
+ bool isMem() const override {
+ return false;
+ }
+
+ SMLoc getStartLoc() const override {
+ return SMLoc();
+ }
+
+ SMLoc getEndLoc() const override {
+ return SMLoc();
+ }
+
+ void print(raw_ostream &OS) const override { }
+
+ static std::unique_ptr<AMDGPUOperand> CreateImm(int64_t Val) {
+ auto Op = llvm::make_unique<AMDGPUOperand>(Immediate);
+ Op->Imm.Val = Val;
+ return Op;
+ }
+
+ static std::unique_ptr<AMDGPUOperand> CreateToken(StringRef Str, SMLoc Loc) {
+ auto Res = llvm::make_unique<AMDGPUOperand>(Token);
+ Res->Tok.Data = Str.data();
+ Res->Tok.Length = Str.size();
+ return Res;
+ }
+
+ bool isSWaitCnt() const;
+};
+
+}
+
+bool AMDGPUAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) {
+ return true;
+}
+
+
+bool AMDGPUAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+ OperandVector &Operands,
+ MCStreamer &Out,
+ uint64_t &ErrorInfo,
+ bool MatchingInlineAsm) {
+ MCInst Inst;
+
+ switch (MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm)) {
+ case Match_Success:
+ Inst.setLoc(IDLoc);
+ Out.EmitInstruction(Inst, STI);
+ return false;
+ case Match_MissingFeature:
+ return Error(IDLoc, "instruction use requires an option to be enabled");
+ case Match_MnemonicFail:
+ return Error(IDLoc, "unrecognized instruction mnemonic");
+ case Match_InvalidOperand: {
+ if (ErrorInfo != ~0ULL) {
+ if (ErrorInfo >= Operands.size())
+ return Error(IDLoc, "too few operands for instruction");
+
+ }
+ return Error(IDLoc, "invalid operand for instruction");
+ }
+ }
+ llvm_unreachable("Implement any new match types added!");
+}
+
+bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) {
+ return true;
+}
+
+AMDGPUAsmParser::OperandMatchResultTy
+AMDGPUAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) {
+
+ // Try to parse with a custom parser
+ OperandMatchResultTy ResTy = MatchOperandParserImpl(Operands, Mnemonic);
+
+ // If we successfully parsed the operand or if there was an error parsing,
+ // we are done.
+ if (ResTy == MatchOperand_Success || ResTy == MatchOperand_ParseFail)
+ return ResTy;
+
+ switch(getLexer().getKind()) {
+ case AsmToken::Integer: {
+ int64_t IntVal;
+ if (getParser().parseAbsoluteExpression(IntVal))
+ return MatchOperand_ParseFail;
+ Operands.push_back(AMDGPUOperand::CreateImm(IntVal));
+ return MatchOperand_Success;
+ }
+ default:
+ return MatchOperand_NoMatch;
+ }
+}
+
+bool AMDGPUAsmParser::ParseInstruction(ParseInstructionInfo &Info,
+ StringRef Name,
+ SMLoc NameLoc, OperandVector &Operands) {
+ // Add the instruction mnemonic
+ Operands.push_back(AMDGPUOperand::CreateToken(Name, NameLoc));
+
+ if (getLexer().is(AsmToken::EndOfStatement))
+ return false;
+
+ AMDGPUAsmParser::OperandMatchResultTy Res = parseOperand(Operands, Name);
+ switch (Res) {
+ case MatchOperand_Success: return false;
+ case MatchOperand_ParseFail: return Error(NameLoc,
+ "Failed parsing operand");
+ case MatchOperand_NoMatch: return Error(NameLoc, "Not a valid operand");
+ }
+ return true;
+}
+
+//===----------------------------------------------------------------------===//
+// s_waitcnt
+//===----------------------------------------------------------------------===//
+
+bool AMDGPUAsmParser::parseCnt(int64_t &IntVal) {
+ StringRef CntName = Parser.getTok().getString();
+ int64_t CntVal;
+
+ Parser.Lex();
+ if (getLexer().isNot(AsmToken::LParen))
+ return true;
+
+ Parser.Lex();
+ if (getLexer().isNot(AsmToken::Integer))
+ return true;
+
+ if (getParser().parseAbsoluteExpression(CntVal))
+ return true;
+
+ if (getLexer().isNot(AsmToken::RParen))
+ return true;
+
+ Parser.Lex();
+ if (getLexer().is(AsmToken::Amp) || getLexer().is(AsmToken::Comma))
+ Parser.Lex();
+
+ int CntShift;
+ int CntMask;
+
+ if (CntName == "vmcnt") {
+ CntMask = 0xf;
+ CntShift = 0;
+ } else if (CntName == "expcnt") {
+ CntMask = 0x7;
+ CntShift = 4;
+ } else if (CntName == "lgkmcnt") {
+ CntMask = 0x7;
+ CntShift = 8;
+ } else {
+ return true;
+ }
+
+ IntVal &= ~(CntMask << CntShift);
+ IntVal |= (CntVal << CntShift);
+ return false;
+}
+
+AMDGPUAsmParser::OperandMatchResultTy
+AMDGPUAsmParser::parseSWaitCntOps(OperandVector &Operands) {
+ // Disable all counters by default.
+ // vmcnt [3:0]
+ // expcnt [6:4]
+ // lgkmcnt [10:8]
+ int64_t CntVal = 0x77f;
+
+ switch(getLexer().getKind()) {
+ default: return MatchOperand_ParseFail;
+ case AsmToken::Integer:
+ // The operand can be an integer value.
+ if (getParser().parseAbsoluteExpression(CntVal))
+ return MatchOperand_ParseFail;
+ break;
+
+ case AsmToken::Identifier:
+ do {
+ if (parseCnt(CntVal))
+ return MatchOperand_ParseFail;
+ } while(getLexer().isNot(AsmToken::EndOfStatement));
+ break;
+ }
+ Operands.push_back(AMDGPUOperand::CreateImm(CntVal));
+ return MatchOperand_Success;
+}
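+
+// Illustrative example, not part of the original file: with the field layout
+// used by parseSWaitCntOps() and parseCnt() above, "s_waitcnt vmcnt(0) &
+// lgkmcnt(0)" starts from the all-counters-disabled value 0x77f, clears the
+// vmcnt bits [3:0] and the lgkmcnt bits [10:8], and encodes as the immediate
+// 0x70 (expcnt is left at 7).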
+
+bool AMDGPUOperand::isSWaitCnt() const {
+ return isImm();
+}
+
+/// Force static initialization.
+extern "C" void LLVMInitializeR600AsmParser() {
+ RegisterMCAsmParser<AMDGPUAsmParser> A(TheAMDGPUTarget);
+ RegisterMCAsmParser<AMDGPUAsmParser> B(TheGCNTarget);
+}
+
+#define GET_REGISTER_MATCHER
+#define GET_MATCHER_IMPLEMENTATION
+#include "AMDGPUGenAsmMatcher.inc"
+
diff --git a/lib/Target/R600/AsmParser/CMakeLists.txt b/lib/Target/R600/AsmParser/CMakeLists.txt
new file mode 100644
index 000000000000..1b42af73740e
--- /dev/null
+++ b/lib/Target/R600/AsmParser/CMakeLists.txt
@@ -0,0 +1,3 @@
+add_llvm_library(LLVMR600AsmParser
+ AMDGPUAsmParser.cpp
+ )
diff --git a/lib/Target/R600/AsmParser/LLVMBuild.txt b/lib/Target/R600/AsmParser/LLVMBuild.txt
new file mode 100644
index 000000000000..940e4cee6dfd
--- /dev/null
+++ b/lib/Target/R600/AsmParser/LLVMBuild.txt
@@ -0,0 +1,23 @@
+;===- ./lib/Target/R600/AsmParser/LLVMBuild.txt -------------*- Conf -*--===;
+;
+; The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+; http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Library
+name = R600AsmParser
+parent = R600
+required_libraries = MC MCParser R600Desc R600Info Support
+add_to_library_groups = R600
diff --git a/lib/Target/R600/AsmParser/Makefile b/lib/Target/R600/AsmParser/Makefile
new file mode 100644
index 000000000000..e6689b54b6ba
--- /dev/null
+++ b/lib/Target/R600/AsmParser/Makefile
@@ -0,0 +1,15 @@
+##===- lib/Target/R600/AsmParser/Makefile ----------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+LEVEL = ../../../..
+LIBRARYNAME = LLVMR600AsmParser
+
+# Hack: we need to include 'main' R600 target directory to grab private headers
+CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/R600/CIInstructions.td b/lib/Target/R600/CIInstructions.td
new file mode 100644
index 000000000000..fdb58bbebd54
--- /dev/null
+++ b/lib/Target/R600/CIInstructions.td
@@ -0,0 +1,42 @@
+//===-- CIInstructions.td - CI Instruction Definitions --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// Instruction definitions for CI and newer.
+//===----------------------------------------------------------------------===//
+
+
+def isCIVI : Predicate <
+ "Subtarget.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS || "
+ "Subtarget.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS"
+>;
+
+//===----------------------------------------------------------------------===//
+// VOP1 Instructions
+//===----------------------------------------------------------------------===//
+
+let SubtargetPredicate = isCIVI in {
+
+defm V_TRUNC_F64 : VOP1Inst <vop1<0x17>, "v_trunc_f64",
+ VOP_F64_F64, ftrunc
+>;
+defm V_CEIL_F64 : VOP1Inst <vop1<0x18>, "v_ceil_f64",
+ VOP_F64_F64, fceil
+>;
+defm V_FLOOR_F64 : VOP1Inst <vop1<0x1A>, "v_floor_f64",
+ VOP_F64_F64, ffloor
+>;
+defm V_RNDNE_F64 : VOP1Inst <vop1<0x19>, "v_rndne_f64",
+ VOP_F64_F64, frint
+>;
+defm V_LOG_LEGACY_F32 : VOP1Inst <vop1<0x45, 0x4c>, "v_log_legacy_f32",
+ VOP_F32_F32
+>;
+defm V_EXP_LEGACY_F32 : VOP1Inst <vop1<0x46, 0x4b>, "v_exp_legacy_f32",
+ VOP_F32_F32
+>;
+} // End SubtargetPredicate = isCIVI
diff --git a/lib/Target/R600/CMakeLists.txt b/lib/Target/R600/CMakeLists.txt
index 49a7f8aa18c8..5a4bae2f93cc 100644
--- a/lib/Target/R600/CMakeLists.txt
+++ b/lib/Target/R600/CMakeLists.txt
@@ -6,13 +6,15 @@ tablegen(LLVM AMDGPUGenDAGISel.inc -gen-dag-isel)
tablegen(LLVM AMDGPUGenCallingConv.inc -gen-callingconv)
tablegen(LLVM AMDGPUGenSubtargetInfo.inc -gen-subtarget)
tablegen(LLVM AMDGPUGenIntrinsics.inc -gen-tgt-intrinsic)
-tablegen(LLVM AMDGPUGenMCCodeEmitter.inc -gen-emitter -mc-emitter)
+tablegen(LLVM AMDGPUGenMCCodeEmitter.inc -gen-emitter)
tablegen(LLVM AMDGPUGenDFAPacketizer.inc -gen-dfa-packetizer)
tablegen(LLVM AMDGPUGenAsmWriter.inc -gen-asm-writer)
+tablegen(LLVM AMDGPUGenAsmMatcher.inc -gen-asm-matcher)
add_public_tablegen_target(AMDGPUCommonTableGen)
add_llvm_target(R600CodeGen
AMDILCFGStructurizer.cpp
+ AMDGPUAlwaysInlinePass.cpp
AMDGPUAsmPrinter.cpp
AMDGPUFrameLowering.cpp
AMDGPUIntrinsicInfo.cpp
@@ -41,17 +43,21 @@ add_llvm_target(R600CodeGen
SIAnnotateControlFlow.cpp
SIFixSGPRCopies.cpp
SIFixSGPRLiveRanges.cpp
+ SIFoldOperands.cpp
SIInsertWaits.cpp
SIInstrInfo.cpp
SIISelLowering.cpp
+ SILoadStoreOptimizer.cpp
SILowerControlFlow.cpp
SILowerI1Copies.cpp
SIMachineFunctionInfo.cpp
+ SIPrepareScratchRegs.cpp
SIRegisterInfo.cpp
SIShrinkInstructions.cpp
SITypeRewriter.cpp
)
+add_subdirectory(AsmParser)
add_subdirectory(InstPrinter)
add_subdirectory(TargetInfo)
add_subdirectory(MCTargetDesc)
diff --git a/lib/Target/R600/CaymanInstructions.td b/lib/Target/R600/CaymanInstructions.td
index 26303452c101..58b5ce24b4a3 100644
--- a/lib/Target/R600/CaymanInstructions.td
+++ b/lib/Target/R600/CaymanInstructions.td
@@ -46,6 +46,8 @@ def SIN_cm : SIN_Common<0x8D>;
def COS_cm : COS_Common<0x8E>;
} // End isVector = 1
+defm : RsqPat<RECIPSQRT_IEEE_cm, f32>;
+
def : POW_Common <LOG_IEEE_cm, EXP_IEEE_cm, MUL>;
defm DIV_cm : DIV_Common<RECIP_IEEE_cm>;
diff --git a/lib/Target/R600/EvergreenInstructions.td b/lib/Target/R600/EvergreenInstructions.td
index 484e52250d1b..f24f76b7fe16 100644
--- a/lib/Target/R600/EvergreenInstructions.td
+++ b/lib/Target/R600/EvergreenInstructions.td
@@ -69,6 +69,7 @@ def EXP_IEEE_eg : EXP_IEEE_Common<0x81>;
def LOG_IEEE_eg : LOG_IEEE_Common<0x83>;
def RECIP_CLAMPED_eg : RECIP_CLAMPED_Common<0x84>;
def RECIPSQRT_IEEE_eg : RECIPSQRT_IEEE_Common<0x89>;
+defm : RsqPat<RECIPSQRT_IEEE_eg, f32>;
def SIN_eg : SIN_Common<0x8D>;
def COS_eg : COS_Common<0x8E>;
@@ -256,6 +257,12 @@ def VTX_READ_GLOBAL_128_eg : VTX_READ_128_eg <1,
let Predicates = [isEGorCayman] in {
+// Should be predicated on FeatureFP64
+// def FMA_64 : R600_3OP <
+// 0xA, "FMA_64",
+// [(set f64:$dst, (fma f64:$src0, f64:$src1, f64:$src2))]
+// >;
+
// BFE_UINT - bit_extract, an optimization for mask and shift
// Src0 = Input
// Src1 = Offset
@@ -295,7 +302,7 @@ def : Pat<(i32 (sext_inreg i32:$src, i8)),
def : Pat<(i32 (sext_inreg i32:$src, i16)),
(BFE_INT_eg i32:$src, (i32 ZERO), (MOV_IMM_I32 16))>;
-defm : BFIPatterns <BFI_INT_eg, MOV_IMM_I32>;
+defm : BFIPatterns <BFI_INT_eg, MOV_IMM_I32, R600_Reg64>;
def BFM_INT_eg : R600_2OP <0xA0, "BFM_INT",
[(set i32:$dst, (AMDGPUbfm i32:$src0, i32:$src1))],
@@ -312,6 +319,7 @@ def BIT_ALIGN_INT_eg : R600_3OP <0xC, "BIT_ALIGN_INT", [], VecALU>;
def : ROTRPattern <BIT_ALIGN_INT_eg>;
def MULADD_eg : MULADD_Common<0x14>;
def MULADD_IEEE_eg : MULADD_IEEE_Common<0x18>;
+def FMA_eg : FMA_Common<0x7>;
def ASHR_eg : ASHR_Common<0x15>;
def LSHR_eg : LSHR_Common<0x16>;
def LSHL_eg : LSHL_Common<0x17>;
@@ -466,21 +474,47 @@ class R600_LDS_1A1D_RET <bits<6> lds_op, string name, list<dag> pattern> :
let DisableEncoding = "$dst";
}
-class R600_LDS_1A2D <bits<6> lds_op, string name, list<dag> pattern> :
+class R600_LDS_1A2D <bits<6> lds_op, dag outs, string name, list<dag> pattern,
+ string dst =""> :
R600_LDS <
- lds_op,
- (outs),
+ lds_op, outs,
(ins R600_Reg32:$src0, REL:$src0_rel, SEL:$src0_sel,
R600_Reg32:$src1, REL:$src1_rel, SEL:$src1_sel,
R600_Reg32:$src2, REL:$src2_rel, SEL:$src2_sel,
LAST:$last, R600_Pred:$pred_sel, BANK_SWIZZLE:$bank_swizzle),
- " "#name# "$last $src0$src0_rel, $src1$src1_rel, $src2$src2_rel, $pred_sel",
+ " "#name# "$last "#dst#"$src0$src0_rel, $src1$src1_rel, $src2$src2_rel, $pred_sel",
pattern> {
+
+ field string BaseOp;
+
+ let LDS_1A1D = 0;
let LDS_1A2D = 1;
}
+class R600_LDS_1A2D_NORET <bits<6> lds_op, string name, list<dag> pattern> :
+ R600_LDS_1A2D <lds_op, (outs), name, pattern> {
+ let BaseOp = name;
+}
+
+class R600_LDS_1A2D_RET <bits<6> lds_op, string name, list<dag> pattern> :
+ R600_LDS_1A2D <lds_op, (outs R600_Reg32:$dst), name, pattern> {
+
+ let BaseOp = name;
+ let usesCustomInserter = 1;
+ let DisableEncoding = "$dst";
+}
+
def LDS_ADD : R600_LDS_1A1D_NORET <0x0, "LDS_ADD", [] >;
def LDS_SUB : R600_LDS_1A1D_NORET <0x1, "LDS_SUB", [] >;
+def LDS_AND : R600_LDS_1A1D_NORET <0x9, "LDS_AND", [] >;
+def LDS_OR : R600_LDS_1A1D_NORET <0xa, "LDS_OR", [] >;
+def LDS_XOR : R600_LDS_1A1D_NORET <0xb, "LDS_XOR", [] >;
+def LDS_WRXCHG: R600_LDS_1A1D_NORET <0xd, "LDS_WRXCHG", [] >;
+def LDS_CMPST: R600_LDS_1A2D_NORET <0x10, "LDS_CMPST", [] >;
+def LDS_MIN_INT : R600_LDS_1A1D_NORET <0x5, "LDS_MIN_INT", [] >;
+def LDS_MAX_INT : R600_LDS_1A1D_NORET <0x6, "LDS_MAX_INT", [] >;
+def LDS_MIN_UINT : R600_LDS_1A1D_NORET <0x7, "LDS_MIN_UINT", [] >;
+def LDS_MAX_UINT : R600_LDS_1A1D_NORET <0x8, "LDS_MAX_UINT", [] >;
def LDS_WRITE : R600_LDS_1A1D_NORET <0xD, "LDS_WRITE",
[(local_store (i32 R600_Reg32:$src1), R600_Reg32:$src0)]
>;
@@ -496,6 +530,33 @@ def LDS_ADD_RET : R600_LDS_1A1D_RET <0x20, "LDS_ADD",
def LDS_SUB_RET : R600_LDS_1A1D_RET <0x21, "LDS_SUB",
[(set i32:$dst, (atomic_load_sub_local i32:$src0, i32:$src1))]
>;
+def LDS_AND_RET : R600_LDS_1A1D_RET <0x29, "LDS_AND",
+ [(set i32:$dst, (atomic_load_and_local i32:$src0, i32:$src1))]
+>;
+def LDS_OR_RET : R600_LDS_1A1D_RET <0x2a, "LDS_OR",
+ [(set i32:$dst, (atomic_load_or_local i32:$src0, i32:$src1))]
+>;
+def LDS_XOR_RET : R600_LDS_1A1D_RET <0x2b, "LDS_XOR",
+ [(set i32:$dst, (atomic_load_xor_local i32:$src0, i32:$src1))]
+>;
+def LDS_MIN_INT_RET : R600_LDS_1A1D_RET <0x25, "LDS_MIN_INT",
+ [(set i32:$dst, (atomic_load_min_local i32:$src0, i32:$src1))]
+>;
+def LDS_MAX_INT_RET : R600_LDS_1A1D_RET <0x26, "LDS_MAX_INT",
+ [(set i32:$dst, (atomic_load_max_local i32:$src0, i32:$src1))]
+>;
+def LDS_MIN_UINT_RET : R600_LDS_1A1D_RET <0x27, "LDS_MIN_UINT",
+ [(set i32:$dst, (atomic_load_umin_local i32:$src0, i32:$src1))]
+>;
+def LDS_MAX_UINT_RET : R600_LDS_1A1D_RET <0x28, "LDS_MAX_UINT",
+ [(set i32:$dst, (atomic_load_umax_local i32:$src0, i32:$src1))]
+>;
+def LDS_WRXCHG_RET : R600_LDS_1A1D_RET <0x2d, "LDS_WRXCHG",
+ [(set i32:$dst, (atomic_swap_local i32:$src0, i32:$src1))]
+>;
+def LDS_CMPST_RET : R600_LDS_1A2D_RET <0x30, "LDS_CMPST",
+ [(set i32:$dst, (atomic_cmp_swap_32_local i32:$src0, i32:$src1, i32:$src2))]
+>;
def LDS_READ_RET : R600_LDS_1A <0x32, "LDS_READ_RET",
[(set (i32 R600_Reg32:$dst), (local_load R600_Reg32:$src0))]
>;
@@ -529,7 +590,7 @@ def : Pat<(fp_to_uint f32:$src0), (FLT_TO_UINT_eg (TRUNC $src0))>;
// SHA-256 Patterns
def : SHA256MaPattern <BFI_INT_eg, XOR_INT>;
-def : FROUNDPat <CNDGE_eg>;
+def : FROUNDPat <CNDGE_eg, CNDGT_eg>;
def EG_ExportSwz : ExportSwzInst {
let Word1{19-16} = 0; // BURST_COUNT
diff --git a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp
index 0927040cb5bc..8271c6f45fb9 100644
--- a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp
+++ b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp
@@ -10,8 +10,10 @@
#include "AMDGPUInstPrinter.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIDefines.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Support/MathExtras.h"
@@ -40,6 +42,81 @@ void AMDGPUInstPrinter::printU32ImmOperand(const MCInst *MI, unsigned OpNo,
O << formatHex(MI->getOperand(OpNo).getImm() & 0xffffffff);
}
+void AMDGPUInstPrinter::printU8ImmDecOperand(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ O << formatDec(MI->getOperand(OpNo).getImm() & 0xff);
+}
+
+void AMDGPUInstPrinter::printU16ImmDecOperand(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ O << formatDec(MI->getOperand(OpNo).getImm() & 0xffff);
+}
+
+void AMDGPUInstPrinter::printOffen(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ if (MI->getOperand(OpNo).getImm())
+ O << " offen";
+}
+
+void AMDGPUInstPrinter::printIdxen(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ if (MI->getOperand(OpNo).getImm())
+ O << " idxen";
+}
+
+void AMDGPUInstPrinter::printAddr64(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ if (MI->getOperand(OpNo).getImm())
+ O << " addr64";
+}
+
+void AMDGPUInstPrinter::printMBUFOffset(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ if (MI->getOperand(OpNo).getImm()) {
+ O << " offset:";
+ printU16ImmDecOperand(MI, OpNo, O);
+ }
+}
+
+void AMDGPUInstPrinter::printDSOffset(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ uint16_t Imm = MI->getOperand(OpNo).getImm();
+ if (Imm != 0) {
+ O << " offset:";
+ printU16ImmDecOperand(MI, OpNo, O);
+ }
+}
+
+void AMDGPUInstPrinter::printDSOffset0(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ O << " offset0:";
+ printU8ImmDecOperand(MI, OpNo, O);
+}
+
+void AMDGPUInstPrinter::printDSOffset1(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ O << " offset1:";
+ printU8ImmDecOperand(MI, OpNo, O);
+}
+
+void AMDGPUInstPrinter::printGLC(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ if (MI->getOperand(OpNo).getImm())
+ O << " glc";
+}
+
+void AMDGPUInstPrinter::printSLC(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ if (MI->getOperand(OpNo).getImm())
+ O << " slc";
+}
+
+void AMDGPUInstPrinter::printTFE(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ if (MI->getOperand(OpNo).getImm())
+ O << " tfe";
+}
+
void AMDGPUInstPrinter::printRegOperand(unsigned reg, raw_ostream &O) {
switch (reg) {
case AMDGPU::VCC:
@@ -54,6 +131,27 @@ void AMDGPUInstPrinter::printRegOperand(unsigned reg, raw_ostream &O) {
case AMDGPU::M0:
O << "m0";
return;
+ case AMDGPU::FLAT_SCR:
+ O << "flat_scratch";
+ return;
+ case AMDGPU::VCC_LO:
+ O << "vcc_lo";
+ return;
+ case AMDGPU::VCC_HI:
+ O << "vcc_hi";
+ return;
+ case AMDGPU::EXEC_LO:
+ O << "exec_lo";
+ return;
+ case AMDGPU::EXEC_HI:
+ O << "exec_hi";
+ return;
+ case AMDGPU::FLAT_SCR_LO:
+ O << "flat_scratch_lo";
+ return;
+ case AMDGPU::FLAT_SCR_HI:
+ O << "flat_scratch_hi";
+ return;
default:
break;
}
@@ -110,26 +208,62 @@ void AMDGPUInstPrinter::printRegOperand(unsigned reg, raw_ostream &O) {
O << Type << '[' << RegIdx << ':' << (RegIdx + NumRegs - 1) << ']';
}
-void AMDGPUInstPrinter::printImmediate(uint32_t Imm, raw_ostream &O) {
+void AMDGPUInstPrinter::printImmediate32(uint32_t Imm, raw_ostream &O) {
int32_t SImm = static_cast<int32_t>(Imm);
if (SImm >= -16 && SImm <= 64) {
O << SImm;
return;
}
- if (Imm == FloatToBits(1.0f) ||
- Imm == FloatToBits(-1.0f) ||
- Imm == FloatToBits(0.5f) ||
- Imm == FloatToBits(-0.5f) ||
- Imm == FloatToBits(2.0f) ||
- Imm == FloatToBits(-2.0f) ||
- Imm == FloatToBits(4.0f) ||
- Imm == FloatToBits(-4.0f)) {
- O << BitsToFloat(Imm);
+ if (Imm == FloatToBits(0.0f))
+ O << "0.0";
+ else if (Imm == FloatToBits(1.0f))
+ O << "1.0";
+ else if (Imm == FloatToBits(-1.0f))
+ O << "-1.0";
+ else if (Imm == FloatToBits(0.5f))
+ O << "0.5";
+ else if (Imm == FloatToBits(-0.5f))
+ O << "-0.5";
+ else if (Imm == FloatToBits(2.0f))
+ O << "2.0";
+ else if (Imm == FloatToBits(-2.0f))
+ O << "-2.0";
+ else if (Imm == FloatToBits(4.0f))
+ O << "4.0";
+ else if (Imm == FloatToBits(-4.0f))
+ O << "-4.0";
+ else
+ O << formatHex(static_cast<uint64_t>(Imm));
+}
+
+void AMDGPUInstPrinter::printImmediate64(uint64_t Imm, raw_ostream &O) {
+ int64_t SImm = static_cast<int64_t>(Imm);
+ if (SImm >= -16 && SImm <= 64) {
+ O << SImm;
return;
}
- O << formatHex(static_cast<uint64_t>(Imm));
+ if (Imm == DoubleToBits(0.0))
+ O << "0.0";
+ else if (Imm == DoubleToBits(1.0))
+ O << "1.0";
+ else if (Imm == DoubleToBits(-1.0))
+ O << "-1.0";
+ else if (Imm == DoubleToBits(0.5))
+ O << "0.5";
+ else if (Imm == DoubleToBits(-0.5))
+ O << "-0.5";
+ else if (Imm == DoubleToBits(2.0))
+ O << "2.0";
+ else if (Imm == DoubleToBits(-2.0))
+ O << "-2.0";
+ else if (Imm == DoubleToBits(4.0))
+ O << "4.0";
+ else if (Imm == DoubleToBits(-4.0))
+ O << "-4.0";
+ else
+ llvm_unreachable("64-bit literal constants not supported");
}
void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
@@ -147,27 +281,55 @@ void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
break;
}
} else if (Op.isImm()) {
- printImmediate(Op.getImm(), O);
+ const MCInstrDesc &Desc = MII.get(MI->getOpcode());
+ int RCID = Desc.OpInfo[OpNo].RegClass;
+ if (RCID != -1) {
+ const MCRegisterClass &ImmRC = MRI.getRegClass(RCID);
+ if (ImmRC.getSize() == 4)
+ printImmediate32(Op.getImm(), O);
+ else if (ImmRC.getSize() == 8)
+ printImmediate64(Op.getImm(), O);
+ else
+ llvm_unreachable("Invalid register class size");
+ } else {
+ // We hit this for the immediate instruction bits that don't yet have a
+ // custom printer.
+ // TODO: Eventually this should be unnecessary.
+ O << formatDec(Op.getImm());
+ }
} else if (Op.isFPImm()) {
- O << Op.getFPImm();
+ // We special case 0.0 because otherwise it will be printed as an integer.
+ if (Op.getFPImm() == 0.0)
+ O << "0.0";
+ else {
+ const MCInstrDesc &Desc = MII.get(MI->getOpcode());
+ const MCRegisterClass &ImmRC = MRI.getRegClass(Desc.OpInfo[OpNo].RegClass);
+
+ if (ImmRC.getSize() == 4)
+ printImmediate32(FloatToBits(Op.getFPImm()), O);
+ else if (ImmRC.getSize() == 8)
+ printImmediate64(DoubleToBits(Op.getFPImm()), O);
+ else
+ llvm_unreachable("Invalid register class size");
+ }
} else if (Op.isExpr()) {
const MCExpr *Exp = Op.getExpr();
Exp->print(O);
} else {
- assert(!"unknown operand type in printOperand");
+ llvm_unreachable("unknown operand type in printOperand");
}
}
void AMDGPUInstPrinter::printOperandAndMods(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
unsigned InputModifiers = MI->getOperand(OpNo).getImm();
- if (InputModifiers & 0x1)
- O << "-";
- if (InputModifiers & 0x2)
- O << "|";
+ if (InputModifiers & SISrcMods::NEG)
+ O << '-';
+ if (InputModifiers & SISrcMods::ABS)
+ O << '|';
printOperand(MI, OpNo + 1, O);
- if (InputModifiers & 0x2)
- O << "|";
+ if (InputModifiers & SISrcMods::ABS)
+ O << '|';
}
void AMDGPUInstPrinter::printInterpSlot(const MCInst *MI, unsigned OpNum,
@@ -181,7 +343,7 @@ void AMDGPUInstPrinter::printInterpSlot(const MCInst *MI, unsigned OpNum,
} else if (Imm == 0) {
O << "P10";
} else {
- assert(!"Invalid interpolation parameter slot");
+ llvm_unreachable("Invalid interpolation parameter slot");
}
}
@@ -214,6 +376,23 @@ void AMDGPUInstPrinter::printClamp(const MCInst *MI, unsigned OpNo,
printIfSet(MI, OpNo, O, "_SAT");
}
+void AMDGPUInstPrinter::printClampSI(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ if (MI->getOperand(OpNo).getImm())
+ O << " clamp";
+}
+
+void AMDGPUInstPrinter::printOModSI(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ int Imm = MI->getOperand(OpNo).getImm();
+ if (Imm == SIOutMods::MUL2)
+ O << " mul:2";
+ else if (Imm == SIOutMods::MUL4)
+ O << " mul:4";
+ else if (Imm == SIOutMods::DIV2)
+ O << " div:2";
+}
+
void AMDGPUInstPrinter::printLiteral(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
int32_t Imm = MI->getOperand(OpNo).getImm();
@@ -281,7 +460,7 @@ void AMDGPUInstPrinter::printSel(const MCInst *MI, unsigned OpNo,
sel -= 512;
int cb = sel >> 12;
sel &= 4095;
- O << cb << "[" << sel << "]";
+ O << cb << '[' << sel << ']';
} else if (sel >= 448) {
sel -= 448;
O << sel;
@@ -290,7 +469,7 @@ void AMDGPUInstPrinter::printSel(const MCInst *MI, unsigned OpNo,
}
if (sel >= 0)
- O << "." << chans[chan];
+ O << '.' << chans[chan];
}
void AMDGPUInstPrinter::printBankSwizzle(const MCInst *MI, unsigned OpNo,
@@ -323,25 +502,25 @@ void AMDGPUInstPrinter::printRSel(const MCInst *MI, unsigned OpNo,
unsigned Sel = MI->getOperand(OpNo).getImm();
switch (Sel) {
case 0:
- O << "X";
+ O << 'X';
break;
case 1:
- O << "Y";
+ O << 'Y';
break;
case 2:
- O << "Z";
+ O << 'Z';
break;
case 3:
- O << "W";
+ O << 'W';
break;
case 4:
- O << "0";
+ O << '0';
break;
case 5:
- O << "1";
+ O << '1';
break;
case 7:
- O << "_";
+ O << '_';
break;
default:
break;
@@ -353,10 +532,10 @@ void AMDGPUInstPrinter::printCT(const MCInst *MI, unsigned OpNo,
unsigned CT = MI->getOperand(OpNo).getImm();
switch (CT) {
case 0:
- O << "U";
+ O << 'U';
break;
case 1:
- O << "N";
+ O << 'N';
break;
default:
break;
@@ -368,10 +547,10 @@ void AMDGPUInstPrinter::printKCache(const MCInst *MI, unsigned OpNo,
int KCacheMode = MI->getOperand(OpNo).getImm();
if (KCacheMode > 0) {
int KCacheBank = MI->getOperand(OpNo - 2).getImm();
- O << "CB" << KCacheBank <<":";
+ O << "CB" << KCacheBank << ':';
int KCacheAddr = MI->getOperand(OpNo + 2).getImm();
- int LineSize = (KCacheMode == 1)?16:32;
- O << KCacheAddr * 16 << "-" << KCacheAddr * 16 + LineSize;
+ int LineSize = (KCacheMode == 1) ? 16 : 32;
+ O << KCacheAddr * 16 << '-' << KCacheAddr * 16 + LineSize;
}
}
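The kcache range printed above is derived from the mode and address operands: the line is 16 words for mode 1 and 32 otherwise, starting at KCacheAddr * 16. A small sketch of just that computation with made-up operand values:

  #include <cstdio>

  // Mirror the range computation in printKCache for a single (mode, bank, addr) triple.
  static void printKCacheRange(int Mode, int Bank, int Addr) {
    if (Mode > 0) {
      int LineSize = (Mode == 1) ? 16 : 32;
      std::printf("CB%d:%d-%d\n", Bank, Addr * 16, Addr * 16 + LineSize);
    }
  }

  int main() {
    printKCacheRange(1, 0, 0); // -> CB0:0-16
    printKCacheRange(2, 1, 3); // -> CB1:48-80
  }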
@@ -415,12 +594,26 @@ void AMDGPUInstPrinter::printWaitFlag(const MCInst *MI, unsigned OpNo,
unsigned Vmcnt = SImm16 & 0xF;
unsigned Expcnt = (SImm16 >> 4) & 0xF;
unsigned Lgkmcnt = (SImm16 >> 8) & 0xF;
- if (Vmcnt != 0xF)
- O << "vmcnt(" << Vmcnt << ") ";
- if (Expcnt != 0x7)
- O << "expcnt(" << Expcnt << ") ";
- if (Lgkmcnt != 0x7)
- O << "lgkmcnt(" << Lgkmcnt << ")";
+
+ bool NeedSpace = false;
+
+ if (Vmcnt != 0xF) {
+ O << "vmcnt(" << Vmcnt << ')';
+ NeedSpace = true;
+ }
+
+ if (Expcnt != 0x7) {
+ if (NeedSpace)
+ O << ' ';
+ O << "expcnt(" << Expcnt << ')';
+ NeedSpace = true;
+ }
+
+ if (Lgkmcnt != 0x7) {
+ if (NeedSpace)
+ O << ' ';
+ O << "lgkmcnt(" << Lgkmcnt << ')';
+ }
}
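The s_waitcnt operand decoded above packs three counters into one 16-bit immediate: vmcnt in bits [3:0], expcnt in bits [7:4], and lgkmcnt in bits [11:8]; a field is only printed when it is below its "no wait" default (0xF, 0x7, 0x7 respectively). A standalone sketch of the decode and the space handling, with one example immediate (the helper name is illustrative):

  #include <cstdio>
  #include <string>

  // Decode a waitcnt immediate the way printWaitFlag does and build the text.
  static std::string decodeWaitcnt(unsigned SImm16) {
    unsigned Vmcnt   = SImm16 & 0xF;
    unsigned Expcnt  = (SImm16 >> 4) & 0xF;
    unsigned Lgkmcnt = (SImm16 >> 8) & 0xF;

    std::string S;
    auto Append = [&S](const char *Name, unsigned Val) {
      if (!S.empty())
        S += ' ';            // same "NeedSpace" behavior as above
      S += Name;
      S += '(' + std::to_string(Val) + ')';
    };

    if (Vmcnt != 0xF)   Append("vmcnt", Vmcnt);
    if (Expcnt != 0x7)  Append("expcnt", Expcnt);
    if (Lgkmcnt != 0x7) Append("lgkmcnt", Lgkmcnt);
    return S;
  }

  int main() {
    // 0x0072: vmcnt = 2, expcnt = 7 (default, skipped), lgkmcnt = 0.
    std::printf("%s\n", decodeWaitcnt(0x0072).c_str()); // vmcnt(2) lgkmcnt(0)
  }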
#include "AMDGPUGenAsmWriter.inc"
diff --git a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h
index 6ca717076cde..1d43c7acbe74 100644
--- a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h
+++ b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h
@@ -10,8 +10,8 @@
/// \file
//===----------------------------------------------------------------------===//
-#ifndef AMDGPUINSTPRINTER_H
-#define AMDGPUINSTPRINTER_H
+#ifndef LLVM_LIB_TARGET_R600_INSTPRINTER_AMDGPUINSTPRINTER_H
+#define LLVM_LIB_TARGET_R600_INSTPRINTER_AMDGPUINSTPRINTER_H
#include "llvm/ADT/StringRef.h"
#include "llvm/MC/MCInstPrinter.h"
@@ -34,9 +34,22 @@ public:
private:
void printU8ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printU16ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printU8ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printU16ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printU32ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printOffen(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printIdxen(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printAddr64(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printMBUFOffset(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printDSOffset(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printDSOffset0(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printDSOffset1(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printGLC(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printSLC(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printTFE(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printRegOperand(unsigned RegNo, raw_ostream &O);
- void printImmediate(uint32_t Imm, raw_ostream &O);
+ void printImmediate32(uint32_t I, raw_ostream &O);
+ void printImmediate64(uint64_t I, raw_ostream &O);
void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printOperandAndMods(const MCInst *MI, unsigned OpNo, raw_ostream &O);
static void printInterpSlot(const MCInst *MI, unsigned OpNum, raw_ostream &O);
@@ -45,6 +58,8 @@ private:
StringRef Asm, StringRef Default = "");
static void printAbs(const MCInst *MI, unsigned OpNo, raw_ostream &O);
static void printClamp(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ static void printClampSI(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ static void printOModSI(const MCInst *MI, unsigned OpNo, raw_ostream &O);
static void printLiteral(const MCInst *MI, unsigned OpNo, raw_ostream &O);
static void printLast(const MCInst *MI, unsigned OpNo, raw_ostream &O);
static void printNeg(const MCInst *MI, unsigned OpNo, raw_ostream &O);
@@ -65,4 +80,4 @@ private:
} // End namespace llvm
-#endif // AMDGPUINSTRPRINTER_H
+#endif
diff --git a/lib/Target/R600/LLVMBuild.txt b/lib/Target/R600/LLVMBuild.txt
index 908872b55cd2..f3f254fdcbad 100644
--- a/lib/Target/R600/LLVMBuild.txt
+++ b/lib/Target/R600/LLVMBuild.txt
@@ -16,17 +16,18 @@
;===------------------------------------------------------------------------===;
[common]
-subdirectories = InstPrinter MCTargetDesc TargetInfo
+subdirectories = AsmParser InstPrinter MCTargetDesc TargetInfo
[component_0]
type = TargetGroup
name = R600
parent = Target
+has_asmparser = 1
has_asmprinter = 1
[component_1]
type = Library
name = R600CodeGen
parent = R600
-required_libraries = Analysis AsmPrinter CodeGen Core MC R600AsmPrinter R600Desc R600Info Scalar SelectionDAG Support Target
+required_libraries = Analysis AsmPrinter CodeGen Core IPO MC R600AsmParser R600AsmPrinter R600Desc R600Info Scalar SelectionDAG Support Target TransformUtils
add_to_library_groups = R600
diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp b/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp
index d55f27b04554..d0c634fb7e42 100644
--- a/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp
+++ b/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp
@@ -29,7 +29,7 @@ public:
const MCAsmLayout &Layout) override {
//XXX: Implement if necessary.
}
- void RecordRelocation(const MCAssembler &Asm, const MCAsmLayout &Layout,
+ void RecordRelocation(MCAssembler &Asm, const MCAsmLayout &Layout,
const MCFragment *Fragment, const MCFixup &Fixup,
MCValue Target, bool &IsPCRel,
uint64_t &FixedValue) override {
@@ -57,9 +57,7 @@ public:
assert(!"Not implemented");
}
bool mayNeedRelaxation(const MCInst &Inst) const override { return false; }
- bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override {
- return true;
- }
+ bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override;
};
@@ -116,6 +114,13 @@ const MCFixupKindInfo &AMDGPUAsmBackend::getFixupKindInfo(
return Infos[Kind - FirstTargetFixupKind];
}
+bool AMDGPUAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
+ for (unsigned i = 0; i < Count; ++i)
+ OW->Write8(0);
+
+ return true;
+}
+
//===----------------------------------------------------------------------===//
// ELFAMDGPUAsmBackend class
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUFixupKinds.h b/lib/Target/R600/MCTargetDesc/AMDGPUFixupKinds.h
index 4b12e548a56f..01021d67ffd9 100644
--- a/lib/Target/R600/MCTargetDesc/AMDGPUFixupKinds.h
+++ b/lib/Target/R600/MCTargetDesc/AMDGPUFixupKinds.h
@@ -7,8 +7,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_AMDGPUFIXUPKINDS_H
-#define LLVM_AMDGPUFIXUPKINDS_H
+#ifndef LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUFIXUPKINDS_H
+#define LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUFIXUPKINDS_H
#include "llvm/MC/MCFixup.h"
@@ -31,4 +31,4 @@ enum Fixups {
}
}
-#endif // LLVM_AMDGPUFIXUPKINDS_H
+#endif
diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp b/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp
index 78bbe0a163c9..19d89fb27caa 100644
--- a/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp
+++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp
@@ -11,21 +11,15 @@
#include "AMDGPUMCAsmInfo.h"
using namespace llvm;
-AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(StringRef &TT) : MCAsmInfo() {
+AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(StringRef &TT) : MCAsmInfoELF() {
HasSingleParameterDotFile = false;
//===------------------------------------------------------------------===//
- HasSubsectionsViaSymbols = true;
- HasMachoZeroFillDirective = false;
- HasMachoTBSSDirective = false;
- HasStaticCtorDtorReferenceInStaticMode = false;
- LinkerRequiresNonEmptyDwarfLines = true;
MaxInstLength = 16;
SeparatorString = "\n";
CommentString = ";";
- LabelSuffix = ":";
+ PrivateLabelPrefix = "";
InlineAsmStart = ";#ASMSTART";
InlineAsmEnd = ";#ASMEND";
- AssemblerDialect = 0;
//===--- Data Emission Directives -------------------------------------===//
ZeroDirective = ".zero";
@@ -35,28 +29,15 @@ AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(StringRef &TT) : MCAsmInfo() {
Data16bitsDirective = ".short\t";
Data32bitsDirective = ".long\t";
Data64bitsDirective = ".quad\t";
- GPRel32Directive = nullptr;
SunStyleELFSectionSwitchSyntax = true;
UsesELFSectionDirectiveForBSS = true;
- //===--- Alignment Information ----------------------------------------===//
- AlignmentIsInBytes = true;
- TextAlignFillValue = 0;
-
//===--- Global Variable Emission Directives --------------------------===//
- GlobalDirective = ".global";
- HasSetDirective = false;
HasAggressiveSymbolFolding = true;
COMMDirectiveAlignmentIsInBytes = false;
HasDotTypeDotSizeDirective = false;
HasNoDeadStrip = true;
WeakRefDirective = ".weakref\t";
//===--- Dwarf Emission Directives -----------------------------------===//
- HasLEB128 = true;
SupportsDebugInformation = true;
}
-
-const MCSection*
-AMDGPUMCAsmInfo::getNonexecutableStackSection(MCContext &CTX) const {
- return nullptr;
-}
diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.h b/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.h
index 59aebece540a..8f75c76c4257 100644
--- a/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.h
+++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.h
@@ -1,4 +1,4 @@
-//===-- MCTargetDesc/AMDGPUMCAsmInfo.h - AMDGPU MCAsm Interface ----------===//
+//===-- MCTargetDesc/AMDGPUMCAsmInfo.h - AMDGPU MCAsm Interface -*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -11,18 +11,22 @@
//
//===----------------------------------------------------------------------===//
-#ifndef AMDGPUMCASMINFO_H
-#define AMDGPUMCASMINFO_H
+#ifndef LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUMCASMINFO_H
+#define LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUMCASMINFO_H
-#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCAsmInfoELF.h"
namespace llvm {
class StringRef;
-class AMDGPUMCAsmInfo : public MCAsmInfo {
+// If you need to create another MCAsmInfo class that inherits from MCAsmInfo,
+// you will need to make sure your new class sets PrivateGlobalPrefix to
+// a prefix that won't appear in a function name. The default value
+// for PrivateGlobalPrefix is 'L', so it will consider any function starting
+// with 'L' as a local symbol.
+class AMDGPUMCAsmInfo : public MCAsmInfoELF {
public:
explicit AMDGPUMCAsmInfo(StringRef &TT);
- const MCSection* getNonexecutableStackSection(MCContext &CTX) const override;
};
} // namespace llvm
-#endif // AMDGPUMCASMINFO_H
+#endif
diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.h b/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.h
index d5e432de564c..c95742762233 100644
--- a/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.h
+++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.h
@@ -12,8 +12,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef AMDGPUCODEEMITTER_H
-#define AMDGPUCODEEMITTER_H
+#ifndef LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUMCCODEEMITTER_H
+#define LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUMCCODEEMITTER_H
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/Support/raw_ostream.h"
@@ -47,4 +47,4 @@ public:
} // End namespace llvm
-#endif // AMDGPUCODEEMITTER_H
+#endif
diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp
index 38a295659f96..83403ba04870 100644
--- a/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp
+++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp
@@ -15,6 +15,7 @@
#include "AMDGPUMCTargetDesc.h"
#include "AMDGPUMCAsmInfo.h"
#include "InstPrinter/AMDGPUInstPrinter.h"
+#include "SIDefines.h"
#include "llvm/MC/MCCodeGenInfo.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
@@ -84,31 +85,37 @@ static MCCodeEmitter *createAMDGPUMCCodeEmitter(const MCInstrInfo &MCII,
static MCStreamer *createMCStreamer(const Target &T, StringRef TT,
MCContext &Ctx, MCAsmBackend &MAB,
- raw_ostream &_OS,
- MCCodeEmitter *_Emitter,
- const MCSubtargetInfo &STI,
- bool RelaxAll,
- bool NoExecStack) {
- return createELFStreamer(Ctx, MAB, _OS, _Emitter, false, false);
+ raw_ostream &_OS, MCCodeEmitter *_Emitter,
+ const MCSubtargetInfo &STI, bool RelaxAll) {
+ return createELFStreamer(Ctx, MAB, _OS, _Emitter, false);
}
extern "C" void LLVMInitializeR600TargetMC() {
RegisterMCAsmInfo<AMDGPUMCAsmInfo> Y(TheAMDGPUTarget);
+ RegisterMCAsmInfo<AMDGPUMCAsmInfo> Z(TheGCNTarget);
TargetRegistry::RegisterMCCodeGenInfo(TheAMDGPUTarget, createAMDGPUMCCodeGenInfo);
+ TargetRegistry::RegisterMCCodeGenInfo(TheGCNTarget, createAMDGPUMCCodeGenInfo);
TargetRegistry::RegisterMCInstrInfo(TheAMDGPUTarget, createAMDGPUMCInstrInfo);
+ TargetRegistry::RegisterMCInstrInfo(TheGCNTarget, createAMDGPUMCInstrInfo);
TargetRegistry::RegisterMCRegInfo(TheAMDGPUTarget, createAMDGPUMCRegisterInfo);
+ TargetRegistry::RegisterMCRegInfo(TheGCNTarget, createAMDGPUMCRegisterInfo);
TargetRegistry::RegisterMCSubtargetInfo(TheAMDGPUTarget, createAMDGPUMCSubtargetInfo);
+ TargetRegistry::RegisterMCSubtargetInfo(TheGCNTarget, createAMDGPUMCSubtargetInfo);
TargetRegistry::RegisterMCInstPrinter(TheAMDGPUTarget, createAMDGPUMCInstPrinter);
+ TargetRegistry::RegisterMCInstPrinter(TheGCNTarget, createAMDGPUMCInstPrinter);
TargetRegistry::RegisterMCCodeEmitter(TheAMDGPUTarget, createAMDGPUMCCodeEmitter);
+ TargetRegistry::RegisterMCCodeEmitter(TheGCNTarget, createAMDGPUMCCodeEmitter);
TargetRegistry::RegisterMCAsmBackend(TheAMDGPUTarget, createAMDGPUAsmBackend);
+ TargetRegistry::RegisterMCAsmBackend(TheGCNTarget, createAMDGPUAsmBackend);
TargetRegistry::RegisterMCObjectStreamer(TheAMDGPUTarget, createMCStreamer);
+ TargetRegistry::RegisterMCObjectStreamer(TheGCNTarget, createMCStreamer);
}
diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h
index f6b3376da32c..bc8cd53d84b4 100644
--- a/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h
+++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h
@@ -13,8 +13,8 @@
//===----------------------------------------------------------------------===//
//
-#ifndef AMDGPUMCTARGETDESC_H
-#define AMDGPUMCTARGETDESC_H
+#ifndef LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUMCTARGETDESC_H
+#define LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUMCTARGETDESC_H
#include "llvm/ADT/StringRef.h"
@@ -30,6 +30,7 @@ class Target;
class raw_ostream;
extern Target TheAMDGPUTarget;
+extern Target TheGCNTarget;
MCCodeEmitter *createR600MCCodeEmitter(const MCInstrInfo &MCII,
const MCRegisterInfo &MRI,
@@ -55,4 +56,4 @@ MCObjectWriter *createAMDGPUELFObjectWriter(raw_ostream &OS);
#define GET_SUBTARGETINFO_ENUM
#include "AMDGPUGenSubtargetInfo.inc"
-#endif // AMDGPUMCTARGETDESC_H
+#endif
diff --git a/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp b/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp
index 78776c11d75d..640de3f9fc84 100644
--- a/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp
+++ b/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp
@@ -14,9 +14,10 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-#include "MCTargetDesc/AMDGPUMCCodeEmitter.h"
#include "MCTargetDesc/AMDGPUFixupKinds.h"
+#include "MCTargetDesc/AMDGPUMCCodeEmitter.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIDefines.h"
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCFixup.h"
@@ -84,12 +85,10 @@ MCCodeEmitter *llvm::createSIMCCodeEmitter(const MCInstrInfo &MCII,
bool SIMCCodeEmitter::isSrcOperand(const MCInstrDesc &Desc,
unsigned OpNo) const {
+ unsigned OpType = Desc.OpInfo[OpNo].OperandType;
- unsigned RegClass = Desc.OpInfo[OpNo].RegClass;
- return (AMDGPU::SSrc_32RegClassID == RegClass) ||
- (AMDGPU::SSrc_64RegClassID == RegClass) ||
- (AMDGPU::VSrc_32RegClassID == RegClass) ||
- (AMDGPU::VSrc_64RegClassID == RegClass);
+ return OpType == AMDGPU::OPERAND_REG_IMM32 ||
+ OpType == AMDGPU::OPERAND_REG_INLINE_C;
}
uint32_t SIMCCodeEmitter::getLitEncoding(const MCOperand &MO) const {
diff --git a/lib/Target/R600/Makefile b/lib/Target/R600/Makefile
index 1b3ebbe8c8f3..64a7c8c045c5 100644
--- a/lib/Target/R600/Makefile
+++ b/lib/Target/R600/Makefile
@@ -16,8 +16,8 @@ BUILT_SOURCES = AMDGPUGenRegisterInfo.inc AMDGPUGenInstrInfo.inc \
AMDGPUGenDAGISel.inc AMDGPUGenSubtargetInfo.inc \
AMDGPUGenMCCodeEmitter.inc AMDGPUGenCallingConv.inc \
AMDGPUGenIntrinsics.inc AMDGPUGenDFAPacketizer.inc \
- AMDGPUGenAsmWriter.inc
+ AMDGPUGenAsmWriter.inc AMDGPUGenAsmMatcher.inc
-DIRS = InstPrinter TargetInfo MCTargetDesc
+DIRS = AsmParser InstPrinter TargetInfo MCTargetDesc
include $(LEVEL)/Makefile.common
diff --git a/lib/Target/R600/Processors.td b/lib/Target/R600/Processors.td
index ce17d7cb7f13..cff97cdb3beb 100644
--- a/lib/Target/R600/Processors.td
+++ b/lib/Target/R600/Processors.td
@@ -83,28 +83,38 @@ def : Proc<"cayman", R600_VLIW4_Itin,
// Southern Islands
//===----------------------------------------------------------------------===//
-def : Proc<"SI", SI_Itin, [FeatureSouthernIslands]>;
+def : ProcessorModel<"SI", SIFullSpeedModel, [FeatureSouthernIslands]>;
-def : Proc<"tahiti", SI_Itin, [FeatureSouthernIslands]>;
+def : ProcessorModel<"tahiti", SIFullSpeedModel, [FeatureSouthernIslands]>;
-def : Proc<"pitcairn", SI_Itin, [FeatureSouthernIslands]>;
+def : ProcessorModel<"pitcairn", SIQuarterSpeedModel, [FeatureSouthernIslands]>;
-def : Proc<"verde", SI_Itin, [FeatureSouthernIslands]>;
+def : ProcessorModel<"verde", SIQuarterSpeedModel, [FeatureSouthernIslands]>;
-def : Proc<"oland", SI_Itin, [FeatureSouthernIslands]>;
+def : ProcessorModel<"oland", SIQuarterSpeedModel, [FeatureSouthernIslands]>;
-def : Proc<"hainan", SI_Itin, [FeatureSouthernIslands]>;
+def : ProcessorModel<"hainan", SIQuarterSpeedModel, [FeatureSouthernIslands]>;
//===----------------------------------------------------------------------===//
// Sea Islands
//===----------------------------------------------------------------------===//
-def : Proc<"bonaire", SI_Itin, [FeatureSeaIslands]>;
+def : ProcessorModel<"bonaire", SIQuarterSpeedModel, [FeatureSeaIslands]>;
-def : Proc<"kabini", SI_Itin, [FeatureSeaIslands]>;
+def : ProcessorModel<"kabini", SIQuarterSpeedModel, [FeatureSeaIslands]>;
-def : Proc<"kaveri", SI_Itin, [FeatureSeaIslands]>;
+def : ProcessorModel<"kaveri", SIQuarterSpeedModel, [FeatureSeaIslands]>;
-def : Proc<"hawaii", SI_Itin, [FeatureSeaIslands]>;
+def : ProcessorModel<"hawaii", SIFullSpeedModel, [FeatureSeaIslands]>;
-def : Proc<"mullins", SI_Itin, [FeatureSeaIslands]>;
+def : ProcessorModel<"mullins", SIQuarterSpeedModel, [FeatureSeaIslands]>;
+
+//===----------------------------------------------------------------------===//
+// Volcanic Islands
+//===----------------------------------------------------------------------===//
+
+def : ProcessorModel<"tonga", SIFullSpeedModel, [FeatureVolcanicIslands]>;
+
+def : ProcessorModel<"iceland", SIQuarterSpeedModel, [FeatureVolcanicIslands]>;
+
+def : ProcessorModel<"carrizo", SIQuarterSpeedModel, [FeatureVolcanicIslands]>;
diff --git a/lib/Target/R600/R600ClauseMergePass.cpp b/lib/Target/R600/R600ClauseMergePass.cpp
index 92bf0df96254..f07be0001fb8 100644
--- a/lib/Target/R600/R600ClauseMergePass.cpp
+++ b/lib/Target/R600/R600ClauseMergePass.cpp
@@ -18,6 +18,7 @@
#include "R600InstrInfo.h"
#include "R600MachineFunctionInfo.h"
#include "R600RegisterInfo.h"
+#include "AMDGPUSubtarget.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -167,7 +168,7 @@ bool R600ClauseMergePass::mergeIfPossible(MachineInstr *RootCFAlu,
}
bool R600ClauseMergePass::runOnMachineFunction(MachineFunction &MF) {
- TII = static_cast<const R600InstrInfo *>(MF.getTarget().getInstrInfo());
+ TII = static_cast<const R600InstrInfo *>(MF.getSubtarget().getInstrInfo());
for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
BB != BB_E; ++BB) {
MachineBasicBlock &MBB = *BB;
diff --git a/lib/Target/R600/R600ControlFlowFinalizer.cpp b/lib/Target/R600/R600ControlFlowFinalizer.cpp
index e37767a0719d..edaf27841ca7 100644
--- a/lib/Target/R600/R600ControlFlowFinalizer.cpp
+++ b/lib/Target/R600/R600ControlFlowFinalizer.cpp
@@ -336,7 +336,7 @@ private:
getHWInstrDesc(IsTex?CF_TC:CF_VC))
.addImm(0) // ADDR
.addImm(AluInstCount - 1); // COUNT
- return ClauseFile(MIb, ClauseContent);
+ return ClauseFile(MIb, std::move(ClauseContent));
}
void getLiteral(MachineInstr *MI, std::vector<int64_t> &Lits) const {
@@ -426,7 +426,7 @@ private:
}
assert(ClauseContent.size() < 128 && "ALU clause is too big");
ClauseHead->getOperand(7).setImm(ClauseContent.size() - 1);
- return ClauseFile(ClauseHead, ClauseContent);
+ return ClauseFile(ClauseHead, std::move(ClauseContent));
}
void
@@ -459,11 +459,9 @@ private:
void CounterPropagateAddr(MachineInstr *MI, unsigned Addr) const {
MI->getOperand(0).setImm(Addr + MI->getOperand(0).getImm());
}
- void CounterPropagateAddr(std::set<MachineInstr *> MIs, unsigned Addr)
- const {
- for (std::set<MachineInstr *>::iterator It = MIs.begin(), E = MIs.end();
- It != E; ++It) {
- MachineInstr *MI = *It;
+ void CounterPropagateAddr(const std::set<MachineInstr *> &MIs,
+ unsigned Addr) const {
+ for (MachineInstr *MI : MIs) {
CounterPropagateAddr(MI, Addr);
}
}
@@ -477,8 +475,9 @@ public:
}
bool runOnMachineFunction(MachineFunction &MF) override {
- TII=static_cast<const R600InstrInfo *>(MF.getTarget().getInstrInfo());
- TRI=static_cast<const R600RegisterInfo *>(MF.getTarget().getRegisterInfo());
+ TII = static_cast<const R600InstrInfo *>(MF.getSubtarget().getInstrInfo());
+ TRI = static_cast<const R600RegisterInfo *>(
+ MF.getSubtarget().getRegisterInfo());
R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
CFStack CFStack(ST, MFI->getShaderType());
@@ -542,7 +541,7 @@ public:
std::pair<unsigned, std::set<MachineInstr *> > Pair(CfCount,
std::set<MachineInstr *>());
Pair.second.insert(MIb);
- LoopStack.push_back(Pair);
+ LoopStack.push_back(std::move(Pair));
MI->eraseFromParent();
CfCount++;
break;
@@ -550,7 +549,7 @@ public:
case AMDGPU::ENDLOOP: {
CFStack.popLoop();
std::pair<unsigned, std::set<MachineInstr *> > Pair =
- LoopStack.back();
+ std::move(LoopStack.back());
LoopStack.pop_back();
CounterPropagateAddr(Pair.second, CfCount);
BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_END_LOOP))
diff --git a/lib/Target/R600/R600Defines.h b/lib/Target/R600/R600Defines.h
index f2f28fe469b5..51d87eda31d1 100644
--- a/lib/Target/R600/R600Defines.h
+++ b/lib/Target/R600/R600Defines.h
@@ -8,8 +8,8 @@
/// \file
//===----------------------------------------------------------------------===//
-#ifndef R600DEFINES_H_
-#define R600DEFINES_H_
+#ifndef LLVM_LIB_TARGET_R600_R600DEFINES_H
+#define LLVM_LIB_TARGET_R600_R600DEFINES_H
#include "llvm/MC/MCRegisterInfo.h"
@@ -168,4 +168,4 @@ namespace OpName {
#define R_0288E8_SQ_LDS_ALLOC 0x0288E8
-#endif // R600DEFINES_H_
+#endif
diff --git a/lib/Target/R600/R600EmitClauseMarkers.cpp b/lib/Target/R600/R600EmitClauseMarkers.cpp
index 38afebef400e..fdc20302f4a3 100644
--- a/lib/Target/R600/R600EmitClauseMarkers.cpp
+++ b/lib/Target/R600/R600EmitClauseMarkers.cpp
@@ -19,6 +19,7 @@
#include "R600InstrInfo.h"
#include "R600MachineFunctionInfo.h"
#include "R600RegisterInfo.h"
+#include "AMDGPUSubtarget.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -297,7 +298,7 @@ public:
}
bool runOnMachineFunction(MachineFunction &MF) override {
- TII = static_cast<const R600InstrInfo *>(MF.getTarget().getInstrInfo());
+ TII = static_cast<const R600InstrInfo *>(MF.getSubtarget().getInstrInfo());
for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
BB != BB_E; ++BB) {
diff --git a/lib/Target/R600/R600ExpandSpecialInstrs.cpp b/lib/Target/R600/R600ExpandSpecialInstrs.cpp
index 732b06dc15c7..211d392e8fcc 100644
--- a/lib/Target/R600/R600ExpandSpecialInstrs.cpp
+++ b/lib/Target/R600/R600ExpandSpecialInstrs.cpp
@@ -19,6 +19,7 @@
#include "R600InstrInfo.h"
#include "R600MachineFunctionInfo.h"
#include "R600RegisterInfo.h"
+#include "AMDGPUSubtarget.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -65,7 +66,7 @@ void R600ExpandSpecialInstrsPass::SetFlagInNewMI(MachineInstr *NewMI,
}
bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) {
- TII = static_cast<const R600InstrInfo *>(MF.getTarget().getInstrInfo());
+ TII = static_cast<const R600InstrInfo *>(MF.getSubtarget().getInstrInfo());
const R600RegisterInfo &TRI = TII->getRegisterInfo();
diff --git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp
index 52315bf0f338..595f69884544 100644
--- a/lib/Target/R600/R600ISelLowering.cpp
+++ b/lib/Target/R600/R600ISelLowering.cpp
@@ -122,12 +122,19 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
// EXTLOAD should be the same as ZEXTLOAD. It is legal for some address
// spaces, so it is custom lowered to handle those where it isn't.
- setLoadExtAction(ISD::SEXTLOAD, MVT::i8, Custom);
- setLoadExtAction(ISD::SEXTLOAD, MVT::i16, Custom);
- setLoadExtAction(ISD::ZEXTLOAD, MVT::i8, Custom);
- setLoadExtAction(ISD::ZEXTLOAD, MVT::i16, Custom);
- setLoadExtAction(ISD::EXTLOAD, MVT::i8, Custom);
- setLoadExtAction(ISD::EXTLOAD, MVT::i16, Custom);
+ for (MVT VT : MVT::integer_valuetypes()) {
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Custom);
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Custom);
+
+ setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
+ setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Custom);
+ setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Custom);
+
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Custom);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Custom);
+ }
setOperationAction(ISD::STORE, MVT::i8, Custom);
setOperationAction(ISD::STORE, MVT::i32, Custom);
@@ -181,8 +188,6 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
setOperationAction(ISD::SUBE, VT, Expand);
}
- setBooleanContents(ZeroOrNegativeOneBooleanContent);
- setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
setSchedulingPreference(Sched::Source);
}
@@ -192,7 +197,7 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
MachineRegisterInfo &MRI = MF->getRegInfo();
MachineBasicBlock::iterator I = *MI;
const R600InstrInfo *TII =
- static_cast<const R600InstrInfo*>(MF->getTarget().getInstrInfo());
+ static_cast<const R600InstrInfo *>(MF->getSubtarget().getInstrInfo());
switch (MI->getOpcode()) {
default:
@@ -202,7 +207,10 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
int DstIdx = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst);
assert(DstIdx != -1);
MachineInstrBuilder NewMI;
- if (!MRI.use_empty(MI->getOperand(DstIdx).getReg()))
+ // FIXME: getLDSNoRetOp method only handles LDS_1A1D LDS ops. Add
+ // LDS_1A2D support and remove this special case.
+ if (!MRI.use_empty(MI->getOperand(DstIdx).getReg()) ||
+ MI->getOpcode() == AMDGPU::LDS_CMPST_RET)
return BB;
NewMI = BuildMI(*BB, I, BB->findDebugLoc(I),
@@ -645,8 +653,8 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
MachineSDNode *interp;
if (ijb < 0) {
const MachineFunction &MF = DAG.getMachineFunction();
- const R600InstrInfo *TII =
- static_cast<const R600InstrInfo*>(MF.getTarget().getInstrInfo());
+ const R600InstrInfo *TII = static_cast<const R600InstrInfo *>(
+ MF.getSubtarget().getInstrInfo());
interp = DAG.getMachineNode(AMDGPU::INTERP_VEC_LOAD, DL,
MVT::v4f32, DAG.getTargetConstant(slot / 4 , MVT::i32));
return DAG.getTargetExtractSubreg(
@@ -806,6 +814,9 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
case Intrinsic::r600_read_local_size_z:
return LowerImplicitParameter(DAG, VT, DL, 8);
+ case Intrinsic::AMDGPU_read_workdim:
+ return LowerImplicitParameter(DAG, VT, DL, MFI->ABIArgOffset / 4);
+
case Intrinsic::r600_read_tgid_x:
return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
AMDGPU::T1_X, VT);
@@ -901,74 +912,7 @@ void R600TargetLowering::ReplaceNodeResults(SDNode *N,
}
case ISD::UDIVREM: {
SDValue Op = SDValue(N, 0);
- SDLoc DL(Op);
- EVT VT = Op.getValueType();
- EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
-
- SDValue one = DAG.getConstant(1, HalfVT);
- SDValue zero = DAG.getConstant(0, HalfVT);
-
- //HiLo split
- SDValue LHS = N->getOperand(0);
- SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, zero);
- SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, one);
-
- SDValue RHS = N->getOperand(1);
- SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, zero);
- SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, one);
-
- // Get Speculative values
- SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
- SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);
-
- SDValue REM_Hi = zero;
- SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, zero, REM_Part, LHS_Hi, ISD::SETEQ);
-
- SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, zero, DIV_Part, zero, ISD::SETEQ);
- SDValue DIV_Lo = zero;
-
- const unsigned halfBitWidth = HalfVT.getSizeInBits();
-
- for (unsigned i = 0; i < halfBitWidth; ++i) {
- SDValue POS = DAG.getConstant(halfBitWidth - i - 1, HalfVT);
- // Get Value of high bit
- SDValue HBit;
- if (halfBitWidth == 32 && Subtarget->hasBFE()) {
- HBit = DAG.getNode(AMDGPUISD::BFE_U32, DL, HalfVT, LHS_Lo, POS, one);
- } else {
- HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
- HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, one);
- }
-
- SDValue Carry = DAG.getNode(ISD::SRL, DL, HalfVT, REM_Lo,
- DAG.getConstant(halfBitWidth - 1, HalfVT));
- REM_Hi = DAG.getNode(ISD::SHL, DL, HalfVT, REM_Hi, one);
- REM_Hi = DAG.getNode(ISD::OR, DL, HalfVT, REM_Hi, Carry);
-
- REM_Lo = DAG.getNode(ISD::SHL, DL, HalfVT, REM_Lo, one);
- REM_Lo = DAG.getNode(ISD::OR, DL, HalfVT, REM_Lo, HBit);
-
-
- SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, REM_Hi);
-
- SDValue BIT = DAG.getConstant(1 << (halfBitWidth - i - 1), HalfVT);
- SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, zero, ISD::SETGE);
-
- DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);
-
- // Update REM
-
- SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
-
- REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETGE);
- REM_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, REM, zero);
- REM_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, REM, one);
- }
-
- SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, REM_Hi);
- SDValue DIV = DAG.getNode(ISD::BUILD_PAIR, DL, VT, DIV_Lo, DIV_Hi);
- Results.push_back(DIV);
- Results.push_back(REM);
+ LowerUDIVREM64(Op, DAG, Results);
break;
}
}
@@ -1176,6 +1120,13 @@ SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const
SDValue CC = Op.getOperand(4);
SDValue Temp;
+ if (VT == MVT::f32) {
+ DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
+ SDValue MinMax = CombineFMinMaxLegacy(DL, VT, LHS, RHS, True, False, CC, DCI);
+ if (MinMax)
+ return MinMax;
+ }
+
// LHS and RHS are guaranteed to be the same value type
EVT CompareVT = LHS.getValueType();
@@ -1430,8 +1381,8 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
// Lowering for indirect addressing
const MachineFunction &MF = DAG.getMachineFunction();
- const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>(
- getTargetMachine().getFrameLowering());
+ const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering *>(
+ getTargetMachine().getSubtargetImpl()->getFrameLowering());
unsigned StackWidth = TFL->getStackWidth(MF);
Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
@@ -1543,7 +1494,7 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
if (LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && VT.isVector()) {
SDValue MergedValues[2] = {
- SplitVectorLoad(Op, DAG),
+ ScalarizeVectorLoad(Op, DAG),
Chain
};
return DAG.getMergeValues(MergedValues, DL);
@@ -1613,6 +1564,7 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
LoadNode->getPointerInfo(), MemVT,
LoadNode->isVolatile(),
LoadNode->isNonTemporal(),
+ LoadNode->isInvariant(),
LoadNode->getAlignment());
SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, NewLoad, ShiftAmount);
SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, Shl, ShiftAmount);
@@ -1627,8 +1579,8 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
// Lowering for indirect addressing
const MachineFunction &MF = DAG.getMachineFunction();
- const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>(
- getTargetMachine().getFrameLowering());
+ const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering *>(
+ getTargetMachine().getSubtargetImpl()->getFrameLowering());
unsigned StackWidth = TFL->getStackWidth(MF);
Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
@@ -1691,10 +1643,10 @@ SDValue R600TargetLowering::LowerFormalArguments(
SDLoc DL, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const {
SmallVector<CCValAssign, 16> ArgLocs;
- CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
- getTargetMachine(), ArgLocs, *DAG.getContext());
+ CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
+ *DAG.getContext());
MachineFunction &MF = DAG.getMachineFunction();
- unsigned ShaderType = MF.getInfo<R600MachineFunctionInfo>()->getShaderType();
+ R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
SmallVector<ISD::InputArg, 8> LocalIns;
@@ -1704,10 +1656,15 @@ SDValue R600TargetLowering::LowerFormalArguments(
for (unsigned i = 0, e = Ins.size(); i < e; ++i) {
CCValAssign &VA = ArgLocs[i];
- EVT VT = Ins[i].VT;
- EVT MemVT = LocalIns[i].VT;
+ const ISD::InputArg &In = Ins[i];
+ EVT VT = In.VT;
+ EVT MemVT = VA.getLocVT();
+ if (!VT.isVector() && MemVT.isVector()) {
+ // Get load source type if scalarized.
+ MemVT = MemVT.getVectorElementType();
+ }
- if (ShaderType != ShaderType::COMPUTE) {
+ if (MFI->getShaderType() != ShaderType::COMPUTE) {
unsigned Reg = MF.addLiveIn(VA.getLocReg(), &AMDGPU::R600_Reg128RegClass);
SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT);
InVals.push_back(Register);
@@ -1715,7 +1672,7 @@ SDValue R600TargetLowering::LowerFormalArguments(
}
PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
- AMDGPUAS::CONSTANT_BUFFER_0);
+ AMDGPUAS::CONSTANT_BUFFER_0);
// i64 isn't a legal type, so the register type used ends up as i32, which
// isn't expected here. It attempts to create this sextload, but it ends up
@@ -1724,18 +1681,33 @@ SDValue R600TargetLowering::LowerFormalArguments(
// The first 36 bytes of the input buffer contains information about
// thread group and global sizes.
+ ISD::LoadExtType Ext = ISD::NON_EXTLOAD;
+ if (MemVT.getScalarSizeInBits() != VT.getScalarSizeInBits()) {
+ // FIXME: This should really check the extload type, but the handling of
+ // extload vector parameters seems to be broken.
+
+ // Ext = In.Flags.isSExt() ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
+ Ext = ISD::SEXTLOAD;
+ }
+
+ // Compute the offset from the value.
+ // XXX - I think PartOffset should give you this, but it seems to give the
+ // size of the register which isn't useful.
+
+ unsigned ValBase = ArgLocs[In.OrigArgIndex].getLocMemOffset();
+ unsigned PartOffset = VA.getLocMemOffset();
+ unsigned Offset = 36 + VA.getLocMemOffset();
- // FIXME: This should really check the extload type, but the handling of
- // extload vecto parameters seems to be broken.
- //ISD::LoadExtType Ext = Ins[i].Flags.isSExt() ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
- ISD::LoadExtType Ext = ISD::SEXTLOAD;
- SDValue Arg = DAG.getExtLoad(Ext, DL, VT, Chain,
- DAG.getConstant(36 + VA.getLocMemOffset(), MVT::i32),
- MachinePointerInfo(UndefValue::get(PtrTy)),
- MemVT, false, false, 4);
+ MachinePointerInfo PtrInfo(UndefValue::get(PtrTy), PartOffset - ValBase);
+ SDValue Arg = DAG.getLoad(ISD::UNINDEXED, Ext, VT, DL, Chain,
+ DAG.getConstant(Offset, MVT::i32),
+ DAG.getUNDEF(MVT::i32),
+ PtrInfo,
+ MemVT, false, true, true, 4);
// 4 is the preferred alignment for the CONSTANT memory space.
InVals.push_back(Arg);
+ MFI->ABIArgOffset = Offset + MemVT.getStoreSize();
}
return Chain;
}
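To make the offset bookkeeping in LowerFormalArguments above concrete: each kernel argument is loaded from the constant buffer at 36 bytes (the implicit-parameter header) plus its assigned memory offset, and ABIArgOffset ends up just past the last argument. A small sketch with a hypothetical argument list; the densely packed 4-byte slots below are illustrative, not taken from the real calling-convention tables.

  #include <cstdio>

  struct Arg {
    const char *Name;
    unsigned LocMemOffset; // what VA.getLocMemOffset() would report
    unsigned StoreSize;    // what MemVT.getStoreSize() would report
  };

  int main() {
    // Hypothetical kernel signature: (i32 a, i32 b, float c).
    Arg Args[] = { {"a", 0, 4}, {"b", 4, 4}, {"c", 8, 4} };

    unsigned ABIArgOffset = 0;
    for (const Arg &A : Args) {
      unsigned Offset = 36 + A.LocMemOffset;   // skip the 36-byte header
      ABIArgOffset = Offset + A.StoreSize;     // running "end of arguments" marker
      std::printf("%s loads from offset %u\n", A.Name, Offset);
    }
    // 48 here; the patch reads the work dimension at ABIArgOffset / 4.
    std::printf("ABIArgOffset = %u\n", ABIArgOffset);
  }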
@@ -2081,7 +2053,7 @@ static bool
FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src, SDValue &Neg,
SDValue &Abs, SDValue &Sel, SDValue &Imm, SelectionDAG &DAG) {
const R600InstrInfo *TII =
- static_cast<const R600InstrInfo *>(DAG.getTarget().getInstrInfo());
+ static_cast<const R600InstrInfo *>(DAG.getSubtarget().getInstrInfo());
if (!Src.isMachineOpcode())
return false;
switch (Src.getMachineOpcode()) {
@@ -2206,7 +2178,7 @@ FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src, SDValue &Neg,
SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node,
SelectionDAG &DAG) const {
const R600InstrInfo *TII =
- static_cast<const R600InstrInfo *>(DAG.getTarget().getInstrInfo());
+ static_cast<const R600InstrInfo *>(DAG.getSubtarget().getInstrInfo());
if (!Node->isMachineOpcode())
return Node;
unsigned Opcode = Node->getMachineOpcode();
diff --git a/lib/Target/R600/R600ISelLowering.h b/lib/Target/R600/R600ISelLowering.h
index d22c8c98a542..10ebc10ccdba 100644
--- a/lib/Target/R600/R600ISelLowering.h
+++ b/lib/Target/R600/R600ISelLowering.h
@@ -12,8 +12,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef R600ISELLOWERING_H
-#define R600ISELLOWERING_H
+#ifndef LLVM_LIB_TARGET_R600_R600ISELLOWERING_H
+#define LLVM_LIB_TARGET_R600_R600ISELLOWERING_H
#include "AMDGPUISelLowering.h"
@@ -74,4 +74,4 @@ private:
} // End namespace llvm;
-#endif // R600ISELLOWERING_H
+#endif
diff --git a/lib/Target/R600/R600InstrFormats.td b/lib/Target/R600/R600InstrFormats.td
index 9428babcbefd..0ffd485476ec 100644
--- a/lib/Target/R600/R600InstrFormats.td
+++ b/lib/Target/R600/R600InstrFormats.td
@@ -38,6 +38,9 @@ class InstR600 <dag outs, dag ins, string asm, list<dag> pattern,
let Pattern = pattern;
let Itinerary = itin;
+ // No AsmMatcher support.
+ let isCodeGenOnly = 1;
+
let TSFlags{4} = Trig;
let TSFlags{5} = Op3;
diff --git a/lib/Target/R600/R600InstrInfo.cpp b/lib/Target/R600/R600InstrInfo.cpp
index 99920b7761a7..653fd0d52757 100644
--- a/lib/Target/R600/R600InstrInfo.cpp
+++ b/lib/Target/R600/R600InstrInfo.cpp
@@ -571,7 +571,7 @@ R600InstrInfo::fitsReadPortLimitations(const std::vector<MachineInstr *> &IG,
if (!isLastAluTrans)
return FindSwizzleForVectorSlot(IGSrcs, ValidSwizzle, TransOps, TransBS);
- TransOps = IGSrcs.back();
+ TransOps = std::move(IGSrcs.back());
IGSrcs.pop_back();
ValidSwizzle.pop_back();
@@ -654,10 +654,10 @@ R600InstrInfo::fitsConstReadLimitations(const std::vector<MachineInstr *> &MIs)
return fitsConstReadLimitations(Consts);
}
-DFAPacketizer *R600InstrInfo::CreateTargetScheduleState(const TargetMachine *TM,
- const ScheduleDAG *DAG) const {
- const InstrItineraryData *II = TM->getInstrItineraryData();
- return TM->getSubtarget<AMDGPUSubtarget>().createDFAPacketizer(II);
+DFAPacketizer *
+R600InstrInfo::CreateTargetScheduleState(const TargetSubtargetInfo &STI) const {
+ const InstrItineraryData *II = STI.getInstrItineraryData();
+ return static_cast<const AMDGPUSubtarget &>(STI).createDFAPacketizer(II);
}
static bool
@@ -1082,9 +1082,8 @@ bool R600InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
void R600InstrInfo::reserveIndirectRegisters(BitVector &Reserved,
const MachineFunction &MF) const {
- const AMDGPUFrameLowering *TFL =
- static_cast<const AMDGPUFrameLowering*>(
- MF.getTarget().getFrameLowering());
+ const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering *>(
+ MF.getSubtarget().getFrameLowering());
unsigned StackWidth = TFL->getStackWidth(MF);
int End = getIndirectIndexEnd(MF);
diff --git a/lib/Target/R600/R600InstrInfo.h b/lib/Target/R600/R600InstrInfo.h
index 1c3cb637a178..d3dc0e58daa1 100644
--- a/lib/Target/R600/R600InstrInfo.h
+++ b/lib/Target/R600/R600InstrInfo.h
@@ -12,8 +12,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef R600INSTRUCTIONINFO_H_
-#define R600INSTRUCTIONINFO_H_
+#ifndef LLVM_LIB_TARGET_R600_R600INSTRINFO_H
+#define LLVM_LIB_TARGET_R600_R600INSTRINFO_H
#include "AMDGPUInstrInfo.h"
#include "R600Defines.h"
@@ -154,8 +154,8 @@ namespace llvm {
bool isMov(unsigned Opcode) const override;
- DFAPacketizer *CreateTargetScheduleState(const TargetMachine *TM,
- const ScheduleDAG *DAG) const override;
+ DFAPacketizer *
+ CreateTargetScheduleState(const TargetSubtargetInfo &) const override;
bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override;
@@ -206,7 +206,7 @@ namespace llvm {
int getInstrLatency(const InstrItineraryData *ItinData,
SDNode *Node) const override { return 1;}
- virtual bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const;
+ bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override;
    /// \brief Reserve the registers that may be accessed using indirect addressing.
void reserveIndirectRegisters(BitVector &Reserved,
@@ -298,4 +298,4 @@ int getLDSNoRetOp(uint16_t Opcode);
} // End llvm namespace
-#endif // R600INSTRINFO_H_
+#endif
diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td
index 704507d368ec..b1d3ce276eee 100644
--- a/lib/Target/R600/R600Instructions.td
+++ b/lib/Target/R600/R600Instructions.td
@@ -475,13 +475,13 @@ class ExportBufWord1 {
multiclass ExportPattern<Instruction ExportInst, bits<8> cf_inst> {
def : Pat<(int_R600_store_pixel_depth R600_Reg32:$reg),
(ExportInst
- (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), R600_Reg32:$reg, sub0),
+ (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), $reg, sub0),
0, 61, 0, 7, 7, 7, cf_inst, 0)
>;
def : Pat<(int_R600_store_pixel_stencil R600_Reg32:$reg),
(ExportInst
- (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), R600_Reg32:$reg, sub0),
+ (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), $reg, sub0),
0, 61, 7, 0, 7, 7, cf_inst, 0)
>;
@@ -513,17 +513,17 @@ multiclass SteamOutputExportPattern<Instruction ExportInst,
// Stream1
def : Pat<(int_R600_store_stream_output (v4f32 R600_Reg128:$src),
(i32 imm:$arraybase), (i32 1), (i32 imm:$mask)),
- (ExportInst R600_Reg128:$src, 0, imm:$arraybase,
+ (ExportInst $src, 0, imm:$arraybase,
4095, imm:$mask, buf1inst, 0)>;
// Stream2
def : Pat<(int_R600_store_stream_output (v4f32 R600_Reg128:$src),
(i32 imm:$arraybase), (i32 2), (i32 imm:$mask)),
- (ExportInst R600_Reg128:$src, 0, imm:$arraybase,
+ (ExportInst $src, 0, imm:$arraybase,
4095, imm:$mask, buf2inst, 0)>;
// Stream3
def : Pat<(int_R600_store_stream_output (v4f32 R600_Reg128:$src),
(i32 imm:$arraybase), (i32 3), (i32 imm:$mask)),
- (ExportInst R600_Reg128:$src, 0, imm:$arraybase,
+ (ExportInst $src, 0, imm:$arraybase,
4095, imm:$mask, buf3inst, 0)>;
}
@@ -674,8 +674,9 @@ def ADD : R600_2OP_Helper <0x0, "ADD", fadd>;
// Non-IEEE MUL: 0 * anything = 0
def MUL : R600_2OP_Helper <0x1, "MUL NON-IEEE", int_AMDGPU_mul>;
def MUL_IEEE : R600_2OP_Helper <0x2, "MUL_IEEE", fmul>;
-def MAX : R600_2OP_Helper <0x3, "MAX", AMDGPUfmax>;
-def MIN : R600_2OP_Helper <0x4, "MIN", AMDGPUfmin>;
+// TODO: Do these actually match the regular fmin/fmax behavior?
+def MAX : R600_2OP_Helper <0x3, "MAX", AMDGPUfmax_legacy>;
+def MIN : R600_2OP_Helper <0x4, "MIN", AMDGPUfmin_legacy>;
// For the SET* instructions there is a naming conflict in TargetSelectionDAG.td,
// so some of the instruction names don't match the asm string.
@@ -697,7 +698,7 @@ def SGE : R600_2OP <
def SNE : R600_2OP <
0xB, "SETNE",
- [(set f32:$dst, (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_UNE))]
+ [(set f32:$dst, (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_UNE_NE))]
>;
def SETE_DX10 : R600_2OP <
@@ -715,9 +716,10 @@ def SETGE_DX10 : R600_2OP <
[(set i32:$dst, (selectcc f32:$src0, f32:$src1, -1, 0, COND_OGE))]
>;
+// FIXME: This should probably be COND_ONE
def SETNE_DX10 : R600_2OP <
0xF, "SETNE_DX10",
- [(set i32:$dst, (selectcc f32:$src0, f32:$src1, -1, 0, COND_UNE))]
+ [(set i32:$dst, (selectcc f32:$src0, f32:$src1, -1, 0, COND_UNE_NE))]
>;
def FRACT : R600_1OP_Helper <0x10, "FRACT", AMDGPUfract>;
@@ -915,6 +917,11 @@ class MULADD_IEEE_Common <bits<5> inst> : R600_3OP <
[(set f32:$dst, (fadd (fmul f32:$src0, f32:$src1), f32:$src2))]
>;
+class FMA_Common <bits<5> inst> : R600_3OP <
+ inst, "FMA",
+ [(set f32:$dst, (fma f32:$src0, f32:$src1, f32:$src2))], VecALU
+>;
+
class CNDE_Common <bits<5> inst> : R600_3OP <
inst, "CNDE",
[(set f32:$dst, (selectcc f32:$src0, FP_ZERO, f32:$src1, f32:$src2, COND_OEQ))]
@@ -1068,7 +1075,7 @@ class RECIP_CLAMPED_Common <bits<11> inst> : R600_1OP <
}
class RECIP_IEEE_Common <bits<11> inst> : R600_1OP <
- inst, "RECIP_IEEE", [(set f32:$dst, (fdiv FP_ONE, f32:$src0))]
+ inst, "RECIP_IEEE", [(set f32:$dst, (AMDGPUrcp f32:$src0))]
> {
let Itinerary = TransALU;
}
@@ -1114,6 +1121,7 @@ def FNEG_R600 : FNEG<R600_Reg32>;
// Helper patterns for complex intrinsics
//===----------------------------------------------------------------------===//
+// FIXME: Should be predicated on unsafe fp math.
multiclass DIV_Common <InstR600 recip_ieee> {
def : Pat<
(int_AMDGPU_div f32:$src0, f32:$src1),
@@ -1124,6 +1132,8 @@ def : Pat<
(fdiv f32:$src0, f32:$src1),
(MUL_IEEE $src0, (recip_ieee $src1))
>;
+
+def : RcpPat<recip_ieee, f32>;
}
class TGSI_LIT_Z_Common <InstR600 mul_lit, InstR600 log_clamped, InstR600 exp_ieee>
@@ -1133,9 +1143,12 @@ class TGSI_LIT_Z_Common <InstR600 mul_lit, InstR600 log_clamped, InstR600 exp_ie
>;
// FROUND pattern
-class FROUNDPat<Instruction CNDGE> : Pat <
+class FROUNDPat<Instruction CNDGE, Instruction CNDGT> : Pat <
(AMDGPUround f32:$x),
- (CNDGE (ADD (FNEG_R600 (f32 HALF)), (FRACT $x)), (CEIL $x), (FLOOR $x))
+ (CNDGE $x,
+ (CNDGE (ADD (FNEG_R600 (f32 HALF)), (FRACT $x)), (CEIL $x), (FLOOR $x)),
+ (CNDGT (ADD (FNEG_R600 (f32 HALF)), (FRACT $x)), (CEIL $x), (FLOOR $x))
+ )
>;
@@ -1180,7 +1193,9 @@ let Predicates = [isR600] in {
def TGSI_LIT_Z_r600 : TGSI_LIT_Z_Common<MUL_LIT_r600, LOG_CLAMPED_r600, EXP_IEEE_r600>;
def : Pat<(fsqrt f32:$src), (MUL $src, (RECIPSQRT_CLAMPED_r600 $src))>;
- def : FROUNDPat <CNDGE_r600>;
+ defm : RsqPat<RECIPSQRT_IEEE_r600, f32>;
+
+ def : FROUNDPat <CNDGE_r600, CNDGT_r600>;
def R600_ExportSwz : ExportSwzInst {
let Word1{20-17} = 0; // BURST_COUNT
@@ -1350,7 +1365,7 @@ def CONST_COPY : Instruction {
let Pattern =
[(set R600_Reg32:$dst, (CONST_ADDRESS ADDRGA_CONST_OFFSET:$src))];
let AsmString = "CONST_COPY";
- let neverHasSideEffects = 1;
+ let hasSideEffects = 0;
let isAsCheapAsAMove = 1;
let Itinerary = NullALU;
}
@@ -1482,6 +1497,7 @@ class ILFormat<dag outs, dag ins, string asmstr, list<dag> pattern>
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
+ let isCodeGenOnly = 1;
}
multiclass BranchConditional<SDNode Op, RegisterClass rci, RegisterClass rcf> {
diff --git a/lib/Target/R600/R600MachineFunctionInfo.h b/lib/Target/R600/R600MachineFunctionInfo.h
index b0ae22e806a9..263561edd30d 100644
--- a/lib/Target/R600/R600MachineFunctionInfo.h
+++ b/lib/Target/R600/R600MachineFunctionInfo.h
@@ -10,8 +10,8 @@
/// \file
//===----------------------------------------------------------------------===//
-#ifndef R600MACHINEFUNCTIONINFO_H
-#define R600MACHINEFUNCTIONINFO_H
+#ifndef LLVM_LIB_TARGET_R600_R600MACHINEFUNCTIONINFO_H
+#define LLVM_LIB_TARGET_R600_R600MACHINEFUNCTIONINFO_H
#include "AMDGPUMachineFunction.h"
#include "llvm/ADT/BitVector.h"
@@ -31,4 +31,4 @@ public:
} // End llvm namespace
-#endif //R600MACHINEFUNCTIONINFO_H
+#endif
diff --git a/lib/Target/R600/R600MachineScheduler.cpp b/lib/Target/R600/R600MachineScheduler.cpp
index 7ea654cb14cd..d782713cab65 100644
--- a/lib/Target/R600/R600MachineScheduler.cpp
+++ b/lib/Target/R600/R600MachineScheduler.cpp
@@ -14,7 +14,6 @@
#include "R600MachineScheduler.h"
#include "AMDGPUSubtarget.h"
-#include "llvm/CodeGen/LiveIntervalAnalysis.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Pass.h"
#include "llvm/PassManager.h"
@@ -76,21 +75,25 @@ SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) {
float ALUFetchRationEstimate =
(AluInstCount + AvailablesAluCount() + Pending[IDAlu].size()) /
(FetchInstCount + Available[IDFetch].size());
- unsigned NeededWF = 62.5f / ALUFetchRationEstimate;
- DEBUG( dbgs() << NeededWF << " approx. Wavefronts Required\n" );
- // We assume the local GPR requirements to be "dominated" by the requirement
- // of the TEX clause (which consumes 128 bits regs) ; ALU inst before and
- // after TEX are indeed likely to consume or generate values from/for the
- // TEX clause.
- // Available[IDFetch].size() * 2 : GPRs required in the Fetch clause
- // We assume that fetch instructions are either TnXYZW = TEX TnXYZW (need
- // one GPR) or TmXYZW = TnXYZW (need 2 GPR).
- // (TODO : use RegisterPressure)
- // If we are going too use too many GPR, we flush Fetch instruction to lower
- // register pressure on 128 bits regs.
- unsigned NearRegisterRequirement = 2 * Available[IDFetch].size();
- if (NeededWF > getWFCountLimitedByGPR(NearRegisterRequirement))
+ if (ALUFetchRationEstimate == 0) {
AllowSwitchFromAlu = true;
+ } else {
+ unsigned NeededWF = 62.5f / ALUFetchRationEstimate;
+ DEBUG( dbgs() << NeededWF << " approx. Wavefronts Required\n" );
+      // We assume the local GPR requirements to be "dominated" by the requirement
+      // of the TEX clause (which consumes 128-bit regs); ALU instructions before and
+      // after TEX are indeed likely to consume or generate values from/for the
+      // TEX clause.
+      // Available[IDFetch].size() * 2 : GPRs required in the Fetch clause
+      // We assume that fetch instructions are either TnXYZW = TEX TnXYZW (need
+      // one GPR) or TmXYZW = TnXYZW (need 2 GPRs).
+      // (TODO: use RegisterPressure)
+      // If we are going to use too many GPRs, we flush Fetch instructions to lower
+      // register pressure on 128-bit regs.
+ unsigned NearRegisterRequirement = 2 * Available[IDFetch].size();
+ if (NeededWF > getWFCountLimitedByGPR(NearRegisterRequirement))
+ AllowSwitchFromAlu = true;
+ }
}
if (!SU && ((AllowSwitchToAlu && CurInstKind != IDAlu) ||
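A worked example of the guard added in the hunk above: when the ALU/fetch ratio is zero the scheduler now allows switching away from ALU immediately instead of dividing by zero; otherwise NeededWF = 62.5 / ratio, so a ratio of 5 asks for roughly 12 wavefronts. A minimal sketch under made-up ratio values:

  #include <cstdio>

  // Given an already-computed ALU-to-fetch ratio, reproduce the guarded estimate:
  // skip the division entirely when the ratio is zero, otherwise derive the
  // approximate number of wavefronts needed to hide fetch latency.
  static void estimate(float ALUFetchRatio) {
    if (ALUFetchRatio == 0) {
      std::printf("ratio 0 -> allow switching away from ALU\n");
      return;
    }
    unsigned NeededWF = 62.5f / ALUFetchRatio; // same magic constant as the scheduler
    std::printf("ratio %.1f -> about %u wavefronts needed\n", ALUFetchRatio, NeededWF);
  }

  int main() {
    estimate(5.0f); // 62.5 / 5 = 12.5 -> 12 wavefronts
    estimate(0.0f); // takes the new early-out branch
  }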
diff --git a/lib/Target/R600/R600MachineScheduler.h b/lib/Target/R600/R600MachineScheduler.h
index fd475af2bf8f..fc5b95c28e71 100644
--- a/lib/Target/R600/R600MachineScheduler.h
+++ b/lib/Target/R600/R600MachineScheduler.h
@@ -12,8 +12,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef R600MACHINESCHEDULER_H_
-#define R600MACHINESCHEDULER_H_
+#ifndef LLVM_LIB_TARGET_R600_R600MACHINESCHEDULER_H
+#define LLVM_LIB_TARGET_R600_R600MACHINESCHEDULER_H
#include "R600InstrInfo.h"
#include "llvm/ADT/PriorityQueue.h"
diff --git a/lib/Target/R600/R600OptimizeVectorRegisters.cpp b/lib/Target/R600/R600OptimizeVectorRegisters.cpp
index 2314136f2227..742c0e0451cb 100644
--- a/lib/Target/R600/R600OptimizeVectorRegisters.cpp
+++ b/lib/Target/R600/R600OptimizeVectorRegisters.cpp
@@ -30,6 +30,7 @@
#include "llvm/Support/Debug.h"
#include "AMDGPU.h"
#include "R600InstrInfo.h"
+#include "AMDGPUSubtarget.h"
#include "llvm/CodeGen/DFAPacketizer.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
@@ -279,9 +280,8 @@ bool R600VectorRegMerger::tryMergeUsingCommonSlot(RegSeqInfo &RSI,
continue;
if (PreviousRegSeqByReg[MOp->getReg()].empty())
continue;
- std::vector<MachineInstr *> MIs = PreviousRegSeqByReg[MOp->getReg()];
- for (unsigned i = 0, e = MIs.size(); i < e; i++) {
- CompatibleRSI = PreviousRegSeq[MIs[i]];
+ for (MachineInstr *MI : PreviousRegSeqByReg[MOp->getReg()]) {
+ CompatibleRSI = PreviousRegSeq[MI];
if (RSI == CompatibleRSI)
continue;
if (tryMergeVector(&CompatibleRSI, &RSI, RemapChan))
@@ -314,7 +314,7 @@ void R600VectorRegMerger::trackRSI(const RegSeqInfo &RSI) {
}
bool R600VectorRegMerger::runOnMachineFunction(MachineFunction &Fn) {
- TII = static_cast<const R600InstrInfo *>(Fn.getTarget().getInstrInfo());
+ TII = static_cast<const R600InstrInfo *>(Fn.getSubtarget().getInstrInfo());
MRI = &(Fn.getRegInfo());
for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end();
MBB != MBBe; ++MBB) {
diff --git a/lib/Target/R600/R600Packetizer.cpp b/lib/Target/R600/R600Packetizer.cpp
index 74cf30974d58..ddf68c91cdf3 100644
--- a/lib/Target/R600/R600Packetizer.cpp
+++ b/lib/Target/R600/R600Packetizer.cpp
@@ -148,11 +148,11 @@ private:
}
public:
// Ctor.
- R600PacketizerList(MachineFunction &MF, MachineLoopInfo &MLI,
- MachineDominatorTree &MDT)
- : VLIWPacketizerList(MF, MLI, MDT, true),
- TII (static_cast<const R600InstrInfo *>(MF.getTarget().getInstrInfo())),
- TRI(TII->getRegisterInfo()) {
+ R600PacketizerList(MachineFunction &MF, MachineLoopInfo &MLI)
+ : VLIWPacketizerList(MF, MLI, true),
+ TII(static_cast<const R600InstrInfo *>(
+ MF.getSubtarget().getInstrInfo())),
+ TRI(TII->getRegisterInfo()) {
VLIW5 = !MF.getTarget().getSubtarget<AMDGPUSubtarget>().hasCaymanISA();
}
@@ -328,12 +328,11 @@ public:
};
bool R600Packetizer::runOnMachineFunction(MachineFunction &Fn) {
- const TargetInstrInfo *TII = Fn.getTarget().getInstrInfo();
+ const TargetInstrInfo *TII = Fn.getSubtarget().getInstrInfo();
MachineLoopInfo &MLI = getAnalysis<MachineLoopInfo>();
- MachineDominatorTree &MDT = getAnalysis<MachineDominatorTree>();
// Instantiate the packetizer.
- R600PacketizerList Packetizer(Fn, MLI, MDT);
+ R600PacketizerList Packetizer(Fn, MLI);
// DFA state table should not be empty.
assert(Packetizer.getResourceTracker() && "Empty DFA table!");
diff --git a/lib/Target/R600/R600RegisterInfo.h b/lib/Target/R600/R600RegisterInfo.h
index 247808b6e7b6..f1a8a41b9a5d 100644
--- a/lib/Target/R600/R600RegisterInfo.h
+++ b/lib/Target/R600/R600RegisterInfo.h
@@ -12,8 +12,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef R600REGISTERINFO_H_
-#define R600REGISTERINFO_H_
+#ifndef LLVM_LIB_TARGET_R600_R600REGISTERINFO_H
+#define LLVM_LIB_TARGET_R600_R600REGISTERINFO_H
#include "AMDGPURegisterInfo.h"
@@ -46,4 +46,4 @@ struct R600RegisterInfo : public AMDGPURegisterInfo {
} // End namespace llvm
-#endif // AMDIDSAREGISTERINFO_H_
+#endif
diff --git a/lib/Target/R600/SIDefines.h b/lib/Target/R600/SIDefines.h
index b7e7a2d000b3..73a9c73d8e7b 100644
--- a/lib/Target/R600/SIDefines.h
+++ b/lib/Target/R600/SIDefines.h
@@ -8,21 +8,88 @@
/// \file
//===----------------------------------------------------------------------===//
-#ifndef SIDEFINES_H_
-#define SIDEFINES_H_
+#include "llvm/MC/MCInstrDesc.h"
+
+#ifndef LLVM_LIB_TARGET_R600_SIDEFINES_H
+#define LLVM_LIB_TARGET_R600_SIDEFINES_H
namespace SIInstrFlags {
+// This needs to be kept in sync with the field bits in InstSI.
enum {
- MIMG = 1 << 3,
- SMRD = 1 << 4,
- VOP1 = 1 << 5,
- VOP2 = 1 << 6,
- VOP3 = 1 << 7,
- VOPC = 1 << 8,
- SALU = 1 << 9
+ SALU = 1 << 3,
+ VALU = 1 << 4,
+
+ SOP1 = 1 << 5,
+ SOP2 = 1 << 6,
+ SOPC = 1 << 7,
+ SOPK = 1 << 8,
+ SOPP = 1 << 9,
+
+ VOP1 = 1 << 10,
+ VOP2 = 1 << 11,
+ VOP3 = 1 << 12,
+ VOPC = 1 << 13,
+
+ MUBUF = 1 << 14,
+ MTBUF = 1 << 15,
+ SMRD = 1 << 16,
+ DS = 1 << 17,
+ MIMG = 1 << 18,
+ FLAT = 1 << 19
};
}
+namespace llvm {
+namespace AMDGPU {
+ enum OperandType {
+ /// Operand with register or 32-bit immediate
+ OPERAND_REG_IMM32 = llvm::MCOI::OPERAND_FIRST_TARGET,
+ /// Operand with register or inline constant
+ OPERAND_REG_INLINE_C
+ };
+}
+}
+
+namespace SIInstrFlags {
+ enum Flags {
+ // First 4 bits are the instruction encoding
+ VM_CNT = 1 << 0,
+ EXP_CNT = 1 << 1,
+ LGKM_CNT = 1 << 2
+ };
+
+ // v_cmp_class_* etc. use a 10-bit mask for what operation is checked.
+ // The result is true if any of these tests are true.
+ enum ClassFlags {
+ S_NAN = 1 << 0, // Signaling NaN
+ Q_NAN = 1 << 1, // Quiet NaN
+ N_INFINITY = 1 << 2, // Negative infinity
+ N_NORMAL = 1 << 3, // Negative normal
+ N_SUBNORMAL = 1 << 4, // Negative subnormal
+ N_ZERO = 1 << 5, // Negative zero
+ P_ZERO = 1 << 6, // Positive zero
+ P_SUBNORMAL = 1 << 7, // Positive subnormal
+ P_NORMAL = 1 << 8, // Positive normal
+ P_INFINITY = 1 << 9 // Positive infinity
+ };
+}
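The ClassFlags above form the 10-bit mask consumed by v_cmp_class_*: each bit selects one floating-point class, and the comparison is true if the source falls into any selected class. A small sketch composing a couple of common masks from these bits (the enum values are copied from the hunk; the mask names are illustrative):

  #include <cstdio>

  namespace SIInstrFlags {
    enum ClassFlags {
      S_NAN = 1 << 0, Q_NAN = 1 << 1,
      N_INFINITY = 1 << 2, N_NORMAL = 1 << 3, N_SUBNORMAL = 1 << 4, N_ZERO = 1 << 5,
      P_ZERO = 1 << 6, P_SUBNORMAL = 1 << 7, P_NORMAL = 1 << 8, P_INFINITY = 1 << 9
    };
  }

  int main() {
    using namespace SIInstrFlags;
    unsigned AnyNaN  = S_NAN | Q_NAN;           // 0x003
    unsigned AnyInf  = N_INFINITY | P_INFINITY; // 0x204
    unsigned AnyZero = N_ZERO | P_ZERO;         // 0x060
    std::printf("nan=0x%03x inf=0x%03x zero=0x%03x\n", AnyNaN, AnyInf, AnyZero);
  }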
+
+namespace SISrcMods {
+ enum {
+ NEG = 1 << 0,
+ ABS = 1 << 1
+ };
+}
+
+namespace SIOutMods {
+ enum {
+ NONE = 0,
+ MUL2 = 1,
+ MUL4 = 2,
+ DIV2 = 3
+ };
+}
+
#define R_00B028_SPI_SHADER_PGM_RSRC1_PS 0x00B028
#define R_00B02C_SPI_SHADER_PGM_RSRC2_PS 0x00B02C
#define S_00B02C_EXTRA_LDS_SIZE(x) (((x) & 0xFF) << 8)
@@ -32,7 +99,14 @@ enum {
#define S_00B028_VGPRS(x) (((x) & 0x3F) << 0)
#define S_00B028_SGPRS(x) (((x) & 0x0F) << 6)
#define R_00B84C_COMPUTE_PGM_RSRC2 0x00B84C
-#define S_00B02C_SCRATCH_EN(x) (((x) & 0x1) << 0)
+#define S_00B84C_SCRATCH_EN(x) (((x) & 0x1) << 0)
+#define S_00B84C_USER_SGPR(x) (((x) & 0x1F) << 1)
+#define S_00B84C_TGID_X_EN(x) (((x) & 0x1) << 7)
+#define S_00B84C_TGID_Y_EN(x) (((x) & 0x1) << 8)
+#define S_00B84C_TGID_Z_EN(x) (((x) & 0x1) << 9)
+#define S_00B84C_TG_SIZE_EN(x) (((x) & 0x1) << 10)
+#define S_00B84C_TIDIG_COMP_CNT(x) (((x) & 0x03) << 11)
+
#define S_00B84C_LDS_SIZE(x) (((x) & 0x1FF) << 15)
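// A minimal sketch of how the S_00B84C_* field setters above are meant to be
// OR'd into a single COMPUTE_PGM_RSRC2 value; the operand counts used here
// (two user SGPRs, all three group-ID enables, one LDS block) are
// hypothetical.
static const unsigned ExampleComputePgmRsrc2 = S_00B84C_USER_SGPR(2) |
                                               S_00B84C_TGID_X_EN(1) |
                                               S_00B84C_TGID_Y_EN(1) |
                                               S_00B84C_TGID_Z_EN(1) |
                                               S_00B84C_LDS_SIZE(1);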
#define R_0286CC_SPI_PS_INPUT_ENA 0x0286CC
@@ -89,4 +163,4 @@ enum {
#define R_00B860_COMPUTE_TMPRING_SIZE 0x00B860
#define S_00B860_WAVESIZE(x) (((x) & 0x1FFF) << 12)
-#endif // SIDEFINES_H_
+#endif
diff --git a/lib/Target/R600/SIFixSGPRCopies.cpp b/lib/Target/R600/SIFixSGPRCopies.cpp
index 5f714535abeb..cd1b3acc5c87 100644
--- a/lib/Target/R600/SIFixSGPRCopies.cpp
+++ b/lib/Target/R600/SIFixSGPRCopies.cpp
@@ -66,6 +66,7 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -135,12 +136,12 @@ const TargetRegisterClass *SIFixSGPRCopies::inferRegClassFromUses(
const MachineRegisterInfo &MRI,
unsigned Reg,
unsigned SubReg) const {
- // The Reg parameter to the function must always be defined by either a PHI
- // or a COPY, therefore it cannot be a physical register.
- assert(TargetRegisterInfo::isVirtualRegister(Reg) &&
- "Reg cannot be a physical register");
- const TargetRegisterClass *RC = MRI.getRegClass(Reg);
+ const TargetRegisterClass *RC
+ = TargetRegisterInfo::isVirtualRegister(Reg) ?
+ MRI.getRegClass(Reg) :
+ TRI->getRegClass(Reg);
+
RC = TRI->getSubRegClass(RC, SubReg);
for (MachineRegisterInfo::use_instr_iterator
I = MRI.use_instr_begin(Reg), E = MRI.use_instr_end(); I != E; ++I) {
@@ -181,7 +182,12 @@ bool SIFixSGPRCopies::isVGPRToSGPRCopy(const MachineInstr &Copy,
unsigned DstReg = Copy.getOperand(0).getReg();
unsigned SrcReg = Copy.getOperand(1).getReg();
unsigned SrcSubReg = Copy.getOperand(1).getSubReg();
- const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg);
+
+ const TargetRegisterClass *DstRC
+ = TargetRegisterInfo::isVirtualRegister(DstReg) ?
+ MRI.getRegClass(DstReg) :
+ TRI->getRegClass(DstReg);
+
const TargetRegisterClass *SrcRC;
if (!TargetRegisterInfo::isVirtualRegister(SrcReg) ||
@@ -195,10 +201,10 @@ bool SIFixSGPRCopies::isVGPRToSGPRCopy(const MachineInstr &Copy,
bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
MachineRegisterInfo &MRI = MF.getRegInfo();
- const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(
- MF.getTarget().getRegisterInfo());
- const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(
- MF.getTarget().getInstrInfo());
+ const SIRegisterInfo *TRI =
+ static_cast<const SIRegisterInfo *>(MF.getSubtarget().getRegisterInfo());
+ const SIInstrInfo *TII =
+ static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
BI != BE; ++BI) {
@@ -216,20 +222,21 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
switch (MI.getOpcode()) {
default: continue;
case AMDGPU::PHI: {
- DEBUG(dbgs() << " Fixing PHI:\n");
- DEBUG(MI.print(dbgs()));
+ DEBUG(dbgs() << "Fixing PHI: " << MI);
- for (unsigned i = 1; i < MI.getNumOperands(); i+=2) {
- unsigned Reg = MI.getOperand(i).getReg();
- const TargetRegisterClass *RC = inferRegClassFromDef(TRI, MRI, Reg,
- MI.getOperand(0).getSubReg());
- MRI.constrainRegClass(Reg, RC);
+ for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
+ const MachineOperand &Op = MI.getOperand(i);
+ unsigned Reg = Op.getReg();
+ const TargetRegisterClass *RC
+ = inferRegClassFromDef(TRI, MRI, Reg, Op.getSubReg());
+
+ MRI.constrainRegClass(Op.getReg(), RC);
}
unsigned Reg = MI.getOperand(0).getReg();
const TargetRegisterClass *RC = inferRegClassFromUses(TRI, MRI, Reg,
MI.getOperand(0).getSubReg());
- if (TRI->getCommonSubClass(RC, &AMDGPU::VReg_32RegClass)) {
- MRI.constrainRegClass(Reg, &AMDGPU::VReg_32RegClass);
+ if (TRI->getCommonSubClass(RC, &AMDGPU::VGPR_32RegClass)) {
+ MRI.constrainRegClass(Reg, &AMDGPU::VGPR_32RegClass);
}
if (!TRI->isSGPRClass(MRI.getRegClass(Reg)))
@@ -237,14 +244,66 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
// If a PHI node defines an SGPR and any of its operands are VGPRs,
// then we need to move it to the VALU.
+ //
+ // Also, if a PHI node defines an SGPR and has all SGPR operands
+ // we must move it to the VALU, because the SGPR operands will
+ // all end up being assigned the same register, which means
+ // there is a potential for a conflict if different threads take
+ // different control flow paths.
+ //
+ // For Example:
+ //
+ // sgpr0 = def;
+ // ...
+ // sgpr1 = def;
+ // ...
+ // sgpr2 = PHI sgpr0, sgpr1
+ // use sgpr2;
+ //
+ // Will Become:
+ //
+ // sgpr2 = def;
+ // ...
+ // sgpr2 = def;
+ // ...
+ // use sgpr2
+ //
+ // FIXME: This is OK if the branching decision is made based on an
+ // SGPR value.
+ bool SGPRBranch = false;
+
+ // The one exception to this rule is when one of the operands
+ // is defined by a SI_BREAK, SI_IF_BREAK, or SI_ELSE_BREAK
+ // instruction. In this case, we know the program will
+ // never enter the second block (the loop) without entering
+ // the first block (where the condition is computed), so there
+ // is no chance for values to be over-written.
+
+ bool HasBreakDef = false;
for (unsigned i = 1; i < MI.getNumOperands(); i+=2) {
unsigned Reg = MI.getOperand(i).getReg();
if (TRI->hasVGPRs(MRI.getRegClass(Reg))) {
TII->moveToVALU(MI);
break;
}
+ MachineInstr *DefInstr = MRI.getUniqueVRegDef(Reg);
+ assert(DefInstr);
+ switch(DefInstr->getOpcode()) {
+
+ case AMDGPU::SI_BREAK:
+ case AMDGPU::SI_IF_BREAK:
+ case AMDGPU::SI_ELSE_BREAK:
+ // If we see a PHI instruction that defines an SGPR, then that PHI
+ // instruction has already been considered and should have
+ // a *_BREAK as an operand.
+ case AMDGPU::PHI:
+ HasBreakDef = true;
+ break;
+ }
}
+ if (!SGPRBranch && !HasBreakDef)
+ TII->moveToVALU(MI);
break;
}
case AMDGPU::REG_SEQUENCE: {
@@ -252,8 +311,7 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
!hasVGPROperands(MI, TRI))
continue;
- DEBUG(dbgs() << "Fixing REG_SEQUENCE:\n");
- DEBUG(MI.print(dbgs()));
+ DEBUG(dbgs() << "Fixing REG_SEQUENCE: " << MI);
TII->moveToVALU(MI);
break;
@@ -265,8 +323,7 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
Src1RC = MRI.getRegClass(MI.getOperand(2).getReg());
if (TRI->isSGPRClass(DstRC) &&
(TRI->hasVGPRs(Src0RC) || TRI->hasVGPRs(Src1RC))) {
- DEBUG(dbgs() << " Fixing INSERT_SUBREG:\n");
- DEBUG(MI.print(dbgs()));
+ DEBUG(dbgs() << " Fixing INSERT_SUBREG: " << MI);
TII->moveToVALU(MI);
}
break;
@@ -274,5 +331,6 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
}
}
}
- return false;
+
+ return true;
}
diff --git a/lib/Target/R600/SIFixSGPRLiveRanges.cpp b/lib/Target/R600/SIFixSGPRLiveRanges.cpp
index 7d116eef396c..f34c37580432 100644
--- a/lib/Target/R600/SIFixSGPRLiveRanges.cpp
+++ b/lib/Target/R600/SIFixSGPRLiveRanges.cpp
@@ -9,18 +9,49 @@
//
/// \file
/// SALU instructions ignore control flow, so we need to modify the live ranges
-/// of the registers they define.
+/// of the registers they define in some cases.
///
-/// The strategy is to view the entire program as if it were a single basic
-/// block and calculate the intervals accordingly. We implement this
-/// by walking this list of segments for each LiveRange and setting the
-/// end of each segment equal to the start of the segment that immediately
-/// follows it.
+/// The main case we need to handle is when a def is used in one side of a
+/// branch and not another. For example:
+///
+/// %def
+/// IF
+/// ...
+/// ...
+/// ELSE
+/// %use
+/// ...
+/// ENDIF
+///
+/// Here we need the register allocator to avoid assigning any of the defs
+/// inside of the IF to the same register as %def. In traditional live
+/// interval analysis %def is not live inside the IF branch. However, since
+/// SALU instructions inside the IF will be executed even if the branch is not
+/// taken, there is a chance that one of those instructions will overwrite the
+/// value of %def, so the use in the ELSE will see the wrong value.
+///
+/// The strategy we use for solving this is to add an extra use after the ENDIF:
+///
+/// %def
+/// IF
+/// ...
+/// ...
+/// ELSE
+/// %use
+/// ...
+/// ENDIF
+/// %use
+///
+/// Adding this use will make the def live throughout the IF branch, which is
+/// what we want.
#include "AMDGPU.h"
+#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachinePostDominators.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Support/Debug.h"
#include "llvm/Target/TargetMachine.h"
@@ -40,16 +71,15 @@ public:
initializeSIFixSGPRLiveRangesPass(*PassRegistry::getPassRegistry());
}
- virtual bool runOnMachineFunction(MachineFunction &MF) override;
+ bool runOnMachineFunction(MachineFunction &MF) override;
- virtual const char *getPassName() const override {
+ const char *getPassName() const override {
return "SI Fix SGPR live ranges";
}
- virtual void getAnalysisUsage(AnalysisUsage &AU) const override {
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<LiveIntervals>();
- AU.addPreserved<LiveIntervals>();
- AU.addPreserved<SlotIndexes>();
+ AU.addRequired<MachinePostDominatorTree>();
AU.setPreservesCFG();
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -60,6 +90,7 @@ public:
INITIALIZE_PASS_BEGIN(SIFixSGPRLiveRanges, DEBUG_TYPE,
"SI Fix SGPR Live Ranges", false, false)
INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
INITIALIZE_PASS_END(SIFixSGPRLiveRanges, DEBUG_TYPE,
"SI Fix SGPR Live Ranges", false, false)
@@ -73,36 +104,86 @@ FunctionPass *llvm::createSIFixSGPRLiveRangesPass() {
bool SIFixSGPRLiveRanges::runOnMachineFunction(MachineFunction &MF) {
MachineRegisterInfo &MRI = MF.getRegInfo();
+ const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(
- MF.getTarget().getRegisterInfo());
+ MF.getSubtarget().getRegisterInfo());
LiveIntervals *LIS = &getAnalysis<LiveIntervals>();
+ MachinePostDominatorTree *PDT = &getAnalysis<MachinePostDominatorTree>();
+ std::vector<std::pair<unsigned, LiveRange *>> SGPRLiveRanges;
+
+ // First pass, collect all live intervals for SGPRs
+ for (const MachineBasicBlock &MBB : MF) {
+ for (const MachineInstr &MI : MBB) {
+ for (const MachineOperand &MO : MI.defs()) {
+ if (MO.isImplicit())
+ continue;
+ unsigned Def = MO.getReg();
+ if (TargetRegisterInfo::isVirtualRegister(Def)) {
+ if (TRI->isSGPRClass(MRI.getRegClass(Def)))
+ SGPRLiveRanges.push_back(
+ std::make_pair(Def, &LIS->getInterval(Def)));
+ } else if (TRI->isSGPRClass(TRI->getPhysRegClass(Def))) {
+ SGPRLiveRanges.push_back(
+ std::make_pair(Def, &LIS->getRegUnit(Def)));
+ }
+ }
+ }
+ }
+ // Second pass, fix the intervals
for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
BI != BE; ++BI) {
-
MachineBasicBlock &MBB = *BI;
- for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
- I != E; ++I) {
- MachineInstr &MI = *I;
- MachineOperand *ExecUse = MI.findRegisterUseOperand(AMDGPU::EXEC);
- if (ExecUse)
+ if (MBB.succ_size() < 2)
+ continue;
+
+ // We have structured control flow, so the number of successors should be two.
+ assert(MBB.succ_size() == 2);
+ MachineBasicBlock *SuccA = *MBB.succ_begin();
+ MachineBasicBlock *SuccB = *(++MBB.succ_begin());
+ MachineBasicBlock *NCD = PDT->findNearestCommonDominator(SuccA, SuccB);
+
+ if (!NCD)
+ continue;
+
+ MachineBasicBlock::iterator NCDTerm = NCD->getFirstTerminator();
+
+ if (NCDTerm != NCD->end() && NCDTerm->getOpcode() == AMDGPU::SI_ELSE) {
+ assert(NCD->succ_size() == 2);
+ // We want to make sure we insert the Use after the ENDIF, not after
+ // the ELSE.
+ NCD = PDT->findNearestCommonDominator(*NCD->succ_begin(),
+ *(++NCD->succ_begin()));
+ }
+ assert(SuccA && SuccB);
+ for (std::pair<unsigned, LiveRange*> RegLR : SGPRLiveRanges) {
+ unsigned Reg = RegLR.first;
+ LiveRange *LR = RegLR.second;
+
+ // FIXME: We could be smarter here. If the register is Live-In to
+ // one block, but the other doesn't have any SGPR defs, then there
+ // won't be a conflict. Also, if the branch decision is based on
+ // a value in an SGPR, then there will be no conflict.
+ bool LiveInToA = LIS->isLiveInToMBB(*LR, SuccA);
+ bool LiveInToB = LIS->isLiveInToMBB(*LR, SuccB);
+
+ if ((!LiveInToA && !LiveInToB) ||
+ (LiveInToA && LiveInToB))
continue;
- for (const MachineOperand &Def : MI.operands()) {
- if (!Def.isReg() || !Def.isDef() ||!TargetRegisterInfo::isVirtualRegister(Def.getReg()))
- continue;
-
- const TargetRegisterClass *RC = MRI.getRegClass(Def.getReg());
-
- if (!TRI->isSGPRClass(RC))
- continue;
- LiveInterval &LI = LIS->getInterval(Def.getReg());
- for (unsigned i = 0, e = LI.size() - 1; i != e; ++i) {
- LiveRange::Segment &Seg = LI.segments[i];
- LiveRange::Segment &Next = LI.segments[i + 1];
- Seg.end = Next.start;
- }
- }
+ // This interval is live in to one successor, but not the other, so
+ // we need to update its range so it is live in to both.
+ DEBUG(dbgs() << "Possible SGPR conflict detected " << " in " << *LR <<
+ " BB#" << SuccA->getNumber() << ", BB#" <<
+ SuccB->getNumber() <<
+ " with NCD = " << NCD->getNumber() << '\n');
+
+ // FIXME: Need to figure out how to update LiveRange here so this pass
+ // will be able to preserve LiveInterval analysis.
+ BuildMI(*NCD, NCD->getFirstNonPHI(), DebugLoc(),
+ TII->get(AMDGPU::SGPR_USE))
+ .addReg(Reg, RegState::Implicit);
+ DEBUG(NCD->getFirstNonPHI()->dump());
}
}
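// A minimal sketch, on hypothetical machine IR, of the fix-up emitted above:
// the pass places an artificial use at the nearest common post-dominator of
// the two successors,
//
//   SGPR_USE implicit %sgprN
//
// which keeps %sgprN live across both sides of the branch so the register
// allocator cannot reassign it inside the side the live range did not
// originally cover.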
diff --git a/lib/Target/R600/SIFoldOperands.cpp b/lib/Target/R600/SIFoldOperands.cpp
new file mode 100644
index 000000000000..d8ffa4f75505
--- /dev/null
+++ b/lib/Target/R600/SIFoldOperands.cpp
@@ -0,0 +1,275 @@
+//===-- SIFoldOperands.cpp - Fold operands --- ----------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+/// \file
+//===----------------------------------------------------------------------===//
+//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetMachine.h"
+
+#define DEBUG_TYPE "si-fold-operands"
+using namespace llvm;
+
+namespace {
+
+class SIFoldOperands : public MachineFunctionPass {
+public:
+ static char ID;
+
+public:
+ SIFoldOperands() : MachineFunctionPass(ID) {
+ initializeSIFoldOperandsPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ const char *getPassName() const override {
+ return "SI Fold Operands";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<MachineDominatorTree>();
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+};
+
+struct FoldCandidate {
+ MachineInstr *UseMI;
+ unsigned UseOpNo;
+ MachineOperand *OpToFold;
+ uint64_t ImmToFold;
+
+ FoldCandidate(MachineInstr *MI, unsigned OpNo, MachineOperand *FoldOp) :
+ UseMI(MI), UseOpNo(OpNo) {
+
+ if (FoldOp->isImm()) {
+ OpToFold = nullptr;
+ ImmToFold = FoldOp->getImm();
+ } else {
+ assert(FoldOp->isReg());
+ OpToFold = FoldOp;
+ }
+ }
+
+ bool isImm() const {
+ return !OpToFold;
+ }
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS_BEGIN(SIFoldOperands, DEBUG_TYPE,
+ "SI Fold Operands", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_END(SIFoldOperands, DEBUG_TYPE,
+ "SI Fold Operands", false, false)
+
+char SIFoldOperands::ID = 0;
+
+char &llvm::SIFoldOperandsID = SIFoldOperands::ID;
+
+FunctionPass *llvm::createSIFoldOperandsPass() {
+ return new SIFoldOperands();
+}
+
+static bool isSafeToFold(unsigned Opcode) {
+ switch(Opcode) {
+ case AMDGPU::V_MOV_B32_e32:
+ case AMDGPU::V_MOV_B32_e64:
+ case AMDGPU::V_MOV_B64_PSEUDO:
+ case AMDGPU::S_MOV_B32:
+ case AMDGPU::S_MOV_B64:
+ case AMDGPU::COPY:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static bool updateOperand(FoldCandidate &Fold,
+ const TargetRegisterInfo &TRI) {
+ MachineInstr *MI = Fold.UseMI;
+ MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
+ assert(Old.isReg());
+
+ if (Fold.isImm()) {
+ Old.ChangeToImmediate(Fold.ImmToFold);
+ return true;
+ }
+
+ MachineOperand *New = Fold.OpToFold;
+ if (TargetRegisterInfo::isVirtualRegister(Old.getReg()) &&
+ TargetRegisterInfo::isVirtualRegister(New->getReg())) {
+ Old.substVirtReg(New->getReg(), New->getSubReg(), TRI);
+ return true;
+ }
+
+ // FIXME: Handle physical registers.
+
+ return false;
+}
+
+static bool tryAddToFoldList(std::vector<FoldCandidate> &FoldList,
+ MachineInstr *MI, unsigned OpNo,
+ MachineOperand *OpToFold,
+ const SIInstrInfo *TII) {
+ if (!TII->isOperandLegal(MI, OpNo, OpToFold)) {
+ // Operand is not legal, so try to commute the instruction to
+ // see if this makes it possible to fold.
+ unsigned CommuteIdx0;
+ unsigned CommuteIdx1;
+ bool CanCommute = TII->findCommutedOpIndices(MI, CommuteIdx0, CommuteIdx1);
+
+ if (CanCommute) {
+ if (CommuteIdx0 == OpNo)
+ OpNo = CommuteIdx1;
+ else if (CommuteIdx1 == OpNo)
+ OpNo = CommuteIdx0;
+ }
+
+ if (!CanCommute || !TII->commuteInstruction(MI))
+ return false;
+
+ if (!TII->isOperandLegal(MI, OpNo, OpToFold))
+ return false;
+ }
+
+ FoldList.push_back(FoldCandidate(MI, OpNo, OpToFold));
+ return true;
+}
+
+bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ const SIInstrInfo *TII =
+ static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
+ const SIRegisterInfo &TRI = TII->getRegisterInfo();
+
+ for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
+ BI != BE; ++BI) {
+
+ MachineBasicBlock &MBB = *BI;
+ MachineBasicBlock::iterator I, Next;
+ for (I = MBB.begin(); I != MBB.end(); I = Next) {
+ Next = std::next(I);
+ MachineInstr &MI = *I;
+
+ if (!isSafeToFold(MI.getOpcode()))
+ continue;
+
+ MachineOperand &OpToFold = MI.getOperand(1);
+ bool FoldingImm = OpToFold.isImm();
+
+ // FIXME: We could also be folding things like FrameIndexes and
+ // TargetIndexes.
+ if (!FoldingImm && !OpToFold.isReg())
+ continue;
+
+ // Folding immediates with more than one use will increase program size.
+ // FIXME: This will also reduce register usage, which may be better
+ // in some cases. A better heuristic is needed.
+ if (FoldingImm && !TII->isInlineConstant(OpToFold) &&
+ !MRI.hasOneUse(MI.getOperand(0).getReg()))
+ continue;
+
+ // FIXME: Fold operands with subregs.
+ if (OpToFold.isReg() &&
+ (!TargetRegisterInfo::isVirtualRegister(OpToFold.getReg()) ||
+ OpToFold.getSubReg()))
+ continue;
+
+ std::vector<FoldCandidate> FoldList;
+ for (MachineRegisterInfo::use_iterator
+ Use = MRI.use_begin(MI.getOperand(0).getReg()), E = MRI.use_end();
+ Use != E; ++Use) {
+
+ MachineInstr *UseMI = Use->getParent();
+ const MachineOperand &UseOp = UseMI->getOperand(Use.getOperandNo());
+
+ // FIXME: Fold operands with subregs.
+ if (UseOp.isReg() && UseOp.getSubReg() && OpToFold.isReg()) {
+ continue;
+ }
+
+ APInt Imm;
+
+ if (FoldingImm) {
+ const TargetRegisterClass *UseRC = MRI.getRegClass(UseOp.getReg());
+ Imm = APInt(64, OpToFold.getImm());
+
+ // Split 64-bit constants into 32-bits for folding.
+ if (UseOp.getSubReg()) {
+ if (UseRC->getSize() != 8)
+ continue;
+
+ if (UseOp.getSubReg() == AMDGPU::sub0) {
+ Imm = Imm.getLoBits(32);
+ } else {
+ assert(UseOp.getSubReg() == AMDGPU::sub1);
+ Imm = Imm.getHiBits(32);
+ }
+ }
+
+ // In order to fold immediates into copies, we need to change the
+ // copy to a MOV.
+ if (UseMI->getOpcode() == AMDGPU::COPY) {
+ unsigned MovOp = TII->getMovOpcode(
+ MRI.getRegClass(UseMI->getOperand(0).getReg()));
+ if (MovOp == AMDGPU::COPY)
+ continue;
+
+ UseMI->setDesc(TII->get(MovOp));
+ }
+ }
+
+ const MCInstrDesc &UseDesc = UseMI->getDesc();
+
+ // Don't fold into target independent nodes. Target independent opcodes
+ // don't have defined register classes.
+ if (UseDesc.isVariadic() ||
+ UseDesc.OpInfo[Use.getOperandNo()].RegClass == -1)
+ continue;
+
+ if (FoldingImm) {
+ MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue());
+ tryAddToFoldList(FoldList, UseMI, Use.getOperandNo(), &ImmOp, TII);
+ continue;
+ }
+
+ tryAddToFoldList(FoldList, UseMI, Use.getOperandNo(), &OpToFold, TII);
+
+ // FIXME: We could try to change the instruction from 64-bit to 32-bit
+ // to enable more folding opportunities. The shrink operands pass
+ // already does this.
+ }
+
+ for (FoldCandidate &Fold : FoldList) {
+ if (updateOperand(Fold, TRI)) {
+ // Clear kill flags.
+ if (!Fold.isImm()) {
+ assert(Fold.OpToFold && Fold.OpToFold->isReg());
+ Fold.OpToFold->setIsKill(false);
+ }
+ DEBUG(dbgs() << "Folded source from " << MI << " into OpNo " <<
+ Fold.UseOpNo << " of " << *Fold.UseMI << '\n');
+ }
+ }
+ }
+ }
+ return false;
+}
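// A minimal sketch of the rewrite performed by the pass above, on
// hypothetical pre-RA machine IR (the virtual register numbers are invented):
//
//   %vreg1 = S_MOV_B32 42                 ; 42 is an inline constant
//   %vreg2 = V_OR_B32_e32 %vreg1, %vreg0
//
// becomes, once isOperandLegal accepts the immediate in that slot:
//
//   %vreg2 = V_OR_B32_e32 42, %vreg0      ; the S_MOV_B32 is now dead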
diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp
index 5a148a24810a..0a3fa2f930d7 100644
--- a/lib/Target/R600/SIISelLowering.cpp
+++ b/lib/Target/R600/SIISelLowering.cpp
@@ -25,6 +25,7 @@
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
+#include "llvm/ADT/BitVector.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -43,7 +44,7 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
addRegisterClass(MVT::v64i8, &AMDGPU::SReg_512RegClass);
addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
- addRegisterClass(MVT::f32, &AMDGPU::VReg_32RegClass);
+ addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
addRegisterClass(MVT::f64, &AMDGPU::VReg_64RegClass);
addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
@@ -52,29 +53,14 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass);
addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);
- addRegisterClass(MVT::v8i32, &AMDGPU::VReg_256RegClass);
+ addRegisterClass(MVT::v8i32, &AMDGPU::SReg_256RegClass);
addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass);
- addRegisterClass(MVT::v16i32, &AMDGPU::VReg_512RegClass);
+ addRegisterClass(MVT::v16i32, &AMDGPU::SReg_512RegClass);
addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass);
computeRegisterProperties();
- // Condition Codes
- setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
- setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand);
- setCondCodeAction(ISD::SETUGE, MVT::f32, Expand);
- setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
- setCondCodeAction(ISD::SETULE, MVT::f32, Expand);
- setCondCodeAction(ISD::SETULT, MVT::f32, Expand);
-
- setCondCodeAction(ISD::SETONE, MVT::f64, Expand);
- setCondCodeAction(ISD::SETUEQ, MVT::f64, Expand);
- setCondCodeAction(ISD::SETUGE, MVT::f64, Expand);
- setCondCodeAction(ISD::SETUGT, MVT::f64, Expand);
- setCondCodeAction(ISD::SETULE, MVT::f64, Expand);
- setCondCodeAction(ISD::SETULT, MVT::f64, Expand);
-
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand);
@@ -89,6 +75,11 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
setOperationAction(ISD::FSIN, MVT::f32, Custom);
setOperationAction(ISD::FCOS, MVT::f32, Custom);
+ setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
+ setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
+ setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
+ setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
+
// We need to custom lower vector stores from local memory
setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
@@ -102,8 +93,6 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
setOperationAction(ISD::STORE, MVT::v2i32, Custom);
setOperationAction(ISD::STORE, MVT::v4i32, Custom);
- setOperationAction(ISD::SELECT, MVT::f32, Promote);
- AddPromotedToType(ISD::SELECT, MVT::f32, MVT::i32);
setOperationAction(ISD::SELECT, MVT::i64, Custom);
setOperationAction(ISD::SELECT, MVT::f64, Promote);
AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
@@ -116,6 +105,8 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
setOperationAction(ISD::SETCC, MVT::v2i1, Expand);
setOperationAction(ISD::SETCC, MVT::v4i1, Expand);
+ setOperationAction(ISD::BSWAP, MVT::i32, Legal);
+
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Legal);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom);
@@ -128,8 +119,7 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom);
- setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Custom);
-
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
@@ -140,23 +130,33 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
setOperationAction(ISD::BRCOND, MVT::Other, Custom);
- setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
- setLoadExtAction(ISD::SEXTLOAD, MVT::i8, Custom);
- setLoadExtAction(ISD::SEXTLOAD, MVT::i16, Custom);
- setLoadExtAction(ISD::SEXTLOAD, MVT::i32, Expand);
- setLoadExtAction(ISD::SEXTLOAD, MVT::v8i16, Expand);
- setLoadExtAction(ISD::SEXTLOAD, MVT::v16i16, Expand);
+ for (MVT VT : MVT::integer_valuetypes()) {
+ if (VT == MVT::i64)
+ continue;
+
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i32, Expand);
- setLoadExtAction(ISD::ZEXTLOAD, MVT::i1, Promote);
- setLoadExtAction(ISD::ZEXTLOAD, MVT::i8, Custom);
- setLoadExtAction(ISD::ZEXTLOAD, MVT::i16, Custom);
- setLoadExtAction(ISD::ZEXTLOAD, MVT::i32, Expand);
+ setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
+ setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i32, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::i1, Promote);
- setLoadExtAction(ISD::EXTLOAD, MVT::i8, Custom);
- setLoadExtAction(ISD::EXTLOAD, MVT::i16, Custom);
- setLoadExtAction(ISD::EXTLOAD, MVT::i32, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Legal);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Legal);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::i32, Expand);
+ }
+
+ for (MVT VT : MVT::integer_vector_valuetypes()) {
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i16, Expand);
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v16i16, Expand);
+ }
+
+ for (MVT VT : MVT::fp_valuetypes())
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
setTruncStoreAction(MVT::i32, MVT::i8, Custom);
setTruncStoreAction(MVT::i32, MVT::i16, Custom);
@@ -167,9 +167,6 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
setOperationAction(ISD::LOAD, MVT::i1, Custom);
- setOperationAction(ISD::FP_TO_SINT, MVT::i64, Expand);
- setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
-
setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
@@ -196,10 +193,12 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
case ISD::BITCAST:
case ISD::EXTRACT_VECTOR_ELT:
case ISD::INSERT_VECTOR_ELT:
- case ISD::CONCAT_VECTORS:
case ISD::INSERT_SUBVECTOR:
case ISD::EXTRACT_SUBVECTOR:
break;
+ case ISD::CONCAT_VECTORS:
+ setOperationAction(Op, VT, Custom);
+ break;
default:
setOperationAction(Op, VT, Expand);
break;
@@ -207,13 +206,6 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
}
}
- for (int I = MVT::v1f64; I <= MVT::v8f64; ++I) {
- MVT::SimpleValueType VT = static_cast<MVT::SimpleValueType>(I);
- setOperationAction(ISD::FTRUNC, VT, Expand);
- setOperationAction(ISD::FCEIL, VT, Expand);
- setOperationAction(ISD::FFLOOR, VT, Expand);
- }
-
if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
setOperationAction(ISD::FCEIL, MVT::f64, Legal);
@@ -221,18 +213,38 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
setOperationAction(ISD::FRINT, MVT::f64, Legal);
}
- // FIXME: These should be removed and handled the same was as f32 fneg. Source
- // modifiers also work for the double instructions.
- setOperationAction(ISD::FNEG, MVT::f64, Expand);
- setOperationAction(ISD::FABS, MVT::f64, Expand);
-
setOperationAction(ISD::FDIV, MVT::f32, Custom);
+ setTargetDAGCombine(ISD::FADD);
+ setTargetDAGCombine(ISD::FSUB);
+ setTargetDAGCombine(ISD::FMINNUM);
+ setTargetDAGCombine(ISD::FMAXNUM);
setTargetDAGCombine(ISD::SELECT_CC);
setTargetDAGCombine(ISD::SETCC);
-
+ setTargetDAGCombine(ISD::AND);
+ setTargetDAGCombine(ISD::OR);
setTargetDAGCombine(ISD::UINT_TO_FP);
+ // All memory operations. Some folding on the pointer operand is done to help
+ // matching the constant offsets in the addressing modes.
+ setTargetDAGCombine(ISD::LOAD);
+ setTargetDAGCombine(ISD::STORE);
+ setTargetDAGCombine(ISD::ATOMIC_LOAD);
+ setTargetDAGCombine(ISD::ATOMIC_STORE);
+ setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP);
+ setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS);
+ setTargetDAGCombine(ISD::ATOMIC_SWAP);
+ setTargetDAGCombine(ISD::ATOMIC_LOAD_ADD);
+ setTargetDAGCombine(ISD::ATOMIC_LOAD_SUB);
+ setTargetDAGCombine(ISD::ATOMIC_LOAD_AND);
+ setTargetDAGCombine(ISD::ATOMIC_LOAD_OR);
+ setTargetDAGCombine(ISD::ATOMIC_LOAD_XOR);
+ setTargetDAGCombine(ISD::ATOMIC_LOAD_NAND);
+ setTargetDAGCombine(ISD::ATOMIC_LOAD_MIN);
+ setTargetDAGCombine(ISD::ATOMIC_LOAD_MAX);
+ setTargetDAGCombine(ISD::ATOMIC_LOAD_UMIN);
+ setTargetDAGCombine(ISD::ATOMIC_LOAD_UMAX);
+
setSchedulingPreference(Sched::RegPressure);
}
@@ -240,15 +252,63 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
// TargetLowering queries
//===----------------------------------------------------------------------===//
-bool SITargetLowering::allowsUnalignedMemoryAccesses(EVT VT,
- unsigned AddrSpace,
- bool *IsFast) const {
+bool SITargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &,
+ EVT) const {
+ // SI has some legal vector types, but no legal vector operations. Say no
+ // shuffles are legal in order to prefer scalarizing some vector operations.
+ return false;
+}
+
+// FIXME: This really needs an address space argument. The immediate offset
+// size is different for different sets of memory instruction sets.
+
+// The single offset DS instructions have a 16-bit unsigned byte offset.
+//
+// MUBUF / MTBUF have a 12-bit unsigned byte offset, and additionally can do r +
+// r + i with addr64. 32-bit has more addressing mode options. Depending on the
+// resource constant, it can also do (i64 r0) + (i32 r1) * (i14 i).
+//
+// SMRD instructions have an 8-bit, dword offset.
+//
+bool SITargetLowering::isLegalAddressingMode(const AddrMode &AM,
+ Type *Ty) const {
+ // No global is ever allowed as a base.
+ if (AM.BaseGV)
+ return false;
+
+ // Allow a 16-bit unsigned immediate field, since this is what DS instructions
+ // use.
+ if (!isUInt<16>(AM.BaseOffs))
+ return false;
+
+ // Only support r+r,
+ switch (AM.Scale) {
+ case 0: // "r+i" or just "i", depending on HasBaseReg.
+ break;
+ case 1:
+ if (AM.HasBaseReg && AM.BaseOffs) // "r+r+i" is not allowed.
+ return false;
+ // Otherwise we have r+r or r+i.
+ break;
+ case 2:
+ if (AM.HasBaseReg || AM.BaseOffs) // 2*r+r or 2*r+i is not allowed.
+ return false;
+ // Allow 2*r as r+r.
+ break;
+ default: // Don't allow n * r
+ return false;
+ }
+
+ return true;
+}
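// A minimal sketch of how the checks above classify a few sample addressing
// modes (the concrete offsets are hypothetical; AddrMode is the
// TargetLowering struct the function receives):
//
//   base reg + 40             -> legal    (Scale == 0, offset fits in 16 bits)
//   base reg + index reg      -> legal    (Scale == 1, no immediate)
//   base reg + index reg + 40 -> rejected (r+r+i is not allowed)
//   2 * reg                   -> legal    (Scale == 2, treated as r+r)
//   3 * reg or a global base  -> rejected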
+
+bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
+ unsigned AddrSpace,
+ unsigned Align,
+ bool *IsFast) const {
if (IsFast)
*IsFast = false;
- // XXX: This depends on the address space and also we may want to revist
- // the alignment values we specify in the DataLayout.
-
// TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96,
// which isn't a simple VT.
if (!VT.isSimple() || VT == MVT::Other)
@@ -261,8 +321,12 @@ bool SITargetLowering::allowsUnalignedMemoryAccesses(EVT VT,
// XXX - The only mention I see of this in the ISA manual is for LDS direct
// reads the "byte address and must be dword aligned". Is it also true for the
// normal loads and stores?
- if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS)
- return false;
+ if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS) {
+ // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte
+ // aligned, 8 byte access in a single operation using ds_read2/write2_b32
+ // with adjacent offsets.
+ return Align % 4 == 0;
+ }
// 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
// byte-address are ignored, thus forcing Dword alignment.
@@ -272,6 +336,26 @@ bool SITargetLowering::allowsUnalignedMemoryAccesses(EVT VT,
return VT.bitsGT(MVT::i32);
}
+EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
+ unsigned SrcAlign, bool IsMemset,
+ bool ZeroMemset,
+ bool MemcpyStrSrc,
+ MachineFunction &MF) const {
+ // FIXME: Should account for address space here.
+
+ // The default fallback uses the private pointer size as a guess for a type to
+ // use. Make sure we switch these to 64-bit accesses.
+
+ if (Size >= 16 && DstAlign >= 4) // XXX: Should only do for global
+ return MVT::v4i32;
+
+ if (Size >= 8 && DstAlign >= 4)
+ return MVT::v2i32;
+
+ // Use the default.
+ return MVT::Other;
+}
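// A minimal sketch of the resulting memcpy/memset type selection, assuming
// the hypothetical sizes and alignments below:
//
//   Size = 64, DstAlign = 4 -> MVT::v4i32 (copy in 16-byte chunks)
//   Size = 8,  DstAlign = 4 -> MVT::v2i32 (copy in 8-byte chunks)
//   Size = 8,  DstAlign = 1 -> MVT::Other (fall back to the default choice)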
+
TargetLoweringBase::LegalizeTypeAction
SITargetLowering::getPreferredVectorAction(EVT VT) const {
if (VT.getVectorNumElements() != 1 && VT.getScalarType().bitsLE(MVT::i16))
@@ -282,25 +366,37 @@ SITargetLowering::getPreferredVectorAction(EVT VT) const {
bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
Type *Ty) const {
- const SIInstrInfo *TII =
- static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
+ const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(
+ getTargetMachine().getSubtargetImpl()->getInstrInfo());
return TII->isInlineConstant(Imm);
}
SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
- SDLoc DL, SDValue Chain,
+ SDLoc SL, SDValue Chain,
unsigned Offset, bool Signed) const {
+ const DataLayout *DL = getDataLayout();
+ MachineFunction &MF = DAG.getMachineFunction();
+ const SIRegisterInfo *TRI =
+ static_cast<const SIRegisterInfo*>(Subtarget->getRegisterInfo());
+ unsigned InputPtrReg = TRI->getPreloadedValue(MF, SIRegisterInfo::INPUT_PTR);
+
+ Type *Ty = VT.getTypeForEVT(*DAG.getContext());
+
MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
- PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
- AMDGPUAS::CONSTANT_ADDRESS);
- SDValue BasePtr = DAG.getCopyFromReg(Chain, DL,
- MRI.getLiveInVirtReg(AMDGPU::SGPR0_SGPR1), MVT::i64);
- SDValue Ptr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
+ PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
+ SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,
+ MRI.getLiveInVirtReg(InputPtrReg), MVT::i64);
+ SDValue Ptr = DAG.getNode(ISD::ADD, SL, MVT::i64, BasePtr,
DAG.getConstant(Offset, MVT::i64));
- return DAG.getExtLoad(Signed ? ISD::SEXTLOAD : ISD::ZEXTLOAD, DL, VT, Chain, Ptr,
- MachinePointerInfo(UndefValue::get(PtrTy)), MemVT,
- false, false, MemVT.getSizeInBits() >> 3);
-
+ SDValue PtrOffset = DAG.getUNDEF(getPointerTy(AMDGPUAS::CONSTANT_ADDRESS));
+ MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
+
+ return DAG.getLoad(ISD::UNINDEXED, Signed ? ISD::SEXTLOAD : ISD::ZEXTLOAD,
+ VT, SL, Chain, Ptr, PtrOffset, PtrInfo, MemVT,
+ false, // isVolatile
+ true, // isNonTemporal
+ true, // isInvariant
+ DL->getABITypeAlignment(Ty)); // Alignment
}
SDValue SITargetLowering::LowerFormalArguments(
@@ -311,7 +407,9 @@ SDValue SITargetLowering::LowerFormalArguments(
SDLoc DL, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const {
- const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
+ const TargetMachine &TM = getTargetMachine();
+ const SIRegisterInfo *TRI =
+ static_cast<const SIRegisterInfo*>(TM.getSubtargetImpl()->getRegisterInfo());
MachineFunction &MF = DAG.getMachineFunction();
FunctionType *FType = MF.getFunction()->getFunctionType();
@@ -320,7 +418,7 @@ SDValue SITargetLowering::LowerFormalArguments(
assert(CallConv == CallingConv::C);
SmallVector<ISD::InputArg, 16> Splits;
- uint32_t Skipped = 0;
+ BitVector Skipped(Ins.size());
for (unsigned i = 0, e = Ins.size(), PSInputNum = 0; i != e; ++i) {
const ISD::InputArg &Arg = Ins[i];
@@ -333,7 +431,7 @@ SDValue SITargetLowering::LowerFormalArguments(
if (!Arg.Used) {
// We can safely skip PS inputs
- Skipped |= 1 << i;
+ Skipped.set(i);
++PSInputNum;
continue;
}
@@ -364,8 +462,8 @@ SDValue SITargetLowering::LowerFormalArguments(
}
SmallVector<CCValAssign, 16> ArgLocs;
- CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
- getTargetMachine(), ArgLocs, *DAG.getContext());
+ CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
+ *DAG.getContext());
// At least one interpolation mode must be enabled or else the GPU will hang.
if (Info->getShaderType() == ShaderType::PIXEL &&
@@ -378,13 +476,31 @@ SDValue SITargetLowering::LowerFormalArguments(
// The pointer to the list of arguments is stored in SGPR0, SGPR1
// The pointer to the scratch buffer is stored in SGPR2, SGPR3
if (Info->getShaderType() == ShaderType::COMPUTE) {
- Info->NumUserSGPRs = 4;
- CCInfo.AllocateReg(AMDGPU::SGPR0);
- CCInfo.AllocateReg(AMDGPU::SGPR1);
- CCInfo.AllocateReg(AMDGPU::SGPR2);
- CCInfo.AllocateReg(AMDGPU::SGPR3);
- MF.addLiveIn(AMDGPU::SGPR0_SGPR1, &AMDGPU::SReg_64RegClass);
- MF.addLiveIn(AMDGPU::SGPR2_SGPR3, &AMDGPU::SReg_64RegClass);
+ if (Subtarget->isAmdHsaOS())
+ Info->NumUserSGPRs = 2; // FIXME: Need to support scratch buffers.
+ else
+ Info->NumUserSGPRs = 4;
+
+ unsigned InputPtrReg =
+ TRI->getPreloadedValue(MF, SIRegisterInfo::INPUT_PTR);
+ unsigned InputPtrRegLo =
+ TRI->getPhysRegSubReg(InputPtrReg, &AMDGPU::SReg_32RegClass, 0);
+ unsigned InputPtrRegHi =
+ TRI->getPhysRegSubReg(InputPtrReg, &AMDGPU::SReg_32RegClass, 1);
+
+ unsigned ScratchPtrReg =
+ TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_PTR);
+ unsigned ScratchPtrRegLo =
+ TRI->getPhysRegSubReg(ScratchPtrReg, &AMDGPU::SReg_32RegClass, 0);
+ unsigned ScratchPtrRegHi =
+ TRI->getPhysRegSubReg(ScratchPtrReg, &AMDGPU::SReg_32RegClass, 1);
+
+ CCInfo.AllocateReg(InputPtrRegLo);
+ CCInfo.AllocateReg(InputPtrRegHi);
+ CCInfo.AllocateReg(ScratchPtrRegLo);
+ CCInfo.AllocateReg(ScratchPtrRegHi);
+ MF.addLiveIn(InputPtrReg, &AMDGPU::SReg_64RegClass);
+ MF.addLiveIn(ScratchPtrReg, &AMDGPU::SReg_64RegClass);
}
if (Info->getShaderType() == ShaderType::COMPUTE) {
@@ -397,23 +513,36 @@ SDValue SITargetLowering::LowerFormalArguments(
for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
const ISD::InputArg &Arg = Ins[i];
- if (Skipped & (1 << i)) {
+ if (Skipped[i]) {
InVals.push_back(DAG.getUNDEF(Arg.VT));
continue;
}
CCValAssign &VA = ArgLocs[ArgIdx++];
- EVT VT = VA.getLocVT();
+ MVT VT = VA.getLocVT();
if (VA.isMemLoc()) {
VT = Ins[i].VT;
EVT MemVT = Splits[i].VT;
+ const unsigned Offset = 36 + VA.getLocMemOffset();
// The first 36 bytes of the input buffer contain information about
// thread group and global sizes.
SDValue Arg = LowerParameter(DAG, VT, MemVT, DL, DAG.getRoot(),
- 36 + VA.getLocMemOffset(),
- Ins[i].Flags.isSExt());
+ Offset, Ins[i].Flags.isSExt());
+
+ const PointerType *ParamTy =
+ dyn_cast<PointerType>(FType->getParamType(Ins[i].OrigArgIndex));
+ if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
+ ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
+ // On SI local pointers are just offsets into LDS, so they are always
+ // less than 16-bits. On CI and newer they could potentially be
+ // real pointers, so we can't guarantee their size.
+ Arg = DAG.getNode(ISD::AssertZext, DL, Arg.getValueType(), Arg,
+ DAG.getValueType(MVT::i16));
+ }
+
InVals.push_back(Arg);
+ Info->ABIArgOffset = Offset + MemVT.getStoreSize();
continue;
}
assert(VA.isRegLoc() && "Parameter must be in a register!");
@@ -466,69 +595,13 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
MachineInstr * MI, MachineBasicBlock * BB) const {
MachineBasicBlock::iterator I = *MI;
- const SIInstrInfo *TII =
- static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
- MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+ const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(
+ getTargetMachine().getSubtargetImpl()->getInstrInfo());
switch (MI->getOpcode()) {
default:
return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
case AMDGPU::BRANCH: return BB;
- case AMDGPU::SI_ADDR64_RSRC: {
- unsigned SuperReg = MI->getOperand(0).getReg();
- unsigned SubRegLo = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass);
- unsigned SubRegHi = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass);
- unsigned SubRegHiHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
- unsigned SubRegHiLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
- BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B64), SubRegLo)
- .addOperand(MI->getOperand(1));
- BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32), SubRegHiLo)
- .addImm(0);
- BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32), SubRegHiHi)
- .addImm(AMDGPU::RSRC_DATA_FORMAT >> 32);
- BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::REG_SEQUENCE), SubRegHi)
- .addReg(SubRegHiLo)
- .addImm(AMDGPU::sub0)
- .addReg(SubRegHiHi)
- .addImm(AMDGPU::sub1);
- BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::REG_SEQUENCE), SuperReg)
- .addReg(SubRegLo)
- .addImm(AMDGPU::sub0_sub1)
- .addReg(SubRegHi)
- .addImm(AMDGPU::sub2_sub3);
- MI->eraseFromParent();
- break;
- }
- case AMDGPU::SI_BUFFER_RSRC: {
- unsigned SuperReg = MI->getOperand(0).getReg();
- unsigned Args[4];
- for (unsigned i = 0, e = 4; i < e; ++i) {
- MachineOperand &Arg = MI->getOperand(i + 1);
-
- if (Arg.isReg()) {
- Args[i] = Arg.getReg();
- continue;
- }
-
- assert(Arg.isImm());
- unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
- BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32), Reg)
- .addImm(Arg.getImm());
- Args[i] = Reg;
- }
- BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::REG_SEQUENCE),
- SuperReg)
- .addReg(Args[0])
- .addImm(AMDGPU::sub0)
- .addReg(Args[1])
- .addImm(AMDGPU::sub1)
- .addReg(Args[2])
- .addImm(AMDGPU::sub2)
- .addReg(Args[3])
- .addImm(AMDGPU::sub3);
- MI->eraseFromParent();
- break;
- }
case AMDGPU::V_SUB_F64: {
unsigned DestReg = MI->getOperand(0).getReg();
BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_ADD_F64), DestReg)
@@ -536,8 +609,6 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
.addReg(MI->getOperand(1).getReg())
.addImm(1) // SRC1 modifiers
.addReg(MI->getOperand(2).getReg())
- .addImm(0) // SRC2 modifiers
- .addImm(0) // src2
.addImm(0) // CLAMP
.addImm(0); // OMOD
MI->eraseFromParent();
@@ -555,58 +626,15 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
MI->eraseFromParent();
break;
}
- case AMDGPU::FABS_SI: {
- MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
- const SIInstrInfo *TII =
- static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
- unsigned Reg = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);
- BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32),
- Reg)
- .addImm(0x7fffffff);
- BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_AND_B32_e32),
- MI->getOperand(0).getReg())
- .addReg(MI->getOperand(1).getReg())
- .addReg(Reg);
- MI->eraseFromParent();
- break;
- }
- case AMDGPU::FNEG_SI: {
- MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
- const SIInstrInfo *TII =
- static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
- unsigned Reg = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);
- BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32),
- Reg)
- .addImm(0x80000000);
- BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_XOR_B32_e32),
- MI->getOperand(0).getReg())
- .addReg(MI->getOperand(1).getReg())
- .addReg(Reg);
- MI->eraseFromParent();
- break;
- }
- case AMDGPU::FCLAMP_SI: {
- const SIInstrInfo *TII =
- static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
- BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_ADD_F32_e64),
- MI->getOperand(0).getReg())
- .addImm(0) // SRC0 modifiers
- .addOperand(MI->getOperand(1))
- .addImm(0) // SRC1 modifiers
- .addImm(0) // SRC1
- .addImm(1) // CLAMP
- .addImm(0); // OMOD
- MI->eraseFromParent();
- }
}
return BB;
}
-EVT SITargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
+EVT SITargetLowering::getSetCCResultType(LLVMContext &Ctx, EVT VT) const {
if (!VT.isVector()) {
return MVT::i1;
}
- return MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
+ return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
}
MVT SITargetLowering::getScalarShiftAmountTy(EVT VT) const {
@@ -636,8 +664,6 @@ bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
//===----------------------------------------------------------------------===//
SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
- MachineFunction &MF = DAG.getMachineFunction();
- SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
switch (Op.getOpcode()) {
default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
case ISD::FrameIndex: return LowerFrameIndex(Op, DAG);
@@ -656,114 +682,13 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::SELECT: return LowerSELECT(Op, DAG);
case ISD::FDIV: return LowerFDIV(Op, DAG);
case ISD::STORE: return LowerSTORE(Op, DAG);
- case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
- case ISD::INTRINSIC_WO_CHAIN: {
- unsigned IntrinsicID =
- cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
- EVT VT = Op.getValueType();
- SDLoc DL(Op);
- switch (IntrinsicID) {
- default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
- case Intrinsic::r600_read_ngroups_x:
- return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 0, false);
- case Intrinsic::r600_read_ngroups_y:
- return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 4, false);
- case Intrinsic::r600_read_ngroups_z:
- return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 8, false);
- case Intrinsic::r600_read_global_size_x:
- return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 12, false);
- case Intrinsic::r600_read_global_size_y:
- return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 16, false);
- case Intrinsic::r600_read_global_size_z:
- return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 20, false);
- case Intrinsic::r600_read_local_size_x:
- return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 24, false);
- case Intrinsic::r600_read_local_size_y:
- return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 28, false);
- case Intrinsic::r600_read_local_size_z:
- return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 32, false);
- case Intrinsic::r600_read_tgid_x:
- return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
- AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 0), VT);
- case Intrinsic::r600_read_tgid_y:
- return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
- AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 1), VT);
- case Intrinsic::r600_read_tgid_z:
- return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
- AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 2), VT);
- case Intrinsic::r600_read_tidig_x:
- return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass,
- AMDGPU::VGPR0, VT);
- case Intrinsic::r600_read_tidig_y:
- return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass,
- AMDGPU::VGPR1, VT);
- case Intrinsic::r600_read_tidig_z:
- return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass,
- AMDGPU::VGPR2, VT);
- case AMDGPUIntrinsic::SI_load_const: {
- SDValue Ops [] = {
- Op.getOperand(1),
- Op.getOperand(2)
- };
-
- MachineMemOperand *MMO = MF.getMachineMemOperand(
- MachinePointerInfo(),
- MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant,
- VT.getSizeInBits() / 8, 4);
- return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL,
- Op->getVTList(), Ops, VT, MMO);
- }
- case AMDGPUIntrinsic::SI_sample:
- return LowerSampleIntrinsic(AMDGPUISD::SAMPLE, Op, DAG);
- case AMDGPUIntrinsic::SI_sampleb:
- return LowerSampleIntrinsic(AMDGPUISD::SAMPLEB, Op, DAG);
- case AMDGPUIntrinsic::SI_sampled:
- return LowerSampleIntrinsic(AMDGPUISD::SAMPLED, Op, DAG);
- case AMDGPUIntrinsic::SI_samplel:
- return LowerSampleIntrinsic(AMDGPUISD::SAMPLEL, Op, DAG);
- case AMDGPUIntrinsic::SI_vs_load_input:
- return DAG.getNode(AMDGPUISD::LOAD_INPUT, DL, VT,
- Op.getOperand(1),
- Op.getOperand(2),
- Op.getOperand(3));
- }
+ case ISD::GlobalAddress: {
+ MachineFunction &MF = DAG.getMachineFunction();
+ SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ return LowerGlobalAddress(MFI, Op, DAG);
}
-
- case ISD::INTRINSIC_VOID:
- SDValue Chain = Op.getOperand(0);
- unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
-
- switch (IntrinsicID) {
- case AMDGPUIntrinsic::SI_tbuffer_store: {
- SDLoc DL(Op);
- SDValue Ops [] = {
- Chain,
- Op.getOperand(2),
- Op.getOperand(3),
- Op.getOperand(4),
- Op.getOperand(5),
- Op.getOperand(6),
- Op.getOperand(7),
- Op.getOperand(8),
- Op.getOperand(9),
- Op.getOperand(10),
- Op.getOperand(11),
- Op.getOperand(12),
- Op.getOperand(13),
- Op.getOperand(14)
- };
- EVT VT = Op.getOperand(3).getValueType();
-
- MachineMemOperand *MMO = MF.getMachineMemOperand(
- MachinePointerInfo(),
- MachineMemOperand::MOStore,
- VT.getSizeInBits() / 8, 4);
- return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT, DL,
- Op->getVTList(), Ops, VT, MMO);
- }
- default:
- break;
- }
+ case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
+ case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG);
}
return SDValue();
}
@@ -786,16 +711,9 @@ static SDNode *findUser(SDValue Value, unsigned Opcode) {
SDValue SITargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const {
- MachineFunction &MF = DAG.getMachineFunction();
- const SIInstrInfo *TII =
- static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
- const SIRegisterInfo &TRI = TII->getRegisterInfo();
FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Op);
unsigned FrameIndex = FINode->getIndex();
- CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
- TRI.getPreloadedValue(MF, SIRegisterInfo::SCRATCH_WAVE_OFFSET), MVT::i32);
-
return DAG.getTargetFrameIndex(FrameIndex, MVT::i32);
}
@@ -849,7 +767,9 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
BR->getOperand(0),
BRCOND.getOperand(2)
};
- DAG.MorphNodeTo(BR, ISD::BR, BR->getVTList(), Ops);
+ SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
+ DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
+ BR = NewBR.getNode();
}
SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
@@ -905,6 +825,139 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Lo, Hi);
}
+SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
+ SelectionDAG &DAG) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ const SIRegisterInfo *TRI =
+ static_cast<const SIRegisterInfo*>(MF.getSubtarget().getRegisterInfo());
+
+ EVT VT = Op.getValueType();
+ SDLoc DL(Op);
+ unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+
+ switch (IntrinsicID) {
+ case Intrinsic::r600_read_ngroups_x:
+ return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
+ SI::KernelInputOffsets::NGROUPS_X, false);
+ case Intrinsic::r600_read_ngroups_y:
+ return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
+ SI::KernelInputOffsets::NGROUPS_Y, false);
+ case Intrinsic::r600_read_ngroups_z:
+ return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
+ SI::KernelInputOffsets::NGROUPS_Z, false);
+ case Intrinsic::r600_read_global_size_x:
+ return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
+ SI::KernelInputOffsets::GLOBAL_SIZE_X, false);
+ case Intrinsic::r600_read_global_size_y:
+ return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
+ SI::KernelInputOffsets::GLOBAL_SIZE_Y, false);
+ case Intrinsic::r600_read_global_size_z:
+ return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
+ SI::KernelInputOffsets::GLOBAL_SIZE_Z, false);
+ case Intrinsic::r600_read_local_size_x:
+ return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
+ SI::KernelInputOffsets::LOCAL_SIZE_X, false);
+ case Intrinsic::r600_read_local_size_y:
+ return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
+ SI::KernelInputOffsets::LOCAL_SIZE_Y, false);
+ case Intrinsic::r600_read_local_size_z:
+ return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
+ SI::KernelInputOffsets::LOCAL_SIZE_Z, false);
+
+ case Intrinsic::AMDGPU_read_workdim:
+ return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
+ MF.getInfo<SIMachineFunctionInfo>()->ABIArgOffset,
+ false);
+
+ case Intrinsic::r600_read_tgid_x:
+ return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
+ TRI->getPreloadedValue(MF, SIRegisterInfo::TGID_X), VT);
+ case Intrinsic::r600_read_tgid_y:
+ return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
+ TRI->getPreloadedValue(MF, SIRegisterInfo::TGID_Y), VT);
+ case Intrinsic::r600_read_tgid_z:
+ return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
+ TRI->getPreloadedValue(MF, SIRegisterInfo::TGID_Z), VT);
+ case Intrinsic::r600_read_tidig_x:
+ return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass,
+ TRI->getPreloadedValue(MF, SIRegisterInfo::TIDIG_X), VT);
+ case Intrinsic::r600_read_tidig_y:
+ return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass,
+ TRI->getPreloadedValue(MF, SIRegisterInfo::TIDIG_Y), VT);
+ case Intrinsic::r600_read_tidig_z:
+ return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass,
+ TRI->getPreloadedValue(MF, SIRegisterInfo::TIDIG_Z), VT);
+ case AMDGPUIntrinsic::SI_load_const: {
+ SDValue Ops[] = {
+ Op.getOperand(1),
+ Op.getOperand(2)
+ };
+
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ MachinePointerInfo(),
+ MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant,
+ VT.getStoreSize(), 4);
+ return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL,
+ Op->getVTList(), Ops, VT, MMO);
+ }
+ case AMDGPUIntrinsic::SI_sample:
+ return LowerSampleIntrinsic(AMDGPUISD::SAMPLE, Op, DAG);
+ case AMDGPUIntrinsic::SI_sampleb:
+ return LowerSampleIntrinsic(AMDGPUISD::SAMPLEB, Op, DAG);
+ case AMDGPUIntrinsic::SI_sampled:
+ return LowerSampleIntrinsic(AMDGPUISD::SAMPLED, Op, DAG);
+ case AMDGPUIntrinsic::SI_samplel:
+ return LowerSampleIntrinsic(AMDGPUISD::SAMPLEL, Op, DAG);
+ case AMDGPUIntrinsic::SI_vs_load_input:
+ return DAG.getNode(AMDGPUISD::LOAD_INPUT, DL, VT,
+ Op.getOperand(1),
+ Op.getOperand(2),
+ Op.getOperand(3));
+ default:
+ return AMDGPUTargetLowering::LowerOperation(Op, DAG);
+ }
+}
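// A minimal sketch of the SI::KernelInputOffsets values assumed above; the
// numbers mirror the hard-coded byte offsets the replaced code passed to
// LowerParameter (the 36-byte header that precedes the explicit kernel
// arguments).
namespace SI {
namespace KernelInputOffsets {
enum Offsets {
  NGROUPS_X = 0,
  NGROUPS_Y = 4,
  NGROUPS_Z = 8,
  GLOBAL_SIZE_X = 12,
  GLOBAL_SIZE_Y = 16,
  GLOBAL_SIZE_Z = 20,
  LOCAL_SIZE_X = 24,
  LOCAL_SIZE_Y = 28,
  LOCAL_SIZE_Z = 32
};
}
}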
+
+SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
+ SelectionDAG &DAG) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ SDValue Chain = Op.getOperand(0);
+ unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+
+ switch (IntrinsicID) {
+ case AMDGPUIntrinsic::SI_tbuffer_store: {
+ SDLoc DL(Op);
+ SDValue Ops[] = {
+ Chain,
+ Op.getOperand(2),
+ Op.getOperand(3),
+ Op.getOperand(4),
+ Op.getOperand(5),
+ Op.getOperand(6),
+ Op.getOperand(7),
+ Op.getOperand(8),
+ Op.getOperand(9),
+ Op.getOperand(10),
+ Op.getOperand(11),
+ Op.getOperand(12),
+ Op.getOperand(13),
+ Op.getOperand(14)
+ };
+
+ EVT VT = Op.getOperand(3).getValueType();
+
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ MachinePointerInfo(),
+ MachineMemOperand::MOStore,
+ VT.getStoreSize(), 4);
+ return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT, DL,
+ Op->getVTList(), Ops, VT, MMO);
+ }
+ default:
+ return SDValue();
+ }
+}
+
SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
SDLoc DL(Op);
LoadSDNode *Load = cast<LoadSDNode>(Op);
@@ -923,7 +976,7 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
break;
// fall-through
case AMDGPUAS::LOCAL_ADDRESS:
- return SplitVectorLoad(Op, DAG);
+ return ScalarizeVectorLoad(Op, DAG);
}
}
@@ -1027,7 +1080,7 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
const APFloat K1Val(BitsToFloat(0x2f800000));
const SDValue K1 = DAG.getConstantFP(K1Val, MVT::f32);
- const SDValue One = DAG.getTargetConstantFP(1.0, MVT::f32);
+ const SDValue One = DAG.getConstantFP(1.0, MVT::f32);
EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f32);
@@ -1073,7 +1126,7 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
if (Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
if (VT.isVector() && VT.getVectorNumElements() > 4)
- return SplitVectorStore(Op, DAG);
+ return ScalarizeVectorStore(Op, DAG);
return SDValue();
}
@@ -1082,7 +1135,7 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
return Ret;
if (VT.isVector() && VT.getVectorNumElements() >= 8)
- return SplitVectorStore(Op, DAG);
+ return ScalarizeVectorStore(Op, DAG);
if (VT == MVT::i1)
return DAG.getTruncStore(Store->getChain(), DL,
@@ -1114,7 +1167,7 @@ SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
//===----------------------------------------------------------------------===//
SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N,
- DAGCombinerInfo &DCI) {
+ DAGCombinerInfo &DCI) const {
EVT VT = N->getValueType(0);
EVT ScalarVT = VT.getScalarType();
if (ScalarVT != MVT::f32)
@@ -1162,8 +1215,21 @@ SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N,
EVT LoadVT = getEquivalentMemType(*DAG.getContext(), SrcVT);
EVT RegVT = getEquivalentLoadRegType(*DAG.getContext(), SrcVT);
EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32, NElts);
-
LoadSDNode *Load = cast<LoadSDNode>(Src);
+
+ unsigned AS = Load->getAddressSpace();
+ unsigned Align = Load->getAlignment();
+ Type *Ty = LoadVT.getTypeForEVT(*DAG.getContext());
+ unsigned ABIAlignment = getDataLayout()->getABITypeAlignment(Ty);
+
+ // Don't try to replace the load if we have to expand it due to alignment
+  // problems. Otherwise we will end up scalarizing the load and then
+  // repacking it into the vector for no real reason.
+ if (Align < ABIAlignment &&
+ !allowsMisalignedMemoryAccesses(LoadVT, AS, Align, nullptr)) {
+ return SDValue();
+ }
+
SDValue NewLoad = DAG.getExtLoad(ISD::ZEXTLOAD, DL, RegVT,
Load->getChain(),
Load->getBasePtr(),
@@ -1203,33 +1269,259 @@ SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N,
return SDValue();
}
+// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
+
+// This is a variant of
+// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
+//
+// The normal DAG combiner will do this, but only if the add has one use, since
+// otherwise it would increase the number of instructions.
+//
+// This prevents us from seeing a constant offset that can be folded into a
+// memory instruction's addressing mode. If we know the resulting add offset of
+// a pointer can be folded into an addressing offset, we can replace the pointer
+// operand with the add of the new constant offset. This eliminates one of the
+// uses, and may allow the remaining use to also be simplified.
+//
+SDValue SITargetLowering::performSHLPtrCombine(SDNode *N,
+ unsigned AddrSpace,
+ DAGCombinerInfo &DCI) const {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+
+ if (N0.getOpcode() != ISD::ADD)
+ return SDValue();
+
+ const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
+ if (!CN1)
+ return SDValue();
+
+ const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1));
+ if (!CAdd)
+ return SDValue();
+
+ const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(
+ getTargetMachine().getSubtargetImpl()->getInstrInfo());
+
+ // If the resulting offset is too large, we can't fold it into the addressing
+ // mode offset.
+ APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
+ if (!TII->canFoldOffset(Offset.getZExtValue(), AddrSpace))
+ return SDValue();
+
+ SelectionDAG &DAG = DCI.DAG;
+ SDLoc SL(N);
+ EVT VT = N->getValueType(0);
+
+ SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
+ SDValue COffset = DAG.getConstant(Offset, MVT::i32);
+
+ return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset);
+}
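
performSHLPtrCombine is only legal because a left shift distributes over addition in modular arithmetic, (x + c1) << c2 == (x << c2) + (c1 << c2), and it is only profitable when canFoldOffset accepts the shifted constant as an addressing-mode immediate. A quick standalone check of that identity, in plain unsigned arithmetic rather than SelectionDAG nodes (the helper names are illustrative):

#include <cassert>
#include <cstdint>

// (x + c1) << c2  ==  (x << c2) + (c1 << c2), modulo 2^64.
static uint64_t shlOfAdd(uint64_t X, uint64_t C1, unsigned C2) {
  return (X + C1) << C2;
}

static uint64_t addOfShl(uint64_t X, uint64_t C1, unsigned C2) {
  return (X << C2) + (C1 << C2);
}

int main() {
  const uint64_t Xs[] = {0, 1, 0xdeadbeef, ~0ull};
  const uint64_t C1s[] = {4, 16, 1024};
  for (uint64_t X : Xs)
    for (uint64_t C1 : C1s)
      for (unsigned C2 = 0; C2 < 8; ++C2)
        assert(shlOfAdd(X, C1, C2) == addOfShl(X, C1, C2));
  return 0;
}

After the rewrite the shifted constant sits right next to the memory operation, where the addressing mode can pick it up during selection.
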
+
+SDValue SITargetLowering::performAndCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ if (DCI.isBeforeLegalize())
+ return SDValue();
+
+ SelectionDAG &DAG = DCI.DAG;
+
+ // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
+ // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+
+ if (LHS.getOpcode() == ISD::SETCC &&
+ RHS.getOpcode() == ISD::SETCC) {
+ ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
+ ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();
+
+ SDValue X = LHS.getOperand(0);
+ SDValue Y = RHS.getOperand(0);
+ if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X)
+ return SDValue();
+
+ if (LCC == ISD::SETO) {
+ if (X != LHS.getOperand(1))
+ return SDValue();
+
+ if (RCC == ISD::SETUNE) {
+ const ConstantFPSDNode *C1 = dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
+ if (!C1 || !C1->isInfinity() || C1->isNegative())
+ return SDValue();
+
+ const uint32_t Mask = SIInstrFlags::N_NORMAL |
+ SIInstrFlags::N_SUBNORMAL |
+ SIInstrFlags::N_ZERO |
+ SIInstrFlags::P_ZERO |
+ SIInstrFlags::P_SUBNORMAL |
+ SIInstrFlags::P_NORMAL;
+
+ static_assert(((~(SIInstrFlags::S_NAN |
+ SIInstrFlags::Q_NAN |
+ SIInstrFlags::N_INFINITY |
+ SIInstrFlags::P_INFINITY)) & 0x3ff) == Mask,
+ "mask not equal");
+
+ return DAG.getNode(AMDGPUISD::FP_CLASS, SDLoc(N), MVT::i1,
+ X, DAG.getConstant(Mask, MVT::i32));
+ }
+ }
+ }
+
+ return SDValue();
+}
+
+SDValue SITargetLowering::performOrCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+
+ // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
+ if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
+ RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
+ SDValue Src = LHS.getOperand(0);
+ if (Src != RHS.getOperand(0))
+ return SDValue();
+
+ const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
+ const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
+ if (!CLHS || !CRHS)
+ return SDValue();
+
+ // Only 10 bits are used.
+ static const uint32_t MaxMask = 0x3ff;
+
+ uint32_t NewMask = (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
+ return DAG.getNode(AMDGPUISD::FP_CLASS, SDLoc(N), MVT::i1,
+ Src, DAG.getConstant(NewMask, MVT::i32));
+ }
+
+ return SDValue();
+}
+
+SDValue SITargetLowering::performClassCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
+ SDValue Mask = N->getOperand(1);
+
+ // fp_class x, 0 -> false
+ if (const ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Mask)) {
+ if (CMask->isNullValue())
+ return DAG.getConstant(0, MVT::i1);
+ }
+
+ return SDValue();
+}
+
+static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
+ switch (Opc) {
+ case ISD::FMAXNUM:
+ return AMDGPUISD::FMAX3;
+ case AMDGPUISD::SMAX:
+ return AMDGPUISD::SMAX3;
+ case AMDGPUISD::UMAX:
+ return AMDGPUISD::UMAX3;
+ case ISD::FMINNUM:
+ return AMDGPUISD::FMIN3;
+ case AMDGPUISD::SMIN:
+ return AMDGPUISD::SMIN3;
+ case AMDGPUISD::UMIN:
+ return AMDGPUISD::UMIN3;
+ default:
+ llvm_unreachable("Not a min/max opcode");
+ }
+}
+
+SDValue SITargetLowering::performMin3Max3Combine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
+
+ unsigned Opc = N->getOpcode();
+ SDValue Op0 = N->getOperand(0);
+ SDValue Op1 = N->getOperand(1);
+
+  // Only do this if the inner op has one use since this will just increase
+ // register pressure for no benefit.
+
+ // max(max(a, b), c)
+ if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
+ SDLoc DL(N);
+ return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
+ DL,
+ N->getValueType(0),
+ Op0.getOperand(0),
+ Op0.getOperand(1),
+ Op1);
+ }
+
+ // max(a, max(b, c))
+ if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
+ SDLoc DL(N);
+ return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
+ DL,
+ N->getValueType(0),
+ Op0,
+ Op1.getOperand(0),
+ Op1.getOperand(1));
+ }
+
+ return SDValue();
+}
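
The flattening only fires when the inner node has a single use; the result is one three-operand node instead of two chained two-operand ones. A rough scalar analogue, ignoring the NaN behaviour that ISD::FMAXNUM/FMINNUM imply (fmax3 below is just a stand-in name for what the hardware 3-operand max computes):

#include <algorithm>
#include <cassert>

static float fmax3(float A, float B, float C) {
  return std::max(std::max(A, B), C); // what a 3-operand max reduces to
}

int main() {
  const float A = 1.0f, B = -2.5f, C = 7.0f;
  // Both association orders collapse to the same 3-operand form.
  assert(std::max(std::max(A, B), C) == fmax3(A, B, C));
  assert(std::max(A, std::max(B, C)) == fmax3(A, B, C));
  return 0;
}
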
+
+SDValue SITargetLowering::performSetCCCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
+ SDLoc SL(N);
+
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ EVT VT = LHS.getValueType();
+
+ if (VT != MVT::f32 && VT != MVT::f64)
+ return SDValue();
+
+ // Match isinf pattern
+ // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
+ ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
+ if (CC == ISD::SETOEQ && LHS.getOpcode() == ISD::FABS) {
+ const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
+ if (!CRHS)
+ return SDValue();
+
+ const APFloat &APF = CRHS->getValueAPF();
+ if (APF.isInfinity() && !APF.isNegative()) {
+ unsigned Mask = SIInstrFlags::P_INFINITY | SIInstrFlags::N_INFINITY;
+ return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1,
+ LHS.getOperand(0), DAG.getConstant(Mask, MVT::i32));
+ }
+ }
+
+ return SDValue();
+}
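
The combine above matches the usual isinf idiom. A scalar model of the pattern being replaced, i.e. what "(fcmp oeq (fabs x), inf)" computes; the fp_class replacement evaluates the same predicate in one instruction, and the mask constants themselves are defined by the hardware, not by this sketch:

#include <cassert>
#include <cmath>
#include <limits>

// True exactly for +inf and -inf; false for finite values and NaNs
// (the 'oeq' comparison is ordered, so NaN operands yield false).
static bool isInfViaFabsCompare(float X) {
  return std::fabs(X) == std::numeric_limits<float>::infinity();
}

int main() {
  const float Inf = std::numeric_limits<float>::infinity();
  assert(isInfViaFabsCompare(Inf));
  assert(isInfViaFabsCompare(-Inf));
  assert(!isInfViaFabsCompare(1.0f));
  assert(!isInfViaFabsCompare(std::numeric_limits<float>::quiet_NaN()));
  return 0;
}
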
+
SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
SDLoc DL(N);
- EVT VT = N->getValueType(0);
switch (N->getOpcode()) {
- default: return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
- case ISD::SETCC: {
- SDValue Arg0 = N->getOperand(0);
- SDValue Arg1 = N->getOperand(1);
- SDValue CC = N->getOperand(2);
- ConstantSDNode * C = nullptr;
- ISD::CondCode CCOp = dyn_cast<CondCodeSDNode>(CC)->get();
-
- // i1 setcc (sext(i1), 0, setne) -> i1 setcc(i1, 0, setne)
- if (VT == MVT::i1
- && Arg0.getOpcode() == ISD::SIGN_EXTEND
- && Arg0.getOperand(0).getValueType() == MVT::i1
- && (C = dyn_cast<ConstantSDNode>(Arg1))
- && C->isNullValue()
- && CCOp == ISD::SETNE) {
- return SimplifySetCC(VT, Arg0.getOperand(0),
- DAG.getConstant(0, MVT::i1), CCOp, true, DCI, DL);
- }
- break;
- }
+ default:
+ return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
+ case ISD::SETCC:
+ return performSetCCCombine(N, DCI);
+ case ISD::FMAXNUM: // TODO: What about fmax_legacy?
+ case ISD::FMINNUM:
+ case AMDGPUISD::SMAX:
+ case AMDGPUISD::SMIN:
+ case AMDGPUISD::UMAX:
+ case AMDGPUISD::UMIN: {
+ if (DCI.getDAGCombineLevel() >= AfterLegalizeDAG &&
+ getTargetMachine().getOptLevel() > CodeGenOpt::None)
+ return performMin3Max3Combine(N, DCI);
+ break;
+ }
case AMDGPUISD::CVT_F32_UBYTE0:
case AMDGPUISD::CVT_F32_UBYTE1:
@@ -1254,22 +1546,155 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
case ISD::UINT_TO_FP: {
return performUCharToFloatCombine(N, DCI);
+
+ case ISD::FADD: {
+ if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
+ break;
+
+ EVT VT = N->getValueType(0);
+ if (VT != MVT::f32)
+ break;
+
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+
+ // These should really be instruction patterns, but writing patterns with
+    // source modifiers is a pain.
+
+ // fadd (fadd (a, a), b) -> mad 2.0, a, b
+ if (LHS.getOpcode() == ISD::FADD) {
+ SDValue A = LHS.getOperand(0);
+ if (A == LHS.getOperand(1)) {
+ const SDValue Two = DAG.getConstantFP(2.0, MVT::f32);
+ return DAG.getNode(AMDGPUISD::MAD, DL, VT, Two, A, RHS);
+ }
+ }
+
+ // fadd (b, fadd (a, a)) -> mad 2.0, a, b
+ if (RHS.getOpcode() == ISD::FADD) {
+ SDValue A = RHS.getOperand(0);
+ if (A == RHS.getOperand(1)) {
+ const SDValue Two = DAG.getConstantFP(2.0, MVT::f32);
+ return DAG.getNode(AMDGPUISD::MAD, DL, VT, Two, A, LHS);
+ }
+ }
+
+ break;
+ }
+ case ISD::FSUB: {
+ if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
+ break;
+
+ EVT VT = N->getValueType(0);
+
+ // Try to get the fneg to fold into the source modifier. This undoes generic
+ // DAG combines and folds them into the mad.
+ if (VT == MVT::f32) {
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+
+ if (LHS.getOpcode() == ISD::FMUL) {
+ // (fsub (fmul a, b), c) -> mad a, b, (fneg c)
+
+ SDValue A = LHS.getOperand(0);
+ SDValue B = LHS.getOperand(1);
+ SDValue C = DAG.getNode(ISD::FNEG, DL, VT, RHS);
+
+ return DAG.getNode(AMDGPUISD::MAD, DL, VT, A, B, C);
+ }
+
+ if (RHS.getOpcode() == ISD::FMUL) {
+ // (fsub c, (fmul a, b)) -> mad (fneg a), b, c
+
+ SDValue A = DAG.getNode(ISD::FNEG, DL, VT, RHS.getOperand(0));
+ SDValue B = RHS.getOperand(1);
+ SDValue C = LHS;
+
+ return DAG.getNode(AMDGPUISD::MAD, DL, VT, A, B, C);
+ }
+
+ if (LHS.getOpcode() == ISD::FADD) {
+ // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
+
+ SDValue A = LHS.getOperand(0);
+ if (A == LHS.getOperand(1)) {
+ const SDValue Two = DAG.getConstantFP(2.0, MVT::f32);
+ SDValue NegRHS = DAG.getNode(ISD::FNEG, DL, VT, RHS);
+
+ return DAG.getNode(AMDGPUISD::MAD, DL, VT, Two, A, NegRHS);
+ }
+ }
+
+ if (RHS.getOpcode() == ISD::FADD) {
+ // (fsub c, (fadd a, a)) -> mad -2.0, a, c
+
+ SDValue A = RHS.getOperand(0);
+ if (A == RHS.getOperand(1)) {
+ const SDValue NegTwo = DAG.getConstantFP(-2.0, MVT::f32);
+ return DAG.getNode(AMDGPUISD::MAD, DL, VT, NegTwo, A, LHS);
+ }
+ }
+ }
+
+ break;
}
}
+ case ISD::LOAD:
+ case ISD::STORE:
+ case ISD::ATOMIC_LOAD:
+ case ISD::ATOMIC_STORE:
+ case ISD::ATOMIC_CMP_SWAP:
+ case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
+ case ISD::ATOMIC_SWAP:
+ case ISD::ATOMIC_LOAD_ADD:
+ case ISD::ATOMIC_LOAD_SUB:
+ case ISD::ATOMIC_LOAD_AND:
+ case ISD::ATOMIC_LOAD_OR:
+ case ISD::ATOMIC_LOAD_XOR:
+ case ISD::ATOMIC_LOAD_NAND:
+ case ISD::ATOMIC_LOAD_MIN:
+ case ISD::ATOMIC_LOAD_MAX:
+ case ISD::ATOMIC_LOAD_UMIN:
+ case ISD::ATOMIC_LOAD_UMAX: { // TODO: Target mem intrinsics.
+ if (DCI.isBeforeLegalize())
+ break;
+
+ MemSDNode *MemNode = cast<MemSDNode>(N);
+ SDValue Ptr = MemNode->getBasePtr();
+ // TODO: We could also do this for multiplies.
+ unsigned AS = MemNode->getAddressSpace();
+ if (Ptr.getOpcode() == ISD::SHL && AS != AMDGPUAS::PRIVATE_ADDRESS) {
+ SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), AS, DCI);
+ if (NewPtr) {
+ SmallVector<SDValue, 8> NewOps;
+ for (unsigned I = 0, E = MemNode->getNumOperands(); I != E; ++I)
+ NewOps.push_back(MemNode->getOperand(I));
+
+ NewOps[N->getOpcode() == ISD::STORE ? 2 : 1] = NewPtr;
+ return SDValue(DAG.UpdateNodeOperands(MemNode, NewOps), 0);
+ }
+ }
+ break;
+ }
+ case ISD::AND:
+ return performAndCombine(N, DCI);
+ case ISD::OR:
+ return performOrCombine(N, DCI);
+ case AMDGPUISD::FP_CLASS:
+ return performClassCombine(N, DCI);
+ }
return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
}
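
The FADD and FSUB cases above rely on a + a being exactly 2.0 * a in IEEE arithmetic, which is why (a + a) + b can be re-expressed as mad(2.0, a, b). A small check of that identity in plain C++; madf is an illustrative stand-in for the mad the combine emits, and any intermediate-rounding details of the real V_MAD instruction are not modelled here:

#include <cassert>

static float madf(float A, float B, float C) {
  return A * B + C; // stand-in for the mad the combine emits
}

int main() {
  const float A = 3.25f, B = -1.5f;
  assert((A + A) + B == madf(2.0f, A, B));   // fadd (fadd a, a), b
  assert((A + A) - B == madf(2.0f, A, -B));  // fsub (fadd a, a), b
  assert(B - (A + A) == madf(-2.0f, A, B));  // fsub b, (fadd a, a)
  return 0;
}
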
/// \brief Test if RegClass is one of the VSrc classes
static bool isVSrc(unsigned RegClass) {
- return AMDGPU::VSrc_32RegClassID == RegClass ||
- AMDGPU::VSrc_64RegClassID == RegClass;
-}
-
-/// \brief Test if RegClass is one of the SSrc classes
-static bool isSSrc(unsigned RegClass) {
- return AMDGPU::SSrc_32RegClassID == RegClass ||
- AMDGPU::SSrc_64RegClassID == RegClass;
+ switch(RegClass) {
+ default: return false;
+ case AMDGPU::VS_32RegClassID:
+ case AMDGPU::VS_64RegClassID:
+ return true;
+ }
}
/// \brief Analyze the possible immediate value Op
@@ -1278,75 +1703,36 @@ static bool isSSrc(unsigned RegClass) {
/// and the immediate value if it's a literal immediate
int32_t SITargetLowering::analyzeImmediate(const SDNode *N) const {
- union {
- int32_t I;
- float F;
- } Imm;
+ const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(
+ getTargetMachine().getSubtargetImpl()->getInstrInfo());
if (const ConstantSDNode *Node = dyn_cast<ConstantSDNode>(N)) {
- if (Node->getZExtValue() >> 32) {
- return -1;
- }
- Imm.I = Node->getSExtValue();
- } else if (const ConstantFPSDNode *Node = dyn_cast<ConstantFPSDNode>(N)) {
- if (N->getValueType(0) != MVT::f32)
+ if (Node->getZExtValue() >> 32)
return -1;
- Imm.F = Node->getValueAPF().convertToFloat();
- } else
- return -1; // It isn't an immediate
-
- if ((Imm.I >= -16 && Imm.I <= 64) ||
- Imm.F == 0.5f || Imm.F == -0.5f ||
- Imm.F == 1.0f || Imm.F == -1.0f ||
- Imm.F == 2.0f || Imm.F == -2.0f ||
- Imm.F == 4.0f || Imm.F == -4.0f)
- return 0; // It's an inline immediate
-
- return Imm.I; // It's a literal immediate
-}
-
-/// \brief Try to fold an immediate directly into an instruction
-bool SITargetLowering::foldImm(SDValue &Operand, int32_t &Immediate,
- bool &ScalarSlotUsed) const {
-
- MachineSDNode *Mov = dyn_cast<MachineSDNode>(Operand);
- const SIInstrInfo *TII =
- static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
- if (!Mov || !TII->isMov(Mov->getMachineOpcode()))
- return false;
- const SDValue &Op = Mov->getOperand(0);
- int32_t Value = analyzeImmediate(Op.getNode());
- if (Value == -1) {
- // Not an immediate at all
- return false;
+ if (TII->isInlineConstant(Node->getAPIntValue()))
+ return 0;
- } else if (Value == 0) {
- // Inline immediates can always be fold
- Operand = Op;
- return true;
+ return Node->getZExtValue();
+ }
- } else if (Value == Immediate) {
- // Already fold literal immediate
- Operand = Op;
- return true;
+ if (const ConstantFPSDNode *Node = dyn_cast<ConstantFPSDNode>(N)) {
+ if (TII->isInlineConstant(Node->getValueAPF().bitcastToAPInt()))
+ return 0;
- } else if (!ScalarSlotUsed && !Immediate) {
- // Fold this literal immediate
- ScalarSlotUsed = true;
- Immediate = Value;
- Operand = Op;
- return true;
+ if (Node->getValueType(0) == MVT::f32)
+ return FloatToBits(Node->getValueAPF().convertToFloat());
+ return -1;
}
- return false;
+ return -1;
}
const TargetRegisterClass *SITargetLowering::getRegClassForNode(
SelectionDAG &DAG, const SDValue &Op) const {
- const SIInstrInfo *TII =
- static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
+ const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(
+ getTargetMachine().getSubtargetImpl()->getInstrInfo());
const SIRegisterInfo &TRI = TII->getRegisterInfo();
if (!Op->isMachineOpcode()) {
@@ -1375,10 +1761,9 @@ const TargetRegisterClass *SITargetLowering::getRegClassForNode(
// If the COPY_TO_REGCLASS instruction is copying to a VSrc register
// class, then the register class for the value could be either a
// VReg or and SReg. In order to get a more accurate
- if (OpClassID == AMDGPU::VSrc_32RegClassID ||
- OpClassID == AMDGPU::VSrc_64RegClassID) {
+ if (isVSrc(OpClassID))
return getRegClassForNode(DAG, Op.getOperand(0));
- }
+
return TRI.getRegClass(OpClassID);
case AMDGPU::EXTRACT_SUBREG: {
int SubIdx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
@@ -1398,7 +1783,8 @@ const TargetRegisterClass *SITargetLowering::getRegClassForNode(
/// \brief Does "Op" fit into register class "RegClass" ?
bool SITargetLowering::fitsRegClass(SelectionDAG &DAG, const SDValue &Op,
unsigned RegClass) const {
- const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
+ const TargetRegisterInfo *TRI =
+ getTargetMachine().getSubtargetImpl()->getRegisterInfo();
const TargetRegisterClass *RC = getRegClassForNode(DAG, Op);
if (!RC) {
return false;
@@ -1406,242 +1792,6 @@ bool SITargetLowering::fitsRegClass(SelectionDAG &DAG, const SDValue &Op,
return TRI->getRegClass(RegClass)->hasSubClassEq(RC);
}
-/// \brief Make sure that we don't exeed the number of allowed scalars
-void SITargetLowering::ensureSRegLimit(SelectionDAG &DAG, SDValue &Operand,
- unsigned RegClass,
- bool &ScalarSlotUsed) const {
-
- // First map the operands register class to a destination class
- if (RegClass == AMDGPU::VSrc_32RegClassID)
- RegClass = AMDGPU::VReg_32RegClassID;
- else if (RegClass == AMDGPU::VSrc_64RegClassID)
- RegClass = AMDGPU::VReg_64RegClassID;
- else
- return;
-
- // Nothing to do if they fit naturally
- if (fitsRegClass(DAG, Operand, RegClass))
- return;
-
- // If the scalar slot isn't used yet use it now
- if (!ScalarSlotUsed) {
- ScalarSlotUsed = true;
- return;
- }
-
- // This is a conservative aproach. It is possible that we can't determine the
- // correct register class and copy too often, but better safe than sorry.
-
- SDNode *Node;
- // We can't use COPY_TO_REGCLASS with FrameIndex arguments.
- if (isa<FrameIndexSDNode>(Operand)) {
- unsigned Opcode = Operand.getValueType() == MVT::i32 ?
- AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
- Node = DAG.getMachineNode(Opcode, SDLoc(), Operand.getValueType(),
- Operand);
- } else {
- SDValue RC = DAG.getTargetConstant(RegClass, MVT::i32);
- Node = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS, SDLoc(),
- Operand.getValueType(), Operand, RC);
- }
- Operand = SDValue(Node, 0);
-}
-
-/// \returns true if \p Node's operands are different from the SDValue list
-/// \p Ops
-static bool isNodeChanged(const SDNode *Node, const std::vector<SDValue> &Ops) {
- for (unsigned i = 0, e = Node->getNumOperands(); i < e; ++i) {
- if (Ops[i].getNode() != Node->getOperand(i).getNode()) {
- return true;
- }
- }
- return false;
-}
-
-/// \brief Try to fold the Nodes operands into the Node
-SDNode *SITargetLowering::foldOperands(MachineSDNode *Node,
- SelectionDAG &DAG) const {
-
- // Original encoding (either e32 or e64)
- int Opcode = Node->getMachineOpcode();
- const SIInstrInfo *TII =
- static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
- const MCInstrDesc *Desc = &TII->get(Opcode);
-
- unsigned NumDefs = Desc->getNumDefs();
- unsigned NumOps = Desc->getNumOperands();
-
- // Commuted opcode if available
- int OpcodeRev = Desc->isCommutable() ? TII->commuteOpcode(Opcode) : -1;
- const MCInstrDesc *DescRev = OpcodeRev == -1 ? nullptr : &TII->get(OpcodeRev);
-
- assert(!DescRev || DescRev->getNumDefs() == NumDefs);
- assert(!DescRev || DescRev->getNumOperands() == NumOps);
-
- // e64 version if available, -1 otherwise
- int OpcodeE64 = AMDGPU::getVOPe64(Opcode);
- const MCInstrDesc *DescE64 = OpcodeE64 == -1 ? nullptr : &TII->get(OpcodeE64);
- int InputModifiers[3] = {0};
-
- assert(!DescE64 || DescE64->getNumDefs() == NumDefs);
-
- int32_t Immediate = Desc->getSize() == 4 ? 0 : -1;
- bool HaveVSrc = false, HaveSSrc = false;
-
- // First figure out what we already have in this instruction.
- for (unsigned i = 0, e = Node->getNumOperands(), Op = NumDefs;
- i != e && Op < NumOps; ++i, ++Op) {
-
- unsigned RegClass = Desc->OpInfo[Op].RegClass;
- if (isVSrc(RegClass))
- HaveVSrc = true;
- else if (isSSrc(RegClass))
- HaveSSrc = true;
- else
- continue;
-
- int32_t Imm = analyzeImmediate(Node->getOperand(i).getNode());
- if (Imm != -1 && Imm != 0) {
- // Literal immediate
- Immediate = Imm;
- }
- }
-
- // If we neither have VSrc nor SSrc, it makes no sense to continue.
- if (!HaveVSrc && !HaveSSrc)
- return Node;
-
- // No scalar allowed when we have both VSrc and SSrc
- bool ScalarSlotUsed = HaveVSrc && HaveSSrc;
-
- // Second go over the operands and try to fold them
- std::vector<SDValue> Ops;
- bool Promote2e64 = false;
- for (unsigned i = 0, e = Node->getNumOperands(), Op = NumDefs;
- i != e && Op < NumOps; ++i, ++Op) {
-
- const SDValue &Operand = Node->getOperand(i);
- Ops.push_back(Operand);
-
- // Already folded immediate?
- if (isa<ConstantSDNode>(Operand.getNode()) ||
- isa<ConstantFPSDNode>(Operand.getNode()))
- continue;
-
- // Is this a VSrc or SSrc operand?
- unsigned RegClass = Desc->OpInfo[Op].RegClass;
- if (isVSrc(RegClass) || isSSrc(RegClass)) {
- // Try to fold the immediates
- if (!foldImm(Ops[i], Immediate, ScalarSlotUsed)) {
- // Folding didn't work, make sure we don't hit the SReg limit.
- ensureSRegLimit(DAG, Ops[i], RegClass, ScalarSlotUsed);
- }
- continue;
- } else {
- // If it's not a VSrc or SSrc operand check if we have a GlobalAddress.
- // These will be lowered to immediates, so we will need to insert a MOV.
- if (isa<GlobalAddressSDNode>(Ops[i])) {
- SDNode *Node = DAG.getMachineNode(AMDGPU::V_MOV_B32_e32, SDLoc(),
- Operand.getValueType(), Operand);
- Ops[i] = SDValue(Node, 0);
- }
- }
-
- if (i == 1 && DescRev && fitsRegClass(DAG, Ops[0], RegClass)) {
-
- unsigned OtherRegClass = Desc->OpInfo[NumDefs].RegClass;
- assert(isVSrc(OtherRegClass) || isSSrc(OtherRegClass));
-
- // Test if it makes sense to swap operands
- if (foldImm(Ops[1], Immediate, ScalarSlotUsed) ||
- (!fitsRegClass(DAG, Ops[1], RegClass) &&
- fitsRegClass(DAG, Ops[1], OtherRegClass))) {
-
- // Swap commutable operands
- std::swap(Ops[0], Ops[1]);
-
- Desc = DescRev;
- DescRev = nullptr;
- continue;
- }
- }
-
- if (Immediate)
- continue;
-
- if (DescE64) {
- // Test if it makes sense to switch to e64 encoding
- unsigned OtherRegClass = DescE64->OpInfo[Op].RegClass;
- if (!isVSrc(OtherRegClass) && !isSSrc(OtherRegClass))
- continue;
-
- int32_t TmpImm = -1;
- if (foldImm(Ops[i], TmpImm, ScalarSlotUsed) ||
- (!fitsRegClass(DAG, Ops[i], RegClass) &&
- fitsRegClass(DAG, Ops[1], OtherRegClass))) {
-
- // Switch to e64 encoding
- Immediate = -1;
- Promote2e64 = true;
- Desc = DescE64;
- DescE64 = nullptr;
- }
- }
-
- if (!DescE64 && !Promote2e64)
- continue;
- if (!Operand.isMachineOpcode())
- continue;
- if (Operand.getMachineOpcode() == AMDGPU::FNEG_SI) {
- Ops.pop_back();
- Ops.push_back(Operand.getOperand(0));
- InputModifiers[i] = 1;
- Promote2e64 = true;
- if (!DescE64)
- continue;
- Desc = DescE64;
- DescE64 = nullptr;
- }
- else if (Operand.getMachineOpcode() == AMDGPU::FABS_SI) {
- Ops.pop_back();
- Ops.push_back(Operand.getOperand(0));
- InputModifiers[i] = 2;
- Promote2e64 = true;
- if (!DescE64)
- continue;
- Desc = DescE64;
- DescE64 = nullptr;
- }
- }
-
- if (Promote2e64) {
- std::vector<SDValue> OldOps(Ops);
- Ops.clear();
- for (unsigned i = 0; i < OldOps.size(); ++i) {
- // src_modifier
- Ops.push_back(DAG.getTargetConstant(InputModifiers[i], MVT::i32));
- Ops.push_back(OldOps[i]);
- }
- // Add the modifier flags while promoting
- for (unsigned i = 0; i < 2; ++i)
- Ops.push_back(DAG.getTargetConstant(0, MVT::i32));
- }
-
- // Add optional chain and glue
- for (unsigned i = NumOps - NumDefs, e = Node->getNumOperands(); i < e; ++i)
- Ops.push_back(Node->getOperand(i));
-
- // Nodes that have a glue result are not CSE'd by getMachineNode(), so in
- // this case a brand new node is always be created, even if the operands
- // are the same as before. So, manually check if anything has been changed.
- if (Desc->Opcode == Opcode && !isNodeChanged(Node, Ops)) {
- return Node;
- }
-
- // Create a complete new instruction
- return DAG.getMachineNode(Desc->Opcode, SDLoc(Node), Node->getVTList(), Ops);
-}
-
/// \brief Helper function for adjustWritemask
static unsigned SubIdx2Lane(unsigned Idx) {
switch (Idx) {
@@ -1706,7 +1856,7 @@ void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
// If we only got one lane, replace it with a copy
// (if NewDmask has only one bit set...)
if (NewDmask && (NewDmask & (NewDmask-1)) == 0) {
- SDValue RC = DAG.getTargetConstant(AMDGPU::VReg_32RegClassID, MVT::i32);
+ SDValue RC = DAG.getTargetConstant(AMDGPU::VGPR_32RegClassID, MVT::i32);
SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
SDLoc(), Users[Lane]->getValueType(0),
SDValue(Node, 0), RC);
@@ -1733,46 +1883,185 @@ void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
}
}
+/// \brief Legalize target independent instructions (e.g. INSERT_SUBREG)
+/// with frame index operands.
+/// LLVM assumes that inputs to these instructions are registers.
+void SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
+ SelectionDAG &DAG) const {
+
+ SmallVector<SDValue, 8> Ops;
+ for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
+ if (!isa<FrameIndexSDNode>(Node->getOperand(i))) {
+ Ops.push_back(Node->getOperand(i));
+ continue;
+ }
+
+ SDLoc DL(Node);
+ Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
+ Node->getOperand(i).getValueType(),
+ Node->getOperand(i)), 0));
+ }
+
+ DAG.UpdateNodeOperands(Node, Ops);
+}
+
/// \brief Fold the instructions after selecting them.
SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
SelectionDAG &DAG) const {
- const SIInstrInfo *TII =
- static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
+ const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(
+ getTargetMachine().getSubtargetImpl()->getInstrInfo());
Node = AdjustRegClass(Node, DAG);
if (TII->isMIMG(Node->getMachineOpcode()))
adjustWritemask(Node, DAG);
- return foldOperands(Node, DAG);
+ if (Node->getMachineOpcode() == AMDGPU::INSERT_SUBREG ||
+ Node->getMachineOpcode() == AMDGPU::REG_SEQUENCE) {
+ legalizeTargetIndependentNode(Node, DAG);
+ return Node;
+ }
+ return Node;
}
/// \brief Assign the register class depending on the number of
/// bits set in the writemask
void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI,
SDNode *Node) const {
- const SIInstrInfo *TII =
- static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
- if (!TII->isMIMG(MI->getOpcode()))
+ const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(
+ getTargetMachine().getSubtargetImpl()->getInstrInfo());
+
+ MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
+ TII->legalizeOperands(MI);
+
+ if (TII->isMIMG(MI->getOpcode())) {
+ unsigned VReg = MI->getOperand(0).getReg();
+ unsigned Writemask = MI->getOperand(1).getImm();
+ unsigned BitsSet = 0;
+ for (unsigned i = 0; i < 4; ++i)
+ BitsSet += Writemask & (1 << i) ? 1 : 0;
+
+ const TargetRegisterClass *RC;
+ switch (BitsSet) {
+ default: return;
+ case 1: RC = &AMDGPU::VGPR_32RegClass; break;
+ case 2: RC = &AMDGPU::VReg_64RegClass; break;
+ case 3: RC = &AMDGPU::VReg_96RegClass; break;
+ }
+
+ unsigned NewOpcode = TII->getMaskedMIMGOp(MI->getOpcode(), BitsSet);
+ MI->setDesc(TII->get(NewOpcode));
+ MRI.setRegClass(VReg, RC);
return;
+ }
- unsigned VReg = MI->getOperand(0).getReg();
- unsigned Writemask = MI->getOperand(1).getImm();
- unsigned BitsSet = 0;
- for (unsigned i = 0; i < 4; ++i)
- BitsSet += Writemask & (1 << i) ? 1 : 0;
+ // Replace unused atomics with the no return version.
+ int NoRetAtomicOp = AMDGPU::getAtomicNoRetOp(MI->getOpcode());
+ if (NoRetAtomicOp != -1) {
+ if (!Node->hasAnyUseOfValue(0)) {
+ MI->setDesc(TII->get(NoRetAtomicOp));
+ MI->RemoveOperand(0);
+ }
- const TargetRegisterClass *RC;
- switch (BitsSet) {
- default: return;
- case 1: RC = &AMDGPU::VReg_32RegClass; break;
- case 2: RC = &AMDGPU::VReg_64RegClass; break;
- case 3: RC = &AMDGPU::VReg_96RegClass; break;
+ return;
}
+}
- unsigned NewOpcode = TII->getMaskedMIMGOp(MI->getOpcode(), BitsSet);
- MI->setDesc(TII->get(NewOpcode));
- MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
- MRI.setRegClass(VReg, RC);
+static SDValue buildSMovImm32(SelectionDAG &DAG, SDLoc DL, uint64_t Val) {
+ SDValue K = DAG.getTargetConstant(Val, MVT::i32);
+ return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
+}
+
+MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
+ SDLoc DL,
+ SDValue Ptr) const {
+ const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(
+ getTargetMachine().getSubtargetImpl()->getInstrInfo());
+#if 1
+ // XXX - Workaround for moveToVALU not handling different register class
+ // inserts for REG_SEQUENCE.
+
+ // Build the half of the subregister with the constants.
+ const SDValue Ops0[] = {
+ DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, MVT::i32),
+ buildSMovImm32(DAG, DL, 0),
+ DAG.getTargetConstant(AMDGPU::sub0, MVT::i32),
+ buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
+ DAG.getTargetConstant(AMDGPU::sub1, MVT::i32)
+ };
+
+ SDValue SubRegHi = SDValue(DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL,
+ MVT::v2i32, Ops0), 0);
+
+ // Combine the constants and the pointer.
+ const SDValue Ops1[] = {
+ DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, MVT::i32),
+ Ptr,
+ DAG.getTargetConstant(AMDGPU::sub0_sub1, MVT::i32),
+ SubRegHi,
+ DAG.getTargetConstant(AMDGPU::sub2_sub3, MVT::i32)
+ };
+
+ return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
+#else
+ const SDValue Ops[] = {
+ DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, MVT::i32),
+ Ptr,
+ DAG.getTargetConstant(AMDGPU::sub0_sub1, MVT::i32),
+ buildSMovImm32(DAG, DL, 0),
+ DAG.getTargetConstant(AMDGPU::sub2, MVT::i32),
+    buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
+ DAG.getTargetConstant(AMDGPU::sub3, MVT::i32)
+ };
+
+ return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
+
+#endif
+}
+
+/// \brief Return a resource descriptor with the 'Add TID' bit enabled
+/// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
+/// of the resource descriptor) to create an offset, which is added to the
+/// resource pointer.
+MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG,
+ SDLoc DL,
+ SDValue Ptr,
+ uint32_t RsrcDword1,
+ uint64_t RsrcDword2And3) const {
+ SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
+ SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
+ if (RsrcDword1) {
+ PtrHi = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
+ DAG.getConstant(RsrcDword1, MVT::i32)), 0);
+ }
+
+ SDValue DataLo = buildSMovImm32(DAG, DL,
+ RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
+ SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);
+
+ const SDValue Ops[] = {
+ DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, MVT::i32),
+ PtrLo,
+ DAG.getTargetConstant(AMDGPU::sub0, MVT::i32),
+ PtrHi,
+ DAG.getTargetConstant(AMDGPU::sub1, MVT::i32),
+ DataLo,
+ DAG.getTargetConstant(AMDGPU::sub2, MVT::i32),
+ DataHi,
+ DAG.getTargetConstant(AMDGPU::sub3, MVT::i32)
+ };
+
+ return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
+}
+
+MachineSDNode *SITargetLowering::buildScratchRSRC(SelectionDAG &DAG,
+ SDLoc DL,
+ SDValue Ptr) const {
+ const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(
+ getTargetMachine().getSubtargetImpl()->getInstrInfo());
+ uint64_t Rsrc = TII->getDefaultRsrcDataFormat() | AMDGPU::RSRC_TID_ENABLE |
+ 0xffffffff; // Size
+
+ return buildRSRC(DAG, DL, Ptr, 0, Rsrc);
}
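
buildRSRC assembles the 128-bit resource descriptor from four 32-bit subregisters. Viewed as plain data, the packing looks like the sketch below; the struct and helper are illustrative, and the stride / 'Add TID' semantics of the upper bits described in the comment above are not modelled:

#include <cstdint>
#include <cstdio>

struct Rsrc128 {
  uint32_t Dword[4]; // sub0..sub3 of the v4i32 REG_SEQUENCE
};

static Rsrc128 packRsrc(uint64_t Ptr, uint32_t RsrcDword1,
                        uint64_t RsrcDword2And3) {
  Rsrc128 R;
  R.Dword[0] = uint32_t(Ptr);                    // PtrLo
  R.Dword[1] = uint32_t(Ptr >> 32) | RsrcDword1; // PtrHi | extra flags
  R.Dword[2] = uint32_t(RsrcDword2And3);         // DataLo
  R.Dword[3] = uint32_t(RsrcDword2And3 >> 32);   // DataHi
  return R;
}

int main() {
  // Size-only data word for illustration; buildScratchRSRC additionally ORs
  // in the default data format and RSRC_TID_ENABLE.
  Rsrc128 R = packRsrc(0x0000123456780000ull, 0, 0xffffffffull);
  std::printf("%08x %08x %08x %08x\n",
              R.Dword[0], R.Dword[1], R.Dword[2], R.Dword[3]);
  return 0;
}
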
MachineSDNode *SITargetLowering::AdjustRegClass(MachineSDNode *N,
@@ -1800,12 +2089,26 @@ MachineSDNode *SITargetLowering::AdjustRegClass(MachineSDNode *N,
return N;
}
ConstantSDNode *Offset = cast<ConstantSDNode>(N->getOperand(1));
- SDValue Ops[] = {
- SDValue(DAG.getMachineNode(AMDGPU::SI_ADDR64_RSRC, DL, MVT::i128,
- DAG.getConstant(0, MVT::i64)), 0),
- N->getOperand(0),
- DAG.getConstant(Offset->getSExtValue() << 2, MVT::i32)
- };
+
+ const SDValue Zero64 = DAG.getTargetConstant(0, MVT::i64);
+ SDValue Ptr(DAG.getMachineNode(AMDGPU::S_MOV_B64, DL, MVT::i64, Zero64), 0);
+ MachineSDNode *RSrc = wrapAddr64Rsrc(DAG, DL, Ptr);
+
+ SmallVector<SDValue, 8> Ops;
+ Ops.push_back(SDValue(RSrc, 0));
+ Ops.push_back(N->getOperand(0));
+
+ // The immediate offset is in dwords on SI and in bytes on VI.
+ if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
+ Ops.push_back(DAG.getTargetConstant(Offset->getSExtValue(), MVT::i32));
+ else
+ Ops.push_back(DAG.getTargetConstant(Offset->getSExtValue() << 2, MVT::i32));
+
+ // Copy remaining operands so we keep any chain and glue nodes that follow
+ // the normal operands.
+ for (unsigned I = 2, E = N->getNumOperands(); I != E; ++I)
+ Ops.push_back(N->getOperand(I));
+
return DAG.getMachineNode(NewOpcode, DL, N->getVTList(), Ops);
}
}
diff --git a/lib/Target/R600/SIISelLowering.h b/lib/Target/R600/SIISelLowering.h
index d106d4abb187..876fd8c9f369 100644
--- a/lib/Target/R600/SIISelLowering.h
+++ b/lib/Target/R600/SIISelLowering.h
@@ -12,8 +12,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef SIISELLOWERING_H
-#define SIISELLOWERING_H
+#ifndef LLVM_LIB_TARGET_R600_SIISELLOWERING_H
+#define LLVM_LIB_TARGET_R600_SIISELLOWERING_H
#include "AMDGPUISelLowering.h"
#include "SIInstrInfo.h"
@@ -27,6 +27,9 @@ class SITargetLowering : public AMDGPUTargetLowering {
SelectionDAG &DAG) const;
SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op,
SelectionDAG &DAG) const override;
+
+ SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
@@ -34,30 +37,49 @@ class SITargetLowering : public AMDGPUTargetLowering {
SDValue LowerFDIV32(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFDIV64(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFDIV(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG, bool Signed) const;
SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerTrig(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
- bool foldImm(SDValue &Operand, int32_t &Immediate,
- bool &ScalarSlotUsed) const;
const TargetRegisterClass *getRegClassForNode(SelectionDAG &DAG,
const SDValue &Op) const;
bool fitsRegClass(SelectionDAG &DAG, const SDValue &Op,
unsigned RegClass) const;
- void ensureSRegLimit(SelectionDAG &DAG, SDValue &Operand,
- unsigned RegClass, bool &ScalarSlotUsed) const;
- SDNode *foldOperands(MachineSDNode *N, SelectionDAG &DAG) const;
void adjustWritemask(MachineSDNode *&N, SelectionDAG &DAG) const;
MachineSDNode *AdjustRegClass(MachineSDNode *N, SelectionDAG &DAG) const;
- static SDValue performUCharToFloatCombine(SDNode *N,
- DAGCombinerInfo &DCI);
+ SDValue performUCharToFloatCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const;
+ SDValue performSHLPtrCombine(SDNode *N,
+ unsigned AS,
+ DAGCombinerInfo &DCI) const;
+ SDValue performAndCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performOrCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performClassCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+
+ SDValue performMin3Max3Combine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performSetCCCombine(SDNode *N, DAGCombinerInfo &DCI) const;
public:
SITargetLowering(TargetMachine &tm);
- bool allowsUnalignedMemoryAccesses(EVT VT, unsigned AS,
- bool *IsFast) const override;
+
+ bool isShuffleMaskLegal(const SmallVectorImpl<int> &/*Mask*/,
+ EVT /*VT*/) const override;
+
+ bool isLegalAddressingMode(const AddrMode &AM,
+ Type *Ty) const override;
+
+ bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS,
+ unsigned Align,
+ bool *IsFast) const override;
+
+ EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
+ unsigned SrcAlign, bool IsMemset,
+ bool ZeroMemset,
+ bool MemcpyStrSrc,
+ MachineFunction &MF) const override;
TargetLoweringBase::LegalizeTypeAction
getPreferredVectorAction(EVT VT) const override;
@@ -85,8 +107,19 @@ public:
int32_t analyzeImmediate(const SDNode *N) const;
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC,
unsigned Reg, EVT VT) const override;
+ void legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const;
+
+ MachineSDNode *wrapAddr64Rsrc(SelectionDAG &DAG, SDLoc DL, SDValue Ptr) const;
+ MachineSDNode *buildRSRC(SelectionDAG &DAG,
+ SDLoc DL,
+ SDValue Ptr,
+ uint32_t RsrcDword1,
+ uint64_t RsrcDword2And3) const;
+ MachineSDNode *buildScratchRSRC(SelectionDAG &DAG,
+ SDLoc DL,
+ SDValue Ptr) const;
};
} // End namespace llvm
-#endif //SIISELLOWERING_H
+#endif
diff --git a/lib/Target/R600/SIInsertWaits.cpp b/lib/Target/R600/SIInsertWaits.cpp
index 7dfc31bdfa01..181b11643bf3 100644
--- a/lib/Target/R600/SIInsertWaits.cpp
+++ b/lib/Target/R600/SIInsertWaits.cpp
@@ -17,6 +17,8 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "SIDefines.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
@@ -39,6 +41,12 @@ typedef union {
} Counters;
+typedef enum {
+ OTHER,
+ SMEM,
+ VMEM
+} InstType;
+
typedef Counters RegCounters[512];
typedef std::pair<unsigned, unsigned> RegInterval;
@@ -71,6 +79,9 @@ private:
/// \brief Different export instruction types seen since last wait.
unsigned ExpInstrTypesSeen;
+ /// \brief Type of the last opcode.
+ InstType LastOpcodeType;
+
/// \brief Get increment/decrement amount for this instruction.
Counters getHwCounts(MachineInstr &MI);
@@ -81,7 +92,8 @@ private:
RegInterval getRegInterval(MachineOperand &Op);
/// \brief Handle instructions async components
- void pushInstruction(MachineInstr &MI);
+ void pushInstruction(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I);
/// \brief Insert the actual wait instruction
bool insertWait(MachineBasicBlock &MBB,
@@ -174,6 +186,29 @@ bool SIInsertWaits::isOpRelevant(MachineOperand &Op) {
if (!MI.getDesc().mayStore())
return false;
+ // Check if this operand is the value being stored.
+ // Special case for DS instructions, since the address
+ // operand comes before the value operand and it may have
+ // multiple data operands.
+
+ if (TII->isDS(MI.getOpcode())) {
+ MachineOperand *Data = TII->getNamedOperand(MI, AMDGPU::OpName::data);
+ if (Data && Op.isIdenticalTo(*Data))
+ return true;
+
+ MachineOperand *Data0 = TII->getNamedOperand(MI, AMDGPU::OpName::data0);
+ if (Data0 && Op.isIdenticalTo(*Data0))
+ return true;
+
+ MachineOperand *Data1 = TII->getNamedOperand(MI, AMDGPU::OpName::data1);
+ if (Data1 && Op.isIdenticalTo(*Data1))
+ return true;
+
+ return false;
+ }
+
+ // NOTE: This assumes that the value operand is before the
+ // address operand, and that there is only one value operand.
for (MachineInstr::mop_iterator I = MI.operands_begin(),
E = MI.operands_end(); I != E; ++I) {
@@ -201,10 +236,11 @@ RegInterval SIInsertWaits::getRegInterval(MachineOperand &Op) {
return Result;
}
-void SIInsertWaits::pushInstruction(MachineInstr &MI) {
+void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) {
// Get the hardware counter increments and sum them up
- Counters Increment = getHwCounts(MI);
+ Counters Increment = getHwCounts(*I);
unsigned Sum = 0;
for (unsigned i = 0; i < 3; ++i) {
@@ -213,17 +249,42 @@ void SIInsertWaits::pushInstruction(MachineInstr &MI) {
}
// If we don't increase anything then that's it
- if (Sum == 0)
+ if (Sum == 0) {
+ LastOpcodeType = OTHER;
return;
+ }
+
+ if (TRI->ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+    // Any occurrence of consecutive VMEM or SMEM instructions forms a VMEM
+ // or SMEM clause, respectively.
+ //
+ // The temporary workaround is to break the clauses with S_NOP.
+ //
+ // The proper solution would be to allocate registers such that all source
+ // and destination registers don't overlap, e.g. this is illegal:
+ // r0 = load r2
+ // r2 = load r0
+ if ((LastOpcodeType == SMEM && TII->isSMRD(I->getOpcode())) ||
+ (LastOpcodeType == VMEM && Increment.Named.VM)) {
+ // Insert a NOP to break the clause.
+ BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_NOP))
+ .addImm(0);
+ }
+
+ if (TII->isSMRD(I->getOpcode()))
+ LastOpcodeType = SMEM;
+ else if (Increment.Named.VM)
+ LastOpcodeType = VMEM;
+ }
// Remember which export instructions we have seen
if (Increment.Named.EXP) {
- ExpInstrTypesSeen |= MI.getOpcode() == AMDGPU::EXP ? 1 : 2;
+ ExpInstrTypesSeen |= I->getOpcode() == AMDGPU::EXP ? 1 : 2;
}
- for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
- MachineOperand &Op = MI.getOperand(i);
+ MachineOperand &Op = I->getOperand(i);
if (!isOpRelevant(Op))
continue;
@@ -300,6 +361,7 @@ bool SIInsertWaits::insertWait(MachineBasicBlock &MBB,
((Counts.Named.EXP & 0x7) << 4) |
((Counts.Named.LGKM & 0x7) << 8));
+ LastOpcodeType = OTHER;
return true;
}
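
The clause-breaking logic added to pushInstruction only needs the coarse OTHER/SMEM/VMEM classification it keeps in LastOpcodeType. A toy model of that state machine is sketched below; counting emitted NOPs stands in for BuildMI(..., S_NOP).addImm(0), and the classification is handed in directly instead of being derived from isSMRD and the VM_CNT increment, so this is an illustration of the control flow only:

#include <cassert>

enum InstType { OTHER, SMEM, VMEM };

struct ClauseBreaker {
  InstType LastOpcodeType = OTHER;
  int NopsEmitted = 0;

  void push(InstType T) {
    if (T == OTHER) {        // doesn't bump any counter: the clause ends
      LastOpcodeType = OTHER;
      return;
    }
    if (LastOpcodeType == T) // back-to-back SMEM or VMEM would form a clause
      ++NopsEmitted;         // ...so a NOP is inserted in between
    LastOpcodeType = T;
  }
};

int main() {
  ClauseBreaker CB;
  CB.push(SMEM); CB.push(SMEM);                // one NOP
  CB.push(OTHER);
  CB.push(VMEM); CB.push(VMEM); CB.push(VMEM); // two NOPs
  assert(CB.NopsEmitted == 3);
  return 0;
}
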
@@ -346,13 +408,15 @@ Counters SIInsertWaits::handleOperands(MachineInstr &MI) {
bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
bool Changes = false;
- TII = static_cast<const SIInstrInfo*>(MF.getTarget().getInstrInfo());
- TRI = static_cast<const SIRegisterInfo*>(MF.getTarget().getRegisterInfo());
+ TII = static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
+ TRI =
+ static_cast<const SIRegisterInfo *>(MF.getSubtarget().getRegisterInfo());
MRI = &MF.getRegInfo();
WaitedOn = ZeroCounts;
LastIssued = ZeroCounts;
+ LastOpcodeType = OTHER;
memset(&UsedRegs, 0, sizeof(UsedRegs));
memset(&DefinedRegs, 0, sizeof(DefinedRegs));
@@ -364,8 +428,12 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
I != E; ++I) {
- Changes |= insertWait(MBB, I, handleOperands(*I));
- pushInstruction(*I);
+ // Wait for everything before a barrier.
+ if (I->getOpcode() == AMDGPU::S_BARRIER)
+ Changes |= insertWait(MBB, I, LastIssued);
+ else
+ Changes |= insertWait(MBB, I, handleOperands(*I));
+ pushInstruction(MBB, I);
}
// Wait for everything at the end of the MBB
diff --git a/lib/Target/R600/SIInstrFormats.td b/lib/Target/R600/SIInstrFormats.td
index 00e69ddbeea4..99a1df36c1f4 100644
--- a/lib/Target/R600/SIInstrFormats.td
+++ b/lib/Target/R600/SIInstrFormats.td
@@ -17,24 +17,58 @@ class InstSI <dag outs, dag ins, string asm, list<dag> pattern> :
field bits<1> VM_CNT = 0;
field bits<1> EXP_CNT = 0;
field bits<1> LGKM_CNT = 0;
- field bits<1> MIMG = 0;
- field bits<1> SMRD = 0;
+
+ field bits<1> SALU = 0;
+ field bits<1> VALU = 0;
+
+ field bits<1> SOP1 = 0;
+ field bits<1> SOP2 = 0;
+ field bits<1> SOPC = 0;
+ field bits<1> SOPK = 0;
+ field bits<1> SOPP = 0;
+
field bits<1> VOP1 = 0;
field bits<1> VOP2 = 0;
field bits<1> VOP3 = 0;
field bits<1> VOPC = 0;
- field bits<1> SALU = 0;
+ field bits<1> MUBUF = 0;
+ field bits<1> MTBUF = 0;
+ field bits<1> SMRD = 0;
+ field bits<1> DS = 0;
+ field bits<1> MIMG = 0;
+ field bits<1> FLAT = 0;
+
+ // These need to be kept in sync with the enum in SIInstrFlags.
let TSFlags{0} = VM_CNT;
let TSFlags{1} = EXP_CNT;
let TSFlags{2} = LGKM_CNT;
- let TSFlags{3} = MIMG;
- let TSFlags{4} = SMRD;
- let TSFlags{5} = VOP1;
- let TSFlags{6} = VOP2;
- let TSFlags{7} = VOP3;
- let TSFlags{8} = VOPC;
- let TSFlags{9} = SALU;
+
+ let TSFlags{3} = SALU;
+ let TSFlags{4} = VALU;
+
+ let TSFlags{5} = SOP1;
+ let TSFlags{6} = SOP2;
+ let TSFlags{7} = SOPC;
+ let TSFlags{8} = SOPK;
+ let TSFlags{9} = SOPP;
+
+ let TSFlags{10} = VOP1;
+ let TSFlags{11} = VOP2;
+ let TSFlags{12} = VOP3;
+ let TSFlags{13} = VOPC;
+
+ let TSFlags{14} = MUBUF;
+ let TSFlags{15} = MTBUF;
+ let TSFlags{16} = SMRD;
+ let TSFlags{17} = DS;
+ let TSFlags{18} = MIMG;
+ let TSFlags{19} = FLAT;
+
+ // Most instructions require adjustments after selection to satisfy
+ // operand requirements.
+ let hasPostISelHook = 1;
+ let SchedRW = [Write32Bit];
}
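
The comment above says the TSFlags bits must stay in sync with the enum in SIInstrFlags; based purely on the bit positions assigned in InstSI, that enum would look like the following C++ sketch (an illustration of the layout, not a copy of SIDefines.h):

#include <cstdint>

namespace SIInstrFlagsSketch {
enum : uint32_t {
  VM_CNT   = 1u << 0,
  EXP_CNT  = 1u << 1,
  LGKM_CNT = 1u << 2,
  SALU     = 1u << 3,
  VALU     = 1u << 4,
  SOP1     = 1u << 5,
  SOP2     = 1u << 6,
  SOPC     = 1u << 7,
  SOPK     = 1u << 8,
  SOPP     = 1u << 9,
  VOP1     = 1u << 10,
  VOP2     = 1u << 11,
  VOP3     = 1u << 12,
  VOPC     = 1u << 13,
  MUBUF    = 1u << 14,
  MTBUF    = 1u << 15,
  SMRD     = 1u << 16,
  DS       = 1u << 17,
  MIMG     = 1u << 18,
  FLAT     = 1u << 19
};
} // namespace SIInstrFlagsSketch

int main() {
  static_assert((SIInstrFlagsSketch::VOP1 >> 10) == 1, "VOP1 is TSFlags{10}");
  static_assert((SIInstrFlagsSketch::FLAT >> 19) == 1, "FLAT is TSFlags{19}");
  return 0;
}
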
class Enc32 {
@@ -49,6 +83,44 @@ class Enc64 {
int Size = 8;
}
+let Uses = [EXEC] in {
+
+class VOPCCommon <dag ins, string asm, list<dag> pattern> :
+ InstSI <(outs VCCReg:$dst), ins, asm, pattern> {
+
+ let DisableEncoding = "$dst";
+ let mayLoad = 0;
+ let mayStore = 0;
+ let hasSideEffects = 0;
+ let UseNamedOperandTable = 1;
+ let VOPC = 1;
+ let VALU = 1;
+ let Size = 4;
+}
+
+class VOP1Common <dag outs, dag ins, string asm, list<dag> pattern> :
+ InstSI <outs, ins, asm, pattern> {
+ let mayLoad = 0;
+ let mayStore = 0;
+ let hasSideEffects = 0;
+ let UseNamedOperandTable = 1;
+ let VOP1 = 1;
+ let VALU = 1;
+ let Size = 4;
+}
+
+class VOP2Common <dag outs, dag ins, string asm, list<dag> pattern> :
+ InstSI <outs, ins, asm, pattern> {
+
+ let mayLoad = 0;
+ let mayStore = 0;
+ let hasSideEffects = 0;
+ let UseNamedOperandTable = 1;
+ let VOP2 = 1;
+ let VALU = 1;
+ let Size = 4;
+}
+
class VOP3Common <dag outs, dag ins, string asm, list<dag> pattern> :
InstSI <outs, ins, asm, pattern> {
@@ -56,11 +128,20 @@ class VOP3Common <dag outs, dag ins, string asm, list<dag> pattern> :
let mayStore = 0;
let hasSideEffects = 0;
let UseNamedOperandTable = 1;
+ // Using complex patterns gives VOP3 patterns a very high complexity rating,
+  // but standalone patterns are almost always preferred, so we need to adjust
+  // the priority lower. The goal is to use a negative value large enough to
+  // reduce the complexity to zero (or below).
+ let AddedComplexity = -1000;
+
let VOP3 = 1;
+ let VALU = 1;
int Size = 8;
}
+} // End Uses = [EXEC]
+
//===----------------------------------------------------------------------===//
// Scalar operations
//===----------------------------------------------------------------------===//
@@ -134,22 +215,26 @@ class SMRDe <bits<5> op, bits<1> imm> : Enc32 {
let Inst{31-27} = 0x18; //encoding
}
-class SOP1 <bits<8> op, dag outs, dag ins, string asm, list<dag> pattern> :
- InstSI<outs, ins, asm, pattern>, SOP1e <op> {
-
+let SchedRW = [WriteSALU] in {
+class SOP1 <dag outs, dag ins, string asm, list<dag> pattern> :
+ InstSI<outs, ins, asm, pattern> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
let SALU = 1;
+ let SOP1 = 1;
}
-class SOP2 <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
- InstSI <outs, ins, asm, pattern>, SOP2e<op> {
+class SOP2 <dag outs, dag ins, string asm, list<dag> pattern> :
+ InstSI <outs, ins, asm, pattern> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
let SALU = 1;
+ let SOP2 = 1;
+
+ let UseNamedOperandTable = 1;
}
class SOPC <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
@@ -160,31 +245,48 @@ class SOPC <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
let mayStore = 0;
let hasSideEffects = 0;
let SALU = 1;
+ let SOPC = 1;
+
+ let UseNamedOperandTable = 1;
}
-class SOPK <bits<5> op, dag outs, dag ins, string asm, list<dag> pattern> :
- InstSI <outs, ins , asm, pattern>, SOPKe<op> {
+class SOPK <dag outs, dag ins, string asm, list<dag> pattern> :
+ InstSI <outs, ins , asm, pattern> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
let SALU = 1;
+ let SOPK = 1;
+
+ let UseNamedOperandTable = 1;
}
-class SOPP <bits<7> op, dag ins, string asm, list<dag> pattern> :
+class SOPP <bits<7> op, dag ins, string asm, list<dag> pattern = []> :
InstSI <(outs), ins, asm, pattern >, SOPPe <op> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
+ let isCodeGenOnly = 0;
let SALU = 1;
+ let SOPP = 1;
+
+ let UseNamedOperandTable = 1;
}
-class SMRD <bits<5> op, bits<1> imm, dag outs, dag ins, string asm,
- list<dag> pattern> : InstSI<outs, ins, asm, pattern>, SMRDe<op, imm> {
+} // let SchedRW = [WriteSALU]
+
+class SMRD <dag outs, dag ins, string asm, list<dag> pattern> :
+ InstSI<outs, ins, asm, pattern> {
let LGKM_CNT = 1;
let SMRD = 1;
+ let mayStore = 0;
+ let mayLoad = 1;
+ let hasSideEffects = 0;
+ let UseNamedOperandTable = 1;
+ let SchedRW = [WriteSMEM];
}
//===----------------------------------------------------------------------===//
@@ -410,8 +512,27 @@ class MIMGe <bits<7> op> : Enc64 {
let Inst{57-53} = SSAMP{6-2};
}
-class EXPe : Enc64 {
+class FLATe<bits<7> op> : Enc64 {
+ bits<8> addr;
+ bits<8> data;
+ bits<8> vdst;
+ bits<1> slc;
+ bits<1> glc;
+ bits<1> tfe;
+ // 15-0 is reserved.
+ let Inst{16} = glc;
+ let Inst{17} = slc;
+ let Inst{24-18} = op;
+ let Inst{31-26} = 0x37; // Encoding.
+ let Inst{39-32} = addr;
+ let Inst{47-40} = data;
+ // 54-48 is reserved.
+ let Inst{55} = tfe;
+ let Inst{63-56} = vdst;
+}
+
+class EXPe : Enc64 {
bits<4> EN;
bits<6> TGT;
bits<1> COMPR;
@@ -437,48 +558,23 @@ class EXPe : Enc64 {
let Uses = [EXEC] in {
class VOP1 <bits<8> op, dag outs, dag ins, string asm, list<dag> pattern> :
- InstSI <outs, ins, asm, pattern>, VOP1e<op> {
-
- let mayLoad = 0;
- let mayStore = 0;
- let hasSideEffects = 0;
- let UseNamedOperandTable = 1;
- let VOP1 = 1;
-}
+ VOP1Common <outs, ins, asm, pattern>,
+ VOP1e<op>;
class VOP2 <bits<6> op, dag outs, dag ins, string asm, list<dag> pattern> :
- InstSI <outs, ins, asm, pattern>, VOP2e<op> {
-
- let mayLoad = 0;
- let mayStore = 0;
- let hasSideEffects = 0;
- let UseNamedOperandTable = 1;
- let VOP2 = 1;
-}
-
-class VOP3 <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern> :
- VOP3Common <outs, ins, asm, pattern>, VOP3e<op>;
+ VOP2Common <outs, ins, asm, pattern>, VOP2e<op>;
class VOP3b <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern> :
VOP3Common <outs, ins, asm, pattern>, VOP3be<op>;
class VOPC <bits<8> op, dag ins, string asm, list<dag> pattern> :
- InstSI <(outs VCCReg:$dst), ins, asm, pattern>, VOPCe <op> {
+ VOPCCommon <ins, asm, pattern>, VOPCe <op>;
- let DisableEncoding = "$dst";
- let mayLoad = 0;
- let mayStore = 0;
- let hasSideEffects = 0;
- let UseNamedOperandTable = 1;
- let VOPC = 1;
-}
-
-class VINTRP <bits <2> op, dag outs, dag ins, string asm, list<dag> pattern> :
- InstSI <outs, ins, asm, pattern>, VINTRPe<op> {
-
- let neverHasSideEffects = 1;
+class VINTRPCommon <dag outs, dag ins, string asm, list<dag> pattern> :
+ InstSI <outs, ins, asm, pattern> {
let mayLoad = 1;
let mayStore = 0;
+ let hasSideEffects = 0;
}
} // End Uses = [EXEC]
@@ -489,29 +585,56 @@ class VINTRP <bits <2> op, dag outs, dag ins, string asm, list<dag> pattern> :
let Uses = [EXEC] in {
-class DS <bits<8> op, dag outs, dag ins, string asm, list<dag> pattern> :
- InstSI <outs, ins, asm, pattern> , DSe<op> {
+class DS <dag outs, dag ins, string asm, list<dag> pattern> :
+ InstSI <outs, ins, asm, pattern> {
let LGKM_CNT = 1;
+ let DS = 1;
+ let UseNamedOperandTable = 1;
+ let DisableEncoding = "$m0";
+ let SchedRW = [WriteLDS];
}
-class MUBUF <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
- InstSI<outs, ins, asm, pattern>, MUBUFe <op> {
+class DS_si <bits<8> op, dag outs, dag ins, string asm, list<dag> pattern> :
+ DS <outs, ins, asm, pattern>, DSe<op>;
+
+class MUBUF <dag outs, dag ins, string asm, list<dag> pattern> :
+ InstSI<outs, ins, asm, pattern> {
let VM_CNT = 1;
let EXP_CNT = 1;
+ let MUBUF = 1;
- let neverHasSideEffects = 1;
+ let hasSideEffects = 0;
let UseNamedOperandTable = 1;
+ let SchedRW = [WriteVMEM];
}
-class MTBUF <bits<3> op, dag outs, dag ins, string asm, list<dag> pattern> :
- InstSI<outs, ins, asm, pattern>, MTBUFe <op> {
+class MTBUF <dag outs, dag ins, string asm, list<dag> pattern> :
+ InstSI<outs, ins, asm, pattern> {
let VM_CNT = 1;
let EXP_CNT = 1;
+ let MTBUF = 1;
- let neverHasSideEffects = 1;
+ let hasSideEffects = 0;
+ let UseNamedOperandTable = 1;
+ let SchedRW = [WriteVMEM];
+}
+
+class FLAT <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
+ InstSI<outs, ins, asm, pattern>, FLATe <op> {
+ let FLAT = 1;
+  // Internally, FLAT instructions are executed as both an LDS and a
+  // Buffer instruction, so they increment both VM_CNT and LGKM_CNT
+ // and are not considered done until both have been decremented.
+ let VM_CNT = 1;
+ let LGKM_CNT = 1;
+
+ let Uses = [EXEC, FLAT_SCR]; // M0
+
+ let UseNamedOperandTable = 1;
+ let hasSideEffects = 0;
}
class MIMG <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
@@ -520,16 +643,9 @@ class MIMG <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
let VM_CNT = 1;
let EXP_CNT = 1;
let MIMG = 1;
-}
-def EXP : InstSI<
- (outs),
- (ins i32imm:$en, i32imm:$tgt, i32imm:$compr, i32imm:$done, i32imm:$vm,
- VReg_32:$src0, VReg_32:$src1, VReg_32:$src2, VReg_32:$src3),
- "EXP $en, $tgt, $compr, $done, $vm, $src0, $src1, $src2, $src3",
- [] >, EXPe {
-
- let EXP_CNT = 1;
+ let hasSideEffects = 0; // XXX ????
}
+
} // End Uses = [EXEC]
diff --git a/lib/Target/R600/SIInstrInfo.cpp b/lib/Target/R600/SIInstrInfo.cpp
index 51f453292da8..1a4c0d4e57b0 100644
--- a/lib/Target/R600/SIInstrInfo.cpp
+++ b/lib/Target/R600/SIInstrInfo.cpp
@@ -17,10 +17,13 @@
#include "AMDGPUTargetMachine.h"
#include "SIDefines.h"
#include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/Function.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/Support/Debug.h"
using namespace llvm;
@@ -32,6 +35,259 @@ SIInstrInfo::SIInstrInfo(const AMDGPUSubtarget &st)
// TargetInstrInfo callbacks
//===----------------------------------------------------------------------===//
+static unsigned getNumOperandsNoGlue(SDNode *Node) {
+ unsigned N = Node->getNumOperands();
+ while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
+ --N;
+ return N;
+}
+
+static SDValue findChainOperand(SDNode *Load) {
+ SDValue LastOp = Load->getOperand(getNumOperandsNoGlue(Load) - 1);
+ assert(LastOp.getValueType() == MVT::Other && "Chain missing from load node");
+ return LastOp;
+}
+
+/// \brief Returns true if both nodes have the same value for the given
+/// operand \p Op, or if both nodes do not have this operand.
+static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) {
+ unsigned Opc0 = N0->getMachineOpcode();
+ unsigned Opc1 = N1->getMachineOpcode();
+
+ int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
+ int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);
+
+ if (Op0Idx == -1 && Op1Idx == -1)
+ return true;
+
+
+ if ((Op0Idx == -1 && Op1Idx != -1) ||
+ (Op1Idx == -1 && Op0Idx != -1))
+ return false;
+
+ // getNamedOperandIdx returns the index for the MachineInstr's operands,
+ // which includes the result as the first operand. We are indexing into the
+ // MachineSDNode's operands, so we need to skip the result operand to get
+ // the real index.
+ --Op0Idx;
+ --Op1Idx;
+
+ return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
+}
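As a toy standalone sketch of the off-by-one adjustment explained in the comment above (illustrative only, not part of this patch; the operand names are made up): MachineInstr operand lists start with the result, MachineSDNode operand lists do not, so a named-operand index has to be shifted down by one.

#include <cstdio>
#include <string>
#include <vector>

int main() {
  // MachineInstr-style operand list: the result ("vdst") comes first.
  std::vector<std::string> MIOps = {"vdst", "vaddr", "offset"};
  // MachineSDNode-style operand list: no result operand.
  std::vector<std::string> SDOps = {"vaddr", "offset", "chain"};

  int NamedIdx = 1; // what a named-operand lookup would report for "vaddr" on the MI side
  // Subtract one to land on the same logical operand in the SDNode list.
  std::printf("%s == %s\n", MIOps[NamedIdx].c_str(), SDOps[NamedIdx - 1].c_str());
  return 0;
}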
+
+bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
+ int64_t &Offset0,
+ int64_t &Offset1) const {
+ if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
+ return false;
+
+ unsigned Opc0 = Load0->getMachineOpcode();
+ unsigned Opc1 = Load1->getMachineOpcode();
+
+ // Make sure both are actually loads.
+ if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
+ return false;
+
+ if (isDS(Opc0) && isDS(Opc1)) {
+
+ // FIXME: Handle this case:
+ if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
+ return false;
+
+ // Check base reg.
+ if (Load0->getOperand(1) != Load1->getOperand(1))
+ return false;
+
+ // Check chain.
+ if (findChainOperand(Load0) != findChainOperand(Load1))
+ return false;
+
+ // Skip read2 / write2 variants for simplicity.
+ // TODO: We should report true if the used offsets are adjacent (excluded
+ // st64 versions).
+ if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::data1) != -1 ||
+ AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::data1) != -1)
+ return false;
+
+ Offset0 = cast<ConstantSDNode>(Load0->getOperand(2))->getZExtValue();
+ Offset1 = cast<ConstantSDNode>(Load1->getOperand(2))->getZExtValue();
+ return true;
+ }
+
+ if (isSMRD(Opc0) && isSMRD(Opc1)) {
+ assert(getNumOperandsNoGlue(Load0) == getNumOperandsNoGlue(Load1));
+
+ // Check base reg.
+ if (Load0->getOperand(0) != Load1->getOperand(0))
+ return false;
+
+ // Check chain.
+ if (findChainOperand(Load0) != findChainOperand(Load1))
+ return false;
+
+ Offset0 = cast<ConstantSDNode>(Load0->getOperand(1))->getZExtValue();
+ Offset1 = cast<ConstantSDNode>(Load1->getOperand(1))->getZExtValue();
+ return true;
+ }
+
+ // MUBUF and MTBUF can access the same addresses.
+ if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {
+
+ // MUBUF and MTBUF have vaddr at different indices.
+ if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
+ findChainOperand(Load0) != findChainOperand(Load1) ||
+ !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
+ !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
+ return false;
+
+ int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
+ int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
+
+ if (OffIdx0 == -1 || OffIdx1 == -1)
+ return false;
+
+ // getNamedOperandIdx returns the index for MachineInstrs. Since they
+ // include the output in the operand list, but SDNodes don't, we need to
+ // subtract one from the index.
+ --OffIdx0;
+ --OffIdx1;
+
+ SDValue Off0 = Load0->getOperand(OffIdx0);
+ SDValue Off1 = Load1->getOperand(OffIdx1);
+
+ // The offset might be a FrameIndexSDNode.
+ if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
+ return false;
+
+ Offset0 = cast<ConstantSDNode>(Off0)->getZExtValue();
+ Offset1 = cast<ConstantSDNode>(Off1)->getZExtValue();
+ return true;
+ }
+
+ return false;
+}
+
+static bool isStride64(unsigned Opc) {
+ switch (Opc) {
+ case AMDGPU::DS_READ2ST64_B32:
+ case AMDGPU::DS_READ2ST64_B64:
+ case AMDGPU::DS_WRITE2ST64_B32:
+ case AMDGPU::DS_WRITE2ST64_B64:
+ return true;
+ default:
+ return false;
+ }
+}
+
+bool SIInstrInfo::getLdStBaseRegImmOfs(MachineInstr *LdSt,
+ unsigned &BaseReg, unsigned &Offset,
+ const TargetRegisterInfo *TRI) const {
+ unsigned Opc = LdSt->getOpcode();
+ if (isDS(Opc)) {
+ const MachineOperand *OffsetImm = getNamedOperand(*LdSt,
+ AMDGPU::OpName::offset);
+ if (OffsetImm) {
+ // Normal, single offset LDS instruction.
+ const MachineOperand *AddrReg = getNamedOperand(*LdSt,
+ AMDGPU::OpName::addr);
+
+ BaseReg = AddrReg->getReg();
+ Offset = OffsetImm->getImm();
+ return true;
+ }
+
+ // The 2 offset instructions use offset0 and offset1 instead. We can treat
+ // these as a load with a single offset if the 2 offsets are consecutive. We
+ // will use this for some partially aligned loads.
+ const MachineOperand *Offset0Imm = getNamedOperand(*LdSt,
+ AMDGPU::OpName::offset0);
+ const MachineOperand *Offset1Imm = getNamedOperand(*LdSt,
+ AMDGPU::OpName::offset1);
+
+ uint8_t Offset0 = Offset0Imm->getImm();
+ uint8_t Offset1 = Offset1Imm->getImm();
+ assert(Offset1 > Offset0);
+
+ if (Offset1 - Offset0 == 1) {
+ // Each of these offsets is in element sized units, so we need to convert
+ // to bytes of the individual reads.
+
+ unsigned EltSize;
+ if (LdSt->mayLoad())
+ EltSize = getOpRegClass(*LdSt, 0)->getSize() / 2;
+ else {
+ assert(LdSt->mayStore());
+ int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
+ EltSize = getOpRegClass(*LdSt, Data0Idx)->getSize();
+ }
+
+ if (isStride64(Opc))
+ EltSize *= 64;
+
+ const MachineOperand *AddrReg = getNamedOperand(*LdSt,
+ AMDGPU::OpName::addr);
+ BaseReg = AddrReg->getReg();
+ Offset = EltSize * Offset0;
+ return true;
+ }
+
+ return false;
+ }
+
+ if (isMUBUF(Opc) || isMTBUF(Opc)) {
+ if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset) != -1)
+ return false;
+
+ const MachineOperand *AddrReg = getNamedOperand(*LdSt,
+ AMDGPU::OpName::vaddr);
+ if (!AddrReg)
+ return false;
+
+ const MachineOperand *OffsetImm = getNamedOperand(*LdSt,
+ AMDGPU::OpName::offset);
+ BaseReg = AddrReg->getReg();
+ Offset = OffsetImm->getImm();
+ return true;
+ }
+
+ if (isSMRD(Opc)) {
+ const MachineOperand *OffsetImm = getNamedOperand(*LdSt,
+ AMDGPU::OpName::offset);
+ if (!OffsetImm)
+ return false;
+
+ const MachineOperand *SBaseReg = getNamedOperand(*LdSt,
+ AMDGPU::OpName::sbase);
+ BaseReg = SBaseReg->getReg();
+ Offset = OffsetImm->getImm();
+ return true;
+ }
+
+ return false;
+}
+
+bool SIInstrInfo::shouldClusterLoads(MachineInstr *FirstLdSt,
+ MachineInstr *SecondLdSt,
+ unsigned NumLoads) const {
+ unsigned Opc0 = FirstLdSt->getOpcode();
+ unsigned Opc1 = SecondLdSt->getOpcode();
+
+ // TODO: This needs finer tuning
+ if (NumLoads > 4)
+ return false;
+
+ if (isDS(Opc0) && isDS(Opc1))
+ return true;
+
+ if (isSMRD(Opc0) && isSMRD(Opc1))
+ return true;
+
+ if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1)))
+ return true;
+
+ return false;
+}
+
void
SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI, DebugLoc DL,
@@ -70,26 +326,6 @@ SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
unsigned Opcode;
const int16_t *SubIndices;
- if (AMDGPU::M0 == DestReg) {
- // Check if M0 isn't already set to this value
- for (MachineBasicBlock::reverse_iterator E = MBB.rend(),
- I = MachineBasicBlock::reverse_iterator(MI); I != E; ++I) {
-
- if (!I->definesRegister(AMDGPU::M0))
- continue;
-
- unsigned Opc = I->getOpcode();
- if (Opc != TargetOpcode::COPY && Opc != AMDGPU::S_MOV_B32)
- break;
-
- if (!I->readsRegister(SrcReg))
- break;
-
- // The copy isn't necessary
- return;
- }
- }
-
if (AMDGPU::SReg_32RegClass.contains(DestReg)) {
assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
@@ -117,8 +353,8 @@ SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
Opcode = AMDGPU::S_MOV_B32;
SubIndices = Sub0_15;
- } else if (AMDGPU::VReg_32RegClass.contains(DestReg)) {
- assert(AMDGPU::VReg_32RegClass.contains(SrcReg) ||
+ } else if (AMDGPU::VGPR_32RegClass.contains(DestReg)) {
+ assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
AMDGPU::SReg_32RegClass.contains(SrcReg));
BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
.addReg(SrcReg, getKillRegState(KillSrc));
@@ -182,6 +418,27 @@ unsigned SIInstrInfo::commuteOpcode(unsigned Opcode) const {
return Opcode;
}
+unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
+
+ if (DstRC->getSize() == 4) {
+ return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
+ } else if (DstRC->getSize() == 8 && RI.isSGPRClass(DstRC)) {
+ return AMDGPU::S_MOV_B64;
+ } else if (DstRC->getSize() == 8 && !RI.isSGPRClass(DstRC)) {
+ return AMDGPU::V_MOV_B64_PSEUDO;
+ }
+ return AMDGPU::COPY;
+}
+
+static bool shouldTryToSpillVGPRs(MachineFunction *MF) {
+
+ SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
+
+ // FIXME: Implement spilling for other shader types.
+ return MFI->getShaderType() == ShaderType::COMPUTE;
+
+}
+
void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
unsigned SrcReg, bool isKill,
@@ -190,49 +447,49 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
const TargetRegisterInfo *TRI) const {
MachineFunction *MF = MBB.getParent();
SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
- MachineRegisterInfo &MRI = MF->getRegInfo();
+ MachineFrameInfo *FrameInfo = MF->getFrameInfo();
DebugLoc DL = MBB.findDebugLoc(MI);
- unsigned KillFlag = isKill ? RegState::Kill : 0;
+ int Opcode = -1;
- if (RI.hasVGPRs(RC)) {
- LLVMContext &Ctx = MF->getFunction()->getContext();
- Ctx.emitError("SIInstrInfo::storeRegToStackSlot - Can't spill VGPR!");
- BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), AMDGPU::VGPR0)
- .addReg(SrcReg);
- } else if (TRI->getCommonSubClass(RC, &AMDGPU::SGPR_32RegClass)) {
- unsigned Lane = MFI->SpillTracker.reserveLanes(MRI, MF);
- unsigned TgtReg = MFI->SpillTracker.LaneVGPR;
-
- BuildMI(MBB, MI, DL, get(AMDGPU::V_WRITELANE_B32), TgtReg)
- .addReg(SrcReg, KillFlag)
- .addImm(Lane);
- MFI->SpillTracker.addSpilledReg(FrameIndex, TgtReg, Lane);
- } else if (RI.isSGPRClass(RC)) {
+ if (RI.isSGPRClass(RC)) {
// We are only allowed to create one new instruction when spilling
- // registers, so we need to use pseudo instruction for vector
- // registers.
- //
- // Reserve a spot in the spill tracker for each sub-register of
- // the vector register.
- unsigned NumSubRegs = RC->getSize() / 4;
- unsigned FirstLane = MFI->SpillTracker.reserveLanes(MRI, MF, NumSubRegs);
- MFI->SpillTracker.addSpilledReg(FrameIndex, MFI->SpillTracker.LaneVGPR,
- FirstLane);
-
- unsigned Opcode;
+ // registers, so we need to use a pseudo instruction for spilling
+ // SGPRs.
switch (RC->getSize() * 8) {
- case 64: Opcode = AMDGPU::SI_SPILL_S64_SAVE; break;
- case 128: Opcode = AMDGPU::SI_SPILL_S128_SAVE; break;
- case 256: Opcode = AMDGPU::SI_SPILL_S256_SAVE; break;
- case 512: Opcode = AMDGPU::SI_SPILL_S512_SAVE; break;
- default: llvm_unreachable("Cannot spill register class");
+ case 32: Opcode = AMDGPU::SI_SPILL_S32_SAVE; break;
+ case 64: Opcode = AMDGPU::SI_SPILL_S64_SAVE; break;
+ case 128: Opcode = AMDGPU::SI_SPILL_S128_SAVE; break;
+ case 256: Opcode = AMDGPU::SI_SPILL_S256_SAVE; break;
+ case 512: Opcode = AMDGPU::SI_SPILL_S512_SAVE; break;
}
+ } else if(shouldTryToSpillVGPRs(MF) && RI.hasVGPRs(RC)) {
+ MFI->setHasSpilledVGPRs();
- BuildMI(MBB, MI, DL, get(Opcode), MFI->SpillTracker.LaneVGPR)
+ switch(RC->getSize() * 8) {
+ case 32: Opcode = AMDGPU::SI_SPILL_V32_SAVE; break;
+ case 64: Opcode = AMDGPU::SI_SPILL_V64_SAVE; break;
+ case 96: Opcode = AMDGPU::SI_SPILL_V96_SAVE; break;
+ case 128: Opcode = AMDGPU::SI_SPILL_V128_SAVE; break;
+ case 256: Opcode = AMDGPU::SI_SPILL_V256_SAVE; break;
+ case 512: Opcode = AMDGPU::SI_SPILL_V512_SAVE; break;
+ }
+ }
+
+ if (Opcode != -1) {
+ FrameInfo->setObjectAlignment(FrameIndex, 4);
+ BuildMI(MBB, MI, DL, get(Opcode))
.addReg(SrcReg)
- .addImm(FrameIndex);
+ .addFrameIndex(FrameIndex)
+ // Place-holder registers; these will be filled in by
+ // SIPrepareScratchRegs.
+ .addReg(AMDGPU::SGPR0_SGPR1, RegState::Undef)
+ .addReg(AMDGPU::SGPR0, RegState::Undef);
} else {
- llvm_unreachable("VGPR spilling not supported");
+ LLVMContext &Ctx = MF->getFunction()->getContext();
+ Ctx.emitError("SIInstrInfo::storeRegToStackSlot - Do not know how to"
+ " spill register");
+ BuildMI(MBB, MI, DL, get(AMDGPU::KILL))
+ .addReg(SrcReg);
}
}
@@ -242,55 +499,142 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
MachineFunction *MF = MBB.getParent();
- SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
+ MachineFrameInfo *FrameInfo = MF->getFrameInfo();
DebugLoc DL = MBB.findDebugLoc(MI);
+ int Opcode = -1;
- if (RI.hasVGPRs(RC)) {
- LLVMContext &Ctx = MF->getFunction()->getContext();
- Ctx.emitError("SIInstrInfo::loadRegToStackSlot - Can't retrieve spilled VGPR!");
- BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
- .addImm(0);
- } else if (RI.isSGPRClass(RC)){
- unsigned Opcode;
+ if (RI.isSGPRClass(RC)){
switch(RC->getSize() * 8) {
- case 32: Opcode = AMDGPU::SI_SPILL_S32_RESTORE; break;
- case 64: Opcode = AMDGPU::SI_SPILL_S64_RESTORE; break;
- case 128: Opcode = AMDGPU::SI_SPILL_S128_RESTORE; break;
- case 256: Opcode = AMDGPU::SI_SPILL_S256_RESTORE; break;
- case 512: Opcode = AMDGPU::SI_SPILL_S512_RESTORE; break;
- default: llvm_unreachable("Cannot spill register class");
+ case 32: Opcode = AMDGPU::SI_SPILL_S32_RESTORE; break;
+ case 64: Opcode = AMDGPU::SI_SPILL_S64_RESTORE; break;
+ case 128: Opcode = AMDGPU::SI_SPILL_S128_RESTORE; break;
+ case 256: Opcode = AMDGPU::SI_SPILL_S256_RESTORE; break;
+ case 512: Opcode = AMDGPU::SI_SPILL_S512_RESTORE; break;
}
+ } else if(shouldTryToSpillVGPRs(MF) && RI.hasVGPRs(RC)) {
+ switch(RC->getSize() * 8) {
+ case 32: Opcode = AMDGPU::SI_SPILL_V32_RESTORE; break;
+ case 64: Opcode = AMDGPU::SI_SPILL_V64_RESTORE; break;
+ case 96: Opcode = AMDGPU::SI_SPILL_V96_RESTORE; break;
+ case 128: Opcode = AMDGPU::SI_SPILL_V128_RESTORE; break;
+ case 256: Opcode = AMDGPU::SI_SPILL_V256_RESTORE; break;
+ case 512: Opcode = AMDGPU::SI_SPILL_V512_RESTORE; break;
+ }
+ }
- SIMachineFunctionInfo::SpilledReg Spill =
- MFI->SpillTracker.getSpilledReg(FrameIndex);
-
+ if (Opcode != -1) {
+ FrameInfo->setObjectAlignment(FrameIndex, 4);
BuildMI(MBB, MI, DL, get(Opcode), DestReg)
- .addReg(Spill.VGPR)
- .addImm(FrameIndex);
+ .addFrameIndex(FrameIndex)
+ // Place-holder registers; these will be filled in by
+ // SIPrepareScratchRegs.
+ .addReg(AMDGPU::SGPR0_SGPR1, RegState::Undef)
+ .addReg(AMDGPU::SGPR0, RegState::Undef);
+
} else {
- llvm_unreachable("VGPR spilling not supported");
+ LLVMContext &Ctx = MF->getFunction()->getContext();
+ Ctx.emitError("SIInstrInfo::loadRegFromStackSlot - Do not know how to"
+ " restore register");
+ BuildMI(MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), DestReg);
}
}
-static unsigned getNumSubRegsForSpillOp(unsigned Op) {
-
- switch (Op) {
- case AMDGPU::SI_SPILL_S512_SAVE:
- case AMDGPU::SI_SPILL_S512_RESTORE:
- return 16;
- case AMDGPU::SI_SPILL_S256_SAVE:
- case AMDGPU::SI_SPILL_S256_RESTORE:
- return 8;
- case AMDGPU::SI_SPILL_S128_SAVE:
- case AMDGPU::SI_SPILL_S128_RESTORE:
- return 4;
- case AMDGPU::SI_SPILL_S64_SAVE:
- case AMDGPU::SI_SPILL_S64_RESTORE:
- return 2;
- case AMDGPU::SI_SPILL_S32_RESTORE:
- return 1;
- default: llvm_unreachable("Invalid spill opcode");
+/// \param FrameOffset Offset in bytes of the FrameIndex being spilled
+unsigned SIInstrInfo::calculateLDSSpillAddress(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ RegScavenger *RS, unsigned TmpReg,
+ unsigned FrameOffset,
+ unsigned Size) const {
+ MachineFunction *MF = MBB.getParent();
+ SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
+ const AMDGPUSubtarget &ST = MF->getTarget().getSubtarget<AMDGPUSubtarget>();
+ const SIRegisterInfo *TRI =
+ static_cast<const SIRegisterInfo*>(ST.getRegisterInfo());
+ DebugLoc DL = MBB.findDebugLoc(MI);
+ unsigned WorkGroupSize = MFI->getMaximumWorkGroupSize(*MF);
+ unsigned WavefrontSize = ST.getWavefrontSize();
+
+ unsigned TIDReg = MFI->getTIDReg();
+ if (!MFI->hasCalculatedTID()) {
+ MachineBasicBlock &Entry = MBB.getParent()->front();
+ MachineBasicBlock::iterator Insert = Entry.front();
+ DebugLoc DL = Insert->getDebugLoc();
+
+ TIDReg = RI.findUnusedRegister(MF->getRegInfo(), &AMDGPU::VGPR_32RegClass);
+ if (TIDReg == AMDGPU::NoRegister)
+ return TIDReg;
+
+
+ if (MFI->getShaderType() == ShaderType::COMPUTE &&
+ WorkGroupSize > WavefrontSize) {
+
+ unsigned TIDIGXReg = TRI->getPreloadedValue(*MF, SIRegisterInfo::TIDIG_X);
+ unsigned TIDIGYReg = TRI->getPreloadedValue(*MF, SIRegisterInfo::TIDIG_Y);
+ unsigned TIDIGZReg = TRI->getPreloadedValue(*MF, SIRegisterInfo::TIDIG_Z);
+ unsigned InputPtrReg =
+ TRI->getPreloadedValue(*MF, SIRegisterInfo::INPUT_PTR);
+ static const unsigned TIDIGRegs[3] = {
+ TIDIGXReg, TIDIGYReg, TIDIGZReg
+ };
+ for (unsigned Reg : TIDIGRegs) {
+ if (!Entry.isLiveIn(Reg))
+ Entry.addLiveIn(Reg);
+ }
+
+ RS->enterBasicBlock(&Entry);
+ unsigned STmp0 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
+ unsigned STmp1 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
+ BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp0)
+ .addReg(InputPtrReg)
+ .addImm(SI::KernelInputOffsets::NGROUPS_Z);
+ BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp1)
+ .addReg(InputPtrReg)
+ .addImm(SI::KernelInputOffsets::NGROUPS_Y);
+
+ // NGROUPS.X * NGROUPS.Y
+ BuildMI(Entry, Insert, DL, get(AMDGPU::S_MUL_I32), STmp1)
+ .addReg(STmp1)
+ .addReg(STmp0);
+ // (NGROUPS.X * NGROUPS.Y) * TIDIG.X
+ BuildMI(Entry, Insert, DL, get(AMDGPU::V_MUL_U32_U24_e32), TIDReg)
+ .addReg(STmp1)
+ .addReg(TIDIGXReg);
+ // NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROUPS.Y * TIDIG.X)
+ BuildMI(Entry, Insert, DL, get(AMDGPU::V_MAD_U32_U24), TIDReg)
+ .addReg(STmp0)
+ .addReg(TIDIGYReg)
+ .addReg(TIDReg);
+ // (NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROUPS.Y * TIDIG.X)) + TIDIG.Z
+ BuildMI(Entry, Insert, DL, get(AMDGPU::V_ADD_I32_e32), TIDReg)
+ .addReg(TIDReg)
+ .addReg(TIDIGZReg);
+ } else {
+ // Get the wave id
+ BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_LO_U32_B32_e64),
+ TIDReg)
+ .addImm(-1)
+ .addImm(0);
+
+ BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_HI_U32_B32_e32),
+ TIDReg)
+ .addImm(-1)
+ .addReg(TIDReg);
+ }
+
+ BuildMI(Entry, Insert, DL, get(AMDGPU::V_LSHLREV_B32_e32),
+ TIDReg)
+ .addImm(2)
+ .addReg(TIDReg);
+ MFI->setTIDReg(TIDReg);
}
+
+ // Add FrameIndex to LDS offset
+ unsigned LDSOffset = MFI->LDSSize + (FrameOffset * WorkGroupSize);
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), TmpReg)
+ .addImm(LDSOffset)
+ .addReg(TIDReg);
+
+ return TmpReg;
}
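A rough standalone model of the address arithmetic this helper emits (an editorial sketch under the assumption that the final V_LSHLREV/V_ADD pair computes LDSSize + FrameOffset * WorkGroupSize + 4 * tid; not part of the patch):

#include <cstdio>

// Sketch only: mirrors the scalar math behind calculateLDSSpillAddress.
static unsigned ldsSpillAddress(unsigned LDSSize, unsigned FrameOffset,
                                unsigned WorkGroupSize, unsigned Tid) {
  unsigned ScaledTid = Tid << 2;                          // V_LSHLREV_B32: one dword per lane
  unsigned LDSOffset = LDSSize + FrameOffset * WorkGroupSize;
  return LDSOffset + ScaledTid;                           // final V_ADD_I32
}

int main() {
  // 256 bytes of static LDS, frame offset 8, 64 work-items, lane 3.
  std::printf("%u\n", ldsSpillAddress(256, 8, 64, 3));    // 256 + 512 + 12 = 780
  return 0;
}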
void SIInstrInfo::insertNOPs(MachineBasicBlock::iterator MI,
@@ -308,59 +652,11 @@ void SIInstrInfo::insertNOPs(MachineBasicBlock::iterator MI,
}
bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
- SIMachineFunctionInfo *MFI =
- MI->getParent()->getParent()->getInfo<SIMachineFunctionInfo>();
MachineBasicBlock &MBB = *MI->getParent();
DebugLoc DL = MBB.findDebugLoc(MI);
switch (MI->getOpcode()) {
default: return AMDGPUInstrInfo::expandPostRAPseudo(MI);
- // SGPR register spill
- case AMDGPU::SI_SPILL_S512_SAVE:
- case AMDGPU::SI_SPILL_S256_SAVE:
- case AMDGPU::SI_SPILL_S128_SAVE:
- case AMDGPU::SI_SPILL_S64_SAVE: {
- unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
- unsigned FrameIndex = MI->getOperand(2).getImm();
-
- for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
- SIMachineFunctionInfo::SpilledReg Spill;
- unsigned SubReg = RI.getPhysRegSubReg(MI->getOperand(1).getReg(),
- &AMDGPU::SGPR_32RegClass, i);
- Spill = MFI->SpillTracker.getSpilledReg(FrameIndex);
-
- BuildMI(MBB, MI, DL, get(AMDGPU::V_WRITELANE_B32),
- MI->getOperand(0).getReg())
- .addReg(SubReg)
- .addImm(Spill.Lane + i);
- }
- MI->eraseFromParent();
- break;
- }
-
- // SGPR register restore
- case AMDGPU::SI_SPILL_S512_RESTORE:
- case AMDGPU::SI_SPILL_S256_RESTORE:
- case AMDGPU::SI_SPILL_S128_RESTORE:
- case AMDGPU::SI_SPILL_S64_RESTORE:
- case AMDGPU::SI_SPILL_S32_RESTORE: {
- unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
-
- for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
- SIMachineFunctionInfo::SpilledReg Spill;
- unsigned FrameIndex = MI->getOperand(2).getImm();
- unsigned SubReg = RI.getPhysRegSubReg(MI->getOperand(0).getReg(),
- &AMDGPU::SGPR_32RegClass, i);
- Spill = MFI->SpillTracker.getSpilledReg(FrameIndex);
-
- BuildMI(MBB, MI, DL, get(AMDGPU::V_READLANE_B32), SubReg)
- .addReg(MI->getOperand(1).getReg())
- .addImm(Spill.Lane + i);
- }
- insertNOPs(MI, 3);
- MI->eraseFromParent();
- break;
- }
case AMDGPU::SI_CONSTDATA_PTR: {
unsigned Reg = MI->getOperand(0).getReg();
unsigned RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
@@ -369,7 +665,7 @@ bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
BuildMI(MBB, MI, DL, get(AMDGPU::S_GETPC_B64), Reg);
// Add 32-bit offset from this instruction to the start of the constant data.
- BuildMI(MBB, MI, DL, get(AMDGPU::S_ADD_I32), RegLo)
+ BuildMI(MBB, MI, DL, get(AMDGPU::S_ADD_U32), RegLo)
.addReg(RegLo)
.addTargetIndex(AMDGPU::TI_CONSTDATA_START)
.addReg(AMDGPU::SCC, RegState::Define | RegState::Implicit);
@@ -381,6 +677,39 @@ bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
MI->eraseFromParent();
break;
}
+ case AMDGPU::SGPR_USE:
+ // This is just a placeholder for register allocation.
+ MI->eraseFromParent();
+ break;
+
+ case AMDGPU::V_MOV_B64_PSEUDO: {
+ unsigned Dst = MI->getOperand(0).getReg();
+ unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
+ unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
+
+ const MachineOperand &SrcOp = MI->getOperand(1);
+ // FIXME: Will this work for 64-bit floating point immediates?
+ assert(!SrcOp.isFPImm());
+ if (SrcOp.isImm()) {
+ APInt Imm(64, SrcOp.getImm());
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
+ .addImm(Imm.getLoBits(32).getZExtValue())
+ .addReg(Dst, RegState::Implicit);
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
+ .addImm(Imm.getHiBits(32).getZExtValue())
+ .addReg(Dst, RegState::Implicit);
+ } else {
+ assert(SrcOp.isReg());
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
+ .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
+ .addReg(Dst, RegState::Implicit);
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
+ .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
+ .addReg(Dst, RegState::Implicit);
+ }
+ MI->eraseFromParent();
+ break;
+ }
}
return true;
}
@@ -388,35 +717,66 @@ bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
MachineInstr *SIInstrInfo::commuteInstruction(MachineInstr *MI,
bool NewMI) const {
- MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
- if (MI->getNumOperands() < 3 || !MI->getOperand(1).isReg())
+ if (MI->getNumOperands() < 3)
return nullptr;
- // Cannot commute VOP2 if src0 is SGPR.
- if (isVOP2(MI->getOpcode()) && MI->getOperand(1).isReg() &&
- RI.isSGPRClass(MRI.getRegClass(MI->getOperand(1).getReg())))
- return nullptr;
+ int Src0Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
+ AMDGPU::OpName::src0);
+ assert(Src0Idx != -1 && "Should always have src0 operand");
- if (!MI->getOperand(2).isReg()) {
- // XXX: Commute instructions with FPImm operands
- if (NewMI || MI->getOperand(2).isFPImm() ||
+ MachineOperand &Src0 = MI->getOperand(Src0Idx);
+ if (!Src0.isReg())
+ return nullptr;
+
+ int Src1Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
+ AMDGPU::OpName::src1);
+ if (Src1Idx == -1)
+ return nullptr;
+
+ MachineOperand &Src1 = MI->getOperand(Src1Idx);
+
+ // Make sure it's legal to commute operands for VOP2.
+ if (isVOP2(MI->getOpcode()) &&
+ (!isOperandLegal(MI, Src0Idx, &Src1) ||
+ !isOperandLegal(MI, Src1Idx, &Src0))) {
+ return nullptr;
+ }
+
+ if (!Src1.isReg()) {
+ // Allow commuting instructions with Imm operands.
+ if (NewMI || !Src1.isImm() ||
(!isVOP2(MI->getOpcode()) && !isVOP3(MI->getOpcode()))) {
return nullptr;
}
- // XXX: Commute VOP3 instructions with abs and neg set.
- if (isVOP3(MI->getOpcode()) &&
- (MI->getOperand(AMDGPU::getNamedOperandIdx(MI->getOpcode(),
- AMDGPU::OpName::abs)).getImm() ||
- MI->getOperand(AMDGPU::getNamedOperandIdx(MI->getOpcode(),
- AMDGPU::OpName::neg)).getImm()))
- return nullptr;
+ // Be sure to copy the source modifiers to the right place.
+ if (MachineOperand *Src0Mods
+ = getNamedOperand(*MI, AMDGPU::OpName::src0_modifiers)) {
+ MachineOperand *Src1Mods
+ = getNamedOperand(*MI, AMDGPU::OpName::src1_modifiers);
+
+ int Src0ModsVal = Src0Mods->getImm();
+ if (!Src1Mods && Src0ModsVal != 0)
+ return nullptr;
+
+ // XXX - This assert might be a lie. It might be useful to have a neg
+ // modifier with 0.0.
+ int Src1ModsVal = Src1Mods->getImm();
+ assert((Src1ModsVal == 0) && "Not expecting modifiers with immediates");
+
+ Src1Mods->setImm(Src0ModsVal);
+ Src0Mods->setImm(Src1ModsVal);
+ }
+
+ unsigned Reg = Src0.getReg();
+ unsigned SubReg = Src0.getSubReg();
+ if (Src1.isImm())
+ Src0.ChangeToImmediate(Src1.getImm());
+ else
+ llvm_unreachable("Should only have immediates");
- unsigned Reg = MI->getOperand(1).getReg();
- unsigned SubReg = MI->getOperand(1).getSubReg();
- MI->getOperand(1).ChangeToImmediate(MI->getOperand(2).getImm());
- MI->getOperand(2).ChangeToRegister(Reg, false);
- MI->getOperand(2).setSubReg(SubReg);
+ Src1.ChangeToRegister(Reg, false);
+ Src1.setSubReg(SubReg);
} else {
MI = TargetInstrInfo::commuteInstruction(MI, NewMI);
}
@@ -427,6 +787,44 @@ MachineInstr *SIInstrInfo::commuteInstruction(MachineInstr *MI,
return MI;
}
+// This needs to be implemented because the source modifiers may be inserted
+// between the true commutable operands, and the base
+// TargetInstrInfo::commuteInstruction uses it.
+bool SIInstrInfo::findCommutedOpIndices(MachineInstr *MI,
+ unsigned &SrcOpIdx1,
+ unsigned &SrcOpIdx2) const {
+ const MCInstrDesc &MCID = MI->getDesc();
+ if (!MCID.isCommutable())
+ return false;
+
+ unsigned Opc = MI->getOpcode();
+ int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
+ if (Src0Idx == -1)
+ return false;
+
+ // FIXME: Workaround TargetInstrInfo::commuteInstruction asserting on
+ // immediate.
+ if (!MI->getOperand(Src0Idx).isReg())
+ return false;
+
+ int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
+ if (Src1Idx == -1)
+ return false;
+
+ if (!MI->getOperand(Src1Idx).isReg())
+ return false;
+
+ // If any source modifiers are set, the generic instruction commuting won't
+ // understand how to copy the source modifiers.
+ if (hasModifiersSet(*MI, AMDGPU::OpName::src0_modifiers) ||
+ hasModifiersSet(*MI, AMDGPU::OpName::src1_modifiers))
+ return false;
+
+ SrcOpIdx1 = Src0Idx;
+ SrcOpIdx2 = Src1Idx;
+ return true;
+}
+
MachineInstr *SIInstrInfo::buildMovInstr(MachineBasicBlock *MBB,
MachineBasicBlock::iterator I,
unsigned DstReg,
@@ -463,51 +861,106 @@ SIInstrInfo::isTriviallyReMaterializable(const MachineInstr *MI,
}
}
-namespace llvm {
-namespace AMDGPU {
-// Helper function generated by tablegen. We are wrapping this with
-// an SIInstrInfo function that reutrns bool rather than int.
-int isDS(uint16_t Opcode);
-}
+static bool offsetsDoNotOverlap(int WidthA, int OffsetA,
+ int WidthB, int OffsetB) {
+ int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
+ int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
+ int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
+ return LowOffset + LowWidth <= HighOffset;
}
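The interval test above is small enough to check by hand; a standalone sketch (illustrative, not part of the patch) that exercises it with concrete widths and offsets:

#include <cstdio>

static bool offsetsDoNotOverlap(int WidthA, int OffsetA, int WidthB, int OffsetB) {
  int LowOffset  = OffsetA < OffsetB ? OffsetA : OffsetB;
  int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
  int LowWidth   = (LowOffset == OffsetA) ? WidthA : WidthB;
  // Disjoint when the lower access ends at or before the higher one starts.
  return LowOffset + LowWidth <= HighOffset;
}

int main() {
  std::printf("%d\n", offsetsDoNotOverlap(4, 0, 4, 4)); // 1: [0,4) vs [4,8)
  std::printf("%d\n", offsetsDoNotOverlap(4, 0, 4, 2)); // 0: [0,4) vs [2,6)
  return 0;
}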
-bool SIInstrInfo::isDS(uint16_t Opcode) const {
- return ::AMDGPU::isDS(Opcode) != -1;
-}
+bool SIInstrInfo::checkInstOffsetsDoNotOverlap(MachineInstr *MIa,
+ MachineInstr *MIb) const {
+ unsigned BaseReg0, Offset0;
+ unsigned BaseReg1, Offset1;
+
+ if (getLdStBaseRegImmOfs(MIa, BaseReg0, Offset0, &RI) &&
+ getLdStBaseRegImmOfs(MIb, BaseReg1, Offset1, &RI)) {
+ assert(MIa->hasOneMemOperand() && MIb->hasOneMemOperand() &&
+ "read2 / write2 not expected here yet");
+ unsigned Width0 = (*MIa->memoperands_begin())->getSize();
+ unsigned Width1 = (*MIb->memoperands_begin())->getSize();
+ if (BaseReg0 == BaseReg1 &&
+ offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1)) {
+ return true;
+ }
+ }
-int SIInstrInfo::isMIMG(uint16_t Opcode) const {
- return get(Opcode).TSFlags & SIInstrFlags::MIMG;
+ return false;
}
-int SIInstrInfo::isSMRD(uint16_t Opcode) const {
- return get(Opcode).TSFlags & SIInstrFlags::SMRD;
-}
+bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr *MIa,
+ MachineInstr *MIb,
+ AliasAnalysis *AA) const {
+ unsigned Opc0 = MIa->getOpcode();
+ unsigned Opc1 = MIb->getOpcode();
-bool SIInstrInfo::isVOP1(uint16_t Opcode) const {
- return get(Opcode).TSFlags & SIInstrFlags::VOP1;
-}
+ assert(MIa && (MIa->mayLoad() || MIa->mayStore()) &&
+ "MIa must load from or modify a memory location");
+ assert(MIb && (MIb->mayLoad() || MIb->mayStore()) &&
+ "MIb must load from or modify a memory location");
-bool SIInstrInfo::isVOP2(uint16_t Opcode) const {
- return get(Opcode).TSFlags & SIInstrFlags::VOP2;
-}
+ if (MIa->hasUnmodeledSideEffects() || MIb->hasUnmodeledSideEffects())
+ return false;
-bool SIInstrInfo::isVOP3(uint16_t Opcode) const {
- return get(Opcode).TSFlags & SIInstrFlags::VOP3;
-}
+ // XXX - Can we relax this between address spaces?
+ if (MIa->hasOrderedMemoryRef() || MIb->hasOrderedMemoryRef())
+ return false;
-bool SIInstrInfo::isVOPC(uint16_t Opcode) const {
- return get(Opcode).TSFlags & SIInstrFlags::VOPC;
-}
+ // TODO: Should we check the address space from the MachineMemOperand? That
+ // would allow us to distinguish objects we know don't alias based on the
+ // underlying address space, even if it was lowered to a different one,
+ // e.g. private accesses lowered to use MUBUF instructions on a scratch
+ // buffer.
+ if (isDS(Opc0)) {
+ if (isDS(Opc1))
+ return checkInstOffsetsDoNotOverlap(MIa, MIb);
+
+ return !isFLAT(Opc1);
+ }
+
+ if (isMUBUF(Opc0) || isMTBUF(Opc0)) {
+ if (isMUBUF(Opc1) || isMTBUF(Opc1))
+ return checkInstOffsetsDoNotOverlap(MIa, MIb);
+
+ return !isFLAT(Opc1) && !isSMRD(Opc1);
+ }
+
+ if (isSMRD(Opc0)) {
+ if (isSMRD(Opc1))
+ return checkInstOffsetsDoNotOverlap(MIa, MIb);
-bool SIInstrInfo::isSALUInstr(const MachineInstr &MI) const {
- return get(MI.getOpcode()).TSFlags & SIInstrFlags::SALU;
+ return !isFLAT(Opc1) && !isMUBUF(Opc0) && !isMTBUF(Opc0);
+ }
+
+ if (isFLAT(Opc0)) {
+ if (isFLAT(Opc1))
+ return checkInstOffsetsDoNotOverlap(MIa, MIb);
+
+ return false;
+ }
+
+ return false;
}
bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
- int32_t Val = Imm.getSExtValue();
- if (Val >= -16 && Val <= 64)
+ int64_t SVal = Imm.getSExtValue();
+ if (SVal >= -16 && SVal <= 64)
return true;
+ if (Imm.getBitWidth() == 64) {
+ uint64_t Val = Imm.getZExtValue();
+ return (DoubleToBits(0.0) == Val) ||
+ (DoubleToBits(1.0) == Val) ||
+ (DoubleToBits(-1.0) == Val) ||
+ (DoubleToBits(0.5) == Val) ||
+ (DoubleToBits(-0.5) == Val) ||
+ (DoubleToBits(2.0) == Val) ||
+ (DoubleToBits(-2.0) == Val) ||
+ (DoubleToBits(4.0) == Val) ||
+ (DoubleToBits(-4.0) == Val);
+ }
+
// The actual type of the operand does not seem to matter as long
// as the bits match one of the inline immediate values. For example:
//
@@ -516,32 +969,28 @@ bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
//
// 1065353216 has the hexadecimal encoding 0x3f800000 which is 1.0f in
// floating-point, so it is a legal inline immediate.
-
- return (APInt::floatToBits(0.0f) == Imm) ||
- (APInt::floatToBits(1.0f) == Imm) ||
- (APInt::floatToBits(-1.0f) == Imm) ||
- (APInt::floatToBits(0.5f) == Imm) ||
- (APInt::floatToBits(-0.5f) == Imm) ||
- (APInt::floatToBits(2.0f) == Imm) ||
- (APInt::floatToBits(-2.0f) == Imm) ||
- (APInt::floatToBits(4.0f) == Imm) ||
- (APInt::floatToBits(-4.0f) == Imm);
+ uint32_t Val = Imm.getZExtValue();
+
+ return (FloatToBits(0.0f) == Val) ||
+ (FloatToBits(1.0f) == Val) ||
+ (FloatToBits(-1.0f) == Val) ||
+ (FloatToBits(0.5f) == Val) ||
+ (FloatToBits(-0.5f) == Val) ||
+ (FloatToBits(2.0f) == Val) ||
+ (FloatToBits(-2.0f) == Val) ||
+ (FloatToBits(4.0f) == Val) ||
+ (FloatToBits(-4.0f) == Val);
}
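The comment about 1065353216 being 0x3f800000 (1.0f) can be verified outside the compiler; a minimal standalone sketch (not part of the patch; floatToBits here is a stand-in for llvm::FloatToBits):

#include <cstdint>
#include <cstdio>
#include <cstring>

// Stand-in for llvm::FloatToBits: reinterpret a float's bit pattern as u32.
static uint32_t floatToBits(float F) {
  uint32_t Bits;
  std::memcpy(&Bits, &F, sizeof(Bits));
  return Bits;
}

int main() {
  const float InlineFP[] = {0.0f, 1.0f, -1.0f, 0.5f, -0.5f, 2.0f, -2.0f, 4.0f, -4.0f};
  uint32_t Val = 1065353216u; // 0x3f800000 == 1.0f, so it is a legal inline immediate
  bool IsInline = false;
  for (float F : InlineFP)
    IsInline = IsInline || (floatToBits(F) == Val);
  std::printf("0x%08x inline? %s\n", Val, IsInline ? "yes" : "no"); // prints "yes"
  return 0;
}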
bool SIInstrInfo::isInlineConstant(const MachineOperand &MO) const {
if (MO.isImm())
return isInlineConstant(APInt(32, MO.getImm(), true));
- if (MO.isFPImm()) {
- APFloat FpImm = MO.getFPImm()->getValueAPF();
- return isInlineConstant(FpImm.bitcastToAPInt());
- }
-
return false;
}
bool SIInstrInfo::isLiteralConstant(const MachineOperand &MO) const {
- return (MO.isImm() || MO.isFPImm()) && !isInlineConstant(MO);
+ return MO.isImm() && !isInlineConstant(MO);
}
static bool compareMachineOp(const MachineOperand &Op0,
@@ -554,8 +1003,6 @@ static bool compareMachineOp(const MachineOperand &Op0,
return Op0.getReg() == Op1.getReg();
case MachineOperand::MO_Immediate:
return Op0.getImm() == Op1.getImm();
- case MachineOperand::MO_FPImmediate:
- return Op0.getFPImm() == Op1.getFPImm();
default:
llvm_unreachable("Didn't expect to be comparing these operand types");
}
@@ -565,7 +1012,7 @@ bool SIInstrInfo::isImmOperandLegal(const MachineInstr *MI, unsigned OpNo,
const MachineOperand &MO) const {
const MCOperandInfo &OpInfo = get(MI->getOpcode()).OpInfo[OpNo];
- assert(MO.isImm() || MO.isFPImm());
+ assert(MO.isImm() || MO.isTargetIndex() || MO.isFI());
if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
return true;
@@ -573,16 +1020,91 @@ bool SIInstrInfo::isImmOperandLegal(const MachineInstr *MI, unsigned OpNo,
if (OpInfo.RegClass < 0)
return false;
- return RI.regClassCanUseImmediate(OpInfo.RegClass);
+ if (isLiteralConstant(MO))
+ return RI.opCanUseLiteralConstant(OpInfo.OperandType);
+
+ return RI.opCanUseInlineConstant(OpInfo.OperandType);
+}
+
+bool SIInstrInfo::canFoldOffset(unsigned OffsetSize, unsigned AS) const {
+ switch (AS) {
+ case AMDGPUAS::GLOBAL_ADDRESS: {
+ // MUBUF instructions have a 12-bit offset in bytes.
+ return isUInt<12>(OffsetSize);
+ }
+ case AMDGPUAS::CONSTANT_ADDRESS: {
+ // SMRD instructions have an 8-bit offset in dwords on SI and
+ // a 20-bit offset in bytes on VI.
+ if (RI.ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
+ return isUInt<20>(OffsetSize);
+ else
+ return (OffsetSize % 4 == 0) && isUInt<8>(OffsetSize / 4);
+ }
+ case AMDGPUAS::LOCAL_ADDRESS:
+ case AMDGPUAS::REGION_ADDRESS: {
+ // The single offset versions have a 16-bit offset in bytes.
+ return isUInt<16>(OffsetSize);
+ }
+ case AMDGPUAS::PRIVATE_ADDRESS:
+ // Indirect register addressing does not use any offsets.
+ default:
+ return 0;
+ }
}
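As a sanity check on the ranges described in canFoldOffset, here is a standalone sketch (not part of the patch; the enum and bounds are restated from the comments above, with isUInt&lt;N&gt; spelled out as explicit limits):

#include <cstdint>
#include <cstdio>

enum class AddrSpace { Global, ConstantSI, ConstantVI, Local };

static bool fitsOffset(AddrSpace AS, uint64_t OffsetSize) {
  switch (AS) {
  case AddrSpace::Global:     // MUBUF: 12-bit byte offset
    return OffsetSize < (1u << 12);
  case AddrSpace::ConstantSI: // SMRD on SI: 8-bit offset in dwords
    return (OffsetSize % 4 == 0) && (OffsetSize / 4) < (1u << 8);
  case AddrSpace::ConstantVI: // SMRD on VI: 20-bit byte offset
    return OffsetSize < (1u << 20);
  case AddrSpace::Local:      // DS single-offset forms: 16-bit byte offset
    return OffsetSize < (1u << 16);
  }
  return false;
}

int main() {
  std::printf("%d\n", fitsOffset(AddrSpace::Global, 4095));     // 1: fits in 12 bits
  std::printf("%d\n", fitsOffset(AddrSpace::ConstantSI, 1024)); // 0: 256 dwords needs 9 bits
  return 0;
}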
bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
return AMDGPU::getVOPe32(Opcode) != -1;
}
+bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
+ // The src0_modifier operand is present on all instructions
+ // that have modifiers.
+
+ return AMDGPU::getNamedOperandIdx(Opcode,
+ AMDGPU::OpName::src0_modifiers) != -1;
+}
+
+bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,
+ unsigned OpName) const {
+ const MachineOperand *Mods = getNamedOperand(MI, OpName);
+ return Mods && Mods->getImm();
+}
+
+bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
+ const MachineOperand &MO) const {
+ // Literal constants use the constant bus.
+ if (isLiteralConstant(MO))
+ return true;
+
+ if (!MO.isReg() || !MO.isUse())
+ return false;
+
+ if (TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+ return RI.isSGPRClass(MRI.getRegClass(MO.getReg()));
+
+ // FLAT_SCR is just an SGPR pair.
+ if (!MO.isImplicit() && (MO.getReg() == AMDGPU::FLAT_SCR))
+ return true;
+
+ // EXEC register uses the constant bus.
+ if (!MO.isImplicit() && MO.getReg() == AMDGPU::EXEC)
+ return true;
+
+ // SGPRs use the constant bus
+ if (MO.getReg() == AMDGPU::M0 || MO.getReg() == AMDGPU::VCC ||
+ (!MO.isImplicit() &&
+ (AMDGPU::SGPR_32RegClass.contains(MO.getReg()) ||
+ AMDGPU::SGPR_64RegClass.contains(MO.getReg())))) {
+ return true;
+ }
+
+ return false;
+}
+
bool SIInstrInfo::verifyInstruction(const MachineInstr *MI,
StringRef &ErrInfo) const {
uint16_t Opcode = MI->getOpcode();
+ const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
@@ -596,23 +1118,27 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI,
}
// Make sure the register classes are correct
- for (unsigned i = 0, e = Desc.getNumOperands(); i != e; ++i) {
+ for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
+ if (MI->getOperand(i).isFPImm()) {
+ ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
+ "all fp values to integers.";
+ return false;
+ }
+
switch (Desc.OpInfo[i].OperandType) {
case MCOI::OPERAND_REGISTER: {
- int RegClass = Desc.OpInfo[i].RegClass;
- if (!RI.regClassCanUseImmediate(RegClass) &&
- (MI->getOperand(i).isImm() || MI->getOperand(i).isFPImm())) {
- ErrInfo = "Expected register, but got immediate";
- return false;
+ if (MI->getOperand(i).isImm() &&
+ !isImmOperandLegal(MI, i, MI->getOperand(i))) {
+ ErrInfo = "Illegal immediate value for operand.";
+ return false;
+ }
}
- }
break;
case MCOI::OPERAND_IMMEDIATE:
// Check if this operand is an immediate.
// FrameIndex operands will be replaced by immediates, so they are
// allowed.
- if (!MI->getOperand(i).isImm() && !MI->getOperand(i).isFPImm() &&
- !MI->getOperand(i).isFI()) {
+ if (!MI->getOperand(i).isImm() && !MI->getOperand(i).isFI()) {
ErrInfo = "Expected immediate, but got non-immediate";
return false;
}
@@ -641,31 +1167,27 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI,
// Verify VOP*
if (isVOP1(Opcode) || isVOP2(Opcode) || isVOP3(Opcode) || isVOPC(Opcode)) {
+ // Only look at the true operands. Only a real operand can use the constant
+ // bus, and we don't want to check pseudo-operands like the source modifier
+ // flags.
+ const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx };
+
unsigned ConstantBusCount = 0;
unsigned SGPRUsed = AMDGPU::NoRegister;
- for (int i = 0, e = MI->getNumOperands(); i != e; ++i) {
- const MachineOperand &MO = MI->getOperand(i);
- if (MO.isReg() && MO.isUse() &&
- !TargetRegisterInfo::isVirtualRegister(MO.getReg())) {
-
- // EXEC register uses the constant bus.
- if (!MO.isImplicit() && MO.getReg() == AMDGPU::EXEC)
- ++ConstantBusCount;
+ for (int OpIdx : OpIndices) {
+ if (OpIdx == -1)
+ break;
- // SGPRs use the constant bus
- if (MO.getReg() == AMDGPU::M0 || MO.getReg() == AMDGPU::VCC ||
- (!MO.isImplicit() &&
- (AMDGPU::SGPR_32RegClass.contains(MO.getReg()) ||
- AMDGPU::SGPR_64RegClass.contains(MO.getReg())))) {
- if (SGPRUsed != MO.getReg()) {
+ const MachineOperand &MO = MI->getOperand(OpIdx);
+ if (usesConstantBus(MRI, MO)) {
+ if (MO.isReg()) {
+ if (MO.getReg() != SGPRUsed)
++ConstantBusCount;
- SGPRUsed = MO.getReg();
- }
+ SGPRUsed = MO.getReg();
+ } else {
+ ++ConstantBusCount;
}
}
- // Literal constants use the constant bus.
- if (isLiteralConstant(MO))
- ++ConstantBusCount;
}
if (ConstantBusCount > 1) {
ErrInfo = "VOP* instruction uses the constant bus more than once";
@@ -676,7 +1198,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI,
// Verify SRC1 for VOP2 and VOPC
if (Src1Idx != -1 && (isVOP2(Opcode) || isVOPC(Opcode))) {
const MachineOperand &Src1 = MI->getOperand(Src1Idx);
- if (Src1.isImm() || Src1.isFPImm()) {
+ if (Src1.isImm()) {
ErrInfo = "VOP[2C] src1 cannot be an immediate.";
return false;
}
@@ -701,11 +1223,9 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI,
// Verify misc. restrictions on specific instructions.
if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32 ||
Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64) {
- MI->dump();
-
- const MachineOperand &Src0 = MI->getOperand(2);
- const MachineOperand &Src1 = MI->getOperand(3);
- const MachineOperand &Src2 = MI->getOperand(4);
+ const MachineOperand &Src0 = MI->getOperand(Src0Idx);
+ const MachineOperand &Src1 = MI->getOperand(Src1Idx);
+ const MachineOperand &Src2 = MI->getOperand(Src2Idx);
if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
if (!compareMachineOp(Src0, Src1) &&
!compareMachineOp(Src0, Src2)) {
@@ -728,10 +1248,13 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) {
case AMDGPU::S_MOV_B32:
return MI.getOperand(1).isReg() ?
AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
- case AMDGPU::S_ADD_I32: return AMDGPU::V_ADD_I32_e32;
+ case AMDGPU::S_ADD_I32:
+ case AMDGPU::S_ADD_U32: return AMDGPU::V_ADD_I32_e32;
case AMDGPU::S_ADDC_U32: return AMDGPU::V_ADDC_U32_e32;
- case AMDGPU::S_SUB_I32: return AMDGPU::V_SUB_I32_e32;
+ case AMDGPU::S_SUB_I32:
+ case AMDGPU::S_SUB_U32: return AMDGPU::V_SUB_I32_e32;
case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
+ case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_I32;
case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e32;
case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e32;
case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e32;
@@ -779,8 +1302,13 @@ const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
const MCInstrDesc &Desc = get(MI.getOpcode());
if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
- Desc.OpInfo[OpNo].RegClass == -1)
- return MRI.getRegClass(MI.getOperand(OpNo).getReg());
+ Desc.OpInfo[OpNo].RegClass == -1) {
+ unsigned Reg = MI.getOperand(OpNo).getReg();
+
+ if (TargetRegisterInfo::isVirtualRegister(Reg))
+ return MRI.getRegClass(Reg);
+ return RI.getRegClass(Reg);
+ }
unsigned RCID = Desc.OpInfo[OpNo].RegClass;
return RI.getRegClass(RCID);
@@ -800,21 +1328,28 @@ bool SIInstrInfo::canReadVGPR(const MachineInstr &MI, unsigned OpNo) const {
void SIInstrInfo::legalizeOpWithMove(MachineInstr *MI, unsigned OpIdx) const {
MachineBasicBlock::iterator I = MI;
+ MachineBasicBlock *MBB = MI->getParent();
MachineOperand &MO = MI->getOperand(OpIdx);
- MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
+ MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
unsigned RCID = get(MI->getOpcode()).OpInfo[OpIdx].RegClass;
const TargetRegisterClass *RC = RI.getRegClass(RCID);
unsigned Opcode = AMDGPU::V_MOV_B32_e32;
- if (MO.isReg()) {
+ if (MO.isReg())
Opcode = AMDGPU::COPY;
- } else if (RI.isSGPRClass(RC)) {
+ else if (RI.isSGPRClass(RC))
Opcode = AMDGPU::S_MOV_B32;
- }
+
const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
+ if (RI.getCommonSubClass(&AMDGPU::VReg_64RegClass, VRC))
+ VRC = &AMDGPU::VReg_64RegClass;
+ else
+ VRC = &AMDGPU::VGPR_32RegClass;
+
unsigned Reg = MRI.createVirtualRegister(VRC);
- BuildMI(*MI->getParent(), I, MI->getParent()->findDebugLoc(I), get(Opcode),
- Reg).addOperand(MO);
+ DebugLoc DL = MBB->findDebugLoc(I);
+ BuildMI(*MI->getParent(), I, DL, get(Opcode), Reg)
+ .addOperand(MO);
MO.ChangeToRegister(Reg, false);
}
@@ -834,13 +1369,15 @@ unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI,
// value so we don't need to worry about merging its subreg index with the
// SubIdx passed to this function. The register coalescer should be able to
// eliminate this extra copy.
- BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), get(TargetOpcode::COPY),
- NewSuperReg)
- .addOperand(SuperReg);
+ MachineBasicBlock *MBB = MI->getParent();
+ DebugLoc DL = MI->getDebugLoc();
+
+ BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), NewSuperReg)
+ .addReg(SuperReg.getReg(), 0, SuperReg.getSubReg());
+
+ BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
+ .addReg(NewSuperReg, 0, SubIdx);
- BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), get(TargetOpcode::COPY),
- SubReg)
- .addReg(NewSuperReg, 0, SubIdx);
return SubReg;
}
@@ -896,8 +1433,68 @@ unsigned SIInstrInfo::split64BitImm(SmallVectorImpl<MachineInstr *> &Worklist,
return Dst;
}
+// Change the order of operands from (0, 1, 2) to (0, 2, 1)
+void SIInstrInfo::swapOperands(MachineBasicBlock::iterator Inst) const {
+ assert(Inst->getNumExplicitOperands() == 3);
+ MachineOperand Op1 = Inst->getOperand(1);
+ Inst->RemoveOperand(1);
+ Inst->addOperand(Op1);
+}
+
+bool SIInstrInfo::isOperandLegal(const MachineInstr *MI, unsigned OpIdx,
+ const MachineOperand *MO) const {
+ const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
+ const MCInstrDesc &InstDesc = get(MI->getOpcode());
+ const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpIdx];
+ const TargetRegisterClass *DefinedRC =
+ OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr;
+ if (!MO)
+ MO = &MI->getOperand(OpIdx);
+
+ if (isVALU(InstDesc.Opcode) && usesConstantBus(MRI, *MO)) {
+ unsigned SGPRUsed =
+ MO->isReg() ? MO->getReg() : (unsigned)AMDGPU::NoRegister;
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ if (i == OpIdx)
+ continue;
+ if (usesConstantBus(MRI, MI->getOperand(i)) &&
+ MI->getOperand(i).isReg() && MI->getOperand(i).getReg() != SGPRUsed) {
+ return false;
+ }
+ }
+ }
+
+ if (MO->isReg()) {
+ assert(DefinedRC);
+ const TargetRegisterClass *RC = MRI.getRegClass(MO->getReg());
+
+ // In order to be legal, the common sub-class must be equal to the
+ // class of the current operand. For example:
+ //
+ // v_mov_b32 s0 ; Operand defined as vsrc_32
+ // ; RI.getCommonSubClass(s0,vsrc_32) = sgpr ; LEGAL
+ //
+ // s_sendmsg 0, s0 ; Operand defined as m0reg
+ // ; RI.getCommonSubClass(s0,m0reg) = m0reg ; NOT LEGAL
+
+ return RI.getCommonSubClass(RC, RI.getRegClass(OpInfo.RegClass)) == RC;
+ }
+
+
+ // Handle non-register types that are treated like immediates.
+ assert(MO->isImm() || MO->isTargetIndex() || MO->isFI());
+
+ if (!DefinedRC) {
+ // This operand expects an immediate.
+ return true;
+ }
+
+ return isImmOperandLegal(MI, OpIdx, *MO);
+}
+
void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
+
int Src0Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
AMDGPU::OpName::src0);
int Src1Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
@@ -907,45 +1504,40 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
// Legalize VOP2
if (isVOP2(MI->getOpcode()) && Src1Idx != -1) {
- MachineOperand &Src0 = MI->getOperand(Src0Idx);
- MachineOperand &Src1 = MI->getOperand(Src1Idx);
-
- // If the instruction implicitly reads VCC, we can't have any SGPR operands,
- // so move any.
- bool ReadsVCC = MI->readsRegister(AMDGPU::VCC, &RI);
- if (ReadsVCC && Src0.isReg() &&
- RI.isSGPRClass(MRI.getRegClass(Src0.getReg()))) {
+ // Legalize src0
+ if (!isOperandLegal(MI, Src0Idx))
legalizeOpWithMove(MI, Src0Idx);
- return;
- }
- if (ReadsVCC && Src1.isReg() &&
- RI.isSGPRClass(MRI.getRegClass(Src1.getReg()))) {
- legalizeOpWithMove(MI, Src1Idx);
+ // Legalize src1
+ if (isOperandLegal(MI, Src1Idx))
return;
- }
- // Legalize VOP2 instructions where src1 is not a VGPR. An SGPR input must
- // be the first operand, and there can only be one.
- if (Src1.isImm() || Src1.isFPImm() ||
- (Src1.isReg() && RI.isSGPRClass(MRI.getRegClass(Src1.getReg())))) {
- if (MI->isCommutable()) {
- if (commuteInstruction(MI))
- return;
- }
- legalizeOpWithMove(MI, Src1Idx);
+ // Usually src0 of VOP2 instructions allows more types of inputs
+ // than src1, so try to commute the instruction to decrease our
+ // chances of having to insert a MOV instruction to legalize src1.
+ if (MI->isCommutable()) {
+ if (commuteInstruction(MI))
+ // If we are successful in commuting, then we know MI is legal, so
+ // we are done.
+ return;
}
+
+ legalizeOpWithMove(MI, Src1Idx);
+ return;
}
// XXX - Do any VOP3 instructions read VCC?
// Legalize VOP3
if (isVOP3(MI->getOpcode())) {
- int VOP3Idx[3] = {Src0Idx, Src1Idx, Src2Idx};
- unsigned SGPRReg = AMDGPU::NoRegister;
+ int VOP3Idx[3] = { Src0Idx, Src1Idx, Src2Idx };
+
+ // Find the one SGPR operand we are allowed to use.
+ unsigned SGPRReg = findUsedSGPR(MI, VOP3Idx);
+
for (unsigned i = 0; i < 3; ++i) {
int Idx = VOP3Idx[i];
if (Idx == -1)
- continue;
+ break;
MachineOperand &MO = MI->getOperand(Idx);
if (MO.isReg()) {
@@ -1045,106 +1637,216 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
// Legalize MUBUF* instructions
// FIXME: If we start using the non-addr64 instructions for compute, we
// may need to legalize them here.
+ int SRsrcIdx =
+ AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::srsrc);
+ if (SRsrcIdx != -1) {
+ // We have an MUBUF instruction
+ MachineOperand *SRsrc = &MI->getOperand(SRsrcIdx);
+ unsigned SRsrcRC = get(MI->getOpcode()).OpInfo[SRsrcIdx].RegClass;
+ if (RI.getCommonSubClass(MRI.getRegClass(SRsrc->getReg()),
+ RI.getRegClass(SRsrcRC))) {
+ // The operands are legal.
+ // FIXME: We may need to legalize operands besides srsrc.
+ return;
+ }
- int SRsrcIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
- AMDGPU::OpName::srsrc);
- int VAddrIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
- AMDGPU::OpName::vaddr);
- if (SRsrcIdx != -1 && VAddrIdx != -1) {
- const TargetRegisterClass *VAddrRC =
- RI.getRegClass(get(MI->getOpcode()).OpInfo[VAddrIdx].RegClass);
-
- if(VAddrRC->getSize() == 8 &&
- MRI.getRegClass(MI->getOperand(SRsrcIdx).getReg()) != VAddrRC) {
- // We have a MUBUF instruction that uses a 64-bit vaddr register and
- // srsrc has the incorrect register class. In order to fix this, we
- // need to extract the pointer from the resource descriptor (srsrc),
- // add it to the value of vadd, then store the result in the vaddr
- // operand. Then, we need to set the pointer field of the resource
- // descriptor to zero.
+ MachineBasicBlock &MBB = *MI->getParent();
+ // Extract the ptr from the resource descriptor.
- MachineBasicBlock &MBB = *MI->getParent();
- MachineOperand &SRsrcOp = MI->getOperand(SRsrcIdx);
- MachineOperand &VAddrOp = MI->getOperand(VAddrIdx);
- unsigned SRsrcPtrLo, SRsrcPtrHi, VAddrLo, VAddrHi;
- unsigned NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);
- unsigned NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);
- unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
- unsigned Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
- unsigned SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
- unsigned SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
- unsigned NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass);
-
- // SRsrcPtrLo = srsrc:sub0
- SRsrcPtrLo = buildExtractSubReg(MI, MRI, SRsrcOp,
- &AMDGPU::VReg_128RegClass, AMDGPU::sub0, &AMDGPU::VReg_32RegClass);
-
- // SRsrcPtrHi = srsrc:sub1
- SRsrcPtrHi = buildExtractSubReg(MI, MRI, SRsrcOp,
- &AMDGPU::VReg_128RegClass, AMDGPU::sub1, &AMDGPU::VReg_32RegClass);
-
- // VAddrLo = vaddr:sub0
- VAddrLo = buildExtractSubReg(MI, MRI, VAddrOp,
- &AMDGPU::VReg_64RegClass, AMDGPU::sub0, &AMDGPU::VReg_32RegClass);
-
- // VAddrHi = vaddr:sub1
- VAddrHi = buildExtractSubReg(MI, MRI, VAddrOp,
- &AMDGPU::VReg_64RegClass, AMDGPU::sub1, &AMDGPU::VReg_32RegClass);
-
- // NewVaddrLo = SRsrcPtrLo + VAddrLo
+ // SRsrcPtrLo = srsrc:sub0
+ unsigned SRsrcPtrLo = buildExtractSubReg(MI, MRI, *SRsrc,
+ &AMDGPU::VReg_128RegClass, AMDGPU::sub0, &AMDGPU::VGPR_32RegClass);
+
+ // SRsrcPtrHi = srsrc:sub1
+ unsigned SRsrcPtrHi = buildExtractSubReg(MI, MRI, *SRsrc,
+ &AMDGPU::VReg_128RegClass, AMDGPU::sub1, &AMDGPU::VGPR_32RegClass);
+
+ // Create an empty resource descriptor
+ unsigned Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ unsigned SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+ unsigned SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+ unsigned NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass);
+ uint64_t RsrcDataFormat = getDefaultRsrcDataFormat();
+
+ // Zero64 = 0
+ BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B64),
+ Zero64)
+ .addImm(0);
+
+ // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
+ BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32),
+ SRsrcFormatLo)
+ .addImm(RsrcDataFormat & 0xFFFFFFFF);
+
+ // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
+ BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32),
+ SRsrcFormatHi)
+ .addImm(RsrcDataFormat >> 32);
+
+ // NewSRsrc = {Zero64, SRsrcFormat}
+ BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
+ NewSRsrc)
+ .addReg(Zero64)
+ .addImm(AMDGPU::sub0_sub1)
+ .addReg(SRsrcFormatLo)
+ .addImm(AMDGPU::sub2)
+ .addReg(SRsrcFormatHi)
+ .addImm(AMDGPU::sub3);
+
+ MachineOperand *VAddr = getNamedOperand(*MI, AMDGPU::OpName::vaddr);
+ unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
+ unsigned NewVAddrLo;
+ unsigned NewVAddrHi;
+ if (VAddr) {
+ // This is already an ADDR64 instruction so we need to add the pointer
+ // extracted from the resource descriptor to the current value of VAddr.
+ NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+
+ // NewVaddrLo = SRsrcPtrLo + VAddr:sub0
BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::V_ADD_I32_e32),
NewVAddrLo)
.addReg(SRsrcPtrLo)
- .addReg(VAddrLo)
- .addReg(AMDGPU::VCC, RegState::Define | RegState::Implicit);
+ .addReg(VAddr->getReg(), 0, AMDGPU::sub0)
+ .addReg(AMDGPU::VCC, RegState::ImplicitDefine);
- // NewVaddrHi = SRsrcPtrHi + VAddrHi
+ // NewVaddrHi = SRsrcPtrHi + VAddr:sub1
BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::V_ADDC_U32_e32),
NewVAddrHi)
.addReg(SRsrcPtrHi)
- .addReg(VAddrHi)
+ .addReg(VAddr->getReg(), 0, AMDGPU::sub1)
.addReg(AMDGPU::VCC, RegState::ImplicitDefine)
.addReg(AMDGPU::VCC, RegState::Implicit);
- // NewVaddr = {NewVaddrHi, NewVaddrLo}
- BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
- NewVAddr)
- .addReg(NewVAddrLo)
- .addImm(AMDGPU::sub0)
- .addReg(NewVAddrHi)
- .addImm(AMDGPU::sub1);
+ } else {
+ // This instruction is the _OFFSET variant, so we need to convert it to
+ // ADDR64.
+ MachineOperand *VData = getNamedOperand(*MI, AMDGPU::OpName::vdata);
+ MachineOperand *Offset = getNamedOperand(*MI, AMDGPU::OpName::offset);
+ MachineOperand *SOffset = getNamedOperand(*MI, AMDGPU::OpName::soffset);
+ assert(SOffset->isImm() && SOffset->getImm() == 0 && "Legalizing MUBUF "
+ "with non-zero soffset is not implemented");
+ (void)SOffset;
+
+ // Create the new instruction.
+ unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI->getOpcode());
+ MachineInstr *Addr64 =
+ BuildMI(MBB, MI, MI->getDebugLoc(), get(Addr64Opcode))
+ .addOperand(*VData)
+ .addOperand(*SRsrc)
+ .addReg(AMDGPU::NoRegister) // Dummy value for vaddr.
+ // This will be replaced later
+ // with the new value of vaddr.
+ .addOperand(*Offset);
+
+ MI->removeFromParent();
+ MI = Addr64;
+
+ NewVAddrLo = SRsrcPtrLo;
+ NewVAddrHi = SRsrcPtrHi;
+ VAddr = getNamedOperand(*MI, AMDGPU::OpName::vaddr);
+ SRsrc = getNamedOperand(*MI, AMDGPU::OpName::srsrc);
+ }
- // Zero64 = 0
- BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B64),
- Zero64)
- .addImm(0);
+ // NewVaddr = {NewVaddrHi, NewVaddrLo}
+ BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
+ NewVAddr)
+ .addReg(NewVAddrLo)
+ .addImm(AMDGPU::sub0)
+ .addReg(NewVAddrHi)
+ .addImm(AMDGPU::sub1);
- // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
- BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32),
- SRsrcFormatLo)
- .addImm(AMDGPU::RSRC_DATA_FORMAT & 0xFFFFFFFF);
-
- // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
- BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32),
- SRsrcFormatHi)
- .addImm(AMDGPU::RSRC_DATA_FORMAT >> 32);
-
- // NewSRsrc = {Zero64, SRsrcFormat}
- BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
- NewSRsrc)
- .addReg(Zero64)
- .addImm(AMDGPU::sub0_sub1)
- .addReg(SRsrcFormatLo)
- .addImm(AMDGPU::sub2)
- .addReg(SRsrcFormatHi)
- .addImm(AMDGPU::sub3);
- // Update the instruction to use NewVaddr
- MI->getOperand(VAddrIdx).setReg(NewVAddr);
- // Update the instruction to use NewSRsrc
- MI->getOperand(SRsrcIdx).setReg(NewSRsrc);
+ // Update the instruction to use NewVaddr
+ VAddr->setReg(NewVAddr);
+ // Update the instruction to use NewSRsrc
+ SRsrc->setReg(NewSRsrc);
+ }
+}
+
+void SIInstrInfo::splitSMRD(MachineInstr *MI,
+ const TargetRegisterClass *HalfRC,
+ unsigned HalfImmOp, unsigned HalfSGPROp,
+ MachineInstr *&Lo, MachineInstr *&Hi) const {
+
+ DebugLoc DL = MI->getDebugLoc();
+ MachineBasicBlock *MBB = MI->getParent();
+ MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+ unsigned RegLo = MRI.createVirtualRegister(HalfRC);
+ unsigned RegHi = MRI.createVirtualRegister(HalfRC);
+ unsigned HalfSize = HalfRC->getSize();
+ const MachineOperand *OffOp =
+ getNamedOperand(*MI, AMDGPU::OpName::offset);
+ const MachineOperand *SBase = getNamedOperand(*MI, AMDGPU::OpName::sbase);
+
+ // The SMRD has an 8-bit offset in dwords on SI and a 20-bit offset in bytes
+ // on VI.
+ if (OffOp) {
+ bool isVI = RI.ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS;
+ unsigned OffScale = isVI ? 1 : 4;
+ // Handle the _IMM variant
+ unsigned LoOffset = OffOp->getImm() * OffScale;
+ unsigned HiOffset = LoOffset + HalfSize;
+ Lo = BuildMI(*MBB, MI, DL, get(HalfImmOp), RegLo)
+ .addOperand(*SBase)
+ .addImm(LoOffset / OffScale);
+
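+    // If the second half's offset no longer fits the immediate field (a 20-bit
+    // byte offset on VI, an 8-bit dword offset on SI), fall back to the SGPR
+    // variant for the high half.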
+ if (!isUInt<20>(HiOffset) || (!isVI && !isUInt<8>(HiOffset / OffScale))) {
+ unsigned OffsetSGPR =
+ MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ BuildMI(*MBB, MI, DL, get(AMDGPU::S_MOV_B32), OffsetSGPR)
+ .addImm(HiOffset); // The offset in register is in bytes.
+ Hi = BuildMI(*MBB, MI, DL, get(HalfSGPROp), RegHi)
+ .addOperand(*SBase)
+ .addReg(OffsetSGPR);
+ } else {
+ Hi = BuildMI(*MBB, MI, DL, get(HalfImmOp), RegHi)
+ .addOperand(*SBase)
+ .addImm(HiOffset / OffScale);
}
+ } else {
+ // Handle the _SGPR variant
+ MachineOperand *SOff = getNamedOperand(*MI, AMDGPU::OpName::soff);
+ Lo = BuildMI(*MBB, MI, DL, get(HalfSGPROp), RegLo)
+ .addOperand(*SBase)
+ .addOperand(*SOff);
+ unsigned OffsetSGPR = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ BuildMI(*MBB, MI, DL, get(AMDGPU::S_ADD_I32), OffsetSGPR)
+ .addOperand(*SOff)
+ .addImm(HalfSize);
+ Hi = BuildMI(*MBB, MI, DL, get(HalfSGPROp))
+ .addOperand(*SBase)
+ .addReg(OffsetSGPR);
+ }
+
+ unsigned SubLo, SubHi;
+ switch (HalfSize) {
+ case 4:
+ SubLo = AMDGPU::sub0;
+ SubHi = AMDGPU::sub1;
+ break;
+ case 8:
+ SubLo = AMDGPU::sub0_sub1;
+ SubHi = AMDGPU::sub2_sub3;
+ break;
+ case 16:
+ SubLo = AMDGPU::sub0_sub1_sub2_sub3;
+ SubHi = AMDGPU::sub4_sub5_sub6_sub7;
+ break;
+ case 32:
+ SubLo = AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7;
+ SubHi = AMDGPU::sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15;
+ break;
+ default:
+ llvm_unreachable("Unhandled HalfSize");
}
+
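+  // Stitch the two half-width results back together into the original wide
+  // destination with a REG_SEQUENCE.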
+ BuildMI(*MBB, MI, DL, get(AMDGPU::REG_SEQUENCE))
+ .addOperand(MI->getOperand(0))
+ .addReg(RegLo)
+ .addImm(SubLo)
+ .addReg(RegHi)
+ .addImm(SubHi);
}
void SIInstrInfo::moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI) const {
@@ -1155,7 +1857,7 @@ void SIInstrInfo::moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI) con
case AMDGPU::S_LOAD_DWORDX2_IMM:
case AMDGPU::S_LOAD_DWORDX2_SGPR:
case AMDGPU::S_LOAD_DWORDX4_IMM:
- case AMDGPU::S_LOAD_DWORDX4_SGPR:
+ case AMDGPU::S_LOAD_DWORDX4_SGPR: {
unsigned NewOpcode = getVALUOp(*MI);
unsigned RegOffset;
unsigned ImmOffset;
@@ -1165,10 +1867,13 @@ void SIInstrInfo::moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI) con
ImmOffset = 0;
} else {
assert(MI->getOperand(2).isImm());
- // SMRD instructions take a dword offsets and MUBUF instructions
- // take a byte offset.
- ImmOffset = MI->getOperand(2).getImm() << 2;
+      // SMRD instructions take a dword offset on SI and a byte offset on VI,
+      // while MUBUF instructions always take a byte offset.
+ ImmOffset = MI->getOperand(2).getImm();
+ if (RI.ST.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
+ ImmOffset <<= 2;
RegOffset = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+
if (isUInt<12>(ImmOffset)) {
BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32),
RegOffset)
@@ -1186,13 +1891,14 @@ void SIInstrInfo::moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI) con
unsigned DWord1 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
unsigned DWord2 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
unsigned DWord3 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+ uint64_t RsrcDataFormat = getDefaultRsrcDataFormat();
BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord1)
.addImm(0);
BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord2)
- .addImm(AMDGPU::RSRC_DATA_FORMAT & 0xFFFFFFFF);
+ .addImm(RsrcDataFormat & 0xFFFFFFFF);
BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord3)
- .addImm(AMDGPU::RSRC_DATA_FORMAT >> 32);
+ .addImm(RsrcDataFormat >> 32);
BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), SRsrc)
.addReg(DWord0)
.addImm(AMDGPU::sub0)
@@ -1202,14 +1908,44 @@ void SIInstrInfo::moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI) con
.addImm(AMDGPU::sub2)
.addReg(DWord3)
.addImm(AMDGPU::sub3);
- MI->setDesc(get(NewOpcode));
- if (MI->getOperand(2).isReg()) {
- MI->getOperand(2).setReg(MI->getOperand(1).getReg());
- } else {
- MI->getOperand(2).ChangeToRegister(MI->getOperand(1).getReg(), false);
- }
- MI->getOperand(1).setReg(SRsrc);
- MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(ImmOffset));
+ MI->setDesc(get(NewOpcode));
+ if (MI->getOperand(2).isReg()) {
+ MI->getOperand(2).setReg(MI->getOperand(1).getReg());
+ } else {
+ MI->getOperand(2).ChangeToRegister(MI->getOperand(1).getReg(), false);
+ }
+ MI->getOperand(1).setReg(SRsrc);
+ MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(ImmOffset));
+
+ const TargetRegisterClass *NewDstRC =
+ RI.getRegClass(get(NewOpcode).OpInfo[0].RegClass);
+
+ unsigned DstReg = MI->getOperand(0).getReg();
+ unsigned NewDstReg = MRI.createVirtualRegister(NewDstRC);
+ MRI.replaceRegWith(DstReg, NewDstReg);
+ break;
+ }
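+  // The DWORDX4 case above is the widest SMRD load that converts directly to
+  // a single MUBUF load; split anything wider in half and legalize each half
+  // recursively.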
+ case AMDGPU::S_LOAD_DWORDX8_IMM:
+ case AMDGPU::S_LOAD_DWORDX8_SGPR: {
+ MachineInstr *Lo, *Hi;
+ splitSMRD(MI, &AMDGPU::SReg_128RegClass, AMDGPU::S_LOAD_DWORDX4_IMM,
+ AMDGPU::S_LOAD_DWORDX4_SGPR, Lo, Hi);
+ MI->eraseFromParent();
+ moveSMRDToVALU(Lo, MRI);
+ moveSMRDToVALU(Hi, MRI);
+ break;
+ }
+
+ case AMDGPU::S_LOAD_DWORDX16_IMM:
+ case AMDGPU::S_LOAD_DWORDX16_SGPR: {
+ MachineInstr *Lo, *Hi;
+ splitSMRD(MI, &AMDGPU::SReg_256RegClass, AMDGPU::S_LOAD_DWORDX8_IMM,
+ AMDGPU::S_LOAD_DWORDX8_SGPR, Lo, Hi);
+ MI->eraseFromParent();
+ moveSMRDToVALU(Lo, MRI);
+ moveSMRDToVALU(Hi, MRI);
+ break;
+ }
}
}
@@ -1281,8 +2017,32 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
Inst->eraseFromParent();
continue;
+ case AMDGPU::S_BFE_I64: {
+ splitScalar64BitBFE(Worklist, Inst);
+ Inst->eraseFromParent();
+ continue;
+ }
+
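+    // On VI only the reversed-operand VALU shifts (V_*REV_*) are available, so
+    // retarget these scalar shifts to the *REV* opcodes and swap the operands
+    // to compensate.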
+ case AMDGPU::S_LSHL_B32:
+ if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+ NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
+ swapOperands(Inst);
+ }
+ break;
+ case AMDGPU::S_ASHR_I32:
+ if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+ NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
+ swapOperands(Inst);
+ }
+ break;
+ case AMDGPU::S_LSHR_B32:
+ if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+ NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
+ swapOperands(Inst);
+ }
+ break;
+
case AMDGPU::S_BFE_U64:
- case AMDGPU::S_BFE_I64:
case AMDGPU::S_BFM_B64:
llvm_unreachable("Moving this op to VALU not implemented");
}
@@ -1311,17 +2071,9 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
// We are converting these to a BFE, so we need to add the missing
// operands for the size and offset.
unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
- Inst->addOperand(Inst->getOperand(1));
- Inst->getOperand(1).ChangeToImmediate(0);
- Inst->addOperand(MachineOperand::CreateImm(0));
- Inst->addOperand(MachineOperand::CreateImm(0));
Inst->addOperand(MachineOperand::CreateImm(0));
Inst->addOperand(MachineOperand::CreateImm(Size));
- // XXX - Other pointless operands. There are 4, but it seems you only need
- // 3 to not hit an assertion later in MCInstLower.
- Inst->addOperand(MachineOperand::CreateImm(0));
- Inst->addOperand(MachineOperand::CreateImm(0));
} else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
// The VALU version adds the second operand to the result, so insert an
// extra 0 operand.
@@ -1340,16 +2092,9 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
-
Inst->RemoveOperand(2); // Remove old immediate.
- Inst->addOperand(Inst->getOperand(1));
- Inst->getOperand(1).ChangeToImmediate(0);
- Inst->addOperand(MachineOperand::CreateImm(0));
Inst->addOperand(MachineOperand::CreateImm(Offset));
- Inst->addOperand(MachineOperand::CreateImm(0));
Inst->addOperand(MachineOperand::CreateImm(BitWidth));
- Inst->addOperand(MachineOperand::CreateImm(0));
- Inst->addOperand(MachineOperand::CreateImm(0));
}
// Update the destination register class.
@@ -1403,7 +2148,7 @@ unsigned SIInstrInfo::calculateIndirectAddress(unsigned RegIndex,
}
const TargetRegisterClass *SIInstrInfo::getIndirectAddrRegClass() const {
- return &AMDGPU::VReg_32RegClass;
+ return &AMDGPU::VGPR_32RegClass;
}
void SIInstrInfo::splitScalar64BitUnaryOp(
@@ -1562,6 +2307,67 @@ void SIInstrInfo::splitScalar64BitBCNT(SmallVectorImpl<MachineInstr *> &Worklist
Worklist.push_back(Second);
}
+void SIInstrInfo::splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist,
+ MachineInstr *Inst) const {
+ MachineBasicBlock &MBB = *Inst->getParent();
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ MachineBasicBlock::iterator MII = Inst;
+ DebugLoc DL = Inst->getDebugLoc();
+
+ MachineOperand &Dest = Inst->getOperand(0);
+ uint32_t Imm = Inst->getOperand(2).getImm();
+ uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
+ uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
+
+ (void) Offset;
+
+  // Only sext_inreg cases are handled.
+ assert(Inst->getOpcode() == AMDGPU::S_BFE_I64 &&
+ BitWidth <= 32 &&
+ Offset == 0 &&
+ "Not implemented");
+
+ if (BitWidth < 32) {
+ unsigned MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ unsigned MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
+
+ BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32), MidRegLo)
+ .addReg(Inst->getOperand(1).getReg(), 0, AMDGPU::sub0)
+ .addImm(0)
+ .addImm(BitWidth);
+
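+    // The high half of the sign-extended result is just the sign bit of the
+    // low half, replicated by an arithmetic shift right of 31.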
+ BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi)
+ .addImm(31)
+ .addReg(MidRegLo);
+
+ BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
+ .addReg(MidRegLo)
+ .addImm(AMDGPU::sub0)
+ .addReg(MidRegHi)
+ .addImm(AMDGPU::sub1);
+
+ MRI.replaceRegWith(Dest.getReg(), ResultReg);
+ return;
+ }
+
+ MachineOperand &Src = Inst->getOperand(1);
+ unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
+
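+  // For a full 32-bit-wide sext_inreg the low half is already correct; only
+  // the high half needs to be filled with the sign bits of the low half.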
+ BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)
+ .addImm(31)
+ .addReg(Src.getReg(), 0, AMDGPU::sub0);
+
+ BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
+ .addReg(Src.getReg(), 0, AMDGPU::sub0)
+ .addImm(AMDGPU::sub0)
+ .addReg(TmpReg)
+ .addImm(AMDGPU::sub1);
+
+ MRI.replaceRegWith(Dest.getReg(), ResultReg);
+}
+
void SIInstrInfo::addDescImplicitUseDef(const MCInstrDesc &NewDesc,
MachineInstr *Inst) const {
  // Add the implicit and explicit register definitions.
@@ -1580,13 +2386,81 @@ void SIInstrInfo::addDescImplicitUseDef(const MCInstrDesc &NewDesc,
}
}
+unsigned SIInstrInfo::findUsedSGPR(const MachineInstr *MI,
+ int OpIndices[3]) const {
+ const MCInstrDesc &Desc = get(MI->getOpcode());
+
+ // Find the one SGPR operand we are allowed to use.
+ unsigned SGPRReg = AMDGPU::NoRegister;
+
+ // First we need to consider the instruction's operand requirements before
+ // legalizing. Some operands are required to be SGPRs, such as implicit uses
+ // of VCC, but we are still bound by the constant bus requirement to only use
+ // one.
+ //
+ // If the operand's class is an SGPR, we can never move it.
+
+ for (const MachineOperand &MO : MI->implicit_operands()) {
+ // We only care about reads.
+ if (MO.isDef())
+ continue;
+
+ if (MO.getReg() == AMDGPU::VCC)
+ return AMDGPU::VCC;
+
+ if (MO.getReg() == AMDGPU::FLAT_SCR)
+ return AMDGPU::FLAT_SCR;
+ }
+
+ unsigned UsedSGPRs[3] = { AMDGPU::NoRegister };
+ const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
+
+ for (unsigned i = 0; i < 3; ++i) {
+ int Idx = OpIndices[i];
+ if (Idx == -1)
+ break;
+
+ const MachineOperand &MO = MI->getOperand(Idx);
+ if (RI.isSGPRClassID(Desc.OpInfo[Idx].RegClass))
+ SGPRReg = MO.getReg();
+
+ if (MO.isReg() && RI.isSGPRClass(MRI.getRegClass(MO.getReg())))
+ UsedSGPRs[i] = MO.getReg();
+ }
+
+ if (SGPRReg != AMDGPU::NoRegister)
+ return SGPRReg;
+
+ // We don't have a required SGPR operand, so we have a bit more freedom in
+ // selecting operands to move.
+
+ // Try to select the most used SGPR. If an SGPR is equal to one of the
+ // others, we choose that.
+ //
+ // e.g.
+ // V_FMA_F32 v0, s0, s0, s0 -> No moves
+ // V_FMA_F32 v0, s0, s1, s0 -> Move s1
+
+ if (UsedSGPRs[0] != AMDGPU::NoRegister) {
+ if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2])
+ SGPRReg = UsedSGPRs[0];
+ }
+
+ if (SGPRReg == AMDGPU::NoRegister && UsedSGPRs[1] != AMDGPU::NoRegister) {
+ if (UsedSGPRs[1] == UsedSGPRs[2])
+ SGPRReg = UsedSGPRs[1];
+ }
+
+ return SGPRReg;
+}
+
MachineInstrBuilder SIInstrInfo::buildIndirectWrite(
MachineBasicBlock *MBB,
MachineBasicBlock::iterator I,
unsigned ValueReg,
unsigned Address, unsigned OffsetReg) const {
const DebugLoc &DL = MBB->findDebugLoc(I);
- unsigned IndirectBaseReg = AMDGPU::VReg_32RegClass.getRegister(
+ unsigned IndirectBaseReg = AMDGPU::VGPR_32RegClass.getRegister(
getIndirectIndexBegin(*MBB->getParent()));
return BuildMI(*MBB, I, DL, get(AMDGPU::SI_INDIRECT_DST_V1))
@@ -1604,7 +2478,7 @@ MachineInstrBuilder SIInstrInfo::buildIndirectRead(
unsigned ValueReg,
unsigned Address, unsigned OffsetReg) const {
const DebugLoc &DL = MBB->findDebugLoc(I);
- unsigned IndirectBaseReg = AMDGPU::VReg_32RegClass.getRegister(
+ unsigned IndirectBaseReg = AMDGPU::VGPR_32RegClass.getRegister(
getIndirectIndexBegin(*MBB->getParent()));
return BuildMI(*MBB, I, DL, get(AMDGPU::SI_INDIRECT_SRC))
@@ -1626,7 +2500,7 @@ void SIInstrInfo::reserveIndirectRegisters(BitVector &Reserved,
for (int Index = Begin; Index <= End; ++Index)
- Reserved.set(AMDGPU::VReg_32RegClass.getRegister(Index));
+ Reserved.set(AMDGPU::VGPR_32RegClass.getRegister(Index));
for (int Index = std::max(0, Begin - 1); Index <= End; ++Index)
Reserved.set(AMDGPU::VReg_64RegClass.getRegister(Index));
@@ -1644,11 +2518,19 @@ void SIInstrInfo::reserveIndirectRegisters(BitVector &Reserved,
Reserved.set(AMDGPU::VReg_512RegClass.getRegister(Index));
}
-const MachineOperand *SIInstrInfo::getNamedOperand(const MachineInstr& MI,
- unsigned OperandName) const {
+MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI,
+ unsigned OperandName) const {
int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName);
if (Idx == -1)
return nullptr;
return &MI.getOperand(Idx);
}
+
+uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const {
+ uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
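+  // Note: on the amdhsa OS an extra bit (bit 56 of this value) is OR'd in
+  // below; this looks like the ATC (address translation) enable used by HSA,
+  // though this change does not name the field.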
+ if (ST.isAmdHsaOS())
+ RsrcDataFormat |= (1ULL << 56);
+
+ return RsrcDataFormat;
+}
diff --git a/lib/Target/R600/SIInstrInfo.h b/lib/Target/R600/SIInstrInfo.h
index 4687539fdf58..f766dc85e86a 100644
--- a/lib/Target/R600/SIInstrInfo.h
+++ b/lib/Target/R600/SIInstrInfo.h
@@ -13,10 +13,11 @@
//===----------------------------------------------------------------------===//
-#ifndef SIINSTRINFO_H
-#define SIINSTRINFO_H
+#ifndef LLVM_LIB_TARGET_R600_SIINSTRINFO_H
+#define LLVM_LIB_TARGET_R600_SIINSTRINFO_H
#include "AMDGPUInstrInfo.h"
+#include "SIDefines.h"
#include "SIRegisterInfo.h"
namespace llvm {
@@ -44,6 +45,8 @@ private:
const TargetRegisterClass *RC,
const MachineOperand &Op) const;
+ void swapOperands(MachineBasicBlock::iterator Inst) const;
+
void splitScalar64BitUnaryOp(SmallVectorImpl<MachineInstr *> &Worklist,
MachineInstr *Inst, unsigned Opcode) const;
@@ -52,9 +55,16 @@ private:
void splitScalar64BitBCNT(SmallVectorImpl<MachineInstr *> &Worklist,
MachineInstr *Inst) const;
+ void splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist,
+ MachineInstr *Inst) const;
void addDescImplicitUseDef(const MCInstrDesc &Desc, MachineInstr *MI) const;
+ bool checkInstOffsetsDoNotOverlap(MachineInstr *MIa,
+ MachineInstr *MIb) const;
+
+ unsigned findUsedSGPR(const MachineInstr *MI, int OpIndices[3]) const;
+
public:
explicit SIInstrInfo(const AMDGPUSubtarget &st);
@@ -62,11 +72,30 @@ public:
return RI;
}
+ bool areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
+ int64_t &Offset1,
+ int64_t &Offset2) const override;
+
+ bool getLdStBaseRegImmOfs(MachineInstr *LdSt,
+ unsigned &BaseReg, unsigned &Offset,
+ const TargetRegisterInfo *TRI) const final;
+
+ bool shouldClusterLoads(MachineInstr *FirstLdSt,
+ MachineInstr *SecondLdSt,
+ unsigned NumLoads) const final;
+
void copyPhysReg(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI, DebugLoc DL,
unsigned DestReg, unsigned SrcReg,
bool KillSrc) const override;
+ unsigned calculateLDSSpillAddress(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ RegScavenger *RS,
+ unsigned TmpReg,
+ unsigned Offset,
+ unsigned Size) const;
+
void storeRegToStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
unsigned SrcReg, bool isKill, int FrameIndex,
@@ -79,29 +108,102 @@ public:
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const override;
- virtual bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const;
+ bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override;
+  /// \brief Returns an opcode that can be used to move a value to a \p DstRC
+  /// register. If there is no hardware instruction that can store to \p
+  /// DstRC, then AMDGPU::COPY is returned.
+ unsigned getMovOpcode(const TargetRegisterClass *DstRC) const;
unsigned commuteOpcode(unsigned Opcode) const;
MachineInstr *commuteInstruction(MachineInstr *MI,
- bool NewMI=false) const override;
+ bool NewMI = false) const override;
+ bool findCommutedOpIndices(MachineInstr *MI,
+ unsigned &SrcOpIdx1,
+ unsigned &SrcOpIdx2) const override;
bool isTriviallyReMaterializable(const MachineInstr *MI,
AliasAnalysis *AA = nullptr) const;
+ bool areMemAccessesTriviallyDisjoint(
+ MachineInstr *MIa, MachineInstr *MIb,
+ AliasAnalysis *AA = nullptr) const override;
+
MachineInstr *buildMovInstr(MachineBasicBlock *MBB,
MachineBasicBlock::iterator I,
unsigned DstReg, unsigned SrcReg) const override;
bool isMov(unsigned Opcode) const override;
bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const override;
- bool isDS(uint16_t Opcode) const;
- int isMIMG(uint16_t Opcode) const;
- int isSMRD(uint16_t Opcode) const;
- bool isVOP1(uint16_t Opcode) const;
- bool isVOP2(uint16_t Opcode) const;
- bool isVOP3(uint16_t Opcode) const;
- bool isVOPC(uint16_t Opcode) const;
+
+ bool isSALU(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::SALU;
+ }
+
+ bool isVALU(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::VALU;
+ }
+
+ bool isSOP1(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::SOP1;
+ }
+
+ bool isSOP2(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::SOP2;
+ }
+
+ bool isSOPC(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::SOPC;
+ }
+
+ bool isSOPK(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::SOPK;
+ }
+
+ bool isSOPP(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::SOPP;
+ }
+
+ bool isVOP1(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::VOP1;
+ }
+
+ bool isVOP2(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::VOP2;
+ }
+
+ bool isVOP3(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::VOP3;
+ }
+
+ bool isVOPC(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::VOPC;
+ }
+
+ bool isMUBUF(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::MUBUF;
+ }
+
+ bool isMTBUF(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::MTBUF;
+ }
+
+ bool isSMRD(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::SMRD;
+ }
+
+ bool isDS(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::DS;
+ }
+
+ bool isMIMG(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::MIMG;
+ }
+
+ bool isFLAT(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::FLAT;
+ }
+
bool isInlineConstant(const APInt &Imm) const;
bool isInlineConstant(const MachineOperand &MO) const;
bool isLiteralConstant(const MachineOperand &MO) const;
@@ -109,14 +211,28 @@ public:
bool isImmOperandLegal(const MachineInstr *MI, unsigned OpNo,
const MachineOperand &MO) const;
+ /// \brief Return true if the given offset Size in bytes can be folded into
+ /// the immediate offsets of a memory instruction for the given address space.
+ bool canFoldOffset(unsigned OffsetSize, unsigned AS) const;
+
/// \brief Return true if this 64-bit VALU instruction has a 32-bit encoding.
/// This function will return false if you pass it a 32-bit instruction.
bool hasVALU32BitEncoding(unsigned Opcode) const;
+ /// \brief Returns true if this operand uses the constant bus.
+ bool usesConstantBus(const MachineRegisterInfo &MRI,
+ const MachineOperand &MO) const;
+
+ /// \brief Return true if this instruction has any modifiers.
+ /// e.g. src[012]_mod, omod, clamp.
+ bool hasModifiers(unsigned Opcode) const;
+
+ bool hasModifiersSet(const MachineInstr &MI,
+ unsigned OpName) const;
+
bool verifyInstruction(const MachineInstr *MI,
StringRef &ErrInfo) const override;
- bool isSALUInstr(const MachineInstr &MI) const;
static unsigned getVALUOp(const MachineInstr &MI);
bool isSALUOpSupportedOnVALU(const MachineInstr &MI) const;
@@ -144,10 +260,21 @@ public:
/// instead of MOV.
void legalizeOpWithMove(MachineInstr *MI, unsigned OpIdx) const;
+ /// \brief Check if \p MO is a legal operand if it was the \p OpIdx Operand
+ /// for \p MI.
+ bool isOperandLegal(const MachineInstr *MI, unsigned OpIdx,
+ const MachineOperand *MO = nullptr) const;
+
/// \brief Legalize all operands in this instruction. This function may
/// create new instruction and insert them before \p MI.
void legalizeOperands(MachineInstr *MI) const;
+ /// \brief Split an SMRD instruction into two smaller loads of half the
+  /// size, storing the results in \p Lo and \p Hi.
+ void splitSMRD(MachineInstr *MI, const TargetRegisterClass *HalfRC,
+ unsigned HalfImmOp, unsigned HalfSGPROp,
+ MachineInstr *&Lo, MachineInstr *&Hi) const;
+
void moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI) const;
/// \brief Replace this instruction's opcode with the equivalent VALU
@@ -181,8 +308,15 @@ public:
/// \brief Returns the operand named \p Op. If \p MI does not have an
/// operand named \c Op, this function returns nullptr.
- const MachineOperand *getNamedOperand(const MachineInstr& MI,
- unsigned OperandName) const;
+ MachineOperand *getNamedOperand(MachineInstr &MI, unsigned OperandName) const;
+
+ const MachineOperand *getNamedOperand(const MachineInstr &MI,
+ unsigned OpName) const {
+ return getNamedOperand(const_cast<MachineInstr &>(MI), OpName);
+ }
+
+ uint64_t getDefaultRsrcDataFormat() const;
+
};
namespace AMDGPU {
@@ -192,21 +326,34 @@ namespace AMDGPU {
int getCommuteRev(uint16_t Opcode);
int getCommuteOrig(uint16_t Opcode);
int getMCOpcode(uint16_t Opcode, unsigned Gen);
+ int getAddr64Inst(uint16_t Opcode);
+ int getAtomicRetOp(uint16_t Opcode);
+ int getAtomicNoRetOp(uint16_t Opcode);
const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL;
const uint64_t RSRC_TID_ENABLE = 1LL << 55;
} // End namespace AMDGPU
-} // End namespace llvm
+namespace SI {
+namespace KernelInputOffsets {
+
+/// Offsets in bytes from the start of the input buffer
+enum Offsets {
+ NGROUPS_X = 0,
+ NGROUPS_Y = 4,
+ NGROUPS_Z = 8,
+ GLOBAL_SIZE_X = 12,
+ GLOBAL_SIZE_Y = 16,
+ GLOBAL_SIZE_Z = 20,
+ LOCAL_SIZE_X = 24,
+ LOCAL_SIZE_Y = 28,
+ LOCAL_SIZE_Z = 32
+};
+
+} // End namespace KernelInputOffsets
+} // End namespace SI
-namespace SIInstrFlags {
- enum Flags {
- // First 4 bits are the instruction encoding
- VM_CNT = 1 << 0,
- EXP_CNT = 1 << 1,
- LGKM_CNT = 1 << 2
- };
-}
+} // End namespace llvm
-#endif //SIINSTRINFO_H
+#endif
diff --git a/lib/Target/R600/SIInstrInfo.td b/lib/Target/R600/SIInstrInfo.td
index b0ac20f558d0..7cc9588c8e4b 100644
--- a/lib/Target/R600/SIInstrInfo.td
+++ b/lib/Target/R600/SIInstrInfo.td
@@ -7,11 +7,61 @@
//
//===----------------------------------------------------------------------===//
+class vop {
+ field bits<9> SI3;
+ field bits<10> VI3;
+}
+
+class vopc <bits<8> si, bits<8> vi = !add(0x40, si)> : vop {
+ field bits<8> SI = si;
+ field bits<8> VI = vi;
+
+ field bits<9> SI3 = {0, si{7-0}};
+ field bits<10> VI3 = {0, 0, vi{7-0}};
+}
+
+class vop1 <bits<8> si, bits<8> vi = si> : vop {
+ field bits<8> SI = si;
+ field bits<8> VI = vi;
+
+ field bits<9> SI3 = {1, 1, si{6-0}};
+ field bits<10> VI3 = !add(0x140, vi);
+}
+
+class vop2 <bits<6> si, bits<6> vi = si> : vop {
+ field bits<6> SI = si;
+ field bits<6> VI = vi;
+
+ field bits<9> SI3 = {1, 0, 0, si{5-0}};
+ field bits<10> VI3 = {0, 1, 0, 0, vi{5-0}};
+}
+
+class vop3 <bits<9> si, bits<10> vi = {0, si}> : vop {
+ let SI3 = si;
+ let VI3 = vi;
+}
+
+class sop1 <bits<8> si, bits<8> vi = si> {
+ field bits<8> SI = si;
+ field bits<8> VI = vi;
+}
+
+class sop2 <bits<7> si, bits<7> vi = si> {
+ field bits<7> SI = si;
+ field bits<7> VI = vi;
+}
+
+class sopk <bits<5> si, bits<5> vi = si> {
+ field bits<5> SI = si;
+ field bits<5> VI = vi;
+}
+
// Except for the NONE field, this must be kept in sync with the SISubtarget enum
// in AMDGPUMCInstLower.h
def SISubtarget {
int NONE = -1;
int SI = 0;
+ int VI = 1;
}
//===----------------------------------------------------------------------===//
@@ -105,6 +155,22 @@ def as_i32imm: SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(N->getSExtValue(), MVT::i32);
}]>;
+def as_i64imm: SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(N->getSExtValue(), MVT::i64);
+}]>;
+
+// Copied from the AArch64 backend:
+def bitcast_fpimm_to_i32 : SDNodeXForm<fpimm, [{
+return CurDAG->getTargetConstant(
+ N->getValueAPF().bitcastToAPInt().getZExtValue(), MVT::i32);
+}]>;
+
+// Copied from the AArch64 backend:
+def bitcast_fpimm_to_i64 : SDNodeXForm<fpimm, [{
+return CurDAG->getTargetConstant(
+ N->getValueAPF().bitcastToAPInt().getZExtValue(), MVT::i64);
+}]>;
+
def IMM8bit : PatLeaf <(imm),
[{return isUInt<8>(N->getZExtValue());}]
>;
@@ -117,6 +183,10 @@ def IMM16bit : PatLeaf <(imm),
[{return isUInt<16>(N->getZExtValue());}]
>;
+def IMM20bit : PatLeaf <(imm),
+ [{return isUInt<20>(N->getZExtValue());}]
+>;
+
def IMM32bit : PatLeaf <(imm),
[{return isUInt<32>(N->getZExtValue());}]
>;
@@ -130,13 +200,17 @@ class InlineImm <ValueType vt> : PatLeaf <(vt imm), [{
return isInlineImmediate(N);
}]>;
+class InlineFPImm <ValueType vt> : PatLeaf <(vt fpimm), [{
+ return isInlineImmediate(N);
+}]>;
+
class SGPRImm <dag frag> : PatLeaf<frag, [{
if (TM.getSubtarget<AMDGPUSubtarget>().getGeneration() <
AMDGPUSubtarget::SOUTHERN_ISLANDS) {
return false;
}
const SIRegisterInfo *SIRI =
- static_cast<const SIRegisterInfo*>(TM.getRegisterInfo());
+ static_cast<const SIRegisterInfo*>(TM.getSubtargetImpl()->getRegisterInfo());
for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
U != E; ++U) {
if (SIRI->isSGPRClass(getOperandRegClass(*U, U.getOperandNo()))) {
@@ -159,13 +233,70 @@ def sopp_brtarget : Operand<OtherVT> {
let OperandType = "OPERAND_PCREL";
}
+include "SIInstrFormats.td"
+include "VIInstrFormats.td"
+
+let OperandType = "OPERAND_IMMEDIATE" in {
+
+def offen : Operand<i1> {
+ let PrintMethod = "printOffen";
+}
+def idxen : Operand<i1> {
+ let PrintMethod = "printIdxen";
+}
+def addr64 : Operand<i1> {
+ let PrintMethod = "printAddr64";
+}
+def mbuf_offset : Operand<i16> {
+ let PrintMethod = "printMBUFOffset";
+}
+def ds_offset : Operand<i16> {
+ let PrintMethod = "printDSOffset";
+}
+def ds_offset0 : Operand<i8> {
+ let PrintMethod = "printDSOffset0";
+}
+def ds_offset1 : Operand<i8> {
+ let PrintMethod = "printDSOffset1";
+}
+def glc : Operand <i1> {
+ let PrintMethod = "printGLC";
+}
+def slc : Operand <i1> {
+ let PrintMethod = "printSLC";
+}
+def tfe : Operand <i1> {
+ let PrintMethod = "printTFE";
+}
+
+def omod : Operand <i32> {
+ let PrintMethod = "printOModSI";
+}
+
+def ClampMod : Operand <i1> {
+ let PrintMethod = "printClampSI";
+}
+
+} // End OperandType = "OPERAND_IMMEDIATE"
+
//===----------------------------------------------------------------------===//
// Complex patterns
//===----------------------------------------------------------------------===//
+def DS1Addr1Offset : ComplexPattern<i32, 2, "SelectDS1Addr1Offset">;
+def DS64Bit4ByteAligned : ComplexPattern<i32, 3, "SelectDS64Bit4ByteAligned">;
+
def MUBUFAddr32 : ComplexPattern<i64, 9, "SelectMUBUFAddr32">;
def MUBUFAddr64 : ComplexPattern<i64, 3, "SelectMUBUFAddr64">;
+def MUBUFAddr64Atomic : ComplexPattern<i64, 4, "SelectMUBUFAddr64">;
def MUBUFScratch : ComplexPattern<i64, 4, "SelectMUBUFScratch">;
+def MUBUFOffset : ComplexPattern<i64, 6, "SelectMUBUFOffset">;
+def MUBUFOffsetAtomic : ComplexPattern<i64, 4, "SelectMUBUFOffset">;
+
+def VOP3Mods0 : ComplexPattern<untyped, 4, "SelectVOP3Mods0">;
+def VOP3Mods0Clamp : ComplexPattern<untyped, 3, "SelectVOP3Mods0Clamp">;
+def VOP3Mods0Clamp0OMod : ComplexPattern<untyped, 4, "SelectVOP3Mods0Clamp0OMod">;
+def VOP3Mods : ComplexPattern<untyped, 2, "SelectVOP3Mods">;
//===----------------------------------------------------------------------===//
// SI assembler operands
@@ -174,9 +305,20 @@ def MUBUFScratch : ComplexPattern<i64, 4, "SelectMUBUFScratch">;
def SIOperand {
int ZERO = 0x80;
int VCC = 0x6A;
+ int FLAT_SCR = 0x68;
}
-include "SIInstrFormats.td"
+def SRCMODS {
+ int NONE = 0;
+}
+
+def DSTCLAMP {
+ int NONE = 0;
+}
+
+def DSTOMOD {
+ int NONE = 0;
+}
//===----------------------------------------------------------------------===//
//
@@ -194,43 +336,175 @@ include "SIInstrFormats.td"
//
//===----------------------------------------------------------------------===//
+class SIMCInstr <string pseudo, int subtarget> {
+ string PseudoInstr = pseudo;
+ int Subtarget = subtarget;
+}
+
+//===----------------------------------------------------------------------===//
+// EXP classes
+//===----------------------------------------------------------------------===//
+
+class EXPCommon : InstSI<
+ (outs),
+ (ins i32imm:$en, i32imm:$tgt, i32imm:$compr, i32imm:$done, i32imm:$vm,
+ VGPR_32:$src0, VGPR_32:$src1, VGPR_32:$src2, VGPR_32:$src3),
+ "exp $en, $tgt, $compr, $done, $vm, $src0, $src1, $src2, $src3",
+ [] > {
+
+ let EXP_CNT = 1;
+ let Uses = [EXEC];
+}
+
+multiclass EXP_m {
+
+ let isPseudo = 1 in {
+ def "" : EXPCommon, SIMCInstr <"exp", SISubtarget.NONE> ;
+ }
+
+ def _si : EXPCommon, SIMCInstr <"exp", SISubtarget.SI>, EXPe;
+
+ def _vi : EXPCommon, SIMCInstr <"exp", SISubtarget.VI>, EXPe_vi;
+}
+
//===----------------------------------------------------------------------===//
// Scalar classes
//===----------------------------------------------------------------------===//
-class SOP1_32 <bits<8> op, string opName, list<dag> pattern> : SOP1 <
- op, (outs SReg_32:$dst), (ins SSrc_32:$src0),
- opName#" $dst, $src0", pattern
->;
+class SOP1_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> :
+ SOP1 <outs, ins, "", pattern>,
+ SIMCInstr<opName, SISubtarget.NONE> {
+ let isPseudo = 1;
+}
-class SOP1_64 <bits<8> op, string opName, list<dag> pattern> : SOP1 <
- op, (outs SReg_64:$dst), (ins SSrc_64:$src0),
- opName#" $dst, $src0", pattern
->;
+class SOP1_Real_si <sop1 op, string opName, dag outs, dag ins, string asm,
+ list<dag> pattern> :
+ SOP1 <outs, ins, asm, pattern>,
+ SOP1e <op.SI>,
+ SIMCInstr<opName, SISubtarget.SI>;
+
+class SOP1_Real_vi <sop1 op, string opName, dag outs, dag ins, string asm,
+ list<dag> pattern> :
+ SOP1 <outs, ins, asm, pattern>,
+ SOP1e <op.VI>,
+ SIMCInstr<opName, SISubtarget.VI>;
+
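+// Each of the SOP multiclasses below produces a subtarget-neutral pseudo plus
+// _si and _vi encodings; the SIMCInstr tag is what lets MC lowering pick the
+// real opcode for the selected subtarget (see the SISubtarget note above).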
+multiclass SOP1_32 <sop1 op, string opName, list<dag> pattern> {
+ def "" : SOP1_Pseudo <opName, (outs SReg_32:$dst), (ins SSrc_32:$src0),
+ pattern>;
+
+ def _si : SOP1_Real_si <op, opName, (outs SReg_32:$dst), (ins SSrc_32:$src0),
+ opName#" $dst, $src0", pattern>;
+
+ def _vi : SOP1_Real_vi <op, opName, (outs SReg_32:$dst), (ins SSrc_32:$src0),
+ opName#" $dst, $src0", pattern>;
+}
+
+multiclass SOP1_64 <sop1 op, string opName, list<dag> pattern> {
+ def "" : SOP1_Pseudo <opName, (outs SReg_64:$dst), (ins SSrc_64:$src0),
+ pattern>;
+
+ def _si : SOP1_Real_si <op, opName, (outs SReg_64:$dst), (ins SSrc_64:$src0),
+ opName#" $dst, $src0", pattern>;
+
+ def _vi : SOP1_Real_vi <op, opName, (outs SReg_64:$dst), (ins SSrc_64:$src0),
+ opName#" $dst, $src0", pattern>;
+}
+
+// no input, 64-bit output.
+multiclass SOP1_64_0 <sop1 op, string opName, list<dag> pattern> {
+ def "" : SOP1_Pseudo <opName, (outs SReg_64:$dst), (ins), pattern>;
+
+ def _si : SOP1_Real_si <op, opName, (outs SReg_64:$dst), (ins),
+ opName#" $dst", pattern> {
+ let SSRC0 = 0;
+ }
+
+ def _vi : SOP1_Real_vi <op, opName, (outs SReg_64:$dst), (ins),
+ opName#" $dst", pattern> {
+ let SSRC0 = 0;
+ }
+}
// 64-bit input, 32-bit output.
-class SOP1_32_64 <bits<8> op, string opName, list<dag> pattern> : SOP1 <
- op, (outs SReg_32:$dst), (ins SSrc_64:$src0),
- opName#" $dst, $src0", pattern
->;
+multiclass SOP1_32_64 <sop1 op, string opName, list<dag> pattern> {
+ def "" : SOP1_Pseudo <opName, (outs SReg_32:$dst), (ins SSrc_64:$src0),
+ pattern>;
-class SOP2_32 <bits<7> op, string opName, list<dag> pattern> : SOP2 <
- op, (outs SReg_32:$dst), (ins SSrc_32:$src0, SSrc_32:$src1),
- opName#" $dst, $src0, $src1", pattern
->;
+ def _si : SOP1_Real_si <op, opName, (outs SReg_32:$dst), (ins SSrc_64:$src0),
+ opName#" $dst, $src0", pattern>;
-class SOP2_64 <bits<7> op, string opName, list<dag> pattern> : SOP2 <
- op, (outs SReg_64:$dst), (ins SSrc_64:$src0, SSrc_64:$src1),
- opName#" $dst, $src0, $src1", pattern
->;
+ def _vi : SOP1_Real_vi <op, opName, (outs SReg_32:$dst), (ins SSrc_64:$src0),
+ opName#" $dst, $src0", pattern>;
+}
-class SOP2_SHIFT_64 <bits<7> op, string opName, list<dag> pattern> : SOP2 <
- op, (outs SReg_64:$dst), (ins SSrc_64:$src0, SSrc_32:$src1),
- opName#" $dst, $src0, $src1", pattern
->;
+class SOP2_Pseudo<string opName, dag outs, dag ins, list<dag> pattern> :
+ SOP2<outs, ins, "", pattern>,
+ SIMCInstr<opName, SISubtarget.NONE> {
+ let isPseudo = 1;
+ let Size = 4;
+}
+
+class SOP2_Real_si<sop2 op, string opName, dag outs, dag ins, string asm,
+ list<dag> pattern> :
+ SOP2<outs, ins, asm, pattern>,
+ SOP2e<op.SI>,
+ SIMCInstr<opName, SISubtarget.SI>;
+
+class SOP2_Real_vi<sop2 op, string opName, dag outs, dag ins, string asm,
+ list<dag> pattern> :
+ SOP2<outs, ins, asm, pattern>,
+ SOP2e<op.VI>,
+ SIMCInstr<opName, SISubtarget.VI>;
+
+multiclass SOP2_SELECT_32 <sop2 op, string opName, list<dag> pattern> {
+ def "" : SOP2_Pseudo <opName, (outs SReg_32:$dst),
+ (ins SSrc_32:$src0, SSrc_32:$src1, SCCReg:$scc), pattern>;
+
+ def _si : SOP2_Real_si <op, opName, (outs SReg_32:$dst),
+ (ins SSrc_32:$src0, SSrc_32:$src1, SCCReg:$scc),
+ opName#" $dst, $src0, $src1 [$scc]", pattern>;
+
+ def _vi : SOP2_Real_vi <op, opName, (outs SReg_32:$dst),
+ (ins SSrc_32:$src0, SSrc_32:$src1, SCCReg:$scc),
+ opName#" $dst, $src0, $src1 [$scc]", pattern>;
+}
+
+multiclass SOP2_32 <sop2 op, string opName, list<dag> pattern> {
+ def "" : SOP2_Pseudo <opName, (outs SReg_32:$dst),
+ (ins SSrc_32:$src0, SSrc_32:$src1), pattern>;
+ def _si : SOP2_Real_si <op, opName, (outs SReg_32:$dst),
+ (ins SSrc_32:$src0, SSrc_32:$src1), opName#" $dst, $src0, $src1", pattern>;
-class SOPC_Helper <bits<7> op, RegisterClass rc, ValueType vt,
+ def _vi : SOP2_Real_vi <op, opName, (outs SReg_32:$dst),
+ (ins SSrc_32:$src0, SSrc_32:$src1), opName#" $dst, $src0, $src1", pattern>;
+}
+
+multiclass SOP2_64 <sop2 op, string opName, list<dag> pattern> {
+ def "" : SOP2_Pseudo <opName, (outs SReg_64:$dst),
+ (ins SSrc_64:$src0, SSrc_64:$src1), pattern>;
+
+ def _si : SOP2_Real_si <op, opName, (outs SReg_64:$dst),
+ (ins SSrc_64:$src0, SSrc_64:$src1), opName#" $dst, $src0, $src1", pattern>;
+
+ def _vi : SOP2_Real_vi <op, opName, (outs SReg_64:$dst),
+ (ins SSrc_64:$src0, SSrc_64:$src1), opName#" $dst, $src0, $src1", pattern>;
+}
+
+multiclass SOP2_64_32 <sop2 op, string opName, list<dag> pattern> {
+ def "" : SOP2_Pseudo <opName, (outs SReg_64:$dst),
+ (ins SSrc_64:$src0, SSrc_32:$src1), pattern>;
+
+ def _si : SOP2_Real_si <op, opName, (outs SReg_64:$dst),
+ (ins SSrc_64:$src0, SSrc_32:$src1), opName#" $dst, $src0, $src1", pattern>;
+
+ def _vi : SOP2_Real_vi <op, opName, (outs SReg_64:$dst),
+ (ins SSrc_64:$src0, SSrc_32:$src1), opName#" $dst, $src0, $src1", pattern>;
+}
+
+
+class SOPC_Helper <bits<7> op, RegisterOperand rc, ValueType vt,
string opName, PatLeaf cond> : SOPC <
op, (outs SCCReg:$dst), (ins rc:$src0, rc:$src1),
opName#" $dst, $src0, $src1", []>;
@@ -241,28 +515,90 @@ class SOPC_32<bits<7> op, string opName, PatLeaf cond = COND_NULL>
class SOPC_64<bits<7> op, string opName, PatLeaf cond = COND_NULL>
: SOPC_Helper<op, SSrc_64, i64, opName, cond>;
-class SOPK_32 <bits<5> op, string opName, list<dag> pattern> : SOPK <
- op, (outs SReg_32:$dst), (ins i16imm:$src0),
- opName#" $dst, $src0", pattern
->;
+class SOPK_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> :
+ SOPK <outs, ins, "", pattern>,
+ SIMCInstr<opName, SISubtarget.NONE> {
+ let isPseudo = 1;
+}
-class SOPK_64 <bits<5> op, string opName, list<dag> pattern> : SOPK <
- op, (outs SReg_64:$dst), (ins i16imm:$src0),
- opName#" $dst, $src0", pattern
->;
+class SOPK_Real_si <sopk op, string opName, dag outs, dag ins, string asm,
+ list<dag> pattern> :
+ SOPK <outs, ins, asm, pattern>,
+ SOPKe <op.SI>,
+ SIMCInstr<opName, SISubtarget.SI>;
+
+class SOPK_Real_vi <sopk op, string opName, dag outs, dag ins, string asm,
+ list<dag> pattern> :
+ SOPK <outs, ins, asm, pattern>,
+ SOPKe <op.VI>,
+ SIMCInstr<opName, SISubtarget.VI>;
+
+multiclass SOPK_32 <sopk op, string opName, list<dag> pattern> {
+ def "" : SOPK_Pseudo <opName, (outs SReg_32:$dst), (ins u16imm:$src0),
+ pattern>;
+
+ def _si : SOPK_Real_si <op, opName, (outs SReg_32:$dst), (ins u16imm:$src0),
+ opName#" $dst, $src0", pattern>;
+
+ def _vi : SOPK_Real_vi <op, opName, (outs SReg_32:$dst), (ins u16imm:$src0),
+ opName#" $dst, $src0", pattern>;
+}
+
+multiclass SOPK_SCC <sopk op, string opName, list<dag> pattern> {
+ def "" : SOPK_Pseudo <opName, (outs SCCReg:$dst),
+ (ins SReg_32:$src0, u16imm:$src1), pattern>;
+
+ def _si : SOPK_Real_si <op, opName, (outs SCCReg:$dst),
+ (ins SReg_32:$src0, u16imm:$src1), opName#" $dst, $src0", pattern>;
+
+ def _vi : SOPK_Real_vi <op, opName, (outs SCCReg:$dst),
+ (ins SReg_32:$src0, u16imm:$src1), opName#" $dst, $src0", pattern>;
+}
+
+//===----------------------------------------------------------------------===//
+// SMRD classes
+//===----------------------------------------------------------------------===//
+
+class SMRD_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> :
+ SMRD <outs, ins, "", pattern>,
+ SIMCInstr<opName, SISubtarget.NONE> {
+ let isPseudo = 1;
+}
+
+class SMRD_Real_si <bits<5> op, string opName, bit imm, dag outs, dag ins,
+ string asm> :
+ SMRD <outs, ins, asm, []>,
+ SMRDe <op, imm>,
+ SIMCInstr<opName, SISubtarget.SI>;
-multiclass SMRD_Helper <bits<5> op, string asm, RegisterClass baseClass,
+class SMRD_Real_vi <bits<8> op, string opName, bit imm, dag outs, dag ins,
+ string asm> :
+ SMRD <outs, ins, asm, []>,
+ SMEMe_vi <op, imm>,
+ SIMCInstr<opName, SISubtarget.VI>;
+
+multiclass SMRD_m <bits<5> op, string opName, bit imm, dag outs, dag ins,
+ string asm, list<dag> pattern> {
+
+ def "" : SMRD_Pseudo <opName, outs, ins, pattern>;
+
+ def _si : SMRD_Real_si <op, opName, imm, outs, ins, asm>;
+
+ def _vi : SMRD_Real_vi <{0, 0, 0, op}, opName, imm, outs, ins, asm>;
+}
+
+multiclass SMRD_Helper <bits<5> op, string opName, RegisterClass baseClass,
RegisterClass dstClass> {
- def _IMM : SMRD <
- op, 1, (outs dstClass:$dst),
+ defm _IMM : SMRD_m <
+ op, opName#"_IMM", 1, (outs dstClass:$dst),
(ins baseClass:$sbase, u32imm:$offset),
- asm#" $dst, $sbase, $offset", []
+ opName#" $dst, $sbase, $offset", []
>;
- def _SGPR : SMRD <
- op, 0, (outs dstClass:$dst),
+ defm _SGPR : SMRD_m <
+ op, opName#"_SGPR", 0, (outs dstClass:$dst),
(ins baseClass:$sbase, SReg_32:$soff),
- asm#" $dst, $sbase, $soff", []
+ opName#" $dst, $sbase, $soff", []
>;
}
@@ -270,6 +606,210 @@ multiclass SMRD_Helper <bits<5> op, string asm, RegisterClass baseClass,
// Vector ALU classes
//===----------------------------------------------------------------------===//
+// This must always appear immediately before the operand it modifies.
+def InputMods : OperandWithDefaultOps <i32, (ops (i32 0))> {
+ let PrintMethod = "printOperandAndMods";
+}
+def InputModsNoDefault : Operand <i32> {
+ let PrintMethod = "printOperandAndMods";
+}
+
+class getNumSrcArgs<ValueType Src1, ValueType Src2> {
+ int ret =
+ !if (!eq(Src1.Value, untyped.Value), 1, // VOP1
+ !if (!eq(Src2.Value, untyped.Value), 2, // VOP2
+ 3)); // VOP3
+}
+
+// Returns the register class to use for the destination of VOP[123C]
+// instructions for the given VT.
+class getVALUDstForVT<ValueType VT> {
+ RegisterClass ret = !if(!eq(VT.Size, 32), VGPR_32,
+ !if(!eq(VT.Size, 64), VReg_64,
+ SReg_64)); // else VT == i1
+}
+
+// Returns the register class to use for source 0 of VOP[12C]
+// instructions for the given VT.
+class getVOPSrc0ForVT<ValueType VT> {
+ RegisterOperand ret = !if(!eq(VT.Size, 32), VSrc_32, VSrc_64);
+}
+
+// Returns the register class to use for source 1 of VOP[12C] for the
+// given VT.
+class getVOPSrc1ForVT<ValueType VT> {
+ RegisterClass ret = !if(!eq(VT.Size, 32), VGPR_32, VReg_64);
+}
+
+// Returns the register classes for the source arguments of a VOP[12C]
+// instruction for the given SrcVTs.
+class getInRC32 <list<ValueType> SrcVT> {
+ list<DAGOperand> ret = [
+ getVOPSrc0ForVT<SrcVT[0]>.ret,
+ getVOPSrc1ForVT<SrcVT[1]>.ret
+ ];
+}
+
+// Returns the register class to use for sources of VOP3 instructions for the
+// given VT.
+class getVOP3SrcForVT<ValueType VT> {
+ RegisterOperand ret = !if(!eq(VT.Size, 32), VCSrc_32, VCSrc_64);
+}
+
+// Returns the register classes for the source arguments of a VOP3
+// instruction for the given SrcVTs.
+class getInRC64 <list<ValueType> SrcVT> {
+ list<DAGOperand> ret = [
+ getVOP3SrcForVT<SrcVT[0]>.ret,
+ getVOP3SrcForVT<SrcVT[1]>.ret,
+ getVOP3SrcForVT<SrcVT[2]>.ret
+ ];
+}
+
+// Returns 1 if the source arguments have modifiers, 0 if they do not.
+class hasModifiers<ValueType SrcVT> {
+ bit ret = !if(!eq(SrcVT.Value, f32.Value), 1,
+ !if(!eq(SrcVT.Value, f64.Value), 1, 0));
+}
+
+// Returns the input arguments for VOP[12C] instructions for the given SrcVT.
+class getIns32 <RegisterOperand Src0RC, RegisterClass Src1RC, int NumSrcArgs> {
+ dag ret = !if(!eq(NumSrcArgs, 1), (ins Src0RC:$src0), // VOP1
+ !if(!eq(NumSrcArgs, 2), (ins Src0RC:$src0, Src1RC:$src1), // VOP2
+ (ins)));
+}
+
+// Returns the input arguments for VOP3 instructions for the given SrcVT.
+class getIns64 <RegisterOperand Src0RC, RegisterOperand Src1RC,
+ RegisterOperand Src2RC, int NumSrcArgs,
+ bit HasModifiers> {
+
+ dag ret =
+ !if (!eq(NumSrcArgs, 1),
+ !if (!eq(HasModifiers, 1),
+ // VOP1 with modifiers
+ (ins InputModsNoDefault:$src0_modifiers, Src0RC:$src0,
+ ClampMod:$clamp, omod:$omod)
+ /* else */,
+ // VOP1 without modifiers
+ (ins Src0RC:$src0)
+ /* endif */ ),
+ !if (!eq(NumSrcArgs, 2),
+ !if (!eq(HasModifiers, 1),
+ // VOP 2 with modifiers
+ (ins InputModsNoDefault:$src0_modifiers, Src0RC:$src0,
+ InputModsNoDefault:$src1_modifiers, Src1RC:$src1,
+ ClampMod:$clamp, omod:$omod)
+ /* else */,
+ // VOP2 without modifiers
+ (ins Src0RC:$src0, Src1RC:$src1)
+ /* endif */ )
+ /* NumSrcArgs == 3 */,
+ !if (!eq(HasModifiers, 1),
+ // VOP3 with modifiers
+ (ins InputModsNoDefault:$src0_modifiers, Src0RC:$src0,
+ InputModsNoDefault:$src1_modifiers, Src1RC:$src1,
+ InputModsNoDefault:$src2_modifiers, Src2RC:$src2,
+ ClampMod:$clamp, omod:$omod)
+ /* else */,
+ // VOP3 without modifiers
+ (ins Src0RC:$src0, Src1RC:$src1, Src2RC:$src2)
+ /* endif */ )));
+}
+
+// Returns the assembly string for the inputs and outputs of a VOP[12C]
+// instruction. This does not add the _e32 suffix, so it can be reused
+// by getAsm64.
+class getAsm32 <int NumSrcArgs> {
+ string src1 = ", $src1";
+ string src2 = ", $src2";
+ string ret = " $dst, $src0"#
+ !if(!eq(NumSrcArgs, 1), "", src1)#
+ !if(!eq(NumSrcArgs, 3), src2, "");
+}
+
+// Returns the assembly string for the inputs and outputs of a VOP3
+// instruction.
+class getAsm64 <int NumSrcArgs, bit HasModifiers> {
+ string src0 = "$src0_modifiers,";
+ string src1 = !if(!eq(NumSrcArgs, 1), "",
+ !if(!eq(NumSrcArgs, 2), " $src1_modifiers",
+ " $src1_modifiers,"));
+ string src2 = !if(!eq(NumSrcArgs, 3), " $src2_modifiers", "");
+ string ret =
+ !if(!eq(HasModifiers, 0),
+ getAsm32<NumSrcArgs>.ret,
+ " $dst, "#src0#src1#src2#"$clamp"#"$omod");
+}
+
+
+class VOPProfile <list<ValueType> _ArgVT> {
+
+ field list<ValueType> ArgVT = _ArgVT;
+
+ field ValueType DstVT = ArgVT[0];
+ field ValueType Src0VT = ArgVT[1];
+ field ValueType Src1VT = ArgVT[2];
+ field ValueType Src2VT = ArgVT[3];
+ field RegisterClass DstRC = getVALUDstForVT<DstVT>.ret;
+ field RegisterOperand Src0RC32 = getVOPSrc0ForVT<Src0VT>.ret;
+ field RegisterClass Src1RC32 = getVOPSrc1ForVT<Src1VT>.ret;
+ field RegisterOperand Src0RC64 = getVOP3SrcForVT<Src0VT>.ret;
+ field RegisterOperand Src1RC64 = getVOP3SrcForVT<Src1VT>.ret;
+ field RegisterOperand Src2RC64 = getVOP3SrcForVT<Src2VT>.ret;
+
+ field int NumSrcArgs = getNumSrcArgs<Src1VT, Src2VT>.ret;
+ field bit HasModifiers = hasModifiers<Src0VT>.ret;
+
+ field dag Outs = (outs DstRC:$dst);
+
+ field dag Ins32 = getIns32<Src0RC32, Src1RC32, NumSrcArgs>.ret;
+ field dag Ins64 = getIns64<Src0RC64, Src1RC64, Src2RC64, NumSrcArgs,
+ HasModifiers>.ret;
+
+ field string Asm32 = "_e32"#getAsm32<NumSrcArgs>.ret;
+ field string Asm64 = getAsm64<NumSrcArgs, HasModifiers>.ret;
+}
+
+def VOP_F32_F32 : VOPProfile <[f32, f32, untyped, untyped]>;
+def VOP_F32_F64 : VOPProfile <[f32, f64, untyped, untyped]>;
+def VOP_F32_I32 : VOPProfile <[f32, i32, untyped, untyped]>;
+def VOP_F64_F32 : VOPProfile <[f64, f32, untyped, untyped]>;
+def VOP_F64_F64 : VOPProfile <[f64, f64, untyped, untyped]>;
+def VOP_F64_I32 : VOPProfile <[f64, i32, untyped, untyped]>;
+def VOP_I32_F32 : VOPProfile <[i32, f32, untyped, untyped]>;
+def VOP_I32_F64 : VOPProfile <[i32, f64, untyped, untyped]>;
+def VOP_I32_I32 : VOPProfile <[i32, i32, untyped, untyped]>;
+
+def VOP_F32_F32_F32 : VOPProfile <[f32, f32, f32, untyped]>;
+def VOP_F32_F32_I32 : VOPProfile <[f32, f32, i32, untyped]>;
+def VOP_F64_F64_F64 : VOPProfile <[f64, f64, f64, untyped]>;
+def VOP_F64_F64_I32 : VOPProfile <[f64, f64, i32, untyped]>;
+def VOP_I32_F32_F32 : VOPProfile <[i32, f32, f32, untyped]>;
+def VOP_I32_I32_I32 : VOPProfile <[i32, i32, i32, untyped]>;
+def VOP_I32_I32_I32_VCC : VOPProfile <[i32, i32, i32, untyped]> {
+ let Src0RC32 = VCSrc_32;
+}
+
+def VOP_I1_F32_I32 : VOPProfile <[i1, f32, i32, untyped]> {
+ let Ins64 = (ins InputModsNoDefault:$src0_modifiers, Src0RC64:$src0, Src1RC64:$src1);
+ let Asm64 = " $dst, $src0_modifiers, $src1";
+}
+
+def VOP_I1_F64_I32 : VOPProfile <[i1, f64, i32, untyped]> {
+ let Ins64 = (ins InputModsNoDefault:$src0_modifiers, Src0RC64:$src0, Src1RC64:$src1);
+ let Asm64 = " $dst, $src0_modifiers, $src1";
+}
+
+def VOP_I64_I64_I32 : VOPProfile <[i64, i64, i32, untyped]>;
+def VOP_I64_I64_I64 : VOPProfile <[i64, i64, i64, untyped]>;
+
+def VOP_F32_F32_F32_F32 : VOPProfile <[f32, f32, f32, f32]>;
+def VOP_F64_F64_F64_F64 : VOPProfile <[f64, f64, f64, f64]>;
+def VOP_I32_I32_I32_I32 : VOPProfile <[i32, i32, i32, i32]>;
+def VOP_I64_I32_I32_I64 : VOPProfile <[i64, i32, i32, i64]>;
+
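+// The VOPProfile names above encode the result and source types in order: for
+// example, VOP_F32_F32_F32 is a two-source f32 op and VOP_I32_I32_I32_I32 a
+// three-source i32 op.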
+
class VOP <string opName> {
string OpName = opName;
}
@@ -279,399 +819,943 @@ class VOP2_REV <string revOp, bit isOrig> {
bit IsOrig = isOrig;
}
-class SIMCInstr <string pseudo, int subtarget> {
- string PseudoInstr = pseudo;
- int Subtarget = subtarget;
+class AtomicNoRet <string noRetOp, bit isRet> {
+ string NoRetOp = noRetOp;
+ bit IsRet = isRet;
+}
+
+class VOP1_Pseudo <dag outs, dag ins, list<dag> pattern, string opName> :
+ VOP1Common <outs, ins, "", pattern>,
+ VOP <opName>,
+ SIMCInstr <opName#"_e32", SISubtarget.NONE> {
+ let isPseudo = 1;
+}
+
+multiclass VOP1_m <vop1 op, dag outs, dag ins, string asm, list<dag> pattern,
+ string opName> {
+ def "" : VOP1_Pseudo <outs, ins, pattern, opName>;
+
+ def _si : VOP1<op.SI, outs, ins, asm, []>,
+ SIMCInstr <opName#"_e32", SISubtarget.SI>;
+ def _vi : VOP1<op.VI, outs, ins, asm, []>,
+ SIMCInstr <opName#"_e32", SISubtarget.VI>;
+}
+
+class VOP2_Pseudo <dag outs, dag ins, list<dag> pattern, string opName> :
+ VOP2Common <outs, ins, "", pattern>,
+ VOP <opName>,
+ SIMCInstr<opName#"_e32", SISubtarget.NONE> {
+ let isPseudo = 1;
+}
+
+multiclass VOP2_m <vop2 op, dag outs, dag ins, string asm, list<dag> pattern,
+ string opName, string revOpSI, string revOpVI> {
+ def "" : VOP2_Pseudo <outs, ins, pattern, opName>,
+ VOP2_REV<revOpSI#"_e32", !eq(revOpSI, opName)>;
+
+ def _si : VOP2 <op.SI, outs, ins, opName#asm, []>,
+ VOP2_REV<revOpSI#"_e32_si", !eq(revOpSI, opName)>,
+ SIMCInstr <opName#"_e32", SISubtarget.SI>;
+ def _vi : VOP2 <op.VI, outs, ins, opName#asm, []>,
+ VOP2_REV<revOpVI#"_e32_vi", !eq(revOpVI, opName)>,
+ SIMCInstr <opName#"_e32", SISubtarget.VI>;
+}
+
+class VOP3DisableFields <bit HasSrc1, bit HasSrc2, bit HasModifiers> {
+
+ bits<2> src0_modifiers = !if(HasModifiers, ?, 0);
+ bits<2> src1_modifiers = !if(HasModifiers, !if(HasSrc1, ?, 0), 0);
+ bits<2> src2_modifiers = !if(HasModifiers, !if(HasSrc2, ? ,0) ,0);
+ bits<2> omod = !if(HasModifiers, ?, 0);
+ bits<1> clamp = !if(HasModifiers, ?, 0);
+ bits<9> src1 = !if(HasSrc1, ?, 0);
+ bits<9> src2 = !if(HasSrc2, ?, 0);
}
class VOP3_Pseudo <dag outs, dag ins, list<dag> pattern, string opName> :
VOP3Common <outs, ins, "", pattern>,
VOP <opName>,
- SIMCInstr<opName, SISubtarget.NONE> {
+ SIMCInstr<opName#"_e64", SISubtarget.NONE> {
let isPseudo = 1;
}
class VOP3_Real_si <bits<9> op, dag outs, dag ins, string asm, string opName> :
- VOP3 <op, outs, ins, asm, []>,
- SIMCInstr<opName, SISubtarget.SI>;
+ VOP3Common <outs, ins, asm, []>,
+ VOP3e <op>,
+ SIMCInstr<opName#"_e64", SISubtarget.SI>;
+
+class VOP3_Real_vi <bits<10> op, dag outs, dag ins, string asm, string opName> :
+ VOP3Common <outs, ins, asm, []>,
+ VOP3e_vi <op>,
+ SIMCInstr <opName#"_e64", SISubtarget.VI>;
+
+// VI only instruction
+class VOP3_vi <bits<10> op, string opName, dag outs, dag ins, string asm,
+ list<dag> pattern, int NumSrcArgs, bit HasMods = 1> :
+ VOP3Common <outs, ins, asm, pattern>,
+ VOP <opName>,
+ VOP3e_vi <op>,
+ VOP3DisableFields<!if(!eq(NumSrcArgs, 1), 0, 1),
+ !if(!eq(NumSrcArgs, 2), 0, 1),
+ HasMods>;
-multiclass VOP3_m <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern,
- string opName> {
+multiclass VOP3_m <vop op, dag outs, dag ins, string asm, list<dag> pattern,
+ string opName, int NumSrcArgs, bit HasMods = 1> {
def "" : VOP3_Pseudo <outs, ins, pattern, opName>;
- def _si : VOP3_Real_si <op, outs, ins, asm, opName>;
-
+ def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName>,
+ VOP3DisableFields<!if(!eq(NumSrcArgs, 1), 0, 1),
+ !if(!eq(NumSrcArgs, 2), 0, 1),
+ HasMods>;
+ def _vi : VOP3_Real_vi <op.VI3, outs, ins, asm, opName>,
+ VOP3DisableFields<!if(!eq(NumSrcArgs, 1), 0, 1),
+ !if(!eq(NumSrcArgs, 2), 0, 1),
+ HasMods>;
}
-multiclass VOP3_1_m <bits<8> op, dag outs, dag ins, string asm,
- list<dag> pattern, string opName> {
+// VOP3_m without source modifiers
+multiclass VOP3_m_nosrcmod <vop op, dag outs, dag ins, string asm, list<dag> pattern,
+ string opName, int NumSrcArgs, bit HasMods = 1> {
def "" : VOP3_Pseudo <outs, ins, pattern, opName>;
- let src1 = 0, src1_modifiers = 0, src2 = 0, src2_modifiers = 0 in {
-
- def _si : VOP3_Real_si <
- {1, 1, op{6}, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}},
- outs, ins, asm, opName
- >;
-
- } // src1 = 0, src1_modifiers = 0, src2 = 0, src2_modifiers = 0
+ let src0_modifiers = 0,
+ src1_modifiers = 0,
+ src2_modifiers = 0 in {
+ def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName>;
+ def _vi : VOP3_Real_vi <op.VI3, outs, ins, asm, opName>;
+ }
}
-multiclass VOP3_2_m <bits<6> op, dag outs, dag ins, string asm,
- list<dag> pattern, string opName, string revOp> {
+multiclass VOP3_1_m <vop op, dag outs, dag ins, string asm,
+ list<dag> pattern, string opName, bit HasMods = 1> {
def "" : VOP3_Pseudo <outs, ins, pattern, opName>;
- let src2 = 0, src2_modifiers = 0 in {
+ def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName>,
+ VOP3DisableFields<0, 0, HasMods>;
- def _si : VOP3_Real_si <
- {1, 0, 0, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}},
- outs, ins, asm, opName>,
- VOP2_REV<revOp#"_e64", !eq(revOp, opName)>;
-
- } // src2 = 0, src2_modifiers = 0
+ def _vi : VOP3_Real_vi <op.VI3, outs, ins, asm, opName>,
+ VOP3DisableFields<0, 0, HasMods>;
}
-// This must always be right before the operand being input modified.
-def InputMods : OperandWithDefaultOps <i32, (ops (i32 0))> {
- let PrintMethod = "printOperandAndMods";
+multiclass VOP3_2_m <vop op, dag outs, dag ins, string asm,
+ list<dag> pattern, string opName, string revOpSI, string revOpVI,
+ bit HasMods = 1, bit UseFullOp = 0> {
+
+ def "" : VOP3_Pseudo <outs, ins, pattern, opName>,
+ VOP2_REV<revOpSI#"_e64", !eq(revOpSI, opName)>;
+
+ def _si : VOP3_Real_si <op.SI3,
+ outs, ins, asm, opName>,
+ VOP2_REV<revOpSI#"_e64_si", !eq(revOpSI, opName)>,
+ VOP3DisableFields<1, 0, HasMods>;
+
+ def _vi : VOP3_Real_vi <op.VI3,
+ outs, ins, asm, opName>,
+ VOP2_REV<revOpVI#"_e64_vi", !eq(revOpVI, opName)>,
+ VOP3DisableFields<1, 0, HasMods>;
}
-multiclass VOP1_Helper <bits<8> op, RegisterClass drc, RegisterClass src,
- string opName, list<dag> pattern> {
-
- def _e32 : VOP1 <
- op, (outs drc:$dst), (ins src:$src0),
- opName#"_e32 $dst, $src0", pattern
- >, VOP <opName>;
-
- defm _e64 : VOP3_1_m <
- op,
- (outs drc:$dst),
- (ins InputMods:$src0_modifiers, src:$src0, i32imm:$clamp, i32imm:$omod),
- opName#"_e64 $dst, $src0_modifiers, $clamp, $omod", [], opName>;
-}
-
-multiclass VOP1_32 <bits<8> op, string opName, list<dag> pattern>
- : VOP1_Helper <op, VReg_32, VSrc_32, opName, pattern>;
-
-multiclass VOP1_64 <bits<8> op, string opName, list<dag> pattern>
- : VOP1_Helper <op, VReg_64, VSrc_64, opName, pattern>;
-
-multiclass VOP1_32_64 <bits<8> op, string opName, list<dag> pattern>
- : VOP1_Helper <op, VReg_32, VSrc_64, opName, pattern>;
-
-multiclass VOP1_64_32 <bits<8> op, string opName, list<dag> pattern>
- : VOP1_Helper <op, VReg_64, VSrc_32, opName, pattern>;
-
-multiclass VOP2_Helper <bits<6> op, RegisterClass vrc, RegisterClass arc,
- string opName, list<dag> pattern, string revOp> {
- def _e32 : VOP2 <
- op, (outs vrc:$dst), (ins arc:$src0, vrc:$src1),
- opName#"_e32 $dst, $src0, $src1", pattern
- >, VOP <opName>, VOP2_REV<revOp#"_e32", !eq(revOp, opName)>;
-
- defm _e64 : VOP3_2_m <
- op,
- (outs vrc:$dst),
- (ins InputMods:$src0_modifiers, arc:$src0,
- InputMods:$src1_modifiers, arc:$src1,
- i32imm:$clamp, i32imm:$omod),
- opName#"_e64 $dst, $src0_modifiers, $src1_modifiers, $clamp, $omod", [],
- opName, revOp>;
-}
-
-multiclass VOP2_32 <bits<6> op, string opName, list<dag> pattern,
- string revOp = opName>
- : VOP2_Helper <op, VReg_32, VSrc_32, opName, pattern, revOp>;
-
-multiclass VOP2_64 <bits<6> op, string opName, list<dag> pattern,
- string revOp = opName>
- : VOP2_Helper <op, VReg_64, VSrc_64, opName, pattern, revOp>;
-
-multiclass VOP2b_32 <bits<6> op, string opName, list<dag> pattern,
- RegisterClass src0_rc, string revOp = opName> {
-
- def _e32 : VOP2 <
- op, (outs VReg_32:$dst), (ins src0_rc:$src0, VReg_32:$src1),
- opName#"_e32 $dst, $src0, $src1", pattern
- >, VOP <opName>, VOP2_REV<revOp#"_e32", !eq(revOp, opName)>;
-
- def _e64 : VOP3b <
- {1, 0, 0, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}},
- (outs VReg_32:$dst),
- (ins InputMods: $src0_modifiers, VSrc_32:$src0,
- InputMods:$src1_modifiers, VSrc_32:$src1,
- i32imm:$clamp, i32imm:$omod),
- opName#"_e64 $dst, $src0_modifiers, $src1_modifiers, $clamp, $omod", []
- >, VOP <opName>, VOP2_REV<revOp#"_e64", !eq(revOp, opName)> {
- let src2 = 0;
- let src2_modifiers = 0;
- /* the VOP2 variant puts the carry out into VCC, the VOP3 variant
- can write it into any SGPR. We currently don't use the carry out,
- so for now hardcode it to VCC as well */
- let sdst = SIOperand.VCC;
- }
+multiclass VOP3b_2_m <vop op, dag outs, dag ins, string asm,
+ list<dag> pattern, string opName, string revOp,
+ bit HasMods = 1, bit UseFullOp = 0> {
+ def "" : VOP3_Pseudo <outs, ins, pattern, opName>,
+ VOP2_REV<revOp#"_e64", !eq(revOp, opName)>;
+
+  // The VOP2 variant puts the carry out into VCC, while the VOP3 variant
+ // can write it into any SGPR. We currently don't use the carry out,
+ // so for now hardcode it to VCC as well.
+ let sdst = SIOperand.VCC, Defs = [VCC] in {
+ def _si : VOP3b <op.SI3, outs, ins, asm, pattern>,
+ VOP3DisableFields<1, 0, HasMods>,
+ SIMCInstr<opName#"_e64", SISubtarget.SI>,
+ VOP2_REV<revOp#"_e64_si", !eq(revOp, opName)>;
+
+ // TODO: Do we need this VI variant here?
+ /*def _vi : VOP3b_vi <op.VI3, outs, ins, asm, pattern>,
+ VOP3DisableFields<1, 0, HasMods>,
+ SIMCInstr<opName#"_e64", SISubtarget.VI>,
+ VOP2_REV<revOp#"_e64_vi", !eq(revOp, opName)>;*/
+ } // End sdst = SIOperand.VCC, Defs = [VCC]
}
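+// For example, an instruction such as v_add_i32 defined through VOP2bInst
+// gets an _e32 form whose carry-out implicitly lands in VCC and an _e64
+// (VOP3b) form with an explicit SGPR-pair carry destination; pinning sdst to
+// VCC above keeps the two forms interchangeable until the carry-out is used.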
-multiclass VOPC_Helper <bits<8> op, RegisterClass vrc, RegisterClass arc,
- string opName, ValueType vt, PatLeaf cond, bit defExec = 0> {
- def _e32 : VOPC <
- op, (ins arc:$src0, vrc:$src1),
- opName#"_e32 $dst, $src0, $src1", []
- >, VOP <opName> {
+multiclass VOP3_C_m <vop op, dag outs, dag ins, string asm,
+ list<dag> pattern, string opName,
+ bit HasMods, bit defExec> {
+
+ def "" : VOP3_Pseudo <outs, ins, pattern, opName>;
+
+ def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName>,
+ VOP3DisableFields<1, 0, HasMods> {
let Defs = !if(defExec, [EXEC], []);
}
- def _e64 : VOP3 <
- {0, op{7}, op{6}, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}},
- (outs SReg_64:$dst),
- (ins InputMods:$src0_modifiers, arc:$src0,
- InputMods:$src1_modifiers, arc:$src1,
- InstFlag:$clamp, InstFlag:$omod),
- opName#"_e64 $dst, $src0_modifiers, $src1_modifiers, $clamp, $omod",
- !if(!eq(!cast<string>(cond), "COND_NULL"), []<dag>,
- [(set SReg_64:$dst, (i1 (setcc (vt arc:$src0), arc:$src1, cond)))]
- )
- >, VOP <opName> {
+ def _vi : VOP3_Real_vi <op.VI3, outs, ins, asm, opName>,
+ VOP3DisableFields<1, 0, HasMods> {
let Defs = !if(defExec, [EXEC], []);
- let src2 = 0;
- let src2_modifiers = 0;
}
}
-multiclass VOPC_32 <bits<8> op, string opName,
- ValueType vt = untyped, PatLeaf cond = COND_NULL>
- : VOPC_Helper <op, VReg_32, VSrc_32, opName, vt, cond>;
+multiclass VOP1_Helper <vop1 op, string opName, dag outs,
+ dag ins32, string asm32, list<dag> pat32,
+ dag ins64, string asm64, list<dag> pat64,
+ bit HasMods> {
+
+ defm _e32 : VOP1_m <op, outs, ins32, opName#asm32, pat32, opName>;
+
+ defm _e64 : VOP3_1_m <op, outs, ins64, opName#"_e64"#asm64, pat64, opName, HasMods>;
+}
+
+multiclass VOP1Inst <vop1 op, string opName, VOPProfile P,
+ SDPatternOperator node = null_frag> : VOP1_Helper <
+ op, opName, P.Outs,
+ P.Ins32, P.Asm32, [],
+ P.Ins64, P.Asm64,
+ !if(P.HasModifiers,
+ [(set P.DstVT:$dst, (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0,
+ i32:$src0_modifiers, i1:$clamp, i32:$omod))))],
+ [(set P.DstVT:$dst, (node P.Src0VT:$src0))]),
+ P.HasModifiers
+>;
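+// When the profile declares modifiers, src0 is routed through the VOP3Mods0
+// complex pattern so the source modifiers and the clamp/omod flags are
+// matched as separate operands; profiles without modifiers use the plain
+// one-source pattern instead.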
+
+multiclass VOP1InstSI <vop1 op, string opName, VOPProfile P,
+ SDPatternOperator node = null_frag> {
+
+ def _e32 : VOP1 <op.SI, P.Outs, P.Ins32, opName#P.Asm32, []>,
+ VOP <opName>;
+
+ def _e64 : VOP3Common <P.Outs, P.Ins64, opName#P.Asm64,
+ !if(P.HasModifiers,
+ [(set P.DstVT:$dst, (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0,
+ i32:$src0_modifiers, i1:$clamp, i32:$omod))))],
+ [(set P.DstVT:$dst, (node P.Src0VT:$src0))])>,
+ VOP <opName>,
+ VOP3e <op.SI3>,
+ VOP3DisableFields<0, 0, P.HasModifiers>;
+}
+
+multiclass VOP2_Helper <vop2 op, string opName, dag outs,
+ dag ins32, string asm32, list<dag> pat32,
+ dag ins64, string asm64, list<dag> pat64,
+ string revOpSI, string revOpVI, bit HasMods> {
+ defm _e32 : VOP2_m <op, outs, ins32, asm32, pat32, opName, revOpSI, revOpVI>;
-multiclass VOPC_64 <bits<8> op, string opName,
- ValueType vt = untyped, PatLeaf cond = COND_NULL>
- : VOPC_Helper <op, VReg_64, VSrc_64, opName, vt, cond>;
+ defm _e64 : VOP3_2_m <op,
+ outs, ins64, opName#"_e64"#asm64, pat64, opName, revOpSI, revOpVI, HasMods
+ >;
+}
+
+multiclass VOP2Inst <vop2 op, string opName, VOPProfile P,
+ SDPatternOperator node = null_frag,
+ string revOpSI = opName, string revOpVI = revOpSI> : VOP2_Helper <
+ op, opName, P.Outs,
+ P.Ins32, P.Asm32, [],
+ P.Ins64, P.Asm64,
+ !if(P.HasModifiers,
+ [(set P.DstVT:$dst,
+ (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers,
+ i1:$clamp, i32:$omod)),
+ (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))],
+ [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1))]),
+ revOpSI, revOpVI, P.HasModifiers
+>;
-multiclass VOPCX_32 <bits<8> op, string opName,
- ValueType vt = untyped, PatLeaf cond = COND_NULL>
- : VOPC_Helper <op, VReg_32, VSrc_32, opName, vt, cond, 1>;
+multiclass VOP2b_Helper <vop2 op, string opName, dag outs,
+ dag ins32, string asm32, list<dag> pat32,
+ dag ins64, string asm64, list<dag> pat64,
+ string revOp, bit HasMods> {
-multiclass VOPCX_64 <bits<8> op, string opName,
- ValueType vt = untyped, PatLeaf cond = COND_NULL>
- : VOPC_Helper <op, VReg_64, VSrc_64, opName, vt, cond, 1>;
+ defm _e32 : VOP2_m <op, outs, ins32, asm32, pat32, opName, revOp, revOp>;
+
+ defm _e64 : VOP3b_2_m <op,
+ outs, ins64, opName#"_e64"#asm64, pat64, opName, revOp, HasMods
+ >;
+}
-multiclass VOP3_32 <bits<9> op, string opName, list<dag> pattern> : VOP3_m <
- op, (outs VReg_32:$dst),
- (ins InputMods: $src0_modifiers, VSrc_32:$src0, InputMods:$src1_modifiers,
- VSrc_32:$src1, InputMods:$src2_modifiers, VSrc_32:$src2,
- InstFlag:$clamp, InstFlag:$omod),
- opName#" $dst, $src0_modifiers, $src1, $src2, $clamp, $omod", pattern, opName
+multiclass VOP2bInst <vop2 op, string opName, VOPProfile P,
+ SDPatternOperator node = null_frag,
+ string revOp = opName> : VOP2b_Helper <
+ op, opName, P.Outs,
+ P.Ins32, P.Asm32, [],
+ P.Ins64, P.Asm64,
+ !if(P.HasModifiers,
+ [(set P.DstVT:$dst,
+ (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers,
+ i1:$clamp, i32:$omod)),
+ (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))],
+ [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1))]),
+ revOp, P.HasModifiers
>;
-class VOP3_64_32 <bits <9> op, string opName, list<dag> pattern> : VOP3 <
- op, (outs VReg_64:$dst),
- (ins VSrc_64:$src0, VSrc_32:$src1),
- opName#" $dst, $src0, $src1", pattern
->, VOP <opName> {
+class VOPC_Pseudo <dag outs, dag ins, list<dag> pattern, string opName> :
+ VOPCCommon <ins, "", pattern>,
+ VOP <opName>,
+ SIMCInstr<opName#"_e32", SISubtarget.NONE> {
+ let isPseudo = 1;
+}
+
+multiclass VOPC_m <vopc op, dag outs, dag ins, string asm, list<dag> pattern,
+ string opName, bit DefExec> {
+ def "" : VOPC_Pseudo <outs, ins, pattern, opName>;
+
+ def _si : VOPC<op.SI, ins, asm, []>,
+ SIMCInstr <opName#"_e32", SISubtarget.SI> {
+ let Defs = !if(DefExec, [EXEC], []);
+ }
+
+ def _vi : VOPC<op.VI, ins, asm, []>,
+ SIMCInstr <opName#"_e32", SISubtarget.VI> {
+ let Defs = !if(DefExec, [EXEC], []);
+ }
+}
- let src2 = 0;
- let src2_modifiers = 0;
- let src0_modifiers = 0;
- let clamp = 0;
- let omod = 0;
+multiclass VOPC_Helper <vopc op, string opName,
+ dag ins32, string asm32, list<dag> pat32,
+ dag out64, dag ins64, string asm64, list<dag> pat64,
+ bit HasMods, bit DefExec> {
+ defm _e32 : VOPC_m <op, (outs), ins32, opName#asm32, pat32, opName, DefExec>;
+
+ defm _e64 : VOP3_C_m <op, out64, ins64, opName#"_e64"#asm64, pat64,
+ opName, HasMods, DefExec>;
}
-class VOP3_64 <bits<9> op, string opName, list<dag> pattern> : VOP3 <
- op, (outs VReg_64:$dst),
- (ins InputMods:$src0_modifiers, VSrc_64:$src0,
- InputMods:$src1_modifiers, VSrc_64:$src1,
- InputMods:$src2_modifiers, VSrc_64:$src2,
- InstFlag:$clamp, InstFlag:$omod),
- opName#" $dst, $src0_modifiers, $src1_modifiers, $src2_modifiers, $clamp, $omod", pattern
->, VOP <opName>;
+multiclass VOPCInst <vopc op, string opName,
+ VOPProfile P, PatLeaf cond = COND_NULL,
+ bit DefExec = 0> : VOPC_Helper <
+ op, opName,
+ P.Ins32, P.Asm32, [],
+ (outs SReg_64:$dst), P.Ins64, P.Asm64,
+ !if(P.HasModifiers,
+ [(set i1:$dst,
+ (setcc (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers,
+ i1:$clamp, i32:$omod)),
+ (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)),
+ cond))],
+ [(set i1:$dst, (setcc P.Src0VT:$src0, P.Src1VT:$src1, cond))]),
+ P.HasModifiers, DefExec
+>;
+
+multiclass VOPCClassInst <vopc op, string opName, VOPProfile P,
+ bit DefExec = 0> : VOPC_Helper <
+ op, opName,
+ P.Ins32, P.Asm32, [],
+ (outs SReg_64:$dst), P.Ins64, P.Asm64,
+ !if(P.HasModifiers,
+ [(set i1:$dst,
+ (AMDGPUfp_class (P.Src0VT (VOP3Mods0Clamp0OMod P.Src0VT:$src0, i32:$src0_modifiers)), P.Src1VT:$src1))],
+ [(set i1:$dst, (AMDGPUfp_class P.Src0VT:$src0, P.Src1VT:$src1))]),
+ P.HasModifiers, DefExec
+>;
+
+
+multiclass VOPC_F32 <vopc op, string opName, PatLeaf cond = COND_NULL> :
+ VOPCInst <op, opName, VOP_F32_F32_F32, cond>;
+
+multiclass VOPC_F64 <vopc op, string opName, PatLeaf cond = COND_NULL> :
+ VOPCInst <op, opName, VOP_F64_F64_F64, cond>;
+
+multiclass VOPC_I32 <vopc op, string opName, PatLeaf cond = COND_NULL> :
+ VOPCInst <op, opName, VOP_I32_I32_I32, cond>;
+
+multiclass VOPC_I64 <vopc op, string opName, PatLeaf cond = COND_NULL> :
+ VOPCInst <op, opName, VOP_I64_I64_I64, cond>;
+
+
+multiclass VOPCX <vopc op, string opName, VOPProfile P,
+ PatLeaf cond = COND_NULL>
+ : VOPCInst <op, opName, P, cond, 1>;
+
+multiclass VOPCX_F32 <vopc op, string opName, PatLeaf cond = COND_NULL> :
+ VOPCX <op, opName, VOP_F32_F32_F32, cond>;
+multiclass VOPCX_F64 <vopc op, string opName, PatLeaf cond = COND_NULL> :
+ VOPCX <op, opName, VOP_F64_F64_F64, cond>;
-class VOP3b_Helper <bits<9> op, RegisterClass vrc, RegisterClass arc,
- string opName, list<dag> pattern> : VOP3 <
- op, (outs vrc:$dst0, SReg_64:$dst1),
- (ins arc:$src0, arc:$src1, arc:$src2,
- InstFlag:$abs, InstFlag:$clamp, InstFlag:$omod, InstFlag:$neg),
- opName#" $dst0, $dst1, $src0, $src1, $src2, $abs, $clamp, $omod, $neg", pattern
->, VOP <opName>;
+multiclass VOPCX_I32 <vopc op, string opName, PatLeaf cond = COND_NULL> :
+ VOPCX <op, opName, VOP_I32_I32_I32, cond>;
+multiclass VOPCX_I64 <vopc op, string opName, PatLeaf cond = COND_NULL> :
+ VOPCX <op, opName, VOP_I64_I64_I64, cond>;
-class VOP3b_64 <bits<9> op, string opName, list<dag> pattern> :
+multiclass VOP3_Helper <vop3 op, string opName, dag outs, dag ins, string asm,
+ list<dag> pat, int NumSrcArgs, bit HasMods> : VOP3_m <
+ op, outs, ins, opName#asm, pat, opName, NumSrcArgs, HasMods
+>;
+
+multiclass VOPC_CLASS_F32 <vopc op, string opName> :
+ VOPCClassInst <op, opName, VOP_I1_F32_I32, 0>;
+
+multiclass VOPCX_CLASS_F32 <vopc op, string opName> :
+ VOPCClassInst <op, opName, VOP_I1_F32_I32, 1>;
+
+multiclass VOPC_CLASS_F64 <vopc op, string opName> :
+ VOPCClassInst <op, opName, VOP_I1_F64_I32, 0>;
+
+multiclass VOPCX_CLASS_F64 <vopc op, string opName> :
+ VOPCClassInst <op, opName, VOP_I1_F64_I32, 1>;
+
+multiclass VOP3Inst <vop3 op, string opName, VOPProfile P,
+ SDPatternOperator node = null_frag> : VOP3_Helper <
+ op, opName, P.Outs, P.Ins64, P.Asm64,
+ !if(!eq(P.NumSrcArgs, 3),
+ !if(P.HasModifiers,
+ [(set P.DstVT:$dst,
+ (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers,
+ i1:$clamp, i32:$omod)),
+ (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)),
+ (P.Src2VT (VOP3Mods P.Src2VT:$src2, i32:$src2_modifiers))))],
+ [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1,
+ P.Src2VT:$src2))]),
+ !if(!eq(P.NumSrcArgs, 2),
+ !if(P.HasModifiers,
+ [(set P.DstVT:$dst,
+ (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers,
+ i1:$clamp, i32:$omod)),
+ (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))],
+ [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1))])
+ /* P.NumSrcArgs == 1 */,
+ !if(P.HasModifiers,
+ [(set P.DstVT:$dst,
+ (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers,
+ i1:$clamp, i32:$omod))))],
+ [(set P.DstVT:$dst, (node P.Src0VT:$src0))]))),
+ P.NumSrcArgs, P.HasModifiers
+>;
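+// The nested !if above selects the pattern shape from the profile: a three-,
+// two- or one-source form, each with or without source modifiers, so a single
+// multiclass covers every VOP3 arity.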
+
+class VOP3InstVI <bits<10> op, string opName, VOPProfile P,
+ SDPatternOperator node = null_frag> : VOP3_vi <
+ op, opName#"_vi", P.Outs, P.Ins64, opName#P.Asm64,
+ !if(!eq(P.NumSrcArgs, 3),
+ !if(P.HasModifiers,
+ [(set P.DstVT:$dst,
+ (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers,
+ i1:$clamp, i32:$omod)),
+ (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)),
+ (P.Src2VT (VOP3Mods P.Src2VT:$src2, i32:$src2_modifiers))))],
+ [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1,
+ P.Src2VT:$src2))]),
+ !if(!eq(P.NumSrcArgs, 2),
+ !if(P.HasModifiers,
+ [(set P.DstVT:$dst,
+ (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers,
+ i1:$clamp, i32:$omod)),
+ (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))],
+ [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1))])
+ /* P.NumSrcArgs == 1 */,
+ !if(P.HasModifiers,
+ [(set P.DstVT:$dst,
+ (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers,
+ i1:$clamp, i32:$omod))))],
+ [(set P.DstVT:$dst, (node P.Src0VT:$src0))]))),
+ P.NumSrcArgs, P.HasModifiers
+>;
+
+multiclass VOP3b_Helper <vop op, RegisterClass vrc, RegisterOperand arc,
+ string opName, list<dag> pattern> :
+ VOP3b_2_m <
+ op, (outs vrc:$vdst, SReg_64:$sdst),
+ (ins InputModsNoDefault:$src0_modifiers, arc:$src0,
+ InputModsNoDefault:$src1_modifiers, arc:$src1,
+ InputModsNoDefault:$src2_modifiers, arc:$src2,
+ ClampMod:$clamp, omod:$omod),
+ opName#" $vdst, $sdst, $src0_modifiers, $src1_modifiers, $src2_modifiers"#"$clamp"#"$omod", pattern,
+ opName, opName, 1, 1
+>;
+
+multiclass VOP3b_64 <vop3 op, string opName, list<dag> pattern> :
VOP3b_Helper <op, VReg_64, VSrc_64, opName, pattern>;
-class VOP3b_32 <bits<9> op, string opName, list<dag> pattern> :
- VOP3b_Helper <op, VReg_32, VSrc_32, opName, pattern>;
+multiclass VOP3b_32 <vop3 op, string opName, list<dag> pattern> :
+ VOP3b_Helper <op, VGPR_32, VSrc_32, opName, pattern>;
+
+
+class Vop3ModPat<Instruction Inst, VOPProfile P, SDPatternOperator node> : Pat<
+ (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)),
+ (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)),
+ (P.Src2VT (VOP3Mods P.Src2VT:$src2, i32:$src2_modifiers))),
+ (Inst i32:$src0_modifiers, P.Src0VT:$src0,
+ i32:$src1_modifiers, P.Src1VT:$src1,
+ i32:$src2_modifiers, P.Src2VT:$src2,
+ i1:$clamp,
+ i32:$omod)>;
+
+//===----------------------------------------------------------------------===//
+// Interpolation opcodes
+//===----------------------------------------------------------------------===//
+
+class VINTRP_Pseudo <string opName, dag outs, dag ins, string asm,
+ list<dag> pattern> :
+ VINTRPCommon <outs, ins, asm, pattern>,
+ SIMCInstr<opName, SISubtarget.NONE> {
+ let isPseudo = 1;
+}
+
+class VINTRP_Real_si <bits <2> op, string opName, dag outs, dag ins,
+ string asm, list<dag> pattern> :
+ VINTRPCommon <outs, ins, asm, pattern>,
+ VINTRPe <op>,
+ SIMCInstr<opName, SISubtarget.SI>;
+
+class VINTRP_Real_vi <bits <2> op, string opName, dag outs, dag ins,
+ string asm, list<dag> pattern> :
+ VINTRPCommon <outs, ins, asm, pattern>,
+ VINTRPe_vi <op>,
+ SIMCInstr<opName, SISubtarget.VI>;
+
+multiclass VINTRP_m <bits <2> op, string opName, dag outs, dag ins, string asm,
+ string disableEncoding = "", string constraints = "",
+ list<dag> pattern = []> {
+ let DisableEncoding = disableEncoding,
+ Constraints = constraints in {
+ def "" : VINTRP_Pseudo <opName, outs, ins, asm, pattern>;
+
+ def _si : VINTRP_Real_si <op, opName, outs, ins, asm, pattern>;
+
+ def _vi : VINTRP_Real_vi <op, opName, outs, ins, asm, pattern>;
+ }
+}
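+// As with the VOP helpers above and the DS/MTBUF classes below, each VINTRP_m
+// instance produces a subtarget-neutral pseudo plus real SI and VI encodings,
+// all keyed through SIMCInstr so that MC lowering can pick the right encoding
+// per subtarget (see the getMCOpcodeGen mapping at the end of this file).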
//===----------------------------------------------------------------------===//
// Vector I/O classes
//===----------------------------------------------------------------------===//
-class DS_1A <bits<8> op, dag outs, dag ins, string asm, list<dag> pat> :
- DS <op, outs, ins, asm, pat> {
+class DS_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> :
+ DS <outs, ins, "", pattern>,
+ SIMCInstr <opName, SISubtarget.NONE> {
+ let isPseudo = 1;
+}
+
+class DS_Real_si <bits<8> op, string opName, dag outs, dag ins, string asm> :
+ DS <outs, ins, asm, []>,
+ DSe <op>,
+ SIMCInstr <opName, SISubtarget.SI>;
+
+class DS_Real_vi <bits<8> op, string opName, dag outs, dag ins, string asm> :
+ DS <outs, ins, asm, []>,
+ DSe_vi <op>,
+ SIMCInstr <opName, SISubtarget.VI>;
+
+class DS_1A_Real_si <bits<8> op, string opName, dag outs, dag ins, string asm> :
+ DS <outs, ins, asm, []>,
+ DSe <op>,
+ SIMCInstr <opName, SISubtarget.SI> {
+
+  // Single loads interpret the two i8imm operands as a single i16 offset.
bits<16> offset;
+ let offset0 = offset{7-0};
+ let offset1 = offset{15-8};
+}
+
+class DS_1A_Real_vi <bits<8> op, string opName, dag outs, dag ins, string asm> :
+ DS <outs, ins, asm, []>,
+ DSe_vi <op>,
+ SIMCInstr <opName, SISubtarget.VI> {
// Single loads interpret the two i8imm operands as a single i16 offset.
+ bits<16> offset;
let offset0 = offset{7-0};
let offset1 = offset{15-8};
}
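+// For example, a 16-bit byte offset of 0x1234 is encoded as offset0 = 0x34
+// and offset1 = 0x12.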
-class DS_Load_Helper <bits<8> op, string asm, RegisterClass regClass> : DS_1A <
+multiclass DS_1A_Load_m <bits<8> op, string opName, dag outs, dag ins, string asm,
+ list<dag> pat> {
+ let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in {
+ def "" : DS_Pseudo <opName, outs, ins, pat>;
+
+ let data0 = 0, data1 = 0 in {
+ def _si : DS_1A_Real_si <op, opName, outs, ins, asm>;
+ def _vi : DS_1A_Real_vi <op, opName, outs, ins, asm>;
+ }
+ }
+}
+
+multiclass DS_Load_Helper <bits<8> op, string asm, RegisterClass regClass>
+ : DS_1A_Load_m <
op,
+ asm,
(outs regClass:$vdst),
- (ins i1imm:$gds, VReg_32:$addr, u16imm:$offset),
- asm#" $vdst, $addr, $offset, [M0]",
- []> {
- let data0 = 0;
- let data1 = 0;
- let mayLoad = 1;
- let mayStore = 0;
+ (ins i1imm:$gds, VGPR_32:$addr, ds_offset:$offset, M0Reg:$m0),
+ asm#" $vdst, $addr"#"$offset"#" [M0]",
+ []>;
+
+multiclass DS_Load2_m <bits<8> op, string opName, dag outs, dag ins, string asm,
+ list<dag> pat> {
+ let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in {
+ def "" : DS_Pseudo <opName, outs, ins, pat>;
+
+ let data0 = 0, data1 = 0 in {
+ def _si : DS_Real_si <op, opName, outs, ins, asm>;
+ def _vi : DS_Real_vi <op, opName, outs, ins, asm>;
+ }
+ }
}
-class DS_Load2_Helper <bits<8> op, string asm, RegisterClass regClass> : DS <
+multiclass DS_Load2_Helper <bits<8> op, string asm, RegisterClass regClass>
+ : DS_Load2_m <
op,
+ asm,
(outs regClass:$vdst),
- (ins i1imm:$gds, VReg_32:$addr, u8imm:$offset0, u8imm:$offset1),
- asm#" $gds, $vdst, $addr, $offset0, $offset1, [M0]",
- []> {
- let data0 = 0;
- let data1 = 0;
- let mayLoad = 1;
- let mayStore = 0;
+ (ins i1imm:$gds, VGPR_32:$addr, ds_offset0:$offset0, ds_offset1:$offset1,
+ M0Reg:$m0),
+ asm#" $vdst, $addr"#"$offset0"#"$offset1 [M0]",
+ []>;
+
+multiclass DS_1A_Store_m <bits<8> op, string opName, dag outs, dag ins,
+ string asm, list<dag> pat> {
+ let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in {
+ def "" : DS_Pseudo <opName, outs, ins, pat>;
+
+ let data1 = 0, vdst = 0 in {
+ def _si : DS_1A_Real_si <op, opName, outs, ins, asm>;
+ def _vi : DS_1A_Real_vi <op, opName, outs, ins, asm>;
+ }
+ }
}
-class DS_Store_Helper <bits<8> op, string asm, RegisterClass regClass> : DS_1A <
+multiclass DS_Store_Helper <bits<8> op, string asm, RegisterClass regClass>
+ : DS_1A_Store_m <
op,
+ asm,
(outs),
- (ins i1imm:$gds, VReg_32:$addr, regClass:$data0, u16imm:$offset),
- asm#" $addr, $data0, $offset [M0]",
- []> {
- let data1 = 0;
- let mayStore = 1;
- let mayLoad = 0;
- let vdst = 0;
+ (ins i1imm:$gds, VGPR_32:$addr, regClass:$data0, ds_offset:$offset, M0Reg:$m0),
+ asm#" $addr, $data0"#"$offset"#" [M0]",
+ []>;
+
+multiclass DS_Store_m <bits<8> op, string opName, dag outs, dag ins,
+ string asm, list<dag> pat> {
+ let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in {
+ def "" : DS_Pseudo <opName, outs, ins, pat>;
+
+ let vdst = 0 in {
+ def _si : DS_Real_si <op, opName, outs, ins, asm>;
+ def _vi : DS_Real_vi <op, opName, outs, ins, asm>;
+ }
+ }
}
-class DS_Store2_Helper <bits<8> op, string asm, RegisterClass regClass> : DS_1A <
+multiclass DS_Store2_Helper <bits<8> op, string asm, RegisterClass regClass>
+ : DS_Store_m <
op,
+ asm,
(outs),
- (ins i1imm:$gds, VReg_32:$addr, regClass:$data0, u8imm:$offset0, u8imm:$offset1),
- asm#" $addr, $data0, $data1, $offset0, $offset1 [M0]",
- []> {
- let mayStore = 1;
- let mayLoad = 0;
- let vdst = 0;
+ (ins i1imm:$gds, VGPR_32:$addr, regClass:$data0, regClass:$data1,
+ ds_offset0:$offset0, ds_offset1:$offset1, M0Reg:$m0),
+ asm#" $addr, $data0, $data1"#"$offset0"#"$offset1 [M0]",
+ []>;
+
+class DS_1A_si <bits<8> op, dag outs, dag ins, string asm, list<dag> pat> :
+ DS_si <op, outs, ins, asm, pat> {
+ bits<16> offset;
+
+  // Single loads interpret the two i8imm operands as a single i16 offset.
+ let offset0 = offset{7-0};
+ let offset1 = offset{15-8};
+
+ let hasSideEffects = 0;
}
// 1 address, 1 data.
-class DS_1A1D_RET <bits<8> op, string asm, RegisterClass rc> : DS_1A <
+class DS_1A1D_RET <bits<8> op, string asm, RegisterClass rc, string noRetOp = ""> : DS_1A_si <
op,
(outs rc:$vdst),
- (ins i1imm:$gds, VReg_32:$addr, rc:$data0, u16imm:$offset),
- asm#" $vdst, $addr, $data0, $offset, [M0]",
- []> {
+ (ins i1imm:$gds, VGPR_32:$addr, rc:$data0, ds_offset:$offset, M0Reg:$m0),
+ asm#" $vdst, $addr, $data0"#"$offset"#" [M0]", []>,
+ AtomicNoRet<noRetOp, 1> {
let data1 = 0;
let mayStore = 1;
let mayLoad = 1;
+
+  let hasPostISelHook = 1; // Adjusted to the no-return version when the result is unused.
}
// 1 address, 2 data.
-class DS_1A2D_RET <bits<8> op, string asm, RegisterClass rc> : DS_1A <
+class DS_1A2D_RET <bits<8> op, string asm, RegisterClass rc, string noRetOp = ""> : DS_1A_si <
op,
(outs rc:$vdst),
- (ins i1imm:$gds, VReg_32:$addr, rc:$data0, rc:$data1, u16imm:$offset),
- asm#" $vdst, $addr, $data0, $data1, $offset, [M0]",
- []> {
+ (ins i1imm:$gds, VGPR_32:$addr, rc:$data0, rc:$data1, ds_offset:$offset, M0Reg:$m0),
+ asm#" $vdst, $addr, $data0, $data1"#"$offset"#" [M0]",
+ []>,
+ AtomicNoRet<noRetOp, 1> {
let mayStore = 1;
let mayLoad = 1;
+  let hasPostISelHook = 1; // Adjusted to the no-return version when the result is unused.
}
// 1 address, 2 data.
-class DS_1A2D_NORET <bits<8> op, string asm, RegisterClass rc> : DS_1A <
+class DS_1A2D_NORET <bits<8> op, string asm, RegisterClass rc, string noRetOp = asm> : DS_1A_si <
op,
(outs),
- (ins i1imm:$gds, VReg_32:$addr, rc:$data0, rc:$data1, u16imm:$offset),
- asm#" $addr, $data0, $data1, $offset, [M0]",
- []> {
+ (ins i1imm:$gds, VGPR_32:$addr, rc:$data0, rc:$data1, ds_offset:$offset, M0Reg:$m0),
+ asm#" $addr, $data0, $data1"#"$offset"#" [M0]",
+ []>,
+ AtomicNoRet<noRetOp, 0> {
let mayStore = 1;
let mayLoad = 1;
}
// 1 address, 1 data.
-class DS_1A1D_NORET <bits<8> op, string asm, RegisterClass rc> : DS_1A <
+class DS_1A1D_NORET <bits<8> op, string asm, RegisterClass rc, string noRetOp = asm> : DS_1A_si <
op,
(outs),
- (ins i1imm:$gds, VReg_32:$addr, rc:$data0, u16imm:$offset),
- asm#" $addr, $data0, $offset, [M0]",
- []> {
+ (ins i1imm:$gds, VGPR_32:$addr, rc:$data0, ds_offset:$offset, M0Reg:$m0),
+ asm#" $addr, $data0"#"$offset"#" [M0]",
+ []>,
+ AtomicNoRet<noRetOp, 0> {
let data1 = 0;
let mayStore = 1;
let mayLoad = 1;
}
-class MTBUF_Store_Helper <bits<3> op, string asm, RegisterClass regClass> : MTBUF <
- op,
- (outs),
+//===----------------------------------------------------------------------===//
+// MTBUF classes
+//===----------------------------------------------------------------------===//
+
+class MTBUF_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> :
+ MTBUF <outs, ins, "", pattern>,
+ SIMCInstr<opName, SISubtarget.NONE> {
+ let isPseudo = 1;
+}
+
+class MTBUF_Real_si <bits<3> op, string opName, dag outs, dag ins,
+ string asm> :
+ MTBUF <outs, ins, asm, []>,
+ MTBUFe <op>,
+ SIMCInstr<opName, SISubtarget.SI>;
+
+class MTBUF_Real_vi <bits<4> op, string opName, dag outs, dag ins, string asm> :
+ MTBUF <outs, ins, asm, []>,
+ MTBUFe_vi <op>,
+ SIMCInstr <opName, SISubtarget.VI>;
+
+multiclass MTBUF_m <bits<3> op, string opName, dag outs, dag ins, string asm,
+ list<dag> pattern> {
+
+ def "" : MTBUF_Pseudo <opName, outs, ins, pattern>;
+
+ def _si : MTBUF_Real_si <op, opName, outs, ins, asm>;
+
+ def _vi : MTBUF_Real_vi <{0, op{2}, op{1}, op{0}}, opName, outs, ins, asm>;
+
+}
+
+let mayStore = 1, mayLoad = 0 in {
+
+multiclass MTBUF_Store_Helper <bits<3> op, string opName,
+ RegisterClass regClass> : MTBUF_m <
+ op, opName, (outs),
(ins regClass:$vdata, u16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc,
- i1imm:$addr64, i8imm:$dfmt, i8imm:$nfmt, VReg_32:$vaddr,
- SReg_128:$srsrc, i1imm:$slc, i1imm:$tfe, SSrc_32:$soffset),
- asm#" $vdata, $offset, $offen, $idxen, $glc, $addr64, $dfmt,"
- #" $nfmt, $vaddr, $srsrc, $slc, $tfe, $soffset",
- []> {
- let mayStore = 1;
- let mayLoad = 0;
+ i1imm:$addr64, i8imm:$dfmt, i8imm:$nfmt, VGPR_32:$vaddr,
+ SReg_128:$srsrc, i1imm:$slc, i1imm:$tfe, SCSrc_32:$soffset),
+ opName#" $vdata, $offset, $offen, $idxen, $glc, $addr64, $dfmt,"
+ #" $nfmt, $vaddr, $srsrc, $slc, $tfe, $soffset", []
+>;
+
+} // mayStore = 1, mayLoad = 0
+
+let mayLoad = 1, mayStore = 0 in {
+
+multiclass MTBUF_Load_Helper <bits<3> op, string opName,
+ RegisterClass regClass> : MTBUF_m <
+ op, opName, (outs regClass:$dst),
+ (ins u16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$addr64,
+ i8imm:$dfmt, i8imm:$nfmt, VGPR_32:$vaddr, SReg_128:$srsrc,
+ i1imm:$slc, i1imm:$tfe, SCSrc_32:$soffset),
+ opName#" $dst, $offset, $offen, $idxen, $glc, $addr64, $dfmt,"
+ #" $nfmt, $vaddr, $srsrc, $slc, $tfe, $soffset", []
+>;
+
+} // mayLoad = 1, mayStore = 0
+
+//===----------------------------------------------------------------------===//
+// MUBUF classes
+//===----------------------------------------------------------------------===//
+
+class MUBUF_si <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
+ MUBUF <outs, ins, asm, pattern>, MUBUFe <op> {
+ let lds = 0;
+}
+
+class MUBUF_vi <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
+ MUBUF <outs, ins, asm, pattern>, MUBUFe_vi <op> {
+ let lds = 0;
+}
+
+class MUBUFAddr64Table <bit is_addr64, string suffix = ""> {
+
+ bit IsAddr64 = is_addr64;
+ string OpName = NAME # suffix;
+}
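+// Each MUBUF load/store/atomic variant below mixes in MUBUFAddr64Table to
+// record its opcode name and whether it is the addr64 form; the getAddr64Inst
+// mapping at the end of this file uses these fields to translate an
+// offset-form opcode into its addr64 counterpart.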
+
+class MUBUFAtomicAddr64 <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern>
+ : MUBUF_si <op, outs, ins, asm, pattern> {
+
+ let offen = 0;
+ let idxen = 0;
+ let addr64 = 1;
+ let tfe = 0;
+ let lds = 0;
+ let soffset = 128;
+}
+
+class MUBUFAtomicOffset <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern>
+ : MUBUF_si <op, outs, ins, asm, pattern> {
+
+ let offen = 0;
+ let idxen = 0;
+ let addr64 = 0;
+ let tfe = 0;
+ let lds = 0;
+ let vaddr = 0;
+}
+
+multiclass MUBUF_Atomic <bits<7> op, string name, RegisterClass rc,
+ ValueType vt, SDPatternOperator atomic> {
+
+ let mayStore = 1, mayLoad = 1, hasPostISelHook = 1 in {
+
+  // No-return variants
+ let glc = 0 in {
+
+ def _ADDR64 : MUBUFAtomicAddr64 <
+ op, (outs),
+ (ins rc:$vdata, SReg_128:$srsrc, VReg_64:$vaddr,
+ mbuf_offset:$offset, slc:$slc),
+ name#" $vdata, $vaddr, $srsrc, 0 addr64"#"$offset"#"$slc", []
+ >, MUBUFAddr64Table<1>, AtomicNoRet<NAME#"_ADDR64", 0>;
+
+ def _OFFSET : MUBUFAtomicOffset <
+ op, (outs),
+ (ins rc:$vdata, SReg_128:$srsrc, mbuf_offset:$offset,
+ SCSrc_32:$soffset, slc:$slc),
+ name#" $vdata, $srsrc, $soffset"#"$offset"#"$slc", []
+ >, MUBUFAddr64Table<0>, AtomicNoRet<NAME#"_OFFSET", 0>;
+ } // glc = 0
+
+  // Variants that return a value
+ let glc = 1, Constraints = "$vdata = $vdata_in",
+ DisableEncoding = "$vdata_in" in {
+
+ def _RTN_ADDR64 : MUBUFAtomicAddr64 <
+ op, (outs rc:$vdata),
+ (ins rc:$vdata_in, SReg_128:$srsrc, VReg_64:$vaddr,
+ mbuf_offset:$offset, slc:$slc),
+ name#" $vdata, $vaddr, $srsrc, 0 addr64"#"$offset"#" glc"#"$slc",
+ [(set vt:$vdata,
+ (atomic (MUBUFAddr64Atomic v4i32:$srsrc, i64:$vaddr, i16:$offset,
+ i1:$slc), vt:$vdata_in))]
+ >, MUBUFAddr64Table<1, "_RTN">, AtomicNoRet<NAME#"_ADDR64", 1>;
+
+ def _RTN_OFFSET : MUBUFAtomicOffset <
+ op, (outs rc:$vdata),
+ (ins rc:$vdata_in, SReg_128:$srsrc, mbuf_offset:$offset,
+ SCSrc_32:$soffset, slc:$slc),
+ name#" $vdata, $srsrc, $soffset"#"$offset"#" glc $slc",
+ [(set vt:$vdata,
+ (atomic (MUBUFOffsetAtomic v4i32:$srsrc, i32:$soffset, i16:$offset,
+ i1:$slc), vt:$vdata_in))]
+ >, MUBUFAddr64Table<0, "_RTN">, AtomicNoRet<NAME#"_OFFSET", 1>;
+
+ } // glc = 1
+
+ } // mayStore = 1, mayLoad = 1, hasPostISelHook = 1
}
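+// A buffer atomic defined with this multiclass therefore gets _OFFSET,
+// _ADDR64, _RTN_OFFSET and _RTN_ADDR64 variants; only the glc = 1 (_RTN)
+// forms carry selection patterns, and the post-ISel hook is expected to
+// switch to the matching no-return opcode (via getAtomicNoRetOp below) when
+// the returned value goes unused.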
multiclass MUBUF_Load_Helper <bits<7> op, string asm, RegisterClass regClass,
ValueType load_vt = i32,
SDPatternOperator ld = null_frag> {
- let lds = 0, mayLoad = 1 in {
+ let mayLoad = 1, mayStore = 0 in {
let addr64 = 0 in {
let offen = 0, idxen = 0, vaddr = 0 in {
- def _OFFSET : MUBUF <op, (outs regClass:$vdata),
+ def _OFFSET : MUBUF_si <op, (outs regClass:$vdata),
(ins SReg_128:$srsrc,
- u16imm:$offset, SSrc_32:$soffset, i1imm:$glc,
- i1imm:$slc, i1imm:$tfe),
- asm#" $vdata, $srsrc + $offset + $soffset, glc=$glc, slc=$slc, tfe=$tfe", []>;
+ mbuf_offset:$offset, SCSrc_32:$soffset, glc:$glc,
+ slc:$slc, tfe:$tfe),
+ asm#" $vdata, $srsrc, $soffset"#"$offset"#"$glc"#"$slc"#"$tfe",
+ [(set load_vt:$vdata, (ld (MUBUFOffset v4i32:$srsrc,
+ i32:$soffset, i16:$offset,
+ i1:$glc, i1:$slc, i1:$tfe)))]>,
+ MUBUFAddr64Table<0>;
}
let offen = 1, idxen = 0 in {
- def _OFFEN : MUBUF <op, (outs regClass:$vdata),
- (ins SReg_128:$srsrc, VReg_32:$vaddr,
- SSrc_32:$soffset, u16imm:$offset, i1imm:$glc, i1imm:$slc,
- i1imm:$tfe),
- asm#" $vdata, $srsrc + $vaddr + $soffset + $offset, glc=$glc, slc=$slc, tfe=$tfe", []>;
+ def _OFFEN : MUBUF_si <op, (outs regClass:$vdata),
+ (ins SReg_128:$srsrc, VGPR_32:$vaddr,
+ SCSrc_32:$soffset, mbuf_offset:$offset, glc:$glc, slc:$slc,
+ tfe:$tfe),
+ asm#" $vdata, $vaddr, $srsrc, $soffset offen"#"$offset"#"$glc"#"$slc"#"$tfe", []>;
}
let offen = 0, idxen = 1 in {
- def _IDXEN : MUBUF <op, (outs regClass:$vdata),
- (ins SReg_128:$srsrc, VReg_32:$vaddr,
- u16imm:$offset, SSrc_32:$soffset, i1imm:$glc,
- i1imm:$slc, i1imm:$tfe),
- asm#" $vdata, $srsrc[$vaddr] + $offset + $soffset, glc=$glc, slc=$slc, tfe=$tfe", []>;
+ def _IDXEN : MUBUF_si <op, (outs regClass:$vdata),
+ (ins SReg_128:$srsrc, VGPR_32:$vaddr,
+ mbuf_offset:$offset, SCSrc_32:$soffset, glc:$glc,
+ slc:$slc, tfe:$tfe),
+ asm#" $vdata, $vaddr, $srsrc, $soffset idxen"#"$offset"#"$glc"#"$slc"#"$tfe", []>;
}
let offen = 1, idxen = 1 in {
- def _BOTHEN : MUBUF <op, (outs regClass:$vdata),
+ def _BOTHEN : MUBUF_si <op, (outs regClass:$vdata),
(ins SReg_128:$srsrc, VReg_64:$vaddr,
- SSrc_32:$soffset, i1imm:$glc,
- i1imm:$slc, i1imm:$tfe),
- asm#" $vdata, $srsrc[$vaddr[0]] + $vaddr[1] + $soffset, glc=$glc, slc=$slc, tfe=$tfe", []>;
+ SCSrc_32:$soffset, glc:$glc, slc:$slc, tfe:$tfe),
+ asm#" $vdata, $vaddr, $srsrc, $soffset, idxen offen"#"$glc"#"$slc"#"$tfe", []>;
}
}
let offen = 0, idxen = 0, addr64 = 1, glc = 0, slc = 0, tfe = 0, soffset = 128 /* ZERO */ in {
- def _ADDR64 : MUBUF <op, (outs regClass:$vdata),
- (ins SReg_128:$srsrc, VReg_64:$vaddr, u16imm:$offset),
- asm#" $vdata, $srsrc + $vaddr + $offset",
+ def _ADDR64 : MUBUF_si <op, (outs regClass:$vdata),
+ (ins SReg_128:$srsrc, VReg_64:$vaddr, mbuf_offset:$offset),
+ asm#" $vdata, $vaddr, $srsrc, 0 addr64"#"$offset",
[(set load_vt:$vdata, (ld (MUBUFAddr64 v4i32:$srsrc,
- i64:$vaddr, u16imm:$offset)))]>;
+ i64:$vaddr, i16:$offset)))]>, MUBUFAddr64Table<1>;
+ }
+ }
+}
+
+multiclass MUBUF_Load_Helper_vi <bits<7> op, string asm, RegisterClass regClass,
+ ValueType load_vt = i32,
+ SDPatternOperator ld = null_frag> {
+
+ let lds = 0, mayLoad = 1 in {
+ let offen = 0, idxen = 0, vaddr = 0 in {
+ def _OFFSET : MUBUF_vi <op, (outs regClass:$vdata),
+ (ins SReg_128:$srsrc,
+ mbuf_offset:$offset, SCSrc_32:$soffset, glc:$glc,
+ slc:$slc, tfe:$tfe),
+ asm#" $vdata, $srsrc, $soffset"#"$offset"#"$glc"#"$slc"#"$tfe",
+ [(set load_vt:$vdata, (ld (MUBUFOffset v4i32:$srsrc,
+ i32:$soffset, i16:$offset,
+ i1:$glc, i1:$slc, i1:$tfe)))]>,
+ MUBUFAddr64Table<0>;
+ }
+
+ let offen = 1, idxen = 0 in {
+ def _OFFEN : MUBUF_vi <op, (outs regClass:$vdata),
+ (ins SReg_128:$srsrc, VGPR_32:$vaddr,
+ SCSrc_32:$soffset, mbuf_offset:$offset, glc:$glc, slc:$slc,
+ tfe:$tfe),
+ asm#" $vdata, $vaddr, $srsrc, $soffset offen"#"$offset"#"$glc"#"$slc"#"$tfe", []>;
+ }
+
+ let offen = 0, idxen = 1 in {
+ def _IDXEN : MUBUF_vi <op, (outs regClass:$vdata),
+ (ins SReg_128:$srsrc, VGPR_32:$vaddr,
+ mbuf_offset:$offset, SCSrc_32:$soffset, glc:$glc,
+ slc:$slc, tfe:$tfe),
+ asm#" $vdata, $vaddr, $srsrc, $soffset idxen"#"$offset"#"$glc"#"$slc"#"$tfe", []>;
+ }
+
+ let offen = 1, idxen = 1 in {
+ def _BOTHEN : MUBUF_vi <op, (outs regClass:$vdata),
+ (ins SReg_128:$srsrc, VReg_64:$vaddr,
+ SCSrc_32:$soffset, glc:$glc, slc:$slc, tfe:$tfe),
+ asm#" $vdata, $vaddr, $srsrc, $soffset, idxen offen"#"$glc"#"$slc"#"$tfe", []>;
}
}
}
@@ -679,23 +1763,51 @@ multiclass MUBUF_Load_Helper <bits<7> op, string asm, RegisterClass regClass,
multiclass MUBUF_Store_Helper <bits<7> op, string name, RegisterClass vdataClass,
ValueType store_vt, SDPatternOperator st> {
- def "" : MUBUF <
- op, (outs),
- (ins vdataClass:$vdata, SReg_128:$srsrc, VReg_32:$vaddr, SSrc_32:$soffset,
- u16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$slc,
- i1imm:$tfe),
- name#" $vdata, $srsrc, $vaddr, $soffset, $offset $offen $idxen $glc $slc $tfe",
- []
- > {
- let addr64 = 0;
- }
+ let mayLoad = 0, mayStore = 1 in {
+ let addr64 = 0 in {
+
+ def "" : MUBUF_si <
+ op, (outs),
+ (ins vdataClass:$vdata, SReg_128:$srsrc, VGPR_32:$vaddr, SCSrc_32:$soffset,
+ mbuf_offset:$offset, offen:$offen, idxen:$idxen, glc:$glc, slc:$slc,
+ tfe:$tfe),
+ name#" $vdata, $vaddr, $srsrc, $soffset"#"$offen"#"$idxen"#"$offset"#
+ "$glc"#"$slc"#"$tfe",
+ []
+ >;
- def _ADDR64 : MUBUF <
+ let offen = 0, idxen = 0, vaddr = 0 in {
+ def _OFFSET : MUBUF_si <
+ op, (outs),
+ (ins vdataClass:$vdata, SReg_128:$srsrc, mbuf_offset:$offset,
+ SCSrc_32:$soffset, glc:$glc, slc:$slc, tfe:$tfe),
+ name#" $vdata, $srsrc, $soffset"#"$offset"#"$glc"#"$slc"#"$tfe",
+ [(st store_vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset,
+ i16:$offset, i1:$glc, i1:$slc,
+ i1:$tfe))]
+ >, MUBUFAddr64Table<0>;
+ } // offen = 0, idxen = 0, vaddr = 0
+
+ let offen = 1, idxen = 0 in {
+ def _OFFEN : MUBUF_si <
+ op, (outs),
+ (ins vdataClass:$vdata, SReg_128:$srsrc, VGPR_32:$vaddr, SCSrc_32:$soffset,
+ mbuf_offset:$offset, glc:$glc, slc:$slc, tfe:$tfe),
+ name#" $vdata, $vaddr, $srsrc, $soffset offen"#"$offset"#
+ "$glc"#"$slc"#"$tfe",
+ []
+ >;
+ } // end offen = 1, idxen = 0
+
+ } // End addr64 = 0
+
+ def _ADDR64 : MUBUF_si <
op, (outs),
- (ins vdataClass:$vdata, SReg_128:$srsrc, VReg_64:$vaddr, u16imm:$offset),
- name#" $vdata, $srsrc + $vaddr + $offset",
+ (ins vdataClass:$vdata, SReg_128:$srsrc, VReg_64:$vaddr, mbuf_offset:$offset),
+ name#" $vdata, $vaddr, $srsrc, 0 addr64"#"$offset",
[(st store_vt:$vdata,
- (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, u16imm:$offset))]> {
+ (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i16:$offset))]>, MUBUFAddr64Table<1>
+ {
let mayLoad = 0;
let mayStore = 1;
@@ -705,24 +1817,35 @@ multiclass MUBUF_Store_Helper <bits<7> op, string name, RegisterClass vdataClass
let idxen = 0;
let glc = 0;
let addr64 = 1;
- let lds = 0;
let slc = 0;
let tfe = 0;
let soffset = 128; // ZERO
}
+ } // End mayLoad = 0, mayStore = 1
}
-class MTBUF_Load_Helper <bits<3> op, string asm, RegisterClass regClass> : MTBUF <
- op,
- (outs regClass:$dst),
- (ins u16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$addr64,
- i8imm:$dfmt, i8imm:$nfmt, VReg_32:$vaddr, SReg_128:$srsrc,
- i1imm:$slc, i1imm:$tfe, SSrc_32:$soffset),
- asm#" $dst, $offset, $offen, $idxen, $glc, $addr64, $dfmt,"
- #" $nfmt, $vaddr, $srsrc, $slc, $tfe, $soffset",
- []> {
+class FLAT_Load_Helper <bits<7> op, string asm, RegisterClass regClass> :
+ FLAT <op, (outs regClass:$data),
+ (ins VReg_64:$addr),
+ asm#" $data, $addr, [M0, FLAT_SCRATCH]", []> {
+ let glc = 0;
+ let slc = 0;
+ let tfe = 0;
let mayLoad = 1;
- let mayStore = 0;
+}
+
+class FLAT_Store_Helper <bits<7> op, string name, RegisterClass vdataClass> :
+ FLAT <op, (outs), (ins vdataClass:$data, VReg_64:$addr),
+ name#" $data, $addr, [M0, FLAT_SCRATCH]",
+ []> {
+
+ let mayLoad = 0;
+ let mayStore = 1;
+
+ // Encoding
+ let glc = 0;
+ let slc = 0;
+ let tfe = 0;
}
class MIMG_Mask <string op, int channels> {
@@ -750,7 +1873,7 @@ class MIMG_NoSampler_Helper <bits<7> op, string asm,
multiclass MIMG_NoSampler_Src_Helper <bits<7> op, string asm,
RegisterClass dst_rc,
int channels> {
- def _V1 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_32>,
+ def _V1 : MIMG_NoSampler_Helper <op, asm, dst_rc, VGPR_32>,
MIMG_Mask<asm#"_V1", channels>;
def _V2 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_64>,
MIMG_Mask<asm#"_V2", channels>;
@@ -759,7 +1882,7 @@ multiclass MIMG_NoSampler_Src_Helper <bits<7> op, string asm,
}
multiclass MIMG_NoSampler <bits<7> op, string asm> {
- defm _V1 : MIMG_NoSampler_Src_Helper <op, asm, VReg_32, 1>;
+ defm _V1 : MIMG_NoSampler_Src_Helper <op, asm, VGPR_32, 1>;
defm _V2 : MIMG_NoSampler_Src_Helper <op, asm, VReg_64, 2>;
defm _V3 : MIMG_NoSampler_Src_Helper <op, asm, VReg_96, 3>;
defm _V4 : MIMG_NoSampler_Src_Helper <op, asm, VReg_128, 4>;
@@ -784,7 +1907,7 @@ class MIMG_Sampler_Helper <bits<7> op, string asm,
multiclass MIMG_Sampler_Src_Helper <bits<7> op, string asm,
RegisterClass dst_rc,
int channels> {
- def _V1 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_32>,
+ def _V1 : MIMG_Sampler_Helper <op, asm, dst_rc, VGPR_32>,
MIMG_Mask<asm#"_V1", channels>;
def _V2 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_64>,
MIMG_Mask<asm#"_V2", channels>;
@@ -797,7 +1920,7 @@ multiclass MIMG_Sampler_Src_Helper <bits<7> op, string asm,
}
multiclass MIMG_Sampler <bits<7> op, string asm> {
- defm _V1 : MIMG_Sampler_Src_Helper<op, asm, VReg_32, 1>;
+ defm _V1 : MIMG_Sampler_Src_Helper<op, asm, VGPR_32, 1>;
defm _V2 : MIMG_Sampler_Src_Helper<op, asm, VReg_64, 2>;
defm _V3 : MIMG_Sampler_Src_Helper<op, asm, VReg_96, 3>;
defm _V4 : MIMG_Sampler_Src_Helper<op, asm, VReg_128, 4>;
@@ -831,7 +1954,7 @@ class MIMG_Gather_Helper <bits<7> op, string asm,
multiclass MIMG_Gather_Src_Helper <bits<7> op, string asm,
RegisterClass dst_rc,
int channels> {
- def _V1 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_32>,
+ def _V1 : MIMG_Gather_Helper <op, asm, dst_rc, VGPR_32>,
MIMG_Mask<asm#"_V1", channels>;
def _V2 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_64>,
MIMG_Mask<asm#"_V2", channels>;
@@ -844,7 +1967,7 @@ multiclass MIMG_Gather_Src_Helper <bits<7> op, string asm,
}
multiclass MIMG_Gather <bits<7> op, string asm> {
- defm _V1 : MIMG_Gather_Src_Helper<op, asm, VReg_32, 1>;
+ defm _V1 : MIMG_Gather_Src_Helper<op, asm, VGPR_32, 1>;
defm _V2 : MIMG_Gather_Src_Helper<op, asm, VReg_64, 2>;
defm _V3 : MIMG_Gather_Src_Helper<op, asm, VReg_96, 3>;
defm _V4 : MIMG_Gather_Src_Helper<op, asm, VReg_128, 4>;
@@ -898,20 +2021,40 @@ def getCommuteOrig : InstrMapping {
let ValueCols = [["1"]];
}
-def isDS : InstrMapping {
- let FilterClass = "DS";
- let RowFields = ["Inst"];
- let ColFields = ["Size"];
- let KeyCol = ["8"];
- let ValueCols = [["8"]];
-}
-
-def getMCOpcode : InstrMapping {
+def getMCOpcodeGen : InstrMapping {
let FilterClass = "SIMCInstr";
let RowFields = ["PseudoInstr"];
let ColFields = ["Subtarget"];
let KeyCol = [!cast<string>(SISubtarget.NONE)];
- let ValueCols = [[!cast<string>(SISubtarget.SI)]];
+ let ValueCols = [[!cast<string>(SISubtarget.SI)],[!cast<string>(SISubtarget.VI)]];
+}
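+// TableGen emits a lookup for this mapping, along the lines of
+//   int getMCOpcodeGen(uint16_t Opcode, enum Subtarget Gen);
+// in the AMDGPU namespace (the exact signature is generator-defined), which
+// lets MC lowering resolve a pseudo to its SI or VI encoding.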
+
+def getAddr64Inst : InstrMapping {
+ let FilterClass = "MUBUFAddr64Table";
+ let RowFields = ["OpName"];
+ let ColFields = ["IsAddr64"];
+ let KeyCol = ["0"];
+ let ValueCols = [["1"]];
+}
+
+// Maps an atomic opcode to its version with a return value.
+def getAtomicRetOp : InstrMapping {
+ let FilterClass = "AtomicNoRet";
+ let RowFields = ["NoRetOp"];
+ let ColFields = ["IsRet"];
+ let KeyCol = ["0"];
+ let ValueCols = [["1"]];
+}
+
+// Maps an atomic opcode to its returnless version.
+def getAtomicNoRetOp : InstrMapping {
+ let FilterClass = "AtomicNoRet";
+ let RowFields = ["NoRetOp"];
+ let ColFields = ["IsRet"];
+ let KeyCol = ["1"];
+ let ValueCols = [["0"]];
}
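+// These two mappings are inverses of each other; the generated helpers
+// (roughly int getAtomicRetOp(uint16_t Opcode) and
+// int getAtomicNoRetOp(uint16_t Opcode), returning -1 when no counterpart
+// exists) let later passes switch an atomic between its returning and
+// non-returning forms.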
include "SIInstructions.td"
+include "CIInstructions.td"
+include "VIInstructions.td"
diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
index aecd847a2ba1..e05b6bb7d0f1 100644
--- a/lib/Target/R600/SIInstructions.td
+++ b/lib/Target/R600/SIInstructions.td
@@ -26,18 +26,37 @@ def SendMsgImm : Operand<i32> {
let PrintMethod = "printSendMsg";
}
-def isSI : Predicate<"Subtarget.getGeneration() "
+def isGCN : Predicate<"Subtarget.getGeneration() "
">= AMDGPUSubtarget::SOUTHERN_ISLANDS">;
-
+def isSICI : Predicate<
+ "Subtarget.getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS ||"
+ "Subtarget.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS"
+>;
def isCI : Predicate<"Subtarget.getGeneration() "
">= AMDGPUSubtarget::SEA_ISLANDS">;
+def isVI : Predicate <
+ "Subtarget.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS"
+>;
-def isCFDepth0 : Predicate<"isCFDepth0()">;
+def HasFlatAddressSpace : Predicate<"Subtarget.hasFlatAddressSpace()">;
-def WAIT_FLAG : InstFlag<"printWaitFlag">;
+def SWaitMatchClass : AsmOperandClass {
+ let Name = "SWaitCnt";
+ let RenderMethod = "addImmOperands";
+ let ParserMethod = "parseSWaitCntOps";
+}
-let SubtargetPredicate = isSI in {
-let OtherPredicates = [isCFDepth0] in {
+def WAIT_FLAG : InstFlag<"printWaitFlag"> {
+ let ParserMatchClass = SWaitMatchClass;
+}
+
+let SubtargetPredicate = isGCN in {
+
+//===----------------------------------------------------------------------===//
+// EXP Instructions
+//===----------------------------------------------------------------------===//
+
+defm EXP : EXP_m;
//===----------------------------------------------------------------------===//
// SMRD Instructions
@@ -48,129 +67,135 @@ let mayLoad = 1 in {
// We are using the SGPR_32 and not the SReg_32 register class for 32-bit
// SMRD instructions, because the SGPR_32 register class does not include M0
// and writing to M0 from an SMRD instruction will hang the GPU.
-defm S_LOAD_DWORD : SMRD_Helper <0x00, "S_LOAD_DWORD", SReg_64, SGPR_32>;
-defm S_LOAD_DWORDX2 : SMRD_Helper <0x01, "S_LOAD_DWORDX2", SReg_64, SReg_64>;
-defm S_LOAD_DWORDX4 : SMRD_Helper <0x02, "S_LOAD_DWORDX4", SReg_64, SReg_128>;
-defm S_LOAD_DWORDX8 : SMRD_Helper <0x03, "S_LOAD_DWORDX8", SReg_64, SReg_256>;
-defm S_LOAD_DWORDX16 : SMRD_Helper <0x04, "S_LOAD_DWORDX16", SReg_64, SReg_512>;
+defm S_LOAD_DWORD : SMRD_Helper <0x00, "s_load_dword", SReg_64, SGPR_32>;
+defm S_LOAD_DWORDX2 : SMRD_Helper <0x01, "s_load_dwordx2", SReg_64, SReg_64>;
+defm S_LOAD_DWORDX4 : SMRD_Helper <0x02, "s_load_dwordx4", SReg_64, SReg_128>;
+defm S_LOAD_DWORDX8 : SMRD_Helper <0x03, "s_load_dwordx8", SReg_64, SReg_256>;
+defm S_LOAD_DWORDX16 : SMRD_Helper <0x04, "s_load_dwordx16", SReg_64, SReg_512>;
defm S_BUFFER_LOAD_DWORD : SMRD_Helper <
- 0x08, "S_BUFFER_LOAD_DWORD", SReg_128, SGPR_32
+ 0x08, "s_buffer_load_dword", SReg_128, SGPR_32
>;
defm S_BUFFER_LOAD_DWORDX2 : SMRD_Helper <
- 0x09, "S_BUFFER_LOAD_DWORDX2", SReg_128, SReg_64
+ 0x09, "s_buffer_load_dwordx2", SReg_128, SReg_64
>;
defm S_BUFFER_LOAD_DWORDX4 : SMRD_Helper <
- 0x0a, "S_BUFFER_LOAD_DWORDX4", SReg_128, SReg_128
+ 0x0a, "s_buffer_load_dwordx4", SReg_128, SReg_128
>;
defm S_BUFFER_LOAD_DWORDX8 : SMRD_Helper <
- 0x0b, "S_BUFFER_LOAD_DWORDX8", SReg_128, SReg_256
+ 0x0b, "s_buffer_load_dwordx8", SReg_128, SReg_256
>;
defm S_BUFFER_LOAD_DWORDX16 : SMRD_Helper <
- 0x0c, "S_BUFFER_LOAD_DWORDX16", SReg_128, SReg_512
+ 0x0c, "s_buffer_load_dwordx16", SReg_128, SReg_512
>;
} // mayLoad = 1
-//def S_MEMTIME : SMRD_ <0x0000001e, "S_MEMTIME", []>;
-//def S_DCACHE_INV : SMRD_ <0x0000001f, "S_DCACHE_INV", []>;
+//def S_MEMTIME : SMRD_ <0x0000001e, "s_memtime", []>;
+//def S_DCACHE_INV : SMRD_ <0x0000001f, "s_dcache_inv", []>;
//===----------------------------------------------------------------------===//
// SOP1 Instructions
//===----------------------------------------------------------------------===//
-let neverHasSideEffects = 1 in {
-
let isMoveImm = 1 in {
-def S_MOV_B32 : SOP1_32 <0x00000003, "S_MOV_B32", []>;
-def S_MOV_B64 : SOP1_64 <0x00000004, "S_MOV_B64", []>;
-def S_CMOV_B32 : SOP1_32 <0x00000005, "S_CMOV_B32", []>;
-def S_CMOV_B64 : SOP1_64 <0x00000006, "S_CMOV_B64", []>;
+ let isReMaterializable = 1 in {
+ defm S_MOV_B32 : SOP1_32 <sop1<0x03, 0x00>, "s_mov_b32", []>;
+ defm S_MOV_B64 : SOP1_64 <sop1<0x04, 0x01>, "s_mov_b64", []>;
+  } // End isReMaterializable = 1
+
+ let Uses = [SCC] in {
+ defm S_CMOV_B32 : SOP1_32 <sop1<0x05, 0x02>, "s_cmov_b32", []>;
+ defm S_CMOV_B64 : SOP1_64 <sop1<0x06, 0x03>, "s_cmov_b64", []>;
+ } // End Uses = [SCC]
} // End isMoveImm = 1
-def S_NOT_B32 : SOP1_32 <0x00000007, "S_NOT_B32",
- [(set i32:$dst, (not i32:$src0))]
->;
+let Defs = [SCC] in {
+ defm S_NOT_B32 : SOP1_32 <sop1<0x07, 0x04>, "s_not_b32",
+ [(set i32:$dst, (not i32:$src0))]
+ >;
-def S_NOT_B64 : SOP1_64 <0x00000008, "S_NOT_B64",
- [(set i64:$dst, (not i64:$src0))]
->;
-def S_WQM_B32 : SOP1_32 <0x00000009, "S_WQM_B32", []>;
-def S_WQM_B64 : SOP1_64 <0x0000000a, "S_WQM_B64", []>;
-def S_BREV_B32 : SOP1_32 <0x0000000b, "S_BREV_B32",
+ defm S_NOT_B64 : SOP1_64 <sop1<0x08, 0x05>, "s_not_b64",
+ [(set i64:$dst, (not i64:$src0))]
+ >;
+ defm S_WQM_B32 : SOP1_32 <sop1<0x09, 0x06>, "s_wqm_b32", []>;
+ defm S_WQM_B64 : SOP1_64 <sop1<0x0a, 0x07>, "s_wqm_b64", []>;
+} // End Defs = [SCC]
+
+
+defm S_BREV_B32 : SOP1_32 <sop1<0x0b, 0x08>, "s_brev_b32",
[(set i32:$dst, (AMDGPUbrev i32:$src0))]
>;
-def S_BREV_B64 : SOP1_64 <0x0000000c, "S_BREV_B64", []>;
-} // End neverHasSideEffects = 1
+defm S_BREV_B64 : SOP1_64 <sop1<0x0c, 0x09>, "s_brev_b64", []>;
-////def S_BCNT0_I32_B32 : SOP1_BCNT0 <0x0000000d, "S_BCNT0_I32_B32", []>;
-////def S_BCNT0_I32_B64 : SOP1_BCNT0 <0x0000000e, "S_BCNT0_I32_B64", []>;
-def S_BCNT1_I32_B32 : SOP1_32 <0x0000000f, "S_BCNT1_I32_B32",
- [(set i32:$dst, (ctpop i32:$src0))]
->;
-def S_BCNT1_I32_B64 : SOP1_32_64 <0x00000010, "S_BCNT1_I32_B64", []>;
+let Defs = [SCC] in {
+ //defm S_BCNT0_I32_B32 : SOP1_BCNT0 <sop1<0x0d, 0x0a>, "s_bcnt0_i32_b32", []>;
+ //defm S_BCNT0_I32_B64 : SOP1_BCNT0 <sop1<0x0e, 0x0b>, "s_bcnt0_i32_b64", []>;
+ defm S_BCNT1_I32_B32 : SOP1_32 <sop1<0x0f, 0x0c>, "s_bcnt1_i32_b32",
+ [(set i32:$dst, (ctpop i32:$src0))]
+ >;
+ defm S_BCNT1_I32_B64 : SOP1_32_64 <sop1<0x10, 0x0d>, "s_bcnt1_i32_b64", []>;
+} // End Defs = [SCC]
-////def S_FF0_I32_B32 : SOP1_32 <0x00000011, "S_FF0_I32_B32", []>;
-////def S_FF0_I32_B64 : SOP1_FF0 <0x00000012, "S_FF0_I32_B64", []>;
-def S_FF1_I32_B32 : SOP1_32 <0x00000013, "S_FF1_I32_B32",
+//defm S_FF0_I32_B32 : SOP1_32 <sop1<0x11, 0x0e>, "s_ff0_i32_b32", []>;
+//defm S_FF0_I32_B64 : SOP1_FF0 <sop1<0x12, 0x0f>, "s_ff0_i32_b64", []>;
+defm S_FF1_I32_B32 : SOP1_32 <sop1<0x13, 0x10>, "s_ff1_i32_b32",
[(set i32:$dst, (cttz_zero_undef i32:$src0))]
>;
-////def S_FF1_I32_B64 : SOP1_FF1 <0x00000014, "S_FF1_I32_B64", []>;
+////defm S_FF1_I32_B64 : SOP1_FF1 <sop1<0x14, 0x11>, "s_ff1_i32_b64", []>;
-def S_FLBIT_I32_B32 : SOP1_32 <0x00000015, "S_FLBIT_I32_B32",
+defm S_FLBIT_I32_B32 : SOP1_32 <sop1<0x15, 0x12>, "s_flbit_i32_b32",
[(set i32:$dst, (ctlz_zero_undef i32:$src0))]
>;
-//def S_FLBIT_I32_B64 : SOP1_32 <0x00000016, "S_FLBIT_I32_B64", []>;
-def S_FLBIT_I32 : SOP1_32 <0x00000017, "S_FLBIT_I32", []>;
-//def S_FLBIT_I32_I64 : SOP1_32 <0x00000018, "S_FLBIT_I32_I64", []>;
-def S_SEXT_I32_I8 : SOP1_32 <0x00000019, "S_SEXT_I32_I8",
+//defm S_FLBIT_I32_B64 : SOP1_32 <sop1<0x16, 0x13>, "s_flbit_i32_b64", []>;
+defm S_FLBIT_I32 : SOP1_32 <sop1<0x17, 0x14>, "s_flbit_i32", []>;
+//defm S_FLBIT_I32_I64 : SOP1_32 <sop1<0x18, 0x15>, "s_flbit_i32_i64", []>;
+defm S_SEXT_I32_I8 : SOP1_32 <sop1<0x19, 0x16>, "s_sext_i32_i8",
[(set i32:$dst, (sext_inreg i32:$src0, i8))]
>;
-def S_SEXT_I32_I16 : SOP1_32 <0x0000001a, "S_SEXT_I32_I16",
+defm S_SEXT_I32_I16 : SOP1_32 <sop1<0x1a, 0x17>, "s_sext_i32_i16",
[(set i32:$dst, (sext_inreg i32:$src0, i16))]
>;
-////def S_BITSET0_B32 : SOP1_BITSET0 <0x0000001b, "S_BITSET0_B32", []>;
-////def S_BITSET0_B64 : SOP1_BITSET0 <0x0000001c, "S_BITSET0_B64", []>;
-////def S_BITSET1_B32 : SOP1_BITSET1 <0x0000001d, "S_BITSET1_B32", []>;
-////def S_BITSET1_B64 : SOP1_BITSET1 <0x0000001e, "S_BITSET1_B64", []>;
-def S_GETPC_B64 : SOP1 <
- 0x0000001f, (outs SReg_64:$dst), (ins), "S_GETPC_B64 $dst", []
-> {
- let SSRC0 = 0;
-}
-def S_SETPC_B64 : SOP1_64 <0x00000020, "S_SETPC_B64", []>;
-def S_SWAPPC_B64 : SOP1_64 <0x00000021, "S_SWAPPC_B64", []>;
-def S_RFE_B64 : SOP1_64 <0x00000022, "S_RFE_B64", []>;
-
-let hasSideEffects = 1, Uses = [EXEC], Defs = [EXEC] in {
-
-def S_AND_SAVEEXEC_B64 : SOP1_64 <0x00000024, "S_AND_SAVEEXEC_B64", []>;
-def S_OR_SAVEEXEC_B64 : SOP1_64 <0x00000025, "S_OR_SAVEEXEC_B64", []>;
-def S_XOR_SAVEEXEC_B64 : SOP1_64 <0x00000026, "S_XOR_SAVEEXEC_B64", []>;
-def S_ANDN2_SAVEEXEC_B64 : SOP1_64 <0x00000027, "S_ANDN2_SAVEEXEC_B64", []>;
-def S_ORN2_SAVEEXEC_B64 : SOP1_64 <0x00000028, "S_ORN2_SAVEEXEC_B64", []>;
-def S_NAND_SAVEEXEC_B64 : SOP1_64 <0x00000029, "S_NAND_SAVEEXEC_B64", []>;
-def S_NOR_SAVEEXEC_B64 : SOP1_64 <0x0000002a, "S_NOR_SAVEEXEC_B64", []>;
-def S_XNOR_SAVEEXEC_B64 : SOP1_64 <0x0000002b, "S_XNOR_SAVEEXEC_B64", []>;
-
-} // End hasSideEffects = 1
-
-def S_QUADMASK_B32 : SOP1_32 <0x0000002c, "S_QUADMASK_B32", []>;
-def S_QUADMASK_B64 : SOP1_64 <0x0000002d, "S_QUADMASK_B64", []>;
-def S_MOVRELS_B32 : SOP1_32 <0x0000002e, "S_MOVRELS_B32", []>;
-def S_MOVRELS_B64 : SOP1_64 <0x0000002f, "S_MOVRELS_B64", []>;
-def S_MOVRELD_B32 : SOP1_32 <0x00000030, "S_MOVRELD_B32", []>;
-def S_MOVRELD_B64 : SOP1_64 <0x00000031, "S_MOVRELD_B64", []>;
-//def S_CBRANCH_JOIN : SOP1_ <0x00000032, "S_CBRANCH_JOIN", []>;
-def S_MOV_REGRD_B32 : SOP1_32 <0x00000033, "S_MOV_REGRD_B32", []>;
-def S_ABS_I32 : SOP1_32 <0x00000034, "S_ABS_I32", []>;
-def S_MOV_FED_B32 : SOP1_32 <0x00000035, "S_MOV_FED_B32", []>;
+////defm S_BITSET0_B32 : SOP1_BITSET0 <sop1<0x1b, 0x18>, "s_bitset0_b32", []>;
+////defm S_BITSET0_B64 : SOP1_BITSET0 <sop1<0x1c, 0x19>, "s_bitset0_b64", []>;
+////defm S_BITSET1_B32 : SOP1_BITSET1 <sop1<0x1d, 0x1a>, "s_bitset1_b32", []>;
+////defm S_BITSET1_B64 : SOP1_BITSET1 <sop1<0x1e, 0x1b>, "s_bitset1_b64", []>;
+defm S_GETPC_B64 : SOP1_64_0 <sop1<0x1f, 0x1c>, "s_getpc_b64", []>;
+defm S_SETPC_B64 : SOP1_64 <sop1<0x20, 0x1d>, "s_setpc_b64", []>;
+defm S_SWAPPC_B64 : SOP1_64 <sop1<0x21, 0x1e>, "s_swappc_b64", []>;
+defm S_RFE_B64 : SOP1_64 <sop1<0x22, 0x1f>, "s_rfe_b64", []>;
+
+let hasSideEffects = 1, Uses = [EXEC], Defs = [EXEC, SCC] in {
+
+defm S_AND_SAVEEXEC_B64 : SOP1_64 <sop1<0x24, 0x20>, "s_and_saveexec_b64", []>;
+defm S_OR_SAVEEXEC_B64 : SOP1_64 <sop1<0x25, 0x21>, "s_or_saveexec_b64", []>;
+defm S_XOR_SAVEEXEC_B64 : SOP1_64 <sop1<0x26, 0x22>, "s_xor_saveexec_b64", []>;
+defm S_ANDN2_SAVEEXEC_B64 : SOP1_64 <sop1<0x27, 0x23>, "s_andn2_saveexec_b64", []>;
+defm S_ORN2_SAVEEXEC_B64 : SOP1_64 <sop1<0x28, 0x24>, "s_orn2_saveexec_b64", []>;
+defm S_NAND_SAVEEXEC_B64 : SOP1_64 <sop1<0x29, 0x25>, "s_nand_saveexec_b64", []>;
+defm S_NOR_SAVEEXEC_B64 : SOP1_64 <sop1<0x2a, 0x26>, "s_nor_saveexec_b64", []>;
+defm S_XNOR_SAVEEXEC_B64 : SOP1_64 <sop1<0x2b, 0x27>, "s_xnor_saveexec_b64", []>;
+
+} // End hasSideEffects = 1, Uses = [EXEC], Defs = [EXEC, SCC]
+
+defm S_QUADMASK_B32 : SOP1_32 <sop1<0x2c, 0x28>, "s_quadmask_b32", []>;
+defm S_QUADMASK_B64 : SOP1_64 <sop1<0x2d, 0x29>, "s_quadmask_b64", []>;
+defm S_MOVRELS_B32 : SOP1_32 <sop1<0x2e, 0x2a>, "s_movrels_b32", []>;
+defm S_MOVRELS_B64 : SOP1_64 <sop1<0x2f, 0x2b>, "s_movrels_b64", []>;
+defm S_MOVRELD_B32 : SOP1_32 <sop1<0x30, 0x2c>, "s_movreld_b32", []>;
+defm S_MOVRELD_B64 : SOP1_64 <sop1<0x31, 0x2d>, "s_movreld_b64", []>;
+//defm S_CBRANCH_JOIN : SOP1_ <sop1<0x32, 0x2e>, "s_cbranch_join", []>;
+defm S_MOV_REGRD_B32 : SOP1_32 <sop1<0x33, 0x2f>, "s_mov_regrd_b32", []>;
+let Defs = [SCC] in {
+ defm S_ABS_I32 : SOP1_32 <sop1<0x34, 0x30>, "s_abs_i32", []>;
+} // End Defs = [SCC]
+defm S_MOV_FED_B32 : SOP1_32 <sop1<0x35, 0x31>, "s_mov_fed_b32", []>;
//===----------------------------------------------------------------------===//
// SOP2 Instructions
@@ -178,145 +203,161 @@ def S_MOV_FED_B32 : SOP1_32 <0x00000035, "S_MOV_FED_B32", []>;
let Defs = [SCC] in { // Carry out goes to SCC
let isCommutable = 1 in {
-def S_ADD_U32 : SOP2_32 <0x00000000, "S_ADD_U32", []>;
-def S_ADD_I32 : SOP2_32 <0x00000002, "S_ADD_I32",
+defm S_ADD_U32 : SOP2_32 <sop2<0x00>, "s_add_u32", []>;
+defm S_ADD_I32 : SOP2_32 <sop2<0x02>, "s_add_i32",
[(set i32:$dst, (add SSrc_32:$src0, SSrc_32:$src1))]
>;
} // End isCommutable = 1
-def S_SUB_U32 : SOP2_32 <0x00000001, "S_SUB_U32", []>;
-def S_SUB_I32 : SOP2_32 <0x00000003, "S_SUB_I32",
+defm S_SUB_U32 : SOP2_32 <sop2<0x01>, "s_sub_u32", []>;
+defm S_SUB_I32 : SOP2_32 <sop2<0x03>, "s_sub_i32",
[(set i32:$dst, (sub SSrc_32:$src0, SSrc_32:$src1))]
>;
let Uses = [SCC] in { // Carry in comes from SCC
let isCommutable = 1 in {
-def S_ADDC_U32 : SOP2_32 <0x00000004, "S_ADDC_U32",
+defm S_ADDC_U32 : SOP2_32 <sop2<0x04>, "s_addc_u32",
[(set i32:$dst, (adde (i32 SSrc_32:$src0), (i32 SSrc_32:$src1)))]>;
} // End isCommutable = 1
-def S_SUBB_U32 : SOP2_32 <0x00000005, "S_SUBB_U32",
+defm S_SUBB_U32 : SOP2_32 <sop2<0x05>, "s_subb_u32",
[(set i32:$dst, (sube (i32 SSrc_32:$src0), (i32 SSrc_32:$src1)))]>;
} // End Uses = [SCC]
-} // End Defs = [SCC]
-def S_MIN_I32 : SOP2_32 <0x00000006, "S_MIN_I32",
+defm S_MIN_I32 : SOP2_32 <sop2<0x06>, "s_min_i32",
[(set i32:$dst, (AMDGPUsmin i32:$src0, i32:$src1))]
>;
-def S_MIN_U32 : SOP2_32 <0x00000007, "S_MIN_U32",
+defm S_MIN_U32 : SOP2_32 <sop2<0x07>, "s_min_u32",
[(set i32:$dst, (AMDGPUumin i32:$src0, i32:$src1))]
>;
-def S_MAX_I32 : SOP2_32 <0x00000008, "S_MAX_I32",
+defm S_MAX_I32 : SOP2_32 <sop2<0x08>, "s_max_i32",
[(set i32:$dst, (AMDGPUsmax i32:$src0, i32:$src1))]
>;
-def S_MAX_U32 : SOP2_32 <0x00000009, "S_MAX_U32",
+defm S_MAX_U32 : SOP2_32 <sop2<0x09>, "s_max_u32",
[(set i32:$dst, (AMDGPUumax i32:$src0, i32:$src1))]
>;
+} // End Defs = [SCC]
-def S_CSELECT_B32 : SOP2 <
- 0x0000000a, (outs SReg_32:$dst),
- (ins SReg_32:$src0, SReg_32:$src1, SCCReg:$scc), "S_CSELECT_B32",
- []
->;
+defm S_CSELECT_B32 : SOP2_SELECT_32 <sop2<0x0a>, "s_cselect_b32", []>;
-def S_CSELECT_B64 : SOP2_64 <0x0000000b, "S_CSELECT_B64", []>;
+let Uses = [SCC] in {
+ defm S_CSELECT_B64 : SOP2_64 <sop2<0x0b>, "s_cselect_b64", []>;
+} // End Uses = [SCC]
-def S_AND_B32 : SOP2_32 <0x0000000e, "S_AND_B32",
+let Defs = [SCC] in {
+defm S_AND_B32 : SOP2_32 <sop2<0x0e, 0x0c>, "s_and_b32",
[(set i32:$dst, (and i32:$src0, i32:$src1))]
>;
-def S_AND_B64 : SOP2_64 <0x0000000f, "S_AND_B64",
+defm S_AND_B64 : SOP2_64 <sop2<0x0f, 0x0d>, "s_and_b64",
[(set i64:$dst, (and i64:$src0, i64:$src1))]
>;
-def S_OR_B32 : SOP2_32 <0x00000010, "S_OR_B32",
+defm S_OR_B32 : SOP2_32 <sop2<0x10, 0x0e>, "s_or_b32",
[(set i32:$dst, (or i32:$src0, i32:$src1))]
>;
-def S_OR_B64 : SOP2_64 <0x00000011, "S_OR_B64",
+defm S_OR_B64 : SOP2_64 <sop2<0x11, 0x0f>, "s_or_b64",
[(set i64:$dst, (or i64:$src0, i64:$src1))]
>;
-def S_XOR_B32 : SOP2_32 <0x00000012, "S_XOR_B32",
+defm S_XOR_B32 : SOP2_32 <sop2<0x12, 0x10>, "s_xor_b32",
[(set i32:$dst, (xor i32:$src0, i32:$src1))]
>;
-def S_XOR_B64 : SOP2_64 <0x00000013, "S_XOR_B64",
+defm S_XOR_B64 : SOP2_64 <sop2<0x13, 0x11>, "s_xor_b64",
[(set i64:$dst, (xor i64:$src0, i64:$src1))]
>;
-def S_ANDN2_B32 : SOP2_32 <0x00000014, "S_ANDN2_B32", []>;
-def S_ANDN2_B64 : SOP2_64 <0x00000015, "S_ANDN2_B64", []>;
-def S_ORN2_B32 : SOP2_32 <0x00000016, "S_ORN2_B32", []>;
-def S_ORN2_B64 : SOP2_64 <0x00000017, "S_ORN2_B64", []>;
-def S_NAND_B32 : SOP2_32 <0x00000018, "S_NAND_B32", []>;
-def S_NAND_B64 : SOP2_64 <0x00000019, "S_NAND_B64", []>;
-def S_NOR_B32 : SOP2_32 <0x0000001a, "S_NOR_B32", []>;
-def S_NOR_B64 : SOP2_64 <0x0000001b, "S_NOR_B64", []>;
-def S_XNOR_B32 : SOP2_32 <0x0000001c, "S_XNOR_B32", []>;
-def S_XNOR_B64 : SOP2_64 <0x0000001d, "S_XNOR_B64", []>;
+defm S_ANDN2_B32 : SOP2_32 <sop2<0x14, 0x12>, "s_andn2_b32", []>;
+defm S_ANDN2_B64 : SOP2_64 <sop2<0x15, 0x13>, "s_andn2_b64", []>;
+defm S_ORN2_B32 : SOP2_32 <sop2<0x16, 0x14>, "s_orn2_b32", []>;
+defm S_ORN2_B64 : SOP2_64 <sop2<0x17, 0x15>, "s_orn2_b64", []>;
+defm S_NAND_B32 : SOP2_32 <sop2<0x18, 0x16>, "s_nand_b32", []>;
+defm S_NAND_B64 : SOP2_64 <sop2<0x19, 0x17>, "s_nand_b64", []>;
+defm S_NOR_B32 : SOP2_32 <sop2<0x1a, 0x18>, "s_nor_b32", []>;
+defm S_NOR_B64 : SOP2_64 <sop2<0x1b, 0x19>, "s_nor_b64", []>;
+defm S_XNOR_B32 : SOP2_32 <sop2<0x1c, 0x1a>, "s_xnor_b32", []>;
+defm S_XNOR_B64 : SOP2_64 <sop2<0x1d, 0x1b>, "s_xnor_b64", []>;
+} // End Defs = [SCC]
// Use added complexity so these patterns are preferred to the VALU patterns.
let AddedComplexity = 1 in {
+let Defs = [SCC] in {
-def S_LSHL_B32 : SOP2_32 <0x0000001e, "S_LSHL_B32",
+defm S_LSHL_B32 : SOP2_32 <sop2<0x1e, 0x1c>, "s_lshl_b32",
[(set i32:$dst, (shl i32:$src0, i32:$src1))]
>;
-def S_LSHL_B64 : SOP2_SHIFT_64 <0x0000001f, "S_LSHL_B64",
+defm S_LSHL_B64 : SOP2_64_32 <sop2<0x1f, 0x1d>, "s_lshl_b64",
[(set i64:$dst, (shl i64:$src0, i32:$src1))]
>;
-def S_LSHR_B32 : SOP2_32 <0x00000020, "S_LSHR_B32",
+defm S_LSHR_B32 : SOP2_32 <sop2<0x20, 0x1e>, "s_lshr_b32",
[(set i32:$dst, (srl i32:$src0, i32:$src1))]
>;
-def S_LSHR_B64 : SOP2_SHIFT_64 <0x00000021, "S_LSHR_B64",
+defm S_LSHR_B64 : SOP2_64_32 <sop2<0x21, 0x1f>, "s_lshr_b64",
[(set i64:$dst, (srl i64:$src0, i32:$src1))]
>;
-def S_ASHR_I32 : SOP2_32 <0x00000022, "S_ASHR_I32",
+defm S_ASHR_I32 : SOP2_32 <sop2<0x22, 0x20>, "s_ashr_i32",
[(set i32:$dst, (sra i32:$src0, i32:$src1))]
>;
-def S_ASHR_I64 : SOP2_SHIFT_64 <0x00000023, "S_ASHR_I64",
+defm S_ASHR_I64 : SOP2_64_32 <sop2<0x23, 0x21>, "s_ashr_i64",
[(set i64:$dst, (sra i64:$src0, i32:$src1))]
>;
+} // End Defs = [SCC]
+
+defm S_BFM_B32 : SOP2_32 <sop2<0x24, 0x22>, "s_bfm_b32", []>;
+defm S_BFM_B64 : SOP2_64 <sop2<0x25, 0x23>, "s_bfm_b64", []>;
+defm S_MUL_I32 : SOP2_32 <sop2<0x26, 0x24>, "s_mul_i32",
+ [(set i32:$dst, (mul i32:$src0, i32:$src1))]
+>;
} // End AddedComplexity = 1
-def S_BFM_B32 : SOP2_32 <0x00000024, "S_BFM_B32", []>;
-def S_BFM_B64 : SOP2_64 <0x00000025, "S_BFM_B64", []>;
-def S_MUL_I32 : SOP2_32 <0x00000026, "S_MUL_I32", []>;
-def S_BFE_U32 : SOP2_32 <0x00000027, "S_BFE_U32", []>;
-def S_BFE_I32 : SOP2_32 <0x00000028, "S_BFE_I32", []>;
-def S_BFE_U64 : SOP2_64 <0x00000029, "S_BFE_U64", []>;
-def S_BFE_I64 : SOP2_64 <0x0000002a, "S_BFE_I64", []>;
-//def S_CBRANCH_G_FORK : SOP2_ <0x0000002b, "S_CBRANCH_G_FORK", []>;
-def S_ABSDIFF_I32 : SOP2_32 <0x0000002c, "S_ABSDIFF_I32", []>;
+let Defs = [SCC] in {
+defm S_BFE_U32 : SOP2_32 <sop2<0x27, 0x25>, "s_bfe_u32", []>;
+defm S_BFE_I32 : SOP2_32 <sop2<0x28, 0x26>, "s_bfe_i32", []>;
+defm S_BFE_U64 : SOP2_64 <sop2<0x29, 0x27>, "s_bfe_u64", []>;
+defm S_BFE_I64 : SOP2_64_32 <sop2<0x2a, 0x28>, "s_bfe_i64", []>;
+} // End Defs = [SCC]
+
+//defm S_CBRANCH_G_FORK : SOP2_ <sop2<0x2b, 0x29>, "s_cbranch_g_fork", []>;
+let Defs = [SCC] in {
+defm S_ABSDIFF_I32 : SOP2_32 <sop2<0x2c, 0x2a>, "s_absdiff_i32", []>;
+} // End Defs = [SCC]
//===----------------------------------------------------------------------===//
// SOPC Instructions
//===----------------------------------------------------------------------===//
-def S_CMP_EQ_I32 : SOPC_32 <0x00000000, "S_CMP_EQ_I32">;
-def S_CMP_LG_I32 : SOPC_32 <0x00000001, "S_CMP_LG_I32">;
-def S_CMP_GT_I32 : SOPC_32 <0x00000002, "S_CMP_GT_I32">;
-def S_CMP_GE_I32 : SOPC_32 <0x00000003, "S_CMP_GE_I32">;
-def S_CMP_LT_I32 : SOPC_32 <0x00000004, "S_CMP_LT_I32">;
-def S_CMP_LE_I32 : SOPC_32 <0x00000005, "S_CMP_LE_I32">;
-def S_CMP_EQ_U32 : SOPC_32 <0x00000006, "S_CMP_EQ_U32">;
-def S_CMP_LG_U32 : SOPC_32 <0x00000007, "S_CMP_LG_U32">;
-def S_CMP_GT_U32 : SOPC_32 <0x00000008, "S_CMP_GT_U32">;
-def S_CMP_GE_U32 : SOPC_32 <0x00000009, "S_CMP_GE_U32">;
-def S_CMP_LT_U32 : SOPC_32 <0x0000000a, "S_CMP_LT_U32">;
-def S_CMP_LE_U32 : SOPC_32 <0x0000000b, "S_CMP_LE_U32">;
-////def S_BITCMP0_B32 : SOPC_BITCMP0 <0x0000000c, "S_BITCMP0_B32", []>;
-////def S_BITCMP1_B32 : SOPC_BITCMP1 <0x0000000d, "S_BITCMP1_B32", []>;
-////def S_BITCMP0_B64 : SOPC_BITCMP0 <0x0000000e, "S_BITCMP0_B64", []>;
-////def S_BITCMP1_B64 : SOPC_BITCMP1 <0x0000000f, "S_BITCMP1_B64", []>;
-//def S_SETVSKIP : SOPC_ <0x00000010, "S_SETVSKIP", []>;
+def S_CMP_EQ_I32 : SOPC_32 <0x00000000, "s_cmp_eq_i32">;
+def S_CMP_LG_I32 : SOPC_32 <0x00000001, "s_cmp_lg_i32">;
+def S_CMP_GT_I32 : SOPC_32 <0x00000002, "s_cmp_gt_i32">;
+def S_CMP_GE_I32 : SOPC_32 <0x00000003, "s_cmp_ge_i32">;
+def S_CMP_LT_I32 : SOPC_32 <0x00000004, "s_cmp_lt_i32">;
+def S_CMP_LE_I32 : SOPC_32 <0x00000005, "s_cmp_le_i32">;
+def S_CMP_EQ_U32 : SOPC_32 <0x00000006, "s_cmp_eq_u32">;
+def S_CMP_LG_U32 : SOPC_32 <0x00000007, "s_cmp_lg_u32">;
+def S_CMP_GT_U32 : SOPC_32 <0x00000008, "s_cmp_gt_u32">;
+def S_CMP_GE_U32 : SOPC_32 <0x00000009, "s_cmp_ge_u32">;
+def S_CMP_LT_U32 : SOPC_32 <0x0000000a, "s_cmp_lt_u32">;
+def S_CMP_LE_U32 : SOPC_32 <0x0000000b, "s_cmp_le_u32">;
+////def S_BITCMP0_B32 : SOPC_BITCMP0 <0x0000000c, "s_bitcmp0_b32", []>;
+////def S_BITCMP1_B32 : SOPC_BITCMP1 <0x0000000d, "s_bitcmp1_b32", []>;
+////def S_BITCMP0_B64 : SOPC_BITCMP0 <0x0000000e, "s_bitcmp0_b64", []>;
+////def S_BITCMP1_B64 : SOPC_BITCMP1 <0x0000000f, "s_bitcmp1_b64", []>;
+//def S_SETVSKIP : SOPC_ <0x00000010, "s_setvskip", []>;
//===----------------------------------------------------------------------===//
// SOPK Instructions
//===----------------------------------------------------------------------===//
-def S_MOVK_I32 : SOPK_32 <0x00000000, "S_MOVK_I32", []>;
-def S_CMOVK_I32 : SOPK_32 <0x00000002, "S_CMOVK_I32", []>;
+let isReMaterializable = 1 in {
+defm S_MOVK_I32 : SOPK_32 <sopk<0x00>, "s_movk_i32", []>;
+} // End isReMaterializable = 1
+let Uses = [SCC] in {
+ defm S_CMOVK_I32 : SOPK_32 <sopk<0x02, 0x01>, "s_cmovk_i32", []>;
+}
+
+let isCompare = 1 in {
/*
This instruction is disabled for now until we can figure out how to teach
@@ -330,50 +371,46 @@ SCC = S_CMPK_EQ_I32 SGPR0, imm
VCC = COPY SCC
VGPR0 = V_CNDMASK VCC, VGPR0, VGPR1
-def S_CMPK_EQ_I32 : SOPK <
- 0x00000003, (outs SCCReg:$dst), (ins SReg_32:$src0, i32imm:$src1),
- "S_CMPK_EQ_I32",
+defm S_CMPK_EQ_I32 : SOPK_SCC <sopk<0x03, 0x02>, "s_cmpk_eq_i32",
[(set i1:$dst, (setcc i32:$src0, imm:$src1, SETEQ))]
>;
*/
-let isCompare = 1, Defs = [SCC] in {
-def S_CMPK_LG_I32 : SOPK_32 <0x00000004, "S_CMPK_LG_I32", []>;
-def S_CMPK_GT_I32 : SOPK_32 <0x00000005, "S_CMPK_GT_I32", []>;
-def S_CMPK_GE_I32 : SOPK_32 <0x00000006, "S_CMPK_GE_I32", []>;
-def S_CMPK_LT_I32 : SOPK_32 <0x00000007, "S_CMPK_LT_I32", []>;
-def S_CMPK_LE_I32 : SOPK_32 <0x00000008, "S_CMPK_LE_I32", []>;
-def S_CMPK_EQ_U32 : SOPK_32 <0x00000009, "S_CMPK_EQ_U32", []>;
-def S_CMPK_LG_U32 : SOPK_32 <0x0000000a, "S_CMPK_LG_U32", []>;
-def S_CMPK_GT_U32 : SOPK_32 <0x0000000b, "S_CMPK_GT_U32", []>;
-def S_CMPK_GE_U32 : SOPK_32 <0x0000000c, "S_CMPK_GE_U32", []>;
-def S_CMPK_LT_U32 : SOPK_32 <0x0000000d, "S_CMPK_LT_U32", []>;
-def S_CMPK_LE_U32 : SOPK_32 <0x0000000e, "S_CMPK_LE_U32", []>;
-} // End isCompare = 1, Defs = [SCC]
-
-let Defs = [SCC], isCommutable = 1 in {
- def S_ADDK_I32 : SOPK_32 <0x0000000f, "S_ADDK_I32", []>;
- def S_MULK_I32 : SOPK_32 <0x00000010, "S_MULK_I32", []>;
-}
+defm S_CMPK_LG_I32 : SOPK_SCC <sopk<0x04, 0x03>, "s_cmpk_lg_i32", []>;
+defm S_CMPK_GT_I32 : SOPK_SCC <sopk<0x05, 0x04>, "s_cmpk_gt_i32", []>;
+defm S_CMPK_GE_I32 : SOPK_SCC <sopk<0x06, 0x05>, "s_cmpk_ge_i32", []>;
+defm S_CMPK_LT_I32 : SOPK_SCC <sopk<0x07, 0x06>, "s_cmpk_lt_i32", []>;
+defm S_CMPK_LE_I32 : SOPK_SCC <sopk<0x08, 0x07>, "s_cmpk_le_i32", []>;
+defm S_CMPK_EQ_U32 : SOPK_SCC <sopk<0x09, 0x08>, "s_cmpk_eq_u32", []>;
+defm S_CMPK_LG_U32 : SOPK_SCC <sopk<0x0a, 0x09>, "s_cmpk_lg_u32", []>;
+defm S_CMPK_GT_U32 : SOPK_SCC <sopk<0x0b, 0x0a>, "s_cmpk_gt_u32", []>;
+defm S_CMPK_GE_U32 : SOPK_SCC <sopk<0x0c, 0x0b>, "s_cmpk_ge_u32", []>;
+defm S_CMPK_LT_U32 : SOPK_SCC <sopk<0x0d, 0x0c>, "s_cmpk_lt_u32", []>;
+defm S_CMPK_LE_U32 : SOPK_SCC <sopk<0x0e, 0x0d>, "s_cmpk_le_u32", []>;
+} // End isCompare = 1
-//def S_CBRANCH_I_FORK : SOPK_ <0x00000011, "S_CBRANCH_I_FORK", []>;
-def S_GETREG_B32 : SOPK_32 <0x00000012, "S_GETREG_B32", []>;
-def S_SETREG_B32 : SOPK_32 <0x00000013, "S_SETREG_B32", []>;
-def S_GETREG_REGRD_B32 : SOPK_32 <0x00000014, "S_GETREG_REGRD_B32", []>;
-//def S_SETREG_IMM32_B32 : SOPK_32 <0x00000015, "S_SETREG_IMM32_B32", []>;
-//def EXP : EXP_ <0x00000000, "EXP", []>;
+let isCommutable = 1 in {
+  let Defs = [SCC] in {
+ defm S_ADDK_I32 : SOPK_32 <sopk<0x0f, 0x0e>, "s_addk_i32", []>;
+ }
+ defm S_MULK_I32 : SOPK_32 <sopk<0x10, 0x0f>, "s_mulk_i32", []>;
+}
-} // End let OtherPredicates = [isCFDepth0]
+//defm S_CBRANCH_I_FORK : SOPK_ <sopk<0x11, 0x10>, "s_cbranch_i_fork", []>;
+defm S_GETREG_B32 : SOPK_32 <sopk<0x12, 0x11>, "s_getreg_b32", []>;
+defm S_SETREG_B32 : SOPK_32 <sopk<0x13, 0x12>, "s_setreg_b32", []>;
+defm S_GETREG_REGRD_B32 : SOPK_32 <sopk<0x14, 0x13>, "s_getreg_regrd_b32", []>;
+//defm S_SETREG_IMM32_B32 : SOPK_32 <sopk<0x15, 0x14>, "s_setreg_imm32_b32", []>;
//===----------------------------------------------------------------------===//
// SOPP Instructions
//===----------------------------------------------------------------------===//
-def S_NOP : SOPP <0x00000000, (ins i16imm:$simm16), "S_NOP $simm16", []>;
+def S_NOP : SOPP <0x00000000, (ins i16imm:$simm16), "s_nop $simm16">;
let isTerminator = 1 in {
-def S_ENDPGM : SOPP <0x00000001, (ins), "S_ENDPGM",
+def S_ENDPGM : SOPP <0x00000001, (ins), "s_endpgm",
[(IL_retflag)]> {
let simm16 = 0;
let isBarrier = 1;
@@ -382,7 +419,7 @@ def S_ENDPGM : SOPP <0x00000001, (ins), "S_ENDPGM",
let isBranch = 1 in {
def S_BRANCH : SOPP <
- 0x00000002, (ins sopp_brtarget:$simm16), "S_BRANCH $simm16",
+ 0x00000002, (ins sopp_brtarget:$simm16), "s_branch $simm16",
[(br bb:$simm16)]> {
let isBarrier = 1;
}
@@ -390,36 +427,31 @@ def S_BRANCH : SOPP <
let DisableEncoding = "$scc" in {
def S_CBRANCH_SCC0 : SOPP <
0x00000004, (ins sopp_brtarget:$simm16, SCCReg:$scc),
- "S_CBRANCH_SCC0 $simm16", []
+ "s_cbranch_scc0 $simm16"
>;
def S_CBRANCH_SCC1 : SOPP <
0x00000005, (ins sopp_brtarget:$simm16, SCCReg:$scc),
- "S_CBRANCH_SCC1 $simm16",
- []
+ "s_cbranch_scc1 $simm16"
>;
} // End DisableEncoding = "$scc"
def S_CBRANCH_VCCZ : SOPP <
0x00000006, (ins sopp_brtarget:$simm16, VCCReg:$vcc),
- "S_CBRANCH_VCCZ $simm16",
- []
+ "s_cbranch_vccz $simm16"
>;
def S_CBRANCH_VCCNZ : SOPP <
0x00000007, (ins sopp_brtarget:$simm16, VCCReg:$vcc),
- "S_CBRANCH_VCCNZ $simm16",
- []
+ "s_cbranch_vccnz $simm16"
>;
let DisableEncoding = "$exec" in {
def S_CBRANCH_EXECZ : SOPP <
0x00000008, (ins sopp_brtarget:$simm16, EXECReg:$exec),
- "S_CBRANCH_EXECZ $simm16",
- []
+ "s_cbranch_execz $simm16"
>;
def S_CBRANCH_EXECNZ : SOPP <
0x00000009, (ins sopp_brtarget:$simm16, EXECReg:$exec),
- "S_CBRANCH_EXECNZ $simm16",
- []
+ "s_cbranch_execnz $simm16"
>;
} // End DisableEncoding = "$exec"
@@ -428,7 +460,7 @@ def S_CBRANCH_EXECNZ : SOPP <
} // End isTerminator = 1
let hasSideEffects = 1 in {
-def S_BARRIER : SOPP <0x0000000a, (ins), "S_BARRIER",
+def S_BARRIER : SOPP <0x0000000a, (ins), "s_barrier",
[(int_AMDGPU_barrier_local)]
> {
let simm16 = 0;
@@ -438,27 +470,29 @@ def S_BARRIER : SOPP <0x0000000a, (ins), "S_BARRIER",
let mayStore = 1;
}
-def S_WAITCNT : SOPP <0x0000000c, (ins WAIT_FLAG:$simm16), "S_WAITCNT $simm16",
- []
->;
-//def S_SETHALT : SOPP_ <0x0000000d, "S_SETHALT", []>;
-//def S_SLEEP : SOPP_ <0x0000000e, "S_SLEEP", []>;
-//def S_SETPRIO : SOPP_ <0x0000000f, "S_SETPRIO", []>;
+def S_WAITCNT : SOPP <0x0000000c, (ins WAIT_FLAG:$simm16), "s_waitcnt $simm16">;
+def S_SETHALT : SOPP <0x0000000d, (ins i16imm:$simm16), "s_sethalt $simm16">;
+def S_SLEEP : SOPP <0x0000000e, (ins i16imm:$simm16), "s_sleep $simm16">;
+def S_SETPRIO : SOPP <0x0000000f, (ins i16imm:$simm16), "s_setprio $simm16">;
let Uses = [EXEC] in {
- def S_SENDMSG : SOPP <0x00000010, (ins SendMsgImm:$simm16, M0Reg:$m0), "S_SENDMSG $simm16",
+ def S_SENDMSG : SOPP <0x00000010, (ins SendMsgImm:$simm16, M0Reg:$m0), "s_sendmsg $simm16",
[(int_SI_sendmsg imm:$simm16, M0Reg:$m0)]
> {
let DisableEncoding = "$m0";
}
} // End Uses = [EXEC]
-//def S_SENDMSGHALT : SOPP_ <0x00000011, "S_SENDMSGHALT", []>;
-//def S_TRAP : SOPP_ <0x00000012, "S_TRAP", []>;
-//def S_ICACHE_INV : SOPP_ <0x00000013, "S_ICACHE_INV", []>;
-//def S_INCPERFLEVEL : SOPP_ <0x00000014, "S_INCPERFLEVEL", []>;
-//def S_DECPERFLEVEL : SOPP_ <0x00000015, "S_DECPERFLEVEL", []>;
-//def S_TTRACEDATA : SOPP_ <0x00000016, "S_TTRACEDATA", []>;
+def S_SENDMSGHALT : SOPP <0x00000011, (ins i16imm:$simm16), "s_sendmsghalt $simm16">;
+def S_TRAP : SOPP <0x00000012, (ins i16imm:$simm16), "s_trap $simm16">;
+def S_ICACHE_INV : SOPP <0x00000013, (ins), "s_icache_inv"> {
+ let simm16 = 0;
+}
+def S_INCPERFLEVEL : SOPP <0x00000014, (ins i16imm:$simm16), "s_incperflevel $simm16">;
+def S_DECPERFLEVEL : SOPP <0x00000015, (ins i16imm:$simm16), "s_decperflevel $simm16">;
+def S_TTRACEDATA : SOPP <0x00000016, (ins), "s_ttracedata"> {
+ let simm16 = 0;
+}
} // End hasSideEffects
//===----------------------------------------------------------------------===//
@@ -467,256 +501,260 @@ let Uses = [EXEC] in {
let isCompare = 1 in {
-defm V_CMP_F_F32 : VOPC_32 <0x00000000, "V_CMP_F_F32">;
-defm V_CMP_LT_F32 : VOPC_32 <0x00000001, "V_CMP_LT_F32", f32, COND_OLT>;
-defm V_CMP_EQ_F32 : VOPC_32 <0x00000002, "V_CMP_EQ_F32", f32, COND_OEQ>;
-defm V_CMP_LE_F32 : VOPC_32 <0x00000003, "V_CMP_LE_F32", f32, COND_OLE>;
-defm V_CMP_GT_F32 : VOPC_32 <0x00000004, "V_CMP_GT_F32", f32, COND_OGT>;
-defm V_CMP_LG_F32 : VOPC_32 <0x00000005, "V_CMP_LG_F32">;
-defm V_CMP_GE_F32 : VOPC_32 <0x00000006, "V_CMP_GE_F32", f32, COND_OGE>;
-defm V_CMP_O_F32 : VOPC_32 <0x00000007, "V_CMP_O_F32", f32, COND_O>;
-defm V_CMP_U_F32 : VOPC_32 <0x00000008, "V_CMP_U_F32", f32, COND_UO>;
-defm V_CMP_NGE_F32 : VOPC_32 <0x00000009, "V_CMP_NGE_F32">;
-defm V_CMP_NLG_F32 : VOPC_32 <0x0000000a, "V_CMP_NLG_F32">;
-defm V_CMP_NGT_F32 : VOPC_32 <0x0000000b, "V_CMP_NGT_F32">;
-defm V_CMP_NLE_F32 : VOPC_32 <0x0000000c, "V_CMP_NLE_F32">;
-defm V_CMP_NEQ_F32 : VOPC_32 <0x0000000d, "V_CMP_NEQ_F32", f32, COND_UNE>;
-defm V_CMP_NLT_F32 : VOPC_32 <0x0000000e, "V_CMP_NLT_F32">;
-defm V_CMP_TRU_F32 : VOPC_32 <0x0000000f, "V_CMP_TRU_F32">;
+defm V_CMP_F_F32 : VOPC_F32 <vopc<0x0, 0x40>, "v_cmp_f_f32">;
+defm V_CMP_LT_F32 : VOPC_F32 <vopc<0x1, 0x41>, "v_cmp_lt_f32", COND_OLT>;
+defm V_CMP_EQ_F32 : VOPC_F32 <vopc<0x2, 0x42>, "v_cmp_eq_f32", COND_OEQ>;
+defm V_CMP_LE_F32 : VOPC_F32 <vopc<0x3, 0x43>, "v_cmp_le_f32", COND_OLE>;
+defm V_CMP_GT_F32 : VOPC_F32 <vopc<0x4, 0x44>, "v_cmp_gt_f32", COND_OGT>;
+defm V_CMP_LG_F32 : VOPC_F32 <vopc<0x5, 0x45>, "v_cmp_lg_f32", COND_ONE>;
+defm V_CMP_GE_F32 : VOPC_F32 <vopc<0x6, 0x46>, "v_cmp_ge_f32", COND_OGE>;
+defm V_CMP_O_F32 : VOPC_F32 <vopc<0x7, 0x47>, "v_cmp_o_f32", COND_O>;
+defm V_CMP_U_F32 : VOPC_F32 <vopc<0x8, 0x48>, "v_cmp_u_f32", COND_UO>;
+defm V_CMP_NGE_F32 : VOPC_F32 <vopc<0x9, 0x49>, "v_cmp_nge_f32", COND_ULT>;
+defm V_CMP_NLG_F32 : VOPC_F32 <vopc<0xa, 0x4a>, "v_cmp_nlg_f32", COND_UEQ>;
+defm V_CMP_NGT_F32 : VOPC_F32 <vopc<0xb, 0x4b>, "v_cmp_ngt_f32", COND_ULE>;
+defm V_CMP_NLE_F32 : VOPC_F32 <vopc<0xc, 0x4c>, "v_cmp_nle_f32", COND_UGT>;
+defm V_CMP_NEQ_F32 : VOPC_F32 <vopc<0xd, 0x4d>, "v_cmp_neq_f32", COND_UNE>;
+defm V_CMP_NLT_F32 : VOPC_F32 <vopc<0xe, 0x4e>, "v_cmp_nlt_f32", COND_UGE>;
+defm V_CMP_TRU_F32 : VOPC_F32 <vopc<0xf, 0x4f>, "v_cmp_tru_f32">;
let hasSideEffects = 1 in {
-defm V_CMPX_F_F32 : VOPCX_32 <0x00000010, "V_CMPX_F_F32">;
-defm V_CMPX_LT_F32 : VOPCX_32 <0x00000011, "V_CMPX_LT_F32">;
-defm V_CMPX_EQ_F32 : VOPCX_32 <0x00000012, "V_CMPX_EQ_F32">;
-defm V_CMPX_LE_F32 : VOPCX_32 <0x00000013, "V_CMPX_LE_F32">;
-defm V_CMPX_GT_F32 : VOPCX_32 <0x00000014, "V_CMPX_GT_F32">;
-defm V_CMPX_LG_F32 : VOPCX_32 <0x00000015, "V_CMPX_LG_F32">;
-defm V_CMPX_GE_F32 : VOPCX_32 <0x00000016, "V_CMPX_GE_F32">;
-defm V_CMPX_O_F32 : VOPCX_32 <0x00000017, "V_CMPX_O_F32">;
-defm V_CMPX_U_F32 : VOPCX_32 <0x00000018, "V_CMPX_U_F32">;
-defm V_CMPX_NGE_F32 : VOPCX_32 <0x00000019, "V_CMPX_NGE_F32">;
-defm V_CMPX_NLG_F32 : VOPCX_32 <0x0000001a, "V_CMPX_NLG_F32">;
-defm V_CMPX_NGT_F32 : VOPCX_32 <0x0000001b, "V_CMPX_NGT_F32">;
-defm V_CMPX_NLE_F32 : VOPCX_32 <0x0000001c, "V_CMPX_NLE_F32">;
-defm V_CMPX_NEQ_F32 : VOPCX_32 <0x0000001d, "V_CMPX_NEQ_F32">;
-defm V_CMPX_NLT_F32 : VOPCX_32 <0x0000001e, "V_CMPX_NLT_F32">;
-defm V_CMPX_TRU_F32 : VOPCX_32 <0x0000001f, "V_CMPX_TRU_F32">;
+defm V_CMPX_F_F32 : VOPCX_F32 <vopc<0x10, 0x50>, "v_cmpx_f_f32">;
+defm V_CMPX_LT_F32 : VOPCX_F32 <vopc<0x11, 0x51>, "v_cmpx_lt_f32">;
+defm V_CMPX_EQ_F32 : VOPCX_F32 <vopc<0x12, 0x52>, "v_cmpx_eq_f32">;
+defm V_CMPX_LE_F32 : VOPCX_F32 <vopc<0x13, 0x53>, "v_cmpx_le_f32">;
+defm V_CMPX_GT_F32 : VOPCX_F32 <vopc<0x14, 0x54>, "v_cmpx_gt_f32">;
+defm V_CMPX_LG_F32 : VOPCX_F32 <vopc<0x15, 0x55>, "v_cmpx_lg_f32">;
+defm V_CMPX_GE_F32 : VOPCX_F32 <vopc<0x16, 0x56>, "v_cmpx_ge_f32">;
+defm V_CMPX_O_F32 : VOPCX_F32 <vopc<0x17, 0x57>, "v_cmpx_o_f32">;
+defm V_CMPX_U_F32 : VOPCX_F32 <vopc<0x18, 0x58>, "v_cmpx_u_f32">;
+defm V_CMPX_NGE_F32 : VOPCX_F32 <vopc<0x19, 0x59>, "v_cmpx_nge_f32">;
+defm V_CMPX_NLG_F32 : VOPCX_F32 <vopc<0x1a, 0x5a>, "v_cmpx_nlg_f32">;
+defm V_CMPX_NGT_F32 : VOPCX_F32 <vopc<0x1b, 0x5b>, "v_cmpx_ngt_f32">;
+defm V_CMPX_NLE_F32 : VOPCX_F32 <vopc<0x1c, 0x5c>, "v_cmpx_nle_f32">;
+defm V_CMPX_NEQ_F32 : VOPCX_F32 <vopc<0x1d, 0x5d>, "v_cmpx_neq_f32">;
+defm V_CMPX_NLT_F32 : VOPCX_F32 <vopc<0x1e, 0x5e>, "v_cmpx_nlt_f32">;
+defm V_CMPX_TRU_F32 : VOPCX_F32 <vopc<0x1f, 0x5f>, "v_cmpx_tru_f32">;
} // End hasSideEffects = 1
-defm V_CMP_F_F64 : VOPC_64 <0x00000020, "V_CMP_F_F64">;
-defm V_CMP_LT_F64 : VOPC_64 <0x00000021, "V_CMP_LT_F64", f64, COND_OLT>;
-defm V_CMP_EQ_F64 : VOPC_64 <0x00000022, "V_CMP_EQ_F64", f64, COND_OEQ>;
-defm V_CMP_LE_F64 : VOPC_64 <0x00000023, "V_CMP_LE_F64", f64, COND_OLE>;
-defm V_CMP_GT_F64 : VOPC_64 <0x00000024, "V_CMP_GT_F64", f64, COND_OGT>;
-defm V_CMP_LG_F64 : VOPC_64 <0x00000025, "V_CMP_LG_F64">;
-defm V_CMP_GE_F64 : VOPC_64 <0x00000026, "V_CMP_GE_F64", f64, COND_OGE>;
-defm V_CMP_O_F64 : VOPC_64 <0x00000027, "V_CMP_O_F64", f64, COND_O>;
-defm V_CMP_U_F64 : VOPC_64 <0x00000028, "V_CMP_U_F64", f64, COND_UO>;
-defm V_CMP_NGE_F64 : VOPC_64 <0x00000029, "V_CMP_NGE_F64">;
-defm V_CMP_NLG_F64 : VOPC_64 <0x0000002a, "V_CMP_NLG_F64">;
-defm V_CMP_NGT_F64 : VOPC_64 <0x0000002b, "V_CMP_NGT_F64">;
-defm V_CMP_NLE_F64 : VOPC_64 <0x0000002c, "V_CMP_NLE_F64">;
-defm V_CMP_NEQ_F64 : VOPC_64 <0x0000002d, "V_CMP_NEQ_F64", f64, COND_UNE>;
-defm V_CMP_NLT_F64 : VOPC_64 <0x0000002e, "V_CMP_NLT_F64">;
-defm V_CMP_TRU_F64 : VOPC_64 <0x0000002f, "V_CMP_TRU_F64">;
+defm V_CMP_F_F64 : VOPC_F64 <vopc<0x20, 0x60>, "v_cmp_f_f64">;
+defm V_CMP_LT_F64 : VOPC_F64 <vopc<0x21, 0x61>, "v_cmp_lt_f64", COND_OLT>;
+defm V_CMP_EQ_F64 : VOPC_F64 <vopc<0x22, 0x62>, "v_cmp_eq_f64", COND_OEQ>;
+defm V_CMP_LE_F64 : VOPC_F64 <vopc<0x23, 0x63>, "v_cmp_le_f64", COND_OLE>;
+defm V_CMP_GT_F64 : VOPC_F64 <vopc<0x24, 0x64>, "v_cmp_gt_f64", COND_OGT>;
+defm V_CMP_LG_F64 : VOPC_F64 <vopc<0x25, 0x65>, "v_cmp_lg_f64", COND_ONE>;
+defm V_CMP_GE_F64 : VOPC_F64 <vopc<0x26, 0x66>, "v_cmp_ge_f64", COND_OGE>;
+defm V_CMP_O_F64 : VOPC_F64 <vopc<0x27, 0x67>, "v_cmp_o_f64", COND_O>;
+defm V_CMP_U_F64 : VOPC_F64 <vopc<0x28, 0x68>, "v_cmp_u_f64", COND_UO>;
+defm V_CMP_NGE_F64 : VOPC_F64 <vopc<0x29, 0x69>, "v_cmp_nge_f64", COND_ULT>;
+defm V_CMP_NLG_F64 : VOPC_F64 <vopc<0x2a, 0x6a>, "v_cmp_nlg_f64", COND_UEQ>;
+defm V_CMP_NGT_F64 : VOPC_F64 <vopc<0x2b, 0x6b>, "v_cmp_ngt_f64", COND_ULE>;
+defm V_CMP_NLE_F64 : VOPC_F64 <vopc<0x2c, 0x6c>, "v_cmp_nle_f64", COND_UGT>;
+defm V_CMP_NEQ_F64 : VOPC_F64 <vopc<0x2d, 0x6d>, "v_cmp_neq_f64", COND_UNE>;
+defm V_CMP_NLT_F64 : VOPC_F64 <vopc<0x2e, 0x6e>, "v_cmp_nlt_f64", COND_UGE>;
+defm V_CMP_TRU_F64 : VOPC_F64 <vopc<0x2f, 0x6f>, "v_cmp_tru_f64">;
let hasSideEffects = 1 in {
-defm V_CMPX_F_F64 : VOPCX_64 <0x00000030, "V_CMPX_F_F64">;
-defm V_CMPX_LT_F64 : VOPCX_64 <0x00000031, "V_CMPX_LT_F64">;
-defm V_CMPX_EQ_F64 : VOPCX_64 <0x00000032, "V_CMPX_EQ_F64">;
-defm V_CMPX_LE_F64 : VOPCX_64 <0x00000033, "V_CMPX_LE_F64">;
-defm V_CMPX_GT_F64 : VOPCX_64 <0x00000034, "V_CMPX_GT_F64">;
-defm V_CMPX_LG_F64 : VOPCX_64 <0x00000035, "V_CMPX_LG_F64">;
-defm V_CMPX_GE_F64 : VOPCX_64 <0x00000036, "V_CMPX_GE_F64">;
-defm V_CMPX_O_F64 : VOPCX_64 <0x00000037, "V_CMPX_O_F64">;
-defm V_CMPX_U_F64 : VOPCX_64 <0x00000038, "V_CMPX_U_F64">;
-defm V_CMPX_NGE_F64 : VOPCX_64 <0x00000039, "V_CMPX_NGE_F64">;
-defm V_CMPX_NLG_F64 : VOPCX_64 <0x0000003a, "V_CMPX_NLG_F64">;
-defm V_CMPX_NGT_F64 : VOPCX_64 <0x0000003b, "V_CMPX_NGT_F64">;
-defm V_CMPX_NLE_F64 : VOPCX_64 <0x0000003c, "V_CMPX_NLE_F64">;
-defm V_CMPX_NEQ_F64 : VOPCX_64 <0x0000003d, "V_CMPX_NEQ_F64">;
-defm V_CMPX_NLT_F64 : VOPCX_64 <0x0000003e, "V_CMPX_NLT_F64">;
-defm V_CMPX_TRU_F64 : VOPCX_64 <0x0000003f, "V_CMPX_TRU_F64">;
+defm V_CMPX_F_F64 : VOPCX_F64 <vopc<0x30, 0x70>, "v_cmpx_f_f64">;
+defm V_CMPX_LT_F64 : VOPCX_F64 <vopc<0x31, 0x71>, "v_cmpx_lt_f64">;
+defm V_CMPX_EQ_F64 : VOPCX_F64 <vopc<0x32, 0x72>, "v_cmpx_eq_f64">;
+defm V_CMPX_LE_F64 : VOPCX_F64 <vopc<0x33, 0x73>, "v_cmpx_le_f64">;
+defm V_CMPX_GT_F64 : VOPCX_F64 <vopc<0x34, 0x74>, "v_cmpx_gt_f64">;
+defm V_CMPX_LG_F64 : VOPCX_F64 <vopc<0x35, 0x75>, "v_cmpx_lg_f64">;
+defm V_CMPX_GE_F64 : VOPCX_F64 <vopc<0x36, 0x76>, "v_cmpx_ge_f64">;
+defm V_CMPX_O_F64 : VOPCX_F64 <vopc<0x37, 0x77>, "v_cmpx_o_f64">;
+defm V_CMPX_U_F64 : VOPCX_F64 <vopc<0x38, 0x78>, "v_cmpx_u_f64">;
+defm V_CMPX_NGE_F64 : VOPCX_F64 <vopc<0x39, 0x79>, "v_cmpx_nge_f64">;
+defm V_CMPX_NLG_F64 : VOPCX_F64 <vopc<0x3a, 0x7a>, "v_cmpx_nlg_f64">;
+defm V_CMPX_NGT_F64 : VOPCX_F64 <vopc<0x3b, 0x7b>, "v_cmpx_ngt_f64">;
+defm V_CMPX_NLE_F64 : VOPCX_F64 <vopc<0x3c, 0x7c>, "v_cmpx_nle_f64">;
+defm V_CMPX_NEQ_F64 : VOPCX_F64 <vopc<0x3d, 0x7d>, "v_cmpx_neq_f64">;
+defm V_CMPX_NLT_F64 : VOPCX_F64 <vopc<0x3e, 0x7e>, "v_cmpx_nlt_f64">;
+defm V_CMPX_TRU_F64 : VOPCX_F64 <vopc<0x3f, 0x7f>, "v_cmpx_tru_f64">;
} // End hasSideEffects = 1
-defm V_CMPS_F_F32 : VOPC_32 <0x00000040, "V_CMPS_F_F32">;
-defm V_CMPS_LT_F32 : VOPC_32 <0x00000041, "V_CMPS_LT_F32">;
-defm V_CMPS_EQ_F32 : VOPC_32 <0x00000042, "V_CMPS_EQ_F32">;
-defm V_CMPS_LE_F32 : VOPC_32 <0x00000043, "V_CMPS_LE_F32">;
-defm V_CMPS_GT_F32 : VOPC_32 <0x00000044, "V_CMPS_GT_F32">;
-defm V_CMPS_LG_F32 : VOPC_32 <0x00000045, "V_CMPS_LG_F32">;
-defm V_CMPS_GE_F32 : VOPC_32 <0x00000046, "V_CMPS_GE_F32">;
-defm V_CMPS_O_F32 : VOPC_32 <0x00000047, "V_CMPS_O_F32">;
-defm V_CMPS_U_F32 : VOPC_32 <0x00000048, "V_CMPS_U_F32">;
-defm V_CMPS_NGE_F32 : VOPC_32 <0x00000049, "V_CMPS_NGE_F32">;
-defm V_CMPS_NLG_F32 : VOPC_32 <0x0000004a, "V_CMPS_NLG_F32">;
-defm V_CMPS_NGT_F32 : VOPC_32 <0x0000004b, "V_CMPS_NGT_F32">;
-defm V_CMPS_NLE_F32 : VOPC_32 <0x0000004c, "V_CMPS_NLE_F32">;
-defm V_CMPS_NEQ_F32 : VOPC_32 <0x0000004d, "V_CMPS_NEQ_F32">;
-defm V_CMPS_NLT_F32 : VOPC_32 <0x0000004e, "V_CMPS_NLT_F32">;
-defm V_CMPS_TRU_F32 : VOPC_32 <0x0000004f, "V_CMPS_TRU_F32">;
+let SubtargetPredicate = isSICI in {
+
+defm V_CMPS_F_F32 : VOPC_F32 <vopc<0x40>, "v_cmps_f_f32">;
+defm V_CMPS_LT_F32 : VOPC_F32 <vopc<0x41>, "v_cmps_lt_f32">;
+defm V_CMPS_EQ_F32 : VOPC_F32 <vopc<0x42>, "v_cmps_eq_f32">;
+defm V_CMPS_LE_F32 : VOPC_F32 <vopc<0x43>, "v_cmps_le_f32">;
+defm V_CMPS_GT_F32 : VOPC_F32 <vopc<0x44>, "v_cmps_gt_f32">;
+defm V_CMPS_LG_F32 : VOPC_F32 <vopc<0x45>, "v_cmps_lg_f32">;
+defm V_CMPS_GE_F32 : VOPC_F32 <vopc<0x46>, "v_cmps_ge_f32">;
+defm V_CMPS_O_F32 : VOPC_F32 <vopc<0x47>, "v_cmps_o_f32">;
+defm V_CMPS_U_F32 : VOPC_F32 <vopc<0x48>, "v_cmps_u_f32">;
+defm V_CMPS_NGE_F32 : VOPC_F32 <vopc<0x49>, "v_cmps_nge_f32">;
+defm V_CMPS_NLG_F32 : VOPC_F32 <vopc<0x4a>, "v_cmps_nlg_f32">;
+defm V_CMPS_NGT_F32 : VOPC_F32 <vopc<0x4b>, "v_cmps_ngt_f32">;
+defm V_CMPS_NLE_F32 : VOPC_F32 <vopc<0x4c>, "v_cmps_nle_f32">;
+defm V_CMPS_NEQ_F32 : VOPC_F32 <vopc<0x4d>, "v_cmps_neq_f32">;
+defm V_CMPS_NLT_F32 : VOPC_F32 <vopc<0x4e>, "v_cmps_nlt_f32">;
+defm V_CMPS_TRU_F32 : VOPC_F32 <vopc<0x4f>, "v_cmps_tru_f32">;
let hasSideEffects = 1 in {
-defm V_CMPSX_F_F32 : VOPCX_32 <0x00000050, "V_CMPSX_F_F32">;
-defm V_CMPSX_LT_F32 : VOPCX_32 <0x00000051, "V_CMPSX_LT_F32">;
-defm V_CMPSX_EQ_F32 : VOPCX_32 <0x00000052, "V_CMPSX_EQ_F32">;
-defm V_CMPSX_LE_F32 : VOPCX_32 <0x00000053, "V_CMPSX_LE_F32">;
-defm V_CMPSX_GT_F32 : VOPCX_32 <0x00000054, "V_CMPSX_GT_F32">;
-defm V_CMPSX_LG_F32 : VOPCX_32 <0x00000055, "V_CMPSX_LG_F32">;
-defm V_CMPSX_GE_F32 : VOPCX_32 <0x00000056, "V_CMPSX_GE_F32">;
-defm V_CMPSX_O_F32 : VOPCX_32 <0x00000057, "V_CMPSX_O_F32">;
-defm V_CMPSX_U_F32 : VOPCX_32 <0x00000058, "V_CMPSX_U_F32">;
-defm V_CMPSX_NGE_F32 : VOPCX_32 <0x00000059, "V_CMPSX_NGE_F32">;
-defm V_CMPSX_NLG_F32 : VOPCX_32 <0x0000005a, "V_CMPSX_NLG_F32">;
-defm V_CMPSX_NGT_F32 : VOPCX_32 <0x0000005b, "V_CMPSX_NGT_F32">;
-defm V_CMPSX_NLE_F32 : VOPCX_32 <0x0000005c, "V_CMPSX_NLE_F32">;
-defm V_CMPSX_NEQ_F32 : VOPCX_32 <0x0000005d, "V_CMPSX_NEQ_F32">;
-defm V_CMPSX_NLT_F32 : VOPCX_32 <0x0000005e, "V_CMPSX_NLT_F32">;
-defm V_CMPSX_TRU_F32 : VOPCX_32 <0x0000005f, "V_CMPSX_TRU_F32">;
+defm V_CMPSX_F_F32 : VOPCX_F32 <vopc<0x50>, "v_cmpsx_f_f32">;
+defm V_CMPSX_LT_F32 : VOPCX_F32 <vopc<0x51>, "v_cmpsx_lt_f32">;
+defm V_CMPSX_EQ_F32 : VOPCX_F32 <vopc<0x52>, "v_cmpsx_eq_f32">;
+defm V_CMPSX_LE_F32 : VOPCX_F32 <vopc<0x53>, "v_cmpsx_le_f32">;
+defm V_CMPSX_GT_F32 : VOPCX_F32 <vopc<0x54>, "v_cmpsx_gt_f32">;
+defm V_CMPSX_LG_F32 : VOPCX_F32 <vopc<0x55>, "v_cmpsx_lg_f32">;
+defm V_CMPSX_GE_F32 : VOPCX_F32 <vopc<0x56>, "v_cmpsx_ge_f32">;
+defm V_CMPSX_O_F32 : VOPCX_F32 <vopc<0x57>, "v_cmpsx_o_f32">;
+defm V_CMPSX_U_F32 : VOPCX_F32 <vopc<0x58>, "v_cmpsx_u_f32">;
+defm V_CMPSX_NGE_F32 : VOPCX_F32 <vopc<0x59>, "v_cmpsx_nge_f32">;
+defm V_CMPSX_NLG_F32 : VOPCX_F32 <vopc<0x5a>, "v_cmpsx_nlg_f32">;
+defm V_CMPSX_NGT_F32 : VOPCX_F32 <vopc<0x5b>, "v_cmpsx_ngt_f32">;
+defm V_CMPSX_NLE_F32 : VOPCX_F32 <vopc<0x5c>, "v_cmpsx_nle_f32">;
+defm V_CMPSX_NEQ_F32 : VOPCX_F32 <vopc<0x5d>, "v_cmpsx_neq_f32">;
+defm V_CMPSX_NLT_F32 : VOPCX_F32 <vopc<0x5e>, "v_cmpsx_nlt_f32">;
+defm V_CMPSX_TRU_F32 : VOPCX_F32 <vopc<0x5f>, "v_cmpsx_tru_f32">;
} // End hasSideEffects = 1
-defm V_CMPS_F_F64 : VOPC_64 <0x00000060, "V_CMPS_F_F64">;
-defm V_CMPS_LT_F64 : VOPC_64 <0x00000061, "V_CMPS_LT_F64">;
-defm V_CMPS_EQ_F64 : VOPC_64 <0x00000062, "V_CMPS_EQ_F64">;
-defm V_CMPS_LE_F64 : VOPC_64 <0x00000063, "V_CMPS_LE_F64">;
-defm V_CMPS_GT_F64 : VOPC_64 <0x00000064, "V_CMPS_GT_F64">;
-defm V_CMPS_LG_F64 : VOPC_64 <0x00000065, "V_CMPS_LG_F64">;
-defm V_CMPS_GE_F64 : VOPC_64 <0x00000066, "V_CMPS_GE_F64">;
-defm V_CMPS_O_F64 : VOPC_64 <0x00000067, "V_CMPS_O_F64">;
-defm V_CMPS_U_F64 : VOPC_64 <0x00000068, "V_CMPS_U_F64">;
-defm V_CMPS_NGE_F64 : VOPC_64 <0x00000069, "V_CMPS_NGE_F64">;
-defm V_CMPS_NLG_F64 : VOPC_64 <0x0000006a, "V_CMPS_NLG_F64">;
-defm V_CMPS_NGT_F64 : VOPC_64 <0x0000006b, "V_CMPS_NGT_F64">;
-defm V_CMPS_NLE_F64 : VOPC_64 <0x0000006c, "V_CMPS_NLE_F64">;
-defm V_CMPS_NEQ_F64 : VOPC_64 <0x0000006d, "V_CMPS_NEQ_F64">;
-defm V_CMPS_NLT_F64 : VOPC_64 <0x0000006e, "V_CMPS_NLT_F64">;
-defm V_CMPS_TRU_F64 : VOPC_64 <0x0000006f, "V_CMPS_TRU_F64">;
+defm V_CMPS_F_F64 : VOPC_F64 <vopc<0x60>, "v_cmps_f_f64">;
+defm V_CMPS_LT_F64 : VOPC_F64 <vopc<0x61>, "v_cmps_lt_f64">;
+defm V_CMPS_EQ_F64 : VOPC_F64 <vopc<0x62>, "v_cmps_eq_f64">;
+defm V_CMPS_LE_F64 : VOPC_F64 <vopc<0x63>, "v_cmps_le_f64">;
+defm V_CMPS_GT_F64 : VOPC_F64 <vopc<0x64>, "v_cmps_gt_f64">;
+defm V_CMPS_LG_F64 : VOPC_F64 <vopc<0x65>, "v_cmps_lg_f64">;
+defm V_CMPS_GE_F64 : VOPC_F64 <vopc<0x66>, "v_cmps_ge_f64">;
+defm V_CMPS_O_F64 : VOPC_F64 <vopc<0x67>, "v_cmps_o_f64">;
+defm V_CMPS_U_F64 : VOPC_F64 <vopc<0x68>, "v_cmps_u_f64">;
+defm V_CMPS_NGE_F64 : VOPC_F64 <vopc<0x69>, "v_cmps_nge_f64">;
+defm V_CMPS_NLG_F64 : VOPC_F64 <vopc<0x6a>, "v_cmps_nlg_f64">;
+defm V_CMPS_NGT_F64 : VOPC_F64 <vopc<0x6b>, "v_cmps_ngt_f64">;
+defm V_CMPS_NLE_F64 : VOPC_F64 <vopc<0x6c>, "v_cmps_nle_f64">;
+defm V_CMPS_NEQ_F64 : VOPC_F64 <vopc<0x6d>, "v_cmps_neq_f64">;
+defm V_CMPS_NLT_F64 : VOPC_F64 <vopc<0x6e>, "v_cmps_nlt_f64">;
+defm V_CMPS_TRU_F64 : VOPC_F64 <vopc<0x6f>, "v_cmps_tru_f64">;
let hasSideEffects = 1, Defs = [EXEC] in {
-defm V_CMPSX_F_F64 : VOPC_64 <0x00000070, "V_CMPSX_F_F64">;
-defm V_CMPSX_LT_F64 : VOPC_64 <0x00000071, "V_CMPSX_LT_F64">;
-defm V_CMPSX_EQ_F64 : VOPC_64 <0x00000072, "V_CMPSX_EQ_F64">;
-defm V_CMPSX_LE_F64 : VOPC_64 <0x00000073, "V_CMPSX_LE_F64">;
-defm V_CMPSX_GT_F64 : VOPC_64 <0x00000074, "V_CMPSX_GT_F64">;
-defm V_CMPSX_LG_F64 : VOPC_64 <0x00000075, "V_CMPSX_LG_F64">;
-defm V_CMPSX_GE_F64 : VOPC_64 <0x00000076, "V_CMPSX_GE_F64">;
-defm V_CMPSX_O_F64 : VOPC_64 <0x00000077, "V_CMPSX_O_F64">;
-defm V_CMPSX_U_F64 : VOPC_64 <0x00000078, "V_CMPSX_U_F64">;
-defm V_CMPSX_NGE_F64 : VOPC_64 <0x00000079, "V_CMPSX_NGE_F64">;
-defm V_CMPSX_NLG_F64 : VOPC_64 <0x0000007a, "V_CMPSX_NLG_F64">;
-defm V_CMPSX_NGT_F64 : VOPC_64 <0x0000007b, "V_CMPSX_NGT_F64">;
-defm V_CMPSX_NLE_F64 : VOPC_64 <0x0000007c, "V_CMPSX_NLE_F64">;
-defm V_CMPSX_NEQ_F64 : VOPC_64 <0x0000007d, "V_CMPSX_NEQ_F64">;
-defm V_CMPSX_NLT_F64 : VOPC_64 <0x0000007e, "V_CMPSX_NLT_F64">;
-defm V_CMPSX_TRU_F64 : VOPC_64 <0x0000007f, "V_CMPSX_TRU_F64">;
+defm V_CMPSX_F_F64 : VOPC_F64 <vopc<0x70>, "v_cmpsx_f_f64">;
+defm V_CMPSX_LT_F64 : VOPC_F64 <vopc<0x71>, "v_cmpsx_lt_f64">;
+defm V_CMPSX_EQ_F64 : VOPC_F64 <vopc<0x72>, "v_cmpsx_eq_f64">;
+defm V_CMPSX_LE_F64 : VOPC_F64 <vopc<0x73>, "v_cmpsx_le_f64">;
+defm V_CMPSX_GT_F64 : VOPC_F64 <vopc<0x74>, "v_cmpsx_gt_f64">;
+defm V_CMPSX_LG_F64 : VOPC_F64 <vopc<0x75>, "v_cmpsx_lg_f64">;
+defm V_CMPSX_GE_F64 : VOPC_F64 <vopc<0x76>, "v_cmpsx_ge_f64">;
+defm V_CMPSX_O_F64 : VOPC_F64 <vopc<0x77>, "v_cmpsx_o_f64">;
+defm V_CMPSX_U_F64 : VOPC_F64 <vopc<0x78>, "v_cmpsx_u_f64">;
+defm V_CMPSX_NGE_F64 : VOPC_F64 <vopc<0x79>, "v_cmpsx_nge_f64">;
+defm V_CMPSX_NLG_F64 : VOPC_F64 <vopc<0x7a>, "v_cmpsx_nlg_f64">;
+defm V_CMPSX_NGT_F64 : VOPC_F64 <vopc<0x7b>, "v_cmpsx_ngt_f64">;
+defm V_CMPSX_NLE_F64 : VOPC_F64 <vopc<0x7c>, "v_cmpsx_nle_f64">;
+defm V_CMPSX_NEQ_F64 : VOPC_F64 <vopc<0x7d>, "v_cmpsx_neq_f64">;
+defm V_CMPSX_NLT_F64 : VOPC_F64 <vopc<0x7e>, "v_cmpsx_nlt_f64">;
+defm V_CMPSX_TRU_F64 : VOPC_F64 <vopc<0x7f>, "v_cmpsx_tru_f64">;
} // End hasSideEffects = 1, Defs = [EXEC]
-defm V_CMP_F_I32 : VOPC_32 <0x00000080, "V_CMP_F_I32">;
-defm V_CMP_LT_I32 : VOPC_32 <0x00000081, "V_CMP_LT_I32", i32, COND_SLT>;
-defm V_CMP_EQ_I32 : VOPC_32 <0x00000082, "V_CMP_EQ_I32", i32, COND_EQ>;
-defm V_CMP_LE_I32 : VOPC_32 <0x00000083, "V_CMP_LE_I32", i32, COND_SLE>;
-defm V_CMP_GT_I32 : VOPC_32 <0x00000084, "V_CMP_GT_I32", i32, COND_SGT>;
-defm V_CMP_NE_I32 : VOPC_32 <0x00000085, "V_CMP_NE_I32", i32, COND_NE>;
-defm V_CMP_GE_I32 : VOPC_32 <0x00000086, "V_CMP_GE_I32", i32, COND_SGE>;
-defm V_CMP_T_I32 : VOPC_32 <0x00000087, "V_CMP_T_I32">;
+} // End SubtargetPredicate = isSICI
+
+defm V_CMP_F_I32 : VOPC_I32 <vopc<0x80, 0xc0>, "v_cmp_f_i32">;
+defm V_CMP_LT_I32 : VOPC_I32 <vopc<0x81, 0xc1>, "v_cmp_lt_i32", COND_SLT>;
+defm V_CMP_EQ_I32 : VOPC_I32 <vopc<0x82, 0xc2>, "v_cmp_eq_i32", COND_EQ>;
+defm V_CMP_LE_I32 : VOPC_I32 <vopc<0x83, 0xc3>, "v_cmp_le_i32", COND_SLE>;
+defm V_CMP_GT_I32 : VOPC_I32 <vopc<0x84, 0xc4>, "v_cmp_gt_i32", COND_SGT>;
+defm V_CMP_NE_I32 : VOPC_I32 <vopc<0x85, 0xc5>, "v_cmp_ne_i32", COND_NE>;
+defm V_CMP_GE_I32 : VOPC_I32 <vopc<0x86, 0xc6>, "v_cmp_ge_i32", COND_SGE>;
+defm V_CMP_T_I32 : VOPC_I32 <vopc<0x87, 0xc7>, "v_cmp_t_i32">;
let hasSideEffects = 1 in {
-defm V_CMPX_F_I32 : VOPCX_32 <0x00000090, "V_CMPX_F_I32">;
-defm V_CMPX_LT_I32 : VOPCX_32 <0x00000091, "V_CMPX_LT_I32">;
-defm V_CMPX_EQ_I32 : VOPCX_32 <0x00000092, "V_CMPX_EQ_I32">;
-defm V_CMPX_LE_I32 : VOPCX_32 <0x00000093, "V_CMPX_LE_I32">;
-defm V_CMPX_GT_I32 : VOPCX_32 <0x00000094, "V_CMPX_GT_I32">;
-defm V_CMPX_NE_I32 : VOPCX_32 <0x00000095, "V_CMPX_NE_I32">;
-defm V_CMPX_GE_I32 : VOPCX_32 <0x00000096, "V_CMPX_GE_I32">;
-defm V_CMPX_T_I32 : VOPCX_32 <0x00000097, "V_CMPX_T_I32">;
+defm V_CMPX_F_I32 : VOPCX_I32 <vopc<0x90, 0xd0>, "v_cmpx_f_i32">;
+defm V_CMPX_LT_I32 : VOPCX_I32 <vopc<0x91, 0xd1>, "v_cmpx_lt_i32">;
+defm V_CMPX_EQ_I32 : VOPCX_I32 <vopc<0x92, 0xd2>, "v_cmpx_eq_i32">;
+defm V_CMPX_LE_I32 : VOPCX_I32 <vopc<0x93, 0xd3>, "v_cmpx_le_i32">;
+defm V_CMPX_GT_I32 : VOPCX_I32 <vopc<0x94, 0xd4>, "v_cmpx_gt_i32">;
+defm V_CMPX_NE_I32 : VOPCX_I32 <vopc<0x95, 0xd5>, "v_cmpx_ne_i32">;
+defm V_CMPX_GE_I32 : VOPCX_I32 <vopc<0x96, 0xd6>, "v_cmpx_ge_i32">;
+defm V_CMPX_T_I32 : VOPCX_I32 <vopc<0x97, 0xd7>, "v_cmpx_t_i32">;
} // End hasSideEffects = 1
-defm V_CMP_F_I64 : VOPC_64 <0x000000a0, "V_CMP_F_I64">;
-defm V_CMP_LT_I64 : VOPC_64 <0x000000a1, "V_CMP_LT_I64", i64, COND_SLT>;
-defm V_CMP_EQ_I64 : VOPC_64 <0x000000a2, "V_CMP_EQ_I64", i64, COND_EQ>;
-defm V_CMP_LE_I64 : VOPC_64 <0x000000a3, "V_CMP_LE_I64", i64, COND_SLE>;
-defm V_CMP_GT_I64 : VOPC_64 <0x000000a4, "V_CMP_GT_I64", i64, COND_SGT>;
-defm V_CMP_NE_I64 : VOPC_64 <0x000000a5, "V_CMP_NE_I64", i64, COND_NE>;
-defm V_CMP_GE_I64 : VOPC_64 <0x000000a6, "V_CMP_GE_I64", i64, COND_SGE>;
-defm V_CMP_T_I64 : VOPC_64 <0x000000a7, "V_CMP_T_I64">;
+defm V_CMP_F_I64 : VOPC_I64 <vopc<0xa0, 0xe0>, "v_cmp_f_i64">;
+defm V_CMP_LT_I64 : VOPC_I64 <vopc<0xa1, 0xe1>, "v_cmp_lt_i64", COND_SLT>;
+defm V_CMP_EQ_I64 : VOPC_I64 <vopc<0xa2, 0xe2>, "v_cmp_eq_i64", COND_EQ>;
+defm V_CMP_LE_I64 : VOPC_I64 <vopc<0xa3, 0xe3>, "v_cmp_le_i64", COND_SLE>;
+defm V_CMP_GT_I64 : VOPC_I64 <vopc<0xa4, 0xe4>, "v_cmp_gt_i64", COND_SGT>;
+defm V_CMP_NE_I64 : VOPC_I64 <vopc<0xa5, 0xe5>, "v_cmp_ne_i64", COND_NE>;
+defm V_CMP_GE_I64 : VOPC_I64 <vopc<0xa6, 0xe6>, "v_cmp_ge_i64", COND_SGE>;
+defm V_CMP_T_I64 : VOPC_I64 <vopc<0xa7, 0xe7>, "v_cmp_t_i64">;
let hasSideEffects = 1 in {
-defm V_CMPX_F_I64 : VOPCX_64 <0x000000b0, "V_CMPX_F_I64">;
-defm V_CMPX_LT_I64 : VOPCX_64 <0x000000b1, "V_CMPX_LT_I64">;
-defm V_CMPX_EQ_I64 : VOPCX_64 <0x000000b2, "V_CMPX_EQ_I64">;
-defm V_CMPX_LE_I64 : VOPCX_64 <0x000000b3, "V_CMPX_LE_I64">;
-defm V_CMPX_GT_I64 : VOPCX_64 <0x000000b4, "V_CMPX_GT_I64">;
-defm V_CMPX_NE_I64 : VOPCX_64 <0x000000b5, "V_CMPX_NE_I64">;
-defm V_CMPX_GE_I64 : VOPCX_64 <0x000000b6, "V_CMPX_GE_I64">;
-defm V_CMPX_T_I64 : VOPCX_64 <0x000000b7, "V_CMPX_T_I64">;
+defm V_CMPX_F_I64 : VOPCX_I64 <vopc<0xb0, 0xf0>, "v_cmpx_f_i64">;
+defm V_CMPX_LT_I64 : VOPCX_I64 <vopc<0xb1, 0xf1>, "v_cmpx_lt_i64">;
+defm V_CMPX_EQ_I64 : VOPCX_I64 <vopc<0xb2, 0xf2>, "v_cmpx_eq_i64">;
+defm V_CMPX_LE_I64 : VOPCX_I64 <vopc<0xb3, 0xf3>, "v_cmpx_le_i64">;
+defm V_CMPX_GT_I64 : VOPCX_I64 <vopc<0xb4, 0xf4>, "v_cmpx_gt_i64">;
+defm V_CMPX_NE_I64 : VOPCX_I64 <vopc<0xb5, 0xf5>, "v_cmpx_ne_i64">;
+defm V_CMPX_GE_I64 : VOPCX_I64 <vopc<0xb6, 0xf6>, "v_cmpx_ge_i64">;
+defm V_CMPX_T_I64 : VOPCX_I64 <vopc<0xb7, 0xf7>, "v_cmpx_t_i64">;
} // End hasSideEffects = 1
-defm V_CMP_F_U32 : VOPC_32 <0x000000c0, "V_CMP_F_U32">;
-defm V_CMP_LT_U32 : VOPC_32 <0x000000c1, "V_CMP_LT_U32", i32, COND_ULT>;
-defm V_CMP_EQ_U32 : VOPC_32 <0x000000c2, "V_CMP_EQ_U32", i32, COND_EQ>;
-defm V_CMP_LE_U32 : VOPC_32 <0x000000c3, "V_CMP_LE_U32", i32, COND_ULE>;
-defm V_CMP_GT_U32 : VOPC_32 <0x000000c4, "V_CMP_GT_U32", i32, COND_UGT>;
-defm V_CMP_NE_U32 : VOPC_32 <0x000000c5, "V_CMP_NE_U32", i32, COND_NE>;
-defm V_CMP_GE_U32 : VOPC_32 <0x000000c6, "V_CMP_GE_U32", i32, COND_UGE>;
-defm V_CMP_T_U32 : VOPC_32 <0x000000c7, "V_CMP_T_U32">;
+defm V_CMP_F_U32 : VOPC_I32 <vopc<0xc0, 0xc8>, "v_cmp_f_u32">;
+defm V_CMP_LT_U32 : VOPC_I32 <vopc<0xc1, 0xc9>, "v_cmp_lt_u32", COND_ULT>;
+defm V_CMP_EQ_U32 : VOPC_I32 <vopc<0xc2, 0xca>, "v_cmp_eq_u32", COND_EQ>;
+defm V_CMP_LE_U32 : VOPC_I32 <vopc<0xc3, 0xcb>, "v_cmp_le_u32", COND_ULE>;
+defm V_CMP_GT_U32 : VOPC_I32 <vopc<0xc4, 0xcc>, "v_cmp_gt_u32", COND_UGT>;
+defm V_CMP_NE_U32 : VOPC_I32 <vopc<0xc5, 0xcd>, "v_cmp_ne_u32", COND_NE>;
+defm V_CMP_GE_U32 : VOPC_I32 <vopc<0xc6, 0xce>, "v_cmp_ge_u32", COND_UGE>;
+defm V_CMP_T_U32 : VOPC_I32 <vopc<0xc7, 0xcf>, "v_cmp_t_u32">;
let hasSideEffects = 1 in {
-defm V_CMPX_F_U32 : VOPCX_32 <0x000000d0, "V_CMPX_F_U32">;
-defm V_CMPX_LT_U32 : VOPCX_32 <0x000000d1, "V_CMPX_LT_U32">;
-defm V_CMPX_EQ_U32 : VOPCX_32 <0x000000d2, "V_CMPX_EQ_U32">;
-defm V_CMPX_LE_U32 : VOPCX_32 <0x000000d3, "V_CMPX_LE_U32">;
-defm V_CMPX_GT_U32 : VOPCX_32 <0x000000d4, "V_CMPX_GT_U32">;
-defm V_CMPX_NE_U32 : VOPCX_32 <0x000000d5, "V_CMPX_NE_U32">;
-defm V_CMPX_GE_U32 : VOPCX_32 <0x000000d6, "V_CMPX_GE_U32">;
-defm V_CMPX_T_U32 : VOPCX_32 <0x000000d7, "V_CMPX_T_U32">;
+defm V_CMPX_F_U32 : VOPCX_I32 <vopc<0xd0, 0xd8>, "v_cmpx_f_u32">;
+defm V_CMPX_LT_U32 : VOPCX_I32 <vopc<0xd1, 0xd9>, "v_cmpx_lt_u32">;
+defm V_CMPX_EQ_U32 : VOPCX_I32 <vopc<0xd2, 0xda>, "v_cmpx_eq_u32">;
+defm V_CMPX_LE_U32 : VOPCX_I32 <vopc<0xd3, 0xdb>, "v_cmpx_le_u32">;
+defm V_CMPX_GT_U32 : VOPCX_I32 <vopc<0xd4, 0xdc>, "v_cmpx_gt_u32">;
+defm V_CMPX_NE_U32 : VOPCX_I32 <vopc<0xd5, 0xdd>, "v_cmpx_ne_u32">;
+defm V_CMPX_GE_U32 : VOPCX_I32 <vopc<0xd6, 0xde>, "v_cmpx_ge_u32">;
+defm V_CMPX_T_U32 : VOPCX_I32 <vopc<0xd7, 0xdf>, "v_cmpx_t_u32">;
} // End hasSideEffects = 1
-defm V_CMP_F_U64 : VOPC_64 <0x000000e0, "V_CMP_F_U64">;
-defm V_CMP_LT_U64 : VOPC_64 <0x000000e1, "V_CMP_LT_U64", i64, COND_ULT>;
-defm V_CMP_EQ_U64 : VOPC_64 <0x000000e2, "V_CMP_EQ_U64", i64, COND_EQ>;
-defm V_CMP_LE_U64 : VOPC_64 <0x000000e3, "V_CMP_LE_U64", i64, COND_ULE>;
-defm V_CMP_GT_U64 : VOPC_64 <0x000000e4, "V_CMP_GT_U64", i64, COND_UGT>;
-defm V_CMP_NE_U64 : VOPC_64 <0x000000e5, "V_CMP_NE_U64", i64, COND_NE>;
-defm V_CMP_GE_U64 : VOPC_64 <0x000000e6, "V_CMP_GE_U64", i64, COND_UGE>;
-defm V_CMP_T_U64 : VOPC_64 <0x000000e7, "V_CMP_T_U64">;
+defm V_CMP_F_U64 : VOPC_I64 <vopc<0xe0, 0xe8>, "v_cmp_f_u64">;
+defm V_CMP_LT_U64 : VOPC_I64 <vopc<0xe1, 0xe9>, "v_cmp_lt_u64", COND_ULT>;
+defm V_CMP_EQ_U64 : VOPC_I64 <vopc<0xe2, 0xea>, "v_cmp_eq_u64", COND_EQ>;
+defm V_CMP_LE_U64 : VOPC_I64 <vopc<0xe3, 0xeb>, "v_cmp_le_u64", COND_ULE>;
+defm V_CMP_GT_U64 : VOPC_I64 <vopc<0xe4, 0xec>, "v_cmp_gt_u64", COND_UGT>;
+defm V_CMP_NE_U64 : VOPC_I64 <vopc<0xe5, 0xed>, "v_cmp_ne_u64", COND_NE>;
+defm V_CMP_GE_U64 : VOPC_I64 <vopc<0xe6, 0xee>, "v_cmp_ge_u64", COND_UGE>;
+defm V_CMP_T_U64 : VOPC_I64 <vopc<0xe7, 0xef>, "v_cmp_t_u64">;
let hasSideEffects = 1 in {
-defm V_CMPX_F_U64 : VOPCX_64 <0x000000f0, "V_CMPX_F_U64">;
-defm V_CMPX_LT_U64 : VOPCX_64 <0x000000f1, "V_CMPX_LT_U64">;
-defm V_CMPX_EQ_U64 : VOPCX_64 <0x000000f2, "V_CMPX_EQ_U64">;
-defm V_CMPX_LE_U64 : VOPCX_64 <0x000000f3, "V_CMPX_LE_U64">;
-defm V_CMPX_GT_U64 : VOPCX_64 <0x000000f4, "V_CMPX_GT_U64">;
-defm V_CMPX_NE_U64 : VOPCX_64 <0x000000f5, "V_CMPX_NE_U64">;
-defm V_CMPX_GE_U64 : VOPCX_64 <0x000000f6, "V_CMPX_GE_U64">;
-defm V_CMPX_T_U64 : VOPCX_64 <0x000000f7, "V_CMPX_T_U64">;
+defm V_CMPX_F_U64 : VOPCX_I64 <vopc<0xf0, 0xf8>, "v_cmpx_f_u64">;
+defm V_CMPX_LT_U64 : VOPCX_I64 <vopc<0xf1, 0xf9>, "v_cmpx_lt_u64">;
+defm V_CMPX_EQ_U64 : VOPCX_I64 <vopc<0xf2, 0xfa>, "v_cmpx_eq_u64">;
+defm V_CMPX_LE_U64 : VOPCX_I64 <vopc<0xf3, 0xfb>, "v_cmpx_le_u64">;
+defm V_CMPX_GT_U64 : VOPCX_I64 <vopc<0xf4, 0xfc>, "v_cmpx_gt_u64">;
+defm V_CMPX_NE_U64 : VOPCX_I64 <vopc<0xf5, 0xfd>, "v_cmpx_ne_u64">;
+defm V_CMPX_GE_U64 : VOPCX_I64 <vopc<0xf6, 0xfe>, "v_cmpx_ge_u64">;
+defm V_CMPX_T_U64 : VOPCX_I64 <vopc<0xf7, 0xff>, "v_cmpx_t_u64">;
} // End hasSideEffects = 1
-defm V_CMP_CLASS_F32 : VOPC_32 <0x00000088, "V_CMP_CLASS_F32">;
+defm V_CMP_CLASS_F32 : VOPC_CLASS_F32 <vopc<0x88, 0x10>, "v_cmp_class_f32">;
let hasSideEffects = 1 in {
-defm V_CMPX_CLASS_F32 : VOPCX_32 <0x00000098, "V_CMPX_CLASS_F32">;
+defm V_CMPX_CLASS_F32 : VOPCX_CLASS_F32 <vopc<0x98, 0x11>, "v_cmpx_class_f32">;
} // End hasSideEffects = 1
-defm V_CMP_CLASS_F64 : VOPC_64 <0x000000a8, "V_CMP_CLASS_F64">;
+defm V_CMP_CLASS_F64 : VOPC_CLASS_F64 <vopc<0xa8, 0x12>, "v_cmp_class_f64">;
let hasSideEffects = 1 in {
-defm V_CMPX_CLASS_F64 : VOPCX_64 <0x000000b8, "V_CMPX_CLASS_F64">;
+defm V_CMPX_CLASS_F64 : VOPCX_CLASS_F64 <vopc<0xb8, 0x13>, "v_cmpx_class_f64">;
} // End hasSideEffects = 1
} // End isCompare = 1
@@ -726,88 +764,88 @@ defm V_CMPX_CLASS_F64 : VOPCX_64 <0x000000b8, "V_CMPX_CLASS_F64">;
//===----------------------------------------------------------------------===//
-def DS_ADD_U32 : DS_1A1D_NORET <0x0, "DS_ADD_U32", VReg_32>;
-def DS_SUB_U32 : DS_1A1D_NORET <0x1, "DS_SUB_U32", VReg_32>;
-def DS_RSUB_U32 : DS_1A1D_NORET <0x2, "DS_RSUB_U32", VReg_32>;
-def DS_INC_U32 : DS_1A1D_NORET <0x3, "DS_INC_U32", VReg_32>;
-def DS_DEC_U32 : DS_1A1D_NORET <0x4, "DS_DEC_U32", VReg_32>;
-def DS_MIN_I32 : DS_1A1D_NORET <0x5, "DS_MIN_I32", VReg_32>;
-def DS_MAX_I32 : DS_1A1D_NORET <0x6, "DS_MAX_I32", VReg_32>;
-def DS_MIN_U32 : DS_1A1D_NORET <0x7, "DS_MIN_U32", VReg_32>;
-def DS_MAX_U32 : DS_1A1D_NORET <0x8, "DS_MAX_U32", VReg_32>;
-def DS_AND_B32 : DS_1A1D_NORET <0x9, "DS_AND_B32", VReg_32>;
-def DS_OR_B32 : DS_1A1D_NORET <0xa, "DS_OR_B32", VReg_32>;
-def DS_XOR_B32 : DS_1A1D_NORET <0xb, "DS_XOR_B32", VReg_32>;
-def DS_MSKOR_B32 : DS_1A1D_NORET <0xc, "DS_MSKOR_B32", VReg_32>;
-def DS_CMPST_B32 : DS_1A2D_NORET <0x10, "DS_CMPST_B32", VReg_32>;
-def DS_CMPST_F32 : DS_1A2D_NORET <0x11, "DS_CMPST_F32", VReg_32>;
-def DS_MIN_F32 : DS_1A1D_NORET <0x12, "DS_MIN_F32", VReg_32>;
-def DS_MAX_F32 : DS_1A1D_NORET <0x13, "DS_MAX_F32", VReg_32>;
-
-def DS_ADD_RTN_U32 : DS_1A1D_RET <0x20, "DS_ADD_RTN_U32", VReg_32>;
-def DS_SUB_RTN_U32 : DS_1A1D_RET <0x21, "DS_SUB_RTN_U32", VReg_32>;
-def DS_RSUB_RTN_U32 : DS_1A1D_RET <0x22, "DS_RSUB_RTN_U32", VReg_32>;
-def DS_INC_RTN_U32 : DS_1A1D_RET <0x23, "DS_INC_RTN_U32", VReg_32>;
-def DS_DEC_RTN_U32 : DS_1A1D_RET <0x24, "DS_DEC_RTN_U32", VReg_32>;
-def DS_MIN_RTN_I32 : DS_1A1D_RET <0x25, "DS_MIN_RTN_I32", VReg_32>;
-def DS_MAX_RTN_I32 : DS_1A1D_RET <0x26, "DS_MAX_RTN_I32", VReg_32>;
-def DS_MIN_RTN_U32 : DS_1A1D_RET <0x27, "DS_MIN_RTN_U32", VReg_32>;
-def DS_MAX_RTN_U32 : DS_1A1D_RET <0x28, "DS_MAX_RTN_U32", VReg_32>;
-def DS_AND_RTN_B32 : DS_1A1D_RET <0x29, "DS_AND_RTN_B32", VReg_32>;
-def DS_OR_RTN_B32 : DS_1A1D_RET <0x2a, "DS_OR_RTN_B32", VReg_32>;
-def DS_XOR_RTN_B32 : DS_1A1D_RET <0x2b, "DS_XOR_RTN_B32", VReg_32>;
-def DS_MSKOR_RTN_B32 : DS_1A1D_RET <0x2c, "DS_MSKOR_RTN_B32", VReg_32>;
-def DS_WRXCHG_RTN_B32 : DS_1A1D_RET <0x2d, "DS_WRXCHG_RTN_B32", VReg_32>;
-//def DS_WRXCHG2_RTN_B32 : DS_2A0D_RET <0x2e, "DS_WRXCHG2_RTN_B32", VReg_32>;
-//def DS_WRXCHG2ST64_RTN_B32 : DS_2A0D_RET <0x2f, "DS_WRXCHG2_RTN_B32", VReg_32>;
-def DS_CMPST_RTN_B32 : DS_1A2D_RET <0x30, "DS_CMPST_RTN_B32", VReg_32>;
-def DS_CMPST_RTN_F32 : DS_1A2D_RET <0x31, "DS_CMPST_RTN_F32", VReg_32>;
-def DS_MIN_RTN_F32 : DS_1A1D_RET <0x32, "DS_MIN_RTN_F32", VReg_32>;
-def DS_MAX_RTN_F32 : DS_1A1D_RET <0x33, "DS_MAX_RTN_F32", VReg_32>;
+def DS_ADD_U32 : DS_1A1D_NORET <0x0, "ds_add_u32", VGPR_32>;
+def DS_SUB_U32 : DS_1A1D_NORET <0x1, "ds_sub_u32", VGPR_32>;
+def DS_RSUB_U32 : DS_1A1D_NORET <0x2, "ds_rsub_u32", VGPR_32>;
+def DS_INC_U32 : DS_1A1D_NORET <0x3, "ds_inc_u32", VGPR_32>;
+def DS_DEC_U32 : DS_1A1D_NORET <0x4, "ds_dec_u32", VGPR_32>;
+def DS_MIN_I32 : DS_1A1D_NORET <0x5, "ds_min_i32", VGPR_32>;
+def DS_MAX_I32 : DS_1A1D_NORET <0x6, "ds_max_i32", VGPR_32>;
+def DS_MIN_U32 : DS_1A1D_NORET <0x7, "ds_min_u32", VGPR_32>;
+def DS_MAX_U32 : DS_1A1D_NORET <0x8, "ds_max_u32", VGPR_32>;
+def DS_AND_B32 : DS_1A1D_NORET <0x9, "ds_and_b32", VGPR_32>;
+def DS_OR_B32 : DS_1A1D_NORET <0xa, "ds_or_b32", VGPR_32>;
+def DS_XOR_B32 : DS_1A1D_NORET <0xb, "ds_xor_b32", VGPR_32>;
+def DS_MSKOR_B32 : DS_1A1D_NORET <0xc, "ds_mskor_b32", VGPR_32>;
+def DS_CMPST_B32 : DS_1A2D_NORET <0x10, "ds_cmpst_b32", VGPR_32>;
+def DS_CMPST_F32 : DS_1A2D_NORET <0x11, "ds_cmpst_f32", VGPR_32>;
+def DS_MIN_F32 : DS_1A1D_NORET <0x12, "ds_min_f32", VGPR_32>;
+def DS_MAX_F32 : DS_1A1D_NORET <0x13, "ds_max_f32", VGPR_32>;
+
+def DS_ADD_RTN_U32 : DS_1A1D_RET <0x20, "ds_add_rtn_u32", VGPR_32, "ds_add_u32">;
+def DS_SUB_RTN_U32 : DS_1A1D_RET <0x21, "ds_sub_rtn_u32", VGPR_32, "ds_sub_u32">;
+def DS_RSUB_RTN_U32 : DS_1A1D_RET <0x22, "ds_rsub_rtn_u32", VGPR_32, "ds_rsub_u32">;
+def DS_INC_RTN_U32 : DS_1A1D_RET <0x23, "ds_inc_rtn_u32", VGPR_32, "ds_inc_u32">;
+def DS_DEC_RTN_U32 : DS_1A1D_RET <0x24, "ds_dec_rtn_u32", VGPR_32, "ds_dec_u32">;
+def DS_MIN_RTN_I32 : DS_1A1D_RET <0x25, "ds_min_rtn_i32", VGPR_32, "ds_min_i32">;
+def DS_MAX_RTN_I32 : DS_1A1D_RET <0x26, "ds_max_rtn_i32", VGPR_32, "ds_max_i32">;
+def DS_MIN_RTN_U32 : DS_1A1D_RET <0x27, "ds_min_rtn_u32", VGPR_32, "ds_min_u32">;
+def DS_MAX_RTN_U32 : DS_1A1D_RET <0x28, "ds_max_rtn_u32", VGPR_32, "ds_max_u32">;
+def DS_AND_RTN_B32 : DS_1A1D_RET <0x29, "ds_and_rtn_b32", VGPR_32, "ds_and_b32">;
+def DS_OR_RTN_B32 : DS_1A1D_RET <0x2a, "ds_or_rtn_b32", VGPR_32, "ds_or_b32">;
+def DS_XOR_RTN_B32 : DS_1A1D_RET <0x2b, "ds_xor_rtn_b32", VGPR_32, "ds_xor_b32">;
+def DS_MSKOR_RTN_B32 : DS_1A1D_RET <0x2c, "ds_mskor_rtn_b32", VGPR_32, "ds_mskor_b32">;
+def DS_WRXCHG_RTN_B32 : DS_1A1D_RET <0x2d, "ds_wrxchg_rtn_b32", VGPR_32>;
+//def DS_WRXCHG2_RTN_B32 : DS_2A0D_RET <0x2e, "ds_wrxchg2_rtn_b32", VGPR_32, "ds_wrxchg2_b32">;
+//def DS_WRXCHG2ST64_RTN_B32 : DS_2A0D_RET <0x2f, "ds_wrxchg2_rtn_b32", VGPR_32, "ds_wrxchg2st64_b32">;
+def DS_CMPST_RTN_B32 : DS_1A2D_RET <0x30, "ds_cmpst_rtn_b32", VGPR_32, "ds_cmpst_b32">;
+def DS_CMPST_RTN_F32 : DS_1A2D_RET <0x31, "ds_cmpst_rtn_f32", VGPR_32, "ds_cmpst_f32">;
+def DS_MIN_RTN_F32 : DS_1A1D_RET <0x32, "ds_min_rtn_f32", VGPR_32, "ds_min_f32">;
+def DS_MAX_RTN_F32 : DS_1A1D_RET <0x33, "ds_max_rtn_f32", VGPR_32, "ds_max_f32">;
let SubtargetPredicate = isCI in {
-def DS_WRAP_RTN_F32 : DS_1A1D_RET <0x34, "DS_WRAP_RTN_F32", VReg_32>;
+def DS_WRAP_RTN_F32 : DS_1A1D_RET <0x34, "ds_wrap_rtn_f32", VGPR_32, "ds_wrap_f32">;
} // End isCI
-def DS_ADD_U64 : DS_1A1D_NORET <0x40, "DS_ADD_U64", VReg_32>;
-def DS_SUB_U64 : DS_1A1D_NORET <0x41, "DS_SUB_U64", VReg_32>;
-def DS_RSUB_U64 : DS_1A1D_NORET <0x42, "DS_RSUB_U64", VReg_32>;
-def DS_INC_U64 : DS_1A1D_NORET <0x43, "DS_INC_U64", VReg_32>;
-def DS_DEC_U64 : DS_1A1D_NORET <0x44, "DS_DEC_U64", VReg_32>;
-def DS_MIN_I64 : DS_1A1D_NORET <0x45, "DS_MIN_I64", VReg_64>;
-def DS_MAX_I64 : DS_1A1D_NORET <0x46, "DS_MAX_I64", VReg_64>;
-def DS_MIN_U64 : DS_1A1D_NORET <0x47, "DS_MIN_U64", VReg_64>;
-def DS_MAX_U64 : DS_1A1D_NORET <0x48, "DS_MAX_U64", VReg_64>;
-def DS_AND_B64 : DS_1A1D_NORET <0x49, "DS_AND_B64", VReg_64>;
-def DS_OR_B64 : DS_1A1D_NORET <0x4a, "DS_OR_B64", VReg_64>;
-def DS_XOR_B64 : DS_1A1D_NORET <0x4b, "DS_XOR_B64", VReg_64>;
-def DS_MSKOR_B64 : DS_1A1D_NORET <0x4c, "DS_MSKOR_B64", VReg_64>;
-def DS_CMPST_B64 : DS_1A2D_NORET <0x50, "DS_CMPST_B64", VReg_64>;
-def DS_CMPST_F64 : DS_1A2D_NORET <0x51, "DS_CMPST_F64", VReg_64>;
-def DS_MIN_F64 : DS_1A1D_NORET <0x52, "DS_MIN_F64", VReg_64>;
-def DS_MAX_F64 : DS_1A1D_NORET <0x53, "DS_MAX_F64", VReg_64>;
-
-def DS_ADD_RTN_U64 : DS_1A1D_RET <0x60, "DS_ADD_RTN_U64", VReg_64>;
-def DS_SUB_RTN_U64 : DS_1A1D_RET <0x61, "DS_SUB_RTN_U64", VReg_64>;
-def DS_RSUB_RTN_U64 : DS_1A1D_RET <0x62, "DS_RSUB_RTN_U64", VReg_64>;
-def DS_INC_RTN_U64 : DS_1A1D_RET <0x63, "DS_INC_RTN_U64", VReg_64>;
-def DS_DEC_RTN_U64 : DS_1A1D_RET <0x64, "DS_DEC_RTN_U64", VReg_64>;
-def DS_MIN_RTN_I64 : DS_1A1D_RET <0x65, "DS_MIN_RTN_I64", VReg_64>;
-def DS_MAX_RTN_I64 : DS_1A1D_RET <0x66, "DS_MAX_RTN_I64", VReg_64>;
-def DS_MIN_RTN_U64 : DS_1A1D_RET <0x67, "DS_MIN_RTN_U64", VReg_64>;
-def DS_MAX_RTN_U64 : DS_1A1D_RET <0x68, "DS_MAX_RTN_U64", VReg_64>;
-def DS_AND_RTN_B64 : DS_1A1D_RET <0x69, "DS_AND_RTN_B64", VReg_64>;
-def DS_OR_RTN_B64 : DS_1A1D_RET <0x6a, "DS_OR_RTN_B64", VReg_64>;
-def DS_XOR_RTN_B64 : DS_1A1D_RET <0x6b, "DS_XOR_RTN_B64", VReg_64>;
-def DS_MSKOR_RTN_B64 : DS_1A1D_RET <0x6c, "DS_MSKOR_RTN_B64", VReg_64>;
-def DS_WRXCHG_RTN_B64 : DS_1A1D_RET <0x6d, "DS_WRXCHG_RTN_B64", VReg_64>;
-//def DS_WRXCHG2_RTN_B64 : DS_2A0D_RET <0x6e, "DS_WRXCHG2_RTN_B64", VReg_64>;
-//def DS_WRXCHG2ST64_RTN_B64 : DS_2A0D_RET <0x6f, "DS_WRXCHG2_RTN_B64", VReg_64>;
-def DS_CMPST_RTN_B64 : DS_1A2D_RET <0x70, "DS_CMPST_RTN_B64", VReg_64>;
-def DS_CMPST_RTN_F64 : DS_1A2D_RET <0x71, "DS_CMPST_RTN_F64", VReg_64>;
-def DS_MIN_RTN_F64 : DS_1A1D_RET <0x72, "DS_MIN_F64", VReg_64>;
-def DS_MAX_RTN_F64 : DS_1A1D_RET <0x73, "DS_MAX_F64", VReg_64>;
+def DS_ADD_U64 : DS_1A1D_NORET <0x40, "ds_add_u64", VReg_64>;
+def DS_SUB_U64 : DS_1A1D_NORET <0x41, "ds_sub_u64", VReg_64>;
+def DS_RSUB_U64 : DS_1A1D_NORET <0x42, "ds_rsub_u64", VReg_64>;
+def DS_INC_U64 : DS_1A1D_NORET <0x43, "ds_inc_u64", VReg_64>;
+def DS_DEC_U64 : DS_1A1D_NORET <0x44, "ds_dec_u64", VReg_64>;
+def DS_MIN_I64 : DS_1A1D_NORET <0x45, "ds_min_i64", VReg_64>;
+def DS_MAX_I64 : DS_1A1D_NORET <0x46, "ds_max_i64", VReg_64>;
+def DS_MIN_U64 : DS_1A1D_NORET <0x47, "ds_min_u64", VReg_64>;
+def DS_MAX_U64 : DS_1A1D_NORET <0x48, "ds_max_u64", VReg_64>;
+def DS_AND_B64 : DS_1A1D_NORET <0x49, "ds_and_b64", VReg_64>;
+def DS_OR_B64 : DS_1A1D_NORET <0x4a, "ds_or_b64", VReg_64>;
+def DS_XOR_B64 : DS_1A1D_NORET <0x4b, "ds_xor_b64", VReg_64>;
+def DS_MSKOR_B64 : DS_1A1D_NORET <0x4c, "ds_mskor_b64", VReg_64>;
+def DS_CMPST_B64 : DS_1A2D_NORET <0x50, "ds_cmpst_b64", VReg_64>;
+def DS_CMPST_F64 : DS_1A2D_NORET <0x51, "ds_cmpst_f64", VReg_64>;
+def DS_MIN_F64 : DS_1A1D_NORET <0x52, "ds_min_f64", VReg_64>;
+def DS_MAX_F64 : DS_1A1D_NORET <0x53, "ds_max_f64", VReg_64>;
+
+def DS_ADD_RTN_U64 : DS_1A1D_RET <0x60, "ds_add_rtn_u64", VReg_64, "ds_add_u64">;
+def DS_SUB_RTN_U64 : DS_1A1D_RET <0x61, "ds_sub_rtn_u64", VReg_64, "ds_sub_u64">;
+def DS_RSUB_RTN_U64 : DS_1A1D_RET <0x62, "ds_rsub_rtn_u64", VReg_64, "ds_rsub_u64">;
+def DS_INC_RTN_U64 : DS_1A1D_RET <0x63, "ds_inc_rtn_u64", VReg_64, "ds_inc_u64">;
+def DS_DEC_RTN_U64 : DS_1A1D_RET <0x64, "ds_dec_rtn_u64", VReg_64, "ds_dec_u64">;
+def DS_MIN_RTN_I64 : DS_1A1D_RET <0x65, "ds_min_rtn_i64", VReg_64, "ds_min_i64">;
+def DS_MAX_RTN_I64 : DS_1A1D_RET <0x66, "ds_max_rtn_i64", VReg_64, "ds_max_i64">;
+def DS_MIN_RTN_U64 : DS_1A1D_RET <0x67, "ds_min_rtn_u64", VReg_64, "ds_min_u64">;
+def DS_MAX_RTN_U64 : DS_1A1D_RET <0x68, "ds_max_rtn_u64", VReg_64, "ds_max_u64">;
+def DS_AND_RTN_B64 : DS_1A1D_RET <0x69, "ds_and_rtn_b64", VReg_64, "ds_and_b64">;
+def DS_OR_RTN_B64 : DS_1A1D_RET <0x6a, "ds_or_rtn_b64", VReg_64, "ds_or_b64">;
+def DS_XOR_RTN_B64 : DS_1A1D_RET <0x6b, "ds_xor_rtn_b64", VReg_64, "ds_xor_b64">;
+def DS_MSKOR_RTN_B64 : DS_1A1D_RET <0x6c, "ds_mskor_rtn_b64", VReg_64, "ds_mskor_b64">;
+def DS_WRXCHG_RTN_B64 : DS_1A1D_RET <0x6d, "ds_wrxchg_rtn_b64", VReg_64, "ds_wrxchg_b64">;
+//def DS_WRXCHG2_RTN_B64 : DS_2A0D_RET <0x6e, "ds_wrxchg2_rtn_b64", VReg_64, "ds_wrxchg2_b64">;
+//def DS_WRXCHG2ST64_RTN_B64 : DS_2A0D_RET <0x6f, "ds_wrxchg2_rtn_b64", VReg_64, "ds_wrxchg2st64_b64">;
+def DS_CMPST_RTN_B64 : DS_1A2D_RET <0x70, "ds_cmpst_rtn_b64", VReg_64, "ds_cmpst_b64">;
+def DS_CMPST_RTN_F64 : DS_1A2D_RET <0x71, "ds_cmpst_rtn_f64", VReg_64, "ds_cmpst_f64">;
+def DS_MIN_RTN_F64 : DS_1A1D_RET <0x72, "ds_min_rtn_f64", VReg_64, "ds_min_f64">;
+def DS_MAX_RTN_F64 : DS_1A1D_RET <0x73, "ds_max_rtn_f64", VReg_64, "ds_max_f64">;
//let SubtargetPredicate = isCI in {
// DS_CONDXCHG32_RTN_B64
@@ -816,719 +854,951 @@ def DS_MAX_RTN_F64 : DS_1A1D_RET <0x73, "DS_MAX_F64", VReg_64>;
// TODO: _SRC2_* forms
-def DS_WRITE_B32 : DS_Store_Helper <0x0000000d, "DS_WRITE_B32", VReg_32>;
-def DS_WRITE_B8 : DS_Store_Helper <0x00000001e, "DS_WRITE_B8", VReg_32>;
-def DS_WRITE_B16 : DS_Store_Helper <0x00000001f, "DS_WRITE_B16", VReg_32>;
-def DS_WRITE_B64 : DS_Store_Helper <0x00000004d, "DS_WRITE_B64", VReg_64>;
+defm DS_WRITE_B32 : DS_Store_Helper <0x0000000d, "ds_write_b32", VGPR_32>;
+defm DS_WRITE_B8 : DS_Store_Helper <0x00000001e, "ds_write_b8", VGPR_32>;
+defm DS_WRITE_B16 : DS_Store_Helper <0x00000001f, "ds_write_b16", VGPR_32>;
+defm DS_WRITE_B64 : DS_Store_Helper <0x00000004d, "ds_write_b64", VReg_64>;
-def DS_READ_B32 : DS_Load_Helper <0x00000036, "DS_READ_B32", VReg_32>;
-def DS_READ_I8 : DS_Load_Helper <0x00000039, "DS_READ_I8", VReg_32>;
-def DS_READ_U8 : DS_Load_Helper <0x0000003a, "DS_READ_U8", VReg_32>;
-def DS_READ_I16 : DS_Load_Helper <0x0000003b, "DS_READ_I16", VReg_32>;
-def DS_READ_U16 : DS_Load_Helper <0x0000003c, "DS_READ_U16", VReg_32>;
-def DS_READ_B64 : DS_Load_Helper <0x00000076, "DS_READ_B64", VReg_64>;
+defm DS_READ_B32 : DS_Load_Helper <0x00000036, "ds_read_b32", VGPR_32>;
+defm DS_READ_I8 : DS_Load_Helper <0x00000039, "ds_read_i8", VGPR_32>;
+defm DS_READ_U8 : DS_Load_Helper <0x0000003a, "ds_read_u8", VGPR_32>;
+defm DS_READ_I16 : DS_Load_Helper <0x0000003b, "ds_read_i16", VGPR_32>;
+defm DS_READ_U16 : DS_Load_Helper <0x0000003c, "ds_read_u16", VGPR_32>;
+defm DS_READ_B64 : DS_Load_Helper <0x00000076, "ds_read_b64", VReg_64>;
// 2 forms.
-def DS_WRITE2_B32 : DS_Load2_Helper <0x0000000E, "DS_WRITE2_B32", VReg_64>;
-def DS_WRITE2_B64 : DS_Load2_Helper <0x0000004E, "DS_WRITE2_B64", VReg_128>;
+defm DS_WRITE2_B32 : DS_Store2_Helper <0x0000000E, "ds_write2_b32", VGPR_32>;
+defm DS_WRITE2ST64_B32 : DS_Store2_Helper <0x0000000F, "ds_write2st64_b32", VGPR_32>;
+defm DS_WRITE2_B64 : DS_Store2_Helper <0x0000004E, "ds_write2_b64", VReg_64>;
+defm DS_WRITE2ST64_B64 : DS_Store2_Helper <0x0000004F, "ds_write2st64_b64", VReg_64>;
-def DS_READ2_B32 : DS_Load2_Helper <0x00000037, "DS_READ2_B32", VReg_64>;
-def DS_READ2_B64 : DS_Load2_Helper <0x00000075, "DS_READ2_B64", VReg_128>;
-
-// TODO: DS_READ2ST64_B32, DS_READ2ST64_B64,
-// DS_WRITE2ST64_B32, DS_WRITE2ST64_B64
+defm DS_READ2_B32 : DS_Load2_Helper <0x00000037, "ds_read2_b32", VReg_64>;
+defm DS_READ2ST64_B32 : DS_Load2_Helper <0x00000038, "ds_read2st64_b32", VReg_64>;
+defm DS_READ2_B64 : DS_Load2_Helper <0x00000075, "ds_read2_b64", VReg_128>;
+defm DS_READ2ST64_B64 : DS_Load2_Helper <0x00000076, "ds_read2st64_b64", VReg_128>;
//===----------------------------------------------------------------------===//
// MUBUF Instructions
//===----------------------------------------------------------------------===//
-//def BUFFER_LOAD_FORMAT_X : MUBUF_ <0x00000000, "BUFFER_LOAD_FORMAT_X", []>;
-//def BUFFER_LOAD_FORMAT_XY : MUBUF_ <0x00000001, "BUFFER_LOAD_FORMAT_XY", []>;
-//def BUFFER_LOAD_FORMAT_XYZ : MUBUF_ <0x00000002, "BUFFER_LOAD_FORMAT_XYZ", []>;
-defm BUFFER_LOAD_FORMAT_XYZW : MUBUF_Load_Helper <0x00000003, "BUFFER_LOAD_FORMAT_XYZW", VReg_128>;
-//def BUFFER_STORE_FORMAT_X : MUBUF_ <0x00000004, "BUFFER_STORE_FORMAT_X", []>;
-//def BUFFER_STORE_FORMAT_XY : MUBUF_ <0x00000005, "BUFFER_STORE_FORMAT_XY", []>;
-//def BUFFER_STORE_FORMAT_XYZ : MUBUF_ <0x00000006, "BUFFER_STORE_FORMAT_XYZ", []>;
-//def BUFFER_STORE_FORMAT_XYZW : MUBUF_ <0x00000007, "BUFFER_STORE_FORMAT_XYZW", []>;
+let SubtargetPredicate = isSICI in {
+
+//def BUFFER_LOAD_FORMAT_X : MUBUF_ <0x00000000, "buffer_load_format_x", []>;
+//def BUFFER_LOAD_FORMAT_XY : MUBUF_ <0x00000001, "buffer_load_format_xy", []>;
+//def BUFFER_LOAD_FORMAT_XYZ : MUBUF_ <0x00000002, "buffer_load_format_xyz", []>;
+defm BUFFER_LOAD_FORMAT_XYZW : MUBUF_Load_Helper <0x00000003, "buffer_load_format_xyzw", VReg_128>;
+//def BUFFER_STORE_FORMAT_X : MUBUF_ <0x00000004, "buffer_store_format_x", []>;
+//def BUFFER_STORE_FORMAT_XY : MUBUF_ <0x00000005, "buffer_store_format_xy", []>;
+//def BUFFER_STORE_FORMAT_XYZ : MUBUF_ <0x00000006, "buffer_store_format_xyz", []>;
+//def BUFFER_STORE_FORMAT_XYZW : MUBUF_ <0x00000007, "buffer_store_format_xyzw", []>;
defm BUFFER_LOAD_UBYTE : MUBUF_Load_Helper <
- 0x00000008, "BUFFER_LOAD_UBYTE", VReg_32, i32, az_extloadi8_global
+ 0x00000008, "buffer_load_ubyte", VGPR_32, i32, az_extloadi8_global
>;
defm BUFFER_LOAD_SBYTE : MUBUF_Load_Helper <
- 0x00000009, "BUFFER_LOAD_SBYTE", VReg_32, i32, sextloadi8_global
+ 0x00000009, "buffer_load_sbyte", VGPR_32, i32, sextloadi8_global
>;
defm BUFFER_LOAD_USHORT : MUBUF_Load_Helper <
- 0x0000000a, "BUFFER_LOAD_USHORT", VReg_32, i32, az_extloadi16_global
+ 0x0000000a, "buffer_load_ushort", VGPR_32, i32, az_extloadi16_global
>;
defm BUFFER_LOAD_SSHORT : MUBUF_Load_Helper <
- 0x0000000b, "BUFFER_LOAD_SSHORT", VReg_32, i32, sextloadi16_global
+ 0x0000000b, "buffer_load_sshort", VGPR_32, i32, sextloadi16_global
>;
defm BUFFER_LOAD_DWORD : MUBUF_Load_Helper <
- 0x0000000c, "BUFFER_LOAD_DWORD", VReg_32, i32, global_load
+ 0x0000000c, "buffer_load_dword", VGPR_32, i32, global_load
>;
defm BUFFER_LOAD_DWORDX2 : MUBUF_Load_Helper <
- 0x0000000d, "BUFFER_LOAD_DWORDX2", VReg_64, v2i32, global_load
+ 0x0000000d, "buffer_load_dwordx2", VReg_64, v2i32, global_load
>;
defm BUFFER_LOAD_DWORDX4 : MUBUF_Load_Helper <
- 0x0000000e, "BUFFER_LOAD_DWORDX4", VReg_128, v4i32, global_load
+ 0x0000000e, "buffer_load_dwordx4", VReg_128, v4i32, global_load
>;
defm BUFFER_STORE_BYTE : MUBUF_Store_Helper <
- 0x00000018, "BUFFER_STORE_BYTE", VReg_32, i32, truncstorei8_global
+ 0x00000018, "buffer_store_byte", VGPR_32, i32, truncstorei8_global
>;
defm BUFFER_STORE_SHORT : MUBUF_Store_Helper <
- 0x0000001a, "BUFFER_STORE_SHORT", VReg_32, i32, truncstorei16_global
+ 0x0000001a, "buffer_store_short", VGPR_32, i32, truncstorei16_global
>;
defm BUFFER_STORE_DWORD : MUBUF_Store_Helper <
- 0x0000001c, "BUFFER_STORE_DWORD", VReg_32, i32, global_store
+ 0x0000001c, "buffer_store_dword", VGPR_32, i32, global_store
>;
defm BUFFER_STORE_DWORDX2 : MUBUF_Store_Helper <
- 0x0000001d, "BUFFER_STORE_DWORDX2", VReg_64, v2i32, global_store
+ 0x0000001d, "buffer_store_dwordx2", VReg_64, v2i32, global_store
>;
defm BUFFER_STORE_DWORDX4 : MUBUF_Store_Helper <
- 0x0000001e, "BUFFER_STORE_DWORDX4", VReg_128, v4i32, global_store
->;
-//def BUFFER_ATOMIC_SWAP : MUBUF_ <0x00000030, "BUFFER_ATOMIC_SWAP", []>;
-//def BUFFER_ATOMIC_CMPSWAP : MUBUF_ <0x00000031, "BUFFER_ATOMIC_CMPSWAP", []>;
-//def BUFFER_ATOMIC_ADD : MUBUF_ <0x00000032, "BUFFER_ATOMIC_ADD", []>;
-//def BUFFER_ATOMIC_SUB : MUBUF_ <0x00000033, "BUFFER_ATOMIC_SUB", []>;
-//def BUFFER_ATOMIC_RSUB : MUBUF_ <0x00000034, "BUFFER_ATOMIC_RSUB", []>;
-//def BUFFER_ATOMIC_SMIN : MUBUF_ <0x00000035, "BUFFER_ATOMIC_SMIN", []>;
-//def BUFFER_ATOMIC_UMIN : MUBUF_ <0x00000036, "BUFFER_ATOMIC_UMIN", []>;
-//def BUFFER_ATOMIC_SMAX : MUBUF_ <0x00000037, "BUFFER_ATOMIC_SMAX", []>;
-//def BUFFER_ATOMIC_UMAX : MUBUF_ <0x00000038, "BUFFER_ATOMIC_UMAX", []>;
-//def BUFFER_ATOMIC_AND : MUBUF_ <0x00000039, "BUFFER_ATOMIC_AND", []>;
-//def BUFFER_ATOMIC_OR : MUBUF_ <0x0000003a, "BUFFER_ATOMIC_OR", []>;
-//def BUFFER_ATOMIC_XOR : MUBUF_ <0x0000003b, "BUFFER_ATOMIC_XOR", []>;
-//def BUFFER_ATOMIC_INC : MUBUF_ <0x0000003c, "BUFFER_ATOMIC_INC", []>;
-//def BUFFER_ATOMIC_DEC : MUBUF_ <0x0000003d, "BUFFER_ATOMIC_DEC", []>;
-//def BUFFER_ATOMIC_FCMPSWAP : MUBUF_ <0x0000003e, "BUFFER_ATOMIC_FCMPSWAP", []>;
-//def BUFFER_ATOMIC_FMIN : MUBUF_ <0x0000003f, "BUFFER_ATOMIC_FMIN", []>;
-//def BUFFER_ATOMIC_FMAX : MUBUF_ <0x00000040, "BUFFER_ATOMIC_FMAX", []>;
-//def BUFFER_ATOMIC_SWAP_X2 : MUBUF_X2 <0x00000050, "BUFFER_ATOMIC_SWAP_X2", []>;
-//def BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_X2 <0x00000051, "BUFFER_ATOMIC_CMPSWAP_X2", []>;
-//def BUFFER_ATOMIC_ADD_X2 : MUBUF_X2 <0x00000052, "BUFFER_ATOMIC_ADD_X2", []>;
-//def BUFFER_ATOMIC_SUB_X2 : MUBUF_X2 <0x00000053, "BUFFER_ATOMIC_SUB_X2", []>;
-//def BUFFER_ATOMIC_RSUB_X2 : MUBUF_X2 <0x00000054, "BUFFER_ATOMIC_RSUB_X2", []>;
-//def BUFFER_ATOMIC_SMIN_X2 : MUBUF_X2 <0x00000055, "BUFFER_ATOMIC_SMIN_X2", []>;
-//def BUFFER_ATOMIC_UMIN_X2 : MUBUF_X2 <0x00000056, "BUFFER_ATOMIC_UMIN_X2", []>;
-//def BUFFER_ATOMIC_SMAX_X2 : MUBUF_X2 <0x00000057, "BUFFER_ATOMIC_SMAX_X2", []>;
-//def BUFFER_ATOMIC_UMAX_X2 : MUBUF_X2 <0x00000058, "BUFFER_ATOMIC_UMAX_X2", []>;
-//def BUFFER_ATOMIC_AND_X2 : MUBUF_X2 <0x00000059, "BUFFER_ATOMIC_AND_X2", []>;
-//def BUFFER_ATOMIC_OR_X2 : MUBUF_X2 <0x0000005a, "BUFFER_ATOMIC_OR_X2", []>;
-//def BUFFER_ATOMIC_XOR_X2 : MUBUF_X2 <0x0000005b, "BUFFER_ATOMIC_XOR_X2", []>;
-//def BUFFER_ATOMIC_INC_X2 : MUBUF_X2 <0x0000005c, "BUFFER_ATOMIC_INC_X2", []>;
-//def BUFFER_ATOMIC_DEC_X2 : MUBUF_X2 <0x0000005d, "BUFFER_ATOMIC_DEC_X2", []>;
-//def BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_X2 <0x0000005e, "BUFFER_ATOMIC_FCMPSWAP_X2", []>;
-//def BUFFER_ATOMIC_FMIN_X2 : MUBUF_X2 <0x0000005f, "BUFFER_ATOMIC_FMIN_X2", []>;
-//def BUFFER_ATOMIC_FMAX_X2 : MUBUF_X2 <0x00000060, "BUFFER_ATOMIC_FMAX_X2", []>;
-//def BUFFER_WBINVL1_SC : MUBUF_WBINVL1 <0x00000070, "BUFFER_WBINVL1_SC", []>;
-//def BUFFER_WBINVL1 : MUBUF_WBINVL1 <0x00000071, "BUFFER_WBINVL1", []>;
+ 0x0000001e, "buffer_store_dwordx4", VReg_128, v4i32, global_store
+>;
+//def BUFFER_ATOMIC_SWAP : MUBUF_ <0x00000030, "buffer_atomic_swap", []>;
+defm BUFFER_ATOMIC_SWAP : MUBUF_Atomic <
+ 0x00000030, "buffer_atomic_swap", VGPR_32, i32, atomic_swap_global
+>;
+//def BUFFER_ATOMIC_CMPSWAP : MUBUF_ <0x00000031, "buffer_atomic_cmpswap", []>;
+defm BUFFER_ATOMIC_ADD : MUBUF_Atomic <
+ 0x00000032, "buffer_atomic_add", VGPR_32, i32, atomic_add_global
+>;
+defm BUFFER_ATOMIC_SUB : MUBUF_Atomic <
+ 0x00000033, "buffer_atomic_sub", VGPR_32, i32, atomic_sub_global
+>;
+//def BUFFER_ATOMIC_RSUB : MUBUF_ <0x00000034, "buffer_atomic_rsub", []>;
+defm BUFFER_ATOMIC_SMIN : MUBUF_Atomic <
+ 0x00000035, "buffer_atomic_smin", VGPR_32, i32, atomic_min_global
+>;
+defm BUFFER_ATOMIC_UMIN : MUBUF_Atomic <
+ 0x00000036, "buffer_atomic_umin", VGPR_32, i32, atomic_umin_global
+>;
+defm BUFFER_ATOMIC_SMAX : MUBUF_Atomic <
+ 0x00000037, "buffer_atomic_smax", VGPR_32, i32, atomic_max_global
+>;
+defm BUFFER_ATOMIC_UMAX : MUBUF_Atomic <
+ 0x00000038, "buffer_atomic_umax", VGPR_32, i32, atomic_umax_global
+>;
+defm BUFFER_ATOMIC_AND : MUBUF_Atomic <
+ 0x00000039, "buffer_atomic_and", VGPR_32, i32, atomic_and_global
+>;
+defm BUFFER_ATOMIC_OR : MUBUF_Atomic <
+ 0x0000003a, "buffer_atomic_or", VGPR_32, i32, atomic_or_global
+>;
+defm BUFFER_ATOMIC_XOR : MUBUF_Atomic <
+ 0x0000003b, "buffer_atomic_xor", VGPR_32, i32, atomic_xor_global
+>;
+//def BUFFER_ATOMIC_INC : MUBUF_ <0x0000003c, "buffer_atomic_inc", []>;
+//def BUFFER_ATOMIC_DEC : MUBUF_ <0x0000003d, "buffer_atomic_dec", []>;
+//def BUFFER_ATOMIC_FCMPSWAP : MUBUF_ <0x0000003e, "buffer_atomic_fcmpswap", []>;
+//def BUFFER_ATOMIC_FMIN : MUBUF_ <0x0000003f, "buffer_atomic_fmin", []>;
+//def BUFFER_ATOMIC_FMAX : MUBUF_ <0x00000040, "buffer_atomic_fmax", []>;
+//def BUFFER_ATOMIC_SWAP_X2 : MUBUF_X2 <0x00000050, "buffer_atomic_swap_x2", []>;
+//def BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_X2 <0x00000051, "buffer_atomic_cmpswap_x2", []>;
+//def BUFFER_ATOMIC_ADD_X2 : MUBUF_X2 <0x00000052, "buffer_atomic_add_x2", []>;
+//def BUFFER_ATOMIC_SUB_X2 : MUBUF_X2 <0x00000053, "buffer_atomic_sub_x2", []>;
+//def BUFFER_ATOMIC_RSUB_X2 : MUBUF_X2 <0x00000054, "buffer_atomic_rsub_x2", []>;
+//def BUFFER_ATOMIC_SMIN_X2 : MUBUF_X2 <0x00000055, "buffer_atomic_smin_x2", []>;
+//def BUFFER_ATOMIC_UMIN_X2 : MUBUF_X2 <0x00000056, "buffer_atomic_umin_x2", []>;
+//def BUFFER_ATOMIC_SMAX_X2 : MUBUF_X2 <0x00000057, "buffer_atomic_smax_x2", []>;
+//def BUFFER_ATOMIC_UMAX_X2 : MUBUF_X2 <0x00000058, "buffer_atomic_umax_x2", []>;
+//def BUFFER_ATOMIC_AND_X2 : MUBUF_X2 <0x00000059, "buffer_atomic_and_x2", []>;
+//def BUFFER_ATOMIC_OR_X2 : MUBUF_X2 <0x0000005a, "buffer_atomic_or_x2", []>;
+//def BUFFER_ATOMIC_XOR_X2 : MUBUF_X2 <0x0000005b, "buffer_atomic_xor_x2", []>;
+//def BUFFER_ATOMIC_INC_X2 : MUBUF_X2 <0x0000005c, "buffer_atomic_inc_x2", []>;
+//def BUFFER_ATOMIC_DEC_X2 : MUBUF_X2 <0x0000005d, "buffer_atomic_dec_x2", []>;
+//def BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_X2 <0x0000005e, "buffer_atomic_fcmpswap_x2", []>;
+//def BUFFER_ATOMIC_FMIN_X2 : MUBUF_X2 <0x0000005f, "buffer_atomic_fmin_x2", []>;
+//def BUFFER_ATOMIC_FMAX_X2 : MUBUF_X2 <0x00000060, "buffer_atomic_fmax_x2", []>;
+//def BUFFER_WBINVL1_SC : MUBUF_WBINVL1 <0x00000070, "buffer_wbinvl1_sc", []>;
+//def BUFFER_WBINVL1 : MUBUF_WBINVL1 <0x00000071, "buffer_wbinvl1", []>;
+
+} // End SubtargetPredicate = isSICI
//===----------------------------------------------------------------------===//
// MTBUF Instructions
//===----------------------------------------------------------------------===//
-//def TBUFFER_LOAD_FORMAT_X : MTBUF_ <0x00000000, "TBUFFER_LOAD_FORMAT_X", []>;
-//def TBUFFER_LOAD_FORMAT_XY : MTBUF_ <0x00000001, "TBUFFER_LOAD_FORMAT_XY", []>;
-//def TBUFFER_LOAD_FORMAT_XYZ : MTBUF_ <0x00000002, "TBUFFER_LOAD_FORMAT_XYZ", []>;
-def TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Load_Helper <0x00000003, "TBUFFER_LOAD_FORMAT_XYZW", VReg_128>;
-def TBUFFER_STORE_FORMAT_X : MTBUF_Store_Helper <0x00000004, "TBUFFER_STORE_FORMAT_X", VReg_32>;
-def TBUFFER_STORE_FORMAT_XY : MTBUF_Store_Helper <0x00000005, "TBUFFER_STORE_FORMAT_XY", VReg_64>;
-def TBUFFER_STORE_FORMAT_XYZ : MTBUF_Store_Helper <0x00000006, "TBUFFER_STORE_FORMAT_XYZ", VReg_128>;
-def TBUFFER_STORE_FORMAT_XYZW : MTBUF_Store_Helper <0x00000007, "TBUFFER_STORE_FORMAT_XYZW", VReg_128>;
+//def TBUFFER_LOAD_FORMAT_X : MTBUF_ <0x00000000, "tbuffer_load_format_x", []>;
+//def TBUFFER_LOAD_FORMAT_XY : MTBUF_ <0x00000001, "tbuffer_load_format_xy", []>;
+//def TBUFFER_LOAD_FORMAT_XYZ : MTBUF_ <0x00000002, "tbuffer_load_format_xyz", []>;
+defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Load_Helper <0x00000003, "tbuffer_load_format_xyzw", VReg_128>;
+defm TBUFFER_STORE_FORMAT_X : MTBUF_Store_Helper <0x00000004, "tbuffer_store_format_x", VGPR_32>;
+defm TBUFFER_STORE_FORMAT_XY : MTBUF_Store_Helper <0x00000005, "tbuffer_store_format_xy", VReg_64>;
+defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Store_Helper <0x00000006, "tbuffer_store_format_xyz", VReg_128>;
+defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Store_Helper <0x00000007, "tbuffer_store_format_xyzw", VReg_128>;
//===----------------------------------------------------------------------===//
// MIMG Instructions
//===----------------------------------------------------------------------===//
-defm IMAGE_LOAD : MIMG_NoSampler <0x00000000, "IMAGE_LOAD">;
-defm IMAGE_LOAD_MIP : MIMG_NoSampler <0x00000001, "IMAGE_LOAD_MIP">;
-//def IMAGE_LOAD_PCK : MIMG_NoPattern_ <"IMAGE_LOAD_PCK", 0x00000002>;
-//def IMAGE_LOAD_PCK_SGN : MIMG_NoPattern_ <"IMAGE_LOAD_PCK_SGN", 0x00000003>;
-//def IMAGE_LOAD_MIP_PCK : MIMG_NoPattern_ <"IMAGE_LOAD_MIP_PCK", 0x00000004>;
-//def IMAGE_LOAD_MIP_PCK_SGN : MIMG_NoPattern_ <"IMAGE_LOAD_MIP_PCK_SGN", 0x00000005>;
-//def IMAGE_STORE : MIMG_NoPattern_ <"IMAGE_STORE", 0x00000008>;
-//def IMAGE_STORE_MIP : MIMG_NoPattern_ <"IMAGE_STORE_MIP", 0x00000009>;
-//def IMAGE_STORE_PCK : MIMG_NoPattern_ <"IMAGE_STORE_PCK", 0x0000000a>;
-//def IMAGE_STORE_MIP_PCK : MIMG_NoPattern_ <"IMAGE_STORE_MIP_PCK", 0x0000000b>;
-defm IMAGE_GET_RESINFO : MIMG_NoSampler <0x0000000e, "IMAGE_GET_RESINFO">;
-//def IMAGE_ATOMIC_SWAP : MIMG_NoPattern_ <"IMAGE_ATOMIC_SWAP", 0x0000000f>;
-//def IMAGE_ATOMIC_CMPSWAP : MIMG_NoPattern_ <"IMAGE_ATOMIC_CMPSWAP", 0x00000010>;
-//def IMAGE_ATOMIC_ADD : MIMG_NoPattern_ <"IMAGE_ATOMIC_ADD", 0x00000011>;
-//def IMAGE_ATOMIC_SUB : MIMG_NoPattern_ <"IMAGE_ATOMIC_SUB", 0x00000012>;
-//def IMAGE_ATOMIC_RSUB : MIMG_NoPattern_ <"IMAGE_ATOMIC_RSUB", 0x00000013>;
-//def IMAGE_ATOMIC_SMIN : MIMG_NoPattern_ <"IMAGE_ATOMIC_SMIN", 0x00000014>;
-//def IMAGE_ATOMIC_UMIN : MIMG_NoPattern_ <"IMAGE_ATOMIC_UMIN", 0x00000015>;
-//def IMAGE_ATOMIC_SMAX : MIMG_NoPattern_ <"IMAGE_ATOMIC_SMAX", 0x00000016>;
-//def IMAGE_ATOMIC_UMAX : MIMG_NoPattern_ <"IMAGE_ATOMIC_UMAX", 0x00000017>;
-//def IMAGE_ATOMIC_AND : MIMG_NoPattern_ <"IMAGE_ATOMIC_AND", 0x00000018>;
-//def IMAGE_ATOMIC_OR : MIMG_NoPattern_ <"IMAGE_ATOMIC_OR", 0x00000019>;
-//def IMAGE_ATOMIC_XOR : MIMG_NoPattern_ <"IMAGE_ATOMIC_XOR", 0x0000001a>;
-//def IMAGE_ATOMIC_INC : MIMG_NoPattern_ <"IMAGE_ATOMIC_INC", 0x0000001b>;
-//def IMAGE_ATOMIC_DEC : MIMG_NoPattern_ <"IMAGE_ATOMIC_DEC", 0x0000001c>;
-//def IMAGE_ATOMIC_FCMPSWAP : MIMG_NoPattern_ <"IMAGE_ATOMIC_FCMPSWAP", 0x0000001d>;
-//def IMAGE_ATOMIC_FMIN : MIMG_NoPattern_ <"IMAGE_ATOMIC_FMIN", 0x0000001e>;
-//def IMAGE_ATOMIC_FMAX : MIMG_NoPattern_ <"IMAGE_ATOMIC_FMAX", 0x0000001f>;
-defm IMAGE_SAMPLE : MIMG_Sampler <0x00000020, "IMAGE_SAMPLE">;
-defm IMAGE_SAMPLE_CL : MIMG_Sampler <0x00000021, "IMAGE_SAMPLE_CL">;
-defm IMAGE_SAMPLE_D : MIMG_Sampler <0x00000022, "IMAGE_SAMPLE_D">;
-defm IMAGE_SAMPLE_D_CL : MIMG_Sampler <0x00000023, "IMAGE_SAMPLE_D_CL">;
-defm IMAGE_SAMPLE_L : MIMG_Sampler <0x00000024, "IMAGE_SAMPLE_L">;
-defm IMAGE_SAMPLE_B : MIMG_Sampler <0x00000025, "IMAGE_SAMPLE_B">;
-defm IMAGE_SAMPLE_B_CL : MIMG_Sampler <0x00000026, "IMAGE_SAMPLE_B_CL">;
-defm IMAGE_SAMPLE_LZ : MIMG_Sampler <0x00000027, "IMAGE_SAMPLE_LZ">;
-defm IMAGE_SAMPLE_C : MIMG_Sampler <0x00000028, "IMAGE_SAMPLE_C">;
-defm IMAGE_SAMPLE_C_CL : MIMG_Sampler <0x00000029, "IMAGE_SAMPLE_C_CL">;
-defm IMAGE_SAMPLE_C_D : MIMG_Sampler <0x0000002a, "IMAGE_SAMPLE_C_D">;
-defm IMAGE_SAMPLE_C_D_CL : MIMG_Sampler <0x0000002b, "IMAGE_SAMPLE_C_D_CL">;
-defm IMAGE_SAMPLE_C_L : MIMG_Sampler <0x0000002c, "IMAGE_SAMPLE_C_L">;
-defm IMAGE_SAMPLE_C_B : MIMG_Sampler <0x0000002d, "IMAGE_SAMPLE_C_B">;
-defm IMAGE_SAMPLE_C_B_CL : MIMG_Sampler <0x0000002e, "IMAGE_SAMPLE_C_B_CL">;
-defm IMAGE_SAMPLE_C_LZ : MIMG_Sampler <0x0000002f, "IMAGE_SAMPLE_C_LZ">;
-defm IMAGE_SAMPLE_O : MIMG_Sampler <0x00000030, "IMAGE_SAMPLE_O">;
-defm IMAGE_SAMPLE_CL_O : MIMG_Sampler <0x00000031, "IMAGE_SAMPLE_CL_O">;
-defm IMAGE_SAMPLE_D_O : MIMG_Sampler <0x00000032, "IMAGE_SAMPLE_D_O">;
-defm IMAGE_SAMPLE_D_CL_O : MIMG_Sampler <0x00000033, "IMAGE_SAMPLE_D_CL_O">;
-defm IMAGE_SAMPLE_L_O : MIMG_Sampler <0x00000034, "IMAGE_SAMPLE_L_O">;
-defm IMAGE_SAMPLE_B_O : MIMG_Sampler <0x00000035, "IMAGE_SAMPLE_B_O">;
-defm IMAGE_SAMPLE_B_CL_O : MIMG_Sampler <0x00000036, "IMAGE_SAMPLE_B_CL_O">;
-defm IMAGE_SAMPLE_LZ_O : MIMG_Sampler <0x00000037, "IMAGE_SAMPLE_LZ_O">;
-defm IMAGE_SAMPLE_C_O : MIMG_Sampler <0x00000038, "IMAGE_SAMPLE_C_O">;
-defm IMAGE_SAMPLE_C_CL_O : MIMG_Sampler <0x00000039, "IMAGE_SAMPLE_C_CL_O">;
-defm IMAGE_SAMPLE_C_D_O : MIMG_Sampler <0x0000003a, "IMAGE_SAMPLE_C_D_O">;
-defm IMAGE_SAMPLE_C_D_CL_O : MIMG_Sampler <0x0000003b, "IMAGE_SAMPLE_C_D_CL_O">;
-defm IMAGE_SAMPLE_C_L_O : MIMG_Sampler <0x0000003c, "IMAGE_SAMPLE_C_L_O">;
-defm IMAGE_SAMPLE_C_B_O : MIMG_Sampler <0x0000003d, "IMAGE_SAMPLE_C_B_O">;
-defm IMAGE_SAMPLE_C_B_CL_O : MIMG_Sampler <0x0000003e, "IMAGE_SAMPLE_C_B_CL_O">;
-defm IMAGE_SAMPLE_C_LZ_O : MIMG_Sampler <0x0000003f, "IMAGE_SAMPLE_C_LZ_O">;
-defm IMAGE_GATHER4 : MIMG_Gather <0x00000040, "IMAGE_GATHER4">;
-defm IMAGE_GATHER4_CL : MIMG_Gather <0x00000041, "IMAGE_GATHER4_CL">;
-defm IMAGE_GATHER4_L : MIMG_Gather <0x00000044, "IMAGE_GATHER4_L">;
-defm IMAGE_GATHER4_B : MIMG_Gather <0x00000045, "IMAGE_GATHER4_B">;
-defm IMAGE_GATHER4_B_CL : MIMG_Gather <0x00000046, "IMAGE_GATHER4_B_CL">;
-defm IMAGE_GATHER4_LZ : MIMG_Gather <0x00000047, "IMAGE_GATHER4_LZ">;
-defm IMAGE_GATHER4_C : MIMG_Gather <0x00000048, "IMAGE_GATHER4_C">;
-defm IMAGE_GATHER4_C_CL : MIMG_Gather <0x00000049, "IMAGE_GATHER4_C_CL">;
-defm IMAGE_GATHER4_C_L : MIMG_Gather <0x0000004c, "IMAGE_GATHER4_C_L">;
-defm IMAGE_GATHER4_C_B : MIMG_Gather <0x0000004d, "IMAGE_GATHER4_C_B">;
-defm IMAGE_GATHER4_C_B_CL : MIMG_Gather <0x0000004e, "IMAGE_GATHER4_C_B_CL">;
-defm IMAGE_GATHER4_C_LZ : MIMG_Gather <0x0000004f, "IMAGE_GATHER4_C_LZ">;
-defm IMAGE_GATHER4_O : MIMG_Gather <0x00000050, "IMAGE_GATHER4_O">;
-defm IMAGE_GATHER4_CL_O : MIMG_Gather <0x00000051, "IMAGE_GATHER4_CL_O">;
-defm IMAGE_GATHER4_L_O : MIMG_Gather <0x00000054, "IMAGE_GATHER4_L_O">;
-defm IMAGE_GATHER4_B_O : MIMG_Gather <0x00000055, "IMAGE_GATHER4_B_O">;
-defm IMAGE_GATHER4_B_CL_O : MIMG_Gather <0x00000056, "IMAGE_GATHER4_B_CL_O">;
-defm IMAGE_GATHER4_LZ_O : MIMG_Gather <0x00000057, "IMAGE_GATHER4_LZ_O">;
-defm IMAGE_GATHER4_C_O : MIMG_Gather <0x00000058, "IMAGE_GATHER4_C_O">;
-defm IMAGE_GATHER4_C_CL_O : MIMG_Gather <0x00000059, "IMAGE_GATHER4_C_CL_O">;
-defm IMAGE_GATHER4_C_L_O : MIMG_Gather <0x0000005c, "IMAGE_GATHER4_C_L_O">;
-defm IMAGE_GATHER4_C_B_O : MIMG_Gather <0x0000005d, "IMAGE_GATHER4_C_B_O">;
-defm IMAGE_GATHER4_C_B_CL_O : MIMG_Gather <0x0000005e, "IMAGE_GATHER4_C_B_CL_O">;
-defm IMAGE_GATHER4_C_LZ_O : MIMG_Gather <0x0000005f, "IMAGE_GATHER4_C_LZ_O">;
-defm IMAGE_GET_LOD : MIMG_Sampler <0x00000060, "IMAGE_GET_LOD">;
-defm IMAGE_SAMPLE_CD : MIMG_Sampler <0x00000068, "IMAGE_SAMPLE_CD">;
-defm IMAGE_SAMPLE_CD_CL : MIMG_Sampler <0x00000069, "IMAGE_SAMPLE_CD_CL">;
-defm IMAGE_SAMPLE_C_CD : MIMG_Sampler <0x0000006a, "IMAGE_SAMPLE_C_CD">;
-defm IMAGE_SAMPLE_C_CD_CL : MIMG_Sampler <0x0000006b, "IMAGE_SAMPLE_C_CD_CL">;
-defm IMAGE_SAMPLE_CD_O : MIMG_Sampler <0x0000006c, "IMAGE_SAMPLE_CD_O">;
-defm IMAGE_SAMPLE_CD_CL_O : MIMG_Sampler <0x0000006d, "IMAGE_SAMPLE_CD_CL_O">;
-defm IMAGE_SAMPLE_C_CD_O : MIMG_Sampler <0x0000006e, "IMAGE_SAMPLE_C_CD_O">;
-defm IMAGE_SAMPLE_C_CD_CL_O : MIMG_Sampler <0x0000006f, "IMAGE_SAMPLE_C_CD_CL_O">;
-//def IMAGE_RSRC256 : MIMG_NoPattern_RSRC256 <"IMAGE_RSRC256", 0x0000007e>;
-//def IMAGE_SAMPLER : MIMG_NoPattern_ <"IMAGE_SAMPLER", 0x0000007f>;
+defm IMAGE_LOAD : MIMG_NoSampler <0x00000000, "image_load">;
+defm IMAGE_LOAD_MIP : MIMG_NoSampler <0x00000001, "image_load_mip">;
+//def IMAGE_LOAD_PCK : MIMG_NoPattern_ <"image_load_pck", 0x00000002>;
+//def IMAGE_LOAD_PCK_SGN : MIMG_NoPattern_ <"image_load_pck_sgn", 0x00000003>;
+//def IMAGE_LOAD_MIP_PCK : MIMG_NoPattern_ <"image_load_mip_pck", 0x00000004>;
+//def IMAGE_LOAD_MIP_PCK_SGN : MIMG_NoPattern_ <"image_load_mip_pck_sgn", 0x00000005>;
+//def IMAGE_STORE : MIMG_NoPattern_ <"image_store", 0x00000008>;
+//def IMAGE_STORE_MIP : MIMG_NoPattern_ <"image_store_mip", 0x00000009>;
+//def IMAGE_STORE_PCK : MIMG_NoPattern_ <"image_store_pck", 0x0000000a>;
+//def IMAGE_STORE_MIP_PCK : MIMG_NoPattern_ <"image_store_mip_pck", 0x0000000b>;
+defm IMAGE_GET_RESINFO : MIMG_NoSampler <0x0000000e, "image_get_resinfo">;
+//def IMAGE_ATOMIC_SWAP : MIMG_NoPattern_ <"image_atomic_swap", 0x0000000f>;
+//def IMAGE_ATOMIC_CMPSWAP : MIMG_NoPattern_ <"image_atomic_cmpswap", 0x00000010>;
+//def IMAGE_ATOMIC_ADD : MIMG_NoPattern_ <"image_atomic_add", 0x00000011>;
+//def IMAGE_ATOMIC_SUB : MIMG_NoPattern_ <"image_atomic_sub", 0x00000012>;
+//def IMAGE_ATOMIC_RSUB : MIMG_NoPattern_ <"image_atomic_rsub", 0x00000013>;
+//def IMAGE_ATOMIC_SMIN : MIMG_NoPattern_ <"image_atomic_smin", 0x00000014>;
+//def IMAGE_ATOMIC_UMIN : MIMG_NoPattern_ <"image_atomic_umin", 0x00000015>;
+//def IMAGE_ATOMIC_SMAX : MIMG_NoPattern_ <"image_atomic_smax", 0x00000016>;
+//def IMAGE_ATOMIC_UMAX : MIMG_NoPattern_ <"image_atomic_umax", 0x00000017>;
+//def IMAGE_ATOMIC_AND : MIMG_NoPattern_ <"image_atomic_and", 0x00000018>;
+//def IMAGE_ATOMIC_OR : MIMG_NoPattern_ <"image_atomic_or", 0x00000019>;
+//def IMAGE_ATOMIC_XOR : MIMG_NoPattern_ <"image_atomic_xor", 0x0000001a>;
+//def IMAGE_ATOMIC_INC : MIMG_NoPattern_ <"image_atomic_inc", 0x0000001b>;
+//def IMAGE_ATOMIC_DEC : MIMG_NoPattern_ <"image_atomic_dec", 0x0000001c>;
+//def IMAGE_ATOMIC_FCMPSWAP : MIMG_NoPattern_ <"image_atomic_fcmpswap", 0x0000001d>;
+//def IMAGE_ATOMIC_FMIN : MIMG_NoPattern_ <"image_atomic_fmin", 0x0000001e>;
+//def IMAGE_ATOMIC_FMAX : MIMG_NoPattern_ <"image_atomic_fmax", 0x0000001f>;
+defm IMAGE_SAMPLE : MIMG_Sampler <0x00000020, "image_sample">;
+defm IMAGE_SAMPLE_CL : MIMG_Sampler <0x00000021, "image_sample_cl">;
+defm IMAGE_SAMPLE_D : MIMG_Sampler <0x00000022, "image_sample_d">;
+defm IMAGE_SAMPLE_D_CL : MIMG_Sampler <0x00000023, "image_sample_d_cl">;
+defm IMAGE_SAMPLE_L : MIMG_Sampler <0x00000024, "image_sample_l">;
+defm IMAGE_SAMPLE_B : MIMG_Sampler <0x00000025, "image_sample_b">;
+defm IMAGE_SAMPLE_B_CL : MIMG_Sampler <0x00000026, "image_sample_b_cl">;
+defm IMAGE_SAMPLE_LZ : MIMG_Sampler <0x00000027, "image_sample_lz">;
+defm IMAGE_SAMPLE_C : MIMG_Sampler <0x00000028, "image_sample_c">;
+defm IMAGE_SAMPLE_C_CL : MIMG_Sampler <0x00000029, "image_sample_c_cl">;
+defm IMAGE_SAMPLE_C_D : MIMG_Sampler <0x0000002a, "image_sample_c_d">;
+defm IMAGE_SAMPLE_C_D_CL : MIMG_Sampler <0x0000002b, "image_sample_c_d_cl">;
+defm IMAGE_SAMPLE_C_L : MIMG_Sampler <0x0000002c, "image_sample_c_l">;
+defm IMAGE_SAMPLE_C_B : MIMG_Sampler <0x0000002d, "image_sample_c_b">;
+defm IMAGE_SAMPLE_C_B_CL : MIMG_Sampler <0x0000002e, "image_sample_c_b_cl">;
+defm IMAGE_SAMPLE_C_LZ : MIMG_Sampler <0x0000002f, "image_sample_c_lz">;
+defm IMAGE_SAMPLE_O : MIMG_Sampler <0x00000030, "image_sample_o">;
+defm IMAGE_SAMPLE_CL_O : MIMG_Sampler <0x00000031, "image_sample_cl_o">;
+defm IMAGE_SAMPLE_D_O : MIMG_Sampler <0x00000032, "image_sample_d_o">;
+defm IMAGE_SAMPLE_D_CL_O : MIMG_Sampler <0x00000033, "image_sample_d_cl_o">;
+defm IMAGE_SAMPLE_L_O : MIMG_Sampler <0x00000034, "image_sample_l_o">;
+defm IMAGE_SAMPLE_B_O : MIMG_Sampler <0x00000035, "image_sample_b_o">;
+defm IMAGE_SAMPLE_B_CL_O : MIMG_Sampler <0x00000036, "image_sample_b_cl_o">;
+defm IMAGE_SAMPLE_LZ_O : MIMG_Sampler <0x00000037, "image_sample_lz_o">;
+defm IMAGE_SAMPLE_C_O : MIMG_Sampler <0x00000038, "image_sample_c_o">;
+defm IMAGE_SAMPLE_C_CL_O : MIMG_Sampler <0x00000039, "image_sample_c_cl_o">;
+defm IMAGE_SAMPLE_C_D_O : MIMG_Sampler <0x0000003a, "image_sample_c_d_o">;
+defm IMAGE_SAMPLE_C_D_CL_O : MIMG_Sampler <0x0000003b, "image_sample_c_d_cl_o">;
+defm IMAGE_SAMPLE_C_L_O : MIMG_Sampler <0x0000003c, "image_sample_c_l_o">;
+defm IMAGE_SAMPLE_C_B_O : MIMG_Sampler <0x0000003d, "image_sample_c_b_o">;
+defm IMAGE_SAMPLE_C_B_CL_O : MIMG_Sampler <0x0000003e, "image_sample_c_b_cl_o">;
+defm IMAGE_SAMPLE_C_LZ_O : MIMG_Sampler <0x0000003f, "image_sample_c_lz_o">;
+defm IMAGE_GATHER4 : MIMG_Gather <0x00000040, "image_gather4">;
+defm IMAGE_GATHER4_CL : MIMG_Gather <0x00000041, "image_gather4_cl">;
+defm IMAGE_GATHER4_L : MIMG_Gather <0x00000044, "image_gather4_l">;
+defm IMAGE_GATHER4_B : MIMG_Gather <0x00000045, "image_gather4_b">;
+defm IMAGE_GATHER4_B_CL : MIMG_Gather <0x00000046, "image_gather4_b_cl">;
+defm IMAGE_GATHER4_LZ : MIMG_Gather <0x00000047, "image_gather4_lz">;
+defm IMAGE_GATHER4_C : MIMG_Gather <0x00000048, "image_gather4_c">;
+defm IMAGE_GATHER4_C_CL : MIMG_Gather <0x00000049, "image_gather4_c_cl">;
+defm IMAGE_GATHER4_C_L : MIMG_Gather <0x0000004c, "image_gather4_c_l">;
+defm IMAGE_GATHER4_C_B : MIMG_Gather <0x0000004d, "image_gather4_c_b">;
+defm IMAGE_GATHER4_C_B_CL : MIMG_Gather <0x0000004e, "image_gather4_c_b_cl">;
+defm IMAGE_GATHER4_C_LZ : MIMG_Gather <0x0000004f, "image_gather4_c_lz">;
+defm IMAGE_GATHER4_O : MIMG_Gather <0x00000050, "image_gather4_o">;
+defm IMAGE_GATHER4_CL_O : MIMG_Gather <0x00000051, "image_gather4_cl_o">;
+defm IMAGE_GATHER4_L_O : MIMG_Gather <0x00000054, "image_gather4_l_o">;
+defm IMAGE_GATHER4_B_O : MIMG_Gather <0x00000055, "image_gather4_b_o">;
+defm IMAGE_GATHER4_B_CL_O : MIMG_Gather <0x00000056, "image_gather4_b_cl_o">;
+defm IMAGE_GATHER4_LZ_O : MIMG_Gather <0x00000057, "image_gather4_lz_o">;
+defm IMAGE_GATHER4_C_O : MIMG_Gather <0x00000058, "image_gather4_c_o">;
+defm IMAGE_GATHER4_C_CL_O : MIMG_Gather <0x00000059, "image_gather4_c_cl_o">;
+defm IMAGE_GATHER4_C_L_O : MIMG_Gather <0x0000005c, "image_gather4_c_l_o">;
+defm IMAGE_GATHER4_C_B_O : MIMG_Gather <0x0000005d, "image_gather4_c_b_o">;
+defm IMAGE_GATHER4_C_B_CL_O : MIMG_Gather <0x0000005e, "image_gather4_c_b_cl_o">;
+defm IMAGE_GATHER4_C_LZ_O : MIMG_Gather <0x0000005f, "image_gather4_c_lz_o">;
+defm IMAGE_GET_LOD : MIMG_Sampler <0x00000060, "image_get_lod">;
+defm IMAGE_SAMPLE_CD : MIMG_Sampler <0x00000068, "image_sample_cd">;
+defm IMAGE_SAMPLE_CD_CL : MIMG_Sampler <0x00000069, "image_sample_cd_cl">;
+defm IMAGE_SAMPLE_C_CD : MIMG_Sampler <0x0000006a, "image_sample_c_cd">;
+defm IMAGE_SAMPLE_C_CD_CL : MIMG_Sampler <0x0000006b, "image_sample_c_cd_cl">;
+defm IMAGE_SAMPLE_CD_O : MIMG_Sampler <0x0000006c, "image_sample_cd_o">;
+defm IMAGE_SAMPLE_CD_CL_O : MIMG_Sampler <0x0000006d, "image_sample_cd_cl_o">;
+defm IMAGE_SAMPLE_C_CD_O : MIMG_Sampler <0x0000006e, "image_sample_c_cd_o">;
+defm IMAGE_SAMPLE_C_CD_CL_O : MIMG_Sampler <0x0000006f, "image_sample_c_cd_cl_o">;
+//def IMAGE_RSRC256 : MIMG_NoPattern_RSRC256 <"image_rsrc256", 0x0000007e>;
+//def IMAGE_SAMPLER : MIMG_NoPattern_ <"image_sampler", 0x0000007f>;
+
+//===----------------------------------------------------------------------===//
+// Flat Instructions
+//===----------------------------------------------------------------------===//
+let Predicates = [HasFlatAddressSpace] in {
+def FLAT_LOAD_UBYTE : FLAT_Load_Helper <0x00000008, "flat_load_ubyte", VGPR_32>;
+def FLAT_LOAD_SBYTE : FLAT_Load_Helper <0x00000009, "flat_load_sbyte", VGPR_32>;
+def FLAT_LOAD_USHORT : FLAT_Load_Helper <0x0000000a, "flat_load_ushort", VGPR_32>;
+def FLAT_LOAD_SSHORT : FLAT_Load_Helper <0x0000000b, "flat_load_sshort", VGPR_32>;
+def FLAT_LOAD_DWORD : FLAT_Load_Helper <0x0000000c, "flat_load_dword", VGPR_32>;
+def FLAT_LOAD_DWORDX2 : FLAT_Load_Helper <0x0000000d, "flat_load_dwordx2", VReg_64>;
+def FLAT_LOAD_DWORDX4 : FLAT_Load_Helper <0x0000000e, "flat_load_dwordx4", VReg_128>;
+def FLAT_LOAD_DWORDX3 : FLAT_Load_Helper <0x00000010, "flat_load_dwordx3", VReg_96>;
+
+def FLAT_STORE_BYTE : FLAT_Store_Helper <
+ 0x00000018, "flat_store_byte", VGPR_32
+>;
+
+def FLAT_STORE_SHORT : FLAT_Store_Helper <
+ 0x0000001a, "flat_store_short", VGPR_32
+>;
+
+def FLAT_STORE_DWORD : FLAT_Store_Helper <
+ 0x0000001c, "flat_store_dword", VGPR_32
+>;
+
+def FLAT_STORE_DWORDX2 : FLAT_Store_Helper <
+ 0x0000001d, "flat_store_dwordx2", VReg_64
+>;
+
+def FLAT_STORE_DWORDX4 : FLAT_Store_Helper <
+ 0x0000001e, "flat_store_dwordx4", VReg_128
+>;
+
+def FLAT_STORE_DWORDX3 : FLAT_Store_Helper <
+ 0x0000001e, "flat_store_dwordx3", VReg_96
+>;
+
+//def FLAT_ATOMIC_SWAP : FLAT_ <0x00000030, "flat_atomic_swap", []>;
+//def FLAT_ATOMIC_CMPSWAP : FLAT_ <0x00000031, "flat_atomic_cmpswap", []>;
+//def FLAT_ATOMIC_ADD : FLAT_ <0x00000032, "flat_atomic_add", []>;
+//def FLAT_ATOMIC_SUB : FLAT_ <0x00000033, "flat_atomic_sub", []>;
+//def FLAT_ATOMIC_RSUB : FLAT_ <0x00000034, "flat_atomic_rsub", []>;
+//def FLAT_ATOMIC_SMIN : FLAT_ <0x00000035, "flat_atomic_smin", []>;
+//def FLAT_ATOMIC_UMIN : FLAT_ <0x00000036, "flat_atomic_umin", []>;
+//def FLAT_ATOMIC_SMAX : FLAT_ <0x00000037, "flat_atomic_smax", []>;
+//def FLAT_ATOMIC_UMAX : FLAT_ <0x00000038, "flat_atomic_umax", []>;
+//def FLAT_ATOMIC_AND : FLAT_ <0x00000039, "flat_atomic_and", []>;
+//def FLAT_ATOMIC_OR : FLAT_ <0x0000003a, "flat_atomic_or", []>;
+//def FLAT_ATOMIC_XOR : FLAT_ <0x0000003b, "flat_atomic_xor", []>;
+//def FLAT_ATOMIC_INC : FLAT_ <0x0000003c, "flat_atomic_inc", []>;
+//def FLAT_ATOMIC_DEC : FLAT_ <0x0000003d, "flat_atomic_dec", []>;
+//def FLAT_ATOMIC_FCMPSWAP : FLAT_ <0x0000003e, "flat_atomic_fcmpswap", []>;
+//def FLAT_ATOMIC_FMIN : FLAT_ <0x0000003f, "flat_atomic_fmin", []>;
+//def FLAT_ATOMIC_FMAX : FLAT_ <0x00000040, "flat_atomic_fmax", []>;
+//def FLAT_ATOMIC_SWAP_X2 : FLAT_X2 <0x00000050, "flat_atomic_swap_x2", []>;
+//def FLAT_ATOMIC_CMPSWAP_X2 : FLAT_X2 <0x00000051, "flat_atomic_cmpswap_x2", []>;
+//def FLAT_ATOMIC_ADD_X2 : FLAT_X2 <0x00000052, "flat_atomic_add_x2", []>;
+//def FLAT_ATOMIC_SUB_X2 : FLAT_X2 <0x00000053, "flat_atomic_sub_x2", []>;
+//def FLAT_ATOMIC_RSUB_X2 : FLAT_X2 <0x00000054, "flat_atomic_rsub_x2", []>;
+//def FLAT_ATOMIC_SMIN_X2 : FLAT_X2 <0x00000055, "flat_atomic_smin_x2", []>;
+//def FLAT_ATOMIC_UMIN_X2 : FLAT_X2 <0x00000056, "flat_atomic_umin_x2", []>;
+//def FLAT_ATOMIC_SMAX_X2 : FLAT_X2 <0x00000057, "flat_atomic_smax_x2", []>;
+//def FLAT_ATOMIC_UMAX_X2 : FLAT_X2 <0x00000058, "flat_atomic_umax_x2", []>;
+//def FLAT_ATOMIC_AND_X2 : FLAT_X2 <0x00000059, "flat_atomic_and_x2", []>;
+//def FLAT_ATOMIC_OR_X2 : FLAT_X2 <0x0000005a, "flat_atomic_or_x2", []>;
+//def FLAT_ATOMIC_XOR_X2 : FLAT_X2 <0x0000005b, "flat_atomic_xor_x2", []>;
+//def FLAT_ATOMIC_INC_X2 : FLAT_X2 <0x0000005c, "flat_atomic_inc_x2", []>;
+//def FLAT_ATOMIC_DEC_X2 : FLAT_X2 <0x0000005d, "flat_atomic_dec_x2", []>;
+//def FLAT_ATOMIC_FCMPSWAP_X2 : FLAT_X2 <0x0000005e, "flat_atomic_fcmpswap_x2", []>;
+//def FLAT_ATOMIC_FMIN_X2 : FLAT_X2 <0x0000005f, "flat_atomic_fmin_x2", []>;
+//def FLAT_ATOMIC_FMAX_X2 : FLAT_X2 <0x00000060, "flat_atomic_fmax_x2", []>;
+
+} // End HasFlatAddressSpace predicate
//===----------------------------------------------------------------------===//
// VOP1 Instructions
//===----------------------------------------------------------------------===//
-//def V_NOP : VOP1_ <0x00000000, "V_NOP", []>;
+//def V_NOP : VOP1_ <0x00000000, "v_nop", []>;
-let neverHasSideEffects = 1, isMoveImm = 1 in {
-defm V_MOV_B32 : VOP1_32 <0x00000001, "V_MOV_B32", []>;
-} // End neverHasSideEffects = 1, isMoveImm = 1
+let isMoveImm = 1 in {
+defm V_MOV_B32 : VOP1Inst <vop1<0x1>, "v_mov_b32", VOP_I32_I32>;
+} // End isMoveImm = 1
let Uses = [EXEC] in {
+// FIXME: Specify SchedRW for READFIRSTLANE_B32
+
def V_READFIRSTLANE_B32 : VOP1 <
0x00000002,
(outs SReg_32:$vdst),
- (ins VReg_32:$src0),
- "V_READFIRSTLANE_B32 $vdst, $src0",
+ (ins VGPR_32:$src0),
+ "v_readfirstlane_b32 $vdst, $src0",
[]
>;
}
-defm V_CVT_I32_F64 : VOP1_32_64 <0x00000003, "V_CVT_I32_F64",
- [(set i32:$dst, (fp_to_sint f64:$src0))]
+let SchedRW = [WriteQuarterRate32] in {
+
+defm V_CVT_I32_F64 : VOP1Inst <vop1<0x3>, "v_cvt_i32_f64",
+ VOP_I32_F64, fp_to_sint
>;
-defm V_CVT_F64_I32 : VOP1_64_32 <0x00000004, "V_CVT_F64_I32",
- [(set f64:$dst, (sint_to_fp i32:$src0))]
+defm V_CVT_F64_I32 : VOP1Inst <vop1<0x4>, "v_cvt_f64_i32",
+ VOP_F64_I32, sint_to_fp
>;
-defm V_CVT_F32_I32 : VOP1_32 <0x00000005, "V_CVT_F32_I32",
- [(set f32:$dst, (sint_to_fp i32:$src0))]
+defm V_CVT_F32_I32 : VOP1Inst <vop1<0x5>, "v_cvt_f32_i32",
+ VOP_F32_I32, sint_to_fp
>;
-defm V_CVT_F32_U32 : VOP1_32 <0x00000006, "V_CVT_F32_U32",
- [(set f32:$dst, (uint_to_fp i32:$src0))]
+defm V_CVT_F32_U32 : VOP1Inst <vop1<0x6>, "v_cvt_f32_u32",
+ VOP_F32_I32, uint_to_fp
>;
-defm V_CVT_U32_F32 : VOP1_32 <0x00000007, "V_CVT_U32_F32",
- [(set i32:$dst, (fp_to_uint f32:$src0))]
+defm V_CVT_U32_F32 : VOP1Inst <vop1<0x7>, "v_cvt_u32_f32",
+ VOP_I32_F32, fp_to_uint
>;
-defm V_CVT_I32_F32 : VOP1_32 <0x00000008, "V_CVT_I32_F32",
- [(set i32:$dst, (fp_to_sint f32:$src0))]
+defm V_CVT_I32_F32 : VOP1Inst <vop1<0x8>, "v_cvt_i32_f32",
+ VOP_I32_F32, fp_to_sint
>;
-defm V_MOV_FED_B32 : VOP1_32 <0x00000009, "V_MOV_FED_B32", []>;
-defm V_CVT_F16_F32 : VOP1_32 <0x0000000a, "V_CVT_F16_F32",
- [(set i32:$dst, (fp_to_f16 f32:$src0))]
+defm V_MOV_FED_B32 : VOP1Inst <vop1<0x9>, "v_mov_fed_b32", VOP_I32_I32>;
+defm V_CVT_F16_F32 : VOP1Inst <vop1<0xa>, "v_cvt_f16_f32",
+ VOP_I32_F32, fp_to_f16
>;
-defm V_CVT_F32_F16 : VOP1_32 <0x0000000b, "V_CVT_F32_F16",
- [(set f32:$dst, (f16_to_fp i32:$src0))]
+defm V_CVT_F32_F16 : VOP1Inst <vop1<0xb>, "v_cvt_f32_f16",
+ VOP_F32_I32, f16_to_fp
>;
-//defm V_CVT_RPI_I32_F32 : VOP1_32 <0x0000000c, "V_CVT_RPI_I32_F32", []>;
-//defm V_CVT_FLR_I32_F32 : VOP1_32 <0x0000000d, "V_CVT_FLR_I32_F32", []>;
-//defm V_CVT_OFF_F32_I4 : VOP1_32 <0x0000000e, "V_CVT_OFF_F32_I4", []>;
-defm V_CVT_F32_F64 : VOP1_32_64 <0x0000000f, "V_CVT_F32_F64",
- [(set f32:$dst, (fround f64:$src0))]
+//defm V_CVT_RPI_I32_F32 : VOP1_32 <0x0000000c, "v_cvt_rpi_i32_f32", []>;
+//defm V_CVT_FLR_I32_F32 : VOP1_32 <0x0000000d, "v_cvt_flr_i32_f32", []>;
+//defm V_CVT_OFF_F32_I4 : VOP1_32 <0x0000000e, "v_cvt_off_f32_i4", []>;
+defm V_CVT_F32_F64 : VOP1Inst <vop1<0xf>, "v_cvt_f32_f64",
+ VOP_F32_F64, fround
>;
-defm V_CVT_F64_F32 : VOP1_64_32 <0x00000010, "V_CVT_F64_F32",
- [(set f64:$dst, (fextend f32:$src0))]
+defm V_CVT_F64_F32 : VOP1Inst <vop1<0x10>, "v_cvt_f64_f32",
+ VOP_F64_F32, fextend
>;
-defm V_CVT_F32_UBYTE0 : VOP1_32 <0x00000011, "V_CVT_F32_UBYTE0",
- [(set f32:$dst, (AMDGPUcvt_f32_ubyte0 i32:$src0))]
+defm V_CVT_F32_UBYTE0 : VOP1Inst <vop1<0x11>, "v_cvt_f32_ubyte0",
+ VOP_F32_I32, AMDGPUcvt_f32_ubyte0
>;
-defm V_CVT_F32_UBYTE1 : VOP1_32 <0x00000012, "V_CVT_F32_UBYTE1",
- [(set f32:$dst, (AMDGPUcvt_f32_ubyte1 i32:$src0))]
+defm V_CVT_F32_UBYTE1 : VOP1Inst <vop1<0x12>, "v_cvt_f32_ubyte1",
+ VOP_F32_I32, AMDGPUcvt_f32_ubyte1
>;
-defm V_CVT_F32_UBYTE2 : VOP1_32 <0x00000013, "V_CVT_F32_UBYTE2",
- [(set f32:$dst, (AMDGPUcvt_f32_ubyte2 i32:$src0))]
+defm V_CVT_F32_UBYTE2 : VOP1Inst <vop1<0x13>, "v_cvt_f32_ubyte2",
+ VOP_F32_I32, AMDGPUcvt_f32_ubyte2
>;
-defm V_CVT_F32_UBYTE3 : VOP1_32 <0x00000014, "V_CVT_F32_UBYTE3",
- [(set f32:$dst, (AMDGPUcvt_f32_ubyte3 i32:$src0))]
+defm V_CVT_F32_UBYTE3 : VOP1Inst <vop1<0x14>, "v_cvt_f32_ubyte3",
+ VOP_F32_I32, AMDGPUcvt_f32_ubyte3
>;
-defm V_CVT_U32_F64 : VOP1_32_64 <0x00000015, "V_CVT_U32_F64",
- [(set i32:$dst, (fp_to_uint f64:$src0))]
+defm V_CVT_U32_F64 : VOP1Inst <vop1<0x15>, "v_cvt_u32_f64",
+ VOP_I32_F64, fp_to_uint
>;
-defm V_CVT_F64_U32 : VOP1_64_32 <0x00000016, "V_CVT_F64_U32",
- [(set f64:$dst, (uint_to_fp i32:$src0))]
+defm V_CVT_F64_U32 : VOP1Inst <vop1<0x16>, "v_cvt_f64_u32",
+ VOP_F64_I32, uint_to_fp
>;
-defm V_FRACT_F32 : VOP1_32 <0x00000020, "V_FRACT_F32",
- [(set f32:$dst, (AMDGPUfract f32:$src0))]
->;
-defm V_TRUNC_F32 : VOP1_32 <0x00000021, "V_TRUNC_F32",
- [(set f32:$dst, (ftrunc f32:$src0))]
+} // let SchedRW = [WriteQuarterRate32]
+
+defm V_FRACT_F32 : VOP1Inst <vop1<0x20, 0x1b>, "v_fract_f32",
+ VOP_F32_F32, AMDGPUfract
>;
-defm V_CEIL_F32 : VOP1_32 <0x00000022, "V_CEIL_F32",
- [(set f32:$dst, (fceil f32:$src0))]
+defm V_TRUNC_F32 : VOP1Inst <vop1<0x21, 0x1c>, "v_trunc_f32",
+ VOP_F32_F32, ftrunc
>;
-defm V_RNDNE_F32 : VOP1_32 <0x00000023, "V_RNDNE_F32",
- [(set f32:$dst, (frint f32:$src0))]
+defm V_CEIL_F32 : VOP1Inst <vop1<0x22, 0x1d>, "v_ceil_f32",
+ VOP_F32_F32, fceil
>;
-defm V_FLOOR_F32 : VOP1_32 <0x00000024, "V_FLOOR_F32",
- [(set f32:$dst, (ffloor f32:$src0))]
+defm V_RNDNE_F32 : VOP1Inst <vop1<0x23, 0x1e>, "v_rndne_f32",
+ VOP_F32_F32, frint
>;
-defm V_EXP_F32 : VOP1_32 <0x00000025, "V_EXP_F32",
- [(set f32:$dst, (fexp2 f32:$src0))]
+defm V_FLOOR_F32 : VOP1Inst <vop1<0x24, 0x1f>, "v_floor_f32",
+ VOP_F32_F32, ffloor
>;
-defm V_LOG_CLAMP_F32 : VOP1_32 <0x00000026, "V_LOG_CLAMP_F32", []>;
-defm V_LOG_F32 : VOP1_32 <0x00000027, "V_LOG_F32",
- [(set f32:$dst, (flog2 f32:$src0))]
+defm V_EXP_F32 : VOP1Inst <vop1<0x25, 0x20>, "v_exp_f32",
+ VOP_F32_F32, fexp2
>;
-defm V_RCP_CLAMP_F32 : VOP1_32 <0x00000028, "V_RCP_CLAMP_F32", []>;
-defm V_RCP_LEGACY_F32 : VOP1_32 <0x00000029, "V_RCP_LEGACY_F32", []>;
-defm V_RCP_F32 : VOP1_32 <0x0000002a, "V_RCP_F32",
- [(set f32:$dst, (AMDGPUrcp f32:$src0))]
+let SchedRW = [WriteQuarterRate32] in {
+
+defm V_LOG_F32 : VOP1Inst <vop1<0x27, 0x21>, "v_log_f32",
+ VOP_F32_F32, flog2
>;
-defm V_RCP_IFLAG_F32 : VOP1_32 <0x0000002b, "V_RCP_IFLAG_F32", []>;
-defm V_RSQ_CLAMP_F32 : VOP1_32 <0x0000002c, "V_RSQ_CLAMP_F32",
- [(set f32:$dst, (AMDGPUrsq_clamped f32:$src0))]
+defm V_RCP_F32 : VOP1Inst <vop1<0x2a, 0x22>, "v_rcp_f32",
+ VOP_F32_F32, AMDGPUrcp
>;
-defm V_RSQ_LEGACY_F32 : VOP1_32 <
- 0x0000002d, "V_RSQ_LEGACY_F32",
- [(set f32:$dst, (AMDGPUrsq_legacy f32:$src0))]
+defm V_RCP_IFLAG_F32 : VOP1Inst <vop1<0x2b, 0x23>, "v_rcp_iflag_f32",
+ VOP_F32_F32
>;
-defm V_RSQ_F32 : VOP1_32 <0x0000002e, "V_RSQ_F32",
- [(set f32:$dst, (AMDGPUrsq f32:$src0))]
+defm V_RSQ_F32 : VOP1Inst <vop1<0x2e, 0x24>, "v_rsq_f32",
+ VOP_F32_F32, AMDGPUrsq
>;
-defm V_RCP_F64 : VOP1_64 <0x0000002f, "V_RCP_F64",
- [(set f64:$dst, (AMDGPUrcp f64:$src0))]
+
+} //let SchedRW = [WriteQuarterRate32]
+
+let SchedRW = [WriteDouble] in {
+
+defm V_RCP_F64 : VOP1Inst <vop1<0x2f, 0x25>, "v_rcp_f64",
+ VOP_F64_F64, AMDGPUrcp
>;
-defm V_RCP_CLAMP_F64 : VOP1_64 <0x00000030, "V_RCP_CLAMP_F64", []>;
-defm V_RSQ_F64 : VOP1_64 <0x00000031, "V_RSQ_F64",
- [(set f64:$dst, (AMDGPUrsq f64:$src0))]
+defm V_RSQ_F64 : VOP1Inst <vop1<0x31, 0x26>, "v_rsq_f64",
+ VOP_F64_F64, AMDGPUrsq
>;
-defm V_RSQ_CLAMP_F64 : VOP1_64 <0x00000032, "V_RSQ_CLAMP_F64",
- [(set f64:$dst, (AMDGPUrsq_clamped f64:$src0))]
+
+} // let SchedRW = [WriteDouble];
+
+defm V_SQRT_F32 : VOP1Inst <vop1<0x33, 0x27>, "v_sqrt_f32",
+ VOP_F32_F32, fsqrt
>;
-defm V_SQRT_F32 : VOP1_32 <0x00000033, "V_SQRT_F32",
- [(set f32:$dst, (fsqrt f32:$src0))]
+
+let SchedRW = [WriteDouble] in {
+
+defm V_SQRT_F64 : VOP1Inst <vop1<0x34, 0x28>, "v_sqrt_f64",
+ VOP_F64_F64, fsqrt
+>;
+
+} // let SchedRW = [WriteDouble]
+
+defm V_SIN_F32 : VOP1Inst <vop1<0x35, 0x29>, "v_sin_f32",
+ VOP_F32_F32, AMDGPUsin
>;
-defm V_SQRT_F64 : VOP1_64 <0x00000034, "V_SQRT_F64",
- [(set f64:$dst, (fsqrt f64:$src0))]
+defm V_COS_F32 : VOP1Inst <vop1<0x36, 0x2a>, "v_cos_f32",
+ VOP_F32_F32, AMDGPUcos
>;
-defm V_SIN_F32 : VOP1_32 <0x00000035, "V_SIN_F32",
- [(set f32:$dst, (AMDGPUsin f32:$src0))]
+defm V_NOT_B32 : VOP1Inst <vop1<0x37, 0x2b>, "v_not_b32", VOP_I32_I32>;
+defm V_BFREV_B32 : VOP1Inst <vop1<0x38, 0x2c>, "v_bfrev_b32", VOP_I32_I32>;
+defm V_FFBH_U32 : VOP1Inst <vop1<0x39, 0x2d>, "v_ffbh_u32", VOP_I32_I32>;
+defm V_FFBL_B32 : VOP1Inst <vop1<0x3a, 0x2e>, "v_ffbl_b32", VOP_I32_I32>;
+defm V_FFBH_I32 : VOP1Inst <vop1<0x3b, 0x2f>, "v_ffbh_i32", VOP_I32_I32>;
+//defm V_FREXP_EXP_I32_F64 : VOPInst <0x0000003c, "v_frexp_exp_i32_f64", VOP_I32_F32>;
+defm V_FREXP_MANT_F64 : VOP1Inst <vop1<0x3d, 0x31>, "v_frexp_mant_f64",
+ VOP_F64_F64
>;
-defm V_COS_F32 : VOP1_32 <0x00000036, "V_COS_F32",
- [(set f32:$dst, (AMDGPUcos f32:$src0))]
+defm V_FRACT_F64 : VOP1Inst <vop1<0x3e, 0x32>, "v_fract_f64", VOP_F64_F64>;
+//defm V_FREXP_EXP_I32_F32 : VOPInst <0x0000003f, "v_frexp_exp_i32_f32", VOP_I32_F32>;
+defm V_FREXP_MANT_F32 : VOP1Inst <vop1<0x40, 0x34>, "v_frexp_mant_f32",
+ VOP_F32_F32
>;
-defm V_NOT_B32 : VOP1_32 <0x00000037, "V_NOT_B32", []>;
-defm V_BFREV_B32 : VOP1_32 <0x00000038, "V_BFREV_B32", []>;
-defm V_FFBH_U32 : VOP1_32 <0x00000039, "V_FFBH_U32", []>;
-defm V_FFBL_B32 : VOP1_32 <0x0000003a, "V_FFBL_B32", []>;
-defm V_FFBH_I32 : VOP1_32 <0x0000003b, "V_FFBH_I32", []>;
-//defm V_FREXP_EXP_I32_F64 : VOP1_32 <0x0000003c, "V_FREXP_EXP_I32_F64", []>;
-defm V_FREXP_MANT_F64 : VOP1_64 <0x0000003d, "V_FREXP_MANT_F64", []>;
-defm V_FRACT_F64 : VOP1_64 <0x0000003e, "V_FRACT_F64", []>;
-//defm V_FREXP_EXP_I32_F32 : VOP1_32 <0x0000003f, "V_FREXP_EXP_I32_F32", []>;
-defm V_FREXP_MANT_F32 : VOP1_32 <0x00000040, "V_FREXP_MANT_F32", []>;
-//def V_CLREXCP : VOP1_ <0x00000041, "V_CLREXCP", []>;
-defm V_MOVRELD_B32 : VOP1_32 <0x00000042, "V_MOVRELD_B32", []>;
-defm V_MOVRELS_B32 : VOP1_32 <0x00000043, "V_MOVRELS_B32", []>;
-defm V_MOVRELSD_B32 : VOP1_32 <0x00000044, "V_MOVRELSD_B32", []>;
+//def V_CLREXCP : VOP1_ <0x00000041, "v_clrexcp", []>;
+defm V_MOVRELD_B32 : VOP1Inst <vop1<0x42, 0x36>, "v_movreld_b32", VOP_I32_I32>;
+defm V_MOVRELS_B32 : VOP1Inst <vop1<0x43, 0x37>, "v_movrels_b32", VOP_I32_I32>;
+defm V_MOVRELSD_B32 : VOP1Inst <vop1<0x44, 0x38>, "v_movrelsd_b32", VOP_I32_I32>;
+// These instructions only exist on SI and CI
+let SubtargetPredicate = isSICI in {
-//===----------------------------------------------------------------------===//
-// VINTRP Instructions
-//===----------------------------------------------------------------------===//
+let SchedRW = [WriteQuarterRate32] in {
-def V_INTERP_P1_F32 : VINTRP <
- 0x00000000,
- (outs VReg_32:$dst),
- (ins VReg_32:$i, i32imm:$attr_chan, i32imm:$attr, M0Reg:$m0),
- "V_INTERP_P1_F32 $dst, $i, $attr_chan, $attr, [$m0]",
- []> {
- let DisableEncoding = "$m0";
-}
+defm V_LOG_CLAMP_F32 : VOP1InstSI <vop1<0x26>, "v_log_clamp_f32", VOP_F32_F32>;
+defm V_RCP_CLAMP_F32 : VOP1InstSI <vop1<0x28>, "v_rcp_clamp_f32", VOP_F32_F32>;
+defm V_RCP_LEGACY_F32 : VOP1InstSI <vop1<0x29>, "v_rcp_legacy_f32", VOP_F32_F32>;
+defm V_RSQ_CLAMP_F32 : VOP1InstSI <vop1<0x2c>, "v_rsq_clamp_f32",
+ VOP_F32_F32, AMDGPUrsq_clamped
+>;
+defm V_RSQ_LEGACY_F32 : VOP1InstSI <vop1<0x2d>, "v_rsq_legacy_f32",
+ VOP_F32_F32, AMDGPUrsq_legacy
+>;
-def V_INTERP_P2_F32 : VINTRP <
- 0x00000001,
- (outs VReg_32:$dst),
- (ins VReg_32:$src0, VReg_32:$j, i32imm:$attr_chan, i32imm:$attr, M0Reg:$m0),
- "V_INTERP_P2_F32 $dst, [$src0], $j, $attr_chan, $attr, [$m0]",
- []> {
+} // End let SchedRW = [WriteQuarterRate32]
- let Constraints = "$src0 = $dst";
- let DisableEncoding = "$src0,$m0";
+let SchedRW = [WriteDouble] in {
-}
+defm V_RCP_CLAMP_F64 : VOP1InstSI <vop1<0x30>, "v_rcp_clamp_f64", VOP_F64_F64>;
+defm V_RSQ_CLAMP_F64 : VOP1InstSI <vop1<0x32>, "v_rsq_clamp_f64",
+ VOP_F64_F64, AMDGPUrsq_clamped
+>;
-def V_INTERP_MOV_F32 : VINTRP <
- 0x00000002,
- (outs VReg_32:$dst),
+} // End SchedRW = [WriteDouble]
+
+} // End SubtargetPredicate = isSICI
+
+//===----------------------------------------------------------------------===//
+// VINTRP Instructions
+//===----------------------------------------------------------------------===//
+
+// FIXME: Specify SchedRW for VINTRP instructions.
+defm V_INTERP_P1_F32 : VINTRP_m <
+ 0x00000000, "v_interp_p1_f32",
+ (outs VGPR_32:$dst),
+ (ins VGPR_32:$i, i32imm:$attr_chan, i32imm:$attr, M0Reg:$m0),
+ "v_interp_p1_f32 $dst, $i, $attr_chan, $attr, [$m0]",
+ "$m0">;
+
+defm V_INTERP_P2_F32 : VINTRP_m <
+ 0x00000001, "v_interp_p2_f32",
+ (outs VGPR_32:$dst),
+ (ins VGPR_32:$src0, VGPR_32:$j, i32imm:$attr_chan, i32imm:$attr, M0Reg:$m0),
+ "v_interp_p2_f32 $dst, [$src0], $j, $attr_chan, $attr, [$m0]",
+ "$src0,$m0",
+ "$src0 = $dst">;
+
+defm V_INTERP_MOV_F32 : VINTRP_m <
+ 0x00000002, "v_interp_mov_f32",
+ (outs VGPR_32:$dst),
(ins InterpSlot:$src0, i32imm:$attr_chan, i32imm:$attr, M0Reg:$m0),
- "V_INTERP_MOV_F32 $dst, $src0, $attr_chan, $attr, [$m0]",
- []> {
- let DisableEncoding = "$m0";
-}
+ "v_interp_mov_f32 $dst, $src0, $attr_chan, $attr, [$m0]",
+ "$m0">;
//===----------------------------------------------------------------------===//
// VOP2 Instructions
//===----------------------------------------------------------------------===//
-def V_CNDMASK_B32_e32 : VOP2 <0x00000000, (outs VReg_32:$dst),
- (ins VSrc_32:$src0, VReg_32:$src1, VCCReg:$vcc),
- "V_CNDMASK_B32_e32 $dst, $src0, $src1, [$vcc]",
- []
->{
- let DisableEncoding = "$vcc";
-}
+defm V_CNDMASK_B32_e64 : VOP3_m_nosrcmod <vop3<0x100>, (outs VGPR_32:$dst),
+ (ins VSrc_32:$src0, VSrc_32:$src1, SSrc_64:$src2),
+ "v_cndmask_b32_e64 $dst, $src0, $src1, $src2",
+ [(set i32:$dst, (select i1:$src2, i32:$src1, i32:$src0))],
+ "v_cndmask_b32_e64", 3
+>;
-def V_CNDMASK_B32_e64 : VOP3 <0x00000100, (outs VReg_32:$dst),
- (ins VSrc_32:$src0, VSrc_32:$src1, SSrc_64:$src2,
- InstFlag:$abs, InstFlag:$clamp, InstFlag:$omod, InstFlag:$neg),
- "V_CNDMASK_B32_e64 $dst, $src0, $src1, $src2, $abs, $clamp, $omod, $neg",
- [(set i32:$dst, (select i1:$src2, i32:$src1, i32:$src0))]
-> {
- let src0_modifiers = 0;
- let src1_modifiers = 0;
- let src2_modifiers = 0;
-}
-def V_READLANE_B32 : VOP2 <
- 0x00000001,
- (outs SReg_32:$vdst),
- (ins VReg_32:$src0, SSrc_32:$vsrc1),
- "V_READLANE_B32 $vdst, $src0, $vsrc1",
- []
+let isCommutable = 1 in {
+defm V_ADD_F32 : VOP2Inst <vop2<0x3, 0x1>, "v_add_f32",
+ VOP_F32_F32_F32, fadd
>;
-def V_WRITELANE_B32 : VOP2 <
- 0x00000002,
- (outs VReg_32:$vdst),
- (ins SReg_32:$src0, SSrc_32:$vsrc1),
- "V_WRITELANE_B32 $vdst, $src0, $vsrc1",
- []
+defm V_SUB_F32 : VOP2Inst <vop2<0x4, 0x2>, "v_sub_f32", VOP_F32_F32_F32, fsub>;
+defm V_SUBREV_F32 : VOP2Inst <vop2<0x5, 0x3>, "v_subrev_f32",
+ VOP_F32_F32_F32, null_frag, "v_sub_f32"
>;
+} // End isCommutable = 1
let isCommutable = 1 in {
-defm V_ADD_F32 : VOP2_32 <0x00000003, "V_ADD_F32",
- [(set f32:$dst, (fadd f32:$src0, f32:$src1))]
+
+defm V_MUL_LEGACY_F32 : VOP2Inst <vop2<0x7, 0x4>, "v_mul_legacy_f32",
+ VOP_F32_F32_F32, int_AMDGPU_mul
>;
-defm V_SUB_F32 : VOP2_32 <0x00000004, "V_SUB_F32",
- [(set f32:$dst, (fsub f32:$src0, f32:$src1))]
+defm V_MUL_F32 : VOP2Inst <vop2<0x8, 0x5>, "v_mul_f32",
+ VOP_F32_F32_F32, fmul
>;
-defm V_SUBREV_F32 : VOP2_32 <0x00000005, "V_SUBREV_F32", [], "V_SUB_F32">;
-} // End isCommutable = 1
-defm V_MAC_LEGACY_F32 : VOP2_32 <0x00000006, "V_MAC_LEGACY_F32", []>;
+defm V_MUL_I32_I24 : VOP2Inst <vop2<0x9, 0x6>, "v_mul_i32_i24",
+ VOP_I32_I32_I32, AMDGPUmul_i24
+>;
+//defm V_MUL_HI_I32_I24 : VOP2_32 <0x0000000a, "v_mul_hi_i32_i24", []>;
+defm V_MUL_U32_U24 : VOP2Inst <vop2<0xb, 0x8>, "v_mul_u32_u24",
+ VOP_I32_I32_I32, AMDGPUmul_u24
+>;
+//defm V_MUL_HI_U32_U24 : VOP2_32 <0x0000000c, "v_mul_hi_u32_u24", []>;
-let isCommutable = 1 in {
+defm V_MIN_F32 : VOP2Inst <vop2<0xf, 0xa>, "v_min_f32", VOP_F32_F32_F32,
+ fminnum>;
+defm V_MAX_F32 : VOP2Inst <vop2<0x10, 0xb>, "v_max_f32", VOP_F32_F32_F32,
+ fmaxnum>;
+defm V_MIN_I32 : VOP2Inst <vop2<0x11, 0xc>, "v_min_i32", VOP_I32_I32_I32,
+ AMDGPUsmin
+>;
+defm V_MAX_I32 : VOP2Inst <vop2<0x12, 0xd>, "v_max_i32", VOP_I32_I32_I32,
+ AMDGPUsmax
+>;
+defm V_MIN_U32 : VOP2Inst <vop2<0x13, 0xe>, "v_min_u32", VOP_I32_I32_I32,
+ AMDGPUumin
+>;
+defm V_MAX_U32 : VOP2Inst <vop2<0x14, 0xf>, "v_max_u32", VOP_I32_I32_I32,
+ AMDGPUumax
+>;
-defm V_MUL_LEGACY_F32 : VOP2_32 <
- 0x00000007, "V_MUL_LEGACY_F32",
- [(set f32:$dst, (int_AMDGPU_mul f32:$src0, f32:$src1))]
+// No non-Rev Op on VI
+defm V_LSHRREV_B32 : VOP2Inst <
+ vop2<0x16, 0x10>, "v_lshrrev_b32", VOP_I32_I32_I32, null_frag,
+ "v_lshr_b32", "v_lshrrev_b32"
>;
-defm V_MUL_F32 : VOP2_32 <0x00000008, "V_MUL_F32",
- [(set f32:$dst, (fmul f32:$src0, f32:$src1))]
+// No non-Rev Op on VI
+defm V_ASHRREV_I32 : VOP2Inst <
+ vop2<0x18, 0x11>, "v_ashrrev_i32", VOP_I32_I32_I32, null_frag,
+ "v_ashr_i32", "v_ashrrev_i32"
>;
+// No non-Rev Op on VI
+defm V_LSHLREV_B32 : VOP2Inst <
+ vop2<0x1a, 0x12>, "v_lshlrev_b32", VOP_I32_I32_I32, null_frag,
+ "v_lshl_b32", "v_lshlrev_b32"
+>;
-defm V_MUL_I32_I24 : VOP2_32 <0x00000009, "V_MUL_I32_I24",
- [(set i32:$dst, (AMDGPUmul_i24 i32:$src0, i32:$src1))]
+defm V_AND_B32 : VOP2Inst <vop2<0x1b, 0x13>, "v_and_b32",
+ VOP_I32_I32_I32, and>;
+defm V_OR_B32 : VOP2Inst <vop2<0x1c, 0x14>, "v_or_b32",
+ VOP_I32_I32_I32, or
>;
-//defm V_MUL_HI_I32_I24 : VOP2_32 <0x0000000a, "V_MUL_HI_I32_I24", []>;
-defm V_MUL_U32_U24 : VOP2_32 <0x0000000b, "V_MUL_U32_U24",
- [(set i32:$dst, (AMDGPUmul_u24 i32:$src0, i32:$src1))]
+defm V_XOR_B32 : VOP2Inst <vop2<0x1d, 0x15>, "v_xor_b32",
+ VOP_I32_I32_I32, xor
>;
-//defm V_MUL_HI_U32_U24 : VOP2_32 <0x0000000c, "V_MUL_HI_U32_U24", []>;
+defm V_MAC_F32 : VOP2Inst <vop2<0x1f, 0x16>, "v_mac_f32", VOP_F32_F32_F32>;
+} // End isCommutable = 1
+
+defm V_MADMK_F32 : VOP2Inst <vop2<0x20, 0x17>, "v_madmk_f32", VOP_F32_F32_F32>;
+
+let isCommutable = 1 in {
+defm V_MADAK_F32 : VOP2Inst <vop2<0x21, 0x18>, "v_madak_f32", VOP_F32_F32_F32>;
+} // End isCommutable = 1
+
+let isCommutable = 1, Defs = [VCC] in { // Carry-out goes to VCC
+// No patterns so that the scalar instructions are always selected.
+// The scalar versions will be replaced with the vector versions when needed later.
+
+// V_ADD_I32, V_SUB_I32, and V_SUBREV_I32 were renamed to *_U32 in VI,
+// but the VI instructions behave the same as the SI versions.
+defm V_ADD_I32 : VOP2bInst <vop2<0x25, 0x19>, "v_add_i32",
+ VOP_I32_I32_I32, add
+>;
+defm V_SUB_I32 : VOP2bInst <vop2<0x26, 0x1a>, "v_sub_i32",
+ VOP_I32_I32_I32, sub
+>;
-defm V_MIN_LEGACY_F32 : VOP2_32 <0x0000000d, "V_MIN_LEGACY_F32",
- [(set f32:$dst, (AMDGPUfmin f32:$src0, f32:$src1))]
+defm V_SUBREV_I32 : VOP2bInst <vop2<0x27, 0x1b>, "v_subrev_i32",
+ VOP_I32_I32_I32, null_frag, "v_sub_i32"
>;
-defm V_MAX_LEGACY_F32 : VOP2_32 <0x0000000e, "V_MAX_LEGACY_F32",
- [(set f32:$dst, (AMDGPUfmax f32:$src0, f32:$src1))]
+let Uses = [VCC] in { // Carry-in comes from VCC
+defm V_ADDC_U32 : VOP2bInst <vop2<0x28, 0x1c>, "v_addc_u32",
+ VOP_I32_I32_I32_VCC, adde
+>;
+defm V_SUBB_U32 : VOP2bInst <vop2<0x29, 0x1d>, "v_subb_u32",
+ VOP_I32_I32_I32_VCC, sube
+>;
+defm V_SUBBREV_U32 : VOP2bInst <vop2<0x2a, 0x1e>, "v_subbrev_u32",
+ VOP_I32_I32_I32_VCC, null_frag, "v_subb_u32"
>;
-defm V_MIN_F32 : VOP2_32 <0x0000000f, "V_MIN_F32", []>;
-defm V_MAX_F32 : VOP2_32 <0x00000010, "V_MAX_F32", []>;
-defm V_MIN_I32 : VOP2_32 <0x00000011, "V_MIN_I32",
- [(set i32:$dst, (AMDGPUsmin i32:$src0, i32:$src1))]>;
-defm V_MAX_I32 : VOP2_32 <0x00000012, "V_MAX_I32",
- [(set i32:$dst, (AMDGPUsmax i32:$src0, i32:$src1))]>;
-defm V_MIN_U32 : VOP2_32 <0x00000013, "V_MIN_U32",
- [(set i32:$dst, (AMDGPUumin i32:$src0, i32:$src1))]>;
-defm V_MAX_U32 : VOP2_32 <0x00000014, "V_MAX_U32",
- [(set i32:$dst, (AMDGPUumax i32:$src0, i32:$src1))]>;
+} // End Uses = [VCC]
+} // End isCommutable = 1, Defs = [VCC]
-defm V_LSHR_B32 : VOP2_32 <0x00000015, "V_LSHR_B32",
- [(set i32:$dst, (srl i32:$src0, i32:$src1))]
+// These instructions only exist on SI and CI
+let SubtargetPredicate = isSICI in {
+
+def V_READLANE_B32 : VOP2 <
+ 0x00000001,
+ (outs SReg_32:$vdst),
+ (ins VGPR_32:$src0, SSrc_32:$vsrc1),
+ "v_readlane_b32 $vdst, $src0, $vsrc1",
+ []
+>;
+
+def V_WRITELANE_B32 : VOP2 <
+ 0x00000002,
+ (outs VGPR_32:$vdst),
+ (ins SReg_32:$src0, SSrc_32:$vsrc1),
+ "v_writelane_b32 $vdst, $src0, $vsrc1",
+ []
>;
-defm V_LSHRREV_B32 : VOP2_32 <0x00000016, "V_LSHRREV_B32", [], "V_LSHR_B32">;
+let isCommutable = 1 in {
+defm V_MAC_LEGACY_F32 : VOP2Inst <vop2<0x6>, "v_mac_legacy_f32",
+ VOP_F32_F32_F32
+>;
+} // End isCommutable = 1
-defm V_ASHR_I32 : VOP2_32 <0x00000017, "V_ASHR_I32",
- [(set i32:$dst, (sra i32:$src0, i32:$src1))]
+defm V_MIN_LEGACY_F32 : VOP2Inst <vop2<0xd>, "v_min_legacy_f32",
+ VOP_F32_F32_F32, AMDGPUfmin_legacy
+>;
+defm V_MAX_LEGACY_F32 : VOP2Inst <vop2<0xe>, "v_max_legacy_f32",
+ VOP_F32_F32_F32, AMDGPUfmax_legacy
+>;
+
+let isCommutable = 1 in {
+defm V_LSHR_B32 : VOP2Inst <vop2<0x15>, "v_lshr_b32", VOP_I32_I32_I32, srl>;
+defm V_ASHR_I32 : VOP2Inst <vop2<0x17>, "v_ashr_i32",
+ VOP_I32_I32_I32, sra
>;
-defm V_ASHRREV_I32 : VOP2_32 <0x00000018, "V_ASHRREV_I32", [], "V_ASHR_I32">;
let hasPostISelHook = 1 in {
+defm V_LSHL_B32 : VOP2Inst <vop2<0x19>, "v_lshl_b32", VOP_I32_I32_I32, shl>;
+}
-defm V_LSHL_B32 : VOP2_32 <0x00000019, "V_LSHL_B32",
- [(set i32:$dst, (shl i32:$src0, i32:$src1))]
+} // End isCommutable = 1
+
+defm V_BFM_B32 : VOP2Inst <vop2<0x1e>, "v_bfm_b32", VOP_I32_I32_I32,
+ AMDGPUbfm>;
+defm V_BCNT_U32_B32 : VOP2Inst <vop2<0x22>, "v_bcnt_u32_b32", VOP_I32_I32_I32>;
+defm V_MBCNT_LO_U32_B32 : VOP2Inst <vop2<0x23>, "v_mbcnt_lo_u32_b32",
+ VOP_I32_I32_I32
+>;
+defm V_MBCNT_HI_U32_B32 : VOP2Inst <vop2<0x24>, "v_mbcnt_hi_u32_b32",
+ VOP_I32_I32_I32
+>;
+defm V_LDEXP_F32 : VOP2Inst <vop2<0x2b>, "v_ldexp_f32",
+ VOP_F32_F32_I32, AMDGPUldexp
>;
-}
-defm V_LSHLREV_B32 : VOP2_32 <0x0000001a, "V_LSHLREV_B32", [], "V_LSHL_B32">;
+////def V_CVT_PKACCUM_U8_F32 : VOP2_U8 <0x0000002c, "v_cvt_pkaccum_u8_f32", []>;
+////def V_CVT_PKNORM_I16_F32 : VOP2_I16 <0x0000002d, "v_cvt_pknorm_i16_f32", []>;
+////def V_CVT_PKNORM_U16_F32 : VOP2_U16 <0x0000002e, "v_cvt_pknorm_u16_f32", []>;
+defm V_CVT_PKRTZ_F16_F32 : VOP2Inst <vop2<0x2f>, "v_cvt_pkrtz_f16_f32",
+ VOP_I32_F32_F32, int_SI_packf16
+>;
+////def V_CVT_PK_U16_U32 : VOP2_U16 <0x00000030, "v_cvt_pk_u16_u32", []>;
+////def V_CVT_PK_I16_I32 : VOP2_I16 <0x00000031, "v_cvt_pk_i16_i32", []>;
-defm V_AND_B32 : VOP2_32 <0x0000001b, "V_AND_B32",
- [(set i32:$dst, (and i32:$src0, i32:$src1))]>;
-defm V_OR_B32 : VOP2_32 <0x0000001c, "V_OR_B32",
- [(set i32:$dst, (or i32:$src0, i32:$src1))]
+} // End let SubtargetPredicate = isSICI
+//===----------------------------------------------------------------------===//
+// VOP3 Instructions
+//===----------------------------------------------------------------------===//
+
+let isCommutable = 1 in {
+defm V_MAD_LEGACY_F32 : VOP3Inst <vop3<0x140, 0x1c0>, "v_mad_legacy_f32",
+ VOP_F32_F32_F32_F32
>;
-defm V_XOR_B32 : VOP2_32 <0x0000001d, "V_XOR_B32",
- [(set i32:$dst, (xor i32:$src0, i32:$src1))]
+
+defm V_MAD_F32 : VOP3Inst <vop3<0x141, 0x1c1>, "v_mad_f32",
+ VOP_F32_F32_F32_F32, fmad
>;
+defm V_MAD_I32_I24 : VOP3Inst <vop3<0x142, 0x1c2>, "v_mad_i32_i24",
+ VOP_I32_I32_I32_I32, AMDGPUmad_i24
+>;
+defm V_MAD_U32_U24 : VOP3Inst <vop3<0x143, 0x1c3>, "v_mad_u32_u24",
+ VOP_I32_I32_I32_I32, AMDGPUmad_u24
+>;
} // End isCommutable = 1
-defm V_BFM_B32 : VOP2_32 <0x0000001e, "V_BFM_B32",
- [(set i32:$dst, (AMDGPUbfm i32:$src0, i32:$src1))]>;
-defm V_MAC_F32 : VOP2_32 <0x0000001f, "V_MAC_F32", []>;
-defm V_MADMK_F32 : VOP2_32 <0x00000020, "V_MADMK_F32", []>;
-defm V_MADAK_F32 : VOP2_32 <0x00000021, "V_MADAK_F32", []>;
-defm V_BCNT_U32_B32 : VOP2_32 <0x00000022, "V_BCNT_U32_B32", []>;
-defm V_MBCNT_LO_U32_B32 : VOP2_32 <0x00000023, "V_MBCNT_LO_U32_B32", []>;
-defm V_MBCNT_HI_U32_B32 : VOP2_32 <0x00000024, "V_MBCNT_HI_U32_B32", []>;
+defm V_CUBEID_F32 : VOP3Inst <vop3<0x144, 0x1c4>, "v_cubeid_f32",
+ VOP_F32_F32_F32_F32
+>;
+defm V_CUBESC_F32 : VOP3Inst <vop3<0x145, 0x1c5>, "v_cubesc_f32",
+ VOP_F32_F32_F32_F32
+>;
+defm V_CUBETC_F32 : VOP3Inst <vop3<0x146, 0x1c6>, "v_cubetc_f32",
+ VOP_F32_F32_F32_F32
+>;
+defm V_CUBEMA_F32 : VOP3Inst <vop3<0x147, 0x1c7>, "v_cubema_f32",
+ VOP_F32_F32_F32_F32
+>;
-let isCommutable = 1, Defs = [VCC] in { // Carry-out goes to VCC
-// No patterns so that the scalar instructions are always selected.
-// The scalar versions will be replaced with vector when needed later.
-defm V_ADD_I32 : VOP2b_32 <0x00000025, "V_ADD_I32",
- [(set i32:$dst, (add i32:$src0, i32:$src1))], VSrc_32>;
-defm V_SUB_I32 : VOP2b_32 <0x00000026, "V_SUB_I32",
- [(set i32:$dst, (sub i32:$src0, i32:$src1))], VSrc_32>;
-defm V_SUBREV_I32 : VOP2b_32 <0x00000027, "V_SUBREV_I32", [], VSrc_32,
- "V_SUB_I32">;
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in {
+defm V_BFE_U32 : VOP3Inst <vop3<0x148, 0x1c8>, "v_bfe_u32",
+ VOP_I32_I32_I32_I32, AMDGPUbfe_u32
+>;
+defm V_BFE_I32 : VOP3Inst <vop3<0x149, 0x1c9>, "v_bfe_i32",
+ VOP_I32_I32_I32_I32, AMDGPUbfe_i32
+>;
+}
-let Uses = [VCC] in { // Carry-in comes from VCC
-defm V_ADDC_U32 : VOP2b_32 <0x00000028, "V_ADDC_U32",
- [(set i32:$dst, (adde i32:$src0, i32:$src1))], VReg_32>;
-defm V_SUBB_U32 : VOP2b_32 <0x00000029, "V_SUBB_U32",
- [(set i32:$dst, (sube i32:$src0, i32:$src1))], VReg_32>;
-defm V_SUBBREV_U32 : VOP2b_32 <0x0000002a, "V_SUBBREV_U32", [], VReg_32,
- "V_SUBB_U32">;
-} // End Uses = [VCC]
-} // End isCommutable = 1, Defs = [VCC]
+defm V_BFI_B32 : VOP3Inst <vop3<0x14a, 0x1ca>, "v_bfi_b32",
+ VOP_I32_I32_I32_I32, AMDGPUbfi
+>;
-defm V_LDEXP_F32 : VOP2_32 <0x0000002b, "V_LDEXP_F32", []>;
-////def V_CVT_PKACCUM_U8_F32 : VOP2_U8 <0x0000002c, "V_CVT_PKACCUM_U8_F32", []>;
-////def V_CVT_PKNORM_I16_F32 : VOP2_I16 <0x0000002d, "V_CVT_PKNORM_I16_F32", []>;
-////def V_CVT_PKNORM_U16_F32 : VOP2_U16 <0x0000002e, "V_CVT_PKNORM_U16_F32", []>;
-defm V_CVT_PKRTZ_F16_F32 : VOP2_32 <0x0000002f, "V_CVT_PKRTZ_F16_F32",
- [(set i32:$dst, (int_SI_packf16 f32:$src0, f32:$src1))]
+let isCommutable = 1 in {
+defm V_FMA_F32 : VOP3Inst <vop3<0x14b, 0x1cb>, "v_fma_f32",
+ VOP_F32_F32_F32_F32, fma
+>;
+defm V_FMA_F64 : VOP3Inst <vop3<0x14c, 0x1cc>, "v_fma_f64",
+ VOP_F64_F64_F64_F64, fma
>;
-////def V_CVT_PK_U16_U32 : VOP2_U16 <0x00000030, "V_CVT_PK_U16_U32", []>;
-////def V_CVT_PK_I16_I32 : VOP2_I16 <0x00000031, "V_CVT_PK_I16_I32", []>;
+} // End isCommutable = 1
-//===----------------------------------------------------------------------===//
-// VOP3 Instructions
-//===----------------------------------------------------------------------===//
+//def V_LERP_U8 : VOP3_U8 <0x0000014d, "v_lerp_u8", []>;
+defm V_ALIGNBIT_B32 : VOP3Inst <vop3<0x14e, 0x1ce>, "v_alignbit_b32",
+ VOP_I32_I32_I32_I32
+>;
+defm V_ALIGNBYTE_B32 : VOP3Inst <vop3<0x14f, 0x1cf>, "v_alignbyte_b32",
+ VOP_I32_I32_I32_I32
+>;
-let neverHasSideEffects = 1 in {
+// Only on SI
+defm V_MULLIT_F32 : VOP3Inst <vop3<0x150>, "v_mullit_f32",
+ VOP_F32_F32_F32_F32>;
+defm V_MIN3_F32 : VOP3Inst <vop3<0x151>, "v_min3_f32",
+ VOP_F32_F32_F32_F32, AMDGPUfmin3>;
-defm V_MAD_LEGACY_F32 : VOP3_32 <0x00000140, "V_MAD_LEGACY_F32", []>;
-defm V_MAD_F32 : VOP3_32 <0x00000141, "V_MAD_F32",
- [(set f32:$dst, (fadd (fmul f32:$src0, f32:$src1), f32:$src2))]
+defm V_MIN3_I32 : VOP3Inst <vop3<0x152>, "v_min3_i32",
+ VOP_I32_I32_I32_I32, AMDGPUsmin3
>;
-defm V_MAD_I32_I24 : VOP3_32 <0x00000142, "V_MAD_I32_I24",
- [(set i32:$dst, (AMDGPUmad_i24 i32:$src0, i32:$src1, i32:$src2))]
+defm V_MIN3_U32 : VOP3Inst <vop3<0x153>, "v_min3_u32",
+ VOP_I32_I32_I32_I32, AMDGPUumin3
>;
-defm V_MAD_U32_U24 : VOP3_32 <0x00000143, "V_MAD_U32_U24",
- [(set i32:$dst, (AMDGPUmad_u24 i32:$src0, i32:$src1, i32:$src2))]
+defm V_MAX3_F32 : VOP3Inst <vop3<0x154>, "v_max3_f32",
+ VOP_F32_F32_F32_F32, AMDGPUfmax3
+>;
+defm V_MAX3_I32 : VOP3Inst <vop3<0x155>, "v_max3_i32",
+ VOP_I32_I32_I32_I32, AMDGPUsmax3
+>;
+defm V_MAX3_U32 : VOP3Inst <vop3<0x156>, "v_max3_u32",
+ VOP_I32_I32_I32_I32, AMDGPUumax3
+>;
+//def V_MED3_F32 : VOP3_MED3 <0x00000157, "v_med3_f32", []>;
+//def V_MED3_I32 : VOP3_MED3 <0x00000158, "v_med3_i32", []>;
+//def V_MED3_U32 : VOP3_MED3 <0x00000159, "v_med3_u32", []>;
+//def V_SAD_U8 : VOP3_U8 <0x0000015a, "v_sad_u8", []>;
+//def V_SAD_HI_U8 : VOP3_U8 <0x0000015b, "v_sad_hi_u8", []>;
+//def V_SAD_U16 : VOP3_U16 <0x0000015c, "v_sad_u16", []>;
+defm V_SAD_U32 : VOP3Inst <vop3<0x15d, 0x1dc>, "v_sad_u32",
+ VOP_I32_I32_I32_I32
+>;
+////def V_CVT_PK_U8_F32 : VOP3_U8 <0x0000015e, "v_cvt_pk_u8_f32", []>;
+defm V_DIV_FIXUP_F32 : VOP3Inst <
+ vop3<0x15f, 0x1de>, "v_div_fixup_f32", VOP_F32_F32_F32_F32, AMDGPUdiv_fixup
>;
-} // End neverHasSideEffects
+let SchedRW = [WriteDouble] in {
-defm V_CUBEID_F32 : VOP3_32 <0x00000144, "V_CUBEID_F32", []>;
-defm V_CUBESC_F32 : VOP3_32 <0x00000145, "V_CUBESC_F32", []>;
-defm V_CUBETC_F32 : VOP3_32 <0x00000146, "V_CUBETC_F32", []>;
-defm V_CUBEMA_F32 : VOP3_32 <0x00000147, "V_CUBEMA_F32", []>;
+defm V_DIV_FIXUP_F64 : VOP3Inst <
+ vop3<0x160, 0x1df>, "v_div_fixup_f64", VOP_F64_F64_F64_F64, AMDGPUdiv_fixup
+>;
-let neverHasSideEffects = 1, mayLoad = 0, mayStore = 0 in {
-defm V_BFE_U32 : VOP3_32 <0x00000148, "V_BFE_U32",
- [(set i32:$dst, (AMDGPUbfe_u32 i32:$src0, i32:$src1, i32:$src2))]>;
-defm V_BFE_I32 : VOP3_32 <0x00000149, "V_BFE_I32",
- [(set i32:$dst, (AMDGPUbfe_i32 i32:$src0, i32:$src1, i32:$src2))]>;
-}
+} // let SchedRW = [WriteDouble]
-defm V_BFI_B32 : VOP3_32 <0x0000014a, "V_BFI_B32",
- [(set i32:$dst, (AMDGPUbfi i32:$src0, i32:$src1, i32:$src2))]>;
-defm V_FMA_F32 : VOP3_32 <0x0000014b, "V_FMA_F32",
- [(set f32:$dst, (fma f32:$src0, f32:$src1, f32:$src2))]
->;
-def V_FMA_F64 : VOP3_64 <0x0000014c, "V_FMA_F64",
- [(set f64:$dst, (fma f64:$src0, f64:$src1, f64:$src2))]
->;
-//def V_LERP_U8 : VOP3_U8 <0x0000014d, "V_LERP_U8", []>;
-defm V_ALIGNBIT_B32 : VOP3_32 <0x0000014e, "V_ALIGNBIT_B32", []>;
-
-defm V_ALIGNBYTE_B32 : VOP3_32 <0x0000014f, "V_ALIGNBYTE_B32", []>;
-defm V_MULLIT_F32 : VOP3_32 <0x00000150, "V_MULLIT_F32", []>;
-////def V_MIN3_F32 : VOP3_MIN3 <0x00000151, "V_MIN3_F32", []>;
-////def V_MIN3_I32 : VOP3_MIN3 <0x00000152, "V_MIN3_I32", []>;
-////def V_MIN3_U32 : VOP3_MIN3 <0x00000153, "V_MIN3_U32", []>;
-////def V_MAX3_F32 : VOP3_MAX3 <0x00000154, "V_MAX3_F32", []>;
-////def V_MAX3_I32 : VOP3_MAX3 <0x00000155, "V_MAX3_I32", []>;
-////def V_MAX3_U32 : VOP3_MAX3 <0x00000156, "V_MAX3_U32", []>;
-////def V_MED3_F32 : VOP3_MED3 <0x00000157, "V_MED3_F32", []>;
-////def V_MED3_I32 : VOP3_MED3 <0x00000158, "V_MED3_I32", []>;
-////def V_MED3_U32 : VOP3_MED3 <0x00000159, "V_MED3_U32", []>;
-//def V_SAD_U8 : VOP3_U8 <0x0000015a, "V_SAD_U8", []>;
-//def V_SAD_HI_U8 : VOP3_U8 <0x0000015b, "V_SAD_HI_U8", []>;
-//def V_SAD_U16 : VOP3_U16 <0x0000015c, "V_SAD_U16", []>;
-defm V_SAD_U32 : VOP3_32 <0x0000015d, "V_SAD_U32", []>;
-////def V_CVT_PK_U8_F32 : VOP3_U8 <0x0000015e, "V_CVT_PK_U8_F32", []>;
-defm V_DIV_FIXUP_F32 : VOP3_32 <0x0000015f, "V_DIV_FIXUP_F32",
- [(set f32:$dst, (AMDGPUdiv_fixup f32:$src0, f32:$src1, f32:$src2))]
->;
-def V_DIV_FIXUP_F64 : VOP3_64 <0x00000160, "V_DIV_FIXUP_F64",
- [(set f64:$dst, (AMDGPUdiv_fixup f64:$src0, f64:$src1, f64:$src2))]
->;
-
-def V_LSHL_B64 : VOP3_64_32 <0x00000161, "V_LSHL_B64",
- [(set i64:$dst, (shl i64:$src0, i32:$src1))]
+defm V_LSHL_B64 : VOP3Inst <vop3<0x161>, "v_lshl_b64",
+ VOP_I64_I64_I32, shl
>;
-def V_LSHR_B64 : VOP3_64_32 <0x00000162, "V_LSHR_B64",
- [(set i64:$dst, (srl i64:$src0, i32:$src1))]
+
+// Only on SI
+defm V_LSHR_B64 : VOP3Inst <vop3<0x162>, "v_lshr_b64",
+ VOP_I64_I64_I32, srl
>;
-def V_ASHR_I64 : VOP3_64_32 <0x00000163, "V_ASHR_I64",
- [(set i64:$dst, (sra i64:$src0, i32:$src1))]
+
+// Only on SI
+defm V_ASHR_I64 : VOP3Inst <vop3<0x163>, "v_ashr_i64",
+ VOP_I64_I64_I32, sra
>;
+let SchedRW = [WriteDouble] in {
let isCommutable = 1 in {
-def V_ADD_F64 : VOP3_64 <0x00000164, "V_ADD_F64", []>;
-def V_MUL_F64 : VOP3_64 <0x00000165, "V_MUL_F64", []>;
-def V_MIN_F64 : VOP3_64 <0x00000166, "V_MIN_F64", []>;
-def V_MAX_F64 : VOP3_64 <0x00000167, "V_MAX_F64", []>;
+defm V_ADD_F64 : VOP3Inst <vop3<0x164, 0x280>, "v_add_f64",
+ VOP_F64_F64_F64, fadd
+>;
+defm V_MUL_F64 : VOP3Inst <vop3<0x165, 0x281>, "v_mul_f64",
+ VOP_F64_F64_F64, fmul
+>;
+
+defm V_MIN_F64 : VOP3Inst <vop3<0x166, 0x282>, "v_min_f64",
+ VOP_F64_F64_F64, fminnum
+>;
+defm V_MAX_F64 : VOP3Inst <vop3<0x167, 0x283>, "v_max_f64",
+ VOP_F64_F64_F64, fmaxnum
+>;
} // isCommutable = 1
-def V_LDEXP_F64 : VOP3_64 <0x00000168, "V_LDEXP_F64", []>;
+defm V_LDEXP_F64 : VOP3Inst <vop3<0x168, 0x284>, "v_ldexp_f64",
+ VOP_F64_F64_I32, AMDGPUldexp
+>;
-let isCommutable = 1 in {
+} // let SchedRW = [WriteDouble]
-defm V_MUL_LO_U32 : VOP3_32 <0x00000169, "V_MUL_LO_U32", []>;
-defm V_MUL_HI_U32 : VOP3_32 <0x0000016a, "V_MUL_HI_U32", []>;
-defm V_MUL_LO_I32 : VOP3_32 <0x0000016b, "V_MUL_LO_I32", []>;
-defm V_MUL_HI_I32 : VOP3_32 <0x0000016c, "V_MUL_HI_I32", []>;
+let isCommutable = 1, SchedRW = [WriteQuarterRate32] in {
-} // isCommutable = 1
+defm V_MUL_LO_U32 : VOP3Inst <vop3<0x169, 0x285>, "v_mul_lo_u32",
+ VOP_I32_I32_I32
+>;
+defm V_MUL_HI_U32 : VOP3Inst <vop3<0x16a, 0x286>, "v_mul_hi_u32",
+ VOP_I32_I32_I32
+>;
+
+defm V_MUL_LO_I32 : VOP3Inst <vop3<0x16b, 0x285>, "v_mul_lo_i32",
+ VOP_I32_I32_I32
+>;
+defm V_MUL_HI_I32 : VOP3Inst <vop3<0x16c, 0x287>, "v_mul_hi_i32",
+ VOP_I32_I32_I32
+>;
+
+} // isCommutable = 1, SchedRW = [WriteQuarterRate32]
-def V_DIV_SCALE_F32 : VOP3b_32 <0x0000016d, "V_DIV_SCALE_F32", []>;
+defm V_DIV_SCALE_F32 : VOP3b_32 <vop3<0x16d, 0x1e0>, "v_div_scale_f32", []>;
+let SchedRW = [WriteDouble] in {
// Double precision division pre-scale.
-def V_DIV_SCALE_F64 : VOP3b_64 <0x0000016e, "V_DIV_SCALE_F64", []>;
+defm V_DIV_SCALE_F64 : VOP3b_64 <vop3<0x16e, 0x1e1>, "v_div_scale_f64", []>;
+} // let SchedRW = [WriteDouble]
-defm V_DIV_FMAS_F32 : VOP3_32 <0x0000016f, "V_DIV_FMAS_F32",
- [(set f32:$dst, (AMDGPUdiv_fmas f32:$src0, f32:$src1, f32:$src2))]
+let isCommutable = 1 in {
+defm V_DIV_FMAS_F32 : VOP3Inst <vop3<0x16f, 0x1e2>, "v_div_fmas_f32",
+ VOP_F32_F32_F32_F32, AMDGPUdiv_fmas
>;
-def V_DIV_FMAS_F64 : VOP3_64 <0x00000170, "V_DIV_FMAS_F64",
- [(set f64:$dst, (AMDGPUdiv_fmas f64:$src0, f64:$src1, f64:$src2))]
+let SchedRW = [WriteDouble] in {
+defm V_DIV_FMAS_F64 : VOP3Inst <vop3<0x170, 0x1e3>, "v_div_fmas_f64",
+ VOP_F64_F64_F64_F64, AMDGPUdiv_fmas
>;
-//def V_MSAD_U8 : VOP3_U8 <0x00000171, "V_MSAD_U8", []>;
-//def V_QSAD_U8 : VOP3_U8 <0x00000172, "V_QSAD_U8", []>;
-//def V_MQSAD_U8 : VOP3_U8 <0x00000173, "V_MQSAD_U8", []>;
-def V_TRIG_PREOP_F64 : VOP3_64_32 <0x00000174, "V_TRIG_PREOP_F64",
- [(set f64:$dst, (AMDGPUtrig_preop f64:$src0, i32:$src1))]
+} // End SchedRW = [WriteDouble]
+} // End isCommutable = 1
+
+//def V_MSAD_U8 : VOP3_U8 <0x00000171, "v_msad_u8", []>;
+//def V_QSAD_U8 : VOP3_U8 <0x00000172, "v_qsad_u8", []>;
+//def V_MQSAD_U8 : VOP3_U8 <0x00000173, "v_mqsad_u8", []>;
+
+let SchedRW = [WriteDouble] in {
+defm V_TRIG_PREOP_F64 : VOP3Inst <
+ vop3<0x174, 0x292>, "v_trig_preop_f64", VOP_F64_F64_I32, AMDGPUtrig_preop
>;
+} // let SchedRW = [WriteDouble]
+
//===----------------------------------------------------------------------===//
// Pseudo Instructions
//===----------------------------------------------------------------------===//
-
let isCodeGenOnly = 1, isPseudo = 1 in {
-def V_MOV_I1 : InstSI <
- (outs VReg_1:$dst),
- (ins i1imm:$src),
- "", [(set i1:$dst, (imm:$src))]
->;
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in {
+// 64-bit vector move instruction. This is mainly used by the SIFoldOperands
+// pass to enable folding of inline immediates.
+def V_MOV_B64_PSEUDO : InstSI <(outs VReg_64:$dst), (ins VSrc_64:$src0), "", []>;
+} // end let hasSideEffects = 0, mayLoad = 0, mayStore = 0
-def V_AND_I1 : InstSI <
- (outs VReg_1:$dst), (ins VReg_1:$src0, VReg_1:$src1), "",
- [(set i1:$dst, (and i1:$src0, i1:$src1))]
->;
-
-def V_OR_I1 : InstSI <
- (outs VReg_1:$dst), (ins VReg_1:$src0, VReg_1:$src1), "",
- [(set i1:$dst, (or i1:$src0, i1:$src1))]
->;
-
-def V_XOR_I1 : InstSI <
- (outs VReg_1:$dst), (ins VReg_1:$src0, VReg_1:$src1), "",
- [(set i1:$dst, (xor i1:$src0, i1:$src1))]
->;
+let hasSideEffects = 1 in {
+def SGPR_USE : InstSI <(outs),(ins), "", []>;
+}
// SI pseudo instructions. These are used by the CFG structurizer pass
// and should be lowered to ISA instructions prior to codegen.
@@ -1557,7 +1827,7 @@ def SI_ELSE : InstSI <
def SI_LOOP : InstSI <
(outs),
(ins SReg_64:$saved, brtarget:$target),
- "SI_LOOP $saved, $target",
+ "si_loop $saved, $target",
[(int_SI_loop i64:$saved, bb:$target)]
>;
@@ -1566,35 +1836,35 @@ def SI_LOOP : InstSI <
def SI_BREAK : InstSI <
(outs SReg_64:$dst),
(ins SReg_64:$src),
- "SI_ELSE $dst, $src",
+ "si_else $dst, $src",
[(set i64:$dst, (int_SI_break i64:$src))]
>;
def SI_IF_BREAK : InstSI <
(outs SReg_64:$dst),
(ins SReg_64:$vcc, SReg_64:$src),
- "SI_IF_BREAK $dst, $vcc, $src",
+ "si_if_break $dst, $vcc, $src",
[(set i64:$dst, (int_SI_if_break i1:$vcc, i64:$src))]
>;
def SI_ELSE_BREAK : InstSI <
(outs SReg_64:$dst),
(ins SReg_64:$src0, SReg_64:$src1),
- "SI_ELSE_BREAK $dst, $src0, $src1",
+ "si_else_break $dst, $src0, $src1",
[(set i64:$dst, (int_SI_else_break i64:$src0, i64:$src1))]
>;
def SI_END_CF : InstSI <
(outs),
(ins SReg_64:$saved),
- "SI_END_CF $saved",
+ "si_end_cf $saved",
[(int_SI_end_cf i64:$saved)]
>;
def SI_KILL : InstSI <
(outs),
(ins VSrc_32:$src),
- "SI_KILL $src",
+ "si_kill $src",
[(int_AMDGPU_kill f32:$src)]
>;
@@ -1603,12 +1873,12 @@ def SI_KILL : InstSI <
let Uses = [EXEC], Defs = [EXEC,VCC,M0] in {
-//defm SI_ : RegisterLoadStore <VReg_32, FRAMEri, ADDRIndirect>;
+//defm SI_ : RegisterLoadStore <VGPR_32, FRAMEri, ADDRIndirect>;
let UseNamedOperandTable = 1 in {
def SI_RegisterLoad : InstSI <
- (outs VReg_32:$dst, SReg_64:$temp),
+ (outs VGPR_32:$dst, SReg_64:$temp),
(ins FRAMEri32:$addr, i32imm:$chan),
"", []
> {
@@ -1618,7 +1888,7 @@ def SI_RegisterLoad : InstSI <
class SIRegStore<dag outs> : InstSI <
outs,
- (ins VReg_32:$val, FRAMEri32:$addr, i32imm:$chan),
+ (ins VGPR_32:$val, FRAMEri32:$addr, i32imm:$chan),
"", []
> {
let isRegisterStore = 1;
@@ -1634,22 +1904,22 @@ def SI_RegisterStore : SIRegStore<(outs SReg_64:$temp)>;
} // End UseNamedOperandTable = 1
def SI_INDIRECT_SRC : InstSI <
- (outs VReg_32:$dst, SReg_64:$temp),
+ (outs VGPR_32:$dst, SReg_64:$temp),
(ins unknown:$src, VSrc_32:$idx, i32imm:$off),
- "SI_INDIRECT_SRC $dst, $temp, $src, $idx, $off",
+ "si_indirect_src $dst, $temp, $src, $idx, $off",
[]
>;
class SI_INDIRECT_DST<RegisterClass rc> : InstSI <
(outs rc:$dst, SReg_64:$temp),
- (ins unknown:$src, VSrc_32:$idx, i32imm:$off, VReg_32:$val),
- "SI_INDIRECT_DST $dst, $temp, $src, $idx, $off, $val",
+ (ins unknown:$src, VSrc_32:$idx, i32imm:$off, VGPR_32:$val),
+ "si_indirect_dst $dst, $temp, $src, $idx, $off, $val",
[]
> {
let Constraints = "$src = $dst";
}
-def SI_INDIRECT_DST_V1 : SI_INDIRECT_DST<VReg_32>;
+def SI_INDIRECT_DST_V1 : SI_INDIRECT_DST<VGPR_32>;
def SI_INDIRECT_DST_V2 : SI_INDIRECT_DST<VReg_64>;
def SI_INDIRECT_DST_V4 : SI_INDIRECT_DST<VReg_128>;
def SI_INDIRECT_DST_V8 : SI_INDIRECT_DST<VReg_256>;
@@ -1659,24 +1929,10 @@ def SI_INDIRECT_DST_V16 : SI_INDIRECT_DST<VReg_512>;
let usesCustomInserter = 1 in {
-// This pseudo instruction takes a pointer as input and outputs a resource
-// constant that can be used with the ADDR64 MUBUF instructions.
-def SI_ADDR64_RSRC : InstSI <
- (outs SReg_128:$srsrc),
- (ins SSrc_64:$ptr),
- "", []
->;
-
-def SI_BUFFER_RSRC : InstSI <
- (outs SReg_128:$srsrc),
- (ins SReg_32:$ptr_lo, SReg_32:$ptr_hi, SSrc_32:$data_lo, SSrc_32:$data_hi),
- "", []
->;
-
def V_SUB_F64 : InstSI <
(outs VReg_64:$dst),
(ins VReg_64:$src0, VReg_64:$src1),
- "V_SUB_F64 $dst, $src0, $src1",
+ "v_sub_f64 $dst, $src0, $src1",
[(set f64:$dst, (fsub f64:$src0, f64:$src1))]
>;
@@ -1684,18 +1940,20 @@ def V_SUB_F64 : InstSI <
multiclass SI_SPILL_SGPR <RegisterClass sgpr_class> {
- def _SAVE : InstSI <
- (outs VReg_32:$dst),
- (ins sgpr_class:$src, i32imm:$frame_idx),
- "", []
- >;
-
- def _RESTORE : InstSI <
- (outs sgpr_class:$dst),
- (ins VReg_32:$src, i32imm:$frame_idx),
- "", []
- >;
-
+ let UseNamedOperandTable = 1 in {
+ def _SAVE : InstSI <
+ (outs),
+ (ins sgpr_class:$src, i32imm:$frame_idx, SReg_64:$scratch_ptr,
+ SReg_32:$scratch_offset),
+ "", []
+ >;
+
+ def _RESTORE : InstSI <
+ (outs sgpr_class:$dst),
+ (ins i32imm:$frame_idx, SReg_64:$scratch_ptr, SReg_32:$scratch_offset),
+ "", []
+ >;
+ } // End UseNamedOperandTable = 1
}
defm SI_SPILL_S32 : SI_SPILL_SGPR <SReg_32>;
@@ -1704,6 +1962,30 @@ defm SI_SPILL_S128 : SI_SPILL_SGPR <SReg_128>;
defm SI_SPILL_S256 : SI_SPILL_SGPR <SReg_256>;
defm SI_SPILL_S512 : SI_SPILL_SGPR <SReg_512>;
+multiclass SI_SPILL_VGPR <RegisterClass vgpr_class> {
+ let UseNamedOperandTable = 1 in {
+ def _SAVE : InstSI <
+ (outs),
+ (ins vgpr_class:$src, i32imm:$frame_idx, SReg_64:$scratch_ptr,
+ SReg_32:$scratch_offset),
+ "", []
+ >;
+
+ def _RESTORE : InstSI <
+ (outs vgpr_class:$dst),
+ (ins i32imm:$frame_idx, SReg_64:$scratch_ptr, SReg_32:$scratch_offset),
+ "", []
+ >;
+ } // End UseNamedOperandTable = 1
+}
+
+defm SI_SPILL_V32 : SI_SPILL_VGPR <VGPR_32>;
+defm SI_SPILL_V64 : SI_SPILL_VGPR <VReg_64>;
+defm SI_SPILL_V96 : SI_SPILL_VGPR <VReg_96>;
+defm SI_SPILL_V128 : SI_SPILL_VGPR <VReg_128>;
+defm SI_SPILL_V256 : SI_SPILL_VGPR <VReg_256>;
+defm SI_SPILL_V512 : SI_SPILL_VGPR <VReg_512>;
+
let Defs = [SCC] in {
def SI_CONSTDATA_PTR : InstSI <
@@ -1716,13 +1998,15 @@ def SI_CONSTDATA_PTR : InstSI <
} // end IsCodeGenOnly, isPseudo
-} // end SubtargetPredicate = SI
+} // end SubtargetPredicate = isGCN
-let Predicates = [isSI] in {
+let Predicates = [isGCN] in {
def : Pat<
(int_AMDGPU_cndlt f32:$src0, f32:$src1, f32:$src2),
- (V_CNDMASK_B32_e64 $src2, $src1, (V_CMP_GT_F32_e64 0, $src0))
+ (V_CNDMASK_B32_e64 $src2, $src1,
+ (V_CMP_GT_F32_e64 SRCMODS.NONE, 0, SRCMODS.NONE, $src0,
+ DSTCLAMP.NONE, DSTOMOD.NONE))
>;
def : Pat <
@@ -1730,12 +2014,16 @@ def : Pat <
(SI_KILL 0xbf800000)
>;
+let Predicates = [isSICI] in {
+
/* int_SI_vs_load_input */
def : Pat<
(SIload_input v4i32:$tlst, imm:$attr_offset, i32:$buf_idx_vgpr),
(BUFFER_LOAD_FORMAT_XYZW_IDXEN $tlst, $buf_idx_vgpr, imm:$attr_offset, 0, 0, 0, 0)
>;
+} // End Predicates = [isSICI]
+
/* int_SI_export */
def : Pat <
(int_SI_export imm:$en, imm:$vm, imm:$done, imm:$tgt, imm:$compr,
@@ -1750,7 +2038,7 @@ def : Pat <
multiclass SMRD_Pattern <SMRD Instr_IMM, SMRD Instr_SGPR, ValueType vt> {
- // 1. Offset as 8bit DWORD immediate
+ // 1. SI-CI: Offset as 8bit DWORD immediate
def : Pat <
(constant_load (add i64:$sbase, (i64 IMM8bitDWORD:$offset))),
(vt (Instr_IMM $sbase, (as_dword_i32imm $offset)))
@@ -1769,6 +2057,28 @@ multiclass SMRD_Pattern <SMRD Instr_IMM, SMRD Instr_SGPR, ValueType vt> {
>;
}
+multiclass SMRD_Pattern_vi <SMRD Instr_IMM, SMRD Instr_SGPR, ValueType vt> {
+
+ // 1. VI: Offset as 20bit immediate in bytes
+ def : Pat <
+ (constant_load (add i64:$sbase, (i64 IMM20bit:$offset))),
+ (vt (Instr_IMM $sbase, (as_i32imm $offset)))
+ >;
+
+ // 2. Offset loaded in a 32bit SGPR
+ def : Pat <
+ (constant_load (add i64:$sbase, (i64 IMM32bit:$offset))),
+ (vt (Instr_SGPR $sbase, (S_MOV_B32 (i32 (as_i32imm $offset)))))
+ >;
+
+ // 3. No offset at all
+ def : Pat <
+ (constant_load i64:$sbase),
+ (vt (Instr_IMM $sbase, 0))
+ >;
+}
+
+let Predicates = [isSICI] in {
defm : SMRD_Pattern <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, f32>;
defm : SMRD_Pattern <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, i32>;
defm : SMRD_Pattern <S_LOAD_DWORDX2_IMM, S_LOAD_DWORDX2_SGPR, v2i32>;
@@ -1776,6 +2086,19 @@ defm : SMRD_Pattern <S_LOAD_DWORDX4_IMM, S_LOAD_DWORDX4_SGPR, v4i32>;
defm : SMRD_Pattern <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v32i8>;
defm : SMRD_Pattern <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v8i32>;
defm : SMRD_Pattern <S_LOAD_DWORDX16_IMM, S_LOAD_DWORDX16_SGPR, v16i32>;
+} // End Predicates = [isSICI]
+
+let Predicates = [isVI] in {
+defm : SMRD_Pattern_vi <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, f32>;
+defm : SMRD_Pattern_vi <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, i32>;
+defm : SMRD_Pattern_vi <S_LOAD_DWORDX2_IMM, S_LOAD_DWORDX2_SGPR, v2i32>;
+defm : SMRD_Pattern_vi <S_LOAD_DWORDX4_IMM, S_LOAD_DWORDX4_SGPR, v4i32>;
+defm : SMRD_Pattern_vi <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v32i8>;
+defm : SMRD_Pattern_vi <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v8i32>;
+defm : SMRD_Pattern_vi <S_LOAD_DWORDX16_IMM, S_LOAD_DWORDX16_SGPR, v16i32>;
+} // End Predicates = [isVI]
+
+let Predicates = [isSICI] in {
// 1. Offset as 8bit DWORD immediate
def : Pat <
@@ -1783,42 +2106,36 @@ def : Pat <
(S_BUFFER_LOAD_DWORD_IMM $sbase, (as_dword_i32imm $offset))
>;
+} // End Predicates = [isSICI]
+
// 2. Offset loaded in a 32bit SGPR
def : Pat <
(SIload_constant v4i32:$sbase, imm:$offset),
(S_BUFFER_LOAD_DWORD_SGPR $sbase, (S_MOV_B32 imm:$offset))
>;
-} // Predicates = [isSI] in {
-
//===----------------------------------------------------------------------===//
// SOP1 Patterns
//===----------------------------------------------------------------------===//
-let Predicates = [isSI, isCFDepth0] in {
-
def : Pat <
(i64 (ctpop i64:$src)),
- (INSERT_SUBREG (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
- (S_BCNT1_I32_B64 $src), sub0),
- (S_MOV_B32 0), sub1)
+ (i64 (REG_SEQUENCE SReg_64,
+ (S_BCNT1_I32_B64 $src), sub0,
+ (S_MOV_B32 0), sub1))
>;
//===----------------------------------------------------------------------===//
// SOP2 Patterns
//===----------------------------------------------------------------------===//
-// V_ADD_I32_e32/S_ADD_I32 produces carry in VCC/SCC. For the vector
+// V_ADD_I32_e32/S_ADD_U32 produces carry in VCC/SCC. For the vector
// case, the sgpr-copies pass will fix this to use the vector version.
def : Pat <
(i32 (addc i32:$src0, i32:$src1)),
- (S_ADD_I32 $src0, $src1)
+ (S_ADD_U32 $src0, $src1)
>;
-} // Predicates = [isSI, isCFDepth0]
-
-let Predicates = [isSI] in {
-
//===----------------------------------------------------------------------===//
// SOPP Patterns
//===----------------------------------------------------------------------===//
@@ -1842,49 +2159,9 @@ defm : RsqPat<V_RSQ_F32_e32, f32>;
// VOP2 Patterns
//===----------------------------------------------------------------------===//
-class BinOp64Pat <SDNode node, Instruction inst> : Pat <
- (node i64:$src0, i64:$src1),
- (INSERT_SUBREG (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
- (inst (EXTRACT_SUBREG i64:$src0, sub0),
- (EXTRACT_SUBREG i64:$src1, sub0)), sub0),
- (inst (EXTRACT_SUBREG i64:$src0, sub1),
- (EXTRACT_SUBREG i64:$src1, sub1)), sub1)
->;
-
-def : BinOp64Pat <or, V_OR_B32_e32>;
-def : BinOp64Pat <xor, V_XOR_B32_e32>;
-
-class SextInReg <ValueType vt, int ShiftAmt> : Pat <
- (sext_inreg i32:$src0, vt),
- (V_ASHRREV_I32_e32 ShiftAmt, (V_LSHLREV_B32_e32 ShiftAmt, $src0))
->;
-
-def : SextInReg <i8, 24>;
-def : SextInReg <i16, 16>;
-
def : Pat <
(i32 (add (i32 (ctpop i32:$popcnt)), i32:$val)),
- (V_BCNT_U32_B32_e32 $popcnt, $val)
->;
-
-def : Pat <
- (i32 (ctpop i32:$popcnt)),
- (V_BCNT_U32_B32_e64 $popcnt, 0, 0, 0)
->;
-
-def : Pat <
- (i64 (ctpop i64:$src)),
- (INSERT_SUBREG
- (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
- (V_BCNT_U32_B32_e32 (EXTRACT_SUBREG $src, sub1),
- (V_BCNT_U32_B32_e64 (EXTRACT_SUBREG $src, sub0), 0, 0, 0)),
- sub0),
- (V_MOV_B32_e32 0), sub1)
->;
-
-def : Pat <
- (addc i32:$src0, i32:$src1),
- (V_ADD_I32_e32 $src0, $src1)
+ (V_BCNT_U32_B32_e64 $popcnt, $val)
>;
/********** ======================= **********/
@@ -2222,10 +2499,10 @@ foreach Index = 0-15 in {
}
def : BitConvert <i32, f32, SReg_32>;
-def : BitConvert <i32, f32, VReg_32>;
+def : BitConvert <i32, f32, VGPR_32>;
def : BitConvert <f32, i32, SReg_32>;
-def : BitConvert <f32, i32, VReg_32>;
+def : BitConvert <f32, i32, VGPR_32>;
def : BitConvert <i64, f64, VReg_64>;
@@ -2258,62 +2535,63 @@ def : BitConvert <v16f32, v16i32, VReg_512>;
/********** Src & Dst modifiers **********/
/********** =================== **********/
-def FCLAMP_SI : AMDGPUShaderInst <
- (outs VReg_32:$dst),
- (ins VSrc_32:$src0),
- "FCLAMP_SI $dst, $src0",
- []
-> {
- let usesCustomInserter = 1;
-}
-
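+// Note: the pattern below folds AMDGPUclamp into a v_add_f32_e64 that adds 0
+// with the clamp bit set, so the add is effectively an identity and the clamp
+// bit performs the [0.0, 1.0] saturation.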
def : Pat <
- (AMDGPUclamp f32:$src, (f32 FP_ZERO), (f32 FP_ONE)),
- (FCLAMP_SI f32:$src)
+ (AMDGPUclamp (VOP3Mods0Clamp f32:$src0, i32:$src0_modifiers, i32:$omod),
+ (f32 FP_ZERO), (f32 FP_ONE)),
+ (V_ADD_F32_e64 $src0_modifiers, $src0, 0, 0, 1, $omod)
>;
/********** ================================ **********/
/********** Floating point absolute/negative **********/
/********** ================================ **********/
-// Manipulate the sign bit directly, as e.g. using the source negation modifier
-// in V_ADD_F32_e64 $src, 0, [...] does not result in -0.0 for $src == +0.0,
-// breaking the piglit *s-floatBitsToInt-neg* tests
-
-// TODO: Look into not implementing isFNegFree/isFAbsFree for SI, and possibly
-// removing these patterns
+// Prevent expanding both fneg and fabs.
+// FIXME: Should use S_OR_B32
def : Pat <
(fneg (fabs f32:$src)),
(V_OR_B32_e32 $src, (V_MOV_B32_e32 0x80000000)) /* Set sign bit */
>;
-def FABS_SI : AMDGPUShaderInst <
- (outs VReg_32:$dst),
- (ins VSrc_32:$src0),
- "FABS_SI $dst, $src0",
- []
-> {
- let usesCustomInserter = 1;
-}
+// FIXME: Should use S_OR_B32
+def : Pat <
+ (fneg (fabs f64:$src)),
+ (REG_SEQUENCE VReg_64,
+ (i32 (EXTRACT_SUBREG f64:$src, sub0)),
+ sub0,
+ (V_OR_B32_e32 (EXTRACT_SUBREG f64:$src, sub1),
+ (V_MOV_B32_e32 0x80000000)), // Set sign bit.
+ sub1)
+>;
def : Pat <
(fabs f32:$src),
- (FABS_SI f32:$src)
+ (V_AND_B32_e32 $src, (V_MOV_B32_e32 0x7fffffff))
>;
-def FNEG_SI : AMDGPUShaderInst <
- (outs VReg_32:$dst),
- (ins VSrc_32:$src0),
- "FNEG_SI $dst, $src0",
- []
-> {
- let usesCustomInserter = 1;
-}
-
def : Pat <
(fneg f32:$src),
- (FNEG_SI f32:$src)
+ (V_XOR_B32_e32 $src, (V_MOV_B32_e32 0x80000000))
+>;
+
+def : Pat <
+ (fabs f64:$src),
+ (REG_SEQUENCE VReg_64,
+ (i32 (EXTRACT_SUBREG f64:$src, sub0)),
+ sub0,
+ (V_AND_B32_e32 (EXTRACT_SUBREG f64:$src, sub1),
+ (V_MOV_B32_e32 0x7fffffff)), // Set sign bit.
+ sub1)
+>;
+
+def : Pat <
+ (fneg f64:$src),
+ (REG_SEQUENCE VReg_64,
+ (i32 (EXTRACT_SUBREG f64:$src, sub0)),
+ sub0,
+ (V_XOR_B32_e32 (EXTRACT_SUBREG f64:$src, sub1),
+ (V_MOV_B32_e32 0x80000000)),
+ sub1)
>;
/********** ================== **********/
@@ -2327,7 +2605,7 @@ def : Pat <
def : Pat <
(SGPRImm<(f32 fpimm)>:$imm),
- (S_MOV_B32 fpimm:$imm)
+ (S_MOV_B32 (f32 (bitcast_fpimm_to_i32 $imm)))
>;
def : Pat <
@@ -2337,7 +2615,7 @@ def : Pat <
def : Pat <
(f32 fpimm:$imm),
- (V_MOV_B32_e32 fpimm:$imm)
+ (V_MOV_B32_e32 (f32 (bitcast_fpimm_to_i32 $imm)))
>;
def : Pat <
@@ -2345,21 +2623,38 @@ def : Pat <
(S_MOV_B64 InlineImm<i64>:$imm)
>;
+// XXX - Should this use a s_cmp to set SCC?
+
+// Set to sign-extended 64-bit value (true = -1, false = 0)
+def : Pat <
+ (i1 imm:$imm),
+ (S_MOV_B64 (i64 (as_i64imm $imm)))
+>;
+
+def : Pat <
+ (f64 InlineFPImm<f64>:$imm),
+ (S_MOV_B64 (f64 (bitcast_fpimm_to_i64 InlineFPImm<f64>:$imm)))
+>;
+
/********** ===================== **********/
/********** Interpolation Patterns **********/
/********** ===================== **********/
+// The value of $params is constant throughout the entire kernel.
+// We need to use S_MOV_B32 $params, because CSE ignores copies, so
+// without it we end up with a lot of redundant moves.
+
def : Pat <
(int_SI_fs_constant imm:$attr_chan, imm:$attr, i32:$params),
- (V_INTERP_MOV_F32 INTERP.P0, imm:$attr_chan, imm:$attr, $params)
+ (V_INTERP_MOV_F32 INTERP.P0, imm:$attr_chan, imm:$attr, (S_MOV_B32 $params))
>;
def : Pat <
- (int_SI_fs_interp imm:$attr_chan, imm:$attr, M0Reg:$params, v2i32:$ij),
+ (int_SI_fs_interp imm:$attr_chan, imm:$attr, i32:$params, v2i32:$ij),
(V_INTERP_P2_F32 (V_INTERP_P1_F32 (EXTRACT_SUBREG v2i32:$ij, sub0),
- imm:$attr_chan, imm:$attr, i32:$params),
+ imm:$attr_chan, imm:$attr, (S_MOV_B32 $params)),
(EXTRACT_SUBREG $ij, sub1),
- imm:$attr_chan, imm:$attr, $params)
+ imm:$attr_chan, imm:$attr, (S_MOV_B32 $params))
>;
/********** ================== **********/
@@ -2376,28 +2671,30 @@ def : Pat <
def : Pat<
(fdiv f64:$src0, f64:$src1),
- (V_MUL_F64 $src0, (V_RCP_F64_e32 $src1), (i64 0))
+ (V_MUL_F64 0 /* src0_modifiers */, $src0,
+ 0 /* src1_modifiers */, (V_RCP_F64_e32 $src1),
+ 0 /* clamp */, 0 /* omod */)
>;
def : Pat <
(int_AMDGPU_cube v4f32:$src),
- (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)),
- (V_CUBETC_F32 (EXTRACT_SUBREG $src, sub0),
- (EXTRACT_SUBREG $src, sub1),
- (EXTRACT_SUBREG $src, sub2)),
- sub0),
- (V_CUBESC_F32 (EXTRACT_SUBREG $src, sub0),
- (EXTRACT_SUBREG $src, sub1),
- (EXTRACT_SUBREG $src, sub2)),
- sub1),
- (V_CUBEMA_F32 (EXTRACT_SUBREG $src, sub0),
- (EXTRACT_SUBREG $src, sub1),
- (EXTRACT_SUBREG $src, sub2)),
- sub2),
- (V_CUBEID_F32 (EXTRACT_SUBREG $src, sub0),
- (EXTRACT_SUBREG $src, sub1),
- (EXTRACT_SUBREG $src, sub2)),
- sub3)
+ (REG_SEQUENCE VReg_128,
+ (V_CUBETC_F32 0 /* src0_modifiers */, (EXTRACT_SUBREG $src, sub0),
+ 0 /* src1_modifiers */, (EXTRACT_SUBREG $src, sub1),
+ 0 /* src2_modifiers */, (EXTRACT_SUBREG $src, sub2),
+ 0 /* clamp */, 0 /* omod */), sub0,
+ (V_CUBESC_F32 0 /* src0_modifiers */, (EXTRACT_SUBREG $src, sub0),
+ 0 /* src1_modifiers */, (EXTRACT_SUBREG $src, sub1),
+ 0 /* src2_modifiers */, (EXTRACT_SUBREG $src, sub2),
+ 0 /* clamp */, 0 /* omod */), sub1,
+ (V_CUBEMA_F32 0 /* src0_modifiers */, (EXTRACT_SUBREG $src, sub0),
+ 0 /* src1_modifiers */, (EXTRACT_SUBREG $src, sub1),
+ 0 /* src2_modifiers */, (EXTRACT_SUBREG $src, sub2),
+ 0 /* clamp */, 0 /* omod */), sub2,
+ (V_CUBEID_F32 0 /* src0_modifiers */, (EXTRACT_SUBREG $src, sub0),
+ 0 /* src1_modifiers */, (EXTRACT_SUBREG $src, sub1),
+ 0 /* src2_modifiers */, (EXTRACT_SUBREG $src, sub2),
+ 0 /* clamp */, 0 /* omod */), sub3)
>;
def : Pat <
@@ -2413,12 +2710,16 @@ class Ext32Pat <SDNode ext> : Pat <
def : Ext32Pat <zext>;
def : Ext32Pat <anyext>;
+let Predicates = [isSICI] in {
+
// Offset in a 32Bit VGPR
def : Pat <
(SIload_constant v4i32:$sbase, i32:$voff),
(BUFFER_LOAD_DWORD_OFFEN $sbase, $voff, 0, 0, 0, 0, 0)
>;
+} // End Predicates = [isSICI]
+
// The multiplication scales from [0,1] to the unsigned integer range
def : Pat <
(AMDGPUurecip i32:$src0),
@@ -2427,12 +2728,16 @@ def : Pat <
(V_RCP_IFLAG_F32_e32 (V_CVT_F32_U32_e32 $src0))))
>;
+let Predicates = [isSICI] in {
+
def : Pat <
(int_SI_tid),
(V_MBCNT_HI_U32_B32_e32 0xffffffff,
- (V_MBCNT_LO_U32_B32_e64 0xffffffff, 0, 0, 0))
+ (V_MBCNT_LO_U32_B32_e64 0xffffffff, 0))
>;
+}
+
//===----------------------------------------------------------------------===//
// VOP3 Patterns
//===----------------------------------------------------------------------===//
@@ -2441,84 +2746,74 @@ def : IMad24Pat<V_MAD_I32_I24>;
def : UMad24Pat<V_MAD_U32_U24>;
def : Pat <
- (fadd f64:$src0, f64:$src1),
- (V_ADD_F64 $src0, $src1, (i64 0))
->;
-
-def : Pat <
- (fmul f64:$src0, f64:$src1),
- (V_MUL_F64 $src0, $src1, (i64 0))
->;
-
-def : Pat <
- (mul i32:$src0, i32:$src1),
- (V_MUL_LO_I32 $src0, $src1, (i32 0))
->;
-
-def : Pat <
(mulhu i32:$src0, i32:$src1),
- (V_MUL_HI_U32 $src0, $src1, (i32 0))
+ (V_MUL_HI_U32 $src0, $src1)
>;
def : Pat <
(mulhs i32:$src0, i32:$src1),
- (V_MUL_HI_I32 $src0, $src1, (i32 0))
+ (V_MUL_HI_I32 $src0, $src1)
>;
-defm : BFIPatterns <V_BFI_B32, S_MOV_B32>;
+def : Vop3ModPat<V_MAD_F32, VOP_F32_F32_F32_F32, AMDGPUmad>;
+
+
+defm : BFIPatterns <V_BFI_B32, S_MOV_B32, SReg_64>;
def : ROTRPattern <V_ALIGNBIT_B32>;
/********** ======================= **********/
/********** Load/Store Patterns **********/
/********** ======================= **********/
-multiclass DSReadPat <DS inst, ValueType vt, PatFrag frag> {
- def : Pat <
- (vt (frag (add i32:$ptr, (i32 IMM16bit:$offset)))),
- (inst (i1 0), $ptr, (as_i16imm $offset))
- >;
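+// The trailing (S_MOV_B32 -1) operand on the DS patterns below fills the new
+// m0 operand, setting the LDS address limit to the maximum value; this takes
+// over from the InitM0ForLDS helper removed from SILowerControlFlow later in
+// this patch.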
+class DSReadPat <DS inst, ValueType vt, PatFrag frag> : Pat <
+ (vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset))),
+ (inst (i1 0), $ptr, (as_i16imm $offset), (S_MOV_B32 -1))
+>;
- def : Pat <
- (frag i32:$src0),
- (vt (inst 0, $src0, 0))
- >;
-}
+def : DSReadPat <DS_READ_I8, i32, sextloadi8_local>;
+def : DSReadPat <DS_READ_U8, i32, az_extloadi8_local>;
+def : DSReadPat <DS_READ_I16, i32, sextloadi16_local>;
+def : DSReadPat <DS_READ_U16, i32, az_extloadi16_local>;
+def : DSReadPat <DS_READ_B32, i32, local_load>;
-defm : DSReadPat <DS_READ_I8, i32, sextloadi8_local>;
-defm : DSReadPat <DS_READ_U8, i32, az_extloadi8_local>;
-defm : DSReadPat <DS_READ_I16, i32, sextloadi16_local>;
-defm : DSReadPat <DS_READ_U16, i32, az_extloadi16_local>;
-defm : DSReadPat <DS_READ_B32, i32, local_load>;
-defm : DSReadPat <DS_READ_B64, v2i32, local_load>;
+let AddedComplexity = 100 in {
-multiclass DSWritePat <DS inst, ValueType vt, PatFrag frag> {
- def : Pat <
- (frag vt:$value, (add i32:$ptr, (i32 IMM16bit:$offset))),
- (inst (i1 0), $ptr, $value, (as_i16imm $offset))
- >;
+def : DSReadPat <DS_READ_B64, v2i32, local_load_aligned8bytes>;
- def : Pat <
- (frag vt:$val, i32:$ptr),
- (inst 0, $ptr, $val, 0)
- >;
-}
+} // End AddedComplexity = 100
-defm : DSWritePat <DS_WRITE_B8, i32, truncstorei8_local>;
-defm : DSWritePat <DS_WRITE_B16, i32, truncstorei16_local>;
-defm : DSWritePat <DS_WRITE_B32, i32, local_store>;
-defm : DSWritePat <DS_WRITE_B64, v2i32, local_store>;
+def : Pat <
+ (v2i32 (local_load (DS64Bit4ByteAligned i32:$ptr, i8:$offset0,
+ i8:$offset1))),
+ (DS_READ2_B32 (i1 0), $ptr, $offset0, $offset1, (S_MOV_B32 -1))
+>;
-multiclass DSAtomicRetPat<DS inst, ValueType vt, PatFrag frag> {
- def : Pat <
- (frag (add i32:$ptr, (i32 IMM16bit:$offset)), vt:$value),
- (inst (i1 0), $ptr, $value, (as_i16imm $offset))
- >;
+class DSWritePat <DS inst, ValueType vt, PatFrag frag> : Pat <
+ (frag vt:$value, (DS1Addr1Offset i32:$ptr, i32:$offset)),
+ (inst (i1 0), $ptr, $value, (as_i16imm $offset), (S_MOV_B32 -1))
+>;
- def : Pat <
- (frag i32:$ptr, vt:$val),
- (inst 0, $ptr, $val, 0)
- >;
-}
+def : DSWritePat <DS_WRITE_B8, i32, truncstorei8_local>;
+def : DSWritePat <DS_WRITE_B16, i32, truncstorei16_local>;
+def : DSWritePat <DS_WRITE_B32, i32, local_store>;
+
+let AddedComplexity = 100 in {
+
+def : DSWritePat <DS_WRITE_B64, v2i32, local_store_aligned8bytes>;
+} // End AddedComplexity = 100
+
+def : Pat <
+ (local_store v2i32:$value, (DS64Bit4ByteAligned i32:$ptr, i8:$offset0,
+ i8:$offset1)),
+ (DS_WRITE2_B32 (i1 0), $ptr, (EXTRACT_SUBREG $value, sub0),
+ (EXTRACT_SUBREG $value, sub1), $offset0, $offset1,
+ (S_MOV_B32 -1))
+>;
+
+class DSAtomicRetPat<DS inst, ValueType vt, PatFrag frag> : Pat <
+ (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$value),
+ (inst (i1 0), $ptr, $value, (as_i16imm $offset), (S_MOV_B32 -1))
+>;
// Special case of DSAtomicRetPat for add / sub 1 -> inc / dec
//
@@ -2530,69 +2825,56 @@ multiclass DSAtomicRetPat<DS inst, ValueType vt, PatFrag frag> {
// We also load this -1 with s_mov_b32 / s_mov_b64 even though this
// needs to be a VGPR. The SGPR copy pass will fix this, and it's
// easier since there is no v_mov_b64.
-multiclass DSAtomicIncRetPat<DS inst, ValueType vt,
- Instruction LoadImm, PatFrag frag> {
- def : Pat <
- (frag (add i32:$ptr, (i32 IMM16bit:$offset)), (vt 1)),
- (inst (i1 0), $ptr, (LoadImm (vt -1)), (as_i16imm $offset))
- >;
-
- def : Pat <
- (frag i32:$ptr, (vt 1)),
- (inst 0, $ptr, (LoadImm (vt -1)), 0)
- >;
-}
+class DSAtomicIncRetPat<DS inst, ValueType vt,
+ Instruction LoadImm, PatFrag frag> : Pat <
+ (frag (DS1Addr1Offset i32:$ptr, i32:$offset), (vt 1)),
+ (inst (i1 0), $ptr, (LoadImm (vt -1)), (as_i16imm $offset), (S_MOV_B32 -1))
+>;
-multiclass DSAtomicCmpXChg <DS inst, ValueType vt, PatFrag frag> {
- def : Pat <
- (frag (add i32:$ptr, (i32 IMM16bit:$offset)), vt:$cmp, vt:$swap),
- (inst (i1 0), $ptr, $cmp, $swap, (as_i16imm $offset))
- >;
- def : Pat <
- (frag i32:$ptr, vt:$cmp, vt:$swap),
- (inst 0, $ptr, $cmp, $swap, 0)
- >;
-}
+class DSAtomicCmpXChg <DS inst, ValueType vt, PatFrag frag> : Pat <
+ (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$cmp, vt:$swap),
+ (inst (i1 0), $ptr, $cmp, $swap, (as_i16imm $offset), (S_MOV_B32 -1))
+>;
// 32-bit atomics.
-defm : DSAtomicIncRetPat<DS_INC_RTN_U32, i32,
- S_MOV_B32, atomic_load_add_local>;
-defm : DSAtomicIncRetPat<DS_DEC_RTN_U32, i32,
- S_MOV_B32, atomic_load_sub_local>;
-
-defm : DSAtomicRetPat<DS_WRXCHG_RTN_B32, i32, atomic_swap_local>;
-defm : DSAtomicRetPat<DS_ADD_RTN_U32, i32, atomic_load_add_local>;
-defm : DSAtomicRetPat<DS_SUB_RTN_U32, i32, atomic_load_sub_local>;
-defm : DSAtomicRetPat<DS_AND_RTN_B32, i32, atomic_load_and_local>;
-defm : DSAtomicRetPat<DS_OR_RTN_B32, i32, atomic_load_or_local>;
-defm : DSAtomicRetPat<DS_XOR_RTN_B32, i32, atomic_load_xor_local>;
-defm : DSAtomicRetPat<DS_MIN_RTN_I32, i32, atomic_load_min_local>;
-defm : DSAtomicRetPat<DS_MAX_RTN_I32, i32, atomic_load_max_local>;
-defm : DSAtomicRetPat<DS_MIN_RTN_U32, i32, atomic_load_umin_local>;
-defm : DSAtomicRetPat<DS_MAX_RTN_U32, i32, atomic_load_umax_local>;
-
-defm : DSAtomicCmpXChg<DS_CMPST_RTN_B32, i32, atomic_cmp_swap_32_local>;
+def : DSAtomicIncRetPat<DS_INC_RTN_U32, i32,
+ S_MOV_B32, atomic_load_add_local>;
+def : DSAtomicIncRetPat<DS_DEC_RTN_U32, i32,
+ S_MOV_B32, atomic_load_sub_local>;
+
+def : DSAtomicRetPat<DS_WRXCHG_RTN_B32, i32, atomic_swap_local>;
+def : DSAtomicRetPat<DS_ADD_RTN_U32, i32, atomic_load_add_local>;
+def : DSAtomicRetPat<DS_SUB_RTN_U32, i32, atomic_load_sub_local>;
+def : DSAtomicRetPat<DS_AND_RTN_B32, i32, atomic_load_and_local>;
+def : DSAtomicRetPat<DS_OR_RTN_B32, i32, atomic_load_or_local>;
+def : DSAtomicRetPat<DS_XOR_RTN_B32, i32, atomic_load_xor_local>;
+def : DSAtomicRetPat<DS_MIN_RTN_I32, i32, atomic_load_min_local>;
+def : DSAtomicRetPat<DS_MAX_RTN_I32, i32, atomic_load_max_local>;
+def : DSAtomicRetPat<DS_MIN_RTN_U32, i32, atomic_load_umin_local>;
+def : DSAtomicRetPat<DS_MAX_RTN_U32, i32, atomic_load_umax_local>;
+
+def : DSAtomicCmpXChg<DS_CMPST_RTN_B32, i32, atomic_cmp_swap_32_local>;
// 64-bit atomics.
-defm : DSAtomicIncRetPat<DS_INC_RTN_U64, i64,
- S_MOV_B64, atomic_load_add_local>;
-defm : DSAtomicIncRetPat<DS_DEC_RTN_U64, i64,
- S_MOV_B64, atomic_load_sub_local>;
+def : DSAtomicIncRetPat<DS_INC_RTN_U64, i64,
+ S_MOV_B64, atomic_load_add_local>;
+def : DSAtomicIncRetPat<DS_DEC_RTN_U64, i64,
+ S_MOV_B64, atomic_load_sub_local>;
-defm : DSAtomicRetPat<DS_WRXCHG_RTN_B64, i64, atomic_swap_local>;
-defm : DSAtomicRetPat<DS_ADD_RTN_U64, i64, atomic_load_add_local>;
-defm : DSAtomicRetPat<DS_SUB_RTN_U64, i64, atomic_load_sub_local>;
-defm : DSAtomicRetPat<DS_AND_RTN_B64, i64, atomic_load_and_local>;
-defm : DSAtomicRetPat<DS_OR_RTN_B64, i64, atomic_load_or_local>;
-defm : DSAtomicRetPat<DS_XOR_RTN_B64, i64, atomic_load_xor_local>;
-defm : DSAtomicRetPat<DS_MIN_RTN_I64, i64, atomic_load_min_local>;
-defm : DSAtomicRetPat<DS_MAX_RTN_I64, i64, atomic_load_max_local>;
-defm : DSAtomicRetPat<DS_MIN_RTN_U64, i64, atomic_load_umin_local>;
-defm : DSAtomicRetPat<DS_MAX_RTN_U64, i64, atomic_load_umax_local>;
+def : DSAtomicRetPat<DS_WRXCHG_RTN_B64, i64, atomic_swap_local>;
+def : DSAtomicRetPat<DS_ADD_RTN_U64, i64, atomic_load_add_local>;
+def : DSAtomicRetPat<DS_SUB_RTN_U64, i64, atomic_load_sub_local>;
+def : DSAtomicRetPat<DS_AND_RTN_B64, i64, atomic_load_and_local>;
+def : DSAtomicRetPat<DS_OR_RTN_B64, i64, atomic_load_or_local>;
+def : DSAtomicRetPat<DS_XOR_RTN_B64, i64, atomic_load_xor_local>;
+def : DSAtomicRetPat<DS_MIN_RTN_I64, i64, atomic_load_min_local>;
+def : DSAtomicRetPat<DS_MAX_RTN_I64, i64, atomic_load_max_local>;
+def : DSAtomicRetPat<DS_MIN_RTN_U64, i64, atomic_load_umin_local>;
+def : DSAtomicRetPat<DS_MAX_RTN_U64, i64, atomic_load_umax_local>;
-defm : DSAtomicCmpXChg<DS_CMPST_RTN_B64, i64, atomic_cmp_swap_64_local>;
+def : DSAtomicCmpXChg<DS_CMPST_RTN_B64, i64, atomic_cmp_swap_64_local>;
//===----------------------------------------------------------------------===//
@@ -2602,12 +2884,12 @@ defm : DSAtomicCmpXChg<DS_CMPST_RTN_B64, i64, atomic_cmp_swap_64_local>;
multiclass MUBUFLoad_Pattern <MUBUF Instr_ADDR64, ValueType vt,
PatFrag constant_ld> {
def : Pat <
- (vt (constant_ld (add i64:$ptr, i64:$offset))),
- (Instr_ADDR64 (SI_ADDR64_RSRC $ptr), $offset, 0)
+ (vt (constant_ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i16:$offset))),
+ (Instr_ADDR64 $srsrc, $vaddr, $offset)
>;
-
}
+let Predicates = [isSICI] in {
defm : MUBUFLoad_Pattern <BUFFER_LOAD_SBYTE_ADDR64, i32, sextloadi8_constant>;
defm : MUBUFLoad_Pattern <BUFFER_LOAD_UBYTE_ADDR64, i32, az_extloadi8_constant>;
defm : MUBUFLoad_Pattern <BUFFER_LOAD_SSHORT_ADDR64, i32, sextloadi16_constant>;
@@ -2615,6 +2897,7 @@ defm : MUBUFLoad_Pattern <BUFFER_LOAD_USHORT_ADDR64, i32, az_extloadi16_constant
defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORD_ADDR64, i32, constant_load>;
defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORDX2_ADDR64, v2i32, constant_load>;
defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORDX4_ADDR64, v4i32, constant_load>;
+} // End Predicates = [isSICI]
class MUBUFScratchLoadPat <MUBUF Instr, ValueType vt, PatFrag ld> : Pat <
(vt (ld (MUBUFScratch v4i32:$srsrc, i32:$vaddr,
@@ -2622,6 +2905,7 @@ class MUBUFScratchLoadPat <MUBUF Instr, ValueType vt, PatFrag ld> : Pat <
(Instr $srsrc, $vaddr, $soffset, $offset, 0, 0, 0)
>;
+let Predicates = [isSICI] in {
def : MUBUFScratchLoadPat <BUFFER_LOAD_SBYTE_OFFEN, i32, sextloadi8_private>;
def : MUBUFScratchLoadPat <BUFFER_LOAD_UBYTE_OFFEN, i32, extloadi8_private>;
def : MUBUFScratchLoadPat <BUFFER_LOAD_SSHORT_OFFEN, i32, sextloadi16_private>;
@@ -2629,6 +2913,7 @@ def : MUBUFScratchLoadPat <BUFFER_LOAD_USHORT_OFFEN, i32, extloadi16_private>;
def : MUBUFScratchLoadPat <BUFFER_LOAD_DWORD_OFFEN, i32, load_private>;
def : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX2_OFFEN, v2i32, load_private>;
def : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX4_OFFEN, v4i32, load_private>;
+} // End Predicates = [isSICI]
// BUFFER_LOAD_DWORD*, addr64=0
multiclass MUBUF_Load_Dword <ValueType vt, MUBUF offset, MUBUF offen, MUBUF idxen,
@@ -2667,26 +2952,28 @@ multiclass MUBUF_Load_Dword <ValueType vt, MUBUF offset, MUBUF offen, MUBUF idxe
>;
}
+let Predicates = [isSICI] in {
defm : MUBUF_Load_Dword <i32, BUFFER_LOAD_DWORD_OFFSET, BUFFER_LOAD_DWORD_OFFEN,
BUFFER_LOAD_DWORD_IDXEN, BUFFER_LOAD_DWORD_BOTHEN>;
defm : MUBUF_Load_Dword <v2i32, BUFFER_LOAD_DWORDX2_OFFSET, BUFFER_LOAD_DWORDX2_OFFEN,
BUFFER_LOAD_DWORDX2_IDXEN, BUFFER_LOAD_DWORDX2_BOTHEN>;
defm : MUBUF_Load_Dword <v4i32, BUFFER_LOAD_DWORDX4_OFFSET, BUFFER_LOAD_DWORDX4_OFFEN,
BUFFER_LOAD_DWORDX4_IDXEN, BUFFER_LOAD_DWORDX4_BOTHEN>;
+} // End Predicates = [isSICI]
class MUBUFScratchStorePat <MUBUF Instr, ValueType vt, PatFrag st> : Pat <
- (st vt:$value, (MUBUFAddr32 v4i32:$srsrc, i32:$vaddr, i32:$soffset,
- u16imm:$offset, i1imm:$offen, i1imm:$idxen,
- i1imm:$glc, i1imm:$slc, i1imm:$tfe)),
- (Instr $value, $srsrc, $vaddr, $soffset, $offset, $offen, $idxen,
- $glc, $slc, $tfe)
+ (st vt:$value, (MUBUFScratch v4i32:$srsrc, i32:$vaddr, i32:$soffset,
+ u16imm:$offset)),
+ (Instr $value, $srsrc, $vaddr, $soffset, $offset, 0, 0, 0)
>;
-def : MUBUFScratchStorePat <BUFFER_STORE_BYTE, i32, truncstorei8_private>;
-def : MUBUFScratchStorePat <BUFFER_STORE_SHORT, i32, truncstorei16_private>;
-def : MUBUFScratchStorePat <BUFFER_STORE_DWORD, i32, store_private>;
-def : MUBUFScratchStorePat <BUFFER_STORE_DWORDX2, v2i32, store_private>;
-def : MUBUFScratchStorePat <BUFFER_STORE_DWORDX4, v4i32, store_private>;
+let Predicates = [isSICI] in {
+def : MUBUFScratchStorePat <BUFFER_STORE_BYTE_OFFEN, i32, truncstorei8_private>;
+def : MUBUFScratchStorePat <BUFFER_STORE_SHORT_OFFEN, i32, truncstorei16_private>;
+def : MUBUFScratchStorePat <BUFFER_STORE_DWORD_OFFEN, i32, store_private>;
+def : MUBUFScratchStorePat <BUFFER_STORE_DWORDX2_OFFEN, v2i32, store_private>;
+def : MUBUFScratchStorePat <BUFFER_STORE_DWORDX4_OFFEN, v4i32, store_private>;
+} // End Predicates = [isSICI]
/*
class MUBUFStore_Pattern <MUBUF Instr, ValueType vt, PatFrag st> : Pat <
@@ -2694,11 +2981,13 @@ class MUBUFStore_Pattern <MUBUF Instr, ValueType vt, PatFrag st> : Pat <
(Instr $value, $srsrc, $vaddr, $offset)
>;
+let Predicates = [isSICI] in {
def : MUBUFStore_Pattern <BUFFER_STORE_BYTE_ADDR64, i32, truncstorei8_private>;
def : MUBUFStore_Pattern <BUFFER_STORE_SHORT_ADDR64, i32, truncstorei16_private>;
def : MUBUFStore_Pattern <BUFFER_STORE_DWORD_ADDR64, i32, store_private>;
def : MUBUFStore_Pattern <BUFFER_STORE_DWORDX2_ADDR64, v2i32, store_private>;
def : MUBUFStore_Pattern <BUFFER_STORE_DWORDX4_ADDR64, v4i32, store_private>;
+} // End Predicates = [isSICI]
*/
@@ -2725,29 +3014,26 @@ def : MTBUF_StoreResource <v4i32, 4, TBUFFER_STORE_FORMAT_XYZW>;
let SubtargetPredicate = isCI in {
-// Sea island new arithmetic instructinos
-let neverHasSideEffects = 1 in {
-defm V_TRUNC_F64 : VOP1_64 <0x00000017, "V_TRUNC_F64",
- [(set f64:$dst, (ftrunc f64:$src0))]
->;
-defm V_CEIL_F64 : VOP1_64 <0x00000018, "V_CEIL_F64",
- [(set f64:$dst, (fceil f64:$src0))]
+defm V_QSAD_PK_U16_U8 : VOP3Inst <vop3<0x173>, "v_qsad_pk_u16_u8",
+ VOP_I32_I32_I32
>;
-defm V_FLOOR_F64 : VOP1_64 <0x0000001A, "V_FLOOR_F64",
- [(set f64:$dst, (ffloor f64:$src0))]
+defm V_MQSAD_U16_U8 : VOP3Inst <vop3<0x172>, "v_mqsad_u16_u8",
+ VOP_I32_I32_I32
>;
-defm V_RNDNE_F64 : VOP1_64 <0x00000019, "V_RNDNE_F64",
- [(set f64:$dst, (frint f64:$src0))]
+defm V_MQSAD_U32_U8 : VOP3Inst <vop3<0x175>, "v_mqsad_u32_u8",
+ VOP_I32_I32_I32
>;
-defm V_QSAD_PK_U16_U8 : VOP3_32 <0x00000173, "V_QSAD_PK_U16_U8", []>;
-defm V_MQSAD_U16_U8 : VOP3_32 <0x000000172, "V_MQSAD_U16_U8", []>;
-defm V_MQSAD_U32_U8 : VOP3_32 <0x00000175, "V_MQSAD_U32_U8", []>;
-def V_MAD_U64_U32 : VOP3_64 <0x00000176, "V_MAD_U64_U32", []>;
+let isCommutable = 1 in {
+defm V_MAD_U64_U32 : VOP3Inst <vop3<0x176>, "v_mad_u64_u32",
+ VOP_I64_I32_I32_I64
+>;
// XXX - Does this set VCC?
-def V_MAD_I64_I32 : VOP3_64 <0x00000177, "V_MAD_I64_I32", []>;
-} // End neverHasSideEffects = 1
+defm V_MAD_I64_I32 : VOP3Inst <vop3<0x177>, "v_mad_i64_i32",
+ VOP_I64_I32_I32_I64
+>;
+} // End isCommutable = 1
// Remaining instructions:
// FLAT_*
@@ -2756,8 +3042,6 @@ def V_MAD_I64_I32 : VOP3_64 <0x00000177, "V_MAD_I64_I32", []>;
// S_CBRANCH_CDBGSYS_OR_USER
// S_CBRANCH_CDBGSYS_AND_USER
// S_DCACHE_INV_VOL
-// V_EXP_LEGACY_F32
-// V_LOG_LEGACY_F32
// DS_NOP
// DS_GWS_SEMA_RELEASE_ALL
// DS_WRAP_RTN_B32
@@ -2770,8 +3054,39 @@ def V_MAD_I64_I32 : VOP3_64 <0x00000177, "V_MAD_I64_I32", []>;
// BUFFER_LOAD_DWORDX3
// BUFFER_STORE_DWORDX3
-} // End iSCI
+} // End isCI
+
+//===----------------------------------------------------------------------===//
+// Flat Patterns
+//===----------------------------------------------------------------------===//
+
+class FLATLoad_Pattern <FLAT Instr_ADDR64, ValueType vt,
+ PatFrag flat_ld> :
+ Pat <(vt (flat_ld i64:$ptr)),
+ (Instr_ADDR64 $ptr)
+>;
+
+def : FLATLoad_Pattern <FLAT_LOAD_SBYTE, i32, sextloadi8_flat>;
+def : FLATLoad_Pattern <FLAT_LOAD_UBYTE, i32, az_extloadi8_flat>;
+def : FLATLoad_Pattern <FLAT_LOAD_SSHORT, i32, sextloadi16_flat>;
+def : FLATLoad_Pattern <FLAT_LOAD_USHORT, i32, az_extloadi16_flat>;
+def : FLATLoad_Pattern <FLAT_LOAD_DWORD, i32, flat_load>;
+def : FLATLoad_Pattern <FLAT_LOAD_DWORDX2, i64, flat_load>;
+def : FLATLoad_Pattern <FLAT_LOAD_DWORDX2, i64, az_extloadi32_flat>;
+def : FLATLoad_Pattern <FLAT_LOAD_DWORDX2, v2i32, flat_load>;
+def : FLATLoad_Pattern <FLAT_LOAD_DWORDX4, v4i32, flat_load>;
+
+class FLATStore_Pattern <FLAT Instr, ValueType vt, PatFrag st> :
+ Pat <(st vt:$value, i64:$ptr),
+ (Instr $value, $ptr)
+ >;
+def : FLATStore_Pattern <FLAT_STORE_BYTE, i32, truncstorei8_flat>;
+def : FLATStore_Pattern <FLAT_STORE_SHORT, i32, truncstorei16_flat>;
+def : FLATStore_Pattern <FLAT_STORE_DWORD, i32, flat_store>;
+def : FLATStore_Pattern <FLAT_STORE_DWORDX2, i64, flat_store>;
+def : FLATStore_Pattern <FLAT_STORE_DWORDX2, v2i32, flat_store>;
+def : FLATStore_Pattern <FLAT_STORE_DWORDX4, v4i32, flat_store>;
/********** ====================== **********/
/********** Indirect addressing **********/
@@ -2821,44 +3136,37 @@ defm : SI_INDIRECT_Pattern <v16i32, i32, SI_INDIRECT_DST_V16>;
def : Pat<(i32 (sext_inreg i32:$src, i1)),
(S_BFE_I32 i32:$src, 65536)>; // 0 | 1 << 16
-// TODO: Match 64-bit BFE. SI has a 64-bit BFE, but it's scalar only so it
-// might not be worth the effort, and will need to expand to shifts when
-// fixing SGPR copies.
-
// Handle sext_inreg in i64
def : Pat <
(i64 (sext_inreg i64:$src, i1)),
- (INSERT_SUBREG (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
- (S_BFE_I32 (EXTRACT_SUBREG i64:$src, sub0), 65536), sub0), // 0 | 1 << 16
- (S_MOV_B32 -1), sub1)
+ (S_BFE_I64 i64:$src, 0x10000) // 0 | 1 << 16
>;
def : Pat <
(i64 (sext_inreg i64:$src, i8)),
- (INSERT_SUBREG (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
- (S_SEXT_I32_I8 (EXTRACT_SUBREG i64:$src, sub0)), sub0),
- (S_MOV_B32 -1), sub1)
+ (S_BFE_I64 i64:$src, 0x80000) // 0 | 8 << 16
>;
def : Pat <
(i64 (sext_inreg i64:$src, i16)),
- (INSERT_SUBREG (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
- (S_SEXT_I32_I16 (EXTRACT_SUBREG i64:$src, sub0)), sub0),
- (S_MOV_B32 -1), sub1)
+ (S_BFE_I64 i64:$src, 0x100000) // 0 | 16 << 16
+>;
+
+def : Pat <
+ (i64 (sext_inreg i64:$src, i32)),
+ (S_BFE_I64 i64:$src, 0x200000) // 0 | 32 << 16
>;
class ZExt_i64_i32_Pat <SDNode ext> : Pat <
(i64 (ext i32:$src)),
- (INSERT_SUBREG (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $src, sub0),
- (S_MOV_B32 0), sub1)
+ (REG_SEQUENCE SReg_64, $src, sub0, (S_MOV_B32 0), sub1)
>;
class ZExt_i64_i1_Pat <SDNode ext> : Pat <
(i64 (ext i1:$src)),
- (INSERT_SUBREG
- (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
- (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src), sub0),
- (S_MOV_B32 0), sub1)
+ (REG_SEQUENCE VReg_64,
+ (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src), sub0,
+ (S_MOV_B32 0), sub1)
>;
@@ -2869,20 +3177,38 @@ def : ZExt_i64_i1_Pat<anyext>;
def : Pat <
(i64 (sext i32:$src)),
- (INSERT_SUBREG
- (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $src, sub0),
- (S_ASHR_I32 $src, 31), sub1)
+ (REG_SEQUENCE SReg_64, $src, sub0,
+ (S_ASHR_I32 $src, 31), sub1)
>;
def : Pat <
(i64 (sext i1:$src)),
- (INSERT_SUBREG
- (INSERT_SUBREG
- (i64 (IMPLICIT_DEF)),
- (V_CNDMASK_B32_e64 0, -1, $src), sub0),
+ (REG_SEQUENCE VReg_64,
+ (V_CNDMASK_B32_e64 0, -1, $src), sub0,
(V_CNDMASK_B32_e64 0, -1, $src), sub1)
>;
+// If we need to perform a logical operation on i1 values, we need to
+// use vector comparisons since there is only one SCC register. Vector
+// comparisons still write to a pair of SGPRs, so treat these as
+// 64-bit comparisons. When legalizing SGPR copies, instructions
+// resulting in the copies from SCC to these instructions will be
+// moved to the VALU.
+def : Pat <
+ (i1 (and i1:$src0, i1:$src1)),
+ (S_AND_B64 $src0, $src1)
+>;
+
+def : Pat <
+ (i1 (or i1:$src0, i1:$src1)),
+ (S_OR_B64 $src0, $src1)
+>;
+
+def : Pat <
+ (i1 (xor i1:$src0, i1:$src1)),
+ (S_XOR_B64 $src0, $src1)
+>;
+
def : Pat <
(f32 (sint_to_fp i1:$src)),
(V_CNDMASK_B32_e64 (i32 0), CONST.FP32_NEG_ONE, $src)
@@ -2895,7 +3221,7 @@ def : Pat <
def : Pat <
(f64 (sint_to_fp i1:$src)),
- (V_CVT_F64_I32_e32 (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src))
+ (V_CVT_F64_I32_e32 (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src))
>;
def : Pat <
@@ -2914,13 +3240,25 @@ def : Pat <
def : Pat <
(i1 (trunc i32:$a)),
- (V_CMP_EQ_I32_e64 (V_AND_B32_e32 (i32 1), $a), 1)
+ (V_CMP_EQ_I32_e64 (V_AND_B32_e64 (i32 1), $a), 1)
+>;
+
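+// bswap expansion: v_alignbit_b32 $a, $a, 24 rotates the value right by 24
+// bits and v_alignbit_b32 $a, $a, 8 rotates it right by 8; v_bfi_b32 with the
+// mask 0x00ff00ff then takes bytes 0 and 2 from the first rotation and bytes
+// 1 and 3 from the second, producing the byte-reversed result.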
+def : Pat <
+ (i32 (bswap i32:$a)),
+ (V_BFI_B32 (S_MOV_B32 0x00ff00ff),
+ (V_ALIGNBIT_B32 $a, $a, 24),
+ (V_ALIGNBIT_B32 $a, $a, 8))
+>;
+
+def : Pat <
+ (f32 (select i1:$src2, f32:$src1, f32:$src0)),
+ (V_CNDMASK_B32_e64 $src0, $src1, $src2)
>;
//============================================================================//
// Miscellaneous Optimization Patterns
//============================================================================//
-def : SHA256MaPattern <V_BFI_B32, V_XOR_B32_e32>;
+def : SHA256MaPattern <V_BFI_B32, V_XOR_B32_e64>;
-} // End isSI predicate
+} // End isGCN predicate
diff --git a/lib/Target/R600/SILoadStoreOptimizer.cpp b/lib/Target/R600/SILoadStoreOptimizer.cpp
new file mode 100644
index 000000000000..0cb67465012d
--- /dev/null
+++ b/lib/Target/R600/SILoadStoreOptimizer.cpp
@@ -0,0 +1,434 @@
+//===-- SILoadStoreOptimizer.cpp ------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass tries to fuse DS instructions with nearby immediate offsets.
+// This will fuse operations such as
+// ds_read_b32 v0, v2 offset:16
+// ds_read_b32 v1, v2 offset:32
+// ==>
+// ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
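+// (The read2 offsets are encoded in units of the element size, so the byte
+// offsets 16 and 32 above become offset0:4 and offset1:8 for b32.)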
+//
+//
+// Future improvements:
+//
+// - This currently relies on the scheduler to place loads and stores next to
+// each other, and then only merges adjacent pairs of instructions. It would
+// be good to be more flexible with interleaved instructions, and possibly run
+// before scheduling. It currently misses stores of constants because loading
+// the constant into the data register is placed between the stores, although
+// this is arguably a scheduling problem.
+//
+// - Live interval recomputing seems inefficient. This currently only matches
+// one pair, recomputes live intervals, and moves on to the next pair. It
+// would be better to compute a list of all merges that need to occur.
+//
+// - With a list of instructions to process, we can also merge more. If a
+// cluster of loads has offsets that are too large to fit in the 8-bit
+// offset fields, but are close enough to each other, we can add to the base
+// pointer and use the new reduced offsets.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "SIInstrInfo.h"
+#include "SIRegisterInfo.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "si-load-store-opt"
+
+namespace {
+
+class SILoadStoreOptimizer : public MachineFunctionPass {
+private:
+ const TargetMachine *TM;
+ const SIInstrInfo *TII;
+ const SIRegisterInfo *TRI;
+ MachineRegisterInfo *MRI;
+ LiveIntervals *LIS;
+
+
+ static bool offsetsCanBeCombined(unsigned Offset0,
+ unsigned Offset1,
+ unsigned EltSize);
+
+ MachineBasicBlock::iterator findMatchingDSInst(MachineBasicBlock::iterator I,
+ unsigned EltSize);
+
+ void updateRegDefsUses(unsigned SrcReg,
+ unsigned DstReg,
+ unsigned SubIdx);
+
+ MachineBasicBlock::iterator mergeRead2Pair(
+ MachineBasicBlock::iterator I,
+ MachineBasicBlock::iterator Paired,
+ unsigned EltSize);
+
+ MachineBasicBlock::iterator mergeWrite2Pair(
+ MachineBasicBlock::iterator I,
+ MachineBasicBlock::iterator Paired,
+ unsigned EltSize);
+
+public:
+ static char ID;
+
+ SILoadStoreOptimizer() :
+ MachineFunctionPass(ID),
+ TM(nullptr),
+ TII(nullptr),
+ TRI(nullptr),
+ MRI(nullptr),
+ LIS(nullptr) {
+
+ }
+
+ SILoadStoreOptimizer(const TargetMachine &TM_) :
+ MachineFunctionPass(ID),
+ TM(&TM_),
+ TII(static_cast<const SIInstrInfo*>(TM->getSubtargetImpl()->getInstrInfo())) {
+ initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool optimizeBlock(MachineBasicBlock &MBB);
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ const char *getPassName() const override {
+ return "SI Load / Store Optimizer";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addPreserved<SlotIndexes>();
+ AU.addPreserved<LiveIntervals>();
+ AU.addPreserved<LiveVariables>();
+ AU.addRequired<LiveIntervals>();
+
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
+ "SI Load / Store Optimizer", false, false)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_DEPENDENCY(LiveVariables)
+INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
+INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE,
+ "SI Load / Store Optimizer", false, false)
+
+char SILoadStoreOptimizer::ID = 0;
+
+char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;
+
+FunctionPass *llvm::createSILoadStoreOptimizerPass(TargetMachine &TM) {
+ return new SILoadStoreOptimizer(TM);
+}
+
+bool SILoadStoreOptimizer::offsetsCanBeCombined(unsigned Offset0,
+ unsigned Offset1,
+ unsigned Size) {
+ // XXX - Would the same offset be OK? Is there any reason this would happen or
+ // be useful?
+ if (Offset0 == Offset1)
+ return false;
+
+ // This won't be valid if the offset isn't aligned.
+ if ((Offset0 % Size != 0) || (Offset1 % Size != 0))
+ return false;
+
+ unsigned EltOffset0 = Offset0 / Size;
+ unsigned EltOffset1 = Offset1 / Size;
+
+ // Check if the new offsets fit in the reduced 8-bit range.
+ if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1))
+ return true;
+
+ // If the offset in elements doesn't fit in 8-bits, we might be able to use
+ // the stride 64 versions.
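+ // For example (hypothetical values): with Size == 4, byte offsets 0 and 4096
+ // give element offsets 0 and 1024; 1024 does not fit in 8 bits, but both are
+ // multiples of 64, so the st64 forms can encode them as 0 and 16.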
+ if ((EltOffset0 % 64 != 0) || (EltOffset1 % 64) != 0)
+ return false;
+
+ return isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64);
+}
+
+MachineBasicBlock::iterator
+SILoadStoreOptimizer::findMatchingDSInst(MachineBasicBlock::iterator I,
+ unsigned EltSize){
+ MachineBasicBlock::iterator E = I->getParent()->end();
+ MachineBasicBlock::iterator MBBI = I;
+ ++MBBI;
+
+ if (MBBI->getOpcode() != I->getOpcode())
+ return E;
+
+ // Don't merge volatiles.
+ if (MBBI->hasOrderedMemoryRef())
+ return E;
+
+ int AddrIdx = AMDGPU::getNamedOperandIdx(I->getOpcode(), AMDGPU::OpName::addr);
+ const MachineOperand &AddrReg0 = I->getOperand(AddrIdx);
+ const MachineOperand &AddrReg1 = MBBI->getOperand(AddrIdx);
+
+ // Check same base pointer. Be careful of subregisters, which can occur with
+ // vectors of pointers.
+ if (AddrReg0.getReg() == AddrReg1.getReg() &&
+ AddrReg0.getSubReg() == AddrReg1.getSubReg()) {
+ int OffsetIdx = AMDGPU::getNamedOperandIdx(I->getOpcode(),
+ AMDGPU::OpName::offset);
+ unsigned Offset0 = I->getOperand(OffsetIdx).getImm() & 0xffff;
+ unsigned Offset1 = MBBI->getOperand(OffsetIdx).getImm() & 0xffff;
+
+ // Check both offsets fit in the reduced range.
+ if (offsetsCanBeCombined(Offset0, Offset1, EltSize))
+ return MBBI;
+ }
+
+ return E;
+}
+
+void SILoadStoreOptimizer::updateRegDefsUses(unsigned SrcReg,
+ unsigned DstReg,
+ unsigned SubIdx) {
+ for (MachineRegisterInfo::reg_iterator I = MRI->reg_begin(SrcReg),
+ E = MRI->reg_end(); I != E; ) {
+ MachineOperand &O = *I;
+ ++I;
+ O.substVirtReg(DstReg, SubIdx, *TRI);
+ }
+}
+
+MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
+ MachineBasicBlock::iterator I,
+ MachineBasicBlock::iterator Paired,
+ unsigned EltSize) {
+ MachineBasicBlock *MBB = I->getParent();
+
+ // Be careful, since the addresses could be subregisters themselves in weird
+ // cases, like vectors of pointers.
+ const MachineOperand *AddrReg = TII->getNamedOperand(*I, AMDGPU::OpName::addr);
+ const MachineOperand *M0Reg = TII->getNamedOperand(*I, AMDGPU::OpName::m0);
+
+ unsigned DestReg0 = TII->getNamedOperand(*I, AMDGPU::OpName::vdst)->getReg();
+ unsigned DestReg1
+ = TII->getNamedOperand(*Paired, AMDGPU::OpName::vdst)->getReg();
+
+ unsigned Offset0
+ = TII->getNamedOperand(*I, AMDGPU::OpName::offset)->getImm() & 0xffff;
+ unsigned Offset1
+ = TII->getNamedOperand(*Paired, AMDGPU::OpName::offset)->getImm() & 0xffff;
+
+ unsigned NewOffset0 = Offset0 / EltSize;
+ unsigned NewOffset1 = Offset1 / EltSize;
+ unsigned Opc = (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
+
+ // Prefer the st64 form if we can use it, even if we can fit the offset in the
+ // non st64 version. I'm not sure if there's any real reason to do this.
+ bool UseST64 = (NewOffset0 % 64 == 0) && (NewOffset1 % 64 == 0);
+ if (UseST64) {
+ NewOffset0 /= 64;
+ NewOffset1 /= 64;
+ Opc = (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
+ }
+
+ assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
+ (NewOffset0 != NewOffset1) &&
+ "Computed offset doesn't fit");
+
+ const MCInstrDesc &Read2Desc = TII->get(Opc);
+
+ const TargetRegisterClass *SuperRC
+ = (EltSize == 4) ? &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass;
+ unsigned DestReg = MRI->createVirtualRegister(SuperRC);
+
+ DebugLoc DL = I->getDebugLoc();
+ MachineInstrBuilder Read2
+ = BuildMI(*MBB, I, DL, Read2Desc, DestReg)
+ .addImm(0) // gds
+ .addOperand(*AddrReg) // addr
+ .addImm(NewOffset0) // offset0
+ .addImm(NewOffset1) // offset1
+ .addOperand(*M0Reg) // M0
+ .addMemOperand(*I->memoperands_begin())
+ .addMemOperand(*Paired->memoperands_begin());
+
+ LIS->InsertMachineInstrInMaps(Read2);
+
+ unsigned SubRegIdx0 = (EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
+ unsigned SubRegIdx1 = (EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;
+ updateRegDefsUses(DestReg0, DestReg, SubRegIdx0);
+ updateRegDefsUses(DestReg1, DestReg, SubRegIdx1);
+
+ LIS->RemoveMachineInstrFromMaps(I);
+ LIS->RemoveMachineInstrFromMaps(Paired);
+ I->eraseFromParent();
+ Paired->eraseFromParent();
+
+ LiveInterval &AddrRegLI = LIS->getInterval(AddrReg->getReg());
+ LIS->shrinkToUses(&AddrRegLI);
+
+ LiveInterval &M0RegLI = LIS->getInterval(M0Reg->getReg());
+ LIS->shrinkToUses(&M0RegLI);
+
+ // Currently m0 is treated as a register class with one member instead of an
+ // implicit physical register. We are using the virtual register for the first
+ // one, but we still need to update the live range of the now unused second m0
+ // virtual register to avoid verifier errors.
+ const MachineOperand *PairedM0Reg
+ = TII->getNamedOperand(*Paired, AMDGPU::OpName::m0);
+ LiveInterval &PairedM0RegLI = LIS->getInterval(PairedM0Reg->getReg());
+ LIS->shrinkToUses(&PairedM0RegLI);
+
+ LIS->getInterval(DestReg); // Create new LI
+
+ DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
+ return Read2.getInstr();
+}
+
+MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
+ MachineBasicBlock::iterator I,
+ MachineBasicBlock::iterator Paired,
+ unsigned EltSize) {
+ MachineBasicBlock *MBB = I->getParent();
+
+ // Be sure to use .addOperand(), and not .addReg() with these. We want to be
+ // sure we preserve the subregister index and any register flags set on them.
+ const MachineOperand *Addr = TII->getNamedOperand(*I, AMDGPU::OpName::addr);
+ const MachineOperand *M0Reg = TII->getNamedOperand(*I, AMDGPU::OpName::m0);
+ const MachineOperand *Data0 = TII->getNamedOperand(*I, AMDGPU::OpName::data0);
+ const MachineOperand *Data1
+ = TII->getNamedOperand(*Paired, AMDGPU::OpName::data0);
+
+
+ unsigned Offset0
+ = TII->getNamedOperand(*I, AMDGPU::OpName::offset)->getImm() & 0xffff;
+ unsigned Offset1
+ = TII->getNamedOperand(*Paired, AMDGPU::OpName::offset)->getImm() & 0xffff;
+
+ unsigned NewOffset0 = Offset0 / EltSize;
+ unsigned NewOffset1 = Offset1 / EltSize;
+ unsigned Opc = (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
+
+ // Prefer the st64 form if we can use it, even if we can fit the offset in the
+ // non st64 version. I'm not sure if there's any real reason to do this.
+ bool UseST64 = (NewOffset0 % 64 == 0) && (NewOffset1 % 64 == 0);
+ if (UseST64) {
+ NewOffset0 /= 64;
+ NewOffset1 /= 64;
+ Opc = (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 : AMDGPU::DS_WRITE2ST64_B64;
+ }
+
+ assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
+ (NewOffset0 != NewOffset1) &&
+ "Computed offset doesn't fit");
+
+ const MCInstrDesc &Write2Desc = TII->get(Opc);
+ DebugLoc DL = I->getDebugLoc();
+
+ MachineInstrBuilder Write2
+ = BuildMI(*MBB, I, DL, Write2Desc)
+ .addImm(0) // gds
+ .addOperand(*Addr) // addr
+ .addOperand(*Data0) // data0
+ .addOperand(*Data1) // data1
+ .addImm(NewOffset0) // offset0
+ .addImm(NewOffset1) // offset1
+ .addOperand(*M0Reg) // m0
+ .addMemOperand(*I->memoperands_begin())
+ .addMemOperand(*Paired->memoperands_begin());
+
+ // XXX - How do we express subregisters here?
+ unsigned OrigRegs[] = { Data0->getReg(), Data1->getReg(), Addr->getReg(),
+ M0Reg->getReg()};
+
+ LIS->RemoveMachineInstrFromMaps(I);
+ LIS->RemoveMachineInstrFromMaps(Paired);
+ I->eraseFromParent();
+ Paired->eraseFromParent();
+
+ LIS->repairIntervalsInRange(MBB, Write2, Write2, OrigRegs);
+
+ DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
+ return Write2.getInstr();
+}
+
+// Scan through looking for adjacent LDS operations with constant offsets from
+// the same base register. We rely on the scheduler to do the hard work of
+// clustering nearby loads, and assume these are all adjacent.
+bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
+ bool Modified = false;
+
+ for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) {
+ MachineInstr &MI = *I;
+
+ // Don't combine if volatile.
+ if (MI.hasOrderedMemoryRef()) {
+ ++I;
+ continue;
+ }
+
+ unsigned Opc = MI.getOpcode();
+ if (Opc == AMDGPU::DS_READ_B32 || Opc == AMDGPU::DS_READ_B64) {
+ unsigned Size = (Opc == AMDGPU::DS_READ_B64) ? 8 : 4;
+ MachineBasicBlock::iterator Match = findMatchingDSInst(I, Size);
+ if (Match != E) {
+ Modified = true;
+ I = mergeRead2Pair(I, Match, Size);
+ } else {
+ ++I;
+ }
+
+ continue;
+ } else if (Opc == AMDGPU::DS_WRITE_B32 || Opc == AMDGPU::DS_WRITE_B64) {
+ unsigned Size = (Opc == AMDGPU::DS_WRITE_B64) ? 8 : 4;
+ MachineBasicBlock::iterator Match = findMatchingDSInst(I, Size);
+ if (Match != E) {
+ Modified = true;
+ I = mergeWrite2Pair(I, Match, Size);
+ } else {
+ ++I;
+ }
+
+ continue;
+ }
+
+ ++I;
+ }
+
+ return Modified;
+}
+
+bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
+ const TargetSubtargetInfo *STM = MF.getTarget().getSubtargetImpl();
+ TRI = static_cast<const SIRegisterInfo*>(STM->getRegisterInfo());
+ TII = static_cast<const SIInstrInfo*>(STM->getInstrInfo());
+ MRI = &MF.getRegInfo();
+
+ LIS = &getAnalysis<LiveIntervals>();
+
+ DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
+
+ assert(!MRI->isSSA());
+
+ bool Modified = false;
+
+ for (MachineBasicBlock &MBB : MF)
+ Modified |= optimizeBlock(MBB);
+
+ return Modified;
+}
diff --git a/lib/Target/R600/SILowerControlFlow.cpp b/lib/Target/R600/SILowerControlFlow.cpp
index 75b5a5e027ff..068b22f37704 100644
--- a/lib/Target/R600/SILowerControlFlow.cpp
+++ b/lib/Target/R600/SILowerControlFlow.cpp
@@ -49,8 +49,10 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -86,7 +88,6 @@ private:
void Kill(MachineInstr &MI);
void Branch(MachineInstr &MI);
- void InitM0ForLDS(MachineBasicBlock::iterator MI);
void LoadM0(MachineInstr &MI, MachineInstr *MovRel);
void IndirectSrc(MachineInstr &MI);
void IndirectDst(MachineInstr &MI);
@@ -307,10 +308,9 @@ void SILowerControlFlowPass::Kill(MachineInstr &MI) {
#endif
// Clear this thread from the exec mask if the operand is negative
- if ((Op.isImm() || Op.isFPImm())) {
+ if ((Op.isImm())) {
// Constant operand: Set exec mask to 0 or do nothing
- if (Op.isImm() ? (Op.getImm() & 0x80000000) :
- Op.getFPImm()->isNegative()) {
+ if (Op.getImm() & 0x80000000) {
BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
.addImm(0);
}
@@ -323,14 +323,6 @@ void SILowerControlFlowPass::Kill(MachineInstr &MI) {
MI.eraseFromParent();
}
-/// The m0 register stores the maximum allowable address for LDS reads and
-/// writes. Its value must be at least the size in bytes of LDS allocated by
-/// the shader. For simplicity, we set it to the maximum possible value.
-void SILowerControlFlowPass::InitM0ForLDS(MachineBasicBlock::iterator MI) {
- BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32),
- AMDGPU::M0).addImm(0xffffffff);
-}
-
void SILowerControlFlowPass::LoadM0(MachineInstr &MI, MachineInstr *MovRel) {
MachineBasicBlock &MBB = *MI.getParent();
@@ -347,7 +339,7 @@ void SILowerControlFlowPass::LoadM0(MachineInstr &MI, MachineInstr *MovRel) {
} else {
assert(AMDGPU::SReg_64RegClass.contains(Save));
- assert(AMDGPU::VReg_32RegClass.contains(Idx));
+ assert(AMDGPU::VGPR_32RegClass.contains(Idx));
// Save the EXEC mask
BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), Save)
@@ -389,12 +381,6 @@ void SILowerControlFlowPass::LoadM0(MachineInstr &MI, MachineInstr *MovRel) {
.addReg(Save);
}
- // FIXME: Are there any values other than the LDS address clamp that need to
- // be stored in the m0 register and may be live for more than a few
- // instructions? If so, we should save the m0 register at the beginning
- // of this function and restore it here.
- // FIXME: Add support for LDS direct loads.
- InitM0ForLDS(&MI);
MI.eraseFromParent();
}
@@ -442,13 +428,14 @@ void SILowerControlFlowPass::IndirectDst(MachineInstr &MI) {
}
bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
- TII = static_cast<const SIInstrInfo*>(MF.getTarget().getInstrInfo());
- TRI = static_cast<const SIRegisterInfo*>(MF.getTarget().getRegisterInfo());
+ TII = static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
+ TRI =
+ static_cast<const SIRegisterInfo *>(MF.getSubtarget().getRegisterInfo());
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
bool HaveKill = false;
- bool NeedM0 = false;
bool NeedWQM = false;
+ bool NeedFlat = false;
unsigned Depth = 0;
for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
@@ -460,10 +447,12 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
Next = std::next(I);
MachineInstr &MI = *I;
- if (TII->isDS(MI.getOpcode())) {
- NeedM0 = true;
+ if (TII->isDS(MI.getOpcode()))
NeedWQM = true;
- }
+
+ // Flat uses m0 in case it needs to access LDS.
+ if (TII->isFLAT(MI.getOpcode()))
+ NeedFlat = true;
switch (MI.getOpcode()) {
default: break;
@@ -530,23 +519,54 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
case AMDGPU::V_INTERP_MOV_F32:
NeedWQM = true;
break;
-
}
}
}
- if (NeedM0) {
- MachineBasicBlock &MBB = MF.front();
- // Initialize M0 to a value that won't cause LDS access to be discarded
- // due to offset clamping
- InitM0ForLDS(MBB.getFirstNonPHI());
- }
-
if (NeedWQM && MFI->getShaderType() == ShaderType::PIXEL) {
MachineBasicBlock &MBB = MF.front();
BuildMI(MBB, MBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
AMDGPU::EXEC).addReg(AMDGPU::EXEC);
}
+ // FIXME: This seems inappropriate to do here.
+ if (NeedFlat && MFI->IsKernel) {
+ // Insert the prologue initializing the SGPRs pointing to the scratch space
+ // for flat accesses.
+ const MachineFrameInfo *FrameInfo = MF.getFrameInfo();
+
+ // TODO: What to use with function calls?
+
+ // FIXME: This is reporting stack size that is used in a scratch buffer
+ // rather than registers as well.
+ uint64_t StackSizeBytes = FrameInfo->getStackSize();
+
+ int IndirectBegin
+ = static_cast<const AMDGPUInstrInfo*>(TII)->getIndirectIndexBegin(MF);
+ // Convert register index to 256-byte unit.
+ uint64_t StackOffset = IndirectBegin < 0 ? 0 : (4 * IndirectBegin / 256);
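+    // For example (illustrative): IndirectBegin == 64 corresponds to a byte
+    // offset of 4 * 64 == 256, i.e. one 256-byte unit, so StackOffset == 1.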
+
+ assert((StackSizeBytes < 0xffff) && StackOffset < 0xffff &&
+ "Stack limits should be smaller than 16-bits");
+
+ // Initialize the flat scratch register pair.
+ // TODO: Can we use one s_mov_b64 here?
+
+ // Offset is in units of 256-bytes.
+ MachineBasicBlock &MBB = MF.front();
+ DebugLoc NoDL;
+ MachineBasicBlock::iterator Start = MBB.getFirstNonPHI();
+ const MCInstrDesc &SMovK = TII->get(AMDGPU::S_MOVK_I32);
+
+ assert(isInt<16>(StackOffset) && isInt<16>(StackSizeBytes));
+
+ BuildMI(MBB, Start, NoDL, SMovK, AMDGPU::FLAT_SCR_LO)
+ .addImm(StackOffset);
+
+ // Documentation says size is "per-thread scratch size in bytes"
+ BuildMI(MBB, Start, NoDL, SMovK, AMDGPU::FLAT_SCR_HI)
+ .addImm(StackSizeBytes);
+ }
+
return true;
}
diff --git a/lib/Target/R600/SILowerI1Copies.cpp b/lib/Target/R600/SILowerI1Copies.cpp
index db19235995be..67421e231d8d 100644
--- a/lib/Target/R600/SILowerI1Copies.cpp
+++ b/lib/Target/R600/SILowerI1Copies.cpp
@@ -15,6 +15,7 @@
#define DEBUG_TYPE "si-i1-copies"
#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
#include "llvm/CodeGen/MachineDominators.h"
@@ -39,14 +40,14 @@ public:
initializeSILowerI1CopiesPass(*PassRegistry::getPassRegistry());
}
- virtual bool runOnMachineFunction(MachineFunction &MF) override;
+ bool runOnMachineFunction(MachineFunction &MF) override;
- virtual const char *getPassName() const override {
- return "SI Lower il Copies";
+ const char *getPassName() const override {
+ return "SI Lower i1 Copies";
}
- virtual void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<MachineDominatorTree>();
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<MachineDominatorTree>();
AU.setPreservesCFG();
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -55,10 +56,10 @@ public:
} // End anonymous namespace.
INITIALIZE_PASS_BEGIN(SILowerI1Copies, DEBUG_TYPE,
- "SI Lower il Copies", false, false)
+ "SI Lower i1 Copies", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
INITIALIZE_PASS_END(SILowerI1Copies, DEBUG_TYPE,
- "SI Lower il Copies", false, false)
+ "SI Lower i1 Copies", false, false)
char SILowerI1Copies::ID = 0;
@@ -70,9 +71,9 @@ FunctionPass *llvm::createSILowerI1CopiesPass() {
bool SILowerI1Copies::runOnMachineFunction(MachineFunction &MF) {
MachineRegisterInfo &MRI = MF.getRegInfo();
- const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(
- MF.getTarget().getInstrInfo());
- const TargetRegisterInfo *TRI = MF.getTarget().getRegisterInfo();
+ const SIInstrInfo *TII =
+ static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
+ const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
std::vector<unsigned> I1Defs;
for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
@@ -84,71 +85,67 @@ bool SILowerI1Copies::runOnMachineFunction(MachineFunction &MF) {
Next = std::next(I);
MachineInstr &MI = *I;
- if (MI.getOpcode() == AMDGPU::V_MOV_I1) {
- I1Defs.push_back(MI.getOperand(0).getReg());
- MI.setDesc(TII->get(AMDGPU::V_MOV_B32_e32));
+ if (MI.getOpcode() == AMDGPU::IMPLICIT_DEF) {
+ unsigned Reg = MI.getOperand(0).getReg();
+ const TargetRegisterClass *RC = MRI.getRegClass(Reg);
+ if (RC == &AMDGPU::VReg_1RegClass)
+ MRI.setRegClass(Reg, &AMDGPU::SReg_64RegClass);
continue;
}
- if (MI.getOpcode() == AMDGPU::V_AND_I1) {
- I1Defs.push_back(MI.getOperand(0).getReg());
- MI.setDesc(TII->get(AMDGPU::V_AND_B32_e32));
+ if (MI.getOpcode() != AMDGPU::COPY)
continue;
- }
- if (MI.getOpcode() == AMDGPU::V_OR_I1) {
- I1Defs.push_back(MI.getOperand(0).getReg());
- MI.setDesc(TII->get(AMDGPU::V_OR_B32_e32));
- continue;
- }
+ const MachineOperand &Dst = MI.getOperand(0);
+ const MachineOperand &Src = MI.getOperand(1);
- if (MI.getOpcode() == AMDGPU::V_XOR_I1) {
- I1Defs.push_back(MI.getOperand(0).getReg());
- MI.setDesc(TII->get(AMDGPU::V_XOR_B32_e32));
+ if (!TargetRegisterInfo::isVirtualRegister(Src.getReg()) ||
+ !TargetRegisterInfo::isVirtualRegister(Dst.getReg()))
continue;
- }
-
- if (MI.getOpcode() != AMDGPU::COPY ||
- !TargetRegisterInfo::isVirtualRegister(MI.getOperand(0).getReg()) ||
- !TargetRegisterInfo::isVirtualRegister(MI.getOperand(1).getReg()))
- continue;
-
- const TargetRegisterClass *DstRC =
- MRI.getRegClass(MI.getOperand(0).getReg());
- const TargetRegisterClass *SrcRC =
- MRI.getRegClass(MI.getOperand(1).getReg());
+ const TargetRegisterClass *DstRC = MRI.getRegClass(Dst.getReg());
+ const TargetRegisterClass *SrcRC = MRI.getRegClass(Src.getReg());
if (DstRC == &AMDGPU::VReg_1RegClass &&
TRI->getCommonSubClass(SrcRC, &AMDGPU::SGPR_64RegClass)) {
- I1Defs.push_back(MI.getOperand(0).getReg());
- BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(AMDGPU::V_CNDMASK_B32_e64))
- .addOperand(MI.getOperand(0))
- .addImm(0)
- .addImm(-1)
- .addOperand(MI.getOperand(1))
- .addImm(0)
- .addImm(0)
- .addImm(0)
- .addImm(0);
+ I1Defs.push_back(Dst.getReg());
+ DebugLoc DL = MI.getDebugLoc();
+
+ MachineInstr *DefInst = MRI.getUniqueVRegDef(Src.getReg());
+ if (DefInst->getOpcode() == AMDGPU::S_MOV_B64) {
+ if (DefInst->getOperand(1).isImm()) {
+ I1Defs.push_back(Dst.getReg());
+
+ int64_t Val = DefInst->getOperand(1).getImm();
+ assert(Val == 0 || Val == -1);
+
+ BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_MOV_B32_e32))
+ .addOperand(Dst)
+ .addImm(Val);
+ MI.eraseFromParent();
+ continue;
+ }
+ }
+
+ BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64))
+ .addOperand(Dst)
+ .addImm(0)
+ .addImm(-1)
+ .addOperand(Src);
MI.eraseFromParent();
} else if (TRI->getCommonSubClass(DstRC, &AMDGPU::SGPR_64RegClass) &&
SrcRC == &AMDGPU::VReg_1RegClass) {
BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(AMDGPU::V_CMP_NE_I32_e64))
- .addOperand(MI.getOperand(0))
- .addImm(0)
- .addOperand(MI.getOperand(1))
- .addImm(0)
- .addImm(0)
- .addImm(0)
- .addImm(0);
+ .addOperand(Dst)
+ .addOperand(Src)
+ .addImm(0);
MI.eraseFromParent();
}
}
}
for (unsigned Reg : I1Defs)
- MRI.setRegClass(Reg, &AMDGPU::VReg_32RegClass);
+ MRI.setRegClass(Reg, &AMDGPU::VGPR_32RegClass);
return false;
}
diff --git a/lib/Target/R600/SIMachineFunctionInfo.cpp b/lib/Target/R600/SIMachineFunctionInfo.cpp
index c53a7e10d548..198dd568374e 100644
--- a/lib/Target/R600/SIMachineFunctionInfo.cpp
+++ b/lib/Target/R600/SIMachineFunctionInfo.cpp
@@ -10,8 +10,10 @@
#include "SIMachineFunctionInfo.h"
+#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
-#include "SIRegisterInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/LLVMContext.h"
@@ -26,72 +28,50 @@ void SIMachineFunctionInfo::anchor() {}
SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
: AMDGPUMachineFunction(MF),
+ TIDReg(AMDGPU::NoRegister),
+ HasSpilledVGPRs(false),
PSInputAddr(0),
- SpillTracker(),
- NumUserSGPRs(0) { }
+ NumUserSGPRs(0),
+ LDSWaveSpillSize(0) { }
-static unsigned createLaneVGPR(MachineRegisterInfo &MRI, MachineFunction *MF) {
- unsigned VGPR = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);
+SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg(
+ MachineFunction *MF,
+ unsigned FrameIndex,
+ unsigned SubIdx) {
+ const MachineFrameInfo *FrameInfo = MF->getFrameInfo();
+ const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo*>(
+ MF->getTarget().getSubtarget<AMDGPUSubtarget>().getRegisterInfo());
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ int64_t Offset = FrameInfo->getObjectOffset(FrameIndex);
+ Offset += SubIdx * 4;
- // We need to add this register as live out for the function, in order to
- // have the live range calculated directly.
- //
- // When register spilling begins, we have already calculated the live
- // live intervals for all the registers. Since we are spilling SGPRs to
- // VGPRs, we need to update the Lane VGPR's live interval every time we
- // spill or restore a register.
- //
- // Unfortunately, there is no good way to update the live interval as
- // the TargetInstrInfo callbacks for spilling and restoring don't give
- // us access to the live interval information.
- //
- // We are lucky, though, because the InlineSpiller calls
- // LiveRangeEdit::calculateRegClassAndHint() which iterates through
- // all the new register that have been created when restoring a register
- // and calls LiveIntervals::getInterval(), which creates and computes
- // the live interval for the newly created register. However, once this
- // live intervals is created, it doesn't change and since we usually reuse
- // the Lane VGPR multiple times, this means any uses after the first aren't
- // added to the live interval.
- //
- // To work around this, we add Lane VGPRs to the functions live out list,
- // so that we can guarantee its live range will cover all of its uses.
+ unsigned LaneVGPRIdx = Offset / (64 * 4);
+ unsigned Lane = (Offset / 4) % 64;
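+  // For example (illustrative): a spill slot at byte offset 8 maps to lane 2
+  // of the first lane VGPR, while offset 260 maps to lane 1 of the second.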
- for (MachineBasicBlock &MBB : *MF) {
- if (MBB.back().getOpcode() == AMDGPU::S_ENDPGM) {
- MBB.back().addOperand(*MF, MachineOperand::CreateReg(VGPR, false, true));
- return VGPR;
- }
- }
+ struct SpilledReg Spill;
- LLVMContext &Ctx = MF->getFunction()->getContext();
- Ctx.emitError("Could not find S_ENDPGM instruction.");
+ if (!LaneVGPRs.count(LaneVGPRIdx)) {
+ unsigned LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass);
+ LaneVGPRs[LaneVGPRIdx] = LaneVGPR;
+ MRI.setPhysRegUsed(LaneVGPR);
- return VGPR;
-}
-
-unsigned SIMachineFunctionInfo::RegSpillTracker::reserveLanes(
- MachineRegisterInfo &MRI, MachineFunction *MF, unsigned NumRegs) {
- unsigned StartLane = CurrentLane;
- CurrentLane += NumRegs;
- if (!LaneVGPR) {
- LaneVGPR = createLaneVGPR(MRI, MF);
- } else {
- if (CurrentLane >= MAX_LANES) {
- StartLane = CurrentLane = 0;
- LaneVGPR = createLaneVGPR(MRI, MF);
+    // Add this register as live-in to all blocks to avoid machine verifier
+ // complaining about use of an undefined physical register.
+ for (MachineFunction::iterator BI = MF->begin(), BE = MF->end();
+ BI != BE; ++BI) {
+ BI->addLiveIn(LaneVGPR);
}
}
- return StartLane;
-}
-void SIMachineFunctionInfo::RegSpillTracker::addSpilledReg(unsigned FrameIndex,
- unsigned Reg,
- int Lane) {
- SpilledRegisters[FrameIndex] = SpilledReg(Reg, Lane);
+ Spill.VGPR = LaneVGPRs[LaneVGPRIdx];
+ Spill.Lane = Lane;
+ return Spill;
}
-const SIMachineFunctionInfo::SpilledReg&
-SIMachineFunctionInfo::RegSpillTracker::getSpilledReg(unsigned FrameIndex) {
- return SpilledRegisters[FrameIndex];
+unsigned SIMachineFunctionInfo::getMaximumWorkGroupSize(
+ const MachineFunction &MF) const {
+ const AMDGPUSubtarget &ST = MF.getTarget().getSubtarget<AMDGPUSubtarget>();
+ // FIXME: We should get this information from kernel attributes if it
+ // is available.
+ return getShaderType() == ShaderType::COMPUTE ? 256 : ST.getWavefrontSize();
}
diff --git a/lib/Target/R600/SIMachineFunctionInfo.h b/lib/Target/R600/SIMachineFunctionInfo.h
index 9684d285cec2..71852717d7e6 100644
--- a/lib/Target/R600/SIMachineFunctionInfo.h
+++ b/lib/Target/R600/SIMachineFunctionInfo.h
@@ -12,10 +12,11 @@
//===----------------------------------------------------------------------===//
-#ifndef SIMACHINEFUNCTIONINFO_H_
-#define SIMACHINEFUNCTIONINFO_H_
+#ifndef LLVM_LIB_TARGET_R600_SIMACHINEFUNCTIONINFO_H
+#define LLVM_LIB_TARGET_R600_SIMACHINEFUNCTIONINFO_H
#include "AMDGPUMachineFunction.h"
+#include "SIRegisterInfo.h"
#include <map>
namespace llvm {
@@ -26,6 +27,10 @@ class MachineRegisterInfo;
/// tells the hardware which interpolation parameters to load.
class SIMachineFunctionInfo : public AMDGPUMachineFunction {
void anchor() override;
+
+ unsigned TIDReg;
+ bool HasSpilledVGPRs;
+
public:
struct SpilledReg {
@@ -36,33 +41,25 @@ public:
bool hasLane() { return Lane != -1;}
};
- struct RegSpillTracker {
- private:
- unsigned CurrentLane;
- std::map<unsigned, SpilledReg> SpilledRegisters;
- public:
- unsigned LaneVGPR;
- RegSpillTracker() : CurrentLane(0), SpilledRegisters(), LaneVGPR(0) { }
- /// \p NumRegs The number of consecutive registers what need to be spilled.
- /// This function will ensure that all registers are stored in
- /// the same VGPR.
- /// \returns The lane to be used for storing the first register.
- unsigned reserveLanes(MachineRegisterInfo &MRI, MachineFunction *MF,
- unsigned NumRegs = 1);
- void addSpilledReg(unsigned FrameIndex, unsigned Reg, int Lane = -1);
- const SpilledReg& getSpilledReg(unsigned FrameIndex);
- bool programSpillsRegisters() { return !SpilledRegisters.empty(); }
- };
-
// SIMachineFunctionInfo definition
SIMachineFunctionInfo(const MachineFunction &MF);
+ SpilledReg getSpilledReg(MachineFunction *MF, unsigned FrameIndex,
+ unsigned SubIdx);
unsigned PSInputAddr;
- struct RegSpillTracker SpillTracker;
unsigned NumUserSGPRs;
+ std::map<unsigned, unsigned> LaneVGPRs;
+ unsigned LDSWaveSpillSize;
+ bool hasCalculatedTID() const { return TIDReg != AMDGPU::NoRegister; };
+ unsigned getTIDReg() const { return TIDReg; };
+ void setTIDReg(unsigned Reg) { TIDReg = Reg; }
+ bool hasSpilledVGPRs() const { return HasSpilledVGPRs; }
+ void setHasSpilledVGPRs(bool Spill = true) { HasSpilledVGPRs = Spill; }
+
+ unsigned getMaximumWorkGroupSize(const MachineFunction &MF) const;
};
} // End namespace llvm
-#endif //_SIMACHINEFUNCTIONINFO_H_
+#endif
diff --git a/lib/Target/R600/SIPrepareScratchRegs.cpp b/lib/Target/R600/SIPrepareScratchRegs.cpp
new file mode 100644
index 000000000000..f0e7edec6b48
--- /dev/null
+++ b/lib/Target/R600/SIPrepareScratchRegs.cpp
@@ -0,0 +1,196 @@
+//===-- SIPrepareScratchRegs.cpp - Prepare scratch regs for spilling ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+///
+/// This pass loads the scratch pointer and scratch offset into a register or
+/// a frame index which can be used anywhere in the program. These values will
+/// be used for spilling VGPRs.
+///
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "SIDefines.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/LLVMContext.h"
+
+using namespace llvm;
+
+namespace {
+
+class SIPrepareScratchRegs : public MachineFunctionPass {
+
+private:
+ static char ID;
+
+public:
+ SIPrepareScratchRegs() : MachineFunctionPass(ID) { }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ const char *getPassName() const override {
+ return "SI prepare scratch registers";
+ }
+
+};
+
+} // End anonymous namespace
+
+char SIPrepareScratchRegs::ID = 0;
+
+FunctionPass *llvm::createSIPrepareScratchRegs() {
+ return new SIPrepareScratchRegs();
+}
+
+bool SIPrepareScratchRegs::runOnMachineFunction(MachineFunction &MF) {
+ SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ const SIInstrInfo *TII =
+ static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
+ const SIRegisterInfo *TRI = &TII->getRegisterInfo();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ MachineFrameInfo *FrameInfo = MF.getFrameInfo();
+ MachineBasicBlock *Entry = MF.begin();
+ MachineBasicBlock::iterator I = Entry->begin();
+ DebugLoc DL = I->getDebugLoc();
+
+ // FIXME: If we don't have enough VGPRs for SGPR spilling we will need to
+ // run this pass.
+ if (!MFI->hasSpilledVGPRs())
+ return false;
+
+ unsigned ScratchPtrPreloadReg =
+ TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_PTR);
+ unsigned ScratchOffsetPreloadReg =
+ TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_WAVE_OFFSET);
+
+ if (!Entry->isLiveIn(ScratchPtrPreloadReg))
+ Entry->addLiveIn(ScratchPtrPreloadReg);
+
+ if (!Entry->isLiveIn(ScratchOffsetPreloadReg))
+ Entry->addLiveIn(ScratchOffsetPreloadReg);
+
+ // Load the scratch pointer
+ unsigned ScratchPtrReg =
+ TRI->findUnusedRegister(MRI, &AMDGPU::SGPR_64RegClass);
+ int ScratchPtrFI = -1;
+
+ if (ScratchPtrReg != AMDGPU::NoRegister) {
+ // Found an SGPR to use.
+ MRI.setPhysRegUsed(ScratchPtrReg);
+ BuildMI(*Entry, I, DL, TII->get(AMDGPU::S_MOV_B64), ScratchPtrReg)
+ .addReg(ScratchPtrPreloadReg);
+ } else {
+ // No SGPR is available, we must spill.
+ ScratchPtrFI = FrameInfo->CreateSpillStackObject(8, 4);
+ BuildMI(*Entry, I, DL, TII->get(AMDGPU::SI_SPILL_S64_SAVE))
+ .addReg(ScratchPtrPreloadReg)
+ .addFrameIndex(ScratchPtrFI);
+ }
+
+ // Load the scratch offset.
+ unsigned ScratchOffsetReg =
+ TRI->findUnusedRegister(MRI, &AMDGPU::SGPR_32RegClass);
+ int ScratchOffsetFI = ~0;
+
+ if (ScratchOffsetReg != AMDGPU::NoRegister) {
+ // Found an SGPR to use
+ MRI.setPhysRegUsed(ScratchOffsetReg);
+ BuildMI(*Entry, I, DL, TII->get(AMDGPU::S_MOV_B32), ScratchOffsetReg)
+ .addReg(ScratchOffsetPreloadReg);
+ } else {
+ // No SGPR is available, we must spill.
+ ScratchOffsetFI = FrameInfo->CreateSpillStackObject(4,4);
+ BuildMI(*Entry, I, DL, TII->get(AMDGPU::SI_SPILL_S32_SAVE))
+ .addReg(ScratchOffsetPreloadReg)
+ .addFrameIndex(ScratchOffsetFI);
+ }
+
+
+ // Now that we have the scratch pointer and offset values, we need to
+ // add them to all the SI_SPILL_V* instructions.
+
+ RegScavenger RS;
+ bool UseRegScavenger =
+ (ScratchPtrReg == AMDGPU::NoRegister ||
+ ScratchOffsetReg == AMDGPU::NoRegister);
+ for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
+ BI != BE; ++BI) {
+
+ MachineBasicBlock &MBB = *BI;
+ if (UseRegScavenger)
+ RS.enterBasicBlock(&MBB);
+
+ for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
+ I != E; ++I) {
+ MachineInstr &MI = *I;
+ DebugLoc DL = MI.getDebugLoc();
+ switch(MI.getOpcode()) {
+      default: break;
+ case AMDGPU::SI_SPILL_V512_SAVE:
+ case AMDGPU::SI_SPILL_V256_SAVE:
+ case AMDGPU::SI_SPILL_V128_SAVE:
+ case AMDGPU::SI_SPILL_V96_SAVE:
+ case AMDGPU::SI_SPILL_V64_SAVE:
+ case AMDGPU::SI_SPILL_V32_SAVE:
+ case AMDGPU::SI_SPILL_V32_RESTORE:
+ case AMDGPU::SI_SPILL_V64_RESTORE:
+ case AMDGPU::SI_SPILL_V128_RESTORE:
+ case AMDGPU::SI_SPILL_V256_RESTORE:
+ case AMDGPU::SI_SPILL_V512_RESTORE:
+
+ // Scratch Pointer
+ if (ScratchPtrReg == AMDGPU::NoRegister) {
+ ScratchPtrReg = RS.scavengeRegister(&AMDGPU::SGPR_64RegClass, 0);
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::SI_SPILL_S64_RESTORE),
+ ScratchPtrReg)
+ .addFrameIndex(ScratchPtrFI)
+ .addReg(AMDGPU::NoRegister)
+ .addReg(AMDGPU::NoRegister);
+ } else if (!MBB.isLiveIn(ScratchPtrReg)) {
+ MBB.addLiveIn(ScratchPtrReg);
+ }
+
+ if (ScratchOffsetReg == AMDGPU::NoRegister) {
+ ScratchOffsetReg = RS.scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::SI_SPILL_S32_RESTORE),
+ ScratchOffsetReg)
+ .addFrameIndex(ScratchOffsetFI)
+ .addReg(AMDGPU::NoRegister)
+ .addReg(AMDGPU::NoRegister);
+ } else if (!MBB.isLiveIn(ScratchOffsetReg)) {
+ MBB.addLiveIn(ScratchOffsetReg);
+ }
+
+ if (ScratchPtrReg == AMDGPU::NoRegister ||
+ ScratchOffsetReg == AMDGPU::NoRegister) {
+ LLVMContext &Ctx = MF.getFunction()->getContext();
+ Ctx.emitError("ran out of SGPRs for spilling VGPRs");
+ ScratchPtrReg = AMDGPU::SGPR0;
+ ScratchOffsetReg = AMDGPU::SGPR0;
+ }
+ MI.getOperand(2).setReg(ScratchPtrReg);
+ MI.getOperand(3).setReg(ScratchOffsetReg);
+
+ break;
+ }
+ if (UseRegScavenger)
+ RS.forward();
+ }
+ }
+ return true;
+}
diff --git a/lib/Target/R600/SIRegisterInfo.cpp b/lib/Target/R600/SIRegisterInfo.cpp
index 2a9a2ac5dd61..f9feea470f15 100644
--- a/lib/Target/R600/SIRegisterInfo.cpp
+++ b/lib/Target/R600/SIRegisterInfo.cpp
@@ -20,7 +20,10 @@
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Support/Debug.h"
using namespace llvm;
SIRegisterInfo::SIRegisterInfo(const AMDGPUSubtarget &st)
@@ -30,7 +33,21 @@ SIRegisterInfo::SIRegisterInfo(const AMDGPUSubtarget &st)
BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
BitVector Reserved(getNumRegs());
Reserved.set(AMDGPU::EXEC);
+
+  // EXEC_LO and EXEC_HI could be allocated and used as regular registers,
+ // but this seems likely to result in bugs, so I'm marking them as reserved.
+ Reserved.set(AMDGPU::EXEC_LO);
+ Reserved.set(AMDGPU::EXEC_HI);
+
Reserved.set(AMDGPU::INDIRECT_BASE_ADDR);
+ Reserved.set(AMDGPU::FLAT_SCR);
+ Reserved.set(AMDGPU::FLAT_SCR_LO);
+ Reserved.set(AMDGPU::FLAT_SCR_HI);
+
+ // Reserve some VGPRs to use as temp registers in case we have to spill VGPRs
+ Reserved.set(AMDGPU::VGPR255);
+ Reserved.set(AMDGPU::VGPR254);
+
return Reserved;
}
@@ -43,23 +60,238 @@ bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const
return Fn.getFrameInfo()->hasStackObjects();
}
+static unsigned getNumSubRegsForSpillOp(unsigned Op) {
+
+ switch (Op) {
+ case AMDGPU::SI_SPILL_S512_SAVE:
+ case AMDGPU::SI_SPILL_S512_RESTORE:
+ case AMDGPU::SI_SPILL_V512_SAVE:
+ case AMDGPU::SI_SPILL_V512_RESTORE:
+ return 16;
+ case AMDGPU::SI_SPILL_S256_SAVE:
+ case AMDGPU::SI_SPILL_S256_RESTORE:
+ case AMDGPU::SI_SPILL_V256_SAVE:
+ case AMDGPU::SI_SPILL_V256_RESTORE:
+ return 8;
+ case AMDGPU::SI_SPILL_S128_SAVE:
+ case AMDGPU::SI_SPILL_S128_RESTORE:
+ case AMDGPU::SI_SPILL_V128_SAVE:
+ case AMDGPU::SI_SPILL_V128_RESTORE:
+ return 4;
+ case AMDGPU::SI_SPILL_V96_SAVE:
+ case AMDGPU::SI_SPILL_V96_RESTORE:
+ return 3;
+ case AMDGPU::SI_SPILL_S64_SAVE:
+ case AMDGPU::SI_SPILL_S64_RESTORE:
+ case AMDGPU::SI_SPILL_V64_SAVE:
+ case AMDGPU::SI_SPILL_V64_RESTORE:
+ return 2;
+ case AMDGPU::SI_SPILL_S32_SAVE:
+ case AMDGPU::SI_SPILL_S32_RESTORE:
+ case AMDGPU::SI_SPILL_V32_SAVE:
+ case AMDGPU::SI_SPILL_V32_RESTORE:
+ return 1;
+ default: llvm_unreachable("Invalid spill opcode");
+ }
+}
+
+void SIRegisterInfo::buildScratchLoadStore(MachineBasicBlock::iterator MI,
+ unsigned LoadStoreOp,
+ unsigned Value,
+ unsigned ScratchPtr,
+ unsigned ScratchOffset,
+ int64_t Offset,
+ RegScavenger *RS) const {
+
+ const SIInstrInfo *TII = static_cast<const SIInstrInfo*>(ST.getInstrInfo());
+ MachineBasicBlock *MBB = MI->getParent();
+ const MachineFunction *MF = MI->getParent()->getParent();
+ LLVMContext &Ctx = MF->getFunction()->getContext();
+ DebugLoc DL = MI->getDebugLoc();
+ bool IsLoad = TII->get(LoadStoreOp).mayLoad();
+
+ bool RanOutOfSGPRs = false;
+ unsigned SOffset = ScratchOffset;
+
+ unsigned RsrcReg = RS->scavengeRegister(&AMDGPU::SReg_128RegClass, MI, 0);
+ if (RsrcReg == AMDGPU::NoRegister) {
+ RanOutOfSGPRs = true;
+ RsrcReg = AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3;
+ }
+
+ unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
+ unsigned Size = NumSubRegs * 4;
+
+ uint64_t Rsrc = AMDGPU::RSRC_DATA_FORMAT | AMDGPU::RSRC_TID_ENABLE |
+ 0xffffffff; // Size
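+  // The descriptor is assembled below: sub0_sub1 takes the scratch base
+  // pointer, sub2 the low 32 bits of Rsrc and sub3 the high 32 bits.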
+
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B64),
+ getSubReg(RsrcReg, AMDGPU::sub0_sub1))
+ .addReg(ScratchPtr)
+ .addReg(RsrcReg, RegState::ImplicitDefine);
+
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32),
+ getSubReg(RsrcReg, AMDGPU::sub2))
+ .addImm(Rsrc & 0xffffffff)
+ .addReg(RsrcReg, RegState::ImplicitDefine);
+
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32),
+ getSubReg(RsrcReg, AMDGPU::sub3))
+ .addImm(Rsrc >> 32)
+ .addReg(RsrcReg, RegState::ImplicitDefine);
+
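+  // The immediate offset must fit in 12 bits (hence the isUInt<12> check);
+  // when it does not, fold the offset into a scavenged soffset SGPR below
+  // and pass 0 as the immediate.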
+ if (!isUInt<12>(Offset + Size)) {
+ SOffset = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, MI, 0);
+ if (SOffset == AMDGPU::NoRegister) {
+ RanOutOfSGPRs = true;
+ SOffset = AMDGPU::SGPR0;
+ }
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), SOffset)
+ .addReg(ScratchOffset)
+ .addImm(Offset);
+ Offset = 0;
+ }
+
+ if (RanOutOfSGPRs)
+ Ctx.emitError("Ran out of SGPRs for spilling VGPRS");
+
+ for (unsigned i = 0, e = NumSubRegs; i != e; ++i, Offset += 4) {
+ unsigned SubReg = NumSubRegs > 1 ?
+ getPhysRegSubReg(Value, &AMDGPU::VGPR_32RegClass, i) :
+ Value;
+ bool IsKill = (i == e - 1);
+
+ BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp))
+ .addReg(SubReg, getDefRegState(IsLoad))
+ .addReg(RsrcReg, getKillRegState(IsKill))
+ .addImm(Offset)
+ .addReg(SOffset, getKillRegState(IsKill))
+ .addImm(0) // glc
+ .addImm(0) // slc
+ .addImm(0) // tfe
+ .addReg(Value, RegState::Implicit | getDefRegState(IsLoad));
+ }
+}
+
void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
int SPAdj, unsigned FIOperandNum,
RegScavenger *RS) const {
MachineFunction *MF = MI->getParent()->getParent();
+ MachineBasicBlock *MBB = MI->getParent();
+ SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
MachineFrameInfo *FrameInfo = MF->getFrameInfo();
const SIInstrInfo *TII = static_cast<const SIInstrInfo*>(ST.getInstrInfo());
+ DebugLoc DL = MI->getDebugLoc();
+
MachineOperand &FIOp = MI->getOperand(FIOperandNum);
int Index = MI->getOperand(FIOperandNum).getIndex();
- int64_t Offset = FrameInfo->getObjectOffset(Index);
- FIOp.ChangeToImmediate(Offset);
- if (!TII->isImmOperandLegal(MI, FIOperandNum, FIOp)) {
- unsigned TmpReg = RS->scavengeRegister(&AMDGPU::VReg_32RegClass, MI, SPAdj);
- BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
- TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
- .addImm(Offset);
- FIOp.ChangeToRegister(TmpReg, false);
+ switch (MI->getOpcode()) {
+ // SGPR register spill
+ case AMDGPU::SI_SPILL_S512_SAVE:
+ case AMDGPU::SI_SPILL_S256_SAVE:
+ case AMDGPU::SI_SPILL_S128_SAVE:
+ case AMDGPU::SI_SPILL_S64_SAVE:
+ case AMDGPU::SI_SPILL_S32_SAVE: {
+ unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
+
+ for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
+ unsigned SubReg = getPhysRegSubReg(MI->getOperand(0).getReg(),
+ &AMDGPU::SGPR_32RegClass, i);
+ struct SIMachineFunctionInfo::SpilledReg Spill =
+ MFI->getSpilledReg(MF, Index, i);
+
+ if (Spill.VGPR == AMDGPU::NoRegister) {
+ LLVMContext &Ctx = MF->getFunction()->getContext();
+ Ctx.emitError("Ran out of VGPRs for spilling SGPR");
+ }
+
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_WRITELANE_B32), Spill.VGPR)
+ .addReg(SubReg)
+ .addImm(Spill.Lane);
+
+ }
+ MI->eraseFromParent();
+ break;
+ }
+
+ // SGPR register restore
+ case AMDGPU::SI_SPILL_S512_RESTORE:
+ case AMDGPU::SI_SPILL_S256_RESTORE:
+ case AMDGPU::SI_SPILL_S128_RESTORE:
+ case AMDGPU::SI_SPILL_S64_RESTORE:
+ case AMDGPU::SI_SPILL_S32_RESTORE: {
+ unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
+
+ for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
+ unsigned SubReg = getPhysRegSubReg(MI->getOperand(0).getReg(),
+ &AMDGPU::SGPR_32RegClass, i);
+ bool isM0 = SubReg == AMDGPU::M0;
+ struct SIMachineFunctionInfo::SpilledReg Spill =
+ MFI->getSpilledReg(MF, Index, i);
+
+ if (Spill.VGPR == AMDGPU::NoRegister) {
+ LLVMContext &Ctx = MF->getFunction()->getContext();
+ Ctx.emitError("Ran out of VGPRs for spilling SGPR");
+ }
+
+ if (isM0) {
+ SubReg = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, MI, 0);
+ }
+
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READLANE_B32), SubReg)
+ .addReg(Spill.VGPR)
+ .addImm(Spill.Lane)
+ .addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine);
+ if (isM0) {
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
+ .addReg(SubReg);
+ }
+ }
+ TII->insertNOPs(MI, 3);
+ MI->eraseFromParent();
+ break;
+ }
+
+ // VGPR register spill
+ case AMDGPU::SI_SPILL_V512_SAVE:
+ case AMDGPU::SI_SPILL_V256_SAVE:
+ case AMDGPU::SI_SPILL_V128_SAVE:
+ case AMDGPU::SI_SPILL_V96_SAVE:
+ case AMDGPU::SI_SPILL_V64_SAVE:
+ case AMDGPU::SI_SPILL_V32_SAVE:
+ buildScratchLoadStore(MI, AMDGPU::BUFFER_STORE_DWORD_OFFSET,
+ TII->getNamedOperand(*MI, AMDGPU::OpName::src)->getReg(),
+ TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_ptr)->getReg(),
+ TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_offset)->getReg(),
+ FrameInfo->getObjectOffset(Index), RS);
+ MI->eraseFromParent();
+ break;
+ case AMDGPU::SI_SPILL_V32_RESTORE:
+ case AMDGPU::SI_SPILL_V64_RESTORE:
+ case AMDGPU::SI_SPILL_V128_RESTORE:
+ case AMDGPU::SI_SPILL_V256_RESTORE:
+ case AMDGPU::SI_SPILL_V512_RESTORE: {
+ buildScratchLoadStore(MI, AMDGPU::BUFFER_LOAD_DWORD_OFFSET,
+ TII->getNamedOperand(*MI, AMDGPU::OpName::dst)->getReg(),
+ TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_ptr)->getReg(),
+ TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_offset)->getReg(),
+ FrameInfo->getObjectOffset(Index), RS);
+ MI->eraseFromParent();
+ break;
+ }
+
+ default: {
+ int64_t Offset = FrameInfo->getObjectOffset(Index);
+ FIOp.ChangeToImmediate(Offset);
+ if (!TII->isImmOperandLegal(MI, FIOperandNum, FIOp)) {
+ unsigned TmpReg = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, SPAdj);
+ BuildMI(*MBB, MI, MI->getDebugLoc(),
+ TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
+ .addImm(Offset);
+ FIOp.ChangeToRegister(TmpReg, false);
+ }
+ }
}
}
@@ -67,7 +299,7 @@ const TargetRegisterClass * SIRegisterInfo::getCFGStructurizerRegClass(
MVT VT) const {
switch(VT.SimpleTy) {
default:
- case MVT::i32: return &AMDGPU::VReg_32RegClass;
+ case MVT::i32: return &AMDGPU::VGPR_32RegClass;
}
}
@@ -78,13 +310,17 @@ unsigned SIRegisterInfo::getHWRegIndex(unsigned Reg) const {
const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const {
assert(!TargetRegisterInfo::isVirtualRegister(Reg));
- const TargetRegisterClass *BaseClasses[] = {
- &AMDGPU::VReg_32RegClass,
+ static const TargetRegisterClass *BaseClasses[] = {
+ &AMDGPU::VGPR_32RegClass,
&AMDGPU::SReg_32RegClass,
&AMDGPU::VReg_64RegClass,
&AMDGPU::SReg_64RegClass,
+ &AMDGPU::VReg_96RegClass,
+ &AMDGPU::VReg_128RegClass,
&AMDGPU::SReg_128RegClass,
- &AMDGPU::SReg_256RegClass
+ &AMDGPU::VReg_256RegClass,
+ &AMDGPU::SReg_256RegClass,
+ &AMDGPU::VReg_512RegClass
};
for (const TargetRegisterClass *BaseClass : BaseClasses) {
@@ -95,15 +331,8 @@ const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const {
return nullptr;
}
-bool SIRegisterInfo::isSGPRClass(const TargetRegisterClass *RC) const {
- if (!RC) {
- return false;
- }
- return !hasVGPRs(RC);
-}
-
bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const {
- return getCommonSubClass(&AMDGPU::VReg_32RegClass, RC) ||
+ return getCommonSubClass(&AMDGPU::VGPR_32RegClass, RC) ||
getCommonSubClass(&AMDGPU::VReg_64RegClass, RC) ||
getCommonSubClass(&AMDGPU::VReg_96RegClass, RC) ||
getCommonSubClass(&AMDGPU::VReg_128RegClass, RC) ||
@@ -118,7 +347,7 @@ const TargetRegisterClass *SIRegisterInfo::getEquivalentVGPRClass(
} else if (SRC == &AMDGPU::SCCRegRegClass) {
return &AMDGPU::VCCRegRegClass;
} else if (getCommonSubClass(SRC, &AMDGPU::SGPR_32RegClass)) {
- return &AMDGPU::VReg_32RegClass;
+ return &AMDGPU::VGPR_32RegClass;
} else if (getCommonSubClass(SRC, &AMDGPU::SGPR_64RegClass)) {
return &AMDGPU::VReg_64RegClass;
} else if (getCommonSubClass(SRC, &AMDGPU::SReg_128RegClass)) {
@@ -148,24 +377,61 @@ const TargetRegisterClass *SIRegisterInfo::getSubRegClass(
unsigned SIRegisterInfo::getPhysRegSubReg(unsigned Reg,
const TargetRegisterClass *SubRC,
unsigned Channel) const {
+
+ switch (Reg) {
+ case AMDGPU::VCC:
+ switch(Channel) {
+ case 0: return AMDGPU::VCC_LO;
+ case 1: return AMDGPU::VCC_HI;
+ default: llvm_unreachable("Invalid SubIdx for VCC");
+ }
+
+ case AMDGPU::FLAT_SCR:
+ switch (Channel) {
+ case 0:
+ return AMDGPU::FLAT_SCR_LO;
+ case 1:
+ return AMDGPU::FLAT_SCR_HI;
+ default:
+ llvm_unreachable("Invalid SubIdx for FLAT_SCR");
+ }
+ break;
+
+ case AMDGPU::EXEC:
+ switch (Channel) {
+ case 0:
+ return AMDGPU::EXEC_LO;
+ case 1:
+ return AMDGPU::EXEC_HI;
+ default:
+ llvm_unreachable("Invalid SubIdx for EXEC");
+ }
+ break;
+ }
+
+ const TargetRegisterClass *RC = getPhysRegClass(Reg);
+ // 32-bit registers don't have sub-registers, so we can just return the
+ // Reg. We need to have this check here, because the calculation below
+ // using getHWRegIndex() will fail with special 32-bit registers like
+ // VCC_LO, VCC_HI, EXEC_LO, EXEC_HI and M0.
+ if (RC->getSize() == 4) {
+ assert(Channel == 0);
+ return Reg;
+ }
+
unsigned Index = getHWRegIndex(Reg);
return SubRC->getRegister(Index + Channel);
}
-bool SIRegisterInfo::regClassCanUseImmediate(int RCID) const {
- switch (RCID) {
- default: return false;
- case AMDGPU::SSrc_32RegClassID:
- case AMDGPU::SSrc_64RegClassID:
- case AMDGPU::VSrc_32RegClassID:
- case AMDGPU::VSrc_64RegClassID:
- return true;
- }
+bool SIRegisterInfo::opCanUseLiteralConstant(unsigned OpType) const {
+ return OpType == AMDGPU::OPERAND_REG_IMM32;
}
-bool SIRegisterInfo::regClassCanUseImmediate(
- const TargetRegisterClass *RC) const {
- return regClassCanUseImmediate(RC->getID());
+bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const {
+ if (opCanUseLiteralConstant(OpType))
+ return true;
+
+ return OpType == AMDGPU::OPERAND_REG_INLINE_C;
}
unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF,
@@ -183,6 +449,29 @@ unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF,
return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 4);
case SIRegisterInfo::SCRATCH_PTR:
return AMDGPU::SGPR2_SGPR3;
+ case SIRegisterInfo::INPUT_PTR:
+ return AMDGPU::SGPR0_SGPR1;
+ case SIRegisterInfo::TIDIG_X:
+ return AMDGPU::VGPR0;
+ case SIRegisterInfo::TIDIG_Y:
+ return AMDGPU::VGPR1;
+ case SIRegisterInfo::TIDIG_Z:
+ return AMDGPU::VGPR2;
}
llvm_unreachable("unexpected preloaded value type");
}
+
+/// \brief Returns a register that is not used at any point in the function.
+/// If all registers are used, then this function will return
+/// AMDGPU::NoRegister.
+unsigned SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI,
+ const TargetRegisterClass *RC) const {
+
+ for (TargetRegisterClass::iterator I = RC->begin(), E = RC->end();
+ I != E; ++I) {
+ if (!MRI.isPhysRegUsed(*I))
+ return *I;
+ }
+ return AMDGPU::NoRegister;
+}
+
diff --git a/lib/Target/R600/SIRegisterInfo.h b/lib/Target/R600/SIRegisterInfo.h
index 5d0235c0f427..d14212c2b104 100644
--- a/lib/Target/R600/SIRegisterInfo.h
+++ b/lib/Target/R600/SIRegisterInfo.h
@@ -13,8 +13,8 @@
//===----------------------------------------------------------------------===//
-#ifndef SIREGISTERINFO_H_
-#define SIREGISTERINFO_H_
+#ifndef LLVM_LIB_TARGET_R600_SIREGISTERINFO_H
+#define LLVM_LIB_TARGET_R600_SIREGISTERINFO_H
#include "AMDGPURegisterInfo.h"
@@ -42,11 +42,24 @@ struct SIRegisterInfo : public AMDGPURegisterInfo {
unsigned getHWRegIndex(unsigned Reg) const override;
/// \brief Return the 'base' register class for this register.
- /// e.g. SGPR0 => SReg_32, VGPR => VReg_32 SGPR0_SGPR1 -> SReg_32, etc.
+  /// e.g. SGPR0 => SReg_32, VGPR0 => VGPR_32, SGPR0_SGPR1 => SReg_64, etc.
const TargetRegisterClass *getPhysRegClass(unsigned Reg) const;
/// \returns true if this class contains only SGPR registers
- bool isSGPRClass(const TargetRegisterClass *RC) const;
+ bool isSGPRClass(const TargetRegisterClass *RC) const {
+ if (!RC)
+ return false;
+
+ return !hasVGPRs(RC);
+ }
+
+ /// \returns true if this class ID contains only SGPR registers
+ bool isSGPRClassID(unsigned RCID) const {
+ if (static_cast<int>(RCID) == -1)
+ return false;
+
+ return isSGPRClass(getRegClass(RCID));
+ }
/// \returns true if this class contains VGPR registers.
bool hasVGPRs(const TargetRegisterClass *RC) const;
@@ -67,28 +80,41 @@ struct SIRegisterInfo : public AMDGPURegisterInfo {
unsigned getPhysRegSubReg(unsigned Reg, const TargetRegisterClass *SubRC,
unsigned Channel) const;
- /// \returns True if operands defined with this register class can accept
- /// inline immediates.
- bool regClassCanUseImmediate(int RCID) const;
+ /// \returns True if operands defined with this operand type can accept
+ /// a literal constant (i.e. any 32-bit immediate).
+ bool opCanUseLiteralConstant(unsigned OpType) const;
- /// \returns True if operands defined with this register class can accept
- /// inline immediates.
- bool regClassCanUseImmediate(const TargetRegisterClass *RC) const;
+ /// \returns True if operands defined with this operand type can accept
+  /// an inline constant, i.e. an integer value in the range [-16, 64] or
+ /// -4.0f, -2.0f, -1.0f, -0.5f, 0.0f, 0.5f, 1.0f, 2.0f, 4.0f.
+ bool opCanUseInlineConstant(unsigned OpType) const;
enum PreloadedValue {
TGID_X,
TGID_Y,
TGID_Z,
SCRATCH_WAVE_OFFSET,
- SCRATCH_PTR
+ SCRATCH_PTR,
+ INPUT_PTR,
+ TIDIG_X,
+ TIDIG_Y,
+ TIDIG_Z
};
/// \brief Returns the physical register that \p Value is stored in.
unsigned getPreloadedValue(const MachineFunction &MF,
enum PreloadedValue Value) const;
+ unsigned findUnusedRegister(const MachineRegisterInfo &MRI,
+ const TargetRegisterClass *RC) const;
+
+private:
+ void buildScratchLoadStore(MachineBasicBlock::iterator MI,
+ unsigned LoadStoreOp, unsigned Value,
+ unsigned ScratchPtr, unsigned ScratchOffset,
+ int64_t Offset, RegScavenger *RS) const;
};
} // End namespace llvm
-#endif // SIREGISTERINFO_H_
+#endif
diff --git a/lib/Target/R600/SIRegisterInfo.td b/lib/Target/R600/SIRegisterInfo.td
index 8974b6300625..1a1efb0c89a9 100644
--- a/lib/Target/R600/SIRegisterInfo.td
+++ b/lib/Target/R600/SIRegisterInfo.td
@@ -27,10 +27,28 @@ def VCC : RegisterWithSubRegs<"VCC", [VCC_LO, VCC_HI]> {
let HWEncoding = 106;
}
-def EXEC : SIReg<"EXEC", 126>;
+def EXEC_LO : SIReg<"exec_lo", 126>;
+def EXEC_HI : SIReg<"exec_hi", 127>;
+
+def EXEC : RegisterWithSubRegs<"EXEC", [EXEC_LO, EXEC_HI]> {
+ let Namespace = "AMDGPU";
+ let SubRegIndices = [sub0, sub1];
+ let HWEncoding = 126;
+}
+
def SCC : SIReg<"SCC", 253>;
def M0 : SIReg <"M0", 124>;
+def FLAT_SCR_LO : SIReg<"flat_scr_lo", 104>; // Offset in units of 256-bytes.
+def FLAT_SCR_HI : SIReg<"flat_scr_hi", 105>; // Size is the per-thread scratch size, in bytes.
+
+// Pair to indicate location of scratch space for flat accesses.
+def FLAT_SCR : RegisterWithSubRegs <"FLAT_SCR", [FLAT_SCR_LO, FLAT_SCR_HI]> {
+ let Namespace = "AMDGPU";
+ let SubRegIndices = [sub0, sub1];
+ let HWEncoding = 104;
+}
+
// SGPR registers
foreach Index = 0-101 in {
def SGPR#Index : SIReg <"SGPR"#Index, Index>;
@@ -152,20 +170,24 @@ def VGPR_512 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7,
//===----------------------------------------------------------------------===//
// Special register classes for predicates and the M0 register
-def SCCReg : RegisterClass<"AMDGPU", [i32, i1], 32, (add SCC)>;
+def SCCReg : RegisterClass<"AMDGPU", [i32, i1], 32, (add SCC)> {
+ let CopyCost = -1; // Theoretically it is possible to read from SCC,
+ // but it should never be necessary.
+}
+
def VCCReg : RegisterClass<"AMDGPU", [i64, i1], 64, (add VCC)>;
def EXECReg : RegisterClass<"AMDGPU", [i64, i1], 64, (add EXEC)>;
def M0Reg : RegisterClass<"AMDGPU", [i32], 32, (add M0)>;
// Register class for all scalar registers (SGPRs + Special Registers)
def SReg_32 : RegisterClass<"AMDGPU", [f32, i32], 32,
- (add SGPR_32, M0Reg, VCC_LO)
+ (add SGPR_32, M0Reg, VCC_LO, VCC_HI, EXEC_LO, EXEC_HI, FLAT_SCR_LO, FLAT_SCR_HI)
>;
-def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64], 64, (add SGPR_64Regs)>;
+def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64], 64, (add SGPR_64Regs)>;
-def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, i1], 64,
- (add SGPR_64Regs, VCCReg, EXECReg)
+def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, i1], 64,
+ (add SGPR_64, VCCReg, EXECReg, FLAT_SCR)
>;
def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v16i8], 128, (add SGPR_128)>;
@@ -175,8 +197,6 @@ def SReg_256 : RegisterClass<"AMDGPU", [v32i8, v8i32, v8f32], 256, (add SGPR_256
def SReg_512 : RegisterClass<"AMDGPU", [v64i8, v16i32], 512, (add SGPR_512)>;
// Register class for all vector registers (VGPRs + Interpolation Registers)
-def VReg_32 : RegisterClass<"AMDGPU", [i32, f32, v1i32], 32, (add VGPR_32)>;
-
def VReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32], 64, (add VGPR_64)>;
def VReg_96 : RegisterClass<"AMDGPU", [untyped], 96, (add VGPR_96)> {
@@ -191,17 +211,49 @@ def VReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 512, (add VGPR_512)>;
def VReg_1 : RegisterClass<"AMDGPU", [i1], 32, (add VGPR_32)>;
+class RegImmOperand <RegisterClass rc> : RegisterOperand<rc> {
+ let OperandNamespace = "AMDGPU";
+ let OperandType = "OPERAND_REG_IMM32";
+}
+
+class RegInlineOperand <RegisterClass rc> : RegisterOperand<rc> {
+ let OperandNamespace = "AMDGPU";
+ let OperandType = "OPERAND_REG_INLINE_C";
+}
+
+//===----------------------------------------------------------------------===//
+// SSrc_* Operands with an SGPR or a 32-bit immediate
+//===----------------------------------------------------------------------===//
+
+def SSrc_32 : RegImmOperand<SReg_32>;
+
+def SSrc_64 : RegImmOperand<SReg_64>;
+
//===----------------------------------------------------------------------===//
-// [SV]Src_(32|64) register classes, can have either an immediate or an register
+// SCSrc_* Operands with an SGPR or a inline constant
//===----------------------------------------------------------------------===//
-def SSrc_32 : RegisterClass<"AMDGPU", [i32, f32], 32, (add SReg_32)>;
+def SCSrc_32 : RegInlineOperand<SReg_32>;
-def SSrc_64 : RegisterClass<"AMDGPU", [i64, f64, i1], 64, (add SReg_64)>;
+//===----------------------------------------------------------------------===//
+// VSrc_* Operands with an SGPR, VGPR or a 32-bit immediate
+//===----------------------------------------------------------------------===//
+
+def VS_32 : RegisterClass<"AMDGPU", [i32, f32], 32, (add VGPR_32, SReg_32)>;
+
+def VS_64 : RegisterClass<"AMDGPU", [i64, f64], 64, (add VReg_64, SReg_64)>;
+
+def VSrc_32 : RegImmOperand<VS_32>;
+
+def VSrc_64 : RegImmOperand<VS_64>;
+
+//===----------------------------------------------------------------------===//
+// VCSrc_* Operands with an SGPR, VGPR or an inline constant
+//===----------------------------------------------------------------------===//
-def VSrc_32 : RegisterClass<"AMDGPU", [i32, f32], 32, (add VReg_32, SReg_32)>;
+def VCSrc_32 : RegInlineOperand<VS_32>;
-def VSrc_64 : RegisterClass<"AMDGPU", [i64, f64], 64, (add VReg_64, SReg_64)>;
+def VCSrc_64 : RegInlineOperand<VS_64>;
//===----------------------------------------------------------------------===//
// SGPR and VGPR register classes
diff --git a/lib/Target/R600/SISchedule.td b/lib/Target/R600/SISchedule.td
index 28b65b825855..9b1f676020bf 100644
--- a/lib/Target/R600/SISchedule.td
+++ b/lib/Target/R600/SISchedule.td
@@ -7,9 +7,85 @@
//
//===----------------------------------------------------------------------===//
//
-// TODO: This is just a place holder for now.
+// MachineModel definitions for Southern Islands (SI)
//
//===----------------------------------------------------------------------===//
+def WriteBranch : SchedWrite;
+def WriteExport : SchedWrite;
+def WriteLDS : SchedWrite;
+def WriteSALU : SchedWrite;
+def WriteSMEM : SchedWrite;
+def WriteVMEM : SchedWrite;
-def SI_Itin : ProcessorItineraries <[], [], []>;
+// Vector ALU instructions
+def Write32Bit : SchedWrite;
+def WriteQuarterRate32 : SchedWrite;
+
+def WriteFloatFMA : SchedWrite;
+
+def WriteDouble : SchedWrite;
+def WriteDoubleAdd : SchedWrite;
+
+def SIFullSpeedModel : SchedMachineModel;
+def SIQuarterSpeedModel : SchedMachineModel;
+
+// BufferSize = 0 means the processors are in-order.
+let BufferSize = 0 in {
+
+// XXX: Are the resource counts correct?
+def HWBranch : ProcResource<1>;
+def HWExport : ProcResource<7>; // Taken from S_WAITCNT
+def HWLGKM : ProcResource<31>; // Taken from S_WAITCNT
+def HWSALU : ProcResource<1>;
+def HWVMEM : ProcResource<15>; // Taken from S_WAITCNT
+def HWVALU : ProcResource<1>;
+
+}
+
+class HWWriteRes<SchedWrite write, list<ProcResourceKind> resources,
+ int latency> : WriteRes<write, resources> {
+ let Latency = latency;
+}
+
+class HWVALUWriteRes<SchedWrite write, int latency> :
+ HWWriteRes<write, [HWVALU], latency>;
+
+
+// The latency numbers are taken from AMD Accelerated Parallel Processing
+// guide. They may not be accurate.
+
+// The latency values are 1 / (operations / cycle) / 4.
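+// For example (illustrative): a quarter-rate 32-bit op completing 1/16 of an
+// operation per cycle gives 1 / (1/16) / 4 = 4, the WriteQuarterRate32 value
+// used below.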
+multiclass SICommonWriteRes {
+
+ def : HWWriteRes<WriteBranch, [HWBranch], 100>; // XXX: Guessed ???
+ def : HWWriteRes<WriteExport, [HWExport], 100>; // XXX: Guessed ???
+ def : HWWriteRes<WriteLDS, [HWLGKM], 32>; // 2 - 64
+ def : HWWriteRes<WriteSALU, [HWSALU], 1>;
+ def : HWWriteRes<WriteSMEM, [HWLGKM], 10>; // XXX: Guessed ???
+ def : HWWriteRes<WriteVMEM, [HWVMEM], 450>; // 300 - 600
+
+ def : HWVALUWriteRes<Write32Bit, 1>;
+ def : HWVALUWriteRes<WriteQuarterRate32, 4>;
+}
+
+
+let SchedModel = SIFullSpeedModel in {
+
+defm : SICommonWriteRes;
+
+def : HWVALUWriteRes<WriteFloatFMA, 1>;
+def : HWVALUWriteRes<WriteDouble, 4>;
+def : HWVALUWriteRes<WriteDoubleAdd, 2>;
+
+} // End SchedModel = SIFullSpeedModel
+
+let SchedModel = SIQuarterSpeedModel in {
+
+defm : SICommonWriteRes;
+
+def : HWVALUWriteRes<WriteFloatFMA, 16>;
+def : HWVALUWriteRes<WriteDouble, 16>;
+def : HWVALUWriteRes<WriteDoubleAdd, 8>;
+
+} // End SchedModel = SIQuarterSpeedModel
diff --git a/lib/Target/R600/SIShrinkInstructions.cpp b/lib/Target/R600/SIShrinkInstructions.cpp
index 745c4b65644d..f91d1177bbae 100644
--- a/lib/Target/R600/SIShrinkInstructions.cpp
+++ b/lib/Target/R600/SIShrinkInstructions.cpp
@@ -10,11 +10,13 @@
//
#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Constants.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Function.h"
#include "llvm/Support/Debug.h"
@@ -24,6 +26,8 @@
STATISTIC(NumInstructionsShrunk,
"Number of 64-bit instruction reduced to 32-bit.");
+STATISTIC(NumLiteralConstantsFolded,
+ "Number of literal constants folded into 32-bit instructions.");
namespace llvm {
void initializeSIShrinkInstructionsPass(PassRegistry&);
@@ -41,13 +45,13 @@ public:
SIShrinkInstructions() : MachineFunctionPass(ID) {
}
- virtual bool runOnMachineFunction(MachineFunction &MF) override;
+ bool runOnMachineFunction(MachineFunction &MF) override;
- virtual const char *getPassName() const override {
+ const char *getPassName() const override {
return "SI Shrink Instructions";
}
- virtual void getAnalysisUsage(AnalysisUsage &AU) const override {
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -90,29 +94,83 @@ static bool canShrink(MachineInstr &MI, const SIInstrInfo *TII,
const MachineOperand *Src1Mod =
TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
- if (Src1 && (!isVGPR(Src1, TRI, MRI) || Src1Mod->getImm() != 0))
+ if (Src1 && (!isVGPR(Src1, TRI, MRI) || (Src1Mod && Src1Mod->getImm() != 0)))
return false;
- // We don't need to check src0, all input types are legal, so just make
- // sure src0 isn't using any modifiers.
- const MachineOperand *Src0Mod =
- TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
- if (Src0Mod && Src0Mod->getImm() != 0)
+ // We don't need to check src0, all input types are legal, so just make sure
+ // src0 isn't using any modifiers.
+ if (TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers))
return false;
// Check output modifiers
- const MachineOperand *Omod = TII->getNamedOperand(MI, AMDGPU::OpName::omod);
- if (Omod && Omod->getImm() != 0)
+ if (TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
return false;
- const MachineOperand *Clamp = TII->getNamedOperand(MI, AMDGPU::OpName::clamp);
- return !Clamp || Clamp->getImm() == 0;
+ if (TII->hasModifiersSet(MI, AMDGPU::OpName::clamp))
+ return false;
+
+ return true;
+}
+
+/// \brief This function checks \p MI for operands defined by a move immediate
+/// instruction and then folds the literal constant into the instruction if it
+/// can. This function assumes that \p MI is a VOP1, VOP2, or VOPC instruction
+/// and will only fold literal constants if we are still in SSA.
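+/// For example (illustrative): given "%a = V_MOV_B32_e32 0x1234" feeding
+/// "%b = V_ADD_I32_e32 %a, %c", the literal 0x1234 is folded into src0 of the
+/// add, and the mov is erased once it has no remaining uses.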
+static void foldImmediates(MachineInstr &MI, const SIInstrInfo *TII,
+ MachineRegisterInfo &MRI, bool TryToCommute = true) {
+
+ if (!MRI.isSSA())
+ return;
+
+ assert(TII->isVOP1(MI.getOpcode()) || TII->isVOP2(MI.getOpcode()) ||
+ TII->isVOPC(MI.getOpcode()));
+
+ const SIRegisterInfo &TRI = TII->getRegisterInfo();
+ MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+
+ // Only one literal constant is allowed per instruction, so if src0 is a
+ // literal constant then we can't do any folding.
+ if (Src0->isImm() && TII->isLiteralConstant(*Src0))
+ return;
+
+
+ // Literal constants and SGPRs can only be used in Src0, so if Src0 is an
+ // SGPR, we cannot commute the instruction, so we can't fold any literal
+ // constants.
+ if (Src0->isReg() && !isVGPR(Src0, TRI, MRI))
+ return;
+
+ // Try to fold Src0
+ if (Src0->isReg()) {
+ unsigned Reg = Src0->getReg();
+ MachineInstr *Def = MRI.getUniqueVRegDef(Reg);
+ if (Def && Def->isMoveImmediate()) {
+ MachineOperand &MovSrc = Def->getOperand(1);
+ bool ConstantFolded = false;
+
+ if (MovSrc.isImm() && isUInt<32>(MovSrc.getImm())) {
+ Src0->ChangeToImmediate(MovSrc.getImm());
+ ConstantFolded = true;
+ }
+ if (ConstantFolded) {
+ if (MRI.use_empty(Reg))
+ Def->eraseFromParent();
+ ++NumLiteralConstantsFolded;
+ return;
+ }
+ }
+ }
+
+ // We have failed to fold src0, so commute the instruction and try again.
+ if (TryToCommute && MI.isCommutable() && TII->commuteInstruction(&MI))
+ foldImmediates(MI, TII, MRI, false);
+
}
bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
MachineRegisterInfo &MRI = MF.getRegInfo();
- const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(
- MF.getTarget().getInstrInfo());
+ const SIInstrInfo *TII =
+ static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
const SIRegisterInfo &TRI = TII->getRegisterInfo();
std::vector<unsigned> I1Defs;
@@ -125,11 +183,23 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
Next = std::next(I);
MachineInstr &MI = *I;
+ // Try to use S_MOVK_I32, which will save 4 bytes for small immediates.
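+      // For example (illustrative): s_mov_b32 s0, 100 needs a 32-bit literal
+      // (8 bytes total), while s_movk_i32 s0, 100 packs the immediate into
+      // the 4-byte instruction word.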
+ if (MI.getOpcode() == AMDGPU::S_MOV_B32) {
+ const MachineOperand &Src = MI.getOperand(1);
+
+ if (Src.isImm()) {
+ if (isInt<16>(Src.getImm()) && !TII->isInlineConstant(Src))
+ MI.setDesc(TII->get(AMDGPU::S_MOVK_I32));
+ }
+
+ continue;
+ }
+
if (!TII->hasVALU32BitEncoding(MI.getOpcode()))
continue;
if (!canShrink(MI, TII, TRI, MRI)) {
- // Try commtuing the instruction and see if that enables us to shrink
+ // Try commuting the instruction and see if that enables us to shrink
// it.
if (!MI.isCommutable() || !TII->commuteInstruction(&MI) ||
!canShrink(MI, TII, TRI, MRI))
@@ -147,18 +217,17 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
unsigned DstReg = MI.getOperand(0).getReg();
if (TargetRegisterInfo::isVirtualRegister(DstReg)) {
// VOPC instructions can only write to the VCC register. We can't
- // force them to use VCC here, because the register allocator
- // has trouble with sequences like this, which cause the allocator
- // to run out of registes if vreg0 and vreg1 belong to the VCCReg
- // register class:
+ // force them to use VCC here, because the register allocator has
+ // trouble with sequences like this, which cause the allocator to run
+ // out of registers if vreg0 and vreg1 belong to the VCCReg register
+ // class:
// vreg0 = VOPC;
// vreg1 = VOPC;
// S_AND_B64 vreg0, vreg1
//
- // So, instead of forcing the instruction to write to VCC, we provide a
- // hint to the register allocator to use VCC and then we
- // we will run this pass again after RA and shrink it if it outpus to
- // VCC.
+ // So, instead of forcing the instruction to write to VCC, we provide
+      // a hint to the register allocator to use VCC and then we will run
+ // this pass again after RA and shrink it if it outputs to VCC.
MRI.setRegAllocationHint(MI.getOperand(0).getReg(), 0, AMDGPU::VCC);
continue;
}
@@ -167,27 +236,28 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
}
// We can shrink this instruction
- DEBUG(dbgs() << "Shrinking "; MI.dump(); dbgs() << "\n";);
+ DEBUG(dbgs() << "Shrinking "; MI.dump(); dbgs() << '\n';);
- MachineInstrBuilder MIB =
+ MachineInstrBuilder Inst32 =
BuildMI(MBB, I, MI.getDebugLoc(), TII->get(Op32));
// dst
- MIB.addOperand(MI.getOperand(0));
+ Inst32.addOperand(MI.getOperand(0));
- MIB.addOperand(*TII->getNamedOperand(MI, AMDGPU::OpName::src0));
+ Inst32.addOperand(*TII->getNamedOperand(MI, AMDGPU::OpName::src0));
const MachineOperand *Src1 =
TII->getNamedOperand(MI, AMDGPU::OpName::src1);
if (Src1)
- MIB.addOperand(*Src1);
-
- for (const MachineOperand &MO : MI.implicit_operands())
- MIB.addOperand(MO);
+ Inst32.addOperand(*Src1);
- DEBUG(dbgs() << "e32 MI = "; MI.dump(); dbgs() << "\n";);
++NumInstructionsShrunk;
MI.eraseFromParent();
+
+ foldImmediates(*Inst32, TII, MRI);
+ DEBUG(dbgs() << "e32 MI = " << *Inst32 << '\n');
+
+
}
}
return false;
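
The constraints that the new foldImmediates() enforces above can be restated compactly: an e32 instruction has room for at most one 32-bit literal, only src0 may carry a literal or an SGPR, and an SGPR already sitting in src0 blocks the commute that would free the slot. A minimal stand-alone sketch of that decision, using a hypothetical, greatly simplified operand model rather than the real MachineOperand API:

#include <cstdint>

// Simplified stand-ins for the real operands; purely illustrative.
struct Operand {
  bool IsImm;   // already carries an immediate (the one allowed literal)
  bool IsVGPR;  // virtual register constrained to a VGPR class
};

// Mirrors the checks in foldImmediates(): fold a move-immediate into src0
// only if src0 does not already hold a literal, src0 is a VGPR (an SGPR
// there would block commuting), and the value fits the 32-bit literal slot.
bool canFoldMovImmIntoSrc0(const Operand &Src0, uint64_t MovImm) {
  if (Src0.IsImm)
    return false;              // only one literal per instruction
  if (!Src0.IsVGPR)
    return false;              // SGPR in src0: cannot commute, cannot fold
  return MovImm <= UINT32_MAX; // must be representable as a 32-bit literal
}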
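
The 4 bytes saved by the new S_MOVK_I32 rewrite come from dropping the trailing literal dword: an s_mov_b32 whose immediate is not an inline constant needs a 32-bit literal after the 4-byte instruction word, while SOPK embeds a signed 16-bit immediate in the word itself. A rough size model, assuming the SI integer inline-constant range of -16..64 and ignoring the floating-point inline constants:

#include <cstdint>

// Rough encoded-size model for a scalar move of an integer immediate.
// Assumes the SI integer inline-constant range of -16..64; the handful of
// floating-point inline constants is not modeled here.
unsigned sMovEncodedSize(int64_t Imm) {
  if (Imm >= -16 && Imm <= 64)
    return 4;                       // s_mov_b32 with an inline constant
  if (Imm >= INT16_MIN && Imm <= INT16_MAX)
    return 4;                       // s_movk_i32: simm16 lives in the word
  return 8;                         // s_mov_b32 plus a 32-bit literal dword
}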
diff --git a/lib/Target/R600/SITypeRewriter.cpp b/lib/Target/R600/SITypeRewriter.cpp
index 367963aebb00..9318dc11d55d 100644
--- a/lib/Target/R600/SITypeRewriter.cpp
+++ b/lib/Target/R600/SITypeRewriter.cpp
@@ -87,7 +87,7 @@ void SITypeRewriter::visitLoadInst(LoadInst &I) {
Value *BitCast = Builder.CreateBitCast(Ptr,
PointerType::get(v4i32,PtrTy->getPointerAddressSpace()));
LoadInst *Load = Builder.CreateLoad(BitCast);
- SmallVector <std::pair<unsigned, MDNode*>, 8> MD;
+ SmallVector<std::pair<unsigned, MDNode *>, 8> MD;
I.getAllMetadataOtherThanDebugLoc(MD);
for (unsigned i = 0, e = MD.size(); i != e; ++i) {
Load->setMetadata(MD[i].first, MD[i].second);
diff --git a/lib/Target/R600/TargetInfo/AMDGPUTargetInfo.cpp b/lib/Target/R600/TargetInfo/AMDGPUTargetInfo.cpp
index f437564f4b84..d723d6e3e8b7 100644
--- a/lib/Target/R600/TargetInfo/AMDGPUTargetInfo.cpp
+++ b/lib/Target/R600/TargetInfo/AMDGPUTargetInfo.cpp
@@ -16,11 +16,15 @@
using namespace llvm;
-/// \brief The target for the AMDGPU backend
+/// \brief The target which supports all AMD GPUs. This will eventually
+/// be deprecated and there will be an R600 target and a GCN target.
Target llvm::TheAMDGPUTarget;
+/// \brief The target for GCN GPUs
+Target llvm::TheGCNTarget;
/// \brief Extern function to initialize the targets for the AMDGPU backend
extern "C" void LLVMInitializeR600TargetInfo() {
RegisterTarget<Triple::r600, false>
R600(TheAMDGPUTarget, "r600", "AMD GPUs HD2XXX-HD6XXX");
+  RegisterTarget<Triple::amdgcn, false>
+    GCN(TheGCNTarget, "amdgcn", "AMD GCN GPUs");
}
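
Once the second RegisterTarget call above is in place, any tool linking the R600 TargetInfo library can resolve the new backend by triple through the normal registry lookup. A minimal sketch, assuming LLVM 3.6 headers; the helper name is illustrative:

#include "llvm/Support/TargetRegistry.h"
#include <string>

extern "C" void LLVMInitializeR600TargetInfo();

// Resolve the freshly registered "amdgcn" target by triple.
const llvm::Target *lookupGCN(std::string &Error) {
  LLVMInitializeR600TargetInfo();
  return llvm::TargetRegistry::lookupTarget("amdgcn--", Error);
}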
diff --git a/lib/Target/R600/VIInstrFormats.td b/lib/Target/R600/VIInstrFormats.td
new file mode 100644
index 000000000000..5285d18ced46
--- /dev/null
+++ b/lib/Target/R600/VIInstrFormats.td
@@ -0,0 +1,145 @@
+//===-- VIInstrFormats.td - VI Instruction Encodings ----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// VI Instruction format definitions.
+//
+//===----------------------------------------------------------------------===//
+
+class DSe_vi <bits<8> op> : Enc64 {
+
+ bits<8> vdst;
+ bits<1> gds;
+ bits<8> addr;
+ bits<8> data0;
+ bits<8> data1;
+ bits<8> offset0;
+ bits<8> offset1;
+
+ let Inst{7-0} = offset0;
+ let Inst{15-8} = offset1;
+ let Inst{16} = gds;
+ let Inst{24-17} = op;
+ let Inst{31-26} = 0x36; //encoding
+ let Inst{39-32} = addr;
+ let Inst{47-40} = data0;
+ let Inst{55-48} = data1;
+ let Inst{63-56} = vdst;
+}
+
+class MUBUFe_vi <bits<7> op> : Enc64 {
+
+ bits<12> offset;
+ bits<1> offen;
+ bits<1> idxen;
+ bits<1> glc;
+ bits<1> lds;
+ bits<8> vaddr;
+ bits<8> vdata;
+ bits<7> srsrc;
+ bits<1> slc;
+ bits<1> tfe;
+ bits<8> soffset;
+
+ let Inst{11-0} = offset;
+ let Inst{12} = offen;
+ let Inst{13} = idxen;
+ let Inst{14} = glc;
+ let Inst{16} = lds;
+ let Inst{17} = slc;
+ let Inst{24-18} = op;
+ let Inst{31-26} = 0x38; //encoding
+ let Inst{39-32} = vaddr;
+ let Inst{47-40} = vdata;
+ let Inst{52-48} = srsrc{6-2};
+ let Inst{55} = tfe;
+ let Inst{63-56} = soffset;
+}
+
+class MTBUFe_vi <bits<4> op> : Enc64 {
+
+ bits<12> offset;
+ bits<1> offen;
+ bits<1> idxen;
+ bits<1> glc;
+ bits<4> dfmt;
+ bits<3> nfmt;
+ bits<8> vaddr;
+ bits<8> vdata;
+ bits<7> srsrc;
+ bits<1> slc;
+ bits<1> tfe;
+ bits<8> soffset;
+
+ let Inst{11-0} = offset;
+ let Inst{12} = offen;
+ let Inst{13} = idxen;
+ let Inst{14} = glc;
+ let Inst{18-15} = op;
+ let Inst{22-19} = dfmt;
+ let Inst{25-23} = nfmt;
+ let Inst{31-26} = 0x3a; //encoding
+ let Inst{39-32} = vaddr;
+ let Inst{47-40} = vdata;
+ let Inst{52-48} = srsrc{6-2};
+ let Inst{54} = slc;
+ let Inst{55} = tfe;
+ let Inst{63-56} = soffset;
+}
+
+class SMEMe_vi <bits<8> op, bit imm> : Enc64 {
+
+ bits<7> sbase;
+ bits<7> sdata;
+ bits<1> glc;
+ bits<20> offset;
+
+ let Inst{5-0} = sbase{6-1};
+ let Inst{12-6} = sdata;
+ let Inst{16} = glc;
+ let Inst{17} = imm;
+ let Inst{25-18} = op;
+ let Inst{31-26} = 0x30; //encoding
+ let Inst{51-32} = offset;
+}
+
+class VOP3e_vi <bits<10> op> : Enc64 {
+
+ bits<8> dst;
+ bits<2> src0_modifiers;
+ bits<9> src0;
+ bits<2> src1_modifiers;
+ bits<9> src1;
+ bits<2> src2_modifiers;
+ bits<9> src2;
+ bits<1> clamp;
+ bits<2> omod;
+
+ let Inst{7-0} = dst;
+ let Inst{8} = src0_modifiers{1};
+ let Inst{9} = src1_modifiers{1};
+ let Inst{10} = src2_modifiers{1};
+ let Inst{15} = clamp;
+ let Inst{25-16} = op;
+ let Inst{31-26} = 0x34; //encoding
+ let Inst{40-32} = src0;
+ let Inst{49-41} = src1;
+ let Inst{58-50} = src2;
+ let Inst{60-59} = omod;
+ let Inst{61} = src0_modifiers{0};
+ let Inst{62} = src1_modifiers{0};
+ let Inst{63} = src2_modifiers{0};
+}
+
+class EXPe_vi : EXPe {
+ let Inst{31-26} = 0x31; //encoding
+}
+
+class VINTRPe_vi <bits<2> op> : VINTRPe <op> {
+ let Inst{31-26} = 0x35; // encoding
+}
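
Every let Inst{hi-lo} = field assignment above is a plain field insertion into a 64-bit word. As an illustration of the VOP3e_vi layout, the sketch below packs the same fields with shifts and masks; only the bit positions and the 0x34 encoding tag are taken from the class, the helper itself is not part of the backend:

#include <cstdint>

// Pack a VI VOP3 encoding following the bit positions in VOP3e_vi.
uint64_t packVOP3e_vi(uint16_t op, uint8_t dst,
                      uint8_t src0_mods, uint16_t src0,
                      uint8_t src1_mods, uint16_t src1,
                      uint8_t src2_mods, uint16_t src2,
                      uint8_t clamp, uint8_t omod) {
  uint64_t Inst = 0;
  Inst |= (uint64_t)dst                    << 0;   // Inst{7-0}   = dst
  Inst |= (uint64_t)((src0_mods >> 1) & 1) << 8;   // Inst{8}     = src0_modifiers{1}
  Inst |= (uint64_t)((src1_mods >> 1) & 1) << 9;   // Inst{9}     = src1_modifiers{1}
  Inst |= (uint64_t)((src2_mods >> 1) & 1) << 10;  // Inst{10}    = src2_modifiers{1}
  Inst |= (uint64_t)(clamp & 1)            << 15;  // Inst{15}    = clamp
  Inst |= (uint64_t)(op & 0x3ff)           << 16;  // Inst{25-16} = op
  Inst |= (uint64_t)0x34                   << 26;  // Inst{31-26} = encoding
  Inst |= (uint64_t)(src0 & 0x1ff)         << 32;  // Inst{40-32} = src0
  Inst |= (uint64_t)(src1 & 0x1ff)         << 41;  // Inst{49-41} = src1
  Inst |= (uint64_t)(src2 & 0x1ff)         << 50;  // Inst{58-50} = src2
  Inst |= (uint64_t)(omod & 3)             << 59;  // Inst{60-59} = omod
  Inst |= (uint64_t)(src0_mods & 1)        << 61;  // Inst{61}    = src0_modifiers{0}
  Inst |= (uint64_t)(src1_mods & 1)        << 62;  // Inst{62}    = src1_modifiers{0}
  Inst |= (uint64_t)(src2_mods & 1)        << 63;  // Inst{63}    = src2_modifiers{0}
  return Inst;
}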
diff --git a/lib/Target/R600/VIInstructions.td b/lib/Target/R600/VIInstructions.td
new file mode 100644
index 000000000000..07cfa29ae12b
--- /dev/null
+++ b/lib/Target/R600/VIInstructions.td
@@ -0,0 +1,89 @@
+//===-- VIInstructions.td - VI Instruction Definitions --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// Instruction definitions for VI and newer.
+//===----------------------------------------------------------------------===//
+
+let SubtargetPredicate = isVI in {
+
+def V_LDEXP_F32 : VOP3InstVI <0x288, "v_ldexp_f32", VOP_F32_F32_I32,
+ AMDGPUldexp
+>;
+def V_BFM_B32 : VOP3InstVI <0x293, "v_bfm_b32", VOP_I32_I32_I32, AMDGPUbfm>;
+def V_BCNT_U32_B32 : VOP3InstVI <0x28b, "v_bcnt_u32_b32", VOP_I32_I32_I32>;
+def V_MBCNT_LO_U32_B32 : VOP3InstVI <0x28c, "v_mbcnt_lo_u32_b32",
+ VOP_I32_I32_I32
+>;
+def V_MBCNT_HI_U32_B32 : VOP3InstVI <0x28d, "v_mbcnt_hi_u32_b32",
+ VOP_I32_I32_I32
+>;
+
+def V_CVT_PKRTZ_F16_F32 : VOP3InstVI <0x296, "v_cvt_pkrtz_f16_f32",
+ VOP_I32_F32_F32, int_SI_packf16
+>;
+
+defm BUFFER_LOAD_DWORD_VI : MUBUF_Load_Helper_vi <
+ 0x14, "buffer_load_dword", VGPR_32, i32, global_load
+>;
+
+defm BUFFER_LOAD_FORMAT_XYZW_VI : MUBUF_Load_Helper_vi <
+ 0x03, "buffer_load_format_xyzw", VReg_128
+>;
+
+} // End SubtargetPredicate = isVI
+
+//===----------------------------------------------------------------------===//
+// VOP2 Patterns
+//===----------------------------------------------------------------------===//
+
+let Predicates = [isVI] in {
+
+def : Pat <
+ (int_SI_tid),
+ (V_MBCNT_HI_U32_B32 0xffffffff,
+ (V_MBCNT_LO_U32_B32 0xffffffff, 0))
+>;
+
+//===----------------------------------------------------------------------===//
+// SMEM Patterns
+//===----------------------------------------------------------------------===//
+
+// Offset as a 20-bit immediate
+def : Pat <
+ (SIload_constant v4i32:$sbase, IMM20bit:$offset),
+ (S_BUFFER_LOAD_DWORD_IMM $sbase, (as_i32imm $offset))
+>;
+
+//===----------------------------------------------------------------------===//
+// MUBUF Patterns
+//===----------------------------------------------------------------------===//
+
+// Offset in a 32-bit VGPR
+def : Pat <
+ (SIload_constant v4i32:$sbase, i32:$voff),
+ (BUFFER_LOAD_DWORD_VI_OFFEN $sbase, $voff, 0, 0, 0, 0, 0)
+>;
+
+// Offset in a 32-bit VGPR
+def : Pat <
+ (SIload_constant v4i32:$sbase, i32:$voff),
+ (BUFFER_LOAD_DWORD_VI_OFFEN $sbase, $voff, 0, 0, 0, 0, 0)
+>;
+
+/* int_SI_vs_load_input */
+def : Pat<
+ (SIload_input v4i32:$tlst, imm:$attr_offset, i32:$buf_idx_vgpr),
+ (BUFFER_LOAD_FORMAT_XYZW_VI_IDXEN $tlst, $buf_idx_vgpr, imm:$attr_offset, 0, 0, 0, 0)
+>;
+
+defm : MUBUF_Load_Dword <i32, BUFFER_LOAD_DWORD_VI_OFFSET,
+ BUFFER_LOAD_DWORD_VI_OFFEN,
+ BUFFER_LOAD_DWORD_VI_IDXEN,
+ BUFFER_LOAD_DWORD_VI_BOTHEN>;
+
+} // End Predicates = [isVI]
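
The int_SI_tid pattern above composes a lane index from the two mask-bit-count instructions: v_mbcnt_lo_u32_b32 counts the set bits of its mask that lie below the current lane within the low 32 lanes, and v_mbcnt_hi_u32_b32 does the same for the high 32 lanes while adding src1. With both masks set to 0xffffffff the sum is the lane's position in the 64-lane wavefront. A scalar sketch of that semantics, with hypothetical helper names and the per-lane behaviour modeled in plain C++:

#include <bitset>
#include <cstdint>

// Set bits of Mask strictly below position Lane, with Lane clamped to [0, 32].
static unsigned bitsBelow(uint32_t Mask, unsigned Lane) {
  if (Lane >= 32)
    return (unsigned)std::bitset<32>(Mask).count();
  return (unsigned)std::bitset<32>(Mask & ((1u << Lane) - 1u)).count();
}

// Scalar model of v_mbcnt_lo_u32_b32 followed by v_mbcnt_hi_u32_b32 for one
// lane of a 64-lane wavefront. With both mask operands set to 0xffffffff the
// result is the lane index, which is how this pattern produces a thread id.
unsigned mbcntTid(unsigned Lane /* 0..63 */) {
  unsigned Lo = bitsBelow(0xffffffffu, Lane);                         // v_mbcnt_lo
  unsigned Hi = (Lane < 32) ? 0u : bitsBelow(0xffffffffu, Lane - 32); // v_mbcnt_hi
  return Lo + Hi; // equals Lane
}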