path: root/contrib/llvm/lib/Target/NVPTX
Diffstat (limited to 'contrib/llvm/lib/Target/NVPTX')
-rw-r--r--  contrib/llvm/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp | 290
-rw-r--r--  contrib/llvm/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.h | 53
-rw-r--r--  contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXBaseInfo.h | 100
-rw-r--r--  contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp | 56
-rw-r--r--  contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h | 30
-rw-r--r--  contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp | 105
-rw-r--r--  contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.h | 36
-rw-r--r--  contrib/llvm/lib/Target/NVPTX/ManagedStringPool.h | 48
-rw-r--r--  contrib/llvm/lib/Target/NVPTX/NVPTX.h | 194
-rw-r--r--  contrib/llvm/lib/Target/NVPTX/NVPTX.td | 69
-rw-r--r--  contrib/llvm/lib/Target/NVPTX/NVPTXAllocaHoisting.cpp | 46
-rw-r--r--  contrib/llvm/lib/Target/NVPTX/NVPTXAllocaHoisting.h | 50
-rw-r--r--  contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp | 2292
-rw-r--r--  contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h | 333
-rw-r--r--  contrib/llvm/lib/Target/NVPTX/NVPTXAssignValidGlobalNames.cpp | 84
-rw-r--r--  contrib/llvm/lib/Target/NVPTX/NVPTXFavorNonGenericAddrSpaces.cpp | 195
-rw-r--r--  contrib/llvm/lib/Target/NVPTX/NVPTXFrameLowering.cpp | 81
-rw-r--r--  contrib/llvm/lib/Target/NVPTX/NVPTXFrameLowering.h | 38
-rw-r--r--  contrib/llvm/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp | 433
-rw-r--r--  contrib/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 5085
-rw-r--r--  contrib/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h | 94
-rw-r--r--  contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 4516
-rw-r--r--  contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.h | 541
-rw-r--r--  contrib/llvm/lib/Target/NVPTX/NVPTXImageOptimizer.cpp | 178
-rw-r--r--  contrib/llvm/lib/Target/NVPTX/NVPTXInstrFormats.td | 59
-rw-r--r--  contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp | 273
-rw-r--r--  contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.h | 78
-rw-r--r--  contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 2741
-rw-r--r--  contrib/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 7047
-rw-r--r--  contrib/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp | 205
-rw-r--r--  contrib/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.h | 48
-rw-r--r--  contrib/llvm/lib/Target/NVPTX/NVPTXMCExpr.cpp | 47
-rw-r--r--  contrib/llvm/lib/Target/NVPTX/NVPTXMCExpr.h | 83
-rw-r--r--  contrib/llvm/lib/Target/NVPTX/NVPTXMachineFunctionInfo.h | 46
-rw-r--r--  contrib/llvm/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp | 227
-rw-r--r--  contrib/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.cpp | 111
-rw-r--r--  contrib/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.h | 72
-rw-r--r--  contrib/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td | 69
-rw-r--r--  contrib/llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp | 189
-rw-r--r--  contrib/llvm/lib/Target/NVPTX/NVPTXSection.h | 48
-rw-r--r--  contrib/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp | 70
-rw-r--r--  contrib/llvm/lib/Target/NVPTX/NVPTXSubtarget.h | 116
-rw-r--r--  contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp | 250
-rw-r--r--  contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h | 100
-rw-r--r--  contrib/llvm/lib/Target/NVPTX/NVPTXTargetObjectFile.h | 105
-rw-r--r--  contrib/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp | 543
-rw-r--r--  contrib/llvm/lib/Target/NVPTX/NVPTXUtilities.h | 96
-rw-r--r--  contrib/llvm/lib/Target/NVPTX/NVPTXVector.td | 1481
-rw-r--r--  contrib/llvm/lib/Target/NVPTX/NVPTXutil.cpp | 90
-rw-r--r--  contrib/llvm/lib/Target/NVPTX/NVPTXutil.h | 25
-rw-r--r--  contrib/llvm/lib/Target/NVPTX/NVVMReflect.cpp | 222
-rw-r--r--  contrib/llvm/lib/Target/NVPTX/TargetInfo/NVPTXTargetInfo.cpp | 23
-rw-r--r--  contrib/llvm/lib/Target/NVPTX/cl_common_defines.h | 122
53 files changed, 29533 insertions, 0 deletions
diff --git a/contrib/llvm/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp b/contrib/llvm/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp
new file mode 100644
index 000000000000..80b2f621fb94
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp
@@ -0,0 +1,290 @@
+//===-- NVPTXInstPrinter.cpp - PTX assembly instruction printing ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Print MCInst instructions to .ptx format.
+//
+//===----------------------------------------------------------------------===//
+
+#include "InstPrinter/NVPTXInstPrinter.h"
+#include "MCTargetDesc/NVPTXBaseInfo.h"
+#include "NVPTX.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+#include <cctype>
+using namespace llvm;
+
+#define DEBUG_TYPE "asm-printer"
+
+#include "NVPTXGenAsmWriter.inc"
+
+
+NVPTXInstPrinter::NVPTXInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
+ const MCRegisterInfo &MRI,
+ const MCSubtargetInfo &STI)
+ : MCInstPrinter(MAI, MII, MRI) {
+ setAvailableFeatures(STI.getFeatureBits());
+}
+
+void NVPTXInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
+ // Decode the virtual register
+ // Must be kept in sync with NVPTXAsmPrinter::encodeVirtualRegister
+ unsigned RCId = (RegNo >> 28);
+ switch (RCId) {
+ default: report_fatal_error("Bad virtual register encoding");
+ case 0:
+ // This is actually a physical register, so defer to the autogenerated
+ // register printer
+ OS << getRegisterName(RegNo);
+ return;
+ case 1:
+ OS << "%p";
+ break;
+ case 2:
+ OS << "%rs";
+ break;
+ case 3:
+ OS << "%r";
+ break;
+ case 4:
+ OS << "%rd";
+ break;
+ case 5:
+ OS << "%f";
+ break;
+ case 6:
+ OS << "%fd";
+ break;
+ }
+
+ unsigned VReg = RegNo & 0x0FFFFFFF;
+ OS << VReg;
+}
+
+void NVPTXInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
+ StringRef Annot) {
+ printInstruction(MI, OS);
+
+ // Next always print the annotation.
+ printAnnotation(OS, Annot);
+}
+
+void NVPTXInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ const MCOperand &Op = MI->getOperand(OpNo);
+ if (Op.isReg()) {
+ unsigned Reg = Op.getReg();
+ printRegName(O, Reg);
+ } else if (Op.isImm()) {
+ O << markup("<imm:") << formatImm(Op.getImm()) << markup(">");
+ } else {
+ assert(Op.isExpr() && "Unknown operand kind in printOperand");
+ O << *Op.getExpr();
+ }
+}
+
+void NVPTXInstPrinter::printCvtMode(const MCInst *MI, int OpNum, raw_ostream &O,
+ const char *Modifier) {
+ const MCOperand &MO = MI->getOperand(OpNum);
+ int64_t Imm = MO.getImm();
+
+ if (strcmp(Modifier, "ftz") == 0) {
+ // FTZ flag
+ if (Imm & NVPTX::PTXCvtMode::FTZ_FLAG)
+ O << ".ftz";
+ } else if (strcmp(Modifier, "sat") == 0) {
+ // SAT flag
+ if (Imm & NVPTX::PTXCvtMode::SAT_FLAG)
+ O << ".sat";
+ } else if (strcmp(Modifier, "base") == 0) {
+ // Default operand
+ switch (Imm & NVPTX::PTXCvtMode::BASE_MASK) {
+ default:
+ return;
+ case NVPTX::PTXCvtMode::NONE:
+ break;
+ case NVPTX::PTXCvtMode::RNI:
+ O << ".rni";
+ break;
+ case NVPTX::PTXCvtMode::RZI:
+ O << ".rzi";
+ break;
+ case NVPTX::PTXCvtMode::RMI:
+ O << ".rmi";
+ break;
+ case NVPTX::PTXCvtMode::RPI:
+ O << ".rpi";
+ break;
+ case NVPTX::PTXCvtMode::RN:
+ O << ".rn";
+ break;
+ case NVPTX::PTXCvtMode::RZ:
+ O << ".rz";
+ break;
+ case NVPTX::PTXCvtMode::RM:
+ O << ".rm";
+ break;
+ case NVPTX::PTXCvtMode::RP:
+ O << ".rp";
+ break;
+ }
+ } else {
+ llvm_unreachable("Invalid conversion modifier");
+ }
+}
+
+void NVPTXInstPrinter::printCmpMode(const MCInst *MI, int OpNum, raw_ostream &O,
+ const char *Modifier) {
+ const MCOperand &MO = MI->getOperand(OpNum);
+ int64_t Imm = MO.getImm();
+
+ if (strcmp(Modifier, "ftz") == 0) {
+ // FTZ flag
+ if (Imm & NVPTX::PTXCmpMode::FTZ_FLAG)
+ O << ".ftz";
+ } else if (strcmp(Modifier, "base") == 0) {
+ switch (Imm & NVPTX::PTXCmpMode::BASE_MASK) {
+ default:
+ return;
+ case NVPTX::PTXCmpMode::EQ:
+ O << ".eq";
+ break;
+ case NVPTX::PTXCmpMode::NE:
+ O << ".ne";
+ break;
+ case NVPTX::PTXCmpMode::LT:
+ O << ".lt";
+ break;
+ case NVPTX::PTXCmpMode::LE:
+ O << ".le";
+ break;
+ case NVPTX::PTXCmpMode::GT:
+ O << ".gt";
+ break;
+ case NVPTX::PTXCmpMode::GE:
+ O << ".ge";
+ break;
+ case NVPTX::PTXCmpMode::LO:
+ O << ".lo";
+ break;
+ case NVPTX::PTXCmpMode::LS:
+ O << ".ls";
+ break;
+ case NVPTX::PTXCmpMode::HI:
+ O << ".hi";
+ break;
+ case NVPTX::PTXCmpMode::HS:
+ O << ".hs";
+ break;
+ case NVPTX::PTXCmpMode::EQU:
+ O << ".equ";
+ break;
+ case NVPTX::PTXCmpMode::NEU:
+ O << ".neu";
+ break;
+ case NVPTX::PTXCmpMode::LTU:
+ O << ".ltu";
+ break;
+ case NVPTX::PTXCmpMode::LEU:
+ O << ".leu";
+ break;
+ case NVPTX::PTXCmpMode::GTU:
+ O << ".gtu";
+ break;
+ case NVPTX::PTXCmpMode::GEU:
+ O << ".geu";
+ break;
+ case NVPTX::PTXCmpMode::NUM:
+ O << ".num";
+ break;
+ case NVPTX::PTXCmpMode::NotANumber:
+ O << ".nan";
+ break;
+ }
+ } else {
+ llvm_unreachable("Empty Modifier");
+ }
+}
+
+void NVPTXInstPrinter::printLdStCode(const MCInst *MI, int OpNum,
+ raw_ostream &O, const char *Modifier) {
+ if (Modifier) {
+ const MCOperand &MO = MI->getOperand(OpNum);
+ int Imm = (int) MO.getImm();
+ if (!strcmp(Modifier, "volatile")) {
+ if (Imm)
+ O << ".volatile";
+ } else if (!strcmp(Modifier, "addsp")) {
+ switch (Imm) {
+ case NVPTX::PTXLdStInstCode::GLOBAL:
+ O << ".global";
+ break;
+ case NVPTX::PTXLdStInstCode::SHARED:
+ O << ".shared";
+ break;
+ case NVPTX::PTXLdStInstCode::LOCAL:
+ O << ".local";
+ break;
+ case NVPTX::PTXLdStInstCode::PARAM:
+ O << ".param";
+ break;
+ case NVPTX::PTXLdStInstCode::CONSTANT:
+ O << ".const";
+ break;
+ case NVPTX::PTXLdStInstCode::GENERIC:
+ break;
+ default:
+ llvm_unreachable("Wrong Address Space");
+ }
+ } else if (!strcmp(Modifier, "sign")) {
+ if (Imm == NVPTX::PTXLdStInstCode::Signed)
+ O << "s";
+ else if (Imm == NVPTX::PTXLdStInstCode::Unsigned)
+ O << "u";
+ else
+ O << "f";
+ } else if (!strcmp(Modifier, "vec")) {
+ if (Imm == NVPTX::PTXLdStInstCode::V2)
+ O << ".v2";
+ else if (Imm == NVPTX::PTXLdStInstCode::V4)
+ O << ".v4";
+ } else
+ llvm_unreachable("Unknown Modifier");
+ } else
+ llvm_unreachable("Empty Modifier");
+}
+
+void NVPTXInstPrinter::printMemOperand(const MCInst *MI, int OpNum,
+ raw_ostream &O, const char *Modifier) {
+ printOperand(MI, OpNum, O);
+
+ if (Modifier && !strcmp(Modifier, "add")) {
+ O << ", ";
+ printOperand(MI, OpNum + 1, O);
+ } else {
+ if (MI->getOperand(OpNum + 1).isImm() &&
+ MI->getOperand(OpNum + 1).getImm() == 0)
+ return; // don't print ',0' or '+0'
+ O << "+";
+ printOperand(MI, OpNum + 1, O);
+ }
+}
+
+void NVPTXInstPrinter::printProtoIdent(const MCInst *MI, int OpNum,
+ raw_ostream &O, const char *Modifier) {
+ const MCOperand &Op = MI->getOperand(OpNum);
+ assert(Op.isExpr() && "Call prototype is not an MCExpr?");
+ const MCExpr *Expr = Op.getExpr();
+ const MCSymbol &Sym = cast<MCSymbolRefExpr>(Expr)->getSymbol();
+ O << Sym.getName();
+}
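
Editor's note: printRegName above decodes the packed virtual-register scheme shared with NVPTXAsmPrinter::encodeVirtualRegister later in this diff — the register-class id sits in bits 31..28 and the virtual-register number in bits 27..0, with class id 0 reserved for physical registers. A minimal standalone sketch of that packing follows; the helper names packVReg/unpackVReg are illustrative and not part of the patch.

#include <cassert>
#include <cstdint>
#include <string>

// Class ids follow printRegName: 1=%p, 2=%rs, 3=%r, 4=%rd, 5=%f, 6=%fd.
static uint32_t packVReg(unsigned ClassId, unsigned VRegNum) {
  assert(ClassId >= 1 && ClassId <= 6 && VRegNum <= 0x0FFFFFFFu);
  return (ClassId << 28) | (VRegNum & 0x0FFFFFFFu);
}

static std::string unpackVReg(uint32_t Encoded) {
  static const char *Prefix[] = {"", "%p", "%rs", "%r", "%rd", "%f", "%fd"};
  unsigned ClassId = Encoded >> 28; // 0 would denote a physical register
  return std::string(Prefix[ClassId]) + std::to_string(Encoded & 0x0FFFFFFFu);
}

// Example: packVReg(3, 7) round-trips through unpackVReg as "%r7".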
diff --git a/contrib/llvm/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.h b/contrib/llvm/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.h
new file mode 100644
index 000000000000..1fb3c57390c2
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.h
@@ -0,0 +1,53 @@
+//= NVPTXInstPrinter.h - Convert NVPTX MCInst to assembly syntax --*- C++ -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints an NVPTX MCInst to .ptx file syntax.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef NVPTX_INST_PRINTER_H
+#define NVPTX_INST_PRINTER_H
+
+#include "llvm/MC/MCInstPrinter.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace llvm {
+
+class MCOperand;
+class MCSubtargetInfo;
+
+class NVPTXInstPrinter : public MCInstPrinter {
+public:
+ NVPTXInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
+ const MCRegisterInfo &MRI, const MCSubtargetInfo &STI);
+
+ void printRegName(raw_ostream &OS, unsigned RegNo) const override;
+ void printInst(const MCInst *MI, raw_ostream &OS, StringRef Annot) override;
+
+ // Autogenerated by tblgen.
+ void printInstruction(const MCInst *MI, raw_ostream &O);
+ static const char *getRegisterName(unsigned RegNo);
+ // End
+
+ void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printCvtMode(const MCInst *MI, int OpNum, raw_ostream &O,
+ const char *Modifier = nullptr);
+ void printCmpMode(const MCInst *MI, int OpNum, raw_ostream &O,
+ const char *Modifier = nullptr);
+ void printLdStCode(const MCInst *MI, int OpNum,
+ raw_ostream &O, const char *Modifier = nullptr);
+ void printMemOperand(const MCInst *MI, int OpNum,
+ raw_ostream &O, const char *Modifier = nullptr);
+ void printProtoIdent(const MCInst *MI, int OpNum,
+ raw_ostream &O, const char *Modifier = nullptr);
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXBaseInfo.h b/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXBaseInfo.h
new file mode 100644
index 000000000000..16ec19c25f16
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXBaseInfo.h
@@ -0,0 +1,100 @@
+//===-- NVPTXBaseInfo.h - Top-level definitions for NVPTX -------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains small standalone helper functions and enum definitions for
+// the NVPTX target useful for the compiler back-end and the MC libraries.
+// As such, it deliberately does not include references to LLVM core
+// code gen types, passes, etc..
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef NVPTXBASEINFO_H
+#define NVPTXBASEINFO_H
+
+namespace llvm {
+
+enum AddressSpace {
+ ADDRESS_SPACE_GENERIC = 0,
+ ADDRESS_SPACE_GLOBAL = 1,
+ ADDRESS_SPACE_SHARED = 3,
+ ADDRESS_SPACE_CONST = 4,
+ ADDRESS_SPACE_LOCAL = 5,
+
+ // NVVM Internal
+ ADDRESS_SPACE_PARAM = 101
+};
+
+enum PropertyAnnotation {
+ PROPERTY_MAXNTID_X = 0,
+ PROPERTY_MAXNTID_Y,
+ PROPERTY_MAXNTID_Z,
+ PROPERTY_REQNTID_X,
+ PROPERTY_REQNTID_Y,
+ PROPERTY_REQNTID_Z,
+ PROPERTY_MINNCTAPERSM,
+ PROPERTY_ISTEXTURE,
+ PROPERTY_ISSURFACE,
+ PROPERTY_ISSAMPLER,
+ PROPERTY_ISREADONLY_IMAGE_PARAM,
+ PROPERTY_ISWRITEONLY_IMAGE_PARAM,
+ PROPERTY_ISREADWRITE_IMAGE_PARAM,
+ PROPERTY_ISKERNEL_FUNCTION,
+ PROPERTY_ALIGN,
+ PROPERTY_MANAGED,
+
+ // last property
+ PROPERTY_LAST
+};
+
+const unsigned AnnotationNameLen = 9; // length of each annotation name
+const char PropertyAnnotationNames[PROPERTY_LAST + 1][AnnotationNameLen + 1] = {
+ "maxntidx", // PROPERTY_MAXNTID_X
+ "maxntidy", // PROPERTY_MAXNTID_Y
+ "maxntidz", // PROPERTY_MAXNTID_Z
+ "reqntidx", // PROPERTY_REQNTID_X
+ "reqntidy", // PROPERTY_REQNTID_Y
+ "reqntidz", // PROPERTY_REQNTID_Z
+ "minctasm", // PROPERTY_MINNCTAPERSM
+ "texture", // PROPERTY_ISTEXTURE
+ "surface", // PROPERTY_ISSURFACE
+ "sampler", // PROPERTY_ISSAMPLER
+ "rdoimage", // PROPERTY_ISREADONLY_IMAGE_PARAM
+ "wroimage", // PROPERTY_ISWRITEONLY_IMAGE_PARAM
+ "rdwrimage", // PROPERTY_ISREADWRITE_IMAGE_PARAM
+ "kernel", // PROPERTY_ISKERNEL_FUNCTION
+ "align", // PROPERTY_ALIGN
+ "managed", // PROPERTY_MANAGED
+
+ // last property
+ "proplast", // PROPERTY_LAST
+};
+
+// name of named metadata used for global annotations
+#if defined(__GNUC__)
+// As this is declared to be static but some of the .cpp files that
+// include NVVM.h do not use this array, gcc gives a warning when
+// compiling those .cpp files, hence __attribute__((unused)).
+__attribute__((unused))
+#endif
+ static const char *NamedMDForAnnotations = "nvvm.annotations";
+
+namespace NVPTXII {
+enum {
+ // These must be kept in sync with TSFlags in NVPTXInstrFormats.td
+ IsTexFlag = 0x80,
+ IsSuldMask = 0x300,
+ IsSuldShift = 8,
+ IsSustFlag = 0x400,
+ IsSurfTexQueryFlag = 0x800,
+ IsTexModeUnifiedFlag = 0x1000
+};
+}
+}
+
+#endif
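
Editor's note: the NVPTXII values above mirror TSFlags bits set in NVPTXInstrFormats.td; NVPTXAsmPrinter::lowerImageHandleOperand (later in this diff) reads them to locate texture and surface operands. A small sketch of how the bits are queried, assuming this header is on the include path; isTextureFetch/suldVectorSize are hypothetical helper names.

#include <cstdint>
#include "MCTargetDesc/NVPTXBaseInfo.h"

static bool isTextureFetch(uint64_t TSFlags) {
  return TSFlags & llvm::NVPTXII::IsTexFlag;
}

// The IsSuld field holds log2(vector size) + 1 for surface loads, so a field
// value of 3 decodes to a four-element load; this is the same expression used
// in NVPTXAsmPrinter::lowerImageHandleOperand.
static unsigned suldVectorSize(uint64_t TSFlags) {
  return 1u << (((TSFlags & llvm::NVPTXII::IsSuldMask) >>
                 llvm::NVPTXII::IsSuldShift) - 1);
}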
diff --git a/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp b/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp
new file mode 100644
index 000000000000..366341afe1b8
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp
@@ -0,0 +1,56 @@
+//===-- NVPTXMCAsmInfo.cpp - NVPTX asm properties -------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of the NVPTXMCAsmInfo properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTXMCAsmInfo.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/Support/CommandLine.h"
+
+using namespace llvm;
+
+// -debug-compile - Command line option to inform opt and llc passes to
+// compile for debugging
+static cl::opt<bool> CompileForDebugging("debug-compile",
+ cl::desc("Compile for debugging"),
+ cl::Hidden, cl::init(false));
+
+void NVPTXMCAsmInfo::anchor() {}
+
+NVPTXMCAsmInfo::NVPTXMCAsmInfo(const StringRef &TT) {
+ Triple TheTriple(TT);
+ if (TheTriple.getArch() == Triple::nvptx64) {
+ PointerSize = CalleeSaveStackSlotSize = 8;
+ }
+
+ CommentString = "//";
+
+ HasSetDirective = false;
+
+ HasSingleParameterDotFile = false;
+
+ InlineAsmStart = " inline asm";
+ InlineAsmEnd = " inline asm";
+
+ SupportsDebugInformation = CompileForDebugging;
+ HasDotTypeDotSizeDirective = false;
+
+ Data8bitsDirective = " .b8 ";
+ Data16bitsDirective = " .b16 ";
+ Data32bitsDirective = " .b32 ";
+ Data64bitsDirective = " .b64 ";
+ ZeroDirective = " .b8";
+ AsciiDirective = " .b8";
+ AscizDirective = " .b8";
+
+ // @TODO: Can we just disable this?
+ GlobalDirective = "\t// .globl\t";
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h b/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h
new file mode 100644
index 000000000000..7d1633f60d2c
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h
@@ -0,0 +1,30 @@
+//===-- NVPTXMCAsmInfo.h - NVPTX asm properties ----------------*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the NVPTXMCAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef NVPTX_MCASM_INFO_H
+#define NVPTX_MCASM_INFO_H
+
+#include "llvm/MC/MCAsmInfo.h"
+
+namespace llvm {
+class Target;
+class StringRef;
+
+class NVPTXMCAsmInfo : public MCAsmInfo {
+ virtual void anchor();
+public:
+ explicit NVPTXMCAsmInfo(const StringRef &TT);
+};
+} // namespace llvm
+
+#endif // NVPTX_MCASM_INFO_H
diff --git a/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp b/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp
new file mode 100644
index 000000000000..158ca901e586
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp
@@ -0,0 +1,105 @@
+//===-- NVPTXMCTargetDesc.cpp - NVPTX Target Descriptions -------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides NVPTX specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTXMCTargetDesc.h"
+#include "InstPrinter/NVPTXInstPrinter.h"
+#include "NVPTXMCAsmInfo.h"
+#include "llvm/MC/MCCodeGenInfo.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+#define GET_INSTRINFO_MC_DESC
+#include "NVPTXGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_MC_DESC
+#include "NVPTXGenSubtargetInfo.inc"
+
+#define GET_REGINFO_MC_DESC
+#include "NVPTXGenRegisterInfo.inc"
+
+static MCInstrInfo *createNVPTXMCInstrInfo() {
+ MCInstrInfo *X = new MCInstrInfo();
+ InitNVPTXMCInstrInfo(X);
+ return X;
+}
+
+static MCRegisterInfo *createNVPTXMCRegisterInfo(StringRef TT) {
+ MCRegisterInfo *X = new MCRegisterInfo();
+ // PTX does not have a return address register.
+ InitNVPTXMCRegisterInfo(X, 0);
+ return X;
+}
+
+static MCSubtargetInfo *
+createNVPTXMCSubtargetInfo(StringRef TT, StringRef CPU, StringRef FS) {
+ MCSubtargetInfo *X = new MCSubtargetInfo();
+ InitNVPTXMCSubtargetInfo(X, TT, CPU, FS);
+ return X;
+}
+
+static MCCodeGenInfo *createNVPTXMCCodeGenInfo(
+ StringRef TT, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL) {
+ MCCodeGenInfo *X = new MCCodeGenInfo();
+ X->InitMCCodeGenInfo(RM, CM, OL);
+ return X;
+}
+
+static MCInstPrinter *createNVPTXMCInstPrinter(const Target &T,
+ unsigned SyntaxVariant,
+ const MCAsmInfo &MAI,
+ const MCInstrInfo &MII,
+ const MCRegisterInfo &MRI,
+ const MCSubtargetInfo &STI) {
+ if (SyntaxVariant == 0)
+ return new NVPTXInstPrinter(MAI, MII, MRI, STI);
+ return nullptr;
+}
+
+// Force static initialization.
+extern "C" void LLVMInitializeNVPTXTargetMC() {
+ // Register the MC asm info.
+ RegisterMCAsmInfo<NVPTXMCAsmInfo> X(TheNVPTXTarget32);
+ RegisterMCAsmInfo<NVPTXMCAsmInfo> Y(TheNVPTXTarget64);
+
+ // Register the MC codegen info.
+ TargetRegistry::RegisterMCCodeGenInfo(TheNVPTXTarget32,
+ createNVPTXMCCodeGenInfo);
+ TargetRegistry::RegisterMCCodeGenInfo(TheNVPTXTarget64,
+ createNVPTXMCCodeGenInfo);
+
+ // Register the MC instruction info.
+ TargetRegistry::RegisterMCInstrInfo(TheNVPTXTarget32, createNVPTXMCInstrInfo);
+ TargetRegistry::RegisterMCInstrInfo(TheNVPTXTarget64, createNVPTXMCInstrInfo);
+
+ // Register the MC register info.
+ TargetRegistry::RegisterMCRegInfo(TheNVPTXTarget32,
+ createNVPTXMCRegisterInfo);
+ TargetRegistry::RegisterMCRegInfo(TheNVPTXTarget64,
+ createNVPTXMCRegisterInfo);
+
+ // Register the MC subtarget info.
+ TargetRegistry::RegisterMCSubtargetInfo(TheNVPTXTarget32,
+ createNVPTXMCSubtargetInfo);
+ TargetRegistry::RegisterMCSubtargetInfo(TheNVPTXTarget64,
+ createNVPTXMCSubtargetInfo);
+
+ // Register the MCInstPrinter.
+ TargetRegistry::RegisterMCInstPrinter(TheNVPTXTarget32,
+ createNVPTXMCInstPrinter);
+ TargetRegistry::RegisterMCInstPrinter(TheNVPTXTarget64,
+ createNVPTXMCInstPrinter);
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.h b/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.h
new file mode 100644
index 000000000000..af95c76f92b2
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.h
@@ -0,0 +1,36 @@
+//===-- NVPTXMCTargetDesc.h - NVPTX Target Descriptions ---------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides NVPTX specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef NVPTXMCTARGETDESC_H
+#define NVPTXMCTARGETDESC_H
+
+namespace llvm {
+class Target;
+
+extern Target TheNVPTXTarget32;
+extern Target TheNVPTXTarget64;
+
+} // End llvm namespace
+
+// Defines symbolic names for PTX registers.
+#define GET_REGINFO_ENUM
+#include "NVPTXGenRegisterInfo.inc"
+
+// Defines symbolic names for the PTX instructions.
+#define GET_INSTRINFO_ENUM
+#include "NVPTXGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_ENUM
+#include "NVPTXGenSubtargetInfo.inc"
+
+#endif
diff --git a/contrib/llvm/lib/Target/NVPTX/ManagedStringPool.h b/contrib/llvm/lib/Target/NVPTX/ManagedStringPool.h
new file mode 100644
index 000000000000..f9fb05922920
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/ManagedStringPool.h
@@ -0,0 +1,48 @@
+//===-- ManagedStringPool.h - Managed String Pool ---------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// The strings allocated from a managed string pool are owned by the string
+// pool and will be deleted together with the managed string pool.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_MANAGED_STRING_H
+#define LLVM_SUPPORT_MANAGED_STRING_H
+
+#include "llvm/ADT/SmallVector.h"
+#include <string>
+
+namespace llvm {
+
+/// ManagedStringPool - The strings allocated from a managed string pool are
+/// owned by the string pool and will be deleted together with the managed
+/// string pool.
+class ManagedStringPool {
+ SmallVector<std::string *, 8> Pool;
+
+public:
+ ManagedStringPool() {}
+ ~ManagedStringPool() {
+ SmallVectorImpl<std::string *>::iterator Current = Pool.begin();
+ while (Current != Pool.end()) {
+ delete *Current;
+ Current++;
+ }
+ }
+
+ std::string *getManagedString(const char *S) {
+ std::string *Str = new std::string(S);
+ Pool.push_back(Str);
+ return Str;
+ }
+};
+
+}
+
+#endif
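
Editor's note: a brief usage sketch for the pool above. Pointers returned by getManagedString stay valid until the pool itself is destroyed, which is how the asm printer keeps image-handle symbol names alive through NVPTXTargetMachine's managed string pool. The demo function below is illustrative only.

#include "ManagedStringPool.h"
#include <cstdio>

void demo() {
  llvm::ManagedStringPool Pool;
  // The pool owns the allocation; the caller just holds the raw pointer.
  std::string *Name = Pool.getManagedString("__local_depot0");
  std::printf("%s\n", Name->c_str());
} // ~ManagedStringPool deletes every pooled string here.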
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTX.h b/contrib/llvm/lib/Target/NVPTX/NVPTX.h
new file mode 100644
index 000000000000..e74c808f8554
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTX.h
@@ -0,0 +1,194 @@
+//===-- NVPTX.h - Top-level interface for NVPTX representation --*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in
+// the LLVM NVPTX back-end.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_NVPTX_H
+#define LLVM_TARGET_NVPTX_H
+
+#include "MCTargetDesc/NVPTXBaseInfo.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetMachine.h"
+#include <cassert>
+#include <iosfwd>
+
+namespace llvm {
+class NVPTXTargetMachine;
+class FunctionPass;
+class MachineFunctionPass;
+class formatted_raw_ostream;
+
+namespace NVPTXCC {
+enum CondCodes {
+ EQ,
+ NE,
+ LT,
+ LE,
+ GT,
+ GE
+};
+}
+
+inline static const char *NVPTXCondCodeToString(NVPTXCC::CondCodes CC) {
+ switch (CC) {
+ case NVPTXCC::NE:
+ return "ne";
+ case NVPTXCC::EQ:
+ return "eq";
+ case NVPTXCC::LT:
+ return "lt";
+ case NVPTXCC::LE:
+ return "le";
+ case NVPTXCC::GT:
+ return "gt";
+ case NVPTXCC::GE:
+ return "ge";
+ }
+ llvm_unreachable("Unknown condition code");
+}
+
+FunctionPass *
+createNVPTXISelDag(NVPTXTargetMachine &TM, llvm::CodeGenOpt::Level OptLevel);
+ModulePass *createNVPTXAssignValidGlobalNamesPass();
+ModulePass *createGenericToNVVMPass();
+FunctionPass *createNVPTXFavorNonGenericAddrSpacesPass();
+ModulePass *createNVVMReflectPass();
+ModulePass *createNVVMReflectPass(const StringMap<int>& Mapping);
+MachineFunctionPass *createNVPTXPrologEpilogPass();
+MachineFunctionPass *createNVPTXReplaceImageHandlesPass();
+FunctionPass *createNVPTXImageOptimizerPass();
+
+bool isImageOrSamplerVal(const Value *, const Module *);
+
+extern Target TheNVPTXTarget32;
+extern Target TheNVPTXTarget64;
+
+namespace NVPTX {
+enum DrvInterface {
+ NVCL,
+ CUDA
+};
+
+// A field inside TSFlags needs a shift and a mask. The usage is
+// always as follows :
+// ((TSFlags & fieldMask) >> fieldShift)
+// The enum keeps the mask, the shift, and all valid values of the
+// field in one place.
+enum VecInstType {
+ VecInstTypeShift = 0,
+ VecInstTypeMask = 0xF,
+
+ VecNOP = 0,
+ VecLoad = 1,
+ VecStore = 2,
+ VecBuild = 3,
+ VecShuffle = 4,
+ VecExtract = 5,
+ VecInsert = 6,
+ VecDest = 7,
+ VecOther = 15
+};
+
+enum SimpleMove {
+ SimpleMoveMask = 0x10,
+ SimpleMoveShift = 4
+};
+enum LoadStore {
+ isLoadMask = 0x20,
+ isLoadShift = 5,
+ isStoreMask = 0x40,
+ isStoreShift = 6
+};
+
+namespace PTXLdStInstCode {
+enum AddressSpace {
+ GENERIC = 0,
+ GLOBAL = 1,
+ CONSTANT = 2,
+ SHARED = 3,
+ PARAM = 4,
+ LOCAL = 5
+};
+enum FromType {
+ Unsigned = 0,
+ Signed,
+ Float
+};
+enum VecType {
+ Scalar = 1,
+ V2 = 2,
+ V4 = 4
+};
+}
+
+/// PTXCvtMode - Conversion code enumeration
+namespace PTXCvtMode {
+enum CvtMode {
+ NONE = 0,
+ RNI,
+ RZI,
+ RMI,
+ RPI,
+ RN,
+ RZ,
+ RM,
+ RP,
+
+ BASE_MASK = 0x0F,
+ FTZ_FLAG = 0x10,
+ SAT_FLAG = 0x20
+};
+}
+
+/// PTXCmpMode - Comparison mode enumeration
+namespace PTXCmpMode {
+enum CmpMode {
+ EQ = 0,
+ NE,
+ LT,
+ LE,
+ GT,
+ GE,
+ LO,
+ LS,
+ HI,
+ HS,
+ EQU,
+ NEU,
+ LTU,
+ LEU,
+ GTU,
+ GEU,
+ NUM,
+ // NAN is a MACRO
+ NotANumber,
+
+ BASE_MASK = 0xFF,
+ FTZ_FLAG = 0x100
+};
+}
+}
+} // end namespace llvm;
+
+// Defines symbolic names for NVPTX registers. This defines a mapping from
+// register name to register number.
+#define GET_REGINFO_ENUM
+#include "NVPTXGenRegisterInfo.inc"
+
+// Defines symbolic names for the NVPTX instructions.
+#define GET_INSTRINFO_ENUM
+#include "NVPTXGenInstrInfo.inc"
+
+#endif
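
Editor's note: the comment above VecInstType spells out the TSFlags convention ((TSFlags & fieldMask) >> fieldShift). A compact sketch of that idiom using the SimpleMove and LoadStore fields from this header, assuming NVPTX.h and the generated .inc files are available; the helper names are illustrative.

#include <cstdint>
#include "NVPTX.h"

static bool isSimpleMove(uint64_t TSFlags) {
  return (TSFlags & llvm::NVPTX::SimpleMoveMask) >> llvm::NVPTX::SimpleMoveShift;
}

static bool isLoadInstr(uint64_t TSFlags) {
  return (TSFlags & llvm::NVPTX::isLoadMask) >> llvm::NVPTX::isLoadShift;
}

// Each predicate isolates its one-bit field: mask first, then shift it down.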
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTX.td b/contrib/llvm/lib/Target/NVPTX/NVPTX.td
new file mode 100644
index 000000000000..93fabf615369
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTX.td
@@ -0,0 +1,69 @@
+//===- NVPTX.td - Describe the NVPTX Target Machine -----------*- tblgen -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This is the top level entry point for the NVPTX target.
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Target-independent interfaces
+//===----------------------------------------------------------------------===//
+
+include "llvm/Target/Target.td"
+
+include "NVPTXRegisterInfo.td"
+include "NVPTXInstrInfo.td"
+
+//===----------------------------------------------------------------------===//
+// Subtarget Features.
+// - We use the SM version number instead of explicit feature table.
+// - Need at least one feature to avoid generating zero sized array by
+// TableGen in NVPTXGenSubtarget.inc.
+//===----------------------------------------------------------------------===//
+
+// SM Versions
+def SM20 : SubtargetFeature<"sm_20", "SmVersion", "20",
+ "Target SM 2.0">;
+def SM21 : SubtargetFeature<"sm_21", "SmVersion", "21",
+ "Target SM 2.1">;
+def SM30 : SubtargetFeature<"sm_30", "SmVersion", "30",
+ "Target SM 3.0">;
+def SM35 : SubtargetFeature<"sm_35", "SmVersion", "35",
+ "Target SM 3.5">;
+def SM50 : SubtargetFeature<"sm_50", "SmVersion", "50",
+ "Target SM 5.0">;
+
+// PTX Versions
+def PTX30 : SubtargetFeature<"ptx30", "PTXVersion", "30",
+ "Use PTX version 3.0">;
+def PTX31 : SubtargetFeature<"ptx31", "PTXVersion", "31",
+ "Use PTX version 3.1">;
+def PTX32 : SubtargetFeature<"ptx32", "PTXVersion", "32",
+ "Use PTX version 3.2">;
+def PTX40 : SubtargetFeature<"ptx40", "PTXVersion", "40",
+ "Use PTX version 4.0">;
+
+//===----------------------------------------------------------------------===//
+// NVPTX supported processors.
+//===----------------------------------------------------------------------===//
+
+class Proc<string Name, list<SubtargetFeature> Features>
+ : Processor<Name, NoItineraries, Features>;
+
+def : Proc<"sm_20", [SM20]>;
+def : Proc<"sm_21", [SM21]>;
+def : Proc<"sm_30", [SM30]>;
+def : Proc<"sm_35", [SM35]>;
+def : Proc<"sm_50", [SM50]>;
+
+
+def NVPTXInstrInfo : InstrInfo {
+}
+
+def NVPTX : Target {
+ let InstructionSet = NVPTXInstrInfo;
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXAllocaHoisting.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXAllocaHoisting.cpp
new file mode 100644
index 000000000000..1f3769601788
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXAllocaHoisting.cpp
@@ -0,0 +1,46 @@
+//===-- AllocaHoisting.cpp - Hoist allocas to the entry block --*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Hoist the alloca instructions in the non-entry blocks to the entry blocks.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTXAllocaHoisting.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+
+namespace llvm {
+
+bool NVPTXAllocaHoisting::runOnFunction(Function &function) {
+ bool functionModified = false;
+ Function::iterator I = function.begin();
+ TerminatorInst *firstTerminatorInst = (I++)->getTerminator();
+
+ for (Function::iterator E = function.end(); I != E; ++I) {
+ for (BasicBlock::iterator BI = I->begin(), BE = I->end(); BI != BE;) {
+ AllocaInst *allocaInst = dyn_cast<AllocaInst>(BI++);
+ if (allocaInst && isa<ConstantInt>(allocaInst->getArraySize())) {
+ allocaInst->moveBefore(firstTerminatorInst);
+ functionModified = true;
+ }
+ }
+ }
+
+ return functionModified;
+}
+
+char NVPTXAllocaHoisting::ID = 1;
+static RegisterPass<NVPTXAllocaHoisting>
+X("alloca-hoisting", "Hoisting alloca instructions in non-entry "
+ "blocks to the entry block");
+
+FunctionPass *createAllocaHoisting() { return new NVPTXAllocaHoisting(); }
+
+} // end namespace llvm
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXAllocaHoisting.h b/contrib/llvm/lib/Target/NVPTX/NVPTXAllocaHoisting.h
new file mode 100644
index 000000000000..5b610687e391
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXAllocaHoisting.h
@@ -0,0 +1,50 @@
+//===-- AllocaHoisting.h - Hoist allocas to the entry block -----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Hoist the alloca instructions in the non-entry blocks to the entry blocks.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef NVPTX_ALLOCA_HOISTING_H_
+#define NVPTX_ALLOCA_HOISTING_H_
+
+#include "llvm/CodeGen/MachineFunctionAnalysis.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Pass.h"
+
+namespace llvm {
+
+class FunctionPass;
+class Function;
+
+// Hoisting the alloca instructions in the non-entry blocks to the entry
+// block.
+class NVPTXAllocaHoisting : public FunctionPass {
+public:
+ static char ID; // Pass ID
+ NVPTXAllocaHoisting() : FunctionPass(ID) {}
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<DataLayoutPass>();
+ AU.addPreserved("stack-protector");
+ AU.addPreserved<MachineFunctionAnalysis>();
+ }
+
+ const char *getPassName() const override {
+ return "NVPTX specific alloca hoisting";
+ }
+
+ bool runOnFunction(Function &function) override;
+};
+
+extern FunctionPass *createAllocaHoisting();
+
+} // end namespace llvm
+
+#endif // NVPTX_ALLOCA_HOISTING_H_
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
new file mode 100644
index 000000000000..187b88c1d54a
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -0,0 +1,2292 @@
+//===-- NVPTXAsmPrinter.cpp - NVPTX LLVM assembly writer ------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to NVPTX assembly language.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTXAsmPrinter.h"
+#include "InstPrinter/NVPTXInstPrinter.h"
+#include "MCTargetDesc/NVPTXMCAsmInfo.h"
+#include "NVPTX.h"
+#include "NVPTXInstrInfo.h"
+#include "NVPTXMachineFunctionInfo.h"
+#include "NVPTXMCExpr.h"
+#include "NVPTXRegisterInfo.h"
+#include "NVPTXTargetMachine.h"
+#include "NVPTXUtilities.h"
+#include "cl_common_defines.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/CodeGen/Analysis.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/TimeValue.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include <sstream>
+using namespace llvm;
+
+#define DEPOTNAME "__local_depot"
+
+static cl::opt<bool>
+EmitLineNumbers("nvptx-emit-line-numbers", cl::Hidden,
+ cl::desc("NVPTX Specific: Emit Line numbers even without -G"),
+ cl::init(true));
+
+static cl::opt<bool>
+InterleaveSrc("nvptx-emit-src", cl::ZeroOrMore, cl::Hidden,
+ cl::desc("NVPTX Specific: Emit source line in ptx file"),
+ cl::init(false));
+
+namespace {
+/// DiscoverDependentGlobals - Return a set of GlobalVariables on which \p V
+/// depends.
+void DiscoverDependentGlobals(const Value *V,
+ DenseSet<const GlobalVariable *> &Globals) {
+ if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(V))
+ Globals.insert(GV);
+ else {
+ if (const User *U = dyn_cast<User>(V)) {
+ for (unsigned i = 0, e = U->getNumOperands(); i != e; ++i) {
+ DiscoverDependentGlobals(U->getOperand(i), Globals);
+ }
+ }
+ }
+}
+
+/// VisitGlobalVariableForEmission - Add \p GV to the list of GlobalVariable
+/// instances to be emitted, but only after any dependents have been added
+/// first.
+void VisitGlobalVariableForEmission(
+ const GlobalVariable *GV, SmallVectorImpl<const GlobalVariable *> &Order,
+ DenseSet<const GlobalVariable *> &Visited,
+ DenseSet<const GlobalVariable *> &Visiting) {
+ // Have we already visited this one?
+ if (Visited.count(GV))
+ return;
+
+ // Do we have a circular dependency?
+ if (Visiting.count(GV))
+ report_fatal_error("Circular dependency found in global variable set");
+
+ // Start visiting this global
+ Visiting.insert(GV);
+
+ // Make sure we visit all dependents first
+ DenseSet<const GlobalVariable *> Others;
+ for (unsigned i = 0, e = GV->getNumOperands(); i != e; ++i)
+ DiscoverDependentGlobals(GV->getOperand(i), Others);
+
+ for (DenseSet<const GlobalVariable *>::iterator I = Others.begin(),
+ E = Others.end();
+ I != E; ++I)
+ VisitGlobalVariableForEmission(*I, Order, Visited, Visiting);
+
+ // Now we can visit ourself
+ Order.push_back(GV);
+ Visited.insert(GV);
+ Visiting.erase(GV);
+}
+}
+
+// @TODO: This is a copy from AsmPrinter.cpp. The function is static, so we
+// cannot just link to the existing version.
+/// LowerConstant - Lower the specified LLVM Constant to an MCExpr.
+///
+using namespace nvptx;
+const MCExpr *nvptx::LowerConstant(const Constant *CV, AsmPrinter &AP) {
+ MCContext &Ctx = AP.OutContext;
+
+ if (CV->isNullValue() || isa<UndefValue>(CV))
+ return MCConstantExpr::Create(0, Ctx);
+
+ if (const ConstantInt *CI = dyn_cast<ConstantInt>(CV))
+ return MCConstantExpr::Create(CI->getZExtValue(), Ctx);
+
+ if (const GlobalValue *GV = dyn_cast<GlobalValue>(CV))
+ return MCSymbolRefExpr::Create(AP.getSymbol(GV), Ctx);
+
+ if (const BlockAddress *BA = dyn_cast<BlockAddress>(CV))
+ return MCSymbolRefExpr::Create(AP.GetBlockAddressSymbol(BA), Ctx);
+
+ const ConstantExpr *CE = dyn_cast<ConstantExpr>(CV);
+ if (!CE)
+ llvm_unreachable("Unknown constant value to lower!");
+
+ switch (CE->getOpcode()) {
+ default:
+ // If the code isn't optimized, there may be outstanding folding
+ // opportunities. Attempt to fold the expression using DataLayout as a
+ // last resort before giving up.
+ if (Constant *C = ConstantFoldConstantExpression(CE, AP.TM.getDataLayout()))
+ if (C != CE)
+ return LowerConstant(C, AP);
+
+ // Otherwise report the problem to the user.
+ {
+ std::string S;
+ raw_string_ostream OS(S);
+ OS << "Unsupported expression in static initializer: ";
+ CE->printAsOperand(OS, /*PrintType=*/ false,
+ !AP.MF ? nullptr : AP.MF->getFunction()->getParent());
+ report_fatal_error(OS.str());
+ }
+ case Instruction::AddrSpaceCast: {
+ // Strip any addrspace(1)->addrspace(0) addrspace casts. These will be
+ // handled by the generic() logic in the MCExpr printer
+ PointerType *DstTy = cast<PointerType>(CE->getType());
+ PointerType *SrcTy = cast<PointerType>(CE->getOperand(0)->getType());
+ if (SrcTy->getAddressSpace() == 1 && DstTy->getAddressSpace() == 0) {
+ return LowerConstant(cast<const Constant>(CE->getOperand(0)), AP);
+ }
+ std::string S;
+ raw_string_ostream OS(S);
+ OS << "Unsupported expression in static initializer: ";
+ CE->printAsOperand(OS, /*PrintType=*/ false,
+ !AP.MF ? nullptr : AP.MF->getFunction()->getParent());
+ report_fatal_error(OS.str());
+ }
+ case Instruction::GetElementPtr: {
+ const DataLayout &TD = *AP.TM.getDataLayout();
+ // Generate a symbolic expression for the byte address
+ APInt OffsetAI(TD.getPointerSizeInBits(), 0);
+ cast<GEPOperator>(CE)->accumulateConstantOffset(TD, OffsetAI);
+
+ const MCExpr *Base = LowerConstant(CE->getOperand(0), AP);
+ if (!OffsetAI)
+ return Base;
+
+ int64_t Offset = OffsetAI.getSExtValue();
+ return MCBinaryExpr::CreateAdd(Base, MCConstantExpr::Create(Offset, Ctx),
+ Ctx);
+ }
+
+ case Instruction::Trunc:
+ // We emit the value and depend on the assembler to truncate the generated
+ // expression properly. This is important for differences between
+ // blockaddress labels. Since the two labels are in the same function, it
+ // is reasonable to treat their delta as a 32-bit value.
+ // FALL THROUGH.
+ case Instruction::BitCast:
+ return LowerConstant(CE->getOperand(0), AP);
+
+ case Instruction::IntToPtr: {
+ const DataLayout &TD = *AP.TM.getDataLayout();
+ // Handle casts to pointers by changing them into casts to the appropriate
+ // integer type. This promotes constant folding and simplifies this code.
+ Constant *Op = CE->getOperand(0);
+ Op = ConstantExpr::getIntegerCast(Op, TD.getIntPtrType(CV->getContext()),
+ false /*ZExt*/);
+ return LowerConstant(Op, AP);
+ }
+
+ case Instruction::PtrToInt: {
+ const DataLayout &TD = *AP.TM.getDataLayout();
+ // Support only foldable casts to/from pointers that can be eliminated by
+ // changing the pointer to the appropriately sized integer type.
+ Constant *Op = CE->getOperand(0);
+ Type *Ty = CE->getType();
+
+ const MCExpr *OpExpr = LowerConstant(Op, AP);
+
+ // We can emit the pointer value into this slot if the slot is an
+ // integer slot equal to the size of the pointer.
+ if (TD.getTypeAllocSize(Ty) == TD.getTypeAllocSize(Op->getType()))
+ return OpExpr;
+
+ // Otherwise the pointer is smaller than the resultant integer, mask off
+ // the high bits so we are sure to get a proper truncation if the input is
+ // a constant expr.
+ unsigned InBits = TD.getTypeAllocSizeInBits(Op->getType());
+ const MCExpr *MaskExpr =
+ MCConstantExpr::Create(~0ULL >> (64 - InBits), Ctx);
+ return MCBinaryExpr::CreateAnd(OpExpr, MaskExpr, Ctx);
+ }
+
+ // The MC library also has a right-shift operator, but it isn't consistently
+ // signed or unsigned between different targets.
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::Mul:
+ case Instruction::SDiv:
+ case Instruction::SRem:
+ case Instruction::Shl:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor: {
+ const MCExpr *LHS = LowerConstant(CE->getOperand(0), AP);
+ const MCExpr *RHS = LowerConstant(CE->getOperand(1), AP);
+ switch (CE->getOpcode()) {
+ default:
+ llvm_unreachable("Unknown binary operator constant cast expr");
+ case Instruction::Add:
+ return MCBinaryExpr::CreateAdd(LHS, RHS, Ctx);
+ case Instruction::Sub:
+ return MCBinaryExpr::CreateSub(LHS, RHS, Ctx);
+ case Instruction::Mul:
+ return MCBinaryExpr::CreateMul(LHS, RHS, Ctx);
+ case Instruction::SDiv:
+ return MCBinaryExpr::CreateDiv(LHS, RHS, Ctx);
+ case Instruction::SRem:
+ return MCBinaryExpr::CreateMod(LHS, RHS, Ctx);
+ case Instruction::Shl:
+ return MCBinaryExpr::CreateShl(LHS, RHS, Ctx);
+ case Instruction::And:
+ return MCBinaryExpr::CreateAnd(LHS, RHS, Ctx);
+ case Instruction::Or:
+ return MCBinaryExpr::CreateOr(LHS, RHS, Ctx);
+ case Instruction::Xor:
+ return MCBinaryExpr::CreateXor(LHS, RHS, Ctx);
+ }
+ }
+ }
+}
+
+void NVPTXAsmPrinter::emitLineNumberAsDotLoc(const MachineInstr &MI) {
+ if (!EmitLineNumbers)
+ return;
+ if (ignoreLoc(MI))
+ return;
+
+ DebugLoc curLoc = MI.getDebugLoc();
+
+ if (prevDebugLoc.isUnknown() && curLoc.isUnknown())
+ return;
+
+ if (prevDebugLoc == curLoc)
+ return;
+
+ prevDebugLoc = curLoc;
+
+ if (curLoc.isUnknown())
+ return;
+
+ const MachineFunction *MF = MI.getParent()->getParent();
+ //const TargetMachine &TM = MF->getTarget();
+
+ const LLVMContext &ctx = MF->getFunction()->getContext();
+ DIScope Scope(curLoc.getScope(ctx));
+
+ assert((!Scope || Scope.isScope()) &&
+ "Scope of a DebugLoc should be null or a DIScope.");
+ if (!Scope)
+ return;
+
+ StringRef fileName(Scope.getFilename());
+ StringRef dirName(Scope.getDirectory());
+ SmallString<128> FullPathName = dirName;
+ if (!dirName.empty() && !sys::path::is_absolute(fileName)) {
+ sys::path::append(FullPathName, fileName);
+ fileName = FullPathName.str();
+ }
+
+ if (filenameMap.find(fileName.str()) == filenameMap.end())
+ return;
+
+ // Emit the line from the source file.
+ if (InterleaveSrc)
+ this->emitSrcInText(fileName.str(), curLoc.getLine());
+
+ std::stringstream temp;
+ temp << "\t.loc " << filenameMap[fileName.str()] << " " << curLoc.getLine()
+ << " " << curLoc.getCol();
+ OutStreamer.EmitRawText(Twine(temp.str().c_str()));
+}
+
+void NVPTXAsmPrinter::EmitInstruction(const MachineInstr *MI) {
+ SmallString<128> Str;
+ raw_svector_ostream OS(Str);
+ if (nvptxSubtarget.getDrvInterface() == NVPTX::CUDA)
+ emitLineNumberAsDotLoc(*MI);
+
+ MCInst Inst;
+ lowerToMCInst(MI, Inst);
+ EmitToStreamer(OutStreamer, Inst);
+}
+
+// Handle symbol backtracking for targets that do not support image handles
+bool NVPTXAsmPrinter::lowerImageHandleOperand(const MachineInstr *MI,
+ unsigned OpNo, MCOperand &MCOp) {
+ const MachineOperand &MO = MI->getOperand(OpNo);
+ const MCInstrDesc &MCID = MI->getDesc();
+
+ if (MCID.TSFlags & NVPTXII::IsTexFlag) {
+ // This is a texture fetch, so operand 4 is a texref and operand 5 is
+ // a samplerref
+ if (OpNo == 4 && MO.isImm()) {
+ lowerImageHandleSymbol(MO.getImm(), MCOp);
+ return true;
+ }
+ if (OpNo == 5 && MO.isImm() && !(MCID.TSFlags & NVPTXII::IsTexModeUnifiedFlag)) {
+ lowerImageHandleSymbol(MO.getImm(), MCOp);
+ return true;
+ }
+
+ return false;
+ } else if (MCID.TSFlags & NVPTXII::IsSuldMask) {
+ unsigned VecSize =
+ 1 << (((MCID.TSFlags & NVPTXII::IsSuldMask) >> NVPTXII::IsSuldShift) - 1);
+
+ // For a surface load of vector size N, the Nth operand will be the surfref
+ if (OpNo == VecSize && MO.isImm()) {
+ lowerImageHandleSymbol(MO.getImm(), MCOp);
+ return true;
+ }
+
+ return false;
+ } else if (MCID.TSFlags & NVPTXII::IsSustFlag) {
+ // This is a surface store, so operand 0 is a surfref
+ if (OpNo == 0 && MO.isImm()) {
+ lowerImageHandleSymbol(MO.getImm(), MCOp);
+ return true;
+ }
+
+ return false;
+ } else if (MCID.TSFlags & NVPTXII::IsSurfTexQueryFlag) {
+ // This is a query, so operand 1 is a surfref/texref
+ if (OpNo == 1 && MO.isImm()) {
+ lowerImageHandleSymbol(MO.getImm(), MCOp);
+ return true;
+ }
+
+ return false;
+ }
+
+ return false;
+}
+
+void NVPTXAsmPrinter::lowerImageHandleSymbol(unsigned Index, MCOperand &MCOp) {
+ // Ewwww
+ TargetMachine &TM = const_cast<TargetMachine&>(MF->getTarget());
+ NVPTXTargetMachine &nvTM = static_cast<NVPTXTargetMachine&>(TM);
+ const NVPTXMachineFunctionInfo *MFI = MF->getInfo<NVPTXMachineFunctionInfo>();
+ const char *Sym = MFI->getImageHandleSymbol(Index);
+ std::string *SymNamePtr =
+ nvTM.getManagedStrPool()->getManagedString(Sym);
+ MCOp = GetSymbolRef(OutContext.GetOrCreateSymbol(
+ StringRef(SymNamePtr->c_str())));
+}
+
+void NVPTXAsmPrinter::lowerToMCInst(const MachineInstr *MI, MCInst &OutMI) {
+ OutMI.setOpcode(MI->getOpcode());
+ const NVPTXSubtarget &ST = TM.getSubtarget<NVPTXSubtarget>();
+
+ // Special: Do not mangle symbol operand of CALL_PROTOTYPE
+ if (MI->getOpcode() == NVPTX::CALL_PROTOTYPE) {
+ const MachineOperand &MO = MI->getOperand(0);
+ OutMI.addOperand(GetSymbolRef(
+ OutContext.GetOrCreateSymbol(Twine(MO.getSymbolName()))));
+ return;
+ }
+
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = MI->getOperand(i);
+
+ MCOperand MCOp;
+ if (!ST.hasImageHandles()) {
+ if (lowerImageHandleOperand(MI, i, MCOp)) {
+ OutMI.addOperand(MCOp);
+ continue;
+ }
+ }
+
+ if (lowerOperand(MO, MCOp))
+ OutMI.addOperand(MCOp);
+ }
+}
+
+bool NVPTXAsmPrinter::lowerOperand(const MachineOperand &MO,
+ MCOperand &MCOp) {
+ switch (MO.getType()) {
+ default: llvm_unreachable("unknown operand type");
+ case MachineOperand::MO_Register:
+ MCOp = MCOperand::CreateReg(encodeVirtualRegister(MO.getReg()));
+ break;
+ case MachineOperand::MO_Immediate:
+ MCOp = MCOperand::CreateImm(MO.getImm());
+ break;
+ case MachineOperand::MO_MachineBasicBlock:
+ MCOp = MCOperand::CreateExpr(MCSymbolRefExpr::Create(
+ MO.getMBB()->getSymbol(), OutContext));
+ break;
+ case MachineOperand::MO_ExternalSymbol:
+ MCOp = GetSymbolRef(GetExternalSymbolSymbol(MO.getSymbolName()));
+ break;
+ case MachineOperand::MO_GlobalAddress:
+ MCOp = GetSymbolRef(getSymbol(MO.getGlobal()));
+ break;
+ case MachineOperand::MO_FPImmediate: {
+ const ConstantFP *Cnt = MO.getFPImm();
+ APFloat Val = Cnt->getValueAPF();
+
+ switch (Cnt->getType()->getTypeID()) {
+ default: report_fatal_error("Unsupported FP type"); break;
+ case Type::FloatTyID:
+ MCOp = MCOperand::CreateExpr(
+ NVPTXFloatMCExpr::CreateConstantFPSingle(Val, OutContext));
+ break;
+ case Type::DoubleTyID:
+ MCOp = MCOperand::CreateExpr(
+ NVPTXFloatMCExpr::CreateConstantFPDouble(Val, OutContext));
+ break;
+ }
+ break;
+ }
+ }
+ return true;
+}
+
+unsigned NVPTXAsmPrinter::encodeVirtualRegister(unsigned Reg) {
+ if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ const TargetRegisterClass *RC = MRI->getRegClass(Reg);
+
+ DenseMap<unsigned, unsigned> &RegMap = VRegMapping[RC];
+ unsigned RegNum = RegMap[Reg];
+
+ // Encode the register class in the upper 4 bits
+ // Must be kept in sync with NVPTXInstPrinter::printRegName
+ unsigned Ret = 0;
+ if (RC == &NVPTX::Int1RegsRegClass) {
+ Ret = (1 << 28);
+ } else if (RC == &NVPTX::Int16RegsRegClass) {
+ Ret = (2 << 28);
+ } else if (RC == &NVPTX::Int32RegsRegClass) {
+ Ret = (3 << 28);
+ } else if (RC == &NVPTX::Int64RegsRegClass) {
+ Ret = (4 << 28);
+ } else if (RC == &NVPTX::Float32RegsRegClass) {
+ Ret = (5 << 28);
+ } else if (RC == &NVPTX::Float64RegsRegClass) {
+ Ret = (6 << 28);
+ } else {
+ report_fatal_error("Bad register class");
+ }
+
+ // Insert the vreg number
+ Ret |= (RegNum & 0x0FFFFFFF);
+ return Ret;
+ } else {
+ // Some special-use registers are actually physical registers.
+ // Encode this as the register class ID of 0 and the real register ID.
+ return Reg & 0x0FFFFFFF;
+ }
+}
+
+MCOperand NVPTXAsmPrinter::GetSymbolRef(const MCSymbol *Symbol) {
+ const MCExpr *Expr;
+ Expr = MCSymbolRefExpr::Create(Symbol, MCSymbolRefExpr::VK_None,
+ OutContext);
+ return MCOperand::CreateExpr(Expr);
+}
+
+void NVPTXAsmPrinter::printReturnValStr(const Function *F, raw_ostream &O) {
+ const DataLayout *TD = TM.getDataLayout();
+ const TargetLowering *TLI = TM.getTargetLowering();
+
+ Type *Ty = F->getReturnType();
+
+ bool isABI = (nvptxSubtarget.getSmVersion() >= 20);
+
+ if (Ty->getTypeID() == Type::VoidTyID)
+ return;
+
+ O << " (";
+
+ if (isABI) {
+ if (Ty->isFloatingPointTy() || Ty->isIntegerTy()) {
+ unsigned size = 0;
+ if (const IntegerType *ITy = dyn_cast<IntegerType>(Ty)) {
+ size = ITy->getBitWidth();
+ if (size < 32)
+ size = 32;
+ } else {
+ assert(Ty->isFloatingPointTy() && "Floating point type expected here");
+ size = Ty->getPrimitiveSizeInBits();
+ }
+
+ O << ".param .b" << size << " func_retval0";
+ } else if (isa<PointerType>(Ty)) {
+ O << ".param .b" << TLI->getPointerTy().getSizeInBits()
+ << " func_retval0";
+ } else {
+ if ((Ty->getTypeID() == Type::StructTyID) || isa<VectorType>(Ty)) {
+ unsigned totalsz = TD->getTypeAllocSize(Ty);
+ unsigned retAlignment = 0;
+ if (!llvm::getAlign(*F, 0, retAlignment))
+ retAlignment = TD->getABITypeAlignment(Ty);
+ O << ".param .align " << retAlignment << " .b8 func_retval0[" << totalsz
+ << "]";
+ } else
+ assert(false && "Unknown return type");
+ }
+ } else {
+ SmallVector<EVT, 16> vtparts;
+ ComputeValueVTs(*TLI, Ty, vtparts);
+ unsigned idx = 0;
+ for (unsigned i = 0, e = vtparts.size(); i != e; ++i) {
+ unsigned elems = 1;
+ EVT elemtype = vtparts[i];
+ if (vtparts[i].isVector()) {
+ elems = vtparts[i].getVectorNumElements();
+ elemtype = vtparts[i].getVectorElementType();
+ }
+
+ for (unsigned j = 0, je = elems; j != je; ++j) {
+ unsigned sz = elemtype.getSizeInBits();
+ if (elemtype.isInteger() && (sz < 32))
+ sz = 32;
+ O << ".reg .b" << sz << " func_retval" << idx;
+ if (j < je - 1)
+ O << ", ";
+ ++idx;
+ }
+ if (i < e - 1)
+ O << ", ";
+ }
+ }
+ O << ") ";
+ return;
+}
+
+void NVPTXAsmPrinter::printReturnValStr(const MachineFunction &MF,
+ raw_ostream &O) {
+ const Function *F = MF.getFunction();
+ printReturnValStr(F, O);
+}
+
+void NVPTXAsmPrinter::EmitFunctionEntryLabel() {
+ SmallString<128> Str;
+ raw_svector_ostream O(Str);
+
+ if (!GlobalsEmitted) {
+ emitGlobals(*MF->getFunction()->getParent());
+ GlobalsEmitted = true;
+ }
+
+ // Set up
+ MRI = &MF->getRegInfo();
+ F = MF->getFunction();
+ emitLinkageDirective(F, O);
+ if (llvm::isKernelFunction(*F))
+ O << ".entry ";
+ else {
+ O << ".func ";
+ printReturnValStr(*MF, O);
+ }
+
+ O << *CurrentFnSym;
+
+ emitFunctionParamList(*MF, O);
+
+ if (llvm::isKernelFunction(*F))
+ emitKernelFunctionDirectives(*F, O);
+
+ OutStreamer.EmitRawText(O.str());
+
+ prevDebugLoc = DebugLoc();
+}
+
+void NVPTXAsmPrinter::EmitFunctionBodyStart() {
+ VRegMapping.clear();
+ OutStreamer.EmitRawText(StringRef("{\n"));
+ setAndEmitFunctionVirtualRegisters(*MF);
+
+ SmallString<128> Str;
+ raw_svector_ostream O(Str);
+ emitDemotedVars(MF->getFunction(), O);
+ OutStreamer.EmitRawText(O.str());
+}
+
+void NVPTXAsmPrinter::EmitFunctionBodyEnd() {
+ OutStreamer.EmitRawText(StringRef("}\n"));
+ VRegMapping.clear();
+}
+
+void NVPTXAsmPrinter::emitImplicitDef(const MachineInstr *MI) const {
+ unsigned RegNo = MI->getOperand(0).getReg();
+ const TargetRegisterInfo *TRI = TM.getRegisterInfo();
+ if (TRI->isVirtualRegister(RegNo)) {
+ OutStreamer.AddComment(Twine("implicit-def: ") +
+ getVirtualRegisterName(RegNo));
+ } else {
+ OutStreamer.AddComment(Twine("implicit-def: ") +
+ TM.getRegisterInfo()->getName(RegNo));
+ }
+ OutStreamer.AddBlankLine();
+}
+
+void NVPTXAsmPrinter::emitKernelFunctionDirectives(const Function &F,
+ raw_ostream &O) const {
+ // If the NVVM IR has some of reqntid* specified, then output
+ // the reqntid directive, and set the unspecified ones to 1.
+ // If none of reqntid* is specified, don't output reqntid directive.
+ unsigned reqntidx, reqntidy, reqntidz;
+ bool specified = false;
+ if (llvm::getReqNTIDx(F, reqntidx) == false)
+ reqntidx = 1;
+ else
+ specified = true;
+ if (llvm::getReqNTIDy(F, reqntidy) == false)
+ reqntidy = 1;
+ else
+ specified = true;
+ if (llvm::getReqNTIDz(F, reqntidz) == false)
+ reqntidz = 1;
+ else
+ specified = true;
+
+ if (specified)
+ O << ".reqntid " << reqntidx << ", " << reqntidy << ", " << reqntidz
+ << "\n";
+
+  // Likewise for maxntid*: if any of the values is specified in the NVVM IR,
+  // output the .maxntid directive and set the unspecified dimensions to 1;
+  // if none is specified, don't output the directive at all.
+ unsigned maxntidx, maxntidy, maxntidz;
+ specified = false;
+ if (llvm::getMaxNTIDx(F, maxntidx) == false)
+ maxntidx = 1;
+ else
+ specified = true;
+ if (llvm::getMaxNTIDy(F, maxntidy) == false)
+ maxntidy = 1;
+ else
+ specified = true;
+ if (llvm::getMaxNTIDz(F, maxntidz) == false)
+ maxntidz = 1;
+ else
+ specified = true;
+
+ if (specified)
+ O << ".maxntid " << maxntidx << ", " << maxntidy << ", " << maxntidz
+ << "\n";
+
+ unsigned mincta;
+ if (llvm::getMinCTASm(F, mincta))
+ O << ".minnctapersm " << mincta << "\n";
+}
+
+std::string
+NVPTXAsmPrinter::getVirtualRegisterName(unsigned Reg) const {
+ const TargetRegisterClass *RC = MRI->getRegClass(Reg);
+
+ std::string Name;
+ raw_string_ostream NameStr(Name);
+
+ VRegRCMap::const_iterator I = VRegMapping.find(RC);
+ assert(I != VRegMapping.end() && "Bad register class");
+ const DenseMap<unsigned, unsigned> &RegMap = I->second;
+
+ VRegMap::const_iterator VI = RegMap.find(Reg);
+ assert(VI != RegMap.end() && "Bad virtual register");
+ unsigned MappedVR = VI->second;
+
+ NameStr << getNVPTXRegClassStr(RC) << MappedVR;
+
+ NameStr.flush();
+ return Name;
+}
+
+void NVPTXAsmPrinter::emitVirtualRegister(unsigned int vr,
+ raw_ostream &O) {
+ O << getVirtualRegisterName(vr);
+}
+
+void NVPTXAsmPrinter::printVecModifiedImmediate(
+ const MachineOperand &MO, const char *Modifier, raw_ostream &O) {
+ static const char vecelem[] = { '0', '1', '2', '3', '0', '1', '2', '3' };
+ int Imm = (int) MO.getImm();
+ if (0 == strcmp(Modifier, "vecelem"))
+ O << "_" << vecelem[Imm];
+ else if (0 == strcmp(Modifier, "vecv4comm1")) {
+ if ((Imm < 0) || (Imm > 3))
+ O << "//";
+ } else if (0 == strcmp(Modifier, "vecv4comm2")) {
+ if ((Imm < 4) || (Imm > 7))
+ O << "//";
+ } else if (0 == strcmp(Modifier, "vecv4pos")) {
+ if (Imm < 0)
+ Imm = 0;
+ O << "_" << vecelem[Imm % 4];
+ } else if (0 == strcmp(Modifier, "vecv2comm1")) {
+ if ((Imm < 0) || (Imm > 1))
+ O << "//";
+ } else if (0 == strcmp(Modifier, "vecv2comm2")) {
+ if ((Imm < 2) || (Imm > 3))
+ O << "//";
+ } else if (0 == strcmp(Modifier, "vecv2pos")) {
+ if (Imm < 0)
+ Imm = 0;
+ O << "_" << vecelem[Imm % 2];
+ } else
+ llvm_unreachable("Unknown Modifier on immediate operand");
+}
+
+
+
+void NVPTXAsmPrinter::emitDeclaration(const Function *F, raw_ostream &O) {
+
+ emitLinkageDirective(F, O);
+ if (llvm::isKernelFunction(*F))
+ O << ".entry ";
+ else
+ O << ".func ";
+ printReturnValStr(F, O);
+ O << *getSymbol(F) << "\n";
+ emitFunctionParamList(F, O);
+ O << ";\n";
+}
+
+static bool usedInGlobalVarDef(const Constant *C) {
+ if (!C)
+ return false;
+
+ if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(C)) {
+ if (GV->getName().str() == "llvm.used")
+ return false;
+ return true;
+ }
+
+ for (const User *U : C->users())
+ if (const Constant *C = dyn_cast<Constant>(U))
+ if (usedInGlobalVarDef(C))
+ return true;
+
+ return false;
+}
+
+static bool usedInOneFunc(const User *U, Function const *&oneFunc) {
+ if (const GlobalVariable *othergv = dyn_cast<GlobalVariable>(U)) {
+ if (othergv->getName().str() == "llvm.used")
+ return true;
+ }
+
+ if (const Instruction *instr = dyn_cast<Instruction>(U)) {
+ if (instr->getParent() && instr->getParent()->getParent()) {
+ const Function *curFunc = instr->getParent()->getParent();
+ if (oneFunc && (curFunc != oneFunc))
+ return false;
+ oneFunc = curFunc;
+ return true;
+ } else
+ return false;
+ }
+
+ if (const MDNode *md = dyn_cast<MDNode>(U))
+ if (md->hasName() && ((md->getName().str() == "llvm.dbg.gv") ||
+ (md->getName().str() == "llvm.dbg.sp")))
+ return true;
+
+ for (const User *UU : U->users())
+ if (usedInOneFunc(UU, oneFunc) == false)
+ return false;
+
+ return true;
+}
+
+/* Find out if a global variable can be demoted to local scope.
+ * Currently, this is valid for CUDA shared variables, which have local
+ * scope and global lifetime. So the conditions to check are:
+ * 1. Is the global variable in the shared address space?
+ * 2. Does it have internal linkage?
+ * 3. Is the global variable referenced in only one function?
+ */
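+// Illustrative example (hypothetical IR, for exposition only): a variable such
+// as "@smem = internal addrspace(3) global [64 x float] ..." that is
+// referenced from a single kernel satisfies all three conditions and can be
+// demoted into that kernel's scope.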
+static bool canDemoteGlobalVar(const GlobalVariable *gv, Function const *&f) {
+ if (gv->hasInternalLinkage() == false)
+ return false;
+ const PointerType *Pty = gv->getType();
+ if (Pty->getAddressSpace() != llvm::ADDRESS_SPACE_SHARED)
+ return false;
+
+ const Function *oneFunc = nullptr;
+
+ bool flag = usedInOneFunc(gv, oneFunc);
+ if (flag == false)
+ return false;
+ if (!oneFunc)
+ return false;
+ f = oneFunc;
+ return true;
+}
+
+static bool useFuncSeen(const Constant *C,
+ llvm::DenseMap<const Function *, bool> &seenMap) {
+ for (const User *U : C->users()) {
+ if (const Constant *cu = dyn_cast<Constant>(U)) {
+ if (useFuncSeen(cu, seenMap))
+ return true;
+ } else if (const Instruction *I = dyn_cast<Instruction>(U)) {
+ const BasicBlock *bb = I->getParent();
+ if (!bb)
+ continue;
+ const Function *caller = bb->getParent();
+ if (!caller)
+ continue;
+ if (seenMap.find(caller) != seenMap.end())
+ return true;
+ }
+ }
+ return false;
+}
+
+void NVPTXAsmPrinter::emitDeclarations(const Module &M, raw_ostream &O) {
+ llvm::DenseMap<const Function *, bool> seenMap;
+ for (Module::const_iterator FI = M.begin(), FE = M.end(); FI != FE; ++FI) {
+ const Function *F = FI;
+
+ if (F->isDeclaration()) {
+ if (F->use_empty())
+ continue;
+ if (F->getIntrinsicID())
+ continue;
+ emitDeclaration(F, O);
+ continue;
+ }
+ for (const User *U : F->users()) {
+ if (const Constant *C = dyn_cast<Constant>(U)) {
+ if (usedInGlobalVarDef(C)) {
+ // The use is in the initialization of a global variable
+ // that is a function pointer, so print a declaration
+ // for the original function
+ emitDeclaration(F, O);
+ break;
+ }
+ // Emit a declaration of this function if the function that
+ // uses this constant expr has already been seen.
+ if (useFuncSeen(C, seenMap)) {
+ emitDeclaration(F, O);
+ break;
+ }
+ }
+
+ if (!isa<Instruction>(U))
+ continue;
+ const Instruction *instr = cast<Instruction>(U);
+ const BasicBlock *bb = instr->getParent();
+ if (!bb)
+ continue;
+ const Function *caller = bb->getParent();
+ if (!caller)
+ continue;
+
+      // If a caller has already been seen, then the caller appears in the
+      // module before the callee, so print out a declaration for the callee.
+ if (seenMap.find(caller) != seenMap.end()) {
+ emitDeclaration(F, O);
+ break;
+ }
+ }
+ seenMap[F] = true;
+ }
+}
+
+void NVPTXAsmPrinter::recordAndEmitFilenames(Module &M) {
+ DebugInfoFinder DbgFinder;
+ DbgFinder.processModule(M);
+
+ unsigned i = 1;
+ for (DICompileUnit DIUnit : DbgFinder.compile_units()) {
+ StringRef Filename(DIUnit.getFilename());
+ StringRef Dirname(DIUnit.getDirectory());
+ SmallString<128> FullPathName = Dirname;
+ if (!Dirname.empty() && !sys::path::is_absolute(Filename)) {
+ sys::path::append(FullPathName, Filename);
+ Filename = FullPathName.str();
+ }
+ if (filenameMap.find(Filename.str()) != filenameMap.end())
+ continue;
+ filenameMap[Filename.str()] = i;
+ OutStreamer.EmitDwarfFileDirective(i, "", Filename.str());
+ ++i;
+ }
+
+ for (DISubprogram SP : DbgFinder.subprograms()) {
+ StringRef Filename(SP.getFilename());
+ StringRef Dirname(SP.getDirectory());
+ SmallString<128> FullPathName = Dirname;
+ if (!Dirname.empty() && !sys::path::is_absolute(Filename)) {
+ sys::path::append(FullPathName, Filename);
+ Filename = FullPathName.str();
+ }
+ if (filenameMap.find(Filename.str()) != filenameMap.end())
+ continue;
+ filenameMap[Filename.str()] = i;
+ ++i;
+ }
+}
+
+bool NVPTXAsmPrinter::doInitialization(Module &M) {
+
+ SmallString<128> Str1;
+ raw_svector_ostream OS1(Str1);
+
+ MMI = getAnalysisIfAvailable<MachineModuleInfo>();
+ MMI->AnalyzeModule(M);
+
+ // We need to call the parent's one explicitly.
+ //bool Result = AsmPrinter::doInitialization(M);
+
+ // Initialize TargetLoweringObjectFile.
+ const_cast<TargetLoweringObjectFile &>(getObjFileLowering())
+ .Initialize(OutContext, TM);
+
+ Mang = new Mangler(TM.getDataLayout());
+
+ // Emit header before any dwarf directives are emitted below.
+ emitHeader(M, OS1);
+ OutStreamer.EmitRawText(OS1.str());
+
+ // Already commented out
+ //bool Result = AsmPrinter::doInitialization(M);
+
+ // Emit module-level inline asm if it exists.
+ if (!M.getModuleInlineAsm().empty()) {
+ OutStreamer.AddComment("Start of file scope inline assembly");
+ OutStreamer.AddBlankLine();
+ OutStreamer.EmitRawText(StringRef(M.getModuleInlineAsm()));
+ OutStreamer.AddBlankLine();
+ OutStreamer.AddComment("End of file scope inline assembly");
+ OutStreamer.AddBlankLine();
+ }
+
+ if (nvptxSubtarget.getDrvInterface() == NVPTX::CUDA)
+ recordAndEmitFilenames(M);
+
+ GlobalsEmitted = false;
+
+ return false; // success
+}
+
+void NVPTXAsmPrinter::emitGlobals(const Module &M) {
+ SmallString<128> Str2;
+ raw_svector_ostream OS2(Str2);
+
+ emitDeclarations(M, OS2);
+
+  // As ptxas does not support forward references to globals, we need to first
+  // sort the list of module-level globals in def-use order. We visit each
+  // global variable in order, and ensure that we emit it *after* the globals
+  // it depends on. We use a little extra memory maintaining both a set and a
+  // list to have fast searches while maintaining a strict ordering.
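+  // For example, if the initializer of @a refers to @b, then @b is printed
+  // before @a.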
+ SmallVector<const GlobalVariable *, 8> Globals;
+ DenseSet<const GlobalVariable *> GVVisited;
+ DenseSet<const GlobalVariable *> GVVisiting;
+
+ // Visit each global variable, in order
+ for (Module::const_global_iterator I = M.global_begin(), E = M.global_end();
+ I != E; ++I)
+ VisitGlobalVariableForEmission(I, Globals, GVVisited, GVVisiting);
+
+ assert(GVVisited.size() == M.getGlobalList().size() &&
+ "Missed a global variable");
+ assert(GVVisiting.size() == 0 && "Did not fully process a global variable");
+
+ // Print out module-level global variables in proper order
+ for (unsigned i = 0, e = Globals.size(); i != e; ++i)
+ printModuleLevelGV(Globals[i], OS2);
+
+ OS2 << '\n';
+
+ OutStreamer.EmitRawText(OS2.str());
+}
+
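+// Illustrative output of emitHeader() (the actual values depend on the
+// subtarget); e.g. for a 64-bit sm_20 CUDA target it would be roughly:
+//   .version 3.2
+//   .target sm_20
+//   .address_size 64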
+void NVPTXAsmPrinter::emitHeader(Module &M, raw_ostream &O) {
+ O << "//\n";
+ O << "// Generated by LLVM NVPTX Back-End\n";
+ O << "//\n";
+ O << "\n";
+
+ unsigned PTXVersion = nvptxSubtarget.getPTXVersion();
+ O << ".version " << (PTXVersion / 10) << "." << (PTXVersion % 10) << "\n";
+
+ O << ".target ";
+ O << nvptxSubtarget.getTargetName();
+
+ if (nvptxSubtarget.getDrvInterface() == NVPTX::NVCL)
+ O << ", texmode_independent";
+ if (nvptxSubtarget.getDrvInterface() == NVPTX::CUDA) {
+ if (!nvptxSubtarget.hasDouble())
+ O << ", map_f64_to_f32";
+ }
+
+ if (MAI->doesSupportDebugInformation())
+ O << ", debug";
+
+ O << "\n";
+
+ O << ".address_size ";
+ if (nvptxSubtarget.is64Bit())
+ O << "64";
+ else
+ O << "32";
+ O << "\n";
+
+ O << "\n";
+}
+
+bool NVPTXAsmPrinter::doFinalization(Module &M) {
+
+ // If we did not emit any functions, then the global declarations have not
+ // yet been emitted.
+ if (!GlobalsEmitted) {
+ emitGlobals(M);
+ GlobalsEmitted = true;
+ }
+
+  // XXX Temporarily remove global variables so that doFinalization() will not
+  // emit them again (global variables are emitted at the beginning).
+
+ Module::GlobalListType &global_list = M.getGlobalList();
+ int i, n = global_list.size();
+ GlobalVariable **gv_array = new GlobalVariable *[n];
+
+ // first, back-up GlobalVariable in gv_array
+ i = 0;
+ for (Module::global_iterator I = global_list.begin(), E = global_list.end();
+ I != E; ++I)
+ gv_array[i++] = &*I;
+
+ // second, empty global_list
+ while (!global_list.empty())
+ global_list.remove(global_list.begin());
+
+ // call doFinalization
+ bool ret = AsmPrinter::doFinalization(M);
+
+ // now we restore global variables
+ for (i = 0; i < n; i++)
+ global_list.insert(global_list.end(), gv_array[i]);
+
+ clearAnnotationCache(&M);
+
+ delete[] gv_array;
+ return ret;
+
+ //bool Result = AsmPrinter::doFinalization(M);
+  // Instead of calling the parent's doFinalization, we could clone the
+  // parent's doFinalization and customize it here. Currently, that would mean
+  // conditionally compiling out ('#if NVISA') the EmitGlobals() in the
+  // parent's doFinalization, which is too intrusive.
+ //
+ // Same for the doInitialization.
+ //return Result;
+}
+
+// This function emits appropriate linkage directives for
+// functions and global variables.
+//
+// extern function declaration -> .extern
+// extern function definition -> .visible
+// external global variable with init -> .visible
+// external without init -> .extern
+// appending -> not allowed, assert.
+// for any linkage other than
+// internal, private, linker_private,
+// linker_private_weak, linker_private_weak_def_auto,
+// we emit -> .weak.
+
+void NVPTXAsmPrinter::emitLinkageDirective(const GlobalValue *V,
+ raw_ostream &O) {
+ if (nvptxSubtarget.getDrvInterface() == NVPTX::CUDA) {
+ if (V->hasExternalLinkage()) {
+ if (isa<GlobalVariable>(V)) {
+ const GlobalVariable *GVar = cast<GlobalVariable>(V);
+ if (GVar) {
+ if (GVar->hasInitializer())
+ O << ".visible ";
+ else
+ O << ".extern ";
+ }
+ } else if (V->isDeclaration())
+ O << ".extern ";
+ else
+ O << ".visible ";
+ } else if (V->hasAppendingLinkage()) {
+ std::string msg;
+ msg.append("Error: ");
+ msg.append("Symbol ");
+ if (V->hasName())
+ msg.append(V->getName().str());
+ msg.append("has unsupported appending linkage type");
+ llvm_unreachable(msg.c_str());
+ } else if (!V->hasInternalLinkage() &&
+ !V->hasPrivateLinkage()) {
+ O << ".weak ";
+ }
+ }
+}
+
+void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar,
+ raw_ostream &O,
+ bool processDemoted) {
+
+  // Skip metadata
+ if (GVar->hasSection()) {
+ if (GVar->getSection() == StringRef("llvm.metadata"))
+ return;
+ }
+
+ // Skip LLVM intrinsic global variables
+ if (GVar->getName().startswith("llvm.") ||
+ GVar->getName().startswith("nvvm."))
+ return;
+
+ const DataLayout *TD = TM.getDataLayout();
+
+ // GlobalVariables are always constant pointers themselves.
+ const PointerType *PTy = GVar->getType();
+ Type *ETy = PTy->getElementType();
+
+ if (GVar->hasExternalLinkage()) {
+ if (GVar->hasInitializer())
+ O << ".visible ";
+ else
+ O << ".extern ";
+ } else if (GVar->hasLinkOnceLinkage() || GVar->hasWeakLinkage() ||
+ GVar->hasAvailableExternallyLinkage() ||
+ GVar->hasCommonLinkage()) {
+ O << ".weak ";
+ }
+
+ if (llvm::isTexture(*GVar)) {
+ O << ".global .texref " << llvm::getTextureName(*GVar) << ";\n";
+ return;
+ }
+
+ if (llvm::isSurface(*GVar)) {
+ O << ".global .surfref " << llvm::getSurfaceName(*GVar) << ";\n";
+ return;
+ }
+
+ if (GVar->isDeclaration()) {
+ // (extern) declarations, no definition or initializer
+ // Currently the only known declaration is for an automatic __local
+ // (.shared) promoted to global.
+ emitPTXGlobalVariable(GVar, O);
+ O << ";\n";
+ return;
+ }
+
+ if (llvm::isSampler(*GVar)) {
+ O << ".global .samplerref " << llvm::getSamplerName(*GVar);
+
+ const Constant *Initializer = nullptr;
+ if (GVar->hasInitializer())
+ Initializer = GVar->getInitializer();
+ const ConstantInt *CI = nullptr;
+ if (Initializer)
+ CI = dyn_cast<ConstantInt>(Initializer);
+ if (CI) {
+ unsigned sample = CI->getZExtValue();
+
+ O << " = { ";
+
+ for (int i = 0,
+ addr = ((sample & __CLK_ADDRESS_MASK) >> __CLK_ADDRESS_BASE);
+ i < 3; i++) {
+ O << "addr_mode_" << i << " = ";
+ switch (addr) {
+ case 0:
+ O << "wrap";
+ break;
+ case 1:
+ O << "clamp_to_border";
+ break;
+ case 2:
+ O << "clamp_to_edge";
+ break;
+ case 3:
+ O << "wrap";
+ break;
+ case 4:
+ O << "mirror";
+ break;
+ }
+ O << ", ";
+ }
+ O << "filter_mode = ";
+ switch ((sample & __CLK_FILTER_MASK) >> __CLK_FILTER_BASE) {
+ case 0:
+ O << "nearest";
+ break;
+ case 1:
+ O << "linear";
+ break;
+ case 2:
+ llvm_unreachable("Anisotropic filtering is not supported");
+ default:
+ O << "nearest";
+ break;
+ }
+ if (!((sample & __CLK_NORMALIZED_MASK) >> __CLK_NORMALIZED_BASE)) {
+ O << ", force_unnormalized_coords = 1";
+ }
+ O << " }";
+ }
+
+ O << ";\n";
+ return;
+ }
+
+ if (GVar->hasPrivateLinkage()) {
+
+ if (!strncmp(GVar->getName().data(), "unrollpragma", 12))
+ return;
+
+ // FIXME - need better way (e.g. Metadata) to avoid generating this global
+ if (!strncmp(GVar->getName().data(), "filename", 8))
+ return;
+ if (GVar->use_empty())
+ return;
+ }
+
+ const Function *demotedFunc = nullptr;
+ if (!processDemoted && canDemoteGlobalVar(GVar, demotedFunc)) {
+ O << "// " << GVar->getName().str() << " has been demoted\n";
+ if (localDecls.find(demotedFunc) != localDecls.end())
+ localDecls[demotedFunc].push_back(GVar);
+ else {
+ std::vector<const GlobalVariable *> temp;
+ temp.push_back(GVar);
+ localDecls[demotedFunc] = temp;
+ }
+ return;
+ }
+
+ O << ".";
+ emitPTXAddressSpace(PTy->getAddressSpace(), O);
+
+ if (isManaged(*GVar)) {
+ O << " .attribute(.managed)";
+ }
+
+ if (GVar->getAlignment() == 0)
+ O << " .align " << (int) TD->getPrefTypeAlignment(ETy);
+ else
+ O << " .align " << GVar->getAlignment();
+
+ if (ETy->isSingleValueType()) {
+ O << " .";
+ // Special case: ABI requires that we use .u8 for predicates
+ if (ETy->isIntegerTy(1))
+ O << "u8";
+ else
+ O << getPTXFundamentalTypeStr(ETy, false);
+ O << " ";
+ O << *getSymbol(GVar);
+
+    // PTX allows variable initialization only for the constant and global
+    // state spaces.
+ if (GVar->hasInitializer()) {
+ if ((PTy->getAddressSpace() == llvm::ADDRESS_SPACE_GLOBAL) ||
+ (PTy->getAddressSpace() == llvm::ADDRESS_SPACE_CONST)) {
+ const Constant *Initializer = GVar->getInitializer();
+        // 'undef' is treated as if no value were specified.
+ if (!Initializer->isNullValue() && !isa<UndefValue>(Initializer)) {
+ O << " = ";
+ printScalarConstant(Initializer, O);
+ }
+ } else {
+        // The frontend adds a zero-initializer to variables that don't have
+        // an initial value, so skip the warning in this case.
+ if (!GVar->getInitializer()->isNullValue()) {
+ std::string warnMsg = "initial value of '" + GVar->getName().str() +
+ "' is not allowed in addrspace(" +
+ llvm::utostr_32(PTy->getAddressSpace()) + ")";
+ report_fatal_error(warnMsg.c_str());
+ }
+ }
+ }
+ } else {
+ unsigned int ElementSize = 0;
+
+    // Although PTX has direct support for struct and array types, and LLVM IR
+    // is very close to PTX in this respect, LLVM CodeGen does not expose such
+    // high-level field accesses to targets. Structs, arrays and vectors are
+    // therefore lowered into arrays of bytes.
+ switch (ETy->getTypeID()) {
+ case Type::StructTyID:
+ case Type::ArrayTyID:
+ case Type::VectorTyID:
+ ElementSize = TD->getTypeStoreSize(ETy);
+      // PTX allows variable initialization only for the constant and
+      // global state spaces.
+ if (((PTy->getAddressSpace() == llvm::ADDRESS_SPACE_GLOBAL) ||
+ (PTy->getAddressSpace() == llvm::ADDRESS_SPACE_CONST)) &&
+ GVar->hasInitializer()) {
+ const Constant *Initializer = GVar->getInitializer();
+ if (!isa<UndefValue>(Initializer) && !Initializer->isNullValue()) {
+ AggBuffer aggBuffer(ElementSize, O, *this);
+ bufferAggregateConstant(Initializer, &aggBuffer);
+ if (aggBuffer.numSymbols) {
+ if (nvptxSubtarget.is64Bit()) {
+ O << " .u64 " << *getSymbol(GVar) << "[";
+ O << ElementSize / 8;
+ } else {
+ O << " .u32 " << *getSymbol(GVar) << "[";
+ O << ElementSize / 4;
+ }
+ O << "]";
+ } else {
+ O << " .b8 " << *getSymbol(GVar) << "[";
+ O << ElementSize;
+ O << "]";
+ }
+ O << " = {";
+ aggBuffer.print();
+ O << "}";
+ } else {
+ O << " .b8 " << *getSymbol(GVar);
+ if (ElementSize) {
+ O << "[";
+ O << ElementSize;
+ O << "]";
+ }
+ }
+ } else {
+ O << " .b8 " << *getSymbol(GVar);
+ if (ElementSize) {
+ O << "[";
+ O << ElementSize;
+ O << "]";
+ }
+ }
+ break;
+ default:
+ llvm_unreachable("type not supported yet");
+ }
+
+ }
+ O << ";\n";
+}
+
+void NVPTXAsmPrinter::emitDemotedVars(const Function *f, raw_ostream &O) {
+ if (localDecls.find(f) == localDecls.end())
+ return;
+
+ std::vector<const GlobalVariable *> &gvars = localDecls[f];
+
+ for (unsigned i = 0, e = gvars.size(); i != e; ++i) {
+ O << "\t// demoted variable\n\t";
+ printModuleLevelGV(gvars[i], O, true);
+ }
+}
+
+void NVPTXAsmPrinter::emitPTXAddressSpace(unsigned int AddressSpace,
+ raw_ostream &O) const {
+ switch (AddressSpace) {
+ case llvm::ADDRESS_SPACE_LOCAL:
+ O << "local";
+ break;
+ case llvm::ADDRESS_SPACE_GLOBAL:
+ O << "global";
+ break;
+ case llvm::ADDRESS_SPACE_CONST:
+ O << "const";
+ break;
+ case llvm::ADDRESS_SPACE_SHARED:
+ O << "shared";
+ break;
+ default:
+ report_fatal_error("Bad address space found while emitting PTX");
+ break;
+ }
+}
+
+std::string
+NVPTXAsmPrinter::getPTXFundamentalTypeStr(const Type *Ty, bool useB4PTR) const {
+ switch (Ty->getTypeID()) {
+ default:
+ llvm_unreachable("unexpected type");
+ break;
+ case Type::IntegerTyID: {
+ unsigned NumBits = cast<IntegerType>(Ty)->getBitWidth();
+ if (NumBits == 1)
+ return "pred";
+ else if (NumBits <= 64) {
+ std::string name = "u";
+ return name + utostr(NumBits);
+ } else {
+ llvm_unreachable("Integer too large");
+ break;
+ }
+ break;
+ }
+ case Type::FloatTyID:
+ return "f32";
+ case Type::DoubleTyID:
+ return "f64";
+ case Type::PointerTyID:
+    if (nvptxSubtarget.is64Bit()) {
+      if (useB4PTR)
+        return "b64";
+      else
+        return "u64";
+    } else if (useB4PTR)
+      return "b32";
+    else
+      return "u32";
+ }
+ llvm_unreachable("unexpected type");
+  return "";
+}
+
+void NVPTXAsmPrinter::emitPTXGlobalVariable(const GlobalVariable *GVar,
+ raw_ostream &O) {
+
+ const DataLayout *TD = TM.getDataLayout();
+
+ // GlobalVariables are always constant pointers themselves.
+ const PointerType *PTy = GVar->getType();
+ Type *ETy = PTy->getElementType();
+
+ O << ".";
+ emitPTXAddressSpace(PTy->getAddressSpace(), O);
+ if (GVar->getAlignment() == 0)
+ O << " .align " << (int) TD->getPrefTypeAlignment(ETy);
+ else
+ O << " .align " << GVar->getAlignment();
+
+ if (ETy->isSingleValueType()) {
+ O << " .";
+ O << getPTXFundamentalTypeStr(ETy);
+ O << " ";
+ O << *getSymbol(GVar);
+ return;
+ }
+
+ int64_t ElementSize = 0;
+
+  // Although PTX has direct support for struct and array types, and LLVM IR
+  // is very close to PTX in this respect, LLVM CodeGen does not expose such
+  // high-level field accesses to targets. Structs and arrays are therefore
+  // lowered into arrays of bytes.
+ switch (ETy->getTypeID()) {
+ case Type::StructTyID:
+ case Type::ArrayTyID:
+ case Type::VectorTyID:
+ ElementSize = TD->getTypeStoreSize(ETy);
+ O << " .b8 " << *getSymbol(GVar) << "[";
+ if (ElementSize) {
+ O << itostr(ElementSize);
+ }
+ O << "]";
+ break;
+ default:
+ llvm_unreachable("type not supported yet");
+ }
+ return;
+}
+
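+// Illustrative examples of the OpenCL alignment rules implemented below
+// (assuming the usual 4-byte float alignment): a 3-element vector is padded to
+// 4 elements, so a float3 gets 16-byte alignment, and a struct takes the
+// largest alignment of any of its members.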
+static unsigned int getOpenCLAlignment(const DataLayout *TD, Type *Ty) {
+ if (Ty->isSingleValueType())
+ return TD->getPrefTypeAlignment(Ty);
+
+ const ArrayType *ATy = dyn_cast<ArrayType>(Ty);
+ if (ATy)
+ return getOpenCLAlignment(TD, ATy->getElementType());
+
+ const VectorType *VTy = dyn_cast<VectorType>(Ty);
+ if (VTy) {
+ Type *ETy = VTy->getElementType();
+ unsigned int numE = VTy->getNumElements();
+ unsigned int alignE = TD->getPrefTypeAlignment(ETy);
+ if (numE == 3)
+ return 4 * alignE;
+ else
+ return numE * alignE;
+ }
+
+ const StructType *STy = dyn_cast<StructType>(Ty);
+ if (STy) {
+ unsigned int alignStruct = 1;
+ // Go through each element of the struct and find the
+ // largest alignment.
+ for (unsigned i = 0, e = STy->getNumElements(); i != e; i++) {
+ Type *ETy = STy->getElementType(i);
+ unsigned int align = getOpenCLAlignment(TD, ETy);
+ if (align > alignStruct)
+ alignStruct = align;
+ }
+ return alignStruct;
+ }
+
+ const FunctionType *FTy = dyn_cast<FunctionType>(Ty);
+ if (FTy)
+ return TD->getPointerPrefAlignment();
+ return TD->getPrefTypeAlignment(Ty);
+}
+
+void NVPTXAsmPrinter::printParamName(Function::const_arg_iterator I,
+ int paramIndex, raw_ostream &O) {
+ if ((nvptxSubtarget.getDrvInterface() == NVPTX::NVCL) ||
+ (nvptxSubtarget.getDrvInterface() == NVPTX::CUDA))
+ O << *getSymbol(I->getParent()) << "_param_" << paramIndex;
+ else {
+ std::string argName = I->getName();
+ const char *p = argName.c_str();
+ while (*p) {
+ if (*p == '.')
+ O << "_";
+ else
+ O << *p;
+ p++;
+ }
+ }
+}
+
+void NVPTXAsmPrinter::printParamName(int paramIndex, raw_ostream &O) {
+ Function::const_arg_iterator I, E;
+ int i = 0;
+
+ if ((nvptxSubtarget.getDrvInterface() == NVPTX::NVCL) ||
+ (nvptxSubtarget.getDrvInterface() == NVPTX::CUDA)) {
+ O << *CurrentFnSym << "_param_" << paramIndex;
+ return;
+ }
+
+ for (I = F->arg_begin(), E = F->arg_end(); I != E; ++I, i++) {
+ if (i == paramIndex) {
+ printParamName(I, paramIndex, O);
+ return;
+ }
+ }
+ llvm_unreachable("paramIndex out of bound");
+}
+
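+// Illustrative output (hypothetical kernel "foo(float *p, int n)" compiled for
+// a 64-bit CUDA target with sm >= 20), roughly:
+//   (
+//     .param .u64 foo_param_0,
+//     .param .u32 foo_param_1
+//   )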
+void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) {
+ const DataLayout *TD = TM.getDataLayout();
+ const AttributeSet &PAL = F->getAttributes();
+ const TargetLowering *TLI = TM.getTargetLowering();
+ Function::const_arg_iterator I, E;
+ unsigned paramIndex = 0;
+ bool first = true;
+ bool isKernelFunc = llvm::isKernelFunction(*F);
+ bool isABI = (nvptxSubtarget.getSmVersion() >= 20);
+ MVT thePointerTy = TLI->getPointerTy();
+
+ O << "(\n";
+
+ for (I = F->arg_begin(), E = F->arg_end(); I != E; ++I, paramIndex++) {
+ Type *Ty = I->getType();
+
+ if (!first)
+ O << ",\n";
+
+ first = false;
+
+ // Handle image/sampler parameters
+ if (isKernelFunction(*F)) {
+ if (isSampler(*I) || isImage(*I)) {
+ if (isImage(*I)) {
+ std::string sname = I->getName();
+ if (isImageWriteOnly(*I) || isImageReadWrite(*I)) {
+ if (nvptxSubtarget.hasImageHandles())
+ O << "\t.param .u64 .ptr .surfref ";
+ else
+ O << "\t.param .surfref ";
+ O << *CurrentFnSym << "_param_" << paramIndex;
+ }
+ else { // Default image is read_only
+ if (nvptxSubtarget.hasImageHandles())
+ O << "\t.param .u64 .ptr .texref ";
+ else
+ O << "\t.param .texref ";
+ O << *CurrentFnSym << "_param_" << paramIndex;
+ }
+ } else {
+ if (nvptxSubtarget.hasImageHandles())
+ O << "\t.param .u64 .ptr .samplerref ";
+ else
+ O << "\t.param .samplerref ";
+ O << *CurrentFnSym << "_param_" << paramIndex;
+ }
+ continue;
+ }
+ }
+
+ if (PAL.hasAttribute(paramIndex + 1, Attribute::ByVal) == false) {
+ if (Ty->isAggregateType() || Ty->isVectorTy()) {
+ // Just print .param .align <a> .b8 .param[size];
+ // <a> = PAL.getparamalignment
+ // size = typeallocsize of element type
+ unsigned align = PAL.getParamAlignment(paramIndex + 1);
+ if (align == 0)
+ align = TD->getABITypeAlignment(Ty);
+
+ unsigned sz = TD->getTypeAllocSize(Ty);
+ O << "\t.param .align " << align << " .b8 ";
+ printParamName(I, paramIndex, O);
+ O << "[" << sz << "]";
+
+ continue;
+ }
+ // Just a scalar
+ const PointerType *PTy = dyn_cast<PointerType>(Ty);
+ if (isKernelFunc) {
+ if (PTy) {
+ // Special handling for pointer arguments to kernel
+ O << "\t.param .u" << thePointerTy.getSizeInBits() << " ";
+
+ if (nvptxSubtarget.getDrvInterface() != NVPTX::CUDA) {
+ Type *ETy = PTy->getElementType();
+ int addrSpace = PTy->getAddressSpace();
+ switch (addrSpace) {
+ default:
+ O << ".ptr ";
+ break;
+ case llvm::ADDRESS_SPACE_CONST:
+ O << ".ptr .const ";
+ break;
+ case llvm::ADDRESS_SPACE_SHARED:
+ O << ".ptr .shared ";
+ break;
+ case llvm::ADDRESS_SPACE_GLOBAL:
+ O << ".ptr .global ";
+ break;
+ }
+ O << ".align " << (int) getOpenCLAlignment(TD, ETy) << " ";
+ }
+ printParamName(I, paramIndex, O);
+ continue;
+ }
+
+ // non-pointer scalar to kernel func
+ O << "\t.param .";
+ // Special case: predicate operands become .u8 types
+ if (Ty->isIntegerTy(1))
+ O << "u8";
+ else
+ O << getPTXFundamentalTypeStr(Ty);
+ O << " ";
+ printParamName(I, paramIndex, O);
+ continue;
+ }
+ // Non-kernel function, just print .param .b<size> for ABI
+ // and .reg .b<size> for non-ABI
+ unsigned sz = 0;
+ if (isa<IntegerType>(Ty)) {
+ sz = cast<IntegerType>(Ty)->getBitWidth();
+ if (sz < 32)
+ sz = 32;
+ } else if (isa<PointerType>(Ty))
+ sz = thePointerTy.getSizeInBits();
+ else
+ sz = Ty->getPrimitiveSizeInBits();
+ if (isABI)
+ O << "\t.param .b" << sz << " ";
+ else
+ O << "\t.reg .b" << sz << " ";
+ printParamName(I, paramIndex, O);
+ continue;
+ }
+
+ // param has byVal attribute. So should be a pointer
+ const PointerType *PTy = dyn_cast<PointerType>(Ty);
+ assert(PTy && "Param with byval attribute should be a pointer type");
+ Type *ETy = PTy->getElementType();
+
+ if (isABI || isKernelFunc) {
+ // Just print .param .align <a> .b8 .param[size];
+ // <a> = PAL.getparamalignment
+ // size = typeallocsize of element type
+ unsigned align = PAL.getParamAlignment(paramIndex + 1);
+ if (align == 0)
+ align = TD->getABITypeAlignment(ETy);
+
+ unsigned sz = TD->getTypeAllocSize(ETy);
+ O << "\t.param .align " << align << " .b8 ";
+ printParamName(I, paramIndex, O);
+ O << "[" << sz << "]";
+ continue;
+ } else {
+ // Split the ETy into constituent parts and
+ // print .param .b<size> <name> for each part.
+ // Further, if a part is vector, print the above for
+ // each vector element.
+ SmallVector<EVT, 16> vtparts;
+ ComputeValueVTs(*TLI, ETy, vtparts);
+ for (unsigned i = 0, e = vtparts.size(); i != e; ++i) {
+ unsigned elems = 1;
+ EVT elemtype = vtparts[i];
+ if (vtparts[i].isVector()) {
+ elems = vtparts[i].getVectorNumElements();
+ elemtype = vtparts[i].getVectorElementType();
+ }
+
+ for (unsigned j = 0, je = elems; j != je; ++j) {
+ unsigned sz = elemtype.getSizeInBits();
+ if (elemtype.isInteger() && (sz < 32))
+ sz = 32;
+ O << "\t.reg .b" << sz << " ";
+ printParamName(I, paramIndex, O);
+ if (j < je - 1)
+ O << ",\n";
+ ++paramIndex;
+ }
+ if (i < e - 1)
+ O << ",\n";
+ }
+ --paramIndex;
+ continue;
+ }
+ }
+
+ O << "\n)\n";
+}
+
+void NVPTXAsmPrinter::emitFunctionParamList(const MachineFunction &MF,
+ raw_ostream &O) {
+ const Function *F = MF.getFunction();
+ emitFunctionParamList(F, O);
+}
+
+void NVPTXAsmPrinter::setAndEmitFunctionVirtualRegisters(
+ const MachineFunction &MF) {
+ SmallString<128> Str;
+ raw_svector_ostream O(Str);
+
+  // Map each global virtual register number to a per-register-class virtual
+  // register number, starting from 1 within each class.
+ const TargetRegisterInfo *TRI = MF.getTarget().getRegisterInfo();
+ //unsigned numRegClasses = TRI->getNumRegClasses();
+
+ // Emit the Fake Stack Object
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+ int NumBytes = (int) MFI->getStackSize();
+ if (NumBytes) {
+ O << "\t.local .align " << MFI->getMaxAlignment() << " .b8 \t" << DEPOTNAME
+ << getFunctionNumber() << "[" << NumBytes << "];\n";
+ if (nvptxSubtarget.is64Bit()) {
+ O << "\t.reg .b64 \t%SP;\n";
+ O << "\t.reg .b64 \t%SPL;\n";
+ } else {
+ O << "\t.reg .b32 \t%SP;\n";
+ O << "\t.reg .b32 \t%SPL;\n";
+ }
+ }
+
+  // Go through all virtual registers to establish the mapping between the
+  // global virtual register number and the per-class virtual register number.
+  // The per-class virtual register number is what appears in the PTX output.
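+  // For example, the first two virtual registers of the 32-bit integer class
+  // would be named %r1 and %r2 in the emitted PTX (illustrative naming).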
+ unsigned int numVRs = MRI->getNumVirtRegs();
+ for (unsigned i = 0; i < numVRs; i++) {
+ unsigned int vr = TRI->index2VirtReg(i);
+ const TargetRegisterClass *RC = MRI->getRegClass(vr);
+ DenseMap<unsigned, unsigned> &regmap = VRegMapping[RC];
+ int n = regmap.size();
+ regmap.insert(std::make_pair(vr, n + 1));
+ }
+
+ // Emit register declarations
+ // @TODO: Extract out the real register usage
+ // O << "\t.reg .pred %p<" << NVPTXNumRegisters << ">;\n";
+ // O << "\t.reg .s16 %rc<" << NVPTXNumRegisters << ">;\n";
+ // O << "\t.reg .s16 %rs<" << NVPTXNumRegisters << ">;\n";
+ // O << "\t.reg .s32 %r<" << NVPTXNumRegisters << ">;\n";
+ // O << "\t.reg .s64 %rd<" << NVPTXNumRegisters << ">;\n";
+ // O << "\t.reg .f32 %f<" << NVPTXNumRegisters << ">;\n";
+ // O << "\t.reg .f64 %fd<" << NVPTXNumRegisters << ">;\n";
+
+ // Emit declaration of the virtual registers or 'physical' registers for
+ // each register class
+ for (unsigned i=0; i< TRI->getNumRegClasses(); i++) {
+ const TargetRegisterClass *RC = TRI->getRegClass(i);
+ DenseMap<unsigned, unsigned> &regmap = VRegMapping[RC];
+ std::string rcname = getNVPTXRegClassName(RC);
+ std::string rcStr = getNVPTXRegClassStr(RC);
+ int n = regmap.size();
+
+ // Only declare those registers that may be used.
+ if (n) {
+ O << "\t.reg " << rcname << " \t" << rcStr << "<" << (n+1)
+ << ">;\n";
+ }
+ }
+
+ OutStreamer.EmitRawText(O.str());
+}
+
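+// Prints a floating point constant in the PTX hexadecimal form, e.g. the
+// float 1.0 is printed as "0f3F800000" and the double 1.0 as
+// "0d3FF0000000000000".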
+void NVPTXAsmPrinter::printFPConstant(const ConstantFP *Fp, raw_ostream &O) {
+ APFloat APF = APFloat(Fp->getValueAPF()); // make a copy
+ bool ignored;
+ unsigned int numHex;
+ const char *lead;
+
+ if (Fp->getType()->getTypeID() == Type::FloatTyID) {
+ numHex = 8;
+ lead = "0f";
+ APF.convert(APFloat::IEEEsingle, APFloat::rmNearestTiesToEven, &ignored);
+ } else if (Fp->getType()->getTypeID() == Type::DoubleTyID) {
+ numHex = 16;
+ lead = "0d";
+ APF.convert(APFloat::IEEEdouble, APFloat::rmNearestTiesToEven, &ignored);
+ } else
+ llvm_unreachable("unsupported fp type");
+
+ APInt API = APF.bitcastToAPInt();
+ std::string hexstr(utohexstr(API.getZExtValue()));
+ O << lead;
+ if (hexstr.length() < numHex)
+ O << std::string(numHex - hexstr.length(), '0');
+ O << utohexstr(API.getZExtValue());
+}
+
+void NVPTXAsmPrinter::printScalarConstant(const Constant *CPV, raw_ostream &O) {
+ if (const ConstantInt *CI = dyn_cast<ConstantInt>(CPV)) {
+ O << CI->getValue();
+ return;
+ }
+ if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CPV)) {
+ printFPConstant(CFP, O);
+ return;
+ }
+ if (isa<ConstantPointerNull>(CPV)) {
+ O << "0";
+ return;
+ }
+ if (const GlobalValue *GVar = dyn_cast<GlobalValue>(CPV)) {
+ PointerType *PTy = dyn_cast<PointerType>(GVar->getType());
+ bool IsNonGenericPointer = false;
+ if (PTy && PTy->getAddressSpace() != 0) {
+ IsNonGenericPointer = true;
+ }
+ if (EmitGeneric && !isa<Function>(CPV) && !IsNonGenericPointer) {
+ O << "generic(";
+ O << *getSymbol(GVar);
+ O << ")";
+ } else {
+ O << *getSymbol(GVar);
+ }
+ return;
+ }
+ if (const ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(CPV)) {
+ const Value *v = Cexpr->stripPointerCasts();
+ PointerType *PTy = dyn_cast<PointerType>(Cexpr->getType());
+ bool IsNonGenericPointer = false;
+ if (PTy && PTy->getAddressSpace() != 0) {
+ IsNonGenericPointer = true;
+ }
+ if (const GlobalValue *GVar = dyn_cast<GlobalValue>(v)) {
+ if (EmitGeneric && !isa<Function>(v) && !IsNonGenericPointer) {
+ O << "generic(";
+ O << *getSymbol(GVar);
+ O << ")";
+ } else {
+ O << *getSymbol(GVar);
+ }
+ return;
+ } else {
+ O << *LowerConstant(CPV, *this);
+ return;
+ }
+ }
+ llvm_unreachable("Not scalar type found in printScalarConstant()");
+}
+
+void NVPTXAsmPrinter::bufferLEByte(const Constant *CPV, int Bytes,
+ AggBuffer *aggBuffer) {
+
+ const DataLayout *TD = TM.getDataLayout();
+
+ if (isa<UndefValue>(CPV) || CPV->isNullValue()) {
+ int s = TD->getTypeAllocSize(CPV->getType());
+ if (s < Bytes)
+ s = Bytes;
+ aggBuffer->addZeros(s);
+ return;
+ }
+
+ unsigned char *ptr;
+ switch (CPV->getType()->getTypeID()) {
+
+ case Type::IntegerTyID: {
+ const Type *ETy = CPV->getType();
+ if (ETy == Type::getInt8Ty(CPV->getContext())) {
+ unsigned char c =
+ (unsigned char)(dyn_cast<ConstantInt>(CPV))->getZExtValue();
+ ptr = &c;
+ aggBuffer->addBytes(ptr, 1, Bytes);
+ } else if (ETy == Type::getInt16Ty(CPV->getContext())) {
+ short int16 = (short)(dyn_cast<ConstantInt>(CPV))->getZExtValue();
+ ptr = (unsigned char *)&int16;
+ aggBuffer->addBytes(ptr, 2, Bytes);
+ } else if (ETy == Type::getInt32Ty(CPV->getContext())) {
+ if (const ConstantInt *constInt = dyn_cast<ConstantInt>(CPV)) {
+ int int32 = (int)(constInt->getZExtValue());
+ ptr = (unsigned char *)&int32;
+ aggBuffer->addBytes(ptr, 4, Bytes);
+ break;
+ } else if (const ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(CPV)) {
+ if (const ConstantInt *constInt = dyn_cast<ConstantInt>(
+ ConstantFoldConstantExpression(Cexpr, TD))) {
+ int int32 = (int)(constInt->getZExtValue());
+ ptr = (unsigned char *)&int32;
+ aggBuffer->addBytes(ptr, 4, Bytes);
+ break;
+ }
+ if (Cexpr->getOpcode() == Instruction::PtrToInt) {
+ Value *v = Cexpr->getOperand(0)->stripPointerCasts();
+ aggBuffer->addSymbol(v);
+ aggBuffer->addZeros(4);
+ break;
+ }
+ }
+ llvm_unreachable("unsupported integer const type");
+ } else if (ETy == Type::getInt64Ty(CPV->getContext())) {
+ if (const ConstantInt *constInt = dyn_cast<ConstantInt>(CPV)) {
+ long long int64 = (long long)(constInt->getZExtValue());
+ ptr = (unsigned char *)&int64;
+ aggBuffer->addBytes(ptr, 8, Bytes);
+ break;
+ } else if (const ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(CPV)) {
+ if (const ConstantInt *constInt = dyn_cast<ConstantInt>(
+ ConstantFoldConstantExpression(Cexpr, TD))) {
+ long long int64 = (long long)(constInt->getZExtValue());
+ ptr = (unsigned char *)&int64;
+ aggBuffer->addBytes(ptr, 8, Bytes);
+ break;
+ }
+ if (Cexpr->getOpcode() == Instruction::PtrToInt) {
+ Value *v = Cexpr->getOperand(0)->stripPointerCasts();
+ aggBuffer->addSymbol(v);
+ aggBuffer->addZeros(8);
+ break;
+ }
+ }
+ llvm_unreachable("unsupported integer const type");
+ } else
+ llvm_unreachable("unsupported integer const type");
+ break;
+ }
+ case Type::FloatTyID:
+ case Type::DoubleTyID: {
+ const ConstantFP *CFP = dyn_cast<ConstantFP>(CPV);
+ const Type *Ty = CFP->getType();
+ if (Ty == Type::getFloatTy(CPV->getContext())) {
+ float float32 = (float) CFP->getValueAPF().convertToFloat();
+ ptr = (unsigned char *)&float32;
+ aggBuffer->addBytes(ptr, 4, Bytes);
+ } else if (Ty == Type::getDoubleTy(CPV->getContext())) {
+ double float64 = CFP->getValueAPF().convertToDouble();
+ ptr = (unsigned char *)&float64;
+ aggBuffer->addBytes(ptr, 8, Bytes);
+ } else {
+ llvm_unreachable("unsupported fp const type");
+ }
+ break;
+ }
+ case Type::PointerTyID: {
+ if (const GlobalValue *GVar = dyn_cast<GlobalValue>(CPV)) {
+ aggBuffer->addSymbol(GVar);
+ } else if (const ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(CPV)) {
+ const Value *v = Cexpr->stripPointerCasts();
+ aggBuffer->addSymbol(v);
+ }
+ unsigned int s = TD->getTypeAllocSize(CPV->getType());
+ aggBuffer->addZeros(s);
+ break;
+ }
+
+ case Type::ArrayTyID:
+ case Type::VectorTyID:
+ case Type::StructTyID: {
+ if (isa<ConstantArray>(CPV) || isa<ConstantVector>(CPV) ||
+ isa<ConstantStruct>(CPV) || isa<ConstantDataSequential>(CPV)) {
+ int ElementSize = TD->getTypeAllocSize(CPV->getType());
+ bufferAggregateConstant(CPV, aggBuffer);
+ if (Bytes > ElementSize)
+ aggBuffer->addZeros(Bytes - ElementSize);
+ } else if (isa<ConstantAggregateZero>(CPV))
+ aggBuffer->addZeros(Bytes);
+ else
+ llvm_unreachable("Unexpected Constant type");
+ break;
+ }
+
+ default:
+ llvm_unreachable("unsupported type");
+ }
+}
+
+void NVPTXAsmPrinter::bufferAggregateConstant(const Constant *CPV,
+ AggBuffer *aggBuffer) {
+ const DataLayout *TD = TM.getDataLayout();
+ int Bytes;
+
+ // Old constants
+ if (isa<ConstantArray>(CPV) || isa<ConstantVector>(CPV)) {
+ if (CPV->getNumOperands())
+ for (unsigned i = 0, e = CPV->getNumOperands(); i != e; ++i)
+ bufferLEByte(cast<Constant>(CPV->getOperand(i)), 0, aggBuffer);
+ return;
+ }
+
+ if (const ConstantDataSequential *CDS =
+ dyn_cast<ConstantDataSequential>(CPV)) {
+ if (CDS->getNumElements())
+ for (unsigned i = 0; i < CDS->getNumElements(); ++i)
+ bufferLEByte(cast<Constant>(CDS->getElementAsConstant(i)), 0,
+ aggBuffer);
+ return;
+ }
+
+ if (isa<ConstantStruct>(CPV)) {
+ if (CPV->getNumOperands()) {
+ StructType *ST = cast<StructType>(CPV->getType());
+ for (unsigned i = 0, e = CPV->getNumOperands(); i != e; ++i) {
+ if (i == (e - 1))
+ Bytes = TD->getStructLayout(ST)->getElementOffset(0) +
+ TD->getTypeAllocSize(ST) -
+ TD->getStructLayout(ST)->getElementOffset(i);
+ else
+ Bytes = TD->getStructLayout(ST)->getElementOffset(i + 1) -
+ TD->getStructLayout(ST)->getElementOffset(i);
+ bufferLEByte(cast<Constant>(CPV->getOperand(i)), Bytes, aggBuffer);
+ }
+ }
+ return;
+ }
+ llvm_unreachable("unsupported constant type in printAggregateConstant()");
+}
+
+// isImageType - Check whether the given type was recorded in TypeNameMap as
+// one of the OpenCL image types (struct._image1d_t/_image2d_t/_image3d_t).
+//
+
+bool NVPTXAsmPrinter::isImageType(const Type *Ty) {
+
+ std::map<const Type *, std::string>::iterator PI = TypeNameMap.find(Ty);
+
+ if (PI != TypeNameMap.end() && (!PI->second.compare("struct._image1d_t") ||
+ !PI->second.compare("struct._image2d_t") ||
+ !PI->second.compare("struct._image3d_t")))
+ return true;
+
+ return false;
+}
+
+
+bool NVPTXAsmPrinter::ignoreLoc(const MachineInstr &MI) {
+ switch (MI.getOpcode()) {
+ default:
+ return false;
+ case NVPTX::CallArgBeginInst:
+ case NVPTX::CallArgEndInst0:
+ case NVPTX::CallArgEndInst1:
+ case NVPTX::CallArgF32:
+ case NVPTX::CallArgF64:
+ case NVPTX::CallArgI16:
+ case NVPTX::CallArgI32:
+ case NVPTX::CallArgI32imm:
+ case NVPTX::CallArgI64:
+ case NVPTX::CallArgParam:
+ case NVPTX::CallVoidInst:
+ case NVPTX::CallVoidInstReg:
+ case NVPTX::Callseq_End:
+ case NVPTX::CallVoidInstReg64:
+ case NVPTX::DeclareParamInst:
+ case NVPTX::DeclareRetMemInst:
+ case NVPTX::DeclareRetRegInst:
+ case NVPTX::DeclareRetScalarInst:
+ case NVPTX::DeclareScalarParamInst:
+ case NVPTX::DeclareScalarRegInst:
+ case NVPTX::StoreParamF32:
+ case NVPTX::StoreParamF64:
+ case NVPTX::StoreParamI16:
+ case NVPTX::StoreParamI32:
+ case NVPTX::StoreParamI64:
+ case NVPTX::StoreParamI8:
+ case NVPTX::StoreRetvalF32:
+ case NVPTX::StoreRetvalF64:
+ case NVPTX::StoreRetvalI16:
+ case NVPTX::StoreRetvalI32:
+ case NVPTX::StoreRetvalI64:
+ case NVPTX::StoreRetvalI8:
+ case NVPTX::LastCallArgF32:
+ case NVPTX::LastCallArgF64:
+ case NVPTX::LastCallArgI16:
+ case NVPTX::LastCallArgI32:
+ case NVPTX::LastCallArgI32imm:
+ case NVPTX::LastCallArgI64:
+ case NVPTX::LastCallArgParam:
+ case NVPTX::LoadParamMemF32:
+ case NVPTX::LoadParamMemF64:
+ case NVPTX::LoadParamMemI16:
+ case NVPTX::LoadParamMemI32:
+ case NVPTX::LoadParamMemI64:
+ case NVPTX::LoadParamMemI8:
+ case NVPTX::PrototypeInst:
+ case NVPTX::DBG_VALUE:
+ return true;
+ }
+ return false;
+}
+
+/// PrintAsmOperand - Print out an operand for an inline asm expression.
+///
+bool NVPTXAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant,
+ const char *ExtraCode, raw_ostream &O) {
+ if (ExtraCode && ExtraCode[0]) {
+ if (ExtraCode[1] != 0)
+ return true; // Unknown modifier.
+
+ switch (ExtraCode[0]) {
+ default:
+ // See if this is a generic print operand
+ return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O);
+ case 'r':
+ break;
+ }
+ }
+
+ printOperand(MI, OpNo, O);
+
+ return false;
+}
+
+bool NVPTXAsmPrinter::PrintAsmMemoryOperand(
+ const MachineInstr *MI, unsigned OpNo, unsigned AsmVariant,
+ const char *ExtraCode, raw_ostream &O) {
+ if (ExtraCode && ExtraCode[0])
+ return true; // Unknown modifier
+
+ O << '[';
+ printMemOperand(MI, OpNo, O);
+ O << ']';
+
+ return false;
+}
+
+void NVPTXAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
+ raw_ostream &O, const char *Modifier) {
+ const MachineOperand &MO = MI->getOperand(opNum);
+ switch (MO.getType()) {
+ case MachineOperand::MO_Register:
+ if (TargetRegisterInfo::isPhysicalRegister(MO.getReg())) {
+ if (MO.getReg() == NVPTX::VRDepot)
+ O << DEPOTNAME << getFunctionNumber();
+ else
+ O << NVPTXInstPrinter::getRegisterName(MO.getReg());
+ } else {
+ emitVirtualRegister(MO.getReg(), O);
+ }
+ return;
+
+ case MachineOperand::MO_Immediate:
+ if (!Modifier)
+ O << MO.getImm();
+ else if (strstr(Modifier, "vec") == Modifier)
+ printVecModifiedImmediate(MO, Modifier, O);
+ else
+ llvm_unreachable(
+ "Don't know how to handle modifier on immediate operand");
+ return;
+
+ case MachineOperand::MO_FPImmediate:
+ printFPConstant(MO.getFPImm(), O);
+ break;
+
+ case MachineOperand::MO_GlobalAddress:
+ O << *getSymbol(MO.getGlobal());
+ break;
+
+ case MachineOperand::MO_MachineBasicBlock:
+ O << *MO.getMBB()->getSymbol();
+ return;
+
+ default:
+ llvm_unreachable("Operand type not supported.");
+ }
+}
+
+void NVPTXAsmPrinter::printMemOperand(const MachineInstr *MI, int opNum,
+ raw_ostream &O, const char *Modifier) {
+ printOperand(MI, opNum, O);
+
+ if (Modifier && !strcmp(Modifier, "add")) {
+ O << ", ";
+ printOperand(MI, opNum + 1, O);
+ } else {
+ if (MI->getOperand(opNum + 1).isImm() &&
+ MI->getOperand(opNum + 1).getImm() == 0)
+ return; // don't print ',0' or '+0'
+ O << "+";
+ printOperand(MI, opNum + 1, O);
+ }
+}
+
+
+// Force static initialization.
+extern "C" void LLVMInitializeNVPTXBackendAsmPrinter() {
+ RegisterAsmPrinter<NVPTXAsmPrinter> X(TheNVPTXTarget32);
+ RegisterAsmPrinter<NVPTXAsmPrinter> Y(TheNVPTXTarget64);
+}
+
+void NVPTXAsmPrinter::emitSrcInText(StringRef filename, unsigned line) {
+ std::stringstream temp;
+ LineReader *reader = this->getReader(filename.str());
+ temp << "\n//";
+ temp << filename.str();
+ temp << ":";
+ temp << line;
+ temp << " ";
+ temp << reader->readLine(line);
+ temp << "\n";
+ this->OutStreamer.EmitRawText(Twine(temp.str()));
+}
+
+LineReader *NVPTXAsmPrinter::getReader(std::string filename) {
+ if (!reader) {
+ reader = new LineReader(filename);
+ }
+
+ if (reader->fileName() != filename) {
+ delete reader;
+ reader = new LineReader(filename);
+ }
+
+ return reader;
+}
+
+std::string LineReader::readLine(unsigned lineNum) {
+ if (lineNum < theCurLine) {
+ theCurLine = 0;
+ fstr.seekg(0, std::ios::beg);
+ }
+ while (theCurLine < lineNum) {
+ fstr.getline(buff, 500);
+ theCurLine++;
+ }
+ return buff;
+}
+
+// Force static initialization.
+extern "C" void LLVMInitializeNVPTXAsmPrinter() {
+ RegisterAsmPrinter<NVPTXAsmPrinter> X(TheNVPTXTarget32);
+ RegisterAsmPrinter<NVPTXAsmPrinter> Y(TheNVPTXTarget64);
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h b/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h
new file mode 100644
index 000000000000..a9f9bdd6d3d8
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h
@@ -0,0 +1,333 @@
+//===-- NVPTXAsmPrinter.h - NVPTX LLVM assembly writer --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to NVPTX assembly language.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef NVPTXASMPRINTER_H
+#define NVPTXASMPRINTER_H
+
+#include "NVPTX.h"
+#include "NVPTXSubtarget.h"
+#include "NVPTXTargetMachine.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/IR/Function.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Target/TargetMachine.h"
+#include <fstream>
+
+// The PTX syntax and format are very different from what is usually seen in a
+// .s file, therefore we are not able to use the MCAsmStreamer interface here.
+//
+// We are handcrafting the output method here.
+//
+// A better approach is to clone the MCAsmStreamer to a MCPTXAsmStreamer
+// (subclass of MCStreamer).
+
+// This is defined in AsmPrinter.cpp.
+// Used to process the constant expressions in initializers.
+namespace nvptx {
+const llvm::MCExpr *
+LowerConstant(const llvm::Constant *CV, llvm::AsmPrinter &AP);
+}
+
+namespace llvm {
+
+class LineReader {
+private:
+ unsigned theCurLine;
+ std::ifstream fstr;
+ char buff[512];
+ std::string theFileName;
+ SmallVector<unsigned, 32> lineOffset;
+public:
+ LineReader(std::string filename) {
+ theCurLine = 0;
+ fstr.open(filename.c_str());
+ theFileName = filename;
+ }
+ std::string fileName() { return theFileName; }
+ ~LineReader() { fstr.close(); }
+ std::string readLine(unsigned line);
+};
+
+class LLVM_LIBRARY_VISIBILITY NVPTXAsmPrinter : public AsmPrinter {
+
+ class AggBuffer {
+ // Used to buffer the emitted string for initializing global
+ // aggregates.
+ //
+ // Normally an aggregate (array, vector or structure) is emitted
+ // as a u8[]. However, if one element/field of the aggregate
+ // is a non-NULL address, then the aggregate is emitted as u32[]
+ // or u64[].
+ //
+ // We first layout the aggregate in 'buffer' in bytes, except for
+ // those symbol addresses. For the i-th symbol address in the
+    // aggregate, its corresponding 4-byte or 8-byte elements in 'buffer'
+ // are filled with 0s. symbolPosInBuffer[i-1] records its position
+ // in 'buffer', and Symbols[i-1] records the Value*.
+ //
+ // Once we have this AggBuffer setup, we can choose how to print
+ // it out.
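+    //
+    // Illustrative example (hypothetical names): on a 64-bit target, a
+    // 16-byte aggregate whose first 8 bytes hold the address of @g is emitted
+    // roughly as ".global .u64 a[2] = {g, <numeric word>};".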
+ public:
+ unsigned size; // size of the buffer in bytes
+ unsigned char *buffer; // the buffer
+ unsigned numSymbols; // number of symbol addresses
+ SmallVector<unsigned, 4> symbolPosInBuffer;
+ SmallVector<const Value *, 4> Symbols;
+
+ private:
+ unsigned curpos;
+ raw_ostream &O;
+ NVPTXAsmPrinter &AP;
+ bool EmitGeneric;
+
+ public:
+ AggBuffer(unsigned _size, raw_ostream &_O, NVPTXAsmPrinter &_AP)
+ : O(_O), AP(_AP) {
+ buffer = new unsigned char[_size];
+ size = _size;
+ curpos = 0;
+ numSymbols = 0;
+ EmitGeneric = AP.EmitGeneric;
+ }
+ ~AggBuffer() { delete[] buffer; }
+ unsigned addBytes(unsigned char *Ptr, int Num, int Bytes) {
+ assert((curpos + Num) <= size);
+ assert((curpos + Bytes) <= size);
+ for (int i = 0; i < Num; ++i) {
+ buffer[curpos] = Ptr[i];
+ curpos++;
+ }
+ for (int i = Num; i < Bytes; ++i) {
+ buffer[curpos] = 0;
+ curpos++;
+ }
+ return curpos;
+ }
+ unsigned addZeros(int Num) {
+ assert((curpos + Num) <= size);
+ for (int i = 0; i < Num; ++i) {
+ buffer[curpos] = 0;
+ curpos++;
+ }
+ return curpos;
+ }
+ void addSymbol(const Value *GVar) {
+ symbolPosInBuffer.push_back(curpos);
+ Symbols.push_back(GVar);
+ numSymbols++;
+ }
+ void print() {
+ if (numSymbols == 0) {
+ // print out in bytes
+ for (unsigned i = 0; i < size; i++) {
+ if (i)
+ O << ", ";
+ O << (unsigned int) buffer[i];
+ }
+ } else {
+ // print out in 4-bytes or 8-bytes
+ unsigned int pos = 0;
+ unsigned int nSym = 0;
+ unsigned int nextSymbolPos = symbolPosInBuffer[nSym];
+ unsigned int nBytes = 4;
+ if (AP.nvptxSubtarget.is64Bit())
+ nBytes = 8;
+ for (pos = 0; pos < size; pos += nBytes) {
+ if (pos)
+ O << ", ";
+ if (pos == nextSymbolPos) {
+ const Value *v = Symbols[nSym];
+ if (const GlobalValue *GVar = dyn_cast<GlobalValue>(v)) {
+ MCSymbol *Name = AP.getSymbol(GVar);
+ PointerType *PTy = dyn_cast<PointerType>(GVar->getType());
+ bool IsNonGenericPointer = false;
+ if (PTy && PTy->getAddressSpace() != 0) {
+ IsNonGenericPointer = true;
+ }
+ if (EmitGeneric && !isa<Function>(v) && !IsNonGenericPointer) {
+ O << "generic(";
+ O << *Name;
+ O << ")";
+ } else {
+ O << *Name;
+ }
+ } else if (const ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(v)) {
+ O << *nvptx::LowerConstant(Cexpr, AP);
+ } else
+ llvm_unreachable("symbol type unknown");
+ nSym++;
+ if (nSym >= numSymbols)
+ nextSymbolPos = size + 1;
+ else
+ nextSymbolPos = symbolPosInBuffer[nSym];
+ } else if (nBytes == 4)
+ O << *(unsigned int *)(buffer + pos);
+ else
+ O << *(unsigned long long *)(buffer + pos);
+ }
+ }
+ }
+ };
+
+ friend class AggBuffer;
+
+ void emitSrcInText(StringRef filename, unsigned line);
+
+private:
+ const char *getPassName() const override { return "NVPTX Assembly Printer"; }
+
+ const Function *F;
+ std::string CurrentFnName;
+
+ void EmitFunctionEntryLabel() override;
+ void EmitFunctionBodyStart() override;
+ void EmitFunctionBodyEnd() override;
+ void emitImplicitDef(const MachineInstr *MI) const override;
+
+ void EmitInstruction(const MachineInstr *) override;
+ void lowerToMCInst(const MachineInstr *MI, MCInst &OutMI);
+ bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp);
+ MCOperand GetSymbolRef(const MCSymbol *Symbol);
+ unsigned encodeVirtualRegister(unsigned Reg);
+
+ void EmitAlignment(unsigned NumBits, const GlobalValue *GV = nullptr) const {}
+
+ void printVecModifiedImmediate(const MachineOperand &MO, const char *Modifier,
+ raw_ostream &O);
+ void printMemOperand(const MachineInstr *MI, int opNum, raw_ostream &O,
+ const char *Modifier = nullptr);
+ void printImplicitDef(const MachineInstr *MI, raw_ostream &O) const;
+ void printModuleLevelGV(const GlobalVariable *GVar, raw_ostream &O,
+ bool = false);
+ void printParamName(int paramIndex, raw_ostream &O);
+ void printParamName(Function::const_arg_iterator I, int paramIndex,
+ raw_ostream &O);
+ void emitGlobals(const Module &M);
+ void emitHeader(Module &M, raw_ostream &O);
+ void emitKernelFunctionDirectives(const Function &F, raw_ostream &O) const;
+ void emitVirtualRegister(unsigned int vr, raw_ostream &);
+ void emitFunctionExternParamList(const MachineFunction &MF);
+ void emitFunctionParamList(const Function *, raw_ostream &O);
+ void emitFunctionParamList(const MachineFunction &MF, raw_ostream &O);
+ void setAndEmitFunctionVirtualRegisters(const MachineFunction &MF);
+ void emitFunctionTempData(const MachineFunction &MF, unsigned &FrameSize);
+ bool isImageType(const Type *Ty);
+ void printReturnValStr(const Function *, raw_ostream &O);
+ void printReturnValStr(const MachineFunction &MF, raw_ostream &O);
+ bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant, const char *ExtraCode,
+ raw_ostream &) override;
+ void printOperand(const MachineInstr *MI, int opNum, raw_ostream &O,
+ const char *Modifier = nullptr);
+ bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant, const char *ExtraCode,
+ raw_ostream &) override;
+protected:
+ bool doInitialization(Module &M) override;
+ bool doFinalization(Module &M) override;
+
+private:
+ std::string CurrentBankselLabelInBasicBlock;
+
+ bool GlobalsEmitted;
+
+ // This is specific per MachineFunction.
+ const MachineRegisterInfo *MRI;
+ // The contents are specific for each
+ // MachineFunction. But the size of the
+ // array is not.
+ typedef DenseMap<unsigned, unsigned> VRegMap;
+ typedef DenseMap<const TargetRegisterClass *, VRegMap> VRegRCMap;
+ VRegRCMap VRegMapping;
+ // cache the subtarget here.
+ const NVPTXSubtarget &nvptxSubtarget;
+ // Build the map between type name and ID based on module's type
+ // symbol table.
+ std::map<const Type *, std::string> TypeNameMap;
+
+ // List of variables demoted to a function scope.
+ std::map<const Function *, std::vector<const GlobalVariable *> > localDecls;
+
+ // To record filename to ID mapping
+ std::map<std::string, unsigned> filenameMap;
+ void recordAndEmitFilenames(Module &);
+
+ void emitPTXGlobalVariable(const GlobalVariable *GVar, raw_ostream &O);
+ void emitPTXAddressSpace(unsigned int AddressSpace, raw_ostream &O) const;
+ std::string getPTXFundamentalTypeStr(const Type *Ty, bool = true) const;
+ void printScalarConstant(const Constant *CPV, raw_ostream &O);
+ void printFPConstant(const ConstantFP *Fp, raw_ostream &O);
+ void bufferLEByte(const Constant *CPV, int Bytes, AggBuffer *aggBuffer);
+ void bufferAggregateConstant(const Constant *CV, AggBuffer *aggBuffer);
+
+ void printOperandProper(const MachineOperand &MO);
+
+ void emitLinkageDirective(const GlobalValue *V, raw_ostream &O);
+ void emitDeclarations(const Module &, raw_ostream &O);
+ void emitDeclaration(const Function *, raw_ostream &O);
+
+ static const char *getRegisterName(unsigned RegNo);
+ void emitDemotedVars(const Function *, raw_ostream &);
+
+ bool lowerImageHandleOperand(const MachineInstr *MI, unsigned OpNo,
+ MCOperand &MCOp);
+ void lowerImageHandleSymbol(unsigned Index, MCOperand &MCOp);
+
+ LineReader *reader;
+ LineReader *getReader(std::string);
+
+ // Used to control the need to emit .generic() in the initializer of
+ // module scope variables.
+  // Although PTX supports a hybrid mode like the following,
+ // .global .u32 a;
+ // .global .u32 b;
+ // .global .u32 addr[] = {a, generic(b)}
+ // we have difficulty representing the difference in the NVVM IR.
+ //
+ // Since the address value should always be generic in CUDA C and always
+ // be specific in OpenCL, we use this simple control here.
+ //
+ bool EmitGeneric;
+
+public:
+ NVPTXAsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
+ : AsmPrinter(TM, Streamer),
+ nvptxSubtarget(TM.getSubtarget<NVPTXSubtarget>()) {
+ CurrentBankselLabelInBasicBlock = "";
+ reader = nullptr;
+ EmitGeneric = (nvptxSubtarget.getDrvInterface() == NVPTX::CUDA);
+ }
+
+ ~NVPTXAsmPrinter() {
+    if (reader)
+      delete reader;
+ }
+
+ bool ignoreLoc(const MachineInstr &);
+
+ std::string getVirtualRegisterName(unsigned) const;
+
+ DebugLoc prevDebugLoc;
+ void emitLineNumberAsDotLoc(const MachineInstr &);
+};
+} // end of namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXAssignValidGlobalNames.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXAssignValidGlobalNames.cpp
new file mode 100644
index 000000000000..962b12312026
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXAssignValidGlobalNames.cpp
@@ -0,0 +1,84 @@
+//===-- NVPTXAssignValidGlobalNames.cpp - Assign valid names to globals ---===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Clean up the names of global variables in the module to not contain symbols
+// that are invalid in PTX.
+//
+// Currently NVPTX, like other backends, relies on generic symbol name
+// sanitizing done by MC. However, the ptxas assembler is more stringent and
+// disallows some additional characters in symbol names. This pass makes sure
+// such names do not reach MC at all.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTX.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Module.h"
+#include "llvm/PassManager.h"
+#include "llvm/Support/raw_ostream.h"
+#include <string>
+
+using namespace llvm;
+
+namespace {
+/// \brief NVPTXAssignValidGlobalNames
+class NVPTXAssignValidGlobalNames : public ModulePass {
+public:
+ static char ID;
+ NVPTXAssignValidGlobalNames() : ModulePass(ID) {}
+
+ bool runOnModule(Module &M) override;
+
+ /// \brief Clean up the name to remove symbols invalid in PTX.
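+  /// For example, a local symbol named "foo.bar@baz" (a purely illustrative
+  /// name) would be rewritten to "foo_$_bar_$_baz".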
+ std::string cleanUpName(StringRef Name);
+};
+}
+
+char NVPTXAssignValidGlobalNames::ID = 0;
+
+namespace llvm {
+void initializeNVPTXAssignValidGlobalNamesPass(PassRegistry &);
+}
+
+INITIALIZE_PASS(NVPTXAssignValidGlobalNames, "nvptx-assign-valid-global-names",
+ "Assign valid PTX names to globals", false, false)
+
+bool NVPTXAssignValidGlobalNames::runOnModule(Module &M) {
+ for (GlobalVariable &GV : M.globals()) {
+ // We are only allowed to rename local symbols.
+ if (GV.hasLocalLinkage()) {
+ // setName doesn't do extra work if the name does not change.
+ // Note: this does not create collisions - if setName is asked to set the
+ // name to something that already exists, it adds a proper postfix to
+ // avoid collisions.
+ GV.setName(cleanUpName(GV.getName()));
+ }
+ }
+
+ return true;
+}
+
+std::string NVPTXAssignValidGlobalNames::cleanUpName(StringRef Name) {
+ std::string ValidName;
+ raw_string_ostream ValidNameStream(ValidName);
+ for (unsigned I = 0, E = Name.size(); I != E; ++I) {
+ char C = Name[I];
+ if (C == '.' || C == '@') {
+ ValidNameStream << "_$_";
+ } else {
+ ValidNameStream << C;
+ }
+ }
+
+ return ValidNameStream.str();
+}
+
+ModulePass *llvm::createNVPTXAssignValidGlobalNamesPass() {
+ return new NVPTXAssignValidGlobalNames();
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXFavorNonGenericAddrSpaces.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXFavorNonGenericAddrSpaces.cpp
new file mode 100644
index 000000000000..f3a095d829e4
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXFavorNonGenericAddrSpaces.cpp
@@ -0,0 +1,195 @@
+//===-- NVPTXFavorNonGenericAddrSpaces.cpp - --------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// When a load/store accesses the generic address space, this pass checks
+// whether the address is cast from a non-generic address space. If so, it
+// removes the addrspacecast, because accessing non-generic address spaces is
+// typically faster. Besides seeking addrspacecasts, it also traces into
+// the base pointer of a GEP.
+//
+// For instance, the code below loads a float from an array allocated in
+// addrspace(3).
+//
+// %0 = addrspacecast [10 x float] addrspace(3)* @a to [10 x float]*
+// %1 = gep [10 x float]* %0, i64 0, i64 %i
+// %2 = load float* %1 ; emits ld.f32
+//
+// First, function hoistAddrSpaceCastFromGEP reorders the addrspacecast
+// and the GEP to expose more optimization opportunities to function
+// optimizeMemoryInstruction. The intermediate code looks like:
+//
+// %0 = gep [10 x float] addrspace(3)* @a, i64 0, i64 %i
+// %1 = addrspacecast float addrspace(3)* %0 to float*
+// %2 = load float* %1 ; still emits ld.f32, but will be optimized shortly
+//
+// Then, function optimizeMemoryInstruction detects a load from addrspacecast'ed
+// generic pointers, and folds the load and the addrspacecast into a load from
+// the original address space. The final code looks like:
+//
+// %0 = gep [10 x float] addrspace(3)* @a, i64 0, i64 %i
+// %2 = load float addrspace(3)* %0 ; emits ld.shared.f32
+//
+// This pass may remove an addrspacecast in a different BB. Therefore, we
+// implement it as a FunctionPass.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTX.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Support/CommandLine.h"
+
+using namespace llvm;
+
+// An option to disable this optimization, which is enabled by default.
+static cl::opt<bool> DisableFavorNonGeneric(
+ "disable-nvptx-favor-non-generic",
+ cl::init(false),
+ cl::desc("Do not convert generic address space usage "
+ "to non-generic address space usage"),
+ cl::Hidden);
+
+namespace {
+/// \brief NVPTXFavorNonGenericAddrSpaces
+class NVPTXFavorNonGenericAddrSpaces : public FunctionPass {
+public:
+ static char ID;
+ NVPTXFavorNonGenericAddrSpaces() : FunctionPass(ID) {}
+
+ bool runOnFunction(Function &F) override;
+
+ /// Optimizes load/store instructions. Idx is the index of the pointer operand
+ /// (0 for load, and 1 for store). Returns true if it changes anything.
+ bool optimizeMemoryInstruction(Instruction *I, unsigned Idx);
+ /// Transforms "gep (addrspacecast X), indices" into "addrspacecast (gep X,
+ /// indices)". This reordering exposes to optimizeMemoryInstruction more
+ /// optimization opportunities on loads and stores. Returns true if it changes
+ /// the program.
+ bool hoistAddrSpaceCastFromGEP(GEPOperator *GEP);
+};
+}
+
+char NVPTXFavorNonGenericAddrSpaces::ID = 0;
+
+namespace llvm {
+void initializeNVPTXFavorNonGenericAddrSpacesPass(PassRegistry &);
+}
+INITIALIZE_PASS(NVPTXFavorNonGenericAddrSpaces, "nvptx-favor-non-generic",
+ "Remove unnecessary non-generic-to-generic addrspacecasts",
+ false, false)
+
+// Decides whether removing Cast is valid and beneficial. Cast can be an
+// instruction or a constant expression.
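+// For example, "%p = addrspacecast float addrspace(3)* %q to float*" (an
+// illustrative shared-to-generic cast) is eliminable, whereas a cast in the
+// opposite direction, or one that also changes the pointee type, is not.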
+static bool IsEliminableAddrSpaceCast(Operator *Cast) {
+ // Returns false if not even an addrspacecast.
+ if (Cast->getOpcode() != Instruction::AddrSpaceCast)
+ return false;
+
+ Value *Src = Cast->getOperand(0);
+ PointerType *SrcTy = cast<PointerType>(Src->getType());
+ PointerType *DestTy = cast<PointerType>(Cast->getType());
+ // TODO: For now, we only handle the case where the addrspacecast only changes
+ // the address space but not the type. If the type also changes, we could
+ // still get rid of the addrspacecast by adding an extra bitcast, but we
+ // rarely see such scenarios.
+ if (SrcTy->getElementType() != DestTy->getElementType())
+ return false;
+
+ // Checks whether the addrspacecast is from a non-generic address space to the
+ // generic address space.
+ return (SrcTy->getAddressSpace() != AddressSpace::ADDRESS_SPACE_GENERIC &&
+ DestTy->getAddressSpace() == AddressSpace::ADDRESS_SPACE_GENERIC);
+}
+
+bool NVPTXFavorNonGenericAddrSpaces::hoistAddrSpaceCastFromGEP(
+ GEPOperator *GEP) {
+ Operator *Cast = dyn_cast<Operator>(GEP->getPointerOperand());
+ if (!Cast)
+ return false;
+
+ if (!IsEliminableAddrSpaceCast(Cast))
+ return false;
+
+ SmallVector<Value *, 8> Indices(GEP->idx_begin(), GEP->idx_end());
+ if (Instruction *GEPI = dyn_cast<Instruction>(GEP)) {
+ // %1 = gep (addrspacecast X), indices
+ // =>
+ // %0 = gep X, indices
+ // %1 = addrspacecast %0
+ GetElementPtrInst *NewGEPI = GetElementPtrInst::Create(Cast->getOperand(0),
+ Indices,
+ GEP->getName(),
+ GEPI);
+ NewGEPI->setIsInBounds(GEP->isInBounds());
+ GEP->replaceAllUsesWith(
+ new AddrSpaceCastInst(NewGEPI, GEP->getType(), "", GEPI));
+ } else {
+ // GEP is a constant expression.
+ Constant *NewGEPCE = ConstantExpr::getGetElementPtr(
+ cast<Constant>(Cast->getOperand(0)),
+ Indices,
+ GEP->isInBounds());
+ GEP->replaceAllUsesWith(
+ ConstantExpr::getAddrSpaceCast(NewGEPCE, GEP->getType()));
+ }
+
+ return true;
+}
+
+bool NVPTXFavorNonGenericAddrSpaces::optimizeMemoryInstruction(Instruction *MI,
+ unsigned Idx) {
+ // If the pointer operand is a GEP, hoist the addrspacecast if any from the
+  // GEP to expose more optimization opportunities.
+ if (GEPOperator *GEP = dyn_cast<GEPOperator>(MI->getOperand(Idx))) {
+ hoistAddrSpaceCastFromGEP(GEP);
+ }
+
+ // load/store (addrspacecast X) => load/store X if shortcutting the
+ // addrspacecast is valid and can improve performance.
+ //
+ // e.g.,
+ // %1 = addrspacecast float addrspace(3)* %0 to float*
+ // %2 = load float* %1
+ // ->
+ // %2 = load float addrspace(3)* %0
+ //
+ // Note: the addrspacecast can also be a constant expression.
+ if (Operator *Cast = dyn_cast<Operator>(MI->getOperand(Idx))) {
+ if (IsEliminableAddrSpaceCast(Cast)) {
+ MI->setOperand(Idx, Cast->getOperand(0));
+ return true;
+ }
+ }
+
+ return false;
+}
+
+bool NVPTXFavorNonGenericAddrSpaces::runOnFunction(Function &F) {
+ if (DisableFavorNonGeneric)
+ return false;
+
+ bool Changed = false;
+ for (Function::iterator B = F.begin(), BE = F.end(); B != BE; ++B) {
+ for (BasicBlock::iterator I = B->begin(), IE = B->end(); I != IE; ++I) {
+ if (isa<LoadInst>(I)) {
+ // V = load P
+ Changed |= optimizeMemoryInstruction(I, 0);
+ } else if (isa<StoreInst>(I)) {
+ // store V, P
+ Changed |= optimizeMemoryInstruction(I, 1);
+ }
+ }
+ }
+ return Changed;
+}
+
+FunctionPass *llvm::createNVPTXFavorNonGenericAddrSpacesPass() {
+ return new NVPTXFavorNonGenericAddrSpaces();
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXFrameLowering.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXFrameLowering.cpp
new file mode 100644
index 000000000000..8b088412dbba
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXFrameLowering.cpp
@@ -0,0 +1,81 @@
+//=======- NVPTXFrameLowering.cpp - NVPTX Frame Information ---*- C++ -*-=====//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the NVPTX implementation of TargetFrameLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTXFrameLowering.h"
+#include "NVPTX.h"
+#include "NVPTXRegisterInfo.h"
+#include "NVPTXSubtarget.h"
+#include "NVPTXTargetMachine.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/MC/MachineLocation.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+using namespace llvm;
+
+NVPTXFrameLowering::NVPTXFrameLowering(NVPTXSubtarget &STI)
+ : TargetFrameLowering(TargetFrameLowering::StackGrowsUp, 8, 0),
+ is64bit(STI.is64Bit()) {}
+
+bool NVPTXFrameLowering::hasFP(const MachineFunction &MF) const { return true; }
+
+void NVPTXFrameLowering::emitPrologue(MachineFunction &MF) const {
+ if (MF.getFrameInfo()->hasStackObjects()) {
+ MachineBasicBlock &MBB = MF.front();
+ // Insert "mov.u32 %SP, %Depot"
+ MachineBasicBlock::iterator MBBI = MBB.begin();
+    // This instruction really occurs before the first instruction
+    // in the BB, so give it no debug location.
+ DebugLoc dl = DebugLoc();
+
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ // mov %SPL, %depot;
+ // cvta.local %SP, %SPL;
+ if (is64bit) {
+ unsigned LocalReg = MRI.createVirtualRegister(&NVPTX::Int64RegsRegClass);
+ MachineInstr *MI =
+ BuildMI(MBB, MBBI, dl,
+ MF.getTarget().getInstrInfo()->get(NVPTX::cvta_local_yes_64),
+ NVPTX::VRFrame).addReg(LocalReg);
+ BuildMI(MBB, MI, dl,
+ MF.getTarget().getInstrInfo()->get(NVPTX::MOV_DEPOT_ADDR_64),
+ LocalReg).addImm(MF.getFunctionNumber());
+ } else {
+ unsigned LocalReg = MRI.createVirtualRegister(&NVPTX::Int32RegsRegClass);
+ MachineInstr *MI =
+ BuildMI(MBB, MBBI, dl,
+ MF.getTarget().getInstrInfo()->get(NVPTX::cvta_local_yes),
+ NVPTX::VRFrame).addReg(LocalReg);
+ BuildMI(MBB, MI, dl,
+ MF.getTarget().getInstrInfo()->get(NVPTX::MOV_DEPOT_ADDR),
+ LocalReg).addImm(MF.getFunctionNumber());
+ }
+ }
+}
+
+void NVPTXFrameLowering::emitEpilogue(MachineFunction &MF,
+ MachineBasicBlock &MBB) const {}
+
+// This function eliminates ADJCALLSTACKDOWN,
+// ADJCALLSTACKUP pseudo instructions
+void NVPTXFrameLowering::eliminateCallFramePseudoInstr(
+ MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const {
+ // Simply discard ADJCALLSTACKDOWN,
+ // ADJCALLSTACKUP instructions.
+ MBB.erase(I);
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXFrameLowering.h b/contrib/llvm/lib/Target/NVPTX/NVPTXFrameLowering.h
new file mode 100644
index 000000000000..56fb673de0eb
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXFrameLowering.h
@@ -0,0 +1,38 @@
+//===--- NVPTXFrameLowering.h - Define frame lowering for NVPTX -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef NVPTX_FRAMELOWERING_H
+#define NVPTX_FRAMELOWERING_H
+
+#include "llvm/Target/TargetFrameLowering.h"
+
+namespace llvm {
+class NVPTXSubtarget;
+class NVPTXFrameLowering : public TargetFrameLowering {
+ bool is64bit;
+
+public:
+ explicit NVPTXFrameLowering(NVPTXSubtarget &STI);
+
+ bool hasFP(const MachineFunction &MF) const override;
+ void emitPrologue(MachineFunction &MF) const override;
+ void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+
+ void eliminateCallFramePseudoInstr(MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const override;
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp
new file mode 100644
index 000000000000..faa9fdb424b6
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp
@@ -0,0 +1,433 @@
+//===-- GenericToNVVM.cpp - Convert generic module to NVVM module - C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Convert generic global variables into either .global or .const access based
+// on the variable's "constant" qualifier.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTX.h"
+#include "MCTargetDesc/NVPTXBaseInfo.h"
+#include "NVPTXUtilities.h"
+#include "llvm/CodeGen/MachineFunctionAnalysis.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/ValueMap.h"
+#include "llvm/PassManager.h"
+
+using namespace llvm;
+
+namespace llvm {
+void initializeGenericToNVVMPass(PassRegistry &);
+}
+
+namespace {
+class GenericToNVVM : public ModulePass {
+public:
+ static char ID;
+
+ GenericToNVVM() : ModulePass(ID) {}
+
+ bool runOnModule(Module &M) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {}
+
+private:
+ Value *getOrInsertCVTA(Module *M, Function *F, GlobalVariable *GV,
+ IRBuilder<> &Builder);
+ Value *remapConstant(Module *M, Function *F, Constant *C,
+ IRBuilder<> &Builder);
+ Value *remapConstantVectorOrConstantAggregate(Module *M, Function *F,
+ Constant *C,
+ IRBuilder<> &Builder);
+ Value *remapConstantExpr(Module *M, Function *F, ConstantExpr *C,
+ IRBuilder<> &Builder);
+ void remapNamedMDNode(Module *M, NamedMDNode *N);
+ MDNode *remapMDNode(Module *M, MDNode *N);
+
+ typedef ValueMap<GlobalVariable *, GlobalVariable *> GVMapTy;
+ typedef ValueMap<Constant *, Value *> ConstantToValueMapTy;
+ GVMapTy GVMap;
+ ConstantToValueMapTy ConstantToValueMap;
+};
+} // end namespace
+
+char GenericToNVVM::ID = 0;
+
+ModulePass *llvm::createGenericToNVVMPass() { return new GenericToNVVM(); }
+
+INITIALIZE_PASS(
+ GenericToNVVM, "generic-to-nvvm",
+ "Ensure that the global variables are in the global address space", false,
+ false)
+
+bool GenericToNVVM::runOnModule(Module &M) {
+ // Create a clone of each global variable that has the default address space.
+ // The clone is created with the global address space specifier, and the pair
+ // of original global variable and its clone is placed in the GVMap for later
+ // use.
+
+ for (Module::global_iterator I = M.global_begin(), E = M.global_end();
+ I != E;) {
+ GlobalVariable *GV = I++;
+ if (GV->getType()->getAddressSpace() == llvm::ADDRESS_SPACE_GENERIC &&
+ !llvm::isTexture(*GV) && !llvm::isSurface(*GV) &&
+ !llvm::isSampler(*GV) && !GV->getName().startswith("llvm.")) {
+ GlobalVariable *NewGV = new GlobalVariable(
+ M, GV->getType()->getElementType(), GV->isConstant(),
+ GV->getLinkage(),
+ GV->hasInitializer() ? GV->getInitializer() : nullptr,
+ "", GV, GV->getThreadLocalMode(), llvm::ADDRESS_SPACE_GLOBAL);
+ NewGV->copyAttributesFrom(GV);
+ GVMap[GV] = NewGV;
+ }
+ }
+
+  // Return immediately if every global variable already has a specific address
+  // space specifier.
+ if (GVMap.empty()) {
+ return false;
+ }
+
+  // Walk through the instructions in function definitions, and replace any use
+ // of original global variables in GVMap with a use of the corresponding
+ // copies in GVMap. If necessary, promote constants to instructions.
+ for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) {
+ if (I->isDeclaration()) {
+ continue;
+ }
+ IRBuilder<> Builder(I->getEntryBlock().getFirstNonPHIOrDbg());
+ for (Function::iterator BBI = I->begin(), BBE = I->end(); BBI != BBE;
+ ++BBI) {
+ for (BasicBlock::iterator II = BBI->begin(), IE = BBI->end(); II != IE;
+ ++II) {
+ for (unsigned i = 0, e = II->getNumOperands(); i < e; ++i) {
+ Value *Operand = II->getOperand(i);
+ if (isa<Constant>(Operand)) {
+ II->setOperand(
+ i, remapConstant(&M, I, cast<Constant>(Operand), Builder));
+ }
+ }
+ }
+ }
+ ConstantToValueMap.clear();
+ }
+
+ // Walk through the metadata section and update the debug information
+ // associated with the global variables in the default address space.
+ for (Module::named_metadata_iterator I = M.named_metadata_begin(),
+ E = M.named_metadata_end();
+ I != E; I++) {
+ remapNamedMDNode(&M, I);
+ }
+
+ // Walk through the global variable initializers, and replace any use of
+ // original global variables in GVMap with a use of the corresponding copies
+ // in GVMap. The copies need to be bitcast to the original global variable
+ // types, as we cannot use cvta in global variable initializers.
+ for (GVMapTy::iterator I = GVMap.begin(), E = GVMap.end(); I != E;) {
+ GlobalVariable *GV = I->first;
+ GlobalVariable *NewGV = I->second;
+ ++I;
+ Constant *BitCastNewGV = ConstantExpr::getPointerCast(NewGV, GV->getType());
+ // At this point, the remaining uses of GV should be found only in global
+    // variable initializers, as other uses have already been removed
+ // while walking through the instructions in function definitions.
+ for (Value::use_iterator UI = GV->use_begin(), UE = GV->use_end();
+ UI != UE;)
+ (UI++)->set(BitCastNewGV);
+ std::string Name = GV->getName();
+ GV->removeDeadConstantUsers();
+ GV->eraseFromParent();
+ NewGV->setName(Name);
+ }
+ GVMap.clear();
+
+ return true;
+}
+
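+// As a rough sketch (the names and types below are illustrative), for a global
+// whose element type is not a plain integer or floating-point type the code
+// built by getOrInsertCVTA looks like:
+//   %0 = bitcast <ty> addrspace(1)* @gv.clone to i8 addrspace(1)*
+//   %1 = call i8* @llvm.nvvm.ptr.global.to.gen(i8 addrspace(1)* %0)
+//   %2 = bitcast i8* %1 to <ty>*
+// For integer and floating-point globals a single intrinsic call suffices.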
+Value *GenericToNVVM::getOrInsertCVTA(Module *M, Function *F,
+ GlobalVariable *GV,
+ IRBuilder<> &Builder) {
+ PointerType *GVType = GV->getType();
+ Value *CVTA = nullptr;
+
+ // See if the address space conversion requires the operand to be bitcast
+ // to i8 addrspace(n)* first.
+ EVT ExtendedGVType = EVT::getEVT(GVType->getElementType(), true);
+ if (!ExtendedGVType.isInteger() && !ExtendedGVType.isFloatingPoint()) {
+ // A bitcast to i8 addrspace(n)* on the operand is needed.
+ LLVMContext &Context = M->getContext();
+ unsigned int AddrSpace = GVType->getAddressSpace();
+ Type *DestTy = PointerType::get(Type::getInt8Ty(Context), AddrSpace);
+ CVTA = Builder.CreateBitCast(GV, DestTy, "cvta");
+ // Insert the address space conversion.
+ Type *ResultType =
+ PointerType::get(Type::getInt8Ty(Context), llvm::ADDRESS_SPACE_GENERIC);
+ SmallVector<Type *, 2> ParamTypes;
+ ParamTypes.push_back(ResultType);
+ ParamTypes.push_back(DestTy);
+ Function *CVTAFunction = Intrinsic::getDeclaration(
+ M, Intrinsic::nvvm_ptr_global_to_gen, ParamTypes);
+ CVTA = Builder.CreateCall(CVTAFunction, CVTA, "cvta");
+ // Another bitcast from i8 * to <the element type of GVType> * is
+ // required.
+ DestTy =
+ PointerType::get(GVType->getElementType(), llvm::ADDRESS_SPACE_GENERIC);
+ CVTA = Builder.CreateBitCast(CVTA, DestTy, "cvta");
+ } else {
+ // A simple CVTA is enough.
+ SmallVector<Type *, 2> ParamTypes;
+ ParamTypes.push_back(PointerType::get(GVType->getElementType(),
+ llvm::ADDRESS_SPACE_GENERIC));
+ ParamTypes.push_back(GVType);
+ Function *CVTAFunction = Intrinsic::getDeclaration(
+ M, Intrinsic::nvvm_ptr_global_to_gen, ParamTypes);
+ CVTA = Builder.CreateCall(CVTAFunction, GV, "cvta");
+ }
+
+ return CVTA;
+}
+
+Value *GenericToNVVM::remapConstant(Module *M, Function *F, Constant *C,
+ IRBuilder<> &Builder) {
+ // If the constant C has been converted already in the given function F, just
+ // return the converted value.
+ ConstantToValueMapTy::iterator CTII = ConstantToValueMap.find(C);
+ if (CTII != ConstantToValueMap.end()) {
+ return CTII->second;
+ }
+
+ Value *NewValue = C;
+ if (isa<GlobalVariable>(C)) {
+ // If the constant C is a global variable and is found in GVMap, generate a
+    // set of instructions that convert the clone of C with the global
+ // address space specifier to a generic pointer.
+ // The constant C cannot be used here, as it will be erased from the
+ // module eventually. And the clone of C with the global address space
+ // specifier cannot be used here either, as it will affect the types of
+ // other instructions in the function. Hence, this address space conversion
+ // is required.
+ GVMapTy::iterator I = GVMap.find(cast<GlobalVariable>(C));
+ if (I != GVMap.end()) {
+ NewValue = getOrInsertCVTA(M, F, I->second, Builder);
+ }
+ } else if (isa<ConstantVector>(C) || isa<ConstantArray>(C) ||
+ isa<ConstantStruct>(C)) {
+ // If any element in the constant vector or aggregate C is or uses a global
+ // variable in GVMap, the constant C needs to be reconstructed, using a set
+ // of instructions.
+ NewValue = remapConstantVectorOrConstantAggregate(M, F, C, Builder);
+ } else if (isa<ConstantExpr>(C)) {
+ // If any operand in the constant expression C is or uses a global variable
+ // in GVMap, the constant expression C needs to be reconstructed, using a
+ // set of instructions.
+ NewValue = remapConstantExpr(M, F, cast<ConstantExpr>(C), Builder);
+ }
+
+ ConstantToValueMap[C] = NewValue;
+ return NewValue;
+}
+
+Value *GenericToNVVM::remapConstantVectorOrConstantAggregate(
+ Module *M, Function *F, Constant *C, IRBuilder<> &Builder) {
+ bool OperandChanged = false;
+ SmallVector<Value *, 4> NewOperands;
+ unsigned NumOperands = C->getNumOperands();
+
+  // Check if any element is or uses a global variable in GVMap, and thus needs
+  // to be converted to another value.
+ for (unsigned i = 0; i < NumOperands; ++i) {
+ Value *Operand = C->getOperand(i);
+ Value *NewOperand = remapConstant(M, F, cast<Constant>(Operand), Builder);
+ OperandChanged |= Operand != NewOperand;
+ NewOperands.push_back(NewOperand);
+ }
+
+ // If none of the elements has been modified, return C as it is.
+ if (!OperandChanged) {
+ return C;
+ }
+
+ // If any of the elements has been modified, construct the equivalent
+  // vector or aggregate value with a set of instructions and the converted
+ // elements.
+ Value *NewValue = UndefValue::get(C->getType());
+ if (isa<ConstantVector>(C)) {
+ for (unsigned i = 0; i < NumOperands; ++i) {
+ Value *Idx = ConstantInt::get(Type::getInt32Ty(M->getContext()), i);
+ NewValue = Builder.CreateInsertElement(NewValue, NewOperands[i], Idx);
+ }
+ } else {
+ for (unsigned i = 0; i < NumOperands; ++i) {
+ NewValue =
+ Builder.CreateInsertValue(NewValue, NewOperands[i], makeArrayRef(i));
+ }
+ }
+
+ return NewValue;
+}
+
+Value *GenericToNVVM::remapConstantExpr(Module *M, Function *F, ConstantExpr *C,
+ IRBuilder<> &Builder) {
+ bool OperandChanged = false;
+ SmallVector<Value *, 4> NewOperands;
+ unsigned NumOperands = C->getNumOperands();
+
+  // Check if any operand is or uses a global variable in GVMap, and thus needs
+  // to be converted to another value.
+ for (unsigned i = 0; i < NumOperands; ++i) {
+ Value *Operand = C->getOperand(i);
+ Value *NewOperand = remapConstant(M, F, cast<Constant>(Operand), Builder);
+ OperandChanged |= Operand != NewOperand;
+ NewOperands.push_back(NewOperand);
+ }
+
+ // If none of the operands has been modified, return C as it is.
+ if (!OperandChanged) {
+ return C;
+ }
+
+ // If any of the operands has been modified, construct the instruction with
+ // the converted operands.
+ unsigned Opcode = C->getOpcode();
+ switch (Opcode) {
+ case Instruction::ICmp:
+ // CompareConstantExpr (icmp)
+ return Builder.CreateICmp(CmpInst::Predicate(C->getPredicate()),
+ NewOperands[0], NewOperands[1]);
+ case Instruction::FCmp:
+ // CompareConstantExpr (fcmp)
+ assert(false && "Address space conversion should have no effect "
+ "on float point CompareConstantExpr (fcmp)!");
+ return C;
+ case Instruction::ExtractElement:
+ // ExtractElementConstantExpr
+ return Builder.CreateExtractElement(NewOperands[0], NewOperands[1]);
+ case Instruction::InsertElement:
+ // InsertElementConstantExpr
+ return Builder.CreateInsertElement(NewOperands[0], NewOperands[1],
+ NewOperands[2]);
+ case Instruction::ShuffleVector:
+ // ShuffleVector
+ return Builder.CreateShuffleVector(NewOperands[0], NewOperands[1],
+ NewOperands[2]);
+ case Instruction::ExtractValue:
+ // ExtractValueConstantExpr
+ return Builder.CreateExtractValue(NewOperands[0], C->getIndices());
+ case Instruction::InsertValue:
+ // InsertValueConstantExpr
+ return Builder.CreateInsertValue(NewOperands[0], NewOperands[1],
+ C->getIndices());
+ case Instruction::GetElementPtr:
+ // GetElementPtrConstantExpr
+    return cast<GEPOperator>(C)->isInBounds()
+               ? Builder.CreateInBoundsGEP(
+                     NewOperands[0],
+                     makeArrayRef(&NewOperands[1], NumOperands - 1))
+               : Builder.CreateGEP(
+                     NewOperands[0],
+                     makeArrayRef(&NewOperands[1], NumOperands - 1));
+ case Instruction::Select:
+ // SelectConstantExpr
+ return Builder.CreateSelect(NewOperands[0], NewOperands[1], NewOperands[2]);
+ default:
+ // BinaryConstantExpr
+ if (Instruction::isBinaryOp(Opcode)) {
+ return Builder.CreateBinOp(Instruction::BinaryOps(C->getOpcode()),
+ NewOperands[0], NewOperands[1]);
+ }
+ // UnaryConstantExpr
+ if (Instruction::isCast(Opcode)) {
+ return Builder.CreateCast(Instruction::CastOps(C->getOpcode()),
+ NewOperands[0], C->getType());
+ }
+ assert(false && "GenericToNVVM encountered an unsupported ConstantExpr");
+ return C;
+ }
+}
+
+void GenericToNVVM::remapNamedMDNode(Module *M, NamedMDNode *N) {
+
+ bool OperandChanged = false;
+ SmallVector<MDNode *, 16> NewOperands;
+ unsigned NumOperands = N->getNumOperands();
+
+  // Check if any operand is or contains a global variable in GVMap, and thus
+  // needs to be converted to another value.
+ for (unsigned i = 0; i < NumOperands; ++i) {
+ MDNode *Operand = N->getOperand(i);
+ MDNode *NewOperand = remapMDNode(M, Operand);
+ OperandChanged |= Operand != NewOperand;
+ NewOperands.push_back(NewOperand);
+ }
+
+ // If none of the operands has been modified, return immediately.
+ if (!OperandChanged) {
+ return;
+ }
+
+ // Replace the old operands with the new operands.
+ N->dropAllReferences();
+ for (SmallVectorImpl<MDNode *>::iterator I = NewOperands.begin(),
+ E = NewOperands.end();
+ I != E; ++I) {
+ N->addOperand(*I);
+ }
+}
+
+MDNode *GenericToNVVM::remapMDNode(Module *M, MDNode *N) {
+
+ bool OperandChanged = false;
+ SmallVector<Value *, 8> NewOperands;
+ unsigned NumOperands = N->getNumOperands();
+
+  // Check if any operand is or contains a global variable in GVMap, and thus
+  // needs to be converted to another value.
+ for (unsigned i = 0; i < NumOperands; ++i) {
+ Value *Operand = N->getOperand(i);
+ Value *NewOperand = Operand;
+ if (Operand) {
+ if (isa<GlobalVariable>(Operand)) {
+ GVMapTy::iterator I = GVMap.find(cast<GlobalVariable>(Operand));
+ if (I != GVMap.end()) {
+ NewOperand = I->second;
+ if (++i < NumOperands) {
+ NewOperands.push_back(NewOperand);
+ // Address space of the global variable follows the global variable
+ // in the global variable debug info (see createGlobalVariable in
+ // lib/Analysis/DIBuilder.cpp).
+ NewOperand =
+ ConstantInt::get(Type::getInt32Ty(M->getContext()),
+ I->second->getType()->getAddressSpace());
+ }
+ }
+ } else if (isa<MDNode>(Operand)) {
+ NewOperand = remapMDNode(M, cast<MDNode>(Operand));
+ }
+ }
+ OperandChanged |= Operand != NewOperand;
+ NewOperands.push_back(NewOperand);
+ }
+
+ // If none of the operands has been modified, return N as it is.
+ if (!OperandChanged) {
+ return N;
+ }
+
+ // If any of the operands has been modified, create a new MDNode with the new
+ // operands.
+ return MDNode::get(M->getContext(), makeArrayRef(NewOperands));
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
new file mode 100644
index 000000000000..05205fba1aff
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -0,0 +1,5085 @@
+//===-- NVPTXISelDAGToDAG.cpp - A dag to dag inst selector for NVPTX ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines an instruction selector for the NVPTX target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTXISelDAGToDAG.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetIntrinsicInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "nvptx-isel"
+
+static cl::opt<int> UsePrecDivF32(
+ "nvptx-prec-divf32", cl::ZeroOrMore, cl::Hidden,
+ cl::desc("NVPTX Specifies: 0 use div.approx, 1 use div.full, 2 use"
+ " IEEE Compliant F32 div.rnd if avaiable."),
+ cl::init(2));
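+// As an illustration (not an exhaustive reference), an llc invocation such as
+// "llc -march=nvptx64 -nvptx-prec-divf32=0 ..." would force div.approx even
+// when fast math is disabled; see getDivF32Level() below.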
+
+static cl::opt<bool>
+UsePrecSqrtF32("nvptx-prec-sqrtf32", cl::Hidden,
+ cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."),
+ cl::init(true));
+
+static cl::opt<bool>
+FtzEnabled("nvptx-f32ftz", cl::ZeroOrMore, cl::Hidden,
+ cl::desc("NVPTX Specific: Flush f32 subnormals to sign-preserving zero."),
+ cl::init(false));
+
+
+/// createNVPTXISelDag - This pass converts a legalized DAG into an
+/// NVPTX-specific DAG, ready for instruction scheduling.
+FunctionPass *llvm::createNVPTXISelDag(NVPTXTargetMachine &TM,
+ llvm::CodeGenOpt::Level OptLevel) {
+ return new NVPTXDAGToDAGISel(TM, OptLevel);
+}
+
+NVPTXDAGToDAGISel::NVPTXDAGToDAGISel(NVPTXTargetMachine &tm,
+ CodeGenOpt::Level OptLevel)
+ : SelectionDAGISel(tm, OptLevel),
+ Subtarget(tm.getSubtarget<NVPTXSubtarget>()) {
+ doMulWide = (OptLevel > 0);
+}
+
+int NVPTXDAGToDAGISel::getDivF32Level() const {
+ if (UsePrecDivF32.getNumOccurrences() > 0) {
+    // If nvptx-prec-divf32=N is used on the command-line, always honor it
+ return UsePrecDivF32;
+ } else {
+ // Otherwise, use div.approx if fast math is enabled
+ if (TM.Options.UnsafeFPMath)
+ return 0;
+ else
+ return 2;
+ }
+}
+
+bool NVPTXDAGToDAGISel::usePrecSqrtF32() const {
+ if (UsePrecSqrtF32.getNumOccurrences() > 0) {
+ // If nvptx-prec-sqrtf32 is used on the command-line, always honor it
+ return UsePrecSqrtF32;
+ } else {
+ // Otherwise, use sqrt.approx if fast math is enabled
+ if (TM.Options.UnsafeFPMath)
+ return false;
+ else
+ return true;
+ }
+}
+
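+// As a sketch of the behavior below: a function carrying the string attribute
+// "nvptx-f32ftz"="true" (e.g. set by a front end; illustrative) makes
+// useF32FTZ() return true unless -nvptx-f32ftz overrides it on the command
+// line.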
+bool NVPTXDAGToDAGISel::useF32FTZ() const {
+ if (FtzEnabled.getNumOccurrences() > 0) {
+ // If nvptx-f32ftz is used on the command-line, always honor it
+ return FtzEnabled;
+ } else {
+ const Function *F = MF->getFunction();
+ // Otherwise, check for an nvptx-f32ftz attribute on the function
+ if (F->hasFnAttribute("nvptx-f32ftz"))
+ return (F->getAttributes().getAttribute(AttributeSet::FunctionIndex,
+ "nvptx-f32ftz")
+ .getValueAsString() == "true");
+ else
+ return false;
+ }
+}
+
+bool NVPTXDAGToDAGISel::allowFMA() const {
+ const NVPTXTargetLowering *TL = Subtarget.getTargetLowering();
+ return TL->allowFMA(*MF, OptLevel);
+}
+
+/// Select - Select instructions not customized! Used for
+/// expanded, promoted and normal instructions.
+SDNode *NVPTXDAGToDAGISel::Select(SDNode *N) {
+
+ if (N->isMachineOpcode()) {
+ N->setNodeId(-1);
+ return nullptr; // Already selected.
+ }
+
+ SDNode *ResNode = nullptr;
+ switch (N->getOpcode()) {
+ case ISD::LOAD:
+ ResNode = SelectLoad(N);
+ break;
+ case ISD::STORE:
+ ResNode = SelectStore(N);
+ break;
+ case NVPTXISD::LoadV2:
+ case NVPTXISD::LoadV4:
+ ResNode = SelectLoadVector(N);
+ break;
+ case NVPTXISD::LDGV2:
+ case NVPTXISD::LDGV4:
+ case NVPTXISD::LDUV2:
+ case NVPTXISD::LDUV4:
+ ResNode = SelectLDGLDU(N);
+ break;
+ case NVPTXISD::StoreV2:
+ case NVPTXISD::StoreV4:
+ ResNode = SelectStoreVector(N);
+ break;
+ case NVPTXISD::LoadParam:
+ case NVPTXISD::LoadParamV2:
+ case NVPTXISD::LoadParamV4:
+ ResNode = SelectLoadParam(N);
+ break;
+ case NVPTXISD::StoreRetval:
+ case NVPTXISD::StoreRetvalV2:
+ case NVPTXISD::StoreRetvalV4:
+ ResNode = SelectStoreRetval(N);
+ break;
+ case NVPTXISD::StoreParam:
+ case NVPTXISD::StoreParamV2:
+ case NVPTXISD::StoreParamV4:
+ case NVPTXISD::StoreParamS32:
+ case NVPTXISD::StoreParamU32:
+ ResNode = SelectStoreParam(N);
+ break;
+ case ISD::INTRINSIC_WO_CHAIN:
+ ResNode = SelectIntrinsicNoChain(N);
+ break;
+ case ISD::INTRINSIC_W_CHAIN:
+ ResNode = SelectIntrinsicChain(N);
+ break;
+ case NVPTXISD::Tex1DFloatS32:
+ case NVPTXISD::Tex1DFloatFloat:
+ case NVPTXISD::Tex1DFloatFloatLevel:
+ case NVPTXISD::Tex1DFloatFloatGrad:
+ case NVPTXISD::Tex1DS32S32:
+ case NVPTXISD::Tex1DS32Float:
+ case NVPTXISD::Tex1DS32FloatLevel:
+ case NVPTXISD::Tex1DS32FloatGrad:
+ case NVPTXISD::Tex1DU32S32:
+ case NVPTXISD::Tex1DU32Float:
+ case NVPTXISD::Tex1DU32FloatLevel:
+ case NVPTXISD::Tex1DU32FloatGrad:
+ case NVPTXISD::Tex1DArrayFloatS32:
+ case NVPTXISD::Tex1DArrayFloatFloat:
+ case NVPTXISD::Tex1DArrayFloatFloatLevel:
+ case NVPTXISD::Tex1DArrayFloatFloatGrad:
+ case NVPTXISD::Tex1DArrayS32S32:
+ case NVPTXISD::Tex1DArrayS32Float:
+ case NVPTXISD::Tex1DArrayS32FloatLevel:
+ case NVPTXISD::Tex1DArrayS32FloatGrad:
+ case NVPTXISD::Tex1DArrayU32S32:
+ case NVPTXISD::Tex1DArrayU32Float:
+ case NVPTXISD::Tex1DArrayU32FloatLevel:
+ case NVPTXISD::Tex1DArrayU32FloatGrad:
+ case NVPTXISD::Tex2DFloatS32:
+ case NVPTXISD::Tex2DFloatFloat:
+ case NVPTXISD::Tex2DFloatFloatLevel:
+ case NVPTXISD::Tex2DFloatFloatGrad:
+ case NVPTXISD::Tex2DS32S32:
+ case NVPTXISD::Tex2DS32Float:
+ case NVPTXISD::Tex2DS32FloatLevel:
+ case NVPTXISD::Tex2DS32FloatGrad:
+ case NVPTXISD::Tex2DU32S32:
+ case NVPTXISD::Tex2DU32Float:
+ case NVPTXISD::Tex2DU32FloatLevel:
+ case NVPTXISD::Tex2DU32FloatGrad:
+ case NVPTXISD::Tex2DArrayFloatS32:
+ case NVPTXISD::Tex2DArrayFloatFloat:
+ case NVPTXISD::Tex2DArrayFloatFloatLevel:
+ case NVPTXISD::Tex2DArrayFloatFloatGrad:
+ case NVPTXISD::Tex2DArrayS32S32:
+ case NVPTXISD::Tex2DArrayS32Float:
+ case NVPTXISD::Tex2DArrayS32FloatLevel:
+ case NVPTXISD::Tex2DArrayS32FloatGrad:
+ case NVPTXISD::Tex2DArrayU32S32:
+ case NVPTXISD::Tex2DArrayU32Float:
+ case NVPTXISD::Tex2DArrayU32FloatLevel:
+ case NVPTXISD::Tex2DArrayU32FloatGrad:
+ case NVPTXISD::Tex3DFloatS32:
+ case NVPTXISD::Tex3DFloatFloat:
+ case NVPTXISD::Tex3DFloatFloatLevel:
+ case NVPTXISD::Tex3DFloatFloatGrad:
+ case NVPTXISD::Tex3DS32S32:
+ case NVPTXISD::Tex3DS32Float:
+ case NVPTXISD::Tex3DS32FloatLevel:
+ case NVPTXISD::Tex3DS32FloatGrad:
+ case NVPTXISD::Tex3DU32S32:
+ case NVPTXISD::Tex3DU32Float:
+ case NVPTXISD::Tex3DU32FloatLevel:
+ case NVPTXISD::Tex3DU32FloatGrad:
+ case NVPTXISD::TexCubeFloatFloat:
+ case NVPTXISD::TexCubeFloatFloatLevel:
+ case NVPTXISD::TexCubeS32Float:
+ case NVPTXISD::TexCubeS32FloatLevel:
+ case NVPTXISD::TexCubeU32Float:
+ case NVPTXISD::TexCubeU32FloatLevel:
+ case NVPTXISD::TexCubeArrayFloatFloat:
+ case NVPTXISD::TexCubeArrayFloatFloatLevel:
+ case NVPTXISD::TexCubeArrayS32Float:
+ case NVPTXISD::TexCubeArrayS32FloatLevel:
+ case NVPTXISD::TexCubeArrayU32Float:
+ case NVPTXISD::TexCubeArrayU32FloatLevel:
+ case NVPTXISD::Tld4R2DFloatFloat:
+ case NVPTXISD::Tld4G2DFloatFloat:
+ case NVPTXISD::Tld4B2DFloatFloat:
+ case NVPTXISD::Tld4A2DFloatFloat:
+ case NVPTXISD::Tld4R2DS64Float:
+ case NVPTXISD::Tld4G2DS64Float:
+ case NVPTXISD::Tld4B2DS64Float:
+ case NVPTXISD::Tld4A2DS64Float:
+ case NVPTXISD::Tld4R2DU64Float:
+ case NVPTXISD::Tld4G2DU64Float:
+ case NVPTXISD::Tld4B2DU64Float:
+ case NVPTXISD::Tld4A2DU64Float:
+ case NVPTXISD::TexUnified1DFloatS32:
+ case NVPTXISD::TexUnified1DFloatFloat:
+ case NVPTXISD::TexUnified1DFloatFloatLevel:
+ case NVPTXISD::TexUnified1DFloatFloatGrad:
+ case NVPTXISD::TexUnified1DS32S32:
+ case NVPTXISD::TexUnified1DS32Float:
+ case NVPTXISD::TexUnified1DS32FloatLevel:
+ case NVPTXISD::TexUnified1DS32FloatGrad:
+ case NVPTXISD::TexUnified1DU32S32:
+ case NVPTXISD::TexUnified1DU32Float:
+ case NVPTXISD::TexUnified1DU32FloatLevel:
+ case NVPTXISD::TexUnified1DU32FloatGrad:
+ case NVPTXISD::TexUnified1DArrayFloatS32:
+ case NVPTXISD::TexUnified1DArrayFloatFloat:
+ case NVPTXISD::TexUnified1DArrayFloatFloatLevel:
+ case NVPTXISD::TexUnified1DArrayFloatFloatGrad:
+ case NVPTXISD::TexUnified1DArrayS32S32:
+ case NVPTXISD::TexUnified1DArrayS32Float:
+ case NVPTXISD::TexUnified1DArrayS32FloatLevel:
+ case NVPTXISD::TexUnified1DArrayS32FloatGrad:
+ case NVPTXISD::TexUnified1DArrayU32S32:
+ case NVPTXISD::TexUnified1DArrayU32Float:
+ case NVPTXISD::TexUnified1DArrayU32FloatLevel:
+ case NVPTXISD::TexUnified1DArrayU32FloatGrad:
+ case NVPTXISD::TexUnified2DFloatS32:
+ case NVPTXISD::TexUnified2DFloatFloat:
+ case NVPTXISD::TexUnified2DFloatFloatLevel:
+ case NVPTXISD::TexUnified2DFloatFloatGrad:
+ case NVPTXISD::TexUnified2DS32S32:
+ case NVPTXISD::TexUnified2DS32Float:
+ case NVPTXISD::TexUnified2DS32FloatLevel:
+ case NVPTXISD::TexUnified2DS32FloatGrad:
+ case NVPTXISD::TexUnified2DU32S32:
+ case NVPTXISD::TexUnified2DU32Float:
+ case NVPTXISD::TexUnified2DU32FloatLevel:
+ case NVPTXISD::TexUnified2DU32FloatGrad:
+ case NVPTXISD::TexUnified2DArrayFloatS32:
+ case NVPTXISD::TexUnified2DArrayFloatFloat:
+ case NVPTXISD::TexUnified2DArrayFloatFloatLevel:
+ case NVPTXISD::TexUnified2DArrayFloatFloatGrad:
+ case NVPTXISD::TexUnified2DArrayS32S32:
+ case NVPTXISD::TexUnified2DArrayS32Float:
+ case NVPTXISD::TexUnified2DArrayS32FloatLevel:
+ case NVPTXISD::TexUnified2DArrayS32FloatGrad:
+ case NVPTXISD::TexUnified2DArrayU32S32:
+ case NVPTXISD::TexUnified2DArrayU32Float:
+ case NVPTXISD::TexUnified2DArrayU32FloatLevel:
+ case NVPTXISD::TexUnified2DArrayU32FloatGrad:
+ case NVPTXISD::TexUnified3DFloatS32:
+ case NVPTXISD::TexUnified3DFloatFloat:
+ case NVPTXISD::TexUnified3DFloatFloatLevel:
+ case NVPTXISD::TexUnified3DFloatFloatGrad:
+ case NVPTXISD::TexUnified3DS32S32:
+ case NVPTXISD::TexUnified3DS32Float:
+ case NVPTXISD::TexUnified3DS32FloatLevel:
+ case NVPTXISD::TexUnified3DS32FloatGrad:
+ case NVPTXISD::TexUnified3DU32S32:
+ case NVPTXISD::TexUnified3DU32Float:
+ case NVPTXISD::TexUnified3DU32FloatLevel:
+ case NVPTXISD::TexUnified3DU32FloatGrad:
+ case NVPTXISD::TexUnifiedCubeFloatFloat:
+ case NVPTXISD::TexUnifiedCubeFloatFloatLevel:
+ case NVPTXISD::TexUnifiedCubeS32Float:
+ case NVPTXISD::TexUnifiedCubeS32FloatLevel:
+ case NVPTXISD::TexUnifiedCubeU32Float:
+ case NVPTXISD::TexUnifiedCubeU32FloatLevel:
+ case NVPTXISD::TexUnifiedCubeArrayFloatFloat:
+ case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel:
+ case NVPTXISD::TexUnifiedCubeArrayS32Float:
+ case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel:
+ case NVPTXISD::TexUnifiedCubeArrayU32Float:
+ case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel:
+ case NVPTXISD::Tld4UnifiedR2DFloatFloat:
+ case NVPTXISD::Tld4UnifiedG2DFloatFloat:
+ case NVPTXISD::Tld4UnifiedB2DFloatFloat:
+ case NVPTXISD::Tld4UnifiedA2DFloatFloat:
+ case NVPTXISD::Tld4UnifiedR2DS64Float:
+ case NVPTXISD::Tld4UnifiedG2DS64Float:
+ case NVPTXISD::Tld4UnifiedB2DS64Float:
+ case NVPTXISD::Tld4UnifiedA2DS64Float:
+ case NVPTXISD::Tld4UnifiedR2DU64Float:
+ case NVPTXISD::Tld4UnifiedG2DU64Float:
+ case NVPTXISD::Tld4UnifiedB2DU64Float:
+ case NVPTXISD::Tld4UnifiedA2DU64Float:
+ ResNode = SelectTextureIntrinsic(N);
+ break;
+ case NVPTXISD::Suld1DI8Clamp:
+ case NVPTXISD::Suld1DI16Clamp:
+ case NVPTXISD::Suld1DI32Clamp:
+ case NVPTXISD::Suld1DI64Clamp:
+ case NVPTXISD::Suld1DV2I8Clamp:
+ case NVPTXISD::Suld1DV2I16Clamp:
+ case NVPTXISD::Suld1DV2I32Clamp:
+ case NVPTXISD::Suld1DV2I64Clamp:
+ case NVPTXISD::Suld1DV4I8Clamp:
+ case NVPTXISD::Suld1DV4I16Clamp:
+ case NVPTXISD::Suld1DV4I32Clamp:
+ case NVPTXISD::Suld1DArrayI8Clamp:
+ case NVPTXISD::Suld1DArrayI16Clamp:
+ case NVPTXISD::Suld1DArrayI32Clamp:
+ case NVPTXISD::Suld1DArrayI64Clamp:
+ case NVPTXISD::Suld1DArrayV2I8Clamp:
+ case NVPTXISD::Suld1DArrayV2I16Clamp:
+ case NVPTXISD::Suld1DArrayV2I32Clamp:
+ case NVPTXISD::Suld1DArrayV2I64Clamp:
+ case NVPTXISD::Suld1DArrayV4I8Clamp:
+ case NVPTXISD::Suld1DArrayV4I16Clamp:
+ case NVPTXISD::Suld1DArrayV4I32Clamp:
+ case NVPTXISD::Suld2DI8Clamp:
+ case NVPTXISD::Suld2DI16Clamp:
+ case NVPTXISD::Suld2DI32Clamp:
+ case NVPTXISD::Suld2DI64Clamp:
+ case NVPTXISD::Suld2DV2I8Clamp:
+ case NVPTXISD::Suld2DV2I16Clamp:
+ case NVPTXISD::Suld2DV2I32Clamp:
+ case NVPTXISD::Suld2DV2I64Clamp:
+ case NVPTXISD::Suld2DV4I8Clamp:
+ case NVPTXISD::Suld2DV4I16Clamp:
+ case NVPTXISD::Suld2DV4I32Clamp:
+ case NVPTXISD::Suld2DArrayI8Clamp:
+ case NVPTXISD::Suld2DArrayI16Clamp:
+ case NVPTXISD::Suld2DArrayI32Clamp:
+ case NVPTXISD::Suld2DArrayI64Clamp:
+ case NVPTXISD::Suld2DArrayV2I8Clamp:
+ case NVPTXISD::Suld2DArrayV2I16Clamp:
+ case NVPTXISD::Suld2DArrayV2I32Clamp:
+ case NVPTXISD::Suld2DArrayV2I64Clamp:
+ case NVPTXISD::Suld2DArrayV4I8Clamp:
+ case NVPTXISD::Suld2DArrayV4I16Clamp:
+ case NVPTXISD::Suld2DArrayV4I32Clamp:
+ case NVPTXISD::Suld3DI8Clamp:
+ case NVPTXISD::Suld3DI16Clamp:
+ case NVPTXISD::Suld3DI32Clamp:
+ case NVPTXISD::Suld3DI64Clamp:
+ case NVPTXISD::Suld3DV2I8Clamp:
+ case NVPTXISD::Suld3DV2I16Clamp:
+ case NVPTXISD::Suld3DV2I32Clamp:
+ case NVPTXISD::Suld3DV2I64Clamp:
+ case NVPTXISD::Suld3DV4I8Clamp:
+ case NVPTXISD::Suld3DV4I16Clamp:
+ case NVPTXISD::Suld3DV4I32Clamp:
+ case NVPTXISD::Suld1DI8Trap:
+ case NVPTXISD::Suld1DI16Trap:
+ case NVPTXISD::Suld1DI32Trap:
+ case NVPTXISD::Suld1DI64Trap:
+ case NVPTXISD::Suld1DV2I8Trap:
+ case NVPTXISD::Suld1DV2I16Trap:
+ case NVPTXISD::Suld1DV2I32Trap:
+ case NVPTXISD::Suld1DV2I64Trap:
+ case NVPTXISD::Suld1DV4I8Trap:
+ case NVPTXISD::Suld1DV4I16Trap:
+ case NVPTXISD::Suld1DV4I32Trap:
+ case NVPTXISD::Suld1DArrayI8Trap:
+ case NVPTXISD::Suld1DArrayI16Trap:
+ case NVPTXISD::Suld1DArrayI32Trap:
+ case NVPTXISD::Suld1DArrayI64Trap:
+ case NVPTXISD::Suld1DArrayV2I8Trap:
+ case NVPTXISD::Suld1DArrayV2I16Trap:
+ case NVPTXISD::Suld1DArrayV2I32Trap:
+ case NVPTXISD::Suld1DArrayV2I64Trap:
+ case NVPTXISD::Suld1DArrayV4I8Trap:
+ case NVPTXISD::Suld1DArrayV4I16Trap:
+ case NVPTXISD::Suld1DArrayV4I32Trap:
+ case NVPTXISD::Suld2DI8Trap:
+ case NVPTXISD::Suld2DI16Trap:
+ case NVPTXISD::Suld2DI32Trap:
+ case NVPTXISD::Suld2DI64Trap:
+ case NVPTXISD::Suld2DV2I8Trap:
+ case NVPTXISD::Suld2DV2I16Trap:
+ case NVPTXISD::Suld2DV2I32Trap:
+ case NVPTXISD::Suld2DV2I64Trap:
+ case NVPTXISD::Suld2DV4I8Trap:
+ case NVPTXISD::Suld2DV4I16Trap:
+ case NVPTXISD::Suld2DV4I32Trap:
+ case NVPTXISD::Suld2DArrayI8Trap:
+ case NVPTXISD::Suld2DArrayI16Trap:
+ case NVPTXISD::Suld2DArrayI32Trap:
+ case NVPTXISD::Suld2DArrayI64Trap:
+ case NVPTXISD::Suld2DArrayV2I8Trap:
+ case NVPTXISD::Suld2DArrayV2I16Trap:
+ case NVPTXISD::Suld2DArrayV2I32Trap:
+ case NVPTXISD::Suld2DArrayV2I64Trap:
+ case NVPTXISD::Suld2DArrayV4I8Trap:
+ case NVPTXISD::Suld2DArrayV4I16Trap:
+ case NVPTXISD::Suld2DArrayV4I32Trap:
+ case NVPTXISD::Suld3DI8Trap:
+ case NVPTXISD::Suld3DI16Trap:
+ case NVPTXISD::Suld3DI32Trap:
+ case NVPTXISD::Suld3DI64Trap:
+ case NVPTXISD::Suld3DV2I8Trap:
+ case NVPTXISD::Suld3DV2I16Trap:
+ case NVPTXISD::Suld3DV2I32Trap:
+ case NVPTXISD::Suld3DV2I64Trap:
+ case NVPTXISD::Suld3DV4I8Trap:
+ case NVPTXISD::Suld3DV4I16Trap:
+ case NVPTXISD::Suld3DV4I32Trap:
+ case NVPTXISD::Suld1DI8Zero:
+ case NVPTXISD::Suld1DI16Zero:
+ case NVPTXISD::Suld1DI32Zero:
+ case NVPTXISD::Suld1DI64Zero:
+ case NVPTXISD::Suld1DV2I8Zero:
+ case NVPTXISD::Suld1DV2I16Zero:
+ case NVPTXISD::Suld1DV2I32Zero:
+ case NVPTXISD::Suld1DV2I64Zero:
+ case NVPTXISD::Suld1DV4I8Zero:
+ case NVPTXISD::Suld1DV4I16Zero:
+ case NVPTXISD::Suld1DV4I32Zero:
+ case NVPTXISD::Suld1DArrayI8Zero:
+ case NVPTXISD::Suld1DArrayI16Zero:
+ case NVPTXISD::Suld1DArrayI32Zero:
+ case NVPTXISD::Suld1DArrayI64Zero:
+ case NVPTXISD::Suld1DArrayV2I8Zero:
+ case NVPTXISD::Suld1DArrayV2I16Zero:
+ case NVPTXISD::Suld1DArrayV2I32Zero:
+ case NVPTXISD::Suld1DArrayV2I64Zero:
+ case NVPTXISD::Suld1DArrayV4I8Zero:
+ case NVPTXISD::Suld1DArrayV4I16Zero:
+ case NVPTXISD::Suld1DArrayV4I32Zero:
+ case NVPTXISD::Suld2DI8Zero:
+ case NVPTXISD::Suld2DI16Zero:
+ case NVPTXISD::Suld2DI32Zero:
+ case NVPTXISD::Suld2DI64Zero:
+ case NVPTXISD::Suld2DV2I8Zero:
+ case NVPTXISD::Suld2DV2I16Zero:
+ case NVPTXISD::Suld2DV2I32Zero:
+ case NVPTXISD::Suld2DV2I64Zero:
+ case NVPTXISD::Suld2DV4I8Zero:
+ case NVPTXISD::Suld2DV4I16Zero:
+ case NVPTXISD::Suld2DV4I32Zero:
+ case NVPTXISD::Suld2DArrayI8Zero:
+ case NVPTXISD::Suld2DArrayI16Zero:
+ case NVPTXISD::Suld2DArrayI32Zero:
+ case NVPTXISD::Suld2DArrayI64Zero:
+ case NVPTXISD::Suld2DArrayV2I8Zero:
+ case NVPTXISD::Suld2DArrayV2I16Zero:
+ case NVPTXISD::Suld2DArrayV2I32Zero:
+ case NVPTXISD::Suld2DArrayV2I64Zero:
+ case NVPTXISD::Suld2DArrayV4I8Zero:
+ case NVPTXISD::Suld2DArrayV4I16Zero:
+ case NVPTXISD::Suld2DArrayV4I32Zero:
+ case NVPTXISD::Suld3DI8Zero:
+ case NVPTXISD::Suld3DI16Zero:
+ case NVPTXISD::Suld3DI32Zero:
+ case NVPTXISD::Suld3DI64Zero:
+ case NVPTXISD::Suld3DV2I8Zero:
+ case NVPTXISD::Suld3DV2I16Zero:
+ case NVPTXISD::Suld3DV2I32Zero:
+ case NVPTXISD::Suld3DV2I64Zero:
+ case NVPTXISD::Suld3DV4I8Zero:
+ case NVPTXISD::Suld3DV4I16Zero:
+ case NVPTXISD::Suld3DV4I32Zero:
+ ResNode = SelectSurfaceIntrinsic(N);
+ break;
+ case ISD::AND:
+ case ISD::SRA:
+ case ISD::SRL:
+ // Try to select BFE
+ ResNode = SelectBFE(N);
+ break;
+ case ISD::ADDRSPACECAST:
+ ResNode = SelectAddrSpaceCast(N);
+ break;
+ default:
+ break;
+ }
+ if (ResNode)
+ return ResNode;
+ return SelectCode(N);
+}
+
+SDNode *NVPTXDAGToDAGISel::SelectIntrinsicChain(SDNode *N) {
+ unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+ switch (IID) {
+ default:
+    return nullptr;
+ case Intrinsic::nvvm_ldg_global_f:
+ case Intrinsic::nvvm_ldg_global_i:
+ case Intrinsic::nvvm_ldg_global_p:
+ case Intrinsic::nvvm_ldu_global_f:
+ case Intrinsic::nvvm_ldu_global_i:
+ case Intrinsic::nvvm_ldu_global_p:
+ return SelectLDGLDU(N);
+ }
+}
+
+static unsigned int getCodeAddrSpace(MemSDNode *N,
+ const NVPTXSubtarget &Subtarget) {
+ const Value *Src = N->getMemOperand()->getValue();
+
+ if (!Src)
+ return NVPTX::PTXLdStInstCode::GENERIC;
+
+ if (const PointerType *PT = dyn_cast<PointerType>(Src->getType())) {
+ switch (PT->getAddressSpace()) {
+ case llvm::ADDRESS_SPACE_LOCAL: return NVPTX::PTXLdStInstCode::LOCAL;
+ case llvm::ADDRESS_SPACE_GLOBAL: return NVPTX::PTXLdStInstCode::GLOBAL;
+ case llvm::ADDRESS_SPACE_SHARED: return NVPTX::PTXLdStInstCode::SHARED;
+ case llvm::ADDRESS_SPACE_GENERIC: return NVPTX::PTXLdStInstCode::GENERIC;
+ case llvm::ADDRESS_SPACE_PARAM: return NVPTX::PTXLdStInstCode::PARAM;
+ case llvm::ADDRESS_SPACE_CONST: return NVPTX::PTXLdStInstCode::CONSTANT;
+ default: break;
+ }
+ }
+ return NVPTX::PTXLdStInstCode::GENERIC;
+}
+
+SDNode *NVPTXDAGToDAGISel::SelectIntrinsicNoChain(SDNode *N) {
+ unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
+ switch (IID) {
+ default:
+ return nullptr;
+ case Intrinsic::nvvm_texsurf_handle_internal:
+ return SelectTexSurfHandle(N);
+ }
+}
+
+SDNode *NVPTXDAGToDAGISel::SelectTexSurfHandle(SDNode *N) {
+ // Op 0 is the intrinsic ID
+ SDValue Wrapper = N->getOperand(1);
+ SDValue GlobalVal = Wrapper.getOperand(0);
+ return CurDAG->getMachineNode(NVPTX::texsurf_handles, SDLoc(N), MVT::i64,
+ GlobalVal);
+}
+
+SDNode *NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
+ SDValue Src = N->getOperand(0);
+ AddrSpaceCastSDNode *CastN = cast<AddrSpaceCastSDNode>(N);
+ unsigned SrcAddrSpace = CastN->getSrcAddressSpace();
+ unsigned DstAddrSpace = CastN->getDestAddressSpace();
+
+ assert(SrcAddrSpace != DstAddrSpace &&
+ "addrspacecast must be between different address spaces");
+
+ if (DstAddrSpace == ADDRESS_SPACE_GENERIC) {
+ // Specific to generic
+ unsigned Opc;
+ switch (SrcAddrSpace) {
+ default: report_fatal_error("Bad address space in addrspacecast");
+ case ADDRESS_SPACE_GLOBAL:
+ Opc = Subtarget.is64Bit() ? NVPTX::cvta_global_yes_64
+ : NVPTX::cvta_global_yes;
+ break;
+ case ADDRESS_SPACE_SHARED:
+ Opc = Subtarget.is64Bit() ? NVPTX::cvta_shared_yes_64
+ : NVPTX::cvta_shared_yes;
+ break;
+ case ADDRESS_SPACE_CONST:
+ Opc = Subtarget.is64Bit() ? NVPTX::cvta_const_yes_64
+ : NVPTX::cvta_const_yes;
+ break;
+ case ADDRESS_SPACE_LOCAL:
+ Opc = Subtarget.is64Bit() ? NVPTX::cvta_local_yes_64
+ : NVPTX::cvta_local_yes;
+ break;
+ }
+ return CurDAG->getMachineNode(Opc, SDLoc(N), N->getValueType(0), Src);
+ } else {
+ // Generic to specific
+ if (SrcAddrSpace != 0)
+ report_fatal_error("Cannot cast between two non-generic address spaces");
+ unsigned Opc;
+ switch (DstAddrSpace) {
+ default: report_fatal_error("Bad address space in addrspacecast");
+ case ADDRESS_SPACE_GLOBAL:
+ Opc = Subtarget.is64Bit() ? NVPTX::cvta_to_global_yes_64
+ : NVPTX::cvta_to_global_yes;
+ break;
+ case ADDRESS_SPACE_SHARED:
+ Opc = Subtarget.is64Bit() ? NVPTX::cvta_to_shared_yes_64
+ : NVPTX::cvta_to_shared_yes;
+ break;
+ case ADDRESS_SPACE_CONST:
+ Opc = Subtarget.is64Bit() ? NVPTX::cvta_to_const_yes_64
+ : NVPTX::cvta_to_const_yes;
+ break;
+ case ADDRESS_SPACE_LOCAL:
+ Opc = Subtarget.is64Bit() ? NVPTX::cvta_to_local_yes_64
+ : NVPTX::cvta_to_local_yes;
+ break;
+ }
+ return CurDAG->getMachineNode(Opc, SDLoc(N), N->getValueType(0), Src);
+ }
+}
+
+SDNode *NVPTXDAGToDAGISel::SelectLoad(SDNode *N) {
+ SDLoc dl(N);
+ LoadSDNode *LD = cast<LoadSDNode>(N);
+ EVT LoadedVT = LD->getMemoryVT();
+ SDNode *NVPTXLD = nullptr;
+
+ // do not support pre/post inc/dec
+ if (LD->isIndexed())
+ return nullptr;
+
+ if (!LoadedVT.isSimple())
+ return nullptr;
+
+ // Address Space Setting
+ unsigned int codeAddrSpace = getCodeAddrSpace(LD, Subtarget);
+
+ // Volatile Setting
+  // - .volatile is only available for .global and .shared
+ bool isVolatile = LD->isVolatile();
+ if (codeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
+ codeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
+ codeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
+ isVolatile = false;
+
+ // Vector Setting
+ MVT SimpleVT = LoadedVT.getSimpleVT();
+ unsigned vecType = NVPTX::PTXLdStInstCode::Scalar;
+ if (SimpleVT.isVector()) {
+ unsigned num = SimpleVT.getVectorNumElements();
+ if (num == 2)
+ vecType = NVPTX::PTXLdStInstCode::V2;
+ else if (num == 4)
+ vecType = NVPTX::PTXLdStInstCode::V4;
+ else
+ return nullptr;
+ }
+
+ // Type Setting: fromType + fromTypeWidth
+ //
+  //   Signed   : ISD::SEXTLOAD
+  //   Unsigned : ISD::ZEXTLOAD, ISD::NON_EXTLOAD or ISD::EXTLOAD and the
+  //              type is integer
+  //   Float    : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float
+ MVT ScalarVT = SimpleVT.getScalarType();
+ // Read at least 8 bits (predicates are stored as 8-bit values)
+ unsigned fromTypeWidth = std::max(8U, ScalarVT.getSizeInBits());
+ unsigned int fromType;
+ if ((LD->getExtensionType() == ISD::SEXTLOAD))
+ fromType = NVPTX::PTXLdStInstCode::Signed;
+ else if (ScalarVT.isFloatingPoint())
+ fromType = NVPTX::PTXLdStInstCode::Float;
+ else
+ fromType = NVPTX::PTXLdStInstCode::Unsigned;
+
+ // Create the machine instruction DAG
+ SDValue Chain = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SDValue Addr;
+ SDValue Offset, Base;
+ unsigned Opcode;
+ MVT::SimpleValueType TargetVT = LD->getSimpleValueType(0).SimpleTy;
+
+ if (SelectDirectAddr(N1, Addr)) {
+ switch (TargetVT) {
+ case MVT::i8:
+ Opcode = NVPTX::LD_i8_avar;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::LD_i16_avar;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::LD_i32_avar;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::LD_i64_avar;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::LD_f32_avar;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::LD_f64_avar;
+ break;
+ default:
+ return nullptr;
+ }
+ SDValue Ops[] = { getI32Imm(isVolatile), getI32Imm(codeAddrSpace),
+ getI32Imm(vecType), getI32Imm(fromType),
+ getI32Imm(fromTypeWidth), Addr, Chain };
+ NVPTXLD = CurDAG->getMachineNode(Opcode, dl, TargetVT, MVT::Other, Ops);
+ } else if (Subtarget.is64Bit()
+ ? SelectADDRsi64(N1.getNode(), N1, Base, Offset)
+ : SelectADDRsi(N1.getNode(), N1, Base, Offset)) {
+ switch (TargetVT) {
+ case MVT::i8:
+ Opcode = NVPTX::LD_i8_asi;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::LD_i16_asi;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::LD_i32_asi;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::LD_i64_asi;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::LD_f32_asi;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::LD_f64_asi;
+ break;
+ default:
+ return nullptr;
+ }
+ SDValue Ops[] = { getI32Imm(isVolatile), getI32Imm(codeAddrSpace),
+ getI32Imm(vecType), getI32Imm(fromType),
+ getI32Imm(fromTypeWidth), Base, Offset, Chain };
+ NVPTXLD = CurDAG->getMachineNode(Opcode, dl, TargetVT, MVT::Other, Ops);
+ } else if (Subtarget.is64Bit()
+ ? SelectADDRri64(N1.getNode(), N1, Base, Offset)
+ : SelectADDRri(N1.getNode(), N1, Base, Offset)) {
+ if (Subtarget.is64Bit()) {
+ switch (TargetVT) {
+ case MVT::i8:
+ Opcode = NVPTX::LD_i8_ari_64;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::LD_i16_ari_64;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::LD_i32_ari_64;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::LD_i64_ari_64;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::LD_f32_ari_64;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::LD_f64_ari_64;
+ break;
+ default:
+ return nullptr;
+ }
+ } else {
+ switch (TargetVT) {
+ case MVT::i8:
+ Opcode = NVPTX::LD_i8_ari;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::LD_i16_ari;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::LD_i32_ari;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::LD_i64_ari;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::LD_f32_ari;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::LD_f64_ari;
+ break;
+ default:
+ return nullptr;
+ }
+ }
+ SDValue Ops[] = { getI32Imm(isVolatile), getI32Imm(codeAddrSpace),
+ getI32Imm(vecType), getI32Imm(fromType),
+ getI32Imm(fromTypeWidth), Base, Offset, Chain };
+ NVPTXLD = CurDAG->getMachineNode(Opcode, dl, TargetVT, MVT::Other, Ops);
+ } else {
+ if (Subtarget.is64Bit()) {
+ switch (TargetVT) {
+ case MVT::i8:
+ Opcode = NVPTX::LD_i8_areg_64;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::LD_i16_areg_64;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::LD_i32_areg_64;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::LD_i64_areg_64;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::LD_f32_areg_64;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::LD_f64_areg_64;
+ break;
+ default:
+ return nullptr;
+ }
+ } else {
+ switch (TargetVT) {
+ case MVT::i8:
+ Opcode = NVPTX::LD_i8_areg;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::LD_i16_areg;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::LD_i32_areg;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::LD_i64_areg;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::LD_f32_areg;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::LD_f64_areg;
+ break;
+ default:
+ return nullptr;
+ }
+ }
+ SDValue Ops[] = { getI32Imm(isVolatile), getI32Imm(codeAddrSpace),
+ getI32Imm(vecType), getI32Imm(fromType),
+ getI32Imm(fromTypeWidth), N1, Chain };
+ NVPTXLD = CurDAG->getMachineNode(Opcode, dl, TargetVT, MVT::Other, Ops);
+ }
+
+ if (NVPTXLD) {
+ MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
+ MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand();
+ cast<MachineSDNode>(NVPTXLD)->setMemRefs(MemRefs0, MemRefs0 + 1);
+ }
+
+ return NVPTXLD;
+}
+
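+// Select a machine instruction for an NVPTXISD::LoadV2/LoadV4 node, choosing
+// the LDV_* opcode from the element type and the addressing mode that matches
+// the address operand.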
+SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) {
+
+ SDValue Chain = N->getOperand(0);
+ SDValue Op1 = N->getOperand(1);
+ SDValue Addr, Offset, Base;
+ unsigned Opcode;
+ SDLoc DL(N);
+ SDNode *LD;
+ MemSDNode *MemSD = cast<MemSDNode>(N);
+ EVT LoadedVT = MemSD->getMemoryVT();
+
+ if (!LoadedVT.isSimple())
+ return nullptr;
+
+ // Address Space Setting
+ unsigned int CodeAddrSpace = getCodeAddrSpace(MemSD, Subtarget);
+
+ // Volatile Setting
+ // - .volatile is only available for .global and .shared
+ bool IsVolatile = MemSD->isVolatile();
+ if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
+ CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
+ CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
+ IsVolatile = false;
+
+ // Vector Setting
+ MVT SimpleVT = LoadedVT.getSimpleVT();
+
+ // Type Setting: fromType + fromTypeWidth
+ //
+ // Signed   : ISD::SEXTLOAD
+ // Unsigned : ISD::ZEXTLOAD, ISD::NON_EXTLOAD or ISD::EXTLOAD and the
+ //            type is integer
+ // Float    : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float
+ MVT ScalarVT = SimpleVT.getScalarType();
+ // Read at least 8 bits (predicates are stored as 8-bit values)
+ unsigned FromTypeWidth = std::max(8U, ScalarVT.getSizeInBits());
+ unsigned int FromType;
+ // The last operand holds the original LoadSDNode::getExtensionType() value
+ unsigned ExtensionType = cast<ConstantSDNode>(
+ N->getOperand(N->getNumOperands() - 1))->getZExtValue();
+ if (ExtensionType == ISD::SEXTLOAD)
+ FromType = NVPTX::PTXLdStInstCode::Signed;
+ else if (ScalarVT.isFloatingPoint())
+ FromType = NVPTX::PTXLdStInstCode::Float;
+ else
+ FromType = NVPTX::PTXLdStInstCode::Unsigned;
+
+ unsigned VecType;
+
+ switch (N->getOpcode()) {
+ case NVPTXISD::LoadV2:
+ VecType = NVPTX::PTXLdStInstCode::V2;
+ break;
+ case NVPTXISD::LoadV4:
+ VecType = NVPTX::PTXLdStInstCode::V4;
+ break;
+ default:
+ return nullptr;
+ }
+
+ EVT EltVT = N->getValueType(0);
+
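+ // Try the addressing modes in order: direct address (avar), symbol plus
+ // immediate (asi), register plus immediate (ari), and finally plain
+ // register (areg).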
+ if (SelectDirectAddr(Op1, Addr)) {
+ switch (N->getOpcode()) {
+ default:
+ return nullptr;
+ case NVPTXISD::LoadV2:
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return nullptr;
+ case MVT::i8:
+ Opcode = NVPTX::LDV_i8_v2_avar;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::LDV_i16_v2_avar;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::LDV_i32_v2_avar;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::LDV_i64_v2_avar;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::LDV_f32_v2_avar;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::LDV_f64_v2_avar;
+ break;
+ }
+ break;
+ case NVPTXISD::LoadV4:
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return nullptr;
+ case MVT::i8:
+ Opcode = NVPTX::LDV_i8_v4_avar;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::LDV_i16_v4_avar;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::LDV_i32_v4_avar;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::LDV_f32_v4_avar;
+ break;
+ }
+ break;
+ }
+
+ SDValue Ops[] = { getI32Imm(IsVolatile), getI32Imm(CodeAddrSpace),
+ getI32Imm(VecType), getI32Imm(FromType),
+ getI32Imm(FromTypeWidth), Addr, Chain };
+ LD = CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops);
+ } else if (Subtarget.is64Bit()
+ ? SelectADDRsi64(Op1.getNode(), Op1, Base, Offset)
+ : SelectADDRsi(Op1.getNode(), Op1, Base, Offset)) {
+ switch (N->getOpcode()) {
+ default:
+ return nullptr;
+ case NVPTXISD::LoadV2:
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return nullptr;
+ case MVT::i8:
+ Opcode = NVPTX::LDV_i8_v2_asi;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::LDV_i16_v2_asi;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::LDV_i32_v2_asi;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::LDV_i64_v2_asi;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::LDV_f32_v2_asi;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::LDV_f64_v2_asi;
+ break;
+ }
+ break;
+ case NVPTXISD::LoadV4:
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return nullptr;
+ case MVT::i8:
+ Opcode = NVPTX::LDV_i8_v4_asi;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::LDV_i16_v4_asi;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::LDV_i32_v4_asi;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::LDV_f32_v4_asi;
+ break;
+ }
+ break;
+ }
+
+ SDValue Ops[] = { getI32Imm(IsVolatile), getI32Imm(CodeAddrSpace),
+ getI32Imm(VecType), getI32Imm(FromType),
+ getI32Imm(FromTypeWidth), Base, Offset, Chain };
+ LD = CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops);
+ } else if (Subtarget.is64Bit()
+ ? SelectADDRri64(Op1.getNode(), Op1, Base, Offset)
+ : SelectADDRri(Op1.getNode(), Op1, Base, Offset)) {
+ if (Subtarget.is64Bit()) {
+ switch (N->getOpcode()) {
+ default:
+ return nullptr;
+ case NVPTXISD::LoadV2:
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return nullptr;
+ case MVT::i8:
+ Opcode = NVPTX::LDV_i8_v2_ari_64;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::LDV_i16_v2_ari_64;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::LDV_i32_v2_ari_64;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::LDV_i64_v2_ari_64;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::LDV_f32_v2_ari_64;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::LDV_f64_v2_ari_64;
+ break;
+ }
+ break;
+ case NVPTXISD::LoadV4:
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return nullptr;
+ case MVT::i8:
+ Opcode = NVPTX::LDV_i8_v4_ari_64;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::LDV_i16_v4_ari_64;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::LDV_i32_v4_ari_64;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::LDV_f32_v4_ari_64;
+ break;
+ }
+ break;
+ }
+ } else {
+ switch (N->getOpcode()) {
+ default:
+ return nullptr;
+ case NVPTXISD::LoadV2:
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return nullptr;
+ case MVT::i8:
+ Opcode = NVPTX::LDV_i8_v2_ari;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::LDV_i16_v2_ari;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::LDV_i32_v2_ari;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::LDV_i64_v2_ari;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::LDV_f32_v2_ari;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::LDV_f64_v2_ari;
+ break;
+ }
+ break;
+ case NVPTXISD::LoadV4:
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return nullptr;
+ case MVT::i8:
+ Opcode = NVPTX::LDV_i8_v4_ari;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::LDV_i16_v4_ari;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::LDV_i32_v4_ari;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::LDV_f32_v4_ari;
+ break;
+ }
+ break;
+ }
+ }
+
+ SDValue Ops[] = { getI32Imm(IsVolatile), getI32Imm(CodeAddrSpace),
+ getI32Imm(VecType), getI32Imm(FromType),
+ getI32Imm(FromTypeWidth), Base, Offset, Chain };
+
+ LD = CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops);
+ } else {
+ if (Subtarget.is64Bit()) {
+ switch (N->getOpcode()) {
+ default:
+ return nullptr;
+ case NVPTXISD::LoadV2:
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return nullptr;
+ case MVT::i8:
+ Opcode = NVPTX::LDV_i8_v2_areg_64;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::LDV_i16_v2_areg_64;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::LDV_i32_v2_areg_64;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::LDV_i64_v2_areg_64;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::LDV_f32_v2_areg_64;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::LDV_f64_v2_areg_64;
+ break;
+ }
+ break;
+ case NVPTXISD::LoadV4:
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return nullptr;
+ case MVT::i8:
+ Opcode = NVPTX::LDV_i8_v4_areg_64;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::LDV_i16_v4_areg_64;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::LDV_i32_v4_areg_64;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::LDV_f32_v4_areg_64;
+ break;
+ }
+ break;
+ }
+ } else {
+ switch (N->getOpcode()) {
+ default:
+ return nullptr;
+ case NVPTXISD::LoadV2:
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return nullptr;
+ case MVT::i8:
+ Opcode = NVPTX::LDV_i8_v2_areg;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::LDV_i16_v2_areg;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::LDV_i32_v2_areg;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::LDV_i64_v2_areg;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::LDV_f32_v2_areg;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::LDV_f64_v2_areg;
+ break;
+ }
+ break;
+ case NVPTXISD::LoadV4:
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return nullptr;
+ case MVT::i8:
+ Opcode = NVPTX::LDV_i8_v4_areg;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::LDV_i16_v4_areg;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::LDV_i32_v4_areg;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::LDV_f32_v4_areg;
+ break;
+ }
+ break;
+ }
+ }
+
+ SDValue Ops[] = { getI32Imm(IsVolatile), getI32Imm(CodeAddrSpace),
+ getI32Imm(VecType), getI32Imm(FromType),
+ getI32Imm(FromTypeWidth), Op1, Chain };
+ LD = CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops);
+ }
+
+ MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
+ MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand();
+ cast<MachineSDNode>(LD)->setMemRefs(MemRefs0, MemRefs0 + 1);
+
+ return LD;
+}
+
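+// Select an LDG (non-coherent global load) or LDU (uniform global load),
+// reached either through the llvm.nvvm.ldg/ldu intrinsics or through the
+// LDGV2/LDGV4/LDUV2/LDUV4 nodes created by custom vector lowering.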
+SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
+
+ SDValue Chain = N->getOperand(0);
+ SDValue Op1;
+ MemSDNode *Mem;
+ bool IsLDG = true;
+
+ // If this is an LDG intrinsic, the address is the third operand. If it is an
+ // LDG/LDU SD node (from custom vector handling), then it is the second operand.
+ if (N->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
+ Op1 = N->getOperand(2);
+ Mem = cast<MemIntrinsicSDNode>(N);
+ unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+ switch (IID) {
+ default:
+ return nullptr;
+ case Intrinsic::nvvm_ldg_global_f:
+ case Intrinsic::nvvm_ldg_global_i:
+ case Intrinsic::nvvm_ldg_global_p:
+ IsLDG = true;
+ break;
+ case Intrinsic::nvvm_ldu_global_f:
+ case Intrinsic::nvvm_ldu_global_i:
+ case Intrinsic::nvvm_ldu_global_p:
+ IsLDG = false;
+ break;
+ }
+ } else {
+ Op1 = N->getOperand(1);
+ Mem = cast<MemSDNode>(N);
+ }
+
+ unsigned Opcode;
+ SDLoc DL(N);
+ SDNode *LD;
+ SDValue Base, Offset, Addr;
+
+ EVT EltVT = Mem->getMemoryVT();
+ if (EltVT.isVector()) {
+ EltVT = EltVT.getVectorElementType();
+ }
+
+ if (SelectDirectAddr(Op1, Addr)) {
+ switch (N->getOpcode()) {
+ default:
+ return nullptr;
+ case ISD::INTRINSIC_W_CHAIN:
+ if (IsLDG) {
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return nullptr;
+ case MVT::i8:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i8avar;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i16avar;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i32avar;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i64avar;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f32avar;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f64avar;
+ break;
+ }
+ } else {
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return nullptr;
+ case MVT::i8:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i8avar;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i16avar;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i32avar;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i64avar;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f32avar;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f64avar;
+ break;
+ }
+ }
+ break;
+ case NVPTXISD::LDGV2:
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return nullptr;
+ case MVT::i8:
+ Opcode = NVPTX::INT_PTX_LDG_G_v2i8_ELE_avar;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::INT_PTX_LDG_G_v2i16_ELE_avar;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::INT_PTX_LDG_G_v2i32_ELE_avar;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::INT_PTX_LDG_G_v2i64_ELE_avar;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::INT_PTX_LDG_G_v2f32_ELE_avar;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::INT_PTX_LDG_G_v2f64_ELE_avar;
+ break;
+ }
+ break;
+ case NVPTXISD::LDUV2:
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return nullptr;
+ case MVT::i8:
+ Opcode = NVPTX::INT_PTX_LDU_G_v2i8_ELE_avar;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::INT_PTX_LDU_G_v2i16_ELE_avar;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::INT_PTX_LDU_G_v2i32_ELE_avar;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::INT_PTX_LDU_G_v2i64_ELE_avar;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::INT_PTX_LDU_G_v2f32_ELE_avar;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::INT_PTX_LDU_G_v2f64_ELE_avar;
+ break;
+ }
+ break;
+ case NVPTXISD::LDGV4:
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return nullptr;
+ case MVT::i8:
+ Opcode = NVPTX::INT_PTX_LDG_G_v4i8_ELE_avar;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::INT_PTX_LDG_G_v4i16_ELE_avar;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::INT_PTX_LDG_G_v4i32_ELE_avar;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::INT_PTX_LDG_G_v4f32_ELE_avar;
+ break;
+ }
+ break;
+ case NVPTXISD::LDUV4:
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return nullptr;
+ case MVT::i8:
+ Opcode = NVPTX::INT_PTX_LDU_G_v4i8_ELE_avar;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::INT_PTX_LDU_G_v4i16_ELE_avar;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::INT_PTX_LDU_G_v4i32_ELE_avar;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::INT_PTX_LDU_G_v4f32_ELE_avar;
+ break;
+ }
+ break;
+ }
+
+ SDValue Ops[] = { Addr, Chain };
+ LD = CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops);
+ } else if (Subtarget.is64Bit()
+ ? SelectADDRri64(Op1.getNode(), Op1, Base, Offset)
+ : SelectADDRri(Op1.getNode(), Op1, Base, Offset)) {
+ if (Subtarget.is64Bit()) {
+ switch (N->getOpcode()) {
+ default:
+ return nullptr;
+ case ISD::INTRINSIC_W_CHAIN:
+ if (IsLDG) {
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return nullptr;
+ case MVT::i8:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i8ari64;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i16ari64;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i32ari64;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i64ari64;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f32ari64;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f64ari64;
+ break;
+ }
+ } else {
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return nullptr;
+ case MVT::i8:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i8ari64;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i16ari64;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i32ari64;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i64ari64;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f32ari64;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f64ari64;
+ break;
+ }
+ }
+ break;
+ case NVPTXISD::LDGV2:
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return nullptr;
+ case MVT::i8:
+ Opcode = NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari64;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::INT_PTX_LDG_G_v2i16_ELE_ari64;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::INT_PTX_LDG_G_v2i32_ELE_ari64;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::INT_PTX_LDG_G_v2i64_ELE_ari64;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::INT_PTX_LDG_G_v2f32_ELE_ari64;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::INT_PTX_LDG_G_v2f64_ELE_ari64;
+ break;
+ }
+ break;
+ case NVPTXISD::LDUV2:
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return nullptr;
+ case MVT::i8:
+ Opcode = NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari64;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::INT_PTX_LDU_G_v2i16_ELE_ari64;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::INT_PTX_LDU_G_v2i32_ELE_ari64;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::INT_PTX_LDU_G_v2i64_ELE_ari64;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::INT_PTX_LDU_G_v2f32_ELE_ari64;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::INT_PTX_LDU_G_v2f64_ELE_ari64;
+ break;
+ }
+ break;
+ case NVPTXISD::LDGV4:
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return nullptr;
+ case MVT::i8:
+ Opcode = NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari64;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::INT_PTX_LDG_G_v4i16_ELE_ari64;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::INT_PTX_LDG_G_v4i32_ELE_ari64;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::INT_PTX_LDG_G_v4f32_ELE_ari64;
+ break;
+ }
+ break;
+ case NVPTXISD::LDUV4:
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return nullptr;
+ case MVT::i8:
+ Opcode = NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari64;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::INT_PTX_LDU_G_v4i16_ELE_ari64;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::INT_PTX_LDU_G_v4i32_ELE_ari64;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::INT_PTX_LDU_G_v4f32_ELE_ari64;
+ break;
+ }
+ break;
+ }
+ } else {
+ switch (N->getOpcode()) {
+ default:
+ return nullptr;
+ case ISD::INTRINSIC_W_CHAIN:
+ if (IsLDG) {
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return nullptr;
+ case MVT::i8:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i8ari;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i16ari;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i32ari;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i64ari;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f32ari;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f64ari;
+ break;
+ }
+ } else {
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return nullptr;
+ case MVT::i8:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i8ari;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i16ari;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i32ari;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i64ari;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f32ari;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f64ari;
+ break;
+ }
+ }
+ break;
+ case NVPTXISD::LDGV2:
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return nullptr;
+ case MVT::i8:
+ Opcode = NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari32;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::INT_PTX_LDG_G_v2i16_ELE_ari32;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::INT_PTX_LDG_G_v2i32_ELE_ari32;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::INT_PTX_LDG_G_v2i64_ELE_ari32;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::INT_PTX_LDG_G_v2f32_ELE_ari32;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::INT_PTX_LDG_G_v2f64_ELE_ari32;
+ break;
+ }
+ break;
+ case NVPTXISD::LDUV2:
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return nullptr;
+ case MVT::i8:
+ Opcode = NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari32;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::INT_PTX_LDU_G_v2i16_ELE_ari32;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::INT_PTX_LDU_G_v2i32_ELE_ari32;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::INT_PTX_LDU_G_v2i64_ELE_ari32;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::INT_PTX_LDU_G_v2f32_ELE_ari32;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::INT_PTX_LDU_G_v2f64_ELE_ari32;
+ break;
+ }
+ break;
+ case NVPTXISD::LDGV4:
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return nullptr;
+ case MVT::i8:
+ Opcode = NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari32;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::INT_PTX_LDG_G_v4i16_ELE_ari32;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::INT_PTX_LDG_G_v4i32_ELE_ari32;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::INT_PTX_LDG_G_v4f32_ELE_ari32;
+ break;
+ }
+ break;
+ case NVPTXISD::LDUV4:
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return nullptr;
+ case MVT::i8:
+ Opcode = NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari32;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::INT_PTX_LDU_G_v4i16_ELE_ari32;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::INT_PTX_LDU_G_v4i32_ELE_ari32;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::INT_PTX_LDU_G_v4f32_ELE_ari32;
+ break;
+ }
+ break;
+ }
+ }
+
+ SDValue Ops[] = { Base, Offset, Chain };
+
+ LD = CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops);
+ } else {
+ if (Subtarget.is64Bit()) {
+ switch (N->getOpcode()) {
+ default:
+ return nullptr;
+ case ISD::INTRINSIC_W_CHAIN:
+ if (IsLDG) {
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return nullptr;
+ case MVT::i8:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i8areg64;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i16areg64;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i32areg64;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i64areg64;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f32areg64;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f64areg64;
+ break;
+ }
+ } else {
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return nullptr;
+ case MVT::i8:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i8areg64;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i16areg64;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i32areg64;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i64areg64;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f32areg64;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f64areg64;
+ break;
+ }
+ }
+ break;
+ case NVPTXISD::LDGV2:
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return nullptr;
+ case MVT::i8:
+ Opcode = NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg64;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::INT_PTX_LDG_G_v2i16_ELE_areg64;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::INT_PTX_LDG_G_v2i32_ELE_areg64;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::INT_PTX_LDG_G_v2i64_ELE_areg64;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::INT_PTX_LDG_G_v2f32_ELE_areg64;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::INT_PTX_LDG_G_v2f64_ELE_areg64;
+ break;
+ }
+ break;
+ case NVPTXISD::LDUV2:
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return nullptr;
+ case MVT::i8:
+ Opcode = NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg64;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::INT_PTX_LDU_G_v2i16_ELE_areg64;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::INT_PTX_LDU_G_v2i32_ELE_areg64;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::INT_PTX_LDU_G_v2i64_ELE_areg64;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::INT_PTX_LDU_G_v2f32_ELE_areg64;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::INT_PTX_LDU_G_v2f64_ELE_areg64;
+ break;
+ }
+ break;
+ case NVPTXISD::LDGV4:
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return nullptr;
+ case MVT::i8:
+ Opcode = NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg64;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::INT_PTX_LDG_G_v4i16_ELE_areg64;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::INT_PTX_LDG_G_v4i32_ELE_areg64;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::INT_PTX_LDG_G_v4f32_ELE_areg64;
+ break;
+ }
+ break;
+ case NVPTXISD::LDUV4:
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return nullptr;
+ case MVT::i8:
+ Opcode = NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg64;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::INT_PTX_LDU_G_v4i16_ELE_areg64;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::INT_PTX_LDU_G_v4i32_ELE_areg64;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::INT_PTX_LDU_G_v4f32_ELE_areg64;
+ break;
+ }
+ break;
+ }
+ } else {
+ switch (N->getOpcode()) {
+ default:
+ return nullptr;
+ case ISD::INTRINSIC_W_CHAIN:
+ if (IsLDG) {
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return nullptr;
+ case MVT::i8:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i8areg;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i16areg;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i32areg;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i64areg;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f32areg;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f64areg;
+ break;
+ }
+ } else {
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return nullptr;
+ case MVT::i8:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i8areg;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i16areg;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i32areg;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i64areg;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f32areg;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f64areg;
+ break;
+ }
+ }
+ break;
+ case NVPTXISD::LDGV2:
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return nullptr;
+ case MVT::i8:
+ Opcode = NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg32;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::INT_PTX_LDG_G_v2i16_ELE_areg32;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::INT_PTX_LDG_G_v2i32_ELE_areg32;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::INT_PTX_LDG_G_v2i64_ELE_areg32;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::INT_PTX_LDG_G_v2f32_ELE_areg32;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::INT_PTX_LDG_G_v2f64_ELE_areg32;
+ break;
+ }
+ break;
+ case NVPTXISD::LDUV2:
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return nullptr;
+ case MVT::i8:
+ Opcode = NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg32;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::INT_PTX_LDU_G_v2i16_ELE_areg32;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::INT_PTX_LDU_G_v2i32_ELE_areg32;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::INT_PTX_LDU_G_v2i64_ELE_areg32;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::INT_PTX_LDU_G_v2f32_ELE_areg32;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::INT_PTX_LDU_G_v2f64_ELE_areg32;
+ break;
+ }
+ break;
+ case NVPTXISD::LDGV4:
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return nullptr;
+ case MVT::i8:
+ Opcode = NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg32;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::INT_PTX_LDG_G_v4i16_ELE_areg32;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::INT_PTX_LDG_G_v4i32_ELE_areg32;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::INT_PTX_LDG_G_v4f32_ELE_areg32;
+ break;
+ }
+ break;
+ case NVPTXISD::LDUV4:
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return nullptr;
+ case MVT::i8:
+ Opcode = NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg32;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::INT_PTX_LDU_G_v4i16_ELE_areg32;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::INT_PTX_LDU_G_v4i32_ELE_areg32;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::INT_PTX_LDU_G_v4f32_ELE_areg32;
+ break;
+ }
+ break;
+ }
+ }
+
+ SDValue Ops[] = { Op1, Chain };
+ LD = CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops);
+ }
+
+ MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
+ MemRefs0[0] = Mem->getMemOperand();
+ cast<MachineSDNode>(LD)->setMemRefs(MemRefs0, MemRefs0 + 1);
+
+ return LD;
+}
+
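+// Select a machine instruction for a scalar StoreSDNode, choosing the ST_*
+// opcode from the stored value type and the addressing mode of the address
+// operand.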
+SDNode *NVPTXDAGToDAGISel::SelectStore(SDNode *N) {
+ SDLoc dl(N);
+ StoreSDNode *ST = cast<StoreSDNode>(N);
+ EVT StoreVT = ST->getMemoryVT();
+ SDNode *NVPTXST = nullptr;
+
+ // do not support pre/post inc/dec
+ if (ST->isIndexed())
+ return nullptr;
+
+ if (!StoreVT.isSimple())
+ return nullptr;
+
+ // Address Space Setting
+ unsigned int codeAddrSpace = getCodeAddrSpace(ST, Subtarget);
+
+ // Volatile Setting
+ // - .volatile is only available for .global and .shared
+ bool isVolatile = ST->isVolatile();
+ if (codeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
+ codeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
+ codeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
+ isVolatile = false;
+
+ // Vector Setting
+ MVT SimpleVT = StoreVT.getSimpleVT();
+ unsigned vecType = NVPTX::PTXLdStInstCode::Scalar;
+ if (SimpleVT.isVector()) {
+ unsigned num = SimpleVT.getVectorNumElements();
+ if (num == 2)
+ vecType = NVPTX::PTXLdStInstCode::V2;
+ else if (num == 4)
+ vecType = NVPTX::PTXLdStInstCode::V4;
+ else
+ return nullptr;
+ }
+
+ // Type Setting: toType + toTypeWidth
+ // - for integer type, always use 'u'
+ //
+ MVT ScalarVT = SimpleVT.getScalarType();
+ unsigned toTypeWidth = ScalarVT.getSizeInBits();
+ unsigned int toType;
+ if (ScalarVT.isFloatingPoint())
+ toType = NVPTX::PTXLdStInstCode::Float;
+ else
+ toType = NVPTX::PTXLdStInstCode::Unsigned;
+
+ // Create the machine instruction DAG
+ SDValue Chain = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SDValue N2 = N->getOperand(2);
+ SDValue Addr;
+ SDValue Offset, Base;
+ unsigned Opcode;
+ MVT::SimpleValueType SourceVT = N1.getNode()->getSimpleValueType(0).SimpleTy;
+
+ if (SelectDirectAddr(N2, Addr)) {
+ switch (SourceVT) {
+ case MVT::i8:
+ Opcode = NVPTX::ST_i8_avar;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::ST_i16_avar;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::ST_i32_avar;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::ST_i64_avar;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::ST_f32_avar;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::ST_f64_avar;
+ break;
+ default:
+ return nullptr;
+ }
+ SDValue Ops[] = { N1, getI32Imm(isVolatile), getI32Imm(codeAddrSpace),
+ getI32Imm(vecType), getI32Imm(toType),
+ getI32Imm(toTypeWidth), Addr, Chain };
+ NVPTXST = CurDAG->getMachineNode(Opcode, dl, MVT::Other, Ops);
+ } else if (Subtarget.is64Bit()
+ ? SelectADDRsi64(N2.getNode(), N2, Base, Offset)
+ : SelectADDRsi(N2.getNode(), N2, Base, Offset)) {
+ switch (SourceVT) {
+ case MVT::i8:
+ Opcode = NVPTX::ST_i8_asi;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::ST_i16_asi;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::ST_i32_asi;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::ST_i64_asi;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::ST_f32_asi;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::ST_f64_asi;
+ break;
+ default:
+ return nullptr;
+ }
+ SDValue Ops[] = { N1, getI32Imm(isVolatile), getI32Imm(codeAddrSpace),
+ getI32Imm(vecType), getI32Imm(toType),
+ getI32Imm(toTypeWidth), Base, Offset, Chain };
+ NVPTXST = CurDAG->getMachineNode(Opcode, dl, MVT::Other, Ops);
+ } else if (Subtarget.is64Bit()
+ ? SelectADDRri64(N2.getNode(), N2, Base, Offset)
+ : SelectADDRri(N2.getNode(), N2, Base, Offset)) {
+ if (Subtarget.is64Bit()) {
+ switch (SourceVT) {
+ case MVT::i8:
+ Opcode = NVPTX::ST_i8_ari_64;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::ST_i16_ari_64;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::ST_i32_ari_64;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::ST_i64_ari_64;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::ST_f32_ari_64;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::ST_f64_ari_64;
+ break;
+ default:
+ return nullptr;
+ }
+ } else {
+ switch (SourceVT) {
+ case MVT::i8:
+ Opcode = NVPTX::ST_i8_ari;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::ST_i16_ari;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::ST_i32_ari;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::ST_i64_ari;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::ST_f32_ari;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::ST_f64_ari;
+ break;
+ default:
+ return nullptr;
+ }
+ }
+ SDValue Ops[] = { N1, getI32Imm(isVolatile), getI32Imm(codeAddrSpace),
+ getI32Imm(vecType), getI32Imm(toType),
+ getI32Imm(toTypeWidth), Base, Offset, Chain };
+ NVPTXST = CurDAG->getMachineNode(Opcode, dl, MVT::Other, Ops);
+ } else {
+ if (Subtarget.is64Bit()) {
+ switch (SourceVT) {
+ case MVT::i8:
+ Opcode = NVPTX::ST_i8_areg_64;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::ST_i16_areg_64;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::ST_i32_areg_64;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::ST_i64_areg_64;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::ST_f32_areg_64;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::ST_f64_areg_64;
+ break;
+ default:
+ return nullptr;
+ }
+ } else {
+ switch (SourceVT) {
+ case MVT::i8:
+ Opcode = NVPTX::ST_i8_areg;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::ST_i16_areg;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::ST_i32_areg;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::ST_i64_areg;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::ST_f32_areg;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::ST_f64_areg;
+ break;
+ default:
+ return nullptr;
+ }
+ }
+ SDValue Ops[] = { N1, getI32Imm(isVolatile), getI32Imm(codeAddrSpace),
+ getI32Imm(vecType), getI32Imm(toType),
+ getI32Imm(toTypeWidth), N2, Chain };
+ NVPTXST = CurDAG->getMachineNode(Opcode, dl, MVT::Other, Ops);
+ }
+
+ if (NVPTXST) {
+ MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
+ MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand();
+ cast<MachineSDNode>(NVPTXST)->setMemRefs(MemRefs0, MemRefs0 + 1);
+ }
+
+ return NVPTXST;
+}
+
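+// Select a machine instruction for an NVPTXISD::StoreV2/StoreV4 node. The
+// stored elements are collected first, followed by the volatile, address-space,
+// and type operands and the selected address.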
+SDNode *NVPTXDAGToDAGISel::SelectStoreVector(SDNode *N) {
+ SDValue Chain = N->getOperand(0);
+ SDValue Op1 = N->getOperand(1);
+ SDValue Addr, Offset, Base;
+ unsigned Opcode;
+ SDLoc DL(N);
+ SDNode *ST;
+ EVT EltVT = Op1.getValueType();
+ MemSDNode *MemSD = cast<MemSDNode>(N);
+ EVT StoreVT = MemSD->getMemoryVT();
+
+ // Address Space Setting
+ unsigned CodeAddrSpace = getCodeAddrSpace(MemSD, Subtarget);
+
+ if (CodeAddrSpace == NVPTX::PTXLdStInstCode::CONSTANT) {
+ report_fatal_error("Cannot store to pointer that points to constant "
+ "memory space");
+ }
+
+ // Volatile Setting
+ // - .volatile is only available for .global and .shared
+ bool IsVolatile = MemSD->isVolatile();
+ if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
+ CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
+ CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
+ IsVolatile = false;
+
+ // Type Setting: toType + toTypeWidth
+ // - for integer type, always use 'u'
+ assert(StoreVT.isSimple() && "Store value is not simple");
+ MVT ScalarVT = StoreVT.getSimpleVT().getScalarType();
+ unsigned ToTypeWidth = ScalarVT.getSizeInBits();
+ unsigned ToType;
+ if (ScalarVT.isFloatingPoint())
+ ToType = NVPTX::PTXLdStInstCode::Float;
+ else
+ ToType = NVPTX::PTXLdStInstCode::Unsigned;
+
+ SmallVector<SDValue, 12> StOps;
+ SDValue N2;
+ unsigned VecType;
+
+ switch (N->getOpcode()) {
+ case NVPTXISD::StoreV2:
+ VecType = NVPTX::PTXLdStInstCode::V2;
+ StOps.push_back(N->getOperand(1));
+ StOps.push_back(N->getOperand(2));
+ N2 = N->getOperand(3);
+ break;
+ case NVPTXISD::StoreV4:
+ VecType = NVPTX::PTXLdStInstCode::V4;
+ StOps.push_back(N->getOperand(1));
+ StOps.push_back(N->getOperand(2));
+ StOps.push_back(N->getOperand(3));
+ StOps.push_back(N->getOperand(4));
+ N2 = N->getOperand(5);
+ break;
+ default:
+ return nullptr;
+ }
+
+ StOps.push_back(getI32Imm(IsVolatile));
+ StOps.push_back(getI32Imm(CodeAddrSpace));
+ StOps.push_back(getI32Imm(VecType));
+ StOps.push_back(getI32Imm(ToType));
+ StOps.push_back(getI32Imm(ToTypeWidth));
+
+ if (SelectDirectAddr(N2, Addr)) {
+ switch (N->getOpcode()) {
+ default:
+ return nullptr;
+ case NVPTXISD::StoreV2:
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return nullptr;
+ case MVT::i8:
+ Opcode = NVPTX::STV_i8_v2_avar;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::STV_i16_v2_avar;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::STV_i32_v2_avar;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::STV_i64_v2_avar;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::STV_f32_v2_avar;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::STV_f64_v2_avar;
+ break;
+ }
+ break;
+ case NVPTXISD::StoreV4:
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return nullptr;
+ case MVT::i8:
+ Opcode = NVPTX::STV_i8_v4_avar;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::STV_i16_v4_avar;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::STV_i32_v4_avar;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::STV_f32_v4_avar;
+ break;
+ }
+ break;
+ }
+ StOps.push_back(Addr);
+ } else if (Subtarget.is64Bit()
+ ? SelectADDRsi64(N2.getNode(), N2, Base, Offset)
+ : SelectADDRsi(N2.getNode(), N2, Base, Offset)) {
+ switch (N->getOpcode()) {
+ default:
+ return nullptr;
+ case NVPTXISD::StoreV2:
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return nullptr;
+ case MVT::i8:
+ Opcode = NVPTX::STV_i8_v2_asi;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::STV_i16_v2_asi;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::STV_i32_v2_asi;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::STV_i64_v2_asi;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::STV_f32_v2_asi;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::STV_f64_v2_asi;
+ break;
+ }
+ break;
+ case NVPTXISD::StoreV4:
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return nullptr;
+ case MVT::i8:
+ Opcode = NVPTX::STV_i8_v4_asi;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::STV_i16_v4_asi;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::STV_i32_v4_asi;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::STV_f32_v4_asi;
+ break;
+ }
+ break;
+ }
+ StOps.push_back(Base);
+ StOps.push_back(Offset);
+ } else if (Subtarget.is64Bit()
+ ? SelectADDRri64(N2.getNode(), N2, Base, Offset)
+ : SelectADDRri(N2.getNode(), N2, Base, Offset)) {
+ if (Subtarget.is64Bit()) {
+ switch (N->getOpcode()) {
+ default:
+ return nullptr;
+ case NVPTXISD::StoreV2:
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return nullptr;
+ case MVT::i8:
+ Opcode = NVPTX::STV_i8_v2_ari_64;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::STV_i16_v2_ari_64;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::STV_i32_v2_ari_64;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::STV_i64_v2_ari_64;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::STV_f32_v2_ari_64;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::STV_f64_v2_ari_64;
+ break;
+ }
+ break;
+ case NVPTXISD::StoreV4:
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return nullptr;
+ case MVT::i8:
+ Opcode = NVPTX::STV_i8_v4_ari_64;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::STV_i16_v4_ari_64;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::STV_i32_v4_ari_64;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::STV_f32_v4_ari_64;
+ break;
+ }
+ break;
+ }
+ } else {
+ switch (N->getOpcode()) {
+ default:
+ return nullptr;
+ case NVPTXISD::StoreV2:
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return nullptr;
+ case MVT::i8:
+ Opcode = NVPTX::STV_i8_v2_ari;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::STV_i16_v2_ari;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::STV_i32_v2_ari;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::STV_i64_v2_ari;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::STV_f32_v2_ari;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::STV_f64_v2_ari;
+ break;
+ }
+ break;
+ case NVPTXISD::StoreV4:
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return nullptr;
+ case MVT::i8:
+ Opcode = NVPTX::STV_i8_v4_ari;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::STV_i16_v4_ari;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::STV_i32_v4_ari;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::STV_f32_v4_ari;
+ break;
+ }
+ break;
+ }
+ }
+ StOps.push_back(Base);
+ StOps.push_back(Offset);
+ } else {
+ if (Subtarget.is64Bit()) {
+ switch (N->getOpcode()) {
+ default:
+ return nullptr;
+ case NVPTXISD::StoreV2:
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return nullptr;
+ case MVT::i8:
+ Opcode = NVPTX::STV_i8_v2_areg_64;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::STV_i16_v2_areg_64;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::STV_i32_v2_areg_64;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::STV_i64_v2_areg_64;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::STV_f32_v2_areg_64;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::STV_f64_v2_areg_64;
+ break;
+ }
+ break;
+ case NVPTXISD::StoreV4:
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return nullptr;
+ case MVT::i8:
+ Opcode = NVPTX::STV_i8_v4_areg_64;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::STV_i16_v4_areg_64;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::STV_i32_v4_areg_64;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::STV_f32_v4_areg_64;
+ break;
+ }
+ break;
+ }
+ } else {
+ switch (N->getOpcode()) {
+ default:
+ return nullptr;
+ case NVPTXISD::StoreV2:
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return nullptr;
+ case MVT::i8:
+ Opcode = NVPTX::STV_i8_v2_areg;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::STV_i16_v2_areg;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::STV_i32_v2_areg;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::STV_i64_v2_areg;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::STV_f32_v2_areg;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::STV_f64_v2_areg;
+ break;
+ }
+ break;
+ case NVPTXISD::StoreV4:
+ switch (EltVT.getSimpleVT().SimpleTy) {
+ default:
+ return nullptr;
+ case MVT::i8:
+ Opcode = NVPTX::STV_i8_v4_areg;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::STV_i16_v4_areg;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::STV_i32_v4_areg;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::STV_f32_v4_areg;
+ break;
+ }
+ break;
+ }
+ }
+ StOps.push_back(N2);
+ }
+
+ StOps.push_back(Chain);
+
+ ST = CurDAG->getMachineNode(Opcode, DL, MVT::Other, StOps);
+
+ MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
+ MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand();
+ cast<MachineSDNode>(ST)->setMemRefs(MemRefs0, MemRefs0 + 1);
+
+ return ST;
+}
+
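+// Select a LoadParamMem* instruction that reads one, two, or four elements of
+// an incoming formal parameter at a constant offset.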
+SDNode *NVPTXDAGToDAGISel::SelectLoadParam(SDNode *Node) {
+ SDValue Chain = Node->getOperand(0);
+ SDValue Offset = Node->getOperand(2);
+ SDValue Flag = Node->getOperand(3);
+ SDLoc DL(Node);
+ MemSDNode *Mem = cast<MemSDNode>(Node);
+
+ unsigned VecSize;
+ switch (Node->getOpcode()) {
+ default:
+ return nullptr;
+ case NVPTXISD::LoadParam:
+ VecSize = 1;
+ break;
+ case NVPTXISD::LoadParamV2:
+ VecSize = 2;
+ break;
+ case NVPTXISD::LoadParamV4:
+ VecSize = 4;
+ break;
+ }
+
+ EVT EltVT = Node->getValueType(0);
+ EVT MemVT = Mem->getMemoryVT();
+
+ unsigned Opc = 0;
+
+ switch (VecSize) {
+ default:
+ return nullptr;
+ case 1:
+ switch (MemVT.getSimpleVT().SimpleTy) {
+ default:
+ return nullptr;
+ case MVT::i1:
+ Opc = NVPTX::LoadParamMemI8;
+ break;
+ case MVT::i8:
+ Opc = NVPTX::LoadParamMemI8;
+ break;
+ case MVT::i16:
+ Opc = NVPTX::LoadParamMemI16;
+ break;
+ case MVT::i32:
+ Opc = NVPTX::LoadParamMemI32;
+ break;
+ case MVT::i64:
+ Opc = NVPTX::LoadParamMemI64;
+ break;
+ case MVT::f32:
+ Opc = NVPTX::LoadParamMemF32;
+ break;
+ case MVT::f64:
+ Opc = NVPTX::LoadParamMemF64;
+ break;
+ }
+ break;
+ case 2:
+ switch (MemVT.getSimpleVT().SimpleTy) {
+ default:
+ return nullptr;
+ case MVT::i1:
+ Opc = NVPTX::LoadParamMemV2I8;
+ break;
+ case MVT::i8:
+ Opc = NVPTX::LoadParamMemV2I8;
+ break;
+ case MVT::i16:
+ Opc = NVPTX::LoadParamMemV2I16;
+ break;
+ case MVT::i32:
+ Opc = NVPTX::LoadParamMemV2I32;
+ break;
+ case MVT::i64:
+ Opc = NVPTX::LoadParamMemV2I64;
+ break;
+ case MVT::f32:
+ Opc = NVPTX::LoadParamMemV2F32;
+ break;
+ case MVT::f64:
+ Opc = NVPTX::LoadParamMemV2F64;
+ break;
+ }
+ break;
+ case 4:
+ switch (MemVT.getSimpleVT().SimpleTy) {
+ default:
+ return nullptr;
+ case MVT::i1:
+ Opc = NVPTX::LoadParamMemV4I8;
+ break;
+ case MVT::i8:
+ Opc = NVPTX::LoadParamMemV4I8;
+ break;
+ case MVT::i16:
+ Opc = NVPTX::LoadParamMemV4I16;
+ break;
+ case MVT::i32:
+ Opc = NVPTX::LoadParamMemV4I32;
+ break;
+ case MVT::f32:
+ Opc = NVPTX::LoadParamMemV4F32;
+ break;
+ }
+ break;
+ }
+
+ SDVTList VTs;
+ if (VecSize == 1) {
+ VTs = CurDAG->getVTList(EltVT, MVT::Other, MVT::Glue);
+ } else if (VecSize == 2) {
+ VTs = CurDAG->getVTList(EltVT, EltVT, MVT::Other, MVT::Glue);
+ } else {
+ EVT EVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other, MVT::Glue };
+ VTs = CurDAG->getVTList(EVTs);
+ }
+
+ unsigned OffsetVal = cast<ConstantSDNode>(Offset)->getZExtValue();
+
+ SmallVector<SDValue, 2> Ops;
+ Ops.push_back(CurDAG->getTargetConstant(OffsetVal, MVT::i32));
+ Ops.push_back(Chain);
+ Ops.push_back(Flag);
+
+ SDNode *Ret =
+ CurDAG->getMachineNode(Opc, DL, VTs, Ops);
+ return Ret;
+}
+
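+// Select a StoreRetval* instruction that writes one, two, or four return
+// values at a constant offset; i1 values are widened and stored as i8.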
+SDNode *NVPTXDAGToDAGISel::SelectStoreRetval(SDNode *N) {
+ SDLoc DL(N);
+ SDValue Chain = N->getOperand(0);
+ SDValue Offset = N->getOperand(1);
+ unsigned OffsetVal = cast<ConstantSDNode>(Offset)->getZExtValue();
+ MemSDNode *Mem = cast<MemSDNode>(N);
+
+ // How many elements do we have?
+ unsigned NumElts = 1;
+ switch (N->getOpcode()) {
+ default:
+ return nullptr;
+ case NVPTXISD::StoreRetval:
+ NumElts = 1;
+ break;
+ case NVPTXISD::StoreRetvalV2:
+ NumElts = 2;
+ break;
+ case NVPTXISD::StoreRetvalV4:
+ NumElts = 4;
+ break;
+ }
+
+ // Build vector of operands
+ SmallVector<SDValue, 6> Ops;
+ for (unsigned i = 0; i < NumElts; ++i)
+ Ops.push_back(N->getOperand(i + 2));
+ Ops.push_back(CurDAG->getTargetConstant(OffsetVal, MVT::i32));
+ Ops.push_back(Chain);
+
+ // Determine target opcode
+ // If we have an i1, use an 8-bit store. The lowering code in
+ // NVPTXISelLowering will have already emitted an upcast.
+ unsigned Opcode = 0;
+ switch (NumElts) {
+ default:
+ return nullptr;
+ case 1:
+ switch (Mem->getMemoryVT().getSimpleVT().SimpleTy) {
+ default:
+ return nullptr;
+ case MVT::i1:
+ Opcode = NVPTX::StoreRetvalI8;
+ break;
+ case MVT::i8:
+ Opcode = NVPTX::StoreRetvalI8;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::StoreRetvalI16;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::StoreRetvalI32;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::StoreRetvalI64;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::StoreRetvalF32;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::StoreRetvalF64;
+ break;
+ }
+ break;
+ case 2:
+ switch (Mem->getMemoryVT().getSimpleVT().SimpleTy) {
+ default:
+ return nullptr;
+ case MVT::i1:
+ Opcode = NVPTX::StoreRetvalV2I8;
+ break;
+ case MVT::i8:
+ Opcode = NVPTX::StoreRetvalV2I8;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::StoreRetvalV2I16;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::StoreRetvalV2I32;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::StoreRetvalV2I64;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::StoreRetvalV2F32;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::StoreRetvalV2F64;
+ break;
+ }
+ break;
+ case 4:
+ switch (Mem->getMemoryVT().getSimpleVT().SimpleTy) {
+ default:
+ return nullptr;
+ case MVT::i1:
+ Opcode = NVPTX::StoreRetvalV4I8;
+ break;
+ case MVT::i8:
+ Opcode = NVPTX::StoreRetvalV4I8;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::StoreRetvalV4I16;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::StoreRetvalV4I32;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::StoreRetvalV4F32;
+ break;
+ }
+ break;
+ }
+
+ SDNode *Ret =
+ CurDAG->getMachineNode(Opcode, DL, MVT::Other, Ops);
+ MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
+ MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand();
+ cast<MachineSDNode>(Ret)->setMemRefs(MemRefs0, MemRefs0 + 1);
+
+ return Ret;
+}
+
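+// Select a StoreParam* instruction for an outgoing call argument. The
+// StoreParamU32/StoreParamS32 forms first widen a 16-bit value to 32 bits
+// with a CVT and then store it as an i32 parameter.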
+SDNode *NVPTXDAGToDAGISel::SelectStoreParam(SDNode *N) {
+ SDLoc DL(N);
+ SDValue Chain = N->getOperand(0);
+ SDValue Param = N->getOperand(1);
+ unsigned ParamVal = cast<ConstantSDNode>(Param)->getZExtValue();
+ SDValue Offset = N->getOperand(2);
+ unsigned OffsetVal = cast<ConstantSDNode>(Offset)->getZExtValue();
+ MemSDNode *Mem = cast<MemSDNode>(N);
+ SDValue Flag = N->getOperand(N->getNumOperands() - 1);
+
+ // How many elements do we have?
+ unsigned NumElts = 1;
+ switch (N->getOpcode()) {
+ default:
+ return nullptr;
+ case NVPTXISD::StoreParamU32:
+ case NVPTXISD::StoreParamS32:
+ case NVPTXISD::StoreParam:
+ NumElts = 1;
+ break;
+ case NVPTXISD::StoreParamV2:
+ NumElts = 2;
+ break;
+ case NVPTXISD::StoreParamV4:
+ NumElts = 4;
+ break;
+ }
+
+ // Build vector of operands
+ SmallVector<SDValue, 8> Ops;
+ for (unsigned i = 0; i < NumElts; ++i)
+ Ops.push_back(N->getOperand(i + 3));
+ Ops.push_back(CurDAG->getTargetConstant(ParamVal, MVT::i32));
+ Ops.push_back(CurDAG->getTargetConstant(OffsetVal, MVT::i32));
+ Ops.push_back(Chain);
+ Ops.push_back(Flag);
+
+ // Determine target opcode
+ // If we have an i1, use an 8-bit store. The lowering code in
+ // NVPTXISelLowering will have already emitted an upcast.
+ unsigned Opcode = 0;
+ switch (N->getOpcode()) {
+ default:
+ switch (NumElts) {
+ default:
+ return nullptr;
+ case 1:
+ switch (Mem->getMemoryVT().getSimpleVT().SimpleTy) {
+ default:
+ return nullptr;
+ case MVT::i1:
+ Opcode = NVPTX::StoreParamI8;
+ break;
+ case MVT::i8:
+ Opcode = NVPTX::StoreParamI8;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::StoreParamI16;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::StoreParamI32;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::StoreParamI64;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::StoreParamF32;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::StoreParamF64;
+ break;
+ }
+ break;
+ case 2:
+ switch (Mem->getMemoryVT().getSimpleVT().SimpleTy) {
+ default:
+ return nullptr;
+ case MVT::i1:
+ Opcode = NVPTX::StoreParamV2I8;
+ break;
+ case MVT::i8:
+ Opcode = NVPTX::StoreParamV2I8;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::StoreParamV2I16;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::StoreParamV2I32;
+ break;
+ case MVT::i64:
+ Opcode = NVPTX::StoreParamV2I64;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::StoreParamV2F32;
+ break;
+ case MVT::f64:
+ Opcode = NVPTX::StoreParamV2F64;
+ break;
+ }
+ break;
+ case 4:
+ switch (Mem->getMemoryVT().getSimpleVT().SimpleTy) {
+ default:
+ return nullptr;
+ case MVT::i1:
+ Opcode = NVPTX::StoreParamV4I8;
+ break;
+ case MVT::i8:
+ Opcode = NVPTX::StoreParamV4I8;
+ break;
+ case MVT::i16:
+ Opcode = NVPTX::StoreParamV4I16;
+ break;
+ case MVT::i32:
+ Opcode = NVPTX::StoreParamV4I32;
+ break;
+ case MVT::f32:
+ Opcode = NVPTX::StoreParamV4F32;
+ break;
+ }
+ break;
+ }
+ break;
+ // Special case: if we have a sign-extend/zero-extend node, insert the
+ // conversion instruction first, and use that as the value operand to
+ // the selected StoreParam node.
+ case NVPTXISD::StoreParamU32: {
+ Opcode = NVPTX::StoreParamI32;
+ SDValue CvtNone = CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE,
+ MVT::i32);
+ SDNode *Cvt = CurDAG->getMachineNode(NVPTX::CVT_u32_u16, DL,
+ MVT::i32, Ops[0], CvtNone);
+ Ops[0] = SDValue(Cvt, 0);
+ break;
+ }
+ case NVPTXISD::StoreParamS32: {
+ Opcode = NVPTX::StoreParamI32;
+ SDValue CvtNone = CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE,
+ MVT::i32);
+ SDNode *Cvt = CurDAG->getMachineNode(NVPTX::CVT_s32_s16, DL,
+ MVT::i32, Ops[0], CvtNone);
+ Ops[0] = SDValue(Cvt, 0);
+ break;
+ }
+ }
+
+ SDVTList RetVTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
+ SDNode *Ret =
+ CurDAG->getMachineNode(Opcode, DL, RetVTs, Ops);
+ MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
+ MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand();
+ cast<MachineSDNode>(Ret)->setMemRefs(MemRefs0, MemRefs0 + 1);
+
+ return Ret;
+}
+
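+// Map a texture fetch or tld4 NVPTXISD node to the corresponding TEX_* or
+// TLD4_* machine opcode.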
+SDNode *NVPTXDAGToDAGISel::SelectTextureIntrinsic(SDNode *N) {
+ SDValue Chain = N->getOperand(0);
+ SDNode *Ret = nullptr;
+ unsigned Opc = 0;
+ SmallVector<SDValue, 8> Ops;
+
+ switch (N->getOpcode()) {
+ default: return nullptr;
+ case NVPTXISD::Tex1DFloatS32:
+ Opc = NVPTX::TEX_1D_F32_S32;
+ break;
+ case NVPTXISD::Tex1DFloatFloat:
+ Opc = NVPTX::TEX_1D_F32_F32;
+ break;
+ case NVPTXISD::Tex1DFloatFloatLevel:
+ Opc = NVPTX::TEX_1D_F32_F32_LEVEL;
+ break;
+ case NVPTXISD::Tex1DFloatFloatGrad:
+ Opc = NVPTX::TEX_1D_F32_F32_GRAD;
+ break;
+ case NVPTXISD::Tex1DS32S32:
+ Opc = NVPTX::TEX_1D_S32_S32;
+ break;
+ case NVPTXISD::Tex1DS32Float:
+ Opc = NVPTX::TEX_1D_S32_F32;
+ break;
+ case NVPTXISD::Tex1DS32FloatLevel:
+ Opc = NVPTX::TEX_1D_S32_F32_LEVEL;
+ break;
+ case NVPTXISD::Tex1DS32FloatGrad:
+ Opc = NVPTX::TEX_1D_S32_F32_GRAD;
+ break;
+ case NVPTXISD::Tex1DU32S32:
+ Opc = NVPTX::TEX_1D_U32_S32;
+ break;
+ case NVPTXISD::Tex1DU32Float:
+ Opc = NVPTX::TEX_1D_U32_F32;
+ break;
+ case NVPTXISD::Tex1DU32FloatLevel:
+ Opc = NVPTX::TEX_1D_U32_F32_LEVEL;
+ break;
+ case NVPTXISD::Tex1DU32FloatGrad:
+ Opc = NVPTX::TEX_1D_U32_F32_GRAD;
+ break;
+ case NVPTXISD::Tex1DArrayFloatS32:
+ Opc = NVPTX::TEX_1D_ARRAY_F32_S32;
+ break;
+ case NVPTXISD::Tex1DArrayFloatFloat:
+ Opc = NVPTX::TEX_1D_ARRAY_F32_F32;
+ break;
+ case NVPTXISD::Tex1DArrayFloatFloatLevel:
+ Opc = NVPTX::TEX_1D_ARRAY_F32_F32_LEVEL;
+ break;
+ case NVPTXISD::Tex1DArrayFloatFloatGrad:
+ Opc = NVPTX::TEX_1D_ARRAY_F32_F32_GRAD;
+ break;
+ case NVPTXISD::Tex1DArrayS32S32:
+ Opc = NVPTX::TEX_1D_ARRAY_S32_S32;
+ break;
+ case NVPTXISD::Tex1DArrayS32Float:
+ Opc = NVPTX::TEX_1D_ARRAY_S32_F32;
+ break;
+ case NVPTXISD::Tex1DArrayS32FloatLevel:
+ Opc = NVPTX::TEX_1D_ARRAY_S32_F32_LEVEL;
+ break;
+ case NVPTXISD::Tex1DArrayS32FloatGrad:
+ Opc = NVPTX::TEX_1D_ARRAY_S32_F32_GRAD;
+ break;
+ case NVPTXISD::Tex1DArrayU32S32:
+ Opc = NVPTX::TEX_1D_ARRAY_U32_S32;
+ break;
+ case NVPTXISD::Tex1DArrayU32Float:
+ Opc = NVPTX::TEX_1D_ARRAY_U32_F32;
+ break;
+ case NVPTXISD::Tex1DArrayU32FloatLevel:
+ Opc = NVPTX::TEX_1D_ARRAY_U32_F32_LEVEL;
+ break;
+ case NVPTXISD::Tex1DArrayU32FloatGrad:
+ Opc = NVPTX::TEX_1D_ARRAY_U32_F32_GRAD;
+ break;
+ case NVPTXISD::Tex2DFloatS32:
+ Opc = NVPTX::TEX_2D_F32_S32;
+ break;
+ case NVPTXISD::Tex2DFloatFloat:
+ Opc = NVPTX::TEX_2D_F32_F32;
+ break;
+ case NVPTXISD::Tex2DFloatFloatLevel:
+ Opc = NVPTX::TEX_2D_F32_F32_LEVEL;
+ break;
+ case NVPTXISD::Tex2DFloatFloatGrad:
+ Opc = NVPTX::TEX_2D_F32_F32_GRAD;
+ break;
+ case NVPTXISD::Tex2DS32S32:
+ Opc = NVPTX::TEX_2D_S32_S32;
+ break;
+ case NVPTXISD::Tex2DS32Float:
+ Opc = NVPTX::TEX_2D_S32_F32;
+ break;
+ case NVPTXISD::Tex2DS32FloatLevel:
+ Opc = NVPTX::TEX_2D_S32_F32_LEVEL;
+ break;
+ case NVPTXISD::Tex2DS32FloatGrad:
+ Opc = NVPTX::TEX_2D_S32_F32_GRAD;
+ break;
+ case NVPTXISD::Tex2DU32S32:
+ Opc = NVPTX::TEX_2D_U32_S32;
+ break;
+ case NVPTXISD::Tex2DU32Float:
+ Opc = NVPTX::TEX_2D_U32_F32;
+ break;
+ case NVPTXISD::Tex2DU32FloatLevel:
+ Opc = NVPTX::TEX_2D_U32_F32_LEVEL;
+ break;
+ case NVPTXISD::Tex2DU32FloatGrad:
+ Opc = NVPTX::TEX_2D_U32_F32_GRAD;
+ break;
+ case NVPTXISD::Tex2DArrayFloatS32:
+ Opc = NVPTX::TEX_2D_ARRAY_F32_S32;
+ break;
+ case NVPTXISD::Tex2DArrayFloatFloat:
+ Opc = NVPTX::TEX_2D_ARRAY_F32_F32;
+ break;
+ case NVPTXISD::Tex2DArrayFloatFloatLevel:
+ Opc = NVPTX::TEX_2D_ARRAY_F32_F32_LEVEL;
+ break;
+ case NVPTXISD::Tex2DArrayFloatFloatGrad:
+ Opc = NVPTX::TEX_2D_ARRAY_F32_F32_GRAD;
+ break;
+ case NVPTXISD::Tex2DArrayS32S32:
+ Opc = NVPTX::TEX_2D_ARRAY_S32_S32;
+ break;
+ case NVPTXISD::Tex2DArrayS32Float:
+ Opc = NVPTX::TEX_2D_ARRAY_S32_F32;
+ break;
+ case NVPTXISD::Tex2DArrayS32FloatLevel:
+ Opc = NVPTX::TEX_2D_ARRAY_S32_F32_LEVEL;
+ break;
+ case NVPTXISD::Tex2DArrayS32FloatGrad:
+ Opc = NVPTX::TEX_2D_ARRAY_S32_F32_GRAD;
+ break;
+ case NVPTXISD::Tex2DArrayU32S32:
+ Opc = NVPTX::TEX_2D_ARRAY_U32_S32;
+ break;
+ case NVPTXISD::Tex2DArrayU32Float:
+ Opc = NVPTX::TEX_2D_ARRAY_U32_F32;
+ break;
+ case NVPTXISD::Tex2DArrayU32FloatLevel:
+ Opc = NVPTX::TEX_2D_ARRAY_U32_F32_LEVEL;
+ break;
+ case NVPTXISD::Tex2DArrayU32FloatGrad:
+ Opc = NVPTX::TEX_2D_ARRAY_U32_F32_GRAD;
+ break;
+ case NVPTXISD::Tex3DFloatS32:
+ Opc = NVPTX::TEX_3D_F32_S32;
+ break;
+ case NVPTXISD::Tex3DFloatFloat:
+ Opc = NVPTX::TEX_3D_F32_F32;
+ break;
+ case NVPTXISD::Tex3DFloatFloatLevel:
+ Opc = NVPTX::TEX_3D_F32_F32_LEVEL;
+ break;
+ case NVPTXISD::Tex3DFloatFloatGrad:
+ Opc = NVPTX::TEX_3D_F32_F32_GRAD;
+ break;
+ case NVPTXISD::Tex3DS32S32:
+ Opc = NVPTX::TEX_3D_S32_S32;
+ break;
+ case NVPTXISD::Tex3DS32Float:
+ Opc = NVPTX::TEX_3D_S32_F32;
+ break;
+ case NVPTXISD::Tex3DS32FloatLevel:
+ Opc = NVPTX::TEX_3D_S32_F32_LEVEL;
+ break;
+ case NVPTXISD::Tex3DS32FloatGrad:
+ Opc = NVPTX::TEX_3D_S32_F32_GRAD;
+ break;
+ case NVPTXISD::Tex3DU32S32:
+ Opc = NVPTX::TEX_3D_U32_S32;
+ break;
+ case NVPTXISD::Tex3DU32Float:
+ Opc = NVPTX::TEX_3D_U32_F32;
+ break;
+ case NVPTXISD::Tex3DU32FloatLevel:
+ Opc = NVPTX::TEX_3D_U32_F32_LEVEL;
+ break;
+ case NVPTXISD::Tex3DU32FloatGrad:
+ Opc = NVPTX::TEX_3D_U32_F32_GRAD;
+ break;
+ case NVPTXISD::TexCubeFloatFloat:
+ Opc = NVPTX::TEX_CUBE_F32_F32;
+ break;
+ case NVPTXISD::TexCubeFloatFloatLevel:
+ Opc = NVPTX::TEX_CUBE_F32_F32_LEVEL;
+ break;
+ case NVPTXISD::TexCubeS32Float:
+ Opc = NVPTX::TEX_CUBE_S32_F32;
+ break;
+ case NVPTXISD::TexCubeS32FloatLevel:
+ Opc = NVPTX::TEX_CUBE_S32_F32_LEVEL;
+ break;
+ case NVPTXISD::TexCubeU32Float:
+ Opc = NVPTX::TEX_CUBE_U32_F32;
+ break;
+ case NVPTXISD::TexCubeU32FloatLevel:
+ Opc = NVPTX::TEX_CUBE_U32_F32_LEVEL;
+ break;
+ case NVPTXISD::TexCubeArrayFloatFloat:
+ Opc = NVPTX::TEX_CUBE_ARRAY_F32_F32;
+ break;
+ case NVPTXISD::TexCubeArrayFloatFloatLevel:
+ Opc = NVPTX::TEX_CUBE_ARRAY_F32_F32_LEVEL;
+ break;
+ case NVPTXISD::TexCubeArrayS32Float:
+ Opc = NVPTX::TEX_CUBE_ARRAY_S32_F32;
+ break;
+ case NVPTXISD::TexCubeArrayS32FloatLevel:
+ Opc = NVPTX::TEX_CUBE_ARRAY_S32_F32_LEVEL;
+ break;
+ case NVPTXISD::TexCubeArrayU32Float:
+ Opc = NVPTX::TEX_CUBE_ARRAY_U32_F32;
+ break;
+ case NVPTXISD::TexCubeArrayU32FloatLevel:
+ Opc = NVPTX::TEX_CUBE_ARRAY_U32_F32_LEVEL;
+ break;
+ case NVPTXISD::Tld4R2DFloatFloat:
+ Opc = NVPTX::TLD4_R_2D_F32_F32;
+ break;
+ case NVPTXISD::Tld4G2DFloatFloat:
+ Opc = NVPTX::TLD4_G_2D_F32_F32;
+ break;
+ case NVPTXISD::Tld4B2DFloatFloat:
+ Opc = NVPTX::TLD4_B_2D_F32_F32;
+ break;
+ case NVPTXISD::Tld4A2DFloatFloat:
+ Opc = NVPTX::TLD4_A_2D_F32_F32;
+ break;
+ case NVPTXISD::Tld4R2DS64Float:
+ Opc = NVPTX::TLD4_R_2D_S32_F32;
+ break;
+ case NVPTXISD::Tld4G2DS64Float:
+ Opc = NVPTX::TLD4_G_2D_S32_F32;
+ break;
+ case NVPTXISD::Tld4B2DS64Float:
+ Opc = NVPTX::TLD4_B_2D_S32_F32;
+ break;
+ case NVPTXISD::Tld4A2DS64Float:
+ Opc = NVPTX::TLD4_A_2D_S32_F32;
+ break;
+ case NVPTXISD::Tld4R2DU64Float:
+ Opc = NVPTX::TLD4_R_2D_U32_F32;
+ break;
+ case NVPTXISD::Tld4G2DU64Float:
+ Opc = NVPTX::TLD4_G_2D_U32_F32;
+ break;
+ case NVPTXISD::Tld4B2DU64Float:
+ Opc = NVPTX::TLD4_B_2D_U32_F32;
+ break;
+ case NVPTXISD::Tld4A2DU64Float:
+ Opc = NVPTX::TLD4_A_2D_U32_F32;
+ break;
+ case NVPTXISD::TexUnified1DFloatS32:
+ Opc = NVPTX::TEX_UNIFIED_1D_F32_S32;
+ break;
+ case NVPTXISD::TexUnified1DFloatFloat:
+ Opc = NVPTX::TEX_UNIFIED_1D_F32_F32;
+ break;
+ case NVPTXISD::TexUnified1DFloatFloatLevel:
+ Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_LEVEL;
+ break;
+ case NVPTXISD::TexUnified1DFloatFloatGrad:
+ Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_GRAD;
+ break;
+ case NVPTXISD::TexUnified1DS32S32:
+ Opc = NVPTX::TEX_UNIFIED_1D_S32_S32;
+ break;
+ case NVPTXISD::TexUnified1DS32Float:
+ Opc = NVPTX::TEX_UNIFIED_1D_S32_F32;
+ break;
+ case NVPTXISD::TexUnified1DS32FloatLevel:
+ Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_LEVEL;
+ break;
+ case NVPTXISD::TexUnified1DS32FloatGrad:
+ Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_GRAD;
+ break;
+ case NVPTXISD::TexUnified1DU32S32:
+ Opc = NVPTX::TEX_UNIFIED_1D_U32_S32;
+ break;
+ case NVPTXISD::TexUnified1DU32Float:
+ Opc = NVPTX::TEX_UNIFIED_1D_U32_F32;
+ break;
+ case NVPTXISD::TexUnified1DU32FloatLevel:
+ Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_LEVEL;
+ break;
+ case NVPTXISD::TexUnified1DU32FloatGrad:
+ Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_GRAD;
+ break;
+ case NVPTXISD::TexUnified1DArrayFloatS32:
+ Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_S32;
+ break;
+ case NVPTXISD::TexUnified1DArrayFloatFloat:
+ Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32;
+ break;
+ case NVPTXISD::TexUnified1DArrayFloatFloatLevel:
+ Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_LEVEL;
+ break;
+ case NVPTXISD::TexUnified1DArrayFloatFloatGrad:
+ Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_GRAD;
+ break;
+ case NVPTXISD::TexUnified1DArrayS32S32:
+ Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_S32;
+ break;
+ case NVPTXISD::TexUnified1DArrayS32Float:
+ Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32;
+ break;
+ case NVPTXISD::TexUnified1DArrayS32FloatLevel:
+ Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_LEVEL;
+ break;
+ case NVPTXISD::TexUnified1DArrayS32FloatGrad:
+ Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_GRAD;
+ break;
+ case NVPTXISD::TexUnified1DArrayU32S32:
+ Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_S32;
+ break;
+ case NVPTXISD::TexUnified1DArrayU32Float:
+ Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32;
+ break;
+ case NVPTXISD::TexUnified1DArrayU32FloatLevel:
+ Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_LEVEL;
+ break;
+ case NVPTXISD::TexUnified1DArrayU32FloatGrad:
+ Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_GRAD;
+ break;
+ case NVPTXISD::TexUnified2DFloatS32:
+ Opc = NVPTX::TEX_UNIFIED_2D_F32_S32;
+ break;
+ case NVPTXISD::TexUnified2DFloatFloat:
+ Opc = NVPTX::TEX_UNIFIED_2D_F32_F32;
+ break;
+ case NVPTXISD::TexUnified2DFloatFloatLevel:
+ Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_LEVEL;
+ break;
+ case NVPTXISD::TexUnified2DFloatFloatGrad:
+ Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_GRAD;
+ break;
+ case NVPTXISD::TexUnified2DS32S32:
+ Opc = NVPTX::TEX_UNIFIED_2D_S32_S32;
+ break;
+ case NVPTXISD::TexUnified2DS32Float:
+ Opc = NVPTX::TEX_UNIFIED_2D_S32_F32;
+ break;
+ case NVPTXISD::TexUnified2DS32FloatLevel:
+ Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_LEVEL;
+ break;
+ case NVPTXISD::TexUnified2DS32FloatGrad:
+ Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_GRAD;
+ break;
+ case NVPTXISD::TexUnified2DU32S32:
+ Opc = NVPTX::TEX_UNIFIED_2D_U32_S32;
+ break;
+ case NVPTXISD::TexUnified2DU32Float:
+ Opc = NVPTX::TEX_UNIFIED_2D_U32_F32;
+ break;
+ case NVPTXISD::TexUnified2DU32FloatLevel:
+ Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_LEVEL;
+ break;
+ case NVPTXISD::TexUnified2DU32FloatGrad:
+ Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_GRAD;
+ break;
+ case NVPTXISD::TexUnified2DArrayFloatS32:
+ Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_S32;
+ break;
+ case NVPTXISD::TexUnified2DArrayFloatFloat:
+ Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32;
+ break;
+ case NVPTXISD::TexUnified2DArrayFloatFloatLevel:
+ Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_LEVEL;
+ break;
+ case NVPTXISD::TexUnified2DArrayFloatFloatGrad:
+ Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_GRAD;
+ break;
+ case NVPTXISD::TexUnified2DArrayS32S32:
+ Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_S32;
+ break;
+ case NVPTXISD::TexUnified2DArrayS32Float:
+ Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32;
+ break;
+ case NVPTXISD::TexUnified2DArrayS32FloatLevel:
+ Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_LEVEL;
+ break;
+ case NVPTXISD::TexUnified2DArrayS32FloatGrad:
+ Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_GRAD;
+ break;
+ case NVPTXISD::TexUnified2DArrayU32S32:
+ Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_S32;
+ break;
+ case NVPTXISD::TexUnified2DArrayU32Float:
+ Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32;
+ break;
+ case NVPTXISD::TexUnified2DArrayU32FloatLevel:
+ Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_LEVEL;
+ break;
+ case NVPTXISD::TexUnified2DArrayU32FloatGrad:
+ Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_GRAD;
+ break;
+ case NVPTXISD::TexUnified3DFloatS32:
+ Opc = NVPTX::TEX_UNIFIED_3D_F32_S32;
+ break;
+ case NVPTXISD::TexUnified3DFloatFloat:
+ Opc = NVPTX::TEX_UNIFIED_3D_F32_F32;
+ break;
+ case NVPTXISD::TexUnified3DFloatFloatLevel:
+ Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_LEVEL;
+ break;
+ case NVPTXISD::TexUnified3DFloatFloatGrad:
+ Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_GRAD;
+ break;
+ case NVPTXISD::TexUnified3DS32S32:
+ Opc = NVPTX::TEX_UNIFIED_3D_S32_S32;
+ break;
+ case NVPTXISD::TexUnified3DS32Float:
+ Opc = NVPTX::TEX_UNIFIED_3D_S32_F32;
+ break;
+ case NVPTXISD::TexUnified3DS32FloatLevel:
+ Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_LEVEL;
+ break;
+ case NVPTXISD::TexUnified3DS32FloatGrad:
+ Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_GRAD;
+ break;
+ case NVPTXISD::TexUnified3DU32S32:
+ Opc = NVPTX::TEX_UNIFIED_3D_U32_S32;
+ break;
+ case NVPTXISD::TexUnified3DU32Float:
+ Opc = NVPTX::TEX_UNIFIED_3D_U32_F32;
+ break;
+ case NVPTXISD::TexUnified3DU32FloatLevel:
+ Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_LEVEL;
+ break;
+ case NVPTXISD::TexUnified3DU32FloatGrad:
+ Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_GRAD;
+ break;
+ case NVPTXISD::TexUnifiedCubeFloatFloat:
+ Opc = NVPTX::TEX_UNIFIED_CUBE_F32_F32;
+ break;
+ case NVPTXISD::TexUnifiedCubeFloatFloatLevel:
+ Opc = NVPTX::TEX_UNIFIED_CUBE_F32_F32_LEVEL;
+ break;
+ case NVPTXISD::TexUnifiedCubeS32Float:
+ Opc = NVPTX::TEX_UNIFIED_CUBE_S32_F32;
+ break;
+ case NVPTXISD::TexUnifiedCubeS32FloatLevel:
+ Opc = NVPTX::TEX_UNIFIED_CUBE_S32_F32_LEVEL;
+ break;
+ case NVPTXISD::TexUnifiedCubeU32Float:
+ Opc = NVPTX::TEX_UNIFIED_CUBE_U32_F32;
+ break;
+ case NVPTXISD::TexUnifiedCubeU32FloatLevel:
+ Opc = NVPTX::TEX_UNIFIED_CUBE_U32_F32_LEVEL;
+ break;
+ case NVPTXISD::TexUnifiedCubeArrayFloatFloat:
+ Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_F32_F32;
+ break;
+ case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel:
+ Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_F32_F32_LEVEL;
+ break;
+ case NVPTXISD::TexUnifiedCubeArrayS32Float:
+ Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_S32_F32;
+ break;
+ case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel:
+ Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_S32_F32_LEVEL;
+ break;
+ case NVPTXISD::TexUnifiedCubeArrayU32Float:
+ Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32;
+ break;
+ case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel:
+ Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32_LEVEL;
+ break;
+ case NVPTXISD::Tld4UnifiedR2DFloatFloat:
+ Opc = NVPTX::TLD4_UNIFIED_R_2D_F32_F32;
+ break;
+ case NVPTXISD::Tld4UnifiedG2DFloatFloat:
+ Opc = NVPTX::TLD4_UNIFIED_G_2D_F32_F32;
+ break;
+ case NVPTXISD::Tld4UnifiedB2DFloatFloat:
+ Opc = NVPTX::TLD4_UNIFIED_B_2D_F32_F32;
+ break;
+ case NVPTXISD::Tld4UnifiedA2DFloatFloat:
+ Opc = NVPTX::TLD4_UNIFIED_A_2D_F32_F32;
+ break;
+ case NVPTXISD::Tld4UnifiedR2DS64Float:
+ Opc = NVPTX::TLD4_UNIFIED_R_2D_S32_F32;
+ break;
+ case NVPTXISD::Tld4UnifiedG2DS64Float:
+ Opc = NVPTX::TLD4_UNIFIED_G_2D_S32_F32;
+ break;
+ case NVPTXISD::Tld4UnifiedB2DS64Float:
+ Opc = NVPTX::TLD4_UNIFIED_B_2D_S32_F32;
+ break;
+ case NVPTXISD::Tld4UnifiedA2DS64Float:
+ Opc = NVPTX::TLD4_UNIFIED_A_2D_S32_F32;
+ break;
+ case NVPTXISD::Tld4UnifiedR2DU64Float:
+ Opc = NVPTX::TLD4_UNIFIED_R_2D_U32_F32;
+ break;
+ case NVPTXISD::Tld4UnifiedG2DU64Float:
+ Opc = NVPTX::TLD4_UNIFIED_G_2D_U32_F32;
+ break;
+ case NVPTXISD::Tld4UnifiedB2DU64Float:
+ Opc = NVPTX::TLD4_UNIFIED_B_2D_U32_F32;
+ break;
+ case NVPTXISD::Tld4UnifiedA2DU64Float:
+ Opc = NVPTX::TLD4_UNIFIED_A_2D_U32_F32;
+ break;
+ }
+
+ // Copy over operands
+ for (unsigned i = 1; i < N->getNumOperands(); ++i) {
+ Ops.push_back(N->getOperand(i));
+ }
+
+ Ops.push_back(Chain);
+ Ret = CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops);
+ return Ret;
+}
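// Illustrative sketch (editor's aside, not part of this patch): the operand
// copy at the end of SelectTextureIntrinsic simply moves the chain -- operand
// 0 of the intrinsic node -- to the back, since the machine texture
// instructions expect the handle(s), coordinates, and any lod/gradient
// operands first and the chain last. A hypothetical helper with the same
// effect, assuming the usual LLVM SelectionDAG types:

static void rotateChainToEnd(SDNode *N, SmallVectorImpl<SDValue> &Ops) {
  for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
    Ops.push_back(N->getOperand(i));   // handle(s), coordinates, lod/grads
  Ops.push_back(N->getOperand(0));     // chain goes last
}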
+
+SDNode *NVPTXDAGToDAGISel::SelectSurfaceIntrinsic(SDNode *N) {
+ SDValue Chain = N->getOperand(0);
+ SDValue TexHandle = N->getOperand(1);
+ SDNode *Ret = nullptr;
+ unsigned Opc = 0;
+ SmallVector<SDValue, 8> Ops;
+ switch (N->getOpcode()) {
+ default: return nullptr;
+ case NVPTXISD::Suld1DI8Clamp:
+ Opc = NVPTX::SULD_1D_I8_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DI16Clamp:
+ Opc = NVPTX::SULD_1D_I16_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DI32Clamp:
+ Opc = NVPTX::SULD_1D_I32_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DI64Clamp:
+ Opc = NVPTX::SULD_1D_I64_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DV2I8Clamp:
+ Opc = NVPTX::SULD_1D_V2I8_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DV2I16Clamp:
+ Opc = NVPTX::SULD_1D_V2I16_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DV2I32Clamp:
+ Opc = NVPTX::SULD_1D_V2I32_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DV2I64Clamp:
+ Opc = NVPTX::SULD_1D_V2I64_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DV4I8Clamp:
+ Opc = NVPTX::SULD_1D_V4I8_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DV4I16Clamp:
+ Opc = NVPTX::SULD_1D_V4I16_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DV4I32Clamp:
+ Opc = NVPTX::SULD_1D_V4I32_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DArrayI8Clamp:
+ Opc = NVPTX::SULD_1D_ARRAY_I8_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DArrayI16Clamp:
+ Opc = NVPTX::SULD_1D_ARRAY_I16_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DArrayI32Clamp:
+ Opc = NVPTX::SULD_1D_ARRAY_I32_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DArrayI64Clamp:
+ Opc = NVPTX::SULD_1D_ARRAY_I64_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DArrayV2I8Clamp:
+ Opc = NVPTX::SULD_1D_ARRAY_V2I8_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DArrayV2I16Clamp:
+ Opc = NVPTX::SULD_1D_ARRAY_V2I16_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DArrayV2I32Clamp:
+ Opc = NVPTX::SULD_1D_ARRAY_V2I32_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DArrayV2I64Clamp:
+ Opc = NVPTX::SULD_1D_ARRAY_V2I64_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DArrayV4I8Clamp:
+ Opc = NVPTX::SULD_1D_ARRAY_V4I8_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DArrayV4I16Clamp:
+ Opc = NVPTX::SULD_1D_ARRAY_V4I16_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DArrayV4I32Clamp:
+ Opc = NVPTX::SULD_1D_ARRAY_V4I32_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DI8Clamp:
+ Opc = NVPTX::SULD_2D_I8_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DI16Clamp:
+ Opc = NVPTX::SULD_2D_I16_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DI32Clamp:
+ Opc = NVPTX::SULD_2D_I32_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DI64Clamp:
+ Opc = NVPTX::SULD_2D_I64_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DV2I8Clamp:
+ Opc = NVPTX::SULD_2D_V2I8_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DV2I16Clamp:
+ Opc = NVPTX::SULD_2D_V2I16_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DV2I32Clamp:
+ Opc = NVPTX::SULD_2D_V2I32_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DV2I64Clamp:
+ Opc = NVPTX::SULD_2D_V2I64_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DV4I8Clamp:
+ Opc = NVPTX::SULD_2D_V4I8_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DV4I16Clamp:
+ Opc = NVPTX::SULD_2D_V4I16_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DV4I32Clamp:
+ Opc = NVPTX::SULD_2D_V4I32_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DArrayI8Clamp:
+ Opc = NVPTX::SULD_2D_ARRAY_I8_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DArrayI16Clamp:
+ Opc = NVPTX::SULD_2D_ARRAY_I16_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DArrayI32Clamp:
+ Opc = NVPTX::SULD_2D_ARRAY_I32_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DArrayI64Clamp:
+ Opc = NVPTX::SULD_2D_ARRAY_I64_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DArrayV2I8Clamp:
+ Opc = NVPTX::SULD_2D_ARRAY_V2I8_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DArrayV2I16Clamp:
+ Opc = NVPTX::SULD_2D_ARRAY_V2I16_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DArrayV2I32Clamp:
+ Opc = NVPTX::SULD_2D_ARRAY_V2I32_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DArrayV2I64Clamp:
+ Opc = NVPTX::SULD_2D_ARRAY_V2I64_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DArrayV4I8Clamp:
+ Opc = NVPTX::SULD_2D_ARRAY_V4I8_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DArrayV4I16Clamp:
+ Opc = NVPTX::SULD_2D_ARRAY_V4I16_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DArrayV4I32Clamp:
+ Opc = NVPTX::SULD_2D_ARRAY_V4I32_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld3DI8Clamp:
+ Opc = NVPTX::SULD_3D_I8_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld3DI16Clamp:
+ Opc = NVPTX::SULD_3D_I16_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld3DI32Clamp:
+ Opc = NVPTX::SULD_3D_I32_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld3DI64Clamp:
+ Opc = NVPTX::SULD_3D_I64_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld3DV2I8Clamp:
+ Opc = NVPTX::SULD_3D_V2I8_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld3DV2I16Clamp:
+ Opc = NVPTX::SULD_3D_V2I16_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld3DV2I32Clamp:
+ Opc = NVPTX::SULD_3D_V2I32_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld3DV2I64Clamp:
+ Opc = NVPTX::SULD_3D_V2I64_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld3DV4I8Clamp:
+ Opc = NVPTX::SULD_3D_V4I8_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld3DV4I16Clamp:
+ Opc = NVPTX::SULD_3D_V4I16_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld3DV4I32Clamp:
+ Opc = NVPTX::SULD_3D_V4I32_CLAMP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DI8Trap:
+ Opc = NVPTX::SULD_1D_I8_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DI16Trap:
+ Opc = NVPTX::SULD_1D_I16_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DI32Trap:
+ Opc = NVPTX::SULD_1D_I32_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DI64Trap:
+ Opc = NVPTX::SULD_1D_I64_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DV2I8Trap:
+ Opc = NVPTX::SULD_1D_V2I8_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DV2I16Trap:
+ Opc = NVPTX::SULD_1D_V2I16_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DV2I32Trap:
+ Opc = NVPTX::SULD_1D_V2I32_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DV2I64Trap:
+ Opc = NVPTX::SULD_1D_V2I64_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DV4I8Trap:
+ Opc = NVPTX::SULD_1D_V4I8_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DV4I16Trap:
+ Opc = NVPTX::SULD_1D_V4I16_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DV4I32Trap:
+ Opc = NVPTX::SULD_1D_V4I32_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DArrayI8Trap:
+ Opc = NVPTX::SULD_1D_ARRAY_I8_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DArrayI16Trap:
+ Opc = NVPTX::SULD_1D_ARRAY_I16_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DArrayI32Trap:
+ Opc = NVPTX::SULD_1D_ARRAY_I32_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DArrayI64Trap:
+ Opc = NVPTX::SULD_1D_ARRAY_I64_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DArrayV2I8Trap:
+ Opc = NVPTX::SULD_1D_ARRAY_V2I8_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DArrayV2I16Trap:
+ Opc = NVPTX::SULD_1D_ARRAY_V2I16_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DArrayV2I32Trap:
+ Opc = NVPTX::SULD_1D_ARRAY_V2I32_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DArrayV2I64Trap:
+ Opc = NVPTX::SULD_1D_ARRAY_V2I64_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DArrayV4I8Trap:
+ Opc = NVPTX::SULD_1D_ARRAY_V4I8_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DArrayV4I16Trap:
+ Opc = NVPTX::SULD_1D_ARRAY_V4I16_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DArrayV4I32Trap:
+ Opc = NVPTX::SULD_1D_ARRAY_V4I32_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DI8Trap:
+ Opc = NVPTX::SULD_2D_I8_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DI16Trap:
+ Opc = NVPTX::SULD_2D_I16_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DI32Trap:
+ Opc = NVPTX::SULD_2D_I32_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DI64Trap:
+ Opc = NVPTX::SULD_2D_I64_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DV2I8Trap:
+ Opc = NVPTX::SULD_2D_V2I8_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DV2I16Trap:
+ Opc = NVPTX::SULD_2D_V2I16_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DV2I32Trap:
+ Opc = NVPTX::SULD_2D_V2I32_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DV2I64Trap:
+ Opc = NVPTX::SULD_2D_V2I64_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DV4I8Trap:
+ Opc = NVPTX::SULD_2D_V4I8_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DV4I16Trap:
+ Opc = NVPTX::SULD_2D_V4I16_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DV4I32Trap:
+ Opc = NVPTX::SULD_2D_V4I32_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DArrayI8Trap:
+ Opc = NVPTX::SULD_2D_ARRAY_I8_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DArrayI16Trap:
+ Opc = NVPTX::SULD_2D_ARRAY_I16_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DArrayI32Trap:
+ Opc = NVPTX::SULD_2D_ARRAY_I32_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DArrayI64Trap:
+ Opc = NVPTX::SULD_2D_ARRAY_I64_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DArrayV2I8Trap:
+ Opc = NVPTX::SULD_2D_ARRAY_V2I8_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DArrayV2I16Trap:
+ Opc = NVPTX::SULD_2D_ARRAY_V2I16_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DArrayV2I32Trap:
+ Opc = NVPTX::SULD_2D_ARRAY_V2I32_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DArrayV2I64Trap:
+ Opc = NVPTX::SULD_2D_ARRAY_V2I64_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DArrayV4I8Trap:
+ Opc = NVPTX::SULD_2D_ARRAY_V4I8_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DArrayV4I16Trap:
+ Opc = NVPTX::SULD_2D_ARRAY_V4I16_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DArrayV4I32Trap:
+ Opc = NVPTX::SULD_2D_ARRAY_V4I32_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld3DI8Trap:
+ Opc = NVPTX::SULD_3D_I8_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld3DI16Trap:
+ Opc = NVPTX::SULD_3D_I16_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld3DI32Trap:
+ Opc = NVPTX::SULD_3D_I32_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld3DI64Trap:
+ Opc = NVPTX::SULD_3D_I64_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld3DV2I8Trap:
+ Opc = NVPTX::SULD_3D_V2I8_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld3DV2I16Trap:
+ Opc = NVPTX::SULD_3D_V2I16_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld3DV2I32Trap:
+ Opc = NVPTX::SULD_3D_V2I32_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld3DV2I64Trap:
+ Opc = NVPTX::SULD_3D_V2I64_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld3DV4I8Trap:
+ Opc = NVPTX::SULD_3D_V4I8_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld3DV4I16Trap:
+ Opc = NVPTX::SULD_3D_V4I16_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld3DV4I32Trap:
+ Opc = NVPTX::SULD_3D_V4I32_TRAP;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DI8Zero:
+ Opc = NVPTX::SULD_1D_I8_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DI16Zero:
+ Opc = NVPTX::SULD_1D_I16_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DI32Zero:
+ Opc = NVPTX::SULD_1D_I32_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DI64Zero:
+ Opc = NVPTX::SULD_1D_I64_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DV2I8Zero:
+ Opc = NVPTX::SULD_1D_V2I8_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DV2I16Zero:
+ Opc = NVPTX::SULD_1D_V2I16_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DV2I32Zero:
+ Opc = NVPTX::SULD_1D_V2I32_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DV2I64Zero:
+ Opc = NVPTX::SULD_1D_V2I64_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DV4I8Zero:
+ Opc = NVPTX::SULD_1D_V4I8_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DV4I16Zero:
+ Opc = NVPTX::SULD_1D_V4I16_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DV4I32Zero:
+ Opc = NVPTX::SULD_1D_V4I32_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DArrayI8Zero:
+ Opc = NVPTX::SULD_1D_ARRAY_I8_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DArrayI16Zero:
+ Opc = NVPTX::SULD_1D_ARRAY_I16_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DArrayI32Zero:
+ Opc = NVPTX::SULD_1D_ARRAY_I32_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DArrayI64Zero:
+ Opc = NVPTX::SULD_1D_ARRAY_I64_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DArrayV2I8Zero:
+ Opc = NVPTX::SULD_1D_ARRAY_V2I8_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DArrayV2I16Zero:
+ Opc = NVPTX::SULD_1D_ARRAY_V2I16_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DArrayV2I32Zero:
+ Opc = NVPTX::SULD_1D_ARRAY_V2I32_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DArrayV2I64Zero:
+ Opc = NVPTX::SULD_1D_ARRAY_V2I64_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DArrayV4I8Zero:
+ Opc = NVPTX::SULD_1D_ARRAY_V4I8_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DArrayV4I16Zero:
+ Opc = NVPTX::SULD_1D_ARRAY_V4I16_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld1DArrayV4I32Zero:
+ Opc = NVPTX::SULD_1D_ARRAY_V4I32_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DI8Zero:
+ Opc = NVPTX::SULD_2D_I8_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DI16Zero:
+ Opc = NVPTX::SULD_2D_I16_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DI32Zero:
+ Opc = NVPTX::SULD_2D_I32_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DI64Zero:
+ Opc = NVPTX::SULD_2D_I64_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DV2I8Zero:
+ Opc = NVPTX::SULD_2D_V2I8_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DV2I16Zero:
+ Opc = NVPTX::SULD_2D_V2I16_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DV2I32Zero:
+ Opc = NVPTX::SULD_2D_V2I32_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DV2I64Zero:
+ Opc = NVPTX::SULD_2D_V2I64_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DV4I8Zero:
+ Opc = NVPTX::SULD_2D_V4I8_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DV4I16Zero:
+ Opc = NVPTX::SULD_2D_V4I16_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DV4I32Zero:
+ Opc = NVPTX::SULD_2D_V4I32_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DArrayI8Zero:
+ Opc = NVPTX::SULD_2D_ARRAY_I8_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DArrayI16Zero:
+ Opc = NVPTX::SULD_2D_ARRAY_I16_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DArrayI32Zero:
+ Opc = NVPTX::SULD_2D_ARRAY_I32_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DArrayI64Zero:
+ Opc = NVPTX::SULD_2D_ARRAY_I64_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DArrayV2I8Zero:
+ Opc = NVPTX::SULD_2D_ARRAY_V2I8_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DArrayV2I16Zero:
+ Opc = NVPTX::SULD_2D_ARRAY_V2I16_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DArrayV2I32Zero:
+ Opc = NVPTX::SULD_2D_ARRAY_V2I32_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DArrayV2I64Zero:
+ Opc = NVPTX::SULD_2D_ARRAY_V2I64_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DArrayV4I8Zero:
+ Opc = NVPTX::SULD_2D_ARRAY_V4I8_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DArrayV4I16Zero:
+ Opc = NVPTX::SULD_2D_ARRAY_V4I16_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld2DArrayV4I32Zero:
+ Opc = NVPTX::SULD_2D_ARRAY_V4I32_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld3DI8Zero:
+ Opc = NVPTX::SULD_3D_I8_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld3DI16Zero:
+ Opc = NVPTX::SULD_3D_I16_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld3DI32Zero:
+ Opc = NVPTX::SULD_3D_I32_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld3DI64Zero:
+ Opc = NVPTX::SULD_3D_I64_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld3DV2I8Zero:
+ Opc = NVPTX::SULD_3D_V2I8_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld3DV2I16Zero:
+ Opc = NVPTX::SULD_3D_V2I16_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld3DV2I32Zero:
+ Opc = NVPTX::SULD_3D_V2I32_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld3DV2I64Zero:
+ Opc = NVPTX::SULD_3D_V2I64_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld3DV4I8Zero:
+ Opc = NVPTX::SULD_3D_V4I8_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld3DV4I16Zero:
+ Opc = NVPTX::SULD_3D_V4I16_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ case NVPTXISD::Suld3DV4I32Zero:
+ Opc = NVPTX::SULD_3D_V4I32_ZERO;
+ Ops.push_back(TexHandle);
+ Ops.push_back(N->getOperand(2));
+ Ops.push_back(N->getOperand(3));
+ Ops.push_back(N->getOperand(4));
+ Ops.push_back(Chain);
+ break;
+ }
+ Ret = CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops);
+ return Ret;
+}
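// Illustrative sketch (editor's aside, not part of this patch): every case in
// SelectSurfaceIntrinsic follows one recipe -- surface handle first, then the
// coordinate operands for that dimensionality (array forms also carry a layer
// index), then the chain. The _CLAMP/_TRAP/_ZERO opcode suffixes select the
// corresponding PTX suld out-of-bounds modes. A hypothetical helper capturing
// that shape:

static void collectSuldOperands(SDNode *N, unsigned NumCoordOps,
                                SmallVectorImpl<SDValue> &Ops) {
  Ops.push_back(N->getOperand(1));              // surface handle
  for (unsigned i = 0; i != NumCoordOps; ++i)   // coords (and layer, if any)
    Ops.push_back(N->getOperand(2 + i));
  Ops.push_back(N->getOperand(0));              // chain goes last
}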
+
+
+/// SelectBFE - Look for instruction sequences that can be made more efficient
+/// by using the 'bfe' (bit-field extract) PTX instruction
+SDNode *NVPTXDAGToDAGISel::SelectBFE(SDNode *N) {
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ SDValue Len;
+ SDValue Start;
+ SDValue Val;
+ bool IsSigned = false;
+
+ if (N->getOpcode() == ISD::AND) {
+ // Canonicalize the operands
+ // We want 'and %val, %mask'
+ if (isa<ConstantSDNode>(LHS) && !isa<ConstantSDNode>(RHS)) {
+ std::swap(LHS, RHS);
+ }
+
+ ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS);
+ if (!Mask) {
+ // We need a constant mask on the RHS of the AND
+ return NULL;
+ }
+
+ // Extract the mask bits
+ uint64_t MaskVal = Mask->getZExtValue();
+ if (!isMask_64(MaskVal)) {
+ // We *could* handle shifted masks here, but doing so would require an
+ // 'and' operation to fix up the low-order bits so we would trade
+ // shr+and for bfe+and, which has the same throughput
+ return NULL;
+ }
+
+ // How many bits are in our mask?
+ uint64_t NumBits = CountTrailingOnes_64(MaskVal);
+ Len = CurDAG->getTargetConstant(NumBits, MVT::i32);
+
+ if (LHS.getOpcode() == ISD::SRL || LHS.getOpcode() == ISD::SRA) {
+ // We have a 'srl/and' pair, extract the effective start bit and length
+ Val = LHS.getNode()->getOperand(0);
+ Start = LHS.getNode()->getOperand(1);
+ ConstantSDNode *StartConst = dyn_cast<ConstantSDNode>(Start);
+ if (StartConst) {
+ uint64_t StartVal = StartConst->getZExtValue();
+ // How many "good" bits do we have left? "good" is defined here as bits
+ // that exist in the original value, not shifted in.
+ uint64_t GoodBits = Start.getValueType().getSizeInBits() - StartVal;
+ if (NumBits > GoodBits) {
+ // Do not handle the case where bits have been shifted in. In theory
+ // we could handle this, but the cost is likely higher than just
+ // emitting the srl/and pair.
+ return NULL;
+ }
+ Start = CurDAG->getTargetConstant(StartVal, MVT::i32);
+ } else {
+ // Do not handle the case where the shift amount (which can be zero if no
+ // srl was found) is not constant. We could handle this case, but it would
+ // require run-time logic that would be more expensive than just
+ // emitting the srl/and pair.
+ return NULL;
+ }
+ } else {
+ // Do not handle the case where the LHS of the and is not a shift. While
+ // it would be trivial to handle this case, it would just transform
+ // 'and' -> 'bfe', but 'and' has higher throughput.
+ return NULL;
+ }
+ } else if (N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) {
+ if (LHS->getOpcode() == ISD::AND) {
+ ConstantSDNode *ShiftCnst = dyn_cast<ConstantSDNode>(RHS);
+ if (!ShiftCnst) {
+ // Shift amount must be constant
+ return NULL;
+ }
+
+ uint64_t ShiftAmt = ShiftCnst->getZExtValue();
+
+ SDValue AndLHS = LHS->getOperand(0);
+ SDValue AndRHS = LHS->getOperand(1);
+
+ // Canonicalize the AND to have the mask on the RHS
+ if (isa<ConstantSDNode>(AndLHS)) {
+ std::swap(AndLHS, AndRHS);
+ }
+
+ ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(AndRHS);
+ if (!MaskCnst) {
+ // Mask must be constant
+ return NULL;
+ }
+
+ uint64_t MaskVal = MaskCnst->getZExtValue();
+ uint64_t NumZeros;
+ uint64_t NumBits;
+ if (isMask_64(MaskVal)) {
+ NumZeros = 0;
+ // The number of bits in the result bitfield will be the number of
+ // trailing ones (the AND) minus the number of bits we shift off
+ NumBits = CountTrailingOnes_64(MaskVal) - ShiftAmt;
+ } else if (isShiftedMask_64(MaskVal)) {
+ NumZeros = countTrailingZeros(MaskVal);
+ unsigned NumOnes = CountTrailingOnes_64(MaskVal >> NumZeros);
+ // The number of bits in the result bitfield will be the number of
+ // trailing zeros plus the number of set bits in the mask minus the
+ // number of bits we shift off
+ NumBits = NumZeros + NumOnes - ShiftAmt;
+ } else {
+ // This is not a mask we can handle
+ return NULL;
+ }
+
+ if (ShiftAmt < NumZeros) {
+ // Handling this case would require extra logic that would make this
+ // transformation non-profitable
+ return NULL;
+ }
+
+ Val = AndLHS;
+ Start = CurDAG->getTargetConstant(ShiftAmt, MVT::i32);
+ Len = CurDAG->getTargetConstant(NumBits, MVT::i32);
+ } else if (LHS->getOpcode() == ISD::SHL) {
+ // Here, we have a pattern like:
+ //
+ // (sra (shl val, NN), MM)
+ // or
+ // (srl (shl val, NN), MM)
+ //
+ // If MM >= NN, we can efficiently optimize this with bfe
+ Val = LHS->getOperand(0);
+
+ SDValue ShlRHS = LHS->getOperand(1);
+ ConstantSDNode *ShlCnst = dyn_cast<ConstantSDNode>(ShlRHS);
+ if (!ShlCnst) {
+ // Shift amount must be constant
+ return NULL;
+ }
+ uint64_t InnerShiftAmt = ShlCnst->getZExtValue();
+
+ SDValue ShrRHS = RHS;
+ ConstantSDNode *ShrCnst = dyn_cast<ConstantSDNode>(ShrRHS);
+ if (!ShrCnst) {
+ // Shift amount must be constant
+ return NULL;
+ }
+ uint64_t OuterShiftAmt = ShrCnst->getZExtValue();
+
+ // To avoid extra codegen and be profitable, we need Outer >= Inner
+ if (OuterShiftAmt < InnerShiftAmt) {
+ return NULL;
+ }
+
+ // If the outer shift is more than the type size, we have no bitfield to
+ // extract (since we also check that the inner shift is <= the outer shift,
+ // this also implies that the inner shift is < the type size).
+ if (OuterShiftAmt >= Val.getValueType().getSizeInBits()) {
+ return NULL;
+ }
+
+ Start =
+ CurDAG->getTargetConstant(OuterShiftAmt - InnerShiftAmt, MVT::i32);
+ Len =
+ CurDAG->getTargetConstant(Val.getValueType().getSizeInBits() -
+ OuterShiftAmt, MVT::i32);
+
+ if (N->getOpcode() == ISD::SRA) {
+ // If we have an arithmetic right shift, we need to use the signed bfe
+ // variant
+ IsSigned = true;
+ }
+ } else {
+ // No can do...
+ return NULL;
+ }
+ } else {
+ // No can do...
+ return NULL;
+ }
+
+
+ unsigned Opc;
+ // For the BFE operations we form here from "and" and "srl", use the
+ // unsigned variants; the signed variant is only needed when an arithmetic
+ // shift (sra) was matched above.
+ if (Val.getValueType() == MVT::i32) {
+ if (IsSigned) {
+ Opc = NVPTX::BFE_S32rii;
+ } else {
+ Opc = NVPTX::BFE_U32rii;
+ }
+ } else if (Val.getValueType() == MVT::i64) {
+ if (IsSigned) {
+ Opc = NVPTX::BFE_S64rii;
+ } else {
+ Opc = NVPTX::BFE_U64rii;
+ }
+ } else {
+ // We cannot handle this type
+ return NULL;
+ }
+
+ SDValue Ops[] = {
+ Val, Start, Len
+ };
+
+ SDNode *Ret =
+ CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops);
+
+ return Ret;
+}
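// Worked example (editor's aside, not part of this patch): consider selecting
// (srl (and %v, 0xFF00), 8) on i32. 0xFF00 is a shifted mask, so NumZeros = 8,
// NumOnes = 8, and NumBits = 8 + 8 - 8 = 8; with ShiftAmt = 8 >= NumZeros the
// match succeeds with Start = 8, Len = 8, i.e. a bfe.u32 extracting bits
// 8..15. A small self-contained check of that arithmetic against the original
// srl/and pair:

#include <cassert>
#include <cstdint>

// Reference semantics of the unsigned bfe for 0 < len < 32, start + len <= 32.
static uint32_t bfe_u32(uint32_t val, uint32_t start, uint32_t len) {
  return (val >> start) & ((1u << len) - 1u);
}

int main() {
  uint32_t v = 0xDEADBEEF;
  uint32_t viaShrAnd = (v & 0xFF00u) >> 8;            // original and/srl form
  assert(bfe_u32(v, /*start=*/8, /*len=*/8) == viaShrAnd);
  return 0;
}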
+
+// SelectDirectAddr - Match a direct address for DAG.
+// A direct address could be a globaladdress or externalsymbol.
+bool NVPTXDAGToDAGISel::SelectDirectAddr(SDValue N, SDValue &Address) {
+ // Return true if TGA or ES.
+ if (N.getOpcode() == ISD::TargetGlobalAddress ||
+ N.getOpcode() == ISD::TargetExternalSymbol) {
+ Address = N;
+ return true;
+ }
+ if (N.getOpcode() == NVPTXISD::Wrapper) {
+ Address = N.getOperand(0);
+ return true;
+ }
+ if (N.getOpcode() == ISD::INTRINSIC_WO_CHAIN) {
+ unsigned IID = cast<ConstantSDNode>(N.getOperand(0))->getZExtValue();
+ if (IID == Intrinsic::nvvm_ptr_gen_to_param)
+ if (N.getOperand(1).getOpcode() == NVPTXISD::MoveParam)
+ return (SelectDirectAddr(N.getOperand(1).getOperand(0), Address));
+ }
+ return false;
+}
+
+// symbol+offset
+bool NVPTXDAGToDAGISel::SelectADDRsi_imp(
+ SDNode *OpNode, SDValue Addr, SDValue &Base, SDValue &Offset, MVT mvt) {
+ if (Addr.getOpcode() == ISD::ADD) {
+ if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) {
+ SDValue base = Addr.getOperand(0);
+ if (SelectDirectAddr(base, Base)) {
+ Offset = CurDAG->getTargetConstant(CN->getZExtValue(), mvt);
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
+// symbol+offset
+bool NVPTXDAGToDAGISel::SelectADDRsi(SDNode *OpNode, SDValue Addr,
+ SDValue &Base, SDValue &Offset) {
+ return SelectADDRsi_imp(OpNode, Addr, Base, Offset, MVT::i32);
+}
+
+// symbol+offset
+bool NVPTXDAGToDAGISel::SelectADDRsi64(SDNode *OpNode, SDValue Addr,
+ SDValue &Base, SDValue &Offset) {
+ return SelectADDRsi_imp(OpNode, Addr, Base, Offset, MVT::i64);
+}
+
+// register+offset
+bool NVPTXDAGToDAGISel::SelectADDRri_imp(
+ SDNode *OpNode, SDValue Addr, SDValue &Base, SDValue &Offset, MVT mvt) {
+ if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
+ Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), mvt);
+ Offset = CurDAG->getTargetConstant(0, mvt);
+ return true;
+ }
+ if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
+ Addr.getOpcode() == ISD::TargetGlobalAddress)
+ return false; // direct calls.
+
+ if (Addr.getOpcode() == ISD::ADD) {
+ if (SelectDirectAddr(Addr.getOperand(0), Addr)) {
+ return false;
+ }
+ if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) {
+ if (FrameIndexSDNode *FIN =
+ dyn_cast<FrameIndexSDNode>(Addr.getOperand(0)))
+ // Constant offset from frame ref.
+ Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), mvt);
+ else
+ Base = Addr.getOperand(0);
+ Offset = CurDAG->getTargetConstant(CN->getZExtValue(), mvt);
+ return true;
+ }
+ }
+ return false;
+}
+
+// register+offset
+bool NVPTXDAGToDAGISel::SelectADDRri(SDNode *OpNode, SDValue Addr,
+ SDValue &Base, SDValue &Offset) {
+ return SelectADDRri_imp(OpNode, Addr, Base, Offset, MVT::i32);
+}
+
+// register+offset
+bool NVPTXDAGToDAGISel::SelectADDRri64(SDNode *OpNode, SDValue Addr,
+ SDValue &Base, SDValue &Offset) {
+ return SelectADDRri_imp(OpNode, Addr, Base, Offset, MVT::i64);
+}
+
+bool NVPTXDAGToDAGISel::ChkMemSDNodeAddressSpace(SDNode *N,
+ unsigned int spN) const {
+ const Value *Src = nullptr;
+ // Even though MemIntrinsicSDNode is a subclass of MemSDNode,
+ // the classof() for MemSDNode does not include MemIntrinsicSDNode
+ // (See SelectionDAGNodes.h). So we need to check for both.
+ if (MemSDNode *mN = dyn_cast<MemSDNode>(N)) {
+ if (spN == 0 && mN->getMemOperand()->getPseudoValue())
+ return true;
+ Src = mN->getMemOperand()->getValue();
+ } else if (MemSDNode *mN = dyn_cast<MemIntrinsicSDNode>(N)) {
+ if (spN == 0 && mN->getMemOperand()->getPseudoValue())
+ return true;
+ Src = mN->getMemOperand()->getValue();
+ }
+ if (!Src)
+ return false;
+ if (const PointerType *PT = dyn_cast<PointerType>(Src->getType()))
+ return (PT->getAddressSpace() == spN);
+ return false;
+}
+
+/// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
+/// inline asm expressions.
+bool NVPTXDAGToDAGISel::SelectInlineAsmMemoryOperand(
+ const SDValue &Op, char ConstraintCode, std::vector<SDValue> &OutOps) {
+ SDValue Op0, Op1;
+ switch (ConstraintCode) {
+ default:
+ return true;
+ case 'm': // memory
+ if (SelectDirectAddr(Op, Op0)) {
+ OutOps.push_back(Op0);
+ OutOps.push_back(CurDAG->getTargetConstant(0, MVT::i32));
+ return false;
+ }
+ if (SelectADDRri(Op.getNode(), Op, Op0, Op1)) {
+ OutOps.push_back(Op0);
+ OutOps.push_back(Op1);
+ return false;
+ }
+ break;
+ }
+ return true;
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/contrib/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
new file mode 100644
index 000000000000..c62fc253c33d
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
@@ -0,0 +1,94 @@
+//===-- NVPTXISelDAGToDAG.h - A dag to dag inst selector for NVPTX --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines an instruction selector for the NVPTX target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTX.h"
+#include "NVPTXISelLowering.h"
+#include "NVPTXRegisterInfo.h"
+#include "NVPTXTargetMachine.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/Support/Compiler.h"
+using namespace llvm;
+
+namespace {
+
+class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
+
+ // If true, generate mul.wide from sext and mul
+ bool doMulWide;
+
+ int getDivF32Level() const;
+ bool usePrecSqrtF32() const;
+ bool useF32FTZ() const;
+ bool allowFMA() const;
+
+public:
+ explicit NVPTXDAGToDAGISel(NVPTXTargetMachine &tm,
+ CodeGenOpt::Level OptLevel);
+
+ // Pass Name
+ const char *getPassName() const override {
+ return "NVPTX DAG->DAG Pattern Instruction Selection";
+ }
+
+ const NVPTXSubtarget &Subtarget;
+
+ bool SelectInlineAsmMemoryOperand(const SDValue &Op,
+ char ConstraintCode,
+ std::vector<SDValue> &OutOps) override;
+private:
+// Include the pieces autogenerated from the target description.
+#include "NVPTXGenDAGISel.inc"
+
+ SDNode *Select(SDNode *N) override;
+ SDNode *SelectIntrinsicNoChain(SDNode *N);
+ SDNode *SelectIntrinsicChain(SDNode *N);
+ SDNode *SelectTexSurfHandle(SDNode *N);
+ SDNode *SelectLoad(SDNode *N);
+ SDNode *SelectLoadVector(SDNode *N);
+ SDNode *SelectLDGLDU(SDNode *N);
+ SDNode *SelectStore(SDNode *N);
+ SDNode *SelectStoreVector(SDNode *N);
+ SDNode *SelectLoadParam(SDNode *N);
+ SDNode *SelectStoreRetval(SDNode *N);
+ SDNode *SelectStoreParam(SDNode *N);
+ SDNode *SelectAddrSpaceCast(SDNode *N);
+ SDNode *SelectTextureIntrinsic(SDNode *N);
+ SDNode *SelectSurfaceIntrinsic(SDNode *N);
+ SDNode *SelectBFE(SDNode *N);
+
+ inline SDValue getI32Imm(unsigned Imm) {
+ return CurDAG->getTargetConstant(Imm, MVT::i32);
+ }
+
+ // Match direct address complex pattern.
+ bool SelectDirectAddr(SDValue N, SDValue &Address);
+
+ bool SelectADDRri_imp(SDNode *OpNode, SDValue Addr, SDValue &Base,
+ SDValue &Offset, MVT mvt);
+ bool SelectADDRri(SDNode *OpNode, SDValue Addr, SDValue &Base,
+ SDValue &Offset);
+ bool SelectADDRri64(SDNode *OpNode, SDValue Addr, SDValue &Base,
+ SDValue &Offset);
+
+ bool SelectADDRsi_imp(SDNode *OpNode, SDValue Addr, SDValue &Base,
+ SDValue &Offset, MVT mvt);
+ bool SelectADDRsi(SDNode *OpNode, SDValue Addr, SDValue &Base,
+ SDValue &Offset);
+ bool SelectADDRsi64(SDNode *OpNode, SDValue Addr, SDValue &Base,
+ SDValue &Offset);
+
+ bool ChkMemSDNodeAddressSpace(SDNode *N, unsigned int spN) const;
+
+};
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
new file mode 100644
index 000000000000..d76b20a29eb7
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -0,0 +1,4516 @@
+//===-- NVPTXISelLowering.cpp - NVPTX DAG Lowering Implementation --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that NVPTX uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTXISelLowering.h"
+#include "NVPTX.h"
+#include "NVPTXTargetMachine.h"
+#include "NVPTXTargetObjectFile.h"
+#include "NVPTXUtilities.h"
+#include "llvm/CodeGen/Analysis.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include <sstream>
+
+#undef DEBUG_TYPE
+#define DEBUG_TYPE "nvptx-lower"
+
+using namespace llvm;
+
+static unsigned int uniqueCallSite = 0;
+
+static cl::opt<bool> sched4reg(
+ "nvptx-sched4reg",
+ cl::desc("NVPTX Specific: schedule for register pressue"), cl::init(false));
+
+static cl::opt<unsigned>
+FMAContractLevelOpt("nvptx-fma-level", cl::ZeroOrMore, cl::Hidden,
+ cl::desc("NVPTX Specific: FMA contraction (0: don't do it"
+ " 1: do it 2: do it aggressively"),
+ cl::init(2));
+
+static bool IsPTXVectorType(MVT VT) {
+ switch (VT.SimpleTy) {
+ default:
+ return false;
+ case MVT::v2i1:
+ case MVT::v4i1:
+ case MVT::v2i8:
+ case MVT::v4i8:
+ case MVT::v2i16:
+ case MVT::v4i16:
+ case MVT::v2i32:
+ case MVT::v4i32:
+ case MVT::v2i64:
+ case MVT::v2f32:
+ case MVT::v4f32:
+ case MVT::v2f64:
+ return true;
+ }
+}
+
+/// ComputePTXValueVTs - For the given Type \p Ty, returns the set of primitive
+/// EVTs that compose it. Unlike ComputeValueVTs, this will break apart vectors
+/// into their primitive components.
+/// NOTE: This is a band-aid for code that expects ComputeValueVTs to return the
+/// same number of types as the Ins/Outs arrays in LowerFormalArguments,
+/// LowerCall, and LowerReturn.
+static void ComputePTXValueVTs(const TargetLowering &TLI, Type *Ty,
+ SmallVectorImpl<EVT> &ValueVTs,
+ SmallVectorImpl<uint64_t> *Offsets = nullptr,
+ uint64_t StartingOffset = 0) {
+ SmallVector<EVT, 16> TempVTs;
+ SmallVector<uint64_t, 16> TempOffsets;
+
+ ComputeValueVTs(TLI, Ty, TempVTs, &TempOffsets, StartingOffset);
+ for (unsigned i = 0, e = TempVTs.size(); i != e; ++i) {
+ EVT VT = TempVTs[i];
+ uint64_t Off = TempOffsets[i];
+ if (VT.isVector())
+ for (unsigned j = 0, je = VT.getVectorNumElements(); j != je; ++j) {
+ ValueVTs.push_back(VT.getVectorElementType());
+ if (Offsets)
+ Offsets->push_back(Off+j*VT.getVectorElementType().getStoreSize());
+ }
+ else {
+ ValueVTs.push_back(VT);
+ if (Offsets)
+ Offsets->push_back(Off);
+ }
+ }
+}
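
As the comment above notes, the helper scalarizes vectors where ComputeValueVTs would not; a struct such as { float, <2 x i32> } therefore flattens to f32/i32/i32 at byte offsets 0, 4, and 8. The self-contained sketch below (toy ToyVT/flatten names, not the LLVM types) reproduces that offset rule under the assumption that the element store size is the stride.

#include <cstdint>
#include <iostream>
#include <utility>
#include <vector>

// Toy stand-in for an EVT: a scalar name, its store size in bytes, and how
// many elements it has (1 for scalars).
struct ToyVT { const char *name; unsigned storeBytes; unsigned numElts; };

// Mirrors ComputePTXValueVTs: vectors are broken into their elements and each
// element is placed at Offset + j * elementStoreSize.
void flatten(const std::vector<std::pair<ToyVT, uint64_t>> &vts,
             std::vector<std::pair<const char *, uint64_t>> &out) {
  for (const auto &p : vts) {
    const ToyVT &vt = p.first;
    uint64_t off = p.second;
    if (vt.numElts > 1) {
      for (unsigned j = 0; j != vt.numElts; ++j)
        out.push_back({vt.name, off + j * vt.storeBytes});
    } else {
      out.push_back({vt.name, off});
    }
  }
}

int main() {
  // { float, <2 x i32> } as ComputeValueVTs would report it: f32 at 0, v2i32 at 4.
  std::vector<std::pair<ToyVT, uint64_t>> in = {{{"f32", 4, 1}, 0},
                                                {{"i32", 4, 2}, 4}};
  std::vector<std::pair<const char *, uint64_t>> out;
  flatten(in, out);
  for (const auto &e : out)
    std::cout << e.first << " @ " << e.second << '\n';  // f32@0, i32@4, i32@8
}
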
+
+// NVPTXTargetLowering Constructor.
+NVPTXTargetLowering::NVPTXTargetLowering(NVPTXTargetMachine &TM)
+ : TargetLowering(TM, new NVPTXTargetObjectFile()), nvTM(&TM),
+ nvptxSubtarget(TM.getSubtarget<NVPTXSubtarget>()) {
+
+ // Always lower memset, memcpy, and memmove intrinsics to load/store
+ // instructions, rather than generating calls to memset, memcpy, or memmove.
+ MaxStoresPerMemset = (unsigned) 0xFFFFFFFF;
+ MaxStoresPerMemcpy = (unsigned) 0xFFFFFFFF;
+ MaxStoresPerMemmove = (unsigned) 0xFFFFFFFF;
+
+ setBooleanContents(ZeroOrNegativeOneBooleanContent);
+ setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
+
+ // Jump is Expensive. Don't create extra control flow for 'and', 'or'
+ // condition branches.
+ setJumpIsExpensive(true);
+
+ // By default, use the Source scheduling
+ if (sched4reg)
+ setSchedulingPreference(Sched::RegPressure);
+ else
+ setSchedulingPreference(Sched::Source);
+
+ addRegisterClass(MVT::i1, &NVPTX::Int1RegsRegClass);
+ addRegisterClass(MVT::i16, &NVPTX::Int16RegsRegClass);
+ addRegisterClass(MVT::i32, &NVPTX::Int32RegsRegClass);
+ addRegisterClass(MVT::i64, &NVPTX::Int64RegsRegClass);
+ addRegisterClass(MVT::f32, &NVPTX::Float32RegsRegClass);
+ addRegisterClass(MVT::f64, &NVPTX::Float64RegsRegClass);
+
+ // Operations not directly supported by NVPTX.
+ setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
+ setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
+ setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);
+ setOperationAction(ISD::SELECT_CC, MVT::i8, Expand);
+ setOperationAction(ISD::SELECT_CC, MVT::i16, Expand);
+ setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
+ setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
+ setOperationAction(ISD::BR_CC, MVT::f32, Expand);
+ setOperationAction(ISD::BR_CC, MVT::f64, Expand);
+ setOperationAction(ISD::BR_CC, MVT::i1, Expand);
+ setOperationAction(ISD::BR_CC, MVT::i8, Expand);
+ setOperationAction(ISD::BR_CC, MVT::i16, Expand);
+ setOperationAction(ISD::BR_CC, MVT::i32, Expand);
+ setOperationAction(ISD::BR_CC, MVT::i64, Expand);
+ // Some SIGN_EXTEND_INREG can be done using the cvt instruction.
+ // For others we will expand to a SHL/SRA pair.
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i64, Legal);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
+
+ setOperationAction(ISD::SHL_PARTS, MVT::i32 , Custom);
+ setOperationAction(ISD::SRA_PARTS, MVT::i32 , Custom);
+ setOperationAction(ISD::SRL_PARTS, MVT::i32 , Custom);
+ setOperationAction(ISD::SHL_PARTS, MVT::i64 , Custom);
+ setOperationAction(ISD::SRA_PARTS, MVT::i64 , Custom);
+ setOperationAction(ISD::SRL_PARTS, MVT::i64 , Custom);
+
+ if (nvptxSubtarget.hasROT64()) {
+ setOperationAction(ISD::ROTL, MVT::i64, Legal);
+ setOperationAction(ISD::ROTR, MVT::i64, Legal);
+ } else {
+ setOperationAction(ISD::ROTL, MVT::i64, Expand);
+ setOperationAction(ISD::ROTR, MVT::i64, Expand);
+ }
+ if (nvptxSubtarget.hasROT32()) {
+ setOperationAction(ISD::ROTL, MVT::i32, Legal);
+ setOperationAction(ISD::ROTR, MVT::i32, Legal);
+ } else {
+ setOperationAction(ISD::ROTL, MVT::i32, Expand);
+ setOperationAction(ISD::ROTR, MVT::i32, Expand);
+ }
+
+ setOperationAction(ISD::ROTL, MVT::i16, Expand);
+ setOperationAction(ISD::ROTR, MVT::i16, Expand);
+ setOperationAction(ISD::ROTL, MVT::i8, Expand);
+ setOperationAction(ISD::ROTR, MVT::i8, Expand);
+ setOperationAction(ISD::BSWAP, MVT::i16, Expand);
+ setOperationAction(ISD::BSWAP, MVT::i32, Expand);
+ setOperationAction(ISD::BSWAP, MVT::i64, Expand);
+
+ // Indirect branch is not supported.
+ // This also disables Jump Table creation.
+ setOperationAction(ISD::BR_JT, MVT::Other, Expand);
+ setOperationAction(ISD::BRIND, MVT::Other, Expand);
+
+ setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
+ setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
+
+ // We want to legalize constant-related memmove and memcpy
+ // intrinsics.
+ setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
+
+ // Turn FP extload into load/fextend
+ setLoadExtAction(ISD::EXTLOAD, MVT::f16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
+ // Turn FP truncstore into trunc + store.
+ setTruncStoreAction(MVT::f32, MVT::f16, Expand);
+ setTruncStoreAction(MVT::f64, MVT::f16, Expand);
+ setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+
+ // PTX does not support load / store predicate registers
+ setOperationAction(ISD::LOAD, MVT::i1, Custom);
+ setOperationAction(ISD::STORE, MVT::i1, Custom);
+
+ setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::i1, Promote);
+ setTruncStoreAction(MVT::i64, MVT::i1, Expand);
+ setTruncStoreAction(MVT::i32, MVT::i1, Expand);
+ setTruncStoreAction(MVT::i16, MVT::i1, Expand);
+ setTruncStoreAction(MVT::i8, MVT::i1, Expand);
+
+ // This is legal in NVPTX
+ setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
+ setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
+
+ // TRAP can be lowered to PTX trap
+ setOperationAction(ISD::TRAP, MVT::Other, Legal);
+
+ setOperationAction(ISD::ADDC, MVT::i64, Expand);
+ setOperationAction(ISD::ADDE, MVT::i64, Expand);
+
+ // Register custom handling for vector loads/stores
+ for (int i = MVT::FIRST_VECTOR_VALUETYPE; i <= MVT::LAST_VECTOR_VALUETYPE;
+ ++i) {
+ MVT VT = (MVT::SimpleValueType) i;
+ if (IsPTXVectorType(VT)) {
+ setOperationAction(ISD::LOAD, VT, Custom);
+ setOperationAction(ISD::STORE, VT, Custom);
+ setOperationAction(ISD::INTRINSIC_W_CHAIN, VT, Custom);
+ }
+ }
+
+ // Custom handling for i8 intrinsics
+ setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);
+
+ setOperationAction(ISD::CTLZ, MVT::i16, Legal);
+ setOperationAction(ISD::CTLZ, MVT::i32, Legal);
+ setOperationAction(ISD::CTLZ, MVT::i64, Legal);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Legal);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Legal);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Legal);
+ setOperationAction(ISD::CTTZ, MVT::i16, Expand);
+ setOperationAction(ISD::CTTZ, MVT::i32, Expand);
+ setOperationAction(ISD::CTTZ, MVT::i64, Expand);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16, Expand);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
+ setOperationAction(ISD::CTPOP, MVT::i16, Legal);
+ setOperationAction(ISD::CTPOP, MVT::i32, Legal);
+ setOperationAction(ISD::CTPOP, MVT::i64, Legal);
+
+ // We have some custom DAG combine patterns for these nodes
+ setTargetDAGCombine(ISD::ADD);
+ setTargetDAGCombine(ISD::AND);
+ setTargetDAGCombine(ISD::FADD);
+ setTargetDAGCombine(ISD::MUL);
+ setTargetDAGCombine(ISD::SHL);
+
+ // Now deduce the information based on the above mentioned
+ // actions
+ computeRegisterProperties();
+}
+
+const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
+ switch (Opcode) {
+ default:
+ return nullptr;
+ case NVPTXISD::CALL:
+ return "NVPTXISD::CALL";
+ case NVPTXISD::RET_FLAG:
+ return "NVPTXISD::RET_FLAG";
+ case NVPTXISD::Wrapper:
+ return "NVPTXISD::Wrapper";
+ case NVPTXISD::DeclareParam:
+ return "NVPTXISD::DeclareParam";
+ case NVPTXISD::DeclareScalarParam:
+ return "NVPTXISD::DeclareScalarParam";
+ case NVPTXISD::DeclareRet:
+ return "NVPTXISD::DeclareRet";
+ case NVPTXISD::DeclareRetParam:
+ return "NVPTXISD::DeclareRetParam";
+ case NVPTXISD::PrintCall:
+ return "NVPTXISD::PrintCall";
+ case NVPTXISD::LoadParam:
+ return "NVPTXISD::LoadParam";
+ case NVPTXISD::LoadParamV2:
+ return "NVPTXISD::LoadParamV2";
+ case NVPTXISD::LoadParamV4:
+ return "NVPTXISD::LoadParamV4";
+ case NVPTXISD::StoreParam:
+ return "NVPTXISD::StoreParam";
+ case NVPTXISD::StoreParamV2:
+ return "NVPTXISD::StoreParamV2";
+ case NVPTXISD::StoreParamV4:
+ return "NVPTXISD::StoreParamV4";
+ case NVPTXISD::StoreParamS32:
+ return "NVPTXISD::StoreParamS32";
+ case NVPTXISD::StoreParamU32:
+ return "NVPTXISD::StoreParamU32";
+ case NVPTXISD::CallArgBegin:
+ return "NVPTXISD::CallArgBegin";
+ case NVPTXISD::CallArg:
+ return "NVPTXISD::CallArg";
+ case NVPTXISD::LastCallArg:
+ return "NVPTXISD::LastCallArg";
+ case NVPTXISD::CallArgEnd:
+ return "NVPTXISD::CallArgEnd";
+ case NVPTXISD::CallVoid:
+ return "NVPTXISD::CallVoid";
+ case NVPTXISD::CallVal:
+ return "NVPTXISD::CallVal";
+ case NVPTXISD::CallSymbol:
+ return "NVPTXISD::CallSymbol";
+ case NVPTXISD::Prototype:
+ return "NVPTXISD::Prototype";
+ case NVPTXISD::MoveParam:
+ return "NVPTXISD::MoveParam";
+ case NVPTXISD::StoreRetval:
+ return "NVPTXISD::StoreRetval";
+ case NVPTXISD::StoreRetvalV2:
+ return "NVPTXISD::StoreRetvalV2";
+ case NVPTXISD::StoreRetvalV4:
+ return "NVPTXISD::StoreRetvalV4";
+ case NVPTXISD::PseudoUseParam:
+ return "NVPTXISD::PseudoUseParam";
+ case NVPTXISD::RETURN:
+ return "NVPTXISD::RETURN";
+ case NVPTXISD::CallSeqBegin:
+ return "NVPTXISD::CallSeqBegin";
+ case NVPTXISD::CallSeqEnd:
+ return "NVPTXISD::CallSeqEnd";
+ case NVPTXISD::CallPrototype:
+ return "NVPTXISD::CallPrototype";
+ case NVPTXISD::LoadV2:
+ return "NVPTXISD::LoadV2";
+ case NVPTXISD::LoadV4:
+ return "NVPTXISD::LoadV4";
+ case NVPTXISD::LDGV2:
+ return "NVPTXISD::LDGV2";
+ case NVPTXISD::LDGV4:
+ return "NVPTXISD::LDGV4";
+ case NVPTXISD::LDUV2:
+ return "NVPTXISD::LDUV2";
+ case NVPTXISD::LDUV4:
+ return "NVPTXISD::LDUV4";
+ case NVPTXISD::StoreV2:
+ return "NVPTXISD::StoreV2";
+ case NVPTXISD::StoreV4:
+ return "NVPTXISD::StoreV4";
+ case NVPTXISD::FUN_SHFL_CLAMP:
+ return "NVPTXISD::FUN_SHFL_CLAMP";
+ case NVPTXISD::FUN_SHFR_CLAMP:
+ return "NVPTXISD::FUN_SHFR_CLAMP";
+ case NVPTXISD::IMAD:
+ return "NVPTXISD::IMAD";
+ case NVPTXISD::MUL_WIDE_SIGNED:
+ return "NVPTXISD::MUL_WIDE_SIGNED";
+ case NVPTXISD::MUL_WIDE_UNSIGNED:
+ return "NVPTXISD::MUL_WIDE_UNSIGNED";
+ case NVPTXISD::Tex1DFloatS32: return "NVPTXISD::Tex1DFloatS32";
+ case NVPTXISD::Tex1DFloatFloat: return "NVPTXISD::Tex1DFloatFloat";
+ case NVPTXISD::Tex1DFloatFloatLevel:
+ return "NVPTXISD::Tex1DFloatFloatLevel";
+ case NVPTXISD::Tex1DFloatFloatGrad:
+ return "NVPTXISD::Tex1DFloatFloatGrad";
+ case NVPTXISD::Tex1DS32S32: return "NVPTXISD::Tex1DS32S32";
+ case NVPTXISD::Tex1DS32Float: return "NVPTXISD::Tex1DS32Float";
+ case NVPTXISD::Tex1DS32FloatLevel:
+ return "NVPTXISD::Tex1DS32FloatLevel";
+ case NVPTXISD::Tex1DS32FloatGrad:
+ return "NVPTXISD::Tex1DS32FloatGrad";
+ case NVPTXISD::Tex1DU32S32: return "NVPTXISD::Tex1DU32S32";
+ case NVPTXISD::Tex1DU32Float: return "NVPTXISD::Tex1DU32Float";
+ case NVPTXISD::Tex1DU32FloatLevel:
+ return "NVPTXISD::Tex1DU32FloatLevel";
+ case NVPTXISD::Tex1DU32FloatGrad:
+ return "NVPTXISD::Tex1DU32FloatGrad";
+ case NVPTXISD::Tex1DArrayFloatS32: return "NVPTXISD::Tex1DArrayFloatS32";
+ case NVPTXISD::Tex1DArrayFloatFloat: return "NVPTXISD::Tex1DArrayFloatFloat";
+ case NVPTXISD::Tex1DArrayFloatFloatLevel:
+ return "NVPTXISD::Tex1DArrayFloatFloatLevel";
+ case NVPTXISD::Tex1DArrayFloatFloatGrad:
+ return "NVPTXISD::Tex1DArrayFloatFloatGrad";
+ case NVPTXISD::Tex1DArrayS32S32: return "NVPTXISD::Tex1DArrayS32S32";
+ case NVPTXISD::Tex1DArrayS32Float: return "NVPTXISD::Tex1DArrayS32Float";
+ case NVPTXISD::Tex1DArrayS32FloatLevel:
+ return "NVPTXISD::Tex1DArrayS32FloatLevel";
+ case NVPTXISD::Tex1DArrayS32FloatGrad:
+ return "NVPTXISD::Tex1DArrayS32FloatGrad";
+ case NVPTXISD::Tex1DArrayU32S32: return "NVPTXISD::Tex1DArrayU32S32";
+ case NVPTXISD::Tex1DArrayU32Float: return "NVPTXISD::Tex1DArrayU32Float";
+ case NVPTXISD::Tex1DArrayU32FloatLevel:
+ return "NVPTXISD::Tex1DArrayU32FloatLevel";
+ case NVPTXISD::Tex1DArrayU32FloatGrad:
+ return "NVPTXISD::Tex1DArrayU32FloatGrad";
+ case NVPTXISD::Tex2DFloatS32: return "NVPTXISD::Tex2DFloatS32";
+ case NVPTXISD::Tex2DFloatFloat: return "NVPTXISD::Tex2DFloatFloat";
+ case NVPTXISD::Tex2DFloatFloatLevel:
+ return "NVPTXISD::Tex2DFloatFloatLevel";
+ case NVPTXISD::Tex2DFloatFloatGrad:
+ return "NVPTXISD::Tex2DFloatFloatGrad";
+ case NVPTXISD::Tex2DS32S32: return "NVPTXISD::Tex2DS32S32";
+ case NVPTXISD::Tex2DS32Float: return "NVPTXISD::Tex2DS32Float";
+ case NVPTXISD::Tex2DS32FloatLevel:
+ return "NVPTXISD::Tex2DS32FloatLevel";
+ case NVPTXISD::Tex2DS32FloatGrad:
+ return "NVPTXISD::Tex2DS32FloatGrad";
+ case NVPTXISD::Tex2DU32S32: return "NVPTXISD::Tex2DU32S32";
+ case NVPTXISD::Tex2DU32Float: return "NVPTXISD::Tex2DU32Float";
+ case NVPTXISD::Tex2DU32FloatLevel:
+ return "NVPTXISD::Tex2DU32FloatLevel";
+ case NVPTXISD::Tex2DU32FloatGrad:
+ return "NVPTXISD::Tex2DU32FloatGrad";
+ case NVPTXISD::Tex2DArrayFloatS32: return "NVPTXISD::Tex2DArrayFloatS32";
+ case NVPTXISD::Tex2DArrayFloatFloat: return "NVPTXISD::Tex2DArrayFloatFloat";
+ case NVPTXISD::Tex2DArrayFloatFloatLevel:
+ return "NVPTXISD::Tex2DArrayFloatFloatLevel";
+ case NVPTXISD::Tex2DArrayFloatFloatGrad:
+ return "NVPTXISD::Tex2DArrayFloatFloatGrad";
+ case NVPTXISD::Tex2DArrayS32S32: return "NVPTXISD::Tex2DArrayS32S32";
+ case NVPTXISD::Tex2DArrayS32Float: return "NVPTXISD::Tex2DArrayS32Float";
+ case NVPTXISD::Tex2DArrayS32FloatLevel:
+ return "NVPTXISD::Tex2DArrayS32FloatLevel";
+ case NVPTXISD::Tex2DArrayS32FloatGrad:
+ return "NVPTXISD::Tex2DArrayS32FloatGrad";
+ case NVPTXISD::Tex2DArrayU32S32: return "NVPTXISD::Tex2DArrayU32S32";
+ case NVPTXISD::Tex2DArrayU32Float: return "NVPTXISD::Tex2DArrayU32Float";
+ case NVPTXISD::Tex2DArrayU32FloatLevel:
+ return "NVPTXISD::Tex2DArrayU32FloatLevel";
+ case NVPTXISD::Tex2DArrayU32FloatGrad:
+ return "NVPTXISD::Tex2DArrayU32FloatGrad";
+ case NVPTXISD::Tex3DFloatS32: return "NVPTXISD::Tex3DFloatS32";
+ case NVPTXISD::Tex3DFloatFloat: return "NVPTXISD::Tex3DFloatFloat";
+ case NVPTXISD::Tex3DFloatFloatLevel:
+ return "NVPTXISD::Tex3DFloatFloatLevel";
+ case NVPTXISD::Tex3DFloatFloatGrad:
+ return "NVPTXISD::Tex3DFloatFloatGrad";
+ case NVPTXISD::Tex3DS32S32: return "NVPTXISD::Tex3DS32S32";
+ case NVPTXISD::Tex3DS32Float: return "NVPTXISD::Tex3DS32Float";
+ case NVPTXISD::Tex3DS32FloatLevel:
+ return "NVPTXISD::Tex3DS32FloatLevel";
+ case NVPTXISD::Tex3DS32FloatGrad:
+ return "NVPTXISD::Tex3DS32FloatGrad";
+ case NVPTXISD::Tex3DU32S32: return "NVPTXISD::Tex3DU32S32";
+ case NVPTXISD::Tex3DU32Float: return "NVPTXISD::Tex3DU32Float";
+ case NVPTXISD::Tex3DU32FloatLevel:
+ return "NVPTXISD::Tex3DU32FloatLevel";
+ case NVPTXISD::Tex3DU32FloatGrad:
+ return "NVPTXISD::Tex3DU32FloatGrad";
+ case NVPTXISD::TexCubeFloatFloat: return "NVPTXISD::TexCubeFloatFloat";
+ case NVPTXISD::TexCubeFloatFloatLevel:
+ return "NVPTXISD::TexCubeFloatFloatLevel";
+ case NVPTXISD::TexCubeS32Float: return "NVPTXISD::TexCubeS32Float";
+ case NVPTXISD::TexCubeS32FloatLevel:
+ return "NVPTXISD::TexCubeS32FloatLevel";
+ case NVPTXISD::TexCubeU32Float: return "NVPTXISD::TexCubeU32Float";
+ case NVPTXISD::TexCubeU32FloatLevel:
+ return "NVPTXISD::TexCubeU32FloatLevel";
+ case NVPTXISD::TexCubeArrayFloatFloat:
+ return "NVPTXISD::TexCubeArrayFloatFloat";
+ case NVPTXISD::TexCubeArrayFloatFloatLevel:
+ return "NVPTXISD::TexCubeArrayFloatFloatLevel";
+ case NVPTXISD::TexCubeArrayS32Float:
+ return "NVPTXISD::TexCubeArrayS32Float";
+ case NVPTXISD::TexCubeArrayS32FloatLevel:
+ return "NVPTXISD::TexCubeArrayS32FloatLevel";
+ case NVPTXISD::TexCubeArrayU32Float:
+ return "NVPTXISD::TexCubeArrayU32Float";
+ case NVPTXISD::TexCubeArrayU32FloatLevel:
+ return "NVPTXISD::TexCubeArrayU32FloatLevel";
+ case NVPTXISD::Tld4R2DFloatFloat:
+ return "NVPTXISD::Tld4R2DFloatFloat";
+ case NVPTXISD::Tld4G2DFloatFloat:
+ return "NVPTXISD::Tld4G2DFloatFloat";
+ case NVPTXISD::Tld4B2DFloatFloat:
+ return "NVPTXISD::Tld4B2DFloatFloat";
+ case NVPTXISD::Tld4A2DFloatFloat:
+ return "NVPTXISD::Tld4A2DFloatFloat";
+ case NVPTXISD::Tld4R2DS64Float:
+ return "NVPTXISD::Tld4R2DS64Float";
+ case NVPTXISD::Tld4G2DS64Float:
+ return "NVPTXISD::Tld4G2DS64Float";
+ case NVPTXISD::Tld4B2DS64Float:
+ return "NVPTXISD::Tld4B2DS64Float";
+ case NVPTXISD::Tld4A2DS64Float:
+ return "NVPTXISD::Tld4A2DS64Float";
+ case NVPTXISD::Tld4R2DU64Float:
+ return "NVPTXISD::Tld4R2DU64Float";
+ case NVPTXISD::Tld4G2DU64Float:
+ return "NVPTXISD::Tld4G2DU64Float";
+ case NVPTXISD::Tld4B2DU64Float:
+ return "NVPTXISD::Tld4B2DU64Float";
+ case NVPTXISD::Tld4A2DU64Float:
+ return "NVPTXISD::Tld4A2DU64Float";
+
+ case NVPTXISD::TexUnified1DFloatS32:
+ return "NVPTXISD::TexUnified1DFloatS32";
+ case NVPTXISD::TexUnified1DFloatFloat:
+ return "NVPTXISD::TexUnified1DFloatFloat";
+ case NVPTXISD::TexUnified1DFloatFloatLevel:
+ return "NVPTXISD::TexUnified1DFloatFloatLevel";
+ case NVPTXISD::TexUnified1DFloatFloatGrad:
+ return "NVPTXISD::TexUnified1DFloatFloatGrad";
+ case NVPTXISD::TexUnified1DS32S32:
+ return "NVPTXISD::TexUnified1DS32S32";
+ case NVPTXISD::TexUnified1DS32Float:
+ return "NVPTXISD::TexUnified1DS32Float";
+ case NVPTXISD::TexUnified1DS32FloatLevel:
+ return "NVPTXISD::TexUnified1DS32FloatLevel";
+ case NVPTXISD::TexUnified1DS32FloatGrad:
+ return "NVPTXISD::TexUnified1DS32FloatGrad";
+ case NVPTXISD::TexUnified1DU32S32:
+ return "NVPTXISD::TexUnified1DU32S32";
+ case NVPTXISD::TexUnified1DU32Float:
+ return "NVPTXISD::TexUnified1DU32Float";
+ case NVPTXISD::TexUnified1DU32FloatLevel:
+ return "NVPTXISD::TexUnified1DU32FloatLevel";
+ case NVPTXISD::TexUnified1DU32FloatGrad:
+ return "NVPTXISD::TexUnified1DU32FloatGrad";
+ case NVPTXISD::TexUnified1DArrayFloatS32:
+ return "NVPTXISD::TexUnified1DArrayFloatS32";
+ case NVPTXISD::TexUnified1DArrayFloatFloat:
+ return "NVPTXISD::TexUnified1DArrayFloatFloat";
+ case NVPTXISD::TexUnified1DArrayFloatFloatLevel:
+ return "NVPTXISD::TexUnified1DArrayFloatFloatLevel";
+ case NVPTXISD::TexUnified1DArrayFloatFloatGrad:
+ return "NVPTXISD::TexUnified1DArrayFloatFloatGrad";
+ case NVPTXISD::TexUnified1DArrayS32S32:
+ return "NVPTXISD::TexUnified1DArrayS32S32";
+ case NVPTXISD::TexUnified1DArrayS32Float:
+ return "NVPTXISD::TexUnified1DArrayS32Float";
+ case NVPTXISD::TexUnified1DArrayS32FloatLevel:
+ return "NVPTXISD::TexUnified1DArrayS32FloatLevel";
+ case NVPTXISD::TexUnified1DArrayS32FloatGrad:
+ return "NVPTXISD::TexUnified1DArrayS32FloatGrad";
+ case NVPTXISD::TexUnified1DArrayU32S32:
+ return "NVPTXISD::TexUnified1DArrayU32S32";
+ case NVPTXISD::TexUnified1DArrayU32Float:
+ return "NVPTXISD::TexUnified1DArrayU32Float";
+ case NVPTXISD::TexUnified1DArrayU32FloatLevel:
+ return "NVPTXISD::TexUnified1DArrayU32FloatLevel";
+ case NVPTXISD::TexUnified1DArrayU32FloatGrad:
+ return "NVPTXISD::TexUnified1DArrayU32FloatGrad";
+ case NVPTXISD::TexUnified2DFloatS32:
+ return "NVPTXISD::TexUnified2DFloatS32";
+ case NVPTXISD::TexUnified2DFloatFloat:
+ return "NVPTXISD::TexUnified2DFloatFloat";
+ case NVPTXISD::TexUnified2DFloatFloatLevel:
+ return "NVPTXISD::TexUnified2DFloatFloatLevel";
+ case NVPTXISD::TexUnified2DFloatFloatGrad:
+ return "NVPTXISD::TexUnified2DFloatFloatGrad";
+ case NVPTXISD::TexUnified2DS32S32:
+ return "NVPTXISD::TexUnified2DS32S32";
+ case NVPTXISD::TexUnified2DS32Float:
+ return "NVPTXISD::TexUnified2DS32Float";
+ case NVPTXISD::TexUnified2DS32FloatLevel:
+ return "NVPTXISD::TexUnified2DS32FloatLevel";
+ case NVPTXISD::TexUnified2DS32FloatGrad:
+ return "NVPTXISD::TexUnified2DS32FloatGrad";
+ case NVPTXISD::TexUnified2DU32S32:
+ return "NVPTXISD::TexUnified2DU32S32";
+ case NVPTXISD::TexUnified2DU32Float:
+ return "NVPTXISD::TexUnified2DU32Float";
+ case NVPTXISD::TexUnified2DU32FloatLevel:
+ return "NVPTXISD::TexUnified2DU32FloatLevel";
+ case NVPTXISD::TexUnified2DU32FloatGrad:
+ return "NVPTXISD::TexUnified2DU32FloatGrad";
+ case NVPTXISD::TexUnified2DArrayFloatS32:
+ return "NVPTXISD::TexUnified2DArrayFloatS32";
+ case NVPTXISD::TexUnified2DArrayFloatFloat:
+ return "NVPTXISD::TexUnified2DArrayFloatFloat";
+ case NVPTXISD::TexUnified2DArrayFloatFloatLevel:
+ return "NVPTXISD::TexUnified2DArrayFloatFloatLevel";
+ case NVPTXISD::TexUnified2DArrayFloatFloatGrad:
+ return "NVPTXISD::TexUnified2DArrayFloatFloatGrad";
+ case NVPTXISD::TexUnified2DArrayS32S32:
+ return "NVPTXISD::TexUnified2DArrayS32S32";
+ case NVPTXISD::TexUnified2DArrayS32Float:
+ return "NVPTXISD::TexUnified2DArrayS32Float";
+ case NVPTXISD::TexUnified2DArrayS32FloatLevel:
+ return "NVPTXISD::TexUnified2DArrayS32FloatLevel";
+ case NVPTXISD::TexUnified2DArrayS32FloatGrad:
+ return "NVPTXISD::TexUnified2DArrayS32FloatGrad";
+ case NVPTXISD::TexUnified2DArrayU32S32:
+ return "NVPTXISD::TexUnified2DArrayU32S32";
+ case NVPTXISD::TexUnified2DArrayU32Float:
+ return "NVPTXISD::TexUnified2DArrayU32Float";
+ case NVPTXISD::TexUnified2DArrayU32FloatLevel:
+ return "NVPTXISD::TexUnified2DArrayU32FloatLevel";
+ case NVPTXISD::TexUnified2DArrayU32FloatGrad:
+ return "NVPTXISD::TexUnified2DArrayU32FloatGrad";
+ case NVPTXISD::TexUnified3DFloatS32:
+ return "NVPTXISD::TexUnified3DFloatS32";
+ case NVPTXISD::TexUnified3DFloatFloat:
+ return "NVPTXISD::TexUnified3DFloatFloat";
+ case NVPTXISD::TexUnified3DFloatFloatLevel:
+ return "NVPTXISD::TexUnified3DFloatFloatLevel";
+ case NVPTXISD::TexUnified3DFloatFloatGrad:
+ return "NVPTXISD::TexUnified3DFloatFloatGrad";
+ case NVPTXISD::TexUnified3DS32S32:
+ return "NVPTXISD::TexUnified3DS32S32";
+ case NVPTXISD::TexUnified3DS32Float:
+ return "NVPTXISD::TexUnified3DS32Float";
+ case NVPTXISD::TexUnified3DS32FloatLevel:
+ return "NVPTXISD::TexUnified3DS32FloatLevel";
+ case NVPTXISD::TexUnified3DS32FloatGrad:
+ return "NVPTXISD::TexUnified3DS32FloatGrad";
+ case NVPTXISD::TexUnified3DU32S32:
+ return "NVPTXISD::TexUnified3DU32S32";
+ case NVPTXISD::TexUnified3DU32Float:
+ return "NVPTXISD::TexUnified3DU32Float";
+ case NVPTXISD::TexUnified3DU32FloatLevel:
+ return "NVPTXISD::TexUnified3DU32FloatLevel";
+ case NVPTXISD::TexUnified3DU32FloatGrad:
+ return "NVPTXISD::TexUnified3DU32FloatGrad";
+ case NVPTXISD::TexUnifiedCubeFloatFloat:
+ return "NVPTXISD::TexUnifiedCubeFloatFloat";
+ case NVPTXISD::TexUnifiedCubeFloatFloatLevel:
+ return "NVPTXISD::TexUnifiedCubeFloatFloatLevel";
+ case NVPTXISD::TexUnifiedCubeS32Float:
+ return "NVPTXISD::TexUnifiedCubeS32Float";
+ case NVPTXISD::TexUnifiedCubeS32FloatLevel:
+ return "NVPTXISD::TexUnifiedCubeS32FloatLevel";
+ case NVPTXISD::TexUnifiedCubeU32Float:
+ return "NVPTXISD::TexUnifiedCubeU32Float";
+ case NVPTXISD::TexUnifiedCubeU32FloatLevel:
+ return "NVPTXISD::TexUnifiedCubeU32FloatLevel";
+ case NVPTXISD::TexUnifiedCubeArrayFloatFloat:
+ return "NVPTXISD::TexUnifiedCubeArrayFloatFloat";
+ case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel:
+ return "NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel";
+ case NVPTXISD::TexUnifiedCubeArrayS32Float:
+ return "NVPTXISD::TexUnifiedCubeArrayS32Float";
+ case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel:
+ return "NVPTXISD::TexUnifiedCubeArrayS32FloatLevel";
+ case NVPTXISD::TexUnifiedCubeArrayU32Float:
+ return "NVPTXISD::TexUnifiedCubeArrayU32Float";
+ case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel:
+ return "NVPTXISD::TexUnifiedCubeArrayU32FloatLevel";
+ case NVPTXISD::Tld4UnifiedR2DFloatFloat:
+ return "NVPTXISD::Tld4UnifiedR2DFloatFloat";
+ case NVPTXISD::Tld4UnifiedG2DFloatFloat:
+ return "NVPTXISD::Tld4UnifiedG2DFloatFloat";
+ case NVPTXISD::Tld4UnifiedB2DFloatFloat:
+ return "NVPTXISD::Tld4UnifiedB2DFloatFloat";
+ case NVPTXISD::Tld4UnifiedA2DFloatFloat:
+ return "NVPTXISD::Tld4UnifiedA2DFloatFloat";
+ case NVPTXISD::Tld4UnifiedR2DS64Float:
+ return "NVPTXISD::Tld4UnifiedR2DS64Float";
+ case NVPTXISD::Tld4UnifiedG2DS64Float:
+ return "NVPTXISD::Tld4UnifiedG2DS64Float";
+ case NVPTXISD::Tld4UnifiedB2DS64Float:
+ return "NVPTXISD::Tld4UnifiedB2DS64Float";
+ case NVPTXISD::Tld4UnifiedA2DS64Float:
+ return "NVPTXISD::Tld4UnifiedA2DS64Float";
+ case NVPTXISD::Tld4UnifiedR2DU64Float:
+ return "NVPTXISD::Tld4UnifiedR2DU64Float";
+ case NVPTXISD::Tld4UnifiedG2DU64Float:
+ return "NVPTXISD::Tld4UnifiedG2DU64Float";
+ case NVPTXISD::Tld4UnifiedB2DU64Float:
+ return "NVPTXISD::Tld4UnifiedB2DU64Float";
+ case NVPTXISD::Tld4UnifiedA2DU64Float:
+ return "NVPTXISD::Tld4UnifiedA2DU64Float";
+
+ case NVPTXISD::Suld1DI8Clamp: return "NVPTXISD::Suld1DI8Clamp";
+ case NVPTXISD::Suld1DI16Clamp: return "NVPTXISD::Suld1DI16Clamp";
+ case NVPTXISD::Suld1DI32Clamp: return "NVPTXISD::Suld1DI32Clamp";
+ case NVPTXISD::Suld1DI64Clamp: return "NVPTXISD::Suld1DI64Clamp";
+ case NVPTXISD::Suld1DV2I8Clamp: return "NVPTXISD::Suld1DV2I8Clamp";
+ case NVPTXISD::Suld1DV2I16Clamp: return "NVPTXISD::Suld1DV2I16Clamp";
+ case NVPTXISD::Suld1DV2I32Clamp: return "NVPTXISD::Suld1DV2I32Clamp";
+ case NVPTXISD::Suld1DV2I64Clamp: return "NVPTXISD::Suld1DV2I64Clamp";
+ case NVPTXISD::Suld1DV4I8Clamp: return "NVPTXISD::Suld1DV4I8Clamp";
+ case NVPTXISD::Suld1DV4I16Clamp: return "NVPTXISD::Suld1DV4I16Clamp";
+ case NVPTXISD::Suld1DV4I32Clamp: return "NVPTXISD::Suld1DV4I32Clamp";
+
+ case NVPTXISD::Suld1DArrayI8Clamp: return "NVPTXISD::Suld1DArrayI8Clamp";
+ case NVPTXISD::Suld1DArrayI16Clamp: return "NVPTXISD::Suld1DArrayI16Clamp";
+ case NVPTXISD::Suld1DArrayI32Clamp: return "NVPTXISD::Suld1DArrayI32Clamp";
+ case NVPTXISD::Suld1DArrayI64Clamp: return "NVPTXISD::Suld1DArrayI64Clamp";
+ case NVPTXISD::Suld1DArrayV2I8Clamp: return "NVPTXISD::Suld1DArrayV2I8Clamp";
+ case NVPTXISD::Suld1DArrayV2I16Clamp:return "NVPTXISD::Suld1DArrayV2I16Clamp";
+ case NVPTXISD::Suld1DArrayV2I32Clamp:return "NVPTXISD::Suld1DArrayV2I32Clamp";
+ case NVPTXISD::Suld1DArrayV2I64Clamp:return "NVPTXISD::Suld1DArrayV2I64Clamp";
+ case NVPTXISD::Suld1DArrayV4I8Clamp: return "NVPTXISD::Suld1DArrayV4I8Clamp";
+ case NVPTXISD::Suld1DArrayV4I16Clamp:return "NVPTXISD::Suld1DArrayV4I16Clamp";
+ case NVPTXISD::Suld1DArrayV4I32Clamp:return "NVPTXISD::Suld1DArrayV4I32Clamp";
+
+ case NVPTXISD::Suld2DI8Clamp: return "NVPTXISD::Suld2DI8Clamp";
+ case NVPTXISD::Suld2DI16Clamp: return "NVPTXISD::Suld2DI16Clamp";
+ case NVPTXISD::Suld2DI32Clamp: return "NVPTXISD::Suld2DI32Clamp";
+ case NVPTXISD::Suld2DI64Clamp: return "NVPTXISD::Suld2DI64Clamp";
+ case NVPTXISD::Suld2DV2I8Clamp: return "NVPTXISD::Suld2DV2I8Clamp";
+ case NVPTXISD::Suld2DV2I16Clamp: return "NVPTXISD::Suld2DV2I16Clamp";
+ case NVPTXISD::Suld2DV2I32Clamp: return "NVPTXISD::Suld2DV2I32Clamp";
+ case NVPTXISD::Suld2DV2I64Clamp: return "NVPTXISD::Suld2DV2I64Clamp";
+ case NVPTXISD::Suld2DV4I8Clamp: return "NVPTXISD::Suld2DV4I8Clamp";
+ case NVPTXISD::Suld2DV4I16Clamp: return "NVPTXISD::Suld2DV4I16Clamp";
+ case NVPTXISD::Suld2DV4I32Clamp: return "NVPTXISD::Suld2DV4I32Clamp";
+
+ case NVPTXISD::Suld2DArrayI8Clamp: return "NVPTXISD::Suld2DArrayI8Clamp";
+ case NVPTXISD::Suld2DArrayI16Clamp: return "NVPTXISD::Suld2DArrayI16Clamp";
+ case NVPTXISD::Suld2DArrayI32Clamp: return "NVPTXISD::Suld2DArrayI32Clamp";
+ case NVPTXISD::Suld2DArrayI64Clamp: return "NVPTXISD::Suld2DArrayI64Clamp";
+ case NVPTXISD::Suld2DArrayV2I8Clamp: return "NVPTXISD::Suld2DArrayV2I8Clamp";
+ case NVPTXISD::Suld2DArrayV2I16Clamp:return "NVPTXISD::Suld2DArrayV2I16Clamp";
+ case NVPTXISD::Suld2DArrayV2I32Clamp:return "NVPTXISD::Suld2DArrayV2I32Clamp";
+ case NVPTXISD::Suld2DArrayV2I64Clamp:return "NVPTXISD::Suld2DArrayV2I64Clamp";
+ case NVPTXISD::Suld2DArrayV4I8Clamp: return "NVPTXISD::Suld2DArrayV4I8Clamp";
+ case NVPTXISD::Suld2DArrayV4I16Clamp:return "NVPTXISD::Suld2DArrayV4I16Clamp";
+ case NVPTXISD::Suld2DArrayV4I32Clamp:return "NVPTXISD::Suld2DArrayV4I32Clamp";
+
+ case NVPTXISD::Suld3DI8Clamp: return "NVPTXISD::Suld3DI8Clamp";
+ case NVPTXISD::Suld3DI16Clamp: return "NVPTXISD::Suld3DI16Clamp";
+ case NVPTXISD::Suld3DI32Clamp: return "NVPTXISD::Suld3DI32Clamp";
+ case NVPTXISD::Suld3DI64Clamp: return "NVPTXISD::Suld3DI64Clamp";
+ case NVPTXISD::Suld3DV2I8Clamp: return "NVPTXISD::Suld3DV2I8Clamp";
+ case NVPTXISD::Suld3DV2I16Clamp: return "NVPTXISD::Suld3DV2I16Clamp";
+ case NVPTXISD::Suld3DV2I32Clamp: return "NVPTXISD::Suld3DV2I32Clamp";
+ case NVPTXISD::Suld3DV2I64Clamp: return "NVPTXISD::Suld3DV2I64Clamp";
+ case NVPTXISD::Suld3DV4I8Clamp: return "NVPTXISD::Suld3DV4I8Clamp";
+ case NVPTXISD::Suld3DV4I16Clamp: return "NVPTXISD::Suld3DV4I16Clamp";
+ case NVPTXISD::Suld3DV4I32Clamp: return "NVPTXISD::Suld3DV4I32Clamp";
+
+ case NVPTXISD::Suld1DI8Trap: return "NVPTXISD::Suld1DI8Trap";
+ case NVPTXISD::Suld1DI16Trap: return "NVPTXISD::Suld1DI16Trap";
+ case NVPTXISD::Suld1DI32Trap: return "NVPTXISD::Suld1DI32Trap";
+ case NVPTXISD::Suld1DI64Trap: return "NVPTXISD::Suld1DI64Trap";
+ case NVPTXISD::Suld1DV2I8Trap: return "NVPTXISD::Suld1DV2I8Trap";
+ case NVPTXISD::Suld1DV2I16Trap: return "NVPTXISD::Suld1DV2I16Trap";
+ case NVPTXISD::Suld1DV2I32Trap: return "NVPTXISD::Suld1DV2I32Trap";
+ case NVPTXISD::Suld1DV2I64Trap: return "NVPTXISD::Suld1DV2I64Trap";
+ case NVPTXISD::Suld1DV4I8Trap: return "NVPTXISD::Suld1DV4I8Trap";
+ case NVPTXISD::Suld1DV4I16Trap: return "NVPTXISD::Suld1DV4I16Trap";
+ case NVPTXISD::Suld1DV4I32Trap: return "NVPTXISD::Suld1DV4I32Trap";
+
+ case NVPTXISD::Suld1DArrayI8Trap: return "NVPTXISD::Suld1DArrayI8Trap";
+ case NVPTXISD::Suld1DArrayI16Trap: return "NVPTXISD::Suld1DArrayI16Trap";
+ case NVPTXISD::Suld1DArrayI32Trap: return "NVPTXISD::Suld1DArrayI32Trap";
+ case NVPTXISD::Suld1DArrayI64Trap: return "NVPTXISD::Suld1DArrayI64Trap";
+ case NVPTXISD::Suld1DArrayV2I8Trap: return "NVPTXISD::Suld1DArrayV2I8Trap";
+ case NVPTXISD::Suld1DArrayV2I16Trap: return "NVPTXISD::Suld1DArrayV2I16Trap";
+ case NVPTXISD::Suld1DArrayV2I32Trap: return "NVPTXISD::Suld1DArrayV2I32Trap";
+ case NVPTXISD::Suld1DArrayV2I64Trap: return "NVPTXISD::Suld1DArrayV2I64Trap";
+ case NVPTXISD::Suld1DArrayV4I8Trap: return "NVPTXISD::Suld1DArrayV4I8Trap";
+ case NVPTXISD::Suld1DArrayV4I16Trap: return "NVPTXISD::Suld1DArrayV4I16Trap";
+ case NVPTXISD::Suld1DArrayV4I32Trap: return "NVPTXISD::Suld1DArrayV4I32Trap";
+
+ case NVPTXISD::Suld2DI8Trap: return "NVPTXISD::Suld2DI8Trap";
+ case NVPTXISD::Suld2DI16Trap: return "NVPTXISD::Suld2DI16Trap";
+ case NVPTXISD::Suld2DI32Trap: return "NVPTXISD::Suld2DI32Trap";
+ case NVPTXISD::Suld2DI64Trap: return "NVPTXISD::Suld2DI64Trap";
+ case NVPTXISD::Suld2DV2I8Trap: return "NVPTXISD::Suld2DV2I8Trap";
+ case NVPTXISD::Suld2DV2I16Trap: return "NVPTXISD::Suld2DV2I16Trap";
+ case NVPTXISD::Suld2DV2I32Trap: return "NVPTXISD::Suld2DV2I32Trap";
+ case NVPTXISD::Suld2DV2I64Trap: return "NVPTXISD::Suld2DV2I64Trap";
+ case NVPTXISD::Suld2DV4I8Trap: return "NVPTXISD::Suld2DV4I8Trap";
+ case NVPTXISD::Suld2DV4I16Trap: return "NVPTXISD::Suld2DV4I16Trap";
+ case NVPTXISD::Suld2DV4I32Trap: return "NVPTXISD::Suld2DV4I32Trap";
+
+ case NVPTXISD::Suld2DArrayI8Trap: return "NVPTXISD::Suld2DArrayI8Trap";
+ case NVPTXISD::Suld2DArrayI16Trap: return "NVPTXISD::Suld2DArrayI16Trap";
+ case NVPTXISD::Suld2DArrayI32Trap: return "NVPTXISD::Suld2DArrayI32Trap";
+ case NVPTXISD::Suld2DArrayI64Trap: return "NVPTXISD::Suld2DArrayI64Trap";
+ case NVPTXISD::Suld2DArrayV2I8Trap: return "NVPTXISD::Suld2DArrayV2I8Trap";
+ case NVPTXISD::Suld2DArrayV2I16Trap: return "NVPTXISD::Suld2DArrayV2I16Trap";
+ case NVPTXISD::Suld2DArrayV2I32Trap: return "NVPTXISD::Suld2DArrayV2I32Trap";
+ case NVPTXISD::Suld2DArrayV2I64Trap: return "NVPTXISD::Suld2DArrayV2I64Trap";
+ case NVPTXISD::Suld2DArrayV4I8Trap: return "NVPTXISD::Suld2DArrayV4I8Trap";
+ case NVPTXISD::Suld2DArrayV4I16Trap: return "NVPTXISD::Suld2DArrayV4I16Trap";
+ case NVPTXISD::Suld2DArrayV4I32Trap: return "NVPTXISD::Suld2DArrayV4I32Trap";
+
+ case NVPTXISD::Suld3DI8Trap: return "NVPTXISD::Suld3DI8Trap";
+ case NVPTXISD::Suld3DI16Trap: return "NVPTXISD::Suld3DI16Trap";
+ case NVPTXISD::Suld3DI32Trap: return "NVPTXISD::Suld3DI32Trap";
+ case NVPTXISD::Suld3DI64Trap: return "NVPTXISD::Suld3DI64Trap";
+ case NVPTXISD::Suld3DV2I8Trap: return "NVPTXISD::Suld3DV2I8Trap";
+ case NVPTXISD::Suld3DV2I16Trap: return "NVPTXISD::Suld3DV2I16Trap";
+ case NVPTXISD::Suld3DV2I32Trap: return "NVPTXISD::Suld3DV2I32Trap";
+ case NVPTXISD::Suld3DV2I64Trap: return "NVPTXISD::Suld3DV2I64Trap";
+ case NVPTXISD::Suld3DV4I8Trap: return "NVPTXISD::Suld3DV4I8Trap";
+ case NVPTXISD::Suld3DV4I16Trap: return "NVPTXISD::Suld3DV4I16Trap";
+ case NVPTXISD::Suld3DV4I32Trap: return "NVPTXISD::Suld3DV4I32Trap";
+
+ case NVPTXISD::Suld1DI8Zero: return "NVPTXISD::Suld1DI8Zero";
+ case NVPTXISD::Suld1DI16Zero: return "NVPTXISD::Suld1DI16Zero";
+ case NVPTXISD::Suld1DI32Zero: return "NVPTXISD::Suld1DI32Zero";
+ case NVPTXISD::Suld1DI64Zero: return "NVPTXISD::Suld1DI64Zero";
+ case NVPTXISD::Suld1DV2I8Zero: return "NVPTXISD::Suld1DV2I8Zero";
+ case NVPTXISD::Suld1DV2I16Zero: return "NVPTXISD::Suld1DV2I16Zero";
+ case NVPTXISD::Suld1DV2I32Zero: return "NVPTXISD::Suld1DV2I32Zero";
+ case NVPTXISD::Suld1DV2I64Zero: return "NVPTXISD::Suld1DV2I64Zero";
+ case NVPTXISD::Suld1DV4I8Zero: return "NVPTXISD::Suld1DV4I8Zero";
+ case NVPTXISD::Suld1DV4I16Zero: return "NVPTXISD::Suld1DV4I16Zero";
+ case NVPTXISD::Suld1DV4I32Zero: return "NVPTXISD::Suld1DV4I32Zero";
+
+ case NVPTXISD::Suld1DArrayI8Zero: return "NVPTXISD::Suld1DArrayI8Zero";
+ case NVPTXISD::Suld1DArrayI16Zero: return "NVPTXISD::Suld1DArrayI16Zero";
+ case NVPTXISD::Suld1DArrayI32Zero: return "NVPTXISD::Suld1DArrayI32Zero";
+ case NVPTXISD::Suld1DArrayI64Zero: return "NVPTXISD::Suld1DArrayI64Zero";
+ case NVPTXISD::Suld1DArrayV2I8Zero: return "NVPTXISD::Suld1DArrayV2I8Zero";
+ case NVPTXISD::Suld1DArrayV2I16Zero: return "NVPTXISD::Suld1DArrayV2I16Zero";
+ case NVPTXISD::Suld1DArrayV2I32Zero: return "NVPTXISD::Suld1DArrayV2I32Zero";
+ case NVPTXISD::Suld1DArrayV2I64Zero: return "NVPTXISD::Suld1DArrayV2I64Zero";
+ case NVPTXISD::Suld1DArrayV4I8Zero: return "NVPTXISD::Suld1DArrayV4I8Zero";
+ case NVPTXISD::Suld1DArrayV4I16Zero: return "NVPTXISD::Suld1DArrayV4I16Zero";
+ case NVPTXISD::Suld1DArrayV4I32Zero: return "NVPTXISD::Suld1DArrayV4I32Zero";
+
+ case NVPTXISD::Suld2DI8Zero: return "NVPTXISD::Suld2DI8Zero";
+ case NVPTXISD::Suld2DI16Zero: return "NVPTXISD::Suld2DI16Zero";
+ case NVPTXISD::Suld2DI32Zero: return "NVPTXISD::Suld2DI32Zero";
+ case NVPTXISD::Suld2DI64Zero: return "NVPTXISD::Suld2DI64Zero";
+ case NVPTXISD::Suld2DV2I8Zero: return "NVPTXISD::Suld2DV2I8Zero";
+ case NVPTXISD::Suld2DV2I16Zero: return "NVPTXISD::Suld2DV2I16Zero";
+ case NVPTXISD::Suld2DV2I32Zero: return "NVPTXISD::Suld2DV2I32Zero";
+ case NVPTXISD::Suld2DV2I64Zero: return "NVPTXISD::Suld2DV2I64Zero";
+ case NVPTXISD::Suld2DV4I8Zero: return "NVPTXISD::Suld2DV4I8Zero";
+ case NVPTXISD::Suld2DV4I16Zero: return "NVPTXISD::Suld2DV4I16Zero";
+ case NVPTXISD::Suld2DV4I32Zero: return "NVPTXISD::Suld2DV4I32Zero";
+
+ case NVPTXISD::Suld2DArrayI8Zero: return "NVPTXISD::Suld2DArrayI8Zero";
+ case NVPTXISD::Suld2DArrayI16Zero: return "NVPTXISD::Suld2DArrayI16Zero";
+ case NVPTXISD::Suld2DArrayI32Zero: return "NVPTXISD::Suld2DArrayI32Zero";
+ case NVPTXISD::Suld2DArrayI64Zero: return "NVPTXISD::Suld2DArrayI64Zero";
+ case NVPTXISD::Suld2DArrayV2I8Zero: return "NVPTXISD::Suld2DArrayV2I8Zero";
+ case NVPTXISD::Suld2DArrayV2I16Zero: return "NVPTXISD::Suld2DArrayV2I16Zero";
+ case NVPTXISD::Suld2DArrayV2I32Zero: return "NVPTXISD::Suld2DArrayV2I32Zero";
+ case NVPTXISD::Suld2DArrayV2I64Zero: return "NVPTXISD::Suld2DArrayV2I64Zero";
+ case NVPTXISD::Suld2DArrayV4I8Zero: return "NVPTXISD::Suld2DArrayV4I8Zero";
+ case NVPTXISD::Suld2DArrayV4I16Zero: return "NVPTXISD::Suld2DArrayV4I16Zero";
+ case NVPTXISD::Suld2DArrayV4I32Zero: return "NVPTXISD::Suld2DArrayV4I32Zero";
+
+ case NVPTXISD::Suld3DI8Zero: return "NVPTXISD::Suld3DI8Zero";
+ case NVPTXISD::Suld3DI16Zero: return "NVPTXISD::Suld3DI16Zero";
+ case NVPTXISD::Suld3DI32Zero: return "NVPTXISD::Suld3DI32Zero";
+ case NVPTXISD::Suld3DI64Zero: return "NVPTXISD::Suld3DI64Zero";
+ case NVPTXISD::Suld3DV2I8Zero: return "NVPTXISD::Suld3DV2I8Zero";
+ case NVPTXISD::Suld3DV2I16Zero: return "NVPTXISD::Suld3DV2I16Zero";
+ case NVPTXISD::Suld3DV2I32Zero: return "NVPTXISD::Suld3DV2I32Zero";
+ case NVPTXISD::Suld3DV2I64Zero: return "NVPTXISD::Suld3DV2I64Zero";
+ case NVPTXISD::Suld3DV4I8Zero: return "NVPTXISD::Suld3DV4I8Zero";
+ case NVPTXISD::Suld3DV4I16Zero: return "NVPTXISD::Suld3DV4I16Zero";
+ case NVPTXISD::Suld3DV4I32Zero: return "NVPTXISD::Suld3DV4I32Zero";
+ }
+}
+
+TargetLoweringBase::LegalizeTypeAction
+NVPTXTargetLowering::getPreferredVectorAction(EVT VT) const {
+ if (VT.getVectorNumElements() != 1 && VT.getScalarType() == MVT::i1)
+ return TypeSplitVector;
+
+ return TargetLoweringBase::getPreferredVectorAction(VT);
+}
+
+SDValue
+NVPTXTargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc dl(Op);
+ const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
+ Op = DAG.getTargetGlobalAddress(GV, dl, getPointerTy());
+ return DAG.getNode(NVPTXISD::Wrapper, dl, getPointerTy(), Op);
+}
+
+std::string
+NVPTXTargetLowering::getPrototype(Type *retTy, const ArgListTy &Args,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ unsigned retAlignment,
+ const ImmutableCallSite *CS) const {
+
+ bool isABI = (nvptxSubtarget.getSmVersion() >= 20);
+ assert(isABI && "Non-ABI compilation is not supported");
+ if (!isABI)
+ return "";
+
+ std::stringstream O;
+ O << "prototype_" << uniqueCallSite << " : .callprototype ";
+
+ if (retTy->getTypeID() == Type::VoidTyID) {
+ O << "()";
+ } else {
+ O << "(";
+ if (retTy->isFloatingPointTy() || retTy->isIntegerTy()) {
+ unsigned size = 0;
+ if (const IntegerType *ITy = dyn_cast<IntegerType>(retTy)) {
+ size = ITy->getBitWidth();
+ if (size < 32)
+ size = 32;
+ } else {
+ assert(retTy->isFloatingPointTy() &&
+ "Floating point type expected here");
+ size = retTy->getPrimitiveSizeInBits();
+ }
+
+ O << ".param .b" << size << " _";
+ } else if (isa<PointerType>(retTy)) {
+ O << ".param .b" << getPointerTy().getSizeInBits() << " _";
+ } else {
+ if ((retTy->getTypeID() == Type::StructTyID) ||
+ isa<VectorType>(retTy)) {
+ O << ".param .align "
+ << retAlignment
+ << " .b8 _["
+ << getDataLayout()->getTypeAllocSize(retTy) << "]";
+ } else {
+ assert(false && "Unknown return type");
+ }
+ }
+ O << ") ";
+ }
+ O << "_ (";
+
+ bool first = true;
+ MVT thePointerTy = getPointerTy();
+
+ unsigned OIdx = 0;
+ for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) {
+ Type *Ty = Args[i].Ty;
+ if (!first) {
+ O << ", ";
+ }
+ first = false;
+
+ if (Outs[OIdx].Flags.isByVal() == false) {
+ if (Ty->isAggregateType() || Ty->isVectorTy()) {
+ unsigned align = 0;
+ const CallInst *CallI = cast<CallInst>(CS->getInstruction());
+ const DataLayout *TD = getDataLayout();
+ // +1 because index 0 is reserved for return type alignment
+ if (!llvm::getAlign(*CallI, i + 1, align))
+ align = TD->getABITypeAlignment(Ty);
+ unsigned sz = TD->getTypeAllocSize(Ty);
+ O << ".param .align " << align << " .b8 ";
+ O << "_";
+ O << "[" << sz << "]";
+ // update the index for Outs
+ SmallVector<EVT, 16> vtparts;
+ ComputeValueVTs(*this, Ty, vtparts);
+ if (unsigned len = vtparts.size())
+ OIdx += len - 1;
+ continue;
+ }
+ // i8 types in IR will be i16 types in SDAG
+ assert((getValueType(Ty) == Outs[OIdx].VT ||
+ (getValueType(Ty) == MVT::i8 && Outs[OIdx].VT == MVT::i16)) &&
+ "type mismatch between callee prototype and arguments");
+ // scalar type
+ unsigned sz = 0;
+ if (isa<IntegerType>(Ty)) {
+ sz = cast<IntegerType>(Ty)->getBitWidth();
+ if (sz < 32)
+ sz = 32;
+ } else if (isa<PointerType>(Ty))
+ sz = thePointerTy.getSizeInBits();
+ else
+ sz = Ty->getPrimitiveSizeInBits();
+ O << ".param .b" << sz << " ";
+ O << "_";
+ continue;
+ }
+ const PointerType *PTy = dyn_cast<PointerType>(Ty);
+ assert(PTy && "Param with byval attribute should be a pointer type");
+ Type *ETy = PTy->getElementType();
+
+ unsigned align = Outs[OIdx].Flags.getByValAlign();
+ unsigned sz = getDataLayout()->getTypeAllocSize(ETy);
+ O << ".param .align " << align << " .b8 ";
+ O << "_";
+ O << "[" << sz << "]";
+ }
+ O << ");";
+ return O.str();
+}
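
To make the emitted text concrete, the small standalone program below assembles the same kind of prototype string for an assumed call site 0 that returns i32 and passes one i32 plus a 16-byte byval aggregate aligned to 4. It follows the formatting rules of getPrototype above but is only an illustration, not the routine itself.

#include <iostream>
#include <sstream>

int main() {
  std::stringstream O;
  O << "prototype_0 : .callprototype ";   // call-site number assumed to be 0
  O << "(.param .b32 _) ";                // i32 return, widened to 32 bits
  O << "_ (";
  O << ".param .b32 _";                   // scalar i32 argument
  O << ", .param .align 4 .b8 _[16]";     // 16-byte byval aggregate argument
  O << ");";
  std::cout << O.str() << '\n';
}
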
+
+unsigned
+NVPTXTargetLowering::getArgumentAlignment(SDValue Callee,
+ const ImmutableCallSite *CS,
+ Type *Ty,
+ unsigned Idx) const {
+ const DataLayout *TD = getDataLayout();
+ unsigned Align = 0;
+ const Value *DirectCallee = CS->getCalledFunction();
+
+ if (!DirectCallee) {
+ // We don't have a direct function symbol, but that may be because of
+ // constant cast instructions in the call.
+ const Instruction *CalleeI = CS->getInstruction();
+ assert(CalleeI && "Call target is not a function or derived value?");
+
+ // With bitcast'd call targets, the instruction will be the call
+ if (isa<CallInst>(CalleeI)) {
+ // Check if we have call alignment metadata
+ if (llvm::getAlign(*cast<CallInst>(CalleeI), Idx, Align))
+ return Align;
+
+ const Value *CalleeV = cast<CallInst>(CalleeI)->getCalledValue();
+ // Ignore any bitcast instructions
+ while (isa<ConstantExpr>(CalleeV)) {
+ const ConstantExpr *CE = cast<ConstantExpr>(CalleeV);
+ if (!CE->isCast())
+ break;
+ // Look through the bitcast
+ CalleeV = cast<ConstantExpr>(CalleeV)->getOperand(0);
+ }
+
+ // We have now looked past all of the bitcasts. Do we finally have a
+ // Function?
+ if (isa<Function>(CalleeV))
+ DirectCallee = CalleeV;
+ }
+ }
+
+ // Check for function alignment information if we found that the
+ // ultimate target is a Function
+ if (DirectCallee)
+ if (llvm::getAlign(*cast<Function>(DirectCallee), Idx, Align))
+ return Align;
+
+ // The call is indirect or alignment information is not available; fall back
+ // to the ABI type alignment.
+ return TD->getABITypeAlignment(Ty);
+}
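
The fallback order implemented above is: call-site alignment metadata (when the callee is not immediately known), then alignment metadata on the Function recovered by peeling constant casts, and finally the ABI type alignment from the DataLayout. A minimal sketch of that chain, with hypothetical callSiteAlign/calleeFnAlign lookups standing in for the llvm::getAlign queries:

#include <iostream>
#include <optional>

// Hypothetical metadata lookups; both report "no alignment found" here.
std::optional<unsigned> callSiteAlign(unsigned) { return std::nullopt; }
std::optional<unsigned> calleeFnAlign(unsigned) { return std::nullopt; }

// Mirrors getArgumentAlignment's fallback order: call-site metadata, then
// callee-function metadata, then the ABI type alignment.
unsigned argumentAlignment(unsigned idx, unsigned abiTypeAlign) {
  if (auto a = callSiteAlign(idx))
    return *a;
  if (auto a = calleeFnAlign(idx))
    return *a;
  return abiTypeAlign;
}

int main() {
  std::cout << argumentAlignment(1, 8) << '\n';  // falls through to ABI: 8
}
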
+
+SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
+ SmallVectorImpl<SDValue> &InVals) const {
+ SelectionDAG &DAG = CLI.DAG;
+ SDLoc dl = CLI.DL;
+ SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
+ SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
+ SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
+ SDValue Chain = CLI.Chain;
+ SDValue Callee = CLI.Callee;
+ bool &isTailCall = CLI.IsTailCall;
+ ArgListTy &Args = CLI.getArgs();
+ Type *retTy = CLI.RetTy;
+ ImmutableCallSite *CS = CLI.CS;
+
+ bool isABI = (nvptxSubtarget.getSmVersion() >= 20);
+ assert(isABI && "Non-ABI compilation is not supported");
+ if (!isABI)
+ return Chain;
+ const DataLayout *TD = getDataLayout();
+ MachineFunction &MF = DAG.getMachineFunction();
+ const Function *F = MF.getFunction();
+
+ SDValue tempChain = Chain;
+ Chain =
+ DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(uniqueCallSite, true),
+ dl);
+ SDValue InFlag = Chain.getValue(1);
+
+ unsigned paramCount = 0;
+ // Args.size() and Outs.size() need not match.
+ // Outs.size() will be larger
+ // * if there is an aggregate argument with multiple fields (each field
+ // showing up separately in Outs)
+ // * if there is a vector argument with more than typical vector-length
+ // elements (generally if more than 4) where each vector element is
+ // individually present in Outs.
+ // So a different index should be used for indexing into Outs/OutVals.
+ // See similar issue in LowerFormalArguments.
+ unsigned OIdx = 0;
+ // Declare the .params or .reg needed to pass values
+ // to the function.
+ for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) {
+ EVT VT = Outs[OIdx].VT;
+ Type *Ty = Args[i].Ty;
+
+ if (Outs[OIdx].Flags.isByVal() == false) {
+ if (Ty->isAggregateType()) {
+ // aggregate
+ SmallVector<EVT, 16> vtparts;
+ SmallVector<uint64_t, 16> Offsets;
+ ComputePTXValueVTs(*this, Ty, vtparts, &Offsets, 0);
+
+ unsigned align = getArgumentAlignment(Callee, CS, Ty, paramCount + 1);
+ // declare .param .align <align> .b8 .param<n>[<size>];
+ unsigned sz = TD->getTypeAllocSize(Ty);
+ SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue DeclareParamOps[] = { Chain, DAG.getConstant(align, MVT::i32),
+ DAG.getConstant(paramCount, MVT::i32),
+ DAG.getConstant(sz, MVT::i32), InFlag };
+ Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
+ DeclareParamOps);
+ InFlag = Chain.getValue(1);
+ for (unsigned j = 0, je = vtparts.size(); j != je; ++j) {
+ EVT elemtype = vtparts[j];
+ unsigned ArgAlign = GreatestCommonDivisor64(align, Offsets[j]);
+ if (elemtype.isInteger() && (sz < 8))
+ sz = 8;
+ SDValue StVal = OutVals[OIdx];
+ if (elemtype.getSizeInBits() < 16) {
+ StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal);
+ }
+ SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue CopyParamOps[] = { Chain,
+ DAG.getConstant(paramCount, MVT::i32),
+ DAG.getConstant(Offsets[j], MVT::i32),
+ StVal, InFlag };
+ Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl,
+ CopyParamVTs, CopyParamOps,
+ elemtype, MachinePointerInfo(),
+ ArgAlign);
+ InFlag = Chain.getValue(1);
+ ++OIdx;
+ }
+ if (vtparts.size() > 0)
+ --OIdx;
+ ++paramCount;
+ continue;
+ }
+ if (Ty->isVectorTy()) {
+ EVT ObjectVT = getValueType(Ty);
+ unsigned align = getArgumentAlignment(Callee, CS, Ty, paramCount + 1);
+ // declare .param .align <align> .b8 .param<n>[<size>];
+ unsigned sz = TD->getTypeAllocSize(Ty);
+ SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue DeclareParamOps[] = { Chain, DAG.getConstant(align, MVT::i32),
+ DAG.getConstant(paramCount, MVT::i32),
+ DAG.getConstant(sz, MVT::i32), InFlag };
+ Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
+ DeclareParamOps);
+ InFlag = Chain.getValue(1);
+ unsigned NumElts = ObjectVT.getVectorNumElements();
+ EVT EltVT = ObjectVT.getVectorElementType();
+ EVT MemVT = EltVT;
+ bool NeedExtend = false;
+ if (EltVT.getSizeInBits() < 16) {
+ NeedExtend = true;
+ EltVT = MVT::i16;
+ }
+
+ // V1 store
+ if (NumElts == 1) {
+ SDValue Elt = OutVals[OIdx++];
+ if (NeedExtend)
+ Elt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Elt);
+
+ SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue CopyParamOps[] = { Chain,
+ DAG.getConstant(paramCount, MVT::i32),
+ DAG.getConstant(0, MVT::i32), Elt,
+ InFlag };
+ Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl,
+ CopyParamVTs, CopyParamOps,
+ MemVT, MachinePointerInfo());
+ InFlag = Chain.getValue(1);
+ } else if (NumElts == 2) {
+ SDValue Elt0 = OutVals[OIdx++];
+ SDValue Elt1 = OutVals[OIdx++];
+ if (NeedExtend) {
+ Elt0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Elt0);
+ Elt1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Elt1);
+ }
+
+ SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue CopyParamOps[] = { Chain,
+ DAG.getConstant(paramCount, MVT::i32),
+ DAG.getConstant(0, MVT::i32), Elt0, Elt1,
+ InFlag };
+ Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParamV2, dl,
+ CopyParamVTs, CopyParamOps,
+ MemVT, MachinePointerInfo());
+ InFlag = Chain.getValue(1);
+ } else {
+ unsigned curOffset = 0;
+ // V4 stores
+ // We have at least 4 elements (<3 x Ty> expands to 4 elements) and the
+ // vector will be expanded to a power of 2 elements, so we know we can
+ // always round up to the next multiple of 4 when creating the vector
+ // stores.
+ // e.g. 4 elem => 1 st.v4
+ // 6 elem => 2 st.v4
+ // 8 elem => 2 st.v4
+ // 11 elem => 3 st.v4
+ unsigned VecSize = 4;
+ if (EltVT.getSizeInBits() == 64)
+ VecSize = 2;
+
+ // This is potentially only part of a vector, so assume all elements
+ // are packed together.
+ unsigned PerStoreOffset = MemVT.getStoreSizeInBits() / 8 * VecSize;
+
+ for (unsigned i = 0; i < NumElts; i += VecSize) {
+ // Get values
+ SDValue StoreVal;
+ SmallVector<SDValue, 8> Ops;
+ Ops.push_back(Chain);
+ Ops.push_back(DAG.getConstant(paramCount, MVT::i32));
+ Ops.push_back(DAG.getConstant(curOffset, MVT::i32));
+
+ unsigned Opc = NVPTXISD::StoreParamV2;
+
+ StoreVal = OutVals[OIdx++];
+ if (NeedExtend)
+ StoreVal = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal);
+ Ops.push_back(StoreVal);
+
+ if (i + 1 < NumElts) {
+ StoreVal = OutVals[OIdx++];
+ if (NeedExtend)
+ StoreVal =
+ DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal);
+ } else {
+ StoreVal = DAG.getUNDEF(EltVT);
+ }
+ Ops.push_back(StoreVal);
+
+ if (VecSize == 4) {
+ Opc = NVPTXISD::StoreParamV4;
+ if (i + 2 < NumElts) {
+ StoreVal = OutVals[OIdx++];
+ if (NeedExtend)
+ StoreVal =
+ DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal);
+ } else {
+ StoreVal = DAG.getUNDEF(EltVT);
+ }
+ Ops.push_back(StoreVal);
+
+ if (i + 3 < NumElts) {
+ StoreVal = OutVals[OIdx++];
+ if (NeedExtend)
+ StoreVal =
+ DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal);
+ } else {
+ StoreVal = DAG.getUNDEF(EltVT);
+ }
+ Ops.push_back(StoreVal);
+ }
+
+ Ops.push_back(InFlag);
+
+ SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+ Chain = DAG.getMemIntrinsicNode(Opc, dl, CopyParamVTs, Ops,
+ MemVT, MachinePointerInfo());
+ InFlag = Chain.getValue(1);
+ curOffset += PerStoreOffset;
+ }
+ }
+ ++paramCount;
+ --OIdx;
+ continue;
+ }
+ // Plain scalar
+ // for ABI, declare .param .b<size> .param<n>;
+ unsigned sz = VT.getSizeInBits();
+ bool needExtend = false;
+ if (VT.isInteger()) {
+ if (sz < 16)
+ needExtend = true;
+ if (sz < 32)
+ sz = 32;
+ }
+ SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue DeclareParamOps[] = { Chain,
+ DAG.getConstant(paramCount, MVT::i32),
+ DAG.getConstant(sz, MVT::i32),
+ DAG.getConstant(0, MVT::i32), InFlag };
+ Chain = DAG.getNode(NVPTXISD::DeclareScalarParam, dl, DeclareParamVTs,
+ DeclareParamOps);
+ InFlag = Chain.getValue(1);
+ SDValue OutV = OutVals[OIdx];
+ if (needExtend) {
+ // zext/sext i1 to i16
+ unsigned opc = ISD::ZERO_EXTEND;
+ if (Outs[OIdx].Flags.isSExt())
+ opc = ISD::SIGN_EXTEND;
+ OutV = DAG.getNode(opc, dl, MVT::i16, OutV);
+ }
+ SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue CopyParamOps[] = { Chain, DAG.getConstant(paramCount, MVT::i32),
+ DAG.getConstant(0, MVT::i32), OutV, InFlag };
+
+ unsigned opcode = NVPTXISD::StoreParam;
+ if (Outs[OIdx].Flags.isZExt())
+ opcode = NVPTXISD::StoreParamU32;
+ else if (Outs[OIdx].Flags.isSExt())
+ opcode = NVPTXISD::StoreParamS32;
+ Chain = DAG.getMemIntrinsicNode(opcode, dl, CopyParamVTs, CopyParamOps,
+ VT, MachinePointerInfo());
+
+ InFlag = Chain.getValue(1);
+ ++paramCount;
+ continue;
+ }
+ // struct or vector
+ SmallVector<EVT, 16> vtparts;
+ SmallVector<uint64_t, 16> Offsets;
+ const PointerType *PTy = dyn_cast<PointerType>(Args[i].Ty);
+ assert(PTy && "Type of a byval parameter should be pointer");
+ ComputePTXValueVTs(*this, PTy->getElementType(), vtparts, &Offsets, 0);
+
+ // declare .param .align <align> .b8 .param<n>[<size>];
+ unsigned sz = Outs[OIdx].Flags.getByValSize();
+ SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+ unsigned ArgAlign = Outs[OIdx].Flags.getByValAlign();
+ // The ByValAlign in the Outs[OIdx].Flags is always set at this point,
+ // so we don't need to worry about natural alignment or not.
+ // See TargetLowering::LowerCallTo().
+ SDValue DeclareParamOps[] = {
+ Chain, DAG.getConstant(Outs[OIdx].Flags.getByValAlign(), MVT::i32),
+ DAG.getConstant(paramCount, MVT::i32), DAG.getConstant(sz, MVT::i32),
+ InFlag
+ };
+ Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
+ DeclareParamOps);
+ InFlag = Chain.getValue(1);
+ for (unsigned j = 0, je = vtparts.size(); j != je; ++j) {
+ EVT elemtype = vtparts[j];
+ int curOffset = Offsets[j];
+ unsigned PartAlign = GreatestCommonDivisor64(ArgAlign, curOffset);
+ SDValue srcAddr =
+ DAG.getNode(ISD::ADD, dl, getPointerTy(), OutVals[OIdx],
+ DAG.getConstant(curOffset, getPointerTy()));
+ SDValue theVal = DAG.getLoad(elemtype, dl, tempChain, srcAddr,
+ MachinePointerInfo(), false, false, false,
+ PartAlign);
+ if (elemtype.getSizeInBits() < 16) {
+ theVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, theVal);
+ }
+ SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue CopyParamOps[] = { Chain, DAG.getConstant(paramCount, MVT::i32),
+ DAG.getConstant(curOffset, MVT::i32), theVal,
+ InFlag };
+ Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl, CopyParamVTs,
+ CopyParamOps, elemtype,
+ MachinePointerInfo());
+
+ InFlag = Chain.getValue(1);
+ }
+ ++paramCount;
+ }
+
+ GlobalAddressSDNode *Func = dyn_cast<GlobalAddressSDNode>(Callee.getNode());
+ unsigned retAlignment = 0;
+
+ // Handle Result
+ if (Ins.size() > 0) {
+ SmallVector<EVT, 16> resvtparts;
+ ComputeValueVTs(*this, retTy, resvtparts);
+
+ // Declare
+ // .param .align 16 .b8 retval0[<size-in-bytes>], or
+ // .param .b<size-in-bits> retval0
+ unsigned resultsz = TD->getTypeAllocSizeInBits(retTy);
+ if (retTy->isSingleValueType()) {
+ // Scalar needs to be at least 32-bit wide
+ if (resultsz < 32)
+ resultsz = 32;
+ SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue DeclareRetOps[] = { Chain, DAG.getConstant(1, MVT::i32),
+ DAG.getConstant(resultsz, MVT::i32),
+ DAG.getConstant(0, MVT::i32), InFlag };
+ Chain = DAG.getNode(NVPTXISD::DeclareRet, dl, DeclareRetVTs,
+ DeclareRetOps);
+ InFlag = Chain.getValue(1);
+ } else {
+ retAlignment = getArgumentAlignment(Callee, CS, retTy, 0);
+ SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue DeclareRetOps[] = { Chain,
+ DAG.getConstant(retAlignment, MVT::i32),
+ DAG.getConstant(resultsz / 8, MVT::i32),
+ DAG.getConstant(0, MVT::i32), InFlag };
+ Chain = DAG.getNode(NVPTXISD::DeclareRetParam, dl, DeclareRetVTs,
+ DeclareRetOps);
+ InFlag = Chain.getValue(1);
+ }
+ }
+
+ if (!Func) {
+ // This is the indirect function call case: PTX requires a prototype of the
+ // form
+ // proto_0 : .callprototype(.param .b32 _) _ (.param .b32 _);
+ // to be emitted, and the label has to be used as the last arg of the call
+ // instruction.
+ // The prototype is embedded in a string and used as the operand of a
+ // CallPrototype SDNode, which will print out as the value of the string.
+ SDVTList ProtoVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+ std::string Proto = getPrototype(retTy, Args, Outs, retAlignment, CS);
+ const char *ProtoStr =
+ nvTM->getManagedStrPool()->getManagedString(Proto.c_str())->c_str();
+ SDValue ProtoOps[] = {
+ Chain, DAG.getTargetExternalSymbol(ProtoStr, MVT::i32), InFlag,
+ };
+ Chain = DAG.getNode(NVPTXISD::CallPrototype, dl, ProtoVTs, ProtoOps);
+ InFlag = Chain.getValue(1);
+ }
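+ // Taken together, the nodes emitted below assemble the textual PTX call,
+ // roughly:
+ //   call.uni (retval0), <callee>, (param0, ..., paramN);
+ // with the prototype label appended instead for indirect calls. This is
+ // only a sketch; the exact text is produced by the asm printer.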
+ // Op to just print "call"
+ SDVTList PrintCallVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue PrintCallOps[] = {
+ Chain, DAG.getConstant((Ins.size() == 0) ? 0 : 1, MVT::i32), InFlag
+ };
+ Chain = DAG.getNode(Func ? (NVPTXISD::PrintCallUni) : (NVPTXISD::PrintCall),
+ dl, PrintCallVTs, PrintCallOps);
+ InFlag = Chain.getValue(1);
+
+ // Ops to print out the function name
+ SDVTList CallVoidVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue CallVoidOps[] = { Chain, Callee, InFlag };
+ Chain = DAG.getNode(NVPTXISD::CallVoid, dl, CallVoidVTs, CallVoidOps);
+ InFlag = Chain.getValue(1);
+
+ // Ops to print out the param list
+ SDVTList CallArgBeginVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue CallArgBeginOps[] = { Chain, InFlag };
+ Chain = DAG.getNode(NVPTXISD::CallArgBegin, dl, CallArgBeginVTs,
+ CallArgBeginOps);
+ InFlag = Chain.getValue(1);
+
+ for (unsigned i = 0, e = paramCount; i != e; ++i) {
+ unsigned opcode;
+ if (i == (e - 1))
+ opcode = NVPTXISD::LastCallArg;
+ else
+ opcode = NVPTXISD::CallArg;
+ SDVTList CallArgVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue CallArgOps[] = { Chain, DAG.getConstant(1, MVT::i32),
+ DAG.getConstant(i, MVT::i32), InFlag };
+ Chain = DAG.getNode(opcode, dl, CallArgVTs, CallArgOps);
+ InFlag = Chain.getValue(1);
+ }
+ SDVTList CallArgEndVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue CallArgEndOps[] = { Chain, DAG.getConstant(Func ? 1 : 0, MVT::i32),
+ InFlag };
+ Chain = DAG.getNode(NVPTXISD::CallArgEnd, dl, CallArgEndVTs, CallArgEndOps);
+ InFlag = Chain.getValue(1);
+
+ if (!Func) {
+ SDVTList PrototypeVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue PrototypeOps[] = { Chain, DAG.getConstant(uniqueCallSite, MVT::i32),
+ InFlag };
+ Chain = DAG.getNode(NVPTXISD::Prototype, dl, PrototypeVTs, PrototypeOps);
+ InFlag = Chain.getValue(1);
+ }
+
+ // Generate loads from param memory/moves from registers for result
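+ // Each LoadParam/LoadParamV2/LoadParamV4 node built here is expected to
+ // print as an ld.param (scalar, .v2 or .v4) from the retval0 buffer at the
+ // given byte offset; the exact form is decided by the asm printer.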
+ if (Ins.size() > 0) {
+ if (retTy && retTy->isVectorTy()) {
+ EVT ObjectVT = getValueType(retTy);
+ unsigned NumElts = ObjectVT.getVectorNumElements();
+ EVT EltVT = ObjectVT.getVectorElementType();
+ assert(nvTM->getTargetLowering()->getNumRegisters(F->getContext(),
+ ObjectVT) == NumElts &&
+ "Vector was not scalarized");
+ unsigned sz = EltVT.getSizeInBits();
+ bool needTruncate = sz < 8;
+
+ if (NumElts == 1) {
+ // Just a simple load
+ SmallVector<EVT, 4> LoadRetVTs;
+ if (EltVT == MVT::i1 || EltVT == MVT::i8) {
+ // If loading i1/i8 result, generate
+ // load.b8 i16
+ // if i1
+ // trunc i16 to i1
+ LoadRetVTs.push_back(MVT::i16);
+ } else
+ LoadRetVTs.push_back(EltVT);
+ LoadRetVTs.push_back(MVT::Other);
+ LoadRetVTs.push_back(MVT::Glue);
+ SmallVector<SDValue, 4> LoadRetOps;
+ LoadRetOps.push_back(Chain);
+ LoadRetOps.push_back(DAG.getConstant(1, MVT::i32));
+ LoadRetOps.push_back(DAG.getConstant(0, MVT::i32));
+ LoadRetOps.push_back(InFlag);
+ SDValue retval = DAG.getMemIntrinsicNode(
+ NVPTXISD::LoadParam, dl,
+ DAG.getVTList(LoadRetVTs), LoadRetOps, EltVT, MachinePointerInfo());
+ Chain = retval.getValue(1);
+ InFlag = retval.getValue(2);
+ SDValue Ret0 = retval;
+ if (needTruncate)
+ Ret0 = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Ret0);
+ InVals.push_back(Ret0);
+ } else if (NumElts == 2) {
+ // LoadV2
+ SmallVector<EVT, 4> LoadRetVTs;
+ if (EltVT == MVT::i1 || EltVT == MVT::i8) {
+ // If loading i1/i8 result, generate
+ // load.b8 i16
+ // if i1
+ // trunc i16 to i1
+ LoadRetVTs.push_back(MVT::i16);
+ LoadRetVTs.push_back(MVT::i16);
+ } else {
+ LoadRetVTs.push_back(EltVT);
+ LoadRetVTs.push_back(EltVT);
+ }
+ LoadRetVTs.push_back(MVT::Other);
+ LoadRetVTs.push_back(MVT::Glue);
+ SmallVector<SDValue, 4> LoadRetOps;
+ LoadRetOps.push_back(Chain);
+ LoadRetOps.push_back(DAG.getConstant(1, MVT::i32));
+ LoadRetOps.push_back(DAG.getConstant(0, MVT::i32));
+ LoadRetOps.push_back(InFlag);
+ SDValue retval = DAG.getMemIntrinsicNode(
+ NVPTXISD::LoadParamV2, dl,
+ DAG.getVTList(LoadRetVTs), LoadRetOps, EltVT, MachinePointerInfo());
+ Chain = retval.getValue(2);
+ InFlag = retval.getValue(3);
+ SDValue Ret0 = retval.getValue(0);
+ SDValue Ret1 = retval.getValue(1);
+ if (needTruncate) {
+ Ret0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Ret0);
+ InVals.push_back(Ret0);
+ Ret1 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Ret1);
+ InVals.push_back(Ret1);
+ } else {
+ InVals.push_back(Ret0);
+ InVals.push_back(Ret1);
+ }
+ } else {
+ // Split into N LoadV4
+ unsigned Ofst = 0;
+ unsigned VecSize = 4;
+ unsigned Opc = NVPTXISD::LoadParamV4;
+ if (EltVT.getSizeInBits() == 64) {
+ VecSize = 2;
+ Opc = NVPTXISD::LoadParamV2;
+ }
+ EVT VecVT = EVT::getVectorVT(F->getContext(), EltVT, VecSize);
+ for (unsigned i = 0; i < NumElts; i += VecSize) {
+ SmallVector<EVT, 8> LoadRetVTs;
+ if (EltVT == MVT::i1 || EltVT == MVT::i8) {
+ // If loading i1/i8 result, generate
+ // load.b8 i16
+ // if i1
+ // trunc i16 to i1
+ for (unsigned j = 0; j < VecSize; ++j)
+ LoadRetVTs.push_back(MVT::i16);
+ } else {
+ for (unsigned j = 0; j < VecSize; ++j)
+ LoadRetVTs.push_back(EltVT);
+ }
+ LoadRetVTs.push_back(MVT::Other);
+ LoadRetVTs.push_back(MVT::Glue);
+ SmallVector<SDValue, 4> LoadRetOps;
+ LoadRetOps.push_back(Chain);
+ LoadRetOps.push_back(DAG.getConstant(1, MVT::i32));
+ LoadRetOps.push_back(DAG.getConstant(Ofst, MVT::i32));
+ LoadRetOps.push_back(InFlag);
+ SDValue retval = DAG.getMemIntrinsicNode(
+ Opc, dl, DAG.getVTList(LoadRetVTs),
+ LoadRetOps, EltVT, MachinePointerInfo());
+ if (VecSize == 2) {
+ Chain = retval.getValue(2);
+ InFlag = retval.getValue(3);
+ } else {
+ Chain = retval.getValue(4);
+ InFlag = retval.getValue(5);
+ }
+
+ for (unsigned j = 0; j < VecSize; ++j) {
+ if (i + j >= NumElts)
+ break;
+ SDValue Elt = retval.getValue(j);
+ if (needTruncate)
+ Elt = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
+ InVals.push_back(Elt);
+ }
+ Ofst += TD->getTypeAllocSize(VecVT.getTypeForEVT(F->getContext()));
+ }
+ }
+ } else {
+ SmallVector<EVT, 16> VTs;
+ SmallVector<uint64_t, 16> Offsets;
+ ComputePTXValueVTs(*this, retTy, VTs, &Offsets, 0);
+ assert(VTs.size() == Ins.size() && "Bad value decomposition");
+ unsigned RetAlign = getArgumentAlignment(Callee, CS, retTy, 0);
+ for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
+ unsigned sz = VTs[i].getSizeInBits();
+ unsigned AlignI = GreatestCommonDivisor64(RetAlign, Offsets[i]);
+ bool needTruncate = sz < 8;
+ if (VTs[i].isInteger() && (sz < 8))
+ sz = 8;
+
+ SmallVector<EVT, 4> LoadRetVTs;
+ EVT TheLoadType = VTs[i];
+ if (retTy->isIntegerTy() &&
+ TD->getTypeAllocSizeInBits(retTy) < 32) {
+ // This is for integer types only, and specifically not for
+ // aggregates.
+ LoadRetVTs.push_back(MVT::i32);
+ TheLoadType = MVT::i32;
+ } else if (sz < 16) {
+ // If loading i1/i8 result, generate
+ // load i8 (-> i16)
+ // trunc i16 to i1/i8
+ LoadRetVTs.push_back(MVT::i16);
+ } else
+ LoadRetVTs.push_back(Ins[i].VT);
+ LoadRetVTs.push_back(MVT::Other);
+ LoadRetVTs.push_back(MVT::Glue);
+
+ SmallVector<SDValue, 4> LoadRetOps;
+ LoadRetOps.push_back(Chain);
+ LoadRetOps.push_back(DAG.getConstant(1, MVT::i32));
+ LoadRetOps.push_back(DAG.getConstant(Offsets[i], MVT::i32));
+ LoadRetOps.push_back(InFlag);
+ SDValue retval = DAG.getMemIntrinsicNode(
+ NVPTXISD::LoadParam, dl,
+ DAG.getVTList(LoadRetVTs), LoadRetOps,
+ TheLoadType, MachinePointerInfo(), AlignI);
+ Chain = retval.getValue(1);
+ InFlag = retval.getValue(2);
+ SDValue Ret0 = retval.getValue(0);
+ if (needTruncate)
+ Ret0 = DAG.getNode(ISD::TRUNCATE, dl, Ins[i].VT, Ret0);
+ InVals.push_back(Ret0);
+ }
+ }
+ }
+
+ Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(uniqueCallSite, true),
+ DAG.getIntPtrConstant(uniqueCallSite + 1, true),
+ InFlag, dl);
+ uniqueCallSite++;
+
+ // set isTailCall to false for now, until we figure out how to express
+ // tail call optimization in PTX
+ isTailCall = false;
+ return Chain;
+}
+
+// By default CONCAT_VECTORS is lowered by ExpandVectorBuildThroughStack()
+// (see LegalizeDAG.cpp). This is slow and uses local memory.
+// We use extract/insert/build vector instead, just as LegalizeOp() did in
+// LLVM 2.5.
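+// For example, v4f32 = concat_vectors(v2f32 %a, v2f32 %b) becomes roughly
+//   BUILD_VECTOR(extract(%a,0), extract(%a,1), extract(%b,0), extract(%b,1))
+// so no stack traffic is generated.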
+SDValue
+NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
+ SDNode *Node = Op.getNode();
+ SDLoc dl(Node);
+ SmallVector<SDValue, 8> Ops;
+ unsigned NumOperands = Node->getNumOperands();
+ for (unsigned i = 0; i < NumOperands; ++i) {
+ SDValue SubOp = Node->getOperand(i);
+ EVT VVT = SubOp.getNode()->getValueType(0);
+ EVT EltVT = VVT.getVectorElementType();
+ unsigned NumSubElem = VVT.getVectorNumElements();
+ for (unsigned j = 0; j < NumSubElem; ++j) {
+ Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, SubOp,
+ DAG.getIntPtrConstant(j)));
+ }
+ }
+ return DAG.getNode(ISD::BUILD_VECTOR, dl, Node->getValueType(0), Ops);
+}
+
+/// LowerShiftRightParts - Lower SRL_PARTS and SRA_PARTS, which
+/// 1) return two i32 values and take a 2 x i32 value to shift plus a shift
+/// amount, or
+/// 2) return two i64 values and take a 2 x i64 value to shift plus a shift
+/// amount.
+SDValue NVPTXTargetLowering::LowerShiftRightParts(SDValue Op,
+ SelectionDAG &DAG) const {
+ assert(Op.getNumOperands() == 3 && "Not a double-shift!");
+ assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
+
+ EVT VT = Op.getValueType();
+ unsigned VTBits = VT.getSizeInBits();
+ SDLoc dl(Op);
+ SDValue ShOpLo = Op.getOperand(0);
+ SDValue ShOpHi = Op.getOperand(1);
+ SDValue ShAmt = Op.getOperand(2);
+ unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
+
+ if (VTBits == 32 && nvptxSubtarget.getSmVersion() >= 35) {
+
+ // For 32bit and sm35, we can use the funnel shift 'shf' instruction.
+ // {dHi, dLo} = {aHi, aLo} >> Amt
+ // dHi = aHi >> Amt
+ // dLo = shf.r.clamp aLo, aHi, Amt
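+ // A rough sketch of the PTX this is intended to map to (register names
+ // here are illustrative only, not produced by this code):
+ //   shr.u32         %hi, %aHi, %amt;   // shr.s32 for SRA_PARTS
+ //   shf.r.clamp.b32 %lo, %aLo, %aHi, %amt;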
+
+ SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
+ SDValue Lo = DAG.getNode(NVPTXISD::FUN_SHFR_CLAMP, dl, VT, ShOpLo, ShOpHi,
+ ShAmt);
+
+ SDValue Ops[2] = { Lo, Hi };
+ return DAG.getMergeValues(Ops, dl);
+ }
+ else {
+
+ // {dHi, dLo} = {aHi, aLo} >> Amt
+ // - if (Amt>=size) then
+ // dLo = aHi >> (Amt-size)
+ // dHi = aHi >> Amt (this is either all 0 or all 1)
+ // else
+ // dLo = (aLo >>logic Amt) | (aHi << (size-Amt))
+ // dHi = aHi >> Amt
+
+ SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
+ DAG.getConstant(VTBits, MVT::i32), ShAmt);
+ SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
+ SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
+ DAG.getConstant(VTBits, MVT::i32));
+ SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
+ SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
+ SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
+
+ SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
+ DAG.getConstant(VTBits, MVT::i32), ISD::SETGE);
+ SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
+ SDValue Lo = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
+
+ SDValue Ops[2] = { Lo, Hi };
+ return DAG.getMergeValues(Ops, dl);
+ }
+}
+
+/// LowerShiftLeftParts - Lower SHL_PARTS, which
+/// 1) returns two i32 values and takes a 2 x i32 value to shift plus a shift
+/// amount, or
+/// 2) returns two i64 values and takes a 2 x i64 value to shift plus a shift
+/// amount.
+SDValue NVPTXTargetLowering::LowerShiftLeftParts(SDValue Op,
+ SelectionDAG &DAG) const {
+ assert(Op.getNumOperands() == 3 && "Not a double-shift!");
+ assert(Op.getOpcode() == ISD::SHL_PARTS);
+
+ EVT VT = Op.getValueType();
+ unsigned VTBits = VT.getSizeInBits();
+ SDLoc dl(Op);
+ SDValue ShOpLo = Op.getOperand(0);
+ SDValue ShOpHi = Op.getOperand(1);
+ SDValue ShAmt = Op.getOperand(2);
+
+ if (VTBits == 32 && nvptxSubtarget.getSmVersion() >= 35) {
+
+ // For 32bit and sm35, we can use the funnel shift 'shf' instruction.
+ // {dHi, dLo} = {aHi, aLo} << Amt
+ // dHi = shf.l.clamp aLo, aHi, Amt
+ // dLo = aLo << Amt
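+ // A rough sketch of the corresponding PTX (register names illustrative):
+ //   shf.l.clamp.b32 %hi, %aLo, %aHi, %amt;
+ //   shl.b32         %lo, %aLo, %amt;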
+
+ SDValue Hi = DAG.getNode(NVPTXISD::FUN_SHFL_CLAMP, dl, VT, ShOpLo, ShOpHi,
+ ShAmt);
+ SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
+
+ SDValue Ops[2] = { Lo, Hi };
+ return DAG.getMergeValues(Ops, dl);
+ }
+ else {
+
+ // {dHi, dLo} = {aHi, aLo} << Amt
+ // - if (Amt>=size) then
+ // dLo = aLo << Amt (all 0)
+ // dHi = aLo << (Amt-size)
+ // else
+ // dLo = aLo << Amt
+ // dHi = (aHi << Amt) | (aLo >> (size-Amt))
+
+ SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
+ DAG.getConstant(VTBits, MVT::i32), ShAmt);
+ SDValue Tmp1 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
+ SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
+ DAG.getConstant(VTBits, MVT::i32));
+ SDValue Tmp2 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
+ SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
+ SDValue TrueVal = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
+
+ SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
+ DAG.getConstant(VTBits, MVT::i32), ISD::SETGE);
+ SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
+ SDValue Hi = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
+
+ SDValue Ops[2] = { Lo, Hi };
+ return DAG.getMergeValues(Ops, dl);
+ }
+}
+
+SDValue
+NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
+ switch (Op.getOpcode()) {
+ case ISD::RETURNADDR:
+ return SDValue();
+ case ISD::FRAMEADDR:
+ return SDValue();
+ case ISD::GlobalAddress:
+ return LowerGlobalAddress(Op, DAG);
+ case ISD::INTRINSIC_W_CHAIN:
+ return Op;
+ case ISD::BUILD_VECTOR:
+ case ISD::EXTRACT_SUBVECTOR:
+ return Op;
+ case ISD::CONCAT_VECTORS:
+ return LowerCONCAT_VECTORS(Op, DAG);
+ case ISD::STORE:
+ return LowerSTORE(Op, DAG);
+ case ISD::LOAD:
+ return LowerLOAD(Op, DAG);
+ case ISD::SHL_PARTS:
+ return LowerShiftLeftParts(Op, DAG);
+ case ISD::SRA_PARTS:
+ case ISD::SRL_PARTS:
+ return LowerShiftRightParts(Op, DAG);
+ default:
+ llvm_unreachable("Custom lowering not defined for operation");
+ }
+}
+
+SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
+ if (Op.getValueType() == MVT::i1)
+ return LowerLOADi1(Op, DAG);
+ else
+ return SDValue();
+}
+
+// v = ld i1* addr
+// =>
+// v1 = ld i8* addr (-> i16)
+// v = trunc i16 to i1
+SDValue NVPTXTargetLowering::LowerLOADi1(SDValue Op, SelectionDAG &DAG) const {
+ SDNode *Node = Op.getNode();
+ LoadSDNode *LD = cast<LoadSDNode>(Node);
+ SDLoc dl(Node);
+ assert(LD->getExtensionType() == ISD::NON_EXTLOAD);
+ assert(Node->getValueType(0) == MVT::i1 &&
+ "Custom lowering for i1 load only");
+ SDValue newLD =
+ DAG.getLoad(MVT::i16, dl, LD->getChain(), LD->getBasePtr(),
+ LD->getPointerInfo(), LD->isVolatile(), LD->isNonTemporal(),
+ LD->isInvariant(), LD->getAlignment());
+ SDValue result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, newLD);
+ // The legalizer (the caller) is expecting two values from the legalized
+ // load, so we build a MergeValues node for it. See ExpandUnalignedLoad()
+ // in LegalizeDAG.cpp which also uses MergeValues.
+ SDValue Ops[] = { result, LD->getChain() };
+ return DAG.getMergeValues(Ops, dl);
+}
+
+SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
+ EVT ValVT = Op.getOperand(1).getValueType();
+ if (ValVT == MVT::i1)
+ return LowerSTOREi1(Op, DAG);
+ else if (ValVT.isVector())
+ return LowerSTOREVector(Op, DAG);
+ else
+ return SDValue();
+}
+
+SDValue
+NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
+ SDNode *N = Op.getNode();
+ SDValue Val = N->getOperand(1);
+ SDLoc DL(N);
+ EVT ValVT = Val.getValueType();
+
+ if (ValVT.isVector()) {
+ // We only handle "native" vector sizes for now, e.g. <4 x double> is not
+ // legal. We can (and should) split that into 2 stores of <2 x double> here
+ // but I'm leaving that as a TODO for now.
+ if (!ValVT.isSimple())
+ return SDValue();
+ switch (ValVT.getSimpleVT().SimpleTy) {
+ default:
+ return SDValue();
+ case MVT::v2i8:
+ case MVT::v2i16:
+ case MVT::v2i32:
+ case MVT::v2i64:
+ case MVT::v2f32:
+ case MVT::v2f64:
+ case MVT::v4i8:
+ case MVT::v4i16:
+ case MVT::v4i32:
+ case MVT::v4f32:
+ // This is a "native" vector type
+ break;
+ }
+
+ MemSDNode *MemSD = cast<MemSDNode>(N);
+ const DataLayout *TD = getDataLayout();
+
+ unsigned Align = MemSD->getAlignment();
+ unsigned PrefAlign =
+ TD->getPrefTypeAlignment(ValVT.getTypeForEVT(*DAG.getContext()));
+ if (Align < PrefAlign) {
+ // This store is not sufficiently aligned, so bail out and let this vector
+ // store be scalarized. Note that we may still be able to emit smaller
+ // vector stores. For example, if we are storing a <4 x float> with an
+ // alignment of 8, this check will fail but the legalizer will try again
+ // with 2 x <2 x float>, which will succeed with an alignment of 8.
+ return SDValue();
+ }
+
+ unsigned Opcode = 0;
+ EVT EltVT = ValVT.getVectorElementType();
+ unsigned NumElts = ValVT.getVectorNumElements();
+
+ // Since StoreV2 is a target node, we cannot rely on DAG type legalization.
+ // Therefore, we must ensure the type is legal. For i1 and i8, we set the
+ // stored type to i16 and propagate the "real" type as the memory type.
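+ // For example, a <4 x i8> store is emitted as a StoreV4 of four i16 values
+ // while v4i8 is kept as the memory VT, so the printed stores can remain
+ // byte-sized.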
+ bool NeedExt = false;
+ if (EltVT.getSizeInBits() < 16)
+ NeedExt = true;
+
+ switch (NumElts) {
+ default:
+ return SDValue();
+ case 2:
+ Opcode = NVPTXISD::StoreV2;
+ break;
+ case 4: {
+ Opcode = NVPTXISD::StoreV4;
+ break;
+ }
+ }
+
+ SmallVector<SDValue, 8> Ops;
+
+ // First is the chain
+ Ops.push_back(N->getOperand(0));
+
+ // Then the split values
+ for (unsigned i = 0; i < NumElts; ++i) {
+ SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
+ DAG.getIntPtrConstant(i));
+ if (NeedExt)
+ ExtVal = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, ExtVal);
+ Ops.push_back(ExtVal);
+ }
+
+ // Then any remaining arguments
+ for (unsigned i = 2, e = N->getNumOperands(); i != e; ++i) {
+ Ops.push_back(N->getOperand(i));
+ }
+
+ SDValue NewSt = DAG.getMemIntrinsicNode(
+ Opcode, DL, DAG.getVTList(MVT::Other), Ops,
+ MemSD->getMemoryVT(), MemSD->getMemOperand());
+
+ //return DCI.CombineTo(N, NewSt, true);
+ return NewSt;
+ }
+
+ return SDValue();
+}
+
+// st i1 v, addr
+// =>
+// v1 = zxt v to i16
+// st.u8 i16, addr
+SDValue NVPTXTargetLowering::LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const {
+ SDNode *Node = Op.getNode();
+ SDLoc dl(Node);
+ StoreSDNode *ST = cast<StoreSDNode>(Node);
+ SDValue Tmp1 = ST->getChain();
+ SDValue Tmp2 = ST->getBasePtr();
+ SDValue Tmp3 = ST->getValue();
+ assert(Tmp3.getValueType() == MVT::i1 && "Custom lowering for i1 store only");
+ unsigned Alignment = ST->getAlignment();
+ bool isVolatile = ST->isVolatile();
+ bool isNonTemporal = ST->isNonTemporal();
+ Tmp3 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Tmp3);
+ SDValue Result = DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2,
+ ST->getPointerInfo(), MVT::i8, isNonTemporal,
+ isVolatile, Alignment);
+ return Result;
+}
+
+SDValue NVPTXTargetLowering::getExtSymb(SelectionDAG &DAG, const char *inname,
+ int idx, EVT v) const {
+ std::string *name = nvTM->getManagedStrPool()->getManagedString(inname);
+ std::stringstream suffix;
+ suffix << idx;
+ *name += suffix.str();
+ return DAG.getTargetExternalSymbol(name->c_str(), v);
+}
+
+SDValue
+NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int idx, EVT v) const {
+ std::string ParamSym;
+ raw_string_ostream ParamStr(ParamSym);
+
+ ParamStr << DAG.getMachineFunction().getName() << "_param_" << idx;
+ ParamStr.flush();
+
+ std::string *SavedStr =
+ nvTM->getManagedStrPool()->getManagedString(ParamSym.c_str());
+ return DAG.getTargetExternalSymbol(SavedStr->c_str(), v);
+}
+
+SDValue NVPTXTargetLowering::getParamHelpSymbol(SelectionDAG &DAG, int idx) {
+ return getExtSymb(DAG, ".HLPPARAM", idx);
+}
+
+// Check to see if the kernel argument is image*_t or sampler_t
+
+bool llvm::isImageOrSamplerVal(const Value *arg, const Module *context) {
+ static const char *const specialTypes[] = { "struct._image2d_t",
+ "struct._image3d_t",
+ "struct._sampler_t" };
+
+ const Type *Ty = arg->getType();
+ const PointerType *PTy = dyn_cast<PointerType>(Ty);
+
+ if (!PTy)
+ return false;
+
+ if (!context)
+ return false;
+
+ const StructType *STy = dyn_cast<StructType>(PTy->getElementType());
+ const std::string TypeName = STy && !STy->isLiteral() ? STy->getName() : "";
+
+ for (int i = 0, e = array_lengthof(specialTypes); i != e; ++i)
+ if (TypeName == specialTypes[i])
+ return true;
+
+ return false;
+}
+
+SDValue NVPTXTargetLowering::LowerFormalArguments(
+ SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ const DataLayout *TD = getDataLayout();
+
+ const Function *F = MF.getFunction();
+ const AttributeSet &PAL = F->getAttributes();
+ const TargetLowering *TLI = DAG.getTarget().getTargetLowering();
+
+ SDValue Root = DAG.getRoot();
+ std::vector<SDValue> OutChains;
+
+ bool isKernel = llvm::isKernelFunction(*F);
+ bool isABI = (nvptxSubtarget.getSmVersion() >= 20);
+ assert(isABI && "Non-ABI compilation is not supported");
+ if (!isABI)
+ return Chain;
+
+ std::vector<Type *> argTypes;
+ std::vector<const Argument *> theArgs;
+ for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
+ I != E; ++I) {
+ theArgs.push_back(I);
+ argTypes.push_back(I->getType());
+ }
+ // argTypes.size() (or theArgs.size()) and Ins.size() need not match.
+ // Ins.size() will be larger
+ // * if there is an aggregate argument with multiple fields (each field
+ // showing up separately in Ins)
+ // * if there is a vector argument with more than the typical number of
+ // vector elements (generally more than 4), where each vector element is
+ // individually present in Ins.
+ // So a different index should be used for indexing into Ins.
+ // See similar issue in LowerCall.
+ unsigned InsIdx = 0;
+
+ int idx = 0;
+ for (unsigned i = 0, e = theArgs.size(); i != e; ++i, ++idx, ++InsIdx) {
+ Type *Ty = argTypes[i];
+
+ // If the kernel argument is image*_t or sampler_t, convert it to
+ // an i32 constant holding the parameter position. This can later be
+ // matched in the AsmPrinter to output the correct mangled name.
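+ // For example, if the second kernel parameter is an image handle, InVals
+ // simply receives the constant i32 2 (its 1-based position) here.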
+ if (isImageOrSamplerVal(
+ theArgs[i],
+ (theArgs[i]->getParent() ? theArgs[i]->getParent()->getParent()
+ : nullptr))) {
+ assert(isKernel && "Only kernels can have image/sampler params");
+ InVals.push_back(DAG.getConstant(i + 1, MVT::i32));
+ continue;
+ }
+
+ if (theArgs[i]->use_empty()) {
+ // argument is dead
+ if (Ty->isAggregateType()) {
+ SmallVector<EVT, 16> vtparts;
+
+ ComputePTXValueVTs(*this, Ty, vtparts);
+ assert(vtparts.size() > 0 && "empty aggregate type not expected");
+ for (unsigned parti = 0, parte = vtparts.size(); parti != parte;
+ ++parti) {
+ InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
+ ++InsIdx;
+ }
+ if (vtparts.size() > 0)
+ --InsIdx;
+ continue;
+ }
+ if (Ty->isVectorTy()) {
+ EVT ObjectVT = getValueType(Ty);
+ unsigned NumRegs = TLI->getNumRegisters(F->getContext(), ObjectVT);
+ for (unsigned parti = 0; parti < NumRegs; ++parti) {
+ InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
+ ++InsIdx;
+ }
+ if (NumRegs > 0)
+ --InsIdx;
+ continue;
+ }
+ InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
+ continue;
+ }
+
+ // In the following cases, assign a node order of "idx+1"
+ // to newly created nodes. The SDNodes for params have to
+ // appear in the same order as their order of appearance
+ // in the original function. "idx+1" holds that order.
+ if (!PAL.hasAttribute(i + 1, Attribute::ByVal)) {
+ if (Ty->isAggregateType()) {
+ SmallVector<EVT, 16> vtparts;
+ SmallVector<uint64_t, 16> offsets;
+
+ // NOTE: Here, we lose the ability to issue vector loads for vectors
+ // that are a part of a struct. This should be investigated in the
+ // future.
+ ComputePTXValueVTs(*this, Ty, vtparts, &offsets, 0);
+ assert(vtparts.size() > 0 && "empty aggregate type not expected");
+ bool aggregateIsPacked = false;
+ if (StructType *STy = llvm::dyn_cast<StructType>(Ty))
+ aggregateIsPacked = STy->isPacked();
+
+ SDValue Arg = getParamSymbol(DAG, idx, getPointerTy());
+ for (unsigned parti = 0, parte = vtparts.size(); parti != parte;
+ ++parti) {
+ EVT partVT = vtparts[parti];
+ Value *srcValue = Constant::getNullValue(
+ PointerType::get(partVT.getTypeForEVT(F->getContext()),
+ llvm::ADDRESS_SPACE_PARAM));
+ SDValue srcAddr =
+ DAG.getNode(ISD::ADD, dl, getPointerTy(), Arg,
+ DAG.getConstant(offsets[parti], getPointerTy()));
+ unsigned partAlign =
+ aggregateIsPacked ? 1
+ : TD->getABITypeAlignment(
+ partVT.getTypeForEVT(F->getContext()));
+ SDValue p;
+ if (Ins[InsIdx].VT.getSizeInBits() > partVT.getSizeInBits()) {
+ ISD::LoadExtType ExtOp = Ins[InsIdx].Flags.isSExt() ?
+ ISD::SEXTLOAD : ISD::ZEXTLOAD;
+ p = DAG.getExtLoad(ExtOp, dl, Ins[InsIdx].VT, Root, srcAddr,
+ MachinePointerInfo(srcValue), partVT, false,
+ false, partAlign);
+ } else {
+ p = DAG.getLoad(partVT, dl, Root, srcAddr,
+ MachinePointerInfo(srcValue), false, false, false,
+ partAlign);
+ }
+ if (p.getNode())
+ p.getNode()->setIROrder(idx + 1);
+ InVals.push_back(p);
+ ++InsIdx;
+ }
+ if (vtparts.size() > 0)
+ --InsIdx;
+ continue;
+ }
+ if (Ty->isVectorTy()) {
+ EVT ObjectVT = getValueType(Ty);
+ SDValue Arg = getParamSymbol(DAG, idx, getPointerTy());
+ unsigned NumElts = ObjectVT.getVectorNumElements();
+ assert(TLI->getNumRegisters(F->getContext(), ObjectVT) == NumElts &&
+ "Vector was not scalarized");
+ unsigned Ofst = 0;
+ EVT EltVT = ObjectVT.getVectorElementType();
+
+ // V1 load
+ // f32 = load ...
+ if (NumElts == 1) {
+ // We only have one element, so just directly load it
+ Value *SrcValue = Constant::getNullValue(PointerType::get(
+ EltVT.getTypeForEVT(F->getContext()), llvm::ADDRESS_SPACE_PARAM));
+ SDValue SrcAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), Arg,
+ DAG.getConstant(Ofst, getPointerTy()));
+ SDValue P = DAG.getLoad(
+ EltVT, dl, Root, SrcAddr, MachinePointerInfo(SrcValue), false,
+ false, true,
+ TD->getABITypeAlignment(EltVT.getTypeForEVT(F->getContext())));
+ if (P.getNode())
+ P.getNode()->setIROrder(idx + 1);
+
+ if (Ins[InsIdx].VT.getSizeInBits() > EltVT.getSizeInBits())
+ P = DAG.getNode(ISD::ANY_EXTEND, dl, Ins[InsIdx].VT, P);
+ InVals.push_back(P);
+ Ofst += TD->getTypeAllocSize(EltVT.getTypeForEVT(F->getContext()));
+ ++InsIdx;
+ } else if (NumElts == 2) {
+ // V2 load
+ // f32,f32 = load ...
+ EVT VecVT = EVT::getVectorVT(F->getContext(), EltVT, 2);
+ Value *SrcValue = Constant::getNullValue(PointerType::get(
+ VecVT.getTypeForEVT(F->getContext()), llvm::ADDRESS_SPACE_PARAM));
+ SDValue SrcAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), Arg,
+ DAG.getConstant(Ofst, getPointerTy()));
+ SDValue P = DAG.getLoad(
+ VecVT, dl, Root, SrcAddr, MachinePointerInfo(SrcValue), false,
+ false, true,
+ TD->getABITypeAlignment(VecVT.getTypeForEVT(F->getContext())));
+ if (P.getNode())
+ P.getNode()->setIROrder(idx + 1);
+
+ SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, P,
+ DAG.getIntPtrConstant(0));
+ SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, P,
+ DAG.getIntPtrConstant(1));
+
+ if (Ins[InsIdx].VT.getSizeInBits() > EltVT.getSizeInBits()) {
+ Elt0 = DAG.getNode(ISD::ANY_EXTEND, dl, Ins[InsIdx].VT, Elt0);
+ Elt1 = DAG.getNode(ISD::ANY_EXTEND, dl, Ins[InsIdx].VT, Elt1);
+ }
+
+ InVals.push_back(Elt0);
+ InVals.push_back(Elt1);
+ Ofst += TD->getTypeAllocSize(VecVT.getTypeForEVT(F->getContext()));
+ InsIdx += 2;
+ } else {
+ // V4 loads
+ // We have at least 4 elements (<3 x Ty> expands to 4 elements) and the
+ // vector will be expanded to a power of 2 elements, so we know we can
+ // always round up to the next multiple of 4 when creating the vector
+ // loads.
+ // e.g. 4 elem => 1 ld.v4
+ // 6 elem => 2 ld.v4
+ // 8 elem => 2 ld.v4
+ // 11 elem => 3 ld.v4
+ unsigned VecSize = 4;
+ if (EltVT.getSizeInBits() == 64) {
+ VecSize = 2;
+ }
+ EVT VecVT = EVT::getVectorVT(F->getContext(), EltVT, VecSize);
+ for (unsigned i = 0; i < NumElts; i += VecSize) {
+ Value *SrcValue = Constant::getNullValue(
+ PointerType::get(VecVT.getTypeForEVT(F->getContext()),
+ llvm::ADDRESS_SPACE_PARAM));
+ SDValue SrcAddr =
+ DAG.getNode(ISD::ADD, dl, getPointerTy(), Arg,
+ DAG.getConstant(Ofst, getPointerTy()));
+ SDValue P = DAG.getLoad(
+ VecVT, dl, Root, SrcAddr, MachinePointerInfo(SrcValue), false,
+ false, true,
+ TD->getABITypeAlignment(VecVT.getTypeForEVT(F->getContext())));
+ if (P.getNode())
+ P.getNode()->setIROrder(idx + 1);
+
+ for (unsigned j = 0; j < VecSize; ++j) {
+ if (i + j >= NumElts)
+ break;
+ SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, P,
+ DAG.getIntPtrConstant(j));
+ if (Ins[InsIdx].VT.getSizeInBits() > EltVT.getSizeInBits())
+ Elt = DAG.getNode(ISD::ANY_EXTEND, dl, Ins[InsIdx].VT, Elt);
+ InVals.push_back(Elt);
+ }
+ Ofst += TD->getTypeAllocSize(VecVT.getTypeForEVT(F->getContext()));
+ }
+ InsIdx += NumElts;
+ }
+
+ if (NumElts > 0)
+ --InsIdx;
+ continue;
+ }
+ // A plain scalar.
+ EVT ObjectVT = getValueType(Ty);
+ // If ABI, load from the param symbol
+ SDValue Arg = getParamSymbol(DAG, idx, getPointerTy());
+ Value *srcValue = Constant::getNullValue(PointerType::get(
+ ObjectVT.getTypeForEVT(F->getContext()), llvm::ADDRESS_SPACE_PARAM));
+ SDValue p;
+ if (ObjectVT.getSizeInBits() < Ins[InsIdx].VT.getSizeInBits()) {
+ ISD::LoadExtType ExtOp = Ins[InsIdx].Flags.isSExt() ?
+ ISD::SEXTLOAD : ISD::ZEXTLOAD;
+ p = DAG.getExtLoad(ExtOp, dl, Ins[InsIdx].VT, Root, Arg,
+ MachinePointerInfo(srcValue), ObjectVT, false, false,
+ TD->getABITypeAlignment(ObjectVT.getTypeForEVT(F->getContext())));
+ } else {
+ p = DAG.getLoad(Ins[InsIdx].VT, dl, Root, Arg,
+ MachinePointerInfo(srcValue), false, false, false,
+ TD->getABITypeAlignment(ObjectVT.getTypeForEVT(F->getContext())));
+ }
+ if (p.getNode())
+ p.getNode()->setIROrder(idx + 1);
+ InVals.push_back(p);
+ continue;
+ }
+
+ // Param has ByVal attribute
+ // Return MoveParam(param symbol).
+ // Ideally, the param symbol could be returned directly,
+ // but when the SDNode builder decides to use it in a CopyToReg(),
+ // the machine instruction fails because a TargetExternalSymbol
+ // (not lowered) is target dependent, and CopyToReg assumes
+ // the source is lowered.
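+ // For example, a byval argument of function foo becomes
+ // MoveParam(foo_param_<idx>) (see getParamSymbol); for non-kernel functions
+ // the value is additionally wrapped in the nvvm_ptr_local_to_gen intrinsic
+ // below, moving the local pointer into the generic address space.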
+ EVT ObjectVT = getValueType(Ty);
+ assert(ObjectVT == Ins[InsIdx].VT &&
+ "Ins type did not match function type");
+ SDValue Arg = getParamSymbol(DAG, idx, getPointerTy());
+ SDValue p = DAG.getNode(NVPTXISD::MoveParam, dl, ObjectVT, Arg);
+ if (p.getNode())
+ p.getNode()->setIROrder(idx + 1);
+ if (isKernel)
+ InVals.push_back(p);
+ else {
+ SDValue p2 = DAG.getNode(
+ ISD::INTRINSIC_WO_CHAIN, dl, ObjectVT,
+ DAG.getConstant(Intrinsic::nvvm_ptr_local_to_gen, MVT::i32), p);
+ InVals.push_back(p2);
+ }
+ }
+
+ // Clang will check explicit varargs and issue an error if any are present.
+ // However, Clang will let code with an implicit vararg declaration like
+ // f() pass. See bug 617733.
+ // We treat this case as if the arg list is empty.
+ // if (F.isVarArg()) {
+ // assert(0 && "VarArg not supported yet!");
+ //}
+
+ if (!OutChains.empty())
+ DAG.setRoot(DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains));
+
+ return Chain;
+}
+
+
+SDValue
+NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
+ bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ SDLoc dl, SelectionDAG &DAG) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ const Function *F = MF.getFunction();
+ Type *RetTy = F->getReturnType();
+ const DataLayout *TD = getDataLayout();
+
+ bool isABI = (nvptxSubtarget.getSmVersion() >= 20);
+ assert(isABI && "Non-ABI compilation is not supported");
+ if (!isABI)
+ return Chain;
+
+ if (VectorType *VTy = dyn_cast<VectorType>(RetTy)) {
+ // If we have a vector type, the OutVals array will be the scalarized
+ // components, and we have to combine them into 1 or more vector stores.
+ unsigned NumElts = VTy->getNumElements();
+ assert(NumElts == Outs.size() && "Bad scalarization of return value");
+
+ // const_cast can be removed in later LLVM versions
+ EVT EltVT = getValueType(RetTy).getVectorElementType();
+ bool NeedExtend = false;
+ if (EltVT.getSizeInBits() < 16)
+ NeedExtend = true;
+
+ // V1 store
+ if (NumElts == 1) {
+ SDValue StoreVal = OutVals[0];
+ // We only have one element, so just directly store it
+ if (NeedExtend)
+ StoreVal = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal);
+ SDValue Ops[] = { Chain, DAG.getConstant(0, MVT::i32), StoreVal };
+ Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreRetval, dl,
+ DAG.getVTList(MVT::Other), Ops,
+ EltVT, MachinePointerInfo());
+
+ } else if (NumElts == 2) {
+ // V2 store
+ SDValue StoreVal0 = OutVals[0];
+ SDValue StoreVal1 = OutVals[1];
+
+ if (NeedExtend) {
+ StoreVal0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal0);
+ StoreVal1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal1);
+ }
+
+ SDValue Ops[] = { Chain, DAG.getConstant(0, MVT::i32), StoreVal0,
+ StoreVal1 };
+ Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreRetvalV2, dl,
+ DAG.getVTList(MVT::Other), Ops,
+ EltVT, MachinePointerInfo());
+ } else {
+ // V4 stores
+ // We have at least 4 elements (<3 x Ty> expands to 4 elements) and the
+ // vector will be expanded to a power of 2 elements, so we know we can
+ // always round up to the next multiple of 4 when creating the vector
+ // stores.
+ // e.g. 4 elem => 1 st.v4
+ // 6 elem => 2 st.v4
+ // 8 elem => 2 st.v4
+ // 11 elem => 3 st.v4
+
+ unsigned VecSize = 4;
+ if (OutVals[0].getValueType().getSizeInBits() == 64)
+ VecSize = 2;
+
+ unsigned Offset = 0;
+
+ EVT VecVT =
+ EVT::getVectorVT(F->getContext(), EltVT, VecSize);
+ unsigned PerStoreOffset =
+ TD->getTypeAllocSize(VecVT.getTypeForEVT(F->getContext()));
+
+ for (unsigned i = 0; i < NumElts; i += VecSize) {
+ // Get values
+ SDValue StoreVal;
+ SmallVector<SDValue, 8> Ops;
+ Ops.push_back(Chain);
+ Ops.push_back(DAG.getConstant(Offset, MVT::i32));
+ unsigned Opc = NVPTXISD::StoreRetvalV2;
+ EVT ExtendedVT = (NeedExtend) ? MVT::i16 : OutVals[0].getValueType();
+
+ StoreVal = OutVals[i];
+ if (NeedExtend)
+ StoreVal = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtendedVT, StoreVal);
+ Ops.push_back(StoreVal);
+
+ if (i + 1 < NumElts) {
+ StoreVal = OutVals[i + 1];
+ if (NeedExtend)
+ StoreVal = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtendedVT, StoreVal);
+ } else {
+ StoreVal = DAG.getUNDEF(ExtendedVT);
+ }
+ Ops.push_back(StoreVal);
+
+ if (VecSize == 4) {
+ Opc = NVPTXISD::StoreRetvalV4;
+ if (i + 2 < NumElts) {
+ StoreVal = OutVals[i + 2];
+ if (NeedExtend)
+ StoreVal =
+ DAG.getNode(ISD::ZERO_EXTEND, dl, ExtendedVT, StoreVal);
+ } else {
+ StoreVal = DAG.getUNDEF(ExtendedVT);
+ }
+ Ops.push_back(StoreVal);
+
+ if (i + 3 < NumElts) {
+ StoreVal = OutVals[i + 3];
+ if (NeedExtend)
+ StoreVal =
+ DAG.getNode(ISD::ZERO_EXTEND, dl, ExtendedVT, StoreVal);
+ } else {
+ StoreVal = DAG.getUNDEF(ExtendedVT);
+ }
+ Ops.push_back(StoreVal);
+ }
+
+ // Chain = DAG.getNode(Opc, dl, MVT::Other, &Ops[0], Ops.size());
+ Chain =
+ DAG.getMemIntrinsicNode(Opc, dl, DAG.getVTList(MVT::Other), Ops,
+ EltVT, MachinePointerInfo());
+ Offset += PerStoreOffset;
+ }
+ }
+ } else {
+ SmallVector<EVT, 16> ValVTs;
+ SmallVector<uint64_t, 16> Offsets;
+ ComputePTXValueVTs(*this, RetTy, ValVTs, &Offsets, 0);
+ assert(ValVTs.size() == OutVals.size() && "Bad return value decomposition");
+
+ for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
+ SDValue theVal = OutVals[i];
+ EVT TheValType = theVal.getValueType();
+ unsigned numElems = 1;
+ if (TheValType.isVector())
+ numElems = TheValType.getVectorNumElements();
+ for (unsigned j = 0, je = numElems; j != je; ++j) {
+ SDValue TmpVal = theVal;
+ if (TheValType.isVector())
+ TmpVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
+ TheValType.getVectorElementType(), TmpVal,
+ DAG.getIntPtrConstant(j));
+ EVT TheStoreType = ValVTs[i];
+ if (RetTy->isIntegerTy() &&
+ TD->getTypeAllocSizeInBits(RetTy) < 32) {
+ // The following zero-extension is for integer types only, and
+ // specifically not for aggregates.
+ TmpVal = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, TmpVal);
+ TheStoreType = MVT::i32;
+ }
+ else if (TmpVal.getValueType().getSizeInBits() < 16)
+ TmpVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, TmpVal);
+
+ SDValue Ops[] = {
+ Chain,
+ DAG.getConstant(Offsets[i], MVT::i32),
+ TmpVal };
+ Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreRetval, dl,
+ DAG.getVTList(MVT::Other), Ops,
+ TheStoreType,
+ MachinePointerInfo());
+ }
+ }
+ }
+
+ return DAG.getNode(NVPTXISD::RET_FLAG, dl, MVT::Other, Chain);
+}
+
+
+void NVPTXTargetLowering::LowerAsmOperandForConstraint(
+ SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops,
+ SelectionDAG &DAG) const {
+ if (Constraint.length() > 1)
+ return;
+ else
+ TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
+}
+
+// NVPTX supports vectors of legal types of any length in intrinsics because
+// the NVPTX-specific type legalizer will legalize them to the PTX-supported
+// length.
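+// For example, a v8f32 intrinsic operand is accepted here; the NVPTX type
+// legalizer is then expected to split it down to PTX-supported widths.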
+bool NVPTXTargetLowering::isTypeSupportedInIntrinsic(MVT VT) const {
+ if (isTypeLegal(VT))
+ return true;
+ if (VT.isVector()) {
+ MVT eVT = VT.getVectorElementType();
+ if (isTypeLegal(eVT))
+ return true;
+ }
+ return false;
+}
+
+static unsigned getOpcForTextureInstr(unsigned Intrinsic) {
+ switch (Intrinsic) {
+ default:
+ return 0;
+
+ case Intrinsic::nvvm_tex_1d_v4f32_s32:
+ return NVPTXISD::Tex1DFloatS32;
+ case Intrinsic::nvvm_tex_1d_v4f32_f32:
+ return NVPTXISD::Tex1DFloatFloat;
+ case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
+ return NVPTXISD::Tex1DFloatFloatLevel;
+ case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
+ return NVPTXISD::Tex1DFloatFloatGrad;
+ case Intrinsic::nvvm_tex_1d_v4s32_s32:
+ return NVPTXISD::Tex1DS32S32;
+ case Intrinsic::nvvm_tex_1d_v4s32_f32:
+ return NVPTXISD::Tex1DS32Float;
+ case Intrinsic::nvvm_tex_1d_level_v4s32_f32:
+ return NVPTXISD::Tex1DS32FloatLevel;
+ case Intrinsic::nvvm_tex_1d_grad_v4s32_f32:
+ return NVPTXISD::Tex1DS32FloatGrad;
+ case Intrinsic::nvvm_tex_1d_v4u32_s32:
+ return NVPTXISD::Tex1DU32S32;
+ case Intrinsic::nvvm_tex_1d_v4u32_f32:
+ return NVPTXISD::Tex1DU32Float;
+ case Intrinsic::nvvm_tex_1d_level_v4u32_f32:
+ return NVPTXISD::Tex1DU32FloatLevel;
+ case Intrinsic::nvvm_tex_1d_grad_v4u32_f32:
+ return NVPTXISD::Tex1DU32FloatGrad;
+
+ case Intrinsic::nvvm_tex_1d_array_v4f32_s32:
+ return NVPTXISD::Tex1DArrayFloatS32;
+ case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
+ return NVPTXISD::Tex1DArrayFloatFloat;
+ case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
+ return NVPTXISD::Tex1DArrayFloatFloatLevel;
+ case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
+ return NVPTXISD::Tex1DArrayFloatFloatGrad;
+ case Intrinsic::nvvm_tex_1d_array_v4s32_s32:
+ return NVPTXISD::Tex1DArrayS32S32;
+ case Intrinsic::nvvm_tex_1d_array_v4s32_f32:
+ return NVPTXISD::Tex1DArrayS32Float;
+ case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32:
+ return NVPTXISD::Tex1DArrayS32FloatLevel;
+ case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32:
+ return NVPTXISD::Tex1DArrayS32FloatGrad;
+ case Intrinsic::nvvm_tex_1d_array_v4u32_s32:
+ return NVPTXISD::Tex1DArrayU32S32;
+ case Intrinsic::nvvm_tex_1d_array_v4u32_f32:
+ return NVPTXISD::Tex1DArrayU32Float;
+ case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32:
+ return NVPTXISD::Tex1DArrayU32FloatLevel;
+ case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32:
+ return NVPTXISD::Tex1DArrayU32FloatGrad;
+
+ case Intrinsic::nvvm_tex_2d_v4f32_s32:
+ return NVPTXISD::Tex2DFloatS32;
+ case Intrinsic::nvvm_tex_2d_v4f32_f32:
+ return NVPTXISD::Tex2DFloatFloat;
+ case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
+ return NVPTXISD::Tex2DFloatFloatLevel;
+ case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
+ return NVPTXISD::Tex2DFloatFloatGrad;
+ case Intrinsic::nvvm_tex_2d_v4s32_s32:
+ return NVPTXISD::Tex2DS32S32;
+ case Intrinsic::nvvm_tex_2d_v4s32_f32:
+ return NVPTXISD::Tex2DS32Float;
+ case Intrinsic::nvvm_tex_2d_level_v4s32_f32:
+ return NVPTXISD::Tex2DS32FloatLevel;
+ case Intrinsic::nvvm_tex_2d_grad_v4s32_f32:
+ return NVPTXISD::Tex2DS32FloatGrad;
+ case Intrinsic::nvvm_tex_2d_v4u32_s32:
+ return NVPTXISD::Tex2DU32S32;
+ case Intrinsic::nvvm_tex_2d_v4u32_f32:
+ return NVPTXISD::Tex2DU32Float;
+ case Intrinsic::nvvm_tex_2d_level_v4u32_f32:
+ return NVPTXISD::Tex2DU32FloatLevel;
+ case Intrinsic::nvvm_tex_2d_grad_v4u32_f32:
+ return NVPTXISD::Tex2DU32FloatGrad;
+
+ case Intrinsic::nvvm_tex_2d_array_v4f32_s32:
+ return NVPTXISD::Tex2DArrayFloatS32;
+ case Intrinsic::nvvm_tex_2d_array_v4f32_f32:
+ return NVPTXISD::Tex2DArrayFloatFloat;
+ case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:
+ return NVPTXISD::Tex2DArrayFloatFloatLevel;
+ case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:
+ return NVPTXISD::Tex2DArrayFloatFloatGrad;
+ case Intrinsic::nvvm_tex_2d_array_v4s32_s32:
+ return NVPTXISD::Tex2DArrayS32S32;
+ case Intrinsic::nvvm_tex_2d_array_v4s32_f32:
+ return NVPTXISD::Tex2DArrayS32Float;
+ case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32:
+ return NVPTXISD::Tex2DArrayS32FloatLevel;
+ case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32:
+ return NVPTXISD::Tex2DArrayS32FloatGrad;
+ case Intrinsic::nvvm_tex_2d_array_v4u32_s32:
+ return NVPTXISD::Tex2DArrayU32S32;
+ case Intrinsic::nvvm_tex_2d_array_v4u32_f32:
+ return NVPTXISD::Tex2DArrayU32Float;
+ case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32:
+ return NVPTXISD::Tex2DArrayU32FloatLevel;
+ case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32:
+ return NVPTXISD::Tex2DArrayU32FloatGrad;
+
+ case Intrinsic::nvvm_tex_3d_v4f32_s32:
+ return NVPTXISD::Tex3DFloatS32;
+ case Intrinsic::nvvm_tex_3d_v4f32_f32:
+ return NVPTXISD::Tex3DFloatFloat;
+ case Intrinsic::nvvm_tex_3d_level_v4f32_f32:
+ return NVPTXISD::Tex3DFloatFloatLevel;
+ case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:
+ return NVPTXISD::Tex3DFloatFloatGrad;
+ case Intrinsic::nvvm_tex_3d_v4s32_s32:
+ return NVPTXISD::Tex3DS32S32;
+ case Intrinsic::nvvm_tex_3d_v4s32_f32:
+ return NVPTXISD::Tex3DS32Float;
+ case Intrinsic::nvvm_tex_3d_level_v4s32_f32:
+ return NVPTXISD::Tex3DS32FloatLevel;
+ case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:
+ return NVPTXISD::Tex3DS32FloatGrad;
+ case Intrinsic::nvvm_tex_3d_v4u32_s32:
+ return NVPTXISD::Tex3DU32S32;
+ case Intrinsic::nvvm_tex_3d_v4u32_f32:
+ return NVPTXISD::Tex3DU32Float;
+ case Intrinsic::nvvm_tex_3d_level_v4u32_f32:
+ return NVPTXISD::Tex3DU32FloatLevel;
+ case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:
+ return NVPTXISD::Tex3DU32FloatGrad;
+
+ case Intrinsic::nvvm_tex_cube_v4f32_f32:
+ return NVPTXISD::TexCubeFloatFloat;
+ case Intrinsic::nvvm_tex_cube_level_v4f32_f32:
+ return NVPTXISD::TexCubeFloatFloatLevel;
+ case Intrinsic::nvvm_tex_cube_v4s32_f32:
+ return NVPTXISD::TexCubeS32Float;
+ case Intrinsic::nvvm_tex_cube_level_v4s32_f32:
+ return NVPTXISD::TexCubeS32FloatLevel;
+ case Intrinsic::nvvm_tex_cube_v4u32_f32:
+ return NVPTXISD::TexCubeU32Float;
+ case Intrinsic::nvvm_tex_cube_level_v4u32_f32:
+ return NVPTXISD::TexCubeU32FloatLevel;
+
+ case Intrinsic::nvvm_tex_cube_array_v4f32_f32:
+ return NVPTXISD::TexCubeArrayFloatFloat;
+ case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32:
+ return NVPTXISD::TexCubeArrayFloatFloatLevel;
+ case Intrinsic::nvvm_tex_cube_array_v4s32_f32:
+ return NVPTXISD::TexCubeArrayS32Float;
+ case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:
+ return NVPTXISD::TexCubeArrayS32FloatLevel;
+ case Intrinsic::nvvm_tex_cube_array_v4u32_f32:
+ return NVPTXISD::TexCubeArrayU32Float;
+ case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:
+ return NVPTXISD::TexCubeArrayU32FloatLevel;
+
+ case Intrinsic::nvvm_tld4_r_2d_v4f32_f32:
+ return NVPTXISD::Tld4R2DFloatFloat;
+ case Intrinsic::nvvm_tld4_g_2d_v4f32_f32:
+ return NVPTXISD::Tld4G2DFloatFloat;
+ case Intrinsic::nvvm_tld4_b_2d_v4f32_f32:
+ return NVPTXISD::Tld4B2DFloatFloat;
+ case Intrinsic::nvvm_tld4_a_2d_v4f32_f32:
+ return NVPTXISD::Tld4A2DFloatFloat;
+ case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:
+ return NVPTXISD::Tld4R2DS64Float;
+ case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:
+ return NVPTXISD::Tld4G2DS64Float;
+ case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:
+ return NVPTXISD::Tld4B2DS64Float;
+ case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:
+ return NVPTXISD::Tld4A2DS64Float;
+ case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:
+ return NVPTXISD::Tld4R2DU64Float;
+ case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:
+ return NVPTXISD::Tld4G2DU64Float;
+ case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:
+ return NVPTXISD::Tld4B2DU64Float;
+ case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:
+ return NVPTXISD::Tld4A2DU64Float;
+
+ case Intrinsic::nvvm_tex_unified_1d_v4f32_s32:
+ return NVPTXISD::TexUnified1DFloatS32;
+ case Intrinsic::nvvm_tex_unified_1d_v4f32_f32:
+ return NVPTXISD::TexUnified1DFloatFloat;
+ case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32:
+ return NVPTXISD::TexUnified1DFloatFloatLevel;
+ case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32:
+ return NVPTXISD::TexUnified1DFloatFloatGrad;
+ case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:
+ return NVPTXISD::TexUnified1DS32S32;
+ case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:
+ return NVPTXISD::TexUnified1DS32Float;
+ case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:
+ return NVPTXISD::TexUnified1DS32FloatLevel;
+ case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:
+ return NVPTXISD::TexUnified1DS32FloatGrad;
+ case Intrinsic::nvvm_tex_unified_1d_v4u32_s32:
+ return NVPTXISD::TexUnified1DU32S32;
+ case Intrinsic::nvvm_tex_unified_1d_v4u32_f32:
+ return NVPTXISD::TexUnified1DU32Float;
+ case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32:
+ return NVPTXISD::TexUnified1DU32FloatLevel;
+ case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32:
+ return NVPTXISD::TexUnified1DU32FloatGrad;
+
+ case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32:
+ return NVPTXISD::TexUnified1DArrayFloatS32;
+ case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32:
+ return NVPTXISD::TexUnified1DArrayFloatFloat;
+ case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32:
+ return NVPTXISD::TexUnified1DArrayFloatFloatLevel;
+ case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32:
+ return NVPTXISD::TexUnified1DArrayFloatFloatGrad;
+ case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:
+ return NVPTXISD::TexUnified1DArrayS32S32;
+ case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:
+ return NVPTXISD::TexUnified1DArrayS32Float;
+ case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:
+ return NVPTXISD::TexUnified1DArrayS32FloatLevel;
+ case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32:
+ return NVPTXISD::TexUnified1DArrayS32FloatGrad;
+ case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32:
+ return NVPTXISD::TexUnified1DArrayU32S32;
+ case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32:
+ return NVPTXISD::TexUnified1DArrayU32Float;
+ case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32:
+ return NVPTXISD::TexUnified1DArrayU32FloatLevel;
+ case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32:
+ return NVPTXISD::TexUnified1DArrayU32FloatGrad;
+
+ case Intrinsic::nvvm_tex_unified_2d_v4f32_s32:
+ return NVPTXISD::TexUnified2DFloatS32;
+ case Intrinsic::nvvm_tex_unified_2d_v4f32_f32:
+ return NVPTXISD::TexUnified2DFloatFloat;
+ case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32:
+ return NVPTXISD::TexUnified2DFloatFloatLevel;
+ case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32:
+ return NVPTXISD::TexUnified2DFloatFloatGrad;
+ case Intrinsic::nvvm_tex_unified_2d_v4s32_s32:
+ return NVPTXISD::TexUnified2DS32S32;
+ case Intrinsic::nvvm_tex_unified_2d_v4s32_f32:
+ return NVPTXISD::TexUnified2DS32Float;
+ case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32:
+ return NVPTXISD::TexUnified2DS32FloatLevel;
+ case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32:
+ return NVPTXISD::TexUnified2DS32FloatGrad;
+ case Intrinsic::nvvm_tex_unified_2d_v4u32_s32:
+ return NVPTXISD::TexUnified2DU32S32;
+ case Intrinsic::nvvm_tex_unified_2d_v4u32_f32:
+ return NVPTXISD::TexUnified2DU32Float;
+ case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32:
+ return NVPTXISD::TexUnified2DU32FloatLevel;
+ case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32:
+ return NVPTXISD::TexUnified2DU32FloatGrad;
+
+ case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32:
+ return NVPTXISD::TexUnified2DArrayFloatS32;
+ case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32:
+ return NVPTXISD::TexUnified2DArrayFloatFloat;
+ case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32:
+ return NVPTXISD::TexUnified2DArrayFloatFloatLevel;
+ case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32:
+ return NVPTXISD::TexUnified2DArrayFloatFloatGrad;
+ case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32:
+ return NVPTXISD::TexUnified2DArrayS32S32;
+ case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32:
+ return NVPTXISD::TexUnified2DArrayS32Float;
+ case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32:
+ return NVPTXISD::TexUnified2DArrayS32FloatLevel;
+ case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32:
+ return NVPTXISD::TexUnified2DArrayS32FloatGrad;
+ case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32:
+ return NVPTXISD::TexUnified2DArrayU32S32;
+ case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32:
+ return NVPTXISD::TexUnified2DArrayU32Float;
+ case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32:
+ return NVPTXISD::TexUnified2DArrayU32FloatLevel;
+ case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32:
+ return NVPTXISD::TexUnified2DArrayU32FloatGrad;
+
+ case Intrinsic::nvvm_tex_unified_3d_v4f32_s32:
+ return NVPTXISD::TexUnified3DFloatS32;
+ case Intrinsic::nvvm_tex_unified_3d_v4f32_f32:
+ return NVPTXISD::TexUnified3DFloatFloat;
+ case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32:
+ return NVPTXISD::TexUnified3DFloatFloatLevel;
+ case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32:
+ return NVPTXISD::TexUnified3DFloatFloatGrad;
+ case Intrinsic::nvvm_tex_unified_3d_v4s32_s32:
+ return NVPTXISD::TexUnified3DS32S32;
+ case Intrinsic::nvvm_tex_unified_3d_v4s32_f32:
+ return NVPTXISD::TexUnified3DS32Float;
+ case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32:
+ return NVPTXISD::TexUnified3DS32FloatLevel;
+ case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32:
+ return NVPTXISD::TexUnified3DS32FloatGrad;
+ case Intrinsic::nvvm_tex_unified_3d_v4u32_s32:
+ return NVPTXISD::TexUnified3DU32S32;
+ case Intrinsic::nvvm_tex_unified_3d_v4u32_f32:
+ return NVPTXISD::TexUnified3DU32Float;
+ case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32:
+ return NVPTXISD::TexUnified3DU32FloatLevel;
+ case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32:
+ return NVPTXISD::TexUnified3DU32FloatGrad;
+
+ case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:
+ return NVPTXISD::TexUnifiedCubeFloatFloat;
+ case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:
+ return NVPTXISD::TexUnifiedCubeFloatFloatLevel;
+ case Intrinsic::nvvm_tex_unified_cube_v4s32_f32:
+ return NVPTXISD::TexUnifiedCubeS32Float;
+ case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32:
+ return NVPTXISD::TexUnifiedCubeS32FloatLevel;
+ case Intrinsic::nvvm_tex_unified_cube_v4u32_f32:
+ return NVPTXISD::TexUnifiedCubeU32Float;
+ case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32:
+ return NVPTXISD::TexUnifiedCubeU32FloatLevel;
+
+ case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:
+ return NVPTXISD::TexUnifiedCubeArrayFloatFloat;
+ case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:
+ return NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel;
+ case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32:
+ return NVPTXISD::TexUnifiedCubeArrayS32Float;
+ case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32:
+ return NVPTXISD::TexUnifiedCubeArrayS32FloatLevel;
+ case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32:
+ return NVPTXISD::TexUnifiedCubeArrayU32Float;
+ case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32:
+ return NVPTXISD::TexUnifiedCubeArrayU32FloatLevel;
+
+ case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:
+ return NVPTXISD::Tld4UnifiedR2DFloatFloat;
+ case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:
+ return NVPTXISD::Tld4UnifiedG2DFloatFloat;
+ case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:
+ return NVPTXISD::Tld4UnifiedB2DFloatFloat;
+ case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32:
+ return NVPTXISD::Tld4UnifiedA2DFloatFloat;
+ case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32:
+ return NVPTXISD::Tld4UnifiedR2DS64Float;
+ case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32:
+ return NVPTXISD::Tld4UnifiedG2DS64Float;
+ case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32:
+ return NVPTXISD::Tld4UnifiedB2DS64Float;
+ case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32:
+ return NVPTXISD::Tld4UnifiedA2DS64Float;
+ case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:
+ return NVPTXISD::Tld4UnifiedR2DU64Float;
+ case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:
+ return NVPTXISD::Tld4UnifiedG2DU64Float;
+ case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:
+ return NVPTXISD::Tld4UnifiedB2DU64Float;
+ case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32:
+ return NVPTXISD::Tld4UnifiedA2DU64Float;
+ }
+}
+
+static unsigned getOpcForSurfaceInstr(unsigned Intrinsic) {
+ switch (Intrinsic) {
+ default:
+ return 0;
+ case Intrinsic::nvvm_suld_1d_i8_clamp:
+ return NVPTXISD::Suld1DI8Clamp;
+ case Intrinsic::nvvm_suld_1d_i16_clamp:
+ return NVPTXISD::Suld1DI16Clamp;
+ case Intrinsic::nvvm_suld_1d_i32_clamp:
+ return NVPTXISD::Suld1DI32Clamp;
+ case Intrinsic::nvvm_suld_1d_i64_clamp:
+ return NVPTXISD::Suld1DI64Clamp;
+ case Intrinsic::nvvm_suld_1d_v2i8_clamp:
+ return NVPTXISD::Suld1DV2I8Clamp;
+ case Intrinsic::nvvm_suld_1d_v2i16_clamp:
+ return NVPTXISD::Suld1DV2I16Clamp;
+ case Intrinsic::nvvm_suld_1d_v2i32_clamp:
+ return NVPTXISD::Suld1DV2I32Clamp;
+ case Intrinsic::nvvm_suld_1d_v2i64_clamp:
+ return NVPTXISD::Suld1DV2I64Clamp;
+ case Intrinsic::nvvm_suld_1d_v4i8_clamp:
+ return NVPTXISD::Suld1DV4I8Clamp;
+ case Intrinsic::nvvm_suld_1d_v4i16_clamp:
+ return NVPTXISD::Suld1DV4I16Clamp;
+ case Intrinsic::nvvm_suld_1d_v4i32_clamp:
+ return NVPTXISD::Suld1DV4I32Clamp;
+ case Intrinsic::nvvm_suld_1d_array_i8_clamp:
+ return NVPTXISD::Suld1DArrayI8Clamp;
+ case Intrinsic::nvvm_suld_1d_array_i16_clamp:
+ return NVPTXISD::Suld1DArrayI16Clamp;
+ case Intrinsic::nvvm_suld_1d_array_i32_clamp:
+ return NVPTXISD::Suld1DArrayI32Clamp;
+ case Intrinsic::nvvm_suld_1d_array_i64_clamp:
+ return NVPTXISD::Suld1DArrayI64Clamp;
+ case Intrinsic::nvvm_suld_1d_array_v2i8_clamp:
+ return NVPTXISD::Suld1DArrayV2I8Clamp;
+ case Intrinsic::nvvm_suld_1d_array_v2i16_clamp:
+ return NVPTXISD::Suld1DArrayV2I16Clamp;
+ case Intrinsic::nvvm_suld_1d_array_v2i32_clamp:
+ return NVPTXISD::Suld1DArrayV2I32Clamp;
+ case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:
+ return NVPTXISD::Suld1DArrayV2I64Clamp;
+ case Intrinsic::nvvm_suld_1d_array_v4i8_clamp:
+ return NVPTXISD::Suld1DArrayV4I8Clamp;
+ case Intrinsic::nvvm_suld_1d_array_v4i16_clamp:
+ return NVPTXISD::Suld1DArrayV4I16Clamp;
+ case Intrinsic::nvvm_suld_1d_array_v4i32_clamp:
+ return NVPTXISD::Suld1DArrayV4I32Clamp;
+ case Intrinsic::nvvm_suld_2d_i8_clamp:
+ return NVPTXISD::Suld2DI8Clamp;
+ case Intrinsic::nvvm_suld_2d_i16_clamp:
+ return NVPTXISD::Suld2DI16Clamp;
+ case Intrinsic::nvvm_suld_2d_i32_clamp:
+ return NVPTXISD::Suld2DI32Clamp;
+ case Intrinsic::nvvm_suld_2d_i64_clamp:
+ return NVPTXISD::Suld2DI64Clamp;
+ case Intrinsic::nvvm_suld_2d_v2i8_clamp:
+ return NVPTXISD::Suld2DV2I8Clamp;
+ case Intrinsic::nvvm_suld_2d_v2i16_clamp:
+ return NVPTXISD::Suld2DV2I16Clamp;
+ case Intrinsic::nvvm_suld_2d_v2i32_clamp:
+ return NVPTXISD::Suld2DV2I32Clamp;
+ case Intrinsic::nvvm_suld_2d_v2i64_clamp:
+ return NVPTXISD::Suld2DV2I64Clamp;
+ case Intrinsic::nvvm_suld_2d_v4i8_clamp:
+ return NVPTXISD::Suld2DV4I8Clamp;
+ case Intrinsic::nvvm_suld_2d_v4i16_clamp:
+ return NVPTXISD::Suld2DV4I16Clamp;
+ case Intrinsic::nvvm_suld_2d_v4i32_clamp:
+ return NVPTXISD::Suld2DV4I32Clamp;
+ case Intrinsic::nvvm_suld_2d_array_i8_clamp:
+ return NVPTXISD::Suld2DArrayI8Clamp;
+ case Intrinsic::nvvm_suld_2d_array_i16_clamp:
+ return NVPTXISD::Suld2DArrayI16Clamp;
+ case Intrinsic::nvvm_suld_2d_array_i32_clamp:
+ return NVPTXISD::Suld2DArrayI32Clamp;
+ case Intrinsic::nvvm_suld_2d_array_i64_clamp:
+ return NVPTXISD::Suld2DArrayI64Clamp;
+ case Intrinsic::nvvm_suld_2d_array_v2i8_clamp:
+ return NVPTXISD::Suld2DArrayV2I8Clamp;
+ case Intrinsic::nvvm_suld_2d_array_v2i16_clamp:
+ return NVPTXISD::Suld2DArrayV2I16Clamp;
+ case Intrinsic::nvvm_suld_2d_array_v2i32_clamp:
+ return NVPTXISD::Suld2DArrayV2I32Clamp;
+ case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:
+ return NVPTXISD::Suld2DArrayV2I64Clamp;
+ case Intrinsic::nvvm_suld_2d_array_v4i8_clamp:
+ return NVPTXISD::Suld2DArrayV4I8Clamp;
+ case Intrinsic::nvvm_suld_2d_array_v4i16_clamp:
+ return NVPTXISD::Suld2DArrayV4I16Clamp;
+ case Intrinsic::nvvm_suld_2d_array_v4i32_clamp:
+ return NVPTXISD::Suld2DArrayV4I32Clamp;
+ case Intrinsic::nvvm_suld_3d_i8_clamp:
+ return NVPTXISD::Suld3DI8Clamp;
+ case Intrinsic::nvvm_suld_3d_i16_clamp:
+ return NVPTXISD::Suld3DI16Clamp;
+ case Intrinsic::nvvm_suld_3d_i32_clamp:
+ return NVPTXISD::Suld3DI32Clamp;
+ case Intrinsic::nvvm_suld_3d_i64_clamp:
+ return NVPTXISD::Suld3DI64Clamp;
+ case Intrinsic::nvvm_suld_3d_v2i8_clamp:
+ return NVPTXISD::Suld3DV2I8Clamp;
+ case Intrinsic::nvvm_suld_3d_v2i16_clamp:
+ return NVPTXISD::Suld3DV2I16Clamp;
+ case Intrinsic::nvvm_suld_3d_v2i32_clamp:
+ return NVPTXISD::Suld3DV2I32Clamp;
+ case Intrinsic::nvvm_suld_3d_v2i64_clamp:
+ return NVPTXISD::Suld3DV2I64Clamp;
+ case Intrinsic::nvvm_suld_3d_v4i8_clamp:
+ return NVPTXISD::Suld3DV4I8Clamp;
+ case Intrinsic::nvvm_suld_3d_v4i16_clamp:
+ return NVPTXISD::Suld3DV4I16Clamp;
+ case Intrinsic::nvvm_suld_3d_v4i32_clamp:
+ return NVPTXISD::Suld3DV4I32Clamp;
+ case Intrinsic::nvvm_suld_1d_i8_trap:
+ return NVPTXISD::Suld1DI8Trap;
+ case Intrinsic::nvvm_suld_1d_i16_trap:
+ return NVPTXISD::Suld1DI16Trap;
+ case Intrinsic::nvvm_suld_1d_i32_trap:
+ return NVPTXISD::Suld1DI32Trap;
+ case Intrinsic::nvvm_suld_1d_i64_trap:
+ return NVPTXISD::Suld1DI64Trap;
+ case Intrinsic::nvvm_suld_1d_v2i8_trap:
+ return NVPTXISD::Suld1DV2I8Trap;
+ case Intrinsic::nvvm_suld_1d_v2i16_trap:
+ return NVPTXISD::Suld1DV2I16Trap;
+ case Intrinsic::nvvm_suld_1d_v2i32_trap:
+ return NVPTXISD::Suld1DV2I32Trap;
+ case Intrinsic::nvvm_suld_1d_v2i64_trap:
+ return NVPTXISD::Suld1DV2I64Trap;
+ case Intrinsic::nvvm_suld_1d_v4i8_trap:
+ return NVPTXISD::Suld1DV4I8Trap;
+ case Intrinsic::nvvm_suld_1d_v4i16_trap:
+ return NVPTXISD::Suld1DV4I16Trap;
+ case Intrinsic::nvvm_suld_1d_v4i32_trap:
+ return NVPTXISD::Suld1DV4I32Trap;
+ case Intrinsic::nvvm_suld_1d_array_i8_trap:
+ return NVPTXISD::Suld1DArrayI8Trap;
+ case Intrinsic::nvvm_suld_1d_array_i16_trap:
+ return NVPTXISD::Suld1DArrayI16Trap;
+ case Intrinsic::nvvm_suld_1d_array_i32_trap:
+ return NVPTXISD::Suld1DArrayI32Trap;
+ case Intrinsic::nvvm_suld_1d_array_i64_trap:
+ return NVPTXISD::Suld1DArrayI64Trap;
+ case Intrinsic::nvvm_suld_1d_array_v2i8_trap:
+ return NVPTXISD::Suld1DArrayV2I8Trap;
+ case Intrinsic::nvvm_suld_1d_array_v2i16_trap:
+ return NVPTXISD::Suld1DArrayV2I16Trap;
+ case Intrinsic::nvvm_suld_1d_array_v2i32_trap:
+ return NVPTXISD::Suld1DArrayV2I32Trap;
+ case Intrinsic::nvvm_suld_1d_array_v2i64_trap:
+ return NVPTXISD::Suld1DArrayV2I64Trap;
+ case Intrinsic::nvvm_suld_1d_array_v4i8_trap:
+ return NVPTXISD::Suld1DArrayV4I8Trap;
+ case Intrinsic::nvvm_suld_1d_array_v4i16_trap:
+ return NVPTXISD::Suld1DArrayV4I16Trap;
+ case Intrinsic::nvvm_suld_1d_array_v4i32_trap:
+ return NVPTXISD::Suld1DArrayV4I32Trap;
+ case Intrinsic::nvvm_suld_2d_i8_trap:
+ return NVPTXISD::Suld2DI8Trap;
+ case Intrinsic::nvvm_suld_2d_i16_trap:
+ return NVPTXISD::Suld2DI16Trap;
+ case Intrinsic::nvvm_suld_2d_i32_trap:
+ return NVPTXISD::Suld2DI32Trap;
+ case Intrinsic::nvvm_suld_2d_i64_trap:
+ return NVPTXISD::Suld2DI64Trap;
+ case Intrinsic::nvvm_suld_2d_v2i8_trap:
+ return NVPTXISD::Suld2DV2I8Trap;
+ case Intrinsic::nvvm_suld_2d_v2i16_trap:
+ return NVPTXISD::Suld2DV2I16Trap;
+ case Intrinsic::nvvm_suld_2d_v2i32_trap:
+ return NVPTXISD::Suld2DV2I32Trap;
+ case Intrinsic::nvvm_suld_2d_v2i64_trap:
+ return NVPTXISD::Suld2DV2I64Trap;
+ case Intrinsic::nvvm_suld_2d_v4i8_trap:
+ return NVPTXISD::Suld2DV4I8Trap;
+ case Intrinsic::nvvm_suld_2d_v4i16_trap:
+ return NVPTXISD::Suld2DV4I16Trap;
+ case Intrinsic::nvvm_suld_2d_v4i32_trap:
+ return NVPTXISD::Suld2DV4I32Trap;
+ case Intrinsic::nvvm_suld_2d_array_i8_trap:
+ return NVPTXISD::Suld2DArrayI8Trap;
+ case Intrinsic::nvvm_suld_2d_array_i16_trap:
+ return NVPTXISD::Suld2DArrayI16Trap;
+ case Intrinsic::nvvm_suld_2d_array_i32_trap:
+ return NVPTXISD::Suld2DArrayI32Trap;
+ case Intrinsic::nvvm_suld_2d_array_i64_trap:
+ return NVPTXISD::Suld2DArrayI64Trap;
+ case Intrinsic::nvvm_suld_2d_array_v2i8_trap:
+ return NVPTXISD::Suld2DArrayV2I8Trap;
+ case Intrinsic::nvvm_suld_2d_array_v2i16_trap:
+ return NVPTXISD::Suld2DArrayV2I16Trap;
+ case Intrinsic::nvvm_suld_2d_array_v2i32_trap:
+ return NVPTXISD::Suld2DArrayV2I32Trap;
+ case Intrinsic::nvvm_suld_2d_array_v2i64_trap:
+ return NVPTXISD::Suld2DArrayV2I64Trap;
+ case Intrinsic::nvvm_suld_2d_array_v4i8_trap:
+ return NVPTXISD::Suld2DArrayV4I8Trap;
+ case Intrinsic::nvvm_suld_2d_array_v4i16_trap:
+ return NVPTXISD::Suld2DArrayV4I16Trap;
+ case Intrinsic::nvvm_suld_2d_array_v4i32_trap:
+ return NVPTXISD::Suld2DArrayV4I32Trap;
+ case Intrinsic::nvvm_suld_3d_i8_trap:
+ return NVPTXISD::Suld3DI8Trap;
+ case Intrinsic::nvvm_suld_3d_i16_trap:
+ return NVPTXISD::Suld3DI16Trap;
+ case Intrinsic::nvvm_suld_3d_i32_trap:
+ return NVPTXISD::Suld3DI32Trap;
+ case Intrinsic::nvvm_suld_3d_i64_trap:
+ return NVPTXISD::Suld3DI64Trap;
+ case Intrinsic::nvvm_suld_3d_v2i8_trap:
+ return NVPTXISD::Suld3DV2I8Trap;
+ case Intrinsic::nvvm_suld_3d_v2i16_trap:
+ return NVPTXISD::Suld3DV2I16Trap;
+ case Intrinsic::nvvm_suld_3d_v2i32_trap:
+ return NVPTXISD::Suld3DV2I32Trap;
+ case Intrinsic::nvvm_suld_3d_v2i64_trap:
+ return NVPTXISD::Suld3DV2I64Trap;
+ case Intrinsic::nvvm_suld_3d_v4i8_trap:
+ return NVPTXISD::Suld3DV4I8Trap;
+ case Intrinsic::nvvm_suld_3d_v4i16_trap:
+ return NVPTXISD::Suld3DV4I16Trap;
+ case Intrinsic::nvvm_suld_3d_v4i32_trap:
+ return NVPTXISD::Suld3DV4I32Trap;
+ case Intrinsic::nvvm_suld_1d_i8_zero:
+ return NVPTXISD::Suld1DI8Zero;
+ case Intrinsic::nvvm_suld_1d_i16_zero:
+ return NVPTXISD::Suld1DI16Zero;
+ case Intrinsic::nvvm_suld_1d_i32_zero:
+ return NVPTXISD::Suld1DI32Zero;
+ case Intrinsic::nvvm_suld_1d_i64_zero:
+ return NVPTXISD::Suld1DI64Zero;
+ case Intrinsic::nvvm_suld_1d_v2i8_zero:
+ return NVPTXISD::Suld1DV2I8Zero;
+ case Intrinsic::nvvm_suld_1d_v2i16_zero:
+ return NVPTXISD::Suld1DV2I16Zero;
+ case Intrinsic::nvvm_suld_1d_v2i32_zero:
+ return NVPTXISD::Suld1DV2I32Zero;
+ case Intrinsic::nvvm_suld_1d_v2i64_zero:
+ return NVPTXISD::Suld1DV2I64Zero;
+ case Intrinsic::nvvm_suld_1d_v4i8_zero:
+ return NVPTXISD::Suld1DV4I8Zero;
+ case Intrinsic::nvvm_suld_1d_v4i16_zero:
+ return NVPTXISD::Suld1DV4I16Zero;
+ case Intrinsic::nvvm_suld_1d_v4i32_zero:
+ return NVPTXISD::Suld1DV4I32Zero;
+ case Intrinsic::nvvm_suld_1d_array_i8_zero:
+ return NVPTXISD::Suld1DArrayI8Zero;
+ case Intrinsic::nvvm_suld_1d_array_i16_zero:
+ return NVPTXISD::Suld1DArrayI16Zero;
+ case Intrinsic::nvvm_suld_1d_array_i32_zero:
+ return NVPTXISD::Suld1DArrayI32Zero;
+ case Intrinsic::nvvm_suld_1d_array_i64_zero:
+ return NVPTXISD::Suld1DArrayI64Zero;
+ case Intrinsic::nvvm_suld_1d_array_v2i8_zero:
+ return NVPTXISD::Suld1DArrayV2I8Zero;
+ case Intrinsic::nvvm_suld_1d_array_v2i16_zero:
+ return NVPTXISD::Suld1DArrayV2I16Zero;
+ case Intrinsic::nvvm_suld_1d_array_v2i32_zero:
+ return NVPTXISD::Suld1DArrayV2I32Zero;
+ case Intrinsic::nvvm_suld_1d_array_v2i64_zero:
+ return NVPTXISD::Suld1DArrayV2I64Zero;
+ case Intrinsic::nvvm_suld_1d_array_v4i8_zero:
+ return NVPTXISD::Suld1DArrayV4I8Zero;
+ case Intrinsic::nvvm_suld_1d_array_v4i16_zero:
+ return NVPTXISD::Suld1DArrayV4I16Zero;
+ case Intrinsic::nvvm_suld_1d_array_v4i32_zero:
+ return NVPTXISD::Suld1DArrayV4I32Zero;
+ case Intrinsic::nvvm_suld_2d_i8_zero:
+ return NVPTXISD::Suld2DI8Zero;
+ case Intrinsic::nvvm_suld_2d_i16_zero:
+ return NVPTXISD::Suld2DI16Zero;
+ case Intrinsic::nvvm_suld_2d_i32_zero:
+ return NVPTXISD::Suld2DI32Zero;
+ case Intrinsic::nvvm_suld_2d_i64_zero:
+ return NVPTXISD::Suld2DI64Zero;
+ case Intrinsic::nvvm_suld_2d_v2i8_zero:
+ return NVPTXISD::Suld2DV2I8Zero;
+ case Intrinsic::nvvm_suld_2d_v2i16_zero:
+ return NVPTXISD::Suld2DV2I16Zero;
+ case Intrinsic::nvvm_suld_2d_v2i32_zero:
+ return NVPTXISD::Suld2DV2I32Zero;
+ case Intrinsic::nvvm_suld_2d_v2i64_zero:
+ return NVPTXISD::Suld2DV2I64Zero;
+ case Intrinsic::nvvm_suld_2d_v4i8_zero:
+ return NVPTXISD::Suld2DV4I8Zero;
+ case Intrinsic::nvvm_suld_2d_v4i16_zero:
+ return NVPTXISD::Suld2DV4I16Zero;
+ case Intrinsic::nvvm_suld_2d_v4i32_zero:
+ return NVPTXISD::Suld2DV4I32Zero;
+ case Intrinsic::nvvm_suld_2d_array_i8_zero:
+ return NVPTXISD::Suld2DArrayI8Zero;
+ case Intrinsic::nvvm_suld_2d_array_i16_zero:
+ return NVPTXISD::Suld2DArrayI16Zero;
+ case Intrinsic::nvvm_suld_2d_array_i32_zero:
+ return NVPTXISD::Suld2DArrayI32Zero;
+ case Intrinsic::nvvm_suld_2d_array_i64_zero:
+ return NVPTXISD::Suld2DArrayI64Zero;
+ case Intrinsic::nvvm_suld_2d_array_v2i8_zero:
+ return NVPTXISD::Suld2DArrayV2I8Zero;
+ case Intrinsic::nvvm_suld_2d_array_v2i16_zero:
+ return NVPTXISD::Suld2DArrayV2I16Zero;
+ case Intrinsic::nvvm_suld_2d_array_v2i32_zero:
+ return NVPTXISD::Suld2DArrayV2I32Zero;
+ case Intrinsic::nvvm_suld_2d_array_v2i64_zero:
+ return NVPTXISD::Suld2DArrayV2I64Zero;
+ case Intrinsic::nvvm_suld_2d_array_v4i8_zero:
+ return NVPTXISD::Suld2DArrayV4I8Zero;
+ case Intrinsic::nvvm_suld_2d_array_v4i16_zero:
+ return NVPTXISD::Suld2DArrayV4I16Zero;
+ case Intrinsic::nvvm_suld_2d_array_v4i32_zero:
+ return NVPTXISD::Suld2DArrayV4I32Zero;
+ case Intrinsic::nvvm_suld_3d_i8_zero:
+ return NVPTXISD::Suld3DI8Zero;
+ case Intrinsic::nvvm_suld_3d_i16_zero:
+ return NVPTXISD::Suld3DI16Zero;
+ case Intrinsic::nvvm_suld_3d_i32_zero:
+ return NVPTXISD::Suld3DI32Zero;
+ case Intrinsic::nvvm_suld_3d_i64_zero:
+ return NVPTXISD::Suld3DI64Zero;
+ case Intrinsic::nvvm_suld_3d_v2i8_zero:
+ return NVPTXISD::Suld3DV2I8Zero;
+ case Intrinsic::nvvm_suld_3d_v2i16_zero:
+ return NVPTXISD::Suld3DV2I16Zero;
+ case Intrinsic::nvvm_suld_3d_v2i32_zero:
+ return NVPTXISD::Suld3DV2I32Zero;
+ case Intrinsic::nvvm_suld_3d_v2i64_zero:
+ return NVPTXISD::Suld3DV2I64Zero;
+ case Intrinsic::nvvm_suld_3d_v4i8_zero:
+ return NVPTXISD::Suld3DV4I8Zero;
+ case Intrinsic::nvvm_suld_3d_v4i16_zero:
+ return NVPTXISD::Suld3DV4I16Zero;
+ case Intrinsic::nvvm_suld_3d_v4i32_zero:
+ return NVPTXISD::Suld3DV4I32Zero;
+ }
+}
+
+// llvm.ptx.memcpy.const and llvm.ptx.memmove.const need to be modeled as
+// TgtMemIntrinsic because we need the information that is only available in
+// the "Value" type of the destination pointer. In particular, the address
+// space information.
+bool NVPTXTargetLowering::getTgtMemIntrinsic(
+ IntrinsicInfo &Info, const CallInst &I, unsigned Intrinsic) const {
+ switch (Intrinsic) {
+ default:
+ return false;
+
+ case Intrinsic::nvvm_atomic_load_add_f32:
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.memVT = MVT::f32;
+ Info.ptrVal = I.getArgOperand(0);
+ Info.offset = 0;
+ Info.vol = 0;
+ Info.readMem = true;
+ Info.writeMem = true;
+ Info.align = 0;
+ return true;
+
+ case Intrinsic::nvvm_atomic_load_inc_32:
+ case Intrinsic::nvvm_atomic_load_dec_32:
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.memVT = MVT::i32;
+ Info.ptrVal = I.getArgOperand(0);
+ Info.offset = 0;
+ Info.vol = 0;
+ Info.readMem = true;
+ Info.writeMem = true;
+ Info.align = 0;
+ return true;
+
+ case Intrinsic::nvvm_ldu_global_i:
+ case Intrinsic::nvvm_ldu_global_f:
+ case Intrinsic::nvvm_ldu_global_p: {
+
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ if (Intrinsic == Intrinsic::nvvm_ldu_global_i)
+ Info.memVT = getValueType(I.getType());
+ else if (Intrinsic == Intrinsic::nvvm_ldu_global_p)
+ Info.memVT = getPointerTy();
+ else
+ Info.memVT = getValueType(I.getType());
+ Info.ptrVal = I.getArgOperand(0);
+ Info.offset = 0;
+ Info.vol = 0;
+ Info.readMem = true;
+ Info.writeMem = false;
+
+ // alignment is available as metadata.
+ // Grab it and set the alignment.
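+ // For example, if the call carries "align" metadata whose single operand
+ // is the constant i32 4, Info.align is set to 4 below (illustrative; the
+ // exact metadata form is whatever the front end emits).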
+ assert(I.hasMetadataOtherThanDebugLoc() && "Must have alignment metadata");
+ MDNode *AlignMD = I.getMetadata("align");
+ assert(AlignMD && "Must have a non-null MDNode");
+ assert(AlignMD->getNumOperands() == 1 && "Must have a single operand");
+ Value *Align = AlignMD->getOperand(0);
+ int64_t Alignment = cast<ConstantInt>(Align)->getZExtValue();
+ Info.align = Alignment;
+
+ return true;
+ }
+ case Intrinsic::nvvm_ldg_global_i:
+ case Intrinsic::nvvm_ldg_global_f:
+ case Intrinsic::nvvm_ldg_global_p: {
+
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ if (Intrinsic == Intrinsic::nvvm_ldg_global_i)
+ Info.memVT = getValueType(I.getType());
+ else if (Intrinsic == Intrinsic::nvvm_ldg_global_p)
+ Info.memVT = getPointerTy();
+ else
+ Info.memVT = getValueType(I.getType());
+ Info.ptrVal = I.getArgOperand(0);
+ Info.offset = 0;
+ Info.vol = 0;
+ Info.readMem = true;
+ Info.writeMem = false;
+
+ // alignment is available as metadata.
+ // Grab it and set the alignment.
+ assert(I.hasMetadataOtherThanDebugLoc() && "Must have alignment metadata");
+ MDNode *AlignMD = I.getMetadata("align");
+ assert(AlignMD && "Must have a non-null MDNode");
+ assert(AlignMD->getNumOperands() == 1 && "Must have a single operand");
+ Value *Align = AlignMD->getOperand(0);
+ int64_t Alignment = cast<ConstantInt>(Align)->getZExtValue();
+ Info.align = Alignment;
+
+ return true;
+ }
+
+ case Intrinsic::nvvm_tex_1d_v4f32_s32:
+ case Intrinsic::nvvm_tex_1d_v4f32_f32:
+ case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
+ case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
+ case Intrinsic::nvvm_tex_1d_array_v4f32_s32:
+ case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
+ case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
+ case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
+ case Intrinsic::nvvm_tex_2d_v4f32_s32:
+ case Intrinsic::nvvm_tex_2d_v4f32_f32:
+ case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
+ case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
+ case Intrinsic::nvvm_tex_2d_array_v4f32_s32:
+ case Intrinsic::nvvm_tex_2d_array_v4f32_f32:
+ case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:
+ case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:
+ case Intrinsic::nvvm_tex_3d_v4f32_s32:
+ case Intrinsic::nvvm_tex_3d_v4f32_f32:
+ case Intrinsic::nvvm_tex_3d_level_v4f32_f32:
+ case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:
+ case Intrinsic::nvvm_tex_cube_v4f32_f32:
+ case Intrinsic::nvvm_tex_cube_level_v4f32_f32:
+ case Intrinsic::nvvm_tex_cube_array_v4f32_f32:
+ case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32:
+ case Intrinsic::nvvm_tld4_r_2d_v4f32_f32:
+ case Intrinsic::nvvm_tld4_g_2d_v4f32_f32:
+ case Intrinsic::nvvm_tld4_b_2d_v4f32_f32:
+ case Intrinsic::nvvm_tld4_a_2d_v4f32_f32:
+ case Intrinsic::nvvm_tex_unified_1d_v4f32_s32:
+ case Intrinsic::nvvm_tex_unified_1d_v4f32_f32:
+ case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32:
+ case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32:
+ case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32:
+ case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32:
+ case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32:
+ case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32:
+ case Intrinsic::nvvm_tex_unified_2d_v4f32_s32:
+ case Intrinsic::nvvm_tex_unified_2d_v4f32_f32:
+ case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32:
+ case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32:
+ case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32:
+ case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32:
+ case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32:
+ case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32:
+ case Intrinsic::nvvm_tex_unified_3d_v4f32_s32:
+ case Intrinsic::nvvm_tex_unified_3d_v4f32_f32:
+ case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32:
+ case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32:
+ case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:
+ case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:
+ case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:
+ case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:
+ case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:
+ case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:
+ case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:
+ case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32: {
+ Info.opc = getOpcForTextureInstr(Intrinsic);
+ Info.memVT = MVT::v4f32;
+ Info.ptrVal = nullptr;
+ Info.offset = 0;
+ Info.vol = 0;
+ Info.readMem = true;
+ Info.writeMem = false;
+ Info.align = 16;
+ return true;
+ }
+ case Intrinsic::nvvm_tex_1d_v4s32_s32:
+ case Intrinsic::nvvm_tex_1d_v4s32_f32:
+ case Intrinsic::nvvm_tex_1d_level_v4s32_f32:
+ case Intrinsic::nvvm_tex_1d_grad_v4s32_f32:
+ case Intrinsic::nvvm_tex_1d_array_v4s32_s32:
+ case Intrinsic::nvvm_tex_1d_array_v4s32_f32:
+ case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32:
+ case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32:
+ case Intrinsic::nvvm_tex_2d_v4s32_s32:
+ case Intrinsic::nvvm_tex_2d_v4s32_f32:
+ case Intrinsic::nvvm_tex_2d_level_v4s32_f32:
+ case Intrinsic::nvvm_tex_2d_grad_v4s32_f32:
+ case Intrinsic::nvvm_tex_2d_array_v4s32_s32:
+ case Intrinsic::nvvm_tex_2d_array_v4s32_f32:
+ case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32:
+ case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32:
+ case Intrinsic::nvvm_tex_3d_v4s32_s32:
+ case Intrinsic::nvvm_tex_3d_v4s32_f32:
+ case Intrinsic::nvvm_tex_3d_level_v4s32_f32:
+ case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:
+ case Intrinsic::nvvm_tex_cube_v4s32_f32:
+ case Intrinsic::nvvm_tex_cube_level_v4s32_f32:
+ case Intrinsic::nvvm_tex_cube_array_v4s32_f32:
+ case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:
+ case Intrinsic::nvvm_tex_cube_v4u32_f32:
+ case Intrinsic::nvvm_tex_cube_level_v4u32_f32:
+ case Intrinsic::nvvm_tex_cube_array_v4u32_f32:
+ case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:
+ case Intrinsic::nvvm_tex_1d_v4u32_s32:
+ case Intrinsic::nvvm_tex_1d_v4u32_f32:
+ case Intrinsic::nvvm_tex_1d_level_v4u32_f32:
+ case Intrinsic::nvvm_tex_1d_grad_v4u32_f32:
+ case Intrinsic::nvvm_tex_1d_array_v4u32_s32:
+ case Intrinsic::nvvm_tex_1d_array_v4u32_f32:
+ case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32:
+ case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32:
+ case Intrinsic::nvvm_tex_2d_v4u32_s32:
+ case Intrinsic::nvvm_tex_2d_v4u32_f32:
+ case Intrinsic::nvvm_tex_2d_level_v4u32_f32:
+ case Intrinsic::nvvm_tex_2d_grad_v4u32_f32:
+ case Intrinsic::nvvm_tex_2d_array_v4u32_s32:
+ case Intrinsic::nvvm_tex_2d_array_v4u32_f32:
+ case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32:
+ case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32:
+ case Intrinsic::nvvm_tex_3d_v4u32_s32:
+ case Intrinsic::nvvm_tex_3d_v4u32_f32:
+ case Intrinsic::nvvm_tex_3d_level_v4u32_f32:
+ case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:
+ case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:
+ case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:
+ case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:
+ case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:
+ case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:
+ case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:
+ case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:
+ case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:
+ case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:
+ case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:
+ case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:
+ case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:
+ case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:
+ case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:
+ case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:
+ case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32:
+ case Intrinsic::nvvm_tex_unified_2d_v4s32_s32:
+ case Intrinsic::nvvm_tex_unified_2d_v4s32_f32:
+ case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32:
+ case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32:
+ case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32:
+ case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32:
+ case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32:
+ case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32:
+ case Intrinsic::nvvm_tex_unified_3d_v4s32_s32:
+ case Intrinsic::nvvm_tex_unified_3d_v4s32_f32:
+ case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32:
+ case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32:
+ case Intrinsic::nvvm_tex_unified_1d_v4u32_s32:
+ case Intrinsic::nvvm_tex_unified_1d_v4u32_f32:
+ case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32:
+ case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32:
+ case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32:
+ case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32:
+ case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32:
+ case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32:
+ case Intrinsic::nvvm_tex_unified_2d_v4u32_s32:
+ case Intrinsic::nvvm_tex_unified_2d_v4u32_f32:
+ case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32:
+ case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32:
+ case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32:
+ case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32:
+ case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32:
+ case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32:
+ case Intrinsic::nvvm_tex_unified_3d_v4u32_s32:
+ case Intrinsic::nvvm_tex_unified_3d_v4u32_f32:
+ case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32:
+ case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32:
+ case Intrinsic::nvvm_tex_unified_cube_v4s32_f32:
+ case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32:
+ case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32:
+ case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32:
+ case Intrinsic::nvvm_tex_unified_cube_v4u32_f32:
+ case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32:
+ case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32:
+ case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32:
+ case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32:
+ case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32:
+ case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32:
+ case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32:
+ case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:
+ case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:
+ case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:
+ case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32: {
+ Info.opc = getOpcForTextureInstr(Intrinsic);
+ Info.memVT = MVT::v4i32;
+ Info.ptrVal = nullptr;
+ Info.offset = 0;
+ Info.vol = 0;
+ Info.readMem = true;
+ Info.writeMem = false;
+ Info.align = 16;
+ return true;
+ }
+ case Intrinsic::nvvm_suld_1d_i8_clamp:
+ case Intrinsic::nvvm_suld_1d_v2i8_clamp:
+ case Intrinsic::nvvm_suld_1d_v4i8_clamp:
+ case Intrinsic::nvvm_suld_1d_array_i8_clamp:
+ case Intrinsic::nvvm_suld_1d_array_v2i8_clamp:
+ case Intrinsic::nvvm_suld_1d_array_v4i8_clamp:
+ case Intrinsic::nvvm_suld_2d_i8_clamp:
+ case Intrinsic::nvvm_suld_2d_v2i8_clamp:
+ case Intrinsic::nvvm_suld_2d_v4i8_clamp:
+ case Intrinsic::nvvm_suld_2d_array_i8_clamp:
+ case Intrinsic::nvvm_suld_2d_array_v2i8_clamp:
+ case Intrinsic::nvvm_suld_2d_array_v4i8_clamp:
+ case Intrinsic::nvvm_suld_3d_i8_clamp:
+ case Intrinsic::nvvm_suld_3d_v2i8_clamp:
+ case Intrinsic::nvvm_suld_3d_v4i8_clamp:
+ case Intrinsic::nvvm_suld_1d_i8_trap:
+ case Intrinsic::nvvm_suld_1d_v2i8_trap:
+ case Intrinsic::nvvm_suld_1d_v4i8_trap:
+ case Intrinsic::nvvm_suld_1d_array_i8_trap:
+ case Intrinsic::nvvm_suld_1d_array_v2i8_trap:
+ case Intrinsic::nvvm_suld_1d_array_v4i8_trap:
+ case Intrinsic::nvvm_suld_2d_i8_trap:
+ case Intrinsic::nvvm_suld_2d_v2i8_trap:
+ case Intrinsic::nvvm_suld_2d_v4i8_trap:
+ case Intrinsic::nvvm_suld_2d_array_i8_trap:
+ case Intrinsic::nvvm_suld_2d_array_v2i8_trap:
+ case Intrinsic::nvvm_suld_2d_array_v4i8_trap:
+ case Intrinsic::nvvm_suld_3d_i8_trap:
+ case Intrinsic::nvvm_suld_3d_v2i8_trap:
+ case Intrinsic::nvvm_suld_3d_v4i8_trap:
+ case Intrinsic::nvvm_suld_1d_i8_zero:
+ case Intrinsic::nvvm_suld_1d_v2i8_zero:
+ case Intrinsic::nvvm_suld_1d_v4i8_zero:
+ case Intrinsic::nvvm_suld_1d_array_i8_zero:
+ case Intrinsic::nvvm_suld_1d_array_v2i8_zero:
+ case Intrinsic::nvvm_suld_1d_array_v4i8_zero:
+ case Intrinsic::nvvm_suld_2d_i8_zero:
+ case Intrinsic::nvvm_suld_2d_v2i8_zero:
+ case Intrinsic::nvvm_suld_2d_v4i8_zero:
+ case Intrinsic::nvvm_suld_2d_array_i8_zero:
+ case Intrinsic::nvvm_suld_2d_array_v2i8_zero:
+ case Intrinsic::nvvm_suld_2d_array_v4i8_zero:
+ case Intrinsic::nvvm_suld_3d_i8_zero:
+ case Intrinsic::nvvm_suld_3d_v2i8_zero:
+ case Intrinsic::nvvm_suld_3d_v4i8_zero: {
+ Info.opc = getOpcForSurfaceInstr(Intrinsic);
+ Info.memVT = MVT::i8;
+ Info.ptrVal = nullptr;
+ Info.offset = 0;
+ Info.vol = 0;
+ Info.readMem = true;
+ Info.writeMem = false;
+ Info.align = 16;
+ return true;
+ }
+ case Intrinsic::nvvm_suld_1d_i16_clamp:
+ case Intrinsic::nvvm_suld_1d_v2i16_clamp:
+ case Intrinsic::nvvm_suld_1d_v4i16_clamp:
+ case Intrinsic::nvvm_suld_1d_array_i16_clamp:
+ case Intrinsic::nvvm_suld_1d_array_v2i16_clamp:
+ case Intrinsic::nvvm_suld_1d_array_v4i16_clamp:
+ case Intrinsic::nvvm_suld_2d_i16_clamp:
+ case Intrinsic::nvvm_suld_2d_v2i16_clamp:
+ case Intrinsic::nvvm_suld_2d_v4i16_clamp:
+ case Intrinsic::nvvm_suld_2d_array_i16_clamp:
+ case Intrinsic::nvvm_suld_2d_array_v2i16_clamp:
+ case Intrinsic::nvvm_suld_2d_array_v4i16_clamp:
+ case Intrinsic::nvvm_suld_3d_i16_clamp:
+ case Intrinsic::nvvm_suld_3d_v2i16_clamp:
+ case Intrinsic::nvvm_suld_3d_v4i16_clamp:
+ case Intrinsic::nvvm_suld_1d_i16_trap:
+ case Intrinsic::nvvm_suld_1d_v2i16_trap:
+ case Intrinsic::nvvm_suld_1d_v4i16_trap:
+ case Intrinsic::nvvm_suld_1d_array_i16_trap:
+ case Intrinsic::nvvm_suld_1d_array_v2i16_trap:
+ case Intrinsic::nvvm_suld_1d_array_v4i16_trap:
+ case Intrinsic::nvvm_suld_2d_i16_trap:
+ case Intrinsic::nvvm_suld_2d_v2i16_trap:
+ case Intrinsic::nvvm_suld_2d_v4i16_trap:
+ case Intrinsic::nvvm_suld_2d_array_i16_trap:
+ case Intrinsic::nvvm_suld_2d_array_v2i16_trap:
+ case Intrinsic::nvvm_suld_2d_array_v4i16_trap:
+ case Intrinsic::nvvm_suld_3d_i16_trap:
+ case Intrinsic::nvvm_suld_3d_v2i16_trap:
+ case Intrinsic::nvvm_suld_3d_v4i16_trap:
+ case Intrinsic::nvvm_suld_1d_i16_zero:
+ case Intrinsic::nvvm_suld_1d_v2i16_zero:
+ case Intrinsic::nvvm_suld_1d_v4i16_zero:
+ case Intrinsic::nvvm_suld_1d_array_i16_zero:
+ case Intrinsic::nvvm_suld_1d_array_v2i16_zero:
+ case Intrinsic::nvvm_suld_1d_array_v4i16_zero:
+ case Intrinsic::nvvm_suld_2d_i16_zero:
+ case Intrinsic::nvvm_suld_2d_v2i16_zero:
+ case Intrinsic::nvvm_suld_2d_v4i16_zero:
+ case Intrinsic::nvvm_suld_2d_array_i16_zero:
+ case Intrinsic::nvvm_suld_2d_array_v2i16_zero:
+ case Intrinsic::nvvm_suld_2d_array_v4i16_zero:
+ case Intrinsic::nvvm_suld_3d_i16_zero:
+ case Intrinsic::nvvm_suld_3d_v2i16_zero:
+ case Intrinsic::nvvm_suld_3d_v4i16_zero: {
+ Info.opc = getOpcForSurfaceInstr(Intrinsic);
+ Info.memVT = MVT::i16;
+ Info.ptrVal = nullptr;
+ Info.offset = 0;
+ Info.vol = 0;
+ Info.readMem = true;
+ Info.writeMem = false;
+ Info.align = 16;
+ return true;
+ }
+ case Intrinsic::nvvm_suld_1d_i32_clamp:
+ case Intrinsic::nvvm_suld_1d_v2i32_clamp:
+ case Intrinsic::nvvm_suld_1d_v4i32_clamp:
+ case Intrinsic::nvvm_suld_1d_array_i32_clamp:
+ case Intrinsic::nvvm_suld_1d_array_v2i32_clamp:
+ case Intrinsic::nvvm_suld_1d_array_v4i32_clamp:
+ case Intrinsic::nvvm_suld_2d_i32_clamp:
+ case Intrinsic::nvvm_suld_2d_v2i32_clamp:
+ case Intrinsic::nvvm_suld_2d_v4i32_clamp:
+ case Intrinsic::nvvm_suld_2d_array_i32_clamp:
+ case Intrinsic::nvvm_suld_2d_array_v2i32_clamp:
+ case Intrinsic::nvvm_suld_2d_array_v4i32_clamp:
+ case Intrinsic::nvvm_suld_3d_i32_clamp:
+ case Intrinsic::nvvm_suld_3d_v2i32_clamp:
+ case Intrinsic::nvvm_suld_3d_v4i32_clamp:
+ case Intrinsic::nvvm_suld_1d_i32_trap:
+ case Intrinsic::nvvm_suld_1d_v2i32_trap:
+ case Intrinsic::nvvm_suld_1d_v4i32_trap:
+ case Intrinsic::nvvm_suld_1d_array_i32_trap:
+ case Intrinsic::nvvm_suld_1d_array_v2i32_trap:
+ case Intrinsic::nvvm_suld_1d_array_v4i32_trap:
+ case Intrinsic::nvvm_suld_2d_i32_trap:
+ case Intrinsic::nvvm_suld_2d_v2i32_trap:
+ case Intrinsic::nvvm_suld_2d_v4i32_trap:
+ case Intrinsic::nvvm_suld_2d_array_i32_trap:
+ case Intrinsic::nvvm_suld_2d_array_v2i32_trap:
+ case Intrinsic::nvvm_suld_2d_array_v4i32_trap:
+ case Intrinsic::nvvm_suld_3d_i32_trap:
+ case Intrinsic::nvvm_suld_3d_v2i32_trap:
+ case Intrinsic::nvvm_suld_3d_v4i32_trap:
+ case Intrinsic::nvvm_suld_1d_i32_zero:
+ case Intrinsic::nvvm_suld_1d_v2i32_zero:
+ case Intrinsic::nvvm_suld_1d_v4i32_zero:
+ case Intrinsic::nvvm_suld_1d_array_i32_zero:
+ case Intrinsic::nvvm_suld_1d_array_v2i32_zero:
+ case Intrinsic::nvvm_suld_1d_array_v4i32_zero:
+ case Intrinsic::nvvm_suld_2d_i32_zero:
+ case Intrinsic::nvvm_suld_2d_v2i32_zero:
+ case Intrinsic::nvvm_suld_2d_v4i32_zero:
+ case Intrinsic::nvvm_suld_2d_array_i32_zero:
+ case Intrinsic::nvvm_suld_2d_array_v2i32_zero:
+ case Intrinsic::nvvm_suld_2d_array_v4i32_zero:
+ case Intrinsic::nvvm_suld_3d_i32_zero:
+ case Intrinsic::nvvm_suld_3d_v2i32_zero:
+ case Intrinsic::nvvm_suld_3d_v4i32_zero: {
+ Info.opc = getOpcForSurfaceInstr(Intrinsic);
+ Info.memVT = MVT::i32;
+ Info.ptrVal = nullptr;
+ Info.offset = 0;
+ Info.vol = 0;
+ Info.readMem = true;
+ Info.writeMem = false;
+ Info.align = 16;
+ return true;
+ }
+ case Intrinsic::nvvm_suld_1d_i64_clamp:
+ case Intrinsic::nvvm_suld_1d_v2i64_clamp:
+ case Intrinsic::nvvm_suld_1d_array_i64_clamp:
+ case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:
+ case Intrinsic::nvvm_suld_2d_i64_clamp:
+ case Intrinsic::nvvm_suld_2d_v2i64_clamp:
+ case Intrinsic::nvvm_suld_2d_array_i64_clamp:
+ case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:
+ case Intrinsic::nvvm_suld_3d_i64_clamp:
+ case Intrinsic::nvvm_suld_3d_v2i64_clamp:
+ case Intrinsic::nvvm_suld_1d_i64_trap:
+ case Intrinsic::nvvm_suld_1d_v2i64_trap:
+ case Intrinsic::nvvm_suld_1d_array_i64_trap:
+ case Intrinsic::nvvm_suld_1d_array_v2i64_trap:
+ case Intrinsic::nvvm_suld_2d_i64_trap:
+ case Intrinsic::nvvm_suld_2d_v2i64_trap:
+ case Intrinsic::nvvm_suld_2d_array_i64_trap:
+ case Intrinsic::nvvm_suld_2d_array_v2i64_trap:
+ case Intrinsic::nvvm_suld_3d_i64_trap:
+ case Intrinsic::nvvm_suld_3d_v2i64_trap:
+ case Intrinsic::nvvm_suld_1d_i64_zero:
+ case Intrinsic::nvvm_suld_1d_v2i64_zero:
+ case Intrinsic::nvvm_suld_1d_array_i64_zero:
+ case Intrinsic::nvvm_suld_1d_array_v2i64_zero:
+ case Intrinsic::nvvm_suld_2d_i64_zero:
+ case Intrinsic::nvvm_suld_2d_v2i64_zero:
+ case Intrinsic::nvvm_suld_2d_array_i64_zero:
+ case Intrinsic::nvvm_suld_2d_array_v2i64_zero:
+ case Intrinsic::nvvm_suld_3d_i64_zero:
+ case Intrinsic::nvvm_suld_3d_v2i64_zero: {
+ Info.opc = getOpcForSurfaceInstr(Intrinsic);
+ Info.memVT = MVT::i64;
+ Info.ptrVal = nullptr;
+ Info.offset = 0;
+ Info.vol = 0;
+ Info.readMem = true;
+ Info.writeMem = false;
+ Info.align = 16;
+ return true;
+ }
+ }
+ return false;
+}
+
+/// isLegalAddressingMode - Return true if the addressing mode represented
+/// by AM is legal for this target, for a load/store of the specified type.
+/// Used to guide target specific optimizations, like loop strength reduction
+/// (LoopStrengthReduce.cpp) and memory optimization for address mode
+/// (CodeGenPrepare.cpp)
+bool NVPTXTargetLowering::isLegalAddressingMode(const AddrMode &AM,
+ Type *Ty) const {
+
+ // AddrMode - This represents an addressing mode of:
+ // BaseGV + BaseOffs + BaseReg + Scale*ScaleReg
+ //
+ // The legal address modes are
+ // - [avar]
+ // - [areg]
+ // - [areg+immoff]
+ // - [immAddr]
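+ //
+ // For illustration, these roughly correspond to PTX address operands such
+ // as [globalvar], [%r1], [%r1+4], and [4096] (sketch only; the actual
+ // syntax is produced by the asm printer).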
+
+ if (AM.BaseGV) {
+ if (AM.BaseOffs || AM.HasBaseReg || AM.Scale)
+ return false;
+ return true;
+ }
+
+ switch (AM.Scale) {
+ case 0: // "r", "r+i" or "i" is allowed
+ break;
+ case 1:
+ if (AM.HasBaseReg) // "r+r+i" or "r+r" is not allowed.
+ return false;
+ // Otherwise we have r+i.
+ break;
+ default:
+ // No scale > 1 is allowed
+ return false;
+ }
+ return true;
+}
+
+//===----------------------------------------------------------------------===//
+// NVPTX Inline Assembly Support
+//===----------------------------------------------------------------------===//
+
+/// getConstraintType - Given a constraint letter, return the type of
+/// constraint it is for this target.
+NVPTXTargetLowering::ConstraintType
+NVPTXTargetLowering::getConstraintType(const std::string &Constraint) const {
+ if (Constraint.size() == 1) {
+ switch (Constraint[0]) {
+ default:
+ break;
+ case 'b':
+ case 'r':
+ case 'h':
+ case 'c':
+ case 'l':
+ case 'f':
+ case 'd':
+ case '0':
+ case 'N':
+ return C_RegisterClass;
+ }
+ }
+ return TargetLowering::getConstraintType(Constraint);
+}
+
+std::pair<unsigned, const TargetRegisterClass *>
+NVPTXTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
+ MVT VT) const {
+ if (Constraint.size() == 1) {
+ switch (Constraint[0]) {
+ case 'b':
+ return std::make_pair(0U, &NVPTX::Int1RegsRegClass);
+ case 'c':
+ return std::make_pair(0U, &NVPTX::Int16RegsRegClass);
+ case 'h':
+ return std::make_pair(0U, &NVPTX::Int16RegsRegClass);
+ case 'r':
+ return std::make_pair(0U, &NVPTX::Int32RegsRegClass);
+ case 'l':
+ case 'N':
+ return std::make_pair(0U, &NVPTX::Int64RegsRegClass);
+ case 'f':
+ return std::make_pair(0U, &NVPTX::Float32RegsRegClass);
+ case 'd':
+ return std::make_pair(0U, &NVPTX::Float64RegsRegClass);
+ }
+ }
+ return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
+}
+
+/// getFunctionAlignment - Return the Log2 alignment of this function.
+unsigned NVPTXTargetLowering::getFunctionAlignment(const Function *) const {
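+ // Log2 value of 4, i.e. functions are aligned to 1 << 4 = 16 bytes.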
+ return 4;
+}
+
+//===----------------------------------------------------------------------===//
+// NVPTX DAG Combining
+//===----------------------------------------------------------------------===//
+
+bool NVPTXTargetLowering::allowFMA(MachineFunction &MF,
+ CodeGenOpt::Level OptLevel) const {
+ const Function *F = MF.getFunction();
+ const TargetOptions &TO = MF.getTarget().Options;
+
+ // Always honor command-line argument
+ if (FMAContractLevelOpt.getNumOccurrences() > 0) {
+ return FMAContractLevelOpt > 0;
+ } else if (OptLevel == 0) {
+ // Do not contract if we're not optimizing the code
+ return false;
+ } else if (TO.AllowFPOpFusion == FPOpFusion::Fast || TO.UnsafeFPMath) {
+ // Honor TargetOptions flags that explicitly say fusion is okay
+ return true;
+ } else if (F->hasFnAttribute("unsafe-fp-math")) {
+ // Check for unsafe-fp-math=true coming from Clang
+ Attribute Attr = F->getFnAttribute("unsafe-fp-math");
+ StringRef Val = Attr.getValueAsString();
+ if (Val == "true")
+ return true;
+ }
+
+ // We did not have a clear indication that fusion is allowed, so assume not
+ return false;
+}
+
+/// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
+/// operands N0 and N1. This is a helper for PerformADDCombine that is
+/// called with the default operands, and if that fails, with commuted
+/// operands.
+static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const NVPTXSubtarget &Subtarget,
+ CodeGenOpt::Level OptLevel) {
+ SelectionDAG &DAG = DCI.DAG;
+ // Skip the vector (non-scalar) case.
+ EVT VT = N0.getValueType();
+ if (VT.isVector())
+ return SDValue();
+
+ // fold (add (mul a, b), c) -> (mad a, b, c)
+ //
+ if (N0.getOpcode() == ISD::MUL) {
+ assert(VT.isInteger());
+ // For integer:
+ // Since integer multiply-add costs the same as integer multiply
+ // but is more costly than integer add, do the fusion only when
+ // the mul is only used in the add.
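+ // For example, (add (mul i32 %a, %b), %c) with a single-use mul becomes
+ // (IMAD %a, %b, %c), which is expected to select to a PTX mad.lo
+ // instruction (illustrative mapping).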
+ if (OptLevel == CodeGenOpt::None || VT != MVT::i32 ||
+ !N0.getNode()->hasOneUse())
+ return SDValue();
+
+ // Do the folding
+ return DAG.getNode(NVPTXISD::IMAD, SDLoc(N), VT,
+ N0.getOperand(0), N0.getOperand(1), N1);
+ }
+ else if (N0.getOpcode() == ISD::FMUL) {
+ if (VT == MVT::f32 || VT == MVT::f64) {
+ NVPTXTargetLowering *TLI =
+ (NVPTXTargetLowering *)&DAG.getTargetLoweringInfo();
+ if (!TLI->allowFMA(DAG.getMachineFunction(), OptLevel))
+ return SDValue();
+
+ // For floating point:
+ // Do the fusion only when the mul has fewer than 5 uses and all of
+ // them are adds.
+ // The heuristic is that if a use is not an add, then that use cannot
+ // be fused into an fma, so the mul is still needed anyway.
+ // If there are more than 4 uses, even if they are all adds, fusing
+ // them will increase register pressure.
+ //
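+ // For example, (fadd (fmul f32 %a, %b), %c) may become
+ // (fma %a, %b, %c), i.e. a fused multiply-add (illustrative).
+ //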
+ int numUses = 0;
+ int nonAddCount = 0;
+ for (SDNode::use_iterator UI = N0.getNode()->use_begin(),
+ UE = N0.getNode()->use_end();
+ UI != UE; ++UI) {
+ numUses++;
+ SDNode *User = *UI;
+ if (User->getOpcode() != ISD::FADD)
+ ++nonAddCount;
+ }
+ if (numUses >= 5)
+ return SDValue();
+ if (nonAddCount) {
+ int orderNo = N->getIROrder();
+ int orderNo2 = N0.getNode()->getIROrder();
+ // Simple heuristic for estimating potential register pressure: the
+ // difference in IR order is used to measure the distance between def
+ // and use; the longer the distance, the more likely it is to cause
+ // register pressure.
+ if (orderNo - orderNo2 < 500)
+ return SDValue();
+
+ // Now, check if at least one of the FMUL's operands is live beyond
+ // the node N, which guarantees that the FMA will not increase
+ // register pressure at node N.
+ bool opIsLive = false;
+ const SDNode *left = N0.getOperand(0).getNode();
+ const SDNode *right = N0.getOperand(1).getNode();
+
+ if (dyn_cast<ConstantSDNode>(left) || dyn_cast<ConstantSDNode>(right))
+ opIsLive = true;
+
+ if (!opIsLive)
+ for (SDNode::use_iterator UI = left->use_begin(), UE = left->use_end(); UI != UE; ++UI) {
+ SDNode *User = *UI;
+ int orderNo3 = User->getIROrder();
+ if (orderNo3 > orderNo) {
+ opIsLive = true;
+ break;
+ }
+ }
+
+ if (!opIsLive)
+ for (SDNode::use_iterator UI = right->use_begin(), UE = right->use_end(); UI != UE; ++UI) {
+ SDNode *User = *UI;
+ int orderNo3 = User->getIROrder();
+ if (orderNo3 > orderNo) {
+ opIsLive = true;
+ break;
+ }
+ }
+
+ if (!opIsLive)
+ return SDValue();
+ }
+
+ return DAG.getNode(ISD::FMA, SDLoc(N), VT,
+ N0.getOperand(0), N0.getOperand(1), N1);
+ }
+ }
+
+ return SDValue();
+}
+
+/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
+///
+static SDValue PerformADDCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const NVPTXSubtarget &Subtarget,
+ CodeGenOpt::Level OptLevel) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+
+ // First try with the default operand order.
+ SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget,
+ OptLevel);
+ if (Result.getNode())
+ return Result;
+
+ // If that didn't work, try again with the operands commuted.
+ return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget, OptLevel);
+}
+
+static SDValue PerformANDCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ // The type legalizer turns a vector load of i8 values into a zextload to i16
+ // registers, optionally ANY_EXTENDs it (if target type is integer),
+ // and ANDs off the high 8 bits. Since we turn this load into a
+ // target-specific DAG node, the DAG combiner fails to eliminate these AND
+ // nodes. Do that here.
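+ // For example, (and (any_extend (LoadV2 ... zextload v2i8)), 0xff) keeps
+ // only bits the zero-extending load already cleared, so the AND can be
+ // replaced by the load result (illustrative shape of the pattern below).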
+ SDValue Val = N->getOperand(0);
+ SDValue Mask = N->getOperand(1);
+
+ if (isa<ConstantSDNode>(Val)) {
+ std::swap(Val, Mask);
+ }
+
+ SDValue AExt;
+ // Generally, we will see zextload -> IMOV16rr -> ANY_EXTEND -> and
+ if (Val.getOpcode() == ISD::ANY_EXTEND) {
+ AExt = Val;
+ Val = Val->getOperand(0);
+ }
+
+ if (Val->isMachineOpcode() && Val->getMachineOpcode() == NVPTX::IMOV16rr) {
+ Val = Val->getOperand(0);
+ }
+
+ if (Val->getOpcode() == NVPTXISD::LoadV2 ||
+ Val->getOpcode() == NVPTXISD::LoadV4) {
+ ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(Mask);
+ if (!MaskCnst) {
+ // Not an AND with a constant
+ return SDValue();
+ }
+
+ uint64_t MaskVal = MaskCnst->getZExtValue();
+ if (MaskVal != 0xff) {
+ // Not an AND that chops off top 8 bits
+ return SDValue();
+ }
+
+ MemSDNode *Mem = dyn_cast<MemSDNode>(Val);
+ if (!Mem) {
+ // Not a MemSDNode?!?
+ return SDValue();
+ }
+
+ EVT MemVT = Mem->getMemoryVT();
+ if (MemVT != MVT::v2i8 && MemVT != MVT::v4i8) {
+ // We only handle the i8 case
+ return SDValue();
+ }
+
+ unsigned ExtType =
+ cast<ConstantSDNode>(Val->getOperand(Val->getNumOperands()-1))->
+ getZExtValue();
+ if (ExtType == ISD::SEXTLOAD) {
+ // If for some reason the load is a sextload, the and is needed to zero
+ // out the high 8 bits
+ return SDValue();
+ }
+
+ bool AddTo = false;
+ if (AExt.getNode() != nullptr) {
+ // Re-insert the ext as a zext.
+ Val = DCI.DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N),
+ AExt.getValueType(), Val);
+ AddTo = true;
+ }
+
+ // If we get here, the AND is unnecessary. Just replace it with the load
+ DCI.CombineTo(N, Val, AddTo);
+ }
+
+ return SDValue();
+}
+
+enum OperandSignedness {
+ Signed = 0,
+ Unsigned,
+ Unknown
+};
+
+/// IsMulWideOperandDemotable - Checks if the provided DAG node is an operand
+/// that can be demoted to \p OptSize bits without loss of information. The
+/// signedness of the operand, if determinable, is placed in \p S.
+static bool IsMulWideOperandDemotable(SDValue Op,
+ unsigned OptSize,
+ OperandSignedness &S) {
+ S = Unknown;
+
+ if (Op.getOpcode() == ISD::SIGN_EXTEND ||
+ Op.getOpcode() == ISD::SIGN_EXTEND_INREG) {
+ EVT OrigVT = Op.getOperand(0).getValueType();
+ if (OrigVT.getSizeInBits() == OptSize) {
+ S = Signed;
+ return true;
+ }
+ } else if (Op.getOpcode() == ISD::ZERO_EXTEND) {
+ EVT OrigVT = Op.getOperand(0).getValueType();
+ if (OrigVT.getSizeInBits() == OptSize) {
+ S = Unsigned;
+ return true;
+ }
+ }
+
+ return false;
+}
+
+/// AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can
+/// be demoted to \p OptSize bits without loss of information. If the operands
+/// contain a constant, it should appear as the RHS operand. The signedness of
+/// the operands is placed in \p IsSigned.
+static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS,
+ unsigned OptSize,
+ bool &IsSigned) {
+
+ OperandSignedness LHSSign;
+
+ // The LHS operand must be a demotable op
+ if (!IsMulWideOperandDemotable(LHS, OptSize, LHSSign))
+ return false;
+
+ // We should have been able to determine the signedness from the LHS
+ if (LHSSign == Unknown)
+ return false;
+
+ IsSigned = (LHSSign == Signed);
+
+ // The RHS can be a demotable op or a constant
+ if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(RHS)) {
+ APInt Val = CI->getAPIntValue();
+ if (LHSSign == Unsigned) {
+ if (Val.isIntN(OptSize)) {
+ return true;
+ }
+ return false;
+ } else {
+ if (Val.isSignedIntN(OptSize)) {
+ return true;
+ }
+ return false;
+ }
+ } else {
+ OperandSignedness RHSSign;
+ if (!IsMulWideOperandDemotable(RHS, OptSize, RHSSign))
+ return false;
+
+ if (LHSSign != RHSSign)
+ return false;
+
+ return true;
+ }
+}
+
+/// TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply
+/// of M/2 bits that produces an M-bit result (i.e. mul.wide). This transform
+/// works on both multiply DAG nodes and SHL DAG nodes with a constant shift
+/// amount.
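+///
+/// For example, (mul i32 (sext i16 %a), (sext i16 %b)) can become
+/// (MUL_WIDE_SIGNED i16 %a, i16 %b), which is expected to map to a PTX
+/// mul.wide.s16 producing the full 32-bit product (illustrative).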
+static SDValue TryMULWIDECombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ EVT MulType = N->getValueType(0);
+ if (MulType != MVT::i32 && MulType != MVT::i64) {
+ return SDValue();
+ }
+
+ unsigned OptSize = MulType.getSizeInBits() >> 1;
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+
+ // Canonicalize the multiply so the constant (if any) is on the right
+ if (N->getOpcode() == ISD::MUL) {
+ if (isa<ConstantSDNode>(LHS)) {
+ std::swap(LHS, RHS);
+ }
+ }
+
+ // If we have a SHL, determine the actual multiply amount
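+ // e.g. (shl i32 %x, 4) is treated as (mul i32 %x, 16) below.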
+ if (N->getOpcode() == ISD::SHL) {
+ ConstantSDNode *ShlRHS = dyn_cast<ConstantSDNode>(RHS);
+ if (!ShlRHS) {
+ return SDValue();
+ }
+
+ APInt ShiftAmt = ShlRHS->getAPIntValue();
+ unsigned BitWidth = MulType.getSizeInBits();
+ if (ShiftAmt.sge(0) && ShiftAmt.slt(BitWidth)) {
+ APInt MulVal = APInt(BitWidth, 1) << ShiftAmt;
+ RHS = DCI.DAG.getConstant(MulVal, MulType);
+ } else {
+ return SDValue();
+ }
+ }
+
+ bool Signed;
+ // Verify that our operands are demotable
+ if (!AreMulWideOperandsDemotable(LHS, RHS, OptSize, Signed)) {
+ return SDValue();
+ }
+
+ EVT DemotedVT;
+ if (MulType == MVT::i32) {
+ DemotedVT = MVT::i16;
+ } else {
+ DemotedVT = MVT::i32;
+ }
+
+ // Truncate the operands to the correct size. Note that these are just for
+ // type consistency and will (likely) be eliminated in later phases.
+ SDValue TruncLHS =
+ DCI.DAG.getNode(ISD::TRUNCATE, SDLoc(N), DemotedVT, LHS);
+ SDValue TruncRHS =
+ DCI.DAG.getNode(ISD::TRUNCATE, SDLoc(N), DemotedVT, RHS);
+
+ unsigned Opc;
+ if (Signed) {
+ Opc = NVPTXISD::MUL_WIDE_SIGNED;
+ } else {
+ Opc = NVPTXISD::MUL_WIDE_UNSIGNED;
+ }
+
+ return DCI.DAG.getNode(Opc, SDLoc(N), MulType, TruncLHS, TruncRHS);
+}
+
+/// PerformMULCombine - Runs PTX-specific DAG combine patterns on MUL nodes.
+static SDValue PerformMULCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ CodeGenOpt::Level OptLevel) {
+ if (OptLevel > 0) {
+ // Try mul.wide combining at OptLevel > 0
+ SDValue Ret = TryMULWIDECombine(N, DCI);
+ if (Ret.getNode())
+ return Ret;
+ }
+
+ return SDValue();
+}
+
+/// PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes.
+static SDValue PerformSHLCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ CodeGenOpt::Level OptLevel) {
+ if (OptLevel > 0) {
+ // Try mul.wide combining at OptLevel > 0
+ SDValue Ret = TryMULWIDECombine(N, DCI);
+ if (Ret.getNode())
+ return Ret;
+ }
+
+ return SDValue();
+}
+
+SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ CodeGenOpt::Level OptLevel = getTargetMachine().getOptLevel();
+ switch (N->getOpcode()) {
+ default: break;
+ case ISD::ADD:
+ case ISD::FADD:
+ return PerformADDCombine(N, DCI, nvptxSubtarget, OptLevel);
+ case ISD::MUL:
+ return PerformMULCombine(N, DCI, OptLevel);
+ case ISD::SHL:
+ return PerformSHLCombine(N, DCI, OptLevel);
+ case ISD::AND:
+ return PerformANDCombine(N, DCI);
+ }
+ return SDValue();
+}
+
+/// ReplaceLoadVector - Convert vector loads into multi-output scalar loads.
+static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
+ const DataLayout *TD,
+ SmallVectorImpl<SDValue> &Results) {
+ EVT ResVT = N->getValueType(0);
+ SDLoc DL(N);
+
+ assert(ResVT.isVector() && "Vector load must have vector type");
+
+ // We only handle "native" vector sizes for now, e.g. <4 x double> is not
+ // legal. We can (and should) split that into 2 loads of <2 x double> here
+ // but I'm leaving that as a TODO for now.
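+ // For example, a legal <4 x float> load is rewritten below into a single
+ // NVPTXISD::LoadV4 node that yields four scalar f32 results plus a chain.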
+ assert(ResVT.isSimple() && "Can only handle simple types");
+ switch (ResVT.getSimpleVT().SimpleTy) {
+ default:
+ return;
+ case MVT::v2i8:
+ case MVT::v2i16:
+ case MVT::v2i32:
+ case MVT::v2i64:
+ case MVT::v2f32:
+ case MVT::v2f64:
+ case MVT::v4i8:
+ case MVT::v4i16:
+ case MVT::v4i32:
+ case MVT::v4f32:
+ // This is a "native" vector type
+ break;
+ }
+
+ LoadSDNode *LD = cast<LoadSDNode>(N);
+
+ unsigned Align = LD->getAlignment();
+ unsigned PrefAlign =
+ TD->getPrefTypeAlignment(ResVT.getTypeForEVT(*DAG.getContext()));
+ if (Align < PrefAlign) {
+ // This load is not sufficiently aligned, so bail out and let this vector
+ // load be scalarized. Note that we may still be able to emit smaller
+ // vector loads. For example, if we are loading a <4 x float> with an
+ // alignment of 8, this check will fail but the legalizer will try again
+ // with 2 x <2 x float>, which will succeed with an alignment of 8.
+ return;
+ }
+
+ EVT EltVT = ResVT.getVectorElementType();
+ unsigned NumElts = ResVT.getVectorNumElements();
+
+ // Since LoadV2 is a target node, we cannot rely on DAG type legalization.
+ // Therefore, we must ensure the type is legal. For i1 and i8, we set the
+ // loaded type to i16 and propagate the "real" type as the memory type.
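+ // For example, a <4 x i8> load is emitted as a LoadV4 producing four i16
+ // values with a v4i8 memory type; each lane is truncated back to i8 below.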
+ bool NeedTrunc = false;
+ if (EltVT.getSizeInBits() < 16) {
+ EltVT = MVT::i16;
+ NeedTrunc = true;
+ }
+
+ unsigned Opcode = 0;
+ SDVTList LdResVTs;
+
+ switch (NumElts) {
+ default:
+ return;
+ case 2:
+ Opcode = NVPTXISD::LoadV2;
+ LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
+ break;
+ case 4: {
+ Opcode = NVPTXISD::LoadV4;
+ EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
+ LdResVTs = DAG.getVTList(ListVTs);
+ break;
+ }
+ }
+
+ SmallVector<SDValue, 8> OtherOps;
+
+ // Copy regular operands
+ for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
+ OtherOps.push_back(N->getOperand(i));
+
+ // The select routine does not have access to the LoadSDNode instance, so
+ // pass along the extension information
+ OtherOps.push_back(DAG.getIntPtrConstant(LD->getExtensionType()));
+
+ SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
+ LD->getMemoryVT(),
+ LD->getMemOperand());
+
+ SmallVector<SDValue, 4> ScalarRes;
+
+ for (unsigned i = 0; i < NumElts; ++i) {
+ SDValue Res = NewLD.getValue(i);
+ if (NeedTrunc)
+ Res = DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
+ ScalarRes.push_back(Res);
+ }
+
+ SDValue LoadChain = NewLD.getValue(NumElts);
+
+ SDValue BuildVec = DAG.getNode(ISD::BUILD_VECTOR, DL, ResVT, ScalarRes);
+
+ Results.push_back(BuildVec);
+ Results.push_back(LoadChain);
+}
+
+static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &Results) {
+ SDValue Chain = N->getOperand(0);
+ SDValue Intrin = N->getOperand(1);
+ SDLoc DL(N);
+
+ // Get the intrinsic ID
+ unsigned IntrinNo = cast<ConstantSDNode>(Intrin.getNode())->getZExtValue();
+ switch (IntrinNo) {
+ default:
+ return;
+ case Intrinsic::nvvm_ldg_global_i:
+ case Intrinsic::nvvm_ldg_global_f:
+ case Intrinsic::nvvm_ldg_global_p:
+ case Intrinsic::nvvm_ldu_global_i:
+ case Intrinsic::nvvm_ldu_global_f:
+ case Intrinsic::nvvm_ldu_global_p: {
+ EVT ResVT = N->getValueType(0);
+
+ if (ResVT.isVector()) {
+ // Vector LDG/LDU
+
+ unsigned NumElts = ResVT.getVectorNumElements();
+ EVT EltVT = ResVT.getVectorElementType();
+
+ // Since LDU/LDG are target nodes, we cannot rely on DAG type
+ // legalization. Therefore, we must ensure the type is legal. For i1 and
+ // i8, we set the loaded type to i16 and propagate the "real" type as
+ // the memory type.
+ bool NeedTrunc = false;
+ if (EltVT.getSizeInBits() < 16) {
+ EltVT = MVT::i16;
+ NeedTrunc = true;
+ }
+
+ unsigned Opcode = 0;
+ SDVTList LdResVTs;
+
+ switch (NumElts) {
+ default:
+ return;
+ case 2:
+ switch (IntrinNo) {
+ default:
+ return;
+ case Intrinsic::nvvm_ldg_global_i:
+ case Intrinsic::nvvm_ldg_global_f:
+ case Intrinsic::nvvm_ldg_global_p:
+ Opcode = NVPTXISD::LDGV2;
+ break;
+ case Intrinsic::nvvm_ldu_global_i:
+ case Intrinsic::nvvm_ldu_global_f:
+ case Intrinsic::nvvm_ldu_global_p:
+ Opcode = NVPTXISD::LDUV2;
+ break;
+ }
+ LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
+ break;
+ case 4: {
+ switch (IntrinNo) {
+ default:
+ return;
+ case Intrinsic::nvvm_ldg_global_i:
+ case Intrinsic::nvvm_ldg_global_f:
+ case Intrinsic::nvvm_ldg_global_p:
+ Opcode = NVPTXISD::LDGV4;
+ break;
+ case Intrinsic::nvvm_ldu_global_i:
+ case Intrinsic::nvvm_ldu_global_f:
+ case Intrinsic::nvvm_ldu_global_p:
+ Opcode = NVPTXISD::LDUV4;
+ break;
+ }
+ EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
+ LdResVTs = DAG.getVTList(ListVTs);
+ break;
+ }
+ }
+
+ SmallVector<SDValue, 8> OtherOps;
+
+ // Copy regular operands
+
+ OtherOps.push_back(Chain); // Chain
+ // Skip operand 1 (intrinsic ID)
+ // Others
+ for (unsigned i = 2, e = N->getNumOperands(); i != e; ++i)
+ OtherOps.push_back(N->getOperand(i));
+
+ MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);
+
+ SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
+ MemSD->getMemoryVT(),
+ MemSD->getMemOperand());
+
+ SmallVector<SDValue, 4> ScalarRes;
+
+ for (unsigned i = 0; i < NumElts; ++i) {
+ SDValue Res = NewLD.getValue(i);
+ if (NeedTrunc)
+ Res =
+ DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
+ ScalarRes.push_back(Res);
+ }
+
+ SDValue LoadChain = NewLD.getValue(NumElts);
+
+ SDValue BuildVec =
+ DAG.getNode(ISD::BUILD_VECTOR, DL, ResVT, ScalarRes);
+
+ Results.push_back(BuildVec);
+ Results.push_back(LoadChain);
+ } else {
+ // i8 LDG/LDU
+ assert(ResVT.isSimple() && ResVT.getSimpleVT().SimpleTy == MVT::i8 &&
+ "Custom handling of non-i8 ldu/ldg?");
+
+ // Just copy all operands as-is
+ SmallVector<SDValue, 4> Ops;
+ for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
+ Ops.push_back(N->getOperand(i));
+
+ // Force output to i16
+ SDVTList LdResVTs = DAG.getVTList(MVT::i16, MVT::Other);
+
+ MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);
+
+ // We make sure the memory type is i8, which will be used during isel
+ // to select the proper instruction.
+ SDValue NewLD =
+ DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, LdResVTs, Ops,
+ MVT::i8, MemSD->getMemOperand());
+
+ Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
+ NewLD.getValue(0)));
+ Results.push_back(NewLD.getValue(1));
+ }
+ }
+ }
+}
+
+void NVPTXTargetLowering::ReplaceNodeResults(
+ SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
+ switch (N->getOpcode()) {
+ default:
+ report_fatal_error("Unhandled custom legalization");
+ case ISD::LOAD:
+ ReplaceLoadVector(N, DAG, getDataLayout(), Results);
+ return;
+ case ISD::INTRINSIC_W_CHAIN:
+ ReplaceINTRINSIC_W_CHAIN(N, DAG, Results);
+ return;
+ }
+}
+
+// Pin NVPTXSection's and NVPTXTargetObjectFile's vtables to this file.
+void NVPTXSection::anchor() {}
+
+NVPTXTargetObjectFile::~NVPTXTargetObjectFile() {
+ delete TextSection;
+ delete DataSection;
+ delete BSSSection;
+ delete ReadOnlySection;
+
+ delete StaticCtorSection;
+ delete StaticDtorSection;
+ delete LSDASection;
+ delete EHFrameSection;
+ delete DwarfAbbrevSection;
+ delete DwarfInfoSection;
+ delete DwarfLineSection;
+ delete DwarfFrameSection;
+ delete DwarfPubTypesSection;
+ delete DwarfDebugInlineSection;
+ delete DwarfStrSection;
+ delete DwarfLocSection;
+ delete DwarfARangesSection;
+ delete DwarfRangesSection;
+ delete DwarfMacroInfoSection;
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
new file mode 100644
index 000000000000..bef6ed9faad6
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -0,0 +1,541 @@
+//===-- NVPTXISelLowering.h - NVPTX DAG Lowering Interface ------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that NVPTX uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef NVPTXISELLOWERING_H
+#define NVPTXISELLOWERING_H
+
+#include "NVPTX.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/Target/TargetLowering.h"
+
+namespace llvm {
+namespace NVPTXISD {
+enum NodeType {
+ // Start the numbering from where ISD NodeType finishes.
+ FIRST_NUMBER = ISD::BUILTIN_OP_END,
+ Wrapper,
+ CALL,
+ RET_FLAG,
+ LOAD_PARAM,
+ DeclareParam,
+ DeclareScalarParam,
+ DeclareRetParam,
+ DeclareRet,
+ DeclareScalarRet,
+ PrintCall,
+ PrintCallUni,
+ CallArgBegin,
+ CallArg,
+ LastCallArg,
+ CallArgEnd,
+ CallVoid,
+ CallVal,
+ CallSymbol,
+ Prototype,
+ MoveParam,
+ PseudoUseParam,
+ RETURN,
+ CallSeqBegin,
+ CallSeqEnd,
+ CallPrototype,
+ FUN_SHFL_CLAMP,
+ FUN_SHFR_CLAMP,
+ MUL_WIDE_SIGNED,
+ MUL_WIDE_UNSIGNED,
+ IMAD,
+ Dummy,
+
+ LoadV2 = ISD::FIRST_TARGET_MEMORY_OPCODE,
+ LoadV4,
+ LDGV2, // LDG.v2
+ LDGV4, // LDG.v4
+ LDUV2, // LDU.v2
+ LDUV4, // LDU.v4
+ StoreV2,
+ StoreV4,
+ LoadParam,
+ LoadParamV2,
+ LoadParamV4,
+ StoreParam,
+ StoreParamV2,
+ StoreParamV4,
+ StoreParamS32, // to sext and store a <32-bit value, not currently used
+ StoreParamU32, // to zext and store a <32-bit value, not currently used
+ StoreRetval,
+ StoreRetvalV2,
+ StoreRetvalV4,
+
+ // Texture intrinsics
+ Tex1DFloatS32,
+ Tex1DFloatFloat,
+ Tex1DFloatFloatLevel,
+ Tex1DFloatFloatGrad,
+ Tex1DS32S32,
+ Tex1DS32Float,
+ Tex1DS32FloatLevel,
+ Tex1DS32FloatGrad,
+ Tex1DU32S32,
+ Tex1DU32Float,
+ Tex1DU32FloatLevel,
+ Tex1DU32FloatGrad,
+ Tex1DArrayFloatS32,
+ Tex1DArrayFloatFloat,
+ Tex1DArrayFloatFloatLevel,
+ Tex1DArrayFloatFloatGrad,
+ Tex1DArrayS32S32,
+ Tex1DArrayS32Float,
+ Tex1DArrayS32FloatLevel,
+ Tex1DArrayS32FloatGrad,
+ Tex1DArrayU32S32,
+ Tex1DArrayU32Float,
+ Tex1DArrayU32FloatLevel,
+ Tex1DArrayU32FloatGrad,
+ Tex2DFloatS32,
+ Tex2DFloatFloat,
+ Tex2DFloatFloatLevel,
+ Tex2DFloatFloatGrad,
+ Tex2DS32S32,
+ Tex2DS32Float,
+ Tex2DS32FloatLevel,
+ Tex2DS32FloatGrad,
+ Tex2DU32S32,
+ Tex2DU32Float,
+ Tex2DU32FloatLevel,
+ Tex2DU32FloatGrad,
+ Tex2DArrayFloatS32,
+ Tex2DArrayFloatFloat,
+ Tex2DArrayFloatFloatLevel,
+ Tex2DArrayFloatFloatGrad,
+ Tex2DArrayS32S32,
+ Tex2DArrayS32Float,
+ Tex2DArrayS32FloatLevel,
+ Tex2DArrayS32FloatGrad,
+ Tex2DArrayU32S32,
+ Tex2DArrayU32Float,
+ Tex2DArrayU32FloatLevel,
+ Tex2DArrayU32FloatGrad,
+ Tex3DFloatS32,
+ Tex3DFloatFloat,
+ Tex3DFloatFloatLevel,
+ Tex3DFloatFloatGrad,
+ Tex3DS32S32,
+ Tex3DS32Float,
+ Tex3DS32FloatLevel,
+ Tex3DS32FloatGrad,
+ Tex3DU32S32,
+ Tex3DU32Float,
+ Tex3DU32FloatLevel,
+ Tex3DU32FloatGrad,
+ TexCubeFloatFloat,
+ TexCubeFloatFloatLevel,
+ TexCubeS32Float,
+ TexCubeS32FloatLevel,
+ TexCubeU32Float,
+ TexCubeU32FloatLevel,
+ TexCubeArrayFloatFloat,
+ TexCubeArrayFloatFloatLevel,
+ TexCubeArrayS32Float,
+ TexCubeArrayS32FloatLevel,
+ TexCubeArrayU32Float,
+ TexCubeArrayU32FloatLevel,
+ Tld4R2DFloatFloat,
+ Tld4G2DFloatFloat,
+ Tld4B2DFloatFloat,
+ Tld4A2DFloatFloat,
+ Tld4R2DS64Float,
+ Tld4G2DS64Float,
+ Tld4B2DS64Float,
+ Tld4A2DS64Float,
+ Tld4R2DU64Float,
+ Tld4G2DU64Float,
+ Tld4B2DU64Float,
+ Tld4A2DU64Float,
+ TexUnified1DFloatS32,
+ TexUnified1DFloatFloat,
+ TexUnified1DFloatFloatLevel,
+ TexUnified1DFloatFloatGrad,
+ TexUnified1DS32S32,
+ TexUnified1DS32Float,
+ TexUnified1DS32FloatLevel,
+ TexUnified1DS32FloatGrad,
+ TexUnified1DU32S32,
+ TexUnified1DU32Float,
+ TexUnified1DU32FloatLevel,
+ TexUnified1DU32FloatGrad,
+ TexUnified1DArrayFloatS32,
+ TexUnified1DArrayFloatFloat,
+ TexUnified1DArrayFloatFloatLevel,
+ TexUnified1DArrayFloatFloatGrad,
+ TexUnified1DArrayS32S32,
+ TexUnified1DArrayS32Float,
+ TexUnified1DArrayS32FloatLevel,
+ TexUnified1DArrayS32FloatGrad,
+ TexUnified1DArrayU32S32,
+ TexUnified1DArrayU32Float,
+ TexUnified1DArrayU32FloatLevel,
+ TexUnified1DArrayU32FloatGrad,
+ TexUnified2DFloatS32,
+ TexUnified2DFloatFloat,
+ TexUnified2DFloatFloatLevel,
+ TexUnified2DFloatFloatGrad,
+ TexUnified2DS32S32,
+ TexUnified2DS32Float,
+ TexUnified2DS32FloatLevel,
+ TexUnified2DS32FloatGrad,
+ TexUnified2DU32S32,
+ TexUnified2DU32Float,
+ TexUnified2DU32FloatLevel,
+ TexUnified2DU32FloatGrad,
+ TexUnified2DArrayFloatS32,
+ TexUnified2DArrayFloatFloat,
+ TexUnified2DArrayFloatFloatLevel,
+ TexUnified2DArrayFloatFloatGrad,
+ TexUnified2DArrayS32S32,
+ TexUnified2DArrayS32Float,
+ TexUnified2DArrayS32FloatLevel,
+ TexUnified2DArrayS32FloatGrad,
+ TexUnified2DArrayU32S32,
+ TexUnified2DArrayU32Float,
+ TexUnified2DArrayU32FloatLevel,
+ TexUnified2DArrayU32FloatGrad,
+ TexUnified3DFloatS32,
+ TexUnified3DFloatFloat,
+ TexUnified3DFloatFloatLevel,
+ TexUnified3DFloatFloatGrad,
+ TexUnified3DS32S32,
+ TexUnified3DS32Float,
+ TexUnified3DS32FloatLevel,
+ TexUnified3DS32FloatGrad,
+ TexUnified3DU32S32,
+ TexUnified3DU32Float,
+ TexUnified3DU32FloatLevel,
+ TexUnified3DU32FloatGrad,
+ TexUnifiedCubeFloatFloat,
+ TexUnifiedCubeFloatFloatLevel,
+ TexUnifiedCubeS32Float,
+ TexUnifiedCubeS32FloatLevel,
+ TexUnifiedCubeU32Float,
+ TexUnifiedCubeU32FloatLevel,
+ TexUnifiedCubeArrayFloatFloat,
+ TexUnifiedCubeArrayFloatFloatLevel,
+ TexUnifiedCubeArrayS32Float,
+ TexUnifiedCubeArrayS32FloatLevel,
+ TexUnifiedCubeArrayU32Float,
+ TexUnifiedCubeArrayU32FloatLevel,
+ Tld4UnifiedR2DFloatFloat,
+ Tld4UnifiedG2DFloatFloat,
+ Tld4UnifiedB2DFloatFloat,
+ Tld4UnifiedA2DFloatFloat,
+ Tld4UnifiedR2DS64Float,
+ Tld4UnifiedG2DS64Float,
+ Tld4UnifiedB2DS64Float,
+ Tld4UnifiedA2DS64Float,
+ Tld4UnifiedR2DU64Float,
+ Tld4UnifiedG2DU64Float,
+ Tld4UnifiedB2DU64Float,
+ Tld4UnifiedA2DU64Float,
+
+ // Surface intrinsics
+ Suld1DI8Clamp,
+ Suld1DI16Clamp,
+ Suld1DI32Clamp,
+ Suld1DI64Clamp,
+ Suld1DV2I8Clamp,
+ Suld1DV2I16Clamp,
+ Suld1DV2I32Clamp,
+ Suld1DV2I64Clamp,
+ Suld1DV4I8Clamp,
+ Suld1DV4I16Clamp,
+ Suld1DV4I32Clamp,
+
+ Suld1DArrayI8Clamp,
+ Suld1DArrayI16Clamp,
+ Suld1DArrayI32Clamp,
+ Suld1DArrayI64Clamp,
+ Suld1DArrayV2I8Clamp,
+ Suld1DArrayV2I16Clamp,
+ Suld1DArrayV2I32Clamp,
+ Suld1DArrayV2I64Clamp,
+ Suld1DArrayV4I8Clamp,
+ Suld1DArrayV4I16Clamp,
+ Suld1DArrayV4I32Clamp,
+
+ Suld2DI8Clamp,
+ Suld2DI16Clamp,
+ Suld2DI32Clamp,
+ Suld2DI64Clamp,
+ Suld2DV2I8Clamp,
+ Suld2DV2I16Clamp,
+ Suld2DV2I32Clamp,
+ Suld2DV2I64Clamp,
+ Suld2DV4I8Clamp,
+ Suld2DV4I16Clamp,
+ Suld2DV4I32Clamp,
+
+ Suld2DArrayI8Clamp,
+ Suld2DArrayI16Clamp,
+ Suld2DArrayI32Clamp,
+ Suld2DArrayI64Clamp,
+ Suld2DArrayV2I8Clamp,
+ Suld2DArrayV2I16Clamp,
+ Suld2DArrayV2I32Clamp,
+ Suld2DArrayV2I64Clamp,
+ Suld2DArrayV4I8Clamp,
+ Suld2DArrayV4I16Clamp,
+ Suld2DArrayV4I32Clamp,
+
+ Suld3DI8Clamp,
+ Suld3DI16Clamp,
+ Suld3DI32Clamp,
+ Suld3DI64Clamp,
+ Suld3DV2I8Clamp,
+ Suld3DV2I16Clamp,
+ Suld3DV2I32Clamp,
+ Suld3DV2I64Clamp,
+ Suld3DV4I8Clamp,
+ Suld3DV4I16Clamp,
+ Suld3DV4I32Clamp,
+
+ Suld1DI8Trap,
+ Suld1DI16Trap,
+ Suld1DI32Trap,
+ Suld1DI64Trap,
+ Suld1DV2I8Trap,
+ Suld1DV2I16Trap,
+ Suld1DV2I32Trap,
+ Suld1DV2I64Trap,
+ Suld1DV4I8Trap,
+ Suld1DV4I16Trap,
+ Suld1DV4I32Trap,
+
+ Suld1DArrayI8Trap,
+ Suld1DArrayI16Trap,
+ Suld1DArrayI32Trap,
+ Suld1DArrayI64Trap,
+ Suld1DArrayV2I8Trap,
+ Suld1DArrayV2I16Trap,
+ Suld1DArrayV2I32Trap,
+ Suld1DArrayV2I64Trap,
+ Suld1DArrayV4I8Trap,
+ Suld1DArrayV4I16Trap,
+ Suld1DArrayV4I32Trap,
+
+ Suld2DI8Trap,
+ Suld2DI16Trap,
+ Suld2DI32Trap,
+ Suld2DI64Trap,
+ Suld2DV2I8Trap,
+ Suld2DV2I16Trap,
+ Suld2DV2I32Trap,
+ Suld2DV2I64Trap,
+ Suld2DV4I8Trap,
+ Suld2DV4I16Trap,
+ Suld2DV4I32Trap,
+
+ Suld2DArrayI8Trap,
+ Suld2DArrayI16Trap,
+ Suld2DArrayI32Trap,
+ Suld2DArrayI64Trap,
+ Suld2DArrayV2I8Trap,
+ Suld2DArrayV2I16Trap,
+ Suld2DArrayV2I32Trap,
+ Suld2DArrayV2I64Trap,
+ Suld2DArrayV4I8Trap,
+ Suld2DArrayV4I16Trap,
+ Suld2DArrayV4I32Trap,
+
+ Suld3DI8Trap,
+ Suld3DI16Trap,
+ Suld3DI32Trap,
+ Suld3DI64Trap,
+ Suld3DV2I8Trap,
+ Suld3DV2I16Trap,
+ Suld3DV2I32Trap,
+ Suld3DV2I64Trap,
+ Suld3DV4I8Trap,
+ Suld3DV4I16Trap,
+ Suld3DV4I32Trap,
+
+ Suld1DI8Zero,
+ Suld1DI16Zero,
+ Suld1DI32Zero,
+ Suld1DI64Zero,
+ Suld1DV2I8Zero,
+ Suld1DV2I16Zero,
+ Suld1DV2I32Zero,
+ Suld1DV2I64Zero,
+ Suld1DV4I8Zero,
+ Suld1DV4I16Zero,
+ Suld1DV4I32Zero,
+
+ Suld1DArrayI8Zero,
+ Suld1DArrayI16Zero,
+ Suld1DArrayI32Zero,
+ Suld1DArrayI64Zero,
+ Suld1DArrayV2I8Zero,
+ Suld1DArrayV2I16Zero,
+ Suld1DArrayV2I32Zero,
+ Suld1DArrayV2I64Zero,
+ Suld1DArrayV4I8Zero,
+ Suld1DArrayV4I16Zero,
+ Suld1DArrayV4I32Zero,
+
+ Suld2DI8Zero,
+ Suld2DI16Zero,
+ Suld2DI32Zero,
+ Suld2DI64Zero,
+ Suld2DV2I8Zero,
+ Suld2DV2I16Zero,
+ Suld2DV2I32Zero,
+ Suld2DV2I64Zero,
+ Suld2DV4I8Zero,
+ Suld2DV4I16Zero,
+ Suld2DV4I32Zero,
+
+ Suld2DArrayI8Zero,
+ Suld2DArrayI16Zero,
+ Suld2DArrayI32Zero,
+ Suld2DArrayI64Zero,
+ Suld2DArrayV2I8Zero,
+ Suld2DArrayV2I16Zero,
+ Suld2DArrayV2I32Zero,
+ Suld2DArrayV2I64Zero,
+ Suld2DArrayV4I8Zero,
+ Suld2DArrayV4I16Zero,
+ Suld2DArrayV4I32Zero,
+
+ Suld3DI8Zero,
+ Suld3DI16Zero,
+ Suld3DI32Zero,
+ Suld3DI64Zero,
+ Suld3DV2I8Zero,
+ Suld3DV2I16Zero,
+ Suld3DV2I32Zero,
+ Suld3DV2I64Zero,
+ Suld3DV4I8Zero,
+ Suld3DV4I16Zero,
+ Suld3DV4I32Zero
+};
+}
+
+class NVPTXSubtarget;
+
+//===--------------------------------------------------------------------===//
+// TargetLowering Implementation
+//===--------------------------------------------------------------------===//
+class NVPTXTargetLowering : public TargetLowering {
+public:
+ explicit NVPTXTargetLowering(NVPTXTargetMachine &TM);
+ SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+
+ SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerGlobalAddress(const GlobalValue *GV, int64_t Offset,
+ SelectionDAG &DAG) const;
+
+ const char *getTargetNodeName(unsigned Opcode) const override;
+
+ bool isTypeSupportedInIntrinsic(MVT VT) const;
+
+ bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I,
+ unsigned Intrinsic) const override;
+
+ /// isLegalAddressingMode - Return true if the addressing mode represented
+  /// by AM is legal for this target, for a load/store of the specified type.
+  /// Used to guide target-specific optimizations, like loop strength
+  /// reduction (LoopStrengthReduce.cpp) and memory optimization for
+  /// address mode (CodeGenPrepare.cpp).
+ bool isLegalAddressingMode(const AddrMode &AM, Type *Ty) const override;
+
+ /// getFunctionAlignment - Return the Log2 alignment of this function.
+ unsigned getFunctionAlignment(const Function *F) const;
+
+ EVT getSetCCResultType(LLVMContext &Ctx, EVT VT) const override {
+ if (VT.isVector())
+ return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
+ return MVT::i1;
+ }
+
+ ConstraintType
+ getConstraintType(const std::string &Constraint) const override;
+ std::pair<unsigned, const TargetRegisterClass *>
+ getRegForInlineAsmConstraint(const std::string &Constraint,
+ MVT VT) const override;
+
+ SDValue LowerFormalArguments(
+ SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const override;
+
+ SDValue LowerCall(CallLoweringInfo &CLI,
+ SmallVectorImpl<SDValue> &InVals) const override;
+
+ std::string getPrototype(Type *, const ArgListTy &,
+ const SmallVectorImpl<ISD::OutputArg> &,
+ unsigned retAlignment,
+ const ImmutableCallSite *CS) const;
+
+ SDValue
+ LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals, SDLoc dl,
+ SelectionDAG &DAG) const override;
+
+ void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint,
+ std::vector<SDValue> &Ops,
+ SelectionDAG &DAG) const override;
+
+ NVPTXTargetMachine *nvTM;
+
+ // PTX always uses 32-bit shift amounts
+ MVT getScalarShiftAmountTy(EVT LHSTy) const override { return MVT::i32; }
+
+ TargetLoweringBase::LegalizeTypeAction
+ getPreferredVectorAction(EVT VT) const override;
+
+ bool allowFMA(MachineFunction &MF, CodeGenOpt::Level OptLevel) const;
+
+ virtual bool isFMAFasterThanFMulAndFAdd(EVT) const {
+ return true;
+ }
+
+private:
+ const NVPTXSubtarget &nvptxSubtarget; // cache the subtarget here
+
+ SDValue getExtSymb(SelectionDAG &DAG, const char *name, int idx,
+ EVT = MVT::i32) const;
+ SDValue getParamSymbol(SelectionDAG &DAG, int idx, EVT) const;
+ SDValue getParamHelpSymbol(SelectionDAG &DAG, int idx);
+
+ SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
+
+ SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerLOADi1(SDValue Op, SelectionDAG &DAG) const;
+
+ SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const;
+
+ SDValue LowerShiftRightParts(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const;
+
+ void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
+ SelectionDAG &DAG) const override;
+ SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
+
+ unsigned getArgumentAlignment(SDValue Callee, const ImmutableCallSite *CS,
+ Type *Ty, unsigned Idx) const;
+};
+} // namespace llvm
+
+#endif // NVPTXISELLOWERING_H
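
A side note on the NVPTXISD enum in the header above: opcodes declared before LoadV2 start at ISD::BUILTIN_OP_END and behave as ordinary target nodes, while LoadV2 and every opcode after it start at ISD::FIRST_TARGET_MEMORY_OPCODE, the range SelectionDAG reserves for target nodes that access memory (such nodes are built as MemSDNodes and carry a MachineMemOperand). A minimal sketch of a classifier over that layout, assuming the enum keeps the order shown above (the helper name is illustrative, not part of the patch):

static bool isNVPTXMemoryOpcode(unsigned Opcode) {
  // Memory-backed target opcodes form the contiguous range that begins at
  // LoadV2; Suld3DV4I32Zero is the last enumerator declared above.
  return Opcode >= NVPTXISD::LoadV2 &&
         Opcode <= NVPTXISD::Suld3DV4I32Zero;
}
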
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXImageOptimizer.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXImageOptimizer.cpp
new file mode 100644
index 000000000000..a98fb37f6e25
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXImageOptimizer.cpp
@@ -0,0 +1,178 @@
+//===-- NVPTXImageOptimizer.cpp - Image optimization pass -----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass implements IR-level optimizations of image access code,
+// including:
+//
+// 1. Eliminate istypep intrinsics when the image access qualifier is known
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTX.h"
+#include "NVPTXUtilities.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Analysis/ConstantFolding.h"
+
+using namespace llvm;
+
+namespace {
+class NVPTXImageOptimizer : public FunctionPass {
+private:
+ static char ID;
+ SmallVector<Instruction*, 4> InstrToDelete;
+
+public:
+ NVPTXImageOptimizer();
+
+ bool runOnFunction(Function &F) override;
+
+private:
+ bool replaceIsTypePSampler(Instruction &I);
+ bool replaceIsTypePSurface(Instruction &I);
+ bool replaceIsTypePTexture(Instruction &I);
+ Value *cleanupValue(Value *V);
+ void replaceWith(Instruction *From, ConstantInt *To);
+};
+}
+
+char NVPTXImageOptimizer::ID = 0;
+
+NVPTXImageOptimizer::NVPTXImageOptimizer()
+ : FunctionPass(ID) {}
+
+bool NVPTXImageOptimizer::runOnFunction(Function &F) {
+ bool Changed = false;
+ InstrToDelete.clear();
+
+ // Look for call instructions in the function
+ for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE;
+ ++BI) {
+ for (BasicBlock::iterator I = (*BI).begin(), E = (*BI).end();
+ I != E; ++I) {
+ Instruction &Instr = *I;
+ if (CallInst *CI = dyn_cast<CallInst>(I)) {
+ Function *CalledF = CI->getCalledFunction();
+ if (CalledF && CalledF->isIntrinsic()) {
+          // This is an intrinsic function call; check if it's an istypep
+ switch (CalledF->getIntrinsicID()) {
+ default: break;
+ case Intrinsic::nvvm_istypep_sampler:
+ Changed |= replaceIsTypePSampler(Instr);
+ break;
+ case Intrinsic::nvvm_istypep_surface:
+ Changed |= replaceIsTypePSurface(Instr);
+ break;
+ case Intrinsic::nvvm_istypep_texture:
+ Changed |= replaceIsTypePTexture(Instr);
+ break;
+ }
+ }
+ }
+ }
+ }
+
+ // Delete any istypep instances we replaced in the IR
+ for (unsigned i = 0, e = InstrToDelete.size(); i != e; ++i)
+ InstrToDelete[i]->eraseFromParent();
+
+ return Changed;
+}
+
+bool NVPTXImageOptimizer::replaceIsTypePSampler(Instruction &I) {
+ Value *TexHandle = cleanupValue(I.getOperand(0));
+ if (isSampler(*TexHandle)) {
+ // This is an OpenCL sampler, so it must be a samplerref
+ replaceWith(&I, ConstantInt::getTrue(I.getContext()));
+ return true;
+ } else if (isImageWriteOnly(*TexHandle) ||
+ isImageReadWrite(*TexHandle) ||
+ isImageReadOnly(*TexHandle)) {
+ // This is an OpenCL image, so it cannot be a samplerref
+ replaceWith(&I, ConstantInt::getFalse(I.getContext()));
+ return true;
+ } else {
+ // The image type is unknown, so we cannot eliminate the intrinsic
+ return false;
+ }
+}
+
+bool NVPTXImageOptimizer::replaceIsTypePSurface(Instruction &I) {
+ Value *TexHandle = cleanupValue(I.getOperand(0));
+ if (isImageReadWrite(*TexHandle) ||
+ isImageWriteOnly(*TexHandle)) {
+    // This is an OpenCL read-write/write-only image, so it must be a surfref
+ replaceWith(&I, ConstantInt::getTrue(I.getContext()));
+ return true;
+ } else if (isImageReadOnly(*TexHandle) ||
+ isSampler(*TexHandle)) {
+    // This is an OpenCL read-only image or a sampler, so it cannot be
+    // a surfref
+ replaceWith(&I, ConstantInt::getFalse(I.getContext()));
+ return true;
+ } else {
+ // The image type is unknown, so we cannot eliminate the intrinsic
+ return false;
+ }
+}
+
+bool NVPTXImageOptimizer::replaceIsTypePTexture(Instruction &I) {
+ Value *TexHandle = cleanupValue(I.getOperand(0));
+ if (isImageReadOnly(*TexHandle)) {
+ // This is an OpenCL read-only image, so it must be a texref
+ replaceWith(&I, ConstantInt::getTrue(I.getContext()));
+ return true;
+ } else if (isImageWriteOnly(*TexHandle) ||
+ isImageReadWrite(*TexHandle) ||
+ isSampler(*TexHandle)) {
+ // This is an OpenCL read-write/write-only image or a sampler, so it
+ // cannot be a texref
+ replaceWith(&I, ConstantInt::getFalse(I.getContext()));
+ return true;
+ } else {
+ // The image type is unknown, so we cannot eliminate the intrinsic
+ return false;
+ }
+}
+
+void NVPTXImageOptimizer::replaceWith(Instruction *From, ConstantInt *To) {
+ // We implement "poor man's DCE" here to make sure any code that is no longer
+ // live is actually unreachable and can be trivially eliminated by the
+ // unreachable block elimination pass.
+ for (CallInst::use_iterator UI = From->use_begin(), UE = From->use_end();
+ UI != UE; ++UI) {
+ if (BranchInst *BI = dyn_cast<BranchInst>(*UI)) {
+ if (BI->isUnconditional()) continue;
+ BasicBlock *Dest;
+ if (To->isZero())
+ // Get false block
+ Dest = BI->getSuccessor(1);
+ else
+ // Get true block
+ Dest = BI->getSuccessor(0);
+ BranchInst::Create(Dest, BI);
+ InstrToDelete.push_back(BI);
+ }
+ }
+ From->replaceAllUsesWith(To);
+ InstrToDelete.push_back(From);
+}
+
+Value *NVPTXImageOptimizer::cleanupValue(Value *V) {
+ if (ExtractValueInst *EVI = dyn_cast<ExtractValueInst>(V)) {
+ return cleanupValue(EVI->getAggregateOperand());
+ }
+ return V;
+}
+
+FunctionPass *llvm::createNVPTXImageOptimizerPass() {
+ return new NVPTXImageOptimizer();
+}
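
A side note on the pass just added: the three replaceIsTypeP* routines implement a small decision table over the access qualifier recovered from the handle. A self-contained sketch of that table, using hypothetical names (HandleKind, Query, foldIsTypeP) purely for illustration:

#include <optional>

enum class HandleKind { Sampler, ReadOnlyImage, WriteOnlyImage, ReadWriteImage, Unknown };
enum class Query { IsSampler, IsSurface, IsTexture };

// Returns the boolean the istypep intrinsic folds to, or nullopt when the
// qualifier is unknown and the intrinsic has to stay in the IR.
std::optional<bool> foldIsTypeP(Query Q, HandleKind K) {
  if (K == HandleKind::Unknown)
    return std::nullopt;
  switch (Q) {
  case Query::IsSampler:
    return K == HandleKind::Sampler;
  case Query::IsSurface:
    return K == HandleKind::ReadWriteImage || K == HandleKind::WriteOnlyImage;
  case Query::IsTexture:
    return K == HandleKind::ReadOnlyImage;
  }
  return std::nullopt;
}
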
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXInstrFormats.td b/contrib/llvm/lib/Target/NVPTX/NVPTXInstrFormats.td
new file mode 100644
index 000000000000..ffcb5d5273a2
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXInstrFormats.td
@@ -0,0 +1,59 @@
+//===- NVPTXInstrFormats.td - NVPTX Instruction Formats-------*- tblgen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Describes the NVPTX instruction formats
+//
+//===----------------------------------------------------------------------===//
+
+// Vector instruction type enum
+class VecInstTypeEnum<bits<4> val> {
+ bits<4> Value=val;
+}
+def VecNOP : VecInstTypeEnum<0>;
+
+// Generic NVPTX Format
+
+class NVPTXInst<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : Instruction {
+ field bits<14> Inst;
+
+ let Namespace = "NVPTX";
+ dag OutOperandList = outs;
+ dag InOperandList = ins;
+ let AsmString = asmstr;
+ let Pattern = pattern;
+
+ // TSFlagFields
+ bits<4> VecInstType = VecNOP.Value;
+ bit IsSimpleMove = 0;
+ bit IsLoad = 0;
+ bit IsStore = 0;
+
+ bit IsTex = 0;
+ bit IsSust = 0;
+ bit IsSurfTexQuery = 0;
+ bit IsTexModeUnified = 0;
+
+  // The following field encodes log2 of the vector size plus one, with 0
+  // meaning the operation is not a surface instruction. For example, if
+  // IsSuld == 2, then the instruction is a suld instruction with vector size
+  // 2**(2-1) = 2.
+ bits<2> IsSuld = 0;
+
+ let TSFlags{3-0} = VecInstType;
+ let TSFlags{4-4} = IsSimpleMove;
+ let TSFlags{5-5} = IsLoad;
+ let TSFlags{6-6} = IsStore;
+ let TSFlags{7} = IsTex;
+ let TSFlags{9-8} = IsSuld;
+ let TSFlags{10} = IsSust;
+ let TSFlags{11} = IsSurfTexQuery;
+ let TSFlags{12} = IsTexModeUnified;
+}
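
A side note on the format class just added: the TSFlags assignments are what NVPTXInstrInfo.cpp (the next file in this diff) reads back through mask/shift constants such as NVPTX::SimpleMoveMask and NVPTX::isLoadShift. A stand-alone sketch of the same decoding, with the bit positions copied from the `let TSFlags{...}` lines above (the constant names here are illustrative, not the ones defined in NVPTXBaseInfo.h):

#include <cstdint>

// Bit positions taken from NVPTXInstrFormats.td: VecInstType occupies bits 3-0,
// IsSimpleMove bit 4, IsLoad bit 5, IsStore bit 6, IsTex bit 7, IsSuld bits 9-8.
constexpr unsigned VecInstTypeShift = 0;
constexpr uint64_t VecInstTypeMask = 0xF;
constexpr unsigned SimpleMoveShift = 4, IsLoadShift = 5, IsStoreShift = 6;

static bool isSimpleMove(uint64_t TSFlags) { return (TSFlags >> SimpleMoveShift) & 1; }
static bool isLoad(uint64_t TSFlags) { return (TSFlags >> IsLoadShift) & 1; }
static bool isStore(uint64_t TSFlags) { return (TSFlags >> IsStoreShift) & 1; }
static unsigned vecInstType(uint64_t TSFlags) {
  return (TSFlags >> VecInstTypeShift) & VecInstTypeMask;
}
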
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp
new file mode 100644
index 000000000000..b5b4fbed0799
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp
@@ -0,0 +1,273 @@
+//===- NVPTXInstrInfo.cpp - NVPTX Instruction Information -----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the NVPTX implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTX.h"
+#include "NVPTXInstrInfo.h"
+#include "NVPTXTargetMachine.h"
+#include "llvm/IR/Function.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+using namespace llvm;
+
+#define GET_INSTRINFO_CTOR_DTOR
+#include "NVPTXGenInstrInfo.inc"
+
+// Pin the vtable to this file.
+void NVPTXInstrInfo::anchor() {}
+
+// FIXME: Add the subtarget support on this constructor.
+NVPTXInstrInfo::NVPTXInstrInfo(NVPTXSubtarget &STI)
+ : NVPTXGenInstrInfo(), RegInfo(STI) {}
+
+void NVPTXInstrInfo::copyPhysReg(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator I, DebugLoc DL,
+ unsigned DestReg, unsigned SrcReg, bool KillSrc) const {
+ const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ const TargetRegisterClass *DestRC = MRI.getRegClass(DestReg);
+ const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
+
+ if (DestRC != SrcRC)
+    report_fatal_error("Attempted to create a cross-class register copy");
+
+ if (DestRC == &NVPTX::Int32RegsRegClass)
+ BuildMI(MBB, I, DL, get(NVPTX::IMOV32rr), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ else if (DestRC == &NVPTX::Int1RegsRegClass)
+ BuildMI(MBB, I, DL, get(NVPTX::IMOV1rr), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ else if (DestRC == &NVPTX::Float32RegsRegClass)
+ BuildMI(MBB, I, DL, get(NVPTX::FMOV32rr), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ else if (DestRC == &NVPTX::Int16RegsRegClass)
+ BuildMI(MBB, I, DL, get(NVPTX::IMOV16rr), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ else if (DestRC == &NVPTX::Int64RegsRegClass)
+ BuildMI(MBB, I, DL, get(NVPTX::IMOV64rr), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ else if (DestRC == &NVPTX::Float64RegsRegClass)
+ BuildMI(MBB, I, DL, get(NVPTX::FMOV64rr), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ else {
+ llvm_unreachable("Bad register copy");
+ }
+}
+
+bool NVPTXInstrInfo::isMoveInstr(const MachineInstr &MI, unsigned &SrcReg,
+ unsigned &DestReg) const {
+ // Look for the appropriate part of TSFlags
+ bool isMove = false;
+
+ unsigned TSFlags =
+ (MI.getDesc().TSFlags & NVPTX::SimpleMoveMask) >> NVPTX::SimpleMoveShift;
+ isMove = (TSFlags == 1);
+
+ if (isMove) {
+ MachineOperand dest = MI.getOperand(0);
+ MachineOperand src = MI.getOperand(1);
+ assert(dest.isReg() && "dest of a movrr is not a reg");
+ assert(src.isReg() && "src of a movrr is not a reg");
+
+ SrcReg = src.getReg();
+ DestReg = dest.getReg();
+ return true;
+ }
+
+ return false;
+}
+
+bool NVPTXInstrInfo::isReadSpecialReg(MachineInstr &MI) const {
+ switch (MI.getOpcode()) {
+ default:
+ return false;
+ case NVPTX::INT_PTX_SREG_NTID_X:
+ case NVPTX::INT_PTX_SREG_NTID_Y:
+ case NVPTX::INT_PTX_SREG_NTID_Z:
+ case NVPTX::INT_PTX_SREG_TID_X:
+ case NVPTX::INT_PTX_SREG_TID_Y:
+ case NVPTX::INT_PTX_SREG_TID_Z:
+ case NVPTX::INT_PTX_SREG_CTAID_X:
+ case NVPTX::INT_PTX_SREG_CTAID_Y:
+ case NVPTX::INT_PTX_SREG_CTAID_Z:
+ case NVPTX::INT_PTX_SREG_NCTAID_X:
+ case NVPTX::INT_PTX_SREG_NCTAID_Y:
+ case NVPTX::INT_PTX_SREG_NCTAID_Z:
+ case NVPTX::INT_PTX_SREG_WARPSIZE:
+ return true;
+ }
+}
+
+bool NVPTXInstrInfo::isLoadInstr(const MachineInstr &MI,
+ unsigned &AddrSpace) const {
+ bool isLoad = false;
+ unsigned TSFlags =
+ (MI.getDesc().TSFlags & NVPTX::isLoadMask) >> NVPTX::isLoadShift;
+ isLoad = (TSFlags == 1);
+ if (isLoad)
+ AddrSpace = getLdStCodeAddrSpace(MI);
+ return isLoad;
+}
+
+bool NVPTXInstrInfo::isStoreInstr(const MachineInstr &MI,
+ unsigned &AddrSpace) const {
+ bool isStore = false;
+ unsigned TSFlags =
+ (MI.getDesc().TSFlags & NVPTX::isStoreMask) >> NVPTX::isStoreShift;
+ isStore = (TSFlags == 1);
+ if (isStore)
+ AddrSpace = getLdStCodeAddrSpace(MI);
+ return isStore;
+}
+
+bool NVPTXInstrInfo::CanTailMerge(const MachineInstr *MI) const {
+ unsigned addrspace = 0;
+ if (MI->getOpcode() == NVPTX::INT_CUDA_SYNCTHREADS)
+ return false;
+ if (isLoadInstr(*MI, addrspace))
+ if (addrspace == NVPTX::PTXLdStInstCode::SHARED)
+ return false;
+ if (isStoreInstr(*MI, addrspace))
+ if (addrspace == NVPTX::PTXLdStInstCode::SHARED)
+ return false;
+ return true;
+}
+
+/// AnalyzeBranch - Analyze the branching code at the end of MBB, returning
+/// true if it cannot be understood (e.g. it's a switch dispatch or isn't
+/// implemented for a target). Upon success, this returns false and returns
+/// with the following information in various cases:
+///
+/// 1. If this block ends with no branches (it just falls through to its succ)
+/// just return false, leaving TBB/FBB null.
+/// 2. If this block ends with only an unconditional branch, it sets TBB to be
+/// the destination block.
+/// 3. If this block ends with a conditional branch and it falls through to
+///    a successor block, it sets TBB to be the branch destination block and a
+/// list of operands that evaluate the condition. These
+/// operands can be passed to other TargetInstrInfo methods to create new
+/// branches.
+/// 4. If this block ends with a conditional branch followed by an unconditional
+///    branch, it returns the 'true' destination in TBB, the 'false' destination
+/// in FBB, and a list of operands that evaluate the condition. These
+/// operands can be passed to other TargetInstrInfo methods to create new
+/// branches.
+///
+/// Note that RemoveBranch and InsertBranch must be implemented to support
+/// cases where this method returns success.
+///
+bool NVPTXInstrInfo::AnalyzeBranch(
+ MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond, bool AllowModify) const {
+ // If the block has no terminators, it just falls into the block after it.
+ MachineBasicBlock::iterator I = MBB.end();
+ if (I == MBB.begin() || !isUnpredicatedTerminator(--I))
+ return false;
+
+ // Get the last instruction in the block.
+ MachineInstr *LastInst = I;
+
+ // If there is only one terminator instruction, process it.
+ if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) {
+ if (LastInst->getOpcode() == NVPTX::GOTO) {
+ TBB = LastInst->getOperand(0).getMBB();
+ return false;
+ } else if (LastInst->getOpcode() == NVPTX::CBranch) {
+ // Block ends with fall-through condbranch.
+ TBB = LastInst->getOperand(1).getMBB();
+ Cond.push_back(LastInst->getOperand(0));
+ return false;
+ }
+ // Otherwise, don't know what this is.
+ return true;
+ }
+
+ // Get the instruction before it if it's a terminator.
+ MachineInstr *SecondLastInst = I;
+
+ // If there are three terminators, we don't know what sort of block this is.
+ if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(--I))
+ return true;
+
+  // If the block ends with NVPTX::CBranch followed by NVPTX::GOTO, handle it.
+ if (SecondLastInst->getOpcode() == NVPTX::CBranch &&
+ LastInst->getOpcode() == NVPTX::GOTO) {
+ TBB = SecondLastInst->getOperand(1).getMBB();
+ Cond.push_back(SecondLastInst->getOperand(0));
+ FBB = LastInst->getOperand(0).getMBB();
+ return false;
+ }
+
+  // If the block ends with two NVPTX::GOTOs, handle it. The second one is not
+ // executed, so remove it.
+ if (SecondLastInst->getOpcode() == NVPTX::GOTO &&
+ LastInst->getOpcode() == NVPTX::GOTO) {
+ TBB = SecondLastInst->getOperand(0).getMBB();
+ I = LastInst;
+ if (AllowModify)
+ I->eraseFromParent();
+ return false;
+ }
+
+ // Otherwise, can't handle this.
+ return true;
+}
+
+unsigned NVPTXInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
+ MachineBasicBlock::iterator I = MBB.end();
+ if (I == MBB.begin())
+ return 0;
+ --I;
+ if (I->getOpcode() != NVPTX::GOTO && I->getOpcode() != NVPTX::CBranch)
+ return 0;
+
+ // Remove the branch.
+ I->eraseFromParent();
+
+ I = MBB.end();
+
+ if (I == MBB.begin())
+ return 1;
+ --I;
+ if (I->getOpcode() != NVPTX::CBranch)
+ return 1;
+
+ // Remove the branch.
+ I->eraseFromParent();
+ return 2;
+}
+
+unsigned NVPTXInstrInfo::InsertBranch(
+ MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB,
+ const SmallVectorImpl<MachineOperand> &Cond, DebugLoc DL) const {
+ // Shouldn't be a fall through.
+ assert(TBB && "InsertBranch must not be told to insert a fallthrough");
+ assert((Cond.size() == 1 || Cond.size() == 0) &&
+         "NVPTX branch conditions have at most one component!");
+
+ // One-way branch.
+ if (!FBB) {
+ if (Cond.empty()) // Unconditional branch
+ BuildMI(&MBB, DL, get(NVPTX::GOTO)).addMBB(TBB);
+ else // Conditional branch
+ BuildMI(&MBB, DL, get(NVPTX::CBranch)).addReg(Cond[0].getReg())
+ .addMBB(TBB);
+ return 1;
+ }
+
+ // Two-way Conditional Branch.
+ BuildMI(&MBB, DL, get(NVPTX::CBranch)).addReg(Cond[0].getReg()).addMBB(TBB);
+ BuildMI(&MBB, DL, get(NVPTX::GOTO)).addMBB(FBB);
+ return 2;
+}
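
A side note on the branch hooks just added: generic CodeGen (the branch folder and block placement) drives AnalyzeBranch, RemoveBranch and InsertBranch as a triple. A hedged sketch of that contract as a caller might exercise it; MBB, NewDest, TII and DL are assumed to be in scope, and the real call sites live in target-independent code:

MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
SmallVector<MachineOperand, 1> Cond;
// A false return value means the terminators were understood.
if (!TII->AnalyzeBranch(MBB, TBB, FBB, Cond, /*AllowModify=*/false)) {
  TII->RemoveBranch(MBB);   // erases the trailing CBranch/GOTO, returning 0, 1 or 2
  Cond.clear();
  // With no condition and no false block this emits a single GOTO to NewDest.
  TII->InsertBranch(MBB, NewDest, nullptr, Cond, DL);
}
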
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.h b/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.h
new file mode 100644
index 000000000000..2ac29748676a
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.h
@@ -0,0 +1,78 @@
+//===- NVPTXInstrInfo.h - NVPTX Instruction Information----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the NVPTX implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef NVPTXINSTRUCTIONINFO_H
+#define NVPTXINSTRUCTIONINFO_H
+
+#include "NVPTX.h"
+#include "NVPTXRegisterInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+#define GET_INSTRINFO_HEADER
+#include "NVPTXGenInstrInfo.inc"
+
+namespace llvm {
+
+class NVPTXInstrInfo : public NVPTXGenInstrInfo {
+ const NVPTXRegisterInfo RegInfo;
+ virtual void anchor();
+public:
+ explicit NVPTXInstrInfo(NVPTXSubtarget &STI);
+
+ const NVPTXRegisterInfo &getRegisterInfo() const { return RegInfo; }
+
+ /* The following virtual functions are used in register allocation.
+ * They are not implemented because the existing interface and the logic
+ * at the caller side do not work for the elementized vector load and store.
+ *
+ * virtual unsigned isLoadFromStackSlot(const MachineInstr *MI,
+ * int &FrameIndex) const;
+ * virtual unsigned isStoreToStackSlot(const MachineInstr *MI,
+ * int &FrameIndex) const;
+ * virtual void storeRegToStackSlot(MachineBasicBlock &MBB,
+ * MachineBasicBlock::iterator MBBI,
+ * unsigned SrcReg, bool isKill, int FrameIndex,
+ * const TargetRegisterClass *RC) const;
+ * virtual void loadRegFromStackSlot(MachineBasicBlock &MBB,
+ * MachineBasicBlock::iterator MBBI,
+ * unsigned DestReg, int FrameIndex,
+ * const TargetRegisterClass *RC) const;
+ */
+
+ void copyPhysReg(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator I, DebugLoc DL,
+ unsigned DestReg, unsigned SrcReg, bool KillSrc) const override;
+ virtual bool isMoveInstr(const MachineInstr &MI, unsigned &SrcReg,
+ unsigned &DestReg) const;
+ bool isLoadInstr(const MachineInstr &MI, unsigned &AddrSpace) const;
+ bool isStoreInstr(const MachineInstr &MI, unsigned &AddrSpace) const;
+ bool isReadSpecialReg(MachineInstr &MI) const;
+
+ virtual bool CanTailMerge(const MachineInstr *MI) const;
+ // Branch analysis.
+ bool AnalyzeBranch(
+ MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond, bool AllowModify) const override;
+ unsigned RemoveBranch(MachineBasicBlock &MBB) const override;
+ unsigned InsertBranch(
+ MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB,
+ const SmallVectorImpl<MachineOperand> &Cond, DebugLoc DL) const override;
+ unsigned getLdStCodeAddrSpace(const MachineInstr &MI) const {
+ return MI.getOperand(2).getImm();
+ }
+
+};
+
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
new file mode 100644
index 000000000000..9900b8c8433f
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -0,0 +1,2741 @@
+//===- NVPTXInstrInfo.td - NVPTX Instruction defs -------------*- tblgen-*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the PTX instructions in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+include "NVPTXInstrFormats.td"
+
+// A NOP instruction
+def NOP : NVPTXInst<(outs), (ins), "", []>;
+
+// List of vector specific properties
+def isVecLD : VecInstTypeEnum<1>;
+def isVecST : VecInstTypeEnum<2>;
+def isVecBuild : VecInstTypeEnum<3>;
+def isVecShuffle : VecInstTypeEnum<4>;
+def isVecExtract : VecInstTypeEnum<5>;
+def isVecInsert : VecInstTypeEnum<6>;
+def isVecDest : VecInstTypeEnum<7>;
+def isVecOther : VecInstTypeEnum<15>;
+
+//===----------------------------------------------------------------------===//
+// NVPTX Operand Definitions.
+//===----------------------------------------------------------------------===//
+
+def brtarget : Operand<OtherVT>;
+
+// CVT conversion modes
+// These must match the enum in NVPTX.h
+def CvtNONE : PatLeaf<(i32 0x0)>;
+def CvtRNI : PatLeaf<(i32 0x1)>;
+def CvtRZI : PatLeaf<(i32 0x2)>;
+def CvtRMI : PatLeaf<(i32 0x3)>;
+def CvtRPI : PatLeaf<(i32 0x4)>;
+def CvtRN : PatLeaf<(i32 0x5)>;
+def CvtRZ : PatLeaf<(i32 0x6)>;
+def CvtRM : PatLeaf<(i32 0x7)>;
+def CvtRP : PatLeaf<(i32 0x8)>;
+
+def CvtNONE_FTZ : PatLeaf<(i32 0x10)>;
+def CvtRNI_FTZ : PatLeaf<(i32 0x11)>;
+def CvtRZI_FTZ : PatLeaf<(i32 0x12)>;
+def CvtRMI_FTZ : PatLeaf<(i32 0x13)>;
+def CvtRPI_FTZ : PatLeaf<(i32 0x14)>;
+def CvtRN_FTZ : PatLeaf<(i32 0x15)>;
+def CvtRZ_FTZ : PatLeaf<(i32 0x16)>;
+def CvtRM_FTZ : PatLeaf<(i32 0x17)>;
+def CvtRP_FTZ : PatLeaf<(i32 0x18)>;
+
+def CvtSAT : PatLeaf<(i32 0x20)>;
+def CvtSAT_FTZ : PatLeaf<(i32 0x30)>;
+
+def CvtMode : Operand<i32> {
+ let PrintMethod = "printCvtMode";
+}
+
+// Compare modes
+// These must match the enum in NVPTX.h
+def CmpEQ : PatLeaf<(i32 0)>;
+def CmpNE : PatLeaf<(i32 1)>;
+def CmpLT : PatLeaf<(i32 2)>;
+def CmpLE : PatLeaf<(i32 3)>;
+def CmpGT : PatLeaf<(i32 4)>;
+def CmpGE : PatLeaf<(i32 5)>;
+def CmpLO : PatLeaf<(i32 6)>;
+def CmpLS : PatLeaf<(i32 7)>;
+def CmpHI : PatLeaf<(i32 8)>;
+def CmpHS : PatLeaf<(i32 9)>;
+def CmpEQU : PatLeaf<(i32 10)>;
+def CmpNEU : PatLeaf<(i32 11)>;
+def CmpLTU : PatLeaf<(i32 12)>;
+def CmpLEU : PatLeaf<(i32 13)>;
+def CmpGTU : PatLeaf<(i32 14)>;
+def CmpGEU : PatLeaf<(i32 15)>;
+def CmpNUM : PatLeaf<(i32 16)>;
+def CmpNAN : PatLeaf<(i32 17)>;
+
+def CmpEQ_FTZ : PatLeaf<(i32 0x100)>;
+def CmpNE_FTZ : PatLeaf<(i32 0x101)>;
+def CmpLT_FTZ : PatLeaf<(i32 0x102)>;
+def CmpLE_FTZ : PatLeaf<(i32 0x103)>;
+def CmpGT_FTZ : PatLeaf<(i32 0x104)>;
+def CmpGE_FTZ : PatLeaf<(i32 0x105)>;
+def CmpLO_FTZ : PatLeaf<(i32 0x106)>;
+def CmpLS_FTZ : PatLeaf<(i32 0x107)>;
+def CmpHI_FTZ : PatLeaf<(i32 0x108)>;
+def CmpHS_FTZ : PatLeaf<(i32 0x109)>;
+def CmpEQU_FTZ : PatLeaf<(i32 0x10A)>;
+def CmpNEU_FTZ : PatLeaf<(i32 0x10B)>;
+def CmpLTU_FTZ : PatLeaf<(i32 0x10C)>;
+def CmpLEU_FTZ : PatLeaf<(i32 0x10D)>;
+def CmpGTU_FTZ : PatLeaf<(i32 0x10E)>;
+def CmpGEU_FTZ : PatLeaf<(i32 0x10F)>;
+def CmpNUM_FTZ : PatLeaf<(i32 0x110)>;
+def CmpNAN_FTZ : PatLeaf<(i32 0x111)>;
+
+def CmpMode : Operand<i32> {
+ let PrintMethod = "printCmpMode";
+}
+
+def F32ConstZero : Operand<f32>, PatLeaf<(f32 fpimm)>, SDNodeXForm<fpimm, [{
+ return CurDAG->getTargetConstantFP(0.0, MVT::f32);
+ }]>;
+def F32ConstOne : Operand<f32>, PatLeaf<(f32 fpimm)>, SDNodeXForm<fpimm, [{
+ return CurDAG->getTargetConstantFP(1.0, MVT::f32);
+ }]>;
+
+//===----------------------------------------------------------------------===//
+// NVPTX Instruction Predicate Definitions
+//===----------------------------------------------------------------------===//
+
+
+def hasAtomRedG32 : Predicate<"Subtarget.hasAtomRedG32()">;
+def hasAtomRedS32 : Predicate<"Subtarget.hasAtomRedS32()">;
+def hasAtomRedGen32 : Predicate<"Subtarget.hasAtomRedGen32()">;
+def useAtomRedG32forGen32 :
+ Predicate<"!Subtarget.hasAtomRedGen32() && Subtarget.hasAtomRedG32()">;
+def hasBrkPt : Predicate<"Subtarget.hasBrkPt()">;
+def hasAtomRedG64 : Predicate<"Subtarget.hasAtomRedG64()">;
+def hasAtomRedS64 : Predicate<"Subtarget.hasAtomRedS64()">;
+def hasAtomRedGen64 : Predicate<"Subtarget.hasAtomRedGen64()">;
+def useAtomRedG64forGen64 :
+ Predicate<"!Subtarget.hasAtomRedGen64() && Subtarget.hasAtomRedG64()">;
+def hasAtomAddF32 : Predicate<"Subtarget.hasAtomAddF32()">;
+def hasVote : Predicate<"Subtarget.hasVote()">;
+def hasDouble : Predicate<"Subtarget.hasDouble()">;
+def reqPTX20 : Predicate<"Subtarget.reqPTX20()">;
+def hasLDG : Predicate<"Subtarget.hasLDG()">;
+def hasLDU : Predicate<"Subtarget.hasLDU()">;
+def hasGenericLdSt : Predicate<"Subtarget.hasGenericLdSt()">;
+
+def doF32FTZ : Predicate<"useF32FTZ()">;
+def doNoF32FTZ : Predicate<"!useF32FTZ()">;
+
+def doMulWide : Predicate<"doMulWide">;
+
+def allowFMA : Predicate<"allowFMA()">;
+def noFMA : Predicate<"!allowFMA()">;
+
+def do_DIVF32_APPROX : Predicate<"getDivF32Level()==0">;
+def do_DIVF32_FULL : Predicate<"getDivF32Level()==1">;
+
+def do_SQRTF32_APPROX : Predicate<"!usePrecSqrtF32()">;
+def do_SQRTF32_RN : Predicate<"usePrecSqrtF32()">;
+
+def hasHWROT32 : Predicate<"Subtarget.hasHWROT32()">;
+def noHWROT32 : Predicate<"!Subtarget.hasHWROT32()">;
+
+def true : Predicate<"1">;
+
+def hasPTX31 : Predicate<"Subtarget.getPTXVersion() >= 31">;
+
+
+//===----------------------------------------------------------------------===//
+// Some Common Instruction Class Templates
+//===----------------------------------------------------------------------===//
+
+multiclass I3<string OpcStr, SDNode OpNode> {
+ def i64rr : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b),
+ !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
+ [(set Int64Regs:$dst, (OpNode Int64Regs:$a,
+ Int64Regs:$b))]>;
+ def i64ri : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b),
+ !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
+ [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>;
+ def i32rr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
+ !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
+ [(set Int32Regs:$dst, (OpNode Int32Regs:$a,
+ Int32Regs:$b))]>;
+ def i32ri : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
+ !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
+ [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>;
+ def i16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
+ !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
+ [(set Int16Regs:$dst, (OpNode Int16Regs:$a,
+ Int16Regs:$b))]>;
+ def i16ri : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
+ !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
+ [(set Int16Regs:$dst, (OpNode Int16Regs:$a, (imm):$b))]>;
+}
+
+multiclass ADD_SUB_INT_32<string OpcStr, SDNode OpNode> {
+ def i32rr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a,
+ Int32Regs:$b),
+ !strconcat(OpcStr, ".s32 \t$dst, $a, $b;"),
+ [(set Int32Regs:$dst, (OpNode Int32Regs:$a,
+ Int32Regs:$b))]>;
+ def i32ri : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
+ !strconcat(OpcStr, ".s32 \t$dst, $a, $b;"),
+ [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>;
+}
+
+multiclass F3<string OpcStr, SDNode OpNode> {
+ def f64rr : NVPTXInst<(outs Float64Regs:$dst),
+ (ins Float64Regs:$a, Float64Regs:$b),
+ !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"),
+ [(set Float64Regs:$dst,
+ (OpNode Float64Regs:$a, Float64Regs:$b))]>,
+ Requires<[allowFMA]>;
+ def f64ri : NVPTXInst<(outs Float64Regs:$dst),
+ (ins Float64Regs:$a, f64imm:$b),
+ !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"),
+ [(set Float64Regs:$dst,
+ (OpNode Float64Regs:$a, fpimm:$b))]>,
+ Requires<[allowFMA]>;
+ def f32rr_ftz : NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, Float32Regs:$b),
+ !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"),
+ [(set Float32Regs:$dst,
+ (OpNode Float32Regs:$a, Float32Regs:$b))]>,
+ Requires<[allowFMA, doF32FTZ]>;
+ def f32ri_ftz : NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, f32imm:$b),
+ !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"),
+ [(set Float32Regs:$dst,
+ (OpNode Float32Regs:$a, fpimm:$b))]>,
+ Requires<[allowFMA, doF32FTZ]>;
+ def f32rr : NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, Float32Regs:$b),
+ !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"),
+ [(set Float32Regs:$dst,
+ (OpNode Float32Regs:$a, Float32Regs:$b))]>,
+ Requires<[allowFMA]>;
+ def f32ri : NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, f32imm:$b),
+ !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"),
+ [(set Float32Regs:$dst,
+ (OpNode Float32Regs:$a, fpimm:$b))]>,
+ Requires<[allowFMA]>;
+}
+
+multiclass F3_rn<string OpcStr, SDNode OpNode> {
+ def f64rr : NVPTXInst<(outs Float64Regs:$dst),
+ (ins Float64Regs:$a, Float64Regs:$b),
+ !strconcat(OpcStr, ".rn.f64 \t$dst, $a, $b;"),
+ [(set Float64Regs:$dst,
+ (OpNode Float64Regs:$a, Float64Regs:$b))]>,
+ Requires<[noFMA]>;
+ def f64ri : NVPTXInst<(outs Float64Regs:$dst),
+ (ins Float64Regs:$a, f64imm:$b),
+ !strconcat(OpcStr, ".rn.f64 \t$dst, $a, $b;"),
+ [(set Float64Regs:$dst,
+ (OpNode Float64Regs:$a, fpimm:$b))]>,
+ Requires<[noFMA]>;
+ def f32rr_ftz : NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, Float32Regs:$b),
+ !strconcat(OpcStr, ".rn.ftz.f32 \t$dst, $a, $b;"),
+ [(set Float32Regs:$dst,
+ (OpNode Float32Regs:$a, Float32Regs:$b))]>,
+ Requires<[noFMA, doF32FTZ]>;
+ def f32ri_ftz : NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, f32imm:$b),
+ !strconcat(OpcStr, ".rn.ftz.f32 \t$dst, $a, $b;"),
+ [(set Float32Regs:$dst,
+ (OpNode Float32Regs:$a, fpimm:$b))]>,
+ Requires<[noFMA, doF32FTZ]>;
+ def f32rr : NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, Float32Regs:$b),
+ !strconcat(OpcStr, ".rn.f32 \t$dst, $a, $b;"),
+ [(set Float32Regs:$dst,
+ (OpNode Float32Regs:$a, Float32Regs:$b))]>,
+ Requires<[noFMA]>;
+ def f32ri : NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, f32imm:$b),
+ !strconcat(OpcStr, ".rn.f32 \t$dst, $a, $b;"),
+ [(set Float32Regs:$dst,
+ (OpNode Float32Regs:$a, fpimm:$b))]>,
+ Requires<[noFMA]>;
+}
+
+multiclass F2<string OpcStr, SDNode OpNode> {
+ def f64 : NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a),
+ !strconcat(OpcStr, ".f64 \t$dst, $a;"),
+ [(set Float64Regs:$dst, (OpNode Float64Regs:$a))]>;
+ def f32_ftz : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a),
+ !strconcat(OpcStr, ".ftz.f32 \t$dst, $a;"),
+ [(set Float32Regs:$dst, (OpNode Float32Regs:$a))]>,
+ Requires<[doF32FTZ]>;
+ def f32 : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a),
+ !strconcat(OpcStr, ".f32 \t$dst, $a;"),
+ [(set Float32Regs:$dst, (OpNode Float32Regs:$a))]>;
+}
+
+//===----------------------------------------------------------------------===//
+// NVPTX Instructions.
+//===----------------------------------------------------------------------===//
+
+//-----------------------------------
+// General Type Conversion
+//-----------------------------------
+
+let neverHasSideEffects = 1 in {
+// Generate a cvt to the given type from all possible types.
+// Each instance takes a CvtMode immediate that defines the conversion mode to
+// use. It can be CvtNONE to omit a conversion mode.
+multiclass CVT_FROM_ALL<string FromName, RegisterClass RC> {
+ def _s16 : NVPTXInst<(outs RC:$dst),
+ (ins Int16Regs:$src, CvtMode:$mode),
+ !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+ FromName, ".s16\t$dst, $src;"),
+ []>;
+ def _u16 : NVPTXInst<(outs RC:$dst),
+ (ins Int16Regs:$src, CvtMode:$mode),
+ !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+ FromName, ".u16\t$dst, $src;"),
+ []>;
+ def _f16 : NVPTXInst<(outs RC:$dst),
+ (ins Int16Regs:$src, CvtMode:$mode),
+ !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+ FromName, ".f16\t$dst, $src;"),
+ []>;
+ def _s32 : NVPTXInst<(outs RC:$dst),
+ (ins Int32Regs:$src, CvtMode:$mode),
+ !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+ FromName, ".s32\t$dst, $src;"),
+ []>;
+ def _u32 : NVPTXInst<(outs RC:$dst),
+ (ins Int32Regs:$src, CvtMode:$mode),
+ !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+ FromName, ".u32\t$dst, $src;"),
+ []>;
+ def _s64 : NVPTXInst<(outs RC:$dst),
+ (ins Int64Regs:$src, CvtMode:$mode),
+ !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+ FromName, ".s64\t$dst, $src;"),
+ []>;
+ def _u64 : NVPTXInst<(outs RC:$dst),
+ (ins Int64Regs:$src, CvtMode:$mode),
+ !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+ FromName, ".u64\t$dst, $src;"),
+ []>;
+ def _f32 : NVPTXInst<(outs RC:$dst),
+ (ins Float32Regs:$src, CvtMode:$mode),
+ !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+ FromName, ".f32\t$dst, $src;"),
+ []>;
+ def _f64 : NVPTXInst<(outs RC:$dst),
+ (ins Float64Regs:$src, CvtMode:$mode),
+ !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+ FromName, ".f64\t$dst, $src;"),
+ []>;
+}
+
+// Generate a cvt to all possible types.
+defm CVT_s16 : CVT_FROM_ALL<"s16", Int16Regs>;
+defm CVT_u16 : CVT_FROM_ALL<"u16", Int16Regs>;
+defm CVT_f16 : CVT_FROM_ALL<"f16", Int16Regs>;
+defm CVT_s32 : CVT_FROM_ALL<"s32", Int32Regs>;
+defm CVT_u32 : CVT_FROM_ALL<"u32", Int32Regs>;
+defm CVT_s64 : CVT_FROM_ALL<"s64", Int64Regs>;
+defm CVT_u64 : CVT_FROM_ALL<"u64", Int64Regs>;
+defm CVT_f32 : CVT_FROM_ALL<"f32", Float32Regs>;
+defm CVT_f64 : CVT_FROM_ALL<"f64", Float64Regs>;
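+
+// Note: as an example of the strings these multiclasses expand to, the
+// s32-from-f32 instance (CVT_s32_f32) with mode CvtRZI_FTZ (0x12) should print
+// "cvt.rzi.ftz.s32.f32 \t$dst, $src;", while CvtNONE drops the modifiers and
+// leaves a plain "cvt.s32.f32" (the ${mode:...} pieces are expanded by the
+// printCvtMode method named on the CvtMode operand above).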
+
+// This set of cvt instructions differs from the above: the source and
+// target register types are the same.
+//
+def CVT_INREG_s16_s8 : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
+ "cvt.s16.s8 \t$dst, $src;", []>;
+def CVT_INREG_s32_s8 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
+ "cvt.s32.s8 \t$dst, $src;", []>;
+def CVT_INREG_s32_s16 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
+ "cvt.s32.s16 \t$dst, $src;", []>;
+def CVT_INREG_s64_s8 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
+ "cvt.s64.s8 \t$dst, $src;", []>;
+def CVT_INREG_s64_s16 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
+ "cvt.s64.s16 \t$dst, $src;", []>;
+def CVT_INREG_s64_s32 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
+ "cvt.s64.s32 \t$dst, $src;", []>;
+}
+
+//-----------------------------------
+// Integer Arithmetic
+//-----------------------------------
+
+multiclass ADD_SUB_i1<SDNode OpNode> {
+ def _rr: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, Int1Regs:$b),
+ "xor.pred \t$dst, $a, $b;",
+ [(set Int1Regs:$dst, (OpNode Int1Regs:$a, Int1Regs:$b))]>;
+ def _ri: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, i1imm:$b),
+ "xor.pred \t$dst, $a, $b;",
+ [(set Int1Regs:$dst, (OpNode Int1Regs:$a, (imm):$b))]>;
+}
+
+defm ADD_i1 : ADD_SUB_i1<add>;
+defm SUB_i1 : ADD_SUB_i1<sub>;
+
+
+defm ADD : I3<"add.s", add>;
+defm SUB : I3<"sub.s", sub>;
+
+defm ADDCC : ADD_SUB_INT_32<"add.cc", addc>;
+defm SUBCC : ADD_SUB_INT_32<"sub.cc", subc>;
+
+defm ADDCCC : ADD_SUB_INT_32<"addc.cc", adde>;
+defm SUBCCC : ADD_SUB_INT_32<"subc.cc", sube>;
+
+// mul.wide PTX instructions
+def SInt32Const : PatLeaf<(imm), [{
+ const APInt &v = N->getAPIntValue();
+ if (v.isSignedIntN(32))
+ return true;
+ return false;
+}]>;
+
+def UInt32Const : PatLeaf<(imm), [{
+ const APInt &v = N->getAPIntValue();
+ if (v.isIntN(32))
+ return true;
+ return false;
+}]>;
+
+def SInt16Const : PatLeaf<(imm), [{
+ const APInt &v = N->getAPIntValue();
+ if (v.isSignedIntN(16))
+ return true;
+ return false;
+}]>;
+
+def UInt16Const : PatLeaf<(imm), [{
+ const APInt &v = N->getAPIntValue();
+ if (v.isIntN(16))
+ return true;
+ return false;
+}]>;
+
+def Int5Const : PatLeaf<(imm), [{
+ const APInt &v = N->getAPIntValue();
+  // Check if 0 <= v < 32.
+  // Only then can (x << v) be selected as a mul.wide by the 32-bit
+  // immediate 2**v (see SHL2MUL32 below).
+ if (v.sge(0) && v.slt(32))
+ return true;
+ return false;
+}]>;
+
+def Int4Const : PatLeaf<(imm), [{
+ const APInt &v = N->getAPIntValue();
+  // Check if 0 <= v < 16.
+  // Only then can (x << v) be selected as a mul.wide by the 16-bit
+  // immediate 2**v (see SHL2MUL16 below).
+ if (v.sge(0) && v.slt(16))
+ return true;
+ return false;
+}]>;
+
+def SHL2MUL32 : SDNodeXForm<imm, [{
+ const APInt &v = N->getAPIntValue();
+ APInt temp(32, 1);
+ return CurDAG->getTargetConstant(temp.shl(v), MVT::i32);
+}]>;
+
+def SHL2MUL16 : SDNodeXForm<imm, [{
+ const APInt &v = N->getAPIntValue();
+ APInt temp(16, 1);
+ return CurDAG->getTargetConstant(temp.shl(v), MVT::i16);
+}]>;
+
+def MULWIDES64
+ : NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
+ "mul.wide.s32 \t$dst, $a, $b;", []>;
+def MULWIDES64Imm
+ : NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
+ "mul.wide.s32 \t$dst, $a, $b;", []>;
+def MULWIDES64Imm64
+ : NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i64imm:$b),
+ "mul.wide.s32 \t$dst, $a, $b;", []>;
+
+def MULWIDEU64
+ : NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
+ "mul.wide.u32 \t$dst, $a, $b;", []>;
+def MULWIDEU64Imm
+ : NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
+ "mul.wide.u32 \t$dst, $a, $b;", []>;
+def MULWIDEU64Imm64
+ : NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i64imm:$b),
+ "mul.wide.u32 \t$dst, $a, $b;", []>;
+
+def MULWIDES32
+ : NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
+ "mul.wide.s16 \t$dst, $a, $b;", []>;
+def MULWIDES32Imm
+ : NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
+ "mul.wide.s16 \t$dst, $a, $b;", []>;
+def MULWIDES32Imm32
+ : NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
+ "mul.wide.s16 \t$dst, $a, $b;", []>;
+
+def MULWIDEU32
+ : NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
+ "mul.wide.u16 \t$dst, $a, $b;", []>;
+def MULWIDEU32Imm
+ : NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
+ "mul.wide.u16 \t$dst, $a, $b;", []>;
+def MULWIDEU32Imm32
+ : NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
+ "mul.wide.u16 \t$dst, $a, $b;", []>;
+
+def : Pat<(shl (sext Int32Regs:$a), (i32 Int5Const:$b)),
+ (MULWIDES64Imm Int32Regs:$a, (SHL2MUL32 node:$b))>,
+ Requires<[doMulWide]>;
+def : Pat<(shl (zext Int32Regs:$a), (i32 Int5Const:$b)),
+ (MULWIDEU64Imm Int32Regs:$a, (SHL2MUL32 node:$b))>,
+ Requires<[doMulWide]>;
+
+def : Pat<(shl (sext Int16Regs:$a), (i16 Int4Const:$b)),
+ (MULWIDES32Imm Int16Regs:$a, (SHL2MUL16 node:$b))>,
+ Requires<[doMulWide]>;
+def : Pat<(shl (zext Int16Regs:$a), (i16 Int4Const:$b)),
+ (MULWIDEU32Imm Int16Regs:$a, (SHL2MUL16 node:$b))>,
+ Requires<[doMulWide]>;
+
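+// Note: the shl patterns above rely on SHL2MUL32/SHL2MUL16 to turn the shift
+// amount into a power-of-two immediate, e.g.
+//   shl (sext i32 %a to i64), 3  ==>  MULWIDES64Imm %a, 8  ; x << 3 == x * 2**3
+// and the Int5Const/Int4Const guards keep 2**v representable in the 32-/16-bit
+// immediate operand.
+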
+def : Pat<(mul (sext Int32Regs:$a), (sext Int32Regs:$b)),
+ (MULWIDES64 Int32Regs:$a, Int32Regs:$b)>,
+ Requires<[doMulWide]>;
+def : Pat<(mul (sext Int32Regs:$a), (i64 SInt32Const:$b)),
+ (MULWIDES64Imm64 Int32Regs:$a, (i64 SInt32Const:$b))>,
+ Requires<[doMulWide]>;
+
+def : Pat<(mul (zext Int32Regs:$a), (zext Int32Regs:$b)),
+ (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>,
+ Requires<[doMulWide]>;
+def : Pat<(mul (zext Int32Regs:$a), (i64 UInt32Const:$b)),
+ (MULWIDEU64Imm64 Int32Regs:$a, (i64 UInt32Const:$b))>,
+ Requires<[doMulWide]>;
+
+def : Pat<(mul (sext Int16Regs:$a), (sext Int16Regs:$b)),
+ (MULWIDES32 Int16Regs:$a, Int16Regs:$b)>,
+ Requires<[doMulWide]>;
+def : Pat<(mul (sext Int16Regs:$a), (i32 SInt16Const:$b)),
+ (MULWIDES32Imm32 Int16Regs:$a, (i32 SInt16Const:$b))>,
+ Requires<[doMulWide]>;
+
+def : Pat<(mul (zext Int16Regs:$a), (zext Int16Regs:$b)),
+ (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>,
+ Requires<[doMulWide]>;
+def : Pat<(mul (zext Int16Regs:$a), (i32 UInt16Const:$b)),
+ (MULWIDEU32Imm32 Int16Regs:$a, (i32 UInt16Const:$b))>,
+ Requires<[doMulWide]>;
+
+
+def SDTMulWide
+ : SDTypeProfile<1, 2, [SDTCisSameAs<1, 2>]>;
+def mul_wide_signed
+ : SDNode<"NVPTXISD::MUL_WIDE_SIGNED", SDTMulWide>;
+def mul_wide_unsigned
+ : SDNode<"NVPTXISD::MUL_WIDE_UNSIGNED", SDTMulWide>;
+
+def : Pat<(i32 (mul_wide_signed Int16Regs:$a, Int16Regs:$b)),
+ (MULWIDES32 Int16Regs:$a, Int16Regs:$b)>,
+ Requires<[doMulWide]>;
+def : Pat<(i32 (mul_wide_signed Int16Regs:$a, imm:$b)),
+ (MULWIDES32Imm Int16Regs:$a, imm:$b)>,
+ Requires<[doMulWide]>;
+def : Pat<(i32 (mul_wide_unsigned Int16Regs:$a, Int16Regs:$b)),
+ (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>,
+ Requires<[doMulWide]>;
+def : Pat<(i32 (mul_wide_unsigned Int16Regs:$a, imm:$b)),
+ (MULWIDEU32Imm Int16Regs:$a, imm:$b)>,
+ Requires<[doMulWide]>;
+
+
+def : Pat<(i64 (mul_wide_signed Int32Regs:$a, Int32Regs:$b)),
+ (MULWIDES64 Int32Regs:$a, Int32Regs:$b)>,
+ Requires<[doMulWide]>;
+def : Pat<(i64 (mul_wide_signed Int32Regs:$a, imm:$b)),
+ (MULWIDES64Imm Int32Regs:$a, imm:$b)>,
+ Requires<[doMulWide]>;
+def : Pat<(i64 (mul_wide_unsigned Int32Regs:$a, Int32Regs:$b)),
+ (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>,
+ Requires<[doMulWide]>;
+def : Pat<(i64 (mul_wide_unsigned Int32Regs:$a, imm:$b)),
+ (MULWIDEU64Imm Int32Regs:$a, imm:$b)>,
+ Requires<[doMulWide]>;
+
+defm MULT : I3<"mul.lo.s", mul>;
+
+defm MULTHS : I3<"mul.hi.s", mulhs>;
+defm MULTHU : I3<"mul.hi.u", mulhu>;
+
+defm SDIV : I3<"div.s", sdiv>;
+defm UDIV : I3<"div.u", udiv>;
+
+defm SREM : I3<"rem.s", srem>;
+// The ri version will not be selected as DAGCombiner::visitSREM will lower it.
+defm UREM : I3<"rem.u", urem>;
+// The ri version will not be selected as DAGCombiner::visitUREM will lower it.
+
+def SDTIMAD
+ : SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisInt<0>,
+ SDTCisInt<2>, SDTCisSameAs<0, 2>,
+ SDTCisSameAs<0, 3>]>;
+def imad
+ : SDNode<"NVPTXISD::IMAD", SDTIMAD>;
+
+def MAD16rrr : NVPTXInst<(outs Int16Regs:$dst),
+ (ins Int16Regs:$a, Int16Regs:$b, Int16Regs:$c),
+ "mad.lo.s16 \t$dst, $a, $b, $c;",
+ [(set Int16Regs:$dst,
+ (imad Int16Regs:$a, Int16Regs:$b, Int16Regs:$c))]>;
+def MAD16rri : NVPTXInst<(outs Int16Regs:$dst),
+ (ins Int16Regs:$a, Int16Regs:$b, i16imm:$c),
+ "mad.lo.s16 \t$dst, $a, $b, $c;",
+ [(set Int16Regs:$dst,
+ (imad Int16Regs:$a, Int16Regs:$b, imm:$c))]>;
+def MAD16rir : NVPTXInst<(outs Int16Regs:$dst),
+ (ins Int16Regs:$a, i16imm:$b, Int16Regs:$c),
+ "mad.lo.s16 \t$dst, $a, $b, $c;",
+ [(set Int16Regs:$dst,
+ (imad Int16Regs:$a, imm:$b, Int16Regs:$c))]>;
+def MAD16rii : NVPTXInst<(outs Int16Regs:$dst),
+ (ins Int16Regs:$a, i16imm:$b, i16imm:$c),
+ "mad.lo.s16 \t$dst, $a, $b, $c;",
+ [(set Int16Regs:$dst,
+ (imad Int16Regs:$a, imm:$b, imm:$c))]>;
+
+def MAD32rrr : NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$a, Int32Regs:$b, Int32Regs:$c),
+ "mad.lo.s32 \t$dst, $a, $b, $c;",
+ [(set Int32Regs:$dst,
+ (imad Int32Regs:$a, Int32Regs:$b, Int32Regs:$c))]>;
+def MAD32rri : NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$a, Int32Regs:$b, i32imm:$c),
+ "mad.lo.s32 \t$dst, $a, $b, $c;",
+ [(set Int32Regs:$dst,
+ (imad Int32Regs:$a, Int32Regs:$b, imm:$c))]>;
+def MAD32rir : NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$a, i32imm:$b, Int32Regs:$c),
+ "mad.lo.s32 \t$dst, $a, $b, $c;",
+ [(set Int32Regs:$dst,
+ (imad Int32Regs:$a, imm:$b, Int32Regs:$c))]>;
+def MAD32rii : NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$a, i32imm:$b, i32imm:$c),
+ "mad.lo.s32 \t$dst, $a, $b, $c;",
+ [(set Int32Regs:$dst,
+ (imad Int32Regs:$a, imm:$b, imm:$c))]>;
+
+def MAD64rrr : NVPTXInst<(outs Int64Regs:$dst),
+ (ins Int64Regs:$a, Int64Regs:$b, Int64Regs:$c),
+ "mad.lo.s64 \t$dst, $a, $b, $c;",
+ [(set Int64Regs:$dst,
+ (imad Int64Regs:$a, Int64Regs:$b, Int64Regs:$c))]>;
+def MAD64rri : NVPTXInst<(outs Int64Regs:$dst),
+ (ins Int64Regs:$a, Int64Regs:$b, i64imm:$c),
+ "mad.lo.s64 \t$dst, $a, $b, $c;",
+ [(set Int64Regs:$dst,
+ (imad Int64Regs:$a, Int64Regs:$b, imm:$c))]>;
+def MAD64rir : NVPTXInst<(outs Int64Regs:$dst),
+ (ins Int64Regs:$a, i64imm:$b, Int64Regs:$c),
+ "mad.lo.s64 \t$dst, $a, $b, $c;",
+ [(set Int64Regs:$dst,
+ (imad Int64Regs:$a, imm:$b, Int64Regs:$c))]>;
+def MAD64rii : NVPTXInst<(outs Int64Regs:$dst),
+ (ins Int64Regs:$a, i64imm:$b, i64imm:$c),
+ "mad.lo.s64 \t$dst, $a, $b, $c;",
+ [(set Int64Regs:$dst,
+ (imad Int64Regs:$a, imm:$b, imm:$c))]>;
+
+def INEG16 : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
+ "neg.s16 \t$dst, $src;",
+ [(set Int16Regs:$dst, (ineg Int16Regs:$src))]>;
+def INEG32 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
+ "neg.s32 \t$dst, $src;",
+ [(set Int32Regs:$dst, (ineg Int32Regs:$src))]>;
+def INEG64 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
+ "neg.s64 \t$dst, $src;",
+ [(set Int64Regs:$dst, (ineg Int64Regs:$src))]>;
+
+//-----------------------------------
+// Floating Point Arithmetic
+//-----------------------------------
+
+// Constant 1.0f
+def FloatConst1 : PatLeaf<(fpimm), [{
+ if (&(N->getValueAPF().getSemantics()) != &llvm::APFloat::IEEEsingle)
+ return false;
+ float f = (float)N->getValueAPF().convertToFloat();
+ return (f==1.0f);
+}]>;
+// Constant (double)1.0
+def DoubleConst1 : PatLeaf<(fpimm), [{
+ if (&(N->getValueAPF().getSemantics()) != &llvm::APFloat::IEEEdouble)
+ return false;
+ double d = (double)N->getValueAPF().convertToDouble();
+ return (d==1.0);
+}]>;
+
+defm FADD : F3<"add", fadd>;
+defm FSUB : F3<"sub", fsub>;
+defm FMUL : F3<"mul", fmul>;
+
+defm FADD_rn : F3_rn<"add", fadd>;
+defm FSUB_rn : F3_rn<"sub", fsub>;
+defm FMUL_rn : F3_rn<"mul", fmul>;
+
+defm FABS : F2<"abs", fabs>;
+defm FNEG : F2<"neg", fneg>;
+defm FSQRT : F2<"sqrt.rn", fsqrt>;
+
+//
+// F64 division
+//
+def FDIV641r : NVPTXInst<(outs Float64Regs:$dst),
+ (ins f64imm:$a, Float64Regs:$b),
+ "rcp.rn.f64 \t$dst, $b;",
+ [(set Float64Regs:$dst,
+ (fdiv DoubleConst1:$a, Float64Regs:$b))]>;
+def FDIV64rr : NVPTXInst<(outs Float64Regs:$dst),
+ (ins Float64Regs:$a, Float64Regs:$b),
+ "div.rn.f64 \t$dst, $a, $b;",
+ [(set Float64Regs:$dst,
+ (fdiv Float64Regs:$a, Float64Regs:$b))]>;
+def FDIV64ri : NVPTXInst<(outs Float64Regs:$dst),
+ (ins Float64Regs:$a, f64imm:$b),
+ "div.rn.f64 \t$dst, $a, $b;",
+ [(set Float64Regs:$dst,
+ (fdiv Float64Regs:$a, fpimm:$b))]>;
+
+//
+// F32 Approximate reciprocal
+//
+def FDIV321r_ftz : NVPTXInst<(outs Float32Regs:$dst),
+ (ins f32imm:$a, Float32Regs:$b),
+ "rcp.approx.ftz.f32 \t$dst, $b;",
+ [(set Float32Regs:$dst,
+ (fdiv FloatConst1:$a, Float32Regs:$b))]>,
+ Requires<[do_DIVF32_APPROX, doF32FTZ]>;
+def FDIV321r : NVPTXInst<(outs Float32Regs:$dst),
+ (ins f32imm:$a, Float32Regs:$b),
+ "rcp.approx.f32 \t$dst, $b;",
+ [(set Float32Regs:$dst,
+ (fdiv FloatConst1:$a, Float32Regs:$b))]>,
+ Requires<[do_DIVF32_APPROX]>;
+//
+// F32 Approximate division
+//
+def FDIV32approxrr_ftz : NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, Float32Regs:$b),
+ "div.approx.ftz.f32 \t$dst, $a, $b;",
+ [(set Float32Regs:$dst,
+ (fdiv Float32Regs:$a, Float32Regs:$b))]>,
+ Requires<[do_DIVF32_APPROX, doF32FTZ]>;
+def FDIV32approxri_ftz : NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, f32imm:$b),
+ "div.approx.ftz.f32 \t$dst, $a, $b;",
+ [(set Float32Regs:$dst,
+ (fdiv Float32Regs:$a, fpimm:$b))]>,
+ Requires<[do_DIVF32_APPROX, doF32FTZ]>;
+def FDIV32approxrr : NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, Float32Regs:$b),
+ "div.approx.f32 \t$dst, $a, $b;",
+ [(set Float32Regs:$dst,
+ (fdiv Float32Regs:$a, Float32Regs:$b))]>,
+ Requires<[do_DIVF32_APPROX]>;
+def FDIV32approxri : NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, f32imm:$b),
+ "div.approx.f32 \t$dst, $a, $b;",
+ [(set Float32Regs:$dst,
+ (fdiv Float32Regs:$a, fpimm:$b))]>,
+ Requires<[do_DIVF32_APPROX]>;
+//
+// F32 Semi-accurate reciprocal
+//
+// rcp.approx gives the same result as div.full(1.0f, a) and is faster.
+//
+def FDIV321r_approx_ftz : NVPTXInst<(outs Float32Regs:$dst),
+ (ins f32imm:$a, Float32Regs:$b),
+ "rcp.approx.ftz.f32 \t$dst, $b;",
+ [(set Float32Regs:$dst,
+ (fdiv FloatConst1:$a, Float32Regs:$b))]>,
+ Requires<[do_DIVF32_FULL, doF32FTZ]>;
+def FDIV321r_approx : NVPTXInst<(outs Float32Regs:$dst),
+ (ins f32imm:$a, Float32Regs:$b),
+ "rcp.approx.f32 \t$dst, $b;",
+ [(set Float32Regs:$dst,
+ (fdiv FloatConst1:$a, Float32Regs:$b))]>,
+ Requires<[do_DIVF32_FULL]>;
+//
+// F32 Semi-accurate division
+//
+def FDIV32rr_ftz : NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, Float32Regs:$b),
+ "div.full.ftz.f32 \t$dst, $a, $b;",
+ [(set Float32Regs:$dst,
+ (fdiv Float32Regs:$a, Float32Regs:$b))]>,
+ Requires<[do_DIVF32_FULL, doF32FTZ]>;
+def FDIV32ri_ftz : NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, f32imm:$b),
+ "div.full.ftz.f32 \t$dst, $a, $b;",
+ [(set Float32Regs:$dst,
+ (fdiv Float32Regs:$a, fpimm:$b))]>,
+ Requires<[do_DIVF32_FULL, doF32FTZ]>;
+def FDIV32rr : NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, Float32Regs:$b),
+ "div.full.f32 \t$dst, $a, $b;",
+ [(set Float32Regs:$dst,
+ (fdiv Float32Regs:$a, Float32Regs:$b))]>,
+ Requires<[do_DIVF32_FULL]>;
+def FDIV32ri : NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, f32imm:$b),
+ "div.full.f32 \t$dst, $a, $b;",
+ [(set Float32Regs:$dst,
+ (fdiv Float32Regs:$a, fpimm:$b))]>,
+ Requires<[do_DIVF32_FULL]>;
+//
+// F32 Accurate reciprocal
+//
+def FDIV321r_prec_ftz : NVPTXInst<(outs Float32Regs:$dst),
+ (ins f32imm:$a, Float32Regs:$b),
+ "rcp.rn.ftz.f32 \t$dst, $b;",
+ [(set Float32Regs:$dst,
+ (fdiv FloatConst1:$a, Float32Regs:$b))]>,
+ Requires<[reqPTX20, doF32FTZ]>;
+def FDIV321r_prec : NVPTXInst<(outs Float32Regs:$dst),
+ (ins f32imm:$a, Float32Regs:$b),
+ "rcp.rn.f32 \t$dst, $b;",
+ [(set Float32Regs:$dst,
+ (fdiv FloatConst1:$a, Float32Regs:$b))]>,
+ Requires<[reqPTX20]>;
+//
+// F32 Accurate division
+//
+def FDIV32rr_prec_ftz : NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, Float32Regs:$b),
+ "div.rn.ftz.f32 \t$dst, $a, $b;",
+ [(set Float32Regs:$dst,
+ (fdiv Float32Regs:$a, Float32Regs:$b))]>,
+ Requires<[doF32FTZ, reqPTX20]>;
+def FDIV32ri_prec_ftz : NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, f32imm:$b),
+ "div.rn.ftz.f32 \t$dst, $a, $b;",
+ [(set Float32Regs:$dst,
+ (fdiv Float32Regs:$a, fpimm:$b))]>,
+ Requires<[doF32FTZ, reqPTX20]>;
+def FDIV32rr_prec : NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, Float32Regs:$b),
+ "div.rn.f32 \t$dst, $a, $b;",
+ [(set Float32Regs:$dst,
+ (fdiv Float32Regs:$a, Float32Regs:$b))]>,
+ Requires<[reqPTX20]>;
+def FDIV32ri_prec : NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, f32imm:$b),
+ "div.rn.f32 \t$dst, $a, $b;",
+ [(set Float32Regs:$dst,
+ (fdiv Float32Regs:$a, fpimm:$b))]>,
+ Requires<[reqPTX20]>;
+
+//
+// F32 rsqrt
+//
+
+def RSQRTF32approx1r : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$b),
+ "rsqrt.approx.f32 \t$dst, $b;", []>;
+
+def: Pat<(fdiv FloatConst1, (int_nvvm_sqrt_f Float32Regs:$b)),
+ (RSQRTF32approx1r Float32Regs:$b)>,
+ Requires<[do_DIVF32_FULL, do_SQRTF32_APPROX, doNoF32FTZ]>;
+
+multiclass FPCONTRACT32<string OpcStr, Predicate Pred> {
+ def rrr : NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, Float32Regs:$b, Float32Regs:$c),
+ !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
+ [(set Float32Regs:$dst,
+ (fma Float32Regs:$a, Float32Regs:$b, Float32Regs:$c))]>,
+ Requires<[Pred]>;
+ def rri : NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, Float32Regs:$b, f32imm:$c),
+ !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
+ [(set Float32Regs:$dst,
+ (fma Float32Regs:$a, Float32Regs:$b, fpimm:$c))]>,
+ Requires<[Pred]>;
+ def rir : NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, f32imm:$b, Float32Regs:$c),
+ !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
+ [(set Float32Regs:$dst,
+ (fma Float32Regs:$a, fpimm:$b, Float32Regs:$c))]>,
+ Requires<[Pred]>;
+ def rii : NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, f32imm:$b, f32imm:$c),
+ !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
+ [(set Float32Regs:$dst,
+ (fma Float32Regs:$a, fpimm:$b, fpimm:$c))]>,
+ Requires<[Pred]>;
+}
+
+multiclass FPCONTRACT64<string OpcStr, Predicate Pred> {
+ def rrr : NVPTXInst<(outs Float64Regs:$dst),
+ (ins Float64Regs:$a, Float64Regs:$b, Float64Regs:$c),
+ !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
+ [(set Float64Regs:$dst,
+ (fma Float64Regs:$a, Float64Regs:$b, Float64Regs:$c))]>,
+ Requires<[Pred]>;
+ def rri : NVPTXInst<(outs Float64Regs:$dst),
+ (ins Float64Regs:$a, Float64Regs:$b, f64imm:$c),
+ !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
+ [(set Float64Regs:$dst,
+ (fma Float64Regs:$a, Float64Regs:$b, fpimm:$c))]>,
+ Requires<[Pred]>;
+ def rir : NVPTXInst<(outs Float64Regs:$dst),
+ (ins Float64Regs:$a, f64imm:$b, Float64Regs:$c),
+ !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
+ [(set Float64Regs:$dst,
+ (fma Float64Regs:$a, fpimm:$b, Float64Regs:$c))]>,
+ Requires<[Pred]>;
+ def rii : NVPTXInst<(outs Float64Regs:$dst),
+ (ins Float64Regs:$a, f64imm:$b, f64imm:$c),
+ !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
+ [(set Float64Regs:$dst,
+ (fma Float64Regs:$a, fpimm:$b, fpimm:$c))]>,
+ Requires<[Pred]>;
+}
+
+defm FMA32_ftz : FPCONTRACT32<"fma.rn.ftz.f32", doF32FTZ>;
+defm FMA32 : FPCONTRACT32<"fma.rn.f32", true>;
+defm FMA64 : FPCONTRACT64<"fma.rn.f64", true>;
+
+def SINF: NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
+ "sin.approx.f32 \t$dst, $src;",
+ [(set Float32Regs:$dst, (fsin Float32Regs:$src))]>;
+def COSF: NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
+ "cos.approx.f32 \t$dst, $src;",
+ [(set Float32Regs:$dst, (fcos Float32Regs:$src))]>;
+
+// Lower (frem x, y) into (sub x, (mul (floor (div x, y)) y))
+// i.e., a "poor man's fmod()"
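+//
+// A quick numeric sketch of this expansion (positive operands; CvtRMI rounds
+// toward minus infinity, so it acts as the floor step here):
+//   frem(7.5, 2.0) = 7.5 - floor(7.5 / 2.0) * 2.0
+//                  = 7.5 - 3.0 * 2.0
+//                  = 1.5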
+
+// frem - f32 FTZ
+def : Pat<(frem Float32Regs:$x, Float32Regs:$y),
+ (FSUBf32rr_ftz Float32Regs:$x, (FMULf32rr_ftz (CVT_f32_f32
+ (FDIV32rr_prec_ftz Float32Regs:$x, Float32Regs:$y), CvtRMI_FTZ),
+ Float32Regs:$y))>,
+ Requires<[doF32FTZ]>;
+def : Pat<(frem Float32Regs:$x, fpimm:$y),
+ (FSUBf32rr_ftz Float32Regs:$x, (FMULf32ri_ftz (CVT_f32_f32
+ (FDIV32ri_prec_ftz Float32Regs:$x, fpimm:$y), CvtRMI_FTZ),
+ fpimm:$y))>,
+ Requires<[doF32FTZ]>;
+
+// frem - f32
+def : Pat<(frem Float32Regs:$x, Float32Regs:$y),
+ (FSUBf32rr Float32Regs:$x, (FMULf32rr (CVT_f32_f32
+ (FDIV32rr_prec Float32Regs:$x, Float32Regs:$y), CvtRMI),
+ Float32Regs:$y))>;
+def : Pat<(frem Float32Regs:$x, fpimm:$y),
+ (FSUBf32rr Float32Regs:$x, (FMULf32ri (CVT_f32_f32
+ (FDIV32ri_prec Float32Regs:$x, fpimm:$y), CvtRMI),
+ fpimm:$y))>;
+
+// frem - f64
+def : Pat<(frem Float64Regs:$x, Float64Regs:$y),
+ (FSUBf64rr Float64Regs:$x, (FMULf64rr (CVT_f64_f64
+ (FDIV64rr Float64Regs:$x, Float64Regs:$y), CvtRMI),
+ Float64Regs:$y))>;
+def : Pat<(frem Float64Regs:$x, fpimm:$y),
+ (FSUBf64rr Float64Regs:$x, (FMULf64ri (CVT_f64_f64
+ (FDIV64ri Float64Regs:$x, fpimm:$y), CvtRMI),
+ fpimm:$y))>;
+
+//-----------------------------------
+// Logical Arithmetic
+//-----------------------------------
+
+multiclass LOG_FORMAT<string OpcStr, SDNode OpNode> {
+ def b1rr: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, Int1Regs:$b),
+ !strconcat(OpcStr, ".pred \t$dst, $a, $b;"),
+ [(set Int1Regs:$dst, (OpNode Int1Regs:$a, Int1Regs:$b))]>;
+ def b1ri: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, i1imm:$b),
+ !strconcat(OpcStr, ".pred \t$dst, $a, $b;"),
+ [(set Int1Regs:$dst, (OpNode Int1Regs:$a, imm:$b))]>;
+ def b16rr: NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
+ !strconcat(OpcStr, ".b16 \t$dst, $a, $b;"),
+ [(set Int16Regs:$dst, (OpNode Int16Regs:$a,
+ Int16Regs:$b))]>;
+ def b16ri: NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
+ !strconcat(OpcStr, ".b16 \t$dst, $a, $b;"),
+ [(set Int16Regs:$dst, (OpNode Int16Regs:$a, imm:$b))]>;
+ def b32rr: NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
+ !strconcat(OpcStr, ".b32 \t$dst, $a, $b;"),
+ [(set Int32Regs:$dst, (OpNode Int32Regs:$a,
+ Int32Regs:$b))]>;
+ def b32ri: NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
+ !strconcat(OpcStr, ".b32 \t$dst, $a, $b;"),
+ [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>;
+ def b64rr: NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b),
+ !strconcat(OpcStr, ".b64 \t$dst, $a, $b;"),
+ [(set Int64Regs:$dst, (OpNode Int64Regs:$a,
+ Int64Regs:$b))]>;
+ def b64ri: NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b),
+ !strconcat(OpcStr, ".b64 \t$dst, $a, $b;"),
+ [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>;
+}
+
+defm OR : LOG_FORMAT<"or", or>;
+defm AND : LOG_FORMAT<"and", and>;
+defm XOR : LOG_FORMAT<"xor", xor>;
+
+def NOT1: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$src),
+ "not.pred \t$dst, $src;",
+ [(set Int1Regs:$dst, (not Int1Regs:$src))]>;
+def NOT16: NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
+ "not.b16 \t$dst, $src;",
+ [(set Int16Regs:$dst, (not Int16Regs:$src))]>;
+def NOT32: NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
+ "not.b32 \t$dst, $src;",
+ [(set Int32Regs:$dst, (not Int32Regs:$src))]>;
+def NOT64: NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
+ "not.b64 \t$dst, $src;",
+ [(set Int64Regs:$dst, (not Int64Regs:$src))]>;
+
+// For shifts, the second src operand must be a 32-bit value
+multiclass LSHIFT_FORMAT<string OpcStr, SDNode OpNode> {
+ def i64rr : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a,
+ Int32Regs:$b),
+ !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
+ [(set Int64Regs:$dst, (OpNode Int64Regs:$a,
+ Int32Regs:$b))]>;
+ def i64ri : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i32imm:$b),
+ !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
+ [(set Int64Regs:$dst, (OpNode Int64Regs:$a,
+ (i32 imm:$b)))]>;
+ def i32rr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a,
+ Int32Regs:$b),
+ !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
+ [(set Int32Regs:$dst, (OpNode Int32Regs:$a,
+ Int32Regs:$b))]>;
+ def i32ri : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
+ !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
+ [(set Int32Regs:$dst, (OpNode Int32Regs:$a,
+ (i32 imm:$b)))]>;
+ def i32ii : NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$a, i32imm:$b),
+ !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
+ [(set Int32Regs:$dst, (OpNode (i32 imm:$a),
+ (i32 imm:$b)))]>;
+ def i16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a,
+ Int32Regs:$b),
+ !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
+ [(set Int16Regs:$dst, (OpNode Int16Regs:$a,
+ Int32Regs:$b))]>;
+ def i16ri : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
+ !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
+ [(set Int16Regs:$dst, (OpNode Int16Regs:$a,
+ (i32 imm:$b)))]>;
+}
+
+defm SHL : LSHIFT_FORMAT<"shl.b", shl>;
+
+// For shifts, the second src operand must be a 32-bit value
+// Need to add a cvt for the 8-bit case.
+multiclass RSHIFT_FORMAT<string OpcStr, SDNode OpNode> {
+ def i64rr : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a,
+ Int32Regs:$b),
+ !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
+ [(set Int64Regs:$dst, (OpNode Int64Regs:$a,
+ Int32Regs:$b))]>;
+ def i64ri : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i32imm:$b),
+ !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
+ [(set Int64Regs:$dst, (OpNode Int64Regs:$a,
+ (i32 imm:$b)))]>;
+ def i32rr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a,
+ Int32Regs:$b),
+ !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
+ [(set Int32Regs:$dst, (OpNode Int32Regs:$a,
+ Int32Regs:$b))]>;
+ def i32ri : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
+ !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
+ [(set Int32Regs:$dst, (OpNode Int32Regs:$a,
+ (i32 imm:$b)))]>;
+ def i32ii : NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$a, i32imm:$b),
+ !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
+ [(set Int32Regs:$dst, (OpNode (i32 imm:$a),
+ (i32 imm:$b)))]>;
+ def i16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a,
+ Int32Regs:$b),
+ !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
+ [(set Int16Regs:$dst, (OpNode Int16Regs:$a,
+ Int32Regs:$b))]>;
+ def i16ri : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
+ !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
+ [(set Int16Regs:$dst, (OpNode Int16Regs:$a,
+ (i32 imm:$b)))]>;
+}
+
+defm SRA : RSHIFT_FORMAT<"shr.s", sra>;
+defm SRL : RSHIFT_FORMAT<"shr.u", srl>;
+
+//
+// Rotate: use ptx shf instruction if available.
+//
+
+// 32 bit r2 = rotl r1, n
+// =>
+// r2 = shf.l r1, r1, n
+def ROTL32imm_hw : NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$src, i32imm:$amt),
+ "shf.l.wrap.b32 \t$dst, $src, $src, $amt;",
+ [(set Int32Regs:$dst, (rotl Int32Regs:$src, (i32 imm:$amt)))]>,
+ Requires<[hasHWROT32]> ;
+
+def ROTL32reg_hw : NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$src, Int32Regs:$amt),
+ "shf.l.wrap.b32 \t$dst, $src, $src, $amt;",
+ [(set Int32Regs:$dst, (rotl Int32Regs:$src, Int32Regs:$amt))]>,
+ Requires<[hasHWROT32]>;
+
+// 32 bit r2 = rotr r1, n
+// =>
+// r2 = shf.r r1, r1, n
+def ROTR32imm_hw : NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$src, i32imm:$amt),
+ "shf.r.wrap.b32 \t$dst, $src, $src, $amt;",
+ [(set Int32Regs:$dst, (rotr Int32Regs:$src, (i32 imm:$amt)))]>,
+ Requires<[hasHWROT32]>;
+
+def ROTR32reg_hw : NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$src, Int32Regs:$amt),
+ "shf.r.wrap.b32 \t$dst, $src, $src, $amt;",
+ [(set Int32Regs:$dst, (rotr Int32Regs:$src, Int32Regs:$amt))]>,
+ Requires<[hasHWROT32]>;
+
+//
+// Rotate: if ptx shf instruction is not available, then use shift+add
+//
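+// As a rough worked example of the shift+add expansion: rotating
+// r = 0xABCD1234 left by 8 gives
+//   lhs = r << 8        = 0xCD123400
+//   rhs = r >> (32 - 8) = 0x000000AB
+//   dst = lhs + rhs     = 0xCD1234AB
+// The add is equivalent to an or here because the two partial results never
+// have overlapping set bits.
+//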
+// 32bit
+def ROT32imm_sw : NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$src, i32imm:$amt1, i32imm:$amt2),
+ !strconcat("{{\n\t",
+ !strconcat(".reg .b32 %lhs;\n\t",
+ !strconcat(".reg .b32 %rhs;\n\t",
+ !strconcat("shl.b32 \t%lhs, $src, $amt1;\n\t",
+ !strconcat("shr.b32 \t%rhs, $src, $amt2;\n\t",
+ !strconcat("add.u32 \t$dst, %lhs, %rhs;\n\t",
+ !strconcat("}}", ""))))))),
+ []>;
+
+def SUB_FRM_32 : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(32-N->getZExtValue(), MVT::i32);
+}]>;
+
+def : Pat<(rotl Int32Regs:$src, (i32 imm:$amt)),
+ (ROT32imm_sw Int32Regs:$src, imm:$amt, (SUB_FRM_32 node:$amt))>,
+ Requires<[noHWROT32]>;
+def : Pat<(rotr Int32Regs:$src, (i32 imm:$amt)),
+ (ROT32imm_sw Int32Regs:$src, (SUB_FRM_32 node:$amt), imm:$amt)>,
+ Requires<[noHWROT32]>;
+
+def ROTL32reg_sw : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src,
+ Int32Regs:$amt),
+ !strconcat("{{\n\t",
+ !strconcat(".reg .b32 %lhs;\n\t",
+ !strconcat(".reg .b32 %rhs;\n\t",
+ !strconcat(".reg .b32 %amt2;\n\t",
+ !strconcat("shl.b32 \t%lhs, $src, $amt;\n\t",
+ !strconcat("sub.s32 \t%amt2, 32, $amt;\n\t",
+ !strconcat("shr.b32 \t%rhs, $src, %amt2;\n\t",
+ !strconcat("add.u32 \t$dst, %lhs, %rhs;\n\t",
+ !strconcat("}}", ""))))))))),
+ [(set Int32Regs:$dst, (rotl Int32Regs:$src, Int32Regs:$amt))]>,
+ Requires<[noHWROT32]>;
+
+def ROTR32reg_sw : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src,
+ Int32Regs:$amt),
+ !strconcat("{{\n\t",
+ !strconcat(".reg .b32 %lhs;\n\t",
+ !strconcat(".reg .b32 %rhs;\n\t",
+ !strconcat(".reg .b32 %amt2;\n\t",
+ !strconcat("shr.b32 \t%lhs, $src, $amt;\n\t",
+ !strconcat("sub.s32 \t%amt2, 32, $amt;\n\t",
+ !strconcat("shl.b32 \t%rhs, $src, %amt2;\n\t",
+ !strconcat("add.u32 \t$dst, %lhs, %rhs;\n\t",
+ !strconcat("}}", ""))))))))),
+ [(set Int32Regs:$dst, (rotr Int32Regs:$src, Int32Regs:$amt))]>,
+ Requires<[noHWROT32]>;
+
+// 64bit
+def ROT64imm_sw : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src,
+ i32imm:$amt1, i32imm:$amt2),
+ !strconcat("{{\n\t",
+ !strconcat(".reg .b64 %lhs;\n\t",
+ !strconcat(".reg .b64 %rhs;\n\t",
+ !strconcat("shl.b64 \t%lhs, $src, $amt1;\n\t",
+ !strconcat("shr.b64 \t%rhs, $src, $amt2;\n\t",
+ !strconcat("add.u64 \t$dst, %lhs, %rhs;\n\t",
+ !strconcat("}}", ""))))))),
+ []>;
+
+def SUB_FRM_64 : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(64-N->getZExtValue(), MVT::i32);
+}]>;
+
+def : Pat<(rotl Int64Regs:$src, (i32 imm:$amt)),
+ (ROT64imm_sw Int64Regs:$src, imm:$amt, (SUB_FRM_64 node:$amt))>;
+def : Pat<(rotr Int64Regs:$src, (i32 imm:$amt)),
+ (ROT64imm_sw Int64Regs:$src, (SUB_FRM_64 node:$amt), imm:$amt)>;
+
+def ROTL64reg_sw : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src,
+ Int32Regs:$amt),
+ !strconcat("{{\n\t",
+ !strconcat(".reg .b64 %lhs;\n\t",
+ !strconcat(".reg .b64 %rhs;\n\t",
+ !strconcat(".reg .u32 %amt2;\n\t",
+ !strconcat("shl.b64 \t%lhs, $src, $amt;\n\t",
+ !strconcat("sub.u32 \t%amt2, 64, $amt;\n\t",
+ !strconcat("shr.b64 \t%rhs, $src, %amt2;\n\t",
+ !strconcat("add.u64 \t$dst, %lhs, %rhs;\n\t",
+ !strconcat("}}", ""))))))))),
+ [(set Int64Regs:$dst, (rotl Int64Regs:$src, Int32Regs:$amt))]>;
+
+def ROTR64reg_sw : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src,
+ Int32Regs:$amt),
+ !strconcat("{{\n\t",
+ !strconcat(".reg .b64 %lhs;\n\t",
+ !strconcat(".reg .b64 %rhs;\n\t",
+ !strconcat(".reg .u32 %amt2;\n\t",
+ !strconcat("shr.b64 \t%lhs, $src, $amt;\n\t",
+ !strconcat("sub.u32 \t%amt2, 64, $amt;\n\t",
+ !strconcat("shl.b64 \t%rhs, $src, %amt2;\n\t",
+ !strconcat("add.u64 \t$dst, %lhs, %rhs;\n\t",
+ !strconcat("}}", ""))))))))),
+ [(set Int64Regs:$dst, (rotr Int64Regs:$src, Int32Regs:$amt))]>;
+
+// BFE - bit-field extract
+
+multiclass BFE<string TyStr, RegisterClass RC> {
+ // BFE supports both 32-bit and 64-bit values, but the start and length
+ // operands are always 32-bit
+ def rrr
+ : NVPTXInst<(outs RC:$d),
+ (ins RC:$a, Int32Regs:$b, Int32Regs:$c),
+ !strconcat("bfe.", TyStr, " \t$d, $a, $b, $c;"), []>;
+ def rri
+ : NVPTXInst<(outs RC:$d),
+ (ins RC:$a, Int32Regs:$b, i32imm:$c),
+ !strconcat("bfe.", TyStr, " \t$d, $a, $b, $c;"), []>;
+ def rii
+ : NVPTXInst<(outs RC:$d),
+ (ins RC:$a, i32imm:$b, i32imm:$c),
+ !strconcat("bfe.", TyStr, " \t$d, $a, $b, $c;"), []>;
+}
+
+defm BFE_S32 : BFE<"s32", Int32Regs>;
+defm BFE_U32 : BFE<"u32", Int32Regs>;
+defm BFE_S64 : BFE<"s64", Int64Regs>;
+defm BFE_U64 : BFE<"u64", Int64Regs>;
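+// A usage sketch of the PTX form these produce: "bfe.u32 %r1, %r2, 8, 4;"
+// extracts the 4-bit field of %r2 that starts at bit 8, i.e. roughly
+// (%r2 >> 8) & 0xF, zero-extended into %r1.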
+
+//-----------------------------------
+// General Comparison
+//-----------------------------------
+
+// General setp instructions
+multiclass SETP<string TypeStr, RegisterClass RC, Operand ImmCls> {
+ def rr : NVPTXInst<(outs Int1Regs:$dst),
+ (ins RC:$a, RC:$b, CmpMode:$cmp),
+ !strconcat("setp${cmp:base}${cmp:ftz}.", TypeStr, "\t$dst, $a, $b;"),
+ []>;
+ def ri : NVPTXInst<(outs Int1Regs:$dst),
+ (ins RC:$a, ImmCls:$b, CmpMode:$cmp),
+ !strconcat("setp${cmp:base}${cmp:ftz}.", TypeStr, "\t$dst, $a, $b;"),
+ []>;
+ def ir : NVPTXInst<(outs Int1Regs:$dst),
+ (ins ImmCls:$a, RC:$b, CmpMode:$cmp),
+ !strconcat("setp${cmp:base}${cmp:ftz}.", TypeStr, "\t$dst, $a, $b;"),
+ []>;
+}
+
+defm SETP_b16 : SETP<"b16", Int16Regs, i16imm>;
+defm SETP_s16 : SETP<"s16", Int16Regs, i16imm>;
+defm SETP_u16 : SETP<"u16", Int16Regs, i16imm>;
+defm SETP_b32 : SETP<"b32", Int32Regs, i32imm>;
+defm SETP_s32 : SETP<"s32", Int32Regs, i32imm>;
+defm SETP_u32 : SETP<"u32", Int32Regs, i32imm>;
+defm SETP_b64 : SETP<"b64", Int64Regs, i64imm>;
+defm SETP_s64 : SETP<"s64", Int64Regs, i64imm>;
+defm SETP_u64 : SETP<"u64", Int64Regs, i64imm>;
+defm SETP_f32 : SETP<"f32", Float32Regs, f32imm>;
+defm SETP_f64 : SETP<"f64", Float64Regs, f64imm>;
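+
+// As an illustrative sketch, a signed 32-bit greater-than compare prints as
+// something like
+//   setp.gt.s32  %p1, %r1, %r2;
+// where the CmpMode operand supplies the ".gt" (plus ".ftz" for f32 when
+// flush-to-zero is requested).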
+
+// General set instructions
+multiclass SET<string TypeStr, RegisterClass RC, Operand ImmCls> {
+ def rr : NVPTXInst<(outs Int32Regs:$dst),
+ (ins RC:$a, RC:$b, CmpMode:$cmp),
+ !strconcat("set$cmp.", TypeStr, "\t$dst, $a, $b;"), []>;
+ def ri : NVPTXInst<(outs Int32Regs:$dst),
+ (ins RC:$a, ImmCls:$b, CmpMode:$cmp),
+ !strconcat("set$cmp.", TypeStr, "\t$dst, $a, $b;"), []>;
+ def ir : NVPTXInst<(outs Int32Regs:$dst),
+ (ins ImmCls:$a, RC:$b, CmpMode:$cmp),
+ !strconcat("set$cmp.", TypeStr, "\t$dst, $a, $b;"), []>;
+}
+
+defm SET_b16 : SET<"b16", Int16Regs, i16imm>;
+defm SET_s16 : SET<"s16", Int16Regs, i16imm>;
+defm SET_u16 : SET<"u16", Int16Regs, i16imm>;
+defm SET_b32 : SET<"b32", Int32Regs, i32imm>;
+defm SET_s32 : SET<"s32", Int32Regs, i32imm>;
+defm SET_u32 : SET<"u32", Int32Regs, i32imm>;
+defm SET_b64 : SET<"b64", Int64Regs, i64imm>;
+defm SET_s64 : SET<"s64", Int64Regs, i64imm>;
+defm SET_u64 : SET<"u64", Int64Regs, i64imm>;
+defm SET_f32 : SET<"f32", Float32Regs, f32imm>;
+defm SET_f64 : SET<"f64", Float64Regs, f64imm>;
+
+//-----------------------------------
+// General Selection
+//-----------------------------------
+
+// General selp instructions
+multiclass SELP<string TypeStr, RegisterClass RC, Operand ImmCls> {
+ def rr : NVPTXInst<(outs RC:$dst),
+ (ins RC:$a, RC:$b, Int1Regs:$p),
+ !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"), []>;
+ def ri : NVPTXInst<(outs RC:$dst),
+ (ins RC:$a, ImmCls:$b, Int1Regs:$p),
+ !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"), []>;
+ def ir : NVPTXInst<(outs RC:$dst),
+ (ins ImmCls:$a, RC:$b, Int1Regs:$p),
+ !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"), []>;
+ def ii : NVPTXInst<(outs RC:$dst),
+ (ins ImmCls:$a, ImmCls:$b, Int1Regs:$p),
+ !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"), []>;
+}
+
+multiclass SELP_PATTERN<string TypeStr, RegisterClass RC, Operand ImmCls,
+ SDNode ImmNode> {
+ def rr : NVPTXInst<(outs RC:$dst),
+ (ins RC:$a, RC:$b, Int1Regs:$p),
+ !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"),
+ [(set RC:$dst, (select Int1Regs:$p, RC:$a, RC:$b))]>;
+ def ri : NVPTXInst<(outs RC:$dst),
+ (ins RC:$a, ImmCls:$b, Int1Regs:$p),
+ !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"),
+ [(set RC:$dst, (select Int1Regs:$p, RC:$a, ImmNode:$b))]>;
+ def ir : NVPTXInst<(outs RC:$dst),
+ (ins ImmCls:$a, RC:$b, Int1Regs:$p),
+ !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"),
+ [(set RC:$dst, (select Int1Regs:$p, ImmNode:$a, RC:$b))]>;
+ def ii : NVPTXInst<(outs RC:$dst),
+ (ins ImmCls:$a, ImmCls:$b, Int1Regs:$p),
+ !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"),
+ [(set RC:$dst, (select Int1Regs:$p, ImmNode:$a, ImmNode:$b))]>;
+}
+
+defm SELP_b16 : SELP_PATTERN<"b16", Int16Regs, i16imm, imm>;
+defm SELP_s16 : SELP<"s16", Int16Regs, i16imm>;
+defm SELP_u16 : SELP<"u16", Int16Regs, i16imm>;
+defm SELP_b32 : SELP_PATTERN<"b32", Int32Regs, i32imm, imm>;
+defm SELP_s32 : SELP<"s32", Int32Regs, i32imm>;
+defm SELP_u32 : SELP<"u32", Int32Regs, i32imm>;
+defm SELP_b64 : SELP_PATTERN<"b64", Int64Regs, i64imm, imm>;
+defm SELP_s64 : SELP<"s64", Int64Regs, i64imm>;
+defm SELP_u64 : SELP<"u64", Int64Regs, i64imm>;
+defm SELP_f32 : SELP_PATTERN<"f32", Float32Regs, f32imm, fpimm>;
+defm SELP_f64 : SELP_PATTERN<"f64", Float64Regs, f64imm, fpimm>;
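+
+// Sketch of the generated form: "selp.b32 %r1, %r2, %r3, %p1;" writes %r2 to
+// %r1 when predicate %p1 is true and %r3 otherwise, which is how the generic
+// (select p, a, b) nodes above are realized.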
+
+// Special select for predicate operands
+def : Pat<(i1 (select Int1Regs:$p, Int1Regs:$a, Int1Regs:$b)),
+ (ORb1rr (ANDb1rr Int1Regs:$p, Int1Regs:$a),
+ (ANDb1rr (NOT1 Int1Regs:$p), Int1Regs:$b))>;
+
+//
+// Funnel shift in clamp mode
+//
+// - SDNodes are created so they can be used in the DAG code,
+// e.g. NVPTXISelLowering (LowerShiftLeftParts and LowerShiftRightParts)
+//
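+// A rough sketch of one intended use, assuming a 64-bit value split into
+// 32-bit halves lo/hi that is shifted left by amt with 0 < amt < 32:
+//   hi' = shf.l.clamp(lo, hi, amt)   // (hi << amt) | (lo >> (32 - amt))
+//   lo' = lo << amt
+// The clamp mode caps the shift amount at 32, so out-of-range amounts do not
+// wrap.
+//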
+def SDTIntShiftDOp: SDTypeProfile<1, 3,
+ [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>,
+ SDTCisInt<0>, SDTCisInt<3>]>;
+def FUN_SHFL_CLAMP : SDNode<"NVPTXISD::FUN_SHFL_CLAMP", SDTIntShiftDOp, []>;
+def FUN_SHFR_CLAMP : SDNode<"NVPTXISD::FUN_SHFR_CLAMP", SDTIntShiftDOp, []>;
+
+def FUNSHFLCLAMP : NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
+ "shf.l.clamp.b32 \t$dst, $lo, $hi, $amt;",
+ [(set Int32Regs:$dst,
+ (FUN_SHFL_CLAMP Int32Regs:$lo,
+ Int32Regs:$hi, Int32Regs:$amt))]>;
+
+def FUNSHFRCLAMP : NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
+ "shf.r.clamp.b32 \t$dst, $lo, $hi, $amt;",
+ [(set Int32Regs:$dst,
+ (FUN_SHFR_CLAMP Int32Regs:$lo,
+ Int32Regs:$hi, Int32Regs:$amt))]>;
+
+//-----------------------------------
+// Data Movement (Load / Store, Move)
+//-----------------------------------
+
+def ADDRri : ComplexPattern<i32, 2, "SelectADDRri", [frameindex],
+ [SDNPWantRoot]>;
+def ADDRri64 : ComplexPattern<i64, 2, "SelectADDRri64", [frameindex],
+ [SDNPWantRoot]>;
+
+def MEMri : Operand<i32> {
+ let PrintMethod = "printMemOperand";
+ let MIOperandInfo = (ops Int32Regs, i32imm);
+}
+def MEMri64 : Operand<i64> {
+ let PrintMethod = "printMemOperand";
+ let MIOperandInfo = (ops Int64Regs, i64imm);
+}
+
+def imem : Operand<iPTR> {
+ let PrintMethod = "printOperand";
+}
+
+def imemAny : Operand<iPTRAny> {
+ let PrintMethod = "printOperand";
+}
+
+def LdStCode : Operand<i32> {
+ let PrintMethod = "printLdStCode";
+}
+
+def SDTWrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>;
+def Wrapper : SDNode<"NVPTXISD::Wrapper", SDTWrapper>;
+
+def MOV_ADDR : NVPTXInst<(outs Int32Regs:$dst), (ins imem:$a),
+ "mov.u32 \t$dst, $a;",
+ [(set Int32Regs:$dst, (Wrapper tglobaladdr:$a))]>;
+
+def MOV_ADDR64 : NVPTXInst<(outs Int64Regs:$dst), (ins imem:$a),
+ "mov.u64 \t$dst, $a;",
+ [(set Int64Regs:$dst, (Wrapper tglobaladdr:$a))]>;
+
+// Get pointer to local stack
+def MOV_DEPOT_ADDR
+ : NVPTXInst<(outs Int32Regs:$d), (ins i32imm:$num),
+ "mov.u32 \t$d, __local_depot$num;", []>;
+def MOV_DEPOT_ADDR_64
+ : NVPTXInst<(outs Int64Regs:$d), (ins i32imm:$num),
+ "mov.u64 \t$d, __local_depot$num;", []>;
+
+
+// copyPhysReg is hard-coded in NVPTXInstrInfo.cpp
+let IsSimpleMove=1 in {
+def IMOV1rr: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$sss),
+ "mov.pred \t$dst, $sss;", []>;
+def IMOV16rr: NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$sss),
+ "mov.u16 \t$dst, $sss;", []>;
+def IMOV32rr: NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$sss),
+ "mov.u32 \t$dst, $sss;", []>;
+def IMOV64rr: NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$sss),
+ "mov.u64 \t$dst, $sss;", []>;
+
+def FMOV32rr: NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
+ "mov.f32 \t$dst, $src;", []>;
+def FMOV64rr: NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$src),
+ "mov.f64 \t$dst, $src;", []>;
+}
+def IMOV1ri: NVPTXInst<(outs Int1Regs:$dst), (ins i1imm:$src),
+ "mov.pred \t$dst, $src;",
+ [(set Int1Regs:$dst, imm:$src)]>;
+def IMOV16ri: NVPTXInst<(outs Int16Regs:$dst), (ins i16imm:$src),
+ "mov.u16 \t$dst, $src;",
+ [(set Int16Regs:$dst, imm:$src)]>;
+def IMOV32ri: NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$src),
+ "mov.u32 \t$dst, $src;",
+ [(set Int32Regs:$dst, imm:$src)]>;
+def IMOV64i: NVPTXInst<(outs Int64Regs:$dst), (ins i64imm:$src),
+ "mov.u64 \t$dst, $src;",
+ [(set Int64Regs:$dst, imm:$src)]>;
+
+def FMOV32ri: NVPTXInst<(outs Float32Regs:$dst), (ins f32imm:$src),
+ "mov.f32 \t$dst, $src;",
+ [(set Float32Regs:$dst, fpimm:$src)]>;
+def FMOV64ri: NVPTXInst<(outs Float64Regs:$dst), (ins f64imm:$src),
+ "mov.f64 \t$dst, $src;",
+ [(set Float64Regs:$dst, fpimm:$src)]>;
+
+def : Pat<(i32 (Wrapper texternalsym:$dst)), (IMOV32ri texternalsym:$dst)>;
+
+//---- Copy Frame Index ----
+def LEA_ADDRi : NVPTXInst<(outs Int32Regs:$dst), (ins MEMri:$addr),
+ "add.u32 \t$dst, ${addr:add};",
+ [(set Int32Regs:$dst, ADDRri:$addr)]>;
+def LEA_ADDRi64 : NVPTXInst<(outs Int64Regs:$dst), (ins MEMri64:$addr),
+ "add.u64 \t$dst, ${addr:add};",
+ [(set Int64Regs:$dst, ADDRri64:$addr)]>;
+
+//-----------------------------------
+// Comparison and Selection
+//-----------------------------------
+
+multiclass ISET_FORMAT<PatFrag OpNode, PatLeaf Mode,
+ Instruction setp_16rr,
+ Instruction setp_16ri,
+ Instruction setp_16ir,
+ Instruction setp_32rr,
+ Instruction setp_32ri,
+ Instruction setp_32ir,
+ Instruction setp_64rr,
+ Instruction setp_64ri,
+ Instruction setp_64ir,
+ Instruction set_16rr,
+ Instruction set_16ri,
+ Instruction set_16ir,
+ Instruction set_32rr,
+ Instruction set_32ri,
+ Instruction set_32ir,
+ Instruction set_64rr,
+ Instruction set_64ri,
+ Instruction set_64ir> {
+ // i16 -> pred
+ def : Pat<(i1 (OpNode Int16Regs:$a, Int16Regs:$b)),
+ (setp_16rr Int16Regs:$a, Int16Regs:$b, Mode)>;
+ def : Pat<(i1 (OpNode Int16Regs:$a, imm:$b)),
+ (setp_16ri Int16Regs:$a, imm:$b, Mode)>;
+ def : Pat<(i1 (OpNode imm:$a, Int16Regs:$b)),
+ (setp_16ir imm:$a, Int16Regs:$b, Mode)>;
+ // i32 -> pred
+ def : Pat<(i1 (OpNode Int32Regs:$a, Int32Regs:$b)),
+ (setp_32rr Int32Regs:$a, Int32Regs:$b, Mode)>;
+ def : Pat<(i1 (OpNode Int32Regs:$a, imm:$b)),
+ (setp_32ri Int32Regs:$a, imm:$b, Mode)>;
+ def : Pat<(i1 (OpNode imm:$a, Int32Regs:$b)),
+ (setp_32ir imm:$a, Int32Regs:$b, Mode)>;
+ // i64 -> pred
+ def : Pat<(i1 (OpNode Int64Regs:$a, Int64Regs:$b)),
+ (setp_64rr Int64Regs:$a, Int64Regs:$b, Mode)>;
+ def : Pat<(i1 (OpNode Int64Regs:$a, imm:$b)),
+ (setp_64ri Int64Regs:$a, imm:$b, Mode)>;
+ def : Pat<(i1 (OpNode imm:$a, Int64Regs:$b)),
+ (setp_64ir imm:$a, Int64Regs:$b, Mode)>;
+
+ // i16 -> i32
+ def : Pat<(i32 (OpNode Int16Regs:$a, Int16Regs:$b)),
+ (set_16rr Int16Regs:$a, Int16Regs:$b, Mode)>;
+ def : Pat<(i32 (OpNode Int16Regs:$a, imm:$b)),
+ (set_16ri Int16Regs:$a, imm:$b, Mode)>;
+ def : Pat<(i32 (OpNode imm:$a, Int16Regs:$b)),
+ (set_16ir imm:$a, Int16Regs:$b, Mode)>;
+ // i32 -> i32
+ def : Pat<(i32 (OpNode Int32Regs:$a, Int32Regs:$b)),
+ (set_32rr Int32Regs:$a, Int32Regs:$b, Mode)>;
+ def : Pat<(i32 (OpNode Int32Regs:$a, imm:$b)),
+ (set_32ri Int32Regs:$a, imm:$b, Mode)>;
+ def : Pat<(i32 (OpNode imm:$a, Int32Regs:$b)),
+ (set_32ir imm:$a, Int32Regs:$b, Mode)>;
+ // i64 -> i32
+ def : Pat<(i32 (OpNode Int64Regs:$a, Int64Regs:$b)),
+ (set_64rr Int64Regs:$a, Int64Regs:$b, Mode)>;
+ def : Pat<(i32 (OpNode Int64Regs:$a, imm:$b)),
+ (set_64ri Int64Regs:$a, imm:$b, Mode)>;
+ def : Pat<(i32 (OpNode imm:$a, Int64Regs:$b)),
+ (set_64ir imm:$a, Int64Regs:$b, Mode)>;
+}
+
+multiclass ISET_FORMAT_SIGNED<PatFrag OpNode, PatLeaf Mode>
+ : ISET_FORMAT<OpNode, Mode,
+ SETP_s16rr, SETP_s16ri, SETP_s16ir,
+ SETP_s32rr, SETP_s32ri, SETP_s32ir,
+ SETP_s64rr, SETP_s64ri, SETP_s64ir,
+ SET_s16rr, SET_s16ri, SET_s16ir,
+ SET_s32rr, SET_s32ri, SET_s32ir,
+ SET_s64rr, SET_s64ri, SET_s64ir> {
+ // TableGen doesn't like empty multiclasses
+ def : PatLeaf<(i32 0)>;
+}
+
+multiclass ISET_FORMAT_UNSIGNED<PatFrag OpNode, PatLeaf Mode>
+ : ISET_FORMAT<OpNode, Mode,
+ SETP_u16rr, SETP_u16ri, SETP_u16ir,
+ SETP_u32rr, SETP_u32ri, SETP_u32ir,
+ SETP_u64rr, SETP_u64ri, SETP_u64ir,
+ SET_u16rr, SET_u16ri, SET_u16ir,
+ SET_u32rr, SET_u32ri, SET_u32ir,
+ SET_u64rr, SET_u64ri, SET_u64ir> {
+ // TableGen doesn't like empty multiclasses
+ def : PatLeaf<(i32 0)>;
+}
+
+defm : ISET_FORMAT_SIGNED<setgt, CmpGT>;
+defm : ISET_FORMAT_UNSIGNED<setugt, CmpGT>;
+defm : ISET_FORMAT_SIGNED<setlt, CmpLT>;
+defm : ISET_FORMAT_UNSIGNED<setult, CmpLT>;
+defm : ISET_FORMAT_SIGNED<setge, CmpGE>;
+defm : ISET_FORMAT_UNSIGNED<setuge, CmpGE>;
+defm : ISET_FORMAT_SIGNED<setle, CmpLE>;
+defm : ISET_FORMAT_UNSIGNED<setule, CmpLE>;
+defm : ISET_FORMAT_SIGNED<seteq, CmpEQ>;
+defm : ISET_FORMAT_UNSIGNED<setueq, CmpEQ>;
+defm : ISET_FORMAT_SIGNED<setne, CmpNE>;
+defm : ISET_FORMAT_UNSIGNED<setune, CmpNE>;
+
+// i1 compares
+def : Pat<(setne Int1Regs:$a, Int1Regs:$b),
+ (XORb1rr Int1Regs:$a, Int1Regs:$b)>;
+def : Pat<(setune Int1Regs:$a, Int1Regs:$b),
+ (XORb1rr Int1Regs:$a, Int1Regs:$b)>;
+
+def : Pat<(seteq Int1Regs:$a, Int1Regs:$b),
+ (NOT1 (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
+def : Pat<(setueq Int1Regs:$a, Int1Regs:$b),
+ (NOT1 (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
+
+// i1 compare -> i32
+def : Pat<(i32 (setne Int1Regs:$a, Int1Regs:$b)),
+ (SELP_u32ii -1, 0, (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
+def : Pat<(i32 (seteq Int1Regs:$a, Int1Regs:$b)),
+ (SELP_u32ii 0, -1, (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
+
+
+
+multiclass FSET_FORMAT<PatFrag OpNode, PatLeaf Mode, PatLeaf ModeFTZ> {
+ // f32 -> pred
+ def : Pat<(i1 (OpNode Float32Regs:$a, Float32Regs:$b)),
+ (SETP_f32rr Float32Regs:$a, Float32Regs:$b, ModeFTZ)>,
+ Requires<[doF32FTZ]>;
+ def : Pat<(i1 (OpNode Float32Regs:$a, Float32Regs:$b)),
+ (SETP_f32rr Float32Regs:$a, Float32Regs:$b, Mode)>;
+ def : Pat<(i1 (OpNode Float32Regs:$a, fpimm:$b)),
+ (SETP_f32ri Float32Regs:$a, fpimm:$b, ModeFTZ)>,
+ Requires<[doF32FTZ]>;
+ def : Pat<(i1 (OpNode Float32Regs:$a, fpimm:$b)),
+ (SETP_f32ri Float32Regs:$a, fpimm:$b, Mode)>;
+ def : Pat<(i1 (OpNode fpimm:$a, Float32Regs:$b)),
+ (SETP_f32ir fpimm:$a, Float32Regs:$b, ModeFTZ)>,
+ Requires<[doF32FTZ]>;
+ def : Pat<(i1 (OpNode fpimm:$a, Float32Regs:$b)),
+ (SETP_f32ir fpimm:$a, Float32Regs:$b, Mode)>;
+
+ // f64 -> pred
+ def : Pat<(i1 (OpNode Float64Regs:$a, Float64Regs:$b)),
+ (SETP_f64rr Float64Regs:$a, Float64Regs:$b, Mode)>;
+ def : Pat<(i1 (OpNode Float64Regs:$a, fpimm:$b)),
+ (SETP_f64ri Float64Regs:$a, fpimm:$b, Mode)>;
+ def : Pat<(i1 (OpNode fpimm:$a, Float64Regs:$b)),
+ (SETP_f64ir fpimm:$a, Float64Regs:$b, Mode)>;
+
+ // f32 -> i32
+ def : Pat<(i32 (OpNode Float32Regs:$a, Float32Regs:$b)),
+ (SET_f32rr Float32Regs:$a, Float32Regs:$b, ModeFTZ)>,
+ Requires<[doF32FTZ]>;
+ def : Pat<(i32 (OpNode Float32Regs:$a, Float32Regs:$b)),
+ (SET_f32rr Float32Regs:$a, Float32Regs:$b, Mode)>;
+ def : Pat<(i32 (OpNode Float32Regs:$a, fpimm:$b)),
+ (SET_f32ri Float32Regs:$a, fpimm:$b, ModeFTZ)>,
+ Requires<[doF32FTZ]>;
+ def : Pat<(i32 (OpNode Float32Regs:$a, fpimm:$b)),
+ (SET_f32ri Float32Regs:$a, fpimm:$b, Mode)>;
+ def : Pat<(i32 (OpNode fpimm:$a, Float32Regs:$b)),
+ (SET_f32ir fpimm:$a, Float32Regs:$b, ModeFTZ)>,
+ Requires<[doF32FTZ]>;
+ def : Pat<(i32 (OpNode fpimm:$a, Float32Regs:$b)),
+ (SET_f32ir fpimm:$a, Float32Regs:$b, Mode)>;
+
+ // f64 -> i32
+ def : Pat<(i32 (OpNode Float64Regs:$a, Float64Regs:$b)),
+ (SET_f64rr Float64Regs:$a, Float64Regs:$b, Mode)>;
+ def : Pat<(i32 (OpNode Float64Regs:$a, fpimm:$b)),
+ (SET_f64ri Float64Regs:$a, fpimm:$b, Mode)>;
+ def : Pat<(i32 (OpNode fpimm:$a, Float64Regs:$b)),
+ (SET_f64ir fpimm:$a, Float64Regs:$b, Mode)>;
+}
+
+defm FSetGT : FSET_FORMAT<setogt, CmpGT, CmpGT_FTZ>;
+defm FSetLT : FSET_FORMAT<setolt, CmpLT, CmpLT_FTZ>;
+defm FSetGE : FSET_FORMAT<setoge, CmpGE, CmpGE_FTZ>;
+defm FSetLE : FSET_FORMAT<setole, CmpLE, CmpLE_FTZ>;
+defm FSetEQ : FSET_FORMAT<setoeq, CmpEQ, CmpEQ_FTZ>;
+defm FSetNE : FSET_FORMAT<setone, CmpNE, CmpNE_FTZ>;
+
+defm FSetUGT : FSET_FORMAT<setugt, CmpGTU, CmpGTU_FTZ>;
+defm FSetULT : FSET_FORMAT<setult, CmpLTU, CmpLTU_FTZ>;
+defm FSetUGE : FSET_FORMAT<setuge, CmpGEU, CmpGEU_FTZ>;
+defm FSetULE : FSET_FORMAT<setule, CmpLEU, CmpLEU_FTZ>;
+defm FSetUEQ : FSET_FORMAT<setueq, CmpEQU, CmpEQU_FTZ>;
+defm FSetUNE : FSET_FORMAT<setune, CmpNEU, CmpNEU_FTZ>;
+
+defm FSetNUM : FSET_FORMAT<seto, CmpNUM, CmpNUM_FTZ>;
+defm FSetNAN : FSET_FORMAT<setuo, CmpNAN, CmpNAN_FTZ>;
+
+//def ld_param : SDNode<"NVPTXISD::LOAD_PARAM", SDTLoad,
+// [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+
+def SDTDeclareParamProfile : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>,
+ SDTCisInt<2>]>;
+def SDTDeclareScalarParamProfile : SDTypeProfile<0, 3, [SDTCisInt<0>,
+ SDTCisInt<1>, SDTCisInt<2>]>;
+def SDTLoadParamProfile : SDTypeProfile<1, 2, [SDTCisInt<1>, SDTCisInt<2>]>;
+def SDTLoadParamV2Profile : SDTypeProfile<2, 2, [SDTCisSameAs<0, 1>, SDTCisInt<2>, SDTCisInt<3>]>;
+def SDTLoadParamV4Profile : SDTypeProfile<4, 2, [SDTCisInt<4>, SDTCisInt<5>]>;
+def SDTPrintCallProfile : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+def SDTPrintCallUniProfile : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+def SDTStoreParamProfile : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>]>;
+def SDTStoreParamV2Profile : SDTypeProfile<0, 4, [SDTCisInt<0>, SDTCisInt<1>]>;
+def SDTStoreParamV4Profile : SDTypeProfile<0, 6, [SDTCisInt<0>, SDTCisInt<1>]>;
+def SDTStoreParam32Profile : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>]>;
+def SDTCallArgProfile : SDTypeProfile<0, 2, [SDTCisInt<0>]>;
+def SDTCallArgMarkProfile : SDTypeProfile<0, 0, []>;
+def SDTCallVoidProfile : SDTypeProfile<0, 1, []>;
+def SDTCallValProfile : SDTypeProfile<1, 0, []>;
+def SDTMoveParamProfile : SDTypeProfile<1, 1, []>;
+def SDTStoreRetvalProfile : SDTypeProfile<0, 2, [SDTCisInt<0>]>;
+def SDTStoreRetvalV2Profile : SDTypeProfile<0, 3, [SDTCisInt<0>]>;
+def SDTStoreRetvalV4Profile : SDTypeProfile<0, 5, [SDTCisInt<0>]>;
+def SDTPseudoUseParamProfile : SDTypeProfile<0, 1, []>;
+
+def DeclareParam : SDNode<"NVPTXISD::DeclareParam", SDTDeclareParamProfile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def DeclareScalarParam : SDNode<"NVPTXISD::DeclareScalarParam",
+ SDTDeclareScalarParamProfile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def DeclareRetParam : SDNode<"NVPTXISD::DeclareRetParam",
+ SDTDeclareParamProfile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def DeclareRet : SDNode<"NVPTXISD::DeclareRet", SDTDeclareScalarParamProfile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def LoadParam : SDNode<"NVPTXISD::LoadParam", SDTLoadParamProfile,
+ [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
+def LoadParamV2 : SDNode<"NVPTXISD::LoadParamV2", SDTLoadParamV2Profile,
+ [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
+def LoadParamV4 : SDNode<"NVPTXISD::LoadParamV4", SDTLoadParamV4Profile,
+ [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
+def PrintCall : SDNode<"NVPTXISD::PrintCall", SDTPrintCallProfile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def PrintCallUni : SDNode<"NVPTXISD::PrintCallUni", SDTPrintCallUniProfile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def StoreParam : SDNode<"NVPTXISD::StoreParam", SDTStoreParamProfile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def StoreParamV2 : SDNode<"NVPTXISD::StoreParamV2", SDTStoreParamV2Profile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def StoreParamV4 : SDNode<"NVPTXISD::StoreParamV4", SDTStoreParamV4Profile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def StoreParamU32 : SDNode<"NVPTXISD::StoreParamU32", SDTStoreParam32Profile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def StoreParamS32 : SDNode<"NVPTXISD::StoreParamS32", SDTStoreParam32Profile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def CallArgBegin : SDNode<"NVPTXISD::CallArgBegin", SDTCallArgMarkProfile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def CallArg : SDNode<"NVPTXISD::CallArg", SDTCallArgProfile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def LastCallArg : SDNode<"NVPTXISD::LastCallArg", SDTCallArgProfile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def CallArgEnd : SDNode<"NVPTXISD::CallArgEnd", SDTCallVoidProfile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def CallVoid : SDNode<"NVPTXISD::CallVoid", SDTCallVoidProfile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def Prototype : SDNode<"NVPTXISD::Prototype", SDTCallVoidProfile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def CallVal : SDNode<"NVPTXISD::CallVal", SDTCallValProfile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def MoveParam : SDNode<"NVPTXISD::MoveParam", SDTMoveParamProfile,
+ []>;
+def StoreRetval : SDNode<"NVPTXISD::StoreRetval", SDTStoreRetvalProfile,
+ [SDNPHasChain, SDNPSideEffect]>;
+def StoreRetvalV2 : SDNode<"NVPTXISD::StoreRetvalV2", SDTStoreRetvalV2Profile,
+ [SDNPHasChain, SDNPSideEffect]>;
+def StoreRetvalV4 : SDNode<"NVPTXISD::StoreRetvalV4", SDTStoreRetvalV4Profile,
+ [SDNPHasChain, SDNPSideEffect]>;
+def PseudoUseParam : SDNode<"NVPTXISD::PseudoUseParam",
+ SDTPseudoUseParamProfile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def RETURNNode : SDNode<"NVPTXISD::RETURN", SDTCallArgMarkProfile,
+ [SDNPHasChain, SDNPSideEffect]>;
+
+class LoadParamMemInst<NVPTXRegClass regclass, string opstr> :
+ NVPTXInst<(outs regclass:$dst), (ins i32imm:$b),
+ !strconcat(!strconcat("ld.param", opstr),
+ "\t$dst, [retval0+$b];"),
+ []>;
+
+class LoadParamRegInst<NVPTXRegClass regclass, string opstr> :
+ NVPTXInst<(outs regclass:$dst), (ins i32imm:$b),
+ !strconcat(!strconcat("mov", opstr),
+ "\t$dst, retval$b;"),
+ [(set regclass:$dst, (LoadParam (i32 0), (i32 imm:$b)))]>;
+
+class LoadParamV2MemInst<NVPTXRegClass regclass, string opstr> :
+ NVPTXInst<(outs regclass:$dst, regclass:$dst2), (ins i32imm:$b),
+ !strconcat(!strconcat("ld.param.v2", opstr),
+ "\t{{$dst, $dst2}}, [retval0+$b];"), []>;
+
+class LoadParamV4MemInst<NVPTXRegClass regclass, string opstr> :
+ NVPTXInst<(outs regclass:$dst, regclass:$dst2, regclass:$dst3,
+ regclass:$dst4),
+ (ins i32imm:$b),
+ !strconcat(!strconcat("ld.param.v4", opstr),
+ "\t{{$dst, $dst2, $dst3, $dst4}}, [retval0+$b];"), []>;
+
+class StoreParamInst<NVPTXRegClass regclass, string opstr> :
+ NVPTXInst<(outs), (ins regclass:$val, i32imm:$a, i32imm:$b),
+ !strconcat(!strconcat("st.param", opstr),
+ "\t[param$a+$b], $val;"),
+ []>;
+
+class StoreParamV2Inst<NVPTXRegClass regclass, string opstr> :
+ NVPTXInst<(outs), (ins regclass:$val, regclass:$val2,
+ i32imm:$a, i32imm:$b),
+ !strconcat(!strconcat("st.param.v2", opstr),
+ "\t[param$a+$b], {{$val, $val2}};"),
+ []>;
+
+class StoreParamV4Inst<NVPTXRegClass regclass, string opstr> :
+ NVPTXInst<(outs), (ins regclass:$val, regclass:$val2, regclass:$val3,
+ regclass:$val4, i32imm:$a, i32imm:$b),
+ !strconcat(!strconcat("st.param.v4", opstr),
+ "\t[param$a+$b], {{$val, $val2, $val3, $val4}};"),
+ []>;
+
+class StoreRetvalInst<NVPTXRegClass regclass, string opstr> :
+ NVPTXInst<(outs), (ins regclass:$val, i32imm:$a),
+ !strconcat(!strconcat("st.param", opstr),
+ "\t[func_retval0+$a], $val;"),
+ []>;
+
+class StoreRetvalV2Inst<NVPTXRegClass regclass, string opstr> :
+ NVPTXInst<(outs), (ins regclass:$val, regclass:$val2, i32imm:$a),
+ !strconcat(!strconcat("st.param.v2", opstr),
+ "\t[func_retval0+$a], {{$val, $val2}};"),
+ []>;
+
+class StoreRetvalV4Inst<NVPTXRegClass regclass, string opstr> :
+ NVPTXInst<(outs),
+ (ins regclass:$val, regclass:$val2, regclass:$val3,
+ regclass:$val4, i32imm:$a),
+ !strconcat(!strconcat("st.param.v4", opstr),
+ "\t[func_retval0+$a], {{$val, $val2, $val3, $val4}};"),
+ []>;
+
+def PrintCallRetInst1 : NVPTXInst<(outs), (ins),
+"call (retval0), ",
+ [(PrintCall (i32 1))]>;
+def PrintCallRetInst2 : NVPTXInst<(outs), (ins),
+"call (retval0, retval1), ",
+ [(PrintCall (i32 2))]>;
+def PrintCallRetInst3 : NVPTXInst<(outs), (ins),
+"call (retval0, retval1, retval2), ",
+ [(PrintCall (i32 3))]>;
+def PrintCallRetInst4 : NVPTXInst<(outs), (ins),
+"call (retval0, retval1, retval2, retval3), ",
+ [(PrintCall (i32 4))]>;
+def PrintCallRetInst5 : NVPTXInst<(outs), (ins),
+"call (retval0, retval1, retval2, retval3, retval4), ",
+ [(PrintCall (i32 5))]>;
+def PrintCallRetInst6 : NVPTXInst<(outs), (ins),
+"call (retval0, retval1, retval2, retval3, retval4, retval5), ",
+ [(PrintCall (i32 6))]>;
+def PrintCallRetInst7 : NVPTXInst<(outs), (ins),
+"call (retval0, retval1, retval2, retval3, retval4, retval5, retval6), ",
+ [(PrintCall (i32 7))]>;
+def PrintCallRetInst8 : NVPTXInst<(outs), (ins),
+!strconcat("call (retval0, retval1, retval2, retval3, retval4",
+ ", retval5, retval6, retval7), "),
+ [(PrintCall (i32 8))]>;
+
+def PrintCallNoRetInst : NVPTXInst<(outs), (ins), "call ",
+ [(PrintCall (i32 0))]>;
+
+def PrintCallUniRetInst1 : NVPTXInst<(outs), (ins),
+"call.uni (retval0), ",
+ [(PrintCallUni (i32 1))]>;
+def PrintCallUniRetInst2 : NVPTXInst<(outs), (ins),
+"call.uni (retval0, retval1), ",
+ [(PrintCallUni (i32 2))]>;
+def PrintCallUniRetInst3 : NVPTXInst<(outs), (ins),
+"call.uni (retval0, retval1, retval2), ",
+ [(PrintCallUni (i32 3))]>;
+def PrintCallUniRetInst4 : NVPTXInst<(outs), (ins),
+"call.uni (retval0, retval1, retval2, retval3), ",
+ [(PrintCallUni (i32 4))]>;
+def PrintCallUniRetInst5 : NVPTXInst<(outs), (ins),
+"call.uni (retval0, retval1, retval2, retval3, retval4), ",
+ [(PrintCallUni (i32 5))]>;
+def PrintCallUniRetInst6 : NVPTXInst<(outs), (ins),
+"call.uni (retval0, retval1, retval2, retval3, retval4, retval5), ",
+ [(PrintCallUni (i32 6))]>;
+def PrintCallUniRetInst7 : NVPTXInst<(outs), (ins),
+"call.uni (retval0, retval1, retval2, retval3, retval4, retval5, retval6), ",
+ [(PrintCallUni (i32 7))]>;
+def PrintCallUniRetInst8 : NVPTXInst<(outs), (ins),
+!strconcat("call.uni (retval0, retval1, retval2, retval3, retval4",
+ ", retval5, retval6, retval7), "),
+ [(PrintCallUni (i32 8))]>;
+
+def PrintCallUniNoRetInst : NVPTXInst<(outs), (ins), "call.uni ",
+ [(PrintCallUni (i32 0))]>;
+
+def LoadParamMemI64 : LoadParamMemInst<Int64Regs, ".b64">;
+def LoadParamMemI32 : LoadParamMemInst<Int32Regs, ".b32">;
+def LoadParamMemI16 : LoadParamMemInst<Int16Regs, ".b16">;
+def LoadParamMemI8 : LoadParamMemInst<Int16Regs, ".b8">;
+def LoadParamMemV2I64 : LoadParamV2MemInst<Int64Regs, ".b64">;
+def LoadParamMemV2I32 : LoadParamV2MemInst<Int32Regs, ".b32">;
+def LoadParamMemV2I16 : LoadParamV2MemInst<Int16Regs, ".b16">;
+def LoadParamMemV2I8 : LoadParamV2MemInst<Int16Regs, ".b8">;
+def LoadParamMemV4I32 : LoadParamV4MemInst<Int32Regs, ".b32">;
+def LoadParamMemV4I16 : LoadParamV4MemInst<Int16Regs, ".b16">;
+def LoadParamMemV4I8 : LoadParamV4MemInst<Int16Regs, ".b8">;
+def LoadParamMemF32 : LoadParamMemInst<Float32Regs, ".f32">;
+def LoadParamMemF64 : LoadParamMemInst<Float64Regs, ".f64">;
+def LoadParamMemV2F32 : LoadParamV2MemInst<Float32Regs, ".f32">;
+def LoadParamMemV2F64 : LoadParamV2MemInst<Float64Regs, ".f64">;
+def LoadParamMemV4F32 : LoadParamV4MemInst<Float32Regs, ".f32">;
+
+def StoreParamI64 : StoreParamInst<Int64Regs, ".b64">;
+def StoreParamI32 : StoreParamInst<Int32Regs, ".b32">;
+
+def StoreParamI16 : StoreParamInst<Int16Regs, ".b16">;
+def StoreParamI8 : StoreParamInst<Int16Regs, ".b8">;
+def StoreParamV2I64 : StoreParamV2Inst<Int64Regs, ".b64">;
+def StoreParamV2I32 : StoreParamV2Inst<Int32Regs, ".b32">;
+def StoreParamV2I16 : StoreParamV2Inst<Int16Regs, ".b16">;
+def StoreParamV2I8 : StoreParamV2Inst<Int16Regs, ".b8">;
+
+// FIXME: StoreParamV4Inst crashes llvm-tblgen :(
+//def StoreParamV4I32 : StoreParamV4Inst<Int32Regs, ".b32">;
+def StoreParamV4I32 : NVPTXInst<(outs), (ins Int32Regs:$val, Int32Regs:$val2,
+ Int32Regs:$val3, Int32Regs:$val4,
+ i32imm:$a, i32imm:$b),
+ "st.param.v4.b32\t[param$a+$b], {{$val, $val2, $val3, $val4}};",
+ []>;
+
+def StoreParamV4I16 : NVPTXInst<(outs), (ins Int16Regs:$val, Int16Regs:$val2,
+ Int16Regs:$val3, Int16Regs:$val4,
+ i32imm:$a, i32imm:$b),
+ "st.param.v4.b16\t[param$a+$b], {{$val, $val2, $val3, $val4}};",
+ []>;
+
+def StoreParamV4I8 : NVPTXInst<(outs), (ins Int16Regs:$val, Int16Regs:$val2,
+ Int16Regs:$val3, Int16Regs:$val4,
+ i32imm:$a, i32imm:$b),
+ "st.param.v4.b8\t[param$a+$b], {{$val, $val2, $val3, $val4}};",
+ []>;
+
+def StoreParamF32 : StoreParamInst<Float32Regs, ".f32">;
+def StoreParamF64 : StoreParamInst<Float64Regs, ".f64">;
+def StoreParamV2F32 : StoreParamV2Inst<Float32Regs, ".f32">;
+def StoreParamV2F64 : StoreParamV2Inst<Float64Regs, ".f64">;
+// FIXME: StoreParamV4Inst crashes llvm-tblgen :(
+//def StoreParamV4F32 : StoreParamV4Inst<Float32Regs, ".f32">;
+def StoreParamV4F32 : NVPTXInst<(outs),
+ (ins Float32Regs:$val, Float32Regs:$val2,
+ Float32Regs:$val3, Float32Regs:$val4,
+ i32imm:$a, i32imm:$b),
+ "st.param.v4.f32\t[param$a+$b], {{$val, $val2, $val3, $val4}};",
+ []>;
+
+
+def StoreRetvalI64 : StoreRetvalInst<Int64Regs, ".b64">;
+def StoreRetvalI32 : StoreRetvalInst<Int32Regs, ".b32">;
+def StoreRetvalI16 : StoreRetvalInst<Int16Regs, ".b16">;
+def StoreRetvalI8 : StoreRetvalInst<Int16Regs, ".b8">;
+def StoreRetvalV2I64 : StoreRetvalV2Inst<Int64Regs, ".b64">;
+def StoreRetvalV2I32 : StoreRetvalV2Inst<Int32Regs, ".b32">;
+def StoreRetvalV2I16 : StoreRetvalV2Inst<Int16Regs, ".b16">;
+def StoreRetvalV2I8 : StoreRetvalV2Inst<Int16Regs, ".b8">;
+def StoreRetvalV4I32 : StoreRetvalV4Inst<Int32Regs, ".b32">;
+def StoreRetvalV4I16 : StoreRetvalV4Inst<Int16Regs, ".b16">;
+def StoreRetvalV4I8 : StoreRetvalV4Inst<Int16Regs, ".b8">;
+
+def StoreRetvalF64 : StoreRetvalInst<Float64Regs, ".f64">;
+def StoreRetvalF32 : StoreRetvalInst<Float32Regs, ".f32">;
+def StoreRetvalV2F64 : StoreRetvalV2Inst<Float64Regs, ".f64">;
+def StoreRetvalV2F32 : StoreRetvalV2Inst<Float32Regs, ".f32">;
+def StoreRetvalV4F32 : StoreRetvalV4Inst<Float32Regs, ".f32">;
+
+def CallArgBeginInst : NVPTXInst<(outs), (ins), "(", [(CallArgBegin)]>;
+def CallArgEndInst1 : NVPTXInst<(outs), (ins), ");", [(CallArgEnd (i32 1))]>;
+def CallArgEndInst0 : NVPTXInst<(outs), (ins), ")", [(CallArgEnd (i32 0))]>;
+def RETURNInst : NVPTXInst<(outs), (ins), "ret;", [(RETURNNode)]>;
+
+class CallArgInst<NVPTXRegClass regclass> :
+ NVPTXInst<(outs), (ins regclass:$a), "$a, ",
+ [(CallArg (i32 0), regclass:$a)]>;
+
+class LastCallArgInst<NVPTXRegClass regclass> :
+ NVPTXInst<(outs), (ins regclass:$a), "$a",
+ [(LastCallArg (i32 0), regclass:$a)]>;
+
+def CallArgI64 : CallArgInst<Int64Regs>;
+def CallArgI32 : CallArgInst<Int32Regs>;
+def CallArgI16 : CallArgInst<Int16Regs>;
+
+def CallArgF64 : CallArgInst<Float64Regs>;
+def CallArgF32 : CallArgInst<Float32Regs>;
+
+def LastCallArgI64 : LastCallArgInst<Int64Regs>;
+def LastCallArgI32 : LastCallArgInst<Int32Regs>;
+def LastCallArgI16 : LastCallArgInst<Int16Regs>;
+
+def LastCallArgF64 : LastCallArgInst<Float64Regs>;
+def LastCallArgF32 : LastCallArgInst<Float32Regs>;
+
+def CallArgI32imm : NVPTXInst<(outs), (ins i32imm:$a), "$a, ",
+ [(CallArg (i32 0), (i32 imm:$a))]>;
+def LastCallArgI32imm : NVPTXInst<(outs), (ins i32imm:$a), "$a",
+ [(LastCallArg (i32 0), (i32 imm:$a))]>;
+
+def CallArgParam : NVPTXInst<(outs), (ins i32imm:$a), "param$a, ",
+ [(CallArg (i32 1), (i32 imm:$a))]>;
+def LastCallArgParam : NVPTXInst<(outs), (ins i32imm:$a), "param$a",
+ [(LastCallArg (i32 1), (i32 imm:$a))]>;
+
+def CallVoidInst : NVPTXInst<(outs), (ins imem:$addr),
+ "$addr, ",
+ [(CallVoid (Wrapper tglobaladdr:$addr))]>;
+def CallVoidInstReg : NVPTXInst<(outs), (ins Int32Regs:$addr),
+ "$addr, ",
+ [(CallVoid Int32Regs:$addr)]>;
+def CallVoidInstReg64 : NVPTXInst<(outs), (ins Int64Regs:$addr),
+ "$addr, ",
+ [(CallVoid Int64Regs:$addr)]>;
+def PrototypeInst : NVPTXInst<(outs), (ins i32imm:$val),
+ ", prototype_$val;",
+ [(Prototype (i32 imm:$val))]>;
+
+def DeclareRetMemInst : NVPTXInst<(outs),
+ (ins i32imm:$align, i32imm:$size, i32imm:$num),
+ ".param .align $align .b8 retval$num[$size];",
+ [(DeclareRetParam (i32 imm:$align), (i32 imm:$size), (i32 imm:$num))]>;
+def DeclareRetScalarInst : NVPTXInst<(outs), (ins i32imm:$size, i32imm:$num),
+ ".param .b$size retval$num;",
+ [(DeclareRet (i32 1), (i32 imm:$size), (i32 imm:$num))]>;
+def DeclareRetRegInst : NVPTXInst<(outs), (ins i32imm:$size, i32imm:$num),
+ ".reg .b$size retval$num;",
+ [(DeclareRet (i32 2), (i32 imm:$size), (i32 imm:$num))]>;
+
+def DeclareParamInst : NVPTXInst<(outs),
+ (ins i32imm:$align, i32imm:$a, i32imm:$size),
+ ".param .align $align .b8 param$a[$size];",
+ [(DeclareParam (i32 imm:$align), (i32 imm:$a), (i32 imm:$size))]>;
+def DeclareScalarParamInst : NVPTXInst<(outs), (ins i32imm:$a, i32imm:$size),
+ ".param .b$size param$a;",
+ [(DeclareScalarParam (i32 imm:$a), (i32 imm:$size), (i32 0))]>;
+def DeclareScalarRegInst : NVPTXInst<(outs), (ins i32imm:$a, i32imm:$size),
+ ".reg .b$size param$a;",
+ [(DeclareScalarParam (i32 imm:$a), (i32 imm:$size), (i32 1))]>;
+
+class MoveParamInst<NVPTXRegClass regclass, string asmstr> :
+ NVPTXInst<(outs regclass:$dst), (ins regclass:$src),
+ !strconcat(!strconcat("mov", asmstr), "\t$dst, $src;"),
+ [(set regclass:$dst, (MoveParam regclass:$src))]>;
+
+def MoveParamI64 : MoveParamInst<Int64Regs, ".b64">;
+def MoveParamI32 : MoveParamInst<Int32Regs, ".b32">;
+def MoveParamI16 : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
+ "cvt.u16.u32\t$dst, $src;",
+ [(set Int16Regs:$dst, (MoveParam Int16Regs:$src))]>;
+def MoveParamF64 : MoveParamInst<Float64Regs, ".f64">;
+def MoveParamF32 : MoveParamInst<Float32Regs, ".f32">;
+
+class PseudoUseParamInst<NVPTXRegClass regclass> :
+ NVPTXInst<(outs), (ins regclass:$src),
+ "// Pseudo use of $src",
+ [(PseudoUseParam regclass:$src)]>;
+
+def PseudoUseParamI64 : PseudoUseParamInst<Int64Regs>;
+def PseudoUseParamI32 : PseudoUseParamInst<Int32Regs>;
+def PseudoUseParamI16 : PseudoUseParamInst<Int16Regs>;
+def PseudoUseParamF64 : PseudoUseParamInst<Float64Regs>;
+def PseudoUseParamF32 : PseudoUseParamInst<Float32Regs>;
+
+
+//
+// Load / Store Handling
+//
+multiclass LD<NVPTXRegClass regclass> {
+ def _avar : NVPTXInst<(outs regclass:$dst),
+ (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ i32imm:$fromWidth, imem:$addr),
+!strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+ "$fromWidth \t$dst, [$addr];"), []>;
+ def _areg : NVPTXInst<(outs regclass:$dst),
+ (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ i32imm:$fromWidth, Int32Regs:$addr),
+!strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+ "$fromWidth \t$dst, [$addr];"), []>;
+ def _areg_64 : NVPTXInst<(outs regclass:$dst),
+ (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ i32imm:$fromWidth, Int64Regs:$addr),
+ !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth",
+ " \t$dst, [$addr];"), []>;
+ def _ari : NVPTXInst<(outs regclass:$dst),
+ (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
+!strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+ "$fromWidth \t$dst, [$addr+$offset];"), []>;
+ def _ari_64 : NVPTXInst<(outs regclass:$dst),
+ (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
+ !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth",
+ " \t$dst, [$addr+$offset];"), []>;
+ def _asi : NVPTXInst<(outs regclass:$dst),
+ (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ i32imm:$fromWidth, imem:$addr, i32imm:$offset),
+!strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+ "$fromWidth \t$dst, [$addr+$offset];"), []>;
+}
+
+let mayLoad=1, neverHasSideEffects=1 in {
+defm LD_i8 : LD<Int16Regs>;
+defm LD_i16 : LD<Int16Regs>;
+defm LD_i32 : LD<Int32Regs>;
+defm LD_i64 : LD<Int64Regs>;
+defm LD_f32 : LD<Float32Regs>;
+defm LD_f64 : LD<Float64Regs>;
+}
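+
+// As a sketch of what one of these prints as once the LdStCode operands are
+// filled in: a non-volatile i32 load from the global address space comes out
+// roughly as
+//   ld.global.u32  %r1, [%rd2+8];
+// with the "global", "u" and "32" pieces supplied by $addsp, $Sign and
+// $fromWidth respectively.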
+
+multiclass ST<NVPTXRegClass regclass> {
+ def _avar : NVPTXInst<(outs),
+ (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
+ LdStCode:$Sign, i32imm:$toWidth, imem:$addr),
+!strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth",
+ " \t[$addr], $src;"), []>;
+ def _areg : NVPTXInst<(outs),
+ (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
+ LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr),
+!strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth",
+ " \t[$addr], $src;"), []>;
+ def _areg_64 : NVPTXInst<(outs),
+ (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
+ LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr),
+ !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth ",
+ "\t[$addr], $src;"), []>;
+ def _ari : NVPTXInst<(outs),
+ (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
+ LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr, i32imm:$offset),
+!strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth",
+ " \t[$addr+$offset], $src;"), []>;
+ def _ari_64 : NVPTXInst<(outs),
+ (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
+ LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr, i32imm:$offset),
+ !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth ",
+ "\t[$addr+$offset], $src;"), []>;
+ def _asi : NVPTXInst<(outs),
+ (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
+ LdStCode:$Sign, i32imm:$toWidth, imem:$addr, i32imm:$offset),
+!strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth",
+ " \t[$addr+$offset], $src;"), []>;
+}
+
+let mayStore=1, neverHasSideEffects=1 in {
+defm ST_i8 : ST<Int16Regs>;
+defm ST_i16 : ST<Int16Regs>;
+defm ST_i32 : ST<Int32Regs>;
+defm ST_i64 : ST<Int64Regs>;
+defm ST_f32 : ST<Float32Regs>;
+defm ST_f64 : ST<Float64Regs>;
+}
+
+// The following instructions are used only in and after vector elementization.
+// Vector elementization happens at the machine-instruction level, so these
+// instructions never appear in the DAG.
+multiclass LD_VEC<NVPTXRegClass regclass> {
+ def _v2_avar : NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
+ (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ i32imm:$fromWidth, imem:$addr),
+ !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+ "$fromWidth \t{{$dst1, $dst2}}, [$addr];"), []>;
+ def _v2_areg : NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
+ (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ i32imm:$fromWidth, Int32Regs:$addr),
+ !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+ "$fromWidth \t{{$dst1, $dst2}}, [$addr];"), []>;
+ def _v2_areg_64 : NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
+ (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ i32imm:$fromWidth, Int64Regs:$addr),
+ !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+ "$fromWidth \t{{$dst1, $dst2}}, [$addr];"), []>;
+ def _v2_ari : NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
+ (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
+ !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+ "$fromWidth \t{{$dst1, $dst2}}, [$addr+$offset];"), []>;
+ def _v2_ari_64 : NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
+ (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
+ !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+ "$fromWidth \t{{$dst1, $dst2}}, [$addr+$offset];"), []>;
+ def _v2_asi : NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
+ (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ i32imm:$fromWidth, imem:$addr, i32imm:$offset),
+ !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+ "$fromWidth \t{{$dst1, $dst2}}, [$addr+$offset];"), []>;
+ def _v4_avar : NVPTXInst<(outs regclass:$dst1, regclass:$dst2,
+ regclass:$dst3, regclass:$dst4),
+ (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ i32imm:$fromWidth, imem:$addr),
+ !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+ "$fromWidth \t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];"), []>;
+ def _v4_areg : NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
+ regclass:$dst4),
+ (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ i32imm:$fromWidth, Int32Regs:$addr),
+ !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+ "$fromWidth \t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];"), []>;
+ def _v4_areg_64 : NVPTXInst<(outs regclass:$dst1, regclass:$dst2,
+ regclass:$dst3, regclass:$dst4),
+ (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ i32imm:$fromWidth, Int64Regs:$addr),
+ !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+ "$fromWidth \t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];"), []>;
+ def _v4_ari : NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
+ regclass:$dst4),
+ (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
+ !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+ "$fromWidth \t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];"),
+ []>;
+ def _v4_ari_64 : NVPTXInst<(outs regclass:$dst1, regclass:$dst2,
+ regclass:$dst3, regclass:$dst4),
+ (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
+ !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+ "$fromWidth \t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];"),
+ []>;
+ def _v4_asi : NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
+ regclass:$dst4),
+ (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ i32imm:$fromWidth, imem:$addr, i32imm:$offset),
+ !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+ "$fromWidth \t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];"),
+ []>;
+}
+let mayLoad=1, neverHasSideEffects=1 in {
+defm LDV_i8 : LD_VEC<Int16Regs>;
+defm LDV_i16 : LD_VEC<Int16Regs>;
+defm LDV_i32 : LD_VEC<Int32Regs>;
+defm LDV_i64 : LD_VEC<Int64Regs>;
+defm LDV_f32 : LD_VEC<Float32Regs>;
+defm LDV_f64 : LD_VEC<Float64Regs>;
+}
+
+multiclass ST_VEC<NVPTXRegClass regclass> {
+ def _v2_avar : NVPTXInst<(outs),
+ (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
+ LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr),
+ !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+ "$fromWidth \t[$addr], {{$src1, $src2}};"), []>;
+ def _v2_areg : NVPTXInst<(outs),
+ (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
+ LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr),
+ !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+ "$fromWidth \t[$addr], {{$src1, $src2}};"), []>;
+ def _v2_areg_64 : NVPTXInst<(outs),
+ (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
+ LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr),
+ !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+ "$fromWidth \t[$addr], {{$src1, $src2}};"), []>;
+ def _v2_ari : NVPTXInst<(outs),
+ (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
+ LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr,
+ i32imm:$offset),
+ !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+ "$fromWidth \t[$addr+$offset], {{$src1, $src2}};"), []>;
+ def _v2_ari_64 : NVPTXInst<(outs),
+ (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
+ LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr,
+ i32imm:$offset),
+ !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+ "$fromWidth \t[$addr+$offset], {{$src1, $src2}};"), []>;
+ def _v2_asi : NVPTXInst<(outs),
+ (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
+ LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr,
+ i32imm:$offset),
+ !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+ "$fromWidth \t[$addr+$offset], {{$src1, $src2}};"), []>;
+ def _v4_avar : NVPTXInst<(outs),
+ (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
+ LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ i32imm:$fromWidth, imem:$addr),
+ !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+ "$fromWidth \t[$addr], {{$src1, $src2, $src3, $src4}};"), []>;
+ def _v4_areg : NVPTXInst<(outs),
+ (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
+ LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ i32imm:$fromWidth, Int32Regs:$addr),
+ !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+ "$fromWidth \t[$addr], {{$src1, $src2, $src3, $src4}};"), []>;
+ def _v4_areg_64 : NVPTXInst<(outs),
+ (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
+ LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ i32imm:$fromWidth, Int64Regs:$addr),
+ !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+ "$fromWidth \t[$addr], {{$src1, $src2, $src3, $src4}};"), []>;
+ def _v4_ari : NVPTXInst<(outs),
+ (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
+ LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
+ !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+ "$fromWidth \t[$addr+$offset], {{$src1, $src2, $src3, $src4}};"),
+ []>;
+ def _v4_ari_64 : NVPTXInst<(outs),
+ (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
+ LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
+ !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+ "$fromWidth \t[$addr+$offset], {{$src1, $src2, $src3, $src4}};"),
+ []>;
+ def _v4_asi : NVPTXInst<(outs),
+ (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
+ LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ i32imm:$fromWidth, imem:$addr, i32imm:$offset),
+ !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+ "$fromWidth \t[$addr+$offset], {{$src1, $src2, $src3, $src4}};"),
+ []>;
+}
+let mayStore=1, neverHasSideEffects=1 in {
+defm STV_i8 : ST_VEC<Int16Regs>;
+defm STV_i16 : ST_VEC<Int16Regs>;
+defm STV_i32 : ST_VEC<Int32Regs>;
+defm STV_i64 : ST_VEC<Int64Regs>;
+defm STV_f32 : ST_VEC<Float32Regs>;
+defm STV_f64 : ST_VEC<Float64Regs>;
+}
+
+
+//---- Conversion ----
+
+class F_BITCONVERT<string SzStr, NVPTXRegClass regclassIn,
+ NVPTXRegClass regclassOut> :
+ NVPTXInst<(outs regclassOut:$d), (ins regclassIn:$a),
+ !strconcat("mov.b", !strconcat(SzStr, " \t $d, $a;")),
+ [(set regclassOut:$d, (bitconvert regclassIn:$a))]>;
+
+def BITCONVERT_32_I2F : F_BITCONVERT<"32", Int32Regs, Float32Regs>;
+def BITCONVERT_32_F2I : F_BITCONVERT<"32", Float32Regs, Int32Regs>;
+def BITCONVERT_64_I2F : F_BITCONVERT<"64", Int64Regs, Float64Regs>;
+def BITCONVERT_64_F2I : F_BITCONVERT<"64", Float64Regs, Int64Regs>;
+
+// NOTE: pred->fp conversions are currently sub-optimal due to an issue in
+// TableGen where we cannot specify floating-point literals in isel patterns.
+// Therefore, we use an integer selp to select either 1 or 0 and then cvt to
+// floating-point.
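+//
+// As an illustration (register names here are made up, not from the source),
+// the i1 -> f32 pattern below should expand to roughly this PTX:
+//   selp.u32        %r1, 1, 0, %p1;
+//   cvt.rn.f32.s32  %f1, %r1;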
+
+// sint -> f32
+def : Pat<(f32 (sint_to_fp Int1Regs:$a)),
+ (CVT_f32_s32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
+def : Pat<(f32 (sint_to_fp Int16Regs:$a)),
+ (CVT_f32_s16 Int16Regs:$a, CvtRN)>;
+def : Pat<(f32 (sint_to_fp Int32Regs:$a)),
+ (CVT_f32_s32 Int32Regs:$a, CvtRN)>;
+def : Pat<(f32 (sint_to_fp Int64Regs:$a)),
+ (CVT_f32_s64 Int64Regs:$a, CvtRN)>;
+
+// uint -> f32
+def : Pat<(f32 (uint_to_fp Int1Regs:$a)),
+ (CVT_f32_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
+def : Pat<(f32 (uint_to_fp Int16Regs:$a)),
+ (CVT_f32_u16 Int16Regs:$a, CvtRN)>;
+def : Pat<(f32 (uint_to_fp Int32Regs:$a)),
+ (CVT_f32_u32 Int32Regs:$a, CvtRN)>;
+def : Pat<(f32 (uint_to_fp Int64Regs:$a)),
+ (CVT_f32_u64 Int64Regs:$a, CvtRN)>;
+
+// sint -> f64
+def : Pat<(f64 (sint_to_fp Int1Regs:$a)),
+ (CVT_f64_s32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
+def : Pat<(f64 (sint_to_fp Int16Regs:$a)),
+ (CVT_f64_s16 Int16Regs:$a, CvtRN)>;
+def : Pat<(f64 (sint_to_fp Int32Regs:$a)),
+ (CVT_f64_s32 Int32Regs:$a, CvtRN)>;
+def : Pat<(f64 (sint_to_fp Int64Regs:$a)),
+ (CVT_f64_s64 Int64Regs:$a, CvtRN)>;
+
+// uint -> f64
+def : Pat<(f64 (uint_to_fp Int1Regs:$a)),
+ (CVT_f64_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
+def : Pat<(f64 (uint_to_fp Int16Regs:$a)),
+ (CVT_f64_u16 Int16Regs:$a, CvtRN)>;
+def : Pat<(f64 (uint_to_fp Int32Regs:$a)),
+ (CVT_f64_u32 Int32Regs:$a, CvtRN)>;
+def : Pat<(f64 (uint_to_fp Int64Regs:$a)),
+ (CVT_f64_u64 Int64Regs:$a, CvtRN)>;
+
+
+// f32 -> sint
+def : Pat<(i1 (fp_to_sint Float32Regs:$a)),
+ (SETP_b32ri (BITCONVERT_32_F2I Float32Regs:$a), 0, CmpEQ)>;
+def : Pat<(i16 (fp_to_sint Float32Regs:$a)),
+ (CVT_s16_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(i16 (fp_to_sint Float32Regs:$a)),
+ (CVT_s16_f32 Float32Regs:$a, CvtRZI)>;
+def : Pat<(i32 (fp_to_sint Float32Regs:$a)),
+ (CVT_s32_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(i32 (fp_to_sint Float32Regs:$a)),
+ (CVT_s32_f32 Float32Regs:$a, CvtRZI)>;
+def : Pat<(i64 (fp_to_sint Float32Regs:$a)),
+ (CVT_s64_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(i64 (fp_to_sint Float32Regs:$a)),
+ (CVT_s64_f32 Float32Regs:$a, CvtRZI)>;
+
+// f32 -> uint
+def : Pat<(i1 (fp_to_uint Float32Regs:$a)),
+ (SETP_b32ri (BITCONVERT_32_F2I Float32Regs:$a), 0, CmpEQ)>;
+def : Pat<(i16 (fp_to_uint Float32Regs:$a)),
+ (CVT_u16_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(i16 (fp_to_uint Float32Regs:$a)),
+ (CVT_u16_f32 Float32Regs:$a, CvtRZI)>;
+def : Pat<(i32 (fp_to_uint Float32Regs:$a)),
+ (CVT_u32_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(i32 (fp_to_uint Float32Regs:$a)),
+ (CVT_u32_f32 Float32Regs:$a, CvtRZI)>;
+def : Pat<(i64 (fp_to_uint Float32Regs:$a)),
+ (CVT_u64_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(i64 (fp_to_uint Float32Regs:$a)),
+ (CVT_u64_f32 Float32Regs:$a, CvtRZI)>;
+
+// f64 -> sint
+def : Pat<(i1 (fp_to_sint Float64Regs:$a)),
+ (SETP_b64ri (BITCONVERT_64_F2I Float64Regs:$a), 0, CmpEQ)>;
+def : Pat<(i16 (fp_to_sint Float64Regs:$a)),
+ (CVT_s16_f64 Float64Regs:$a, CvtRZI)>;
+def : Pat<(i32 (fp_to_sint Float64Regs:$a)),
+ (CVT_s32_f64 Float64Regs:$a, CvtRZI)>;
+def : Pat<(i64 (fp_to_sint Float64Regs:$a)),
+ (CVT_s64_f64 Float64Regs:$a, CvtRZI)>;
+
+// f64 -> uint
+def : Pat<(i1 (fp_to_uint Float64Regs:$a)),
+ (SETP_b64ri (BITCONVERT_64_F2I Float64Regs:$a), 0, CmpEQ)>;
+def : Pat<(i16 (fp_to_uint Float64Regs:$a)),
+ (CVT_u16_f64 Float64Regs:$a, CvtRZI)>;
+def : Pat<(i32 (fp_to_uint Float64Regs:$a)),
+ (CVT_u32_f64 Float64Regs:$a, CvtRZI)>;
+def : Pat<(i64 (fp_to_uint Float64Regs:$a)),
+ (CVT_u64_f64 Float64Regs:$a, CvtRZI)>;
+
+// sext i1
+def : Pat<(i16 (sext Int1Regs:$a)),
+ (SELP_s16ii -1, 0, Int1Regs:$a)>;
+def : Pat<(i32 (sext Int1Regs:$a)),
+ (SELP_s32ii -1, 0, Int1Regs:$a)>;
+def : Pat<(i64 (sext Int1Regs:$a)),
+ (SELP_s64ii -1, 0, Int1Regs:$a)>;
+
+// zext i1
+def : Pat<(i16 (zext Int1Regs:$a)),
+ (SELP_u16ii 1, 0, Int1Regs:$a)>;
+def : Pat<(i32 (zext Int1Regs:$a)),
+ (SELP_u32ii 1, 0, Int1Regs:$a)>;
+def : Pat<(i64 (zext Int1Regs:$a)),
+ (SELP_u64ii 1, 0, Int1Regs:$a)>;
+
+// anyext i1
+def : Pat<(i16 (anyext Int1Regs:$a)),
+ (SELP_u16ii -1, 0, Int1Regs:$a)>;
+def : Pat<(i32 (anyext Int1Regs:$a)),
+ (SELP_u32ii -1, 0, Int1Regs:$a)>;
+def : Pat<(i64 (anyext Int1Regs:$a)),
+ (SELP_u64ii -1, 0, Int1Regs:$a)>;
+
+// sext i16
+def : Pat<(i32 (sext Int16Regs:$a)),
+ (CVT_s32_s16 Int16Regs:$a, CvtNONE)>;
+def : Pat<(i64 (sext Int16Regs:$a)),
+ (CVT_s64_s16 Int16Regs:$a, CvtNONE)>;
+
+// zext i16
+def : Pat<(i32 (zext Int16Regs:$a)),
+ (CVT_u32_u16 Int16Regs:$a, CvtNONE)>;
+def : Pat<(i64 (zext Int16Regs:$a)),
+ (CVT_u64_u16 Int16Regs:$a, CvtNONE)>;
+
+// anyext i16
+def : Pat<(i32 (anyext Int16Regs:$a)),
+ (CVT_u32_u16 Int16Regs:$a, CvtNONE)>;
+def : Pat<(i64 (anyext Int16Regs:$a)),
+ (CVT_u64_u16 Int16Regs:$a, CvtNONE)>;
+
+// sext i32
+def : Pat<(i64 (sext Int32Regs:$a)),
+ (CVT_s64_s32 Int32Regs:$a, CvtNONE)>;
+
+// zext i32
+def : Pat<(i64 (zext Int32Regs:$a)),
+ (CVT_u64_u32 Int32Regs:$a, CvtNONE)>;
+
+// anyext i32
+def : Pat<(i64 (anyext Int32Regs:$a)),
+ (CVT_u64_u32 Int32Regs:$a, CvtNONE)>;
+
+
+// truncate i64
+def : Pat<(i32 (trunc Int64Regs:$a)),
+ (CVT_u32_u64 Int64Regs:$a, CvtNONE)>;
+def : Pat<(i16 (trunc Int64Regs:$a)),
+ (CVT_u16_u64 Int64Regs:$a, CvtNONE)>;
+def : Pat<(i1 (trunc Int64Regs:$a)),
+ (SETP_b64ri (ANDb64ri Int64Regs:$a, 1), 1, CmpEQ)>;
+
+// truncate i32
+def : Pat<(i16 (trunc Int32Regs:$a)),
+ (CVT_u16_u32 Int32Regs:$a, CvtNONE)>;
+def : Pat<(i1 (trunc Int32Regs:$a)),
+ (SETP_b32ri (ANDb32ri Int32Regs:$a, 1), 1, CmpEQ)>;
+
+// truncate i16
+def : Pat<(i1 (trunc Int16Regs:$a)),
+ (SETP_b16ri (ANDb16ri Int16Regs:$a, 1), 1, CmpEQ)>;
+
+// sext_inreg
+def : Pat<(sext_inreg Int16Regs:$a, i8), (CVT_INREG_s16_s8 Int16Regs:$a)>;
+def : Pat<(sext_inreg Int32Regs:$a, i8), (CVT_INREG_s32_s8 Int32Regs:$a)>;
+def : Pat<(sext_inreg Int32Regs:$a, i16), (CVT_INREG_s32_s16 Int32Regs:$a)>;
+def : Pat<(sext_inreg Int64Regs:$a, i8), (CVT_INREG_s64_s8 Int64Regs:$a)>;
+def : Pat<(sext_inreg Int64Regs:$a, i16), (CVT_INREG_s64_s16 Int64Regs:$a)>;
+def : Pat<(sext_inreg Int64Regs:$a, i32), (CVT_INREG_s64_s32 Int64Regs:$a)>;
+
+
+// Select instructions with 32-bit predicates
+def : Pat<(select Int32Regs:$pred, Int16Regs:$a, Int16Regs:$b),
+ (SELP_b16rr Int16Regs:$a, Int16Regs:$b,
+ (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
+def : Pat<(select Int32Regs:$pred, Int32Regs:$a, Int32Regs:$b),
+ (SELP_b32rr Int32Regs:$a, Int32Regs:$b,
+ (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
+def : Pat<(select Int32Regs:$pred, Int64Regs:$a, Int64Regs:$b),
+ (SELP_b64rr Int64Regs:$a, Int64Regs:$b,
+ (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
+def : Pat<(select Int32Regs:$pred, Float32Regs:$a, Float32Regs:$b),
+ (SELP_f32rr Float32Regs:$a, Float32Regs:$b,
+ (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
+def : Pat<(select Int32Regs:$pred, Float64Regs:$a, Float64Regs:$b),
+ (SELP_f64rr Float64Regs:$a, Float64Regs:$b,
+ (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
+
+
+// pack a set of smaller int registers to a larger int register
+def V4I16toI64 : NVPTXInst<(outs Int64Regs:$d),
+ (ins Int16Regs:$s1, Int16Regs:$s2,
+ Int16Regs:$s3, Int16Regs:$s4),
+ "mov.b64\t$d, {{$s1, $s2, $s3, $s4}};",
+ []>;
+def V2I16toI32 : NVPTXInst<(outs Int32Regs:$d),
+ (ins Int16Regs:$s1, Int16Regs:$s2),
+ "mov.b32\t$d, {{$s1, $s2}};",
+ []>;
+def V2I32toI64 : NVPTXInst<(outs Int64Regs:$d),
+ (ins Int32Regs:$s1, Int32Regs:$s2),
+ "mov.b64\t$d, {{$s1, $s2}};",
+ []>;
+def V2F32toF64 : NVPTXInst<(outs Float64Regs:$d),
+ (ins Float32Regs:$s1, Float32Regs:$s2),
+ "mov.b64\t$d, {{$s1, $s2}};",
+ []>;
+
+// unpack a larger int register to a set of smaller int registers
+def I64toV4I16 : NVPTXInst<(outs Int16Regs:$d1, Int16Regs:$d2,
+ Int16Regs:$d3, Int16Regs:$d4),
+ (ins Int64Regs:$s),
+ "mov.b64\t{{$d1, $d2, $d3, $d4}}, $s;",
+ []>;
+def I32toV2I16 : NVPTXInst<(outs Int16Regs:$d1, Int16Regs:$d2),
+ (ins Int32Regs:$s),
+ "mov.b32\t{{$d1, $d2}}, $s;",
+ []>;
+def I64toV2I32 : NVPTXInst<(outs Int32Regs:$d1, Int32Regs:$d2),
+ (ins Int64Regs:$s),
+ "mov.b64\t{{$d1, $d2}}, $s;",
+ []>;
+def F64toV2F32 : NVPTXInst<(outs Float32Regs:$d1, Float32Regs:$d2),
+ (ins Float64Regs:$s),
+ "mov.b64\t{{$d1, $d2}}, $s;",
+ []>;
+
+// Count leading zeros
+def CLZr32 : NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a),
+ "clz.b32\t$d, $a;",
+ []>;
+def CLZr64 : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
+ "clz.b64\t$d, $a;",
+ []>;
+
+// 32-bit has a direct PTX instruction
+def : Pat<(ctlz Int32Regs:$a),
+ (CLZr32 Int32Regs:$a)>;
+def : Pat<(ctlz_zero_undef Int32Regs:$a),
+ (CLZr32 Int32Regs:$a)>;
+
+// For 64-bit, the result in PTX is actually 32-bit, so we zero-extend it to
+// 64 bits to match the LLVM semantics.
+def : Pat<(ctlz Int64Regs:$a),
+ (CVT_u64_u32 (CLZr64 Int64Regs:$a), CvtNONE)>;
+def : Pat<(ctlz_zero_undef Int64Regs:$a),
+ (CVT_u64_u32 (CLZr64 Int64Regs:$a), CvtNONE)>;
+
+// For 16-bit, we zero-extend to 32 bits, then truncate the result back to
+// 16 bits (the ctlz of a 16-bit value is guaranteed to require fewer than
+// 16 bits to store). We also need to subtract 16 because the high-order
+// 16 zeros of the extended value were counted as well.
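+// For example, with $a = 0x00F0 the zero-extended value is 0x000000F0,
+// clz.b32 returns 24, and 24 - 16 = 8, which is ctlz of the 16-bit value.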
+def : Pat<(ctlz Int16Regs:$a),
+ (SUBi16ri (CVT_u16_u32 (CLZr32
+ (CVT_u32_u16 Int16Regs:$a, CvtNONE)),
+ CvtNONE), 16)>;
+def : Pat<(ctlz_zero_undef Int16Regs:$a),
+ (SUBi16ri (CVT_u16_u32 (CLZr32
+ (CVT_u32_u16 Int16Regs:$a, CvtNONE)),
+ CvtNONE), 16)>;
+
+// Population count
+def POPCr32 : NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a),
+ "popc.b32\t$d, $a;",
+ []>;
+def POPCr64 : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
+ "popc.b64\t$d, $a;",
+ []>;
+
+// 32-bit has a direct PTX instruction
+def : Pat<(ctpop Int32Regs:$a),
+ (POPCr32 Int32Regs:$a)>;
+
+// For 64-bit, the result in PTX is actually 32-bit, so we zero-extend it to
+// 64 bits to match the LLVM semantics.
+def : Pat<(ctpop Int64Regs:$a),
+ (CVT_u64_u32 (POPCr64 Int64Regs:$a), CvtNONE)>;
+
+// For 16-bit, we zero-extend to 32 bits, then truncate the result back to
+// 16 bits (the ctpop of a 16-bit value is guaranteed to require fewer than
+// 16 bits to store).
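+// For example, with $a = 0xBEEF: zero-extension adds no set bits, so
+// popc.b32 on 0x0000BEEF returns 13, the same as ctpop of the 16-bit value.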
+def : Pat<(ctpop Int16Regs:$a),
+ (CVT_u16_u32 (POPCr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)),
+ CvtNONE)>;
+
+// fround f64 -> f32
+def : Pat<(f32 (fround Float64Regs:$a)),
+ (CVT_f32_f64 Float64Regs:$a, CvtRN_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(f32 (fround Float64Regs:$a)),
+ (CVT_f32_f64 Float64Regs:$a, CvtRN)>;
+
+// fextend f32 -> f64
+def : Pat<(f64 (fextend Float32Regs:$a)),
+ (CVT_f64_f32 Float32Regs:$a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(f64 (fextend Float32Regs:$a)),
+ (CVT_f64_f32 Float32Regs:$a, CvtNONE)>;
+
+def retflag : SDNode<"NVPTXISD::RET_FLAG", SDTNone,
+ [SDNPHasChain, SDNPOptInGlue]>;
+
+//-----------------------------------
+// Control-flow
+//-----------------------------------
+
+let isTerminator=1 in {
+ let isReturn=1, isBarrier=1 in
+ def Return : NVPTXInst<(outs), (ins), "ret;", [(retflag)]>;
+
+ let isBranch=1 in
+ def CBranch : NVPTXInst<(outs), (ins Int1Regs:$a, brtarget:$target),
+ "@$a bra \t$target;",
+ [(brcond Int1Regs:$a, bb:$target)]>;
+ let isBranch=1 in
+ def CBranchOther : NVPTXInst<(outs), (ins Int1Regs:$a, brtarget:$target),
+ "@!$a bra \t$target;",
+ []>;
+
+ let isBranch=1, isBarrier=1 in
+ def GOTO : NVPTXInst<(outs), (ins brtarget:$target),
+ "bra.uni \t$target;",
+ [(br bb:$target)]>;
+}
+
+def : Pat<(brcond Int32Regs:$a, bb:$target),
+ (CBranch (SETP_u32ri Int32Regs:$a, 0, CmpNE), bb:$target)>;
+
+// SelectionDAGBuilder::visitSwitchCase() will invert the condition of a
+// conditional branch if the target block is the next block, so that the code
+// can fall through to the target block. The inversion is done by
+// 'xor condition, 1', which will be translated to (setne condition, -1).
+// Since PTX supports '@!pred bra target', we should use it.
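+//
+// As a sketch of the intended lowering (the IR and register names below are
+// illustrative only): for
+//   %c = xor i1 %p, true
+//   br i1 %c, label %target, label %next
+// the xor is turned into (setne %p, -1), the pattern below matches it, and
+// the branch is emitted as '@!%p bra target;' instead of materializing the
+// inverted predicate.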
+def : Pat<(brcond (i1 (setne Int1Regs:$a, -1)), bb:$target),
+ (CBranchOther Int1Regs:$a, bb:$target)>;
+
+// Call
+def SDT_NVPTXCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>;
+def SDT_NVPTXCallSeqEnd : SDCallSeqEnd<[ SDTCisVT<0, i32>,
+ SDTCisVT<1, i32> ]>;
+
+def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_NVPTXCallSeqStart,
+ [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>;
+def callseq_end : SDNode<"ISD::CALLSEQ_END", SDT_NVPTXCallSeqEnd,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+ SDNPSideEffect]>;
+
+def SDT_NVPTXCall : SDTypeProfile<0, 1, [SDTCisVT<0, i32>]>;
+def call : SDNode<"NVPTXISD::CALL", SDT_NVPTXCall,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+def calltarget : Operand<i32>;
+let isCall=1 in {
+ def CALL : NVPTXInst<(outs), (ins calltarget:$dst),
+ "call \t$dst, (1);", []>;
+}
+
+def : Pat<(call tglobaladdr:$dst),
+ (CALL tglobaladdr:$dst)>;
+def : Pat<(call texternalsym:$dst),
+ (CALL texternalsym:$dst)>;
+
+// Pseudo instructions.
+class Pseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : NVPTXInst<outs, ins, asmstr, pattern>;
+
+// @TODO: We use some tricks here to emit curly braces. Can we clean this up
+// a bit without TableGen modifications?
+def Callseq_Start : NVPTXInst<(outs), (ins i32imm:$amt),
+ "// Callseq Start $amt\n\t{{\n\t.reg .b32 temp_param_reg;\n\t// <end>}}",
+ [(callseq_start timm:$amt)]>;
+def Callseq_End : NVPTXInst<(outs), (ins i32imm:$amt1, i32imm:$amt2),
+ "\n\t//{{\n\t}}// Callseq End $amt1",
+ [(callseq_end timm:$amt1, timm:$amt2)]>;
+
+// trap instruction
+
+def trapinst : NVPTXInst<(outs), (ins),
+ "trap;",
+ [(trap)]>;
+
+// Call prototype wrapper
+def SDTCallPrototype : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+def CallPrototype
+ : SDNode<"NVPTXISD::CallPrototype", SDTCallPrototype,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def ProtoIdent : Operand<i32> {
+ let PrintMethod = "printProtoIdent";
+}
+def CALL_PROTOTYPE
+ : NVPTXInst<(outs), (ins ProtoIdent:$ident),
+ "$ident", [(CallPrototype (i32 texternalsym:$ident))]>;
+
+
+
+include "NVPTXIntrinsics.td"
+
+
+//-----------------------------------
+// Notes
+//-----------------------------------
+// BSWAP is currently expanded. A more efficient lowering would be:
+// - for < sm_20, use vector/scalar mov, since Tesla supports native 16-bit
+//   registers
+// - for sm_20, use prmt (with vector/scalar mov to do the pack and unpack);
+//   sm_20 supports native 32-bit registers, but not native 16-bit registers.
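+//
+// For illustration only (assuming sm_20+; register names are made up), a
+// 32-bit byte swap could in principle be a single permute:
+//   prmt.b32 %r1, %r0, 0, 0x0123;  // %r1 = bytes of %r0 reversed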
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/contrib/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
new file mode 100644
index 000000000000..14e51aa309ea
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -0,0 +1,7047 @@
+//===- NVPTXIntrinsics.td - PTX Intrinsics Instructions -------*- tblgen -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+def immFloat0 : PatLeaf<(fpimm), [{
+ float f = (float)N->getValueAPF().convertToFloat();
+ return (f==0.0f);
+}]>;
+
+def immFloat1 : PatLeaf<(fpimm), [{
+ float f = (float)N->getValueAPF().convertToFloat();
+ return (f==1.0f);
+}]>;
+
+def immDouble0 : PatLeaf<(fpimm), [{
+ double d = (double)N->getValueAPF().convertToDouble();
+ return (d==0.0);
+}]>;
+
+def immDouble1 : PatLeaf<(fpimm), [{
+ double d = (double)N->getValueAPF().convertToDouble();
+ return (d==1.0);
+}]>;
+
+
+
+//-----------------------------------
+// Synchronization Functions
+//-----------------------------------
+def INT_CUDA_SYNCTHREADS : NVPTXInst<(outs), (ins),
+ "bar.sync \t0;",
+ [(int_cuda_syncthreads)]>;
+def INT_BARRIER0 : NVPTXInst<(outs), (ins),
+ "bar.sync \t0;",
+ [(int_nvvm_barrier0)]>;
+def INT_BARRIER0_POPC : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$pred),
+ !strconcat("{{ \n\t",
+ !strconcat(".reg .pred \t%p1; \n\t",
+ !strconcat("setp.ne.u32 \t%p1, $pred, 0; \n\t",
+ !strconcat("bar.red.popc.u32 \t$dst, 0, %p1; \n\t",
+ !strconcat("}}", ""))))),
+ [(set Int32Regs:$dst, (int_nvvm_barrier0_popc Int32Regs:$pred))]>;
+def INT_BARRIER0_AND : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$pred),
+ !strconcat("{{ \n\t",
+ !strconcat(".reg .pred \t%p1; \n\t",
+ !strconcat(".reg .pred \t%p2; \n\t",
+ !strconcat("setp.ne.u32 \t%p1, $pred, 0; \n\t",
+ !strconcat("bar.red.and.pred \t%p2, 0, %p1; \n\t",
+ !strconcat("selp.u32 \t$dst, 1, 0, %p2; \n\t",
+ !strconcat("}}", ""))))))),
+ [(set Int32Regs:$dst, (int_nvvm_barrier0_and Int32Regs:$pred))]>;
+def INT_BARRIER0_OR : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$pred),
+ !strconcat("{{ \n\t",
+ !strconcat(".reg .pred \t%p1; \n\t",
+ !strconcat(".reg .pred \t%p2; \n\t",
+ !strconcat("setp.ne.u32 \t%p1, $pred, 0; \n\t",
+ !strconcat("bar.red.or.pred \t%p2, 0, %p1; \n\t",
+ !strconcat("selp.u32 \t$dst, 1, 0, %p2; \n\t",
+ !strconcat("}}", ""))))))),
+ [(set Int32Regs:$dst, (int_nvvm_barrier0_or Int32Regs:$pred))]>;
+
+
+//-----------------------------------
+// Explicit Memory Fence Functions
+//-----------------------------------
+class MEMBAR<string StrOp, Intrinsic IntOP> :
+ NVPTXInst<(outs), (ins),
+ StrOp, [(IntOP)]>;
+
+def INT_MEMBAR_CTA : MEMBAR<"membar.cta;", int_nvvm_membar_cta>;
+def INT_MEMBAR_GL : MEMBAR<"membar.gl;", int_nvvm_membar_gl>;
+def INT_MEMBAR_SYS : MEMBAR<"membar.sys;", int_nvvm_membar_sys>;
+
+
+//-----------------------------------
+// Math Functions
+//-----------------------------------
+
+// Map min(1.0, max(0.0, x)) to sat(x).
+// Note that max(0.0, min(x, 1.0)) cannot be mapped to sat(x) because when x
+// is NaN, max(0.0, min(x, 1.0)) is 1.0 while sat(x) is 0.
+// The same applies to fmax, fmin.
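+// Concretely, with x = NaN: PTX min returns the non-NaN operand, so
+// min(NaN, 1.0) is 1.0 and max(0.0, 1.0) is 1.0, whereas a cvt with .sat
+// flushes NaN to +0.0.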
+
+def : Pat<(int_nvvm_fmin_f immFloat1,
+ (int_nvvm_fmax_f immFloat0, Float32Regs:$a)),
+ (CVT_f32_f32 Float32Regs:$a, CvtSAT)>;
+def : Pat<(int_nvvm_fmin_f immFloat1,
+ (int_nvvm_fmax_f Float32Regs:$a, immFloat0)),
+ (CVT_f32_f32 Float32Regs:$a, CvtSAT)>;
+def : Pat<(int_nvvm_fmin_f
+ (int_nvvm_fmax_f immFloat0, Float32Regs:$a), immFloat1),
+ (CVT_f32_f32 Float32Regs:$a, CvtSAT)>;
+def : Pat<(int_nvvm_fmin_f
+ (int_nvvm_fmax_f Float32Regs:$a, immFloat0), immFloat1),
+ (CVT_f32_f32 Float32Regs:$a, CvtSAT)>;
+
+def : Pat<(int_nvvm_fmin_d immDouble1,
+ (int_nvvm_fmax_d immDouble0, Float64Regs:$a)),
+ (CVT_f64_f64 Float64Regs:$a, CvtSAT)>;
+def : Pat<(int_nvvm_fmin_d immDouble1,
+ (int_nvvm_fmax_d Float64Regs:$a, immDouble0)),
+ (CVT_f64_f64 Float64Regs:$a, CvtSAT)>;
+def : Pat<(int_nvvm_fmin_d
+ (int_nvvm_fmax_d immDouble0, Float64Regs:$a), immDouble1),
+ (CVT_f64_f64 Float64Regs:$a, CvtSAT)>;
+def : Pat<(int_nvvm_fmin_d
+ (int_nvvm_fmax_d Float64Regs:$a, immDouble0), immDouble1),
+ (CVT_f64_f64 Float64Regs:$a, CvtSAT)>;
+
+
+// We need a full string for OpcStr here because we need to deal with cases
+// like INT_PTX_RECIP.
+class F_MATH_1<string OpcStr, NVPTXRegClass target_regclass,
+ NVPTXRegClass src_regclass, Intrinsic IntOP>
+ : NVPTXInst<(outs target_regclass:$dst), (ins src_regclass:$src0),
+ OpcStr,
+ [(set target_regclass:$dst, (IntOP src_regclass:$src0))]>;
+
+// We need a full string for OpcStr here because we need to deal with cases
+// like INT_PTX_NATIVE_POWR_F.
+class F_MATH_2<string OpcStr, NVPTXRegClass t_regclass,
+ NVPTXRegClass s0_regclass, NVPTXRegClass s1_regclass, Intrinsic IntOP>
+ : NVPTXInst<(outs t_regclass:$dst),
+ (ins s0_regclass:$src0, s1_regclass:$src1),
+ OpcStr,
+ [(set t_regclass:$dst, (IntOP s0_regclass:$src0, s1_regclass:$src1))]>;
+
+class F_MATH_3<string OpcStr, NVPTXRegClass t_regclass,
+ NVPTXRegClass s0_regclass, NVPTXRegClass s1_regclass,
+ NVPTXRegClass s2_regclass, Intrinsic IntOP>
+ : NVPTXInst<(outs t_regclass:$dst),
+ (ins s0_regclass:$src0, s1_regclass:$src1, s2_regclass:$src2),
+ OpcStr,
+ [(set t_regclass:$dst,
+ (IntOP s0_regclass:$src0, s1_regclass:$src1, s2_regclass:$src2))]>;
+
+//
+// MISC
+//
+
+def INT_NVVM_CLZ_I : F_MATH_1<"clz.b32 \t$dst, $src0;", Int32Regs, Int32Regs,
+ int_nvvm_clz_i>;
+def INT_NVVM_CLZ_LL : F_MATH_1<"clz.b64 \t$dst, $src0;", Int32Regs, Int64Regs,
+ int_nvvm_clz_ll>;
+
+def INT_NVVM_POPC_I : F_MATH_1<"popc.b32 \t$dst, $src0;", Int32Regs, Int32Regs,
+ int_nvvm_popc_i>;
+def INT_NVVM_POPC_LL : F_MATH_1<"popc.b64 \t$dst, $src0;", Int32Regs, Int64Regs,
+ int_nvvm_popc_ll>;
+
+def INT_NVVM_PRMT : F_MATH_3<"prmt.b32 \t$dst, $src0, $src1, $src2;", Int32Regs,
+ Int32Regs, Int32Regs, Int32Regs, int_nvvm_prmt>;
+
+//
+// Min Max
+//
+
+def INT_NVVM_MIN_I : F_MATH_2<"min.s32 \t$dst, $src0, $src1;", Int32Regs,
+ Int32Regs, Int32Regs, int_nvvm_min_i>;
+def INT_NVVM_MIN_UI : F_MATH_2<"min.u32 \t$dst, $src0, $src1;", Int32Regs,
+ Int32Regs, Int32Regs, int_nvvm_min_ui>;
+
+def INT_NVVM_MIN_LL : F_MATH_2<"min.s64 \t$dst, $src0, $src1;", Int64Regs,
+ Int64Regs, Int64Regs, int_nvvm_min_ll>;
+def INT_NVVM_MIN_ULL : F_MATH_2<"min.u64 \t$dst, $src0, $src1;", Int64Regs,
+ Int64Regs, Int64Regs, int_nvvm_min_ull>;
+
+def INT_NVVM_MAX_I : F_MATH_2<"max.s32 \t$dst, $src0, $src1;", Int32Regs,
+ Int32Regs, Int32Regs, int_nvvm_max_i>;
+def INT_NVVM_MAX_UI : F_MATH_2<"max.u32 \t$dst, $src0, $src1;", Int32Regs,
+ Int32Regs, Int32Regs, int_nvvm_max_ui>;
+
+def INT_NVVM_MAX_LL : F_MATH_2<"max.s64 \t$dst, $src0, $src1;", Int64Regs,
+ Int64Regs, Int64Regs, int_nvvm_max_ll>;
+def INT_NVVM_MAX_ULL : F_MATH_2<"max.u64 \t$dst, $src0, $src1;", Int64Regs,
+ Int64Regs, Int64Regs, int_nvvm_max_ull>;
+
+def INT_NVVM_FMIN_F : F_MATH_2<"min.f32 \t$dst, $src0, $src1;", Float32Regs,
+ Float32Regs, Float32Regs, int_nvvm_fmin_f>;
+def INT_NVVM_FMIN_FTZ_F : F_MATH_2<"min.ftz.f32 \t$dst, $src0, $src1;",
+ Float32Regs, Float32Regs, Float32Regs, int_nvvm_fmin_ftz_f>;
+
+def INT_NVVM_FMAX_F : F_MATH_2<"max.f32 \t$dst, $src0, $src1;", Float32Regs,
+ Float32Regs, Float32Regs, int_nvvm_fmax_f>;
+def INT_NVVM_FMAX_FTZ_F : F_MATH_2<"max.ftz.f32 \t$dst, $src0, $src1;",
+ Float32Regs, Float32Regs, Float32Regs, int_nvvm_fmax_ftz_f>;
+
+def INT_NVVM_FMIN_D : F_MATH_2<"min.f64 \t$dst, $src0, $src1;", Float64Regs,
+ Float64Regs, Float64Regs, int_nvvm_fmin_d>;
+def INT_NVVM_FMAX_D : F_MATH_2<"max.f64 \t$dst, $src0, $src1;", Float64Regs,
+ Float64Regs, Float64Regs, int_nvvm_fmax_d>;
+
+//
+// Multiplication
+//
+
+def INT_NVVM_MULHI_I : F_MATH_2<"mul.hi.s32 \t$dst, $src0, $src1;", Int32Regs,
+ Int32Regs, Int32Regs, int_nvvm_mulhi_i>;
+def INT_NVVM_MULHI_UI : F_MATH_2<"mul.hi.u32 \t$dst, $src0, $src1;", Int32Regs,
+ Int32Regs, Int32Regs, int_nvvm_mulhi_ui>;
+
+def INT_NVVM_MULHI_LL : F_MATH_2<"mul.hi.s64 \t$dst, $src0, $src1;", Int64Regs,
+ Int64Regs, Int64Regs, int_nvvm_mulhi_ll>;
+def INT_NVVM_MULHI_ULL : F_MATH_2<"mul.hi.u64 \t$dst, $src0, $src1;", Int64Regs,
+ Int64Regs, Int64Regs, int_nvvm_mulhi_ull>;
+
+def INT_NVVM_MUL_RN_FTZ_F : F_MATH_2<"mul.rn.ftz.f32 \t$dst, $src0, $src1;",
+ Float32Regs, Float32Regs, Float32Regs, int_nvvm_mul_rn_ftz_f>;
+def INT_NVVM_MUL_RN_F : F_MATH_2<"mul.rn.f32 \t$dst, $src0, $src1;",
+ Float32Regs, Float32Regs, Float32Regs, int_nvvm_mul_rn_f>;
+def INT_NVVM_MUL_RZ_FTZ_F : F_MATH_2<"mul.rz.ftz.f32 \t$dst, $src0, $src1;",
+ Float32Regs, Float32Regs, Float32Regs, int_nvvm_mul_rz_ftz_f>;
+def INT_NVVM_MUL_RZ_F : F_MATH_2<"mul.rz.f32 \t$dst, $src0, $src1;",
+ Float32Regs, Float32Regs, Float32Regs, int_nvvm_mul_rz_f>;
+def INT_NVVM_MUL_RM_FTZ_F : F_MATH_2<"mul.rm.ftz.f32 \t$dst, $src0, $src1;",
+ Float32Regs, Float32Regs, Float32Regs, int_nvvm_mul_rm_ftz_f>;
+def INT_NVVM_MUL_RM_F : F_MATH_2<"mul.rm.f32 \t$dst, $src0, $src1;",
+ Float32Regs, Float32Regs, Float32Regs, int_nvvm_mul_rm_f>;
+def INT_NVVM_MUL_RP_FTZ_F : F_MATH_2<"mul.rp.ftz.f32 \t$dst, $src0, $src1;",
+ Float32Regs, Float32Regs, Float32Regs, int_nvvm_mul_rp_ftz_f>;
+def INT_NVVM_MUL_RP_F : F_MATH_2<"mul.rp.f32 \t$dst, $src0, $src1;",
+ Float32Regs, Float32Regs, Float32Regs, int_nvvm_mul_rp_f>;
+
+def INT_NVVM_MUL_RN_D : F_MATH_2<"mul.rn.f64 \t$dst, $src0, $src1;",
+ Float64Regs, Float64Regs, Float64Regs, int_nvvm_mul_rn_d>;
+def INT_NVVM_MUL_RZ_D : F_MATH_2<"mul.rz.f64 \t$dst, $src0, $src1;",
+ Float64Regs, Float64Regs, Float64Regs, int_nvvm_mul_rz_d>;
+def INT_NVVM_MUL_RM_D : F_MATH_2<"mul.rm.f64 \t$dst, $src0, $src1;",
+ Float64Regs, Float64Regs, Float64Regs, int_nvvm_mul_rm_d>;
+def INT_NVVM_MUL_RP_D : F_MATH_2<"mul.rp.f64 \t$dst, $src0, $src1;",
+ Float64Regs, Float64Regs, Float64Regs, int_nvvm_mul_rp_d>;
+
+def INT_NVVM_MUL24_I : F_MATH_2<"mul24.lo.s32 \t$dst, $src0, $src1;",
+ Int32Regs, Int32Regs, Int32Regs, int_nvvm_mul24_i>;
+def INT_NVVM_MUL24_UI : F_MATH_2<"mul24.lo.u32 \t$dst, $src0, $src1;",
+ Int32Regs, Int32Regs, Int32Regs, int_nvvm_mul24_ui>;
+
+//
+// Div
+//
+
+def INT_NVVM_DIV_APPROX_FTZ_F
+ : F_MATH_2<"div.approx.ftz.f32 \t$dst, $src0, $src1;", Float32Regs,
+ Float32Regs, Float32Regs, int_nvvm_div_approx_ftz_f>;
+def INT_NVVM_DIV_APPROX_F : F_MATH_2<"div.approx.f32 \t$dst, $src0, $src1;",
+ Float32Regs, Float32Regs, Float32Regs, int_nvvm_div_approx_f>;
+
+def INT_NVVM_DIV_RN_FTZ_F : F_MATH_2<"div.rn.ftz.f32 \t$dst, $src0, $src1;",
+ Float32Regs, Float32Regs, Float32Regs, int_nvvm_div_rn_ftz_f>;
+def INT_NVVM_DIV_RN_F : F_MATH_2<"div.rn.f32 \t$dst, $src0, $src1;",
+ Float32Regs, Float32Regs, Float32Regs, int_nvvm_div_rn_f>;
+def INT_NVVM_DIV_RZ_FTZ_F : F_MATH_2<"div.rz.ftz.f32 \t$dst, $src0, $src1;",
+ Float32Regs, Float32Regs, Float32Regs, int_nvvm_div_rz_ftz_f>;
+def INT_NVVM_DIV_RZ_F : F_MATH_2<"div.rz.f32 \t$dst, $src0, $src1;",
+ Float32Regs, Float32Regs, Float32Regs, int_nvvm_div_rz_f>;
+def INT_NVVM_DIV_RM_FTZ_F : F_MATH_2<"div.rm.ftz.f32 \t$dst, $src0, $src1;",
+ Float32Regs, Float32Regs, Float32Regs, int_nvvm_div_rm_ftz_f>;
+def INT_NVVM_DIV_RM_F : F_MATH_2<"div.rm.f32 \t$dst, $src0, $src1;",
+ Float32Regs, Float32Regs, Float32Regs, int_nvvm_div_rm_f>;
+def INT_NVVM_DIV_RP_FTZ_F : F_MATH_2<"div.rp.ftz.f32 \t$dst, $src0, $src1;",
+ Float32Regs, Float32Regs, Float32Regs, int_nvvm_div_rp_ftz_f>;
+def INT_NVVM_DIV_RP_F : F_MATH_2<"div.rp.f32 \t$dst, $src0, $src1;",
+ Float32Regs, Float32Regs, Float32Regs, int_nvvm_div_rp_f>;
+
+def INT_NVVM_DIV_RN_D : F_MATH_2<"div.rn.f64 \t$dst, $src0, $src1;",
+ Float64Regs, Float64Regs, Float64Regs, int_nvvm_div_rn_d>;
+def INT_NVVM_DIV_RZ_D : F_MATH_2<"div.rz.f64 \t$dst, $src0, $src1;",
+ Float64Regs, Float64Regs, Float64Regs, int_nvvm_div_rz_d>;
+def INT_NVVM_DIV_RM_D : F_MATH_2<"div.rm.f64 \t$dst, $src0, $src1;",
+ Float64Regs, Float64Regs, Float64Regs, int_nvvm_div_rm_d>;
+def INT_NVVM_DIV_RP_D : F_MATH_2<"div.rp.f64 \t$dst, $src0, $src1;",
+ Float64Regs, Float64Regs, Float64Regs, int_nvvm_div_rp_d>;
+
+//
+// Brev
+//
+
+def INT_NVVM_BREV32 : F_MATH_1<"brev.b32 \t$dst, $src0;", Int32Regs, Int32Regs,
+ int_nvvm_brev32>;
+def INT_NVVM_BREV64 : F_MATH_1<"brev.b64 \t$dst, $src0;", Int64Regs, Int64Regs,
+ int_nvvm_brev64>;
+
+//
+// Sad
+//
+
+def INT_NVVM_SAD_I : F_MATH_3<"sad.s32 \t$dst, $src0, $src1, $src2;",
+ Int32Regs, Int32Regs, Int32Regs, Int32Regs, int_nvvm_sad_i>;
+def INT_NVVM_SAD_UI : F_MATH_3<"sad.u32 \t$dst, $src0, $src1, $src2;",
+ Int32Regs, Int32Regs, Int32Regs, Int32Regs, int_nvvm_sad_ui>;
+
+//
+// Floor Ceil
+//
+
+def : Pat<(int_nvvm_floor_ftz_f Float32Regs:$a),
+ (CVT_f32_f32 Float32Regs:$a, CvtRMI_FTZ)>;
+def : Pat<(int_nvvm_floor_f Float32Regs:$a),
+ (CVT_f32_f32 Float32Regs:$a, CvtRMI)>;
+def : Pat<(int_nvvm_floor_d Float64Regs:$a),
+ (CVT_f64_f64 Float64Regs:$a, CvtRMI)>;
+
+def : Pat<(int_nvvm_ceil_ftz_f Float32Regs:$a),
+ (CVT_f32_f32 Float32Regs:$a, CvtRPI_FTZ)>;
+def : Pat<(int_nvvm_ceil_f Float32Regs:$a),
+ (CVT_f32_f32 Float32Regs:$a, CvtRPI)>;
+def : Pat<(int_nvvm_ceil_d Float64Regs:$a),
+ (CVT_f64_f64 Float64Regs:$a, CvtRPI)>;
+
+//
+// Abs
+//
+
+def INT_NVVM_ABS_I : F_MATH_1<"abs.s32 \t$dst, $src0;", Int32Regs, Int32Regs,
+ int_nvvm_abs_i>;
+def INT_NVVM_ABS_LL : F_MATH_1<"abs.s64 \t$dst, $src0;", Int64Regs, Int64Regs,
+ int_nvvm_abs_ll>;
+
+def INT_NVVM_FABS_FTZ_F : F_MATH_1<"abs.ftz.f32 \t$dst, $src0;", Float32Regs,
+ Float32Regs, int_nvvm_fabs_ftz_f>;
+def INT_NVVM_FABS_F : F_MATH_1<"abs.f32 \t$dst, $src0;", Float32Regs,
+ Float32Regs, int_nvvm_fabs_f>;
+
+def INT_NVVM_FABS_D : F_MATH_1<"abs.f64 \t$dst, $src0;", Float64Regs,
+ Float64Regs, int_nvvm_fabs_d>;
+
+//
+// Round
+//
+
+def : Pat<(int_nvvm_round_ftz_f Float32Regs:$a),
+ (CVT_f32_f32 Float32Regs:$a, CvtRNI_FTZ)>;
+def : Pat<(int_nvvm_round_f Float32Regs:$a),
+ (CVT_f32_f32 Float32Regs:$a, CvtRNI)>;
+def : Pat<(int_nvvm_round_d Float64Regs:$a),
+ (CVT_f64_f64 Float64Regs:$a, CvtRNI)>;
+
+//
+// Trunc
+//
+
+def : Pat<(int_nvvm_trunc_ftz_f Float32Regs:$a),
+ (CVT_f32_f32 Float32Regs:$a, CvtRZI_FTZ)>;
+def : Pat<(int_nvvm_trunc_f Float32Regs:$a),
+ (CVT_f32_f32 Float32Regs:$a, CvtRZI)>;
+def : Pat<(int_nvvm_trunc_d Float64Regs:$a),
+ (CVT_f64_f64 Float64Regs:$a, CvtRZI)>;
+
+//
+// Saturate
+//
+
+def : Pat<(int_nvvm_saturate_ftz_f Float32Regs:$a),
+ (CVT_f32_f32 Float32Regs:$a, CvtSAT_FTZ)>;
+def : Pat<(int_nvvm_saturate_f Float32Regs:$a),
+ (CVT_f32_f32 Float32Regs:$a, CvtSAT)>;
+def : Pat<(int_nvvm_saturate_d Float64Regs:$a),
+ (CVT_f64_f64 Float64Regs:$a, CvtSAT)>;
+
+//
+// Exp2 Log2
+//
+
+def INT_NVVM_EX2_APPROX_FTZ_F : F_MATH_1<"ex2.approx.ftz.f32 \t$dst, $src0;",
+ Float32Regs, Float32Regs, int_nvvm_ex2_approx_ftz_f>;
+def INT_NVVM_EX2_APPROX_F : F_MATH_1<"ex2.approx.f32 \t$dst, $src0;",
+ Float32Regs, Float32Regs, int_nvvm_ex2_approx_f>;
+def INT_NVVM_EX2_APPROX_D : F_MATH_1<"ex2.approx.f64 \t$dst, $src0;",
+ Float64Regs, Float64Regs, int_nvvm_ex2_approx_d>;
+
+def INT_NVVM_LG2_APPROX_FTZ_F : F_MATH_1<"lg2.approx.ftz.f32 \t$dst, $src0;",
+ Float32Regs, Float32Regs, int_nvvm_lg2_approx_ftz_f>;
+def INT_NVVM_LG2_APPROX_F : F_MATH_1<"lg2.approx.f32 \t$dst, $src0;",
+ Float32Regs, Float32Regs, int_nvvm_lg2_approx_f>;
+def INT_NVVM_LG2_APPROX_D : F_MATH_1<"lg2.approx.f64 \t$dst, $src0;",
+ Float64Regs, Float64Regs, int_nvvm_lg2_approx_d>;
+
+//
+// Sin Cos
+//
+
+def INT_NVVM_SIN_APPROX_FTZ_F : F_MATH_1<"sin.approx.ftz.f32 \t$dst, $src0;",
+ Float32Regs, Float32Regs, int_nvvm_sin_approx_ftz_f>;
+def INT_NVVM_SIN_APPROX_F : F_MATH_1<"sin.approx.f32 \t$dst, $src0;",
+ Float32Regs, Float32Regs, int_nvvm_sin_approx_f>;
+
+def INT_NVVM_COS_APPROX_FTZ_F : F_MATH_1<"cos.approx.ftz.f32 \t$dst, $src0;",
+ Float32Regs, Float32Regs, int_nvvm_cos_approx_ftz_f>;
+def INT_NVVM_COS_APPROX_F : F_MATH_1<"cos.approx.f32 \t$dst, $src0;",
+ Float32Regs, Float32Regs, int_nvvm_cos_approx_f>;
+
+//
+// Fma
+//
+
+def INT_NVVM_FMA_RN_FTZ_F
+ : F_MATH_3<"fma.rn.ftz.f32 \t$dst, $src0, $src1, $src2;", Float32Regs,
+ Float32Regs, Float32Regs, Float32Regs, int_nvvm_fma_rn_ftz_f>;
+def INT_NVVM_FMA_RN_F : F_MATH_3<"fma.rn.f32 \t$dst, $src0, $src1, $src2;",
+ Float32Regs, Float32Regs, Float32Regs, Float32Regs, int_nvvm_fma_rn_f>;
+def INT_NVVM_FMA_RZ_FTZ_F
+ : F_MATH_3<"fma.rz.ftz.f32 \t$dst, $src0, $src1, $src2;", Float32Regs,
+ Float32Regs, Float32Regs, Float32Regs, int_nvvm_fma_rz_ftz_f>;
+def INT_NVVM_FMA_RZ_F : F_MATH_3<"fma.rz.f32 \t$dst, $src0, $src1, $src2;",
+ Float32Regs, Float32Regs, Float32Regs, Float32Regs, int_nvvm_fma_rz_f>;
+def INT_NVVM_FMA_RM_FTZ_F
+ : F_MATH_3<"fma.rm.ftz.f32 \t$dst, $src0, $src1, $src2;", Float32Regs,
+ Float32Regs, Float32Regs, Float32Regs, int_nvvm_fma_rm_ftz_f>;
+def INT_NVVM_FMA_RM_F : F_MATH_3<"fma.rm.f32 \t$dst, $src0, $src1, $src2;",
+ Float32Regs, Float32Regs, Float32Regs, Float32Regs, int_nvvm_fma_rm_f>;
+def INT_NVVM_FMA_RP_FTZ_F
+ : F_MATH_3<"fma.rp.ftz.f32 \t$dst, $src0, $src1, $src2;", Float32Regs,
+ Float32Regs, Float32Regs, Float32Regs, int_nvvm_fma_rp_ftz_f>;
+def INT_NVVM_FMA_RP_F : F_MATH_3<"fma.rp.f32 \t$dst, $src0, $src1, $src2;",
+ Float32Regs, Float32Regs, Float32Regs, Float32Regs, int_nvvm_fma_rp_f>;
+
+def INT_NVVM_FMA_RN_D : F_MATH_3<"fma.rn.f64 \t$dst, $src0, $src1, $src2;",
+ Float64Regs, Float64Regs, Float64Regs, Float64Regs, int_nvvm_fma_rn_d>;
+def INT_NVVM_FMA_RZ_D : F_MATH_3<"fma.rz.f64 \t$dst, $src0, $src1, $src2;",
+ Float64Regs, Float64Regs, Float64Regs, Float64Regs, int_nvvm_fma_rz_d>;
+def INT_NVVM_FMA_RM_D : F_MATH_3<"fma.rm.f64 \t$dst, $src0, $src1, $src2;",
+ Float64Regs, Float64Regs, Float64Regs, Float64Regs, int_nvvm_fma_rm_d>;
+def INT_NVVM_FMA_RP_D : F_MATH_3<"fma.rp.f64 \t$dst, $src0, $src1, $src2;",
+ Float64Regs, Float64Regs, Float64Regs, Float64Regs, int_nvvm_fma_rp_d>;
+
+//
+// Rcp
+//
+
+def INT_NVVM_RCP_RN_FTZ_F : F_MATH_1<"rcp.rn.ftz.f32 \t$dst, $src0;",
+ Float32Regs, Float32Regs, int_nvvm_rcp_rn_ftz_f>;
+def INT_NVVM_RCP_RN_F : F_MATH_1<"rcp.rn.f32 \t$dst, $src0;",
+ Float32Regs, Float32Regs, int_nvvm_rcp_rn_f>;
+def INT_NVVM_RCP_RZ_FTZ_F : F_MATH_1<"rcp.rz.ftz.f32 \t$dst, $src0;",
+ Float32Regs, Float32Regs, int_nvvm_rcp_rz_ftz_f>;
+def INT_NVVM_RCP_RZ_F : F_MATH_1<"rcp.rz.f32 \t$dst, $src0;",
+ Float32Regs, Float32Regs, int_nvvm_rcp_rz_f>;
+def INT_NVVM_RCP_RM_FTZ_F : F_MATH_1<"rcp.rm.ftz.f32 \t$dst, $src0;",
+ Float32Regs, Float32Regs, int_nvvm_rcp_rm_ftz_f>;
+def INT_NVVM_RCP_RM_F : F_MATH_1<"rcp.rm.f32 \t$dst, $src0;",
+ Float32Regs, Float32Regs, int_nvvm_rcp_rm_f>;
+def INT_NVVM_RCP_RP_FTZ_F : F_MATH_1<"rcp.rp.ftz.f32 \t$dst, $src0;",
+ Float32Regs, Float32Regs, int_nvvm_rcp_rp_ftz_f>;
+def INT_NVVM_RCP_RP_F : F_MATH_1<"rcp.rp.f32 \t$dst, $src0;",
+ Float32Regs, Float32Regs, int_nvvm_rcp_rp_f>;
+
+def INT_NVVM_RCP_RN_D : F_MATH_1<"rcp.rn.f64 \t$dst, $src0;", Float64Regs,
+ Float64Regs, int_nvvm_rcp_rn_d>;
+def INT_NVVM_RCP_RZ_D : F_MATH_1<"rcp.rz.f64 \t$dst, $src0;", Float64Regs,
+ Float64Regs, int_nvvm_rcp_rz_d>;
+def INT_NVVM_RCP_RM_D : F_MATH_1<"rcp.rm.f64 \t$dst, $src0;", Float64Regs,
+ Float64Regs, int_nvvm_rcp_rm_d>;
+def INT_NVVM_RCP_RP_D : F_MATH_1<"rcp.rp.f64 \t$dst, $src0;", Float64Regs,
+ Float64Regs, int_nvvm_rcp_rp_d>;
+
+def INT_NVVM_RCP_APPROX_FTZ_D : F_MATH_1<"rcp.approx.ftz.f64 \t$dst, $src0;",
+ Float64Regs, Float64Regs, int_nvvm_rcp_approx_ftz_d>;
+
+//
+// Sqrt
+//
+
+def INT_NVVM_SQRT_RN_FTZ_F : F_MATH_1<"sqrt.rn.ftz.f32 \t$dst, $src0;",
+ Float32Regs, Float32Regs, int_nvvm_sqrt_rn_ftz_f>;
+def INT_NVVM_SQRT_RN_F : F_MATH_1<"sqrt.rn.f32 \t$dst, $src0;", Float32Regs,
+ Float32Regs, int_nvvm_sqrt_rn_f>;
+def INT_NVVM_SQRT_RZ_FTZ_F : F_MATH_1<"sqrt.rz.ftz.f32 \t$dst, $src0;",
+ Float32Regs, Float32Regs, int_nvvm_sqrt_rz_ftz_f>;
+def INT_NVVM_SQRT_RZ_F : F_MATH_1<"sqrt.rz.f32 \t$dst, $src0;", Float32Regs,
+ Float32Regs, int_nvvm_sqrt_rz_f>;
+def INT_NVVM_SQRT_RM_FTZ_F : F_MATH_1<"sqrt.rm.ftz.f32 \t$dst, $src0;",
+ Float32Regs, Float32Regs, int_nvvm_sqrt_rm_ftz_f>;
+def INT_NVVM_SQRT_RM_F : F_MATH_1<"sqrt.rm.f32 \t$dst, $src0;", Float32Regs,
+ Float32Regs, int_nvvm_sqrt_rm_f>;
+def INT_NVVM_SQRT_RP_FTZ_F : F_MATH_1<"sqrt.rp.ftz.f32 \t$dst, $src0;",
+ Float32Regs, Float32Regs, int_nvvm_sqrt_rp_ftz_f>;
+def INT_NVVM_SQRT_RP_F : F_MATH_1<"sqrt.rp.f32 \t$dst, $src0;", Float32Regs,
+ Float32Regs, int_nvvm_sqrt_rp_f>;
+def INT_NVVM_SQRT_APPROX_FTZ_F : F_MATH_1<"sqrt.approx.ftz.f32 \t$dst, $src0;",
+ Float32Regs, Float32Regs, int_nvvm_sqrt_approx_ftz_f>;
+def INT_NVVM_SQRT_APPROX_F : F_MATH_1<"sqrt.approx.f32 \t$dst, $src0;",
+ Float32Regs, Float32Regs, int_nvvm_sqrt_approx_f>;
+
+def INT_NVVM_SQRT_RN_D : F_MATH_1<"sqrt.rn.f64 \t$dst, $src0;", Float64Regs,
+ Float64Regs, int_nvvm_sqrt_rn_d>;
+def INT_NVVM_SQRT_RZ_D : F_MATH_1<"sqrt.rz.f64 \t$dst, $src0;", Float64Regs,
+ Float64Regs, int_nvvm_sqrt_rz_d>;
+def INT_NVVM_SQRT_RM_D : F_MATH_1<"sqrt.rm.f64 \t$dst, $src0;", Float64Regs,
+ Float64Regs, int_nvvm_sqrt_rm_d>;
+def INT_NVVM_SQRT_RP_D : F_MATH_1<"sqrt.rp.f64 \t$dst, $src0;", Float64Regs,
+ Float64Regs, int_nvvm_sqrt_rp_d>;
+
+// nvvm_sqrt intrinsic
+def : Pat<(int_nvvm_sqrt_f Float32Regs:$a),
+ (INT_NVVM_SQRT_RN_FTZ_F Float32Regs:$a)>, Requires<[doF32FTZ, do_SQRTF32_RN]>;
+def : Pat<(int_nvvm_sqrt_f Float32Regs:$a),
+ (INT_NVVM_SQRT_RN_F Float32Regs:$a)>, Requires<[do_SQRTF32_RN]>;
+def : Pat<(int_nvvm_sqrt_f Float32Regs:$a),
+ (INT_NVVM_SQRT_APPROX_FTZ_F Float32Regs:$a)>, Requires<[doF32FTZ]>;
+def : Pat<(int_nvvm_sqrt_f Float32Regs:$a),
+ (INT_NVVM_SQRT_APPROX_F Float32Regs:$a)>;
+
+//
+// Rsqrt
+//
+
+def INT_NVVM_RSQRT_APPROX_FTZ_F
+ : F_MATH_1<"rsqrt.approx.ftz.f32 \t$dst, $src0;", Float32Regs, Float32Regs,
+ int_nvvm_rsqrt_approx_ftz_f>;
+def INT_NVVM_RSQRT_APPROX_F : F_MATH_1<"rsqrt.approx.f32 \t$dst, $src0;",
+ Float32Regs, Float32Regs, int_nvvm_rsqrt_approx_f>;
+def INT_NVVM_RSQRT_APPROX_D : F_MATH_1<"rsqrt.approx.f64 \t$dst, $src0;",
+ Float64Regs, Float64Regs, int_nvvm_rsqrt_approx_d>;
+
+//
+// Add
+//
+
+def INT_NVVM_ADD_RN_FTZ_F : F_MATH_2<"add.rn.ftz.f32 \t$dst, $src0, $src1;",
+ Float32Regs, Float32Regs, Float32Regs, int_nvvm_add_rn_ftz_f>;
+def INT_NVVM_ADD_RN_F : F_MATH_2<"add.rn.f32 \t$dst, $src0, $src1;",
+ Float32Regs, Float32Regs, Float32Regs, int_nvvm_add_rn_f>;
+def INT_NVVM_ADD_RZ_FTZ_F : F_MATH_2<"add.rz.ftz.f32 \t$dst, $src0, $src1;",
+ Float32Regs, Float32Regs, Float32Regs, int_nvvm_add_rz_ftz_f>;
+def INT_NVVM_ADD_RZ_F : F_MATH_2<"add.rz.f32 \t$dst, $src0, $src1;",
+ Float32Regs, Float32Regs, Float32Regs, int_nvvm_add_rz_f>;
+def INT_NVVM_ADD_RM_FTZ_F : F_MATH_2<"add.rm.ftz.f32 \t$dst, $src0, $src1;",
+ Float32Regs, Float32Regs, Float32Regs, int_nvvm_add_rm_ftz_f>;
+def INT_NVVM_ADD_RM_F : F_MATH_2<"add.rm.f32 \t$dst, $src0, $src1;",
+ Float32Regs, Float32Regs, Float32Regs, int_nvvm_add_rm_f>;
+def INT_NVVM_ADD_RP_FTZ_F : F_MATH_2<"add.rp.ftz.f32 \t$dst, $src0, $src1;",
+ Float32Regs, Float32Regs, Float32Regs, int_nvvm_add_rp_ftz_f>;
+def INT_NVVM_ADD_RP_F : F_MATH_2<"add.rp.f32 \t$dst, $src0, $src1;",
+ Float32Regs, Float32Regs, Float32Regs, int_nvvm_add_rp_f>;
+
+def INT_NVVM_ADD_RN_D : F_MATH_2<"add.rn.f64 \t$dst, $src0, $src1;",
+ Float64Regs, Float64Regs, Float64Regs, int_nvvm_add_rn_d>;
+def INT_NVVM_ADD_RZ_D : F_MATH_2<"add.rz.f64 \t$dst, $src0, $src1;",
+ Float64Regs, Float64Regs, Float64Regs, int_nvvm_add_rz_d>;
+def INT_NVVM_ADD_RM_D : F_MATH_2<"add.rm.f64 \t$dst, $src0, $src1;",
+ Float64Regs, Float64Regs, Float64Regs, int_nvvm_add_rm_d>;
+def INT_NVVM_ADD_RP_D : F_MATH_2<"add.rp.f64 \t$dst, $src0, $src1;",
+ Float64Regs, Float64Regs, Float64Regs, int_nvvm_add_rp_d>;
+
+//
+// Convert
+//
+
+def : Pat<(int_nvvm_d2f_rn_ftz Float64Regs:$a),
+ (CVT_f32_f64 Float64Regs:$a, CvtRN_FTZ)>;
+def : Pat<(int_nvvm_d2f_rn Float64Regs:$a),
+ (CVT_f32_f64 Float64Regs:$a, CvtRN)>;
+def : Pat<(int_nvvm_d2f_rz_ftz Float64Regs:$a),
+ (CVT_f32_f64 Float64Regs:$a, CvtRZ_FTZ)>;
+def : Pat<(int_nvvm_d2f_rz Float64Regs:$a),
+ (CVT_f32_f64 Float64Regs:$a, CvtRZ)>;
+def : Pat<(int_nvvm_d2f_rm_ftz Float64Regs:$a),
+ (CVT_f32_f64 Float64Regs:$a, CvtRM_FTZ)>;
+def : Pat<(int_nvvm_d2f_rm Float64Regs:$a),
+ (CVT_f32_f64 Float64Regs:$a, CvtRM)>;
+def : Pat<(int_nvvm_d2f_rp_ftz Float64Regs:$a),
+ (CVT_f32_f64 Float64Regs:$a, CvtRP_FTZ)>;
+def : Pat<(int_nvvm_d2f_rp Float64Regs:$a),
+ (CVT_f32_f64 Float64Regs:$a, CvtRP)>;
+
+def : Pat<(int_nvvm_d2i_rn Float64Regs:$a),
+ (CVT_s32_f64 Float64Regs:$a, CvtRNI)>;
+def : Pat<(int_nvvm_d2i_rz Float64Regs:$a),
+ (CVT_s32_f64 Float64Regs:$a, CvtRZI)>;
+def : Pat<(int_nvvm_d2i_rm Float64Regs:$a),
+ (CVT_s32_f64 Float64Regs:$a, CvtRMI)>;
+def : Pat<(int_nvvm_d2i_rp Float64Regs:$a),
+ (CVT_s32_f64 Float64Regs:$a, CvtRPI)>;
+
+def : Pat<(int_nvvm_d2ui_rn Float64Regs:$a),
+ (CVT_u32_f64 Float64Regs:$a, CvtRNI)>;
+def : Pat<(int_nvvm_d2ui_rz Float64Regs:$a),
+ (CVT_u32_f64 Float64Regs:$a, CvtRZI)>;
+def : Pat<(int_nvvm_d2ui_rm Float64Regs:$a),
+ (CVT_u32_f64 Float64Regs:$a, CvtRMI)>;
+def : Pat<(int_nvvm_d2ui_rp Float64Regs:$a),
+ (CVT_u32_f64 Float64Regs:$a, CvtRPI)>;
+
+def : Pat<(int_nvvm_i2d_rn Int32Regs:$a),
+ (CVT_f64_s32 Int32Regs:$a, CvtRN)>;
+def : Pat<(int_nvvm_i2d_rz Int32Regs:$a),
+ (CVT_f64_s32 Int32Regs:$a, CvtRZ)>;
+def : Pat<(int_nvvm_i2d_rm Int32Regs:$a),
+ (CVT_f64_s32 Int32Regs:$a, CvtRM)>;
+def : Pat<(int_nvvm_i2d_rp Int32Regs:$a),
+ (CVT_f64_s32 Int32Regs:$a, CvtRP)>;
+
+def : Pat<(int_nvvm_ui2d_rn Int32Regs:$a),
+ (CVT_f64_u32 Int32Regs:$a, CvtRN)>;
+def : Pat<(int_nvvm_ui2d_rz Int32Regs:$a),
+ (CVT_f64_u32 Int32Regs:$a, CvtRZ)>;
+def : Pat<(int_nvvm_ui2d_rm Int32Regs:$a),
+ (CVT_f64_u32 Int32Regs:$a, CvtRM)>;
+def : Pat<(int_nvvm_ui2d_rp Int32Regs:$a),
+ (CVT_f64_u32 Int32Regs:$a, CvtRP)>;
+
+def : Pat<(int_nvvm_f2i_rn_ftz Float32Regs:$a),
+ (CVT_s32_f32 Float32Regs:$a, CvtRNI_FTZ)>;
+def : Pat<(int_nvvm_f2i_rn Float32Regs:$a),
+ (CVT_s32_f32 Float32Regs:$a, CvtRNI)>;
+def : Pat<(int_nvvm_f2i_rz_ftz Float32Regs:$a),
+ (CVT_s32_f32 Float32Regs:$a, CvtRZI_FTZ)>;
+def : Pat<(int_nvvm_f2i_rz Float32Regs:$a),
+ (CVT_s32_f32 Float32Regs:$a, CvtRZI)>;
+def : Pat<(int_nvvm_f2i_rm_ftz Float32Regs:$a),
+ (CVT_s32_f32 Float32Regs:$a, CvtRMI_FTZ)>;
+def : Pat<(int_nvvm_f2i_rm Float32Regs:$a),
+ (CVT_s32_f32 Float32Regs:$a, CvtRMI)>;
+def : Pat<(int_nvvm_f2i_rp_ftz Float32Regs:$a),
+ (CVT_s32_f32 Float32Regs:$a, CvtRPI_FTZ)>;
+def : Pat<(int_nvvm_f2i_rp Float32Regs:$a),
+ (CVT_s32_f32 Float32Regs:$a, CvtRPI)>;
+
+def : Pat<(int_nvvm_f2ui_rn_ftz Float32Regs:$a),
+ (CVT_u32_f32 Float32Regs:$a, CvtRNI_FTZ)>;
+def : Pat<(int_nvvm_f2ui_rn Float32Regs:$a),
+ (CVT_u32_f32 Float32Regs:$a, CvtRNI)>;
+def : Pat<(int_nvvm_f2ui_rz_ftz Float32Regs:$a),
+ (CVT_u32_f32 Float32Regs:$a, CvtRZI_FTZ)>;
+def : Pat<(int_nvvm_f2ui_rz Float32Regs:$a),
+ (CVT_u32_f32 Float32Regs:$a, CvtRZI)>;
+def : Pat<(int_nvvm_f2ui_rm_ftz Float32Regs:$a),
+ (CVT_u32_f32 Float32Regs:$a, CvtRMI_FTZ)>;
+def : Pat<(int_nvvm_f2ui_rm Float32Regs:$a),
+ (CVT_u32_f32 Float32Regs:$a, CvtRMI)>;
+def : Pat<(int_nvvm_f2ui_rp_ftz Float32Regs:$a),
+ (CVT_u32_f32 Float32Regs:$a, CvtRPI_FTZ)>;
+def : Pat<(int_nvvm_f2ui_rp Float32Regs:$a),
+ (CVT_u32_f32 Float32Regs:$a, CvtRPI)>;
+
+def : Pat<(int_nvvm_i2f_rn Int32Regs:$a),
+ (CVT_f32_s32 Int32Regs:$a, CvtRN)>;
+def : Pat<(int_nvvm_i2f_rz Int32Regs:$a),
+ (CVT_f32_s32 Int32Regs:$a, CvtRZ)>;
+def : Pat<(int_nvvm_i2f_rm Int32Regs:$a),
+ (CVT_f32_s32 Int32Regs:$a, CvtRM)>;
+def : Pat<(int_nvvm_i2f_rp Int32Regs:$a),
+ (CVT_f32_s32 Int32Regs:$a, CvtRP)>;
+
+def : Pat<(int_nvvm_ui2f_rn Int32Regs:$a),
+ (CVT_f32_u32 Int32Regs:$a, CvtRN)>;
+def : Pat<(int_nvvm_ui2f_rz Int32Regs:$a),
+ (CVT_f32_u32 Int32Regs:$a, CvtRZ)>;
+def : Pat<(int_nvvm_ui2f_rm Int32Regs:$a),
+ (CVT_f32_u32 Int32Regs:$a, CvtRM)>;
+def : Pat<(int_nvvm_ui2f_rp Int32Regs:$a),
+ (CVT_f32_u32 Int32Regs:$a, CvtRP)>;
+
+def INT_NVVM_LOHI_I2D : F_MATH_2<"mov.b64 \t$dst, {{$src0, $src1}};",
+ Float64Regs, Int32Regs, Int32Regs, int_nvvm_lohi_i2d>;
+
+def INT_NVVM_D2I_LO : F_MATH_1<!strconcat("{{\n\t",
+ !strconcat(".reg .b32 %temp; \n\t",
+ !strconcat("mov.b64 \t{$dst, %temp}, $src0;\n\t",
+ "}}"))),
+ Int32Regs, Float64Regs, int_nvvm_d2i_lo>;
+def INT_NVVM_D2I_HI : F_MATH_1<!strconcat("{{\n\t",
+ !strconcat(".reg .b32 %temp; \n\t",
+ !strconcat("mov.b64 \t{%temp, $dst}, $src0;\n\t",
+ "}}"))),
+ Int32Regs, Float64Regs, int_nvvm_d2i_hi>;
+
+def : Pat<(int_nvvm_f2ll_rn_ftz Float32Regs:$a),
+ (CVT_s64_f32 Float32Regs:$a, CvtRNI_FTZ)>;
+def : Pat<(int_nvvm_f2ll_rn Float32Regs:$a),
+ (CVT_s64_f32 Float32Regs:$a, CvtRNI)>;
+def : Pat<(int_nvvm_f2ll_rz_ftz Float32Regs:$a),
+ (CVT_s64_f32 Float32Regs:$a, CvtRZI_FTZ)>;
+def : Pat<(int_nvvm_f2ll_rz Float32Regs:$a),
+ (CVT_s64_f32 Float32Regs:$a, CvtRZI)>;
+def : Pat<(int_nvvm_f2ll_rm_ftz Float32Regs:$a),
+ (CVT_s64_f32 Float32Regs:$a, CvtRMI_FTZ)>;
+def : Pat<(int_nvvm_f2ll_rm Float32Regs:$a),
+ (CVT_s64_f32 Float32Regs:$a, CvtRMI)>;
+def : Pat<(int_nvvm_f2ll_rp_ftz Float32Regs:$a),
+ (CVT_s64_f32 Float32Regs:$a, CvtRPI_FTZ)>;
+def : Pat<(int_nvvm_f2ll_rp Float32Regs:$a),
+ (CVT_s64_f32 Float32Regs:$a, CvtRPI)>;
+
+def : Pat<(int_nvvm_f2ull_rn_ftz Float32Regs:$a),
+ (CVT_u64_f32 Float32Regs:$a, CvtRNI_FTZ)>;
+def : Pat<(int_nvvm_f2ull_rn Float32Regs:$a),
+ (CVT_u64_f32 Float32Regs:$a, CvtRNI)>;
+def : Pat<(int_nvvm_f2ull_rz_ftz Float32Regs:$a),
+ (CVT_u64_f32 Float32Regs:$a, CvtRZI_FTZ)>;
+def : Pat<(int_nvvm_f2ull_rz Float32Regs:$a),
+ (CVT_u64_f32 Float32Regs:$a, CvtRZI)>;
+def : Pat<(int_nvvm_f2ull_rm_ftz Float32Regs:$a),
+ (CVT_u64_f32 Float32Regs:$a, CvtRMI_FTZ)>;
+def : Pat<(int_nvvm_f2ull_rm Float32Regs:$a),
+ (CVT_u64_f32 Float32Regs:$a, CvtRMI)>;
+def : Pat<(int_nvvm_f2ull_rp_ftz Float32Regs:$a),
+ (CVT_u64_f32 Float32Regs:$a, CvtRPI_FTZ)>;
+def : Pat<(int_nvvm_f2ull_rp Float32Regs:$a),
+ (CVT_u64_f32 Float32Regs:$a, CvtRPI)>;
+
+def : Pat<(int_nvvm_d2ll_rn Float64Regs:$a),
+ (CVT_s64_f64 Float64Regs:$a, CvtRNI)>;
+def : Pat<(int_nvvm_d2ll_rz Float64Regs:$a),
+ (CVT_s64_f64 Float64Regs:$a, CvtRZI)>;
+def : Pat<(int_nvvm_d2ll_rm Float64Regs:$a),
+ (CVT_s64_f64 Float64Regs:$a, CvtRMI)>;
+def : Pat<(int_nvvm_d2ll_rp Float64Regs:$a),
+ (CVT_s64_f64 Float64Regs:$a, CvtRPI)>;
+
+def : Pat<(int_nvvm_d2ull_rn Float64Regs:$a),
+ (CVT_u64_f64 Float64Regs:$a, CvtRNI)>;
+def : Pat<(int_nvvm_d2ull_rz Float64Regs:$a),
+ (CVT_u64_f64 Float64Regs:$a, CvtRZI)>;
+def : Pat<(int_nvvm_d2ull_rm Float64Regs:$a),
+ (CVT_u64_f64 Float64Regs:$a, CvtRMI)>;
+def : Pat<(int_nvvm_d2ull_rp Float64Regs:$a),
+ (CVT_u64_f64 Float64Regs:$a, CvtRPI)>;
+
+def : Pat<(int_nvvm_ll2f_rn Int64Regs:$a),
+ (CVT_f32_s64 Int64Regs:$a, CvtRN)>;
+def : Pat<(int_nvvm_ll2f_rz Int64Regs:$a),
+ (CVT_f32_s64 Int64Regs:$a, CvtRZ)>;
+def : Pat<(int_nvvm_ll2f_rm Int64Regs:$a),
+ (CVT_f32_s64 Int64Regs:$a, CvtRM)>;
+def : Pat<(int_nvvm_ll2f_rp Int64Regs:$a),
+ (CVT_f32_s64 Int64Regs:$a, CvtRP)>;
+
+def : Pat<(int_nvvm_ull2f_rn Int64Regs:$a),
+ (CVT_f32_u64 Int64Regs:$a, CvtRN)>;
+def : Pat<(int_nvvm_ull2f_rz Int64Regs:$a),
+ (CVT_f32_u64 Int64Regs:$a, CvtRZ)>;
+def : Pat<(int_nvvm_ull2f_rm Int64Regs:$a),
+ (CVT_f32_u64 Int64Regs:$a, CvtRM)>;
+def : Pat<(int_nvvm_ull2f_rp Int64Regs:$a),
+ (CVT_f32_u64 Int64Regs:$a, CvtRP)>;
+
+def : Pat<(int_nvvm_ll2d_rn Int64Regs:$a),
+ (CVT_f64_s64 Int64Regs:$a, CvtRN)>;
+def : Pat<(int_nvvm_ll2d_rz Int64Regs:$a),
+ (CVT_f64_s64 Int64Regs:$a, CvtRZ)>;
+def : Pat<(int_nvvm_ll2d_rm Int64Regs:$a),
+ (CVT_f64_s64 Int64Regs:$a, CvtRM)>;
+def : Pat<(int_nvvm_ll2d_rp Int64Regs:$a),
+ (CVT_f64_s64 Int64Regs:$a, CvtRP)>;
+
+def : Pat<(int_nvvm_ull2d_rn Int64Regs:$a),
+ (CVT_f64_u64 Int64Regs:$a, CvtRN)>;
+def : Pat<(int_nvvm_ull2d_rz Int64Regs:$a),
+ (CVT_f64_u64 Int64Regs:$a, CvtRZ)>;
+def : Pat<(int_nvvm_ull2d_rm Int64Regs:$a),
+ (CVT_f64_u64 Int64Regs:$a, CvtRM)>;
+def : Pat<(int_nvvm_ull2d_rp Int64Regs:$a),
+ (CVT_f64_u64 Int64Regs:$a, CvtRP)>;
+
+
+// FIXME: Ideally, we could use these patterns instead of the scope-creating
+// patterns, but ptxas does not like these since .s16 is not compatible with
+// .f16. The solution is to use .bXX for all integer register types, but we
+// are not there yet.
+//def : Pat<(int_nvvm_f2h_rn_ftz Float32Regs:$a),
+// (CVT_f16_f32 Float32Regs:$a, CvtRN_FTZ)>;
+//def : Pat<(int_nvvm_f2h_rn Float32Regs:$a),
+// (CVT_f16_f32 Float32Regs:$a, CvtRN)>;
+//
+//def : Pat<(int_nvvm_h2f Int16Regs:$a),
+// (CVT_f32_f16 Int16Regs:$a, CvtNONE)>;
+
+def INT_NVVM_F2H_RN_FTZ : F_MATH_1<!strconcat("{{\n\t",
+ !strconcat(".reg .b16 %temp;\n\t",
+ !strconcat("cvt.rn.ftz.f16.f32 \t%temp, $src0;\n\t",
+ !strconcat("mov.b16 \t$dst, %temp;\n",
+ "}}")))),
+ Int16Regs, Float32Regs, int_nvvm_f2h_rn_ftz>;
+def INT_NVVM_F2H_RN : F_MATH_1<!strconcat("{{\n\t",
+ !strconcat(".reg .b16 %temp;\n\t",
+ !strconcat("cvt.rn.f16.f32 \t%temp, $src0;\n\t",
+ !strconcat("mov.b16 \t$dst, %temp;\n",
+ "}}")))),
+ Int16Regs, Float32Regs, int_nvvm_f2h_rn>;
+
+def INT_NVVM_H2F : F_MATH_1<!strconcat("{{\n\t",
+ !strconcat(".reg .b16 %temp;\n\t",
+ !strconcat("mov.b16 \t%temp, $src0;\n\t",
+ !strconcat("cvt.f32.f16 \t$dst, %temp;\n\t",
+ "}}")))),
+ Float32Regs, Int16Regs, int_nvvm_h2f>;
+
+def : Pat<(f32 (f16_to_fp Int16Regs:$a)),
+ (CVT_f32_f16 Int16Regs:$a, CvtNONE)>;
+def : Pat<(i16 (fp_to_f16 Float32Regs:$a)),
+ (CVT_f16_f32 Float32Regs:$a, CvtRN_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(i16 (fp_to_f16 Float32Regs:$a)),
+ (CVT_f16_f32 Float32Regs:$a, CvtRN)>;
+
+def : Pat<(f64 (f16_to_fp Int16Regs:$a)),
+ (CVT_f64_f16 Int16Regs:$a, CvtNONE)>;
+def : Pat<(i16 (fp_to_f16 Float64Regs:$a)),
+ (CVT_f16_f64 Float64Regs:$a, CvtRN)>;
+
+//
+// Bitcast
+//
+
+def INT_NVVM_BITCAST_F2I : F_MATH_1<"mov.b32 \t$dst, $src0;", Int32Regs,
+ Float32Regs, int_nvvm_bitcast_f2i>;
+def INT_NVVM_BITCAST_I2F : F_MATH_1<"mov.b32 \t$dst, $src0;", Float32Regs,
+ Int32Regs, int_nvvm_bitcast_i2f>;
+
+def INT_NVVM_BITCAST_LL2D : F_MATH_1<"mov.b64 \t$dst, $src0;", Float64Regs,
+ Int64Regs, int_nvvm_bitcast_ll2d>;
+def INT_NVVM_BITCAST_D2LL : F_MATH_1<"mov.b64 \t$dst, $src0;", Int64Regs,
+ Float64Regs, int_nvvm_bitcast_d2ll>;
+
+//-----------------------------------
+// Atomic Functions
+//-----------------------------------
+
+class ATOMIC_GLOBAL_CHK <dag ops, dag frag>
+ : PatFrag<ops, frag, [{
+ return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_GLOBAL);
+}]>;
+class ATOMIC_SHARED_CHK <dag ops, dag frag>
+ : PatFrag<ops, frag, [{
+ return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_SHARED);
+}]>;
+class ATOMIC_GENERIC_CHK <dag ops, dag frag>
+ : PatFrag<ops, frag, [{
+ return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_GENERIC);
+}]>;
+
+multiclass F_ATOMIC_2_imp<NVPTXRegClass ptrclass, NVPTXRegClass regclass,
+ string SpaceStr, string TypeStr, string OpcStr, PatFrag IntOp,
+ Operand IMMType, SDNode IMM, Predicate Pred> {
+ def reg : NVPTXInst<(outs regclass:$dst), (ins ptrclass:$addr, regclass:$b),
+ !strconcat("atom",
+ !strconcat(SpaceStr,
+ !strconcat(OpcStr,
+ !strconcat(TypeStr,
+ !strconcat(" \t$dst, [$addr], $b;", ""))))),
+ [(set regclass:$dst, (IntOp ptrclass:$addr, regclass:$b))]>,
+ Requires<[Pred]>;
+ def imm : NVPTXInst<(outs regclass:$dst), (ins ptrclass:$addr, IMMType:$b),
+ !strconcat("atom",
+ !strconcat(SpaceStr,
+ !strconcat(OpcStr,
+ !strconcat(TypeStr,
+ !strconcat(" \t$dst, [$addr], $b;", ""))))),
+ [(set regclass:$dst, (IntOp ptrclass:$addr, IMM:$b))]>,
+ Requires<[Pred]>;
+}
+multiclass F_ATOMIC_2<NVPTXRegClass regclass, string SpaceStr, string TypeStr,
+ string OpcStr, PatFrag IntOp, Operand IMMType, SDNode IMM, Predicate Pred> {
+ defm p32 : F_ATOMIC_2_imp<Int32Regs, regclass, SpaceStr, TypeStr, OpcStr,
+ IntOp, IMMType, IMM, Pred>;
+ defm p64 : F_ATOMIC_2_imp<Int64Regs, regclass, SpaceStr, TypeStr, OpcStr,
+ IntOp, IMMType, IMM, Pred>;
+}
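+
+// Illustrative sketch (editorial note, not part of the original file): an
+// instantiation such as F_ATOMIC_2<Int32Regs, ".global", ".u32", ".add", ...>
+// concatenates the fragments above into roughly
+//   atom.global.add.u32  $dst, [$addr], $b;
+// emitting one register-operand and one immediate-operand form for both
+// 32-bit (p32) and 64-bit (p64) address registers.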
+
+// Has 2 operands; negates the second one before the atomic operation.
+multiclass F_ATOMIC_2_NEG_imp<NVPTXRegClass ptrclass, NVPTXRegClass regclass,
+ string SpaceStr, string TypeStr, string OpcStr, PatFrag IntOp,
+ Operand IMMType, Predicate Pred> {
+ def reg : NVPTXInst<(outs regclass:$dst), (ins ptrclass:$addr, regclass:$b),
+ !strconcat("{{ \n\t",
+ !strconcat(".reg \t.s",
+ !strconcat(TypeStr,
+ !strconcat(" temp; \n\t",
+ !strconcat("neg.s",
+ !strconcat(TypeStr,
+ !strconcat(" \ttemp, $b; \n\t",
+ !strconcat("atom",
+ !strconcat(SpaceStr,
+ !strconcat(OpcStr,
+ !strconcat(".u",
+ !strconcat(TypeStr,
+ !strconcat(" \t$dst, [$addr], temp; \n\t",
+ !strconcat("}}", "")))))))))))))),
+ [(set regclass:$dst, (IntOp ptrclass:$addr, regclass:$b))]>,
+ Requires<[Pred]>;
+}
+multiclass F_ATOMIC_2_NEG<NVPTXRegClass regclass, string SpaceStr,
+ string TypeStr, string OpcStr, PatFrag IntOp, Operand IMMType,
+ Predicate Pred> {
+ defm p32: F_ATOMIC_2_NEG_imp<Int32Regs, regclass, SpaceStr, TypeStr, OpcStr,
+ IntOp, IMMType, Pred> ;
+ defm p64: F_ATOMIC_2_NEG_imp<Int64Regs, regclass, SpaceStr, TypeStr, OpcStr,
+ IntOp, IMMType, Pred> ;
+}
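+
+// Illustrative sketch (editorial): PTX has no atom.sub, so an instantiation
+// like F_ATOMIC_2_NEG<Int32Regs, ".global", "32", ".add", ...> expands to an
+// inline block along the lines of
+//   { .reg .s32 temp; neg.s32 temp, $b; atom.global.add.u32 $dst, [$addr], temp; }
+// i.e. the second operand is negated and then atomically added.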
+
+// Has 3 operands (used for compare-and-swap).
+multiclass F_ATOMIC_3_imp<NVPTXRegClass ptrclass, NVPTXRegClass regclass,
+ string SpaceStr, string TypeStr, string OpcStr, PatFrag IntOp,
+ Operand IMMType, Predicate Pred> {
+ def reg : NVPTXInst<(outs regclass:$dst),
+ (ins ptrclass:$addr, regclass:$b, regclass:$c),
+ !strconcat("atom",
+ !strconcat(SpaceStr,
+ !strconcat(OpcStr,
+ !strconcat(TypeStr,
+ !strconcat(" \t$dst, [$addr], $b, $c;", ""))))),
+ [(set regclass:$dst,
+ (IntOp ptrclass:$addr, regclass:$b, regclass:$c))]>,
+ Requires<[Pred]>;
+ def imm1 : NVPTXInst<(outs regclass:$dst),
+ (ins ptrclass:$addr, IMMType:$b, regclass:$c),
+ !strconcat("atom",
+ !strconcat(SpaceStr,
+ !strconcat(OpcStr,
+ !strconcat(TypeStr,
+ !strconcat(" \t$dst, [$addr], $b, $c;", ""))))),
+ [(set regclass:$dst, (IntOp ptrclass:$addr, imm:$b, regclass:$c))]>,
+ Requires<[Pred]>;
+ def imm2 : NVPTXInst<(outs regclass:$dst),
+ (ins ptrclass:$addr, regclass:$b, IMMType:$c),
+ !strconcat("atom",
+ !strconcat(SpaceStr,
+ !strconcat(OpcStr,
+ !strconcat(TypeStr,
+ !strconcat(" \t$dst, [$addr], $b, $c;", ""))))),
+ [(set regclass:$dst, (IntOp ptrclass:$addr, regclass:$b, imm:$c))]>,
+ Requires<[Pred]>;
+ def imm3 : NVPTXInst<(outs regclass:$dst),
+ (ins ptrclass:$addr, IMMType:$b, IMMType:$c),
+ !strconcat("atom",
+ !strconcat(SpaceStr,
+ !strconcat(OpcStr,
+ !strconcat(TypeStr,
+ !strconcat(" \t$dst, [$addr], $b, $c;", ""))))),
+ [(set regclass:$dst, (IntOp ptrclass:$addr, imm:$b, imm:$c))]>,
+ Requires<[Pred]>;
+}
+multiclass F_ATOMIC_3<NVPTXRegClass regclass, string SpaceStr, string TypeStr,
+ string OpcStr, PatFrag IntOp, Operand IMMType, Predicate Pred> {
+ defm p32 : F_ATOMIC_3_imp<Int32Regs, regclass, SpaceStr, TypeStr, OpcStr,
+ IntOp, IMMType, Pred>;
+ defm p64 : F_ATOMIC_3_imp<Int64Regs, regclass, SpaceStr, TypeStr, OpcStr,
+ IntOp, IMMType, Pred>;
+}
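+
+// Illustrative sketch (editorial): the three-operand form is used for
+// compare-and-swap below; e.g. INT_PTX_ATOM_CAS_G_32 prints roughly
+//   atom.global.cas.b32  $dst, [$addr], $b, $c;
+// with the imm1/imm2/imm3 variants covering immediate $b and/or $c.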
+
+// atom_add
+
+def atomic_load_add_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+ (atomic_load_add_32 node:$a, node:$b)>;
+def atomic_load_add_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+ (atomic_load_add_32 node:$a, node:$b)>;
+def atomic_load_add_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+ (atomic_load_add_32 node:$a, node:$b)>;
+def atomic_load_add_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+ (atomic_load_add_64 node:$a, node:$b)>;
+def atomic_load_add_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+ (atomic_load_add_64 node:$a, node:$b)>;
+def atomic_load_add_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+ (atomic_load_add_64 node:$a, node:$b)>;
+def atomic_load_add_f32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+ (int_nvvm_atomic_load_add_f32 node:$a, node:$b)>;
+def atomic_load_add_f32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+ (int_nvvm_atomic_load_add_f32 node:$a, node:$b)>;
+def atomic_load_add_f32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+ (int_nvvm_atomic_load_add_f32 node:$a, node:$b)>;
+
+defm INT_PTX_ATOM_ADD_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".u32", ".add",
+ atomic_load_add_32_g, i32imm, imm, hasAtomRedG32>;
+defm INT_PTX_ATOM_ADD_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".u32", ".add",
+ atomic_load_add_32_s, i32imm, imm, hasAtomRedS32>;
+defm INT_PTX_ATOM_ADD_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".u32", ".add",
+ atomic_load_add_32_gen, i32imm, imm, hasAtomRedGen32>;
+defm INT_PTX_ATOM_ADD_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global", ".u32",
+ ".add", atomic_load_add_32_gen, i32imm, imm, useAtomRedG32forGen32>;
+
+defm INT_PTX_ATOM_ADD_G_64 : F_ATOMIC_2<Int64Regs, ".global", ".u64", ".add",
+ atomic_load_add_64_g, i64imm, imm, hasAtomRedG64>;
+defm INT_PTX_ATOM_ADD_S_64 : F_ATOMIC_2<Int64Regs, ".shared", ".u64", ".add",
+ atomic_load_add_64_s, i64imm, imm, hasAtomRedS64>;
+defm INT_PTX_ATOM_ADD_GEN_64 : F_ATOMIC_2<Int64Regs, "", ".u64", ".add",
+ atomic_load_add_64_gen, i64imm, imm, hasAtomRedGen64>;
+defm INT_PTX_ATOM_ADD_GEN_64_USE_G : F_ATOMIC_2<Int64Regs, ".global", ".u64",
+ ".add", atomic_load_add_64_gen, i64imm, imm, useAtomRedG64forGen64>;
+
+defm INT_PTX_ATOM_ADD_G_F32 : F_ATOMIC_2<Float32Regs, ".global", ".f32", ".add",
+ atomic_load_add_f32_g, f32imm, fpimm, hasAtomAddF32>;
+defm INT_PTX_ATOM_ADD_S_F32 : F_ATOMIC_2<Float32Regs, ".shared", ".f32", ".add",
+ atomic_load_add_f32_s, f32imm, fpimm, hasAtomAddF32>;
+defm INT_PTX_ATOM_ADD_GEN_F32 : F_ATOMIC_2<Float32Regs, "", ".f32", ".add",
+ atomic_load_add_f32_gen, f32imm, fpimm, hasAtomAddF32>;
+
+// atom_sub
+
+def atomic_load_sub_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+ (atomic_load_sub_32 node:$a, node:$b)>;
+def atomic_load_sub_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+ (atomic_load_sub_32 node:$a, node:$b)>;
+def atomic_load_sub_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+ (atomic_load_sub_32 node:$a, node:$b)>;
+def atomic_load_sub_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+ (atomic_load_sub_64 node:$a, node:$b)>;
+def atomic_load_sub_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+ (atomic_load_sub_64 node:$a, node:$b)>;
+def atomic_load_sub_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+ (atomic_load_sub_64 node:$a, node:$b)>;
+
+defm INT_PTX_ATOM_SUB_G_32 : F_ATOMIC_2_NEG<Int32Regs, ".global", "32", ".add",
+ atomic_load_sub_32_g, i32imm, hasAtomRedG32>;
+defm INT_PTX_ATOM_SUB_G_64 : F_ATOMIC_2_NEG<Int64Regs, ".global", "64", ".add",
+ atomic_load_sub_64_g, i64imm, hasAtomRedG64>;
+defm INT_PTX_ATOM_SUB_GEN_32 : F_ATOMIC_2_NEG<Int32Regs, "", "32", ".add",
+ atomic_load_sub_32_gen, i32imm, hasAtomRedGen32>;
+defm INT_PTX_ATOM_SUB_GEN_32_USE_G : F_ATOMIC_2_NEG<Int32Regs, ".global", "32",
+ ".add", atomic_load_sub_32_gen, i32imm, useAtomRedG32forGen32>;
+defm INT_PTX_ATOM_SUB_S_32 : F_ATOMIC_2_NEG<Int32Regs, ".shared", "32", ".add",
+ atomic_load_sub_32_s, i32imm, hasAtomRedS32>;
+defm INT_PTX_ATOM_SUB_S_64 : F_ATOMIC_2_NEG<Int64Regs, ".shared", "64", ".add",
+ atomic_load_sub_64_s, i64imm, hasAtomRedS64>;
+defm INT_PTX_ATOM_SUB_GEN_64 : F_ATOMIC_2_NEG<Int64Regs, "", "64", ".add",
+ atomic_load_sub_64_gen, i64imm, hasAtomRedGen64>;
+defm INT_PTX_ATOM_SUB_GEN_64_USE_G : F_ATOMIC_2_NEG<Int64Regs, ".global", "64",
+ ".add", atomic_load_sub_64_gen, i64imm, useAtomRedG64forGen64>;
+
+// atom_swap
+
+def atomic_swap_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+ (atomic_swap_32 node:$a, node:$b)>;
+def atomic_swap_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+ (atomic_swap_32 node:$a, node:$b)>;
+def atomic_swap_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+ (atomic_swap_32 node:$a, node:$b)>;
+def atomic_swap_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+ (atomic_swap_64 node:$a, node:$b)>;
+def atomic_swap_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+ (atomic_swap_64 node:$a, node:$b)>;
+def atomic_swap_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+ (atomic_swap_64 node:$a, node:$b)>;
+
+defm INT_PTX_ATOM_SWAP_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".b32", ".exch",
+ atomic_swap_32_g, i32imm, imm, hasAtomRedG32>;
+defm INT_PTX_ATOM_SWAP_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".b32", ".exch",
+ atomic_swap_32_s, i32imm, imm, hasAtomRedS32>;
+defm INT_PTX_ATOM_SWAP_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".b32", ".exch",
+ atomic_swap_32_gen, i32imm, imm, hasAtomRedGen32>;
+defm INT_PTX_ATOM_SWAP_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global", ".b32",
+ ".exch", atomic_swap_32_gen, i32imm, imm, useAtomRedG32forGen32>;
+defm INT_PTX_ATOM_SWAP_G_64 : F_ATOMIC_2<Int64Regs, ".global", ".b64", ".exch",
+ atomic_swap_64_g, i64imm, imm, hasAtomRedG64>;
+defm INT_PTX_ATOM_SWAP_S_64 : F_ATOMIC_2<Int64Regs, ".shared", ".b64", ".exch",
+ atomic_swap_64_s, i64imm, imm, hasAtomRedS64>;
+defm INT_PTX_ATOM_SWAP_GEN_64 : F_ATOMIC_2<Int64Regs, "", ".b64", ".exch",
+ atomic_swap_64_gen, i64imm, imm, hasAtomRedGen64>;
+defm INT_PTX_ATOM_SWAP_GEN_64_USE_G : F_ATOMIC_2<Int64Regs, ".global", ".b64",
+ ".exch", atomic_swap_64_gen, i64imm, imm, useAtomRedG64forGen64>;
+
+// atom_max
+
+def atomic_load_max_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+ (atomic_load_max_32 node:$a, node:$b)>;
+def atomic_load_max_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+ (atomic_load_max_32 node:$a, node:$b)>;
+def atomic_load_max_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+ (atomic_load_max_32 node:$a, node:$b)>;
+def atomic_load_max_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+ (atomic_load_max_64 node:$a, node:$b)>;
+def atomic_load_max_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+ (atomic_load_max_64 node:$a, node:$b)>;
+def atomic_load_max_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+ (atomic_load_max_64 node:$a, node:$b)>;
+def atomic_load_umax_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+ (atomic_load_umax_32 node:$a, node:$b)>;
+def atomic_load_umax_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+ (atomic_load_umax_32 node:$a, node:$b)>;
+def atomic_load_umax_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+ (atomic_load_umax_32 node:$a, node:$b)>;
+def atomic_load_umax_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+ (atomic_load_umax_64 node:$a, node:$b)>;
+def atomic_load_umax_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+ (atomic_load_umax_64 node:$a, node:$b)>;
+def atomic_load_umax_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+ (atomic_load_umax_64 node:$a, node:$b)>;
+
+defm INT_PTX_ATOM_LOAD_MAX_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".s32",
+ ".max", atomic_load_max_32_g, i32imm, imm, hasAtomRedG32>;
+defm INT_PTX_ATOM_LOAD_MAX_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".s32",
+ ".max", atomic_load_max_32_s, i32imm, imm, hasAtomRedS32>;
+defm INT_PTX_ATOM_LOAD_MAX_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".s32", ".max",
+ atomic_load_max_32_gen, i32imm, imm, hasAtomRedGen32>;
+defm INT_PTX_ATOM_LOAD_MAX_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global",
+ ".s32", ".max", atomic_load_max_32_gen, i32imm, imm, useAtomRedG32forGen32>;
+defm INT_PTX_ATOM_LOAD_MAX_G_64 : F_ATOMIC_2<Int64Regs, ".global", ".s64",
+ ".max", atomic_load_max_64_g, i64imm, imm, hasAtomRedG64>;
+defm INT_PTX_ATOM_LOAD_MAX_S_64 : F_ATOMIC_2<Int64Regs, ".shared", ".s64",
+ ".max", atomic_load_max_64_s, i64imm, imm, hasAtomRedS64>;
+defm INT_PTX_ATOM_LOAD_MAX_GEN_64 : F_ATOMIC_2<Int64Regs, "", ".s64", ".max",
+ atomic_load_max_64_gen, i64imm, imm, hasAtomRedGen64>;
+defm INT_PTX_ATOM_LOAD_MAX_GEN_64_USE_G : F_ATOMIC_2<Int64Regs, ".global",
+ ".s64", ".max", atomic_load_max_64_gen, i64imm, imm, useAtomRedG64forGen64>;
+defm INT_PTX_ATOM_LOAD_UMAX_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".u32",
+ ".max", atomic_load_umax_32_g, i32imm, imm, hasAtomRedG32>;
+defm INT_PTX_ATOM_LOAD_UMAX_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".u32",
+ ".max", atomic_load_umax_32_s, i32imm, imm, hasAtomRedS32>;
+defm INT_PTX_ATOM_LOAD_UMAX_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".u32", ".max",
+ atomic_load_umax_32_gen, i32imm, imm, hasAtomRedGen32>;
+defm INT_PTX_ATOM_LOAD_UMAX_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global",
+ ".u32", ".max", atomic_load_umax_32_gen, i32imm, imm, useAtomRedG32forGen32>;
+defm INT_PTX_ATOM_LOAD_UMAX_G_64 : F_ATOMIC_2<Int64Regs, ".global", ".u64",
+ ".max", atomic_load_umax_64_g, i64imm, imm, hasAtomRedG64>;
+defm INT_PTX_ATOM_LOAD_UMAX_S_64 : F_ATOMIC_2<Int64Regs, ".shared", ".u64",
+ ".max", atomic_load_umax_64_s, i64imm, imm, hasAtomRedS64>;
+defm INT_PTX_ATOM_LOAD_UMAX_GEN_64 : F_ATOMIC_2<Int64Regs, "", ".u64", ".max",
+ atomic_load_umax_64_gen, i64imm, imm, hasAtomRedGen64>;
+defm INT_PTX_ATOM_LOAD_UMAX_GEN_64_USE_G : F_ATOMIC_2<Int64Regs, ".global",
+ ".u64", ".max", atomic_load_umax_64_gen, i64imm, imm, useAtomRedG64forGen64>;
+
+// atom_min
+
+def atomic_load_min_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+ (atomic_load_min_32 node:$a, node:$b)>;
+def atomic_load_min_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+ (atomic_load_min_32 node:$a, node:$b)>;
+def atomic_load_min_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+ (atomic_load_min_32 node:$a, node:$b)>;
+def atomic_load_min_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+ (atomic_load_min_64 node:$a, node:$b)>;
+def atomic_load_min_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+ (atomic_load_min_64 node:$a, node:$b)>;
+def atomic_load_min_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+ (atomic_load_min_64 node:$a, node:$b)>;
+def atomic_load_umin_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+ (atomic_load_umin_32 node:$a, node:$b)>;
+def atomic_load_umin_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+ (atomic_load_umin_32 node:$a, node:$b)>;
+def atomic_load_umin_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+ (atomic_load_umin_32 node:$a, node:$b)>;
+def atomic_load_umin_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+ (atomic_load_umin_64 node:$a, node:$b)>;
+def atomic_load_umin_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+ (atomic_load_umin_64 node:$a, node:$b)>;
+def atomic_load_umin_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+ (atomic_load_umin_64 node:$a, node:$b)>;
+
+defm INT_PTX_ATOM_LOAD_MIN_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".s32",
+ ".min", atomic_load_min_32_g, i32imm, imm, hasAtomRedG32>;
+defm INT_PTX_ATOM_LOAD_MIN_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".s32",
+ ".min", atomic_load_min_32_s, i32imm, imm, hasAtomRedS32>;
+defm INT_PTX_ATOM_LOAD_MIN_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".s32", ".min",
+ atomic_load_min_32_gen, i32imm, imm, hasAtomRedGen32>;
+defm INT_PTX_ATOM_LOAD_MIN_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global",
+ ".s32", ".min", atomic_load_min_32_gen, i32imm, imm, useAtomRedG32forGen32>;
+defm INT_PTX_ATOM_LOAD_MIN_G_64 : F_ATOMIC_2<Int64Regs, ".global", ".s64",
+ ".min", atomic_load_min_64_g, i64imm, imm, hasAtomRedG64>;
+defm INT_PTX_ATOM_LOAD_MIN_S_64 : F_ATOMIC_2<Int64Regs, ".shared", ".s64",
+ ".min", atomic_load_min_64_s, i64imm, imm, hasAtomRedS64>;
+defm INT_PTX_ATOM_LOAD_MIN_GEN_64 : F_ATOMIC_2<Int64Regs, "", ".s64", ".min",
+ atomic_load_min_64_gen, i64imm, imm, hasAtomRedGen64>;
+defm INT_PTX_ATOM_LOAD_MIN_GEN_64_USE_G : F_ATOMIC_2<Int64Regs, ".global",
+ ".s64", ".min", atomic_load_min_64_gen, i64imm, imm, useAtomRedG64forGen64>;
+defm INT_PTX_ATOM_LOAD_UMIN_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".u32",
+ ".min", atomic_load_umin_32_g, i32imm, imm, hasAtomRedG32>;
+defm INT_PTX_ATOM_LOAD_UMIN_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".u32",
+ ".min", atomic_load_umin_32_s, i32imm, imm, hasAtomRedS32>;
+defm INT_PTX_ATOM_LOAD_UMIN_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".u32", ".min",
+ atomic_load_umin_32_gen, i32imm, imm, hasAtomRedGen32>;
+defm INT_PTX_ATOM_LOAD_UMIN_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global",
+ ".u32", ".min", atomic_load_umin_32_gen, i32imm, imm, useAtomRedG32forGen32>;
+defm INT_PTX_ATOM_LOAD_UMIN_G_64 : F_ATOMIC_2<Int64Regs, ".global", ".u64",
+ ".min", atomic_load_umin_64_g, i64imm, imm, hasAtomRedG64>;
+defm INT_PTX_ATOM_LOAD_UMIN_S_64 : F_ATOMIC_2<Int64Regs, ".shared", ".u64",
+ ".min", atomic_load_umin_64_s, i64imm, imm, hasAtomRedS64>;
+defm INT_PTX_ATOM_LOAD_UMIN_GEN_64 : F_ATOMIC_2<Int64Regs, "", ".u64", ".min",
+ atomic_load_umin_64_gen, i64imm, imm, hasAtomRedGen64>;
+defm INT_PTX_ATOM_LOAD_UMIN_GEN_64_USE_G : F_ATOMIC_2<Int64Regs, ".global",
+ ".u64", ".min", atomic_load_umin_64_gen, i64imm, imm, useAtomRedG64forGen64>;
+
+// atom_inc atom_dec
+
+def atomic_load_inc_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+ (int_nvvm_atomic_load_inc_32 node:$a, node:$b)>;
+def atomic_load_inc_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+ (int_nvvm_atomic_load_inc_32 node:$a, node:$b)>;
+def atomic_load_inc_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+ (int_nvvm_atomic_load_inc_32 node:$a, node:$b)>;
+def atomic_load_dec_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+ (int_nvvm_atomic_load_dec_32 node:$a, node:$b)>;
+def atomic_load_dec_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+ (int_nvvm_atomic_load_dec_32 node:$a, node:$b)>;
+def atomic_load_dec_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+ (int_nvvm_atomic_load_dec_32 node:$a, node:$b)>;
+
+defm INT_PTX_ATOM_INC_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".u32", ".inc",
+ atomic_load_inc_32_g, i32imm, imm, hasAtomRedG32>;
+defm INT_PTX_ATOM_INC_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".u32", ".inc",
+ atomic_load_inc_32_s, i32imm, imm, hasAtomRedS32>;
+defm INT_PTX_ATOM_INC_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".u32", ".inc",
+ atomic_load_inc_32_gen, i32imm, imm, hasAtomRedGen32>;
+defm INT_PTX_ATOM_INC_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global", ".u32",
+ ".inc", atomic_load_inc_32_gen, i32imm, imm, useAtomRedG32forGen32>;
+defm INT_PTX_ATOM_DEC_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".u32", ".dec",
+ atomic_load_dec_32_g, i32imm, imm, hasAtomRedG32>;
+defm INT_PTX_ATOM_DEC_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".u32", ".dec",
+ atomic_load_dec_32_s, i32imm, imm, hasAtomRedS32>;
+defm INT_PTX_ATOM_DEC_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".u32", ".dec",
+ atomic_load_dec_32_gen, i32imm, imm, hasAtomRedGen32>;
+defm INT_PTX_ATOM_DEC_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global", ".u32",
+ ".dec", atomic_load_dec_32_gen, i32imm, imm, useAtomRedG32forGen32>;
+
+// atom_and
+
+def atomic_load_and_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+ (atomic_load_and_32 node:$a, node:$b)>;
+def atomic_load_and_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+ (atomic_load_and_32 node:$a, node:$b)>;
+def atomic_load_and_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+ (atomic_load_and_32 node:$a, node:$b)>;
+def atomic_load_and_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+ (atomic_load_and_64 node:$a, node:$b)>;
+def atomic_load_and_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+ (atomic_load_and_64 node:$a, node:$b)>;
+def atomic_load_and_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+ (atomic_load_and_64 node:$a, node:$b)>;
+
+defm INT_PTX_ATOM_AND_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".b32", ".and",
+ atomic_load_and_32_g, i32imm, imm, hasAtomRedG32>;
+defm INT_PTX_ATOM_AND_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".b32", ".and",
+ atomic_load_and_32_s, i32imm, imm, hasAtomRedS32>;
+defm INT_PTX_ATOM_AND_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".b32", ".and",
+ atomic_load_and_32_gen, i32imm, imm, hasAtomRedGen32>;
+defm INT_PTX_ATOM_AND_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global", ".b32",
+ ".and", atomic_load_and_32_gen, i32imm, imm, useAtomRedG32forGen32>;
+defm INT_PTX_ATOM_AND_G_64 : F_ATOMIC_2<Int64Regs, ".global", ".b64", ".and",
+ atomic_load_and_64_g, i64imm, imm, hasAtomRedG64>;
+defm INT_PTX_ATOM_AND_S_64 : F_ATOMIC_2<Int64Regs, ".shared", ".b64", ".and",
+ atomic_load_and_64_s, i64imm, imm, hasAtomRedS64>;
+defm INT_PTX_ATOM_AND_GEN_64 : F_ATOMIC_2<Int64Regs, "", ".b64", ".and",
+ atomic_load_and_64_gen, i64imm, imm, hasAtomRedGen64>;
+defm INT_PTX_ATOM_AND_GEN_64_USE_G : F_ATOMIC_2<Int64Regs, ".global", ".b64",
+ ".and", atomic_load_and_64_gen, i64imm, imm, useAtomRedG64forGen64>;
+
+// atom_or
+
+def atomic_load_or_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+ (atomic_load_or_32 node:$a, node:$b)>;
+def atomic_load_or_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+ (atomic_load_or_32 node:$a, node:$b)>;
+def atomic_load_or_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+ (atomic_load_or_32 node:$a, node:$b)>;
+def atomic_load_or_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+ (atomic_load_or_64 node:$a, node:$b)>;
+def atomic_load_or_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+ (atomic_load_or_64 node:$a, node:$b)>;
+def atomic_load_or_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+ (atomic_load_or_64 node:$a, node:$b)>;
+
+defm INT_PTX_ATOM_OR_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".b32", ".or",
+ atomic_load_or_32_g, i32imm, imm, hasAtomRedG32>;
+defm INT_PTX_ATOM_OR_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".b32", ".or",
+ atomic_load_or_32_gen, i32imm, imm, hasAtomRedGen32>;
+defm INT_PTX_ATOM_OR_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global", ".b32",
+ ".or", atomic_load_or_32_gen, i32imm, imm, useAtomRedG32forGen32>;
+defm INT_PTX_ATOM_OR_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".b32", ".or",
+ atomic_load_or_32_s, i32imm, imm, hasAtomRedS32>;
+defm INT_PTX_ATOM_OR_G_64 : F_ATOMIC_2<Int64Regs, ".global", ".b64", ".or",
+ atomic_load_or_64_g, i64imm, imm, hasAtomRedG64>;
+defm INT_PTX_ATOM_OR_GEN_64 : F_ATOMIC_2<Int64Regs, "", ".b64", ".or",
+ atomic_load_or_64_gen, i64imm, imm, hasAtomRedGen64>;
+defm INT_PTX_ATOM_OR_GEN_64_USE_G : F_ATOMIC_2<Int64Regs, ".global", ".b64",
+ ".or", atomic_load_or_64_gen, i64imm, imm, useAtomRedG64forGen64>;
+defm INT_PTX_ATOM_OR_S_64 : F_ATOMIC_2<Int64Regs, ".shared", ".b64", ".or",
+ atomic_load_or_64_s, i64imm, imm, hasAtomRedS64>;
+
+// atom_xor
+
+def atomic_load_xor_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+ (atomic_load_xor_32 node:$a, node:$b)>;
+def atomic_load_xor_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+ (atomic_load_xor_32 node:$a, node:$b)>;
+def atomic_load_xor_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+ (atomic_load_xor_32 node:$a, node:$b)>;
+def atomic_load_xor_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+ (atomic_load_xor_64 node:$a, node:$b)>;
+def atomic_load_xor_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+ (atomic_load_xor_64 node:$a, node:$b)>;
+def atomic_load_xor_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+ (atomic_load_xor_64 node:$a, node:$b)>;
+
+defm INT_PTX_ATOM_XOR_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".b32", ".xor",
+ atomic_load_xor_32_g, i32imm, imm, hasAtomRedG32>;
+defm INT_PTX_ATOM_XOR_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".b32", ".xor",
+ atomic_load_xor_32_s, i32imm, imm, hasAtomRedS32>;
+defm INT_PTX_ATOM_XOR_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".b32", ".xor",
+ atomic_load_xor_32_gen, i32imm, imm, hasAtomRedGen32>;
+defm INT_PTX_ATOM_XOR_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global", ".b32",
+ ".xor", atomic_load_xor_32_gen, i32imm, imm, useAtomRedG32forGen32>;
+defm INT_PTX_ATOM_XOR_G_64 : F_ATOMIC_2<Int64Regs, ".global", ".b64", ".xor",
+ atomic_load_xor_64_g, i64imm, imm, hasAtomRedG64>;
+defm INT_PTX_ATOM_XOR_S_64 : F_ATOMIC_2<Int64Regs, ".shared", ".b64", ".xor",
+ atomic_load_xor_64_s, i64imm, imm, hasAtomRedS64>;
+defm INT_PTX_ATOM_XOR_GEN_64 : F_ATOMIC_2<Int64Regs, "", ".b64", ".xor",
+ atomic_load_xor_64_gen, i64imm, imm, hasAtomRedGen64>;
+defm INT_PTX_ATOM_XOR_GEN_64_USE_G : F_ATOMIC_2<Int64Regs, ".global", ".b64",
+ ".xor", atomic_load_xor_64_gen, i64imm, imm, useAtomRedG64forGen64>;
+
+// atom_cas
+
+def atomic_cmp_swap_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b, node:$c),
+ (atomic_cmp_swap_32 node:$a, node:$b, node:$c)>;
+def atomic_cmp_swap_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b, node:$c),
+ (atomic_cmp_swap_32 node:$a, node:$b, node:$c)>;
+def atomic_cmp_swap_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b, node:$c),
+ (atomic_cmp_swap_32 node:$a, node:$b, node:$c)>;
+def atomic_cmp_swap_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b, node:$c),
+ (atomic_cmp_swap_64 node:$a, node:$b, node:$c)>;
+def atomic_cmp_swap_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b, node:$c),
+ (atomic_cmp_swap_64 node:$a, node:$b, node:$c)>;
+def atomic_cmp_swap_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b, node:$c),
+ (atomic_cmp_swap_64 node:$a, node:$b, node:$c)>;
+
+defm INT_PTX_ATOM_CAS_G_32 : F_ATOMIC_3<Int32Regs, ".global", ".b32", ".cas",
+ atomic_cmp_swap_32_g, i32imm, hasAtomRedG32>;
+defm INT_PTX_ATOM_CAS_S_32 : F_ATOMIC_3<Int32Regs, ".shared", ".b32", ".cas",
+ atomic_cmp_swap_32_s, i32imm, hasAtomRedS32>;
+defm INT_PTX_ATOM_CAS_GEN_32 : F_ATOMIC_3<Int32Regs, "", ".b32", ".cas",
+ atomic_cmp_swap_32_gen, i32imm, hasAtomRedGen32>;
+defm INT_PTX_ATOM_CAS_GEN_32_USE_G : F_ATOMIC_3<Int32Regs, ".global", ".b32",
+ ".cas", atomic_cmp_swap_32_gen, i32imm, useAtomRedG32forGen32>;
+defm INT_PTX_ATOM_CAS_G_64 : F_ATOMIC_3<Int64Regs, ".global", ".b64", ".cas",
+ atomic_cmp_swap_64_g, i64imm, hasAtomRedG64>;
+defm INT_PTX_ATOM_CAS_S_64 : F_ATOMIC_3<Int64Regs, ".shared", ".b64", ".cas",
+ atomic_cmp_swap_64_s, i64imm, hasAtomRedS64>;
+defm INT_PTX_ATOM_CAS_GEN_64 : F_ATOMIC_3<Int64Regs, "", ".b64", ".cas",
+ atomic_cmp_swap_64_gen, i64imm, hasAtomRedGen64>;
+defm INT_PTX_ATOM_CAS_GEN_64_USE_G : F_ATOMIC_3<Int64Regs, ".global", ".b64",
+ ".cas", atomic_cmp_swap_64_gen, i64imm, useAtomRedG64forGen64>;
+
+
+//-----------------------------------
+// Read Special Registers
+//-----------------------------------
+class F_SREG<string OpStr, NVPTXRegClass regclassOut, Intrinsic IntOp> :
+ NVPTXInst<(outs regclassOut:$dst), (ins),
+ OpStr,
+ [(set regclassOut:$dst, (IntOp))]>;
+
+def INT_PTX_SREG_TID_X : F_SREG<"mov.u32 \t$dst, %tid.x;", Int32Regs,
+ int_nvvm_read_ptx_sreg_tid_x>;
+def INT_PTX_SREG_TID_Y : F_SREG<"mov.u32 \t$dst, %tid.y;", Int32Regs,
+ int_nvvm_read_ptx_sreg_tid_y>;
+def INT_PTX_SREG_TID_Z : F_SREG<"mov.u32 \t$dst, %tid.z;", Int32Regs,
+ int_nvvm_read_ptx_sreg_tid_z>;
+
+def INT_PTX_SREG_NTID_X : F_SREG<"mov.u32 \t$dst, %ntid.x;", Int32Regs,
+ int_nvvm_read_ptx_sreg_ntid_x>;
+def INT_PTX_SREG_NTID_Y : F_SREG<"mov.u32 \t$dst, %ntid.y;", Int32Regs,
+ int_nvvm_read_ptx_sreg_ntid_y>;
+def INT_PTX_SREG_NTID_Z : F_SREG<"mov.u32 \t$dst, %ntid.z;", Int32Regs,
+ int_nvvm_read_ptx_sreg_ntid_z>;
+
+def INT_PTX_SREG_CTAID_X : F_SREG<"mov.u32 \t$dst, %ctaid.x;", Int32Regs,
+ int_nvvm_read_ptx_sreg_ctaid_x>;
+def INT_PTX_SREG_CTAID_Y : F_SREG<"mov.u32 \t$dst, %ctaid.y;", Int32Regs,
+ int_nvvm_read_ptx_sreg_ctaid_y>;
+def INT_PTX_SREG_CTAID_Z : F_SREG<"mov.u32 \t$dst, %ctaid.z;", Int32Regs,
+ int_nvvm_read_ptx_sreg_ctaid_z>;
+
+def INT_PTX_SREG_NCTAID_X : F_SREG<"mov.u32 \t$dst, %nctaid.x;", Int32Regs,
+ int_nvvm_read_ptx_sreg_nctaid_x>;
+def INT_PTX_SREG_NCTAID_Y : F_SREG<"mov.u32 \t$dst, %nctaid.y;", Int32Regs,
+ int_nvvm_read_ptx_sreg_nctaid_y>;
+def INT_PTX_SREG_NCTAID_Z : F_SREG<"mov.u32 \t$dst, %nctaid.z;", Int32Regs,
+ int_nvvm_read_ptx_sreg_nctaid_z>;
+
+def INT_PTX_SREG_WARPSIZE : F_SREG<"mov.u32 \t$dst, WARP_SZ;", Int32Regs,
+ int_nvvm_read_ptx_sreg_warpsize>;
+
+
+//-----------------------------------
+// Support for ldu on sm_20 or later
+//-----------------------------------
+
+// Scalar
+multiclass LDU_G<string TyStr, NVPTXRegClass regclass> {
+ def areg: NVPTXInst<(outs regclass:$result), (ins Int32Regs:$src),
+ !strconcat("ldu.global.", TyStr),
+ []>, Requires<[hasLDU]>;
+ def areg64: NVPTXInst<(outs regclass:$result), (ins Int64Regs:$src),
+ !strconcat("ldu.global.", TyStr),
+ []>, Requires<[hasLDU]>;
+ def avar: NVPTXInst<(outs regclass:$result), (ins imemAny:$src),
+ !strconcat("ldu.global.", TyStr),
+ []>, Requires<[hasLDU]>;
+ def ari : NVPTXInst<(outs regclass:$result), (ins MEMri:$src),
+ !strconcat("ldu.global.", TyStr),
+ []>, Requires<[hasLDU]>;
+ def ari64 : NVPTXInst<(outs regclass:$result), (ins MEMri64:$src),
+ !strconcat("ldu.global.", TyStr),
+ []>, Requires<[hasLDU]>;
+}
+
+defm INT_PTX_LDU_GLOBAL_i8 : LDU_G<"u8 \t$result, [$src];", Int16Regs>;
+defm INT_PTX_LDU_GLOBAL_i16 : LDU_G<"u16 \t$result, [$src];", Int16Regs>;
+defm INT_PTX_LDU_GLOBAL_i32 : LDU_G<"u32 \t$result, [$src];", Int32Regs>;
+defm INT_PTX_LDU_GLOBAL_i64 : LDU_G<"u64 \t$result, [$src];", Int64Regs>;
+defm INT_PTX_LDU_GLOBAL_f32 : LDU_G<"f32 \t$result, [$src];", Float32Regs>;
+defm INT_PTX_LDU_GLOBAL_f64 : LDU_G<"f64 \t$result, [$src];", Float64Regs>;
+defm INT_PTX_LDU_GLOBAL_p32 : LDU_G<"u32 \t$result, [$src];", Int32Regs>;
+defm INT_PTX_LDU_GLOBAL_p64 : LDU_G<"u64 \t$result, [$src];", Int64Regs>;
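+
+// Illustrative sketch (editorial): each scalar variant pairs one addressing
+// mode with the ldu opcode; e.g. INT_PTX_LDU_GLOBAL_i32areg prints roughly
+//   ldu.global.u32  $result, [$src];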
+
+// vector
+
+// Elementized vector ldu
+multiclass VLDU_G_ELE_V2<string TyStr, NVPTXRegClass regclass> {
+ def _areg32: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
+ (ins Int32Regs:$src),
+ !strconcat("ldu.global.", TyStr), []>;
+ def _areg64: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
+ (ins Int64Regs:$src),
+ !strconcat("ldu.global.", TyStr), []>;
+ def _ari32: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
+ (ins MEMri:$src),
+ !strconcat("ldu.global.", TyStr), []>;
+ def _ari64: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
+ (ins MEMri64:$src),
+ !strconcat("ldu.global.", TyStr), []>;
+ def _avar: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
+ (ins imemAny:$src),
+ !strconcat("ldu.global.", TyStr), []>;
+}
+
+multiclass VLDU_G_ELE_V4<string TyStr, NVPTXRegClass regclass> {
+ def _areg32: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
+ regclass:$dst4), (ins Int32Regs:$src),
+ !strconcat("ldu.global.", TyStr), []>;
+ def _areg64: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
+ regclass:$dst4), (ins Int64Regs:$src),
+ !strconcat("ldu.global.", TyStr), []>;
+ def _ari32: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
+ regclass:$dst4), (ins MEMri:$src),
+ !strconcat("ldu.global.", TyStr), []>;
+ def _ari64: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
+ regclass:$dst4), (ins MEMri64:$src),
+ !strconcat("ldu.global.", TyStr), []>;
+ def _avar: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
+ regclass:$dst4), (ins imemAny:$src),
+ !strconcat("ldu.global.", TyStr), []>;
+}
+
+defm INT_PTX_LDU_G_v2i8_ELE
+ : VLDU_G_ELE_V2<"v2.u8 \t{{$dst1, $dst2}}, [$src];", Int16Regs>;
+defm INT_PTX_LDU_G_v2i16_ELE
+ : VLDU_G_ELE_V2<"v2.u16 \t{{$dst1, $dst2}}, [$src];", Int16Regs>;
+defm INT_PTX_LDU_G_v2i32_ELE
+ : VLDU_G_ELE_V2<"v2.u32 \t{{$dst1, $dst2}}, [$src];", Int32Regs>;
+defm INT_PTX_LDU_G_v2f32_ELE
+ : VLDU_G_ELE_V2<"v2.f32 \t{{$dst1, $dst2}}, [$src];", Float32Regs>;
+defm INT_PTX_LDU_G_v2i64_ELE
+ : VLDU_G_ELE_V2<"v2.u64 \t{{$dst1, $dst2}}, [$src];", Int64Regs>;
+defm INT_PTX_LDU_G_v2f64_ELE
+ : VLDU_G_ELE_V2<"v2.f64 \t{{$dst1, $dst2}}, [$src];", Float64Regs>;
+defm INT_PTX_LDU_G_v4i8_ELE
+ : VLDU_G_ELE_V4<"v4.u8 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Int16Regs>;
+defm INT_PTX_LDU_G_v4i16_ELE
+ : VLDU_G_ELE_V4<"v4.u16 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];",
+ Int16Regs>;
+defm INT_PTX_LDU_G_v4i32_ELE
+ : VLDU_G_ELE_V4<"v4.u32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];",
+ Int32Regs>;
+defm INT_PTX_LDU_G_v4f32_ELE
+ : VLDU_G_ELE_V4<"v4.f32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];",
+ Float32Regs>;
+
+
+//-----------------------------------
+// Support for ldg on sm_35 or later
+//-----------------------------------
+
+multiclass LDG_G<string TyStr, NVPTXRegClass regclass> {
+ def areg: NVPTXInst<(outs regclass:$result), (ins Int32Regs:$src),
+ !strconcat("ld.global.nc.", TyStr),
+ []>, Requires<[hasLDG]>;
+ def areg64: NVPTXInst<(outs regclass:$result), (ins Int64Regs:$src),
+ !strconcat("ld.global.nc.", TyStr),
+ []>, Requires<[hasLDG]>;
+ def avar: NVPTXInst<(outs regclass:$result), (ins imemAny:$src),
+ !strconcat("ld.global.nc.", TyStr),
+ []>, Requires<[hasLDG]>;
+ def ari : NVPTXInst<(outs regclass:$result), (ins MEMri:$src),
+ !strconcat("ld.global.nc.", TyStr),
+ []>, Requires<[hasLDG]>;
+ def ari64 : NVPTXInst<(outs regclass:$result), (ins MEMri64:$src),
+ !strconcat("ld.global.nc.", TyStr),
+ []>, Requires<[hasLDG]>;
+}
+
+defm INT_PTX_LDG_GLOBAL_i8
+ : LDG_G<"u8 \t$result, [$src];", Int16Regs>;
+defm INT_PTX_LDG_GLOBAL_i16
+ : LDG_G<"u16 \t$result, [$src];", Int16Regs>;
+defm INT_PTX_LDG_GLOBAL_i32
+ : LDG_G<"u32 \t$result, [$src];", Int32Regs>;
+defm INT_PTX_LDG_GLOBAL_i64
+ : LDG_G<"u64 \t$result, [$src];", Int64Regs>;
+defm INT_PTX_LDG_GLOBAL_f32
+ : LDG_G<"f32 \t$result, [$src];", Float32Regs>;
+defm INT_PTX_LDG_GLOBAL_f64
+ : LDG_G<"f64 \t$result, [$src];", Float64Regs>;
+defm INT_PTX_LDG_GLOBAL_p32
+ : LDG_G<"u32 \t$result, [$src];", Int32Regs>;
+defm INT_PTX_LDG_GLOBAL_p64
+ : LDG_G<"u64 \t$result, [$src];", Int64Regs>;
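+
+// Illustrative sketch (editorial): these mirror the ldu forms above but use
+// the non-coherent global load; e.g. INT_PTX_LDG_GLOBAL_f32areg prints roughly
+//   ld.global.nc.f32  $result, [$src];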
+
+// vector
+
+// Elementized vector ldg
+multiclass VLDG_G_ELE_V2<string TyStr, NVPTXRegClass regclass> {
+ def _areg32: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
+ (ins Int32Regs:$src),
+ !strconcat("ld.global.nc.", TyStr), []>;
+ def _areg64: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
+ (ins Int64Regs:$src),
+ !strconcat("ld.global.nc.", TyStr), []>;
+ def _ari32: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
+ (ins MEMri:$src),
+ !strconcat("ld.global.nc.", TyStr), []>;
+ def _ari64: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
+ (ins MEMri64:$src),
+ !strconcat("ld.global.nc.", TyStr), []>;
+ def _avar: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
+ (ins imemAny:$src),
+ !strconcat("ld.global.nc.", TyStr), []>;
+}
+
+multiclass VLDG_G_ELE_V4<string TyStr, NVPTXRegClass regclass> {
+ def _areg32: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
+ regclass:$dst4), (ins Int32Regs:$src),
+ !strconcat("ld.global.nc.", TyStr), []>;
+ def _areg64: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
+ regclass:$dst4), (ins Int64Regs:$src),
+ !strconcat("ld.global.nc.", TyStr), []>;
+ def _ari32: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
+ regclass:$dst4), (ins MEMri:$src),
+ !strconcat("ld.global.nc.", TyStr), []>;
+ def _ari64: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
+ regclass:$dst4), (ins MEMri64:$src),
+ !strconcat("ld.global.nc.", TyStr), []>;
+ def _avar: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
+ regclass:$dst4), (ins imemAny:$src),
+ !strconcat("ld.global.nc.", TyStr), []>;
+}
+
+// FIXME: 8-bit LDG should be fixed once LDG/LDU nodes are made into proper loads.
+defm INT_PTX_LDG_G_v2i8_ELE
+ : VLDG_G_ELE_V2<"v2.u8 \t{{$dst1, $dst2}}, [$src];", Int16Regs>;
+defm INT_PTX_LDG_G_v2i16_ELE
+ : VLDG_G_ELE_V2<"v2.u16 \t{{$dst1, $dst2}}, [$src];", Int16Regs>;
+defm INT_PTX_LDG_G_v2i32_ELE
+ : VLDG_G_ELE_V2<"v2.u32 \t{{$dst1, $dst2}}, [$src];", Int32Regs>;
+defm INT_PTX_LDG_G_v2f32_ELE
+ : VLDG_G_ELE_V2<"v2.f32 \t{{$dst1, $dst2}}, [$src];", Float32Regs>;
+defm INT_PTX_LDG_G_v2i64_ELE
+ : VLDG_G_ELE_V2<"v2.u64 \t{{$dst1, $dst2}}, [$src];", Int64Regs>;
+defm INT_PTX_LDG_G_v2f64_ELE
+ : VLDG_G_ELE_V2<"v2.f64 \t{{$dst1, $dst2}}, [$src];", Float64Regs>;
+defm INT_PTX_LDG_G_v4i8_ELE
+ : VLDG_G_ELE_V4<"v4.u8 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Int16Regs>;
+defm INT_PTX_LDG_G_v4i16_ELE
+ : VLDG_G_ELE_V4<"v4.u16 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Int16Regs>;
+defm INT_PTX_LDG_G_v4i32_ELE
+ : VLDG_G_ELE_V4<"v4.u32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Int32Regs>;
+defm INT_PTX_LDG_G_v4f32_ELE
+ : VLDG_G_ELE_V4<"v4.f32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Float32Regs>;
+
+
+multiclass NG_TO_G<string Str, Intrinsic Intrin> {
+ def _yes : NVPTXInst<(outs Int32Regs:$result), (ins Int32Regs:$src),
+ !strconcat("cvta.", !strconcat(Str, ".u32 \t$result, $src;")),
+ [(set Int32Regs:$result, (Intrin Int32Regs:$src))]>,
+ Requires<[hasGenericLdSt]>;
+ def _yes_64 : NVPTXInst<(outs Int64Regs:$result), (ins Int64Regs:$src),
+ !strconcat("cvta.", !strconcat(Str, ".u64 \t$result, $src;")),
+ [(set Int64Regs:$result, (Intrin Int64Regs:$src))]>,
+ Requires<[hasGenericLdSt]>;
+
+// @TODO: Are these actually needed? I believe global addresses will be copied
+// to register values anyway.
+ /*def __addr_yes : NVPTXInst<(outs Int32Regs:$result), (ins imemAny:$src),
+ !strconcat("cvta.", !strconcat(Str, ".u32 \t$result, $src;")),
+ [(set Int32Regs:$result, (Intrin (Wrapper tglobaladdr:$src)))]>,
+ Requires<[hasGenericLdSt]>;
+ def __addr_yes_64 : NVPTXInst<(outs Int64Regs:$result), (ins imemAny:$src),
+ !strconcat("cvta.", !strconcat(Str, ".u64 \t$result, $src;")),
+ [(set Int64Regs:$result, (Intrin (Wrapper tglobaladdr:$src)))]>,
+ Requires<[hasGenericLdSt]>;*/
+
+ def _no : NVPTXInst<(outs Int32Regs:$result), (ins Int32Regs:$src),
+ "mov.u32 \t$result, $src;",
+ [(set Int32Regs:$result, (Intrin Int32Regs:$src))]>;
+ def _no_64 : NVPTXInst<(outs Int64Regs:$result), (ins Int64Regs:$src),
+ "mov.u64 \t$result, $src;",
+ [(set Int64Regs:$result, (Intrin Int64Regs:$src))]>;
+
+// @TODO: Are these actually needed? I believe global addresses will be copied
+// to register values anyway.
+ /*def _addr_no : NVPTXInst<(outs Int32Regs:$result), (ins imem:$src),
+ "mov.u32 \t$result, $src;",
+ [(set Int32Regs:$result, (Intrin (Wrapper tglobaladdr:$src)))]>;
+ def _addr_no_64 : NVPTXInst<(outs Int64Regs:$result), (ins imem:$src),
+ "mov.u64 \t$result, $src;",
+ [(set Int64Regs:$result, (Intrin (Wrapper tglobaladdr:$src)))]>;*/
+}
+
+multiclass G_TO_NG<string Str, Intrinsic Intrin> {
+ def _yes : NVPTXInst<(outs Int32Regs:$result), (ins Int32Regs:$src),
+ !strconcat("cvta.to.", !strconcat(Str, ".u32 \t$result, $src;")),
+ [(set Int32Regs:$result, (Intrin Int32Regs:$src))]>,
+ Requires<[hasGenericLdSt]>;
+ def _yes_64 : NVPTXInst<(outs Int64Regs:$result), (ins Int64Regs:$src),
+ !strconcat("cvta.to.", !strconcat(Str, ".u64 \t$result, $src;")),
+ [(set Int64Regs:$result, (Intrin Int64Regs:$src))]>,
+ Requires<[hasGenericLdSt]>;
+ def _no : NVPTXInst<(outs Int32Regs:$result), (ins Int32Regs:$src),
+ "mov.u32 \t$result, $src;",
+ [(set Int32Regs:$result, (Intrin Int32Regs:$src))]>;
+ def _no_64 : NVPTXInst<(outs Int64Regs:$result), (ins Int64Regs:$src),
+ "mov.u64 \t$result, $src;",
+ [(set Int64Regs:$result, (Intrin Int64Regs:$src))]>;
+}
+
+defm cvta_local : NG_TO_G<"local", int_nvvm_ptr_local_to_gen>;
+defm cvta_shared : NG_TO_G<"shared", int_nvvm_ptr_shared_to_gen>;
+defm cvta_global : NG_TO_G<"global", int_nvvm_ptr_global_to_gen>;
+defm cvta_const : NG_TO_G<"const", int_nvvm_ptr_constant_to_gen>;
+
+defm cvta_to_local : G_TO_NG<"local", int_nvvm_ptr_gen_to_local>;
+defm cvta_to_shared : G_TO_NG<"shared", int_nvvm_ptr_gen_to_shared>;
+defm cvta_to_global : G_TO_NG<"global", int_nvvm_ptr_gen_to_global>;
+defm cvta_to_const : G_TO_NG<"const", int_nvvm_ptr_gen_to_constant>;
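+
+// Illustrative sketch (editorial): when generic load/store is available, the
+// "_yes" variants print an explicit conversion, e.g. cvta_shared_yes becomes
+// roughly
+//   cvta.shared.u32  $result, $src;
+// while the "_no" fallbacks degenerate to a plain mov of the pointer value.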
+
+
+// nvvm.ptr.gen.to.param
+def nvvm_ptr_gen_to_param : NVPTXInst<(outs Int32Regs:$result),
+ (ins Int32Regs:$src),
+ "mov.u32 \t$result, $src;",
+ [(set Int32Regs:$result,
+ (int_nvvm_ptr_gen_to_param Int32Regs:$src))]>;
+def nvvm_ptr_gen_to_param_64 : NVPTXInst<(outs Int64Regs:$result),
+ (ins Int64Regs:$src),
+ "mov.u64 \t$result, $src;",
+ [(set Int64Regs:$result,
+ (int_nvvm_ptr_gen_to_param Int64Regs:$src))]>;
+
+
+// nvvm.move intrinsics
+def nvvm_move_i16 : NVPTXInst<(outs Int16Regs:$r), (ins Int16Regs:$s),
+ "mov.b16 \t$r, $s;",
+ [(set Int16Regs:$r,
+ (int_nvvm_move_i16 Int16Regs:$s))]>;
+def nvvm_move_i32 : NVPTXInst<(outs Int32Regs:$r), (ins Int32Regs:$s),
+ "mov.b32 \t$r, $s;",
+ [(set Int32Regs:$r,
+ (int_nvvm_move_i32 Int32Regs:$s))]>;
+def nvvm_move_i64 : NVPTXInst<(outs Int64Regs:$r), (ins Int64Regs:$s),
+ "mov.b64 \t$r, $s;",
+ [(set Int64Regs:$r,
+ (int_nvvm_move_i64 Int64Regs:$s))]>;
+def nvvm_move_float : NVPTXInst<(outs Float32Regs:$r), (ins Float32Regs:$s),
+ "mov.f32 \t$r, $s;",
+ [(set Float32Regs:$r,
+ (int_nvvm_move_float Float32Regs:$s))]>;
+def nvvm_move_double : NVPTXInst<(outs Float64Regs:$r), (ins Float64Regs:$s),
+ "mov.f64 \t$r, $s;",
+ [(set Float64Regs:$r,
+ (int_nvvm_move_double Float64Regs:$s))]>;
+def nvvm_move_ptr32 : NVPTXInst<(outs Int32Regs:$r), (ins Int32Regs:$s),
+ "mov.u32 \t$r, $s;",
+ [(set Int32Regs:$r,
+ (int_nvvm_move_ptr Int32Regs:$s))]>;
+def nvvm_move_ptr64 : NVPTXInst<(outs Int64Regs:$r), (ins Int64Regs:$s),
+ "mov.u64 \t$r, $s;",
+ [(set Int64Regs:$r,
+ (int_nvvm_move_ptr Int64Regs:$s))]>;
+
+// @TODO: Are these actually needed, or will we always just see symbols
+// copied to registers first?
+/*def nvvm_move_sym32 : NVPTXInst<(outs Int32Regs:$r), (ins imem:$s),
+ "mov.u32 \t$r, $s;",
+ [(set Int32Regs:$r,
+ (int_nvvm_move_ptr texternalsym:$s))]>;
+def nvvm_move_sym64 : NVPTXInst<(outs Int64Regs:$r), (ins imem:$s),
+ "mov.u64 \t$r, $s;",
+ [(set Int64Regs:$r,
+ (int_nvvm_move_ptr texternalsym:$s))]>;*/
+
+
+// MoveParam %r1, param
+// ptr_local_to_gen %r2, %r1
+// ptr_gen_to_local %r3, %r2
+// ->
+// mov %r1, param
+
+// @TODO: Revisit this. There is a type
+// contradiction between iPTRAny and iPTR for the addr defs, so the move_sym
+// instructions are not currently defined. However, we can use the ptr
+// variants and the asm printer will do the right thing.
+def : Pat<(i64 (int_nvvm_ptr_gen_to_local (int_nvvm_ptr_local_to_gen
+ (MoveParam texternalsym:$src)))),
+ (nvvm_move_ptr64 texternalsym:$src)>;
+def : Pat<(i32 (int_nvvm_ptr_gen_to_local (int_nvvm_ptr_local_to_gen
+ (MoveParam texternalsym:$src)))),
+ (nvvm_move_ptr32 texternalsym:$src)>;
+
+def texsurf_handles
+ : NVPTXInst<(outs Int64Regs:$result), (ins imem:$src),
+ "mov.u64 \t$result, $src;", []>;
+
+//-----------------------------------
+// Compiler Error/Warn
+// - Just ignored in codegen
+//-----------------------------------
+
+def INT_NVVM_COMPILER_WARN_32 : NVPTXInst<(outs), (ins Int32Regs:$a),
+ "// llvm.nvvm.compiler.warn()",
+ [(int_nvvm_compiler_warn Int32Regs:$a)]>;
+def INT_NVVM_COMPILER_WARN_64 : NVPTXInst<(outs), (ins Int64Regs:$a),
+ "// llvm.nvvm.compiler.warn()",
+ [(int_nvvm_compiler_warn Int64Regs:$a)]>;
+def INT_NVVM_COMPILER_ERROR_32 : NVPTXInst<(outs), (ins Int32Regs:$a),
+ "// llvm.nvvm.compiler.error()",
+ [(int_nvvm_compiler_error Int32Regs:$a)]>;
+def INT_NVVM_COMPILER_ERROR_64 : NVPTXInst<(outs), (ins Int64Regs:$a),
+ "// llvm.nvvm.compiler.error()",
+ [(int_nvvm_compiler_error Int64Regs:$a)]>;
+
+
+// isspacep
+
+def ISSPACEP_CONST_32
+ : NVPTXInst<(outs Int1Regs:$d), (ins Int32Regs:$a),
+ "isspacep.const \t$d, $a;",
+ [(set Int1Regs:$d, (int_nvvm_isspacep_const Int32Regs:$a))]>,
+ Requires<[hasPTX31]>;
+def ISSPACEP_CONST_64
+ : NVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a),
+ "isspacep.const \t$d, $a;",
+ [(set Int1Regs:$d, (int_nvvm_isspacep_const Int64Regs:$a))]>,
+ Requires<[hasPTX31]>;
+def ISSPACEP_GLOBAL_32
+ : NVPTXInst<(outs Int1Regs:$d), (ins Int32Regs:$a),
+ "isspacep.global \t$d, $a;",
+ [(set Int1Regs:$d, (int_nvvm_isspacep_global Int32Regs:$a))]>;
+def ISSPACEP_GLOBAL_64
+ : NVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a),
+ "isspacep.global \t$d, $a;",
+ [(set Int1Regs:$d, (int_nvvm_isspacep_global Int64Regs:$a))]>;
+def ISSPACEP_LOCAL_32
+ : NVPTXInst<(outs Int1Regs:$d), (ins Int32Regs:$a),
+ "isspacep.local \t$d, $a;",
+ [(set Int1Regs:$d, (int_nvvm_isspacep_local Int32Regs:$a))]>;
+def ISSPACEP_LOCAL_64
+ : NVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a),
+ "isspacep.local \t$d, $a;",
+ [(set Int1Regs:$d, (int_nvvm_isspacep_local Int64Regs:$a))]>;
+def ISSPACEP_SHARED_32
+ : NVPTXInst<(outs Int1Regs:$d), (ins Int32Regs:$a),
+ "isspacep.shared \t$d, $a;",
+ [(set Int1Regs:$d, (int_nvvm_isspacep_shared Int32Regs:$a))]>;
+def ISSPACEP_SHARED_64
+ : NVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a),
+ "isspacep.shared \t$d, $a;",
+ [(set Int1Regs:$d, (int_nvvm_isspacep_shared Int64Regs:$a))]>;
+
+
+// Special register reads
+def MOV_SPECIAL : NVPTXInst<(outs Int32Regs:$d),
+ (ins SpecialRegs:$r),
+ "mov.b32\t$d, $r;", []>;
+
+def : Pat<(int_nvvm_read_ptx_sreg_envreg0), (MOV_SPECIAL ENVREG0)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg1), (MOV_SPECIAL ENVREG1)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg2), (MOV_SPECIAL ENVREG2)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg3), (MOV_SPECIAL ENVREG3)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg4), (MOV_SPECIAL ENVREG4)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg5), (MOV_SPECIAL ENVREG5)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg6), (MOV_SPECIAL ENVREG6)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg7), (MOV_SPECIAL ENVREG7)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg8), (MOV_SPECIAL ENVREG8)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg9), (MOV_SPECIAL ENVREG9)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg10), (MOV_SPECIAL ENVREG10)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg11), (MOV_SPECIAL ENVREG11)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg12), (MOV_SPECIAL ENVREG12)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg13), (MOV_SPECIAL ENVREG13)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg14), (MOV_SPECIAL ENVREG14)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg15), (MOV_SPECIAL ENVREG15)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg16), (MOV_SPECIAL ENVREG16)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg17), (MOV_SPECIAL ENVREG17)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg18), (MOV_SPECIAL ENVREG18)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg19), (MOV_SPECIAL ENVREG19)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg20), (MOV_SPECIAL ENVREG20)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg21), (MOV_SPECIAL ENVREG21)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg22), (MOV_SPECIAL ENVREG22)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg23), (MOV_SPECIAL ENVREG23)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg24), (MOV_SPECIAL ENVREG24)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg25), (MOV_SPECIAL ENVREG25)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg26), (MOV_SPECIAL ENVREG26)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg27), (MOV_SPECIAL ENVREG27)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg28), (MOV_SPECIAL ENVREG28)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg29), (MOV_SPECIAL ENVREG29)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg30), (MOV_SPECIAL ENVREG30)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg31), (MOV_SPECIAL ENVREG31)>;
+
+
+// rotate builtin support
+
+def ROTATE_B32_HW_IMM
+ : NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$src, i32imm:$amt),
+ "shf.l.wrap.b32 \t$dst, $src, $src, $amt;",
+ [(set Int32Regs:$dst,
+ (int_nvvm_rotate_b32 Int32Regs:$src, (i32 imm:$amt)))]>,
+ Requires<[hasHWROT32]> ;
+
+def ROTATE_B32_HW_REG
+ : NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$src, Int32Regs:$amt),
+ "shf.l.wrap.b32 \t$dst, $src, $src, $amt;",
+ [(set Int32Regs:$dst,
+ (int_nvvm_rotate_b32 Int32Regs:$src, Int32Regs:$amt))]>,
+ Requires<[hasHWROT32]> ;
+
+def : Pat<(int_nvvm_rotate_b32 Int32Regs:$src, (i32 imm:$amt)),
+ (ROT32imm_sw Int32Regs:$src, imm:$amt, (SUB_FRM_32 node:$amt))>,
+ Requires<[noHWROT32]> ;
+
+def : Pat<(int_nvvm_rotate_b32 Int32Regs:$src, Int32Regs:$amt),
+ (ROTL32reg_sw Int32Regs:$src, Int32Regs:$amt)>,
+ Requires<[noHWROT32]> ;
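+
+// Illustrative sketch (editorial): with hardware funnel shifts, a 32-bit
+// rotate is a funnel shift of the value with itself, i.e.
+//   rotl32(x, n)  ==  shf.l.wrap.b32 d, x, x, n;
+// which is what ROTATE_B32_HW_IMM/_REG emit; without shf support the
+// software ROT32imm_sw/ROTL32reg_sw expansions are used instead.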
+
+def GET_LO_INT64
+ : NVPTXInst<(outs Int32Regs:$dst), (ins Int64Regs:$src),
+ !strconcat("{{\n\t",
+ !strconcat(".reg .b32 %dummy;\n\t",
+ !strconcat("mov.b64 \t{$dst,%dummy}, $src;\n\t",
+ !strconcat("}}", "")))),
+ []> ;
+
+def GET_HI_INT64
+ : NVPTXInst<(outs Int32Regs:$dst), (ins Int64Regs:$src),
+ !strconcat("{{\n\t",
+ !strconcat(".reg .b32 %dummy;\n\t",
+ !strconcat("mov.b64 \t{%dummy,$dst}, $src;\n\t",
+ !strconcat("}}", "")))),
+ []> ;
+
+def PACK_TWO_INT32
+ : NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$lo, Int32Regs:$hi),
+ "mov.b64 \t$dst, {{$lo, $hi}};", []> ;
+
+def : Pat<(int_nvvm_swap_lo_hi_b64 Int64Regs:$src),
+ (PACK_TWO_INT32 (GET_HI_INT64 Int64Regs:$src),
+ (GET_LO_INT64 Int64Regs:$src))> ;
+
+// funnel shift, requires >= sm_32
+def SHF_L_WRAP_B32_IMM
+ : NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$lo, Int32Regs:$hi, i32imm:$amt),
+ "shf.l.wrap.b32 \t$dst, $lo, $hi, $amt;",[]>,
+ Requires<[hasHWROT32]>;
+
+def SHF_L_WRAP_B32_REG
+ : NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
+ "shf.l.wrap.b32 \t$dst, $lo, $hi, $amt;",[]>,
+ Requires<[hasHWROT32]>;
+
+def SHF_R_WRAP_B32_IMM
+ : NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$lo, Int32Regs:$hi, i32imm:$amt),
+ "shf.r.wrap.b32 \t$dst, $lo, $hi, $amt;",[]>,
+ Requires<[hasHWROT32]>;
+
+def SHF_R_WRAP_B32_REG
+ : NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
+ "shf.r.wrap.b32 \t$dst, $lo, $hi, $amt;",[]>,
+ Requires<[hasHWROT32]>;
+
+// HW version of rotate 64
+def : Pat<(int_nvvm_rotate_b64 Int64Regs:$src, (i32 imm:$amt)),
+ (PACK_TWO_INT32
+ (SHF_L_WRAP_B32_IMM (GET_HI_INT64 Int64Regs:$src),
+ (GET_LO_INT64 Int64Regs:$src), imm:$amt),
+ (SHF_L_WRAP_B32_IMM (GET_LO_INT64 Int64Regs:$src),
+ (GET_HI_INT64 Int64Regs:$src), imm:$amt))>,
+ Requires<[hasHWROT32]>;
+
+def : Pat<(int_nvvm_rotate_b64 Int64Regs:$src, Int32Regs:$amt),
+ (PACK_TWO_INT32
+ (SHF_L_WRAP_B32_REG (GET_HI_INT64 Int64Regs:$src),
+ (GET_LO_INT64 Int64Regs:$src), Int32Regs:$amt),
+ (SHF_L_WRAP_B32_REG (GET_LO_INT64 Int64Regs:$src),
+ (GET_HI_INT64 Int64Regs:$src), Int32Regs:$amt))>,
+ Requires<[hasHWROT32]>;
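+
+// Illustrative sketch (editorial): each 32-bit half of the rotated result is
+// produced by one funnel shift across the two halves of the source, and
+// PACK_TWO_INT32 reassembles the halves; the rotate-right patterns below use
+// the same decomposition with shf.r.wrap.b32.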
+
+
+def : Pat<(int_nvvm_rotate_right_b64 Int64Regs:$src, (i32 imm:$amt)),
+ (PACK_TWO_INT32
+ (SHF_R_WRAP_B32_IMM (GET_LO_INT64 Int64Regs:$src),
+ (GET_HI_INT64 Int64Regs:$src), imm:$amt),
+ (SHF_R_WRAP_B32_IMM (GET_HI_INT64 Int64Regs:$src),
+ (GET_LO_INT64 Int64Regs:$src), imm:$amt))>,
+ Requires<[hasHWROT32]>;
+
+def : Pat<(int_nvvm_rotate_right_b64 Int64Regs:$src, Int32Regs:$amt),
+ (PACK_TWO_INT32
+ (SHF_R_WRAP_B32_REG (GET_LO_INT64 Int64Regs:$src),
+ (GET_HI_INT64 Int64Regs:$src), Int32Regs:$amt),
+ (SHF_R_WRAP_B32_REG (GET_HI_INT64 Int64Regs:$src),
+ (GET_LO_INT64 Int64Regs:$src), Int32Regs:$amt))>,
+ Requires<[hasHWROT32]>;
+
+// SW version of rotate 64
+def : Pat<(int_nvvm_rotate_b64 Int64Regs:$src, (i32 imm:$amt)),
+ (ROT64imm_sw Int64Regs:$src, imm:$amt, (SUB_FRM_32 node:$amt))>,
+ Requires<[noHWROT32]>;
+def : Pat<(int_nvvm_rotate_b64 Int64Regs:$src, Int32Regs:$amt),
+ (ROTL64reg_sw Int64Regs:$src, Int32Regs:$amt)>,
+ Requires<[noHWROT32]>;
+def : Pat<(int_nvvm_rotate_right_b64 Int64Regs:$src, (i32 imm:$amt)),
+ (ROT64imm_sw Int64Regs:$src, (SUB_FRM_64 node:$amt), imm:$amt)>,
+ Requires<[noHWROT32]>;
+def : Pat<(int_nvvm_rotate_right_b64 Int64Regs:$src, Int32Regs:$amt),
+ (ROTR64reg_sw Int64Regs:$src, Int32Regs:$amt)>,
+ Requires<[noHWROT32]>;
+
+
+//-----------------------------------
+// Texture Intrinsics
+//-----------------------------------
+
+// NOTE: For Fermi support, any new texture/surface/sampler intrinsics must
+// also be defined in NVPTXReplaceImageHandles.cpp
+
+// texmode_independent
+let IsTex = 1, IsTexModeUnified = 0 in {
+// Texture fetch instructions using handles
+def TEX_1D_F32_S32
+ : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+ Float32Regs:$b, Float32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x),
+ "tex.1d.v4.f32.s32\t\\{$r, $g, $b, $a\\}, [$t, $s, \\{$x\\}];",
+ []>;
+def TEX_1D_F32_F32
+ : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+ Float32Regs:$b, Float32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x),
+ "tex.1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, [$t, $s, \\{$x\\}];",
+ []>;
+def TEX_1D_F32_F32_LEVEL
+ : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+ Float32Regs:$b, Float32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$lod),
+ "tex.level.1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$x\\}], $lod;",
+ []>;
+def TEX_1D_F32_F32_GRAD
+ : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+ Float32Regs:$b, Float32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x,
+ Float32Regs:$gradx, Float32Regs:$grady),
+ "tex.grad.1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$x\\}], \\{$gradx\\}, \\{$grady\\};",
+ []>;
+def TEX_1D_S32_S32
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x),
+ "tex.1d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, [$t, $s, \\{$x\\}];",
+ []>;
+def TEX_1D_S32_F32
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x),
+ "tex.1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, [$t, $s, \\{$x\\}];",
+ []>;
+def TEX_1D_S32_F32_LEVEL
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x,
+ Float32Regs:$lod),
+ "tex.level.1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$x\\}], $lod;",
+ []>;
+def TEX_1D_S32_F32_GRAD
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x,
+ Float32Regs:$gradx, Float32Regs:$grady),
+ "tex.grad.1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$x\\}], \\{$gradx\\}, \\{$grady\\};",
+ []>;
+def TEX_1D_U32_S32
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x),
+ "tex.1d.v4.u32.s32\t\\{$r, $g, $b, $a\\}, [$t, $s, \\{$x\\}];",
+ []>;
+def TEX_1D_U32_F32
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x),
+ "tex.1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, [$t, $s, \\{$x\\}];",
+ []>;
+def TEX_1D_U32_F32_LEVEL
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x,
+ Float32Regs:$lod),
+ "tex.level.1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$x\\}], $lod;",
+ []>;
+def TEX_1D_U32_F32_GRAD
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x,
+ Float32Regs:$gradx, Float32Regs:$grady),
+ "tex.grad.1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$x\\}], \\{$gradx\\}, \\{$grady\\};",
+ []>;
+
+def TEX_1D_ARRAY_F32_S32
+ : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+ Float32Regs:$b, Float32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+ "tex.a1d.v4.f32.s32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$l, $x\\}];",
+ []>;
+def TEX_1D_ARRAY_F32_F32
+ : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+ Float32Regs:$b, Float32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x),
+ "tex.a1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$l, $x\\}];",
+ []>;
+def TEX_1D_ARRAY_F32_F32_LEVEL
+ : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+ Float32Regs:$b, Float32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
+ Float32Regs:$lod),
+ "tex.level.a1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$l, $x\\}], $lod;",
+ []>;
+def TEX_1D_ARRAY_F32_F32_GRAD
+ : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+ Float32Regs:$b, Float32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
+ Float32Regs:$gradx, Float32Regs:$grady),
+ "tex.grad.a1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$l, $x\\}], \\{$gradx\\}, \\{$grady\\};",
+ []>;
+def TEX_1D_ARRAY_S32_S32
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+ "tex.a1d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$l, $x\\}];",
+ []>;
+def TEX_1D_ARRAY_S32_F32
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x),
+ "tex.a1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$l, $x\\}];",
+ []>;
+def TEX_1D_ARRAY_S32_F32_LEVEL
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
+ Float32Regs:$lod),
+ "tex.level.a1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$l, $x\\}], $lod;",
+ []>;
+def TEX_1D_ARRAY_S32_F32_GRAD
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
+ Float32Regs:$gradx, Float32Regs:$grady),
+ "tex.grad.a1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$l, $x\\}], \\{$gradx\\}, \\{$grady\\};",
+ []>;
+def TEX_1D_ARRAY_U32_S32
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+ "tex.a1d.v4.u32.s32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$l, $x\\}];",
+ []>;
+def TEX_1D_ARRAY_U32_F32
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x),
+ "tex.a1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$l, $x\\}];",
+ []>;
+def TEX_1D_ARRAY_U32_F32_LEVEL
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
+ Float32Regs:$lod),
+ "tex.level.a1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$l, $x\\}], $lod;",
+ []>;
+def TEX_1D_ARRAY_U32_F32_GRAD
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
+ Float32Regs:$gradx, Float32Regs:$grady),
+ "tex.grad.a1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$l, $x\\}], \\{$gradx\\}, \\{$grady\\};",
+ []>;
+
+def TEX_2D_F32_S32
+ : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+ Float32Regs:$b, Float32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+ "tex.2d.v4.f32.s32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$x, $y\\}];",
+ []>;
+def TEX_2D_F32_F32
+ : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+ Float32Regs:$b, Float32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
+ "tex.2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$x, $y\\}];",
+ []>;
+def TEX_2D_F32_F32_LEVEL
+ : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+ Float32Regs:$b, Float32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
+ Float32Regs:$lod),
+ "tex.level.2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$x, $y\\}], $lod;",
+ []>;
+def TEX_2D_F32_F32_GRAD
+ : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+ Float32Regs:$b, Float32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
+ Float32Regs:$gradx0, Float32Regs:$gradx1,
+ Float32Regs:$grady0, Float32Regs:$grady1),
+ "tex.grad.2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$x, $y\\}], \\{$gradx0, $gradx1\\}, "
+ "\\{$grady0, $grady1\\};",
+ []>;
+def TEX_2D_S32_S32
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+ "tex.2d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$x, $y\\}];",
+ []>;
+def TEX_2D_S32_F32
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
+ "tex.2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$x, $y\\}];",
+ []>;
+def TEX_2D_S32_F32_LEVEL
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
+ Float32Regs:$lod),
+ "tex.level.2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$x, $y\\}], $lod;",
+ []>;
+def TEX_2D_S32_F32_GRAD
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
+ Float32Regs:$gradx0, Float32Regs:$gradx1,
+ Float32Regs:$grady0, Float32Regs:$grady1),
+ "tex.grad.2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$x, $y\\}], \\{$gradx0, $gradx1\\}, "
+ "\\{$grady0, $grady1\\};",
+ []>;
+def TEX_2D_U32_S32
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+ "tex.2d.v4.u32.s32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$x, $y\\}];",
+ []>;
+def TEX_2D_U32_F32
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
+ "tex.2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$x, $y\\}];",
+ []>;
+def TEX_2D_U32_F32_LEVEL
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
+ Float32Regs:$lod),
+ "tex.level.2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$x, $y\\}], $lod;",
+ []>;
+def TEX_2D_U32_F32_GRAD
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
+ Float32Regs:$gradx0, Float32Regs:$gradx1,
+ Float32Regs:$grady0, Float32Regs:$grady1),
+ "tex.grad.2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$x, $y\\}], \\{$gradx0, $gradx1\\}, "
+ "\\{$grady0, $grady1\\};",
+ []>;
+
+def TEX_2D_ARRAY_F32_S32
+ : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+ Float32Regs:$b, Float32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int32Regs:$y),
+ "tex.a2d.v4.f32.s32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$l, $x, $y, $y\\}];",
+ []>;
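+// The coordinate vector above repeats $y on purpose: for the a2d geometry the
+// PTX coordinate operand must have four elements and the fourth is ignored,
+// so the last coordinate is duplicated as padding. The 3d and cube forms
+// below do the same with $z.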
+def TEX_2D_ARRAY_F32_F32
+ : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+ Float32Regs:$b, Float32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
+ Float32Regs:$y),
+ "tex.a2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$l, $x, $y, $y\\}];",
+ []>;
+def TEX_2D_ARRAY_F32_F32_LEVEL
+ : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+ Float32Regs:$b, Float32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
+ Float32Regs:$y, Float32Regs:$lod),
+ "tex.level.a2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$l, $x, $y, $y\\}], $lod;",
+ []>;
+def TEX_2D_ARRAY_F32_F32_GRAD
+ : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+ Float32Regs:$b, Float32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
+ Float32Regs:$y, Float32Regs:$gradx0, Float32Regs:$gradx1,
+ Float32Regs:$grady0, Float32Regs:$grady1),
+ "tex.grad.a2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$l, $x, $y, $y\\}], \\{$gradx0, $gradx1\\}, "
+ "\\{$grady0, $grady1\\};",
+ []>;
+def TEX_2D_ARRAY_S32_S32
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int32Regs:$y),
+ "tex.a2d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$l, $x, $y, $y\\}];",
+ []>;
+def TEX_2D_ARRAY_S32_F32
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
+ Float32Regs:$y),
+ "tex.a2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$l, $x, $y, $y\\}];",
+ []>;
+def TEX_2D_ARRAY_S32_F32_LEVEL
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
+ Float32Regs:$y, Float32Regs:$lod),
+ "tex.level.a2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$l, $x, $y, $y\\}], $lod;",
+ []>;
+def TEX_2D_ARRAY_S32_F32_GRAD
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
+ Float32Regs:$y,
+ Float32Regs:$gradx0, Float32Regs:$gradx1,
+ Float32Regs:$grady0, Float32Regs:$grady1),
+ "tex.grad.a2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$l, $x, $y, $y\\}], \\{$gradx0, $gradx1\\}, "
+ "\\{$grady0, $grady1\\};",
+ []>;
+def TEX_2D_ARRAY_U32_S32
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int32Regs:$y),
+ "tex.a2d.v4.u32.s32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$l, $x, $y, $y\\}];",
+ []>;
+def TEX_2D_ARRAY_U32_F32
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
+ Float32Regs:$y),
+ "tex.a2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$l, $x, $y, $y\\}];",
+ []>;
+def TEX_2D_ARRAY_U32_F32_LEVEL
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
+ Float32Regs:$y, Float32Regs:$lod),
+ "tex.level.a2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$l, $x, $y, $y\\}], $lod;",
+ []>;
+def TEX_2D_ARRAY_U32_F32_GRAD
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
+ Float32Regs:$y,
+ Float32Regs:$gradx0, Float32Regs:$gradx1,
+ Float32Regs:$grady0, Float32Regs:$grady1),
+ "tex.grad.a2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$l, $x, $y, $y\\}], \\{$gradx0, $gradx1\\}, "
+ "\\{$grady0, $grady1\\};",
+ []>;
+
+def TEX_3D_F32_S32
+ : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+ Float32Regs:$b, Float32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int32Regs:$z),
+ "tex.3d.v4.f32.s32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$x, $y, $z, $z\\}];",
+ []>;
+def TEX_3D_F32_F32
+ : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+ Float32Regs:$b, Float32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
+ Float32Regs:$z),
+ "tex.3d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$x, $y, $z, $z\\}];",
+ []>;
+def TEX_3D_F32_F32_LEVEL
+ : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+ Float32Regs:$b, Float32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
+ Float32Regs:$z, Float32Regs:$lod),
+ "tex.level.3d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$x, $y, $z, $z\\}], $lod;",
+ []>;
+def TEX_3D_F32_F32_GRAD
+ : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+ Float32Regs:$b, Float32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
+ Float32Regs:$z,
+ Float32Regs:$gradx0, Float32Regs:$gradx1,
+ Float32Regs:$gradx2, Float32Regs:$grady0,
+ Float32Regs:$grady1, Float32Regs:$grady2),
+ "tex.grad.3d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$x, $y, $z, $z\\}], "
+ "\\{$gradx0, $gradx1, $gradx2, $gradx2\\}, "
+ "\\{$grady0, $grady1, $grady2, $grady2\\};",
+ []>;
+def TEX_3D_S32_S32
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int32Regs:$z),
+ "tex.3d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$x, $y, $z, $z\\}];",
+ []>;
+def TEX_3D_S32_F32
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
+ Float32Regs:$z),
+ "tex.3d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$x, $y, $z, $z\\}];",
+ []>;
+def TEX_3D_S32_F32_LEVEL
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
+ Float32Regs:$z, Float32Regs:$lod),
+ "tex.level.3d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$x, $y, $z, $z\\}], $lod;",
+ []>;
+def TEX_3D_S32_F32_GRAD
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
+ Float32Regs:$z,
+ Float32Regs:$gradx0, Float32Regs:$gradx1,
+ Float32Regs:$gradx2, Float32Regs:$grady0,
+ Float32Regs:$grady1, Float32Regs:$grady2),
+ "tex.grad.3d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$x, $y, $z, $z\\}], "
+ "\\{$gradx0, $gradx1, $gradx2, $gradx2\\}, "
+ "\\{$grady0, $grady1, $grady2, $grady2\\};",
+ []>;
+def TEX_3D_U32_S32
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int32Regs:$z),
+ "tex.3d.v4.u32.s32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$x, $y, $z, $z\\}];",
+ []>;
+def TEX_3D_U32_F32
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
+ Float32Regs:$z),
+ "tex.3d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$x, $y, $z, $z\\}];",
+ []>;
+def TEX_3D_U32_F32_LEVEL
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
+ Float32Regs:$z, Float32Regs:$lod),
+ "tex.level.3d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$x, $y, $z, $z\\}], $lod;",
+ []>;
+def TEX_3D_U32_F32_GRAD
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
+ Float32Regs:$z,
+ Float32Regs:$gradx0, Float32Regs:$gradx1,
+ Float32Regs:$gradx2, Float32Regs:$grady0,
+ Float32Regs:$grady1, Float32Regs:$grady2),
+ "tex.grad.3d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$x, $y, $z, $z\\}], "
+ "\\{$gradx0, $gradx1, $gradx2, $gradx2\\}, "
+ "\\{$grady0, $grady1, $grady2, $grady2\\};",
+ []>;
+
+def TEX_CUBE_F32_F32
+ : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+ Float32Regs:$b, Float32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s,
+ Float32Regs:$x, Float32Regs:$y, Float32Regs:$z),
+ "tex.cube.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$x, $y, $z, $z\\}];",
+ []>;
+def TEX_CUBE_F32_F32_LEVEL
+ : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+ Float32Regs:$b, Float32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s,
+ Float32Regs:$x, Float32Regs:$y, Float32Regs:$z,
+ Float32Regs:$lod),
+ "tex.level.cube.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$x, $y, $z, $z\\}], $lod;",
+ []>;
+def TEX_CUBE_S32_F32
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s,
+ Float32Regs:$x, Float32Regs:$y, Float32Regs:$z),
+ "tex.cube.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$x, $y, $z, $z\\}];",
+ []>;
+def TEX_CUBE_S32_F32_LEVEL
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s,
+ Float32Regs:$x, Float32Regs:$y, Float32Regs:$z,
+ Float32Regs:$lod),
+ "tex.level.cube.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$x, $y, $z, $z\\}], $lod;",
+ []>;
+def TEX_CUBE_U32_F32
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s,
+ Float32Regs:$x, Float32Regs:$y, Float32Regs:$z),
+ "tex.cube.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$x, $y, $z, $z\\}];",
+ []>;
+def TEX_CUBE_U32_F32_LEVEL
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s,
+ Float32Regs:$x, Float32Regs:$y, Float32Regs:$z,
+ Float32Regs:$lod),
+ "tex.level.cube.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$x, $y, $z, $z\\}], $lod;",
+ []>;
+
+def TEX_CUBE_ARRAY_F32_F32
+ : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+ Float32Regs:$b, Float32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l,
+ Float32Regs:$x, Float32Regs:$y, Float32Regs:$z),
+ "tex.acube.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$l, $x, $y, $z\\}];",
+ []>;
+def TEX_CUBE_ARRAY_F32_F32_LEVEL
+ : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+ Float32Regs:$b, Float32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l,
+ Float32Regs:$x, Float32Regs:$y, Float32Regs:$z,
+ Float32Regs:$lod),
+ "tex.level.acube.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$l, $x, $y, $z\\}], $lod;",
+ []>;
+def TEX_CUBE_ARRAY_S32_F32
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l,
+ Float32Regs:$x, Float32Regs:$y, Float32Regs:$z),
+ "tex.acube.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$l, $x, $y, $z\\}];",
+ []>;
+def TEX_CUBE_ARRAY_S32_F32_LEVEL
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l,
+ Float32Regs:$x, Float32Regs:$y, Float32Regs:$z,
+ Float32Regs:$lod),
+ "tex.level.acube.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$l, $x, $y, $z\\}], $lod;",
+ []>;
+def TEX_CUBE_ARRAY_U32_F32
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l,
+ Float32Regs:$x, Float32Regs:$y, Float32Regs:$z),
+ "tex.acube.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$l, $x, $y, $z\\}];",
+ []>;
+def TEX_CUBE_ARRAY_U32_F32_LEVEL
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l,
+ Float32Regs:$x, Float32Regs:$y, Float32Regs:$z,
+ Float32Regs:$lod),
+ "tex.level.acube.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, $s, \\{$l, $x, $y, $z\\}], $lod;",
+ []>;
+
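+// The tld4.{r,g,b,a} instructions below are texture gathers: rather than a
+// filtered result, each returns the chosen component (r, g, b or a) of the
+// four texels that bilinear filtering would have read.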
+def TLD4_R_2D_F32_F32
+ : NVPTXInst<(outs Float32Regs:$v0, Float32Regs:$v1,
+ Float32Regs:$v2, Float32Regs:$v3),
+ (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
+ "tld4.r.2d.v4.f32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+ "[$t, $s, \\{$x, $y\\}];",
+ []>;
+def TLD4_G_2D_F32_F32
+ : NVPTXInst<(outs Float32Regs:$v0, Float32Regs:$v1,
+ Float32Regs:$v2, Float32Regs:$v3),
+ (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
+ "tld4.g.2d.v4.f32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+ "[$t, $s, \\{$x, $y\\}];",
+ []>;
+def TLD4_B_2D_F32_F32
+ : NVPTXInst<(outs Float32Regs:$v0, Float32Regs:$v1,
+ Float32Regs:$v2, Float32Regs:$v3),
+ (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
+ "tld4.b.2d.v4.f32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+ "[$t, $s, \\{$x, $y\\}];",
+ []>;
+def TLD4_A_2D_F32_F32
+ : NVPTXInst<(outs Float32Regs:$v0, Float32Regs:$v1,
+ Float32Regs:$v2, Float32Regs:$v3),
+ (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
+ "tld4.a.2d.v4.f32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+ "[$t, $s, \\{$x, $y\\}];",
+ []>;
+def TLD4_R_2D_S32_F32
+ : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
+ Int32Regs:$v2, Int32Regs:$v3),
+ (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
+ "tld4.r.2d.v4.s32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+ "[$t, $s, \\{$x, $y\\}];",
+ []>;
+def TLD4_G_2D_S32_F32
+ : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
+ Int32Regs:$v2, Int32Regs:$v3),
+ (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
+ "tld4.g.2d.v4.s32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+ "[$t, $s, \\{$x, $y\\}];",
+ []>;
+def TLD4_B_2D_S32_F32
+ : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
+ Int32Regs:$v2, Int32Regs:$v3),
+ (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
+ "tld4.b.2d.v4.s32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+ "[$t, $s, \\{$x, $y\\}];",
+ []>;
+def TLD4_A_2D_S32_F32
+ : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
+ Int32Regs:$v2, Int32Regs:$v3),
+ (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
+ "tld4.a.2d.v4.s32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+ "[$t, $s, \\{$x, $y\\}];",
+ []>;
+def TLD4_R_2D_U32_F32
+ : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
+ Int32Regs:$v2, Int32Regs:$v3),
+ (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
+ "tld4.r.2d.v4.u32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+ "[$t, $s, \\{$x, $y\\}];",
+ []>;
+def TLD4_G_2D_U32_F32
+ : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
+ Int32Regs:$v2, Int32Regs:$v3),
+ (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
+ "tld4.g.2d.v4.u32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+ "[$t, $s, \\{$x, $y\\}];",
+ []>;
+def TLD4_B_2D_U32_F32
+ : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
+ Int32Regs:$v2, Int32Regs:$v3),
+ (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
+ "tld4.b.2d.v4.u32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+ "[$t, $s, \\{$x, $y\\}];",
+ []>;
+def TLD4_A_2D_U32_F32
+ : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
+ Int32Regs:$v2, Int32Regs:$v3),
+ (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
+ "tld4.a.2d.v4.u32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+ "[$t, $s, \\{$x, $y\\}];",
+ []>;
+}
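+// For comparison, the unified-mode variants that follow drop the sampler
+// operand; with example registers the two forms print roughly as:
+//   tex.1d.v4.f32.f32 {%f1, %f2, %f3, %f4}, [%rd1, %rd2, {%f5}];  // independent
+//   tex.1d.v4.f32.f32 {%f1, %f2, %f3, %f4}, [%rd1, {%f5}];        // unified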
+
+
+// texmode_unified: sampler state is taken from the texture handle itself, so
+// these fetches take only the texture handle ($t) and no separate sampler.
+let IsTex = 1, IsTexModeUnified = 1 in {
+// Texture fetch instructions using handles
+def TEX_UNIFIED_1D_F32_S32
+ : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+ Float32Regs:$b, Float32Regs:$a),
+ (ins Int64Regs:$t, Int32Regs:$x),
+ "tex.1d.v4.f32.s32\t\\{$r, $g, $b, $a\\}, [$t, \\{$x\\}];",
+ []>;
+def TEX_UNIFIED_1D_F32_F32
+ : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+ Float32Regs:$b, Float32Regs:$a),
+ (ins Int64Regs:$t, Float32Regs:$x),
+ "tex.1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, [$t, \\{$x\\}];",
+ []>;
+def TEX_UNIFIED_1D_F32_F32_LEVEL
+ : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+ Float32Regs:$b, Float32Regs:$a),
+ (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$lod),
+ "tex.level.1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$x\\}], $lod;",
+ []>;
+def TEX_UNIFIED_1D_F32_F32_GRAD
+ : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+ Float32Regs:$b, Float32Regs:$a),
+ (ins Int64Regs:$t, Float32Regs:$x,
+ Float32Regs:$gradx, Float32Regs:$grady),
+ "tex.grad.1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$x\\}], \\{$gradx\\}, \\{$grady\\};",
+ []>;
+def TEX_UNIFIED_1D_S32_S32
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int32Regs:$x),
+ "tex.1d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, [$t, \\{$x\\}];",
+ []>;
+def TEX_UNIFIED_1D_S32_F32
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Float32Regs:$x),
+ "tex.1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, [$t, \\{$x\\}];",
+ []>;
+def TEX_UNIFIED_1D_S32_F32_LEVEL
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Float32Regs:$x,
+ Float32Regs:$lod),
+ "tex.level.1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$x\\}], $lod;",
+ []>;
+def TEX_UNIFIED_1D_S32_F32_GRAD
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Float32Regs:$x,
+ Float32Regs:$gradx, Float32Regs:$grady),
+ "tex.grad.1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$x\\}], \\{$gradx\\}, \\{$grady\\};",
+ []>;
+def TEX_UNIFIED_1D_U32_S32
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int32Regs:$x),
+ "tex.1d.v4.u32.s32\t\\{$r, $g, $b, $a\\}, [$t, \\{$x\\}];",
+ []>;
+def TEX_UNIFIED_1D_U32_F32
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Float32Regs:$x),
+ "tex.1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, [$t, \\{$x\\}];",
+ []>;
+def TEX_UNIFIED_1D_U32_F32_LEVEL
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Float32Regs:$x,
+ Float32Regs:$lod),
+ "tex.level.1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$x\\}], $lod;",
+ []>;
+def TEX_UNIFIED_1D_U32_F32_GRAD
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Float32Regs:$x,
+ Float32Regs:$gradx, Float32Regs:$grady),
+ "tex.grad.1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$x\\}], \\{$gradx\\}, \\{$grady\\};",
+ []>;
+
+def TEX_UNIFIED_1D_ARRAY_F32_S32
+ : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+ Float32Regs:$b, Float32Regs:$a),
+ (ins Int64Regs:$t, Int32Regs:$l, Int32Regs:$x),
+ "tex.a1d.v4.f32.s32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$l, $x\\}];",
+ []>;
+def TEX_UNIFIED_1D_ARRAY_F32_F32
+ : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+ Float32Regs:$b, Float32Regs:$a),
+ (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x),
+ "tex.a1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$l, $x\\}];",
+ []>;
+def TEX_UNIFIED_1D_ARRAY_F32_F32_LEVEL
+ : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+ Float32Regs:$b, Float32Regs:$a),
+ (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
+ Float32Regs:$lod),
+ "tex.level.a1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$l, $x\\}], $lod;",
+ []>;
+def TEX_UNIFIED_1D_ARRAY_F32_F32_GRAD
+ : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+ Float32Regs:$b, Float32Regs:$a),
+ (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
+ Float32Regs:$gradx, Float32Regs:$grady),
+ "tex.grad.a1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$l, $x\\}], \\{$gradx\\}, \\{$grady\\};",
+ []>;
+def TEX_UNIFIED_1D_ARRAY_S32_S32
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int32Regs:$l, Int32Regs:$x),
+ "tex.a1d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$l, $x\\}];",
+ []>;
+def TEX_UNIFIED_1D_ARRAY_S32_F32
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x),
+ "tex.a1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$l, $x\\}];",
+ []>;
+def TEX_UNIFIED_1D_ARRAY_S32_F32_LEVEL
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
+ Float32Regs:$lod),
+ "tex.level.a1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$l, $x\\}], $lod;",
+ []>;
+def TEX_UNIFIED_1D_ARRAY_S32_F32_GRAD
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
+ Float32Regs:$gradx, Float32Regs:$grady),
+ "tex.grad.a1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$l, $x\\}], \\{$gradx\\}, \\{$grady\\};",
+ []>;
+def TEX_UNIFIED_1D_ARRAY_U32_S32
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int32Regs:$l, Int32Regs:$x),
+ "tex.a1d.v4.u32.s32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$l, $x\\}];",
+ []>;
+def TEX_UNIFIED_1D_ARRAY_U32_F32
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x),
+ "tex.a1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$l, $x\\}];",
+ []>;
+def TEX_UNIFIED_1D_ARRAY_U32_F32_LEVEL
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
+ Float32Regs:$lod),
+ "tex.level.a1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$l, $x\\}], $lod;",
+ []>;
+def TEX_UNIFIED_1D_ARRAY_U32_F32_GRAD
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
+ Float32Regs:$gradx, Float32Regs:$grady),
+ "tex.grad.a1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$l, $x\\}], \\{$gradx\\}, \\{$grady\\};",
+ []>;
+
+def TEX_UNIFIED_2D_F32_S32
+ : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+ Float32Regs:$b, Float32Regs:$a),
+ (ins Int64Regs:$t, Int32Regs:$x, Int32Regs:$y),
+ "tex.2d.v4.f32.s32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$x, $y\\}];",
+ []>;
+def TEX_UNIFIED_2D_F32_F32
+ : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+ Float32Regs:$b, Float32Regs:$a),
+ (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
+ "tex.2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$x, $y\\}];",
+ []>;
+def TEX_UNIFIED_2D_F32_F32_LEVEL
+ : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+ Float32Regs:$b, Float32Regs:$a),
+ (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
+ Float32Regs:$lod),
+ "tex.level.2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$x, $y\\}], $lod;",
+ []>;
+def TEX_UNIFIED_2D_F32_F32_GRAD
+ : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+ Float32Regs:$b, Float32Regs:$a),
+ (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
+ Float32Regs:$gradx0, Float32Regs:$gradx1,
+ Float32Regs:$grady0, Float32Regs:$grady1),
+ "tex.grad.2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$x, $y\\}], \\{$gradx0, $gradx1\\}, "
+ "\\{$grady0, $grady1\\};",
+ []>;
+def TEX_UNIFIED_2D_S32_S32
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int32Regs:$x, Int32Regs:$y),
+ "tex.2d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$x, $y\\}];",
+ []>;
+def TEX_UNIFIED_2D_S32_F32
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
+ "tex.2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$x, $y\\}];",
+ []>;
+def TEX_UNIFIED_2D_S32_F32_LEVEL
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
+ Float32Regs:$lod),
+ "tex.level.2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$x, $y\\}], $lod;",
+ []>;
+def TEX_UNIFIED_2D_S32_F32_GRAD
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
+ Float32Regs:$gradx0, Float32Regs:$gradx1,
+ Float32Regs:$grady0, Float32Regs:$grady1),
+ "tex.grad.2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$x, $y\\}], \\{$gradx0, $gradx1\\}, "
+ "\\{$grady0, $grady1\\};",
+ []>;
+def TEX_UNIFIED_2D_U32_S32
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int32Regs:$x, Int32Regs:$y),
+ "tex.2d.v4.u32.s32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$x, $y\\}];",
+ []>;
+def TEX_UNIFIED_2D_U32_F32
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
+ "tex.2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$x, $y\\}];",
+ []>;
+def TEX_UNIFIED_2D_U32_F32_LEVEL
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
+ Float32Regs:$lod),
+ "tex.level.2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$x, $y\\}], $lod;",
+ []>;
+def TEX_UNIFIED_2D_U32_F32_GRAD
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
+ Float32Regs:$gradx0, Float32Regs:$gradx1,
+ Float32Regs:$grady0, Float32Regs:$grady1),
+ "tex.grad.2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$x, $y\\}], \\{$gradx0, $gradx1\\}, "
+ "\\{$grady0, $grady1\\};",
+ []>;
+
+def TEX_UNIFIED_2D_ARRAY_F32_S32
+ : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+ Float32Regs:$b, Float32Regs:$a),
+ (ins Int64Regs:$t, Int32Regs:$l, Int32Regs:$x,
+ Int32Regs:$y),
+ "tex.a2d.v4.f32.s32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$l, $x, $y, $y\\}];",
+ []>;
+def TEX_UNIFIED_2D_ARRAY_F32_F32
+ : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+ Float32Regs:$b, Float32Regs:$a),
+ (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
+ Float32Regs:$y),
+ "tex.a2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$l, $x, $y, $y\\}];",
+ []>;
+def TEX_UNIFIED_2D_ARRAY_F32_F32_LEVEL
+ : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+ Float32Regs:$b, Float32Regs:$a),
+ (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
+ Float32Regs:$y, Float32Regs:$lod),
+ "tex.level.a2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$l, $x, $y, $y\\}], $lod;",
+ []>;
+def TEX_UNIFIED_2D_ARRAY_F32_F32_GRAD
+ : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+ Float32Regs:$b, Float32Regs:$a),
+ (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
+ Float32Regs:$y, Float32Regs:$gradx0, Float32Regs:$gradx1,
+ Float32Regs:$grady0, Float32Regs:$grady1),
+ "tex.grad.a2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$l, $x, $y, $y\\}], \\{$gradx0, $gradx1\\}, "
+ "\\{$grady0, $grady1\\};",
+ []>;
+def TEX_UNIFIED_2D_ARRAY_S32_S32
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int32Regs:$l, Int32Regs:$x,
+ Int32Regs:$y),
+ "tex.a2d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$l, $x, $y, $y\\}];",
+ []>;
+def TEX_UNIFIED_2D_ARRAY_S32_F32
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
+ Float32Regs:$y),
+ "tex.a2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$l, $x, $y, $y\\}];",
+ []>;
+def TEX_UNIFIED_2D_ARRAY_S32_F32_LEVEL
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
+ Float32Regs:$y, Float32Regs:$lod),
+ "tex.level.a2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$l, $x, $y, $y\\}], $lod;",
+ []>;
+def TEX_UNIFIED_2D_ARRAY_S32_F32_GRAD
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
+ Float32Regs:$y,
+ Float32Regs:$gradx0, Float32Regs:$gradx1,
+ Float32Regs:$grady0, Float32Regs:$grady1),
+ "tex.grad.a2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$l, $x, $y, $y\\}], \\{$gradx0, $gradx1\\}, "
+ "\\{$grady0, $grady1\\};",
+ []>;
+def TEX_UNIFIED_2D_ARRAY_U32_S32
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int32Regs:$l, Int32Regs:$x,
+ Int32Regs:$y),
+ "tex.a2d.v4.u32.s32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$l, $x, $y, $y\\}];",
+ []>;
+def TEX_UNIFIED_2D_ARRAY_U32_F32
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
+ Float32Regs:$y),
+ "tex.a2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$l, $x, $y, $y\\}];",
+ []>;
+def TEX_UNIFIED_2D_ARRAY_U32_F32_LEVEL
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
+ Float32Regs:$y, Float32Regs:$lod),
+ "tex.level.a2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$l, $x, $y, $y\\}], $lod;",
+ []>;
+def TEX_UNIFIED_2D_ARRAY_U32_F32_GRAD
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
+ Float32Regs:$y,
+ Float32Regs:$gradx0, Float32Regs:$gradx1,
+ Float32Regs:$grady0, Float32Regs:$grady1),
+ "tex.grad.a2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$l, $x, $y, $y\\}], \\{$gradx0, $gradx1\\}, "
+ "\\{$grady0, $grady1\\};",
+ []>;
+
+def TEX_UNIFIED_3D_F32_S32
+ : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+ Float32Regs:$b, Float32Regs:$a),
+ (ins Int64Regs:$t, Int32Regs:$x, Int32Regs:$y,
+ Int32Regs:$z),
+ "tex.3d.v4.f32.s32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$x, $y, $z, $z\\}];",
+ []>;
+def TEX_UNIFIED_3D_F32_F32
+ : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+ Float32Regs:$b, Float32Regs:$a),
+ (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
+ Float32Regs:$z),
+ "tex.3d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$x, $y, $z, $z\\}];",
+ []>;
+def TEX_UNIFIED_3D_F32_F32_LEVEL
+ : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+ Float32Regs:$b, Float32Regs:$a),
+ (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
+ Float32Regs:$z, Float32Regs:$lod),
+ "tex.level.3d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$x, $y, $z, $z\\}], $lod;",
+ []>;
+def TEX_UNIFIED_3D_F32_F32_GRAD
+ : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+ Float32Regs:$b, Float32Regs:$a),
+ (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
+ Float32Regs:$z,
+ Float32Regs:$gradx0, Float32Regs:$gradx1,
+ Float32Regs:$gradx2, Float32Regs:$grady0,
+ Float32Regs:$grady1, Float32Regs:$grady2),
+ "tex.grad.3d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$x, $y, $z, $z\\}], "
+ "\\{$gradx0, $gradx1, $gradx2, $gradx2\\}, "
+ "\\{$grady0, $grady1, $grady2, $grady2\\};",
+ []>;
+def TEX_UNIFIED_3D_S32_S32
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int32Regs:$x, Int32Regs:$y,
+ Int32Regs:$z),
+ "tex.3d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$x, $y, $z, $z\\}];",
+ []>;
+def TEX_UNIFIED_3D_S32_F32
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
+ Float32Regs:$z),
+ "tex.3d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$x, $y, $z, $z\\}];",
+ []>;
+def TEX_UNIFIED_3D_S32_F32_LEVEL
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
+ Float32Regs:$z, Float32Regs:$lod),
+ "tex.level.3d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$x, $y, $z, $z\\}], $lod;",
+ []>;
+def TEX_UNIFIED_3D_S32_F32_GRAD
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
+ Float32Regs:$z,
+ Float32Regs:$gradx0, Float32Regs:$gradx1,
+ Float32Regs:$gradx2, Float32Regs:$grady0,
+ Float32Regs:$grady1, Float32Regs:$grady2),
+ "tex.grad.3d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$x, $y, $z, $z\\}], "
+ "\\{$gradx0, $gradx1, $gradx2, $gradx2\\}, "
+ "\\{$grady0, $grady1, $grady2, $grady2\\};",
+ []>;
+def TEX_UNIFIED_3D_U32_S32
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int32Regs:$x, Int32Regs:$y,
+ Int32Regs:$z),
+ "tex.3d.v4.u32.s32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$x, $y, $z, $z\\}];",
+ []>;
+def TEX_UNIFIED_3D_U32_F32
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
+ Float32Regs:$z),
+ "tex.3d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$x, $y, $z, $z\\}];",
+ []>;
+def TEX_UNIFIED_3D_U32_F32_LEVEL
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
+ Float32Regs:$z, Float32Regs:$lod),
+ "tex.level.3d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$x, $y, $z, $z\\}], $lod;",
+ []>;
+def TEX_UNIFIED_3D_U32_F32_GRAD
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
+ Float32Regs:$z,
+ Float32Regs:$gradx0, Float32Regs:$gradx1,
+ Float32Regs:$gradx2, Float32Regs:$grady0,
+ Float32Regs:$grady1, Float32Regs:$grady2),
+ "tex.grad.3d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$x, $y, $z, $z\\}], "
+ "\\{$gradx0, $gradx1, $gradx2, $gradx2\\}, "
+ "\\{$grady0, $grady1, $grady2, $grady2\\};",
+ []>;
+
+def TEX_UNIFIED_CUBE_F32_F32
+ : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+ Float32Regs:$b, Float32Regs:$a),
+ (ins Int64Regs:$t,
+ Float32Regs:$x, Float32Regs:$y, Float32Regs:$z),
+ "tex.cube.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$x, $y, $z, $z\\}];",
+ []>;
+def TEX_UNIFIED_CUBE_F32_F32_LEVEL
+ : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+ Float32Regs:$b, Float32Regs:$a),
+ (ins Int64Regs:$t,
+ Float32Regs:$x, Float32Regs:$y, Float32Regs:$z,
+ Float32Regs:$lod),
+ "tex.level.cube.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$x, $y, $z, $z\\}], $lod;",
+ []>;
+def TEX_UNIFIED_CUBE_S32_F32
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t,
+ Float32Regs:$x, Float32Regs:$y, Float32Regs:$z),
+ "tex.cube.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$x, $y, $z, $z\\}];",
+ []>;
+def TEX_UNIFIED_CUBE_S32_F32_LEVEL
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t,
+ Float32Regs:$x, Float32Regs:$y, Float32Regs:$z,
+ Float32Regs:$lod),
+ "tex.level.cube.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$x, $y, $z, $z\\}], $lod;",
+ []>;
+def TEX_UNIFIED_CUBE_U32_F32
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t,
+ Float32Regs:$x, Float32Regs:$y, Float32Regs:$z),
+ "tex.cube.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$x, $y, $z, $z\\}];",
+ []>;
+def TEX_UNIFIED_CUBE_U32_F32_LEVEL
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t,
+ Float32Regs:$x, Float32Regs:$y, Float32Regs:$z,
+ Float32Regs:$lod),
+ "tex.level.cube.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$x, $y, $z, $z\\}], $lod;",
+ []>;
+
+def TEX_UNIFIED_CUBE_ARRAY_F32_F32
+ : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+ Float32Regs:$b, Float32Regs:$a),
+ (ins Int64Regs:$t, Int32Regs:$l,
+ Float32Regs:$x, Float32Regs:$y, Float32Regs:$z),
+ "tex.acube.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$l, $x, $y, $z\\}];",
+ []>;
+def TEX_UNIFIED_CUBE_ARRAY_F32_F32_LEVEL
+ : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+ Float32Regs:$b, Float32Regs:$a),
+ (ins Int64Regs:$t, Int32Regs:$l,
+ Float32Regs:$x, Float32Regs:$y, Float32Regs:$z,
+ Float32Regs:$lod),
+ "tex.level.acube.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$l, $x, $y, $z\\}], $lod;",
+ []>;
+def TEX_UNIFIED_CUBE_ARRAY_S32_F32
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int32Regs:$l,
+ Float32Regs:$x, Float32Regs:$y, Float32Regs:$z),
+ "tex.acube.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$l, $x, $y, $z\\}];",
+ []>;
+def TEX_UNIFIED_CUBE_ARRAY_S32_F32_LEVEL
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int32Regs:$l,
+ Float32Regs:$x, Float32Regs:$y, Float32Regs:$z,
+ Float32Regs:$lod),
+ "tex.level.acube.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$l, $x, $y, $z\\}], $lod;",
+ []>;
+def TEX_UNIFIED_CUBE_ARRAY_U32_F32
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int32Regs:$l,
+ Float32Regs:$x, Float32Regs:$y, Float32Regs:$z),
+ "tex.acube.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$l, $x, $y, $z\\}];",
+ []>;
+def TEX_UNIFIED_CUBE_ARRAY_U32_F32_LEVEL
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$t, Int32Regs:$l,
+ Float32Regs:$x, Float32Regs:$y, Float32Regs:$z,
+ Float32Regs:$lod),
+ "tex.level.acube.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "[$t, \\{$l, $x, $y, $z\\}], $lod;",
+ []>;
+
+def TLD4_UNIFIED_R_2D_F32_F32
+ : NVPTXInst<(outs Float32Regs:$v0, Float32Regs:$v1,
+ Float32Regs:$v2, Float32Regs:$v3),
+ (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
+ "tld4.r.2d.v4.f32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+ "[$t, \\{$x, $y\\}];",
+ []>;
+def TLD4_UNIFIED_G_2D_F32_F32
+ : NVPTXInst<(outs Float32Regs:$v0, Float32Regs:$v1,
+ Float32Regs:$v2, Float32Regs:$v3),
+ (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
+ "tld4.g.2d.v4.f32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+ "[$t, \\{$x, $y\\}];",
+ []>;
+def TLD4_UNIFIED_B_2D_F32_F32
+ : NVPTXInst<(outs Float32Regs:$v0, Float32Regs:$v1,
+ Float32Regs:$v2, Float32Regs:$v3),
+ (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
+ "tld4.b.2d.v4.f32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+ "[$t, \\{$x, $y\\}];",
+ []>;
+def TLD4_UNIFIED_A_2D_F32_F32
+ : NVPTXInst<(outs Float32Regs:$v0, Float32Regs:$v1,
+ Float32Regs:$v2, Float32Regs:$v3),
+ (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
+ "tld4.a.2d.v4.f32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+ "[$t, \\{$x, $y\\}];",
+ []>;
+def TLD4_UNIFIED_R_2D_S32_F32
+ : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
+ Int32Regs:$v2, Int32Regs:$v3),
+ (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
+ "tld4.r.2d.v4.s32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+ "[$t, \\{$x, $y\\}];",
+ []>;
+def TLD4_UNIFIED_G_2D_S32_F32
+ : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
+ Int32Regs:$v2, Int32Regs:$v3),
+ (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
+ "tld4.g.2d.v4.s32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+ "[$t, \\{$x, $y\\}];",
+ []>;
+def TLD4_UNIFIED_B_2D_S32_F32
+ : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
+ Int32Regs:$v2, Int32Regs:$v3),
+ (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
+ "tld4.b.2d.v4.s32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+ "[$t, \\{$x, $y\\}];",
+ []>;
+def TLD4_UNIFIED_A_2D_S32_F32
+ : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
+ Int32Regs:$v2, Int32Regs:$v3),
+ (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
+ "tld4.a.2d.v4.s32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+ "[$t, \\{$x, $y\\}];",
+ []>;
+def TLD4_UNIFIED_R_2D_U32_F32
+ : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
+ Int32Regs:$v2, Int32Regs:$v3),
+ (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
+ "tld4.r.2d.v4.u32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+ "[$t, \\{$x, $y\\}];",
+ []>;
+def TLD4_UNIFIED_G_2D_U32_F32
+ : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
+ Int32Regs:$v2, Int32Regs:$v3),
+ (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
+ "tld4.g.2d.v4.u32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+ "[$t, \\{$x, $y\\}];",
+ []>;
+def TLD4_UNIFIED_B_2D_U32_F32
+ : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
+ Int32Regs:$v2, Int32Regs:$v3),
+ (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
+ "tld4.b.2d.v4.u32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+ "[$t, \\{$x, $y\\}];",
+ []>;
+def TLD4_UNIFIED_A_2D_U32_F32
+ : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
+ Int32Regs:$v2, Int32Regs:$v3),
+ (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
+ "tld4.a.2d.v4.u32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+ "[$t, \\{$x, $y\\}];",
+ []>;
+}
+
+
+
+//=== Surface load instructions
+// .clamp variant (out-of-range coordinates are clamped to the surface bounds)
+let IsSuld = 1 in {
+def SULD_1D_I8_CLAMP
+ : NVPTXInst<(outs Int16Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$x),
+ "suld.b.1d.b8.clamp \\{$r\\}, [$s, \\{$x\\}];",
+ []>;
+def SULD_1D_I16_CLAMP
+ : NVPTXInst<(outs Int16Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$x),
+ "suld.b.1d.b16.clamp \\{$r\\}, [$s, \\{$x\\}];",
+ []>;
+def SULD_1D_I32_CLAMP
+ : NVPTXInst<(outs Int32Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$x),
+ "suld.b.1d.b32.clamp \\{$r\\}, [$s, \\{$x\\}];",
+ []>;
+def SULD_1D_I64_CLAMP
+ : NVPTXInst<(outs Int64Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$x),
+ "suld.b.1d.b64.clamp \\{$r\\}, [$s, \\{$x\\}];",
+ []>;
+
+def SULD_1D_ARRAY_I8_CLAMP
+ : NVPTXInst<(outs Int16Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+ "suld.b.a1d.b8.clamp \\{$r\\}, [$s, \\{$l, $x\\}];",
+ []>;
+def SULD_1D_ARRAY_I16_CLAMP
+ : NVPTXInst<(outs Int16Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+ "suld.b.a1d.b16.clamp \\{$r\\}, [$s, \\{$l, $x\\}];",
+ []>;
+def SULD_1D_ARRAY_I32_CLAMP
+ : NVPTXInst<(outs Int32Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+ "suld.b.a1d.b32.clamp \\{$r\\}, [$s, \\{$l, $x\\}];",
+ []>;
+def SULD_1D_ARRAY_I64_CLAMP
+ : NVPTXInst<(outs Int64Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+ "suld.b.a1d.b64.clamp \\{$r\\}, [$s, \\{$l, $x\\}];",
+ []>;
+
+def SULD_2D_I8_CLAMP
+ : NVPTXInst<(outs Int16Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.2d.b8.clamp \\{$r\\}, [$s, \\{$x, $y\\}];",
+ []>;
+def SULD_2D_I16_CLAMP
+ : NVPTXInst<(outs Int16Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.2d.b16.clamp \\{$r\\}, [$s, \\{$x, $y\\}];",
+ []>;
+def SULD_2D_I32_CLAMP
+ : NVPTXInst<(outs Int32Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.2d.b32.clamp \\{$r\\}, [$s, \\{$x, $y\\}];",
+ []>;
+def SULD_2D_I64_CLAMP
+ : NVPTXInst<(outs Int64Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.2d.b64.clamp \\{$r\\}, [$s, \\{$x, $y\\}];",
+ []>;
+
+def SULD_2D_ARRAY_I8_CLAMP
+ : NVPTXInst<(outs Int16Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.a2d.b8.clamp \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];",
+ []>;
+def SULD_2D_ARRAY_I16_CLAMP
+ : NVPTXInst<(outs Int16Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.a2d.b16.clamp \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];",
+ []>;
+def SULD_2D_ARRAY_I32_CLAMP
+ : NVPTXInst<(outs Int32Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.a2d.b32.clamp \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];",
+ []>;
+def SULD_2D_ARRAY_I64_CLAMP
+ : NVPTXInst<(outs Int64Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.a2d.b64.clamp \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];",
+ []>;
+
+def SULD_3D_I8_CLAMP
+ : NVPTXInst<(outs Int16Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+ "suld.b.3d.b8.clamp \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];",
+ []>;
+def SULD_3D_I16_CLAMP
+ : NVPTXInst<(outs Int16Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+ "suld.b.3d.b16.clamp \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];",
+ []>;
+def SULD_3D_I32_CLAMP
+ : NVPTXInst<(outs Int32Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+ "suld.b.3d.b32.clamp \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];",
+ []>;
+def SULD_3D_I64_CLAMP
+ : NVPTXInst<(outs Int64Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+ "suld.b.3d.b64.clamp \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];",
+ []>;
+}
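+// As with the texture fetches, none of these carry TableGen selection
+// patterns. With example registers, the scalar 1d form above prints roughly:
+//   suld.b.1d.b32.clamp {%r1}, [%rd1, {%r2}];
+// The IsSuld value tracks the width of the load: 1 for the scalar forms
+// above, 2 and 3 for the v2 and v4 groups that follow.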
+
+let IsSuld = 2 in {
+def SULD_1D_V2I8_CLAMP
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$x),
+ "suld.b.1d.v2.b8.clamp \\{$r, $g\\}, [$s, \\{$x\\}];",
+ []>;
+def SULD_1D_V2I16_CLAMP
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$x),
+ "suld.b.1d.v2.b16.clamp \\{$r, $g\\}, [$s, \\{$x\\}];",
+ []>;
+def SULD_1D_V2I32_CLAMP
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$x),
+ "suld.b.1d.v2.b32.clamp \\{$r, $g\\}, [$s, \\{$x\\}];",
+ []>;
+def SULD_1D_V2I64_CLAMP
+ : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$x),
+ "suld.b.1d.v2.b64.clamp \\{$r, $g\\}, [$s, \\{$x\\}];",
+ []>;
+
+def SULD_1D_ARRAY_V2I8_CLAMP
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+ "suld.b.a1d.v2.b8.clamp \\{$r, $g\\}, [$s, \\{$l, $x\\}];",
+ []>;
+def SULD_1D_ARRAY_V2I16_CLAMP
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+ "suld.b.a1d.v2.b16.clamp \\{$r, $g\\}, [$s, \\{$l, $x\\}];",
+ []>;
+def SULD_1D_ARRAY_V2I32_CLAMP
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+ "suld.b.a1d.v2.b32.clamp \\{$r, $g\\}, [$s, \\{$l, $x\\}];",
+ []>;
+def SULD_1D_ARRAY_V2I64_CLAMP
+ : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+ "suld.b.a1d.v2.b64.clamp \\{$r, $g\\}, [$s, \\{$l, $x\\}];",
+ []>;
+
+def SULD_2D_V2I8_CLAMP
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.2d.v2.b8.clamp \\{$r, $g\\}, [$s, \\{$x, $y\\}];",
+ []>;
+def SULD_2D_V2I16_CLAMP
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.2d.v2.b16.clamp \\{$r, $g\\}, [$s, \\{$x, $y\\}];",
+ []>;
+def SULD_2D_V2I32_CLAMP
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.2d.v2.b32.clamp \\{$r, $g\\}, [$s, \\{$x, $y\\}];",
+ []>;
+def SULD_2D_V2I64_CLAMP
+ : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.2d.v2.b64.clamp \\{$r, $g\\}, [$s, \\{$x, $y\\}];",
+ []>;
+
+def SULD_2D_ARRAY_V2I8_CLAMP
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.a2d.v2.b8.clamp \\{$r, $g\\}, "
+ "[$s, \\{$l, $x, $y, $y\\}];",
+ []>;
+def SULD_2D_ARRAY_V2I16_CLAMP
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.a2d.v2.b16.clamp \\{$r, $g\\}, "
+ "[$s, \\{$l, $x, $y, $y\\}];",
+ []>;
+def SULD_2D_ARRAY_V2I32_CLAMP
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.a2d.v2.b32.clamp \\{$r, $g\\}, "
+ "[$s, \\{$l, $x, $y, $y\\}];",
+ []>;
+def SULD_2D_ARRAY_V2I64_CLAMP
+ : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.a2d.v2.b64.clamp \\{$r, $g\\}, "
+ "[$s, \\{$l, $x, $y, $y\\}];",
+ []>;
+
+def SULD_3D_V2I8_CLAMP
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+ "suld.b.3d.v2.b8.clamp \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];",
+ []>;
+def SULD_3D_V2I16_CLAMP
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+ "suld.b.3d.v2.b16.clamp \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];",
+ []>;
+def SULD_3D_V2I32_CLAMP
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+ "suld.b.3d.v2.b32.clamp \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];",
+ []>;
+def SULD_3D_V2I64_CLAMP
+ : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+ "suld.b.3d.v2.b64.clamp \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];",
+ []>;
+}
+
+let IsSuld = 3 in {
+def SULD_1D_V4I8_CLAMP
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (ins Int64Regs:$s, Int32Regs:$x),
+ "suld.b.1d.v4.b8.clamp \\{$r, $g, $b, $a\\}, [$s, \\{$x\\}];",
+ []>;
+def SULD_1D_V4I16_CLAMP
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (ins Int64Regs:$s, Int32Regs:$x),
+ "suld.b.1d.v4.b16.clamp \\{$r, $g, $b, $a\\}, [$s, \\{$x\\}];",
+ []>;
+def SULD_1D_V4I32_CLAMP
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$s, Int32Regs:$x),
+ "suld.b.1d.v4.b32.clamp \\{$r, $g, $b, $a\\}, [$s, \\{$x\\}];",
+ []>;
+
+def SULD_1D_ARRAY_V4I8_CLAMP
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+ "suld.b.a1d.v4.b8.clamp \\{$r, $g, $b, $a\\}, "
+ "[$s, \\{$l, $x\\}];",
+ []>;
+def SULD_1D_ARRAY_V4I16_CLAMP
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+ "suld.b.a1d.v4.b16.clamp \\{$r, $g, $b, $a\\}, "
+ "[$s, \\{$l, $x\\}];",
+ []>;
+def SULD_1D_ARRAY_V4I32_CLAMP
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+ "suld.b.a1d.v4.b32.clamp \\{$r, $g, $b, $a\\}, "
+ "[$s, \\{$l, $x\\}];",
+ []>;
+
+def SULD_2D_V4I8_CLAMP
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.2d.v4.b8.clamp \\{$r, $g, $b, $a\\}, [$s, \\{$x, $y\\}];",
+ []>;
+def SULD_2D_V4I16_CLAMP
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.2d.v4.b16.clamp \\{$r, $g, $b, $a\\}, [$s, \\{$x, $y\\}];",
+ []>;
+def SULD_2D_V4I32_CLAMP
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.2d.v4.b32.clamp \\{$r, $g, $b, $a\\}, [$s, \\{$x, $y\\}];",
+ []>;
+
+def SULD_2D_ARRAY_V4I8_CLAMP
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.a2d.v4.b8.clamp \\{$r, $g, $b, $a\\}, "
+ "[$s, \\{$l, $x, $y, $y\\}];",
+ []>;
+def SULD_2D_ARRAY_V4I16_CLAMP
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.a2d.v4.b16.clamp \\{$r, $g, $b, $a\\}, "
+ "[$s, \\{$l, $x, $y, $y\\}];",
+ []>;
+def SULD_2D_ARRAY_V4I32_CLAMP
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.a2d.v4.b32.clamp \\{$r, $g, $b, $a\\}, "
+ "[$s, \\{$l, $x, $y, $y\\}];",
+ []>;
+
+
+def SULD_3D_V4I8_CLAMP
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+ "suld.b.3d.v4.b8.clamp \\{$r, $g, $b, $a\\}, "
+ "[$s, \\{$x, $y, $z, $z\\}];",
+ []>;
+def SULD_3D_V4I16_CLAMP
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+ "suld.b.3d.v4.b16.clamp \\{$r, $g, $b, $a\\}, "
+ "[$s, \\{$x, $y, $z, $z\\}];",
+ []>;
+def SULD_3D_V4I32_CLAMP
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+ "suld.b.3d.v4.b32.clamp \\{$r, $g, $b, $a\\}, "
+ "[$s, \\{$x, $y, $z, $z\\}];",
+ []>;
+}
+
+
+// .trap variant
+let IsSuld = 1 in {
+def SULD_1D_I8_TRAP
+ : NVPTXInst<(outs Int16Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$x),
+ "suld.b.1d.b8.trap \\{$r\\}, [$s, \\{$x\\}];",
+ []>;
+def SULD_1D_I16_TRAP
+ : NVPTXInst<(outs Int16Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$x),
+ "suld.b.1d.b16.trap \\{$r\\}, [$s, \\{$x\\}];",
+ []>;
+def SULD_1D_I32_TRAP
+ : NVPTXInst<(outs Int32Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$x),
+ "suld.b.1d.b32.trap \\{$r\\}, [$s, \\{$x\\}];",
+ []>;
+def SULD_1D_I64_TRAP
+ : NVPTXInst<(outs Int64Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$x),
+ "suld.b.1d.b64.trap \\{$r\\}, [$s, \\{$x\\}];",
+ []>;
+
+def SULD_1D_ARRAY_I8_TRAP
+ : NVPTXInst<(outs Int16Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+ "suld.b.a1d.b8.trap \\{$r\\}, [$s, \\{$l, $x\\}];",
+ []>;
+def SULD_1D_ARRAY_I16_TRAP
+ : NVPTXInst<(outs Int16Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+ "suld.b.a1d.b16.trap \\{$r\\}, [$s, \\{$l, $x\\}];",
+ []>;
+def SULD_1D_ARRAY_I32_TRAP
+ : NVPTXInst<(outs Int32Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+ "suld.b.a1d.b32.trap \\{$r\\}, [$s, \\{$l, $x\\}];",
+ []>;
+def SULD_1D_ARRAY_I64_TRAP
+ : NVPTXInst<(outs Int64Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+ "suld.b.a1d.b64.trap \\{$r\\}, [$s, \\{$l, $x\\}];",
+ []>;
+
+def SULD_2D_I8_TRAP
+ : NVPTXInst<(outs Int16Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.2d.b8.trap \\{$r\\}, [$s, \\{$x, $y\\}];",
+ []>;
+def SULD_2D_I16_TRAP
+ : NVPTXInst<(outs Int16Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.2d.b16.trap \\{$r\\}, [$s, \\{$x, $y\\}];",
+ []>;
+def SULD_2D_I32_TRAP
+ : NVPTXInst<(outs Int32Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.2d.b32.trap \\{$r\\}, [$s, \\{$x, $y\\}];",
+ []>;
+def SULD_2D_I64_TRAP
+ : NVPTXInst<(outs Int64Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.2d.b64.trap \\{$r\\}, [$s, \\{$x, $y\\}];",
+ []>;
+
+def SULD_2D_ARRAY_I8_TRAP
+ : NVPTXInst<(outs Int16Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.a2d.b8.trap \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];",
+ []>;
+def SULD_2D_ARRAY_I16_TRAP
+ : NVPTXInst<(outs Int16Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.a2d.b16.trap \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];",
+ []>;
+def SULD_2D_ARRAY_I32_TRAP
+ : NVPTXInst<(outs Int32Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.a2d.b32.trap \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];",
+ []>;
+def SULD_2D_ARRAY_I64_TRAP
+ : NVPTXInst<(outs Int64Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.a2d.b64.trap \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];",
+ []>;
+
+def SULD_3D_I8_TRAP
+ : NVPTXInst<(outs Int16Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+ "suld.b.3d.b8.trap \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];",
+ []>;
+def SULD_3D_I16_TRAP
+ : NVPTXInst<(outs Int16Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+ "suld.b.3d.b16.trap \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];",
+ []>;
+def SULD_3D_I32_TRAP
+ : NVPTXInst<(outs Int32Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+ "suld.b.3d.b32.trap \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];",
+ []>;
+def SULD_3D_I64_TRAP
+ : NVPTXInst<(outs Int64Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+ "suld.b.3d.b64.trap \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];",
+ []>;
+}
+
+let IsSuld = 2 in {
+def SULD_1D_V2I8_TRAP
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$x),
+ "suld.b.1d.v2.b8.trap \\{$r, $g\\}, [$s, \\{$x\\}];",
+ []>;
+def SULD_1D_V2I16_TRAP
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$x),
+ "suld.b.1d.v2.b16.trap \\{$r, $g\\}, [$s, \\{$x\\}];",
+ []>;
+def SULD_1D_V2I32_TRAP
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$x),
+ "suld.b.1d.v2.b32.trap \\{$r, $g\\}, [$s, \\{$x\\}];",
+ []>;
+def SULD_1D_V2I64_TRAP
+ : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$x),
+ "suld.b.1d.v2.b64.trap \\{$r, $g\\}, [$s, \\{$x\\}];",
+ []>;
+
+def SULD_1D_ARRAY_V2I8_TRAP
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+ "suld.b.a1d.v2.b8.trap \\{$r, $g\\}, [$s, \\{$l, $x\\}];",
+ []>;
+def SULD_1D_ARRAY_V2I16_TRAP
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+ "suld.b.a1d.v2.b16.trap \\{$r, $g\\}, [$s, \\{$l, $x\\}];",
+ []>;
+def SULD_1D_ARRAY_V2I32_TRAP
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+ "suld.b.a1d.v2.b32.trap \\{$r, $g\\}, [$s, \\{$l, $x\\}];",
+ []>;
+def SULD_1D_ARRAY_V2I64_TRAP
+ : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+ "suld.b.a1d.v2.b64.trap \\{$r, $g\\}, [$s, \\{$l, $x\\}];",
+ []>;
+
+def SULD_2D_V2I8_TRAP
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.2d.v2.b8.trap \\{$r, $g\\}, [$s, \\{$x, $y\\}];",
+ []>;
+def SULD_2D_V2I16_TRAP
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.2d.v2.b16.trap \\{$r, $g\\}, [$s, \\{$x, $y\\}];",
+ []>;
+def SULD_2D_V2I32_TRAP
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.2d.v2.b32.trap \\{$r, $g\\}, [$s, \\{$x, $y\\}];",
+ []>;
+def SULD_2D_V2I64_TRAP
+ : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.2d.v2.b64.trap \\{$r, $g\\}, [$s, \\{$x, $y\\}];",
+ []>;
+
+def SULD_2D_ARRAY_V2I8_TRAP
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.a2d.v2.b8.trap \\{$r, $g\\}, "
+ "[$s, \\{$l, $x, $y, $y\\}];",
+ []>;
+def SULD_2D_ARRAY_V2I16_TRAP
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.a2d.v2.b16.trap \\{$r, $g\\}, "
+ "[$s, \\{$l, $x, $y, $y\\}];",
+ []>;
+def SULD_2D_ARRAY_V2I32_TRAP
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.a2d.v2.b32.trap \\{$r, $g\\}, "
+ "[$s, \\{$l, $x, $y, $y\\}];",
+ []>;
+def SULD_2D_ARRAY_V2I64_TRAP
+ : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.a2d.v2.b64.trap \\{$r, $g\\}, "
+ "[$s, \\{$l, $x, $y, $y\\}];",
+ []>;
+
+def SULD_3D_V2I8_TRAP
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+ "suld.b.3d.v2.b8.trap \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];",
+ []>;
+def SULD_3D_V2I16_TRAP
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+ "suld.b.3d.v2.b16.trap \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];",
+ []>;
+def SULD_3D_V2I32_TRAP
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+ "suld.b.3d.v2.b32.trap \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];",
+ []>;
+def SULD_3D_V2I64_TRAP
+ : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+ "suld.b.3d.v2.b64.trap \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];",
+ []>;
+}
+
+let IsSuld = 3 in {
+def SULD_1D_V4I8_TRAP
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (ins Int64Regs:$s, Int32Regs:$x),
+ "suld.b.1d.v4.b8.trap \\{$r, $g, $b, $a\\}, [$s, \\{$x\\}];",
+ []>;
+def SULD_1D_V4I16_TRAP
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (ins Int64Regs:$s, Int32Regs:$x),
+ "suld.b.1d.v4.b16.trap \\{$r, $g, $b, $a\\}, [$s, \\{$x\\}];",
+ []>;
+def SULD_1D_V4I32_TRAP
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$s, Int32Regs:$x),
+ "suld.b.1d.v4.b32.trap \\{$r, $g, $b, $a\\}, [$s, \\{$x\\}];",
+ []>;
+
+def SULD_1D_ARRAY_V4I8_TRAP
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+ "suld.b.a1d.v4.b8.trap \\{$r, $g, $b, $a\\}, "
+ "[$s, \\{$l, $x\\}];",
+ []>;
+def SULD_1D_ARRAY_V4I16_TRAP
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+ "suld.b.a1d.v4.b16.trap \\{$r, $g, $b, $a\\}, "
+ "[$s, \\{$l, $x\\}];",
+ []>;
+def SULD_1D_ARRAY_V4I32_TRAP
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+ "suld.b.a1d.v4.b32.trap \\{$r, $g, $b, $a\\}, "
+ "[$s, \\{$l, $x\\}];",
+ []>;
+
+def SULD_2D_V4I8_TRAP
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.2d.v4.b8.trap \\{$r, $g, $b, $a\\}, [$s, \\{$x, $y\\}];",
+ []>;
+def SULD_2D_V4I16_TRAP
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.2d.v4.b16.trap \\{$r, $g, $b, $a\\}, [$s, \\{$x, $y\\}];",
+ []>;
+def SULD_2D_V4I32_TRAP
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.2d.v4.b32.trap \\{$r, $g, $b, $a\\}, [$s, \\{$x, $y\\}];",
+ []>;
+
+def SULD_2D_ARRAY_V4I8_TRAP
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.a2d.v4.b8.trap \\{$r, $g, $b, $a\\}, "
+ "[$s, \\{$l, $x, $y, $y\\}];",
+ []>;
+def SULD_2D_ARRAY_V4I16_TRAP
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.a2d.v4.b16.trap \\{$r, $g, $b, $a\\}, "
+ "[$s, \\{$l, $x, $y, $y\\}];",
+ []>;
+def SULD_2D_ARRAY_V4I32_TRAP
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.a2d.v4.b32.trap \\{$r, $g, $b, $a\\}, "
+ "[$s, \\{$l, $x, $y, $y\\}];",
+ []>;
+
+
+def SULD_3D_V4I8_TRAP
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+ "suld.b.3d.v4.b8.trap \\{$r, $g, $b, $a\\}, "
+ "[$s, \\{$x, $y, $z, $z\\}];",
+ []>;
+def SULD_3D_V4I16_TRAP
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+ "suld.b.3d.v4.b16.trap \\{$r, $g, $b, $a\\}, "
+ "[$s, \\{$x, $y, $z, $z\\}];",
+ []>;
+def SULD_3D_V4I32_TRAP
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+ "suld.b.3d.v4.b32.trap \\{$r, $g, $b, $a\\}, "
+ "[$s, \\{$x, $y, $z, $z\\}];",
+ []>;
+}
+
+// .zero variant
+let IsSuld = 1 in {
+def SULD_1D_I8_ZERO
+ : NVPTXInst<(outs Int16Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$x),
+ "suld.b.1d.b8.zero \\{$r\\}, [$s, \\{$x\\}];",
+ []>;
+def SULD_1D_I16_ZERO
+ : NVPTXInst<(outs Int16Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$x),
+ "suld.b.1d.b16.zero \\{$r\\}, [$s, \\{$x\\}];",
+ []>;
+def SULD_1D_I32_ZERO
+ : NVPTXInst<(outs Int32Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$x),
+ "suld.b.1d.b32.zero \\{$r\\}, [$s, \\{$x\\}];",
+ []>;
+def SULD_1D_I64_ZERO
+ : NVPTXInst<(outs Int64Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$x),
+ "suld.b.1d.b64.zero \\{$r\\}, [$s, \\{$x\\}];",
+ []>;
+
+def SULD_1D_ARRAY_I8_ZERO
+ : NVPTXInst<(outs Int16Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+ "suld.b.a1d.b8.zero \\{$r\\}, [$s, \\{$l, $x\\}];",
+ []>;
+def SULD_1D_ARRAY_I16_ZERO
+ : NVPTXInst<(outs Int16Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+ "suld.b.a1d.b16.zero \\{$r\\}, [$s, \\{$l, $x\\}];",
+ []>;
+def SULD_1D_ARRAY_I32_ZERO
+ : NVPTXInst<(outs Int32Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+ "suld.b.a1d.b32.zero \\{$r\\}, [$s, \\{$l, $x\\}];",
+ []>;
+def SULD_1D_ARRAY_I64_ZERO
+ : NVPTXInst<(outs Int64Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+ "suld.b.a1d.b64.zero \\{$r\\}, [$s, \\{$l, $x\\}];",
+ []>;
+
+def SULD_2D_I8_ZERO
+ : NVPTXInst<(outs Int16Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.2d.b8.zero \\{$r\\}, [$s, \\{$x, $y\\}];",
+ []>;
+def SULD_2D_I16_ZERO
+ : NVPTXInst<(outs Int16Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.2d.b16.zero \\{$r\\}, [$s, \\{$x, $y\\}];",
+ []>;
+def SULD_2D_I32_ZERO
+ : NVPTXInst<(outs Int32Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.2d.b32.zero \\{$r\\}, [$s, \\{$x, $y\\}];",
+ []>;
+def SULD_2D_I64_ZERO
+ : NVPTXInst<(outs Int64Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.2d.b64.zero \\{$r\\}, [$s, \\{$x, $y\\}];",
+ []>;
+
+def SULD_2D_ARRAY_I8_ZERO
+ : NVPTXInst<(outs Int16Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.a2d.b8.zero \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];",
+ []>;
+def SULD_2D_ARRAY_I16_ZERO
+ : NVPTXInst<(outs Int16Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.a2d.b16.zero \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];",
+ []>;
+def SULD_2D_ARRAY_I32_ZERO
+ : NVPTXInst<(outs Int32Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.a2d.b32.zero \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];",
+ []>;
+def SULD_2D_ARRAY_I64_ZERO
+ : NVPTXInst<(outs Int64Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.a2d.b64.zero \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];",
+ []>;
+
+def SULD_3D_I8_ZERO
+ : NVPTXInst<(outs Int16Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+ "suld.b.3d.b8.zero \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];",
+ []>;
+def SULD_3D_I16_ZERO
+ : NVPTXInst<(outs Int16Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+ "suld.b.3d.b16.zero \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];",
+ []>;
+def SULD_3D_I32_ZERO
+ : NVPTXInst<(outs Int32Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+ "suld.b.3d.b32.zero \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];",
+ []>;
+def SULD_3D_I64_ZERO
+ : NVPTXInst<(outs Int64Regs:$r),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+ "suld.b.3d.b64.zero \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];",
+ []>;
+}
+
+let IsSuld = 2 in {
+def SULD_1D_V2I8_ZERO
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$x),
+ "suld.b.1d.v2.b8.zero \\{$r, $g\\}, [$s, \\{$x\\}];",
+ []>;
+def SULD_1D_V2I16_ZERO
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$x),
+ "suld.b.1d.v2.b16.zero \\{$r, $g\\}, [$s, \\{$x\\}];",
+ []>;
+def SULD_1D_V2I32_ZERO
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$x),
+ "suld.b.1d.v2.b32.zero \\{$r, $g\\}, [$s, \\{$x\\}];",
+ []>;
+def SULD_1D_V2I64_ZERO
+ : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$x),
+ "suld.b.1d.v2.b64.zero \\{$r, $g\\}, [$s, \\{$x\\}];",
+ []>;
+
+def SULD_1D_ARRAY_V2I8_ZERO
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+ "suld.b.a1d.v2.b8.zero \\{$r, $g\\}, [$s, \\{$l, $x\\}];",
+ []>;
+def SULD_1D_ARRAY_V2I16_ZERO
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+ "suld.b.a1d.v2.b16.zero \\{$r, $g\\}, [$s, \\{$l, $x\\}];",
+ []>;
+def SULD_1D_ARRAY_V2I32_ZERO
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+ "suld.b.a1d.v2.b32.zero \\{$r, $g\\}, [$s, \\{$l, $x\\}];",
+ []>;
+def SULD_1D_ARRAY_V2I64_ZERO
+ : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+ "suld.b.a1d.v2.b64.zero \\{$r, $g\\}, [$s, \\{$l, $x\\}];",
+ []>;
+
+def SULD_2D_V2I8_ZERO
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.2d.v2.b8.zero \\{$r, $g\\}, [$s, \\{$x, $y\\}];",
+ []>;
+def SULD_2D_V2I16_ZERO
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.2d.v2.b16.zero \\{$r, $g\\}, [$s, \\{$x, $y\\}];",
+ []>;
+def SULD_2D_V2I32_ZERO
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.2d.v2.b32.zero \\{$r, $g\\}, [$s, \\{$x, $y\\}];",
+ []>;
+def SULD_2D_V2I64_ZERO
+ : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.2d.v2.b64.zero \\{$r, $g\\}, [$s, \\{$x, $y\\}];",
+ []>;
+
+def SULD_2D_ARRAY_V2I8_ZERO
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.a2d.v2.b8.zero \\{$r, $g\\}, "
+ "[$s, \\{$l, $x, $y, $y\\}];",
+ []>;
+def SULD_2D_ARRAY_V2I16_ZERO
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.a2d.v2.b16.zero \\{$r, $g\\}, "
+ "[$s, \\{$l, $x, $y, $y\\}];",
+ []>;
+def SULD_2D_ARRAY_V2I32_ZERO
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.a2d.v2.b32.zero \\{$r, $g\\}, "
+ "[$s, \\{$l, $x, $y, $y\\}];",
+ []>;
+def SULD_2D_ARRAY_V2I64_ZERO
+ : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.a2d.v2.b64.zero \\{$r, $g\\}, "
+ "[$s, \\{$l, $x, $y, $y\\}];",
+ []>;
+
+def SULD_3D_V2I8_ZERO
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+ "suld.b.3d.v2.b8.zero \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];",
+ []>;
+def SULD_3D_V2I16_ZERO
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+ "suld.b.3d.v2.b16.zero \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];",
+ []>;
+def SULD_3D_V2I32_ZERO
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+ "suld.b.3d.v2.b32.zero \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];",
+ []>;
+def SULD_3D_V2I64_ZERO
+ : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+ "suld.b.3d.v2.b64.zero \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];",
+ []>;
+}
+
+let IsSuld = 3 in {
+def SULD_1D_V4I8_ZERO
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (ins Int64Regs:$s, Int32Regs:$x),
+ "suld.b.1d.v4.b8.zero \\{$r, $g, $b, $a\\}, [$s, \\{$x\\}];",
+ []>;
+def SULD_1D_V4I16_ZERO
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (ins Int64Regs:$s, Int32Regs:$x),
+ "suld.b.1d.v4.b16.zero \\{$r, $g, $b, $a\\}, [$s, \\{$x\\}];",
+ []>;
+def SULD_1D_V4I32_ZERO
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$s, Int32Regs:$x),
+ "suld.b.1d.v4.b32.zero \\{$r, $g, $b, $a\\}, [$s, \\{$x\\}];",
+ []>;
+
+def SULD_1D_ARRAY_V4I8_ZERO
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+ "suld.b.a1d.v4.b8.zero \\{$r, $g, $b, $a\\}, "
+ "[$s, \\{$l, $x\\}];",
+ []>;
+def SULD_1D_ARRAY_V4I16_ZERO
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+ "suld.b.a1d.v4.b16.zero \\{$r, $g, $b, $a\\}, "
+ "[$s, \\{$l, $x\\}];",
+ []>;
+def SULD_1D_ARRAY_V4I32_ZERO
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+ "suld.b.a1d.v4.b32.zero \\{$r, $g, $b, $a\\}, "
+ "[$s, \\{$l, $x\\}];",
+ []>;
+
+def SULD_2D_V4I8_ZERO
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.2d.v4.b8.zero \\{$r, $g, $b, $a\\}, [$s, \\{$x, $y\\}];",
+ []>;
+def SULD_2D_V4I16_ZERO
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.2d.v4.b16.zero \\{$r, $g, $b, $a\\}, [$s, \\{$x, $y\\}];",
+ []>;
+def SULD_2D_V4I32_ZERO
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.2d.v4.b32.zero \\{$r, $g, $b, $a\\}, [$s, \\{$x, $y\\}];",
+ []>;
+
+def SULD_2D_ARRAY_V4I8_ZERO
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.a2d.v4.b8.zero \\{$r, $g, $b, $a\\}, "
+ "[$s, \\{$l, $x, $y, $y\\}];",
+ []>;
+def SULD_2D_ARRAY_V4I16_ZERO
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.a2d.v4.b16.zero \\{$r, $g, $b, $a\\}, "
+ "[$s, \\{$l, $x, $y, $y\\}];",
+ []>;
+def SULD_2D_ARRAY_V4I32_ZERO
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+ "suld.b.a2d.v4.b32.zero \\{$r, $g, $b, $a\\}, "
+ "[$s, \\{$l, $x, $y, $y\\}];",
+ []>;
+
+
+def SULD_3D_V4I8_ZERO
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+ "suld.b.3d.v4.b8.zero \\{$r, $g, $b, $a\\}, "
+ "[$s, \\{$x, $y, $z, $z\\}];",
+ []>;
+def SULD_3D_V4I16_ZERO
+ : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+ "suld.b.3d.v4.b16.zero \\{$r, $g, $b, $a\\}, "
+ "[$s, \\{$x, $y, $z, $z\\}];",
+ []>;
+def SULD_3D_V4I32_ZERO
+ : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+ "suld.b.3d.v4.b32.zero \\{$r, $g, $b, $a\\}, "
+ "[$s, \\{$x, $y, $z, $z\\}];",
+ []>;
+}
+
+//-----------------------------------
+// Texture Query Intrinsics
+//-----------------------------------
+
+let IsSurfTexQuery = 1 in {
+def TXQ_CHANNEL_ORDER
+ : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
+ "txq.channel_order.b32 \t$d, [$a];",
+ []>;
+def TXQ_CHANNEL_DATA_TYPE
+ : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
+ "txq.channel_data_type.b32 \t$d, [$a];",
+ []>;
+def TXQ_WIDTH
+ : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
+ "txq.width.b32 \t$d, [$a];",
+ []>;
+def TXQ_HEIGHT
+ : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
+ "txq.height.b32 \t$d, [$a];",
+ []>;
+def TXQ_DEPTH
+ : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
+ "txq.depth.b32 \t$d, [$a];",
+ []>;
+def TXQ_ARRAY_SIZE
+ : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
+ "txq.array_size.b32 \t$d, [$a];",
+ []>;
+def TXQ_NUM_SAMPLES
+ : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
+ "txq.num_samples.b32 \t$d, [$a];",
+ []>;
+def TXQ_NUM_MIPMAP_LEVELS
+ : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
+ "txq.num_mipmap_levels.b32 \t$d, [$a];",
+ []>;
+}
+
+def : Pat<(int_nvvm_txq_channel_order Int64Regs:$a),
+ (TXQ_CHANNEL_ORDER Int64Regs:$a)>;
+def : Pat<(int_nvvm_txq_channel_data_type Int64Regs:$a),
+ (TXQ_CHANNEL_DATA_TYPE Int64Regs:$a)>;
+def : Pat<(int_nvvm_txq_width Int64Regs:$a),
+ (TXQ_WIDTH Int64Regs:$a)>;
+def : Pat<(int_nvvm_txq_height Int64Regs:$a),
+ (TXQ_HEIGHT Int64Regs:$a)>;
+def : Pat<(int_nvvm_txq_depth Int64Regs:$a),
+ (TXQ_DEPTH Int64Regs:$a)>;
+def : Pat<(int_nvvm_txq_array_size Int64Regs:$a),
+ (TXQ_ARRAY_SIZE Int64Regs:$a)>;
+def : Pat<(int_nvvm_txq_num_samples Int64Regs:$a),
+ (TXQ_NUM_SAMPLES Int64Regs:$a)>;
+def : Pat<(int_nvvm_txq_num_mipmap_levels Int64Regs:$a),
+ (TXQ_NUM_MIPMAP_LEVELS Int64Regs:$a)>;
+
+
+//-----------------------------------
+// Surface Query Intrinsics
+//-----------------------------------
+
+let IsSurfTexQuery = 1 in {
+def SUQ_CHANNEL_ORDER
+ : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
+ "suq.channel_order.b32 \t$d, [$a];",
+ []>;
+def SUQ_CHANNEL_DATA_TYPE
+ : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
+ "suq.channel_data_type.b32 \t$d, [$a];",
+ []>;
+def SUQ_WIDTH
+ : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
+ "suq.width.b32 \t$d, [$a];",
+ []>;
+def SUQ_HEIGHT
+ : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
+ "suq.height.b32 \t$d, [$a];",
+ []>;
+def SUQ_DEPTH
+ : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
+ "suq.depth.b32 \t$d, [$a];",
+ []>;
+def SUQ_ARRAY_SIZE
+ : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
+ "suq.array_size.b32 \t$d, [$a];",
+ []>;
+}
+
+def : Pat<(int_nvvm_suq_channel_order Int64Regs:$a),
+ (SUQ_CHANNEL_ORDER Int64Regs:$a)>;
+def : Pat<(int_nvvm_suq_channel_data_type Int64Regs:$a),
+ (SUQ_CHANNEL_DATA_TYPE Int64Regs:$a)>;
+def : Pat<(int_nvvm_suq_width Int64Regs:$a),
+ (SUQ_WIDTH Int64Regs:$a)>;
+def : Pat<(int_nvvm_suq_height Int64Regs:$a),
+ (SUQ_HEIGHT Int64Regs:$a)>;
+def : Pat<(int_nvvm_suq_depth Int64Regs:$a),
+ (SUQ_DEPTH Int64Regs:$a)>;
+def : Pat<(int_nvvm_suq_array_size Int64Regs:$a),
+ (SUQ_ARRAY_SIZE Int64Regs:$a)>;
+
+
+//===- Handle Query -------------------------------------------------------===//
+
+// TODO: These intrinsics are not yet finalized, pending PTX ISA design work
+def ISTYPEP_SAMPLER
+ : NVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a),
+ "istypep.samplerref \t$d, $a;",
+ [(set Int1Regs:$d, (int_nvvm_istypep_sampler Int64Regs:$a))]>;
+def ISTYPEP_SURFACE
+ : NVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a),
+ "istypep.surfref \t$d, $a;",
+ [(set Int1Regs:$d, (int_nvvm_istypep_surface Int64Regs:$a))]>;
+def ISTYPEP_TEXTURE
+ : NVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a),
+ "istypep.texref \t$d, $a;",
+ [(set Int1Regs:$d, (int_nvvm_istypep_texture Int64Regs:$a))]>;
+
+//===- Surface Stores -----------------------------------------------------===//
+
+let IsSust = 1 in {
+// Unformatted
+// .clamp variant
+def SUST_B_1D_B8_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
+ "sust.b.1d.b8.clamp \t[$s, \\{$x\\}], \\{$r\\};",
+ []>;
+def SUST_B_1D_B16_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
+ "sust.b.1d.b16.clamp \t[$s, \\{$x\\}], \\{$r\\};",
+ []>;
+def SUST_B_1D_B32_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r),
+ "sust.b.1d.b32.clamp \t[$s, \\{$x\\}], \\{$r\\};",
+ []>;
+def SUST_B_1D_B64_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int64Regs:$r),
+ "sust.b.1d.b64.clamp \t[$s, \\{$x\\}], \\{$r\\};",
+ []>;
+def SUST_B_1D_V2B8_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+ "sust.b.1d.v2.b8.clamp \t[$s, \\{$x\\}], \\{$r, $g\\};",
+ []>;
+def SUST_B_1D_V2B16_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+ "sust.b.1d.v2.b16.clamp \t[$s, \\{$x\\}], \\{$r, $g\\};",
+ []>;
+def SUST_B_1D_V2B32_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g),
+ "sust.b.1d.v2.b32.clamp \t[$s, \\{$x\\}], \\{$r, $g\\};",
+ []>;
+def SUST_B_1D_V2B64_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g),
+ "sust.b.1d.v2.b64.clamp \t[$s, \\{$x\\}], \\{$r, $g\\};",
+ []>;
+def SUST_B_1D_V4B8_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g,
+ Int16Regs:$b, Int16Regs:$a),
+ "sust.b.1d.v4.b8.clamp \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};",
+ []>;
+def SUST_B_1D_V4B16_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g,
+ Int16Regs:$b, Int16Regs:$a),
+ "sust.b.1d.v4.b16.clamp \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};",
+ []>;
+def SUST_B_1D_V4B32_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ "sust.b.1d.v4.b32.clamp \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};",
+ []>;
+
+
+def SUST_B_1D_ARRAY_B8_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r),
+ "sust.b.a1d.b8.clamp \t[$s, \\{$idx, $x\\}], \\{$r\\};",
+ []>;
+def SUST_B_1D_ARRAY_B16_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r),
+ "sust.b.a1d.b16.clamp \t[$s, \\{$idx, $x\\}], \\{$r\\};",
+ []>;
+def SUST_B_1D_ARRAY_B32_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r),
+ "sust.b.a1d.b32.clamp \t[$s, \\{$idx, $x\\}], \\{$r\\};",
+ []>;
+def SUST_B_1D_ARRAY_B64_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int64Regs:$r),
+ "sust.b.a1d.b64.clamp \t[$s, \\{$idx, $x\\}], \\{$r\\};",
+ []>;
+def SUST_B_1D_ARRAY_V2B8_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r,
+ Int16Regs:$g),
+ "sust.b.a1d.v2.b8.clamp \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};",
+ []>;
+def SUST_B_1D_ARRAY_V2B16_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r,
+ Int16Regs:$g),
+ "sust.b.a1d.v2.b16.clamp \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};",
+ []>;
+def SUST_B_1D_ARRAY_V2B32_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r,
+ Int32Regs:$g),
+ "sust.b.a1d.v2.b32.clamp \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};",
+ []>;
+def SUST_B_1D_ARRAY_V2B64_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int64Regs:$r,
+ Int64Regs:$g),
+ "sust.b.a1d.v2.b64.clamp \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};",
+ []>;
+def SUST_B_1D_ARRAY_V4B8_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r,
+ Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ "sust.b.a1d.v4.b8.clamp \t[$s, \\{$idx, $x\\}], "
+ "\\{$r, $g, $b, $a\\};",
+ []>;
+def SUST_B_1D_ARRAY_V4B16_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r,
+ Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ "sust.b.a1d.v4.b16.clamp \t[$s, \\{$idx, $x\\}], "
+ "\\{$r, $g, $b, $a\\};",
+ []>;
+def SUST_B_1D_ARRAY_V4B32_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r,
+ Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ "sust.b.a1d.v4.b32.clamp \t[$s, \\{$idx, $x\\}], "
+ "\\{$r, $g, $b, $a\\};",
+ []>;
+
+
+def SUST_B_2D_B8_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+ "sust.b.2d.b8.clamp \t[$s, \\{$x, $y\\}], \\{$r\\};",
+ []>;
+def SUST_B_2D_B16_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+ "sust.b.2d.b16.clamp \t[$s, \\{$x, $y\\}], \\{$r\\};",
+ []>;
+def SUST_B_2D_B32_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r),
+ "sust.b.2d.b32.clamp \t[$s, \\{$x, $y\\}], \\{$r\\};",
+ []>;
+def SUST_B_2D_B64_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r),
+ "sust.b.2d.b64.clamp \t[$s, \\{$x, $y\\}], \\{$r\\};",
+ []>;
+def SUST_B_2D_V2B8_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r,
+ Int16Regs:$g),
+ "sust.b.2d.v2.b8.clamp \t[$s, \\{$x, $y\\}], \\{$r, $g\\};",
+ []>;
+def SUST_B_2D_V2B16_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r,
+ Int16Regs:$g),
+ "sust.b.2d.v2.b16.clamp \t[$s, \\{$x, $y\\}], \\{$r, $g\\};",
+ []>;
+def SUST_B_2D_V2B32_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r,
+ Int32Regs:$g),
+ "sust.b.2d.v2.b32.clamp \t[$s, \\{$x, $y\\}], \\{$r, $g\\};",
+ []>;
+def SUST_B_2D_V2B64_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r,
+ Int64Regs:$g),
+ "sust.b.2d.v2.b64.clamp \t[$s, \\{$x, $y\\}], \\{$r, $g\\};",
+ []>;
+def SUST_B_2D_V4B8_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r,
+ Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ "sust.b.2d.v4.b8.clamp \t[$s, \\{$x, $y\\}], "
+ "\\{$r, $g, $b, $a\\};",
+ []>;
+def SUST_B_2D_V4B16_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r,
+ Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ "sust.b.2d.v4.b16.clamp \t[$s, \\{$x, $y\\}], "
+ "\\{$r, $g, $b, $a\\};",
+ []>;
+def SUST_B_2D_V4B32_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r,
+ Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ "sust.b.2d.v4.b32.clamp \t[$s, \\{$x, $y\\}], "
+ "\\{$r, $g, $b, $a\\};",
+ []>;
+
+
+def SUST_B_2D_ARRAY_B8_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r),
+ "sust.b.a2d.b8.clamp \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};",
+ []>;
+def SUST_B_2D_ARRAY_B16_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r),
+ "sust.b.a2d.b16.clamp \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};",
+ []>;
+def SUST_B_2D_ARRAY_B32_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+ Int32Regs:$r),
+ "sust.b.a2d.b32.clamp \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};",
+ []>;
+def SUST_B_2D_ARRAY_B64_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+ Int64Regs:$r),
+ "sust.b.a2d.b64.clamp \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};",
+ []>;
+def SUST_B_2D_ARRAY_V2B8_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g),
+ "sust.b.a2d.v2.b8.clamp \t[$s, \\{$idx, $x, $y, $y\\}], "
+ "\\{$r, $g\\};",
+ []>;
+def SUST_B_2D_ARRAY_V2B16_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g),
+ "sust.b.a2d.v2.b16.clamp \t[$s, \\{$idx, $x, $y, $y\\}], "
+ "\\{$r, $g\\};",
+ []>;
+def SUST_B_2D_ARRAY_V2B32_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+ Int32Regs:$r, Int32Regs:$g),
+ "sust.b.a2d.v2.b32.clamp \t[$s, \\{$idx, $x, $y, $y\\}], "
+ "\\{$r, $g\\};",
+ []>;
+def SUST_B_2D_ARRAY_V2B64_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+ Int64Regs:$r, Int64Regs:$g),
+ "sust.b.a2d.v2.b64.clamp \t[$s, \\{$idx, $x, $y, $y\\}], "
+ "\\{$r, $g\\};",
+ []>;
+def SUST_B_2D_ARRAY_V4B8_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ "sust.b.a2d.v4.b8.clamp \t[$s, \\{$idx, $x, $y, $y\\}], "
+ "\\{$r, $g, $b, $a\\};",
+ []>;
+def SUST_B_2D_ARRAY_V4B16_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ "sust.b.a2d.v4.b16.clamp \t[$s, \\{$idx, $x, $y, $y\\}], "
+ "\\{$r, $g, $b, $a\\};",
+ []>;
+def SUST_B_2D_ARRAY_V4B32_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+ Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ "sust.b.a2d.v4.b32.clamp \t[$s, \\{$idx, $x, $y, $y\\}], "
+ "\\{$r, $g, $b, $a\\};",
+ []>;
+
+
+def SUST_B_3D_B8_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r),
+ "sust.b.3d.b8.clamp \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};",
+ []>;
+def SUST_B_3D_B16_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r),
+ "sust.b.3d.b16.clamp \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};",
+ []>;
+def SUST_B_3D_B32_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int32Regs:$r),
+ "sust.b.3d.b32.clamp \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};",
+ []>;
+def SUST_B_3D_B64_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int64Regs:$r),
+ "sust.b.3d.b64.clamp \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};",
+ []>;
+def SUST_B_3D_V2B8_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r, Int16Regs:$g),
+ "sust.b.3d.v2.b8.clamp \t[$s, \\{$x, $y, $z, $z\\}], "
+ "\\{$r, $g\\};",
+ []>;
+def SUST_B_3D_V2B16_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r, Int16Regs:$g),
+ "sust.b.3d.v2.b16.clamp \t[$s, \\{$x, $y, $z, $z\\}], "
+ "\\{$r, $g\\};",
+ []>;
+def SUST_B_3D_V2B32_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int32Regs:$r, Int32Regs:$g),
+ "sust.b.3d.v2.b32.clamp \t[$s, \\{$x, $y, $z, $z\\}], "
+ "\\{$r, $g\\};",
+ []>;
+def SUST_B_3D_V2B64_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int64Regs:$r, Int64Regs:$g),
+ "sust.b.3d.v2.b64.clamp \t[$s, \\{$x, $y, $z, $z\\}], "
+ "\\{$r, $g\\};",
+ []>;
+def SUST_B_3D_V4B8_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ "sust.b.3d.v4.b8.clamp \t[$s, \\{$x, $y, $z, $z\\}], "
+ "\\{$r, $g, $b, $a\\};",
+ []>;
+def SUST_B_3D_V4B16_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ "sust.b.3d.v4.b16.clamp \t[$s, \\{$x, $y, $z, $z\\}], "
+ "\\{$r, $g, $b, $a\\};",
+ []>;
+def SUST_B_3D_V4B32_CLAMP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ "sust.b.3d.v4.b32.clamp \t[$s, \\{$x, $y, $z, $z\\}], "
+ "\\{$r, $g, $b, $a\\};",
+ []>;
+
+
+// .trap variant
+def SUST_B_1D_B8_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
+ "sust.b.1d.b8.trap \t[$s, \\{$x\\}], \\{$r\\};",
+ []>;
+def SUST_B_1D_B16_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
+ "sust.b.1d.b16.trap \t[$s, \\{$x\\}], \\{$r\\};",
+ []>;
+def SUST_B_1D_B32_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r),
+ "sust.b.1d.b32.trap \t[$s, \\{$x\\}], \\{$r\\};",
+ []>;
+def SUST_B_1D_B64_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int64Regs:$r),
+ "sust.b.1d.b64.trap \t[$s, \\{$x\\}], \\{$r\\};",
+ []>;
+def SUST_B_1D_V2B8_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+ "sust.b.1d.v2.b8.trap \t[$s, \\{$x\\}], \\{$r, $g\\};",
+ []>;
+def SUST_B_1D_V2B16_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+ "sust.b.1d.v2.b16.trap \t[$s, \\{$x\\}], \\{$r, $g\\};",
+ []>;
+def SUST_B_1D_V2B32_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g),
+ "sust.b.1d.v2.b32.trap \t[$s, \\{$x\\}], \\{$r, $g\\};",
+ []>;
+def SUST_B_1D_V2B64_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g),
+ "sust.b.1d.v2.b64.trap \t[$s, \\{$x\\}], \\{$r, $g\\};",
+ []>;
+def SUST_B_1D_V4B8_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g,
+ Int16Regs:$b, Int16Regs:$a),
+ "sust.b.1d.v4.b8.trap \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};",
+ []>;
+def SUST_B_1D_V4B16_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g,
+ Int16Regs:$b, Int16Regs:$a),
+ "sust.b.1d.v4.b16.trap \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};",
+ []>;
+def SUST_B_1D_V4B32_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ "sust.b.1d.v4.b32.trap \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};",
+ []>;
+
+
+def SUST_B_1D_ARRAY_B8_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r),
+ "sust.b.a1d.b8.trap \t[$s, \\{$idx, $x\\}], \\{$r\\};",
+ []>;
+def SUST_B_1D_ARRAY_B16_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r),
+ "sust.b.a1d.b16.trap \t[$s, \\{$idx, $x\\}], \\{$r\\};",
+ []>;
+def SUST_B_1D_ARRAY_B32_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r),
+ "sust.b.a1d.b32.trap \t[$s, \\{$idx, $x\\}], \\{$r\\};",
+ []>;
+def SUST_B_1D_ARRAY_B64_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int64Regs:$r),
+ "sust.b.a1d.b64.trap \t[$s, \\{$idx, $x\\}], \\{$r\\};",
+ []>;
+def SUST_B_1D_ARRAY_V2B8_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r,
+ Int16Regs:$g),
+ "sust.b.a1d.v2.b8.trap \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};",
+ []>;
+def SUST_B_1D_ARRAY_V2B16_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r,
+ Int16Regs:$g),
+ "sust.b.a1d.v2.b16.trap \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};",
+ []>;
+def SUST_B_1D_ARRAY_V2B32_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r,
+ Int32Regs:$g),
+ "sust.b.a1d.v2.b32.trap \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};",
+ []>;
+def SUST_B_1D_ARRAY_V2B64_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int64Regs:$r,
+ Int64Regs:$g),
+ "sust.b.a1d.v2.b64.trap \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};",
+ []>;
+def SUST_B_1D_ARRAY_V4B8_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r,
+ Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ "sust.b.a1d.v4.b8.trap \t[$s, \\{$idx, $x\\}], "
+ "\\{$r, $g, $b, $a\\};",
+ []>;
+def SUST_B_1D_ARRAY_V4B16_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r,
+ Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ "sust.b.a1d.v4.b16.trap \t[$s, \\{$idx, $x\\}], "
+ "\\{$r, $g, $b, $a\\};",
+ []>;
+def SUST_B_1D_ARRAY_V4B32_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r,
+ Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ "sust.b.a1d.v4.b32.trap \t[$s, \\{$idx, $x\\}], "
+ "\\{$r, $g, $b, $a\\};",
+ []>;
+
+
+def SUST_B_2D_B8_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+ "sust.b.2d.b8.trap \t[$s, \\{$x, $y\\}], \\{$r\\};",
+ []>;
+def SUST_B_2D_B16_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+ "sust.b.2d.b16.trap \t[$s, \\{$x, $y\\}], \\{$r\\};",
+ []>;
+def SUST_B_2D_B32_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r),
+ "sust.b.2d.b32.trap \t[$s, \\{$x, $y\\}], \\{$r\\};",
+ []>;
+def SUST_B_2D_B64_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r),
+ "sust.b.2d.b64.trap \t[$s, \\{$x, $y\\}], \\{$r\\};",
+ []>;
+def SUST_B_2D_V2B8_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r,
+ Int16Regs:$g),
+ "sust.b.2d.v2.b8.trap \t[$s, \\{$x, $y\\}], \\{$r, $g\\};",
+ []>;
+def SUST_B_2D_V2B16_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r,
+ Int16Regs:$g),
+ "sust.b.2d.v2.b16.trap \t[$s, \\{$x, $y\\}], \\{$r, $g\\};",
+ []>;
+def SUST_B_2D_V2B32_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r,
+ Int32Regs:$g),
+ "sust.b.2d.v2.b32.trap \t[$s, \\{$x, $y\\}], \\{$r, $g\\};",
+ []>;
+def SUST_B_2D_V2B64_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r,
+ Int64Regs:$g),
+ "sust.b.2d.v2.b64.trap \t[$s, \\{$x, $y\\}], \\{$r, $g\\};",
+ []>;
+def SUST_B_2D_V4B8_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r,
+ Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ "sust.b.2d.v4.b8.trap \t[$s, \\{$x, $y\\}], "
+ "\\{$r, $g, $b, $a\\};",
+ []>;
+def SUST_B_2D_V4B16_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r,
+ Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ "sust.b.2d.v4.b16.trap \t[$s, \\{$x, $y\\}], "
+ "\\{$r, $g, $b, $a\\};",
+ []>;
+def SUST_B_2D_V4B32_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r,
+ Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ "sust.b.2d.v4.b32.trap \t[$s, \\{$x, $y\\}], "
+ "\\{$r, $g, $b, $a\\};",
+ []>;
+
+
+def SUST_B_2D_ARRAY_B8_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r),
+ "sust.b.a2d.b8.trap \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};",
+ []>;
+def SUST_B_2D_ARRAY_B16_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r),
+ "sust.b.a2d.b16.trap \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};",
+ []>;
+def SUST_B_2D_ARRAY_B32_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+ Int32Regs:$r),
+ "sust.b.a2d.b32.trap \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};",
+ []>;
+def SUST_B_2D_ARRAY_B64_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+ Int64Regs:$r),
+ "sust.b.a2d.b64.trap \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};",
+ []>;
+def SUST_B_2D_ARRAY_V2B8_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g),
+ "sust.b.a2d.v2.b8.trap \t[$s, \\{$idx, $x, $y, $y\\}], "
+ "\\{$r, $g\\};",
+ []>;
+def SUST_B_2D_ARRAY_V2B16_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g),
+ "sust.b.a2d.v2.b16.trap \t[$s, \\{$idx, $x, $y, $y\\}], "
+ "\\{$r, $g\\};",
+ []>;
+def SUST_B_2D_ARRAY_V2B32_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+ Int32Regs:$r, Int32Regs:$g),
+ "sust.b.a2d.v2.b32.trap \t[$s, \\{$idx, $x, $y, $y\\}], "
+ "\\{$r, $g\\};",
+ []>;
+def SUST_B_2D_ARRAY_V2B64_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+ Int64Regs:$r, Int64Regs:$g),
+ "sust.b.a2d.v2.b64.trap \t[$s, \\{$idx, $x, $y, $y\\}], "
+ "\\{$r, $g\\};",
+ []>;
+def SUST_B_2D_ARRAY_V4B8_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ "sust.b.a2d.v4.b8.trap \t[$s, \\{$idx, $x, $y, $y\\}], "
+ "\\{$r, $g, $b, $a\\};",
+ []>;
+def SUST_B_2D_ARRAY_V4B16_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ "sust.b.a2d.v4.b16.trap \t[$s, \\{$idx, $x, $y, $y\\}], "
+ "\\{$r, $g, $b, $a\\};",
+ []>;
+def SUST_B_2D_ARRAY_V4B32_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+ Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ "sust.b.a2d.v4.b32.trap \t[$s, \\{$idx, $x, $y, $y\\}], "
+ "\\{$r, $g, $b, $a\\};",
+ []>;
+
+
+def SUST_B_3D_B8_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r),
+ "sust.b.3d.b8.trap \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};",
+ []>;
+def SUST_B_3D_B16_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r),
+ "sust.b.3d.b16.trap \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};",
+ []>;
+def SUST_B_3D_B32_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int32Regs:$r),
+ "sust.b.3d.b32.trap \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};",
+ []>;
+def SUST_B_3D_B64_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int64Regs:$r),
+ "sust.b.3d.b64.trap \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};",
+ []>;
+def SUST_B_3D_V2B8_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r, Int16Regs:$g),
+ "sust.b.3d.v2.b8.trap \t[$s, \\{$x, $y, $z, $z\\}], "
+ "\\{$r, $g\\};",
+ []>;
+def SUST_B_3D_V2B16_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r, Int16Regs:$g),
+ "sust.b.3d.v2.b16.trap \t[$s, \\{$x, $y, $z, $z\\}], "
+ "\\{$r, $g\\};",
+ []>;
+def SUST_B_3D_V2B32_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int32Regs:$r, Int32Regs:$g),
+ "sust.b.3d.v2.b32.trap \t[$s, \\{$x, $y, $z, $z\\}], "
+ "\\{$r, $g\\};",
+ []>;
+def SUST_B_3D_V2B64_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int64Regs:$r, Int64Regs:$g),
+ "sust.b.3d.v2.b64.trap \t[$s, \\{$x, $y, $z, $z\\}], "
+ "\\{$r, $g\\};",
+ []>;
+def SUST_B_3D_V4B8_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ "sust.b.3d.v4.b8.trap \t[$s, \\{$x, $y, $z, $z\\}], "
+ "\\{$r, $g, $b, $a\\};",
+ []>;
+def SUST_B_3D_V4B16_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ "sust.b.3d.v4.b16.trap \t[$s, \\{$x, $y, $z, $z\\}], "
+ "\\{$r, $g, $b, $a\\};",
+ []>;
+def SUST_B_3D_V4B32_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ "sust.b.3d.v4.b32.trap \t[$s, \\{$x, $y, $z, $z\\}], "
+ "\\{$r, $g, $b, $a\\};",
+ []>;
+
+
+// .zero variant
+def SUST_B_1D_B8_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
+ "sust.b.1d.b8.zero \t[$s, \\{$x\\}], \\{$r\\};",
+ []>;
+def SUST_B_1D_B16_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
+ "sust.b.1d.b16.zero \t[$s, \\{$x\\}], \\{$r\\};",
+ []>;
+def SUST_B_1D_B32_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r),
+ "sust.b.1d.b32.zero \t[$s, \\{$x\\}], \\{$r\\};",
+ []>;
+def SUST_B_1D_B64_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int64Regs:$r),
+ "sust.b.1d.b64.zero \t[$s, \\{$x\\}], \\{$r\\};",
+ []>;
+def SUST_B_1D_V2B8_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+ "sust.b.1d.v2.b8.zero \t[$s, \\{$x\\}], \\{$r, $g\\};",
+ []>;
+def SUST_B_1D_V2B16_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+ "sust.b.1d.v2.b16.zero \t[$s, \\{$x\\}], \\{$r, $g\\};",
+ []>;
+def SUST_B_1D_V2B32_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g),
+ "sust.b.1d.v2.b32.zero \t[$s, \\{$x\\}], \\{$r, $g\\};",
+ []>;
+def SUST_B_1D_V2B64_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g),
+ "sust.b.1d.v2.b64.zero \t[$s, \\{$x\\}], \\{$r, $g\\};",
+ []>;
+def SUST_B_1D_V4B8_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g,
+ Int16Regs:$b, Int16Regs:$a),
+ "sust.b.1d.v4.b8.zero \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};",
+ []>;
+def SUST_B_1D_V4B16_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g,
+ Int16Regs:$b, Int16Regs:$a),
+ "sust.b.1d.v4.b16.zero \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};",
+ []>;
+def SUST_B_1D_V4B32_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ "sust.b.1d.v4.b32.zero \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};",
+ []>;
+
+
+def SUST_B_1D_ARRAY_B8_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r),
+ "sust.b.a1d.b8.zero \t[$s, \\{$idx, $x\\}], \\{$r\\};",
+ []>;
+def SUST_B_1D_ARRAY_B16_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r),
+ "sust.b.a1d.b16.zero \t[$s, \\{$idx, $x\\}], \\{$r\\};",
+ []>;
+def SUST_B_1D_ARRAY_B32_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r),
+ "sust.b.a1d.b32.zero \t[$s, \\{$idx, $x\\}], \\{$r\\};",
+ []>;
+def SUST_B_1D_ARRAY_B64_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int64Regs:$r),
+ "sust.b.a1d.b64.zero \t[$s, \\{$idx, $x\\}], \\{$r\\};",
+ []>;
+def SUST_B_1D_ARRAY_V2B8_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r,
+ Int16Regs:$g),
+ "sust.b.a1d.v2.b8.zero \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};",
+ []>;
+def SUST_B_1D_ARRAY_V2B16_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r,
+ Int16Regs:$g),
+ "sust.b.a1d.v2.b16.zero \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};",
+ []>;
+def SUST_B_1D_ARRAY_V2B32_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r,
+ Int32Regs:$g),
+ "sust.b.a1d.v2.b32.zero \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};",
+ []>;
+def SUST_B_1D_ARRAY_V2B64_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int64Regs:$r,
+ Int64Regs:$g),
+ "sust.b.a1d.v2.b64.zero \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};",
+ []>;
+def SUST_B_1D_ARRAY_V4B8_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r,
+ Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ "sust.b.a1d.v4.b8.zero \t[$s, \\{$idx, $x\\}], "
+ "\\{$r, $g, $b, $a\\};",
+ []>;
+def SUST_B_1D_ARRAY_V4B16_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r,
+ Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ "sust.b.a1d.v4.b16.zero \t[$s, \\{$idx, $x\\}], "
+ "\\{$r, $g, $b, $a\\};",
+ []>;
+def SUST_B_1D_ARRAY_V4B32_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r,
+ Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ "sust.b.a1d.v4.b32.zero \t[$s, \\{$idx, $x\\}], "
+ "\\{$r, $g, $b, $a\\};",
+ []>;
+
+
+def SUST_B_2D_B8_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+ "sust.b.2d.b8.zero \t[$s, \\{$x, $y\\}], \\{$r\\};",
+ []>;
+def SUST_B_2D_B16_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+ "sust.b.2d.b16.zero \t[$s, \\{$x, $y\\}], \\{$r\\};",
+ []>;
+def SUST_B_2D_B32_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r),
+ "sust.b.2d.b32.zero \t[$s, \\{$x, $y\\}], \\{$r\\};",
+ []>;
+def SUST_B_2D_B64_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r),
+ "sust.b.2d.b64.zero \t[$s, \\{$x, $y\\}], \\{$r\\};",
+ []>;
+def SUST_B_2D_V2B8_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r,
+ Int16Regs:$g),
+ "sust.b.2d.v2.b8.zero \t[$s, \\{$x, $y\\}], \\{$r, $g\\};",
+ []>;
+def SUST_B_2D_V2B16_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r,
+ Int16Regs:$g),
+ "sust.b.2d.v2.b16.zero \t[$s, \\{$x, $y\\}], \\{$r, $g\\};",
+ []>;
+def SUST_B_2D_V2B32_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r,
+ Int32Regs:$g),
+ "sust.b.2d.v2.b32.zero \t[$s, \\{$x, $y\\}], \\{$r, $g\\};",
+ []>;
+def SUST_B_2D_V2B64_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r,
+ Int64Regs:$g),
+ "sust.b.2d.v2.b64.zero \t[$s, \\{$x, $y\\}], \\{$r, $g\\};",
+ []>;
+def SUST_B_2D_V4B8_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r,
+ Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ "sust.b.2d.v4.b8.zero \t[$s, \\{$x, $y\\}], "
+ "\\{$r, $g, $b, $a\\};",
+ []>;
+def SUST_B_2D_V4B16_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r,
+ Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ "sust.b.2d.v4.b16.zero \t[$s, \\{$x, $y\\}], "
+ "\\{$r, $g, $b, $a\\};",
+ []>;
+def SUST_B_2D_V4B32_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r,
+ Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ "sust.b.2d.v4.b32.zero \t[$s, \\{$x, $y\\}], "
+ "\\{$r, $g, $b, $a\\};",
+ []>;
+
+
+def SUST_B_2D_ARRAY_B8_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r),
+ "sust.b.a2d.b8.zero \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};",
+ []>;
+def SUST_B_2D_ARRAY_B16_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r),
+ "sust.b.a2d.b16.zero \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};",
+ []>;
+def SUST_B_2D_ARRAY_B32_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+ Int32Regs:$r),
+ "sust.b.a2d.b32.zero \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};",
+ []>;
+def SUST_B_2D_ARRAY_B64_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+ Int64Regs:$r),
+ "sust.b.a2d.b64.zero \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};",
+ []>;
+def SUST_B_2D_ARRAY_V2B8_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g),
+ "sust.b.a2d.v2.b8.zero \t[$s, \\{$idx, $x, $y, $y\\}], "
+ "\\{$r, $g\\};",
+ []>;
+def SUST_B_2D_ARRAY_V2B16_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g),
+ "sust.b.a2d.v2.b16.zero \t[$s, \\{$idx, $x, $y, $y\\}], "
+ "\\{$r, $g\\};",
+ []>;
+def SUST_B_2D_ARRAY_V2B32_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+ Int32Regs:$r, Int32Regs:$g),
+ "sust.b.a2d.v2.b32.zero \t[$s, \\{$idx, $x, $y, $y\\}], "
+ "\\{$r, $g\\};",
+ []>;
+def SUST_B_2D_ARRAY_V2B64_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+ Int64Regs:$r, Int64Regs:$g),
+ "sust.b.a2d.v2.b64.zero \t[$s, \\{$idx, $x, $y, $y\\}], "
+ "\\{$r, $g\\};",
+ []>;
+def SUST_B_2D_ARRAY_V4B8_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ "sust.b.a2d.v4.b8.zero \t[$s, \\{$idx, $x, $y, $y\\}], "
+ "\\{$r, $g, $b, $a\\};",
+ []>;
+def SUST_B_2D_ARRAY_V4B16_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ "sust.b.a2d.v4.b16.zero \t[$s, \\{$idx, $x, $y, $y\\}], "
+ "\\{$r, $g, $b, $a\\};",
+ []>;
+def SUST_B_2D_ARRAY_V4B32_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+ Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ "sust.b.a2d.v4.b32.zero \t[$s, \\{$idx, $x, $y, $y\\}], "
+ "\\{$r, $g, $b, $a\\};",
+ []>;
+
+
+def SUST_B_3D_B8_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r),
+ "sust.b.3d.b8.zero \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};",
+ []>;
+def SUST_B_3D_B16_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r),
+ "sust.b.3d.b16.zero \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};",
+ []>;
+def SUST_B_3D_B32_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int32Regs:$r),
+ "sust.b.3d.b32.zero \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};",
+ []>;
+def SUST_B_3D_B64_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int64Regs:$r),
+ "sust.b.3d.b64.zero \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};",
+ []>;
+def SUST_B_3D_V2B8_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r, Int16Regs:$g),
+ "sust.b.3d.v2.b8.zero \t[$s, \\{$x, $y, $z, $z\\}], "
+ "\\{$r, $g\\};",
+ []>;
+def SUST_B_3D_V2B16_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r, Int16Regs:$g),
+ "sust.b.3d.v2.b16.zero \t[$s, \\{$x, $y, $z, $z\\}], "
+ "\\{$r, $g\\};",
+ []>;
+def SUST_B_3D_V2B32_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int32Regs:$r, Int32Regs:$g),
+ "sust.b.3d.v2.b32.zero \t[$s, \\{$x, $y, $z, $z\\}], "
+ "\\{$r, $g\\};",
+ []>;
+def SUST_B_3D_V2B64_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int64Regs:$r, Int64Regs:$g),
+ "sust.b.3d.v2.b64.zero \t[$s, \\{$x, $y, $z, $z\\}], "
+ "\\{$r, $g\\};",
+ []>;
+def SUST_B_3D_V4B8_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ "sust.b.3d.v4.b8.zero \t[$s, \\{$x, $y, $z, $z\\}], "
+ "\\{$r, $g, $b, $a\\};",
+ []>;
+def SUST_B_3D_V4B16_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ "sust.b.3d.v4.b16.zero \t[$s, \\{$x, $y, $z, $z\\}], "
+ "\\{$r, $g, $b, $a\\};",
+ []>;
+def SUST_B_3D_V4B32_ZERO
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ "sust.b.3d.v4.b32.zero \t[$s, \\{$x, $y, $z, $z\\}], "
+ "\\{$r, $g, $b, $a\\};",
+ []>;
+
+
+
+// Formatted surface store instructions (sust.p)
+
+def SUST_P_1D_B8_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
+ "sust.p.1d.b8.trap \t[$s, \\{$x\\}], \\{$r\\};",
+ []>;
+def SUST_P_1D_B16_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
+ "sust.p.1d.b16.trap \t[$s, \\{$x\\}], \\{$r\\};",
+ []>;
+def SUST_P_1D_B32_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r),
+ "sust.p.1d.b32.trap \t[$s, \\{$x\\}], \\{$r\\};",
+ []>;
+def SUST_P_1D_V2B8_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+ "sust.p.1d.v2.b8.trap \t[$s, \\{$x\\}], \\{$r, $g\\};",
+ []>;
+def SUST_P_1D_V2B16_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+ "sust.p.1d.v2.b16.trap \t[$s, \\{$x\\}], \\{$r, $g\\};",
+ []>;
+def SUST_P_1D_V2B32_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g),
+ "sust.p.1d.v2.b32.trap \t[$s, \\{$x\\}], \\{$r, $g\\};",
+ []>;
+def SUST_P_1D_V4B8_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g,
+ Int16Regs:$b, Int16Regs:$a),
+ "sust.p.1d.v4.b8.trap \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};",
+ []>;
+def SUST_P_1D_V4B16_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g,
+ Int16Regs:$b, Int16Regs:$a),
+ "sust.p.1d.v4.b16.trap \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};",
+ []>;
+def SUST_P_1D_V4B32_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g,
+ Int32Regs:$b, Int32Regs:$a),
+ "sust.p.1d.v4.b32.trap \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};",
+ []>;
+
+
+def SUST_P_1D_ARRAY_B8_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r),
+ "sust.p.a1d.b8.trap \t[$s, \\{$idx, $x\\}], \\{$r\\};",
+ []>;
+def SUST_P_1D_ARRAY_B16_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r),
+ "sust.p.a1d.b16.trap \t[$s, \\{$idx, $x\\}], \\{$r\\};",
+ []>;
+def SUST_P_1D_ARRAY_B32_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r),
+ "sust.p.a1d.b32.trap \t[$s, \\{$idx, $x\\}], \\{$r\\};",
+ []>;
+def SUST_P_1D_ARRAY_V2B8_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r,
+ Int16Regs:$g),
+ "sust.p.a1d.v2.b8.trap \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};",
+ []>;
+def SUST_P_1D_ARRAY_V2B16_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r,
+ Int16Regs:$g),
+ "sust.p.a1d.v2.b16.trap \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};",
+ []>;
+def SUST_P_1D_ARRAY_V2B32_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r,
+ Int32Regs:$g),
+ "sust.p.a1d.v2.b32.trap \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};",
+ []>;
+def SUST_P_1D_ARRAY_V4B8_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r,
+ Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ "sust.p.a1d.v4.b8.trap \t[$s, \\{$idx, $x\\}], "
+ "\\{$r, $g, $b, $a\\};",
+ []>;
+def SUST_P_1D_ARRAY_V4B16_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r,
+ Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ "sust.p.a1d.v4.b16.trap \t[$s, \\{$idx, $x\\}], "
+ "\\{$r, $g, $b, $a\\};",
+ []>;
+def SUST_P_1D_ARRAY_V4B32_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r,
+ Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ "sust.p.a1d.v4.b32.trap \t[$s, \\{$idx, $x\\}], "
+ "\\{$r, $g, $b, $a\\};",
+ []>;
+
+
+def SUST_P_2D_B8_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+ "sust.p.2d.b8.trap \t[$s, \\{$x, $y\\}], \\{$r\\};",
+ []>;
+def SUST_P_2D_B16_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+ "sust.p.2d.b16.trap \t[$s, \\{$x, $y\\}], \\{$r\\};",
+ []>;
+def SUST_P_2D_B32_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r),
+ "sust.p.2d.b32.trap \t[$s, \\{$x, $y\\}], \\{$r\\};",
+ []>;
+def SUST_P_2D_V2B8_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r,
+ Int16Regs:$g),
+ "sust.p.2d.v2.b8.trap \t[$s, \\{$x, $y\\}], \\{$r, $g\\};",
+ []>;
+def SUST_P_2D_V2B16_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r,
+ Int16Regs:$g),
+ "sust.p.2d.v2.b16.trap \t[$s, \\{$x, $y\\}], \\{$r, $g\\};",
+ []>;
+def SUST_P_2D_V2B32_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r,
+ Int32Regs:$g),
+ "sust.p.2d.v2.b32.trap \t[$s, \\{$x, $y\\}], \\{$r, $g\\};",
+ []>;
+def SUST_P_2D_V4B8_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r,
+ Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ "sust.p.2d.v4.b8.trap \t[$s, \\{$x, $y\\}], "
+ "\\{$r, $g, $b, $a\\};",
+ []>;
+def SUST_P_2D_V4B16_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r,
+ Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ "sust.p.2d.v4.b16.trap \t[$s, \\{$x, $y\\}], "
+ "\\{$r, $g, $b, $a\\};",
+ []>;
+def SUST_P_2D_V4B32_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r,
+ Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ "sust.p.2d.v4.b32.trap \t[$s, \\{$x, $y\\}], "
+ "\\{$r, $g, $b, $a\\};",
+ []>;
+
+
+def SUST_P_2D_ARRAY_B8_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r),
+ "sust.p.a2d.b8.trap \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};",
+ []>;
+def SUST_P_2D_ARRAY_B16_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r),
+ "sust.p.a2d.b16.trap \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};",
+ []>;
+def SUST_P_2D_ARRAY_B32_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+ Int32Regs:$r),
+ "sust.p.a2d.b32.trap \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};",
+ []>;
+def SUST_P_2D_ARRAY_V2B8_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g),
+ "sust.p.a2d.v2.b8.trap \t[$s, \\{$idx, $x, $y, $y\\}], "
+ "\\{$r, $g\\};",
+ []>;
+def SUST_P_2D_ARRAY_V2B16_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g),
+ "sust.p.a2d.v2.b16.trap \t[$s, \\{$idx, $x, $y, $y\\}], "
+ "\\{$r, $g\\};",
+ []>;
+def SUST_P_2D_ARRAY_V2B32_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+ Int32Regs:$r, Int32Regs:$g),
+ "sust.p.a2d.v2.b32.trap \t[$s, \\{$idx, $x, $y, $y\\}], "
+ "\\{$r, $g\\};",
+ []>;
+def SUST_P_2D_ARRAY_V4B8_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ "sust.p.a2d.v4.b8.trap \t[$s, \\{$idx, $x, $y, $y\\}], "
+ "\\{$r, $g, $b, $a\\};",
+ []>;
+def SUST_P_2D_ARRAY_V4B16_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ "sust.p.a2d.v4.b16.trap \t[$s, \\{$idx, $x, $y, $y\\}], "
+ "\\{$r, $g, $b, $a\\};",
+ []>;
+def SUST_P_2D_ARRAY_V4B32_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+ Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ "sust.p.a2d.v4.b32.trap \t[$s, \\{$idx, $x, $y, $y\\}], "
+ "\\{$r, $g, $b, $a\\};",
+ []>;
+
+
+def SUST_P_3D_B8_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r),
+ "sust.p.3d.b8.trap \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};",
+ []>;
+def SUST_P_3D_B16_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r),
+ "sust.p.3d.b16.trap \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};",
+ []>;
+def SUST_P_3D_B32_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int32Regs:$r),
+ "sust.p.3d.b32.trap \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};",
+ []>;
+def SUST_P_3D_V2B8_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r, Int16Regs:$g),
+ "sust.p.3d.v2.b8.trap \t[$s, \\{$x, $y, $z, $z\\}], "
+ "\\{$r, $g\\};",
+ []>;
+def SUST_P_3D_V2B16_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r, Int16Regs:$g),
+ "sust.p.3d.v2.b16.trap \t[$s, \\{$x, $y, $z, $z\\}], "
+ "\\{$r, $g\\};",
+ []>;
+def SUST_P_3D_V2B32_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int32Regs:$r, Int32Regs:$g),
+ "sust.p.3d.v2.b32.trap \t[$s, \\{$x, $y, $z, $z\\}], "
+ "\\{$r, $g\\};",
+ []>;
+def SUST_P_3D_V4B8_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ "sust.p.3d.v4.b8.trap \t[$s, \\{$x, $y, $z, $z\\}], "
+ "\\{$r, $g, $b, $a\\};",
+ []>;
+def SUST_P_3D_V4B16_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ "sust.p.3d.v4.b16.trap \t[$s, \\{$x, $y, $z, $z\\}], "
+ "\\{$r, $g, $b, $a\\};",
+ []>;
+def SUST_P_3D_V4B32_TRAP
+ : NVPTXInst<(outs),
+ (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ "sust.p.3d.v4.b32.trap \t[$s, \\{$x, $y, $z, $z\\}], "
+ "\\{$r, $g, $b, $a\\};",
+ []>;
+}
+
+// Surface store instruction patterns
+// It is unclear why these patterns cannot simply be folded into the
+// instruction definitions above; TableGen reports type errors when they are.
+
+// .clamp variant
+def : Pat<(int_nvvm_sust_b_1d_i8_clamp
+ Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
+ (SUST_B_1D_B8_CLAMP Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_i16_clamp
+ Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
+ (SUST_B_1D_B16_CLAMP Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_i32_clamp
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$r),
+ (SUST_B_1D_B32_CLAMP Int64Regs:$s, Int32Regs:$x, Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_i64_clamp
+ Int64Regs:$s, Int32Regs:$x, Int64Regs:$r),
+ (SUST_B_1D_B64_CLAMP Int64Regs:$s, Int32Regs:$x, Int64Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v2i8_clamp
+ Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+ (SUST_B_1D_V2B8_CLAMP Int64Regs:$s, Int32Regs:$x,
+ Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v2i16_clamp
+ Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+ (SUST_B_1D_V2B16_CLAMP Int64Regs:$s, Int32Regs:$x,
+ Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v2i32_clamp
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g),
+ (SUST_B_1D_V2B32_CLAMP Int64Regs:$s, Int32Regs:$x,
+ Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v2i64_clamp
+ Int64Regs:$s, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g),
+ (SUST_B_1D_V2B64_CLAMP Int64Regs:$s, Int32Regs:$x,
+ Int64Regs:$r, Int64Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v4i8_clamp
+ Int64Regs:$s, Int32Regs:$x,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (SUST_B_1D_V4B8_CLAMP Int64Regs:$s, Int32Regs:$x,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v4i16_clamp
+ Int64Regs:$s, Int32Regs:$x,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (SUST_B_1D_V4B16_CLAMP Int64Regs:$s, Int32Regs:$x,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v4i32_clamp
+ Int64Regs:$s, Int32Regs:$x,
+ Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ (SUST_B_1D_V4B32_CLAMP Int64Regs:$s, Int32Regs:$x,
+ Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+
+
+def : Pat<(int_nvvm_sust_b_1d_array_i8_clamp
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r),
+ (SUST_B_1D_ARRAY_B8_CLAMP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_i16_clamp
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r),
+ (SUST_B_1D_ARRAY_B16_CLAMP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_i32_clamp
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r),
+ (SUST_B_1D_ARRAY_B32_CLAMP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_i64_clamp
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int64Regs:$r),
+ (SUST_B_1D_ARRAY_B64_CLAMP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int64Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v2i8_clamp
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+ (SUST_B_1D_ARRAY_V2B8_CLAMP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v2i16_clamp
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+ (SUST_B_1D_ARRAY_V2B16_CLAMP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v2i32_clamp
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g),
+ (SUST_B_1D_ARRAY_V2B32_CLAMP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v2i64_clamp
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g),
+ (SUST_B_1D_ARRAY_V2B64_CLAMP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int64Regs:$r, Int64Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v4i8_clamp
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (SUST_B_1D_ARRAY_V4B8_CLAMP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v4i16_clamp
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (SUST_B_1D_ARRAY_V4B16_CLAMP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v4i32_clamp
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ (SUST_B_1D_ARRAY_V4B32_CLAMP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+
+
+def : Pat<(int_nvvm_sust_b_2d_i8_clamp
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+ (SUST_B_2D_B8_CLAMP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_i16_clamp
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+ (SUST_B_2D_B16_CLAMP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_i32_clamp
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r),
+ (SUST_B_2D_B32_CLAMP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_i64_clamp
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r),
+ (SUST_B_2D_B64_CLAMP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int64Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v2i8_clamp
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g),
+ (SUST_B_2D_V2B8_CLAMP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v2i16_clamp
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g),
+ (SUST_B_2D_V2B16_CLAMP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v2i32_clamp
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g),
+ (SUST_B_2D_V2B32_CLAMP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v2i64_clamp
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r, Int64Regs:$g),
+ (SUST_B_2D_V2B64_CLAMP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int64Regs:$r, Int64Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v4i8_clamp
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (SUST_B_2D_V4B8_CLAMP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v4i16_clamp
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (SUST_B_2D_V4B16_CLAMP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v4i32_clamp
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ (SUST_B_2D_V4B32_CLAMP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+
+
+def : Pat<(int_nvvm_sust_b_2d_array_i8_clamp
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+ (SUST_B_2D_ARRAY_B8_CLAMP Int64Regs:$s,
+ Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_i16_clamp
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+ (SUST_B_2D_ARRAY_B16_CLAMP Int64Regs:$s,
+ Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_i32_clamp
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r),
+ (SUST_B_2D_ARRAY_B32_CLAMP Int64Regs:$s,
+ Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+ Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_i64_clamp
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r),
+ (SUST_B_2D_ARRAY_B64_CLAMP Int64Regs:$s,
+ Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+ Int64Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v2i8_clamp
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g),
+ (SUST_B_2D_ARRAY_V2B8_CLAMP Int64Regs:$s, Int32Regs:$l,
+ Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v2i16_clamp
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g),
+ (SUST_B_2D_ARRAY_V2B16_CLAMP Int64Regs:$s, Int32Regs:$l,
+ Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v2i32_clamp
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r,
+ Int32Regs:$g),
+ (SUST_B_2D_ARRAY_V2B32_CLAMP Int64Regs:$s, Int32Regs:$l,
+ Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v2i64_clamp
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r,
+ Int64Regs:$g),
+ (SUST_B_2D_ARRAY_V2B64_CLAMP Int64Regs:$s, Int32Regs:$l,
+ Int32Regs:$x, Int32Regs:$y, Int64Regs:$r, Int64Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v4i8_clamp
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (SUST_B_2D_ARRAY_V4B8_CLAMP Int64Regs:$s,
+ Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v4i16_clamp
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (SUST_B_2D_ARRAY_V4B16_CLAMP Int64Regs:$s,
+ Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v4i32_clamp
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+ Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ (SUST_B_2D_ARRAY_V4B32_CLAMP Int64Regs:$s, Int32Regs:$l,
+ Int32Regs:$x, Int32Regs:$y,
+ Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+
+
+def : Pat<(int_nvvm_sust_b_3d_i8_clamp
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r),
+ (SUST_B_3D_B8_CLAMP Int64Regs:$s,
+ Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_3d_i16_clamp
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r),
+ (SUST_B_3D_B16_CLAMP Int64Regs:$s,
+ Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_3d_i32_clamp
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int32Regs:$r),
+ (SUST_B_3D_B32_CLAMP Int64Regs:$s,
+ Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_3d_i64_clamp
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int64Regs:$r),
+ (SUST_B_3D_B64_CLAMP Int64Regs:$s,
+ Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int64Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v2i8_clamp
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r, Int16Regs:$g),
+ (SUST_B_3D_V2B8_CLAMP Int64Regs:$s,
+ Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v2i16_clamp
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r, Int16Regs:$g),
+ (SUST_B_3D_V2B16_CLAMP Int64Regs:$s,
+ Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v2i32_clamp
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int32Regs:$r, Int32Regs:$g),
+ (SUST_B_3D_V2B32_CLAMP Int64Regs:$s,
+ Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v2i64_clamp
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int64Regs:$r, Int64Regs:$g),
+ (SUST_B_3D_V2B64_CLAMP Int64Regs:$s,
+ Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int64Regs:$r, Int64Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v4i8_clamp
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (SUST_B_3D_V4B8_CLAMP Int64Regs:$s,
+ Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v4i16_clamp
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (SUST_B_3D_V4B16_CLAMP Int64Regs:$s,
+ Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v4i32_clamp
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ (SUST_B_3D_V4B32_CLAMP Int64Regs:$s,
+ Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+
+// .trap variant
+def : Pat<(int_nvvm_sust_b_1d_i8_trap
+ Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
+ (SUST_B_1D_B8_TRAP Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_i16_trap
+ Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
+ (SUST_B_1D_B16_TRAP Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_i32_trap
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$r),
+ (SUST_B_1D_B32_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_i64_trap
+ Int64Regs:$s, Int32Regs:$x, Int64Regs:$r),
+ (SUST_B_1D_B64_TRAP Int64Regs:$s, Int32Regs:$x, Int64Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v2i8_trap
+ Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+ (SUST_B_1D_V2B8_TRAP Int64Regs:$s, Int32Regs:$x,
+ Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v2i16_trap
+ Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+ (SUST_B_1D_V2B16_TRAP Int64Regs:$s, Int32Regs:$x,
+ Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v2i32_trap
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g),
+ (SUST_B_1D_V2B32_TRAP Int64Regs:$s, Int32Regs:$x,
+ Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v2i64_trap
+ Int64Regs:$s, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g),
+ (SUST_B_1D_V2B64_TRAP Int64Regs:$s, Int32Regs:$x,
+ Int64Regs:$r, Int64Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v4i8_trap
+ Int64Regs:$s, Int32Regs:$x,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (SUST_B_1D_V4B8_TRAP Int64Regs:$s, Int32Regs:$x,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v4i16_trap
+ Int64Regs:$s, Int32Regs:$x,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (SUST_B_1D_V4B16_TRAP Int64Regs:$s, Int32Regs:$x,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v4i32_trap
+ Int64Regs:$s, Int32Regs:$x,
+ Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ (SUST_B_1D_V4B32_TRAP Int64Regs:$s, Int32Regs:$x,
+ Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+
+
+def : Pat<(int_nvvm_sust_b_1d_array_i8_trap
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r),
+ (SUST_B_1D_ARRAY_B8_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_i16_trap
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r),
+ (SUST_B_1D_ARRAY_B16_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_i32_trap
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r),
+ (SUST_B_1D_ARRAY_B32_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_i64_trap
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int64Regs:$r),
+ (SUST_B_1D_ARRAY_B64_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int64Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v2i8_trap
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+ (SUST_B_1D_ARRAY_V2B8_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v2i16_trap
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+ (SUST_B_1D_ARRAY_V2B16_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v2i32_trap
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g),
+ (SUST_B_1D_ARRAY_V2B32_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v2i64_trap
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g),
+ (SUST_B_1D_ARRAY_V2B64_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int64Regs:$r, Int64Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v4i8_trap
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (SUST_B_1D_ARRAY_V4B8_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v4i16_trap
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (SUST_B_1D_ARRAY_V4B16_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v4i32_trap
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ (SUST_B_1D_ARRAY_V4B32_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+
+
+def : Pat<(int_nvvm_sust_b_2d_i8_trap
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+ (SUST_B_2D_B8_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_i16_trap
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+ (SUST_B_2D_B16_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_i32_trap
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r),
+ (SUST_B_2D_B32_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_i64_trap
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r),
+ (SUST_B_2D_B64_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int64Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v2i8_trap
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g),
+ (SUST_B_2D_V2B8_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v2i16_trap
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g),
+ (SUST_B_2D_V2B16_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v2i32_trap
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g),
+ (SUST_B_2D_V2B32_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v2i64_trap
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r, Int64Regs:$g),
+ (SUST_B_2D_V2B64_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int64Regs:$r, Int64Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v4i8_trap
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (SUST_B_2D_V4B8_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v4i16_trap
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (SUST_B_2D_V4B16_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v4i32_trap
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ (SUST_B_2D_V4B32_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+
+
+def : Pat<(int_nvvm_sust_b_2d_array_i8_trap
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+ (SUST_B_2D_ARRAY_B8_TRAP Int64Regs:$s,
+ Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_i16_trap
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+ (SUST_B_2D_ARRAY_B16_TRAP Int64Regs:$s,
+ Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_i32_trap
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r),
+ (SUST_B_2D_ARRAY_B32_TRAP Int64Regs:$s,
+ Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+ Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_i64_trap
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r),
+ (SUST_B_2D_ARRAY_B64_TRAP Int64Regs:$s,
+ Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+ Int64Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v2i8_trap
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g),
+ (SUST_B_2D_ARRAY_V2B8_TRAP Int64Regs:$s, Int32Regs:$l,
+ Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v2i16_trap
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g),
+ (SUST_B_2D_ARRAY_V2B16_TRAP Int64Regs:$s, Int32Regs:$l,
+ Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v2i32_trap
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r,
+ Int32Regs:$g),
+ (SUST_B_2D_ARRAY_V2B32_TRAP Int64Regs:$s, Int32Regs:$l,
+ Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v2i64_trap
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r,
+ Int64Regs:$g),
+ (SUST_B_2D_ARRAY_V2B64_TRAP Int64Regs:$s, Int32Regs:$l,
+ Int32Regs:$x, Int32Regs:$y, Int64Regs:$r, Int64Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v4i8_trap
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (SUST_B_2D_ARRAY_V4B8_TRAP Int64Regs:$s,
+ Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v4i16_trap
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (SUST_B_2D_ARRAY_V4B16_TRAP Int64Regs:$s,
+ Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v4i32_trap
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+ Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ (SUST_B_2D_ARRAY_V4B32_TRAP Int64Regs:$s, Int32Regs:$l,
+ Int32Regs:$x, Int32Regs:$y,
+ Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+
+
+def : Pat<(int_nvvm_sust_b_3d_i8_trap
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r),
+ (SUST_B_3D_B8_TRAP Int64Regs:$s,
+ Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_3d_i16_trap
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r),
+ (SUST_B_3D_B16_TRAP Int64Regs:$s,
+ Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_3d_i32_trap
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int32Regs:$r),
+ (SUST_B_3D_B32_TRAP Int64Regs:$s,
+ Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_3d_i64_trap
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int64Regs:$r),
+ (SUST_B_3D_B64_TRAP Int64Regs:$s,
+ Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int64Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v2i8_trap
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r, Int16Regs:$g),
+ (SUST_B_3D_V2B8_TRAP Int64Regs:$s,
+ Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v2i16_trap
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r, Int16Regs:$g),
+ (SUST_B_3D_V2B16_TRAP Int64Regs:$s,
+ Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v2i32_trap
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int32Regs:$r, Int32Regs:$g),
+ (SUST_B_3D_V2B32_TRAP Int64Regs:$s,
+ Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v2i64_trap
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int64Regs:$r, Int64Regs:$g),
+ (SUST_B_3D_V2B64_TRAP Int64Regs:$s,
+ Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int64Regs:$r, Int64Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v4i8_trap
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (SUST_B_3D_V4B8_TRAP Int64Regs:$s,
+ Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v4i16_trap
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (SUST_B_3D_V4B16_TRAP Int64Regs:$s,
+ Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v4i32_trap
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ (SUST_B_3D_V4B32_TRAP Int64Regs:$s,
+ Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+
+// .zero variant
+def : Pat<(int_nvvm_sust_b_1d_i8_zero
+ Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
+ (SUST_B_1D_B8_ZERO Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_i16_zero
+ Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
+ (SUST_B_1D_B16_ZERO Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_i32_zero
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$r),
+ (SUST_B_1D_B32_ZERO Int64Regs:$s, Int32Regs:$x, Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_i64_zero
+ Int64Regs:$s, Int32Regs:$x, Int64Regs:$r),
+ (SUST_B_1D_B64_ZERO Int64Regs:$s, Int32Regs:$x, Int64Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v2i8_zero
+ Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+ (SUST_B_1D_V2B8_ZERO Int64Regs:$s, Int32Regs:$x,
+ Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v2i16_zero
+ Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+ (SUST_B_1D_V2B16_ZERO Int64Regs:$s, Int32Regs:$x,
+ Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v2i32_zero
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g),
+ (SUST_B_1D_V2B32_ZERO Int64Regs:$s, Int32Regs:$x,
+ Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v2i64_zero
+ Int64Regs:$s, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g),
+ (SUST_B_1D_V2B64_ZERO Int64Regs:$s, Int32Regs:$x,
+ Int64Regs:$r, Int64Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v4i8_zero
+ Int64Regs:$s, Int32Regs:$x,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (SUST_B_1D_V4B8_ZERO Int64Regs:$s, Int32Regs:$x,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v4i16_zero
+ Int64Regs:$s, Int32Regs:$x,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (SUST_B_1D_V4B16_ZERO Int64Regs:$s, Int32Regs:$x,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v4i32_zero
+ Int64Regs:$s, Int32Regs:$x,
+ Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ (SUST_B_1D_V4B32_ZERO Int64Regs:$s, Int32Regs:$x,
+ Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+
+
+def : Pat<(int_nvvm_sust_b_1d_array_i8_zero
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r),
+ (SUST_B_1D_ARRAY_B8_ZERO Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_i16_zero
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r),
+ (SUST_B_1D_ARRAY_B16_ZERO Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_i32_zero
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r),
+ (SUST_B_1D_ARRAY_B32_ZERO Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_i64_zero
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int64Regs:$r),
+ (SUST_B_1D_ARRAY_B64_ZERO Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int64Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v2i8_zero
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+ (SUST_B_1D_ARRAY_V2B8_ZERO Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v2i16_zero
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+ (SUST_B_1D_ARRAY_V2B16_ZERO Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v2i32_zero
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g),
+ (SUST_B_1D_ARRAY_V2B32_ZERO Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v2i64_zero
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g),
+ (SUST_B_1D_ARRAY_V2B64_ZERO Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int64Regs:$r, Int64Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v4i8_zero
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (SUST_B_1D_ARRAY_V4B8_ZERO Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v4i16_zero
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (SUST_B_1D_ARRAY_V4B16_ZERO Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v4i32_zero
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ (SUST_B_1D_ARRAY_V4B32_ZERO Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+
+
+def : Pat<(int_nvvm_sust_b_2d_i8_zero
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+ (SUST_B_2D_B8_ZERO Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_i16_zero
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+ (SUST_B_2D_B16_ZERO Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_i32_zero
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r),
+ (SUST_B_2D_B32_ZERO Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_i64_zero
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r),
+ (SUST_B_2D_B64_ZERO Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int64Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v2i8_zero
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g),
+ (SUST_B_2D_V2B8_ZERO Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v2i16_zero
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g),
+ (SUST_B_2D_V2B16_ZERO Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v2i32_zero
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g),
+ (SUST_B_2D_V2B32_ZERO Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v2i64_zero
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r, Int64Regs:$g),
+ (SUST_B_2D_V2B64_ZERO Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int64Regs:$r, Int64Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v4i8_zero
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (SUST_B_2D_V4B8_ZERO Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v4i16_zero
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (SUST_B_2D_V4B16_ZERO Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v4i32_zero
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ (SUST_B_2D_V4B32_ZERO Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+
+
+def : Pat<(int_nvvm_sust_b_2d_array_i8_zero
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+ (SUST_B_2D_ARRAY_B8_ZERO Int64Regs:$s,
+ Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_i16_zero
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+ (SUST_B_2D_ARRAY_B16_ZERO Int64Regs:$s,
+ Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_i32_zero
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r),
+ (SUST_B_2D_ARRAY_B32_ZERO Int64Regs:$s,
+ Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+ Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_i64_zero
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r),
+ (SUST_B_2D_ARRAY_B64_ZERO Int64Regs:$s,
+ Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+ Int64Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v2i8_zero
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g),
+ (SUST_B_2D_ARRAY_V2B8_ZERO Int64Regs:$s, Int32Regs:$l,
+ Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v2i16_zero
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g),
+ (SUST_B_2D_ARRAY_V2B16_ZERO Int64Regs:$s, Int32Regs:$l,
+ Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v2i32_zero
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r,
+ Int32Regs:$g),
+ (SUST_B_2D_ARRAY_V2B32_ZERO Int64Regs:$s, Int32Regs:$l,
+ Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v2i64_zero
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r,
+ Int64Regs:$g),
+ (SUST_B_2D_ARRAY_V2B64_ZERO Int64Regs:$s, Int32Regs:$l,
+ Int32Regs:$x, Int32Regs:$y, Int64Regs:$r, Int64Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v4i8_zero
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (SUST_B_2D_ARRAY_V4B8_ZERO Int64Regs:$s,
+ Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v4i16_zero
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (SUST_B_2D_ARRAY_V4B16_ZERO Int64Regs:$s,
+ Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v4i32_zero
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+ Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ (SUST_B_2D_ARRAY_V4B32_ZERO Int64Regs:$s, Int32Regs:$l,
+ Int32Regs:$x, Int32Regs:$y,
+ Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+
+
+def : Pat<(int_nvvm_sust_b_3d_i8_zero
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r),
+ (SUST_B_3D_B8_ZERO Int64Regs:$s,
+ Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_3d_i16_zero
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r),
+ (SUST_B_3D_B16_ZERO Int64Regs:$s,
+ Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_3d_i32_zero
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int32Regs:$r),
+ (SUST_B_3D_B32_ZERO Int64Regs:$s,
+ Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_3d_i64_zero
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int64Regs:$r),
+ (SUST_B_3D_B64_ZERO Int64Regs:$s,
+ Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int64Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v2i8_zero
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r, Int16Regs:$g),
+ (SUST_B_3D_V2B8_ZERO Int64Regs:$s,
+ Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v2i16_zero
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r, Int16Regs:$g),
+ (SUST_B_3D_V2B16_ZERO Int64Regs:$s,
+ Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v2i32_zero
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int32Regs:$r, Int32Regs:$g),
+ (SUST_B_3D_V2B32_ZERO Int64Regs:$s,
+ Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v2i64_zero
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int64Regs:$r, Int64Regs:$g),
+ (SUST_B_3D_V2B64_ZERO Int64Regs:$s,
+ Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int64Regs:$r, Int64Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v4i8_zero
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (SUST_B_3D_V4B8_ZERO Int64Regs:$s,
+ Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v4i16_zero
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (SUST_B_3D_V4B16_ZERO Int64Regs:$s,
+ Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v4i32_zero
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ (SUST_B_3D_V4B32_ZERO Int64Regs:$s,
+ Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+
+
+
+def : Pat<(int_nvvm_sust_p_1d_i8_trap
+ Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
+ (SUST_P_1D_B8_TRAP Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_p_1d_i16_trap
+ Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
+ (SUST_P_1D_B16_TRAP Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_p_1d_i32_trap
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$r),
+ (SUST_P_1D_B32_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_p_1d_v2i8_trap
+ Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+ (SUST_P_1D_V2B8_TRAP Int64Regs:$s, Int32Regs:$x,
+ Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_p_1d_v2i16_trap
+ Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+ (SUST_P_1D_V2B16_TRAP Int64Regs:$s, Int32Regs:$x,
+ Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_p_1d_v2i32_trap
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g),
+ (SUST_P_1D_V2B32_TRAP Int64Regs:$s, Int32Regs:$x,
+ Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_p_1d_v4i8_trap
+ Int64Regs:$s, Int32Regs:$x,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (SUST_P_1D_V4B8_TRAP Int64Regs:$s, Int32Regs:$x,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_p_1d_v4i16_trap
+ Int64Regs:$s, Int32Regs:$x,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (SUST_P_1D_V4B16_TRAP Int64Regs:$s, Int32Regs:$x,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_p_1d_v4i32_trap
+ Int64Regs:$s, Int32Regs:$x,
+ Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ (SUST_P_1D_V4B32_TRAP Int64Regs:$s, Int32Regs:$x,
+ Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+
+
+def : Pat<(int_nvvm_sust_p_1d_array_i8_trap
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r),
+ (SUST_P_1D_ARRAY_B8_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_p_1d_array_i16_trap
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r),
+ (SUST_P_1D_ARRAY_B16_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_p_1d_array_i32_trap
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r),
+ (SUST_P_1D_ARRAY_B32_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_p_1d_array_v2i8_trap
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+ (SUST_P_1D_ARRAY_V2B8_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_p_1d_array_v2i16_trap
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+ (SUST_P_1D_ARRAY_V2B16_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_p_1d_array_v2i32_trap
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g),
+ (SUST_P_1D_ARRAY_V2B32_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_p_1d_array_v4i8_trap
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (SUST_P_1D_ARRAY_V4B8_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_p_1d_array_v4i16_trap
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (SUST_P_1D_ARRAY_V4B16_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_p_1d_array_v4i32_trap
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ (SUST_P_1D_ARRAY_V4B32_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+ Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+
+
+def : Pat<(int_nvvm_sust_p_2d_i8_trap
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+ (SUST_P_2D_B8_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_p_2d_i16_trap
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+ (SUST_P_2D_B16_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_p_2d_i32_trap
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r),
+ (SUST_P_2D_B32_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_p_2d_v2i8_trap
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g),
+ (SUST_P_2D_V2B8_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_p_2d_v2i16_trap
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g),
+ (SUST_P_2D_V2B16_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_p_2d_v2i32_trap
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g),
+ (SUST_P_2D_V2B32_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_p_2d_v4i8_trap
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (SUST_P_2D_V4B8_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_p_2d_v4i16_trap
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (SUST_P_2D_V4B16_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_p_2d_v4i32_trap
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ (SUST_P_2D_V4B32_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+ Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+
+
+def : Pat<(int_nvvm_sust_p_2d_array_i8_trap
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+ (SUST_P_2D_ARRAY_B8_TRAP Int64Regs:$s,
+ Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_p_2d_array_i16_trap
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+ (SUST_P_2D_ARRAY_B16_TRAP Int64Regs:$s,
+ Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_p_2d_array_i32_trap
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r),
+ (SUST_P_2D_ARRAY_B32_TRAP Int64Regs:$s,
+ Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+ Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_p_2d_array_v2i8_trap
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g),
+ (SUST_P_2D_ARRAY_V2B8_TRAP Int64Regs:$s, Int32Regs:$l,
+ Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_p_2d_array_v2i16_trap
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g),
+ (SUST_P_2D_ARRAY_V2B16_TRAP Int64Regs:$s, Int32Regs:$l,
+ Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_p_2d_array_v2i32_trap
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r,
+ Int32Regs:$g),
+ (SUST_P_2D_ARRAY_V2B32_TRAP Int64Regs:$s, Int32Regs:$l,
+ Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_p_2d_array_v4i8_trap
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (SUST_P_2D_ARRAY_V4B8_TRAP Int64Regs:$s,
+ Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_p_2d_array_v4i16_trap
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (SUST_P_2D_ARRAY_V4B16_TRAP Int64Regs:$s,
+ Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_p_2d_array_v4i32_trap
+ Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+ Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ (SUST_P_2D_ARRAY_V4B32_TRAP Int64Regs:$s, Int32Regs:$l,
+ Int32Regs:$x, Int32Regs:$y,
+ Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+
+
+def : Pat<(int_nvvm_sust_p_3d_i8_trap
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r),
+ (SUST_P_3D_B8_TRAP Int64Regs:$s,
+ Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_p_3d_i16_trap
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r),
+ (SUST_P_3D_B16_TRAP Int64Regs:$s,
+ Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_p_3d_i32_trap
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int32Regs:$r),
+ (SUST_P_3D_B32_TRAP Int64Regs:$s,
+ Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_p_3d_v2i8_trap
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r, Int16Regs:$g),
+ (SUST_P_3D_V2B8_TRAP Int64Regs:$s,
+ Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_p_3d_v2i16_trap
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r, Int16Regs:$g),
+ (SUST_P_3D_V2B16_TRAP Int64Regs:$s,
+ Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_p_3d_v2i32_trap
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int32Regs:$r, Int32Regs:$g),
+ (SUST_P_3D_V2B32_TRAP Int64Regs:$s,
+ Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_p_3d_v4i8_trap
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (SUST_P_3D_V4B8_TRAP Int64Regs:$s,
+ Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_p_3d_v4i16_trap
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+ (SUST_P_3D_V4B16_TRAP Int64Regs:$s,
+ Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_p_3d_v4i32_trap
+ Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+ (SUST_P_3D_V4B32_TRAP Int64Regs:$s,
+ Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+ Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+
+
+//===-- Old PTX Back-end Intrinsics ---------------------------------------===//
+
+// These intrinsics are kept to retain compatibility with the old PTX back-end.
+
+// PTX Special Purpose Register Accessor Intrinsics
+
+class PTX_READ_SPECIAL_REGISTER_R64<string regname, Intrinsic intop>
+ : NVPTXInst<(outs Int64Regs:$d), (ins),
+ !strconcat(!strconcat("mov.u64\t$d, %", regname), ";"),
+ [(set Int64Regs:$d, (intop))]>;
+
+class PTX_READ_SPECIAL_REGISTER_R32<string regname, Intrinsic intop>
+ : NVPTXInst<(outs Int32Regs:$d), (ins),
+ !strconcat(!strconcat("mov.u32\t$d, %", regname), ";"),
+ [(set Int32Regs:$d, (intop))]>;
+
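+// As a concrete reading of the two classes above (an illustrative note derived
+// from the !strconcat): PTX_READ_TID_X below instantiates the 32-bit class with
+// regname "tid.x", so it matches int_ptx_read_tid_x and prints as the single
+// PTX instruction "mov.u32 $d, %tid.x;", with $d filled in by register
+// allocation.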
+// TODO Add read vector-version of special registers
+
+def PTX_READ_TID_X : PTX_READ_SPECIAL_REGISTER_R32<"tid.x",
+ int_ptx_read_tid_x>;
+def PTX_READ_TID_Y : PTX_READ_SPECIAL_REGISTER_R32<"tid.y",
+ int_ptx_read_tid_y>;
+def PTX_READ_TID_Z : PTX_READ_SPECIAL_REGISTER_R32<"tid.z",
+ int_ptx_read_tid_z>;
+def PTX_READ_TID_W : PTX_READ_SPECIAL_REGISTER_R32<"tid.w",
+ int_ptx_read_tid_w>;
+
+def PTX_READ_NTID_X : PTX_READ_SPECIAL_REGISTER_R32<"ntid.x",
+ int_ptx_read_ntid_x>;
+def PTX_READ_NTID_Y : PTX_READ_SPECIAL_REGISTER_R32<"ntid.y",
+ int_ptx_read_ntid_y>;
+def PTX_READ_NTID_Z : PTX_READ_SPECIAL_REGISTER_R32<"ntid.z",
+ int_ptx_read_ntid_z>;
+def PTX_READ_NTID_W : PTX_READ_SPECIAL_REGISTER_R32<"ntid.w",
+ int_ptx_read_ntid_w>;
+
+def PTX_READ_LANEID : PTX_READ_SPECIAL_REGISTER_R32<"laneid",
+ int_ptx_read_laneid>;
+def PTX_READ_WARPID : PTX_READ_SPECIAL_REGISTER_R32<"warpid",
+ int_ptx_read_warpid>;
+def PTX_READ_NWARPID : PTX_READ_SPECIAL_REGISTER_R32<"nwarpid",
+ int_ptx_read_nwarpid>;
+
+def PTX_READ_CTAID_X : PTX_READ_SPECIAL_REGISTER_R32<"ctaid.x",
+ int_ptx_read_ctaid_x>;
+def PTX_READ_CTAID_Y : PTX_READ_SPECIAL_REGISTER_R32<"ctaid.y",
+ int_ptx_read_ctaid_y>;
+def PTX_READ_CTAID_Z : PTX_READ_SPECIAL_REGISTER_R32<"ctaid.z",
+ int_ptx_read_ctaid_z>;
+def PTX_READ_CTAID_W : PTX_READ_SPECIAL_REGISTER_R32<"ctaid.w",
+ int_ptx_read_ctaid_w>;
+
+def PTX_READ_NCTAID_X : PTX_READ_SPECIAL_REGISTER_R32<"nctaid.x",
+ int_ptx_read_nctaid_x>;
+def PTX_READ_NCTAID_Y : PTX_READ_SPECIAL_REGISTER_R32<"nctaid.y",
+ int_ptx_read_nctaid_y>;
+def PTX_READ_NCTAID_Z : PTX_READ_SPECIAL_REGISTER_R32<"nctaid.z",
+ int_ptx_read_nctaid_z>;
+def PTX_READ_NCTAID_W : PTX_READ_SPECIAL_REGISTER_R32<"nctaid.w",
+ int_ptx_read_nctaid_w>;
+
+def PTX_READ_SMID : PTX_READ_SPECIAL_REGISTER_R32<"smid",
+ int_ptx_read_smid>;
+def PTX_READ_NSMID : PTX_READ_SPECIAL_REGISTER_R32<"nsmid",
+ int_ptx_read_nsmid>;
+def PTX_READ_GRIDID : PTX_READ_SPECIAL_REGISTER_R32<"gridid",
+ int_ptx_read_gridid>;
+
+def PTX_READ_LANEMASK_EQ
+ : PTX_READ_SPECIAL_REGISTER_R32<"lanemask_eq", int_ptx_read_lanemask_eq>;
+def PTX_READ_LANEMASK_LE
+ : PTX_READ_SPECIAL_REGISTER_R32<"lanemask_le", int_ptx_read_lanemask_le>;
+def PTX_READ_LANEMASK_LT
+ : PTX_READ_SPECIAL_REGISTER_R32<"lanemask_lt", int_ptx_read_lanemask_lt>;
+def PTX_READ_LANEMASK_GE
+ : PTX_READ_SPECIAL_REGISTER_R32<"lanemask_ge", int_ptx_read_lanemask_ge>;
+def PTX_READ_LANEMASK_GT
+ : PTX_READ_SPECIAL_REGISTER_R32<"lanemask_gt", int_ptx_read_lanemask_gt>;
+
+def PTX_READ_CLOCK
+ : PTX_READ_SPECIAL_REGISTER_R32<"clock", int_ptx_read_clock>;
+def PTX_READ_CLOCK64
+ : PTX_READ_SPECIAL_REGISTER_R64<"clock64", int_ptx_read_clock64>;
+
+def PTX_READ_PM0 : PTX_READ_SPECIAL_REGISTER_R32<"pm0", int_ptx_read_pm0>;
+def PTX_READ_PM1 : PTX_READ_SPECIAL_REGISTER_R32<"pm1", int_ptx_read_pm1>;
+def PTX_READ_PM2 : PTX_READ_SPECIAL_REGISTER_R32<"pm2", int_ptx_read_pm2>;
+def PTX_READ_PM3 : PTX_READ_SPECIAL_REGISTER_R32<"pm3", int_ptx_read_pm3>;
+
+// PTX Parallel Synchronization and Communication Intrinsics
+
+def PTX_BAR_SYNC : NVPTXInst<(outs), (ins i32imm:$i), "bar.sync\t$i;",
+ [(int_ptx_bar_sync imm:$i)]>;
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp
new file mode 100644
index 000000000000..f0c3663551bc
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp
@@ -0,0 +1,205 @@
+//===- NVPTXLowerAggrCopies.cpp - Lower aggregate copies to loops -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// Lower aggregate copies and memset, memcpy, and memmove intrinsics into loops
+// when the size is large or is not a compile-time constant.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTXLowerAggrCopies.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+
+using namespace llvm;
+
+namespace llvm { FunctionPass *createLowerAggrCopies(); }
+
+char NVPTXLowerAggrCopies::ID = 0;
+
+// Lower MemTransferInst or load-store pair to loop
+static void convertTransferToLoop(
+ Instruction *splitAt, Value *srcAddr, Value *dstAddr, Value *len,
+ //unsigned numLoads,
+ bool srcVolatile, bool dstVolatile, LLVMContext &Context, Function &F) {
+ Type *indType = len->getType();
+
+ BasicBlock *origBB = splitAt->getParent();
+ BasicBlock *newBB = splitAt->getParent()->splitBasicBlock(splitAt, "split");
+ BasicBlock *loopBB = BasicBlock::Create(Context, "loadstoreloop", &F, newBB);
+
+ origBB->getTerminator()->setSuccessor(0, loopBB);
+ IRBuilder<> builder(origBB, origBB->getTerminator());
+
+ // srcAddr and dstAddr are expected to be pointer types,
+ // so no check is made here.
+ unsigned srcAS = dyn_cast<PointerType>(srcAddr->getType())->getAddressSpace();
+ unsigned dstAS = dyn_cast<PointerType>(dstAddr->getType())->getAddressSpace();
+
+ // Cast pointers to (char *)
+ srcAddr = builder.CreateBitCast(srcAddr, Type::getInt8PtrTy(Context, srcAS));
+ dstAddr = builder.CreateBitCast(dstAddr, Type::getInt8PtrTy(Context, dstAS));
+
+ IRBuilder<> loop(loopBB);
+ // The loop index (ind) is a phi node.
+ PHINode *ind = loop.CreatePHI(indType, 0);
+ // Incoming value for ind is 0
+ ind->addIncoming(ConstantInt::get(indType, 0), origBB);
+
+ // load from srcAddr+ind
+ Value *val = loop.CreateLoad(loop.CreateGEP(srcAddr, ind), srcVolatile);
+ // store at dstAddr+ind
+ loop.CreateStore(val, loop.CreateGEP(dstAddr, ind), dstVolatile);
+
+ // The value for ind coming from backedge is (ind + 1)
+ Value *newind = loop.CreateAdd(ind, ConstantInt::get(indType, 1));
+ ind->addIncoming(newind, loopBB);
+
+ loop.CreateCondBr(loop.CreateICmpULT(newind, len), loopBB, newBB);
+}
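+// For reference, the control flow built above is equivalent to the following
+// byte-copy loop (a sketch only; it assumes a non-zero length, matching the
+// unconditional branch into the loop emitted above):
+//
+//   uint64_t ind = 0;            // the PHI node 'ind', seeded with 0
+//   do {
+//     dst[ind] = src[ind];       // load from srcAddr+ind, store to dstAddr+ind
+//     ++ind;                     // backedge value newind = ind + 1
+//   } while (ind < len);         // loop.CreateICmpULT(newind, len)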
+
+// Lower MemSetInst to loop
+static void convertMemSetToLoop(Instruction *splitAt, Value *dstAddr,
+ Value *len, Value *val, LLVMContext &Context,
+ Function &F) {
+ BasicBlock *origBB = splitAt->getParent();
+ BasicBlock *newBB = splitAt->getParent()->splitBasicBlock(splitAt, "split");
+ BasicBlock *loopBB = BasicBlock::Create(Context, "loadstoreloop", &F, newBB);
+
+ origBB->getTerminator()->setSuccessor(0, loopBB);
+ IRBuilder<> builder(origBB, origBB->getTerminator());
+
+ unsigned dstAS = dyn_cast<PointerType>(dstAddr->getType())->getAddressSpace();
+
+ // Cast pointer to the type of value getting stored
+ dstAddr =
+ builder.CreateBitCast(dstAddr, PointerType::get(val->getType(), dstAS));
+
+ IRBuilder<> loop(loopBB);
+ PHINode *ind = loop.CreatePHI(len->getType(), 0);
+ ind->addIncoming(ConstantInt::get(len->getType(), 0), origBB);
+
+ loop.CreateStore(val, loop.CreateGEP(dstAddr, ind), false);
+
+ Value *newind = loop.CreateAdd(ind, ConstantInt::get(len->getType(), 1));
+ ind->addIncoming(newind, loopBB);
+
+ loop.CreateCondBr(loop.CreateICmpULT(newind, len), loopBB, newBB);
+}
+
+bool NVPTXLowerAggrCopies::runOnFunction(Function &F) {
+ SmallVector<LoadInst *, 4> aggrLoads;
+ SmallVector<MemTransferInst *, 4> aggrMemcpys;
+ SmallVector<MemSetInst *, 4> aggrMemsets;
+
+ const DataLayout *DL = &getAnalysis<DataLayoutPass>().getDataLayout();
+ LLVMContext &Context = F.getParent()->getContext();
+
+ //
+ // Collect all the aggrLoads, aggrMemcpys and aggrMemsets.
+ //
+ //const BasicBlock *firstBB = &F.front(); // first BB in F
+ for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) {
+ //BasicBlock *bb = BI;
+ for (BasicBlock::iterator II = BI->begin(), IE = BI->end(); II != IE;
+ ++II) {
+ if (LoadInst *load = dyn_cast<LoadInst>(II)) {
+
+ if (load->hasOneUse() == false)
+ continue;
+
+ if (DL->getTypeStoreSize(load->getType()) < MaxAggrCopySize)
+ continue;
+
+ User *use = load->user_back();
+ if (StoreInst *store = dyn_cast<StoreInst>(use)) {
+ if (store->getOperand(0) != load) //getValueOperand
+ continue;
+ aggrLoads.push_back(load);
+ }
+ } else if (MemTransferInst *intr = dyn_cast<MemTransferInst>(II)) {
+ Value *len = intr->getLength();
+ // If the number of bytes being copied is greater than or equal to
+ // MaxAggrCopySize, lower the copy to a loop
+ if (ConstantInt *len_int = dyn_cast<ConstantInt>(len)) {
+ if (len_int->getZExtValue() >= MaxAggrCopySize) {
+ aggrMemcpys.push_back(intr);
+ }
+ } else {
+ // Turn a variable-length memcpy/memmove into a loop
+ aggrMemcpys.push_back(intr);
+ }
+ } else if (MemSetInst *memsetintr = dyn_cast<MemSetInst>(II)) {
+ Value *len = memsetintr->getLength();
+ if (ConstantInt *len_int = dyn_cast<ConstantInt>(len)) {
+ if (len_int->getZExtValue() >= MaxAggrCopySize) {
+ aggrMemsets.push_back(memsetintr);
+ }
+ } else {
+ // Turn a variable-length memset into a loop
+ aggrMemsets.push_back(memsetintr);
+ }
+ }
+ }
+ }
+ if ((aggrLoads.size() == 0) && (aggrMemcpys.size() == 0) &&
+ (aggrMemsets.size() == 0))
+ return false;
+
+ //
+ // Do the transformation of an aggr load/copy/set to a loop
+ //
+ for (unsigned i = 0, e = aggrLoads.size(); i != e; ++i) {
+ LoadInst *load = aggrLoads[i];
+ StoreInst *store = dyn_cast<StoreInst>(*load->user_begin());
+ Value *srcAddr = load->getOperand(0);
+ Value *dstAddr = store->getOperand(1);
+ unsigned numLoads = DL->getTypeStoreSize(load->getType());
+ Value *len = ConstantInt::get(Type::getInt32Ty(Context), numLoads);
+
+ convertTransferToLoop(store, srcAddr, dstAddr, len, load->isVolatile(),
+ store->isVolatile(), Context, F);
+
+ store->eraseFromParent();
+ load->eraseFromParent();
+ }
+
+ for (unsigned i = 0, e = aggrMemcpys.size(); i != e; ++i) {
+ MemTransferInst *cpy = aggrMemcpys[i];
+ Value *len = cpy->getLength();
+ // The LLVM 2.7 version of memcpy does not have a volatile operand yet,
+ // so the copy is optimistically treated as non-volatile to avoid emitting
+ // unnecessary st.volatile instructions in the PTX output.
+ convertTransferToLoop(cpy, cpy->getSource(), cpy->getDest(), len, false,
+ false, Context, F);
+ cpy->eraseFromParent();
+ }
+
+ for (unsigned i = 0, e = aggrMemsets.size(); i != e; ++i) {
+ MemSetInst *memsetinst = aggrMemsets[i];
+ Value *len = memsetinst->getLength();
+ Value *val = memsetinst->getValue();
+ convertMemSetToLoop(memsetinst, memsetinst->getDest(), len, val, Context,
+ F);
+ memsetinst->eraseFromParent();
+ }
+
+ return true;
+}
+
+FunctionPass *llvm::createLowerAggrCopies() {
+ return new NVPTXLowerAggrCopies();
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.h b/contrib/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.h
new file mode 100644
index 000000000000..5ec1fc969687
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.h
@@ -0,0 +1,48 @@
+//===-- llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.h ------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the NVIDIA-specific lowering of
+// aggregate copies.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef NVPTX_LOWER_AGGR_COPIES_H
+#define NVPTX_LOWER_AGGR_COPIES_H
+
+#include "llvm/CodeGen/MachineFunctionAnalysis.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Pass.h"
+
+namespace llvm {
+
+// The actual transformation, implemented as a FunctionPass.
+struct NVPTXLowerAggrCopies : public FunctionPass {
+ static char ID;
+
+ NVPTXLowerAggrCopies() : FunctionPass(ID) {}
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<DataLayoutPass>();
+ AU.addPreserved("stack-protector");
+ AU.addPreserved<MachineFunctionAnalysis>();
+ }
+
+ bool runOnFunction(Function &F) override;
+
+ static const unsigned MaxAggrCopySize = 128;
+
+ const char *getPassName() const override {
+ return "Lower aggregate copies/intrinsics into loops";
+ }
+};
+
+extern FunctionPass *createLowerAggrCopies();
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXMCExpr.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXMCExpr.cpp
new file mode 100644
index 000000000000..137248b19a68
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXMCExpr.cpp
@@ -0,0 +1,47 @@
+//===-- NVPTXMCExpr.cpp - NVPTX specific MC expression classes ------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTXMCExpr.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "nvptx-mcexpr"
+
+const NVPTXFloatMCExpr*
+NVPTXFloatMCExpr::Create(VariantKind Kind, APFloat Flt, MCContext &Ctx) {
+ return new (Ctx) NVPTXFloatMCExpr(Kind, Flt);
+}
+
+void NVPTXFloatMCExpr::PrintImpl(raw_ostream &OS) const {
+ bool Ignored;
+ unsigned NumHex;
+ APFloat APF = getAPFloat();
+
+ switch (Kind) {
+ default: llvm_unreachable("Invalid kind!");
+ case VK_NVPTX_SINGLE_PREC_FLOAT:
+ OS << "0f";
+ NumHex = 8;
+ APF.convert(APFloat::IEEEsingle, APFloat::rmNearestTiesToEven, &Ignored);
+ break;
+ case VK_NVPTX_DOUBLE_PREC_FLOAT:
+ OS << "0d";
+ NumHex = 16;
+ APF.convert(APFloat::IEEEdouble, APFloat::rmNearestTiesToEven, &Ignored);
+ break;
+ }
+
+ APInt API = APF.bitcastToAPInt();
+ std::string HexStr(utohexstr(API.getZExtValue()));
+ if (HexStr.length() < NumHex)
+ OS << std::string(NumHex - HexStr.length(), '0');
+ OS << utohexstr(API.getZExtValue());
+}
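+// Worked example of the encoding above (illustrative, standalone C++):
+//
+//   #include <cstdint>
+//   #include <cstring>
+//   #include <cstdio>
+//   int main() {
+//     float f = 1.0f;
+//     uint32_t bits;
+//     std::memcpy(&bits, &f, sizeof bits);
+//     std::printf("0f%08X\n", bits);   // prints 0f3F800000
+//   }
+//
+// Likewise, 1.0 in double precision has the bit pattern 0x3FF0000000000000 and
+// prints as "0d3FF0000000000000"; the zero-padding above guarantees the full
+// 8 or 16 hex digits even when the leading nibbles of the pattern are zero.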
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXMCExpr.h b/contrib/llvm/lib/Target/NVPTX/NVPTXMCExpr.h
new file mode 100644
index 000000000000..554764930a9e
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXMCExpr.h
@@ -0,0 +1,83 @@
+//===-- NVPTXMCExpr.h - NVPTX specific MC expression classes ----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+// Modeled after ARMMCExpr
+
+#ifndef NVPTXMCEXPR_H
+#define NVPTXMCEXPR_H
+
+#include "llvm/ADT/APFloat.h"
+#include "llvm/MC/MCExpr.h"
+
+namespace llvm {
+
+class NVPTXFloatMCExpr : public MCTargetExpr {
+public:
+ enum VariantKind {
+ VK_NVPTX_None,
+ VK_NVPTX_SINGLE_PREC_FLOAT, // FP constant in single-precision
+ VK_NVPTX_DOUBLE_PREC_FLOAT // FP constant in double-precision
+ };
+
+private:
+ const VariantKind Kind;
+ const APFloat Flt;
+
+ explicit NVPTXFloatMCExpr(VariantKind _Kind, APFloat _Flt)
+ : Kind(_Kind), Flt(_Flt) {}
+
+public:
+ /// @name Construction
+ /// @{
+
+ static const NVPTXFloatMCExpr *Create(VariantKind Kind, APFloat Flt,
+ MCContext &Ctx);
+
+ static const NVPTXFloatMCExpr *CreateConstantFPSingle(APFloat Flt,
+ MCContext &Ctx) {
+ return Create(VK_NVPTX_SINGLE_PREC_FLOAT, Flt, Ctx);
+ }
+
+ static const NVPTXFloatMCExpr *CreateConstantFPDouble(APFloat Flt,
+ MCContext &Ctx) {
+ return Create(VK_NVPTX_DOUBLE_PREC_FLOAT, Flt, Ctx);
+ }
+
+ /// @}
+ /// @name Accessors
+ /// @{
+
+ /// getKind - Get the kind of this expression.
+ VariantKind getKind() const { return Kind; }
+
+ /// getAPFloat - Get the APFloat value of this expression.
+ APFloat getAPFloat() const { return Flt; }
+
+ /// @}
+
+ void PrintImpl(raw_ostream &OS) const override;
+ bool EvaluateAsRelocatableImpl(MCValue &Res,
+ const MCAsmLayout *Layout) const override {
+ return false;
+ }
+ void visitUsedExpr(MCStreamer &Streamer) const override {};
+ const MCSection *FindAssociatedSection() const override {
+ return nullptr;
+ }
+
+ // There are no TLS NVPTXMCExprs at the moment.
+ void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const override {}
+
+ static bool classof(const MCExpr *E) {
+ return E->getKind() == MCExpr::Target;
+ }
+};
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXMachineFunctionInfo.h b/contrib/llvm/lib/Target/NVPTX/NVPTXMachineFunctionInfo.h
new file mode 100644
index 000000000000..67fb39050797
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXMachineFunctionInfo.h
@@ -0,0 +1,46 @@
+//===-- NVPTXMachineFunctionInfo.h - NVPTX-specific Function Info --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class is attached to a MachineFunction instance and tracks target-
+// dependent information
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineFunction.h"
+
+namespace llvm {
+class NVPTXMachineFunctionInfo : public MachineFunctionInfo {
+private:
+ /// Stores a mapping from index to symbol name for removing image handles
+ /// on Fermi.
+ SmallVector<std::string, 8> ImageHandleList;
+
+public:
+ NVPTXMachineFunctionInfo(MachineFunction &MF) {}
+
+ /// Returns the index for the symbol \p Symbol. If the symbol was previously
+ /// added, the same index is returned. Otherwise, the symbol is added and the
+ /// new index is returned.
+ unsigned getImageHandleSymbolIndex(const char *Symbol) {
+ // Is the symbol already present?
+ for (unsigned i = 0, e = ImageHandleList.size(); i != e; ++i)
+ if (ImageHandleList[i] == std::string(Symbol))
+ return i;
+ // Nope, insert it
+ ImageHandleList.push_back(Symbol);
+ return ImageHandleList.size()-1;
+ }
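+ // A minimal usage sketch (hypothetical symbol names, for illustration only):
+ //   getImageHandleSymbolIndex("kernel_param_0");  // returns 0 (new entry)
+ //   getImageHandleSymbolIndex("kernel_param_1");  // returns 1 (new entry)
+ //   getImageHandleSymbolIndex("kernel_param_0");  // returns 0 again (reused)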
+
+ /// Returns the symbol name at the given index.
+ const char *getImageHandleSymbol(unsigned Idx) const {
+ assert(ImageHandleList.size() > Idx && "Bad index");
+ return ImageHandleList[Idx].c_str();
+ }
+};
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp
new file mode 100644
index 000000000000..348ab0c4bf14
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp
@@ -0,0 +1,227 @@
+//===-- NVPTXPrologEpilogPass.cpp - NVPTX prolog/epilog inserter ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a copy of the generic LLVM PrologEpilogInserter pass, modified
+// to remove unneeded functionality and to handle virtual registers. Most code
+// here is a copy of PrologEpilogInserter.cpp.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTX.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "nvptx-prolog-epilog"
+
+namespace {
+class NVPTXPrologEpilogPass : public MachineFunctionPass {
+public:
+ static char ID;
+ NVPTXPrologEpilogPass() : MachineFunctionPass(ID) {}
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+private:
+ void calculateFrameObjectOffsets(MachineFunction &Fn);
+};
+}
+
+MachineFunctionPass *llvm::createNVPTXPrologEpilogPass() {
+ return new NVPTXPrologEpilogPass();
+}
+
+char NVPTXPrologEpilogPass::ID = 0;
+
+bool NVPTXPrologEpilogPass::runOnMachineFunction(MachineFunction &MF) {
+ const TargetMachine &TM = MF.getTarget();
+ const TargetFrameLowering &TFI = *TM.getFrameLowering();
+ const TargetRegisterInfo &TRI = *TM.getRegisterInfo();
+ bool Modified = false;
+
+ calculateFrameObjectOffsets(MF);
+
+ for (MachineFunction::iterator BB = MF.begin(), E = MF.end(); BB != E; ++BB) {
+ for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ++I) {
+ MachineInstr *MI = I;
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ if (!MI->getOperand(i).isFI())
+ continue;
+ TRI.eliminateFrameIndex(MI, 0, i, nullptr);
+ Modified = true;
+ }
+ }
+ }
+
+ // Add function prolog/epilog
+ TFI.emitPrologue(MF);
+
+ for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) {
+ // If last instruction is a return instruction, add an epilogue
+ if (!I->empty() && I->back().isReturn())
+ TFI.emitEpilogue(MF, *I);
+ }
+
+ return Modified;
+}
+
+/// AdjustStackOffset - Helper function used to adjust the stack frame offset.
+static inline void
+AdjustStackOffset(MachineFrameInfo *MFI, int FrameIdx,
+ bool StackGrowsDown, int64_t &Offset,
+ unsigned &MaxAlign) {
+ // If the stack grows down, add the object size to find the lowest address.
+ if (StackGrowsDown)
+ Offset += MFI->getObjectSize(FrameIdx);
+
+ unsigned Align = MFI->getObjectAlignment(FrameIdx);
+
+ // If the alignment of this object is greater than that of the stack, then
+ // increase the stack alignment to match.
+ MaxAlign = std::max(MaxAlign, Align);
+
+ // Adjust to alignment boundary.
+ Offset = (Offset + Align - 1) / Align * Align;
+
+ if (StackGrowsDown) {
+ DEBUG(dbgs() << "alloc FI(" << FrameIdx << ") at SP[" << -Offset << "]\n");
+ MFI->setObjectOffset(FrameIdx, -Offset); // Set the computed offset
+ } else {
+ DEBUG(dbgs() << "alloc FI(" << FrameIdx << ") at SP[" << Offset << "]\n");
+ MFI->setObjectOffset(FrameIdx, Offset);
+ Offset += MFI->getObjectSize(FrameIdx);
+ }
+}
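+// Worked example (assuming the stack grows down): with an incoming Offset of
+// 20, an 8-byte object, and 8-byte alignment, the object size is added first
+// (28), the result is rounded up to the alignment boundary (32), and the
+// object is recorded at SP[-32] before Offset is carried forward as 32.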
+
+void
+NVPTXPrologEpilogPass::calculateFrameObjectOffsets(MachineFunction &Fn) {
+ const TargetFrameLowering &TFI = *Fn.getTarget().getFrameLowering();
+ const TargetRegisterInfo *RegInfo = Fn.getTarget().getRegisterInfo();
+
+ bool StackGrowsDown =
+ TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown;
+
+ // Loop over all of the stack objects, assigning sequential addresses...
+ MachineFrameInfo *MFI = Fn.getFrameInfo();
+
+ // Start at the beginning of the local area.
+ // The Offset is the distance from the stack top in the direction
+ // of stack growth -- so it's always nonnegative.
+ int LocalAreaOffset = TFI.getOffsetOfLocalArea();
+ if (StackGrowsDown)
+ LocalAreaOffset = -LocalAreaOffset;
+ assert(LocalAreaOffset >= 0
+ && "Local area offset should be in direction of stack growth");
+ int64_t Offset = LocalAreaOffset;
+
+ // If there are fixed sized objects that are preallocated in the local area,
+ // non-fixed objects can't be allocated right at the start of local area.
+ // We currently don't support filling in holes in between fixed sized
+ // objects, so we adjust 'Offset' to point to the end of last fixed sized
+ // preallocated object.
+ for (int i = MFI->getObjectIndexBegin(); i != 0; ++i) {
+ int64_t FixedOff;
+ if (StackGrowsDown) {
+ // The maximum distance from the stack pointer is at the lower address of
+ // the object -- which is given by its offset. For a downward-growing stack
+ // the offset is negative, so we negate it to get the distance.
+ FixedOff = -MFI->getObjectOffset(i);
+ } else {
+ // The maximum distance from the stack pointer is at the upper
+ // address of the object.
+ FixedOff = MFI->getObjectOffset(i) + MFI->getObjectSize(i);
+ }
+ if (FixedOff > Offset) Offset = FixedOff;
+ }
+
+ // NOTE: We do not have a call stack
+
+ unsigned MaxAlign = MFI->getMaxAlignment();
+
+ // No scavenger
+
+ // FIXME: Once this is working, then enable flag will change to a target
+ // check for whether the frame is large enough to want to use virtual
+ // frame index registers. Functions which don't want/need this optimization
+ // will continue to use the existing code path.
+ if (MFI->getUseLocalStackAllocationBlock()) {
+ unsigned Align = MFI->getLocalFrameMaxAlign();
+
+ // Adjust to alignment boundary.
+ Offset = (Offset + Align - 1) / Align * Align;
+
+ DEBUG(dbgs() << "Local frame base offset: " << Offset << "\n");
+
+ // Resolve offsets for objects in the local block.
+ for (unsigned i = 0, e = MFI->getLocalFrameObjectCount(); i != e; ++i) {
+ std::pair<int, int64_t> Entry = MFI->getLocalFrameObjectMap(i);
+ int64_t FIOffset = (StackGrowsDown ? -Offset : Offset) + Entry.second;
+ DEBUG(dbgs() << "alloc FI(" << Entry.first << ") at SP[" <<
+ FIOffset << "]\n");
+ MFI->setObjectOffset(Entry.first, FIOffset);
+ }
+ // Allocate the local block
+ Offset += MFI->getLocalFrameSize();
+
+ MaxAlign = std::max(Align, MaxAlign);
+ }
+
+ // No stack protector
+
+ // Then assign frame offsets to stack objects that are not used to spill
+ // callee saved registers.
+ for (unsigned i = 0, e = MFI->getObjectIndexEnd(); i != e; ++i) {
+ if (MFI->isObjectPreAllocated(i) &&
+ MFI->getUseLocalStackAllocationBlock())
+ continue;
+ if (MFI->isDeadObjectIndex(i))
+ continue;
+
+ AdjustStackOffset(MFI, i, StackGrowsDown, Offset, MaxAlign);
+ }
+
+ // No scavenger
+
+ if (!TFI.targetHandlesStackFrameRounding()) {
+ // If we have reserved argument space for call sites in the function
+ // immediately on entry to the current function, count it as part of the
+ // overall stack size.
+ if (MFI->adjustsStack() && TFI.hasReservedCallFrame(Fn))
+ Offset += MFI->getMaxCallFrameSize();
+
+ // Round up the size to a multiple of the alignment. If the function has
+ // any calls or alloca's, align to the target's StackAlignment value to
+ // ensure that the callee's frame or the alloca data is suitably aligned;
+ // otherwise, for leaf functions, align to the TransientStackAlignment
+ // value.
+ unsigned StackAlign;
+ if (MFI->adjustsStack() || MFI->hasVarSizedObjects() ||
+ (RegInfo->needsStackRealignment(Fn) && MFI->getObjectIndexEnd() != 0))
+ StackAlign = TFI.getStackAlignment();
+ else
+ StackAlign = TFI.getTransientStackAlignment();
+
+ // If the frame pointer is eliminated, all frame offsets will be relative to
+ // SP not FP. Align to MaxAlign so this works.
+ StackAlign = std::max(StackAlign, MaxAlign);
+ unsigned AlignMask = StackAlign - 1;
+ Offset = (Offset + AlignMask) & ~uint64_t(AlignMask);
+ }
+
+ // Update frame info to pretend that this is part of the stack...
+ int64_t StackSize = Offset - LocalAreaOffset;
+ MFI->setStackSize(StackSize);
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.cpp
new file mode 100644
index 000000000000..358ccce39818
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.cpp
@@ -0,0 +1,111 @@
+//===- NVPTXRegisterInfo.cpp - NVPTX Register Information -----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the NVPTX implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTXRegisterInfo.h"
+#include "NVPTX.h"
+#include "NVPTXSubtarget.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/MC/MachineLocation.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "nvptx-reg-info"
+
+namespace llvm {
+std::string getNVPTXRegClassName(TargetRegisterClass const *RC) {
+ if (RC == &NVPTX::Float32RegsRegClass) {
+ return ".f32";
+ }
+ if (RC == &NVPTX::Float64RegsRegClass) {
+ return ".f64";
+ } else if (RC == &NVPTX::Int64RegsRegClass) {
+ return ".s64";
+ } else if (RC == &NVPTX::Int32RegsRegClass) {
+ return ".s32";
+ } else if (RC == &NVPTX::Int16RegsRegClass) {
+ return ".s16";
+ } else if (RC == &NVPTX::Int1RegsRegClass) {
+ return ".pred";
+ } else if (RC == &NVPTX::SpecialRegsRegClass) {
+ return "!Special!";
+ } else {
+ return "INTERNAL";
+ }
+ return "";
+}
+
+std::string getNVPTXRegClassStr(TargetRegisterClass const *RC) {
+ if (RC == &NVPTX::Float32RegsRegClass) {
+ return "%f";
+ }
+ if (RC == &NVPTX::Float64RegsRegClass) {
+ return "%fd";
+ } else if (RC == &NVPTX::Int64RegsRegClass) {
+ return "%rd";
+ } else if (RC == &NVPTX::Int32RegsRegClass) {
+ return "%r";
+ } else if (RC == &NVPTX::Int16RegsRegClass) {
+ return "%rs";
+ } else if (RC == &NVPTX::Int1RegsRegClass) {
+ return "%p";
+ } else if (RC == &NVPTX::SpecialRegsRegClass) {
+ return "!Special!";
+ } else {
+ return "INTERNAL";
+ }
+ return "";
+}
+}
+
+NVPTXRegisterInfo::NVPTXRegisterInfo(const NVPTXSubtarget &st)
+ : NVPTXGenRegisterInfo(0), Is64Bit(st.is64Bit()) {}
+
+#define GET_REGINFO_TARGET_DESC
+#include "NVPTXGenRegisterInfo.inc"
+
+/// NVPTX Callee Saved Registers
+const MCPhysReg *
+NVPTXRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
+ static const MCPhysReg CalleeSavedRegs[] = { 0 };
+ return CalleeSavedRegs;
+}
+
+BitVector NVPTXRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+ BitVector Reserved(getNumRegs());
+ return Reserved;
+}
+
+void NVPTXRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+ int SPAdj, unsigned FIOperandNum,
+ RegScavenger *RS) const {
+ assert(SPAdj == 0 && "Unexpected");
+
+ MachineInstr &MI = *II;
+ int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
+
+ MachineFunction &MF = *MI.getParent()->getParent();
+ int Offset = MF.getFrameInfo()->getObjectOffset(FrameIndex) +
+ MI.getOperand(FIOperandNum + 1).getImm();
+
+ // Use VRFrame (%SP) as the frame pointer
+ MI.getOperand(FIOperandNum).ChangeToRegister(NVPTX::VRFrame, false);
+ MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset);
+}
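+// For example, if the frame object behind the index sits at offset 16 and the
+// instruction carries an additional immediate of 4, the frame-index operand is
+// rewritten to the physical frame register (VRFrame, printed as %SP) and the
+// immediate operand becomes 20.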
+
+unsigned NVPTXRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+ return NVPTX::VRFrame;
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.h b/contrib/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.h
new file mode 100644
index 000000000000..a7594be121a0
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.h
@@ -0,0 +1,72 @@
+//===- NVPTXRegisterInfo.h - NVPTX Register Information Impl ----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the NVPTX implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef NVPTXREGISTERINFO_H
+#define NVPTXREGISTERINFO_H
+
+#include "ManagedStringPool.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include <sstream>
+
+#define GET_REGINFO_HEADER
+#include "NVPTXGenRegisterInfo.inc"
+
+namespace llvm {
+
+// Forward Declarations.
+class TargetInstrInfo;
+class NVPTXSubtarget;
+
+class NVPTXRegisterInfo : public NVPTXGenRegisterInfo {
+private:
+ bool Is64Bit;
+ // Holds strings that can be freed all together with the NVPTXRegisterInfo
+ ManagedStringPool ManagedStrPool;
+
+public:
+ NVPTXRegisterInfo(const NVPTXSubtarget &st);
+
+ //------------------------------------------------------
+ // Pure virtual functions from TargetRegisterInfo
+ //------------------------------------------------------
+
+ // NVPTX callee saved registers
+ const MCPhysReg *
+ getCalleeSavedRegs(const MachineFunction *MF = nullptr) const override;
+
+ BitVector getReservedRegs(const MachineFunction &MF) const override;
+
+ void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj,
+ unsigned FIOperandNum,
+ RegScavenger *RS = nullptr) const override;
+
+ unsigned getFrameRegister(const MachineFunction &MF) const override;
+
+ ManagedStringPool *getStrPool() const {
+ return const_cast<ManagedStringPool *>(&ManagedStrPool);
+ }
+
+ const char *getName(unsigned RegNo) const {
+ std::stringstream O;
+ O << "reg" << RegNo;
+ return getStrPool()->getManagedString(O.str().c_str())->c_str();
+ }
+
+};
+
+std::string getNVPTXRegClassName(const TargetRegisterClass *RC);
+std::string getNVPTXRegClassStr(const TargetRegisterClass *RC);
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td b/contrib/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td
new file mode 100644
index 000000000000..efcee6b6f2bd
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td
@@ -0,0 +1,69 @@
+//===-- NVPTXRegisterInfo.td - NVPTX Register defs ---------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Declarations that describe the PTX register file
+//===----------------------------------------------------------------------===//
+
+class NVPTXReg<string n> : Register<n> {
+ let Namespace = "NVPTX";
+}
+
+class NVPTXRegClass<list<ValueType> regTypes, int alignment, dag regList>
+ : RegisterClass <"NVPTX", regTypes, alignment, regList>;
+
+//===----------------------------------------------------------------------===//
+// Registers
+//===----------------------------------------------------------------------===//
+
+// Special Registers used as stack pointer
+def VRFrame : NVPTXReg<"%SP">;
+def VRFrameLocal : NVPTXReg<"%SPL">;
+
+// Special Registers used as the stack
+def VRDepot : NVPTXReg<"%Depot">;
+
+// We use virtual registers, but define a few physical registers here to keep
+// SDAG and the MachineInstr layers happy.
+foreach i = 0-4 in {
+ def P#i : NVPTXReg<"%p"#i>; // Predicate
+ def RS#i : NVPTXReg<"%rs"#i>; // 16-bit
+ def R#i : NVPTXReg<"%r"#i>; // 32-bit
+ def RL#i : NVPTXReg<"%rd"#i>; // 64-bit
+ def F#i : NVPTXReg<"%f"#i>; // 32-bit float
+ def FL#i : NVPTXReg<"%fd"#i>; // 64-bit float
+
+ // Arguments
+ def ia#i : NVPTXReg<"%ia"#i>;
+ def la#i : NVPTXReg<"%la"#i>;
+ def fa#i : NVPTXReg<"%fa"#i>;
+ def da#i : NVPTXReg<"%da"#i>;
+}
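+// For illustration, the foreach above expands to the physical registers P0-P4,
+// RS0-RS4, R0-R4, RL0-RL4, F0-F4, FL0-FL4 and the argument registers ia0-ia4,
+// la0-la4, fa0-fa4, da0-da4; actual code generation still works on virtual
+// registers of the classes defined below.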
+
+foreach i = 0-31 in {
+ def ENVREG#i : NVPTXReg<"%envreg"#i>;
+}
+
+//===----------------------------------------------------------------------===//
+// Register classes
+//===----------------------------------------------------------------------===//
+def Int1Regs : NVPTXRegClass<[i1], 8, (add (sequence "P%u", 0, 4))>;
+def Int16Regs : NVPTXRegClass<[i16], 16, (add (sequence "RS%u", 0, 4))>;
+def Int32Regs : NVPTXRegClass<[i32], 32, (add (sequence "R%u", 0, 4))>;
+def Int64Regs : NVPTXRegClass<[i64], 64, (add (sequence "RL%u", 0, 4))>;
+def Float32Regs : NVPTXRegClass<[f32], 32, (add (sequence "F%u", 0, 4))>;
+def Float64Regs : NVPTXRegClass<[f64], 64, (add (sequence "FL%u", 0, 4))>;
+def Int32ArgRegs : NVPTXRegClass<[i32], 32, (add (sequence "ia%u", 0, 4))>;
+def Int64ArgRegs : NVPTXRegClass<[i64], 64, (add (sequence "la%u", 0, 4))>;
+def Float32ArgRegs : NVPTXRegClass<[f32], 32, (add (sequence "fa%u", 0, 4))>;
+def Float64ArgRegs : NVPTXRegClass<[f64], 64, (add (sequence "da%u", 0, 4))>;
+
+// Read NVPTXRegisterInfo.cpp to see how VRFrame and VRDepot are used.
+def SpecialRegs : NVPTXRegClass<[i32], 32, (add VRFrame, VRDepot,
+ (sequence "ENVREG%u", 0, 31))>;
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp
new file mode 100644
index 000000000000..20d4e272341e
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp
@@ -0,0 +1,189 @@
+//===-- NVPTXReplaceImageHandles.cpp - Replace image handles for Fermi ----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// On Fermi, image handles are not supported. To work around this, we traverse
+// the machine code and replace image handles with concrete symbols. For this
+// to work reliably, all function calls must be inlined.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTX.h"
+#include "NVPTXMachineFunctionInfo.h"
+#include "NVPTXSubtarget.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/ADT/DenseSet.h"
+
+using namespace llvm;
+
+namespace {
+class NVPTXReplaceImageHandles : public MachineFunctionPass {
+private:
+ static char ID;
+ DenseSet<MachineInstr *> InstrsToRemove;
+
+public:
+ NVPTXReplaceImageHandles();
+
+ bool runOnMachineFunction(MachineFunction &MF);
+
+ virtual const char *getPassName() const {
+ return "NVPTX Replace Image Handles";
+ }
+private:
+ bool processInstr(MachineInstr &MI);
+ void replaceImageHandle(MachineOperand &Op, MachineFunction &MF);
+ bool findIndexForHandle(MachineOperand &Op, MachineFunction &MF,
+ unsigned &Idx);
+};
+}
+
+char NVPTXReplaceImageHandles::ID = 0;
+
+NVPTXReplaceImageHandles::NVPTXReplaceImageHandles()
+ : MachineFunctionPass(ID) {}
+
+bool NVPTXReplaceImageHandles::runOnMachineFunction(MachineFunction &MF) {
+ bool Changed = false;
+ InstrsToRemove.clear();
+
+ for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE;
+ ++BI) {
+ for (MachineBasicBlock::iterator I = (*BI).begin(), E = (*BI).end();
+ I != E; ++I) {
+ MachineInstr &MI = *I;
+ Changed |= processInstr(MI);
+ }
+ }
+
+ // Now clean up any handle-access instructions.
+ // This is needed in debug mode when code cleanup passes are not executed,
+ // but the handle accesses must be eliminated because they are not
+ // valid instructions when image handles are disabled.
+ for (DenseSet<MachineInstr *>::iterator I = InstrsToRemove.begin(),
+ E = InstrsToRemove.end(); I != E; ++I) {
+ (*I)->eraseFromParent();
+ }
+ return Changed;
+}
+
+bool NVPTXReplaceImageHandles::processInstr(MachineInstr &MI) {
+ MachineFunction &MF = *MI.getParent()->getParent();
+ const MCInstrDesc &MCID = MI.getDesc();
+
+ if (MCID.TSFlags & NVPTXII::IsTexFlag) {
+ // This is a texture fetch, so operand 4 is a texref and operand 5 is
+ // a samplerref
+ MachineOperand &TexHandle = MI.getOperand(4);
+ replaceImageHandle(TexHandle, MF);
+
+ if (!(MCID.TSFlags & NVPTXII::IsTexModeUnifiedFlag)) {
+ MachineOperand &SampHandle = MI.getOperand(5);
+ replaceImageHandle(SampHandle, MF);
+ }
+
+ return true;
+ } else if (MCID.TSFlags & NVPTXII::IsSuldMask) {
+ unsigned VecSize =
+ 1 << (((MCID.TSFlags & NVPTXII::IsSuldMask) >> NVPTXII::IsSuldShift) - 1);
+
+ // For a surface load of vector size N, the Nth operand will be the surfref
+ MachineOperand &SurfHandle = MI.getOperand(VecSize);
+
+ replaceImageHandle(SurfHandle, MF);
+
+ return true;
+ } else if (MCID.TSFlags & NVPTXII::IsSustFlag) {
+ // This is a surface store, so operand 0 is a surfref
+ MachineOperand &SurfHandle = MI.getOperand(0);
+
+ replaceImageHandle(SurfHandle, MF);
+
+ return true;
+ } else if (MCID.TSFlags & NVPTXII::IsSurfTexQueryFlag) {
+ // This is a query, so operand 1 is a surfref/texref
+ MachineOperand &Handle = MI.getOperand(1);
+
+ replaceImageHandle(Handle, MF);
+
+ return true;
+ }
+
+ return false;
+}
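+// As an arithmetic example of the surface-load case: if the IsSuld field value
+// is 3, then VecSize = 1 << (3 - 1) = 4, so operand 4 is taken as the surfref
+// handle and replaced.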
+
+void NVPTXReplaceImageHandles::
+replaceImageHandle(MachineOperand &Op, MachineFunction &MF) {
+ unsigned Idx;
+ if (findIndexForHandle(Op, MF, Idx)) {
+ Op.ChangeToImmediate(Idx);
+ }
+}
+
+bool NVPTXReplaceImageHandles::
+findIndexForHandle(MachineOperand &Op, MachineFunction &MF, unsigned &Idx) {
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ NVPTXMachineFunctionInfo *MFI = MF.getInfo<NVPTXMachineFunctionInfo>();
+
+ assert(Op.isReg() && "Handle is not in a reg?");
+
+ // Which instruction defines the handle?
+ MachineInstr &TexHandleDef = *MRI.getVRegDef(Op.getReg());
+
+ switch (TexHandleDef.getOpcode()) {
+ case NVPTX::LD_i64_avar: {
+ // The handle is a parameter value being loaded, replace with the
+ // parameter symbol
+ const NVPTXSubtarget &ST = MF.getTarget().getSubtarget<NVPTXSubtarget>();
+ if (ST.getDrvInterface() == NVPTX::CUDA) {
+ // For CUDA, we preserve the param loads coming from function arguments
+ return false;
+ }
+
+ assert(TexHandleDef.getOperand(6).isSymbol() && "Load is not a symbol!");
+ StringRef Sym = TexHandleDef.getOperand(6).getSymbolName();
+ std::string ParamBaseName = MF.getName();
+ ParamBaseName += "_param_";
+ assert(Sym.startswith(ParamBaseName) && "Invalid symbol reference");
+ unsigned Param = atoi(Sym.data()+ParamBaseName.size());
+ std::string NewSym;
+ raw_string_ostream NewSymStr(NewSym);
+ NewSymStr << MF.getFunction()->getName() << "_param_" << Param;
+
+ InstrsToRemove.insert(&TexHandleDef);
+ Idx = MFI->getImageHandleSymbolIndex(NewSymStr.str().c_str());
+ return true;
+ }
+ case NVPTX::texsurf_handles: {
+ // The handle is a global variable, replace with the global variable name
+ assert(TexHandleDef.getOperand(1).isGlobal() && "Load is not a global!");
+ const GlobalValue *GV = TexHandleDef.getOperand(1).getGlobal();
+ assert(GV->hasName() && "Global sampler must be named!");
+ InstrsToRemove.insert(&TexHandleDef);
+ Idx = MFI->getImageHandleSymbolIndex(GV->getName().data());
+ return true;
+ }
+ case NVPTX::nvvm_move_i64:
+ case TargetOpcode::COPY: {
+ bool Res = findIndexForHandle(TexHandleDef.getOperand(1), MF, Idx);
+ if (Res) {
+ InstrsToRemove.insert(&TexHandleDef);
+ }
+ return Res;
+ }
+ default:
+ llvm_unreachable("Unknown instruction operating on handle");
+ }
+}
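+// Example for the parameter case (hypothetical kernel name): in a function
+// named "foo", a handle loaded from the symbol "foo_param_2" is parsed to
+// parameter index 2, the defining load is queued for removal, and the operand
+// becomes the image-handle index returned for the rebuilt "foo_param_2"
+// symbol derived from the IR function name.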
+
+MachineFunctionPass *llvm::createNVPTXReplaceImageHandlesPass() {
+ return new NVPTXReplaceImageHandles();
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXSection.h b/contrib/llvm/lib/Target/NVPTX/NVPTXSection.h
new file mode 100644
index 000000000000..aa0436bf0da7
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXSection.h
@@ -0,0 +1,48 @@
+//===- NVPTXSection.h - NVPTX-specific section representation -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the NVPTXSection class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_NVPTXSECTION_H
+#define LLVM_NVPTXSECTION_H
+
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/MC/MCSection.h"
+#include <vector>
+
+namespace llvm {
+/// NVPTXSection - Represents a section in PTX.
+/// PTX does not have sections, so we create this class only to use
+/// the AsmPrinter interface.
+///
+class NVPTXSection : public MCSection {
+ virtual void anchor();
+public:
+ NVPTXSection(SectionVariant V, SectionKind K) : MCSection(V, K) {}
+ virtual ~NVPTXSection() {}
+
+ /// Override this as NVPTX has its own way of printing switching
+ /// to a section.
+ void PrintSwitchToSection(const MCAsmInfo &MAI,
+ raw_ostream &OS,
+ const MCExpr *Subsection) const override {}
+
+ /// Base address of PTX sections is zero.
+ bool isBaseAddressKnownZero() const override { return true; }
+ bool UseCodeAlign() const override { return false; }
+ bool isVirtualSection() const override { return false; }
+ std::string getLabelBeginName() const override { return ""; }
+ std::string getLabelEndName() const override { return ""; }
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
new file mode 100644
index 000000000000..d5cded218362
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
@@ -0,0 +1,70 @@
+//===- NVPTXSubtarget.cpp - NVPTX Subtarget Information -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the NVPTX specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTXSubtarget.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "nvptx-subtarget"
+
+#define GET_SUBTARGETINFO_ENUM
+#define GET_SUBTARGETINFO_TARGET_DESC
+#define GET_SUBTARGETINFO_CTOR
+#include "NVPTXGenSubtargetInfo.inc"
+
+// Pin the vtable to this file.
+void NVPTXSubtarget::anchor() {}
+
+static std::string computeDataLayout(bool is64Bit) {
+ std::string Ret = "e";
+
+ if (!is64Bit)
+ Ret += "-p:32:32";
+
+ Ret += "-i64:64-v16:16-v32:32-n16:32:64";
+
+ return Ret;
+}
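+// The resulting strings are "e-i64:64-v16:16-v32:32-n16:32:64" for 64-bit
+// targets and "e-p:32:32-i64:64-v16:16-v32:32-n16:32:64" for 32-bit targets.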
+
+NVPTXSubtarget &NVPTXSubtarget::initializeSubtargetDependencies(StringRef CPU,
+ StringRef FS) {
+ // Provide the default CPU if we don't have one.
+ if (CPU.empty() && FS.size())
+ llvm_unreachable("we are not using FeatureStr");
+ TargetName = CPU.empty() ? "sm_20" : CPU;
+
+ ParseSubtargetFeatures(TargetName, FS);
+
+ // Set default to PTX 3.2 (CUDA 5.5)
+ if (PTXVersion == 0) {
+ PTXVersion = 32;
+ }
+
+ return *this;
+}
+
+NVPTXSubtarget::NVPTXSubtarget(const std::string &TT, const std::string &CPU,
+ const std::string &FS, const TargetMachine &TM,
+ bool is64Bit)
+ : NVPTXGenSubtargetInfo(TT, CPU, FS), Is64Bit(is64Bit), PTXVersion(0),
+ SmVersion(20), DL(computeDataLayout(is64Bit)),
+ InstrInfo(initializeSubtargetDependencies(CPU, FS)),
+ TLInfo((NVPTXTargetMachine &)TM), TSInfo(&DL), FrameLowering(*this) {
+
+ Triple T(TT);
+
+ if (T.getOS() == Triple::NVCL)
+ drvInterface = NVPTX::NVCL;
+ else
+ drvInterface = NVPTX::CUDA;
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/contrib/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
new file mode 100644
index 000000000000..4c41e4e470dd
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -0,0 +1,116 @@
+//=====-- NVPTXSubtarget.h - Define Subtarget for the NVPTX ---*- C++ -*--====//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the NVPTX specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef NVPTXSUBTARGET_H
+#define NVPTXSUBTARGET_H
+
+#include "NVPTX.h"
+#include "NVPTXFrameLowering.h"
+#include "NVPTXISelLowering.h"
+#include "NVPTXInstrInfo.h"
+#include "NVPTXRegisterInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Target/TargetSelectionDAGInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <string>
+
+#define GET_SUBTARGETINFO_HEADER
+#include "NVPTXGenSubtargetInfo.inc"
+
+namespace llvm {
+
+class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
+ virtual void anchor();
+ std::string TargetName;
+ NVPTX::DrvInterface drvInterface;
+ bool Is64Bit;
+
+ // PTX version x.y is represented as 10*x+y, e.g. 3.1 == 31
+ unsigned PTXVersion;
+
+ // SM version x.y is represented as 10*x+y, e.g. 3.1 == 31
+ unsigned int SmVersion;
+
+ const DataLayout DL; // Calculates type size & alignment
+ NVPTXInstrInfo InstrInfo;
+ NVPTXTargetLowering TLInfo;
+ TargetSelectionDAGInfo TSInfo;
+
+ // NVPTX does not have any call stack frame, but needs an NVPTX-specific
+ // FrameLowering class because TargetFrameLowering is abstract.
+ NVPTXFrameLowering FrameLowering;
+
+public:
+ /// This constructor initializes the data members to match that
+ /// of the specified module.
+ ///
+ NVPTXSubtarget(const std::string &TT, const std::string &CPU,
+ const std::string &FS, const TargetMachine &TM, bool is64Bit);
+
+ const TargetFrameLowering *getFrameLowering() const { return &FrameLowering; }
+ const NVPTXInstrInfo *getInstrInfo() const { return &InstrInfo; }
+ const DataLayout *getDataLayout() const { return &DL; }
+ const NVPTXRegisterInfo *getRegisterInfo() const {
+ return &InstrInfo.getRegisterInfo();
+ }
+ const NVPTXTargetLowering *getTargetLowering() const { return &TLInfo; }
+ const TargetSelectionDAGInfo *getSelectionDAGInfo() const { return &TSInfo; }
+
+ bool hasBrkPt() const { return SmVersion >= 11; }
+ bool hasAtomRedG32() const { return SmVersion >= 11; }
+ bool hasAtomRedS32() const { return SmVersion >= 12; }
+ bool hasAtomRedG64() const { return SmVersion >= 12; }
+ bool hasAtomRedS64() const { return SmVersion >= 20; }
+ bool hasAtomRedGen32() const { return SmVersion >= 20; }
+ bool hasAtomRedGen64() const { return SmVersion >= 20; }
+ bool hasAtomAddF32() const { return SmVersion >= 20; }
+ bool hasVote() const { return SmVersion >= 12; }
+ bool hasDouble() const { return SmVersion >= 13; }
+ bool reqPTX20() const { return SmVersion >= 20; }
+ bool hasF32FTZ() const { return SmVersion >= 20; }
+ bool hasFMAF32() const { return SmVersion >= 20; }
+ bool hasFMAF64() const { return SmVersion >= 13; }
+ bool hasLDG() const { return SmVersion >= 32; }
+ bool hasLDU() const { return ((SmVersion >= 20) && (SmVersion < 30)); }
+ bool hasGenericLdSt() const { return SmVersion >= 20; }
+ inline bool hasHWROT32() const { return SmVersion >= 32; }
+ inline bool hasSWROT32() const {
+ return ((SmVersion >= 20) && (SmVersion < 32));
+ }
+ inline bool hasROT32() const { return hasHWROT32() || hasSWROT32(); }
+ inline bool hasROT64() const { return SmVersion >= 20; }
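+
+  // For illustration: with sm_35 (SmVersion == 35) the predicates above give
+  // hasLDG() and hasHWROT32() true, while hasLDU() and hasSWROT32() are false.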
+
+ bool hasImageHandles() const {
+ // Enable handles for Kepler+, where CUDA supports indirect surfaces and
+ // textures
+ if (getDrvInterface() == NVPTX::CUDA)
+ return (SmVersion >= 30);
+
+ // Disabled, otherwise
+ return false;
+ }
+ bool is64Bit() const { return Is64Bit; }
+
+ unsigned int getSmVersion() const { return SmVersion; }
+ NVPTX::DrvInterface getDrvInterface() const { return drvInterface; }
+ std::string getTargetName() const { return TargetName; }
+
+ unsigned getPTXVersion() const { return PTXVersion; }
+
+ NVPTXSubtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS);
+ void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+};
+
+} // End llvm namespace
+
+#endif // NVPTXSUBTARGET_H
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
new file mode 100644
index 000000000000..069a1b9966f0
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -0,0 +1,250 @@
+//===-- NVPTXTargetMachine.cpp - Define TargetMachine for NVPTX -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Top-level implementation for the NVPTX target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTXTargetMachine.h"
+#include "MCTargetDesc/NVPTXMCAsmInfo.h"
+#include "NVPTX.h"
+#include "NVPTXAllocaHoisting.h"
+#include "NVPTXLowerAggrCopies.h"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineFunctionAnalysis.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/IRPrintingPasses.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/PassManager.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include "llvm/Transforms/Scalar.h"
+
+using namespace llvm;
+
+namespace llvm {
+void initializeNVVMReflectPass(PassRegistry&);
+void initializeGenericToNVVMPass(PassRegistry&);
+void initializeNVPTXAssignValidGlobalNamesPass(PassRegistry&);
+void initializeNVPTXFavorNonGenericAddrSpacesPass(PassRegistry &);
+}
+
+extern "C" void LLVMInitializeNVPTXTarget() {
+ // Register the target.
+ RegisterTargetMachine<NVPTXTargetMachine32> X(TheNVPTXTarget32);
+ RegisterTargetMachine<NVPTXTargetMachine64> Y(TheNVPTXTarget64);
+
+ // FIXME: This pass is really intended to be invoked during IR optimization,
+ // but it's very NVPTX-specific.
+ initializeNVVMReflectPass(*PassRegistry::getPassRegistry());
+ initializeGenericToNVVMPass(*PassRegistry::getPassRegistry());
+ initializeNVPTXAssignValidGlobalNamesPass(*PassRegistry::getPassRegistry());
+ initializeNVPTXFavorNonGenericAddrSpacesPass(
+ *PassRegistry::getPassRegistry());
+}
+
+NVPTXTargetMachine::NVPTXTargetMachine(const Target &T, StringRef TT,
+ StringRef CPU, StringRef FS,
+ const TargetOptions &Options,
+ Reloc::Model RM, CodeModel::Model CM,
+ CodeGenOpt::Level OL, bool is64bit)
+ : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
+ Subtarget(TT, CPU, FS, *this, is64bit) {
+ initAsmInfo();
+}
+
+void NVPTXTargetMachine32::anchor() {}
+
+NVPTXTargetMachine32::NVPTXTargetMachine32(
+ const Target &T, StringRef TT, StringRef CPU, StringRef FS,
+ const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM,
+ CodeGenOpt::Level OL)
+ : NVPTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {}
+
+void NVPTXTargetMachine64::anchor() {}
+
+NVPTXTargetMachine64::NVPTXTargetMachine64(
+ const Target &T, StringRef TT, StringRef CPU, StringRef FS,
+ const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM,
+ CodeGenOpt::Level OL)
+ : NVPTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {}
+
+namespace {
+class NVPTXPassConfig : public TargetPassConfig {
+public:
+ NVPTXPassConfig(NVPTXTargetMachine *TM, PassManagerBase &PM)
+ : TargetPassConfig(TM, PM) {}
+
+ NVPTXTargetMachine &getNVPTXTargetMachine() const {
+ return getTM<NVPTXTargetMachine>();
+ }
+
+ void addIRPasses() override;
+ bool addInstSelector() override;
+ bool addPreRegAlloc() override;
+ bool addPostRegAlloc() override;
+ void addMachineSSAOptimization() override;
+
+ FunctionPass *createTargetRegisterAllocator(bool) override;
+ void addFastRegAlloc(FunctionPass *RegAllocPass) override;
+ void addOptimizedRegAlloc(FunctionPass *RegAllocPass) override;
+};
+} // end anonymous namespace
+
+TargetPassConfig *NVPTXTargetMachine::createPassConfig(PassManagerBase &PM) {
+ NVPTXPassConfig *PassConfig = new NVPTXPassConfig(this, PM);
+ return PassConfig;
+}
+
+void NVPTXPassConfig::addIRPasses() {
+ // The following passes are known to not play well with virtual regs hanging
+ // around after register allocation (which in our case, is *all* registers).
+ // We explicitly disable them here. We do, however, need some functionality
+ // of the PrologEpilogCodeInserter pass, so we emulate that behavior in the
+ // NVPTXPrologEpilog pass (see NVPTXPrologEpilogPass.cpp).
+ disablePass(&PrologEpilogCodeInserterID);
+ disablePass(&MachineCopyPropagationID);
+ disablePass(&BranchFolderPassID);
+ disablePass(&TailDuplicateID);
+
+ addPass(createNVPTXImageOptimizerPass());
+ TargetPassConfig::addIRPasses();
+ addPass(createNVPTXAssignValidGlobalNamesPass());
+ addPass(createGenericToNVVMPass());
+ addPass(createNVPTXFavorNonGenericAddrSpacesPass());
+ addPass(createSeparateConstOffsetFromGEPPass());
+ // The SeparateConstOffsetFromGEP pass creates variadic bases that can be used
+ // by multiple GEPs. Run GVN or EarlyCSE to really reuse them. GVN generates
+ // significantly better code than EarlyCSE for some of our benchmarks.
+ if (getOptLevel() == CodeGenOpt::Aggressive)
+ addPass(createGVNPass());
+ else
+ addPass(createEarlyCSEPass());
+ // Both FavorNonGenericAddrSpaces and SeparateConstOffsetFromGEP may leave
+ // some dead code. We could remove dead code in an ad-hoc manner, but that
+ // requires manual work and might be error-prone.
+ //
+ // The FavorNonGenericAddrSpaces pass shortcuts unnecessary addrspacecasts,
+  // and leaves them unused.
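+  // For example (illustrative IR), a sequence such as
+  //   %p = addrspacecast float addrspace(3)* %s to float*
+  //   %v = load float* %p
+  // can be rewritten to load directly from %s, leaving the cast dead.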
+ //
+  // SeparateConstOffsetFromGEP builds a new index from the old one, so the old
+  // index and some of its intermediate results may become unused.
+ addPass(createDeadCodeEliminationPass());
+}
+
+bool NVPTXPassConfig::addInstSelector() {
+ const NVPTXSubtarget &ST =
+ getTM<NVPTXTargetMachine>().getSubtarget<NVPTXSubtarget>();
+
+ addPass(createLowerAggrCopies());
+ addPass(createAllocaHoisting());
+ addPass(createNVPTXISelDag(getNVPTXTargetMachine(), getOptLevel()));
+
+ if (!ST.hasImageHandles())
+ addPass(createNVPTXReplaceImageHandlesPass());
+
+ return false;
+}
+
+bool NVPTXPassConfig::addPreRegAlloc() { return false; }
+bool NVPTXPassConfig::addPostRegAlloc() {
+ addPass(createNVPTXPrologEpilogPass());
+ return false;
+}
+
+FunctionPass *NVPTXPassConfig::createTargetRegisterAllocator(bool) {
+ return nullptr; // No reg alloc
+}
+
+void NVPTXPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) {
+ assert(!RegAllocPass && "NVPTX uses no regalloc!");
+ addPass(&PHIEliminationID);
+ addPass(&TwoAddressInstructionPassID);
+}
+
+void NVPTXPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
+ assert(!RegAllocPass && "NVPTX uses no regalloc!");
+
+ addPass(&ProcessImplicitDefsID);
+ addPass(&LiveVariablesID);
+ addPass(&MachineLoopInfoID);
+ addPass(&PHIEliminationID);
+
+ addPass(&TwoAddressInstructionPassID);
+ addPass(&RegisterCoalescerID);
+
+ // PreRA instruction scheduling.
+ if (addPass(&MachineSchedulerID))
+ printAndVerify("After Machine Scheduling");
+
+
+ addPass(&StackSlotColoringID);
+
+ // FIXME: Needs physical registers
+ //addPass(&PostRAMachineLICMID);
+
+ printAndVerify("After StackSlotColoring");
+}
+
+void NVPTXPassConfig::addMachineSSAOptimization() {
+ // Pre-ra tail duplication.
+ if (addPass(&EarlyTailDuplicateID))
+ printAndVerify("After Pre-RegAlloc TailDuplicate");
+
+ // Optimize PHIs before DCE: removing dead PHI cycles may make more
+ // instructions dead.
+ addPass(&OptimizePHIsID);
+
+ // This pass merges large allocas. StackSlotColoring is a different pass
+ // which merges spill slots.
+ addPass(&StackColoringID);
+
+ // If the target requests it, assign local variables to stack slots relative
+ // to one another and simplify frame index references where possible.
+ addPass(&LocalStackSlotAllocationID);
+
+ // With optimization, dead code should already be eliminated. However
+ // there is one known exception: lowered code for arguments that are only
+ // used by tail calls, where the tail calls reuse the incoming stack
+ // arguments directly (see t11 in test/CodeGen/X86/sibcall.ll).
+ addPass(&DeadMachineInstructionElimID);
+ printAndVerify("After codegen DCE pass");
+
+ // Allow targets to insert passes that improve instruction level parallelism,
+ // like if-conversion. Such passes will typically need dominator trees and
+ // loop info, just like LICM and CSE below.
+ if (addILPOpts())
+ printAndVerify("After ILP optimizations");
+
+ addPass(&MachineLICMID);
+ addPass(&MachineCSEID);
+
+ addPass(&MachineSinkingID);
+ printAndVerify("After Machine LICM, CSE and Sinking passes");
+
+ addPass(&PeepholeOptimizerID);
+ printAndVerify("After codegen peephole optimization pass");
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h
new file mode 100644
index 000000000000..a7a1c8f4e171
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h
@@ -0,0 +1,100 @@
+//===-- NVPTXTargetMachine.h - Define TargetMachine for NVPTX ---*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the NVPTX specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef NVPTX_TARGETMACHINE_H
+#define NVPTX_TARGETMACHINE_H
+
+#include "NVPTXSubtarget.h"
+#include "ManagedStringPool.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetSelectionDAGInfo.h"
+
+namespace llvm {
+
+/// NVPTXTargetMachine
+///
+class NVPTXTargetMachine : public LLVMTargetMachine {
+ NVPTXSubtarget Subtarget;
+
+  // Holds strings that can be freed all together with the NVPTXTargetMachine.
+ ManagedStringPool ManagedStrPool;
+
+public:
+ NVPTXTargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS,
+ const TargetOptions &Options, Reloc::Model RM,
+ CodeModel::Model CM, CodeGenOpt::Level OP, bool is64bit);
+
+ const TargetFrameLowering *getFrameLowering() const override {
+ return getSubtargetImpl()->getFrameLowering();
+ }
+ const NVPTXInstrInfo *getInstrInfo() const override {
+ return getSubtargetImpl()->getInstrInfo();
+ }
+ const DataLayout *getDataLayout() const override {
+ return getSubtargetImpl()->getDataLayout();
+ }
+ const NVPTXSubtarget *getSubtargetImpl() const override { return &Subtarget; }
+ const NVPTXRegisterInfo *getRegisterInfo() const override {
+ return getSubtargetImpl()->getRegisterInfo();
+ }
+
+ const NVPTXTargetLowering *getTargetLowering() const override {
+ return getSubtargetImpl()->getTargetLowering();
+ }
+
+ const TargetSelectionDAGInfo *getSelectionDAGInfo() const override {
+ return getSubtargetImpl()->getSelectionDAGInfo();
+ }
+
+ ManagedStringPool *getManagedStrPool() const {
+ return const_cast<ManagedStringPool *>(&ManagedStrPool);
+ }
+
+ TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
+
+ // Emission of machine code through JITCodeEmitter is not supported.
+ bool addPassesToEmitMachineCode(PassManagerBase &, JITCodeEmitter &,
+ bool = true) override {
+ return true;
+ }
+
+ // Emission of machine code through MCJIT is not supported.
+ bool addPassesToEmitMC(PassManagerBase &, MCContext *&, raw_ostream &,
+ bool = true) override {
+ return true;
+ }
+
+}; // NVPTXTargetMachine.
+
+class NVPTXTargetMachine32 : public NVPTXTargetMachine {
+ virtual void anchor();
+public:
+ NVPTXTargetMachine32(const Target &T, StringRef TT, StringRef CPU,
+ StringRef FS, const TargetOptions &Options,
+ Reloc::Model RM, CodeModel::Model CM,
+ CodeGenOpt::Level OL);
+};
+
+class NVPTXTargetMachine64 : public NVPTXTargetMachine {
+ virtual void anchor();
+public:
+ NVPTXTargetMachine64(const Target &T, StringRef TT, StringRef CPU,
+ StringRef FS, const TargetOptions &Options,
+ Reloc::Model RM, CodeModel::Model CM,
+ CodeGenOpt::Level OL);
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetObjectFile.h b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetObjectFile.h
new file mode 100644
index 000000000000..ba8086d78880
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetObjectFile.h
@@ -0,0 +1,105 @@
+//===-- NVPTXTargetObjectFile.h - NVPTX Object Info -------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_NVPTX_TARGETOBJECTFILE_H
+#define LLVM_TARGET_NVPTX_TARGETOBJECTFILE_H
+
+#include "NVPTXSection.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include <string>
+
+namespace llvm {
+class GlobalVariable;
+class Module;
+
+class NVPTXTargetObjectFile : public TargetLoweringObjectFile {
+
+public:
+ NVPTXTargetObjectFile() {
+ TextSection = nullptr;
+ DataSection = nullptr;
+ BSSSection = nullptr;
+ ReadOnlySection = nullptr;
+
+ StaticCtorSection = nullptr;
+ StaticDtorSection = nullptr;
+ LSDASection = nullptr;
+ EHFrameSection = nullptr;
+ DwarfAbbrevSection = nullptr;
+ DwarfInfoSection = nullptr;
+ DwarfLineSection = nullptr;
+ DwarfFrameSection = nullptr;
+ DwarfPubTypesSection = nullptr;
+ DwarfDebugInlineSection = nullptr;
+ DwarfStrSection = nullptr;
+ DwarfLocSection = nullptr;
+ DwarfARangesSection = nullptr;
+ DwarfRangesSection = nullptr;
+ DwarfMacroInfoSection = nullptr;
+ }
+
+ virtual ~NVPTXTargetObjectFile();
+
+ void Initialize(MCContext &ctx, const TargetMachine &TM) override {
+ TargetLoweringObjectFile::Initialize(ctx, TM);
+ TextSection = new NVPTXSection(MCSection::SV_ELF, SectionKind::getText());
+ DataSection =
+ new NVPTXSection(MCSection::SV_ELF, SectionKind::getDataRel());
+ BSSSection = new NVPTXSection(MCSection::SV_ELF, SectionKind::getBSS());
+ ReadOnlySection =
+ new NVPTXSection(MCSection::SV_ELF, SectionKind::getReadOnly());
+
+ StaticCtorSection =
+ new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
+ StaticDtorSection =
+ new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
+ LSDASection =
+ new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
+ EHFrameSection =
+ new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
+ DwarfAbbrevSection =
+ new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
+ DwarfInfoSection =
+ new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
+ DwarfLineSection =
+ new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
+ DwarfFrameSection =
+ new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
+ DwarfPubTypesSection =
+ new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
+ DwarfDebugInlineSection =
+ new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
+ DwarfStrSection =
+ new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
+ DwarfLocSection =
+ new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
+ DwarfARangesSection =
+ new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
+ DwarfRangesSection =
+ new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
+ DwarfMacroInfoSection =
+ new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
+ }
+
+ const MCSection *getSectionForConstant(SectionKind Kind,
+ const Constant *C) const override {
+ return ReadOnlySection;
+ }
+
+ const MCSection *getExplicitSectionGlobal(const GlobalValue *GV,
+ SectionKind Kind, Mangler &Mang,
+ const TargetMachine &TM) const override {
+ return DataSection;
+ }
+
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
new file mode 100644
index 000000000000..a9fd190b7ff0
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
@@ -0,0 +1,543 @@
+//===- NVPTXUtilities.cpp - Utility Functions -----------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains miscellaneous utility functions.
+//===----------------------------------------------------------------------===//
+
+#include "NVPTXUtilities.h"
+#include "NVPTX.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include <algorithm>
+#include <cstring>
+#include <map>
+#include <string>
+#include <vector>
+#include "llvm/Support/ManagedStatic.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/Support/MutexGuard.h"
+
+using namespace llvm;
+
+typedef std::map<std::string, std::vector<unsigned> > key_val_pair_t;
+typedef std::map<const GlobalValue *, key_val_pair_t> global_val_annot_t;
+typedef std::map<const Module *, global_val_annot_t> per_module_annot_t;
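+// Annotation cache layout: Module -> GlobalValue -> property name -> values.
+// It is populated lazily from the module's !nvvm.annotations named metadata.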
+
+ManagedStatic<per_module_annot_t> annotationCache;
+static sys::Mutex Lock;
+
+void llvm::clearAnnotationCache(const llvm::Module *Mod) {
+ MutexGuard Guard(Lock);
+ annotationCache->erase(Mod);
+}
+
+static void cacheAnnotationFromMD(const MDNode *md, key_val_pair_t &retval) {
+ MutexGuard Guard(Lock);
+ assert(md && "Invalid mdnode for annotation");
+ assert((md->getNumOperands() % 2) == 1 && "Invalid number of operands");
+ // start index = 1, to skip the global variable key
+  // increment = 2, to step over each property-value pair
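+  // An annotation node typically has the form (illustrative):
+  //   !{void ()* @kernel, !"maxntidx", i32 16, !"maxntidy", i32 16}
+  // i.e. the annotated value followed by property-value pairs.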
+ for (unsigned i = 1, e = md->getNumOperands(); i != e; i += 2) {
+ // property
+ const MDString *prop = dyn_cast<MDString>(md->getOperand(i));
+ assert(prop && "Annotation property not a string");
+
+ // value
+ ConstantInt *Val = dyn_cast<ConstantInt>(md->getOperand(i + 1));
+ assert(Val && "Value operand not a constant int");
+
+ std::string keyname = prop->getString().str();
+ if (retval.find(keyname) != retval.end())
+ retval[keyname].push_back(Val->getZExtValue());
+ else {
+ std::vector<unsigned> tmp;
+ tmp.push_back(Val->getZExtValue());
+ retval[keyname] = tmp;
+ }
+ }
+}
+
+static void cacheAnnotationFromMD(const Module *m, const GlobalValue *gv) {
+ MutexGuard Guard(Lock);
+ NamedMDNode *NMD = m->getNamedMetadata(llvm::NamedMDForAnnotations);
+ if (!NMD)
+ return;
+ key_val_pair_t tmp;
+ for (unsigned i = 0, e = NMD->getNumOperands(); i != e; ++i) {
+ const MDNode *elem = NMD->getOperand(i);
+
+ Value *entity = elem->getOperand(0);
+ // entity may be null due to DCE
+ if (!entity)
+ continue;
+ if (entity != gv)
+ continue;
+
+ // accumulate annotations for entity in tmp
+ cacheAnnotationFromMD(elem, tmp);
+ }
+
+ if (tmp.empty()) // no annotations for this gv
+ return;
+
+ if ((*annotationCache).find(m) != (*annotationCache).end())
+ (*annotationCache)[m][gv] = tmp;
+ else {
+ global_val_annot_t tmp1;
+ tmp1[gv] = tmp;
+ (*annotationCache)[m] = tmp1;
+ }
+}
+
+bool llvm::findOneNVVMAnnotation(const GlobalValue *gv, std::string prop,
+ unsigned &retval) {
+ MutexGuard Guard(Lock);
+ const Module *m = gv->getParent();
+ if ((*annotationCache).find(m) == (*annotationCache).end())
+ cacheAnnotationFromMD(m, gv);
+ else if ((*annotationCache)[m].find(gv) == (*annotationCache)[m].end())
+ cacheAnnotationFromMD(m, gv);
+ if ((*annotationCache)[m][gv].find(prop) == (*annotationCache)[m][gv].end())
+ return false;
+ retval = (*annotationCache)[m][gv][prop][0];
+ return true;
+}
+
+bool llvm::findAllNVVMAnnotation(const GlobalValue *gv, std::string prop,
+ std::vector<unsigned> &retval) {
+ MutexGuard Guard(Lock);
+ const Module *m = gv->getParent();
+ if ((*annotationCache).find(m) == (*annotationCache).end())
+ cacheAnnotationFromMD(m, gv);
+ else if ((*annotationCache)[m].find(gv) == (*annotationCache)[m].end())
+ cacheAnnotationFromMD(m, gv);
+ if ((*annotationCache)[m][gv].find(prop) == (*annotationCache)[m][gv].end())
+ return false;
+ retval = (*annotationCache)[m][gv][prop];
+ return true;
+}
+
+bool llvm::isTexture(const llvm::Value &val) {
+ if (const GlobalValue *gv = dyn_cast<GlobalValue>(&val)) {
+ unsigned annot;
+ if (llvm::findOneNVVMAnnotation(
+ gv, llvm::PropertyAnnotationNames[llvm::PROPERTY_ISTEXTURE],
+ annot)) {
+ assert((annot == 1) && "Unexpected annotation on a texture symbol");
+ return true;
+ }
+ }
+ return false;
+}
+
+bool llvm::isSurface(const llvm::Value &val) {
+ if (const GlobalValue *gv = dyn_cast<GlobalValue>(&val)) {
+ unsigned annot;
+ if (llvm::findOneNVVMAnnotation(
+ gv, llvm::PropertyAnnotationNames[llvm::PROPERTY_ISSURFACE],
+ annot)) {
+ assert((annot == 1) && "Unexpected annotation on a surface symbol");
+ return true;
+ }
+ }
+ return false;
+}
+
+bool llvm::isSampler(const llvm::Value &val) {
+ if (const GlobalValue *gv = dyn_cast<GlobalValue>(&val)) {
+ unsigned annot;
+ if (llvm::findOneNVVMAnnotation(
+ gv, llvm::PropertyAnnotationNames[llvm::PROPERTY_ISSAMPLER],
+ annot)) {
+ assert((annot == 1) && "Unexpected annotation on a sampler symbol");
+ return true;
+ }
+ }
+ if (const Argument *arg = dyn_cast<Argument>(&val)) {
+ const Function *func = arg->getParent();
+ std::vector<unsigned> annot;
+ if (llvm::findAllNVVMAnnotation(
+ func, llvm::PropertyAnnotationNames[llvm::PROPERTY_ISSAMPLER],
+ annot)) {
+ if (std::find(annot.begin(), annot.end(), arg->getArgNo()) != annot.end())
+ return true;
+ }
+ }
+ return false;
+}
+
+bool llvm::isImageReadOnly(const llvm::Value &val) {
+ if (const Argument *arg = dyn_cast<Argument>(&val)) {
+ const Function *func = arg->getParent();
+ std::vector<unsigned> annot;
+ if (llvm::findAllNVVMAnnotation(func,
+ llvm::PropertyAnnotationNames[
+ llvm::PROPERTY_ISREADONLY_IMAGE_PARAM],
+ annot)) {
+ if (std::find(annot.begin(), annot.end(), arg->getArgNo()) != annot.end())
+ return true;
+ }
+ }
+ return false;
+}
+
+bool llvm::isImageWriteOnly(const llvm::Value &val) {
+ if (const Argument *arg = dyn_cast<Argument>(&val)) {
+ const Function *func = arg->getParent();
+ std::vector<unsigned> annot;
+ if (llvm::findAllNVVMAnnotation(func,
+ llvm::PropertyAnnotationNames[
+ llvm::PROPERTY_ISWRITEONLY_IMAGE_PARAM],
+ annot)) {
+ if (std::find(annot.begin(), annot.end(), arg->getArgNo()) != annot.end())
+ return true;
+ }
+ }
+ return false;
+}
+
+bool llvm::isImageReadWrite(const llvm::Value &val) {
+ if (const Argument *arg = dyn_cast<Argument>(&val)) {
+ const Function *func = arg->getParent();
+ std::vector<unsigned> annot;
+ if (llvm::findAllNVVMAnnotation(func,
+ llvm::PropertyAnnotationNames[
+ llvm::PROPERTY_ISREADWRITE_IMAGE_PARAM],
+ annot)) {
+ if (std::find(annot.begin(), annot.end(), arg->getArgNo()) != annot.end())
+ return true;
+ }
+ }
+ return false;
+}
+
+bool llvm::isImage(const llvm::Value &val) {
+ return llvm::isImageReadOnly(val) || llvm::isImageWriteOnly(val) ||
+ llvm::isImageReadWrite(val);
+}
+
+bool llvm::isManaged(const llvm::Value &val) {
+ if(const GlobalValue *gv = dyn_cast<GlobalValue>(&val)) {
+ unsigned annot;
+ if(llvm::findOneNVVMAnnotation(gv,
+ llvm::PropertyAnnotationNames[llvm::PROPERTY_MANAGED],
+ annot)) {
+ assert((annot == 1) && "Unexpected annotation on a managed symbol");
+ return true;
+ }
+ }
+ return false;
+}
+
+std::string llvm::getTextureName(const llvm::Value &val) {
+ assert(val.hasName() && "Found texture variable with no name");
+ return val.getName();
+}
+
+std::string llvm::getSurfaceName(const llvm::Value &val) {
+ assert(val.hasName() && "Found surface variable with no name");
+ return val.getName();
+}
+
+std::string llvm::getSamplerName(const llvm::Value &val) {
+ assert(val.hasName() && "Found sampler variable with no name");
+ return val.getName();
+}
+
+bool llvm::getMaxNTIDx(const Function &F, unsigned &x) {
+ return (llvm::findOneNVVMAnnotation(
+ &F, llvm::PropertyAnnotationNames[llvm::PROPERTY_MAXNTID_X], x));
+}
+
+bool llvm::getMaxNTIDy(const Function &F, unsigned &y) {
+ return (llvm::findOneNVVMAnnotation(
+ &F, llvm::PropertyAnnotationNames[llvm::PROPERTY_MAXNTID_Y], y));
+}
+
+bool llvm::getMaxNTIDz(const Function &F, unsigned &z) {
+ return (llvm::findOneNVVMAnnotation(
+ &F, llvm::PropertyAnnotationNames[llvm::PROPERTY_MAXNTID_Z], z));
+}
+
+bool llvm::getReqNTIDx(const Function &F, unsigned &x) {
+ return (llvm::findOneNVVMAnnotation(
+ &F, llvm::PropertyAnnotationNames[llvm::PROPERTY_REQNTID_X], x));
+}
+
+bool llvm::getReqNTIDy(const Function &F, unsigned &y) {
+ return (llvm::findOneNVVMAnnotation(
+ &F, llvm::PropertyAnnotationNames[llvm::PROPERTY_REQNTID_Y], y));
+}
+
+bool llvm::getReqNTIDz(const Function &F, unsigned &z) {
+ return (llvm::findOneNVVMAnnotation(
+ &F, llvm::PropertyAnnotationNames[llvm::PROPERTY_REQNTID_Z], z));
+}
+
+bool llvm::getMinCTASm(const Function &F, unsigned &x) {
+ return (llvm::findOneNVVMAnnotation(
+ &F, llvm::PropertyAnnotationNames[llvm::PROPERTY_MINNCTAPERSM], x));
+}
+
+bool llvm::isKernelFunction(const Function &F) {
+ unsigned x = 0;
+ bool retval = llvm::findOneNVVMAnnotation(
+ &F, llvm::PropertyAnnotationNames[llvm::PROPERTY_ISKERNEL_FUNCTION], x);
+ if (retval == false) {
+ // There is no NVVM metadata, check the calling convention
+ if (F.getCallingConv() == llvm::CallingConv::PTX_Kernel)
+ return true;
+ else
+ return false;
+ }
+ return (x == 1);
+}
+
+bool llvm::getAlign(const Function &F, unsigned index, unsigned &align) {
+ std::vector<unsigned> Vs;
+ bool retval = llvm::findAllNVVMAnnotation(
+ &F, llvm::PropertyAnnotationNames[llvm::PROPERTY_ALIGN], Vs);
+ if (retval == false)
+ return false;
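+  // Each entry packs the parameter index in its high 16 bits and the
+  // alignment value in its low 16 bits.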
+ for (int i = 0, e = Vs.size(); i < e; i++) {
+ unsigned v = Vs[i];
+ if ((v >> 16) == index) {
+ align = v & 0xFFFF;
+ return true;
+ }
+ }
+ return false;
+}
+
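+// The "callalign" metadata packs (index << 16) | alignment into each operand;
+// the entries are assumed to be sorted by index, which the early exit relies
+// on.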
+bool llvm::getAlign(const CallInst &I, unsigned index, unsigned &align) {
+ if (MDNode *alignNode = I.getMetadata("callalign")) {
+ for (int i = 0, n = alignNode->getNumOperands(); i < n; i++) {
+ if (const ConstantInt *CI =
+ dyn_cast<ConstantInt>(alignNode->getOperand(i))) {
+ unsigned v = CI->getZExtValue();
+ if ((v >> 16) == index) {
+ align = v & 0xFFFF;
+ return true;
+ }
+ if ((v >> 16) > index) {
+ return false;
+ }
+ }
+ }
+ }
+ return false;
+}
+
+bool llvm::isBarrierIntrinsic(Intrinsic::ID id) {
+ if ((id == Intrinsic::nvvm_barrier0) ||
+ (id == Intrinsic::nvvm_barrier0_popc) ||
+ (id == Intrinsic::nvvm_barrier0_and) ||
+ (id == Intrinsic::nvvm_barrier0_or) ||
+ (id == Intrinsic::cuda_syncthreads))
+ return true;
+ return false;
+}
+
+// Interface for checking all memory space transfer related intrinsics
+bool llvm::isMemorySpaceTransferIntrinsic(Intrinsic::ID id) {
+ if (id == Intrinsic::nvvm_ptr_local_to_gen ||
+ id == Intrinsic::nvvm_ptr_shared_to_gen ||
+ id == Intrinsic::nvvm_ptr_global_to_gen ||
+ id == Intrinsic::nvvm_ptr_constant_to_gen ||
+ id == Intrinsic::nvvm_ptr_gen_to_global ||
+ id == Intrinsic::nvvm_ptr_gen_to_shared ||
+ id == Intrinsic::nvvm_ptr_gen_to_local ||
+ id == Intrinsic::nvvm_ptr_gen_to_constant ||
+ id == Intrinsic::nvvm_ptr_gen_to_param) {
+ return true;
+ }
+
+ return false;
+}
+
+// Consider several special intrinsics when stripping pointer casts, and
+// provide an option to ignore GEP indices so that only the base address is
+// found, which can be used in simple alias disambiguation.
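+//
+// For example (illustrative IR), given
+//   %gen = call i8* @llvm.nvvm.ptr.global.to.gen.p0i8.p1i8(i8 addrspace(1)* %g)
+//   %p   = getelementptr i8* %gen, i32 4
+// skipPointerTransfer(%p, /*ignore_GEP_indices=*/true) returns %g.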
+const Value *
+llvm::skipPointerTransfer(const Value *V, bool ignore_GEP_indices) {
+ V = V->stripPointerCasts();
+ while (true) {
+ if (const IntrinsicInst *IS = dyn_cast<IntrinsicInst>(V)) {
+ if (isMemorySpaceTransferIntrinsic(IS->getIntrinsicID())) {
+ V = IS->getArgOperand(0)->stripPointerCasts();
+ continue;
+ }
+ } else if (ignore_GEP_indices)
+ if (const GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
+ V = GEP->getPointerOperand()->stripPointerCasts();
+ continue;
+ }
+ break;
+ }
+ return V;
+}
+
+// Consider several special intrinsics when stripping pointer casts, and
+// - ignore GEP indices so that only the base address is found, and
+// - track through PHINodes,
+// which can be used in simple alias disambiguation.
+const Value *
+llvm::skipPointerTransfer(const Value *V, std::set<const Value *> &processed) {
+ if (processed.find(V) != processed.end())
+ return nullptr;
+ processed.insert(V);
+
+ const Value *V2 = V->stripPointerCasts();
+ if (V2 != V && processed.find(V2) != processed.end())
+ return nullptr;
+ processed.insert(V2);
+
+ V = V2;
+
+ while (true) {
+ if (const IntrinsicInst *IS = dyn_cast<IntrinsicInst>(V)) {
+ if (isMemorySpaceTransferIntrinsic(IS->getIntrinsicID())) {
+ V = IS->getArgOperand(0)->stripPointerCasts();
+ continue;
+ }
+ } else if (const GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
+ V = GEP->getPointerOperand()->stripPointerCasts();
+ continue;
+ } else if (const PHINode *PN = dyn_cast<PHINode>(V)) {
+ if (V != V2 && processed.find(V) != processed.end())
+ return nullptr;
+ processed.insert(PN);
+ const Value *common = nullptr;
+ for (unsigned i = 0; i != PN->getNumIncomingValues(); ++i) {
+ const Value *pv = PN->getIncomingValue(i);
+ const Value *base = skipPointerTransfer(pv, processed);
+ if (base) {
+ if (!common)
+ common = base;
+ else if (common != base)
+ return PN;
+ }
+ }
+ if (!common)
+ return PN;
+ V = common;
+ }
+ break;
+ }
+ return V;
+}
+
+// The following are some useful utilities for debugging
+
+BasicBlock *llvm::getParentBlock(Value *v) {
+ if (BasicBlock *B = dyn_cast<BasicBlock>(v))
+ return B;
+
+ if (Instruction *I = dyn_cast<Instruction>(v))
+ return I->getParent();
+
+ return nullptr;
+}
+
+Function *llvm::getParentFunction(Value *v) {
+ if (Function *F = dyn_cast<Function>(v))
+ return F;
+
+ if (Instruction *I = dyn_cast<Instruction>(v))
+ return I->getParent()->getParent();
+
+ if (BasicBlock *B = dyn_cast<BasicBlock>(v))
+ return B->getParent();
+
+ return nullptr;
+}
+
+// Dump a block by name
+void llvm::dumpBlock(Value *v, char *blockName) {
+ Function *F = getParentFunction(v);
+ if (!F)
+ return;
+
+ for (Function::iterator it = F->begin(), ie = F->end(); it != ie; ++it) {
+ BasicBlock *B = it;
+ if (strcmp(B->getName().data(), blockName) == 0) {
+ B->dump();
+ return;
+ }
+ }
+}
+
+// Find an instruction by name
+Instruction *llvm::getInst(Value *base, char *instName) {
+ Function *F = getParentFunction(base);
+ if (!F)
+ return nullptr;
+
+ for (inst_iterator it = inst_begin(F), ie = inst_end(F); it != ie; ++it) {
+ Instruction *I = &*it;
+ if (strcmp(I->getName().data(), instName) == 0) {
+ return I;
+ }
+ }
+
+ return nullptr;
+}
+
+// Dump an instruction by name
+void llvm::dumpInst(Value *base, char *instName) {
+ Instruction *I = getInst(base, instName);
+ if (I)
+ I->dump();
+}
+
+// Dump an instruction and all dependent instructions
+void llvm::dumpInstRec(Value *v, std::set<Instruction *> *visited) {
+ if (Instruction *I = dyn_cast<Instruction>(v)) {
+
+ if (visited->find(I) != visited->end())
+ return;
+
+ visited->insert(I);
+
+ for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i)
+ dumpInstRec(I->getOperand(i), visited);
+
+ I->dump();
+ }
+}
+
+// Dump an instruction and all dependent instructions
+void llvm::dumpInstRec(Value *v) {
+ std::set<Instruction *> visited;
+
+ //BasicBlock *B = getParentBlock(v);
+
+ dumpInstRec(v, &visited);
+}
+
+// Dump the parent for Instruction, block or function
+void llvm::dumpParent(Value *v) {
+ if (Instruction *I = dyn_cast<Instruction>(v)) {
+ I->getParent()->dump();
+ return;
+ }
+
+ if (BasicBlock *B = dyn_cast<BasicBlock>(v)) {
+ B->getParent()->dump();
+ return;
+ }
+
+ if (Function *F = dyn_cast<Function>(v)) {
+ F->getParent()->dump();
+ return;
+ }
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXUtilities.h b/contrib/llvm/lib/Target/NVPTX/NVPTXUtilities.h
new file mode 100644
index 000000000000..446bfa1e112c
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXUtilities.h
@@ -0,0 +1,96 @@
+//===-- NVPTXUtilities - Utilities -----------------------------*- C++ -*-====//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the NVVM specific utility functions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef NVPTXUTILITIES_H
+#define NVPTXUTILITIES_H
+
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Value.h"
+#include <cstdarg>
+#include <set>
+#include <string>
+#include <vector>
+
+namespace llvm {
+
+#define NVCL_IMAGE2D_READONLY_FUNCNAME "__is_image2D_readonly"
+#define NVCL_IMAGE3D_READONLY_FUNCNAME "__is_image3D_readonly"
+
+void clearAnnotationCache(const llvm::Module *);
+
+bool findOneNVVMAnnotation(const llvm::GlobalValue *, std::string, unsigned &);
+bool findAllNVVMAnnotation(const llvm::GlobalValue *, std::string,
+ std::vector<unsigned> &);
+
+bool isTexture(const llvm::Value &);
+bool isSurface(const llvm::Value &);
+bool isSampler(const llvm::Value &);
+bool isImage(const llvm::Value &);
+bool isImageReadOnly(const llvm::Value &);
+bool isImageWriteOnly(const llvm::Value &);
+bool isImageReadWrite(const llvm::Value &);
+bool isManaged(const llvm::Value &);
+
+std::string getTextureName(const llvm::Value &);
+std::string getSurfaceName(const llvm::Value &);
+std::string getSamplerName(const llvm::Value &);
+
+bool getMaxNTIDx(const llvm::Function &, unsigned &);
+bool getMaxNTIDy(const llvm::Function &, unsigned &);
+bool getMaxNTIDz(const llvm::Function &, unsigned &);
+
+bool getReqNTIDx(const llvm::Function &, unsigned &);
+bool getReqNTIDy(const llvm::Function &, unsigned &);
+bool getReqNTIDz(const llvm::Function &, unsigned &);
+
+bool getMinCTASm(const llvm::Function &, unsigned &);
+bool isKernelFunction(const llvm::Function &);
+
+bool getAlign(const llvm::Function &, unsigned index, unsigned &);
+bool getAlign(const llvm::CallInst &, unsigned index, unsigned &);
+
+bool isBarrierIntrinsic(llvm::Intrinsic::ID);
+
+/// make_vector - Helper function which is useful for building temporary vectors
+/// to pass into type construction of CallInst ctors. This turns a null
+/// terminated list of pointers (or other value types) into a real live vector.
+///
+template <typename T> inline std::vector<T> make_vector(T A, ...) {
+ va_list Args;
+ va_start(Args, A);
+ std::vector<T> Result;
+ Result.push_back(A);
+ while (T Val = va_arg(Args, T))
+ Result.push_back(Val);
+ va_end(Args);
+ return Result;
+}
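+
+// Example use (with hypothetical operands): make_vector<Value *>(Op0, Op1,
+// (Value *)nullptr) returns the vector {Op0, Op1}; the trailing null pointer
+// terminates the list.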
+
+bool isMemorySpaceTransferIntrinsic(Intrinsic::ID id);
+const Value *skipPointerTransfer(const Value *V, bool ignore_GEP_indices);
+const Value *
+skipPointerTransfer(const Value *V, std::set<const Value *> &processed);
+BasicBlock *getParentBlock(Value *v);
+Function *getParentFunction(Value *v);
+void dumpBlock(Value *v, char *blockName);
+Instruction *getInst(Value *base, char *instName);
+void dumpInst(Value *base, char *instName);
+void dumpInstRec(Value *v, std::set<Instruction *> *visited);
+void dumpInstRec(Value *v);
+void dumpParent(Value *v);
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXVector.td b/contrib/llvm/lib/Target/NVPTX/NVPTXVector.td
new file mode 100644
index 000000000000..775df19be162
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXVector.td
@@ -0,0 +1,1481 @@
+//===- NVPTXVector.td - NVPTX Vector Specific Instruction defs -*- tblgen-*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//-----------------------------------
+// Vector Specific
+//-----------------------------------
+
+//
+// All vector instructions derive from NVPTXVecInst
+//
+
+class NVPTXVecInst<dag outs, dag ins, string asmstr, list<dag> pattern,
+ NVPTXInst sInst=NOP>
+ : NVPTXInst<outs, ins, asmstr, pattern> {
+ NVPTXInst scalarInst=sInst;
+}
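+
+// The scalarInst field records the scalar counterpart of each vector
+// instruction; presumably it is used when a vector operation must be split
+// into per-element scalar operations.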
+
+let isAsCheapAsAMove=1, VecInstType=isVecExtract.Value in {
+// Extract v2i16
+def V2i16Extract : NVPTXVecInst<(outs Int16Regs:$dst),
+ (ins V2I16Regs:$src, i8imm:$c),
+ "mov.u16 \t$dst, $src${c:vecelem};",
+ [(set Int16Regs:$dst, (vector_extract
+ (v2i16 V2I16Regs:$src), imm:$c))],
+ IMOV16rr>;
+
+// Extract v4i16
+def V4i16Extract : NVPTXVecInst<(outs Int16Regs:$dst),
+ (ins V4I16Regs:$src, i8imm:$c),
+ "mov.u16 \t$dst, $src${c:vecelem};",
+ [(set Int16Regs:$dst, (vector_extract
+ (v4i16 V4I16Regs:$src), imm:$c))],
+ IMOV16rr>;
+
+// Extract v2i8
+def V2i8Extract : NVPTXVecInst<(outs Int8Regs:$dst),
+ (ins V2I8Regs:$src, i8imm:$c),
+ "mov.u16 \t$dst, $src${c:vecelem};",
+ [(set Int8Regs:$dst, (vector_extract
+ (v2i8 V2I8Regs:$src), imm:$c))],
+ IMOV8rr>;
+
+// Extract v4i8
+def V4i8Extract : NVPTXVecInst<(outs Int8Regs:$dst),
+ (ins V4I8Regs:$src, i8imm:$c),
+ "mov.u16 \t$dst, $src${c:vecelem};",
+ [(set Int8Regs:$dst, (vector_extract
+ (v4i8 V4I8Regs:$src), imm:$c))],
+ IMOV8rr>;
+
+// Extract v2i32
+def V2i32Extract : NVPTXVecInst<(outs Int32Regs:$dst),
+ (ins V2I32Regs:$src, i8imm:$c),
+ "mov.u32 \t$dst, $src${c:vecelem};",
+ [(set Int32Regs:$dst, (vector_extract
+ (v2i32 V2I32Regs:$src), imm:$c))],
+ IMOV32rr>;
+
+// Extract v2f32
+def V2f32Extract : NVPTXVecInst<(outs Float32Regs:$dst),
+ (ins V2F32Regs:$src, i8imm:$c),
+ "mov.f32 \t$dst, $src${c:vecelem};",
+ [(set Float32Regs:$dst, (vector_extract
+ (v2f32 V2F32Regs:$src), imm:$c))],
+ FMOV32rr>;
+
+// Extract v2i64
+def V2i64Extract : NVPTXVecInst<(outs Int64Regs:$dst),
+ (ins V2I64Regs:$src, i8imm:$c),
+ "mov.u64 \t$dst, $src${c:vecelem};",
+ [(set Int64Regs:$dst, (vector_extract
+ (v2i64 V2I64Regs:$src), imm:$c))],
+ IMOV64rr>;
+
+// Extract v2f64
+def V2f64Extract : NVPTXVecInst<(outs Float64Regs:$dst),
+ (ins V2F64Regs:$src, i8imm:$c),
+ "mov.f64 \t$dst, $src${c:vecelem};",
+ [(set Float64Regs:$dst, (vector_extract
+ (v2f64 V2F64Regs:$src), imm:$c))],
+ FMOV64rr>;
+
+// Extract v4i32
+def V4i32Extract : NVPTXVecInst<(outs Int32Regs:$dst),
+ (ins V4I32Regs:$src, i8imm:$c),
+ "mov.u32 \t$dst, $src${c:vecelem};",
+ [(set Int32Regs:$dst, (vector_extract
+ (v4i32 V4I32Regs:$src), imm:$c))],
+ IMOV32rr>;
+
+// Extract v4f32
+def V4f32Extract : NVPTXVecInst<(outs Float32Regs:$dst),
+ (ins V4F32Regs:$src, i8imm:$c),
+ "mov.f32 \t$dst, $src${c:vecelem};",
+ [(set Float32Regs:$dst, (vector_extract
+ (v4f32 V4F32Regs:$src), imm:$c))],
+ FMOV32rr>;
+}
+
+let isAsCheapAsAMove=1, VecInstType=isVecInsert.Value in {
+// Insert v2i8
+def V2i8Insert : NVPTXVecInst<(outs V2I8Regs:$dst),
+ (ins V2I8Regs:$src, Int8Regs:$val, i8imm:$c),
+ "mov.v2.u16 \t${dst:vecfull}, ${src:vecfull};"
+ "\n\tmov.u16 \t$dst${c:vecelem}, $val;",
+ [(set V2I8Regs:$dst,
+ (vector_insert V2I8Regs:$src, Int8Regs:$val, imm:$c))],
+ IMOV8rr>;
+
+// Insert v4i8
+def V4i8Insert : NVPTXVecInst<(outs V4I8Regs:$dst),
+ (ins V4I8Regs:$src, Int8Regs:$val, i8imm:$c),
+ "mov.v4.u16 \t${dst:vecfull}, ${src:vecfull};"
+ "\n\tmov.u16 \t$dst${c:vecelem}, $val;",
+ [(set V4I8Regs:$dst,
+ (vector_insert V4I8Regs:$src, Int8Regs:$val, imm:$c))],
+ IMOV8rr>;
+
+// Insert v2i16
+def V2i16Insert : NVPTXVecInst<(outs V2I16Regs:$dst),
+ (ins V2I16Regs:$src, Int16Regs:$val, i8imm:$c),
+ "mov.v2.u16 \t${dst:vecfull}, ${src:vecfull};"
+ "\n\tmov.u16 \t$dst${c:vecelem}, $val;",
+ [(set V2I16Regs:$dst,
+ (vector_insert V2I16Regs:$src, Int16Regs:$val, imm:$c))],
+ IMOV16rr>;
+
+// Insert v4i16
+def V4i16Insert : NVPTXVecInst<(outs V4I16Regs:$dst),
+ (ins V4I16Regs:$src, Int16Regs:$val, i8imm:$c),
+ "mov.v4.u16 \t${dst:vecfull}, ${src:vecfull};"
+ "\n\tmov.u16 \t$dst${c:vecelem}, $val;",
+ [(set V4I16Regs:$dst,
+ (vector_insert V4I16Regs:$src, Int16Regs:$val, imm:$c))],
+ IMOV16rr>;
+
+// Insert v2i32
+def V2i32Insert : NVPTXVecInst<(outs V2I32Regs:$dst),
+ (ins V2I32Regs:$src, Int32Regs:$val, i8imm:$c),
+ "mov.v2.u32 \t${dst:vecfull}, ${src:vecfull};"
+ "\n\tmov.u32 \t$dst${c:vecelem}, $val;",
+ [(set V2I32Regs:$dst,
+ (vector_insert V2I32Regs:$src, Int32Regs:$val, imm:$c))],
+ IMOV32rr>;
+
+// Insert v2f32
+def V2f32Insert : NVPTXVecInst<(outs V2F32Regs:$dst),
+ (ins V2F32Regs:$src, Float32Regs:$val, i8imm:$c),
+ "mov.v2.f32 \t${dst:vecfull}, ${src:vecfull};"
+ "\n\tmov.f32 \t$dst${c:vecelem}, $val;",
+ [(set V2F32Regs:$dst,
+ (vector_insert V2F32Regs:$src, Float32Regs:$val, imm:$c))],
+ FMOV32rr>;
+
+// Insert v2i64
+def V2i64Insert : NVPTXVecInst<(outs V2I64Regs:$dst),
+ (ins V2I64Regs:$src, Int64Regs:$val, i8imm:$c),
+ "mov.v2.u64 \t${dst:vecfull}, ${src:vecfull};"
+ "\n\tmov.u64 \t$dst${c:vecelem}, $val;",
+ [(set V2I64Regs:$dst,
+ (vector_insert V2I64Regs:$src, Int64Regs:$val, imm:$c))],
+ IMOV64rr>;
+
+// Insert v2f64
+def V2f64Insert : NVPTXVecInst<(outs V2F64Regs:$dst),
+ (ins V2F64Regs:$src, Float64Regs:$val, i8imm:$c),
+ "mov.v2.f64 \t${dst:vecfull}, ${src:vecfull};"
+ "\n\tmov.f64 \t$dst${c:vecelem}, $val;",
+ [(set V2F64Regs:$dst,
+ (vector_insert V2F64Regs:$src, Float64Regs:$val, imm:$c))],
+ FMOV64rr>;
+
+// Insert v4i32
+def V4i32Insert : NVPTXVecInst<(outs V4I32Regs:$dst),
+ (ins V4I32Regs:$src, Int32Regs:$val, i8imm:$c),
+ "mov.v4.u32 \t${dst:vecfull}, ${src:vecfull};"
+ "\n\tmov.u32 \t$dst${c:vecelem}, $val;",
+ [(set V4I32Regs:$dst,
+ (vector_insert V4I32Regs:$src, Int32Regs:$val, imm:$c))],
+ IMOV32rr>;
+
+// Insert v4f32
+def V4f32Insert : NVPTXVecInst<(outs V4F32Regs:$dst),
+ (ins V4F32Regs:$src, Float32Regs:$val, i8imm:$c),
+ "mov.v4.f32 \t${dst:vecfull}, ${src:vecfull};"
+ "\n\tmov.f32 \t$dst${c:vecelem}, $val;",
+ [(set V4F32Regs:$dst,
+ (vector_insert V4F32Regs:$src, Float32Regs:$val, imm:$c))],
+ FMOV32rr>;
+}
+
+class BinOpAsmString<string c> {
+ string s = c;
+}
+
+class V4AsmStr<string opcode> : BinOpAsmString<
+ !strconcat(!strconcat(!strconcat(!strconcat(
+ !strconcat(!strconcat(!strconcat(
+ opcode, " \t${dst}_0, ${a}_0, ${b}_0;\n\t"),
+ opcode), " \t${dst}_1, ${a}_1, ${b}_1;\n\t"),
+ opcode), " \t${dst}_2, ${a}_2, ${b}_2;\n\t"),
+ opcode), " \t${dst}_3, ${a}_3, ${b}_3;")>;
+
+class V2AsmStr<string opcode> : BinOpAsmString<
+ !strconcat(!strconcat(!strconcat(
+ opcode, " \t${dst}_0, ${a}_0, ${b}_0;\n\t"),
+ opcode), " \t${dst}_1, ${a}_1, ${b}_1;")>;
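+
+// For example, V2AsmStr<"add.s32"> expands to
+//   "add.s32 \t${dst}_0, ${a}_0, ${b}_0;\n\tadd.s32 \t${dst}_1, ${a}_1, ${b}_1;"
+// i.e. one PTX instruction per vector element.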
+
+class V4MADStr<string opcode> : BinOpAsmString<
+ !strconcat(!strconcat(!strconcat(!strconcat(
+ !strconcat(!strconcat(!strconcat(
+ opcode, " \t${dst}_0, ${a}_0, ${b}_0, ${c}_0;\n\t"),
+ opcode), " \t${dst}_1, ${a}_1, ${b}_1, ${c}_1;\n\t"),
+ opcode), " \t${dst}_2, ${a}_2, ${b}_2, ${c}_2;\n\t"),
+ opcode), " \t${dst}_3, ${a}_3, ${b}_3, ${c}_3;")>;
+
+class V2MADStr<string opcode> : BinOpAsmString<
+ !strconcat(!strconcat(!strconcat(
+ opcode, " \t${dst}_0, ${a}_0, ${b}_0, ${c}_0;\n\t"),
+ opcode), " \t${dst}_1, ${a}_1, ${b}_1, ${c}_1;")>;
+
+class V4UnaryStr<string opcode> : BinOpAsmString<
+ !strconcat(!strconcat(!strconcat(!strconcat(
+ !strconcat(!strconcat(!strconcat(
+ opcode, " \t${dst}_0, ${a}_0;\n\t"),
+ opcode), " \t${dst}_1, ${a}_1;\n\t"),
+ opcode), " \t${dst}_2, ${a}_2;\n\t"),
+ opcode), " \t${dst}_3, ${a}_3;")>;
+
+class V2UnaryStr<string opcode> : BinOpAsmString<
+ !strconcat(!strconcat(!strconcat(
+ opcode, " \t${dst}_0, ${a}_0;\n\t"),
+ opcode), " \t${dst}_1, ${a}_1;")>;
+
+class VecBinaryOp<BinOpAsmString asmstr, SDNode OpNode, NVPTXRegClass regclass,
+ NVPTXInst sInst=NOP> :
+ NVPTXVecInst<(outs regclass:$dst), (ins regclass:$a, regclass:$b),
+ asmstr.s,
+ [(set regclass:$dst, (OpNode regclass:$a, regclass:$b))],
+ sInst>;
+
+class VecShiftOp<BinOpAsmString asmstr, SDNode OpNode, NVPTXRegClass regclass1,
+ NVPTXRegClass regclass2, NVPTXInst sInst=NOP> :
+ NVPTXVecInst<(outs regclass1:$dst), (ins regclass1:$a, regclass2:$b),
+ asmstr.s,
+ [(set regclass1:$dst, (OpNode regclass1:$a, regclass2:$b))],
+ sInst>;
+
+class VecUnaryOp<BinOpAsmString asmstr, PatFrag OpNode, NVPTXRegClass regclass,
+ NVPTXInst sInst=NOP> :
+ NVPTXVecInst<(outs regclass:$dst), (ins regclass:$a),
+ asmstr.s,
+ [(set regclass:$dst, (OpNode regclass:$a))], sInst>;
+
+multiclass IntBinVOp<string asmstr, SDNode OpNode,
+ NVPTXInst i64op=NOP, NVPTXInst i32op=NOP, NVPTXInst
+ i16op=NOP, NVPTXInst i8op=NOP> {
+ def V2I64 : VecBinaryOp<V2AsmStr<!strconcat(asmstr, "64")>, OpNode, V2I64Regs,
+ i64op>;
+ def V4I32 : VecBinaryOp<V4AsmStr<!strconcat(asmstr, "32")>, OpNode, V4I32Regs,
+ i32op>;
+ def V2I32 : VecBinaryOp<V2AsmStr<!strconcat(asmstr, "32")>, OpNode, V2I32Regs,
+ i32op>;
+ def V4I16 : VecBinaryOp<V4AsmStr<!strconcat(asmstr, "16")>, OpNode, V4I16Regs,
+ i16op>;
+ def V2I16 : VecBinaryOp<V2AsmStr<!strconcat(asmstr, "16")>, OpNode, V2I16Regs,
+ i16op>;
+ def V4I8 : VecBinaryOp<V4AsmStr<!strconcat(asmstr, "16")>, OpNode, V4I8Regs,
+ i8op>;
+ def V2I8 : VecBinaryOp<V2AsmStr<!strconcat(asmstr, "16")>, OpNode, V2I8Regs,
+ i8op>;
+}
+
+multiclass FloatBinVOp<string asmstr, SDNode OpNode,
+ NVPTXInst f64=NOP, NVPTXInst f32=NOP,
+ NVPTXInst f32_ftz=NOP> {
+ def V2F64 : VecBinaryOp<V2AsmStr<!strconcat(asmstr, "f64")>, OpNode,
+ V2F64Regs, f64>;
+ def V4F32_ftz : VecBinaryOp<V4AsmStr<!strconcat(asmstr, "ftz.f32")>, OpNode,
+ V4F32Regs, f32_ftz>, Requires<[doF32FTZ]>;
+ def V2F32_ftz : VecBinaryOp<V2AsmStr<!strconcat(asmstr, "ftz.f32")>, OpNode,
+ V2F32Regs, f32_ftz>, Requires<[doF32FTZ]>;
+ def V4F32 : VecBinaryOp<V4AsmStr<!strconcat(asmstr, "f32")>, OpNode,
+ V4F32Regs, f32>;
+ def V2F32 : VecBinaryOp<V2AsmStr<!strconcat(asmstr, "f32")>, OpNode,
+ V2F32Regs, f32>;
+}
+
+multiclass IntUnaryVOp<string asmstr, PatFrag OpNode,
+ NVPTXInst i64op=NOP, NVPTXInst i32op=NOP,
+ NVPTXInst i16op=NOP, NVPTXInst i8op=NOP> {
+ def V2I64 : VecUnaryOp<V2UnaryStr<!strconcat(asmstr, "64")>, OpNode,
+ V2I64Regs, i64op>;
+ def V4I32 : VecUnaryOp<V4UnaryStr<!strconcat(asmstr, "32")>, OpNode,
+ V4I32Regs, i32op>;
+ def V2I32 : VecUnaryOp<V2UnaryStr<!strconcat(asmstr, "32")>, OpNode,
+ V2I32Regs, i32op>;
+ def V4I16 : VecUnaryOp<V4UnaryStr<!strconcat(asmstr, "16")>, OpNode,
+ V4I16Regs, i16op>;
+ def V2I16 : VecUnaryOp<V2UnaryStr<!strconcat(asmstr, "16")>, OpNode,
+ V2I16Regs, i16op>;
+ def V4I8 : VecUnaryOp<V4UnaryStr<!strconcat(asmstr, "16")>, OpNode,
+ V4I8Regs, i8op>;
+ def V2I8 : VecUnaryOp<V2UnaryStr<!strconcat(asmstr, "16")>, OpNode,
+ V2I8Regs, i8op>;
+}
+
+
+// Integer Arithmetic
+let VecInstType=isVecOther.Value in {
+defm VAdd : IntBinVOp<"add.s", add, ADDi64rr, ADDi32rr, ADDi16rr, ADDi8rr>;
+defm VSub : IntBinVOp<"sub.s", sub, SUBi64rr, SUBi32rr, SUBi16rr, SUBi8rr>;
+
+def AddCCV4I32 : VecBinaryOp<V4AsmStr<"add.cc.s32">, addc, V4I32Regs,
+ ADDCCi32rr>;
+def AddCCV2I32 : VecBinaryOp<V2AsmStr<"add.cc.s32">, addc, V2I32Regs,
+ ADDCCi32rr>;
+def SubCCV4I32 : VecBinaryOp<V4AsmStr<"sub.cc.s32">, subc, V4I32Regs,
+ SUBCCi32rr>;
+def SubCCV2I32 : VecBinaryOp<V2AsmStr<"sub.cc.s32">, subc, V2I32Regs,
+ SUBCCi32rr>;
+def AddCCCV4I32 : VecBinaryOp<V4AsmStr<"addc.cc.s32">, adde, V4I32Regs,
+ ADDCCCi32rr>;
+def AddCCCV2I32 : VecBinaryOp<V2AsmStr<"addc.cc.s32">, adde, V2I32Regs,
+ ADDCCCi32rr>;
+def SubCCCV4I32 : VecBinaryOp<V4AsmStr<"subc.cc.s32">, sube, V4I32Regs,
+ SUBCCCi32rr>;
+def SubCCCV2I32 : VecBinaryOp<V2AsmStr<"subc.cc.s32">, sube, V2I32Regs,
+ SUBCCCi32rr>;
+
+def ShiftLV2I64 : VecShiftOp<V2AsmStr<"shl.b64">, shl, V2I64Regs, V2I32Regs,
+ SHLi64rr>;
+def ShiftLV2I32 : VecShiftOp<V2AsmStr<"shl.b32">, shl, V2I32Regs, V2I32Regs,
+ SHLi32rr>;
+def ShiftLV4I32 : VecShiftOp<V4AsmStr<"shl.b32">, shl, V4I32Regs, V4I32Regs,
+ SHLi32rr>;
+def ShiftLV2I16 : VecShiftOp<V2AsmStr<"shl.b16">, shl, V2I16Regs, V2I32Regs,
+ SHLi16rr>;
+def ShiftLV4I16 : VecShiftOp<V4AsmStr<"shl.b16">, shl, V4I16Regs, V4I32Regs,
+ SHLi16rr>;
+def ShiftLV2I8 : VecShiftOp<V2AsmStr<"shl.b16">, shl, V2I8Regs, V2I32Regs,
+ SHLi8rr>;
+def ShiftLV4I8 : VecShiftOp<V4AsmStr<"shl.b16">, shl, V4I8Regs, V4I32Regs,
+ SHLi8rr>;
+}
+
+// cvt to v*i32, helpers for shift
+class CVTtoVeci32<NVPTXRegClass inclass, NVPTXRegClass outclass, string asmstr,
+ NVPTXInst sInst=NOP> :
+ NVPTXVecInst<(outs outclass:$d), (ins inclass:$s), asmstr, [], sInst>;
+
+class VecCVTStrHelper<string op, string dest, string src> {
+ string s=!strconcat(op, !strconcat("\t",
+ !strconcat(dest, !strconcat(", ", !strconcat(src, ";")))));
+}
+
+class Vec2CVTStr<string op> {
+ string s=!strconcat(VecCVTStrHelper<op, "${d}_0", "${s}_0">.s,
+ !strconcat("\n\t", VecCVTStrHelper<op, "${d}_1", "${s}_1">.s));
+}
+
+class Vec4CVTStr<string op> {
+ string s=!strconcat(VecCVTStrHelper<op, "${d}_0", "${s}_0">.s,
+ !strconcat("\n\t",
+ !strconcat(VecCVTStrHelper<op, "${d}_1", "${s}_1">.s,
+ !strconcat("\n\t",
+ !strconcat(VecCVTStrHelper<op, "${d}_2", "${s}_2">.s,
+ !strconcat("\n\t", VecCVTStrHelper<op, "${d}_3", "${s}_3">.s))))));
+}
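+
+// For example, Vec2CVTStr<"cvt.u32.u16">.s expands to
+//   "cvt.u32.u16\t${d}_0, ${s}_0;\n\tcvt.u32.u16\t${d}_1, ${s}_1;"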
+
+let VecInstType=isVecOther.Value in {
+def CVTv2i8tov2i32 : CVTtoVeci32<V2I8Regs, V2I32Regs,
+ Vec2CVTStr<"cvt.u32.u16">.s, Zint_extendext8to32>;
+def CVTv2i16tov2i32 : CVTtoVeci32<V2I16Regs, V2I32Regs,
+ Vec2CVTStr<"cvt.u32.u16">.s, Zint_extendext16to32>;
+def CVTv4i8tov4i32 : CVTtoVeci32<V4I8Regs, V4I32Regs,
+ Vec4CVTStr<"cvt.u32.u16">.s, Zint_extendext8to32>;
+def CVTv4i16tov4i32 : CVTtoVeci32<V4I16Regs, V4I32Regs,
+ Vec4CVTStr<"cvt.u32.u16">.s, Zint_extendext16to32>;
+def CVTv2i64tov2i32 : CVTtoVeci32<V2I64Regs, V2I32Regs,
+ Vec2CVTStr<"cvt.u32.u64">.s, TRUNC_64to32>;
+}
+
+def : Pat<(shl V2I16Regs:$src1, V2I16Regs:$src2),
+ (ShiftLV2I16 V2I16Regs:$src1, (CVTv2i16tov2i32 V2I16Regs:$src2))>;
+def : Pat<(shl V2I8Regs:$src1, V2I8Regs:$src2),
+ (ShiftLV2I8 V2I8Regs:$src1, (CVTv2i8tov2i32 V2I8Regs:$src2))>;
+def : Pat<(shl V2I64Regs:$src1, V2I64Regs:$src2),
+ (ShiftLV2I64 V2I64Regs:$src1, (CVTv2i64tov2i32 V2I64Regs:$src2))>;
+
+def : Pat<(shl V4I16Regs:$src1, V4I16Regs:$src2),
+ (ShiftLV4I16 V4I16Regs:$src1, (CVTv4i16tov4i32 V4I16Regs:$src2))>;
+def : Pat<(shl V4I8Regs:$src1, V4I8Regs:$src2),
+ (ShiftLV4I8 V4I8Regs:$src1, (CVTv4i8tov4i32 V4I8Regs:$src2))>;
+
+let VecInstType=isVecOther.Value in {
+def ShiftRAV2I64 : VecShiftOp<V2AsmStr<"shr.s64">, sra, V2I64Regs, V2I32Regs,
+ SRAi64rr>;
+def ShiftRAV2I32 : VecShiftOp<V2AsmStr<"shr.s32">, sra, V2I32Regs, V2I32Regs,
+ SRAi32rr>;
+def ShiftRAV4I32 : VecShiftOp<V4AsmStr<"shr.s32">, sra, V4I32Regs, V4I32Regs,
+ SRAi32rr>;
+def ShiftRAV2I16 : VecShiftOp<V2AsmStr<"shr.s16">, sra, V2I16Regs, V2I32Regs,
+ SRAi16rr>;
+def ShiftRAV4I16 : VecShiftOp<V4AsmStr<"shr.s16">, sra, V4I16Regs, V4I32Regs,
+ SRAi16rr>;
+def ShiftRAV2I8 : VecShiftOp<V2AsmStr<"shr.s16">, sra, V2I8Regs, V2I32Regs,
+ SRAi8rr>;
+def ShiftRAV4I8 : VecShiftOp<V4AsmStr<"shr.s16">, sra, V4I8Regs, V4I32Regs,
+ SRAi8rr>;
+
+def ShiftRLV2I64 : VecShiftOp<V2AsmStr<"shr.u64">, srl, V2I64Regs, V2I32Regs,
+ SRLi64rr>;
+def ShiftRLV2I32 : VecShiftOp<V2AsmStr<"shr.u32">, srl, V2I32Regs, V2I32Regs,
+ SRLi32rr>;
+def ShiftRLV4I32 : VecShiftOp<V4AsmStr<"shr.u32">, srl, V4I32Regs, V4I32Regs,
+ SRLi32rr>;
+def ShiftRLV2I16 : VecShiftOp<V2AsmStr<"shr.u16">, srl, V2I16Regs, V2I32Regs,
+ SRLi16rr>;
+def ShiftRLV4I16 : VecShiftOp<V4AsmStr<"shr.u16">, srl, V4I16Regs, V4I32Regs,
+ SRLi16rr>;
+def ShiftRLV2I8 : VecShiftOp<V2AsmStr<"shr.u16">, srl, V2I8Regs, V2I32Regs,
+ SRLi8rr>;
+def ShiftRLV4I8 : VecShiftOp<V4AsmStr<"shr.u16">, srl, V4I8Regs, V4I32Regs,
+ SRLi8rr>;
+
+defm VMult : IntBinVOp<"mul.lo.s", mul, MULTi64rr, MULTi32rr, MULTi16rr,
+ MULTi8rr>;
+defm VMultHS : IntBinVOp<"mul.hi.s", mulhs, MULTHSi64rr, MULTHSi32rr,
+ MULTHSi16rr,
+ MULTHSi8rr>;
+defm VMultHU : IntBinVOp<"mul.hi.u", mulhu, MULTHUi64rr, MULTHUi32rr,
+ MULTHUi16rr,
+ MULTHUi8rr>;
+defm VSDiv : IntBinVOp<"div.s", sdiv, SDIVi64rr, SDIVi32rr, SDIVi16rr,
+ SDIVi8rr>;
+defm VUDiv : IntBinVOp<"div.u", udiv, UDIVi64rr, UDIVi32rr, UDIVi16rr,
+ UDIVi8rr>;
+defm VSRem : IntBinVOp<"rem.s", srem, SREMi64rr, SREMi32rr, SREMi16rr,
+ SREMi8rr>;
+defm VURem : IntBinVOp<"rem.u", urem, UREMi64rr, UREMi32rr, UREMi16rr,
+ UREMi8rr>;
+}
+
+def : Pat<(sra V2I16Regs:$src1, V2I16Regs:$src2),
+ (ShiftRAV2I16 V2I16Regs:$src1, (CVTv2i16tov2i32 V2I16Regs:$src2))>;
+def : Pat<(sra V2I8Regs:$src1, V2I8Regs:$src2),
+ (ShiftRAV2I8 V2I8Regs:$src1, (CVTv2i8tov2i32 V2I8Regs:$src2))>;
+def : Pat<(sra V2I64Regs:$src1, V2I64Regs:$src2),
+ (ShiftRAV2I64 V2I64Regs:$src1, (CVTv2i64tov2i32 V2I64Regs:$src2))>;
+
+def : Pat<(sra V4I16Regs:$src1, V4I16Regs:$src2),
+ (ShiftRAV4I16 V4I16Regs:$src1, (CVTv4i16tov4i32 V4I16Regs:$src2))>;
+def : Pat<(sra V4I8Regs:$src1, V4I8Regs:$src2),
+ (ShiftRAV4I8 V4I8Regs:$src1, (CVTv4i8tov4i32 V4I8Regs:$src2))>;
+
+def : Pat<(srl V2I16Regs:$src1, V2I16Regs:$src2),
+ (ShiftRLV2I16 V2I16Regs:$src1, (CVTv2i16tov2i32 V2I16Regs:$src2))>;
+def : Pat<(srl V2I8Regs:$src1, V2I8Regs:$src2),
+ (ShiftRLV2I8 V2I8Regs:$src1, (CVTv2i8tov2i32 V2I8Regs:$src2))>;
+def : Pat<(srl V2I64Regs:$src1, V2I64Regs:$src2),
+ (ShiftRLV2I64 V2I64Regs:$src1, (CVTv2i64tov2i32 V2I64Regs:$src2))>;
+
+def : Pat<(srl V4I16Regs:$src1, V4I16Regs:$src2),
+ (ShiftRLV4I16 V4I16Regs:$src1, (CVTv4i16tov4i32 V4I16Regs:$src2))>;
+def : Pat<(srl V4I8Regs:$src1, V4I8Regs:$src2),
+ (ShiftRLV4I8 V4I8Regs:$src1, (CVTv4i8tov4i32 V4I8Regs:$src2))>;
+
+multiclass VMAD<string asmstr, NVPTXRegClass regclassv4,
+ NVPTXRegClass regclassv2,
+ SDNode an=add, SDNode mn=mul, NVPTXInst sop=NOP,
+ Predicate Pred> {
+ def V4 : NVPTXVecInst<(outs regclassv4:$dst),
+ (ins regclassv4:$a, regclassv4:$b, regclassv4:$c),
+ V4MADStr<asmstr>.s,
+ [(set regclassv4:$dst,
+ (an (mn regclassv4:$a, regclassv4:$b), regclassv4:$c))],
+ sop>,
+ Requires<[Pred]>;
+ def V2 : NVPTXVecInst<(outs regclassv2:$dst),
+ (ins regclassv2:$a, regclassv2:$b, regclassv2:$c),
+ V2MADStr<asmstr>.s,
+ [(set regclassv2:$dst,
+ (an (mn regclassv2:$a, regclassv2:$b), regclassv2:$c))],
+ sop>,
+ Requires<[Pred]>;
+}
+
+multiclass VMADV2Only<string asmstr, NVPTXRegClass regclass, NVPTXInst sop=NOP,
+ Predicate Pred> {
+ def V2 : NVPTXVecInst<(outs regclass:$dst),
+ (ins regclass:$a, regclass:$b, regclass:$c),
+ V2MADStr<asmstr>.s,
+ [(set regclass:$dst, (add
+ (mul regclass:$a, regclass:$b), regclass:$c))], sop>,
+ Requires<[Pred]>;
+}
+multiclass VFMADV2Only<string asmstr, NVPTXRegClass regclass, NVPTXInst sop=NOP,
+ Predicate Pred> {
+ def V2 : NVPTXVecInst<(outs regclass:$dst),
+ (ins regclass:$a, regclass:$b, regclass:$c),
+ V2MADStr<asmstr>.s,
+ [(set regclass:$dst, (fadd
+ (fmul regclass:$a, regclass:$b), regclass:$c))], sop>,
+ Requires<[Pred]>;
+}
+
+let VecInstType=isVecOther.Value in {
+defm I8MAD : VMAD<"mad.lo.s16", V4I8Regs, V2I8Regs, add, mul, MAD8rrr, true>;
+defm I16MAD : VMAD<"mad.lo.s16", V4I16Regs, V2I16Regs, add, mul, MAD16rrr,
+ true>;
+defm I32MAD : VMAD<"mad.lo.s32", V4I32Regs, V2I32Regs, add, mul, MAD32rrr,
+ true>;
+defm I64MAD : VMADV2Only<"mad.lo.s64", V2I64Regs, MAD64rrr, true>;
+
+defm VNeg : IntUnaryVOp<"neg.s", ineg, INEG64, INEG32, INEG16, INEG8>;
+
+defm VAddf : FloatBinVOp<"add.", fadd, FADDf64rr, FADDf32rr, FADDf32rr_ftz>;
+defm VSubf : FloatBinVOp<"sub.", fsub, FSUBf64rr, FSUBf32rr, FSUBf32rr_ftz>;
+defm VMulf : FloatBinVOp<"mul.", fmul, FMULf64rr, FMULf32rr, FMULf32rr_ftz>;
+
+defm F32MAD_ftz : VMAD<"mad.ftz.f32", V4F32Regs, V2F32Regs, fadd, fmul,
+ FMAD32_ftzrrr, doFMADF32_ftz>;
+defm F32FMA_ftz : VMAD<"fma.rn.ftz.f32", V4F32Regs, V2F32Regs, fadd, fmul,
+ FMA32_ftzrrr, doFMAF32_ftz>;
+defm F32MAD : VMAD<"mad.f32", V4F32Regs, V2F32Regs, fadd, fmul, FMAD32rrr,
+ doFMADF32>;
+defm F32FMA : VMAD<"fma.rn.f32", V4F32Regs, V2F32Regs, fadd, fmul, FMA32rrr,
+ doFMAF32>;
+defm F64FMA : VFMADV2Only<"fma.rn.f64", V2F64Regs, FMA64rrr, doFMAF64>;
+}
+
+let VecInstType=isVecOther.Value in {
+def V4F32Div_prec_ftz : VecBinaryOp<V4AsmStr<"div.rn.ftz.f32">, fdiv, V4F32Regs,
+ FDIV32rr_prec_ftz>, Requires<[doF32FTZ, reqPTX20]>;
+def V2F32Div_prec_ftz : VecBinaryOp<V2AsmStr<"div.rn.ftz.f32">, fdiv, V2F32Regs,
+ FDIV32rr_prec_ftz>, Requires<[doF32FTZ, reqPTX20]>;
+def V4F32Div_prec : VecBinaryOp<V4AsmStr<"div.rn.f32">, fdiv, V4F32Regs,
+ FDIV32rr_prec>, Requires<[reqPTX20]>;
+def V2F32Div_prec : VecBinaryOp<V2AsmStr<"div.rn.f32">, fdiv, V2F32Regs,
+ FDIV32rr_prec>, Requires<[reqPTX20]>;
+def V2F32Div_ftz : VecBinaryOp<V2AsmStr<"div.full.ftz.f32">, fdiv, V2F32Regs,
+ FDIV32rr_ftz>, Requires<[doF32FTZ]>;
+def V4F32Div_ftz : VecBinaryOp<V4AsmStr<"div.full.ftz.f32">, fdiv, V4F32Regs,
+ FDIV32rr_ftz>, Requires<[doF32FTZ]>;
+def V2F32Div : VecBinaryOp<V2AsmStr<"div.full.f32">, fdiv, V2F32Regs, FDIV32rr>;
+def V4F32Div : VecBinaryOp<V4AsmStr<"div.full.f32">, fdiv, V4F32Regs, FDIV32rr>;
+def V2F64Div : VecBinaryOp<V2AsmStr<"div.rn.f64">, fdiv, V2F64Regs, FDIV64rr>;
+}
+
+def fnegpat : PatFrag<(ops node:$in), (fneg node:$in)>;
+
+let VecInstType=isVecOther.Value in {
+def VNegv2f32_ftz : VecUnaryOp<V2UnaryStr<"neg.ftz.f32">, fnegpat, V2F32Regs,
+ FNEGf32_ftz>, Requires<[doF32FTZ]>;
+def VNegv4f32_ftz : VecUnaryOp<V4UnaryStr<"neg.ftz.f32">, fnegpat, V4F32Regs,
+ FNEGf32_ftz>, Requires<[doF32FTZ]>;
+def VNegv2f32 : VecUnaryOp<V2UnaryStr<"neg.f32">, fnegpat, V2F32Regs, FNEGf32>;
+def VNegv4f32 : VecUnaryOp<V4UnaryStr<"neg.f32">, fnegpat, V4F32Regs, FNEGf32>;
+def VNegv2f64 : VecUnaryOp<V2UnaryStr<"neg.f64">, fnegpat, V2F64Regs, FNEGf64>;
+
+// Logical Arithmetic
+defm VAnd : IntBinVOp<"and.b", and, ANDb64rr, ANDb32rr, ANDb16rr, ANDb8rr>;
+defm VOr : IntBinVOp<"or.b", or, ORb64rr, ORb32rr, ORb16rr, ORb8rr>;
+defm VXor : IntBinVOp<"xor.b", xor, XORb64rr, XORb32rr, XORb16rr, XORb8rr>;
+
+defm VNot : IntUnaryVOp<"not.b", not, NOT64, NOT32, NOT16, NOT8>;
+}
+
+
+multiclass V2FPCONTRACT32_SUB_PAT<NVPTXInst Inst, Predicate Pred> {
+ def : Pat<(fsub V2F32Regs:$a, (fmul V2F32Regs:$b, V2F32Regs:$c)),
+ (Inst (VNegv2f32 V2F32Regs:$b), V2F32Regs:$c, V2F32Regs:$a)>,
+ Requires<[Pred]>;
+
+ def : Pat<(fsub (fmul V2F32Regs:$a, V2F32Regs:$b), V2F32Regs:$c),
+ (Inst V2F32Regs:$a, V2F32Regs:$b, (VNegv2f32 V2F32Regs:$c))>,
+ Requires<[Pred]>;
+}
+
+defm V2FMAF32ext_ftz : V2FPCONTRACT32_SUB_PAT<F32FMA_ftzV2, doFMAF32AGG_ftz>;
+defm V2FMADF32ext_ftz : V2FPCONTRACT32_SUB_PAT<F32MAD_ftzV2, doFMADF32_ftz>;
+defm V2FMAF32ext : V2FPCONTRACT32_SUB_PAT<F32FMAV2, doFMAF32AGG>;
+defm V2FMADF32ext : V2FPCONTRACT32_SUB_PAT<F32MADV2, doFMADF32>;
+
+multiclass V4FPCONTRACT32_SUB_PAT<NVPTXInst Inst, Predicate Pred> {
+ def : Pat<(fsub V4F32Regs:$a, (fmul V4F32Regs:$b, V4F32Regs:$c)),
+ (Inst (VNegv4f32 V4F32Regs:$b), V4F32Regs:$c, V4F32Regs:$a)>,
+ Requires<[Pred]>;
+
+ def : Pat<(fsub (fmul V4F32Regs:$a, V4F32Regs:$b), V4F32Regs:$c),
+ (Inst V4F32Regs:$a, V4F32Regs:$b, (VNegv4f32 V4F32Regs:$c))>,
+ Requires<[Pred]>;
+}
+
+defm V4FMAF32ext_ftz : V4FPCONTRACT32_SUB_PAT<F32FMA_ftzV4, doFMAF32AGG_ftz>;
+defm V4FMADF32ext_ftz : V4FPCONTRACT32_SUB_PAT<F32MAD_ftzV4, doFMADF32_ftz>;
+defm V4FMAF32ext : V4FPCONTRACT32_SUB_PAT<F32FMAV4, doFMAF32AGG>;
+defm V4FMADF32ext : V4FPCONTRACT32_SUB_PAT<F32MADV4, doFMADF32>;
+
+multiclass V2FPCONTRACT64_SUB_PAT<NVPTXInst Inst, Predicate Pred> {
+ def : Pat<(fsub V2F64Regs:$a, (fmul V2F64Regs:$b, V2F64Regs:$c)),
+ (Inst (VNegv2f64 V2F64Regs:$b), V2F64Regs:$c, V2F64Regs:$a)>,
+ Requires<[Pred]>;
+
+ def : Pat<(fsub (fmul V2F64Regs:$a, V2F64Regs:$b), V2F64Regs:$c),
+ (Inst V2F64Regs:$a, V2F64Regs:$b, (VNegv2f64 V2F64Regs:$c))>,
+ Requires<[Pred]>;
+}
+
+defm V2FMAF64ext : V2FPCONTRACT64_SUB_PAT<F64FMAV2, doFMAF64AGG>;
+
+class VecModStr<string vecsize, string elem, string extra, string l="">
+{
+ string t1 = !strconcat("${c", elem);
+ string t2 = !strconcat(t1, ":vecv");
+ string t3 = !strconcat(t2, vecsize);
+ string t4 = !strconcat(t3, extra);
+ string t5 = !strconcat(t4, l);
+ string s = !strconcat(t5, "}");
+}
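+// For reference, given the concatenations above, VecModStr<"2", "0", "comm",
+// "1">.s expands to the operand-modifier string "${c0:vecv2comm1}" and
+// VecModStr<"2", "0", "pos">.s to "${c0:vecv2pos}"; the vecv* modifiers are
+// presumably resolved by the operand printer when the per-element strings
+// built below are emitted.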
+class ShuffleOneLine<string vecsize, string elem, string type>
+{
+ string t1 = VecModStr<vecsize, elem, "comm", "1">.s;
+ string t2 = !strconcat(t1, "mov.");
+ string t3 = !strconcat(t2, type);
+ string t4 = !strconcat(t3, " \t${dst}_");
+ string t5 = !strconcat(t4, elem);
+ string t6 = !strconcat(t5, ", $src1");
+ string t7 = !strconcat(t6, VecModStr<vecsize, elem, "pos">.s);
+ string t8 = !strconcat(t7, ";\n\t");
+ string t9 = !strconcat(t8, VecModStr<vecsize, elem, "comm", "2">.s);
+ string t10 = !strconcat(t9, "mov.");
+ string t11 = !strconcat(t10, type);
+ string t12 = !strconcat(t11, " \t${dst}_");
+ string t13 = !strconcat(t12, elem);
+ string t14 = !strconcat(t13, ", $src2");
+ string t15 = !strconcat(t14, VecModStr<vecsize, elem, "pos">.s);
+ string s = !strconcat(t15, ";");
+}
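+// For example, ShuffleOneLine<"2", "0", "f32">.s should expand to
+//   ${c0:vecv2comm1}mov.f32 \t${dst}_0, $src1${c0:vecv2pos};
+//   ${c0:vecv2comm2}mov.f32 \t${dst}_0, $src2${c0:vecv2pos};
+// i.e. one mov per source operand, with the comm/pos modifiers presumably
+// selecting which mov survives and which element position is read.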
+class ShuffleAsmStr2<string type>
+{
+ string t1 = ShuffleOneLine<"2", "0", type>.s;
+ string t2 = !strconcat(t1, "\n\t");
+ string s = !strconcat(t2, ShuffleOneLine<"2", "1", type>.s);
+}
+class ShuffleAsmStr4<string type>
+{
+ string t1 = ShuffleOneLine<"4", "0", type>.s;
+ string t2 = !strconcat(t1, "\n\t");
+ string t3 = !strconcat(t2, ShuffleOneLine<"4", "1", type>.s);
+ string t4 = !strconcat(t3, "\n\t");
+ string t5 = !strconcat(t4, ShuffleOneLine<"4", "2", type>.s);
+ string t6 = !strconcat(t5, "\n\t");
+ string s = !strconcat(t6, ShuffleOneLine<"4", "3", type>.s);
+}
+
+let neverHasSideEffects=1, VecInstType=isVecShuffle.Value in {
+def VecShuffle_v4f32 : NVPTXVecInst<(outs V4F32Regs:$dst),
+ (ins V4F32Regs:$src1, V4F32Regs:$src2,
+ i8imm:$c0, i8imm:$c1, i8imm:$c2, i8imm:$c3),
+ !strconcat("//Mov $dst, $src1, $src2, $c0, $c1, $c2, $c3;\n\t",
+ ShuffleAsmStr4<"f32">.s),
+ [], FMOV32rr>;
+
+def VecShuffle_v4i32 : NVPTXVecInst<(outs V4I32Regs:$dst),
+ (ins V4I32Regs:$src1, V4I32Regs:$src2,
+ i8imm:$c0, i8imm:$c1, i8imm:$c2, i8imm:$c3),
+ !strconcat("//Mov $dst, $src1, $src2, $c0, $c1, $c2, $c3;\n\t",
+ ShuffleAsmStr4<"u32">.s),
+ [], IMOV32rr>;
+
+def VecShuffle_v4i16 : NVPTXVecInst<(outs V4I16Regs:$dst),
+ (ins V4I16Regs:$src1, V4I16Regs:$src2,
+ i8imm:$c0, i8imm:$c1, i8imm:$c2, i8imm:$c3),
+ !strconcat("//Mov $dst, $src1, $src2, $c0, $c1, $c2, $c3;\n\t",
+ ShuffleAsmStr4<"u16">.s),
+ [], IMOV16rr>;
+
+def VecShuffle_v4i8 : NVPTXVecInst<(outs V4I8Regs:$dst),
+ (ins V4I8Regs:$src1, V4I8Regs:$src2,
+ i8imm:$c0, i8imm:$c1, i8imm:$c2, i8imm:$c3),
+ !strconcat("//Mov $dst, $src1, $src2, $c0, $c1, $c2, $c3;\n\t",
+ ShuffleAsmStr4<"u16">.s),
+ [], IMOV8rr>;
+
+def VecShuffle_v2f32 : NVPTXVecInst<(outs V2F32Regs:$dst),
+ (ins V2F32Regs:$src1, V2F32Regs:$src2,
+ i8imm:$c0, i8imm:$c1),
+ !strconcat("//Mov $dst, $src1, $src2, $c0, $c1;\n\t",
+ ShuffleAsmStr2<"f32">.s),
+ [], FMOV32rr>;
+
+def VecShuffle_v2i32 : NVPTXVecInst<(outs V2I32Regs:$dst),
+ (ins V2I32Regs:$src1, V2I32Regs:$src2,
+ i8imm:$c0, i8imm:$c1),
+ !strconcat("//Mov $dst, $src1, $src2, $c0, $c1;\n\t",
+ ShuffleAsmStr2<"u32">.s),
+ [], IMOV32rr>;
+
+def VecShuffle_v2i8 : NVPTXVecInst<(outs V2I8Regs:$dst),
+ (ins V2I8Regs:$src1, V2I8Regs:$src2,
+ i8imm:$c0, i8imm:$c1),
+ !strconcat("//Mov $dst, $src1, $src2, $c0, $c1;\n\t",
+ ShuffleAsmStr2<"u16">.s),
+ [], IMOV8rr>;
+
+def VecShuffle_v2i16 : NVPTXVecInst<(outs V2I16Regs:$dst),
+ (ins V2I16Regs:$src1, V2I16Regs:$src2,
+ i8imm:$c0, i8imm:$c1),
+ !strconcat("//Mov $dst, $src1, $src2, $c0, $c1;\n\t",
+ ShuffleAsmStr2<"u16">.s),
+ [], IMOV16rr>;
+
+def VecShuffle_v2f64 : NVPTXVecInst<(outs V2F64Regs:$dst),
+ (ins V2F64Regs:$src1, V2F64Regs:$src2,
+ i8imm:$c0, i8imm:$c1),
+ !strconcat("//Mov $dst, $src1, $src2, $c0, $c1;\n\t",
+ ShuffleAsmStr2<"f64">.s),
+ [], FMOV64rr>;
+
+def VecShuffle_v2i64 : NVPTXVecInst<(outs V2I64Regs:$dst),
+ (ins V2I64Regs:$src1, V2I64Regs:$src2,
+ i8imm:$c0, i8imm:$c1),
+ !strconcat("//Mov $dst, $src1, $src2, $c0, $c1;\n\t",
+ ShuffleAsmStr2<"u64">.s),
+ [], IMOV64rr>;
+}
+
+def ShuffleMask0 : SDNodeXForm<vector_shuffle, [{
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+ return CurDAG->getTargetConstant(SVOp->getMaskElt(0), MVT::i32);
+}]>;
+def ShuffleMask1 : SDNodeXForm<vector_shuffle, [{
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+ return CurDAG->getTargetConstant(SVOp->getMaskElt(1), MVT::i32);
+}]>;
+def ShuffleMask2 : SDNodeXForm<vector_shuffle, [{
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+ return CurDAG->getTargetConstant(SVOp->getMaskElt(2), MVT::i32);
+}]>;
+def ShuffleMask3 : SDNodeXForm<vector_shuffle, [{
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+ return CurDAG->getTargetConstant(SVOp->getMaskElt(3), MVT::i32);
+}]>;
+
+// The spurious call is here to silence a compiler warning about N being
+// unused.
+def vec_shuf : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs),
+ [{ N->getGluedNode(); return true; }]>;
+
+def : Pat<(v2f64 (vec_shuf:$op V2F64Regs:$src1, V2F64Regs:$src2)),
+ (VecShuffle_v2f64 V2F64Regs:$src1, V2F64Regs:$src2,
+ (ShuffleMask0 node:$op), (ShuffleMask1 node:$op))>;
+
+def : Pat<(v4f32 (vec_shuf:$op V4F32Regs:$src1, V4F32Regs:$src2)),
+ (VecShuffle_v4f32 V4F32Regs:$src1, V4F32Regs:$src2,
+ (ShuffleMask0 node:$op), (ShuffleMask1 node:$op),
+ (ShuffleMask2 node:$op), (ShuffleMask3 node:$op))>;
+
+def : Pat<(v2f32 (vec_shuf:$op V2F32Regs:$src1, V2F32Regs:$src2)),
+ (VecShuffle_v2f32 V2F32Regs:$src1, V2F32Regs:$src2,
+ (ShuffleMask0 node:$op), (ShuffleMask1 node:$op))>;
+
+def : Pat<(v2i64 (vec_shuf:$op V2I64Regs:$src1, V2I64Regs:$src2)),
+ (VecShuffle_v2i64 V2I64Regs:$src1, V2I64Regs:$src2,
+ (ShuffleMask0 node:$op), (ShuffleMask1 node:$op))>;
+
+def : Pat<(v4i32 (vec_shuf:$op V4I32Regs:$src1, V4I32Regs:$src2)),
+ (VecShuffle_v4i32 V4I32Regs:$src1, V4I32Regs:$src2,
+ (ShuffleMask0 node:$op), (ShuffleMask1 node:$op),
+ (ShuffleMask2 node:$op), (ShuffleMask3 node:$op))>;
+
+def : Pat<(v2i32 (vec_shuf:$op V2I32Regs:$src1, V2I32Regs:$src2)),
+ (VecShuffle_v2i32 V2I32Regs:$src1, V2I32Regs:$src2,
+ (ShuffleMask0 node:$op), (ShuffleMask1 node:$op))>;
+
+def : Pat<(v4i16 (vec_shuf:$op V4I16Regs:$src1, V4I16Regs:$src2)),
+ (VecShuffle_v4i16 V4I16Regs:$src1, V4I16Regs:$src2,
+ (ShuffleMask0 node:$op), (ShuffleMask1 node:$op),
+ (ShuffleMask2 node:$op), (ShuffleMask3 node:$op))>;
+
+def : Pat<(v2i16 (vec_shuf:$op V2I16Regs:$src1, V2I16Regs:$src2)),
+ (VecShuffle_v2i16 V2I16Regs:$src1, V2I16Regs:$src2,
+ (ShuffleMask0 node:$op), (ShuffleMask1 node:$op))>;
+
+def : Pat<(v4i8 (vec_shuf:$op V4I8Regs:$src1, V4I8Regs:$src2)),
+ (VecShuffle_v4i8 V4I8Regs:$src1, V4I8Regs:$src2,
+ (ShuffleMask0 node:$op), (ShuffleMask1 node:$op),
+ (ShuffleMask2 node:$op), (ShuffleMask3 node:$op))>;
+
+def : Pat<(v2i8 (vec_shuf:$op V2I8Regs:$src1, V2I8Regs:$src2)),
+ (VecShuffle_v2i8 V2I8Regs:$src1, V2I8Regs:$src2,
+ (ShuffleMask0 node:$op), (ShuffleMask1 node:$op))>;
+
+class Build_Vector2<string asmstr, NVPTXRegClass vclass, NVPTXRegClass sclass,
+ NVPTXInst si>
+ : NVPTXVecInst<(outs vclass:$dst),
+ (ins sclass:$a1, sclass:$a2),
+ !strconcat(asmstr, "\t${dst:vecfull}, {{$a1, $a2}};"),
+ [(set vclass:$dst, (build_vector sclass:$a1, sclass:$a2))],
+ si>;
+class Build_Vector4<string asmstr, NVPTXRegClass vclass, NVPTXRegClass sclass,
+ NVPTXInst si>
+ : NVPTXVecInst<(outs vclass:$dst),
+ (ins sclass:$a1, sclass:$a2, sclass:$a3, sclass:$a4),
+ !strconcat(asmstr, "\t${dst:vecfull}, {{$a1, $a2, $a3, $a4}};"),
+ [(set vclass:$dst,
+ (build_vector sclass:$a1, sclass:$a2,
+ sclass:$a3, sclass:$a4))], si>;
+
+let isAsCheapAsAMove=1, VecInstType=isVecBuild.Value in {
+def Build_Vector2_f32 : Build_Vector2<"mov.v2.f32", V2F32Regs, Float32Regs,
+ FMOV32rr>;
+def Build_Vector2_f64 : Build_Vector2<"mov.v2.f64", V2F64Regs, Float64Regs,
+ FMOV64rr>;
+
+def Build_Vector2_i32 : Build_Vector2<"mov.v2.u32", V2I32Regs, Int32Regs,
+ IMOV32rr>;
+def Build_Vector2_i64 : Build_Vector2<"mov.v2.u64", V2I64Regs, Int64Regs,
+ IMOV64rr>;
+def Build_Vector2_i16 : Build_Vector2<"mov.v2.u16", V2I16Regs, Int16Regs,
+ IMOV16rr>;
+def Build_Vector2_i8 : Build_Vector2<"mov.v2.u16", V2I8Regs, Int8Regs,
+ IMOV8rr>;
+
+def Build_Vector4_f32 : Build_Vector4<"mov.v4.f32", V4F32Regs, Float32Regs,
+ FMOV32rr>;
+
+def Build_Vector4_i32 : Build_Vector4<"mov.v4.u32", V4I32Regs, Int32Regs,
+ IMOV32rr>;
+def Build_Vector4_i16 : Build_Vector4<"mov.v4.u16", V4I16Regs, Int16Regs,
+ IMOV16rr>;
+def Build_Vector4_i8 : Build_Vector4<"mov.v4.u16", V4I8Regs, Int8Regs,
+ IMOV8rr>;
+}
+
+class Vec_Move<string asmstr, NVPTXRegClass vclass, NVPTXInst sop=NOP>
+ : NVPTXVecInst<(outs vclass:$dst), (ins vclass:$src),
+ !strconcat(asmstr, "\t${dst:vecfull}, ${src:vecfull};"),
+ [], sop>;
+
+let isAsCheapAsAMove=1, neverHasSideEffects=1, IsSimpleMove=1,
+ VecInstType=isVecOther.Value in {
+def V4f32Mov : Vec_Move<"mov.v4.f32", V4F32Regs, FMOV32rr>;
+def V2f32Mov : Vec_Move<"mov.v2.f32", V2F32Regs, FMOV32rr>;
+
+def V4i32Mov : Vec_Move<"mov.v4.u32", V4I32Regs, IMOV32rr>;
+def V2i32Mov : Vec_Move<"mov.v2.u32", V2I32Regs, IMOV32rr>;
+
+def V4i16Mov : Vec_Move<"mov.v4.u16", V4I16Regs, IMOV16rr>;
+def V2i16Mov : Vec_Move<"mov.v2.u16", V2I16Regs, IMOV16rr>;
+
+def V4i8Mov : Vec_Move<"mov.v4.u16", V4I8Regs, IMOV8rr>;
+def V2i8Mov : Vec_Move<"mov.v2.u16", V2I8Regs, IMOV8rr>;
+
+def V2f64Mov : Vec_Move<"mov.v2.f64", V2F64Regs, FMOV64rr>;
+def V2i64Mov : Vec_Move<"mov.v2.u64", V2I64Regs, IMOV64rr>;
+}
+
+// extract subvector patterns
+def extract_subvec : SDNode<"ISD::EXTRACT_SUBVECTOR",
+ SDTypeProfile<1, 2, [SDTCisPtrTy<2>]>>;
+
+def : Pat<(v2f32 (extract_subvec V4F32Regs:$src, 0)),
+ (Build_Vector2_f32 (V4f32Extract V4F32Regs:$src, 0),
+ (V4f32Extract V4F32Regs:$src, 1))>;
+def : Pat<(v2f32 (extract_subvec V4F32Regs:$src, 2)),
+ (Build_Vector2_f32 (V4f32Extract V4F32Regs:$src, 2),
+ (V4f32Extract V4F32Regs:$src, 3))>;
+def : Pat<(v2i32 (extract_subvec V4I32Regs:$src, 0)),
+ (Build_Vector2_i32 (V4i32Extract V4I32Regs:$src, 0),
+ (V4i32Extract V4I32Regs:$src, 1))>;
+def : Pat<(v2i32 (extract_subvec V4I32Regs:$src, 2)),
+ (Build_Vector2_i32 (V4i32Extract V4I32Regs:$src, 2),
+ (V4i32Extract V4I32Regs:$src, 3))>;
+def : Pat<(v2i16 (extract_subvec V4I16Regs:$src, 0)),
+ (Build_Vector2_i16 (V4i16Extract V4I16Regs:$src, 0),
+ (V4i16Extract V4I16Regs:$src, 1))>;
+def : Pat<(v2i16 (extract_subvec V4I16Regs:$src, 2)),
+ (Build_Vector2_i16 (V4i16Extract V4I16Regs:$src, 2),
+ (V4i16Extract V4I16Regs:$src, 3))>;
+def : Pat<(v2i8 (extract_subvec V4I8Regs:$src, 0)),
+ (Build_Vector2_i8 (V4i8Extract V4I8Regs:$src, 0),
+ (V4i8Extract V4I8Regs:$src, 1))>;
+def : Pat<(v2i8 (extract_subvec V4I8Regs:$src, 2)),
+ (Build_Vector2_i8 (V4i8Extract V4I8Regs:$src, 2),
+ (V4i8Extract V4I8Regs:$src, 3))>;
+
+// Select instructions
+class Select_OneLine<string type, string pos> {
+ string t1 = !strconcat("selp.", type);
+ string t2 = !strconcat(t1, " \t${dst}_");
+ string t3 = !strconcat(t2, pos);
+ string t4 = !strconcat(t3, ", ${src1}_");
+ string t5 = !strconcat(t4, pos);
+ string t6 = !strconcat(t5, ", ${src2}_");
+ string t7 = !strconcat(t6, pos);
+ string s = !strconcat(t7, ", $p;");
+}
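+// For example, Select_OneLine<"b32", "0">.s should expand to
+//   selp.b32 \t${dst}_0, ${src1}_0, ${src2}_0, $p;
+// i.e. one selp per vector element, all keyed off the single predicate $p.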
+
+class Select_Str2<string type> {
+ string t1 = Select_OneLine<type, "0">.s;
+ string t2 = !strconcat(t1, "\n\t");
+ string s = !strconcat(t2, Select_OneLine<type, "1">.s);
+}
+
+class Select_Str4<string type> {
+ string t1 = Select_OneLine<type, "0">.s;
+ string t2 = !strconcat(t1, "\n\t");
+ string t3 = !strconcat(t2, Select_OneLine<type, "1">.s);
+ string t4 = !strconcat(t3, "\n\t");
+ string t5 = !strconcat(t4, Select_OneLine<type, "2">.s);
+ string t6 = !strconcat(t5, "\n\t");
+ string s = !strconcat(t6, Select_OneLine<type, "3">.s);
+
+}
+
+class Vec_Select<NVPTXRegClass vclass, string asmstr, NVPTXInst sop>
+ : NVPTXVecInst<(outs vclass:$dst),
+ (ins vclass:$src1, vclass:$src2, Int1Regs:$p),
+ asmstr,
+ [(set vclass:$dst, (select Int1Regs:$p, vclass:$src1,
+ vclass:$src2))],
+ sop>;
+
+let VecInstType=isVecOther.Value in {
+def V2I64_Select : Vec_Select<V2I64Regs, Select_Str2<"b64">.s, SELECTi64rr>;
+def V4I32_Select : Vec_Select<V4I32Regs, Select_Str4<"b32">.s, SELECTi32rr>;
+def V2I32_Select : Vec_Select<V2I32Regs, Select_Str2<"b32">.s, SELECTi32rr>;
+def V4I16_Select : Vec_Select<V4I16Regs, Select_Str4<"b16">.s, SELECTi16rr>;
+def V2I16_Select : Vec_Select<V2I16Regs, Select_Str2<"b16">.s, SELECTi16rr>;
+def V4I8_Select : Vec_Select<V4I8Regs, Select_Str4<"b16">.s, SELECTi8rr>;
+def V2I8_Select : Vec_Select<V2I8Regs, Select_Str2<"b16">.s, SELECTi8rr>;
+
+def V2F64_Select : Vec_Select<V2F64Regs, Select_Str2<"f64">.s, SELECTf64rr>;
+def V4F32_Select : Vec_Select<V4F32Regs, Select_Str4<"f32">.s, SELECTf32rr>;
+def V2F32_Select : Vec_Select<V2F32Regs, Select_Str2<"f32">.s, SELECTf32rr>;
+}
+
+// Comparison instructions
+
+// setcc convenience fragments.
+def vsetoeq : PatFrag<(ops node:$lhs, node:$rhs),
+ (setcc node:$lhs, node:$rhs, SETOEQ)>;
+def vsetogt : PatFrag<(ops node:$lhs, node:$rhs),
+ (setcc node:$lhs, node:$rhs, SETOGT)>;
+def vsetoge : PatFrag<(ops node:$lhs, node:$rhs),
+ (setcc node:$lhs, node:$rhs, SETOGE)>;
+def vsetolt : PatFrag<(ops node:$lhs, node:$rhs),
+ (setcc node:$lhs, node:$rhs, SETOLT)>;
+def vsetole : PatFrag<(ops node:$lhs, node:$rhs),
+ (setcc node:$lhs, node:$rhs, SETOLE)>;
+def vsetone : PatFrag<(ops node:$lhs, node:$rhs),
+ (setcc node:$lhs, node:$rhs, SETONE)>;
+def vseto : PatFrag<(ops node:$lhs, node:$rhs),
+ (setcc node:$lhs, node:$rhs, SETO)>;
+def vsetuo : PatFrag<(ops node:$lhs, node:$rhs),
+ (setcc node:$lhs, node:$rhs, SETUO)>;
+def vsetueq : PatFrag<(ops node:$lhs, node:$rhs),
+ (setcc node:$lhs, node:$rhs, SETUEQ)>;
+def vsetugt : PatFrag<(ops node:$lhs, node:$rhs),
+ (setcc node:$lhs, node:$rhs, SETUGT)>;
+def vsetuge : PatFrag<(ops node:$lhs, node:$rhs),
+ (setcc node:$lhs, node:$rhs, SETUGE)>;
+def vsetult : PatFrag<(ops node:$lhs, node:$rhs),
+ (setcc node:$lhs, node:$rhs, SETULT)>;
+def vsetule : PatFrag<(ops node:$lhs, node:$rhs),
+ (setcc node:$lhs, node:$rhs, SETULE)>;
+def vsetune : PatFrag<(ops node:$lhs, node:$rhs),
+ (setcc node:$lhs, node:$rhs, SETUNE)>;
+def vseteq : PatFrag<(ops node:$lhs, node:$rhs),
+ (setcc node:$lhs, node:$rhs, SETEQ)>;
+def vsetgt : PatFrag<(ops node:$lhs, node:$rhs),
+ (setcc node:$lhs, node:$rhs, SETGT)>;
+def vsetge : PatFrag<(ops node:$lhs, node:$rhs),
+ (setcc node:$lhs, node:$rhs, SETGE)>;
+def vsetlt : PatFrag<(ops node:$lhs, node:$rhs),
+ (setcc node:$lhs, node:$rhs, SETLT)>;
+def vsetle : PatFrag<(ops node:$lhs, node:$rhs),
+ (setcc node:$lhs, node:$rhs, SETLE)>;
+def vsetne : PatFrag<(ops node:$lhs, node:$rhs),
+ (setcc node:$lhs, node:$rhs, SETNE)>;
+
+class Vec_Compare<PatFrag op, NVPTXRegClass outrclass, NVPTXRegClass inrclass,
+ NVPTXInst sop>
+ : NVPTXVecInst<(outs outrclass:$dst),
+ (ins inrclass:$a, inrclass:$b),
+ "Unsupported",
+ [(set outrclass:$dst, (op inrclass:$a, inrclass:$b))],
+ sop>;
+
+multiclass Vec_Compare_All<PatFrag op,
+ NVPTXInst inst8,
+ NVPTXInst inst16,
+ NVPTXInst inst32,
+ NVPTXInst inst64>
+{
+ def V2I8 : Vec_Compare<op, V2I8Regs, V2I8Regs, inst8>;
+ def V4I8 : Vec_Compare<op, V4I8Regs, V4I8Regs, inst8>;
+ def V2I16 : Vec_Compare<op, V2I16Regs, V2I16Regs, inst16>;
+ def V4I16 : Vec_Compare<op, V4I16Regs, V4I16Regs, inst16>;
+ def V2I32 : Vec_Compare<op, V2I32Regs, V2I32Regs, inst32>;
+ def V4I32 : Vec_Compare<op, V4I32Regs, V4I32Regs, inst32>;
+ def V2I64 : Vec_Compare<op, V2I64Regs, V2I64Regs, inst64>;
+}
+
+let VecInstType=isVecOther.Value in {
+ defm VecSGT : Vec_Compare_All<vsetgt, ISetSGTi8rr_toi8, ISetSGTi16rr_toi16,
+ ISetSGTi32rr_toi32, ISetSGTi64rr_toi64>;
+ defm VecUGT : Vec_Compare_All<vsetugt, ISetUGTi8rr_toi8, ISetUGTi16rr_toi16,
+ ISetUGTi32rr_toi32, ISetUGTi64rr_toi64>;
+ defm VecSLT : Vec_Compare_All<vsetlt, ISetSLTi8rr_toi8, ISetSLTi16rr_toi16,
+ ISetSLTi32rr_toi32, ISetSLTi64rr_toi64>;
+ defm VecULT : Vec_Compare_All<vsetult, ISetULTi8rr_toi8, ISetULTi16rr_toi16,
+ ISetULTi32rr_toi32, ISetULTi64rr_toi64>;
+ defm VecSGE : Vec_Compare_All<vsetge, ISetSGEi8rr_toi8, ISetSGEi16rr_toi16,
+ ISetSGEi32rr_toi32, ISetSGEi64rr_toi64>;
+ defm VecUGE : Vec_Compare_All<vsetuge, ISetUGEi8rr_toi8, ISetUGEi16rr_toi16,
+ ISetUGEi32rr_toi32, ISetUGEi64rr_toi64>;
+ defm VecSLE : Vec_Compare_All<vsetle, ISetSLEi8rr_toi8, ISetSLEi16rr_toi16,
+ ISetSLEi32rr_toi32, ISetSLEi64rr_toi64>;
+ defm VecULE : Vec_Compare_All<vsetule, ISetULEi8rr_toi8, ISetULEi16rr_toi16,
+ ISetULEi32rr_toi32, ISetULEi64rr_toi64>;
+ defm VecSEQ : Vec_Compare_All<vseteq, ISetSEQi8rr_toi8, ISetSEQi16rr_toi16,
+ ISetSEQi32rr_toi32, ISetSEQi64rr_toi64>;
+ defm VecUEQ : Vec_Compare_All<vsetueq, ISetUEQi8rr_toi8, ISetUEQi16rr_toi16,
+ ISetUEQi32rr_toi32, ISetUEQi64rr_toi64>;
+ defm VecSNE : Vec_Compare_All<vsetne, ISetSNEi8rr_toi8, ISetSNEi16rr_toi16,
+ ISetSNEi32rr_toi32, ISetSNEi64rr_toi64>;
+ defm VecUNE : Vec_Compare_All<vsetune, ISetUNEi8rr_toi8, ISetUNEi16rr_toi16,
+ ISetUNEi32rr_toi32, ISetUNEi64rr_toi64>;
+}
+
+multiclass FVec_Compare_All<PatFrag op,
+ NVPTXInst instf32,
+ NVPTXInst instf64>
+{
+ def V2F32 : Vec_Compare<op, V2I32Regs, V2F32Regs, instf32>;
+ def V4F32 : Vec_Compare<op, V4I32Regs, V4F32Regs, instf32>;
+ def V2F64 : Vec_Compare<op, V2I64Regs, V2F64Regs, instf64>;
+}
+
+let VecInstType=isVecOther.Value in {
+ defm FVecGT : FVec_Compare_All<vsetogt, FSetGTf32rr_toi32,
+ FSetGTf64rr_toi64>;
+ defm FVecLT : FVec_Compare_All<vsetolt, FSetLTf32rr_toi32,
+ FSetLTf64rr_toi64>;
+ defm FVecGE : FVec_Compare_All<vsetoge, FSetGEf32rr_toi32,
+ FSetGEf64rr_toi64>;
+ defm FVecLE : FVec_Compare_All<vsetole, FSetLEf32rr_toi32,
+ FSetLEf64rr_toi64>;
+ defm FVecEQ : FVec_Compare_All<vsetoeq, FSetEQf32rr_toi32,
+ FSetEQf64rr_toi64>;
+ defm FVecNE : FVec_Compare_All<vsetone, FSetNEf32rr_toi32,
+ FSetNEf64rr_toi64>;
+
+ defm FVecUGT : FVec_Compare_All<vsetugt, FSetUGTf32rr_toi32,
+ FSetUGTf64rr_toi64>;
+ defm FVecULT : FVec_Compare_All<vsetult, FSetULTf32rr_toi32,
+ FSetULTf64rr_toi64>;
+ defm FVecUGE : FVec_Compare_All<vsetuge, FSetUGEf32rr_toi32,
+ FSetUGEf64rr_toi64>;
+ defm FVecULE : FVec_Compare_All<vsetule, FSetULEf32rr_toi32,
+ FSetULEf64rr_toi64>;
+ defm FVecUEQ : FVec_Compare_All<vsetueq, FSetUEQf32rr_toi32,
+ FSetUEQf64rr_toi64>;
+ defm FVecUNE : FVec_Compare_All<vsetune, FSetUNEf32rr_toi32,
+ FSetUNEf64rr_toi64>;
+
+ defm FVecNUM : FVec_Compare_All<vseto, FSetNUMf32rr_toi32,
+ FSetNUMf64rr_toi64>;
+ defm FVecNAN : FVec_Compare_All<vsetuo, FSetNANf32rr_toi32,
+ FSetNANf64rr_toi64>;
+}
+
+class LoadParamScalar4Inst<NVPTXRegClass regclass, string opstr> :
+ NVPTXInst<(outs regclass:$d1, regclass:$d2, regclass:$d3, regclass:$d4),
+ (ins i32imm:$a, i32imm:$b),
+ !strconcat(!strconcat("ld.param", opstr),
+ "\t{{$d1, $d2, $d3, $d4}}, [retval0+$b];"), []>;
+
+class LoadParamScalar2Inst<NVPTXRegClass regclass, string opstr> :
+ NVPTXInst<(outs regclass:$d1, regclass:$d2),
+ (ins i32imm:$a, i32imm:$b),
+ !strconcat(!strconcat("ld.param", opstr),
+ "\t{{$d1, $d2}}, [retval0+$b];"), []>;
+
+
+class StoreParamScalar4Inst<NVPTXRegClass regclass, string opstr> :
+ NVPTXInst<(outs),
+ (ins regclass:$s1, regclass:$s2, regclass:$s3, regclass:$s4,
+ i32imm:$a, i32imm:$b),
+ !strconcat(!strconcat("st.param", opstr),
+ "\t[param$a+$b], {{$s1, $s2, $s3, $s4}};"), []>;
+
+class StoreParamScalar2Inst<NVPTXRegClass regclass, string opstr> :
+ NVPTXInst<(outs),
+ (ins regclass:$s1, regclass:$s2, i32imm:$a, i32imm:$b),
+ !strconcat(!strconcat("st.param", opstr),
+ "\t[param$a+$b], {{$s1, $s2}};"), []>;
+
+class StoreRetvalScalar4Inst<NVPTXRegClass regclass, string opstr> :
+ NVPTXInst<(outs),
+ (ins regclass:$s1, regclass:$s2, regclass:$s3, regclass:$s4,
+ i32imm:$a),
+ !strconcat(!strconcat("st.param", opstr),
+ "\t[func_retval+$a], {{$s1, $s2, $s3, $s4}};"), []>;
+
+class StoreRetvalScalar2Inst<NVPTXRegClass regclass, string opstr> :
+ NVPTXInst<(outs),
+ (ins regclass:$s1, regclass:$s2, i32imm:$a),
+ !strconcat(!strconcat("st.param", opstr),
+ "\t[func_retval+$a], {{$s1, $s2}};"), []>;
+
+def LoadParamScalar4I32 : LoadParamScalar4Inst<Int32Regs, ".v4.b32">;
+def LoadParamScalar4I16 : LoadParamScalar4Inst<Int16Regs, ".v4.b16">;
+def LoadParamScalar4I8 : LoadParamScalar4Inst<Int8Regs, ".v4.b8">;
+
+def LoadParamScalar2I64 : LoadParamScalar2Inst<Int64Regs, ".v2.b64">;
+def LoadParamScalar2I32 : LoadParamScalar2Inst<Int32Regs, ".v2.b32">;
+def LoadParamScalar2I16 : LoadParamScalar2Inst<Int16Regs, ".v2.b16">;
+def LoadParamScalar2I8 : LoadParamScalar2Inst<Int8Regs, ".v2.b8">;
+
+def LoadParamScalar4F32 : LoadParamScalar4Inst<Float32Regs, ".v4.f32">;
+def LoadParamScalar2F32 : LoadParamScalar2Inst<Float32Regs, ".v2.f32">;
+def LoadParamScalar2F64 : LoadParamScalar2Inst<Float64Regs, ".v2.f64">;
+
+def StoreParamScalar4I32 : StoreParamScalar4Inst<Int32Regs, ".v4.b32">;
+def StoreParamScalar4I16 : StoreParamScalar4Inst<Int16Regs, ".v4.b16">;
+def StoreParamScalar4I8 : StoreParamScalar4Inst<Int8Regs, ".v4.b8">;
+
+def StoreParamScalar2I64 : StoreParamScalar2Inst<Int64Regs, ".v2.b64">;
+def StoreParamScalar2I32 : StoreParamScalar2Inst<Int32Regs, ".v2.b32">;
+def StoreParamScalar2I16 : StoreParamScalar2Inst<Int16Regs, ".v2.b16">;
+def StoreParamScalar2I8 : StoreParamScalar2Inst<Int8Regs, ".v2.b8">;
+
+def StoreParamScalar4F32 : StoreParamScalar4Inst<Float32Regs, ".v4.f32">;
+def StoreParamScalar2F32 : StoreParamScalar2Inst<Float32Regs, ".v2.f32">;
+def StoreParamScalar2F64 : StoreParamScalar2Inst<Float64Regs, ".v2.f64">;
+
+def StoreRetvalScalar4I32 : StoreRetvalScalar4Inst<Int32Regs, ".v4.b32">;
+def StoreRetvalScalar4I16 : StoreRetvalScalar4Inst<Int16Regs, ".v4.b16">;
+def StoreRetvalScalar4I8 : StoreRetvalScalar4Inst<Int8Regs, ".v4.b8">;
+
+def StoreRetvalScalar2I64 : StoreRetvalScalar2Inst<Int64Regs, ".v2.b64">;
+def StoreRetvalScalar2I32 : StoreRetvalScalar2Inst<Int32Regs, ".v2.b32">;
+def StoreRetvalScalar2I16 : StoreRetvalScalar2Inst<Int16Regs, ".v2.b16">;
+def StoreRetvalScalar2I8 : StoreRetvalScalar2Inst<Int8Regs, ".v2.b8">;
+
+def StoreRetvalScalar4F32 : StoreRetvalScalar4Inst<Float32Regs, ".v4.f32">;
+def StoreRetvalScalar2F32 : StoreRetvalScalar2Inst<Float32Regs, ".v2.f32">;
+def StoreRetvalScalar2F64 : StoreRetvalScalar2Inst<Float64Regs, ".v2.f64">;
+
+class LoadParamVecInst<NVPTXRegClass regclass, string opstr, NVPTXInst sop=NOP>:
+ NVPTXVecInst<(outs regclass:$dst), (ins i32imm:$a, i32imm:$b),
+ "loadparam : $dst <- [$a, $b]",
+ [(set regclass:$dst, (LoadParam (i32 imm:$a), (i32 imm:$b)))],
+ sop>;
+
+class StoreParamVecInst<NVPTXRegClass regclass, string opstr, NVPTXInst sop=NOP>
+ : NVPTXVecInst<(outs), (ins regclass:$val, i32imm:$a, i32imm:$b),
+ "storeparam : [$a, $b] <- $val",
+ [(StoreParam (i32 imm:$a), (i32 imm:$b), regclass:$val)], sop>;
+
+class StoreRetvalVecInst<NVPTXRegClass regclass, string opstr,
+ NVPTXInst sop=NOP>
+ : NVPTXVecInst<(outs), (ins regclass:$val, i32imm:$a),
+ "storeretval : retval[$a] <- $val",
+ [(StoreRetval (i32 imm:$a), regclass:$val)], sop>;
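+// Note: the asm strings of the three vector pseudos above are placeholders;
+// each pseudo carries its scalar counterpart in the 'sop' operand, which is
+// presumably what the vector-to-scalar expansion rewrites it into.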
+
+let VecInstType=isVecLD.Value in {
+def LoadParamV4I32 : LoadParamVecInst<V4I32Regs, ".v4.b32",
+ LoadParamScalar4I32>;
+def LoadParamV4I16 : LoadParamVecInst<V4I16Regs, ".v4.b16",
+ LoadParamScalar4I16>;
+def LoadParamV4I8 : LoadParamVecInst<V4I8Regs, ".v4.b8",
+ LoadParamScalar4I8>;
+
+def LoadParamV2I64 : LoadParamVecInst<V2I64Regs, ".v2.b64",
+ LoadParamScalar2I64>;
+def LoadParamV2I32 : LoadParamVecInst<V2I32Regs, ".v2.b32",
+ LoadParamScalar2I32>;
+def LoadParamV2I16 : LoadParamVecInst<V2I16Regs, ".v2.b16",
+ LoadParamScalar2I16>;
+def LoadParamV2I8 : LoadParamVecInst<V2I8Regs, ".v2.b8",
+ LoadParamScalar2I8>;
+
+def LoadParamV4F32 : LoadParamVecInst<V4F32Regs, ".v4.f32",
+ LoadParamScalar4F32>;
+def LoadParamV2F32 : LoadParamVecInst<V2F32Regs, ".v2.f32",
+ LoadParamScalar2F32>;
+def LoadParamV2F64 : LoadParamVecInst<V2F64Regs, ".v2.f64",
+ LoadParamScalar2F64>;
+}
+
+let VecInstType=isVecST.Value in {
+def StoreParamV4I32 : StoreParamVecInst<V4I32Regs, ".v4.b32",
+ StoreParamScalar4I32>;
+def StoreParamV4I16 : StoreParamVecInst<V4I16Regs, ".v4.b16",
+ StoreParamScalar4I16>;
+def StoreParamV4I8 : StoreParamVecInst<V4I8Regs, ".v4.b8",
+ StoreParamScalar4I8>;
+
+def StoreParamV2I64 : StoreParamVecInst<V2I64Regs, ".v2.b64",
+ StoreParamScalar2I64>;
+def StoreParamV2I32 : StoreParamVecInst<V2I32Regs, ".v2.b32",
+ StoreParamScalar2I32>;
+def StoreParamV2I16 : StoreParamVecInst<V2I16Regs, ".v2.b16",
+ StoreParamScalar2I16>;
+def StoreParamV2I8 : StoreParamVecInst<V2I8Regs, ".v2.b8",
+ StoreParamScalar2I8>;
+
+def StoreParamV4F32 : StoreParamVecInst<V4F32Regs, ".v4.f32",
+ StoreParamScalar4F32>;
+def StoreParamV2F32 : StoreParamVecInst<V2F32Regs, ".v2.f32",
+ StoreParamScalar2F32>;
+def StoreParamV2F64 : StoreParamVecInst<V2F64Regs, ".v2.f64",
+ StoreParamScalar2F64>;
+
+def StoreRetvalV4I32 : StoreRetvalVecInst<V4I32Regs, ".v4.b32",
+ StoreRetvalScalar4I32>;
+def StoreRetvalV4I16 : StoreRetvalVecInst<V4I16Regs, ".v4.b16",
+ StoreRetvalScalar4I16>;
+def StoreRetvalV4I8 : StoreRetvalVecInst<V4I8Regs, ".v4.b8",
+ StoreRetvalScalar4I8>;
+
+def StoreRetvalV2I64 : StoreRetvalVecInst<V2I64Regs, ".v2.b64",
+ StoreRetvalScalar2I64>;
+def StoreRetvalV2I32 : StoreRetvalVecInst<V2I32Regs, ".v2.b32",
+ StoreRetvalScalar2I32>;
+def StoreRetvalV2I16 : StoreRetvalVecInst<V2I16Regs, ".v2.b16",
+ StoreRetvalScalar2I16>;
+def StoreRetvalV2I8 : StoreRetvalVecInst<V2I8Regs, ".v2.b8",
+ StoreRetvalScalar2I8>;
+
+def StoreRetvalV4F32 : StoreRetvalVecInst<V4F32Regs, ".v4.f32",
+ StoreRetvalScalar4F32>;
+def StoreRetvalV2F32 : StoreRetvalVecInst<V2F32Regs, ".v2.f32",
+ StoreRetvalScalar2F32>;
+def StoreRetvalV2F64 : StoreRetvalVecInst<V2F64Regs, ".v2.f64",
+ StoreRetvalScalar2F64>;
+
+}
+
+
+// Int vector to int scalar bit convert
+// v4i8 -> i32
+def : Pat<(i32 (bitconvert V4I8Regs:$s)),
+ (V4I8toI32 (V4i8Extract V4I8Regs:$s,0), (V4i8Extract V4I8Regs:$s,1),
+ (V4i8Extract V4I8Regs:$s,2), (V4i8Extract V4I8Regs:$s,3))>;
+// v4i16 -> i64
+def : Pat<(i64 (bitconvert V4I16Regs:$s)),
+ (V4I16toI64 (V4i16Extract V4I16Regs:$s,0),
+ (V4i16Extract V4I16Regs:$s,1),
+ (V4i16Extract V4I16Regs:$s,2),
+ (V4i16Extract V4I16Regs:$s,3))>;
+// v2i8 -> i16
+def : Pat<(i16 (bitconvert V2I8Regs:$s)),
+ (V2I8toI16 (V2i8Extract V2I8Regs:$s,0), (V2i8Extract V2I8Regs:$s,1))>;
+// v2i16 -> i32
+def : Pat<(i32 (bitconvert V2I16Regs:$s)),
+ (V2I16toI32 (V2i16Extract V2I16Regs:$s,0),
+ (V2i16Extract V2I16Regs:$s,1))>;
+// v2i32 -> i64
+def : Pat<(i64 (bitconvert V2I32Regs:$s)),
+ (V2I32toI64 (V2i32Extract V2I32Regs:$s,0),
+ (V2i32Extract V2I32Regs:$s,1))>;
+
+// Int scalar to int vector bit convert
+let VecInstType=isVecDest.Value in {
+// i32 -> v4i8
+def VecI32toV4I8 : NVPTXVecInst<(outs V4I8Regs:$d), (ins Int32Regs:$s),
+ "Error!",
+ [(set V4I8Regs:$d, (bitconvert Int32Regs:$s))],
+ I32toV4I8>;
+// i64 -> v4i16
+def VecI64toV4I16 : NVPTXVecInst<(outs V4I16Regs:$d), (ins Int64Regs:$s),
+ "Error!",
+ [(set V4I16Regs:$d, (bitconvert Int64Regs:$s))],
+ I64toV4I16>;
+// i16 -> v2i8
+def VecI16toV2I8 : NVPTXVecInst<(outs V2I8Regs:$d), (ins Int16Regs:$s),
+ "Error!",
+ [(set V2I8Regs:$d, (bitconvert Int16Regs:$s))],
+ I16toV2I8>;
+// i32 -> v2i16
+def VecI32toV2I16 : NVPTXVecInst<(outs V2I16Regs:$d), (ins Int32Regs:$s),
+ "Error!",
+ [(set V2I16Regs:$d, (bitconvert Int32Regs:$s))],
+ I32toV2I16>;
+// i64 -> v2i32
+def VecI64toV2I32 : NVPTXVecInst<(outs V2I32Regs:$d), (ins Int64Regs:$s),
+ "Error!",
+ [(set V2I32Regs:$d, (bitconvert Int64Regs:$s))],
+ I64toV2I32>;
+}
+
+// Int vector to int vector bit convert
+// v4i8 -> v2i16
+def : Pat<(v2i16 (bitconvert V4I8Regs:$s)),
+ (VecI32toV2I16
+ (V4I8toI32 (V4i8Extract V4I8Regs:$s,0), (V4i8Extract V4I8Regs:$s,1),
+ (V4i8Extract V4I8Regs:$s,2), (V4i8Extract V4I8Regs:$s,3)))>;
+// v4i16 -> v2i32
+def : Pat<(v2i32 (bitconvert V4I16Regs:$s)),
+ (VecI64toV2I32
+ (V4I16toI64 (V4i16Extract V4I16Regs:$s,0), (V4i16Extract V4I16Regs:$s,1),
+ (V4i16Extract V4I16Regs:$s,2), (V4i16Extract V4I16Regs:$s,3)))>;
+// v2i16 -> v4i8
+def : Pat<(v4i8 (bitconvert V2I16Regs:$s)),
+ (VecI32toV4I8
+ (V2I16toI32 (V2i16Extract V2I16Regs:$s,0), (V2i16Extract V2I16Regs:$s,1)))>;
+// v2i32 -> v4i16
+def : Pat<(v4i16 (bitconvert V2I32Regs:$s)),
+ (VecI64toV4I16
+ (V2I32toI64 (V2i32Extract V2I32Regs:$s,0), (V2i32Extract V2I32Regs:$s,1)))>;
+// v2i64 -> v4i32
+def : Pat<(v4i32 (bitconvert V2I64Regs:$s)),
+ (Build_Vector4_i32
+ (V2i32Extract (VecI64toV2I32 (V2i64Extract V2I64Regs:$s, 0)), 0),
+ (V2i32Extract (VecI64toV2I32 (V2i64Extract V2I64Regs:$s, 0)), 1),
+ (V2i32Extract (VecI64toV2I32 (V2i64Extract V2I64Regs:$s, 1)), 0),
+ (V2i32Extract (VecI64toV2I32 (V2i64Extract V2I64Regs:$s, 1)), 1))>;
+// v4i32 -> v2i64
+def : Pat<(v2i64 (bitconvert V4I32Regs:$s)),
+ (Build_Vector2_i64
+ (V2I32toI64 (V4i32Extract V4I32Regs:$s,0), (V4i32Extract V4I32Regs:$s,1)),
+ (V2I32toI64 (V4i32Extract V4I32Regs:$s,2), (V4i32Extract V4I32Regs:$s,3)))>;
+
+// Fp scalar to fp vector convert
+// f64 -> v2f32
+let VecInstType=isVecDest.Value in {
+def VecF64toV2F32 : NVPTXVecInst<(outs V2F32Regs:$d), (ins Float64Regs:$s),
+ "Error!",
+ [(set V2F32Regs:$d, (bitconvert Float64Regs:$s))],
+ F64toV2F32>;
+}
+
+// Fp vector to fp scalar convert
+// v2f32 -> f64
+def : Pat<(f64 (bitconvert V2F32Regs:$s)),
+ (V2F32toF64 (V2f32Extract V2F32Regs:$s,0), (V2f32Extract V2F32Regs:$s,1))>;
+
+// Fp scalar to int vector convert
+// f32 -> v4i8
+def : Pat<(v4i8 (bitconvert Float32Regs:$s)),
+ (VecI32toV4I8 (BITCONVERT_32_F2I Float32Regs:$s))>;
+// f32 -> v2i16
+def : Pat<(v2i16 (bitconvert Float32Regs:$s)),
+ (VecI32toV2I16 (BITCONVERT_32_F2I Float32Regs:$s))>;
+// f64 -> v4i16
+def : Pat<(v4i16 (bitconvert Float64Regs:$s)),
+ (VecI64toV4I16 (BITCONVERT_64_F2I Float64Regs:$s))>;
+// f64 -> v2i32
+def : Pat<(v2i32 (bitconvert Float64Regs:$s)),
+ (VecI64toV2I32 (BITCONVERT_64_F2I Float64Regs:$s))>;
+
+// Int vector to fp scalar convert
+// v4i8 -> f32
+def : Pat<(f32 (bitconvert V4I8Regs:$s)),
+ (BITCONVERT_32_I2F
+ (V4I8toI32 (V4i8Extract V4I8Regs:$s,0), (V4i8Extract V4I8Regs:$s,1),
+ (V4i8Extract V4I8Regs:$s,2), (V4i8Extract V4I8Regs:$s,3)))>;
+// v4i16 -> f64
+def : Pat<(f64 (bitconvert V4I16Regs:$s)),
+ (BITCONVERT_64_I2F
+ (V4I16toI64 (V4i16Extract V4I16Regs:$s,0), (V4i16Extract V4I16Regs:$s,1),
+ (V4i16Extract V4I16Regs:$s,2), (V4i16Extract V4I16Regs:$s,3)))>;
+// v2i16 -> f32
+def : Pat<(f32 (bitconvert V2I16Regs:$s)),
+ (BITCONVERT_32_I2F
+ (V2I16toI32 (V2i16Extract V2I16Regs:$s,0), (V2i16Extract V2I16Regs:$s,1)))>;
+// v2i32 -> f64
+def : Pat<(f64 (bitconvert V2I32Regs:$s)),
+ (BITCONVERT_64_I2F
+ (V2I32toI64 (V2i32Extract V2I32Regs:$s,0), (V2i32Extract V2I32Regs:$s,1)))>;
+
+// Int scalar to fp vector convert
+// i64 -> v2f32
+def : Pat<(v2f32 (bitconvert Int64Regs:$s)),
+ (VecF64toV2F32 (BITCONVERT_64_I2F Int64Regs:$s))>;
+
+// Fp vector to int scalar convert
+// v2f32 -> i64
+def : Pat<(i64 (bitconvert V2F32Regs:$s)),
+ (BITCONVERT_64_F2I
+ (V2F32toF64 (V2f32Extract V2F32Regs:$s,0), (V2f32Extract V2F32Regs:$s,1)))>;
+
+// Int vector to fp vector convert
+// v2i64 -> v4f32
+def : Pat<(v4f32 (bitconvert V2I64Regs:$s)),
+ (Build_Vector4_f32
+ (BITCONVERT_32_I2F (V2i32Extract (VecI64toV2I32
+ (V2i64Extract V2I64Regs:$s, 0)), 0)),
+ (BITCONVERT_32_I2F (V2i32Extract (VecI64toV2I32
+ (V2i64Extract V2I64Regs:$s, 0)), 1)),
+ (BITCONVERT_32_I2F (V2i32Extract (VecI64toV2I32
+ (V2i64Extract V2I64Regs:$s, 1)), 0)),
+ (BITCONVERT_32_I2F (V2i32Extract (VecI64toV2I32
+ (V2i64Extract V2I64Regs:$s, 1)), 1)))>;
+// v2i64 -> v2f64
+def : Pat<(v2f64 (bitconvert V2I64Regs:$s)),
+ (Build_Vector2_f64
+ (BITCONVERT_64_I2F (V2i64Extract V2I64Regs:$s,0)),
+ (BITCONVERT_64_I2F (V2i64Extract V2I64Regs:$s,1)))>;
+// v2i32 -> v2f32
+def : Pat<(v2f32 (bitconvert V2I32Regs:$s)),
+ (Build_Vector2_f32
+ (BITCONVERT_32_I2F (V2i32Extract V2I32Regs:$s,0)),
+ (BITCONVERT_32_I2F (V2i32Extract V2I32Regs:$s,1)))>;
+// v4i32 -> v2f64
+def : Pat<(v2f64 (bitconvert V4I32Regs:$s)),
+ (Build_Vector2_f64
+ (BITCONVERT_64_I2F (V2I32toI64 (V4i32Extract V4I32Regs:$s,0),
+ (V4i32Extract V4I32Regs:$s,1))),
+ (BITCONVERT_64_I2F (V2I32toI64 (V4i32Extract V4I32Regs:$s,2),
+ (V4i32Extract V4I32Regs:$s,3))))>;
+// v4i32 -> v4f32
+def : Pat<(v4f32 (bitconvert V4I32Regs:$s)),
+ (Build_Vector4_f32
+ (BITCONVERT_32_I2F (V4i32Extract V4I32Regs:$s,0)),
+ (BITCONVERT_32_I2F (V4i32Extract V4I32Regs:$s,1)),
+ (BITCONVERT_32_I2F (V4i32Extract V4I32Regs:$s,2)),
+ (BITCONVERT_32_I2F (V4i32Extract V4I32Regs:$s,3)))>;
+// v4i16 -> v2f32
+def : Pat<(v2f32 (bitconvert V4I16Regs:$s)),
+ (VecF64toV2F32 (BITCONVERT_64_I2F
+ (V4I16toI64 (V4i16Extract V4I16Regs:$s,0),
+ (V4i16Extract V4I16Regs:$s,1),
+ (V4i16Extract V4I16Regs:$s,2),
+ (V4i16Extract V4I16Regs:$s,3))))>;
+
+// Fp vector to int vector convert
+// v2i64 <- v4f32
+def : Pat<(v2i64 (bitconvert V4F32Regs:$s)),
+ (Build_Vector2_i64
+ (BITCONVERT_64_F2I (V2F32toF64 (V4f32Extract V4F32Regs:$s,0),
+ (V4f32Extract V4F32Regs:$s,1))),
+ (BITCONVERT_64_F2I (V2F32toF64 (V4f32Extract V4F32Regs:$s,2),
+ (V4f32Extract V4F32Regs:$s,3))))>;
+// v2i64 <- v2f64
+def : Pat<(v2i64 (bitconvert V2F64Regs:$s)),
+ (Build_Vector2_i64
+ (BITCONVERT_64_F2I (V2f64Extract V2F64Regs:$s,0)),
+ (BITCONVERT_64_F2I (V2f64Extract V2F64Regs:$s,1)))>;
+// v2i32 <- v2f32
+def : Pat<(v2i32 (bitconvert V2F32Regs:$s)),
+ (Build_Vector2_i32
+ (BITCONVERT_32_F2I (V2f32Extract V2F32Regs:$s,0)),
+ (BITCONVERT_32_F2I (V2f32Extract V2F32Regs:$s,1)))>;
+// v4i32 <- v2f64
+def : Pat<(v4i32 (bitconvert V2F64Regs:$s)),
+ (Build_Vector4_i32
+ (BITCONVERT_32_F2I (V2f32Extract (VecF64toV2F32
+ (V2f64Extract V2F64Regs:$s, 0)), 0)),
+ (BITCONVERT_32_F2I (V2f32Extract (VecF64toV2F32
+ (V2f64Extract V2F64Regs:$s, 0)), 1)),
+ (BITCONVERT_32_F2I (V2f32Extract (VecF64toV2F32
+ (V2f64Extract V2F64Regs:$s, 1)), 0)),
+ (BITCONVERT_32_F2I (V2f32Extract (VecF64toV2F32
+ (V2f64Extract V2F64Regs:$s, 1)), 1)))>;
+// v4i32 <- v4f32
+def : Pat<(v4i32 (bitconvert V4F32Regs:$s)),
+ (Build_Vector4_i32
+ (BITCONVERT_32_F2I (V4f32Extract V4F32Regs:$s,0)),
+ (BITCONVERT_32_F2I (V4f32Extract V4F32Regs:$s,1)),
+ (BITCONVERT_32_F2I (V4f32Extract V4F32Regs:$s,2)),
+ (BITCONVERT_32_F2I (V4f32Extract V4F32Regs:$s,3)))>;
+// v4i16 <- v2f32
+def : Pat<(v4i16 (bitconvert V2F32Regs:$s)),
+ (VecI64toV4I16 (BITCONVERT_64_F2I
+ (V2F32toF64 (V2f32Extract V2F32Regs:$s,0),
+ (V2f32Extract V2F32Regs:$s,1))))>;
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXutil.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXutil.cpp
new file mode 100644
index 000000000000..5f074b33a2d4
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXutil.cpp
@@ -0,0 +1,90 @@
+//===-- NVPTXutil.cpp - Functions exported to CodeGen --*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the functions that can be used in CodeGen.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTXutil.h"
+#include "NVPTX.h"
+
+using namespace llvm;
+
+namespace llvm {
+
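+// Returns true if MI is an i32/i64 "avar" load whose address-space operand
+// (operand 2) is the PARAM space, i.e. a load from the .param state space.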
+bool isParamLoad(const MachineInstr *MI) {
+ if ((MI->getOpcode() != NVPTX::LD_i32_avar) &&
+ (MI->getOpcode() != NVPTX::LD_i64_avar))
+ return false;
+ if (MI->getOperand(2).isImm() == false)
+ return false;
+ if (MI->getOperand(2).getImm() != NVPTX::PTXLdStInstCode::PARAM)
+ return false;
+ return true;
+}
+
+#define DATA_MASK 0x7f
+#define DIGIT_WIDTH 7
+#define MORE_BYTES 0x80
+
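+// Emit the unsigned LEB128 encoding of 'val' into 'space' (at most 'splen'
+// bytes), 7 data bits per byte with the high bit marking continuation.
+// Returns 0 on success, 1 if the buffer is too small; *nbytes receives the
+// number of bytes written.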
+static int encode_leb128(uint64_t val, int *nbytes, char *space, int splen) {
+ char *a;
+ char *end = space + splen;
+
+ a = space;
+ do {
+ unsigned char uc;
+
+ if (a >= end)
+ return 1;
+ uc = val & DATA_MASK;
+ val >>= DIGIT_WIDTH;
+ if (val != 0)
+ uc |= MORE_BYTES;
+ *a = uc;
+ a++;
+ } while (val);
+ *nbytes = a - space;
+ return 0;
+}
+
+#undef DATA_MASK
+#undef DIGIT_WIDTH
+#undef MORE_BYTES
+
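+// Pack the bytes of 'str' (reversed) into a 64-bit value, LEB128-encode it,
+// and return the encoded bytes packed back into a uint64_t. Per the asserts
+// below, this is only intended for short register-name strings whose
+// encoding fits in 8 bytes.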
+uint64_t encode_leb128(const char *str) {
+ union {
+ uint64_t x;
+ char a[8];
+ } temp64;
+
+ temp64.x = 0;
+
+ for (unsigned i = 0, e = strlen(str); i != e; ++i)
+ temp64.a[i] = str[e - 1 - i];
+
+ char encoded[16];
+ int nbytes;
+
+ int retval = encode_leb128(temp64.x, &nbytes, encoded, 16);
+
+ (void) retval;
+ assert(retval == 0 && "Encoding to leb128 failed");
+
+ assert(nbytes <= 8 &&
+ "Cannot support register names with leb128 encoding > 8 bytes");
+
+ temp64.x = 0;
+ for (int i = 0; i < nbytes; ++i)
+ temp64.a[i] = encoded[i];
+
+ return temp64.x;
+}
+
+} // end namespace llvm
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXutil.h b/contrib/llvm/lib/Target/NVPTX/NVPTXutil.h
new file mode 100644
index 000000000000..d1d117159486
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXutil.h
@@ -0,0 +1,25 @@
+//===-- NVPTXutil.h - Functions exported to CodeGen --*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the functions that can be used in CodeGen.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_NVPTX_UTIL_H
+#define LLVM_TARGET_NVPTX_UTIL_H
+
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+
+namespace llvm {
+bool isParamLoad(const MachineInstr *);
+uint64_t encode_leb128(const char *str);
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/NVPTX/NVVMReflect.cpp b/contrib/llvm/lib/Target/NVPTX/NVVMReflect.cpp
new file mode 100644
index 000000000000..a8d6b95ae4ec
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVVMReflect.cpp
@@ -0,0 +1,222 @@
+//===- NVVMReflect.cpp - NVVM Emulate conditional compilation -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass replaces occurrences of __nvvm_reflect("string") with an
+// integer based on -nvvm-reflect-list string=<int> option given to this pass.
+// If an undefined string value is seen in a call to __nvvm_reflect("string"),
+// a default value of 0 will be used.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTX.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_os_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include <map>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#define NVVM_REFLECT_FUNCTION "__nvvm_reflect"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "nvptx-reflect"
+
+namespace llvm { void initializeNVVMReflectPass(PassRegistry &); }
+
+namespace {
+class NVVMReflect : public ModulePass {
+private:
+ StringMap<int> VarMap;
+ typedef DenseMap<std::string, int>::iterator VarMapIter;
+
+public:
+ static char ID;
+ NVVMReflect() : ModulePass(ID) {
+ initializeNVVMReflectPass(*PassRegistry::getPassRegistry());
+ VarMap.clear();
+ }
+
+ NVVMReflect(const StringMap<int> &Mapping)
+ : ModulePass(ID) {
+ initializeNVVMReflectPass(*PassRegistry::getPassRegistry());
+ for (StringMap<int>::const_iterator I = Mapping.begin(), E = Mapping.end();
+ I != E; ++I) {
+ VarMap[(*I).getKey()] = (*I).getValue();
+ }
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ }
+ bool runOnModule(Module &) override;
+
+private:
+ bool handleFunction(Function *ReflectFunction);
+ void setVarMap();
+};
+}
+
+ModulePass *llvm::createNVVMReflectPass() {
+ return new NVVMReflect();
+}
+
+ModulePass *llvm::createNVVMReflectPass(const StringMap<int>& Mapping) {
+ return new NVVMReflect(Mapping);
+}
+
+static cl::opt<bool>
+NVVMReflectEnabled("nvvm-reflect-enable", cl::init(true), cl::Hidden,
+ cl::desc("NVVM reflection, enabled by default"));
+
+char NVVMReflect::ID = 0;
+INITIALIZE_PASS(NVVMReflect, "nvvm-reflect",
+ "Replace occurrences of __nvvm_reflect() calls with 0/1", false,
+ false)
+
+static cl::list<std::string>
+ReflectList("nvvm-reflect-list", cl::value_desc("name=<int>"), cl::Hidden,
+ cl::desc("A list of string=num assignments"),
+ cl::ValueRequired);
+
+/// The command line can look as follows:
+/// -nvvm-reflect-list a=1,b=2 -nvvm-reflect-list c=3,d=0 -R e=2
+/// The strings "a=1,b=2", "c=3,d=0", "e=2" are available in the
+/// ReflectList vector. First, each ReflectList[i] is split
+/// using "," as the delimiter. Then each of those parts is split
+/// using "=" as the delimiter.
+void NVVMReflect::setVarMap() {
+ for (unsigned i = 0, e = ReflectList.size(); i != e; ++i) {
+ DEBUG(dbgs() << "Option : " << ReflectList[i] << "\n");
+ SmallVector<StringRef, 4> NameValList;
+ StringRef(ReflectList[i]).split(NameValList, ",");
+ for (unsigned j = 0, ej = NameValList.size(); j != ej; ++j) {
+ SmallVector<StringRef, 2> NameValPair;
+ NameValList[j].split(NameValPair, "=");
+ assert(NameValPair.size() == 2 && "name=val expected");
+ std::stringstream ValStream(NameValPair[1]);
+ int Val;
+ ValStream >> Val;
+ assert((!(ValStream.fail())) && "integer value expected");
+ VarMap[NameValPair[0]] = Val;
+ }
+ }
+}
+
+bool NVVMReflect::handleFunction(Function *ReflectFunction) {
+ // Validate _reflect function
+ assert(ReflectFunction->isDeclaration() &&
+ "_reflect function should not have a body");
+ assert(ReflectFunction->getReturnType()->isIntegerTy() &&
+ "_reflect's return type should be integer");
+
+ std::vector<Instruction *> ToRemove;
+
+ // Go through the uses of ReflectFunction in this Function.
+ // Each of them should be a CallInst with a ConstantArray argument.
+ // First validate that. If the c-string corresponding to the
+ // ConstantArray can be found successfully, see if it can be
+ // found in VarMap. If so, replace the uses of CallInst with the
+ // value found in VarMap. If not, replace the use with value 0.
+ for (User *U : ReflectFunction->users()) {
+ assert(isa<CallInst>(U) && "Only a call instruction can use _reflect");
+ CallInst *Reflect = cast<CallInst>(U);
+
+ assert((Reflect->getNumOperands() == 2) &&
+ "Only one operand expect for _reflect function");
+ // In cuda, we will have an extra constant-to-generic conversion of
+ // the string.
+ const Value *Str = Reflect->getArgOperand(0);
+ if (isa<CallInst>(Str)) {
+ // CUDA path
+ const CallInst *ConvCall = cast<CallInst>(Str);
+ Str = ConvCall->getArgOperand(0);
+ }
+ assert(isa<ConstantExpr>(Str) &&
+ "Format of _reflect function not recognized");
+ const ConstantExpr *GEP = cast<ConstantExpr>(Str);
+
+ const Value *Sym = GEP->getOperand(0);
+ assert(isa<Constant>(Sym) && "Format of _reflect function not recognized");
+
+ const Constant *SymStr = cast<Constant>(Sym);
+
+ assert(isa<ConstantDataSequential>(SymStr->getOperand(0)) &&
+ "Format of _reflect function not recognized");
+
+ assert(cast<ConstantDataSequential>(SymStr->getOperand(0))->isCString() &&
+ "Format of _reflect function not recognized");
+
+ std::string ReflectArg =
+ cast<ConstantDataSequential>(SymStr->getOperand(0))->getAsString();
+
+ ReflectArg = ReflectArg.substr(0, ReflectArg.size() - 1);
+ DEBUG(dbgs() << "Arg of _reflect : " << ReflectArg << "\n");
+
+ int ReflectVal = 0; // The default value is 0
+ if (VarMap.find(ReflectArg) != VarMap.end()) {
+ ReflectVal = VarMap[ReflectArg];
+ }
+ Reflect->replaceAllUsesWith(
+ ConstantInt::get(Reflect->getType(), ReflectVal));
+ ToRemove.push_back(Reflect);
+ }
+ if (ToRemove.size() == 0)
+ return false;
+
+ for (unsigned i = 0, e = ToRemove.size(); i != e; ++i)
+ ToRemove[i]->eraseFromParent();
+ return true;
+}
+
+bool NVVMReflect::runOnModule(Module &M) {
+ if (!NVVMReflectEnabled)
+ return false;
+
+ setVarMap();
+
+ bool Res = false;
+ std::string Name;
+ Type *Tys[1];
+ Type *I8Ty = Type::getInt8Ty(M.getContext());
+ Function *ReflectFunction;
+
+ // Check for standard overloaded versions of llvm.nvvm.reflect
+
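+ // The llvm.nvvm.reflect intrinsic is overloaded on the address space of its
+ // i8* argument, so try address spaces 0-4 and look up each mangled name in
+ // the module.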
+ for (unsigned i = 0; i != 5; ++i) {
+ Tys[0] = PointerType::get(I8Ty, i);
+ Name = Intrinsic::getName(Intrinsic::nvvm_reflect, Tys);
+ ReflectFunction = M.getFunction(Name);
+ if(ReflectFunction != 0) {
+ Res |= handleFunction(ReflectFunction);
+ }
+ }
+
+ ReflectFunction = M.getFunction(NVVM_REFLECT_FUNCTION);
+ // If reflect function is not used, then there will be
+ // no entry in the module.
+ if (ReflectFunction != 0)
+ Res |= handleFunction(ReflectFunction);
+
+ return Res;
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/TargetInfo/NVPTXTargetInfo.cpp b/contrib/llvm/lib/Target/NVPTX/TargetInfo/NVPTXTargetInfo.cpp
new file mode 100644
index 000000000000..cc7d4dc5ece7
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/TargetInfo/NVPTXTargetInfo.cpp
@@ -0,0 +1,23 @@
+//===-- NVPTXTargetInfo.cpp - NVPTX Target Implementation -----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTX.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/TargetRegistry.h"
+using namespace llvm;
+
+Target llvm::TheNVPTXTarget32;
+Target llvm::TheNVPTXTarget64;
+
+extern "C" void LLVMInitializeNVPTXTargetInfo() {
+ RegisterTarget<Triple::nvptx> X(TheNVPTXTarget32, "nvptx",
+ "NVIDIA PTX 32-bit");
+ RegisterTarget<Triple::nvptx64> Y(TheNVPTXTarget64, "nvptx64",
+ "NVIDIA PTX 64-bit");
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/cl_common_defines.h b/contrib/llvm/lib/Target/NVPTX/cl_common_defines.h
new file mode 100644
index 000000000000..02c5a94c3d03
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/cl_common_defines.h
@@ -0,0 +1,122 @@
+#ifndef CL_COMMON_DEFINES_H
+#define CL_COMMON_DEFINES_H
+// This file includes defines that are common to both kernel code and
+// the NVPTX back-end.
+
+//
+// Common defines for Image intrinsics
+// Channel order
+enum {
+ CLK_R = 0x10B0,
+ CLK_A = 0x10B1,
+ CLK_RG = 0x10B2,
+ CLK_RA = 0x10B3,
+ CLK_RGB = 0x10B4,
+ CLK_RGBA = 0x10B5,
+ CLK_BGRA = 0x10B6,
+ CLK_ARGB = 0x10B7,
+
+#if (__NV_CL_C_VERSION == __NV_CL_C_VERSION_1_0)
+ CLK_xRGB = 0x10B7,
+#endif
+
+ CLK_INTENSITY = 0x10B8,
+ CLK_LUMINANCE = 0x10B9
+
+#if (__NV_CL_C_VERSION >= __NV_CL_C_VERSION_1_1)
+ ,
+ CLK_Rx = 0x10BA,
+ CLK_RGx = 0x10BB,
+ CLK_RGBx = 0x10BC
+#endif
+};
+
+typedef enum clk_channel_type {
+ // valid formats for float return types
+ CLK_SNORM_INT8 = 0x10D0, // four channel RGBA snorm8
+ CLK_SNORM_INT16 = 0x10D1, // four channel RGBA snorm16
+ CLK_UNORM_INT8 = 0x10D2, // four channel RGBA unorm8
+ CLK_UNORM_INT16 = 0x10D3, // four channel RGBA unorm16
+ CLK_HALF_FLOAT = 0x10DD, // four channel RGBA half
+ CLK_FLOAT = 0x10DE, // four channel RGBA float
+
+#if (__NV_CL_C_VERSION >= __NV_CL_C_VERSION_1_1)
+ CLK_UNORM_SHORT_565 = 0x10D4,
+ CLK_UNORM_SHORT_555 = 0x10D5,
+ CLK_UNORM_INT_101010 = 0x10D6,
+#endif
+
+ // valid only for integer return types
+ CLK_SIGNED_INT8 = 0x10D7,
+ CLK_SIGNED_INT16 = 0x10D8,
+ CLK_SIGNED_INT32 = 0x10D9,
+ CLK_UNSIGNED_INT8 = 0x10DA,
+ CLK_UNSIGNED_INT16 = 0x10DB,
+ CLK_UNSIGNED_INT32 = 0x10DC,
+
+ // CI SPI for CPU
+ __CLK_UNORM_INT8888, // four channel ARGB unorm8
+ __CLK_UNORM_INT8888R, // four channel BGRA unorm8
+
+ __CLK_VALID_IMAGE_TYPE_COUNT,
+ __CLK_INVALID_IMAGE_TYPE = __CLK_VALID_IMAGE_TYPE_COUNT,
+ __CLK_VALID_IMAGE_TYPE_MASK_BITS = 4, // number of bits required to
+ // represent any image type
+ __CLK_VALID_IMAGE_TYPE_MASK = (1 << __CLK_VALID_IMAGE_TYPE_MASK_BITS) - 1
+} clk_channel_type;
+
+typedef enum clk_sampler_type {
+ __CLK_ADDRESS_BASE = 0,
+ CLK_ADDRESS_NONE = 0 << __CLK_ADDRESS_BASE,
+ CLK_ADDRESS_CLAMP = 1 << __CLK_ADDRESS_BASE,
+ CLK_ADDRESS_CLAMP_TO_EDGE = 2 << __CLK_ADDRESS_BASE,
+ CLK_ADDRESS_REPEAT = 3 << __CLK_ADDRESS_BASE,
+ CLK_ADDRESS_MIRROR = 4 << __CLK_ADDRESS_BASE,
+
+#if (__NV_CL_C_VERSION >= __NV_CL_C_VERSION_1_1)
+ CLK_ADDRESS_MIRRORED_REPEAT = CLK_ADDRESS_MIRROR,
+#endif
+ __CLK_ADDRESS_MASK =
+ CLK_ADDRESS_NONE | CLK_ADDRESS_CLAMP | CLK_ADDRESS_CLAMP_TO_EDGE |
+ CLK_ADDRESS_REPEAT | CLK_ADDRESS_MIRROR,
+ __CLK_ADDRESS_BITS = 3, // number of bits required to
+ // represent address info
+
+ __CLK_NORMALIZED_BASE = __CLK_ADDRESS_BITS,
+ CLK_NORMALIZED_COORDS_FALSE = 0,
+ CLK_NORMALIZED_COORDS_TRUE = 1 << __CLK_NORMALIZED_BASE,
+ __CLK_NORMALIZED_MASK =
+ CLK_NORMALIZED_COORDS_FALSE | CLK_NORMALIZED_COORDS_TRUE,
+ __CLK_NORMALIZED_BITS = 1, // number of bits required to
+ // represent normalization
+
+ __CLK_FILTER_BASE = __CLK_NORMALIZED_BASE + __CLK_NORMALIZED_BITS,
+ CLK_FILTER_NEAREST = 0 << __CLK_FILTER_BASE,
+ CLK_FILTER_LINEAR = 1 << __CLK_FILTER_BASE,
+ CLK_FILTER_ANISOTROPIC = 2 << __CLK_FILTER_BASE,
+ __CLK_FILTER_MASK =
+ CLK_FILTER_NEAREST | CLK_FILTER_LINEAR | CLK_FILTER_ANISOTROPIC,
+ __CLK_FILTER_BITS = 2, // number of bits required to
+ // represent filter info
+
+ __CLK_MIP_BASE = __CLK_FILTER_BASE + __CLK_FILTER_BITS,
+ CLK_MIP_NEAREST = 0 << __CLK_MIP_BASE,
+ CLK_MIP_LINEAR = 1 << __CLK_MIP_BASE,
+ CLK_MIP_ANISOTROPIC = 2 << __CLK_MIP_BASE,
+ __CLK_MIP_MASK = CLK_MIP_NEAREST | CLK_MIP_LINEAR | CLK_MIP_ANISOTROPIC,
+ __CLK_MIP_BITS = 2,
+
+ __CLK_SAMPLER_BITS = __CLK_MIP_BASE + __CLK_MIP_BITS,
+ __CLK_SAMPLER_MASK = __CLK_MIP_MASK | __CLK_FILTER_MASK |
+ __CLK_NORMALIZED_MASK | __CLK_ADDRESS_MASK,
+
+ __CLK_ANISOTROPIC_RATIO_BITS = 5,
+ __CLK_ANISOTROPIC_RATIO_MASK =
+ (int) 0x80000000 >> (__CLK_ANISOTROPIC_RATIO_BITS - 1)
+} clk_sampler_type;
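+// A sampler value is the bitwise OR of one constant from each field above;
+// for example, with the bit layout defined here,
+//   CLK_ADDRESS_CLAMP | CLK_NORMALIZED_COORDS_TRUE | CLK_FILTER_LINEAR
+// evaluates to 1 | (1 << 3) | (1 << 4) = 0x19.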
+
+// Memory synchronization
+#define CLK_LOCAL_MEM_FENCE (1 << 0)
+#define CLK_GLOBAL_MEM_FENCE (1 << 1)
+
+#endif // CL_COMMON_DEFINES_H